Пример #1
0
    def __init__(self, name, argv):
        """Parse the tokenizer options in *argv*, tokenize, print the tokens.

        :param name: sub-command name; appended to ``"tokenize "`` to build
            the usage text passed to ``cli.make_usage``
        :param argv: list of command-line arguments for this sub-command
        """

        def _parse_bool(value):
            # ``type=bool`` would call bool() on the raw string, so "False"
            # or "0" would still be truthy.  Map the usual negative
            # spellings to False and keep every other value truthy, as it
            # was before.
            return value.lower() not in (
                "", "0", "false", "f", "no", "n", "off")

        parser = argparse.ArgumentParser(**cli.make_usage("tokenize " + name))
        parser.add_argument(
            "-t",
            "--text",
            type=str,
            help="input text",
        )

        # The separator is a string used to join tokens, not a flag.
        parser.add_argument("-s",
                            "--sep",
                            type=str,
                            help=f"default: {self.separator}",
                            default=self.separator)

        parser.add_argument("-e",
                            "--engine",
                            type=str,
                            help=f"default: {self.engine}",
                            default=self.engine)

        # NOTE(review): this option is parsed but not forwarded to
        # self.run(); confirm whether run() accepts keep_whitespace before
        # passing it through.
        parser.add_argument("-w",
                            "--keep-whitespace",
                            type=_parse_bool,
                            help=f"default: {self.keep_whitespace}",
                            default=self.keep_whitespace)

        args = parser.parse_args(argv)

        self.args = args

        cli.exit_if_empty(args.text, parser)

        result = self.run(args.text, engine=args.engine)
        # Honour the separator given on the command line; it falls back to
        # self.separator when the option is not supplied.
        print(args.sep.join(result))
Пример #2
0
    def __init__(self, argv):
        """Select a tokenization app from the sub-command found in *argv*.

        Only ``argv[2:3]`` is parsed here; ``argv[3:]`` is handed on to the
        selected app.

        :param argv: full command-line argument list
        :raises NotImplementedError: if the sub-command matches no known
            prefix
        """
        usage = cli.make_usage("tokenize")
        parser = argparse.ArgumentParser(**usage)
        parser.add_argument(
            "subcommand",
            type=str,
            nargs="?",
            help="[subword|syllable|word|sent]",
        )

        parsed = parser.parse_args(argv[2:3])
        cli.exit_if_empty(parsed.subcommand, parser)

        subcommand = str.lower(parsed.subcommand)
        remaining = argv[3:]

        # Sub-commands are matched by prefix, checked in this order.
        if subcommand.startswith("word"):
            WordTokenizationApp("word", remaining)
            return
        if subcommand.startswith("syl"):
            SyllableTokenizationApp("syllable", remaining)
            return
        if subcommand.startswith("subw"):
            SubwordTokenizationApp("subword", remaining)
            return
        if subcommand.startswith("sent"):
            # NOTE(review): "sent" is routed to the subword app; confirm
            # whether a dedicated sentence tokenization app was intended.
            SubwordTokenizationApp("sent", remaining)
            return

        raise NotImplementedError(f"Subcommand not available: {subcommand}")
Пример #3
0
    def __init__(self, argv):
        """Parse the corpus sub-command in *argv* and call its handler.

        The sub-command is lower-cased and looked up as an attribute of
        ``App``; the handler is called with the parsed arguments.

        :param argv: full command-line argument list; ``argv[2:]`` is parsed
        :raises NotImplementedError: if the sub-command does not name a
            public callable attribute of ``App``
        """
        parser = argparse.ArgumentParser(**cli.make_usage("corpus"))

        parser.add_argument(
            "subcommand",
            type=str,
            default="",
            nargs="?",
            help="[download|remove]"  # there should be a "list" subcommand
        )

        parser.add_argument(
            "--name",
            type=str,
            help="corpus's name",
        )

        args = parser.parse_args(argv[2:])

        cli.exit_if_empty(args.subcommand, parser)
        subcommand = str.lower(args.subcommand)

        # The sub-command is user input: never resolve private or dunder
        # attributes (e.g. "__init__"), and only dispatch to callables, so
        # that bad input gets the "not available" error rather than an
        # unrelated traceback.
        handler = None
        if not subcommand.startswith("_"):
            handler = getattr(App, subcommand, None)

        if not callable(handler):
            raise NotImplementedError(
                f"Subcommand not available: {subcommand}")

        handler(args)
Пример #4
0
    def __init__(self, name, argv):
        """Parse ``--text``/``--engine`` from *argv*, tokenize, print tokens.

        The parsed arguments are stored on ``self.args``.  The chosen engine
        is printed before the tokens, which are joined with
        ``self.separator``.

        :param name: sub-command name; appended to ``"tokenization "`` to
            build the usage text passed to ``cli.make_usage``
        :param argv: list of command-line arguments for this sub-command
        """
        usage = cli.make_usage("tokenization " + name)
        arg_parser = argparse.ArgumentParser(**usage)

        arg_parser.add_argument("--text", type=str, help="input text")

        engine_default = self.default_engine
        arg_parser.add_argument(
            "--engine",
            type=str,
            help="default: %s" % engine_default,
            default=engine_default,
        )

        parsed = arg_parser.parse_args(argv)
        self.args = parsed

        cli.exit_if_empty(parsed.text, arg_parser)

        print(f"Using engine={parsed.engine}")
        tokens = self.run(parsed.text, engine=parsed.engine)
        joined = self.separator.join(tokens)
        print(joined)
Пример #5
0
    def __init__(self, name, argv):
        """Parse tokenizer options from *argv*, tokenize, and print tokens.

        The parsed arguments are stored on ``self.args``.  Tokens are joined
        with the chosen separator, and one more separator is appended at the
        end of the printed line.

        :param name: sub-command name; appended to ``"tokenize "`` to build
            the usage text passed to ``cli.make_usage``
        :param argv: list of command-line arguments for this sub-command
        """
        usage = cli.make_usage("tokenize " + name)
        arg_parser = argparse.ArgumentParser(**usage)

        # Positional input text; optional so that an empty invocation can
        # be handled by cli.exit_if_empty below.
        arg_parser.add_argument("text", type=str, nargs="?", help="input text")

        arg_parser.add_argument(
            "-s", "--sep",
            dest="separator",
            type=str,
            help=f"default: {self.separator}",
            default=self.separator,
        )
        arg_parser.add_argument(
            "-a", "--algo",
            dest="algorithm",
            type=str,
            help=f"default: {self.algorithm}",
            default=self.algorithm,
        )

        # Two opposite switches write to the same destination; the shared
        # default is set afterwards.
        arg_parser.add_argument(
            "-w", "--keep-whitespace",
            dest="keep_whitespace",
            action="store_true",
        )
        arg_parser.add_argument(
            "-nw", "--no-whitespace",
            dest="keep_whitespace",
            action="store_false",
        )
        arg_parser.set_defaults(keep_whitespace=True)

        parsed = arg_parser.parse_args(argv)
        self.args = parsed

        cli.exit_if_empty(parsed.text, arg_parser)

        tokens = self.run(
            parsed.text,
            engine=parsed.algorithm,
            keep_whitespace=parsed.keep_whitespace,
        )
        sep = parsed.separator
        print(sep.join(tokens) + sep)
Пример #6
0
    def __init__(self, argv):
        """Select a tagging app from the sub-command found in *argv*.

        Only ``argv[2:3]`` is parsed here; ``argv[3:]`` is handed on to the
        selected app.

        :param argv: full command-line argument list
        :raises NotImplementedError: if the sub-command is not ``pos``
        """
        usage = cli.make_usage("tag")
        parser = argparse.ArgumentParser(**usage)
        parser.add_argument(
            "subcommand",
            type=str,
            nargs="?",
            help="[pos]",
        )

        parsed = parser.parse_args(argv[2:3])
        cli.exit_if_empty(parsed.subcommand, parser)

        subcommand = str.lower(parsed.subcommand)
        remaining = argv[3:]

        # Reject anything other than the single supported sub-command.
        if subcommand != "pos":
            raise NotImplementedError(
                f"Subcommand not available: {subcommand}")

        POSTaggingApp("Part-of-Speech tagging", remaining)
Пример #7
0
    def __init__(self, argv):
        """Select a tagging app from the command found in *argv*.

        Only ``argv[2:3]`` is parsed here; ``argv[3:]`` is handed on to the
        selected app.

        :param argv: full command-line argument list
        :raises ValueError: if the command is not ``pos``
        """
        parser = argparse.ArgumentParser(**cli.make_usage("tagging"))
        parser.add_argument(
            "command",
            type=str,
            nargs="?",
            help="[pos]"
        )

        args = parser.parse_args(argv[2:3])
        command = args.command

        cli.exit_if_empty(command, parser)

        argv = argv[3:]

        if command == "pos":
            POSTaggingApp("Part-of-Speech tagging", argv)
        else:
            # Report the parsed command; the previous message referenced an
            # undefined name and raised NameError instead of ValueError.
            raise ValueError(f"no command:{command}")
Пример #8
0
    def __init__(self, argv):
        """Select a tokenization app from the command found in *argv*.

        Only ``argv[2:3]`` is parsed here; ``argv[3:]`` is handed on to the
        selected app.  An unrecognised command is reported on standard
        output.

        :param argv: full command-line argument list
        """
        parser = argparse.ArgumentParser(**cli.make_usage("tokenization"))
        parser.add_argument(
            "command",
            type=str,
            nargs="?",
            help="[word|syllable]"
        )

        args = parser.parse_args(argv[2:3])
        command = args.command

        cli.exit_if_empty(command, parser)

        argv = argv[3:]

        if command == "word":
            WordTokenizationApp("word", argv)
        elif command == "syllable":
            SyllableTokenizationApp("syllable", argv)
        else:
            # Previously an unknown command was silently ignored, leaving
            # the user with no output and no hint about what went wrong.
            print("No command available: %s" % command)
Пример #9
0
    def __init__(self, argv):
        """Parse the corpus command in *argv* and call its handler.

        The command is looked up as an attribute of ``App`` and the handler
        is called with the parsed arguments.  A command that does not name a
        public callable attribute is reported on standard output.

        :param argv: full command-line argument list; ``argv[2:]`` is parsed
        """
        parser = argparse.ArgumentParser(**cli.make_usage("corpus"))

        parser.add_argument(
            "--name",
            type=str,
            help="corpus's name",
        )

        parser.add_argument("command",
                            type=str,
                            default="",
                            nargs="?",
                            help="[download|remove]")

        args = parser.parse_args(argv[2:])

        cli.exit_if_empty(args.command, parser)
        command = args.command

        # The command is user input: never resolve private or dunder
        # attributes (e.g. "__init__"), and only dispatch to callables, so
        # that bad input gets the "No command available" message rather
        # than an unrelated traceback.
        handler = None
        if not command.startswith("_"):
            handler = getattr(App, command, None)

        if callable(handler):
            handler(args)
        else:
            print("No command available: %s" % command)
0
    def __init__(self, name, argv):
        """Benchmark a tokenization output file against a reference file.

        Reads both files with ``_read_file``, runs
        ``word_tokenization.benchmark`` on them, prints summed counts with
        character-level and word-level precision/recall, and optionally
        saves the results next to the input file.

        :param name: sub-command name; appended to ``"benchmark "`` to build
            the usage text passed to ``cli.make_usage``
        :param argv: list of command-line arguments for this sub-command
        :raises AssertionError: if the two files do not contain the same
            number of samples
        """

        def _ratio(numerator, denominator):
            # Zero counts (e.g. empty input files) would otherwise raise
            # ZeroDivisionError; report 0.0 for an undefined ratio.
            return numerator / denominator if denominator else 0.0

        parser = argparse.ArgumentParser(**cli.make_usage("benchmark " + name))

        parser.add_argument(
            "--input-file",
            action="store",
            help="Path to input file to compare against the test file",
        )

        parser.add_argument(
            "--test-file",
            action="store",
            help="Path to test file i.e. ground truth",
        )

        parser.add_argument(
            "--save-details",
            default=False,
            action="store_true",
            help=("Save comparison details to files (eval-XXX.yml"
                  " and eval-details-XXX.json)"),
        )

        args = parser.parse_args(argv)

        actual = _read_file(args.input_file)
        expected = _read_file(args.test_file)

        # Raised explicitly (rather than via ``assert``) so the check still
        # runs under ``python -O``; the exception type is unchanged.
        if len(actual) != len(expected):
            raise AssertionError(
                "Input and test files do not have the same number of samples"
            )

        print("Benchmarking %s against %s with %d samples in total" %
              (args.input_file, args.test_file, len(actual)))

        df_raw = word_tokenization.benchmark(expected, actual)

        columns = [
            "char_level:tp",
            "char_level:fp",
            "char_level:tn",
            "char_level:fn",
            "word_level:correctly_tokenised_words",
            "word_level:total_words_in_sample",
            "word_level:total_words_in_ref_sample",
        ]

        # Sum each per-sample metric column over all samples.
        statistics = {c: float(df_raw[c].sum()) for c in columns}

        statistics["char_level:precision"] = _ratio(
            statistics["char_level:tp"],
            statistics["char_level:tp"] + statistics["char_level:fp"],
        )

        statistics["char_level:recall"] = _ratio(
            statistics["char_level:tp"],
            statistics["char_level:tp"] + statistics["char_level:fn"],
        )

        statistics["word_level:precision"] = _ratio(
            statistics["word_level:correctly_tokenised_words"],
            statistics["word_level:total_words_in_sample"],
        )

        statistics["word_level:recall"] = _ratio(
            statistics["word_level:correctly_tokenised_words"],
            statistics["word_level:total_words_in_ref_sample"],
        )

        print("============== Benchmark Result ==============")

        for c in ["tp", "fn", "tn", "fp", "precision", "recall"]:
            c = f"char_level:{c}"
            v = statistics[c]
            print(f"{c:>40s} {v:.4f}")

        for c in [
                "total_words_in_sample",
                "total_words_in_ref_sample",
                "correctly_tokenised_words",
                "precision",
                "recall",
        ]:
            c = f"word_level:{c}"
            v = statistics[c]
            print(f"{c:>40s} {v:.4f}")

        if args.save_details:
            dir_name = os.path.dirname(args.input_file)
            file_name = os.path.basename(args.input_file).split(".")[0]

            # os.path.join keeps the output beside the input file even when
            # the input path has no directory part; "%s/..." would then
            # produce a path under the filesystem root.
            res_path = os.path.join(dir_name, "eval-%s.yml" % file_name)
            print("Evaluation result is saved to %s" % res_path)

            with open(res_path, "w", encoding="utf-8") as outfile:
                yaml.dump(statistics, outfile, default_flow_style=False)

            res_path = os.path.join(
                dir_name, "eval-details-%s.json" % file_name)
            print("Details of comparisons is saved to %s" % res_path)

            # Split each record into its text fields and its metrics.
            samples = []
            for i, r in enumerate(df_raw.to_dict("records")):
                sample_expected = r.pop("expected")
                sample_actual = r.pop("actual")

                samples.append(
                    dict(metrics=r, expected=sample_expected,
                         actual=sample_actual, id=i))

            details = dict(metrics=statistics, samples=samples)

            with open(res_path, "w", encoding="utf-8") as f:
                json.dump(details, f, ensure_ascii=False)