def __init__(self, name, argv): parser = argparse.ArgumentParser(**cli.make_usage("tokenize " + name)) parser.add_argument( "-t", "--text", type=str, help="input text", ) parser.add_argument("-s", "--sep", type=bool, help=f"default: {self.separator}", default=self.separator) parser.add_argument("-e", "--engine", type=str, help=f"default: {self.engine}", default=self.engine) parser.add_argument("-w", "--keep-whitespace", type=bool, help=f"default: {self.keep_whitespace}", default=self.keep_whitespace) args = parser.parse_args(argv) self.args = args cli.exit_if_empty(args.text, parser) result = self.run(args.text, engine=args.engine) print(self.separator.join(result))
def __init__(self, argv): parser = argparse.ArgumentParser(**cli.make_usage("tokenize")) parser.add_argument("subcommand", type=str, nargs="?", help="[subword|syllable|word|sent]") args = parser.parse_args(argv[2:3]) cli.exit_if_empty(args.subcommand, parser) subcommand = str.lower(args.subcommand) argv = argv[3:] if subcommand.startswith("word"): WordTokenizationApp("word", argv) elif subcommand.startswith("syl"): SyllableTokenizationApp("syllable", argv) elif subcommand.startswith("subw"): SubwordTokenizationApp("subword", argv) elif subcommand.startswith("sent"): SubwordTokenizationApp("sent", argv) else: raise NotImplementedError( f"Subcommand not available: {subcommand}")
def __init__(self, argv): parser = argparse.ArgumentParser(**cli.make_usage("corpus")) parser.add_argument( "subcommand", type=str, default="", nargs="?", help="[download|remove]" # there should be a "list" subcommand ) parser.add_argument( "--name", type=str, help="corpus's name", ) args = parser.parse_args(argv[2:]) cli.exit_if_empty(args.subcommand, parser) subcommand = str.lower(args.subcommand) if hasattr(App, subcommand): getattr(App, subcommand)(args) else: raise NotImplementedError( f"Subcommand not available: {subcommand}")
def __init__(self, name, argv): parser = argparse.ArgumentParser( **cli.make_usage("tokenization " + name) ) parser.add_argument( "--text", type=str, help="input text", ) parser.add_argument( "--engine", type=str, help="default: %s" % self.default_engine, default=self.default_engine ) args = parser.parse_args(argv) self.args = args cli.exit_if_empty(args.text, parser) print(f"Using engine={args.engine}") result = self.run(args.text, engine=args.engine) print(self.separator.join(result))
def __init__(self, name, argv): parser = argparse.ArgumentParser(**cli.make_usage("tokenize " + name)) parser.add_argument( "text", type=str, nargs="?", help="input text", ) parser.add_argument( "-s", "--sep", dest="separator", type=str, help=f"default: {self.separator}", default=self.separator, ) parser.add_argument( "-a", "--algo", dest="algorithm", type=str, help=f"default: {self.algorithm}", default=self.algorithm, ) parser.add_argument( "-w", "--keep-whitespace", dest="keep_whitespace", action="store_true", ) parser.add_argument( "-nw", "--no-whitespace", dest="keep_whitespace", action="store_false", ) parser.set_defaults(keep_whitespace=True) args = parser.parse_args(argv) self.args = args cli.exit_if_empty(args.text, parser) result = self.run( args.text, engine=args.algorithm, keep_whitespace=args.keep_whitespace, ) print(args.separator.join(result) + args.separator)
def __init__(self, argv): parser = argparse.ArgumentParser(**cli.make_usage("tag")) parser.add_argument("subcommand", type=str, nargs="?", help="[pos]") args = parser.parse_args(argv[2:3]) cli.exit_if_empty(args.subcommand, parser) subcommand = str.lower(args.subcommand) argv = argv[3:] if subcommand == "pos": POSTaggingApp("Part-of-Speech tagging", argv) else: raise NotImplementedError( f"Subcommand not available: {subcommand}")
def __init__(self, argv): parser = argparse.ArgumentParser(**cli.make_usage("tagging")) parser.add_argument( "command", type=str, nargs="?", help="[pos]" ) args = parser.parse_args(argv[2:3]) command = args.command cli.exit_if_empty(args.command, parser) argv = argv[3:] if command == "pos": POSTaggingApp("Part-of-Speech tagging", argv) else: raise ValueError(f"no command:{subcommand}")
def __init__(self, argv): parser = argparse.ArgumentParser(**cli.make_usage("tokenization")) parser.add_argument( "command", type=str, nargs="?", help="[word|syllable]" ) args = parser.parse_args(argv[2:3]) command = args.command cli.exit_if_empty(command, parser) argv = argv[3:] if command == "word": WordTokenizationApp("word", argv) elif command == "syllable": SyllableTokenizationApp("syllable", argv)
def __init__(self, argv): parser = argparse.ArgumentParser(**cli.make_usage("corpus")) parser.add_argument( "--name", type=str, help="corpus's name", ) parser.add_argument("command", type=str, default="", nargs="?", help="[download|remove]") args = parser.parse_args(argv[2:]) cli.exit_if_empty(args.command, parser) command = args.command if hasattr(App, command): getattr(App, command)(args) else: print("No command available: %s" % command)
def __init__(self, name, argv): parser = argparse.ArgumentParser(**cli.make_usage("benchmark " + name)) parser.add_argument( "--input-file", action="store", help="Path to input file to compare against the test file", ) parser.add_argument( "--test-file", action="store", help="Path to test file i.e. ground truth", ) parser.add_argument( "--save-details", default=False, action="store_true", help=("Save comparison details to files (eval-XXX.json" " and eval-details-XXX.json)"), ) args = parser.parse_args(argv) actual = _read_file(args.input_file) expected = _read_file(args.test_file) assert len(actual) == len( expected ), "Input and test files do not have the same number of samples" print("Benchmarking %s against %s with %d samples in total" % (args.input_file, args.test_file, len(actual))) df_raw = word_tokenization.benchmark(expected, actual) columns = [ "char_level:tp", "char_level:fp", "char_level:tn", "char_level:fn", "word_level:correctly_tokenised_words", "word_level:total_words_in_sample", "word_level:total_words_in_ref_sample", ] statistics = dict() for c in columns: statistics[c] = float(df_raw[c].sum()) statistics["char_level:precision"] = statistics["char_level:tp"] / ( statistics["char_level:tp"] + statistics["char_level:fp"]) statistics["char_level:recall"] = statistics["char_level:tp"] / ( statistics["char_level:tp"] + statistics["char_level:fn"]) statistics["word_level:precision"] = ( statistics["word_level:correctly_tokenised_words"] / statistics["word_level:total_words_in_sample"]) statistics["word_level:recall"] = ( statistics["word_level:correctly_tokenised_words"] / statistics["word_level:total_words_in_ref_sample"]) print("============== Benchmark Result ==============") for c in ["tp", "fn", "tn", "fp", "precision", "recall"]: c = f"char_level:{c}" v = statistics[c] print(f"{c:>40s} {v:.4f}") for c in [ "total_words_in_sample", "total_words_in_ref_sample", "correctly_tokenised_words", "precision", "recall", ]: c = f"word_level:{c}" v = statistics[c] print(f"{c:>40s} {v:.4f}") if args.save_details: dir_name = os.path.dirname(args.input_file) file_name = args.input_file.split("/")[-1].split(".")[0] res_path = "%s/eval-%s.yml" % (dir_name, file_name) print("Evaluation result is saved to %s" % res_path) with open(res_path, "w", encoding="utf-8") as outfile: yaml.dump(statistics, outfile, default_flow_style=False) res_path = "%s/eval-details-%s.json" % (dir_name, file_name) print("Details of comparisons is saved to %s" % res_path) with open(res_path, "w", encoding="utf-8") as f: samples = [] for i, r in enumerate(df_raw.to_dict("records")): expected, actual = r["expected"], r["actual"] del r["expected"] del r["actual"] samples.append( dict(metrics=r, expected=expected, actual=actual, id=i)) details = dict(metrics=statistics, samples=samples) json.dump(details, f, ensure_ascii=False)