def test_iterations(config, model_type):
    """Train with a growing epoch schedule, then verify that schedules whose
    epoch counts do not strictly increase are rejected with ValueError."""
    filename = "test_files/models/%s_%s_iterations" % (FORMATS[0], model_type)
    remove_existing(filename)
    config.update(dict(classifier=model_type))
    passages = list(map(load_passage, passage_files(FORMATS[0])))
    last = 0
    iterations = []
    for spec in (2, 5, 9, (11, True), (4, True)):
        # A bare int extends the cumulative schedule; an (int, True) pair
        # passes the epoch count directly ("simple" form) instead.
        if isinstance(spec, tuple):
            i, simple = spec
        else:
            i, simple = spec, False
        iterations.append(Iterations("%d --word-dim=%d" % (i, i)))
        parser = Parser(model_files=filename, config=config)
        scores = list(parser.train(passages, dev=passages,
                                   iterations=i if simple else iterations))
        # One score per newly-trained epoch; a lower target yields none.
        assert max(0, i - last) == len(scores)
        last = i
    for bad_schedule in ((3, 2), (4, 4)):
        with pytest.raises(ValueError):  # Number of epochs must be strictly increasing
            list(Parser(model_files=filename, config=config).train(
                passages, dev=passages, iterations=bad_schedule))
def test_empty_features(empty_features_config, model_type):
    """Smoke-test training and parsing under a feature-less configuration."""
    filename = "test_files/models/%s_%s_empty_features" % (FORMATS[0], model_type)
    remove_existing(filename)
    empty_features_config.update(dict(classifier=model_type))
    passages = list(map(load_passage, passage_files(FORMATS[0])))
    parser = Parser(model_files=filename, config=empty_features_config)
    list(parser.train(passages, dev=passages, test=True, iterations=2))
    list(parser.parse(passages, evaluate=True))
def test_ensemble(config, model_type):
    """Train two models that differ only in seed, then parse with both as an ensemble."""
    config.update(dict(classifier=model_type, lstm_layers=0))
    filenames = ["test_files/models/%s_%s_ensemble%d" % (FORMATS[0], model_type, i)
                 for i in range(1, 3)]
    passages = list(map(load_passage, passage_files(FORMATS[0])))
    for seed, filename in enumerate(filenames, start=1):
        config.update(dict(seed=seed))
        remove_existing(filename)
        list(Parser(model_files=filename, config=config).train(
            passages, dev=passages, iterations=2))
    # Passing a list of model files makes the parser combine them.
    list(Parser(model_files=filenames, config=config).parse(passages, evaluate=True))
def test_copy_shared(config, model_type):
    """Train on a single format, then on all formats, copying shared parameters."""
    filename = "test_files/models/%s_%s_copy_shared" % ("_".join(FORMATS), model_type)
    remove_existing(filename)
    config.update(dict(classifier=model_type, lstm_layers=0, copy_shared=[FORMATS[0]]))
    for formats in ((FORMATS[0],), FORMATS):
        parser = Parser(model_files=filename, config=config)
        passages = list(map(load_passage, passage_files(*formats)))
        list(parser.train(passages, dev=passages, test=True, iterations=2))
        config.update_hyperparams(ucca={"lstm_layers": 1})
def test_extra_classifiers(config, model_type, default_setting):
    """Train a model and then reload it, checking finalization and node dropout."""
    filename = "test_files/models/%s_%s%s" % (FORMATS[0], model_type,
                                              default_setting.suffix())
    remove_existing(filename)
    config.update(default_setting.dict())
    passages = list(map(load_passage, passage_files(FORMATS[0])))
    for mode in ("train", "load"):
        print("-- %sing %s" % (mode, model_type))
        config.update(dict(classifier=model_type, copy_shared=None))
        parser = Parser(model_files=filename, config=config)
        # Passing None for passages in "load" mode exercises the reload path.
        train_passages = passages if mode == "train" else None
        list(parser.train(train_passages, dev=passages, test=True, iterations=2))
        assert parser.model.is_finalized, "Model should be finalized after %sing" % mode
        assert not getattr(parser.model.feature_extractor, "node_dropout", 0), \
            parser.model.feature_extractor.node_dropout
def get_parser():
    """Ensure the UCCA model is present locally and return a Parser for it."""
    if not UCCA_PARSER_PATH.parent.exists():
        download_ucca_model()
    update_ucca_path()
    # sys.argv must be mocked: otherwise the parser tries to consume the
    # real command-line arguments and throws an exception.
    with mock_sys_argv(['']):
        return Parser(str(UCCA_PARSER_PATH))
def get_parser(model_path):
    """Return a cached Parser for `model_path`, reloading only when the path changes.

    Parameters
    ----------
    model_path
        Path (or prefix) of the model files to load.

    Returns
    -------
    The module-level cached Parser instance.
    """
    global PARSER
    global PARSER_PATH
    # Bug fix: the original used `PARSER_PATH is not model_path`, an identity
    # comparison. Two equal path strings built at different times are usually
    # distinct objects, so the cache was defeated and the model was reloaded
    # on every call. Compare by value instead.
    if PARSER_PATH != model_path or PARSER is None:
        PARSER_PATH = model_path
        PARSER = Parser(model_path)
    return PARSER
def test_train_empty(config, model_type, default_setting):
    """Training on a passage that fails to load must raise ParserException."""
    config.update(default_setting.dict())
    config.update(dict(classifier=model_type))
    filename = "empty"
    remove_existing(filename)
    parser = Parser(model_files=filename, config=config)
    with pytest.raises(ParserException):
        list(parser.train(load_passage("nonexistent file")))
def get_parser():
    """Return the parser cached on `app`, creating it on first use."""
    if app.parser is not None:
        return app.parser
    print("Initializing parser...")
    print("PARSER_MODEL=" + PARSER_MODEL)
    print("PARSER_TYPE=" + PARSER_TYPE)
    app.parser = Parser(PARSER_MODEL, PARSER_TYPE)
    return app.parser
def get_parser():
    """Return the module-level Parser singleton, constructing it lazily."""
    global PARSER
    if PARSER is not None:
        return PARSER
    # Constructed lazily because importing at the top of the file ruins any
    # other importing code that uses argparse.
    model_path = "/cs/labs/oabend/borgr/tupa/models/bilstm"
    PARSER = Parser(model_path, "bilstm")
    return PARSER
def train_test(self, model_type, compare=True):
    """Train a model, reload it, parse with both, and compare the F1 scores."""
    scores = []
    p = None
    for mode in ("train", "load"):
        print("-- %sing %s" % (mode, model_type))
        p = Parser(model_file="test_files/models/%s" % model_type,
                   model_type=model_type)
        # None in "load" mode skips training and exercises the reload path.
        train_passages = self.load_passages() if mode == "train" else None
        p.train(train_passages, iterations=200)
        results = p.parse(self.load_passages(), evaluate=True)
        score = evaluation.Scores.aggregate([s for _, s in results])
        scores.append(score.average_f1())
    print()
    print("-- average labeled f1: %.3f, %.3f\n" % tuple(scores))
    if compare:
        self.assertAlmostEqual(*scores)
    p.parse(convert.to_text(self.load_passages()[0]))
    self.assertFalse(list(p.parse(())))  # parsing nothing returns nothing
def test_parser(config, model_type, formats, default_setting, text=True):
    """End-to-end parser test: train, reload, parse, and compare scores.

    Trains for two iterations, reloads the saved model, parses the same
    passages with both, and asserts the parameters and F1 scores agree.
    When `text` is true, also round-trips passages through text conversion
    and parses the result.
    """
    filename = "test_files/models/%s_%s%s" % ("_".join(formats), model_type,
                                              default_setting.suffix())
    remove_existing(filename)
    config.update(default_setting.dict())
    scores = []
    params = []
    passages = list(map(load_passage, passage_files(*formats)))
    # AMR output is not evaluated here; skip scoring for it.
    evaluate = ("amr" not in formats)
    for mode in "train", "load":
        print("-- %sing %s" % (mode, model_type))
        config.update(dict(classifier=model_type, copy_shared=None))
        p = Parser(model_files=filename, config=config)
        p.save_init = True
        # Passing None in "load" mode skips training and reloads the model.
        list(p.train(passages if mode == "train" else None, dev=passages,
                     test=True, iterations=2))
        assert p.model.is_finalized, "Model should be finalized after %sing" % mode
        assert not getattr(p.model.feature_extractor, "node_dropout", 0), \
            p.model.feature_extractor.node_dropout
        all_params = p.model.all_params()
        params.append(all_params)
        # Compare the stored word-embedding matrix against its initial value,
        # scaled by weight decay, when word vectors are frozen.
        param1, param2 = [d.get("W")
                          for d in (all_params, p.model.feature_extractor.params)]
        if param1 is not None and param2 and param2.init is not None \
                and not config.args.update_word_vectors:
            assert_allclose(param1, weight_decay(p.model) * param2.init, rtol=1e-6)
        text_results = results = list(p.parse(passages, evaluate=evaluate))
        if text:
            print("Converting to text and parsing...")
            # Round-trip each passage through plain text and parse it again.
            text_results = list(p.parse(
                [p3 for p1 in passages
                 for p2 in convert.to_text(p1, sentences=False)
                 for p3 in convert.from_text(p2, p1.ID,
                                             extra_format=p1.extra.get("format"))]))
            assert len(results) == len(text_results)
        if evaluate:
            scores.append(Scores(tuple(zip(*results))[1]).average_f1())
            if text:
                for t, (r, s) in zip(text_results, results):
                    print(" %s F1=%.3f" % (r.ID, s.average_f1()))
        assert not list(p.parse(()))  # parsing nothing returns nothing
        print()
    # The reloaded model must have exactly the parameters that were saved.
    assert_all_params_equal(*params)
    if evaluate:
        print("-- average f1: %.3f, %.3f\n" % tuple(scores))
        # Scores from training run and reload run should roughly agree.
        assert scores[0] == pytest.approx(scores[1], 0.1)
def __init__(self, model_prefix):
    """Load a TUPA parser from `model_prefix` and warm it up.

    Parameters
    ----------
    model_prefix
        Prefix of the trained model files to load.
    """
    # Assigning sys.argv is necessary: without it tupa.parse.Parser throws
    # exceptions while trying to read the real command line.
    saved_argv = sys.argv
    sys.argv = ['-m', model_prefix]
    parser = Parser(model_files=model_prefix)
    parser.models[0].load()
    parser.trained = True
    self.__parser = parser
    # 'parse_sentence' calls 'annotate_all', which lazily instantiates a
    # spaCy pipeline; since we want all initialization to happen inside
    # __init__, we simply call 'parse_sentence' once with dummy input.
    self.parse_sentence('Hello dummy world')
    # Undo the sys.argv hack's side effect.
    sys.argv = saved_argv
def get_ucca_parser():
    """Construct a Parser for the bundled UCCA model, configuring via mocked argv."""
    ucca_dir = RESOURCES_DIR / 'ucca'
    os.chdir(str(ucca_dir))
    model_path = ucca_dir / 'models/ucca-bilstm'
    vocab_path = ucca_dir / 'vocab'
    argv = ['script_name', '-m', str(model_path), '--vocab', str(vocab_path)]
    # Config reads sys.argv, so patch it for the duration of construction.
    with unittest.mock.patch('sys.argv', argv):
        Config.reload()
        args = Config().args
        bases = args.models or (args.classifier,)
        # splitext keeps the extension separate; re-join to get each filename.
        model_files = [base + ext for base, ext in map(os.path.splitext, bases)]
        return Parser(model_files=model_files, config=Config(), beam=1)