import itertools

import chainer
from chainer import cuda, iterators, serializers, training
from chainer.training import extensions

# Project-local helpers (parse_config, load_sentences, update_tag_scheme,
# load_mappings, prepare_dataset, predict, evaluate, Model, ...) are assumed
# to be importable from the repository's own modules.


def main(config_path):
    args = parse_config(config_path)

    # Load sentences
    test_sentences = load_sentences(args["path_test"], args["replace_digit"])

    # Update tagging scheme (IOB/IOBES)
    update_tag_scheme(test_sentences, args["tag_scheme"])

    # Load mappings from disk and invert them for lookup
    id_to_word, id_to_char, id_to_tag = load_mappings(args["mappings_path"])
    word_to_id = {v: k for k, v in id_to_word.items()}
    char_to_id = {v: k for k, v in id_to_char.items()}
    tag_to_id = {v: k for k, v in id_to_tag.items()}

    # Index data
    test_data = prepare_dataset(test_sentences, word_to_id, char_to_id,
                                tag_to_id, None, args["lowercase"])
    test_iter = iterators.SerialIterator(test_data, args["batch_size"],
                                         repeat=False, shuffle=False)

    # Restore the trained model
    model = Model(len(word_to_id), len(char_to_id), len(tag_to_id), args)
    serializers.load_npz(args['path_model'], model)
    model.id_to_tag = id_to_tag
    model.parameters = args

    # Move the model to GPU if one is configured
    device = args['gpus']
    if device['main'] >= 0:
        cuda.get_device_from_id(device['main']).use()
        model.to_gpu()

    pred_tags = []
    gold_tags = []
    words = []

    # Collect predictions
    for ts, ys, xs in predict(test_iter, model, args['mode']):
        gold_tags.extend(ts)
        pred_tags.extend(ys)
        words.extend(xs)

    evaluate(model, pred_tags, gold_tags, words)
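# ``update_tag_scheme`` above rewrites each sentence's tags in place.  For
# the IOBES case, the standard IOB -> IOBES conversion looks roughly like
# the sketch below (hypothetical helper name; the repository's own function
# may also validate that the input really is well-formed IOB):
def iob_to_iobes(tags):
    """Convert one sentence's IOB tags to IOBES (illustrative sketch)."""
    new_tags = []
    for i, tag in enumerate(tags):
        nxt = tags[i + 1] if i + 1 < len(tags) else "O"
        if tag.startswith("B-"):
            # A B-X not followed by I-X is a single-token entity: S-X
            new_tags.append(tag if nxt == "I" + tag[1:] else "S" + tag[1:])
        elif tag.startswith("I-"):
            # An I-X that ends its span becomes E-X
            new_tags.append(tag if nxt == "I" + tag[1:] else "E" + tag[1:])
        else:
            new_tags.append(tag)
    return new_tags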
def main(config_path):
    # Init args
    args = parse_config(config_path)

    # Load sentences
    train_sentences = load_sentences(args["path_train"], args["replace_digit"])
    dev_sentences = load_sentences(args["path_dev"], args["replace_digit"])

    # Update tagging scheme (IOB/IOBES)
    update_tag_scheme(train_sentences, args["tag_scheme"])
    update_tag_scheme(dev_sentences, args["tag_scheme"])

    # Create a dictionary / mapping of words, optionally augmented with
    # words that only appear in the pretrained embeddings
    if args['path_pre_emb']:
        dico_words_train = word_mapping(train_sentences, args["lowercase"])[0]
        dico_words, word_to_id, id_to_word, pretrained = augment_with_pretrained(
            dico_words_train.copy(),
            args['path_pre_emb'],
            list(itertools.chain.from_iterable(
                [[w[0] for w in s] for s in dev_sentences])))
    else:
        dico_words, word_to_id, id_to_word = word_mapping(train_sentences,
                                                          args["lowercase"])
        dico_words_train = dico_words

    # Create dictionaries and mappings for characters and entity tags
    dico_chars, char_to_id, id_to_char = char_mapping(train_sentences + dev_sentences)
    dico_entities, entity_to_id, id_to_entity = entity_mapping(train_sentences + dev_sentences)

    # Assign id 0 to tag 'O' to simplify padding, and re-sort id_to_tag
    id_to_tag, tag_to_id = entity_tags(id_to_entity)

    # Words that occur only once in the training data can be treated as
    # singletons and randomly replaced by <UNK> during training
    if args["use_singletons"]:
        singletons = set(word_to_id[k] for k, v in dico_words_train.items()
                         if v == 1)
    else:
        singletons = None

    # Index data
    train_data = prepare_dataset(train_sentences, word_to_id, char_to_id,
                                 tag_to_id, singletons, args["lowercase"])
    dev_data = prepare_dataset(dev_sentences, word_to_id, char_to_id,
                               tag_to_id, None, args["lowercase"])
    print("%i / %i sentences in train / dev." % (len(train_data), len(dev_data)))

    # Init model
    model = Model(len(word_to_id), len(char_to_id), len(tag_to_id), args)
    if args['gpus']['main'] >= 0:
        cuda.get_device_from_id(args['gpus']['main']).use()
        model.to_gpu()

    print('Saving the mappings to disk...')
    model.save_mappings(id_to_word, id_to_char, id_to_tag, args)

    if args['path_pre_emb']:
        print("Loading pretrained embeddings...")
        model.load_pretrained(args['path_pre_emb'])

    result_path = '../result/'

    # Init iterators
    train_iter = chainer.iterators.SerialIterator(train_data, model.batch_size)
    dev_iter = chainer.iterators.SerialIterator(dev_data, model.batch_size,
                                                repeat=False)

    # Reset the CRF cost matrix so that illegal tag transitions are penalised
    id_to_tag = model.id_to_tag
    cost = model.crf.cost.data
    model.crf.cost.data = load_cost_matrix(id_to_tag, cost)

    # Init optimizer
    optimizer = chainer.optimizers.Adam(model.lr_param)
    optimizer.setup(model)
    optimizer.add_hook(chainer.optimizer.GradientClipping(model.threshold))
    optimizer.add_hook(chainer.optimizer.WeightDecay(model.decay_rate))

    # Init early-stopping trigger on the dev F-score
    early_stopping_trigger = EarlyStoppingTrigger(
        args["epoch"], key='dev/main/fscore',
        eps=args["early_stopping_eps"],
        early_stopping=args["early_stopping"])

    # Init updater, trainer and evaluator
    updater = Updater(train_iter, optimizer, args['gpus'])
    trainer = training.Trainer(updater, stop_trigger=early_stopping_trigger,
                               out=result_path)
    trainer.extend(Evaluator(dev_iter, optimizer.target, args['gpus']))

    # Save the model whenever the dev F-score reaches a new maximum
    trainer.extend(
        extensions.snapshot_object(model, 'model_iter_{.updater.iteration}'),
        trigger=training.triggers.MaxValueTrigger('dev/main/fscore'))

    trainer.extend(extensions.LogReport())
    trainer.extend(extensions.PrintReport(
        ['epoch', 'main/loss', 'dev/main/loss',
         'main/accuracy', 'dev/main/accuracy', 'elapsed_time']))

    if extensions.PlotReport.available():
        # Plot loss, accuracy and F-score for each epoch
        trainer.extend(extensions.PlotReport(
            ['main/loss', 'dev/main/loss'],
            x_key='epoch', file_name='loss.png'))
        trainer.extend(extensions.PlotReport(
            ['main/accuracy', 'dev/main/accuracy'],
            x_key='epoch', file_name='accuracy.png'))
        trainer.extend(extensions.PlotReport(
            ['dev/main/fscore'],
            x_key='epoch', file_name='fscore.png'))

    trainer.run()
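# ``load_cost_matrix`` above is assumed to overwrite the CRF transition
# costs so that transitions which are illegal under IOBES (e.g. O -> I-X,
# or B-X followed by I-Y with X != Y) are strongly penalised.  A minimal
# sketch of that idea; the repository's actual helper may differ:
def load_cost_matrix(id_to_tag, cost, penalty=-1000.0):
    """Penalise illegal IOBES transitions in an (n_tags, n_tags) matrix."""
    n_tags = len(id_to_tag)
    for i in range(n_tags):
        for j in range(n_tags):
            prev, curr = id_to_tag[i], id_to_tag[j]
            # I-X and E-X may only follow B-X or I-X of the same entity type
            if curr[0] in ("I", "E") and not (
                    prev[0] in ("B", "I") and prev[1:] == curr[1:]):
                cost[i, j] = penalty
    return cost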
def main(config_path):
    args = parse_config(config_path)

    # Load sentences
    test_sentences = load_sentences(args["path_test"], args["replace_digit"])

    # Update tagging scheme (IOB/IOBES)
    update_tag_scheme(test_sentences, args["tag_scheme"])

    # Load mappings from disk and invert them for lookup
    id_to_word, id_to_char, id_to_tag = load_mappings(args["mappings_path"])
    word_to_id = {v: k for k, v in id_to_word.items()}
    char_to_id = {v: k for k, v in id_to_char.items()}
    tag_to_id = {v: k for k, v in id_to_tag.items()}

    # Index data
    test_data = prepare_dataset(test_sentences, word_to_id, char_to_id,
                                tag_to_id, None, args["lowercase"])
    test_iter = iterators.SerialIterator(test_data, args["batch_size"],
                                         repeat=False, shuffle=False)

    # Restore the trained model
    model = Model(len(word_to_id), len(char_to_id), len(tag_to_id), args)
    serializers.load_npz(args['path_model'], model)
    model.id_to_tag = id_to_tag
    model.parameters = args

    # Move the model to GPU if one is configured
    device = args['gpus']
    if device['main'] >= 0:
        cuda.get_device_from_id(device['main']).use()
        model.to_gpu()

    pred_tags = []
    gold_tags = []
    words = []

    # Collect predictions and write gold/predicted entities side by side
    all_true = {}
    all_pred = {}
    idx = 0
    with open(args['predictions_path'], "w", encoding="utf-8") as out:
        for ts, ys, xs in predict(test_iter, model, args['mode']):
            gold_tags.extend(ts)
            pred_tags.extend(ys)
            words.extend(xs)

            # For each sentence in the batch
            for i in range(len(xs)):
                true_entities = get_entities(xs[i], ts[i], id_to_tag)
                pred_entities = get_entities(xs[i], ys[i], id_to_tag)
                out.write("%s\t%s\n" % (
                    "|".join(["%s %s %s" % (v[1], v[2], v[3])
                              for v in true_entities]),
                    "|".join(["%s %s %s" % (v[1], v[2], v[3])
                              for v in pred_entities])))
                for sid, start, end, label in true_entities:
                    all_true[(idx, sid, start, end, label)] = 1
                for sid, start, end, label in pred_entities:
                    all_pred[(idx, sid, start, end, label)] = 1
                idx += 1

    calc_f(all_pred, all_true)
    evaluate(model, pred_tags, gold_tags, words)
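# ``calc_f`` above receives the two dicts keyed by
# (sentence index, sid, start, end, label), so exact entity matches are
# their key intersection.  Micro-averaged precision/recall/F1 would be
# computed roughly as below (a sketch; the real helper may report more):
def calc_f(all_pred, all_true):
    """Micro-averaged P/R/F1 over exact entity matches (illustrative)."""
    tp = len(set(all_pred) & set(all_true))
    precision = tp / len(all_pred) if all_pred else 0.0
    recall = tp / len(all_true) if all_true else 0.0
    fscore = (2 * precision * recall / (precision + recall)
              if precision + recall else 0.0)
    print("P: %.4f  R: %.4f  F: %.4f" % (precision, recall, fscore))
    return precision, recall, fscore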