def run_model_targetid(text_lines, model_variables):
    """Run target identification over `text_lines` (one document line each).

    Writes token offsets to `<model_dir>offsets.json`, prints predictions in
    CoNLL format to `<model_dir>predicted-targets.conll`, and returns the
    per-row token offsets.
    """
    builders = model_variables['builders']
    model_dir = model_variables['model_dir']

    instances = []
    offsets = []  # one entry per row: the offsets of each token in that row
    for row_idx, line in enumerate(text_lines):
        instance, offset = make_data_instance(line, row_idx, get_offsets=True)
        instances.append(instance)
        offsets.append(offset)

    # Drop empty rows before prediction (offsets keep the original row count).
    instances = [inst for inst in instances if inst]

    with open('{}offsets.json'.format(model_dir), 'w') as offsets_file:
        json.dump(offsets, offsets_file)

    out_conll_file = "{}predicted-targets.conll".format(model_dir)

    predictions = [
        identify_targets(builders, inst.tokens, inst.postags, inst.lemmas,
                         model_variables)[1]
        for inst in instances
    ]

    sys.stderr.write(
        "Printing output in CoNLL format to {}\n".format(out_conll_file))
    print_as_conll_targetid(instances, predictions, out_conll_file)
    sys.stderr.write("Done!\n")
    return offsets
def load_instances(filepath):
    """Read *filepath* as UTF-8 and build one data instance per line."""
    print("Loading Instances from {}".format(filepath))
    with codecs.open(filepath, 'r', encoding='utf-8') as infile:
        raw_lines = infile.readlines()
    return [make_data_instance(raw, idx) for idx, raw in enumerate(raw_lines)]
if options.mode in ["train", "refresh"]: dev_examples, _, _ = read_conll(DEV_CONLL) combined_dev = combine_examples(dev_examples) out_conll_file = "{}predicted-{}-targetid-dev.conll".format( model_dir, VERSION) elif options.mode == "test": dev_examples, m, t = read_conll(TEST_CONLL) combined_dev = combine_examples(dev_examples) out_conll_file = "{}predicted-{}-targetid-test.conll".format( model_dir, VERSION) elif options.mode == "predict": assert options.raw_input is not None with open(options.raw_input, "rb") as fin: instances = [ make_data_instance(line.decode(encoding='UTF-8', errors='strict'), i) for i, line in enumerate(fin) if line is not '\n' ] filename = options.raw_input.split('/')[-1].split('.')[0] out_conll_file = "{}/lingfn_comparison/predicted-targets_{}.conll".format( model_dir, filename) else: raise Exception("Invalid parser mode", options.mode) # Default configurations. configuration = { "train": train_conll, "unk_prob": 0.1, "dropout_rate": 0.01, "token_dim": 100, "pos_dim": 100,
# Id of the unknown-token symbol in the vocabulary dictionary.
UNKTOKEN = VOCDICT.getid(UNK)

# Select evaluation/prediction input and the output path for each parser mode.
# NOTE(review): the format strings append directly to model_dir, so model_dir
# presumably ends with a path separator — verify where it is set.
if options.mode in ["train", "refresh"]:
    dev_examples, _, _ = read_conll(DEV_CONLL)
    combined_dev = combine_examples(dev_examples)
    out_conll_file = "{}predicted-{}-targetid-dev.conll".format(
        model_dir, VERSION)
elif options.mode == "test":
    dev_examples, m, t = read_conll(TEST_CONLL)
    combined_dev = combine_examples(dev_examples)
    out_conll_file = "{}predicted-{}-targetid-test.conll".format(
        model_dir, VERSION)
elif options.mode == "predict":
    # Raw-text mode: build one data instance per input line.
    assert options.raw_input is not None
    with open(options.raw_input, "r") as fin:
        instances = [make_data_instance(line, i) for i, line in enumerate(fin)]
    out_conll_file = "{}predicted-targets.conll".format(model_dir)
else:
    raise Exception("Invalid parser mode", options.mode)

# Default configurations.
configuration = {
    "train": train_conll,
    "unk_prob": 0.1,        # presumably P(replace token with UNK) in training — TODO confirm
    "dropout_rate": 0.01,
    "token_dim": 100,       # embedding sizes for tokens / POS tags / lemmas
    "pos_dim": 100,
    "lemma_dim": 100,
    "lstm_input_dim": 100,
    "lstm_dim": 100,
    "lstm_depth": 2,
def load_instances(story):
    """Sentence-tokenize *story* and create one data instance per sentence."""
    instances = []
    for sent_idx, sentence in enumerate(nltk.sent_tokenize(story)):
        instances.append(make_data_instance(sentence, sent_idx))
    return instances