def _context_vec_path(data_file, emb_name):
    """Return the context-vector file path that belongs to *data_file*.

    The dependency-scheme markers are removed from the path (in the listed
    order, every occurrence) and ``.<emb_name>.vec`` is appended.
    """
    for marker in (".sd", ".ud", ".sud", ".predsd", ".predud", ".stud", ".ssd"):
        data_file = data_file.replace(marker, "")
    return data_file + "." + emb_name + ".vec"


def main():
    """Read the datasets, build the indices, then train or test.

    Steps, in order:
      1. Parse the command-line options and build the ``Config``.
      2. Read the train/dev/test CoNLL files (all training instances are
         read; ``conf.train_num`` is applied later, after shuffling).
      3. If a context embedding is configured, load the vectors for each
         split from the matching ``.vec`` file.
      4. Build label, dependency-label and word indices plus the embedding
         table, and map all instances to ids.
      5. Train when ``opt.mode == "train"``; otherwise run ``test_model``.
    """
    parser = argparse.ArgumentParser(
        description="Dependency-Guided LSTM CRF implementation")
    opt = parse_arguments(parser)
    conf = Config(opt)

    reader = Reader(conf.digit2zero)
    setSeed(opt, conf.seed)

    trains = reader.read_conll(conf.train_file, -1, True)
    devs = reader.read_conll(conf.dev_file, conf.dev_num, False)
    tests = reader.read_conll(conf.test_file, conf.test_num, False)

    if conf.context_emb != ContextEmb.none:
        print('Loading the {} vectors for all datasets.'.format(
            conf.context_emb.name))
        # Only the training split's return value is kept as the embedding size.
        conf.context_emb_size = reader.load_elmo_vec(
            _context_vec_path(conf.train_file, conf.context_emb.name), trains)
        reader.load_elmo_vec(
            _context_vec_path(conf.dev_file, conf.context_emb.name), devs)
        reader.load_elmo_vec(
            _context_vec_path(conf.test_file, conf.context_emb.name), tests)

    conf.use_iobes(trains + devs + tests)
    # Entity labels are indexed from the training data only; dependency
    # labels are indexed from all three splits.
    conf.build_label_idx(trains)
    conf.build_deplabel_idx(trains + devs + tests)
    print("# deplabels: ", len(conf.deplabels))
    print("dep label 2idx: ", conf.deplabel2idx)

    conf.build_word_idx(trains, devs, tests)
    conf.build_emb_table()
    conf.map_insts_ids(trains + devs + tests)

    print("num chars: " + str(conf.num_char))
    print("num words: " + str(len(conf.word2idx)))

    if opt.mode == "train":
        if conf.train_num != -1:
            # Sub-sample the training set: shuffle, then keep the first
            # ``train_num`` instances.
            random.shuffle(trains)
            trains = trains[:conf.train_num]
        learn_from_insts(conf, conf.num_epochs, trains, devs, tests)
    else:
        # Load the trained model.
        test_model(conf, tests)

    print(opt.mode)
def read_parse_write(elmo, infile, outfile, mode):
    """Compute one vector object per sentence of *infile* and pickle them.

    Args:
        elmo: embedder object handed through to ``parse_sentence``.
        infile: path of the CoNLL file to read (all instances are read).
        outfile: path of the pickle file to write; it is overwritten.
        mode: forwarded to ``parse_sentence`` as the ``mode`` keyword.

    The pickled object is a list with one entry per instance, in the order
    returned by the reader.
    """
    reader = Reader()
    insts = reader.read_conll(infile, -1, True)
    # The context manager guarantees the handle is closed even when
    # parse_sentence or pickle.dump raises.
    with open(outfile, 'wb') as f:
        all_vecs = [
            parse_sentence(elmo, inst.input.words, mode=mode)
            for inst in insts
        ]
        pickle.dump(all_vecs, f)
def main():
    """Read the CoNLL splits, build the indices and train if requested.

    Only the training instances are converted with ``use_iobes`` and mapped
    to ids here; dev and test instances contribute to the dependency-label
    and word indices only. Nothing but the final ``print`` happens when
    ``opt.mode`` is not ``"train"``.
    """
    arg_parser = argparse.ArgumentParser(
        description="Dependency-Guided LSTM CRF implementation")
    opt = parse_arguments(arg_parser)
    conf = Config(opt)

    reader = Reader(conf.digit2zero)
    setSeed(opt, conf.seed)

    train_insts = reader.read_conll(conf.train_file, -1, True)
    dev_insts = reader.read_conll(conf.dev_file, conf.dev_num, False)
    test_insts = reader.read_conll(conf.test_file, conf.test_num, False)

    conf.use_iobes(train_insts)
    conf.build_label_idx(train_insts)

    conf.build_deplabel_idx(train_insts + dev_insts + test_insts)
    deplabel_count = len(conf.deplabels)
    print("# deplabels: ", deplabel_count)
    print("dep label 2idx: ", conf.deplabel2idx)

    conf.build_word_idx(train_insts + dev_insts + test_insts)
    conf.build_emb_table()
    conf.map_insts_ids(train_insts)

    char_count = str(conf.num_char)
    word_count = str(len(conf.word2idx))
    print("num chars: " + char_count)
    print("num words: " + word_count)

    training = opt.mode == "train"
    if training and conf.train_num != -1:
        # Sub-sample: shuffle first, then keep the leading ``train_num``.
        random.shuffle(train_insts)
        train_insts = train_insts[:conf.train_num]
    if training:
        learn_from_insts(conf, conf.num_epochs, train_insts)

    print(opt.mode)
def read_parse_write(elmo, infile, outfile):
    """Embed every sentence of *infile* and pickle the per-sentence arrays.

    Args:
        elmo: embedder object handed through to ``embed_sent``.
        infile: path of the CoNLL file to read (all instances are read).
        outfile: path of the pickle file to write; it is overwritten.

    For each instance the token embeddings are stacked along a new leading
    axis, giving one array per sentence whose first dimension is the number
    of tokens returned by ``embed_sent``. The pickled object is the list of
    these arrays in reader order.
    """
    reader = Reader()
    insts = reader.read_conll(infile, -1, True)
    # The context manager guarantees the handle is closed even when
    # embedding or pickling raises.
    with open(outfile, 'wb') as f:
        all_vecs = []
        for inst in insts:
            sent = embed_sent(elmo, inst.input.words)
            # Same result as expand_dims(axis=0) on each token followed by
            # concatenate: one row per token.
            all_vecs.append(
                np.stack([token.embedding.numpy() for token in sent]))
        pickle.dump(all_vecs, f)
#
# @author: Allan
#

from config.reader import Reader

file = "data/ontonotes/train.sd.conllx"
digit2zero = False
reader = Reader(digit2zero)

# Read every instance of the training file.
insts = reader.read_conll(file, -1, True)
# devs = reader.read_conll(conf.dev_file, conf.dev_num, False)
# tests = reader.read_conll(conf.test_file, conf.test_num, False)

# Module-level accumulators; they start empty here.
out_dep_label2num = {}
out_doubledep2num = {}
out_word2num = {}

label2idx = {}


def not_entity(label: str) -> bool:
    """Return True when *label* starts with neither ``B-`` nor ``I-``."""
    return not label.startswith(("B-", "I-"))


def is_entity(label: str) -> bool:
    """Return True when *label* starts with ``B-`` or ``I-``."""
    return label.startswith(("B-", "I-"))
def main():
    """Read two datasets, build shared indices, then train or test.

    Each split (train/dev/test) is read once per dataset, tagged with the
    dataset index 0 or 1 in the ``read_conll`` call, and the two halves are
    concatenated. The per-dataset label tables built on ``conf`` are copied
    onto ``conf_conll`` (index 0) and ``conf_ontonotes`` (index 1) before
    training.
    """
    print('Reading arguments')
    arg_parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(arg_parser)

    conf = Config(opt)
    conf_conll = Config_conll(opt)
    conf_ontonotes = Config_ontonotes(opt)

    reader = Reader(conf.digit2zero)
    setSeed(opt, conf.seed)

    # Dataset 0.
    trains_0 = reader.read_conll(conf.train_file_1, 0, conf.train_num, True)
    devs_0 = reader.read_conll(conf.dev_file_1, 0, conf.dev_num, False)
    tests_0 = reader.read_conll(conf.test_file_1, 0, conf.test_num, False)
    # Dataset 1.
    trains_1 = reader.read_conll(conf.train_file_2, 1, conf.train_num, True)
    devs_1 = reader.read_conll(conf.dev_file_2, 1, conf.dev_num, False)
    tests_1 = reader.read_conll(conf.test_file_2, 1, conf.test_num, False)

    trains_all = trains_0 + trains_1
    devs_all = devs_0 + devs_1
    tests_all = tests_0 + tests_1

    if conf.context_emb != ContextEmb.none:
        print('Loading the elmo vectors for all datasets.')
        # NOTE(review): the vector files are derived from the *_file_1 paths
        # but are attached to the *_1 splits, which were read from the
        # *_file_2 paths -- confirm this pairing is intended.
        vec_suffix = "." + conf.context_emb.name + ".vec"
        conf.context_emb_size = reader.load_elmo_vec(
            conf.train_file_1 + vec_suffix, trains_1)
        vec_suffix = "." + conf.context_emb.name + ".vec"
        reader.load_elmo_vec(conf.dev_file_1 + vec_suffix, devs_1)
        vec_suffix = "." + conf.context_emb.name + ".vec"
        reader.load_elmo_vec(conf.test_file_1 + vec_suffix, tests_1)

    splits = (trains_all, devs_all, tests_all)
    for split in splits:
        conf.use_iobes(split)

    conf.build_label_idx(trains_all)
    conf.build_word_idx(trains_all, devs_all, tests_all)
    conf.build_emb_table()

    # The return values of map_insts_ids are not used in this function.
    for split in splits:
        conf.map_insts_ids(split)

    # Hand each dataset-specific config its own label tables.
    conf_conll.label_size = conf.label_size_0
    conf_conll.label2idx = conf.label2idx_0
    conf_conll.idx2labels = conf.idx2labels_0

    conf_ontonotes.label_size = conf.label_size_1
    conf_ontonotes.label2idx = conf.label2idx_1
    conf_ontonotes.idx2labels = conf.idx2labels_1

    char_count = str(conf.num_char)
    word_count = str(len(conf.word2idx))
    print("num chars: " + char_count)
    print("num words: " + word_count)

    if opt.mode != "train":
        # Load the trained model.
        test_model(conf, tests_all)
    else:
        learn_from_insts(conf, conf_conll, conf_ontonotes, conf.num_epochs,
                         trains_all, devs_all, tests_all)

    print(opt.mode)
if curr_entity.startswith("B-"): if next_entity.startswith("O") or next_entity.startswith("B-"): output[pos] = curr_entity.replace("B-", "S-") elif curr_entity.startswith("I-"): if next_entity.startswith("O") or next_entity.startswith("B-"): output[pos] = curr_entity.replace("I-", "E-") dataset = "ontonotes_chinese" train = "../data/"+dataset+"/train.sd.conllx" dev = "../data/"+dataset+"/dev.sd.conllx" test = "../data/"+dataset+"/test.sd.conllx" digit2zero = False reader = Reader(digit2zero) insts = reader.read_conll(train, -1, True) insts += reader.read_conll(dev, -1, False) insts += reader.read_conll(test, -1, False) use_iobes(insts) L = 3 def get_spans(output): output_spans = set() start = -1 for i in range(len(output)): if output[i].startswith("B-"): start = i if output[i].startswith("E-"): end = i output_spans.add(Span(start, end, output[i][2:]))
return list(self.nonterms_iter()) def __eq__(self, other): return other and self.pos == other.pos and self.children == other.children def __hash__(self): return hash((self.pos, self.children)) if __name__ == "__main__": '''###read the tree ''' from config.reader import Reader reader = Reader() insts = reader.read_conll("../data/abc/train.conllx", number=1) for inst in insts: nodes = [Tree(pos) for pos in range(len(inst.input.words))] root = Tree(-1) for pos, head in enumerate(inst.input.heads): if head != -1: nodes[head].add_child(nodes[pos]) else: root.add_child(nodes[pos]) inst.nodes = nodes for node in nodes: node.sort_children() print(root.leaves()) for pos, node in enumerate(nodes): if node.is_leaf():