def main(): print("start time:", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())) # create instance of config config = Config() config.dim_char = arg.dim_char config.hidden_size_char = arg.hidden_size_char config.hidden_size_lstm_1 = arg.hidden_size_lstm_1 config.hidden_size_lstm_2 = arg.hidden_size_lstm_2 config.batch_sample = arg.batch_sample config.elmo_scale = arg.elmo_scale config.lr_method = arg.lr_method config.batch_size = arg.batch_size config.learning_rate = arg.learning_rate config.decay_logic = arg.decay_logic config.run_name = arg.run_name # build model model = NERModel(config) model.build() # create datasets dev = CoNLLDataset(config.filename_dev, config.elmofile_dev, config.processing_word, config.processing_postags, config.generate_anchor, config.max_iter) train = CoNLLDataset(config.filename_train, config.elmofile_train, config.processing_word, config.processing_postags, config.generate_anchor, config.max_iter) # train model model.train(train, dev) print("end time:", time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
def main():
    config = Config()

    # ------------------------------------------------------------------
    # build model
    # ------------------------------------------------------------------
    model = NERModel(config)
    model.build()

    # ------------------------------------------------------------------
    # train mode
    # ------------------------------------------------------------------
    if config.mode == 'train':
        print('\n ... training model ... \n')
        test = CoNLLDataset(config.filename_test, config.processing_word,
                            config.processing_tag, config.max_iter)
        if config.periodic:
            split = CoNLLDataset(config.dummy_train, config.processing_word,
                                 config.processing_tag, config.max_iter)
        else:
            split = CoNLLDataset(config.train_split[config.split],
                                 config.processing_word,
                                 config.processing_tag, config.max_iter)
        model.train(split, test)

    # ------------------------------------------------------------------
    # retrain mode
    # ------------------------------------------------------------------
    if config.mode == 'retrain':
        print('\n ... retraining model ... \n')
        model.restore_session(config.dir_model)
        retrain = CoNLLDataset(config.filename_retrain, config.processing_word,
                               config.processing_tag, config.max_iter)
        test = CoNLLDataset(config.filename_test, config.processing_word,
                            config.processing_tag, config.max_iter)
        model.train(retrain, test)

def main():
    # get config and processing of words
    config = Config(load=False)
    processing_word = get_processing_word(lowercase=True)

    # Generators
    dev = CoNLLDataset(config.filename_dev, processing_word)
    test = CoNLLDataset(config.filename_test, processing_word)
    train = CoNLLDataset(config.filename_train, processing_word)

    # Build Word and Tag vocab
    vocab_words, vocab_tags = get_vocabs([train, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)
    vocab = [i for i in vocab_words if i in vocab_glove]
    vocab.append(UNK)
    vocab.append(NUM)
    vocab.append('</pad>')
    vocab_tags.append('</pad>')

    # Save vocab
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Trim GloVe vectors
    vocab = load_vocab(config.filename_words)
    export_trimmed_glove_vectors(vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)

    # Build and save char vocab
    train = CoNLLDataset(config.filename_train)
    vocab_chars = get_char_vocab(train)
    vocab_chars.append('</pad>')
    write_vocab(vocab_chars, config.filename_chars)

def evaluate():
    augment_pred = []

    # NERModel is used as a context manager so its session is released on exit
    with NERModel(config) as model:
        # create datasets
        augment = CoNLLDataset(config.filename_augment, config.processing_word,
                               config.processing_tag, config.max_iter)
        test = CoNLLDataset(config.filename_test, config.processing_word,
                            config.processing_tag, config.max_iter)

        # build model
        model.build()
        model.restore_session(config.dir_model)

        # evaluate
        model.logger.info("\nEvaluation on Test")
        model.evaluate(test)
        model.logger.info("\nEvaluation on Augment")
        model.evaluate(augment, augment_pred)
        # model.logger.debug(augment_pred)

    return augment_pred

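# A minimal usage sketch for evaluate() above. It assumes a module-level
# `config` (referenced inside evaluate() but never defined in this snippet)
# and simply inspects a few of the returned augment-set predictions:
if __name__ == "__main__":
    config = Config()
    augment_pred = evaluate()
    for prediction in augment_pred[:5]:  # look at the first few predictions
        print(prediction)
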
def main():
    # create instance of config
    config = Config()
    config.dim_char = arg.dim_char
    config.hidden_size_char = arg.hidden_size_char
    config.hidden_size_lstm_1 = arg.hidden_size_lstm_1
    config.hidden_size_lstm_2 = arg.hidden_size_lstm_2
    config.cls_hidden_size = arg.cls_hidden_size
    config.batch_sample = arg.batch_sample
    config.elmo_scale = arg.elmo_scale
    config.lr_method = arg.lr_method
    config.batch_size = arg.batch_size
    config.learning_rate = arg.learning_rate
    config.decay_logic = arg.decay_logic
    config.run_name = arg.run_name
    config.input_feature_dim = 600  # config.hidden_size_lstm * 2  # + 1024
    config.dir_saved_roi = arg.dir_saved_roi

    # build model
    model = NERModel(config)
    model.build()

    # create datasets
    config.filename_dev = config.dir_saved_roi + "dev_word_ids/"
    # config.filename_test = config.dir_saved_roi + "test_word_ids/"
    config.filename_train = config.dir_saved_roi + "train_word_ids/"
    dev = CoNLLDataset(config.filename_dev)
    print("Loading dev set done!")
    train = CoNLLDataset(config.filename_train)
    print("Loading train set done!")

    # train model
    model.train(train, dev, config.dev_total_entity)

def main():
    config = Config()
    # model.restore_session("results/crf/model.weights/")  # optional, restore weights
    # model.reinitialize_weights("proj")

    # create datasets [(char_ids), word_id]
    processing_word = get_processing_word(lowercase=False)
    dev = CoNLLDataset(config.filename_dev, processing_word)
    train = CoNLLDataset(config.filename_train, processing_word)
    test = CoNLLDataset(config.filename_test, processing_word)

    # collect every entity in the test set as a single token of the form
    # "ENTITY/word1_word2", then check how many appear in the GloVe vocab
    entities = []
    for raw_words, raw_tags in test:
        chunks = get_chunks_from_tags(raw_tags)
        for _, chunk_start, chunk_end in chunks:
            entity = 'ENTITY/'
            for i in range(chunk_start, chunk_end):
                if i == chunk_end - 1:
                    entity += raw_words[i]
                else:
                    entity = entity + raw_words[i] + '_'
            entities.append(entity)

    # print(len(entities))
    # print(entities)
    entities = set(entities)
    print(len(entities))
    vocab_glove = get_glove_vocab(config.filename_glove)
    print(len(entities & vocab_glove))

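# get_chunks_from_tags is not defined in this snippet. A minimal sketch of
# the usual implementation, assuming IOB-style string tags such as
# ["B-PER", "I-PER", "O", "B-LOC"]; it returns (type, start, end) triples
# with an exclusive end index and does not handle type changes inside an
# I- run:
def get_chunks_from_tags(tags):
    chunks, start = [], None
    for i, tag in enumerate(tags):
        if tag == 'O' or tag.startswith('B-'):
            if start is not None:
                chunks.append((chunk_type, start, i))  # close the open chunk
                start = None
        if tag.startswith('B-'):
            chunk_type, start = tag[2:], i
        elif tag.startswith('I-') and start is None:
            chunk_type, start = tag[2:], i  # tolerate chunks beginning with I-
    if start is not None:
        chunks.append((chunk_type, start, len(tags)))
    return chunks
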
def main():
    # create instance of config
    config = Config()

    # build model
    model = NERModel(config)
    model.build("train")
    model.restore_session(config.dir_model)

    # create dataset
    # processing_word = get_processing_word(lowercase=True)
    if len(sys.argv) == 2:
        if sys.argv[1] == 'test':
            test = CoNLLDataset(config.filename_test)
        elif sys.argv[1] == 'dev':
            test = CoNLLDataset(config.filename_dev)
    else:
        assert len(sys.argv) == 1
        test = CoNLLDataset(config.filename_test)

    test4cl = CoNLLdata4classifier(test, processing_word=config.processing_word,
                                   processing_tag=config.processing_tag)

    # evaluate and interact
    model.evaluate(test4cl)

def main():
    # create instance of config
    config = Config()

    dev = CoNLLDataset(config.filename_dev, config.processing_word,
                       config.processing_tag, config.max_iter)
    train = CoNLLDataset(config.filename_train, config.processing_word,
                         config.processing_tag, config.max_iter)
    test = CoNLLDataset(config.filename_test, config.processing_word,
                        config.processing_tag, config.max_iter)
    # the prediction file is unlabeled, so no processing_tag is passed
    predict = CoNLLDataset("data/source_data.txt", config.processing_word,
                           max_iter=config.max_iter)

    max_sequence_length = max(max([len(seq[0]) for seq in train]),
                              max([len(seq[0]) for seq in dev]),
                              max([len(seq[0]) for seq in test]),
                              max([len(seq[0]) for seq in predict]))
    max_word_length = max(
        max([len(word[0]) for seq in train for word in seq[0]]),
        max([len(word[0]) for seq in test for word in seq[0]]),
        max([len(word[0]) for seq in dev for word in seq[0]]))
    print(max_word_length, max_sequence_length)

    model = NERModel(config, max_word_length, max_sequence_length)
    model.build()
    model.restore_session(config.dir_model)
    model.run_predict(predict)

def main():
    # create instance of config; Config also takes care of loading the data:
    # it holds the vocabularies, the pretrained GloVe embedding matrix, and
    # the str -> id processing functions
    config = Config()

    # build model
    model = NERModel(config)
    model.build("train")
    # model.restore_session("results/crf/model.weights/")  # optional, restore weights
    # model.reinitialize_weights("proj")

    # create datasets [(char_ids), word_id]
    # processing_word = get_processing_word(lowercase=True)
    dev = CoNLLDataset(config.filename_dev)
    train = CoNLLDataset(config.filename_train)
    test = CoNLLDataset(config.filename_test)

    train4cl = CoNLLdata4classifier(train, processing_word=config.processing_word,
                                    processing_tag=config.processing_tag,
                                    context_length=config.context_length)
    dev4cl = CoNLLdata4classifier(dev, processing_word=config.processing_word,
                                  processing_tag=config.processing_tag,
                                  context_length=config.context_length)
    test4cl = CoNLLdata4classifier(test, processing_word=config.processing_word,
                                   processing_tag=config.processing_tag,
                                   context_length=config.context_length)

    # train model
    model.train(train4cl, dev4cl, test4cl)

def main():
    # create instance of config; Config also takes care of loading the data
    # (vocabularies, pretrained GloVe embedding matrix, str -> id functions)
    config = Config()
    config.nepochs = 200
    config.dropout = 0.5
    config.batch_size = 19
    config.lr_method = "adam"
    config.lr = 0.0001
    config.lr_decay = 1.0
    config.clip = -2.0  # if negative, no clipping
    config.nepoch_no_imprv = 5
    config.dir_model = config.dir_output + "model.finetuning.weights/"

    # build model
    model = NERModel(config)
    model.build("fine_tuning")
    model.restore_session("results/test/model.weights/", indicate="fine_tuning")
    # model.restore_session("results/crf/model.weights/")  # optional, restore weights
    # model.reinitialize_weights("proj")

    # create datasets [(char_ids), word_id]
    dev = CoNLLDataset(config.filename_dev, config.processing_word,
                       config.processing_tag, config.max_iter)
    train = CoNLLDataset(config.filename_train, config.processing_word,
                         config.processing_tag, config.max_iter)
    test = CoNLLDataset(config.filename_test, config.processing_word,
                        config.processing_tag, config.max_iter)

    # train model
    model.train(train, dev, test)

def main():
    # create instance of config
    config = Config()
    config.layer = int(sys.argv[1])
    config.step = int(sys.argv[2])

    if config.task == 'pos':
        print("USING POS")
        config.filename_train = "data/train.pos"  # test
        config.filename_dev = "data/dev.pos"
        config.filename_test = "data/test.pos"
    else:
        print("USING NER")

    print("iteration: " + str(config.layer))
    print("step: " + str(config.step))

    # build model
    model = NERModel(config)
    model.build()
    # model.restore_session("results/crf/model.weights/")  # optional, restore weights
    # model.reinitialize_weights("proj")

    # create datasets
    dev = CoNLLDataset(config.filename_dev, config.processing_word,
                       config.processing_tag, config.max_iter)
    train = CoNLLDataset(config.filename_train, config.processing_word,
                         config.processing_tag, config.max_iter)
    test = CoNLLDataset(config.filename_test, config.processing_word,
                        config.processing_tag, config.max_iter)

    # train model
    model.train(train, dev, test)

def main():
    # create instance of config
    config = Config()

    # build model
    model = BLSTMCRF(config)
    # model = Word_BLSTM(config)
    model.build()
    model.compile(optimizer=model.get_optimizer(), loss=model.get_loss())  # , metrics=['acc']
    # model.summary()

    # Loading weights
    # model.load_weights('./saves/test20.h5')

    # create datasets
    dev = CoNLLDataset(config.filename_train,  # filename_dev
                       config.processing_word,
                       config.processing_tag, config.max_iter)
    train = CoNLLDataset(config.filename_train, config.processing_word,
                         config.processing_tag, config.max_iter)

    model.summary()

    # train model
    model.train(train, dev)

    # Save model
    model.save_weights('./saves/test20.h5')

def main():
    # create instance of config
    config = Config()
    config.dir_model = config.dir_output + "model.finetuning.weights/"

    # build model
    model = NERModel(config)
    model.build("fine_tuning")
    model.restore_session(config.dir_model)

    # create dataset
    if len(sys.argv) == 2:
        if sys.argv[1] == 'test':
            test = CoNLLDataset(config.filename_test, config.processing_word,
                                config.processing_tag, max_length=None)
        elif sys.argv[1] == 'dev':
            test = CoNLLDataset(config.filename_dev, config.processing_word,
                                config.processing_tag, max_length=None)
    else:
        assert len(sys.argv) == 1
        test = CoNLLDataset(config.filename_test, config.processing_word,
                            config.processing_tag, max_length=None)

    # evaluate and interact
    model.evaluate(test)

def main(): """Procedure to build data You MUST RUN this procedure. It iterates over the whole dataset (train, dev and test) and extract the vocabularies in terms of words, tags, and characters. Having built the vocabularies it writes them in a file. The writing of vocabulary in a file assigns an id (the line #) to each word. It then extract the relevant GloVe vectors and stores them in a np array such that the i-th entry corresponds to the i-th word in the vocabulary. Args: config: (instance of Config) has attributes like hyper-params... """ # get config and processing of words config = Config(load=False) processing_word = get_processing_word(lowercase=True) # Generators dev = CoNLLDataset(config.filename_dev, processing_word, task=config.task) test = CoNLLDataset(config.filename_test, processing_word, task=config.task) train = CoNLLDataset(config.filename_train, processing_word, task=config.task) # Build Word and Tag vocab vocab_words, vocab_tags = get_vocabs([train, dev, test]) vocab_glove = get_glove_vocab(config.filename_glove) #TODO get word2vec vocab too vocab = vocab_words & vocab_glove vocab.add(UNK) vocab.add(NUM) # Save word and tag vocab write_vocab(vocab, config.filename_words) write_vocab(vocab_tags, config.filename_tags) # write and trim GloVe and word2vec Vectors vocab = load_vocab(config.filename_words) write_word2vec_to_txtfile(config.path_to_word2vec_bin_file, config.filename_word2vec) export_trimmed_word2vec_vectors(vocab, config.filename_word2vec, config.trimmed_word2vec_filename, config.dim_word) export_trimmed_glove_vectors(vocab, config.filename_glove, config.trimmed_glove_filename, config.dim_word) # Build and save char vocab train = CoNLLDataset(config.filename_train) vocab_chars = get_char_vocab(train) write_vocab(vocab_chars, config.filename_chars)
def main(): """Procedure to build data You MUST RUN this procedure. It iterates over the whole dataset (train, dev and test) and extract the vocabularies in terms of words, tags, and characters. Having built the vocabularies it writes them in a file. The writing of vocabulary in a file assigns an id (the line #) to each word. It then extract the relevant GloVe vectors and stores them in a np array such that the i-th entry corresponds to the i-th word in the vocabulary. Args: config: (instance of Config) has attributes like hyper-params... """ # get config and processing of words config = Config(load=False) processing_word = get_processing_word(lowercase=True) # 把字符全部小写,数字替换成NUM # Generators dev = CoNLLDataset(config.filename_dev, processing_word) # 创建一个生成器对象,每一次迭代产生tuple (words,tags) test = CoNLLDataset(config.filename_test, processing_word) # 返回一句话(words),和标签tags train = CoNLLDataset(config.filename_train, processing_word) #进一步处理数据 # Build Word and Tag vocab vocab_words, vocab_tags = get_vocabs([train, dev, test]) # word词表, tags表 print(len(vocab_words)) vocab_glove = get_glove_vocab(config.filename_glove) # glove词表 vocab = vocab_words & vocab_glove # & 求交集 set,都是集合 vocab.add(UNK) vocab.add(NUM) # 手动添加 print("len of vocab without entity: ", len(vocab)) # vocab_entity = entity2vocab(datasets=[train, dev, test]) # vocab.update(vocab_entity) # vocab = entity2vocab(datasets=[train, dev], vocab=vocab) # Save vocab write_vocab(vocab, config.filename_words) write_vocab(vocab_tags, config.filename_tags) # Trim GloVe Vectors vocab = load_vocab(config.filename_words) # 得到dict类型的vocab:{word:index} # 针对vocab,生成numpy的embedding文件,包含一个矩阵,对应词嵌入 export_trimmed_glove_vectors(vocab, config.filename_glove, config.filename_trimmed, config.dim_word) # Build and save char vocab 生成字母表, 这里没用到小写化的东西。只有文件本身。 train = CoNLLDataset(config.filename_train) vocab_chars = get_char_vocab(train) write_vocab(vocab_chars, config.filename_chars)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--train_lang', type=str, default='en')
    parser.add_argument('--dev_lang', type=str, default='en')
    parser.add_argument('--test_lang', type=str, default='en')
    parser.add_argument('--is_pos', type=int, default=0, help='NER or POS?')
    parser.add_argument('--dataset', type=str, default='conll2003', help='Dataset directory')
    parser.add_argument('--dir', type=str, default=None, help='Output directory')
    parser.add_argument('--use_chars', type=int, default=1, help='Use character LSTM or not')
    parser.add_argument('--epoch', type=int, default=30)
    parser.add_argument('--emb_type', type=str, default='word', help='word | trans | word_trans')
    parser.add_argument('--emb_dim', type=int, default=300, help='Dimension of word embeddings')
    parser.add_argument('--model_dir', type=str, default='data/output_model_config_fb_wikitext103/',
                        help='Transformer directory model')
    parser.add_argument('--layer', type=int, default=None, help='Select a single layer from Transformer')
    parser.add_argument('--trans_concat', type=str, default='all', help='all | sws | fws')
    parser.add_argument('--trans_dim', type=int, default=512, help='Transformer hidden size')
    parser.add_argument('--trans_layer', type=int, default=7, help='The total number of Transformer layers')
    parser.add_argument('--trans_type', type=str, default='monolingual', help='monolingual | crosslingual')
    parser.add_argument('--trans_vocab_src', type=str, default=None, help='Source language Transformer vocabulary')
    parser.add_argument('--trans_vocab_tgt', type=str, default=None, help='Target language Transformer vocabulary')
    args = parser.parse_args()

    # with tf.device('/cpu:0'):
    # create instance of config
    # print(args.use_attn, type(args.use_attn))
    langs = [args.train_lang, args.dev_lang, args.test_lang]
    # config = Config(mix_vocab=args.mix_vocab, use_crf=args.use_crf, mono_trans=args.mono_trans,
    #                 is_pos=args.is_pos, emb_dim=args.emb_dim, src_lang=args.train_lang,
    #                 tgt_lang=args.test_lang, no_glove=args.no_glove, select_layer=args.select_layer,
    #                 weighted_sum_full=args.weighted_sum_full, naive_proj=args.naive_proj,
    #                 highway=args.highway, weighted_sum=args.trans_weighted_sum,
    #                 trans_dim=args.trans_dim, dataset=args.dataset, trans_vocab=args.trans_vocab,
    #                 use_transformer=args.use_trans, dir_=args.dir, use_chars=args.use_chars,
    #                 use_attn=args.use_attn, char_init=args.char_init, model_dir=args.model_dir,
    #                 trans_to_output=args.trans_to_output, epoch=args.epoch)
    config = Config(args)

    # create datasets
    dev = CoNLLDataset(config.filename_dev, config.processing_word,
                       config.processing_tag, config.max_iter, lang=args.dev_lang)
    train = CoNLLDataset(config.filename_train, config.processing_word,
                         config.processing_tag, config.max_iter, lang=args.train_lang)
    test = CoNLLDataset(config.filename_test, config.processing_word,
                        config.processing_tag, config.max_iter, lang=args.test_lang)
    # n_vocab = len(config.vocab_trans)
    # n_ctx = max([dev.max_seq, train.max_seq, test.max_seq])

    # with tf.device('/cpu:0'):
    # build model
    model = NERModel(config)
    model.build()
    # model.restore_session("results/crf/model.weights/")  # optional, restore weights
    # model.reinitialize_weights("proj")

    # train model
    model.train(train, dev)

def main(): """Procedure to build data You MUST RUN this procedure. It iterates over the whole dataset (train, dev and test.py) and extract the vocabularies in terms of words, tags, and characters. Having built the vocabularies it writes them in a file. The writing of vocabulary in a file assigns an id (the line #) to each word. It then extract the relevant GloVe vectors and stores them in a np array such that the i-th entry corresponds to the i-th word in the vocabulary. Args: config: (instance of Config) has attributes like hyper-params... """ # get config and processing of words config = Config(load=False) processing_word = get_processing_word(lowercase=False) # Generators dev = CoNLLDataset(config.filename_dev, processing_word) # test.py = CoNLLDataset(config.filename_test, processing_word) 后面需要吧测试集的 也加进来 train = CoNLLDataset(config.filename_train, processing_word) # Build Word and Tag vocab vocab_words, vocab_tags = get_vocabs([train, dev]) # 这里先不加 get_glove_vocab # vocab_glove = get_glove_vocab(config.filename_glove) # vocab = vocab_words & vocab_glove vocab = vocab_words vocab.add(UNK) vocab.add(PAD) vocab.add(NUM) # Save vocab write_vocab(vocab, config.filename_words) write_vocab(vocab_tags, config.filename_tags) # Trim GloVe Vectors # vocab = load_vocab(config.filename_words) # export_trimmed_glove_vectors(vocab, config.filename_glove,config.filename_trimmed, config.dim_word) # Build and save char vocab train = CoNLLDataset(config.filename_train) vocab_chars_train = get_char_vocab(train) dev = CoNLLDataset(config.filename_dev) vocab_chars_dev = get_char_vocab(dev) vocab_chars_train_dev = list(vocab_chars_dev & vocab_chars_train) vocab_chars = [UNK, PAD, NUM] vocab_chars.extend(vocab_chars_train_dev) write_vocab(vocab_chars, config.filename_chars)
def main(i, al, filenameextra):
    # called from an iterator over active-learning rounds
    # create instance of config
    # config = Config()
    print("********Active training round ", i)

    # create datasets; dev and test always stay the same
    train_round = None
    select = None
    dev = CoNLLDataset(config.filename_dev, config.processing_word,
                       config.processing_tag, config.max_iter)
    test = CoNLLDataset(config.filename_test, config.processing_word,
                        config.processing_tag, config.max_iter)

    if i == 1:
        train = CoNLLDataset(config.filename_train, config.processing_word,
                             config.processing_tag, config.max_iter)
        train = list(train)
        train_round = train[0:config.num_query]
        select = train[config.num_query:len(train)]
    else:
        fn = open(config.filename_pkl + str(i), 'rb')
        train_round, select = pickle.load(fn)
        fn.close()

    print("Training size ", len(train_round))
    print("Number of left training samples ", len(select))

    modename = str(i) + "_" + al + "_" + filenameextra
    out = train_active(train_round, dev, test, select, config, modename)

    # sort the selection pool based on the scores returned by the strategy
    if config.active_strategy == "cluster":
        print('Scores from cluster ', out)
    else:
        if al == 'mu' or al == "mg":
            select = [x for _, x in sorted(zip(out, select))]
        elif al == 'lu':
            select = [x for _, x in sorted(zip(out, select), reverse=True)]
        elif al == 'rand':
            shuffle(select)

    num_samples = min(config.num_query, len(select))
    train_round += select[0:num_samples]
    select = select[num_samples:len(select)]
    shuffle(train_round)
    shuffle(select)

    i = i + 1
    fo = open(config.filename_pkl + str(i), 'wb')
    pickle.dump((train_round, select), fo)
    fo.close()

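# A minimal driver sketch for the round-based main() above, assuming a
# module-level `config`. The strategy name ('mu'), the filename suffix, and
# `config.num_rounds` are all illustrative assumptions, not part of the
# original snippet:
if __name__ == "__main__":
    for round_id in range(1, config.num_rounds + 1):
        main(round_id, 'mu', 'experiment1')
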
def main(): """Procedure to build data You MUST RUN this procedure. It iterates over the whole dataset (train, dev and test) and extract the vocabularies in terms of words, tags, and characters. Having built the vocabularies it writes them in a file. The writing of vocabulary in a file assigns an id (the line #) to each word. It then extract the relevant GloVe vectors and stores them in a np array such that the i-th entry corresponds to the i-th word in the vocabulary. Args: config: (instance of Config) has attributes like hyper-params... """ # get config and processing of words config = Config(load=False) if config.task == 'pos': print("USING POS") config.filename_train = "data/train.pos" # test config.filename_dev = "data/dev.pos" config.filename_test = "data/test.pos" else: print("USING NER") processing_word = get_processing_word(lowercase=True) # Generators dev = CoNLLDataset(config.filename_dev, processing_word) test = CoNLLDataset(config.filename_test, processing_word) train = CoNLLDataset(config.filename_train, processing_word) # Build Word and Tag vocab vocab_words, vocab_tags = get_vocabs([train, dev, test]) vocab_glove = get_glove_vocab(config.filename_glove) vocab = vocab_words & vocab_glove vocab.add(UNK) vocab.add(NUM) # Save vocab write_vocab(vocab, config.filename_words) write_vocab(vocab_tags, config.filename_tags) # Trim GloVe Vectors vocab = load_vocab(config.filename_words) export_trimmed_glove_vectors(vocab, config.filename_glove, config.filename_trimmed, config.dim_word) # Build and save char vocab train = CoNLLDataset(config.filename_train) vocab_chars = get_char_vocab(train) write_vocab(vocab_chars, config.filename_chars)
def train(config):
    # build model
    model = NERModel(config)
    model.build()

    # create datasets
    dev = CoNLLDataset(config.filename_dev, config.processing_word,
                       config.processing_tag, config.max_iter)
    train = CoNLLDataset(config.filename_train, config.processing_word,
                         config.processing_tag, config.max_iter)

    # train model
    model.train(train, dev)

def main(): """Procedure to build data You MUST RUN this procedure. It iterates over the whole dataset (train, dev and test) and extract the vocabularies in terms of words, tags, and characters. Having built the vocabularies it writes them in a file. The writing of vocabulary in a file assigns an id (the line #) to each word. It then extract the relevant GloVe vectors and stores them in a np array such that the i-th entry corresponds to the i-th word in the vocabulary. Args: config: (instance of Config) has attributes like hyper-params... """ # get config and processing of words config = Config(load=False) processing_word = get_processing_word(lowercase=True) # Generators dev = CoNLLDataset(config.filename_dev, processing_word) test = CoNLLDataset(config.filename_test, processing_word) train = CoNLLDataset(config.filename_train, processing_word) # Build Word and Tag vocab vocab_words, vocab_tags = get_vocabs([train, dev, test]) vocab_glove = get_glove_vocab(config.filename_glove) # 与glove中的词集合求交,只保留有向量的那些词 vocab = vocab_words & vocab_glove vocab.add(UNK) vocab.add(NUM) # Save vocab, vocab: set() print("write vocab set to file: " + config.filename_words) write_vocab(vocab, config.filename_words) print("write vocab tags set to file: " + config.filename_tags) write_vocab(vocab_tags, config.filename_tags) # Trim GloVe Vectors, 只加载那些在词集合中出现过的词向量 vocab_to_index_dict = load_vocab(config.filename_words) # vocab: dict, vocab[word] = word_index print("export trimmed vocab embedding to file: " + config.filename_trimmed) export_trimmed_glove_vectors(vocab_to_index_dict, config.filename_glove, config.filename_trimmed, config.dim_word) # Build and save char vocab train = CoNLLDataset(config.filename_train) vocab_chars = get_char_vocab(train) print("save char set to file:" + config.filename_chars) write_vocab(vocab_chars, config.filename_chars)
def main():
    # get config and processing of words
    config = Config(load=False)
    processing_word = get_processing_word(lowercase=True)
    processing_pos = get_processing_word()
    processing_chunk = get_processing_word()

    # Generators
    dev = CoNLLDataset(config.filename_dev, processing_word,
                       processing_pos, processing_chunk)
    test = CoNLLDataset(config.filename_test, processing_word,
                        processing_pos, processing_chunk)
    train = CoNLLDataset(config.filename_train, processing_word,
                         processing_pos, processing_chunk)

    # Build Word and Tag vocab
    vocab_words, vocab_tags, vocab_poses, vocab_chunks = get_vocabs(
        [train, dev, test])
    vocab_glove = get_glove_vocab(config.filename_glove)
    vocab = [i for i in vocab_words if i in vocab_glove]
    vocab.append(UNK)
    vocab.append(NUM)
    vocab.append("$pad$")
    vocab_poses.append("$pad$")
    vocab_chunks.append("$pad$")
    vocab_tags.append("$pad$")

    # Save vocab
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)
    write_vocab(vocab_poses, config.filename_poses)
    write_vocab(vocab_chunks, config.filename_chunks)

    # Trim GloVe vectors
    vocab = load_vocab(config.filename_words)
    print(len(vocab))
    export_trimmed_glove_vectors(vocab, config.filename_glove,
                                 config.filename_trimmed, config.dim_word)
    vocab = load_vocab(config.filename_poses)
    export_trimed_ont_hot_vectors(vocab, config.filename_pos_trimmed)
    vocab = load_vocab(config.filename_chunks)
    export_trimed_ont_hot_vectors(vocab, config.filename_chunk_trimmed)

    # Build and save char vocab
    train = CoNLLDataset(config.filename_train)
    vocab_chars = get_char_vocab(train)
    vocab_chars.append("$pad$")
    write_vocab(vocab_chars, config.filename_chars)

def main(): """Procedure to build data You MUST RUN this procedure. It iterates over the whole dataset (train, dev and test) and extract the vocabularies in terms of words, tags, and characters. Having built the vocabularies it writes them in a file. The writing of vocabulary in a file assigns an id (the line #) to each word. It then extract the relevant GloVe vectors and stores them in a np array such that the i-th entry corresponds to the i-th word in the vocabulary. Args: config: (instance of Config) has attributes like hyper-params... """ # get config and processing of words config = Config(load=False) processing_word = get_processing_word(lowercase=True) # Generators dev = CoNLLDataset(config.filename_dev, processing_word) test = CoNLLDataset(config.filename_test, processing_word) train = CoNLLDataset(config.filename_train, processing_word) # Build Word and Tag vocab (only from train!) vocab_words, vocab_freqs, vocab_tags = get_vocabs([train]) #, dev, test]) vocab_glove = get_glove_vocab(config.filename_glove) vocab = vocab_words & vocab_glove #vocab = make_unks(vocab, vocab_freqs, config.p_unk) #vocab.add(UNK) vocab.add(NUM) vocab = [UNK] + list(vocab) # Save vocab write_vocab(vocab, config.filename_words) write_vocab(vocab_tags, config.filename_tags) # get singletons singletons = [k for k, v in vocab_freqs.items() if v == 1] write_vocab(singletons, config.filename_singletons) # Trim GloVe Vectors vocab = load_vocab(config.filename_words) export_trimmed_glove_vectors(vocab, config.filename_glove, config.filename_trimmed, config.dim_word) # Build and save char vocab train = CoNLLDataset(config.filename_train) vocab_chars = get_char_vocab(train) write_vocab(vocab_chars, config.filename_chars)
def build(config):
    """Procedure to build data

    You MUST RUN this procedure. It iterates over the whole dataset
    (train, dev) and extracts the vocabularies in terms of words, tags,
    and characters. Having built the vocabularies it writes them in a
    file. The writing of vocabulary in a file assigns an id (the line #)
    to each word. It then extracts the relevant GloVe vectors and stores
    them in a np array such that the i-th entry corresponds to the i-th
    word in the vocabulary.

    Args:
        config: (instance of Config) has attributes like hyper-params...

    """
    # get config and processing of words
    # config = Config(load=False, args=args)
    processing_word = get_processing_word(lowercase=True)

    # Generators
    train = CoNLLDataset(config.filename_train, processing_word)
    vocab, _ = get_vocabs([train], config.min_count)
    vocab.insert(0, UNK)

    # keep the special flags at fixed positions at the front of the vocab
    special_flag = [NUM, NUU, FLT, FLU]
    for index, flag in enumerate(special_flag, 1):
        if flag in vocab:
            vocab.remove(flag)
            vocab.insert(index, flag)

    # Generators
    dev = CoNLLDataset(config.filename_dev, processing_word)
    # test = CoNLLDataset(config.filename_test, processing_word)
    train = CoNLLDataset(config.filename_train, processing_word)

    # Build Word and Tag vocab
    _, vocab_tags = get_vocabs([train, dev])

    # Save vocab
    write_vocab(vocab, config.filename_words)
    write_vocab(vocab_tags, config.filename_tags)

    # Build and save char vocab
    train = CoNLLDataset(config.filename_train)
    vocab_chars = get_char_vocab(train)
    vocab_chars.insert(0, UNK)
    write_vocab(vocab_chars, config.filename_chars)

def main():
    # create instance of config
    config = Config()
    pretrain_path = ("/home/yinghong/project/tmp/s_t/ray_results/final/exp-final-epoch30"
                     "/train_func_0_2018-06-16_01-24-13vmtghosb")
    config_path = os.path.join(pretrain_path, "params.json")

    # override the config with the hyper-parameters of the pretrained run
    import json
    with open(config_path) as fin:
        content = fin.read().replace('\n', '')
    j = json.loads(content)
    for key, val in j.items():
        setattr(config, key, val)

    # build model
    model = NERModel(config)
    model.build()
    model.restore_session(os.path.join(
        pretrain_path,
        "results/tmptmptest/bz=10-training-bieo-nocnn/model.weights/"))

    # create dataset
    # test = CoNLLDataset(config.filename_test, config.processing_word,
    #                     config.processing_tag, config.max_iter)
    dev = CoNLLDataset(config.filename_dev, config.processing_word,
                       config.processing_tag, config.max_iter)

    # evaluate and interact
    model.tmp(dev, outfile="result-dev.txt")
    interactive_shell(model)

def main():
    # create instance of config
    config = Config()
    config.dim_char = arg.dim_char
    config.hidden_size_char = arg.hidden_size_char
    config.hidden_size_lstm_1 = arg.hidden_size_lstm_1
    config.hidden_size_lstm_2 = arg.hidden_size_lstm_2
    config.batch_sample = arg.batch_sample
    config.elmo_scale = arg.elmo_scale
    config.lr_method = arg.lr_method
    config.batch_size = arg.batch_size
    config.learning_rate = arg.learning_rate
    config.decay_logic = arg.decay_logic
    config.run_name = arg.run_name

    # build model
    model = NERModel(config)
    model.build()
    model.restore_session(config.dir_model + config.run_name + '/')

    # create dataset
    test = CoNLLDataset(config.filename_test, config.elmofile_test,
                        config.processing_word, config.processing_postags,
                        config.generate_anchor, config.max_iter)

    model.evaluate(test)

def main():
    # create instance of config
    config = Config()
    if config.use_elmo:
        config.processing_word = None

    # build model
    model = NERModel(config)

    # create datasets
    dev = CoNLLDataset(config.filename_dev, config.processing_word,
                       config.processing_tag, config.max_iter, config.use_crf)
    train = CoNLLDataset(config.filename_train, config.processing_word,
                         config.processing_tag, config.max_iter, config.use_crf)

    learn = NERLearner(config, model)
    learn.fit(train, dev)

def main():
    # create instance of config
    dir_output = "./results/" + sys.argv[2] + "/"
    config = Config(dir_output, load=False)
    config.filename_words = "./data/words_" + sys.argv[2] + ".txt"
    config.filename_chars = "./data/chars_" + sys.argv[2] + ".txt"
    config.filename_tags = "./data/tags_" + sys.argv[2] + ".txt"
    # config.dir_output = "./results/" + sys.argv[2] + "/"
    config.dir_model = config.dir_output + "model.weights/"
    config.path_log = config.dir_output + "log.txt"
    # config.filename_dev = sys.argv[1]
    config.filename_test = sys.argv[1]
    # config.filename_train = sys.argv[3]
    config.filename_pred = sys.argv[1].replace(".txt", ".pred")
    config.load()

    # build model
    model = NERModel(config)
    model.build()
    model.restore_session(config.dir_model)

    # create dataset
    # test = CoNLLDataset(config.filename_test, config.processing_word,
    #                     config.processing_tag, config.max_iter)
    test = CoNLLDataset(sys.argv[1], config.processing_word,
                        config.processing_tag, config.max_iter)

    # evaluate and interact
    model.evaluate(test)

def main():
    # predictions with the first-stage model
    config_first = Config(dir_output='./results/train_first/')
    model = NERModel(config_first)
    model.build()
    model.restore_session(config_first.dir_model)

    test = CoNLLDataset(config_first.filename_test,
                        config_first.processing_word,
                        config_first.processing_tag,
                        config_first.max_iter)

    print()
    print('Predicting first stage!')
    model.evaluate(test)
    print()

    test_predictions = model.predict_test(test)
    formatted_predictions = format_predictions(test_predictions, 'test',
                                               config_first)

    # predictions with the second-stage model
    tf.reset_default_graph()
    config_second = Config(dir_output='./results/train_second/')
    model = NERModel2(config_second)
    model.build()
    model.restore_session(config_second.dir_model)

    print()
    print('Predicting second stage!')
    model.evaluate(formatted_predictions)
    print()

def main():
    # create instance of config
    config = Config()

    # build model
    model = NERModel(config)
    model.build()
    # model.restore_session("results/crf/model.weights/")  # optional, restore weights
    # model.reinitialize_weights("proj")

    # create datasets
    dev = CoNLLDataset(config.filename_dev, max_iter=config.max_iter)
    train = CoNLLDataset(config.filename_train, max_iter=config.max_iter)

    # train model
    model.train(train, dev)