def main():
    """Train the NER model from scratch or retrain a saved session.

    ``config.mode`` selects the branch:
      - 'train':   fit on a training split (the dummy split when
                   ``config.periodic`` is set, else the configured fold).
      - 'retrain': restore the saved weights and continue training on the
                   retrain file.
    """
    config = Config()

    # Build the computation graph once; both modes reuse it.
    model = NERModel(config)
    model.build()

    if config.mode == 'train':
        print('\n ... training model ... \n')
        test = CoNLLDataset(config.filename_test, config.processing_word,
                            config.processing_tag, config.max_iter)
        # Periodic runs use a fixed dummy split; otherwise select the
        # configured fold from the list of training splits.
        if config.periodic:
            split = CoNLLDataset(config.dummy_train, config.processing_word,
                                 config.processing_tag, config.max_iter)
        else:
            split = CoNLLDataset(config.train_split[config.split],
                                 config.processing_word,
                                 config.processing_tag, config.max_iter)
        model.train(split, test)
    elif config.mode == 'retrain':
        print('\n ... retraining model ... \n')
        model.restore_session(config.dir_model)
        retrain = CoNLLDataset(config.filename_retrain,
                               config.processing_word,
                               config.processing_tag, config.max_iter)
        test = CoNLLDataset(config.filename_test, config.processing_word,
                            config.processing_tag, config.max_iter)
        model.train(retrain, test)
def __init__(self):
    """Build an NER model with enlarged embeddings and restore its weights
    from ``self.MODEL_DIR``."""
    cfg = Config()
    # Override embedding sizes before the graph is built.
    cfg.dim_word = 250
    cfg.dim_char = 50
    self.config = cfg

    model = NERModel(cfg)
    model.build()
    model.restore_session(self.MODEL_DIR)
    self.model = model
def main():
    """Load all splits, size the model from the data, train and evaluate."""
    config = Config()

    def load(filename):
        # All three splits share the same processing pipeline.
        return CoNLLDataset(filename, config.processing_word,
                            config.processing_tag, config.processing_pos,
                            config.processing_chunk, config.max_iter)

    dev = load(config.filename_dev)
    train = load(config.filename_train)
    test = load(config.filename_test)

    # Longest sentence (in words) and longest word (in characters) across
    # all splits; these fix the model's padded tensor dimensions.
    splits = (train, dev, test)
    max_sequence_length = max(len(seq[0])
                              for split in splits for seq in split)
    max_word_length = max(len(word[0])
                          for split in splits
                          for seq in split
                          for word in seq[0])
    print(max_word_length, max_sequence_length)

    model = NERModel(config, max_word_length, max_sequence_length)
    model.build()
    model.train(train, dev)
    model.restore_session(config.dir_model)
    model.evaluate(test)
def main():
    """Build a fresh NER model and train it on the configured datasets.

    NOTE(review): no ``model.build()`` call here, unlike the other entry
    points -- presumably ``train`` builds the graph itself; confirm.
    """
    config = Config()

    model = NERModel(config)
    # Optional warm start (kept for reference):
    # model.restore_session("results/crf/model.weights/")
    # model.reinitialize_weights("proj")

    model.train(train=config.dataset_train, dev=config.dataset_dev)
def main():
    """Run 5-fold cross-validation over the training set and save the
    out-of-fold predictions.

    For each fold a fresh graph/model is trained on the other four folds
    and used to predict the held-out fold; per-sentence predictions are
    assembled in original order, formatted, and written to disk.
    """
    config = Config('./results/train_folds/')
    train_predictions_file = './data/predictions/formatted_train_predictions.npy'

    kf = KFold(n_splits=5)
    train = CoNLLDataset(config.filename_train, config.processing_word,
                         config.processing_tag, config.max_iter)
    train = np.array([el for el in train])
    # One slot per training sentence, filled with out-of-fold tags.
    predictions = [0 for _ in train]
    for train_ids, evaluate_ids in kf.split(train):
        train_dataset = train[train_ids]
        evaluate_dataset = train[evaluate_ids]
        # Start from a clean graph so folds do not share variables.
        tf.reset_default_graph()
        config = Config('./results/train_folds/')
        model = NERModel(config)
        model.build()
        model.train(train_dataset, evaluate_dataset)
        # `idx` rather than `id`: the original shadowed the builtin.
        for idx, tags in zip(evaluate_ids,
                             model.predict_test(evaluate_dataset)):
            predictions[idx] = tags
        model.close_session()
    predictions = np.array(predictions)
    formatted_predictions = format_predictions(predictions, 'train', config)
    np.save(train_predictions_file, formatted_predictions)
def main():
    """Evaluate the fine-tuned model on the test or dev split.

    Usage: ``script [test|dev]``; with no argument the test split is used.

    Raises:
        ValueError: if the single argument is neither 'test' nor 'dev'.
            (The original left ``test`` unbound on this path and crashed
            with a NameError at evaluation time.)
    """
    config = Config()
    config.dir_model = config.dir_output + "model.finetuning.weights/"

    model = NERModel(config)
    model.build("fine_tuning")
    model.restore_session(config.dir_model)

    # Pick the dataset file from the command line (default: test split).
    if len(sys.argv) == 2:
        if sys.argv[1] == 'test':
            filename = config.filename_test
        elif sys.argv[1] == 'dev':
            filename = config.filename_dev
        else:
            raise ValueError("expected 'test' or 'dev', got %r" % sys.argv[1])
    else:
        assert len(sys.argv) == 1
        filename = config.filename_test
    test = CoNLLDataset(filename, config.processing_word,
                        config.processing_tag, config.max_iter)

    model.evaluate(test)
def main():
    """Restore a trained model and evaluate it on a CoNLL file.

    argv[1] is the config file; optional argv[2] overrides the test set.
    """
    config_file = sys.argv[1]
    config = Config(config_file)
    print("dir model : ", config.dir_model)

    model = NERModel(config)
    model.build()
    model.restore_session(config.dir_model)
    # model.reinitialize_weights("words")
    # model.reinitialize_weights("chars")
    # model.reinitialize_weights("train_step")

    # A second argument selects an alternative evaluation file; otherwise
    # fall back to the test set named in the config.
    if len(sys.argv) > 2:
        test_file_name = sys.argv[2]
    else:
        test_file_name = config.filename_test
    test = CoNLLDataset(test_file_name, config.processing_word,
                        config.processing_tag, config.max_iter)
    print("Testing on ", test_file_name, "..")

    # model.predict_test(test, output=sys.stdout)
    model.evaluate(test)
def main():
    """Fine-tune a restored NER model with a customized hyperparameter set.

    The Config instance also performs the data loading: it holds the
    vocabularies, the pretrained GloVe embedding matrix and the
    string-to-id processing functions.
    """
    config = Config()
    # Hyperparameter overrides for the fine-tuning run.
    config.nepochs = 200
    config.dropout = 0.5
    config.batch_size = 40
    config.lr_method = "adam"
    config.lr = 0.0007
    config.lr_decay = 0.97
    config.clip = -5.0  # if negative, no clipping
    config.nepoch_no_imprv = 20

    model = NERModel(config)
    model.build("fine_tuning")
    model.restore_session(config.dir_model)
    # model.restore_session("results/crf/model.weights/")  # optional
    # model.reinitialize_weights("proj")

    # Datasets yield [(char_ids), word_id] pairs.
    dev = CoNLLDataset(config.filename_dev, config.processing_word,
                       config.processing_tag, config.max_iter)
    train = CoNLLDataset(config.filename_train, config.processing_word,
                         config.processing_tag, config.max_iter)

    model.train(train, dev)
def pretrain():
    """Restore a pretrained run and dump its predictions on the dev set.

    The hyperparameters saved alongside the pretrained checkpoint
    (params.json) are overlaid on the default Config before the graph is
    built, so the restored weights match the architecture.
    """
    import json

    config = Config()
    pretrain_path = "/home/yinghong/project/tmp/s_t_rollback/ray_results/06" \
                    "-19/01-HasCNN/try5"
    # pretrain_path = "/home/yinghong/project/tmp/s_t_rollback/ray_results/06-19/best-HasCNN/try4"
    # reverse = True
    # cv = False

    # Overlay the saved hyperparameters onto the default config.
    config_path = os.path.join(pretrain_path, "params.json")
    with open(config_path) as fin:
        content = fin.read().replace('\n', '')
    for key, val in json.loads(content).items():
        setattr(config, key, val)

    model = NERModel(config)
    model.build()
    model.restore_session(
        os.path.join(
            pretrain_path,
            "results/tmptmptest/bz=10-training-"
            "bieo-nocnn/model.weights/"))

    # NOTE(review): the original also constructed an unused `test` dataset
    # here; it has been removed (only `dev` is consumed below).
    dev = CoNLLDataset(config.filename_dev, config.processing_word,
                       config.processing_tag, config.max_iter)

    model.tmp(dev, outfile="result-test-google85.63.txt")
def main():
    """Restore a trained model and evaluate it on the test set.

    argv[1] -> config.layer, argv[2] -> config.step.  When the configured
    task is POS tagging, the data files are switched to the .pos splits.
    """
    config = Config()
    config.layer = int(sys.argv[1])
    config.step = int(sys.argv[2])

    if config.task == 'pos':
        print("USING POS")
        config.filename_train = "data/train.pos"  # test
        config.filename_dev = "data/dev.pos"
        config.filename_test = "data/test.pos"
    else:
        print("USING NER")
        print("iteration: " + str(config.layer))
        print("step: " + str(config.step))

    model = NERModel(config)
    model.build()
    model.restore_session(config.dir_model)

    test = CoNLLDataset(config.filename_test, config.processing_word,
                        config.processing_tag, config.max_iter)

    model.evaluate(test)
def main():
    """Size a model from all datasets, restore its weights and run
    prediction over an untagged source file."""
    # create instance of config
    config = Config()

    dev = CoNLLDataset(config.filename_dev, config.processing_word,
                       config.processing_tag, config.max_iter)
    train = CoNLLDataset(config.filename_train, config.processing_word,
                         config.processing_tag, config.max_iter)
    test = CoNLLDataset(config.filename_test, config.processing_word,
                        config.processing_tag, config.max_iter)
    # NOTE(review): only three positional arguments are passed here, so
    # config.max_iter lands in the slot the calls above use for
    # processing_tag -- confirm CoNLLDataset's signature supports this
    # untagged/prediction form.
    predict = CoNLLDataset("data/source_data.txt", config.processing_word,
                           config.max_iter)

    # Longest sentence across all four datasets (fixes padding length).
    max_sequence_length = max(max([len(seq[0]) for seq in train]),
                              max([len(seq[0]) for seq in dev]),
                              max([len(seq[0]) for seq in test]),
                              max([len(seq[0]) for seq in predict]))
    # Longest word in characters.  NOTE(review): unlike the sequence
    # length, the predict dataset is NOT included here -- verify that this
    # is intended.
    max_word_length = max(
        max([len(word[0]) for seq in train for word in seq[0]]),
        max([len(word[0]) for seq in test for word in seq[0]]),
        max([len(word[0]) for seq in dev for word in seq[0]]))
    print(max_word_length, max_sequence_length)

    model = NERModel(config, max_word_length, max_sequence_length)
    model.build()
    model.restore_session(config.dir_model)
    model.run_predict(predict)
def main():
    """Restore a pretrained run (hyperparameters from its params.json),
    write dev-set predictions to a file, and open an interactive shell."""
    import json

    config = Config()
    pretrain_path = "/home/yinghong/project/tmp/s_t/ray_results/final/exp-final-epoch30" \
                    "/train_func_0_2018-06-16_01-24-13vmtghosb"

    # Overlay the saved hyperparameters onto the default config.
    config_path = os.path.join(pretrain_path, "params.json")
    with open(config_path) as fin:
        content = fin.read().replace('\n', '')
    for key, val in json.loads(content).items():
        setattr(config, key, val)

    model = NERModel(config)
    model.build()
    model.restore_session(
        os.path.join(
            pretrain_path,
            "results/tmptmptest/bz=10-training-"
            "bieo-nocnn/model.weights/"))

    # test = CoNLLDataset(config.filename_test, config.processing_word,
    #                     config.processing_tag, config.max_iter)
    dev = CoNLLDataset(config.filename_dev, config.processing_word,
                       config.processing_tag, config.max_iter)

    model.tmp(dev, outfile="result-dev.txt")
    interactive_shell(model)
def main():
    """Train a classifier on top of a restored NER model.

    The Config instance performs the data loading: it holds the
    vocabularies, the pretrained GloVe embedding matrix and the
    string-to-id processing functions.
    """
    config = Config()

    model = NERModel(config)
    model.build("train")
    model.restore_session(config.dir_model)
    # model.restore_session("results/crf/model.weights/")  # optional
    # model.reinitialize_weights("proj")

    # Raw datasets are read with a lowercasing word processor, then wrapped
    # for the classifier with the config's own processors.
    processing_word = get_processing_word(lowercase=True)
    dev = CoNLLDataset(config.filename_dev, processing_word)
    train = CoNLLDataset(config.filename_train, processing_word)
    test = CoNLLDataset(config.filename_test, processing_word)

    def wrap(dataset):
        return CoNLLdata4classifier(dataset,
                                    processing_word=config.processing_word,
                                    processing_tag=config.processing_tag)

    train4cl = wrap(train)
    dev4cl = wrap(dev)
    test4cl = wrap(test)

    model.train(train4cl, dev4cl, test4cl)
def main2():
    """Continue training from the weights of a pretrained SGD run."""
    config = Config()

    model = NERModel(config)
    model.build()
    pretrain_path = "/home/yinghong/project/tmp/s_t/ray_results/final/" \
                    "exp-final-epoch30-sgd/train_func_0_2018-06-15_14-18-14bqpn6jv1"
    # Warm-start from the checkpoint saved by the pretraining run.
    model.restore_session(os.path.join(pretrain_path,
                                       "results/tmptmptest/bz=10-training-"
                                       "bieo-nocnn/model.weights/"))
    # model.restore_session("results/crf/model.weights/")  # optional
    # model.reinitialize_weights("proj")

    dev_set = CoNLLDataset(config.filename_dev, config.processing_word,
                           config.processing_tag, config.max_iter)
    train_set = CoNLLDataset(config.filename_train, config.processing_word,
                             config.processing_tag, config.max_iter)

    model.train(train_set, dev_set)
def main():
    """Evaluate a restored model on the test or dev split.

    Usage: ``script [test|dev]``; with no argument the test split is used.

    Raises:
        ValueError: if the single argument is neither 'test' nor 'dev'.
            (The original left ``test`` unbound on this path and crashed
            with a NameError at evaluation time.)
    """
    config = Config()

    model = NERModel(config)
    model.build("train")
    model.restore_session(config.dir_model)

    # Pick the dataset file from the command line (default: test split).
    if len(sys.argv) == 2:
        if sys.argv[1] == 'test':
            filename = config.filename_test
        elif sys.argv[1] == 'dev':
            filename = config.filename_dev
        else:
            raise ValueError("expected 'test' or 'dev', got %r" % sys.argv[1])
    else:
        assert len(sys.argv) == 1
        filename = config.filename_test
    test = CoNLLDataset(filename, config.processing_word,
                        config.processing_tag, max_length=None)

    model.evaluate(test)
def main():
    """Evaluate a model on pre-extracted ROI features, with hyperparameters
    taken from the command-line ``arg`` namespace."""
    config = Config()

    # Copy run hyperparameters from the parsed command line.
    for name in ('dim_char', 'hidden_size_char', 'hidden_size_lstm_1',
                 'hidden_size_lstm_2', 'cls_hidden_size', 'batch_sample',
                 'elmo_scale', 'lr_method', 'batch_size', 'learning_rate',
                 'decay_logic', 'run_name'):
        setattr(config, name, getattr(arg, name))
    config.input_feature_dim = 600  # config.hidden_size_lstm * 2 #+ 1024
    config.dir_saved_roi = arg.dir_saved_roi

    model = NERModel(config)
    model.build()
    model.restore_session(config.dir_model + config.run_name + '/')

    # Test word ids live under the saved-ROI directory.
    config.filename_test = config.dir_saved_roi + "test_word_ids/"
    test = CoNLLDataset(config.filename_test)

    model.evaluate(test, config.test_total_entity)
def main():
    """Evaluate a stored model on an arbitrary CoNLL file.

    argv[1] is the file to evaluate; argv[2] is the run name used to
    locate vocabularies under ./data/ and weights under ./results/.
    """
    run_name = sys.argv[2]
    dir_output = "./results/" + run_name + "/"
    config = Config(dir_output, load=False)
    # Point the config at this run's vocabularies and outputs, then load.
    config.filename_words = "./data/words_" + run_name + ".txt"
    config.filename_chars = "./data/chars_" + run_name + ".txt"
    config.filename_tags = "./data/tags_" + run_name + ".txt"
    # config.dir_output = "./results/" + sys.argv[2] + "/"
    config.dir_model = config.dir_output + "model.weights/"
    config.path_log = config.dir_output + "log.txt"
    # config.filename_dev = sys.argv[1]
    config.filename_test = sys.argv[1]
    # config.filename_train = sys.argv[3]
    config.filename_pred = sys.argv[1].replace(".txt", ".pred")
    config.load()

    model = NERModel(config)
    model.build()
    model.restore_session(config.dir_model)

    # test = CoNLLDataset(config.filename_test, config.processing_word,
    #                     config.processing_tag, config.max_iter)
    test = CoNLLDataset(sys.argv[1], config.processing_word,
                        config.processing_tag, config.max_iter)

    model.evaluate(test)
def train_func(_config, reporter):
    """Ray Tune training entry point.

    Optionally warm-starts from a pretrained checkpoint whose saved
    hyperparameters (params.json) are overlaid on the default config.
    ``train`` and ``dev`` are expected to be available in the enclosing
    scope.
    """
    # tf.reset_default_graph()
    config = Config()
    # for (key, val) in _config.items():
    #     setattr(config, key[3:], val)
    # config["dir_output"] = ""
    setattr(config, "dir_output", "pretrain")
    setattr(config, "nepochs", 50)
    setattr(config, "batch_size", 80)

    pretrain_path = _config["30-pretrain_path"]
    pretrain_mode = _config["31-pretrain_mode"]

    if pretrain_mode:
        import json
        # Overlay the saved hyperparameters onto the default config.
        config_path = os.path.join(pretrain_path, "params.json")
        with open(config_path) as fin:
            content = fin.read().replace('\n', '')
        for key, val in json.loads(content).items():
            setattr(config, key, val)

    model = NERModel(config)
    model.build()
    if pretrain_mode:
        model.restore_session(os.path.join(pretrain_path,
                                           "results/tmptmptest/bz=10-training-"
                                           "bieo-nocnn/model.weights/"))
    model.train(train, dev, reporter)
def main():
    """Evaluate a restored model (wrapped for the classifier) on the test
    or dev split.

    Usage: ``script [test|dev]``; with no argument the test split is used.

    Raises:
        ValueError: if the single argument is neither 'test' nor 'dev'.
            (The original left ``test`` unbound on this path and crashed
            with a NameError below.)
    """
    config = Config()

    model = NERModel(config)
    model.build("train")
    model.restore_session(config.dir_model)

    # Raw dataset uses a lowercasing word processor; the classifier wrapper
    # applies the config's own processors.
    processing_word = get_processing_word(lowercase=True)
    if len(sys.argv) == 2:
        if sys.argv[1] == 'test':
            test = CoNLLDataset(config.filename_test, processing_word)
        elif sys.argv[1] == 'dev':
            test = CoNLLDataset(config.filename_dev, processing_word)
        else:
            raise ValueError("expected 'test' or 'dev', got %r" % sys.argv[1])
    else:
        assert len(sys.argv) == 1
        test = CoNLLDataset(config.filename_test, processing_word)

    test4cl = CoNLLdata4classifier(test,
                                   processing_word=config.processing_word,
                                   processing_tag=config.processing_tag)

    model.evaluate(test4cl)
def main():
    """Evaluate a trained model on the ELMo-augmented test set, with
    hyperparameters taken from the command-line ``arg`` namespace."""
    config = Config()
    # Copy run hyperparameters from the parsed command line.
    for name in ('dim_char', 'hidden_size_char', 'hidden_size_lstm_1',
                 'hidden_size_lstm_2', 'batch_sample', 'elmo_scale',
                 'lr_method', 'batch_size', 'learning_rate', 'decay_logic',
                 'run_name'):
        setattr(config, name, getattr(arg, name))

    model = NERModel(config)
    model.build()
    model.restore_session(config.dir_model + config.run_name + '/')

    test = CoNLLDataset(config.filename_test, config.elmofile_test,
                        config.processing_word, config.processing_postags,
                        config.generate_anchor, config.max_iter)

    model.evaluate(test)
def __init__(self, load_lstm):
    """Build and restore the sequence-tagging model.

    When ``load_lstm`` is true, the tagger's package directory is appended
    to ``sys.path`` before the model modules are imported.
    """
    import sys
    if load_lstm:
        # The tagger lives outside the normal package path.
        sys.path.append('/home/rbshaffer/sequence_tagging')

    from model.ner_model import NERModel
    from model.config import Config

    config = Config()
    self.model = NERModel(config)
    self.model.build()
    self.model.restore_session(config.dir_model)
def main():
    """Restore a single model or an ensemble and print the predicted tags
    for every sentence in the test file (blank line between sentences)."""
    config = Config()

    # An ensemble exposes the same predict() interface as a single model.
    if config.ensembles:
        model = Ensemble(config)
    else:
        model = NERModel(config)
    model.build()
    model.restore_session(config.dir_model)

    test = CoNLLDataset(config.filename_test, config.processing_word)
    # config.processing_tag, config.max_iter)

    # model.evaluate(test)
    for words, _ in test:
        # One predicted tag per line, blank line after each sentence.
        for tag in model.predict(words):
            print(tag)
        print()
def main():
    """Evaluate and/or predict with a trained learner.

    Usage: ``script [eval] [pred [sentence]]``.  With no arguments both
    evaluation and a demo prediction are run.
    """
    config = Config()
    # ELMo computes its own word representations.
    if config.use_elmo:
        config.processing_word = None

    model = NERModel(config)
    learn = NERLearner(config, model)
    learn.load()

    if len(sys.argv) == 1:
        print("No arguments given. Running full test")
        sys.argv.append("eval")
        sys.argv.append("pred")

    if sys.argv[1] == "eval":
        test = CoNLLDataset(config.filename_test, config.processing_word,
                            config.processing_tag, config.max_iter)
        learn.evaluate(test)

    # Guard the argv[2] lookup: the original read sys.argv[2]
    # unconditionally and crashed with an IndexError when the script was
    # invoked with only "eval".
    if sys.argv[1] == "pred" or (len(sys.argv) > 2 and sys.argv[2] == "pred"):
        try:
            sent = (sys.argv[2] if sys.argv[1] == "pred" else sys.argv[3])
        except IndexError:
            # Demo sentence when none was supplied.
            sent = ["Peter", "Johnson", "lives", "in", "Los", "Angeles"]
        print("Predicting sentence: ", sent)
        pred = learn.predict(sent)
        print(pred)
class NerProcessor(PostProcessor):
    """Post-processor that restores entity capitalization for the ar2en
    translation model; all other models pass through unchanged."""

    # Shared across instances; loaded once at class-definition time.
    ner_model = NERModel()

    def process(self, pred, source, model_id):
        """Return ``pred``, capitalization-fixed when ``model_id`` is
        'ar2en'."""
        if model_id != 'ar2en':
            return pred
        return self.ner_model.entity_capitalization(pred)
def main():
    """Build an NER model and train it on the configured train/dev splits."""
    config = Config()

    model = NERModel(config)
    model.build()
    # Optional warm start (kept for reference):
    # model.restore_session("results/crf/model.weights/")
    # model.reinitialize_weights("proj")

    def load(filename):
        # Both splits share the same processing pipeline.
        return CoNLLDataset(filename, config.processing_word,
                            config.processing_tag, config.max_iter)

    dev = load(config.filename_dev)
    train = load(config.filename_train)

    model.train(train, dev)
def main(predict_file, save_file):
    """Restore a trained model sized to ``predict_file`` and write its
    predictions to ``save_file``."""
    # create instance of config
    config = Config()

    # NOTE(review): only three positional arguments are passed here, so
    # config.max_iter lands in the slot other call sites use for
    # processing_tag -- confirm CoNLLDataset's signature supports this
    # untagged/prediction form.
    predict = CoNLLDataset(predict_file, config.processing_word,
                           config.max_iter)

    # Longest sentence (in words) and longest word (in characters) in the
    # input; these size the model's padded tensors.
    max_sequence_length = max([len(seq[0]) for seq in predict])
    max_word_length = max([len(word[0]) for seq in predict
                           for word in seq[0]])
    print(max_word_length, max_sequence_length)

    model = NERModel(config, max_word_length, max_sequence_length)
    model.build()
    model.restore_session(config.dir_model)
    model.run_predict(predict, save_file)
def main():
    """Dump dev/test features from a restored model (ELMo + BERT inputs),
    with hyperparameters taken from the command-line ``arg`` namespace."""
    config = Config()
    # Copy run hyperparameters from the parsed command line.
    for name in ('dim_char', 'hidden_size_char', 'hidden_size_lstm_1',
                 'hidden_size_lstm_2', 'batch_sample', 'elmo_scale',
                 'lr_method', 'batch_size', 'learning_rate', 'decay_logic',
                 'run_name', 'dir_saved_roi'):
        setattr(config, name, getattr(arg, name))

    model = NERModel(config)
    model.build()
    model.restore_session(config.dir_model + config.run_name + '/')

    def load(filename, elmofile, bertfile):
        # All splits share the same processing pipeline.
        return CoNLLDataset(filename, elmofile, bertfile,
                            config.processing_word,
                            config.processing_postags,
                            config.generate_anchor, config.max_iter)

    dev = load(config.filename_dev, config.elmofile_dev, config.bertfile_dev)
    # train = load(config.filename_train, config.elmofile_train,
    #              config.bertfile_train)
    test = load(config.filename_test, config.elmofile_test,
                config.bertfile_test)

    # model.dump(train, 'train')
    print("Dump Train feature done!")
    model.dump(dev, 'dev')
    print("Dump Dev feature done!")
    model.dump(test, 'test')
    print("Dump Test feature done!")
def fit(config, embedder, train, dev):
    """Train an NER model that consumes whole-word embeddings (no BPE)."""
    # Name the run after the embedder class; LASER features are disabled.
    config.set_model_name(embedder.__class__.__name__)
    config.use_laser = False
    pad_len = 0  # no BPE fragments used

    model = NERModel(config, embedder, pad_len)

    learner = NERLearner(config, model, pad_len, pad_len)
    learner.fit(train, dev)
def buildModel():
    """Load the evaluation model into the module-global ``globalModel``
    and restore its latest checkpoint."""
    global globalModel
    config = Config()

    # The config flag selects between the projection variant and the
    # plain model; both expose the same build/restore interface.
    model_cls = (ProjectionNERModel if config.use_embedding_proj_pred
                 else NERModel)
    globalModel = model_cls(config)
    globalModel.build()
    globalModel.restore_latest_session(config.dir_model_evaluate)
    return 'Success'
def main():
    """Restore a trained NER model, optionally encode the SICK dataset
    with it, and evaluate with an active-learning split."""
    config = Config()

    model = NERModel(config)
    model.build()
    model.restore_session(config.dir_model)

    # Test split for evaluation, a training split for active learning,
    # and the SICK corpus for sentence encoding.
    test = CoNLLDataset(config.filename_test, config.processing_word,
                        config.processing_tag, config.max_iter)
    dev = CoNLLDataset(config.train_split[config.sample_split],
                       config.processing_word, config.processing_tag,
                       config.max_iter)
    sick = CoNLLDataset(config.filename_sick, config.processing_word,
                        config.processing_scores, config.max_iter)

    # Encode SICK with the pretrained NER encoder when requested.
    if config.encode:
        model.get_encoded(sick)

    # Threshold selection for active learning (kept for reference):
    # threshold = 20
    # model.get_threshold(test, threshold)

    model.evaluate(test, dev, "test")
def main(data_prefix=None):
    """Train an NER model; ``data_prefix`` selects prefixed variants of
    the configured data files (data/<prefix>_<basename>)."""
    config = Config()

    model = NERModel(config)
    model.build()
    # Optional warm start (kept for reference):
    # model.restore_session("results/crf/model.weights/")
    # model.reinitialize_weights("proj")

    if data_prefix:
        cwd = os.getcwd()

        def prefixed(filename):
            # data/<prefix>_<basename>, anchored at the working directory.
            return os.path.join(
                cwd, 'data', data_prefix + '_' + os.path.basename(filename))

        config.filename_dev = prefixed(config.filename_dev)
        config.filename_test = prefixed(config.filename_test)
        config.filename_train = prefixed(config.filename_train)

    dev = CoNLLDataset(config.filename_dev, config.processing_word,
                       config.processing_tag, config.max_iter)
    train = CoNLLDataset(config.filename_train, config.processing_word,
                         config.processing_tag, config.max_iter)

    print('training')
    model.train(train, dev)
def main():
    """Evaluate a restored model on the test set, then drop into an
    interactive prediction shell."""
    config = Config()

    model = NERModel(config)
    model.build()
    model.restore_session(config.dir_model)

    test_set = CoNLLDataset(config.filename_test, config.processing_word,
                            config.processing_tag, config.max_iter)

    model.evaluate(test_set)
    interactive_shell(model)