# --- Annotation setup (fragment: l_map, c_map, f_map, jd, checkpoint_file,
#     args and in_doc_words are defined in earlier, unseen, parts of the
#     original script) ---
# Build the LM-LSTM-CRF tagger from the hyper-parameters stored in the
# checkpoint's arg dict `jd`, then restore its trained weights.
print('loading model')
ner_model = LM_LSTM_CRF(len(l_map), len(c_map), jd['char_dim'],
                        jd['char_hidden'], jd['char_layers'], jd['word_dim'],
                        jd['word_hidden'], jd['word_layers'], len(f_map),
                        jd['drop_out'], args.dataset_no,
                        # NOTE(review): `small_crf` feeds `large_CRF` -- confirm the flag's polarity
                        large_CRF=jd['small_crf'],
                        if_highway=jd['high_way'],
                        in_doc_words=in_doc_words,
                        highway_layers=jd['highway_layers'])
ner_model.load_state_dict(checkpoint_file['state_dict'])
# Move the model to the requested GPU; args.gpu < 0 means stay on CPU.
# The packer is told whether tensors will live on CUDA.
if args.gpu >= 0:
    if_cuda = True
    torch.cuda.set_device(args.gpu)
    ner_model.cuda()
    packer = CRFRepack_WC(len(l_map), True)
else:
    if_cuda = False
    packer = CRFRepack_WC(len(l_map), False)
# decode_label: True -> couple labels with text, False -> insert labels into text
decode_label = (args.decode_type == 'label')
predictor = predict_wc(if_cuda, f_map, c_map, l_map, f_map['<eof>'],
                       c_map['\n'], l_map['<pad>'], l_map['<start>'],
                       decode_label, args.batch_size, jd['caseless'])
# loading corpus: one token per line, a blank line ends a sentence; only the
# first whitespace-separated column of each line is used as the word.
print('loading corpus')
lines = []
features = []
with codecs.open(args.input_file, 'r', 'utf-8') as f:
    for line in f:
        if line == '\n':
            features.append(utils.read_features(lines))
            lines = []
            continue
        tmp = line.split()
        lines.append(tmp[0])
# NOTE(review): a trailing sentence without a final blank line is never
# appended to `features` -- verify whether input files always end blank.
# NOTE(review): the body of this loop is missing in this chunk (truncated).
for idx in range(args.dataset_no):
# --- Training-loop preamble (fragment: file_num, best_pre, packer, l_map,
#     char_map, f_map, if_cuda, args, ner_model and tot_length come from
#     earlier, unseen, parts of the original script) ---
# Per-task best-score trackers, initialised to -inf so any real score wins.
for i in range(file_num):
    best_pre.append(float('-inf'))
best_rec = []
for i in range(file_num):
    best_rec.append(float('-inf'))
track_list = list()
start_time = time.time()
epoch_list = range(args.start_epoch, args.start_epoch + args.epoch)
patience_count = 0
evaluator = eval_wc(packer, l_map, args.eva_matrix)
predictor = predict_wc(if_cuda, f_map, char_map, l_map, f_map['<eof>'],
                       char_map['\n'], l_map['<pad>'], l_map['<start>'],
                       True, args.batch_size, args.caseless)  #NEW
# NOTE(review): the loop target rebinds args.start_epoch each iteration, so
# the tqdm description below always shows the current epoch number.
for epoch_idx, args.start_epoch in enumerate(epoch_list):
    # NOTE(review): hard-coded to 1 -- presumably a debug override of the
    # real per-epoch sample count; confirm before training for real.
    sample_num = 1
    epoch_loss = 0
    ner_model.train()
    # NOTE(review): the body of this inner loop is missing in this chunk
    # (truncated).
    for sample_id in tqdm(range(sample_num), mininterval=2,
                          desc=' - Tot it %d (epoch %d)' % (tot_length, args.start_epoch),
                          leave=False, file=sys.stdout):
# --- Single-file annotation (fragment: checkpoint_file, args, jd, l_map,
#     c_map and f_map come from earlier, unseen, parts of the original
#     script) ---
in_doc_words = checkpoint_file['in_doc_words']
if args.gpu >= 0:
    torch.cuda.set_device(args.gpu)
# loading corpus
print('loading corpus')
with codecs.open(args.input_file, 'r', 'utf-8') as f:
    lines = f.readlines()
# converting format: the whole file is parsed into features in one pass
features = utils.read_features(lines)
# build model from the checkpoint's stored hyper-parameters (`jd`)
print('loading model')
ner_model = LM_LSTM_CRF(len(l_map), len(c_map), jd['char_dim'],
                        jd['char_hidden'], jd['char_layers'], jd['word_dim'],
                        jd['word_hidden'], jd['word_layers'], len(f_map),
                        jd['drop_out'],
                        # NOTE(review): `small_crf` feeds `large_CRF` -- confirm polarity
                        large_CRF=jd['small_crf'],
                        if_highway=jd['high_way'],
                        in_doc_words=in_doc_words,
                        highway_layers=jd['highway_layers'])
ner_model.load_state_dict(checkpoint_file['state_dict'])
if args.gpu >= 0:
    if_cuda = True
    # NOTE(review): set_device was already called above -- harmless repeat
    torch.cuda.set_device(args.gpu)
    ner_model.cuda()
else:
    if_cuda = False
# decode_label: True -> couple labels with text, False -> insert labels into text
decode_label = (args.decode_type == 'label')
predictor = predict_wc(if_cuda, f_map, c_map, l_map, f_map['<eof>'],
                       c_map['\n'], l_map['<pad>'], l_map['<start>'],
                       decode_label, args.batch_size, jd['caseless'])
# Decode the whole corpus and write the annotated output in one pass.
print('annotating')
with open(args.output_file, 'w') as fout:
    predictor.output_batch(ner_model, features, fout)
def get_multi_task_model(
        prefix='multi_task_tagger/data/pkl_files/num_gen_cas_tense_lem/50k_',
        args_file='multi_task_tagger/data/pkl_files/num_gen_cas_tense_lem/50k_all_data.pkl',
        order_pkl_file='num-gen-cas-tense-lem',
        order='num-gen-cas-tense-lem',
        num_tasks=5,
        output_directory='test_data/poetry/',
        eva_matrix='a',
        word_dim=200,
        char_dim=30,
        char_layers=1,
        word_layers=1,
        small_crf=True,
        checkpoint='multi_task_tagger/checkpoints_new/layered_num_gen_cas_tense_lem_15_May/',
        word_hidden=256,
        batch_size=16,
        char_hidden=100,
        load_check_point=True,
        do_eval=True,
        checkpoint_file='multi_task_tagger/checkpoints_new/layered_num_gen_cas_tense_lem_15_May/cwlm_lstm_crf_cas_2.model',
        out_files=None):
    """Build the multi-task LM-LSTM-CRF model and one predictor per task.

    Loads per-task train/dev/test pickles named ``<prefix>{train,dev,test}_<i>.pkl``
    (in the task order requested via `order`), restores the args/label-map/
    embedding dictionary pickled in `args_file`, overrides its hyper-parameters
    with the keyword arguments above, constructs the model and (optionally)
    restores a checkpoint.

    Returns:
        (predictor_list, ner_model): one predict_wc per task plus the model.

    Notes:
        - `out_files` and `output_directory` are accepted for backward
          compatibility but currently unused.
        - Optimizer, loss criteria and best-score trackers are built for
          parity with the training script but are not returned.
    """
    # Map each task name in `order_pkl_file` to the index of its pkl file,
    # then translate the requested `order` into pkl-file indices.
    assert len(order_pkl_file.split('-')) == len(order.split('-'))
    pkl_order_dict = {}
    for i, o in enumerate(order_pkl_file.split('-')):
        pkl_order_dict[o] = i
    reorder_index = [pkl_order_dict[o] for o in order.split('-')]
    print("Re-ordering list is ")
    print(reorder_index)
    save_filename = args_file
    print("Number of tasks : " + str(num_tasks))
    print("Order of the tasks is " + order)
    print("CRF type -- " + str(small_crf))
    file_prefix = prefix
    # Build fresh DataLoaders for every task, in the requested task order.
    new_dataset_loader = []
    new_dev_dataset_loader = []
    new_test_dataset_loader = []
    for i in reorder_index:
        dataset, forw_corp, back_corp = utils.load_data_pkl_file(
            file_prefix + 'train_' + str(i) + ".pkl")
        dev_dataset, forw_dev, back_dev = utils.load_data_pkl_file(
            file_prefix + 'dev_' + str(i) + ".pkl")
        test_dataset, forw_test, back_test = utils.load_data_pkl_file(
            file_prefix + 'test_' + str(i) + ".pkl")
        new_dataset_loader.append([
            torch.utils.data.DataLoader(tup, batch_size, shuffle=True,
                                        drop_last=False, num_workers=40)
            for tup in dataset
        ])
        new_dev_dataset_loader.append([
            torch.utils.data.DataLoader(tup, 50, shuffle=False,
                                        drop_last=False)
            for tup in dev_dataset
        ])
        new_test_dataset_loader.append([
            torch.utils.data.DataLoader(tup, 50, shuffle=False,
                                        drop_last=False)
            for tup in test_dataset
        ])
    print('Loading data dictionary from file ' + save_filename)
    with open(save_filename, 'rb') as fp:
        d = pickle.load(fp)
    args = d['args']
    args.gpu = 0  # force device 0 regardless of the pickled value
    label_maps = d['label_maps']
    char_map = d['char_map']
    f_map = d['f_map']
    in_doc_words = d['in_doc_words']
    embedding_tensor = d['embedding_tensor']
    # The caller's task count wins over d['file_num'] (which the original
    # code read and then immediately discarded).
    file_num = num_tasks
    print('Shape of Embedding Tensor')
    print(embedding_tensor.shape)
    # Reorder label_maps to match the requested task order.
    label_maps = reorder_list(label_maps, reorder_index)
    args.checkpoint = checkpoint
    # Override the pickled args with the caller-supplied hyper-parameters.
    args.word_hidden = word_hidden
    args.char_hidden = char_hidden
    args.word_dim = word_dim
    args.char_dim = char_dim
    args.char_layers = char_layers
    args.word_layers = word_layers
    args.small_crf = small_crf
    args.eva_matrix = eva_matrix
    args.load_check_point = load_check_point
    args.do_eval = do_eval
    args.checkpoint_file = checkpoint_file
    print(args.word_hidden)
    print("Will save checkpoint in " + str(args.checkpoint))
    # Inverse feature map (id -> word), kept for debugging parity.
    inv_f_map = {v: k for k, v in f_map.items()}
    print(f_map['<unk>'])
    args.output_annotation = True
    print("Number of files : " + str(file_num))
    # Use the freshly built loaders, not the (stale) pickled ones.
    dataset_loader = new_dataset_loader
    dev_dataset_loader = new_dev_dataset_loader
    test_dataset_loader = new_test_dataset_loader
    if args.gpu >= 0:
        torch.cuda.set_device(args.gpu)
    print(args)
    args.batch_size = batch_size
    # build model
    print('building model')
    print(label_maps)
    label_maps_sizes = [len(lmap) for lmap in label_maps]
    print(label_maps_sizes)
    print('File_num' + str(file_num))
    ner_model = LM_LSTM_CRF(label_maps_sizes, len(char_map), args.char_dim,
                            args.char_hidden, args.char_layers, args.word_dim,
                            args.word_hidden, args.word_layers, len(f_map),
                            args.drop_out, file_num,
                            large_CRF=args.small_crf,
                            if_highway=args.high_way,
                            in_doc_words=in_doc_words,
                            highway_layers=args.highway_layers)
    # Restore checkpoint weights, or initialise from scratch / embeddings.
    if args.load_check_point:
        print(args.checkpoint_file)
        if os.path.isfile(args.checkpoint_file):
            print("loading checkpoint: '{}'".format(args.checkpoint_file))
            checkpoint_file = torch.load(args.checkpoint_file)
        else:
            raise FileNotFoundError('File not found')
        ner_model.load_state_dict(checkpoint_file['state_dict'])
    else:
        if not args.rand_embedding:
            ner_model.load_pretrained_word_embedding(embedding_tensor)
        ner_model.rand_init(init_word_embedding=args.rand_embedding)
    if args.update == 'sgd':
        optimizer = optim.SGD(ner_model.parameters(), lr=args.lr,
                              momentum=args.momentum)
    elif args.update == 'adam':
        optimizer = optim.Adam(ner_model.parameters(), lr=args.lr)
    if args.load_check_point and args.load_opt:
        optimizer.load_state_dict(checkpoint_file['optimizer'])
    # One CRF loss per task.
    crit_lm = nn.CrossEntropyLoss()
    crit_ner_list = nn.ModuleList()
    for i in range(file_num):
        ith_label_map = label_maps[i]
        crit_ner_list.append(CRFLoss_vb(len(ith_label_map),
                                        ith_label_map['<start>'],
                                        ith_label_map['<pad>']))
    # Move everything to GPU when requested; packers carry the CUDA flag.
    if args.gpu >= 0:
        if_cuda = True
        print('device: ' + str(args.gpu))
        torch.cuda.set_device(args.gpu)
        crit_lm.cuda()
        for i in range(file_num):
            crit_ner_list[i].cuda()
        ner_model.cuda()
    else:
        if_cuda = False
    packer_list = [CRFRepack_WC(len(label_maps[i]), if_cuda)
                   for i in range(file_num)]
    tot_length = sum(len(t) for t in dataset_loader)
    # Per-task best-score trackers (unused here, kept for training parity).
    best_f1 = [float('-inf')] * file_num
    best_pre = [float('-inf')] * file_num
    best_rec = [float('-inf')] * file_num
    best_acc = [float('-inf')] * file_num
    track_list = []
    start_time = time.time()
    print('Num of epochs : ' + str(args.epoch))
    epoch_list = range(args.start_epoch, args.start_epoch + args.epoch)
    patience_count = 0
    # One evaluator + predictor per task.
    evaluator_list = []
    predictor_list = []
    for i in range(file_num):
        evaluator_list.append(eval_wc(packer_list[i], label_maps[i],
                                      args.eva_matrix))
        predictor_list.append(predict_wc(
            if_cuda, f_map, char_map, label_maps[i], f_map['<eof>'],
            char_map['\n'], label_maps[i]['<pad>'], label_maps[i]['<start>'],
            True, args.batch_size, args.caseless))  #NEW
    return predictor_list, ner_model
def load_pretrain_model(file_path):
    """Load a pretrained LM-LSTM-CRF checkpoint and annotate ``file_path``/test.tsv.

    Builds the model described by the checkpoint's stored arg json, decodes
    the token-per-line (blank-line-separated) corpus, and writes one output
    file per dataset index.

    Returns:
        Path of the last annotation file written
        (``args.output_file + str(idx) + '.txt'``).

    NOTE: parser.parse_args() reads sys.argv, so command-line flags of the
    host program leak into (or abort) this function -- known limitation.
    """
    print("CSCI548 model loading")
    parser = argparse.ArgumentParser(description='Evaluating LM-BLSTM-CRF')
    parser.add_argument('--load_arg',
                        default='./checkpoint/cwlm_lstm_crf.json',
                        help='path to arg json')
    parser.add_argument('--load_check_point',
                        default='./checkpoint/cwlm_lstm_crf.model',
                        help='path to model checkpoint file')
    parser.add_argument('--gpu', type=int, default=-1, help='gpu id')
    parser.add_argument(
        '--decode_type', choices=['label', 'string'], default='label',
        help=
        'type of decode function, set `label` to couple label with text, or set `string` to insert label into test'
    )
    parser.add_argument('--batch_size', type=int, default=50,
                        help='size of batch')
    parser.add_argument('--input_file', default=file_path + "/test.tsv",
                        help='path to input un-annotated corpus')
    parser.add_argument('--output_file', default='annotate/output',
                        help='path to output file')
    parser.add_argument('--dataset_no', type=int, default=1,
                        help='number of the datasets')
    args = parser.parse_args()

    print('loading dictionary')
    with open(args.load_arg, 'r') as f:
        jd = json.load(f)
    jd = jd['args']
    # map_location keeps GPU-saved tensors on CPU so loading works without CUDA
    checkpoint_file = torch.load(args.load_check_point,
                                 map_location=lambda storage, loc: storage)
    f_map = checkpoint_file['f_map']
    l_map = checkpoint_file['l_map']
    c_map = checkpoint_file['c_map']
    in_doc_words = checkpoint_file['in_doc_words']
    if args.gpu >= 0:
        torch.cuda.set_device(args.gpu)

    # build model
    print('loading model')
    ner_model = LM_LSTM_CRF(len(l_map), len(c_map), jd['char_dim'],
                            jd['char_hidden'], jd['char_layers'],
                            jd['word_dim'], jd['word_hidden'],
                            jd['word_layers'], len(f_map), jd['drop_out'],
                            args.dataset_no, large_CRF=jd['small_crf'],
                            if_highway=jd['high_way'],
                            in_doc_words=in_doc_words,
                            highway_layers=jd['highway_layers'])
    ner_model.load_state_dict(checkpoint_file['state_dict'])
    if args.gpu >= 0:
        if_cuda = True
        torch.cuda.set_device(args.gpu)
        ner_model.cuda()
        packer = CRFRepack_WC(len(l_map), True)
    else:
        if_cuda = False
        packer = CRFRepack_WC(len(l_map), False)
    decode_label = (args.decode_type == 'label')
    predictor = predict_wc(if_cuda, f_map, c_map, l_map, f_map['<eof>'],
                           c_map['\n'], l_map['<pad>'], l_map['<start>'],
                           decode_label, args.batch_size, jd['caseless'])
    # BUGFIX: removed `evaluator = eval_wc(packer, l_map, args.eva_matrix)`.
    # The parser above defines no --eva_matrix option, so that line raised
    # AttributeError at runtime, and `evaluator` was never used afterwards.

    # loading corpus: one token per line, a blank line ends a sentence; only
    # the first column is kept.  Reading is capped at 2000 lines.
    print('loading corpus')
    lines = []
    features = []
    with codecs.open(args.input_file, 'r', 'utf-8') as f:
        for i, line in enumerate(f):
            if i == 2000:
                break
            if line == '\n':
                features.append(utils.read_features(lines))
                lines = []
                continue
            tmp = line.split()
            lines.append(tmp[0])

    # Write one annotated file per dataset index.
    for idx in range(args.dataset_no):
        print('annotating the entity type', idx)
        with open(args.output_file + str(idx) + '.txt', 'w') as fout:
            for feature in features:
                predictor.output_batch(ner_model, feature, fout, idx)
                fout.write('\n')
    # Returns only the last file written; raises NameError if
    # dataset_no == 0 (preserved behavior).
    return args.output_file + str(idx) + '.txt'
def build_model(self):
    """Construct the NER model plus its optimizer, losses, packer,
    evaluator and predictor, storing each on ``self``.

    Reads self.args / self.l_map / self.char_map / self.f_map /
    self.file_num / self.in_doc_words, and (depending on flags)
    self.checkpoint_file / self.embedding_tensor.
    """
    args = self.args
    l_map = self.l_map
    print('building model')
    self.ner_model = LM_LSTM_CRF(
        len(l_map), len(self.char_map), args.char_dim, args.char_hidden,
        args.char_layers, args.word_dim, args.word_hidden, args.word_layers,
        len(self.f_map), args.drop_out, self.file_num,
        large_CRF=args.small_crf, if_highway=args.high_way,
        in_doc_words=self.in_doc_words, highway_layers=args.highway_layers)
    # Either restore weights from a checkpoint, or initialise fresh ones
    # (optionally seeding the word embeddings from a pretrained tensor).
    if args.load_check_point:
        self.ner_model.load_state_dict(self.checkpoint_file['state_dict'])
    else:
        if not args.rand_embedding:
            self.ner_model.load_pretrained_word_embedding(
                self.embedding_tensor)
        self.ner_model.rand_init(init_word_embedding=args.rand_embedding)
    # Optimizer choice mirrors the training configuration.
    if args.update == 'sgd':
        self.optimizer = optim.SGD(self.ner_model.parameters(),
                                   lr=args.lr, momentum=args.momentum)
    elif args.update == 'adam':
        self.optimizer = optim.Adam(self.ner_model.parameters(), lr=args.lr)
    if args.load_check_point and args.load_opt:
        self.optimizer.load_state_dict(self.checkpoint_file['optimizer'])
    self.crit_lm = nn.CrossEntropyLoss()
    self.crit_ner = CRFLoss_vb(len(l_map), l_map['<start>'], l_map['<pad>'])
    # Move everything onto the requested GPU; args.gpu < 0 means CPU.
    use_cuda = args.gpu >= 0
    if use_cuda:
        print('device: ' + str(args.gpu))
        torch.cuda.set_device(args.gpu)
        self.crit_ner.cuda()
        self.crit_lm.cuda()
        self.ner_model.cuda()
    self.packer = CRFRepack_WC(len(l_map), use_cuda)
    self.evaluator = eval_wc(self.packer, l_map, args.eva_matrix)
    self.predictor = predict_wc(
        use_cuda, self.f_map, self.char_map, l_map, self.f_map['<eof>'],
        self.char_map['\n'], l_map['<pad>'], l_map['<start>'], True,
        args.batch_size, args.caseless)  # NEW
def load_model(self, args):
    """Load a pretrained LM-LSTM-CRF checkpoint, annotate the input corpus,
    and report test-set scores.

    Args:
        args: namespace providing load_arg, load_check_point, gpu,
              decode_type, batch_size, input_file, output_file, dataset_no
              and eva_matrix.

    Returns:
        Path of the last annotation file written.

    Side effects: sets ``self.args`` and ``self.ner_model``; reads
    ``self.test_dataset_loader``, which must already be populated.
    """
    print("CSCI548 model loading")
    self.args = args
    print('loading dictionary')
    with open(self.args.load_arg, 'r') as f:
        jd = json.load(f)
    jd = jd['args']
    # map_location keeps GPU-saved tensors on CPU so loading works without CUDA
    checkpoint_file = torch.load(self.args.load_check_point,
                                 map_location=lambda storage, loc: storage)
    f_map = checkpoint_file['f_map']
    l_map = checkpoint_file['l_map']
    c_map = checkpoint_file['c_map']
    in_doc_words = checkpoint_file['in_doc_words']

    # build model from the checkpoint's stored hyper-parameters
    print('loading model')
    self.ner_model = LM_LSTM_CRF(len(l_map), len(c_map), jd['char_dim'],
                                 jd['char_hidden'], jd['char_layers'],
                                 jd['word_dim'], jd['word_hidden'],
                                 jd['word_layers'], len(f_map),
                                 jd['drop_out'], self.args.dataset_no,
                                 large_CRF=jd['small_crf'],
                                 if_highway=jd['high_way'],
                                 in_doc_words=in_doc_words,
                                 highway_layers=jd['highway_layers'])
    self.ner_model.load_state_dict(checkpoint_file['state_dict'])
    if self.args.gpu >= 0:
        if_cuda = True
        torch.cuda.set_device(self.args.gpu)
        self.ner_model.cuda()
        packer = CRFRepack_WC(len(l_map), True)
    else:
        if_cuda = False
        packer = CRFRepack_WC(len(l_map), False)
    decode_label = (self.args.decode_type == 'label')
    predictor = predict_wc(if_cuda, f_map, c_map, l_map, f_map['<eof>'],
                           c_map['\n'], l_map['<pad>'], l_map['<start>'],
                           decode_label, self.args.batch_size,
                           jd['caseless'])
    evaluator = eval_wc(packer, l_map, self.args.eva_matrix)

    # loading corpus: CRLF blank lines ('\r\n') separate sentences;
    # column 0 is the token and column 3 is kept as its tag
    # (NOTE(review): presumably the gold-label column -- confirm format).
    print('loading corpus')
    lines = []
    features = []
    tags = []
    feature_tags = []
    with codecs.open(self.args.input_file, 'r', 'utf-8') as f:
        # cleanup: the previous enumerate() index and its commented-out
        # 2000-line cap were dead code
        for line in f:
            if line == '\r\n':
                features.append(utils.read_features2(lines))
                feature_tags.append(tags)
                tags = []
                lines = []
                continue
            tmp = line.split(" ")
            lines.append(tmp[0])
            tags.append(tmp[3])

    # Write one annotated file per dataset index.
    for idx in range(self.args.dataset_no):
        print('annotating the entity type', idx)
        with open(self.args.output_file + str(idx) + '.txt', 'w') as fout:
            for feature, tag in zip(features, feature_tags):
                predictor.output_batch(self.ner_model, feature, tag, fout,
                                       idx)
                fout.write('\n')

    # Evaluate on the first test DataLoader (task 0).
    test_f1, test_pre, test_rec, test_acc = evaluator.calc_score(
        self.ner_model, self.test_dataset_loader[0], 0)
    print("Test evaluation: f1 = %.4f, recall = %.4f, precision = %.4f " %
          (test_f1, test_rec, test_pre))
    return self.args.output_file + str(idx) + '.txt'