def generate_instance(self, name):
    """Read one dataset split and store its texts and id sequences on ``self``.

    ``self.fix_alphabet()`` is always called first. For a known split the file
    at ``self.<name>_dir`` is read with ``read_instance`` and the two returned
    values are stored in ``self.<name>_texts`` and ``self.<name>_Ids``.

    :param name: Split to read: "train", "dev", "test" or "raw".
    :return: None. An unknown ``name`` only prints an error message.
    """
    self.fix_alphabet()
    if name not in ("train", "dev", "test", "raw"):
        print(
            "Error: you can only generate train/dev/test/raw instance! Illegal input:%s"
            % (name))
        return
    # All four splits are read the same way; only the attribute prefix differs.
    texts, ids = read_instance(
        getattr(self, name + "_dir"), self.word_alphabet, self.char_alphabet,
        self.feature_alphabets, self.label_alphabet, self.number_normalized,
        self.MAX_SENTENCE_LENGTH)
    setattr(self, name + "_texts", texts)
    setattr(self, name + "_Ids", ids)
def evaluate(data, model, name, nbest=None):
    """Decode one dataset split with ``model`` and score the predictions.

    :param data: Object providing ``<split>_Ids``, ``HP_batch_size``,
        ``HP_gpu``, ``label_alphabet`` and ``tagScheme``.
    :param model: Model to evaluate. It is called directly for 1-best
        decoding, or through ``model.decode_nbest`` when ``nbest`` is truthy.
    :param name: Split to evaluate: "train", "dev", "test" or "raw".
    :param nbest: Number of candidates per sentence; falsy means 1-best.
    :return: ``(speed, acc, p, r, f, results, pred_scores)``. ``results`` is
        the n-best label list when ``nbest`` is truthy, otherwise the 1-best
        label list; ``pred_scores`` stays empty for 1-best decoding.
    :raises ValueError: If ``name`` is not one of the known splits.
    """
    if name not in ("train", "dev", "test", "raw"):
        # Fail here with a clear message instead of crashing later on an
        # unbound ``instances`` variable.
        raise ValueError("Error: wrong evaluate name, %s" % name)
    instances = getattr(data, name + "_Ids")
    nbest_pred_results = []
    pred_scores = []
    pred_results = []
    gold_results = []
    # Switch the model to evaluation mode.
    model.eval()
    batch_size = data.HP_batch_size
    start_time = time.time()
    train_num = len(instances)
    total_batch = train_num // batch_size + 1
    for batch_id in range(total_batch):
        start = batch_id * batch_size
        end = min((batch_id + 1) * batch_size, train_num)
        instance = instances[start:end]
        # The extra batch is empty when train_num divides evenly.
        if not instance:
            continue
        (batch_word, batch_features, batch_wordlen, batch_wordrecover,
         batch_char, batch_charlen, batch_charrecover, batch_label,
         mask) = batchify_with_label(instance, data.HP_gpu, True)
        if nbest:
            scores, nbest_tag_seq = model.decode_nbest(
                batch_word, batch_features, batch_wordlen, batch_char,
                batch_charlen, batch_charrecover, mask, nbest)
            nbest_pred_results += recover_nbest_label(
                nbest_tag_seq, mask, data.label_alphabet, batch_wordrecover)
            pred_scores += scores[batch_wordrecover].cpu().data.numpy().tolist()
            # Select the best sequence for evaluation.
            tag_seq = nbest_tag_seq[:, :, 0]
        else:
            tag_seq = model(batch_word, batch_features, batch_wordlen,
                            batch_char, batch_charlen, batch_charrecover, mask)
        pred_label, gold_label = recover_label(
            tag_seq, batch_label, mask, data.label_alphabet, batch_wordrecover)
        pred_results += pred_label
        gold_results += gold_label
    decode_time = time.time() - start_time
    speed = len(instances) / decode_time
    acc, p, r, f = get_ner_fmeasure(gold_results, pred_results, data.tagScheme)
    if nbest:
        return speed, acc, p, r, f, nbest_pred_results, pred_scores
    return speed, acc, p, r, f, pred_results, pred_scores
def IOB2BIO(input_file, output_file):
    """Convert a token/label file from the IOB tag scheme to BIO.

    Each non-boundary line holds whitespace-separated columns; the first is
    the word and the last is the label (upper-cased on read). A line shorter
    than three characters ends the current sentence. An ``I-X`` label becomes
    ``B-X`` when it starts the sentence, follows ``O``, or follows a label of
    a different type.

    :param input_file: Path of the IOB file to read.
    :param output_file: Path of the BIO file to write (overwritten).
    """
    print("Convert IOB -> BIO for file: %s" % input_file)
    with open(input_file, 'r') as in_file:
        fins = in_file.readlines()
    words = []
    labels = []
    with open(output_file, 'w') as fout:
        for line in fins:
            if len(line) < 3:
                _write_bio_sentence(fout, words, labels)
                words = []
                labels = []
            else:
                pair = line.strip('\n').split()
                words.append(pair[0])
                labels.append(pair[-1].upper())
        # Flush a final sentence that is not followed by a blank line;
        # otherwise it would be silently dropped.
        if words:
            _write_bio_sentence(fout, words, labels)
    print("BIO file generated: %s" % output_file)


def _write_bio_sentence(fout, words, labels):
    """Write one sentence in BIO form, followed by a blank separator line."""
    for idx, (word, label) in enumerate(zip(words, labels)):
        if "I-" in label:
            label_type = label.split('-')[-1]
            starts_entity = (
                idx == 0
                or labels[idx - 1] == "O"
                or label_type != labels[idx - 1].split('-')[-1])
            if starts_entity:
                fout.write(word + " B-" + label_type + "\n")
                continue
        fout.write(word + " " + label + "\n")
    fout.write('\n')
def recover_nbest_label(pred_variable, mask_variable, label_alphabet, word_recover):
    """Map n-best tag ids back to label strings in the order given by ``word_recover``.

    input:
        pred_variable (batch_size, sent_len, nbest): pred tag result
        mask_variable (batch_size, sent_len): mask variable
        label_alphabet: object whose ``get_instance(tag_id)`` returns the label
        word_recover (batch_size): index used to reorder the batch
    output:
        nbest_pred_label list: [batch_size, nbest, each_seq_len]; positions
        whose mask value is 0 are dropped.
    """
    pred_variable = pred_variable[word_recover]
    mask_variable = mask_variable[word_recover]
    seq_len = pred_variable.size(1)
    nbest = pred_variable.size(2)
    mask = mask_variable.cpu().data.numpy()
    pred_tag = pred_variable.cpu().data.numpy()
    batch_size = mask.shape[0]
    pred_label = []
    for idx in range(batch_size):
        pred = []
        for idz in range(nbest):
            # Keep only the positions that are not padding.
            each_pred = [
                label_alphabet.get_instance(pred_tag[idx][idy][idz])
                for idy in range(seq_len) if mask[idx][idy] != 0
            ]
            pred.append(each_pred)
        pred_label.append(pred)
    return pred_label
def write_decoded_results(self, predict_results, name):
    """Write 1-best predictions for a split to ``self.decode_dir``.

    Each output line is ``<word> <predicted label>``; sentences are separated
    by a blank line. Words come from ``self.<name>_texts``.

    :param predict_results: One list of predicted labels per sentence.
    :param name: Split the predictions belong to: "raw", "test", "dev" or
        "train".
    :raises AssertionError: If the number of predicted sentences differs from
        the number of sentences stored for the split.
    """
    sent_num = len(predict_results)
    content_list = []
    if name in ('raw', 'test', 'dev', 'train'):
        content_list = getattr(self, name + '_texts')
    else:
        print(
            "Error: illegal name during writing predict result, name should be within train/dev/test/raw !"
        )
    # Validate before opening the file so a mismatch does not truncate an
    # existing output file.
    assert (sent_num == len(content_list))
    with open(self.decode_dir, 'w') as fout:
        for idx in range(sent_num):
            # content_list[idx] is a list with [word, char, label]
            words = content_list[idx][0]
            for idy, label in enumerate(predict_results[idx]):
                fout.write(words[idy] + " " + label + '\n')
            fout.write('\n')
    print("Predict %s result has been written into file. %s" %
          (name, self.decode_dir))
def __init__(self, alphabet_size, pretrain_char_embedding, embedding_dim, hidden_dim, dropout, gpu, bidirect_flag=True):
    """Build the GRU-based character feature extractor.

    :param alphabet_size: Number of rows of the character embedding table.
    :param pretrain_char_embedding: Numpy array copied into the embedding
        table, or None to use ``self.random_embedding`` instead.
    :param embedding_dim: Width of each character embedding.
    :param hidden_dim: Requested hidden size; halved per direction when
        ``bidirect_flag`` is true.
    :param dropout: Probability passed to ``nn.Dropout``.
    :param gpu: When truthy, the sub-modules are moved with ``.cuda()``.
    :param bidirect_flag: Whether the GRU is bidirectional.
    """
    super(CharBiGRU, self).__init__()
    print("build char sequence feature extractor: GRU ...")
    self.gpu = gpu
    self.hidden_dim = hidden_dim // 2 if bidirect_flag else hidden_dim
    self.char_drop = nn.Dropout(dropout)
    self.char_embeddings = nn.Embedding(alphabet_size, embedding_dim)
    # Use the pretrained table when given, otherwise a random one.
    if pretrain_char_embedding is None:
        initial_table = self.random_embedding(alphabet_size, embedding_dim)
    else:
        initial_table = pretrain_char_embedding
    self.char_embeddings.weight.data.copy_(torch.from_numpy(initial_table))
    # The attribute keeps the name ``char_lstm`` although it holds a GRU.
    self.char_lstm = nn.GRU(
        embedding_dim,
        self.hidden_dim,
        num_layers=1,
        batch_first=True,
        bidirectional=bidirect_flag,
    )
    if self.gpu:
        for attr in ("char_drop", "char_embeddings", "char_lstm"):
            setattr(self, attr, getattr(self, attr).cuda())
def get_instance(self, index):
    """Return the instance stored for ``index``.

    Index 0 is occupied by the wildcard element: it yields the first
    instance when ``self.label`` is truthy and None otherwise. Any other
    index maps to ``self.instances[index - 1]``; an out-of-range index prints
    a warning and falls back to the first instance.
    """
    if index != 0:
        try:
            found = self.instances[index - 1]
        except IndexError:
            print(
                'WARNING:Alphabet get_instance ,unknown instance, return the first label.'
            )
            found = self.instances[0]
        return found
    return self.instances[0] if self.label else None
def decode_nbest(self, word_inputs, feature_inputs, word_seq_lengths, char_inputs, char_seq_lengths, char_seq_recover, mask, nbest):
    """Decode the ``nbest`` highest-scoring tag sequences with the CRF layer.

    The first six arguments are forwarded unchanged to ``self.word_hidden``;
    its output, ``mask`` and ``nbest`` are then passed to
    ``self.crf._viterbi_decode_nbest``.

    :return: ``(scores, tag_seq)`` as returned by the CRF n-best decoder.
    :raises SystemExit: With status 1 when the model has no CRF layer
        (``self.use_crf`` is falsy).
    """
    if not self.use_crf:
        print("Nbest output is currently supported only for CRF! Exit...")
        # This is an error path, so exit with a non-zero status; raising
        # SystemExit directly also avoids relying on the site-provided
        # ``exit`` helper.
        raise SystemExit(1)
    outs = self.word_hidden(word_inputs, feature_inputs, word_seq_lengths,
                            char_inputs, char_seq_lengths, char_seq_recover)
    scores, tag_seq = self.crf._viterbi_decode_nbest(outs, mask, nbest)
    return scores, tag_seq
def load_model_decode(data, name):
    """Load saved weights into a fresh ``SeqModel`` and decode one split.

    :param data: Object providing ``load_model_dir``, ``HP_gpu``, ``nbest``
        and ``seg``, plus everything ``SeqModel`` and ``evaluate`` read.
    :param name: Split to decode; passed through to ``evaluate``.
    :return: ``(pred_results, pred_scores)`` as produced by ``evaluate``.
    """
    # Report the path that is actually loaded below.
    print("Load Model from file: ", data.load_model_dir)
    model = SeqModel(data)
    # A checkpoint saved from a GPU run needs its storages remapped to be
    # loadable on a CPU-only run; on GPU the default placement is kept.
    map_location = None if data.HP_gpu else (lambda storage, loc: storage)
    model.load_state_dict(
        torch.load(data.load_model_dir, map_location=map_location))
    print("Decode %s data, nbest: %s ..." % (name, data.nbest))
    start_time = time.time()
    speed, acc, p, r, f, pred_results, pred_scores = evaluate(
        data, model, name, data.nbest)
    end_time = time.time()
    time_cost = end_time - start_time
    if data.seg:
        print(
            "%s: time:%.2fs, speed:%.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"
            % (name, time_cost, speed, acc, p, r, f))
    else:
        print("%s: time:%.2fs, speed:%.2fst/s; acc: %.4f" %
              (name, time_cost, speed, acc))
    return pred_results, pred_scores
def save(self, output_directory, name=None):
    """
    Save the alphabet content as ``<name>.json`` in the given directory.

    Saving is best-effort: any failure is reported on stdout and not raised.

    :param output_directory: Directory to save model and weights.
    :param name: The alphabet saving name, optional. Falls back to the
        alphabet's own name when empty.
    :return:
    """
    saving_name = name if name else self.__name
    try:
        content = self.get_content()
        target = os.path.join(output_directory, saving_name + ".json")
        with open(target, 'w') as fout:
            json.dump(content, fout)
    except Exception as e:
        # The format string needs a placeholder; without one the handler
        # itself raised TypeError and hid the real error.
        print("Exception: Alphabet is not saved: %s" % repr(e))
def __init__(self, tagset_size, gpu):
    """Create the CRF layer and its learnable transition matrix.

    The matrix is square with ``tagset_size + 2`` rows; the two extra slots
    are for START_TAG and STOP_TAG. Per the original note its layout is
    (from_tag, to_tag). Column START_TAG, row STOP_TAG and row/column 0 are
    initialised to -10000.0; every other entry starts at zero.

    :param tagset_size: Number of real tags, without START/STOP.
    :param gpu: When truthy, the initial matrix is moved with ``.cuda()``.
    """
    super(CRF, self).__init__()
    print("build CRF...")
    self.gpu = gpu
    self.tagset_size = tagset_size
    blocked = -10000.0
    matrix_size = self.tagset_size + 2
    init_transitions = torch.zeros(matrix_size, matrix_size)
    init_transitions[:, START_TAG] = blocked
    init_transitions[STOP_TAG, :] = blocked
    init_transitions[:, 0] = blocked
    init_transitions[0, :] = blocked
    if self.gpu:
        init_transitions = init_transitions.cuda()
    self.transitions = nn.Parameter(init_transitions)
def get_ner_fmeasure(golden_lists, predict_lists, label_type="BMES"):
    """Compute token accuracy and entity-level precision / recall / F1.

    Entities are extracted per sentence with ``get_ner_BMES`` when
    ``label_type`` is "BMES", otherwise with ``get_ner_BIO``. An entity is
    counted as correct when it appears in both the gold and the predicted
    extraction of the same sentence.

    :param golden_lists: One list of gold labels per sentence.
    :param predict_lists: One list of predicted labels per sentence, aligned
        with ``golden_lists``.
    :param label_type: Tag scheme of the labels.
    :return: ``(accuracy, precision, recall, f_measure)``. A value of -1
        marks a measure that is undefined (no tokens, no predicted entities,
        no gold entities, or precision + recall not positive).
    """
    sent_num = len(golden_lists)
    golden_full = []
    predict_full = []
    right_full = []
    right_tag = 0
    all_tag = 0
    for idx in range(sent_num):
        golden_list = golden_lists[idx]
        predict_list = predict_lists[idx]
        right_tag += sum(
            1 for idy, gold in enumerate(golden_list)
            if gold == predict_list[idy])
        all_tag += len(golden_list)
        if label_type == "BMES":
            gold_matrix = get_ner_BMES(golden_list)
            pred_matrix = get_ner_BMES(predict_list)
        else:
            gold_matrix = get_ner_BIO(golden_list)
            pred_matrix = get_ner_BIO(predict_list)
        right_ner = list(set(gold_matrix).intersection(set(pred_matrix)))
        golden_full += gold_matrix
        predict_full += pred_matrix
        right_full += right_ner
    right_num = len(right_full)
    golden_num = len(golden_full)
    predict_num = len(predict_full)
    precision = -1 if predict_num == 0 else (right_num + 0.0) / predict_num
    recall = -1 if golden_num == 0 else (right_num + 0.0) / golden_num
    if (precision == -1) or (recall == -1) or (precision + recall) <= 0.:
        f_measure = -1
    else:
        f_measure = 2 * precision * recall / (precision + recall)
    # Empty input has no tokens; use the same -1 sentinel as the other
    # measures instead of dividing by zero.
    accuracy = -1 if all_tag == 0 else (right_tag + 0.0) / all_tag
    print(f"gold_num = {golden_num}, pred_num = {predict_num}, right_num = {right_num}")
    return accuracy, precision, recall, f_measure
def __init__(self, data):
    """Build the sequence model: a ``WordSequence`` encoder plus optional CRF.

    Side effect: ``data.label_alphabet_size`` is increased by 2 in place
    before ``WordSequence`` is built, while the CRF is created with the
    original label size.

    :param data: Object providing ``use_crf``, ``use_char``,
        ``char_feature_extractor``, ``word_feature_extractor``, ``HP_gpu``,
        ``average_batch_loss`` and ``label_alphabet_size``.
    """
    super(SeqModel, self).__init__()
    self.use_crf = data.use_crf
    print("build network...")
    # Apply the format operator; passing the value as a second print
    # argument would show the literal "%s".
    print("use_char: %s" % data.use_char)
    if data.use_char:
        print("char feature extractor: %s " % data.char_feature_extractor)
    print("word feature extractor: %s" % data.word_feature_extractor)
    print("use crf: %s" % self.use_crf)
    self.gpu = data.HP_gpu
    self.average_batch = data.average_batch_loss
    # Add two more labels for the down-layer LSTM; the CRF uses the original
    # label size.
    label_size = data.label_alphabet_size
    data.label_alphabet_size += 2
    self.word_hidden = WordSequence(data)
    if self.use_crf:
        self.crf = CRF(label_size, self.gpu)
def write_nbest_decoded_results(self, predict_results, pred_scores, name):
    """Write n-best predictions for a split to ``self.decode_dir``.

    For every sentence a header line ``# <score_1> ... <score_n>`` (four
    decimals) is written, followed by one line per token holding the word and
    its n candidate labels, followed by a blank line.

    :param predict_results: [whole_sent_num, nbest, each_sent_length]
    :param pred_scores: [whole_sent_num, nbest]
    :param name: Split the predictions belong to: "raw", "test", "dev" or
        "train".
    :raises AssertionError: If the sentence counts of ``predict_results``,
        ``pred_scores`` and the stored texts disagree.
    """
    sent_num = len(predict_results)
    content_list = []
    if name in ('raw', 'test', 'dev', 'train'):
        content_list = getattr(self, name + '_texts')
    else:
        print(
            "Error: illegal name during writing predict result, name should be within train/dev/test/raw !"
        )
    # Validate before opening so a mismatch does not truncate the output.
    assert (sent_num == len(content_list))
    assert (sent_num == len(pred_scores))
    # Defined up front so the final message also works for empty input.
    nbest = 0
    with open(self.decode_dir, 'w') as fout:
        for idx in range(sent_num):
            candidates = predict_results[idx]
            sent_length = len(candidates[0])
            nbest = len(candidates)
            score_string = "# " + " ".join(
                format(pred_scores[idx][idz], '.4f') for idz in range(nbest))
            fout.write(score_string.strip() + "\n")
            # content_list[idx] is a list with [word, char, label]
            words = content_list[idx][0]
            for idy in range(sent_length):
                columns = [words[idy]]
                columns.extend(candidates[idz][idy] for idz in range(nbest))
                fout.write(" ".join(columns).strip() + "\n")
            fout.write('\n')
    print("Predict %s %s-best result has been written into file. %s" %
          (name, nbest, self.decode_dir))
def fmeasure_from_file(golden_file, predict_file, label_type="BMES"):
    """Print precision, recall and F1 of a prediction file against a gold file.

    Both files are read with ``readSentence``; only the label sequences are
    used.

    :param golden_file: Path of the gold-standard file.
    :param predict_file: Path of the prediction file.
    :param label_type: Tag scheme passed on to ``get_ner_fmeasure``.
    """
    print(f"Get f measure from file: {golden_file} {predict_file}")
    print("Label format: %s" % label_type)
    _, golden_labels = readSentence(golden_file)
    _, predict_labels = readSentence(predict_file)
    # get_ner_fmeasure returns (accuracy, precision, recall, f_measure).
    _, P, R, F = get_ner_fmeasure(golden_labels, predict_labels, label_type)
    print("P:%s, R:%s, F:%s" % (P, R, F))
def build_pretrain_emb(self):
    """Load pretrained word, char and feature embeddings where a path is set.

    For each embedding whose directory attribute is truthy,
    ``build_pretrain_embedding`` is called and its two return values replace
    the stored embedding table and embedding dimension.
    """
    word_path = self.word_emb_dir
    if word_path:
        print("Load pretrained word embedding, norm: %s, dir: %s" %
              (self.norm_word_emb, self.word_emb_dir))
        word_table, word_dim = build_pretrain_embedding(
            self.word_emb_dir, self.word_alphabet, self.word_emb_dim,
            self.norm_word_emb)
        self.pretrain_word_embedding = word_table
        self.word_emb_dim = word_dim
    char_path = self.char_emb_dir
    if char_path:
        print("Load pretrained char embedding, norm: %s, dir: %s" %
              (self.norm_char_emb, self.char_emb_dir))
        char_table, char_dim = build_pretrain_embedding(
            self.char_emb_dir, self.char_alphabet, self.char_emb_dim,
            self.norm_char_emb)
        self.pretrain_char_embedding = char_table
        self.char_emb_dim = char_dim
    for position in range(self.feature_num):
        # Features without an embedding path keep their current settings.
        if not self.feature_emb_dirs[position]:
            continue
        print(
            "Load pretrained feature %s embedding:, norm: %s, dir: %s" %
            (self.feature_name[position], self.norm_feature_embs[position],
             self.feature_emb_dirs[position]))
        feature_table, feature_dim = build_pretrain_embedding(
            self.feature_emb_dirs[position], self.feature_alphabets[position],
            self.feature_emb_dims[position], self.norm_feature_embs[position])
        self.pretrain_feature_embeddings[position] = feature_table
        self.feature_emb_dims[position] = feature_dim
def config_file_to_dict(input_file):
    """Parse a ``key=value`` configuration file into a dictionary.

    Lines starting with ``#`` and lines without ``=`` are skipped; text after
    a ``#`` on a line is ignored. A repeated key overwrites the earlier value
    and prints a warning. Lines with the key ``feature`` are collected under
    ``config["feature"]`` as ``{feature_name: {"emb_dir", "emb_size",
    "emb_norm"}}``, with defaults None / 10 / False.

    :param input_file: Path of the configuration file.
    :return: Dictionary of parsed settings; plain values stay strings.
    """
    config = {}
    # Read inside a context manager so the file handle is always closed.
    with open(input_file, 'r') as config_file:
        fins = config_file.readlines()
    for line in fins:
        if len(line) > 0 and line[0] == "#":
            continue
        if "=" not in line:
            continue
        pair = line.strip().split('#', 1)[0].split('=', 1)
        item = pair[0]
        if item != "feature":
            if item in config:
                print(
                    "Warning: duplicated config item found: %s, updated." %
                    (pair[0]))
            config[item] = pair[-1]
            continue
        feat_dict = config.setdefault(item, {})
        new_pair = pair[-1].split()
        feat_name = new_pair[0]
        one_dict = {"emb_dir": None, "emb_size": 10, "emb_norm": False}
        for option in new_pair[1:]:
            conf_pair = option.split('=')
            if conf_pair[0] == "emb_dir":
                one_dict["emb_dir"] = conf_pair[-1]
            elif conf_pair[0] == "emb_size":
                one_dict["emb_size"] = int(conf_pair[-1])
            elif conf_pair[0] == "emb_norm":
                one_dict["emb_norm"] = str2bool(conf_pair[-1])
        feat_dict[feat_name] = one_dict
    return config
def __init__(self, alphabet_size, pretrain_char_embedding, embedding_dim, hidden_dim, dropout, gpu):
    """Build the CNN-based character feature extractor.

    :param alphabet_size: Number of rows of the character embedding table.
    :param pretrain_char_embedding: Numpy array copied into the embedding
        table, or None to use ``self.random_embedding`` instead.
    :param embedding_dim: Width of each character embedding (input channels
        of the convolution).
    :param hidden_dim: Number of output channels of the convolution.
    :param dropout: Probability passed to ``nn.Dropout``.
    :param gpu: When truthy, the sub-modules are moved with ``.cuda()``.
    """
    super(CharCNN, self).__init__()
    print("build char sequence feature extractor: CNN ...")
    self.gpu = gpu
    self.hidden_dim = hidden_dim
    self.char_drop = nn.Dropout(dropout)
    self.char_embeddings = nn.Embedding(alphabet_size, embedding_dim)
    # Use the pretrained table when given, otherwise a random one.
    if pretrain_char_embedding is None:
        initial_table = self.random_embedding(alphabet_size, embedding_dim)
    else:
        initial_table = pretrain_char_embedding
    self.char_embeddings.weight.data.copy_(torch.from_numpy(initial_table))
    self.char_cnn = nn.Conv1d(
        embedding_dim, self.hidden_dim, kernel_size=3, padding=1)
    if self.gpu:
        for attr in ("char_drop", "char_embeddings", "char_cnn"):
            setattr(self, attr, getattr(self, attr).cuda())
def build_pretrain_embedding(embedding_path, word_alphabet, embedd_dim=100, norm=True):
    """Build an embedding table for ``word_alphabet`` from a pretrained file.

    Each (word, index) pair from ``word_alphabet.iteritems()`` is looked up
    exactly, then lower-cased; a match fills row ``index`` with the
    pretrained vector (passed through ``norm2one`` when ``norm`` is true).
    Unmatched words get a uniform random row in ``[-scale, scale]`` with
    ``scale = sqrt(3 / embedd_dim)``. Rows whose index is never yielded are
    left uninitialised.

    :param embedding_path: Pretrained embedding file, or None to build a
        fully random table of width ``embedd_dim``.
    :param word_alphabet: Object providing ``size()`` and ``iteritems()``.
    :param embedd_dim: Width used when no file is given; replaced by the
        file's dimension otherwise.
    :param norm: Whether matched vectors are normalised with ``norm2one``.
    :return: ``(pretrain_emb, embedd_dim)``.
    """
    embedd_dict = dict()
    if embedding_path != None:
        embedd_dict, embedd_dim = load_pretrain_emb(embedding_path)
    alphabet_size = word_alphabet.size()
    scale = np.sqrt(3.0 / embedd_dim)
    pretrain_emb = np.empty([word_alphabet.size(), embedd_dim])
    exact_hits = 0
    lowercase_hits = 0
    misses = 0
    for word, index in word_alphabet.iteritems():
        if word in embedd_dict:
            matched_key = word
            exact_hits += 1
        elif word.lower() in embedd_dict:
            matched_key = word.lower()
            lowercase_hits += 1
        else:
            pretrain_emb[index, :] = np.random.uniform(-scale, scale,
                                                       [1, embedd_dim])
            misses += 1
            continue
        vector = embedd_dict[matched_key]
        pretrain_emb[index, :] = norm2one(vector) if norm else vector
    pretrained_size = len(embedd_dict)
    print(
        "Embedding:\n pretrain word:%s, prefect match:%s, case_match:%s, oov:%s, oov%%:%s"
        % (pretrained_size, exact_hits, lowercase_hits, misses,
           (misses + 0.) / alphabet_size))
    return pretrain_emb, embedd_dim
def initial_feature_alphabets(self):
    """Discover feature columns from the first training line and set defaults.

    The first line of ``self.train_dir`` is split on whitespace. Every column
    between the first and the last is treated as a feature; its prefix up to
    and including the first ``]`` names a new ``Alphabet``. Per-feature
    settings are then initialised (embedding size 20, no pretrained
    directory, no normalisation) and overridden from ``self.feat_config``
    for features listed there.
    """
    # Read inside a context manager so the file handle is always closed.
    with open(self.train_dir, 'r') as train_file:
        items = train_file.readline().strip('\n').split()
    # items[1:-1] is empty unless the line has more than two columns.
    for column in items[1:-1]:
        feature_prefix = column.split(']', 1)[0] + "]"
        self.feature_alphabets.append(Alphabet(feature_prefix))
        self.feature_name.append(feature_prefix)
        print("Find feature: %s" % feature_prefix)
    self.feature_num = len(self.feature_alphabets)
    self.pretrain_feature_embeddings = [None] * self.feature_num
    self.feature_emb_dims = [20] * self.feature_num
    self.feature_emb_dirs = [None] * self.feature_num
    self.norm_feature_embs = [False] * self.feature_num
    self.feature_alphabet_sizes = [0] * self.feature_num
    if self.feat_config:
        for idx in range(self.feature_num):
            feature = self.feature_name[idx]
            if feature in self.feat_config:
                settings = self.feat_config[feature]
                self.feature_emb_dims[idx] = settings['emb_size']
                self.feature_emb_dirs[idx] = settings['emb_dir']
                self.norm_feature_embs[idx] = settings['emb_norm']
def choose_label(input_file, output_file):
    """Copy a column file, keeping only the first and last column of each line.

    Lines shorter than three characters are copied unchanged. Other lines are
    split on single spaces.

    :param input_file: Path of the file to read.
    :param output_file: Path of the file to write (overwritten).
    """
    with open(input_file, 'r') as in_file:
        fins = in_file.readlines()
    with open(output_file, 'w') as fout:
        for line in fins:
            if len(line) < 3:
                fout.write(line)
            else:
                pairs = line.strip('\n').split(' ')
                fout.write(pairs[0] + " " + pairs[-1] + "\n")


if __name__ == '__main__':
    # Convert NER tag schemes among IOB/BIO/BIOES.
    # For example: if you want to convert the IOB tag scheme to BIO, then you
    # run as following:
    #     python NERSchemeConverter.py IOB2BIO input_iob_file output_bio_file
    # Input data format is the standard CoNLL 2003 data format.
    if len(sys.argv) < 4:
        # Without this check a missing argument surfaced as an IndexError.
        print("Argument error: expected <scheme> <input_file> <output_file>, "
              "where <scheme> is one of IOB2BIO/BIO2BIOES/BIOES2BIO/IOB2BIOES")
        sys.exit(1)
    scheme = sys.argv[1].upper()
    if scheme == "IOB2BIO":
        IOB2BIO(sys.argv[2], sys.argv[3])
    elif scheme == "BIO2BIOES":
        BIO2BIOES(sys.argv[2], sys.argv[3])
    elif scheme == "BIOES2BIO":
        BIOES2BIO(sys.argv[2], sys.argv[3])
    elif scheme == "IOB2BIOES":
        # Two-step conversion through an intermediate file named "temp".
        IOB2BIO(sys.argv[2], "temp")
        BIO2BIOES("temp", sys.argv[3])
    else:
        print("Argument error: sys.argv[1] should belongs to \"IOB2BIO/BIO2BIOES/BIOES2BIO/IOB2BIOES\"")
def load_pretrain_emb(embedding_path):
    """Read a text embedding file into a ``{token: vector}`` dictionary.

    Each usable line is ``<token> <v1> ... <vd>``. Lines that start with a
    space and lines that are empty after stripping are skipped. The first
    usable line fixes the dimension; later lines must have the same number of
    columns.

    :param embedding_path: Path of the embedding text file.
    :return: ``(embedd_dict, embedd_dim)``; each vector is an array of shape
        ``(1, embedd_dim)``, and ``embedd_dim`` is -1 if no line was usable.
    :raises AssertionError: If a line has a different number of columns.
    """
    embedd_dim = -1
    embedd_dict = dict()
    with open(embedding_path, 'r') as file:
        for raw_line in file:
            if raw_line.startswith(' ') or raw_line.startswith(' '):
                continue
            line = raw_line.strip()
            if not line:
                continue
            tokens = line.split()
            if embedd_dim >= 0:
                assert (embedd_dim + 1 == len(tokens)), f"INCORRECT{line}"
            else:
                embedd_dim = len(tokens) - 1
            embedd = np.empty([1, embedd_dim])
            embedd[:] = tokens[1:]
            # Python 2 reads bytes, so the token is decoded there.
            key = tokens[0]
            if sys.version_info[0] < 3:
                key = key.decode('utf-8')
            embedd_dict[key] = embedd
    return embedd_dict, embedd_dim


if __name__ == '__main__':
    # Small manual check of norm2one on a sample vector.
    sample = np.arange(9.0)
    print(sample)
    print(norm2one(sample))
def train(data):
    """Train a ``SeqModel`` and evaluate it on dev and test after each epoch.

    The data object is saved to ``data.model_dir + ".dset"`` first. After
    every epoch the dev score is computed (F1 when ``data.seg`` is truthy,
    accuracy otherwise); whenever it exceeds the best score so far the model
    state dict is saved to ``data.model_dir + "." + <epoch> + ".model"``.

    :param data: Object providing the training ids, hyper-parameters
        (``HP_*``), ``optimizer``, ``seg`` and ``model_dir``.
    :raises SystemExit: With status 1 when ``data.optimizer`` is not one of
        sgd / adagrad / adadelta / rmsprop / adam (case-insensitive).
    """
    print("Training model...")
    data.show_data_summary()
    save_data_name = data.model_dir + ".dset"
    data.save(save_data_name)
    model = SeqModel(data)
    optimizer_name = data.optimizer.lower()
    if optimizer_name == "sgd":
        optimizer = optim.SGD(model.parameters(),
                              lr=data.HP_lr,
                              momentum=data.HP_momentum,
                              weight_decay=data.HP_l2)
    elif optimizer_name == "adagrad":
        optimizer = optim.Adagrad(model.parameters(),
                                  lr=data.HP_lr,
                                  weight_decay=data.HP_l2)
    elif optimizer_name == "adadelta":
        optimizer = optim.Adadelta(model.parameters(),
                                   lr=data.HP_lr,
                                   weight_decay=data.HP_l2)
    elif optimizer_name == "rmsprop":
        optimizer = optim.RMSprop(model.parameters(),
                                  lr=data.HP_lr,
                                  weight_decay=data.HP_l2)
    elif optimizer_name == "adam":
        optimizer = optim.Adam(model.parameters(),
                               lr=data.HP_lr,
                               weight_decay=data.HP_l2)
    else:
        print("Optimizer illegal: %s" % (data.optimizer))
        # Error path: exit with a non-zero status.
        raise SystemExit(1)
    best_dev = -10
    # Start training.
    for idx in range(data.HP_iteration):
        epoch_start = time.time()
        temp_start = epoch_start
        print("Epoch: %s/%s" % (idx, data.HP_iteration))
        # Compare case-insensitively, like the optimizer selection above, so
        # that "sgd" in the configuration also gets learning-rate decay.
        if optimizer_name == "sgd":
            optimizer = lr_decay(optimizer, idx, data.HP_lr_decay, data.HP_lr)
        sample_loss = 0
        total_loss = 0
        right_token = 0
        whole_token = 0
        random.shuffle(data.train_Ids)
        # Switch the model to training mode.
        model.train()
        model.zero_grad()
        batch_size = data.HP_batch_size
        train_num = len(data.train_Ids)
        total_batch = train_num // batch_size + 1
        for batch_id in range(total_batch):
            start = batch_id * batch_size
            end = (batch_id + 1) * batch_size
            if end > train_num:
                end = train_num
            instance = data.train_Ids[start:end]
            # The extra batch is empty when train_num divides evenly.
            if not instance:
                continue
            (batch_word, batch_features, batch_wordlen, batch_wordrecover,
             batch_char, batch_charlen, batch_charrecover, batch_label,
             mask) = batchify_with_label(instance, data.HP_gpu)
            loss, tag_seq = model.neg_log_likelihood_loss(
                batch_word, batch_features, batch_wordlen, batch_char,
                batch_charlen, batch_charrecover, batch_label, mask)
            right, whole = predict_check(tag_seq, batch_label, mask)
            right_token += right
            whole_token += whole
            # NOTE(review): ``loss.data[0]`` fails on 0-dim tensors in newer
            # PyTorch releases (``loss.item()`` is the replacement) — confirm
            # the targeted PyTorch version before changing it.
            sample_loss += loss.data[0]
            total_loss += loss.data[0]
            if end % 500 == 0:
                temp_time = time.time()
                temp_cost = temp_time - temp_start
                temp_start = temp_time
                print(
                    " Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f"
                    % (end, temp_cost, sample_loss, right_token, whole_token,
                       (right_token + 0.) / whole_token))
                sys.stdout.flush()
                sample_loss = 0
            loss.backward()
            optimizer.step()
            model.zero_grad()
        temp_time = time.time()
        temp_cost = temp_time - temp_start
        print(" Instance: %s; Time: %.2fs; loss: %.4f; acc: %s/%s=%.4f" %
              (end, temp_cost, sample_loss, right_token, whole_token,
               (right_token + 0.) / whole_token))
        epoch_finish = time.time()
        epoch_cost = epoch_finish - epoch_start
        print(
            "Epoch: %s training finished. Time: %.2fs, speed: %.2fst/s, total loss: %s"
            % (idx, epoch_cost, train_num / epoch_cost, total_loss))
        speed, acc, p, r, f, _, _ = evaluate(data, model, "dev")
        dev_finish = time.time()
        dev_cost = dev_finish - epoch_finish
        if data.seg:
            current_score = f
            print(
                "Dev: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"
                % (dev_cost, speed, acc, p, r, f))
        else:
            current_score = acc
            print("Dev: time: %.2fs speed: %.2fst/s; acc: %.4f" %
                  (dev_cost, speed, acc))
        if current_score > best_dev:
            if data.seg:
                print("Exceed previous best f score:", best_dev)
            else:
                print("Exceed previous best acc score:", best_dev)
            model_name = data.model_dir + '.' + str(idx) + ".model"
            print("Save current best model in file:", model_name)
            torch.save(model.state_dict(), model_name)
            best_dev = current_score
        # Decode the test split.
        speed, acc, p, r, f, _, _ = evaluate(data, model, "test")
        test_finish = time.time()
        test_cost = test_finish - dev_finish
        if data.seg:
            print(
                "Test: time: %.2fs, speed: %.2fst/s; acc: %.4f, p: %.4f, r: %.4f, f: %.4f"
                % (test_cost, speed, acc, p, r, f))
        else:
            print("Test: time: %.2fs, speed: %.2fst/s; acc: %.4f" %
                  (test_cost, speed, acc))
        gc.collect()
def lr_decay(optimizer, epoch, decay_rate, init_lr):
    """Set every parameter group's learning rate to the decayed value.

    The rate is ``init_lr / (1 + decay_rate * epoch)``; it is printed and
    written to the ``'lr'`` entry of each group in
    ``optimizer.param_groups``.

    :return: The same ``optimizer`` object, modified in place.
    """
    decayed = init_lr / (1 + decay_rate * epoch)
    print(f" Learning rate is setted as: {decayed}")
    for group in optimizer.param_groups:
        group.update(lr=decayed)
    return optimizer
# Fragment of a command-line entry script. `parser`, `Data`, `seed_num` and
# `data_initialization` are defined outside this fragment.
# Every option below is registered without `type=`, so parsed values stay
# strings (including the "True" default of --seg).
parser.add_argument('--train', default="data/conll03/train.bmes")
parser.add_argument('--dev', default="data/conll03/dev.bmes")
parser.add_argument('--test', default="data/conll03/test.bmes")
parser.add_argument('--seg', default="True")
parser.add_argument('--raw')
parser.add_argument('--loadmodel')
parser.add_argument('--output')
args = parser.parse_args()
# Copy the parsed paths onto the shared Data object.
data = Data()
data.train_dir = args.train
data.dev_dir = args.dev
data.test_dir = args.test
# NOTE(review): args.savemodel, args.savedset and args.status are read here
# but are not registered in this fragment — presumably added earlier; confirm.
data.model_dir = args.savemodel
data.dset_dir = args.savedset
print("aaa", data.dset_dir)
status = args.status.lower()
save_model_dir = args.savemodel
# Use the GPU whenever CUDA is available.
data.HP_gpu = torch.cuda.is_available()
print("Seed num:", seed_num)
data.number_normalized = True
data.word_emb_dir = "../data/glove.6B.100d.txt"
if status == 'train':
    print("MODEL: train")
    data_initialization(data)
    # Hard-coded training settings applied after initialisation.
    data.use_char = True
    data.HP_batch_size = 10
    data.HP_lr = 0.015
    data.char_seq_feature = "CNN"
    data.generate_instance('train')
def fmeasure_from_singlefile(twolabel_file, label_type="BMES", pred_col=-1):
    """Print precision, recall and F1 for a file holding gold and predicted labels.

    The file is read with ``readTwoLabelSentence``, which supplies the gold
    and predicted label sequences.

    :param twolabel_file: Path of the file with both label columns.
    :param label_type: Tag scheme passed on to ``get_ner_fmeasure``.
    :param pred_col: Column index of the predicted label, passed on to
        ``readTwoLabelSentence``.
    """
    _, golden_labels, predict_labels = readTwoLabelSentence(twolabel_file, pred_col)
    # get_ner_fmeasure returns (accuracy, precision, recall, f_measure).
    _, P, R, F = get_ner_fmeasure(golden_labels, predict_labels, label_type)
    print("P:%s, R:%s, F:%s" % (P, R, F))
def __init__(self, data):
    """Build the word-level sequence encoder (GRU, LSTM or CNN) and output layer.

    The encoder input width is the word embedding size, plus the char hidden
    size when characters are used (added twice for the "ALL" char
    extractor), plus every feature embedding size. ``hidden2tag`` maps
    ``data.HP_hidden_dim`` to ``data.label_alphabet_size``.

    :param data: Object providing the ``HP_*`` hyper-parameters,
        ``use_char``, ``char_feature_extractor``, ``word_feature_extractor``,
        ``word_emb_dim``, ``feature_num``, ``feature_emb_dims`` and
        ``label_alphabet_size``.
    """
    super(WordSequence, self).__init__()
    print("build word sequence feature extractor: %s..." %
          (data.word_feature_extractor))
    self.gpu = data.HP_gpu
    self.use_char = data.use_char
    self.droplstm = nn.Dropout(data.HP_dropout)
    self.bilstm_flag = data.HP_bilstm
    self.lstm_layer = data.HP_lstm_layer
    self.wordrep = WordRep(data)
    self.input_size = data.word_emb_dim
    if self.use_char:
        self.input_size += data.HP_char_hidden_dim
        if data.char_feature_extractor == "ALL":
            self.input_size += data.HP_char_hidden_dim
    for idx in range(data.feature_num):
        self.input_size += data.feature_emb_dims[idx]
    # The recurrent layer takes the word representation as input and outputs
    # hidden states of size hidden_dim (split over both directions when
    # bidirectional).
    if self.bilstm_flag:
        lstm_hidden = data.HP_hidden_dim // 2
    else:
        lstm_hidden = data.HP_hidden_dim
    self.word_feature_extractor = data.word_feature_extractor
    if self.word_feature_extractor == "GRU":
        self.lstm = nn.GRU(self.input_size,
                           lstm_hidden,
                           num_layers=self.lstm_layer,
                           batch_first=True,
                           bidirectional=self.bilstm_flag)
    elif self.word_feature_extractor == "LSTM":
        self.lstm = nn.LSTM(self.input_size,
                            lstm_hidden,
                            num_layers=self.lstm_layer,
                            batch_first=True,
                            bidirectional=self.bilstm_flag)
    elif self.word_feature_extractor == "CNN":
        self.word2cnn = nn.Linear(self.input_size, data.HP_hidden_dim)
        self.cnn_layer = data.HP_cnn_layer
        print("CNN layer: %s" % self.cnn_layer)
        self.cnn_list = nn.ModuleList()
        self.cnn_drop_list = nn.ModuleList()
        self.cnn_batchnorm_list = nn.ModuleList()
        kernel = 3
        # Integer division: Conv1d padding must be an int, and "/" yields a
        # float under Python 3.
        pad_size = (kernel - 1) // 2
        for idx in range(self.cnn_layer):
            self.cnn_list.append(
                nn.Conv1d(data.HP_hidden_dim,
                          data.HP_hidden_dim,
                          kernel_size=kernel,
                          padding=pad_size))
            self.cnn_drop_list.append(nn.Dropout(data.HP_dropout))
            self.cnn_batchnorm_list.append(
                nn.BatchNorm1d(data.HP_hidden_dim))
    # The linear layer that maps from hidden state space to tag space.
    self.hidden2tag = nn.Linear(data.HP_hidden_dim,
                                data.label_alphabet_size)
    if self.gpu:
        self.droplstm = self.droplstm.cuda()
        self.hidden2tag = self.hidden2tag.cuda()
        if self.word_feature_extractor == "CNN":
            self.word2cnn = self.word2cnn.cuda()
            for idx in range(self.cnn_layer):
                self.cnn_list[idx] = self.cnn_list[idx].cuda()
                self.cnn_drop_list[idx] = self.cnn_drop_list[idx].cuda()
                self.cnn_batchnorm_list[idx] = self.cnn_batchnorm_list[
                    idx].cuda()
        else:
            self.lstm = self.lstm.cuda()
# Tail of a function whose beginning lies outside this block.
    return pred_results, pred_scores


if __name__ == '__main__':
    # Script entry point: read the configuration file named by --config,
    # prepare the experiment directory and, in train mode, initialise the
    # data. `seed_num`, `Data` and `data_initialization` are defined outside
    # this fragment.
    from utils.logging import logger
    parser = argparse.ArgumentParser(description='Tuning with NCRF++')
    # parser.add_argument('--status', choices=['train', 'decode'], help='update algorithm', default='train')
    parser.add_argument('--config', help='Configuration File')
    args = parser.parse_args()
    data = Data()
    data.read_config(args.config)
    # save the config in the model dir
    # NOTE(review): os.path.dirname returns '' when data.model_dir has no
    # directory part, and os.makedirs('') would then fail — confirm that
    # model_dir always includes a directory.
    expr_dir = os.path.dirname(data.model_dir)
    print(f"Experiment inside {expr_dir}")
    if not os.path.exists(expr_dir):
        os.makedirs(expr_dir)
    # The log file is opened in 'w' mode, so an earlier log in the same
    # directory is overwritten.
    logger.addHandler(logging.FileHandler(os.path.join(expr_dir, 'log'), 'w'))
    print("Copying your config in the experiment dir")
    shutil.copy(args.config, expr_dir)
    status = data.status.lower()
    # Use the GPU whenever CUDA is available.
    data.HP_gpu = torch.cuda.is_available()
    # NOTE(review): print receives the format string and the value as two
    # arguments, so the literal "%s" is printed — looks unintended.
    print("Seed num: %s", seed_num)
    if status == 'train':
        print("MODEL: train")
        data_initialization(data)
def show_data_summary(self):
    """Print a summary of I/O paths, model, training and hyper-parameter settings.

    Attributes are read one at a time, immediately before their line is
    printed. Stdout is flushed at the end.
    """
    def emit(rows):
        # Each row is (format string, attribute name).
        for template, attribute in rows:
            print(template % getattr(self, attribute))

    divider = " " + "++" * 20
    print("++" * 50)
    print("DATA SUMMARY START:")
    print(" I/O:")
    emit([
        (" Tag scheme: %s", "tagScheme"),
        (" MAX SENTENCE LENGTH: %s", "MAX_SENTENCE_LENGTH"),
        (" MAX WORD LENGTH: %s", "MAX_WORD_LENGTH"),
        (" Number normalized: %s", "number_normalized"),
        (" Word alphabet size: %s", "word_alphabet_size"),
        (" Char alphabet size: %s", "char_alphabet_size"),
        (" Label alphabet size: %s", "label_alphabet_size"),
        (" Word embedding dir: %s", "word_emb_dir"),
        (" Char embedding dir: %s", "char_emb_dir"),
        (" Word embedding size: %s", "word_emb_dim"),
        (" Char embedding size: %s", "char_emb_dim"),
        (" Norm word emb: %s", "norm_word_emb"),
        (" Norm char emb: %s", "norm_char_emb"),
        (" Train file directory: %s", "train_dir"),
        (" Dev file directory: %s", "dev_dir"),
        (" Test file directory: %s", "test_dir"),
        (" Raw file directory: %s", "raw_dir"),
        (" Dset file directory: %s", "dset_dir"),
        (" Model file directory: %s", "model_dir"),
        (" Loadmodel directory: %s", "load_model_dir"),
        (" Decode file directory: %s", "decode_dir"),
    ])
    # Instance counts are the lengths of the stored text lists.
    for template, attribute in (
            (" Train instance number: %s", "train_texts"),
            (" Dev instance number: %s", "dev_texts"),
            (" Test instance number: %s", "test_texts"),
            (" Raw instance number: %s", "raw_texts"),
    ):
        print(template % (len(getattr(self, attribute))))
    print(" FEATURE num: %s" % (self.feature_num))
    for idx in range(self.feature_num):
        for template, attribute in (
                (" Fe: %s alphabet size: %s", "feature_alphabet_sizes"),
                (" Fe: %s embedding dir: %s", "feature_emb_dirs"),
                (" Fe: %s embedding size: %s", "feature_emb_dims"),
                (" Fe: %s norm emb: %s", "norm_feature_embs"),
        ):
            print(template % (self.feature_alphabets[idx].name,
                              getattr(self, attribute)[idx]))
    print(divider)
    print(" Model Network:")
    emit([
        (" Model use_crf: %s", "use_crf"),
        (" Model word extractor: %s", "word_feature_extractor"),
        (" Model use_char: %s", "use_char"),
    ])
    if self.use_char:
        emit([
            (" Model char extractor: %s", "char_feature_extractor"),
            (" Model char_hidden_dim: %s", "HP_char_hidden_dim"),
        ])
    print(divider)
    print(" Training:")
    emit([
        (" Optimizer: %s", "optimizer"),
        (" Iteration: %s", "HP_iteration"),
        (" BatchSize: %s", "HP_batch_size"),
        (" Average batch loss: %s", "average_batch_loss"),
    ])
    print(divider)
    print(" Hyperparameters:")
    emit([
        (" Hyper lr: %s", "HP_lr"),
        (" Hyper lr_decay: %s", "HP_lr_decay"),
        (" Hyper HP_clip: %s", "HP_clip"),
        (" Hyper momentum: %s", "HP_momentum"),
        (" Hyper l2: %s", "HP_l2"),
        (" Hyper hidden_dim: %s", "HP_hidden_dim"),
        (" Hyper dropout: %s", "HP_dropout"),
        (" Hyper lstm_layer: %s", "HP_lstm_layer"),
        (" Hyper bilstm: %s", "HP_bilstm"),
        (" Hyper GPU: %s", "HP_gpu"),
    ])
    print("DATA SUMMARY END.")
    print("++" * 50)
    sys.stdout.flush()
def __init__(self, data):
    """Build the word representation layer: word, char and feature embeddings.

    When ``data.use_char`` is truthy a character feature extractor is chosen
    by ``data.char_feature_extractor`` ("CNN", "LSTM", "GRU", or "ALL" for
    CNN plus an extra LSTM); any other value prints an error and exits with
    ``exit(0)``. Every embedding table is initialised from the matching
    pretrained array when one is present, otherwise from
    ``self.random_embedding``.

    :param data: Object providing ``HP_gpu``, ``use_char``,
        ``HP_batch_size``, ``HP_char_hidden_dim``, ``HP_dropout``, the
        alphabets, embedding dimensions and pretrained embedding arrays.
    """
    super(WordRep, self).__init__()
    print("build word representation...")
    self.gpu = data.HP_gpu
    self.use_char = data.use_char
    self.batch_size = data.HP_batch_size
    self.char_hidden_dim = 0
    self.char_all_feature = False
    if self.use_char:
        self.char_hidden_dim = data.HP_char_hidden_dim
        self.char_embedding_dim = data.char_emb_dim

        def make_extractor(extractor_class):
            # All char extractors share the same constructor arguments.
            return extractor_class(data.char_alphabet.size(),
                                   data.pretrain_char_embedding,
                                   self.char_embedding_dim,
                                   self.char_hidden_dim, data.HP_dropout,
                                   self.gpu)

        if data.char_feature_extractor == "CNN":
            self.char_feature = make_extractor(CharCNN)
        elif data.char_feature_extractor == "LSTM":
            self.char_feature = make_extractor(CharBiLSTM)
        elif data.char_feature_extractor == "GRU":
            self.char_feature = make_extractor(CharBiGRU)
        elif data.char_feature_extractor == "ALL":
            self.char_all_feature = True
            self.char_feature = make_extractor(CharCNN)
            self.char_feature_extra = make_extractor(CharBiLSTM)
        else:
            print(
                "Error char feature selection, please check parameter data.char_feature_extractor (CNN/LSTM/GRU/ALL)."
            )
            exit(0)
    self.embedding_dim = data.word_emb_dim
    self.drop = nn.Dropout(data.HP_dropout)
    self.word_embedding = nn.Embedding(data.word_alphabet.size(),
                                       self.embedding_dim)
    if data.pretrain_word_embedding is None:
        word_table = self.random_embedding(data.word_alphabet.size(),
                                           self.embedding_dim)
    else:
        word_table = data.pretrain_word_embedding
    self.word_embedding.weight.data.copy_(torch.from_numpy(word_table))
    self.feature_num = data.feature_num
    self.feature_embedding_dims = data.feature_emb_dims
    self.feature_embeddings = nn.ModuleList()
    # Create every feature embedding first, then initialise the weights.
    for position in range(self.feature_num):
        self.feature_embeddings.append(
            nn.Embedding(data.feature_alphabets[position].size(),
                         self.feature_embedding_dims[position]))
    for position in range(self.feature_num):
        if data.pretrain_feature_embeddings[position] is None:
            feature_table = self.random_embedding(
                data.feature_alphabets[position].size(),
                self.feature_embedding_dims[position])
        else:
            feature_table = data.pretrain_feature_embeddings[position]
        self.feature_embeddings[position].weight.data.copy_(
            torch.from_numpy(feature_table))
    if self.gpu:
        self.drop = self.drop.cuda()
        self.word_embedding = self.word_embedding.cuda()
        for position in range(self.feature_num):
            self.feature_embeddings[position] = self.feature_embeddings[
                position].cuda()