def _entities_by_type(offsets, class_collector=None):
    """Group ``(start, stop, ent_type)`` triples into ``{type: [(start, stop), ...]}``.

    Underscores are stripped from the entity type to normalise names.  When
    *class_collector* (a set) is given, every normalised type is added to it
    so the caller can accumulate the full label inventory.
    """
    grouped = dict()
    for start, stop, ent_type in offsets:
        ent_type = ent_type.replace('_', '')
        if class_collector is not None:
            class_collector.add(ent_type)
        grouped.setdefault(ent_type, []).append((start, stop))
    return grouped


def eval_(output_dir, t_labels, p_labels, text):
    """Score previously written prediction files against gold label files.

    Reads three line-aligned files from *output_dir* — true labels
    (*t_labels*), predicted labels (*p_labels*) and raw text (*text*) —
    converts the tag sequences to entity offsets, and prints span-level
    f1 / precision / recall.

    Args:
        output_dir: directory holding the three files.
        t_labels: filename of the gold label file (one tag sequence per line).
        p_labels: filename of the predicted label file.
        text: filename of the raw-text file.
    """
    with open(os.path.join(output_dir, t_labels), 'r') as t, \
            open(os.path.join(output_dir, p_labels), 'r') as p, \
            open(os.path.join(output_dir, text), 'r') as textf:
        ne_class_list = set()
        true_labels_for_testing = []
        results_of_prediction = []
        # Renamed loop variable: the original reused (shadowed) the `text`
        # parameter, which is confusing once the files are open.
        for line, true_labels, predicted_labels in zip(textf, t, p):
            # Tags are stored with '_' separators; the BILUO helpers expect '-'.
            true_labels = true_labels.strip().replace('_', '-').split()
            predicted_labels = predicted_labels.strip().replace('_', '-').split()
            biluo_tags_true = get_biluo(true_labels)
            biluo_tags_predicted = get_biluo(predicted_labels)
            doc = Doc(line.strip())
            offset_true_labels = offset_from_biluo(doc, biluo_tags_true)
            offset_predicted_labels = offset_from_biluo(doc, biluo_tags_predicted)
            true_labels_for_testing.append(
                _entities_by_type(offset_true_labels, ne_class_list))
            results_of_prediction.append(
                _entities_by_type(offset_predicted_labels))

        from eval.quality import calculate_prediction_quality
        print(ne_class_list)
        f1, precision, recall, results = \
            calculate_prediction_quality(true_labels_for_testing,
                                         results_of_prediction,
                                         tuple(ne_class_list))
        print(f1, precision, recall, results)
def train(args):
    """Train the tagger on stacked pre-computed transformer embeddings.

    Encodes the corpus once with three stacked transformer embeddings,
    trains the model returned by ``build_model``, checkpoints whenever the
    accumulated loss improves, then reloads the best checkpoint, writes
    label/prediction/text files and prints span-level f1 / precision /
    recall on the test split.

    Args:
        args: argparse Namespace; mutated in place (``vocab_size``,
            ``number_of_tags``, ``embedding_dim`` are filled in here).
    """
    vocab_path = os.path.join(args.data_dir, args.vocab)
    tag_path = os.path.join(args.data_dir, args.tag_set)
    word_to_idx, idx_to_word, tag_to_idx, idx_to_tag = load_vocabs(vocab_path, tag_path)
    train_sentences, train_labels, test_sentences, test_labels = prepare_text(args, tag_to_idx)

    device = get_device(args)
    start = time.time()
    bert_embedding1 = TransformerWordEmbeddings('distilbert-base-multilingual-cased',
                                                layers='-1',
                                                batch_size=args.batch_size,
                                                pooling_operation=args.pooling_operation)
    bert_embedding2 = TransformerWordEmbeddings('distilroberta-base',
                                                layers='-1',
                                                batch_size=args.batch_size,
                                                pooling_operation=args.pooling_operation)
    bert_embedding3 = TransformerWordEmbeddings('sentence-transformers/xlm-r-100langs-bert-base-nli-stsb-mean-tokens',
                                                layers='-1',
                                                batch_size=args.batch_size,
                                                pooling_operation=args.pooling_operation)
    encoder = StackTransformerEmbeddings([bert_embedding1, bert_embedding2, bert_embedding3])
    train_sentences_encoded = encoder.encode(train_sentences)
    test_sentences_encoded = encoder.encode(test_sentences)
    print(f'Encoding time:{time.time() - start}')

    # Update the Namespace so build_model sees the real sizes.
    args.vocab_size = len(idx_to_word)
    args.number_of_tags = len(idx_to_tag)
    # The model consumes pre-computed embeddings, so its input width is the
    # concatenated embedding length of the stack.
    args.embedding_dim = encoder.embedding_length

    model = build_model(args, device)
    print(model)
    model = model.to(device)
    betas = (0.9, 0.999)
    eps = 1e-8
    optimizer = BertAdam(model, lr=args.learning_rate, b1=betas[0], b2=betas[1], e=eps)

    pad_id = word_to_idx['PAD']
    pad_id_labels = tag_to_idx['PAD']
    batcher = SamplingBatcherStackedTransformers(
        np.asarray(train_sentences_encoded, dtype=object),
        np.asarray(train_labels, dtype=object),
        batch_size=args.batch_size,
        pad_id=pad_id,
        pad_id_labels=pad_id_labels,
        embedding_length=encoder.embedding_length,
        device=device)

    updates = 1
    total_loss = 0
    best_loss = +inf
    stop_training = False
    output_dir = args.output_dir
    # Was `try: os.makedirs(...) except: pass` — only "already exists"
    # should be ignored, not every error.
    os.makedirs(output_dir, exist_ok=True)
    prefix = args.train_text.split('_')[0] if len(args.train_text.split('_')) > 1 \
        else args.train_text.split('.')[0]

    start_time = time.time()
    for epoch in range(args.epochs):
        for batch in batcher:
            updates += 1
            optimizer.zero_grad()
            loss = model.score(batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.data
            if updates % args.patience == 0:
                print(f'Epoch: {epoch}, Updates:{updates}, Loss: {total_loss}')
                if best_loss > total_loss:
                    # NOTE(review): loss_fn is not defined in this function —
                    # presumably a module-level global; confirm.
                    save_state(f'{output_dir}/{prefix}_best_model.pt', model,
                               loss_fn, optimizer, updates, args=args)
                    best_loss = total_loss
                total_loss = 0
            if updates % args.max_steps == 0:
                stop_training = True
                break
        if stop_training:
            break
    print('Training time:{}'.format(time.time() - start_time))

    def get_idx_to_tag(label_ids):
        # Map label ids back to tag strings (None for unknown ids).
        return [idx_to_tag.get(idx) for idx in label_ids]

    def get_idx_to_word(words_ids):
        # Map word ids back to tokens (None for unknown ids).
        return [idx_to_word.get(idx) for idx in words_ids]

    # Reload the best checkpoint for evaluation.
    model, model_args = load_model_state(f'{output_dir}/{prefix}_best_model.pt', device)
    model = model.to(device)
    batcher_test = SamplingBatcherStackedTransformers(
        np.asarray(test_sentences_encoded, dtype=object),
        np.asarray(test_labels, dtype=object),
        batch_size=args.batch_size,
        pad_id=pad_id,
        pad_id_labels=pad_id_labels,
        embedding_length=encoder.embedding_length,
        device=device)

    ne_class_list = set()
    true_labels_for_testing = []
    results_of_prediction = []
    with open(f'{output_dir}/{prefix}_label.txt', 'w', encoding='utf8') as t, \
            open(f'{output_dir}/{prefix}_predict.txt', 'w', encoding='utf8') as p, \
            open(f'{output_dir}/{prefix}_text.txt', 'w', encoding='utf8') as textf:
        with torch.no_grad():
            # predict() method returns final labels not the label_ids
            preds = predict_no_attn(batcher_test, model, idx_to_tag)
            for text, labels, predict_labels in zip(test_sentences, test_labels, preds):
                tag_labels_true = get_idx_to_tag(labels)
                text_ = text
                tag_labels_predicted = ' '.join(predict_labels)
                tag_labels_true = ' '.join(tag_labels_true)
                p.write(tag_labels_predicted + '\n')
                t.write(tag_labels_true + '\n')
                textf.write(text_ + '\n')

                # Tags use '_' separators; the BILUO helpers expect '-'.
                tag_labels_true = tag_labels_true.strip().replace('_', '-').split()
                tag_labels_predicted = tag_labels_predicted.strip().replace('_', '-').split()
                biluo_tags_true = get_biluo(tag_labels_true)
                biluo_tags_predicted = get_biluo(tag_labels_predicted)
                doc = Doc(text_)
                offset_true_labels = offset_from_biluo(doc, biluo_tags_true)
                offset_predicted_labels = offset_from_biluo(doc, biluo_tags_predicted)

                # Group gold entities by (normalised) type.
                ent_labels = dict()
                for start, stop, ent_type in offset_true_labels:
                    ent_type = ent_type.replace('_', '')
                    ne_class_list.add(ent_type)
                    ent_labels.setdefault(ent_type, []).append((start, stop))
                true_labels_for_testing.append(ent_labels)

                # Group predicted entities by (normalised) type.
                ent_labels = dict()
                for start, stop, ent_type in offset_predicted_labels:
                    ent_type = ent_type.replace('_', '')
                    ent_labels.setdefault(ent_type, []).append((start, stop))
                results_of_prediction.append(ent_labels)

    from eval.quality import calculate_prediction_quality
    f1, precision, recall, results = \
        calculate_prediction_quality(true_labels_for_testing,
                                     results_of_prediction,
                                     tuple(ne_class_list))
    print(f1, precision, recall, results)
def train(args):
    """Train the attention tagger on id-encoded sentences and evaluate it.

    Builds vocab/tag lookup tables from ``prepare``, trains with a padding
    attention mask, checkpoints on improved accumulated loss, then reloads
    the best checkpoint, writes label/prediction/text files and prints
    span-level f1 / precision / recall on the test split.

    Args:
        args: argparse Namespace; ``vocab_size`` and ``number_of_tags``
            are filled in here.
    """
    idx_to_word, idx_to_tag, train_sentences, train_labels, test_sentences, test_labels = prepare(
        args)
    # Invert the id->token maps to get token->id lookups.
    word_to_idx = {idx_to_word[key]: key for key in idx_to_word}
    tag_to_idx = {idx_to_tag[key]: key for key in idx_to_tag}
    args.vocab_size = len(idx_to_word)
    args.number_of_tags = len(idx_to_tag)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda:0" if use_cuda and not args.cpu else "cpu")
    model = build_model(args)
    print(model)
    model = model.to(device)
    betas = (0.9, 0.999)
    eps = 1e-8
    optimizer = BertAdam(model, lr=args.learning_rate, b1=betas[0], b2=betas[1], e=eps)

    pad_id = word_to_idx['PAD']
    batcher = SamplingBatcher(np.asarray(train_sentences, dtype=object),
                              np.asarray(train_labels, dtype=object),
                              batch_size=args.batch_size, pad_id=pad_id)

    updates = 1
    total_loss = 0
    best_loss = +inf
    stop_training = False
    output_dir = args.output_dir
    # Was `try: os.makedirs(...) except: pass` — only "already exists"
    # should be ignored, not every error.
    os.makedirs(output_dir, exist_ok=True)
    prefix = args.train_text.split('_')[0] if len(args.train_text.split('_')) > 1 \
        else args.train_text.split('.')[0]

    start_time = time.time()
    for epoch in range(args.epochs):
        for batch in batcher:
            updates += 1
            batch_data, batch_labels, batch_len, mask_x, mask_y = batch
            optimizer.zero_grad()
            batch_data = batch_data.to(device)
            batch_labels = batch_labels.to(device)
            mask_y = mask_y.to(device)
            # Mask out PAD positions in self-attention.
            attn_mask = get_attn_pad_mask(batch_data, batch_data, pad_id)
            output_batch = model(batch_data, attn_mask)
            loss = loss_fn(output_batch, batch_labels, mask_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.data
            if updates % args.patience == 0:
                print(f'Epoch: {epoch}, Updates:{updates}, Loss: {total_loss}')
                if best_loss > total_loss:
                    save_state(f'{output_dir}/{prefix}_best_model.pt', model, loss_fn,
                               optimizer, updates)
                    best_loss = total_loss
                total_loss = 0
            if updates % args.max_steps == 0:
                stop_training = True
                break
        if stop_training:
            break
    print('Training time:{}'.format(time.time() - start_time))

    def get_idx_to_tag(label_ids):
        # Map label ids back to tag strings (None for unknown ids).
        return [idx_to_tag.get(idx) for idx in label_ids]

    def get_idx_to_word(words_ids):
        # Map word ids back to tokens (None for unknown ids).
        return [idx_to_word.get(idx) for idx in words_ids]

    # Reload the best checkpoint into the existing model for evaluation.
    updates = load_model_state(f'{output_dir}/{prefix}_best_model.pt', model)

    ne_class_list = set()
    true_labels_for_testing = []
    results_of_prediction = []
    with open(f'{output_dir}/{prefix}_label.txt', 'w', encoding='utf8') as t, \
            open(f'{output_dir}/{prefix}_predict.txt', 'w', encoding='utf8') as p, \
            open(f'{output_dir}/{prefix}_text.txt', 'w', encoding='utf8') as textf:
        with torch.no_grad():
            model.eval()
            for text, label in zip(test_sentences, test_labels):
                text_tensor = torch.LongTensor(text).unsqueeze(0).to(device)
                labels = torch.LongTensor(label).unsqueeze(0).to(device)
                # NOTE(review): training passes an attention mask
                # (model(batch_data, attn_mask)) but inference does not —
                # confirm the model treats the mask as optional.
                predict = model(text_tensor)
                predict_labels = predict.argmax(dim=1)
                predict_labels = predict_labels.view(-1)
                labels = labels.view(-1)

                predicted_labels = predict_labels.cpu().data.tolist()
                true_labels = labels.cpu().data.tolist()
                tag_labels_predicted = get_idx_to_tag(predicted_labels)
                tag_labels_true = get_idx_to_tag(true_labels)
                text_ = get_idx_to_word(text)

                tag_labels_predicted = ' '.join(tag_labels_predicted)
                tag_labels_true = ' '.join(tag_labels_true)
                text_ = ' '.join(text_)
                p.write(tag_labels_predicted + '\n')
                t.write(tag_labels_true + '\n')
                textf.write(text_ + '\n')

                # Tags use '_' separators; the BILUO helpers expect '-'.
                tag_labels_true = tag_labels_true.strip().replace('_', '-').split()
                tag_labels_predicted = tag_labels_predicted.strip().replace(
                    '_', '-').split()
                biluo_tags_true = get_biluo(tag_labels_true)
                biluo_tags_predicted = get_biluo(tag_labels_predicted)
                doc = Doc(text_)
                offset_true_labels = offset_from_biluo(doc, biluo_tags_true)
                offset_predicted_labels = offset_from_biluo(doc, biluo_tags_predicted)

                # Group gold entities by (normalised) type.
                ent_labels = dict()
                for start, stop, ent_type in offset_true_labels:
                    ent_type = ent_type.replace('_', '')
                    ne_class_list.add(ent_type)
                    ent_labels.setdefault(ent_type, []).append((start, stop))
                true_labels_for_testing.append(ent_labels)

                # Group predicted entities by (normalised) type.
                ent_labels = dict()
                for start, stop, ent_type in offset_predicted_labels:
                    ent_type = ent_type.replace('_', '')
                    ent_labels.setdefault(ent_type, []).append((start, stop))
                results_of_prediction.append(ent_labels)

    from eval.quality import calculate_prediction_quality
    f1, precision, recall, results = \
        calculate_prediction_quality(true_labels_for_testing,
                                     results_of_prediction,
                                     tuple(ne_class_list))
    print(f1, precision, recall, results)
def decode(options):
    """Decode the test split with a saved model and print span-level metrics.

    Loads the checkpoint named by ``options.model``, rebuilds the vocab/tag
    lookups from the checkpoint's own args, predicts over the test set,
    writes label/prediction/text files and prints f1 / precision / recall.

    Args:
        options: argparse Namespace with ``test_text``, ``output_dir``,
            ``model``, ``batch_size`` and data-location fields.
    """
    prefix = options.test_text.split('_')[0] if len(options.test_text.split('_')) > 1 \
        else options.test_text.split('.')[0]
    # BUG FIX: was get_device(args) — `args` is not a parameter of this
    # function and only worked via a module-level global.
    device = get_device(options)
    output_dir = options.output_dir
    # Was `try: os.makedirs(...) except: pass` — only "already exists"
    # should be ignored, not every error.
    os.makedirs(output_dir, exist_ok=True)

    model, model_args = load_model_state(options.model, device)
    model = model.to(device)
    # Use the vocab/tag set the model was trained with, not the current CLI's.
    vocab_path = os.path.join(model_args.data_dir, model_args.vocab)
    tag_path = os.path.join(model_args.data_dir, model_args.tag_set)
    word_to_idx, idx_to_word, tag_to_idx, idx_to_tag = load_vocabs(
        vocab_path, tag_path)
    *_, test_sentences, test_labels = prepare(options, word_to_idx, tag_to_idx)

    def get_idx_to_tag(label_ids):
        # Map label ids back to tag strings (None for unknown ids).
        return [idx_to_tag.get(idx) for idx in label_ids]

    def get_idx_to_word(words_ids):
        # Map word ids back to tokens (None for unknown ids).
        return [idx_to_word.get(idx) for idx in words_ids]

    pad_id = word_to_idx['PAD']
    pad_id_labels = tag_to_idx['PAD']
    batcher_test = SamplingBatcher(np.asarray(test_sentences, dtype=object),
                                   np.asarray(test_labels, dtype=object),
                                   # BUG FIX: was args.batch_size (global leak).
                                   batch_size=options.batch_size,
                                   pad_id=pad_id,
                                   pad_id_labels=pad_id_labels)

    ne_class_list = set()
    true_labels_for_testing = []
    results_of_prediction = []
    with open(f'{output_dir}/{prefix}_label.txt', 'w', encoding='utf8') as t, \
            open(f'{output_dir}/{prefix}_predict.txt', 'w', encoding='utf8') as p, \
            open(f'{output_dir}/{prefix}_text.txt', 'w', encoding='utf8') as textf:
        with torch.no_grad():
            preds = predict(batcher_test, model, idx_to_tag, pad_id=pad_id)
            for text, labels, predict_labels in zip(test_sentences, test_labels, preds):
                tag_labels_true = get_idx_to_tag(labels)
                text_ = get_idx_to_word(text)
                tag_labels_predicted = ' '.join(predict_labels)
                tag_labels_true = ' '.join(tag_labels_true)
                text_ = ' '.join(text_)
                p.write(tag_labels_predicted + '\n')
                t.write(tag_labels_true + '\n')
                textf.write(text_ + '\n')

                # Tags use '_' separators; the BILUO helpers expect '-'.
                tag_labels_true = tag_labels_true.strip().replace('_', '-').split()
                tag_labels_predicted = tag_labels_predicted.strip().replace(
                    '_', '-').split()
                biluo_tags_true = get_biluo(tag_labels_true)
                biluo_tags_predicted = get_biluo(tag_labels_predicted)
                doc = Doc(text_)
                offset_true_labels = offset_from_biluo(doc, biluo_tags_true)
                offset_predicted_labels = offset_from_biluo(doc, biluo_tags_predicted)

                # Group gold entities by (normalised) type.
                ent_labels = dict()
                for start, stop, ent_type in offset_true_labels:
                    ent_type = ent_type.replace('_', '')
                    ne_class_list.add(ent_type)
                    ent_labels.setdefault(ent_type, []).append((start, stop))
                true_labels_for_testing.append(ent_labels)

                # Group predicted entities by (normalised) type.
                ent_labels = dict()
                for start, stop, ent_type in offset_predicted_labels:
                    ent_type = ent_type.replace('_', '')
                    ent_labels.setdefault(ent_type, []).append((start, stop))
                results_of_prediction.append(ent_labels)

    from eval.quality import calculate_prediction_quality
    f1, precision, recall, results = \
        calculate_prediction_quality(true_labels_for_testing,
                                     results_of_prediction,
                                     tuple(ne_class_list))
    print(f1, precision, recall, results)
ent_labels = dict() for ent in offset_predicted_labels: start, stop, ent_type = ent ent_type = ent_type.replace('_', '') if ent_type in ent_labels: ent_labels[ent_type].append((start, stop)) else: ent_labels[ent_type] = [(start, stop)] predicted_labels_final.append(ent_labels) from eval.quality import calculate_prediction_quality print(ne_class_list) f1, precision, recall, results = \ calculate_prediction_quality(true_labels_final, predicted_labels_final, tuple(ne_class_list)) final_results = dict( f1=f1, precesion=precision, recall=recall, all=results ) print(final_results) # def parse_args(): # parser = argparse.ArgumentParser() # parser.add_argument('--output_dir', type=str, default='outputs') # parser.add_argument('--t_labels', type=str, default="ubuntu_label.txt") # parser.add_argument('--p_labels', type=str, default="ubuntu_predict.txt") # parser.add_argument('--text', type=str, default="ubuntu_text.txt")
def train(args):
    """Train the tagger on flair-embedded sentences and evaluate it.

    Embeds the train/test sentences in place with a stacked multilingual
    BERT embedding, trains the model from ``build_model``, checkpoints on
    improved accumulated loss, then reloads the best checkpoint, writes
    label/prediction/text files and prints span-level f1 / precision /
    recall on the test split.

    Args:
        args: argparse Namespace; ``vocab_size`` and ``number_of_tags``
            are filled in here.
    """
    vocab_path = os.path.join(args.data_dir, args.vocab)
    tag_path = os.path.join(args.data_dir, args.tag_set)
    word_to_idx, idx_to_word, tag_to_idx, idx_to_tag = load_vocabs(
        vocab_path, tag_path)
    train_sentences, train_labels, test_sentences, test_labels = prepare_flair(
        args, tag_to_idx)

    device = get_device(args)
    flair.device = device
    start = time.time()
    # Multilingual BERT; combined via StackedEmbeddings so more embeddings
    # can be appended later.
    bert_embedding = TransformerWordEmbeddings(
        'distilbert-base-multilingual-cased',
        layers='-1',
        batch_size=args.batch_size)
    embeddings = StackedEmbeddings(embeddings=[bert_embedding])

    def _embed_in_batches(sentences):
        # Embed sentences in place, one batch at a time.
        # BUG FIX: the original loop bounds (`< n + batch_size` for train,
        # `<= n + batch_size` for test) overshot the data and issued empty
        # trailing batches; iterate only while data remains.
        n_samples = len(sentences)
        start_idx = 0
        while start_idx < n_samples:
            batch_slice = sentences[start_idx:min(start_idx + args.batch_size, n_samples)]
            embeddings.embed(batch_slice)
            start_idx += args.batch_size

    _embed_in_batches(train_sentences)
    _embed_in_batches(test_sentences)
    print(f'Encoding time:{time.time() - start}')

    # Update the Namespace so build_model sees the real sizes.
    args.vocab_size = len(idx_to_word)
    args.number_of_tags = len(idx_to_tag)

    model = build_model(args, device)
    print(model)
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters())

    pad_id = word_to_idx['PAD']
    pad_id_labels = tag_to_idx['PAD']
    batcher = SamplingBatcherFlair(
        np.asarray(train_sentences, dtype=object),
        np.asarray(train_labels, dtype=object),
        batch_size=args.batch_size,
        pad_id=pad_id,
        pad_id_labels=pad_id_labels,
        embedding_length=embeddings.embedding_length)

    updates = 1
    total_loss = 0
    best_loss = +inf
    stop_training = False
    output_dir = args.output_dir
    # Was `try: os.makedirs(...) except: pass` — only "already exists"
    # should be ignored, not every error.
    os.makedirs(output_dir, exist_ok=True)
    prefix = args.train_text.split('_')[0] if len(args.train_text.split('_')) > 1 \
        else args.train_text.split('.')[0]

    start_time = time.time()
    for epoch in range(args.epochs):
        for batch in batcher:
            updates += 1
            optimizer.zero_grad()
            loss = model.score(batch)
            loss.backward()
            optimizer.step()
            total_loss += loss.data
            if updates % args.patience == 0:
                print(f'Epoch: {epoch}, Updates:{updates}, Loss: {total_loss}')
                if best_loss > total_loss:
                    # NOTE(review): loss_fn is not defined in this function —
                    # presumably a module-level global; confirm.
                    save_state(f'{output_dir}/{prefix}_best_model.pt', model,
                               loss_fn, optimizer, updates, args=args)
                    best_loss = total_loss
                total_loss = 0
            if updates % args.max_steps == 0:
                stop_training = True
                break
        if stop_training:
            break
    print('Training time:{}'.format(time.time() - start_time))

    def get_idx_to_tag(label_ids):
        # Map label ids back to tag strings (None for unknown ids).
        return [idx_to_tag.get(idx) for idx in label_ids]

    def get_idx_to_word(words_ids):
        # Map word ids back to tokens (None for unknown ids).
        return [idx_to_word.get(idx) for idx in words_ids]

    # Reload the best checkpoint for evaluation.
    model, model_args = load_model_state(
        f'{output_dir}/{prefix}_best_model.pt', device)
    model = model.to(device)
    batcher_test = SamplingBatcherFlair(
        np.asarray(test_sentences, dtype=object),
        np.asarray(test_labels, dtype=object),
        batch_size=args.batch_size,
        pad_id=pad_id,
        pad_id_labels=pad_id_labels,
        embedding_length=embeddings.embedding_length)

    ne_class_list = set()
    true_labels_for_testing = []
    results_of_prediction = []
    with open(f'{output_dir}/{prefix}_label.txt', 'w', encoding='utf8') as t, \
            open(f'{output_dir}/{prefix}_predict.txt', 'w', encoding='utf8') as p, \
            open(f'{output_dir}/{prefix}_text.txt', 'w', encoding='utf8') as textf:
        with torch.no_grad():
            # predict() method returns final labels not the label_ids
            preds = predict_no_attn(batcher_test, model, idx_to_tag)
            for text, labels, predict_labels in zip(test_sentences, test_labels, preds):
                tag_labels_true = get_idx_to_tag(labels)
                text_ = text.to_original_text()
                tag_labels_predicted = ' '.join(predict_labels)
                tag_labels_true = ' '.join(tag_labels_true)
                p.write(tag_labels_predicted + '\n')
                t.write(tag_labels_true + '\n')
                textf.write(text_ + '\n')

                # Tags use '_' separators; the BILUO helpers expect '-'.
                tag_labels_true = tag_labels_true.strip().replace('_', '-').split()
                tag_labels_predicted = tag_labels_predicted.strip().replace(
                    '_', '-').split()
                biluo_tags_true = get_biluo(tag_labels_true)
                biluo_tags_predicted = get_biluo(tag_labels_predicted)
                doc = Doc(text_)
                offset_true_labels = offset_from_biluo(doc, biluo_tags_true)
                offset_predicted_labels = offset_from_biluo(doc, biluo_tags_predicted)

                # Group gold entities by (normalised) type.
                ent_labels = dict()
                for start, stop, ent_type in offset_true_labels:
                    ent_type = ent_type.replace('_', '')
                    ne_class_list.add(ent_type)
                    ent_labels.setdefault(ent_type, []).append((start, stop))
                true_labels_for_testing.append(ent_labels)

                # Group predicted entities by (normalised) type.
                ent_labels = dict()
                for start, stop, ent_type in offset_predicted_labels:
                    ent_type = ent_type.replace('_', '')
                    ent_labels.setdefault(ent_type, []).append((start, stop))
                results_of_prediction.append(ent_labels)

    from eval.quality import calculate_prediction_quality
    f1, precision, recall, results = \
        calculate_prediction_quality(true_labels_for_testing,
                                     results_of_prediction,
                                     tuple(ne_class_list))
    print(f1, precision, recall, results)