def test(args, ds, m, epoch='cmdline'):
    """Decode the test set with beam search and save predictions and references."""
    import os
    from tqdm import tqdm
    from efficiency.log import fwrite

    args.vbsz = 1
    m.eval()
    data = ds.mktestset(args)
    preds = []
    golds = []
    for b in tqdm(data):
        b = ds.fixBatch(b)
        # beam search decoding; keep the best finished hypothesis
        gen = m.beam_generate(b, beamsz=4, k=6)
        gen.sort()
        gen = ds.reverse(gen.done[0].words, b.rawent)
        # strip the <start> token from the reference
        gold = ds.reverse(b.tgt[0][1:], b.rawent)
        preds.append(gen.lower())
        golds.append(gold.lower())
    m.train()

    path_pred = os.path.join(args.save, "pred.txt")
    path_gold = os.path.join(args.save, "gold.txt")
    fwrite('\n'.join(preds), path_pred)
    fwrite('\n'.join(golds), path_gold)
    return preds, golds
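# Every snippet in this file writes through `fwrite` from the `efficiency` package.
# For readers without that dependency, here is a minimal stand-in with the same call
# shape used throughout (content first, then path, with optional `mode` and `verbose`
# arguments); the real implementation may differ, so treat this as a sketch.
def fwrite(new_doc, path, mode='w', verbose=False):
    import os
    folder = os.path.dirname(path)
    if folder and not os.path.isdir(folder):
        os.makedirs(folder)  # make sure the target folder exists
    if verbose:
        print('[Info] Writing {} chars to {}'.format(len(new_doc), path))
    with open(path, mode) as f:
        f.write(new_doc)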
def save_to_md(self, category2md_elems):
    from efficiency.log import fwrite

    # join each category's markdown elements into one block
    category2md = {k: '\n'.join(v) for k, v in category2md_elems.items()}
    output = []
    for k, v in category2md.items():
        output.append('## ' + k)
        output.append(v)
    fwrite('\n'.join(output), C.md_style_transfer)
def save_permutations(file_templ='./data/bitid_{:03d}.txt', num_shards=100,
                      shard_size=20000):
    from efficiency.log import fwrite

    all_uids = make_permutations(num_shards=num_shards, shard_size=shard_size)
    for shard_ix in range(num_shards):
        # slice out this shard's block of IDs
        uids = all_uids[shard_ix * shard_size:(shard_ix + 1) * shard_size]
        save_to = file_templ.format(shard_ix)
        fwrite('\n'.join(uids), save_to)
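# `make_permutations` is defined elsewhere; the code above only assumes it returns
# num_shards * shard_size unique ID strings. A hypothetical stand-in for local
# testing (the real generator may use a different alphabet and ID length):
def make_permutations(num_shards=100, shard_size=20000, id_len=7):
    import random
    import string
    chars = string.ascii_letters + string.digits
    uids = set()
    while len(uids) < num_shards * shard_size:
        uids.add(''.join(random.choice(chars) for _ in range(id_len)))
    return sorted(uids)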
def get_init_data():
    import os
    import json
    from efficiency.log import fwrite

    # `save_to` is expected to be defined in the enclosing scope
    data = {}
    if os.path.isfile(save_to):
        with open(save_to) as f:
            content = f.read()
        data = json.loads(content)
        # back up the existing file before it gets overwritten later
        fwrite(content, save_to + '.prev')
        print('[Info] Previous data file exists. Made a backup at ' + save_to + '.prev')
    return data
def main():
    import os
    import json
    from efficiency.log import fwrite

    data = {}
    folder = '/home/ubuntu/proj/1908_clickbait/bitly'
    file_filter = lambda f: f.startswith('bitly_') and f.endswith('.json')
    fm = FileManager(dir=folder, file_filter=file_filter)
    print(json.dumps(fm.files, indent=4))

    # merge all shard files into one dict
    for file in fm.files:
        with open(file) as f:
            content = json.load(f)
        data.update(content)
        show_var(["file", "len(content)", "len(data)",
                  "list(content.keys())[:3]"])

    data = dict(sorted(data.items()))
    fwrite(json.dumps(data, indent=4), os.path.join(folder, 'bitly.json'))
def get_html_n_save(url, file=None):
    import os

    # serve the cached copy if it already exists
    if file is not None and os.path.isfile(file):
        with open(file) as f:
            return f.read()

    import requests
    r = requests.get(url)
    if r.status_code == 200:
        from efficiency.log import fwrite
        html = r.text
        if file is not None:
            fwrite(html, file, verbose=True)
        return html
    else:
        print('[Error] {} for {}'.format(r.status_code, url))
        return None
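# Example use of the cache-or-fetch helper above (hypothetical URL and cache path):
#   html = get_html_n_save('https://example.com', file='data/example.html')  # downloads and saves
#   html = get_html_n_save('https://example.com', file='data/example.html')  # reads the cached copy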
def get_txt(articles, file='articles_native.txt'):
    import random
    from tqdm import tqdm
    from efficiency.nlp import NLP
    from efficiency.log import fwrite

    nlp = NLP()
    text = []
    pbar = tqdm(articles, desc='{} sents to {}'.format(0, file))
    for article in pbar:
        text += article.clean_paper(nlp)
        pbar.set_description('{} sents to {}'.format(len(text), file),
                             refresh=True)

    # deduplicate, then shuffle the sentence order
    text = list(set(text))
    random.shuffle(text)

    if file:
        fwrite('\n'.join(text), file)
    return text
def read_graph(file, ner_alphabet, coref_edge_filt, coref_edge_type, c_word_edge,
               c_sent_edge, c_sent_thres, save_path, terms=[],
               keep_word_ixs='non_eos', keep_case=True, cheat_densify=False):
    word_insts, tag_insts, feat_insts = _read_words_from_file(
        file, keep_case=keep_case, cheat_densify=cheat_densify)

    all_data = []
    doc_n_words = []
    doc_n_sents = []
    sent_len = set()
    word_len = set()
    coref_mats = ''
    coref_val = []
    coref_dens = []

    for doc_i, (para, tag, feat) in enumerate(zip(word_insts, tag_insts, feat_insts)):
        # gather meta data
        para_flat = [word for sent in para for word in sent]
        tag_flat = [t for ta in tag for t in ta]
        feat_flat = [f for fea in feat for f in fea]
        doc_n_words += [len(tag_flat)]
        doc_n_sents += [len(para)]
        sent_len |= set(len(sent) for sent in para)
        word_len |= set(len(word) for word in para_flat)

        # select which word indices may carry coref edges
        if coref_edge_filt == 'ib_tgt':
            keep_word_ixs = [i for i in range(len(para_flat)) if tag_flat[i] not in ["O"]]
        elif coref_edge_filt == 'ib_feat':
            keep_word_ixs = [i for i in range(len(para_flat)) if feat_flat[i] not in ["O"]]
        elif coref_edge_filt == 'term':
            keep_word_ixs = [i for i in range(len(para_flat)) if para_flat[i] in terms]
        else:
            keep_word_ixs = [i for i in range(len(para_flat)) if para_flat[i] != EOS_WORD]

        # gather content data
        doc = {}
        doc["word"] = para
        doc["tag"] = tag
        doc["feat"] = feat_flat
        if c_sent_edge != '':
            _, eos_ixs = _listify_eos(para_flat)
            doc["edge"], coref_mat = _find_sent_edges(
                para, keep_word_ixs, eos_ixs, c_sent_edge, c_sent_thres,
                keep_stopwords_coref=False)
            coref_mats += np.array_str(coref_mat) + '\n'
            coref_val += coref_mat[np.nonzero(coref_mat)].tolist()
            # density of sentence-level coref edges over all sentence pairs
            coref_dens += [len(doc['edge']['coref']) / len(para) ** 2]
            doc["coref_groups"] = []
        else:
            doc["edge"], doc["coref_groups"] = _find_edges(
                para_flat, tag_flat, set(keep_word_ixs), coref_edge_type,
                c_word_edge, keep_stopwords_coref=False,
                coref_edge_filt=coref_edge_filt)
            # density of word-level coref edges, excluding self-loops
            coref_dens.append((len(doc["edge"]["coref"]) - len(para_flat))
                              / (len(para_flat) * (len(para_flat) - 1)))
        doc["id"] = doc_i
        all_data.append(doc)

    if False:  # debugging only: dump the sentence-coref matrices and their histogram
        fwrite(coref_mats, save_path + '_coref_sent.matrix')
        import matplotlib
        matplotlib.use('Agg')
        import matplotlib.pyplot as plt
        bins = list(range(22))
        plt.hist(coref_val, bins=bins, density=True)
        plt.savefig(save_path + '_coref_sent_hist.png')

    meta = {"max_doc_n_words": max(doc_n_words),
            "max_doc_n_sents": max(doc_n_sents),
            "max_word_len": max(word_len),
            "max_sent_len": max(sent_len),
            "graph_types": list(sorted(doc["edge"].keys()))}
    all_data = [meta] + all_data
    all_data_text = [json.dumps(i) + '\n' for i in all_data]
    fwrite(''.join(all_data_text), save_path + '_graph.json')
    print("[Info] Average density of coref: %.2f"
          % (sum(coref_dens) / len(coref_dens)))
    return all_data
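# The JSON-lines file written above starts with the meta record, followed by one
# record per document, roughly along these lines (field values abbreviated):
#   {"max_doc_n_words": ..., "max_doc_n_sents": ..., "max_word_len": ...,
#    "max_sent_len": ..., "graph_types": ["coref", ...]}
#   {"word": [[...sent 1...], [...sent 2...]], "tag": [[...]], "feat": [...],
#    "edge": {"coref": [...], ...}, "coref_groups": [...], "id": 0}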
def save_json():
    import json
    from efficiency.log import fwrite

    # `data` and `save_to` come from the enclosing scope
    fwrite(json.dumps(data, indent=4), save_to)
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    raw_n_html = [
        ('data/03conll.train.c_w_d_dw_ds_sw_word_ibo_dic', 'demo/03conll.train.html'),
        ('data/03conll.valid.c_w_d_dw_ds_sw_word_ibo_dic', 'demo/03conll.valid.html'),
        ('data/sample.c_w_d_dw_ds_sw_word_ibo_dic', 'dana.sample.html'),
        ('data/lstm_encoder_errors.conll', 'demo/lstm_encoder_errors.html'),
        ('data/cnn_encoder_errors.conll', 'demo/cnn_encoder_errors.html'),
        ('data/common_sents_diff_error.conll', 'demo/common_sents_diff_error.html'),
        ('data/common_sents_same_error.conll', 'demo/common_sents_same_error.html'),
    ]

    args = get_args()
    if args.conll:
        # visualize just the one file named on the command line
        raw_n_html = [('data/{}.conll'.format(args.conll),
                       'demo/{}.html'.format(args.conll))]

    for conll_file, html_file in raw_n_html:
        print("[Info] Visualizing {} into {}".format(conll_file, html_file))
        soup = conll2html(conll_file)
        fwrite(soup, html_file)
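# Example invocation (the script name here is hypothetical; --conll selects a
# single data/<name>.conll file instead of the default list above):
#   python conll2html.py --conll lstm_encoder_errors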
def mkVocabs(self, args, save_vocab=False):
    args.path = args.datadir + args.data
    self.INP = data.Field(sequential=True, batch_first=True, init_token="<start>",
                          eos_token="<eos>", include_lengths=True)
    self.OUTP = data.Field(sequential=True, batch_first=True, init_token="<start>",
                           eos_token="<eos>", include_lengths=True)
    self.TGT = data.Field(sequential=True, batch_first=True, init_token="<start>",
                          eos_token="<eos>")
    self.NERD = data.Field(sequential=True, batch_first=True, eos_token="<eos>")
    self.ENT = data.RawField()
    self.REL = data.RawField()
    self.SORDER = data.RawField()
    self.SORDER.is_target = False
    self.REL.is_target = False
    self.ENT.is_target = False
    self.fields = [("src", self.INP), ("ent", self.ENT), ("nerd", self.NERD),
                   ("rel", self.REL), ("out", self.OUTP), ("sorder", self.SORDER)]
    train = data.TabularDataset(path=args.path, format='tsv', fields=self.fields)

    print('building vocab')
    self.OUTP.build_vocab(train, min_freq=args.outunk)
    # add the generic entity-type placeholders to the output vocab
    generics = ['<method>', '<material>', '<otherscientificterm>',
                '<metric>', '<task>']
    self.OUTP.vocab.itos.extend(generics)
    for x in generics:
        self.OUTP.vocab.stoi[x] = self.OUTP.vocab.itos.index(x)

    # the target vocab additionally maps indexed entity tokens such as <method_0>
    self.TGT.vocab = copy(self.OUTP.vocab)
    specials = "method material otherscientificterm metric task".split(" ")
    for x in specials:
        for y in range(40):
            s = "<" + x + "_" + str(y) + ">"
            self.TGT.vocab.stoi[s] = len(self.TGT.vocab.itos) + y

    self.NERD.build_vocab(train, min_freq=0)
    for x in generics:
        self.NERD.vocab.stoi[x] = self.OUTP.vocab.stoi[x]

    self.INP.build_vocab(train, min_freq=args.entunk)

    self.REL.special = ['<pad>', '<unk>', 'ROOT']
    with open(args.datadir + "/" + args.relvocab) as f:
        rvocab = [x.strip() for x in f.readlines()]
    self.REL.size = len(rvocab)
    rvocab += [x + "_inv" for x in rvocab]  # add the inverse relation types
    self.REL.itos = self.REL.special + rvocab

    self.ENT.itos, self.ENT.stoi = self.build_ent_vocab(args.path)

    if save_vocab:
        import os
        import json
        from efficiency.log import fwrite

        folder = '/home/ubuntu/proj/zhijing_g/tmp_intermediate/'
        if not os.path.isdir(folder):
            os.mkdir(folder)
        vocabs = [('outp', {'itos': self.OUTP.vocab.itos, 'stoi': self.OUTP.vocab.stoi}),
                  ('tgt', {'itos': self.TGT.vocab.itos, 'stoi': self.TGT.vocab.stoi}),
                  ('nerd', {'itos': self.NERD.vocab.itos, 'stoi': self.NERD.vocab.stoi}),
                  ('inp', {'itos': self.INP.vocab.itos, 'stoi': self.INP.vocab.stoi}),
                  ('ent', {'itos': self.ENT.itos, 'stoi': self.ENT.stoi}),
                  ('rel', {'itos': self.REL.itos})]
        for name, vocab in vocabs:
            fwrite(json.dumps(vocab), folder + name + '_vocab.json')

    print('done')
    if not self.args.eval:
        self.mkiters(train)
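# The vocab surgery above is easiest to see on a built dataset; a usage sketch,
# assuming `ds` is the instance whose mkVocabs just ran:
#   ds.OUTP.vocab.stoi['<method>']     # generic placeholder, shared with NERD
#   ds.TGT.vocab.stoi['<method_12>']   # indexed variant, present in TGT's stoi only,
#                                      # mapped past the end of the shared itos list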
def main():
    # argument parser
    parser = argparse.ArgumentParser(description='Tuning with DNN Model for NER')
    # model hyperparameters
    parser.add_argument('--mode', choices=['RNN', 'LSTM', 'GRU'], default='LSTM',
                        help='architecture of rnn')
    parser.add_argument('--encoder_mode', choices=['cnn', 'lstm'], default='lstm',
                        help='Encoder type for sentence encoding')
    parser.add_argument('--char_method', choices=['cnn', 'lstm'], required=True,
                        help='Method to create character-level embeddings')
    parser.add_argument('--hidden_size', type=int, default=128,
                        help='Number of hidden units in RNN for sentence level')
    parser.add_argument('--char_hidden_size', type=int, default=30,
                        help='Output character-level embeddings size')
    parser.add_argument('--char_dim', type=int, default=30,
                        help='Dimension of Character embeddings')
    parser.add_argument('--tag_space', type=int, default=0,
                        help='Dimension of tag space')
    parser.add_argument('--num_layers', type=int, default=1,
                        help='Number of layers of RNN')
    parser.add_argument('--dropout', choices=['std', 'gcn'], default='gcn',
                        help='Dropout method')
    parser.add_argument('--p_em', type=float, default=0.33,
                        help='dropout rate for input embeddings')
    parser.add_argument('--p_in', type=float, default=0.33,
                        help='dropout rate for input of RNN model')
    parser.add_argument('--p_rnn', nargs=3, type=float, required=True,
                        help='dropout rate for RNN')
    parser.add_argument('--p_tag', type=float, default=0.33,
                        help='dropout rate for output layer')
    parser.add_argument('--bigram', action='store_true',
                        help='bi-gram parameter for CRF')
    parser.add_argument('--adj_attn', choices=['cossim', 'flex_cossim',
                                               'flex_cossim2', 'concat', '',
                                               'multihead'], default='')

    # data loading and storing params
    parser.add_argument('--embedding_dict', help='path for embedding dict')
    parser.add_argument('--dataset_name', type=str, default='alexa',
                        help='Which dataset to use')
    parser.add_argument('--train', type=str, required=True, help='Path of train set')
    parser.add_argument('--dev', type=str, required=True, help='Path of dev set')
    parser.add_argument('--test', type=str, required=True, help='Path of test set')
    parser.add_argument('--results_folder', type=str, default='results',
                        help='The folder to store results')
    parser.add_argument('--alphabets_folder', type=str, default='data/alphabets',
                        help='The folder to store alphabets files')

    # training parameters
    parser.add_argument('--cuda', action='store_true', help='whether using GPU')
    parser.add_argument('--num_epochs', type=int, default=100,
                        help='Number of training epochs')
    parser.add_argument('--batch_size', type=int, default=16,
                        help='Number of sentences in each batch')
    parser.add_argument('--learning_rate', type=float, default=0.001,
                        help='Base learning rate')
    parser.add_argument('--decay_rate', type=float, default=0.95,
                        help='Decay rate of learning rate')
    parser.add_argument('--schedule', type=int, default=3,
                        help='schedule for learning rate decay')
    parser.add_argument('--gamma', type=float, default=0.0,
                        help='weight for l2 regularization')
    parser.add_argument('--max_norm', type=float, default=1.,
                        help='Max norm for gradients')
    parser.add_argument('--gpu_id', type=int, nargs='+', required=True,
                        help='which gpu to use for training')
    parser.add_argument('--learning_rate_gcn', type=float, default=5e-4,
                        help='Base learning rate for the GCN')
    parser.add_argument('--gcn_warmup', type=int, default=200,
                        help='Warm-up steps for the GCN learning rate')
    parser.add_argument('--pretrain_lstm', type=float, default=10,
                        help='Number of epochs to pretrain the LSTM before the GCN')
    parser.add_argument('--adj_loss_lambda', type=float, default=0.)
    parser.add_argument('--lambda1', type=float, default=1.)
    parser.add_argument('--lambda2', type=float, default=0.)
    parser.add_argument('--seed', type=int, default=None)

    # misc
    parser.add_argument('--embedding', choices=['glove', 'senna', 'alexa'],
                        required=True, help='Embedding for words')
    parser.add_argument('--restore', action='store_true',
                        help='whether restore from stored parameters')
    parser.add_argument('--save_checkpoint', type=str, default='',
                        help='the path to save the model')
    parser.add_argument('--o_tag', type=str, default='O',
                        help='The default tag for outside tag')
    parser.add_argument('--unk_replace', type=float, default=0.,
                        help='The rate to replace a singleton word with UNK')
    parser.add_argument('--evaluate_raw_format', action='store_true',
                        help='The tagging format for evaluation')
    parser.add_argument('--eval_type', type=str, default="micro_f1",
                        choices=['micro_f1', 'acc'])
    parser.add_argument('--show_network', action='store_true',
                        help='whether to display the network structure')
    parser.add_argument('--smooth', action='store_true',
                        help='whether to skip all pdb break points')
    parser.add_argument('--uid', type=str, default='temp')
    parser.add_argument('--misc', type=str, default='')

    args = parser.parse_args()
    show_var(['args'])

    uid = args.uid
    results_folder = args.results_folder
    dataset_name = args.dataset_name
    use_tensorboard = True
    save_dset_dir = '{}../dset/{}/graph'.format(results_folder, dataset_name)
    result_file_path = '{}/{dataset}_{uid}_result'.format(
        results_folder, dataset=dataset_name, uid=uid)
    save_loss_path = '{}/{dataset}_{uid}_loss'.format(
        results_folder, dataset=dataset_name, uid=uid)
    save_lr_path = '{}/{dataset}_{uid}_lr'.format(
        results_folder, dataset=dataset_name, uid='temp')
    save_tb_path = '{}/tensorboard/'.format(results_folder)

    logger = get_logger("NERCRF")
    loss_recorder = LossRecorder(uid=uid)
    record = TensorboardLossRecord(use_tensorboard, save_tb_path, uid=uid)

    # rename the parameters
    mode = args.mode
    encoder_mode = args.encoder_mode
    train_path = args.train
    dev_path = args.dev
    test_path = args.test
    num_epochs = args.num_epochs
    batch_size = args.batch_size
    hidden_size = args.hidden_size
    char_hidden_size = args.char_hidden_size
    char_method = args.char_method
    learning_rate = args.learning_rate
    momentum = 0.9
    decay_rate = args.decay_rate
    gamma = args.gamma
    max_norm = args.max_norm
    schedule = args.schedule
    dropout = args.dropout
    p_em = args.p_em
    p_rnn = tuple(args.p_rnn)
    p_in = args.p_in
    p_tag = args.p_tag
    unk_replace = args.unk_replace
    bigram = args.bigram
    embedding = args.embedding
    embedding_path = args.embedding_dict
    evaluate_raw_format = args.evaluate_raw_format
    o_tag = args.o_tag
    restore = args.restore
    save_checkpoint = args.save_checkpoint
    alphabets_folder = args.alphabets_folder
    use_elmo = False
    p_em_vec = 0.
    graph_model = 'gnn'
    coref_edge_filt = ''
    learning_rate_gcn = args.learning_rate_gcn
    gcn_warmup = args.gcn_warmup
    pretrain_lstm = args.pretrain_lstm
    adj_loss_lambda = args.adj_loss_lambda
    lambda1 = args.lambda1
    lambda2 = args.lambda2

    if args.smooth:
        # disable all pdb break points
        import pdb
        pdb.set_trace = lambda: None

    misc = "{}".format(str(args.misc))
    score_file = "{}/{dataset}_{uid}_score".format(
        results_folder, dataset=dataset_name, uid=uid)

    for folder in [results_folder, alphabets_folder, save_dset_dir]:
        if not os.path.exists(folder):
            os.makedirs(folder)

    def set_seed(seed):
        if not seed:
            seed = int(show_time())
        print("[Info] seed set to: {}".format(seed))
        random.seed(seed)
        np.random.seed(seed)
        torch.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
        torch.backends.cudnn.deterministic = True

    set_seed(args.seed)

    embedd_dict, embedd_dim = utils.load_embedding_dict(embedding, embedding_path)

    logger.info("Creating Alphabets")
    word_alphabet, char_alphabet, ner_alphabet = conll03_data.create_alphabets(
        "{}/{}/".format(alphabets_folder, dataset_name), train_path,
        data_paths=[dev_path, test_path], embedd_dict=embedd_dict,
        max_vocabulary_size=50000)

    logger.info("Word Alphabet Size: %d" % word_alphabet.size())
    logger.info("Character Alphabet Size: %d" % char_alphabet.size())
    logger.info("NER Alphabet Size: %d" % ner_alphabet.size())

    logger.info("Reading Data")
    device = torch.device('cuda') if args.cuda else torch.device('cpu')
    print(device)

    data_train = conll03_data.read_data(
        train_path, word_alphabet, char_alphabet, ner_alphabet, graph_model,
        batch_size, ori_order=False, total_batch="{}x".format(num_epochs + 1),
        unk_replace=unk_replace, device=device, save_path=save_dset_dir + '/train',
        coref_edge_filt=coref_edge_filt)
    num_data = data_train.data_len
    num_labels = ner_alphabet.size()
    graph_types = data_train.meta_info['graph_types']

    data_dev = conll03_data.read_data(
        dev_path, word_alphabet, char_alphabet, ner_alphabet, graph_model,
        batch_size, ori_order=True, unk_replace=unk_replace, device=device,
        save_path=save_dset_dir + '/dev', coref_edge_filt=coref_edge_filt)
    data_test = conll03_data.read_data(
        test_path, word_alphabet, char_alphabet, ner_alphabet, graph_model,
        batch_size, ori_order=True, unk_replace=unk_replace, device=device,
        save_path=save_dset_dir + '/test', coref_edge_filt=coref_edge_filt)

    writer = CoNLL03Writer(word_alphabet, char_alphabet, ner_alphabet)

    def construct_word_embedding_table():
        scale = np.sqrt(3.0 / embedd_dim)
        table = np.empty([word_alphabet.size(), embedd_dim], dtype=np.float32)
        table[conll03_data.UNK_ID, :] = np.random.uniform(
            -scale, scale, [1, embedd_dim]).astype(np.float32)
        oov = 0
        for word, index in word_alphabet.items():
            if word in embedd_dict:
                embedding = embedd_dict[word]
            elif word.lower() in embedd_dict:
                embedding = embedd_dict[word.lower()]
            else:
                # fall back to a random vector for out-of-vocabulary words
                embedding = np.random.uniform(
                    -scale, scale, [1, embedd_dim]).astype(np.float32)
                oov += 1
            table[index, :] = embedding
        print('oov: %d' % oov)
        return torch.from_numpy(table)

    word_table = construct_word_embedding_table()
    logger.info("constructing network...")

    char_dim = args.char_dim
    window = 3
    num_layers = args.num_layers
    tag_space = args.tag_space
    initializer = nn.init.xavier_uniform_
    p_gcn = [0.5, 0.5]
    d_graph = 256
    d_out = 256
    d_inner_hid = 128
    d_k = 32
    d_v = 32
    n_head = 4
    n_gcn_layer = 1
    p_rnn2 = [0.0, 0.5, 0.5]
    adj_attn = args.adj_attn
    mask_singles = True
    post_lstm = 1
    position_enc_mode = 'none'
    adj_memory = False

    if dropout == 'gcn':
        network = BiRecurrentConvGraphCRF(
            embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(),
            char_hidden_size, window, mode, encoder_mode, hidden_size, num_layers,
            num_labels, graph_model, n_head, d_graph, d_inner_hid, d_k, d_v, p_gcn,
            n_gcn_layer, d_out, post_lstm=post_lstm, mask_singles=mask_singles,
            position_enc_mode=position_enc_mode, adj_attn=adj_attn,
            adj_loss_lambda=adj_loss_lambda, tag_space=tag_space,
            embedd_word=word_table, use_elmo=use_elmo, p_em_vec=p_em_vec,
            p_em=p_em, p_in=p_in, p_tag=p_tag, p_rnn=p_rnn, p_rnn2=p_rnn2,
            bigram=bigram, initializer=initializer)
    elif dropout == 'std':
        network = BiRecurrentConvCRF(
            embedd_dim, word_alphabet.size(), char_dim, char_alphabet.size(),
            char_hidden_size, window, mode, encoder_mode, hidden_size, num_layers,
            num_labels, tag_space=tag_space, embedd_word=word_table,
            use_elmo=use_elmo, p_em_vec=p_em_vec, p_em=p_em, p_in=p_in,
            p_tag=p_tag, p_rnn=p_rnn, bigram=bigram, initializer=initializer)

    # restore from a trained model if requested
    if restore:
        network.load_state_dict(torch.load(save_checkpoint + '_best.pth'))

    logger.info("cuda()ing network...")
    network = network.to(device)

    if dataset_name == 'conll03' and data_dev.data_len > 26:
        sample = data_dev.pad_batch(data_dev.dataset[25:26])
    else:
        sample = data_dev.pad_batch(data_dev.dataset[:1])
    plot_att_change(sample, network, record, save_tb_path + 'att/', uid='temp',
                    epoch=0, device=device, word_alphabet=word_alphabet,
                    show_net=args.show_network,
                    graph_types=data_train.meta_info['graph_types'])
    logger.info("finished cuda()ing network...")

    optim = Optimizer('sgd', 'adam', network, dropout, lr=learning_rate,
                      lr_gcn=learning_rate_gcn, wd=0., wd_gcn=0., momentum=momentum,
                      lr_decay=decay_rate, schedule=schedule, gcn_warmup=gcn_warmup,
                      pretrain_lstm=pretrain_lstm)

    logger.info(
        "Network: %s, encoder_mode=%s, num_layer=%d, hidden=%d, char_hidden_size=%d, "
        "char_method=%s, tag_space=%d, crf=%s" %
        (mode, encoder_mode, num_layers, hidden_size, char_hidden_size, char_method,
         tag_space, 'bigram' if bigram else 'unigram'))
    logger.info("training: l2: %f, (#training data: %d, batch: %d, unk replace: %.2f)" % (
        gamma, num_data, batch_size, unk_replace))
    logger.info("dropout(in, out, rnn): (%.2f, %.2f, %s)" % (p_in, p_tag, p_rnn))

    num_batches = num_data // batch_size + 1
    dev_f1 = 0.0
    dev_acc = 0.0
    dev_precision = 0.0
    dev_recall = 0.0
    test_f1 = 0.0
    test_acc = 0.0
    test_precision = 0.0
    test_recall = 0.0
    best_epoch = 0
    best_test_f1 = 0.0
    best_test_acc = 0.0
    best_test_precision = 0.0
    best_test_recall = 0.0
    best_test_epoch = 0.0

    loss_recorder.start(save_loss_path, mode='w', misc=misc)
    fwrite('', save_lr_path)
    fwrite(json.dumps(vars(args)) + '\n', result_file_path)

    for epoch in range(1, num_epochs + 1):
        show_var(['misc'])
        lr_state = 'Epoch %d (uid=%s, lr=%.2E, lr_gcn=%.2E, decay rate=%.4f): ' % (
            epoch, uid, Decimal(optim.curr_lr), Decimal(optim.curr_lr_gcn),
            decay_rate)
        print(lr_state)
        fwrite(lr_state[:-2] + '\n', save_lr_path, mode='a')

        train_err = 0.
        train_err2 = 0.
        train_total = 0.
        start_time = time.time()
        num_back = 0
        network.train()

        for batch_i in range(1, num_batches + 1):
            batch_doc = data_train.next()
            char, word, posi, labels, feats, adjs, words_en = [batch_doc[i] for i in [
                "chars", "word_ids", "posi", "ner_ids", "feat_ids", "adjs", "words_en"]]
            sent_word, sent_char, sent_labels, sent_mask, sent_length, _ = \
                network._doc2sent(word, char, labels)

            optim.zero_grad()
            adjs_into_model = adjs if adj_memory else adjs.clone()
            loss, (ner_loss, adj_loss) = network.loss(
                None, word, char, adjs_into_model, labels, graph_types=graph_types,
                lambda1=lambda1, lambda2=lambda2)
            loss.backward()
            # clip gradients right before each update, not once at setup
            nn.utils.clip_grad_norm_(network.parameters(), max_norm)
            optim.step()

            with torch.no_grad():
                num_inst = sent_mask.size(0)
                train_err += ner_loss * num_inst
                train_err2 += adj_loss * num_inst
                train_total += num_inst

            time_ave = (time.time() - start_time) / batch_i
            time_left = (num_batches - batch_i) * time_ave

            # update log
            if batch_i % 20 == 0:
                sys.stdout.write("\b" * num_back)
                sys.stdout.write(" " * num_back)
                sys.stdout.write("\b" * num_back)
                log_info = 'train: %d/%d loss1: %.4f, loss2: %.4f, time left (estimated): %.2fs' % (
                    batch_i, num_batches, train_err / train_total,
                    train_err2 / train_total, time_left)
                sys.stdout.write(log_info)
                sys.stdout.flush()
                num_back = len(log_info)

            optim.update(epoch, batch_i, num_batches, network)

        sys.stdout.write("\b" * num_back)
        sys.stdout.write(" " * num_back)
        sys.stdout.write("\b" * num_back)
        print('train: %d loss: %.4f, loss2: %.4f, time: %.2fs' % (
            num_batches, train_err / train_total, train_err2 / train_total,
            time.time() - start_time))

        # evaluate performance on dev data
        with torch.no_grad():
            network.eval()
            tmp_filename = "{}/{dataset}_{uid}_output_dev".format(
                results_folder, dataset=dataset_name, uid=uid)
            writer.start(tmp_filename)

            for batch in data_dev:
                char, word, posi, labels, feats, adjs, words_en = [batch[i] for i in [
                    "chars", "word_ids", "posi", "ner_ids", "feat_ids", "adjs", "words_en"]]
                sent_word, sent_char, sent_labels, sent_mask, sent_length, _ = \
                    network._doc2sent(word, char, labels)
                preds, _ = network.decode(
                    None, word, char, adjs.clone(), target=labels,
                    leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS,
                    graph_types=graph_types)
                writer.write(sent_word.cpu().numpy(), preds.cpu().numpy(),
                             sent_labels.cpu().numpy(), sent_length.cpu().numpy())
            writer.close()

            if args.eval_type == "acc":
                acc, precision, recall, f1 = evaluate_tokenacc(tmp_filename)
                f1 = acc
            else:
                acc, precision, recall, f1 = evaluate(tmp_filename, score_file,
                                                      evaluate_raw_format, o_tag)
            print('dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%%' % (
                acc, precision, recall, f1))

            # plot loss and attention
            record.plot_loss(epoch, train_err / train_total, f1)
            plot_att_change(sample, network, record, save_tb_path + 'att/',
                            uid="{}_{:03d}".format(uid, epoch), epoch=epoch,
                            device=device, word_alphabet=word_alphabet,
                            show_net=False, graph_types=graph_types)

            if dev_f1 < f1:
                dev_f1 = f1
                dev_acc = acc
                dev_precision = precision
                dev_recall = recall
                best_epoch = epoch

                # evaluate on test data when better dev performance is detected
                tmp_filename = "{}/{dataset}_{uid}_output_test".format(
                    results_folder, dataset=dataset_name, uid=uid)
                writer.start(tmp_filename)
                for batch in data_test:
                    char, word, posi, labels, feats, adjs, words_en = [batch[i] for i in [
                        "chars", "word_ids", "posi", "ner_ids", "feat_ids", "adjs", "words_en"]]
                    sent_word, sent_char, sent_labels, sent_mask, sent_length, _ = \
                        network._doc2sent(word, char, labels)
                    preds, _ = network.decode(
                        None, word, char, adjs.clone(), target=labels,
                        leading_symbolic=conll03_data.NUM_SYMBOLIC_TAGS,
                        graph_types=graph_types)
                    writer.write(sent_word.cpu().numpy(), preds.cpu().numpy(),
                                 sent_labels.cpu().numpy(), sent_length.cpu().numpy())
                writer.close()

                if args.eval_type == "acc":
                    test_acc, test_precision, test_recall, test_f1 = \
                        evaluate_tokenacc(tmp_filename)
                    test_f1 = test_acc
                else:
                    test_acc, test_precision, test_recall, test_f1 = evaluate(
                        tmp_filename, score_file, evaluate_raw_format, o_tag)

                if best_test_f1 < test_f1:
                    best_test_acc, best_test_precision, best_test_recall, best_test_f1 = \
                        test_acc, test_precision, test_recall, test_f1
                    best_test_epoch = epoch

                # save the model parameters
                if save_checkpoint:
                    torch.save(network.state_dict(), save_checkpoint + '_best.pth')

        print("best dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)" % (
            dev_acc, dev_precision, dev_recall, dev_f1, best_epoch))
        print("best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)" % (
            test_acc, test_precision, test_recall, test_f1, best_epoch))
        print("overall best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)" % (
            best_test_acc, best_test_precision, best_test_recall, best_test_f1,
            best_test_epoch))

        loss_recorder.write(epoch, train_err / train_total, train_err2 / train_total,
                            Decimal(optim.curr_lr), Decimal(optim.curr_lr_gcn),
                            f1, best_test_f1, test_f1)
        with open(result_file_path, 'a') as ofile:
            ofile.write("best dev acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)\n" % (
                dev_acc, dev_precision, dev_recall, dev_f1, best_epoch))
            ofile.write("best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)\n" % (
                test_acc, test_precision, test_recall, test_f1, best_epoch))
            ofile.write("overall best test acc: %.2f%%, precision: %.2f%%, recall: %.2f%%, F1: %.2f%% (epoch: %d)\n\n" % (
                best_test_acc, best_test_precision, best_test_recall, best_test_f1,
                best_test_epoch))

    record.close()
    print('Training finished!')
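# Example invocation (hypothetical script name and paths; only flags defined by the
# parser above are used, and --char_method, --p_rnn, --train/--dev/--test, --gpu_id
# and --embedding are the required ones):
#   python train_ner.py --char_method lstm --p_rnn 0.33 0.33 0.5 \
#       --train data/conll03/train.txt --dev data/conll03/dev.txt \
#       --test data/conll03/test.txt --gpu_id 0 --embedding glove \
#       --embedding_dict data/glove.6B.100d.txt --cuda --dataset_name conll03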
def __init__(self, file):
    self.file = file
    self.content = []
    # create (or truncate) the log file with the current, empty content
    fwrite(self._text(), file)
def print(self, *expressions):
    expression = ' '.join(str(e) for e in expressions)
    print(expression)
    self.content += [expression]
    # append only the new line; appending self._text() here would duplicate
    # everything logged so far on every call
    fwrite(expression + '\n', self.file, mode='a')
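# `_text` is not shown in this file; the two methods above only assume it renders
# the accumulated content as one string. A minimal sketch under that assumption:
def _text(self):
    return '\n'.join(self.content) + '\n' if self.content else ''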
def save_ids(self, file):
    from efficiency.log import fwrite

    pmids = [a.data['pmid'] for a in self.articles]
    fwrite('\n'.join(pmids), file)