def create_model(self, sess, config):
    text_cnn = TextCNN(config)
    saver = tf.train.Saver()
    if os.path.exists(FLAGS.ckpt_dir + "checkpoint"):
        print("Restoring Variables from Checkpoint.")
        saver.restore(sess, tf.train.latest_checkpoint(FLAGS.ckpt_dir))
        if FLAGS.decay_lr_flag:
            for i in range(2):  # decay learning rate if necessary.
                print(i, "Going to decay learning rate by half.")
                sess.run(text_cnn.learning_rate_decay_half_op)
    else:
        print('Initializing Variables')
        sess.run(tf.global_variables_initializer())
        if not os.path.exists(FLAGS.ckpt_dir):
            os.makedirs(FLAGS.ckpt_dir)
        if FLAGS.use_pretrained_embedding:  # load pretrained word embeddings
            print("===>>>going to use pretrained word embeddings...")
            old_emb_matrix = sess.run(text_cnn.Embedding.read_value())
            new_emb_matrix = load_word_embedding(old_emb_matrix, FLAGS.word2vec_model_path,
                                                 FLAGS.embed_size, self.index_to_word)
            word_embedding = tf.constant(new_emb_matrix, dtype=tf.float32)  # convert to a tensor
            t_assign_embedding = tf.assign(text_cnn.Embedding, word_embedding)  # assign word_embedding to text_cnn.Embedding
            sess.run(t_assign_embedding)
            print("using pre-trained word embedding. ended...")
    return text_cnn, saver
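A hypothetical call pattern (not part of the original file), assuming create_model is a method of a trainer object and a TF1-style session, shown only to make the return values explicit:

# with tf.Session() as sess:
#     text_cnn, saver = trainer.create_model(sess, config)  # restores from checkpoint or initializes variables
#     ...                                                    # training loop; periodically (hypothetical call):
#     saver.save(sess, FLAGS.ckpt_dir + "model.ckpt", global_step=step)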
test_file = '../data/SICK/SICK_test_annotated.txt'
train = utils.load_SICK(train_file)
dev = utils.load_SICK(dev_file)
test = utils.load_SICK(test_file)

# get emb data
data = train + dev + test
sentences = []
for x in data:
    sentences.append(x[0])
    sentences.append(x[1])
idf_weight = utils.idf_calculator(sentences)
w2i = {w: i for i, w in enumerate(idf_weight.keys())}
emb = utils.load_word_embedding(w2i, utils.my_emb_rep['paragram300'], 300)

# params
params = utils.Params()
params.memsize = 50
params.minval = 0
params.maxval = 5
params.nout = params.maxval - params.minval + 1
params.LW = 1e-03
params.LC = 1e-05
params.learner = lasagne.updates.adam
params.batchsize = 50
params.dim = 300
params.eta = 0.01
params.clip = None
# params.hid_size = 300
comp_filename = 'dataset/all.bin'
train_filename = 'dataset/train.bin'
test_filename = 'dataset/test.bin'
dev_filename = 'dataset/dev.bin'
embedding_filename = 'dataset/word_embedding.txt'
sem_embed_filename = 'dataset/sememe_vector.txt'
logdir_name = 'phrase_sim/SCMSA'

# load HowNet and split hownet.comp into train/test/dev sets
hownet = utils.Hownet(hownet_file=hownet_filename, comp_file=comp_filename)
hownet.build_hownet()
hownet.token2id()
hownet.load_split_dataset(train_filename=train_filename, test_filename=test_filename, dev_filename=dev_filename)
word_embedding_np, hownet = utils.load_word_embedding(embedding_filename, hownet, scale=False)  # load word embedding
sememe_embedding_np = utils.load_sememe_embedding(sem_embed_filename, hownet, scale=True)  # load sememe embedding
hownet, wordsim_words = utils.fliter_wordsim_all(hownet)  # remove MWEs in the test set
train_num = len(hownet.comp_train)
pos_dict, word_remove = utils.load_hownet_pos()
hownet, cls_dict = utils.divide_data_with_pos(pos_dict, hownet)
print("number of examples in training set: {}".format(len(hownet.comp_train)))
print("number of examples in test set: {}".format(len(hownet.comp_test)))
print("number of examples in dev set: {}".format(len(hownet.comp_dev)))

if not os.path.exists(logdir_name):
    os.makedirs(logdir_name)
    os.makedirs(os.path.join(logdir_name, 'print_files'))
def __init__(self, embedding_dim=100, batch_size=64, n_hidden=100, learning_rate=0.01,
             n_class=3, max_sentence_len=50, l2_reg=0., display_step=4, n_iter=100, type_=''):
    self.embedding_dim = embedding_dim
    self.batch_size = batch_size
    self.n_hidden = n_hidden
    self.learning_rate = learning_rate
    self.n_class = n_class
    self.max_sentence_len = max_sentence_len
    self.l2_reg = l2_reg
    self.display_step = display_step
    self.n_iter = n_iter
    self.type_ = type_

    self.word_id_mapping, self.w2v = load_word_embedding(
        FLAGS.word_id_file_path, FLAGS.embedding_file_path, self.embedding_dim)
    # self.word_embedding = tf.constant(self.w2v, dtype=tf.float32, name='word_embedding')
    self.word_embedding = tf.Variable(self.w2v, dtype=tf.float32, name='word_embedding')
    # self.word_id_mapping = load_word_id_mapping(FLAGS.word_id_file_path)
    # self.word_embedding = tf.Variable(
    #     tf.random_uniform([len(self.word_id_mapping), self.embedding_dim], -0.1, 0.1), name='word_embedding')
    self.aspect_id_mapping, self.aspect_embed = load_aspect2id(
        FLAGS.aspect_id_file_path, self.word_id_mapping, self.w2v, self.embedding_dim)
    self.aspect_embedding = tf.Variable(self.aspect_embed, dtype=tf.float32, name='aspect_embedding')

    self.keep_prob1 = tf.placeholder(tf.float32)
    self.keep_prob2 = tf.placeholder(tf.float32)

    with tf.name_scope('inputs'):
        self.x = tf.placeholder(tf.int32, [None, self.max_sentence_len], name='x')
        self.y = tf.placeholder(tf.int32, [None, self.n_class], name='y')
        self.sen_len = tf.placeholder(tf.int32, None, name='sen_len')
        self.aspect_id = tf.placeholder(tf.int32, None, name='aspect_id')

    with tf.name_scope('weights'):
        self.weights = {
            'softmax': tf.get_variable(
                name='softmax_w',
                shape=[self.n_hidden, self.n_class],
                initializer=tf.random_uniform_initializer(-0.01, 0.01),
                regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg))
        }

    with tf.name_scope('biases'):
        self.biases = {
            'softmax': tf.get_variable(
                name='softmax_b',
                shape=[self.n_class],
                initializer=tf.random_uniform_initializer(-0.01, 0.01),
                regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg))
        }

    self.W = tf.get_variable(
        name='W',
        shape=[self.n_hidden + self.embedding_dim, self.n_hidden + self.embedding_dim],
        initializer=tf.random_uniform_initializer(-0.01, 0.01),
        regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg))
    self.w = tf.get_variable(
        name='w',
        shape=[self.n_hidden + self.embedding_dim, 1],
        initializer=tf.random_uniform_initializer(-0.01, 0.01),
        regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg))
    self.Wp = tf.get_variable(
        name='Wp',
        shape=[self.n_hidden, self.n_hidden],
        initializer=tf.random_uniform_initializer(-0.01, 0.01),
        regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg))
    self.Wx = tf.get_variable(
        name='Wx',
        shape=[self.n_hidden, self.n_hidden],
        initializer=tf.random_uniform_initializer(-0.01, 0.01),
        regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg))
def main():
    parser = options.get_parser('Trainer')
    options.add_dataset_args(parser)
    options.add_preprocessing_args(parser)
    options.add_model_args(parser)
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)

    args = parser.parse_args()
    print(args)

    args.cuda = not args.disable_cuda and torch.cuda.is_available()

    # checkpoint
    checkpoint_dir = os.path.dirname(args.checkpoint)
    if not os.path.isdir(checkpoint_dir):
        os.mkdir(checkpoint_dir)

    # load dataset
    train_raw_corpus, val_raw_corpus, test_raw_corpus = utils.load_corpus(args.processed_dir)
    assert train_raw_corpus and val_raw_corpus and test_raw_corpus, 'Corpus not found, please run preprocess.py to obtain corpus!'
    train_corpus = [(line.sent, line.type, line.p1, line.p2) for line in train_raw_corpus]
    val_corpus = [(line.sent, line.type, line.p1, line.p2) for line in val_raw_corpus]
    test_corpus = [(line.sent, line.type, line.p1, line.p2) for line in test_raw_corpus]

    start_epoch = 0
    caseless = args.caseless
    batch_size = args.batch_size
    num_epoch = args.num_epoch

    # preprocessing
    sents = [tup[0] for tup in train_corpus + val_corpus]
    feature_map = utils.build_vocab(sents, min_count=args.min_count, caseless=caseless)
    # target_map = {c: i for i, c in enumerate(['null', 'true'])}
    target_map = ddi2013.target_map
    train_features, train_targets = utils.build_corpus(train_corpus, feature_map, target_map, caseless)
    val_features, val_targets = utils.build_corpus(val_corpus, feature_map, target_map, caseless)
    test_features, test_targets = utils.build_corpus(test_corpus, feature_map, target_map, caseless)

    class_weights = torch.Tensor(utils.get_class_weights(train_targets)) if args.class_weight else None
    train_loader = utils.construct_bucket_dataloader(train_features, train_targets, feature_map['PAD'],
                                                     batch_size, args.position_bound, is_train=True)
    val_loader = utils.construct_bucket_dataloader(val_features, val_targets, feature_map['PAD'],
                                                   batch_size, args.position_bound, is_train=False)
    test_loader = utils.construct_bucket_dataloader(test_features, test_targets, feature_map['PAD'],
                                                    batch_size, args.position_bound, is_train=False)
    print('Preprocessing done! Vocab size: {}'.format(len(feature_map)))

    # build model
    vocab_size = len(feature_map)
    tagset_size = len(target_map)
    model = utils.build_model(args, vocab_size, tagset_size)

    # loss
    criterion = utils.build_loss(args, class_weights=class_weights)

    # load states
    if os.path.isfile(args.load_checkpoint):
        print('Loading checkpoint file from {}...'.format(args.load_checkpoint))
        checkpoint_file = torch.load(args.load_checkpoint)
        start_epoch = checkpoint_file['epoch'] + 1
        model.load_state_dict(checkpoint_file['state_dict'])
        # optimizer.load_state_dict(checkpoint_file['optimizer'])
    else:
        print('no checkpoint file found: {}, train from scratch...'.format(args.load_checkpoint))
        if not args.rand_embedding:
            pretrained_word_embedding, in_doc_word_indices = utils.load_word_embedding(args.emb_file, feature_map, args.embedding_dim)
            print(pretrained_word_embedding.size())
            print(vocab_size)
            model.load_pretrained_embedding(pretrained_word_embedding)
            if args.disable_fine_tune:
                model.update_part_embedding(in_doc_word_indices)  # update only non-pretrained words
        model.rand_init(init_embedding=args.rand_embedding)

    # trainer
    trainer = SeqTrainer(args, model, criterion)

    if os.path.isfile(args.load_checkpoint):
        dev_prec, dev_rec, dev_f1, _ = evaluate(trainer, val_loader, target_map, cuda=args.cuda)
        test_prec, test_rec, test_f1, _ = evaluate(trainer, test_loader, target_map, cuda=args.cuda)
        print('checkpoint dev_prec: {:.4f}, dev_rec: {:.4f}, dev_f1: {:.4f}, test_prec: {:.4f}, test_rec: {:.4f}, test_f1: {:.4f}'.format(
            dev_prec, dev_rec, dev_f1, test_prec, test_rec, test_f1))

    track_list = []
    best_f1 = float('-inf')
    patience_count = 0
    start_time = time.time()

    for epoch in range(start_epoch, num_epoch):
        epoch_loss = train(train_loader, trainer, epoch)

        # update lr
        trainer.lr_step()

        dev_prec, dev_rec, dev_f1, dev_loss = evaluate(trainer, val_loader, target_map, cuda=args.cuda)
        if dev_f1 >= best_f1:
            patience_count = 0
            best_f1 = dev_f1

            test_prec, test_rec, test_f1, _ = evaluate(trainer, test_loader, target_map, cuda=args.cuda)

            track_list.append({
                'epoch': epoch, 'loss': epoch_loss,
                'dev_prec': dev_prec, 'dev_rec': dev_rec, 'dev_f1': dev_f1, 'dev_loss': dev_loss,
                'test_prec': test_prec, 'test_rec': test_rec, 'test_f1': test_f1})
            print('epoch: {}, loss: {:.4f}, dev_f1: {:.4f}, dev_loss: {:.4f}, test_f1: {:.4f}\tsaving...'.format(
                epoch, epoch_loss, dev_f1, dev_loss, test_f1))

            try:
                utils.save_checkpoint({
                    'epoch': epoch,
                    'state_dict': model.state_dict(),
                    'optimizer': trainer.optimizer.state_dict(),
                    'f_map': feature_map,
                    't_map': target_map,
                }, {'track_list': track_list,
                    'args': vars(args)
                    }, args.checkpoint + '_lstm')
            except Exception as inst:
                print(inst)
        else:
            patience_count += 1
            track_list.append({
                'epoch': epoch, 'loss': epoch_loss,
                'dev_prec': dev_prec, 'dev_rec': dev_rec, 'dev_f1': dev_f1, 'dev_loss': dev_loss})
            print('epoch: {}, loss: {:.4f}, dev_f1: {:.4f}, dev_loss: {:.4f}'.format(epoch, epoch_loss, dev_f1, dev_loss))

        print('epoch: {} in {} take: {} s'.format(epoch, args.num_epoch, time.time() - start_time))

        if patience_count >= args.patience:
            break
def __init__(self, embedding_dim=100, batch_size=64, n_hidden=100, learning_rate=0.01,
             n_class=3, max_sentence_len=50, l2_reg=0., display_step=4, n_iter=100, type_=''):
    self.embedding_dim = embedding_dim          # 300
    self.batch_size = batch_size                # 25
    self.n_hidden = n_hidden                    # 300
    self.learning_rate = learning_rate          # 0.01
    self.n_class = n_class                      # 3
    self.max_sentence_len = max_sentence_len    # 80
    self.l2_reg = l2_reg                        # 0.001
    self.display_step = display_step            # 4
    self.n_iter = n_iter                        # 20
    self.type_ = type_                          # AT

    self.word_id_mapping, self.w2v = load_word_embedding(
        FLAGS.word_id_file_path, FLAGS.embedding_file_path, self.embedding_dim)
    # dict(3909), 3910 * 300; arguments: word->id file path, embedding file path, embedding dim (300)
    # self.word_embedding = tf.constant(self.w2v, dtype=tf.float32, name='word_embedding')
    self.word_embedding = tf.Variable(
        self.w2v, dtype=tf.float32, name='word_embedding')  # define the word_embedding variable
    # self.word_id_mapping = load_word_id_mapping(FLAGS.word_id_file_path)
    # self.word_embedding = tf.Variable(
    #     tf.random_uniform([len(self.word_id_mapping), self.embedding_dim], -0.1, 0.1), name='word_embedding')
    self.aspect_id_mapping, self.aspect_embed = load_aspect2id(
        FLAGS.aspect_id_file_path, self.word_id_mapping, self.w2v, self.embedding_dim)  # dict(1219), 1220 * 300
    self.aspect_embedding = tf.Variable(
        self.aspect_embed, dtype=tf.float32, name='aspect_embedding')  # define the aspect_embedding variable

    # dropout placeholders
    self.keep_prob1 = tf.placeholder(tf.float32, name="dropout_keep_prob1")
    self.keep_prob2 = tf.placeholder(tf.float32, name="dropout_keep_prob2")

    with tf.name_scope('inputs'):
        self.x = tf.placeholder(tf.int32, [None, self.max_sentence_len], name='x')  # 25 * 80
        self.y = tf.placeholder(tf.int32, [None, self.n_class], name='y')  # 25 * 3
        self.sen_len = tf.placeholder(tf.int32, None, name='sen_len')  # list(25)
        self.aspect_id = tf.placeholder(tf.int32, None, name='aspect_id')  # list(25)

    with tf.name_scope('weights'):
        self.weights = {
            'softmax': tf.get_variable(
                name='softmax_w',
                shape=[self.n_hidden, self.n_class],  # 300 * 3
                initializer=tf.random_uniform_initializer(-0.01, 0.01),
                regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg))
        }

    with tf.name_scope('biases'):
        self.biases = {
            'softmax': tf.get_variable(
                name='softmax_b',
                shape=[self.n_class],  # 3
                initializer=tf.random_uniform_initializer(-0.01, 0.01),
                regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg))
        }

    self.W = tf.get_variable(
        name='W',
        shape=[self.n_hidden + self.embedding_dim, self.n_hidden + self.embedding_dim],  # 600 * 600
        initializer=tf.random_uniform_initializer(-0.01, 0.01),
        regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg))
    self.w = tf.get_variable(
        name='w',
        shape=[self.n_hidden + self.embedding_dim, 1],  # 600 * 1
        initializer=tf.random_uniform_initializer(-0.01, 0.01),
        regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg))
    self.Wp = tf.get_variable(
        name='Wp',
        shape=[self.n_hidden, self.n_hidden],  # 300 * 300
        initializer=tf.random_uniform_initializer(-0.01, 0.01),
        regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg))
    self.Wx = tf.get_variable(
        name='Wx',
        shape=[self.n_hidden, self.n_hidden],  # 300 * 300
        initializer=tf.random_uniform_initializer(-0.01, 0.01),
        regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg))
def main(mode, args):
    # build vocab
    word2index, index2word = build_all_vocab(init_vocab={UNK_WORD: 0, BOS_WORD: 1})
    args.vocab, args.vocab_size, args.index2word = word2index, len(word2index), index2word

    # get data_from_train with only_label = True, the same setting as training the baseline
    args.only_label = True
    train_dataset = BindingDataset('train', args=args)
    data_from_train = (train_dataset.tokenize_max_len, train_dataset.columns_token_max_len,
                       train_dataset.columns_split_marker_max_len, train_dataset.cells_token_max_len,
                       train_dataset.cells_split_marker_max_len, train_dataset.pos_tag_vocab,
                       train_dataset.bert_tokenize_max_len, train_dataset.bert_tokenize_marker_max_len,
                       train_dataset.bert_columns_split_max_len, train_dataset.bert_columns_split_marker_max_len,
                       train_dataset.bert_cells_split_max_len, train_dataset.bert_cells_split_marker_max_len)
    args.tokenize_max_len, args.columns_token_max_len, args.columns_split_marker_max_len, \
        args.cells_token_max_len, args.cells_split_marker_max_len, args.pos_tag_vocab, \
        args.bert_tokenize_max_len, args.bert_tokenize_marker_max_len, args.bert_columns_split_max_len, \
        args.bert_columns_split_marker_max_len, args.bert_cells_split_max_len, \
        args.bert_cells_split_marker_max_len = data_from_train
    logger.info('data_from_train'), logger.info(data_from_train)

    # set only_label
    if mode == 'train baseline':
        args.only_label = True
    elif mode == 'policy gradient':
        args.only_label = False
    elif mode == 'test model':
        args.only_label = True
    elif mode == 'add feature':
        args.only_label = False
    elif mode == 'write cases':
        args.only_label = True
    elif mode == 'anonymous':
        args.only_label = False

    # build train_dataloader
    train_dataset = BindingDataset('train', args=args, data_from_train=data_from_train)
    train_dataloader = DataLoader(dataset=train_dataset, batch_size=args.batch_size, shuffle=args.shuffle)
    # build dev_dataloader
    args.shuffle = False
    dev_dataset = BindingDataset('dev', args=args, data_from_train=data_from_train)
    dev_dataloader = DataLoader(dataset=dev_dataset, batch_size=args.batch_size, shuffle=args.shuffle)
    # build test_dataloader
    # test_dataset = BindingDataset('test', args=args, data_from_train=data_from_train)
    # test_dataloader = DataLoader(dataset=test_dataset, batch_size=args.batch_size, shuffle=args.shuffle)

    # load word embedding
    if args.load_w2v:
        args.embed_matrix = load_word_embedding(args.word_dim, word2index)

    # train
    if mode == 'train baseline':
        if args.model == 'baseline':
            model = Baseline(args=args)
        elif args.model == 'gate':
            if args.bert_model is None:
                model = Gate(args=args)
            else:
                model = BertGate(args=args)
        else:
            raise NotImplementedError
        train(train_dataloader, dev_dataloader, args=args, model=model)
    elif mode == 'policy gradient':
        model = torch.load('./res/' + args.model + '/2816_False_True_True_726425',
                           map_location=lambda storage, loc: storage.cuda(0))
        train_rl(train_dataloader, dev_dataloader, args=args, model=model)
    elif mode == 'test model':
        # also needs the correct 'model' for the dataloader
        model = torch.load('./res/policy_gradient/0.819928_True_True_True_412532',
                           map_location=lambda storage, loc: storage.cuda(0))
        eval(dev_dataloader, args, model, epoch=0)
        eval_rl(dev_dataloader, args, model, epoch=0)
    elif mode == 'add feature':
        model = torch.load('./res/policy_gradient/0.804922_22-16-28',
                           map_location=lambda storage, loc: storage.cuda(0))
        res = test(dev_dataloader, args, model)
        add_abstraction('dev', res=res, args=args)
    elif mode == 'write cases':
        model = torch.load('./res/policy_gradient/0.819928_True_True_True_412532',
                           map_location=lambda storage, loc: storage.cuda(0))
        res_pg = test(dev_dataloader, args, model, sep=' ')
        model = torch.load('./res/gate/epoch100',
                           map_location=lambda storage, loc: storage.cuda(0))
        res_gate = test(dev_dataloader, args, model, sep=' ')
        with open('cases.txt', 'w', encoding='utf-8') as f:
            for key in res_pg.keys():
                # diff between gate and policy gradient
                if res_gate[key]['pred'] != res_pg[key]['pred']:
                    if res_gate[key]['pred'] == res_gate[key]['label']:
                        f.write(key + '\n')
                        f.write('Pred_Gate:\t\t\t\t' + json.dumps(res_gate[key]['pred']) + '\n')
                        f.write('Pred_Policy_Gradient:\t' + json.dumps(res_pg[key]['pred']) + '\n')
                        f.write('Label:\t\t\t\t\t' + json.dumps(res_pg[key]['label']) + '\n')
                        f.write('SQL_Labels:\t\t\t\t' + json.dumps(res_pg[key]['sql_labels']) + '\n' + '\n')
    elif mode == 'anonymous':
        model = torch.load('./res/policy_gradient/0.819928_True_True_True_412532',
                           map_location=lambda storage, loc: storage.cuda(0))
        res = test(train_dataloader, args, model, sep='')
        anonymous('train', res, args)
import tensorflow as tf
import numpy as np

from utils import load_w2v, batch_index, load_word_embedding, load_aspect2id, load_inputs_twitter_at

x_raw = ["$T$ is always fresh and hot - ready to eat !", "food"]
y_test = [1]

word_id_mapping, w2v = load_word_embedding(
    'data/restaurant/word_id_new.txt',
    'data/restaurant/rest_2014_word_embedding_300_new.txt', 300)  # dict(3909), 3910 * 300
aspect_id_mapping, aspect_embed = load_aspect2id(
    'data/restaurant/aspect_id_new.txt', word_id_mapping, w2v, 300)  # dict(1219), 1220 * 300


def change_y_to_onehot(y):
    class_set = set([1, -1, 0])
    n_class = 3
    y_onehot_mapping = {0: 0, 1: 1, -1: 2}
    onehot = []
    for label in y:
        tmp = [0] * n_class
        tmp[y_onehot_mapping[label]] = 1
        onehot.append(tmp)
    return np.asarray(onehot, dtype=np.int32)
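A small sanity check (not part of the original script) showing what change_y_to_onehot produces under the visible label mapping {0: 0, 1: 1, -1: 2}:

# print(change_y_to_onehot([1, -1, 0]))
# expected output:
# [[0 1 0]
#  [0 0 1]
#  [1 0 0]]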
def load_data(self):
    train_data, val_data, test_data = load_csv(self.args.data_path, self.args.data)
    self.train_y = to_categorical(train_data.valid)
    self.val_y = to_categorical(val_data.valid)
    self.test_y = to_categorical(test_data.valid)

    print('Load word embedding')
    self.word2idx, idx2word, self.word_vectors = load_word_embedding(
        self.args.embedding_path, self.args.word_embedding, self.args.data)

    print('Load graph embedding')
    cpc_embed_dict, ipc_embed_dict, uspc_embed_dict = load_code_embeddings(
        self.args.embedding_path, self.args.code_embedding, self.args.data)
    self.cpc2idx, idx2cpc, self.cpc_vectors = create_code_vocab(cpc_embed_dict)
    self.ipc2idx, idx2ipc, self.ipc_vectors = create_code_vocab(ipc_embed_dict)
    self.uspc2idx, idx2uspc, self.uspc_vectors = create_code_vocab(uspc_embed_dict)
    self.max_cpc_len, self.max_ipc_len, self.max_uspc_len = get_code_length(train_data)

    print('Preparing train data')
    train_cpcs, train_ipcs, train_uspcs = convert_code_to_idx(
        train_data, self.max_cpc_len, self.max_ipc_len, self.max_uspc_len,
        self.cpc2idx, self.ipc2idx, self.uspc2idx)
    train_abs_sequence = get_text_sequence(train_data.abstract_text, self.word2idx, self.args.max_length)
    self.train.append(train_cpcs)
    self.train.append(train_ipcs)
    self.train.append(train_uspcs)
    self.train.append(train_abs_sequence)

    print('Preparing validation data')
    val_cpcs, val_ipcs, val_uspcs = convert_code_to_idx(
        val_data, self.max_cpc_len, self.max_ipc_len, self.max_uspc_len,
        self.cpc2idx, self.ipc2idx, self.uspc2idx)
    val_abs_sequence = get_text_sequence(val_data.abstract_text, self.word2idx, self.args.max_length)
    self.val.append(val_cpcs)
    self.val.append(val_ipcs)
    self.val.append(val_uspcs)
    self.val.append(val_abs_sequence)

    print('Preparing test data')
    test_cpcs, test_ipcs, test_uspcs = convert_code_to_idx(
        test_data, self.max_cpc_len, self.max_ipc_len, self.max_uspc_len,
        self.cpc2idx, self.ipc2idx, self.uspc2idx)
    test_abs_sequence = get_text_sequence(test_data.abstract_text, self.word2idx, self.args.max_length)
    self.test.append(test_cpcs)
    self.test.append(test_ipcs)
    self.test.append(test_uspcs)
    self.test.append(test_abs_sequence)
def main(args):
    conf = vars(args)

    device = torch.device("cpu")
    if torch.cuda.is_available():
        torch.cuda.set_device(args.gpu)
        torch.backends.cudnn.benchmark = True
        device = torch.device("cuda", args.gpu)

    if args.output_dir:
        shutil.rmtree(args.output_dir, ignore_errors=True)
        os.makedirs(args.output_dir, exist_ok=True)
    if args.log_dir:
        shutil.rmtree(args.log_dir, ignore_errors=True)
        os.makedirs(args.log_dir, exist_ok=True)

    train_file = os.path.join(args.data_dir, args.train_file)
    valid_file = os.path.join(args.data_dir, args.valid_file)
    test_file = os.path.join(args.data_dir, args.test_file)
    label_file = os.path.join(args.data_dir, args.label_file)

    logging.info("Loading Data")
    if args.do_train:
        cache_path = utils.get_cache_path(train_file, args.cache_dir)
        if args.cache_dataset and os.path.exists(cache_path):
            train_data = torch.load(cache_path)
        else:
            train_data = TextDataset(train_file, label_file, args.max_seq_len, args.min_seq_len, args.doc_stride)
            if args.cache_dataset:
                os.makedirs(args.cache_dir, exist_ok=True)
                torch.save(train_data, cache_path)
        train_loader = DataLoader(
            train_data,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.workers,
            pin_memory=True,
            drop_last=True,
        )

        cache_path = utils.get_cache_path(valid_file, args.cache_dir)
        if args.cache_dataset and os.path.exists(cache_path):
            valid_data = torch.load(cache_path)
        else:
            valid_data = TextDataset(valid_file, label_file, args.max_seq_len, args.min_seq_len, args.doc_stride)
            if args.cache_dataset:
                os.makedirs(args.cache_dir, exist_ok=True)
                torch.save(valid_data, cache_path)
        num_classes = len(valid_data.classes)
        valid_loader = DataLoader(valid_data, batch_size=args.batch_size, num_workers=args.workers, pin_memory=True)

    if args.do_test:
        cache_path = utils.get_cache_path(test_file, args.cache_dir)
        if args.cache_dataset and os.path.exists(cache_path):
            test_data = torch.load(cache_path)
        else:
            test_data = TextDataset(test_file, label_file, args.max_seq_len, args.min_seq_len)
            if args.cache_dataset:
                os.makedirs(args.cache_dir, exist_ok=True)
                torch.save(test_data, cache_path)
        num_classes = len(test_data.classes)
        test_loader = DataLoader(test_data, batch_size=args.batch_size, num_workers=args.workers, pin_memory=True)

    logging.info("Creating Model")
    W = utils.load_word_embedding(args.embedding, add_oov=True)
    model = DTT(
        seq_len=args.max_seq_len,
        num_label=num_classes,
        k_sizes=args.filter_sizes,
        num_k=args.num_filters,
        embedding=W,
        drop_prob=args.drop_prob,
    )
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    if args.resume:
        checkpoint = torch.load(args.resume, map_location="cpu")
        model.load_state_dict(checkpoint["model"])
        optimizer.load_state_dict(checkpoint["optimizer"])

    criterion = nn.CrossEntropyLoss()
    metric = utils.MetricLogger(args.log_dir)
    model.to(device)

    if args.do_train:
        logging.info("Start Training")
        for epoch in range(args.epochs):
            train(model, criterion, optimizer, train_loader, epoch, args.log_freq, device, metric)
            epoch_acc = evaluate(model, criterion, valid_loader, device, metric)
            if args.output_dir:
                checkpoint = {
                    "model": model.state_dict(),
                    "optimizer": optimizer.state_dict(),
                    "epoch": epoch,
                    "args": args,
                }
                if epoch_acc > conf.get("best_acc", 0):
                    conf["best_acc"] = epoch_acc
                    conf["best_epoch"] = epoch
                    json.dump(conf, open(os.path.join(args.output_dir, "conf.json"), "w"), indent=2)
                torch.save(checkpoint, os.path.join(args.output_dir, "model_{}.pth".format(epoch)))
        logging.info("Finished Training")
        metric.close()

    if args.do_test:
        evaluate(model, criterion, test_loader, device, metric)
def main():
    parser = options.get_parser('Trainer')
    options.add_dataset_args(parser)
    options.add_preprocessing_args(parser)
    options.add_model_args(parser)
    options.add_optimization_args(parser)
    options.add_checkpoint_args(parser)

    args = parser.parse_args()
    print(args)

    args.cuda = not args.disable_cuda and torch.cuda.is_available()
    torch.manual_seed(5)

    if args.cuda:
        torch.backends.cudnn.benchmark = True

    # increase recursion depth
    sys.setrecursionlimit(10000)

    # checkpoint
    checkpoint_dir = os.path.dirname(args.checkpoint)
    if not os.path.isdir(checkpoint_dir):
        os.mkdir(checkpoint_dir)

    # load dataset
    train_raw_corpus, val_raw_corpus, test_raw_corpus = utils.load_corpus(args.processed_dir, ddi=False)
    assert train_raw_corpus and val_raw_corpus and test_raw_corpus, 'Corpus not found, please run preprocess.py to obtain corpus!'
    train_corpus = [(line.sent, line.type, line.p1, line.p2) for line in train_raw_corpus]
    val_corpus = [(line.sent, line.type, line.p1, line.p2) for line in val_raw_corpus]

    start_epoch = 0
    caseless = args.caseless
    batch_size = args.batch_size
    num_epoch = args.num_epoch

    # build vocab
    sents = [tup[0] for tup in train_corpus + val_corpus]
    feature_map = utils.build_vocab(sents, min_count=args.min_count, caseless=caseless)
    target_map = ddi2013.target_map

    # get class weights
    _, train_targets = utils.build_corpus(train_corpus, feature_map, target_map, caseless)
    class_weights = torch.Tensor(utils.get_class_weights(train_targets)) if args.class_weight else None

    train_loader, val_loader, test_loader = utils.load_datasets(args.processed_dir, args.train_size, args, feature_map, dataloader=True)

    # build model
    vocab_size = len(feature_map)
    tagset_size = len(target_map)
    model = RelationTreeModel(vocab_size, tagset_size, args)

    # loss
    criterion = utils.build_loss(args, class_weights=class_weights)

    # load states
    if os.path.isfile(args.load_checkpoint):
        print('Loading checkpoint file from {}...'.format(args.load_checkpoint))
        checkpoint_file = torch.load(args.load_checkpoint)
        start_epoch = checkpoint_file['epoch'] + 1
        model.load_state_dict(checkpoint_file['state_dict'])
        # optimizer.load_state_dict(checkpoint_file['optimizer'])
    else:
        print('no checkpoint file found: {}, train from scratch...'.format(args.load_checkpoint))
        if not args.rand_embedding:
            pretrained_word_embedding, in_doc_word_indices = utils.load_word_embedding(args.emb_file, feature_map, args.embedding_dim)
            print(pretrained_word_embedding.size())
            print(vocab_size)
            model.load_pretrained_embedding(pretrained_word_embedding)
            if args.disable_fine_tune:
                model.update_part_embedding(in_doc_word_indices)  # update only non-pretrained words
        model.rand_init(init_embedding=args.rand_embedding)

    # trainer
    trainer = TreeTrainer(args, model, criterion)

    best_f1 = float('-inf')
    if os.path.isfile(args.load_checkpoint):
        dev_prec, dev_rec, dev_f1, _ = evaluate(trainer, val_loader, target_map, cuda=args.cuda)
        test_prec, test_rec, test_f1, _ = evaluate(trainer, test_loader, target_map, cuda=args.cuda)
        best_f1 = dev_f1
        print('checkpoint dev_prec: {:.4f}, dev_rec: {:.4f}, dev_f1: {:.4f}, test_prec: {:.4f}, test_rec: {:.4f}, test_f1: {:.4f}'.format(
            dev_prec, dev_rec, dev_f1, test_prec, test_rec, test_f1))

    track_list = []
    patience_count = 0
    start_time = time.time()
    q = mp.Queue()

    # set start method
    try:
        mp.set_start_method('spawn')
    except RuntimeError:
        pass

    for epoch in range(start_epoch, num_epoch):
        epoch_loss = train(train_loader, trainer, epoch)
        # processes = []
        # for rank in range(args.num_processes):
        #     p = mp.Process(target=train, args=(train_loader, trainer, epoch, q))
        #     p.start()
        #     processes.append(p)
        # for p in processes:
        #     p.join()
        #
        # epoch_loss = q.get()

        # update lr
        trainer.lr_step(epoch_loss)

        dev_prec, dev_rec, dev_f1, dev_loss = evaluate(trainer, val_loader, target_map, cuda=args.cuda)
        test_prec, test_rec, test_f1, _ = evaluate(trainer, test_loader, target_map, cuda=args.cuda)
        if dev_f1 >= best_f1:
            patience_count = 0
            best_f1 = dev_f1

            track_list.append({
                'epoch': epoch, 'loss': epoch_loss,
                'dev_prec': dev_prec, 'dev_rec': dev_rec, 'dev_f1': dev_f1, 'dev_loss': dev_loss,
                'test_prec': test_prec, 'test_rec': test_rec, 'test_f1': test_f1})
            print('epoch: {}, loss: {:.4f}, dev_f1: {:.4f}, dev_loss: {:.4f}, test_f1: {:.4f}\tsaving...'.format(
                epoch, epoch_loss, dev_f1, dev_loss, test_f1))

            try:
                utils.save_checkpoint({
                    'epoch': epoch,
                    'state_dict': model.state_dict(),
                    'optimizer': trainer.optimizer.state_dict(),
                    'f_map': feature_map,
                    't_map': target_map,
                }, {'track_list': track_list,
                    'args': vars(args)
                    }, args.checkpoint)
            except Exception as inst:
                print(inst)
        else:
            patience_count += 1
            track_list.append({
                'epoch': epoch, 'loss': epoch_loss,
                'dev_prec': dev_prec, 'dev_rec': dev_rec, 'dev_f1': dev_f1, 'dev_loss': dev_loss})
            print('epoch: {}, loss: {:.4f}, dev_f1: {:.4f}, dev_loss: {:.4f}, test_f1: {:.4f}'.format(
                epoch, epoch_loss, dev_f1, dev_loss, test_f1))

        print('epoch: {} in {} take: {} s'.format(epoch, args.num_epoch, time.time() - start_time))

        if patience_count >= args.patience:
            break
def main():
    # Training settings
    parser = ArgumentParser()
    parser.add_argument('-d', '--device', default=None, type=str,
                        help='indices of GPUs to enable (default: None)')
    parser.add_argument('-b', '--batch-size', type=int, default=1024,
                        help='number of batch size for training')
    parser.add_argument('-e', '--epochs', type=int, default=100,
                        help='number of epochs to train (default: 100)')
    parser.add_argument('--save-path', type=str, default='result/model.pth',
                        help='path to trained model to save')
    parser.add_argument('--model', choices=['MLP', 'BiLSTM', 'BiLSTMAttn', 'CNN'], default='MLP',
                        help='model name')
    parser.add_argument('--env', choices=['local', 'server'], default='server',
                        help='development environment')
    parser.add_argument('--word-dim', type=int, default=128,
                        help='the dimension of embedding')
    parser.add_argument('--word-lim', type=int, default=None,
                        help='If specified, input sequence length is limited from tail.')
    parser.add_argument('--lr', type=float, default=1e-3,
                        help='learning rate')
    parser.add_argument('--seed', type=int, default=1,
                        help='random seed (default: 1)')
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    os.makedirs(os.path.dirname(args.save_path), exist_ok=True)
    if args.device:
        os.environ["CUDA_VISIBLE_DEVICES"] = args.device
    device = torch.device('cuda:0' if torch.cuda.is_available() and args.device is not None else 'cpu')

    model_w2v = KeyedVectors.load_word2vec_format(W2V_MODEL_FILE[args.env], binary=True)
    word_to_id = word2id(model_w2v)
    initial_embedding = load_word_embedding(model_w2v)

    # setup data_loader instances
    train_data_loader = PosNegDataLoader(TRAIN_FILE[args.env], word_to_id, args.word_lim,
                                         args.batch_size, shuffle=True, num_workers=2)
    valid_data_loader = PosNegDataLoader(VALID_FILE[args.env], word_to_id, args.word_lim,
                                         args.batch_size, shuffle=False, num_workers=2)

    # build model architecture
    if args.model == 'MLP':
        model = MLP(word_dim=args.word_dim, hidden_size=100, vocab_size=len(word_to_id))
    elif args.model == 'BiLSTM':
        model = BiLSTM(word_dim=args.word_dim, hidden_size=100, vocab_size=len(word_to_id))
    elif args.model == 'BiLSTMAttn':
        model = BiLSTMAttn(word_dim=args.word_dim, hidden_size=100, vocab_size=len(word_to_id))
    elif args.model == 'CNN':
        model = CNN(word_dim=args.word_dim, word_lim=args.word_lim, vocab_size=len(word_to_id))
    else:
        raise ValueError(
            f'model name should be "MLP", "BiLSTM", "BiLSTMAttn", or "CNN", but given {args.model}')
    model.set_initial_embedding(initial_embedding)
    model.to(device)

    # build optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)

    best_valid_acc = -1
    for epoch in range(1, args.epochs + 1):
        print(f'*** epoch {epoch} ***')
        # train
        model.train()
        total_loss = 0
        total_correct = 0
        for batch_idx, (source, mask, target) in enumerate(train_data_loader):
            source = source.to(device)  # (b, len)
            mask = mask.to(device)      # (b, len)
            target = target.to(device)  # (b)

            # Forward pass
            output = model(source, mask)  # (b, 2)
            loss = loss_fn(output, target)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            total_correct += metric_fn(output, target)
        print(f'train_loss={total_loss / train_data_loader.n_samples:.3f}', end=' ')
        print(f'train_accuracy={total_correct / train_data_loader.n_samples:.3f}')

        # validation
        model.eval()
        with torch.no_grad():
            total_loss = 0
            total_correct = 0
            for batch_idx, (source, mask, target) in enumerate(valid_data_loader):
                source = source.to(device)  # (b, len)
                mask = mask.to(device)      # (b, len)
                target = target.to(device)  # (b)

                output = model(source, mask)  # (b, 2)
                total_loss += loss_fn(output, target)
                total_correct += metric_fn(output, target)
        valid_acc = total_correct / valid_data_loader.n_samples
        print(f'valid_loss={total_loss / valid_data_loader.n_samples:.3f}', end=' ')
        print(f'valid_accuracy={valid_acc:.3f}\n')
        if valid_acc > best_valid_acc:
            torch.save(model.state_dict(), args.save_path)
            best_valid_acc = valid_acc
def __init__(self, embedding_dim=100, batch_size=64, n_hidden=100, learning_rate=0.01,
             n_class=3, max_sentence_len=50, l2_reg=0., display_step=4, n_iter=100, type_=''):
    self.embedding_dim = embedding_dim
    self.batch_size = batch_size
    self.n_hidden = n_hidden
    self.learning_rate = learning_rate
    self.n_class = n_class
    self.max_sentence_len = max_sentence_len
    self.l2_reg = l2_reg
    self.display_step = display_step
    self.n_iter = n_iter
    self.type_ = type_

    self.word_id_mapping, self.w2v = load_word_embedding(
        FLAGS.word_id_file_path, FLAGS.embedding_file_path, self.embedding_dim)
    # self.word_embedding = tf.constant(self.w2v, dtype=tf.float32, name='word_embedding')
    self.word_embedding = tf.Variable(self.w2v, dtype=tf.float32, name='word_embedding')
    # self.word_id_mapping = load_word_id_mapping(FLAGS.word_id_file_path)
    # self.word_embedding = tf.Variable(
    #     tf.random_uniform([len(self.word_id_mapping), self.embedding_dim], -0.1, 0.1), name='word_embedding')
    self.aspect_id_mapping, self.aspect_embed = load_aspect2id(
        FLAGS.aspect_id_file_path, self.word_id_mapping, self.w2v, self.embedding_dim)
    self.aspect_embedding = tf.Variable(self.aspect_embed, dtype=tf.float32, name='aspect_embedding')

    self.keep_prob1 = tf.placeholder(tf.float32)
    self.keep_prob2 = tf.placeholder(tf.float32)

    with tf.name_scope('inputs'):
        self.x = tf.placeholder(tf.int32, [None, self.max_sentence_len], name='x')
        self.y = tf.placeholder(tf.int32, [None, self.n_class], name='y')
        self.sen_len = tf.placeholder(tf.int32, None, name='sen_len')
        self.aspect_id = tf.placeholder(tf.int32, None, name='aspect_id')

    with tf.name_scope('weights'):
        self.weights = {
            'softmax': tf.get_variable(
                name='softmax_w',
                shape=[self.n_hidden, self.n_class],
                initializer=tf.random_uniform_initializer(-0.01, 0.01),
                regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg))
        }

    with tf.name_scope('biases'):
        self.biases = {
            'softmax': tf.get_variable(
                name='softmax_b',
                shape=[self.n_class],
                initializer=tf.random_uniform_initializer(-0.01, 0.01),
                regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg))
        }

    self.W = tf.get_variable(
        name='W',
        shape=[self.n_hidden + self.embedding_dim, self.n_hidden + self.embedding_dim],
        initializer=tf.random_uniform_initializer(-0.01, 0.01),
        regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg))
    self.w = tf.get_variable(
        name='w',
        shape=[self.n_hidden + self.embedding_dim, 1],
        initializer=tf.random_uniform_initializer(-0.01, 0.01),
        regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg))
    self.Wp = tf.get_variable(
        name='Wp',
        shape=[self.n_hidden, self.n_hidden],
        initializer=tf.random_uniform_initializer(-0.01, 0.01),
        regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg))
    self.Wx = tf.get_variable(
        name='Wx',
        shape=[self.n_hidden, self.n_hidden],
        initializer=tf.random_uniform_initializer(-0.01, 0.01),
        regularizer=tf.contrib.layers.l2_regularizer(self.l2_reg))
config_file = sys.argv[1]
configure = json.load(open(config_file))
config = configure["main_configuration"]
print("Data extraction\nConfiguration: ")
print(json.dumps(config, indent=2), end='\n')

w2v_file = config["pretrained_embedding"]      # pretrained w2v file
data_index = config["index"]                   # Indri index
mapped_w2v_file = config["output_embedding"]   # output shared w2v dict

print('load word dict ...')
word_dict = load_word_dict(data_index)
print("Dictionary length: {}".format(len(word_dict)))

print('load word vectors ...')
embeddings = load_word_embedding(word_dict, w2v_file)

print('save word vectors ...')
with open(mapped_w2v_file, 'w') as fw:
    # assert word_dict
    for w, idx in tqdm(word_dict.items()):
        try:
            print(word_dict[w], ' '.join(map(str, embeddings[idx])), file=fw)
        except Exception as error:
            print('Error saving this word: {}\n'.format(word_dict[w]) + repr(error))
        # print(embeddings[idx])
print('Map word vectors finished ...')
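For reference, each line written to mapped_w2v_file is the word's integer id followed by its space-separated vector components; an illustrative line (values made up) would look like:

# 1042 0.013 -0.271 0.118 ... 0.054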