def main(_):
    from src.io.batch_iterator import BatchIterator

    train = pkl.load(open('../../../../data/se2014task06/tabsa-rest/train.pkl', 'rb'), encoding='latin')
    test = pkl.load(open('../../../../data/se2014task06/tabsa-rest/test.pkl', 'rb'), encoding='latin')

    fns = [
        '../../../../data/se2014task06/tabsa-rest/train.pkl',
        '../../../../data/se2014task06/tabsa-rest/dev.pkl',
        '../../../../data/se2014task06/tabsa-rest/test.pkl',
    ]

    data_dir = '../classifier/data//0617'
    #data_dir = '/Users/wdxu//workspace/absa/TD-LSTM/data/restaurant/for_absa/'
    word2idx, embedding = preprocess_data(
        fns, '/Users/wdxu/data/glove/glove.6B/glove.6B.300d.txt', data_dir)

    train_it = BatchIterator(len(train), FLAGS.batch_size, [train], testing=False)
    test_it = BatchIterator(len(test), FLAGS.batch_size, [test], testing=False)

    configproto = tf.ConfigProto()
    configproto.gpu_options.allow_growth = True
    configproto.allow_soft_placement = True
    with tf.Session(config=configproto) as sess:
        tf.global_variables_initializer().run()
        model = APDecoder(word2idx=word2idx,
                          embedding_dim=FLAGS.embedding_dim,
                          n_hidden=FLAGS.n_hidden,
                          learning_rate=FLAGS.learning_rate,
                          n_class=FLAGS.n_class,
                          max_sentence_len=FLAGS.max_sentence_len + 4,
                          l2_reg=FLAGS.l2_reg,
                          embedding=embedding,
                          dim_z=3,
                          #decoder_type=FLAGS.decoder_type,
                          decoder_type='sclstm',
                          grad_clip=FLAGS.grad_clip,
                          position='distance-add',
                          bidirection=True)

        model.run(sess, train_it, test_it, FLAGS.batch_size, FLAGS.n_iter,
                  FLAGS.keep_rate, '.')

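# A minimal sketch (not part of the original scripts) of the tf.app.flags setup that
# these main(_) entry points assume. Only flag names actually referenced in the code
# are listed, and it reuses the tensorflow import (tf) already used throughout this
# listing; the default values and help strings are illustrative assumptions, not the
# project's real settings.
def _define_flags_sketch():
    flags = tf.app.flags
    flags.DEFINE_integer('batch_size', 64, 'mini-batch size')
    flags.DEFINE_integer('embedding_dim', 300, 'dimension of the word embeddings')
    flags.DEFINE_integer('n_hidden', 300, 'number of LSTM hidden units')
    flags.DEFINE_float('learning_rate', 0.01, 'initial learning rate')
    flags.DEFINE_integer('n_class', 3, 'number of sentiment classes')
    flags.DEFINE_integer('max_sentence_len', 80, 'maximum sentence length after padding')
    flags.DEFINE_float('l2_reg', 0.001, 'L2 regularization weight')
    flags.DEFINE_float('grad_clip', 5.0, 'gradient clipping norm')
    flags.DEFINE_integer('n_iter', 50, 'number of training epochs')
    flags.DEFINE_float('keep_rate', 0.5, 'dropout keep probability')
    return flags.FLAGS
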
def main(_):
    from src.io.batch_iterator import BatchIterator

    train = pkl.load(open('../../../../data/se2014task06/tabsa-rest/train.pkl', 'rb'), encoding='latin')
    test = pkl.load(open('../../../../data/se2014task06/tabsa-rest/test.pkl', 'rb'), encoding='latin')

    fns = [
        '../../../../data/se2014task06/tabsa-rest/train.pkl',
        '../../../../data/se2014task06/tabsa-rest/dev.pkl',
        '../../../../data/se2014task06/tabsa-rest/test.pkl',
    ]

    data_dir = 'tmp'
    #data_dir = '/Users/wdxu//workspace/absa/TD-LSTM/data/restaurant/for_absa/'
    word2idx, target2idx, word_embedding, target_embedding = preprocess_data(
        fns, '../../../../data/glove.6B/glove.6B.300d.txt', data_dir)

    # Append an all-zero row used as the padding embedding; it sits at index
    # len(word2idx), which is also passed below as pad_idx.
    word_embedding = np.concatenate([word_embedding, np.zeros([1, FLAGS.embedding_dim])])
    #print(target_embedding[:10,:])

    train_it = BatchIterator(len(train), FLAGS.batch_size, [train], testing=False)
    test_it = BatchIterator(len(test), FLAGS.batch_size, [test], testing=False)

    configproto = tf.ConfigProto()
    configproto.gpu_options.allow_growth = True
    configproto.allow_soft_placement = True
    with tf.Session(config=configproto) as sess:
        tf.global_variables_initializer().run()
        model = MEMClassifier(nwords=len(word2idx) + 1,
                              word2idx=word2idx,
                              target2idx=target2idx,
                              init_hid=0.1,
                              init_std=0.01,
                              init_lr=0.01,
                              batch_size=FLAGS.batch_size,
                              nhop=3,
                              edim=FLAGS.embedding_dim,
                              mem_size=FLAGS.max_sentence_len,
                              lindim=300,
                              max_grad_norm=100,
                              pad_idx=len(word2idx),
                              pre_trained_context_wt=word_embedding,
                              pre_trained_target_wt=target_embedding,
                              n_class=3)

        logger.info(model)
        model.run(sess, train_it, test_it, FLAGS.n_iter, FLAGS.keep_rate, data_dir)

def main(_):
    tf.reset_default_graph()
    g = tf.Graph()
    with g.as_default():
        tf.set_random_seed(1234)
        np.random.seed(1234)

        from src.io.batch_iterator import BatchIterator

        #train = pkl.load(open('../../../../data/se2014task06/tabsa-rest/train.pkl', 'rb'), encoding='latin')
        #test = pkl.load(open('../../../../data/se2014task06/tabsa-rest/test.pkl', 'rb'), encoding='latin')
        #fns = ['../../../../data/se2014task06/tabsa-rest/train.pkl',
        #       '../../../../data/se2014task06/tabsa-rest/dev.pkl',
        #       '../../../../data/se2014task06/tabsa-rest/test.pkl',]

        #train = pkl.load(open('../../../../data/se2014task06/tabsa-lapt/train.pkl', 'rb'), encoding='latin')
        #test = pkl.load(open('../../../../data/se2014task06/tabsa-lapt/test.pkl', 'rb'), encoding='latin')
        #fns = ['../../../../data/se2014task06/tabsa-lapt/train.pkl',
        #       '../../../../data/se2014task06/tabsa-lapt/dev.pkl',
        #       '../../../../data/se2014task06/tabsa-lapt/test.pkl',]

        train = pkl.load(open(FLAGS.train_file_path, 'rb'), encoding='latin')
        test = pkl.load(open(FLAGS.test_file_path, 'rb'), encoding='latin')
        fns = [FLAGS.train_file_path, FLAGS.test_file_path]

        #data_dir = '../data/rest/unlabel10k_filter/'
        #data_dir = '../data/rest/cbow_labelonly/'
        #data_dir = 'data/lapt_labelonly/'
        data_dir = '../data/lapt/cbow_labelonly'
        #data_dir = '../data/lapt/unlabel10k_filter/'
        #data_dir = '/Users/wdxu//workspace/absa/TD-LSTM/data/restaurant/for_absa/'

        #word2idx, embedding = preprocess_data(fns, '/home/weidi.xwd/workspace//ABSA/data/glove.7B//glove.6B.300d.txt', data_dir)
        #word2idx, embedding = preprocess_data(fns, '/home/fanyin.cxy/ABSA/data/word2vec//cbow.unlabel.300d.txt', data_dir)
        word2idx, embedding = preprocess_data(
            fns,
            '/home/weidi.xwd/workspace/ABSA/data/se2014task06/tabsa-lapt/cbow.unlabel.300d.txt',
            data_dir)

        train_it = BatchIterator(len(train), FLAGS.batch_size, [train], testing=False)
        test_it = BatchIterator(len(test), FLAGS.batch_size, [test], testing=True)

        configproto = tf.ConfigProto()
        configproto.gpu_options.allow_growth = True
        configproto.allow_soft_placement = True
        with tf.Session(config=configproto) as sess:
            tf.global_variables_initializer().run()
            model = TCClassifier(word2idx=word2idx,
                                 embedding_dim=FLAGS.embedding_dim,
                                 n_hidden=FLAGS.n_hidden,
                                 learning_rate=FLAGS.learning_rate,
                                 n_class=FLAGS.n_class,
                                 max_sentence_len=FLAGS.max_sentence_len,
                                 l2_reg=FLAGS.l2_reg,
                                 embedding=embedding,
                                 grad_clip=FLAGS.grad_clip)

            model.run(sess, train_it, test_it, FLAGS.n_iter, FLAGS.keep_rate, '.')

def selftraining(sess, classifier, label_data, unlabel_data, test_data, FLAGS):
    """Classic self-training loop: repeatedly fit the classifier on the labeled set,
    then move the most confidently predicted unlabeled samples, with the predicted
    polarity as pseudo-label, into the labeled set until the unlabeled pool is empty."""
    xa_inputs = classifier.create_placeholders('xa')
    hyper_inputs = classifier.create_placeholders('hyper')
    y_inputs = classifier.create_placeholders('y')

    logits = classifier.forward(xa_inputs, hyper_inputs)
    loss, acc, _ = classifier.get_loss(logits, y_inputs, [0.0] * classifier.n_class)
    pred = tf.argmax(logits, axis=1)
    prob = tf.reduce_max(tf.nn.softmax(logits), axis=1)  # confidence = max class probability

    import time, datetime
    timestamp = str(int(time.time()))
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    save_dir = FLAGS.save_dir + '/selftraining/' + str(timestamp) + '/' + __file__.split('.')[0]
    print(save_dir)
    logger = ExpLogger('semi_tabsa', save_dir)
    logger.write_args(vars(FLAGS)['__flags'])
    logger.write_variables(tf.trainable_variables())
    logger.file_copy(['*.py', 'encoder/*.py', 'decoder/*.py', 'classifier/*.py'])

    def get_feed_dict_help(classifier, plhs, data_dict, keep_rate, is_training):
        plh_dict = {}
        for plh in plhs:
            plh_dict.update(plh)
        data_dict.update({'keep_rate': keep_rate})
        data_dict.update({'is_training': is_training})
        feed_dict = classifier.get_feed_dict(plh_dict, data_dict)
        return feed_dict

    with tf.name_scope('train'):
        loss = tf.reduce_mean(loss)
        optimizer = classifier.training_op(loss, tf.trainable_variables(),
                                           FLAGS.grad_clip, 20, FLAGS.learning_rate,
                                           grads=None, opt='Adam')

    NUM_SELECT = 1000   # unlabeled samples pseudo-labeled per round
    NUM_ITER = 500      # training steps per round

    best_acc_in_rounds, best_f1_in_rounds = [], []
    while len(unlabel_data):
        # re-initialize the model for every self-training round
        tf.global_variables_initializer().run()
        test_it = BatchIterator(len(test_data), FLAGS.batch_size, [test_data], testing=True)
        print(len(unlabel_data))
        selected = []
        new_unlabel = []
        it_cnt = 0
        best_acc, best_f1 = 0, 0
        while True:
            # BatchIterator yields one-element tuples, hence the `for samples, in ...` unpacking
            train_it = BatchIterator(len(label_data), FLAGS.batch_size, [label_data], testing=False)
            unlabel_it = BatchIterator(len(unlabel_data), FLAGS.batch_size, [unlabel_data], testing=True)
            for samples, in train_it:
                it_cnt += 1
                if it_cnt > NUM_ITER:
                    break
                feed_dict = get_feed_dict_help(classifier,
                                               plhs=[xa_inputs, y_inputs, hyper_inputs],
                                               data_dict=classifier.prepare_data(samples),
                                               keep_rate=FLAGS.keep_rate,
                                               is_training=True)
                _, _loss, _acc, _step = sess.run([optimizer, loss, acc, classifier.global_step],
                                                 feed_dict=feed_dict)
                #print('Train: step {}, acc {}, loss {}'.format(it_cnt, _acc, _loss))

            ### proc test
            test_acc, cnt = 0, 0
            y_true = []
            y_pred = []
            for samples, in test_it:
                data_dict = classifier.prepare_data(samples)
                feed_dict = get_feed_dict_help(classifier,
                                               plhs=[xa_inputs, y_inputs, hyper_inputs],
                                               data_dict=data_dict,
                                               keep_rate=1.0,
                                               is_training=False)
                num = len(samples)
                _acc, _loss, _pred, _step = sess.run([acc, loss, pred, classifier.global_step],
                                                     feed_dict=feed_dict)
                y_pred.extend(list(_pred))
                y_true.extend(list(np.argmax(data_dict['y'], 1)))
                test_acc += _acc * num
                cnt += num
            test_acc = test_acc / cnt
            test_f1 = f1_score(y_true, y_pred, average='macro')
            logger.info('Test: step {}, test acc={:.6f}, test f1={:.6f}'.format(it_cnt, test_acc, test_f1))
            best_f1 = max(best_f1, test_f1)

            ### proc unlabel
            # whenever test accuracy improves, re-score the unlabeled pool and keep the
            # NUM_SELECT most confident predictions as pseudo-labeled candidates
            if best_acc < test_acc:
                best_acc = test_acc
                _unlabel = []
                _preds = []
                _probs = []
                y_dict = {0: 'positive', 1: 'negative', 2: 'neutral'}
                for samples, in unlabel_it:
                    feed_dict = get_feed_dict_help(classifier,
                                                   plhs=[xa_inputs, hyper_inputs],
                                                   data_dict=classifier.prepare_data(samples),
                                                   keep_rate=1.0,
                                                   is_training=False)
                    _pred, _prob = sess.run([pred, prob], feed_dict=feed_dict)
                    _unlabel.extend(samples)
                    _preds.extend(list(_pred))
                    _probs.extend(list(_prob))
                top_k_id = np.argsort(_probs)[::-1][:NUM_SELECT]
                remain_id = np.argsort(_probs)[::-1][NUM_SELECT:]
                selected = [_unlabel[idx] for idx in top_k_id]
                preds = [_preds[idx] for idx in top_k_id]
                for idx, sample in enumerate(selected):
                    sample['polarity'] = y_dict[preds[idx]]
                new_unlabel = [_unlabel[idx] for idx in remain_id]

            if it_cnt > NUM_ITER:
                best_acc_in_rounds.append(best_acc)
                best_f1_in_rounds.append(best_f1)
                logger.info(str(best_acc_in_rounds) + str(best_f1_in_rounds))
                break

        # grow the labeled set with the pseudo-labeled samples and shrink the unlabeled pool
        label_data.extend(selected)
        unlabel_data = new_unlabel
        #print(max(best_acc_in_rounds), max(best_f1_in_rounds))

    logger.info(str(best_acc_in_rounds) + str(best_f1_in_rounds))

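# A self-contained illustration (with made-up sample dicts, reusing the numpy import
# np already used in this listing) of the pseudo-labelling step inside selftraining()
# above: rank the unlabeled pool by the classifier's max softmax probability, take the
# num_select most confident predictions as new labeled samples, and keep the rest as
# the remaining unlabeled pool. The helper name is hypothetical, not part of the repo.
def select_most_confident(probs, preds, samples, num_select):
    """probs, preds and samples are aligned lists over the unlabeled pool."""
    order = np.argsort(probs)[::-1]                       # most confident first
    top_k_id, remain_id = order[:num_select], order[num_select:]
    y_dict = {0: 'positive', 1: 'negative', 2: 'neutral'}
    selected = []
    for idx in top_k_id:
        sample = dict(samples[idx])
        sample['polarity'] = y_dict[preds[idx]]           # attach the pseudo-label
        selected.append(sample)
    remaining = [samples[idx] for idx in remain_id]
    return selected, remaining

# Example with toy values:
#   selected, remaining = select_most_confident(
#       probs=[0.9, 0.4, 0.7], preds=[0, 2, 1],
#       samples=[{'sent': 'a'}, {'sent': 'b'}, {'sent': 'c'}], num_select=2)
#   -> 'a' and 'c' are pseudo-labelled ('positive', 'negative'); 'b' stays unlabeled.
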
def main(_):
    FLAGS = tf.app.flags.FLAGS

    import time, datetime
    timestamp = str(int(time.time()))
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    save_dir = FLAGS.save_dir + '/' + str(timestamp) + '_' + 'ctype' + FLAGS.classifier_type \
        + '_r' + str(FLAGS.learning_rate) + '_l' + str(FLAGS.l2_reg) \
        + '_alpha' + str(FLAGS.alpha) + '_batchsize' + str(FLAGS.batch_size) + '_hidae' + str(FLAGS.n_hidden_ae) \
        + '_dimz' + str(FLAGS.dim_z) + '_dec' + str(FLAGS.decoder_type) + '_unlabel' + str(FLAGS.n_unlabel) \
        + '_positionenc' + str(FLAGS.position_enc) + '_bidirectionenc' + str(FLAGS.bidirection_enc) \
        + '_positiondec' + str(FLAGS.position_dec) + '_bidirectiondec' + str(FLAGS.bidirection_dec) \
        + '_hop3_opimadagrad_vochas10kunl_addunkpd_noapconcattindec_kl1e-4_noh_fixemb'
    #save_dir = 'tmp'

    from src.io.batch_iterator import BatchIterator

    train = pkl.load(open(FLAGS.train_file_path, 'rb'), encoding='latin')
    unlabel = pkl.load(open(FLAGS.unlabel_file_path, 'rb'), encoding='latin')[:FLAGS.n_unlabel]
    test = pkl.load(open(FLAGS.test_file_path, 'rb'), encoding='latin')
    val = pkl.load(open(FLAGS.validate_file_path, 'rb'), encoding='latin')

    def get_y(samples):
        y_dict = {'positive': [1, 0, 0], 'negative': [0, 1, 0], 'neutral': [0, 0, 1]}
        ys = [y_dict[sample['polarity']] for sample in samples]
        return ys

    y = get_y(train)
    pri_prob_y = (np.sum(y, axis=0) / len(y)).astype('float32')
    #pri_prob_y = np.ones(FLAGS.n_class).astype('float32') / FLAGS.n_class
    print(pri_prob_y)

    fns = [FLAGS.train_file_path, FLAGS.test_file_path, FLAGS.unlabel_file_path]

    data_dir = 'data/rest/ian/'
    #emb_file = "../../../data/word2vec/cbow.unlabel.300d.txt"
    emb_file = "../../../data/glove.6B/glove.6B.300d.txt"
    #emb_file = "../../../data/glove.840B/glove.840B.300d.txt"
    word2idx, target2idx, word_embedding, target_embedding = preprocess_data(fns, emb_file, data_dir, FLAGS)

    train_it = BatchIterator(len(train), FLAGS.batch_size, [train], testing=False)
    unlabel_it = BatchIterator(len(unlabel), FLAGS.batch_size, [unlabel], testing=False)
    test_it = BatchIterator(len(test), FLAGS.batch_size, [test], testing=False)
    #val_it = BatchIterator(len(val), FLAGS.batch_size, [val], testing=False)

    configproto = tf.ConfigProto()
    configproto.gpu_options.allow_growth = True
    configproto.allow_soft_placement = True
    with tf.Session(config=configproto) as sess:
        tf.global_variables_initializer().run()
        model = SemiTABSA(word2idx=word2idx,
                          target2idx=target2idx,
                          embedding_dim=FLAGS.embedding_dim,
                          batch_size=FLAGS.batch_size,
                          n_hidden=FLAGS.n_hidden,
                          learning_rate=FLAGS.learning_rate,
                          n_class=FLAGS.n_class,
                          max_sentence_len=FLAGS.max_sentence_len,
                          l2_reg=FLAGS.l2_reg,
                          word_embedding=word_embedding,
                          target_embedding=target_embedding,
                          dim_z=FLAGS.dim_z,
                          pri_prob_y=pri_prob_y,
                          decoder_type=FLAGS.decoder_type,
                          grad_clip=FLAGS.grad_clip,
                          n_hidden_ae=FLAGS.n_hidden_ae,
                          position_enc=FLAGS.position_enc,
                          bidirection_enc=FLAGS.bidirection_enc,
                          position_dec=FLAGS.position_dec,
                          bidirection_dec=FLAGS.bidirection_dec,
                          classifier_type=FLAGS.classifier_type)

        model.run(sess, train_it, unlabel_it, test_it, FLAGS.n_iter, FLAGS.keep_rate,
                  save_dir, FLAGS.batch_size, FLAGS.alpha, vars(FLAGS)['__flags'])

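# Quick worked example of the class prior pri_prob_y computed in main() above
# (toy counts, not the actual corpus statistics): with 2 positive, 1 negative and
# 1 neutral training samples, get_y() yields one-hot rows and the column means
# give the prior.
#
#   y = [[1, 0, 0], [1, 0, 0], [0, 1, 0], [0, 0, 1]]
#   np.sum(y, axis=0) / len(y)  ->  array([0.5, 0.25, 0.25])
#
# SemiTABSA receives this vector as pri_prob_y, i.e. p(y) estimated from the
# labeled training set.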
def main(_):
    #tf.reset_default_graph()
    #tf.Graph().as_default()
    tf.set_random_seed(1234)
    np.random.seed(1234)

    FLAGS = tf.app.flags.FLAGS
    # the bidirection/sharefc flags arrive as strings; map the literal 'True' to a boolean
    FLAGS.bidirection_enc = True if FLAGS.bidirection_enc == 'True' else False
    FLAGS.bidirection_dec = True if FLAGS.bidirection_dec == 'True' else False
    FLAGS.sharefc = True if FLAGS.sharefc == 'True' else False

    if 'lapt' in FLAGS.train_file_path:
        data_dir = 'data/lapt/tclstm'
        data_name = 'lapt'
    else:
        data_dir = 'data/rest/tclstm'
        data_name = 'rest'

    import time, datetime
    timestamp = str(int(time.time()))
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    save_dir = FLAGS.save_dir + '/' + str(timestamp) + '_' + '_r' + str(FLAGS.learning_rate) + '_l2' + str(FLAGS.l2_reg) \
        + '_alpha' + str(FLAGS.alpha) + '_bs' + str(FLAGS.batch_size) + '_hidae' + str(FLAGS.n_hidden_ae) \
        + '_dz' + str(FLAGS.dim_z) + '_dec' + str(FLAGS.decoder_type) + '_u' + str(FLAGS.n_unlabel) \
        + '_penc' + str(FLAGS.position_enc) + '_bienc' + str(FLAGS.bidirection_enc) \
        + '_pdec' + str(FLAGS.position_dec) + '_bidec' + str(FLAGS.bidirection_dec) \
        + '_sharefc' + str(FLAGS.sharefc) + '_data' + str(data_name) \
        + '_trunctdis'  #+ '_vochas10kunl_addunkpd_noapconcattindec_kl1e-4_noh_filteremb_trunctdis_sharefc_decfcdepb'
    #save_dir = 'tmp'

    from src.io.batch_iterator import BatchIterator

    train = pkl.load(open(FLAGS.train_file_path, 'rb'), encoding='latin')
    unlabel = pkl.load(open(FLAGS.unlabel_file_path, 'rb'), encoding='latin')[:FLAGS.n_unlabel]
    test = pkl.load(open(FLAGS.test_file_path, 'rb'), encoding='latin')

    def get_y(samples):
        y_dict = {'positive': [1, 0, 0], 'negative': [0, 1, 0], 'neutral': [0, 0, 1]}
        ys = [y_dict[sample['polarity']] for sample in samples]
        return ys

    y = get_y(train)
    pri_prob_y = (np.sum(y, axis=0) / len(y)).astype('float32')
    #pri_prob_y = np.ones(FLAGS.n_class).astype('float32') / FLAGS.n_class
    print(pri_prob_y)

    fns = [FLAGS.train_file_path, FLAGS.unlabel_file_path, FLAGS.test_file_path]
    word2idx, embedding = preprocess_data(fns, '../../../data/glove.6B/glove.6B.300d.txt', data_dir)

    train_it = BatchIterator(len(train), FLAGS.batch_size, [train], testing=False)
    unlabel_it = BatchIterator(len(unlabel), FLAGS.batch_size, [unlabel], testing=False)
    test_it = BatchIterator(len(test), FLAGS.batch_size, [test], testing=True)

    configproto = tf.ConfigProto()
    configproto.gpu_options.allow_growth = True
    configproto.allow_soft_placement = True
    with tf.Session(config=configproto) as sess:
        tf.global_variables_initializer().run()
        model = SemiTABSA(word2idx=word2idx,
                          embedding_dim=FLAGS.embedding_dim,
                          batch_size=FLAGS.batch_size,
                          n_hidden=FLAGS.n_hidden,
                          learning_rate=FLAGS.learning_rate,
                          n_class=FLAGS.n_class,
                          max_sentence_len=FLAGS.max_sentence_len,
                          l2_reg=FLAGS.l2_reg,
                          embedding=embedding,
                          dim_z=FLAGS.dim_z,
                          pri_prob_y=pri_prob_y,
                          decoder_type=FLAGS.decoder_type,
                          grad_clip=FLAGS.grad_clip,
                          n_hidden_ae=FLAGS.n_hidden_ae,
                          position_enc=FLAGS.position_enc,
                          bidirection_enc=FLAGS.bidirection_enc,
                          position_dec=FLAGS.position_dec,
                          bidirection_dec=FLAGS.bidirection_dec,
                          classifier_type=FLAGS.classifier_type,
                          sharefc=FLAGS.sharefc)

        model.run(sess, train_it, unlabel_it, test_it, FLAGS.n_iter, FLAGS.keep_rate,
                  save_dir, FLAGS.batch_size, FLAGS.alpha, vars(FLAGS)['__flags'])

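# Standard TensorFlow 1.x entry point that scripts written around tf.app.flags and a
# main(_) function typically end with; a sketch of what these drivers presumably use
# (the actual DEFINE_* calls and module footer live elsewhere in the repo).
# tf.app.run() parses the command line into tf.app.flags.FLAGS and then invokes main(_).
if __name__ == '__main__':
    tf.app.run()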