Example #1
def main(_):
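    # Example driver: loads the pickled `tabsa-rest` train/test splits, builds
    # GloVe-based embeddings via preprocess_data, and trains/evaluates an
    # APDecoder model. Assumes module-level imports such as `pickle as pkl`
    # and `tensorflow as tf`, plus `FLAGS`, `preprocess_data`, and `APDecoder`
    # defined elsewhere in the repository.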
    from src.io.batch_iterator import BatchIterator
    train = pkl.load(open('../../../../data/se2014task06/tabsa-rest/train.pkl',
                          'rb'),
                     encoding='latin')
    test = pkl.load(open('../../../../data/se2014task06/tabsa-rest/test.pkl',
                         'rb'),
                    encoding='latin')

    fns = [
        '../../../../data/se2014task06/tabsa-rest/train.pkl',
        '../../../../data/se2014task06/tabsa-rest/dev.pkl',
        '../../../../data/se2014task06/tabsa-rest/test.pkl',
    ]

    data_dir = '../classifier/data//0617'
    #data_dir = '/Users/wdxu//workspace/absa/TD-LSTM/data/restaurant/for_absa/'
    word2idx, embedding = preprocess_data(
        fns, '/Users/wdxu/data/glove/glove.6B/glove.6B.300d.txt', data_dir)
    train_it = BatchIterator(len(train),
                             FLAGS.batch_size, [train],
                             testing=False)
    test_it = BatchIterator(len(test), FLAGS.batch_size, [test], testing=False)

    configproto = tf.ConfigProto()
    configproto.gpu_options.allow_growth = True
    configproto.allow_soft_placement = True
    with tf.Session(config=configproto) as sess:
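        # NOTE: this initializer op is created before APDecoder builds its
        # graph, so it does not cover the model's variables; model.run() is
        # assumed to handle their initialization.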
        tf.global_variables_initializer().run()

        model = APDecoder(
            word2idx=word2idx,
            embedding_dim=FLAGS.embedding_dim,
            n_hidden=FLAGS.n_hidden,
            learning_rate=FLAGS.learning_rate,
            n_class=FLAGS.n_class,
            max_sentence_len=FLAGS.max_sentence_len + 4,
            l2_reg=FLAGS.l2_reg,
            embedding=embedding,
            dim_z=3,
            #decoder_type=FLAGS.decoder_type,
            decoder_type='sclstm',
            grad_clip=FLAGS.grad_clip,
            position='distance-add',
            bidirection=True)

        model.run(sess, train_it, test_it, FLAGS.batch_size, FLAGS.n_iter,
                  FLAGS.keep_rate, '.')
Example #2
def main(_):
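    # Example driver for a memory-network style classifier (MEMClassifier):
    # loads the pickled `tabsa-rest` splits, builds word/target embeddings from
    # GloVe, pads the word embedding matrix with a zero row for the padding
    # index, and trains/evaluates the model. Assumes `pkl`, `tf`, `np`,
    # `FLAGS`, `logger`, `preprocess_data`, and `MEMClassifier` come from the
    # surrounding module.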
    from src.io.batch_iterator import BatchIterator
    train = pkl.load(open('../../../../data/se2014task06/tabsa-rest/train.pkl', 'rb'), encoding='latin')
    test = pkl.load(open('../../../../data/se2014task06/tabsa-rest/test.pkl', 'rb'), encoding='latin')
    
    fns = ['../../../../data/se2014task06/tabsa-rest/train.pkl',
            '../../../../data/se2014task06/tabsa-rest/dev.pkl',
            '../../../../data/se2014task06/tabsa-rest/test.pkl',]

    data_dir = 'tmp'
    #data_dir = '/Users/wdxu//workspace/absa/TD-LSTM/data/restaurant/for_absa/'
    word2idx, target2idx, word_embedding, target_embedding = preprocess_data(fns, '../../../../data/glove.6B/glove.6B.300d.txt', data_dir)
    word_embedding = np.concatenate([word_embedding, np.zeros([1, FLAGS.embedding_dim])])
    #print(target_embedding[:10,:])
    train_it = BatchIterator(len(train), FLAGS.batch_size, [train], testing=False)
    test_it = BatchIterator(len(test), FLAGS.batch_size, [test], testing=False)
 
    configproto = tf.ConfigProto()
    configproto.gpu_options.allow_growth = True 
    configproto.allow_soft_placement = True
    with tf.Session(config=configproto) as sess:
        tf.global_variables_initializer().run()

        model = MEMClassifier(nwords=len(word2idx) + 1,
                              word2idx=word2idx,
                              target2idx=target2idx,
                              init_hid=0.1,
                              init_std=0.01,
                              init_lr=0.01,
                              batch_size=FLAGS.batch_size,
                              nhop=3,
                              edim=FLAGS.embedding_dim,
                              mem_size=FLAGS.max_sentence_len,
                              lindim=300,
                              max_grad_norm=100,
                              pad_idx=len(word2idx),
                              pre_trained_context_wt=word_embedding,
                              pre_trained_target_wt=target_embedding,
                              n_class=3)
        logger.info(model)            
        model.run(sess, train_it, test_it, FLAGS.n_iter, FLAGS.keep_rate, data_dir)
Example #3
def main(_):
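    # Example driver for TCClassifier: builds a fresh graph with fixed random
    # seeds for reproducibility, loads the train/test pickles from FLAGS
    # paths, prepares CBOW word embeddings via preprocess_data, and
    # trains/evaluates the model. Assumes `pkl`, `tf`, `np`, `FLAGS`,
    # `preprocess_data`, and `TCClassifier` are defined at module level.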
    tf.reset_default_graph()
    g = tf.Graph()
    with g.as_default():
        tf.set_random_seed(1234)
        np.random.seed(1234)

        from src.io.batch_iterator import BatchIterator
        #train = pkl.load(open('../../../../data/se2014task06/tabsa-rest/train.pkl', 'rb'), encoding='latin')
        #test = pkl.load(open('../../../../data/se2014task06/tabsa-rest/test.pkl', 'rb'), encoding='latin')

        #fns = ['../../../../data/se2014task06/tabsa-rest/train.pkl',
        #'../../../../data/se2014task06/tabsa-rest/dev.pkl',
        #'../../../../data/se2014task06/tabsa-rest/test.pkl',]

        #train = pkl.load(open('../../../../data/se2014task06/tabsa-lapt/train.pkl', 'rb'), encoding='latin')
        #test = pkl.load(open('../../../../data/se2014task06/tabsa-lapt/test.pkl', 'rb'), encoding='latin')

        #fns = ['../../../../data/se2014task06/tabsa-lapt/train.pkl',
        #'../../../../data/se2014task06/tabsa-lapt/dev.pkl',
        #'../../../../data/se2014task06/tabsa-lapt/test.pkl',]

        train = pkl.load(open(FLAGS.train_file_path, 'rb'), encoding='latin')
        test = pkl.load(open(FLAGS.test_file_path, 'rb'), encoding='latin')

        fns = [FLAGS.train_file_path, FLAGS.test_file_path]

        #data_dir = '../data/rest/unlabel10k_filter/'
        #data_dir = '../data/rest/cbow_labelonly/'
        #data_dir = 'data/lapt_labelonly/'
        data_dir = '../data/lapt/cbow_labelonly'
        #data_dir = '../data/lapt/unlabel10k_filter/'
        #data_dir = '/Users/wdxu//workspace/absa/TD-LSTM/data/restaurant/for_absa/'
        #word2idx, embedding = preprocess_data(fns, '/home/weidi.xwd/workspace//ABSA/data/glove.7B//glove.6B.300d.txt', data_dir)
        #word2idx, embedding = preprocess_data(fns, '/home/fanyin.cxy/ABSA/data/word2vec//cbow.unlabel.300d.txt', data_dir)
        word2idx, embedding = preprocess_data(
            fns,
            '/home/weidi.xwd/workspace/ABSA/data/se2014task06/tabsa-lapt/cbow.unlabel.300d.txt',
            data_dir)
        train_it = BatchIterator(len(train),
                                 FLAGS.batch_size, [train],
                                 testing=False)
        test_it = BatchIterator(len(test),
                                FLAGS.batch_size, [test],
                                testing=True)

        configproto = tf.ConfigProto()
        configproto.gpu_options.allow_growth = True
        configproto.allow_soft_placement = True
        with tf.Session(config=configproto) as sess:
            tf.global_variables_initializer().run()

            model = TCClassifier(word2idx=word2idx,
                                 embedding_dim=FLAGS.embedding_dim,
                                 n_hidden=FLAGS.n_hidden,
                                 learning_rate=FLAGS.learning_rate,
                                 n_class=FLAGS.n_class,
                                 max_sentence_len=FLAGS.max_sentence_len,
                                 l2_reg=FLAGS.l2_reg,
                                 embedding=embedding,
                                 grad_clip=FLAGS.grad_clip)

            model.run(sess, train_it, test_it, FLAGS.n_iter, FLAGS.keep_rate,
                      '.')
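Example #4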
def selftraining(sess, classifier, label_data, unlabel_data, test_data, FLAGS):
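    # Self-training loop: repeatedly trains `classifier` on the labelled set,
    # tracks test accuracy/F1, and whenever test accuracy improves it
    # pseudo-labels the unlabelled pool, keeping the most confident
    # predictions to move into the labelled set for the next round.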
    xa_inputs = classifier.create_placeholders('xa')
    hyper_inputs = classifier.create_placeholders('hyper')
    y_inputs = classifier.create_placeholders('y')

    logits = classifier.forward(xa_inputs, hyper_inputs)
    loss, acc, _ = classifier.get_loss(logits, y_inputs,
                                       [0.0] * classifier.n_class)
    pred = tf.argmax(logits, axis=1)
    prob = tf.reduce_max(tf.nn.softmax(logits), axis=1)

    import datetime
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

    save_dir = FLAGS.save_dir + '/selftraining/' + str(
        timestamp) + '/' + __file__.split('.')[0]
    print(save_dir)
    logger = ExpLogger('semi_tabsa', save_dir)
    logger.write_args(vars(FLAGS)['__flags'])
    logger.write_variables(tf.trainable_variables())
    logger.file_copy(
        ['*.py', 'encoder/*.py', 'decoder/*.py', 'classifier/*.py'])

    def get_feed_dict_help(classifier, plhs, data_dict, keep_rate,
                           is_training):
        plh_dict = {}
        for plh in plhs:
            plh_dict.update(plh)
        data_dict.update({'keep_rate': keep_rate})
        data_dict.update({'is_training': is_training})
        feed_dict = classifier.get_feed_dict(plh_dict, data_dict)
        return feed_dict

    with tf.name_scope('train'):
        loss = tf.reduce_mean(loss)
        optimizer = classifier.training_op(loss,
                                           tf.trainable_variables(),
                                           FLAGS.grad_clip,
                                           20,
                                           FLAGS.learning_rate,
                                           grads=None,
                                           opt='Adam')

    NUM_SELECT = 1000
    NUM_ITER = 500
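    # Schedule: each round trains for up to NUM_ITER mini-batches, and at most
    # NUM_SELECT pseudo-labelled samples are promoted into the labelled set.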
    best_acc_in_rounds, best_f1_in_rounds = [], []
    while len(unlabel_data):
        tf.global_variables_initializer().run()
        test_it = BatchIterator(len(test_data),
                                FLAGS.batch_size, [test_data],
                                testing=True)
        print(len(unlabel_data))

        selected = []
        new_unlabel = []
        it_cnt = 0
        best_acc, best_f1 = 0, 0
        while True:

            train_it = BatchIterator(len(label_data),
                                     FLAGS.batch_size, [label_data],
                                     testing=False)
            unlabel_it = BatchIterator(len(unlabel_data),
                                       FLAGS.batch_size, [unlabel_data],
                                       testing=True)

            for samples, in train_it:
                it_cnt += 1
                if it_cnt > NUM_ITER:
                    break
                feed_dict = get_feed_dict_help(
                    classifier,
                    plhs=[xa_inputs, y_inputs, hyper_inputs],
                    data_dict=classifier.prepare_data(samples),
                    keep_rate=FLAGS.keep_rate,
                    is_training=True)

                _, _loss, _acc, _step = sess.run(
                    [optimizer, loss, acc, classifier.global_step],
                    feed_dict=feed_dict)
                #print('Train: step {}, acc {}, loss {}'.format(it_cnt, _acc, _loss))

                ### proc test: evaluate on the full test set after every training step
                test_acc, cnt = 0, 0
                y_true = []
                y_pred = []
                for samples, in test_it:
                    data_dict = classifier.prepare_data(samples)
                    feed_dict = get_feed_dict_help(
                        classifier,
                        plhs=[xa_inputs, y_inputs, hyper_inputs],
                        data_dict=data_dict,
                        keep_rate=1.0,
                        is_training=False)

                    num = len(samples)
                    _acc, _loss, _pred, _step = sess.run(
                        [acc, loss, pred, classifier.global_step],
                        feed_dict=feed_dict)
                    y_pred.extend(list(_pred))
                    y_true.extend(list(np.argmax(data_dict['y'], 1)))
                    test_acc += _acc * num
                    cnt += num
                test_acc = test_acc / cnt
                test_f1 = f1_score(y_true, y_pred, average='macro')
                logger.info(
                    'Test: step {}, test acc={:.6f}, test f1={:.6f}'.format(
                        it_cnt, test_acc, test_f1))
                best_f1 = max(best_f1, test_f1)

                ### proc unlabel: when test accuracy improves, pseudo-label the
                ### unlabelled pool and keep the top-NUM_SELECT most confident samples
                if best_acc < test_acc:
                    best_acc = test_acc
                    _unlabel = []
                    _preds = []
                    _probs = []
                    y_dict = {0: 'positive', 1: 'negative', 2: 'neutral'}

                    for samples, in unlabel_it:
                        feed_dict = get_feed_dict_help(
                            classifier,
                            plhs=[xa_inputs, hyper_inputs],
                            data_dict=classifier.prepare_data(samples),
                            keep_rate=1.0,
                            is_training=False)

                        _pred, _prob = sess.run([pred, prob],
                                                feed_dict=feed_dict)
                        _unlabel.extend(samples)
                        _preds.extend(list(_pred))
                        _probs.extend(list(_prob))

                    top_k_id = np.argsort(_probs)[::-1][:NUM_SELECT]
                    remain_id = np.argsort(_probs)[::-1][NUM_SELECT:]
                    selected = [_unlabel[idx] for idx in top_k_id]
                    preds = [_preds[idx] for idx in top_k_id]
                    for idx, sample in enumerate(selected):
                        sample['polarity'] = y_dict[preds[idx]]
                    new_unlabel = [_unlabel[idx] for idx in remain_id]

            if it_cnt > NUM_ITER:
                best_acc_in_rounds.append(best_acc)
                best_f1_in_rounds.append(best_f1)
                logger.info(str(best_acc_in_rounds) + str(best_f1_in_rounds))
                break

        label_data.extend(selected)
        unlabel_data = new_unlabel

    #print(max(best_acc_in_rounds), max(best_f1_in_rounds))
    logger.info(str(best_acc_in_rounds) + str(best_f1_in_rounds))
Example #5
def main(_):
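    # Example driver for the semi-supervised model (SemiTABSA): combines a
    # labelled train set with an unlabelled pool, estimates the class prior
    # from the training labels, builds word/target embeddings from GloVe, and
    # encodes the main hyperparameters into `save_dir`. Assumes `pkl`, `tf`,
    # `np`, the `FLAGS` definitions, `preprocess_data`, and `SemiTABSA` are
    # provided by the surrounding module.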
    FLAGS = tf.app.flags.FLAGS

    import datetime
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    save_dir = FLAGS.save_dir + '/' + str(timestamp) + '_' + 'ctype'+FLAGS.classifier_type + '_r' +  str(FLAGS.learning_rate) + '_l' + str(FLAGS.l2_reg)\
                + '_alpha' + str(FLAGS.alpha) + '_batchsize' + str(FLAGS.batch_size) + '_hidae' + str(FLAGS.n_hidden_ae)\
                + '_dimz' + str(FLAGS.dim_z)  + '_dec' + str(FLAGS.decoder_type) + '_unlabel' + str(FLAGS.n_unlabel)\
                + '_positionenc' + str(FLAGS.position_enc) + '_bidirectionenc' + str(FLAGS.bidirection_enc)\
                + '_positiondec' + str(FLAGS.position_dec) + '_bidirectiondec' + str(FLAGS.bidirection_dec)\
                + '_hop3_opimadagrad_vochas10kunl_addunkpd_noapconcattindec_kl1e-4_noh_fixemb'
    #save_dir = 'tmp'

    from src.io.batch_iterator import BatchIterator
    train = pkl.load(open(FLAGS.train_file_path, 'rb'), encoding='latin')
    unlabel = pkl.load(open(FLAGS.unlabel_file_path, 'rb'), encoding='latin')[:FLAGS.n_unlabel]
    test = pkl.load(open(FLAGS.test_file_path, 'rb'), encoding='latin')
    val = pkl.load(open(FLAGS.validate_file_path, 'rb'), encoding='latin')
     
    def get_y(samples):
        y_dict = {'positive': [1,0,0], 'negative': [0, 1, 0], 'neutral': [0, 0, 1]}
        ys = [y_dict[sample['polarity']] for sample in samples]
        return ys

    y = get_y(train)
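    # Empirical class prior p(y) estimated from the training labels; passed to
    # the model below as `pri_prob_y`.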
    pri_prob_y = (np.sum(y, axis=0)/len(y)).astype('float32')
    #pri_prob_y = np.ones(FLAGS.n_class).astype('float32')/FLAGS.n_class
    print(pri_prob_y)
    
    fns = [FLAGS.train_file_path,  FLAGS.test_file_path, FLAGS.unlabel_file_path]

    data_dir = 'data/rest/ian/'
    #emb_file = "../../../data/word2vec/cbow.unlabel.300d.txt"
    emb_file = "../../../data/glove.6B/glove.6B.300d.txt"
    #emb_file = "../../../data/glove.840B/glove.840B.300d.txt"
    word2idx, target2idx, word_embedding, target_embedding = preprocess_data(fns, emb_file, data_dir, FLAGS)
    train_it = BatchIterator(len(train), FLAGS.batch_size, [train], testing=False)
    unlabel_it = BatchIterator(len(unlabel), FLAGS.batch_size, [unlabel], testing=False)
    test_it = BatchIterator(len(test), FLAGS.batch_size, [test], testing=False)
    # val_it = BatchIterator(len(val), FLAGS.batch_size, [val], testing=False)
    
    configproto = tf.ConfigProto()
    configproto.gpu_options.allow_growth = True
    configproto.allow_soft_placement = True
    with tf.Session(config=configproto) as sess:
        tf.global_variables_initializer().run()

        model = SemiTABSA(word2idx=word2idx, 
                target2idx=target2idx,
                embedding_dim=FLAGS.embedding_dim, 
                batch_size=FLAGS.batch_size, 
                n_hidden=FLAGS.n_hidden, 
                learning_rate=FLAGS.learning_rate, 
                n_class=FLAGS.n_class, 
                max_sentence_len=FLAGS.max_sentence_len, 
                l2_reg=FLAGS.l2_reg, 
                word_embedding=word_embedding,
                target_embedding=target_embedding,
                dim_z=FLAGS.dim_z,
                pri_prob_y=pri_prob_y,
                decoder_type=FLAGS.decoder_type,
                grad_clip=FLAGS.grad_clip,
                n_hidden_ae=FLAGS.n_hidden_ae,
                position_enc=FLAGS.position_enc,
                bidirection_enc=FLAGS.bidirection_enc,
                position_dec=FLAGS.position_dec,
                bidirection_dec=FLAGS.bidirection_dec,
                classifier_type=FLAGS.classifier_type,
                )

        model.run(sess, train_it, unlabel_it, test_it, FLAGS.n_iter, FLAGS.keep_rate, save_dir, FLAGS.batch_size, FLAGS.alpha, vars(FLAGS)['__flags'])
Example #6
def main(_):
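    # Variant of the semi-supervised driver: converts string-valued boolean
    # flags, picks the data directory from the training-file path (laptop vs.
    # restaurant), and encodes the main hyperparameters into `save_dir`.
    # Assumes the same module-level imports and definitions as above
    # (`pkl`, `tf`, `np`, `FLAGS`, `preprocess_data`, `SemiTABSA`).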
    #tf.reset_default_graph()
    #tf.Graph().as_default()
    tf.set_random_seed(1234)
    np.random.seed(1234)

    FLAGS = tf.app.flags.FLAGS
    FLAGS.bidirection_enc = (FLAGS.bidirection_enc == 'True')
    FLAGS.bidirection_dec = (FLAGS.bidirection_dec == 'True')
    FLAGS.sharefc = (FLAGS.sharefc == 'True')

    if 'lapt' in FLAGS.train_file_path:
        data_dir = 'data/lapt/tclstm'
        data_name = 'lapt'
    else:
        data_dir = 'data/rest/tclstm'
        data_name = 'rest'

    import datetime
    timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    save_dir = FLAGS.save_dir + '/' + str(timestamp) + '_' +  '_r' + str(FLAGS.learning_rate) + '_l2' + str(FLAGS.l2_reg)\
                + '_alpha' + str(FLAGS.alpha) + '_bs' + str(FLAGS.batch_size) + '_hidae' + str(FLAGS.n_hidden_ae)\
                + '_dz' + str(FLAGS.dim_z)  + '_dec' + str(FLAGS.decoder_type) + '_u' + str(FLAGS.n_unlabel)\
                + '_penc' + str(FLAGS.position_enc) + '_bienc' + str(FLAGS.bidirection_enc)\
                + '_pdec' + str(FLAGS.position_dec) + '_bidec' + str(FLAGS.bidirection_dec)\
                + '_sharefc' + str(FLAGS.sharefc)  + '_data' + str(data_name)\
                + '_trunctdis'
                #+ '_vochas10kunl_addunkpd_noapconcattindec_kl1e-4_noh_filteremb_trunctdis_sharefc_decfcdepb'
    #save_dir = 'tmp'

    from src.io.batch_iterator import BatchIterator
    train = pkl.load(open(FLAGS.train_file_path, 'rb'), encoding='latin')
    unlabel = pkl.load(open(FLAGS.unlabel_file_path, 'rb'), encoding='latin')[:FLAGS.n_unlabel]
    test = pkl.load(open(FLAGS.test_file_path, 'rb'), encoding='latin')

    def get_y(samples):
        y_dict = {'positive': [1,0,0], 'negative': [0, 1, 0], 'neutral': [0, 0, 1]}
        ys = [y_dict[sample['polarity']] for sample in samples]
        return ys

    y = get_y(train)
    pri_prob_y = (np.sum(y, axis=0)/len(y)).astype('float32')
    #pri_prob_y = np.ones(FLAGS.n_class).astype('float32')/FLAGS.n_class
    print(pri_prob_y)
    
    fns = [FLAGS.train_file_path, FLAGS.unlabel_file_path, FLAGS.test_file_path]

    word2idx, embedding = preprocess_data(fns, '../../../data/glove.6B/glove.6B.300d.txt', data_dir)
    train_it = BatchIterator(len(train), FLAGS.batch_size, [train], testing=False)
    unlabel_it = BatchIterator(len(unlabel), FLAGS.batch_size, [unlabel], testing=False)
    test_it = BatchIterator(len(test), FLAGS.batch_size, [test], testing=True)

    configproto = tf.ConfigProto()
    configproto.gpu_options.allow_growth = True
    configproto.allow_soft_placement = True
    with tf.Session(config=configproto) as sess:
        tf.global_variables_initializer().run()
        
        model = SemiTABSA(word2idx=word2idx, 
                embedding_dim=FLAGS.embedding_dim, 
                batch_size=FLAGS.batch_size, 
                n_hidden=FLAGS.n_hidden, 
                learning_rate=FLAGS.learning_rate, 
                n_class=FLAGS.n_class, 
                max_sentence_len=FLAGS.max_sentence_len, 
                l2_reg=FLAGS.l2_reg, 
                embedding=embedding,
                dim_z=FLAGS.dim_z,
                pri_prob_y=pri_prob_y,
                decoder_type=FLAGS.decoder_type,
                grad_clip=FLAGS.grad_clip,
                n_hidden_ae=FLAGS.n_hidden_ae,
                position_enc=FLAGS.position_enc,
                bidirection_enc=FLAGS.bidirection_enc,
                position_dec=FLAGS.position_dec,
                bidirection_dec=FLAGS.bidirection_dec,
                classifier_type=FLAGS.classifier_type,
                sharefc=FLAGS.sharefc,
                )

        model.run(sess, train_it, unlabel_it, test_it, FLAGS.n_iter, FLAGS.keep_rate, save_dir, FLAGS.batch_size, FLAGS.alpha, vars(FLAGS)['__flags'])