Example #1
def test_fun(var_file, pool_size, num_feature, stp, parse_type):

    # create results directory
    res_dir = '/'.join(var_file.split('/')[:-1]) + '/results/'
    if not os.path.isdir(res_dir):
        os.mkdir(res_dir)

    # initializing model
    nn, isize = init_model(var_file)
    isize = 50  # overrides the size returned by init_model; the MSRP vectors used here are 50-dimensional
    wvect = Word_vector(isize, vtype='msrp')
    # a.optimum.opt_var.g=g

    # getting data processed for testing purposes
    train, train_label, test, test_label = get_msrp_data(stp)

    # preprocessing for train and test set
    if os.path.isfile('/'.join(var_file.split('/')[:-1])+'/vectors.pickle'):
        wvect.load_vector('/'.join(var_file.split('/')[:-1])+'/vectors.pickle')
    data_processing = preprocess(parsing_type=parse_type, structure_type='h', stopword=stp, wvect=wvect)
    train_set, _ = data_processing.process_words_data(train)
    test_set, _ = data_processing.process_words_data(test)

    # generating fixed-size phrase vectors for train and test set
    otrain = generate_fixed_vector(nn, train_set, num_feature, pool_size)
    otest = generate_fixed_vector(nn, test_set, num_feature, pool_size)

    # classifier definition
    # clf = MLPClassifier(activation='logistic', solver='adam',alpha=0.0001,batch_size='auto',learning_rate='adaptive',max_iter=10000,tol=1e-5, verbose=0)
    clf = svm.LinearSVC(penalty='l2', loss='squared_hinge', dual=True,
                        tol=0.00001, C=1.0, multi_class='ovr',
                        fit_intercept=True, intercept_scaling=1,
                        class_weight=None, verbose=0, random_state=None,
                        max_iter=10000)
    clf.fit(otrain, train_label)
    score = clf.predict(otest)

    # nn classifier
    # train_label = [[0.0, 1.0] if i == 1 else [1.0, 0.0] for i in train_label]
    # i_size, o_size = len(otrain[0]), len(train_label[0])
    # clf = NN(i_size, 2*i_size, o_size, batch_size=50, epoch=200, neta=.0001, op_method='adam', errtol=0.0001)
    # clf.train(zip(otrain, train_label))
    # score = clf.pridect(otest)
    # score = [np.argmax(i) for i in score]

    # getting results
    tp, tn, fp, fn, acc, f1 = get_results(score, test_label)
    print '\npool size : %d,\tnumber feature : %d,\tstopword : %d\n\tTrue positive : %d\n\tTrue negative : %d\n\tFalse positive : %d\n\tFalse negative : %d\n\taccuracy : %f\n\tf1 score : %f\n' % (pool_size, num_feature, stp, tp, tn, fp, fn, acc, f1)

    # logging result in file
    open(res_dir + 'res.txt', 'a').write('\npool size : %d,\tnumber feature : %d,\tstopword : %d\n\tTrue positive : %d\n\tTrue negative : %d\n\tFalse positive : %d\n\tFalse negative : %d\n\taccuracy : %f\n\tf1 score : %f\n' % (pool_size, num_feature, stp, tp, tn, fp, fn, acc, f1))
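
A hypothetical invocation of test_fun; the weight-file path and parameter values below are placeholders chosen only to show the expected argument types, not values from the source.

# illustrative call: 'weights/model.pickle' is a placeholder path; stp=1
# enables stopword removal and parse_type matches the 'syn'/'chk' options
# used in the other examples
test_fun('weights/model.pickle', pool_size=5, num_feature=50, stp=1,
         parse_type='syn')
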
def main_train(args):

    print_setting(args)
    wfname, log = log_data(args)

    words_data_check = pickle.load(
        open("/media/zero/41FF48D81730BD9B/DT_RAE/data/pickle/110.pickle",
             'rb'))
    # words_data = pickle.load(open("/media/zero/41FF48D81730BD9B/DT_RAE/data/pickle/110.pickle", 'rb'))
    words_data = pickle.load(open('/media/zero/41FF48D81730BD9B/DT_RAE/data/pickle/msr_paraphrase_train.pickle', 'rb')) \
                 + pickle.load(open('/media/zero/41FF48D81730BD9B/DT_RAE/data/pickle/bbc.pickle', 'rb')) \
                 + pickle.load(open('/media/zero/41FF48D81730BD9B/DT_RAE/data/pickle/1all-news.pickle', 'rb'))
    # further corpora, currently disabled:
    # + pickle.load(open('/media/zero/41FF48D81730BD9B/DT_RAE/data/pickle/2all-news.pickle', 'rb'))
    # + pickle.load(open('/media/zero/41FF48D81730BD9B/DT_RAE/data/pickle/3all-news.pickle', 'rb'))
    # + pickle.load(open('/media/zero/41FF48D81730BD9B/DT_RAE/data/pickle/4all-news.pickle', 'rb'))
    # + pickle.load(open('/media/zero/41FF48D81730BD9B/DT_RAE/data/pickle/5all-news.pickle', 'rb'))

    wvect = Word_vector(args['v_size'])
    data_processing = preprocess(parsing_type=args['parse-type'],
                                 structure_type=args['type'],
                                 stopword=args['stp'],
                                 wvect=wvect)
    data, wpresent = data_processing.process_words_data(words_data)
    nn = create_model(wfname,
                      args['wload'],
                      args['model'],
                      args['method'],
                      args['v_size'],
                      args['h_size'],
                      neta=args['neta'],
                      wpresent=wpresent,
                      logg=log['rae'],
                      vector=wvect)
    nn.train(xs=data, epoch=args['epoch'], batch_size=args['batch_size'])
    nn.save_variables(wfname)
    print "model variables saved."
    data, wpresent = data_processing.process_words_data(words_data_check)
    nn.gradient_check(data, sorted(wpresent))
    # wvect.save_vector('/'.join(wfname.split('/')[:-1])+'/vectors.pickle')
    # print "vector variables saved."
    return
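
The args dictionary consumed by main_train is built elsewhere; below is a plausible shape inferred from the keys the function reads. Only the key names come from the code above, the values are illustrative assumptions.

# illustrative only: key names are taken from main_train, values are guesses
args = {
    'v_size': 50,         # word-vector size
    'h_size': 50,         # hidden-layer size
    'parse-type': 'syn',  # parser selection ('syn' or 'chk' in these examples)
    'type': 'h',          # structure type passed to preprocess
    'stp': 1,             # stopword-removal flag
    'wload': 0,           # whether to load previously saved weights
    'model': 'RAE',       # model identifier passed to create_model
    'method': 'rmsprop',  # optimization method
    'neta': 0.0001,       # learning rate
    'epoch': 100,
    'batch_size': 50,
}
main_train(args)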
Example #3
def test_fun(var_file, pool_size, num_feature, stp, parse_type):

    # create results directory
    clfFile = 'weights/classifier_' + str(num_feature) + str(stp) + '.pkl'
    res_dir = '/'.join(var_file.split('/')[:-1]) + '/results/'
    if not os.path.isdir(res_dir):
        os.mkdir(res_dir)

    # getting data processed for testing purposes (needed for prediction
    # and scoring even when a saved classifier is reused)
    train, train_label, test, test_label = get_msrp_data(stp)

    # initializing model
    isize, hsize, w, b, g = pickle.load(open(var_file, 'rb'))
    nn = stack_RAE(input_size=isize, hidden_size=hsize)
    nn.w = w
    nn.b = b
    # a.optimum.opt_var.g=g

    # preprocessing for train and test set
    wvect = Word_vector(isize)
    data_processing = preprocess(parsing_type=parse_type,
                                 structure_type='h',
                                 stopword=stp,
                                 wvect=wvect)
    train_set, _ = data_processing.process_words_data(train)
    test_set, _ = data_processing.process_words_data(test)

    # generating fixed-size phrase vectors for train and test set
    otrain = generate_fixed_vector(nn, train_set, num_feature, pool_size)
    otest = generate_fixed_vector(nn, test_set, num_feature, pool_size)

    if os.path.isfile(clfFile):
        clf = pickle.load(open(clfFile, 'rb'))
    else:
        # classifier definition
        # clf = MLPClassifier(activation='logistic', solver='adam', alpha=0.0001, batch_size='auto', learning_rate='adaptive', max_iter=10000, tol=1e-5, verbose=0)
        # clf = svm.LinearSVC(penalty='l1', tol=0.001, C=1.0, loss='hinge', fit_intercept=True, intercept_scaling=1.0, dual=True, verbose=0, random_state=7, max_iter=100000)
        clf = svm.LinearSVC(penalty='l2',
                            loss='squared_hinge',
                            dual=True,
                            tol=0.00001,
                            C=1.0,
                            multi_class='ovr',
                            fit_intercept=True,
                            intercept_scaling=1,
                            class_weight=None,
                            verbose=0,
                            random_state=None,
                            max_iter=1000)

        # performing classifier training
        clf.fit(otrain, train_label)
        pickle.dump(clf, open(clfFile, 'wb'))

    # performing prediction
    score = clf.predict(otest)

    # getting results
    tp, tn, fp, fn, acc, f1 = get_results(score, test_label)
    print acc, f1

    # logging result in file
    print '\npool size : %d,\tnumber feature : %d,\tstopword : %d\n\tTrue positive : %d\n\tTrue negative : %d\n\tFalse positive : %d\n\tFalse negative : %d\n\taccuracy : %f\n\tf1 score : %f\n' % (
        pool_size, num_feature, stp, tp, tn, fp, fn, acc, f1)
    open(res_dir + 'res.txt', 'a').write(
        '\npool size : %d,\tnumber feature : %d,\tstopword : %d\n\tTrue positive : %d\n\tTrue negative : %d\n\tFalse positive : %d\n\tFalse negative : %d\n\taccuracy : %f\n\tf1 score : %f\n'
        % (pool_size, num_feature, stp, tp, tn, fp, fn, acc, f1))
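
get_results is not shown in these examples; below is a minimal sketch of what it plausibly computes, assuming binary labels with 1 as the positive (paraphrase) class and the standard confusion-matrix definitions of accuracy and F1.

# hypothetical reimplementation of the unseen helper: counts the confusion
# matrix over predicted vs. gold labels and derives accuracy and F1 from it
def get_results(predicted, gold):
    tp = tn = fp = fn = 0
    for p, g in zip(predicted, gold):
        if p == 1 and g == 1:
            tp += 1
        elif p == 0 and g == 0:
            tn += 1
        elif p == 1 and g == 0:
            fp += 1
        else:
            fn += 1
    acc = float(tp + tn) / max(len(gold), 1)
    precision = float(tp) / max(tp + fp, 1)
    recall = float(tp) / max(tp + fn, 1)
    f1 = 2 * precision * recall / max(precision + recall, 1e-12)
    return tp, tn, fp, fn, acc, f1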
Example #4
from util.Preprocessing import preprocess
import pickle
word_data = pickle.load(
    open(
        '/media/zero/41FF48D81730BD9B/DT_RAE/data/pickle/msr_paraphrase_train.pickle',
        'rb'))
data_processing = preprocess(parsing_type='syn',
                             structure_type='nh',
                             stopword=1)
data1, _ = data_processing.process_words_data(word_data)

data_processing = preprocess(parsing_type='chk',
                             structure_type='h',
                             stopword=0)
data2, _ = data_processing.process_words_data(word_data)

for i in range(len(data1)):
    # copy the token list so composed phrases can be appended to it
    words = list(data1[i]['words'])
    for j in data1[i]['h_vect']:
        print [words[k] for k in j],
        # append the composed phrase so later h_vect entries can index it
        words.append(' '.join([words[k] for k in j]))
    print
    words = data2[i]['words']
    for j in data2[i]['h_vect']:
        print [words[k] for k in j],
    print
    raw_input()
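
The loop above assumes each processed record pairs a flat token list with index groups describing the composition order; an assumed record layout, for illustration only:

# assumed structure of a process_words_data record: 'words' holds the tokens
# and each h_vect entry lists the child indices merged at that step, with the
# result appended at the next free index
record = {
    'words': ['the', 'cat', 'sat'],
    'h_vect': [[0, 1], [3, 2]],  # compose 'the cat' (new index 3), then 'the cat sat'
}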
Example #5
def msrp_train_test(var_file, stp, parse_type):

    # create results directory
    res_dir = '/'.join(var_file.split('/')[:-1]) + '/results/'
    log_file = '/'.join(var_file.split('/')[:-1]) + '/msrp_train_log.txt'
    if not os.path.isdir(res_dir):
        os.mkdir(res_dir)

    # getting data processed for testing purposes
    train, train_label, test, test_label = get_msrp_data(stp)

    # initializing model
    isize = 50
    osize = 2
    wvect = Word_vector(isize, vtype='msrp')
    # a.optimum.opt_var.g=g

    # preprocessing for train and test set
    data_processing = preprocess(parsing_type=parse_type,
                                 structure_type='h',
                                 stopword=stp,
                                 wvect=wvect)
    train_set, _ = data_processing.process_words_data(train)
    test_set, _ = data_processing.process_words_data(test)
    train_label = [
        np.array([[0.0], [1.0]]) if i == 1 else np.array([[1.0], [0.0]])
        for i in train_label
    ]
    train = [[train_set[i], train_set[i + 1]]
             for i in range(0, len(train_set), 2)]
    test = [[test_set[i], test_set[i + 1]] for i in range(0, len(test_set), 2)]

    # model layers initialization
    isize, hsize, w, b, g = pickle.load(open(var_file, 'rb'))
    opt_var = Optimization_variable(method='rmsprop',
                                    isize=isize,
                                    osize=hsize,
                                    model_type='RAE',
                                    option={'wpresent': []})
    opt = Optimization(optimization_variable=opt_var, learning_rate=0.0001)
    rae_layer = stack_RAE(input_size=isize,
                          hidden_size=hsize,
                          optimization=opt,
                          hidden_activation=tanh,
                          hidden_deactivation=dtanh)
    rae_layer.w = w
    rae_layer.b = b
    rae_layer.optimum.opt_var.g = g
    rae_layer.init_dw(w, b)
    nn_layer = NN(2 * hsize, osize)
    nn_layer.optimum = Optimization(
        optimization_variable=Optimization_variable(method='rmsprop',
                                                    isize=2 * hsize,
                                                    osize=osize),
        learning_rate=0.0001)
    logg = logger('text', log_file)

    # training of model
    epoch = 10000
    batch_size = 50
    for ep in range(epoch):
        batches = mini_batch(zip(train, train_label), len(train_set),
                             batch_size)
        cost = 0.0
        ecount = 0
        for batch in range(len(batches)):
            for data in batches[batch]:
                # forward pass of RAE
                vect1, v1 = rae_layer.encoding(data[0][0])
                vect2, v2 = rae_layer.encoding(data[0][1])
                data[0][0]['vects'] = vect1
                data[0][1]['vects'] = vect2

                # forward pass of NN
                o, vnn = nn_layer.forward_pass(
                    np.concatenate(
                        (vect1[len(vect1) - 1], vect2[len(vect2) - 1]),
                        axis=0))

                # cost calculation
                cost += ce_erro(o, data[1])
                ecount += 1
                grad = o - data[1]

                # backward pass of NN
                nngrad = nn_layer.backward_pass(grad, vnn)

                # backward pass of RAE
                nngrad1, nngrad2 = np.split(nngrad, [hsize], axis=0)
                grad1 = rae_layer.encoding_back(nngrad1, v1, data[0][0])
                grad2 = rae_layer.encoding_back(nngrad2, v2, data[0][1])

                # calculating weight update
                nn_layer.calc_dw(
                    grad,
                    np.concatenate(
                        (vect1[len(vect1) - 1], vect2[len(vect2) - 1]),
                        axis=0))
                rae_layer.calc_dw(grad1, data[0][0])
                rae_layer.calc_dw(grad2, data[0][1])

            # updating weights
            nn_layer.update_weights()
            rae_layer.update_weights()
        if (ep + 1) % 50 == 0:
            score = msrp_test([rae_layer, nn_layer], test)
            tp, tn, fp, fn, acc, f1 = get_results(score, test_label)
            print 'stopword : %d, Tp : %d, Tn : %d, Fp : %d, Fn : %d, acc : %f, f1 score : %f' % (
                stp, tp, tn, fp, fn, acc, f1)
            logg.log_text(
                'stopword : %d, Tp : %d, Tn : %d, Fp : %d Fn : %d, acc : %f, f1 score : %f'
                % (stp, tp, tn, fp, fn, acc, f1))
            pickle.dump([rae_layer.w, rae_layer.b, nn_layer.w],
                        open(
                            '/'.join(var_file.split('/')[:-1]) +
                            '/results/weights' + str(ep) + '.pickle', 'wb'))

        print "%d/%d epoch completed .... error : %f" % (ep + 1, epoch,
                                                         cost / ecount)
        logg.log_text("%d/%d epoch completed ....\n" % (ep + 1, epoch))
        if cost / ecount < 0.01:
            break

    # getting results
    score = msrp_test([rae_layer, nn_layer], test)
    tp, tn, fp, fn, acc, f1 = get_results(score, test_label)
    print '\nstopword : %d\n\tTrue positive : %d\n\tTrue negative : %d\n\tFalse positive : %d\n\tFalse negative : %d\n\taccuracy : %f\n\tf1 score : %f\n' % (
        stp, tp, tn, fp, fn, acc, f1)

    # logging result in file
    open(res_dir + 'res.txt', 'a').write(
        '\nstopword : %d\n\tTrue positive : %d\n\tTrue negative : %d\n\tFalse positive : %d\n\tFalse negative : %d\n\taccuracy : %f\n\tf1 score : %f\n'
        % (stp, tp, tn, fp, fn, acc, f1))
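
ce_erro and mini_batch are helpers not shown in this listing; below are minimal sketches of what the training loop appears to assume. The names come from the code above, but the bodies are inferred, not confirmed.

import numpy as np

# assumed cross-entropy error of a softmax column vector `o` against a
# one-hot target `t`; consistent with the gradient grad = o - data[1] above
def ce_erro(o, t):
    return float(-np.sum(t * np.log(o + 1e-12)))

# assumed batching helper: shuffles the zipped (example, label) pairs and
# slices them into consecutive mini-batches; `n` is accepted to match the
# call site above but ignored in this sketch
def mini_batch(pairs, n, batch_size):
    pairs = list(pairs)
    np.random.shuffle(pairs)
    return [pairs[i:i + batch_size] for i in range(0, len(pairs), batch_size)]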