Example #1
def validation(validate_model, testList, vocab, batch_size, words_num_dim):
    index, score_list = 0, []
    while True:
        x1, x2, x3, m1, m2, m3 = load_test_data(testList, vocab, index, batch_size, words_num_dim)
        batch_scores, nouse = validate_model(x1, x2, x3, m1, m2, m3)
        for score in batch_scores:
            score_list.append(score)
        index += batch_size
        if index >= len(testList):
            break
        #log('Evaluation ' + str(index), logfile_path)
        print('Evaluation... ' + str(index))
    sdict, index = {}, 0
    qa_count = 0
    for items in testList:
        qid = items[1].split(':')[1]
        if qid not in sdict:
            sdict[qid] = []
        sdict[qid].append((score_list[index], items[0]))
        index += 1
        if int(qid) > qa_count:
            qa_count = int(qid)
    qa_count += 1
    top1 = 0.0
    map_sum = 0.0
    mrr_sum = 0.0
    for qid, items in sdict.items():
        items.sort(key=operator.itemgetter(0), reverse=True)
        #for top1
        score, flag = items[0]
        if flag == '1':
            top1 += 1
        #for mrr
        mrr_index = 0
        for score, flag in items:
            #log('[debug]' + ' qid=' + str(qid) + ' score=' + str(score) + ' flag=' + str(flag), logfile_path)
            mrr_index += 1
            if flag == '1':
                mrr_sum += float(1) / float(mrr_index)

        #for map
        map_index_down = 0
        map_index_up = 0
        temp_map_sum = 0
        for score, flag in items:
            #log('[debug]' + ' qid=' + str(qid) + ' score=' + str(score) + ' flag=' + str(flag))
            map_index_down += 1
            if flag == '1':
                map_index_up += 1
                temp_map_sum += float(map_index_up) / float(map_index_down)
        if map_index_up > 0:  # guard: a question may have no relevant answer
            temp_map_sum /= float(map_index_up)
        map_sum += temp_map_sum
        log('qid = ' + str(qid) + ' / top1 count = ' + str(top1), logfile_path)
    top1 /= float(qa_count)
    mrr_sum /= float(qa_count)
    map_sum /= float(qa_count)
    log('top-1 = ' + str(top1) + ' / ' + 'mrr = ' + str(mrr_sum) + ' / ' + 'map = ' + str(map_sum), logfile_path)
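
Note: the MRR loop above adds 1/rank for every relevant answer, whereas textbook MRR counts only the first relevant answer per question, and the final averages divide by qa_count = max(qid) + 1, which assumes question ids are contiguous from 0 (len(sdict) is safer if they are sparse). For reference, a minimal self-contained sketch of the textbook metrics (hypothetical helper names), over a list of '0'/'1' flags already sorted by descending score:

def reciprocal_rank(ranked_flags):
    # 1/rank of the first relevant item; 0.0 if none is relevant
    for rank, flag in enumerate(ranked_flags, start=1):
        if flag == '1':
            return 1.0 / rank
    return 0.0

def average_precision(ranked_flags):
    # mean of precision@rank evaluated at each relevant item
    hits, precision_sum = 0, 0.0
    for rank, flag in enumerate(ranked_flags, start=1):
        if flag == '1':
            hits += 1
            precision_sum += float(hits) / rank
    return precision_sum / hits if hits else 0.0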
Example #2
def train():
    global logfile_path
    global train1file
    global train0file
    global test1file
    global idf_file_path

    batch_size = 256
    filter_sizes = [1, 2, 3]
    num_filters = 1000
    words_num_dim = 50
    #normal embedding size
    embedding_size = 300
    #new embedding size with idf
    #embedding_size = 301
    learning_rate = 0.001
    n_epochs = 20000
    validation_freq = 50
    keep_prob_value = 0.7
    margin_size = 0.05

    logfile_path = os.path.join(logfile_path, 'CNN-' + GetNowTime() + '-log.txt')

    log("New start ...", logfile_path)
    log(str(time.asctime(time.localtime(time.time()))), logfile_path)
    log("batch_size = " + str(batch_size), logfile_path)
    log("filter_sizes = " + str(filter_sizes), logfile_path)
    log("num_filters = " + str(num_filters), logfile_path)
    log("embedding_size = " + str(embedding_size), logfile_path)
    log("learning_rate = " + str(learning_rate), logfile_path)
    log("n_epochs = " + str(n_epochs), logfile_path)
    log("margin_size = " + str(margin_size), logfile_path)
    log("words_num_dim = " + str(words_num_dim), logfile_path)
    log("validation_freq = " + str(validation_freq), logfile_path)
    log("keep_prob_value = " + str(keep_prob_value), logfile_path)
    log("train_1_file = " + str(train1file.split('/')[-1]), logfile_path)
    log("train_0_file = " + str(train0file.split('/')[-1]), logfile_path)
    log("test_file = " + str(test1file.split('/')[-1]), logfile_path)
    log("vector_file = " + str(vectorsfile.split('/')[-1]), logfile_path)
    log("idf_file_path = " + str(idf_file_path.split('/')[-1]), logfile_path)
    log("lda_train_file_path = " + str(lda_train_file_path.split('/')[-1]),
        logfile_path)
    log("lda_test_file_path = " + str(lda_test_file_path.split('/')[-1]),
        logfile_path)

    vocab = build_vocab()
    #word_embeddings is a list, shape = numOfWords * embedding_size
    #for normal embeddings
    word_embeddings = load_word_embeddings(vocab, embedding_size)
    #for new embeddings with idf features
    #word_embeddings = load_word_embeddings_with_idf(vocab, embedding_size, idf_file_path)
    trainList = load_train1_list()
    testList = load_test_list()
    train0Dict = load_train0_dict()
    #train_x1.shape = batch_size * words_num_dim
    #train_x1, train_x2, train_x3 = load_train_data(trainList, vocab, batch_size, words_num_dim)
    train_x1, train_x2, train_x3 = load_train_data_from_2files(
        train0Dict, trainList, vocab, batch_size, words_num_dim)

    x1, x2, x3 = T.matrix('x1'), T.matrix('x2'), T.matrix('x3')
    keep_prob = T.fscalar('keep_prob')
    model = QACnn(input1=x1,
                  input2=x2,
                  input3=x3,
                  keep_prob=keep_prob,
                  word_embeddings=word_embeddings,
                  batch_size=batch_size,
                  sequence_len=train_x1.shape[1],
                  embedding_size=embedding_size,
                  filter_sizes=filter_sizes,
                  num_filters=num_filters,
                  margin_size=margin_size)
    dbg_x1 = model.dbg_x1
    dbg_outputs_1 = model.dbg_outputs_1

    cost, cos12, cos13 = model.cost, model.cos12, model.cos13
    params, accuracy = model.params, model.accuracy
    grads = T.grad(cost, params)

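    # Plain SGD: each parameter steps against its gradient, scaled by learning_rate.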
    updates = [(param_i, param_i - learning_rate * grad_i)
               for param_i, grad_i in zip(params, grads)]

    p1, p2, p3 = T.matrix('p1'), T.matrix('p2'), T.matrix('p3')
    prob = T.fscalar('prob')
    train_model = theano.function([p1, p2, p3, prob],
                                  [cost, accuracy, dbg_x1, dbg_outputs_1],
                                  updates=updates,
                                  givens={
                                      x1: p1,
                                      x2: p2,
                                      x3: p3,
                                      keep_prob: prob
                                  })

    v1, v2, v3 = T.matrix('v1'), T.matrix('v2'), T.matrix('v3')
    validate_model = theano.function(
        inputs=[v1, v2, v3, prob],
        outputs=[cos12, cos13],
        #updates=updates,
        givens={
            x1: v1,
            x2: v2,
            x3: v3,
            keep_prob: prob
        })
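
    # `givens` rebinds the symbolic graph inputs (x1/x2/x3/keep_prob) to each
    # compiled function's own input variables; validate_model reuses the same
    # graph and disables dropout by passing prob = 1.0 at call time, as in the
    # validation() examples.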

    epoch = 0
    done_looping = False
    while (epoch < n_epochs) and (not done_looping):
        epoch += 1
        #train_x1, train_x2, train_x3 = load_train_data(trainList, vocab, batch_size)
        train_x1, train_x2, train_x3 = load_train_data_from_2files(
            train0Dict, trainList, vocab, batch_size, words_num_dim)
        #print train_x3.shape
        cost_ij, acc, dbg_x1, dbg_outputs_1 = train_model(
            train_x1, train_x2, train_x3, keep_prob_value)
        log(
            'load data done ...... epoch:' + str(epoch) + ' cost:' +
            str(cost_ij) + ', acc:' + str(acc), logfile_path)
        if epoch % validation_freq == 0:
            log('Evaluation ......', logfile_path)
            validation(validate_model, testList, vocab, batch_size,
                       words_num_dim)
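
QACnn itself is not shown in these examples, but the cos12/cos13 outputs together with margin_size suggest the standard pairwise hinge loss over cosine similarities. A hedged Theano sketch of such a loss, assuming cos12 and cos13 are batch vectors of cos(question, positive answer) and cos(question, negative answer):

import theano.tensor as T

def pairwise_hinge_loss(cos12, cos13, margin=0.05):
    # loss_i = max(0, margin - cos(q_i, a+_i) + cos(q_i, a-_i))
    losses = T.maximum(0.0, margin - cos12 + cos13)
    # fraction of triples already ranked correctly by the full margin
    accuracy = T.mean(T.eq(losses, 0.0))
    return T.mean(losses), accuracy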
Example #3
def validation(validate_model, testList, vocab, batch_size, words_num_dim):
    index, score_list = 0, []
    while True:
        x1, x2, x3 = load_test_data(testList, vocab, index, batch_size,
                                    words_num_dim)
        batch_scores, nouse = validate_model(x1, x2, x3, 1.0)
        for score in batch_scores:
            if len(score_list) < len(testList):
                score_list.append(score)
            else:
                break
        index += batch_size
        #log('Evaluation ' + str(index), logfile_path)
        print('Evaluation... ' + str(index))
        if index >= len(testList):
            break
    sdict, index = {}, 0
    qa_count = 0
    for items in testList:
        qid = items[1].split(':')[1]
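        # NOTE: str.strip('_<a>') removes any of the characters '_', '<',
        # 'a', '>' from both ends, not the literal substring '_<a>'.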
        question = items[2].strip('_<a>')
        answer = items[3].strip('_<a>')
        if qid not in sdict:
            sdict[qid] = []
        sdict[qid].append((score_list[index], items[0], question, answer))
        index += 1
        if int(qid) > qa_count:
            qa_count = int(qid)
    qa_count += 1
    top1 = 0.0
    map_sum = 0.0
    mrr_sum = 0.0
    #qid_count = 0

    for qid, items in sdict.items():
        items.sort(key=operator.itemgetter(0), reverse=True)
        #just for analysis
        '''
        global logfile_path
        analysis_log_file_path = logfile_path + '.analysis'
        for score, flag, question, answer in items:
            log('[' + str(qid) + ']' + question, analysis_log_file_path)
            log('[Predicted][' + '1:' + str(len(items)) + '] '
                + answer
                , analysis_log_file_path)
            break
        expected_answer_index = 0
        expected_answer_flag = False
        for score, flag, question, answer in items:
            expected_answer_index += 1
            if flag == '1':
                log('[Expected][' + str(expected_answer_index) + ':' + str(len(items)) + '] '
                    + answer
                    , analysis_log_file_path)
                expected_answer_flag = True
                break
        if expected_answer_flag == False:
            log('[Expected][' + str(qid) + '/' + flag + '/' + 'Not Exist!'
                    , analysis_log_file_path)
        log('', analysis_log_file_path)
        '''
        #for top1
        score, flag, question, answer = items[0]
        if flag == '1':
            top1 += 1
        #for mrr
        mrr_index = 0
        for score, flag, question, answer in items:
            mrr_index += 1
            if flag == '1':
                mrr_sum += float(1) / float(mrr_index)
        #for map
        map_index_down = 0
        map_index_up = 0
        temp_map_sum = 0
        for score, flag, question, answer in items:
            #log('[debug]' + ' qid=' + str(qid) + ' score=' + str(score) + ' flag=' + str(flag))
            map_index_down += 1
            if flag == '1':
                map_index_up += 1
                temp_map_sum += float(map_index_up) / float(map_index_down)
        if map_index_up > 0:  # guard: a question may have no relevant answer
            temp_map_sum /= float(map_index_up)
        map_sum += temp_map_sum
        #log('qid = ' + str(qid) + ' / top1 count = ' + str(top1), logfile_path)
    top1 /= float(qa_count)
    mrr_sum /= float(qa_count)
    map_sum /= float(qa_count)
    log(
        'top-1 = ' + str(top1) + ' / ' + 'mrr = ' + str(mrr_sum) + ' / ' +
        'map = ' + str(map_sum), logfile_path)
Example #4
def validation(validate_model, testList, vocab, batch_size, words_num_dim):
    index, score_list = 0, []
    while True:
        x1, x2, x3 = load_test_data(testList, vocab, index, batch_size,
                                    words_num_dim)
        batch_scores, nouse = validate_model(x1, x2, x3, 1.0)
        for score in batch_scores:
            if len(score_list) < len(testList):
                score_list.append(score)
            else:
                break
        index += batch_size
        #log('Evaluation ' + str(index), logfile_path)
        print('Evaluation... ' + str(index))
        if index >= len(testList):
            break
    sdict, index = {}, 0
    qa_count = 0
    for items in testList:
        qid = items[1].split(':')[1]
        if qid not in sdict:
            sdict[qid] = []
        sdict[qid].append((score_list[index], items[0]))
        index += 1
        if int(qid) > qa_count:
            qa_count = int(qid)
    qa_count += 1

    dcg3 = 0.0
    dcg5 = 0.0
    dcg_all = 0.0
    idcg3 = 0.0
    idcg5 = 0.0
    idcg_all = 0.0
    #top_1 = float(0)
    #top_3 = float(0)

    for qid, items in sdict.items():
        items.sort(key=operator.itemgetter(0), reverse=True)
        label_list = []
        index = 0
        top_3_flag = False
        for score, label in items:
            label_list.append(int(label))
            index += 1
            #if index <= 1 and str(label) == '2':
            #    top_1 += 1
            #if index <= 3 and str(label) == '2' and top_3_flag == False:
            #    top_3 += 1
            #    top_3_flag = True
        top_k_3 = min(3, len(label_list))
        top_k_5 = min(5, len(label_list))
        top_k_all = len(label_list)
        dcg3 += DCG(label_list, top_k_3)
        dcg5 += DCG(label_list, top_k_5)
        dcg_all += DCG(label_list, top_k_all)
        #idcg
        temp = sorted(label_list, reverse=True)
        idcg3 += DCG(temp, top_k_3)
        idcg5 += DCG(temp, top_k_5)
        idcg_all += DCG(temp, top_k_all)

        log(
            'qid = ' + str(qid) + ' / dcg3 = ' +
            str(DCG(label_list, top_k_3)) + ' / idcg3 = ' + str(idcg3),
            logfile_path)
        log(
            'qid = ' + str(qid) + ' / dcg5 = ' +
            str(DCG(label_list, top_k_5)) + ' / idcg5 = ' + str(idcg5),
            logfile_path)
        log(
            'qid = ' + str(qid) + ' / dcg_all = ' +
            str(DCG(label_list, top_k_all)) + ' / idcg_all = ' + str(idcg_all),
            logfile_path)
    log(
        'ALL-DCG: all-dcg3 = ' + str(dcg3) + ' / ' + 'all-dcg5 = ' +
        str(dcg5) + ' / ' + 'all-dcg = ' + str(dcg_all), logfile_path)
    log(
        'ALL-iDCG: all-idcg3 = ' + str(idcg3) + ' / ' + 'all-idcg5 = ' +
        str(idcg5) + ' / ' + 'all-idcg = ' + str(idcg_all), logfile_path)
    log(
        'AVG-DCG3 = ' + str(dcg3 / idcg3) + ' / AVG-DCG5 = ' +
        str(dcg5 / idcg5) + ' / AVG-DCG = ' + str(dcg_all / idcg_all),
        logfile_path)
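
DCG() is called above but not defined in these examples, and the dcg/idcg ratios logged as AVG-DCG at the end are effectively NDCG values. One common definition that is consistent with the (label_list, top_k) call signature and the graded labels (e.g. '2') in the commented-out code, offered here as an assumption rather than the original implementation:

import math

def DCG(label_list, top_k):
    # DCG@k = sum_{i=1..k} (2^rel_i - 1) / log2(i + 1)
    dcg = 0.0
    for i, rel in enumerate(label_list[:top_k], start=1):
        dcg += (2 ** rel - 1) / math.log(i + 1, 2)
    return dcg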
Example #5
def train():
    global logfile_path
    global trainfile
    global train0file
    global test1file

    batch_size = 256
    embedding_size = 300
    learning_rate = 0.005
    n_epochs = 20000
    words_num_dim = 1200
    validation_freq = 10
    filter_sizes = [1, 2, 3, 5]
    num_filters = 500
    margin_size = 0.05

    logfile_path = os.path.join(logfile_path, 'LSTM-' + GetNowTime() + '-' \
                   + 'batch_size-' + str(batch_size) + '-' \
                   + 'num_filters-' + str(num_filters) + '-' \
                   + 'embedding_size-' + str(embedding_size) + '-' \
                   + 'n_epochs-' + str(n_epochs) + '-' \
                   + 'freq-' + str(validation_freq) + '-' \
                   + '-log.txt')

    log("New start ...", logfile_path)
    log(str(time.asctime(time.localtime(time.time()))), logfile_path)
    log("batch_size = " + str(batch_size), logfile_path)
    log("filter_sizes = " + str(filter_sizes), logfile_path)
    log("num_filters = " + str(num_filters), logfile_path)
    log("embedding_size = " + str(embedding_size), logfile_path)
    log("learning_rate = " + str(learning_rate), logfile_path)
    log("words_num_dim = " + str(words_num_dim), logfile_path)
    log("n_epochs = " + str(n_epochs), logfile_path)
    log("margin_size = " + str(margin_size), logfile_path)
    log("validation_freq = " + str(validation_freq), logfile_path)
    log("train_1_file = " + str(trainfile.split('/')[-1]), logfile_path)
    log("train_0_file = " + str(train0file.split('/')[-1]), logfile_path)
    log("test_file = " + str(test1file.split('/')[-1]), logfile_path)
    log("vector_file = " + str(vectorsfile.split('/')[-1]), logfile_path)

    vocab = build_vocab()
    word_embeddings = load_word_embeddings(vocab, embedding_size)
    trainList = load_train_list()
    testList = load_test_list()
    train0Dict = load_train0_dict()
    train_x1, train_x2, train_x3, mask1, mask2, mask3 = load_train_data_from_2files(
        train0Dict, trainList, vocab, batch_size, words_num_dim)
    x1, x2, x3 = T.fmatrix('x1'), T.fmatrix('x2'), T.fmatrix('x3')
    m1, m2, m3 = T.fmatrix('m1'), T.fmatrix('m2'), T.fmatrix('m3')
    model = LSTM(
        input1=x1, input2=x2, input3=x3,
        mask1=m1, mask2=m2, mask3=m3,
        word_embeddings=word_embeddings,
        batch_size=batch_size,
        sequence_len=train_x1.shape[0],  # rows are time steps (time-major layout)
        embedding_size=embedding_size,
        filter_sizes=filter_sizes,
        num_filters=num_filters,
        margin_size=margin_size)

    cost, cos12, cos13 = model.cost, model.cos12, model.cos13
    params, accuracy = model.params, model.accuracy
    grads = T.grad(cost, params)
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]

    p1, p2, p3 = T.fmatrix('p1'), T.fmatrix('p2'), T.fmatrix('p3')
    q1, q2, q3 = T.fmatrix('q1'), T.fmatrix('q2'), T.fmatrix('q3')
    train_model = theano.function(
        [p1, p2, p3, q1, q2, q3], 
        [cost, accuracy], 
        updates=updates,
        givens={
            x1: p1, x2: p2, x3: p3, m1: q1, m2: q2, m3: q3
        }
    )

    # fmatrix (float32) matches the declared types of x1..x3 / m1..m3 in `givens`
    v1, v2, v3 = T.fmatrix('v1'), T.fmatrix('v2'), T.fmatrix('v3')
    u1, u2, u3 = T.fmatrix('u1'), T.fmatrix('u2'), T.fmatrix('u3')
    validate_model = theano.function(
        inputs=[v1, v2, v3, u1, u2, u3],
        outputs=[cos12, cos13],
        #updates=updates,
        givens={
            x1: v1, x2: v2, x3: v3, m1: u1, m2: u2, m3: u3
        }
    )

    epoch = 0
    done_looping = False
    while (epoch < n_epochs) and (not done_looping):
        epoch += 1
        train_x1, train_x2, train_x3, mask1, mask2, mask3 = load_train_data_from_2files(
            train0Dict, trainList, vocab, batch_size, words_num_dim)
        #print('train_x1, train_x2, train_x3')
        #print(train_x1.shape, train_x2.shape, train_x3.shape)
        cost_ij, acc = train_model(train_x1, train_x2, train_x3, mask1, mask2, mask3)
        log('load data done ...... epoch:' + str(epoch) + ' cost:' + str(cost_ij) + ', acc:' + str(acc), logfile_path)
        if epoch % validation_freq == 0:
            log('Evaluation ......', logfile_path)
            validation(validate_model, testList, vocab, batch_size, words_num_dim)
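
load_train_data_from_2files() is not shown here. Given the time-major layout (sequence_len = train_x1.shape[0]) and the fmatrix masks, the masks are presumably 1.0 where a real token is present and 0.0 over padding. A minimal sketch of padding and mask construction under that assumption, with hypothetical names:

import numpy as np

def pad_and_mask(token_id_seqs, max_len):
    # Build time-major (max_len x batch) id and mask matrices:
    # mask[t, b] = 1.0 while sequence b still has a token at step t, else 0.0.
    batch = len(token_id_seqs)
    ids = np.zeros((max_len, batch), dtype='float32')
    mask = np.zeros((max_len, batch), dtype='float32')
    for b, seq in enumerate(token_id_seqs):
        for t, token_id in enumerate(seq[:max_len]):
            ids[t, b] = token_id
            mask[t, b] = 1.0
    return ids, mask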