Example #1
    return auc


if __name__ == "__main__":
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(f"Running on {device}.")

    goal = "classification"
    # goal = "regression"

    dataset_path = os.path.join('dataset_vad')
    target_df_path = os.path.join('targets.csv')
    filenames = os.listdir(dataset_path)
    splits_path = "splits"
    splits_name = "5_2021_02_25_23_05"
    _, test_files = load_data.load_train_test(filenames, splits_path,
                                              splits_name)
    dataset = AggregatorDataset(test_files,
                                dataset_path,
                                target_df_path,
                                goal,
                                sample_duration=1)

    models_path = os.path.join('models', 'resnet50')
    model = models.ResNet50(pretrained=True, goal=goal)
    model_name = "2_2021_02_25_23_39"
    model.load_state_dict(torch.load(os.path.join(models_path, model_name)))
    print("Loaded model", model_name)

    ## Aggregate: full evaluation script
    evaluate(dataset, model, goal=goal, display=True)
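The snippet above is truncated: only the tail of the `evaluate()` helper (`return auc`) survives, together with the call at the bottom. Purely as a hedged sketch (not the original implementation), such a helper might look like the following, assuming the dataset yields `(input, label)` pairs and that the classification goal is scored with scikit-learn's ROC AUC.

import torch
from torch.utils.data import DataLoader
from sklearn.metrics import roc_auc_score


def evaluate(dataset, model, goal="classification", display=False, batch_size=32):
    # Run the model over the dataset and score the classification goal with ROC AUC.
    device = next(model.parameters()).device
    model.eval()
    loader = DataLoader(dataset, batch_size=batch_size)
    scores, labels = [], []
    with torch.no_grad():
        for x, y in loader:
            logits = model(x.to(device))
            scores.append(torch.sigmoid(logits).squeeze(-1).cpu())
            labels.append(y)
    auc = roc_auc_score(torch.cat(labels).numpy(), torch.cat(scores).numpy())
    if display:
        print(f"AUC: {auc:.4f}")
    return auc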
Example #2
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import MultinomialNB

from load_data import load_train_test


def fit_naivebayes(X_train, y_train, X_test, y_test, batch=10000):
    clf = MultinomialNB()
    # Train the model incrementally on mini-batches
    for i in range(0, X_train.shape[0], batch):
        clf.partial_fit(X_train[i:i + batch, :], y_train[i:i + batch], classes=[0, 1])
    yy_test = clf.predict(X_test)
    print('Accuracy:{}'.format(accuracy_score(y_test, yy_test)))
    return clf

if __name__ == "__main__":
    print("Loading data")
    train_X, test_X, train_y, test_y = load_train_test()
    print("Extracting features(TFDF)")
    # Get Tf-idf object: Feature extracting

    # tfdf = TfidfVectorizer(analyzer="word", ngram_range=(1, 3)) # bigram features, need more memory
    tfdf = TfidfVectorizer(analyzer="word")
    tfdf.fit(train_X)  # fit or traing data
    X_train_dtm = tfdf.transform(train_X)  # transform our training data
    X_test_dtm = tfdf.transform(test_X)  # transform our testing data
    train_X, test_X = None, None # free memory

    print("Training logistic regression")
    current_clf = fit_naivebayes(X_train_dtm, train_y, X_test_dtm, test_y)

    print("Saving Arctifacts")
    joblib.dump(dict(clf=current_clf, tfdf=tfdf), open("./cache/predict_param.pickle", "wb"))
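A short, hypothetical usage sketch for the saved artifacts: it reloads the pickled dict written above and reuses the fitted vectorizer before predicting (the input document below is made up).

import joblib

artifacts = joblib.load("./cache/predict_param.pickle")
clf, tfdf = artifacts["clf"], artifacts["tfdf"]

docs = ["an example document to classify"]  # hypothetical input text
features = tfdf.transform(docs)             # same TF-IDF vocabulary as at training time
print(clf.predict(features))                # 0/1 labels from the MultinomialNB classifier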
Example #3
def train():
    # load data
    # vectors = [28782+1,200]
    # vectors = numpy.load('vectors.npy')
    train_sent, train_label, train_length, test_sent, test_label, test_length = load_data.load_train_test(
    )
    # word embedding (a trainable variable)
    words_embedding = tf.Variable(tf.random_uniform(
        [vocab_size + 1, embedding_size], -1.0, 1.0),
                                  name="embedding")
    # input is a sentence
    train_data_node = tf.placeholder(tf.int32,
                                     shape=(None, max_sentence_length))
    train_length_node = tf.placeholder(tf.int32, shape=(None, ))
    train_labels_node = tf.placeholder(tf.int32,
                                       shape=(None, max_sentence_length))

    weights = tf.Variable(tf.random_uniform([num_hidden * 2, num_tag], -1.0,
                                            1.0),
                          name="w")
    biases = tf.Variable(tf.random_normal(shape=[num_tag], dtype=tf.float32),
                         name="b")
    # CRF
    transitions = tf.Variable(tf.random_uniform([num_tag, num_tag], -1.0, 1.0),
                              name="trans")

    # inputs = [batch_size,max_sentence_length]
    # lengths = [batch_size,]
    def blstm_crf(inputs):
        # sents = [batch_size,max_path_length,embedding_size]
        sents = tf.nn.embedding_lookup(words_embedding, inputs)
        x = tf.transpose(sents, [1, 0, 2])
        x = tf.reshape(x, [-1, embedding_size])
        #  get a list of 'n_steps' tensors of shape (batch_size, embeddings)
        x = tf.split(0, num_steps, x)
        # bi-lstm
        fw_cell = tf.nn.rnn_cell.LSTMCell(num_hidden,
                                          forget_bias=1.0,
                                          state_is_tuple=True)
        fw_cell = tf.nn.rnn_cell.DropoutWrapper(fw_cell, output_keep_prob=0.5)
        bw_cell = tf.nn.rnn_cell.LSTMCell(num_hidden,
                                          forget_bias=1.0,
                                          state_is_tuple=True)
        bw_cell = tf.nn.rnn_cell.DropoutWrapper(bw_cell, output_keep_prob=0.5)
        if rnn_layer > 1:
            fw_cell = tf.nn.rnn_cell.MultiRNNCell([fw_cell] * rnn_layer)
            bw_cell = tf.nn.rnn_cell.MultiRNNCell([bw_cell] * rnn_layer)

        # output = [batch_size,num_hidden*2]
        outputs, fw_final_state, bw_final_state = tf.nn.bidirectional_rnn(
            fw_cell, bw_cell, x, dtype=tf.float32)
        # linear
        # rnn_output = [batch_size,num_steps,num_hidden*2]
        rnn_output = tf.transpose(tf.pack(outputs), perm=[1, 0, 2])
        # output = [batch_size*num_steps,num_tag]
        output = tf.matmul(tf.reshape(rnn_output, [-1, num_hidden * 2]),
                           weights) + biases
        # output = [batch_size,num_steps,num_tag]
        output = tf.reshape(output, [-1, num_steps, num_tag])
        return output

    # unary_scores = [batch_size,num_steps,num_tag]
    unary_scores = blstm_crf(train_data_node)
    # CRF
    log_likelihood, transition_params = tf.contrib.crf.crf_log_likelihood(
        unary_scores,
        train_labels_node,
        train_length_node,
        transition_params=transitions)
    loss = tf.reduce_mean(-log_likelihood, name='cross_entropy_mean_loss')

    # train
    global_step = tf.Variable(0, name="global_step", trainable=False)
    optimizer = tf.train.AdamOptimizer(1e-3)
    grads_and_vars = optimizer.compute_gradients(loss)
    train_op = optimizer.apply_gradients(grads_and_vars,
                                         global_step=global_step)

    # run the training
    with tf.Session() as sess:
        tf.initialize_all_variables().run()
        print('Initialized!')
        # generate batches
        batches = data_helpers.batch_iter(
            list(zip(train_sent, train_label, train_length)), BATCH_SIZE,
            NUM_EPOCHS)
        # batch count
        batch_count = 0
        epoch = 1
        print("Epoch " + str(epoch) + ":")
        for batch in batches:
            batch_count += 1
            # train process
            x_batch, y_batch, length_batch = zip(*batch)
            feed_dict = {
                train_data_node: x_batch,
                train_labels_node: y_batch,
                train_length_node: length_batch
            }
            _, step, losses, tf_transition_params = sess.run(
                [train_op, global_step, loss, transition_params],
                feed_dict=feed_dict)
            time_str = datetime.datetime.now().isoformat()
            print("{}: step {}, loss {:g}".format(time_str, step, losses))
            # test process
            if float((batch_count * BATCH_SIZE) / Train_size) > epoch:
                epoch += 1
                print("Epoch " + str(epoch) + ":")
            if batch_count % EVAL_FREQUENCY == 0:
                # get test scores
                feed_dict = {
                    train_data_node: test_sent,
                    train_labels_node: test_label,
                    train_length_node: test_length
                }
                step, losses, scores = sess.run(
                    [global_step, loss, unary_scores], feed_dict=feed_dict)
                correct_labels = 0
                total_labels = 0
                for i in range(Test_size):
                    # Remove padding from the scores and tag sequence.
                    current_score = scores[i][:test_length[i]]
                    current_label = test_label[i][:test_length[i]]
                    # Compute the highest scoring sequence.
                    viterbi_sequence, _ = tf.contrib.crf.viterbi_decode(
                        current_score, tf_transition_params)
                    # Evaluate word-level accuracy.
                    correct_labels += numpy.sum(
                        numpy.equal(viterbi_sequence, current_label))
                    total_labels += test_length[i]

                time_str = datetime.datetime.now().isoformat()
                acc = 100.0 * correct_labels / float(total_labels)
                print("\n")
                print("{}: step {}, loss {:g}, acc {:g}".format(
                    time_str, step, losses, acc))
                print("\n")
Example #4
def evaluate_lenet5(learning_rate=0.05, n_epochs=2000, word_nkerns=10, char_nkerns=4, batch_size=1, window_width=3,
                    emb_size=50, char_emb_size=4, hidden_size=200,
                    margin=0.5, L2_weight=0.0003, update_freq=1, norm_threshold=5.0, max_truncate=40, 
                    max_char_len=40, max_des_len=20, max_relation_len=5, max_Q_len=30, train_neg_size=5, test_neg_size=5, valid_neg_size=5, neg_all=5):
#     maxSentLength=max_truncate+2*(window_width-1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/freebase/SimpleQuestions_v2/'
    triple_files=['annotated_fb_data_train_PNQ_'+str(train_neg_size)+'nega_str&des.txt', 'annotated_fb_data_valid_PNQ_'+str(valid_neg_size)+'nega_str&des.txt', 'annotated_fb_data_test_PNQ_'+str(test_neg_size)+'nega_str&des.txt']
    question_files=['annotated_fb_data_train_mention_remainQ.txt', 'annotated_fb_data_valid_mention_remainQ.txt', 'annotated_fb_data_test_mention_remainQ.txt']
    
    rng = numpy.random.RandomState(23455)
    datasets, vocab_size, char_size=load_train_test(triple_files, question_files, max_char_len, max_des_len, max_relation_len, max_Q_len, neg_all)#max_char_len, max_des_len, max_relation_len, max_Q_len
    print 'vocab_size:', vocab_size, 'char_size:', char_size
    train_data=datasets[0]
    valid_data=datasets[1]
    test_data=datasets[2]

    train_pos_entity_char=train_data[0]
    train_pos_entity_des=train_data[1]
    train_relations=train_data[2]
    train_entity_char_lengths=train_data[3]
    train_entity_des_lengths=train_data[4]
    train_relation_lengths=train_data[5]
    train_mention_char_ids=train_data[6]
    train_remainQ_word_ids=train_data[7]
    train_mention_char_lens=train_data[8]
    train_remainQ_word_len=train_data[9]

    valid_pos_entity_char=valid_data[0]
    valid_pos_entity_des=valid_data[1]
    valid_relations=valid_data[2]
    valid_entity_char_lengths=valid_data[3]
    valid_entity_des_lengths=valid_data[4]
    valid_relation_lengths=valid_data[5]
    valid_mention_char_ids=valid_data[6]
    valid_remainQ_word_ids=valid_data[7]
    valid_mention_char_lens=valid_data[8]
    valid_remainQ_word_len=valid_data[9]

    test_pos_entity_char=test_data[0]       #matrix, each row for one example, all head and tail entities, iteratively: 40*2*51
    test_pos_entity_des=test_data[1]        #matrix, each row for an example: 20*2*51
    test_relations=test_data[2]             #matrix, each row for an example: 5*51
    test_entity_char_lengths=test_data[3]   #matrix, each row for an example: 3*2*51  (three values for one entity)
    test_entity_des_lengths=test_data[4]    #matrix, each row for an example: 3*2*51  (three values for one entity)
    test_relation_lengths=test_data[5]      #matrix, each row for an example: 3*51
    test_mention_char_ids=test_data[6]      #matrix, each row for a mention: 40
    test_remainQ_word_ids=test_data[7]      #matrix, each row for a question: 30
    test_mention_char_lens=test_data[8]     #matrix, three values for a mention: 3
    test_remainQ_word_len=test_data[9]      #matrix, three values for a remaining question: 3
    
    expected_train_size=len(train_pos_entity_char)
    train_sizes=[len(train_pos_entity_char), len(train_pos_entity_des), len(train_relations), len(train_entity_char_lengths), len(train_entity_des_lengths),\
           len(train_relation_lengths), len(train_mention_char_ids), len(train_remainQ_word_ids), len(train_mention_char_lens), len(train_remainQ_word_len)]
    if sum(train_sizes)/len(train_sizes)!=expected_train_size:
        print 'weird size:', train_sizes
        exit(0)
    expected_test_size=len(test_pos_entity_char)
    test_sizes=[len(test_pos_entity_char), len(test_pos_entity_des), len(test_relations), len(test_entity_char_lengths), len(test_entity_des_lengths),\
           len(test_relation_lengths), len(test_mention_char_ids), len(test_remainQ_word_ids), len(test_mention_char_lens), len(test_remainQ_word_len)]
    if sum(test_sizes)/len(test_sizes)!=expected_test_size:
        print 'weird size:', test_sizes
        exit(0)
    n_train_batches=expected_train_size/batch_size
    n_test_batches=expected_test_size/batch_size
    
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)
    test_batch_start=list(numpy.arange(n_test_batches)*batch_size)
    
    indices_train_pos_entity_char=pythonList_into_theanoIntMatrix(train_pos_entity_char)
    indices_train_pos_entity_des=pythonList_into_theanoIntMatrix(train_pos_entity_des)
    indices_train_relations=pythonList_into_theanoIntMatrix(train_relations)
    indices_train_entity_char_lengths=pythonList_into_theanoIntMatrix(train_entity_char_lengths)
    indices_train_entity_des_lengths=pythonList_into_theanoIntMatrix(train_entity_des_lengths)
    indices_train_relation_lengths=pythonList_into_theanoIntMatrix(train_relation_lengths)
    indices_train_mention_char_ids=pythonList_into_theanoIntMatrix(train_mention_char_ids)
    indices_train_remainQ_word_ids=pythonList_into_theanoIntMatrix(train_remainQ_word_ids)
    indices_train_mention_char_lens=pythonList_into_theanoIntMatrix(train_mention_char_lens)
    indices_train_remainQ_word_len=pythonList_into_theanoIntMatrix(train_remainQ_word_len)    
    
    indices_test_pos_entity_char=pythonList_into_theanoIntMatrix(test_pos_entity_char)
    indices_test_pos_entity_des=pythonList_into_theanoIntMatrix(test_pos_entity_des)
    indices_test_relations=pythonList_into_theanoIntMatrix(test_relations)
    indices_test_entity_char_lengths=pythonList_into_theanoIntMatrix(test_entity_char_lengths)
    indices_test_entity_des_lengths=pythonList_into_theanoIntMatrix(test_entity_des_lengths)
    indices_test_relation_lengths=pythonList_into_theanoIntMatrix(test_relation_lengths)
    indices_test_mention_char_ids=pythonList_into_theanoIntMatrix(test_mention_char_ids)
    indices_test_remainQ_word_ids=pythonList_into_theanoIntMatrix(test_remainQ_word_ids)
    indices_test_mention_char_lens=pythonList_into_theanoIntMatrix(test_mention_char_lens)
    indices_test_remainQ_word_len=pythonList_into_theanoIntMatrix(test_remainQ_word_len)   


    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values=load_word2vec_to_init(rand_values, rootPath+'word_emb.txt')
    embeddings=theano.shared(value=rand_values, borrow=True)      

    char_rand_values=random_value_normal((char_size+1, char_emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    char_rand_values[0]=numpy.array(numpy.zeros(char_emb_size),dtype=theano.config.floatX)
    char_embeddings=theano.shared(value=char_rand_values, borrow=True)      

    
    # allocate symbolic variables for the data
    index = T.lscalar()
    ent_char_ids_M = T.lmatrix()   
    ent_lens_M = T.lmatrix()
    men_char_ids = T.lvector()  
    men_lens=T.lvector()
    rel_word_ids_M=T.lmatrix()
    rel_word_lens_M=T.lmatrix()
    desH_word_ids_M=T.lmatrix()
    desH_word_lens_M=T.lmatrix()
    desT_word_ids_M=T.lmatrix()
    desT_word_lens_M=T.lmatrix()
    q_word_ids=T.lvector()
    q_word_lens=T.lvector()

#max_char_len, max_des_len, max_relation_len, max_Q_len
#     ent_men_ishape = (char_emb_size, max_char_len)  # this is the size of MNIST images
#     rel_ishape=(emb_size, max_relation_len)
#     des_ishape=(emb_size, max_des_len)
#     q_ishape=(emb_size, max_Q_len)
    
    filter_size=(emb_size,window_width)
    char_filter_size=(char_emb_size, window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
#     length_after_wideConv=ishape[1]+filter_size[1]-1
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'
    

    char_filter_shape=(char_nkerns, 1, char_filter_size[0], char_filter_size[1])
    word_filter_shape=(word_nkerns, 1, filter_size[0], filter_size[1])
    char_conv_W, char_conv_b=create_conv_para(rng, filter_shape=char_filter_shape)
    q_rel_conv_W, q_rel_conv_b=create_conv_para(rng, filter_shape=word_filter_shape)
    q_desH_conv_W, q_desH_conv_b=create_conv_para(rng, filter_shape=word_filter_shape)
    q_desT_conv_W, q_desT_conv_b=create_conv_para(rng, filter_shape=word_filter_shape)

    def SimpleQ_matches_Triple(ent_char_ids_f,ent_lens_f,rel_word_ids_f,rel_word_lens_f,desH_word_ids_f,
                       desH_word_lens_f,desT_word_ids_f,desT_word_lens_f):
#         rng = numpy.random.RandomState(23455)
        ent_char_input = char_embeddings[ent_char_ids_f.flatten()].reshape((batch_size,max_char_len, char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        men_char_input = char_embeddings[men_char_ids.flatten()].reshape((batch_size,max_char_len, char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        
        rel_word_input = embeddings[rel_word_ids_f.flatten()].reshape((batch_size,max_relation_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        desH_word_input = embeddings[desH_word_ids_f.flatten()].reshape((batch_size,max_des_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        
        desT_word_input = embeddings[desT_word_ids_f.flatten()].reshape((batch_size,max_des_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        q_word_input = embeddings[q_word_ids.flatten()].reshape((batch_size,max_Q_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    
    
        #ent_mention
        ent_char_conv = Conv_with_input_para(rng, input=ent_char_input,
                image_shape=(batch_size, 1, char_emb_size, max_char_len),
                filter_shape=char_filter_shape, W=char_conv_W, b=char_conv_b)
        men_char_conv = Conv_with_input_para(rng, input=men_char_input,
                image_shape=(batch_size, 1, char_emb_size, max_char_len),
                filter_shape=char_filter_shape, W=char_conv_W, b=char_conv_b)
        #q-rel
        q_rel_conv = Conv_with_input_para(rng, input=q_word_input,
                image_shape=(batch_size, 1, emb_size, max_Q_len),
                filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b)
        rel_conv = Conv_with_input_para(rng, input=rel_word_input,
                image_shape=(batch_size, 1, emb_size, max_relation_len),
                filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b)
        #q_desH
        q_desH_conv = Conv_with_input_para(rng, input=q_word_input,
                image_shape=(batch_size, 1, emb_size, max_Q_len),
                filter_shape=word_filter_shape, W=q_desH_conv_W, b=q_desH_conv_b)
        desH_conv = Conv_with_input_para(rng, input=desH_word_input,
                image_shape=(batch_size, 1, emb_size, max_des_len),
                filter_shape=word_filter_shape, W=q_desH_conv_W, b=q_desH_conv_b)
        #q_desT
        q_desT_conv = Conv_with_input_para(rng, input=q_word_input,
                image_shape=(batch_size, 1, emb_size, max_Q_len),
                filter_shape=word_filter_shape, W=q_desT_conv_W, b=q_desT_conv_b)
        desT_conv = Conv_with_input_para(rng, input=desT_word_input,
                image_shape=(batch_size, 1, emb_size, max_des_len),
                filter_shape=word_filter_shape, W=q_desT_conv_W, b=q_desT_conv_b)
    #     ent_char_output=debug_print(ent_char_conv.output, 'ent_char.output')
    #     men_char_output=debug_print(men_char_conv.output, 'men_char.output')
        
        
        
        ent_conv_pool=Max_Pooling(rng, input_l=ent_char_conv.output, left_l=ent_lens_f[0], right_l=ent_lens_f[2])
        men_conv_pool=Max_Pooling(rng, input_l=men_char_conv.output, left_l=men_lens[0], right_l=men_lens[2])
        
        q_rel_pool=Max_Pooling(rng, input_l=q_rel_conv.output, left_l=q_word_lens[0], right_l=q_word_lens[2])
        rel_conv_pool=Max_Pooling(rng, input_l=rel_conv.output, left_l=rel_word_lens_f[0], right_l=rel_word_lens_f[2])
        
        q_desH_pool=Max_Pooling(rng, input_l=q_desH_conv.output, left_l=q_word_lens[0], right_l=q_word_lens[2])
        desH_conv_pool=Max_Pooling(rng, input_l=desH_conv.output, left_l=desH_word_lens_f[0], right_l=desH_word_lens_f[2])
        
        q_desT_pool=Max_Pooling(rng, input_l=q_desT_conv.output, left_l=q_word_lens[0], right_l=q_word_lens[2])
        desT_conv_pool=Max_Pooling(rng, input_l=desT_conv.output, left_l=desT_word_lens_f[0], right_l=desT_word_lens_f[2])    
        
        
        overall_simi=cosine(ent_conv_pool.output_maxpooling, men_conv_pool.output_maxpooling)+\
                    cosine(q_rel_pool.output_maxpooling, rel_conv_pool.output_maxpooling)+\
                    cosine(q_desH_pool.output_maxpooling, desH_conv_pool.output_maxpooling)+\
                    cosine(q_desT_pool.output_maxpooling, desT_conv_pool.output_maxpooling)
        return overall_simi
    
    simi_list, updates = theano.scan(
        SimpleQ_matches_Triple,
        sequences=[ent_char_ids_M,ent_lens_M,rel_word_ids_M,rel_word_lens_M,desH_word_ids_M,
                   desH_word_lens_M,desT_word_ids_M,desT_word_lens_M])
    
    posi_simi=simi_list[0]
    nega_simies=simi_list[1:]
    loss_simi_list=T.maximum(0.0, margin-posi_simi.reshape((1,1))+nega_simies) 
    loss_simi=T.sum(loss_simi_list)

    

    
    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg =debug_print((char_embeddings** 2).sum()+(embeddings** 2).sum()+(char_conv_W** 2).sum()+(q_rel_conv_W** 2).sum()+(q_desH_conv_W** 2).sum()+(q_desT_conv_W** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum()

    cost=loss_simi+L2_weight*L2_reg
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')
    

    
    test_model = theano.function([index], [loss_simi,simi_list],
          givens={
            ent_char_ids_M : indices_test_pos_entity_char[index].reshape(((test_neg_size+1)*2, max_char_len))[::2],  
            ent_lens_M : indices_test_entity_char_lengths[index].reshape(((test_neg_size+1)*2, 3))[::2],
            men_char_ids : indices_test_mention_char_ids[index],
            men_lens : indices_test_mention_char_lens[index],
            rel_word_ids_M : indices_test_relations[index].reshape((test_neg_size+1, max_relation_len)),
            rel_word_lens_M : indices_test_relation_lengths[index].reshape((test_neg_size+1, 3)),
            desH_word_ids_M : indices_test_pos_entity_des[index].reshape(((test_neg_size+1)*2, max_des_len))[::2], 
            desH_word_lens_M : indices_test_entity_des_lengths[index].reshape(((test_neg_size+1)*2, 3))[::2],
            desT_word_ids_M : indices_test_pos_entity_des[index].reshape(((test_neg_size+1)*2, max_des_len))[1::2], 
            desT_word_lens_M : indices_test_entity_des_lengths[index].reshape(((test_neg_size+1)*2, 3))[1::2],
            q_word_ids : indices_test_remainQ_word_ids[index],
            q_word_lens : indices_test_remainQ_word_len[index]}, on_unused_input='ignore')


    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = [char_embeddings, embeddings, char_conv_W, char_conv_b, q_rel_conv_W, q_rel_conv_b, q_desH_conv_W, q_desH_conv_b, q_desT_conv_W, q_desT_conv_b]#+[embeddings]# + layer1.params 
#     params_conv = [conv_W, conv_b]
    
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
      
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i=debug_print(grad_i,'grad_i')
        acc = acc_i + T.sqr(grad_i)
#         updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-10)))   #AdaGrad
#         updates.append((acc_i, acc))    
        if param_i == embeddings:
            updates.append((param_i, T.set_subtensor((param_i - learning_rate * grad_i / T.sqrt(acc+1e-10))[0], theano.shared(numpy.zeros(emb_size)))))   #Ada
        elif param_i == char_embeddings:
            updates.append((param_i, T.set_subtensor((param_i - learning_rate * grad_i / T.sqrt(acc+1e-10))[0], theano.shared(numpy.zeros(char_emb_size)))))   #AdaGrad
        else:
            updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-10)))   #AdaGrad
        updates.append((acc_i, acc)) 
  
    train_model = theano.function([index], [loss_simi, cost], updates=updates,
          givens={
            ent_char_ids_M : indices_train_pos_entity_char[index].reshape(((train_neg_size+1)*2, max_char_len))[::2],  
            ent_lens_M : indices_train_entity_char_lengths[index].reshape(((train_neg_size+1)*2, 3))[::2],
            men_char_ids : indices_train_mention_char_ids[index],
            men_lens : indices_train_mention_char_lens[index],
            rel_word_ids_M : indices_train_relations[index].reshape((train_neg_size+1, max_relation_len)),
            rel_word_lens_M : indices_train_relation_lengths[index].reshape((train_neg_size+1, 3)),
            desH_word_ids_M : indices_train_pos_entity_des[index].reshape(((train_neg_size+1)*2, max_des_len))[::2], 
            desH_word_lens_M : indices_train_entity_des_lengths[index].reshape(((train_neg_size+1)*2, 3))[::2],
            desT_word_ids_M : indices_train_pos_entity_des[index].reshape(((train_neg_size+1)*2, max_des_len))[1::2], 
            desT_word_lens_M : indices_train_entity_des_lengths[index].reshape(((train_neg_size+1)*2, 3))[1::2],
            q_word_ids : indices_train_remainQ_word_ids[index],
            q_word_lens : indices_train_remainQ_word_len[index]}, on_unused_input='ignore')




    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatche before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False
    
    best_test_accu=0.0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0

#         ent_char_ids_M_train = indices_train_pos_entity_char[index].reshape(((train_neg_size+1)*2, max_char_len))[::2],  
#         ent_lens_M : indices_train_entity_char_lengths[index].reshape(((train_neg_size+1)*2, 3))[::2],
#         men_char_ids : indices_train_mention_char_ids[index],
#         men_lens : indices_train_mention_char_lens[index],
#         rel_word_ids_M : indices_train_relations[index].reshape((train_neg_size+1, max_relation_len)),
#         rel_word_lens_M : indices_train_relation_lengths[index].reshape((train_neg_size+1, 3)),
#         desH_word_ids_M : indices_train_pos_entity_des[index].reshape(((train_neg_size+1)*2, max_des_len))[::2], 
#         desH_word_lens_M : indices_train_entity_des_lengths[index].reshape(((train_neg_size+1)*2, 3))[::2],
#         desT_word_ids_M : indices_train_pos_entity_des[index].reshape(((train_neg_size+1)*2, max_des_len))[1::2], 
#         desT_word_lens_M : indices_train_entity_des_lengths[index].reshape(((train_neg_size+1)*2, 3))[1::2],
#         q_word_ids : indices_train_remainQ_word_ids[index],
#         q_word_lens : indices_train_remainQ_word_len[index]}, on_unused_input='ignore')

        for batch_start in train_batch_start: 
            # iter counts how many batches have been run so far across epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index +1

            minibatch_index=minibatch_index+1
            #print batch_start
            loss_simi_i, cost_i= train_model(batch_start)
            if batch_start%10==0:
                print batch_start, 'loss_simi_i: ', loss_simi_i, 'cost_i:', cost_i

            if iter % n_train_batches == 0:
                print 'training @ iter = '+str(iter)+', loss_simi_i: ', loss_simi_i, 'cost_i:', cost_i
            #if iter ==1:
            #    exit(0)
            
            if iter % validation_frequency == 0:
                #write_file=open('log.txt', 'w')
                test_loss=[]
                succ=0
                for i in test_batch_start:
                    loss_simi_i,simi_list_i=test_model(i)
                    test_loss.append(loss_simi_i)
                    if simi_list_i[0]>=max(simi_list_i[1:]):
                        succ+=1

                succ=succ*1.0/expected_test_size
                #now, check MAP and MRR
                print(('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test accu of best '
                           'model %f') %
                          (epoch, minibatch_index, n_train_batches,succ))

                if best_test_accu< succ:
                    best_test_accu=succ
                    store_model_to_file(rootPath, params)
            if patience <= iter:
                done_looping = True
                break
        print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min'
        mid_time = time.clock() 

            
        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Example #5
def evaluate_lenet5(learning_rate=0.05,
                    n_epochs=2000,
                    word_nkerns=10,
                    char_nkerns=4,
                    batch_size=1,
                    window_width=3,
                    emb_size=50,
                    char_emb_size=4,
                    hidden_size=200,
                    margin=0.5,
                    L2_weight=0.0003,
                    update_freq=1,
                    norm_threshold=5.0,
                    max_truncate=40,
                    max_char_len=40,
                    max_des_len=20,
                    max_relation_len=5,
                    max_Q_len=30,
                    train_neg_size=5,
                    test_neg_size=5,
                    valid_neg_size=5,
                    neg_all=5):
    #     maxSentLength=max_truncate+2*(window_width-1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/freebase/SimpleQuestions_v2/'
    triple_files = [
        'annotated_fb_data_train_PNQ_' + str(train_neg_size) +
        'nega_str&des.txt', 'annotated_fb_data_valid_PNQ_' +
        str(valid_neg_size) + 'nega_str&des.txt',
        'annotated_fb_data_test_PNQ_' + str(test_neg_size) + 'nega_str&des.txt'
    ]
    question_files = [
        'annotated_fb_data_train_mention_remainQ.txt',
        'annotated_fb_data_valid_mention_remainQ.txt',
        'annotated_fb_data_test_mention_remainQ.txt'
    ]

    rng = numpy.random.RandomState(23455)
    datasets, vocab_size, char_size = load_train_test(
        triple_files, question_files, max_char_len, max_des_len,
        max_relation_len, max_Q_len,
        neg_all)  #max_char_len, max_des_len, max_relation_len, max_Q_len
    print 'vocab_size:', vocab_size, 'char_size:', char_size
    train_data = datasets[0]
    valid_data = datasets[1]
    test_data = datasets[2]

    train_pos_entity_char = train_data[0]
    train_pos_entity_des = train_data[1]
    train_relations = train_data[2]
    train_entity_char_lengths = train_data[3]
    train_entity_des_lengths = train_data[4]
    train_relation_lengths = train_data[5]
    train_mention_char_ids = train_data[6]
    train_remainQ_word_ids = train_data[7]
    train_mention_char_lens = train_data[8]
    train_remainQ_word_len = train_data[9]

    valid_pos_entity_char = valid_data[0]
    valid_pos_entity_des = valid_data[1]
    valid_relations = valid_data[2]
    valid_entity_char_lengths = valid_data[3]
    valid_entity_des_lengths = valid_data[4]
    valid_relation_lengths = valid_data[5]
    valid_mention_char_ids = valid_data[6]
    valid_remainQ_word_ids = valid_data[7]
    valid_mention_char_lens = valid_data[8]
    valid_remainQ_word_len = valid_data[9]

    test_pos_entity_char = test_data[
        0]  #matrix, each row for one example, all head and tail entities, iteratively: 40*2*51
    test_pos_entity_des = test_data[1]  #matrix, each row for an example: 20*2*51
    test_relations = test_data[2]  #matrix, each row for an example: 5*51
    test_entity_char_lengths = test_data[
        3]  #matrix, each row for an example: 3*2*51  (three values for one entity)
    test_entity_des_lengths = test_data[
        4]  #matrix, each row for an example: 3*2*51  (three values for one entity)
    test_relation_lengths = test_data[5]  #matrix, each row for an example: 3*51
    test_mention_char_ids = test_data[6]  #matrix, each row for a mention: 40
    test_remainQ_word_ids = test_data[7]  #matrix, each row for a question: 30
    test_mention_char_lens = test_data[
        8]  #matrix, three values for a mention: 3
    test_remainQ_word_len = test_data[
        9]  #matrix, three values for a remaining question: 3

    expected_train_size = len(train_pos_entity_char)
    train_sizes=[len(train_pos_entity_char), len(train_pos_entity_des), len(train_relations), len(train_entity_char_lengths), len(train_entity_des_lengths),\
           len(train_relation_lengths), len(train_mention_char_ids), len(train_remainQ_word_ids), len(train_mention_char_lens), len(train_remainQ_word_len)]
    if sum(train_sizes) / len(train_sizes) != expected_train_size:
        print 'weird size:', train_sizes
        exit(0)
    expected_test_size = len(test_pos_entity_char)
    test_sizes=[len(test_pos_entity_char), len(test_pos_entity_des), len(test_relations), len(test_entity_char_lengths), len(test_entity_des_lengths),\
           len(test_relation_lengths), len(test_mention_char_ids), len(test_remainQ_word_ids), len(test_mention_char_lens), len(test_remainQ_word_len)]
    if sum(test_sizes) / len(test_sizes) != expected_test_size:
        print 'weird size:', test_sizes
        exit(0)
    n_train_batches = expected_train_size / batch_size
    n_test_batches = expected_test_size / batch_size

    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
    test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    indices_train_pos_entity_char = pythonList_into_theanoIntMatrix(
        train_pos_entity_char)
    indices_train_pos_entity_des = pythonList_into_theanoIntMatrix(
        train_pos_entity_des)
    indices_train_relations = pythonList_into_theanoIntMatrix(train_relations)
    indices_train_entity_char_lengths = pythonList_into_theanoIntMatrix(
        train_entity_char_lengths)
    indices_train_entity_des_lengths = pythonList_into_theanoIntMatrix(
        train_entity_des_lengths)
    indices_train_relation_lengths = pythonList_into_theanoIntMatrix(
        train_relation_lengths)
    indices_train_mention_char_ids = pythonList_into_theanoIntMatrix(
        train_mention_char_ids)
    indices_train_remainQ_word_ids = pythonList_into_theanoIntMatrix(
        train_remainQ_word_ids)
    indices_train_mention_char_lens = pythonList_into_theanoIntMatrix(
        train_mention_char_lens)
    indices_train_remainQ_word_len = pythonList_into_theanoIntMatrix(
        train_remainQ_word_len)

    indices_test_pos_entity_char = pythonList_into_theanoIntMatrix(
        test_pos_entity_char)
    indices_test_pos_entity_des = pythonList_into_theanoIntMatrix(
        test_pos_entity_des)
    indices_test_relations = pythonList_into_theanoIntMatrix(test_relations)
    indices_test_entity_char_lengths = pythonList_into_theanoIntMatrix(
        test_entity_char_lengths)
    indices_test_entity_des_lengths = pythonList_into_theanoIntMatrix(
        test_entity_des_lengths)
    indices_test_relation_lengths = pythonList_into_theanoIntMatrix(
        test_relation_lengths)
    indices_test_mention_char_ids = pythonList_into_theanoIntMatrix(
        test_mention_char_ids)
    indices_test_remainQ_word_ids = pythonList_into_theanoIntMatrix(
        test_remainQ_word_ids)
    indices_test_mention_char_lens = pythonList_into_theanoIntMatrix(
        test_mention_char_lens)
    indices_test_remainQ_word_len = pythonList_into_theanoIntMatrix(
        test_remainQ_word_len)

    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size),
                                 dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values = load_word2vec_to_init(rand_values, rootPath + 'word_emb.txt')
    embeddings = theano.shared(value=rand_values, borrow=True)

    char_rand_values = random_value_normal((char_size + 1, char_emb_size),
                                           theano.config.floatX,
                                           numpy.random.RandomState(1234))
    char_rand_values[0] = numpy.array(numpy.zeros(char_emb_size),
                                      dtype=theano.config.floatX)
    char_embeddings = theano.shared(value=char_rand_values, borrow=True)

    # allocate symbolic variables for the data
    index = T.lscalar()
    ent_char_ids_M = T.lmatrix()
    ent_lens_M = T.lmatrix()
    men_char_ids = T.lvector()
    men_lens = T.lvector()
    rel_word_ids_M = T.lmatrix()
    rel_word_lens_M = T.lmatrix()
    desH_word_ids_M = T.lmatrix()
    desH_word_lens_M = T.lmatrix()
    desT_word_ids_M = T.lmatrix()
    desT_word_lens_M = T.lmatrix()
    q_word_ids = T.lvector()
    q_word_lens = T.lvector()

    #max_char_len, max_des_len, max_relation_len, max_Q_len
    #     ent_men_ishape = (char_emb_size, max_char_len)  # this is the size of MNIST images
    #     rel_ishape=(emb_size, max_relation_len)
    #     des_ishape=(emb_size, max_des_len)
    #     q_ishape=(emb_size, max_Q_len)

    filter_size = (emb_size, window_width)
    char_filter_size = (char_emb_size, window_width)
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    #     length_after_wideConv=ishape[1]+filter_size[1]-1

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    char_filter_shape = (char_nkerns, 1, char_filter_size[0],
                         char_filter_size[1])
    word_filter_shape = (word_nkerns, 1, filter_size[0], filter_size[1])
    char_conv_W, char_conv_b = create_conv_para(rng,
                                                filter_shape=char_filter_shape)
    q_rel_conv_W, q_rel_conv_b = create_conv_para(
        rng, filter_shape=word_filter_shape)
    q_desH_conv_W, q_desH_conv_b = create_conv_para(
        rng, filter_shape=word_filter_shape)
    q_desT_conv_W, q_desT_conv_b = create_conv_para(
        rng, filter_shape=word_filter_shape)

    def SimpleQ_matches_Triple(ent_char_ids_f, ent_lens_f, rel_word_ids_f,
                               rel_word_lens_f, desH_word_ids_f,
                               desH_word_lens_f, desT_word_ids_f,
                               desT_word_lens_f):
        #         rng = numpy.random.RandomState(23455)
        ent_char_input = char_embeddings[ent_char_ids_f.flatten()].reshape(
            (batch_size, max_char_len,
             char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        men_char_input = char_embeddings[men_char_ids.flatten()].reshape(
            (batch_size, max_char_len,
             char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

        rel_word_input = embeddings[rel_word_ids_f.flatten()].reshape(
            (batch_size, max_relation_len,
             emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        desH_word_input = embeddings[desH_word_ids_f.flatten()].reshape(
            (batch_size, max_des_len,
             emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

        desT_word_input = embeddings[desT_word_ids_f.flatten()].reshape(
            (batch_size, max_des_len,
             emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        q_word_input = embeddings[q_word_ids.flatten()].reshape(
            (batch_size, max_Q_len,
             emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

        #ent_mention
        ent_char_conv = Conv_with_input_para(rng,
                                             input=ent_char_input,
                                             image_shape=(batch_size, 1,
                                                          char_emb_size,
                                                          max_char_len),
                                             filter_shape=char_filter_shape,
                                             W=char_conv_W,
                                             b=char_conv_b)
        men_char_conv = Conv_with_input_para(rng,
                                             input=men_char_input,
                                             image_shape=(batch_size, 1,
                                                          char_emb_size,
                                                          max_char_len),
                                             filter_shape=char_filter_shape,
                                             W=char_conv_W,
                                             b=char_conv_b)
        #q-rel
        q_rel_conv = Conv_with_input_para(rng,
                                          input=q_word_input,
                                          image_shape=(batch_size, 1, emb_size,
                                                       max_Q_len),
                                          filter_shape=word_filter_shape,
                                          W=q_rel_conv_W,
                                          b=q_rel_conv_b)
        rel_conv = Conv_with_input_para(rng,
                                        input=rel_word_input,
                                        image_shape=(batch_size, 1, emb_size,
                                                     max_relation_len),
                                        filter_shape=word_filter_shape,
                                        W=q_rel_conv_W,
                                        b=q_rel_conv_b)
        #q_desH
        q_desH_conv = Conv_with_input_para(rng,
                                           input=q_word_input,
                                           image_shape=(batch_size, 1,
                                                        emb_size, max_Q_len),
                                           filter_shape=word_filter_shape,
                                           W=q_desH_conv_W,
                                           b=q_desH_conv_b)
        desH_conv = Conv_with_input_para(rng,
                                         input=desH_word_input,
                                         image_shape=(batch_size, 1, emb_size,
                                                      max_des_len),
                                         filter_shape=word_filter_shape,
                                         W=q_desH_conv_W,
                                         b=q_desH_conv_b)
        #q_desT
        q_desT_conv = Conv_with_input_para(rng,
                                           input=q_word_input,
                                           image_shape=(batch_size, 1,
                                                        emb_size, max_Q_len),
                                           filter_shape=word_filter_shape,
                                           W=q_desT_conv_W,
                                           b=q_desT_conv_b)
        desT_conv = Conv_with_input_para(rng,
                                         input=desT_word_input,
                                         image_shape=(batch_size, 1, emb_size,
                                                      max_des_len),
                                         filter_shape=word_filter_shape,
                                         W=q_desT_conv_W,
                                         b=q_desT_conv_b)
        #     ent_char_output=debug_print(ent_char_conv.output, 'ent_char.output')
        #     men_char_output=debug_print(men_char_conv.output, 'men_char.output')

        ent_conv_pool = Max_Pooling(rng,
                                    input_l=ent_char_conv.output,
                                    left_l=ent_lens_f[0],
                                    right_l=ent_lens_f[2])
        men_conv_pool = Max_Pooling(rng,
                                    input_l=men_char_conv.output,
                                    left_l=men_lens[0],
                                    right_l=men_lens[2])

        q_rel_pool = Max_Pooling(rng,
                                 input_l=q_rel_conv.output,
                                 left_l=q_word_lens[0],
                                 right_l=q_word_lens[2])
        rel_conv_pool = Max_Pooling(rng,
                                    input_l=rel_conv.output,
                                    left_l=rel_word_lens_f[0],
                                    right_l=rel_word_lens_f[2])

        q_desH_pool = Max_Pooling(rng,
                                  input_l=q_desH_conv.output,
                                  left_l=q_word_lens[0],
                                  right_l=q_word_lens[2])
        desH_conv_pool = Max_Pooling(rng,
                                     input_l=desH_conv.output,
                                     left_l=desH_word_lens_f[0],
                                     right_l=desH_word_lens_f[2])

        q_desT_pool = Max_Pooling(rng,
                                  input_l=q_desT_conv.output,
                                  left_l=q_word_lens[0],
                                  right_l=q_word_lens[2])
        desT_conv_pool = Max_Pooling(rng,
                                     input_l=desT_conv.output,
                                     left_l=desT_word_lens_f[0],
                                     right_l=desT_word_lens_f[2])


        overall_simi=cosine(ent_conv_pool.output_maxpooling, men_conv_pool.output_maxpooling)+\
                    cosine(q_rel_pool.output_maxpooling, rel_conv_pool.output_maxpooling)+\
                    cosine(q_desH_pool.output_maxpooling, desH_conv_pool.output_maxpooling)+\
                    cosine(q_desT_pool.output_maxpooling, desT_conv_pool.output_maxpooling)
        return overall_simi

    simi_list, updates = theano.scan(SimpleQ_matches_Triple,
                                     sequences=[
                                         ent_char_ids_M, ent_lens_M,
                                         rel_word_ids_M, rel_word_lens_M,
                                         desH_word_ids_M, desH_word_lens_M,
                                         desT_word_ids_M, desT_word_lens_M
                                     ])

    posi_simi = simi_list[0]
    nega_simies = simi_list[1:]
    loss_simi_list = T.maximum(
        0.0, margin - posi_simi.reshape((1, 1)) + nega_simies)
    loss_simi = T.sum(loss_simi_list)

    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg = debug_print(
        (char_embeddings**2).sum() + (embeddings**2).sum() +
        (char_conv_W**2).sum() + (q_rel_conv_W**2).sum() +
        (q_desH_conv_W**2).sum() + (q_desT_conv_W**2).sum(),
        'L2_reg')  #+(layer1.W** 2).sum()++(embeddings**2).sum()

    cost = loss_simi + L2_weight * L2_reg
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')

    test_model = theano.function(
        [index], [loss_simi, simi_list],
        givens={
            ent_char_ids_M:
            indices_test_pos_entity_char[index].reshape(
                ((test_neg_size + 1) * 2, max_char_len))[::2],
            ent_lens_M:
            indices_test_entity_char_lengths[index].reshape(
                ((test_neg_size + 1) * 2, 3))[::2],
            men_char_ids:
            indices_test_mention_char_ids[index],
            men_lens:
            indices_test_mention_char_lens[index],
            rel_word_ids_M:
            indices_test_relations[index].reshape(
                (test_neg_size + 1, max_relation_len)),
            rel_word_lens_M:
            indices_test_relation_lengths[index].reshape(
                (test_neg_size + 1, 3)),
            desH_word_ids_M:
            indices_test_pos_entity_des[index].reshape(
                ((test_neg_size + 1) * 2, max_des_len))[::2],
            desH_word_lens_M:
            indices_test_entity_des_lengths[index].reshape(
                ((test_neg_size + 1) * 2, 3))[::2],
            desT_word_ids_M:
            indices_test_pos_entity_des[index].reshape(
                ((test_neg_size + 1) * 2, max_des_len))[1::2],
            desT_word_lens_M:
            indices_test_entity_des_lengths[index].reshape(
                ((test_neg_size + 1) * 2, 3))[1::2],
            q_word_ids:
            indices_test_remainQ_word_ids[index],
            q_word_lens:
            indices_test_remainQ_word_len[index]
        },
        on_unused_input='ignore')

    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    params = [
        char_embeddings, embeddings, char_conv_W, char_conv_b, q_rel_conv_W,
        q_rel_conv_b, q_desH_conv_W, q_desH_conv_b, q_desT_conv_W,
        q_desT_conv_b
    ]  #+[embeddings]# + layer1.params
    #     params_conv = [conv_W, conv_b]

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i = debug_print(grad_i, 'grad_i')
        acc = acc_i + T.sqr(grad_i)
        #         updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-10)))   #AdaGrad
        #         updates.append((acc_i, acc))
        if param_i == embeddings:
            updates.append(
                (param_i,
                 T.set_subtensor(
                     (param_i -
                      learning_rate * grad_i / T.sqrt(acc + 1e-10))[0],
                     theano.shared(numpy.zeros(emb_size)))))  #Ada
        elif param_i == char_embeddings:
            updates.append(
                (param_i,
                 T.set_subtensor(
                     (param_i -
                      learning_rate * grad_i / T.sqrt(acc + 1e-10))[0],
                     theano.shared(numpy.zeros(char_emb_size)))))  #AdaGrad
        else:
            updates.append(
                (param_i, param_i -
                 learning_rate * grad_i / T.sqrt(acc + 1e-10)))  #AdaGrad
        updates.append((acc_i, acc))

    train_model = theano.function(
        [index], [loss_simi, cost],
        updates=updates,
        givens={
            ent_char_ids_M:
            indices_train_pos_entity_char[index].reshape(
                ((train_neg_size + 1) * 2, max_char_len))[::2],
            ent_lens_M:
            indices_train_entity_char_lengths[index].reshape(
                ((train_neg_size + 1) * 2, 3))[::2],
            men_char_ids:
            indices_train_mention_char_ids[index],
            men_lens:
            indices_train_mention_char_lens[index],
            rel_word_ids_M:
            indices_train_relations[index].reshape(
                (train_neg_size + 1, max_relation_len)),
            rel_word_lens_M:
            indices_train_relation_lengths[index].reshape(
                (train_neg_size + 1, 3)),
            desH_word_ids_M:
            indices_train_pos_entity_des[index].reshape(
                ((train_neg_size + 1) * 2, max_des_len))[::2],
            desH_word_lens_M:
            indices_train_entity_des_lengths[index].reshape(
                ((train_neg_size + 1) * 2, 3))[::2],
            desT_word_ids_M:
            indices_train_pos_entity_des[index].reshape(
                ((train_neg_size + 1) * 2, max_des_len))[1::2],
            desT_word_lens_M:
            indices_train_entity_des_lengths[index].reshape(
                ((train_neg_size + 1) * 2, 3))[1::2],
            q_word_ids:
            indices_train_remainQ_word_ids[index],
            q_word_lens:
            indices_train_remainQ_word_len[index]
        },
        on_unused_input='ignore')

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look as this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
    # found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many
    # minibatche before checking the network
    # on the validation set; in this case we
    # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False

    best_test_accu = 0.0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index = 0

        #         ent_char_ids_M_train = indices_train_pos_entity_char[index].reshape(((train_neg_size+1)*2, max_char_len))[::2],
        #         ent_lens_M : indices_train_entity_char_lengths[index].reshape(((train_neg_size+1)*2, 3))[::2],
        #         men_char_ids : indices_train_mention_char_ids[index],
        #         men_lens : indices_train_mention_char_lens[index],
        #         rel_word_ids_M : indices_train_relations[index].reshape((train_neg_size+1, max_relation_len)),
        #         rel_word_lens_M : indices_train_relation_lengths[index].reshape((train_neg_size+1, 3)),
        #         desH_word_ids_M : indices_train_pos_entity_des[index].reshape(((train_neg_size+1)*2, max_des_len))[::2],
        #         desH_word_lens_M : indices_train_entity_des_lengths[index].reshape(((train_neg_size+1)*2, 3))[::2],
        #         desT_word_ids_M : indices_train_pos_entity_des[index].reshape(((train_neg_size+1)*2, max_des_len))[1::2],
        #         desT_word_lens_M : indices_train_entity_des_lengths[index].reshape(((train_neg_size+1)*2, 3))[1::2],
        #         q_word_ids : indices_train_remainQ_word_ids[index],
        #         q_word_lens : indices_train_remainQ_word_len[index]}, on_unused_input='ignore')

        for batch_start in train_batch_start:
            # iter counts how many batches have been run so far across epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1

            minibatch_index = minibatch_index + 1
            #print batch_start
            loss_simi_i, cost_i = train_model(batch_start)
            if batch_start % 10 == 0:
                print batch_start, 'loss_simi_i: ', loss_simi_i, 'cost_i:', cost_i

            if iter % n_train_batches == 0:
                print 'training @ iter = ' + str(
                    iter) + ' loss_simi_i: ', loss_simi_i, 'cost_i:', cost_i
            #if iter ==1:
            #    exit(0)

            if iter % validation_frequency == 0:
                #write_file=open('log.txt', 'w')
                test_loss = []
                succ = 0
                for i in test_batch_start:
                    loss_simi_i, simi_list_i = test_model(i)
                    test_loss.append(loss_simi_i)
                    if simi_list_i[0] >= max(simi_list_i[1:]):
                        succ += 1

                succ = succ * 1.0 / len(expected_test_size)
                #now, check MAP and MRR
                print((
                    '\t\t\t\t\t\tepoch %i, minibatch %i/%i, test accu of best '
                    'model %f') %
                      (epoch, minibatch_index, n_train_batches, succ))

                if best_test_accu < succ:
                    best_test_accu = succ
                    store_model_to_file(rootPath, params)
            if patience <= iter:
                done_looping = True
                break
        print 'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min'
        mid_time = time.clock()

        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i, '
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
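
The patience-based early stopping above reduces to a small amount of bookkeeping: validate every validation_frequency minibatches, checkpoint when the score improves, and stop once the iteration count exceeds patience. A minimal, self-contained sketch of that loop (an illustration, not the original code; run_train_batch, run_validation and save_params are hypothetical callables standing in for train_model, the test loop and store_model_to_file):

def train_with_patience(run_train_batch, run_validation, save_params,
                        n_epochs, n_train_batches,
                        patience, patience_increase=2):
    # run_train_batch(i): one gradient step on minibatch i
    # run_validation():   returns a scalar accuracy on the held-out set
    # save_params():      checkpoints the current parameters
    validation_frequency = min(n_train_batches, patience // 2)
    best_accu = 0.0
    it = 0
    for _ in range(n_epochs):
        for minibatch_index in range(n_train_batches):
            it += 1
            run_train_batch(minibatch_index)
            if it % validation_frequency == 0:
                accu = run_validation()
                if accu > best_accu:
                    best_accu = accu
                    # be willing to wait longer after a new best score
                    patience = max(patience, it * patience_increase)
                    save_params()
            if patience <= it:          # out of patience: stop training
                return best_accu
    return best_accu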
Example #6
def main():
  # Read input args
  args = parse_to_argdict()

  ROOT_DIR = args['root_dir']
  MODEL_NAME = args['model_name']
  EPOCHS = args['epochs']
  SAVE_DEMO = args['save_demo_results']
  SAVE_PREDS = args['save_preds']

  TRAIN_PATH = ROOT_DIR + '/data/cell_imgs/'
  MASK_PATH = ROOT_DIR + '/data/mask_imgs/'
  TEST_PATH = ROOT_DIR + '/data/test_imgs/'

  OUTPUT_DIR = ROOT_DIR + '/outs'
  WEIGHTS = OUTPUT_DIR + '/' + MODEL_NAME +'_weights.h5'
  LOG_DIR = OUTPUT_DIR + "/logs"
  RESULTS_DIR = OUTPUT_DIR + '/results'

  # Load train test data
  function_time_outs = ""
  start = time.time()
  X_train, Y_train, X_test, train_ids, test_ids, sizes_test = load_train_test(TRAIN_PATH, MASK_PATH, TEST_PATH,
                                                                              IMG_WIDTH, IMG_HEIGHT, IMG_CHANNELS)
  function_time_outs += "Load train test: %.3f sec\n" % (time.time() - start)
  train_size = int(X_train.shape[0] * 0.9)

  # Data augmentation
  train_generator, val_generator = create_image_mask_generator(X_train, Y_train, BATCH_SIZE, seed)

  # Build U-Net model
  start = time.time()
  model = build_unet(IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS)
  function_time_outs += "Set up UNet: %.3f sec\n" % (time.time() - start)

  # Fit model
  start = time.time()
  earlystopper = EarlyStopping(patience=100, verbose=1)
  function_time_outs += "Set up EarlyStopper: %.3f sec\n" % (time.time() - start)
  start = time.time()
  checkpointer = ModelCheckpoint(WEIGHTS, verbose=1, save_best_only=True)
  function_time_outs += "Set up Checkpointer: %.3f sec\n" % (time.time() - start)
  start = time.time()
  tensorboard = tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR, histogram_freq=1, profile_batch='500,520')
  function_time_outs += "Set up TensorBoard: %.3f sec\n" % (time.time() - start)

  start = time.time()
  model_results = model.fit_generator(train_generator, validation_data=val_generator, validation_steps=10,
                                      steps_per_epoch=200, epochs=EPOCHS,
                                      callbacks=[tensorboard, earlystopper, checkpointer])
  function_time_outs += "Model training: %.3f sec\n" % (time.time() - start)


  # Predict on train, val and test
  start = time.time()
  model = load_model(WEIGHTS)  # custom_objects={'mean_iou': mean_iou}
  function_time_outs += "Load pretrained model: %.3f sec\n" % (time.time() - start)

  start = time.time()
  preds_train = model.predict(X_train[:train_size], verbose=1)
  preds_val = model.predict(X_train[train_size:], verbose=1)
  preds_test = model.predict(X_test, verbose=1)

  # Threshold predictions
  preds_train_t = (preds_train > 0.5).astype(np.uint8)
  preds_val_t = (preds_val > 0.5).astype(np.uint8)
  preds_test_t = (preds_test > 0.5).astype(np.uint8)
  function_time_outs += "Predict on train & test: %.3f sec\n" % (time.time() - start)

  # Validation Loss and Acc
  start = time.time()
  val_results = model.evaluate(X_train[train_size:], Y_train[train_size:], batch_size=BATCH_SIZE)
  function_time_outs += "Validation eval: %.3f sec" % (time.time() - start)
  print("Validation Loss:", val_results[0])
  print("Validation Accuracy :", val_results[1] * 100, "%")

  # Save all predictions
  if SAVE_PREDS:
    start = time.time()
    for i, id_ in tqdm(enumerate(test_ids), total=len(test_ids)):
      test_mask = resize(np.squeeze(preds_test_t[i]), (sizes_test[i][0], sizes_test[i][1]),
                         mode='constant', preserve_range=True)
      imsave(RESULTS_DIR + '/test%d_pred.png' % i, test_mask)
    function_time_outs += "Save predicted masks: %.3f sec\n" % (time.time() - start)

  # Save example prediction results
  if SAVE_DEMO:
    # Plot learning curve
    plot_learning(model_results, savepath=OUTPUT_DIR + "/learning_curve.png")

    i = 58
    show_images(i, i, X_train, Y_train, preds_train, preds_train_t, savename=RESULTS_DIR + '/train%d_pred.png' % i)

    i = 20
    show_images(i, i, X_train[train_size:], Y_train[train_size:], preds_val, preds_val_t, savename=RESULTS_DIR + '/val%d_pred.png' % i)

    i = 18
    show_images(i, i, X_test, None, preds_test, preds_test_t, savename=RESULTS_DIR + '/test%d_pred.png' % i)

  return function_time_outs
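
The U-Net example above thresholds the sigmoid outputs at 0.5 and resizes each predicted mask back to its original test-image size before writing it out. A minimal sketch of that post-processing (an illustration, not the original code; the skimage imports and the 0-255 scaling so the saved PNGs are visible are assumptions):

import numpy as np
from skimage.io import imsave
from skimage.transform import resize

def save_binary_masks(preds_test, sizes_test, results_dir, threshold=0.5):
  # preds_test: model outputs of shape (n, H, W, 1); sizes_test: original (h, w) per image
  preds_test_t = (preds_test > threshold).astype(np.uint8)
  for i, (h, w) in enumerate(sizes_test):
    mask = resize(np.squeeze(preds_test_t[i]), (h, w),
                  mode='constant', preserve_range=True)
    # scale to 0-255 so the binary mask is visible in an image viewer
    imsave('%s/test%d_pred.png' % (results_dir, i),
           (mask > 0.5).astype(np.uint8) * 255)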
Example #7
def train():
    # load data
    # train_data = [train_size,3]
    # test_data = [test_size,3]
    print("loading data...")
    test_data, train_data = load_data.load_train_test()
    # train_data = test_data[:Train_size, :]
    test_data = test_data[Test_size:, :]

    train_data_node = tf.placeholder(tf.int32, shape=(None, 3))
    train_neg_node = tf.placeholder(tf.int32, shape=(None, 2 * num_sample, 3))

    test_scores_node = tf.placeholder(tf.float32,
                                      shape=(Test_size, entity_size))
    test_labels_node = tf.placeholder(tf.int32, shape=(Test_size, ))

    entity_embedding = tf.Variable(tf.random_uniform(
        [entity_size, embedding_size], -1.0, 1.0),
                                   name="entity_embedding")
    relation_embedding = tf.Variable(tf.random_uniform(
        [relation_size, embedding_size], -1.0, 1.0),
                                     name="relation_embedding")

    # inputs = [batch_size,3]
    # neg_inputs = [batch_size,2*(entity_size-1),3]
    def model(inputs, neg_inputs):
        # [batch_size]
        inputs_h = inputs[:, 0]
        inputs_t = inputs[:, 1]
        inputs_r = inputs[:, 2]
        # [batch_size,2*(entity_size-1)]
        neg_inputs_h = neg_inputs[:, :, 0]
        neg_inputs_t = neg_inputs[:, :, 1]
        neg_inputs_r = neg_inputs[:, :, 2]
        # [batch_size,embedding_size]
        h_embed = tf.nn.embedding_lookup(entity_embedding, inputs_h)
        t_embed = tf.nn.embedding_lookup(entity_embedding, inputs_t)
        r_embed = tf.nn.embedding_lookup(relation_embedding, inputs_r)
        # [batch_size , 2*(entity_size-1),embedding_size]
        h_neg = tf.nn.embedding_lookup(entity_embedding, neg_inputs_h)
        t_neg = tf.nn.embedding_lookup(entity_embedding, neg_inputs_t)
        r_neg = tf.nn.embedding_lookup(relation_embedding, neg_inputs_r)
        # [batch_size,1]
        delta = tf.reduce_sum((h_embed + r_embed - t_embed)**2,
                              1,
                              keep_dims=True)
        # neg_delta = [batch_size,2*(entity_size-1)]
        # use the same TransE score ||h + r - t||^2 as for the positive triples
        neg_delta = tf.reduce_sum((h_neg + r_neg - t_neg)**2, 2)
        # neg_delta = [batch_size,1]; the mean is equivalent to dividing by
        # 2*(entity_size-1)
        neg_delta = tf.reduce_mean(neg_delta, 1, keep_dims=True)
        return delta, neg_delta

    pos_one, neg_one = model(train_data_node, train_neg_node)
    margin = 0.0
    # loss = tf.reduce_mean(tf.maximum(pos_one + margin - neg_one, 0))
    loss = tf.reduce_mean(pos_one + margin - neg_one)

    # predict
    # test_inputs = [batch_size,3]
    def get_embeddings(test_inputs):
        inputs_h = test_inputs[:, 0]
        inputs_t = test_inputs[:, 1]
        # labels = [batch_size]
        inputs_r = test_inputs[:, 2]
        # [batch_size,embedding_size]
        h_embed = tf.nn.embedding_lookup(entity_embedding, inputs_h)
        t_embed = tf.nn.embedding_lookup(entity_embedding, inputs_t)
        r_embed = tf.nn.embedding_lookup(relation_embedding, inputs_r)
        return h_embed, t_embed, r_embed

    def evaluation(scores, labels):
        # get top k
        # scores = [Test_size,entity_size]
        # labels = [Test_size]
        # h_result = [Test_size]
        h_result = tf.nn.in_top_k(scores, labels, k=10)
        # acc
        h_acc = tf.reduce_mean(tf.cast(h_result, tf.float32))
        return h_acc

    h_embed, t_embed, r_embed = get_embeddings(train_data_node)
    acc = evaluation(test_scores_node, test_labels_node)

    # train
    global_step = tf.Variable(0, name="global_step", trainable=False)
    optimizer = tf.train.AdamOptimizer(1e-3)
    grads_and_vars = optimizer.compute_gradients(loss)
    train_op = optimizer.apply_gradients(grads_and_vars,
                                         global_step=global_step)

    # running the training session
    with tf.Session() as sess:
        tf.initialize_all_variables().run()
        print('Initialized!')
        # generate batches
        batches = data_helpers.batch_iter(list(zip(train_data)), BATCH_SIZE,
                                          NUM_EPOCHS)
        # batch count
        batch_count = 0
        epoch = 1
        print("Epoch " + str(epoch) + ":")
        for batch in batches:
            batch_count += 1
            # train process
            x_batch = numpy.squeeze(batch)
            # generate neg data
            neg_x_batch = load_data.generate_neg_data(x_batch,
                                                      num_sample=num_sample)
            feed_dict = {train_data_node: x_batch, train_neg_node: neg_x_batch}
            _, step, losses = sess.run([train_op, global_step, loss],
                                       feed_dict=feed_dict)
            time_str = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            print("{}: step {}, loss {:g}".format(time_str, step, losses))

            # test process
            if float((batch_count * BATCH_SIZE) / Train_size) > epoch:
                epoch += 1
                print("Epoch " + str(epoch) + ":")
            if batch_count % EVAL_FREQUENCY == 0:
                # get test scores
                feed_dict = {train_data_node: test_data}
                # get embedding
                print("get embedding...")
                h_embedding, t_embedding, r_embedding, entity_embed = sess.run(
                    [h_embed, t_embed, r_embed, entity_embedding],
                    feed_dict=feed_dict)
                # compute score
                t_start = time.time()
                h_acc, t_acc, h_mean_rank, t_mean_rank = evaluate.compute_acc(
                    h_embedding, t_embedding, r_embedding, entity_embed)
                t_end = time.time()
                t = t_end - t_start
                print("computing acc..., cost :%s" % t)
                hit_acc = (h_acc + t_acc) / 2.0
                mean_rank = int((h_mean_rank + t_mean_rank) / 2)
                time_str = datetime.datetime.now().strftime(
                    "%Y-%m-%d %H:%M:%S")
                print(
                    "{}: step {}, h-acc {:g}, t-acc {:g}, Hit@10 {:g}, h_rank {}, t_rank {}, mean_rank {}"
                    .format(time_str, step, h_acc, t_acc, hit_acc, h_mean_rank,
                            t_mean_rank, mean_rank))
                print("\n")
Example #8
def main(args):
    N_ITER = args.n_iter
    K_CV = args.k_cv

    print('loading data')
    label_name = 'label'
    train, test = load_train_test(f=args.infile, label_name=label_name)

    #train=train.head(1000).copy()

    print('loaded', train.shape[0], test.shape[0])
    print('label rate', train[label_name].mean(), test[label_name].mean())

    # Read in feature set to use
    with open('models/in_vars.p', 'rb') as f:
        in_vars = pickle.load(f)
    print('Using', len(in_vars), 'vars')

    if args.model_type == 'rf':
        print('Fitting a RandomForestClassifier')
        rf = RandomForestClassifier(oob_score=True,
                                    bootstrap=True,
                                    random_state=42)

        # Look at parameters used by our current forest
        print('Starting parameters currently in use:\n')
        pprint(rf.get_params())

        # Number of trees in random forest
        n_estimators = [
            int(x) for x in np.linspace(start=10, stop=1000, num=10)
        ]
        # Number of features to consider at every split
        max_features = ['auto', 'sqrt']
        # Maximum number of levels in tree
        max_depth = [int(x) for x in np.linspace(5, 100, num=20)]
        max_depth.append(None)
        # Minimum number of samples required to split a node
        min_samples_split = [2, 5, 10]
        # Minimum number of samples required at each leaf node
        min_samples_leaf = [1, 2, 4]
        # Assemble the random grid of hyperparameters to sample from
        random_grid = {
            'n_estimators': n_estimators,
            'max_features': max_features,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
            'min_samples_leaf': min_samples_leaf
        }
        pprint(random_grid)

        # Use the random grid to search for best hyperparameters
        # Random search of parameters, using k fold cross validation,
        # search across n_iter different combinations, and use all available cores
        rf_random = RandomizedSearchCV(estimator=rf,
                                       param_distributions=random_grid,
                                       scoring='roc_auc',
                                       n_iter=N_ITER,
                                       cv=K_CV,
                                       verbose=2,
                                       random_state=42,
                                       n_jobs=-1)
        # Fit the random search model
        rf_random.fit(train[in_vars], train['label'])

        # Save Model
        with open('models/rf_random_search.p', 'wb') as f:
            pickle.dump(rf_random, f, pickle.HIGHEST_PROTOCOL)
        with open('models/rf_args.p', 'wb') as f:
            pickle.dump(args, f, pickle.HIGHEST_PROTOCOL)

    if args.model_type == 'gb':
        print('Fitting a GradientBoostingClassifier')
        gb = GradientBoostingClassifier(verbose=1,
                                        subsample=0.9,
                                        random_state=42,
                                        n_iter_no_change=5)

        print('Parameters currently in use:\n')
        pprint(gb.get_params())

        max_features = ['auto', 'sqrt']
        learning_rate = np.linspace(0.01, 0.2, num=10)
        max_depth = [int(x) for x in np.linspace(5, 100, num=20)]
        max_depth.append(None)
        min_samples_leaf = [1, 2, 4]
        min_samples_split = [2, 5, 10]
        n_estimators = [
            int(x) for x in np.linspace(start=10, stop=1000, num=10)
        ]
        subsample = [0.5, 0.8, 1.0]
        loss = ['deviance', 'exponential']

        random_grid = {
            'max_features': max_features,
            'max_depth': max_depth,
            'min_samples_leaf': min_samples_leaf,
            'min_samples_split': min_samples_split,
            'n_estimators': n_estimators,
            'subsample': subsample,
            'learning_rate': learning_rate,
            'loss': loss
        }
        pprint(random_grid)
        gb_random = RandomizedSearchCV(estimator=gb,
                                       param_distributions=random_grid,
                                       scoring='roc_auc',
                                       n_iter=N_ITER,
                                       cv=K_CV,
                                       verbose=2,
                                       random_state=42,
                                       n_jobs=-1)
        # Fit the random search model
        gb_random.fit(train[in_vars], train['label'])
        with open('models/gb_random_search.p', 'wb') as f:
            pickle.dump(gb_random, f, pickle.HIGHEST_PROTOCOL)
        with open('models/gb_args.p', 'wb') as f:
            pickle.dump(args, f, pickle.HIGHEST_PROTOCOL)
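
The random search above pickles the fitted RandomizedSearchCV objects but does not score them on the held-out test frame. A small follow-up sketch, assuming the file name and the 'label' column used above, that reloads the search and reports test ROC AUC (the same metric the search optimized):

import pickle
from sklearn.metrics import roc_auc_score

def score_best_model(test, in_vars, search_path='models/rf_random_search.p'):
    with open(search_path, 'rb') as f:
        search = pickle.load(f)
    print('Best CV AUC:', search.best_score_)
    print('Best params:', search.best_params_)
    probs = search.best_estimator_.predict_proba(test[in_vars])[:, 1]
    print('Test AUC:', roc_auc_score(test['label'], probs))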