def median_forecast(remove_outliers=False, date_min=None, date_max=None):
    train = load_train()
    if remove_outliers:
        train = replace_outliers(train)
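    # melt the last 49 daily-visit columns into long (Page, date, Visits) format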
    train_flattened = pd.melt(train[list(train.columns[-49:]) + ['Page']],
                              id_vars='Page',
                              var_name='date',
                              value_name='Visits')
    train_flattened['date'] = train_flattened['date'].astype('datetime64[ns]')
    if date_min and date_max:
        date_min = datetime.datetime.strptime(date_min, '%Y-%m-%d')
        date_max = datetime.datetime.strptime(date_max, '%Y-%m-%d')
        train_flattened = train_flattened[
            (train_flattened['date'] >= date_min)
            & (train_flattened['date'] <= date_max)]
    # note: despite its name, this column is a weekend flag (Sat/Sun -> 1.0)
    train_flattened['dayofweek'] = (train_flattened.date.dt.dayofweek >=
                                    5).astype(float)

    test = pd.read_csv(os.path.join('..', 'input', 'key_1.csv'))
    test['date'] = test.Page.apply(lambda a: a[-10:])
    test['Page'] = test.Page.apply(lambda a: a[:-11])
    test['date'] = test['date'].astype('datetime64[ns]')
    test['dayofweek'] = (test.date.dt.dayofweek >= 5).astype(float)

    train_page_per_dow = train_flattened.groupby(['Page', 'dayofweek'
                                                  ]).median().reset_index()

    test = test.merge(train_page_per_dow, how='left')
    test.loc[test.Visits.isnull(), 'Visits'] = 0

    test[['Id', 'Visits']].to_csv(os.path.join('..', 'output',
                                               'submission_median.csv'),
                                  index=False)


def median_week():
    train = load_train()
    train_flattened = pd.melt(train[list(train.columns[-49:]) + ['Page']],
                              id_vars='Page',
                              var_name='date',
                              value_name='Visits')
    train_flattened['date'] = train_flattened['date'].astype('datetime64[ns]')
    train_flattened['weekend'] = (train_flattened.date.dt.dayofweek >=
                                  5).astype(float)

    test = pd.read_csv(os.path.join('..', 'input', 'key_1.csv'))
    test['date'] = test.Page.apply(lambda a: a[-10:])
    test['Page'] = test.Page.apply(lambda a: a[:-11])
    test['date'] = test['date'].astype('datetime64[ns]')
    test['weekend'] = (test.date.dt.dayofweek >= 5).astype(float)

    train_page_per_dow = train_flattened.groupby(['Page', 'weekend'
                                                  ]).median().reset_index()

    test = test.merge(train_page_per_dow, how='left')
    test.loc[test.Visits.isnull(), 'Visits'] = 0

    test[['Id', 'Visits']].to_csv(os.path.join('..', 'output',
                                               'submission_median.csv'),
                                  index=False)
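
A minimal usage sketch, assuming load_train() and replace_outliers() are defined elsewhere in this project (the date window below is only illustrative): build weekday/weekend medians from outlier-cleaned history restricted to a recent window, then write the submission.

median_forecast(remove_outliers=True, date_min='2016-06-01', date_max='2016-12-31')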
Example #3
def main():
    parser = ld.parse_arguments()

    print "#### LEAVE ONE OUT "
    print "# KNN Classifier", parser.k

    # stopwords
    stopwords = None
    if parser.stopwords_path:
        stopwords = ld.load_stopwords(parser.stopwords_path)

    # loading the necessary data
    (vocabulary, neigh_classes) = ld.load_train(parser.train_path, stopwords)

    # transforming each item to a v-dimensional space
    train = space.train_transform(vocabulary, parser.train_path)

    print "# Classifying"
    acc = 0

    for x in xrange(len(train)):
        dist_heap = []
        item = train[x]
        for i in xrange(len(train)):
            # skip the element itself
            if x == i:
                continue
            point = train[i]
            distance = spd.euclidean(item, point)

            tup = (distance, i)
            heapq.heappush(dist_heap, tup)

        # take the k nearest points (smallest distances)
        top_k = heapq.nsmallest(parser.k, dist_heap)

        # classify by majority vote over the k neighbours
        classification = np.zeros(2)
        for (_, idi) in top_k:
            classe = neigh_classes[idi]
            classification[int(classe)] += 1

        out_class = 0
        if classification[0] >= classification[1]:
            out_class = 0
        else:
            out_class = 1

        print x, " -> ", out_class, neigh_classes[x]

        # count correct predictions
        if out_class == neigh_classes[x]:
            acc += 1

    print "# Acurácia para ", parser.k, ": ",
    print acc, float(acc) / float(len(train))
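
The same leave-one-out loop in a more compact form; a sketch assuming `train` is a list of feature vectors and `neigh_classes` holds 0/1 labels (tie-breaking differs slightly from the explicit loop above, which prefers class 0 on ties):

import heapq
from collections import Counter
import scipy.spatial.distance as spd

def loo_knn_accuracy(train, neigh_classes, k):
    correct = 0
    for x, item in enumerate(train):
        # k nearest neighbours of item, skipping item itself
        top_k = heapq.nsmallest(
            k, ((spd.euclidean(item, other), i)
                for i, other in enumerate(train) if i != x))
        # majority vote over the neighbours' classes
        vote = Counter(int(neigh_classes[i]) for _, i in top_k).most_common(1)[0][0]
        if vote == neigh_classes[x]:
            correct += 1
    return float(correct) / len(train)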
Example #4
File: train_att.py  Project: yzyz7/attgit
def train_rnn():
    rng = numpy.random.RandomState(1234)

    q = T.lvector('q')
    pa = T.lscalar('pa')   
    pp = T.lvector('pp')
    pt = T.lscalar('pt')   
    na = T.lscalar('na')
    np = T.lvector('np')
    nt = T.lscalar('nt')   

    inputs = [q, pa, pp, pt, na, np, nt]

    dim = 128
    v_size = 7810
    embLayer = emb_layer(pre_train=None, v = v_size, dim = dim) 
    rnnLayer = rnn_layer(input=None, wx=None, wh=None, bh=None, emb_layer = embLayer, nh = dim) 
    margin = 1.0
    att = attention_layer(input = inputs, rnn_layer=rnnLayer, margin = margin)

    cost = att.loss()
    print att.params
    gradient = T.grad(cost, att.params)
    lr = 0.001
    updates = OrderedDict((p,p-lr*g) for p,g in zip(att.params, gradient))
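    # (vanilla SGD: each parameter takes one step of size lr against its gradient)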
    train = theano.function(inputs=[q, pa, pp, pt, na, np, nt], outputs=cost, updates = updates) 
    
    #theano.printing.pydotprint(train, outfile="pic_train.png", compact=True, var_with_name_simple=True)
    #print rnnLayer.emb.eval()[0]

    # Training process

    e0 = rnnLayer.emb.eval()
 
    train_list = load_train('./data/train_5neg.id')
    epoch = 1
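    # no stopping criterion: iterate over the training pairs indefinitely,
    # saving a model snapshot after every epoch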
    while True:
        for t in train_list:
            q, pa, pp, pt, na, np, nt = t
            train(q, pa, pp, pt, na, np, nt)
            rnnLayer.normalize()
            att.normalize()
        att.savemodel('./model/epoch' + str(epoch) + '.model')
        print rnnLayer.emb.eval()[0] - e0[0]
        print 'training epoch ' + str(epoch) + ' done.'
        epoch += 1
Example #5
from model import FFM
os.environ['KMP_DUPLICATE_LIB_OK']='True'


logging.info("# save hparams")
hparams = Hparams()
parser = hparams.parser
hp = parser.parse_args()

if not os.path.exists(hp.logdir):
    os.makedirs(hp.logdir)
save_hparams(hp, hp.logdir)


logging.info("# Prepare train data")
df_train,train_labels = load_train(hp.train_path,hp.item_path,hp.user_path,hp.prefix_sep)
logging.info('df_train.shape ' + str(df_train.shape))
logging.info('train_labels.shape ' + str(train_labels.shape))

# feature vector length
feature_length = df_train.shape[1]
hp.feature_length = feature_length

# list of feature column names
feature_cols = df_train.columns.tolist()
# build the feature2field_dict
feature2field_dict,field_list = get_feature2field_dict(feature_cols,hp.prefix_sep)
hp.field_num = len(field_list)

# number of training samples
train_num = df_train.shape[0]
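
For reference, a minimal sketch of what get_feature2field_dict is assumed to do (the real implementation lives in this project's utilities; the code below is an assumption, not the original): map each one-hot feature column, e.g. gender_male, to the index of its originating field gender by splitting on prefix_sep.

def get_feature2field_dict(feature_cols, prefix_sep='_'):
    field_list = []
    feature2field_dict = {}
    for i, col in enumerate(feature_cols):
        field = col.split(prefix_sep)[0]   # the column prefix names the field
        if field not in field_list:
            field_list.append(field)
        feature2field_dict[i] = field_list.index(field)   # feature index -> field index
    return feature2field_dict, field_list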
Example #6
def evaluate_lenet5(learning_rate=0.05, n_epochs=2000, word_nkerns=50, char_nkerns=4, batch_size=1, window_width=[2, 5],
                    emb_size=50, char_emb_size=4, hidden_size=200,
                    margin=0.5, L2_weight=0.0003, Div_reg=0.03, update_freq=1, norm_threshold=5.0, max_truncate=40, 
                    max_char_len=40, max_des_len=20, max_relation_len=5, max_Q_len=30, train_neg_size=21, 
                    neg_all=100, train_size=200, test_size=200, mark='_forfun'):  #train_size=75909, test_size=17386
#     maxSentLength=max_truncate+2*(window_width-1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath='/mounts/data/proj/wenpeng/Dataset/freebase/SimpleQuestions_v2/'
    triple_files=['annotated_fb_data_train.entitylinking.top20_succSet_asInput.txt', 'annotated_fb_data_test.entitylinking.top20_succSet_asInput.txt']

    rng = numpy.random.RandomState(23455)
    datasets, datasets_test, length_per_example_test, vocab_size, char_size=load_train(triple_files[0], triple_files[1], max_char_len, max_des_len, max_relation_len, max_Q_len, train_size, test_size, mark)#max_char_len, max_des_len, max_relation_len, max_Q_len

    
    print 'vocab_size:', vocab_size, 'char_size:', char_size

    train_data=datasets
#     valid_data=datasets[1]
    test_data=datasets_test
#     result=(pos_entity_char, pos_entity_des, relations, entity_char_lengths, entity_des_lengths, relation_lengths, mention_char_ids, remainQ_word_ids, mention_char_lens, remainQ_word_lens, entity_scores)
#     
    train_pos_entity_char=train_data[0]
    train_pos_entity_des=train_data[1]
    train_relations=train_data[2]
    train_entity_char_lengths=train_data[3]
    train_entity_des_lengths=train_data[4]
    train_relation_lengths=train_data[5]
    train_mention_char_ids=train_data[6]
    train_remainQ_word_ids=train_data[7]
    train_mention_char_lens=train_data[8]
    train_remainQ_word_len=train_data[9]
    train_entity_scores=train_data[10]

    test_pos_entity_char=test_data[0]
    test_pos_entity_des=test_data[1]
    test_relations=test_data[2]
    test_entity_char_lengths=test_data[3]
    test_entity_des_lengths=test_data[4]
    test_relation_lengths=test_data[5]
    test_mention_char_ids=test_data[6]
    test_remainQ_word_ids=test_data[7]
    test_mention_char_lens=test_data[8]
    test_remainQ_word_len=test_data[9]
    test_entity_scores=test_data[10]
# 
#     test_pos_entity_char=test_data[0]       #matrix, one row per example, all head and tail entities, iteratively: 40*2*51
#     test_pos_entity_des=test_data[1]        #matrix, each row for an example: 20*2*51
#     test_relations=test_data[2]             #matrix, each row for an example: 5*51
#     test_entity_char_lengths=test_data[3]   #matrix, each row for an example: 3*2*51  (three values for one entity)
#     test_entity_des_lengths=test_data[4]    #matrix, each row for an example: 3*2*51  (three values for one entity)
#     test_relation_lengths=test_data[5]      #matrix, each row for an example: 3*51
#     test_mention_char_ids=test_data[6]      #matrix, each row for a mention: 40
#     test_remainQ_word_ids=test_data[7]      #matrix, each row for a question: 30
#     test_mention_char_lens=test_data[8]     #matrix, three values for a mention: 3
#     test_remainQ_word_len=test_data[9]      #matrix, three values for a remaining question: 3
    

    train_sizes=[len(train_pos_entity_char), len(train_pos_entity_des), len(train_relations), len(train_entity_char_lengths), len(train_entity_des_lengths),\
           len(train_relation_lengths), len(train_mention_char_ids), len(train_remainQ_word_ids), len(train_mention_char_lens), len(train_remainQ_word_len), len(train_entity_scores)]
    if sum(train_sizes)/len(train_sizes)!=train_size:
        print 'weird size:', train_sizes
        exit(0)

    test_sizes=[len(test_pos_entity_char), len(test_pos_entity_des), len(test_relations), len(test_entity_char_lengths), len(test_entity_des_lengths),\
           len(test_relation_lengths), len(test_mention_char_ids), len(test_remainQ_word_ids), len(test_mention_char_lens), len(test_remainQ_word_len), len(test_entity_scores)]
    if sum(test_sizes)/len(test_sizes)!=test_size:
        print 'weird size:', test_sizes
        exit(0)

    n_train_batches=train_size/batch_size
    n_test_batches=test_size/batch_size
    
    train_batch_start=list(numpy.arange(n_train_batches)*batch_size)
    test_batch_start=list(numpy.arange(n_test_batches)*batch_size)
    
    indices_train_pos_entity_char=pythonList_into_theanoIntMatrix(train_pos_entity_char)
    indices_train_pos_entity_des=pythonList_into_theanoIntMatrix(train_pos_entity_des)
    indices_train_relations=pythonList_into_theanoIntMatrix(train_relations)
    indices_train_entity_char_lengths=pythonList_into_theanoIntMatrix(train_entity_char_lengths)
    indices_train_entity_des_lengths=pythonList_into_theanoIntMatrix(train_entity_des_lengths)
    indices_train_relation_lengths=pythonList_into_theanoIntMatrix(train_relation_lengths)
    indices_train_mention_char_ids=pythonList_into_theanoIntMatrix(train_mention_char_ids)
    indices_train_remainQ_word_ids=pythonList_into_theanoIntMatrix(train_remainQ_word_ids)
    indices_train_mention_char_lens=pythonList_into_theanoIntMatrix(train_mention_char_lens)
    indices_train_remainQ_word_len=pythonList_into_theanoIntMatrix(train_remainQ_word_len)   
    indices_train_entity_scores=pythonList_into_theanoFloatMatrix(train_entity_scores) 
    
#     indices_test_pos_entity_char=pythonList_into_theanoIntMatrix(test_pos_entity_char)
#     indices_test_pos_entity_des=pythonList_into_theanoIntMatrix(test_pos_entity_des)
#     indices_test_relations=pythonList_into_theanoIntMatrix(test_relations)
#     indices_test_entity_char_lengths=pythonList_into_theanoIntMatrix(test_entity_char_lengths)
#     indices_test_entity_des_lengths=pythonList_into_theanoIntMatrix(test_entity_des_lengths)
#     indices_test_relation_lengths=pythonList_into_theanoIntMatrix(test_relation_lengths)
#     indices_test_mention_char_ids=pythonList_into_theanoIntMatrix(test_mention_char_ids)
#     indices_test_remainQ_word_ids=pythonList_into_theanoIntMatrix(test_remainQ_word_ids)
#     indices_test_mention_char_lens=pythonList_into_theanoIntMatrix(test_mention_char_lens)
#     indices_test_remainQ_word_len=pythonList_into_theanoIntMatrix(test_remainQ_word_len)   
#     indices_test_entity_scores=pythonList_into_theanoIntMatrix(test_entity_scores)

    rand_values=random_value_normal((vocab_size+1, emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    rand_values[0]=numpy.array(numpy.zeros(emb_size),dtype=theano.config.floatX)
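    # index 0 is the padding row and is kept at zero; the AdaGrad updates
    # below re-zero this row after every step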
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values=load_word2vec_to_init(rand_values, rootPath+'word_emb'+mark+'.txt')
    embeddings=theano.shared(value=rand_values, borrow=True)      

    char_rand_values=random_value_normal((char_size+1, char_emb_size), theano.config.floatX, numpy.random.RandomState(1234))
    char_rand_values[0]=numpy.array(numpy.zeros(char_emb_size),dtype=theano.config.floatX)
    char_embeddings=theano.shared(value=char_rand_values, borrow=True)      

    
    # allocate symbolic variables for the data
    index = T.lscalar()
    chosed_indices=T.lvector()
    
    ent_char_ids_M = T.lmatrix()   
    ent_lens_M = T.lmatrix()
    men_char_ids_M = T.lmatrix()  
    men_lens_M=T.lmatrix()
    rel_word_ids_M=T.lmatrix()
    rel_word_lens_M=T.lmatrix()
    desH_word_ids_M=T.lmatrix()
    desH_word_lens_M=T.lmatrix()
#     desT_word_ids_M=T.lmatrix()
#     desT_word_lens_M=T.lmatrix()
    q_word_ids_M=T.lmatrix()
    q_word_lens_M=T.lmatrix()
    ent_scores=T.dvector()

#max_char_len, max_des_len, max_relation_len, max_Q_len
#     ent_men_ishape = (char_emb_size, max_char_len)
#     rel_ishape=(emb_size, max_relation_len)
#     des_ishape=(emb_size, max_des_len)
#     q_ishape=(emb_size, max_Q_len)
    
    filter_size=(emb_size,window_width[0])
    char_filter_size=(char_emb_size, window_width[1])
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
#     length_after_wideConv=ishape[1]+filter_size[1]-1
    
    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'
    

    char_filter_shape=(char_nkerns, 1, char_filter_size[0], char_filter_size[1])
    word_filter_shape=(word_nkerns, 1, filter_size[0], filter_size[1])
    char_conv_W, char_conv_b=create_conv_para(rng, filter_shape=char_filter_shape)
    q_rel_conv_W, q_rel_conv_b=create_conv_para(rng, filter_shape=word_filter_shape)
    q_desH_conv_W, q_desH_conv_b=create_conv_para(rng, filter_shape=word_filter_shape)
    params = [char_embeddings, embeddings, char_conv_W, char_conv_b, q_rel_conv_W, q_rel_conv_b, q_desH_conv_W, q_desH_conv_b]
    char_conv_W_into_matrix=char_conv_W.reshape((char_conv_W.shape[0], char_conv_W.shape[2]*char_conv_W.shape[3]))
    q_rel_conv_W_into_matrix=q_rel_conv_W.reshape((q_rel_conv_W.shape[0], q_rel_conv_W.shape[2]*q_rel_conv_W.shape[3]))
    q_desH_conv_W_into_matrix=q_desH_conv_W.reshape((q_desH_conv_W.shape[0], q_desH_conv_W.shape[2]*q_desH_conv_W.shape[3]))
#     load_model_from_file(rootPath, params, '')

    def SimpleQ_matches_Triple(ent_char_ids_f,ent_lens_f,rel_word_ids_f,rel_word_lens_f,desH_word_ids_f,
                       desH_word_lens_f,
                       men_char_ids_f, q_word_ids_f, men_lens_f, q_word_lens_f):
        

#         rng = numpy.random.RandomState(23455)
        ent_char_input = char_embeddings[ent_char_ids_f.flatten()].reshape((batch_size,max_char_len, char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        men_char_input = char_embeddings[men_char_ids_f.flatten()].reshape((batch_size,max_char_len, char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        
        rel_word_input = embeddings[rel_word_ids_f.flatten()].reshape((batch_size,max_relation_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        desH_word_input = embeddings[desH_word_ids_f.flatten()].reshape((batch_size,max_des_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        
#         desT_word_input = embeddings[desT_word_ids_f.flatten()].reshape((batch_size,max_des_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        q_word_input = embeddings[q_word_ids_f.flatten()].reshape((batch_size,max_Q_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
    
    
        #ent_mention
        ent_char_conv = Conv_with_input_para(rng, input=ent_char_input,
                image_shape=(batch_size, 1, char_emb_size, max_char_len),
                filter_shape=char_filter_shape, W=char_conv_W, b=char_conv_b)
        men_char_conv = Conv_with_input_para(rng, input=men_char_input,
                image_shape=(batch_size, 1, char_emb_size, max_char_len),
                filter_shape=char_filter_shape, W=char_conv_W, b=char_conv_b)
        #q-rel
        q_rel_conv = Conv_with_input_para(rng, input=q_word_input,
                image_shape=(batch_size, 1, emb_size, max_Q_len),
                filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b)
        rel_conv = Conv_with_input_para(rng, input=rel_word_input,
                image_shape=(batch_size, 1, emb_size, max_relation_len),
                filter_shape=word_filter_shape, W=q_rel_conv_W, b=q_rel_conv_b)
        #q_desH
        q_desH_conv = Conv_with_input_para(rng, input=q_word_input,
                image_shape=(batch_size, 1, emb_size, max_Q_len),
                filter_shape=word_filter_shape, W=q_desH_conv_W, b=q_desH_conv_b)
        desH_conv = Conv_with_input_para(rng, input=desH_word_input,
                image_shape=(batch_size, 1, emb_size, max_des_len),
                filter_shape=word_filter_shape, W=q_desH_conv_W, b=q_desH_conv_b)
#         #q_desT
#         q_desT_conv = Conv_with_input_para(rng, input=q_word_input,
#                 image_shape=(batch_size, 1, emb_size, max_Q_len),
#                 filter_shape=word_filter_shape, W=q_desT_conv_W, b=q_desT_conv_b)
#         desT_conv = Conv_with_input_para(rng, input=desT_word_input,
#                 image_shape=(batch_size, 1, emb_size, max_des_len),
#                 filter_shape=word_filter_shape, W=q_desT_conv_W, b=q_desT_conv_b)
    #     ent_char_output=debug_print(ent_char_conv.output, 'ent_char.output')
    #     men_char_output=debug_print(men_char_conv.output, 'men_char.output')
        
        
        
        ent_conv_pool=Max_Pooling(rng, input_l=ent_char_conv.output, left_l=ent_lens_f[0], right_l=ent_lens_f[2])
        men_conv_pool=Max_Pooling(rng, input_l=men_char_conv.output, left_l=men_lens_f[0], right_l=men_lens_f[2])
        
#         q_rel_pool=Max_Pooling(rng, input_l=q_rel_conv.output, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2])
        rel_conv_pool=Max_Pooling(rng, input_l=rel_conv.output, left_l=rel_word_lens_f[0], right_l=rel_word_lens_f[2])
        q_rel_pool=Average_Pooling_for_SimpleQA(rng, input_l=q_rel_conv.output, input_r=rel_conv_pool.output_maxpooling, 
                                                left_l=q_word_lens_f[0], right_l=q_word_lens_f[2], length_l=q_word_lens_f[1]+filter_size[1]-1, 
                                                dim=max_Q_len+filter_size[1]-1, topk=2)
        
        
        q_desH_pool=Max_Pooling(rng, input_l=q_desH_conv.output, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2])
        desH_conv_pool=Max_Pooling(rng, input_l=desH_conv.output, left_l=desH_word_lens_f[0], right_l=desH_word_lens_f[2])
        
#         q_desT_pool=Max_Pooling(rng, input_l=q_desT_conv.output, left_l=q_word_lens[0], right_l=q_word_lens[2])
#         desT_conv_pool=Max_Pooling(rng, input_l=desT_conv.output, left_l=desT_word_lens_f[0], right_l=desT_word_lens_f[2])    
        
        
        overall_simi=(cosine(ent_conv_pool.output_maxpooling, men_conv_pool.output_maxpooling)+\
                    cosine(q_rel_pool.topk_max_pooling, rel_conv_pool.output_maxpooling)+\
                    0.1*cosine(q_desH_pool.output_maxpooling, desH_conv_pool.output_maxpooling))/3.0

#                     cosine(q_desT_pool.output_maxpooling, desT_conv_pool.output_maxpooling)
        return overall_simi
    
    simi_list, updates = theano.scan(
        SimpleQ_matches_Triple,
                sequences=[ent_char_ids_M,ent_lens_M,rel_word_ids_M,rel_word_lens_M,desH_word_ids_M,
                   desH_word_lens_M,
                   men_char_ids_M, q_word_ids_M, men_lens_M, q_word_lens_M])
    
    simi_list+=0.5*ent_scores
    
    posi_simi=simi_list[0]
    nega_simies=simi_list[1:]
    loss_simi_list=T.maximum(0.0, margin-posi_simi.reshape((1,1))+nega_simies) 
    loss_simi=T.mean(loss_simi_list)
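    # pairwise hinge ranking loss: the positive candidate (row 0 of the scan
    # output) must outscore each sampled negative by at least `margin`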

    

    
    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg =debug_print((char_embeddings** 2).sum()+(embeddings** 2).sum()+(char_conv_W** 2).sum()+(q_rel_conv_W** 2).sum()+(q_desH_conv_W** 2).sum(), 'L2_reg')#+(layer1.W** 2).sum()++(embeddings**2).sum()
    diversify_reg= Diversify_Reg(char_conv_W_into_matrix)+Diversify_Reg(q_rel_conv_W_into_matrix)+Diversify_Reg(q_desH_conv_W_into_matrix)
    cost=loss_simi+L2_weight*L2_reg+Div_reg*diversify_reg
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')
    



    test_model = theano.function([ent_char_ids_M, ent_lens_M, men_char_ids_M, men_lens_M, rel_word_ids_M, rel_word_lens_M, desH_word_ids_M, desH_word_lens_M,
                                  q_word_ids_M, q_word_lens_M, ent_scores], [loss_simi, simi_list],on_unused_input='ignore')
#           givens={
#             ent_char_ids_M : test_pos_entity_char[index].reshape((length_per_example_test[index], max_char_len)),  
#             ent_lens_M : test_entity_char_lengths[index].reshape((length_per_example_test[index], 3)),
#             men_char_ids_M : test_mention_char_ids[index].reshape((length_per_example_test[index], max_char_len)),  
#             men_lens_M : test_mention_char_lens[index].reshape((length_per_example_test[index], 3)),
#             rel_word_ids_M : test_relations[index].reshape((length_per_example_test[index], max_relation_len)),  
#             rel_word_lens_M : test_relation_lengths[index].reshape((length_per_example_test[index], 3)),
#             desH_word_ids_M : test_pos_entity_des[index].reshape((length_per_example_test[index], max_des_len)), 
#             desH_word_lens_M : test_entity_des_lengths[index].reshape((length_per_example_test[index], 3)),
# #             desT_word_ids_M : indices_train_pos_entity_des[index].reshape(((neg_all)*2, max_des_len))[1::2], 
# #             desT_word_lens_M : indices_train_entity_des_lengths[index].reshape(((neg_all)*2, 3))[1::2],
#             q_word_ids_M : test_remainQ_word_ids[index].reshape((length_per_example_test[index], max_Q_len)), 
#             q_word_lens_M : test_remainQ_word_len[index].reshape((length_per_example_test[index], 3)),
#             ent_scores : test_entity_scores[index]},
                                  
    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    #+[embeddings]# + layer1.params 
#     params_conv = [conv_W, conv_b]
    
    accumulator=[]
    for para_i in params:
        eps_p=numpy.zeros_like(para_i.get_value(borrow=True),dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))
      
    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i=debug_print(grad_i,'grad_i')
        acc = acc_i + T.sqr(grad_i)
#         updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-10)))   #AdaGrad
#         updates.append((acc_i, acc))    
        if param_i == embeddings:
            updates.append((param_i, T.set_subtensor((param_i - learning_rate * grad_i / T.sqrt(acc+1e-10))[0], theano.shared(numpy.zeros(emb_size)))))   #Ada
        elif param_i == char_embeddings:
            updates.append((param_i, T.set_subtensor((param_i - learning_rate * grad_i / T.sqrt(acc+1e-10))[0], theano.shared(numpy.zeros(char_emb_size)))))   #AdaGrad
        else:
            updates.append((param_i, param_i - learning_rate * grad_i / T.sqrt(acc+1e-10)))   #AdaGrad
        updates.append((acc_i, acc)) 
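    # AdaGrad: accumulate squared gradients in acc_i and scale each step by
    # 1/sqrt(acc + 1e-10); the two embedding matrices additionally get their
    # padding row (index 0) reset to zeros after every update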
  
    train_model = theano.function([index, chosed_indices], [loss_simi, cost], updates=updates,
          givens={
            ent_char_ids_M : indices_train_pos_entity_char[index].reshape((neg_all, max_char_len))[chosed_indices].reshape((train_neg_size, max_char_len)),  
            ent_lens_M : indices_train_entity_char_lengths[index].reshape((neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)),
            men_char_ids_M : indices_train_mention_char_ids[index].reshape((neg_all, max_char_len))[chosed_indices].reshape((train_neg_size, max_char_len)),  
            men_lens_M : indices_train_mention_char_lens[index].reshape((neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)),
            rel_word_ids_M : indices_train_relations[index].reshape((neg_all, max_relation_len))[chosed_indices].reshape((train_neg_size, max_relation_len)),  
            rel_word_lens_M : indices_train_relation_lengths[index].reshape((neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)),
            desH_word_ids_M : indices_train_pos_entity_des[index].reshape((neg_all, max_des_len))[chosed_indices].reshape((train_neg_size, max_des_len)), 
            desH_word_lens_M : indices_train_entity_des_lengths[index].reshape((neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)),
#             desT_word_ids_M : indices_train_pos_entity_des[index].reshape(((neg_all)*2, max_des_len))[1::2], 
#             desT_word_lens_M : indices_train_entity_des_lengths[index].reshape(((neg_all)*2, 3))[1::2],
            q_word_ids_M : indices_train_remainQ_word_ids[index].reshape((neg_all, max_Q_len))[chosed_indices].reshape((train_neg_size, max_Q_len)), 
            q_word_lens_M : indices_train_remainQ_word_len[index].reshape((neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)),
            ent_scores : indices_train_entity_scores[index][chosed_indices]
            
            }, on_unused_input='ignore')




    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is
                           # found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
                                  # go through this many
                                  # minibatches before checking the network
                                  # on the validation set; in this case we
                                  # check every epoch

    best_params = None
    best_validation_loss = numpy.inf
    best_iter = 0
    test_score = 0.
    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False
    
    best_test_accu=0.0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index=0


        for batch_start in train_batch_start: 
            # iter counts how many minibatches have been processed so far, across epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index +1
 
            minibatch_index=minibatch_index+1
            #print batch_start
            sample_indices=[0]+random.sample(range(1, neg_all), train_neg_size-1)
            loss_simi_i, cost_i= train_model(batch_start, sample_indices)
#             if batch_start%1==0:
#                 print batch_start, '\t loss_simi_i: ', loss_simi_i, 'cost_i:', cost_i
#                 store_model_to_file(rootPath, params)
 
            if iter % n_train_batches == 0:
                print 'training @ iter = '+str(iter)+'\tloss_simi_i: ', loss_simi_i, 'cost_i:', cost_i
            #if iter ==1:
            #    exit(0)
#             
            if iter % n_train_batches == 0:
                 
                test_loss=[]
                succ=0
                for i in range(test_size):
#                     print 'testing', i, '...'
                    #prepare data
                    test_ent_char_ids_M= numpy.asarray(test_pos_entity_char[i], dtype='int64').reshape((length_per_example_test[i], max_char_len))  
                    test_ent_lens_M = numpy.asarray(test_entity_char_lengths[i], dtype='int64').reshape((length_per_example_test[i], 3))
                    test_men_char_ids_M = numpy.asarray(test_mention_char_ids[i], dtype='int64').reshape((length_per_example_test[i], max_char_len))
                    test_men_lens_M = numpy.asarray(test_mention_char_lens[i], dtype='int64').reshape((length_per_example_test[i], 3))
                    test_rel_word_ids_M = numpy.asarray(test_relations[i], dtype='int64').reshape((length_per_example_test[i], max_relation_len))  
                    test_rel_word_lens_M = numpy.asarray(test_relation_lengths[i], dtype='int64').reshape((length_per_example_test[i], 3))
                    test_desH_word_ids_M =numpy.asarray( test_pos_entity_des[i], dtype='int64').reshape((length_per_example_test[i], max_des_len))
                    test_desH_word_lens_M = numpy.asarray(test_entity_des_lengths[i], dtype='int64').reshape((length_per_example_test[i], 3))
                    test_q_word_ids_M = numpy.asarray(test_remainQ_word_ids[i], dtype='int64').reshape((length_per_example_test[i], max_Q_len))
                    test_q_word_lens_M = numpy.asarray(test_remainQ_word_len[i], dtype='int64').reshape((length_per_example_test[i], 3))
                    test_ent_scores = numpy.asarray(test_entity_scores[i], dtype=theano.config.floatX)
             
             
             
             
                                
                    loss_simi_i,simi_list_i=test_model(test_ent_char_ids_M, test_ent_lens_M, test_men_char_ids_M, test_men_lens_M, test_rel_word_ids_M, test_rel_word_lens_M,
                                                       test_desH_word_ids_M, test_desH_word_lens_M, test_q_word_ids_M, test_q_word_lens_M, test_ent_scores)
#                     print 'simi_list_i:', simi_list_i[:10]
                    test_loss.append(loss_simi_i)
                    if simi_list_i[0]>=max(simi_list_i[1:]):
                        succ+=1
#                     print 'testing', i, '...acc:', succ*1.0/(i+1)
                succ=succ*1.0/test_size
                #now, check MAP and MRR
                print(('\t\t\t\t\t\tepoch %i, minibatch %i/%i, test accu of best '
                           'model %f') %
                          (epoch, minibatch_index, n_train_batches,succ))

                if best_test_accu< succ:
                    best_test_accu=succ
                    store_model_to_file(rootPath, params, mark)
            if patience <= iter:
                done_looping = True
                break
        print 'Epoch ', epoch, 'uses ', (time.clock()-mid_time)/60.0, 'min'
        mid_time = time.clock() 

            
        #print 'Batch_size: ', update_freq
    end_time = time.clock()
    print('Optimization complete.')
    print('Best validation score of %f %% obtained at iteration %i,'\
          'with test performance %f %%' %
          (best_validation_loss * 100., best_iter + 1, test_score * 100.))
    print >> sys.stderr, ('The code for file ' +
                          os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Example #7
#!/usr/bin/env python
# coding=utf-8

import numpy as np

from sklearn.cross_validation import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import StratifiedKFold

from load_data import load_train, load_test 

train_data, labels = load_train()

n_folds = 10
clfs = [RandomForestClassifier(n_estimators=80, bootstrap=True, max_features="auto", max_depth=30, criterion='gini'),
        RandomForestClassifier(n_estimators=80, bootstrap=True, max_features="auto", max_depth=30, criterion='entropy'),
        RandomForestClassifier(n_estimators=80, bootstrap=True, max_features="sqrt", max_depth=30, criterion='gini'),
        ExtraTreesClassifier(n_estimators=50, criterion='gini'),
        ExtraTreesClassifier(n_estimators=60, criterion='entropy'),
        GradientBoostingClassifier(subsample=0.5, max_depth=20, n_estimators=50)]

datas, labels = load_train()

split_points = 12000
train_datas = np.array(datas[0:split_points])
train_labels = np.array(labels[0:split_points])
val_datas = np.array(datas[split_points:])
val_labels = np.array(labels[split_points:])
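
StratifiedKFold and LogisticRegression are imported above but unused in this excerpt, so the script presumably continues into a stacking/blending step. A minimal sketch of that pattern with the (deprecated) sklearn.cross_validation API used here, assuming a binary target; this is an assumption, not the original code:

# out-of-fold predictions from each base classifier become the features
# of a logistic-regression meta-learner
blend_train = np.zeros((len(train_datas), len(clfs)))
for j, clf in enumerate(clfs):
    for train_idx, val_idx in StratifiedKFold(train_labels, n_folds=n_folds):
        clf.fit(train_datas[train_idx], train_labels[train_idx])
        blend_train[val_idx, j] = clf.predict_proba(train_datas[val_idx])[:, 1]

meta = LogisticRegression()
meta.fit(blend_train, train_labels)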
Example #8
    return model


# model parameters:
vocabs = 3000
max_sentence = 40
batch_size = 50
latent_dim = 512
dim = 4096
samples = 1450
train_samples = 1450
val_samples = samples - train_samples

if __name__ == '__main__':
    ### Load data
    xx, xx_decoder, yy, EOS = load_train(max_sentence, vocabs)
    # standardize each of the `dim` feature dimensions across all frames of all samples
    xx = xx.reshape(samples * 80, dim)
    xx = (xx - xx.mean(axis=0)) / (xx.std(axis=0) + 0.001)
    xx = xx.reshape(samples, 80, dim)



    EOS = to_categorical(EOS, vocabs)
    pad = to_categorical(0, vocabs)
    masked_categorical_crossentropy = get_loss(pad)
    generator = cap_generator(xx, xx_decoder, yy, batch_size)
    generator_val = val_generator(xx,xx_decoder,yy, val_samples)

    model, encoder = seq2seq(latent_dim, dim, vocabs, pad)
    print(model.summary())
    #model = s2vt(latent_dim)
Example #9
"""
Break up train data to each individual week
"""
import pandas as pd

from load_data import load_train

all_data = load_train()

for week in range(3, 10):
    fname = '../week' + str(week) + '.csv'
    all_data[all_data['Semana']==week].to_csv(fname)
Example #10
def __init__(self, yearly_seasonality=True):
    self.train = load_train()
    self.yearly_seasonality = yearly_seasonality
    self.app_path = os.path.dirname(os.path.realpath(__file__))
Example #11
def evaluate_lenet5(learning_rate=0.05,
                    n_epochs=2000,
                    word_nkerns=50,
                    char_nkerns=4,
                    batch_size=1,
                    window_width=[2, 5],
                    emb_size=50,
                    char_emb_size=4,
                    hidden_size=200,
                    margin=0.5,
                    L2_weight=0.0003,
                    Div_reg=0.03,
                    update_freq=1,
                    norm_threshold=5.0,
                    max_truncate=40,
                    max_char_len=40,
                    max_des_len=20,
                    max_relation_len=5,
                    max_Q_len=30,
                    train_neg_size=21,
                    neg_all=100,
                    train_size=200,
                    test_size=200,
                    mark='_forfun'):  #train_size=75909, test_size=17386
    #     maxSentLength=max_truncate+2*(window_width-1)
    model_options = locals().copy()
    print "model options", model_options
    rootPath = '/mounts/data/proj/wenpeng/Dataset/freebase/SimpleQuestions_v2/'
    triple_files = [
        'annotated_fb_data_train.entitylinking.top20_succSet_asInput.txt',
        'annotated_fb_data_test.entitylinking.top20_succSet_asInput.txt'
    ]

    rng = numpy.random.RandomState(23455)
    datasets, datasets_test, length_per_example_test, vocab_size, char_size = load_train(
        triple_files[0], triple_files[1], max_char_len, max_des_len,
        max_relation_len, max_Q_len, train_size, test_size,
        mark)  #max_char_len, max_des_len, max_relation_len, max_Q_len

    print 'vocab_size:', vocab_size, 'char_size:', char_size

    train_data = datasets
    #     valid_data=datasets[1]
    test_data = datasets_test
    #     result=(pos_entity_char, pos_entity_des, relations, entity_char_lengths, entity_des_lengths, relation_lengths, mention_char_ids, remainQ_word_ids, mention_char_lens, remainQ_word_lens, entity_scores)
    #
    train_pos_entity_char = train_data[0]
    train_pos_entity_des = train_data[1]
    train_relations = train_data[2]
    train_entity_char_lengths = train_data[3]
    train_entity_des_lengths = train_data[4]
    train_relation_lengths = train_data[5]
    train_mention_char_ids = train_data[6]
    train_remainQ_word_ids = train_data[7]
    train_mention_char_lens = train_data[8]
    train_remainQ_word_len = train_data[9]
    train_entity_scores = train_data[10]

    test_pos_entity_char = test_data[0]
    test_pos_entity_des = test_data[1]
    test_relations = test_data[2]
    test_entity_char_lengths = test_data[3]
    test_entity_des_lengths = test_data[4]
    test_relation_lengths = test_data[5]
    test_mention_char_ids = test_data[6]
    test_remainQ_word_ids = test_data[7]
    test_mention_char_lens = test_data[8]
    test_remainQ_word_len = test_data[9]
    test_entity_scores = test_data[10]
    #
    #     test_pos_entity_char=test_data[0]       #matrix, one row per example, all head and tail entities, iteratively: 40*2*51
    #     test_pos_entity_des=test_data[1]        #matrix, each row for an example: 20*2*51
    #     test_relations=test_data[2]             #matrix, each row for an example: 5*51
    #     test_entity_char_lengths=test_data[3]   #matrix, each row for an example: 3*2*51  (three values for one entity)
    #     test_entity_des_lengths=test_data[4]    #matrix, each row for an example: 3*2*51  (three values for one entity)
    #     test_relation_lengths=test_data[5]      #matrix, each row for an example: 3*51
    #     test_mention_char_ids=test_data[6]      #matrix, each row for a mention: 40
    #     test_remainQ_word_ids=test_data[7]      #matrix, each row for a question: 30
    #     test_mention_char_lens=test_data[8]     #matrix, three values for a mention: 3
    #     test_remainQ_word_len=test_data[9]      #matrix, three values for a remaining question: 3


    train_sizes=[len(train_pos_entity_char), len(train_pos_entity_des), len(train_relations), len(train_entity_char_lengths), len(train_entity_des_lengths),\
           len(train_relation_lengths), len(train_mention_char_ids), len(train_remainQ_word_ids), len(train_mention_char_lens), len(train_remainQ_word_len), len(train_entity_scores)]
    if sum(train_sizes) / len(train_sizes) != train_size:
        print 'weird size:', train_sizes
        exit(0)

    test_sizes=[len(test_pos_entity_char), len(test_pos_entity_des), len(test_relations), len(test_entity_char_lengths), len(test_entity_des_lengths),\
           len(test_relation_lengths), len(test_mention_char_ids), len(test_remainQ_word_ids), len(test_mention_char_lens), len(test_remainQ_word_len), len(test_entity_scores)]
    if sum(test_sizes) / len(test_sizes) != test_size:
        print 'weird size:', test_sizes
        exit(0)

    n_train_batches = train_size / batch_size
    n_test_batches = test_size / batch_size

    train_batch_start = list(numpy.arange(n_train_batches) * batch_size)
    test_batch_start = list(numpy.arange(n_test_batches) * batch_size)

    indices_train_pos_entity_char = pythonList_into_theanoIntMatrix(
        train_pos_entity_char)
    indices_train_pos_entity_des = pythonList_into_theanoIntMatrix(
        train_pos_entity_des)
    indices_train_relations = pythonList_into_theanoIntMatrix(train_relations)
    indices_train_entity_char_lengths = pythonList_into_theanoIntMatrix(
        train_entity_char_lengths)
    indices_train_entity_des_lengths = pythonList_into_theanoIntMatrix(
        train_entity_des_lengths)
    indices_train_relation_lengths = pythonList_into_theanoIntMatrix(
        train_relation_lengths)
    indices_train_mention_char_ids = pythonList_into_theanoIntMatrix(
        train_mention_char_ids)
    indices_train_remainQ_word_ids = pythonList_into_theanoIntMatrix(
        train_remainQ_word_ids)
    indices_train_mention_char_lens = pythonList_into_theanoIntMatrix(
        train_mention_char_lens)
    indices_train_remainQ_word_len = pythonList_into_theanoIntMatrix(
        train_remainQ_word_len)
    indices_train_entity_scores = pythonList_into_theanoFloatMatrix(
        train_entity_scores)

    #     indices_test_pos_entity_char=pythonList_into_theanoIntMatrix(test_pos_entity_char)
    #     indices_test_pos_entity_des=pythonList_into_theanoIntMatrix(test_pos_entity_des)
    #     indices_test_relations=pythonList_into_theanoIntMatrix(test_relations)
    #     indices_test_entity_char_lengths=pythonList_into_theanoIntMatrix(test_entity_char_lengths)
    #     indices_test_entity_des_lengths=pythonList_into_theanoIntMatrix(test_entity_des_lengths)
    #     indices_test_relation_lengths=pythonList_into_theanoIntMatrix(test_relation_lengths)
    #     indices_test_mention_char_ids=pythonList_into_theanoIntMatrix(test_mention_char_ids)
    #     indices_test_remainQ_word_ids=pythonList_into_theanoIntMatrix(test_remainQ_word_ids)
    #     indices_test_mention_char_lens=pythonList_into_theanoIntMatrix(test_mention_char_lens)
    #     indices_test_remainQ_word_len=pythonList_into_theanoIntMatrix(test_remainQ_word_len)
    #     indices_test_entity_scores=pythonList_into_theanoIntMatrix(test_entity_scores)

    rand_values = random_value_normal((vocab_size + 1, emb_size),
                                      theano.config.floatX,
                                      numpy.random.RandomState(1234))
    rand_values[0] = numpy.array(numpy.zeros(emb_size),
                                 dtype=theano.config.floatX)
    #rand_values[0]=numpy.array([1e-50]*emb_size)
    rand_values = load_word2vec_to_init(rand_values,
                                        rootPath + 'word_emb' + mark + '.txt')
    embeddings = theano.shared(value=rand_values, borrow=True)

    char_rand_values = random_value_normal((char_size + 1, char_emb_size),
                                           theano.config.floatX,
                                           numpy.random.RandomState(1234))
    char_rand_values[0] = numpy.array(numpy.zeros(char_emb_size),
                                      dtype=theano.config.floatX)
    char_embeddings = theano.shared(value=char_rand_values, borrow=True)

    # allocate symbolic variables for the data
    index = T.lscalar()
    chosed_indices = T.lvector()

    ent_char_ids_M = T.lmatrix()
    ent_lens_M = T.lmatrix()
    men_char_ids_M = T.lmatrix()
    men_lens_M = T.lmatrix()
    rel_word_ids_M = T.lmatrix()
    rel_word_lens_M = T.lmatrix()
    desH_word_ids_M = T.lmatrix()
    desH_word_lens_M = T.lmatrix()
    #     desT_word_ids_M=T.lmatrix()
    #     desT_word_lens_M=T.lmatrix()
    q_word_ids_M = T.lmatrix()
    q_word_lens_M = T.lmatrix()
    ent_scores = T.dvector()

    #max_char_len, max_des_len, max_relation_len, max_Q_len
    #     ent_men_ishape = (char_emb_size, max_char_len)
    #     rel_ishape=(emb_size, max_relation_len)
    #     des_ishape=(emb_size, max_des_len)
    #     q_ishape=(emb_size, max_Q_len)

    filter_size = (emb_size, window_width[0])
    char_filter_size = (char_emb_size, window_width[1])
    #poolsize1=(1, ishape[1]-filter_size[1]+1) #?????????????????????????????
    #     length_after_wideConv=ishape[1]+filter_size[1]-1

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    print '... building the model'

    char_filter_shape = (char_nkerns, 1, char_filter_size[0],
                         char_filter_size[1])
    word_filter_shape = (word_nkerns, 1, filter_size[0], filter_size[1])
    char_conv_W, char_conv_b = create_conv_para(rng,
                                                filter_shape=char_filter_shape)
    q_rel_conv_W, q_rel_conv_b = create_conv_para(
        rng, filter_shape=word_filter_shape)
    q_desH_conv_W, q_desH_conv_b = create_conv_para(
        rng, filter_shape=word_filter_shape)
    params = [
        char_embeddings, embeddings, char_conv_W, char_conv_b, q_rel_conv_W,
        q_rel_conv_b, q_desH_conv_W, q_desH_conv_b
    ]
    char_conv_W_into_matrix = char_conv_W.reshape(
        (char_conv_W.shape[0], char_conv_W.shape[2] * char_conv_W.shape[3]))
    q_rel_conv_W_into_matrix = q_rel_conv_W.reshape(
        (q_rel_conv_W.shape[0], q_rel_conv_W.shape[2] * q_rel_conv_W.shape[3]))
    q_desH_conv_W_into_matrix = q_desH_conv_W.reshape(
        (q_desH_conv_W.shape[0],
         q_desH_conv_W.shape[2] * q_desH_conv_W.shape[3]))

    #     load_model_from_file(rootPath, params, '')

    def SimpleQ_matches_Triple(ent_char_ids_f, ent_lens_f, rel_word_ids_f,
                               rel_word_lens_f, desH_word_ids_f,
                               desH_word_lens_f, men_char_ids_f, q_word_ids_f,
                               men_lens_f, q_word_lens_f):

        #         rng = numpy.random.RandomState(23455)
        ent_char_input = char_embeddings[ent_char_ids_f.flatten()].reshape(
            (batch_size, max_char_len,
             char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        men_char_input = char_embeddings[men_char_ids_f.flatten()].reshape(
            (batch_size, max_char_len,
             char_emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

        rel_word_input = embeddings[rel_word_ids_f.flatten()].reshape(
            (batch_size, max_relation_len,
             emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        desH_word_input = embeddings[desH_word_ids_f.flatten()].reshape(
            (batch_size, max_des_len,
             emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

        #         desT_word_input = embeddings[desT_word_ids_f.flatten()].reshape((batch_size,max_des_len, emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)
        q_word_input = embeddings[q_word_ids_f.flatten()].reshape(
            (batch_size, max_Q_len,
             emb_size)).transpose(0, 2, 1).dimshuffle(0, 'x', 1, 2)

        #ent_mention
        ent_char_conv = Conv_with_input_para(rng,
                                             input=ent_char_input,
                                             image_shape=(batch_size, 1,
                                                          char_emb_size,
                                                          max_char_len),
                                             filter_shape=char_filter_shape,
                                             W=char_conv_W,
                                             b=char_conv_b)
        men_char_conv = Conv_with_input_para(rng,
                                             input=men_char_input,
                                             image_shape=(batch_size, 1,
                                                          char_emb_size,
                                                          max_char_len),
                                             filter_shape=char_filter_shape,
                                             W=char_conv_W,
                                             b=char_conv_b)
        #q-rel
        q_rel_conv = Conv_with_input_para(rng,
                                          input=q_word_input,
                                          image_shape=(batch_size, 1, emb_size,
                                                       max_Q_len),
                                          filter_shape=word_filter_shape,
                                          W=q_rel_conv_W,
                                          b=q_rel_conv_b)
        rel_conv = Conv_with_input_para(rng,
                                        input=rel_word_input,
                                        image_shape=(batch_size, 1, emb_size,
                                                     max_relation_len),
                                        filter_shape=word_filter_shape,
                                        W=q_rel_conv_W,
                                        b=q_rel_conv_b)
        #q_desH
        q_desH_conv = Conv_with_input_para(rng,
                                           input=q_word_input,
                                           image_shape=(batch_size, 1,
                                                        emb_size, max_Q_len),
                                           filter_shape=word_filter_shape,
                                           W=q_desH_conv_W,
                                           b=q_desH_conv_b)
        desH_conv = Conv_with_input_para(rng,
                                         input=desH_word_input,
                                         image_shape=(batch_size, 1, emb_size,
                                                      max_des_len),
                                         filter_shape=word_filter_shape,
                                         W=q_desH_conv_W,
                                         b=q_desH_conv_b)
        #         #q_desT
        #         q_desT_conv = Conv_with_input_para(rng, input=q_word_input,
        #                 image_shape=(batch_size, 1, emb_size, max_Q_len),
        #                 filter_shape=word_filter_shape, W=q_desT_conv_W, b=q_desT_conv_b)
        #         desT_conv = Conv_with_input_para(rng, input=desT_word_input,
        #                 image_shape=(batch_size, 1, emb_size, max_des_len),
        #                 filter_shape=word_filter_shape, W=q_desT_conv_W, b=q_desT_conv_b)
        #     ent_char_output=debug_print(ent_char_conv.output, 'ent_char.output')
        #     men_char_output=debug_print(men_char_conv.output, 'men_char.output')

        ent_conv_pool = Max_Pooling(rng,
                                    input_l=ent_char_conv.output,
                                    left_l=ent_lens_f[0],
                                    right_l=ent_lens_f[2])
        men_conv_pool = Max_Pooling(rng,
                                    input_l=men_char_conv.output,
                                    left_l=men_lens_f[0],
                                    right_l=men_lens_f[2])

        #         q_rel_pool=Max_Pooling(rng, input_l=q_rel_conv.output, left_l=q_word_lens_f[0], right_l=q_word_lens_f[2])
        rel_conv_pool = Max_Pooling(rng,
                                    input_l=rel_conv.output,
                                    left_l=rel_word_lens_f[0],
                                    right_l=rel_word_lens_f[2])
        q_rel_pool = Average_Pooling_for_SimpleQA(
            rng,
            input_l=q_rel_conv.output,
            input_r=rel_conv_pool.output_maxpooling,
            left_l=q_word_lens_f[0],
            right_l=q_word_lens_f[2],
            length_l=q_word_lens_f[1] + filter_size[1] - 1,
            dim=max_Q_len + filter_size[1] - 1,
            topk=2)

        q_desH_pool = Max_Pooling(rng,
                                  input_l=q_desH_conv.output,
                                  left_l=q_word_lens_f[0],
                                  right_l=q_word_lens_f[2])
        desH_conv_pool = Max_Pooling(rng,
                                     input_l=desH_conv.output,
                                     left_l=desH_word_lens_f[0],
                                     right_l=desH_word_lens_f[2])

        #         q_desT_pool=Max_Pooling(rng, input_l=q_desT_conv.output, left_l=q_word_lens[0], right_l=q_word_lens[2])
        #         desT_conv_pool=Max_Pooling(rng, input_l=desT_conv.output, left_l=desT_word_lens_f[0], right_l=desT_word_lens_f[2])


        overall_simi=(cosine(ent_conv_pool.output_maxpooling, men_conv_pool.output_maxpooling)+\
                    cosine(q_rel_pool.topk_max_pooling, rel_conv_pool.output_maxpooling)+\
                    0.1*cosine(q_desH_pool.output_maxpooling, desH_conv_pool.output_maxpooling))/3.0

        #                     cosine(q_desT_pool.output_maxpooling, desT_conv_pool.output_maxpooling)
        return overall_simi

    simi_list, updates = theano.scan(SimpleQ_matches_Triple,
                                     sequences=[
                                         ent_char_ids_M, ent_lens_M,
                                         rel_word_ids_M, rel_word_lens_M,
                                         desH_word_ids_M, desH_word_lens_M,
                                         men_char_ids_M, q_word_ids_M,
                                         men_lens_M, q_word_lens_M
                                     ])

    simi_list += 0.5 * ent_scores

    posi_simi = simi_list[0]
    nega_simies = simi_list[1:]
    loss_simi_list = T.maximum(
        0.0, margin - posi_simi.reshape((1, 1)) + nega_simies)
    loss_simi = T.mean(loss_simi_list)

    #L2_reg =(layer3.W** 2).sum()+(layer2.W** 2).sum()+(layer1.W** 2).sum()+(conv_W** 2).sum()
    L2_reg = debug_print(
        (char_embeddings**2).sum() + (embeddings**2).sum() +
        (char_conv_W**2).sum() + (q_rel_conv_W**2).sum() +
        (q_desH_conv_W**2).sum(),
        'L2_reg')  #+(layer1.W** 2).sum()++(embeddings**2).sum()
    diversify_reg = Diversify_Reg(char_conv_W_into_matrix) + Diversify_Reg(
        q_rel_conv_W_into_matrix) + Diversify_Reg(q_desH_conv_W_into_matrix)
    cost = loss_simi + L2_weight * L2_reg + Div_reg * diversify_reg
    #cost=debug_print((cost_this+cost_tmp)/update_freq, 'cost')

    test_model = theano.function([
        ent_char_ids_M, ent_lens_M, men_char_ids_M, men_lens_M, rel_word_ids_M,
        rel_word_lens_M, desH_word_ids_M, desH_word_lens_M, q_word_ids_M,
        q_word_lens_M, ent_scores
    ], [loss_simi, simi_list],
                                 on_unused_input='ignore')
    #           givens={
    #             ent_char_ids_M : test_pos_entity_char[index].reshape((length_per_example_test[index], max_char_len)),
    #             ent_lens_M : test_entity_char_lengths[index].reshape((length_per_example_test[index], 3)),
    #             men_char_ids_M : test_mention_char_ids[index].reshape((length_per_example_test[index], max_char_len)),
    #             men_lens_M : test_mention_char_lens[index].reshape((length_per_example_test[index], 3)),
    #             rel_word_ids_M : test_relations[index].reshape((length_per_example_test[index], max_relation_len)),
    #             rel_word_lens_M : test_relation_lengths[index].reshape((length_per_example_test[index], 3)),
    #             desH_word_ids_M : test_pos_entity_des[index].reshape((length_per_example_test[index], max_des_len)),
    #             desH_word_lens_M : test_entity_des_lengths[index].reshape((length_per_example_test[index], 3)),
    # #             desT_word_ids_M : indices_train_pos_entity_des[index].reshape(((neg_all)*2, max_des_len))[1::2],
    # #             desT_word_lens_M : indices_train_entity_des_lengths[index].reshape(((neg_all)*2, 3))[1::2],
    #             q_word_ids_M : test_remainQ_word_ids[index].reshape((length_per_example_test[index], max_Q_len)),
    #             q_word_lens_M : test_remainQ_word_len[index].reshape((length_per_example_test[index], 3)),
    #             ent_scores : test_entity_scores[index]},

    #params = layer3.params + layer2.params + layer1.params+ [conv_W, conv_b]
    #+[embeddings]# + layer1.params
    #     params_conv = [conv_W, conv_b]

    accumulator = []
    for para_i in params:
        eps_p = numpy.zeros_like(para_i.get_value(borrow=True),
                                 dtype=theano.config.floatX)
        accumulator.append(theano.shared(eps_p, borrow=True))

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    updates = []
    for param_i, grad_i, acc_i in zip(params, grads, accumulator):
        grad_i = debug_print(grad_i, 'grad_i')
        acc = acc_i + T.sqr(grad_i)
        new_param_i = param_i - learning_rate * grad_i / T.sqrt(acc + 1e-10)
        if param_i == embeddings:
            # keep row 0 (the padding index) of the word embeddings at zero
            updates.append((param_i,
                            T.set_subtensor(
                                new_param_i[0],
                                numpy.zeros(emb_size,
                                            dtype=theano.config.floatX))))
        elif param_i == char_embeddings:
            # keep row 0 (the padding index) of the char embeddings at zero
            updates.append((param_i,
                            T.set_subtensor(
                                new_param_i[0],
                                numpy.zeros(char_emb_size,
                                            dtype=theano.config.floatX))))
        else:
            updates.append((param_i, new_param_i))
        updates.append((acc_i, acc))

    train_model = theano.function(
        [index, chosed_indices],
        [loss_simi, cost],
        updates=updates,
        givens={
            ent_char_ids_M:
            indices_train_pos_entity_char[index].reshape(
                (neg_all, max_char_len))[chosed_indices].reshape(
                    (train_neg_size, max_char_len)),
            ent_lens_M:
            indices_train_entity_char_lengths[index].reshape(
                (neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)),
            men_char_ids_M:
            indices_train_mention_char_ids[index].reshape(
                (neg_all, max_char_len))[chosed_indices].reshape(
                    (train_neg_size, max_char_len)),
            men_lens_M:
            indices_train_mention_char_lens[index].reshape(
                (neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)),
            rel_word_ids_M:
            indices_train_relations[index].reshape(
                (neg_all, max_relation_len))[chosed_indices].reshape(
                    (train_neg_size, max_relation_len)),
            rel_word_lens_M:
            indices_train_relation_lengths[index].reshape(
                (neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)),
            desH_word_ids_M:
            indices_train_pos_entity_des[index].reshape(
                (neg_all, max_des_len))[chosed_indices].reshape(
                    (train_neg_size, max_des_len)),
            desH_word_lens_M:
            indices_train_entity_des_lengths[index].reshape(
                (neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)),
            q_word_ids_M:
            indices_train_remainQ_word_ids[index].reshape(
                (neg_all, max_Q_len))[chosed_indices].reshape(
                    (train_neg_size, max_Q_len)),
            q_word_lens_M:
            indices_train_remainQ_word_len[index].reshape(
                (neg_all, 3))[chosed_indices].reshape((train_neg_size, 3)),
            ent_scores:
            indices_train_entity_scores[index][chosed_indices]
        },
        on_unused_input='ignore')
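    # Each givens entry slices the stacked candidate arrays for question
    # `index` from (neg_all, ...) rows down to (train_neg_size, ...) rows via
    # `chosed_indices`, so an update sees the gold entity plus a random
    # subset of the negatives.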

    ###############
    # TRAIN MODEL #
    ###############
    print '... training'
    # early-stopping parameters
    patience = 500000000000000  # look at this many examples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
    # considered significant
    validation_frequency = min(n_train_batches, patience / 2)
    # go through this many minibatches before checking the network on the
    # validation set; here that works out to once per epoch

    start_time = time.clock()
    mid_time = start_time

    epoch = 0
    done_looping = False

    best_test_accu = 0.0

    while (epoch < n_epochs) and (not done_looping):
        epoch = epoch + 1
        #for minibatch_index in xrange(n_train_batches): # each batch
        minibatch_index = 0

        for batch_start in train_batch_start:
            # iter counts how many minibatches have been run across all epochs
            iter = (epoch - 1) * n_train_batches + minibatch_index + 1

            minibatch_index = minibatch_index + 1
            #print batch_start
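            # index 0 is always the gold entity; draw train_neg_size - 1
            # negatives at random from the remaining candidates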
            sample_indices = [0] + random.sample(range(1, neg_all),
                                                 train_neg_size - 1)
            loss_simi_i, cost_i = train_model(batch_start, sample_indices)

            if iter % n_train_batches == 0:
                print 'training @ iter = ' + str(
                    iter) + '\tloss_simi_i: ', loss_simi_i, 'cost_i:', cost_i
            if iter % n_train_batches == 0:

                test_loss = []
                succ = 0
                for i in range(test_size):
                    #                     print 'testing', i, '...'
                    #prepare data
                    test_ent_char_ids_M = numpy.asarray(
                        test_pos_entity_char[i], dtype='int64').reshape(
                            (length_per_example_test[i], max_char_len))
                    test_ent_lens_M = numpy.asarray(
                        test_entity_char_lengths[i], dtype='int64').reshape(
                            (length_per_example_test[i], 3))
                    test_men_char_ids_M = numpy.asarray(
                        test_mention_char_ids[i], dtype='int64').reshape(
                            (length_per_example_test[i], max_char_len))
                    test_men_lens_M = numpy.asarray(
                        test_mention_char_lens[i], dtype='int64').reshape(
                            (length_per_example_test[i], 3))
                    test_rel_word_ids_M = numpy.asarray(
                        test_relations[i], dtype='int64').reshape(
                            (length_per_example_test[i], max_relation_len))
                    test_rel_word_lens_M = numpy.asarray(
                        test_relation_lengths[i], dtype='int64').reshape(
                            (length_per_example_test[i], 3))
                    test_desH_word_ids_M = numpy.asarray(
                        test_pos_entity_des[i], dtype='int64').reshape(
                            (length_per_example_test[i], max_des_len))
                    test_desH_word_lens_M = numpy.asarray(
                        test_entity_des_lengths[i], dtype='int64').reshape(
                            (length_per_example_test[i], 3))
                    test_q_word_ids_M = numpy.asarray(
                        test_remainQ_word_ids[i], dtype='int64').reshape(
                            (length_per_example_test[i], max_Q_len))
                    test_q_word_lens_M = numpy.asarray(
                        test_remainQ_word_len[i], dtype='int64').reshape(
                            (length_per_example_test[i], 3))
                    test_ent_scores = numpy.asarray(test_entity_scores[i],
                                                    dtype=theano.config.floatX)

                    loss_simi_i, simi_list_i = test_model(
                        test_ent_char_ids_M, test_ent_lens_M,
                        test_men_char_ids_M, test_men_lens_M,
                        test_rel_word_ids_M, test_rel_word_lens_M,
                        test_desH_word_ids_M, test_desH_word_lens_M,
                        test_q_word_ids_M, test_q_word_lens_M, test_ent_scores)
                    #                     print 'simi_list_i:', simi_list_i[:10]
                    test_loss.append(loss_simi_i)
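                    # count a hit iff the gold candidate (index 0) scores at
                    # least as high as every negative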
                    if simi_list_i[0] >= max(simi_list_i[1:]):
                        succ += 1
                succ = succ * 1.0 / test_size
                # report ranking accuracy over the test questions
                print((
                    '\t\t\t\t\t\tepoch %i, minibatch %i/%i, test accu of best '
                    'model %f') %
                      (epoch, minibatch_index, n_train_batches, succ))

                if best_test_accu < succ:
                    best_test_accu = succ
                    store_model_to_file(rootPath, params, mark)
            if patience <= iter:
                done_looping = True
                break
        print 'Epoch ', epoch, 'uses ', (time.clock() - mid_time) / 60.0, 'min'
        mid_time = time.clock()

    end_time = time.clock()
    print('Optimization complete.')
    print('Best test accuracy %f %% obtained during training' %
          (best_test_accu * 100.))
    print >> sys.stderr, ('The code for file ' + os.path.split(__file__)[1] +
                          ' ran for %.2fm' % ((end_time - start_time) / 60.))
Example #12
import os
import time

import matplotlib.pyplot as plt

from load_data import load_train

start_time = time.time()
subset = False

# create the output directory up front; savefig fails if it is missing
if not os.path.isdir('histograms'):
    os.makedirs('histograms')

all_data = load_train(subset)
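# Column names follow the Grupo Bimbo demand data (an assumption based on the
# labels below): Semana is the week number, Venta_uni_hoy the units sold that
# week, Venta_hoy the same sales in pesos.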

# Histogram for weeks
plt.hist(all_data['Semana'], bins=7)
plt.xlabel('Week')
plt.ylabel('Count')
plt.title('Week histogram')
plt.grid(True)
plt.savefig('histograms/week.png', format='png')
plt.clf()

# Histogram for weekly sales units
plt.hist(all_data['Venta_uni_hoy'], log=True, bins=50)
plt.xlabel('Unit sales per week')
plt.ylabel('Count')
plt.title('Histogram of unit sales per week')
plt.grid(True)
plt.savefig('histograms/unit_sales.png', format='png')
plt.clf()
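# log=True puts the count axis on a log scale, which keeps the long right
# tail of the sales distribution visible.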

# Histogram for weekly sales in pesos
plt.hist(all_data['Venta_hoy'], log=True, bins=50)
plt.xlabel('Peso sales per week')
plt.ylabel('Count')
plt.title('Histogram of peso sales per week')
plt.grid(True)
plt.savefig('histograms/peso_sales.png', format='png')
plt.clf()
Example #13
# Assumed imports for this snippet (not shown in the original): heapq and
# numpy are standard; ld, space and result are the project-local loading,
# vector-space and evaluation modules used below, and spd is
# scipy.spatial.distance.
import heapq
from sys import stderr

import numpy as np
import scipy.spatial.distance as spd

import load_data as ld
import result
import space


def main():
    print "# KNN Classifier"
    parser = ld.parse_arguments()

    stopwords = None
    if parser.stopwords_path:
        stopwords = ld.load_stopwords(parser.stopwords_path)

    # printing args
    print '\t-k = ' + str(parser.k)
    print '\t-d = ' + parser.distance

    # loading the necessary data
    (vocabulary, neigh_classes) = ld.load_train(parser.train_path, stopwords)

    print "# Vocabulary size:", len(vocabulary)

    # transforming each item to a v-dimensional space
    (train, test) = space.transform(vocabulary, parser.train_path,
                                    parser.test_path)

    # output file
    out_path = parser.distance + '_' + str(parser.k)
    out_path += '.txt'
    out_file = open(out_path, 'w')

    # knn classification
    print "# Classifying", int(len(train) * parser.percentage)
    for item in test:
        dist_heap = []

        # calculates the distance to every point in the training set
        for i in xrange(int(len(train) * parser.percentage)):
            point = train[i]
            distance = 0.0

            if parser.distance == 'cosine':
                distance = spd.cosine(item, point)
            elif parser.distance == 'jaccard':
                distance = spd.jaccard(item, point)
            elif parser.distance == 'euclidean':
                distance = spd.euclidean(item, point)
            elif parser.distance == 'hamming':
                distance = spd.hamming(item, point)
            elif parser.distance == 'correlation':
                distance = spd.correlation(item, point)
            elif parser.distance == 'manhattan':
                distance = spd.cityblock(item, point)
            else:
                print >> stderr, "ERROR! - Invalid distance measure given."
                exit()

            tup = (distance, i)
            heapq.heappush(dist_heap, tup)

        # take the k points with the smallest distances (nearest neighbors)
        top_k = heapq.nsmallest(parser.k, dist_heap)

        # classifying: majority vote among the k neighbors' classes
        classification = np.zeros(2)
        for (_, idi) in top_k:
            classe = neigh_classes[idi]
            classification[int(classe)] += 1

        # DEBUG
        print classification,

        # outputing classification
        if classification[0] >= classification[1]:
            print >> out_file, '0'
            print '0'
        else:
            print >> out_file, '1'
            print '1'

    print
    print "# Results saved to file: " + out_path
    out_file.close()
    result.result("../data/imdb_test", out_path)
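    # result.result presumably scores the written predictions against the
    # labeled IMDB test set (project-local evaluation helper)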
Example #14
# Load libraries
import pandas as pd
import numpy as np
from datetime import timedelta, datetime
from load_data import load_train, load_test
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

# load the training data
train = load_train()

x_train = train.drop(["date", "unit_sales"], axis=1)
y_train = train.unit_sales

# type conversion for xgboost: encode every feature as integer codes
fac = x_train.columns

for feat in fac:
    x_train[feat] = pd.factorize(x_train[feat], sort=True)[0]
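# (pd.factorize maps each distinct value to an integer code; sort=True makes
# the coding deterministic across runs)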

x_train = np.array(x_train)
# note: GridSearchCV with the sklearn wrapper expects plain arrays;
# xgb.DMatrix is only for the native xgb.train API, so it is not built here

# model fitting with xgboost

reg = xgb.XGBRegressor()

reg_cv = GridSearchCV(reg, {
    "max_depth": [2, 4, 6],
    "n_estimators": [50, 100, 200]
},