# Example #1
def test():
    """Score each test user against sampled negatives plus their clicked news.

    Loads a pre-trained 3D-CNN checkpoint, builds one fixed-length history
    profile per user from the training history (truncated or zero-padded to
    ``num_used_hists`` entries), predicts a click score for every candidate
    article, and hands the (user, news, score) triples to ``save_prediction``.
    Relies on module-level globals: config, mp_test_hist, mp_train_hist,
    mp_doc_embedding source, CNN, etc.
    """

    model = CNN(num_used_hists, num_words, num_word_embedding_dims)
    model.create_model()
    model.get_model_summary()

    print('loading pre-trained model...')
    model.model.load_weights(r'%s/3dcnn_word_80_neg_1_epoch_14_val_loss_0.26.model'%config['3DCNN']['path_model_folder'])

    print('loading doc embedding...')
    mp_doc_embedding = my_utils.read_pkl(config['DEFAULT']['path_all_news_doc_embedding'])

    test_user_ids = list(mp_test_hist.keys())
    test_news_ids = get_test_news_ids()

    user_ids, news_ids = [], []

    print('start predicting...')
    user_in, article_in = [], []
    num_negatives = int(config['DEFAULT']['num_test_negatives'])
    for uid in tqdm(test_user_ids):
        clicked = set(mp_test_hist[uid])
        candidates = list(test_news_ids - clicked)
        hist = mp_train_hist[uid]

        # Fixed-length user profile: first num_used_hists clicks, or all but
        # the last click zero-padded up to num_used_hists.
        if len(hist) > num_used_hists:
            profile = [mp_doc_embedding[nid] for nid in hist[:num_used_hists]]
        else:
            profile = [mp_doc_embedding[nid] for nid in hist[:-1]]
            while len(profile) < num_used_hists:
                profile.append(np.zeros((num_words, num_word_embedding_dims)))

        # Sampled un-clicked negatives first, then every clicked article.
        for nid in random.sample(candidates, num_negatives) + list(clicked):
            user_in.append(profile)
            article_in.append(mp_doc_embedding[nid])
            user_ids.append(uid)
            news_ids.append(nid)

    user_in = np.array(user_in)
    article_in = np.array(article_in)
    # Insert a singleton channel axis after the sample dimension.
    user_in = np.resize(user_in, (user_in.shape[0], 1) + user_in.shape[1:])
    article_in = np.resize(article_in, (article_in.shape[0], 1) + article_in.shape[1:])

    out = model.model.predict([user_in, article_in], batch_size = sample_batch_size, verbose = 1)

    save_prediction(user_ids, news_ids, out)
# Example #2
def train():
    """Build (user-history, article, label) triples and fit the 3D-CNN.

    For users with more than ``num_used_hists`` clicks, the first
    ``num_used_hists`` form the profile and every later click is a positive;
    otherwise all clicks but the last (zero-padded) form the profile and the
    last click is the single positive. Each positive is paired with
    ``num_train_negatives`` randomly drawn negatives. Relies on module-level
    globals (config, mp_train_hist, CNN, get_negative_news_id, ...).
    """

    model = CNN(num_used_hists, num_words, num_word_embedding_dims)
    model.create_model()
    model.get_model_summary()

    print('loading doc embedding...')
    mp_doc_embedding = my_utils.read_pkl(
        config['DEFAULT']['path_all_news_doc_embedding'])

    print('constructing input data...')
    train_news_ids = get_train_news_ids()
    user_in, article_in, truth = [], [], []
    for uid, clicks in tqdm(mp_train_hist.items()):
        # Fixed-length profile plus the list of positive article ids.
        if len(clicks) > num_used_hists:
            profile = [mp_doc_embedding[nid] for nid in clicks[:num_used_hists]]
            positives = clicks[num_used_hists:]
        else:
            profile = [mp_doc_embedding[nid] for nid in clicks[:-1]]
            while len(profile) < num_used_hists:
                profile.append(
                    np.zeros((num_words, num_word_embedding_dims)))
            positives = [clicks[-1]]

        for pos_id in positives:
            article_in.append(mp_doc_embedding[pos_id])
            user_in.append(profile)
            truth.append(1)
            for _ in range(num_train_negatives):
                article_in.append(mp_doc_embedding[get_negative_news_id(
                    train_news_ids, uid)])
                user_in.append(profile)
                truth.append(0)

    print('reshaping input data...')
    user_in = np.array(user_in)
    article_in = np.array(article_in)
    # Insert a singleton channel axis after the sample dimension.
    user_in = np.resize(user_in, (user_in.shape[0], 1) + user_in.shape[1:])
    article_in = np.resize(article_in,
                           (article_in.shape[0], 1) + article_in.shape[1:])

    print('start training...')
    model.fit_model([user_in, article_in], np.array(truth), sample_batch_size,
                    num_epochs)
# Example #3
def batch_test():
    """Like ``test`` but predicts in user batches to bound memory use.

    Loads a pre-trained checkpoint, then for each batch of
    ``test_user_batch_size`` users builds fixed-length history profiles,
    scores sampled negatives plus the user's clicked articles, pads the
    record count to a multiple of ``sample_batch_size`` (required by
    ``predict`` — see link below), and finally saves all predictions.

    NOTE(review): the padding records are appended to user_ids/news_ids and
    their scores reach ``save_prediction`` alongside real records — this
    mirrors the original behavior; confirm downstream de-duplicates.
    Relies on module-level globals (config, mp_test_hist, mp_train_hist, ...).
    """

    model = CNN(num_used_hists, num_words, num_word_embedding_dims)
    model.create_model()
    model.get_model_summary()

    print('loading pre-trained model...')
    model.model.load_weights(r'%s/3dcnn_word_80_neg_1_epoch_07_val_loss_0.27.model'%config['3DCNN']['path_model_folder'])

    print('loading doc embedding...')
    mp_doc_embedding = my_utils.read_pkl(config['DEFAULT']['path_all_news_doc_embedding'])

    def _user_profile(user_id):
        # Fixed-length history: first num_used_hists training clicks, or all
        # but the last click zero-padded up to num_used_hists.
        hist = mp_train_hist[user_id]
        if len(hist) > num_used_hists:
            return [mp_doc_embedding[nid] for nid in hist[:num_used_hists]]
        profile = [mp_doc_embedding[nid] for nid in hist[:-1]]
        while len(profile) < num_used_hists:
            profile.append(np.zeros((num_words, num_word_embedding_dims)))
        return profile

    test_user_ids = list(mp_test_hist.keys())
    test_news_ids = get_test_news_ids()

    user_ids, news_ids, outs = [], [], []

    print('start predicting...')
    num_negatives = int(config['DEFAULT']['num_test_negatives'])
    for user_batch_id in tqdm(range(0, len(test_user_ids), test_user_batch_size)):
        batch_users = test_user_ids[user_batch_id : user_batch_id + test_user_batch_size]
        user_in = []
        article_in = []
        for user_id in batch_users:
            clicked_news_ids = set(mp_test_hist[user_id])
            un_clicked_news_ids = list(test_news_ids - clicked_news_ids)
            user_embedding = _user_profile(user_id)

            # Sampled un-clicked negatives first, then every clicked article.
            for news_id in random.sample(un_clicked_news_ids, num_negatives) + list(clicked_news_ids):
                user_in.append(user_embedding)
                article_in.append(mp_doc_embedding[news_id])
                user_ids.append(user_id)
                news_ids.append(news_id)

        # #records must be a multiple of the batch size or predict crashes:
        # https://stackoverflow.com/a/59971264/2468587
        # Cycle through the batch's users, one random filler record each,
        # until the count is divisible.
        fill_idx = 0
        while len(user_in) % sample_batch_size != 0:
            user_id = batch_users[fill_idx % len(batch_users)]
            fill_idx += 1
            news_id = random.sample(list(test_news_ids), 1)[0]
            user_in.append(_user_profile(user_id))
            article_in.append(mp_doc_embedding[news_id])
            user_ids.append(user_id)
            news_ids.append(news_id)

        print('#records=%d,batchsize=%d'%(len(user_in), sample_batch_size))
        user_in = np.array(user_in)
        article_in = np.array(article_in)
        # Insert a singleton channel axis after the sample dimension.
        user_in = np.resize(user_in, (user_in.shape[0], 1) + user_in.shape[1:])
        article_in = np.resize(article_in, (article_in.shape[0], 1) + article_in.shape[1:])

        out = model.model.predict([user_in, article_in], batch_size = sample_batch_size, verbose = 1)
        outs.extend(list(out))

    save_prediction(user_ids, news_ids, outs)