Example #1
# Imports as used in Example #9, which shares this project layout.
from config import Config
from data import SampleGenerator
from model.nffm import NFFM
from models_engine import ModelEngine


def model_predict(path_finish, path_like):
    Config["normal_config"]["pretrain"] = True
    Config["normal_config"]["pretrain_model_dir"] = path_finish
    engine_finish = ModelEngine(config=Config, model=NFFM)

    sample_generator = SampleGenerator()

    print()
    print("------------start testing finish--------------")
    test_loader = sample_generator.instance_a_loader(t="test")
    df_finish = engine_finish.predict(test_loader)
    print("------------finish testing -------------------")
    print("------------start testing like ----------------")
    Config["normal_config"]["pretrain_model_dir"] = path_like
    engine_like = ModelEngine(config=Config, model=NFFM)
    df_like = engine_like.predict(test_loader)

    df_finish["like_probability"] = df_like["pred_probability"]
    df_finish.columns = [
        "uid", "item_id", "finish_probability", "like_probability"
    ]
    df_finish.to_csv(Config["normal_config"]["predict_file"] +
                     Config["normal_config"]["model_name"],
                     index=False,
                     float_format="%.6f")
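A hypothetical invocation of model_predict; both checkpoint directories below are placeholders, not paths from the original project:

model_predict(path_finish="checkpoints/nffm_finish/",   # assumed location
              path_like="checkpoints/nffm_like/")       # assumed location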
Example #2
def train(args):
    # Load Data
    spec = MODEL_SPEC[args.model]
    config = spec['config']
    config['pretrain'] = False
    wework_dir = config['data_dir']
    wework_rating = pd.read_csv(
        os.path.join(wework_dir, config['train_filename']),
        sep=',', header=0,
        names=['', 'account_id', 'atlas_location_uuid', 'rating',
               'timestamp', 'weight'],
        engine='python')
    print('Range of AccountId is [{}, {}]'.format(wework_rating.account_id.min(),
                                                  wework_rating.account_id.max()))
    print('Range of LocationId is [{}, {}]'.format(wework_rating.atlas_location_uuid.min(),
                                                   wework_rating.atlas_location_uuid.max()))
    
    Engine = spec['engine']
    # DataLoader for training
    sample_generator = SampleGenerator(wework_rating, config)
    # SampleGenerator exposes a single evaluate_data attribute; point it at
    # the test split first, then at the validation split.
    sample_generator.test()
    test_data = sample_generator.evaluate_data
    sample_generator.val()
    val_data = sample_generator.evaluate_data

    # Specify the exact model
    engine = Engine(config)
    # gamma = decaying factor
    scheduler = StepLR(engine.opt, step_size=1, gamma=0.75)
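    # Note: with step_size=1 and gamma=0.75, StepLR multiplies the learning
    # rate by 0.75 after every epoch, so assuming an initial lr of 1e-3 the
    # schedule runs 1e-3, 7.5e-4, 5.625e-4, ...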
    train_negatives = []
    best_epoch = 0
    best_metric = float('inf')
    HR_10, NDCG_10 = 0, 0
    for epoch in range(config['num_epoch']):
        print('Epoch {} starts !'.format(epoch))
        print('-' * 80)

        train_loader, train_negative = sample_generator.instance_a_train_loader(
            config['num_negative'], config['batch_size'])
        engine.train_an_epoch(train_loader, epoch_id=epoch)
        scheduler.step()
        plot_grad_flow(engine.model.named_parameters(), epoch)
        train_negative = flatten(train_negative)
        # Accumulate the negatives sampled so far; evaluate() receives the full set.
        if len(train_negatives) != 0:
            train_negatives = pd.concat([train_negatives, train_negative], axis=0)
        else:
            train_negatives = train_negative
        metric, auc, HR5, HR10, NDCG5, NDCG10 = engine.evaluate(val_data, train_negatives, epoch_id=epoch)
        if metric < best_metric:
            best_epoch = epoch
            best_metric = metric
            HR_10, NDCG_10 = HR10, NDCG10
            engine.save(config['alias'], epoch, HR_10, NDCG_10)
            print('Epoch {}: found best results on validation data: '
                  'metric = {:.4f}, HR10 = {:.4f}, NDCG10 = {:.4f}'.format(
                      epoch, best_metric, HR_10, NDCG_10))

    engine.load(config['alias'], best_epoch, HR_10, NDCG_10)
    metric, auc, HR5, HR10, NDCG5, NDCG10 = engine.evaluate(
        test_data, train_negatives, epoch_id=epoch)
    print('Best Epoch {}: metric = {:.4f}, auc = {:.4f}, HR@5 = {:.4f}, '
          'HR@10 = {:.4f}, NDCG@5 = {:.4f}, NDCG@10 = {:.4f}'.format(
              best_epoch, metric, auc, HR5, HR10, NDCG5, NDCG10))
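A minimal sketch of driving train from the command line; the --model flag matches its only use above, while 'gmf' is a hypothetical MODEL_SPEC key (MODEL_SPEC itself is not shown in the snippet):

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # Any key of MODEL_SPEC is valid here; 'gmf' is an assumed default.
    parser.add_argument('--model', type=str, default='gmf')
    train(parser.parse_args())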
Example #3
def test_nlp_algs(review_json):
    bert_config = BertConfig()
    bert_engine = Engine(bert_config)
    bert_embedding_generator = BertEmbeddingGenerator(review_json)
    bert_tensors = generate_tensors(bert_embedding_generator, review_json)

    seq2seq_config = Seq2Seq.get_config()
    seq2seq_engine = Engine(seq2seq_config)
    seq2seq_embedding_generator = Seq2SeqEmbeddingGenerator(review_json)
    seq2seq_tensors = generate_tensors(seq2seq_embedding_generator,
                                       review_json)

    bert_rating_dataset = UserItemRatingDataset(bert_tensors)
    seq2seq_rating_dataset = UserItemRatingDataset(seq2seq_tensors)

    bert_evaluation_tool = SampleGenerator(bert_rating_dataset)
    seq2seq_evaluation_tool = SampleGenerator(seq2seq_rating_dataset)

    return (bert_evaluation_tool.evaluate_data(),
            seq2seq_evaluation_tool.evaluate_data())
Example #4
# Reindex
user_id = ml1m_rating[['uid']].drop_duplicates().reindex()
user_id['userId'] = np.arange(len(user_id))
ml1m_rating = pd.merge(ml1m_rating, user_id, on=['uid'], how='left')
item_id = ml1m_rating[['mid']].drop_duplicates()
item_id['itemId'] = np.arange(len(item_id))
ml1m_rating = pd.merge(ml1m_rating, item_id, on=['mid'], how='left')
ml1m_rating = ml1m_rating[['userId', 'itemId', 'rating', 'timestamp']]
print('Range of userId is [{}, {}]'.format(ml1m_rating.userId.min(),
                                           ml1m_rating.userId.max()))
print('Range of itemId is [{}, {}]'.format(ml1m_rating.itemId.min(),
                                           ml1m_rating.itemId.max()))
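A toy illustration of the reindexing pattern above, using made-up ratings rather than the MovieLens data; it shows how drop_duplicates plus arange maps raw ids onto a dense 0..N-1 range:

import numpy as np
import pandas as pd

toy = pd.DataFrame({'uid': [10, 10, 42], 'mid': [7, 9, 7],
                    'rating': [5, 3, 4], 'timestamp': [1, 2, 3]})
uids = toy[['uid']].drop_duplicates()
uids['userId'] = np.arange(len(uids))   # uid 10 -> 0, uid 42 -> 1
toy = pd.merge(toy, uids, on=['uid'], how='left')
mids = toy[['mid']].drop_duplicates()
mids['itemId'] = np.arange(len(mids))   # mid 7 -> 0, mid 9 -> 1
toy = pd.merge(toy, mids, on=['mid'], how='left')
print(toy[['userId', 'itemId', 'rating', 'timestamp']])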

# Load the training data
sample_generator = SampleGenerator(ratings=ml1m_rating)
evaluate_data = sample_generator.evaluate_data

# Specify the training parameters and train each model
for config in [gmf_config, mlp_config, neumf_config]:
    if config == mlp_config:
        engine = MLPEngine(config)
    elif config == gmf_config:
        engine = GMFEngine(config)
    else:
        engine = NeuMFEngine(config)
    for epoch in range(config['num_epoch']):
        print('Epoch {} starts !'.format(epoch))
        print('-' * 80)
        train_loader = sample_generator.instance_a_train_loader(
            config['num_negative'], config['batch_size'])
        engine.train_an_epoch(train_loader, epoch_id=epoch)
Example #5
elif args.model.lower() == "global_sum_embedding_gmf":
    config['latent_dim'] = config['latent_dim_mf']
    engine = New_Gloabl_sum_embedding_gmfEngine(config)

elif args.model.lower() == "global_sum_embedding_mlp":
    config['latent_dim'] = config['latent_dim_mf']
    engine = New_Gloabl_sum_embedding_MLPEngine(config)

# DataLoader for training
sample_generator = SampleGenerator(ratings=data_rating,
                                   train=data_rating_train,
                                   test=data_rating_test)

# Evaluation data and a sampled slice of the training data
evaluate_data = sample_generator.evaluate_data
sample_train_data = sample_generator.sample_train_data

print("TRAINING:---------------------")
engine.evaluate(sample_train_data, epoch_id=0, save=False)
print("TESTING:----------------------")
hit_ratio_max, ndcg_max = engine.evaluate(evaluate_data, epoch_id=0)

for epoch in range(config['num_epoch']):
    print('Epoch {} starts !'.format(epoch))
    print('-' * 80)
    train_loader = sample_generator.instance_a_train_loader(
        config['num_negative'], config['batch_size'])
    engine.train_an_epoch(train_loader, epoch_id=epoch)
Example #6
def main(params):
    books_df_sample, movies_df_sample = sample_data(params['books_df'],
                                                    params['movies_df'],
                                                    params['sample_size'])
    if params['use_itemVec']:
        books_df_sample, movies_df_sample = assign_vec(books_df_sample,
                                                       movies_df_sample,
                                                       params['itemVec_file'])
    books_df_sample, movies_df_sample = lbencoder(books_df_sample,
                                                  movies_df_sample, 5000)
    sample_generator = SampleGenerator(ratings_s=books_df_sample,
                                       ratings_t=movies_df_sample)
    evaluate_data = sample_generator.evaluate_data
    alias = 'conetItemVecc_factor{}neg{}_bz{}_{}_reg_0.0000001_{}'.format(
        params['latent_dim'], params['num_negative'], params['batch_size'],
        ''.join(params['layers']), params['id'])
    config = {
        'alias': alias,
        'num_epoch': params['epoch'],
        'batch_size': params['batch_size'],
        'optimizer': 'adam',
        'adam_lr': 1e-3,
        'num_users': books_df_sample['userId'].nunique(),
        'num_items_s': books_df_sample['itemId'].nunique(),
        'num_items_t': movies_df_sample['itemId'].nunique(),
        'device_id': 0,
        'latent_dim': params['latent_dim'],
        'num_negative': params['num_negative'],
        # layers[0] is the concat of latent user vector & latent item vector
        'layers': params['layers'],
        'l2_regularization': 0.0000001,  # MLP model is sensitive to hyper params
        'use_cuda': params['use_cuda'],
        'pretrain': False,
        'model_dir': 'checkpoints/{}_Epoch{}_HR_s{:.4f}_NDCG_s{:.4f}_HR_t{:.4f}_NDCG_t{:.4f}.model'
    }
    engine = CoNetEngine(config)
    train_loader = sample_generator.instance_a_train_loader(
        config['num_negative'], config['batch_size'])
    res = []
    for epoch in range(config['num_epoch']):
        print('Epoch {} starts !'.format(epoch))
        print('-' * 80)
        engine.train_an_epoch(train_loader, epoch_id=epoch)
        hit_ratio_s, ndcg_s, hit_ratio_t, ndcg_t = engine.evaluate(
            evaluate_data, epoch_id=epoch)
        res.append([hit_ratio_s, ndcg_s, hit_ratio_t, ndcg_t])
        engine.save(config['alias'], epoch, hit_ratio_s, ndcg_s, hit_ratio_t,
                    ndcg_t)
    return res
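A hypothetical params dict for main; every value is an assumption chosen to match how the keys are consumed above, and whether 'books_df'/'movies_df' hold DataFrames or file paths depends on sample_data, which is not shown:

params = {
    'books_df': pd.read_csv('books_ratings.csv'),    # hypothetical files
    'movies_df': pd.read_csv('movies_ratings.csv'),
    'sample_size': 100000,        # assumed values from here on
    'use_itemVec': False,
    'itemVec_file': None,
    'latent_dim': 8,
    'num_negative': 4,
    'batch_size': 256,
    'layers': ['16', '64', '32', '16', '8'],  # strings: the alias uses ''.join
    'epoch': 20,
    'use_cuda': False,
    'id': 'run0',
}
res = main(params)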
Example #7
import os
import sys

w_dir = '/Users/linyishi/Desktop/毕业论文/recommendation_system/CoNet-torch/src'
os.chdir(w_dir)
sys.path.append(w_dir)

import numpy as np
import pandas as pd

from CoNet import CoNetEngine
from data import SampleGenerator

books_df_sample = pd.read_csv('books_df_sample.csv')
movies_df_sample = pd.read_csv('movies_df_sample.csv')

sample_generator = SampleGenerator(ratings_s=books_df_sample,
                                   ratings_t=movies_df_sample)
evaluate_data = sample_generator.evaluate_data

# Work around crashes from duplicate OpenMP runtimes (common on macOS).
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

conet_config = {
    'alias': 'conet_factor8neg4_bz256_166432168_reg_0.0000001',
    'num_epoch': 20,
    'batch_size': 256,  # 1024,
    'optimizer': 'adam',
    'adam_lr': 1e-3,
Example #8
book['user_embedding'] = book['user_embedding'].map(eval)
book['item_embedding'] = book['item_embedding'].map(eval)
movie['user_embedding'] = movie['user_embedding'].map(eval)
movie['item_embedding'] = movie['item_embedding'].map(eval)
music['user_embedding'] = music['user_embedding'].map(eval)
music['item_embedding'] = music['item_embedding'].map(eval)
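eval works here but executes arbitrary expressions read from the CSV; if the embedding columns are simply stringified Python lists (an assumption about the file format), ast.literal_eval is a safer drop-in:

import ast

for df in (book, movie, music):
    df['user_embedding'] = df['user_embedding'].map(ast.literal_eval)
    df['item_embedding'] = df['item_embedding'].map(ast.literal_eval)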

book_user = list(set(book['userId']))     #1005 users
movie_user = list(set(movie['userId']))   #2007 users
music_user = list(set(music['userId']))   #160 users

book_movie_overlap = list(set(book['userId']).intersection(movie['userId']))     # 195 users
movie_music_overlap = list(set(movie['userId']).intersection(music['userId']))   # 40 users
book_music_overlap = list(set(music['userId']).intersection(book['userId']))     # 23 users
    
sample_book_generator = SampleGenerator(ratings=book)
evaluate_book_data = sample_book_generator.evaluate_data
sample_movie_generator = SampleGenerator(ratings=movie)
evaluate_movie_data = sample_movie_generator.evaluate_data
sample_music_generator = SampleGenerator(ratings=music)
evaluate_music_data = sample_music_generator.evaluate_data

engine = Engine(config)
train_book_loader = sample_book_generator.instance_a_train_loader(config['batch_size'])
train_music_loader = sample_music_generator.instance_a_train_loader(config['batch_size'])
train_movie_loader = sample_movie_generator.instance_a_train_loader(config['batch_size'])

with open('overlap_movie_music_index','r') as f:
    overlap = json.load(f)
movie_overlap = overlap['movie']
music_overlap = overlap['music']
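Judging from the two keys read above, overlap_movie_music_index is a JSON object mapping each domain to a list of indices; the values here are placeholders:

# Hypothetical contents of 'overlap_movie_music_index':
# {"movie": [12, 57, 103], "music": [3, 44, 88]}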
Example #9
from data import SampleGenerator
from config import Config
from models_engine import ModelEngine
from model.xDeepFM import xDeepFM
from model.mlp import MLP
# from model.DTFM import DTFM
from model.nffm import NFFM
from model.affm import AFFM

engine = ModelEngine(config=Config, model=AFFM)
sample_generator = SampleGenerator()

for epoch in range(Config["training_config"]['num_epoch']):
    print('Epoch {} starts!'.format(epoch))
    print('-' * 80)
    train_loader = sample_generator.instance_a_loader(t="train")
    engine.train_an_epoch(train_loader, epoch_id=epoch)
    # evaluation
    print()
    print("------------start evaluating-----------")
    evaluate_loader = sample_generator.instance_a_loader(t="val")
    auc = engine.evaluate(evaluate_loader, epoch_id=epoch)
    engine.save(epoch, auc=auc)

# close the HDF5 file

Example #10
    200,
    'optimizer': 'adam',
    'adam_lr': 1e-3,
    'l2_regularization': 0,
    'test_size': 500,
    'GNNStep': 3
}

train_data = get_train_data(config['train_path'], config['attack_types'])

valid_data = get_query(config['valid_query_path'])
test_data = get_query(config['test_query_path'])

sample_generator = SampleGenerator(config, train_data, valid_data, test_data)
engine = ProcedureEngine(config)

for epoch in range(config['num_epoch']):
    print('Epoch {} starts !'.format(epoch))
    print('-' * 80)

    # engine.train_an_epoch(sample_generator, epoch)
    # val_f1 = engine.evaluate(sample_generator, epoch)
    # engine.save(config['alias'], epoch, val_f1)

    engine.get_result(sample_generator, epoch)