Пример #1
0
def wrsampleae(matrix_train, matrix_valid, matrix_unif_train, iteration=100, lam=0.01, rank=50, seed=0,
               batch_size=256, gpu_on=True, **unused):
    """Train a WRSampleAE model on the merged biased + uniform training data.

    Args:
        matrix_train: sparse user-item matrix of biased feedback.
        matrix_valid: sparse validation matrix.
        matrix_unif_train: sparse uniformly-collected training matrix,
            added on top of ``matrix_train`` before training.
        iteration: number of training iterations.
        lam: L2 regularization weight.
        rank: latent dimensionality.
        seed: random seed applied to numpy and tensorflow.
        batch_size: mini-batch size.
        gpu_on: whether to train on GPU.

    Returns:
        Tuple ``(RQ, X, xBias, Y, yBias)`` of learned parameters.
    """
    progress = WorkSplitter()

    progress.section("WRSampleAE: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)

    progress.section("WRSampleAE: Training")
    m, n = matrix_train.shape

    # Binary indicator of entries observed in the biased training set.
    # Construct the CSR matrix directly instead of item-assigning into an
    # empty one, which is slow and raises SparseEfficiencyWarning.
    rows, cols = (matrix_train != 0).nonzero()
    marks = sparse.csr_matrix((np.ones(len(rows)), (rows, cols)),
                              shape=matrix_train.shape)

    # Out-of-place addition so the caller's matrix is never mutated.
    matrix_train = matrix_train + matrix_unif_train

    model = WRSampleAE(n, rank, m, lamb=lam, batch_size=batch_size, gpu_on=gpu_on)
    metric_names = ['NLL', 'AUC']

    RQ, X, xBias, Y, yBias = model.train_model(matrix_train, marks, matrix_valid, iteration, metric_names)

    model.sess.close()
    tf.reset_default_graph()

    return RQ, X, xBias, Y, yBias
Пример #2
0
def batchsampleae(matrix_train, matrix_valid, matrix_unif_train, iteration=100,
                  lam=0.01, rank=50, seed=0, batch_size=256, gpu_on=True,
                  step=3, **unused):
    """Train a BatchSampleAE model and return its learned parameters.

    Seeds numpy/tensorflow, fits the model on the biased and uniform
    training matrices, then tears down the TF session and graph.

    Returns:
        Tuple ``(RQ, X, xBias, Y, yBias)``.
    """
    progress = WorkSplitter()

    progress.section("BatchSampleAE: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)

    progress.section("BatchSampleAE: Training")
    num_users, num_items = matrix_train.shape
    model = BatchSampleAE(num_items, rank, lamb=lam, batch_size=batch_size,
                          step=step, gpu_on=gpu_on)
    metric_names = ['NLL', 'AUC']

    RQ, X, xBias, Y, yBias = model.train_model(
        matrix_train, matrix_unif_train, matrix_valid, iteration, metric_names)

    model.sess.close()
    tf.reset_default_graph()

    return RQ, X, xBias, Y, yBias
Пример #3
0
def _evaluate_nce_plrec(train, validation, params, root):
    """Train NCE-PLRec with the given PMI root, then predict and evaluate."""
    RQ, Yt, _ = params['models']['NCE-PLRec'](train,
                                              embeded_matrix=np.empty((0)),
                                              iteration=params['iter'],
                                              rank=params['rank'],
                                              lam=params['lambda'],
                                              root=root)
    prediction = predict(matrix_U=RQ,
                         matrix_V=Yt.T,
                         topK=params['topK'][-1],
                         matrix_Train=train,
                         gpu=True)
    return evaluate(prediction, validation, params['metric'], params['topK'])


def _print_result(params, result, root=None):
    """Print one hyper-parameter configuration and its metric values."""
    print("-")
    if root is not None:
        print("Root: {0}".format(root))
    print("Rank: {0}".format(params['rank']))
    print("Lambda: {0}".format(params['lambda']))
    print("SVD Iteration: {0}".format(params['iter']))
    print("Evaluation Ranking Topk: {0}".format(params['topK']))
    for key in result.keys():
        print("{0} :{1}".format(key, result[key]))


def sensitivity(train, validation, params):
    """Sweep the PMI ``root`` hyper-parameter of NCE-PLRec.

    Trains a baseline with ``root=1.0``, then one model per value in
    ``params['root']``, evaluating each against ``validation``.

    Returns:
        Tuple ``(default_result, sensitivity_results)`` where the second
        element maps each root value to its evaluation result dict.
    """
    progress = WorkSplitter()
    progress.section("PMI-PLRec Default")
    default_result = _evaluate_nce_plrec(train, validation, params, root=1.0)
    _print_result(params, default_result)

    sensitivity_results = dict()
    for root in tqdm(params['root']):
        progress.section("PMI-PLRec, Root: " + str(root))
        result = _evaluate_nce_plrec(train, validation, params, root)
        sensitivity_results[root] = result
        _print_result(params, result, root=root)

    return default_result, sensitivity_results
def main(args):
    """Load leaderboard data, restore a trained embedding model from a
    checkpoint, and write one prediction CSV per engagement type."""
    # Progress bar
    progress = WorkSplitter()

    progress.section("Load Data")
    if args.emb_type == 'bert':
        emb_size = 768
    elif args.emb_type == 'xlmr':
        emb_size = 1024
    else:
        # Fail fast: the original fell through here and later crashed with
        # an unrelated NameError because emb_size was never assigned.
        raise ValueError("Unsupported emb_type: {}".format(args.emb_type))

    # Load Data
    start_time = time.time()
    print("WARNING: Embedding size is set to", emb_size)
    data = Data(args, args.path, args.train, args.valid, emb_size, is_lb=True)
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))

    # Build model
    progress.section("Build Model")
    if args.network_architecture == 'embedding_net':
        model = EmbeddingNet(data.n_token, data.n_feature, emb_size,
                             [1024, 2000, 1000, 500, 100],
                             corruption=args.corruption)
    elif args.network_architecture == 'embedding_highway_net':
        model = EmbeddingHighWayNet(data.n_token, data.n_feature, emb_size,
                                    [1024, 2000, 1000, 500, 100])
    else:
        raise NotImplementedError('either use embedding_net or embedding_highway_net')
    model.cuda()
    print(model)

    model.load_state_dict(torch.load(args.checkpoint))
    print(model)

    lb_loader = data.instance_a_lb_loader(args.batch)

    lbs = {'user_lb': list(), 'tweet_lb': list()}
    preds = []
    model.eval()
    with torch.no_grad():
        lb_iterator = tqdm(lb_loader, desc="lb")
        for _, batch in enumerate(lb_iterator):
            token = batch[0].float().cuda()
            feature = batch[1].float().cuda()
            tweet_lb, user_lb = batch[2], batch[3]
            embedding = batch[4].float().cuda()
            pred = torch.sigmoid(model(token, feature, embedding)).detach().cpu().numpy()

            # NOTE(review): the "Valid" split appears to store labels flat
            # while other splits nest them one level — confirm against Data.
            if "Valid" in args.valid:
                lbs['tweet_lb'] += tweet_lb
            else:
                lbs['tweet_lb'] += tweet_lb[0]
            lbs['user_lb'] += user_lb[0]
            preds.append(pred)

        final_csv = pd.DataFrame(lbs)
        preds = np.float64(np.vstack(preds))
        if not os.path.exists(args.spath):
            os.makedirs(args.spath)

        print("Generating CSVs...")
        for i, engage in enumerate(["reply", "retweet", "comment", "like"]):
            final_csv[engage] = preds[:, i]
            final_csv[['tweet_lb', 'user_lb', engage]].to_csv(
                os.path.join(args.spath, engage + '.csv'), index=False, header=False)
Пример #5
0
def main(args):
    """Print the experiment configuration, load the rating and keyphrase
    matrices, run the critiquing simulation, and save the results table."""
    # Progress bar
    progress = WorkSplitter()

    # Show hyperparameter settings
    progress.section("Parameter Setting")
    print("Data Directory: {}".format(args.data_dir))
    print("Number of Users Sampled: {}".format(args.num_users_sampled))
    print("Number of Items Sampled: {}".format(args.num_items_sampled))
    print("Number of Max Allowed Iterations: {}".format(
        args.max_iteration_threshold))
    print("Critiquing Model: {}".format(args.critiquing_model_name))

    train_matrix = load_numpy(path=args.data_dir, name=args.train_set)
    print("Train U-I Dimensions: {}".format(train_matrix.shape))

    test_matrix = load_numpy(path=args.data_dir, name=args.test_set)
    print("Test U-I Dimensions: {}".format(test_matrix.shape))

    user_keyphrase_freq = load_numpy(
        path=args.data_dir, name=args.train_keyphrase_set).toarray()
    print("Train Item Keyphrase U-I Dimensions: {}".format(
        user_keyphrase_freq.shape))

    item_keyphrase_freq = load_numpy(
        path=args.data_dir, name=args.train_item_keyphrase_set).toarray()

    table_path = load_yaml('config/global.yml', key='path')['tables']
    best_params = find_best_hyperparameters(table_path + args.dataset_name,
                                            'NDCG')
    parameters_row = best_params.loc[best_params['model'] == args.model]

    # NOTE(review): yelp's item-keyphrase matrix is transposed here —
    # presumably it is stored in the opposite orientation; confirm upstream.
    if args.dataset_name == "yelp/":
        item_keyphrase_freq = item_keyphrase_freq.T

    start_time = time.time()

    results = critiquing(matrix_Train=train_matrix,
                         matrix_Test=test_matrix,
                         keyphrase_freq=user_keyphrase_freq,
                         item_keyphrase_freq=item_keyphrase_freq,
                         num_users_sampled=args.num_users_sampled,
                         num_items_sampled=args.num_items_sampled,
                         max_iteration_threshold=args.max_iteration_threshold,
                         dataset_name=args.dataset_name,
                         model=models[args.model],
                         parameters_row=parameters_row,
                         critiquing_model_name=args.critiquing_model_name,
                         keyphrase_selection_method=args.keyphrase_selection_method,
                         topk=args.topk,
                         lamb=args.lamb)

    print("Final Time Elapsed: {}".format(inhour(time.time() - start_time)))

    table_path = load_yaml('config/global.yml', key='path')['tables']
    save_dataframe_csv(results, table_path, args.save_path)
Пример #6
0
def main(args):
    """Load the configured splits and launch the hyper-parameter grid search."""
    progress = WorkSplitter()
    progress.section("Tune Parameters")

    params = load_yaml(args.grid)
    # Narrow the model table down to the single model named in the grid file.
    chosen = params['models']
    params['models'] = {chosen: models[chosen]}

    prefix = args.dataset
    train = load_numpy(path=args.path, name=prefix + args.train)
    unif_train = load_numpy(path=args.path, name=prefix + args.unif_train)
    valid = load_numpy(path=args.path, name=prefix + args.valid)

    hyper_parameter_tuning(train, valid, params,
                           unif_train=unif_train,
                           save_path=prefix + args.name,
                           gpu_on=args.gpu,
                           seed=args.seed,
                           way=args.way,
                           dataset=args.dataset)
Пример #7
0
def main(args):
    """Run a neighborhood recommender end-to-end: print settings, load the
    training matrix, train, predict, and optionally evaluate."""
    # Progress bar
    progress = WorkSplitter()

    # Show hyper parameter settings
    progress.section("Parameter Setting")
    print("Data Path: {}".format(args.path))
    print("Train File Name: {}".format(args.train))
    if args.validation:
        print("Valid File Name: {}".format(args.valid))
    print("Algorithm: {}".format(args.model))
    print("Lambda Diversity: {}".format(args.lambda_diversity))
    print("Lambda Serendipity: {}".format(args.lambda_serendipity))
    print("Nearest Neighbor Number: {}".format(args.k))
    print("Evaluation Ranking Topk: {}".format(args.topk))

    # Load Data
    progress.section("Loading Data")
    load_start = time.time()
    train_matrix = load_numpy(path=args.path, name=args.train)
    print("Elapsed: {}".format(inhour(time.time() - load_start)))
    print("Train U-I Dimensions: {}".format(train_matrix.shape))

    progress.section("Train")
    recommender = models[args.model]()
    recommender.train(train_matrix)

    progress.section("Predict")
    prediction_score = recommender.predict(
        train_matrix,
        k=args.k,
        lambda_diversity=args.lambda_diversity,
        lambda_serendipity=args.lambda_serendipity)

    prediction = predict(prediction_score=prediction_score,
                         topK=args.topk,
                         matrix_Train=train_matrix)

    if args.validation:
        progress.section("Create Metrics")
        eval_start = time.time()

        metrics = ['R-Precision', 'NDCG', 'Clicks', 'Recall', 'Precision',
                   'MAP']

        valid_matrix = load_numpy(path=args.path, name=args.valid)
        result = evaluate(prediction, valid_matrix, metrics, [args.topk])
        print("-")
        for metric in result.keys():
            print("{}:{}".format(metric, result[metric]))
        print("Elapsed: {}".format(inhour(time.time() - eval_start)))
Пример #8
0
def main(args):
    """Run the critiquing simulation with fixed (hard-coded) hyper-parameters
    and save the results table to the configured tables directory."""
    # Progress bar
    progress = WorkSplitter()

    # Show hyperparameter settings
    progress.section("Parameter Setting")
    print("Data Directory: {}".format(args.data_dir))
    print("Number of Users Sampled: {}".format(args.num_users_sampled))
    print("Number of Items Sampled: {}".format(args.num_items_sampled))
    print("Number of Max Allowed Iterations: {}".format(args.max_iteration_threshold))
    print("Critiquing Model: {}".format(args.critiquing_model_name))

    R_train = load_numpy(path=args.data_dir, name=args.train_set)
    print("Train U-I Dimensions: {}".format(R_train.shape))

    R_test = load_numpy(path=args.data_dir, name=args.test_set)
    print("Test U-I Dimensions: {}".format(R_test.shape))

    R_train_keyphrase = load_numpy(path=args.data_dir, name=args.train_keyphrase_set).toarray()
    print("Train Item Keyphrase U-I Dimensions: {}".format(R_train_keyphrase.shape))

    R_train_item_keyphrase = load_numpy(path=args.data_dir, name=args.train_item_keyphrase_set).toarray()

    # Hyper-parameters are pinned here instead of being read from the tuning
    # tables (the table lookup was disabled; its dead code is removed).
    parameters_row = {
        'iter': 10,
        'lambda': 200,
        'rank': 200
    }

    keyphrases_names = load_dataframe_csv(path=args.data_dir, name="Keyphrases.csv")['Phrases'].tolist()

    results = critiquing(matrix_Train=R_train,
                         matrix_Test=R_test,
                         keyphrase_freq=R_train_keyphrase,
                         item_keyphrase_freq=R_train_item_keyphrase.T,
                         num_users_sampled=args.num_users_sampled,
                         num_items_sampled=args.num_items_sampled,
                         max_iteration_threshold=args.max_iteration_threshold,
                         dataset_name=args.dataset_name,
                         model=models[args.model],
                         parameters_row=parameters_row,
                         critiquing_model_name=args.critiquing_model_name,
                         lamb=args.lambdas,
                         keyphrases_names=keyphrases_names,
                         keyphrase_selection_method=args.keyphrase_selection_method)

    table_path = load_yaml('config/global.yml', key='path')['tables']
    save_dataframe_csv(results, table_path, args.save_path)
Пример #9
0
def main(args):
    """Build rating/timestamp matrices from the raw Yelp data, split them in
    time order, and save the resulting CSR matrices as NPZ files."""
    progress = WorkSplitter()

    progress.section("Parameter Setting")
    print("Data Path: {}".format(args.data_dir))
    print("Implicit User Feedback: {}".format(args.implicit))

    progress.section("Load Raw Data")
    rating_matrix, timestamp_matrix = get_yelp_df(
        args.data_dir + args.data_name,
        sampling=True,
        top_user_num=args.top_user_num,
        top_item_num=args.top_item_num)

    progress.section("Split CSR Matrices")
    rtrain, rvalid, rtest, nonzero_index, rtime = time_ordered_split(
        rating_matrix=rating_matrix,
        timestamp_matrix=timestamp_matrix,
        ratio=args.ratio,
        implicit=args.implicit)
    # (Removed a leftover debugging breakpoint — ``import ipdb;
    # ipdb.set_trace()`` — which stopped every run here.)

    progress.section("Save NPZ")
    save_numpy(rtrain, args.data_dir, "Rtrain")
    save_numpy(rvalid, args.data_dir, "Rvalid")
    save_numpy(rtest, args.data_dir, "Rtest")
    save_numpy(rtime, args.data_dir, "Rtime")
    save_array(nonzero_index, args.data_dir, "Index")
def main(args):
    """Run the critiquing simulation with fixed PLRec hyper-parameters and
    save the results table to a hard-coded output directory."""
    # Progress bar
    progress = WorkSplitter()

    # Show hyperparameter settings
    progress.section("Parameter Setting")
    print("Data Directory: {}".format(args.data_dir))
    print("Number of Users Sampled: {}".format(args.num_users_sampled))
    print("Number of Items Sampled: {}".format(args.num_items_sampled))
    print("Number of Max Allowed Iterations: {}".format(
        args.max_iteration_threshold))
    print("Critiquing Model: {}".format(args.critiquing_model_name))

    R_train = load_numpy(path=args.data_dir, name=args.train_set)
    print("Train U-I Dimensions: {}".format(R_train.shape))

    R_test = load_numpy(path=args.data_dir, name=args.test_set)
    print("Test U-I Dimensions: {}".format(R_test.shape))

    R_train_keyphrase = load_numpy(path=args.data_dir,
                                   name=args.train_keyphrase_set).toarray()
    print("Train User Keyphrase U-I Dimensions: {}".format(
        R_train_keyphrase.shape))

    R_train_item_keyphrase = load_numpy(
        path=args.data_dir, name=args.train_item_keyphrase_set).toarray()
    print("Train Item Keyphrase U-I Dimensions: {}".format(
        R_train_item_keyphrase.shape))

    # Hyper-parameters are pinned here rather than read from the tuning
    # tables (that lookup was disabled; its dead code is removed).
    parameters_row = pd.DataFrame({'iter': [4], 'lambda': [80], 'rank': [200]})

    results = critiquing(matrix_Train=R_train,
                         matrix_Test=R_test,
                         keyphrase_freq=R_train_keyphrase,
                         item_keyphrase_freq=R_train_item_keyphrase,
                         num_users_sampled=args.num_users_sampled,
                         num_items_sampled=args.num_items_sampled,
                         max_iteration_threshold=args.max_iteration_threshold,
                         dataset_name=args.dataset_name,
                         model=models[args.model],
                         parameters_row=parameters_row,
                         critiquing_model_name=args.critiquing_model_name)

    # TODO: this user-specific absolute path should come from
    # 'config/global.yml' instead of being hard-coded.
    table_path = '/home/shuyang/data4/LatentLinearCritiquingforConvRecSys/'
    save_dataframe_csv(results, table_path, args.save_path)
Пример #11
0
def hyper_parameter_tuning(train, validation, params, save_path):
    """Grid-search ``k`` for every model in ``params`` and append results to
    a CSV table.

    Configurations already present in the table are skipped, so an
    interrupted sweep can be resumed by re-running with the same arguments.

    Args:
        train: sparse training matrix.
        validation: sparse validation matrix.
        params: dict with 'models', 'k', 'metric' and 'topK' entries.
        save_path: CSV file name (relative to the configured tables dir).
    """
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']

    try:
        df = load_dataframe_csv(table_path, save_path)
    except Exception:
        # No readable previous results table: start fresh. Narrowed from a
        # bare ``except:`` so KeyboardInterrupt/SystemExit still propagate.
        df = pd.DataFrame(columns=['model', 'k', 'topK'])

    for algorithm in params['models']:

        for k in params['k']:

            # Resume support: skip configurations already evaluated.
            if ((df['model'] == algorithm) & (df['k'] == k)).any():
                continue

            # Renamed from ``format`` to avoid shadowing the builtin.
            section_fmt = "model: {}, k: {}"
            progress.section(section_fmt.format(algorithm, k))

            progress.subsection("Training")
            model = params['models'][algorithm]()
            model.train(train)

            progress.subsection("Prediction")
            prediction_score = model.predict(train, k=k)

            prediction = predict(prediction_score=prediction_score,
                                 topK=params['topK'][-1],
                                 matrix_Train=train)

            progress.subsection("Evaluation")
            result = evaluate(prediction, validation, params['metric'],
                              params['topK'])

            result_dict = {'model': algorithm, 'k': k}

            for name in result.keys():
                result_dict[name] = [
                    round(result[name][0], 4),
                    round(result[name][1], 4)
                ]

            # DataFrame.append was removed in pandas 2.0; concat is the
            # supported equivalent.
            df = pd.concat([df, pd.DataFrame([result_dict])],
                           ignore_index=True)

            save_dataframe_csv(df, table_path, save_path)
Пример #12
0
def biasedmf(matrix_train, matrix_valid, matrix_unif_train, iteration=100,
             lam=0.01, rank=50, seed=0, batch_size=500, way=None, gpu_on=True,
             **unused):
    """Train a BiasedMF model on the chosen blend of training data.

    ``way`` selects the training source: 'unif' trains on the uniform data
    only, 'combine' adds the uniform data onto the biased data, and any
    other value trains on the biased data alone.

    Returns:
        Tuple ``(RQ, Y, user_bias, item_bias)``.
    """
    progress = WorkSplitter()

    progress.section("BiasedMF: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)

    progress.section("BiasedMF: Training")
    num_users, num_items = matrix_train.shape
    model = BiasedMF(num_users, num_items, rank, lamb=lam,
                     batch_size=batch_size, gpu_on=gpu_on)
    metrics = ['NLL', 'AUC']

    if way == 'unif':
        training_input = matrix_unif_train
    elif way == 'combine':
        matrix_train += matrix_unif_train
        training_input = matrix_train
    else:
        training_input = matrix_train

    RQ, Y, user_bias, item_bias, _ = model.train_model(
        training_input, matrix_valid, iteration, metrics)

    model.sess.close()
    tf.reset_default_graph()

    return RQ, Y, user_bias, item_bias
Пример #13
0
def _report_stats(label, matrix):
    """Print the observed-entry count and positive/negative ratios for one
    sparse feedback matrix (values are +1 / -1 after binarization)."""
    nnz = matrix.count_nonzero()
    pos_ratio = np.sum(matrix == 1) / nnz
    print('* %s #num: %6d, pos: %.6f, neg: %.6f' %
          (label, nnz, pos_ratio, 1 - pos_ratio))


def main(args):
    """Convert the Yahoo R3 raw ratings into implicit-feedback CSR splits
    (S_c, S_t, S_va, S_te), save them as NPZ, and print their statistics."""
    progress = WorkSplitter()

    progress.section("Yahoo R3: Load Raw Data")
    user_df = pd.read_csv(args.path + args.dataset + args.user,
                          sep=args.sep,
                          header=None,
                          names=args.names)
    random_df = pd.read_csv(args.path + args.dataset + args.random,
                            sep=args.sep,
                            header=None,
                            names=args.names)

    if args.implicit:
        # Binarize ratings: below threshold -> -1, at or above -> 1.
        # Use df.loc[mask, col] (not the original chained indexing, which
        # may silently write to a copy); compute both masks up front so the
        # second mask is not affected by the first write.
        for df in (user_df, random_df):
            neg = df['rating'] < args.threshold
            pos = df['rating'] >= args.threshold
            df.loc[neg, 'rating'] = -1
            df.loc[pos, 'rating'] = 1

    progress.section("Yahoo R3: Randomly Split Random Set")
    m, n = max(user_df['uid']) + 1, max(user_df['iid']) + 1
    unif_train, validation, test = seed_randomly_split(df=random_df,
                                                       ratio=args.ratio,
                                                       split_seed=args.seed,
                                                       shape=(m, n))

    progress.section("Yahoo R3: Save NPZ")
    save_dir = args.path + args.dataset
    train = sparse.csr_matrix(
        (user_df['rating'], (user_df['uid'], user_df['iid'])),
        shape=(m, n),
        dtype='float32')
    save_numpy(train, save_dir, "S_c")
    save_numpy(unif_train, save_dir, "S_t")
    save_numpy(validation, save_dir, "S_va")
    save_numpy(test, save_dir, "S_te")

    progress.section("Yahoo R3: Statistics of Data Sets")
    _report_stats('S_c ', train)
    _report_stats('S_t ', unif_train)
    _report_stats('S_va', validation)
    _report_stats('S_te', test)
Пример #14
0
def execute(train, test, params, model, gpu_on=True, analytical=False):
    """Train ``model`` with the given hyper-parameters, predict, and evaluate.

    Args:
        train: sparse training matrix.
        test: sparse test matrix.
        params: dict with 'model', 'rank', 'lambda', 'epoch', 'corruption',
            'topK' and 'metric' entries.
        model: training function returning ``(RQ, Yt, Bias)``.
        gpu_on: forwarded to prediction.
        analytical: if True, return the raw result dict.

    Returns:
        The raw result dict when ``analytical``, else a one-row DataFrame of
        rounded metric values.
    """
    progress = WorkSplitter()

    columns = ['model', 'rank', 'lambda', 'epoch', 'corruption', 'topK']

    progress.section("\n".join(
        [":".join((str(k), str(params[k]))) for k in columns]))

    df = pd.DataFrame(columns=columns)

    progress.subsection("Train")
    RQ, Yt, Bias = model(train,
                         epoch=params['epoch'],
                         lamb=params['lambda'],
                         rank=params['rank'],
                         corruption=params['corruption'])
    Y = Yt.T

    progress.subsection("Prediction")
    prediction = predict(matrix_U=RQ,
                         matrix_V=Y,
                         bias=Bias,
                         topK=params['topK'][-1],
                         matrix_Train=train,
                         gpu=gpu_on)

    progress.subsection("Evaluation")
    result = evaluate(prediction,
                      test,
                      params['metric'],
                      params['topK'],
                      analytical=analytical)

    if analytical:
        return result

    # Copy so the caller's params dict is not mutated by the metric columns
    # (the original aliased it and silently grew it on every call).
    result_dict = dict(params)
    for name in result.keys():
        result_dict[name] = [
            round(result[name][0], 4),
            round(result[name][1], 4)
        ]

    # DataFrame.append was removed in pandas 2.0; use concat instead.
    df = pd.concat([df, pd.DataFrame([result_dict])], ignore_index=True)

    return df
Пример #15
0
def uncertainty(Rtrain, df_input, rank):
    """For each VAE configuration row in ``df_input``, train the model and
    collect per-user predictive uncertainty against the number of rated items.

    Args:
        Rtrain: sparse user-item training matrix.
        df_input: DataFrame of model configurations ('model', 'lambda',
            'corruption', 'iter', optionally 'optimizer').
        rank: latent dimensionality for every model.

    Returns:
        DataFrame with columns 'model', 'numRated' and 'std'.
    """
    progress = WorkSplitter()
    m, n = Rtrain.shape

    valid_models = vaes.keys()

    results = []

    # (The original wrapped everything in ``for run in range(1)`` — a
    # single-pass loop with an unused index; removed.)
    for idx, row in df_input.iterrows():
        row = row.to_dict()

        if row['model'] not in valid_models:
            continue

        progress.section(json.dumps(row))

        # Default optimizer when the configuration row does not specify one.
        if 'optimizer' not in row.keys():
            row['optimizer'] = 'RMSProp'

        model = vaes[row['model']](n,
                                   rank,
                                   batch_size=100,
                                   lamb=row['lambda'],
                                   optimizer=Regularizer[row['optimizer']])

        model.train_model(Rtrain,
                          corruption=row['corruption'],
                          epoch=row['iter'])
        data_batches = model.get_batches(Rtrain, batch_size=100)
        progress.subsection("Predict")
        for batch in tqdm(data_batches):
            batch_size = batch.shape[0]
            _, stds = model.uncertainty(batch.todense())
            num_rated = np.squeeze(np.asarray(np.sum(batch, axis=1)))
            std = np.mean(stds, axis=1)
            results.append(
                pd.DataFrame({
                    'model': [row['model']] * batch_size,
                    'numRated': num_rated,
                    'std': std
                }))

    return pd.concat(results)
Пример #16
0
def wrsamplemf(matrix_train, matrix_valid, matrix_unif_train, iteration=100,
               lam=0.01, rank=50, seed=0, batch_size=500, gpu_on=True,
               **unused):
    """Train a WRSampleMF model on the merged biased + uniform training data.

    Args:
        matrix_train: sparse user-item matrix of biased feedback.
        matrix_valid: sparse validation matrix.
        matrix_unif_train: sparse uniformly-collected training matrix,
            added on top of ``matrix_train`` before training.
        iteration: number of training iterations.
        lam: L2 regularization weight.
        rank: latent dimensionality.
        seed: random seed applied to numpy and tensorflow.
        batch_size: mini-batch size.
        gpu_on: whether to train on GPU.

    Returns:
        Tuple ``(RQ, Y, user_bias, item_bias)``.
    """
    progress = WorkSplitter()

    progress.section("WRSampleMF: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)

    progress.section("WRSampleMF: Training")
    m, n = matrix_train.shape

    # Indicator of the entries observed in the biased training set. Direct
    # CSR construction replaces item-assignment into an empty matrix, which
    # is slow and raises SparseEfficiencyWarning.
    rows, cols = (matrix_train != 0).nonzero()
    marks = sparse.csr_matrix((np.ones(len(rows)), (rows, cols)),
                              shape=matrix_train.shape)

    # Out-of-place addition avoids mutating the caller's matrix.
    matrix_train = matrix_train + matrix_unif_train
    num_samples = len(matrix_train.nonzero()[0])

    model = WRSampleMF(m, n, rank, num_samples, lamb=lam,
                       batch_size=batch_size, gpu_on=gpu_on)
    metric_names = ['NLL', 'AUC']

    RQ, Y, user_bias, item_bias, confidence, user_item_pairs, prediction = model.train_model(
        matrix_train, marks, matrix_valid, iteration, metric_names)

    model.sess.close()
    tf.reset_default_graph()

    return RQ, Y, user_bias, item_bias
Пример #17
0
def main(args):
    """Restore a trained FeatureNet from a checkpoint and write one
    prediction CSV per engagement type for the leaderboard set."""
    # Progress bar
    progress = WorkSplitter()

    # Load Data
    progress.section("Load Data")
    start_time = time.time()
    data = Data(args.path, args.train, args.valid, is_lb=True)
    print("Elapsed: {0}".format(inhour(time.time() - start_time)))

    # Build model
    progress.section("Build Model")
    model = FeatureNet(data.n_token, data.n_feature, [1024, 2000, 1000, 500, 100])
    # Move to GPU once (the original called .cuda() twice) before loading
    # the checkpoint weights.
    model.cuda()
    print(model)

    model.load_state_dict(torch.load(args.checkpoint))
    print(model)

    lb_loader = data.instance_a_lb_loader(args.batch)

    lbs = {'user_lb': list(), 'tweet_lb': list()}
    preds = []
    model.eval()
    with torch.no_grad():
        lb_iterator = tqdm(lb_loader, desc="lb")
        for _, batch in enumerate(lb_iterator):
            token = batch[0].float().cuda()
            feature = batch[1].float().cuda()
            tweet_lb, user_lb = batch[2], batch[3]
            pred = torch.sigmoid(model(token, feature)).detach().cpu().numpy()
            lbs['tweet_lb'] += tweet_lb[0]
            lbs['user_lb'] += user_lb[0]
            preds.append(pred)

        final_csv = pd.DataFrame(lbs)
        preds = np.float64(np.vstack(preds))
        if not os.path.exists(args.spath):
            os.makedirs(args.spath)

        print("Generating CSVs...")
        for i, engage in enumerate(["reply", "retweet", "comment", "like"]):
            final_csv[engage] = preds[:, i]
            final_csv[['tweet_lb', 'user_lb', engage]].to_csv(
                os.path.join(args.spath, engage + '.csv'), index=False, header=False)
Пример #18
0
def execute(train, test, params, model, analytical=False):
    """Train a neighborhood model with ``params['k']``, predict, and evaluate.

    Args:
        train: sparse training matrix.
        test: sparse test matrix.
        params: dict with 'model', 'k', 'topK' and 'metric' entries.
        model: model class; instantiated and trained here.
        analytical: if True, return the raw result dict.

    Returns:
        The raw result dict when ``analytical``, else a one-row DataFrame of
        rounded metric values.
    """
    progress = WorkSplitter()

    columns = ['model', 'k', 'topK']

    progress.section("\n".join(
        [":".join((str(k), str(params[k]))) for k in columns]))

    df = pd.DataFrame(columns=columns)

    progress.subsection("Train")
    model = model()
    model.train(train)

    progress.subsection("Prediction")
    prediction_score = model.predict(train, k=params['k'])

    prediction = predict(prediction_score=prediction_score,
                         topK=params['topK'][-1],
                         matrix_Train=train)

    progress.subsection("Evaluation")
    result = evaluate(prediction,
                      test,
                      params['metric'],
                      params['topK'],
                      analytical=analytical)

    if analytical:
        return result

    # Copy so the caller's params dict is not mutated by the metric columns
    # (the original aliased it and silently grew it on every call).
    result_dict = dict(params)
    for name in result.keys():
        result_dict[name] = [
            round(result[name][0], 4),
            round(result[name][1], 4)
        ]

    # DataFrame.append was removed in pandas 2.0; use concat instead.
    df = pd.concat([df, pd.DataFrame([result_dict])], ignore_index=True)

    return df
Пример #19
0
def main(args):
    """Load the ratings CSV into rating/timestamp matrices, split them in
    time order, and save the resulting CSR splits as NPZ files."""
    progress = WorkSplitter()

    progress.section("Load Raw Data")
    # Both matrices come from the same file; only the value column differs.
    shared = dict(row_name='userId', col_name='itemId',
                  path=args.path, name=args.name, shape=args.shape)
    rating_matrix = load_pandas(value_name=None, **shared)
    timestamp_matrix = load_pandas(value_name='Timestamp', **shared)

    progress.section("Split CSR Matrices")
    rtrain, rvalid, rtest, nonzero_index, rtime = time_ordered_split(
        rating_matrix=rating_matrix,
        timestamp_matrix=timestamp_matrix,
        ratio=args.ratio,
        implicit=args.implicit,
        sampling=True,
        percentage=0.2)

    progress.section("Save NPZ")
    for label, split in (("Rtrain", rtrain), ("Rvalid", rvalid),
                         ("Rtest", rtest), ("Rtime", rtime)):
        save_numpy(split, args.path, label)
    save_array(nonzero_index, args.path, "Index")
def main(args):
    """Load the Yahoo ratings file, split it with a fixed random seed, and
    save the resulting CSR splits as NPZ files."""
    progress = WorkSplitter()
    progress.section("Load Raw Data")
    rating_matrix = load_pandas_without_names(
        path=args.path,
        name=args.name,
        sep='\t',
        row_name='userId',
        col_name='trackId',
        value_name='rating',
        shape=args.shape,
        names=['userId', 'trackId', 'rating'])

    progress.section("Split CSR Matrices")
    rtrain, rvalid, rtest, nonzero_index = split_seed_randomly(
        rating_matrix=rating_matrix,
        ratio=args.ratio,
        threshold=80,
        implicit=args.implicit,
        sampling=True,
        percentage=0.2)
    print("Done splitting Yahoo dataset")

    progress.section("Save NPZ")
    for label, split in (("Rtrain", rtrain), ("Rvalid", rvalid),
                         ("Rtest", rtest)):
        save_numpy(split, args.path, label)
    save_array(nonzero_index, args.path, "Index")
    print("Done saving data for yahoo after splitting")
def initfeatureembedae(matrix_train, matrix_valid, iteration=100, lam=0.01, rank=50, seed=0, batch_size=256, way='both',
                       dataset=None, gpu_on=True, **unused):
    """Train InitFeatureEmbedAE warm-started from AutoRec weights fit on S_t.

    Loads pre-trained encoder/decoder weights and biases from
    latent/<dataset>unif_*_AutoRec_200.npy and returns the trained
    (RQ, X, xBias, Y, yBias) tuple from the model.
    """
    splitter = WorkSplitter()

    splitter.section("InitFeatureEmbedAE: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)

    splitter.section("InitFeatureEmbedAE: Load the variables trained on S_t")

    # All four arrays share the same naming scheme; only the tag differs.
    def _load(tag):
        return np.load('latent/' + dataset + 'unif_' + tag + '_AutoRec_200.npy')

    X, xBias = _load('X'), _load('xB')
    Y, yBias = _load('Y'), _load('yB')

    splitter.section("InitFeatureEmbedAE: Training")
    _, n = matrix_train.shape
    model = InitFeatureEmbedAE(n, rank, lamb=lam, batch_size=batch_size, gpu_on=gpu_on, init_X=X, init_Y=Y,
                               init_xBias=xBias, init_yBias=yBias, way=way)
    results = model.train_model(matrix_train, matrix_valid, iteration, ['NLL', 'AUC'])

    model.sess.close()
    tf.reset_default_graph()

    # results == (RQ, X, xBias, Y, yBias)
    return results
Пример #22
0
def restrictedbatchsamplemf(matrix_train,
                            matrix_valid,
                            matrix_unif_train,
                            iteration=100,
                            lam=0.01,
                            rank=50,
                            seed=0,
                            batch_size=500,
                            gpu_on=True,
                            step=3,
                            way=None,
                            **unused):
    """Train BatchSampleMF on S_c together with the uniform sample S_t (restricted variant).

    Returns the learned (RQ, Y, user_bias, item_bias) from the model.
    """
    splitter = WorkSplitter()

    splitter.section("RestrictedBatchSampleMF: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)

    splitter.section("RestrictedBatchSampleMF: Training")
    num_users, num_items = matrix_train.shape

    model = BatchSampleMF(num_users,
                          num_items,
                          rank,
                          lamb=lam,
                          batch_size=batch_size,
                          step=step,
                          gpu_on=gpu_on,
                          way=way)

    RQ, Y, user_bias, item_bias = model.train_model(
        matrix_train, matrix_unif_train, matrix_valid, iteration, ['NLL', 'AUC'])

    model.sess.close()
    tf.reset_default_graph()

    return RQ, Y, user_bias, item_bias
Пример #23
0
def propensitymf(matrix_train, matrix_valid, matrix_unif_train, iteration=100, lam=0.01, rank=50, seed=0, batch_size=500,
                 gpu_on=True, **unused):
    """Train PropensityMF with inverse-propensity weights estimated from S_c and S_t.

    Returns the learned (RQ, Y, user_bias, item_bias) from the model.
    """
    splitter = WorkSplitter()

    splitter.section("PropensityMF: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)

    splitter.section("PropensityMF: Calculating Propensity Score")
    num_users, num_items = matrix_train.shape
    # P(O=1): fraction of observed entries in the biased log S_c.
    P_O = matrix_train.count_nonzero() / (num_users * num_items)
    # P(Y | O=1): share of negatives (-1) among observed entries of S_c.
    neg_given_obs = np.sum(matrix_train == -1) / matrix_train.count_nonzero()
    P_YO = np.array([neg_given_obs, 1 - neg_given_obs])
    # P(Y): marginal class ratio, estimated from the uniform sample S_t.
    neg_unif = np.sum(matrix_unif_train == -1) / matrix_unif_train.count_nonzero()
    P_Y = np.array([neg_unif, 1 - neg_unif])
    # Inverse propensity 1 / P(O=1 | Y), with P(O|Y) = P(Y|O) * P(O) / P(Y).
    invP = 1 / (P_YO * P_O / P_Y)

    # Note: Propensity MF uses S_c and S_t as training set
    matrix_train += matrix_unif_train

    splitter.section("PropensityMF: Training")
    # Shape is unchanged by the element-wise addition above.
    model = PropensityMF(num_users, num_items, rank, lamb=lam, batch_size=batch_size, gpu_on=gpu_on)
    RQ, Y, user_bias, item_bias = model.train_model(matrix_train, matrix_valid, invP, iteration, ['NLL', 'AUC'])

    model.sess.close()
    tf.reset_default_graph()

    return RQ, Y, user_bias, item_bias
Пример #24
0
def unionsamplemf(matrix_train,
                  matrix_valid,
                  matrix_unif_train,
                  iteration=100,
                  lam=0.01,
                  rank=50,
                  seed=0,
                  batch_size=500,
                  confidence=0.9,
                  gpu_on=True,
                  **unused):
    """Train UnionSampleMF on the union of S_c and S_t.

    A sparse `marks` matrix flags which entries originated from the biased
    log S_c. Returns the learned (RQ, Y, user_bias, item_bias).
    """
    splitter = WorkSplitter()

    splitter.section("UnionSampleMF: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)

    splitter.section("UnionSampleMF: Training")
    num_users, num_items = matrix_train.shape
    model = UnionSampleMF(num_users,
                          num_items,
                          rank,
                          lamb=lam,
                          batch_size=batch_size,
                          gpu_on=gpu_on,
                          confidence=confidence)

    # marks[u, i] == 1 exactly where S_c has an observation (before the union).
    marks = sparse.csr_matrix(matrix_train.shape)
    marks[(matrix_train != 0).nonzero()] = 1

    # Train on S_c + S_t combined.
    matrix_train += matrix_unif_train
    RQ, Y, user_bias, item_bias = model.train_model(
        matrix_train, marks, matrix_valid, iteration, ['NLL', 'AUC'])

    model.sess.close()
    tf.reset_default_graph()

    return RQ, Y, user_bias, item_bias
Пример #25
0
def main(args):
    """Build rating/timestamp CSR matrices from a raw CSV log, split by time, save NPZ.

    Factorizes raw user/item identifiers into dense 0-based ids before
    constructing the sparse matrices, then applies `time_ordered_split`.
    """
    splitter = WorkSplitter()

    raw = pd.read_csv(args.path + args.name,
                      names=['user', 'item', 'rating', 'timestamp'])

    # Re-index raw identifiers to contiguous integer ids.
    raw['userID'] = pd.factorize(raw.user)[0]
    raw['itemID'] = pd.factorize(raw.item)[0]

    splitter.section("Load Raw Data")
    rating_matrix = getSparseMatrix(raw,
                                    row_name='userID',
                                    col_name='itemID',
                                    value_name='rating')
    timestamp_matrix = getSparseMatrix(raw,
                                       row_name='userID',
                                       col_name='itemID',
                                       value_name='timestamp')

    splitter.section("Split CSR Matrices")
    rtrain, rvalid, rtest, nonzero_index, rtime = time_ordered_split(
        rating_matrix=rating_matrix,
        timestamp_matrix=timestamp_matrix,
        ratio=args.ratio,
        implicit=args.implicit)

    splitter.section("Save NPZ")
    for matrix, label in ((rtrain, "Rtrain"), (rvalid, "Rvalid"),
                          (rtest, "Rtest"), (rtime, "Rtime")):
        save_numpy(matrix, args.path, label)
    save_array(nonzero_index, args.path, "Index")
Пример #26
0
def autorec(matrix_train,
            matrix_valid,
            matrix_unif_train,
            iteration=100,
            lam=0.01,
            rank=50,
            seed=0,
            batch_size=256,
            way=None,
            gpu_on=True,
            **unused):
    """Train a plain AutoRec model; `way` chooses the training data source.

    'unif'    -> train on the uniform sample S_t only;
    'combine' -> train on the union S_c + S_t;
    otherwise -> train on the biased log S_c.
    Returns (RQ, X, xBias, Y, yBias).
    """
    splitter = WorkSplitter()

    splitter.section("AutoRec: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)

    splitter.section("AutoRec: Training")
    m, n = matrix_train.shape
    model = AutoRec(n, rank, lamb=lam, batch_size=batch_size, gpu_on=gpu_on)
    metric_names = ['NLL', 'AUC']

    # Select the training matrix once, then run a single training call.
    if way == 'unif':
        train_matrix = matrix_unif_train
    elif way == 'combine':
        matrix_train += matrix_unif_train
        train_matrix = matrix_train
    else:
        train_matrix = matrix_train

    RQ, X, xBias, Y, yBias = model.train_model(train_matrix, matrix_valid,
                                               iteration, metric_names)

    model.sess.close()
    tf.reset_default_graph()

    return RQ, X, xBias, Y, yBias
Пример #27
0
def causalsamplemf(matrix_train, matrix_valid, matrix_unif_train, iteration=100, lam=0.01, lam2=0.01, rank=50,
                   seed=0, batch_size=500, gpu_on=True, **unused):
    """Train CausalSampleMF on the union of S_c and S_t with disjoint item id spaces.

    Items observed in the uniform sample S_t are re-mapped to ids [n, 2n) so the
    model can treat the two logs as separate "treatments" over the same users.
    Returns (RQ, Y, user_bias, item_bias) from the trained model.
    """
    progress = WorkSplitter()

    progress.section("CausalSampleMF: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)

    progress.section("CausalSampleMF: Training")
    m, n = matrix_train.shape

    # Create new item IDs for S_t (i.e., [n, n*2)
    unif_user_item_matrix = lil_matrix(matrix_unif_train)
    unif_user_item_pairs = np.asarray(unif_user_item_matrix.nonzero()).T
    # Fancy indexing on the sparse matrix yields a 1xK matrix; transpose to Kx1.
    unif_label = np.asarray(matrix_unif_train[unif_user_item_pairs[:, 0], unif_user_item_pairs[:, 1]]).T
    # Shift S_t item ids into the second half of the doubled item space.
    unif_user_item_pairs[:, 1] += n

    # Create new csr matrix including union of S_c and S_t
    norm_user_item_matrix = lil_matrix(matrix_train)
    norm_user_item_pairs = np.asarray(norm_user_item_matrix.nonzero()).T
    norm_label = np.asarray(matrix_train[norm_user_item_pairs[:, 0], norm_user_item_pairs[:, 1]]).T

    # Stack (user, item) pairs and labels from both logs into one (m, 2n) matrix.
    user_item_pairs = np.vstack((unif_user_item_pairs, norm_user_item_pairs))
    labels = np.vstack((unif_label, norm_label))
    matrix_train = sparse.csr_matrix(
        (labels[:, 0], (user_item_pairs[:, 0], user_item_pairs[:, 1])),
        shape=(m, n * 2), dtype='float32')

    model = CausalSampleMF(m, n, rank, lamb=lam, lamb2=lam2, batch_size=batch_size, gpu_on=gpu_on)
    metric_names = ['NLL', 'AUC']
    RQ, Y, user_bias, item_bias = model.train_model(matrix_train, matrix_valid, iteration, metric_names)

    model.sess.close()
    tf.reset_default_graph()

    return RQ, Y, user_bias, item_bias
Пример #28
0
def softlabelae(matrix_train,
                matrix_valid,
                iteration=100,
                lam=0.01,
                rank=50,
                rank2=50,
                tau=2,
                seed=0,
                batch_size=256,
                confidence=0.9,
                dataset=None,
                gpu_on=True,
                **unused):
    """Train SoftLabelAE warm-started from DeepAutoRec weights fit on S_t.

    Loads eight pre-trained arrays from
    latent/<dataset>unif_*_DeepAutoRec_200.npy and returns the trained
    (RQ, X, xBias, Y, yBias, Z, zBias, K, kBias) tuple.
    """
    splitter = WorkSplitter()

    splitter.section("SoftLabelAE: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)

    splitter.section("SoftLabelAE: Load the variables trained on S_t")

    # All eight arrays share the same naming scheme; only the tag differs.
    def _load(tag):
        return np.load('latent/' + dataset + 'unif_' + tag + '_DeepAutoRec_200.npy')

    X, Y, Z, K = _load('X'), _load('Y'), _load('Z'), _load('K')
    xBias, yBias = _load('xB'), _load('yB')
    zBias, kBias = _load('zB'), _load('kB')

    splitter.section("SoftLabelAE: Training")
    _, n = matrix_train.shape
    model = SoftLabelAE(n,
                        rank,
                        rank2,
                        lamb=lam,
                        batch_size=batch_size,
                        gpu_on=gpu_on,
                        init_X=X,
                        init_Y=Y,
                        init_Z=Z,
                        init_K=K,
                        init_xBias=xBias,
                        init_yBias=yBias,
                        init_zBias=zBias,
                        init_kBias=kBias,
                        tau=tau,
                        confidence=confidence)
    results = model.train_model(matrix_train, matrix_valid, iteration, ['NLL', 'AUC'])

    model.sess.close()
    tf.reset_default_graph()

    # results == (RQ, X, xBias, Y, yBias, Z, zBias, K, kBias)
    return results
Пример #29
0
def main(args):
    """Split ratings into train/valid/active/test matrices and save them as NPZ.

    First splits users at random into train/test populations, then
    (optionally) carves a validation set out of training by time order, and
    carves an 'active' prefix out of the test users' histories by time order.
    """
    splitter = WorkSplitter()

    splitter.section("Parameter Setting")
    print("Data Path: {}".format(args.path))
    print("Validation: {}".format(args.validation))
    print("Implicit: {}".format(args.implicit))

    splitter.section("Load Raw Data")
    rating_matrix = load_pandas(path=args.path,
                                name=args.name,
                                shape=args.shape)
    timestamp_matrix = load_pandas(path=args.path,
                                   value_name='timestamp',
                                   name=args.name,
                                   shape=args.shape)

    splitter.section("Split CSR Matrices")
    rtrain, rvalid, rtest, _, _, rtime = split_user_randomly(
        rating_matrix=rating_matrix,
        timestamp_matrix=timestamp_matrix,
        ratio=args.split_user_ratio,
        implicit=args.implicit)

    if args.validation:
        # Replace the provisional validation set with a time-ordered one.
        rtrain, rvalid, _, _, _ = time_ordered_split(
            rating_matrix=rtrain,
            timestamp_matrix=rtime,
            ratio=args.split_train_valid_ratio,
            implicit=False,
            remove_empty=False)

    ractive, rtest, _, _, _ = time_ordered_split(
        rating_matrix=rtest,
        timestamp_matrix=rtime,
        ratio=args.split_active_test_ratio,
        implicit=False,
        remove_empty=False)

    splitter.section("Save NPZ")
    for matrix, label in ((rtrain, "Rtrain"), (rvalid, "Rvalid"),
                          (ractive, "Ractive"), (rtest, "Rtest"),
                          (rtime, "Rtime")):
        save_numpy(matrix, args.path, label)
Пример #30
0
def bridgelabelmf(matrix_train,
                  matrix_valid,
                  iteration=100,
                  lam=0.01,
                  lam2=0.01,
                  rank=50,
                  seed=0,
                  batch_size=500,
                  gpu_on=True,
                  dataset=None,
                  **unused):
    """Train BridgeLabelMF initialized from BiasedMF weights fit on S_c and on S_t.

    Loads two sets of factors/biases from latent/<dataset>[unif_]*_BiasedMF_10.npy
    and returns the learned (RQ, Y, user_bias, item_bias).
    """
    splitter = WorkSplitter()

    splitter.section("BridgeLabelMF: Set the random seed")
    np.random.seed(seed)
    tf.set_random_seed(seed)

    splitter.section("BridgeLabelMF: Load the variables trained on S_c/S_t")

    # Weights fit on S_c have no prefix; weights fit on S_t carry 'unif_'.
    def _load(prefix, tag):
        return np.load('latent/' + dataset + prefix + tag + '_BiasedMF_10.npy')

    norm_RQ, norm_Y = _load('', 'U'), _load('', 'V')
    norm_uBias, norm_iBias = _load('', 'uB'), _load('', 'iB')

    unif_RQ, unif_Y = _load('unif_', 'U'), _load('unif_', 'V')
    unif_uBias, unif_iBias = _load('unif_', 'uB'), _load('unif_', 'iB')

    splitter.section("BridgeLabelMF: Training")
    num_users, num_items = matrix_train.shape
    model = BridgeLabelMF(num_users,
                          num_items,
                          rank,
                          lamb=lam,
                          lamb2=lam2,
                          batch_size=batch_size,
                          gpu_on=gpu_on,
                          norm_init_U=norm_RQ,
                          norm_init_V=norm_Y,
                          norm_init_uBias=norm_uBias,
                          norm_init_iBias=norm_iBias,
                          unif_init_U=unif_RQ,
                          unif_init_V=unif_Y,
                          unif_init_uBias=unif_uBias,
                          unif_init_iBias=unif_iBias)
    RQ, Y, user_bias, item_bias = model.train_model(matrix_train, matrix_valid,
                                                    iteration, ['NLL', 'AUC'])

    model.sess.close()
    tf.reset_default_graph()

    return RQ, Y, user_bias, item_bias