Example #1
def main(args):
    progress = WorkSplitter()

    table_path = 'tables/'

    test = load_numpy(path=args.path, name=args.dataset + args.test)

    df = pd.DataFrame({
        'model': [
            'AutoRec', 'AutoRec', 'AutoRec', 'InitFeatureEmbedAE',
            'InitFeatureEmbedAE', 'InitFeatureEmbedAE', 'AlterFeatureEmbedAE',
            'ConcatFeatureEmbedAE', 'UnionSampleAE', 'WRSampleAE',
            'BatchSampleAE', 'BridgeLabelAE', 'RefineLabelAE', 'DeepAutoRec',
            'DeepAutoRec', 'SoftLabelAE', 'HintAE'
        ],
        'way': [
            None, 'unif', 'combine', 'user', 'item', 'both', None, None, None,
            None, None, None, None, None, 'unif', None, None
        ]
    })

    progress.subsection("Reproduce")
    frame = []
    for idx, row in df.iterrows():
        row = row.to_dict()
        row['metric'] = ['NLL', 'AUC']
        row['rank'] = 200
        result = execute(test, row, folder=args.model_folder + args.dataset)
        frame.append(result)

    results = pd.concat(frame)
    save_dataframe_csv(results, table_path, args.name)
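
Every example on this page calls small I/O helpers (load_numpy, save_dataframe_csv) from the project's utility module without showing them. A minimal sketch of plausible implementations, inferred only from the call sites on this page (the real project code may differ):

import os

from scipy.sparse import load_npz


def load_numpy(path, name):
    # Plausible reconstruction: the examples treat the result as a sparse
    # user-item matrix, so a scipy .npz load matches the call sites.
    return load_npz(os.path.join(path, name))


def save_dataframe_csv(df, path, name):
    # Plausible reconstruction: persist a results table under the
    # configured tables directory, creating it if necessary.
    os.makedirs(path, exist_ok=True)
    df.to_csv(os.path.join(path, name), index=False)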
Example #2
def main(args):
    table_path = load_yaml('config/global.yml', key='path')['tables']

    df = find_best_hyperparameters(table_path + args.tuning_result_path,
                                   'MAP@10')

    R_train = load_numpy(path=args.path, name=args.train)
    R_valid = load_numpy(path=args.path, name=args.valid)
    R_test = load_numpy(path=args.path, name=args.test)

    R_train = R_train + R_valid

#    Optional: binarize explicit ratings, keeping only ratings above 3.
#    R_train[(R_train <= 3).nonzero()] = 0
#    R_test[(R_test <= 3).nonzero()] = 0

#    R_train[(R_train > 3).nonzero()] = 1
#    R_test[(R_test > 3).nonzero()] = 1

    topK = [5, 10, 15, 20, 50]

    frame = []
    for idx, row in df.iterrows():
        start = timeit.default_timer()
        row = row.to_dict()
        row['metric'] = ['R-Precision', 'NDCG', 'Precision', 'Recall', "MAP"]
        row['topK'] = topK
        result = execute(R_train, R_test, row, models[row['model']])
        stop = timeit.default_timer()
        print('Time: ', stop - start)
        frame.append(result)

    results = pd.concat(frame)
    save_dataframe_csv(results, table_path, args.name)
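
Examples 2, 4, 6, and 17 start from find_best_hyperparameters, which reads the tuning tables and keeps the best configuration per model. A minimal sketch, assuming the tuning results sit in a single CSV with one scalar column per metric (an assumption; the real helper likely aggregates several files and parses list-valued cells):

import pandas as pd


def find_best_hyperparameters(folder_path, metric):
    # Hypothetical reconstruction inferred from usage, not project code.
    df = pd.read_csv(folder_path + 'tuning.csv')  # assumed file name
    best_rows = df.loc[df.groupby('model')[metric].idxmax()]
    return best_rows.reset_index(drop=True)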
Example #3
def main(args):
    progress = WorkSplitter()

    table_path = 'tables/'

    test = load_numpy(path=args.path, name=args.dataset + args.test)

    df = pd.DataFrame({
        'model': [
            "BiasedMF", "BiasedMF", "BiasedMF", "PropensityMF",
            "InitFeatureEmbedMF", "InitFeatureEmbedMF", "InitFeatureEmbedMF",
            "AlterFeatureEmbedMF", "ConcatFeatureEmbedMF", "CausalSampleMF",
            "UnionSampleMF", "WRSampleMF", "BatchSampleMF", "BridgeLabelMF",
            "RefineLabelMF"
        ],
        'way': [
            None, "unif", "combine", None, "user", "item", "both", None, None,
            None, None, None, None, None, None
        ]
    })

    progress.subsection("Reproduce")
    frame = []
    for idx, row in df.iterrows():
        row = row.to_dict()
        row['metric'] = ['NLL', 'AUC']
        row['rank'] = 10
        result = execute(test, row, folder=args.model_folder + args.dataset)
        frame.append(result)

    results = pd.concat(frame)
    save_dataframe_csv(results, table_path, args.name)
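
All of these entry points expect an args namespace. A hypothetical CLI wiring for Example 3; the flag names are inferred from the attribute accesses in main and the defaults are placeholders, so they may not match the original script:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--path', default='data/')
    parser.add_argument('--dataset', default='movielens/')
    parser.add_argument('--test', default='Rtest.npz')
    parser.add_argument('--model_folder', default='latent/')
    parser.add_argument('--name', default='mf_results.csv')
    main(parser.parse_args())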
Example #4
def main(args):
    table_path = load_yaml('config/global.yml', key='path')['tables']

    df = find_best_hyperparameters(table_path + args.problem, 'NDCG')

    R_train = load_numpy(path=args.path, name=args.train)
    R_valid = load_numpy(path=args.path, name=args.valid)
    R_test = load_numpy(path=args.path, name=args.test)

    R_train = R_train + R_valid

    topK = [5, 10, 15, 20, 50]

    frame = []
    for idx, row in df.iterrows():
        start = timeit.default_timer()
        row = row.to_dict()
        row['metric'] = ['R-Precision', 'NDCG', 'Precision', 'Recall', "MAP"]
        row['topK'] = topK
        result = execute(R_train,
                         R_test,
                         row,
                         models[row['model']],
                         gpu_on=args.gpu)
        stop = timeit.default_timer()
        print('Time: ', stop - start)
        frame.append(result)

    results = pd.concat(frame)
    save_dataframe_csv(results, table_path, args.name)
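
Examples 2, 4, and 6 fold the validation split back into the training matrix with R_train = R_train + R_valid before the final run. For sparse matrices over disjoint splits this simply merges the interactions, as a tiny self-contained check shows:

import numpy as np
from scipy.sparse import csr_matrix

R_train = csr_matrix(np.array([[1, 0], [0, 0]]))
R_valid = csr_matrix(np.array([[0, 1], [0, 0]]))

# Disjoint splits, so element-wise addition merges them without
# double-counting any cell.
print((R_train + R_valid).toarray())  # [[1 1]
                                      #  [0 0]]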
Example #5
def main(args):
    progress = WorkSplitter()

    table_path = 'tables/'

    test = load_numpy(path=args.path, name=args.dataset + args.test)

    df = pd.DataFrame({
        'model': [
            'RestrictedBatchSampleMF', 'RestrictedBatchSampleMF',
            'RestrictedBatchSampleMF', 'RestrictedBatchSampleMF',
            'RestrictedBatchSampleMF'
        ],
        'way': [None, 'head_users', 'tail_users', 'head_items', 'tail_items']
    })

    progress.subsection("Gain Analysis")
    frame = []
    for idx, row in df.iterrows():
        row = row.to_dict()
        row['metric'] = ['NLL', 'AUC']
        row['rank'] = 10
        result = execute(test, row, folder=args.model_folder + args.dataset)
        frame.append(result)

    results = pd.concat(frame)
    save_dataframe_csv(results, table_path, args.name)
Example #6
def main(args):
    table_path = load_yaml('config/global.yml', key='path')['tables']

    df = find_best_hyperparameters(table_path + args.tuning_result_path,
                                   'NDCG')

    R_train = load_numpy(path=args.data_dir, name=args.train_set)
    R_valid = load_numpy(path=args.data_dir, name=args.valid_set)
    R_test = load_numpy(path=args.data_dir, name=args.test_set)

    R_train = R_train + R_valid

    topK = [5, 10, 15, 20, 50]

    frame = []
    for idx, row in df.iterrows():
        start = timeit.default_timer()
        row = row.to_dict()
        row['metric'] = ['R-Precision', 'NDCG', 'Precision', 'Recall', "MAP"]
        row['topK'] = topK
        result = general(R_train,
                         R_test,
                         row,
                         models[row['model']],
                         measure=row['similarity'],
                         gpu_on=args.gpu,
                         model_folder=args.model_folder)
        stop = timeit.default_timer()
        print('Time: ', stop - start)
        frame.append(result)

    results = pd.concat(frame)
    save_dataframe_csv(results, table_path, args.save_path)
Example #7
def main(args):
    # Progress bar
    progress = WorkSplitter()

    # Show hyperparameter settings
    progress.section("Parameter Setting")
    print("Data Directory: {}".format(args.data_dir))
    print("Number of Users Sampled: {}".format(args.num_users_sampled))
    print("Number of Items Sampled: {}".format(args.num_items_sampled))
    print("Number of Max Allowed Iterations: {}".format(
        args.max_iteration_threshold))
    print("Critiquing Model: {}".format(args.critiquing_model_name))

    R_train = load_numpy(path=args.data_dir, name=args.train_set)
    print("Train U-I Dimensions: {}".format(R_train.shape))

    R_test = load_numpy(path=args.data_dir, name=args.test_set)
    print("Test U-I Dimensions: {}".format(R_test.shape))

    R_train_keyphrase = load_numpy(path=args.data_dir,
                                   name=args.train_keyphrase_set).toarray()
    print("Train Item Keyphrase U-I Dimensions: {}".format(
        R_train_keyphrase.shape))

    R_train_item_keyphrase = load_numpy(
        path=args.data_dir, name=args.train_item_keyphrase_set).toarray()

    table_path = load_yaml('config/global.yml', key='path')['tables']
    parameters = find_best_hyperparameters(table_path + args.dataset_name,
                                           'NDCG')
    parameters_row = parameters.loc[parameters['model'] == args.model]

    if args.dataset_name == "yelp/":
        R_train_item_keyphrase = R_train_item_keyphrase.T

    start_time = time.time()

    results = critiquing(
        matrix_Train=R_train,
        matrix_Test=R_test,
        keyphrase_freq=R_train_keyphrase,
        item_keyphrase_freq=R_train_item_keyphrase,
        num_users_sampled=args.num_users_sampled,
        num_items_sampled=args.num_items_sampled,
        max_iteration_threshold=args.max_iteration_threshold,
        dataset_name=args.dataset_name,
        model=models[args.model],
        parameters_row=parameters_row,
        critiquing_model_name=args.critiquing_model_name,
        keyphrase_selection_method=args.keyphrase_selection_method,
        topk=args.topk,
        lamb=args.lamb)

    print("Final Time Elapsed: {}".format(inhour(time.time() - start_time)))

    table_path = load_yaml('config/global.yml', key='path')['tables']
    save_dataframe_csv(results, table_path, args.save_path)
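
The inhour call in Example 7 formats the elapsed seconds for the final printout. A plausible reconstruction (the actual format string may differ):

def inhour(elapsed):
    # Render elapsed seconds as H:MM:SS.
    hours, rest = divmod(int(elapsed), 3600)
    minutes, seconds = divmod(rest, 60)
    return "{:d}:{:02d}:{:02d}".format(hours, minutes, seconds)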
Example #8
def main(args):
    # Progress bar
    progress = WorkSplitter()

    # Show hyperparameter settings
    progress.section("Parameter Setting")
    print("Data Directory: {}".format(args.data_dir))
    print("Number of Users Sampled: {}".format(args.num_users_sampled))
    print("Number of Items Sampled: {}".format(args.num_items_sampled))
    print("Number of Max Allowed Iterations: {}".format(args.max_iteration_threshold))
    print("Critiquing Model: {}".format(args.critiquing_model_name))

    R_train = load_numpy(path=args.data_dir, name=args.train_set)
    print("Train U-I Dimensions: {}".format(R_train.shape))

    R_test = load_numpy(path=args.data_dir, name=args.test_set)
    print("Test U-I Dimensions: {}".format(R_test.shape))

    R_train_keyphrase = load_numpy(path=args.data_dir, name=args.train_keyphrase_set).toarray()
    print("Train Item Keyphrase U-I Dimensions: {}".format(R_train_keyphrase.shape))

    R_train_item_keyphrase = load_numpy(path=args.data_dir, name=args.train_item_keyphrase_set).toarray()

    table_path = load_yaml('config/global.yml', key='path')['tables']
    # parameters = find_best_hyperparameters(table_path+args.dataset_name, 'NDCG')
    # parameters_row = parameters.loc[parameters['model'] == args.model]

    parameters_row = {
        'iter': 10,
        'lambda': 200,
        'rank': 200
    }

    keyphrases_names = load_dataframe_csv(
        path=args.data_dir, name="Keyphrases.csv")['Phrases'].tolist()

    results = critiquing(matrix_Train=R_train,
                         matrix_Test=R_test,
                         keyphrase_freq=R_train_keyphrase,
                         item_keyphrase_freq=R_train_item_keyphrase.T,
                         num_users_sampled=args.num_users_sampled,
                         num_items_sampled=args.num_items_sampled,
                         max_iteration_threshold=args.max_iteration_threshold,
                         dataset_name=args.dataset_name,
                         model=models[args.model],
                         parameters_row=parameters_row,
                         critiquing_model_name=args.critiquing_model_name,
                         lamb=args.lambdas,
                         keyphrases_names=keyphrases_names,
                         keyphrase_selection_method=args.keyphrase_selection_method)

    table_path = load_yaml('config/global.yml', key='path')['tables']
    save_dataframe_csv(results, table_path, args.save_path)
Example #9
def main(args):
    # Progress bar
    progress = WorkSplitter()

    # Show hyperparameter settings
    progress.section("Parameter Setting")
    print("Data Directory: {}".format(args.data_dir))
    print("Number of Users Sampled: {}".format(args.num_users_sampled))
    print("Number of Items Sampled: {}".format(args.num_items_sampled))
    print("Number of Max Allowed Iterations: {}".format(
        args.max_iteration_threshold))
    print("Critiquing Model: {}".format(args.critiquing_model_name))

    R_train = load_numpy(path=args.data_dir, name=args.train_set)
    print("Train U-I Dimensions: {}".format(R_train.shape))

    R_test = load_numpy(path=args.data_dir, name=args.test_set)
    print("Test U-I Dimensions: {}".format(R_test.shape))

    R_train_keyphrase = load_numpy(path=args.data_dir,
                                   name=args.train_keyphrase_set).toarray()
    print("Train User Keyphrase U-I Dimensions: {}".format(
        R_train_keyphrase.shape))

    R_train_item_keyphrase = load_numpy(
        path=args.data_dir, name=args.train_item_keyphrase_set).toarray()
    print("Train Item Keyphrase U-I Dimensions: {}".format(
        R_train_item_keyphrase.shape))

    # table_path = load_yaml('config/global.yml', key='path')['tables']
    # parameters = find_best_hyperparameters(table_path+args.dataset_name, 'NDCG')
    # parameters_row = parameters.loc[parameters['model'] == args.model]
    parameters_row = pd.DataFrame({'iter': [4], 'lambda': [80], 'rank': [200]})

    results = critiquing(matrix_Train=R_train,
                         matrix_Test=R_test,
                         keyphrase_freq=R_train_keyphrase,
                         item_keyphrase_freq=R_train_item_keyphrase,
                         num_users_sampled=args.num_users_sampled,
                         num_items_sampled=args.num_items_sampled,
                         max_iteration_threshold=args.max_iteration_threshold,
                         dataset_name=args.dataset_name,
                         model=models[args.model],
                         parameters_row=parameters_row,
                         critiquing_model_name=args.critiquing_model_name)

    # table_path = load_yaml('config/global.yml', key='path')['tables']
    table_path = '/home/shuyang/data4/LatentLinearCritiquingforConvRecSys/'
    save_dataframe_csv(results, table_path, args.save_path)
Example #10
def hyper_parameter_tuning(train, validation, params, save_path):
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']

    try:
        df = load_dataframe_csv(table_path, save_path)
    except Exception:
        df = pd.DataFrame(columns=['model', 'k', 'topK'])

    num_user = train.shape[0]

    for algorithm in params['models']:

        for k in params['k']:

            if ((df['model'] == algorithm) & (df['k'] == k)).any():
                continue

            format = "model: {}, k: {}"
            progress.section(format.format(algorithm, k))

            progress.subsection("Training")
            model = params['models'][algorithm]()
            model.train(train)

            progress.subsection("Prediction")
            prediction_score = model.predict(train, k=k)

            prediction = predict(prediction_score=prediction_score,
                                 topK=params['topK'][-1],
                                 matrix_Train=train)

            progress.subsection("Evaluation")
            result = evaluate(prediction, validation, params['metric'],
                              params['topK'])

            result_dict = {'model': algorithm, 'k': k}

            for name in result.keys():
                result_dict[name] = [
                    round(result[name][0], 4),
                    round(result[name][1], 4)
                ]

            df = df.append(result_dict, ignore_index=True)

            save_dataframe_csv(df, table_path, save_path)
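
Note that DataFrame.append, used here and in Examples 11 through 16, was deprecated in pandas 1.4 and removed in 2.0. On current pandas the equivalent accumulation is:

import pandas as pd

df = pd.DataFrame(columns=['model', 'k'])
result_dict = {'model': 'KNN', 'k': 50}  # illustrative values

# pandas < 2.0: df = df.append(result_dict, ignore_index=True)
df = pd.concat([df, pd.DataFrame([result_dict])], ignore_index=True)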
Example #11
def explanation_converge(num_users,
                         num_items,
                         user_col,
                         item_col,
                         rating_col,
                         keyphrase_vector_col,
                         df_train,
                         df_test,
                         keyphrase_names,
                         df,
                         table_path,
                         file_name,
                         epoch=10):
    progress = WorkSplitter()

    results = pd.DataFrame(columns=[
        'model', 'rank', 'num_layers', 'train_batch_size',
        'predict_batch_size', 'lambda', 'topK', 'learning_rate', 'epoch',
        'negative_sampling_size', 'optimizer'
    ])

    for run in range(3):

        for idx, row in df.iterrows():
            row = row.to_dict()
            if row['model'] not in explanable_models:
                continue

            progress.section(json.dumps(row))

            row['metric'] = ['NDCG', 'Recall', 'Precision', 'MAP']
            row['topK'] = [10]

            if 'optimizer' not in row.keys():
                row['optimizer'] = 'Adam'

            negative_sampler = Negative_Sampler(
                df_train[[user_col, item_col, keyphrase_vector_col]],
                user_col,
                item_col,
                rating_col,
                keyphrase_vector_col,
                num_items=num_items,
                batch_size=row['train_batch_size'],
                num_keyphrases=len(keyphrase_names),
                negative_sampling_size=1)
            # explanation is not sensitive to the negative sampling size

            model = models[row['model']](num_users=num_users,
                                         num_items=num_items,
                                         text_dim=len(keyphrase_names),
                                         embed_dim=row['rank'],
                                         num_layers=row['num_layers'],
                                         negative_sampler=negative_sampler,
                                         lamb=row['lambda'],
                                         learning_rate=row['learning_rate'])

            batches = negative_sampler.get_batches()

            epoch_batch = 10

            for i in range(epoch // epoch_batch):

                if i == 0:
                    model.train_model(df_train,
                                      user_col,
                                      item_col,
                                      rating_col,
                                      epoch=epoch_batch,
                                      batches=batches,
                                      init_embedding=True)
                else:
                    model.train_model(df_train,
                                      user_col,
                                      item_col,
                                      rating_col,
                                      epoch=epoch_batch,
                                      batches=batches,
                                      init_embedding=False)

                df_valid_explanation = predict_explanation(
                    model,
                    df_test,
                    user_col,
                    item_col,
                    topk_keyphrase=row['topK'][0])

                result = evaluate_explanation(df_valid_explanation, df_test,
                                              row['metric'], row['topK'],
                                              user_col, item_col, rating_col,
                                              keyphrase_vector_col)

                # Not finished yet
                result_dict = {
                    'model': row['model'],
                    'rank': row['rank'],
                    'num_layers': row['num_layers'],
                    'train_batch_size': row['train_batch_size'],
                    'predict_batch_size': row['predict_batch_size'],
                    'lambda': row['lambda'],
                    'topK': row['topK'][0],
                    'learning_rate': row['learning_rate'],
                    'epoch': (i + 1) * epoch_batch,
                    'negative_sampling_size': row['negative_sampling_size'],
                    'optimizer': row['optimizer']
                }

                for name in result.keys():
                    result_dict[name] = round(result[name][0], 4)
                results = results.append(result_dict, ignore_index=True)
                print("result is \n {}".format(results))

            model.sess.close()
            tf.reset_default_graph()

            save_dataframe_csv(results, table_path, file_name)

    return results
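
Examples 11, 12, and 15 close the TensorFlow 1.x session and call tf.reset_default_graph() between configurations; without this, each new model would keep adding variables to the same default graph. The idiom in isolation (TF 1.x API):

import tensorflow as tf  # TF 1.x, as these examples assume

for rank in [10, 50, 200]:
    w = tf.get_variable('embedding', shape=[rank, rank])
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # ... train and evaluate one configuration ...
    # Without the reset, the next get_variable('embedding', ...) call
    # would collide with the variable left over from this iteration.
    tf.reset_default_graph()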
Example #12
def explanation_parameter_tuning(num_users, num_items, user_col, item_col,
                                 rating_col, keyphrase_vector_col, df_train,
                                 df_valid, keyphrase_names, params, save_path):
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']
    try:
        df = load_dataframe_csv(table_path, save_path)
    except Exception:
        df = pd.DataFrame(columns=[
            'model', 'rank', 'num_layers', 'train_batch_size',
            'predict_batch_size', 'lambda', 'topK', 'learning_rate', 'epoch',
            'negative_sampling_size'
        ])

    for algorithm in params['models']:

        for rank in params['rank']:

            for num_layers in params['num_layers']:

                for train_batch_size in params['train_batch_size']:

                    for predict_batch_size in params['predict_batch_size']:

                        for lamb in params['lambda']:

                            for learning_rate in params['learning_rate']:

                                for epoch in params['epoch']:

                                    for negative_sampling_size in params[
                                            'negative_sampling_size']:

                                        if ((df['model'] == algorithm) &
                                            (df['rank'] == rank) &
                                            (df['num_layers'] == num_layers) &
                                            (df['train_batch_size']
                                             == train_batch_size) &
                                            (df['predict_batch_size']
                                             == predict_batch_size) &
                                            (df['lambda'] == lamb) &
                                            (df['learning_rate']
                                             == learning_rate) &
                                            (df['epoch'] == epoch) &
                                            (df['negative_sampling_size']
                                             == negative_sampling_size)).any():
                                            continue

                                        format = "model: {0}, rank: {1}, num_layers: {2}, " \
                                                 "train_batch_size: {3}, predict_batch_size: {4}, " \
                                                 "lambda: {5}, learning_rate: {6}, epoch: {7}, " \
                                                 "negative_sampling_size: {8}"
                                        progress.section(
                                            format.format(
                                                algorithm, rank, num_layers,
                                                train_batch_size,
                                                predict_batch_size, lamb,
                                                learning_rate, epoch,
                                                negative_sampling_size))

                                        progress.subsection(
                                            "Initializing Negative Sampler")

                                        negative_sampler = Negative_Sampler(
                                            df_train[[
                                                user_col, item_col,
                                                keyphrase_vector_col
                                            ]],
                                            user_col,
                                            item_col,
                                            rating_col,
                                            keyphrase_vector_col,
                                            num_items=num_items,
                                            batch_size=train_batch_size,
                                            num_keyphrases=len(keyphrase_names),
                                            negative_sampling_size=negative_sampling_size)

                                        model = params['models'][algorithm](
                                            num_users=num_users,
                                            num_items=num_items,
                                            text_dim=len(keyphrase_names),
                                            embed_dim=rank,
                                            num_layers=num_layers,
                                            negative_sampler=negative_sampler,
                                            lamb=lamb,
                                            learning_rate=learning_rate)

                                        progress.subsection("Training")

                                        model.train_model(df_train,
                                                          user_col,
                                                          item_col,
                                                          rating_col,
                                                          epoch=epoch)

                                        progress.subsection("Prediction")

                                        df_valid_explanation = predict_explanation(
                                            model,
                                            df_valid,
                                            user_col,
                                            item_col,
                                            topk_keyphrase=params['topK'][-1])

                                        progress.subsection("Evaluation")

                                        explanation_result = evaluate_explanation(
                                            df_valid_explanation, df_valid,
                                            params['metric'], params['topK'],
                                            user_col, item_col, rating_col,
                                            keyphrase_vector_col)

                                        result_dict = {
                                            'model': algorithm,
                                            'rank': rank,
                                            'num_layers': num_layers,
                                            'train_batch_size': train_batch_size,
                                            'predict_batch_size': predict_batch_size,
                                            'lambda': lamb,
                                            'learning_rate': learning_rate,
                                            'epoch': epoch,
                                            'negative_sampling_size': negative_sampling_size
                                        }

                                        for name in explanation_result.keys():
                                            result_dict[name] = [
                                                round(explanation_result[name][0], 4),
                                                round(explanation_result[name][1], 4)
                                            ]

                                        df = df.append(result_dict,
                                                       ignore_index=True)

                                        model.sess.close()
                                        tf.reset_default_graph()

                                        save_dataframe_csv(
                                            df, table_path, save_path)
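
Examples 10, 12, 13, and 14 share a resumable tuning pattern: load the results CSV if it exists, skip hyperparameter combinations that already have a row, and re-save after every evaluation so an interrupted sweep can resume. Stripped to a generic skeleton (illustrative only; evaluate_fn stands in for the train/predict/evaluate body, and the CSV helpers are the project's):

import itertools

import pandas as pd


def resumable_grid_search(grid, evaluate_fn, table_path, save_path):
    try:
        df = load_dataframe_csv(table_path, save_path)
    except Exception:
        df = pd.DataFrame(columns=list(grid))

    keys = list(grid)
    for values in itertools.product(*(grid[k] for k in keys)):
        setting = dict(zip(keys, values))
        # Skip settings that already have a saved row.
        done = pd.Series(True, index=df.index)
        for k, v in setting.items():
            done &= (df[k] == v)
        if done.any():
            continue
        setting.update(evaluate_fn(**setting))  # metrics merged into the row
        df = pd.concat([df, pd.DataFrame([setting])], ignore_index=True)
        save_dataframe_csv(df, table_path, save_path)  # checkpoint each step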
Example #13
def hyper_parameter_tuning(train, validation, params, save_path, measure='Cosine', gpu_on=True):
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']

    try:
        df = load_dataframe_csv(table_path, save_path)
    except Exception:
        df = pd.DataFrame(columns=['model', 'similarity', 'alpha', 'batch_size',
                                   'corruption', 'epoch', 'iteration', 'key_dimension',
                                   'lambda', 'learning_rate', 'mode_dimension',
                                   'normalize', 'rank', 'root', 'topK'])

    num_user = train.shape[0]

    for algorithm in params['models']:

        for alpha in params['alpha']:

            for batch_size in params['batch_size']:

                for corruption in params['corruption']:

                    for epoch in params['epoch']:

                        for iteration in params['iteration']:

                            for key_dim in params['key_dimension']:

                                for lamb in params['lambda']:

                                    for learning_rate in params['learning_rate']:

                                        for mode_dim in params['mode_dimension']:

                                            for rank in params['rank']:

                                                for root in params['root']:

                                                    if ((df['model'] == algorithm) &
                                                        (df['alpha'] == alpha) &
                                                        (df['batch_size'] == batch_size) &
                                                        (df['corruption'] == corruption) &
                                                        (df['epoch'] == epoch) &
                                                        (df['iteration'] == iteration) &
                                                        (df['key_dimension'] == key_dim) &
                                                        (df['lambda'] == lamb) &
                                                        (df['learning_rate'] == learning_rate) &
                                                        (df['mode_dimension'] == mode_dim) &
                                                        (df['rank'] == rank) &
                                                        (df['root'] == root)).any():
                                                        continue

                                                    format = "model: {}, alpha: {}, batch_size: {}, corruption: {}, epoch: {}, iteration: {}, \
                                                        key_dimension: {}, lambda: {}, learning_rate: {}, mode_dimension: {}, rank: {}, root: {}"
                                                    progress.section(format.format(algorithm, alpha, batch_size, corruption, epoch, iteration,
                                                                                   key_dim, lamb, learning_rate, mode_dim, rank, root))
                                                    RQ, Yt, Bias = params['models'][algorithm](train,
                                                                                               embedded_matrix=np.empty((0)),
                                                                                               mode_dim=mode_dim,
                                                                                               key_dim=key_dim,
                                                                                               batch_size=batch_size,
                                                                                               learning_rate=learning_rate,
                                                                                               iteration=iteration,
                                                                                               epoch=epoch,
                                                                                               rank=rank,
                                                                                               corruption=corruption,
                                                                                               gpu_on=gpu_on,
                                                                                               lamb=lamb,
                                                                                               alpha=alpha,
                                                                                               root=root)
                                                    Y = Yt.T

                                                    progress.subsection("Prediction")

                                                    prediction = predict(matrix_U=RQ,
                                                                         matrix_V=Y,
                                                                         bias=Bias,
                                                                         topK=params['topK'][-1],
                                                                         matrix_Train=train,
                                                                         measure=measure,
                                                                         gpu=gpu_on)

                                                    progress.subsection("Evaluation")

                                                    result = evaluate(prediction,
                                                                      validation,
                                                                      params['metric'],
                                                                      params['topK'])

                                                    result_dict = {'model': algorithm,
                                                                   'alpha': alpha,
                                                                   'batch_size': batch_size,
                                                                   'corruption': corruption,
                                                                   'epoch': epoch,
                                                                   'iteration': iteration,
                                                                   'key_dimension': key_dim,
                                                                   'lambda': lamb,
                                                                   'learning_rate': learning_rate,
                                                                   'mode_dimension': mode_dim,
                                                                   'rank': rank,
                                                                   'similarity': params['similarity'],
                                                                   'root': root}

                                                    for name in result.keys():
                                                        result_dict[name] = [round(result[name][0], 4), round(result[name][1], 4)]

                                                    df = df.append(result_dict, ignore_index=True)

                                                    save_dataframe_csv(df, table_path, save_path)
Example #14
def hyper_parameter_tuning(train,
                           validation,
                           params,
                           save_path,
                           measure='Cosine',
                           gpu_on=True):
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']

    try:
        df = load_dataframe_csv(table_path, save_path)
    except Exception:
        df = pd.DataFrame(columns=[
            'model', 'rank', 'alpha', 'lambda', 'iter', 'similarity',
            'corruption', 'root', 'topK'
        ])

    num_user = train.shape[0]

    for algorithm in params['models']:

        for rank in params['rank']:

            for alpha in params['alpha']:

                for lam in params['lambda']:

                    for corruption in params['corruption']:

                        for root in params['root']:

                            if ((df['model'] == algorithm) &
                                (df['rank'] == rank) & (df['alpha'] == alpha) &
                                (df['lambda'] == lam) &
                                (df['corruption'] == corruption) &
                                (df['root'] == root)).any():
                                continue

                            format = "model: {0}, rank: {1}, alpha: {2}, lambda: {3}, corruption: {4}, root: {5}"
                            progress.section(
                                format.format(algorithm, rank, alpha, lam,
                                              corruption, root))
                            RQ, Yt, Bias = params['models'][algorithm](
                                train,
                                embeded_matrix=np.empty((0)),
                                iteration=params['iter'],
                                rank=rank,
                                lam=lam,
                                alpha=alpha,
                                corruption=corruption,
                                root=root,
                                gpu_on=gpu_on)
                            Y = Yt.T

                            progress.subsection("Prediction")

                            prediction = predict(matrix_U=RQ,
                                                 matrix_V=Y,
                                                 measure=measure,
                                                 bias=Bias,
                                                 topK=params['topK'][-1],
                                                 matrix_Train=train,
                                                 gpu=gpu_on)

                            progress.subsection("Evaluation")

                            result = evaluate(prediction, validation,
                                              params['metric'], params['topK'])

                            result_dict = {
                                'model': algorithm,
                                'rank': rank,
                                'alpha': alpha,
                                'lambda': lam,
                                'iter': params['iter'],
                                'similarity': params['similarity'],
                                'corruption': corruption,
                                'root': root
                            }

                            for name in result.keys():
                                result_dict[name] = [
                                    round(result[name][0], 4),
                                    round(result[name][1], 4)
                                ]

                            df = df.append(result_dict, ignore_index=True)

                            save_dataframe_csv(df, table_path, save_path)
Example #15
def converge(Rtrain, Rtest, df, table_path, file_name, epochs=10, gpu_on=True):
    progress = WorkSplitter()
    m, n = Rtrain.shape

    valid_models = autoencoders.keys()

    results = pd.DataFrame(
        columns=['model', 'rank', 'lambda', 'epoch', 'optimizer'])

    for run in range(3):

        for idx, row in df.iterrows():
            row = row.to_dict()

            if row['model'] not in valid_models:
                continue

            progress.section(json.dumps(row))

            row['metric'] = ['NDCG', 'R-Precision']
            row['topK'] = [50]
            if 'optimizer' not in row.keys():
                row['optimizer'] = 'RMSProp'
            try:
                model = autoencoders[row['model']](
                    n,
                    row['rank'],
                    batch_size=100,
                    lamb=row['lambda'],
                    optimizer=Regularizer[row['optimizer']])

            except Exception:
                # Some autoencoders also take the number of users m.
                model = autoencoders[row['model']](
                    m,
                    n,
                    row['rank'],
                    batch_size=100,
                    lamb=row['lambda'],
                    optimizer=Regularizer[row['optimizer']])

            batches = model.get_batches(Rtrain, 100)

            epoch_batch = 50
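            # NB: the following loop runs epochs // epoch_batch times; with
            # the default epochs=10 it never executes, so callers presumably
            # pass a multiple of 50 (e.g. epochs=300).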

            for i in range(epochs // epoch_batch):

                model.train_model(Rtrain,
                                  corruption=row['corruption'],
                                  epoch=epoch_batch,
                                  batches=batches)

                RQ = model.get_RQ(Rtrain)
                Y = model.get_Y()
                Bias = model.get_Bias()

                Y = Y.T

                prediction = predict(matrix_U=RQ,
                                     matrix_V=Y,
                                     bias=Bias,
                                     topK=row['topK'][0],
                                     matrix_Train=Rtrain,
                                     measure='Cosine',
                                     gpu=gpu_on)

                result = evaluate(prediction, Rtest, row['metric'],
                                  row['topK'])
                # Not finished yet
                result_dict = {
                    'model': row['model'],
                    'rank': row['rank'],
                    'lambda': row['lambda'],
                    'optimizer': row['optimizer'],
                    'epoch': (i + 1) * epoch_batch
                }

                for name in result.keys():
                    result_dict[name] = round(result[name][0], 4)
                results = results.append(result_dict, ignore_index=True)

            model.sess.close()
            tf.reset_default_graph()

            save_dataframe_csv(results, table_path, file_name)

    return results
Example #16
def hyper_parameter_tuning(train, validation, params, unif_train, save_path,
                           seed, way, dataset, gpu_on):
    progress = WorkSplitter()

    table_path = 'tables/'
    data_name = save_path.split('/')[0]
    save_dir = 'tables/' + data_name + '/'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    for algorithm in params['models']:
        if algorithm in ['AutoRec']:
            df = pd.DataFrame(
                columns=['model', 'rank', 'batch_size', 'lambda', 'iter'])
            for rank in params['rank']:
                for batch_size in params['batch_size']:
                    for lam in params['lambda']:
                        format = "model: {0}, rank: {1}, batch_size: {2}, lambda: {3}"
                        progress.section(
                            format.format(algorithm, rank, batch_size, lam))
                        RQ, X, xBias, Y, yBias = params['models'][algorithm](
                            train,
                            validation,
                            matrix_unif_train=unif_train,
                            iteration=params['iter'],
                            rank=rank,
                            gpu_on=gpu_on,
                            lam=lam,
                            seed=seed,
                            batch_size=batch_size,
                            way=way,
                            dataset=dataset)

                        progress.subsection("Prediction")
                        prediction = predict(matrix_U=RQ,
                                             matrix_V=Y.T,
                                             matrix_Valid=validation,
                                             bias=yBias,
                                             gpu=gpu_on)

                        progress.subsection("Evaluation")
                        result = evaluate(prediction,
                                          validation,
                                          params['metric'],
                                          gpu=gpu_on)
                        result_dict = {
                            'model': algorithm,
                            'rank': rank,
                            'batch_size': batch_size,
                            'lambda': lam,
                            'iter': params['iter']
                        }
                        for name in result.keys():
                            result_dict[name] = round(result[name][0], 8)
                        df = df.append(result_dict, ignore_index=True)
                        save_dataframe_csv(df, table_path, save_path)
        elif algorithm in ['InitFeatureEmbedAE', 'ConcatFeatureEmbedAE']:
            df = pd.DataFrame(
                columns=['model', 'batch_size', 'lambda', 'iter'])
            for batch_size in params['batch_size']:
                for lam in params['lambda']:
                    format = "model: {0}, batch_size: {1}, lambda: {2}"
                    progress.section(format.format(algorithm, batch_size, lam))
                    RQ, X, xBias, Y, yBias = params['models'][algorithm](
                        train,
                        validation,
                        matrix_unif_train=unif_train,
                        iteration=params['iter'],
                        rank=params['rank'],
                        gpu_on=gpu_on,
                        lam=lam,
                        seed=seed,
                        batch_size=batch_size,
                        way=way,
                        dataset=dataset)

                    progress.subsection("Prediction")
                    prediction = predict(matrix_U=RQ,
                                         matrix_V=Y.T,
                                         matrix_Valid=validation,
                                         bias=yBias,
                                         gpu=gpu_on)

                    progress.subsection("Evaluation")
                    result = evaluate(prediction,
                                      validation,
                                      params['metric'],
                                      gpu=gpu_on)
                    result_dict = {
                        'model': algorithm,
                        'batch_size': batch_size,
                        'lambda': lam,
                        'iter': params['iter']
                    }
                    for name in result.keys():
                        result_dict[name] = round(result[name][0], 8)
                    df = df.append(result_dict, ignore_index=True)
                    save_dataframe_csv(df, table_path, save_path)
        elif algorithm in ['UnionSampleAE', 'RefineLabelAE']:
            df = pd.DataFrame(columns=['model', 'confidence', 'iter'])
            for conf in params['confidence']:
                format = "model: {0}, confidence: {1}"
                progress.section(format.format(algorithm, conf))
                RQ, X, xBias, Y, yBias = params['models'][algorithm](
                    train,
                    validation,
                    matrix_unif_train=unif_train,
                    iteration=params['iter'],
                    rank=params['rank'],
                    gpu_on=gpu_on,
                    lam=params['lambda'],
                    seed=seed,
                    batch_size=params['batch_size'],
                    way=way,
                    confidence=conf,
                    dataset=dataset)

                progress.subsection("Prediction")
                prediction = predict(matrix_U=RQ,
                                     matrix_V=Y.T,
                                     matrix_Valid=validation,
                                     bias=yBias,
                                     gpu=gpu_on)

                progress.subsection("Evaluation")
                result = evaluate(prediction,
                                  validation,
                                  params['metric'],
                                  gpu=gpu_on)
                result_dict = {
                    'model': algorithm,
                    'confidence': conf,
                    'iter': params['iter']
                }
                for name in result.keys():
                    result_dict[name] = round(result[name][0], 8)
                df = df.append(result_dict, ignore_index=True)
                save_dataframe_csv(df, table_path, save_path)
        elif algorithm in ['BatchSampleAE']:
            df = pd.DataFrame(columns=['model', 'step', 'iter'])
            for step in params['step']:
                format = "model: {0}, step: {1}"
                progress.section(format.format(algorithm, step))
                RQ, X, xBias, Y, yBias = params['models'][algorithm](
                    train,
                    validation,
                    matrix_unif_train=unif_train,
                    iteration=params['iter'],
                    rank=params['rank'],
                    gpu_on=gpu_on,
                    lam=params['lambda'],
                    seed=seed,
                    batch_size=params['batch_size'],
                    way=way,
                    step=step,
                    dataset=dataset)

                progress.subsection("Prediction")
                prediction = predict(matrix_U=RQ,
                                     matrix_V=Y.T,
                                     matrix_Valid=validation,
                                     bias=yBias,
                                     gpu=gpu_on)

                progress.subsection("Evaluation")
                result = evaluate(prediction,
                                  validation,
                                  params['metric'],
                                  gpu=gpu_on)
                result_dict = {
                    'model': algorithm,
                    'step': step,
                    'iter': params['iter']
                }
                for name in result.keys():
                    result_dict[name] = round(result[name][0], 8)
                df = df.append(result_dict, ignore_index=True)
                save_dataframe_csv(df, table_path, save_path)
        elif algorithm in ['BridgeLabelAE']:
            df = pd.DataFrame(columns=['model', 'lambda', 'lambda2', 'iter'])
            for lam in params['lambda']:
                for lam2 in params['lambda2']:
                    format = "model: {0}, lambda: {1}, lambda2: {2}"
                    progress.section(format.format(algorithm, lam, lam2))
                    RQ, X, xBias, Y, yBias = params['models'][algorithm](
                        train,
                        validation,
                        matrix_unif_train=unif_train,
                        iteration=params['iter'],
                        rank=params['rank'],
                        gpu_on=gpu_on,
                        lam=lam,
                        lam2=lam2,
                        seed=seed,
                        batch_size=params['batch_size'],
                        way=way,
                        dataset=dataset)

                    progress.subsection("Prediction")
                    prediction = predict(matrix_U=RQ,
                                         matrix_V=Y.T,
                                         matrix_Valid=validation,
                                         bias=yBias,
                                         gpu=gpu_on)

                    progress.subsection("Evaluation")
                    result = evaluate(prediction,
                                      validation,
                                      params['metric'],
                                      gpu=gpu_on)
                    result_dict = {
                        'model': algorithm,
                        'lambda': lam,
                        'lambda2': lam2,
                        'iter': params['iter']
                    }
                    for name in result.keys():
                        result_dict[name] = round(result[name][0], 8)
                    df = df.append(result_dict, ignore_index=True)
                    save_dataframe_csv(df, table_path, save_path)
        elif algorithm in ['SoftLabelAE']:
            df = pd.DataFrame(columns=['model', 'confidence', 'tau', 'iter'])
            for conf in params['confidence']:
                for tau in params['tau']:
                    format = "model: {0}, confidence: {1}, tau: {2}"
                    progress.section(format.format(algorithm, conf, tau))
                    RQ, X, xBias, Y, yBias, Z, zBias, K, kBias = params[
                        'models'][algorithm](train,
                                             validation,
                                             matrix_unif_train=unif_train,
                                             iteration=params['iter'],
                                             rank=params['rank'],
                                             rank2=params['rank2'],
                                             gpu_on=gpu_on,
                                             lam=params['lambda'],
                                             seed=seed,
                                             batch_size=params['batch_size'],
                                             confidence=conf,
                                             tau=tau,
                                             dataset=dataset)

                    progress.subsection("Prediction")
                    prediction = predict(matrix_U=RQ,
                                         matrix_V=K.T,
                                         matrix_Valid=validation,
                                         bias=yBias,
                                         gpu=gpu_on)

                    progress.subsection("Evaluation")
                    result = evaluate(prediction,
                                      validation,
                                      params['metric'],
                                      gpu=gpu_on)
                    result_dict = {
                        'model': algorithm,
                        'confidence': conf,
                        'tau': tau,
                        'iter': params['iter']
                    }
                    for name in result.keys():
                        result_dict[name] = round(result[name][0], 8)
                    df = df.append(result_dict, ignore_index=True)
                    save_dataframe_csv(df, table_path, save_path)
        elif algorithm in ['HintAE']:
            df = pd.DataFrame(columns=['model', 'confidence', 'iter'])
            for conf in params['confidence']:
                format = "model: {0}, confidence: {1}"
                progress.section(format.format(algorithm, conf))
                RQ, X, xBias, Y, yBias, Z, zBias, K, kBias = params['models'][
                    algorithm](train,
                               validation,
                               matrix_unif_train=unif_train,
                               iteration=params['iter'],
                               rank=params['rank'],
                               rank2=params['rank2'],
                               gpu_on=gpu_on,
                               lam=params['lambda'],
                               seed=seed,
                               batch_size=params['batch_size'],
                               confidence=conf,
                               dataset=dataset)

                progress.subsection("Prediction")
                prediction = predict(matrix_U=RQ,
                                     matrix_V=K.T,
                                     matrix_Valid=validation,
                                     bias=yBias,
                                     gpu=gpu_on)

                progress.subsection("Evaluation")
                result = evaluate(prediction,
                                  validation,
                                  params['metric'],
                                  gpu=gpu_on)
                result_dict = {
                    'model': algorithm,
                    'confidence': conf,
                    'iter': params['iter']
                }
                for name in result.keys():
                    result_dict[name] = round(result[name][0], 8)
                df = df.append(result_dict, ignore_index=True)
                save_dataframe_csv(df, table_path, save_path)
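
The long if/elif chain in Example 16 exists because each algorithm family sweeps a different subset of hyperparameters. Summarized as data, taken directly from the branches above:

# Which hyperparameters each branch of Example 16 sweeps:
SEARCH_SPACE = {
    'AutoRec': ['rank', 'batch_size', 'lambda'],
    'InitFeatureEmbedAE': ['batch_size', 'lambda'],
    'ConcatFeatureEmbedAE': ['batch_size', 'lambda'],
    'UnionSampleAE': ['confidence'],
    'RefineLabelAE': ['confidence'],
    'BatchSampleAE': ['step'],
    'BridgeLabelAE': ['lambda', 'lambda2'],
    'SoftLabelAE': ['confidence', 'tau'],
    'HintAE': ['confidence'],
}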
Example #17
def general(num_users, num_items, user_col, item_col, rating_col,
            keyphrase_vector_col, df_train, df_test, keyphrase_names, params,
            save_path):
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']
    df = find_best_hyperparameters(table_path + params['tuning_result_path'],
                                   'NDCG')

    try:
        output_df = load_dataframe_csv(table_path, save_path)
    except FileNotFoundError:  # no previous results; start a fresh table
        output_df = pd.DataFrame(columns=[
            'model', 'rank', 'num_layers', 'train_batch_size',
            'predict_batch_size', 'lambda', 'topK', 'learning_rate', 'epoch',
            'negative_sampling_size'
        ])

    for index, row in df.iterrows():

        algorithm = row['model']
        rank = row['rank']
        num_layers = row['num_layers']
        train_batch_size = row['train_batch_size']
        predict_batch_size = row['predict_batch_size']
        lamb = row['lambda']
        learning_rate = row['learning_rate']
        epoch = 300
        negative_sampling_size = row['negative_sampling_size']

        row['topK'] = [5, 10, 15, 20, 50]
        row['metric'] = [
            'R-Precision', 'NDCG', 'Clicks', 'Recall', 'Precision', 'MAP'
        ]

        format = "model: {0}, rank: {1}, num_layers: {2}, train_batch_size: {3}, " \
                 "predict_batch_size: {4}, lambda: {5}, learning_rate: {6}, epoch: {7}, negative_sampling_size: {8}"
        progress.section(
            format.format(algorithm, rank, num_layers, train_batch_size,
                          predict_batch_size, lamb, learning_rate, epoch,
                          negative_sampling_size))

        progress.subsection("Initializing Negative Sampler")

        negative_sampler = Negative_Sampler(
            df_train[[user_col, item_col, keyphrase_vector_col]],
            user_col,
            item_col,
            rating_col,
            keyphrase_vector_col,
            num_items=num_items,
            batch_size=train_batch_size,
            num_keyphrases=len(keyphrase_names),
            negative_sampling_size=negative_sampling_size)

        model = models[algorithm](num_users=num_users,
                                  num_items=num_items,
                                  text_dim=len(keyphrase_names),
                                  embed_dim=rank,
                                  num_layers=num_layers,
                                  negative_sampler=negative_sampler,
                                  lamb=lamb,
                                  learning_rate=learning_rate)

        progress.subsection("Training")

        pretrained_path = load_yaml('config/global.yml',
                                    key='path')['pretrained']
        # try:
        #     model.load_model(pretrained_path+params['tuning_result_path'], row['model'])
        # except:
        model.train_model(df_train,
                          user_col,
                          item_col,
                          rating_col,
                          epoch=epoch)
        # model.save_model(pretrained_path+params['tuning_result_path'], row['model'])

        progress.subsection("Prediction")

        prediction, explanation = predict_elementwise(
            model,
            df_train,
            user_col,
            item_col,
            row['topK'][-1],
            batch_size=row['predict_batch_size'],
            enable_explanation=False,
            keyphrase_names=keyphrase_names)

        R_test = to_sparse_matrix(df_test, num_users, num_items, user_col,
                                  item_col, rating_col)

        result = evaluate(prediction, R_test, row['metric'], row['topK'])

        # Not finished yet
        result_dict = {
            'model': row['model'],
            'rank': row['rank'],
            'num_layers': row['num_layers'],
            'train_batch_size': row['train_batch_size'],
            'predict_batch_size': row['predict_batch_size'],
            'lambda': row['lambda'],
            'topK': row['topK'][-1],
            'learning_rate': row['learning_rate'],
            'epoch': epoch,
            'negative_sampling_size': row['negative_sampling_size'],
        }

        for name in result.keys():
            result_dict[name] = round(result[name][0], 4)
        output_df = output_df.append(result_dict, ignore_index=True)

        model.sess.close()
        tf.reset_default_graph()

        save_dataframe_csv(output_df, table_path, save_path)

    return output_df
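
The try/except around load_dataframe_csv above implements a resume pattern: reload any partially written results table and keep appending, saving after every row so an interrupted run loses at most one configuration. A standalone sketch of the same idea (file name and grid hypothetical):

import os
import pandas as pd

RESULTS = 'results.csv'  # hypothetical output file

# Resume from a partial results file if one exists, otherwise start fresh.
if os.path.exists(RESULTS):
    output_df = pd.read_csv(RESULTS)
else:
    output_df = pd.DataFrame(columns=['model', 'rank', 'NDCG'])

for rank in [10, 50, 200]:  # hypothetical search grid
    row = {'model': 'demo', 'rank': rank, 'NDCG': 0.0}
    output_df = pd.concat([output_df, pd.DataFrame([row])],
                          ignore_index=True)
    output_df.to_csv(RESULTS, index=False)  # checkpoint after every row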
Exemplo n.º 18
0
def hyper_parameter_tuning(train, validation, params, save_path, gpu_on=True):
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']

    try:
        df = load_dataframe_csv(table_path, save_path)
    except FileNotFoundError:  # no previous results; start a fresh table
        df = pd.DataFrame(
            columns=['model', 'rank', 'lambda', 'epoch', 'corruption', 'topK'])

    num_user = train.shape[0]

    for algorithm in params['models']:

        for rank in params['rank']:

            for lamb in params['lambda']:

                for corruption in params['corruption']:

                    if ((df['model'] == algorithm) & (df['rank'] == rank) &
                        (df['lambda'] == lamb) &
                        (df['corruption'] == corruption)).any():
                        continue

                    format = "model: {}, rank: {}, lambda: {}, corruption: {}"
                    progress.section(
                        format.format(algorithm, rank, lamb, corruption))
                    RQ, Yt, Bias = params['models'][algorithm](
                        train,
                        epoch=params['epoch'],
                        lamb=lamb,
                        rank=rank,
                        corruption=corruption)
                    Y = Yt.T

                    progress.subsection("Prediction")

                    prediction = predict(matrix_U=RQ,
                                         matrix_V=Y,
                                         bias=Bias,
                                         topK=params['topK'][-1],
                                         matrix_Train=train,
                                         gpu=gpu_on)

                    progress.subsection("Evaluation")

                    result = evaluate(prediction, validation, params['metric'],
                                      params['topK'])

                    result_dict = {
                        'model': algorithm,
                        'rank': rank,
                        'lambda': lamb,
                        'epoch': params['epoch'],
                        'corruption': corruption
                    }

                    for name in result.keys():
                        result_dict[name] = [
                            round(result[name][0], 4),
                            round(result[name][1], 4)
                        ]

                    df = df.append(result_dict, ignore_index=True)

                    save_dataframe_csv(df, table_path, save_path)
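
The guard at the top of the grid loop above skips configurations that are already present in the reloaded table by AND-ing per-column boolean masks and calling .any(). The same check in isolation (columns mirror the frame above; sample values hypothetical):

import pandas as pd

df = pd.DataFrame([{'model': 'AutoRec', 'rank': 100,
                    'lambda': 0.01, 'corruption': 0.2}])

def already_done(df, algorithm, rank, lamb, corruption):
    # True if this exact configuration is already recorded.
    mask = ((df['model'] == algorithm) & (df['rank'] == rank) &
            (df['lambda'] == lamb) & (df['corruption'] == corruption))
    return mask.any()

print(already_done(df, 'AutoRec', 100, 0.01, 0.2))  # True -> skip
print(already_done(df, 'AutoRec', 200, 0.01, 0.2))  # False -> evaluate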
Exemplo n.º 19
0
def critiquing(num_users, num_items, user_col, item_col, rating_col,
               keyphrase_vector_col, df_train, keyphrase_names, params,
               num_users_sampled, load_path, save_path):
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']
    df = pd.read_csv(table_path + load_path)

    dfs_fmap = []

    for index, row in df.iterrows():

        if row['model'] not in critiquing_models:
            continue

        algorithm = row['model']
        rank = row['rank']
        num_layers = row['num_layers']
        train_batch_size = row['train_batch_size']
        predict_batch_size = row['predict_batch_size']
        lamb = row['lambda']
        learning_rate = row['learning_rate']
        epoch = 200
        negative_sampling_size = 1

        format = "model: {0}, rank: {1}, num_layers: {2}, train_batch_size: {3}, " \
                 "predict_batch_size: {4}, lambda: {5}, learning_rate: {6}, epoch: {7}, negative_sampling_size: {8}"
        progress.section(
            format.format(algorithm, rank, num_layers, train_batch_size,
                          predict_batch_size, lamb, learning_rate, epoch,
                          negative_sampling_size))

        progress.subsection("Initializing Negative Sampler")

        negative_sampler = Negative_Sampler(
            df_train[[user_col, item_col, keyphrase_vector_col]],
            user_col,
            item_col,
            rating_col,
            keyphrase_vector_col,
            num_items=num_items,
            batch_size=train_batch_size,
            num_keyphrases=len(keyphrase_names),
            negative_sampling_size=negative_sampling_size)

        model = critiquing_models[algorithm](num_users=num_users,
                                             num_items=num_items,
                                             text_dim=len(keyphrase_names),
                                             embed_dim=rank,
                                             num_layers=num_layers,
                                             negative_sampler=negative_sampler,
                                             lamb=lamb,
                                             learning_rate=learning_rate)

        pretrained_path = load_yaml('config/global.yml',
                                    key='path')['pretrained']
        try:
            model.load_model(pretrained_path + params['model_saved_path'],
                             row['model'])
        except Exception:  # no usable pretrained model; train from scratch
            model.train_model(df_train,
                              user_col,
                              item_col,
                              rating_col,
                              epoch=epoch)
            model.save_model(pretrained_path + params['model_saved_path'],
                             row['model'])

        df_fmap = critiquing_evaluation(model,
                                        algorithm,
                                        num_users,
                                        num_items,
                                        num_users_sampled,
                                        topk=[5, 10, 20])

        dfs_fmap.append(df_fmap)

        model.sess.close()
        tf.reset_default_graph()

    df_output_fmap = pd.concat(dfs_fmap)

    save_dataframe_csv(df_output_fmap,
                       table_path,
                       name=save_path + '_FMAP.csv')
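
The model.sess.close() / tf.reset_default_graph() pair above is the TensorFlow 1.x idiom for isolating models that are built inside a loop: closing the session releases its resources, and resetting the default graph keeps nodes from accumulating across iterations. A minimal self-contained sketch, assuming the TF1 API that those calls imply:

import tensorflow as tf  # TensorFlow 1.x API, as assumed above

def build_and_run():
    x = tf.placeholder(tf.float32, shape=[None, 4])
    w = tf.Variable(tf.zeros([4, 1]))
    y = tf.matmul(x, w)
    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(y, feed_dict={x: [[0.0, 0.0, 0.0, 0.0]]})
    sess.close()  # release this model's resources

for _ in range(3):
    build_and_run()
    tf.reset_default_graph()  # drop stale nodes before the next model
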
def main(args):
    # Progress bar
    progress = WorkSplitter()

    # Show hyperparameter settings
    progress.section("Parameter Setting")
    print("Data Directory: {}".format(args.data_dir))
    print("Number of Users Sampled: {}".format(args.num_users_sampled))
    print("Number of Items Sampled: {}".format(args.num_items_sampled))
    print("Number of Max Allowed Iterations: {}".format(
        args.max_iteration_threshold))
    print("Critiquing Model: {}".format(args.critiquing_model_name))

    R_train = load_numpy(path=args.data_dir, name=args.train_set)
    print("Train U-I Dimensions: {}".format(R_train.shape))

    R_test = load_numpy(path=args.data_dir, name=args.test_set)
    print("Test U-I Dimensions: {}".format(R_test.shape))

    R_train_keyphrase = load_numpy(path=args.data_dir,
                                   name=args.train_keyphrase_set).toarray()
    print("Train Item Keyphrase U-I Dimensions: {}".format(
        R_train_keyphrase.shape))

    R_train_item_keyphrase = load_numpy(
        path=args.data_dir, name=args.train_item_keyphrase_set).toarray()

    table_path = load_yaml('config/global.yml', key='path')['tables']
    parameters = find_best_hyperparameters(table_path + args.dataset_name,
                                           'NDCG')
    parameters_row = parameters.loc[parameters['model'] == args.model]

    lambs = [
        0.001, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 30, 50, 70, 90, 100, 200, 500,
        1000, 10000, 100000
    ]
    topks = [10, 20, 50, 100]

    if args.dataset_name == "yelp/":
        R_train_item_keyphrase = R_train_item_keyphrase.T

    for topk in topks:
        for lamb in lambs:
            results = critiquing(
                matrix_Train=R_train,
                matrix_Test=R_test,
                keyphrase_freq=R_train_keyphrase,
                item_keyphrase_freq=R_train_item_keyphrase,
                num_users_sampled=args.num_users_sampled,
                num_items_sampled=args.num_items_sampled,
                max_iteration_threshold=args.max_iteration_threshold,
                dataset_name=args.dataset_name,
                model=models[args.model],
                parameters_row=parameters_row,
                critiquing_model_name=args.critiquing_model_name,
                keyphrase_selection_method=args.keyphrase_selection_method,
                topk=topk,
                lamb=lamb)
            table_path = load_yaml('config/global.yml', key='path')['tables']
            topk_path = "topk_" + str(topk) + "/"
            save_name = args.save_path + topk_path + "tuning_at_lamb_" + str(
                lamb) + "_with_" + args.keyphrase_selection_method + ".csv"
            save_dataframe_csv(results, table_path, save_name)
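
The save name above splices a per-topk subdirectory into the path by string concatenation, so save_dataframe_csv presumably needs that directory to exist beforehand. A sketch of building the same layout defensively (base path and values hypothetical):

import os

table_path = 'tables/'                 # assumed base directory
save_path = 'yelp/critiquing/'         # hypothetical experiment prefix
topk, lamb, method = 10, 0.5, 'diff'   # hypothetical settings

topk_dir = os.path.join(table_path, save_path, 'topk_{}'.format(topk))
os.makedirs(topk_dir, exist_ok=True)   # create topk_10/ etc. if missing
save_name = os.path.join(
    topk_dir, 'tuning_at_lamb_{}_with_{}.csv'.format(lamb, method))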
Exemplo n.º 21
0
def hyper_parameter_tuning(train, validation, keyphrase_train, keyphrase_validation, params, save_path, tune_explanation=False):
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']

    try:
        df = load_dataframe_csv(table_path, save_path)
    except FileNotFoundError:  # no previous results; start a fresh table
        df = pd.DataFrame(columns=['model', 'rank', 'beta', 'lambda_l2', 'lambda_keyphrase', 'lambda_latent', 'lambda_rating', 'topK', 'learning_rate', 'epoch', 'corruption', 'optimizer'])

    for algorithm in params['models']:

        for rank in params['rank']:

            for beta in params['beta']:

                for lamb_l2 in params['lambda_l2']:

                    for lamb_keyphrase in params['lambda_keyphrase']:

                        for lamb_latent in params['lambda_latent']:

                            for lamb_rating in params['lambda_rating']:

                                for learning_rate in params['learning_rate']:

                                    for epoch in params['epoch']:

                                        for corruption in params['corruption']:

                                            for optimizer in params['optimizer']:

                                                if ((df['model'] == algorithm) &
                                                    (df['rank'] == rank) &
                                                    (df['beta'] == beta) &
                                                    (df['lambda_l2'] == lamb_l2) &
                                                    (df['lambda_keyphrase'] == lamb_keyphrase) &
                                                    (df['lambda_latent'] == lamb_latent) &
                                                    (df['lambda_rating'] == lamb_rating) &
                                                    (df['learning_rate'] == learning_rate) &
                                                    (df['epoch'] == epoch) &
                                                    (df['corruption'] == corruption) &
                                                    (df['optimizer'] == optimizer)).any() or (lamb_latent != lamb_keyphrase):
                                                    continue

                                                format = "model: {}, rank: {}, beta: {}, lambda_l2: {}, " \
                                                    "lambda_keyphrase: {}, lambda_latent: {}, lambda_rating: {}, " \
                                                    "learning_rate: {}, epoch: {}, corruption: {}, optimizer: {}"
                                                progress.section(format.format(algorithm,
                                                                               rank,
                                                                               beta,
                                                                               lamb_l2,
                                                                               lamb_keyphrase,
                                                                               lamb_latent,
                                                                               lamb_rating,
                                                                               learning_rate,
                                                                               epoch,
                                                                               corruption,
                                                                               optimizer))

                                                progress.subsection("Training")

                                                model = models[algorithm](matrix_train=train,
                                                                          epoch=epoch,
                                                                          lamb_l2=lamb_l2,
                                                                          lamb_keyphrase=lamb_keyphrase,
                                                                          lamb_latent=lamb_latent,
                                                                          lamb_rating=lamb_rating,
                                                                          beta=beta,
                                                                          learning_rate=learning_rate,
                                                                          rank=rank,
                                                                          corruption=corruption,
                                                                          optimizer=optimizer,
                                                                          matrix_train_keyphrase=keyphrase_train)

                                                progress.subsection("Prediction")

                                                rating_score, keyphrase_score = model.predict(train.todense())

                                                progress.subsection("Evaluation")

                                                if tune_explanation:
                                                    prediction = predict_keyphrase(keyphrase_score,
                                                                                   topK=params['topK'][-1])

                                                    result = evaluate(prediction,
                                                                      keyphrase_validation,
                                                                      params['metric'],
                                                                      params['topK'])
                                                else:
                                                    prediction = predict(rating_score,
                                                                         topK=params['topK'][-1],
                                                                         matrix_Train=train)

                                                    result = evaluate(prediction,
                                                                      validation,
                                                                      params['metric'],
                                                                      params['topK'])

                                                result_dict = {'model': algorithm,
                                                               'rank': rank,
                                                               'beta': beta,
                                                               'lambda_l2': lamb_l2,
                                                               'lambda_keyphrase': lamb_keyphrase,
                                                               'lambda_latent': lamb_latent,
                                                               'lambda_rating': lamb_rating,
                                                               'learning_rate': learning_rate,
                                                               'epoch': epoch,
                                                               'corruption': corruption,
                                                               'optimizer': optimizer}

                                                for name in result.keys():
                                                    result_dict[name] = [round(result[name][0], 4),
                                                                         round(result[name][1], 4)]

                                                df = df.append(result_dict, ignore_index=True)

                                                model.sess.close()
                                                tf.reset_default_graph()

                                                save_dataframe_csv(df, table_path, save_path)
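
The ten nested loops above enumerate a full Cartesian grid over the search space. The same enumeration can be flattened with itertools.product; a sketch assuming the params dict layout used above (grid values hypothetical):

import itertools

params = {  # miniature stand-in grid with the same keys as above
    'models': ['demo'], 'rank': [50, 100], 'beta': [0.5],
    'lambda_l2': [0.01], 'lambda_keyphrase': [1.0], 'lambda_latent': [1.0],
    'lambda_rating': [1.0], 'learning_rate': [1e-4], 'epoch': [300],
    'corruption': [0.2], 'optimizer': ['Adam'],
}
keys = ['models', 'rank', 'beta', 'lambda_l2', 'lambda_keyphrase',
        'lambda_latent', 'lambda_rating', 'learning_rate', 'epoch',
        'corruption', 'optimizer']

combos = itertools.product(*(params[k] for k in keys))
for algorithm, rank, beta, lamb_l2, lamb_keyphrase, lamb_latent, \
        lamb_rating, learning_rate, epoch, corruption, optimizer in combos:
    if lamb_latent != lamb_keyphrase:  # same coupling constraint as above
        continue
    print(algorithm, rank)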
Exemplo n.º 22
0
def general(train, test, keyphrase_train, keyphrase_test, params, save_path, final_explanation=False):
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']
    df = find_best_hyperparameters(table_path + params['tuning_result_path'], 'NDCG')

    try:
        output_df = load_dataframe_csv(table_path, save_path)
    except FileNotFoundError:  # no previous results; start a fresh table
        output_df = pd.DataFrame(columns=['model', 'rank', 'beta', 'lambda_l2', 'lambda_keyphrase', 'lambda_latent', 'lambda_rating', 'topK', 'learning_rate', 'epoch', 'corruption', 'optimizer'])

    for index, row in df.iterrows():

        algorithm = row['model']
        rank = row['rank']
        beta = row['beta']
        lamb_l2 = row['lambda_l2']
        lamb_keyphrase = row['lambda_keyphrase']
        lamb_latent = row['lambda_latent']
        lamb_rating = row['lambda_rating']
        learning_rate = row['learning_rate']
        epoch = row['epoch']
        corruption = row['corruption']
        optimizer = row['optimizer']

        row['topK'] = [5, 10, 15, 20, 50]
        row['metric'] = ['R-Precision', 'NDCG', 'Clicks', 'Recall', 'Precision', 'MAP']

        format = "model: {}, rank: {}, beta: {}, lambda_l2: {}, lambda_keyphrase: {}, " \
                 "lambda_latent: {}, lambda_rating: {}, learning_rate: {}, " \
                 "epoch: {}, corruption: {}, optimizer: {}"

        progress.section(format.format(algorithm, rank, beta, lamb_l2, lamb_keyphrase, lamb_latent, lamb_rating, learning_rate, epoch, corruption, optimizer))

        progress.subsection("Training")

        model = models[algorithm](matrix_train=train,
                                  epoch=epoch,
                                  lamb_l2=lamb_l2,
                                  lamb_keyphrase=lamb_keyphrase,
                                  lamb_latent=lamb_latent,
                                  lamb_rating=lamb_rating,
                                  beta=beta,
                                  learning_rate=learning_rate,
                                  rank=rank,
                                  corruption=corruption,
                                  optimizer=optimizer,
                                  matrix_train_keyphrase=keyphrase_train)

        progress.subsection("Prediction")

        rating_score, keyphrase_score = model.predict(train.todense())

        progress.subsection("Evaluation")

        if final_explanation:
            prediction = predict_keyphrase(keyphrase_score,
                                           topK=row['topK'][-2])

            result = evaluate_explanation(prediction,
                                          keyphrase_test,
                                          row['metric'],
                                          row['topK'])
        else:
            prediction = predict(rating_score,
                                 topK=row['topK'][-1],
                                 matrix_Train=train)

            result = evaluate(prediction, test, row['metric'], row['topK'])

        result_dict = {'model': algorithm,
                       'rank': rank,
                       'beta': beta,
                       'lambda_l2': lamb_l2,
                       'lambda_keyphrase': lamb_keyphrase,
                       'lambda_latent': lamb_latent,
                       'lambda_rating': lamb_rating,
                       'learning_rate': learning_rate,
                       'epoch': epoch,
                       'corruption': corruption,
                       'optimizer': optimizer}

        for name in result.keys():
            result_dict[name] = [round(result[name][0], 4),
                                 round(result[name][1], 4)]

        output_df = output_df.append(result_dict, ignore_index=True)

        model.sess.close()
        tf.reset_default_graph()

        save_dataframe_csv(output_df, table_path, save_path)

    return output_df
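
The metric loop above stores a rounded [value, confidence] pair per metric. The same step as a comprehension, shown in isolation (a sketch; assumes each entry of result is an indexable pair, as the indexing above implies):

result = {'NDCG': (0.31419, 0.00272), 'MAP': (0.12348, 0.00101)}  # hypothetical
result_dict = {name: [round(pair[0], 4), round(pair[1], 4)]
               for name, pair in result.items()}
# e.g. result_dict['NDCG'] == [0.3142, 0.0027]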
def critiquing(train_set, keyphrase_train_set, item_keyphrase_train_set,
               params, num_users_sampled, load_path, save_path,
               critiquing_function):
    progress = WorkSplitter()
    table_path = load_yaml('config/global.yml', key='path')['tables']
    df = pd.read_csv(table_path + load_path)

    dfs_fmap = []

    for index, row in df.iterrows():

        if row['model'] not in critiquing_models:
            continue

        algorithm = row['model']
        rank = row['rank']
        beta = row['beta']
        lamb_l2 = row['lambda_l2']
        lamb_keyphrase = row['lambda_keyphrase']
        lamb_latent = row['lambda_latent']
        lamb_rating = row['lambda_rating']
        learning_rate = row['learning_rate']
        epoch = row['epoch']
        corruption = row['corruption']
        optimizer = row['optimizer']

        format = "model: {}, rank: {}, beta: {}, lambda_l2: {}, lambda_keyphrase: {}, " \
                 "lambda_latent: {}, lambda_rating: {}, learning_rate: {}, " \
                 "epoch: {}, corruption: {}, optimizer: {}"
        progress.section(
            format.format(algorithm, rank, beta, lamb_l2, lamb_keyphrase,
                          lamb_latent, lamb_rating, learning_rate, epoch,
                          corruption, optimizer))

        progress.subsection("Training")

        model = critiquing_models[algorithm](
            matrix_train=train_set,
            epoch=epoch,
            lamb_l2=lamb_l2,
            lamb_keyphrase=lamb_keyphrase,
            lamb_latent=lamb_latent,
            lamb_rating=lamb_rating,
            beta=beta,
            learning_rate=learning_rate,
            rank=rank,
            corruption=corruption,
            optimizer=optimizer,
            matrix_train_keyphrase=keyphrase_train_set)

        num_users, num_items = train_set.shape
        df_fmap = critiquing_evaluation(train_set,
                                        keyphrase_train_set,
                                        item_keyphrase_train_set,
                                        model,
                                        algorithm,
                                        num_users,
                                        num_items,
                                        num_users_sampled,
                                        critiquing_function,
                                        topk=[5, 10, 20])

        df_fmap['model'] = algorithm
        df_fmap['rank'] = rank
        df_fmap['beta'] = beta
        df_fmap['lambda_l2'] = lamb_l2
        df_fmap['lambda_keyphrase'] = lamb_keyphrase
        df_fmap['lambda_latent'] = lamb_latent
        df_fmap['lambda_rating'] = lamb_rating
        df_fmap['learning_rate'] = learning_rate
        df_fmap['epoch'] = epoch
        df_fmap['corruption'] = corruption
        df_fmap['optimizer'] = optimizer

        dfs_fmap.append(df_fmap)

        model.sess.close()
        tf.reset_default_graph()

    df_output_fmap = pd.concat(dfs_fmap)

    save_dataframe_csv(df_output_fmap,
                       table_path,
                       name=save_path + '_FMAP.csv')
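
The same eleven-field unpacking of row recurs in the tuning, evaluation, and critiquing functions above. A sketch of factoring it into a helper (keys copied from the columns above; the helper name is hypothetical):

HYPER_KEYS = ['model', 'rank', 'beta', 'lambda_l2', 'lambda_keyphrase',
              'lambda_latent', 'lambda_rating', 'learning_rate',
              'epoch', 'corruption', 'optimizer']

def unpack_hypers(row):
    # Works for a pandas Series or a plain dict.
    return {key: row[key] for key in HYPER_KEYS}

hypers = unpack_hypers({key: 0 for key in HYPER_KEYS})  # toy usage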
Exemplo n.º 24
0
def hyper_parameter_tuning(train, validation, params, unif_train, save_path, seed, way, dataset, gpu_on):
    progress = WorkSplitter()

    table_path = 'tables/'
    data_name = save_path.split('/')[0]
    save_dir = 'tables/' + data_name + '/'
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)

    for algorithm in params['models']:
        if algorithm in ['BiasedMF', 'PropensityMF']:
            df = pd.DataFrame(columns=['model', 'batch_size', 'lambda', 'iter'])
            for batch_size in params['batch_size']:
                for lam in params['lambda']:
                    format = "model: {0}, batch_size: {1}, lambda: {2}"
                    progress.section(format.format(algorithm, batch_size, lam))
                    RQ, Y, uBias, iBias = params['models'][algorithm](train, validation,
                                                                      matrix_unif_train=unif_train,
                                                                      iteration=params['iter'],
                                                                      rank=params['rank'], gpu_on=gpu_on,
                                                                      lam=lam, seed=seed,
                                                                      batch_size=batch_size,
                                                                      way=way,
                                                                      dataset=dataset)

                    progress.subsection("Prediction")
                    prediction = predict(matrix_U=RQ, matrix_V=Y, matrix_Valid=validation, ubias=uBias, ibias=iBias,
                                         gpu=gpu_on)

                    progress.subsection("Evaluation")
                    result = evaluate(prediction, validation, params['metric'], gpu=gpu_on)
                    result_dict = {'model': algorithm, 'batch_size': batch_size, 'lambda': lam, 'iter': params['iter']}
                    for name in result.keys():
                        result_dict[name] = round(result[name][0], 8)
                    df = df.append(result_dict, ignore_index=True)
                    save_dataframe_csv(df, table_path, save_path)
        elif algorithm in ['InitFeatureEmbedMF', 'AlterFeatureEmbedMF', 'WRSampleMF']:
            df = pd.DataFrame(columns=['model', 'lambda', 'iter'])
            for lam in params['lambda']:
                format = "model: {0}, lambda: {1}"
                progress.section(format.format(algorithm, lam))
                RQ, Y, uBias, iBias = params['models'][algorithm](train, validation,
                                                                  matrix_unif_train=unif_train,
                                                                  iteration=params['iter'],
                                                                  rank=params['rank'],
                                                                  gpu_on=gpu_on,
                                                                  lam=lam, seed=seed,
                                                                  batch_size=params['batch_size'],
                                                                  way=way,
                                                                  dataset=dataset)

                progress.subsection("Prediction")
                prediction = predict(matrix_U=RQ, matrix_V=Y, matrix_Valid=validation, ubias=uBias, ibias=iBias,
                                     gpu=gpu_on)

                progress.subsection("Evaluation")
                result = evaluate(prediction, validation, params['metric'], gpu=gpu_on)
                result_dict = {'model': algorithm, 'lambda': lam, 'iter': params['iter']}
                for name in result.keys():
                    result_dict[name] = round(result[name][0], 8)
                df = df.append(result_dict, ignore_index=True)
                save_dataframe_csv(df, table_path, save_path)
        elif algorithm in ['CausalSampleMF', 'BridgeLabelMF']:
            df = pd.DataFrame(columns=['model', 'lambda', 'lambda2', 'iter'])
            for lam in params['lambda']:
                for lam2 in params['lambda2']:
                    format = "model: {0}, lambda: {1}, lambda2: {2}"
                    progress.section(format.format(algorithm, lam, lam2))
                    RQ, Y, uBias, iBias = params['models'][algorithm](train, validation,
                                                                      matrix_unif_train=unif_train,
                                                                      iteration=params['iter'],
                                                                      rank=params['rank'],
                                                                      gpu_on=gpu_on,
                                                                      lam=lam, lam2=lam2,
                                                                      seed=seed,
                                                                      batch_size=params['batch_size'],
                                                                      way=way,
                                                                      dataset=dataset)

                    progress.subsection("Prediction")
                    prediction = predict(matrix_U=RQ, matrix_V=Y, matrix_Valid=validation, ubias=uBias, ibias=iBias,
                                         gpu=gpu_on)

                    progress.subsection("Evaluation")
                    result = evaluate(prediction, validation, params['metric'], gpu=gpu_on)
                    result_dict = {'model': algorithm, 'lambda': lam, 'lambda2': lam2, 'iter': params['iter']}
                    for name in result.keys():
                        result_dict[name] = round(result[name][0], 8)
                    df = df.append(result_dict, ignore_index=True)
                    save_dataframe_csv(df, table_path, save_path)
        elif algorithm in ['UnionSampleMF', 'RefineLabelMF']:
            df = pd.DataFrame(columns=['model', 'confidence', 'iter'])
            for conf in params['confidence']:
                format = "model: {0}, confidence: {1}"
                progress.section(format.format(algorithm, conf))
                RQ, Y, uBias, iBias = params['models'][algorithm](train, validation,
                                                                  matrix_unif_train=unif_train,
                                                                  iteration=params['iter'],
                                                                  rank=params['rank'],
                                                                  gpu_on=gpu_on,
                                                                  lam=params['lambda'], seed=seed,
                                                                  batch_size=params['batch_size'],
                                                                  way=way,
                                                                  confidence=conf,
                                                                  dataset=dataset)

                progress.subsection("Prediction")
                prediction = predict(matrix_U=RQ, matrix_V=Y, matrix_Valid=validation, ubias=uBias, ibias=iBias,
                                     gpu=gpu_on)

                progress.subsection("Evaluation")
                result = evaluate(prediction, validation, params['metric'], gpu=gpu_on)
                result_dict = {'model': algorithm, 'confidence': conf, 'iter': params['iter']}
                for name in result.keys():
                    result_dict[name] = round(result[name][0], 8)
                df = df.append(result_dict, ignore_index=True)
                save_dataframe_csv(df, table_path, save_path)
        elif algorithm in ['BatchSampleMF']:
            df = pd.DataFrame(columns=['model', 'step', 'iter'])
            for step in params['step']:
                format = "model: {0}, step: {1}"
                progress.section(format.format(algorithm, step))
                RQ, Y, uBias, iBias = params['models'][algorithm](train, validation,
                                                                  matrix_unif_train=unif_train,
                                                                  iteration=params['iter'],
                                                                  rank=params['rank'],
                                                                  gpu_on=gpu_on,
                                                                  lam=params['lambda'], seed=seed,
                                                                  batch_size=params['batch_size'],
                                                                  way=way,
                                                                  step=step,
                                                                  dataset=dataset)

                progress.subsection("Prediction")
                prediction = predict(matrix_U=RQ, matrix_V=Y, matrix_Valid=validation, ubias=uBias, ibias=iBias,
                                     gpu=gpu_on)

                progress.subsection("Evaluation")
                result = evaluate(prediction, validation, params['metric'], gpu=gpu_on)
                result_dict = {'model': algorithm, 'step': step, 'iter': params['iter']}
                for name in result.keys():
                    result_dict[name] = round(result[name][0], 8)
                df = df.append(result_dict, ignore_index=True)
                save_dataframe_csv(df, table_path, save_path)
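
The elif chain above groups the MF variants into families that share a search grid. A sketch of the same grouping as a lookup table (family members copied from the branches above; the grid labels are informal):

GRID_OF = {}
for members, grid in [
        (('BiasedMF', 'PropensityMF'), 'batch_size x lambda'),
        (('InitFeatureEmbedMF', 'AlterFeatureEmbedMF', 'WRSampleMF'),
         'lambda'),
        (('CausalSampleMF', 'BridgeLabelMF'), 'lambda x lambda2'),
        (('UnionSampleMF', 'RefineLabelMF'), 'confidence'),
        (('BatchSampleMF',), 'step'),
]:
    for model_name in members:
        GRID_OF[model_name] = grid

print(GRID_OF['BridgeLabelMF'])  # 'lambda x lambda2'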
Exemplo n.º 25
0
def multiple_run_tune(defaul_params, tune_params, save_path):
    # Set up data stream
    start = time.time()
    print('Setting up data stream')
    data_continuum = continuum(defaul_params.data, defaul_params.cl_type,
                               defaul_params)
    data_end = time.time()
    print('data setup time: {}'.format(data_end - start))

    # Set up the results table
    table_path = load_yaml('config/global.yml', key='path')['tables']
    metric_list = ['Avg_End_Acc'] + ['Avg_End_Fgt'] + ['Time'] + [
        "Batch" + str(i)
        for i in range(defaul_params.num_val, data_continuum.task_nums)
    ]
    param_list = list(tune_params.keys()) + metric_list
    table_columns = ['Run'] + param_list
    table_path = table_path + defaul_params.data
    os.makedirs(table_path, exist_ok=True)
    if not save_path:
        save_path = defaul_params.model_name + '_' + defaul_params.data_name + '.csv'
    df = pd.DataFrame(columns=table_columns)
    # Accumulators for per-run results
    accuracy_list = []
    params_keep = []
    for run in range(defaul_params.num_runs):
        tmp_acc = []
        tune_data = []
        run_start = time.time()
        data_continuum.new_run()
        # prepare val data loader
        test_loaders = setup_test_loader(data_continuum.test_data(),
                                         defaul_params)
        tune_test_loaders = test_loaders[:defaul_params.num_val]
        test_loaders = test_loaders[defaul_params.num_val:]
        for i, (x_train, y_train, labels) in enumerate(data_continuum):
            if i < defaul_params.num_val:
                # Collect tuning data
                tune_data.append((x_train, y_train, labels))
                if len(tune_data) == defaul_params.num_val:
                    # tune
                    best_params = tune_hyper(tune_data, tune_test_loaders,
                                             defaul_params, tune_params)
                    params_keep.append(best_params)
                    final_params = vars(defaul_params)
                    final_params.update(best_params)
                    final_params = SimpleNamespace(**final_params)
                    # set up
                    print('Tuning is done. Best hyperparameter set is {}'.
                          format(best_params))
                    model = setup_architecture(final_params)
                    model = maybe_cuda(model, final_params.cuda)
                    opt = setup_opt(final_params.optimizer, model,
                                    final_params.learning_rate,
                                    final_params.weight_decay)
                    agent = agents[final_params.agent](model, opt,
                                                       final_params)
                    print('Training Start')
            else:
                print("----------run {} training batch {}-------------".format(
                    run, i))
                print('size: {}, {}'.format(x_train.shape, y_train.shape))
                agent.train_learner(x_train, y_train)
                acc_array = agent.evaluate(test_loaders)
                tmp_acc.append(acc_array)

        run_end = time.time()
        print(
            "-----------run {}-----------avg_end_acc {}-----------train time {}"
            .format(run, np.mean(tmp_acc[-1]), run_end - run_start))
        accuracy_list.append(np.array(tmp_acc))

        # Store this run's result
        result_dict = {'Run': run}
        result_dict.update(best_params)
        end_task_acc = tmp_acc[-1]
        for i in range(data_continuum.task_nums - defaul_params.num_val):
            result_dict["Batch" +
                        str(i + defaul_params.num_val)] = end_task_acc[i]
        result_dict['Avg_End_Acc'] = np.mean(tmp_acc[-1])
        result_dict['Avg_End_Fgt'] = single_run_avg_end_fgt(np.array(tmp_acc))
        result_dict['Time'] = run_end - run_start
        df = df.append(result_dict, ignore_index=True)
        save_dataframe_csv(df, table_path, save_path)
    accuracy_list = np.array(accuracy_list)
    avg_end_acc, avg_end_fgt, avg_acc, avg_bwtp, avg_fwt = compute_performance(
        accuracy_list)
    end = time.time()
    final_result = {'Run': 'Final Result'}
    final_result['Avg_End_Acc'] = avg_end_acc
    final_result['Avg_End_Fgt'] = avg_end_fgt
    final_result['Time'] = end - start
    df = df.append(final_result, ignore_index=True)
    save_dataframe_csv(df, table_path, save_path)
    print('----------- Total {} run: {}s -----------'.format(
        defaul_params.num_runs, end - start))
    print(
        '----------- Avg_End_Acc {} Avg_End_Fgt {} Avg_Acc {} Avg_Bwtp {} Avg_Fwt {}-----------'
        .format(avg_end_acc, avg_end_fgt, avg_acc, avg_bwtp, avg_fwt))
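
The run loop above holds out the first num_val task batches for hyperparameter tuning and streams the remainder into training, slicing the test loaders the same way. A minimal sketch of that split (values hypothetical):

num_val = 2
task_stream = ['batch{}'.format(i) for i in range(7)]  # stand-in task batches
tune_batches = task_stream[:num_val]   # used only to pick hyperparameters
train_batches = task_stream[num_val:]  # used for continual training
assert tune_batches == ['batch0', 'batch1']
assert len(train_batches) == 5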
Exemplo n.º 26
0
def main(args):
    progress = WorkSplitter()

    progress.section("Parameter Setting")
    print("Data Path: {}".format(args.data_dir))
    reviewJsonToronto = args.data_dir + args.data_name

    progress.section("Load data")
    df = get_yelp_df(path='', filename=reviewJsonToronto, sampling=True)
    print('Data loaded successfully')

    progress.section("Matrix Generation")
    rating_matrix, timestamp_matrix, I_C_matrix, IC_dictionary = get_rating_timestamp_matrix(
        df)
    # get ratingWuserAvg_matrix
    rating_array = rating_matrix.toarray()
    user_average_array = rating_array.sum(axis=1) / np.count_nonzero(
        rating_array, axis=1)
    init_UI = np.zeros(rating_array.shape)
    init_UI[rating_array.nonzero()] = 1

    # Create the ratings-minus-user-average array (average offset by 0.001)
    for i in range(user_average_array.shape[0]):
        init_UI[i] = init_UI[i] * (user_average_array[i] - 0.001)
    user_average_array = init_UI
    ratingWuserAvg_array = rating_array - user_average_array
    ratingWuserAvg_matrix = sparse.csr_matrix(ratingWuserAvg_array)

    progress.section("Split for training")
    (rtrain_implicit, rvalid_implicit, rtest_implicit,
     rtrain_userAvg_implicit, rvalid_userAvg_implicit,
     rtest_userAvg_implicit, nonzero_index, rtime,
     item_idx_matrix_train_implicit, item_idx_matrix_valid_implicit,
     item_idx_matrix_test_implicit) = time_ordered_splitModified(
         rating_matrix=rating_matrix,
         ratingWuserAvg_matrix=ratingWuserAvg_matrix,
         timestamp_matrix=timestamp_matrix,
         ratio=[0.5, 0.2, 0.3],
         implicit=True,
         remove_empty=False,
         threshold=3,
         sampling=False,
         sampling_ratio=0.1,
         trainSampling=0.95)

    (rtrain, rvalid, rtest, rtrain_userAvg, rvalid_userAvg, rtest_userAvg,
     nonzero_index, rtime, item_idx_matrix_train, item_idx_matrix_valid,
     item_idx_matrix_test) = time_ordered_splitModified(
         rating_matrix=rating_matrix,
         ratingWuserAvg_matrix=ratingWuserAvg_matrix,
         timestamp_matrix=timestamp_matrix,
         ratio=[0.5, 0.2, 0.3],
         implicit=False,
         remove_empty=False,
         threshold=3,
         sampling=False,
         sampling_ratio=0.1,
         trainSampling=0.95)

    rtrain = rtrain + rvalid + rtest
    rtrain_implicit = rtrain_implicit + rvalid_implicit + rtest_implicit

    progress.section("Get UC Matrix")
    # Get UC matrices
    U_C_matrix_explicit, U_C_matrix_implicit = get_UC_Matrix(
        I_C_matrix, rtrain_implicit)

    progress.section("Get IK Similarity")
    IK_MATRIX = ikGeneration(df)
    IK_similarity = train(IK_MATRIX)
    '''
    progress.section("Get IC Similarity")
    IC_similarity = train(I_C_matrix)
    '''

    progress.section("Get IP, IS, ID Dictionary")
    #intersection = get_intersection()
    (intersection_yonge_and_finch, intersection_bloor_and_bathurst,
     intersection_spadina_and_dundas, intersection_queen_and_spadina,
     intersection_bloor_and_yonge,
     intersection_dundas_and_yonge) = get_intersection()
    IP_df, IP_dictionary = get_IP_matrix_dictionary(df, IK_similarity)
    IS_dictionary = get_IS_dictionary(df)
    #ID_dictionary = get_ID_dictionary(df,list(set(df['business_num_id'])),intersection)
    ID_dictionary_yonge_and_finch = get_ID_dictionary(
        df, list(set(df['business_num_id'])), intersection_yonge_and_finch)
    ID_dictionary_bloor_and_bathurst = get_ID_dictionary(
        df, list(set(df['business_num_id'])), intersection_bloor_and_bathurst)
    ID_dictionary_spadina_and_dundas = get_ID_dictionary(
        df, list(set(df['business_num_id'])), intersection_spadina_and_dundas)
    ID_dictionary_queen_and_spadina = get_ID_dictionary(
        df, list(set(df['business_num_id'])), intersection_queen_and_spadina)
    ID_dictionary_bloor_and_yonge = get_ID_dictionary(
        df, list(set(df['business_num_id'])), intersection_bloor_and_yonge)
    ID_dictionary_dundas_and_yonge = get_ID_dictionary(
        df, list(set(df['business_num_id'])), intersection_dundas_and_yonge)

    progress.section("user item predict")
    user_item_prediction_score = predict(rtrain,
                                         110,
                                         IK_similarity,
                                         item_similarity_en=True)
    UI_Prediction_Matrix = prediction(user_item_prediction_score, rtrain)

    progress.section("Save datafiles csv")
    save_dataframe_csv(df, args.data_dir, "Dataframe")

    progress.section("Save datafiles JSON")
    saveDictToJson(IC_dictionary,
                   args.data_dir,
                   'icDictionary',
                   trainOrTest='train')
    saveDictToJson(IP_dictionary,
                   args.data_dir,
                   'ipDictionary',
                   trainOrTest='train')
    saveDictToJson(IS_dictionary,
                   args.data_dir,
                   'isDictionary',
                   trainOrTest='train')
    #saveDictToJson(ID_dictionary, args.data_dir, 'idDictionary', trainOrTest='train')
    saveDictToJson(ID_dictionary_yonge_and_finch,
                   args.data_dir,
                   'idDictionary_yongefinch',
                   trainOrTest='train')
    saveDictToJson(ID_dictionary_bloor_and_bathurst,
                   args.data_dir,
                   'idDictionary_bloorbathurst',
                   trainOrTest='train')
    saveDictToJson(ID_dictionary_spadina_and_dundas,
                   args.data_dir,
                   'idDictionary_spadinadundas',
                   trainOrTest='train')
    saveDictToJson(ID_dictionary_queen_and_spadina,
                   args.data_dir,
                   'idDictionary_queenspadina',
                   trainOrTest='train')
    saveDictToJson(ID_dictionary_bloor_and_yonge,
                   args.data_dir,
                   'idDictionary_blooryonge',
                   trainOrTest='train')
    saveDictToJson(ID_dictionary_dundas_and_yonge,
                   args.data_dir,
                   'idDictionary_dundasyonge',
                   trainOrTest='train')

    progress.section("Save datafiles Numpy")
    save_numpy_csr(rtrain, args.data_dir, "rtrain")
    save_numpy_csr(I_C_matrix, args.data_dir, "icmatrix")
    #save_numpy(user_item_prediction_score, args.data_dir, "predictionScore")
    save_numpy(IK_similarity, args.data_dir,
               "IKbased_II_similarity")  #Tina requested for this name
    save_numpy(UI_Prediction_Matrix, args.data_dir, "UI_prediction_matrix")
    '''