Example No. 1
    def run_cb_predictions(self):
        for cv in self.content_vector_types:
            content_path = self.directory + self.data_name + '_cv_' + cv + '.pkl'
            content_vect = sl.load_from_hadoop(content_path, self.sc).repartition(self.num_partitions)

            for uv in self.user_vector_types:
                train_ratings_loc = self.directory + self.data_name + '_uv_train_' + uv + '.pkl'
                train_ratings = sl.load_from_hadoop(train_ratings_loc, self.sc).repartition(self.num_partitions)

                for cb_pred in self.cb_predictions:
                    pred_save_loc = self.directory + self.data_name + '_predictions_' + uv + '_' + cv + '_' + cb_pred + '.pkl'
                    print pred_save_loc
                    # Only run algorithms whose predictions have not already been saved
                    if not os.path.isdir(pred_save_loc):
                        print 'Running ' + cb_pred + ' for user vector ' + uv + ' and content vector ' + cv
                        if cb_pred == 'cb_vect':
                            predictions = content_based.predict(train_ratings, content_vect, num_partitions=self.num_partitions)
                            sl.save_to_hadoop(predictions, pred_save_loc)
                        elif cb_pred == 'cb_kmeans_100':
                            predictions = content_based_kmeans.predict(train_ratings, content_vect, num_predictions=100, num_partitions=self.num_partitions)
                            sl.save_to_hadoop(predictions, pred_save_loc)
                        elif cb_pred == 'cb_kmeans_1000':
                            predictions = content_based_kmeans.predict(train_ratings, content_vect, num_predictions=1000, num_partitions=self.num_partitions)
                            sl.save_to_hadoop(predictions, pred_save_loc)
                        else:
                            # Unrecognized prediction type: stop processing this pair
                            break
        print 'All CB predictions saved'
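These examples assume a runner object already configured with the data locations, a Spark context, and the lists of vector and algorithm types to sweep. Below is a minimal sketch of the attributes the snippet above relies on; the attribute names are taken from the code itself, while the concrete values are purely illustrative placeholders:

    # Hypothetical configuration object for the runner used in these examples.
    # Attribute names come from the snippets; the values are placeholders.
    class RunnerConfig(object):
        def __init__(self, sc, sqlCtx):
            self.sc = sc                           # SparkContext for sl.load_from_hadoop
            self.sqlCtx = sqlCtx                   # SQL context for performance_metrics
            self.directory = '/data/'              # where vectors and predictions live
            self.results_directory = '/results/'   # where result files are written
            self.data_name = 'movielens'           # dataset prefix used in every path
            self.num_partitions = 30               # repartition() target for loaded RDDs
            self.user_vector_types = ['ratings']
            self.content_vector_types = ['genre']
            self.cf_predictions = ['cf_mllib', 'cf_item', 'cf_user']
            self.cb_predictions = ['cb_vect', 'cb_kmeans_100', 'cb_kmeans_1000']
            self.results_runs = [100, 1000]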
Example No. 2
    def run_cb_predictions(self):
        for cv in self.content_vector_types:
            content_path = self.directory + self.data_name + "_cv_" + cv + ".pkl"
            content_vect = sl.load_from_hadoop(content_path, self.sc).repartition(self.num_partitions)

            for uv in self.user_vector_types:
                train_ratings_loc = self.directory + self.data_name + "_uv_train_" + uv + ".pkl"
                train_ratings = sl.load_from_hadoop(train_ratings_loc, self.sc).repartition(self.num_partitions)

                for cb_pred in self.cb_predictions:
                    pred_save_loc = (
                        self.directory + self.data_name + "_predictions_" + uv + "_" + cv + "_" + cb_pred + ".pkl"
                    )
                    print pred_save_loc
                    if not os.path.isdir(pred_save_loc):
                        print "Running " + cb_pred + " for user vector " + uv + " and content vector " + cv
                        if cb_pred == "cb_vect":
                            predictions = content_based.predict(
                                train_ratings, content_vect, num_partitions=self.num_partitions
                            )
                            sl.save_to_hadoop(predictions, pred_save_loc)
                        elif cb_pred == "cb_kmeans_100":
                            predictions = content_based_kmeans.predict(
                                train_ratings, content_vect, num_predictions=100, num_partitions=self.num_partitions
                            )
                            sl.save_to_hadoop(predictions, pred_save_loc)
                        elif cb_pred == "cb_kmeans_1000":
                            predictions = content_based_kmeans.predict(
                                train_ratings, content_vect, num_predictions=1000, num_partitions=self.num_partitions
                            )
                            sl.save_to_hadoop(predictions, pred_save_loc)
                        else:
                            # Unrecognized prediction type: stop processing this pair
                            break
        print "All CB predictions saved"
Example No. 3
    def run_cf_results(self):
        for uv in self.user_vector_types:
            train_ratings_loc = self.directory + self.data_name + "_uv_train_" + uv + ".pkl"
            train_ratings = sl.load_from_hadoop(train_ratings_loc, self.sc).repartition(self.num_partitions)
            test_ratings_loc = self.directory + self.data_name + "_uv_test_" + uv + ".pkl"
            test_ratings = sl.load_from_hadoop(test_ratings_loc, self.sc).repartition(self.num_partitions)

            # get the first content vector for results purposes
            content_path = self.directory + self.data_name + "_cv_" + self.content_vector_types[0] + ".pkl"
            content_vect = sl.load_from_hadoop(content_path, self.sc).repartition(self.num_partitions)

            # Calculate statistics about the dataset
            stats = dataset_stats.get_dataset_stats(train_ratings, test_ratings)

            for cf_pred in self.cf_predictions:

                pred_save_loc = self.directory + self.data_name + "_predictions_" + uv + "_" + cf_pred + ".pkl"
                print "Getting results for: " + pred_save_loc
                preds = sl.load_from_hadoop(pred_save_loc, self.sc).repartition(self.num_partitions)

                for run in self.results_runs:
                    results = performance_metrics.get_perform_metrics(
                        test_ratings,
                        train_ratings,
                        preds,
                        content_vect,
                        self.sqlCtx,
                        num_predictions=run,
                        num_partitions=self.num_partitions,
                    )
                    # Merge the stats (which do not change run to run) with the results
                    results.update(stats)

                    # add identifying information to the results dictionary in case the rows get jumbled

                    results["N"] = run
                    results["dataset"] = self.data_name
                    results["CF_CB"] = "CF"
                    results["alg_type"] = cf_pred
                    results["user_vector"] = uv
                    results["content_vector"] = self.content_vector_types[0]
                    print results

                    # save off the results
                    results_path = (
                        self.results_directory
                        + self.data_name
                        + "_results_"
                        + uv
                        + "_"
                        + cf_pred
                        + "_"
                        + str(run)
                        + ".pkl"
                    )
                    with open(results_path, "w") as f:
                        f.write(str(results))
        print "All CF predictions results aquired"
Example No. 4
    def run_single_prediction(self, user_vector, content_vector, alg_type):
        train_ratings_loc = self.directory + self.data_name + '_uv_train_' + user_vector + '.pkl'
        train_ratings = sl.load_from_hadoop(train_ratings_loc, self.sc).repartition(self.num_partitions)

        if content_vector:
            content_path = self.directory + self.data_name + '_cv_' + content_vector + '.pkl'
            content_vect = sl.load_from_hadoop(content_path, self.sc).repartition(self.num_partitions)
            print 'Running ' + alg_type + ' for user vector ' + user_vector + ' and content vector ' + content_vector

            pred_save_loc = self.directory + self.data_name + '_predictions_' + user_vector + '_' + content_vector + '_' + alg_type + '.pkl'
            print pred_save_loc

            if alg_type == 'cb_vect':
                predictions = content_based.predict(train_ratings, content_vect, num_partitions=self.num_partitions)
                sl.save_to_hadoop(predictions, pred_save_loc)
            elif alg_type == 'cb_kmeans_100':
                predictions = content_based_kmeans.predict(train_ratings, content_vect, num_predictions=100, num_partitions=self.num_partitions)
                sl.save_to_hadoop(predictions, pred_save_loc)
            elif alg_type == 'cb_kmeans_1000':
                predictions = content_based_kmeans.predict(train_ratings, content_vect, num_predictions=1000, num_partitions=self.num_partitions)
                sl.save_to_hadoop(predictions, pred_save_loc)

        else:
            print 'Running ' + alg_type + ' for user vector ' + user_vector

            pred_save_loc = self.directory + self.data_name + '_predictions_' + user_vector + '_' + alg_type + '.pkl'
            print pred_save_loc

            if alg_type == 'cf_mllib':
                predictions = cf.calc_cf_mllib(train_ratings, num_partitions=self.num_partitions)
                sl.save_to_hadoop(predictions, pred_save_loc)
            elif alg_type == 'cf_item':
                predictions = cf.calc_item_item_cf(train_ratings, num_partitions=self.num_partitions)
                sl.save_to_hadoop(predictions, pred_save_loc)
            elif alg_type == 'cf_user':
                predictions = cf.calc_user_user_cf2(train_ratings, num_partitions=self.num_partitions)
                sl.save_to_hadoop(predictions, pred_save_loc)
            elif alg_type == 'cf_bayes_map':
                predictions = cf.calc_naive_bayes_map(train_ratings, self.sc)
                sl.save_to_hadoop(predictions, pred_save_loc)
            elif alg_type == 'cf_bayes_mse':
                predictions = cf.calc_naive_bayes_mse(train_ratings, self.sc)
                sl.save_to_hadoop(predictions, pred_save_loc)
            elif alg_type == 'cf_bayes_mae':
                predictions = cf.calc_naive_bayes_mae(train_ratings, self.sc)
                sl.save_to_hadoop(predictions, pred_save_loc)
            elif alg_type == 'cf_random':
                predictions = random_recommender.predict(train_ratings, self.sc)
                sl.save_to_hadoop(predictions, pred_save_loc)
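The if/elif ladder above maps algorithm names to prediction functions; the same dispatch can be written as a dictionary, which keeps the mapping in one place and makes adding an algorithm a one-line change. A sketch of the idea for the collaborative-filtering branch, reusing the module names imported by the original code (a suggested refactoring, not code from the project):

    # Hypothetical dispatch table equivalent to the cf branch above.
    # Lambdas adapt the differing signatures of the calc_* functions.
    CF_ALGORITHMS = {
        'cf_mllib': lambda ratings, sc, n: cf.calc_cf_mllib(ratings, num_partitions=n),
        'cf_item': lambda ratings, sc, n: cf.calc_item_item_cf(ratings, num_partitions=n),
        'cf_user': lambda ratings, sc, n: cf.calc_user_user_cf2(ratings, num_partitions=n),
        'cf_bayes_map': lambda ratings, sc, n: cf.calc_naive_bayes_map(ratings, sc),
        'cf_bayes_mse': lambda ratings, sc, n: cf.calc_naive_bayes_mse(ratings, sc),
        'cf_bayes_mae': lambda ratings, sc, n: cf.calc_naive_bayes_mae(ratings, sc),
        'cf_random': lambda ratings, sc, n: random_recommender.predict(ratings, sc),
    }

    predict_fn = CF_ALGORITHMS.get(alg_type)
    if predict_fn is not None:
        predictions = predict_fn(train_ratings, self.sc, self.num_partitions)
        sl.save_to_hadoop(predictions, pred_save_loc)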
Example No. 5
    def run_cf_results(self):
        for uv in self.user_vector_types:
            train_ratings_loc = self.directory + self.data_name + '_uv_train_' + uv + '.pkl'
            train_ratings = sl.load_from_hadoop(
                train_ratings_loc, self.sc).repartition(self.num_partitions)
            test_ratings_loc = self.directory + self.data_name + '_uv_test_' + uv + '.pkl'
            test_ratings = sl.load_from_hadoop(
                test_ratings_loc, self.sc).repartition(self.num_partitions)

            #get the first content vector for results purposes
            content_path = self.directory + self.data_name + '_cv_' + self.content_vector_types[0] + '.pkl'
            content_vect = sl.load_from_hadoop(
                content_path, self.sc).repartition(self.num_partitions)

            # Calculate statistics about the dataset
            stats = dataset_stats.get_dataset_stats(train_ratings,
                                                    test_ratings)

            for cf_pred in self.cf_predictions:

                pred_save_loc = self.directory + self.data_name + '_predictions_' + uv + '_' + cf_pred + '.pkl'
                print 'Getting results for: ' + pred_save_loc
                preds = sl.load_from_hadoop(
                    pred_save_loc, self.sc).repartition(self.num_partitions)

                for run in self.results_runs:
                    results = performance_metrics.get_perform_metrics(
                        test_ratings, train_ratings, preds, content_vect, self.sqlCtx,
                        num_predictions=run, num_partitions=self.num_partitions)
                    # Merge the stats (which do not change run to run) with the results
                    results.update(stats)

                    #add identifying information to the results dictionary in case the rows get jumbled

                    results['N'] = run
                    results['dataset'] = self.data_name
                    results['CF_CB'] = 'CF'
                    results['alg_type'] = cf_pred
                    results['user_vector'] = uv
                    results['content_vector'] = self.content_vector_types[0]
                    print results

                    #save off the results
                    results_path = self.results_directory + self.data_name + '_results_' + uv + '_' \
                        + cf_pred + '_' + str(run) + '.pkl'
                    with open(results_path, 'w') as f:
                        f.write(str(results))
        print 'All CF predictions results acquired'
Example No. 6
    def run_single_result(self, user_vector, content_vector, alg_type,
                          algorithm, num_preds):

        train_ratings_loc = self.directory + self.data_name + '_uv_train_' + user_vector + '.pkl'
        train_ratings = sl.load_from_hadoop(
            train_ratings_loc, self.sc).repartition(self.num_partitions)
        test_ratings_loc = self.directory + self.data_name + '_uv_test_' + user_vector + '.pkl'
        test_ratings = sl.load_from_hadoop(
            test_ratings_loc, self.sc).repartition(self.num_partitions)

        content_path = self.directory + self.data_name + '_cv_' + content_vector + '.pkl'
        content_vect = sl.load_from_hadoop(content_path, self.sc).repartition(
            self.num_partitions)

        stats = dataset_stats.get_dataset_stats(train_ratings, test_ratings)

        if alg_type == 'cb':
            pred_save_loc = self.directory + self.data_name + '_predictions_' + user_vector + '_' + content_vector + '_' \
                + algorithm + '.pkl'
            results_path = self.results_directory + self.data_name + '_results_' + user_vector + '_' + content_vector + '_' \
                + algorithm + '_' + str(num_preds) + '.csv'
        else:
            pred_save_loc = self.directory + self.data_name + '_predictions_' + user_vector + '_' \
                + algorithm + '.pkl'
            results_path = self.results_directory + self.data_name + '_results_' + user_vector + '_' \
                + algorithm + '_' + str(num_preds) + '.csv'
        print 'Getting results for: ' + pred_save_loc
        preds = sl.load_from_hadoop(pred_save_loc,
                                    self.sc).repartition(self.num_partitions)

        results = performance_metrics.get_perform_metrics(
            test_ratings, train_ratings, preds, content_vect, self.sqlCtx,
            num_predictions=num_preds, num_partitions=self.num_partitions)
        # Merge the stats (which do not change run to run) with the results
        results.update(stats)
        #add identifying information to the results dictionary in case the rows get jumbled
        results['N'] = num_preds
        results['dataset'] = self.data_name
        results['CF_CB'] = 'CB'
        results['alg_type'] = algorithm
        results['user_vector'] = user_vector
        results['content_vector'] = content_vector
        print results

        #save off the results
        print results_path
        with open(results_path, 'w') as f:
            f.write(str(results))
Example No. 7
    def run_cf_predictions(self):
        for uv in self.user_vector_types:
            train_ratings_loc = self.directory + self.data_name + '_uv_train_' + uv + '.pkl'
            train_ratings = sl.load_from_hadoop(
                train_ratings_loc, self.sc).repartition(self.num_partitions)

            for cf_pred in self.cf_predictions:

                pred_save_loc = self.directory + self.data_name + '_predictions_' + uv + '_' + cf_pred + '.pkl'

                if not os.path.isdir(pred_save_loc):
                    print 'Running ' + cf_pred + ' for user vector ' + uv
                    print pred_save_loc
                    if cf_pred == 'cf_mllib':
                        predictions = cf.calc_cf_mllib(
                            train_ratings, num_partitions=self.num_partitions)
                        sl.save_to_hadoop(predictions, pred_save_loc)
                    elif cf_pred == 'cf_item':
                        predictions = cf.calc_item_item_cf(
                            train_ratings, num_partitions=self.num_partitions)
                        sl.save_to_hadoop(predictions, pred_save_loc)
                    elif cf_pred == 'cf_user':
                        predictions = cf.calc_user_user_cf2(
                            train_ratings, num_partitions=self.num_partitions)
                        sl.save_to_hadoop(predictions, pred_save_loc)
                    else:
                        # Unrecognized prediction type: stop processing this user vector
                        break
        print 'All CF predictions saved'
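The os.path.isdir() guard above makes the sweep resumable: sl.save_to_hadoop evidently persists each prediction set as a directory of part files, so an existing directory marks a finished run and that algorithm is skipped. Note the check uses the driver's local filesystem view, so it only holds for paths os.path can actually see. The same idempotency test as a standalone helper (hypothetical name, same assumption about how predictions are persisted):

    import os

    def already_computed(save_loc):
        # Saved prediction sets land as a directory of part files, so a
        # directory at save_loc means this run finished previously.
        return os.path.isdir(save_loc)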
Example No. 8
    def run_cf_results(self):
        for uv in self.user_vector_types:
            train_ratings_loc = self.directory + self.data_name + '_uv_train_' + uv + '.pkl'
            train_ratings = sl.load_from_hadoop(train_ratings_loc, self.sc).repartition(self.num_partitions)
            test_ratings_loc = self.directory + self.data_name + '_uv_test_' + uv + '.pkl'
            test_ratings = sl.load_from_hadoop(test_ratings_loc, self.sc).repartition(self.num_partitions)

            #get the first content vector for results purposes
            content_path = self.directory + self.data_name + '_cv_' + self.content_vector_types[0] + '.pkl'
            content_vect = sl.load_from_hadoop(content_path, self.sc).repartition(self.num_partitions)

            # Calculate statistics about the dataset
            stats = dataset_stats.get_dataset_stats(train_ratings, test_ratings)

            for cf_pred in self.cf_predictions:

                pred_save_loc = self.directory + self.data_name + '_predictions_' + uv + '_' + cf_pred  + '.pkl'
                print 'Getting results for: ' + pred_save_loc
                preds = sl.load_from_hadoop(pred_save_loc, self.sc).repartition(self.num_partitions)

                for run in self.results_runs:
                    results = performance_metrics.get_perform_metrics(
                        test_ratings, train_ratings, preds, content_vect, self.sqlCtx,
                        num_predictions=run, num_partitions=self.num_partitions)
                    # Merge the stats (which do not change run to run) with the results
                    results.update(stats)

                    #add identifying information to the results dictionary in case the rows get jumbled

                    results['N'] = run
                    results['dataset'] = self.data_name
                    results['CF_CB'] = 'CF'
                    results['alg_type'] = cf_pred
                    results['user_vector'] = uv
                    results['content_vector'] = self.content_vector_types[0]
                    print results

                    #save off the results
                    results_path = self.results_directory + self.data_name + '_results_' + uv + '_' \
                        + cf_pred + '_' + str(run) + '.pkl'
                    with open(results_path, 'w') as f:
                        f.write(str(results))
        print 'All CF predictions results acquired'
Example No. 9
    def run_single_result(self, user_vector, content_vector, alg_type, algorithm, num_preds):

        train_ratings_loc = self.directory + self.data_name + '_uv_train_' + user_vector + '.pkl'
        train_ratings = sl.load_from_hadoop(train_ratings_loc, self.sc).repartition(self.num_partitions)
        test_ratings_loc = self.directory + self.data_name + '_uv_test_' + user_vector + '.pkl'
        test_ratings = sl.load_from_hadoop(test_ratings_loc, self.sc).repartition(self.num_partitions)

        content_path = self.directory + self.data_name + '_cv_' + content_vector + '.pkl'
        content_vect = sl.load_from_hadoop(content_path, self.sc).repartition(self.num_partitions)

        stats = dataset_stats.get_dataset_stats(train_ratings, test_ratings)

        if alg_type == 'cb':
            pred_save_loc = self.directory + self.data_name + '_predictions_' + user_vector + '_' + content_vector + '_' \
                + algorithm + '.pkl'
            results_path = self.results_directory + self.data_name + '_results_' + user_vector + '_' + content_vector + '_' \
                + algorithm + '_' + str(num_preds) + '.csv'
        else:
            pred_save_loc = self.directory + self.data_name + '_predictions_' + user_vector + '_' \
                + algorithm + '.pkl'
            results_path = self.results_directory + self.data_name + '_results_' + user_vector + '_' \
                + algorithm + '_' + str(num_preds) + '.csv'
        print 'Getting results for: ' + pred_save_loc
        preds = sl.load_from_hadoop(pred_save_loc, self.sc).repartition(self.num_partitions)

        results = performance_metrics.get_perform_metrics(
            test_ratings, train_ratings, preds, content_vect, self.sqlCtx,
            num_predictions=num_preds, num_partitions=self.num_partitions)
        # Merge the stats (which do not change run to run) with the results
        results.update(stats)
        #add identifying information to the results dictionary in case the rows get jumbled
        results['N'] = num_preds
        results['dataset'] = self.data_name
        results['CF_CB'] = 'CB'
        results['alg_type'] = algorithm
        results['user_vector'] = user_vector
        results['content_vector'] = content_vector
        print results

        #save off the results
        print results_path
        with open(results_path, 'w') as f:
            f.write(str(results))
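Although the results file above is named with a .csv extension, what gets written is the str() of a dictionary, not comma-separated rows. A sketch of writing the same results as a genuine one-row CSV with the standard library instead (a suggested alternative; results and results_path are the variables built above):

    import csv

    # Write the results dict as one header row plus one value row.
    # Sorted keys keep the column order stable across runs; binary
    # mode follows the Python 2 csv module's recommendation.
    with open(results_path, 'wb') as f:
        writer = csv.DictWriter(f, fieldnames=sorted(results.keys()))
        writer.writeheader()
        writer.writerow(results)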
Example No. 10
    def run_cb_predictions(self):
        for cv in self.content_vector_types:
            content_path = self.directory + self.data_name + '_cv_' + cv + '.pkl'
            content_vect = sl.load_from_hadoop(
                content_path, self.sc).repartition(self.num_partitions)

            for uv in self.user_vector_types:
                train_ratings_loc = self.directory + self.data_name + '_uv_train_' + uv + '.pkl'
                train_ratings = sl.load_from_hadoop(train_ratings_loc,
                                                    self.sc).repartition(
                                                        self.num_partitions)

                for cb_pred in self.cb_predictions:
                    pred_save_loc = self.directory + self.data_name + '_predictions_' + uv + '_' + cv + '_' + cb_pred + '.pkl'
                    print pred_save_loc
                    if not os.path.isdir(pred_save_loc):
                        print 'Running ' + cb_pred + ' for user vector ' + uv + ' and content vector ' + cv
                        if cb_pred == 'cb_vect':
                            predictions = content_based.predict(
                                train_ratings,
                                content_vect,
                                num_partitions=self.num_partitions)
                            sl.save_to_hadoop(predictions, pred_save_loc)
                        elif cb_pred == 'cb_kmeans_100':
                            predictions = content_based_kmeans.predict(
                                train_ratings,
                                content_vect,
                                num_predictions=100,
                                num_partitions=self.num_partitions)
                            sl.save_to_hadoop(predictions, pred_save_loc)
                        elif cb_pred == 'cb_kmeans_1000':
                            predictions = content_based_kmeans.predict(
                                train_ratings,
                                content_vect,
                                num_predictions=1000,
                                num_partitions=self.num_partitions)
                            sl.save_to_hadoop(predictions, pred_save_loc)
                        else:
                            # Unrecognized prediction type: stop processing this pair
                            break
        print 'All CB predictions saved'
Example No. 11
    def run_cf_predictions(self):
        for uv in self.user_vector_types:
            train_ratings_loc = self.directory + self.data_name + '_uv_train_' + uv + '.pkl'
            train_ratings = sl.load_from_hadoop(train_ratings_loc, self.sc).repartition(self.num_partitions)

            for cf_pred in self.cf_predictions:

                pred_save_loc = self.directory + self.data_name + '_predictions_' + uv + '_' + cf_pred + '.pkl'

                if not os.path.isdir(pred_save_loc):
                    print 'Running ' + cf_pred + ' for user vector ' + uv
                    print pred_save_loc
                    if cf_pred == 'cf_mllib':
                        predictions = cf.calc_cf_mllib(train_ratings, num_partitions=self.num_partitions)
                        sl.save_to_hadoop(predictions, pred_save_loc)
                    elif cf_pred == 'cf_item':
                        predictions = cf.calc_item_item_cf(train_ratings, num_partitions=self.num_partitions)
                        sl.save_to_hadoop(predictions, pred_save_loc)
                    elif cf_pred == 'cf_user':
                        predictions = cf.calc_user_user_cf2(train_ratings, num_partitions=self.num_partitions)
                        sl.save_to_hadoop(predictions, pred_save_loc)
                    else:
                        # Unrecognized prediction type: stop processing this user vector
                        break
        print 'All CF predictions saved'
Example No. 12
    def run_cf_predictions(self):
        for uv in self.user_vector_types:
            train_ratings_loc = self.directory + self.data_name + "_uv_train_" + uv + ".pkl"
            train_ratings = sl.load_from_hadoop(train_ratings_loc, self.sc).repartition(self.num_partitions)

            for cf_pred in self.cf_predictions:

                pred_save_loc = self.directory + self.data_name + "_predictions_" + uv + "_" + cf_pred + ".pkl"

                if not os.path.isdir(pred_save_loc):
                    print "Running " + cf_pred + " for user vector " + uv
                    print pred_save_loc
                    if cf_pred == "cf_mllib":
                        predictions = cf.calc_cf_mllib(train_ratings, num_partitions=self.num_partitions)
                        sl.save_to_hadoop(predictions, pred_save_loc)
                    elif cf_pred == "cf_item":
                        predictions = cf.calc_item_item_cf(train_ratings, num_partitions=self.num_partitions)
                        sl.save_to_hadoop(predictions, pred_save_loc)
                    elif cf_pred == "cf_user":
                        predictions = cf.calc_user_user_cf2(train_ratings, num_partitions=self.num_partitions)
                        sl.save_to_hadoop(predictions, pred_save_loc)
                    else:
                        # Unrecognized prediction type: stop processing this user vector
                        break
        print "All CF predictions saved"
Example No. 13
    def run_single_prediction(self, user_vector, content_vector, alg_type):
        train_ratings_loc = self.directory + self.data_name + "_uv_train_" + user_vector + ".pkl"
        train_ratings = sl.load_from_hadoop(train_ratings_loc, self.sc).repartition(self.num_partitions)

        if content_vector:
            content_path = self.directory + self.data_name + "_cv_" + content_vector + ".pkl"
            content_vect = sl.load_from_hadoop(content_path, self.sc).repartition(self.num_partitions)
            print "Running " + alg_type + " for user vector " + user_vector + " and content vector " + content_vector

            pred_save_loc = (
                self.directory
                + self.data_name
                + "_predictions_"
                + user_vector
                + "_"
                + content_vector
                + "_"
                + alg_type
                + ".pkl"
            )
            print pred_save_loc

            if alg_type == "cb_vect":
                predictions = content_based.predict(train_ratings, content_vect, num_partitions=self.num_partitions)
                sl.save_to_hadoop(predictions, pred_save_loc)
            elif alg_type == "cb_kmeans_100":
                predictions = content_based_kmeans.predict(
                    train_ratings, content_vect, num_predictions=100, num_partitions=self.num_partitions
                )
                sl.save_to_hadoop(predictions, pred_save_loc)
            elif alg_type == "cb_kmeans_1000":
                predictions = content_based_kmeans.predict(
                    train_ratings, content_vect, num_predictions=1000, num_partitions=self.num_partitions
                )
                sl.save_to_hadoop(predictions, pred_save_loc)

        else:
            print "Running " + alg_type + " for user vector " + user_vector

            pred_save_loc = self.directory + self.data_name + "_predictions_" + user_vector + "_" + alg_type + ".pkl"
            print pred_save_loc

            if alg_type == "cf_mllib":
                predictions = cf.calc_cf_mllib(train_ratings, num_partitions=self.num_partitions)
                sl.save_to_hadoop(predictions, pred_save_loc)
            elif alg_type == "cf_item":
                predictions = cf.calc_item_item_cf(train_ratings, num_partitions=self.num_partitions)
                sl.save_to_hadoop(predictions, pred_save_loc)
            elif alg_type == "cf_user":
                predictions = cf.calc_user_user_cf2(train_ratings, num_partitions=self.num_partitions)
                sl.save_to_hadoop(predictions, pred_save_loc)
            elif alg_type == "cf_bayes_map":
                predictions = cf.calc_naive_bayes_map(train_ratings, self.sc)
                sl.save_to_hadoop(predictions, pred_save_loc)
            elif alg_type == "cf_bayes_mse":
                predictions = cf.calc_naive_bayes_mse(train_ratings, self.sc)
                sl.save_to_hadoop(predictions, pred_save_loc)
            elif alg_type == "cf_bayes_mae":
                predictions = cf.calc_naive_bayes_mae(train_ratings, self.sc)
                sl.save_to_hadoop(predictions, pred_save_loc)
            elif alg_type == "cf_random":
                predictions = random_recommender.predict(train_ratings, self.sc)
                sl.save_to_hadoop(predictions, pred_save_loc)
Example No. 14
    def run_cb_results(self):
        for cv in self.content_vector_types:
            content_path = self.directory + self.data_name + '_cv_' + cv + '.pkl'
            content_vect = sl.load_from_hadoop(content_path, self.sc)

            for uv in self.user_vector_types:
                train_ratings_loc = self.directory + self.data_name + '_uv_train_' + uv + '.pkl'
                train_ratings = sl.load_from_hadoop(train_ratings_loc, self.sc).repartition(self.num_partitions)
                test_ratings_loc = self.directory + self.data_name + '_uv_test_' + uv + '.pkl'
                test_ratings = sl.load_from_hadoop(test_ratings_loc, self.sc).repartition(self.num_partitions)

                # Calculate statistics about the dataset
                stats = dataset_stats.get_dataset_stats(train_ratings, test_ratings)

                for cb_pred in self.cb_predictions:

                    pred_save_loc = self.directory + self.data_name + '_predictions_' + uv + '_' + cv + '_' \
                                + cb_pred + '.pkl'
                    print 'Getting results for: ' + pred_save_loc
                    preds = sl.load_from_hadoop(pred_save_loc, self.sc).repartition(self.num_partitions)
                    #print preds.count()

                    #if we ran kmeans, the number of predictions is fixed by
                    #the algorithm name, so only one run is needed; otherwise
                    #we evaluate every run size
                    if cb_pred == 'cb_kmeans_100' or cb_pred == 'cb_kmeans_1000':
                        if cb_pred == 'cb_kmeans_1000':
                            run = 1000
                        else:
                            run = 100
                        results = performance_metrics.get_perform_metrics(
                            test_ratings, train_ratings, preds, content_vect, self.sqlCtx,
                            num_predictions=run, num_partitions=self.num_partitions)
                        # Merge the stats (which do not change run to run) with the results
                        results.update(stats)
                        #add identifying information to the results dictionary in case the rows get jumbled
                        results['N'] = run
                        results['dataset'] = self.data_name
                        results['CF_CB'] = 'CB'
                        results['alg_type'] = cb_pred
                        results['user_vector'] = uv
                        results['content_vector'] = cv
                        print results

                        #save off the results
                        results_path = self.results_directory + self.data_name + '_results_' + uv + '_' + cv + '_' \
                            + cb_pred + '_' + str(run) + '.csv'
                        print results_path
                        with open(results_path, 'w') as f:
                            f.write(str(results))

                    else:
                        for run in self.results_runs:
                            results = performance_metrics.get_perform_metrics(
                                test_ratings, train_ratings, preds, content_vect, self.sqlCtx,
                                num_predictions=run, num_partitions=self.num_partitions)
                            # Merge the stats (which do not change run to run) with the results
                            results.update(stats)
                            #add identifying information to the results dictionary in case the rows get jumbled
                            results['N'] = run
                            results['dataset'] = self.data_name
                            results['CF_CB'] = 'CB'
                            results['alg_type'] = cb_pred
                            results['user_vector'] = uv
                            results['content_vector'] = cv
                            print results

                            #save off the results
                            results_path = self.results_directory + self.data_name + '_results_' + uv + '_' + cv \
                                + '_' + cb_pred + '_' + str(run) + '.csv'
                            print results_path
                            with open(results_path, 'w') as f:
                                f.write(str(results))
        print 'All CB predictions results acquired'
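The kmeans branch above hard-codes the mapping from predictor name to run size (cb_kmeans_100 evaluates 100 predictions, cb_kmeans_1000 evaluates 1000). Since the size is embedded in the name, it can also be parsed out directly, which avoids adding a branch for every new cluster count. A small sketch of that simplification (a suggestion, not the project's code):

    # Derive the run size from predictor names of the form 'cb_kmeans_<N>'.
    def kmeans_run_size(cb_pred):
        if cb_pred.startswith('cb_kmeans_'):
            return int(cb_pred.rsplit('_', 1)[1])
        return None  # not a kmeans predictor: evaluate all run sizes

    assert kmeans_run_size('cb_kmeans_100') == 100
    assert kmeans_run_size('cb_kmeans_1000') == 1000
    assert kmeans_run_size('cb_vect') is None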
Example No. 15
    def run_cb_results(self):
        for cv in self.content_vector_types:
            content_path = self.directory + self.data_name + "_cv_" + cv + ".pkl"
            content_vect = sl.load_from_hadoop(content_path, self.sc)

            for uv in self.user_vector_types:
                train_ratings_loc = self.directory + self.data_name + "_uv_train_" + uv + ".pkl"
                train_ratings = sl.load_from_hadoop(train_ratings_loc, self.sc).repartition(self.num_partitions)
                test_ratings_loc = self.directory + self.data_name + "_uv_test_" + uv + ".pkl"
                test_ratings = sl.load_from_hadoop(test_ratings_loc, self.sc).repartition(self.num_partitions)

                # Calculate statistics about the dataset
                stats = dataset_stats.get_dataset_stats(train_ratings, test_ratings)

                for cb_pred in self.cb_predictions:

                    pred_save_loc = (
                        self.directory + self.data_name + "_predictions_" + uv + "_" + cv + "_" + cb_pred + ".pkl"
                    )
                    print "Getting results for: " + pred_save_loc
                    preds = sl.load_from_hadoop(pred_save_loc, self.sc).repartition(self.num_partitions)
                    # print preds.count()

                    # if we ran kmeans, the number of predictions is fixed by
                    # the algorithm name, so only one run is needed; otherwise
                    # we evaluate every run size
                    if cb_pred == "cb_kmeans_100" or cb_pred == "cb_kmeans_1000":
                        if cb_pred == "cb_kmeans_1000":
                            run = 1000
                        else:
                            run = 100
                        results = performance_metrics.get_perform_metrics(
                            test_ratings,
                            train_ratings,
                            preds,
                            content_vect,
                            self.sqlCtx,
                            num_predictions=run,
                            num_partitions=self.num_partitions,
                        )
                        # Merge the stats (which do not change run to run) with the results
                        results.update(stats)
                        # add identifying information to the results dictionary in case the rows get jumbled
                        results["N"] = run
                        results["dataset"] = self.data_name
                        results["CF_CB"] = "CB"
                        results["alg_type"] = cb_pred
                        results["user_vector"] = uv
                        results["content_vector"] = cv
                        print results

                        # save off the results
                        results_path = (
                            self.results_directory
                            + self.data_name
                            + "_results_"
                            + uv
                            + "_"
                            + cv
                            + "_"
                            + cb_pred
                            + "_"
                            + str(run)
                            + ".csv"
                        )
                        print results_path
                        with open(results_path, "w") as f:
                            f.write(str(results))

                    else:
                        for run in self.results_runs:
                            results = performance_metrics.get_perform_metrics(
                                test_ratings,
                                train_ratings,
                                preds,
                                content_vect,
                                self.sqlCtx,
                                num_predictions=run,
                                num_partitions=self.num_partitions,
                            )
                            # Merge the stats (which do not change run to run) with the results
                            results.update(stats)
                            # add identifying information to the results dictionary in case the rows get jumbled
                            results["N"] = run
                            results["dataset"] = self.data_name
                            results["CF_CB"] = "CB"
                            results["alg_type"] = cb_pred
                            results["user_vector"] = uv
                            results["content_vector"] = cv
                            print results

                            # save off the results
                            results_path = (
                                self.results_directory
                                + self.data_name
                                + "_results_"
                                + uv
                                + "_"
                                + cv
                                + "_"
                                + cb_pred
                                + "_"
                                + str(run)
                                + ".csv"
                            )
                            print results_path
                            with open(results_path, "w") as f:
                                f.write(str(results))
        print "All CB predictions results aquired"
Example No. 16
    def run_single_result(self, user_vector, content_vector, alg_type, algorithm, num_preds):

        train_ratings_loc = self.directory + self.data_name + "_uv_train_" + user_vector + ".pkl"
        train_ratings = sl.load_from_hadoop(train_ratings_loc, self.sc).repartition(self.num_partitions)
        test_ratings_loc = self.directory + self.data_name + "_uv_test_" + user_vector + ".pkl"
        test_ratings = sl.load_from_hadoop(test_ratings_loc, self.sc).repartition(self.num_partitions)

        content_path = self.directory + self.data_name + "_cv_" + content_vector + ".pkl"
        content_vect = sl.load_from_hadoop(content_path, self.sc).repartition(self.num_partitions)

        stats = dataset_stats.get_dataset_stats(train_ratings, test_ratings)

        if alg_type == "cb":
            pred_save_loc = (
                self.directory
                + self.data_name
                + "_predictions_"
                + user_vector
                + "_"
                + content_vector
                + "_"
                + algorithm
                + ".pkl"
            )
            results_path = (
                self.results_directory
                + self.data_name
                + "_results_"
                + user_vector
                + "_"
                + content_vector
                + "_"
                + algorithm
                + "_"
                + str(num_preds)
                + ".csv"
            )
        else:
            pred_save_loc = self.directory + self.data_name + "_predictions_" + user_vector + "_" + algorithm + ".pkl"
            results_path = (
                self.results_directory
                + self.data_name
                + "_results_"
                + user_vector
                + "_"
                + algorithm
                + "_"
                + str(num_preds)
                + ".csv"
            )
        print "Getting results for: " + pred_save_loc
        preds = sl.load_from_hadoop(pred_save_loc, self.sc).repartition(self.num_partitions)

        results = performance_metrics.get_perform_metrics(
            test_ratings,
            train_ratings,
            preds,
            content_vect,
            self.sqlCtx,
            num_predictions=num_preds,
            num_partitions=self.num_partitions,
        )
        # Merge the stats (which do not change run to run) with the results
        results.update(stats)
        # add identifying information to the results dictionary in case the rows get jumbled
        results["N"] = num_preds
        results["dataset"] = self.data_name
        results["CF_CB"] = "CB"
        results["alg_type"] = algorithm
        results["user_vector"] = user_vector
        results["content_vector"] = content_vector
        print results

        # save off the results
        print results_path
        with open(results_path, "w") as f:
            f.write(str(results))
Example No. 17
    def run_single_prediction(self, user_vector, content_vector, alg_type):
        train_ratings_loc = self.directory + self.data_name + '_uv_train_' + user_vector + '.pkl'
        train_ratings = sl.load_from_hadoop(
            train_ratings_loc, self.sc).repartition(self.num_partitions)

        if content_vector:
            content_path = self.directory + self.data_name + '_cv_' + content_vector + '.pkl'
            content_vect = sl.load_from_hadoop(
                content_path, self.sc).repartition(self.num_partitions)
            print 'Running ' + alg_type + ' for user vector ' + user_vector + ' and content vector ' + content_vector

            pred_save_loc = self.directory + self.data_name + '_predictions_' + user_vector + '_' + content_vector + '_' + alg_type + '.pkl'
            print pred_save_loc

            if alg_type == 'cb_vect':
                predictions = content_based.predict(
                    train_ratings,
                    content_vect,
                    num_partitions=self.num_partitions)
                sl.save_to_hadoop(predictions, pred_save_loc)
            elif alg_type == 'cb_kmeans_100':
                predictions = content_based_kmeans.predict(
                    train_ratings,
                    content_vect,
                    num_predictions=100,
                    num_partitions=self.num_partitions)
                sl.save_to_hadoop(predictions, pred_save_loc)
            elif alg_type == 'cb_kmeans_1000':
                predictions = content_based_kmeans.predict(
                    train_ratings,
                    content_vect,
                    num_predictions=1000,
                    num_partitions=self.num_partitions)
                sl.save_to_hadoop(predictions, pred_save_loc)

        else:
            print 'Running ' + alg_type + ' for user vector ' + user_vector

            pred_save_loc = self.directory + self.data_name + '_predictions_' + user_vector + '_' + alg_type + '.pkl'
            print pred_save_loc

            if alg_type == 'cf_mllib':
                predictions = cf.calc_cf_mllib(
                    train_ratings, num_partitions=self.num_partitions)
                sl.save_to_hadoop(predictions, pred_save_loc)
            elif alg_type == 'cf_item':
                predictions = cf.calc_item_item_cf(
                    train_ratings, num_partitions=self.num_partitions)
                sl.save_to_hadoop(predictions, pred_save_loc)
            elif alg_type == 'cf_user':
                predictions = cf.calc_user_user_cf2(
                    train_ratings, num_partitions=self.num_partitions)
                sl.save_to_hadoop(predictions, pred_save_loc)
            elif alg_type == 'cf_bayes_map':
                predictions = cf.calc_naive_bayes_map(train_ratings, self.sc)
                sl.save_to_hadoop(predictions, pred_save_loc)
            elif alg_type == 'cf_bayes_mse':
                predictions = cf.calc_naive_bayes_mse(train_ratings, self.sc)
                sl.save_to_hadoop(predictions, pred_save_loc)
            elif alg_type == 'cf_bayes_mae':
                predictions = cf.calc_naive_bayes_mae(train_ratings, self.sc)
                sl.save_to_hadoop(predictions, pred_save_loc)
            elif alg_type == 'cf_random':
                predictions = random_recommender.predict(
                    train_ratings, self.sc)
                sl.save_to_hadoop(predictions, pred_save_loc)
Example No. 18
    def run_cb_results(self):
        for cv in self.content_vector_types:
            content_path = self.directory + self.data_name + '_cv_' + cv + '.pkl'
            content_vect = sl.load_from_hadoop(content_path, self.sc)

            for uv in self.user_vector_types:
                train_ratings_loc = self.directory + self.data_name + '_uv_train_' + uv + '.pkl'
                train_ratings = sl.load_from_hadoop(train_ratings_loc,
                                                    self.sc).repartition(
                                                        self.num_partitions)
                test_ratings_loc = self.directory + self.data_name + '_uv_test_' + uv + '.pkl'
                test_ratings = sl.load_from_hadoop(
                    test_ratings_loc, self.sc).repartition(self.num_partitions)

                # Calculate statistics about the dataset
                stats = dataset_stats.get_dataset_stats(
                    train_ratings, test_ratings)

                for cb_pred in self.cb_predictions:

                    pred_save_loc = self.directory + self.data_name + '_predictions_' + uv + '_' + cv + '_' \
                                + cb_pred + '.pkl'
                    print 'Getting results for: ' + pred_save_loc
                    preds = sl.load_from_hadoop(pred_save_loc,
                                                self.sc).repartition(
                                                    self.num_partitions)
                    #print preds.count()

                    #if we ran kmeans, the number of predictions is fixed by
                    #the algorithm name, so only one run is needed; otherwise
                    #we evaluate every run size
                    if cb_pred == 'cb_kmeans_100' or cb_pred == 'cb_kmeans_1000':
                        if cb_pred == 'cb_kmeans_1000':
                            run = 1000
                        else:
                            run = 100
                        results = performance_metrics.get_perform_metrics(
                            test_ratings, train_ratings, preds, content_vect, self.sqlCtx,
                            num_predictions=run, num_partitions=self.num_partitions)
                        # Merge the stats (which do not change run to run) with the results
                        results.update(stats)
                        #add identifying information to the results dictionary in case the rows get jumbled
                        results['N'] = run
                        results['dataset'] = self.data_name
                        results['CF_CB'] = 'CB'
                        results['alg_type'] = cb_pred
                        results['user_vector'] = uv
                        results['content_vector'] = cv
                        print results

                        #save off the results
                        results_path = self.results_directory + self.data_name + '_results_' + uv + '_' + cv + '_' \
                            + cb_pred + '_' + str(run) + '.csv'
                        print results_path
                        with open(results_path, 'w') as f:
                            f.write(str(results))

                    else:
                        for run in self.results_runs:
                            results = performance_metrics.get_perform_metrics(
                                test_ratings, train_ratings, preds, content_vect, self.sqlCtx,
                                num_predictions=run, num_partitions=self.num_partitions)
                            # Merge the stats (which do not change run to run) with the results
                            results.update(stats)
                            #add identifying information to the results dictionary in case the rows get jumbled
                            results['N'] = run
                            results['dataset'] = self.data_name
                            results['CF_CB'] = 'CB'
                            results['alg_type'] = cb_pred
                            results['user_vector'] = uv
                            results['content_vector'] = cv
                            print results

                            #save off the results
                            results_path = self.results_directory + self.data_name + '_results_' + uv + '_' + cv \
                                + '_' + cb_pred + '_' + str(run) + '.csv'
                            print results_path
                            with open(results_path, 'w') as f:
                                f.write(str(results))
        print 'All CB predictions results acquired'