Exemplo n.º 1
0
    def run_cf_predictions(self):
        """Compute and save every configured collaborative-filtering prediction.

        For each user vector in ``self.user_vector_types`` the training
        ratings are loaded and repartitioned once, then each algorithm named
        in ``self.cf_predictions`` is run and its result handed to
        ``sl.save_to_hadoop``. A prediction whose output path already exists
        as a directory is not recomputed.

        Recognized algorithm names are 'cf_mllib', 'cf_item' and 'cf_user'.
        An unrecognized name is reported and skipped. It previously hit a
        ``break`` that silently dropped every algorithm listed after it for
        the current user vector.
        """
        for uv in self.user_vector_types:
            train_ratings_loc = self.directory + self.data_name + '_uv_train_' + uv + '.pkl'
            train_ratings = sl.load_from_hadoop(
                train_ratings_loc, self.sc).repartition(self.num_partitions)

            for cf_pred in self.cf_predictions:
                pred_save_loc = self.directory + self.data_name + '_predictions_' + uv + '_' + cf_pred + '.pkl'

                # An existing output directory means this prediction is done.
                # NOTE(review): os.path.isdir only inspects the local
                # filesystem; confirm this is the right check for the paths
                # written by sl.save_to_hadoop.
                if os.path.isdir(pred_save_loc):
                    continue

                if cf_pred == 'cf_mllib':
                    calc_predictions = cf.calc_cf_mllib
                elif cf_pred == 'cf_item':
                    calc_predictions = cf.calc_item_item_cf
                elif cf_pred == 'cf_user':
                    calc_predictions = cf.calc_user_user_cf2
                else:
                    # Skip only this entry; the remaining algorithms still run.
                    print('Skipping unknown CF algorithm ' + cf_pred + ' for user vector ' + uv)
                    continue

                print('Running ' + cf_pred + ' for user vector ' + uv)
                print(pred_save_loc)
                predictions = calc_predictions(
                    train_ratings, num_partitions=self.num_partitions)
                sl.save_to_hadoop(predictions, pred_save_loc)
        print('All CF predictions saved')
Exemplo n.º 2
0
    def run_single_prediction(self, user_vector, content_vector, alg_type):
        """Run one named algorithm for a user vector and save its predictions.

        The training ratings for ``user_vector`` are always loaded. When
        ``content_vector`` is truthy the matching content vectors are loaded
        too and ``alg_type`` is looked up among the content-based algorithms
        ('cb_vect', 'cb_kmeans_100', 'cb_kmeans_1000'). Otherwise it is looked
        up among the collaborative-filtering ones ('cf_mllib', 'cf_item',
        'cf_user', 'cf_bayes_map', 'cf_bayes_mse', 'cf_bayes_mae',
        'cf_random'). A name outside the selected group computes and saves
        nothing.
        """
        base = self.directory + self.data_name
        ratings_path = base + '_uv_train_' + user_vector + '.pkl'
        ratings = sl.load_from_hadoop(ratings_path, self.sc)
        ratings = ratings.repartition(self.num_partitions)
        n_parts = self.num_partitions

        # Each table entry is a zero-argument callable so that only the
        # selected algorithm is looked up and executed.
        if content_vector:
            content_path = base + '_cv_' + content_vector + '.pkl'
            content = sl.load_from_hadoop(content_path, self.sc)
            content = content.repartition(n_parts)
            print('Running ' + alg_type + ' for user vector ' + user_vector + ' and content vector ' + content_vector)

            out_path = base + '_predictions_' + user_vector + '_' + content_vector + '_' + alg_type + '.pkl'
            runners = {
                'cb_vect': lambda: content_based.predict(
                    ratings, content, num_partitions=n_parts),
                'cb_kmeans_100': lambda: content_based_kmeans.predict(
                    ratings, content, num_predictions=100, num_partitions=n_parts),
                'cb_kmeans_1000': lambda: content_based_kmeans.predict(
                    ratings, content, num_predictions=1000, num_partitions=n_parts),
            }
        else:
            print('Running ' + alg_type + ' for user vector ' + user_vector)

            out_path = base + '_predictions_' + user_vector + '_' + alg_type + '.pkl'
            runners = {
                'cf_mllib': lambda: cf.calc_cf_mllib(ratings, num_partitions=n_parts),
                'cf_item': lambda: cf.calc_item_item_cf(ratings, num_partitions=n_parts),
                'cf_user': lambda: cf.calc_user_user_cf2(ratings, num_partitions=n_parts),
                'cf_bayes_map': lambda: cf.calc_naive_bayes_map(ratings, self.sc),
                'cf_bayes_mse': lambda: cf.calc_naive_bayes_mse(ratings, self.sc),
                'cf_bayes_mae': lambda: cf.calc_naive_bayes_mae(ratings, self.sc),
                'cf_random': lambda: random_recommender.predict(ratings, self.sc),
            }

        print(out_path)

        runner = runners.get(alg_type)
        if runner is not None:
            sl.save_to_hadoop(runner(), out_path)
Exemplo n.º 3
0
    def run_cf_predictions(self):
        """Compute and save every configured collaborative-filtering prediction.

        Iterates over ``self.user_vector_types``; for each one the training
        ratings are loaded and repartitioned once, and every algorithm named
        in ``self.cf_predictions`` is run and saved through
        ``sl.save_to_hadoop`` unless its output path already exists as a
        directory.

        Recognized algorithm names are 'cf_mllib', 'cf_item' and 'cf_user'.
        An unrecognized name is reported and skipped. The earlier ``break``
        silently abandoned all algorithms listed after it for the current
        user vector.
        """
        # Name -> attribute of ``cf`` implementing it; resolved with getattr
        # only for the algorithm actually being run.
        known_algorithms = {
            'cf_mllib': 'calc_cf_mllib',
            'cf_item': 'calc_item_item_cf',
            'cf_user': 'calc_user_user_cf2',
        }

        for uv in self.user_vector_types:
            train_ratings_loc = self.directory + self.data_name + '_uv_train_' + uv + '.pkl'
            train_ratings = sl.load_from_hadoop(train_ratings_loc, self.sc).repartition(self.num_partitions)

            for cf_pred in self.cf_predictions:
                pred_save_loc = self.directory + self.data_name + '_predictions_' + uv + '_' + cf_pred + '.pkl'

                # An existing output directory means this prediction is done.
                # NOTE(review): os.path.isdir only inspects the local
                # filesystem; confirm this is the right check for the paths
                # written by sl.save_to_hadoop.
                if os.path.isdir(pred_save_loc):
                    continue

                if cf_pred not in known_algorithms:
                    # Skip only this entry; the remaining algorithms still run.
                    print('Skipping unknown CF algorithm ' + cf_pred + ' for user vector ' + uv)
                    continue

                print('Running ' + cf_pred + ' for user vector ' + uv)
                print(pred_save_loc)
                calc_predictions = getattr(cf, known_algorithms[cf_pred])
                predictions = calc_predictions(train_ratings, num_partitions=self.num_partitions)
                sl.save_to_hadoop(predictions, pred_save_loc)
        print('All CF predictions saved')
Exemplo n.º 4
0
    def run_cf_predictions(self):
        """Compute and save every configured collaborative-filtering prediction.

        For each entry of ``self.user_vector_types`` the training ratings are
        loaded and repartitioned once. Each algorithm named in
        ``self.cf_predictions`` is then run and its result passed to
        ``sl.save_to_hadoop``, except where the output path already exists as
        a directory.

        Recognized algorithm names are "cf_mllib", "cf_item" and "cf_user".
        An unrecognized name is reported and skipped. It used to trigger a
        ``break`` that silently dropped the algorithms listed after it for
        the current user vector.
        """
        for uv in self.user_vector_types:
            train_ratings_loc = self.directory + self.data_name + "_uv_train_" + uv + ".pkl"
            train_ratings = sl.load_from_hadoop(train_ratings_loc, self.sc).repartition(self.num_partitions)

            for cf_pred in self.cf_predictions:
                pred_save_loc = self.directory + self.data_name + "_predictions_" + uv + "_" + cf_pred + ".pkl"

                # An existing output directory means this prediction is done.
                # NOTE(review): os.path.isdir only inspects the local
                # filesystem; confirm this is the right check for the paths
                # written by sl.save_to_hadoop.
                if os.path.isdir(pred_save_loc):
                    continue

                if cf_pred == "cf_mllib":
                    calc_predictions = cf.calc_cf_mllib
                elif cf_pred == "cf_item":
                    calc_predictions = cf.calc_item_item_cf
                elif cf_pred == "cf_user":
                    calc_predictions = cf.calc_user_user_cf2
                else:
                    # Skip only this entry; the remaining algorithms still run.
                    print("Skipping unknown CF algorithm " + cf_pred + " for user vector " + uv)
                    continue

                print("Running " + cf_pred + " for user vector " + uv)
                print(pred_save_loc)
                predictions = calc_predictions(train_ratings, num_partitions=self.num_partitions)
                sl.save_to_hadoop(predictions, pred_save_loc)
        print("All CF predictions saved")
Exemplo n.º 5
0
    def run_single_prediction(self, user_vector, content_vector, alg_type):
        """Run a single prediction algorithm and save what it produces.

        Training ratings for ``user_vector`` are loaded in every case. A
        truthy ``content_vector`` also loads the content vectors and selects
        the content-based algorithms ('cb_vect', 'cb_kmeans_100',
        'cb_kmeans_1000'). A falsy one selects the collaborative-filtering
        algorithms ('cf_mllib', 'cf_item', 'cf_user', 'cf_bayes_map',
        'cf_bayes_mse', 'cf_bayes_mae', 'cf_random'). If ``alg_type`` is not
        in the selected group nothing is computed or saved.
        """
        path_prefix = self.directory + self.data_name
        train_path = path_prefix + '_uv_train_' + user_vector + '.pkl'
        train = sl.load_from_hadoop(train_path, self.sc)
        train = train.repartition(self.num_partitions)
        partitions = self.num_partitions

        def kmeans(num_predictions):
            # Shared call shape for the two k-means variants.
            return content_based_kmeans.predict(
                train,
                content,
                num_predictions=num_predictions,
                num_partitions=partitions)

        # Table values are thunks so that only the chosen algorithm runs.
        if content_vector:
            content_path = path_prefix + '_cv_' + content_vector + '.pkl'
            content = sl.load_from_hadoop(content_path, self.sc)
            content = content.repartition(partitions)
            print('Running ' + alg_type + ' for user vector ' + user_vector + ' and content vector ' + content_vector)

            save_path = path_prefix + '_predictions_' + user_vector + '_' + content_vector + '_' + alg_type + '.pkl'
            algorithms = {
                'cb_vect': lambda: content_based.predict(
                    train, content, num_partitions=partitions),
                'cb_kmeans_100': lambda: kmeans(100),
                'cb_kmeans_1000': lambda: kmeans(1000),
            }
        else:
            print('Running ' + alg_type + ' for user vector ' + user_vector)

            save_path = path_prefix + '_predictions_' + user_vector + '_' + alg_type + '.pkl'
            algorithms = {
                'cf_mllib': lambda: cf.calc_cf_mllib(
                    train, num_partitions=partitions),
                'cf_item': lambda: cf.calc_item_item_cf(
                    train, num_partitions=partitions),
                'cf_user': lambda: cf.calc_user_user_cf2(
                    train, num_partitions=partitions),
                'cf_bayes_map': lambda: cf.calc_naive_bayes_map(train, self.sc),
                'cf_bayes_mse': lambda: cf.calc_naive_bayes_mse(train, self.sc),
                'cf_bayes_mae': lambda: cf.calc_naive_bayes_mae(train, self.sc),
                'cf_random': lambda: random_recommender.predict(train, self.sc),
            }

        print(save_path)

        if alg_type in algorithms:
            sl.save_to_hadoop(algorithms[alg_type](), save_path)
Exemplo n.º 6
0
    def run_single_prediction(self, user_vector, content_vector, alg_type):
        """Run the algorithm named by ``alg_type`` once and save its output.

        The training ratings for ``user_vector`` are loaded first. With a
        truthy ``content_vector`` the content vectors are loaded as well and
        ``alg_type`` must be one of "cb_vect", "cb_kmeans_100" or
        "cb_kmeans_1000". With a falsy one it must be one of "cf_mllib",
        "cf_item", "cf_user", "cf_bayes_map", "cf_bayes_mse", "cf_bayes_mae"
        or "cf_random". Any other name results in no computation and no save.
        """
        stem = self.directory + self.data_name
        train_loc = stem + "_uv_train_" + user_vector + ".pkl"
        train_rdd = sl.load_from_hadoop(train_loc, self.sc)
        train_rdd = train_rdd.repartition(self.num_partitions)
        num_parts = self.num_partitions

        # Dispatch entries are deferred calls so that only the requested
        # algorithm is resolved and executed.
        if content_vector:
            content_loc = stem + "_cv_" + content_vector + ".pkl"
            content_rdd = sl.load_from_hadoop(content_loc, self.sc)
            content_rdd = content_rdd.repartition(num_parts)
            print("Running " + alg_type + " for user vector " + user_vector + " and content vector " + content_vector)

            target = stem + "_predictions_" + user_vector + "_" + content_vector + "_" + alg_type + ".pkl"
            dispatch = {
                "cb_vect": lambda: content_based.predict(
                    train_rdd, content_rdd, num_partitions=num_parts
                ),
                "cb_kmeans_100": lambda: content_based_kmeans.predict(
                    train_rdd, content_rdd, num_predictions=100, num_partitions=num_parts
                ),
                "cb_kmeans_1000": lambda: content_based_kmeans.predict(
                    train_rdd, content_rdd, num_predictions=1000, num_partitions=num_parts
                ),
            }
        else:
            print("Running " + alg_type + " for user vector " + user_vector)

            target = stem + "_predictions_" + user_vector + "_" + alg_type + ".pkl"
            dispatch = {
                "cf_mllib": lambda: cf.calc_cf_mllib(train_rdd, num_partitions=num_parts),
                "cf_item": lambda: cf.calc_item_item_cf(train_rdd, num_partitions=num_parts),
                "cf_user": lambda: cf.calc_user_user_cf2(train_rdd, num_partitions=num_parts),
                "cf_bayes_map": lambda: cf.calc_naive_bayes_map(train_rdd, self.sc),
                "cf_bayes_mse": lambda: cf.calc_naive_bayes_mse(train_rdd, self.sc),
                "cf_bayes_mae": lambda: cf.calc_naive_bayes_mae(train_rdd, self.sc),
                "cf_random": lambda: random_recommender.predict(train_rdd, self.sc),
            }

        print(target)

        compute = dispatch.get(alg_type)
        if compute is None:
            return
        sl.save_to_hadoop(compute(), target)