def calculate_sparsity(self):
        """
        Returns the fraction of missing ratings in the list of reviews of
        this ReviewsDatasetAnalyzer

        :return: the rate of missing ratings
        (i.e. number of missing ratings / (number of items * number of users))
        :raise ValueError: in case an empty list is given
        """
        if not self.reviews:
            raise ValueError("Can not determine the sparsity for an empty list")

        user_ids = extractor.get_groupby_list(self.reviews, "user_id")
        item_ids = extractor.get_groupby_list(self.reviews, "offering_id")

        non_missing_reviews = 0.0
        total_expected_reviews = len(user_ids) * len(item_ids)

        # For each user, count the distinct items that user has rated, so
        # repeated (user, item) reviews are only counted once
        for user in user_ids:
            user_reviews = ETLUtils.filter_records(self.reviews, "user_id", [user])
            user_items = extractor.get_groupby_list(user_reviews, "offering_id")

            non_missing_reviews += len(set(item_ids).intersection(set(user_items)))

        return 1 - non_missing_reviews / total_expected_reviews
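
To make the formula concrete, here is a minimal self-contained sketch of the
same computation on a toy dataset, using plain Python sets in place of the
project-internal extractor and ETLUtils helpers (the records and field names
below are made up for illustration):

reviews = [
    {'user_id': 'U1', 'offering_id': 'I1'},
    {'user_id': 'U1', 'offering_id': 'I2'},
    {'user_id': 'U2', 'offering_id': 'I1'},
]
users = {review['user_id'] for review in reviews}
items = {review['offering_id'] for review in reviews}
rated_pairs = {(review['user_id'], review['offering_id']) for review in reviews}

# 2 users * 2 items = 4 possible ratings, of which 3 are present
sparsity = 1 - float(len(rated_pairs)) / (len(users) * len(items))
print(sparsity)  # 0.25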
Example #2

    def load(self, reviews):
        self.reviews = reviews
        self.user_ids = extractor.get_groupby_list(self.reviews, "user_id")
        self.item_ids = extractor.get_groupby_list(self.reviews, "offering_id")
        self.num_users = len(self.user_ids)
        self.num_items = len(self.item_ids)
        self.data_matrix = create_matrix(self.reviews)
Example #3
    def initialize(self):
        self.user_ids =\
            extractor.get_groupby_list(self.records, Constants.USER_ID_FIELD)
        self.item_ids =\
            extractor.get_groupby_list(self.records, Constants.ITEM_ID_FIELD)
        print('total users: %d' % len(self.user_ids))
        print('total items: %d' % len(self.item_ids))
        self.user_item_map = self.create_user_item_map()

        self.find_important_records()
Example #4
    def initialize(self):
        self.user_ids =\
            extractor.get_groupby_list(self.records, Constants.USER_ID_FIELD)
        self.item_ids =\
            extractor.get_groupby_list(self.records, Constants.ITEM_ID_FIELD)
        print('total users: %d' % len(self.user_ids))
        print('total items: %d' % len(self.item_ids))
        self.user_item_map = self.create_user_item_map()

        self.find_important_records()
Example #5
    def full_cycle(self):
        Constants.print_properties()
        print('%s: full cycle' % time.strftime("%Y/%m/%d-%H:%M:%S"))
        utilities.plant_seeds()

        if self.use_cache and \
                os.path.exists(Constants.PROCESSED_RECORDS_FILE):
            print('Records have already been processed')
            self.records = \
                ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)
        else:
            self.load_records()

            if 'yelp' in Constants.ITEM_TYPE:
                self.transform_yelp_records()
            elif 'fourcity' in Constants.ITEM_TYPE:
                self.transform_fourcity_records()

            self.add_integer_ids()
            self.clean_reviews()
            self.remove_duplicate_reviews()
            self.tag_reviews_language()
            self.remove_foreign_reviews()
            self.lemmatize_records()
            self.remove_users_with_low_reviews()
            self.remove_items_with_low_reviews()
            self.count_frequencies()
            self.shuffle_records()
            print('total_records: %d' % len(self.records))
            self.classify_reviews()
            self.build_bag_of_words()
            self.tag_contextual_reviews()
            # self.load_full_records()
            self.build_dictionary()
            self.build_corpus()
            self.label_review_targets()
            self.export_records()

        self.count_specific_generic_ratio()
        # self.export_to_triplet()

        rda = ReviewsDatasetAnalyzer(self.records)
        print('density: %f' % rda.calculate_density_approx())
        print('sparsity: %f' % rda.calculate_sparsity_approx())
        print('total_records: %d' % len(self.records))
        user_ids = \
            extractor.get_groupby_list(self.records, Constants.USER_ID_FIELD)
        item_ids = \
            extractor.get_groupby_list(self.records, Constants.ITEM_ID_FIELD)
        print('total users: %d' % len(user_ids))
        print('total items: %d' % len(item_ids))

        if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
            self.separate_recsys_topic_model_records()
Example #6
def get_unknown_items(reviews, user_id, num_unknown=1000):
    item_ids = extractor.get_groupby_list(reviews, 'offering_id')
    user_reviews = ETLUtils.filter_records(reviews, 'user_id', [user_id])
    user_items = extractor.get_groupby_list(user_reviews, 'offering_id')

    # Compute the items that the user hasn't rated, i.e. the items that
    # appear in item_ids but not in user_items
    s = set(user_items)
    unknown_items = [x for x in item_ids if x not in s]
    # TODO: Uncomment this line, the items have to be shuffled
    # shuffle(unknown_items)

    return unknown_items[:num_unknown]
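
As a usage sketch on made-up records (the real helpers come from extractor
and ETLUtils), the function returns the items present in the dataset that
the given user has never rated, in the order produced by get_groupby_list:

reviews = [
    {'user_id': 'U1', 'offering_id': 'I1'},
    {'user_id': 'U2', 'offering_id': 'I1'},
    {'user_id': 'U2', 'offering_id': 'I2'},
    {'user_id': 'U2', 'offering_id': 'I3'},
]
# get_unknown_items(reviews, 'U1', num_unknown=2) -> ['I2', 'I3']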
Example #7

    def __init__(self, reviews):

        if not reviews:
            raise ValueError("Can not analyze an empty list")

        self.reviews = reviews
        self.user_ids = extractor.get_groupby_list(self.reviews, "user_id")
        self.item_ids = extractor.get_groupby_list(self.reviews, "offering_id")
        self.num_reviews = len(self.reviews)
        self.num_users = len(self.user_ids)
        self.num_items = len(self.item_ids)
        self.data_frame = DataFrame(self.reviews)
        self.users_count = self.data_frame.groupby("user_id").size()
        self.items_count = self.data_frame.groupby("offering_id").size()
Example #8
    def __init__(self, reviews):

        if not reviews:
            raise ValueError('Can not analyze an empty list')

        self.reviews = reviews
        self.user_ids = extractor.get_groupby_list(self.reviews, 'user_id')
        self.item_ids = extractor.get_groupby_list(self.reviews, 'offering_id')
        self.num_reviews = len(self.reviews)
        self.num_users = len(self.user_ids)
        self.num_items = len(self.item_ids)
        self.data_frame = DataFrame(self.reviews)
        self.users_count = self.data_frame.groupby('user_id').size()
        self.items_count = self.data_frame.groupby('offering_id').size()
Example #9
    def __init__(self, reviews):

        if not reviews:
            raise ValueError('Can not analyze an empty list')

        self.reviews = reviews
        self.user_ids = \
            extractor.get_groupby_list(self.reviews, Constants.USER_ID_FIELD)
        self.item_ids = \
            extractor.get_groupby_list(self.reviews, Constants.ITEM_ID_FIELD)
        self.num_reviews = len(self.reviews)
        self.num_users = len(self.user_ids)
        self.num_items = len(self.item_ids)
        self.data_frame = DataFrame(self.reviews)
        self.users_count = \
            self.data_frame.groupby(Constants.USER_ID_FIELD).size()
        self.items_count = \
            self.data_frame.groupby(Constants.ITEM_ID_FIELD).size()
Example #10

    def build_user_clusters(reviews, significant_criteria_ranges=None):
        """
        Builds a series of clusters for users according to their significant
        criteria. Users that have exactly the same significant criteria will belong
        to the same cluster.

        :param reviews: the list of reviews
        :param significant_criteria_ranges: the ranges used to decide which
        criteria weights count as significant
        :return: a dictionary where all the keys are the cluster names and the
        values for those keys are lists of users that belong to that cluster
        """

        user_list = extractor.get_groupby_list(reviews, "user_id")
        user_cluster_dictionary = {}

        for user in user_list:
            weights = extractor.get_criteria_weights(reviews, user)
            significant_criteria, cluster_name = extractor.get_significant_criteria(
                weights, significant_criteria_ranges
            )

            if cluster_name in user_cluster_dictionary:
                user_cluster_dictionary[cluster_name].append(user)
            else:
                user_cluster_dictionary[cluster_name] = [user]

        return user_cluster_dictionary
Example #11
    def build_user_clusters(reviews, significant_criteria_ranges=None):
        """
        Builds a series of clusters for users according to their significant
        criteria. Users that have exactly the same significant criteria will belong
        to the same cluster.

        :param reviews: the list of reviews
        :param significant_criteria_ranges: the ranges used to decide which
        criteria weights count as significant
        :return: a dictionary where all the keys are the cluster names and the
        values for those keys are lists of users that belong to that cluster
        """

        user_list = extractor.get_groupby_list(reviews, 'user_id')
        user_cluster_dictionary = {}

        for user in user_list:
            weights = extractor.get_criteria_weights(reviews, user)
            significant_criteria, cluster_name =\
                extractor.get_significant_criteria(weights, significant_criteria_ranges)

            if cluster_name in user_cluster_dictionary:
                user_cluster_dictionary[cluster_name].append(user)
            else:
                user_cluster_dictionary[cluster_name] = [user]

        return user_cluster_dictionary
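
To illustrate the return value: a hypothetical run over three users where
'U1' and 'U3' end up with the same significant criteria would produce a
dictionary of the following shape (the cluster names come from
get_significant_criteria; these particular names and users are made up):

# build_user_clusters(reviews) ->
# {'cluster1': ['U1', 'U3'],
#  'cluster2': ['U2']}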
Example #12
def modify_properties_file(fold):
    with open(CARSKIT_ORIGINAL_CONF_FILE) as read_file:
        properties = jprops.load_properties(read_file, collections.OrderedDict)

    recommender = Constants.CARSKIT_RECOMMENDERS
    ratings_fold_folder = CARSKIT_RATINGS_FOLD_FOLDER % fold

    if not os.path.exists(ratings_fold_folder):
        os.makedirs(ratings_fold_folder)

    modified_file = CARSKIT_MODIFIED_CONF_FILE % (fold, recommender)
    properties['recommender'] = recommender
    properties['dataset.ratings.lins'] = \
        ratings_fold_folder + 'carskit_train.csv'
    test_file = ratings_fold_folder + 'carskit_test.csv'
    properties['evaluation.setup'] = \
        'test-set -f %s --rand-seed 1 --test-view all' % test_file

    records = ETLUtils.load_json_file(
        Constants.RECSYS_CONTEXTUAL_PROCESSED_RECORDS_FILE)
    num_items = \
        len(extractor.get_groupby_list(records, Constants.ITEM_ID_FIELD))
    extra_items = num_items
    # extra_items = 10
    if Constants.CARSKIT_ITEM_RANKING:
        properties['item.ranking'] = 'on -topN %d' % extra_items
    else:
        properties['item.ranking'] = 'off'

    with open(modified_file, 'w') as write_file:
        jprops.store_properties(write_file, properties)
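
The jprops round trip used above is simple to reproduce; here is a minimal
standalone sketch of the same load-modify-store pattern (the file name and
property values below are placeholders, not the real CARSKit configuration):

import collections
import jprops

with open('example.conf', 'w') as write_file:
    jprops.store_properties(write_file, {'recommender': 'camf_c'})

with open('example.conf') as read_file:
    properties = jprops.load_properties(read_file, collections.OrderedDict)

print(properties['recommender'])  # camf_c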
Example #13
    def load(self, records):
        self.records = records
        self.ratings_matrix = basic_knn.create_ratings_matrix(records)
        self.reviews_matrix = create_reviews_matrix(records)
        self.user_dictionary = extractor.initialize_users(self.records, False)
        self.user_ids = extractor.get_groupby_list(self.records, 'user_id')

        # self.lda_model =\
        #     lda_context_utils.discover_topics(text_reviews, self.num_topics)
        # self.reviews is expected to be set beforehand; otherwise fall back
        # to the raw review text stored in the records
        if self.reviews:
            lda_based_context = LdaBasedContext()
            lda_based_context.reviews = self.reviews
            lda_based_context.init_reviews()
        else:
            text_reviews = []
            for record in self.records:
                text_reviews.append(record['text'])
            lda_based_context = LdaBasedContext(text_reviews)
            lda_based_context.init_reviews()
        self.context_rich_topics = lda_based_context.get_context_rich_topics()

        self.lda_model = lda_based_context.topic_model
        print('building similarity matrix', time.strftime("%H:%M:%S"))
        self.context_matrix = self.create_context_matrix(records)
        self.similarity_matrix = self.create_similarity_matrix()
        print('finished building similarity matrix', time.strftime("%H:%M:%S"))
Example #14

    def load(self, reviews):
        self.reviews = reviews
        self.user_ids = extractor.get_groupby_list(self.reviews, "user_id")
        self.user_dictionary = extractor.initialize_cluster_users(
            self.reviews, self._significant_criteria_ranges)
        self.user_cluster_dictionary = self.build_user_clusters(
            self.reviews, self._significant_criteria_ranges)
        if self._similarity_matrix_builder._similarity_metric is not None:
            self.user_similarity_matrix = \
                self._similarity_matrix_builder.build_similarity_matrix(
                    self.user_dictionary, self.user_ids)
Example #15
    def load(self):
        print('load: %s' % time.strftime("%Y/%m/%d-%H:%M:%S"))

        if Constants.SEPARATE_TOPIC_MODEL_RECSYS_REVIEWS:
            self.original_records =\
                ETLUtils.load_json_file(
                    Constants.RECSYS_CONTEXTUAL_PROCESSED_RECORDS_FILE)
        else:
            self.original_records =\
                ETLUtils.load_json_file(Constants.PROCESSED_RECORDS_FILE)

        print('num_records: %d' % len(self.original_records))
        user_ids = extractor.get_groupby_list(self.original_records,
                                              Constants.USER_ID_FIELD)
        item_ids = extractor.get_groupby_list(self.original_records,
                                              Constants.ITEM_ID_FIELD)
        print('total users: %d' % len(user_ids))
        print('total items: %d' % len(item_ids))
Example #16
def build_item_index_map(reviews):
    item_ids = extractor.get_groupby_list(reviews, 'offering_id')
    item_index_map = {}

    # Map each item id to its position in the grouped list
    for index, item_id in enumerate(item_ids):
        item_index_map[item_id] = index

    return item_index_map
Example #17
def build_user_index_map(reviews):
    user_ids = extractor.get_groupby_list(reviews, 'user_id')
    user_index_map = {}

    # Map each user id to its position in the grouped list
    for index, user_id in enumerate(user_ids):
        user_index_map[user_id] = index

    return user_index_map
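
A quick usage sketch with made-up records, to show the shape of the
resulting maps (ordering follows whatever get_groupby_list returns):

reviews = [
    {'user_id': 'U2', 'offering_id': 'I1'},
    {'user_id': 'U1', 'offering_id': 'I2'},
]
# build_user_index_map(reviews) -> e.g. {'U1': 0, 'U2': 1}
# build_item_index_map(reviews) -> e.g. {'I1': 0, 'I2': 1}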
Example #18
    def preprocess(self):

        self.load_records()

        if 'yelp' in Constants.ITEM_TYPE:
            self.transform_yelp_records()
        elif 'fourcity' in Constants.ITEM_TYPE:
            self.transform_fourcity_records()

        self.add_integer_ids()
        self.clean_reviews()
        self.remove_duplicate_reviews()
        self.tag_reviews_language()
        self.remove_foreign_reviews()
        self.remove_reviews_from_classifier_training_set()
        self.lemmatize_records()
        self.remove_users_with_low_reviews()
        self.remove_items_with_low_reviews()
        self.count_frequencies()
        self.shuffle_records()
        print('total_records: %d' % len(self.records))
        self.classify_reviews()
        self.build_bag_of_words()
        self.tag_contextual_reviews()
        # self.load_full_records()
        self.build_dictionary()
        self.build_corpus()
        self.label_review_targets()
        self.export_records()

        self.count_specific_generic_ratio()
        self.export_to_triplet()

        rda = ReviewsDatasetAnalyzer(self.records)
        print('density: %f' % rda.calculate_density_approx())
        print('sparsity: %f' % rda.calculate_sparsity_approx())
        print('total_records: %d' % len(self.records))
        user_ids = \
            extractor.get_groupby_list(self.records, Constants.USER_ID_FIELD)
        item_ids = \
            extractor.get_groupby_list(self.records, Constants.ITEM_ID_FIELD)
        print('total users: %d' % len(user_ids))
        print('total items: %d' % len(item_ids))
Example #19
def create_user_item_map(records):
    user_ids = extractor.get_groupby_list(records, constants.USER_ID_FIELD)
    user_item_map = {}
    user_count = 0

    for user_id in user_ids:
        user_records =\
            ETLUtils.filter_records(records, constants.USER_ID_FIELD, [user_id])
        user_items =\
            extractor.get_groupby_list(user_records, constants.ITEM_ID_FIELD)
        user_item_map[user_id] = user_items
        user_count += 1

        # Overwrite the same console line with a running user count
        # (Python 2 print statement; the trailing comma suppresses the newline)
        print 'user count: {0}\r'.format(user_count),

    print

    return user_item_map
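
The resulting map simply groups, for each user, the ids of the items that
user has reviewed; with made-up records it would look like:

# create_user_item_map(records) -> e.g. {'U1': ['I1', 'I2'], 'U2': ['I1']}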
Example #20
    def load(self, reviews):
        self.reviews = reviews
        self.user_dictionary = \
            extractor.initialize_users(
                self.reviews,
                self._similarity_matrix_builder._is_multi_criteria)
        self.user_ids = extractor.get_groupby_list(self.reviews, 'user_id')
        if self._similarity_matrix_builder._similarity_metric is not None:
            self.user_similarity_matrix = \
                self._similarity_matrix_builder.build_similarity_matrix(
                    self.user_dictionary, self.user_ids)
Example #21
    def load(self, reviews):
        self.reviews = reviews
        self.user_ids = extractor.get_groupby_list(self.reviews, 'user_id')
        self.user_dictionary = \
            extractor.initialize_cluster_users(
                self.reviews, self._significant_criteria_ranges)
        self.user_cluster_dictionary = self.build_user_clusters(
            self.reviews, self._significant_criteria_ranges)
        if self._similarity_matrix_builder._similarity_metric is not None:
            self.user_similarity_matrix = \
                self._similarity_matrix_builder.build_similarity_matrix(
                    self.user_dictionary, self.user_ids)
Example #22
    def load(self, reviews):
        self.reviews = reviews
        self.user_dictionary = \
            extractor.initialize_users(
                self.reviews,
                self._similarity_matrix_builder._is_multi_criteria)
        self.user_ids = extractor.get_groupby_list(self.reviews, 'user_id')
        if self._similarity_matrix_builder._similarity_metric is not None:
            self.user_similarity_matrix = \
                self._similarity_matrix_builder.build_similarity_matrix(
                    self.user_dictionary, self.user_ids)
Example #23
    def calculate_sparsity_approx(self):
        """
        Returns the approximate fraction of missing ratings in the list of
        reviews of this ReviewsDatasetAnalyzer. This method is an
        approximation because it counts two reviews from the same user for
        the same item as two reviews, when the correct count should be one.
        It was created to calculate the sparsity of very big datasets, where
        calculating the exact sparsity can be a very slow process.

        :return: the approximate rate of missing ratings
        (i.e. 1 - number of reviews / (number of users * number of items))
        :raise ValueError: in case an empty list is given
        """
        if not self.reviews:
            raise ValueError('Can not determine the sparsity for an empty list')

        user_ids = extractor.get_groupby_list(self.reviews, 'user_id')
        item_ids = extractor.get_groupby_list(self.reviews, 'offering_id')
        total_expected_reviews = float(len(user_ids) * len(item_ids))

        return 1 - float(len(self.reviews)) / total_expected_reviews
Example #24

    def calculate_sparsity_approx(self):
        """
        Returns the approximate fraction of missing ratings in the list of
        reviews of this ReviewsDatasetAnalyzer. This method is an
        approximation because it counts two reviews from the same user for
        the same item as two reviews, when the correct count should be one.
        It was created to calculate the sparsity of very big datasets, where
        calculating the exact sparsity can be a very slow process.

        :return: the approximate rate of missing ratings
        (i.e. 1 - number of reviews / (number of users * number of items))
        :raise ValueError: in case an empty list is given
        """
        if not self.reviews:
            raise ValueError("Can not determine the sparsity for an empty list")

        user_ids = extractor.get_groupby_list(self.reviews, "user_id")
        item_ids = extractor.get_groupby_list(self.reviews, "offering_id")
        total_expected_reviews = float(len(user_ids) * len(item_ids))

        return 1 - float(len(self.reviews)) / total_expected_reviews
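
A worked toy example of the difference between the approximate and exact
computations, using the same made-up field names as before:

reviews = [
    {'user_id': 'U1', 'offering_id': 'I1'},
    {'user_id': 'U1', 'offering_id': 'I1'},  # duplicate user-item pair
    {'user_id': 'U1', 'offering_id': 'I2'},
    {'user_id': 'U2', 'offering_id': 'I3'},
]
users = {r['user_id'] for r in reviews}      # 2 users
items = {r['offering_id'] for r in reviews}  # 3 items

approx_sparsity = 1 - float(len(reviews)) / (len(users) * len(items))
rated_pairs = {(r['user_id'], r['offering_id']) for r in reviews}
exact_sparsity = 1 - float(len(rated_pairs)) / (len(users) * len(items))
print(approx_sparsity)  # 1 - 4/6 = 0.333..., duplicate counted twice
print(exact_sparsity)   # 1 - 3/6 = 0.5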
Example #25
    def load(self, reviews):
        self.reviews = reviews
        self.user_ids = extractor.get_groupby_list(self.reviews, 'user_id')
        self.item_ids = extractor.get_groupby_list(self.reviews, 'offering_id')
        self.num_users = len(self.user_ids)
        self.num_items = len(self.item_ids)
        self.data_matrix = create_matrix(self.reviews)
        self.user_index_map = build_user_index_map(reviews)
        self.item_index_map = build_item_index_map(reviews)

        R = create_matrix(self.reviews).todense()

        R = np.array(R)

        N = len(R)     # number of users (rows)
        M = len(R[0])  # number of items (columns)
        K = 2          # number of latent factors

        # Random initialization of the user (P) and item (Q) factor matrices
        P = np.random.rand(N, K)
        Q = np.random.rand(M, K)
        # P.fill(0.1)
        # Q.fill(0.1)

        self.n_p, self.n_q = matrix_factorization(R, P, Q, K)
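
The matrix_factorization helper is project-internal and not shown here. As a
rough, self-contained sketch of what such a routine typically does, plain SGD
on the squared reconstruction error of R ~= P.dot(Q.T) could look like this
(the update rule and hyperparameters are assumptions for illustration, not
the repository's actual implementation):

import numpy as np

def matrix_factorization_sketch(R, P, Q, steps=5000, alpha=0.0002, beta=0.02):
    # Iterate over the observed ratings and nudge the user (P) and item (Q)
    # factors along the gradient of the regularized squared error
    for _ in range(steps):
        for i in range(R.shape[0]):
            for j in range(R.shape[1]):
                if R[i][j] > 0:  # only observed ratings contribute
                    error = R[i][j] - np.dot(P[i, :], Q[j, :])
                    P[i, :] += alpha * (2 * error * Q[j, :] - beta * P[i, :])
                    Q[j, :] += alpha * (2 * error * P[i, :] - beta * Q[j, :])
    return P, Q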
Example #26
    def load(self, records):
        # self.records = records
        self.user_dictionary = extractor.initialize_users(records, False)
        self.user_ids = extractor.get_groupby_list(records, 'user_id')

        if self.has_context:
            self.load_context(records)

        self.user_similarity_calculator.load(
            self.user_ids, self.user_dictionary, self.context_rich_topics)
        self.neighbourhood_calculator.load(
            self.user_ids, self.user_dictionary, self.context_rich_topics,
            self.num_neighbours)
        self.user_baseline_calculator.load(
            self.user_dictionary, self.context_rich_topics)
        self.neighbour_contribution_calculator.load(
            self.user_baseline_calculator)
Example #27

    def load(self, reviews):
        self.reviews = reviews
        self.user_dictionary = extractor.initialize_users(self.reviews, False)
        self.user_ids = extractor.get_groupby_list(self.reviews, "user_id")
Example #28
    def load(self, reviews):
        self.reviews = reviews
        self.user_dictionary = extractor.initialize_users(self.reviews, False)
        self.user_ids = extractor.get_groupby_list(self.reviews, 'user_id')
Example #29
    def load(self, reviews):
        self.reviews = reviews
        self.ratings_matrix = create_ratings_matrix(reviews)
        self.user_dictionary = extractor.initialize_users(self.reviews, False)
        self.user_ids = extractor.get_groupby_list(self.reviews, 'user_id')
        self.similarity_matrix = self.create_similarity_matrix()
Example #30

    def test_build_similarity_matrix_pearson(self):

        user_dictionary = extractor.initialize_users(reviews_matrix_5, False)
        user_ids = extractor.get_groupby_list(reviews_matrix_5, "user_id")
        similarity_matrix_builder = SingleSimilarityMatrixBuilder("pearson")
        self.assertEqual(
            pearson_matrix,
            similarity_matrix_builder.build_similarity_matrix(
                user_dictionary, user_ids))
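
For reference, the Pearson correlation that a 'pearson' similarity metric
computes between two users' ratings on their common items can be reproduced
with scipy (a sketch with made-up rating vectors, not the builder's actual
code path):

from scipy.stats import pearsonr

user_a = [5.0, 3.0, 4.0]  # ratings by user A on three common items
user_b = [4.0, 2.0, 5.0]  # ratings by user B on the same items
correlation, p_value = pearsonr(user_a, user_b)
print(correlation)  # ~0.655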