Пример #1
0
    def setUp(self):
        """
        Prepare the fixtures shared by every test.

        A synthetic ratings matrix is generated, DataParser.get_ratings_matrix
        is replaced by the generator, and a CollaborativeFiltering model is
        trained so that its test data and predictions are available.
        """
        n_docs, n_users = 18, 10
        self.documents = n_docs
        self.users = n_users
        self.n_iterations = 15
        self.k_folds = 3
        self.hyperparameters = {'n_factors': 5, '_lambda': 0.01}
        self.options = {'n_iterations': self.n_iterations, 'k_folds': self.k_folds}
        # The initializer receives its own copy of the hyperparameters.
        self.initializer = ModelInitializer(self.hyperparameters.copy(), self.n_iterations)
        self.n_recommendations = 1

        def mock_get_ratings_matrix(self=None):
            # One row per user; a rating is 1 exactly when (article + user)
            # is a multiple of 3, otherwise 0.
            rows = []
            for user in range(n_users):
                rows.append([int((article + user) % 3 == 0) for article in range(n_docs)])
            return rows

        self.ratings_matrix = numpy.array(mock_get_ratings_matrix())
        DataParser.get_ratings_matrix = mock_get_ratings_matrix

        self.evaluator = Evaluator(self.ratings_matrix)
        self.cf = CollaborativeFiltering(self.initializer, self.evaluator, self.hyperparameters,
                                         self.options, load_matrices=True)
        self.cf.train()
        # The fold count is written onto the evaluator again after training.
        self.cf.evaluator.k_folds = self.k_folds
        self.test_data = self.cf.test_data
        self.predictions = self.cf.get_predictions()
        self.rounded_predictions = self.cf.rounded_predictions()
Пример #2
0
    def setUp(self):
        """
        Setup method that is called at the beginning of each test.

        Builds an 8-document / 10-user fixture (ratings matrix, abstracts and
        their word distributions), creates the evaluator from it and replaces
        the corresponding DataParser accessors with in-memory mocks.
        """
        self.documents, self.users = 8, 10
        documents_cnt, users_cnt = self.documents, self.users
        self.n_iterations = 5
        self.n_factors = 5
        self.k_folds = 5
        self.hyperparameters = {'n_factors': self.n_factors}
        self.options = {'n_iterations': self.n_iterations, 'k_folds': self.k_folds}
        self.initializer = ModelInitializer(self.hyperparameters.copy(), self.n_iterations)

        def mock_process(self=None):
            pass

        def mock_get_abstracts(self=None):
            return {0: 'hell world berlin dna evolution', 1: 'freiburg is green',
                    2: 'the best dna is the dna of dinasours', 3: 'truth is absolute',
                    4: 'berlin is not that green', 5: 'truth manifests itself',
                    6: 'plato said truth is beautiful', 7: 'freiburg has dna'}

        def mock_get_ratings_matrix(self=None):
            # One row per user; a rating is 1 when (article + user) is a multiple of 3.
            return [[int(not bool((article + user) % 3)) for article in range(documents_cnt)]
                    for user in range(users_cnt)]

        def mock_get_word_distribution(self=None):
            """
            Return (word_to_count, article_to_word, article_to_word_to_count)
            computed from the mocked abstracts.
            """
            abstracts = mock_get_abstracts()
            # Tokenise each abstract once; every count below is token based.
            tokens_by_doc = {doc_id: abstract.split(' ') for doc_id, abstract in abstracts.items()}
            vocab = set(itertools.chain.from_iterable(tokens_by_doc.values()))
            w2i = dict(zip(vocab, range(len(vocab))))
            word_to_count = [(w2i[word], sum(tokens.count(word) for tokens in tokens_by_doc.values()))
                             for word in vocab]
            article_to_word = list({(doc_id, w2i[word])
                                    for doc_id, tokens in tokens_by_doc.items() for word in tokens})
            # Count whole tokens, not substrings of the raw abstract: str.count
            # would also match a word occurring inside a longer word.
            article_to_word_to_count = list({(doc_id, w2i[word], tokens.count(word))
                                             for doc_id, tokens in tokens_by_doc.items()
                                             for word in tokens})
            return word_to_count, article_to_word, article_to_word_to_count

        abstracts = mock_get_abstracts()
        word_to_count, article_to_word, article_to_word_to_count = mock_get_word_distribution()
        self.abstracts_preprocessor = AbstractsPreprocessor(abstracts, word_to_count,
                                                            article_to_word, article_to_word_to_count)
        self.ratings_matrix = numpy.array(mock_get_ratings_matrix())
        self.evaluator = Evaluator(self.ratings_matrix, self.abstracts_preprocessor)
        setattr(DataParser, "get_abstracts", mock_get_abstracts)
        setattr(DataParser, "process", mock_process)
        setattr(DataParser, "get_ratings_matrix", mock_get_ratings_matrix)
        setattr(DataParser, "get_word_distribution", mock_get_word_distribution)
Пример #3
0
    def setUp(self):
        """
        Prepare the fixtures shared by every test: hyperparameters, options,
        a model initializer, a synthetic ratings matrix with its evaluator,
        and a DataParser whose get_ratings_matrix returns that same data.
        """
        n_docs, n_users = 30, 4
        self.documents = n_docs
        self.users = n_users
        self.n_factors = 5
        self.n_iterations = 20
        self.k_folds = 3
        self.hyperparameters = {'n_factors': self.n_factors, '_lambda': 0.01}
        self.options = {'k_folds': self.k_folds, 'n_iterations': self.n_iterations}
        # The initializer receives its own copy of the hyperparameters.
        self.initializer = ModelInitializer(self.hyperparameters.copy(), self.n_iterations)

        def mock_get_ratings_matrix(self=None):
            # One row per user; a rating is 1 exactly when (article + user)
            # is a multiple of 3, otherwise 0.
            rows = []
            for user in range(n_users):
                rows.append([int((article + user) % 3 == 0) for article in range(n_docs)])
            return rows

        self.ratings_matrix = numpy.array(mock_get_ratings_matrix())
        self.evaluator = Evaluator(self.ratings_matrix)
        DataParser.get_ratings_matrix = mock_get_ratings_matrix
Пример #4
0
 def runTest(self):
     """
     Check that a matrix saved through the ModelInitializer is written to the
     expected path and can be loaded back unchanged.
     """
     users_cnt, documents_cnt = self.users, self.documents
     config = RecommenderConfiguration().get_hyperparameters()
     config['n_factors'] = 5
     initializer = ModelInitializer(config, 1)
     path = initializer._create_path('user_v', (users_cnt, documents_cnt))
     # NOTE(review): the expected suffix hard-codes n_rows-10, so this
     # presumably requires self.users == 10 - confirm against the fixture.
     self.assertTrue(path.endswith('n_iterations-1,n_rows-10user_v.dat'))
     matrix_shape = (users_cnt, config['n_factors'])
     users_mat = numpy.random.random(matrix_shape)
     initializer.save_matrix(users_mat, 'user_v')
     self.assertTrue(os.path.isfile(path))
     loaded, loaded_matrix = initializer.load_matrix(
         config, 'user_v', matrix_shape)
     self.assertTrue(loaded)
     # numpy.alltrue was removed in NumPy 2.0; array_equal also treats a
     # shape mismatch as a failure instead of relying on broadcasting.
     self.assertTrue(numpy.array_equal(loaded_matrix, users_mat))
Пример #5
0
    def setUp(self):
        """
        Prepare the fixtures shared by every test: candidate hyperparameter
        values, options, an initial configuration with its model initializer,
        and a synthetic ratings matrix that DataParser is patched to return.
        """
        n_docs, n_users = 8, 10
        self.documents = n_docs
        self.users = n_users
        # Each hyperparameter maps to a list of candidate values.
        self.hyperparameters = {'_lambda': [0.0001, 0.1], 'n_factors': [10, 20]}
        self.n_iterations = 15
        self.options = {'n_iterations': self.n_iterations, 'k_folds': 1}
        self.initial_config = {'_lambda': 0, 'n_factors': 10}
        # The initializer receives its own copy of the initial configuration.
        self.initializer = ModelInitializer(self.initial_config.copy(), self.n_iterations)

        def mock_get_ratings_matrix(self=None):
            # One row per user; a rating is 1 exactly when (article + user)
            # is a multiple of 3, otherwise 0.
            rows = []
            for user in range(n_users):
                rows.append([int((article + user) % 3 == 0) for article in range(n_docs)])
            return rows

        self.ratings_matrix = numpy.array(mock_get_ratings_matrix())
        DataParser.get_ratings_matrix = mock_get_ratings_matrix
Пример #6
0
    def __init__(self,
                 initializer=None,
                 abstracts_preprocessor=None,
                 ratings=None,
                 config=None,
                 process_parser=False,
                 verbose=False,
                 load_matrices=True,
                 dump_matrices=True,
                 train_more=True,
                 random_seed=False,
                 results_file_name='top_recommendations'):
        """
        Constructor of the RecommenderSystem.

        :param ModelInitializer initializer: Not read by this constructor; a new
            ModelInitializer is always built from the configured hyperparameters.
        :param AbstractsPreprocessor abstracts_preprocessor: A preprocessor of abstracts;
            if None, one is built from the DataParser.
        :param int[][] ratings: Ratings matrix; if None, the matrix is obtained from the DataParser.
        :param config: Passed unchanged to RecommenderConfiguration.
        :param boolean process_parser: A flag deciding whether DataParser.process() is called first.
        :param boolean verbose: A flag deciding whether to print progress.
        :param boolean load_matrices: A flag forwarded to the recommenders for loading saved matrices.
        :param boolean dump_matrices: A flag for saving output matrices.
        :param boolean train_more: train_more the collaborative filtering after loading matrices.
        :param boolean random_seed: A flag to determine if we will use random seed or not.
        :param str results_file_name: Top recommendations results' file name ('.dat' is appended).
        :raises NameError: If the configured error metric, content based method,
            collaborative filtering method or recommender type is invalid, or if
            they are combined in an unsupported way.
        """
        if process_parser:
            DataParser.process()

        if ratings is None:
            self.ratings = numpy.array(DataParser.get_ratings_matrix())
        else:
            self.ratings = ratings

        if abstracts_preprocessor is None:
            self.abstracts_preprocessor = AbstractsPreprocessor(
                DataParser.get_abstracts(),
                *DataParser.get_word_distribution())
        else:
            self.abstracts_preprocessor = abstracts_preprocessor

        # Get configurations
        self.config = RecommenderConfiguration(config)

        # Set flags
        self.results_file_name = results_file_name + '.dat'
        self._verbose = verbose
        self._dump_matrices = dump_matrices
        self._load_matrices = load_matrices
        self._train_more = train_more
        self._split_type = 'user'
        self._random_seed = random_seed

        # NOTE(review): these setters are presumably what populate
        # self.hyperparameters, self.options and self.n_iter read below.
        self.set_hyperparameters(self.config.get_hyperparameters())
        self.set_options(self.config.get_options())

        self.initializer = ModelInitializer(self.hyperparameters.copy(),
                                            self.n_iter, self._verbose)

        error_metric = self.config.get_error_metric()
        if error_metric == 'RMS':
            self.evaluator = Evaluator(self.ratings,
                                       self.abstracts_preprocessor,
                                       self._random_seed, self._verbose)
        else:
            raise NameError(
                "Not a valid error metric %s. Only option is 'RMS'" %
                error_metric)

        # Initialize content based.
        content_based_type = self.config.get_content_based()
        if content_based_type == 'None':
            self.content_based = ContentBased(self.initializer, self.evaluator,
                                              self.hyperparameters,
                                              self.options, self._verbose,
                                              self._load_matrices,
                                              self._dump_matrices)
        elif content_based_type == 'LDA':
            self.content_based = LDARecommender(self.initializer,
                                                self.evaluator,
                                                self.hyperparameters,
                                                self.options, self._verbose,
                                                self._load_matrices,
                                                self._dump_matrices)
        elif content_based_type == 'LDA2Vec':
            self.content_based = LDA2VecRecommender(
                self.initializer, self.evaluator, self.hyperparameters,
                self.options, self._verbose, self._load_matrices,
                self._dump_matrices)
        else:
            raise NameError(
                "Not a valid content based %s. Options are 'None', "
                "'LDA', 'LDA2Vec'" % content_based_type)

        # Initialize collaborative filtering.
        collaborative_type = self.config.get_collaborative_filtering()
        recommender_type = self.config.get_recommender()
        if collaborative_type == 'ALS':
            is_hybrid = recommender_type == 'hybrid'
            # Only the hybrid recommender needs a real content based method;
            # the check used to fire for every ALS configuration.
            if is_hybrid and content_based_type == 'None':
                raise NameError(
                    "Not valid content based 'None' with hybrid recommender")
            self.collaborative_filtering = CollaborativeFiltering(
                self.initializer, self.evaluator, self.hyperparameters,
                self.options, self._verbose, self._load_matrices,
                self._dump_matrices, self._train_more, is_hybrid)
        elif collaborative_type == 'SDAE':
            # Validate the combination before building the recommender.
            if not content_based_type == 'None':
                raise NameError(
                    "Not a valid content based %s with SDAE. You can only use 'None'"
                    % content_based_type)
            self.collaborative_filtering = SDAERecommender(
                self.initializer, self.evaluator, self.hyperparameters,
                self.options, self._verbose, self._load_matrices,
                self._dump_matrices)
        elif collaborative_type == 'None':
            if not recommender_type == 'itembased':
                raise NameError(
                    "None collaborative filtering is only valid with itembased recommender type"
                )
            elif content_based_type == 'None':
                raise NameError(
                    "Not valid content based 'None' with item-based recommender"
                )
            self.collaborative_filtering = None
        else:
            raise NameError("Not a valid collaborative filtering %s. "
                            "Only options are 'None', 'ALS', 'SDAE'" %
                            collaborative_type)

        # Initialize recommender
        if recommender_type == 'itembased':
            self.recommender = self.content_based
        elif recommender_type == 'userbased':
            self.recommender = self.collaborative_filtering
        elif recommender_type == 'hybrid':
            # In hybrid mode this object itself acts as the recommender.
            self.recommender = self
        else:
            raise NameError(
                "Invalid recommender type %s. "
                "Only options are 'userbased','itembased', and 'hybrid'" %
                recommender_type)
Пример #7
0
    def __init__(self,
                 use_database=True,
                 verbose=True,
                 load_matrices=True,
                 dump=True,
                 train_more=True,
                 random_seed=False,
                 config=None):
        """
        Setup the data and configuration for the recommenders.

        :param boolean use_database: If True, ratings and abstracts are obtained from
            the DataParser; otherwise a fixed 8-document / 10-user fixture is built in memory.
        :param boolean verbose: Stored on the instance and forwarded to the Evaluator
            and the ModelInitializer.
        :param boolean load_matrices: Stored on the instance.
        :param boolean dump: Stored on the instance.
        :param boolean train_more: Stored on the instance.
        :param boolean random_seed: Stored on the instance and forwarded to the Evaluator.
        :param config: NOTE(review): currently not read; the configuration is always a
            default RecommenderConfiguration() - confirm whether it should be honoured.
        """
        if use_database:
            self.ratings = numpy.array(DataParser.get_ratings_matrix())
            # Rows are users and columns are documents, matching the fixture
            # branch below, which builds one row per user.
            self.users, self.documents = self.ratings.shape
            self.abstracts_preprocessor = AbstractsPreprocessor(
                DataParser.get_abstracts(),
                *DataParser.get_word_distribution())
        else:
            abstracts = {
                0: 'hell world berlin dna evolution',
                1: 'freiburg is green',
                2: 'the best dna is the dna of dinasours',
                3: 'truth is absolute',
                4: 'berlin is not that green',
                5: 'truth manifests itself',
                6: 'plato said truth is beautiful',
                7: 'freiburg has dna'
            }

            # Tokenise each abstract once; every count below is token based.
            tokens_by_doc = {
                doc_id: abstract.split(' ')
                for doc_id, abstract in abstracts.items()
            }
            vocab = set(itertools.chain.from_iterable(tokens_by_doc.values()))
            w2i = dict(zip(vocab, range(len(vocab))))
            word_to_count = [(w2i[word],
                              sum(
                                  tokens.count(word)
                                  for tokens in tokens_by_doc.values()))
                             for word in vocab]
            article_to_word = list({
                (doc_id, w2i[word])
                for doc_id, tokens in tokens_by_doc.items()
                for word in tokens
            })
            # Count whole tokens, not substrings of the raw abstract: str.count
            # would also match a word occurring inside a longer word.
            article_to_word_to_count = list({
                (doc_id, w2i[word], tokens.count(word))
                for doc_id, tokens in tokens_by_doc.items()
                for word in tokens
            })
            self.abstracts_preprocessor = AbstractsPreprocessor(
                abstracts, word_to_count, article_to_word,
                article_to_word_to_count)
            self.documents, self.users = 8, 10
            # One row per user; a rating is 1 when (article + user) is a multiple of 3.
            self.ratings = numpy.array([[
                int(not bool((article + user) % 3))
                for article in range(self.documents)
            ] for user in range(self.users)])

        self.verbose = verbose
        self.load_matrices = load_matrices
        self.dump = dump
        self.train_more = train_more
        self.random_seed = random_seed
        self.evaluator = Evaluator(self.ratings, self.abstracts_preprocessor,
                                   self.random_seed, self.verbose)
        self.config = RecommenderConfiguration()
        self.hyperparameters = self.config.get_hyperparameters()
        self.options = self.config.get_options()
        # The initializer receives its own copy of the hyperparameters.
        self.initializer = ModelInitializer(self.hyperparameters.copy(),
                                            self.options['n_iterations'],
                                            self.verbose)
Пример #8
0
    def __init__(self, initializer=None, abstracts_preprocessor=None, ratings=None, config=None,
                 process_parser=False, verbose=False, load_matrices=True, dump_matrices=True, train_more=True):
        """
        Build a RecommenderSystem from the data and the configuration.

        :param ModelInitializer initializer: Not read here; a fresh ModelInitializer is always created.
        :param AbstractsPreprocessor abstracts_preprocessor: Preprocessor of abstracts; built from
            the DataParser when None.
        :param int[][] ratings: Ratings matrix; taken from the DataParser when None.
        :param config: Handed unchanged to RecommenderConfiguration.
        :param boolean process_parser: Whether to call DataParser.process() first.
        :param boolean verbose: Whether to print progress.
        :param boolean load_matrices: Flag forwarded to the recommenders.
        :param boolean dump_matrices: Flag for saving output matrices.
        :param boolean train_more: Flag forwarded to the collaborative filtering.
        :raises NameError: On an invalid error metric, content based method,
            collaborative filtering method or recommender type.
        """
        if process_parser:
            DataParser.process()

        # Fall back to the DataParser for whatever the caller did not supply.
        if ratings is not None:
            self.ratings = ratings
        else:
            self.ratings = numpy.array(DataParser.get_ratings_matrix())

        if abstracts_preprocessor is not None:
            self.abstracts_preprocessor = abstracts_preprocessor
        else:
            self.abstracts_preprocessor = AbstractsPreprocessor(DataParser.get_abstracts(),
                                                                *DataParser.get_word_distribution())

        # Configuration first, then the flags, in the same order as before.
        self.config = RecommenderConfiguration(config)
        self.set_hyperparameters(self.config.get_hyperparameters())
        self.set_options(self.config.get_options())

        self._verbose = verbose
        self._dump_matrices = dump_matrices
        self._load_matrices = load_matrices
        self._train_more = train_more

        self.initializer = ModelInitializer(self.hyperparameters.copy(), self.n_iter, self._verbose)

        # Only the 'RMS' error metric is supported.
        if not self.config.get_error_metric() == 'RMS':
            raise NameError("Not a valid error metric %s. Only option is 'RMS'" % self.config.get_error_metric())
        self.evaluator = Evaluator(self.ratings, self.abstracts_preprocessor)

        # Pick the content based class, then construct it once.
        if self.config.get_content_based() == 'None':
            content_based_class = ContentBased
        elif self.config.get_content_based() == 'LDA':
            content_based_class = LDARecommender
        elif self.config.get_content_based() == 'LDA2Vec':
            content_based_class = LDA2VecRecommender
        else:
            raise NameError("Not a valid content based %s. Options are 'None', "
                            "'LDA', 'LDA2Vec'" % self.config.get_content_based())
        self.content_based = content_based_class(self.initializer, self.evaluator, self.hyperparameters,
                                                 self.options, self._verbose, self._load_matrices,
                                                 self._dump_matrices)

        # Only 'ALS' collaborative filtering is supported.
        if not self.config.get_collaborative_filtering() == 'ALS':
            raise NameError("Not a valid collaborative filtering %s. "
                            "Only option is 'ALS'" % self.config.get_collaborative_filtering())
        self.collaborative_filtering = CollaborativeFiltering(self.initializer, self.evaluator,
                                                              self.hyperparameters, self.options,
                                                              self._verbose, self._load_matrices,
                                                              self._dump_matrices, self._train_more)

        # Select which of the two models answers recommendation requests.
        if self.config.get_recommender() == 'itembased':
            self.recommender = self.content_based
        elif self.config.get_recommender() == 'userbased':
            self.recommender = self.collaborative_filtering
        else:
            raise NameError("Invalid recommender type %s. "
                            "Only options are 'userbased' and 'itembased'" % self.config.get_recommender())