def test(self):
    # Build a random sparse rating matrix and check that GcmcGraphDataset
    # adds one extra row to the item feature matrix for the default index.
    num_users, num_items, num_ratings = 101, 233, 3007
    combined = (_make_sparse_matrix(num_users, num_items, num_ratings) +
                2 * _make_sparse_matrix(num_users, num_items, num_ratings))
    coo = combined.tocoo()  # single conversion; row/col/data share this view
    features = [{i: np.array([i]) for i in range(num_items)}]
    rating_data = GcmcDataset(coo.row,
                              coo.col,
                              coo.data,
                              item_features=features)
    graph_dataset = GcmcGraphDataset(dataset=rating_data, test_size=0.2)
    # +1 row because of the default (unknown-item) index.
    self.assertEqual((num_items + 1, 1), graph_dataset.item_features[0].shape)
    def run(self):
        """Train a GC-MC model on the task's train data and dump report and model."""
        tf.reset_default_graph()

        required = {
            self.user_column_name, self.item_column_name,
            self.rating_column_name
        }
        df = self.load_data_frame('train_data', required_columns=required)
        user_features = self.load('user_features')
        item_features = self.load('item_features')

        # Keep one interaction per (user, item) pair, then shuffle and cap
        # the data size before training.
        df.drop_duplicates(
            subset=[self.user_column_name, self.item_column_name],
            inplace=True)
        df = sklearn.utils.shuffle(df)
        df = df.head(n=int(self.max_data_size))

        dataset = GcmcDataset(user_ids=df[self.user_column_name].values,
                              item_ids=df[self.item_column_name].values,
                              ratings=df[self.rating_column_name].values,
                              user_features=user_features,
                              item_features=item_features)
        graph_dataset = GcmcGraphDataset(
            dataset=dataset,
            test_size=self.test_size,
            min_user_click_count=self.min_user_click_count,
            max_user_click_count=self.max_user_click_count)
        model = GraphConvolutionalMatrixCompletion(graph_dataset=graph_dataset,
                                                   **self.model_kwargs)

        # The report starts with the hyper-parameters so dumps are self-describing.
        self.task_log['report'] = [str(self.model_kwargs)] + model.fit(
            try_count=self.try_count, decay_speed=self.decay_speed)
        self.dump(self.task_log['report'], 'report')
        self.dump(model, 'model')
# Example #3
 def test_with_information(self):
     # Five interactions between two users and three items; test_size=0
     # keeps every interaction in the training split.
     users = np.array([1, 1, 2, 2, 2])
     items = np.array([1, 2, 1, 2, 3])
     ratings = np.array([1, 0, 1, 0, 1])
     user_features = [{1: np.array([10, 11]), 2: np.array([20, 21])}]
     item_features = [{
         1: np.array([10, 11, 12]),
         2: np.array([20, 21, 22]),
         3: np.array([30, 31, 32])
     }]
     graph_dataset = GcmcGraphDataset(
         GcmcDataset(user_ids=users,
                     item_ids=items,
                     ratings=ratings,
                     user_features=user_features,
                     item_features=item_features), 0.0)
     data = graph_dataset.train_data()
     # Every per-example array keeps one entry per interaction; 'label' has
     # a second axis of size 2 — presumably one-hot over the two distinct
     # rating values {0, 1}; verify against GcmcGraphDataset.
     self.assertEqual(users.shape, data['user'].shape)
     self.assertEqual(items.shape, data['item'].shape)
     self.assertEqual((ratings.shape[0], 2), data['label'].shape)
     self.assertEqual(ratings.shape, data['rating'].shape)
     self.assertEqual(users.shape, data['user_feature_indices'].shape)
     self.assertEqual(items.shape, data['item_feature_indices'].shape)
class GraphConvolutionalMatrixCompletion(object):
    """Rating predictor based on Graph Convolutional Matrix Completion (GC-MC).

    Builds a TensorFlow graph over per-rating user-item adjacency matrices and
    trains it with mini-batch gradient descent and early stopping.

    NOTE(review): this class relies on the TensorFlow 1.x session API
    (``tf.Session``, ``make_initializable_iterator``); it will not run on
    TF2 without the ``tf.compat.v1`` shim.
    """
    def __init__(
            self,
            user_ids: np.ndarray,
            item_ids: np.ndarray,
            ratings: np.ndarray,
            encoder_hidden_size: int,
            encoder_size: int,
            scope_name: str,
            test_size: float,
            batch_size: int,
            epoch_size: int,
            dropout_rate: float,
            learning_rate: float,
            normalization_type: str,
            weight_sharing: bool = True,
            ignore_item_embedding: bool = False,
            save_directory_path: str = None,
            user_features: Optional[List[Dict[Any, np.ndarray]]] = None,
            item_features: Optional[List[Dict[Any,
                                              np.ndarray]]] = None) -> None:
        """Store hyper-parameters and build the internal ``GcmcDataset``.

        Args:
            user_ids: id per observed rating (same length as ``ratings`` —
                assumed, not checked here).
            item_ids: item id per observed rating.
            ratings: observed rating values.
            encoder_hidden_size: hidden layer size of the graph encoder.
            encoder_size: output embedding size of the encoder.
            scope_name: TF variable scope for the graph.
            test_size: fraction of the data held out for evaluation.
            batch_size: mini-batch size (also used as the shuffle buffer).
            epoch_size: number of training epochs.
            dropout_rate: dropout applied during training only.
            learning_rate: initial learning rate (decayed by early stopping).
            normalization_type: adjacency normalization scheme name.
            weight_sharing: share encoder weights across rating levels.
            ignore_item_embedding: drop the item embedding from the decoder.
            save_directory_path: where early stopping checkpoints the session.
            user_features / item_features: optional per-id feature mappings.
        """
        self.session = tf.Session()
        self.user_ids = user_ids
        self.item_ids = item_ids
        self.ratings = ratings
        self.item_features = item_features
        self.user_features = user_features
        self.encoder_hidden_size = encoder_hidden_size
        self.encoder_size = encoder_size
        self.test_size = test_size
        self.batch_size = batch_size
        self.epoch_size = epoch_size
        self.scope_name = scope_name
        self.dropout_rate = dropout_rate
        self.learning_rate = learning_rate
        self.normalization_type = normalization_type
        self.weight_sharing = weight_sharing
        self.ignore_item_embedding = ignore_item_embedding
        self.save_directory_path = save_directory_path
        # NOTE(review): min_user_click_count is hard-coded to 5 here — users
        # with fewer interactions are presumably filtered; confirm in GcmcDataset.
        self.dataset = GcmcDataset(self.user_ids,
                                   self.item_ids,
                                   self.ratings,
                                   self.test_size,
                                   user_information=self.user_features,
                                   item_information=self.item_features,
                                   min_user_click_count=5)
        # The TF graph is built lazily on the first call to fit().
        self.graph = None

    def fit(self, try_count=1, decay_speed=10.) -> List[str]:
        """Train the model and return human-readable progress report lines.

        Args:
            try_count: early-stopping patience passed to ``EarlyStopping``.
            decay_speed: learning-rate decay speed passed to ``EarlyStopping``.

        Returns:
            One ``train:`` line per mini-batch and one ``test:`` line per
            epoch, each with loss and RMSE.
        """
        if self.graph is None:
            logger.info('making graph...')
            self.graph = self._make_graph()
            logger.info('done making graph')

        early_stopping = EarlyStopping(try_count=try_count,
                                       decay_speed=decay_speed,
                                       save_directory=self.save_directory_path,
                                       learning_rate=self.learning_rate,
                                       threshold=1e-4)

        test_data = self.dataset.test_data()
        report = []
        with self.session.as_default():
            self.session.run(tf.global_variables_initializer())
            dataset = tf.data.Dataset.from_tensor_slices(
                self.dataset.train_data())
            dataset = dataset.shuffle(buffer_size=self.batch_size)
            batch = dataset.batch(self.batch_size)
            iterator = batch.make_initializable_iterator()
            next_batch = iterator.get_next()
            rating_adjacency_matrix = self.dataset.train_rating_adjacency_matrix(
            )

            logger.info('start to optimize...')
            for i in range(self.epoch_size):
                self.session.run(iterator.initializer)
                while True:
                    try:
                        train_data = self.session.run(next_batch)
                        # Drop the current batch's edges from each adjacency
                        # matrix so a rating is not used to predict itself.
                        _rating_adjacency_matrix = [
                            self._eliminate(matrix, train_data['user'],
                                            train_data['item'])
                            for matrix in rating_adjacency_matrix
                        ]
                        feed_dict = {
                            self.graph.input_learning_rate:
                            early_stopping.learning_rate,
                            self.graph.input_dropout:
                            self.dropout_rate,
                            self.graph.input_user:
                            train_data['user'],
                            self.graph.input_item:
                            train_data['item'],
                            self.graph.input_label:
                            train_data['label'],
                            self.graph.input_rating:
                            train_data['rating'],
                            self.graph.input_user_information:
                            train_data['user_information'],
                            self.graph.input_item_information:
                            train_data['item_information'],
                        }
                        feed_dict.update({
                            g: _convert_sparse_matrix_to_sparse_tensor(m)
                            for g, m in zip(self.graph.input_adjacency_matrix,
                                            _rating_adjacency_matrix)
                        })
                        feed_dict.update({
                            g: m.count_nonzero()
                            for g, m in zip(self.graph.input_edge_size,
                                            _rating_adjacency_matrix)
                        })
                        _, train_loss, train_rmse = self.session.run(
                            [self.graph.op, self.graph.loss, self.graph.rmse],
                            feed_dict=feed_dict)
                        report.append(
                            f'train: epoch={i + 1}/{self.epoch_size}, loss={train_loss}, rmse={train_rmse}.'
                        )
                    except tf.errors.OutOfRangeError:
                        # End of epoch: evaluate on the held-out split with
                        # dropout off and the full (uneliminated) adjacency.
                        if report:  # guard: empty when the epoch had no batches
                            logger.info(report[-1])
                        feed_dict = {
                            self.graph.input_dropout:
                            0.0,
                            self.graph.input_user:
                            test_data['user'],
                            self.graph.input_item:
                            test_data['item'],
                            self.graph.input_label:
                            test_data['label'],
                            self.graph.input_rating:
                            test_data['rating'],
                            self.graph.input_user_information:
                            test_data['user_information'],
                            self.graph.input_item_information:
                            test_data['item_information'],
                        }
                        feed_dict.update({
                            g: _convert_sparse_matrix_to_sparse_tensor(m)
                            for g, m in zip(self.graph.input_adjacency_matrix,
                                            rating_adjacency_matrix)
                        })
                        feed_dict.update({
                            g: m.count_nonzero()
                            for g, m in zip(self.graph.input_edge_size,
                                            rating_adjacency_matrix)
                        })
                        test_loss, test_rmse = self.session.run(
                            [self.graph.loss, self.graph.rmse],
                            feed_dict=feed_dict)
                        report.append(
                            f'test: epoch={i + 1}/{self.epoch_size}, loss={test_loss}, rmse={test_rmse}.'
                        )
                        logger.info(report[-1])
                        break

                # test_rmse is always bound here: the inner loop only exits
                # through the OutOfRangeError branch above.
                if early_stopping.does_stop(test_rmse, self.session):
                    break
        return report

    def predict(self,
                user_ids: List,
                item_ids: List,
                with_user_embedding: bool = True) -> np.ndarray:
        """Predict ratings for (user, item) pairs, clipped to the known range.

        Args:
            user_ids: user ids (parallel to ``item_ids``).
            item_ids: item ids.
            with_user_embedding: when False, every user is replaced by the
                default user index (cold-start style prediction).

        Returns:
            1-D array of predicted ratings.

        Raises:
            RuntimeError: if :meth:`fit` has not been called yet.
        """
        if self.graph is None:
            # BUG FIX: the original constructed this RuntimeError but never
            # raised it, so the method fell through to an opaque
            # AttributeError on ``self.graph.input_dropout`` below.
            raise RuntimeError('Please call fit first.')

        rating_adjacency_matrix = self.dataset.train_rating_adjacency_matrix()
        user_indices, item_indices = self.dataset.to_indices(
            user_ids, item_ids)
        if not with_user_embedding:
            user_indices = np.array(
                [0] * len(user_indices))  # TODO use default user index.

        user_information_indices, item_information_indices = self.dataset.to_information_indices(
            user_ids, item_ids)
        feed_dict = {
            self.graph.input_dropout: 0.0,  # inference: no dropout
            self.graph.input_user: user_indices,
            self.graph.input_item: item_indices,
            self.graph.input_user_information: user_information_indices,
            self.graph.input_item_information: item_information_indices,
        }
        feed_dict.update({
            g: _convert_sparse_matrix_to_sparse_tensor(m)
            for g, m in zip(self.graph.input_adjacency_matrix,
                            rating_adjacency_matrix)
        })
        feed_dict.update({
            g: m.count_nonzero()
            for g, m in zip(self.graph.input_edge_size,
                            rating_adjacency_matrix)
        })
        with self.session.as_default():
            predictions = self.session.run(self.graph.expectation,
                                           feed_dict=feed_dict)
        predictions = predictions.flatten()
        # Clip to [lowest, highest] known rating; assumes dataset.rating()
        # returns ratings in ascending order — TODO confirm.
        predictions = np.clip(predictions,
                              self.dataset.rating()[0],
                              self.dataset.rating()[-1])
        return predictions

    def predict_item_scores(self,
                            item_ids: List,
                            with_user_embedding: bool = True) -> pd.DataFrame:
        """Score every known user against ``item_ids``.

        Returns a DataFrame with columns ``user``, ``item`` and ``score``,
        sorted by descending score.
        """
        user_ids = list(self.dataset.user_id_map.id2index.keys())
        # Cartesian product: every (user, item) combination is scored.
        _test_users, _test_items = zip(
            *list(itertools.product(user_ids, item_ids)))
        predicts = self.predict(user_ids=_test_users,
                                item_ids=_test_items,
                                with_user_embedding=with_user_embedding)
        results = pd.DataFrame(
            dict(user=_test_users, item=_test_items, score=predicts))
        results.sort_values('score', ascending=False, inplace=True)
        return results

    def _make_graph(self) -> GraphConvolutionalMatrixCompletionGraph:
        """Build the TF computation graph from the dataset and hyper-parameters."""
        return GraphConvolutionalMatrixCompletionGraph(
            n_rating=len(self.dataset.rating_id_map.id2index),
            n_user=len(self.dataset.user_id_map.id2index) + 1,  # TODO
            n_item=len(self.dataset.item_id_map.id2index) + 1,  # TODO
            rating=self.dataset.rating(),
            normalization_type=self.normalization_type,
            encoder_hidden_size=self.encoder_hidden_size,
            encoder_size=self.encoder_size,
            weight_sharing=self.weight_sharing,
            scope_name=self.scope_name,
            user_side_information=self.dataset.user_information,
            item_side_information=self.dataset.item_information,
            ignore_item_embedding=self.ignore_item_embedding)

    @staticmethod
    def _eliminate(matrix: sp.csr_matrix, user_indices, item_indices):
        """Return a copy of ``matrix`` with the given (user, item) edges removed.

        NOTE: item assignment on a CSR matrix emits a SparseEfficiencyWarning;
        acceptable here since it runs once per mini-batch.
        """
        matrix = matrix.copy()
        matrix[user_indices, item_indices] = 0
        matrix.eliminate_zeros()
        return matrix

    def save(self, file_path: str) -> None:
        """Persist the TF session and this object's state to ``file_path``."""
        redshells.model.utils.save_tf_session(self, self.session, file_path)

    @staticmethod
    def load(file_path: str) -> 'GraphConvolutionalMatrixCompletion':
        """Restore a model saved by :meth:`save`.

        NOTE(review): passes the unbound ``_make_graph`` — presumably
        ``load_tf_session`` invokes it with the restored instance; verify.
        """
        session = tf.Session()
        model = redshells.model.utils.load_tf_session(
            GraphConvolutionalMatrixCompletion, session, file_path,
            GraphConvolutionalMatrixCompletion._make_graph
        )  # type: GraphConvolutionalMatrixCompletion
        return model