def test_produces_synthetic_interactions_with_right_content(self):
        """Checks the sharded train output covers every synthetic item id."""
        np.random.seed(0)

        # Dense left factor of all ones: every Kronecker cell gets sampled.
        n_left_rows, n_left_cols = 4, 8
        left = np.ones((n_left_rows, n_left_cols))

        # Sparse right factor with a fixed number of non-zero entries.
        n_right_rows, n_right_cols = 16, 32
        n_nonzeros = 100
        right = random_binary_sparse_matrix(n_nonzeros, n_right_rows,
                                            n_right_cols).tocoo()

        # The writer emits one output shard per row of the left matrix.
        shard_count = len(left)
        train_file = self.create_tempfile("temp_train.pkl")
        shard_files = [
            self.create_tempfile("temp_train.pkl_%d" % i)
            for i in range(shard_count)
        ]
        test_file = self.create_tempfile("temp_test.pkl")

        train_meta_file = self.create_tempfile("temp_train_meta.pkl")
        test_meta_file = self.create_tempfile("temp_test_meta.pkl")

        graph_expansion.output_randomized_kronecker_to_pickle(
            left,
            right,
            train_file.full_path,
            test_file.full_path,
            train_meta_file.full_path,
            test_meta_file.full_path,
            remove_empty_rows=False)

        # Gather every serialized row from every shard.
        rows = []
        for shard in shard_files:
            rows.extend(read_from_serialized_file(shard.full_path))

        # One synthetic row per (left row, right row) pair.
        self.assertLen(rows, n_left_rows * n_right_rows)

        # Every synthetic column index must appear somewhere in the output.
        seen_items = set(itertools.chain.from_iterable(rows))
        self.assertEqual(seen_items,
                         set(range(n_left_cols * n_right_cols)))
# Example #2
def main(_):
    """Builds a synthetic rating dataset by Kronecker-expanding MovieLens 20m.

    Loads the ratings CSV, preprocesses it into train/test splits, computes a
    reduced (SVD-resized, normalized) training matrix to act as per-cell
    sampling rates, then expands it against a signed train/test sparse matrix
    and dumps the synthetic interactions to pickle files.
    """
    # Fix seed for reproducibility.
    np.random.seed(FLAGS.random_seed)

    logging.info("Loading MovieLens 20m from %s.", FLAGS.input_csv_file)
    ratings_df = util.load_df_from_file(FLAGS.input_csv_file)
    logging.info("Done loading MovieLens 20m from %s.", FLAGS.input_csv_file)

    logging.info("Preprocessing MovieLens 20m.")
    ratings_df, train_ratings_df, test_ratings_df = _preprocess_movie_lens(
        ratings_df)
    logging.info("Done preprocessing MovieLens 20m.")

    num_users, num_items, _ = util.describe_rating_df(ratings_df,
                                                      "original set")
    _, _, num_train_ratings = util.describe_rating_df(train_ratings_df,
                                                      "train set")
    _, _, num_test_ratings = util.describe_rating_df(test_ratings_df,
                                                     "test set")

    logging.info("Converting data frames to sparse matrices.")
    train_ratings_matrix = util.convert_df_to_sparse_matrix(train_ratings_df,
                                                            shape=(num_users,
                                                                   num_items))
    test_ratings_matrix = util.convert_df_to_sparse_matrix(test_ratings_df,
                                                           shape=(num_users,
                                                                  num_items))
    logging.info("Done converting data frames to sparse matrices.")

    reduced_num_rows = FLAGS.num_row_multiplier
    reduced_num_cols = FLAGS.num_col_multiplier
    # Rank of the truncated SVD cannot exceed either target dimension.
    k = min(reduced_num_rows, reduced_num_cols)
    logging.info("Computing SVD of training matrix (top %d values).", k)
    (u_train, s_train, v_train) = sparse_svd(train_ratings_matrix,
                                             k,
                                             max_iter=None)
    logging.info("Done computing SVD of training matrix.")

    logging.info("Creating reduced rating matrix (size %d, %d)",
                 reduced_num_rows, reduced_num_cols)
    reduced_train_matrix = resize_matrix((u_train, s_train, v_train),
                                         reduced_num_rows, reduced_num_cols)
    reduced_train_matrix = normalize_matrix(reduced_train_matrix)
    # Fixed: this marks completion of the step logged above, so it must say
    # "Done creating", not "Creating".
    logging.info("Done creating reduced rating matrix.")

    average_sampling_rate = reduced_train_matrix.mean()
    logging.info("Average sampling rate: %2f.", average_sampling_rate)
    logging.info("Expected number of synthetic train samples: %s",
                 average_sampling_rate * num_train_ratings)
    logging.info("Expected number of synthetic test samples: %s",
                 average_sampling_rate * num_test_ratings)

    # Mark test data by a bit flip: train entries stay +1, test entries
    # become -1 in the combined matrix.
    logging.info("Creating signed train/test matrix.")
    train_test_ratings_matrix = train_ratings_matrix - test_ratings_matrix
    train_test_ratings_matrix = train_test_ratings_matrix.tocoo()
    logging.info("Done creating signed train/test matrix.")

    output_train_file = (FLAGS.output_prefix + "trainx" +
                         str(reduced_num_rows) + "x" + str(reduced_num_cols))
    output_test_file = (FLAGS.output_prefix + "testx" + str(reduced_num_rows) +
                        "x" + str(reduced_num_cols))
    # No metadata files are written in this pipeline.
    output_train_file_metadata = None
    output_test_file_metadata = None

    logging.info("Creating synthetic train data set and dumping to %s.",
                 output_train_file)
    # Fixed: this line logged "train" while reporting the test output path.
    logging.info("Creating synthetic test data set and dumping to %s.",
                 output_test_file)
    output_randomized_kronecker_to_pickle(
        left_matrix=reduced_train_matrix,
        right_matrix=train_test_ratings_matrix,
        train_indices_out_path=output_train_file,
        test_indices_out_path=output_test_file,
        train_metadata_out_path=output_train_file_metadata,
        test_metadata_out_path=output_test_file_metadata)
    logging.info("Done creating synthetic train data set and dumping to %s.",
                 output_train_file)
    logging.info("Done creating synthetic test data set and dumping to %s.",
                 output_test_file)
    def test_produces_synthetic_interactions_with_right_shape(self):
        """Checks the returned metadata counts for rows, cols and interactions."""
        np.random.seed(0)

        # Dense left factor of all ones: every right-matrix non-zero is
        # replicated once per left-matrix cell.
        left_matrix_num_rows = 4
        left_matrix_num_cols = 8
        left_matrix = np.ones((left_matrix_num_rows, left_matrix_num_cols))

        # Right factor is the difference of two random binary matrices, so it
        # holds +1 entries (train) and -1 entries (test); overlapping cells
        # cancel out, hence nnz is read back from the result below.
        right_matrix_num_rows = 16
        right_matrix_num_cols = 32
        right_matrix = random_binary_sparse_matrix(
            50, right_matrix_num_rows,
            right_matrix_num_cols) - random_binary_sparse_matrix(
                50, right_matrix_num_rows, right_matrix_num_cols)
        right_matrix = right_matrix.tocoo()
        right_matrix_num_non_zeros = right_matrix.nnz

        # +1 entries feed the train split, -1 entries feed the test split.
        right_matrix_num_train = (right_matrix == 1).nnz
        right_matrix_num_test = (right_matrix == -1).nnz

        train_output_file = self.create_tempfile("temp_train.pkl")
        test_output_file = self.create_tempfile("temp_test.pkl")

        train_meta_output_file = self.create_tempfile("temp_train_meta.pkl")
        test_meta_output_file = self.create_tempfile("temp_test_meta.pkl")

        (metadata, train_metadata, test_metadata
         ) = graph_expansion.output_randomized_kronecker_to_pickle(
             left_matrix,
             right_matrix,
             train_output_file.full_path,
             test_output_file.full_path,
             train_meta_output_file.full_path,
             test_meta_output_file.full_path,
             remove_empty_rows=False)

        # Left matrix is filled with 1s here, so the overall interaction count
        # is (left cells) x (right non-zeros).
        self.assertEqual(
            metadata.num_interactions, left_matrix_num_rows *
            left_matrix_num_cols * right_matrix_num_non_zeros)
        self.assertEqual(metadata.num_rows,
                         left_matrix_num_rows * right_matrix_num_rows)
        self.assertEqual(metadata.num_cols,
                         left_matrix_num_cols * right_matrix_num_cols)

        # Train split sees only the +1 entries of the right matrix.
        self.assertEqual(
            train_metadata.num_interactions, left_matrix_num_rows *
            left_matrix_num_cols * right_matrix_num_train)
        self.assertEqual(train_metadata.num_rows,
                         left_matrix_num_rows * right_matrix_num_rows)
        self.assertEqual(train_metadata.num_cols,
                         left_matrix_num_cols * right_matrix_num_cols)

        # Test split sees only the -1 entries of the right matrix.
        self.assertEqual(
            test_metadata.num_interactions, left_matrix_num_rows *
            left_matrix_num_cols * right_matrix_num_test)
        self.assertEqual(test_metadata.num_rows,
                         left_matrix_num_rows * right_matrix_num_rows)
        self.assertEqual(test_metadata.num_cols,
                         left_matrix_num_cols * right_matrix_num_cols)

        # Returned metadata must round-trip through the pickled files.
        pickled_train_metadata = read_from_serialized_file(
            train_meta_output_file.full_path)
        pickled_test_metadata = read_from_serialized_file(
            test_meta_output_file.full_path)

        self.assertEqual(train_metadata, pickled_train_metadata)
        self.assertEqual(test_metadata, pickled_test_metadata)