Example No. 1
def get_inputs(params):
    """Returns some parameters used by the model."""
    # Download is disabled in this variant; see Example No. 2 for the
    # version that fetches the dataset:
    # if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
    #   movielens.download(FLAGS.dataset, FLAGS.data_dir)

    if FLAGS.seed is not None:
        np.random.seed(FLAGS.seed)

    if FLAGS.use_synthetic_data:
        producer = data_pipeline.DummyConstructor()
        num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
            FLAGS.dataset]
        num_train_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
        num_eval_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
    else:
        num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
            dataset=FLAGS.dataset,
            data_dir=FLAGS.data_dir,
            params=params,
            constructor_type=FLAGS.constructor_type,
            deterministic=FLAGS.seed is not None)

        num_train_steps = (producer.train_batches_per_epoch //
                           params["batches_per_step"])
        num_eval_steps = (producer.eval_batches_per_epoch //
                          params["batches_per_step"])
        assert not producer.train_batches_per_epoch % params["batches_per_step"]
        assert not producer.eval_batches_per_epoch % params["batches_per_step"]

    return num_users, num_items, num_train_steps, num_eval_steps, producer
Example No. 2
def get_inputs(params):
    """Returns some parameters used by the model."""
    if FLAGS.download_if_missing:
        movielens.download(FLAGS.dataset, FLAGS.data_dir)

    if FLAGS.seed is not None:
        np.random.seed(FLAGS.seed)

    num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset,
        data_dir=FLAGS.data_dir,
        params=params,
        constructor_type=FLAGS.constructor_type,
        deterministic=FLAGS.seed is not None)

    num_train_steps = (producer.train_batches_per_epoch //
                       params["batches_per_step"])
    num_eval_steps = (producer.eval_batches_per_epoch //
                      params["batches_per_step"])
    assert not producer.train_batches_per_epoch % params["batches_per_step"]
    assert not producer.eval_batches_per_epoch % params["batches_per_step"]

    return num_users, num_items, num_train_steps, num_eval_steps, producer
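Both variants above are driven the same way. A minimal usage sketch, assuming FLAGS has already been parsed and that params carries the key the function itself reads ("batches_per_step") plus assumed batch-size keys consumed by instantiate_pipeline:

params = {
    "batch_size": 1024,        # assumed key, consumed by the pipeline
    "eval_batch_size": 1024,   # assumed key
    "batches_per_step": 1,     # used for the step-count division above
}

num_users, num_items, num_train_steps, num_eval_steps, producer = (
    get_inputs(params))

producer.start()  # begin asynchronous data construction
# ... run training for num_train_steps steps per epoch ...
producer.join()   # wait for the data threads to finish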
Example No. 3
def prepare_raw_data(flag_obj):
  """Downloads and prepares raw data for data generation."""
  movielens.download(flag_obj.dataset, flag_obj.data_dir)

  data_processing_params = {
      "train_epochs": flag_obj.num_train_epochs,
      "batch_size": flag_obj.prebatch_size,
      "eval_batch_size": flag_obj.prebatch_size,
      "batches_per_step": 1,
      "stream_files": True,
      "num_neg": flag_obj.num_negative_samples,
  }

  num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
      dataset=flag_obj.dataset,
      data_dir=flag_obj.data_dir,
      params=data_processing_params,
      constructor_type=flag_obj.constructor_type,
      epoch_dir=flag_obj.data_dir,
      generate_data_offline=True)

  # pylint: disable=protected-access
  input_metadata = {
      "num_users": num_users,
      "num_items": num_items,
      "constructor_type": flag_obj.constructor_type,
      "num_train_elements": producer._elements_in_epoch,
      "num_eval_elements": producer._eval_elements_in_epoch,
      "num_train_epochs": flag_obj.num_train_epochs,
      "train_prebatch_size": flag_obj.train_prebatch_size,
      "eval_prebatch_size": flag_obj.eval_prebatch_size,
      "num_train_steps": producer.train_batches_per_epoch,
      "num_eval_steps": producer.eval_batches_per_epoch,
  }
  # pylint: enable=protected-access

  return producer, input_metadata
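A hedged sketch of invoking prepare_raw_data offline; the flag container and the JSON dump below are illustrative assumptions, not part of the original tool:

import json
import types

# Hypothetical flag values purely for illustration.
flag_obj = types.SimpleNamespace(
    dataset="ml-20m",
    data_dir="/tmp/movielens",
    num_train_epochs=3,
    train_prebatch_size=16384,
    eval_prebatch_size=16384,
    num_negative_samples=4,
    constructor_type="bisection",
)

producer, input_metadata = prepare_raw_data(flag_obj)

# Persist the metadata next to the generated shards (assumed layout).
with open("/tmp/movielens/metadata.json", "w") as f:
    json.dump(input_metadata, f)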
Example No. 4
    def _test_fresh_randomness(self, constructor_type):
        train_epochs = 5
        params = self.make_params(train_epochs=train_epochs)
        _, _, producer = data_preprocessing.instantiate_pipeline(
            dataset=DATASET,
            data_dir=self.temp_data_dir,
            params=params,
            constructor_type=constructor_type,
            deterministic=True)

        producer.start()

        results = []
        g = tf.Graph()
        with g.as_default():
            for _ in range(train_epochs):
                input_fn = producer.make_input_fn(is_training=True)
                dataset = input_fn(params)
                results.extend(self.drain_dataset(dataset=dataset, g=g))

        producer.join()
        assert producer._fatal_exception is None

        positive_counts, negative_counts = defaultdict(int), defaultdict(int)
        md5 = hashlib.md5()
        for features, labels in results:
            data_list = [
                features[movielens.USER_COLUMN],
                features[movielens.ITEM_COLUMN],
                features[rconst.VALID_POINT_MASK], labels
            ]
            for i in data_list:
                md5.update(i.tobytes())

            for u, i, v, l in zip(*data_list):
                if not v:
                    continue  # ignore padding

                if l:
                    positive_counts[(u, i)] += 1
                else:
                    negative_counts[(u, i)] += 1

        self.assertRegexpMatches(md5.hexdigest(), FRESH_RANDOMNESS_MD5)

        # The positive examples should appear exactly once per epoch.
        self.assertAllEqual(list(positive_counts.values()),
                            [train_epochs for _ in positive_counts])

        # The threshold for the negatives is heuristic: in general repeats are
        # expected, but they should not appear too frequently.

        pair_cardinality = NUM_USERS * NUM_ITEMS
        neg_pair_cardinality = pair_cardinality - len(self.seen_pairs)

        # Approximation of the expected number of times that a particular
        # negative will appear in a given epoch. Implicit in this calculation
        # is the treatment of all negative pairs as equally likely. Normally
        # this is not reasonable; however, the generation in self.setUp()
        # approximates this behavior sufficiently for heuristic testing.
        e_sample = len(self.seen_pairs) * NUM_NEG / neg_pair_cardinality

        # The frequency of occurrence of a given negative pair should follow
        # an approximately binomial distribution in the limit that the
        # cardinality of the negative pair set >> number of samples per epoch.
        approx_pdf = scipy.stats.binom.pmf(k=np.arange(train_epochs + 1),
                                           n=train_epochs,
                                           p=e_sample)

        # Tally the actual observed counts.
        count_distribution = [0 for _ in range(train_epochs + 1)]
        for i in negative_counts.values():
            i = min([i, train_epochs])  # round down tail for simplicity.
            count_distribution[i] += 1
        count_distribution[0] = neg_pair_cardinality - sum(
            count_distribution[1:])

        # Check that the frequency of negative pairs is approximately binomial.
        for i in range(train_epochs + 1):
            if approx_pdf[i] < 0.05:
                continue  # Variance will be high at the tails.

            observed_fraction = count_distribution[i] / neg_pair_cardinality
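            # Symmetric relative difference: 2|observed - expected| /
            # (observed + expected); bounded in [0, 2].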
            deviation = (2 * abs(observed_fraction - approx_pdf[i]) /
                         (observed_fraction + approx_pdf[i]))

            self.assertLess(deviation, 0.2)
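To make the heuristic concrete, a small self-contained illustration (the numbers below are invented for the example, not the test's actual constants):

import numpy as np
import scipy.stats

train_epochs = 5
seen_pairs = 100_000          # positive (user, item) pairs (assumed)
num_neg = 4                   # negatives sampled per positive (assumed)
pair_cardinality = 6_000_000  # stand-in for NUM_USERS * NUM_ITEMS
neg_pair_cardinality = pair_cardinality - seen_pairs

# Expected number of draws of a given negative pair in one epoch.
e_sample = seen_pairs * num_neg / neg_pair_cardinality  # ~0.068

# Probability of a pair being drawn k times across train_epochs epochs.
approx_pdf = scipy.stats.binom.pmf(
    k=np.arange(train_epochs + 1), n=train_epochs, p=e_sample)
print(approx_pdf.round(4))  # mass concentrates at k = 0 and k = 1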
Example No. 5
    def _test_end_to_end(self, constructor_type):
        params = self.make_params(train_epochs=1)
        _, _, producer = data_preprocessing.instantiate_pipeline(
            dataset=DATASET,
            data_dir=self.temp_data_dir,
            params=params,
            constructor_type=constructor_type,
            deterministic=True)

        producer.start()
        producer.join()
        assert producer._fatal_exception is None

        user_inv_map = {v: k for k, v in producer.user_map.items()}
        item_inv_map = {v: k for k, v in producer.item_map.items()}

        # ==========================================================================
        # == Training Data =========================================================
        # ==========================================================================
        g = tf.Graph()
        with g.as_default():
            input_fn = producer.make_input_fn(is_training=True)
            dataset = input_fn(params)

        first_epoch = self.drain_dataset(dataset=dataset, g=g)

        counts = defaultdict(int)
        train_examples = {
            True: set(),
            False: set(),
        }

        md5 = hashlib.md5()
        for features, labels in first_epoch:
            data_list = [
                features[movielens.USER_COLUMN],
                features[movielens.ITEM_COLUMN],
                features[rconst.VALID_POINT_MASK], labels
            ]
            for i in data_list:
                md5.update(i.tobytes())

            for u, i, v, l in zip(*data_list):
                if not v:
                    continue  # ignore padding

                u_raw = user_inv_map[u]
                i_raw = item_inv_map[i]
                if ((u_raw, i_raw) in self.seen_pairs) != l:
                    # The evaluation item is not considered during false negative
                    # generation, so it will occasionally appear as a negative example
                    # during training.
                    assert not l
                    self.assertEqual(i_raw, self.holdout[u_raw][1])
                train_examples[l].add((u_raw, i_raw))
                counts[(u_raw, i_raw)] += 1

        self.assertRegexpMatches(md5.hexdigest(), END_TO_END_TRAIN_MD5)

        num_positives_seen = len(train_examples[True])
        self.assertEqual(producer._train_pos_users.shape[0],
                         num_positives_seen)

        # This check is more heuristic because negatives are sampled with
        # replacement. It only checks that negative generation is reasonably random.
        self.assertGreater(
            len(train_examples[False]) / NUM_NEG / num_positives_seen, 0.9)

        # This checks that the samples produced are independent by checking the
        # number of duplicate entries. If workers are not properly independent there
        # will be lots of repeated pairs.
        self.assertLess(np.mean(list(counts.values())), 1.1)

        # ==========================================================================
        # == Eval Data =============================================================
        # ==========================================================================
        with g.as_default():
            input_fn = producer.make_input_fn(is_training=False)
            dataset = input_fn(params)

        eval_data = self.drain_dataset(dataset=dataset, g=g)

        current_user = None
        md5 = hashlib.md5()
        for features in eval_data:
            data_list = [
                features[movielens.USER_COLUMN],
                features[movielens.ITEM_COLUMN],
                features[rconst.DUPLICATE_MASK]
            ]
            for i in data_list:
                md5.update(i.tobytes())

            for idx, (u, i, d) in enumerate(zip(*data_list)):
                u_raw = user_inv_map[u]
                i_raw = item_inv_map[i]
                if current_user is None:
                    current_user = u

                # Ensure that users appear in blocks, as the evaluation logic expects
                # this structure.
                self.assertEqual(u, current_user)

                # The structure of evaluation data is 999 negative examples followed
                # by the holdout positive.
                if not (idx + 1) % (rconst.NUM_EVAL_NEGATIVES + 1):
                    # Check that the last element in each chunk is the holdout item.
                    self.assertEqual(i_raw, self.holdout[u_raw][1])
                    current_user = None

                elif i_raw == self.holdout[u_raw][1]:
                    # Because the holdout item is not given to the negative generation
                    # process, it can appear as a negative. In that case, it should be
                    # masked out as a duplicate. (Since the true positive is placed at
                    # the end and would therefore lose the tie.)
                    assert d

                else:
                    # Otherwise check that the other 999 points for a user are selected
                    # from the negatives.
                    assert (u_raw, i_raw) not in self.seen_pairs

        self.assertRegexpMatches(md5.hexdigest(), END_TO_END_EVAL_MD5)
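The tests above call a drain_dataset helper that is defined elsewhere in the test class. A minimal sketch of what it plausibly does, assuming TF 1.x-style graph execution to match the tf.Graph() usage above:

def drain_dataset(self, dataset, g):
    # Hypothetical reconstruction: run the dataset to exhaustion inside
    # graph g and return the materialized batches.
    with g.as_default():
        batch = tf.compat.v1.data.make_one_shot_iterator(dataset).get_next()
    results = []
    with tf.compat.v1.Session(graph=g) as sess:
        while True:
            try:
                results.append(sess.run(batch))
            except tf.errors.OutOfRangeError:
                break
    return results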