def get_inputs(params):
  """Returns user/item counts, step counts, and the data producer."""
  # if FLAGS.download_if_missing and not FLAGS.use_synthetic_data:
  #   movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  if FLAGS.use_synthetic_data:
    producer = data_pipeline.DummyConstructor()
    num_users, num_items = data_preprocessing.DATASET_TO_NUM_USERS_AND_ITEMS[
        FLAGS.dataset]
    num_train_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
    num_eval_steps = rconst.SYNTHETIC_BATCHES_PER_EPOCH
  else:
    num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
        dataset=FLAGS.dataset, data_dir=FLAGS.data_dir, params=params,
        constructor_type=FLAGS.constructor_type,
        deterministic=FLAGS.seed is not None)

    num_train_steps = (producer.train_batches_per_epoch //
                       params["batches_per_step"])
    num_eval_steps = (producer.eval_batches_per_epoch //
                      params["batches_per_step"])
    assert not producer.train_batches_per_epoch % params["batches_per_step"]
    assert not producer.eval_batches_per_epoch % params["batches_per_step"]

  return num_users, num_items, num_train_steps, num_eval_steps, producer
def get_inputs(params):
  """Returns user/item counts, step counts, and the data producer."""
  if FLAGS.download_if_missing:
    movielens.download(FLAGS.dataset, FLAGS.data_dir)

  if FLAGS.seed is not None:
    np.random.seed(FLAGS.seed)

  num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
      dataset=FLAGS.dataset, data_dir=FLAGS.data_dir, params=params,
      constructor_type=FLAGS.constructor_type,
      deterministic=FLAGS.seed is not None)

  num_train_steps = (producer.train_batches_per_epoch //
                     params["batches_per_step"])
  num_eval_steps = (producer.eval_batches_per_epoch //
                    params["batches_per_step"])
  assert not producer.train_batches_per_epoch % params["batches_per_step"]
  assert not producer.eval_batches_per_epoch % params["batches_per_step"]

  return num_users, num_items, num_train_steps, num_eval_steps, producer
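# A minimal usage sketch (not part of the original module), assuming FLAGS has
# already been parsed. Only get_inputs() and the producer methods used below
# (start, join, make_input_fn) come from the code above; the concrete `params`
# values are illustrative assumptions, and the key names mirror those used by
# prepare_raw_data() further down.
def _example_get_inputs_usage():
  """Illustrative sketch only: shows how get_inputs() is typically consumed."""
  params = {
      "train_epochs": 2,        # assumed value
      "batch_size": 256,        # assumed value
      "eval_batch_size": 1024,  # assumed value
      "batches_per_step": 1,
      "num_neg": 4,             # assumed value
  }
  num_users, num_items, num_train_steps, num_eval_steps, producer = get_inputs(
      params)

  # Start the producer, wait for it to finish generating data, then build the
  # training input function, mirroring the pattern used in the tests below.
  producer.start()
  producer.join()
  train_input_fn = producer.make_input_fn(is_training=True)
  train_dataset = train_input_fn(params)
  return (num_users, num_items, num_train_steps, num_eval_steps,
          train_dataset)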
def prepare_raw_data(flag_obj):
  """Downloads and prepares raw data for data generation."""
  movielens.download(flag_obj.dataset, flag_obj.data_dir)

  data_processing_params = {
      "train_epochs": flag_obj.num_train_epochs,
      "batch_size": flag_obj.train_prebatch_size,
      "eval_batch_size": flag_obj.eval_prebatch_size,
      "batches_per_step": 1,
      "stream_files": True,
      "num_neg": flag_obj.num_negative_samples,
  }

  num_users, num_items, producer = data_preprocessing.instantiate_pipeline(
      dataset=flag_obj.dataset,
      data_dir=flag_obj.data_dir,
      params=data_processing_params,
      constructor_type=flag_obj.constructor_type,
      epoch_dir=flag_obj.data_dir,
      generate_data_offline=True)

  # pylint: disable=protected-access
  input_metadata = {
      "num_users": num_users,
      "num_items": num_items,
      "constructor_type": flag_obj.constructor_type,
      "num_train_elements": producer._elements_in_epoch,
      "num_eval_elements": producer._eval_elements_in_epoch,
      "num_train_epochs": flag_obj.num_train_epochs,
      "train_prebatch_size": flag_obj.train_prebatch_size,
      "eval_prebatch_size": flag_obj.eval_prebatch_size,
      "num_train_steps": producer.train_batches_per_epoch,
      "num_eval_steps": producer.eval_batches_per_epoch,
  }
  # pylint: enable=protected-access

  return producer, input_metadata
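# A minimal sketch (not part of the original module) of persisting the
# metadata returned by prepare_raw_data(). Only prepare_raw_data() and its
# return values come from the code above; the JSON serialization and the
# "metadata.json" filename are assumptions for illustration.
def _example_write_input_metadata(flag_obj):
  """Illustrative sketch only: writes input_metadata next to generated data."""
  import json  # local imports keep the sketch self-contained
  import os

  producer, input_metadata = prepare_raw_data(flag_obj)
  metadata_path = os.path.join(flag_obj.data_dir, "metadata.json")  # assumed name
  with open(metadata_path, "w") as f:
    json.dump(input_metadata, f, indent=2)
  return producer, metadata_path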
def _test_fresh_randomness(self, constructor_type):
  train_epochs = 5
  params = self.make_params(train_epochs=train_epochs)
  _, _, producer = data_preprocessing.instantiate_pipeline(
      dataset=DATASET, data_dir=self.temp_data_dir, params=params,
      constructor_type=constructor_type, deterministic=True)

  producer.start()

  results = []
  g = tf.Graph()
  with g.as_default():
    for _ in range(train_epochs):
      input_fn = producer.make_input_fn(is_training=True)
      dataset = input_fn(params)
      results.extend(self.drain_dataset(dataset=dataset, g=g))

  producer.join()
  assert producer._fatal_exception is None

  positive_counts, negative_counts = defaultdict(int), defaultdict(int)
  md5 = hashlib.md5()
  for features, labels in results:
    data_list = [
        features[movielens.USER_COLUMN], features[movielens.ITEM_COLUMN],
        features[rconst.VALID_POINT_MASK], labels
    ]
    for i in data_list:
      md5.update(i.tobytes())

    for u, i, v, l in zip(*data_list):
      if not v:
        continue  # ignore padding

      if l:
        positive_counts[(u, i)] += 1
      else:
        negative_counts[(u, i)] += 1

  self.assertRegexpMatches(md5.hexdigest(), FRESH_RANDOMNESS_MD5)

  # The positive examples should appear exactly once each epoch.
  self.assertAllEqual(list(positive_counts.values()),
                      [train_epochs for _ in positive_counts])

  # The threshold for the negatives is heuristic: repeats are expected in
  # general, but they should not appear too frequently.

  pair_cardinality = NUM_USERS * NUM_ITEMS
  neg_pair_cardinality = pair_cardinality - len(self.seen_pairs)

  # Approximation of the expected number of times that a particular negative
  # will appear in a given epoch. Implicit in this calculation is the
  # treatment of all negative pairs as equally likely, which is not
  # necessarily reasonable in general; however, the generation in
  # self.setUp() approximates this behavior sufficiently for heuristic
  # testing.
  e_sample = len(self.seen_pairs) * NUM_NEG / neg_pair_cardinality

  # The frequency of occurrence of a given negative pair should follow an
  # approximately binomial distribution in the limit that the cardinality of
  # the negative pair set >> number of samples per epoch.
  approx_pdf = scipy.stats.binom.pmf(k=np.arange(train_epochs + 1),
                                     n=train_epochs, p=e_sample)

  # Tally the actual observed counts.
  count_distribution = [0 for _ in range(train_epochs + 1)]
  for i in negative_counts.values():
    i = min([i, train_epochs])  # round down tail for simplicity.
    count_distribution[i] += 1
  count_distribution[0] = neg_pair_cardinality - sum(count_distribution[1:])

  # Check that the frequency of negative pairs is approximately binomial.
  for i in range(train_epochs + 1):
    if approx_pdf[i] < 0.05:
      continue  # Variance will be high at the tails.

    observed_fraction = count_distribution[i] / neg_pair_cardinality
    deviation = (2 * abs(observed_fraction - approx_pdf[i]) /
                 (observed_fraction + approx_pdf[i]))

    self.assertLess(deviation, 0.2)
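# A standalone restatement (not part of the original test) of the binomial
# frequency heuristic used above, so the check can be read in isolation. The
# function and argument names are assumptions for illustration; the logic
# mirrors the test.
def _example_negative_frequency_deviations(negative_counts, num_seen_pairs,
                                           num_users, num_items, num_neg,
                                           train_epochs):
  """Illustrative sketch only: relative deviation of observed negative-pair
  counts from the binomial approximation."""
  import numpy as np  # local imports keep the sketch self-contained
  import scipy.stats

  neg_pair_cardinality = num_users * num_items - num_seen_pairs

  # Approximate per-epoch probability that a particular negative pair is
  # sampled, treating all negative pairs as equally likely.
  e_sample = num_seen_pairs * num_neg / neg_pair_cardinality
  approx_pdf = scipy.stats.binom.pmf(
      k=np.arange(train_epochs + 1), n=train_epochs, p=e_sample)

  # Tally how many negative pairs were observed 0, 1, ..., train_epochs times.
  count_distribution = [0] * (train_epochs + 1)
  for count in negative_counts.values():
    count_distribution[min(count, train_epochs)] += 1
  count_distribution[0] = neg_pair_cardinality - sum(count_distribution[1:])

  deviations = []
  for i in range(train_epochs + 1):
    if approx_pdf[i] < 0.05:
      continue  # Skip low-probability bins where variance is high.
    observed = count_distribution[i] / neg_pair_cardinality
    deviations.append(2 * abs(observed - approx_pdf[i]) /
                      (observed + approx_pdf[i]))
  return deviations  # The test above asserts each deviation is below 0.2.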
def _test_end_to_end(self, constructor_type):
  params = self.make_params(train_epochs=1)
  _, _, producer = data_preprocessing.instantiate_pipeline(
      dataset=DATASET, data_dir=self.temp_data_dir, params=params,
      constructor_type=constructor_type, deterministic=True)

  producer.start()
  producer.join()
  assert producer._fatal_exception is None

  user_inv_map = {v: k for k, v in producer.user_map.items()}
  item_inv_map = {v: k for k, v in producer.item_map.items()}

  # ==========================================================================
  # == Training Data =========================================================
  # ==========================================================================
  g = tf.Graph()
  with g.as_default():
    input_fn = producer.make_input_fn(is_training=True)
    dataset = input_fn(params)

  first_epoch = self.drain_dataset(dataset=dataset, g=g)

  counts = defaultdict(int)
  train_examples = {
      True: set(),
      False: set(),
  }
  md5 = hashlib.md5()
  for features, labels in first_epoch:
    data_list = [
        features[movielens.USER_COLUMN], features[movielens.ITEM_COLUMN],
        features[rconst.VALID_POINT_MASK], labels
    ]
    for i in data_list:
      md5.update(i.tobytes())

    for u, i, v, l in zip(*data_list):
      if not v:
        continue  # ignore padding

      u_raw = user_inv_map[u]
      i_raw = item_inv_map[i]
      if ((u_raw, i_raw) in self.seen_pairs) != l:
        # The evaluation item is not considered during false negative
        # generation, so it will occasionally appear as a negative example
        # during training.
        assert not l
        self.assertEqual(i_raw, self.holdout[u_raw][1])

      train_examples[l].add((u_raw, i_raw))
      counts[(u_raw, i_raw)] += 1

  self.assertRegexpMatches(md5.hexdigest(), END_TO_END_TRAIN_MD5)

  num_positives_seen = len(train_examples[True])
  self.assertEqual(producer._train_pos_users.shape[0], num_positives_seen)

  # This check is more heuristic because negatives are sampled with
  # replacement. It only checks that negative generation is reasonably random.
  self.assertGreater(
      len(train_examples[False]) / NUM_NEG / num_positives_seen, 0.9)

  # This checks that the samples produced are independent by counting
  # duplicate entries. If workers are not properly independent there will be
  # many repeated pairs.
  self.assertLess(np.mean(list(counts.values())), 1.1)

  # ==========================================================================
  # == Eval Data =============================================================
  # ==========================================================================
  with g.as_default():
    input_fn = producer.make_input_fn(is_training=False)
    dataset = input_fn(params)

  eval_data = self.drain_dataset(dataset=dataset, g=g)

  current_user = None
  md5 = hashlib.md5()
  for features in eval_data:
    data_list = [
        features[movielens.USER_COLUMN], features[movielens.ITEM_COLUMN],
        features[rconst.DUPLICATE_MASK]
    ]
    for i in data_list:
      md5.update(i.tobytes())

    for idx, (u, i, d) in enumerate(zip(*data_list)):
      u_raw = user_inv_map[u]
      i_raw = item_inv_map[i]
      if current_user is None:
        current_user = u

      # Ensure that users appear in blocks, as the evaluation logic expects
      # this structure.
      self.assertEqual(u, current_user)

      # The structure of evaluation data is 999 negative examples followed
      # by the holdout positive.
      if not (idx + 1) % (rconst.NUM_EVAL_NEGATIVES + 1):
        # Check that the last element in each chunk is the holdout item.
        self.assertEqual(i_raw, self.holdout[u_raw][1])
        current_user = None

      elif i_raw == self.holdout[u_raw][1]:
        # Because the holdout item is not given to the negative generation
        # process, it can appear as a negative. In that case, it should be
        # masked out as a duplicate, since the true positive is placed at
        # the end and would therefore lose the tie.
        assert d

      else:
        # Otherwise check that the other 999 points for a user are selected
        # from the negatives.
        assert (u_raw, i_raw) not in self.seen_pairs

  self.assertRegexpMatches(md5.hexdigest(), END_TO_END_EVAL_MD5)
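# The tests above rely on a drain_dataset() helper that is not shown in this
# section. Below is a minimal sketch of how such a helper might look, assuming
# a TF1-style graph/session workflow; the real implementation may differ.
def _example_drain_dataset(dataset, g):
  """Illustrative sketch only: pulls every batch from `dataset` built in `g`."""
  import tensorflow.compat.v1 as tf  # local import keeps the sketch self-contained

  with g.as_default():
    iterator = tf.data.make_one_shot_iterator(dataset)
    next_element = iterator.get_next()

  results = []
  with tf.Session(graph=g) as sess:
    while True:
      try:
        results.append(sess.run(next_element))
      except tf.errors.OutOfRangeError:
        break
  return results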