Example #1
 def generate_data(self, data_dir, _):
     generator_utils.generate_files(
         algorithmic.identity_generator(self.num_symbols, 40, 100000),
         self.training_filepaths(data_dir, 1, shuffled=True), 100)
     generator_utils.generate_files(
         algorithmic.identity_generator(self.num_symbols, 400, 10000),
         self.dev_filepaths(data_dir, 1, shuffled=True), 100)
Example #2
 def generate_data(self, data_dir, tmp_dir, task_id=-1):
     train_paths = self.training_filepaths(data_dir, 1, shuffled=True)
     dev_paths = self.dev_filepaths(data_dir, 1, shuffled=True)
     generator_utils.generate_files(self.generator(data_dir, tmp_dir, True),
                                    train_paths)
     generator_utils.generate_files(
         self.generator(data_dir, tmp_dir, False), dev_paths)
Example #3
  def generate_data(self, data_dir, tmp_dir, task_id=-1):
    self.data_dir = data_dir
    # Map each dataset split to the mode string used by preprocess_data.
    self.mode = {problem.DatasetSplit.TRAIN: 'train',
                 problem.DatasetSplit.EVAL: 'dev',
                 problem.DatasetSplit.TEST: 'test'}
    filepath_fns = {problem.DatasetSplit.TRAIN: self.training_filepaths,
                    problem.DatasetSplit.EVAL: self.dev_filepaths,
                    problem.DatasetSplit.TEST: self.test_filepaths}

    split_paths = [(split['split'], filepath_fns[split['split']](
      data_dir, split['shards'], shuffled=self.already_shuffled))
      for split in self.dataset_splits]
    all_paths = []
    for _, paths in split_paths:
      all_paths.extend(paths)

    if self.is_generate_per_split:
      for split, paths in split_paths:
        # Create the source and target txt files from the raw data.
        self.preprocess_data(self.mode[split])
        generator_utils.generate_files(
            self.generate_encoded_samples(data_dir, tmp_dir, split), paths)
    else:
      self.preprocess_data(self.mode[problem.DatasetSplit.TRAIN])
      generator_utils.generate_files(
          self.generate_encoded_samples(
              data_dir, tmp_dir, problem.DatasetSplit.TRAIN), all_paths)

    generator_utils.shuffle_dataset(all_paths, extra_fn=self._pack_fn())
Example #4
  def generate_data(self, data_dir, tmp_dir, task_id=-1):

    filepath_fns = {
        problem.DatasetSplit.TRAIN: self.training_filepaths,
        problem.DatasetSplit.EVAL: self.dev_filepaths,
        problem.DatasetSplit.TEST: self.test_filepaths,
    }

    split_paths = [(split["split"], filepath_fns[split["split"]](
        data_dir, split["shards"], shuffled=self.already_shuffled))
                   for split in self.dataset_splits]
    all_paths = []
    for _, paths in split_paths:
      all_paths.extend(paths)

    if self.is_generate_per_split:
      for split, paths in split_paths:
        generator_utils.generate_files(
            self.generate_encoded_samples(data_dir, tmp_dir, split), paths)
    else:
      generator_utils.generate_files(
          self.generate_encoded_samples(
              data_dir, tmp_dir, problem.DatasetSplit.TRAIN), all_paths)

    generator_utils.shuffle_dataset(all_paths, extra_fn=self._pack_fn())
Example #5
    def generate_data(self, data_dir, tmp_dir, task_id=-1):
        train_paths = self.training_filepaths(data_dir,
                                              self.num_shards,
                                              shuffled=False)
        dev_paths = self.dev_filepaths(data_dir,
                                       self.num_dev_shards,
                                       shuffled=False)
        test_paths = self.test_filepaths(data_dir,
                                         self.num_test_shards,
                                         shuffled=True)

        generator_utils.generate_files(
            self.generator(data_dir, tmp_dir, self.TEST_DATASETS), test_paths)

        if self.use_train_shards_for_dev:
            all_paths = train_paths + dev_paths
            generator_utils.generate_files(
                self.generator(data_dir, tmp_dir, self.TRAIN_DATASETS),
                all_paths)
            generator_utils.shuffle_dataset(all_paths)
        else:
            generator_utils.generate_dataset_and_shuffle(
                self.generator(data_dir, tmp_dir,
                               self.TRAIN_DATASETS), train_paths,
                self.generator(data_dir, tmp_dir, self.DEV_DATASETS),
                dev_paths)
Example #6
    def generate_data(self, data_dir, tmp_dir, task_id=-1):
        """The function generating the data."""
        filepath_fns = {
            problem.DatasetSplit.TRAIN: self.training_filepaths,
            problem.DatasetSplit.EVAL: self.dev_filepaths,
            problem.DatasetSplit.TEST: self.test_filepaths,
        }

        # We set shuffled=True as we don't want to shuffle on disk later.
        split_paths = [(split["split"],
                        filepath_fns[split["split"]](data_dir,
                                                     split["shards"],
                                                     shuffled=True))
                       for split in self.dataset_splits]
        all_paths = []
        for _, paths in split_paths:
            all_paths.extend(paths)

        if self.is_generate_per_split:
            for split, paths in split_paths:
                generator_utils.generate_files(
                    self.generate_encoded_samples_debug(
                        data_dir, tmp_dir, split),
                    paths,
                    cycle_every_n=self.total_number_of_frames // len(paths))
        else:
            generator_utils.generate_files(
                self.generate_encoded_samples_debug(
                    data_dir, tmp_dir, problem.DatasetSplit.TRAIN),
                all_paths,
                cycle_every_n=self.total_number_of_frames // len(all_paths))
Example #7
 def generate_data(self, data_dir, tmp_dir, task_id=-1):
   train_paths = self.training_filepaths(data_dir, 1, shuffled=True)
   dev_paths = self.dev_filepaths(data_dir, 1, shuffled=True)
   generator_utils.generate_files(
       self.generator(data_dir, tmp_dir, True), train_paths)
   generator_utils.generate_files(
       self.generator(data_dir, tmp_dir, False), dev_paths)
Example #8
  def generate_data(self, data_dir, tmp_dir, task_id=-1):
    """Saves the rollout history to disk."""
    # Shuffle rollouts globally taking advantage of the fact that we have
    # everything in memory.
    epoch_rollout_tuples = list()
    for epoch_nr, rollouts in self.rollouts_by_epoch.items():
      for rollout in rollouts:
        epoch_rollout_tuples.append((epoch_nr, rollout))

    random.shuffle(epoch_rollout_tuples)

    filepath_fns = {
        problem.DatasetSplit.TRAIN: self.training_filepaths,
        problem.DatasetSplit.EVAL: self.dev_filepaths,
        problem.DatasetSplit.TEST: self.test_filepaths,
    }

    # We set shuffled=True as we don't want to shuffle on disk later.
    paths = [
        path
        for split in self.dataset_splits
        for path in filepath_fns[split["split"]](
            data_dir, split["shards"], shuffled=True
        )
    ]

    num_frames = sum(len(rollout) for (_, rollout) in epoch_rollout_tuples)
    shard_size = num_frames // len(paths)
    generator_utils.generate_files(
        self._generate_frames(epoch_rollout_tuples), paths,
        cycle_every_n=shard_size
    )
Example #9
  def generate_data(self, data_dir, tmp_dir, task_id=-1):
    filepath_fns = {
        problem.DatasetSplit.TRAIN: self.training_filepaths,
        problem.DatasetSplit.EVAL: self.dev_filepaths,
        problem.DatasetSplit.TEST: self.test_filepaths,
    }

    split_paths = [(split["split"], filepath_fns[split["split"]](
        data_dir, split["shards"], shuffled=False))
                   for split in self.dataset_splits]

    all_paths = []
    for _, paths in split_paths:
      all_paths.extend(paths)

    if self.is_generate_per_split:
      for split, paths in split_paths:
        generator_utils.generate_files(
            self.generate_samples(data_dir, tmp_dir, split), paths)
    else:
      generator_utils.generate_files(
          self.generate_samples(data_dir, tmp_dir, problem.DatasetSplit.TRAIN),
          all_paths)

    generator_utils.shuffle_dataset(all_paths)
Example #10
  def generate_data(self, data_dir, tmp_dir, task_id=-1):
    """The function generating the data."""
    filepath_fns = {
        problem.DatasetSplit.TRAIN: self.training_filepaths,
        problem.DatasetSplit.EVAL: self.dev_filepaths,
        problem.DatasetSplit.TEST: self.test_filepaths,
    }

    # We set shuffled=True as we don't want to shuffle on disk later.
    split_paths = [(split["split"], filepath_fns[split["split"]](
        data_dir, split["shards"], shuffled=True))
                   for split in self.dataset_splits]
    all_paths = []
    for _, paths in split_paths:
      all_paths.extend(paths)

    if self.is_generate_per_split:
      for split, paths in split_paths:
        generator_utils.generate_files(
            self.generate_encoded_samples_debug(
                data_dir, tmp_dir, split), paths,
            cycle_every_n=self.total_number_of_frames // len(paths))
    else:
      generator_utils.generate_files(
          self.generate_encoded_samples_debug(
              data_dir, tmp_dir, problem.DatasetSplit.TRAIN),
          all_paths,
          cycle_every_n=self.total_number_of_frames // len(all_paths))
Example #11
    def generate_data(self, data_dir, tmp_dir, task_id=-1):

        filepath_fns = {
            problem.DatasetSplit.TRAIN: self.training_filepaths,
            problem.DatasetSplit.EVAL: self.dev_filepaths,
            problem.DatasetSplit.TEST: self.test_filepaths,
        }

        split_paths = dict([(split["split"],
                             filepath_fns[split["split"]](data_dir,
                                                          split["shards"],
                                                          shuffled=False))
                            for split in self.dataset_splits])
        all_paths = []
        for paths in split_paths.values():
            all_paths.extend(paths)

        if self.is_generate_per_split:
            for split, paths in split_paths.items():
                generator_utils.generate_files(
                    self._maybe_pack_examples(
                        self.generate_encoded_samples(data_dir, tmp_dir,
                                                      split)), paths)
        else:
            generator_utils.generate_files(
                self._maybe_pack_examples(
                    self.generate_encoded_samples(data_dir, tmp_dir,
                                                  problem.DatasetSplit.TRAIN)),
                all_paths)

        generator_utils.shuffle_dataset(all_paths)
Example #12
    def generate_data(self, data_dir, tmp_dir, task_id=-1):
        # Dict mapping each dataset split to the method that builds its filepaths.
        filepath_fns = {
            problem.DatasetSplit.TRAIN: self.training_filepaths,
            problem.DatasetSplit.EVAL: self.dev_filepaths,
            problem.DatasetSplit.TEST: self.test_filepaths,
        }
        # Build the filenames for each split according to the dataset_splits settings.
        split_paths = [
            (split["split"],
             filepath_fns[split["split"]](data_dir,
                                          split["shards"],
                                          shuffled=self.already_shuffled))
            for split in self.dataset_splits
        ]
        all_paths = []
        for _, paths in split_paths:
            all_paths.extend(paths)
        # split is train/eval/test; paths is the list of filenames generated for that split from its shard count.
        if self.is_generate_per_split:
            for split, paths in split_paths:
                # generate_files(generator, filenames) writes the samples produced by the generator into filenames (see the standalone sketch after this example).
                generator_utils.generate_files(
                    # generate_encoded_samples yields samples as token ids.
                    self.generate_encoded_samples(data_dir, tmp_dir, split),
                    paths)
        else:
            generator_utils.generate_files(
                self.generate_encoded_samples(data_dir, tmp_dir,
                                              problem.DatasetSplit.TRAIN),
                all_paths)

        generator_utils.shuffle_dataset(all_paths, extra_fn=self._pack_fn())
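
The comments in Example #12 describe the overall flow. As a complement, here is a minimal, self-contained sketch of the same pattern, assuming only that tensor2tensor's generator_utils module is importable; the problem name "toy_problem", the output directory, and the toy sample values are hypothetical, chosen for illustration (the helper functions and the UNSHUFFLED_SUFFIX constant are the same ones used in Examples #39 and #48):

import os

from tensor2tensor.data_generators import generator_utils


def toy_generator():
  # Each yielded dict becomes one serialized tf.Example; values are lists of ints.
  for i in range(100):
    yield {"inputs": [i, i + 1], "targets": [i + 1]}


data_dir = "/tmp/t2t_toy_data"  # hypothetical output directory
os.makedirs(data_dir, exist_ok=True)

# Build sharded "-unshuffled" filenames, write the generated samples into them,
# then shuffle the serialized records on disk.
paths = generator_utils.train_data_filenames(
    "toy_problem" + generator_utils.UNSHUFFLED_SUFFIX, data_dir, 1)
generator_utils.generate_files(toy_generator(), paths)
generator_utils.shuffle_dataset(paths)

The shuffled records are then written alongside the unshuffled shards with the "-unshuffled" suffix stripped, which matches the file naming convention the generate_data methods above rely on.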
Example #13
    def generate_data(self, data_dir, tmp_dir, task_id=-1):
        filepath_fns = {
            problem.DatasetSplit.TRAIN: self.training_filepaths,
            problem.DatasetSplit.EVAL: self.dev_filepaths,
            problem.DatasetSplit.TEST: self.test_filepaths,
        }

        split_paths = [(split['split'],
                        filepath_fns[split['split']](data_dir,
                                                     split['shards'],
                                                     shuffled=False))
                       for split in self.dataset_splits]
        all_paths = []
        for _, paths in split_paths:
            all_paths.extend(paths)

        if self.is_generate_per_split:
            for split, paths in split_paths:
                generator_utils.generate_files(
                    self.generate_encoded_samples(data_dir, tmp_dir, split),
                    paths)
        else:
            generator_utils.generate_files(
                self.generate_encoded_samples(data_dir, tmp_dir,
                                              problem.DatasetSplit.TRAIN),
                all_paths)

        generator_utils.shuffle_dataset(all_paths)
Example #14
    def generate_data(self, data_dir, tmp_dir, task_id=-1):
        """Saves the rollout history to disk, split into train/dev sets."""
        self._split_current_epoch()

        splits_and_paths = self.splits_and_paths(data_dir)
        num_epochs = len(self._rollouts_by_epoch_and_split)

        for (epoch_index, (epoch, rollouts_by_split)) in enumerate(
                six.iteritems(self._rollouts_by_epoch_and_split)):
            for (split, paths) in splits_and_paths:
                num_shards = len(paths) // num_epochs
                paths = paths[epoch_index * num_shards:(epoch_index + 1) *
                              num_shards]

                rollouts = rollouts_by_split[split]
                num_frames = self._calc_num_frames(rollouts)
                shard_size = num_frames // len(paths)

                frame_gen = self._generate_frames(epoch, rollouts)
                for (path_index, path) in enumerate(paths):
                    limit = shard_size
                    # Put the remainder in the last shard to preserve the ordering.
                    if path_index == len(paths) - 1:
                        limit = None
                    generator_utils.generate_files(
                        itertools.islice(frame_gen, limit), [path],
                        cycle_every_n=float("inf"))
Example #15
  def generate_data(self, data_dir, tmp_dir, task_id=-1):
    """Saves the rollout history to disk."""
    # Shuffle rollouts globally, taking advantage of the fact that we have
    # everything in memory.
    shuffled_history = self.history[:]
    random.shuffle(shuffled_history)

    filepath_fns = {
        problem.DatasetSplit.TRAIN: self.training_filepaths,
        problem.DatasetSplit.EVAL: self.dev_filepaths,
        problem.DatasetSplit.TEST: self.test_filepaths,
    }

    # We set shuffled=True as we don't want to shuffle on disk later.
    splits_and_paths = [
        (split["split"], path)
        for split in self.dataset_splits
        for path in filepath_fns[split["split"]](
            data_dir, split["shards"], shuffled=True
        )
    ]

    # Split entire rollouts into shards so that no rollout is broken on shard
    # boundary.
    shard_size = int(math.ceil(len(shuffled_history) / len(splits_and_paths)))
    for (i, (split, path)) in enumerate(splits_and_paths):
      rollouts = shuffled_history[i * shard_size : (i + 1) * shard_size]
      generator_utils.generate_files(
          self._generate_frames(rollouts), [path], cycle_every_n=float("inf")
      )
Example #16
    def generate_data_shard(self, thread_ix, meta_list, out_file, encoder):
        tf.logging.info("[thread %d], %d image-label pairs" %
                        (thread_ix, len(meta_list)))

        generator_utils.generate_files(
            _example_generator(meta_list, self.image_shape, encoder,
                               self.get_helper()), [out_file])
Example #17
 def generate_data(self, data_dir, _):
   identity_problem = algorithmic.AlgorithmicIdentityBinary40()
   generator_utils.generate_files(
       identity_problem.generator(self.num_symbols, 40, 100000),
       self.training_filepaths(data_dir, 1, shuffled=True), 100)
   generator_utils.generate_files(
       identity_problem.generator(self.num_symbols, 400, 10000),
       self.dev_filepaths(data_dir, 1, shuffled=True), 100)
Example #19
def encode_dataset(model, dataset, problem, ae_hparams, autoencoder_path,
                   out_files):
    """Encode all frames in dataset with model and write them out to out_files."""
    batch_size = 8
    dataset = dataset.batch(batch_size)
    examples = dataset.make_one_shot_iterator().get_next()
    images = examples.pop("frame")
    images = tf.cast(images, tf.int32)

    encoded = model.encode(images)
    encoded_frame_height = int(
        math.ceil(problem.frame_height / 2**ae_hparams.num_hidden_layers))
    encoded_frame_width = int(
        math.ceil(problem.frame_width / 2**ae_hparams.num_hidden_layers))
    num_bits = 8
    encoded = tf.reshape(
        encoded, [-1, encoded_frame_height, encoded_frame_width, 3, num_bits])
    encoded = tf.cast(discretization.bit_to_int(encoded, num_bits), tf.uint8)

    pngs = tf.map_fn(tf.image.encode_png,
                     encoded,
                     dtype=tf.string,
                     back_prop=False)

    with tf.Session() as sess:
        autoencoder_saver = tf.train.Saver(
            tf.global_variables("autoencoder.*"))
        trainer_lib.restore_checkpoint(autoencoder_path,
                                       autoencoder_saver,
                                       sess,
                                       must_restore=True)

        def generator():
            """Generate examples."""
            while True:
                try:
                    pngs_np, examples_np = sess.run([pngs, examples])
                    rewards = examples_np["reward"].tolist()
                    actions = examples_np["action"].tolist()
                    frame_numbers = examples_np["frame_number"].tolist()
                    for action, reward, frame_number, png in \
                            zip(actions, rewards, frame_numbers, pngs_np):
                        yield {
                            "action": action,
                            "reward": reward,
                            "frame_number": frame_number,
                            "image/encoded": [png],
                            "image/format": ["png"],
                            "image/height": [encoded_frame_height],
                            "image/width": [encoded_frame_width],
                        }
                except tf.errors.OutOfRangeError:
                    break

        generator_utils.generate_files(
            generator(),
            out_files,
            cycle_every_n=problem.total_number_of_frames // 10)
Example #20
 def setUpClass(cls):
     # Generate a small test dataset
     FLAGS.problems = "algorithmic_addition_binary40"
     TrainerUtilsTest.data_dir = tf.test.get_temp_dir()
     gen = algorithmic.identity_generator(2, 10, 300)
     generator_utils.generate_files(gen, FLAGS.problems + "-train",
                                    TrainerUtilsTest.data_dir, 1, 100)
     generator_utils.generate_files(gen, FLAGS.problems + "-dev",
                                    TrainerUtilsTest.data_dir, 1, 100)
Example #21
 def generate_data(self, data_dir, tmp_dir, task_id=-1):
   train_paths = self.training_filepaths(
       data_dir, self.num_shards, shuffled=False)
   dev_paths = self.dev_filepaths(
       data_dir, self.num_dev_shards, shuffled=False)
   all_paths = train_paths + dev_paths
   generator_utils.generate_files(
       self.generator(data_dir, tmp_dir), all_paths)
   generator_utils.shuffle_dataset(all_paths)
Example #23
  def _testBatchExamples(self):
    tf.set_random_seed(1)
    tmp_dir = self.get_temp_dir()
    (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir)
    tmp_file_name = os.path.basename(tmp_file_path)

    # Generate a file with 100 examples, n-th example of length n + 1.
    def test_generator():
      for i in xrange(100):
        yield {"inputs": [i + 1 for _ in xrange(i + 1)], "targets": [i + 1]}

    filenames = generator_utils.train_data_filenames(tmp_file_name, tmp_dir, 1)
    generator_utils.generate_files(test_generator(), filenames)
    self.assertTrue(tf.gfile.Exists(tmp_file_path + "-train-00000-of-00001"))

    examples_train = data_reader.examples_queue([tmp_file_path + "*"], {
        "inputs": tf.VarLenFeature(tf.int64),
        "targets": tf.VarLenFeature(tf.int64)
    }, True)
    batch_train = data_reader.batch_examples(examples_train, 4)
    examples_eval = data_reader.examples_queue([tmp_file_path + "*"], {
        "inputs": tf.VarLenFeature(tf.int64),
        "targets": tf.VarLenFeature(tf.int64)
    }, False)
    batch_eval = data_reader.batch_examples(examples_eval, 2)
    session, coord = tf.Session(), tf.train.Coordinator()
    with session.as_default():
      tf.train.start_queue_runners(coord=coord)

      # Evaluation data comes in the same order as in the file.
      # The first batch will be inputs=[[1, 0], [2, 2]], targets=[[1], [2]].
      examples = session.run(batch_eval)
      self.assertAllClose(examples["inputs"], np.array([[1, 0], [2, 2]]))
      self.assertAllClose(examples["targets"], np.array([[1], [2]]))
      # Check the second batch too.
      examples = session.run(batch_eval)
      self.assertAllClose(examples["inputs"],
                          np.array([[3, 3, 3, 0], [4, 4, 4, 4]]))
      self.assertAllClose(examples["targets"], np.array([[3], [4]]))

      # Training data is shuffled but shouldn't have too many pads.
      for _ in xrange(10):
        examples = session.run(batch_train)
        inputs = examples["inputs"]
        # Only 3 out of 4 examples in a batch have padding zeros at all.
        pad_per_example = (inputs.size - np.count_nonzero(inputs)) // 3
        # Default bucketing is in steps of 8 until 64 and 32 later.
        if int(max(examples["targets"])) < 64:
          self.assertLess(pad_per_example, 8)
        else:
          self.assertLess(pad_per_example, 32)

    # Clean up.
    coord.request_stop()
    coord.join()
    os.remove(tmp_file_path + "-train-00000-of-00001")
    os.remove(tmp_file_path)
Example #24
 def setUpClass(cls):
   # Generate a small test dataset
   FLAGS.problems = "algorithmic_addition_binary40"
   TrainerUtilsTest.data_dir = tf.test.get_temp_dir()
   gen = algorithmic.identity_generator(2, 10, 300)
   generator_utils.generate_files(gen, FLAGS.problems + "-train",
                                  TrainerUtilsTest.data_dir, 1, 100)
   generator_utils.generate_files(gen, FLAGS.problems + "-dev",
                                  TrainerUtilsTest.data_dir, 1, 100)
Example #25
  def _testBatchExamples(self):
    tf.set_random_seed(1)
    tmp_dir = self.get_temp_dir()
    (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir)
    tmp_file_name = os.path.basename(tmp_file_path)

    # Generate a file with 100 examples, n-th example of length n + 1.
    def test_generator():
      for i in xrange(100):
        yield {"inputs": [i + 1 for _ in xrange(i + 1)], "targets": [i + 1]}

    generator_utils.generate_files(test_generator(), tmp_file_name, tmp_dir)
    self.assertTrue(tf.gfile.Exists(tmp_file_path + "-00000-of-00001"))

    examples_train = data_reader.examples_queue([tmp_file_path + "*"], {
        "inputs": tf.VarLenFeature(tf.int64),
        "targets": tf.VarLenFeature(tf.int64)
    }, True)
    batch_train = data_reader.batch_examples(examples_train, 4)
    examples_eval = data_reader.examples_queue([tmp_file_path + "*"], {
        "inputs": tf.VarLenFeature(tf.int64),
        "targets": tf.VarLenFeature(tf.int64)
    }, False)
    batch_eval = data_reader.batch_examples(examples_eval, 2)
    session, coord = tf.Session(), tf.train.Coordinator()
    with session.as_default():
      tf.train.start_queue_runners(coord=coord)

      # Evaluation data comes in the same order as in the file.
      # The first batch will be inputs=[[1, 0], [2, 2]], targets=[[1], [2]].
      examples = session.run(batch_eval)
      self.assertAllClose(examples["inputs"], np.array([[1, 0], [2, 2]]))
      self.assertAllClose(examples["targets"], np.array([[1], [2]]))
      # Check the second batch too.
      examples = session.run(batch_eval)
      self.assertAllClose(examples["inputs"],
                          np.array([[3, 3, 3, 0], [4, 4, 4, 4]]))
      self.assertAllClose(examples["targets"], np.array([[3], [4]]))

      # Training data is shuffled but shouldn't have too many pads.
      for _ in xrange(10):
        examples = session.run(batch_train)
        inputs = examples["inputs"]
        # Only 3 out of 4 examples in a batch have padding zeros at all.
        pad_per_example = (inputs.size - np.count_nonzero(inputs)) // 3
        # Default bucketing is in steps of 8 until 64 and 32 later.
        if int(max(examples["targets"])) < 64:
          self.assertLess(pad_per_example, 8)
        else:
          self.assertLess(pad_per_example, 32)

    # Clean up.
    coord.request_stop()
    coord.join()
    os.remove(tmp_file_path + "-00000-of-00001")
    os.remove(tmp_file_path)
Example #26
def generate_dataset(h5_filepath,
                     key_prefix,
                     out_filepaths,
                     start_idx=None,
                     end_idx=None):
    print("PID: %d, Key: %s, (Start, End): (%s, %s)" %
          (os.getpid(), key_prefix, start_idx, end_idx))
    generator_utils.generate_files(
        dataset_generator(h5_filepath, key_prefix, start_idx, end_idx),
        out_filepaths)
Example #27
  def generate_data(self, data_dir, tmp_dir, task_id=-1):
    """Ganerate data for this problem."""

    del tmp_dir, task_id
    identity_problem = AlgorithmicIdentityBinary40()
    utils.generate_files(
        identity_problem.generator(self.num_symbols, 40, 100000),
        self.training_filepaths(data_dir, 1, shuffled=True), 100)
    utils.generate_files(
        identity_problem.generator(self.num_symbols, 400, 10000),
        self.dev_filepaths(data_dir, 1, shuffled=True), 100)
Example #28
 def setUpClass(cls):
     # Generate a small test dataset
     FLAGS.problems = "algorithmic_addition_binary40"
     TrainerUtilsTest.data_dir = tf.test.get_temp_dir()
     gen = algorithmic.identity_generator(2, 10, 300)
     train_filenames = generator_utils.train_data_filenames(
         FLAGS.problems, TrainerUtilsTest.data_dir, 1)
     dev_filenames = generator_utils.dev_data_filenames(
         FLAGS.problems, TrainerUtilsTest.data_dir, 1)
     generator_utils.generate_files(gen, train_filenames, 100)
     generator_utils.generate_files(gen, dev_filenames, 100)
Example #30
def generate_dataset(h5_filepath,
                     key_prefix,
                     out_filepaths,
                     chunk_size=1,
                     start_idx=None,
                     end_idx=None):
  print("PID: %d, Key: %s, (Start, End): (%s, %s)" % (os.getpid(), key_prefix,
                                                      start_idx, end_idx))
  generator_utils.generate_files(
      dataset_generator(h5_filepath, key_prefix, chunk_size, start_idx,
                        end_idx), out_filepaths)
Example #31
    def testExamplesQueue(self):
        tf.set_random_seed(1)
        tmp_dir = self.get_temp_dir()
        (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir)
        tmp_file_name = os.path.basename(tmp_file_path)

        # Generate a file with 100 examples.
        def test_generator():
            for i in xrange(100):
                yield {"inputs": [i], "targets": [i], "floats": [i + 0.5]}

        filenames = generator_utils.train_data_filenames(
            tmp_file_name, tmp_dir, 1)
        generator_utils.generate_files(test_generator(), filenames)
        self.assertTrue(
            tf.gfile.Exists(tmp_file_path + "-train-00000-of-00001"))

        examples_train = data_reader.examples_reader(
            [tmp_file_path + "*"], {
                "inputs": tf.VarLenFeature(tf.int64),
                "targets": tf.VarLenFeature(tf.int64)
            },
            training=True)
        examples_eval = data_reader.examples_reader(
            [tmp_file_path + "*"], {
                "inputs": tf.VarLenFeature(tf.int64),
                "targets": tf.VarLenFeature(tf.int64),
                "floats": tf.VarLenFeature(tf.float32)
            },
            training=False)
        with tf.train.MonitoredSession() as session:
            # Evaluation data comes in the same order as in the file, check 10.
            for i in xrange(10):
                examples = session.run(examples_eval)
                self.assertEqual(len(examples["inputs"]), 1)
                self.assertEqual(len(examples["targets"]), 1)
                self.assertEqual(examples["inputs"][0], i)
                self.assertEqual(examples["targets"][0], i)
                self.assertEqual(examples["floats"][0], i + 0.5)
            # Training data is shuffled.
            is_shuffled = False
            for i in xrange(10):
                examples = session.run(examples_train)
                self.assertEqual(len(examples["inputs"]), 1)
                self.assertEqual(len(examples["targets"]), 1)
                self.assertEqual(examples["inputs"][0], examples["targets"][0])
                if examples["inputs"][0] != i:
                    is_shuffled = True
            self.assertTrue(is_shuffled)

        # Clean up.
        os.remove(tmp_file_path + "-train-00000-of-00001")
        os.remove(tmp_file_path)
Example #32
def generate_data_one(args):
    problem, data_dir, tmp_dir, input_file, output_file = args
    output_fp = open(output_file, "w")
    for sample in problem.generate_encoded_samples(data_dir, tmp_dir,
                                                   input_file):
        print("{}\t{}".format(sample["input_id"][0], sample["inputs_d"]),
              file=output_fp)
    output_fp.close()
    return
    generator_utils.generate_files(
        problem._maybe_pack_examples(
            problem.generate_encoded_samples(data_dir, tmp_dir, input_file)),
        [output_file])
Example #33
  def testExamplesQueue(self):
    tf.set_random_seed(1)
    tmp_dir = self.get_temp_dir()
    (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir)
    tmp_file_name = os.path.basename(tmp_file_path)

    # Generate a file with 100 examples.
    def test_generator():
      for i in xrange(100):
        yield {"inputs": [i], "targets": [i], "floats": [i + 0.5]}

    generator_utils.generate_files(test_generator(), tmp_file_name, tmp_dir)
    self.assertTrue(tf.gfile.Exists(tmp_file_path + "-00000-of-00001"))

    examples_train = data_reader.examples_queue(
        [tmp_file_path + "*"], {
            "inputs": tf.VarLenFeature(tf.int64),
            "targets": tf.VarLenFeature(tf.int64)
        },
        training=True)
    examples_eval = data_reader.examples_queue(
        [tmp_file_path + "*"], {
            "inputs": tf.VarLenFeature(tf.int64),
            "targets": tf.VarLenFeature(tf.int64),
            "floats": tf.VarLenFeature(tf.float32)
        },
        training=False)
    with tf.train.MonitoredSession() as session:
      # Evaluation data comes in the same order as in the file, check 10.
      for i in xrange(10):
        examples = session.run(examples_eval)
        self.assertEqual(len(examples["inputs"]), 1)
        self.assertEqual(len(examples["targets"]), 1)
        self.assertEqual(examples["inputs"][0], i)
        self.assertEqual(examples["targets"][0], i)
        self.assertEqual(examples["floats"][0], i + 0.5)
      # Training data is shuffled.
      is_shuffled = False
      for i in xrange(10):
        examples = session.run(examples_train)
        self.assertEqual(len(examples["inputs"]), 1)
        self.assertEqual(len(examples["targets"]), 1)
        self.assertEqual(examples["inputs"][0], examples["targets"][0])
        if examples["inputs"][0] != i:
          is_shuffled = True
      self.assertTrue(is_shuffled)

    # Clean up.
    os.remove(tmp_file_path + "-00000-of-00001")
    os.remove(tmp_file_path)
Example #34
    def generate_data(self, data_dir, tmp_dir, task_id=-1):
        train_paths = self.training_filepaths(data_dir, 10, shuffled=False)
        dev_paths = self.dev_filepaths(data_dir, 1, shuffled=True)

        midi_files = glob.glob('data/maestro/maestro-v2.0.0/*/*.midi')
        random.seed(13)
        random.shuffle(midi_files)

        generator_utils.generate_files(self.generator(midi_files[:50]),
                                       dev_paths)

        generator_utils.generate_files(self.generator(midi_files[50:]),
                                       train_paths)
        generator_utils.shuffle_dataset(train_paths)
Example #35
def encode_dataset(model, dataset, problem, ae_hparams, autoencoder_path,
                   out_files):
  """Encode all frames in dataset with model and write them out to out_files."""
  batch_size = 8
  dataset = dataset.batch(batch_size)
  examples = dataset.make_one_shot_iterator().get_next()
  images = examples.pop("frame")
  images = tf.expand_dims(images, 1)

  encoded = model.encode(images)
  encoded_frame_height = int(
      math.ceil(problem.frame_height / 2**ae_hparams.num_hidden_layers))
  encoded_frame_width = int(
      math.ceil(problem.frame_width / 2**ae_hparams.num_hidden_layers))
  num_bits = 8
  encoded = tf.reshape(
      encoded, [-1, encoded_frame_height, encoded_frame_width, 3, num_bits])
  encoded = tf.cast(discretization.bit_to_int(encoded, num_bits), tf.uint8)

  pngs = tf.map_fn(tf.image.encode_png, encoded, dtype=tf.string,
                   back_prop=False)

  with tf.Session() as sess:
    autoencoder_saver = tf.train.Saver(tf.global_variables("autoencoder.*"))
    trainer_lib.restore_checkpoint(autoencoder_path, autoencoder_saver, sess,
                                   must_restore=True)

    def generator():
      """Generate examples."""
      while True:
        try:
          pngs_np, examples_np = sess.run([pngs, examples])
          rewards_np = [list(el) for el in examples_np["reward"]]
          actions_np = [list(el) for el in examples_np["action"]]
          pngs_np = [el for el in pngs_np]
          for action, reward, png in zip(actions_np, rewards_np, pngs_np):
            yield {
                "action": action,
                "reward": reward,
                "image/encoded": [png],
                "image/format": ["png"],
                "image/height": [encoded_frame_height],
                "image/width": [encoded_frame_width],
            }
        except tf.errors.OutOfRangeError:
          break

    generator_utils.generate_files(
        generator(), out_files,
        cycle_every_n=problem.total_number_of_frames // 10)
Example #36
 def generate_data(self, data_dir, tmp_dir, task_id=-1):
   train_paths = self.training_filepaths(
       data_dir, self.num_shards, shuffled=False)
   dev_paths = self.dev_filepaths(
       data_dir, self.num_dev_shards, shuffled=False)
   if self.use_train_shards_for_dev:
     all_paths = train_paths + dev_paths
     generator_utils.generate_files(
         self.generator(data_dir, tmp_dir, True), all_paths)
     generator_utils.shuffle_dataset(all_paths)
   else:
     generator_utils.generate_dataset_and_shuffle(
         self.generator(data_dir, tmp_dir, True), train_paths,
         self.generator(data_dir, tmp_dir, False), dev_paths)
Example #37
  def testGenerateFiles(self):
    tmp_dir = self.get_temp_dir()
    (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir)
    tmp_file_name = os.path.basename(tmp_file_path)

    # Generate a trivial file and assert the file exists.
    def test_generator():
      yield {"inputs": [1], "target": [1]}

    generator_utils.generate_files(test_generator(), tmp_file_name, tmp_dir)
    self.assertTrue(tf.gfile.Exists(tmp_file_path + "-00000-of-00001"))

    # Clean up.
    os.remove(tmp_file_path + "-00000-of-00001")
    os.remove(tmp_file_path)
Example #38
 def generate_data(self, data_dir, tmp_dir, task_id=-1):
     """ TODO """
     train_paths = self.training_filepaths(
         data_dir, self.num_shards, shuffled=False)
     dev_paths = self.dev_filepaths(
         data_dir, self.num_dev_shards, shuffled=False)
     if self.use_train_shards_for_dev:
         all_paths = train_paths + dev_paths
         generator_utils.generate_files(
             self.generator(data_dir, tmp_dir, True), all_paths)
         generator_utils.shuffle_dataset(all_paths)
     else:
         generator_utils.generate_dataset_and_shuffle(
             self.generator(data_dir, tmp_dir, True), train_paths,
             self.generator(data_dir, tmp_dir, False), dev_paths)
Example #39
def generate_data_for_problem(problem):
  """Generate data for a problem in _SUPPORTED_PROBLEM_GENERATORS."""
  training_gen, dev_gen = _SUPPORTED_PROBLEM_GENERATORS[problem]

  num_shards = FLAGS.num_shards or 10
  tf.logging.info("Generating training data for %s.", problem)
  train_output_files = generator_utils.train_data_filenames(
      problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir, num_shards)
  generator_utils.generate_files(training_gen(), train_output_files,
                                 FLAGS.max_cases)
  tf.logging.info("Generating development data for %s.", problem)
  dev_output_files = generator_utils.dev_data_filenames(
      problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir, 1)
  generator_utils.generate_files(dev_gen(), dev_output_files)
  all_output_files = train_output_files + dev_output_files
  generator_utils.shuffle_dataset(all_output_files)
Example #41
 def generate_data(self, data_dir, tmp_dir, task_id=-1):
     train_paths = self.training_filepaths(
         data_dir, self.num_shards,
         shuffled=False)  # problem_name-train-00000-of-00001
     dev_paths = self.dev_filepaths(
         data_dir, self.num_dev_shards,
         shuffled=False)  # problem_name-dev-00000-of-00001
     if self.use_train_shards_for_dev:
         all_paths = train_paths + dev_paths
         generator_utils.generate_files(
             self.generator(data_dir, tmp_dir, True), all_paths)
         generator_utils.shuffle_dataset(all_paths)
     else:
         generator_utils.generate_dataset_and_shuffle(
             self.generator(data_dir, tmp_dir, True), train_paths,
             self.generator(data_dir, tmp_dir, False), dev_paths)
Example #42
def generate_t2t_example(uni, svg):
    print(f'{bcolors.BOLD}Generating tfrecord...{bcolors.ENDC}', end='')

    path, width, vwidth = _parse_svg(uni, svg)
    errorString = None

    if _is_valid_glyph(uni, width, vwidth):
        if len(path) > maxpaths:
            # too many paths!

            errorString = f'{chr(uni)} ({uni}) has too many paths: {len(path)}'
        elif len(path) == 0:
            # no paths!

            errorString = f'{chr(uni)} ({uni}) has no paths'
        else:
            # super clunky but we have to get our example in the right format

            example = _create_example(uni, path)

            tempexamplefile = tempfile.NamedTemporaryFile(mode='w',
                                                          delete=False)
            tempexamplefile.close()
            # We must delete the temp file before we call generate_files.
            Path(tempexamplefile.name).unlink()

            generator_utils.generate_files(_generate_sample(example),
                                           [tempexamplefile.name],
                                           max_cases=1)

            # https://www.tensorflow.org/tutorials/load_data/tfrecord
            raw_dataset = tf.data.TFRecordDataset([tempexamplefile.name])

            for raw_record in raw_dataset.take(1):
                example = raw_record

            Path(tempexamplefile.name).unlink()  # delete for real

            print(f'{bcolors.OKGREEN}SUCCESS{bcolors.ENDC}')
            return {'error': None, 'example': example}
    else:
        errorString = f'{chr(uni)} ({uni}) is invalid'

    print(f'{bcolors.FAIL}{errorString}{bcolors.ENDC}')
    return {'error': errorString, 'example': None}
Example #43
    def generate_data(self, data_dir, tmp_dir, task_id=-1):
        train_paths = self.training_filepaths(data_dir,
                                              self.num_shards,
                                              shuffled=False)
        dev_paths = self.dev_filepaths(data_dir,
                                       self.num_dev_shards,
                                       shuffled=False)
        test_paths = self.test_filepaths(data_dir,
                                         self.num_test_shards,
                                         shuffled=True)

        generator_utils.generate_files(
            self.generator(data_dir, tmp_dir, TEST_DATASET), test_paths)

        all_paths = train_paths + dev_paths
        generator_utils.generate_files(
            self.generator(data_dir, tmp_dir, TRAIN_DATASET), all_paths)
        generator_utils.shuffle_dataset(all_paths)
Example #44
  def generate_data(self, data_dir, tmp_dir, task_id=-1):
    """Generates training/dev data.

    Args:
      data_dir: a string
      tmp_dir: a string
      task_id: an optional integer
    Returns:
      shard or shards for which data was generated.
    """
    tf.logging.info("generate_data task_id=%s" % task_id)
    encoder = self.get_or_create_vocab(data_dir, tmp_dir)
    assert task_id >= 0 and task_id < self.num_generate_tasks
    if task_id < self.num_train_shards:
      out_file = self.training_filepaths(
          data_dir, self.num_train_shards, shuffled=False)[task_id]
    else:
      out_file = self.dev_filepaths(
          data_dir, self.num_dev_shards,
          shuffled=False)[task_id - self.num_train_shards]
    generator_utils.generate_files(
        self.example_generator(encoder, tmp_dir, task_id), [out_file])
    generator_utils.shuffle_dataset([out_file])
Example #45
  def generate_data(self, data_dir, tmp_dir=None, task_id=-1):
    """Saves the current epoch rollouts to disk, split into train/dev sets."""
    if not self._rollouts_by_epoch_and_split[self.current_epoch]:
      # Data not loaded from disk.
      self._split_current_epoch()

    rollouts_by_split = self._rollouts_by_epoch_and_split[self.current_epoch]
    splits_and_paths = self.splits_and_paths(data_dir)

    for (split, paths) in splits_and_paths:
      rollouts = rollouts_by_split[split]
      num_frames = self._calc_num_frames(rollouts)
      shard_size = num_frames // len(paths)

      frame_gen = self._generate_frames(rollouts)
      for (path_index, path) in enumerate(paths):
        limit = shard_size
        # Put the remainder in the last shard to preserve the ordering.
        if path_index == len(paths) - 1:
          limit = None
        generator_utils.generate_files(
            itertools.islice(frame_gen, limit), [path],
            cycle_every_n=float("inf")
        )
Example #46
  def generate_data(self, data_dir, tmp_dir, task_id=-1):
    train_gen = self.generator(tmp_dir, 162770)
    train_paths = self.training_filepaths(
        data_dir, self.train_shards, shuffled=False)
    generator_utils.generate_files(train_gen, train_paths)

    dev_gen = self.generator(tmp_dir, 19867, 162770)
    dev_paths = self.dev_filepaths(data_dir, self.dev_shards, shuffled=False)
    generator_utils.generate_files(dev_gen, dev_paths)

    test_gen = self.generator(tmp_dir, 19962, 162770+19867)
    test_paths = self.test_filepaths(data_dir, self.test_shards, shuffled=False)
    generator_utils.generate_files(test_gen, test_paths)

    generator_utils.shuffle_dataset(train_paths + dev_paths + test_paths)
Example #47
  def generate_data(self, data_dir, tmp_dir, task_id=-1):

    if task_id == -1 or task_id is None:
      for i in range(FLAGS.wiki_revision_num_train_shards +
                     FLAGS.wiki_revision_num_dev_shards):
        self.generate_data(data_dir, tmp_dir, i)
      return

    tf.logging.info(
        "Flags for job (task_id {}): "
        "Dev shards: {}, Train shards: {}, "
        "Revision skip factor: {}, Max page size: 2**{}, Introduce errors: {},"
        "Percent Identical Examples: {}"
        "".format(task_id, FLAGS.wiki_revision_num_dev_shards,
                  FLAGS.wiki_revision_num_train_shards,
                  FLAGS.wiki_revision_revision_skip_factor,
                  FLAGS.wiki_revision_max_page_size_exp,
                  FLAGS.wiki_revision_introduce_errors,
                  FLAGS.wiki_revision_percent_identical_examples))

    if FLAGS.wiki_revision_vocab_file:
      encoder = wiki_revision_utils.get_encoder_from_vocab(
          FLAGS.wiki_revision_vocab_file)
    else:
      encoder = wiki_revision_utils.get_or_generate_vocabulary(
          data_dir, tmp_dir, FLAGS.wiki_revision_data_prefix,
          FLAGS.wiki_revision_max_page_size_exp, self.approx_vocab_size,
          self.strip)

    random.seed(123)
    if task_id < FLAGS.wiki_revision_num_train_shards:
      out_file = self.training_filepaths(
          data_dir, FLAGS.wiki_revision_num_train_shards,
          shuffled=False)[task_id]
    else:
      out_file = self.dev_filepaths(
          data_dir, FLAGS.wiki_revision_num_dev_shards,
          shuffled=False)[task_id - FLAGS.wiki_revision_num_train_shards]

    tf.logging.info("Generating files for path: %s", out_file)
    self.corpus_files = wiki_revision_utils.corpus_files_for_shard(
        task_id, FLAGS.wiki_revision_num_train_shards,
        FLAGS.wiki_revision_num_dev_shards, FLAGS.wiki_revision_data_prefix)
    example_generator = self.generator(encoder, self.corpus_files, tmp_dir)

    packed_example_generator = self._maybe_pack_examples(example_generator)
    generator_utils.generate_files(packed_example_generator, [out_file])
    generator_utils.shuffle_dataset([out_file])

    tf.logging.info(
        "Job stats: identity examples: {}, total examples {}, ratio: {}".format(
            self.num_identity_examples, self.num_total_examples,
            (1 + self.num_identity_examples) / (1 + self.num_total_examples)))

    job_stats_string = self.aggregate_job_stats()
    out_dir, filename = out_file.replace("-unshuffled", "").rsplit("/", 1)
    stats_prefix = "/stats_"
    stats_file_path = "".join([out_dir, stats_prefix, filename])
    if tf.gfile.Exists(
        stats_file_path) and tf.gfile.Open(stats_file_path).size() != 0:
      tf.logging.info("Skipping writing stats because output file exists.")
    else:
      with tf.gfile.Open(stats_file_path, "w") as out:
        tf.logging.info("Writing job stats to {}".format(stats_file_path))
        out.write(job_stats_string)

    tf.logging.info(job_stats_string)
Example #48
def produce_examples(shard_ids, wikis_dir, refs_dir, urls_dir, vocab_path,
                     out_filepaths):
  """Produce examples from shard_ids to out_filepaths."""
  # * Join the Wikipedia articles with their references
  # * Run Tf-idf to sort reference paragraphs
  # * Encode the Wikipedia and reference text with the vocabulary
  # * Write out TFRecords of tensorflow.Example
  tf.logging.info("Processing %d input shards into %d output files.",
                  len(shard_ids), len(out_filepaths))

  vocab = text_encoder.SubwordTextEncoder(vocab_path)
  eot_ids = vocab.encode(EOT)

  def example_generator():
    """Generate Example dicts."""
    stats = dict(total_original_wikis=0, total_original_refs=0,
                 total_found_refs=0, ref_lengths=[], wiki_original_refs=[],
                 wiki_found_refs=[], wikis_skipped_no_refs=0,
                 wikis_skipped_short_lead=0, num_wikis_written=0)
    ref_files_by_shard = _references_files_by_shard(refs_dir)
    for shard_id in shard_ids:
      tf.logging.info("Processing shard %d", shard_id)
      wiki_urls = _wiki_urls_for_shard(shard_id, urls_dir)
      tf.logging.info("Loaded wiki URLs for shard")
      refs_content = _references_content(ref_files_by_shard[shard_id])
      tf.logging.info("Loaded reference content for shard")
      for i, wiki in enumerate(_wiki_articles(shard_id, wikis_dir)):
        if not i % 1000:
          tf.logging.info("Processing wiki index %d for shard %d", i, shard_id)
        stats["total_original_wikis"] += 1

        # Get reference content
        wiki_ref_content = []
        ref_urls = wiki_urls[wiki.url]["refs"]
        stats["total_original_refs"] += len(ref_urls)
        stats_wiki_original_refs = len(ref_urls)
        stats_wiki_found_refs = 0
        for ref_url in ref_urls:
          ref_content = refs_content.get(ref_url)
          if not ref_content:
            continue
          stats["total_found_refs"] += 1
          stats["ref_lengths"].append(len(ref_content))
          stats_wiki_found_refs += 1
          wiki_ref_content.append(ref_content)

        stats["wiki_original_refs"].append(stats_wiki_original_refs)
        stats["wiki_found_refs"].append(stats_wiki_found_refs)
        if not wiki_ref_content or len(wiki_ref_content) < _MIN_REFS:
          # No/few refs were found
          stats["wikis_skipped_no_refs"] += 1
          continue

        # Rank reference paragraphs with TFIDF
        wiki_title = _normalize_text(wiki.title)
        ranked_paragraphs = rank_reference_paragraphs(wiki_title,
                                                      wiki_ref_content)

        # Construct inputs from Wiki title and references
        inputs = []
        inputs.extend(vocab.encode(wiki_title))
        inputs.extend(eot_ids)
        for paragraph in ranked_paragraphs:
          if len(inputs) >= 1e6:
            break
          paragraph += " "
          inputs.extend(vocab.encode(paragraph))

        # Construct targets from article sections
        targets, section_boundaries = _encode_wiki_sections(
            wiki.sections, vocab)

        # Skip if lead section is too short
        if (not section_boundaries or
            section_boundaries[0] < _MIN_LEADSECTION_TOKENS):
          stats["wikis_skipped_short_lead"] += 1
          continue

        inputs.append(text_encoder.EOS_ID)
        targets.append(text_encoder.EOS_ID)

        stats["num_wikis_written"] += 1
        yield {
            "inputs": inputs,
            "targets": targets,
            "section_boundaries": section_boundaries,
        }

    tf.logging.info("Total: %d, Skipped: %d",
                    stats["num_wikis_written"],
                    stats["total_original_wikis"] - stats["num_wikis_written"])
    tf.logging.info("Total refs: %d, Skipped refs: %d",
                    stats["total_found_refs"],
                    stats["total_original_refs"] - stats["total_found_refs"])
    stats_fname = os.path.join(os.path.split(out_filepaths[0])[0],
                               "stats.%d.json" % shard_ids[0])
    with tf.gfile.Open(stats_fname, "w") as f:
      f.write(json.dumps(stats))

  generator_utils.generate_files(example_generator(), out_filepaths)