Example #1
    def generate_data(self, data_dir, tmp_dir, task_id=-1):

        filepath_fns = {
            problem.DatasetSplit.TRAIN: self.training_filepaths,
            problem.DatasetSplit.EVAL: self.dev_filepaths,
            problem.DatasetSplit.TEST: self.test_filepaths,
        }

        split_paths = [
            (split["split"],
             filepath_fns[split["split"]](data_dir,
                                          split["shards"],
                                          shuffled=self.already_shuffled))
            for split in self.dataset_splits
        ]
        all_paths = []
        for _, paths in split_paths:
            all_paths.extend(paths)

        if self.is_generate_per_split:
            for split, paths in split_paths:
                generator_utils.generate_files(
                    self.generate_encoded_samples(data_dir, tmp_dir, split),
                    paths)
        else:
            generator_utils.generate_files(
                self.generate_encoded_samples(data_dir, tmp_dir,
                                              problem.DatasetSplit.TRAIN),
                all_paths)

        generator_utils.shuffle_dataset(all_paths, extra_fn=self._pack_fn())
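
This first variant is driven entirely by the problem's dataset_splits property. A minimal sketch of the contract that property is expected to satisfy, assuming tensor2tensor's Text2TextProblem base class (MyTextProblem and the shard counts are illustrative, not taken from the snippet above):

from tensor2tensor.data_generators import problem
from tensor2tensor.data_generators import text_problems

class MyTextProblem(text_problems.Text2TextProblem):
    """Hypothetical problem illustrating the dataset_splits contract."""

    @property
    def dataset_splits(self):
        # One dict per split; "shards" controls how many output files
        # filepath_fns[split] returns for that split above.
        return [
            {"split": problem.DatasetSplit.TRAIN, "shards": 10},
            {"split": problem.DatasetSplit.EVAL, "shards": 1},
        ]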
Example #2
    def generate_data(self, data_dir, tmp_dir, task_id=-1):
        train_paths = self.training_filepaths(data_dir,
                                              self.num_shards,
                                              shuffled=False)
        dev_paths = self.dev_filepaths(data_dir,
                                       self.num_dev_shards,
                                       shuffled=False)
        test_paths = self.test_filepaths(data_dir,
                                         self.num_test_shards,
                                         shuffled=True)

        generator_utils.generate_files(
            self.generator(data_dir, tmp_dir, self.TEST_DATASETS), test_paths)

        if self.use_train_shards_for_dev:
            all_paths = train_paths + dev_paths
            generator_utils.generate_files(
                self.generator(data_dir, tmp_dir, self.TRAIN_DATASETS),
                all_paths)
            generator_utils.shuffle_dataset(all_paths)
        else:
            generator_utils.generate_dataset_and_shuffle(
                self.generator(data_dir, tmp_dir, self.TRAIN_DATASETS),
                train_paths,
                self.generator(data_dir, tmp_dir, self.DEV_DATASETS),
                dev_paths)
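
All of these variants feed generator_utils.generate_files with a plain Python generator of feature dictionaries (the test in Example #4 below yields exactly one such dict). A minimal self-contained sketch of that contract; the "inputs"/"targets" keys follow the usual tensor2tensor convention and the values here are illustrative:

def example_generator():
    # generate_files serializes each yielded dict as one tf.Example;
    # each value must be a list of ints, floats, or bytes.
    for i in range(3):
        yield {"inputs": [i], "targets": [i]}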
Example #3
    def generate_data(self, data_dir, tmp_dir, task_id=-1):
        """Ganerate data for this problem."""

        del tmp_dir, task_id
        identity_problem = AlgorithmicIdentityBinary40()
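        # The trailing 100 in the calls below is generate_files'
        # max_cases argument, capping how many examples are drawn
        # from each generator.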
        utils.generate_files(
            identity_problem.generator(self.num_symbols, 40, 100000),
            self.training_filepaths(data_dir, 1, shuffled=True), 100)
        utils.generate_files(
            identity_problem.generator(self.num_symbols, 400, 10000),
            self.dev_filepaths(data_dir, 1, shuffled=True), 100)
Example #4
  def testGenerateFiles(self):
    tmp_dir = self.get_temp_dir()
    (_, tmp_file_path) = tempfile.mkstemp(dir=tmp_dir)
    tmp_file_name = os.path.basename(tmp_file_path)

    # Generate a trivial file and assert the file exists.
    def test_generator():
      yield {"inputs": [1], "target": [1]}

    filenames = generator_utils.train_data_filenames(tmp_file_name, tmp_dir, 1)
    generator_utils.generate_files(test_generator(), filenames)
    self.assertTrue(tf.gfile.Exists(tmp_file_path + "-train-00000-of-00001"))

    # Clean up.
    os.remove(tmp_file_path + "-train-00000-of-00001")
    os.remove(tmp_file_path)
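
The "-train-00000-of-00001" suffix asserted in this test comes from tensor2tensor's sharded-filename convention; a minimal sketch of that scheme (mirroring generator_utils.sharded_name, reproduced here only for illustration):

def sharded_name(base_name, shard, total_shards):
    # e.g. ("foo-train", 0, 1) -> "foo-train-00000-of-00001"
    return "%s-%.5d-of-%.5d" % (base_name, shard, total_shards)

print(sharded_name("/tmp/myproblem-train", 0, 1))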
Example #5
    def generate_data(self, data_dir, tmp_dir, task_id=-1):
        # task_id should be in [0, self.num_output_shards)
        assert (0 <= task_id) and (task_id < self.num_output_shards)

        # A task_id is only supposed to write one output shard; it can
        # operate over multiple *input* shards.
        input_files = self._task_id_to_input_files(task_id)
        output_file = self._task_id_to_output_file(data_dir, task_id)

        # Which output split is this task writing to?
        split, _, _ = self._task_id_to_output_split(task_id)

        # Actually generate examples.
        generator_utils.generate_files(
            self.generate_encoded_samples(data_dir, tmp_dir, split,
                                          input_files), [output_file])

        # Shuffle the output.
        generator_utils.shuffle_dataset([output_file],
                                        extra_fn=self._pack_fn())
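
Since this variant asserts a non-negative task_id and writes exactly one output shard per call, it is designed to be fanned out across workers, one task per output shard. A hypothetical driver loop, shown serially for clarity (MyShardedProblem, num_output_shards, and the paths are assumptions, not part of the snippet):

data_dir = "/tmp/t2t_data"  # hypothetical output directory
tmp_dir = "/tmp/t2t_tmp"    # hypothetical scratch directory

prob = MyShardedProblem()   # hypothetical subclass defining num_output_shards
for task_id in range(prob.num_output_shards):
    # Each call reads that task's input shards and writes one shuffled
    # output shard, so the iterations are independent of each other.
    prob.generate_data(data_dir, tmp_dir, task_id=task_id)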
Example #6
    def generate_data(self, data_dir, tmp_dir, task_id=-1):
        """Generates training/dev data.

    Args:
      data_dir: a string
      tmp_dir: a string
      task_id: an optional integer
    Returns:
      shard or shards for which data was generated.
    """
        tf.logging.info("generate_data task_id=%s" % task_id)
        encoder = self.get_or_create_vocab(data_dir, tmp_dir)
        assert task_id >= 0 and task_id < self.num_generate_tasks
        if task_id < self.num_train_shards:
            out_file = self.training_filepaths(data_dir,
                                               self.num_train_shards,
                                               shuffled=False)[task_id]
        else:
            out_file = self.dev_filepaths(
                data_dir, self.num_dev_shards,
                shuffled=False)[task_id - self.num_train_shards]
        generator_utils.generate_files(
            self.example_generator(encoder, tmp_dir, task_id), [out_file])
        generator_utils.shuffle_dataset([out_file])
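
The task_id-to-shard mapping above is plain arithmetic: ids in [0, num_train_shards) write training shards and the rest write dev shards. A self-contained sketch of that mapping (the shard counts are illustrative):

NUM_TRAIN_SHARDS = 100  # illustrative values
NUM_DEV_SHARDS = 1

def shard_for_task(task_id):
    # Mirrors the branch in the snippet: train shards first, then dev.
    assert 0 <= task_id < NUM_TRAIN_SHARDS + NUM_DEV_SHARDS
    if task_id < NUM_TRAIN_SHARDS:
        return ("train", task_id)
    return ("dev", task_id - NUM_TRAIN_SHARDS)

print(shard_for_task(0))    # ('train', 0)
print(shard_for_task(100))  # ('dev', 0)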
Example #7
    def generate_data(self, data_dir, tmp_dir, task_id=-1):
        train_gen = self.generator(tmp_dir, 162770)
        train_paths = self.training_filepaths(data_dir,
                                              self.train_shards,
                                              shuffled=False)
        generator_utils.generate_files(train_gen, train_paths)

        dev_gen = self.generator(tmp_dir, 19867, 162770)
        dev_paths = self.dev_filepaths(data_dir,
                                       self.dev_shards,
                                       shuffled=False)
        generator_utils.generate_files(dev_gen, dev_paths)

        test_gen = self.generator(tmp_dir, 19962, 162770 + 19867)
        test_paths = self.test_filepaths(data_dir,
                                         self.test_shards,
                                         shuffled=False)
        generator_utils.generate_files(test_gen, test_paths)

        generator_utils.shuffle_dataset(train_paths + dev_paths + test_paths)
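
Finally, a minimal usage sketch for the single-process path (task_id=-1) that most of these examples implement; the registry lookup and the snake_case problem name are assumptions based on tensor2tensor's usual registration convention:

from tensor2tensor.data_generators import all_problems  # noqa: F401, registers built-in problems
from tensor2tensor.utils import registry

prob = registry.problem("algorithmic_identity_binary40")  # assumed registry name
prob.generate_data("/tmp/t2t_data", "/tmp/t2t_tmp", task_id=-1)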