Example #1
  def generate_data(self, data_dir, tmp_dir, task_id=-1):
    filepath_fns = {
        problem.DatasetSplit.TRAIN: self.training_filepaths,
        problem.DatasetSplit.EVAL: self.dev_filepaths,
        problem.DatasetSplit.TEST: self.test_filepaths,
    }

    split_paths = [(split["split"], filepath_fns[split["split"]](
        data_dir, split["shards"], shuffled=False))
                   for split in self.dataset_splits]

    all_paths = []
    for _, paths in split_paths:
      all_paths.extend(paths)

    if self.is_generate_per_split:
      for split, paths in split_paths:
        generator_utils.generate_files(
            self.generate_samples(data_dir, tmp_dir, split), paths)
    else:
      generator_utils.generate_files(
          self.generate_samples(data_dir, tmp_dir, problem.DatasetSplit.TRAIN),
          all_paths)

    generator_utils.shuffle_dataset(all_paths)
Example #2
    def generate_data(self, data_dir, tmp_dir, task_id=-1):

        filepath_fns = {
            problem.DatasetSplit.TRAIN: self.training_filepaths,
            problem.DatasetSplit.EVAL: self.dev_filepaths,
            problem.DatasetSplit.TEST: self.test_filepaths,
        }

        split_paths = dict([(split["split"],
                             filepath_fns[split["split"]](data_dir,
                                                          split["shards"],
                                                          shuffled=False))
                            for split in self.dataset_splits])
        all_paths = []
        for paths in split_paths.values():
            all_paths.extend(paths)

        if self.is_generate_per_split:
            for split, paths in split_paths.items():
                generator_utils.generate_files(
                    self._maybe_pack_examples(
                        self.generate_encoded_samples(data_dir, tmp_dir,
                                                      split)), paths)
        else:
            generator_utils.generate_files(
                self._maybe_pack_examples(
                    self.generate_encoded_samples(data_dir, tmp_dir,
                                                  problem.DatasetSplit.TRAIN)),
                all_paths)

        generator_utils.shuffle_dataset(all_paths)
Example #3
    def generate_data(self, data_dir, tmp_dir, task_id=-1):
        """Generates training/dev data.

        Args:
          data_dir: a string
          tmp_dir: a string
          task_id: an optional integer
        Returns:
          shard or shards for which data was generated.
        """

        # In case of parallel execution, each shard is generated by a different
        # process
        if self.multiprocess_generate:
            tf.logging.info("generate_data task_id=%s" % task_id)
            assert task_id >= 0 and task_id < self.num_generate_tasks
            if task_id < self.num_train_shards:
                out_file = self.training_filepaths(data_dir,
                                                   self.num_train_shards,
                                                   shuffled=False)[task_id]
                dataset_split = problem.DatasetSplit.TRAIN
            else:
                out_file = self.dev_filepaths(
                    data_dir, self.num_dev_shards,
                    shuffled=False)[task_id - self.num_train_shards]
                dataset_split = problem.DatasetSplit.EVAL
            generator_utils.generate_files(
                self.generator(data_dir, tmp_dir, dataset_split, task_id),
                [out_file])
            generator_utils.shuffle_dataset([out_file])
        else:
            filepath_fns = {
                problem.DatasetSplit.TRAIN: self.training_filepaths,
                problem.DatasetSplit.EVAL: self.dev_filepaths,
                problem.DatasetSplit.TEST: self.test_filepaths,
            }

            split_paths = [
                (split["split"],
                 filepath_fns[split["split"]](data_dir,
                                              split["shards"],
                                              shuffled=self.already_shuffled))
                for split in self.dataset_splits
            ]

            all_paths = []
            for _, paths in split_paths:
                all_paths.extend(paths)

            if self.is_generate_per_split:
                for split, paths in split_paths:
                    generator_utils.generate_files(
                        self.generator(data_dir, tmp_dir, split), paths)
            else:
                generator_utils.generate_files(
                    self.generator(data_dir, tmp_dir,
                                   problem.DatasetSplit.TRAIN), all_paths)

            generator_utils.shuffle_dataset(all_paths)
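
The docstring and comment above note that, with multiprocess_generate enabled, each task_id produces exactly one output shard, so shards can be built by independent processes. Below is a minimal driver sketch under that assumption; the helper name run_all_shards and the use of multiprocessing.Pool are illustrative, not part of tensor2tensor.

import multiprocessing as mp

def run_all_shards(prob, data_dir, tmp_dir, num_workers=4):
    # One call per task_id; each call writes and shuffles a single shard,
    # so the calls are independent and safe to run in parallel.
    task_ids = range(prob.num_generate_tasks)
    with mp.Pool(num_workers) as pool:
        pool.starmap(prob.generate_data,
                     [(data_dir, tmp_dir, tid) for tid in task_ids])

In practice the problem instance must be picklable for Pool to ship it to worker processes; otherwise the same loop can simply run sequentially.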
Example #4
  def generate_data(self, data_dir, tmp_dir, task_id=-1):
    self.data_dir = data_dir
    # Determine whether we are in training or validation mode.
    self.mode = {problem.DatasetSplit.TRAIN: 'train',
                 problem.DatasetSplit.EVAL: 'dev',
                 problem.DatasetSplit.TEST: 'test'}
    filepath_fns = {problem.DatasetSplit.TRAIN: self.training_filepaths,
                    problem.DatasetSplit.EVAL: self.dev_filepaths,
                    problem.DatasetSplit.TEST: self.test_filepaths}

    split_paths = [(split['split'], filepath_fns[split['split']](
      data_dir, split['shards'], shuffled=self.already_shuffled))
      for split in self.dataset_splits]
    all_paths = []
    for _, paths in split_paths:
      all_paths.extend(paths)

    if self.is_generate_per_split:
      for split, paths in split_paths:
        # Create the source and target txt files from the raw data.
        self.preprocess_data(self.mode[split])
        generator_utils.generate_files(
            self.generate_encoded_samples(data_dir, tmp_dir, split), paths)
    else:
      self.preprocess_data(self.mode[problem.DatasetSplit.TRAIN])
      generator_utils.generate_files(
          self.generate_encoded_samples(
              data_dir, tmp_dir, problem.DatasetSplit.TRAIN), all_paths)

    generator_utils.shuffle_dataset(all_paths, extra_fn=self._pack_fn())
Example #5
    def generate_data(self, data_dir, tmp_dir, task_id=-1):
        del tmp_dir  # unused argument
        filepath_fns = {
            problem.DatasetSplit.TRAIN: self.training_filepaths,
            problem.DatasetSplit.EVAL: self.dev_filepaths,
            problem.DatasetSplit.TEST: self.test_filepaths,
        }

        split_paths = [(split['split'],
                        filepath_fns[split['split']](data_dir,
                                                     split['shards'],
                                                     shuffled=False))
                       for split in self.dataset_splits]
        all_paths = []
        for _, paths in split_paths:
            all_paths.extend(paths)

        if self.is_generate_per_split:
            for split, paths in split_paths:
                generator_utils.generate_files(
                    self.generate_encoded_samples(data_dir, tmp_dir, split),
                    paths)
        else:
            generator_utils.generate_files(
                self.generate_encoded_samples(data_dir, tmp_dir,
                                              problem.DatasetSplit.TRAIN),
                all_paths)

        generator_utils.shuffle_dataset(all_paths)
Example #6
    def generate_data(self, data_dir, tmp_dir, task_id=-1):
        # Dict mapping each dataset split to the method that builds its filenames.
        filepath_fns = {
            problem.DatasetSplit.TRAIN: self.training_filepaths,
            problem.DatasetSplit.EVAL: self.dev_filepaths,
            problem.DatasetSplit.TEST: self.test_filepaths,
        }
        # Build the filename list for each split according to the dataset_splits configuration.
        split_paths = [
            (split["split"],
             filepath_fns[split["split"]](data_dir,
                                          split["shards"],
                                          shuffled=self.already_shuffled))
            for split in self.dataset_splits
        ]
        all_paths = []
        for _, paths in split_paths:
            all_paths.extend(paths)
        # split is train/eval/test; paths is the list of filenames generated for that split's shards.
        if self.is_generate_per_split:
            for split, paths in split_paths:
                # generate_files(generator, filenames) writes the token-id samples produced by the generator into filenames.
                generator_utils.generate_files(
                    # generate_encoded_samples yields samples as token ids.
                    self.generate_encoded_samples(data_dir, tmp_dir, split),
                    paths)
        else:
            generator_utils.generate_files(
                self.generate_encoded_samples(data_dir, tmp_dir,
                                              problem.DatasetSplit.TRAIN),
                all_paths)

        generator_utils.shuffle_dataset(all_paths, extra_fn=self._pack_fn())
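
The comments in Example #6 lean on two Problem hooks, self.dataset_splits and self.is_generate_per_split. The sketch below shows how a tensor2tensor Text2TextProblem subclass commonly overrides them; the class name and shard counts are placeholders, not values from any of the projects above, and the remaining required hooks (e.g. generate_samples) are omitted.

from tensor2tensor.data_generators import problem, text_problems


class MyTextProblem(text_problems.Text2TextProblem):
    """Placeholder subclass; only the two hooks discussed above are shown."""

    @property
    def is_generate_per_split(self):
        # False: generate_encoded_samples is called once with the TRAIN split
        # and its examples are spread across every split's shards before shuffling.
        return False

    @property
    def dataset_splits(self):
        # One dict per split; "shards" controls how many filenames
        # filepath_fns[split["split"]](...) returns for that split.
        return [
            {"split": problem.DatasetSplit.TRAIN, "shards": 100},
            {"split": problem.DatasetSplit.EVAL, "shards": 1},
        ]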
Example #7
def generate_data_for_problem(problem):
  """Generate data for a problem in _SUPPORTED_PROBLEM_GENERATORS."""
  training_gen, dev_gen, test_gen = _SUPPORTED_PROBLEM_GENERATORS[problem]

  num_train_shards = FLAGS.num_shards or 10
  tf.logging.info("Generating training data for %s.", problem)
  train_output_files = generator_utils.train_data_filenames(
      problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
      num_train_shards)
  generator_utils.generate_files(training_gen(), train_output_files,
                                 FLAGS.max_cases)
  num_dev_shards = int(num_train_shards * 0.1)
  tf.logging.info("Generating development data for %s.", problem)
  dev_output_files = generator_utils.dev_data_filenames(
      problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
      num_dev_shards)
  generator_utils.generate_files(dev_gen(), dev_output_files)
  num_test_shards = int(num_train_shards * 0.1)
  test_output_files = []
  test_gen_data = test_gen()
  if test_gen_data is not None:
    tf.logging.info("Generating test data for %s.", problem)
    test_output_files = generator_utils.test_data_filenames(
        problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
        num_test_shards)
    generator_utils.generate_files(test_gen_data, test_output_files)
  all_output_files = train_output_files + dev_output_files + test_output_files
  generator_utils.shuffle_dataset(all_output_files)
Example #8
  def generate_data(self, data_dir, tmp_dir, task_id=-1):

    filepath_fns = {
        problem.DatasetSplit.TRAIN: self.training_filepaths,
        problem.DatasetSplit.EVAL: self.dev_filepaths,
        problem.DatasetSplit.TEST: self.test_filepaths,
    }

    split_paths = [(split["split"], filepath_fns[split["split"]](
        data_dir, split["shards"], shuffled=self.already_shuffled))
                   for split in self.dataset_splits]
    all_paths = []
    for _, paths in split_paths:
      all_paths.extend(paths)

    if self.is_generate_per_split:
      for split, paths in split_paths:
        generator_utils.generate_files(
            self.generate_encoded_samples(data_dir, tmp_dir, split), paths)
    else:
      generator_utils.generate_files(
          self.generate_encoded_samples(
              data_dir, tmp_dir, problem.DatasetSplit.TRAIN), all_paths)

    generator_utils.shuffle_dataset(all_paths, extra_fn=self._pack_fn())
Example #9
def generate_data_for_problem(problem):
    """Generate data for a problem in _SUPPORTED_PROBLEM_GENERATORS."""
    training_gen, dev_gen, test_gen = _SUPPORTED_PROBLEM_GENERATORS[problem]

    num_train_shards = FLAGS.num_shards or 10
    tf.logging.info("Generating training data for %s.", problem)
    train_output_files = generator_utils.train_data_filenames(
        problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
        num_train_shards)
    generator_utils.generate_files(training_gen(), train_output_files,
                                   FLAGS.max_cases)
    num_dev_shards = int(num_train_shards * 0.1)
    tf.logging.info("Generating development data for %s.", problem)
    dev_output_files = generator_utils.dev_data_filenames(
        problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
        num_dev_shards)
    generator_utils.generate_files(dev_gen(), dev_output_files)
    num_test_shards = int(num_train_shards * 0.1)
    test_output_files = []
    test_gen_data = test_gen()
    if test_gen_data is not None:
        tf.logging.info("Generating test data for %s.", problem)
        test_output_files = generator_utils.test_data_filenames(
            problem + generator_utils.UNSHUFFLED_SUFFIX, FLAGS.data_dir,
            num_test_shards)
        generator_utils.generate_files(test_gen_data, test_output_files)
    all_output_files = train_output_files + dev_output_files + test_output_files
    generator_utils.shuffle_dataset(all_output_files)
Example #10
    def generate_data(self, data_dir, tmp_dir, task_id=-1):
        train_paths = self.training_filepaths(data_dir,
                                              self.num_shards,
                                              shuffled=False)
        dev_paths = self.dev_filepaths(data_dir,
                                       self.num_dev_shards,
                                       shuffled=False)
        test_paths = self.test_filepaths(data_dir,
                                         self.num_test_shards,
                                         shuffled=True)

        generator_utils.generate_files(
            self.generator(data_dir, tmp_dir, self.TEST_DATASETS), test_paths)

        if self.use_train_shards_for_dev:
            all_paths = train_paths + dev_paths
            generator_utils.generate_files(
                self.generator(data_dir, tmp_dir, self.TRAIN_DATASETS),
                all_paths)
            generator_utils.shuffle_dataset(all_paths)
        else:
            generator_utils.generate_dataset_and_shuffle(
                self.generator(data_dir, tmp_dir,
                               self.TRAIN_DATASETS), train_paths,
                self.generator(data_dir, tmp_dir, self.DEV_DATASETS),
                dev_paths)
Example #11
 def generate_data(self, data_dir, tmp_dir, task_id=-1):
   train_paths = self.training_filepaths(
       data_dir, self.num_shards, shuffled=False)
   dev_paths = self.dev_filepaths(
       data_dir, self.num_dev_shards, shuffled=False)
   all_paths = train_paths + dev_paths
   generator_utils.generate_files(
       self.generator(data_dir, tmp_dir), all_paths)
   generator_utils.shuffle_dataset(all_paths)
Example #12
 def generate_data(self, data_dir, tmp_dir, task_id=-1):
   train_paths = self.training_filepaths(
       data_dir, self.num_shards, shuffled=False)
   dev_paths = self.dev_filepaths(
       data_dir, self.num_dev_shards, shuffled=False)
   all_paths = train_paths + dev_paths
   generator_utils.generate_files(
       self.generator(data_dir, tmp_dir), all_paths)
   generator_utils.shuffle_dataset(all_paths)
Example #13
    def generate_data(self, data_dir, tmp_dir, task_id=-1):
        data_dir = os.path.join(data_dir, self.name)

        filepath_fns = {
            problem.DatasetSplit.TRAIN: self.training_filepaths,
            problem.DatasetSplit.EVAL: self.dev_filepaths,
            problem.DatasetSplit.TEST: self.test_filepaths,
        }

        split_paths = []
        test_paths = []
        for split in self.dataset_splits:
            if split["split"] is not problem.DatasetSplit.TEST:
                split_paths.append((split["split"],
                                    filepath_fns[split["split"]](data_dir,
                                                                 split["shards"],
                                                                 shuffled=False)))
            else:
                test_paths.append((split["split"],
                                   filepath_fns[split["split"]](data_dir,
                                                                split["shards"],
                                                                shuffled=True)))

        all_paths = []
        for _, paths in split_paths:
            all_paths.extend(paths)

        if self.is_generate_per_split:
            for split, paths in split_paths:
                generator_utils.generate_files(
                    self._maybe_pack_examples(
                        self.generate_encoded_samples(data_dir, tmp_dir, split)), paths)
        else:
            generator_utils.generate_files(
                self._maybe_pack_examples(
                    self.generate_encoded_samples(
                        data_dir, tmp_dir, problem.DatasetSplit.TRAIN)), all_paths)

        generator_utils.shuffle_dataset(all_paths)

        test_split_paths = []
        for _, paths in test_paths:
            test_split_paths.extend(paths)

        if self.is_generate_per_split:
            for split, paths in test_paths:
                generator_utils.generate_files(
                    self._maybe_pack_examples(
                        self.generate_encoded_samples(data_dir, tmp_dir, split)), paths)
        else:
            generator_utils.generate_files(
                self._maybe_pack_examples(
                    self.generate_encoded_samples(
                        data_dir, tmp_dir, problem.DatasetSplit.TEST)), test_split_paths)
Example #14
  def generate_data(self, data_dir, tmp_dir, task_id=-1):
    try:
      # Download source data if download_url specified
      h5_filepath = generator_utils.maybe_download(tmp_dir, self.h5_file,
                                                   self.download_url)
    except NotImplementedError:
      # Otherwise, look for it locally
      h5_filepath = os.path.join(tmp_dir, self.h5_file)

    with h5py.File(h5_filepath, "r") as h5_file:
      num_train_examples = h5_file["train_in"].len()
      num_dev_examples = h5_file["valid_in"].len()
      num_test_examples = h5_file["test_in"].len()

    # Collect all_filepaths to later shuffle
    all_filepaths = []
    # Collect created shard processes to start and join
    processes = []

    datasets = [(self.training_filepaths, self.num_shards, "train",
                 num_train_examples), (self.dev_filepaths, 10, "valid",
                                       num_dev_examples),
                (self.test_filepaths, 10, "test", num_test_examples)]
    for fname_fn, nshards, key_prefix, num_examples in datasets:
      outfiles = fname_fn(data_dir, nshards, shuffled=False)
      all_filepaths.extend(outfiles)
      for start_idx, end_idx, outfile in generate_shard_args(
          outfiles, num_examples):
        p = mp.Process(
            target=generate_dataset,
            args=(h5_filepath, key_prefix, [outfile], self.chunk_size,
                  start_idx, end_idx))
        processes.append(p)

    # 1 per training shard + 10 for dev + 10 for test
    assert len(processes) == self.num_shards + 20

    # Start and wait for processes in batches
    num_batches = int(
        math.ceil(float(len(processes)) / MAX_CONCURRENT_PROCESSES))
    for i in xrange(num_batches):
      start = i * MAX_CONCURRENT_PROCESSES
      end = start + MAX_CONCURRENT_PROCESSES
      current = processes[start:end]
      for p in current:
        p.start()
      for p in current:
        p.join()

    # Shuffle
    generator_utils.shuffle_dataset(all_filepaths)
Example #15
  def generate_data(self, data_dir, tmp_dir, task_id=-1):
    try:
      # Download source data if download_url specified
      h5_filepath = generator_utils.maybe_download(tmp_dir, self.h5_file,
                                                   self.download_url)
    except NotImplementedError:
      # Otherwise, look for it locally
      h5_filepath = os.path.join(tmp_dir, self.h5_file)

    with h5py.File(h5_filepath, "r") as h5_file:
      num_train_examples = h5_file["train_in"].len()
      num_dev_examples = h5_file["valid_in"].len()
      num_test_examples = h5_file["test_in"].len()

    # Collect all_filepaths to later shuffle
    all_filepaths = []
    # Collect created shard processes to start and join
    processes = []

    datasets = [(self.training_filepaths, self.num_shards, "train",
                 num_train_examples), (self.dev_filepaths, 10, "valid",
                                       num_dev_examples),
                (self.test_filepaths, 10, "test", num_test_examples)]
    for fname_fn, nshards, key_prefix, num_examples in datasets:
      outfiles = fname_fn(data_dir, nshards, shuffled=False)
      all_filepaths.extend(outfiles)
      for start_idx, end_idx, outfile in generate_shard_args(
          outfiles, num_examples):
        p = mp.Process(
            target=generate_dataset,
            args=(h5_filepath, key_prefix, [outfile], self.chunk_size,
                  start_idx, end_idx))
        processes.append(p)

    # 1 per training shard + 10 for dev + 10 for test
    assert len(processes) == self.num_shards + 20

    # Start and wait for processes in batches
    num_batches = int(
        math.ceil(float(len(processes)) / MAX_CONCURRENT_PROCESSES))
    for i in range(num_batches):
      start = i * MAX_CONCURRENT_PROCESSES
      end = start + MAX_CONCURRENT_PROCESSES
      current = processes[start:end]
      for p in current:
        p.start()
      for p in current:
        p.join()

    # Shuffle
    generator_utils.shuffle_dataset(all_filepaths)
Example #16
    def generate_data(self, data_dir, tmp_dir, task_id=-1):
        train_paths = self.training_filepaths(data_dir, 10, shuffled=False)
        dev_paths = self.dev_filepaths(data_dir, 1, shuffled=True)

        midi_files = glob.glob('data/maestro/maestro-v2.0.0/*/*.midi')
        random.seed(13)
        random.shuffle(midi_files)

        generator_utils.generate_files(self.generator(midi_files[:50]),
                                       dev_paths)

        generator_utils.generate_files(self.generator(midi_files[50:]),
                                       train_paths)
        generator_utils.shuffle_dataset(train_paths)
Example #17
 def generate_data(self, data_dir, tmp_dir, task_id=-1):
   train_paths = self.training_filepaths(
       data_dir, self.num_shards, shuffled=False)
   dev_paths = self.dev_filepaths(
       data_dir, self.num_dev_shards, shuffled=False)
   if self.use_train_shards_for_dev:
     all_paths = train_paths + dev_paths
     generator_utils.generate_files(
         self.generator(data_dir, tmp_dir, True), all_paths)
     generator_utils.shuffle_dataset(all_paths)
   else:
     generator_utils.generate_dataset_and_shuffle(
         self.generator(data_dir, tmp_dir, True), train_paths,
         self.generator(data_dir, tmp_dir, False), dev_paths)
Example #18
  def generate_data(self, data_dir, tmp_dir, task_id=-1):
    train_gen = self.generator(tmp_dir, 162770)
    train_paths = self.training_filepaths(
        data_dir, self.train_shards, shuffled=False)
    generator_utils.generate_files(train_gen, train_paths)

    dev_gen = self.generator(tmp_dir, 19867, 162770)
    dev_paths = self.dev_filepaths(data_dir, self.dev_shards, shuffled=False)
    generator_utils.generate_files(dev_gen, dev_paths)

    test_gen = self.generator(tmp_dir, 19962, 162770+19867)
    test_paths = self.test_filepaths(data_dir, self.test_shards, shuffled=False)
    generator_utils.generate_files(test_gen, test_paths)

    generator_utils.shuffle_dataset(train_paths + dev_paths + test_paths)
Example #19
  def generate_data(self, data_dir, tmp_dir, task_id=-1):
    train_gen = self.generator(tmp_dir, 162770)
    train_paths = self.training_filepaths(
        data_dir, self.train_shards, shuffled=False)
    generator_utils.generate_files(train_gen, train_paths)

    dev_gen = self.generator(tmp_dir, 19867, 162770)
    dev_paths = self.dev_filepaths(data_dir, self.dev_shards, shuffled=False)
    generator_utils.generate_files(dev_gen, dev_paths)

    test_gen = self.generator(tmp_dir, 19962, 162770+19867)
    test_paths = self.test_filepaths(data_dir, self.test_shards, shuffled=False)
    generator_utils.generate_files(test_gen, test_paths)

    generator_utils.shuffle_dataset(train_paths + dev_paths + test_paths)
Example #20
 def generate_data(self, data_dir, tmp_dir, task_id=-1):
     """ TODO """
     train_paths = self.training_filepaths(
         data_dir, self.num_shards, shuffled=False)
     dev_paths = self.dev_filepaths(
         data_dir, self.num_dev_shards, shuffled=False)
     if self.use_train_shards_for_dev:
         all_paths = train_paths + dev_paths
         generator_utils.generate_files(
             self.generator(data_dir, tmp_dir, True), all_paths)
         generator_utils.shuffle_dataset(all_paths)
     else:
         generator_utils.generate_dataset_and_shuffle(
             self.generator(data_dir, tmp_dir, True), train_paths,
             self.generator(data_dir, tmp_dir, False), dev_paths)
Example #21
File: problem.py Project: zqma2/RL4NMT
 def generate_data(self, data_dir, tmp_dir, task_id=-1):
     train_paths = self.training_filepaths(
         data_dir, self.num_shards,
         shuffled=False)  # problem_name-train-00000-of-00001
     dev_paths = self.dev_filepaths(
         data_dir, self.num_dev_shards,
         shuffled=False)  # problem_name-dev-00000-of-00001
     if self.use_train_shards_for_dev:
         all_paths = train_paths + dev_paths
         generator_utils.generate_files(
             self.generator(data_dir, tmp_dir, True), all_paths)
         generator_utils.shuffle_dataset(all_paths)
     else:
         generator_utils.generate_dataset_and_shuffle(
             self.generator(data_dir, tmp_dir, True), train_paths,
             self.generator(data_dir, tmp_dir, False), dev_paths)
Example #22
    def generate_data(self, data_dir, tmp_dir, task_id=-1):
        _maybe_get_pmjtc_dataset(tmp_dir)
        self._maybe_save_image_meta(data_dir, tmp_dir)
        self._maybe_build_vocab(data_dir, tmp_dir)
        train_meta = self._load_image_meta(data_dir, 'train')
        dev_meta = self._load_image_meta(data_dir, 'dev')
        encoder = text_encoder.TokenTextEncoder(os.path.join(
            data_dir, self.vocab_name),
                                                replace_oov=OOV)

        train_paths = self.training_filepaths(data_dir,
                                              self.train_shards,
                                              shuffled=False)
        dev_paths = self.dev_filepaths(data_dir,
                                       self.dev_shards,
                                       shuffled=False)
        train_meta_shards = _split_into_shards(train_meta, self.train_shards)
        dev_meta_shards = _split_into_shards(dev_meta, self.dev_shards)

        datasets = ((train_meta_shards, train_paths), (dev_meta_shards,
                                                       dev_paths))

        all_paths = []
        threads = []
        thread_counter = 0
        for i in xrange(len(datasets)):
            for j in xrange(len(datasets[i][0])):
                meta_list = datasets[i][0][j]
                out_file = datasets[i][1][j]
                all_paths.append(out_file)
                t = threading.Thread(target=self.generate_data_shard,
                                     args=(thread_counter, meta_list, out_file,
                                           encoder))
                threads.append(t)
                thread_counter += 1

        num_batches = int(ceil(float(len(threads)) / MAX_CONCURRENT_THREADS))
        for i in xrange(num_batches):
            coord = tf.train.Coordinator()
            start = i * MAX_CONCURRENT_THREADS
            end = start + MAX_CONCURRENT_THREADS
            current = threads[start:end]
            for t in current:
                t.start()
            coord.join(current)

        generator_utils.shuffle_dataset(all_paths)
Example #23
    def generate_data(self, data_dir, tmp_dir, task_id=-1):
        train_paths = self.training_filepaths(data_dir,
                                              self.num_shards,
                                              shuffled=False)
        dev_paths = self.dev_filepaths(data_dir,
                                       self.num_dev_shards,
                                       shuffled=False)
        test_paths = self.test_filepaths(data_dir,
                                         self.num_test_shards,
                                         shuffled=True)

        generator_utils.generate_files(
            self.generator(data_dir, tmp_dir, TEST_DATASET), test_paths)

        all_paths = train_paths + dev_paths
        generator_utils.generate_files(
            self.generator(data_dir, tmp_dir, TRAIN_DATASET), all_paths)
        generator_utils.shuffle_dataset(all_paths)
Example #24
    def generate_data(self, data_dir, tmp_dir, task_id=-1):

        gen = self.generator(tmp_dir, shard_id=task_id)

        if self.mode == "train":
            paths = self.training_filepaths(data_dir,
                                            self.train_shards,
                                            shuffled=False)
            paths = sharded_subset_list(paths, self.train_shards, task_id)
        else:
            paths = self.test_filepaths(data_dir,
                                        self.test_shards,
                                        shuffled=False)
            paths = sharded_subset_list(paths, self.test_shards, task_id)

        generator_utils.generate_files(gen, paths)

        generator_utils.shuffle_dataset(paths)
Example #25
  def generate_data(self, data_dir, tmp_dir, task_id=-1):
    # task_id should be in [0, self.num_output_shards)
    assert (0 <= task_id) and (task_id < self.num_output_shards)

    # A task_id is only supposed to write only one output shard, it can operate
    # over multiple *input* shards.
    input_files = self._task_id_to_input_files(task_id)
    output_file = self._task_id_to_output_file(data_dir, task_id)

    # Which output split is this task writing to?
    split, _, _ = self._task_id_to_output_split(task_id)

    # Actually generate examples.
    generator_utils.generate_files(
        self.generate_encoded_samples(
            data_dir, tmp_dir, split, input_files),
        [output_file])

    # Shuffle the output.
    generator_utils.shuffle_dataset([output_file], extra_fn=self._pack_fn())
Example #26
    def generate_data(self,
                      data_dir,
                      tmp_dir,
                      task_id=-1):  # Generate the data and write it out as TFRecords under data_dir.

        filepath_fns = {  # A dict with three entries; each value is a function that returns the list of sharded filenames for its split.
            problem.DatasetSplit.TRAIN: self.training_filepaths,
            problem.DatasetSplit.EVAL: self.dev_filepaths,
            problem.DatasetSplit.TEST: self.test_filepaths,
        }

        split_paths = [(split["split"],
                        filepath_fns[split["split"]](data_dir,
                                                     split["shards"],
                                                     shuffled=False))
                       for split in self.dataset_splits]
        # self.dataset_splits holds the split configuration: a list of dicts
        # (one per split), each with two keys: the split name and the shard count.
        # training_filepaths(self, data_dir, num_shards, shuffled)

        # Each split contributes its num_shards filenames; all_paths gathers
        # every filename into a single list.
        all_paths = []
        for _, paths in split_paths:
            all_paths.extend(paths)

        # generate_encoded_samples returns an iterator; each sample is a dict
        # with two keys, "inputs" and "targets", whose values are lists of token ids.
        # _maybe_pack_examples optionally packs examples by length; the sample
        # format is otherwise unchanged.
        # generate_files writes the generator's samples out as TFRecords.
        if self.is_generate_per_split:
            for split, paths in split_paths:
                generator_utils.generate_files(
                    self._maybe_pack_examples(
                        self.generate_encoded_samples(data_dir, tmp_dir,
                                                      split)), paths)
        else:
            generator_utils.generate_files(
                self._maybe_pack_examples(
                    self.generate_encoded_samples(data_dir, tmp_dir,
                                                  problem.DatasetSplit.TRAIN)),
                all_paths)

        generator_utils.shuffle_dataset(all_paths)
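
The comments in Example #26 describe each encoded sample as a dict of token-id lists. Below is a toy generator in exactly that shape, purely for illustration: the pairs and ids are made up, and 1 is assumed to be tensor2tensor's EOS id.

def toy_encoded_samples():
    # Mirrors the format generate_encoded_samples is described to yield:
    # "inputs" and "targets" are lists of token ids, ending in EOS.
    pairs = [([5, 8, 13], [7, 4]), ([9, 2, 6], [11, 3])]
    for inputs, targets in pairs:
        yield {"inputs": inputs + [1], "targets": targets + [1]}

A generator of this kind is what generator_utils.generate_files consumes when writing the TFRecord shards.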
Example #27
    def generate_data(self, data_dir, tmp_dir, num_shards=None):
        if num_shards is None:
            num_shards = 100

        # Download source data
        h5_filepath = generator_utils.maybe_download(tmp_dir, self.h5_file,
                                                     self.download_url)
        with h5py.File(h5_filepath, "r") as h5_file:
            num_train_examples = h5_file["train_in"].len()
            num_dev_examples = h5_file["valid_in"].len()
            num_test_examples = h5_file["test_in"].len()

        # Collect all_filepaths to later shuffle
        all_filepaths = []
        # Collect created shard processes to start and join
        processes = []

        datasets = [(self.training_filepaths, num_shards, "train",
                     num_train_examples),
                    (self.dev_filepaths, 1, "valid", num_dev_examples),
                    (self.test_filepaths, 1, "test", num_test_examples)]
        for fname_fn, nshards, key_prefix, num_examples in datasets:
            outfiles = fname_fn(data_dir, nshards, shuffled=False)
            all_filepaths.extend(outfiles)
            for start_idx, end_idx, outfile in generate_shard_args(
                    outfiles, num_examples):
                p = mp.Process(target=generate_dataset,
                               args=(h5_filepath, key_prefix, [outfile],
                                     start_idx, end_idx))
                processes.append(p)

        # Start and wait for processes
        assert len(
            processes) == num_shards + 2  # 1 per training shard + dev + test
        for p in processes:
            p.start()
        for p in processes:
            p.join()

        # Shuffle
        generator_utils.shuffle_dataset(all_filepaths)
Example #28
 def generate_data(self, data_dir, tmp_dir, task_id=-1):
     train_paths = self.training_filepaths(data_dir,
                                           self.num_shards,
                                           shuffled=False)
     # dev_paths = self.dev_filepaths(
     #     data_dir, self.num_dev_shards, shuffled=False)
     test_paths = self.test_filepaths(data_dir,
                                      self.num_test_shards,
                                      shuffled=True)
     data = self.generator(data_dir,
                           tmp_dir,
                           _LJSPEECH_TTS_DATASET,
                           start_from=0,
                           how_many=100)
     generator_utils.generate_files(data, test_paths)
     data = self.generator(data_dir,
                           tmp_dir,
                           _LJSPEECH_TTS_DATASET,
                           start_from=100,
                           how_many=-1)
     generator_utils.generate_files(data, train_paths)
     generator_utils.shuffle_dataset(train_paths)
Example #29
    def generate_data(self, data_dir, tmp_dir, task_id=-1):

        raw_data_dir = _prepare_serchqa_data(tmp_dir)
        metadata_path = os.path.join(data_dir, "meta_data.json")

        train_file = os.path.join(raw_data_dir, "train.txt")
        dev_file = os.path.join(raw_data_dir, "val.txt")
        test_file = os.path.join(raw_data_dir, "test.txt")

        _build_vocab(generate_text_for_vocab(train_file), data_dir,
                     self.vocab_filename, self.vocab_size)

        encoder = self.get_or_create_vocab(data_dir, tmp_dir)
        self._extract_searchqa_metadata(encoder,
                                        [train_file, dev_file, test_file],
                                        metadata_path)

        filepath_fns = {
            problem.DatasetSplit.TRAIN: self.training_filepaths,
            problem.DatasetSplit.EVAL: self.dev_filepaths,
            problem.DatasetSplit.TEST: self.test_filepaths,
        }

        split_paths = [(split["split"],
                        filepath_fns[split["split"]](data_dir,
                                                     split["shards"],
                                                     shuffled=False))
                       for split in self.dataset_splits]
        all_paths = []
        for _, paths in split_paths:
            all_paths.extend(paths)

        for split, paths in split_paths:
            generator_utils.generate_files(
                self.generate_encoded_samples(data_dir, tmp_dir, split,
                                              encoder), paths)

        generator_utils.shuffle_dataset(all_paths)
Example #30
  def generate_data(self, data_dir, tmp_dir, task_id=-1):
    """Generates training/dev data.

    Args:
      data_dir: a string
      tmp_dir: a string
      task_id: an optional integer
    Returns:
      shard or shards for which data was generated.
    """
    tf.logging.info("generate_data task_id=%s" % task_id)
    encoder = self.get_or_create_vocab(data_dir, tmp_dir)
    assert task_id >= 0 and task_id < self.num_generate_tasks
    if task_id < self.num_train_shards:
      out_file = self.training_filepaths(
          data_dir, self.num_train_shards, shuffled=False)[task_id]
    else:
      out_file = self.dev_filepaths(
          data_dir, self.num_dev_shards,
          shuffled=False)[task_id - self.num_train_shards]
    generator_utils.generate_files(
        self.example_generator(encoder, tmp_dir, task_id), [out_file])
    generator_utils.shuffle_dataset([out_file])
Example #31
  def generate_data(self, data_dir, tmp_dir, task_id=-1):
    """Generates training/dev data.

    Args:
      data_dir: a string
      tmp_dir: a string
      task_id: an optional integer
    Returns:
      shard or shards for which data was generated.
    """
    tf.logging.info("generate_data task_id=%s" % task_id)
    encoder = self.get_or_create_vocab(data_dir, tmp_dir)
    assert task_id >= 0 and task_id < self.num_generate_tasks
    if task_id < self.num_train_shards:
      out_file = self.training_filepaths(
          data_dir, self.num_train_shards, shuffled=False)[task_id]
    else:
      out_file = self.dev_filepaths(
          data_dir, self.num_dev_shards,
          shuffled=False)[task_id - self.num_train_shards]
    generator_utils.generate_files(
        self.example_generator(encoder, tmp_dir, task_id), [out_file])
    generator_utils.shuffle_dataset([out_file])
Example #32
def main(_):
    train_shards = 100
    dev_shards = 1
    pred_shards = 1
    train_file_names = [
        os.path.join(
            FLAGS.data_dir,
            "{0}-{1}-train-000{2}-of-00{3}".format(FLAGS.src, FLAGS.tgt, i,
                                                   train_shards))
        for i in range(train_shards)
    ]

    dev_file_names = [
        os.path.join(
            FLAGS.data_dir,
            "{0}-{1}-dev-000{2}-of-00{3}".format(FLAGS.src, FLAGS.tgt, i,
                                                 dev_shards))
        for i in range(dev_shards)
    ]

    pred_file_names = [
        os.path.join(
            FLAGS.data_dir,
            "{0}-{1}-pred-000{2}-of-00{3}".format(FLAGS.src, FLAGS.tgt, i,
                                                  pred_shards))
        for i in range(pred_shards)
    ]

    train_generator = example_generator(FLAGS.tmp_dir, "train")

    eval_generator = example_generator(FLAGS.tmp_dir, "dev")

    pred_generator = example_generator(FLAGS.tmp_dir, "pred")

    generator_utils.generate_files(train_generator,
                                   train_file_names,
                                   cycle_every_n=10)

    generator_utils.generate_files(eval_generator,
                                   dev_file_names,
                                   cycle_every_n=10)

    generator_utils.generate_files(pred_generator,
                                   pred_file_names,
                                   cycle_every_n=10)

    generator_utils.shuffle_dataset(train_file_names)

    generator_utils.shuffle_dataset(dev_file_names)

    generator_utils.shuffle_dataset(pred_file_names)
Example #33
    def generate_data(self, data_dir, tmp_dir, task_id=-1):
        """Generates sharded Train, Dev and Test splits of the KGS and GoGoD Datasets.

        Assumes GoGoD zip from https://gogodonline.co.uk/ and
        KGS tar.gz from https://u-go.net/gamerecords/ are already downloaded and in the tmp_dir!

        Uses split fractions defined in self.split_fractions and
        num shards per split defined in self.{train/dev/test}_shards.
        Args:
            data_dir: (str), final data directory.
            tmp_dir: (str), directory containing KGS and GoGoD zips
            task_id: (int), task id.
        """
        data = self.generate_dataset(tmp_dir)

        for k, v in data.items():
            if v == []:
                raise ValueError("No {} files found!".format(k))

        # generate sharded TFRecord files of the train sgf's and shuffle
        tf.logging.info("Generating GoGoD and KGS train data")
        train_gen = self.generator(data["train"])
        train_paths = self.training_filepaths(data_dir,
                                              self.train_shards,
                                              shuffled=False)
        generator_utils.generate_files(train_gen, train_paths)
        generator_utils.shuffle_dataset(train_paths)

        # generate sharded TFRecord files of the dev sgf's and shuffle
        tf.logging.info("Generating GoGoD and KGS dev data")
        dev_gen = self.generator(data["dev"])
        dev_paths = self.dev_filepaths(data_dir,
                                       self.dev_shards,
                                       shuffled=False)
        generator_utils.generate_files(dev_gen, dev_paths)
        generator_utils.shuffle_dataset(dev_paths)

        # generate sharded TFRecord files of the test sgf's and shuffle
        tf.logging.info("Generating GoGoD and KGS test data")
        test_gen = self.generator(data["test"])
        test_paths = self.test_filepaths(data_dir,
                                         self.test_shards,
                                         shuffled=False)
        generator_utils.generate_files(test_gen, test_paths)
        generator_utils.shuffle_dataset(test_paths)
Example #34
  def generate_data(self, data_dir, tmp_dir, task_id=-1):

    if task_id == -1 or task_id is None:
      for i in range(FLAGS.wiki_revision_num_train_shards +
                     FLAGS.wiki_revision_num_dev_shards):
        self.generate_data(data_dir, tmp_dir, i)
      return

    tf.logging.info(
        "Flags for job (task_id {}): "
        "Dev shards: {}, Train shards: {}, "
        "Revision skip factor: {}, Max page size: 2**{}, Introduce errors: {},"
        "Percent Identical Examples: {}"
        "".format(task_id, FLAGS.wiki_revision_num_dev_shards,
                  FLAGS.wiki_revision_num_train_shards,
                  FLAGS.wiki_revision_revision_skip_factor,
                  FLAGS.wiki_revision_max_page_size_exp,
                  FLAGS.wiki_revision_introduce_errors,
                  FLAGS.wiki_revision_percent_identical_examples))

    if FLAGS.wiki_revision_vocab_file:
      encoder = wiki_revision_utils.get_encoder_from_vocab(
          FLAGS.wiki_revision_vocab_file)
    else:
      encoder = wiki_revision_utils.get_or_generate_vocabulary(
          data_dir, tmp_dir, FLAGS.wiki_revision_data_prefix,
          FLAGS.wiki_revision_max_page_size_exp, self.approx_vocab_size,
          self.strip)

    random.seed(123)
    if task_id < FLAGS.wiki_revision_num_train_shards:
      out_file = self.training_filepaths(
          data_dir, FLAGS.wiki_revision_num_train_shards,
          shuffled=False)[task_id]
    else:
      out_file = self.dev_filepaths(
          data_dir, FLAGS.wiki_revision_num_dev_shards,
          shuffled=False)[task_id - FLAGS.wiki_revision_num_train_shards]

    tf.logging.info("Generating files for path: %s", out_file)
    self.corpus_files = wiki_revision_utils.corpus_files_for_shard(
        task_id, FLAGS.wiki_revision_num_train_shards,
        FLAGS.wiki_revision_num_dev_shards, FLAGS.wiki_revision_data_prefix)
    example_generator = self.generator(encoder, self.corpus_files, tmp_dir)

    packed_example_generator = self._maybe_pack_examples(example_generator)
    generator_utils.generate_files(packed_example_generator, [out_file])
    generator_utils.shuffle_dataset([out_file])

    tf.logging.info(
        "Job stats: identity examples: {}, total examples {}, ratio: {}".format(
            self.num_identity_examples, self.num_total_examples,
            (1 + self.num_identity_examples) / (1 + self.num_total_examples)))

    job_stats_string = self.aggregate_job_stats()
    out_dir, filename = out_file.replace("-unshuffled", "").rsplit("/", 1)
    stats_prefix = "/stats_"
    stats_file_path = "".join([out_dir, stats_prefix, filename])
    if tf.gfile.Exists(
        stats_file_path) and tf.gfile.Open(stats_file_path).size() != 0:
      tf.logging.info("Skipping writing stats because output file exists.")
    else:
      with tf.gfile.Open(stats_file_path, "w") as out:
        tf.logging.info("Writing job stats to {}".format(stats_file_path))
        out.write(job_stats_string)

    tf.logging.info(job_stats_string)
Example #35
    def generate_data(self, data_dir, tmp_dir, task_id=-1):

        if task_id == -1 or task_id is None:
            for i in range(FLAGS.wiki_revision_num_train_shards +
                           FLAGS.wiki_revision_num_dev_shards):
                self.generate_data(data_dir, tmp_dir, i)
            return

        tf.logging.info(
            "Flags for job (task_id {}): "
            "Dev shards: {}, Train shards: {}, "
            "Revision skip factor: {}, Max page size: 2**{}, Introduce errors: {},"
            "Percent Identical Examples: {}"
            "".format(task_id, FLAGS.wiki_revision_num_dev_shards,
                      FLAGS.wiki_revision_num_train_shards,
                      FLAGS.wiki_revision_revision_skip_factor,
                      FLAGS.wiki_revision_max_page_size_exp,
                      FLAGS.wiki_revision_introduce_errors,
                      FLAGS.wiki_revision_percent_identical_examples))

        if FLAGS.wiki_revision_vocab_file:
            encoder = wiki_revision_utils.get_encoder_from_vocab(
                FLAGS.wiki_revision_vocab_file)
        else:
            encoder = wiki_revision_utils.get_or_generate_vocabulary(
                data_dir, tmp_dir, FLAGS.wiki_revision_data_prefix,
                FLAGS.wiki_revision_max_page_size_exp, self.approx_vocab_size,
                self.strip)

        random.seed(123)
        if task_id < FLAGS.wiki_revision_num_train_shards:
            out_file = self.training_filepaths(
                data_dir, FLAGS.wiki_revision_num_train_shards,
                shuffled=False)[task_id]
        else:
            out_file = self.dev_filepaths(
                data_dir, FLAGS.wiki_revision_num_dev_shards,
                shuffled=False)[task_id - FLAGS.wiki_revision_num_train_shards]

        tf.logging.info("Generating files for path: %s", out_file)
        self.corpus_files = wiki_revision_utils.corpus_files_for_shard(
            task_id, FLAGS.wiki_revision_num_train_shards,
            FLAGS.wiki_revision_num_dev_shards,
            FLAGS.wiki_revision_data_prefix)
        example_generator = self.generator(encoder, self.corpus_files, tmp_dir)

        packed_example_generator = self._maybe_pack_examples(example_generator)
        generator_utils.generate_files(packed_example_generator, [out_file])
        generator_utils.shuffle_dataset([out_file])

        tf.logging.info(
            "Job stats: identity examples: {}, total examples {}, ratio: {}".
            format(self.num_identity_examples, self.num_total_examples,
                   (1 + self.num_identity_examples) /
                   (1 + self.num_total_examples)))

        job_stats_string = self.aggregate_job_stats()
        out_dir, filename = out_file.replace("-unshuffled", "").rsplit("/", 1)
        stats_prefix = "/stats_"
        stats_file_path = "".join([out_dir, stats_prefix, filename])
        if tf.gfile.Exists(stats_file_path
                           ) and tf.gfile.Open(stats_file_path).size() != 0:
            tf.logging.info(
                "Skipping writing stats because output file exists.")
        else:
            with tf.gfile.Open(stats_file_path, "w") as out:
                tf.logging.info(
                    "Writing job stats to {}".format(stats_file_path))
                out.write(job_stats_string)

        tf.logging.info(job_stats_string)
Example #36
def main(_):
    train_shards = 10
    dev_shards = 1
    pred_shards = 1
    user_item_train_file_names = [
        os.path.join(
            FLAGS.data_dir,
            "{0}-train-000{1}-of-00{2}".format(FLAGS.user_item, i,
                                               train_shards))
        for i in range(train_shards)
    ]

    user_item_dev_file_names = [
        os.path.join(
            FLAGS.data_dir,
            "{0}-dev-000{1}-of-00{2}".format(FLAGS.user_item, i, dev_shards))
        for i in range(dev_shards)
    ]

    user_item_pred_file_names = [
        os.path.join(
            FLAGS.data_dir,
            "{0}-pred-000{1}-of-00{2}".format(FLAGS.user_item, i, pred_shards))
        for i in range(pred_shards)
    ]

    user_item_train_generator = user_item_example_generator(
        FLAGS.tmp_dir, "train")

    user_item_dev_generator = user_item_example_generator(FLAGS.tmp_dir, "dev")

    user_item_pred_generator = user_item_example_generator(
        FLAGS.tmp_dir, "pred")

    generator_utils.generate_files(user_item_train_generator,
                                   user_item_train_file_names,
                                   cycle_every_n=10)

    generator_utils.generate_files(user_item_dev_generator,
                                   user_item_dev_file_names,
                                   cycle_every_n=10)

    generator_utils.generate_files(user_item_pred_generator,
                                   user_item_pred_file_names,
                                   cycle_every_n=10)

    generator_utils.shuffle_dataset(user_item_train_file_names)

    generator_utils.shuffle_dataset(user_item_dev_file_names)

    generator_utils.shuffle_dataset(user_item_pred_file_names)

    also_view_train_file_names = [
        os.path.join(
            FLAGS.data_dir,
            "{0}-train-000{1}-of-00{2}".format(FLAGS.also_view, i,
                                               train_shards))
        for i in range(train_shards)
    ]

    also_view_dev_file_names = [
        os.path.join(
            FLAGS.data_dir,
            "{0}-dev-000{1}-of-00{2}".format(FLAGS.also_view, i, dev_shards))
        for i in range(dev_shards)
    ]

    also_view_pred_file_names = [
        os.path.join(
            FLAGS.data_dir,
            "{0}-pred-000{1}-of-00{2}".format(FLAGS.also_view, i, pred_shards))
        for i in range(pred_shards)
    ]

    also_view_train_generator = also_view_example_generator(
        FLAGS.tmp_dir, "train")

    also_view_dev_generator = also_view_example_generator(FLAGS.tmp_dir, "dev")

    also_view_pred_generator = also_view_example_generator(
        FLAGS.tmp_dir, "pred")

    generator_utils.generate_files(also_view_train_generator,
                                   also_view_train_file_names,
                                   cycle_every_n=10)

    generator_utils.generate_files(also_view_dev_generator,
                                   also_view_dev_file_names,
                                   cycle_every_n=10)

    generator_utils.generate_files(also_view_pred_generator,
                                   also_view_pred_file_names,
                                   cycle_every_n=10)

    generator_utils.shuffle_dataset(also_view_train_file_names)

    generator_utils.shuffle_dataset(also_view_dev_file_names)

    generator_utils.shuffle_dataset(also_view_pred_file_names)
Example #37
    def generate_data(self, data_dir, tmp_dir, task_id=-1):
        test = 20
        train_paths = self.training_filepaths(data_dir,
                                              self.num_shards,
                                              shuffled=False)
        dev_paths = self.dev_filepaths(data_dir,
                                       self.num_dev_shards,
                                       shuffled=False)
        test_paths = self.test_filepaths(data_dir,
                                         self.num_test_shards,
                                         shuffled=True)
        try_num = 0
        if test:
            try_num = test
        manager = Manager()
        lock = manager.Lock()

        #shared_dict=manager.dict({'current_id':id_init,"current_last_updated":0,"record_num":0,"source_index":0})
        def process_files(train_paths, datasets, num_run, shared_dict):

            total_file_num = len(train_paths)
            num_per_partition = int(math.floor(total_file_num / num_run))
            train_paths_list = []
            for i in range(num_run):
                if i == num_run - 1:
                    train_paths_list.append(train_paths[i *
                                                        num_per_partition:])
                else:
                    train_paths_list.append(
                        train_paths[i * num_per_partition:(i + 1) *
                                    num_per_partition])
            generator_list = []
            for i in range(num_run):
                generator_list.append(
                    self.generator(data_dir,
                                   tmp_dir,
                                   datasets,
                                   lock,
                                   shared_dict,
                                   how_many=try_num))

            p = []
            for i in range(num_run):
                p.append(
                    Process(target=generator_utils.generate_files,
                            args=(generator_list[i], train_paths_list[i],
                                  try_num)))
                p[i].start()
            my_logger.error("Time: {} All processes started".format(
                str(datetime.datetime.now())))
            for q in p:
                q.join()
            my_logger.error("Time: {} All processes ended".format(
                str(datetime.datetime.now())))

        shared_dict = manager.dict({
            'current_id': id_init,
            "current_last_updated": 0,
            "record_num": 0,
            "source_index": 0
        })
        num_run = min(self.process_num, self.num_shards)
        process_files(train_paths, self.train_sources, num_run, shared_dict)
        if len(self.eval_sources) == 0:
            generator_utils.shuffle_dataset(train_paths)

        else:
            shared_dict["current_id"] = id_init
            shared_dict["current_last_updated"] = 0
            shared_dict["record_num"] = 0
            shared_dict["source_index"] = 0
            num_run = min(self.process_num, self.num_dev_shards)
            my_logger.error("Time: {} process dev dataset".format(
                str(datetime.datetime.now())))
            process_files(dev_paths, self.eval_sources, num_run, shared_dict)
            my_logger.error("Time: {} shuffle dataset".format(
                str(datetime.datetime.now())))
            generator_utils.shuffle_dataset(train_paths + dev_paths)
        shared_dict["current_id"] = id_init
        shared_dict["current_last_updated"] = 0
        shared_dict["record_num"] = 0
        shared_dict["source_index"] = 0
        num_run = min(self.process_num, self.num_test_shards)
        process_files(test_paths, self.test_sources, num_run, shared_dict)
Example #38
import youtokentome as yttm
from pyspark.sql import SparkSession
from tensor2tensor.data_generators import generator_utils

MAX_LEN = 128
bpe = yttm.BPE(model="models/model_ruzh_47k.yttm")
# The original snippet relied on a pre-created `spark` session (e.g. from a
# notebook environment); this recreates one explicitly.
spark = SparkSession.builder.getOrCreate()


def gen(path):
    dfs = spark.read.load(path)
    for row in dfs.head(1000000000):
        sample = {
            "inputs": bpe.encode([row.zh], eos=True)[0][:MAX_LEN],
            "targets": bpe.encode([row.ru], eos=True)[0][:MAX_LEN]
        }
        yield sample


total = 32
for i in range(0, total):
    generator_utils.generate_files(
        gen("hdfs://ryzen:9000/user/root/dataset/mt/shuffled-ru-zh.parquet/part-%05d-*"
            % i), [
                "tfrecords/translate_zhru-train-%05d-of-%05d-unshuffled" %
                (i, total)
            ])

all_paths = [
    "tfrecords/translate_zhru-train-%05d-of-%05d-unshuffled" % (i, total)
    for i in range(total)
]
generator_utils.shuffle_dataset(all_paths)