Exemplo n.º 1
0
def main(argv):
  """Converts pretraining interactions into TF examples via a Beam pipeline.

  Reads all settings from absl FLAGS, assembles the conversion config,
  then builds and runs the pretraining data pipeline.

  Args:
    argv: Positional command-line arguments; only the program name is allowed.

  Raises:
    app.UsageError: If any extra positional arguments were passed.
  """
  if len(argv) > 1:
    raise app.UsageError("Too many command-line arguments.")

  # All tokenization/masking knobs come straight from the command line.
  conversion_config = tf_example_utils.PretrainConversionConfig(
      vocab_file=FLAGS.vocab_file,
      max_seq_length=FLAGS.max_seq_length,
      max_predictions_per_seq=FLAGS.max_predictions_per_seq,
      random_seed=FLAGS.random_seed,
      masked_lm_prob=FLAGS.masked_lm_prob,
      max_column_id=FLAGS.max_column_id,
      max_row_id=FLAGS.max_row_id,
      min_question_length=FLAGS.min_question_length,
      max_question_length=FLAGS.max_question_length,
      always_continue_cells=FLAGS.always_continue_cells,
      strip_column_names=False,
  )
  # Build the pipeline object first, then hand it off to the runner.
  data_pipeline = pretrain_utils.build_pretrain_data_pipeline(
      input_file=FLAGS.input_file,
      output_dir=FLAGS.output_dir,
      config=conversion_config,
      dupe_factor=FLAGS.dupe_factor,
      min_num_rows=FLAGS.min_num_rows,
      min_num_columns=FLAGS.min_num_columns,
  )
  beam_runner.run(data_pipeline)
Exemplo n.º 2
0
    def test_end_to_end(self, runner_type, always_continue_cells):
        """Runs the full pretraining pipeline and checks both output splits.

        Builds a tiny vocab, converts the fixture interactions with the given
        runner, and asserts that train and test TFRecord shards are non-empty.
        """
        self._create_vocab(list(_RESERVED_SYMBOLS) + ['released'])

        # Deliberately tiny limits (seq len 10, 3x3 table) to keep the
        # end-to-end run fast.
        conversion_config = tf_example_utils.PretrainConversionConfig(
            vocab_file=self._vocab_path,
            max_seq_length=10,
            max_predictions_per_seq=10,
            random_seed=5,
            masked_lm_prob=0.5,
            max_column_id=3,
            max_row_id=3,
            min_question_length=1,
            max_question_length=4,
            always_continue_cells=always_continue_cells,
            strip_column_names=False)
        data_pipeline = create_data.build_pretraining_pipeline(
            input_file=os.path.join(self._test_dir,
                                    'pretrain_interactions.txtpb'),
            output_suffix='.tfrecord',
            output_dir=self._output_path,
            config=conversion_config,
            dupe_factor=2,
            min_num_columns=0,
            min_num_rows=0,
            num_corpus_bins=2,
        )

        # Block until the Beam job has fully written its outputs.
        beam_runner.run_type(data_pipeline, runner_type).wait_until_finish()

        for name in ('train', 'test'):
            shard_path = os.path.join(self._output_path, f'{name}.tfrecord')
            self.assertNotEmpty(_read_examples(shard_path))
Exemplo n.º 3
0
    def test_end_to_end(self, runner_type):
        """Runs the pretraining pipeline into a temp dir and checks outputs.

        Creates a throwaway vocab and output directory, executes the pipeline
        with the given runner, and asserts that both the train and test
        TFRecord files contain at least one example.
        """
        with tempfile.TemporaryDirectory() as temp_dir:
            vocab_path = os.path.join(temp_dir, "vocab.txt")
            _create_vocab(list(_RESERVED_SYMBOLS) + ["released"], vocab_path)

            # Tiny config (seq len 10, 3x3 table) keeps the run fast.
            conversion_config = tf_example_utils.PretrainConversionConfig(
                vocab_file=vocab_path,
                max_seq_length=10,
                max_predictions_per_seq=10,
                random_seed=5,
                masked_lm_prob=0.5,
                max_column_id=3,
                max_row_id=3,
                min_question_length=1,
                max_question_length=4,
                always_continue_cells=True,
                strip_column_names=False)
            data_pipeline = pretrain_utils.build_pretrain_data_pipeline(
                input_file=os.path.join(self._test_dir,
                                        "pretrain_interactions.txtpb"),
                output_dir=temp_dir,
                config=conversion_config,
                dupe_factor=2,
                min_num_columns=0,
                min_num_rows=0,
                num_splits=2,
            )

            # Block until the Beam job has fully written its outputs.
            beam_runner.run_type(data_pipeline, runner_type).wait_until_finish()

            logging.info("temp dir: %s", os.listdir(temp_dir))
            for name in ("train", "test"):
                shard_path = os.path.join(temp_dir, f"{name}.tfrecord")
                self.assertNotEmpty(list(_read_examples(shard_path)))