def generate_data_for_registered_problem(problem_name):
    """Generate data for a registered problem."""
    tf.logging.info("Generating data for %s.", problem_name)
    if FLAGS.num_shards:
        raise ValueError(
            "--num_shards should not be set for registered Problem.")
    problem = registry.problem(problem_name)
    task_id = None if FLAGS.task_id < 0 else FLAGS.task_id
    data_dir = os.path.expanduser(FLAGS.data_dir)
    tmp_dir = os.path.expanduser(FLAGS.tmp_dir)
    if task_id is None and problem.multiprocess_generate:
        if FLAGS.task_id_start != -1:
            assert FLAGS.task_id_end != -1
            task_id_start = FLAGS.task_id_start
            task_id_end = FLAGS.task_id_end
        else:
            task_id_start = 0
            task_id_end = problem.num_generate_tasks
        # The process can hang if the data lives on a remote filesystem (e.g.
        # CNS) and the pool is created after prepare_to_generate, so create
        # the pool first.
        pool = multiprocessing.Pool(processes=FLAGS.num_concurrent_processes)
        problem.prepare_to_generate(data_dir, tmp_dir)
        args = [(problem_name, data_dir, tmp_dir, task_id)
                for task_id in range(task_id_start, task_id_end)]
        pool.map(generate_data_in_process, args)
    else:
        problem.generate_data(data_dir, tmp_dir, task_id)
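
# A hedged usage sketch (not part of the original script): the flag names
# mirror those referenced above; the paths and problem name are placeholders.
FLAGS.data_dir = "/tmp/t2t_data"
FLAGS.tmp_dir = "/tmp/t2t_tmp"
FLAGS.num_shards = 0  # must stay unset for a registered Problem
FLAGS.task_id = -1    # a negative task_id means no specific task
generate_data_for_registered_problem("translate_ende_wmt32k")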
Example #2
def add_problem_hparams(hparams, problem_name_or_instance):
    """Add problem hparams for the problems."""
    if isinstance(problem_name_or_instance, Problem):
        problem = problem_name_or_instance
    else:
        problem = registry.problem(problem_name_or_instance)
    p_hparams = problem.get_hparams(hparams)
    hparams.problem = problem
    hparams.problem_hparams = p_hparams
def generate_data_in_process(arg):
    """Worker entry point: generate data for a single task id."""
    problem_name, data_dir, tmp_dir, task_id = arg
    problem = registry.problem(problem_name)
    problem.generate_data(data_dir, tmp_dir, task_id)
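
# A hedged usage sketch of add_problem_hparams; the hparams set, data_dir and
# problem name below are placeholder assumptions, not part of the original.
from tensor2tensor.layers import common_hparams

hparams = common_hparams.basic_params1()
hparams.add_hparam("data_dir", "/tmp/t2t_data")  # assumed not set yet
add_problem_hparams(hparams, "translate_ende_wmt32k")
print(hparams.problem.name)  # the registered Problem instance is now attached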
Example #4
def create_experiment(run_config,
                      hparams,
                      model_name,
                      problem_name,
                      data_dir,
                      train_steps,
                      eval_steps,
                      min_eval_frequency=2000,
                      schedule="train_and_evaluate",
                      decode_hparams=None,
                      eval_timeout_mins=240,
                      use_tpu=False,
                      train_with_low_level_api=False,
                      decode_with_low_level_api=False,
                      train_and_decode_with_low_level_api=False,
                      tpu_num_hosts=1,
                      iterations_per_loop=1000,
                      decode_from_file=None,
                      decode_to_file=None,
                      decode_reference=None):
    """Create Experiment."""
    # HParams
    hparams.add_hparam("model_dir", run_config.model_dir)
    hparams.add_hparam("data_dir", data_dir)
    hparams.add_hparam("train_steps", train_steps)
    hparams.add_hparam("eval_steps", eval_steps)
    hparams.add_hparam("schedule", schedule)
    hparams.add_hparam("eval_freq_in_steps", min_eval_frequency)
    hparams.add_hparam("eval_timeout_mins", eval_timeout_mins)
    hparams.add_hparam("train_with_low_level_api", train_with_low_level_api)
    hparams.add_hparam("decode_with_low_level_api", decode_with_low_level_api)
    hparams.add_hparam("train_and_decode_with_low_level_api",
                       train_and_decode_with_low_level_api)
    if decode_hparams is not None:
        decode_hparams.add_hparam("decode_from_file", decode_from_file)
        decode_hparams.add_hparam("decode_to_file", decode_to_file)
        decode_hparams.add_hparam("decode_reference", decode_reference)
    add_problem_hparams(hparams, problem_name)

    # Input fns from Problem
    problem = hparams.problem
    train_input_fn = problem.make_estimator_input_fn(
        tf.estimator.ModeKeys.TRAIN, hparams)
    eval_input_fn = problem.make_estimator_input_fn(tf.estimator.ModeKeys.EVAL,
                                                    hparams)

    if train_with_low_level_api:
        params = {}
        params["batch_size"] = problem.tpu_batch_size_per_shard(hparams)
        params["tpu_num_hosts"] = tpu_num_hosts
        mlp_log.mlperf_print(key="global_batch_size",
                             value=params["batch_size"] *
                             run_config.tpu_config.num_shards)
        trunner = train_low_level_runner.TrainLowLevelRunner(
            iterations=iterations_per_loop)
        model_fn = t2t_model.T2TModel.make_estimator_model_fn(
            model_name,
            hparams,
            decode_hparams=decode_hparams,
            use_tpu=use_tpu)
        trunner.initialize(train_input_fn, model_fn, params, hparams,
                           run_config)

    if decode_with_low_level_api:
        if decode_hparams.batch_size:
            hparams.batch_size = decode_hparams.batch_size
            hparams.use_fixed_batch_size = True
        dataset_kwargs = {
            "shard":
            decode_hparams.shard_id if decode_hparams.shards > 1 else None,
            "dataset_split": tf.estimator.ModeKeys.EVAL,
            "max_records": decode_hparams.num_samples
        }
        infer_input_fn = problem.make_estimator_input_fn(
            tf.estimator.ModeKeys.PREDICT,
            hparams,
            dataset_kwargs=dataset_kwargs)

        params = {}
        # Currently, the decoding part runs on a single donut; this will
        # change for distributed eval.
        params["batch_size"] = int(decode_hparams.batch_size * tpu_num_hosts /
                                   run_config.tpu_config.num_shards)
        erunner = eval_low_level_runner.EvalLowLevelRunner(eval_steps=int(
            math.ceil(decode_hparams.num_samples / decode_hparams.batch_size)))
        model_fn = t2t_model.T2TModel.make_estimator_model_fn(
            model_name,
            hparams,
            decode_hparams=decode_hparams,
            use_tpu=use_tpu)
        erunner.initialize(infer_input_fn, params, run_config)
        erunner.build_model(model_fn, params, run_config)

    if train_and_decode_with_low_level_api:
        mlp_log.mlperf_print(key="max_sequence_length",
                             value=hparams.max_length)
        fake_train_input_fn = problem.make_estimator_input_fn(
            tf.estimator.ModeKeys.TRAIN, hparams, fake_data=True)
        params = {}
        params["batch_size"] = problem.tpu_batch_size_per_shard(hparams)
        params["tpu_num_hosts"] = tpu_num_hosts
        mlp_log.mlperf_print(key="global_batch_size",
                             value=params["batch_size"] *
                             run_config.tpu_config.num_shards)
        runner = low_level_runner.LowLevelRunner(
            iterations=iterations_per_loop,
            eval_steps=int(
                math.ceil(decode_hparams.num_samples /
                          decode_hparams.batch_size)))
        model_fn = t2t_model.T2TModel.make_estimator_model_fn(
            model_name,
            hparams,
            decode_hparams=decode_hparams,
            use_tpu=use_tpu)

        # Switch to the unpacked problem for decoding.
        if "_packed" in hparams.problem.name:
            problem = registry.problem(
                hparams.problem.name.replace("_packed", ""))
            p_hparams = problem.get_hparams(hparams)
            hparams.problem = problem
            hparams.problem_hparams = p_hparams

        # Hard-coded max length based on the current WMT14 En-De eval dataset.
        hparams.max_length = 97

        if decode_hparams.batch_size:
            hparams.batch_size = decode_hparams.batch_size
            hparams.use_fixed_batch_size = True
        dataset_kwargs = {
            "shard":
            decode_hparams.shard_id if decode_hparams.shards > 1 else None,
            "dataset_split": tf.estimator.ModeKeys.EVAL,
            "max_records": decode_hparams.num_samples
        }
        fake_infer_input_fn = problem.make_estimator_input_fn(
            tf.estimator.ModeKeys.PREDICT,
            hparams,
            fake_data=True,
            dataset_kwargs=dataset_kwargs)
        infer_input_fn = problem.make_estimator_input_fn(
            tf.estimator.ModeKeys.PREDICT,
            hparams,
            dataset_kwargs=dataset_kwargs)
        infer_model_fn = t2t_model.T2TModel.make_estimator_model_fn(
            model_name,
            hparams,
            decode_hparams=decode_hparams,
            use_tpu=use_tpu)
        runner.initialize(fake_train_input_fn, fake_infer_input_fn,
                          train_input_fn, infer_input_fn, model_fn,
                          infer_model_fn, params, hparams, run_config)

    # Estimator
    estimator = create_estimator(model_name,
                                 hparams,
                                 run_config,
                                 schedule=schedule,
                                 decode_hparams=decode_hparams,
                                 use_tpu=use_tpu)

    # Eval on TPU Pods is not supported yet
    if use_tpu and run_config.tpu_config.num_shards > 8 and "eval" in schedule:
        raise ValueError("Eval is not currently supported on a TPU Pod")

    train_spec = tf.estimator.TrainSpec(train_input_fn, max_steps=train_steps)
    eval_spec = tf.estimator.EvalSpec(
        eval_input_fn,
        steps=eval_steps,
        start_delay_secs=0 if hparams.schedule == "evaluate" else 120,
        exporters=None)

    return T2TExperiment(
        estimator, hparams, train_spec, eval_spec, decode_hparams,
        trunner if train_with_low_level_api else None,
        erunner if decode_with_low_level_api else None,
        runner if train_and_decode_with_low_level_api else None)
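
# A hedged sketch of wiring create_experiment together. The helpers
# trainer_lib.create_run_config / create_hparams and their arguments are
# assumptions based on the usual tensor2tensor entry points; the paths,
# names and step counts are placeholders.
run_config = trainer_lib.create_run_config(model_dir="/tmp/t2t_train")
hparams = trainer_lib.create_hparams("transformer_base")
experiment = create_experiment(
    run_config,
    hparams,
    model_name="transformer",
    problem_name="translate_ende_wmt32k",
    data_dir="/tmp/t2t_data",
    train_steps=100000,
    eval_steps=100)
experiment.train_and_evaluate()  # or experiment.train_and_decode()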
Example #5
    def train_and_decode(self):
        """Does decode after training every eval_freq_in_steps."""
        eval_steps = self._hparams.eval_freq_in_steps
        if self._hparams.train_and_decode_with_low_level_api:
            self._runner.train_and_eval(self._train_spec.max_steps,
                                        self._hparams.batch_size)
            for i in range(0, self._train_spec.max_steps, eval_steps):
                if self._hparams.mlperf_mode:
                    self._decode_hparams.mlperf_decode_step = i + eval_steps
                output_dir = os.path.join(self._estimator.model_dir, "decode")
                tf.gfile.MakeDirs(output_dir)
                output_dirs = [output_dir]
                result = list(self._runner.dequeue(self._decode_hparams))

                mlp_log.mlperf_print(
                    "eval_start",
                    None,
                    metadata={"epoch_num": (i // eval_steps + 1)})
                predictions = []
                inputs_vocab = self._hparams.problem_hparams.vocabulary[
                    "inputs"]
                targets_vocab = self._hparams.problem_hparams.vocabulary[
                    "targets"]
                for prediction in result:
                    inputs = prediction.get("inputs")
                    targets = prediction.get("targets")
                    outputs = prediction.get("outputs")
                    if not re.match(
                            "^({})+$".format(text_encoder.PAD),
                            inputs_vocab.decode(
                                decoding.save_until_eos(inputs))):
                        predictions.append(
                            (targets_vocab.decode(
                                decoding.save_until_eos(outputs)),
                             targets_vocab.decode(
                                 decoding.save_until_eos(targets))))
                decoding.run_postdecode_hooks(
                    decoding.DecodeHookArgs(
                        estimator=self._estimator,
                        problem=self._hparams.problem,
                        output_dirs=output_dirs,
                        hparams=self._hparams,
                        decode_hparams=self._decode_hparams,
                        predictions=predictions), tf.estimator.ModeKeys.EVAL)

                mlp_log.mlperf_print("block_stop",
                                     None,
                                     metadata={
                                         "first_epoch_num":
                                         (i // eval_steps + 1),
                                         "epoch_count": 1
                                     })
                if self._hparams.mlperf_mode and self._decode_hparams.mlperf_success:
                    break

            self._runner.shutdown()
        else:
            mlp_log.mlperf_print(key="init_stop", value=None)
            mlp_log.mlperf_print(key="run_start", value=None)
            packed_dataset = "_packed" in self._hparams.problem.name
            for i in range(0, self._train_spec.max_steps, eval_steps):
                mlp_log.mlperf_print("block_start",
                                     None,
                                     metadata={
                                         "first_epoch_num":
                                         (i // eval_steps + 1),
                                         "epoch_count": 1
                                     })
                if packed_dataset and i > 0:
                    problem = registry.problem(self._hparams.problem.name +
                                               "_packed")
                    p_hparams = problem.get_hparams(self._hparams)
                    self._hparams.problem = problem
                    self._hparams.problem_hparams = p_hparams
                self._estimator.train(self._train_spec.input_fn,
                                      steps=eval_steps,
                                      hooks=self._train_spec.hooks)
                if packed_dataset:
                    problem = registry.problem(
                        self._hparams.problem.name.replace("_packed", ""))
                    p_hparams = problem.get_hparams(self._hparams)
                    self._hparams.problem = problem
                    self._hparams.problem_hparams = p_hparams
                if self._hparams.mlperf_mode:
                    self._decode_hparams.mlperf_decode_step = i + eval_steps
                predictions = self.decode(
                    dataset_split=tf.estimator.ModeKeys.EVAL)
                mlp_log.mlperf_print("block_stop",
                                     None,
                                     metadata={
                                         "first_epoch_num":
                                         (i // eval_steps + 1),
                                         "epoch_count": 1
                                     })
                if self._hparams.mlperf_mode and self._decode_hparams.mlperf_success:
                    break

        if self._hparams.mlperf_mode and not self._decode_hparams.mlperf_success:
            mlp_log.mlperf_print("run_stop",
                                 None,
                                 metadata={"status": "abort"})
        return predictions, self._train_spec.max_steps
def score_file(filename):
    """Score each line in a file and return the scores."""
    # Prepare model.
    hparams = create_hparams()
    encoders = registry.problem(FLAGS.problem).feature_encoders(FLAGS.data_dir)
    has_inputs = "inputs" in encoders

    # Prepare features for feeding into the model.
    if has_inputs:
        inputs_ph = tf.placeholder(dtype=tf.int32)  # Just length dimension.
        batch_inputs = tf.reshape(inputs_ph, [1, -1, 1, 1])  # Make it 4D.
    targets_ph = tf.placeholder(dtype=tf.int32)  # Just length dimension.
    batch_targets = tf.reshape(targets_ph, [1, -1, 1, 1])  # Make it 4D.
    features = {
        "inputs": batch_inputs,
        "targets": batch_targets,
    } if has_inputs else {
        "targets": batch_targets
    }

    # Prepare the model and the graph when model runs on features.
    model = registry.model(FLAGS.model)(hparams, tf.estimator.ModeKeys.EVAL)
    _, losses = model(features)
    saver = tf.train.Saver()

    with tf.Session() as sess:
        # Load weights from checkpoint.
        ckpts = tf.train.get_checkpoint_state(FLAGS.output_dir)
        ckpt = ckpts.model_checkpoint_path
        saver.restore(sess, ckpt)
        # Run on each line.
        with tf.gfile.Open(filename) as f:
            lines = f.readlines()
        results = []
        for line in lines:
            tab_split = line.split("\t")
            if len(tab_split) > 2:
                raise ValueError(
                    "Each line must have at most one tab separator.")
            if len(tab_split) == 1:
                targets = tab_split[0].strip()
            else:
                targets = tab_split[1].strip()
                inputs = tab_split[0].strip()
            # Run encoders and append EOS symbol.
            targets_numpy = encoders["targets"].encode(targets) + [
                text_encoder.EOS_ID
            ]
            if has_inputs:
                inputs_numpy = encoders["inputs"].encode(inputs) + [
                    text_encoder.EOS_ID
                ]
            # Prepare the feed.
            feed = {
                inputs_ph: inputs_numpy,
                targets_ph: targets_numpy
            } if has_inputs else {
                targets_ph: targets_numpy
            }
            # Get the score.
            np_loss = sess.run(losses["training"], feed)
            results.append(np_loss)
    return results
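
# A hedged usage sketch: assumes FLAGS.problem, FLAGS.model, FLAGS.data_dir and
# FLAGS.output_dir are already set, and that "dev.tsv" (a placeholder name)
# holds one "input<TAB>target" pair per line.
scores = score_file("dev.tsv")
for line_number, loss in enumerate(scores, start=1):
    tf.logging.info("line %d: loss %.4f", line_number, loss)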
def problem(name):
    """Look up a registered Problem by name."""
    return registry.problem(name)
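
# A hedged example of using the wrapper above; the problem name, data_dir and
# sample sentence are placeholders.
ende_problem = problem("translate_ende_wmt32k")
encoders = ende_problem.feature_encoders("/tmp/t2t_data")
input_ids = encoders["inputs"].encode("Hello world") + [text_encoder.EOS_ID]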
Example #8
    @classmethod
    def setUpClass(cls):
        tf.set_random_seed(1)
        cls.problem = registry.problem("test_problem")
        cls.data_dir = tempfile.gettempdir()
        cls.filepatterns = generate_test_data(cls.problem, cls.data_dir)