Example #1
def dcgan_run_optimize():
    bench_total_start = time.time()
    _NUM_VIZ_IMAGES = dcgan_main._NUM_VIZ_IMAGES
    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    config = tf.compat.v1.estimator.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        tpu_config=tf.compat.v1.estimator.tpu.TPUConfig(
            num_shards=FLAGS.num_shards,
            iterations_per_loop=FLAGS.iterations_per_loop))

    # Get the generator and discriminator functions depending on which dataset
    # we want to train on.
    # model, dataset = {
    # 		'mnist': (mnist_model, mnist_input),
    # 		'cifar': (cifar_model, cifar_input),
    # }[FLAGS.dataset]
    model, dataset = {
        'mnist': (mnist_model, MNISTInputFunction),
        'cifar': (cifar_model, CIFARInputFunction),
    }[FLAGS.dataset]
    train_eval_input_fn = dataset(True, FLAGS.noise_dim)

    def unconditional_generator(noise, mode):
        """Generator with extra argument for tf.Estimator's `mode`."""
        is_training = (mode == tf.estimator.ModeKeys.TRAIN)
        return model.generator(noise, is_training=is_training)

    def unconditional_discriminator(images, unused_conditioning):
        """Discriminator that conforms to TF-GAN API."""
        return model.discriminator(images, is_training=True)

    # TPU-based estimator used for TRAIN and EVAL
    # TODO(joelshor): Add get_eval_metric_ops_fn.
    est = tfgan.estimator.TPUGANEstimator(
        generator_fn=unconditional_generator,
        discriminator_fn=unconditional_discriminator,
        generator_loss_fn=tfgan.losses.minimax_generator_loss,
        discriminator_loss_fn=tfgan.losses.minimax_discriminator_loss,
        generator_optimizer=tf.train.AdamOptimizer(FLAGS.learning_rate, 0.5),
        discriminator_optimizer=tf.train.AdamOptimizer(FLAGS.learning_rate,
                                                       0.5),
        joint_train=True,  # train G and D jointly instead of sequentially.
        eval_on_tpu=True,
        train_batch_size=FLAGS.batch_size,
        eval_batch_size=FLAGS.batch_size,
        predict_batch_size=_NUM_VIZ_IMAGES,
        use_tpu=FLAGS.use_tpu,
        config=config)

    # Get the tf.Estimator `input_fn` for training and evaluation.
    # train_eval_input_fn = functools.partial(input_fn, dataset=dataset)
    tf.gfile.MakeDirs(os.path.join(FLAGS.model_dir, 'generated_images'))

    current_step = estimator._load_global_step_from_checkpoint_dir(
        FLAGS.model_dir)  # pylint: disable=protected-access,line-too-long
    tf.logging.info('Starting training for %d steps, current step: %d' %
                    (FLAGS.train_steps, current_step))
    # while current_step < FLAGS.train_steps:
    #	 next_checkpoint = min(current_step + FLAGS.train_steps_per_eval,
    #												 FLAGS.train_steps)

    tpupoint = TPUPoint(
        estimator=est,
        gcp_project=FLAGS.gcp_project,
        tpu_zone=FLAGS.tpu_zone,
        tpu=FLAGS.tpu,
        logdir=FLAGS.model_dir,
        workers_list=None,
        num_tracing_attempts=3,
        include_dataset_ops=False,  # False for longer traces
        monitoring_level=1,  # 1 or 2 logging level
        num_queries=4)

    tpupoint.optimize_input_fn(train_eval_input_fn, blocking=True)

    bench_start = time.time()
    # tpupoint.Start()
    # est.train(input_fn=train_eval_input_fn, max_steps=FLAGS.train_steps)
    tpupoint.train(estimator=est,
                   input_fn=train_eval_input_fn,
                   max_steps=FLAGS.train_steps)
    # tpupoint.Stop()
    bench_elapsed = time.time() - bench_start
    bench_total_dur = time.time() - bench_total_start
    tf.logging.info("Train End-to-End: " + str(bench_elapsed) + " seconds")
    tf.logging.info("Total End-to-End: " + str(bench_total_dur) + " seconds")
    tpupoint.CleanUp()
Example #2
def resnet_run_optimize():
    bench_total_start = time.time()
    params = params_dict.ParamsDict(resnet_config.RESNET_CFG,
                                    resnet_config.RESNET_RESTRICTIONS)
    params = params_dict.override_params_dict(params,
                                              FLAGS.config_file,
                                              is_strict=True)
    params = params_dict.override_params_dict(params,
                                              FLAGS.params_override,
                                              is_strict=True)

    params = flags_to_params.override_params_from_input_flags(params, FLAGS)

    params.validate()
    params.lock()

    tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
        FLAGS.tpu if (FLAGS.tpu or params.use_tpu) else '',
        zone=FLAGS.tpu_zone,
        project=FLAGS.gcp_project)

    if params.use_async_checkpointing:
        save_checkpoints_steps = None
    else:
        save_checkpoints_steps = max(5000, params.iterations_per_loop)
    config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=FLAGS.model_dir,
        save_checkpoints_steps=save_checkpoints_steps,
        log_step_count_steps=FLAGS.log_step_count_steps,
        session_config=tf.ConfigProto(
            graph_options=tf.GraphOptions(
                rewrite_options=rewriter_config_pb2.RewriterConfig(
                    disable_meta_optimizer=True))),
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=params.iterations_per_loop,
            num_shards=params.num_cores,
            per_host_input_for_training=tf.contrib.tpu.InputPipelineConfig
            .PER_HOST_V2))  # pylint: disable=line-too-long

    resnet_classifier = tf.contrib.tpu.TPUEstimator(
        use_tpu=params.use_tpu,
        model_fn=resnet_model_fn,
        config=config,
        params=params.as_dict(),
        train_batch_size=params.train_batch_size,
        eval_batch_size=params.eval_batch_size,
        export_to_tpu=FLAGS.export_to_tpu)

    assert (params.precision == 'bfloat16' or params.precision
            == 'float32'), ('Invalid value for precision parameter; '
                            'must be bfloat16 or float32.')
    tf.logging.info('Precision: %s', params.precision)
    use_bfloat16 = params.precision == 'bfloat16'

    cifar10_train, cifar10_eval = [
        cifar10_input.CIFAR10Input(
            is_training=is_training,
            data_dir=FLAGS.data_dir,
            transpose_input=params.transpose_input,
            cache=params.use_cache and is_training,
            image_size=params.image_size,
            num_parallel_calls=params.num_parallel_calls,
            include_background_label=(params.num_label_classes == 1001),
            use_bfloat16=use_bfloat16) for is_training in [True, False]
    ]

    steps_per_epoch = params.num_train_images // params.train_batch_size
    eval_steps = params.num_eval_images // params.eval_batch_size

    hooks = []
    if params.use_async_checkpointing:
        hooks.append(
            async_checkpoint.AsyncCheckpointSaverHook(
                checkpoint_dir=FLAGS.model_dir,
                save_steps=max(5000, params.iterations_per_loop)))
    if FLAGS.profile_every_n_steps > 0:
        hooks.append(
            tpu_profiler_hook.TPUProfilerHook(
                save_steps=FLAGS.profile_every_n_steps,
                output_dir=FLAGS.model_dir,
                tpu=FLAGS.tpu))

    tpupoint = TPUPoint(
        estimator=resnet_classifier,
        gcp_project=FLAGS.gcp_project,
        tpu_zone=FLAGS.tpu_zone,
        tpu=FLAGS.tpu,
        logdir=FLAGS.model_dir,
        workers_list=None,
        num_tracing_attempts=3,
        include_dataset_ops=False,  # False for longer traces
        monitoring_level=1,  # 1 or 2 logging level
        num_queries=4)

    input_fn = tpupoint.optimize_input_fn(cifar10_train.input_fn,
                                          blocking=True)

    bench_start = time.time()
    tpupoint.train(estimator=resnet_classifier,
                   input_fn=input_fn,
                   max_steps=params.train_steps,
                   hooks=hooks)
    bench_elapsed = time.time() - bench_start
    bench_total_dur = time.time() - bench_total_start
    tf.logging.info("Train End-to-End: " + str(bench_elapsed) + " seconds")
    tf.logging.info("Total End-to-End: " + str(bench_total_dur) + " seconds")
Example #3
def bert_run_optimize():
    bench_total_start = time.time()
    tf.logging.set_verbosity(tf.logging.INFO)

    processors = {
        "cola": ColaProcessor,
        "mnli": MnliProcessor,
        "mrpc": MrpcProcessor,
        "xnli": XnliProcessor,
    }

    tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case,
                                                  FLAGS.init_checkpoint)

    # if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict:
    #   raise ValueError(
    #       "At least one of `do_train`, `do_eval` or `do_predict' must be True.")

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    if FLAGS.max_seq_length > bert_config.max_position_embeddings:
        raise ValueError(
            "Cannot use sequence length %d because the BERT model "
            "was only trained up to sequence length %d" %
            (FLAGS.max_seq_length, bert_config.max_position_embeddings))

    tf.gfile.MakeDirs(FLAGS.output_dir)

    task_name = FLAGS.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()

    label_list = processor.get_labels()

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    # if FLAGS.do_train:
    train_examples = processor.get_train_examples(FLAGS.data_dir)
    num_train_steps = int(
        len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs)
    num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

    model_fn = model_fn_builder(bert_config=bert_config,
                                num_labels=len(label_list),
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        eval_batch_size=FLAGS.eval_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    # if FLAGS.do_train:
    train_file = os.path.join(FLAGS.output_dir, "train.tf_record")
    file_based_convert_examples_to_features(train_examples, label_list,
                                            FLAGS.max_seq_length, tokenizer,
                                            train_file)
    tf.logging.info("***** Running training *****")
    tf.logging.info("  Num examples = %d", len(train_examples))
    tf.logging.info("  Batch size = %d", FLAGS.train_batch_size)
    tf.logging.info("  Num steps = %d", num_train_steps)
    train_input_fn = file_based_input_fn_builder(
        input_file=train_file,
        seq_length=FLAGS.max_seq_length,
        is_training=True,
        drop_remainder=True)

    tpupoint = TPUPoint(
        estimator=estimator,
        gcp_project=FLAGS.gcp_project,
        tpu_zone=FLAGS.tpu_zone,
        tpu=FLAGS.tpu_name,
        logdir=FLAGS.output_dir,
        workers_list=None,
        num_tracing_attempts=3,
        include_dataset_ops=False,  # False for longer traces
        monitoring_level=1,  # 1 or 2 logging level
        num_queries=4)

    tpupoint.optimize_input_fn(train_input_fn, blocking=True)
    bench_start = time.time()
    # tpupoint.Start()
    # estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
    tpupoint.train(estimator=estimator,
                   input_fn=train_input_fn,
                   max_steps=num_train_steps)
    # tpupoint.Stop()
    bench_elapsed = time.time() - bench_start
    bench_total_dur = time.time() - bench_total_start
    tf.logging.info("Train End-to-End: " + str(bench_elapsed) + " seconds")
    tf.logging.info("Total End-to-End: " + str(bench_total_dur) + " seconds")
    tpupoint.CleanUp()
Example #4
def qanet_run_optimize():
    bench_total_start = time.time()
    # tf.logging.set_verbosity(tf.logging.INFO)
    cfg = create_config(model_dir=FLAGS.model_dir)

    if FLAGS.tpu:
        cfg.tpu.name = FLAGS.tpu
        cfg.tpu.zone = FLAGS.tpu_zone
        cfg.tpu.gcp_project = FLAGS.gcp_project
        cfg.tpu.enable = True
    else:
        # Toggle TPU relevant settings
        if FLAGS.enable_tpu:
            cfg.tpu.enable = True
        else:
            cfg.tpu.enable = False
    # train_and_eval(cfg, do_eval=("eval" in FLAGS.mode))

    tf.logging.info("cfg.model_dir = " + cfg.model_dir)
    # Save out config to model directory
    # assert "train" in FLAGS.mode
    tf.gfile.MakeDirs(cfg.model_dir)
    with tf.gfile.GFile(os.path.join(cfg.model_dir, "config.json"), "w") as f:
        json.dump(cfg, f)

    if not cfg.dataset.num_repeats and not cfg.steps_per_epoch:
        raise ValueError("Must have a fixed num repeats or epoch step size.")

    # Construct inputs and estimator
    # train_input, eval_input = data.build_dataset( cfg.dataset, is_tpu=cfg.tpu.enable)
    train_input, eval_input = new_build_dataset(cfg.dataset,
                                                is_tpu=cfg.tpu.enable)
    estimator = model.get_estimator(**cfg)

    # if do_eval:
    # 	eval_metrics = None
    # 	for i in range(cfg.num_epochs):
    # 		tf.logging.info("Starting epoch %s/%s" % (i + 1, cfg.num_epochs))
    # 		train_metrics = estimator.train(
    # 				input_fn=train_input, steps=cfg.steps_per_epoch or None)
    # 		tf.logging.info(pprint.pformat(train_metrics))
    # 		eval_metrics = estimator.evaluate(input_fn=eval_input)
    # 		tf.logging.info(pprint.pformat(eval_metrics))
    # 		if report_fn:
    # 			report_fn(eval_metrics)
    # 	return eval_metrics
    # else:
    # 	for i in range(cfg.num_epochs):
    # 		tf.logging.info("Starting epoch %s/%s" % (i + 1, cfg.num_epochs))
    # 		train_metrics = estimator.train(
    # 				input_fn=train_input, steps=cfg.steps_per_epoch)
    # 		tf.logging.info(pprint.pformat(train_metrics))

    tpupoint = TPUPoint(
        estimator=estimator,
        gcp_project=FLAGS.gcp_project,
        tpu_zone=FLAGS.tpu_zone,
        tpu=FLAGS.tpu,
        logdir=FLAGS.model_dir,
        workers_list=None,
        num_tracing_attempts=3,
        include_dataset_ops=False,  # False for longer traces
        monitoring_level=1,  # 1 or 2 logging level
        num_queries=4)

    tpupoint.optimize_input_fn(train_input, blocking=True)

    tf.logging.info("Starting training for  %s steps" %
                    (cfg.steps_per_epoch * cfg.num_epochs))
    bench_start = time.time()
    # tpupoint.Start()
    # train_metrics = estimator.train( input_fn=train_input, steps=(cfg.steps_per_epoch * cfg.num_epochs))
    tpupoint.train(estimator=estimator,
                   input_fn=train_input,
                   steps=(cfg.steps_per_epoch * cfg.num_epochs))
    # tpupoint.Stop()
    bench_elapsed = time.time() - bench_start
    bench_total_dur = time.time() - bench_total_start
    # tf.logging.info(pprint.pformat(train_metrics))
    tf.logging.info("Train End-to-End: " + str(bench_elapsed) + " seconds")
    tf.logging.info("Total End-to-End: " + str(bench_total_dur) + " seconds")
    tpupoint.CleanUp()
Example #5
def bert_run_squad_optimize():
    bench_total_start = time.time()
    tf.logging.set_verbosity(tf.logging.INFO)

    bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file)

    validate_flags_or_throw(bert_config)

    tf.gfile.MakeDirs(FLAGS.output_dir)

    tokenizer = tokenization.FullTokenizer(vocab_file=FLAGS.vocab_file,
                                           do_lower_case=FLAGS.do_lower_case)

    tpu_cluster_resolver = None
    if FLAGS.use_tpu and FLAGS.tpu_name:
        tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver(
            FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project)

    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        master=FLAGS.master,
        model_dir=FLAGS.output_dir,
        save_checkpoints_steps=FLAGS.save_checkpoints_steps,
        tpu_config=tf.contrib.tpu.TPUConfig(
            iterations_per_loop=FLAGS.iterations_per_loop,
            num_shards=FLAGS.num_tpu_cores,
            per_host_input_for_training=is_per_host))

    train_examples = None
    num_train_steps = None
    num_warmup_steps = None
    if FLAGS.do_train:
        train_examples = read_squad_examples(input_file=FLAGS.train_file,
                                             is_training=True)
        num_train_steps = int(
            len(train_examples) / FLAGS.train_batch_size *
            FLAGS.num_train_epochs)
        num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion)

        # Pre-shuffle the input to avoid having to make a very large shuffle
        # buffer in the `input_fn`.
        rng = random.Random(12345)
        rng.shuffle(train_examples)

    model_fn = model_fn_builder(bert_config=bert_config,
                                init_checkpoint=FLAGS.init_checkpoint,
                                learning_rate=FLAGS.learning_rate,
                                num_train_steps=num_train_steps,
                                num_warmup_steps=num_warmup_steps,
                                use_tpu=FLAGS.use_tpu,
                                use_one_hot_embeddings=FLAGS.use_tpu)

    # If TPU is not available, this will fall back to normal Estimator on CPU
    # or GPU.
    estimator = tf.contrib.tpu.TPUEstimator(
        use_tpu=FLAGS.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=FLAGS.train_batch_size,
        predict_batch_size=FLAGS.predict_batch_size)

    # if FLAGS.do_train:
    # We write to a temporary file to avoid storing very large constant tensors
    # in memory.
    train_writer = FeatureWriter(filename=os.path.join(FLAGS.output_dir,
                                                       "train.tf_record"),
                                 is_training=True)
    convert_examples_to_features(examples=train_examples,
                                 tokenizer=tokenizer,
                                 max_seq_length=FLAGS.max_seq_length,
                                 doc_stride=FLAGS.doc_stride,
                                 max_query_length=FLAGS.max_query_length,
                                 is_training=True,
                                 output_fn=train_writer.process_feature)
    train_writer.close()

    tf.logging.info("***** Running training *****")
    tf.logging.info(" Num orig examples = %d", len(train_examples))
    tf.logging.info(" Num split examples = %d", train_writer.num_features)
    tf.logging.info(" Batch size = %d", FLAGS.train_batch_size)
    tf.logging.info(" Num steps = %d", num_train_steps)
    del train_examples

    tpupoint = TPUPoint(
        estimator=estimator,
        gcp_project=FLAGS.bench_gcp_project,
        tpu_zone=FLAGS.bench_tpu_zone,
        tpu=FLAGS.tpu_name,
        logdir=FLAGS.output_dir,
        workers_list=None,
        num_tracing_attempts=3,
        include_dataset_ops=False,  # False for longer traces
        monitoring_level=1,  # 1 or 2 logging level
        num_queries=4)

    train_input_fn = input_fn_builder(input_file=train_writer.filename,
                                      seq_length=FLAGS.max_seq_length,
                                      is_training=True,
                                      drop_remainder=True)

    tpupoint.optimize_input_fn(train_input_fn, blocking=True)

    bench_start = time.time()

    # estimator.train(input_fn=train_input_fn, max_steps=num_train_steps)
    tpupoint.train(estimator=estimator,
                   input_fn=train_input_fn,
                   max_steps=num_train_steps)

    bench_elapsed = time.time() - bench_start
    bench_total_dur = time.time() - bench_total_start
    tf.logging.info("Train End-to-End: " + str(bench_elapsed) + " seconds")
    tf.logging.info("Total End-to-End: " + str(bench_total_dur) + " seconds")
    tpupoint.CleanUp()
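
All five examples share the same TPUPoint integration pattern: construct a TPUPoint around an already-built TPUEstimator, call optimize_input_fn on the training input_fn, run training through tpupoint.train instead of estimator.train, and finish with CleanUp. The sketch below distills that pattern using only the TPUPoint calls shown above; the function name run_with_tpupoint, the flags object, and the train_steps argument are placeholders, and TPUPoint is assumed to be importable from the benchmark package (the import is not shown in the examples).

import time

import tensorflow as tf  # TF 1.x API, matching the examples above


def run_with_tpupoint(est, train_input_fn, train_steps, flags):
    """Minimal sketch of the TPUPoint pattern shared by the examples above."""
    bench_total_start = time.time()

    # Wrap the already-constructed TPUEstimator with a TPUPoint profiler.
    tpupoint = TPUPoint(  # assumed to be in scope, as in the examples above
        estimator=est,
        gcp_project=flags.gcp_project,
        tpu_zone=flags.tpu_zone,
        tpu=flags.tpu,
        logdir=flags.model_dir,
        workers_list=None,
        num_tracing_attempts=3,
        include_dataset_ops=False,  # False for longer traces
        monitoring_level=1,  # 1 or 2 logging level
        num_queries=4)

    # Analyze/optimize the input pipeline before training (blocking=True,
    # as in the examples).
    tpupoint.optimize_input_fn(train_input_fn, blocking=True)

    # Train through TPUPoint instead of calling est.train() directly.
    bench_start = time.time()
    tpupoint.train(estimator=est,
                   input_fn=train_input_fn,
                   max_steps=train_steps)
    bench_elapsed = time.time() - bench_start
    bench_total_dur = time.time() - bench_total_start

    tf.logging.info("Train End-to-End: %s seconds" % bench_elapsed)
    tf.logging.info("Total End-to-End: %s seconds" % bench_total_dur)
    tpupoint.CleanUp()

Example #2 differs only in that it passes the return value of optimize_input_fn to tpupoint.train as the input_fn; the other examples keep the original input_fn.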