Example #1
    def __init__(self,
                 config: configure_finetuning.FinetuningConfig,
                 tasks,
                 pretraining_config=None):
        self._config = config
        self._tasks = tasks
        self._preprocessor = preprocessing.Preprocessor(config, self._tasks)

        is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
        tpu_cluster_resolver = None
        if config.use_tpu:
            tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
                zone=config.tpu_zone, project=config.gcp_project)
            print('Running on TPU ',
                  tpu_cluster_resolver.cluster_spec().as_dict()['worker'])

            if tpu_cluster_resolver:
                tf.config.experimental_connect_to_cluster(tpu_cluster_resolver)
                tf.tpu.experimental.initialize_tpu_system(tpu_cluster_resolver)
                strategy = tf.distribute.experimental.TPUStrategy(tpu_cluster_resolver)
            else:
                # Fall back to the default (no-op) strategy off-TPU.
                strategy = tf.distribute.get_strategy()
            print("REPLICAS: ", strategy.num_replicas_in_sync)

        tpu_config = tf.estimator.tpu.TPUConfig(
            iterations_per_loop=config.iterations_per_loop,
            num_shards=config.num_tpu_cores,
            per_host_input_for_training=is_per_host,
            tpu_job_name=config.tpu_job_name)
        run_config = tf.estimator.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            model_dir=config.model_dir,
            save_checkpoints_steps=config.save_checkpoints_steps,
            save_checkpoints_secs=None,
            tpu_config=tpu_config)

        if self._config.do_train:
            (self._train_input_fn,
             self.train_steps) = self._preprocessor.prepare_train()
        else:
            self._train_input_fn, self.train_steps = None, 0
        model_fn = model_fn_builder(config=config,
                                    tasks=self._tasks,
                                    num_train_steps=self.train_steps,
                                    pretraining_config=pretraining_config)
        self._estimator = tf.estimator.tpu.TPUEstimator(
            use_tpu=config.use_tpu,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=config.train_batch_size,
            eval_batch_size=config.eval_batch_size,
            predict_batch_size=config.predict_batch_size)
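These snippets assume TF 1.x (the tf.estimator.tpu API) with configure_finetuning, preprocessing, and model_fn_builder imported elsewhere in the file. As a minimal usage sketch, assuming the __init__ above belongs to a runner class, named ModelRunner here purely for illustration, training runs through the underlying TPUEstimator:

# Hypothetical driver; ModelRunner is an assumed name for the class that
# owns the __init__ above.
runner = ModelRunner(config, tasks)
if config.do_train:
    # TPUEstimator.train is the standard tf.estimator training entry point.
    runner._estimator.train(input_fn=runner._train_input_fn,
                            max_steps=runner.train_steps)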
Example #2
    def __init__(self, config: configure_finetuning.FinetuningConfig, tasks,
                 pretraining_config=None):
        self._config = config
        self._tasks = tasks
        self._preprocessor = preprocessing.Preprocessor(config, self._tasks)

        is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
        tpu_cluster_resolver = None
        if config.use_tpu and config.tpu_name:
            tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
                config.tpu_name, zone=config.tpu_zone, project=config.gcp_project)
        tpu_config = tf.estimator.tpu.TPUConfig(
            iterations_per_loop=config.iterations_per_loop,
            num_shards=config.num_tpu_cores,
            per_host_input_for_training=is_per_host,
            tpu_job_name=config.tpu_job_name)
        
        session_config = tf.ConfigProto(allow_soft_placement=True)
        run_config = tf.estimator.tpu.RunConfig(
            cluster=tpu_cluster_resolver,
            model_dir=config.model_dir,
            save_checkpoints_steps=config.save_checkpoints_steps,
            save_checkpoints_secs=None,
            tpu_config=tpu_config,
            session_config=session_config,
            keep_checkpoint_max=config.max_save)


        if self._config.do_train:
            (self._train_input_fn,
             self.train_steps) = self._preprocessor.prepare_train()
        else:
            self._train_input_fn, self.train_steps = None, 0
        
        model_fn = model_fn_builder(
            config=config,
            tasks=self._tasks,
            num_train_steps=self.train_steps,
            pretraining_config=pretraining_config)
        
        self._estimator = tf.estimator.tpu.TPUEstimator(
            use_tpu=config.use_tpu,
            model_fn=model_fn,
            config=run_config,
            train_batch_size=config.train_batch_size,
            eval_batch_size=config.eval_batch_size,
            predict_batch_size=config.predict_batch_size)
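This variant pins checkpointing to a step interval: tf.estimator.RunConfig rejects setting both save_checkpoints_steps and save_checkpoints_secs, hence the explicit save_checkpoints_secs=None, while keep_checkpoint_max=config.max_save caps how many checkpoints are retained. A standalone sketch of just those knobs (TF 1.x; path and values are illustrative):

import tensorflow.compat.v1 as tf

run_config = tf.estimator.RunConfig(
    model_dir="/tmp/finetune",       # illustrative path
    save_checkpoints_steps=1000,     # checkpoint every 1000 global steps
    save_checkpoints_secs=None,      # must stay None when steps is given
    keep_checkpoint_max=5)           # retain only the 5 newest checkpoints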
Example #3
  def __init__(self, config: configure_finetuning.FinetuningConfig, tasks, hvd,
               pretraining_config=None):
    self._config = config
    self._tasks = tasks
    self._preprocessor = preprocessing.Preprocessor(config, self._tasks)
    self._hooks = [hvd.BroadcastGlobalVariablesHook(0)]
    is_per_host = tf.estimator.tpu.InputPipelineConfig.PER_HOST_V2
    tpu_cluster_resolver = None
    if config.use_tpu and config.tpu_name:
      tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(
          config.tpu_name, zone=config.tpu_zone, project=config.gcp_project)

    session_config = tf.ConfigProto()
    session_config.gpu_options.visible_device_list = str(hvd.local_rank())

    tpu_config = tf.estimator.tpu.TPUConfig(
        iterations_per_loop=config.iterations_per_loop,
        num_shards=config.num_tpu_cores,
        per_host_input_for_training=is_per_host,
        tpu_job_name=config.tpu_job_name)
    run_config = tf.estimator.tpu.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=config.model_dir,
        save_checkpoints_steps=config.save_checkpoints_steps,
        session_config=session_config,
        save_checkpoints_secs=None,
        tpu_config=tpu_config)

    if self._config.do_train:
      (self._train_input_fn,
       self.train_steps) = self._preprocessor.prepare_train()
    else:
      self._train_input_fn, self.train_steps = None, 0
    model_fn = model_fn_builder(
        config=config,
        tasks=self._tasks,
        num_train_steps=self.train_steps,
        pretraining_config=pretraining_config)
    self._estimator = tf.estimator.tpu.TPUEstimator(
        use_tpu=config.use_tpu,
        model_fn=model_fn,
        config=run_config,
        train_batch_size=config.train_batch_size,
        eval_batch_size=config.eval_batch_size,
        predict_batch_size=config.predict_batch_size)
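Example #3 distributes across GPUs with Horovod instead of a tf.distribute strategy: each process is pinned to one GPU via visible_device_list, and the BroadcastGlobalVariablesHook(0) stored in self._hooks copies rank 0's initial weights to every worker when training starts. The snippet presupposes the usual Horovod bootstrap earlier in the program; a minimal sketch of that setup:

import horovod.tensorflow as hvd

hvd.init()  # one process per GPU, launched with horovodrun/mpirun
# hvd.local_rank() indexes this process's GPU on the host; the __init__
# above feeds it to session_config.gpu_options.visible_device_list so
# each process only sees its own device.
print("rank %d / size %d / local rank %d"
      % (hvd.rank(), hvd.size(), hvd.local_rank()))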
Example #4
    def __init__(self,
                 config: configure_finetuning.FinetuningConfig,
                 tasks,
                 pretraining_config=None):
        self._config = config
        self._tasks = tasks
        self._preprocessor = preprocessing.Preprocessor(config, self._tasks)

        gpus = utils.get_available_gpus()
        num_gpus = len(gpus)
        utils.log("Found {} gpus".format(num_gpus))

        if num_gpus == 1:
            session_config = tf.ConfigProto(
                log_device_placement=True,
                allow_soft_placement=True,
                gpu_options=tf.GPUOptions(allow_growth=True))

            run_config = tf.estimator.RunConfig(
                model_dir=config.model_dir,
                save_checkpoints_steps=config.save_checkpoints_steps,
                # save_checkpoints_secs=3600,
                # tf_random_seed=FLAGS.seed,
                session_config=session_config,
                # keep_checkpoint_max=0,
                log_step_count_steps=100)
        else:
            train_distribution_strategy = tf.distribute.MirroredStrategy(
                devices=None,
                cross_device_ops=tf.contrib.distribute.AllReduceCrossDeviceOps(
                    'nccl', num_packs=num_gpus))
            eval_distribution_strategy = tf.distribute.MirroredStrategy(
                devices=None)

            session_config = tf.ConfigProto(
                # log_device_placement=True,
                inter_op_parallelism_threads=0,
                intra_op_parallelism_threads=0,
                allow_soft_placement=True,
                gpu_options=tf.GPUOptions(allow_growth=True))

            run_config = tf.estimator.RunConfig(
                model_dir=config.model_dir,
                save_checkpoints_steps=config.save_checkpoints_steps,
                train_distribute=train_distribution_strategy,
                eval_distribute=eval_distribution_strategy,
                # save_checkpoints_secs=3600,
                # tf_random_seed=FLAGS.seed,
                session_config=session_config,
                # keep_checkpoint_max=0,
                log_step_count_steps=100)

        if self._config.do_train:
            (self._train_input_fn,
             self.train_steps) = self._preprocessor.prepare_train()
        else:
            self._train_input_fn, self.train_steps = None, 0

        model_fn = model_fn_builder(config=config,
                                    tasks=self._tasks,
                                    num_train_steps=self.train_steps,
                                    pretraining_config=pretraining_config)
        self._estimator = tf.estimator.Estimator(
            model_fn=model_fn,
            config=run_config,
            params={
                'train_batch_size': config.train_batch_size,
                'eval_batch_size': config.eval_batch_size
            })
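utils.get_available_gpus() and utils.log() come from a helper module outside this snippet; the code above only needs the former to return the list of GPU devices visible to TensorFlow. A common TF 1.x implementation, shown as a plausible sketch rather than the actual helper:

from tensorflow.python.client import device_lib

def get_available_gpus():
    """Return the names of all GPU devices visible to TensorFlow."""
    return [d.name for d in device_lib.list_local_devices()
            if d.device_type == "GPU"]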