Example #1
    def _train(self):
        tf.disable_eager_execution()
        ps_tasks = 0
        worker_replicas = 1
        worker_job_name = 'lonely_worker'
        task = 0
        is_chief = True
        master = ''
        graph_rewriter_fn = None
        # load and read the config file
        configs = create_configs_from_pipeline_proto(self.pipeline)
        model_config = configs['model']
        train_config = configs['train_config']
        input_config = configs['train_input_config']
        # create the TF Object Detection API model from the config parameters
        model_fn = functools.partial(model_builder.build, model_config=model_config, is_training=True)

        def get_next(config):
            return dataset_builder.make_initializable_iterator(dataset_builder.build(config)).get_next()

        create_input_dict_fn = functools.partial(get_next, input_config)
        if 'graph_rewriter_config' in configs:
            graph_rewriter_fn = graph_rewriter_builder.build(configs['graph_rewriter_config'], is_training=True)
        # training the model with the new parameters
        trainer.train(create_input_dict_fn, model_fn, train_config, master, task, 1, worker_replicas, False, ps_tasks,
                      worker_job_name, is_chief, str(self._out_folder), graph_hook_fn=graph_rewriter_fn)
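The `self.pipeline` attribute above is expected to be a `pipeline_pb2.TrainEvalPipelineConfig` proto, since it is handed to `create_configs_from_pipeline_proto`. A minimal sketch of how such an object could be prepared before calling `_train` (the config path is hypothetical):

from google.protobuf import text_format
from object_detection.protos import pipeline_pb2

# Parse a text-format pipeline config into the proto that _train consumes.
pipeline = pipeline_pb2.TrainEvalPipelineConfig()
with open('pipeline.config') as f:  # hypothetical path
    text_format.Merge(f.read(), pipeline)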
Example #2
def main(_):
    assert FLAGS.train_dir, '`train_dir` is missing.'
    if FLAGS.pipeline_config_path:
        model_config, train_config, input_config = (
            get_configs_from_pipeline_file())
    else:
        model_config, train_config, input_config = (
            get_configs_from_multiple_files())

    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=True)

    create_input_dict_fn = functools.partial(input_reader_builder.build,
                                             input_config)

    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    cluster_data = env.get('cluster', None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    task_data = env.get('task', None) or {'type': 'master', 'index': 0}
    task_info = type('TaskSpec', (object, ), task_data)

    # Parameters for a single worker.
    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''

    if cluster_data and 'worker' in cluster_data:
        # Number of total worker replicas include "worker"s and the "master".
        worker_replicas = len(cluster_data['worker']) + 1
    if cluster_data and 'ps' in cluster_data:
        ps_tasks = len(cluster_data['ps'])

    if worker_replicas > 1 and ps_tasks < 1:
        raise ValueError(
            'At least 1 ps task is needed for distributed training.')

    if worker_replicas >= 1 and ps_tasks > 0:
        # Set up distributed training.
        server = tf.train.Server(tf.train.ClusterSpec(cluster),
                                 protocol='grpc',
                                 job_name=task_info.type,
                                 task_index=task_info.index)
        if task_info.type == 'ps':
            server.join()
            return

        worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
        task = task_info.index
        is_chief = (task_info.type == 'master')
        master = server.target

    trainer.train(create_input_dict_fn, model_fn, train_config, master, task,
                  FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu,
                  ps_tasks, worker_job_name, is_chief, FLAGS.train_dir)
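For reference, the `TF_CONFIG` environment variable parsed above is a JSON document with `cluster` and `task` sections. A minimal sketch with illustrative (not real) hosts, showing how the dynamically created `TaskSpec` type exposes the task fields as class attributes:

import json
import os

os.environ['TF_CONFIG'] = json.dumps({
    'cluster': {
        'master': ['host0:2222'],
        'worker': ['host1:2222', 'host2:2222'],
        'ps': ['host3:2222'],
    },
    'task': {'type': 'worker', 'index': 1},
})

env = json.loads(os.environ.get('TF_CONFIG', '{}'))
# type() with a dict turns each key into a class attribute.
task_info = type('TaskSpec', (object,), env['task'])
print(task_info.type, task_info.index)  # -> worker 1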
Example #3
def train_car(train_dir="models/faster_car_models",
              pipeline_config_path="pipeline/faster_rcnn_resnet101_car.config",
              num_clones=1,
              clone_on_cpu=False):

    if pipeline_config_path:
        configs = config_util.get_configs_from_pipeline_file(
            pipeline_config_path)
        if not os.path.exists(train_dir):
            os.makedirs(train_dir)
        tf.gfile.Copy(pipeline_config_path,
                      os.path.join(train_dir, 'pipeline.config'),
                      overwrite=True)

    model_config = configs['model']
    train_config = configs['train_config']
    input_config = configs['train_input_config']

    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=True)

    def get_next(config):
        return dataset_builder.make_initializable_iterator(
            dataset_builder.build(config)).get_next()

    create_input_dict_fn = functools.partial(get_next, input_config)

    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    cluster_data = env.get('cluster', None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    task_data = env.get('task', None) or {'type': 'master', 'index': 0}
    task_info = type('TaskSpec', (object, ), task_data)

    # Parameters for a single worker.
    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''
    graph_rewriter_fn = None

    trainer.train(create_input_dict_fn,
                  model_fn,
                  train_config,
                  master,
                  task,
                  num_clones,
                  worker_replicas,
                  clone_on_cpu,
                  ps_tasks,
                  worker_job_name,
                  is_chief,
                  train_dir,
                  graph_hook_fn=graph_rewriter_fn)
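A hypothetical invocation of `train_car`, overriding the defaults (the paths are illustrative):

train_car(train_dir="models/faster_car_models",
          pipeline_config_path="pipeline/faster_rcnn_resnet101_car.config",
          num_clones=2,      # e.g. two GPUs on one machine
          clone_on_cpu=False)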
Example #4
def main(_):
    assert FLAGS.train_dir, '`train_dir` is missing.'
    tf.gfile.MakeDirs(FLAGS.train_dir)
    assert FLAGS.pipeline_config_path, '`pipeline_config_path` is missing.'
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)
    tf.gfile.Copy(FLAGS.pipeline_config_path,
                  os.path.join(FLAGS.train_dir, 'pipeline.config'),
                  overwrite=True)

    model_config = configs['model']
    train_config = configs['train_config']
    input_config = configs['train_input_config']

    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=True)

    def get_next(config):
        return dataset_builder.make_initializable_iterator(
            dataset_builder.build(config)).get_next()

    create_input_dict_fn = functools.partial(get_next, input_config)

    # Parameters for a single worker.
    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''

    num_clones = 1
    clone_on_cpu = False

    graph_rewriter_fn = None
    if 'graph_rewriter_config' in configs:
        graph_rewriter_fn = graph_rewriter_builder.build(
            configs['graph_rewriter_config'], is_training=True)

    trainer.train(create_input_dict_fn,
                  model_fn,
                  train_config,
                  master,
                  task,
                  num_clones,
                  worker_replicas,
                  clone_on_cpu,
                  ps_tasks,
                  worker_job_name,
                  is_chief,
                  FLAGS.train_dir,
                  graph_hook_fn=graph_rewriter_fn)
Example #5
    def test_configure_trainer_with_multiclass_scores_and_train_two_steps(
            self):
        train_config_text_proto = """
    optimizer {
      adam_optimizer {
        learning_rate {
          constant_learning_rate {
            learning_rate: 0.01
          }
        }
      }
    }
    data_augmentation_options {
      random_adjust_brightness {
        max_delta: 0.2
      }
    }
    data_augmentation_options {
      random_adjust_contrast {
        min_delta: 0.7
        max_delta: 1.1
      }
    }
    num_steps: 2
    use_multiclass_scores: true
    """
        train_config = train_pb2.TrainConfig()
        text_format.Merge(train_config_text_proto, train_config)

        train_dir = self.get_temp_dir()

        trainer.train(create_tensor_dict_fn=get_input_function,
                      create_model_fn=FakeDetectionModel,
                      train_config=train_config,
                      master='',
                      task=0,
                      num_clones=1,
                      worker_replicas=1,
                      clone_on_cpu=True,
                      ps_tasks=0,
                      worker_job_name='worker',
                      is_chief=True,
                      train_dir=train_dir)
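The test relies on `text_format.Merge` to populate the `TrainConfig` proto from the text proto above. A minimal standalone sketch of that step, using two of the fields shown in the config:

from google.protobuf import text_format
from object_detection.protos import train_pb2

config = train_pb2.TrainConfig()
text_format.Merge('num_steps: 2\nuse_multiclass_scores: true', config)
assert config.num_steps == 2
assert config.use_multiclass_scores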
Example #7
def main(_):
    assert FLAGS.train_dir, '`train_dir` is missing.'
    if FLAGS.task == 0: tf.gfile.MakeDirs(FLAGS.train_dir)
    if FLAGS.pipeline_config_path:
        configs = config_util.get_configs_from_pipeline_file(
            FLAGS.pipeline_config_path)
        if FLAGS.task == 0:
            tf.gfile.Copy(FLAGS.pipeline_config_path,
                          os.path.join(FLAGS.train_dir, 'pipeline.config'),
                          overwrite=True)
    else:
        configs = config_util.get_configs_from_multiple_files(
            model_config_path=FLAGS.model_config_path,
            train_config_path=FLAGS.train_config_path,
            train_input_config_path=FLAGS.input_config_path)
        if FLAGS.task == 0:
            for name, config in [('model.config', FLAGS.model_config_path),
                                 ('train.config', FLAGS.train_config_path),
                                 ('input.config', FLAGS.input_config_path)]:
                print(FLAGS.train_dir, name)
                tf.gfile.Copy(config, os.path.join(FLAGS.train_dir, name), overwrite=True)

    model_config = configs['model']
    train_config = configs['train_config']
    input_config = configs['train_input_config']

    model_fn = functools.partial(
        model_builder.build,
        model_config=model_config,
        is_training=True)

    def get_next(config):
        return dataset_builder.make_initializable_iterator(
            dataset_builder.build(config)).get_next()

    create_input_dict_fn = functools.partial(get_next, input_config)

    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    cluster_data = env.get('cluster', None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    task_data = env.get('task', None) or {'type': 'master', 'index': 0}
    task_info = type('TaskSpec', (object,), task_data)

    # Parameters for a single worker.
    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''

    if cluster_data and 'worker' in cluster_data:
        # Number of total worker replicas include "worker"s and the "master".
        worker_replicas = len(cluster_data['worker']) + 1
    if cluster_data and 'ps' in cluster_data:
        ps_tasks = len(cluster_data['ps'])

    if worker_replicas > 1 and ps_tasks < 1:
        raise ValueError('At least 1 ps task is needed for distributed training.')

    if worker_replicas >= 1 and ps_tasks > 0:
        # Set up distributed training.
        server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc',
                                 job_name=task_info.type,
                                 task_index=task_info.index)
        if task_info.type == 'ps':
            server.join()
            return

        worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
        task = task_info.index
        is_chief = (task_info.type == 'master')
        master = server.target

    graph_rewriter_fn = None
    if 'graph_rewriter_config' in configs:
        graph_rewriter_fn = graph_rewriter_builder.build(
            configs['graph_rewriter_config'], is_training=True)

    trainer.train(
        create_input_dict_fn,
        model_fn,
        train_config,
        master,
        task,
        FLAGS.num_clones,
        worker_replicas,
        FLAGS.clone_on_cpu,
        ps_tasks,
        worker_job_name,
        is_chief,
        FLAGS.train_dir,
        graph_hook_fn=graph_rewriter_fn)
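The `worker_job_name` and `ps_tasks` values computed above drive device placement inside `trainer.train`. A sketch of the underlying TF1 pattern (an assumption about the trainer's internals based on the standard `replica_device_setter` idiom, not code taken from the source above):

import tensorflow as tf

# Variables go to the parameter servers; ops stay on this worker.
device_fn = tf.train.replica_device_setter(
    ps_tasks=1, worker_device='/job:worker/task:0')
with tf.device(device_fn):
    global_step = tf.Variable(0, trainable=False, name='global_step')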
Example #8
def main(_):
    print("starting program . . .")

    # show info to std out during the training process
    tf.logging.set_verbosity(tf.logging.INFO)

    if not checkIfNecessaryPathsAndFilesExist():
        return
    # end if

    configs = config_util.get_configs_from_pipeline_file(PIPELINE_CONFIG_PATH)
    tf.gfile.Copy(PIPELINE_CONFIG_PATH,
                  os.path.join(TRAINING_DATA_DIR, 'pipeline.config'),
                  overwrite=True)

    model_config = configs['model']
    train_config = configs['train_config']
    input_config = configs['train_input_config']

    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=True)

    # ToDo: this nested function seems odd, factor this out eventually ??
    # nested function
    def get_next(config):
        return dataset_builder.make_initializable_iterator(
            dataset_builder.build(config)).get_next()

    # end nested function

    create_input_dict_fn = functools.partial(get_next, input_config)

    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    cluster_data = env.get('cluster', None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    task_data = env.get('task', None) or {'type': 'master', 'index': 0}
    task_info = type('TaskSpec', (object, ), task_data)

    # parameters for a single worker
    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''

    if cluster_data and 'worker' in cluster_data:
        # number of total worker replicas include "worker"s and the "master".
        worker_replicas = len(cluster_data['worker']) + 1
    # end if

    if cluster_data and 'ps' in cluster_data:
        ps_tasks = len(cluster_data['ps'])
    # end if

    if worker_replicas > 1 and ps_tasks < 1:
        raise ValueError(
            'At least 1 ps task is needed for distributed training.')
    # end if

    if worker_replicas >= 1 and ps_tasks > 0:
        # set up distributed training
        server = tf.train.Server(tf.train.ClusterSpec(cluster),
                                 protocol='grpc',
                                 job_name=task_info.type,
                                 task_index=task_info.index)
        if task_info.type == 'ps':
            server.join()
            return
        # end if

        worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
        task = task_info.index
        is_chief = (task_info.type == 'master')
        master = server.target
    # end if

    trainer.train(create_input_dict_fn, model_fn, train_config, master, task,
                  NUM_CLONES, worker_replicas, CLONE_ON_CPU, ps_tasks,
                  worker_job_name, is_chief, TRAINING_DATA_DIR)
Example #10
File: train.py  Project: ALISCIFP/models
def main(_):
  assert FLAGS.train_dir, '`train_dir` is missing.'
  if FLAGS.task == 0: tf.gfile.MakeDirs(FLAGS.train_dir)
  if FLAGS.pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)
    if FLAGS.task == 0:
      tf.gfile.Copy(FLAGS.pipeline_config_path,
                    os.path.join(FLAGS.train_dir, 'pipeline.config'),
                    overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=FLAGS.model_config_path,
        train_config_path=FLAGS.train_config_path,
        train_input_config_path=FLAGS.input_config_path)
    if FLAGS.task == 0:
      for name, config in [('model.config', FLAGS.model_config_path),
                           ('train.config', FLAGS.train_config_path),
                           ('input.config', FLAGS.input_config_path)]:
        tf.gfile.Copy(config, os.path.join(FLAGS.train_dir, name),
                      overwrite=True)

  model_config = configs['model']
  train_config = configs['train_config']
  input_config = configs['train_input_config']

  model_fn = functools.partial(
      model_builder.build,
      model_config=model_config,
      is_training=True)

  def get_next(config):
    return dataset_builder.make_initializable_iterator(
        dataset_builder.build(config)).get_next()

  create_input_dict_fn = functools.partial(get_next, input_config)

  env = json.loads(os.environ.get('TF_CONFIG', '{}'))
  cluster_data = env.get('cluster', None)
  cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
  task_data = env.get('task', None) or {'type': 'master', 'index': 0}
  task_info = type('TaskSpec', (object,), task_data)

  # Parameters for a single worker.
  ps_tasks = 0
  worker_replicas = 1
  worker_job_name = 'lonely_worker'
  task = 0
  is_chief = True
  master = ''

  if cluster_data and 'worker' in cluster_data:
    # Number of total worker replicas include "worker"s and the "master".
    worker_replicas = len(cluster_data['worker']) + 1
  if cluster_data and 'ps' in cluster_data:
    ps_tasks = len(cluster_data['ps'])

  if worker_replicas > 1 and ps_tasks < 1:
    raise ValueError('At least 1 ps task is needed for distributed training.')

  if worker_replicas >= 1 and ps_tasks > 0:
    # Set up distributed training.
    server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc',
                             job_name=task_info.type,
                             task_index=task_info.index)
    if task_info.type == 'ps':
      server.join()
      return

    worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
    task = task_info.index
    is_chief = (task_info.type == 'master')
    master = server.target

  graph_rewriter_fn = None
  if 'graph_rewriter_config' in configs:
    graph_rewriter_fn = graph_rewriter_builder.build(
        configs['graph_rewriter_config'], is_training=True)

  trainer.train(
      create_input_dict_fn,
      model_fn,
      train_config,
      master,
      task,
      FLAGS.num_clones,
      worker_replicas,
      FLAGS.clone_on_cpu,
      ps_tasks,
      worker_job_name,
      is_chief,
      FLAGS.train_dir,
      graph_hook_fn=graph_rewriter_fn)
Example #11
def main(_):
    assert FLAGS.train_dir, '`train_dir` is missing.'
    if FLAGS.task == 0: tf.gfile.MakeDirs(FLAGS.train_dir)  # create the directory with tf.gfile
    if FLAGS.pipeline_config_path:
        configs = config_util.get_configs_from_pipeline_file(
            FLAGS.pipeline_config_path
        )  # read the pipeline config into a dict holding the `model`,
        # `train_config`, `train_input_config`, `eval_config`, and
        # `eval_input_config` sections
        if FLAGS.task == 0:
            tf.gfile.Copy(
                FLAGS.pipeline_config_path,
                os.path.join(FLAGS.train_dir, 'pipeline.config'),
                overwrite=True
            )  # copy the pipeline config into train_dir as pipeline.config
    else:
        configs = config_util.get_configs_from_multiple_files(
            model_config_path=FLAGS.model_config_path,
            train_config_path=FLAGS.train_config_path,
            train_input_config_path=FLAGS.input_config_path
        )  # read the model, train, and input configs from their separate paths
        if FLAGS.task == 0:
            for name, config in [('model.config', FLAGS.model_config_path),
                                 ('train.config', FLAGS.train_config_path),
                                 ('input.config', FLAGS.input_config_path)]:
                tf.gfile.Copy(config,
                              os.path.join(FLAGS.train_dir, name),
                              overwrite=True)

    model_config = configs['model']
    train_config = configs['train_config']
    input_config = configs['train_input_config']
    """"
  以下这行代码为核心代码,通过传入部分所需要的参数并且 “重新定义” 函数名称。这样简化函数,更少更灵活的函数参数调用。 
  通过functools.partial函数对model_builder.build函数赋予默认值,该目录下有一个model_builder模块,包含了生成网络模型的代码,
  包含ssd,fast_rcnn等众多模型代码,部分代码如下所示
  def build(model_config, is_training):
      if not isinstance(model_config, model_pb2.DetectionModel):
          raise ValueError('model_config not of type model_pb2.DetectionModel.')
      # 获取配置中的模型种类
      meta_architecture = model_config.WhichOneof('model')
      # 进行具体加载
      if meta_architecture == 'ssd':
          return _build_ssd_model(model_config.ssd, is_training)
      if meta_architecture == 'faster_rcnn':
          return _build_faster_rcnn_model(model_config.faster_rcnn, is_training)
      raise ValueError('Unknown meta architecture: {}'.format(meta_architecture))
      以'faster_rcnn模型为例子,进入_build_faster_rcnn_model(仍在model_builder.py文件中),该类中定义了fast_rcnn所有的参数
      之后说明每一个子模型的构建,比如image_resizer_builder的构建
      """

    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=True)

    # parameter configuration for the second stage
    def get_next(config):
        return dataset_builder.make_initializable_iterator(
            dataset_builder.build(config)).get_next()

    create_input_dict_fn = functools.partial(get_next, input_config)
    # parse the TF_CONFIG JSON object
    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    cluster_data = env.get('cluster', None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    task_data = env.get('task', None) or {'type': 'master', 'index': 0}
    task_info = type('TaskSpec', (object, ), task_data)

    # Parameters for a single worker.
    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''

    if cluster_data and 'worker' in cluster_data:
        # Number of total worker replicas include "worker"s and the "master".
        worker_replicas = len(cluster_data['worker']) + 1
    if cluster_data and 'ps' in cluster_data:
        ps_tasks = len(cluster_data['ps'])

    if worker_replicas > 1 and ps_tasks < 1:
        raise ValueError(
            'At least 1 ps task is needed for distributed training.')

    if worker_replicas >= 1 and ps_tasks > 0:
        # Set up distributed training.
        server = tf.train.Server(tf.train.ClusterSpec(cluster),
                                 protocol='grpc',
                                 job_name=task_info.type,
                                 task_index=task_info.index)
        if task_info.type == 'ps':
            server.join()
            return

        worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
        task = task_info.index
        is_chief = (task_info.type == 'master')
        master = server.target

    graph_rewriter_fn = None
    if 'graph_rewriter_config' in configs:
        graph_rewriter_fn = graph_rewriter_builder.build(
            configs['graph_rewriter_config'], is_training=True)

    trainer.train(create_input_dict_fn,
                  model_fn,
                  train_config,
                  master,
                  task,
                  FLAGS.num_clones,
                  worker_replicas,
                  FLAGS.clone_on_cpu,
                  ps_tasks,
                  worker_job_name,
                  is_chief,
                  FLAGS.train_dir,
                  graph_hook_fn=graph_rewriter_fn)
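As the docstring above explains, `functools.partial` pre-binds arguments so `model_fn` can later be called with no arguments. A minimal self-contained demonstration of the pattern (with a toy `build` function, not the real `model_builder.build`):

import functools

def build(model_config, is_training):
    return model_config, is_training

# Both arguments are pre-bound, so the partial takes no arguments.
model_fn = functools.partial(build, model_config={'meta': 'ssd'}, is_training=True)
print(model_fn())  # -> ({'meta': 'ssd'}, True)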
Example #12
def main():
    configs = config_util.get_configs_from_pipeline_file(CONFIG_PATH)

    model_config = configs['model']
    train_config = configs['train_config']
    input_config = configs['train_input_config']

    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=True)

    def get_next(config):
        return dataset_builder.make_initializable_iterator(
            dataset_builder.build(config)).get_next()

    create_input_dict_fn = functools.partial(get_next, input_config)

    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    cluster_data = env.get('cluster', None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    task_data = env.get('task', None) or {'type': 'master', 'index': 0}
    task_info = type('TaskSpec', (object, ), task_data)

    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''

    if cluster_data and 'worker' in cluster_data:
        worker_replicas = len(cluster_data['worker']) + 1
    if cluster_data and 'ps' in cluster_data:
        ps_tasks = len(cluster_data['ps'])

    if worker_replicas > 1 and ps_tasks < 1:
        raise ValueError(
            'At least 1 ps task is needed for distributed training.')

    if worker_replicas >= 1 and ps_tasks > 0:
        server = tf.train.Server(tf.train.ClusterSpec(cluster),
                                 protocol='grpc',
                                 job_name=task_info.type,
                                 task_index=task_info.index)
        if task_info.type == 'ps':
            server.join()
            return

        worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
        task = task_info.index
        is_chief = (task_info.type == 'master')
        master = server.target

    graph_rewriter_fn = None
    if 'graph_rewriter_config' in configs:
        graph_rewriter_fn = graph_rewriter_builder.build(
            configs['graph_rewriter_config'], is_training=True)

    trainer.train(create_input_dict_fn,
                  model_fn,
                  train_config,
                  master,
                  task,
                  NUM_CLONE,
                  worker_replicas,
                  CLONE_ON_CPU,
                  ps_tasks,
                  worker_job_name,
                  is_chief,
                  TRAINING_DIR,
                  graph_hook_fn=graph_rewriter_fn)
Example #13
  if task_info.type == 'ps':
    server.join()
    #return

  worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
  task = task_info.index
  is_chief = (task_info.type == 'master')
  master = server.target

graph_rewriter_fn = None
if 'graph_rewriter_config' in configs:
  graph_rewriter_fn = graph_rewriter_builder.build(
      configs['graph_rewriter_config'], is_training=True)

trainer.train(
    create_input_dict_fn,
    model_fn,
    train_config,
    master,
    task,
    FLAGS.num_clones,
    worker_replicas,
    FLAGS.clone_on_cpu,
    ps_tasks,
    worker_job_name,
    is_chief,
    train_dir,
    graph_hook_fn=graph_rewriter_fn)

Example #14
def train_tensorflow_object_detection_api(base_config_path, to_save_path,
                                          dataset_path, num_steps):

    os.makedirs(to_save_path, exist_ok=True)

    if len(os.listdir(to_save_path)) == 0:

        do_xml_to_csv(to_save_path, dataset_path)
        generate_config(base_config_path, to_save_path, num_steps)
        generate_tfrecords(base_config_path, to_save_path, dataset_path)

    checkpoint_path = "{}/training".format(to_save_path)
    config_file_path = "{}/config/faster_rcnn_inception_v2_pets.config".format(
        to_save_path)

    tf.logging.set_verbosity(tf.logging.INFO)
    flags = tf.app.flags
    flags.DEFINE_string('f', '', 'kernel')
    flags.DEFINE_string('master', '', 'Name of the TensorFlow master to use.')
    flags.DEFINE_integer('task', 0, 'task id')
    flags.DEFINE_integer('num_clones', 1,
                         'Number of clones to deploy per worker.')
    flags.DEFINE_boolean(
        'clone_on_cpu', False,
        'Force clones to be deployed on CPU.  Note that even if '
        'set to False (allowing ops to run on gpu), some ops may '
        'still be run on the CPU if they have no GPU kernel.')
    flags.DEFINE_integer('worker_replicas', 1, 'Number of worker+trainer '
                         'replicas.')
    flags.DEFINE_integer(
        'ps_tasks', 0,
        'Number of parameter server tasks. If None, does not use '
        'a parameter server.')
    flags.DEFINE_string(
        'train_dir', '{}'.format(checkpoint_path),
        'Directory to save the checkpoints and training summaries.')
    flags.DEFINE_string(
        'pipeline_config_path', '{}'.format(config_file_path),
        'Path to a pipeline_pb2.TrainEvalPipelineConfig config '
        'file. If provided, other configs are ignored')

    flags.DEFINE_string('train_config_path', '',
                        'Path to a train_pb2.TrainConfig config file.')
    flags.DEFINE_string(
        'input_config_path', '',
        'Path to an input_reader_pb2.InputReader config file.')
    flags.DEFINE_string('model_config_path', '',
                        'Path to a model_pb2.DetectionModel config file.')

    FLAGS = flags.FLAGS

    assert FLAGS.train_dir, '`train_dir` is missing.'
    if FLAGS.task == 0: tf.gfile.MakeDirs(FLAGS.train_dir)
    if FLAGS.pipeline_config_path:
        configs = config_util.get_configs_from_pipeline_file(
            FLAGS.pipeline_config_path)
        if FLAGS.task == 0:
            tf.gfile.Copy(FLAGS.pipeline_config_path,
                          os.path.join(FLAGS.train_dir, 'pipeline.config'),
                          overwrite=True)
    else:
        configs = config_util.get_configs_from_multiple_files(
            model_config_path=FLAGS.model_config_path,
            train_config_path=FLAGS.train_config_path,
            train_input_config_path=FLAGS.input_config_path)
        if FLAGS.task == 0:
            for name, config in [('model.config', FLAGS.model_config_path),
                                 ('train.config', FLAGS.train_config_path),
                                 ('input.config', FLAGS.input_config_path)]:
                tf.gfile.Copy(config,
                              os.path.join(FLAGS.train_dir, name),
                              overwrite=True)

    model_config = configs['model']
    train_config = configs['train_config']
    input_config = configs['train_input_config']

    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=True)

    def get_next(config):
        return dataset_builder.make_initializable_iterator(
            dataset_builder.build(config)).get_next()

    create_input_dict_fn = functools.partial(get_next, input_config)

    env = json.loads(os.environ.get('TF_CONFIG', '{}'))

    cluster_data = env.get('cluster', None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    task_data = env.get('task', None) or {'type': 'master', 'index': 0}
    task_info = type('TaskSpec', (object, ), task_data)

    # Parameters for a single worker.
    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''

    if cluster_data and 'worker' in cluster_data:
        # Number of total worker replicas include "worker"s and the "master".
        worker_replicas = len(cluster_data['worker']) + 1
    if cluster_data and 'ps' in cluster_data:
        ps_tasks = len(cluster_data['ps'])

    if worker_replicas > 1 and ps_tasks < 1:
        raise ValueError(
            'At least 1 ps task is needed for distributed training.')

    if worker_replicas >= 1 and ps_tasks > 0:
        # Set up distributed training.
        server = tf.train.Server(tf.train.ClusterSpec(cluster),
                                 protocol='grpc',
                                 job_name=task_info.type,
                                 task_index=task_info.index)
        if task_info.type == 'ps':
            server.join()
            return

        worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
        task = task_info.index
        is_chief = (task_info.type == 'master')
        master = server.target

    graph_rewriter_fn = None

    if 'graph_rewriter_config' in configs:
        graph_rewriter_fn = graph_rewriter_builder.build(
            configs['graph_rewriter_config'], is_training=True)

    # get TF logger
    log = logging.getLogger('tensorflow')
    log.setLevel(logging.INFO)
    # create formatter and add it to the handlers
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    # create file handler which logs even debug messages
    fh = logging.FileHandler(
        os.path.join(to_save_path, 'training', 'training.log'))
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)
    log.addHandler(fh)

    trainer.train(create_input_dict_fn,
                  model_fn,
                  train_config,
                  master,
                  task,
                  FLAGS.num_clones,
                  worker_replicas,
                  FLAGS.clone_on_cpu,
                  ps_tasks,
                  worker_job_name,
                  is_chief,
                  FLAGS.train_dir,
                  graph_hook_fn=graph_rewriter_fn)


# # tf.app.run(main=train_tensorflow_object_detection_api)
# base_config_path = '/home/aniruddh/lincode/product/livis-develop_v2/gpu_q/tf_training/'
# to_save_path = '/home/aniruddh/lincode/product/livis-develop_v2/experiments/tensorflow/t2/'
# dataset_path = '/home/aniruddh/lincode/product/images/'
# num_steps = 15000
# train_tensorflow_object_detection_api(base_config_path, to_save_path, dataset_path, num_steps)

# os.makedirs(to_save_path, exist_ok=True)

# do_xml_to_csv(to_save_path, dataset_path)
# config_path = generate_config(base_config_path, to_save_path, num_steps)
# generate_tfrecords(base_config_path, to_save_path, dataset_path)
# train_tf_model(base_config_path, to_save_path, dataset_path, num_steps)
Example #15
def main(_):
    assert FLAGS.train_dir, "`train_dir` is missing."
    if FLAGS.pipeline_config_path:
        model_config, train_config, input_config = (
            get_configs_from_pipeline_file())
    else:
        model_config, train_config, input_config = (
            get_configs_from_multiple_files())

    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=True)

    create_input_dict_fn = functools.partial(input_reader_builder.build,
                                             input_config)

    env = json.loads(os.environ.get("TF_CONFIG", "{}"))
    cluster_data = env.get("cluster", None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    task_data = env.get("task", None) or {"type": "master", "index": 0}
    task_info = type("TaskSpec", (object, ), task_data)

    # Parameters for a single worker.
    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = "lonely_worker"
    task = 0
    is_chief = True
    master = ""

    if cluster_data and "worker" in cluster_data:
        # Number of total worker replicas include "worker"s and the "master".
        worker_replicas = len(cluster_data["worker"]) + 1
    if cluster_data and "ps" in cluster_data:
        ps_tasks = len(cluster_data["ps"])

    if worker_replicas > 1 and ps_tasks < 1:
        raise ValueError(
            "At least 1 ps task is needed for distributed training.")

    if worker_replicas >= 1 and ps_tasks > 0:
        # Set up distributed training.
        server = tf.train.Server(
            tf.train.ClusterSpec(cluster),
            protocol="grpc",
            job_name=task_info.type,
            task_index=task_info.index,
        )
        if task_info.type == "ps":
            server.join()
            return

        worker_job_name = "%s/task:%d" % (task_info.type, task_info.index)
        task = task_info.index
        is_chief = task_info.type == "master"
        master = server.target

    trainer.train(
        create_input_dict_fn,
        model_fn,
        train_config,
        master,
        task,
        FLAGS.num_clones,
        worker_replicas,
        FLAGS.clone_on_cpu,
        ps_tasks,
        worker_job_name,
        is_chief,
        FLAGS.train_dir,
    )
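All of these examples target the TF1 graph-mode APIs (`tf.gfile`, `tf.train.Server`, `tf.logging`). When running under TensorFlow 2, a compatibility shim along these lines is typically needed first (a sketch, assuming the legacy `object_detection` code path is installed):

import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()  # restore TF1 graph-mode semantics under TF2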