def _train(self):
    tf.disable_eager_execution()

    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''
    graph_rewriter_fn = None

    # Loading and reading the config file.
    configs = create_configs_from_pipeline_proto(self.pipeline)
    model_config = configs['model']
    train_config = configs['train_config']
    input_config = configs['train_input_config']

    # Creating the TF Object Detection API model (from the config parameters).
    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=True)

    def get_next(config):
        return dataset_builder.make_initializable_iterator(
            dataset_builder.build(config)).get_next()

    create_input_dict_fn = functools.partial(get_next, input_config)

    if 'graph_rewriter_config' in configs:
        graph_rewriter_fn = graph_rewriter_builder.build(
            configs['graph_rewriter_config'], is_training=True)

    # Training the model with the new parameters.
    trainer.train(create_input_dict_fn, model_fn, train_config, master, task,
                  1, worker_replicas, False, ps_tasks, worker_job_name,
                  is_chief, str(self._out_folder),
                  graph_hook_fn=graph_rewriter_fn)

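# The _train method above expects a pipeline_pb2.TrainEvalPipelineConfig proto on
# self.pipeline and an output directory on self._out_folder. A minimal sketch of
# how such a proto might be prepared with the Object Detection API's config_util
# helpers; the file paths and override values below are made-up examples, not
# part of the original code.
from object_detection.utils import config_util

configs = config_util.get_configs_from_pipeline_file('faster_rcnn_resnet101.config')
configs['train_config'].num_steps = 20000                      # illustrative override
configs['train_config'].fine_tune_checkpoint = 'pretrained/model.ckpt'
configs['train_input_config'].tf_record_input_reader.input_path[:] = ['data/train.record']
pipeline_proto = config_util.create_pipeline_proto_from_configs(configs)
# pipeline_proto is the kind of object _train() consumes via
# create_configs_from_pipeline_proto(self.pipeline).
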
def main(_):
    assert FLAGS.train_dir, '`train_dir` is missing.'
    if FLAGS.pipeline_config_path:
        model_config, train_config, input_config = get_configs_from_pipeline_file()
    else:
        model_config, train_config, input_config = get_configs_from_multiple_files()

    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=True)
    create_input_dict_fn = functools.partial(input_reader_builder.build,
                                             input_config)

    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    cluster_data = env.get('cluster', None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    task_data = env.get('task', None) or {'type': 'master', 'index': 0}
    task_info = type('TaskSpec', (object,), task_data)

    # Parameters for a single worker.
    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''

    if cluster_data and 'worker' in cluster_data:
        # Number of total worker replicas includes "worker"s and the "master".
        worker_replicas = len(cluster_data['worker']) + 1
    if cluster_data and 'ps' in cluster_data:
        ps_tasks = len(cluster_data['ps'])

    if worker_replicas > 1 and ps_tasks < 1:
        raise ValueError('At least 1 ps task is needed for distributed training.')

    if worker_replicas >= 1 and ps_tasks > 0:
        # Set up distributed training.
        server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc',
                                 job_name=task_info.type,
                                 task_index=task_info.index)
        if task_info.type == 'ps':
            server.join()
            return

        worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
        task = task_info.index
        is_chief = (task_info.type == 'master')
        master = server.target

    trainer.train(create_input_dict_fn, model_fn, train_config, master, task,
                  FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu,
                  ps_tasks, worker_job_name, is_chief, FLAGS.train_dir)

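# The cluster/task parsing above reads the standard TF_CONFIG environment
# variable. As a rough, hypothetical illustration (job addresses are made up),
# a worker in a one-master/one-worker/one-ps cluster could be launched with:
import json
import os

os.environ['TF_CONFIG'] = json.dumps({
    'cluster': {
        'master': ['10.0.0.1:2222'],
        'worker': ['10.0.0.2:2222'],
        'ps': ['10.0.0.3:2222'],
    },
    'task': {'type': 'worker', 'index': 0},
})
# With this setting, main() computes worker_replicas = 2 and ps_tasks = 1 and
# starts a tf.train.Server for the 'worker' job.
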
def train_car(train_dir="models/faster_car_models",
              pipeline_config_path="pipeline/faster_rcnn_resnet101_car.config",
              num_clones=1,
              clone_on_cpu=False):
    if pipeline_config_path:
        configs = config_util.get_configs_from_pipeline_file(pipeline_config_path)
        if not os.path.exists(train_dir):
            os.makedirs(train_dir)
        tf.gfile.Copy(pipeline_config_path,
                      os.path.join(train_dir, 'pipeline.config'),
                      overwrite=True)

    model_config = configs['model']
    train_config = configs['train_config']
    input_config = configs['train_input_config']

    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=True)

    def get_next(config):
        return dataset_builder.make_initializable_iterator(
            dataset_builder.build(config)).get_next()

    create_input_dict_fn = functools.partial(get_next, input_config)

    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    cluster_data = env.get('cluster', None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    task_data = env.get('task', None) or {'type': 'master', 'index': 0}
    task_info = type('TaskSpec', (object,), task_data)

    # Parameters for a single worker.
    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''
    graph_rewriter_fn = None

    trainer.train(create_input_dict_fn, model_fn, train_config, master, task,
                  num_clones, worker_replicas, clone_on_cpu, ps_tasks,
                  worker_job_name, is_chief, train_dir,
                  graph_hook_fn=graph_rewriter_fn)

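# Example invocation of the wrapper above; the directory and config paths are
# simply the function defaults and purely illustrative.
if __name__ == '__main__':
    train_car(train_dir="models/faster_car_models",
              pipeline_config_path="pipeline/faster_rcnn_resnet101_car.config",
              num_clones=1,
              clone_on_cpu=False)
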
def main(_):
    assert FLAGS.train_dir, '`train_dir` is missing.'
    tf.gfile.MakeDirs(FLAGS.train_dir)
    assert FLAGS.pipeline_config_path, '`pipeline_config_path` is missing.'
    configs = config_util.get_configs_from_pipeline_file(FLAGS.pipeline_config_path)
    tf.gfile.Copy(FLAGS.pipeline_config_path,
                  os.path.join(FLAGS.train_dir, 'pipeline.config'),
                  overwrite=True)

    model_config = configs['model']
    train_config = configs['train_config']
    input_config = configs['train_input_config']

    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=True)

    def get_next(config):
        return dataset_builder.make_initializable_iterator(
            dataset_builder.build(config)).get_next()

    create_input_dict_fn = functools.partial(get_next, input_config)

    # Parameters for a single worker.
    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''
    num_clones = 1
    clone_on_cpu = False

    graph_rewriter_fn = None
    if 'graph_rewriter_config' in configs:
        graph_rewriter_fn = graph_rewriter_builder.build(
            configs['graph_rewriter_config'], is_training=True)

    trainer.train(create_input_dict_fn, model_fn, train_config, master, task,
                  num_clones, worker_replicas, clone_on_cpu, ps_tasks,
                  worker_job_name, is_chief, FLAGS.train_dir,
                  graph_hook_fn=graph_rewriter_fn)

def test_configure_trainer_with_multiclass_scores_and_train_two_steps(self):
    train_config_text_proto = """
    optimizer {
      adam_optimizer {
        learning_rate {
          constant_learning_rate {
            learning_rate: 0.01
          }
        }
      }
    }
    data_augmentation_options {
      random_adjust_brightness {
        max_delta: 0.2
      }
    }
    data_augmentation_options {
      random_adjust_contrast {
        min_delta: 0.7
        max_delta: 1.1
      }
    }
    num_steps: 2
    use_multiclass_scores: true
    """
    train_config = train_pb2.TrainConfig()
    text_format.Merge(train_config_text_proto, train_config)
    train_dir = self.get_temp_dir()
    trainer.train(create_tensor_dict_fn=get_input_function,
                  create_model_fn=FakeDetectionModel,
                  train_config=train_config,
                  master='',
                  task=0,
                  num_clones=1,
                  worker_replicas=1,
                  clone_on_cpu=True,
                  ps_tasks=0,
                  worker_job_name='worker',
                  is_chief=True,
                  train_dir=train_dir)

def main(_):
    assert FLAGS.train_dir, '`train_dir` is missing.'
    if FLAGS.task == 0:
        tf.gfile.MakeDirs(FLAGS.train_dir)
    if FLAGS.pipeline_config_path:
        configs = config_util.get_configs_from_pipeline_file(
            FLAGS.pipeline_config_path)
        if FLAGS.task == 0:
            tf.gfile.Copy(FLAGS.pipeline_config_path,
                          os.path.join(FLAGS.train_dir, 'pipeline.config'),
                          overwrite=True)
    else:
        configs = config_util.get_configs_from_multiple_files(
            model_config_path=FLAGS.model_config_path,
            train_config_path=FLAGS.train_config_path,
            train_input_config_path=FLAGS.input_config_path)
        if FLAGS.task == 0:
            for name, config in [('model.config', FLAGS.model_config_path),
                                 ('train.config', FLAGS.train_config_path),
                                 ('input.config', FLAGS.input_config_path)]:
                print(FLAGS.train_dir, name)
                tf.gfile.Copy(config, os.path.join(FLAGS.train_dir, name),
                              overwrite=True)

    model_config = configs['model']
    train_config = configs['train_config']
    input_config = configs['train_input_config']

    model_fn = functools.partial(
        model_builder.build, model_config=model_config, is_training=True)

    def get_next(config):
        return dataset_builder.make_initializable_iterator(
            dataset_builder.build(config)).get_next()

    create_input_dict_fn = functools.partial(get_next, input_config)

    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    cluster_data = env.get('cluster', None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    task_data = env.get('task', None) or {'type': 'master', 'index': 0}
    task_info = type('TaskSpec', (object,), task_data)

    # Parameters for a single worker.
    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''

    if cluster_data and 'worker' in cluster_data:
        # Number of total worker replicas includes "worker"s and the "master".
        worker_replicas = len(cluster_data['worker']) + 1
    if cluster_data and 'ps' in cluster_data:
        ps_tasks = len(cluster_data['ps'])

    if worker_replicas > 1 and ps_tasks < 1:
        raise ValueError('At least 1 ps task is needed for distributed training.')

    if worker_replicas >= 1 and ps_tasks > 0:
        # Set up distributed training.
        server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc',
                                 job_name=task_info.type,
                                 task_index=task_info.index)
        if task_info.type == 'ps':
            server.join()
            return

        worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
        task = task_info.index
        is_chief = (task_info.type == 'master')
        master = server.target

    graph_rewriter_fn = None
    if 'graph_rewriter_config' in configs:
        graph_rewriter_fn = graph_rewriter_builder.build(
            configs['graph_rewriter_config'], is_training=True)

    trainer.train(
        create_input_dict_fn, model_fn, train_config, master, task,
        FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu, ps_tasks,
        worker_job_name, is_chief, FLAGS.train_dir,
        graph_hook_fn=graph_rewriter_fn)

def main(_):
    print("starting program . . .")

    # Show info to stdout during the training process.
    tf.logging.set_verbosity(tf.logging.INFO)

    if not checkIfNecessaryPathsAndFilesExist():
        return
    # end if

    configs = config_util.get_configs_from_pipeline_file(PIPELINE_CONFIG_PATH)
    tf.gfile.Copy(PIPELINE_CONFIG_PATH,
                  os.path.join(TRAINING_DATA_DIR, 'pipeline.config'),
                  overwrite=True)

    model_config = configs['model']
    train_config = configs['train_config']
    input_config = configs['train_input_config']

    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=True)

    # ToDo: this nested function seems odd, factor this out eventually ??
    # nested function
    def get_next(config):
        return dataset_builder.make_initializable_iterator(
            dataset_builder.build(config)).get_next()
    # end nested function

    create_input_dict_fn = functools.partial(get_next, input_config)

    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    cluster_data = env.get('cluster', None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    task_data = env.get('task', None) or {'type': 'master', 'index': 0}
    task_info = type('TaskSpec', (object,), task_data)

    # Parameters for a single worker.
    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''

    if cluster_data and 'worker' in cluster_data:
        # Number of total worker replicas includes "worker"s and the "master".
        worker_replicas = len(cluster_data['worker']) + 1
    # end if
    if cluster_data and 'ps' in cluster_data:
        ps_tasks = len(cluster_data['ps'])
    # end if

    if worker_replicas > 1 and ps_tasks < 1:
        raise ValueError('At least 1 ps task is needed for distributed training.')
    # end if

    if worker_replicas >= 1 and ps_tasks > 0:
        # Set up distributed training.
        server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc',
                                 job_name=task_info.type,
                                 task_index=task_info.index)
        if task_info.type == 'ps':
            server.join()
            return
        # end if

        worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
        task = task_info.index
        is_chief = (task_info.type == 'master')
        master = server.target
    # end if

    trainer.train(create_input_dict_fn, model_fn, train_config, master, task,
                  NUM_CLONES, worker_replicas, CLONE_ON_CPU, ps_tasks,
                  worker_job_name, is_chief, TRAINING_DATA_DIR)

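# The script above relies on several module-level constants that are defined
# elsewhere in its original project. Hypothetical placeholder values might look
# like this (the names follow the usage in main(); the values are assumptions,
# not taken from the source):
PIPELINE_CONFIG_PATH = 'training_data/ssd_inception_v2_coco.config'
TRAINING_DATA_DIR = 'training_data'
NUM_CLONES = 1
CLONE_ON_CPU = False
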
def main(_):
    assert FLAGS.train_dir, '`train_dir` is missing.'
    if FLAGS.task == 0:
        tf.gfile.MakeDirs(FLAGS.train_dir)  # tf.gfile creates the directory
    if FLAGS.pipeline_config_path:
        # Read the pipeline_config_path file and return a dict holding the
        # `model`, `train_config`, `train_input_config`, `eval_config` and
        # `eval_input_config` sections.
        configs = config_util.get_configs_from_pipeline_file(
            FLAGS.pipeline_config_path)
        if FLAGS.task == 0:
            # Copy the pipeline config into train_dir as 'pipeline.config'.
            tf.gfile.Copy(FLAGS.pipeline_config_path,
                          os.path.join(FLAGS.train_dir, 'pipeline.config'),
                          overwrite=True)
    else:
        # Read the model, train and input configs from separate files at
        # model_config_path, train_config_path and input_config_path.
        configs = config_util.get_configs_from_multiple_files(
            model_config_path=FLAGS.model_config_path,
            train_config_path=FLAGS.train_config_path,
            train_input_config_path=FLAGS.input_config_path)
        if FLAGS.task == 0:
            for name, config in [('model.config', FLAGS.model_config_path),
                                 ('train.config', FLAGS.train_config_path),
                                 ('input.config', FLAGS.input_config_path)]:
                tf.gfile.Copy(config, os.path.join(FLAGS.train_dir, name),
                              overwrite=True)

    model_config = configs['model']
    train_config = configs['train_config']
    input_config = configs['train_input_config']

    # The following line is the core of this script: functools.partial binds
    # default arguments to model_builder.build, giving a simpler callable with
    # fewer, more flexible parameters. The model_builder module in this package
    # contains the code that constructs the detection networks (SSD,
    # Faster R-CNN and many others); a portion of it looks roughly like:
    #
    #   def build(model_config, is_training):
    #       if not isinstance(model_config, model_pb2.DetectionModel):
    #           raise ValueError('model_config not of type model_pb2.DetectionModel.')
    #       # Get the model type from the config.
    #       meta_architecture = model_config.WhichOneof('model')
    #       # Do the concrete construction.
    #       if meta_architecture == 'ssd':
    #           return _build_ssd_model(model_config.ssd, is_training)
    #       if meta_architecture == 'faster_rcnn':
    #           return _build_faster_rcnn_model(model_config.faster_rcnn, is_training)
    #       raise ValueError('Unknown meta architecture: {}'.format(meta_architecture))
    #
    # Taking faster_rcnn as an example, _build_faster_rcnn_model (still in
    # model_builder.py) assembles all of the Faster R-CNN parameters and then
    # builds each sub-model in turn, for instance via image_resizer_builder.
    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=True)  # stage-two parameter configuration

    def get_next(config):
        return dataset_builder.make_initializable_iterator(
            dataset_builder.build(config)).get_next()

    create_input_dict_fn = functools.partial(get_next, input_config)

    # Decode the TF_CONFIG JSON object.
    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    cluster_data = env.get('cluster', None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    task_data = env.get('task', None) or {'type': 'master', 'index': 0}
    task_info = type('TaskSpec', (object,), task_data)

    # Parameters for a single worker.
    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''

    if cluster_data and 'worker' in cluster_data:
        # Number of total worker replicas includes "worker"s and the "master".
        worker_replicas = len(cluster_data['worker']) + 1
    if cluster_data and 'ps' in cluster_data:
        ps_tasks = len(cluster_data['ps'])

    if worker_replicas > 1 and ps_tasks < 1:
        raise ValueError('At least 1 ps task is needed for distributed training.')

    if worker_replicas >= 1 and ps_tasks > 0:
        # Set up distributed training.
        server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc',
                                 job_name=task_info.type,
                                 task_index=task_info.index)
        if task_info.type == 'ps':
            server.join()
            return

        worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
        task = task_info.index
        is_chief = (task_info.type == 'master')
        master = server.target

    graph_rewriter_fn = None
    if 'graph_rewriter_config' in configs:
        graph_rewriter_fn = graph_rewriter_builder.build(
            configs['graph_rewriter_config'], is_training=True)

    trainer.train(create_input_dict_fn, model_fn, train_config, master, task,
                  FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu,
                  ps_tasks, worker_job_name, is_chief, FLAGS.train_dir,
                  graph_hook_fn=graph_rewriter_fn)

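# The comment block above walks through model_builder.build(); as a minimal,
# self-contained sketch of calling it directly (the config path is illustrative,
# not from the source):
import functools

from object_detection.builders import model_builder
from object_detection.utils import config_util

configs = config_util.get_configs_from_pipeline_file('pipeline.config')
# build() returns a DetectionModel instance (e.g. FasterRCNNMetaArch for a
# faster_rcnn config) ready to be handed to the trainer.
detection_model_fn = functools.partial(model_builder.build,
                                       model_config=configs['model'],
                                       is_training=True)
detection_model = detection_model_fn()
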
def main():
    configs = config_util.get_configs_from_pipeline_file(CONFIG_PATH)
    model_config = configs['model']
    train_config = configs['train_config']
    input_config = configs['train_input_config']

    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=True)

    def get_next(config):
        return dataset_builder.make_initializable_iterator(
            dataset_builder.build(config)).get_next()

    create_input_dict_fn = functools.partial(get_next, input_config)

    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    cluster_data = env.get('cluster', None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    task_data = env.get('task', None) or {'type': 'master', 'index': 0}
    task_info = type('TaskSpec', (object,), task_data)

    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''

    if cluster_data and 'worker' in cluster_data:
        worker_replicas = len(cluster_data['worker']) + 1
    if cluster_data and 'ps' in cluster_data:
        ps_tasks = len(cluster_data['ps'])

    if worker_replicas > 1 and ps_tasks < 1:
        raise ValueError('At least 1 ps task is needed for distributed training.')

    if worker_replicas >= 1 and ps_tasks > 0:
        server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc',
                                 job_name=task_info.type,
                                 task_index=task_info.index)
        if task_info.type == 'ps':
            server.join()
            return

        worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
        task = task_info.index
        is_chief = (task_info.type == 'master')
        master = server.target

    graph_rewriter_fn = None
    if 'graph_rewriter_config' in configs:
        graph_rewriter_fn = graph_rewriter_builder.build(
            configs['graph_rewriter_config'], is_training=True)

    trainer.train(create_input_dict_fn, model_fn, train_config, master, task,
                  NUM_CLONE, worker_replicas, CLONE_ON_CPU, ps_tasks,
                  worker_job_name, is_chief, TRAINING_DIR,
                  graph_hook_fn=graph_rewriter_fn)

if task_info.type == 'ps':
    server.join()
    # return

worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
task = task_info.index
is_chief = (task_info.type == 'master')
master = server.target

graph_rewriter_fn = None
if 'graph_rewriter_config' in configs:
    graph_rewriter_fn = graph_rewriter_builder.build(
        configs['graph_rewriter_config'], is_training=True)

trainer.train(
    create_input_dict_fn, model_fn, train_config, master, task,
    FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu, ps_tasks,
    worker_job_name, is_chief, train_dir,
    graph_hook_fn=graph_rewriter_fn)

def train_tensorflow_object_detection_api(base_config_path, to_save_path,
                                          dataset_path, num_steps):
    os.makedirs(to_save_path, exist_ok=True)
    if len(os.listdir(to_save_path)) == 0:
        do_xml_to_csv(to_save_path, dataset_path)
        generate_config(base_config_path, to_save_path, num_steps)
        generate_tfrecords(base_config_path, to_save_path, dataset_path)

    checkpoint_path = "{}/training".format(to_save_path)
    config_file_path = "{}/config/faster_rcnn_inception_v2_pets.config".format(
        to_save_path)

    tf.logging.set_verbosity(tf.logging.INFO)

    flags = tf.app.flags
    flags.DEFINE_string('f', '', 'kernel')
    flags.DEFINE_string('master', '', 'Name of the TensorFlow master to use.')
    flags.DEFINE_integer('task', 0, 'task id')
    flags.DEFINE_integer('num_clones', 1, 'Number of clones to deploy per worker.')
    flags.DEFINE_boolean(
        'clone_on_cpu', False,
        'Force clones to be deployed on CPU. Note that even if '
        'set to False (allowing ops to run on gpu), some ops may '
        'still be run on the CPU if they have no GPU kernel.')
    flags.DEFINE_integer('worker_replicas', 1, 'Number of worker+trainer replicas.')
    flags.DEFINE_integer(
        'ps_tasks', 0,
        'Number of parameter server tasks. If None, does not use '
        'a parameter server.')
    flags.DEFINE_string(
        'train_dir', '{}'.format(checkpoint_path),
        'Directory to save the checkpoints and training summaries.')
    flags.DEFINE_string(
        'pipeline_config_path', '{}'.format(config_file_path),
        'Path to a pipeline_pb2.TrainEvalPipelineConfig config '
        'file. If provided, other configs are ignored')
    flags.DEFINE_string('train_config_path', '',
                        'Path to a train_pb2.TrainConfig config file.')
    flags.DEFINE_string('input_config_path', '',
                        'Path to an input_reader_pb2.InputReader config file.')
    flags.DEFINE_string('model_config_path', '',
                        'Path to a model_pb2.DetectionModel config file.')
    FLAGS = flags.FLAGS

    assert FLAGS.train_dir, '`train_dir` is missing.'
    if FLAGS.task == 0:
        tf.gfile.MakeDirs(FLAGS.train_dir)
    if FLAGS.pipeline_config_path:
        configs = config_util.get_configs_from_pipeline_file(
            FLAGS.pipeline_config_path)
        if FLAGS.task == 0:
            tf.gfile.Copy(FLAGS.pipeline_config_path,
                          os.path.join(FLAGS.train_dir, 'pipeline.config'),
                          overwrite=True)
    else:
        configs = config_util.get_configs_from_multiple_files(
            model_config_path=FLAGS.model_config_path,
            train_config_path=FLAGS.train_config_path,
            train_input_config_path=FLAGS.input_config_path)
        if FLAGS.task == 0:
            for name, config in [('model.config', FLAGS.model_config_path),
                                 ('train.config', FLAGS.train_config_path),
                                 ('input.config', FLAGS.input_config_path)]:
                tf.gfile.Copy(config, os.path.join(FLAGS.train_dir, name),
                              overwrite=True)

    model_config = configs['model']
    train_config = configs['train_config']
    input_config = configs['train_input_config']

    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=True)

    def get_next(config):
        return dataset_builder.make_initializable_iterator(
            dataset_builder.build(config)).get_next()

    create_input_dict_fn = functools.partial(get_next, input_config)

    env = json.loads(os.environ.get('TF_CONFIG', '{}'))
    cluster_data = env.get('cluster', None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    task_data = env.get('task', None) or {'type': 'master', 'index': 0}
    task_info = type('TaskSpec', (object,), task_data)

    # Parameters for a single worker.
    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'lonely_worker'
    task = 0
    is_chief = True
    master = ''

    if cluster_data and 'worker' in cluster_data:
        # Number of total worker replicas includes "worker"s and the "master".
        worker_replicas = len(cluster_data['worker']) + 1
    if cluster_data and 'ps' in cluster_data:
        ps_tasks = len(cluster_data['ps'])

    if worker_replicas > 1 and ps_tasks < 1:
        raise ValueError('At least 1 ps task is needed for distributed training.')

    if worker_replicas >= 1 and ps_tasks > 0:
        # Set up distributed training.
        server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc',
                                 job_name=task_info.type,
                                 task_index=task_info.index)
        if task_info.type == 'ps':
            server.join()
            return

        worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
        task = task_info.index
        is_chief = (task_info.type == 'master')
        master = server.target

    graph_rewriter_fn = None
    if 'graph_rewriter_config' in configs:
        graph_rewriter_fn = graph_rewriter_builder.build(
            configs['graph_rewriter_config'], is_training=True)

    # Get the TF logger.
    log = logging.getLogger('tensorflow')
    log.setLevel(logging.INFO)
    # Create a formatter and add it to the handlers.
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    # Create a file handler which logs even debug messages.
    # Note: this path assumes to_save_path ends with '/'; otherwise it will not
    # match the '{}/training' checkpoint directory created above.
    fh = logging.FileHandler('{}training/training.log'.format(to_save_path))
    fh.setLevel(logging.INFO)
    fh.setFormatter(formatter)
    log.addHandler(fh)

    trainer.train(create_input_dict_fn, model_fn, train_config, master, task,
                  FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu,
                  ps_tasks, worker_job_name, is_chief, FLAGS.train_dir,
                  graph_hook_fn=graph_rewriter_fn)


# Leftover usage notes from the original script (paths are machine-specific):
# tf.app.run(main=train_tensorflow_object_detection_api)
# base_config_path = '/home/aniruddh/lincode/product/livis-develop_v2/gpu_q/tf_training/'
# to_save_path = '/home/aniruddh/lincode/product/livis-develop_v2/experiments/tensorflow/t2/'
# dataset_path = '/home/aniruddh/lincode/product/images/'
# num_steps = 15000
# train_tensorflow_object_detection_api(base_config_path, to_save_path, dataset_path, num_steps)
# os.makedirs(to_save_path, exist_ok=True)
# do_xml_to_csv(to_save_path, dataset_path)
# config_path = generate_config(base_config_path, to_save_path, num_steps)
# generate_tfrecords(base_config_path, to_save_path, dataset_path)
# train_tf_model(base_config_path, to_save_path, dataset_path, num_steps)

def main(_):
    assert FLAGS.train_dir, "`train_dir` is missing."
    if FLAGS.pipeline_config_path:
        model_config, train_config, input_config = (
            get_configs_from_pipeline_file())
    else:
        model_config, train_config, input_config = (
            get_configs_from_multiple_files())

    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=True)
    create_input_dict_fn = functools.partial(input_reader_builder.build,
                                             input_config)

    env = json.loads(os.environ.get("TF_CONFIG", "{}"))
    cluster_data = env.get("cluster", None)
    cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
    task_data = env.get("task", None) or {"type": "master", "index": 0}
    task_info = type("TaskSpec", (object,), task_data)

    # Parameters for a single worker.
    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = "lonely_worker"
    task = 0
    is_chief = True
    master = ""

    if cluster_data and "worker" in cluster_data:
        # Number of total worker replicas includes "worker"s and the "master".
        worker_replicas = len(cluster_data["worker"]) + 1
    if cluster_data and "ps" in cluster_data:
        ps_tasks = len(cluster_data["ps"])

    if worker_replicas > 1 and ps_tasks < 1:
        raise ValueError("At least 1 ps task is needed for distributed training.")

    if worker_replicas >= 1 and ps_tasks > 0:
        # Set up distributed training.
        server = tf.train.Server(
            tf.train.ClusterSpec(cluster),
            protocol="grpc",
            job_name=task_info.type,
            task_index=task_info.index,
        )
        if task_info.type == "ps":
            server.join()
            return

        worker_job_name = "%s/task:%d" % (task_info.type, task_info.index)
        task = task_info.index
        is_chief = task_info.type == "master"
        master = server.target

    trainer.train(
        create_input_dict_fn,
        model_fn,
        train_config,
        master,
        task,
        FLAGS.num_clones,
        worker_replicas,
        FLAGS.clone_on_cpu,
        ps_tasks,
        worker_job_name,
        is_chief,
        FLAGS.train_dir,
    )