def main(_): #gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.5) #sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options)) assert FLAGS.train_dir, '`train_dir` is missing.' if FLAGS.pipeline_config_path: model_config, train_config, input_config = get_configs_from_pipeline_file( ) else: model_config, train_config, input_config = get_configs_from_multiple_files( ) model_fn = functools.partial(model_builder.build, model_config=model_config, is_training=True) create_input_dict_fn = functools.partial(input_reader_builder.build, input_config) env = json.loads(os.environ.get('TF_CONFIG', '{}')) cluster_data = env.get('cluster', None) cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None task_data = env.get('task', None) or {'type': 'master', 'index': 0} task_info = type('TaskSpec', (object, ), task_data) # Parameters for a single worker. ps_tasks = 0 worker_replicas = 1 worker_job_name = 'lonely_worker' task = 0 is_chief = True master = '' if cluster_data and 'worker' in cluster_data: # Number of total worker replicas include "worker"s and the "master". worker_replicas = len(cluster_data['worker']) + 1 if cluster_data and 'ps' in cluster_data: ps_tasks = len(cluster_data['ps']) if worker_replicas > 1 and ps_tasks < 1: raise ValueError( 'At least 1 ps task is needed for distributed training.') if worker_replicas >= 1 and ps_tasks > 0: # Set up distributed training. server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc', job_name=task_info.type, task_index=task_info.index) if task_info.type == 'ps': server.join() return worker_job_name = '%s/task:%d' % (task_info.type, task_info.index) task = task_info.index is_chief = (task_info.type == 'master') master = server.target trainer.train(create_input_dict_fn, model_fn, train_config, master, task, FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu, ps_tasks, worker_job_name, is_chief, FLAGS.train_dir)
def start(self):
    if self.config is None:
        logger.error('No Config Found')
        return
    train_pipeline_file = self.config['pipeline_config_file']
    configs = self._get_configs_from_pipeline_file(train_pipeline_file)
    model_config = configs['model']
    train_config = configs['train_config']
    input_config = configs['train_input_config']
    logger.info('Building Model')
    model_fn = functools.partial(model_builder.build,
                                 model_config=model_config,
                                 is_training=True)
    logger.info('Creating input dict')
    create_input_dict_fn = functools.partial(self.get_next, input_config)
    # Parameters for a single, local worker.
    ps_tasks = 0
    worker_replicas = 1
    worker_job_name = 'obj_detection_trainer'
    task = 0
    is_chief = True
    master = ''
    num_clones = 1
    clone_on_cpu = False
    try:
        logger.info('Training Started')
        trainer.train(create_input_dict_fn, model_fn, train_config, master,
                      task, num_clones, worker_replicas, clone_on_cpu,
                      ps_tasks, worker_job_name, is_chief, self.config)
    except Exception:
        logger.error('Cannot Start Training')
        traceback.print_exc(file=sys.stdout)
def main(_): assert FLAGS.train_dir, '`train_dir` is missing.' if FLAGS.pipeline_config_path: model_config, train_config, input_config = get_configs_from_pipeline_file() else: model_config, train_config, input_config = get_configs_from_multiple_files() model_fn = functools.partial( model_builder.build, model_config=model_config, is_training=True) create_input_dict_fn = functools.partial( input_reader_builder.build, input_config) env = json.loads(os.environ.get('TF_CONFIG', '{}')) cluster_data = env.get('cluster', None) cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None task_data = env.get('task', None) or {'type': 'master', 'index': 0} task_info = type('TaskSpec', (object,), task_data) # Parameters for a single worker. ps_tasks = 0 worker_replicas = 1 worker_job_name = 'lonely_worker' task = 0 is_chief = True master = '' if cluster_data and 'worker' in cluster_data: # Number of total worker replicas include "worker"s and the "master". worker_replicas = len(cluster_data['worker']) + 1 if cluster_data and 'ps' in cluster_data: ps_tasks = len(cluster_data['ps']) if worker_replicas > 1 and ps_tasks < 1: raise ValueError('At least 1 ps task is needed for distributed training.') if worker_replicas >= 1 and ps_tasks > 0: # Set up distributed training. server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc', job_name=task_info.type, task_index=task_info.index) if task_info.type == 'ps': server.join() return worker_job_name = '%s/task:%d' % (task_info.type, task_info.index) task = task_info.index is_chief = (task_info.type == 'master') master = server.target trainer.train(create_input_dict_fn, model_fn, train_config, master, task, FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu, ps_tasks, worker_job_name, is_chief, FLAGS.train_dir)
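# The distributed-training branch in the snippets above is driven entirely by
# the TF_CONFIG environment variable. A minimal sketch of the JSON it expects
# is shown below; the host:port addresses are placeholders, not values taken
# from any real deployment.
import json
import os

os.environ['TF_CONFIG'] = json.dumps({
    'cluster': {
        'master': ['host0:2222'],                 # chief worker
        'worker': ['host1:2222', 'host2:2222'],   # regular workers
        'ps': ['host3:2222'],                     # parameter server(s)
    },
    'task': {'type': 'worker', 'index': 0},       # role of *this* process
})
# With this setting, the code above computes worker_replicas = 3 (two "worker"
# tasks plus the "master") and ps_tasks = 1, then starts a tf.train.Server for
# the local task.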
def train_process(model_config, input_config, train_config, train_dir, num_clones=1, clone_on_cpu=False): model_fn = functools.partial(model_builder.build, model_config=model_config, is_training=True) create_input_dict_fn = functools.partial(input_reader_builder.build, input_config) env = json.loads(os.environ.get('TF_CONFIG', '{}')) cluster_data = env.get('cluster', None) cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None task_data = env.get('task', None) or {'type': 'master', 'index': 0} task_info = type('TaskSpec', (object, ), task_data) # Parameters for a single worker. ps_tasks = 0 worker_replicas = 1 worker_job_name = 'lonely_worker' task = 0 is_chief = True master = '' if cluster_data and 'worker' in cluster_data: # Number of total worker replicas include "worker"s and the "master". worker_replicas = len(cluster_data['worker']) + 1 if cluster_data and 'ps' in cluster_data: ps_tasks = len(cluster_data['ps']) if worker_replicas > 1 and ps_tasks < 1: raise ValueError( 'At least 1 ps task is needed for distributed training.') if worker_replicas >= 1 and ps_tasks > 0: # Set up distributed training. server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc', job_name=task_info.type, task_index=task_info.index) if task_info.type == 'ps': server.join() return worker_job_name = '%s/task:%d' % (task_info.type, task_info.index) task = task_info.index is_chief = (task_info.type == 'master') master = server.target # change_process_config(os.getpid()) total_loss = trainer.train(create_input_dict_fn, model_fn, train_config, master, task, num_clones, worker_replicas, clone_on_cpu, ps_tasks, worker_job_name, is_chief, train_dir) return total_loss
def test_configure_trainer_with_multiclass_scores_and_train_two_steps(self):
  train_config_text_proto = """
  optimizer {
    adam_optimizer {
      learning_rate {
        constant_learning_rate {
          learning_rate: 0.01
        }
      }
    }
  }
  data_augmentation_options {
    random_adjust_brightness {
      max_delta: 0.2
    }
  }
  data_augmentation_options {
    random_adjust_contrast {
      min_delta: 0.7
      max_delta: 1.1
    }
  }
  num_steps: 2
  use_multiclass_scores: true
  """
  train_config = train_pb2.TrainConfig()
  text_format.Merge(train_config_text_proto, train_config)
  train_dir = self.get_temp_dir()
  trainer.train(create_tensor_dict_fn=get_input_function,
                create_model_fn=FakeDetectionModel,
                train_config=train_config,
                master='',
                task=0,
                num_clones=1,
                worker_replicas=1,
                clone_on_cpu=True,
                ps_tasks=0,
                worker_job_name='worker',
                is_chief=True,
                train_dir=train_dir)
def train():
    return trainer.train(create_tensor_dict_fn=train_input_dict_fn,
                         create_model_fn=train_model_fn,
                         train_config=train_config,
                         master=master,
                         task=task,
                         num_clones=FLAGS.num_clones,
                         worker_replicas=worker_replicas,
                         clone_on_cpu=FLAGS.clone_on_cpu,
                         ps_tasks=parameter_server_tasks,
                         worker_job_name=worker_job_name,
                         is_chief=is_chief,
                         train_dir=FLAGS.train_dir,
                         graph_hook_fn=train_graph_rewriter_fn)
def main(_): assert FLAGS.train_dir, '`train_dir` is missing.' model_config, train_config, input_config = get_configs_from_pipeline_file() model_fn = functools.partial(model_builder.build, model_config=model_config, is_training=True) create_input_dict_fn = functools.partial(input_reader_builder.build, input_config) env = json.loads(os.environ.get('TF_CONFIG', '{}')) cluster_data = env.get('cluster', None) cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None task_data = env.get('task', None) or {'type': 'master', 'index': 0} task_info = type('TaskSpec', (object, ), task_data) # Parameters for a single worker. ps_tasks = 0 worker_replicas = 1 worker_job_name = 'lonely_worker' task = 0 is_chief = True master = '' if cluster_data and 'worker' in cluster_data: # Number of total worker replicas include "worker"s and the "master". worker_replicas = len(cluster_data['worker']) + 1 if cluster_data and 'ps' in cluster_data: ps_tasks = len(cluster_data['ps']) if worker_replicas > 1 and ps_tasks < 1: raise ValueError( 'At least 1 ps task is needed for distributed training.') if worker_replicas >= 1 and ps_tasks > 0: # Set up distributed training. server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc', job_name=task_info.type, task_index=task_info.index) if task_info.type == 'ps': server.join() return worker_job_name = '%s/task:%d' % (task_info.type, task_info.index) task = task_info.index is_chief = (task_info.type == 'master') master = server.target total_num_steps = train_config.num_steps current_step = FLAGS.eval_every_n_steps print("Total number of training steps {}".format(train_config.num_steps)) print("Evaluation will run every {} steps".format( FLAGS.eval_every_n_steps)) train_config.num_steps = current_step while current_step <= total_num_steps: print("Training steps # {0}".format(current_step)) trainer.train(create_input_dict_fn, model_fn, train_config, master, task, FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu, ps_tasks, worker_job_name, is_chief, FLAGS.train_dir) tf.reset_default_graph() evaluate_step() tf.reset_default_graph() current_step = current_step + FLAGS.eval_every_n_steps train_config.num_steps = current_step if current_step > FLAGS.eval_every_n_steps: train_config.num_steps = total_num_steps print("Training steps # {0}".format(train_config.num_steps)) trainer.train(create_input_dict_fn, model_fn, train_config, master, task, FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu, ps_tasks, worker_job_name, is_chief, FLAGS.train_dir)
def start_training(self): """Start training for the model""" worker_replicas = 1 ps_tasks = 0 clone_on_cpu = False num_clones = 1 ensure_path(config.BASE_MODELS_PATH) train_dir = self.train_dir model_json_path = os.path.join(train_dir, 'job.json') job = self.job num_steps = int(job['steps']) try: if config.DEBUG: num_steps = 50 except AttributeError: pass except Exception as e: _LOGGER.error(e) job = api.update_job_state(job, 'training', 'Start training for {} steps'.format(num_steps)) model = self.model ensure_path(config.EXPORTED_MODELS) model_graph = os.path.join(config.EXPORTED_MODELS, '{}.pb'.format(model['file_name'])) if not os.path.exists(os.path.join(train_dir, 'checkpoint')): # New training started _LOGGER.debug("Checkpoints doesn't exists") base_checkpoints_path = os.path.join(config.BASE_MODELS_PATH, model['architecture']) _tmf = os.path.join(config.TRAINED_MODELS_DATA, model['file_name']) if os.path.isdir(_tmf): _LOGGER.debug("Model already exists as %s" % model_graph) base_checkpoints_path = _tmf elif model['type'] == 'new': _LOGGER.debug("model type new") else: _LOGGER.debug("New model from parent model") parent_model = api.get_model(model['parent']) if not parent_model: raise Exception('Parent model not found on server') parent_tmf = os.path.join(config.TRAINED_MODELS_DATA, parent_model['file_name']) if os.path.isdir(parent_tmf): base_checkpoints_path = parent_tmf else: _LOGGER.error("Parent model not found. please train it first") return False if not os.path.exists(os.path.join(base_checkpoints_path, 'model.ckpt.meta')): _LOGGER.debug("Base model not found for %s, Downloading now." % model['architecture']) _f = api.download_model_files(model['architecture']) tmp_model_data = os.path.join(config.DATA_DIR, 'tmp_model_data') if tarfile.is_tarfile(_f): if os.path.exists(tmp_model_data): shutil.rmtree(tmp_model_data) ensure_path(tmp_model_data) print("Tar file found") shutil.unpack_archive(_f, tmp_model_data) for root, dirs, files in os.walk(tmp_model_data): for file in files: if 'model.ckpt' in file: path = os.path.join(root, file) # print(path) ensure_path(base_checkpoints_path) shutil.copy(path, os.path.join(base_checkpoints_path, file)) else: _LOGGER.error("Invalid file") return False if os.path.exists(train_dir): shutil.rmtree(train_dir) shutil.copytree(base_checkpoints_path, train_dir) if os.path.exists(os.path.join(train_dir, 'checkpoint')): os.remove(os.path.join(train_dir, 'checkpoint')) if os.path.exists(os.path.join(train_dir, 'data')): shutil.rmtree(os.path.join(train_dir, 'data')) shutil.copytree(self.data_dir, os.path.join(train_dir, 'data')) counts = {'train': 0, 'test': 1000, 'classes': 1} stats_file = os.path.join(train_dir, "data", "stats.json") try: with open(stats_file) as _f: counts = json.load(_f) except: pass pipeline_config_path = os.path.join(train_dir, 'pipeline.config') if not os.path.exists(pipeline_config_path): pipeline_config_path = os.path.join(self.configs_dir, "{}.config".format(model['architecture'])) task = '0' if task == '0': tf.gfile.MakeDirs(train_dir) if pipeline_config_path: _LOGGER.info("Pipeline config file : {}".format(pipeline_config_path)) configs = config_util.get_configs_from_pipeline_file( pipeline_config_path) if task == '0': tf.gfile.Copy(pipeline_config_path, os.path.join(train_dir, 'pipeline.config'), overwrite=True) else: _LOGGER.error("No config found") return False pipeline_config_path = os.path.join(train_dir, 'pipeline.config') # with open(model_json_path, 'w') as mf: # json.dump(job, mf) model_config = configs['model'] 
train_config = configs['train_config'] input_config = configs['train_input_config'] if model_config.HasField('faster_rcnn'): model_config.faster_rcnn.num_classes = counts['classes'] if model_config.HasField('ssd'): model_config.ssd.num_classes = counts['classes'] # Set num_steps train_config.num_steps = num_steps train_config.fine_tune_checkpoint = os.path.join(train_dir, 'model.ckpt') # Update input config to use updated list of input input_config.tf_record_input_reader.ClearField('input_path') input_config.tf_record_input_reader.input_path.append(os.path.join(train_dir, 'data', "train_baheads.tfrecord-??????")) input_config.label_map_path = os.path.join(train_dir, 'data', "labels.pbtxt") eval_config = configs['eval_config'] eval_input_config = configs['eval_input_config'] eval_config.num_examples = counts['test'] eval_config.max_evals = 1 # Update input config to use updated list of input eval_input_config.tf_record_input_reader.ClearField('input_path') eval_input_config.tf_record_input_reader.input_path.append(os.path.join(train_dir, 'data', "test_baheads.tfrecord-??????")) eval_input_config.label_map_path = os.path.join(train_dir, 'data', "labels.pbtxt") # Save the updated config to pipeline file config_util.save_pipeline_config(config_util.create_pipeline_proto_from_configs({ 'model': model_config, 'train_config': train_config, 'train_input_config': input_config, 'eval_config': eval_config, 'eval_input_config': eval_input_config }), train_dir) model_fn = functools.partial( model_builder.build, model_config=model_config, is_training=True) def get_next(config): return dataset_builder.make_initializable_iterator( dataset_builder.build(config)).get_next() create_input_dict_fn = functools.partial(get_next, input_config) env = json.loads(os.environ.get('TF_CONFIG', '{}')) cluster_data = env.get('cluster', None) cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None task_data = env.get('task', None) or {'type': 'master', 'index': 0} task_info = type('TaskSpec', (object,), task_data) # Parameters for a single worker. ps_tasks = 0 worker_replicas = 1 worker_job_name = 'lonely_worker' task = 0 is_chief = True master = '' if cluster_data and 'worker' in cluster_data: # Number of total worker replicas include "worker"s and the "master". worker_replicas = len(cluster_data['worker']) + 1 if cluster_data and 'ps' in cluster_data: ps_tasks = len(cluster_data['ps']) if worker_replicas > 1 and ps_tasks < 1: raise ValueError('At least 1 ps task is needed for distributed training.') if worker_replicas >= 1 and ps_tasks > 0: # Set up distributed training. 
server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc', job_name=task_info.type, task_index=task_info.index) if task_info.type == 'ps': server.join() return worker_job_name = '%s/task:%d' % (task_info.type, task_info.index) task = task_info.index is_chief = (task_info.type == 'master') master = server.target graph_rewriter_fn = None if 'graph_rewriter_config' in configs: graph_rewriter_fn = graph_rewriter_builder.build( configs['graph_rewriter_config'], is_training=True) if not os.path.exists(os.path.join(train_dir, 'model.ckpt-{}.meta'.format(num_steps))): status_timer = StatusThread(tfh, num_steps, job) status_timer.start() try: trainer.train( create_input_dict_fn, model_fn, train_config, master, task, num_clones, worker_replicas, clone_on_cpu, ps_tasks, worker_job_name, is_chief, train_dir, graph_hook_fn=graph_rewriter_fn) except KeyboardInterrupt: raise finally: status_timer.stop() if status_timer.is_alive(): _LOGGER.info("Waiting for status thread to close") status_timer.join() if os.path.exists(os.path.join(train_dir, 'model.ckpt-{}.meta'.format(num_steps))): # Training complete. Export model _LOGGER.debug("Training complete for %d steps" % num_steps) job = api.update_job_state(job, 'training', 'Training complete') export_path = os.path.join(config.TRAINED_MODELS_DATA, model['file_name']) if os.path.exists(export_path): shutil.rmtree(export_path) ckpt_path = os.path.join(train_dir, 'model.ckpt-{}'.format(num_steps)) exporter.export(pipeline_config_path, export_path, ckpt_path) frozen_graph = os.path.join(export_path, 'frozen_inference_graph.pb') if os.path.exists(frozen_graph): # Successfully exported shutil.copy(frozen_graph, model_graph) shutil.copy( os.path.join(train_dir, 'data', "labels.pbtxt"), os.path.join(config.EXPORTED_MODELS, '{}.pbtxt'.format(model['file_name'])) ) # TODO: Eval the trained graph, Push the result to server. eval_dir = 'eval_dir' tf.reset_default_graph() eval_result = run_eval(train_dir, eval_dir, pipeline_config_path, counts['test']) if 'PascalBoxes_Precision/[email protected]' in eval_result: acc = eval_result['PascalBoxes_Precision/[email protected]'] * 100 _LOGGER.info("PascalBoxes_Precision/[email protected] : %d %%" % (acc)) job = api.update_job_state(job, 'complete', 'PascalBoxes_Precision %d %%' % (acc)) _LOGGER.info(eval_result) if os.path.exists(train_dir): shutil.rmtree(train_dir) return True return False
def main(_):
  assert FLAGS.train_dir, '`train_dir` is missing.'
  if FLAGS.pipeline_config_path:
    # A single pipeline config file was supplied, so read everything from it.
    model_config, train_config, input_config = get_configs_from_pipeline_file()
  else:
    model_config, train_config, input_config = get_configs_from_multiple_files()

  # functools.partial returns model_builder.build with some parameters already
  # filled in, i.e. a specialized version of the original build function that
  # creates the model described by the config file.
  model_fn = functools.partial(
      model_builder.build,
      model_config=model_config,
      is_training=True)

  # Next, create the input pipeline from the protobuf input config.
  create_input_dict_fn = functools.partial(
      input_reader_builder.build, input_config)

  ##############################################################################
  # Distributed-training setup.
  ##############################################################################
  env = json.loads(os.environ.get('TF_CONFIG', '{}'))
  cluster_data = env.get('cluster', None)
  cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
  task_data = env.get('task', None) or {'type': 'master', 'index': 0}
  task_info = type('TaskSpec', (object,), task_data)

  # Parameters for a single worker.
  ps_tasks = 0
  worker_replicas = 1
  worker_job_name = 'lonely_worker'
  task = 0
  is_chief = True
  master = ''

  if cluster_data and 'worker' in cluster_data:
    # Number of total worker replicas include "worker"s and the "master".
    worker_replicas = len(cluster_data['worker']) + 1
  if cluster_data and 'ps' in cluster_data:
    ps_tasks = len(cluster_data['ps'])

  if worker_replicas > 1 and ps_tasks < 1:
    raise ValueError('At least 1 ps task is needed for distributed training.')

  if worker_replicas >= 1 and ps_tasks > 0:
    # Set up distributed training.
    server = tf.train.Server(tf.train.ClusterSpec(cluster),
                             protocol='grpc',
                             job_name=task_info.type,
                             task_index=task_info.index)
    if task_info.type == 'ps':
      server.join()
      return

    worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
    task = task_info.index
    is_chief = (task_info.type == 'master')
    master = server.target
  ##############################################################################
  # End of the distributed-training setup.
  ##############################################################################

  # Call trainer.train to start training.
  trainer.train(create_input_dict_fn, model_fn, train_config, master, task,
                FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu,
                ps_tasks, worker_job_name, is_chief, FLAGS.train_dir)
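# functools.partial, used throughout these scripts, simply returns a new
# callable with some arguments pre-bound. An illustrative (not project-
# specific) example:
import functools

def build(model_config, is_training):
    return (model_config, is_training)

model_fn = functools.partial(build, model_config='ssd.config', is_training=True)
# Calling model_fn() is now equivalent to
# build(model_config='ssd.config', is_training=True), which is how
# trainer.train() can construct a model later without knowing the config
# arguments itself.
print(model_fn())  # ('ssd.config', True)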
def main(_): assert FLAGS.train_dir, '`train_dir` is missing.' if FLAGS.task == 0: tf.gfile.MakeDirs(FLAGS.train_dir) if FLAGS.pipeline_config_path: configs = config_util.get_configs_from_pipeline_file( FLAGS.pipeline_config_path) if FLAGS.task == 0: tf.gfile.Copy(FLAGS.pipeline_config_path, os.path.join(FLAGS.train_dir, 'pipeline.config'), overwrite=True) else: configs = config_util.get_configs_from_multiple_files( model_config_path=FLAGS.model_config_path, train_config_path=FLAGS.train_config_path, train_input_config_path=FLAGS.input_config_path) if FLAGS.task == 0: for name, config in [('model.config', FLAGS.model_config_path), ('train.config', FLAGS.train_config_path), ('input.config', FLAGS.input_config_path)]: tf.gfile.Copy(config, os.path.join(FLAGS.train_dir, name), overwrite=True) model_config = configs['model'] train_config = configs['train_config'] input_config = configs['train_input_config'] model_fn = functools.partial( model_builder.build, model_config=model_config, is_training=True) def get_next(config): return dataset_util.make_initializable_iterator( dataset_builder.build(config)).get_next() create_input_dict_fn = functools.partial(get_next, input_config) env = json.loads(os.environ.get('TF_CONFIG', '{}')) cluster_data = env.get('cluster', None) cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None task_data = env.get('task', None) or {'type': 'master', 'index': 0} task_info = type('TaskSpec', (object,), task_data) # Parameters for a single worker. ps_tasks = 0 worker_replicas = 1 worker_job_name = 'lonely_worker' task = 0 is_chief = True master = '' if cluster_data and 'worker' in cluster_data: # Number of total worker replicas include "worker"s and the "master". worker_replicas = len(cluster_data['worker']) + 1 if cluster_data and 'ps' in cluster_data: ps_tasks = len(cluster_data['ps']) if worker_replicas > 1 and ps_tasks < 1: raise ValueError('At least 1 ps task is needed for distributed training.') if worker_replicas >= 1 and ps_tasks > 0: # Set up distributed training. server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc', job_name=task_info.type, task_index=task_info.index) if task_info.type == 'ps': server.join() return worker_job_name = '%s/task:%d' % (task_info.type, task_info.index) task = task_info.index is_chief = (task_info.type == 'master') master = server.target graph_rewriter_fn = None if 'graph_rewriter_config' in configs: graph_rewriter_fn = graph_rewriter_builder.build( configs['graph_rewriter_config'], is_training=True) trainer.train( create_input_dict_fn, model_fn, train_config, master, task, FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu, ps_tasks, worker_job_name, is_chief, FLAGS.train_dir, graph_hook_fn=graph_rewriter_fn)
def main(_): assert FLAGS.train_dir, '`train_dir` is missing.' if FLAGS.export_model: assert FLAGS.pipeline_config_path, '`pipeline_config_path` is required if exporting model' pipeline_config = pipeline_pb2.TrainEvalPipelineConfig() with tf.gfile.GFile(FLAGS.pipeline_config_path, 'r') as f: text_format.Merge(f.read(), pipeline_config) if FLAGS.task == 0: tf.gfile.MakeDirs(FLAGS.train_dir) if FLAGS.pipeline_config_path: configs = config_util.get_configs_from_pipeline_file( FLAGS.pipeline_config_path) if FLAGS.task == 0: tf.gfile.Copy(FLAGS.pipeline_config_path, os.path.join(FLAGS.train_dir, 'pipeline.config'), overwrite=True) else: configs = config_util.get_configs_from_multiple_files( model_config_path=FLAGS.model_config_path, train_config_path=FLAGS.train_config_path, train_input_config_path=FLAGS.input_config_path) if FLAGS.task == 0: for name, config in [('model.config', FLAGS.model_config_path), ('train.config', FLAGS.train_config_path), ('input.config', FLAGS.input_config_path)]: tf.gfile.Copy(config, os.path.join(FLAGS.train_dir, name), overwrite=True) model_config = configs['model'] train_config = configs['train_config'] input_config = configs['train_input_config'] model_fn = functools.partial( model_builder.build, model_config=model_config, is_training=True) create_input_dict_fn = functools.partial( input_reader_builder.build, input_config) env = json.loads(os.environ.get('TF_CONFIG', '{}')) cluster_data = env.get('cluster', None) cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None task_data = env.get('task', None) or {'type': 'master', 'index': 0} task_info = type('TaskSpec', (object,), task_data) # Parameters for a single worker. ps_tasks = 0 worker_replicas = 1 worker_job_name = 'lonely_worker' task = 0 is_chief = True master = '' if cluster_data and 'worker' in cluster_data: # Number of total worker replicas include "worker"s and the "master". worker_replicas = len(cluster_data['worker']) + 1 if cluster_data and 'ps' in cluster_data: ps_tasks = len(cluster_data['ps']) if worker_replicas > 1 and ps_tasks < 1: raise ValueError('At least 1 ps task is needed for distributed training.') if worker_replicas >= 1 and ps_tasks > 0: # Set up distributed training. server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc', job_name=task_info.type, task_index=task_info.index) if task_info.type == 'ps': server.join() return worker_job_name = '%s/task:%d' % (task_info.type, task_info.index) task = task_info.index is_chief = (task_info.type == 'master') master = server.target trainer.train(create_input_dict_fn, model_fn, train_config, master, task, FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu, ps_tasks, worker_job_name, is_chief, FLAGS.train_dir) if FLAGS.export_model: latest_ckpt = tf.train.latest_checkpoint(FLAGS.train_dir) exporter.export_inference_graph(FLAGS.input_type, pipeline_config, latest_ckpt, FLAGS.saved_model_output_dir, FLAGS.input_shape)
if cluster_data and 'worker' in cluster_data: worker_replicas = len(cluster_data['worker']) + 1 if cluster_data and 'ps' in cluster_data: ps_tasks = len(cluster_data['ps']) if worker_replicas > 1 and ps_tasks < 1: raise ValueError('At least 1 ps task is needed for distributed training.') if worker_replicas >= 1 and ps_tasks > 0: server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc', job_name=task_info.type, task_index=task_info.index) if task_info.type == 'ps': server.join() return worker_job_name = '%s/task:%d' % (task_info.type, task_info.index) task = task_info.index is_chief = (task_info.type == 'master') master = server.target trainer.train(create_input_dict_fn, model_fn, train_config, master, task, FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu, ps_tasks, worker_job_name, is_chief, FLAGS.train_dir) if __name__ == '__main__': tf.app.run()
def main(_):
  assert FLAGS.train_dir, '`train_dir` is missing.'
  if FLAGS.task == 0:
    tf.gfile.MakeDirs(FLAGS.train_dir)  # Create the training directory with tf.gfile.
  if FLAGS.pipeline_config_path:
    # Read pipeline_config_path and return a dict holding the `model`,
    # `train_config`, `train_input_config`, `eval_config` and
    # `eval_input_config` sections of the config file.
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)
    if FLAGS.task == 0:
      # Copy the pipeline config into train_dir as pipeline.config.
      tf.gfile.Copy(FLAGS.pipeline_config_path,
                    os.path.join(FLAGS.train_dir, 'pipeline.config'),
                    overwrite=True)
  else:
    # Read the separate model / train / input config files instead.
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=FLAGS.model_config_path,
        train_config_path=FLAGS.train_config_path,
        train_input_config_path=FLAGS.input_config_path)
    if FLAGS.task == 0:
      for name, config in [('model.config', FLAGS.model_config_path),
                           ('train.config', FLAGS.train_config_path),
                           ('input.config', FLAGS.input_config_path)]:
        tf.gfile.Copy(config, os.path.join(FLAGS.train_dir, name),
                      overwrite=True)

  model_config = configs['model']
  train_config = configs['train_config']
  input_config = configs['train_input_config']

  # The next line is the key call. functools.partial binds some of the
  # required arguments and effectively "renames" the function, giving a
  # simpler callable with fewer, more flexible arguments. Here it gives
  # model_builder.build its default values; the model_builder module in this
  # directory contains the code that constructs the detection networks
  # (ssd, faster_rcnn, and so on). Part of it looks like:
  #
  #   def build(model_config, is_training):
  #     if not isinstance(model_config, model_pb2.DetectionModel):
  #       raise ValueError('model_config not of type model_pb2.DetectionModel.')
  #     # Get the model type from the config.
  #     meta_architecture = model_config.WhichOneof('model')
  #     # Build the concrete model.
  #     if meta_architecture == 'ssd':
  #       return _build_ssd_model(model_config.ssd, is_training)
  #     if meta_architecture == 'faster_rcnn':
  #       return _build_faster_rcnn_model(model_config.faster_rcnn, is_training)
  #     raise ValueError('Unknown meta architecture: {}'.format(meta_architecture))
  #
  # Taking faster_rcnn as an example, _build_faster_rcnn_model (still in
  # model_builder.py) defines all of Faster R-CNN's parameters, and each
  # sub-model (for example the image resizer, via image_resizer_builder) is
  # then constructed in turn.
  model_fn = functools.partial(
      model_builder.build,
      model_config=model_config,
      is_training=True)

  def get_next(config):
    return dataset_util.make_initializable_iterator(
        dataset_builder.build(config)).get_next()

  create_input_dict_fn = functools.partial(get_next, input_config)

  # Decode the TF_CONFIG JSON object.
  env = json.loads(os.environ.get('TF_CONFIG', '{}'))
  cluster_data = env.get('cluster', None)
  cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
  task_data = env.get('task', None) or {'type': 'master', 'index': 0}
  task_info = type('TaskSpec', (object,), task_data)

  # Parameters for a single worker.
  ps_tasks = 0
  worker_replicas = 1
  worker_job_name = 'lonely_worker'
  task = 0
  is_chief = True
  master = ''

  if cluster_data and 'worker' in cluster_data:
    # Number of total worker replicas include "worker"s and the "master".
    worker_replicas = len(cluster_data['worker']) + 1
  if cluster_data and 'ps' in cluster_data:
    ps_tasks = len(cluster_data['ps'])

  if worker_replicas > 1 and ps_tasks < 1:
    raise ValueError('At least 1 ps task is needed for distributed training.')

  if worker_replicas >= 1 and ps_tasks > 0:
    # Set up distributed training.
    server = tf.train.Server(tf.train.ClusterSpec(cluster),
                             protocol='grpc',
                             job_name=task_info.type,
                             task_index=task_info.index)
    if task_info.type == 'ps':
      server.join()
      return

    worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
    task = task_info.index
    is_chief = (task_info.type == 'master')
    master = server.target

  graph_rewriter_fn = None
  if 'graph_rewriter_config' in configs:
    graph_rewriter_fn = graph_rewriter_builder.build(
        configs['graph_rewriter_config'], is_training=True)

  trainer.train(
      create_input_dict_fn,
      model_fn,
      train_config,
      master,
      task,
      FLAGS.num_clones,
      worker_replicas,
      FLAGS.clone_on_cpu,
      ps_tasks,
      worker_job_name,
      is_chief,
      FLAGS.train_dir,
      graph_hook_fn=graph_rewriter_fn)
def main(train_dir, pipeline_config_path, train_config_path="", input_config_path="", model_config_path="", master="", task=0, num_clones=1, clone_on_cpu=False, worker_replicas=1, ps_tasks=0): """ DEFINE_string('master', '', 'Name of the TensorFlow master to use.') DEFINE_integer('task', 0, 'task id') DEFINE_integer('num_clones', 1, 'Number of clones to deploy per worker.') DEFINE_boolean('clone_on_cpu', False, 'Force clones to be deployed on CPU. Note that even if ' 'set to False (allowing ops to run on gpu), some ops may ' 'still be run on the CPU if they have no GPU kernel.') DEFINE_integer('worker_replicas', 1, 'Number of worker+trainer ' 'replicas.') DEFINE_integer('ps_tasks', 0, 'Number of parameter server tasks. If None, does not use ' 'a parameter server.') DEFINE_string('train_dir', '', 'Directory to save the checkpoints and training summaries.') DEFINE_string('pipeline_config_path', '', 'Path to a pipeline_pb2.TrainEvalPipelineConfig config ' 'file. If provided, other configs are ignored') DEFINE_string('train_config_path', '', 'Path to a train_pb2.TrainConfig config file.') DEFINE_string('input_config_path', '', 'Path to an input_reader_pb2.InputReader config file.') DEFINE_string('model_config_path', '', 'Path to a model_pb2.DetectionModel config file.') """ tf.logging.set_verbosity(tf.logging.INFO) if task == 0: tf.gfile.MakeDirs(train_dir) if pipeline_config_path: configs = config_util.get_configs_from_pipeline_file( pipeline_config_path) if task == 0: tf.gfile.Copy(pipeline_config_path, os.path.join(train_dir, 'pipeline.config'), overwrite=True) else: configs = config_util.get_configs_from_multiple_files( model_config_path=model_config_path, train_config_path=train_config_path, train_input_config_path=input_config_path) if task == 0: for name, config in [('model.config', model_config_path), ('train.config', train_config_path), ('input.config', input_config_path)]: tf.gfile.Copy(config, os.path.join(train_dir, name), overwrite=True) model_config = configs['model'] train_config = configs['train_config'] input_config = configs['train_input_config'] model_fn = functools.partial( model_builder.build, model_config=model_config, is_training=True) def get_next(config): return dataset_builder.make_initializable_iterator( dataset_builder.build(config)).get_next() create_input_dict_fn = functools.partial(get_next, input_config) env = json.loads(os.environ.get('TF_CONFIG', '{}')) cluster_data = env.get('cluster', None) cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None task_data = env.get('task', None) or {'type': 'master', 'index': 0} task_info = type('TaskSpec', (object,), task_data) # Parameters for a single worker. ps_tasks = 0 worker_replicas = 1 worker_job_name = 'lonely_worker' task = 0 is_chief = True master = '' if cluster_data and 'worker' in cluster_data: # Number of total worker replicas include "worker"s and the "master". worker_replicas = len(cluster_data['worker']) + 1 if cluster_data and 'ps' in cluster_data: ps_tasks = len(cluster_data['ps']) if worker_replicas > 1 and ps_tasks < 1: raise ValueError('At least 1 ps task is needed for distributed training.') if worker_replicas >= 1 and ps_tasks > 0: # Set up distributed training. 
server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc', job_name=task_info.type, task_index=task_info.index) if task_info.type == 'ps': server.join() return worker_job_name = '%s/task:%d' % (task_info.type, task_info.index) task = task_info.index is_chief = (task_info.type == 'master') master = server.target graph_rewriter_fn = None if 'graph_rewriter_config' in configs: graph_rewriter_fn = graph_rewriter_builder.build( configs['graph_rewriter_config'], is_training=True) print("\n\n\n\n\nMADE IT HERE\n\n\n\n\n\n") trainer.train( create_input_dict_fn, model_fn, train_config, master, task, num_clones, worker_replicas, clone_on_cpu, ps_tasks, worker_job_name, is_chief, train_dir, graph_hook_fn=graph_rewriter_fn) print("MADE IT THERE")
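# A minimal invocation of the main() wrapper defined above might look like
# the following; the paths are hypothetical placeholders, not files that ship
# with this code.
if __name__ == '__main__':
    main(train_dir='/tmp/od_train',
         pipeline_config_path='/tmp/od_train/pipeline.config',
         num_clones=1,
         clone_on_cpu=False)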
def main(_): if FLAGS.train_label: FLAGS.pipeline_config_path = '../configs/test/' + FLAGS.train_label + '.config' FLAGS.train_dir = '../checkpoints/train/' + FLAGS.train_label FLAGS.train_tag = FLAGS.train_label if FLAGS.pipeline_config_dir_path: model_configs, train_configs, input_configs, eval_configs, eval_input_configs = get_configs_from_dir( ) else: total_configs = get_configs_from_pipeline_file() if FLAGS.pipeline_config_path: model_config, train_config, input_config, eval_config, eval_input_config = total_configs else: model_config, train_config, input_config = total_configs if not FLAGS.train_dir: root_dir = utils.get_tempdir() dataset = os.path.basename( input_config.label_map_path).split('_')[0].upper() tempfile.tempdir = utils.mkdir_p(os.path.join(root_dir, dataset)) meta_architecture = model_config.WhichOneof('model') model_name = meta_architecture.upper() tempfile.tempdir = utils.mkdir_p( os.path.join(tempfile.tempdir, model_name)) if meta_architecture == 'ssd': meta_config = model_config.ssd elif meta_architecture == 'faster_rcnn': meta_config = model_config.faster_rcnn else: raise ValueError( 'Unknown meta architecture: {}'.format(meta_architecture)) feature_extractor = meta_config.feature_extractor.type backbone_name = feature_extractor.replace(meta_architecture, '').lstrip('_').upper() tempfile.tempdir = utils.mkdir_p( os.path.join(tempfile.tempdir, backbone_name)) train_prefix = "small-%s-" % time.strftime("%Y%m%d-%H%M%S") FLAGS.train_dir = tempfile.mkdtemp(suffix="-" + FLAGS.train_tag, prefix=train_prefix) if not os.path.exists(FLAGS.train_dir): os.makedirs(FLAGS.train_dir) # Save configuration def _save_config(config, prefix): config_str = text_format.MessageToString(config) save_path = os.path.join(FLAGS.train_dir, prefix + '.config') with open(save_path, 'w') as f: f.write(config_str) env = json.loads(os.environ.get('TF_CONFIG', '{}')) cluster_data = env.get('cluster', None) cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None task_data = env.get('task', None) or {'type': 'master', 'index': 0} task_info = type('TaskSpec', (object, ), task_data) # Parameters for a single worker. ps_tasks = 0 worker_replicas = 1 worker_job_name = 'lonely_worker' task = 0 is_chief = True master = '' if cluster_data and 'worker' in cluster_data: # Number of total worker replicas include "worker"s and the "master". worker_replicas = len(cluster_data['worker']) + 1 if cluster_data and 'ps' in cluster_data: ps_tasks = len(cluster_data['ps']) if worker_replicas > 1 and ps_tasks < 1: raise ValueError( 'At least 1 ps task is needed for distributed training.') if worker_replicas >= 1 and ps_tasks > 0: # Set up distributed training. 
server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc', job_name=task_info.type, task_index=task_info.index) if task_info.type == 'ps': server.join() return worker_job_name = '%s/task:%d' % (task_info.type, task_info.index) task = task_info.index is_chief = (task_info.type == 'master') master = server.target if not FLAGS.pipeline_config_dir_path: # Not consecutive training _save_config(model_config, 'model') _save_config(train_config, 'train') _save_config(input_config, 'train_input') if FLAGS.pipeline_config_path: _save_config(eval_config, 'eval') _save_config(eval_input_config, 'eval_input') model_fn = functools.partial(model_builder.build, model_config=model_config, is_training=True) create_input_dict_fn = functools.partial(input_reader_builder.build, input_config) num_examples = sum(1 for _ in tf.python_io.tf_record_iterator( input_config.tf_record_input_reader.input_path)) trainer.train(create_input_dict_fn, model_fn, train_config, master, task, FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu, ps_tasks, worker_job_name, is_chief, FLAGS.train_dir, num_examples, total_configs=total_configs, model_config=model_config) else: # Consecutive training num_of_configs = len(model_configs) for config_index in range(num_of_configs): model_config = model_configs[config_index] train_config = train_configs[config_index] input_config = input_configs[config_index] eval_config = eval_configs[config_index] eval_input_config = eval_input_configs[config_index] total_configs = (model_config, train_config, input_config, eval_config, eval_input_config) _save_config(model_config, 'model') _save_config(train_config, 'train') _save_config(input_config, 'train_input') _save_config(eval_config, 'eval') _save_config(eval_input_config, 'eval_input') model_fn = functools.partial(model_builder.build, model_config=model_config, is_training=True) create_input_dict_fn = functools.partial( input_reader_builder.build, input_config) num_examples = sum(1 for _ in tf.python_io.tf_record_iterator( input_config.tf_record_input_reader.input_path)) trainer.train( create_input_dict_fn, model_fn, train_config, master, task, FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu, ps_tasks, worker_job_name, is_chief, FLAGS.train_dir, num_examples, total_configs=total_configs, is_first_training=(True if config_index == 0 else False)) def _is_last_training(): return config_index == num_of_configs - 1 if _is_last_training(): break # Remove all the files except events files in train_dir for the next training. for f in os.listdir(FLAGS.train_dir): path_to_file = os.path.join(FLAGS.train_dir, f) if os.path.isfile(path_to_file) and not f.startswith('events'): os.remove(path_to_file)
def main(_): assert FLAGS.train_dir, '`train_dir` is missing.' if FLAGS.task == 0: tf.gfile.MakeDirs(FLAGS.train_dir) if FLAGS.pipeline_config_path: configs = config_util.get_configs_from_pipeline_file( FLAGS.pipeline_config_path) if FLAGS.task == 0: tf.gfile.Copy(FLAGS.pipeline_config_path, os.path.join(FLAGS.train_dir, 'pipeline.config'), overwrite=True) else: configs = config_util.get_configs_from_multiple_files( model_config_path=FLAGS.model_config_path, train_config_path=FLAGS.train_config_path, train_input_config_path=FLAGS.input_config_path) if FLAGS.task == 0: for name, config in [('model.config', FLAGS.model_config_path), ('train.config', FLAGS.train_config_path), ('input.config', FLAGS.input_config_path)]: tf.gfile.Copy(config, os.path.join(FLAGS.train_dir, name), overwrite=True) model_config = configs['model'] train_config = configs['train_config'] input_config = configs['train_input_config'] model_fn = functools.partial(model_builder.build, model_config=model_config, is_training=True) create_input_dict_fn = functools.partial(input_reader_builder.build, input_config) # env = json.loads(os.environ.get('TF_CONFIG', '{}')) # cluster_data = env.get('cluster', None) # cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None # task_data = env.get('task', None) or {'type': 'master', 'index': 0} # task_info = type('TaskSpec', (object,), task_data) # # Parameters for a single worker. # ps_tasks = 0 # worker_replicas = 1 # worker_job_name = 'lonely_worker' # task = 0 # is_chief = True # master = '' # cluster_data, my_job_name, my_task_index = tf_config_from_slurm(ps_number=1) parameter_servers = ["localhost:2232"] workers = ["localhost:2233", "localhost:2234", "localhost:2235"] cluster_data = {"ps": parameter_servers, "worker": workers} if cluster_data and 'worker' in cluster_data: # Number of total worker replicas include "worker"s and the "master". worker_replicas = len(cluster_data['worker']) print("Number of replicas: ", worker_replicas) if cluster_data and 'ps' in cluster_data: ps_tasks = len(cluster_data['ps']) print("Number of ps tasks: ", ps_tasks) if worker_replicas > 1 and ps_tasks < 1: raise ValueError( 'At least 1 ps task is needed for distributed training.') if worker_replicas >= 1 and ps_tasks > 0: # Set up distributed training. server = tf.train.Server(tf.train.ClusterSpec(cluster_data), protocol='grpc', job_name=FLAGS.job_name, task_index=FLAGS.task_index) if FLAGS.job_name == 'ps': server.join() return worker_job_name = '%s/task:%d' % (FLAGS.job_name, FLAGS.task_index) task = FLAGS.task_index is_chief = (FLAGS.task_index == 0) master = server.target print("worker_job_name: ", worker_job_name) print("task: ", task) print("is_chief: ", is_chief) print("master: ", master) trainer.train(create_input_dict_fn, model_fn, train_config, master, task, FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu, ps_tasks, worker_job_name, is_chief, FLAGS.train_dir)
def main(_): assert FLAGS.train_dir, '`train_dir` is missing.' if FLAGS.task == 0: tf.gfile.MakeDirs(FLAGS.train_dir) if FLAGS.pipeline_config_path: configs = config_util.get_configs_from_pipeline_file( FLAGS.pipeline_config_path) if FLAGS.task == 0: tf.gfile.Copy(FLAGS.pipeline_config_path, os.path.join(FLAGS.train_dir, 'pipeline.config'), overwrite=True) else: configs = config_util.get_configs_from_multiple_files( model_config_path=FLAGS.model_config_path, train_config_path=FLAGS.train_config_path, train_input_config_path=FLAGS.input_config_path) if FLAGS.task == 0: for name, config in [('model.config', FLAGS.model_config_path), ('train.config', FLAGS.train_config_path), ('input.config', FLAGS.input_config_path)]: tf.gfile.Copy(config, os.path.join(FLAGS.train_dir, name), overwrite=True) model_config = configs['model'] train_config = configs['train_config'] input_config = configs['train_input_config'] model_fn = functools.partial(model_builder.build, model_config=model_config, is_training=True) def get_next(config): return dataset_util.make_initializable_iterator( dataset_builder.build(config)).get_next() create_input_dict_fn = functools.partial(get_next, input_config) env = json.loads(os.environ.get('TF_CONFIG', '{}')) print("%s" % str(env)) cluster_data = env.get('cluster', None) cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None task_data = env.get('task', None) or {'type': 'master', 'index': 0} task_info = type('TaskSpec', (object, ), task_data) print("cluster_data %s" % str(cluster_data)) # Parameters for a single worker. ps_tasks = 0 worker_replicas = 1 worker_job_name = 'lonely_worker' task = 0 is_chief = True master = '' if cluster_data and 'worker' in cluster_data: # Number of total worker replicas include "worker"s and the "master". worker_replicas = len(cluster_data['worker']) + 1 if cluster_data and 'ps' in cluster_data: ps_tasks = len(cluster_data['ps']) if worker_replicas > 1 and ps_tasks < 1: raise ValueError( 'At least 1 ps task is needed for distributed training.') if worker_replicas >= 1 and ps_tasks > 0: # Set up distributed training. 
try: print("tf.train.Server") server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc', job_name=task_info.type, task_index=task_info.index) except KeyboardInterrupt: print("ctrl c END") if task_info.type == 'ps': print("ps") try: print("tf.Session") sess = tf.Session(server.target) print("create_done_queue: " + str(worker_replicas)) queue = create_done_queue(task_info.index, worker_replicas, ps_tasks) # wait until all workers are done for i in range(worker_replicas): sess.run(queue.dequeue()) print("ps %d received done %d" % (task_info.index, i)) print("ps %d: quitting" % (task_info.index)) # server.join() return except KeyboardInterrupt: print("ctrl c END") worker_job_name = '%s/task:%d' % (task_info.type, task_info.index) task = task_info.index is_chief = (task_info.type == 'master') master = server.target print("is_chief:" + str(is_chief)) graph_rewriter_fn = None if 'graph_rewriter_config' in configs: graph_rewriter_fn = graph_rewriter_builder.build( configs['graph_rewriter_config'], is_training=True) try: trainer.train(create_input_dict_fn, model_fn, train_config, master, task, FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu, ps_tasks, worker_job_name, is_chief, FLAGS.train_dir, graph_hook_fn=graph_rewriter_fn) except KeyboardInterrupt: print("ctrl c END1") finally: if worker_replicas >= 1 and ps_tasks > 0: print("tf.Session") sess = tf.Session(server.target) print("end create_done_queues:" + str(worker_replicas)) for q in create_done_queues(worker_replicas, ps_tasks): print("enqueue") sess.run(q.enqueue(1))
def main(_): print("starting program . . .") # show info to std out during the training process tf.logging.set_verbosity(tf.logging.INFO) if not checkIfNecessaryPathsAndFilesExist(): return # end if configs = config_util.get_configs_from_pipeline_file(PIPELINE_CONFIG_PATH) tf.gfile.Copy(PIPELINE_CONFIG_PATH, os.path.join(TRAINING_DATA_DIR, 'pipeline.config'), overwrite=True) model_config = configs['model'] train_config = configs['train_config'] input_config = configs['train_input_config'] model_fn = functools.partial(model_builder.build, model_config=model_config, is_training=True) # ToDo: this nested function seems odd, factor this out eventually ?? # nested function def get_next(config): return dataset_util.make_initializable_iterator( dataset_builder.build(config)).get_next() # end nested function create_input_dict_fn = functools.partial(get_next, input_config) env = json.loads(os.environ.get('TF_CONFIG', '{}')) cluster_data = env.get('cluster', None) cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None task_data = env.get('task', None) or {'type': 'master', 'index': 0} task_info = type('TaskSpec', (object, ), task_data) # parameters for a single worker ps_tasks = 0 worker_replicas = 1 worker_job_name = 'lonely_worker' task = 0 is_chief = True master = '' if cluster_data and 'worker' in cluster_data: # number of total worker replicas include "worker"s and the "master". worker_replicas = len(cluster_data['worker']) + 1 # end if if cluster_data and 'ps' in cluster_data: ps_tasks = len(cluster_data['ps']) # end if if worker_replicas > 1 and ps_tasks < 1: raise ValueError( 'At least 1 ps task is needed for distributed training.') # end if if worker_replicas >= 1 and ps_tasks > 0: # set up distributed training server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc', job_name=task_info.type, task_index=task_info.index) if task_info.type == 'ps': server.join() return # end if worker_job_name = '%s/task:%d' % (task_info.type, task_info.index) task = task_info.index is_chief = (task_info.type == 'master') master = server.target # end if trainer.train(create_input_dict_fn, model_fn, train_config, master, task, NUM_CLONES, worker_replicas, CLONE_ON_CPU, ps_tasks, worker_job_name, is_chief, TRAINING_DATA_DIR)
def main(_): if iswindos(): FLAGS.train_dir = winprefix + FLAGS.train_dir FLAGS.pipeline_config_path = FLAGS.pipeline_config_path + "_win" assert FLAGS.train_dir, '`train_dir` is missing.' if FLAGS.pipeline_config_path: model_config, train_config, input_config = get_configs_from_pipeline_file( ) else: model_config, train_config, input_config = get_configs_from_multiple_files( ) print("[main]: model_config:", model_config) print("[main]: train_config:", train_config) print("[main]: input_config:", input_config) model_fn = functools.partial(model_builder.build, model_config=model_config, is_training=True) print("[main]: model_fn:", model_fn) create_input_dict_fn = functools.partial(input_reader_builder.build, input_config) print("[main]: create_input_dict_fn:", create_input_dict_fn) env = json.loads(os.environ.get('TF_CONFIG', '{}')) cluster_data = env.get('cluster', None) cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None task_data = env.get('task', None) or {'type': 'master', 'index': 0} task_info = type('TaskSpec', (object, ), task_data) print("[main]: cluster_data:", cluster_data) print("[main]: cluster:", cluster) print("[main]: task_data:", task_data) print("[main]: task_info:", task_info) # Parameters for a single worker. ps_tasks = 0 worker_replicas = 1 worker_job_name = 'lonely_worker' task = 0 is_chief = True master = '' if cluster_data and 'worker' in cluster_data: # Number of total worker replicas include "worker"s and the "master". worker_replicas = len(cluster_data['worker']) + 1 if cluster_data and 'ps' in cluster_data: ps_tasks = len(cluster_data['ps']) if worker_replicas > 1 and ps_tasks < 1: raise ValueError( 'At least 1 ps task is needed for distributed training.') if worker_replicas >= 1 and ps_tasks > 0: # Set up distributed training. server = tf.train.Server(tf.train.ClusterSpec(cluster), protocol='grpc', job_name=task_info.type, task_index=task_info.index) if task_info.type == 'ps': server.join() return worker_job_name = '%s/task:%d' % (task_info.type, task_info.index) task = task_info.index is_chief = (task_info.type == 'master') master = server.target print("[main]: create_input_dict_fn:", create_input_dict_fn) print("[main]: model_fn:", model_fn) print("[main]: train_config:", train_config) print("[main]: master:", master) print("[main]: task:", task) print("[main]: FLAGS.num_clones:", FLAGS.num_clones) print("[main]: worker_replicas:", worker_replicas) print("[main]: FLAGS.clone_on_cpu:", FLAGS.clone_on_cpu) print("[main]: ps_tasks:", ps_tasks) print("[main]: worker_job_name:", worker_job_name) print("[main]: is_chief:", is_chief) print("[main]: train_dir:", FLAGS.train_dir) trainer.train(create_input_dict_fn, model_fn, train_config, master, task, FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu, ps_tasks, worker_job_name, is_chief, FLAGS.train_dir)
def main(_):
  assert FLAGS.train_dir, '`train_dir` is missing.'
  if FLAGS.task == 0:
    tf.gfile.MakeDirs(FLAGS.train_dir)
  if FLAGS.pipeline_config_path:
    configs = config_util.get_configs_from_pipeline_file(
        FLAGS.pipeline_config_path)
    if FLAGS.task == 0:
      tf.gfile.Copy(FLAGS.pipeline_config_path,
                    os.path.join(FLAGS.train_dir, 'pipeline.config'),
                    overwrite=True)
  else:
    configs = config_util.get_configs_from_multiple_files(
        model_config_path=FLAGS.model_config_path,
        train_config_path=FLAGS.train_config_path,
        train_input_config_path=FLAGS.input_config_path)
    if FLAGS.task == 0:
      for name, config in [('model.config', FLAGS.model_config_path),
                           ('train.config', FLAGS.train_config_path),
                           ('input.config', FLAGS.input_config_path)]:
        tf.gfile.Copy(config, os.path.join(FLAGS.train_dir, name),
                      overwrite=True)

  model_config = configs['model']
  train_config = configs['train_config']
  input_config = configs['train_input_config']

  model_fn = functools.partial(model_builder.build,
                               model_config=model_config,
                               is_training=True)

  def get_next(config):
    return dataset_util.make_initializable_iterator(
        dataset_builder.build(config)).get_next()

  create_input_dict_fn = functools.partial(get_next, input_config)

  env = json.loads(os.environ.get('TF_CONFIG', '{}'))
  cluster_data = env.get('cluster', None)
  cluster = tf.train.ClusterSpec(cluster_data) if cluster_data else None
  task_data = env.get('task', None) or {'type': 'master', 'index': 0}
  task_info = type('TaskSpec', (object,), task_data)

  # Parameters for a single worker.
  ps_tasks = 0
  worker_replicas = 1
  worker_job_name = 'lonely_worker'
  task = 0
  is_chief = True
  master = ''

  if cluster_data and 'worker' in cluster_data:
    # Number of total worker replicas include "worker"s and the "master".
    worker_replicas = len(cluster_data['worker']) + 1
  if cluster_data and 'ps' in cluster_data:
    ps_tasks = len(cluster_data['ps'])

  if worker_replicas > 1 and ps_tasks < 1:
    raise ValueError('At least 1 ps task is needed for distributed training.')

  if worker_replicas >= 1 and ps_tasks > 0:
    # Set up distributed training.
    server = tf.train.Server(tf.train.ClusterSpec(cluster),
                             protocol='grpc',
                             job_name=task_info.type,
                             task_index=task_info.index)
    if task_info.type == 'ps':
      server.join()
      return

    worker_job_name = '%s/task:%d' % (task_info.type, task_info.index)
    task = task_info.index
    is_chief = (task_info.type == 'master')
    master = server.target

  trainer.train(create_input_dict_fn, model_fn, train_config, master, task,
                FLAGS.num_clones, worker_replicas, FLAGS.clone_on_cpu,
                ps_tasks, worker_job_name, is_chief, FLAGS.train_dir)
        TASK, PS_TASKS, TRAIN_DIR, PIPELINE_CONFING_PATH, TRAIN_CONFIG_PATH,
        MODEL_CONFIG_PATH, INPUT_CONFIG_PATH, WORKER_REPLICAS, MASTER)

train_config.num_steps = NUM_EPOCHS

##################################################
#                TRAIN THE MODEL                 #
##################################################
trainer.train(create_input_dict_fn, model_fn, train_config, master, task,
              NUM_CLONES, worker_replicas, CLONE_ON_CPU, ps_tasks,
              worker_job_name, is_chief, TRAIN_DIR,
              graph_hook_fn=graph_rewriter_fn)

######################################
#          EXPORT THE MODEL          #
######################################
# Exporting the model for evaluation
from google.protobuf import text_format
from object_detection import exporter
from object_detection.protos import pipeline_pb2