def get_config(self, model_type, override_params=None):
    custom_config = load_config_files(self.config.config_files)
    model_class = get_model('fasterrcnn')
    model_base_config = get_base_config(model_class)
    config = get_model_config(
        model_base_config, custom_config, override_params
    )
    config.model.type = model_type
    return config

def get_prediction(model_type, image, config_file, session=None,
                   prediction_dict=None, image_tensor=None,
                   return_tf_vars=False):
    """
    Gets the prediction given by the model `model_type` of the image `image`.

    If a checkpoint exists in the job's directory, load it. The names of the
    classes will be obtained from the dataset directory.

    Returns a dictionary with the objects, their labels and probabilities,
    the inference time and the scale factor. Also, if `return_tf_vars` is
    True, returns the image tensor, the entire prediction of the model and
    the session.
    """
    model_class = get_model(model_type)
    config = get_model_config(
        model_class.base_config, config_file, None
    )

    if session is None or prediction_dict is None or image_tensor is None:
        graph = tf.Graph()
        session = tf.Session(graph=graph)

        with graph.as_default():
            image_tensor = tf.placeholder(tf.float32, (1, None, None, 3))
            model = model_class(model_class.base_config)
            prediction_dict = model(image_tensor)

            # Restore checkpoint
            if config.train.job_dir and config.train.run_name:
                ckpt = tf.train.get_checkpoint_state(os.path.join(
                    config.train.job_dir, config.train.run_name))
                if not ckpt or not ckpt.all_model_checkpoint_paths:
                    raise ValueError(
                        'Could not find checkpoint in {}.'.format(
                            config.train.job_dir
                        ))
                ckpt = ckpt.all_model_checkpoint_paths[-1]
                ckpt_dir = os.path.join('.', ckpt)
                saver = tf.train.Saver(sharded=True, allow_empty=True)
                saver.restore(session, ckpt_dir)
            # A prediction without checkpoint is just used for testing
            else:
                init_op = tf.group(
                    tf.global_variables_initializer(),
                    tf.local_variables_initializer()
                )
                session.run(init_op)

    classification_prediction = prediction_dict['classification_prediction']
    objects_tf = classification_prediction['objects']
    objects_labels_tf = classification_prediction['labels']
    objects_labels_prob_tf = classification_prediction['probs']

    image_resize_config = model_class.base_config.dataset.image_preprocessing

    image_array, scale_factor = resize_image(
        image, float(image_resize_config.min_size),
        float(image_resize_config.max_size)
    )

    start_time = time.time()
    objects, objects_labels, objects_labels_prob = session.run([
        objects_tf, objects_labels_tf, objects_labels_prob_tf
    ], feed_dict={
        image_tensor: image_array
    })
    end_time = time.time()

    if config.dataset.dir:
        # Gets the names of the classes
        classes_file = os.path.join(config.dataset.dir, 'classes.json')
        class_labels = json.load(tf.gfile.GFile(classes_file))
        objects_labels = [class_labels[obj] for obj in objects_labels]
    else:
        objects_labels = objects_labels.tolist()

    res = {
        'objects': objects.tolist(),
        'objects_labels': objects_labels,
        'objects_labels_prob': objects_labels_prob.tolist(),
        'inference_time': end_time - start_time,
        'scale_factor': scale_factor,
    }

    if return_tf_vars:
        res['image_tensor'] = image_tensor
        res['prediction_dict'] = prediction_dict
        res['session'] = session

    return res

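# A minimal usage sketch for `get_prediction` (the model type, config file
# and image variables below are hypothetical, not part of this module).
# Passing the returned TF variables back in avoids rebuilding the graph and
# reloading the checkpoint on every call:
#
#   image = Image.open('some_image.jpg')
#   res = get_prediction('fasterrcnn', image, 'config.yml',
#                        return_tf_vars=True)
#   res_2 = get_prediction('fasterrcnn', another_image, 'config.yml',
#                          session=res['session'],
#                          prediction_dict=res['prediction_dict'],
#                          image_tensor=res['image_tensor'])
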
def run(custom_config, model_type, override_params, target='',
        cluster_spec=None, is_chief=True, job_name=None, task_index=None,
        get_model_fn=get_model, get_dataset_fn=get_dataset):
    model_class = get_model_fn(model_type)
    config = get_model_config(
        model_class.base_config, custom_config, override_params,
    )

    if config.train.get('seed') is not None:
        tf.set_random_seed(config.train.seed)

    log_prefix = '[{}-{}] - '.format(job_name, task_index) \
        if job_name is not None and task_index is not None else ''

    if config.train.debug or config.train.tf_debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)
    else:
        tf.logging.set_verbosity(tf.logging.INFO)

    model = model_class(config)

    # Placement of ops on devices using replica device setter
    # which automatically places the parameters on the `ps` server
    # and the `ops` on the workers
    #
    # See:
    # https://www.tensorflow.org/api_docs/python/tf/train/replica_device_setter
    with tf.device(tf.train.replica_device_setter(cluster=cluster_spec)):
        try:
            config['dataset']['type']
        except KeyError:
            raise KeyError('dataset.type should be set on the custom config.')

        dataset_class = get_dataset_fn(config.dataset.type)
        dataset = dataset_class(config)
        train_dataset = dataset()

        train_image = train_dataset['image']
        train_filename = train_dataset['filename']
        train_bboxes = train_dataset['bboxes']

        # TODO: This is not the best place to configure rank? Why is rank not
        # transmitted through the queue
        train_image.set_shape((None, None, 3))
        # We add fake batch dimension to train data.
        # TODO: DEFINITELY NOT THE BEST PLACE
        train_image = tf.expand_dims(train_image, 0)

        prediction_dict = model(train_image, train_bboxes, is_training=True)
        total_loss = model.loss(prediction_dict)

        global_step = tf.contrib.framework.get_or_create_global_step()

        optimizer = get_optimizer(config.train, global_step)

        trainable_vars = model.get_trainable_vars()

        with tf.name_scope('gradients'):
            # Compute, clip and apply gradients
            grads_and_vars = optimizer.compute_gradients(
                total_loss, trainable_vars)

            # Clip by norm. TODO: Configurable
            grads_and_vars = clip_gradients_by_norm(grads_and_vars)

            train_op = optimizer.apply_gradients(
                grads_and_vars, global_step=global_step)

        tf.logging.info('{}Starting training for {}'.format(
            log_prefix, model))

        run_options = None
        if config.train.full_trace:
            run_options = tf.RunOptions(
                trace_level=tf.RunOptions.FULL_TRACE)

        if is_chief:
            # Load pretrained weights needs to be called before defining the
            # train op. After it, variables for the optimizer are created.
            with tf.control_dependencies([tf.global_variables_initializer()]):
                with tf.control_dependencies(
                        [model.load_pretrained_weights()]):
                    init_op = tf.no_op(name='global_init_load_pretrained')
        else:
            init_op = tf.no_op()

        # Create custom Scaffold to make sure we run our own init_op when
        # model is not restored from checkpoint.
        scaffold = tf.train.Scaffold(
            # Initialize local and global variables.
            init_op=init_op,
            # Queue-related variables need a special initializer.
            local_init_op=tf.local_variables_initializer(),
            summary_op=tf.summary.merge([
                tf.summary.merge_all(),
                model.summary,
            ]))

        # Custom hooks for our session
        hooks = []
        chief_only_hooks = []

        if config.train.tf_debug:
            debug_hook = tf_debug.LocalCLIDebugHook()
            debug_hook.add_tensor_filter(
                'has_inf_or_nan', tf_debug.has_inf_or_nan)
            hooks.extend([debug_hook])

        if not config.train.job_dir:
            tf.logging.warning(
                '`job_dir` is not defined. Checkpoints and logs will not be '
                'saved.')
            checkpoint_dir = None
        elif config.train.run_name:
            # Use run_name when available
            checkpoint_dir = os.path.join(
                config.train.job_dir, config.train.run_name)
        else:
            checkpoint_dir = config.train.job_dir

        if config.train.display_every_steps \
                or config.train.display_every_secs:
            if not config.train.debug:
                tf.logging.warning(
                    'ImageVisHook will not run without debug mode.')
            else:
                # ImageVis only runs on the chief.
                chief_only_hooks.append(
                    ImageVisHook(
                        prediction_dict,
                        with_rcnn=config.network.with_rcnn,
                        output_dir=checkpoint_dir,
                        every_n_steps=config.train.display_every_steps,
                        every_n_secs=config.train.display_every_secs))

        with tf.train.MonitoredTrainingSession(
            master=target,
            is_chief=is_chief,
            checkpoint_dir=checkpoint_dir,
            scaffold=scaffold,
            hooks=hooks,
            chief_only_hooks=chief_only_hooks,
            save_checkpoint_secs=config.train.save_checkpoint_secs,
            save_summaries_steps=config.train.save_summaries_steps,
            save_summaries_secs=config.train.save_summaries_secs,
        ) as sess:
            coord = tf.train.Coordinator()
            threads = tf.train.start_queue_runners(sess=sess, coord=coord)

            try:
                while not coord.should_stop():
                    before = time.time()
                    _, train_loss, step, filename = sess.run(
                        [train_op, total_loss, global_step, train_filename],
                        options=run_options)

                    # TODO: Add image summary every once in a while.

                    tf.logging.info(
                        '{}step: {}, file: {}, train_loss: {}, '
                        'in {:.2f}s'.format(
                            log_prefix, step, filename, train_loss,
                            time.time() - before))
            except tf.errors.OutOfRangeError:
                tf.logging.info(
                    '{}finished training after {} epoch limit'.format(
                        log_prefix, config.train.num_epochs))
                # TODO: Print summary
            finally:
                coord.request_stop()

            # Wait for all threads to stop.
            coord.join(threads)

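# A minimal sketch of calling `run` for local, single-machine training (the
# config file name and override value below are assumptions, not part of this
# module). The distributed arguments keep their defaults, so the replica
# device setter receives `cluster_spec=None` and places every op locally:
#
#   custom_config = load_config(['examples/sample_config.yml'])
#   run(custom_config, 'fasterrcnn',
#       override_params=['train.num_epochs=1'])
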
def evaluate(model_type, dataset_split, config_file, job_dir, watch,
             from_global_step, override_params, image_vis, files_per_class):
    """
    Evaluate models using dataset.
    """
    model_cls = get_model(model_type)
    config = model_cls.base_config

    config = get_model_config(
        model_cls.base_config, config_file, override_params
    )

    config.train.job_dir = job_dir or config.train.job_dir

    # Only activate debug for image visualizations.
    config.train.debug = image_vis

    if config.train.debug or config.train.tf_debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)
    else:
        tf.logging.set_verbosity(tf.logging.INFO)

    # Build the dataset tensors, overriding the default dataset split.
    config.dataset.split = dataset_split
    # Disable data augmentation.
    config.dataset.data_augmentation = []

    # Only a single run over the dataset to calculate metrics.
    config.train.num_epochs = 1

    # Seed setup
    if config.train.seed:
        tf.set_random_seed(config.train.seed)

    # Set pretrained as not training
    config.base_network.trainable = False

    model = model_cls(config)
    dataset = TFRecordDataset(config)
    train_dataset = dataset()

    train_image = train_dataset['image']
    train_objects = train_dataset['bboxes']
    train_filename = train_dataset['filename']

    # TODO: This is not the best place to configure rank? Why is rank not
    # transmitted through the queue
    train_image.set_shape((None, None, 3))

    # We add fake batch dimension to train data.
    # TODO: DEFINITELY NOT THE BEST PLACE
    train_image = tf.expand_dims(train_image, 0)

    # Build the graph of the model to evaluate, retrieving required
    # intermediate tensors.
    prediction_dict = model(train_image, train_objects)

    pred = prediction_dict['classification_prediction']
    pred_objects = pred['objects']
    pred_objects_classes = pred['labels']
    pred_objects_scores = pred['probs']

    # Retrieve *all* the losses from the model and calculate their streaming
    # means, so we get the loss over the whole dataset.
    batch_losses = model.loss(prediction_dict, return_all=True)
    losses = {}
    for loss_name, loss_tensor in batch_losses.items():
        loss_mean, _ = tf.metrics.mean(
            loss_tensor,
            name=loss_name,
            metrics_collections='metrics',
            updates_collections='metric_ops',
        )
        full_loss_name = '{}_losses/{}'.format(dataset_split, loss_name)
        losses[full_loss_name] = loss_mean

    metric_ops = tf.get_collection('metric_ops')

    init_op = tf.group(
        tf.global_variables_initializer(),
        tf.local_variables_initializer()
    )

    # Using a global saver instead of the one for the model.
    saver = tf.train.Saver(sharded=True, allow_empty=True)

    # Aggregate the required ops to evaluate into a dict.
    ops = {
        'init_op': init_op,
        'metric_ops': metric_ops,
        'pred_objects': pred_objects,
        'pred_objects_classes': pred_objects_classes,
        'pred_objects_scores': pred_objects_scores,
        'train_objects': train_objects,
        'losses': losses,
        'prediction_dict': prediction_dict,
        'filename': train_filename,
    }

    metrics_scope = '{}_metrics'.format(dataset_split)

    # Use global writer for all checkpoints. We don't want to write different
    # files for each checkpoint.
    writer = tf.summary.FileWriter(config.train.job_dir)

    files_to_visualize = {}

    last_global_step = from_global_step
    while True:
        # Get the checkpoint files to evaluate.
        try:
            checkpoints = get_checkpoints(config, last_global_step)
        except ValueError as e:
            if not watch:
                tf.logging.error('Missing checkpoint.')
                raise e

            tf.logging.warning(
                'Missing checkpoint; checking again in a minute.')
            time.sleep(60)
            continue

        for checkpoint in checkpoints:
            # Always returned in order, so it's safe to assign directly.
            tf.logging.info(
                'Evaluating global_step {} using checkpoint \'{}\''.format(
                    checkpoint['global_step'], checkpoint['file']))
            try:
                start = time.time()
                evaluate_once(
                    writer, saver, ops, config.network.num_classes,
                    checkpoint, metrics_scope=metrics_scope,
                    image_vis=image_vis, files_per_class=files_per_class,
                    files_to_visualize=files_to_visualize)
                last_global_step = checkpoint['global_step']
                tf.logging.info('Evaluated in {:.2f}s'.format(
                    time.time() - start))
            except tf.errors.NotFoundError:
                # The checkpoint is not ready yet. It was written in the
                # checkpoints file, but it still hasn't been completely
                # saved.
                tf.logging.info(
                    'Checkpoint {} is not ready yet. '
                    'Checking again in a minute.'.format(checkpoint['file']))
                time.sleep(60)
                continue

        # If no watching was requested, finish the execution.
        if not watch:
            return

        # Sleep for a minute and check for new checkpoints.
        tf.logging.info('All checkpoints evaluated; sleeping for a minute.')
        time.sleep(60)

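# An illustrative call to `evaluate` (all argument values here are
# assumptions). With `watch=True` the loop above keeps polling the job
# directory for new checkpoints every minute instead of returning once the
# existing ones have been evaluated:
#
#   evaluate('fasterrcnn', 'val', 'config.yml', job_dir='jobs/my_run',
#            watch=True, from_global_step=None, override_params=None,
#            image_vis=False, files_per_class=10)
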
def train(job_id, service_account_json, bucket_name, region, config_files,
          dataset, scale_tier, master_type, worker_type, worker_count,
          parameter_server_type, parameter_server_count):
    project_id = get_project_id(service_account_json)
    if project_id is None:
        raise ValueError(
            'Missing "project_id" in service_account_json "{}"'.format(
                service_account_json))

    if bucket_name is None:
        client_id = get_client_id(service_account_json)
        bucket_name = 'luminoth-{}'.format(client_id)
        click.echo(
            'Bucket name not specified. Using "{}".'.format(bucket_name))

    credentials = get_credentials(service_account_json)
    validate_region(region, project_id, credentials)

    # Creates bucket for logs and models if it doesn't exist
    bucket = get_bucket(service_account_json, bucket_name)

    if not job_id:
        job_id = 'train_{}'.format(datetime.now().strftime("%Y%m%d_%H%M%S"))

    # Define path in bucket to store job's config, logs, etc.
    base_path = 'lumi_{}'.format(job_id)

    package_path = build_package(bucket, base_path)

    # Check if absolute or relative dataset path
    if not dataset.startswith('gs://'):
        dataset = 'gs://{}'.format(dataset)

    args = []
    args.extend([
        '-o', 'dataset.dir={}'.format(dataset),
    ])

    override_params = [
        'dataset.dir={}'.format(dataset),
    ]
    custom_config = load_config(config_files)
    model_class = get_model(custom_config.model.type)
    config = get_model_config(
        model_class.base_config, custom_config, override_params,
    )
    # We should validate config before submitting job

    # Upload final config file to job bucket
    config_path = os.path.join(base_path, DEFAULT_CONFIG_FILENAME)
    upload_data(bucket, config_path, dump_config(config))

    args = ['--config', 'gs://{}/{}'.format(bucket_name, config_path)]

    cloudml = cloud_service(credentials, 'ml')

    training_inputs = {
        'scaleTier': scale_tier,
        'packageUris': ['gs://{}/{}'.format(bucket_name, package_path)],
        'pythonModule': 'luminoth.train',
        'args': args,
        'region': region,
        'jobDir': 'gs://{}/{}/'.format(bucket_name, base_path),
        'runtimeVersion': RUNTIME_VERSION,
    }

    if scale_tier == 'CUSTOM':
        training_inputs['masterType'] = master_type
        training_inputs['workerType'] = worker_type
        training_inputs['workerCount'] = worker_count
        if parameter_server_count > 0:
            training_inputs['parameterServerCount'] = parameter_server_count
            training_inputs['parameterServerType'] = parameter_server_type

    job_spec = {
        'jobId': job_id,
        'trainingInput': training_inputs,
    }

    jobrequest = cloudml.projects().jobs().create(
        body=job_spec, parent='projects/{}'.format(project_id))

    try:
        click.echo('Submitting training job.')
        res = jobrequest.execute()
        click.echo('Job {} submitted successfully.'.format(job_id))
        click.echo('state = {}, createTime = {}'.format(
            res.get('state'), res.get('createTime')))

        save_run(config, environment='gcloud', extra_config=job_spec)
    except Exception as err:
        click.echo(
            'There was an error creating the training job. '
            'Check the details: \n{}'.format(err._get_reason()))