def testTrainSave(self):
    custom_config = load_config(self.config.config_files)
    model_type = 'mockfasterrcnn'

    # Save checkpoints to a temp directory.
    tmp_job_dir = tempfile.mkdtemp()
    self.config.override_params = [
        'train.num_epochs={}'.format(self.total_epochs),
        'train.job_dir={}'.format(tmp_job_dir),
        'train.run_name=test_runname',
    ]

    step = run(custom_config, model_type, self.config.override_params,
               get_dataset_fn=self.get_dataset, get_model_fn=self.get_model)
    self.assertEqual(step, 2)

    # We have to reset the graph to avoid having duplicate names.
    tf.reset_default_graph()

    step = run(custom_config, model_type, self.config.override_params,
               get_dataset_fn=self.get_dataset, get_model_fn=self.get_model)
    # This is because of a MonitoredTrainingSession "bug": when training
    # ends, it saves a checkpoint labeled with the *next* step, so we end
    # up one step ahead when loading it.
    self.assertEqual(step, 5)
def train(config_files, job_dir, override_params):
    """
    Parse TF_CONFIG to a cluster spec and call the run() function.
    """
    # The TF_CONFIG environment variable is available when running with
    # gcloud, either locally or on the cloud. It has all the information
    # required to create a ClusterSpec, which is needed for running
    # distributed code.
    tf_config_val = os.environ.get('TF_CONFIG')

    if tf_config_val:
        tf_config = json.loads(tf_config_val)
    else:
        tf_config = {}

    cluster = tf_config.get('cluster')
    job_name = tf_config.get('task', {}).get('type')
    task_index = tf_config.get('task', {}).get('index')
    environment = tf_config.get('environment', 'local')

    # Get the user config and the model type from it.
    custom_config = load_config(config_files)
    try:
        model_type = custom_config['model']['type']
    except KeyError:
        # Without the model type defined we can't use the default config
        # settings.
        raise KeyError('model.type should be set on the custom config.')

    if job_dir:
        override_params += ('train.job_dir={}'.format(job_dir), )

    # If the cluster information is empty or TF_CONFIG is not available,
    # run locally.
    if job_name is None or task_index is None:
        return run(custom_config, model_type, override_params,
                   environment=environment)

    cluster_spec = tf.train.ClusterSpec(cluster)
    server = tf.train.Server(
        cluster_spec, job_name=job_name, task_index=task_index)

    # The ps server manages the parameters of the model, so it just waits
    # for incoming connections forever; the workers ship the graph to it.
    if job_name == 'ps':
        server.join()
        return
    elif job_name in ['master', 'worker']:
        is_chief = job_name == 'master'
        return run(custom_config, model_type,
                   override_params=override_params, target=server.target,
                   cluster_spec=cluster_spec, is_chief=is_chief,
                   job_name=job_name, task_index=task_index,
                   environment=environment)
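# For reference, a minimal sketch of the TF_CONFIG value that train()
# parses above. The layout follows TensorFlow's standard format for
# distributed training; the host:port addresses and the task assignment
# below are hypothetical.
import json
import os

os.environ['TF_CONFIG'] = json.dumps({
    'cluster': {
        'master': ['master-0:2222'],
        'worker': ['worker-0:2222', 'worker-1:2222'],
        'ps': ['ps-0:2222'],
    },
    # This process would run as the first worker.
    'task': {'type': 'worker', 'index': 0},
    'environment': 'cloud',
})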
def testTrain(self):
    config = self.config
    custom_config = load_config(config.config_files)
    # The string we use here is ignored.
    model_type = 'mockfasterrcnn'

    # This should not fail.
    run(custom_config, model_type, config.override_params,
        get_dataset_fn=self.get_dataset, get_model_fn=self.get_model)
def testTrain(self):
    custom_config = load_config(self.config.config_files)
    # The string we use here is ignored.
    model_type = 'mockfasterrcnn'
    self.config.override_params = [
        'train.num_epochs={}'.format(self.total_epochs),
        'train.job_dir=',
    ]

    # This should not fail.
    run(custom_config, model_type, self.config.override_params,
        get_dataset_fn=self.get_dataset, get_model_fn=self.get_model)
def get_prediction(model_type, image, config_files, session=None,
                   pred_dict=None, image_tensor=None, return_tf_vars=False):
    """
    Gets the prediction given by the model `model_type` of the image
    `image`. If a checkpoint exists in the job's directory, load it. The
    names of the classes will be obtained from the dataset directory.

    Returns a dictionary with the objects, their labels and probabilities,
    the inference time and the scale factor. If `return_tf_vars` is True,
    also returns the image tensor, the entire prediction of the model and
    the session.
    """
    model_class = get_model(model_type)
    custom_config = load_config(config_files)
    config = get_model_config(model_class.base_config, custom_config, None)

    if session is None or pred_dict is None or image_tensor is None:
        graph = tf.Graph()
        session = tf.Session(graph=graph)

        with graph.as_default():
            image_tensor = tf.placeholder(tf.float32, (1, None, None, 3))
            model = model_class(model_class.base_config)
            pred_dict = model(image_tensor)

            # Restore checkpoint.
            if config.train.job_dir and config.train.run_name:
                ckpt = tf.train.get_checkpoint_state(os.path.join(
                    config.train.job_dir, config.train.run_name))
                if not ckpt or not ckpt.all_model_checkpoint_paths:
                    raise ValueError('Could not find checkpoint in {}.'.format(
                        config.train.job_dir))
                ckpt = ckpt.all_model_checkpoint_paths[-1]
                ckpt_dir = os.path.join('.', ckpt)
                saver = tf.train.Saver(sharded=True, allow_empty=True)
                saver.restore(session, ckpt_dir)
            # A prediction without a checkpoint is just used for testing.
            else:
                init_op = tf.group(tf.global_variables_initializer(),
                                   tf.local_variables_initializer())
                session.run(init_op)

    if config.model.network.with_rcnn:
        cls_prediction = pred_dict['classification_prediction']
        objects_tf = cls_prediction['objects']
        objects_labels_tf = cls_prediction['labels']
        objects_labels_prob_tf = cls_prediction['probs']
    else:
        rpn_prediction = pred_dict['rpn_prediction']
        objects_tf = rpn_prediction['proposals']
        objects_labels_prob_tf = rpn_prediction['scores']
        # All labels without RCNN are zero.
        objects_labels_tf = tf.zeros(
            tf.shape(objects_labels_prob_tf), dtype=tf.int32)

    image_resize_config = model_class.base_config.dataset.image_preprocessing

    image_array, scale_factor = resize_image(
        image, float(image_resize_config.min_size),
        float(image_resize_config.max_size))

    start_time = time.time()
    objects, objects_labels, objects_labels_prob = session.run(
        [objects_tf, objects_labels_tf, objects_labels_prob_tf],
        feed_dict={image_tensor: image_array})
    end_time = time.time()

    if config.dataset.dir:
        # Get the names of the classes.
        classes_file = os.path.join(config.dataset.dir, 'classes.json')
        class_labels = json.load(tf.gfile.GFile(classes_file))
        objects_labels = [class_labels[obj] for obj in objects_labels]
    else:
        objects_labels = objects_labels.tolist()

    res = {
        'objects': objects.tolist(),
        'objects_labels': objects_labels,
        'objects_labels_prob': objects_labels_prob.tolist(),
        'inference_time': end_time - start_time,
        'scale_factor': scale_factor,
    }

    if return_tf_vars:
        res['image_tensor'] = image_tensor
        res['prediction_dict'] = pred_dict
        res['session'] = session

    return res
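# A minimal usage sketch for get_prediction(), assuming a trained
# 'fasterrcnn' model whose config.yml sets train.job_dir and
# train.run_name to point at its checkpoints. The paths are hypothetical,
# and the image object is handed to resize_image() as-is.
from PIL import Image

image = Image.open('examples/dog.jpg')
prediction = get_prediction('fasterrcnn', image, ['config.yml'])
print(prediction['objects'])              # Bounding box coordinates.
print(prediction['objects_labels'])       # Class names (or raw indices).
print(prediction['objects_labels_prob'])  # Per-object probabilities.
print(prediction['inference_time'])       # Seconds spent in session.run().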
def evaluate(dataset_split, config_files, job_dir, watch, from_global_step,
             override_params, files_per_class):
    """
    Evaluate models using a dataset.
    """
    custom_config = load_config(config_files)

    # If the config file is empty, our config will be the base_config for
    # the default model.
    try:
        model_type = custom_config['model']['type']
    except KeyError:
        raise KeyError('model.type should be set on the custom config.')

    model_class = get_model(model_type)
    config = get_model_config(
        model_class.base_config, custom_config, override_params,
    )

    config.train.job_dir = job_dir or config.train.job_dir

    # Only activate debug if needed for the debug visualization mode.
    if not config.train.debug:
        config.train.debug = config.eval.image_vis == 'debug'

    if config.train.debug or config.train.tf_debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)
    else:
        tf.logging.set_verbosity(tf.logging.INFO)

    # Build the dataset tensors, overriding the default dataset split.
    config.dataset.split = dataset_split
    # Disable data augmentation.
    config.dataset.data_augmentation = []

    # Only a single run over the dataset to calculate metrics.
    config.train.num_epochs = 1

    # Seed setup.
    if config.train.seed:
        tf.set_random_seed(config.train.seed)

    # Set the pretrained base network as not trainable.
    config.model.base_network.trainable = False

    model = model_class(config)
    dataset_class = get_dataset(config.dataset.type)
    dataset = dataset_class(config)
    train_dataset = dataset()

    train_image = train_dataset['image']
    train_objects = train_dataset['bboxes']
    train_filename = train_dataset['filename']

    # TODO: This is not the best place to configure the rank. Why isn't
    # the rank transmitted through the queue?
    train_image.set_shape((None, None, 3))
    # We add a fake batch dimension to the train data.
    # TODO: Definitely not the best place for this either.
    train_image = tf.expand_dims(train_image, 0)

    # Build the graph of the model to evaluate, retrieving required
    # intermediate tensors.
    prediction_dict = model(train_image, train_objects)

    pred = prediction_dict['classification_prediction']
    pred_objects = pred['objects']
    pred_objects_classes = pred['labels']
    pred_objects_scores = pred['probs']

    # Retrieve *all* the losses from the model and calculate their
    # streaming means, so we get the loss over the whole dataset.
    batch_losses = model.loss(prediction_dict, return_all=True)
    losses = {}
    for loss_name, loss_tensor in batch_losses.items():
        loss_mean, _ = tf.metrics.mean(
            loss_tensor, name=loss_name,
            metrics_collections='metrics',
            updates_collections='metric_ops',
        )
        full_loss_name = '{}_losses/{}'.format(dataset_split, loss_name)
        losses[full_loss_name] = loss_mean

    metric_ops = tf.get_collection('metric_ops')

    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())

    # Using a global saver instead of the one for the model.
    saver = tf.train.Saver(sharded=True, allow_empty=True)

    # Aggregate the required ops to evaluate into a dict.
    ops = {
        'init_op': init_op,
        'metric_ops': metric_ops,
        'pred_objects': pred_objects,
        'pred_objects_classes': pred_objects_classes,
        'pred_objects_scores': pred_objects_scores,
        'train_objects': train_objects,
        'losses': losses,
        'prediction_dict': prediction_dict,
        'filename': train_filename,
        'train_image': train_image,
    }

    metrics_scope = '{}_metrics'.format(dataset_split)

    # Use a global writer for all checkpoints. We don't want to write
    # different files for each checkpoint.
    writer = tf.summary.FileWriter(config.train.job_dir)

    files_to_visualize = {}

    last_global_step = from_global_step
    while True:
        # Get the checkpoint files to evaluate.
        try:
            checkpoints = get_checkpoints(config, last_global_step)
        except ValueError as e:
            if not watch:
                tf.logging.error('Missing checkpoint.')
                raise e

            tf.logging.warning(
                'Missing checkpoint; checking again in a minute.')
            time.sleep(60)
            continue

        for checkpoint in checkpoints:
            # Checkpoints are always returned in order, so it's safe to
            # assign directly.
            tf.logging.info(
                'Evaluating global_step {} using checkpoint \'{}\''.format(
                    checkpoint['global_step'], checkpoint['file']))
            try:
                start = time.time()
                evaluate_once(
                    writer, saver, ops, config.model.network.num_classes,
                    checkpoint, metrics_scope=metrics_scope,
                    image_vis=config.eval.image_vis,
                    files_per_class=files_per_class,
                    files_to_visualize=files_to_visualize)
                last_global_step = checkpoint['global_step']
                tf.logging.info('Evaluated in {:.2f}s'.format(
                    time.time() - start))
            except tf.errors.NotFoundError:
                # The checkpoint is not ready yet. It was written in the
                # checkpoints file, but it still hasn't been completely
                # saved.
                tf.logging.info(
                    'Checkpoint {} is not ready yet. '
                    'Checking again in a minute.'.format(checkpoint['file']))
                time.sleep(60)
                continue

        # If no watching was requested, finish the execution.
        if not watch:
            return

        # Sleep for a minute and check for new checkpoints.
        tf.logging.info('All checkpoints evaluated; sleeping for a minute.')
        time.sleep(60)
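# For clarity: the loop above only relies on each checkpoint entry being a
# dict with 'global_step' and 'file' keys, so a hypothetical return value
# of get_checkpoints() would look like:
#
#     [
#         {'global_step': 1000, 'file': 'jobs/my-run/model.ckpt-1000'},
#         {'global_step': 2000, 'file': 'jobs/my-run/model.ckpt-2000'},
#     ]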
def train(job_id, service_account_json, bucket_name, region, config_files,
          dataset, scale_tier, master_type, worker_type, worker_count,
          parameter_server_type, parameter_server_count):
    project_id = get_project_id(service_account_json)
    if project_id is None:
        raise ValueError(
            'Missing "project_id" in service_account_json "{}"'.format(
                service_account_json))

    if bucket_name is None:
        client_id = get_client_id(service_account_json)
        bucket_name = 'luminoth-{}'.format(client_id)
        click.echo(
            'Bucket name not specified. Using "{}".'.format(bucket_name))

    credentials = get_credentials(service_account_json)
    validate_region(region, project_id, credentials)

    # Create the bucket for logs and models if it doesn't exist.
    bucket = get_bucket(service_account_json, bucket_name)

    if not job_id:
        job_id = 'train_{}'.format(datetime.now().strftime('%Y%m%d_%H%M%S'))

    # Define the path in the bucket to store the job's config, logs, etc.
    base_path = 'lumi_{}'.format(job_id)

    package_path = build_package(bucket, base_path)

    # Check if the dataset path is absolute or relative to the bucket.
    if not dataset.startswith('gs://'):
        dataset = 'gs://{}'.format(dataset)

    override_params = [
        'dataset.dir={}'.format(dataset),
    ]

    custom_config = load_config(config_files)
    model_class = get_model(custom_config.model.type)
    config = get_model_config(
        model_class.base_config, custom_config, override_params,
    )
    # TODO: We should validate the config before submitting the job.

    # Upload the final config file to the job bucket.
    config_path = os.path.join(base_path, DEFAULT_CONFIG_FILENAME)
    upload_data(bucket, config_path, dump_config(config))

    args = ['--config', 'gs://{}/{}'.format(bucket_name, config_path)]

    cloudml = cloud_service(credentials, 'ml')

    training_inputs = {
        'scaleTier': scale_tier,
        'packageUris': ['gs://{}/{}'.format(bucket_name, package_path)],
        'pythonModule': 'luminoth.train',
        'args': args,
        'region': region,
        'jobDir': 'gs://{}/{}/'.format(bucket_name, base_path),
        'runtimeVersion': RUNTIME_VERSION,
    }

    if scale_tier == 'CUSTOM':
        training_inputs['masterType'] = master_type
        training_inputs['workerType'] = worker_type
        training_inputs['workerCount'] = worker_count
        if parameter_server_count > 0:
            training_inputs['parameterServerCount'] = parameter_server_count
            training_inputs['parameterServerType'] = parameter_server_type

    job_spec = {'jobId': job_id, 'trainingInput': training_inputs}

    jobrequest = cloudml.projects().jobs().create(
        body=job_spec, parent='projects/{}'.format(project_id))

    try:
        click.echo('Submitting training job.')
        res = jobrequest.execute()
        click.echo('Job {} submitted successfully.'.format(job_id))
        click.echo('state = {}, createTime = {}'.format(
            res.get('state'), res.get('createTime')))
        save_run(config, environment='gcloud', extra_config=job_spec)
    except Exception as err:
        click.echo('There was an error creating the training job. '
                   'Check the details: \n{}'.format(err._get_reason()))