Example #1
    def testTrainSave(self):
        custom_config = load_config(self.config.config_files)
        model_type = 'mockfasterrcnn'

        # Save checkpoints to a temp directory.
        tmp_job_dir = tempfile.mkdtemp()
        self.config.override_params = [
            'train.num_epochs={}'.format(self.total_epochs),
            'train.job_dir={}'.format(tmp_job_dir),
            'train.run_name=test_runname',
        ]

        step = run(custom_config,
                   model_type,
                   self.config.override_params,
                   get_dataset_fn=self.get_dataset,
                   get_model_fn=self.get_model)
        self.assertEqual(step, 2)

        # We have to reset the graph to avoid having duplicate names.
        tf.reset_default_graph()
        step = run(custom_config,
                   model_type,
                   self.config.override_params,
                   get_dataset_fn=self.get_dataset,
                   get_model_fn=self.get_model)

        # This is because of a MonitoredTrainingSession "bug": when training
        # ends, it saves a checkpoint labeled with the next step, which leaves
        # us one step ahead when loading it.
        self.assertEqual(step, 5)
Example #2
def train(config_files, job_dir, override_params):
    """
    Parse TF_CONFIG to cluster_spec and call run() function
    """
    # The TF_CONFIG environment variable is available when running with
    # gcloud, either locally or on the cloud. It has all the information
    # required to create a ClusterSpec, which is needed for running
    # distributed code (see the illustrative sketch after this example).
    tf_config_val = os.environ.get('TF_CONFIG')

    if tf_config_val:
        tf_config = json.loads(tf_config_val)
    else:
        tf_config = {}

    cluster = tf_config.get('cluster')
    job_name = tf_config.get('task', {}).get('type')
    task_index = tf_config.get('task', {}).get('index')
    environment = tf_config.get('environment', 'local')

    # Get the user config and the model type from it.
    custom_config = load_config(config_files)

    try:
        model_type = custom_config['model']['type']
    except KeyError:
        # Without the model type defined we can't use the default config
        # settings.
        raise KeyError('model.type should be set on the custom config.')

    if job_dir:
        override_params += ('train.job_dir={}'.format(job_dir), )

    # If the cluster information is empty or TF_CONFIG is not available, run
    # locally.
    if job_name is None or task_index is None:
        return run(custom_config,
                   model_type,
                   override_params,
                   environment=environment)

    cluster_spec = tf.train.ClusterSpec(cluster)
    server = tf.train.Server(cluster_spec,
                             job_name=job_name,
                             task_index=task_index)

    # Wait for incoming connections forever.
    # The worker ships the graph to the ps server, which manages the
    # parameters of the model.
    if job_name == 'ps':
        server.join()
        return
    elif job_name in ['master', 'worker']:
        is_chief = job_name == 'master'
        return run(custom_config,
                   model_type,
                   override_params=override_params,
                   target=server.target,
                   cluster_spec=cluster_spec,
                   is_chief=is_chief,
                   job_name=job_name,
                   task_index=task_index,
                   environment=environment)
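
For reference, below is a minimal sketch of the kind of TF_CONFIG value this function parses. It follows the standard layout used by distributed TensorFlow 1.x / Cloud ML Engine (a 'cluster' map, a 'task' entry and an 'environment' key); the hosts, ports and task assignment are made-up placeholders.

import json
import os

# Hypothetical TF_CONFIG contents; the addresses and indices are illustrative.
example_tf_config = {
    'cluster': {
        'master': ['master-host:2222'],
        'worker': ['worker-host-0:2222', 'worker-host-1:2222'],
        'ps': ['ps-host-0:2222'],
    },
    'task': {'type': 'worker', 'index': 0},  # this process acts as worker 0
    'environment': 'cloud',
}

# train() reads the variable from the environment, so it would be set as:
os.environ['TF_CONFIG'] = json.dumps(example_tf_config)
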
Example #3
    def testTrain(self):
        config = self.config

        custom_config = load_config(config.config_files)
        # The string we use here is ignored.
        model_type = 'mockfasterrcnn'

        # This should not fail
        run(custom_config, model_type, config.override_params,
            get_dataset_fn=self.get_dataset, get_model_fn=self.get_model)
Example #4
    def testTrain(self):
        custom_config = load_config(self.config.config_files)
        # The string we use here is ignored.
        model_type = 'mockfasterrcnn'

        self.config.override_params = [
            'train.num_epochs={}'.format(self.total_epochs),
            'train.job_dir=',
        ]

        # This should not fail
        run(custom_config,
            model_type,
            self.config.override_params,
            get_dataset_fn=self.get_dataset,
            get_model_fn=self.get_model)
Example #5
def get_prediction(model_type,
                   image,
                   config_files,
                   session=None,
                   pred_dict=None,
                   image_tensor=None,
                   return_tf_vars=False):
    """
    Gets the prediction given by the model `model_type` of the image `image`.
    If a checkpoint exists in the job's directory, load it.
    The names of the classes will be obtained from the dataset directory.
    Returns a dictionary with the objects, their labels and probabilities,
    the inference time and the scale factor. Also if the `return_tf_vars` is
    True, returns the image tensor, the entire prediction of the model and
    the sesssion.
    """
    model_class = get_model(model_type)
    custom_config = load_config(config_files)
    config = get_model_config(model_class.base_config, custom_config, None)

    if session is None or pred_dict is None or image_tensor is None:
        graph = tf.Graph()
        session = tf.Session(graph=graph)

        with graph.as_default():
            image_tensor = tf.placeholder(tf.float32, (1, None, None, 3))
            model = model_class(model_class.base_config)
            pred_dict = model(image_tensor)

            # Restore checkpoint
            if config.train.job_dir and config.train.run_name:
                ckpt = tf.train.get_checkpoint_state(
                    os.path.join(config.train.job_dir, config.train.run_name))
                if not ckpt or not ckpt.all_model_checkpoint_paths:
                    raise ValueError('Could not find checkpoint in {}.'.format(
                        config.train.job_dir))
                ckpt = ckpt.all_model_checkpoint_paths[-1]
                ckpt_dir = os.path.join('.', ckpt)
                saver = tf.train.Saver(sharded=True, allow_empty=True)
                saver.restore(session, ckpt_dir)
            # A prediction without a checkpoint is only used for testing.
            else:
                init_op = tf.group(tf.global_variables_initializer(),
                                   tf.local_variables_initializer())
                session.run(init_op)

            if config.model.network.with_rcnn:
                cls_prediction = pred_dict['classification_prediction']
                objects_tf = cls_prediction['objects']
                objects_labels_tf = cls_prediction['labels']
                objects_labels_prob_tf = cls_prediction['probs']
            else:
                rpn_prediction = pred_dict['rpn_prediction']
                objects_tf = rpn_prediction['proposals']
                objects_labels_prob_tf = rpn_prediction['scores']
                # All labels without RCNN are zero
                objects_labels_tf = tf.zeros(tf.shape(objects_labels_prob_tf),
                                             dtype=tf.int32)

    image_resize_config = model_class.base_config.dataset.image_preprocessing

    image_array, scale_factor = resize_image(
        image, float(image_resize_config.min_size),
        float(image_resize_config.max_size))

    start_time = time.time()
    objects, objects_labels, objects_labels_prob = session.run(
        [objects_tf, objects_labels_tf, objects_labels_prob_tf],
        feed_dict={image_tensor: image_array})
    end_time = time.time()

    if config.dataset.dir:
        # Gets the names of the classes
        classes_file = os.path.join(config.dataset.dir, 'classes.json')
        class_labels = json.load(tf.gfile.GFile(classes_file))
        objects_labels = [class_labels[obj] for obj in objects_labels]

    else:
        objects_labels = objects_labels.tolist()

    res = {
        'objects': objects.tolist(),
        'objects_labels': objects_labels,
        'objects_labels_prob': objects_labels_prob.tolist(),
        'inference_time': end_time - start_time,
        'scale_factor': scale_factor,
    }

    if return_tf_vars:
        res['image_tensor'] = image_tensor
        res['prediction_dict'] = pred_dict
        res['session'] = session

    return res
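
A minimal usage sketch for get_prediction follows. The 'fasterrcnn' model type, the config path and the random array standing in for a real image are hypothetical placeholders; the exact image format expected depends on what resize_image accepts in your version.

import numpy as np

# Hypothetical inputs: a random RGB "image" and an assumed config file path.
image = np.random.randint(0, 256, size=(600, 800, 3)).astype(np.float32)

result = get_prediction('fasterrcnn', image, ['config.yml'])

print('objects:', result['objects'])
print('labels:', result['objects_labels'])
print('probabilities:', result['objects_labels_prob'])
print('inference time: {:.3f}s'.format(result['inference_time']))
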
Example #6
def evaluate(dataset_split, config_files, job_dir, watch, from_global_step,
             override_params, files_per_class):
    """
    Evaluate models using dataset.
    """
    custom_config = load_config(config_files)
    # If the config file is empty, our config will be the base_config for the
    # default model.
    try:
        model_type = custom_config['model']['type']
    except KeyError:
        raise KeyError('model.type should be set on the custom config.')

    model_class = get_model(model_type)

    config = get_model_config(
        model_class.base_config,
        custom_config,
        override_params,
    )

    config.train.job_dir = job_dir or config.train.job_dir

    # Only activate debug if needed for the debug visualization mode.
    if not config.train.debug:
        config.train.debug = config.eval.image_vis == 'debug'

    if config.train.debug or config.train.tf_debug:
        tf.logging.set_verbosity(tf.logging.DEBUG)
    else:
        tf.logging.set_verbosity(tf.logging.INFO)

    # Build the dataset tensors, overriding the default dataset split.
    config.dataset.split = dataset_split
    # Disable data augmentation.
    config.dataset.data_augmentation = []

    # Only a single run over the dataset to calculate metrics.
    config.train.num_epochs = 1

    # Seed setup
    if config.train.seed:
        tf.set_random_seed(config.train.seed)

    # Mark the pretrained base network as not trainable.
    config.model.base_network.trainable = False

    model = model_class(config)
    dataset_class = get_dataset(config.dataset.type)
    dataset = dataset_class(config)
    train_dataset = dataset()

    train_image = train_dataset['image']
    train_objects = train_dataset['bboxes']
    train_filename = train_dataset['filename']

    # TODO: This is not the best place to configure the rank. Why isn't the
    # rank transmitted through the queue?
    train_image.set_shape((None, None, 3))
    # We add a fake batch dimension to the train data. TODO: DEFINITELY NOT
    # THE BEST PLACE.
    train_image = tf.expand_dims(train_image, 0)

    # Build the graph of the model to evaluate, retrieving required
    # intermediate tensors.
    prediction_dict = model(train_image, train_objects)

    pred = prediction_dict['classification_prediction']
    pred_objects = pred['objects']
    pred_objects_classes = pred['labels']
    pred_objects_scores = pred['probs']

    # Retrieve *all* the losses from the model and calculate their streaming
    # means, so we get the loss over the whole dataset.
    batch_losses = model.loss(prediction_dict, return_all=True)
    losses = {}
    for loss_name, loss_tensor in batch_losses.items():
        loss_mean, _ = tf.metrics.mean(
            loss_tensor,
            name=loss_name,
            metrics_collections='metrics',
            updates_collections='metric_ops',
        )
        full_loss_name = '{}_losses/{}'.format(dataset_split, loss_name)
        losses[full_loss_name] = loss_mean

    metric_ops = tf.get_collection('metric_ops')

    init_op = tf.group(tf.global_variables_initializer(),
                       tf.local_variables_initializer())

    # Using a global saver instead of the one for the model.
    saver = tf.train.Saver(sharded=True, allow_empty=True)

    # Aggregate the required ops to evaluate into a dict.
    ops = {
        'init_op': init_op,
        'metric_ops': metric_ops,
        'pred_objects': pred_objects,
        'pred_objects_classes': pred_objects_classes,
        'pred_objects_scores': pred_objects_scores,
        'train_objects': train_objects,
        'losses': losses,
        'prediction_dict': prediction_dict,
        'filename': train_filename,
        'train_image': train_image
    }

    metrics_scope = '{}_metrics'.format(dataset_split)

    # Use global writer for all checkpoints. We don't want to write different
    # files for each checkpoint.
    writer = tf.summary.FileWriter(config.train.job_dir)

    files_to_visualize = {}

    last_global_step = from_global_step
    while True:
        # Get the checkpoint files to evaluate.
        try:
            checkpoints = get_checkpoints(config, last_global_step)
        except ValueError as e:
            if not watch:
                tf.logging.error('Missing checkpoint.')
                raise e

            tf.logging.warning(
                'Missing checkpoint; Checking again in a minute')
            time.sleep(60)
            continue

        for checkpoint in checkpoints:
            # Always returned in order, so it's safe to assign directly.
            tf.logging.info(
                'Evaluating global_step {} using checkpoint \'{}\''.format(
                    checkpoint['global_step'], checkpoint['file']))
            try:
                start = time.time()
                evaluate_once(writer,
                              saver,
                              ops,
                              config.model.network.num_classes,
                              checkpoint,
                              metrics_scope=metrics_scope,
                              image_vis=config.eval.image_vis,
                              files_per_class=files_per_class,
                              files_to_visualize=files_to_visualize)
                last_global_step = checkpoint['global_step']
                tf.logging.info('Evaluated in {:.2f}s'.format(time.time() -
                                                              start))
            except tf.errors.NotFoundError:
                # The checkpoint is not ready yet. It was written in the
                # checkpoints file, but it still hasn't been completely saved.
                tf.logging.info('Checkpoint {} is not ready yet. '
                                'Checking again in a minute.'.format(
                                    checkpoint['file']))
                time.sleep(60)
                continue

        # If no watching was requested, finish the execution.
        if not watch:
            return

        # Sleep for a minute and check for new checkpoints.
        tf.logging.info('All checkpoints evaluated; sleeping for a minute')
        time.sleep(60)
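
The per-loss means above rely on the TF 1.x streaming-metric pattern: tf.metrics.mean returns a value tensor plus an update op, and the value only reflects the whole dataset once the update op has been run for every batch (which is why the update ops are collected in the 'metric_ops' collection). A standalone sketch of that pattern, independent of the model code above:

import tensorflow as tf

# Illustrative values only; in evaluate() the inputs are the model's losses.
batch_loss = tf.placeholder(tf.float32)
mean_loss, update_op = tf.metrics.mean(batch_loss, name='loss_mean')

with tf.Session() as sess:
    # Streaming metrics keep their running totals in local variables.
    sess.run(tf.local_variables_initializer())
    for value in [0.5, 0.3, 0.7]:  # one simulated batch loss per step
        sess.run(update_op, feed_dict={batch_loss: value})
    print(sess.run(mean_loss))  # 0.5, the mean over all three "batches"
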
Example #7
def train(job_id, service_account_json, bucket_name, region, config_files,
          dataset, scale_tier, master_type, worker_type, worker_count,
          parameter_server_type, parameter_server_count):

    project_id = get_project_id(service_account_json)
    if project_id is None:
        raise ValueError(
            'Missing "project_id" in service_account_json "{}"'.format(
                service_account_json))

    if bucket_name is None:
        client_id = get_client_id(service_account_json)
        bucket_name = 'luminoth-{}'.format(client_id)
        click.echo(
            'Bucket name not specified. Using "{}".'.format(bucket_name))

    credentials = get_credentials(service_account_json)
    validate_region(region, project_id, credentials)

    # Create the bucket for logs and models if it doesn't exist.
    bucket = get_bucket(service_account_json, bucket_name)

    if not job_id:
        job_id = 'train_{}'.format(datetime.now().strftime("%Y%m%d_%H%M%S"))

    # Define path in bucket to store job's config, logs, etc.
    base_path = 'lumi_{}'.format(job_id)

    package_path = build_package(bucket, base_path)

    # Prepend the GCS scheme if the dataset path doesn't include it.
    if not dataset.startswith('gs://'):
        dataset = 'gs://{}'.format(dataset)

    args = []

    args.extend([
        '-o',
        'dataset.dir={}'.format(dataset),
    ])
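    # NOTE: `args` is reassigned below (after the final config is uploaded),
    # so this initial value is not what ends up being submitted with the job.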

    override_params = [
        'dataset.dir={}'.format(dataset),
    ]

    custom_config = load_config(config_files)
    model_class = get_model(custom_config.model.type)
    config = get_model_config(
        model_class.base_config,
        custom_config,
        override_params,
    )
    # We should validate the config before submitting the job.

    # Upload the final config file to the job bucket.
    config_path = os.path.join(base_path, DEFAULT_CONFIG_FILENAME)
    upload_data(bucket, config_path, dump_config(config))

    args = ['--config', 'gs://{}/{}'.format(bucket_name, config_path)]

    cloudml = cloud_service(credentials, 'ml')

    training_inputs = {
        'scaleTier': scale_tier,
        'packageUris': ['gs://{}/{}'.format(bucket_name, package_path)],
        'pythonModule': 'luminoth.train',
        'args': args,
        'region': region,
        'jobDir': 'gs://{}/{}/'.format(bucket_name, base_path),
        'runtimeVersion': RUNTIME_VERSION
    }

    if scale_tier == 'CUSTOM':
        training_inputs['masterType'] = master_type
        training_inputs['workerType'] = worker_type
        training_inputs['workerCount'] = worker_count
        if parameter_server_count > 0:
            training_inputs['parameterServerCount'] = parameter_server_count
            training_inputs['parameterServerType'] = parameter_server_type

    job_spec = {'jobId': job_id, 'trainingInput': training_inputs}

    jobrequest = cloudml.projects().jobs().create(
        body=job_spec, parent='projects/{}'.format(project_id))

    try:
        click.echo('Submitting training job.')
        res = jobrequest.execute()
        click.echo('Job {} submitted successfully.'.format(job_id))
        click.echo('state = {}, createTime = {}'.format(
            res.get('state'), res.get('createTime')))

        save_run(config, environment='gcloud', extra_config=job_spec)

    except Exception as err:
        click.echo('There was an error creating the training job. '
                   'Check the details: \n{}'.format(err._get_reason()))