Exemplo n.º 1
0
def save_progress(config, weight_dict, it_val_dict, exp_label, step,
                  directories, sess, saver, val_check, val_score, val_loss,
                  val_perf, train_score, train_loss, timer, num_params, log,
                  use_db, summary_op, summary_writer, save_activities,
                  save_gradients, save_checkpoints):
    """Save progress and important data.

    When ``val_check`` is non-empty (i.e. this step improved validation
    performance), records the new best loss in ``val_perf`` and triggers
    the optional weight/activity/checkpoint/gradient saves. Always
    writes performance to the DB (if ``use_db``) and a summary event.

    Returns:
        The (possibly updated) ``val_perf`` array/list.
    """
    # Update best val: val_check holds the index of the slot to overwrite.
    if len(val_check):
        val_check_idx = val_check[0]
        val_perf[val_check_idx] = val_loss

    # Then trigger optional saves
    if config.save_weights and len(val_check):
        # Pull only the weight tensors out of the fetched validation dict.
        it_weights = {k: it_val_dict[k] for k in weight_dict.keys()}
        py_utils.save_npys(data=it_weights,
                           model_name='%s_%s' % (exp_label, step),
                           output_string=directories['weights'])

    if save_activities and len(val_check):
        # NOTE(review): activities are written to the weights directory —
        # confirm this is intentional.
        py_utils.save_npys(data=it_val_dict,
                           model_name='%s_%s' % (exp_label, step),
                           output_string=directories['weights'])

    # ckpt_path is always built (it is reported to the DB below) even when
    # no checkpoint is written this step.
    ckpt_path = os.path.join(directories['checkpoints'],
                             'model_%s.ckpt' % step)
    if save_checkpoints and len(val_check):
        log.info('Saving checkpoint to: %s' % ckpt_path)
        saver.save(sess, ckpt_path, global_step=step)
        # BUG FIX: this branch previously rebound val_check to a scalar
        # (val_check = val_check[0]), making the len(val_check) test below
        # raise TypeError; the val_perf update it duplicated is already
        # performed at the top of this function.
    if save_gradients and len(val_check):
        np.savez(os.path.join(config.results, '%s_val_gradients' % exp_label),
                 **it_val_dict)

    if use_db:
        db.update_performance(experiment_id=config._id,
                              experiment=config.experiment,
                              train_score=float(train_score),
                              train_loss=float(train_loss),
                              val_score=float(val_score),
                              val_loss=float(val_loss),
                              step=step,
                              num_params=int(num_params),
                              ckpt_path=ckpt_path,
                              results_path=config.results,
                              summary_path=directories['summaries'])

    # Summaries
    summary_str = sess.run(summary_op)
    summary_writer.add_summary(summary_str, step)
    return val_perf
Exemplo n.º 2
0
def save_progress(config, weight_dict, it_val_dict, exp_label, step,
                  directories, sess, saver, data_structure, val_acc, val_lo,
                  train_acc, train_loss, timesteps, log, summary_op,
                  summary_writer, save_activities, save_checkpoints):
    """Save progress and important data.

    Optionally dumps model weights/activities as .npy files and writes a
    checkpoint, then records train/val statistics on ``data_structure``
    and pushes a summary event; the stat/summary writes are best-effort
    and only log a warning on failure.
    """
    if config.save_weights:
        # Pull only the weight tensors out of the fetched validation dict.
        it_weights = {k: it_val_dict[k] for k in weight_dict.keys()}
        py_utils.save_npys(data=it_weights,
                           model_name='%s_%s' % (exp_label, step),
                           output_string=directories['weights'])
    if save_activities:
        # NOTE(review): activities are written to the weights directory —
        # confirm this is intentional.
        py_utils.save_npys(data=it_val_dict,
                           model_name='%s_%s' % (exp_label, step),
                           output_string=directories['weights'])

    if save_checkpoints:
        ckpt_path = os.path.join(directories['checkpoints'],
                                 'model_%s.ckpt' % step)
        saver.save(sess, ckpt_path, global_step=step)

    # Best-effort bookkeeping: a failed DB/stat write should not kill
    # training, so failures are logged and execution continues.
    try:
        data_structure.update_validation(validation_accuracy=val_acc,
                                         validation_loss=val_lo,
                                         validation_step=step)
        data_structure.save()
    except Exception as e:
        log.warning('Failed to save validation info: %s' % e)
    try:
        data_structure.update_training(train_accuracy=train_acc,
                                       train_loss=train_loss,
                                       train_step=timesteps)
        data_structure.save()
    except Exception as e:
        log.warning('Failed to save training info: %s' % e)

    # Summaries: use the logger (consistent with the handlers above)
    # instead of a bare print, and keep the exception in the message.
    try:
        summary_str = sess.run(summary_op)
        summary_writer.add_summary(summary_str, step)
    except Exception as e:
        log.warning('Failed to update summaries: %s' % e)
Exemplo n.º 3
0
def main(experiment_name, list_experiments=False, gpu_device='/gpu:0'):
    """Create a tensorflow worker to run experiments in your DB.

    Pulls an experiment configuration from the DB (or lists registered
    experiments and exits), builds tfrecord input pipelines and shared
    train/val models on the given GPU device, runs the training loop,
    and saves the resulting outputs to disk.
    """
    # Optionally list the experiments registered in the DB and exit.
    if list_experiments:
        exps = db.list_experiments()
        print '_' * 30
        print 'Initialized experiments:'
        print '_' * 30
        for l in exps:
            print l.values()[0]
        print '_' * 30
        print 'You can add to the DB with: '\
            'python prepare_experiments.py --experiment=%s' % \
            exps[0].values()[0]
        return
    if experiment_name is None:
        print 'No experiment specified. Pulling one out of the DB.'
        experiment_name = db.get_experiment_name()

    # Prepare to run the model: a unique per-run label (timestamped) and a
    # per-experiment label used for output directories and logging.
    config = Config()
    condition_label = '%s_%s' % (experiment_name, py_utils.get_dt_stamp())
    experiment_label = '%s' % (experiment_name)
    log = logger.get(os.path.join(config.log_dir, condition_label))
    experiment_dict = experiments.experiments()[experiment_name]()
    config = add_to_config(d=experiment_dict, config=config)  # Globals
    config, exp_params = process_DB_exps(
        experiment_name=experiment_name, log=log,
        config=config)  # Update config w/ DB params
    dataset_module = py_utils.import_module(model_dir=config.dataset_info,
                                            dataset=config.dataset)
    dataset_module = dataset_module.data_processing()  # hardcoded class name
    # NOTE(review): selecting folds by positional key index is fragile —
    # dict key order is arbitrary in Python 2; confirm folds ordering.
    train_data, train_means = get_data_pointers(
        dataset=config.dataset,
        base_dir=config.tf_records,
        cv=dataset_module.folds.keys()[1],  # TODO: SEARCH FOR INDEX.
        log=log)
    val_data, val_means = get_data_pointers(dataset=config.dataset,
                                            base_dir=config.tf_records,
                                            cv=dataset_module.folds.keys()[0],
                                            log=log)

    # Initialize output folders
    dir_list = {
        'checkpoints':
        os.path.join(config.checkpoints, condition_label),
        'summaries':
        os.path.join(config.summaries, condition_label),
        'condition_evaluations':
        os.path.join(config.condition_evaluations, condition_label),
        'experiment_evaluations':
        os.path.join(  # DEPRECIATED
            config.experiment_evaluations, experiment_label),
        'visualization':
        os.path.join(config.visualizations, condition_label),
        'weights':
        os.path.join(config.condition_evaluations, condition_label, 'weights')
    }
    [py_utils.make_dir(v) for v in dir_list.values()]

    # Prepare data loaders on the cpu (queue runners stay off the GPU).
    config.data_augmentations = py_utils.flatten_list(
        config.data_augmentations, log)
    with tf.device('/cpu:0'):
        train_images, train_labels = data_loader.inputs(
            dataset=train_data,
            batch_size=config.batch_size,
            model_input_image_size=dataset_module.model_input_image_size,
            tf_dict=dataset_module.tf_dict,
            data_augmentations=config.data_augmentations,
            num_epochs=config.epochs,
            tf_reader_settings=dataset_module.tf_reader,
            shuffle=config.shuffle)
        val_images, val_labels = data_loader.inputs(
            dataset=val_data,
            batch_size=config.batch_size,
            model_input_image_size=dataset_module.model_input_image_size,
            tf_dict=dataset_module.tf_dict,
            data_augmentations=config.data_augmentations,
            num_epochs=config.epochs,
            tf_reader_settings=dataset_module.tf_reader,
            shuffle=config.shuffle)
    log.info('Created tfrecord dataloader tensors.')

    # Load model specification
    struct_name = config.model_struct.split(os.path.sep)[-1]
    try:
        model_dict = py_utils.import_module(
            dataset=struct_name,
            model_dir=os.path.join('models', 'structs',
                                   experiment_name).replace(os.path.sep, '.'))
    except IOError:
        # NOTE(review): if this import fails, model_dict stays unbound and
        # the next statement raises NameError — consider re-raising here.
        print 'Could not find the model structure: %s' % experiment_name

    # Inject model_dict with hyperparameters if requested
    model_dict.layer_structure = hp_opt_utils.inject_model_with_hps(
        layer_structure=model_dict.layer_structure, exp_params=exp_params)

    # Prepare model on GPU
    with tf.device(gpu_device):
        with tf.variable_scope('cnn') as scope:

            # Training model: multi-dimensional output sizes are collapsed
            # to a scalar neuron count before the model is built.
            if len(dataset_module.output_size) > 1:
                log.warning('Found > 1 dimension for your output size.'
                            'Converting to a scalar.')
                dataset_module.output_size = np.prod(
                    dataset_module.output_size)

            if hasattr(model_dict, 'output_structure'):
                # Use specified output layer
                output_structure = model_dict.output_structure
            else:
                output_structure = None
            model = model_utils.model_class(
                mean=train_means,
                training=True,
                output_size=dataset_module.output_size)
            train_scores, model_summary = model.build(
                data=train_images,
                layer_structure=model_dict.layer_structure,
                output_structure=output_structure,
                log=log,
                tower_name='cnn')
            log.info('Built training model.')
            log.debug(json.dumps(model_summary, indent=4), verbose=0)
            print_model_architecture(model_summary)

            # Prepare the loss function
            train_loss, _ = loss_utils.loss_interpreter(
                logits=train_scores,
                labels=train_labels,
                loss_type=config.loss_function,
                dataset_module=dataset_module)

            # Add weight decay if requested
            if len(model.regularizations) > 0:
                train_loss = loss_utils.wd_loss(
                    regularizations=model.regularizations,
                    loss=train_loss,
                    wd_penalty=config.regularization_strength)
            train_op = loss_utils.optimizer_interpreter(
                loss=train_loss,
                lr=config.lr,
                optimizer=config.optimizer,
                constraints=config.optimizer_constraints,
                model=model)
            log.info('Built training loss function.')

            train_accuracy = eval_metrics.metric_interpreter(
                metric=dataset_module.score_metric,
                pred=train_scores,
                labels=train_labels)  # training accuracy
            # Only summarize inputs as images when they look like
            # grayscale/RGB (channel count <= 3).
            if int(train_images.get_shape()[-1]) <= 3:
                tf.summary.image('train images', train_images)
            tf.summary.scalar('training loss', train_loss)
            tf.summary.scalar('training accuracy', train_accuracy)
            log.info('Added training summaries.')

            # Validation model: reuses the training variables via scope
            # reuse, so both towers share weights.
            scope.reuse_variables()
            val_model = model_utils.model_class(
                mean=val_means,
                training=True,
                output_size=dataset_module.output_size)
            val_scores, _ = val_model.build(  # Ignore summary
                data=val_images,
                layer_structure=model_dict.layer_structure,
                output_structure=output_structure,
                log=log,
                tower_name='cnn')
            log.info('Built validation model.')

            val_loss, _ = loss_utils.loss_interpreter(
                logits=val_scores,
                labels=val_labels,
                loss_type=config.loss_function,
                dataset_module=dataset_module)
            val_accuracy = eval_metrics.metric_interpreter(
                metric=dataset_module.score_metric,
                pred=val_scores,
                labels=val_labels)  # training accuracy
            if int(train_images.get_shape()[-1]) <= 3:
                tf.summary.image('val images', val_images)
            tf.summary.scalar('validation loss', val_loss)
            tf.summary.scalar('validation accuracy', val_accuracy)
            log.info('Added validation summaries.')

    # Set up summaries and saver
    saver = tf.train.Saver(tf.global_variables())
    summary_op = tf.summary.merge_all()

    # Initialize the graph
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

    # Need to initialize both of these if supplying num_epochs to inputs
    sess.run(
        tf.group(tf.global_variables_initializer(),
                 tf.local_variables_initializer()))
    summary_writer = tf.summary.FileWriter(dir_list['summaries'], sess.graph)

    # Set up exemplar threading
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    # Create dictionaries of important training and validation information
    train_dict = {
        'train_loss': train_loss,
        'train_accuracy': train_accuracy,
        'train_images': train_images,
        'train_labels': train_labels,
        'train_op': train_op,
        'train_scores': train_scores
    }
    val_dict = {
        'val_loss': val_loss,
        'val_accuracy': val_accuracy,
        'val_images': val_images,
        'val_labels': val_labels,
        'val_scores': val_scores,
    }

    # Start training loop: the config object itself is pickled alongside
    # the evaluations so the run can be reproduced later.
    np.save(
        os.path.join(dir_list['condition_evaluations'],
                     'training_config_file'), config)
    log.info('Starting training')
    output_dict = training.training_loop(
        config=config,
        db=db,
        coord=coord,
        sess=sess,
        summary_op=summary_op,
        summary_writer=summary_writer,
        saver=saver,
        threads=threads,
        summary_dir=dir_list['summaries'],
        checkpoint_dir=dir_list['checkpoints'],
        weight_dir=dir_list['weights'],
        train_dict=train_dict,
        val_dict=val_dict,
        train_model=model,
        val_model=val_model,
        exp_params=exp_params)
    log.info('Finished training.')

    # Persist training-loop outputs under a filesystem-safe model name.
    model_name = config.model_struct.replace('/', '_')
    py_utils.save_npys(data=output_dict,
                       model_name=model_name,
                       output_string=dir_list['experiment_evaluations'])
Exemplo n.º 4
0
def evaluation_loop(
        config,
        db,
        coord,
        sess,
        summary_op,
        summary_writer,
        saver,
        threads,
        summary_dir,
        checkpoint_dir,
        weight_dir,
        train_dict,
        val_dict,
        train_model,
        val_model,
        exp_params,
        placeholder_data=None,
        performance_metric='validation_loss',
        aggregator='max'):
    """Run the model evaluation loop.

    Restores the checkpoint at ``config.load_and_evaluate_ckpt``, then
    either feeds pre-loaded arrays through placeholders (when
    ``placeholder_data`` is given) or runs the queued validation graph
    until the input queues are exhausted. Per-step scores/labels/images
    are accumulated and dumped to ``weight_dir``.

    Returns:
        Tuple of (val_labels, val_scores), dicts keyed by step.
    """
    step = 0
    train_losses, train_accs, train_aux, timesteps = {}, {}, {}, {}
    val_scores, val_aux, val_labels = {}, {}, {}
    train_images, val_images = {}, {}
    train_scores, train_labels = {}, {}
    train_aux_check = np.any(['aux_score' in k for k in train_dict.keys()])
    val_aux_check = np.any(['aux_score' in k for k in val_dict.keys()])

    # Restore model
    saver.restore(sess, config.load_and_evaluate_ckpt)

    # Start evaluation. weight_dict defaults to empty so the per-step
    # weight save below is a no-op when config.save_weights is False
    # (previously it raised a NameError in that case).
    weight_dict = {}
    if config.save_weights:
        weight_dict = {
            k[0]: v for k, v in train_model.var_dict.iteritems() if k[1] == 0}
        val_dict = dict(
            val_dict,
            **weight_dict)
    if placeholder_data is not None:
        # Feed pre-loaded arrays through placeholders one batch at a time.
        num_batches = len(placeholder_data['label_data']) // config.batch_size
        batch_index = np.arange(num_batches).repeat(config.batch_size)
        for idx in np.arange(num_batches):
            batch_images = placeholder_data['image_data'][batch_index == idx]
            batch_labels = placeholder_data['label_data'][batch_index == idx]
            batch_images = batch_images.reshape(
                placeholder_data['val_image_shape'])
            batch_labels = batch_labels.reshape(
                placeholder_data['val_label_shape'])
            feed_dict = {
                placeholder_data['val_images']: batch_images,
                placeholder_data['val_labels']: batch_labels,
            }
            it_vars = sess.run(val_dict.values(), feed_dict=feed_dict)
            it_dict = {k: v for k, v in zip(
                val_dict.keys(), it_vars)}
            # TODO(review): it_dict is computed but never accumulated or
            # saved in this branch — confirm intended behavior. (A leftover
            # ipdb breakpoint and dead `a = 2` were removed here.)

    else:
        try:
            while not coord.should_stop():
                start_time = time.time()
                if 1:  # step % config.validation_iters == 0:
                    it_val_scores, it_val_labels, it_val_aux = [], [], []
                    for num_vals in range(config.num_validation_evals):
                        # Validation accuracy as the average of n batches
                        val_vars = sess.run(val_dict.values())
                        it_val_dict = {k: v for k, v in zip(
                            val_dict.keys(), val_vars)}
                        it_val_labels += [it_val_dict['val_labels']]
                        it_val_scores += [it_val_dict['val_scores']]
                        if val_aux_check:
                            # Collect any auxiliary score tensors.
                            iva = {
                                itk: itv
                                for itk, itv in it_val_dict.iteritems()
                                if 'aux_score' in itk}
                            it_val_aux += [iva]
                    val_scores[step] = np.concatenate(it_val_scores)
                    val_labels[step] = np.concatenate(it_val_labels)
                    val_aux[step] = it_val_aux
                    val_images[step] = it_val_dict['val_images']

                    # Save the evaluated weights for this step (no-op when
                    # weight_dict is empty).
                    it_weights = {
                        k: it_val_dict[k] for k in weight_dict.keys()}
                    py_utils.save_npys(
                        data=it_weights,
                        model_name='%s_%s' % (
                            config.experiment_name,
                            step),
                        output_string=weight_dir)

                # End iteration
                step += 1

        except tf.errors.OutOfRangeError:
            # Input queues exhausted: evaluation complete.
            print('Done with evaluation for %d epochs, %d steps.' % (
                config.epochs,
                step))
            print('Saved to: %s' % checkpoint_dir)
        finally:
            coord.request_stop()
        coord.join(threads)
    sess.close()

    # Package images into a dictionary
    image_dict = {
        'val_images': val_images,
        'val_scores': val_scores,
        'val_labels': val_labels
    }
    py_utils.save_npys(
        data=image_dict,
        model_name='%s_%s' % (
            config.experiment_name,
            step),
        output_string=weight_dir)

    return val_labels, val_scores
Exemplo n.º 5
0
def main(
        experiment_name,
        list_experiments=False,
        load_and_evaluate_ckpt=None,
        placeholder_data=None,
        grad_images=False,
        gpu_device='/gpu:0'):
    """Create a tensorflow worker to run experiments in your DB."""
    if list_experiments:
        exps = db.list_experiments()
        print '_' * 30
        print 'Initialized experiments:'
        print '_' * 30
        for l in exps:
            print l.values()[0]
        print '_' * 30
        if len(exps) == 0:
            print 'No experiments found.'
        else:
            print 'You can add to the DB with: '\
                'python prepare_experiments.py --experiment=%s' % \
                exps[0].values()[0]
        return

    if experiment_name is None:
        print 'No experiment specified. Pulling one out of the DB.'
        experiment_name = db.get_experiment_name()

    # Prepare to run the model
    config = Config()
    condition_label = '%s_%s' % (experiment_name, py_utils.get_dt_stamp())
    experiment_label = '%s' % (experiment_name)
    log = logger.get(os.path.join(config.log_dir, condition_label))
    assert experiment_name is not None, 'Empty experiment name.'
    experiment_dict = experiments.experiments()[experiment_name]()
    config = add_to_config(d=experiment_dict, config=config)  # Globals
    config.load_and_evaluate_ckpt = load_and_evaluate_ckpt
    if load_and_evaluate_ckpt is not None:
        # Remove the train operation and add a ckpt pointer
        from ops import evaluation
    config, exp_params = process_DB_exps(
        experiment_name=experiment_name,
        log=log,
        config=config)  # Update config w/ DB params
    dataset_module = py_utils.import_module(
        model_dir=config.dataset_info,
        dataset=config.dataset)
    dataset_module = dataset_module.data_processing()  # hardcoded class name
    train_key = [k for k in dataset_module.folds.keys() if 'train' in k]
    if not len(train_key):
        train_key = 'train'
    else:
        train_key = train_key[0]
    train_data, train_means_image, train_means_label = get_data_pointers(
        dataset=config.dataset,
        base_dir=config.tf_records,
        cv=train_key,
        log=log)
    val_key = [k for k in dataset_module.folds.keys() if 'val' in k]
    if not len(val_key):
        val_key = 'train'
    else:
        val_key = val_key[0]
    val_data, val_means_image, val_means_label = get_data_pointers(
        dataset=config.dataset,
        base_dir=config.tf_records,
        cv=val_key,
        log=log)

    # Initialize output folders
    dir_list = {
        'checkpoints': os.path.join(
            config.checkpoints, condition_label),
        'summaries': os.path.join(
            config.summaries, condition_label),
        'condition_evaluations': os.path.join(
            config.condition_evaluations, condition_label),
        'experiment_evaluations': os.path.join(  # DEPRECIATED
            config.experiment_evaluations, experiment_label),
        'visualization': os.path.join(
            config.visualizations, condition_label),
        'weights': os.path.join(
            config.condition_evaluations, condition_label, 'weights')
    }
    [py_utils.make_dir(v) for v in dir_list.values()]

    # Prepare data loaders on the cpu
    if all(isinstance(i, list) for i in config.data_augmentations):
        if config.data_augmentations:
            config.data_augmentations = py_utils.flatten_list(
                config.data_augmentations,
                log)
    if load_and_evaluate_ckpt is not None:
        config.epochs = 1
        config.train_shuffle = False
        config.val_shuffle = False
    with tf.device('/cpu:0'):
        if placeholder_data:
            placeholder_shape = placeholder_data['train_image_shape']
            placeholder_dtype = placeholder_data['train_image_dtype']
            original_train_images = tf.placeholder(
                dtype=placeholder_dtype,
                shape=placeholder_shape,
                name='train_images')
            placeholder_shape = placeholder_data['train_label_shape']
            placeholder_dtype = placeholder_data['train_label_dtype']
            original_train_labels = tf.placeholder(
                dtype=placeholder_dtype,
                shape=placeholder_shape,
                name='train_labels')
            placeholder_shape = placeholder_data['val_image_shape']
            placeholder_dtype = placeholder_data['val_image_dtype']
            original_val_images = tf.placeholder(
                dtype=placeholder_dtype,
                shape=placeholder_shape,
                name='val_images')
            placeholder_shape = placeholder_data['val_label_shape']
            placeholder_dtype = placeholder_data['val_label_dtype']
            original_val_labels = tf.placeholder(
                dtype=placeholder_dtype,
                shape=placeholder_shape,
                name='val_labels')

            # Apply augmentations
            (
                train_images,
                train_labels
            ) = data_loader.placeholder_image_augmentations(
                images=original_train_images,
                model_input_image_size=dataset_module.model_input_image_size,
                labels=original_train_labels,
                data_augmentations=config.data_augmentations,
                batch_size=config.batch_size)
            (
                val_images,
                val_labels
            ) = data_loader.placeholder_image_augmentations(
                images=original_val_images,
                model_input_image_size=dataset_module.model_input_image_size,
                labels=original_val_labels,
                data_augmentations=config.data_augmentations,
                batch_size=config.batch_size)

            # Store in the placeholder dict
            placeholder_data['train_images'] = original_train_images
            placeholder_data['train_labels'] = original_train_labels
            placeholder_data['val_images'] = original_val_images
            placeholder_data['val_labels'] = original_val_labels
        else:
            train_images, train_labels = data_loader.inputs(
                dataset=train_data,
                batch_size=config.batch_size,
                model_input_image_size=dataset_module.model_input_image_size,
                tf_dict=dataset_module.tf_dict,
                data_augmentations=config.data_augmentations,
                num_epochs=config.epochs,
                tf_reader_settings=dataset_module.tf_reader,
                shuffle=config.shuffle_train,
                resize_output=config.resize_output)
            if hasattr(config, 'val_augmentations'):
                val_augmentations = config.val_augmentations
            else:
                val_augmentations = config.data_augmentations
            val_images, val_labels = data_loader.inputs(
                dataset=val_data,
                batch_size=config.batch_size,
                model_input_image_size=dataset_module.model_input_image_size,
                tf_dict=dataset_module.tf_dict,
                data_augmentations=val_augmentations,
                num_epochs=config.epochs,
                tf_reader_settings=dataset_module.tf_reader,
                shuffle=config.shuffle_val,
                resize_output=config.resize_output)
    log.info('Created tfrecord dataloader tensors.')

    # Load model specification
    struct_name = config.model_struct.split(os.path.sep)[-1]
    try:
        model_dict = py_utils.import_module(
            dataset=struct_name,
            model_dir=os.path.join(
                'models',
                'structs',
                experiment_name).replace(os.path.sep, '.')
            )
    except IOError:
        print 'Could not find the model structure: %s in folder %s' % (
            struct_name,
            experiment_name)

    # Inject model_dict with hyperparameters if requested
    model_dict.layer_structure = hp_opt_utils.inject_model_with_hps(
        layer_structure=model_dict.layer_structure,
        exp_params=exp_params)

    # Prepare variables for the models
    if len(dataset_module.output_size) == 2:
        log.warning(
            'Found > 1 dimension for your output size.'
            'Converting to a scalar.')
        dataset_module.output_size = np.prod(
            dataset_module.output_size)

    if hasattr(model_dict, 'output_structure'):
        # Use specified output layer
        output_structure = model_dict.output_structure
    else:
        output_structure = None

    # Correct number of output neurons if needed
    if config.dataloader_override and\
            'weights' in output_structure[-1].keys():
        output_neurons = output_structure[-1]['weights'][0]
        size_check = output_neurons != dataset_module.output_size
        fc_check = output_structure[-1]['layers'][0] == 'fc'
        if size_check and fc_check:
            output_structure[-1]['weights'][0] = dataset_module.output_size
            log.warning('Adjusted output neurons from %s to %s.' % (
                output_neurons,
                dataset_module.output_size))

    # Prepare model on GPU
    if not hasattr(dataset_module, 'input_normalization'):
        dataset_module.input_normalization = None
    with tf.device(gpu_device):
        with tf.variable_scope('cnn') as scope:
            # Training model
            model = model_utils.model_class(
                mean=train_means_image,
                training=True,
                output_size=dataset_module.output_size,
                input_normalization=dataset_module.input_normalization)
            train_scores, model_summary, _ = model.build(
                data=train_images,
                layer_structure=model_dict.layer_structure,
                output_structure=output_structure,
                log=log,
                tower_name='cnn')
            if grad_images:
                oh_dims = int(train_scores.get_shape()[-1])
                target_scores = tf.one_hot(train_labels, oh_dims) * train_scores
                train_gradients = tf.gradients(target_scores, train_images)[0]
            log.info('Built training model.')
            log.debug(
                json.dumps(model_summary, indent=4),
                verbose=0)
            print_model_architecture(model_summary)

            # Normalize labels on GPU if needed
            if 'normalize_labels' in exp_params.keys():
                if exp_params['normalize_labels'] == 'zscore':
                    train_labels -= train_means_label['mean']
                    train_labels /= train_means_label['std']
                    val_labels -= train_means_label['mean']
                    val_labels /= train_means_label['std']
                    log.info('Z-scoring labels.')
                elif exp_params['normalize_labels'] == 'mean':
                    train_labels -= train_means_label['mean']
                    val_labels -= val_means_label['mean']
                    log.info('Mean-centering labels.')

            # Check the shapes of labels and scores
            if not isinstance(train_scores, list):
                if len(
                        train_scores.get_shape()) != len(
                            train_labels.get_shape()):
                    train_shape = train_scores.get_shape().as_list()
                    label_shape = train_labels.get_shape().as_list()
                    val_shape = val_scores.get_shape().as_list()
                    val_label_shape = val_labels.get_shape().as_list()

                    if len(
                        train_shape) == 2 and len(
                            label_shape) == 1 and train_shape[-1] == 1:
                        train_labels = tf.expand_dims(train_labels, axis=-1)
                        val_labels = tf.expand_dims(val_labels, axis=-1)
                    elif len(
                        train_shape) == 2 and len(
                            label_shape) == 1 and train_shape[-1] == 1:
                        train_scores = tf.expand_dims(train_scores, axis=-1)
                        val_scores = tf.expand_dims(val_scores, axis=-1)

            # Prepare the loss function
            train_loss, _ = loss_utils.loss_interpreter(
                logits=train_scores,  # TODO
                labels=train_labels,
                loss_type=config.loss_function,
                weights=config.loss_weights,
                dataset_module=dataset_module)

            # Add loss tensorboard tracking
            if isinstance(train_loss, list):
                for lidx, tl in enumerate(train_loss):
                    tf.summary.scalar('training_loss_%s' % lidx, tl)
                train_loss = tf.add_n(train_loss)
            else:
                tf.summary.scalar('training_loss', train_loss)

            # Add weight decay if requested
            if len(model.regularizations) > 0:
                train_loss = loss_utils.wd_loss(
                    regularizations=model.regularizations,
                    loss=train_loss,
                    wd_penalty=config.regularization_strength)
            assert config.lr is not None, 'No learning rate.'  # TODO: Make a QC function 
            if config.lr > 1:
                old_lr = config.lr
                config.lr = loss_utils.create_lr_schedule(
                    train_batch=config.batch_size,
                    num_training=config.lr)
                config.optimizer = 'momentum'
                log.info('Forcing momentum classifier.')
            else:
                old_lr = None
            train_op = loss_utils.optimizer_interpreter(
                loss=train_loss,
                lr=config.lr,
                optimizer=config.optimizer,
                constraints=config.optimizer_constraints,
                model=model)
            log.info('Built training loss function.')

            # Add a score for the training set
            train_accuracy = eval_metrics.metric_interpreter(
                metric=dataset_module.score_metric,  # TODO: Attach to exp cnfg
                pred=train_scores,  # TODO
                labels=train_labels)

            # Add aux scores if requested
            train_aux = {}
            if hasattr(dataset_module, 'aux_scores'):
                for m in dataset_module.aux_scores:
                    train_aux[m] = eval_metrics.metric_interpreter(
                        metric=m,
                        pred=train_scores,
                        labels=train_labels)  # [0]  # TODO: Fix for multiloss

            # Prepare remaining tensorboard summaries
            if config.tensorboard_images:
                if len(train_images.get_shape()) == 4:
                    tf_fun.image_summaries(train_images, tag='Training images')
                if (np.asarray(
                        train_labels.get_shape().as_list()) > 1).sum() > 2:
                    tf_fun.image_summaries(
                        train_labels,
                        tag='Training_targets')
                    tf_fun.image_summaries(
                        train_scores,
                        tag='Training_predictions')
            if isinstance(train_accuracy, list):
                for tidx, ta in enumerate(train_accuracy):
                    tf.summary.scalar('training_accuracy_%s' % tidx, ta)
            else:
                tf.summary.scalar('training_accuracy', train_accuracy)
            if config.pr_curve:
                if isinstance(train_scores, list):
                    for pidx, train_score in enumerate(train_scores):
                        train_label = train_labels[:, pidx]
                        pr_summary.op(
                            tag='training_pr_%s' % pidx,
                            predictions=tf.cast(
                                tf.argmax(
                                    train_score,
                                    axis=-1),
                                tf.float32),
                            labels=tf.cast(train_label, tf.bool),
                            display_name='training_precision_recall_%s' % pidx)
                else:
                    pr_summary.op(
                        tag='training_pr',
                        predictions=tf.cast(
                            tf.argmax(
                                train_scores,
                                axis=-1),
                            tf.float32),
                        labels=tf.cast(train_labels, tf.bool),
                        display_name='training_precision_recall')
            log.info('Added training summaries.')

        with tf.variable_scope('cnn', tf.AUTO_REUSE) as scope:
            # Validation model
            scope.reuse_variables()
            val_model = model_utils.model_class(
                mean=train_means_image,  # Normalize with train data
                training=False,
                output_size=dataset_module.output_size,
                input_normalization=dataset_module.input_normalization)
            val_scores, _, _ = val_model.build(  # Ignore summary
                data=val_images,
                layer_structure=model_dict.layer_structure,
                output_structure=output_structure,
                log=log,
                tower_name='cnn')
            if grad_images:
                oh_dims = int(val_scores.get_shape()[-1])
                target_scores = tf.one_hot(val_labels, oh_dims) * val_scores
                val_gradients = tf.gradients(target_scores, val_images)[0]
            log.info('Built validation model.')

            # Check the shapes of labels and scores
            val_loss, _ = loss_utils.loss_interpreter(
                logits=val_scores,
                labels=val_labels,
                loss_type=config.loss_function,
                weights=config.loss_weights,
                dataset_module=dataset_module)

            # Add loss tensorboard tracking
            if isinstance(val_loss, list):
                for lidx, tl in enumerate(val_loss):
                    tf.summary.scalar('validation_loss_%s' % lidx, tl)
                val_loss = tf.add_n(val_loss)
            else:
                tf.summary.scalar('validation_loss', val_loss)

            # Add a score for the validation set
            val_accuracy = eval_metrics.metric_interpreter(
                metric=dataset_module.score_metric,  # TODO
                pred=val_scores,
                labels=val_labels)

            # Add aux scores if requested
            val_aux = {}
            if hasattr(dataset_module, 'aux_scores'):
                for m in dataset_module.aux_scores:
                    val_aux[m] = eval_metrics.metric_interpreter(
                        metric=m,
                        pred=val_scores,
                        labels=val_labels)  # [0]  # TODO: Fix for multiloss

            # Prepare tensorboard summaries
            if config.tensorboard_images:
                if len(val_images.get_shape()) == 4:
                    tf_fun.image_summaries(
                        val_images,
                        tag='Validation')
                if (np.asarray(
                        val_labels.get_shape().as_list()) > 1).sum() > 2:
                    tf_fun.image_summaries(
                        val_labels,
                        tag='Validation_targets')
                    tf_fun.image_summaries(
                        val_scores,
                        tag='Validation_predictions')
            if isinstance(val_accuracy, list):
                for vidx, va in enumerate(val_accuracy):
                    tf.summary.scalar('validation_accuracy_%s' % vidx, va)
            else:
                tf.summary.scalar('validation_accuracy', val_accuracy)
            if config.pr_curve:
                if isinstance(val_scores, list):
                    for pidx, val_score in enumerate(val_scores):
                        val_label = val_labels[:, pidx]
                        pr_summary.op(
                            tag='validation_pr_%s' % pidx,
                            predictions=tf.cast(
                                tf.argmax(
                                    val_score,
                                    axis=-1),
                                tf.float32),
                            labels=tf.cast(val_label, tf.bool),
                            display_name='validation_precision_recall_%s' %
                            pidx)
                else:
                    pr_summary.op(
                        tag='validation_pr',
                        predictions=tf.cast(
                            tf.argmax(
                                val_scores,
                                axis=-1),
                            tf.float32),
                        labels=tf.cast(val_labels, tf.bool),
                        display_name='validation_precision_recall')
            log.info('Added validation summaries.')

    # Set up summaries and saver
    if not hasattr(config, 'max_to_keep'):
        config.max_to_keep = None
    saver = tf.train.Saver(
        var_list=tf.global_variables(),
        max_to_keep=config.max_to_keep)
    summary_op = tf.summary.merge_all()

    # Initialize the graph
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

    # Need to initialize both of these if supplying num_epochs to inputs
    sess.run(
        tf.group(
            tf.global_variables_initializer(),
            tf.local_variables_initializer())
        )
    summary_writer = tf.summary.FileWriter(dir_list['summaries'], sess.graph)

    # Set up exemplar threading
    if placeholder_data:
        coord, threads = None, None
    else:
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    # Create dictionaries of important training and validation information
    train_dict = {
        'train_loss': train_loss,
        'train_images': train_images,
        'train_labels': train_labels,
        'train_op': train_op,
        'train_scores': train_scores
    }
    val_dict = {
        'val_loss': val_loss,
        'val_images': val_images,
        'val_labels': val_labels,
        'val_scores': val_scores,
    }

    if grad_images:
        train_dict['train_gradients'] = train_gradients
        val_dict['val_gradients'] = val_gradients

    if isinstance(train_accuracy, list):
        for tidx, (ta, va) in enumerate(zip(train_accuracy, val_accuracy)):
            train_dict['train_accuracy_%s' % tidx] = ta
            val_dict['val_accuracy_%s' % tidx] = va
    else:
        train_dict['train_accuracy_0'] = train_accuracy
        val_dict['val_accuracy_0'] = val_accuracy

    if load_and_evaluate_ckpt is not None:
        # Remove the train operation and add a ckpt pointer
        del train_dict['train_op']

    if hasattr(dataset_module, 'aux_score'):
        # Attach auxillary scores to tensor dicts
        for m in dataset_module.aux_scores:
            train_dict['train_aux_%s' % m] = train_aux[m]
            val_dict['val_aux_%s' % m] = val_aux[m]

    # Start training loop
    if old_lr is not None:
        config.lr = old_lr
    np.save(
        os.path.join(
            dir_list['condition_evaluations'], 'training_config_file'),
        config)
    log.info('Starting training')
    if load_and_evaluate_ckpt is not None:
        return evaluation.evaluation_loop(
            config=config,
            db=db,
            coord=coord,
            sess=sess,
            summary_op=summary_op,
            summary_writer=summary_writer,
            saver=saver,
            threads=threads,
            summary_dir=dir_list['summaries'],
            checkpoint_dir=dir_list['checkpoints'],
            weight_dir=dir_list['weights'],
            train_dict=train_dict,
            val_dict=val_dict,
            train_model=model,
            val_model=val_model,
            exp_params=exp_params,
            placeholder_data=placeholder_data)
    else:
        output_dict = training.training_loop(
            config=config,
            db=db,
            coord=coord,
            sess=sess,
            summary_op=summary_op,
            summary_writer=summary_writer,
            saver=saver,
            threads=threads,
            summary_dir=dir_list['summaries'],
            checkpoint_dir=dir_list['checkpoints'],
            weight_dir=dir_list['weights'],
            train_dict=train_dict,
            val_dict=val_dict,
            train_model=model,
            val_model=val_model,
            exp_params=exp_params)

    log.info('Finished training.')
    model_name = config.model_struct.replace('/', '_')
    if output_dict is not None:
        py_utils.save_npys(
            data=output_dict,
            model_name=model_name,
            output_string=dir_list['experiment_evaluations'])
Exemplo n.º 6
0
def evaluation_loop(config,
                    db,
                    coord,
                    sess,
                    summary_op,
                    summary_writer,
                    saver,
                    threads,
                    summary_dir,
                    checkpoint_dir,
                    weight_dir,
                    train_dict,
                    val_dict,
                    train_model,
                    val_model,
                    exp_params,
                    performance_metric='validation_loss',
                    aggregator='max'):
    """Restore a checkpoint and run the model evaluation loop.

    Restores the checkpoint named by ``config.load_and_evaluate_ckpt``,
    then repeatedly runs the tensors in ``train_dict``/``val_dict`` until
    the input queues are exhausted (``tf.errors.OutOfRangeError``).
    Every ``config.validation_iters`` steps it averages validation
    loss/accuracy over ``config.num_validation_evals`` batches and, when
    ``config.save_weights`` is set, dumps the model weights to npy files
    in ``weight_dir``. A final dictionary of images/scores/labels is
    saved on exit.

    Returns:
        None. Results are written to disk via ``py_utils.save_npys``.

    Raises:
        AssertionError: if the training loss becomes NaN.
    """
    step = 0
    # Per-step histories, keyed by step index.
    train_losses, train_accs, train_aux, timesteps = {}, {}, {}, {}
    val_losses, val_accs, val_scores, val_aux, val_labels = {}, {}, {}, {}, {}
    train_images, val_images = {}, {}
    train_scores, train_labels = {}, {}
    # Whether auxiliary score tensors were attached to the fetch dicts.
    train_aux_check = np.any(['aux_score' in k for k in train_dict.keys()])
    val_aux_check = np.any(['aux_score' in k for k in val_dict.keys()])

    # Restore model
    saver.restore(sess, config.load_and_evaluate_ckpt)

    # Start evaluation. weight_dict must exist even when save_weights is
    # off, because it is consulted inside the loop below (previously this
    # raised a NameError on the first validation pass).
    weight_dict = {}
    if config.save_weights:
        weight_dict = {
            k[0]: v
            for k, v in train_model.var_dict.iteritems() if k[1] == 0
        }
        val_dict = dict(val_dict, **weight_dict)
    try:
        while not coord.should_stop():
            start_time = time.time()
            train_vars = sess.run(train_dict.values())
            it_train_dict = {
                k: v
                for k, v in zip(train_dict.keys(), train_vars)
            }
            duration = time.time() - start_time
            train_losses[step] = it_train_dict['train_loss']
            train_accs[step] = it_train_dict['train_accuracy']
            train_images[step] = it_train_dict['train_images']
            train_labels[step] = it_train_dict['train_labels']
            train_scores[step] = it_train_dict['train_scores']
            timesteps[step] = duration
            if train_aux_check:
                # Loop through to find aux scores
                it_train_aux = {
                    itk: itv
                    for itk, itv in it_train_dict.iteritems()
                    if 'aux_score' in itk
                }
                train_aux[step] = it_train_aux
            assert not np.isnan(it_train_dict['train_loss']).any(
            ), 'Model diverged with loss = NaN'
            if step % config.validation_iters == 0:
                it_val_acc = np.asarray([])
                it_val_loss = np.asarray([])
                it_val_scores, it_val_labels, it_val_aux = [], [], []
                for num_vals in range(config.num_validation_evals):
                    # Validation accuracy as the average of n batches
                    val_vars = sess.run(val_dict.values())
                    it_val_dict = {
                        k: v
                        for k, v in zip(val_dict.keys(), val_vars)
                    }
                    it_val_acc = np.append(it_val_acc,
                                           it_val_dict['val_accuracy'])
                    it_val_loss = np.append(it_val_loss,
                                            it_val_dict['val_loss'])
                    it_val_labels += [it_val_dict['val_labels']]
                    it_val_scores += [it_val_dict['val_scores']]
                    if val_aux_check:
                        iva = {
                            itk: itv
                            for itk, itv in it_val_dict.iteritems()
                            if 'aux_score' in itk
                        }
                        it_val_aux += [iva]
                val_acc = it_val_acc.mean()
                val_lo = it_val_loss.mean()
                val_accs[step] = val_acc
                val_losses[step] = val_lo
                val_scores[step] = it_val_scores
                val_labels[step] = it_val_labels
                val_aux[step] = it_val_aux
                val_images[step] = it_val_dict['val_images']

                # Dump model weights, but only when they were fetched
                # (weight_dict is empty otherwise).
                if config.save_weights:
                    it_weights = {
                        k: it_val_dict[k] for k in weight_dict.keys()
                    }
                    py_utils.save_npys(data=it_weights,
                                       model_name='%s_%s' %
                                       (config.experiment_name, step),
                                       output_string=weight_dir)

            # End iteration
            step += 1

    except tf.errors.OutOfRangeError:
        # Queue exhaustion is the normal termination signal.
        print('Done with evaluation for %d epochs, %d steps.' % (
            config.epochs, step))
        print('Saved to: %s' % checkpoint_dir)
    finally:
        coord.request_stop()
    coord.join(threads)
    sess.close()

    # Package images into a dictionary
    image_dict = {
        'train_images': train_images,
        'val_images': val_images,
        'train_scores': train_scores,
        'train_labels': train_labels,
        'val_scores': val_scores,
        'val_labels': val_labels
    }
    py_utils.save_npys(data=image_dict,
                       model_name='%s_%s' % (config.experiment_name, step),
                       output_string=weight_dir)
Exemplo n.º 7
0
def training_loop(config, db, coord, sess, summary_op, summary_writer, saver,
                  threads, summary_dir, checkpoint_dir, weight_dir, train_dict,
                  val_dict, train_model, val_model, exp_params):
    """Run the model training loop.

    Runs the training op until the input queues are exhausted
    (``tf.errors.OutOfRangeError``) or early stopping triggers. Every
    ``config.validation_iters`` steps, validation loss/accuracy are
    averaged over ``config.num_validation_evals`` batches, a checkpoint
    is saved, and performance is recorded in the DB.

    Returns:
        dict: per-step histories of train/val losses, accuracies,
        timings, validation scores and labels.

    Raises:
        AssertionError: if the training loss becomes NaN.
    """
    step, time_elapsed = 0, 0
    # Per-step histories, keyed by step index.
    train_losses, train_accs, timesteps = {}, {}, {}
    val_losses, val_accs, val_scores, val_labels = {}, {}, {}, {}
    if config.save_weights:
        weight_dict = {
            k[0]: v
            for k, v in val_model.var_dict.iteritems() if k[1] == 0
        }
        val_dict = dict(val_dict, **weight_dict)
    try:
        while not coord.should_stop():
            start_time = time.time()
            train_vars = sess.run(train_dict.values())
            it_train_dict = {
                k: v
                for k, v in zip(train_dict.keys(), train_vars)
            }
            duration = time.time() - start_time
            train_losses[step] = it_train_dict['train_loss']
            train_accs[step] = it_train_dict['train_accuracy']
            timesteps[step] = duration
            assert not np.isnan(it_train_dict['train_loss']).any(
            ), 'Model diverged with loss = NaN'
            if step % config.validation_iters == 0:
                it_val_acc = np.asarray([])
                it_val_loss = np.asarray([])
                it_val_scores = np.asarray([])
                it_val_labels = np.asarray([])
                for num_vals in range(config.num_validation_evals):
                    # Validation accuracy as the average of n batches
                    val_vars = sess.run(val_dict.values())
                    it_val_dict = {
                        k: v
                        for k, v in zip(val_dict.keys(), val_vars)
                    }
                    it_val_acc = np.append(it_val_acc,
                                           it_val_dict['val_accuracy'])
                    it_val_loss = np.append(it_val_loss,
                                            it_val_dict['val_loss'])
                    # Accumulate into the score/label arrays themselves;
                    # previously these appended onto it_val_loss, which
                    # overwrote scores/labels with loss values.
                    it_val_scores = np.append(it_val_scores,
                                              it_val_dict['val_scores'])
                    it_val_labels = np.append(it_val_labels,
                                              it_val_dict['val_labels'])
                val_acc = it_val_acc.mean()
                val_lo = it_val_loss.mean()
                val_accs[step] = val_acc
                val_losses[step] = val_lo
                val_scores[step] = it_val_scores
                val_labels[step] = it_val_labels

                # Summaries
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

                # Training status and validation accuracy
                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; '
                              '%.3f sec/batch) | Training accuracy = %s | '
                              'Validation accuracy = %s | logdir = %s')
                print(format_str % (
                    datetime.now(), step, it_train_dict['train_loss'],
                    config.batch_size / duration, float(duration),
                    it_train_dict['train_accuracy'], val_acc, summary_dir))

                # Save the model checkpoint if it's the best yet.
                # Initialize force_save so the top_n_validation branch
                # cannot leave it undefined (previously a NameError when
                # no improvement was found on the first validation).
                force_save = False
                if config.top_n_validation > 0:
                    # FIXME(review): val_accs is a dict; comparing a float
                    # against it is almost certainly not the intended
                    # top-n check — confirm against the original design.
                    rep_idx = val_acc > val_accs
                    if sum(rep_idx) > 0:
                        force_save = True
                        val_accs[np.argmax(rep_idx)] = val_acc
                else:
                    force_save = True

                if force_save:
                    ckpt_path = os.path.join(checkpoint_dir,
                                             'model_' + str(step) + '.ckpt')
                    saver.save(sess, ckpt_path, global_step=step)
                    print('Saved checkpoint to: %s' % ckpt_path)
                    force_save = False
                    time_elapsed += float(duration)
                    db.update_performance(
                        experiment_id=config._id,
                        experiment_name=config.experiment_name,
                        summary_dir=summary_dir,
                        ckpt_file=ckpt_path,
                        training_loss=float(it_train_dict['train_loss']),
                        validation_loss=float(val_acc),
                        time_elapsed=time_elapsed,
                        training_step=step)
                    if config.save_weights:
                        it_weights = {
                            k: it_val_dict[k]
                            for k in weight_dict.keys()
                        }
                        py_utils.save_npys(data=it_weights,
                                           model_name='%s_%s' %
                                           (config.experiment_name, step),
                                           output_string=weight_dir)

                if config.early_stop:
                    keys = np.sort([int(k) for k in val_accs.keys()])
                    sorted_vals = np.asarray([val_accs[k] for k in keys])
                    if check_early_stop(sorted_vals):
                        print('Triggered an early stop.')
                        break
            else:
                # Training status
                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; '
                              '%.3f sec/batch) | Training accuracy = %s')
                print(format_str % (
                    datetime.now(), step,
                    it_train_dict['train_loss'], config.batch_size / duration,
                    float(duration), it_train_dict['train_accuracy']))

            # End iteration
            step += 1

    except tf.errors.OutOfRangeError:
        # Queue exhaustion is the normal termination signal.
        print('Done training for %d epochs, %d steps.' % (config.epochs, step))
        print('Saved to: %s' % checkpoint_dir)
    finally:
        coord.request_stop()
    coord.join(threads)
    sess.close()

    # If using hp optimization, store performance here
    if exp_params['hp_current_iteration'] is not None:
        exp_params['hp_current_iteration'] += 1

    # Package output variables into a dictionary. Each key now maps to
    # its matching history (previously the keys were shifted, e.g.
    # 'train_accs' held val_losses and 'timesteps' held train_accs).
    output_dict = {
        'train_losses': train_losses,
        'train_accs': train_accs,
        'timesteps': timesteps,
        'val_losses': val_losses,
        'val_accs': val_accs,
        'val_scores': val_scores,
        'val_labels': val_labels,
    }
    return output_dict
Exemplo n.º 8
0
def training_loop(config,
                  db,
                  coord,
                  sess,
                  summary_op,
                  summary_writer,
                  saver,
                  threads,
                  summary_dir,
                  checkpoint_dir,
                  weight_dir,
                  train_dict,
                  val_dict,
                  train_model,
                  val_model,
                  exp_params,
                  performance_metric='validation_loss',
                  aggregator='max'):
    """Run the model training loop with optional online hp-optimization.

    Runs the training op until the input queues are exhausted
    (``tf.errors.OutOfRangeError``) or early stopping triggers. Every
    ``config.validation_iters`` steps, validation loss/accuracy are
    averaged over ``config.num_validation_evals`` batches, a checkpoint
    is saved and performance is recorded in the DB. On exit, if
    ``exp_params['hp_current_iteration']`` is set, the DB is updated
    with the next set of hyperparameters.

    Returns:
        dict: per-step histories of train/val losses, accuracies,
        auxiliary scores, timings, validation scores and labels.

    Raises:
        AssertionError: if the training loss becomes NaN.
    """
    step, time_elapsed = 0, 0
    # Per-step histories, keyed by step index.
    train_losses, train_accs, train_aux, timesteps = {}, {}, {}, {}
    val_losses, val_accs, val_scores, val_aux, val_labels = {}, {}, {}, {}, {}
    # Whether auxiliary score tensors were attached to the fetch dicts.
    train_aux_check = np.any(['aux_score' in k for k in train_dict.keys()])
    val_aux_check = np.any(['aux_score' in k for k in val_dict.keys()])
    if config.save_weights:
        weight_dict = {
            k[0]: v
            for k, v in val_model.var_dict.iteritems() if k[1] == 0
        }
        val_dict = dict(val_dict, **weight_dict)
    try:
        while not coord.should_stop():
            start_time = time.time()
            train_vars = sess.run(train_dict.values())
            it_train_dict = {
                k: v
                for k, v in zip(train_dict.keys(), train_vars)
            }
            duration = time.time() - start_time
            train_losses[step] = it_train_dict['train_loss']
            train_accs[step] = it_train_dict['train_accuracy']
            timesteps[step] = duration
            if train_aux_check:
                # Loop through to find aux scores
                it_train_aux = {
                    itk: itv
                    for itk, itv in it_train_dict.iteritems()
                    if 'aux_score' in itk
                }
                train_aux[step] = it_train_aux
            assert not np.isnan(it_train_dict['train_loss']).any(
            ), 'Model diverged with loss = NaN'
            if step % config.validation_iters == 0:
                it_val_acc = np.asarray([])
                it_val_loss = np.asarray([])
                it_val_scores, it_val_labels, it_val_aux = [], [], []
                for num_vals in range(config.num_validation_evals):
                    # Validation accuracy as the average of n batches
                    val_vars = sess.run(val_dict.values())
                    it_val_dict = {
                        k: v
                        for k, v in zip(val_dict.keys(), val_vars)
                    }
                    it_val_acc = np.append(it_val_acc,
                                           it_val_dict['val_accuracy'])
                    it_val_loss = np.append(it_val_loss,
                                            it_val_dict['val_loss'])
                    it_val_labels += [it_val_dict['val_labels']]
                    it_val_scores += [it_val_dict['val_scores']]
                    if val_aux_check:
                        iva = {
                            itk: itv
                            for itk, itv in it_val_dict.iteritems()
                            if 'aux_score' in itk
                        }
                        it_val_aux += [iva]
                val_acc = it_val_acc.mean()
                val_lo = it_val_loss.mean()
                val_accs[step] = val_acc
                val_losses[step] = val_lo
                val_scores[step] = it_val_scores
                val_labels[step] = it_val_labels
                val_aux[step] = it_val_aux

                # Summaries
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

                # Training status and validation accuracy
                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; '
                              '%.3f sec/batch) | Training accuracy = %s | '
                              'Validation accuracy = %s | logdir = %s')
                print(format_str % (
                    datetime.now(), step, it_train_dict['train_loss'],
                    config.batch_size / duration, float(duration),
                    it_train_dict['train_accuracy'], val_acc, summary_dir))

                # Save the model checkpoint if it's the best yet.
                # Initialize force_save so the top_n_validation branch
                # cannot leave it undefined (previously a NameError when
                # no improvement was found on the first validation).
                force_save = False
                if config.top_n_validation > 0:
                    # FIXME(review): val_accs is a dict; comparing a float
                    # against it is almost certainly not the intended
                    # top-n check — confirm against the original design.
                    rep_idx = val_acc > val_accs
                    if sum(rep_idx) > 0:
                        force_save = True
                        val_accs[np.argmax(rep_idx)] = val_acc
                else:
                    force_save = True

                if force_save:
                    ckpt_path = os.path.join(checkpoint_dir,
                                             'model_' + str(step) + '.ckpt')
                    saver.save(sess, ckpt_path, global_step=step)
                    print('Saved checkpoint to: %s' % ckpt_path)
                    force_save = False
                    time_elapsed += float(duration)
                    db.update_performance(
                        experiment_id=config._id,
                        experiment_name=config.experiment_name,
                        summary_dir=summary_dir,
                        ckpt_file=ckpt_path,
                        training_loss=float(it_train_dict['train_loss']),
                        validation_loss=float(val_acc),
                        time_elapsed=time_elapsed,
                        training_step=step)
                    if config.save_weights:
                        it_weights = {
                            k: it_val_dict[k]
                            for k in weight_dict.keys()
                        }
                        py_utils.save_npys(data=it_weights,
                                           model_name='%s_%s' %
                                           (config.experiment_name, step),
                                           output_string=weight_dir)

                if config.early_stop:
                    keys = np.sort([int(k) for k in val_accs.keys()])
                    sorted_vals = np.asarray([val_accs[k] for k in keys])
                    if check_early_stop(sorted_vals):
                        print('Triggered an early stop.')
                        break
            else:
                # Training status
                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; '
                              '%.3f sec/batch) | Training accuracy = %s')
                print(format_str % (
                    datetime.now(), step,
                    it_train_dict['train_loss'], config.batch_size / duration,
                    float(duration), it_train_dict['train_accuracy']))

            # End iteration
            step += 1

    except tf.errors.OutOfRangeError:
        # Queue exhaustion is the normal termination signal.
        print('Done training for %d epochs, %d steps.' % (config.epochs, step))
        print('Saved to: %s' % checkpoint_dir)
    finally:
        coord.request_stop()
    coord.join(threads)
    sess.close()

    # If using online hp optimization, update the database with performance
    if exp_params['hp_current_iteration'] is not None:

        # If we have not exceeded the maximum online hp optimizations:
        if exp_params['hp_current_iteration'] < exp_params['hp_max_studies']:

            # Database lookup to get all performance for this hp-thread
            performance_history = db.query_hp_hist(
                exp_params=exp_params,
                performance_metric=performance_metric,
                aggregator=aggregator)

            # Call on online optimization tools
            exp_params = hp_opt_utils.hp_optim_interpreter(
                performance_history=performance_history, aggregator=aggregator)

            # Prepare parameters for DB
            pk = prepare_experiments.protected_keys()
            exp_params = prepare_experiments.prepare_hp_params(
                parameter_dict=exp_params, pk=pk)

            # Iterate the count
            exp_params['hp_current_iteration'] += 1
            # String 'null' markers from the DB are normalized to None.
            for k, v in exp_params.iteritems():
                if isinstance(v, basestring) and 'null' in v:
                    exp_params[k] = None

            # Update the database with the new hyperparameters
            db.update_online_experiment(
                exp_combos=[exp_params],
                experiment_link=exp_params['experiment_link'])

    # Package output variables into a dictionary
    output_dict = {
        'train_losses': train_losses,
        'train_accs': train_accs,
        'train_aux': train_aux,
        'timesteps': timesteps,
        'val_losses': val_losses,
        'val_accs': val_accs,
        'val_scores': val_scores,
        'val_labels': val_labels,
        'val_aux': val_aux,
    }
    return output_dict
Exemplo n.º 9
0
def main(experiment_name, list_experiments=False):
    """Create a tensorflow worker to run experiments in your DB."""
    if list_experiments:
        exps = db.list_experiments()
        print '_' * 30
        print 'Initialized experiments:'
        print '_' * 30
        for l in exps:
            print l.values()[0]
        print '_' * 30
        return
    # Prepare to run the model
    config = Config()
    condition_label = '%s_%s' % (experiment_name, get_dt_stamp())
    experiment_label = '%s' % (experiment_name)
    log = logger.get(os.path.join(config.log_dir, condition_label))
    experiment_dict = experiments.experiments()[experiment_name]()
    config = add_to_config(d=experiment_dict, config=config)  # Globals
    config = process_DB_exps(experiment_name=experiment_name,
                             log=log,
                             config=config)  # Update config w/ DB params
    dataset_module = py_utils.import_module(model_dir=config.dataset_info,
                                            dataset=config.dataset)
    dataset_module = dataset_module.data_processing()  # hardcoded class name

    # Prepare data loaders on the cpu
    with tf.device('/cpu:0'):
        # Test issues with data loading? Try placeholders instead.
        train_images = tf.placeholder(tf.float32,
                                      name='train_images',
                                      shape=[config.batch_size] +
                                      dataset_module.im_size)
        train_labels = tf.placeholder(tf.int64,
                                      name='train_labels',
                                      shape=[config.batch_size])
        val_images = tf.placeholder(tf.float32,
                                    name='val_images',
                                    shape=[config.batch_size] +
                                    dataset_module.im_size)
        val_labels = tf.placeholder(tf.int64,
                                    name='val_labels',
                                    shape=[config.batch_size])
    log.info('Created tfrecord dataloader tensors.')

    # Prepare model on GPU
    with tf.device('/gpu:0'):
        with tf.variable_scope('cnn') as scope:

            # Training model
            if len(dataset_module.output_size) > 1:
                log.warning('Found > 1 dimension for your output size.'
                            'Converting to a scalar.')
                dataset_module.output_size = np.prod(
                    dataset_module.output_size)
            # Click weighting
            flat_ims = tf.reshape(
                train_images,
                [config.batch_size,
                 np.prod(dataset_module.im_size)])
            W = tf.get_variable(
                name='W',
                initializer=tf.truncated_normal_initializer(stddev=0.1),
                shape=[
                    np.prod(dataset_module.im_size), dataset_module.output_size
                ])
            b = tf.get_variable(
                name='b',
                initializer=tf.truncated_normal_initializer(stddev=0.1),
                shape=[dataset_module.output_size])
            output_scores = tf.matmul(flat_ims, W) + b

            # Prepare the loss function
            train_loss = tf.reduce_mean(
                tf.nn.sparse_softmax_cross_entropy_with_logits(
                    labels=train_labels, logits=output_scores))
            train_op = tf.train.GradientDescentOptimizer(
                config.lr).minimize(train_loss)
            log.info('Built training loss function.')

    # Set up summaries and saver
    saver = tf.train.Saver(tf.global_variables())
    summary_op = tf.summary.merge_all()

    # Initialize the graph
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))

    # Need to initialize both of these if supplying num_epochs to inputs
    sess.run(
        tf.group(tf.global_variables_initializer(),
                 tf.local_variables_initializer()))

    # Start training loop
    step, time_elapsed = 0, 0
    train_losses, train_accs, val_losses, val_accs, timesteps = {}, {}, {}, {}, {}
    files, labels = dataset_module.get_data()
    combined_files = files['train']
    combined_labels = labels['train']
    batch_size = config.batch_size
    num_batches = len(combined_files) // batch_size
    log.info('Finished training.')
    for image_batch, label_batch, _ in tqdm(image_batcher(
            start=0,
            num_batches=num_batches,
            images=combined_files,
            labels=combined_labels,
            batch_size=batch_size),
                                            total=num_batches):
        feed_dict = {
            train_images: image_batch.astype(np.float32),
            train_labels: np.asarray(label_batch).astype(int)
        }
        import ipdb
        ipdb.set_trace()
        start_time = time.time()
        _, loss_value = sess.run([
            train_op,
            train_loss,
        ], feed_dict=feed_dict)

    files_to_save = {
        'training_loss': tr_loss,
        'validation_loss': val_loss,
        'training_acc': tr_accs,
        'validation_acc': val_accs,
        'timesteps': timesteps
    }

    model_name = config.model_struct.replace('/', '_')
    py_utils.save_npys(data=files_to_save,
                       model_name=model_name,
                       output_string=dir_list['experiment_evaluations'])

    # Compare this condition w/ all others.
    plotting.plot_data(train_loss=tr_loss,
                       val_loss=val_loss,
                       model_name=model_name,
                       timesteps=timesteps,
                       config=config,
                       output=os.path.join(dir_list['condition_evaluations'],
                                           'loss'),
                       output_ext='.pdf',
                       data_type='loss')
    plotting.plot_data(tr_accs=tr_accs,
                       val_accs=val_accs,
                       model_name=model_name,
                       timesteps=timesteps,
                       config=config,
                       output=os.path.join(dir_list['condition_evaluations'],
                                           'acc'),
                       output_ext='.pdf',
                       data_type='acc')
    log.info('Completed plots.')