예제 #1
0
def train_vgg16(train_dir=None, validation_dir=None):
    config = GEDIconfig()
    if train_dir is None:  # Use globals
        train_data = os.path.join(config.tfrecord_dir,
                                  config.tf_record_names['train'])
        meta_data = np.load(
            os.path.join(config.tfrecord_dir,
                         '%s_%s' % (config.tvt_flags[0], config.max_file)))
    else:
        meta_data = np.load(
            os.path.join(train_dir,
                         '%s_%s' % (config.tvt_flags[0], config.max_file)))

    # Prepare image normalization values
    if config.max_gedi is None:
        max_value = np.nanmax(meta_data['max_array']).astype(np.float32)
        if max_value == 0:
            max_value = None
            print('Derived max value is 0')
        else:
            print('Normalizing with empirical max.')
        if 'min_array' in meta_data.keys():
            min_value = np.min(meta_data['min_array']).astype(np.float32)
            print('Normalizing with empirical min.')
        else:
            min_value = None
            print('Not normalizing with a min.')
    else:
        max_value = config.max_gedi
        min_value = config.min_gedi
        print('max value', max_value)
    ratio = meta_data['ratio']
    if config.encode_time_of_death:
        tod = pd.read_csv(config.encode_time_of_death)
        tod_data = tod['dead_tp'].as_matrix()
        mask = np.isnan(tod_data).astype(int) + (
            tod['plate_well_neuron'] == 'empty').as_matrix().astype(int)
        tod_data = tod_data[mask == 0]
        tod_data = tod_data[tod_data <
                            10]  # throw away values have a high number
        config.output_shape = len(np.unique(tod_data))
        ratio = class_weight.compute_class_weight('balanced',
                                                  np.sort(np.unique(tod_data)),
                                                  tod_data)
        flip_ratio = False
    else:
        flip_ratio = True
    print('Ratio is: %s' % ratio)

    if validation_dir is None:  # Use globals
        validation_data = os.path.join(config.tfrecord_dir,
                                       config.tf_record_names['val'])
    elif validation_dir is False:
        pass  # Do not use validation data during training

    # Make output directories if they do not exist
    dt_stamp = re.split(
        '\.', str(datetime.now()))[0].\
        replace(' ', '_').replace(':', '_').replace('-', '_')
    dt_dataset = config.which_dataset + '_' + dt_stamp + '/'
    config.train_checkpoint = os.path.join(config.train_checkpoint,
                                           dt_dataset)  # timestamp this run
    out_dir = os.path.join(config.results, dt_dataset)
    dir_list = [
        config.train_checkpoint, config.train_summaries, config.results,
        out_dir
    ]
    [make_dir(d) for d in dir_list]
    # im_shape = get_image_size(config)
    im_shape = config.gedi_image_size

    print('-' * 60)
    print('Training model:' + dt_dataset)
    print('-' * 60)

    # Prepare data on CPU
    assert os.path.exists(train_data)
    assert os.path.exists(validation_data)
    assert os.path.exists(config.vgg16_weight_path)
    with tf.device('/cpu:0'):
        train_images, train_labels = inputs(train_data,
                                            config.train_batch,
                                            im_shape,
                                            config.model_image_size[:2],
                                            max_value=max_value,
                                            min_value=min_value,
                                            train=config.data_augmentations,
                                            num_epochs=config.epochs,
                                            normalize=config.normalize)
        val_images, val_labels = inputs(validation_data,
                                        config.validation_batch,
                                        im_shape,
                                        config.model_image_size[:2],
                                        max_value=max_value,
                                        min_value=min_value,
                                        num_epochs=config.epochs,
                                        normalize=config.normalize)
        tf.summary.image('train images', train_images)
        tf.summary.image('validation images', val_images)

    # Prepare model on GPU
    with tf.device('/gpu:0'):
        with tf.variable_scope('cnn') as scope:
            if config.ordinal_classification == 'ordinal':  # config.output_shape > 2:  # Hardcoded fix for timecourse pred
                vgg_output = config.output_shape * 2
            elif config.ordinal_classification == 'regression':
                vgg_output = 1
            elif config.ordinal_classification is None:
                vgg_output = config.output_shape
            vgg = vgg16.Vgg16(vgg16_npy_path=config.vgg16_weight_path,
                              fine_tune_layers=config.fine_tune_layers)
            train_mode = tf.get_variable(name='training', initializer=True)
            vgg.build(train_images,
                      output_shape=vgg_output,
                      train_mode=train_mode,
                      batchnorm=config.batchnorm_layers)

            # Prepare the cost function
            if config.ordinal_classification == 'ordinal':
                # Encode y w/ k-hot and yhat w/ sigmoid ce. units capture dist.
                enc = tf.concat([
                    tf.reshape(tf.range(0, config.output_shape), [1, -1])
                    for x in range(config.train_batch)
                ],
                                axis=0)
                enc_train_labels = tf.cast(
                    tf.greater_equal(enc, tf.expand_dims(train_labels,
                                                         axis=1)), tf.float32)
                split_labs = tf.split(enc_train_labels,
                                      config.output_shape,
                                      axis=1)
                res_output = tf.reshape(
                    vgg.fc8, [config.train_batch, 2, config.output_shape])
                split_logs = tf.split(res_output, config.output_shape, axis=2)
                if config.balance_cost:
                    cost = tf.add_n([
                        tf.nn.softmax_cross_entropy_with_logits(
                            labels=tf.one_hot(tf.cast(tf.squeeze(s), tf.int32),
                                              2),
                            logits=tf.squeeze(l)) * r
                        for s, l, r in zip(split_labs, split_logs, ratio)
                    ])
                else:
                    cost = tf.add_n([
                        tf.nn.softmax_cross_entropy_with_logits(
                            labels=tf.one_hot(tf.cast(tf.squeeze(s), tf.int32),
                                              2),
                            logits=tf.squeeze(l))
                        for s, l in zip(split_labs, split_logs)
                    ])
                cost = tf.reduce_mean(cost)
            elif config.ordinal_classification == 'regression':
                if config.balance_cost:
                    weight_vec = tf.gather(train_labels, ratio)
                    cost = tf.reduce_mean(
                        tf.pow(
                            (vgg.fc8) - tf.cast(train_labels, tf.float32), 2) *
                        weight_vec)
                else:
                    cost = tf.nn.l2_loss((vgg.fc8) -
                                         tf.cast(train_labels, tf.float32))
            else:
                if config.balance_cost:
                    cost = softmax_cost(vgg.fc8,
                                        train_labels,
                                        ratio=ratio,
                                        flip_ratio=flip_ratio)
                else:
                    cost = softmax_cost(vgg.fc8, train_labels)
            tf.summary.scalar("cost", cost)

            # Finetune the learning rates
            if config.wd_layers is not None:
                _, l2_wd_layers = fine_tune_prepare_layers(
                    tf.trainable_variables(), config.wd_layers)
                l2_wd_layers = [
                    x for x in l2_wd_layers if 'biases' not in x.name
                ]
                cost += (config.wd_penalty *
                         tf.add_n([tf.nn.l2_loss(x) for x in l2_wd_layers]))

            # for all variables in trainable variables
            # print name if there's duplicates you f****d up
            other_opt_vars, ft_opt_vars = fine_tune_prepare_layers(
                tf.trainable_variables(), config.fine_tune_layers)
            if config.optimizer == 'adam':
                train_op = ft_non_optimized(cost, other_opt_vars, ft_opt_vars,
                                            tf.train.AdamOptimizer,
                                            config.hold_lr, config.new_lr)
            elif config.optimizer == 'sgd':
                train_op = ft_non_optimized(cost, other_opt_vars, ft_opt_vars,
                                            tf.train.GradientDescentOptimizer,
                                            config.hold_lr, config.new_lr)

            if config.ordinal_classification == 'ordinal':
                arg_guesses = tf.cast(
                    tf.reduce_sum(tf.squeeze(tf.argmax(res_output, axis=1)),
                                  reduction_indices=[1]), tf.int32)
                train_accuracy = tf.reduce_mean(
                    tf.cast(tf.equal(arg_guesses, train_labels), tf.float32))
            elif config.ordinal_classification == 'regression':
                train_accuracy = tf.reduce_mean(
                    tf.cast(
                        tf.equal(tf.cast(tf.round(vgg.prob), tf.int32),
                                 train_labels), tf.float32))
            else:
                train_accuracy = class_accuracy(
                    vgg.prob, train_labels)  # training accuracy
            tf.summary.scalar("training accuracy", train_accuracy)

            # Setup validation op
            if validation_data is not False:
                scope.reuse_variables()
                # Validation graph is the same as training except no batchnorm
                val_vgg = vgg16.Vgg16(vgg16_npy_path=config.vgg16_weight_path,
                                      fine_tune_layers=config.fine_tune_layers)
                val_vgg.build(val_images, output_shape=vgg_output)
                # Calculate validation accuracy
                if config.ordinal_classification == 'ordinal':
                    val_res_output = tf.reshape(
                        val_vgg.fc8,
                        [config.validation_batch, 2, config.output_shape])
                    val_arg_guesses = tf.cast(
                        tf.reduce_sum(tf.squeeze(
                            tf.argmax(val_res_output, axis=1)),
                                      reduction_indices=[1]), tf.int32)
                    val_accuracy = tf.reduce_mean(
                        tf.cast(tf.equal(val_arg_guesses, val_labels),
                                tf.float32))
                elif config.ordinal_classification == 'regression':
                    val_accuracy = tf.reduce_mean(
                        tf.cast(
                            tf.equal(tf.cast(tf.round(val_vgg.prob), tf.int32),
                                     val_labels), tf.float32))
                else:
                    val_accuracy = class_accuracy(val_vgg.prob, val_labels)
                tf.summary.scalar("validation accuracy", val_accuracy)

    # Set up summaries and saver
    saver = tf.train.Saver(tf.global_variables(),
                           max_to_keep=config.keep_checkpoints)
    summary_op = tf.summary.merge_all()

    # Initialize the graph
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # Need to initialize both of these if supplying num_epochs to inputs
    sess.run(
        tf.group(tf.global_variables_initializer(),
                 tf.local_variables_initializer()))
    summary_dir = os.path.join(config.train_summaries,
                               config.which_dataset + '_' + dt_stamp)
    summary_writer = tf.summary.FileWriter(summary_dir, sess.graph)

    # Set up exemplar threading
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    # Start training loop
    np.save(out_dir + 'meta_info', config)
    step, losses = 0, []  # val_max = 0
    try:
        # print response
        while not coord.should_stop():
            start_time = time.time()
            _, loss_value, train_acc = sess.run(
                [train_op, cost, train_accuracy])
            losses.append(loss_value)
            duration = time.time() - start_time
            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'

            if step % config.validation_steps == 0:
                if validation_data is not False:
                    _, val_acc = sess.run([train_op, val_accuracy])
                else:
                    val_acc -= 1  # Store every checkpoint

                # Summaries
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

                # Training status and validation accuracy
                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; '
                              '%.3f sec/batch) | Training accuracy = %s | '
                              'Validation accuracy = %s | logdir = %s')
                print(format_str %
                      (datetime.now(), step,
                       loss_value, config.train_batch / duration,
                       float(duration), train_acc, val_acc, summary_dir))

                # Save the model checkpoint if it's the best yet
                if 1:  # val_acc >= val_max:
                    saver.save(sess,
                               os.path.join(config.train_checkpoint,
                                            'model_' + str(step) + '.ckpt'),
                               global_step=step)
                    # Store the new max validation accuracy
                    # val_max = val_acc

            else:
                # Training status
                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; '
                              '%.3f sec/batch) | Training accuracy = %s')
                print(format_str %
                      (datetime.now(), step, loss_value, config.train_batch /
                       duration, float(duration), train_acc))
            # End iteration
            step += 1

    except tf.errors.OutOfRangeError:
        print('Done training for %d epochs, %d steps.' % (config.epochs, step))
    finally:
        coord.request_stop()
        np.save(os.path.join(config.tfrecord_dir, 'training_loss'), losses)
    coord.join(threads)
    sess.close()
예제 #2
0
def train_vgg16(train_dir=None, validation_dir=None):
    config = GEDIconfig()
    if train_dir is None:  # Use globals
        train_data = os.path.join(config.tfrecord_dir, 'train.tfrecords')
        meta_data = np.load(
            os.path.join(
                config.tfrecord_dir, config.tvt_flags[0] + '_' +
                config.max_file))
    else:
        meta_data = np.load(
            os.path.join(
                train_dir, config.tvt_flags[0] + '_' + config.max_file))

    # Prepare image normalization values
    max_value = np.nanmax(meta_data['max_array']).astype(np.float32)
    if max_value == 0:
        max_value = None
        print 'Derived max value is 0'
    else:
        print 'Normalizing with empirical max.'
    if 'min_array' in meta_data.keys():
        min_value = np.min(meta_data['min_array']).astype(np.float32)
        print 'Normalizing with empirical min.'
    else:
        min_value = None
        print 'Not normalizing with a min.'
    ratio = meta_data['ratio']
    print 'Ratio is: %s' % ratio

    if validation_dir is None:  # Use globals
        validation_data = os.path.join(config.tfrecord_dir, 'val.tfrecords')
    elif validation_dir is False:
        pass  # Do not use validation data during training

    # Make output directories if they do not exist
    dt_stamp = re.split(
        '\.', str(datetime.now()))[0].\
        replace(' ', '_').replace(':', '_').replace('-', '_')
    dt_dataset = config.which_dataset + '_' + dt_stamp + '/'
    config.train_checkpoint = os.path.join(
        config.train_checkpoint, dt_dataset)  # timestamp this run
    out_dir = os.path.join(config.results, dt_dataset)
    dir_list = [
        config.train_checkpoint, config.train_summaries,
        config.results, out_dir]
    [make_dir(d) for d in dir_list]
    # im_shape = get_image_size(config)
    im_shape = config.gedi_image_size

    print '-'*60
    print('Training model:' + dt_dataset)
    print '-'*60

    # Prepare data on CPU
    with tf.device('/cpu:0'):
        train_images, train_labels = inputs(
            train_data,
            config.train_batch,
            im_shape,
            config.model_image_size[:2],
            max_value=max_value,
            min_value=min_value,
            train=config.data_augmentations,
            num_epochs=config.epochs,
            normalize=config.normalize)
        val_images, val_labels = inputs(
            validation_data,
            config.validation_batch,
            im_shape,
            config.model_image_size[:2],
            max_value=max_value,
            min_value=min_value,
            num_epochs=config.epochs,
            normalize=config.normalize)
        tf.summary.image('train images', train_images)
        tf.summary.image('validation images', val_images)

    # Prepare model on GPU
    num_timepoints = len(config.channel)
    with tf.device('/gpu:0'):
        with tf.variable_scope('cnn') as scope:

            # Prepare the loss function
            if config.balance_cost:
                cost_fun = lambda yhat, y: softmax_cost(yhat, y, ratio=ratio)
            else:
                cost_fun = lambda yhat, y: softmax_cost(yhat, y)

            def cond(i, state, images, output, loss, num_timepoints):  # NOT CORRECT
                return tf.less(i, num_timepoints)

            def body(
                    i,
                    images,
                    label,
                    cell,
                    state,
                    output,
                    loss,
                    vgg,
                    train_mode,
                    output_shape,
                    batchnorm_layers,
                    cost_fun,
                    score_layer='fc7'):
                vgg.build(
                    images[i],
                    output_shape=config.output_shape,
                    train_mode=train_mode,
                    batchnorm=config.batchnorm_layers)
                    it_output, state = cell(vgg[score_layer], state)
                    output= output.write(i, it_output)
                    it_loss = cost_fun(output, label)
                    loss = loss.write(i, it_loss)
                return (i+1, images, label, cell, state, output, loss, vgg, train_mode, output_shape, batchnorm_layers) 

            # Prepare LSTM loop
            train_mode = tf.get_variable(name='training', initializer=True)
            cell = lstm_layers(layer_units=config.lstm_units)
            output = tf.TensorArray(tf.float32, num_timepoints) # output array for LSTM loop -- one slot per timepoint
            loss = tf.TensorArray(tf.float32, num_timepoints)  # loss for LSTM loop -- one slot per timepoint
            i = tf.constant(0)  # iterator for LSTM loop
            loop_vars = [i, images, label, cell, state, output, loss, vgg, train_mode, output_shape, batchnorm_layers, cost_fun]

            # Run the lstm
            processed_list = tf.while_loop(
                cond=cond,
                body=body,
                loop_vars=loop_vars,
                back_prop=True,
                swap_memory=False)
            output_cell = processed_list[3]
            output_state = processed_list[4]
            output_activity = processed_list[5]
            cost = processed_list[6]

            # Optimize
            combined_cost = tf.reduce_sum(cost)
            tf.summary.scalar('cost', combined_cost)
            import ipdb;ipdb.set_trace()  # Need to make sure lstm weights are being trained
            other_opt_vars, ft_opt_vars = fine_tune_prepare_layers(
                tf.trainable_variables(), config.fine_tune_layers)
            if config.optimizer == 'adam':
                train_op = ft_non_optimized(
                    cost, other_opt_vars, ft_opt_vars,
                    tf.train.AdamOptimizer, config.hold_lr, config.new_lr)
            elif config.optimizer == 'sgd':
                train_op = ft_non_optimized(
                    cost, other_opt_vars, ft_opt_vars,
                    tf.train.GradientDescentOptimizer,
                    config.hold_lr, config.new_lr)

            train_accuracy = class_accuracy(
                vgg.prob, train_labels)  # training accuracy
            tf.summary.scalar("training accuracy", train_accuracy)

            # Setup validation op
            if validation_data is not False:
                scope.reuse_variables()
                # Validation graph is the same as training except no batchnorm
                val_vgg = vgg16.Vgg16(
                    vgg16_npy_path=config.vgg16_weight_path,
                    fine_tune_layers=config.fine_tune_layers)
                val_vgg.build(val_images, output_shape=config.output_shape)
                # Calculate validation accuracy
                val_accuracy = class_accuracy(val_vgg.prob, val_labels)
                tf.summary.scalar("validation accuracy", val_accuracy)
예제 #3
0
def train_vgg16(train_dir=None, validation_dir=None):
    config = GEDIconfig()
    if train_dir is None:  # Use globals
        train_data = os.path.join(
            config.tfrecord_dir,
            config.tf_record_names['train'])
        meta_data = np.load(
            os.path.join(
                config.tfrecord_dir,
                '%s_%s' % (config.tvt_flags[0], config.max_file)))
    else:
        meta_data = np.load(
            os.path.join(
                train_dir,
                '%s_%s' % (config.tvt_flags[0], config.max_file)))

    # Prepare image normalization values
    if config.max_gedi is None:
        max_value = np.nanmax(meta_data['max_array']).astype(np.float32)
        if max_value == 0:
            max_value = None
            print 'Derived max value is 0'
        else:
            print 'Normalizing with empirical max.'
        if 'min_array' in meta_data.keys():
            min_value = np.min(meta_data['min_array']).astype(np.float32)
            print 'Normalizing with empirical min.'
        else:
            min_value = None
            print 'Not normalizing with a min.'
    else:
        max_value = config.max_gedi
        min_value = config.min_gedi
    ratio = meta_data['ratio']
    if config.encode_time_of_death:
        tod = pd.read_csv(config.encode_time_of_death)
        tod_data = tod['dead_tp'].as_matrix()
        mask = np.isnan(tod_data).astype(int) + (
            tod['plate_well_neuron'] == 'empty').as_matrix().astype(int)
        tod_data = tod_data[mask == 0]
        tod_data = tod_data[tod_data > config.mask_timepoint_value]
        config.output_shape = len(np.unique(tod_data))
        ratio = class_weight.compute_class_weight(
            'balanced',
            np.sort(np.unique(tod_data)),
            tod_data)
        flip_ratio = False
    else:
        flip_ratio = True
    print 'Ratio is: %s' % ratio

    if validation_dir is None:  # Use globals
        validation_data = os.path.join(
            config.tfrecord_dir,
            config.tf_record_names['val'])
    elif validation_dir is False:
        pass  # Do not use validation data during training

    # Make output directories if they do not exist
    dt_stamp = re.split(
        '\.', str(datetime.now()))[0].\
        replace(' ', '_').replace(':', '_').replace('-', '_')
    dt_dataset = config.which_dataset + '_' + dt_stamp + '/'
    config.train_checkpoint = os.path.join(
        config.train_checkpoint, dt_dataset)  # timestamp this run
    out_dir = os.path.join(config.results, dt_dataset)
    dir_list = [
        config.train_checkpoint, config.train_summaries,
        config.results, out_dir]
    [make_dir(d) for d in dir_list]
    # im_shape = get_image_size(config)
    im_shape = config.gedi_image_size

    print '-'*60
    print('Training model:' + dt_dataset)
    print '-'*60

    # Prepare data on CPU
    assert os.path.exists(train_data)
    assert os.path.exists(validation_data)
    assert os.path.exists(config.vgg16_weight_path)
    with tf.device('/cpu:0'):
        train_images, train_labels, train_gedi_images = inputs(
            train_data,
            config.train_batch,
            im_shape,
            config.model_image_size[:2],
            max_value=max_value,
            min_value=min_value,
            train=config.data_augmentations,
            num_epochs=config.epochs,
            normalize=config.normalize,
            return_gedi=config.include_GEDI_in_tfrecords,
            return_extra_gfp=config.extra_image,
            return_GEDI_derivative=True)
        val_images, val_labels, val_gedi_images = inputs(
            validation_data,
            config.validation_batch,
            im_shape,
            config.model_image_size[:2],
            max_value=max_value,
            min_value=min_value,
            num_epochs=config.epochs,
            normalize=config.normalize,
            return_gedi=config.include_GEDI_in_tfrecords,
            return_extra_gfp=config.extra_image,
            return_GEDI_derivative=True)
        if config.include_GEDI_in_tfrecords:
            extra_im_name = 'GEDI at current timepoint'
        else:
            extra_im_name = 'next gfp timepoint'
        tf.summary.image('train images', train_images)
        tf.summary.image('validation images', val_images)
        tf.summary.image('train %s' % extra_im_name, train_gedi_images)
        tf.summary.image('validation %s' % extra_im_name, val_gedi_images)

    # Prepare model on GPU
    with tf.device('/gpu:0'):
        with tf.variable_scope('cnn') as scope:
            if config.ordinal_classification is None:
                vgg_output = 2  # Sign of derivative (inf norm)
                train_labels = tf.cast(tf.sign(train_labels), tf.int32)
                val_labels = tf.cast(tf.sign(val_labels), tf.int32)
            elif config.ordinal_classification == 'regression':
                vgg_output = 1
            else:
                raise RuntimeError(
                    'config.ordinal_classification must be sign or regression.'
                    )
            vgg = vgg16.model_struct()
            train_mode = tf.get_variable(name='training', initializer=True)

            # Mask NAN images from loss
            image_nan = tf.reduce_sum(
                tf.cast(tf.is_nan(train_images), tf.float32),
                reduction_indices=[1, 2, 3])
            gedi_nan = tf.reduce_sum(
                tf.cast(tf.is_nan(train_gedi_images), tf.float32),
                reduction_indices=[1, 2, 3],
                keep_dims=True)
            image_mask = tf.cast(tf.equal(image_nan, 0.), tf.float32)
            gedi_nan = tf.cast(tf.equal(gedi_nan, 0.), tf.float32)
            train_images = tf.where(
                tf.is_nan(train_images),
                tf.zeros_like(train_images),
                train_images)
            train_gedi_images = tf.where(
                tf.is_nan(train_gedi_images),
                tf.zeros_like(train_gedi_images),
                train_gedi_images)
            train_images = tf.concat([train_images, train_images, train_images], axis=3)
            val_images = tf.concat([val_images, val_images, val_images], axis=3)
            vgg.build(
                train_images, output_shape=vgg_output,
                train_mode=train_mode, batchnorm=config.batchnorm_layers)
            # Prepare the cost function
            if config.ordinal_classification is None:
                # Encode y w/ k-hot and yhat w/ sigmoid ce. units capture dist.
                cost = softmax_cost(
                    vgg.fc8,
                    train_labels,
                    mask=image_mask)
            elif config.ordinal_classification == 'regression':
                cost = tf.nn.l2_loss(tf.squeeze(vgg.fc8) - train_labels)

            class_loss = cost
            tf.summary.scalar("cce cost", cost)

            # Weight decay
            if config.wd_layers is not None:
                _, l2_wd_layers = fine_tune_prepare_layers(
                    tf.trainable_variables(), config.wd_layers)
                l2_wd_layers = [
                    x for x in l2_wd_layers if 'biases' not in x.name]
                if len(l2_wd_layers) > 0:
                    cost += (config.wd_penalty * tf.add_n(
                        [tf.nn.l2_loss(x) for x in l2_wd_layers]))

            # Optimize
            train_op = tf.train.AdamOptimizer(config.new_lr).minimize(cost)
            if config.ordinal_classification is None:
                train_accuracy = class_accuracy(
                    vgg.prob, train_labels)  # training accuracy
            elif config.ordinal_classification == 'regression':
                train_accuracy = tf.nn.l2_loss(
                    tf.squeeze(vgg.fc8) - train_labels)
            tf.summary.scalar("training accuracy", train_accuracy)

            # Setup validation op
            if validation_data is not False:
                scope.reuse_variables()
                # Validation graph is the same as training except no batchnorm
                val_vgg = vgg16.model_struct(
                    fine_tune_layers=config.fine_tune_layers)
                val_vgg.build(val_images, output_shape=vgg_output)
                # Calculate validation accuracy
                if config.ordinal_classification is None:
                    val_accuracy = class_accuracy(val_vgg.prob, val_labels)
                elif config.ordinal_classification == 'regression':
                    val_accuracy = tf.nn.l2_loss(tf.squeeze(val_vgg.fc8) - val_labels)
                tf.summary.scalar("validation accuracy", val_accuracy)

    # Set up summaries and saver
    saver = tf.train.Saver(
        tf.global_variables(), max_to_keep=config.keep_checkpoints)
    summary_op = tf.summary.merge_all()

    # Initialize the graph
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # Need to initialize both of these if supplying num_epochs to inputs
    sess.run(tf.group(tf.global_variables_initializer(),
             tf.local_variables_initializer()))
    summary_dir = os.path.join(
        config.train_summaries, config.which_dataset + '_' + dt_stamp)
    summary_writer = tf.summary.FileWriter(summary_dir, sess.graph)

    # Set up exemplar threading
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    # Restore model if requested
    if config.restore_path is not None:
        print '-' * 60
        print 'Restoring from a previous model: %s' % config.restore_path
        print '-' * 60
        saver.restore(sess, config.restore_path)

    # Start training loop
    np.save(out_dir + 'meta_info', config)
    step, losses = 0, []  # val_max = 0

    try:
        # print response
        while not coord.should_stop():
            start_time = time.time()
            _, loss_value, train_acc = sess.run(
                [train_op, cost, train_accuracy])
            losses.append(loss_value)
            duration = time.time() - start_time
            if np.isnan(loss_value).sum():
                import ipdb;ipdb.set_trace()
            assert not np.isnan(loss_value), 'Model diverged with loss = NaN'
            if step % config.validation_steps == 0:
                if validation_data is not False:
                    _, val_acc = sess.run([train_op, val_accuracy])
                else:
                    val_acc -= 1  # Store every checkpoint

                # Summaries
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

                # Training status and validation accuracy
                format_str = (
                    '%s: step %d, loss = %.2f (%.1f examples/sec; '
                    '%.3f sec/batch) | Training accuracy = %s | '
                    'Training %s = %s | Training class loss = %s | '
                    'Validation accuracy = %s | Validation %s = %s | '
                    'logdir = %s')
                print (format_str % (
                    datetime.now(), step, loss_value,
                    config.train_batch / duration, float(duration),
                    train_acc, extra_im_name, 0.,
                    0., val_acc, extra_im_name,
                    0., summary_dir))

                # Save the model checkpoint if it's the best yet
                if 1:  # val_acc >= val_max:
                    saver.save(
                        sess, os.path.join(
                            config.train_checkpoint,
                            'model_' + str(step) + '.ckpt'), global_step=step)
                    # Store the new max validation accuracy
                    # val_max = val_acc

            else:
                # Training status
                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; '
                              '%.3f sec/batch) | Training accuracy = %s | '
                              'Training %s = %s | Training class loss = %s')
                print (format_str % (datetime.now(), step, loss_value,
                                     config.train_batch / duration,
                                     float(duration), train_acc,
                                     extra_im_name, 0.,
                                     0.))
            # End iteration
            step += 1

    except tf.errors.OutOfRangeError:
        print('Done training for %d epochs, %d steps.' % (config.epochs, step))
    finally:
        coord.request_stop()
        np.save(os.path.join(config.tfrecord_dir, 'training_loss'), losses)
    coord.join(threads)
    sess.close()
예제 #4
0
def train_model(train_dir=None, validation_dir=None):
    config = GEDIconfig()
    if train_dir is None:  # Use globals
        train_data = os.path.join(config.tfrecord_dir,
                                  config.tf_record_names['train'])
        meta_data = np.load(
            os.path.join(config.tfrecord_dir,
                         '%s_%s' % (config.tvt_flags[0], config.max_file)))
    else:
        meta_data = np.load(
            os.path.join(train_dir,
                         '%s_%s' % (config.tvt_flags[0], config.max_file)))

    # Prepare image normalization values
    if config.max_gedi is None:
        max_value = np.nanmax(meta_data['max_array']).astype(np.float32)
        if max_value == 0:
            max_value = None
            print 'Derived max value is 0'
        else:
            print 'Normalizing with empirical max.'
        if 'min_array' in meta_data.keys():
            min_value = np.min(meta_data['min_array']).astype(np.float32)
            print 'Normalizing with empirical min.'
        else:
            min_value = None
            print 'Not normalizing with a min.'
    else:
        max_value = config.max_gedi
        min_value = config.min_gedi
    ratio = meta_data['ratio']
    print 'Ratio is: %s' % ratio

    if validation_dir is None:  # Use globals
        validation_data = os.path.join(config.tfrecord_dir,
                                       config.tf_record_names['val'])
    elif validation_dir is False:
        pass  # Do not use validation data during training

    # Make output directories if they do not exist
    dt_stamp = re.split(
        '\.', str(datetime.now()))[0].\
        replace(' ', '_').replace(':', '_').replace('-', '_')
    dt_dataset = config.which_dataset + '_' + dt_stamp + '/'
    config.train_checkpoint = os.path.join(config.train_checkpoint,
                                           dt_dataset)  # timestamp this run
    out_dir = os.path.join(config.results, dt_dataset)
    dir_list = [
        config.train_checkpoint, config.train_summaries, config.results,
        out_dir
    ]
    [make_dir(d) for d in dir_list]
    # im_shape = get_image_size(config)
    im_shape = config.gedi_image_size

    print '-' * 60
    print('Training model:' + dt_dataset)
    print '-' * 60

    # Prepare data on CPU
    assert os.path.exists(train_data)
    assert os.path.exists(validation_data)
    assert os.path.exists(config.vgg16_weight_path)
    with tf.device('/cpu:0'):
        train_images_0, train_images_1, train_labels, train_times = inputs(
            train_data,
            config.train_batch,
            im_shape,
            config.model_image_size[:2],
            max_value=max_value,
            min_value=min_value,
            train=config.data_augmentations,
            num_epochs=config.epochs,
            normalize=config.normalize,
            return_filename=True)
        val_images_0, val_images_1, val_labels, val_times = inputs(
            validation_data,
            config.validation_batch,
            im_shape,
            config.model_image_size[:2],
            max_value=max_value,
            min_value=min_value,
            num_epochs=config.epochs,
            normalize=config.normalize,
            return_filename=True)
        tf.summary.image('train image frame 0', train_images_0)
        tf.summary.image('train image frame 1', train_images_1)
        tf.summary.image('validation image frame 0', val_images_0)
        tf.summary.image('validation image frame 1', val_images_1)

    # Prepare model on GPU
    with tf.device('/gpu:0'):
        with tf.variable_scope('gedi'):
            # Build training GEDI model for frame 0
            vgg_train_mode = tf.get_variable(name='vgg_training',
                                             initializer=False)
            gedi_model_0 = vgg16.model_struct(
                vgg16_npy_path=config.gedi_weight_path, trainable=False)
            gedi_model_0.build(prep_images_for_gedi(train_images_0),
                               output_shape=2,
                               train_mode=vgg_train_mode)
            gedi_scores_0 = gedi_model_0.fc7

        with tf.variable_scope('match'):
            # Build matching model for frame 0
            model_0 = matching_gedi.model_struct()
            model_0.build(train_images_0)

            # Build frame 0 vector
            frame_0 = tf.concat([gedi_scores_0, model_0.output], axis=-1)

            # Build output layer
            if config.matching_combine == 'concatenate':
                output_shape = [int(frame_0.get_shape()[-1]) * 2, 2]
            elif config.matching_combine == 'subtract':
                output_shape = [int(frame_0.get_shape()[-1]), 2]
            else:
                raise RuntimeError

        # Build GEDI model for frame 1
        with tf.variable_scope('gedi', reuse=True):
            gedi_model_1 = vgg16.model_struct(
                vgg16_npy_path=config.gedi_weight_path, trainable=False)
            gedi_model_1.build(prep_images_for_gedi(train_images_1),
                               output_shape=2,
                               train_mode=vgg_train_mode)
            gedi_scores_1 = gedi_model_1.fc7

        with tf.variable_scope('match', reuse=True):
            # Build matching model for frame 1
            model_1 = matching_gedi.model_struct()
            model_1.build(train_images_1)

        # Build frame 0 and frame 1 vectors
        frame_1 = tf.concat([gedi_scores_1, model_1.output], axis=-1)

        with tf.variable_scope('output'):
            # Concatenate or subtract
            if config.matching_combine == 'concatenate':
                output_scores = tf.concat([frame_0, frame_1], axis=-1)
            elif config.matching_combine == 'subtract':
                output_scores = frame_0 - frame_1
            else:
                raise NotImplementedError

            # Build output layer
            output_shape = [int(output_scores.get_shape()[-1]), 2]
            output_weights = tf.get_variable(
                name='output_weights',
                shape=output_shape,
                initializer=tf.contrib.layers.xavier_initializer(
                    uniform=False))
            output_bias = tf.get_variable(name='output_bias',
                                          initializer=tf.truncated_normal(
                                              [output_shape[-1]], .0, .001))
            decision_logits = tf.nn.bias_add(
                tf.matmul(output_scores, output_weights), output_bias)
            train_soft_decisions = tf.nn.softmax(decision_logits)
            cost = softmax_cost(decision_logits, train_labels)
            tf.summary.scalar("cce loss", cost)
            cost += tf.nn.l2_loss(output_weights)

            # Weight decay
            if config.wd_layers is not None:
                _, l2_wd_layers = fine_tune_prepare_layers(
                    tf.trainable_variables(), config.wd_layers)
                l2_wd_layers = [
                    x for x in l2_wd_layers if 'biases' not in x.name
                ]
                if len(l2_wd_layers) > 0:
                    cost += (config.wd_penalty *
                             tf.add_n([tf.nn.l2_loss(x)
                                       for x in l2_wd_layers]))

        # Optimize
        train_op = tf.train.AdamOptimizer(config.new_lr).minimize(cost)
        train_accuracy = class_accuracy(train_soft_decisions,
                                        train_labels)  # training accuracy
        tf.summary.scalar("training accuracy", train_accuracy)

        # Setup validation op
        if validation_data is not False:
            with tf.variable_scope('gedi', reuse=tf.AUTO_REUSE):  # FIX THIS
                # Validation graph is the same as training except no batchnorm
                val_gedi_model_0 = vgg16.model_struct(
                    vgg16_npy_path=config.gedi_weight_path)
                val_gedi_model_0.build(prep_images_for_gedi(val_images_0),
                                       output_shape=2,
                                       train_mode=vgg_train_mode)
                val_gedi_scores_0 = val_gedi_model_0.fc7

                # Build GEDI model for frame 1
                val_gedi_model_1 = vgg16.model_struct(
                    vgg16_npy_path=config.gedi_weight_path)
                val_gedi_model_1.build(prep_images_for_gedi(val_images_1),
                                       output_shape=2,
                                       train_mode=vgg_train_mode)
                val_gedi_scores_1 = val_gedi_model_1.fc7

            with tf.variable_scope('match', reuse=tf.AUTO_REUSE):
                # Build matching model for frame 0
                val_model_0 = matching_gedi.model_struct()
                val_model_0.build(val_images_0)

                # Build matching model for frame 1
                val_model_1 = matching_gedi.model_struct()
                val_model_1.build(val_images_1)

            # Build frame 0 and frame 1 vectors
            val_frame_0 = tf.concat([val_gedi_scores_0, val_model_0.output],
                                    axis=-1)
            val_frame_1 = tf.concat([val_gedi_scores_1, val_model_1.output],
                                    axis=-1)

            # Concatenate or subtract
            if config.matching_combine == 'concatenate':
                val_output_scores = tf.concat([val_frame_0, val_frame_1],
                                              axis=-1)
            elif config.matching_combine == 'subtract':
                val_output_scores = val_frame_0 - val_frame_1
            else:
                raise NotImplementedError

            with tf.variable_scope('output', reuse=tf.AUTO_REUSE):
                # Build output layer
                val_output_weights = tf.get_variable(
                    name='val_output_weights',
                    shape=output_shape,
                    trainable=False,
                    initializer=tf.contrib.layers.xavier_initializer(
                        uniform=False))
                val_output_bias = tf.get_variable(
                    name='output_bias',
                    trainable=False,
                    initializer=tf.truncated_normal([output_shape[-1]], .0,
                                                    .001))
                val_decision_logits = tf.nn.bias_add(
                    tf.matmul(val_output_scores, val_output_weights),
                    val_output_bias)
                val_soft_decisions = tf.nn.softmax(val_decision_logits)

            # Calculate validation accuracy
            val_accuracy = class_accuracy(val_soft_decisions, val_labels)
            tf.summary.scalar("validation accuracy", val_accuracy)

    # Set up summaries and saver
    saver = tf.train.Saver(tf.global_variables(),
                           max_to_keep=config.keep_checkpoints)
    summary_op = tf.summary.merge_all()

    # Initialize the graph
    sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True))
    # Need to initialize both of these if supplying num_epochs to inputs
    sess.run(
        tf.group(tf.global_variables_initializer(),
                 tf.local_variables_initializer()))
    summary_dir = os.path.join(config.train_summaries,
                               config.which_dataset + '_' + dt_stamp)
    summary_writer = tf.summary.FileWriter(summary_dir, sess.graph)

    # Set up exemplar threading
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(sess=sess, coord=coord)

    # Start training loop
    np.save(out_dir + 'meta_info', config)
    step, losses = 0, []  # val_max = 0
    try:
        # print response
        while not coord.should_stop():
            start_time = time.time()
            _, loss_value, train_acc, val_acc = sess.run(
                [train_op, cost, train_accuracy, val_accuracy])
            losses += [loss_value]
            duration = time.time() - start_time
            if np.isnan(loss_value).sum():
                assert not np.isnan(loss_value), 'Model loss = NaN'

            if step % config.validation_steps == 0:
                if validation_data is not False:
                    val_acc = sess.run(val_accuracy)
                else:
                    val_acc -= 1  # Store every checkpoint

                # Summaries
                summary_str = sess.run(summary_op)
                summary_writer.add_summary(summary_str, step)

                # Training status and validation accuracy
                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; '
                              '%.3f sec/batch) | Training accuracy = %s | '
                              'Validation accuracy = %s | '
                              'logdir = %s')
                print(format_str %
                      (datetime.now(), step,
                       loss_value, config.train_batch / duration,
                       float(duration), train_acc, val_acc, summary_dir))

                # Save the model checkpoint if it's the best yet
                if 1:  # val_acc >= val_max:
                    saver.save(sess,
                               os.path.join(config.train_checkpoint,
                                            'model_' + str(step) + '.ckpt'),
                               global_step=step)
                    # Store the new max validation accuracy
                    # val_max = val_acc

            else:
                # Training status
                format_str = ('%s: step %d, loss = %.2f (%.1f examples/sec; '
                              '%.3f sec/batch) | Training accuracy = %s | '
                              'Training loss = %s')
                print(format_str %
                      (datetime.now(), step, loss_value, config.train_batch /
                       duration, float(duration), loss_value))
            # End iteration
            step += 1

    except tf.errors.OutOfRangeError:
        print('Done training for %d epochs, %d steps.' % (config.epochs, step))
    finally:
        coord.request_stop()
        np.save(os.path.join(config.tfrecord_dir, 'training_loss'), losses)
    coord.join(threads)
    sess.close()