Exemplo n.º 1
0
    def test_siamese_data_generator_invalid_data(self):
        """Verify that SiameseDataGenerator rejects malformed data and arguments.

        Exercises ``fit`` and ``flow`` with badly shaped or incomplete inputs,
        and the constructor with an unknown ``data_format`` and a 3-element
        ``zoom_range``.
        """
        feature_names = ['appearance', 'distance', 'neighborhood', 'regionprop']

        datagen = image_generators.SiameseDataGenerator(
            featurewise_center=True,
            samplewise_center=True,
            featurewise_std_normalization=True,
            samplewise_std_normalization=True,
            zca_whitening=True,
            data_format='channels_last')

        # fit() on a 3-D array is expected to raise.
        bad_fit_input = np.random.random((3, 10, 10))
        with self.assertRaises(ValueError):
            datagen.fit(bad_fit_input)

        # flow() with 3-D X and y is expected to raise.
        low_rank_dict = {
            'X': np.random.random((8, 10, 10)),
            'y': np.random.random((8, 10, 10)),
            'daughters': {}
        }
        with self.assertRaises(ValueError):
            datagen.flow(low_rank_dict, features=feature_names)

        # flow() with a different number of batches in X and y must fail;
        # any exception type is accepted here.
        mismatched_dict = {
            'X': np.random.random((8, 11, 10, 10, 1)),
            'y': np.random.random((7, 11, 10, 10, 1)),
            'daughters': {}
        }
        with self.assertRaises(Exception):
            datagen.flow(mismatched_dict, features=feature_names)

        # flow() without a 'daughters' entry is expected to raise.
        # NOTE(review): X and y also differ in batch size here (8 vs 7), so the
        # ValueError may come from that check rather than the missing key.
        no_daughters_dict = {
            'X': np.random.random((8, 11, 10, 10, 1)),
            'y': np.random.random((7, 11, 10, 10, 1)),
        }
        with self.assertRaises(ValueError):
            datagen.flow(no_daughters_dict, features=feature_names)

        # Invalid number of channels: will work but raise a warning
        datagen.fit(np.random.random((8, 10, 10, 5)))

        # An unrecognised data_format must be rejected by the constructor.
        with self.assertRaises(ValueError):
            image_generators.SiameseDataGenerator(data_format='unknown')

        # A 2-element zoom_range is accepted; a 3-element one is rejected.
        image_generators.SiameseDataGenerator(zoom_range=(2, 2))
        with self.assertRaises(ValueError):
            image_generators.SiameseDataGenerator(zoom_range=(2, 2, 2))
Exemplo n.º 2
0
    def test_siamese_data_generator(self):
        """Smoke-test SiameseDataGenerator.flow on synthetic multi-frame stacks.

        Each test image is repeated ``n_frames`` times to build a
        (batch, frame, ...) array, a generator with every augmentation option
        enabled is created, and ``flow`` is called on random data before any
        ``fit``.
        """
        n_frames = 5
        feature_names = ['appearance', 'distance', 'neighborhood', 'regionprop']

        def make_daughters(n_batches):
            # One lineage dict per batch; the content is the same fixed
            # placeholder for every batch.
            return [{j: [{1: [2, 3]}] for j in range(1, 4)}
                    for _ in range(n_batches)]

        # TODO: image generator should handle RGB as well as grayscale
        for test_images in _generate_test_images()[1:]:
            # Repeat every image n_frames times along a new leading axis.
            per_image_stacks = [
                np.vstack([img_to_array(im)[None, ...] for _ in range(n_frames)])
                for im in test_images
            ]

            images = np.vstack(per_image_stacks)
            n_batches = images.shape[0] // n_frames
            images = np.reshape(
                images, (n_batches, n_frames) + tuple(images.shape[1:]))

            datagen = image_generators.SiameseDataGenerator(
                featurewise_center=True,
                samplewise_center=True,
                featurewise_std_normalization=True,
                samplewise_std_normalization=True,
                zca_whitening=True,
                rotation_range=90.,
                width_shift_range=0.1,
                height_shift_range=0.1,
                shear_range=0.5,
                zoom_range=0.2,
                channel_shift_range=1.,
                brightness_range=(1, 5),
                fill_mode='nearest',
                cval=0.5,
                horizontal_flip=True,
                vertical_flip=True)

            # Basic test before fit
            # TODO: image generator should handle RGB as well as grayscale
            train_dict = {
                'X': np.random.random((8, 5, 10, 10, 1)),
                'y': np.random.randint(low=0, high=4, size=(8, 5, 10, 10, 1)),
                'daughters': make_daughters(8),
            }
            datagen.flow(train_dict, features=feature_names)

            # Temp dir to save generated images (not used in the visible part
            # of this test).
            temp_dir = self.get_temp_dir()

            # Swap in the real image stack with a single-channel label array.
            y_shape = tuple(images.shape[:-1]) + (1,)
            train_dict['X'] = images
            train_dict['y'] = np.random.randint(low=0, high=4, size=y_shape)
            train_dict['daughters'] = make_daughters(y_shape[0])
Exemplo n.º 3
0
def train_model_siamese_daughter(model,
                                 dataset,
                                 expt='',
                                 test_size=.1,
                                 n_epoch=100,
                                 batch_size=1,
                                 num_gpus=None,
                                 crop_dim=32,
                                 min_track_length=1,
                                 neighborhood_scale_size=10,
                                 features=None,
                                 optimizer=SGD(lr=0.01,
                                               decay=1e-6,
                                               momentum=0.9,
                                               nesterov=True),
                                 log_dir='/data/tensorboard_logs',
                                 model_dir='/data/models',
                                 model_name=None,
                                 focal=False,
                                 gamma=0.5,
                                 lr_sched=rate_scheduler(lr=0.01, decay=0.95),
                                 rotation_range=0,
                                 flip=True,
                                 shear=0,
                                 zoom_range=0,
                                 seed=None,
                                 **kwargs):
    """Train a siamese tracking model using ``SiameseDataGenerator`` batches.

    Args:
        model: Keras model to compile and train. The size of its last layer's
            output along the channel axis is used as the number of classes.
        dataset: Path of the training data file, passed to ``get_data``. Its
            base name is used in the generated model name.
        expt: Suffix appended to the generated model name.
        test_size: Forwarded to ``get_data``.
        n_epoch: Number of training epochs.
        batch_size: Batch size; multiplied by ``num_gpus`` when two or more
            GPUs are used.
        num_gpus: Number of GPUs. ``None`` means ``train_utils.count_gpus()``.
        crop_dim: Forwarded to the generators' ``flow``.
        min_track_length: Forwarded to the generators' ``flow``.
        neighborhood_scale_size: Forwarded to the generators' ``flow`` and
            used in the generated model name.
        features: Feature names forwarded to the generators' ``flow``. The
            first letter of each one appears in the generated model name.
        optimizer: Optimizer passed to ``model.compile``.
        log_dir: Parent directory for TensorBoard logs.
        model_dir: Directory for the ``.h5`` and ``.npz`` output files.
        model_name: Base name of the output files; generated when ``None``.
        focal: Use the weighted focal loss instead of weighted categorical
            cross-entropy.
        gamma: ``gamma`` argument of the focal loss.
        lr_sched: Schedule function given to ``LearningRateScheduler``.
        rotation_range: Augmentation setting for the training generator.
        flip: Enables horizontal and vertical flips in the training generator.
        shear: ``shear_range`` of the training generator.
        zoom_range: Augmentation setting for the training generator.
        seed: Forwarded to ``get_data`` and used in the generated model name.
        **kwargs: Accepted but not used by this function.

    Returns:
        The trained model (the ``MultiGpuModel`` wrapper when ``num_gpus >= 2``).
    """
    # NOTE(review): the default `optimizer` is created once, when the function
    # is defined, so calls that rely on the default share a single instance.
    is_channels_first = K.image_data_format() == 'channels_first'

    if model_name is None:
        todays_date = datetime.datetime.now().strftime('%Y-%m-%d')
        data_name = os.path.splitext(os.path.basename(dataset))[0]
        # `features` defaults to None; sorting None would raise a TypeError
        # here, so treat None as "no features" for naming purposes only.
        named_features = () if features is None else features
        feature_initials = ','.join(f[0] for f in sorted(named_features))
        model_name = '{}_{}_[{}]_neighs={}_epochs={}_seed={}_{}'.format(
            todays_date, data_name, feature_initials,
            neighborhood_scale_size, n_epoch, seed, expt)
    model_path = os.path.join(model_dir, '{}.h5'.format(model_name))
    loss_path = os.path.join(model_dir, '{}.npz'.format(model_name))

    print('training on dataset:', dataset)
    print('saving model at:', model_path)
    print('saving loss at:', loss_path)

    train_dict, val_dict = get_data(dataset,
                                    mode='siamese_daughters',
                                    seed=seed,
                                    test_size=test_size)

    # the data, shuffled and split between train and test sets
    print('X_train shape:', train_dict['X'].shape)
    print('y_train shape:', train_dict['y'].shape)
    print('X_test shape:', val_dict['X'].shape)
    print('y_test shape:', val_dict['y'].shape)
    print('Output Shape:', model.layers[-1].output_shape)

    n_classes = model.layers[-1].output_shape[1 if is_channels_first else -1]

    def loss_function(y_true, y_pred):
        """Weighted focal loss when `focal` is set, else weighted cross-entropy."""
        if focal:
            return losses.weighted_focal_loss(y_true,
                                              y_pred,
                                              gamma=gamma,
                                              n_classes=n_classes,
                                              from_logits=False)
        return losses.weighted_categorical_crossentropy(y_true,
                                                        y_pred,
                                                        n_classes=n_classes,
                                                        from_logits=False)

    if num_gpus is None:
        num_gpus = train_utils.count_gpus()

    if num_gpus >= 2:
        # Scale the batch with the GPU count and wrap the model.
        batch_size = batch_size * num_gpus
        model = train_utils.MultiGpuModel(model, num_gpus)

    print('Training on {} GPUs'.format(num_gpus))

    model.compile(loss=loss_function,
                  optimizer=optimizer,
                  metrics=['accuracy'])

    print('Using real-time data augmentation.')

    # this will do preprocessing and realtime data augmentation
    datagen = image_generators.SiameseDataGenerator(
        rotation_range=rotation_range,
        shear_range=shear,
        zoom_range=zoom_range,
        horizontal_flip=flip,
        vertical_flip=flip)

    # Validation data is generated with every augmentation switched off.
    datagen_val = image_generators.SiameseDataGenerator(rotation_range=0,
                                                        zoom_range=0,
                                                        shear_range=0,
                                                        horizontal_flip=0,
                                                        vertical_flip=0)

    total_train_pairs = tracking_utils.count_pairs(train_dict['y'],
                                                   same_probability=5.0)
    total_test_pairs = tracking_utils.count_pairs(val_dict['y'],
                                                  same_probability=5.0)

    train_data = datagen.flow(train_dict,
                              crop_dim=crop_dim,
                              batch_size=batch_size,
                              min_track_length=min_track_length,
                              neighborhood_scale_size=neighborhood_scale_size,
                              features=features)

    val_data = datagen_val.flow(
        val_dict,
        crop_dim=crop_dim,
        batch_size=batch_size,
        min_track_length=min_track_length,
        neighborhood_scale_size=neighborhood_scale_size,
        features=features)

    print('total_train_pairs:', total_train_pairs)
    print('total_test_pairs:', total_test_pairs)
    print('batch size:', batch_size)
    print('validation_steps: ', total_test_pairs // batch_size)

    # fit the model on the batches generated by datagen.flow()
    loss_history = model.fit_generator(
        train_data,
        steps_per_epoch=total_train_pairs // batch_size,
        epochs=n_epoch,
        validation_data=val_data,
        validation_steps=total_test_pairs // batch_size,
        callbacks=[
            callbacks.LearningRateScheduler(lr_sched),
            callbacks.ModelCheckpoint(model_path,
                                      monitor='val_loss',
                                      verbose=1,
                                      save_best_only=True,
                                      save_weights_only=num_gpus >= 2),
            callbacks.TensorBoard(log_dir=os.path.join(log_dir, model_name))
        ])

    # NOTE(review): this writes to the same path the ModelCheckpoint callback
    # uses, so the final weights replace any checkpoint saved during training.
    model.save_weights(model_path)
    np.savez(loss_path, loss_history=loss_history.history)

    return model
Exemplo n.º 4
0
def train_model_siamese_daughter(model,
                                 dataset,
                                 expt='',
                                 test_size=.2,
                                 n_epoch=100,
                                 batch_size=1,
                                 num_gpus=None,
                                 crop_dim=32,
                                 min_track_length=1,
                                 neighborhood_scale_size=10,
                                 features=None,
                                 optimizer=SGD(lr=0.01,
                                               decay=1e-6,
                                               momentum=0.9,
                                               nesterov=True),
                                 log_dir='/data/tensorboard_logs',
                                 model_dir='/data/models',
                                 model_name=None,
                                 focal=False,
                                 gamma=0.5,
                                 lr_sched=rate_scheduler(lr=0.01, decay=0.95),
                                 rotation_range=0,
                                 flip=True,
                                 shear=0,
                                 zoom_range=0,
                                 seed=0,
                                 **kwargs):
    """Train a siamese tracking model from generator-backed ``Dataset`` objects.

    The ``SiameseDataGenerator`` iterators are wrapped with
    ``Dataset.from_generator`` and passed to ``model.fit``.

    Args:
        model: Keras model to compile and train. The size of its last layer's
            output along the channel axis is used as the number of classes.
        dataset: Path of the training data file, passed to ``get_data``. Its
            base name is used in the generated model name.
        expt: Suffix appended to the generated model name.
        test_size: Forwarded to ``get_data``.
        n_epoch: Number of training epochs.
        batch_size: Batch size for both generators.
        num_gpus: Number of GPUs. ``None`` means ``train_utils.count_gpus()``.
            Only used for logging and to choose weights-only checkpoints.
        crop_dim: Forwarded to the generators' ``flow``.
        min_track_length: Forwarded to the generators' ``flow``.
        neighborhood_scale_size: Forwarded to the generators' ``flow`` and
            used in the generated model name.
        features: Iterable of feature names. Required despite the ``None``
            default. Shapes are declared for ``'appearance'``, ``'distance'``,
            ``'neighborhood'`` and ``'regionprop'``.
        optimizer: Optimizer passed to ``model.compile``.
        log_dir: Passed to ``get_callbacks`` as ``tensorboard_log_dir``.
        model_dir: Directory for the ``.h5`` and ``.npz`` output files.
        model_name: Base name of the output files; generated when ``None``.
        focal: Use the weighted focal loss instead of weighted categorical
            cross-entropy.
        gamma: ``gamma`` argument of the focal loss.
        lr_sched: Learning-rate schedule passed to ``get_callbacks``.
        rotation_range: Augmentation setting for the training generator.
        flip: Enables horizontal and vertical flips in the training generator.
        shear: ``shear_range`` of the training generator.
        zoom_range: Augmentation setting for the training generator.
        seed: Forwarded to ``get_data`` and both ``flow`` calls; also used in
            the generated model name.
        **kwargs: Accepted but not used by this function.

    Returns:
        The trained model.

    Raises:
        TypeError: If ``features`` is ``None``.
    """
    # `features` is sorted below (for the model name and for the Dataset
    # signature), so None can never work. Fail before loading data or
    # compiling the model instead of with an opaque error later on.
    if features is None:
        raise TypeError(
            '`features` must be an iterable of feature names, got None')

    is_channels_first = K.image_data_format() == 'channels_first'

    if model_name is None:
        todays_date = datetime.datetime.now().strftime('%Y-%m-%d')
        data_name = os.path.splitext(os.path.basename(dataset))[0]
        model_name = '{}_{}_[{}]_neighs={}_epochs={}_seed={}_{}'.format(
            todays_date, data_name, ','.join(f[0] for f in sorted(features)),
            neighborhood_scale_size, n_epoch, seed, expt)
    model_path = os.path.join(model_dir, '{}.h5'.format(model_name))
    loss_path = os.path.join(model_dir, '{}.npz'.format(model_name))

    print('training on dataset:', dataset)
    print('saving model at:', model_path)
    print('saving loss at:', loss_path)

    train_dict, val_dict = get_data(dataset,
                                    mode='siamese_daughters',
                                    seed=seed,
                                    test_size=test_size)

    # the data, shuffled and split between train and test sets
    print('X_train shape:', train_dict['X'].shape)
    print('y_train shape:', train_dict['y'].shape)
    print('X_test shape:', val_dict['X'].shape)
    print('y_test shape:', val_dict['y'].shape)
    print('Output Shape:', model.layers[-1].output_shape)

    n_classes = model.layers[-1].output_shape[1 if is_channels_first else -1]

    def loss_function(y_true, y_pred):
        """Weighted focal loss when `focal` is set, else weighted cross-entropy."""
        if focal:
            return losses.weighted_focal_loss(y_true,
                                              y_pred,
                                              gamma=gamma,
                                              n_classes=n_classes,
                                              from_logits=False)
        return losses.weighted_categorical_crossentropy(y_true,
                                                        y_pred,
                                                        n_classes=n_classes,
                                                        from_logits=False)

    if num_gpus is None:
        num_gpus = train_utils.count_gpus()

    print('Training on {} GPUs'.format(num_gpus))

    model.compile(loss=loss_function,
                  optimizer=optimizer,
                  metrics=['accuracy'])

    print('Using real-time data augmentation.')

    # this will do preprocessing and realtime data augmentation
    datagen = image_generators.SiameseDataGenerator(
        rotation_range=rotation_range,
        shear_range=shear,
        zoom_range=zoom_range,
        horizontal_flip=flip,
        vertical_flip=flip)

    # Validation data is generated with every augmentation switched off.
    datagen_val = image_generators.SiameseDataGenerator(rotation_range=0,
                                                        zoom_range=0,
                                                        shear_range=0,
                                                        horizontal_flip=0,
                                                        vertical_flip=0)

    # same_probability values have varied from 0.5 to 5.0
    total_train_pairs = tracking_utils.count_pairs(train_dict['y'],
                                                   same_probability=5.0)
    total_test_pairs = tracking_utils.count_pairs(val_dict['y'],
                                                  same_probability=5.0)

    train_data = datagen.flow(train_dict,
                              seed=seed,
                              crop_dim=crop_dim,
                              batch_size=batch_size,
                              min_track_length=min_track_length,
                              neighborhood_scale_size=neighborhood_scale_size,
                              features=features)

    val_data = datagen_val.flow(
        val_dict,
        seed=seed,
        crop_dim=crop_dim,
        batch_size=batch_size,
        min_track_length=min_track_length,
        neighborhood_scale_size=neighborhood_scale_size,
        features=features)

    print('total_train_pairs:', total_train_pairs)
    print('total_test_pairs:', total_test_pairs)
    print('batch size:', batch_size)
    print('validation_steps: ', total_test_pairs // batch_size)

    # Build the dtype and shape dictionaries that describe what the generators
    # yield. "input" is the model input and "output" is the model output.
    features = sorted(features)

    input_type_dict = {}
    input_shape_dict = {}
    for feature in features:
        feature_name1 = '{}_input1'.format(feature)
        feature_name2 = '{}_input2'.format(feature)

        input_type_dict[feature_name1] = tf.float32
        input_type_dict[feature_name2] = tf.float32

        # input1 holds `min_track_length` frames and input2 a single frame;
        # the leading None leaves the batch dimension unspecified.
        if feature == 'appearance':
            crop = train_data.crop_dim
            shape1 = (None, train_data.min_track_length, crop, crop, 1)
            shape2 = (None, 1, crop, crop, 1)
        elif feature == 'distance':
            shape1 = (None, train_data.min_track_length, 2)
            shape2 = (None, 1, 2)
        elif feature == 'neighborhood':
            neighborhood_size = 2 * train_data.neighborhood_scale_size + 1
            shape1 = (None, train_data.min_track_length, neighborhood_size,
                      neighborhood_size, 1)
            shape2 = (None, 1, neighborhood_size, neighborhood_size, 1)
        elif feature == 'regionprop':
            shape1 = (None, train_data.min_track_length, 3)
            shape2 = (None, 1, 3)
        else:
            # Unrecognised features get dtypes but no shape entry.
            continue

        input_shape_dict[feature_name1] = shape1
        input_shape_dict[feature_name2] = shape2

    output_type_dict = {'classification': tf.int32}
    # The first output dimension has to be None because we don't know how
    # many cells there are.
    output_shape_dict = {'classification': (None, 3)}

    train_dataset = Dataset.from_generator(lambda: train_data,
                                           (input_type_dict, output_type_dict),
                                           output_shapes=(input_shape_dict,
                                                          output_shape_dict))
    val_dataset = Dataset.from_generator(lambda: val_data,
                                         (input_type_dict, output_type_dict),
                                         output_shapes=(input_shape_dict,
                                                        output_shape_dict))

    train_callbacks = get_callbacks(model_path,
                                    lr_sched=lr_sched,
                                    tensorboard_log_dir=log_dir,
                                    save_weights_only=num_gpus >= 2,
                                    monitor='val_loss',
                                    verbose=1)

    # fit the model on the batches generated by datagen.flow()
    loss_history = model.fit(train_dataset,
                             steps_per_epoch=total_train_pairs // batch_size,
                             epochs=n_epoch,
                             validation_data=val_dataset,
                             validation_steps=total_test_pairs // batch_size,
                             callbacks=train_callbacks)

    np.savez(loss_path, loss_history=loss_history.history)

    return model
Exemplo n.º 5
0
def train_model_siamese(model=None,
                        dataset=None,
                        optimizer=None,
                        expt='',
                        it=0,
                        batch_size=1,
                        n_epoch=100,
                        direc_save='/data/models',
                        direc_data='/data/npz_data',
                        focal=False,
                        gamma=0.5,
                        lr_sched=rate_scheduler(lr=0.01, decay=0.95),
                        rotation_range=0,
                        flip=True,
                        shear=0,
                        class_weight=None,
                        num_gpus=None):
    """Train a siamese model on ``<direc_data>/<dataset>.npz``.

    Args:
        model: Keras model to compile and train. The size of its last layer's
            output along the channel axis is used as the number of classes.
        dataset: Base name (without ``.npz``) of the training file inside
            ``direc_data``; also used in the output file names.
        optimizer: Optimizer passed to ``model.compile``.
        expt: Experiment tag used in the output file names.
        it: Iteration tag used in the output file names.
        batch_size: Batch size for both generators.
        n_epoch: Number of training epochs.
        direc_save: Directory for the ``.h5`` and ``.npz`` output files.
        direc_data: Directory containing the training ``.npz`` file.
        focal: Use the weighted focal loss instead of weighted categorical
            cross-entropy.
        gamma: ``gamma`` argument of the focal loss.
        lr_sched: Schedule function given to ``LearningRateScheduler``.
        rotation_range: Rotation augmentation of the training generator.
        flip: Enables horizontal and vertical flips in the training generator.
        shear: ``shear_range`` of the training generator.
        class_weight: Accepted but not used by this function.
        num_gpus: Number of GPUs the model is spread over. When it is 2 or
            more, checkpoints contain weights only. ``None`` (the default)
            saves the full model.

    Returns:
        The trained model.
    """
    is_channels_first = K.image_data_format() == 'channels_first'
    training_data_file_name = os.path.join(direc_data, dataset + '.npz')
    todays_date = datetime.datetime.now().strftime('%Y-%m-%d')

    file_name_save = os.path.join(
        direc_save, '{}_{}_{}_{}.h5'.format(todays_date, dataset, expt, it))
    file_name_save_loss = os.path.join(
        direc_save, '{}_{}_{}_{}.npz'.format(todays_date, dataset, expt, it))

    train_dict, test_dict = get_data(training_data_file_name, mode='siamese')

    # the data, shuffled and split between train and test sets
    print('X_train shape:', train_dict['X'].shape)
    print('y_train shape:', train_dict['y'].shape)
    print('X_test shape:', test_dict['X'].shape)
    print('y_test shape:', test_dict['y'].shape)
    print('Output Shape:', model.layers[-1].output_shape)

    n_classes = model.layers[-1].output_shape[1 if is_channels_first else -1]

    def loss_function(y_true, y_pred):
        """Weighted focal loss when `focal` is set, else weighted cross-entropy."""
        if focal:
            return losses.weighted_focal_loss(y_true,
                                              y_pred,
                                              gamma=gamma,
                                              n_classes=n_classes,
                                              from_logits=False)
        return losses.weighted_categorical_crossentropy(
            y_true, y_pred, n_classes=n_classes, from_logits=False)

    model.compile(loss=loss_function,
                  optimizer=optimizer,
                  metrics=['accuracy'])

    print('Using real-time data augmentation.')

    # this will do preprocessing and realtime data augmentation
    datagen = generators.SiameseDataGenerator(
        # randomly rotate images by 0 to rotation_range degrees
        rotation_range=rotation_range,
        # randomly shear images in the range (radians, -shear_range to shear_range)
        shear_range=shear,
        # randomly flip images
        horizontal_flip=flip,
        vertical_flip=flip)

    # Validation data is generated with rotation, shear and flips disabled.
    datagen_val = generators.SiameseDataGenerator(
        rotation_range=0,
        shear_range=0,
        horizontal_flip=0,
        vertical_flip=0)

    def count_pairs(y):
        """
        Compute number of training samples needed to (statistically speaking)
        observe all cell pairs.
        Assume that the number of images is encoded in the second dimension.
        Assume that y values are a cell-uniquely-labeled mask.
        Assume that a cell is paired with one of its other frames 50% of the time
        and a frame from another cell 50% of the time.
        """
        # TODO: channels_first axes
        total_pairs = 0
        for image_set in range(y.shape[0]):
            # The largest label in a frame is taken as that frame's cell count.
            cells_per_image = [
                int(y[image_set, image, :, :, :].max())
                for image in range(y.shape[1])
            ]

            # Since there are many more possible non-self pairings than there are self pairings,
            # we want to estimate the number of possible non-self pairings and then multiply
            # that number by two, since the odds of getting a non-self pairing are 50%, to
            # find out how many pairs we would need to sample to (statistically speaking)
            # observe all possible cell-frame pairs.
            # We're going to assume that the average cell is present in every frame. This will
            # lead to an underestimate of the number of possible non-self pairings, but it's
            # unclear how significant the underestimate is.
            average_cells_per_frame = int(
                sum(cells_per_image) / len(cells_per_image))
            non_self_cellframes = (average_cells_per_frame -
                                   1) * len(cells_per_image)
            non_self_pairings = non_self_cellframes * max(cells_per_image)
            cell_pairings = non_self_pairings * 2
            total_pairs = total_pairs + cell_pairings
        return total_pairs

    # This shouldn't remain long term.
    magic_number = 2048  # A power of 2 chosen just to reduce training time.
    total_train_pairs = count_pairs(train_dict['y'])
    total_train_pairs = int(total_train_pairs // magic_number)

    total_test_pairs = count_pairs(test_dict['y'])
    total_test_pairs = int(total_test_pairs // magic_number)

    # `num_gpus` used to be referenced without ever being defined, which
    # raised NameError while the callbacks were being built. It is now an
    # optional argument, and None keeps full-model checkpoints.
    save_weights_only = num_gpus is not None and num_gpus >= 2

    # fit the model on the batches generated by datagen.flow()
    loss_history = model.fit_generator(
        datagen.flow(train_dict, batch_size=batch_size),
        steps_per_epoch=total_train_pairs // batch_size,
        epochs=n_epoch,
        validation_data=datagen_val.flow(test_dict, batch_size=batch_size),
        validation_steps=total_test_pairs // batch_size,
        callbacks=[
            callbacks.LearningRateScheduler(lr_sched),
            callbacks.ModelCheckpoint(file_name_save,
                                      monitor='val_loss',
                                      verbose=1,
                                      save_best_only=True,
                                      save_weights_only=save_weights_only),
        ])

    # NOTE(review): this writes to the same path the ModelCheckpoint callback
    # uses, so the final weights replace any checkpoint saved during training.
    model.save_weights(file_name_save)
    np.savez(file_name_save_loss, loss_history=loss_history.history)

    return model