Пример #1
0
 def on_batch_end(self, batch, logs=None):
     """
     Saves the model weights, and a log file next to them, once every `self.save_each` batches.
     :param batch: the batch index passed by the caller (unused here; the callback keeps its own `self.batch` counter)
     :param logs: [optional] a mapping of values to write to the log file; treated as empty when None
     """
     overwrite = True  # if set to False, will keep weights from previous batches (uses a lot of storage!)
     if self.batch % self.save_each == 0:
         # build both file names from the same stem, so that a '.h5' appearing elsewhere in exp_dir is never rewritten
         stem = 'weights' if overwrite else f'weights_{self.batch}'
         weights_path = pj(self.exp_dir, f'{stem}.h5')
         log_path = pj(self.exp_dir, f'{stem}.log')
         self.model.save_weights(weights_path)
         # logs defaults to None, which cannot be unpacked with ** -> fall back to an empty mapping
         log = {**(logs or {}), **self.params}
         dump(log.items(), log_path)
     self.batch += 1
Пример #2
0
def gen_exp_data_dir(gender, train_samples, validation_samples, subjects=None):
    """
    Generates a data dir for the experiment, using either random or specified subjects of a given gender with the specified train/validation sets sizes.
    If there are fewer images than requested, the train/validation sizes are scaled down to the available images while keeping the requested ratio.
    :param gender: the gender of the subjects to train model on
    :param train_samples: the number of images to use in training phase
    :param validation_samples: the number of images to use in validation phase
    :param subjects: [optional] a tuple of exactly two subject names, of the same gender.
    :return: the experiment name, the experiment data dir, actual number of training samples, actual number of validation samples
    :raises AssertionError: if the subjects argument is invalid, if fewer than two subject dirs are available, or if either set would be empty
    """

    gender_source_dir = pj(Paths.data_dir, gender)

    # use specified subjects
    if subjects:
        assert len(
            subjects
        ) == 2, f'Invalid size ({len(subjects)}) for subjects argument. Must be exactly 2!'
        assert all(
            subject in SUBJECTS[gender] for subject in subjects
        ), f'Both subjects must be of the specified gender: {gender}'

        subject_names = [subjects[0], subjects[1]]
        subject_source_dirs = [
            pj(gender_source_dir, name) for name in subject_names
        ]

        assert all(
            pe(source_dir) for source_dir in subject_source_dirs
        ), f'Images dir for either {subjects[0]} or {subjects[1]} was not found!'

    # get two random subjects from relevant gender dir
    else:
        gender_source_subjects = glob(pj(gender_source_dir, '*'))
        # fail with a clear message instead of an IndexError when there is nothing to pick from
        assert len(
            gender_source_subjects
        ) >= 2, f'At least two subject dirs are required in {gender_source_dir}, found {len(gender_source_subjects)}'
        random.shuffle(gender_source_subjects)
        subject_source_dirs = gender_source_subjects[:2]
        subject_names = [ps(source_dir)[1] for source_dir in subject_source_dirs]

    # get images for both subjects
    subject_image_paths = [
        glob(pj(source_dir, '*.jpg')) for source_dir in subject_source_dirs
    ]

    # get the minimum number of images between subject 1 and 2 (to create train/validation sets of same sizes)
    min_num_images = min(len(paths) for paths in subject_image_paths)

    # if there are not enough images for requested train and validation samples, use same ratio of train/validation with available images
    if train_samples + validation_samples > min_num_images:
        train_validation_ratio = train_samples / (train_samples +
                                                  validation_samples)
        train_samples = int(min_num_images * train_validation_ratio)
        validation_samples = min_num_images - train_samples

    assert train_samples and validation_samples, 'Train and Validation sets must be larger than 0'

    # print sets sizes
    print(
        f'Using {train_samples} training samples and {validation_samples} validation samples.'
    )

    # get random training and validation images per subject (subject 1 is shuffled first, then subject 2)
    train_image_paths = []
    validation_image_paths = []
    for paths in subject_image_paths:
        random.shuffle(paths)
        train_image_paths.append(paths[:train_samples])
        validation_image_paths.append(
            paths[train_samples:train_samples + validation_samples])

    # init experiment data dir with train and validation data dirs
    timestamp = dt.datetime.now().strftime('%Y-%m-%d_%H%M')
    exp_name = f'{subject_names[0]}_{subject_names[1]}_{timestamp}'
    exp_data_dir = pj(Paths.experiments_dir, exp_name)

    # init train and validation data dirs (one sub dir per subject in each)
    train_dir = pj(exp_data_dir, 'train')
    validation_dir = pj(exp_data_dir, 'validation')
    subject_train_dirs = [pj(train_dir, name) for name in subject_names]
    subject_validation_dirs = [
        pj(validation_dir, name) for name in subject_names
    ]
    for target_dir in subject_train_dirs + subject_validation_dirs:
        mkdirs(target_dir)

    # copy training and validation images for subject 1 and 2 to exp_data_dir
    copy_jobs = list(zip(train_image_paths, subject_train_dirs)) + list(
        zip(validation_image_paths, subject_validation_dirs))
    for image_paths, target_dir in copy_jobs:
        for ip in image_paths:
            shutil.copy(ip, target_dir)

    # write experiment metadata
    metadata = {
        'Gender': gender,
        'Training samples': train_samples,
        'Validation samples': validation_samples
    }
    dump(metadata.items(), pj(exp_data_dir, 'metadata.txt'))
    dump(train_image_paths[0] + train_image_paths[1],
         pj(exp_data_dir, 'train_paths.txt'))
    dump(validation_image_paths[0] + validation_image_paths[1],
         pj(exp_data_dir, 'validation_paths.txt'))

    return exp_name, exp_data_dir, train_samples, validation_samples
Пример #3
0
                        validation_data=validation_generator,
                        validation_steps=valid_steps_per_epoch,
                        callbacks=[WeightsSaver(model, save_each, exp_dir)])

    # save final model weights to disk
    model.save_weights(pj(exp_dir, f'{EXP_NAME}_weights_final.h5'))

    # make predictions on the validation/test set
    p_validation = model.predict_generator(validation_generator, verbose=1)

    # cross-entropy loss score on the validation/test set
    # loss_valid = log_loss(validation_generator, p_validation)

    # save model predictions to disk
    p_valid_path = pj(exp_dir, f'{EXP_NAME}_pred_valid.csv')
    dump(p_validation, p_valid_path)

    # log end and run times
    end_time = dt.datetime.now()
    print(f'End time: {end_time}')
    print(f'Run time: {end_time - start_time}')

    # save experiment statistics to disk
    exp_stats = {
        'Exp name:': EXP_NAME,
        'Start time:': start_time,
        'End time:': end_time,
        'Run time:': (end_time - start_time),
        # 'Loss (valid):': loss_valid,
        '': ''
    }  # TODO : add more statistics to the log file.
Пример #4
0
def run_experiment(gender=None,
                   train_samples=None,
                   validation_samples=None,
                   subjects=None,
                   img_size=224,
                   img_channels=3,
                   num_classes=2,
                   batch_size=16,
                   epochs=10,
                   freeze_first_layers=36,
                   save_each=5,
                   learning_rate=0.001,
                   model='vgg16',
                   prev_exp_dir=None):
    """
    Train a model on a pair of subjects, according to specified arguments and get predictions on validation set, as well as performance metrics.
    The final weights, the validation predictions (csv + pkl), the classification report and an experiment log are written to the experiment data dir.
    :param gender: the gender of the subjects for which to create an experiment
    :param train_samples: number of training samples per class
    :param validation_samples: number of validation samples per class
    :param subjects: [optional] a tuple of exactly two subject names, of the same gender. if not set, two random subjects of the same gender will be used.
    :param img_size: [optional] height / width of input images
    :param img_channels: [optional] number of color channels in input images
    :param num_classes: [optional] number of output classes
    :param batch_size: [optional] number of training samples per gradient update
    :param epochs: [optional] number of iteration over the entire training set
    :param freeze_first_layers: [optional] number of layers to freeze
    :param save_each: [optional] number of batches after which to save intermediate weights h5 file (only written to the experiment log while the weights saver callback is disabled)
    :param learning_rate: [optional] the step to use in each gradient update
    :param model: [optional] the name of the model to use (vgg16/bcn)
    :param prev_exp_dir: [optional] an existing experiment dir to use train/validation images from (if specified, no need to specify gender, train_samples, validation_samples).
    :return: None
    """

    assert (
        gender and train_samples and validation_samples
    ) or prev_exp_dir, 'Either prev_exp_dir or gender and train_samples and validation_samples must be specified!'

    assert model in models, f'{model} is not supported! available models: {str(models)}'

    # print start time
    start_time = dt.datetime.now()
    print(f'Start time: {start_time}')

    # generate data for experiment
    if prev_exp_dir:
        # init experiment data dir with subject names from prev_exp_dir and new timestamp
        subject1_name, subject2_name, _old_timestamp = ps(
            prev_exp_dir)[1].split('_', 2)

        # validate the subjects before touching the disk, so a bad prev_exp_dir does not leave a half-built experiment dir behind
        assert (subject1_name in M_SUBJECTS
                and subject2_name in M_SUBJECTS) or (
                    subject1_name in F_SUBJECTS and subject2_name in F_SUBJECTS
                ), f'Subjects in {prev_exp_dir} are from different genders!'

        timestamp = dt.datetime.now().strftime('%Y-%m-%d_%H%M')
        exp_name = f'{subject1_name}_{subject2_name}_{timestamp}'
        exp_data_dir = pj(Paths.experiments_dir, exp_name)
        mkdirs(exp_data_dir)

        # copy train and validation images from prev_exp_dir to exp_data_dir
        shutil.copytree(pj(prev_exp_dir, 'train'), pj(exp_data_dir, 'train'))
        shutil.copytree(pj(prev_exp_dir, 'validation'),
                        pj(exp_data_dir, 'validation'))

        subjects = (subject1_name, subject2_name)

        gender = 'M' if subject1_name in M_SUBJECTS else 'F'

        total_train_samples = len(
            glob(pj(exp_data_dir, 'train', '**', '*.jpg'), recursive=True))
        total_validation_samples = len(
            glob(pj(exp_data_dir, 'validation', '**', '*.jpg'),
                 recursive=True))

        print(
            f'Using data for experiment | Exp name: {exp_name} | Gender: {gender} | Total train samples: {total_train_samples} | Total validation samples: {total_validation_samples} | Subjects: {subjects}'
        )
    else:
        print(
            f'Generating data for experiment | Gender: {gender} | Requested train samples (per class): {train_samples} | Requested validation samples (per class): {validation_samples} | Subjects: {subjects}'
        )
        exp_name, exp_data_dir, actual_train_samples, actual_validation_samples = gen_exp_data_dir(
            gender, train_samples, validation_samples, subjects)
        # gen_exp_data_dir returns per-class sizes and there are two classes (subjects)
        total_train_samples = actual_train_samples * 2
        total_validation_samples = actual_validation_samples * 2
        print(
            f'Generated data for experiment | Exp name: {exp_name} | Total train samples: {total_train_samples} | Total validation samples: {total_validation_samples} | Exp dir: {exp_data_dir}'
        )

    print(f'Using exp data dir: {exp_data_dir}')

    # get training and validation data generators
    print(
        f'Getting train and validation generators | Batch size: {batch_size} | Image size: {img_size}'
    )
    train_generator, validation_generator = get_train_and_valid_generators(
        exp_data_dir, batch_size, img_size)

    # load model (from here on `model` holds the model object instead of its name)
    metrics = ['accuracy']
    if model == 'vgg16':
        # load vgg16 model
        from cnn_finetune.vgg16 import vgg16_model
        initial_weights_path = pj(
            Paths.pretrained_dir,
            'vgg16_weights_tf_dim_ordering_tf_kernels.h5')
        initial_weights_num_classes = 1000
        initial_weights = (initial_weights_path, initial_weights_num_classes)

        print(
            f'Loading vgg16 model | Initial weights path: {initial_weights_path} | Initial weights number of classes: {initial_weights_num_classes}'
        )
        model = vgg16_model(img_size, img_size, img_channels, num_classes,
                            initial_weights, freeze_first_layers,
                            learning_rate, metrics)

    elif model == 'bcn':
        # load binary convnet model
        from cnn_finetune.small_convnet import binary_convnet_model
        model = binary_convnet_model(img_size,
                                     img_size,
                                     img_channels,
                                     metrics=metrics)

    else:
        model = None

    # start fine-tuning the model
    print(f'Training model | Epochs: {epochs}')
    model.fit_generator(
        train_generator,
        steps_per_epoch=total_train_samples // batch_size,
        epochs=epochs,
        validation_data=validation_generator,
        validation_steps=total_validation_samples // batch_size,
        # callbacks=[WeightsSaver(model, save_each, exp_data_dir)]
    )

    # save final model weights to disk
    final_weights_path = pj(exp_data_dir, f'{exp_name}_weights_final.h5')
    print(f'Saving model weights | Path: {final_weights_path}')
    model.save_weights(final_weights_path)

    # make predictions on the validation/test set
    print('Making predictions on validation set')
    validation_predictions = model.predict_generator(validation_generator,
                                                     verbose=1)

    # convert the probabilities matrix to an array of predicted classes
    if num_classes > 2:
        validation_y_pred = np.array(
            [np.argmax(p) for p in validation_predictions], dtype=np.float32)

    else:  # if binary classification
        # NOTE(review): a score above 0.5 is mapped to class 0, which matches the 'prob_0' column written below - confirm that the model's single output really is the probability of class 0
        validation_y_pred = np.array(
            [0 if p > 0.5 else 1 for p in validation_predictions],
            dtype=np.float32)

    # generate an array of true classes. Important: validation generator must be used with shuffle=False for this to work.
    validation_y_true = validation_generator.classes

    # generate predictions object and save it to exp_data_dir
    predictions_data_path = pj(exp_data_dir,
                               f'{exp_name}_validations_predictions.csv')
    print(
        f'Saving predictions on validation set | Path: {predictions_data_path}'
    )
    predictions_data = [('class', 'filename', 'y_true', 'y_pred', 'prob_0')]
    predictions_data += [
        (validation_generator.class_indices[ps(
            validation_generator.filenames[ind])[0]],
         ps(validation_generator.filenames[ind])[1], validation_y_true[ind],
         int(validation_y_pred[ind]), pred[0])
        for ind, pred in enumerate(validation_predictions)
    ]
    dump(predictions_data, predictions_data_path, delimiter=',')
    objdump([validation_y_true, validation_y_pred],
            pj(exp_data_dir, f'{exp_name}_validations_predictions.pkl'))

    # cross-entropy loss score on the validation/test set
    print('Getting metrics on validation set predictions')
    # NOTE(review): log_loss receives the hard predicted classes here, not the predicted probabilities - confirm this is intended
    validation_loss = log_loss(validation_y_true, validation_y_pred)
    validation_accuracy = accuracy_score(validation_y_true, validation_y_pred)
    validation_precision = precision_score(validation_y_true,
                                           validation_y_pred,
                                           average='micro')
    validation_recall = recall_score(validation_y_true,
                                     validation_y_pred,
                                     average='micro')
    validation_f1 = f1_score(validation_y_true,
                             validation_y_pred,
                             average='micro')

    # save classification report (labels / target_names are passed by keyword: recent scikit-learn versions do not accept them positionally)
    report = classification_report(
        validation_y_true,
        validation_y_pred,
        labels=list(validation_generator.class_indices.values()),
        target_names=list(validation_generator.class_indices.keys()))
    print(report)
    dump(report, pj(exp_data_dir, 'report.txt'))

    # log end and run times
    end_time = dt.datetime.now()
    print(f'End time: {end_time}')
    print(f'Run time: {end_time - start_time}')

    # save experiment statistics to disk
    exp_stats = {
        'Exp name:': exp_name,
        'Gender:': gender,
        'Start time:': start_time,
        'End time:': end_time,
        'Run time:': (end_time - start_time),
        '-': '-',
        'Train samples:': total_train_samples,
        'Epochs:': epochs,
        'Batch size:': batch_size,
        'Steps per epoch:': total_train_samples // batch_size,
        'Freeze first layers:': freeze_first_layers,
        'Learning rate:': learning_rate,
        'Save each:': save_each,
        '--': '--',
        'Validation samples:': total_validation_samples,
        'Validation loss:': validation_loss,
        'Validation accuracy:': validation_accuracy,
        'Validation precision:': validation_precision,
        'Validation recall:': validation_recall,
        'Validation F1:': validation_f1,
        '': ''
    }

    exp_stats_path = pj(exp_data_dir, f'{exp_name}.log')
    dump(exp_stats.items(), exp_stats_path, append=True)

    # if final weights are saved, delete intermediate weights file
    intermediate_weights_path = pj(exp_data_dir, 'weights.h5')
    if pe(final_weights_path) and pe(intermediate_weights_path):
        os.remove(intermediate_weights_path)
Пример #5
0
def predict_with_model(data_dir,
                       out_dir,
                       model_name,
                       initial_weights_path,
                       num_classes,
                       batch_size=16):
    """
    Load a model with the given weights, get its predictions on the images in data_dir and save the predictions and performance metrics to out_dir.
    The predictions (csv + pkl), the classification report and a statistics log are written to out_dir.
    :param data_dir: the dir to build the data generator from
    :param out_dir: the dir to write the output files to (created via mkdirs)
    :param model_name: the name of the model to use (vgg16/bcn)
    :param initial_weights_path: path of the weights file to load into the model
    :param num_classes: number of output classes; values up to 2 are handled as binary classification
    :param batch_size: [optional] batch size passed to the data generator
    :return: None
    :raises AssertionError: if model_name is not one of the supported names
    """

    # create output dir
    mkdirs(out_dir)

    # load model
    img_size = None
    if model_name == 'vgg16':
        initial_weights = (initial_weights_path,
                           num_classes if num_classes > 2 else 1)
        model = vgg16_model(num_classes=num_classes,
                            initial_weights=initial_weights)
        img_size = vgg16_img_size
    elif model_name == 'bcn':
        model = binary_convnet_model(initial_weights_path=initial_weights_path)
        img_size = bcn_img_size
    else:
        model = None

    # explicit None check: do not depend on the truthiness of the model object
    assert model is not None, f'model {model_name} not loaded!'
    print(
        f'Loading {model_name} model | Initial weights path: {initial_weights_path} | Initial weights number of classes: {num_classes}'
    )

    # init data generator
    data_generator = get_data_generator(data_dir, img_size, batch_size)

    # make predictions
    data_predictions = model.predict_generator(data_generator, verbose=1)

    # convert the probabilities matrix to an array of predicted classes
    if num_classes > 2:
        data_y_pred = np.array([np.argmax(p) for p in data_predictions],
                               dtype=np.float32)

    else:  # if binary classification
        # NOTE(review): a score above 0.5 is mapped to class 0, which matches the 'prob_0' column written below - confirm that the model's single output really is the probability of class 0
        data_y_pred = np.array([0 if p > 0.5 else 1 for p in data_predictions],
                               dtype=np.float32)

    # generate an array of true classes. Important: data generator must be used with shuffle=False for this to work.
    data_y_true = data_generator.classes

    # generate predictions object and save it to out_dir
    exp_name = 'exp'  # TODO : construct exp name
    predictions_data_path = pj(out_dir, f'{exp_name}_predictions.csv')
    print(f'Saving predictions on data set | Path: {predictions_data_path}')
    predictions_data = [('class', 'filename', 'y_true', 'y_pred', 'prob_0')]
    predictions_data += [
        (data_generator.class_indices[ps(data_generator.filenames[ind])[0]],
         ps(data_generator.filenames[ind])[1], data_y_true[ind],
         int(data_y_pred[ind]), pred[0])
        for ind, pred in enumerate(data_predictions)
    ]
    dump(predictions_data, predictions_data_path, delimiter=',')
    objdump([data_y_true, data_y_pred],
            pj(out_dir, f'{exp_name}_predictions.pkl'))

    # cross-entropy loss score on the data/test set
    print('Getting metrics on data set predictions')
    # NOTE(review): log_loss receives the hard predicted classes here, not the predicted probabilities - confirm this is intended
    data_loss = log_loss(data_y_true, data_y_pred)
    data_accuracy = accuracy_score(data_y_true, data_y_pred)
    data_precision = precision_score(data_y_true, data_y_pred, average='micro')
    data_recall = recall_score(data_y_true, data_y_pred, average='micro')
    data_f1 = f1_score(data_y_true, data_y_pred, average='micro')

    # save classification report (labels / target_names are passed by keyword: recent scikit-learn versions do not accept them positionally)
    report = classification_report(
        data_y_true,
        data_y_pred,
        labels=list(data_generator.class_indices.values()),
        target_names=list(data_generator.class_indices.keys()))
    print(report)
    dump(report, pj(out_dir, 'report.txt'))

    # save experiment statistics to disk
    exp_stats = {
        'Exp name:': exp_name,
        '-': '-',
        'Data dir:': data_dir,
        'Model name:': model_name,
        'Initial weights path:': initial_weights_path,
        'Num. classess:': num_classes,
        '--': '--',
        'Validation loss:': data_loss,
        'Validation accuracy:': data_accuracy,
        'Validation precision:': data_precision,
        'Validation recall:': data_recall,
        'Validation F1:': data_f1,
        '': ''
    }

    exp_stats_path = pj(out_dir, f'{exp_name}.log')
    dump(exp_stats.items(), exp_stats_path, append=True)
Пример #6
0
    # cross-entropy loss score on the validation/test set
    loss_valid = log_loss(y_valid, p_valid)

    # generate predictions object and save it to exp_data_dir
    predictions_data_path = pj(exp_dir,
                               f'{EXP_NAME}_validations_predictions.csv')
    predictions_data = [
        ('y_true', 'y_pred', 'prob_0', 'prob_1', 'prob_2', 'prob_3', 'prob_4',
         'prob_5', 'prob_6', 'prob_7', 'prob_8', 'prob_9')
    ]
    predictions_data += [(np.argmax(y_valid[ind]), np.argmax(p_valid[ind]),
                          pred[0], pred[1], pred[2], pred[3], pred[4], pred[5],
                          pred[6], pred[7], pred[8], pred[9])
                         for ind, pred in enumerate(p_valid)]
    dump(predictions_data, predictions_data_path, delimiter=',')

    # cross-entropy loss score on the validation/test set
    # p_valid_one_hot = np.array([[int(i == np.argmax(pv)) for i in range(0, len(pv))] for pv in p_valid], dtype=np.float32)  # convert the probabilities matrix to an array of 1-hot vectors.
    yv = [np.argmax(y) for y in y_valid]
    pv = [np.argmax(p) for p in p_valid]
    objdump([y_valid, p_valid, yv, pv],
            pj(exp_dir, f'{EXP_NAME}_validations_predictions.pkl'))

    validation_loss = log_loss(y_valid, p_valid)
    validation_accuracy = accuracy_score(yv, pv)
    validation_precision = precision_score(yv, pv, average='micro')
    validation_recall = recall_score(yv, pv, average='micro')
    validation_f1 = f1_score(yv, pv, average='micro')

    # save classification report