Пример #1
0
def main(run_config):
    """Display a few randomly chosen sample images.

    Builds a ``BottleneckExecutor`` from ``run_config``, fetches the image
    lists from its image executor, draws ``num_images_to_show`` random keys
    (with replacement), picks one random entry under each drawn key, prints
    the picked entries and passes them to ``display_sample_images``.

    :param run_config: mapping providing the 'image_dir', 'logging_dir' and
        'bottleneck_path' entries.
    :return: None
    """
    bottleneck_executor = BottleneckExecutor(
        image_dir=run_config['image_dir'],
        logging_dir=run_config['logging_dir'],
        tfhub_module_url=
        'https://tfhub.dev/google/imagenet/inception_v3/feature_vector/1',
        compressed_bottleneck_file_path=run_config['bottleneck_path'])
    # The returned dataframe is not needed here; the call itself is kept
    # because it ran before the image executor was used.
    # NOTE(review): confirm whether the image executor depends on it.
    bottleneck_executor.get_bottlenecks()
    image_executor = bottleneck_executor.image_executor
    image_lists = image_executor.get_image_lists()
    num_images_to_show = 4

    # Materialise the keys once instead of rebuilding the list per draw.
    available_keys = list(image_lists.keys())
    random_keys = [
        available_keys[np.random.randint(0, len(available_keys))]
        for _ in range(num_images_to_show)
    ]

    # Second pass: one random entry for every key drawn above.
    random_samples = []
    for random_key in random_keys:
        candidates = image_lists[random_key]
        random_samples.append(
            candidates[np.random.randint(0, len(candidates))])

    print('Displaying sample images: %s' % random_samples)
    image_executor.display_sample_images(sample_image_paths=random_samples)
def main(run_config):
    """Train a new classification head on a frozen InceptionV3 and evaluate it.

    Image paths and class names come from the dataframe returned by
    ``BottleneckExecutor.get_bottlenecks()``. Images are loaded through
    ``load_and_preprocess_image`` into a cached, shuffled, repeating and
    batched ``tf.data`` pipeline, which is used for both fitting and the
    final evaluation.

    :param run_config: mapping providing the 'image_dir' and
        'bottleneck_path' entries.
    :return: None; the result of ``model.evaluate`` is printed.
    """
    bottleneck_executor = BottleneckExecutor(
        image_dir=run_config['image_dir'],
        tfhub_module_url=
        'https://tfhub.dev/google/imagenet/inception_v3/feature_vector/1',
        compressed_bottleneck_file_path=run_config['bottleneck_path'])
    bottlenecks = bottleneck_executor.get_bottlenecks()
    image_count = bottlenecks.shape[0]
    class_names = bottlenecks['class'].unique()
    num_classes = len(class_names)
    all_image_paths = bottlenecks['path'].values
    all_image_labels = bottlenecks['class'].values

    # Integer class ids (not one-hot vectors): this is the label format that
    # the 'sparse_categorical_crossentropy' loss below consumes.
    label_to_index = {name: index for index, name in enumerate(class_names)}
    all_image_label_indices = [
        label_to_index[label] for label in all_image_labels
    ]

    path_ds = tf.data.Dataset.from_tensor_slices(all_image_paths)
    image_ds = path_ds.map(load_and_preprocess_image,
                           num_parallel_calls=AUTOTUNE)
    label_ds = tf.data.Dataset.from_tensor_slices(
        tf.cast(all_image_label_indices, tf.int64))
    image_label_ds = tf.data.Dataset.zip((image_ds, label_ds))
    steps_per_epoch = math.ceil(len(all_image_paths) / BATCH_SIZE)

    # The dataset repeats indefinitely, so every consumer below must bound
    # the number of batches it draws via an explicit step count.
    ds = image_label_ds.cache()
    ds = ds.apply(
        tf.data.experimental.shuffle_and_repeat(buffer_size=image_count))
    ds = ds.batch(BATCH_SIZE).prefetch(buffer_size=AUTOTUNE)

    # Fixed-feature extractor: InceptionV3 with transfer learning.
    base_model = InceptionV3(include_top=False,
                             weights='imagenet',
                             input_shape=(299, 299, 3))
    # Global spatial average pooling, a fully-connected layer, then a softmax
    # layer with one unit per class found in the dataframe.
    x = base_model.output
    x = GlobalAveragePooling2D()(x)
    x = Dense(1024, activation='relu')(x)
    predictions = Dense(num_classes, activation='softmax')(x)

    # This is the model that gets trained.
    model = Model(inputs=base_model.input, outputs=predictions)

    # Train only the randomly initialised top layers: freeze every
    # convolutional InceptionV3 layer.
    for layer in base_model.layers:
        layer.trainable = False

    # Compile *after* marking layers as non-trainable.
    model.compile(optimizer='rmsprop', loss='sparse_categorical_crossentropy')

    # Train the model on the new data for a few epochs.
    model.fit(ds, epochs=3, steps_per_epoch=steps_per_epoch)

    # Evaluate on the same pipeline. The dataset is already batched and
    # repeats forever, so pass a step count rather than `batch_size`;
    # `steps_per_epoch` batches correspond to roughly one pass over the data.
    print(model.evaluate(ds, steps=steps_per_epoch))
Пример #3
0
def set_up(request):
    """Load the BOON bottlenecks and inject shared test data into ``request.cls``.

    This is a generator that yields once after setup. It looks like a
    pytest-style class fixture, but no decorator is visible here.
    NOTE(review): confirm how it is registered.

    The attributes set on ``request.cls`` are: ``dataset``,
    ``train_bottlenecks``, ``train_ground_truth_indices``, ``class_labels``,
    ``val_bottlenecks``, ``val_ground_truth_indices``, ``initializers``,
    ``activations``, ``optimizers`` and ``log_dir``.

    :param request: object exposing a ``cls`` attribute that receives the
        injected values.
    """
    run_configs = {
        'BOON': {
            'dataset':
            'BOON',
            'image_dir':
            'D:\\data\\BOON\\images',
            'bottleneck_path':
            'D:\\data\\BOON\\bottlenecks.pkl',
            'logging_dir':
            'C:\\Users\\ccamp\\Documents\\GitHub\\HerbariumDeep\\frameworks\\DataAcquisition\\CleaningResults\\BOON'
        }
    }
    run_config = run_configs['BOON']

    bottleneck_executor = BottleneckExecutor(
        image_dir=run_config['image_dir'],
        tfhub_module_url=
        'https://tfhub.dev/google/imagenet/inception_v3/feature_vector/1',
        compressed_bottleneck_file_path=run_config['bottleneck_path'],
        logging_dir=run_config['logging_dir'])
    bottlenecks = bottleneck_executor.get_bottlenecks()
    train_bottlenecks, val_bottlenecks, test_bottlenecks = bottleneck_executor.get_partitioned_bottlenecks(
        train_percent=0.80, val_percent=0.20, test_percent=0.20)
    class_labels = list(bottlenecks['class'].unique())
    tf.logging.info(
        'Partitioned (N=%d) total bottleneck vectors into training (N=%d), validation (N=%d), and testing (N=%d) datasets.'
        % (bottlenecks.shape[0], train_bottlenecks.shape[0],
           val_bottlenecks.shape[0], test_bottlenecks.shape[0]))
    tf.logging.info(
        'Detected %d unique class labels in the bottlenecks dataframe' %
        len(class_labels))

    # One dict lookup per sample instead of a linear `list.index` scan.
    label_to_index = {label: index for index, label in enumerate(class_labels)}

    train_bottleneck_values = np.array(train_bottlenecks['bottleneck'].tolist())
    val_bottleneck_values = np.array(val_bottlenecks['bottleneck'].tolist())
    # Encode each class label as its position in `class_labels`.
    train_bottleneck_ground_truth_indices = np.array(
        [label_to_index[label] for label in train_bottlenecks['class'].values])
    val_bottleneck_ground_truth_indices = np.array(
        [label_to_index[label] for label in val_bottlenecks['class'].values])

    initializer_options = get_initializer_options()
    activation_options = get_activation_options(leaky_relu_alpha=0.2)
    optimizer_options = get_optimizer_options(static_learning_rate=0.001,
                                              momentum_const=0.9,
                                              adam_beta1=0.9,
                                              adam_beta2=0.999,
                                              adam_epsilon=1e-08)

    # Inject class variables:
    request.cls.dataset = run_config['dataset']
    request.cls.train_bottlenecks = train_bottleneck_values
    request.cls.train_ground_truth_indices = train_bottleneck_ground_truth_indices
    request.cls.class_labels = class_labels
    request.cls.val_bottlenecks = val_bottleneck_values
    request.cls.val_ground_truth_indices = val_bottleneck_ground_truth_indices
    request.cls.initializers = initializer_options
    request.cls.activations = activation_options
    request.cls.optimizers = optimizer_options
    request.cls.log_dir = 'C:\\tmp\\summaries'
    yield
Пример #4
0
def main(run_config):
    """Prepare output directories, load bottlenecks and run the grid search.

    TensorBoard summaries go to ``C:\\tmp\\summaries`` and the trained model
    is exported to its ``trained_model`` sub-directory.

    :param run_config: mapping providing the 'image_dir', 'bottleneck_path',
        'logging_dir' and 'dataset' entries.
    :return: None
    """
    # TensorBoard summaries directory; also the parent of the export directory.
    summaries_dir = 'C:\\tmp\\summaries'
    model_export_dir = os.path.join(summaries_dir, 'trained_model')
    # The helper is handed the *parent* of the summaries directory.
    _clear_temp_folder(os.path.join(summaries_dir, os.pardir))
    _prepare_model_export_directories(model_export_dir=model_export_dir)

    # Run preliminary setup operations and retrieve the partitioned
    # bottlenecks dataframes:
    _run_setup(tb_summaries_dir=summaries_dir)
    bottleneck_executor = BottleneckExecutor(
        image_dir=run_config['image_dir'],
        tfhub_module_url=
        'https://tfhub.dev/google/imagenet/inception_v3/feature_vector/1',
        compressed_bottleneck_file_path=run_config['bottleneck_path'],
        logging_dir=run_config['logging_dir'])
    bottlenecks = bottleneck_executor.get_bottlenecks()
    train_bottlenecks, val_bottlenecks, test_bottlenecks = bottleneck_executor.get_partitioned_bottlenecks(
        train_percent=0.80, val_percent=0.20, test_percent=0.20)
    class_labels = list(bottlenecks['class'].unique())
    tf.logging.info(
        'Partitioned (N=%d) total bottleneck vectors into training (N=%d), validation (N=%d), and testing (N=%d) datasets.'
        % (bottlenecks.shape[0], train_bottlenecks.shape[0],
           val_bottlenecks.shape[0], test_bottlenecks.shape[0]))
    tf.logging.info(
        'Detected %d unique class labels in the bottlenecks dataframe' %
        len(class_labels))

    # One dict lookup per sample instead of a linear `list.index` scan.
    label_to_index = {label: index for index, label in enumerate(class_labels)}

    train_bottleneck_values = np.array(train_bottlenecks['bottleneck'].tolist())
    val_bottleneck_values = np.array(val_bottlenecks['bottleneck'].tolist())
    # Encode each class label as its position in `class_labels`.
    train_bottleneck_ground_truth_indices = np.array(
        [label_to_index[label] for label in train_bottlenecks['class'].values])
    val_bottleneck_ground_truth_indices = np.array(
        [label_to_index[label] for label in val_bottlenecks['class'].values])

    initializer_options = get_initializer_options()
    activation_options = get_activation_options(leaky_relu_alpha=0.2)
    # User MUST provide all default arguments or a KeyError will be thrown:
    optimizer_options = get_optimizer_options(static_learning_rate=0.001,
                                              momentum_const=0.9,
                                              adam_beta1=0.9,
                                              adam_beta2=0.999,
                                              adam_epsilon=1e-08)
    _run_grid_search(
        dataset=run_config['dataset'],
        train_bottlenecks=train_bottleneck_values,
        train_ground_truth_indices=train_bottleneck_ground_truth_indices,
        initializers=initializer_options,
        activations=activation_options,
        optimizers=optimizer_options,
        class_labels=class_labels,
        val_bottlenecks=val_bottleneck_values,
        val_ground_truth_indices=val_bottleneck_ground_truth_indices,
        log_dir=summaries_dir,
        model_export_dir=model_export_dir)
Пример #5
0
def main(run_config):
    """Run a ``GridSearchCV`` over a ``MemoryLeakTestClassifier``.

    The grid covers initializers, activations, optimizers and training batch
    sizes. Training and validation bottlenecks are concatenated into one
    ``X``/``y`` pair and passed to ``fit`` together with a
    ``CrossValidationSplitter`` built from the two partition sizes.

    :param run_config: mapping providing the 'image_dir', 'logging_dir' and
        'bottleneck_path' entries.
    :return: None
    """
    static_learning_rate = 0.001
    momentum_const = 0.9
    adam_beta1 = 0.9
    adam_beta2 = 0.999
    adam_epsilon = 1e-08

    bottleneck_executor = BottleneckExecutor(
        image_dir=run_config['image_dir'],
        logging_dir=run_config['logging_dir'],
        tfhub_module_url=
        'https://tfhub.dev/google/imagenet/inception_v3/feature_vector/1',
        compressed_bottleneck_file_path=run_config['bottleneck_path'])
    all_bottlenecks = bottleneck_executor.get_bottlenecks()
    class_labels = list(all_bottlenecks['class'].unique())
    train_bottlenecks, val_bottlenecks, test_bottlenecks = bottleneck_executor.get_partitioned_bottlenecks(
    )

    # One dict lookup per sample instead of a linear `list.index` scan.
    label_to_index = {label: index for index, label in enumerate(class_labels)}

    train_bottleneck_values = np.array(train_bottlenecks['bottleneck'].tolist())
    # Encode each class label as its position in `class_labels`.
    train_bottleneck_ground_truth_indices = np.array(
        [label_to_index[label] for label in train_bottlenecks['class'].values])
    val_bottleneck_values = np.array(val_bottlenecks['bottleneck'].tolist())
    val_bottleneck_ground_truth_indices = np.array(
        [label_to_index[label] for label in val_bottlenecks['class'].values])
    num_train_samples = len(train_bottleneck_values)
    num_val_samples = len(val_bottleneck_values)

    params = {
        # NOTE(review): `truncated_normal` is passed uninstantiated while the
        # two He initializers are called; left as found -- confirm that the
        # classifier expects this.
        'initializer': [
            tf.initializers.he_normal(),
            tf.initializers.he_uniform(), tf.initializers.truncated_normal
        ],
        'activation': [tf.nn.leaky_relu, tf.nn.elu],
        'optimizer': [
            tf.train.MomentumOptimizer(learning_rate=static_learning_rate,
                                       momentum=momentum_const,
                                       use_nesterov=True),
            tf.train.AdamOptimizer(learning_rate=static_learning_rate,
                                   beta1=adam_beta1,
                                   beta2=adam_beta2,
                                   epsilon=adam_epsilon)
        ],
        'train_batch_size': [20, 60, 100]
    }
    num_epochs = 10  #10,000
    eval_freq = 5  #10
    early_stopping_eval_freq = 5
    ckpt_freq = 0

    mem_leak_test_clf = MemoryLeakTestClassifier(train_from_bottlenecks=True,
                                                 num_classes=len(class_labels))
    custom_cv_splitter = CrossValidationSplitter(train_size=num_train_samples,
                                                 test_size=num_val_samples,
                                                 n_splits=1)
    grid_search = GridSearchCV(mem_leak_test_clf,
                               params,
                               cv=custom_cv_splitter,
                               verbose=2,
                               refit=False,
                               n_jobs=1,
                               return_train_score=False)
    tf.logging.info('Running GridSearch...')
    # Training rows come first and validation rows second in both arrays.
    X = np.concatenate((train_bottleneck_values, val_bottleneck_values))
    y = np.concatenate((train_bottleneck_ground_truth_indices,
                        val_bottleneck_ground_truth_indices))
    grid_search.fit(X=X,
                    y=y,
                    num_epochs=num_epochs,
                    eval_freq=eval_freq,
                    ckpt_freq=ckpt_freq,
                    early_stopping_eval_freq=early_stopping_eval_freq,
                    fed_bottlenecks=True,
                    X_val=val_bottleneck_values,
                    y_val=val_bottleneck_ground_truth_indices)
    tf.logging.info(
        msg='Finished GridSearch! Restoring best performing parameter set...')
def main(run_config):
    """Write summary statistics about the bottleneck dataset to a text file.

    The report is written to ``bottleneck_executor_summary_stats.txt`` inside
    ``run_config['logging_dir']``. It contains the partition sizes, the number
    of classes, and min / max / mean / median / mode of the number of
    bottleneck vectors per class.

    :param run_config: mapping providing the 'image_dir', 'logging_dir' and
        'bottleneck_path' entries.
    :return: None
    """
    bottleneck_executor = BottleneckExecutor(
        image_dir=run_config['image_dir'],
        logging_dir=run_config['logging_dir'],
        tfhub_module_url='https://tfhub.dev/google/imagenet/inception_v3/feature_vector/1',
        compressed_bottleneck_file_path=run_config['bottleneck_path']
    )
    all_bottlenecks = bottleneck_executor.get_bottlenecks()
    class_labels = list(all_bottlenecks['class'].unique())
    train_bottlenecks, val_bottlenecks, test_bottlenecks = bottleneck_executor.get_partitioned_bottlenecks(
        train_percent=.80, val_percent=.20, test_percent=.20, random_state=42
    )

    num_train_bottlenecks = train_bottlenecks.shape[0]
    num_val_bottlenecks = val_bottlenecks.shape[0]
    num_test_bottlenecks = test_bottlenecks.shape[0]

    # Count rows per class in a single pass, then list the counts in
    # `class_labels` order so that tie ordering below stays stable.
    rows_per_label = all_bottlenecks['class'].value_counts()
    vectors_per_class = [int(rows_per_label.get(label, 0)) for label in class_labels]

    # Export summary stats:
    stats_path = os.path.join(run_config['logging_dir'], 'bottleneck_executor_summary_stats.txt')
    with open(stats_path, 'w') as fp:
        fp.write('Number of Training Bottlenecks: %d\n' % num_train_bottlenecks)
        fp.write('Number of Validation Bottlenecks: %d\n' % num_val_bottlenecks)
        fp.write('Number of Testing Bottlenecks: %d\n' % num_test_bottlenecks)
        fp.write('\n')
        fp.write('Number of Unique Bottleneck Class Labels: %d\n' % len(class_labels))
        fp.write('Total Number of Bottleneck Vectors: %d\n' % all_bottlenecks.shape[0])
        fp.write('Enforced Minimum Number of Bottleneck Vectors per Class: %d\n' % 20)
        fp.write('Actual Minimum Number of Bottleneck Vectors per Class: %d\n' % min(vectors_per_class))
        fp.write('Enforced Maximum Number of Bottleneck Vectors per Class: 2^{27} - 1 ~= 134 M\n')
        fp.write('Actual Maximum Number of Bottleneck Vectors per Class: %d\n' % max(vectors_per_class))
        fp.write('Mean Number of Bottleneck Vectors per Class: %.2f\n' % np.mean(vectors_per_class))
        fp.write('Median Number of Bottleneck Vectors per Class: %.2f\n' % np.median(vectors_per_class))

        # Detect ties with Counter rather than waiting for statistics.mode to
        # raise: since Python 3.8 it no longer raises for multimodal data, so
        # the two-mode report would otherwise never be produced.
        top_two = Counter(vectors_per_class).most_common(2)
        has_tie = len(top_two) == 2 and top_two[0][1] == top_two[1][1]
        if has_tie:
            print('Note: Two most equally common values: %s' % top_two)
            fp.write('Mode_1 Number of Bottleneck Vectors per Class: %d (%d counts)\n' % (top_two[0][0], top_two[0][1]))
            fp.write('Mode_2 Number of Bottleneck Vectors per Class: %d (%d counts)\n' % (top_two[1][0], top_two[1][1]))
        else:
            mode_value, mode_count = top_two[0]
            fp.write('Mode Number of Bottleneck Vectors per Class: %d (%d counts)\n' % (mode_value, mode_count))

    print('Wrote Bottleneck Executor summary statistics to \'%s\'' % stats_path)

    # TODO: calculate the prior probabilities using the class distribution in
    # the training dataset (train_bottlenecks['class'].value_counts()).

    print()
Пример #7
0
def main(run_config):
    """Load grid-search results and per-class accuracies, then draw the plots.

    Only the 'training' and 'validation' processes are supported; 'testing'
    and any other value raise ``NotImplementedError`` before any plot is
    drawn. For 'validation', the training-run accuracies are also loaded via
    the module-level ``run_configs`` mapping (defined outside this function).

    :param run_config: mapping providing 'top_1_per_class_acc_json_path',
        'hyperparam_df_path', 'image_dir', 'logging_dir', 'bottleneck_path',
        'process' and 'dataset'. 'top_5_per_class_acc_json_path' is used when
        present.
    :raises NotImplementedError: if 'process' is neither 'training' nor
        'validation'.
    :return: None
    """
    top_1_path = run_config['top_1_per_class_acc_json_path']
    try:
        # Prefer the explicit top-5 path, as the validation branch does.
        top_5_path = run_config['top_5_per_class_acc_json_path']
    except KeyError:
        # Legacy fallback: derive the top-5 path from the top-1 path. This
        # replaces EVERY '1' in the path, so it is only reliable when no
        # other component of the path contains that digit.
        top_5_path = top_1_path.replace('1', '5')

    with open(top_1_path, 'r') as fp:
        top_1_acc_by_class_df = pd.read_json(fp, orient='index')

    with open(top_5_path, 'r') as fp:
        top_5_acc_by_class_df = pd.read_json(fp, orient='index')

    gs_hyperparams_df = pd.read_pickle(run_config['hyperparam_df_path'])
    optimizers = gs_hyperparams_df.optimizer.unique()
    num_optimizers = len(optimizers)
    print('Optimizers: %s' % optimizers.categories)

    activations = gs_hyperparams_df.activation.unique()
    num_activations = len(activations)
    print('Activations: %s' % activations.categories)

    train_batch_sizes = gs_hyperparams_df.train_batch_size.unique()
    num_train_batch_sizes = len(train_batch_sizes)
    print('Train Batch Sizes: %s' % train_batch_sizes)

    initializers = gs_hyperparams_df.initializer.unique()
    num_initializers = len(initializers)
    print('Initializers: %s' % initializers.categories)

    heatmap_dims = ((num_activations * num_optimizers), (num_initializers * num_train_batch_sizes))
    data = np.zeros(heatmap_dims)
    print('HeatMap Dimensions: %s' %(data.shape,))

    print('Columns: %s\n' % gs_hyperparams_df.columns.values)

    bottleneck_executor = BottleneckExecutor(
        image_dir=run_config['image_dir'],
        logging_dir=run_config['logging_dir'],
        tfhub_module_url='https://tfhub.dev/google/imagenet/inception_v3/feature_vector/1',
        compressed_bottleneck_file_path=run_config['bottleneck_path']
    )
    # The full dataframe is not needed here; the call is kept because it ran
    # before the partitions were requested.
    # NOTE(review): confirm whether get_partitioned_bottlenecks depends on it.
    bottleneck_executor.get_bottlenecks()
    train_bottlenecks, val_bottlenecks, test_bottlenecks = bottleneck_executor.get_partitioned_bottlenecks()

    # The training-run references stay None unless plotting a validation run.
    training_bottlenecks_df = None
    training_top_1_acc_by_class_df = None
    training_top_5_acc_by_class_df = None
    process = run_config['process'].lower()
    if process == 'training':
        bottlenecks_df = train_bottlenecks
    elif process == 'validation':
        training_bottlenecks_df = train_bottlenecks
        proceeding_run_config = run_configs[run_config['dataset']]['train']
        with open(proceeding_run_config['top_1_per_class_acc_json_path'], 'r') as fp:
            training_top_1_acc_by_class_df = pd.read_json(fp, orient='index')
        with open(proceeding_run_config['top_5_per_class_acc_json_path'], 'r') as fp:
            training_top_5_acc_by_class_df = pd.read_json(fp, orient='index')
        bottlenecks_df = val_bottlenecks
    elif process == 'testing':
        raise NotImplementedError("training_bottlenecks_df = train_bottlenecks.join(val_bottlenecks)")
    else:
        raise NotImplementedError

    # Only 'training' and 'validation' reach this point, so the plot calls
    # below need no further dispatch on the process.
    dataset = run_config['dataset']
    process_name = run_config['process']

    # per-class top-1 accuracy (Box Plot):
    plot_boxplot_per_class_top_one_acc(top_1_acc_by_class_df=top_1_acc_by_class_df, dataset=dataset, process=process_name)

    # per-class top-5 accuracy (Horizontal Bar Plot):
    plot_boxplot_per_class_top_five_acc(top_5_acc_by_class_df, dataset=dataset, process=process_name)

    # per-class top-1 accuracy (Box Plot with Aggregation):
    plot_boxplot_per_class_top_one_acc_aggregated(top_1_acc_by_class_df, dataset=dataset, process=process_name)

    # per-class top-5 accuracy (Box Plot with Aggregation):
    plot_boxplot_per_class_top_five_acc_aggregated(top_5_acc_by_class_df, dataset=dataset, process=process_name)

    # Plot number of samples per-class (colorbar on existing) vs class's top-1 acc
    plot_per_class_top_one_acc_vs_number_of_samples_aggregated(
        top_1_acc_by_class_df, bottlenecks_df,
        training_top_1_acc_by_class_df=training_top_1_acc_by_class_df,
        training_bottlenecks_df=training_bottlenecks_df,
        dataset=dataset, process=process_name)

    # Plot number of samples per-class (colorbar on existing) vs class's top-5 acc
    plot_per_class_top_five_acc_vs_number_of_samples_aggregated(
        top_5_acc_by_class_df, bottlenecks_df,
        training_top_5_acc_by_class_df=training_top_5_acc_by_class_df,
        training_bottlenecks_df=training_bottlenecks_df,
        dataset=dataset, process=process_name)

    # Plot each hyperparameter on y-axis and then training time on the left-axis.
    plot_boxplot_hyperparameters_vs_training_time(gs_hyperparams_df, dataset=dataset, process=process_name)

    # per-class top-1 accuracy (scatter):
    plot_scatter_per_class_top_one_acc(top_1_acc_by_class_df, top_5_acc_by_class_df, bottlenecks_df, dataset=dataset, process=process_name)
Пример #8
0
def main(run_config):
    """Repeatedly build, compile and fit a small Keras model in one process.

    Runs ``num_params`` iterations. Each iteration clears the Keras session,
    runs the garbage collector, picks an activation / optimizer / training
    batch size by cycling through the option lists with the iteration index,
    rebuilds the model and fits it for ``num_epochs`` epochs.

    :param run_config: mapping providing the 'image_dir', 'logging_dir' and
        'bottleneck_path' entries.
    :return: None
    """
    num_params = 100
    num_epochs = 100

    train_from_bottlenecks = True
    activations = ['elu', 'relu', 'tanh']
    optimizers = [
        tf.train.AdamOptimizer(learning_rate=0.001,
                               beta1=0.9,
                               beta2=0.999,
                               epsilon=1e-08),
        tf.train.MomentumOptimizer(learning_rate=0.001,
                                   momentum=0.9,
                                   use_nesterov=True)
    ]
    train_batch_sizes = [20, 60, 100]

    bottleneck_executor = BottleneckExecutor(
        image_dir=run_config['image_dir'],
        logging_dir=run_config['logging_dir'],
        tfhub_module_url=
        'https://tfhub.dev/google/imagenet/inception_v3/feature_vector/1',
        compressed_bottleneck_file_path=run_config['bottleneck_path'])

    all_bottlenecks = bottleneck_executor.get_bottlenecks()
    class_labels = list(all_bottlenecks['class'].unique())
    train_bottlenecks, val_bottlenecks, test_bottlenecks = bottleneck_executor.get_partitioned_bottlenecks(
    )

    # One dict lookup per sample instead of a linear `list.index` scan.
    label_to_index = {label: index for index, label in enumerate(class_labels)}

    train_bottleneck_values = np.array(train_bottlenecks['bottleneck'].tolist())
    # Encode each class label as its position in `class_labels`.
    train_bottleneck_ground_truth_indices = np.array(
        [label_to_index[label] for label in train_bottlenecks['class'].values])
    val_bottleneck_values = np.array(val_bottlenecks['bottleneck'].tolist())
    val_bottleneck_ground_truth_indices = np.array(
        [label_to_index[label] for label in val_bottlenecks['class'].values])
    num_train_samples = len(train_bottleneck_values)
    num_val_samples = len(val_bottleneck_values)
    # Label indices are positions in the FULL class list, so the output layer
    # must be sized by it; counting only the classes present in the training
    # split would be too small whenever a class is missing from that split.
    num_classes = len(class_labels)

    X_train, y_train = train_bottleneck_values, train_bottleneck_ground_truth_indices
    X_valid, y_valid = val_bottleneck_values, val_bottleneck_ground_truth_indices

    for i in range(num_params):
        tf.keras.backend.clear_session()
        tf.logging.warning('Cleared Keras\' back-end session.')

        gc.collect()
        tf.logging.warning('Ran garbage collector.')

        # Cycle through each option list using the iteration index.
        current_activation = activations[i % len(activations)]
        current_optimizer = optimizers[i % len(optimizers)]
        current_train_batch_size = train_batch_sizes[i %
                                                     len(train_batch_sizes)]

        base_model = InceptionV3(include_top=False,
                                 weights='imagenet',
                                 input_shape=(299, 299, 3))

        for layer in base_model.layers:
            layer.trainable = False

        if not train_from_bottlenecks:
            # Full pipeline: images -> InceptionV3 -> pooled features -> head.
            x = base_model.output
            bottlenecks = GlobalAveragePooling2D()(x)
            logits = Dense(num_classes,
                           activation=current_activation,
                           name='logits')(bottlenecks)
            y_proba = Dense(num_classes, activation='softmax')(logits)
            _keras_model = Model(inputs=base_model.input, outputs=y_proba)
        else:
            # Head only: the input width is the base model's last output
            # dimension and the bottleneck vectors are fed from memory.
            bottlenecks = Input(shape=(base_model.output_shape[-1], ),
                                name='bottleneck')
            logits = Dense(num_classes,
                           activation=current_activation,
                           name='logits')(bottlenecks)
            y_proba = Dense(num_classes, activation='softmax',
                            name='y_proba')(logits)
            # This is the model that is actually trained, if bottlenecks are being fed from memory:
            _keras_model = Model(inputs=bottlenecks, outputs=y_proba)

        _keras_model.compile(optimizer=current_optimizer,
                             loss='categorical_crossentropy',
                             metrics=['accuracy'])
        tf.logging.info(msg='Compiled Keras model.')
        train_ds = _tf_data_generator_from_memory(
            image_bottlenecks=X_train,
            image_encoded_labels=y_train,
            is_training=True,
            num_classes=num_classes,
            train_batch_size=current_train_batch_size,
            val_batch_size=num_val_samples)
        val_ds = _tf_data_generator_from_memory(
            image_bottlenecks=X_valid,
            image_encoded_labels=y_valid,
            is_training=False,
            num_classes=num_classes,
            train_batch_size=current_train_batch_size,
            val_batch_size=num_val_samples)

        train_steps_per_epoch = math.ceil(num_train_samples /
                                          current_train_batch_size)
        # The validation batch size equals the validation set size, so this
        # evaluates to a single step.
        val_steps_per_epoch = math.ceil(num_val_samples / num_val_samples)

        _keras_model.fit(train_ds.make_one_shot_iterator(),
                         validation_data=val_ds.make_one_shot_iterator(),
                         epochs=num_epochs,
                         steps_per_epoch=train_steps_per_epoch,
                         validation_steps=val_steps_per_epoch,
                         callbacks=[])
Пример #9
0
def main(run_config):
    """Run the in-memory hyperparameter grid search over cached bottlenecks.

    Prepares the TensorBoard summary directory, loads the bottleneck table
    through a ``BottleneckExecutor``, converts the training and validation
    partitions into numpy arrays (bottleneck values plus integer class
    indices) and passes them to ``_run_grid_search_from_memory``.

    Args:
        run_config: Mapping that supplies the ``'image_dir'``,
            ``'logging_dir'``, ``'bottleneck_path'`` and ``'dataset'``
            entries read below.
    """
    # Every backslash is escaped explicitly. The previous literal contained a
    # bare ``\D``, which is an invalid escape sequence (a SyntaxWarning on
    # recent Python versions); the resulting path string is unchanged.
    # NOTE(review): this is a machine-specific absolute path.
    tb_log_dir = 'C:\\Users\\ccamp\\Documents\\GitHub\\HerbariumDeep\\frameworks\\TensorFlow\\TFHub\\tmp\\summaries'
    _prepare_tensor_board_directories(tb_summaries_dir=tb_log_dir,
                                      intermediate_output_graphs_dir=None)
    bottleneck_executor = BottleneckExecutor(
        image_dir=run_config['image_dir'],
        logging_dir=run_config['logging_dir'],
        tfhub_module_url=
        'https://tfhub.dev/google/imagenet/inception_v3/feature_vector/1',
        compressed_bottleneck_file_path=run_config['bottleneck_path'])

    # Hyperparameter axes of the grid search.
    initializer_options = get_initializer_options()
    activation_options = get_activation_options(leaky_relu_alpha=0.2)
    # Per the original author's note: all of these arguments must be supplied
    # or a KeyError will be thrown.
    optimizer_options = get_optimizer_options(static_learning_rate=0.001,
                                              momentum_const=0.9,
                                              adam_beta1=0.9,
                                              adam_beta2=0.999,
                                              adam_epsilon=1e-08)

    all_bottlenecks = bottleneck_executor.get_bottlenecks()
    # The position of a label in this list is its integer class index.
    class_labels = list(all_bottlenecks['class'].unique())
    train_bottlenecks, val_bottlenecks, test_bottlenecks = bottleneck_executor.get_partitioned_bottlenecks(
    )

    def _values_and_class_indices(partition):
        """Return (bottleneck value array, integer class-index array)."""
        values = np.array(partition['bottleneck'].tolist())
        # Labels are encoded as their index into ``class_labels`` (an integer
        # encoding, not a one-hot matrix).
        indices = np.array([
            class_labels.index(ground_truth_label)
            for ground_truth_label in partition['class'].values
        ])
        return values, indices

    train_bottleneck_values, train_bottleneck_ground_truth_indices = \
        _values_and_class_indices(train_bottlenecks)
    val_bottleneck_values, val_bottleneck_ground_truth_indices = \
        _values_and_class_indices(val_bottlenecks)

    # A from-drive variant (``_run_grid_search_from_drive``) was previously
    # sketched here; it was fed the partitions' ``'path'`` columns instead of
    # the in-memory bottleneck arrays. Only the in-memory search is run.
    _run_grid_search_from_memory(
        dataset=run_config['dataset'],
        train_bottlenecks=train_bottleneck_values,
        train_ground_truth_indices=train_bottleneck_ground_truth_indices,
        val_bottlenecks=val_bottleneck_values,
        val_ground_truth_indices=val_bottleneck_ground_truth_indices,
        initializers=initializer_options,
        activations=activation_options,
        optimizers=optimizer_options,
        class_labels=class_labels,
        tb_log_dir=tb_log_dir)
Пример #10
0
def _resolve_trained_model_dir(saved_model_path):
    """Return ``<saved_model_path>/<first entry>/trained_model``."""
    # NOTE(review): os.listdir() yields entries in arbitrary order, so with
    # more than one entry in the directory the choice is not deterministic,
    # and an empty directory raises IndexError. Behaviour kept as-is.
    first_entry = os.listdir(saved_model_path)[0]
    return os.path.join(saved_model_path, first_entry, 'trained_model')


def _report_average_accuracies(tfh_classifier, bottlenecks, class_labels, set_name, file_tag):
    """Print the mean per-class top-1/top-5 accuracy and dump both tables to JSON.

    Args:
        tfh_classifier: Classifier exposing the ``calculate_class_*`` methods.
        bottlenecks: Bottleneck table of the partition being evaluated.
        class_labels: List of all class labels.
        set_name: Name used in the printed messages (e.g. ``'testing'``).
        file_tag: Name used in the output file names (e.g. ``'test'``).
    """
    class_top_1_accuracies = tfh_classifier.calculate_class_top_1_accuracies(
        current_process_bottlenecks=bottlenecks, class_labels=class_labels)
    # The PPV table is not used for this report; the call is kept (in its
    # original position) in case the classifier relies on it having run.
    tfh_classifier.calculate_class_top_1_positive_predictive_values(
        current_process_bottlenecks=bottlenecks, class_labels=class_labels)
    class_top_5_accuracies = tfh_classifier.calculate_class_top_5_accuracies(
        bottlenecks=bottlenecks, class_labels=class_labels)

    top_1_accs = [info['top_1_acc'] for info in class_top_1_accuracies.values()]
    top_5_accs = [info['top_5_acc'] for info in class_top_5_accuracies.values()]
    print('Average top-1 Accuracy (%s set): %.2f%%' % (set_name, sum(top_1_accs) / len(top_1_accs)))
    print('Average top-5 Accuracy (%s set): %.2f%%' % (set_name, sum(top_5_accs) / len(top_5_accs)))

    with open('top_1_accuracies_by_class_%s_set.json' % file_tag, 'w') as fp:
        json.dump(class_top_1_accuracies, fp, indent=4, separators=(',', ': '))
    with open('top_5_accuracies_by_class_%s_set.json' % file_tag, 'w') as fp:
        json.dump(class_top_5_accuracies, fp, indent=4, separators=(',', ': '))


def _report_classifier_impact(tfh_classifier, val_bottlenecks, class_labels, threshold=95):
    """Report which classes reach the top-1 PPV threshold on the validation set.

    Prints every class whose ``'top_1_ppv'`` is at or above ``threshold``
    together with its sample count, then asks the classifier for the top-5
    accuracy of the samples belonging to the remaining classes.
    """
    # The accuracy tables are not used for this report; the calls are kept in
    # their original order in case the classifier relies on them having run.
    tfh_classifier.calculate_class_top_1_accuracies(
        current_process_bottlenecks=val_bottlenecks, class_labels=class_labels)
    class_top_1_positive_predictive_values = tfh_classifier.calculate_class_top_1_positive_predictive_values(
        current_process_bottlenecks=val_bottlenecks, class_labels=class_labels)
    tfh_classifier.calculate_class_top_5_accuracies(
        bottlenecks=val_bottlenecks, class_labels=class_labels)

    print('Calculating classifier impact: ')
    total_num_samples = val_bottlenecks.shape[0]
    ppv_viable_classes = []
    num_samples_ppv_classified = 0
    for clss, info in class_top_1_positive_predictive_values.items():
        if info['top_1_ppv'] >= threshold:
            ppv_viable_classes.append(clss)
            num_samples_ppv_classified += info['num_current_process_samples']
            print('\tClass \'%s\' (%d) can be classified automatically, with [%d/%d] the total number of samples' % (clss, class_labels.index(clss), info['num_current_process_samples'], total_num_samples))
    print('If the classifier only issues predictions for class labels whose top-1 PPV is at or above a threshold of %.2f%%, then %d samples can be classified automatically.' % (threshold, num_samples_ppv_classified))
    print('Of the remaining classes, the top-5 accuracy is:')
    # The return value was never consumed by the original code; presumably the
    # method reports the figure itself -- TODO confirm.
    tfh_classifier.calculate_top_5_acc_of_remaining_samples(
        current_process_bottlenecks=val_bottlenecks,
        class_labels=class_labels,
        positive_predictive_value_exceeds_threshold_class_labels=ppv_viable_classes)


def main(run_config):
    """Evaluate a previously trained classifier on the current process's data.

    ``run_config['process']`` selects the evaluation:

    * ``'Validation'`` -- uses the model saved by the ``'train'`` run and
      prints the classifier-impact report for the validation partition.
    * ``'Testing'`` -- uses the model saved by the ``'val'`` run, prints the
      average top-1/top-5 accuracies for the test partition and writes the
      per-class tables to JSON files in the working directory.
    * any other value -- prints an error message and returns.

    Args:
        run_config: Mapping that supplies the ``'process'``, ``'dataset'``,
            ``'saved_model_path'``, ``'image_dir'``, ``'logging_dir'`` and
            ``'bottleneck_path'`` entries read below.

    Raises:
        NotImplementedError: If ``run_config['process']`` is ``'Training'``.
    """
    process = run_config['process']
    if process == 'Training':
        raise NotImplementedError('This is not a fair evaluation')

    # Each evaluation uses the model produced by the run that precedes it.
    preceding_process = {'Validation': 'train', 'Testing': 'val'}.get(process)
    if preceding_process is None:
        # Report the problem up front instead of failing later on the
        # ``run_configs`` lookup with a ``None`` key.
        print('ERROR: Could not identify process designation')
        return

    preceding_process_run_config = run_configs[run_config['dataset']][preceding_process]
    preceding_process_model_path = _resolve_trained_model_dir(preceding_process_run_config['saved_model_path'])
    current_process_model_path = _resolve_trained_model_dir(run_config['saved_model_path'])

    preceding_process_model_label_file_path = os.path.join(preceding_process_model_path, 'class_labels.txt')
    current_process_model_label_file_path = os.path.join(current_process_model_path, 'class_labels.txt')

    bottleneck_executor = BottleneckExecutor(
        image_dir=run_config['image_dir'],
        logging_dir=run_config['logging_dir'],
        tfhub_module_url='https://tfhub.dev/google/imagenet/inception_v3/feature_vector/1',
        compressed_bottleneck_file_path=run_config['bottleneck_path']
    )
    all_bottlenecks = bottleneck_executor.get_bottlenecks()
    class_labels = list(all_bottlenecks['class'].unique())
    _train_bottlenecks, val_bottlenecks, test_bottlenecks = bottleneck_executor.get_partitioned_bottlenecks()

    # Load trained classifier:
    tfh_classifier = TrainedTFHClassifier(
        preceding_model_path=os.path.join(preceding_process_model_path, 'inference'),
        preceding_model_label_file_path=preceding_process_model_label_file_path,
        current_model_label_file_path=current_process_model_label_file_path)

    if process == 'Validation':
        _report_classifier_impact(tfh_classifier, val_bottlenecks, class_labels, threshold=95)
    else:  # 'Testing'
        _report_average_accuracies(tfh_classifier, test_bottlenecks, class_labels,
                                   set_name='testing', file_tag='test')