def test_random_undersample():
    """Check the ids selected by ``categories.random_undersample``.

    A 30-row test frame is built with ``c1_percentage=0.8``, its columns are
    standardized and the result is undersampled. The ids of the undersampled
    frame must match the expected list exactly, including order.
    """
    df = create_test_df(length=30, c1_percentage=0.8)
    standardized_df = categories.standardize_columns(df, 'id_column_name',
                                                     'category_column_name')
    rus_df = categories.random_undersample(standardized_df)
    print(rus_df)
    # NOTE(review): the expected ids presumably rely on a fixed random seed
    # inside random_undersample -- confirm.
    expected_ids = [13, 18, 3, 14, 20, 17, 24, 25, 26, 27, 28, 29]
    # Compare list against list. Wrapping the element-wise Series comparison
    # in list() would yield a non-empty list of booleans, which is always
    # truthy and would never fail on wrong ids.
    assert list(rus_df['id']) == expected_ids
def test_create_sets():
    """Check the sizes of the validation and testing sets from create_sets.

    ``categories.create_sets`` is called on a standardized 100-row test frame
    with the arguments ``10, 10``; both the validation and the testing set
    must then contain exactly 10 rows.
    """
    raw_df = create_test_df(length=100)
    std_df = categories.standardize_columns(
        raw_df, 'id_column_name', 'category_column_name')
    training, validation, testing = categories.create_sets(std_df, 10, 10)

    # Dump all three subsets so they show up in the captured test output.
    for subset in (training, validation, testing):
        print(subset)

    assert len(validation) == 10
    assert len(testing) == 10
def test_StratifiedShuffleSplit():
    """Check that a stratified 80/20 split keeps the categories balanced.

    The standardized default test frame is split with ``train_size=0.8`` and
    ``random_state=0``, stratified on the ``'category'`` column. The second
    split (test part) must hold 3 rows per category and the first split
    (train part) 12 rows per category.
    """
    df = create_test_df()
    standardized_df = categories.standardize_columns(df, 'id_column_name',
                                                     'category_column_name')
    splits = train_test_split(standardized_df,
                              train_size=0.8,
                              random_state=0,
                              stratify=standardized_df['category'])

    assert list(splits[1].groupby('category').size()) == [3, 3]
    assert list(splits[0].groupby('category').size()) == [12, 12]

    # Plain loop instead of a list comprehension: the prints are side
    # effects only, no list needs to be built.
    for split in splits:
        print(split, split.groupby('category').size())
isbi2016test_metadata_df = load_isbi2016_test_df(isbi2016test_metadata_path)
filtered_isic_metadata_df = filter_metadata(isic_metadata_df)

# Images to download: the set union of the filtered, MClass and ISBI 2016
# test indices.
# Index.union() is used rather than the "|" operator: "|" on a pandas Index
# is deprecated as a set operation and is an element-wise logical OR in
# pandas >= 2.0.
download_df = isic_metadata_df.loc[
    filtered_isic_metadata_df.index
    .union(mclass_metadata_df.index)
    .union(isbi2016test_metadata_df.index)]

df = download(download_df, images_base_path)

# Standardize the image DataFrame, i.e. select only the relevant columns and
# rename them to 'id' and 'category'.
std_df = standardize_columns(df, config_dict['id_column'],
                             config_dict['category_column'])
# In the ISBI 2016 test set there are two images without a clear category:
# - 'ISIC_0009959' with category 'indeterminate'
# - 'ISIC_0010454' with category 'indeterminate/malignant'
# In the official ground truth both are counted as malignant.
std_df.loc[std_df['category'].isin(['indeterminate',
                                    'indeterminate/malignant']),
           'category'] = 'malignant'
# 'ISIC_0011319' has no category. In the official ground truth it is treated
# as benign.
std_df.loc[std_df['id'] == 'ISIC_0011319.jpg', 'category'] = 'benign'

# Select the images which are in filtered but neither in mclass nor in
# isbi2016test (a set difference).
filtered_std_df = std_df.loc[filtered_isic_metadata_df.index.difference(
    mclass_metadata_df.index.union(isbi2016test_metadata_df.index))]
def test_standardize_columns():
    """Check that standardize_columns yields exactly 'id' and 'category'.

    A 2-row test frame is standardized and its column labels, in order, must
    be ``['id', 'category']``.
    """
    raw_df = create_test_df(length=2)
    result = categories.standardize_columns(
        raw_df, 'id_column_name', 'category_column_name')
    expected_columns = ['id', 'category']
    assert result.columns.tolist() == expected_columns
    print(result)