def test_missingness_threshold(thre):
    """Run ``extract_prob`` on data whose columns have graded missingness.

    The frame has 100 rows and 11 columns filled with an alternating 1/0
    pattern. Column ``k`` is labelled ``'<k>0%'`` and has its first
    ``k * 10`` rows overwritten with -1 (the missing marker), so each column
    label states the percentage of that column which is missing.

    Args:
        thre: Value forwarded to ``extract_prob`` as
            ``missingness_threshold``.
    """
    n_rows, n_cols = 100, 11
    clf = InsilicoClassifier()

    # With an odd number of columns, laying the flat 1, 0, 1, 0, ... run out
    # row by row yields a checkerboard.
    checkerboard = np.tile([1, 0], 550).reshape((n_rows, n_cols))
    row_labels = ['I{}'.format(i) for i in range(n_rows)]
    col_labels = ['{}0%'.format(i) for i in range(n_cols)]
    df = pd.DataFrame(checkerboard, index=row_labels, columns=col_labels)

    # Blank out the leading rows of each column: none for the first column,
    # ten more for every column after it, all 100 for the last.
    for col_pos in range(n_cols):
        leading_rows = np.arange(col_pos * 10)
        df.iloc[leading_rows, col_pos] = -1

    # The five labels cycle with period 5 while rows go missing in blocks of
    # 10, so in every partially missing column each label occurs on both
    # missing and non-missing rows.
    labels = np.tile(['a', 'b', 'c', 'd', 'e'], 20)
    y = pd.Series(labels, index=df.index)

    clf.extract_prob(df, y, missingness_threshold=thre)

    # (thre + 0.05) * 10 equals thre * 10 + 0.5, so int() rounds thre * 10 to
    # the nearest whole number of leading columns instead of truncating.
    n_selected = int((thre + 0.05) * 10)
    # NOTE(review): this column selection is evaluated and then discarded, so
    # nothing is actually checked here. Presumably it is the expected set of
    # columns and a comparison against the classifier's output is missing --
    # TODO confirm and restore the assertion.
    df.iloc[:, np.arange(n_selected)].columns
def extracted_data(request):
    """Build a small data set, run ``extract_prob`` on it, and return both.

    The default frame is four 5x5 identity matrices stacked vertically
    (20 rows labelled ``row0``..``row19``, 5 columns labelled
    ``col0``..``col4``). The default labels are ``gold0``..``gold4``, each
    repeated four times in a row.

    ``request.param`` is unpacked as a ``(key, value)`` pair that overrides
    one piece of that setup:

    * ``key == 'gs'``: ``value`` replaces the default labels.
    * any other ``key``: ``value`` is passed to ``pd.DataFrame`` as the
      keyword argument ``key``, replacing the default ``data``, ``index`` or
      ``columns`` entry when ``key`` names one of them.

    NOTE(review): presumably registered as a parametrized pytest fixture;
    the decorator is not visible in this part of the file.

    Returns:
        tuple: ``(probs, df, y)`` where ``probs`` is whatever
        ``InsilicoClassifier.extract_prob(df, y)`` returns, ``df`` is the
        input frame, and ``y`` is the label series re-indexed to ``df.index``.
    """
    clf = InsilicoClassifier()
    key, value = request.param

    frame_kwargs = {
        'data': np.tile(np.eye(5), (4, 1)),
        'index': ['row{}'.format(i) for i in range(20)],
        'columns': ['col{}'.format(i) for i in range(5)],
    }

    if key == 'gs':
        # The parameter supplies the labels; the frame keeps its defaults.
        y = pd.Series(value)
    else:
        # The parameter overrides (or adds) a DataFrame constructor argument.
        frame_kwargs[key] = value
        label_ids = np.repeat(np.arange(5), 4)
        y = pd.Series(['gold{}'.format(x) for x in label_ids])

    df = pd.DataFrame(**frame_kwargs)
    # Align the labels with the frame's row labels, whatever they ended up as.
    y.index = df.index

    probs = clf.extract_prob(df, y)
    return probs, df, y