def test_missingness_threshold(thre):
    """Run ``extract_prob`` on data with a graded amount of missingness.

    The input frame has 100 rows and 11 columns filled with an alternating
    1/0 checkerboard. Column position ``k`` has its first ``k * 10`` rows
    overwritten with -1 (the value used here to stand for "missing"), and
    the column label spells out that percentage ('00%', '10%', ... '100%').

    Args:
        thre: value forwarded to ``extract_prob`` as
            ``missingness_threshold``.
    """
    n_rows, n_cols = 100, 11

    row_labels = ['I{}'.format(i) for i in range(n_rows)]
    col_labels = ['{}0%'.format(i) for i in range(n_cols)]

    # An odd column count makes the row-major 1, 0, 1, 0, ... fill shift by
    # one on every row, which yields the checkerboard.
    board = np.tile([1, 0], 550).reshape((n_rows, n_cols))
    df = pd.DataFrame(board, index=row_labels, columns=col_labels)

    # Blank out a growing prefix of each column: 0 rows in the first column,
    # 10 in the second, ... all 100 in the last.
    for col_pos in range(n_cols):
        missing_rows = np.arange(0, col_pos * 10)
        df.iloc[missing_rows, col_pos] = -1

    # The five labels repeat over and over, so for every label value some
    # of its rows fall in the missing prefix of a column and others do not.
    y = pd.Series(np.tile(['a', 'b', 'c', 'd', 'e'], 20), index=df.index)

    clf = InsilicoClassifier()
    clf.extract_prob(df, y, missingness_threshold=thre)

    # NOTE(review): the result of extract_prob is discarded and the column
    # selection below is evaluated but never compared to anything, so this
    # test only fails if one of the calls raises. Presumably an assertion
    # against the returned probabilities was intended - confirm.
    n_selected = int((thre + 0.05) * 10)
    df.iloc[:, np.arange(n_selected)].columns
def extracted_data(request):
    """Build a small input set and run ``extract_prob`` on it.

    ``request.param`` must unpack to a ``(key, value)`` pair:

    * ``key == 'gs'``: ``value`` becomes the label series and the default
      frame is used unchanged.
    * any other ``key``: ``value`` is passed to ``pd.DataFrame`` under that
      keyword (adding to, or replacing, the default ``data`` / ``index`` /
      ``columns`` arguments) and a default label series
      'gold0' ... 'gold4', each repeated four times in a row, is used.

    The default frame is four stacked 5x5 identity matrices with rows
    'row0' ... 'row19' and columns 'col0' ... 'col4'.

    Args:
        request: object exposing the ``(key, value)`` pair as ``.param``
            (presumably pytest's fixture request - the decorator is not
            visible from here).

    Returns:
        Tuple ``(probs, df, y)``: the value returned by ``extract_prob``,
        the input frame, and the label series re-indexed to the frame.
    """
    classifier = InsilicoClassifier()
    override_name, override_value = request.param

    frame_kwargs = dict(
        data=np.concatenate([np.eye(5)] * 4),
        index=['row{}'.format(n) for n in range(20)],
        columns=['col{}'.format(n) for n in range(5)],
    )

    if override_name == 'gs':
        labels = pd.Series(override_value)
    else:
        # Anything other than 'gs' is a DataFrame constructor argument.
        frame_kwargs[override_name] = override_value
        group_ids = np.repeat(np.arange(5), 4)
        labels = pd.Series(['gold{}'.format(g) for g in group_ids])

    frame = pd.DataFrame(**frame_kwargs)

    # Labels are aligned to the frame by position, not by their own index.
    labels.index = frame.index

    probs = classifier.extract_prob(frame, labels)
    return probs, frame, labels