def _coerce_param(value):
    """Return ``value`` parsed as int, else float, else unchanged."""
    # Values that are already numeric are left alone so that a float such as
    # 0.5 is not silently truncated to 0 by ``int``.
    if isinstance(value, (bool, int, float)):
        return value
    for cast in (int, float):
        try:
            return cast(value)
        except (TypeError, ValueError):
            # TypeError covers values like None or lists, which ``int`` and
            # ``float`` reject outright; they are passed through untouched.
            continue
    return value


def config_classifier(clf, params=None):
    """Process the args for the classifier

    String parameter values are converted to ``int`` when possible, otherwise
    to ``float`` when possible, otherwise left unchanged. The ``params``
    mapping passed in is never modified.

    Args:
        clf (str): name of sklearn-like classifier object. Currently only
            'insilico' and 'random' are supported
        params (dict): keyword arguments used to initialize the classifier
            object. ``None`` or an empty dict means no arguments.

    Returns:
        classifier: Sklearn-like classifier with fit and predict methods

    Raises:
        ValueError: if ``clf`` is not a supported classifier name.
    """
    # Build a fresh dict rather than writing coerced values back into the
    # caller's mapping.
    kwargs = {k: _coerce_param(v) for k, v in (params or {}).items()}

    if clf == 'insilico':
        return InsilicoClassifier(**kwargs)
    if clf == 'random':
        return RandomClassifier(**kwargs)
    raise ValueError('Unknown classifier: "{}"'.format(clf))
def test_train_gs_col_not_found(df):
    """``extract_prob`` is expected to raise ``RRuntimeError`` when the
    gold-standard column name passed to it is ``'foo'``."""
    classifier = InsilicoClassifier()
    gold_standards = list(df['GS'].unique())
    with pandas2ri_activated(), pytest.raises(RRuntimeError):
        classifier.r_insilico.extract_prob(df, 'foo', gold_standards)
def test_predict_with_numeric_encoding(df, params):
    """``insilico`` called with ``isNumeric=True`` on data recoded to
    1 / 0 / -1 is expected to raise ``RRuntimeError``."""
    classifier = InsilicoClassifier()
    # Recode 'Y' -> 1, '' -> 0 and '.' -> -1 throughout the frame.
    numeric_codes = {'Y': 1, '': 0, '.': -1}
    encoded = df.replace(numeric_codes)
    with pandas2ri_activated(), pytest.raises(RRuntimeError):
        classifier.r_insilico.insilico(encoded, isNumeric=True, **params)
def test_no_demographics(df, params, mask, context):
    """Run ``insilico`` after blanking the columns selected by ``mask``.

    ``context`` is the context manager that wraps the call (supplied by the
    caller; it decides whether an error is expected).
    """
    classifier = InsilicoClassifier()
    df.loc[:, mask] = ''
    predictors = df.drop('GS', axis=1)
    with pandas2ri_activated(), context:
        classifier.r_insilico.insilico(predictors, **params)
def test_train_with_wrong_learning_type(df, learning_type, context):
    """Call ``extract_prob`` with the given ``type`` keyword.

    A falsy ``learning_type`` means the ``type`` keyword is omitted entirely.
    ``context`` is the context manager that wraps the call.
    """
    classifier = InsilicoClassifier()
    gold_standards = list(df['GS'].unique())

    if learning_type:
        extra_kwargs = {'type': learning_type}
    else:
        extra_kwargs = dict()

    with pandas2ri_activated(), context:
        classifier.r_insilico.extract_prob(df, 'GS', gold_standards,
                                           **extra_kwargs)
def test_train_all_one_value(df, val, context):
    """Call ``extract_prob`` (``thre=0``) on a frame whose every cell is
    ``val``; ``context`` is the context manager that wraps the call."""
    classifier = InsilicoClassifier()
    # Capture the gold-standard labels before the frame is overwritten,
    # since the overwrite below also replaces the 'GS' column.
    gold_standards = list(df['GS'].unique())
    df.loc[:, :] = val
    with pandas2ri_activated(), context:
        classifier.r_insilico.extract_prob(df, 'GS', gold_standards, thre=0)
def test_predict_with_all_injuries(df, params, mask, context):
    """Run ``insilico`` with the columns selected by ``mask`` all set to
    ``'Y'``; ``context`` is the context manager that wraps the call."""
    classifier = InsilicoClassifier()
    predictors = df.drop('GS', axis=1)
    predictors.loc[:, mask] = 'Y'
    with pandas2ri_activated(), context:
        classifier.r_insilico.insilico(predictors, **params)
def test_returns_rpy2_converted_package(self):
    """``get_r_insilico_package`` returns an rpy2 ``InstalledSTPackage``
    that exports ``insilico_fit`` and ``extract_prob`` as documented
    functions."""
    classifier = InsilicoClassifier()
    package = classifier.get_r_insilico_package()
    assert isinstance(package, rpy2.robjects.packages.InstalledSTPackage)

    # Same two checks for each exported function, in the original order.
    for func_name in ('insilico_fit', 'extract_prob'):
        assert func_name in package._exported_names
        assert isinstance(getattr(package, func_name),
                          rpy2.robjects.functions.DocumentedSTFunction)
def df():
    """Return the classifier's sample data with a synthetic 'GS' column.

    The sample data is re-indexed by its first column (which is also kept as
    a regular column). The added 'GS' column holds the labels 'A' to 'E',
    each repeated in one contiguous run of ``n_rows // 5`` rows.

    Returns:
        pandas.DataFrame: sample data plus the 'GS' column.
    """
    clf = InsilicoClassifier()
    data = clf.get_sample_data().reset_index()
    data = data.set_index(data.columns[0], drop=False)

    gs_list = ['A', 'B', 'C', 'D', 'E']
    gs_name = 'GS'
    # Floor division: np.repeat needs an integer repeat count, and ``/``
    # yields a float under Python 3.
    # NOTE(review): assumes the row count is a multiple of len(gs_list);
    # otherwise the column assignment below gets a length mismatch.
    repeats = data.shape[0] // len(gs_list)
    data[gs_name] = np.repeat(gs_list, repeats)
    return data
def test_train_drops_all_but_one_missing(df, thre, context):
    """
    If only one column is above the missingness threshold, all but one
    column is dropped. This results in a one-dimensional data structure.
    When R attempts to subset it in two dimensions it throws an error.
    """
    classifier = InsilicoClassifier()
    gold_standards = list(df['GS'].unique())

    # Mark every column missing except the ID and the first column
    df.iloc[:, 2:] = '.'

    with pandas2ri_activated(), context:
        classifier.r_insilico.extract_prob(df, 'GS', gold_standards,
                                           thre=thre)
def test_missingness_threshold(thre):
    """Run ``extract_prob`` with ``missingness_threshold=thre`` on data
    whose columns have 0%, 10%, ..., 100% of their rows marked missing."""
    clf = InsilicoClassifier()

    n_rows, n_cols = 100, 11
    row_labels = ['I{}'.format(i) for i in range(n_rows)]
    # Each column header states the percent of its rows set to missing.
    col_labels = ['{}0%'.format(i) for i in range(n_cols)]

    # Alternating 1s and 0s laid out over an odd number of columns give a
    # checkerboard pattern.
    checker = np.tile([1, 0], 550).reshape((n_rows, n_cols))
    df = pd.DataFrame(checker, index=row_labels, columns=col_labels)

    # Column k gets its first k * 10 rows marked missing (-1).
    for col in range(n_cols):
        df.iloc[np.arange(0, col * 10), col] = -1

    # The five letters repeat down the rows, so for every label some rows
    # are missing and others are not.
    letters = np.tile(['a', 'b', 'c', 'd', 'e'], 20)
    y = pd.Series(letters, index=df.index)

    clf.extract_prob(df, y, missingness_threshold=thre)

    # NOTE(review): the result of extract_prob is discarded and the column
    # selection below is computed but never compared against anything, so
    # this test asserts nothing. Presumably an assertion on the retained
    # columns was intended -- confirm.
    df.iloc[:, np.arange(int((thre + 0.05) * 10))].columns
def extracted_data(request):
    """Build a 20x5 frame and labels, run ``extract_prob`` and return all.

    ``request.param`` is a ``(key, value)`` pair. With key ``'gs'`` the value
    becomes the label series; any other key overrides that ``DataFrame``
    constructor argument ('data', 'index' or 'columns') and the default
    labels 'gold0'..'gold4' (four of each) are used.

    Returns:
        tuple: ``(probs, df, y)``.
    """
    clf = InsilicoClassifier()
    override_key, override_value = request.param

    # Four stacked 5x5 identity matrices.
    frame_kwargs = {
        'data': np.concatenate([np.eye(5)] * 4),
        'index': ['row{}'.format(i) for i in range(20)],
        'columns': ['col{}'.format(i) for i in range(5)],
    }

    if override_key == 'gs':
        y = pd.Series(override_value)
    else:
        label_codes = np.repeat(np.arange(5), 4)
        y = pd.Series(['gold{}'.format(x) for x in label_codes])
        frame_kwargs[override_key] = override_value

    df = pd.DataFrame(**frame_kwargs)
    y.index = df.index
    probs = clf.extract_prob(df, y)
    return probs, df, y
def test_train_drop_all_missing(df, thre, context):
    """
    The missingness threshold ``thre`` can be any number. It is interpreted
    as a proportion, so ideally it lies between zero and one. A threshold of
    zero or less means the user only wants columns with no missingness at
    all. A threshold of one or more means the user wants to keep every
    training column regardless of how much is missing.

    With a zero or negative threshold (i.e. no missingness allowed) and some
    missingness in every column, R will drop all columns and attempt to
    subset an empty data structure.
    """
    classifier = InsilicoClassifier()
    gold_standards = list(df['GS'].unique())

    # Mark the whole first row missing, except for the ID
    df.iloc[0, 1:] = '.'

    with pandas2ri_activated(), context:
        classifier.r_insilico.extract_prob(df, 'GS', gold_standards,
                                           thre=thre)
def test_predict_with_training_but_no_impossible_causes(
        df, params, mask, context):
    """Predict using an empirically trained conditional-probability table
    in which the rows selected by ``mask`` are set to 0.5.

    ``context`` is the context manager that wraps the ``insilico`` call.
    """
    clf = InsilicoClassifier()
    rbase = importr('base')
    symptoms = clf.get_insilico_symptoms()
    causes = clf.get_insilico_causes()

    test = df.drop('GS', axis=1)

    # Training copy whose gold standard cycles through the known causes.
    train = df.copy()
    cause_iter = itertools.cycle(causes)
    train['GS'] = [next(cause_iter) for _ in df.index]

    with pandas2ri_activated():
        probs = clf.r_insilico.extract_prob(train, 'GS', causes,
                                            type='empirical')
        r_cond_prob = rbase.data_frame(probs.rx2('cond.prob'))
        cond_prob = pandas2ri.ri2py(r_cond_prob)
        cond_prob = cond_prob.loc[symptoms, causes].fillna(0)
        cond_prob.loc[mask] = .5  # all non-zero (no impossible combos)

        with context:
            clf.r_insilico.insilico(test, CondProbNum=cond_prob.values,
                                    **params)
def test_environment():
    """Smoke test: fit with no training data, then predict the first five
    rows of the sample data."""
    classifier = InsilicoClassifier(auto_length=False)
    sample = classifier.get_sample_data()
    fitted = classifier.fit(None, None)
    fitted.predict(sample.iloc[:5])
def test_predict_with_thin_gt_nsim(df):
    """``insilico`` is expected to raise ``RRuntimeError`` when ``thin``
    (50) exceeds ``Nsim`` (10).

    Args:
        df: input data passed to ``insilico``. It is requested as an argument
            like in the sibling tests; without it the name ``df`` would
            resolve to the module-level ``df`` function and that function
            object would be sent to R instead of data.
    """
    clf = InsilicoClassifier()
    with pandas2ri_activated():
        with pytest.raises(RRuntimeError):
            clf.r_insilico.insilico(df, thin=50, Nsim=10)
def test_get_short_cause_map(self):
    """``get_insilico_short_causes`` returns a dict."""
    classifier = InsilicoClassifier()
    short_causes = classifier.get_insilico_short_causes()
    assert isinstance(short_causes, dict)
def test_get_symptoms(self):
    """``get_insilico_symptoms`` returns a list."""
    classifier = InsilicoClassifier()
    result = classifier.get_insilico_symptoms()
    assert isinstance(result, list)
def test_get_causes(self):
    """``get_insilico_causes`` returns a list."""
    classifier = InsilicoClassifier()
    result = classifier.get_insilico_causes()
    assert isinstance(result, list)
def test_all_tied(self, values):
    """``indiv_most_probable`` yields 'Undetermined' for the given
    three-element ``values``."""
    classifier = InsilicoClassifier()
    scores = pd.Series(values, index=['a', 'b', 'c'])
    winner = classifier.indiv_most_probable(scores)
    assert winner == 'Undetermined'
def test_get_cond_prob_num(self):
    """``get_cond_prob_num`` returns a pandas DataFrame."""
    classifier = InsilicoClassifier()
    cond_prob = classifier.get_cond_prob_num()
    assert isinstance(cond_prob, pd.DataFrame)
def test_tie_for_biggest(self, values, index, biggest):
    """``indiv_most_probable`` returns one of the labels in ``biggest``."""
    classifier = InsilicoClassifier()
    scores = pd.Series(values, index=index)
    winner = classifier.indiv_most_probable(scores)
    assert winner in biggest
def test_get_labels_map(labels, encoded):
    """``get_labels_map(labels)`` maps each item of ``encoded`` to the label
    at the same position."""
    classifier = InsilicoClassifier()
    expected = {code: label for code, label in zip(encoded, labels)}
    observed = classifier.get_labels_map(labels)
    assert expected == observed
def test_get_sample_data(self):
    """``get_sample_data`` returns a pandas DataFrame."""
    classifier = InsilicoClassifier()
    sample = classifier.get_sample_data()
    assert isinstance(sample, pd.DataFrame)
def fit_data():
    """Small data set with one injury observation

    Takes rows 30 to 39 (by position) of the sample data and fits them with
    ``insilico_fit``.

    Returns:
        tuple: ``(df, fit)`` -- the ten-row frame and the fit result.
    """
    classifier = InsilicoClassifier()
    row_positions = np.arange(30, 40)
    subset = classifier.get_sample_data().iloc[row_positions]
    fit = classifier.insilico_fit(subset, n_sim=1000, burn_in=100,
                                  auto_length=False)
    return subset, fit