def config_classifier(clf, params=None):
    """Build a classifier from its name and a dict of init parameters.

    Parameter values that are not already numbers are coerced to ``int`` when
    possible, otherwise to ``float``, otherwise left untouched. Values that
    are already numeric (including bools and floats) are passed through
    unchanged so they are never truncated. The caller's ``params`` dict is
    not modified.

    Args:
        clf (str): name of sklearn-like classifier object. Currently only
            'insilico' and 'random' are supported
        params (dict): params used to initialize the classifier object

    Returns:
        classifier: Sklearn-like classifier with fit and predict methods

    Raises:
        ValueError: if ``clf`` is not one of the supported names
    """
    import numbers

    kwargs = {}
    for key, value in (params or {}).items():
        if not isinstance(value, numbers.Number):
            # Try the narrowest numeric type first; fall through to the raw
            # value when neither conversion applies (e.g. 'abc' or None).
            for cast in (int, float):
                try:
                    value = cast(value)
                except (ValueError, TypeError):
                    continue
                break
        kwargs[key] = value

    if clf == 'insilico':
        clf = InsilicoClassifier(**kwargs)
    elif clf == 'random':
        clf = RandomClassifier(**kwargs)
    else:
        raise ValueError('Unknown classifier: "{}"'.format(clf))
    return clf
def test_train_gs_col_not_found(df):
    """R's extract_prob must raise when asked to use the column 'foo'.

    The label list is still built from the 'GS' column of ``df``; only the
    column name handed to R differs.
    """
    classifier = InsilicoClassifier()
    labels = list(df['GS'].unique())

    with pandas2ri_activated(), pytest.raises(RRuntimeError):
        classifier.r_insilico.extract_prob(df, 'foo', labels)
def test_predict_with_numeric_encoding(df, params):
    """Recode 'Y'/''/'.' as 1/0/-1 and expect R's insilico to raise.

    The call is made with ``isNumeric=True`` plus the supplied ``params``.
    """
    classifier = InsilicoClassifier()

    numeric_codes = {'Y': 1, '': 0, '.': -1}
    recoded = df.replace(numeric_codes)

    with pandas2ri_activated(), pytest.raises(RRuntimeError):
        classifier.r_insilico.insilico(recoded, isNumeric=True, **params)
def test_no_demographics(df, params, mask, context):
    """Blank out the columns selected by ``mask`` and run R's insilico.

    The outcome is governed by ``context``, a context manager supplied by the
    caller, which wraps the R call.
    """
    classifier = InsilicoClassifier()

    # Every row of the masked columns becomes the empty string.
    df.loc[:, mask] = ''

    with pandas2ri_activated(), context:
        classifier.r_insilico.insilico(df.drop('GS', axis=1), **params)
def test_train_with_wrong_learning_type(df, learning_type, context):
    """Call R's extract_prob with an optional ``type`` keyword.

    A falsy ``learning_type`` means the ``type`` keyword is omitted entirely.
    The outcome is governed by the ``context`` manager wrapping the R call.
    """
    classifier = InsilicoClassifier()
    labels = list(df['GS'].unique())

    if learning_type:
        extra_kwargs = {'type': learning_type}
    else:
        extra_kwargs = {}

    with pandas2ri_activated(), context:
        classifier.r_insilico.extract_prob(df, 'GS', labels, **extra_kwargs)
def test_train_all_one_value(df, val, context):
    """Overwrite every cell of ``df`` with ``val`` and call extract_prob.

    The label list is captured from 'GS' *before* the overwrite. The R call
    uses ``thre=0`` and runs inside the supplied ``context`` manager.
    """
    classifier = InsilicoClassifier()
    labels = list(df['GS'].unique())

    # Applied after the labels were read, so they keep the original values.
    df.loc[:, :] = val

    with pandas2ri_activated(), context:
        classifier.r_insilico.extract_prob(df, 'GS', labels, thre=0)
def test_predict_with_all_injuries(df, params, mask, context):
    """Set the columns selected by ``mask`` to 'Y' and run R's insilico.

    The 'GS' column is dropped first. The outcome is governed by the
    ``context`` manager wrapping the R call.
    """
    classifier = InsilicoClassifier()

    features = df.drop('GS', axis=1)
    features.loc[:, mask] = 'Y'

    with pandas2ri_activated(), context:
        classifier.r_insilico.insilico(features, **params)
 def test_returns_rpy2_converted_package(self):
     """Check the R package wrapper type and its two expected functions.

     Both 'insilico_fit' and 'extract_prob' must be listed in the package's
     exported names and be exposed as DocumentedSTFunction attributes.
     """
     package = InsilicoClassifier().get_r_insilico_package()
     assert isinstance(package, rpy2.robjects.packages.InstalledSTPackage)

     for func_name in ('insilico_fit', 'extract_prob'):
         assert func_name in package._exported_names
         assert isinstance(getattr(package, func_name),
                           rpy2.robjects.functions.DocumentedSTFunction)
def df():
    """Return the classifier's sample data with a synthetic 'GS' column.

    The sample data's index is reset and its first column is used as the
    index while also being kept as a regular column. The 'GS' column is
    filled with the labels 'A'..'E' in equal-sized consecutive runs; if the
    row count is not a multiple of five the final run is cut short so the
    column always matches the number of rows.
    """
    clf = InsilicoClassifier()
    data = clf.get_sample_data().reset_index()
    data = data.set_index(data.columns[0], drop=False)

    gs_list = ['A', 'B', 'C', 'D', 'E']
    gs_name = 'GS'
    n_rows = data.shape[0]
    # np.repeat needs an integer count; use ceiling division and trim so the
    # result has exactly one label per row.
    per_label = -(-n_rows // len(gs_list))
    data[gs_name] = np.repeat(gs_list, per_label)[:n_rows]

    return data
def test_train_drops_all_but_one_missing(df, thre, context):
    """
    If only one column is above the missingness threshold, all but one
    columns are dropped. This results in a one dimensional data structure.
    When R attempts to subset in two dimensions it throws an error.
    """
    classifier = InsilicoClassifier()
    labels = list(df['GS'].unique())

    # Everything from the third column onward is marked missing, leaving
    # only the ID and the first column intact.
    df.iloc[:, 2:] = '.'

    with pandas2ri_activated(), context:
        classifier.r_insilico.extract_prob(df, 'GS', labels, thre=thre)
def test_missingness_threshold(thre):
    """Run extract_prob on a frame whose columns have 0%..100% missing data.

    The frame is a 100 x 11 checker board of 1s and 0s. Column ``k`` has its
    first ``k * 10`` rows set to -1, and its header is that percentage.
    """
    classifier = InsilicoClassifier()

    row_labels = ['I{}'.format(i) for i in range(100)]
    col_labels = ['{}0%'.format(i) for i in range(11)]
    frame = pd.DataFrame(np.tile([1, 0], 550).reshape((100, 11)),
                         index=row_labels,
                         columns=col_labels)
    for col_pos in range(11):
        n_missing = col_pos * 10
        frame.iloc[np.arange(0, n_missing), col_pos] = -1

    # Five letters repeated over and over, so that for each target value
    # some observations are missing and others are not.
    targets = pd.Series(np.tile(['a', 'b', 'c', 'd', 'e'], 20),
                        index=frame.index)
    classifier.extract_prob(frame, targets, missingness_threshold=thre)
    # NOTE(review): the selected columns are computed but never compared
    # with anything; this test currently makes no assertion.
    frame.iloc[:, np.arange(int((thre + 0.05) * 10))].columns
def extracted_data(request):
    """Build a 20 x 5 frame plus labels and return extract_prob's output.

    ``request.param`` is a ``(key, value)`` pair. When ``key`` is 'gs',
    ``value`` becomes the label series. Otherwise ``value`` overrides the
    matching DataFrame constructor argument ('data', 'index' or 'columns')
    and default labels 'gold0'..'gold4', four of each, are used.

    Returns:
        tuple: ``(probs, df, y)`` where ``probs`` is the extract_prob result
    """
    classifier = InsilicoClassifier()

    key, value = request.param
    frame_kwargs = {
        'data': np.concatenate([np.eye(5)] * 4),
        'index': ['row{}'.format(i) for i in range(20)],
        'columns': ['col{}'.format(i) for i in range(5)],
    }

    if key == 'gs':
        labels = pd.Series(value)
    else:
        gold_ids = np.repeat(np.arange(5), 4)
        labels = pd.Series(['gold{}'.format(x) for x in gold_ids])
        frame_kwargs[key] = value

    frame = pd.DataFrame(**frame_kwargs)
    # Align the labels with the frame's row index.
    labels.index = frame.index

    return classifier.extract_prob(frame, labels), frame, labels
def test_train_drop_all_missing(df, thre, context):
    """
    The missingness threshold ``thre`` can be any number. It is interpreted
    as a proportion, so ideally it lies between zero and one. A threshold of
    zero or less means only columns with no missingness are used. A threshold
    of one or more means every training column is kept regardless of how
    much is missing.

    With a zero or negative threshold (i.e. no missingness allowed) and some
    missingness in all columns, R will drop all columns and attempt to subset
    an empty data structure.
    """
    classifier = InsilicoClassifier()
    labels = list(df['GS'].unique())

    # First row: every column after the ID is marked missing.
    df.iloc[0, 1:] = '.'

    with pandas2ri_activated(), context:
        classifier.r_insilico.extract_prob(df, 'GS', labels, thre=thre)
def test_predict_with_training_but_no_impossible_causes(
        df, params, mask, context):
    """Predict with an empirical conditional-probability table patched by mask.

    Training labels are assigned by cycling through the classifier's cause
    list. The 'cond.prob' table from R's extract_prob is reindexed to
    symptoms x causes with NaNs set to 0, the rows selected by ``mask`` are
    set to 0.5, and the values are passed to R's insilico as ``CondProbNum``.
    """
    classifier = InsilicoClassifier()
    r_base = importr('base')
    symptom_names = classifier.get_insilico_symptoms()
    cause_names = classifier.get_insilico_causes()

    test_frame = df.drop('GS', axis=1)
    train_frame = df.copy()
    # One cause per row, wrapping around the cause list as needed.
    cause_cycle = itertools.cycle(cause_names)
    train_frame['GS'] = [next(cause_cycle) for _ in df.index]

    with pandas2ri_activated():
        r_probs = classifier.r_insilico.extract_prob(
            train_frame, 'GS', cause_names, type='empirical')
        r_cond_prob = r_base.data_frame(r_probs.rx2('cond.prob'))
        table = pandas2ri.ri2py(r_cond_prob)
        table = table.loc[symptom_names, cause_names].fillna(0)
        table.loc[mask] = .5  # all non-zero (no impossible combos)
        with context:
            classifier.r_insilico.insilico(
                test_frame, CondProbNum=table.values, **params)
# Example #15
# 0
def test_environment():
    """Smoke test: fit with no data, then predict the first five samples."""
    classifier = InsilicoClassifier(auto_length=False)
    sample = classifier.get_sample_data()
    fitted = classifier.fit(None, None)
    fitted.predict(sample.iloc[:5])
def test_predict_with_thin_gt_nsim(df):
    """R's insilico must raise when ``thin`` (50) exceeds ``Nsim`` (10).

    ``df`` is requested as an argument so the call receives the DataFrame
    produced by the ``df`` fixture. Without the argument the name resolves to
    the module-level fixture function object instead of data.
    """
    clf = InsilicoClassifier()

    with pandas2ri_activated():
        with pytest.raises(RRuntimeError):
            clf.r_insilico.insilico(df, thin=50, Nsim=10)
 def test_get_short_cause_map(self):
     """get_insilico_short_causes must return a dict."""
     short_causes = InsilicoClassifier().get_insilico_short_causes()
     assert isinstance(short_causes, dict)
 def test_get_symptoms(self):
     """get_insilico_symptoms must return a list."""
     symptom_names = InsilicoClassifier().get_insilico_symptoms()
     assert isinstance(symptom_names, list)
 def test_get_causes(self):
     """get_insilico_causes must return a list."""
     cause_names = InsilicoClassifier().get_insilico_causes()
     assert isinstance(cause_names, list)
 def test_all_tied(self, values):
     """indiv_most_probable must give 'Undetermined' for the given values.

     ``values`` is wrapped in a Series indexed 'a', 'b', 'c'.
     """
     classifier = InsilicoClassifier()
     scores = pd.Series(values, index=['a', 'b', 'c'])
     assert classifier.indiv_most_probable(scores) == 'Undetermined'
 def test_get_cond_prob_num(self):
     """get_cond_prob_num must return a pandas DataFrame."""
     cond_prob = InsilicoClassifier().get_cond_prob_num()
     assert isinstance(cond_prob, pd.DataFrame)
 def test_tie_for_biggest(self, values, index, biggest):
     """indiv_most_probable must pick one of the labels in ``biggest``."""
     classifier = InsilicoClassifier()
     scores = pd.Series(values, index=index)
     chosen = classifier.indiv_most_probable(scores)
     assert chosen in biggest
def test_get_labels_map(labels, encoded):
    """get_labels_map must map each encoded value to its label.

    The expected mapping pairs ``encoded`` and ``labels`` position by
    position.
    """
    classifier = InsilicoClassifier()
    wanted = {code: label for code, label in zip(encoded, labels)}
    actual = classifier.get_labels_map(labels)
    assert wanted == actual
 def test_get_sample_data(self):
     """get_sample_data must return a pandas DataFrame."""
     sample = InsilicoClassifier().get_sample_data()
     assert isinstance(sample, pd.DataFrame)
def fit_data():
    """Small data set with one injury observation

    Takes rows 30-39 of the classifier's sample data and fits them with
    ``n_sim=1000``, ``burn_in=100`` and ``auto_length=False``.

    Returns:
        tuple: ``(df, fit)``, the subset frame and the insilico_fit result
    """
    classifier = InsilicoClassifier()
    row_positions = np.arange(30, 40)
    subset = classifier.get_sample_data().iloc[row_positions]
    fitted = classifier.insilico_fit(subset, n_sim=1000, burn_in=100,
                                     auto_length=False)
    return subset, fitted