Exemplo n.º 1
0
def train_gefs_model():
    """Train a GeFs random forest on the fraud data and print its test ROC AUC.

    Reads the module-level ``df``, preprocesses it with ``get_fraud_data``,
    fits a 30-tree ``RandomForest``, converts it to a GeF with ``topc`` and
    scores the probabilities returned by ``classify`` with ``roc_auc_score``.
    Nothing is returned; progress messages and the score are printed.
    """
    print("Preparing df for GeFs Random forest model")
    # Bind the preprocessed result to a new name. Assigning to ``df`` inside
    # this function made ``df`` a local, so passing it as the argument raised
    # UnboundLocalError before any work was done.
    data, ncat = get_fraud_data(df)  # Preprocess the df
    # ncat is the number of categories of each variable in the data
    X_train, X_test, y_train, y_test, data_train, data_test = gefs_train_test_split(
        data, ncat)
    rf = RandomForest(n_estimators=30, ncat=ncat)  # Train a Random Forest
    print('Starting the GeFs Random Forest Training')
    rf.fit(X_train, y_train)
    print('Converting Random Forest to GeF')
    gef = rf.topc()  # Convert to a GeF

    # The class variable is the last column of the preprocessed data.
    class_col = data.shape[1] - 1

    ## Classification is performed either by averaging the prediction of each tree (`classify_avg` method)
    #  or by defining a mixture over them (`classify` method).
    print('Making predictions on test df')
    y_pred_avg = gef.classify_avg(X_test, classcol=class_col)
    y_pred_mixture = gef.classify(X_test, classcol=class_col)

    _, y_prob = gef.classify(X_test,
                             classcol=class_col,
                             return_prob=True)
    # NOTE(review): the row-wise maximum is the probability of whichever class
    # won, not the probability of the positive class that roc_auc_score
    # normally expects as its score -- confirm this is intended.
    y_prob = np.max(y_prob, axis=1)
    from sklearn import metrics
    score = metrics.roc_auc_score(y_test, y_prob)
    print(f"Test score for GeFs Model: {score}")
Exemplo n.º 2
0
def train_gefs_model():
    """Fit a GeFs forest on the fraud data, print its ROC AUC, compute robustness.

    The module-level ``df`` is preprocessed with ``get_fraud_data``, split with
    ``gefs_train_test_split``, and used to fit a 30-tree ``RandomForest`` that
    is then converted to a GeF. The averaged predictions are scored with
    ``roc_auc_score`` and robustness values are computed for the test rows.
    Nothing is returned.
    """
    print("Preparing data for GeFs Random forest model")
    # get_fraud_data hands back the preprocessed data together with ncat,
    # the number of categories of each variable in the data.
    data, ncat = get_fraud_data(df)
    (X_train, X_test, y_train, y_test,
     _train_rows, _test_rows) = gefs_train_test_split(data, ncat)

    forest = RandomForest(n_estimators=30, ncat=ncat)
    print('Starting the GeFs Random Forest Training')
    forest.fit(X_train, y_train)

    print('Converting Random Forest to GeF')
    gef = forest.topc()

    # Two ways to classify: average the per-tree predictions
    # (`classify_avg`) or use a mixture over the trees (`classify`).
    print('Making predictions on test data')
    y_pred_avg = gef.classify_avg(X_test, classcol=data.shape[1] - 1)
    y_pred_mixture = gef.classify(X_test, classcol=data.shape[1] - 1)

    from sklearn import metrics
    score = metrics.roc_auc_score(y_test, y_pred_avg)
    print(f"Test score for GeFs Model: {score}")

    # Robustness values come from `compute_rob_class`, which is given the
    # root of the GeF, the test rows, the class column index and the
    # category count of the last variable.
    from gefs import compute_rob_class
    pred, rob = compute_rob_class(
        gef.root,
        X_test,
        data.shape[1] - 1,
        int(ncat[-1]),
    )
Exemplo n.º 3
0
def fit(data_pars=None, compute_pars=None, out_pars=None, **kw):
    """Fit the global GeFs model on the training split and convert it to a GeF.

    When ``model.ncat`` is unset, the per-column category counts are computed
    from the stacked train and test frames and a fresh ``RandomForest`` is
    built before fitting. The fitted forest stored in ``model.model`` is then
    replaced by the result of its ``topc()`` call.
    """
    global model, session
    session = None  # Session type for compute
    Xtrain, ytrain, Xtest, ytest = get_dataset(data_pars, task_type="train")
    if VERBOSE:
        log(Xtrain.shape, model.model)

    if model.ncat is None:
        log("#!IMPORTANT This indicates that the preprocessing pipeline was not adapted to GEFS! and we need to calculate ncat")
        # NOTE(review): "colnum" is taken to list the continuous (float)
        # columns -- confirm against the data_pars schema.
        numeric_cols = data_pars['cols_input_type'].get("colnum")

        # Stack train and test rows; within each, the target columns are
        # appended after the feature columns.
        stacked = pd.concat(
            [
                pd.concat([Xtrain, ytrain], axis=1),
                pd.concat([Xtest, ytest], axis=1),
            ],
            ignore_index=True,
            sort=False,
        )
        matrix = stacked.values
        numeric_positions = [stacked.columns.get_loc(col) for col in numeric_cols]
        model.ncat = pd_colcat_get_catcount(
            matrix,
            classcol=-1,
            continuous_ids=numeric_positions,
        )

        model.model = RandomForest(model.n_estimators, ncat=model.ncat)

    model.model.fit(Xtrain, ytrain)
    # Swap the fitted forest for its GeF conversion.
    model.model = model.model.topc()
Exemplo n.º 4
0
def fit(data_pars=None, compute_pars=None, out_pars=None, **kw):
    """Fit the global GeFs model on the training split and convert it to a GeF.

    When ``model.ncat`` is unset, category counts are computed over the
    stacked train and test frames, turned into an array, and used to build a
    new ``RandomForest``. The last column of ``Xtrain`` is dropped before
    fitting and the target is flattened to one dimension. The fitted forest
    in ``model.model`` is finally replaced by the result of ``topc()``.
    """
    global model, session
    session = None  # Session type for compute
    Xtrain, ytrain, Xtest, ytest = get_dataset(data_pars, task_type="train")
    log(Xtrain.shape, model.model)

    if model.ncat is None:
        log("#!IMPORTANT This indicates that the preprocessing pipeline was not adapted to GEFS! and we need to calculate ncat")
        # NOTE(review): "colnum" is taken to list the continuous (float)
        # columns -- confirm against the data_pars schema.
        numeric_cols = data_pars['cols_input_type'].get("colnum")

        # Stack train and test rows; within each, the target columns are
        # appended after the feature columns.
        stacked = pd.concat(
            [
                pd.concat([Xtrain, ytrain], axis=1),
                pd.concat([Xtest, ytest], axis=1),
            ],
            ignore_index=True,
            sort=False,
        )
        model.ncat = pd_colcat_get_catcount(
            stacked,
            colcat=data_pars["cols_input_type"]["colcat"],  # categorical columns
            classcol=-1,  # target column index
            continuous_ids=[stacked.columns.get_loc(col)
                            for col in numeric_cols],  # numeric column indices
        )
        ncat = np.array(list(model.ncat.values()))

        # In case of warnings make sure ncat is consistent,
        # see https://github.com/AlCorreia/GeFs/issues/6
        #
        # RandomForest constructor signature, kept for reference:
        #   __init__(self, n_estimators=100, imp_measure='gini',
        #            min_samples_split=2, min_samples_leaf=1,
        #            max_features=None, bootstrap=True, ncat=None,
        #            max_depth=1e6, surrogate=False)
        model.model = RandomForest(n_estimators=model.n_estimators, ncat=ncat)

    # Drop the last column (the original treats it as the target column).
    features = Xtrain.iloc[:, :-1]
    # The target must be one-dimensional.
    target = ytrain.values.reshape(-1)
    model.model.fit(features.values, target)

    # ncat has to be consistent here, otherwise topc() throws all kinds of
    # numba errors; see https://github.com/AlCorreia/GeFs/issues/5
    model.model = model.model.topc()  # Convert to a GeF
Exemplo n.º 5
0
    def __init__(self, model_pars=None, data_pars=None, compute_pars=None):
        """Store the parameter dicts and build the forest when ``ncat`` is known.

        Args:
            model_pars: Mapping that may hold ``'n_estimators'`` (default 100)
                and ``'ncat'``. When None, no model is built.
            data_pars: Stored unchanged on the instance.
            compute_pars: Stored unchanged on the instance.

        ``self.model`` is a ``RandomForest`` only when ``model_pars`` supplies
        ``'ncat'``; otherwise it is None.
        """
        self.model_pars = model_pars
        self.compute_pars = compute_pars
        self.data_pars = data_pars

        # Always define these attributes. Previously they were missing when
        # model_pars was None, so a later check such as ``model.ncat is None``
        # raised AttributeError instead of taking the "compute ncat" path.
        self.n_estimators = 100
        self.ncat = None
        self.model = None

        if model_pars is None:
            return

        self.n_estimators = model_pars.get('n_estimators', 100)
        # Number of categories of each variable. This is an ndarray.
        self.ncat = model_pars.get('ncat', None)
        if self.ncat is None:
            # The model can only be instantiated once ncat has been
            # calculated on the dataset, so leave self.model as None.
            log('ncat is not define')
        else:
            self.model = RandomForest(n_estimators=self.n_estimators,
                                      ncat=self.ncat)
        if VERBOSE:
            log(None, self.model)
Exemplo n.º 6
0
    def __init__(self, model_pars=None, data_pars=None, compute_pars=None):
        """Store the parameter dicts and build the forest when ``ncat`` is known.

        Args:
            model_pars: Mapping that may hold ``'n_estimators'`` (default 100)
                and ``'ncat'``. When None, no model is built.
            data_pars: Stored unchanged on the instance.
            compute_pars: Stored unchanged on the instance.

        ``self.model`` is a ``RandomForest`` only when ``model_pars`` supplies
        ``'ncat'``; otherwise it is None.
        """
        self.model_pars = model_pars
        self.compute_pars = compute_pars
        self.data_pars = data_pars

        # Always define these attributes. Previously they were missing when
        # model_pars was None, so a later check such as ``model.ncat is None``
        # raised AttributeError instead of taking the "compute ncat" path.
        self.n_estimators = 100
        self.ncat = None
        self.model = None

        if model_pars is None:
            return

        self.n_estimators = model_pars.get('n_estimators', 100)
        # Number of categories of each variable. This is an ndarray.
        self.ncat = model_pars.get('ncat', None)
        if self.ncat is None:
            # The model can only be instantiated once ncat has been
            # calculated on the dataset, so leave self.model as None.
            log('ncat is not define')
        else:
            # RandomForest constructor signature, kept for reference:
            #   __init__(self, n_estimators=100, imp_measure='gini',
            #            min_samples_split=2, min_samples_leaf=1,
            #            max_features=None, bootstrap=True, ncat=None,
            #            max_depth=1e6, surrogate=False)
            self.model = RandomForest(n_estimators=self.n_estimators,
                                      ncat=self.ncat)
        log(None, self.model)
Exemplo n.º 7
0
    data.insert(
        len(data.columns) - 1, 'is_attributed', data.pop('is_attributed'))
    data.loc[:, cat_cols] = get_dummies(data[cat_cols])
    ncat = learncats(
        data.values,
        classcol=-1,
        continuous_ids=[data.columns.get_loc(c) for c in cont_cols])
    return data.values.astype(float), ncat


# Script section: train a GeFs random forest on the fraud data, convert it to
# a GeF, fit a scikit-learn random forest on the same split for comparison,
# and classify the test rows with both GeF methods.
print("Preparing data for GeFs Random forest model")
data, ncat = get_fraud_data(df)  # Preprocess the data
# ncat is the number of categories of each variable in the data
X_train, X_test, y_train, y_test, data_train, data_test = gefs_train_test_split(
    data, ncat)
rf = RandomForest(n_estimators=30, ncat=ncat)  # Train a Random Forest
print('Starting the GeFs Random Forest Training')
rf.fit(X_train, y_train)
print('Converting Random Forest to GeF')
gef = rf.topc()  # Convert to a GeF

from sklearn.ensemble import RandomForestClassifier as rfsk
# ``rfsk`` is the estimator class itself. Calling ``rfsk.fit(X_train, y_train)``
# on the class binds X_train to ``self`` and fails with TypeError, so fit and
# predict on an instance instead.
rfsk_model = rfsk()
rfsk_model.fit(X_train, y_train)
y_pred_avg_sk = rfsk_model.predict(X_test)

## Classification is performed either by averaging the prediction of each tree (`classify_avg` method)
#  or by defining a mixture over them (`classify` method).
print('Making predictions on test data')
y_pred_avg = gef.classify_avg(X_test, classcol=data.shape[1] - 1)
y_pred_mixture = gef.classify(X_test, classcol=data.shape[1] - 1)