Python BalancedRandomForestClassifier.fit示例，imblearn.ensemble.BalancedRandomForestClassifier.fit Python示例

示例#1

0

显示文件

        def objective(trial):
            """Score one hyper-parameter sample of a balanced random forest.

            A fresh 80/20 split of ``self.X`` / ``self.y`` is drawn on every
            call, missing values are median-imputed, the hyper-parameters are
            sampled from ``trial`` and the fitted model is scored on the
            held-out 20 %.  Accuracy, a confusion matrix and the ROC AUC on
            ``self.X_validation`` / ``self.y_validation`` are printed/plotted
            as a side effect but are not returned.

            Args:
                trial: object exposing ``suggest_categorical``,
                    ``suggest_uniform`` and ``suggest_int`` (presumably an
                    Optuna trial -- confirm against the caller).

            Returns:
                ROC AUC on the internal 20 % split, computed from column 1 of
                ``predict_proba``.
            """
            train_X, val_X, train_y, val_y = train_test_split(self.X,
                                                              self.y,
                                                              test_size=0.2)

            # The medians are learned from the training split only and then
            # re-used for every other frame, so the held-out data never
            # influences the preprocessing.
            median_imputer = SimpleImputer(missing_values=np.nan,
                                           strategy='median')
            v_train_X = median_imputer.fit_transform(train_X)
            v_val_X = median_imputer.transform(val_X)
            train_X = pd.DataFrame(v_train_X,
                                   columns=train_X.columns,
                                   index=train_X.index)
            val_X = pd.DataFrame(v_val_X,
                                 columns=val_X.columns,
                                 index=val_X.index)

            v_test_X = median_imputer.transform(self.X_validation)
            test_X = pd.DataFrame(v_test_X,
                                  columns=self.X_validation.columns,
                                  index=self.X_validation.index)

            # Search space of the forest hyper-parameters.
            list_trees = [250, 500, 1000, 1500, 3000, 3500, 4000]

            brf_n_estimators = trial.suggest_categorical(
                'n_estimators', list_trees)
            brf_max_features = trial.suggest_uniform('max_features', 0.15, 1.0)
            brf_min_samples_split = trial.suggest_int('min_samples_split', 2,
                                                      16)
            brf_min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 16)
            brf_min_weight_fraction_leaf = trial.suggest_uniform(
                'min_weight_fraction_leaf', 0, 0.5)
            brf_max_depth = trial.suggest_int('max_depth', 2, 32)

            brfmodel = BalancedRandomForestClassifier(
                n_estimators=brf_n_estimators,
                max_features=brf_max_features,
                min_samples_split=brf_min_samples_split,
                min_samples_leaf=brf_min_samples_leaf,
                max_depth=brf_max_depth,
                min_weight_fraction_leaf=brf_min_weight_fraction_leaf,
                bootstrap=True)

            brfmodel.fit(train_X, train_y)

            # Column 1 of predict_proba is used as the score for the AUC.
            aucbrf = roc_auc_score(val_y, brfmodel.predict_proba(val_X)[:, 1])
            aucbrf_test = roc_auc_score(self.y_validation,
                                        brfmodel.predict_proba(test_X)[:, 1])
            print('Accuracy test ' + str(
                accuracy_score(self.y_validation, brfmodel.predict(test_X))))

            plt.figure()
            plot_confusion_matrix(brfmodel,
                                  test_X,
                                  self.y_validation,
                                  cmap=plt.cm.Blues,
                                  normalize=None)
            plt.show()
            print(aucbrf_test)

            return aucbrf

示例#2

0

显示文件

def test_balanced_random_forest_attributes(imbalanced_dataset):
    """Check that samplers_, estimators_ and pipelines_ agree with each other.

    For every fitted member, resampling through the stand-alone sampler must
    match resampling through the pipeline's ``"randomundersampler"`` step, and
    refitting the tree on the resampled data must give the same predictions
    as refitting the whole pipeline on the raw data.
    """
    X, y = imbalanced_dataset
    n_estimators = 10
    brf = BalancedRandomForestClassifier(
        n_estimators=n_estimators, random_state=0
    )
    brf.fit(X, y)

    for idx in range(n_estimators):
        sampler = brf.samplers_[idx]
        pipeline = brf.pipelines_[idx]

        X_res, y_res = sampler.fit_resample(X, y)
        pipeline_sampler = pipeline.named_steps["randomundersampler"]
        X_res_2, y_res_2 = pipeline_sampler.fit_resample(X, y)
        assert_allclose(X_res, X_res_2)
        assert_array_equal(y_res, y_res_2)

        tree = brf.estimators_[idx]
        # Both models are refitted before each comparison, first for the
        # hard predictions and then for the class probabilities.
        for method in ("predict", "predict_proba"):
            direct = getattr(tree.fit(X_res, y_res), method)(X)
            via_pipeline = getattr(pipeline.fit(X, y), method)(X)
            assert_array_equal(direct, via_pipeline)

示例#3

0

显示文件

文件： test_forest.py 项目： vishalbelsare/imbalanced-learn

def test_little_tree_with_small_max_samples():
    """A forest limited to ``max_samples=2`` must grow a smaller tree."""
    rng = np.random.RandomState(1)

    X = rng.randn(10000, 2)
    y = rng.randn(10000) > 0

    # Both single-tree forests share the same RandomState instance.
    shared_kwargs = {"n_estimators": 1, "random_state": rng}

    # No restriction on the number of samples drawn per tree.
    unrestricted = BalancedRandomForestClassifier(
        max_samples=None, **shared_kwargs
    )
    # At most two samples per tree.
    restricted = BalancedRandomForestClassifier(max_samples=2, **shared_kwargs)

    # The fit order is kept as is because the two estimators consume the
    # same random stream.
    unrestricted.fit(X, y)
    restricted.fit(X, y)

    n_nodes_unrestricted = unrestricted.estimators_[0].tree_.node_count
    n_nodes_restricted = restricted.estimators_[0].tree_.node_count

    msg = "Tree without `max_samples` restriction should have more nodes"
    assert n_nodes_unrestricted > n_nodes_restricted, msg

示例#4

0

显示文件

文件： damage_predictor.py 项目： Wauplin/car_insurance_challenge

 def _train_has_damage(cls, preprocessed_df: pd.DataFrame) -> LinearModelType:
     """Fit a default balanced random forest on the ``"has_claim"`` split.

     The frame is split with ``cls.get_X_Y_split``; only the training part of
     the split is used and the fitted classifier is returned.
     """
     split = cls.get_X_Y_split(preprocessed_df, "has_claim")
     # The test part of the split is not needed here.
     features_train, _features_test, target_train, _target_test = split
     classifier = BalancedRandomForestClassifier()
     classifier.fit(features_train, target_train)
     return classifier

示例#5

0

显示文件

文件： test_forest.py 项目： vishalbelsare/imbalanced-learn

def test_balanced_random_forest_oob(imbalanced_dataset):
    """Check the out-of-bag score and the warning for too few estimators.

    The OOB score of a 1000-tree forest must be within 0.1 of the accuracy
    on a stratified hold-out split, and fitting a single-tree forest with
    ``oob_score=True`` must emit a ``UserWarning``.
    """
    X, y = imbalanced_dataset
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        random_state=42,
                                                        stratify=y)
    est = BalancedRandomForestClassifier(
        oob_score=True,
        random_state=0,
        n_estimators=1000,
        min_samples_leaf=2,
    )

    est.fit(X_train, y_train)
    test_score = est.score(X_test, y_test)

    assert abs(test_score - est.oob_score_) < 0.1

    # Check warning if not enough estimators
    est = BalancedRandomForestClassifier(oob_score=True,
                                         random_state=0,
                                         n_estimators=1,
                                         bootstrap=True)
    # The two context managers are separated by a comma: `a and b` evaluates
    # to a single object, so only one of them would actually be entered.
    with pytest.warns(UserWarning), np.errstate(divide="ignore",
                                                invalid="ignore"):
        est.fit(X, y)

示例#6

0

显示文件

文件： pair_cha_imbl_learning.py 项目： BenjiTheC/TopCoderDataAnalysis

def main():
    """Train two forests on ten folds and dump their outputs as JSON.

    After splitting the challenges, the X and y shards 1..162 are read and
    indexed by ``['l0', 'l1']``.  For each of the ten folds a
    ``BalancedRandomForestClassifier`` is trained, and then a plain
    ``RandomForestClassifier`` on randomly under-sampled data; class
    probabilities and feature importances are written below ``RESULT_PATH``
    in the ``brf`` and ``rus`` sub-folders respectively.
    """
    print('Spliting challenges')
    split_challenges()

    index_cols = ['l0', 'l1']
    shard_ids = range(1, 163)

    print('Reading X...')
    x_shards = [pd.read_json(XY_PATH['X'].format(i), orient='records') for i in shard_ids]
    X = pd.concat(x_shards).set_index(index_cols)
    print('Reading y...')
    y_shards = [pd.read_json(XY_PATH['y'].format(i), orient='records') for i in shard_ids]
    y = pd.concat(y_shards).set_index(index_cols)

    print('\nTraining Inner sampler RFC')
    for fold in range(10):
        print(f'Training 10-Fold CV #{fold}', end='\r')
        X_train, X_test, y_train, y_test = get_train_test_Xy(X, y, fold)

        balanced_rfc = BalancedRandomForestClassifier(n_estimators=100, random_state=0)
        balanced_rfc.fit(X_train.to_numpy(), y_train.to_numpy().ravel())

        # Probabilities keep the index of the test targets so that rows can
        # be matched back after the JSON round trip.
        proba = pd.DataFrame(balanced_rfc.predict_proba(X_test.to_numpy()), index=y_test.index)
        proba.reset_index().to_json(os.path.join(RESULT_PATH, 'brf', f'y_prob_{fold}.json'), orient='records')
        importances = pd.Series(balanced_rfc.feature_importances_)
        importances.to_json(os.path.join(RESULT_PATH, 'brf', f'feature_importance_{fold}.json'))

    print('\nTraining RandomUnderSampler')
    for fold in range(10):
        print(f'Training 10-Fold CV #{fold}', end='\r')
        X_train, X_test, y_train, y_test = get_train_test_Xy(X, y, fold)

        rfc = RandomForestClassifier(n_estimators=100, random_state=0)
        rus = RandomUnderSampler(random_state=0)

        # Under-sample once up front, then train an ordinary forest.
        X_resample, y_resample = rus.fit_resample(X_train.to_numpy(), y_train.to_numpy().ravel())
        rfc.fit(X_resample, y_resample)

        proba = pd.DataFrame(rfc.predict_proba(X_test.to_numpy()), index=y_test.index)
        proba.reset_index().to_json(os.path.join(RESULT_PATH, 'rus', f'y_prob_{fold}.json'), orient='records')
        importances = pd.Series(rfc.feature_importances_)
        importances.to_json(os.path.join(RESULT_PATH, 'rus', f'feature_importance_{fold}.json'))

示例#7

0

显示文件

文件： BurdenDerek.py 项目： BurdenDerek/Academic_Success_Predictions

    def random_forest(df, drop, target, show, model_name):
        """Train a balanced random forest and return its balanced accuracy.

        Args:
            df: table holding both the features and the outcome column.
            drop: column names that must not be used as features.
            target: name of the outcome column.
            show: when equal to ``True``, the ranked feature importances are
                printed.
            model_name: label used in the printed header.

        Returns:
            Balanced accuracy on the test split, multiplied by 100.
        """
        # Features are every column that is not listed in `drop`.
        feature_cols = [col for col in df.columns if col not in drop]
        X = df[feature_cols]
        y = df[target]

        X_train, X_test, y_train, y_test = train_test_split(X,
                                                            y,
                                                            random_state=1)
        brf = BalancedRandomForestClassifier(n_estimators=100, random_state=0)
        brf.fit(X_train, y_train)
        y_predictions = brf.predict(X_test)

        # (importance, column) pairs, highest importance first.
        feature_importance = sorted(
            zip(brf.feature_importances_, X.columns.tolist()))
        feature_importance.reverse()

        acc_score = balanced_accuracy_score(y_test, y_predictions)

        # The equality test is kept as is, so only values equal to True
        # switch the report on.
        if show == True:
            print(f"Feature Importance: {model_name}")
            for entry in feature_importance:
                print(entry)
            print("\n")

        return acc_score * 100

示例#8

0

显示文件

文件： from_weka_to_python.py 项目： amir9979/Debugger

def predict_model_kfold(name,path,features_type,label_name,data):
    """Cross-validate a balanced random forest and return it fitted on all data.

    The model is evaluated with a shuffled 10-fold split; the per-fold results
    of ``create_all_eval_results`` are averaged and appended as one row to the
    module-level ``results_all_projects`` table.  Finally the model is refitted
    on the complete data set.

    Args:
        name: project name stored in the result row.
        path: not used by this function.
        features_type: feature-set label stored in the result row.
        label_name: not used by this function; the label column is always
            ``'hasBug'``.
        data: table containing the features and the ``'hasBug'`` column.

    Returns:
        The classifier fitted on all rows of ``data``.
    """
    global results_all_projects

    # `shuffle` must be passed by keyword: recent scikit-learn releases reject
    # it as a positional argument.
    kfold = KFold(n_splits=10, shuffle=True)
    #RandomForest -I 1000 -K 0 -S 1 -num-slots 1
    model = BalancedRandomForestClassifier(n_estimators=1000,max_depth=5)
    index = 0
    all_predictions = 0
    x = data.drop('hasBug', axis=1)
    y = data['hasBug']
    num_of_bugs = data.loc[data['hasBug'] == 1].shape[0]
    num_of_all_instances = data.shape[0]
    bug_precent = float(num_of_bugs) / float(num_of_all_instances)
    for train, test in kfold.split(data):
        index += 1
        prediction_train = model.fit(x.iloc[train], y.iloc[train]).predict(x.iloc[test])
        all_predictions += create_all_eval_results(False,y.iloc[test],prediction_train,name,"training",features_type,num_of_bugs,num_of_all_instances,bug_precent,None)

    # Average of the per-fold evaluation results.
    all_predictions /= index
    start_list = [name,"training",features_type,"sklearn - python"]
    result_list = start_list+ all_predictions.tolist()

    results_all_projects.loc[len(results_all_projects)] = result_list

    model.fit(x,y)
    return model

示例#9

0

显示文件

def test_balanced_random_forest(imbalanced_dataset):
    """Fitted attributes must have one entry per estimator / per feature."""
    n_estimators = 10
    n_features = imbalanced_dataset[0].shape[1]

    brf = BalancedRandomForestClassifier(n_estimators=n_estimators, random_state=0)
    brf.fit(*imbalanced_dataset)

    # One sampler, one tree and one pipeline per ensemble member.
    for attribute in ("samplers_", "estimators_", "pipelines_"):
        assert len(getattr(brf, attribute)) == n_estimators
    assert len(brf.feature_importances_) == n_features

示例#10

0

显示文件

文件： test_forest.py 项目： vishalbelsare/imbalanced-learn

def test_balanced_random_forest_pruning(imbalanced_dataset):
    """Cost-complexity pruning must shrink the first tree of the forest."""

    def first_tree_node_count(forest):
        # Size of the first fitted tree of the ensemble.
        return forest.estimators_[0].tree_.node_count

    unpruned = BalancedRandomForestClassifier()
    unpruned.fit(*imbalanced_dataset)
    n_nodes_no_pruning = first_tree_node_count(unpruned)

    pruned = BalancedRandomForestClassifier(ccp_alpha=0.015)
    pruned.fit(*imbalanced_dataset)
    n_nodes_pruning = first_tree_node_count(pruned)

    assert n_nodes_no_pruning > n_nodes_pruning

示例#11

0

显示文件

文件： test_forest.py 项目： chkoar/imbalanced-learn

def test_balanced_random_forest(imbalanced_dataset):
    """The fitted forest exposes ten samplers, trees and pipelines."""
    n_estimators = 10
    forest = BalancedRandomForestClassifier(
        n_estimators=n_estimators,
        random_state=0,
    )
    forest.fit(*imbalanced_dataset)

    fitted_collections = (
        forest.samplers_,
        forest.estimators_,
        forest.pipelines_,
    )
    for collection in fitted_collections:
        assert len(collection) == n_estimators

    # One importance value per input column.
    n_columns = imbalanced_dataset[0].shape[1]
    assert len(forest.feature_importances_) == n_columns

示例#12

0

显示文件

    def evaluate_model(self):
        """Cross-validate every stored parameter set and collect ROC AUCs.

        The parameter sets are read from
        ``<result_folder>/param_RF_<epoch>.json``.  For each set the ``'value'``
        entry is removed and the remaining keys are passed to
        ``BalancedRandomForestClassifier``.  The model is scored with 20
        repetitions of a shuffled, stratified 5-fold split of ``self.X`` /
        ``self.y``; every fold AUC is printed.

        Returns:
            The list of fold AUCs of the last parameter set in the file (the
            list is reset for each set); its mean is printed before returning.

        NOTE(review): an empty parameter file leaves ``rf_auc`` undefined and
        the final ``print`` fails -- confirm that the file is never empty.
        """
        with open(self.result_folder +
                  '/param_RF_{}.json'.format(self.epoch)) as f:
            dati = json.load(f)

            for data in dati:

                # Every remaining key is forwarded to the classifier.
                del data['value']

                rf_model = BalancedRandomForestClassifier(**data)

                rf_auc = []

                for i in tqdm(range(20)):

                    cv = StratifiedKFold(n_splits=5,
                                         shuffle=True,
                                         random_state=i + 187462)

                    for train_index, test_index in cv.split(self.X, self.y):

                        trainX = self.X.iloc[train_index]
                        testX = self.X.iloc[test_index]

                        trainy = np.take(self.y, train_index)
                        testy = np.take(self.y, test_index)

                        # The medians are learned on the training fold only
                        # and re-used for the test fold, so the test data
                        # never influences the preprocessing.
                        median_imputer = SimpleImputer(missing_values=np.nan,
                                                       strategy='median')
                        vtrainX = median_imputer.fit_transform(trainX)
                        vtestX = median_imputer.transform(testX)
                        trainX = pd.DataFrame(vtrainX,
                                              columns=trainX.columns,
                                              index=trainX.index)
                        testX = pd.DataFrame(vtestX,
                                             columns=testX.columns,
                                             index=testX.index)

                        # AUC of the balanced random forest on this fold.
                        rf_model.fit(trainX, trainy)
                        roc_rf = roc_auc_score(
                            testy,
                            rf_model.predict_proba(testX)[:, 1])
                        rf_auc.append(roc_rf)

                        print(roc_rf)

            print(statistics.mean(rf_auc))
        return rf_auc

示例#13

0

显示文件

def _plot_championship_importance(all_res, save_directory, top=6):
    """Plot feature importances of a champion-vs-rest classifier.

    Nothing is done when ``championship_importance.png`` already exists in
    ``save_directory``.  Otherwise one sample is built per team and season
    from the ``off_norm`` / ``def_norm`` values of the ``top`` rows with the
    largest ``'TIME'``; the label is 1 when the team is in that season's
    champion entry.  For several forest sizes, each champion sample is left
    out in turn, a balanced random forest is fitted on the rest and its
    feature importances are collected.  The importances are drawn as one box
    per feature and saved.

    Args:
        all_res: mapping ``season -> (team_df, team_stats, champion)``.
        save_directory: prefix that is concatenated with the file name.
        top: number of rows kept per team.
    """
    save_file = save_directory + 'championship_importance.png'
    if os.path.exists(save_file):
        return

    xs, ys, teams = [], [], []
    for season in all_res:
        season_res = all_res[season]
        team_df = season_res[0]
        team_stats = season_res[1]
        champion = season_res[2]

        for team, rows in team_df.groupby('TEAM'):
            top_rows = rows.nlargest(top, 'TIME')
            xs.append(top_rows[['off_norm', 'def_norm']].unstack().values)
            ys.append(int(team in champion))
            teams.append(team + '_' + season)

    xs = np.vstack(xs)
    ys = np.array(ys)

    n_rows = len(xs)
    champion_rows = np.where(ys == 1)[0]

    fts = []
    for ntree in tqdm([50, 75, 100, 125, 150, 175, 200]):
        for held_out in champion_rows:
            # Leave exactly one champion sample out of the training data.
            keep = [row for row in range(n_rows) if row != held_out]

            rfr = BalancedRandomForestClassifier(n_estimators=ntree)
            rfr.fit(xs[keep], ys[keep])
            fts.append(rfr.feature_importances_)

    fts = np.vstack(fts)

    feature_names = [
        side + str(rank + 1) for side in ('off', 'def') for rank in range(top)
    ]

    fig, ax = plt.subplots(figsize=(8,6))
    for position, _name in enumerate(feature_names):
        ax.boxplot(fts[:, position], positions=[position])
    ax.set_xticklabels(feature_names)
    ax.set_ylabel('Feature Importance', labelpad=10)
    ax.set_title('Championship Feature Importance')

    plt.savefig(save_file)
    plt.close()

示例#14

0

显示文件

文件： test_forest.py 项目： vishalbelsare/imbalanced-learn

def test_balanced_random_forest_oob_binomial(ratio):
    """Regression test for #655: the OOB score stays close to 0.5.

    The targets are drawn from a binomial experiment while the single feature
    is just the sample index.
    """
    n_samples = 1000
    rng = np.random.RandomState(42)
    X = np.arange(n_samples).reshape(-1, 1)
    y = rng.binomial(1, ratio, size=n_samples)

    forest = BalancedRandomForestClassifier(oob_score=True, random_state=42)
    forest.fit(X, y)

    distance_from_chance = np.abs(forest.oob_score_ - 0.5)
    assert distance_from_chance < 0.1

示例#15

0

显示文件

    def evaluate_on_validation_or_test(self, test=False):
        """Print the ROC AUC of every stored parameter set.

        Parameter sets are read from
        ``<result_folder>/param_RF_<epoch>.json``; the ``'value'`` entry is
        dropped and the remaining keys configure the classifier.  Missing
        values are median-imputed with an imputer fitted on ``self.X``.

        Args:
            test: when equal to ``True`` the model is trained on the training
                and validation data together and scored on ``self.X_test``;
                otherwise it is trained on ``self.X`` and scored on
                ``self.X_validation``.
        """

        def as_frame(values, like):
            # Re-attach the column names and index of `like` to the imputed
            # array.
            return pd.DataFrame(values, columns=like.columns, index=like.index)

        # The comparisons are evaluated once and keep the `==` semantics.
        score_on_test = test == True
        report_as_validation = test == False

        params_path = self.result_folder + '/param_RF_{}.json'.format(
            self.epoch)
        with open(params_path) as f:
            dati = json.load(f)

        for data in dati:
            del data['value']
            rf_model = BalancedRandomForestClassifier(**data)

            trainX, trainy = self.X, self.y
            valx, valy = self.X_validation, self.y_validation
            if score_on_test:
                testx, testy = self.X_test, self.y_test

            # The medians come from the training data only.
            imputer = SimpleImputer(missing_values=np.NaN,
                                    strategy='median').fit(trainX)
            trainX = as_frame(imputer.transform(trainX), trainX)
            valx = as_frame(imputer.transform(valx), valx)

            if score_on_test:
                testx = as_frame(imputer.transform(testx), testx)
                # The validation rows join the training data for the final
                # model.
                trainX = pd.concat([trainX, valx])
                trainy = np.concatenate((trainy, valy))

            rf_model.fit(trainX, trainy)

            if score_on_test:
                eval_x, eval_y = testx, testy
            else:
                eval_x, eval_y = valx, valy
            roc_rf = roc_auc_score(eval_y,
                                   rf_model.predict_proba(eval_x)[:, 1])

            if report_as_validation:
                print("Validation AUC: {}".format(str(roc_rf)))
            else:
                print("Test AUC: {}".format(str(roc_rf)))

示例#16

0

显示文件

文件： ml_truthfulness_undersampled_1.1.py 项目： smithkakar/kcl-ms-ai

    def run_best_estimator(self, train_x, train_y, test_x, test_y, estimator,
                           params, clf_type, question):
        """Fit the selected ensemble and score it on the test data.

        Args:
            train_x, train_y: training features and labels.
            test_x, test_y: test features and labels.
            estimator: one of ``'BalancedRandomForestClassifier'``,
                ``'BalancedBaggingClassifier'`` or ``'EasyEnsembleClassifier'``.
            params: hyper-parameters; ``'n_estimators'`` and
                ``'sampling_strategy'`` are always read, ``'bootstrap'`` and
                ``'max_samples'`` only for the bagging classifier.
            clf_type: forwarded to ``self.calc_cross_val_scores``.
            question: stored in the result and forwarded to
                ``self.calc_cross_val_scores``.

        Returns:
            ``(cross_val_scores, estimator_scores)`` where the second item is
            a dict of test metrics.  Accuracy, balanced accuracy, precision,
            recall and specificity are percentages; F1 and ROC AUC are
            fractions.

        Raises:
            ValueError: if ``estimator`` is not one of the supported names.
        """
        estimator_scores = {}

        if estimator == 'BalancedRandomForestClassifier':
            clf = BalancedRandomForestClassifier(
                n_estimators=params['n_estimators'],
                sampling_strategy=params['sampling_strategy'],
                random_state=42)
        elif estimator == 'BalancedBaggingClassifier':
            clf = BalancedBaggingClassifier(
                n_estimators=params['n_estimators'],
                bootstrap=params['bootstrap'],
                max_samples=params['max_samples'],
                sampling_strategy=params['sampling_strategy'],
                random_state=42)
        elif estimator == 'EasyEnsembleClassifier':
            clf = EasyEnsembleClassifier(
                n_estimators=params['n_estimators'],
                sampling_strategy=params['sampling_strategy'],
                random_state=42)
        else:
            # Fail with a clear message instead of hitting an unbound `clf`.
            raise ValueError(
                'Unsupported estimator: {!r}'.format(estimator))

        clf.fit(train_x, train_y)
        cross_val_scores = self.calc_cross_val_scores(clf, train_x, train_y,
                                                      clf_type, question)

        predicted_labels = clf.predict(test_x)

        # Unpacking into four values requires a 2x2 confusion matrix.
        tn, fp, fn, tp = confusion_matrix(test_y, predicted_labels).ravel()
        specificity = round((tn / (tn + fp)) * 100, 2)

        # The second probability column is used as the score for the AUC.
        predicted_prob = clf.predict_proba(test_x)
        predicted_prob_true = [p[1] for p in predicted_prob]

        estimator_scores['Question'] = question
        estimator_scores['Accuracy'] = round(
            accuracy_score(test_y, predicted_labels) * 100, 2)
        estimator_scores['Balanced Accuracy'] = round(
            balanced_accuracy_score(test_y, predicted_labels) * 100, 2)
        estimator_scores['Precision'] = round(
            precision_score(test_y, predicted_labels) * 100, 2)
        estimator_scores['Recall'] = round(
            recall_score(test_y, predicted_labels) * 100, 2)
        estimator_scores['Specificity'] = specificity
        estimator_scores['F1'] = round(f1_score(test_y, predicted_labels), 2)
        estimator_scores['ROC AUC'] = round(
            roc_auc_score(test_y, predicted_prob_true), 2)

        return cross_val_scores, estimator_scores

示例#17

0

显示文件

class BaselineRandomForest(BaseClassifier):
    """Random-forest baseline with feature preprocessing and persistence.

    The wrapped ``RandomForestClassifier`` is trained on preprocessed,
    de-duplicated samples; the column order seen during training is stored in
    ``feature_list`` and re-applied at prediction time.
    """

    def __init__(self):
        # 'sqrt' is what 'auto' meant for classifiers; 'auto' itself is no
        # longer accepted by current scikit-learn releases.
        self.random_forest_classifier = RandomForestClassifier(
            n_estimators=500,
            max_features='sqrt',
            max_depth=None,
            n_jobs=1,
            class_weight=None,
            criterion='entropy',
            min_samples_split=2,
            min_samples_leaf=1)
        self.feature_preprocessor = FeaturePreprocessor()
        # Column order of the training samples; set by fit() or load_model().
        self.feature_list = None
        self.model_filename = 'baseline_rf.pkl'

    def fit(self, samples: pd.DataFrame, labels: pd.DataFrame):
        """Preprocess ``samples`` and train the forest on the matching labels.

        The labels are taken from the ``'classALeRCE'`` column of ``labels``,
        aligned on the index of the preprocessed samples.
        """
        samples = self.feature_preprocessor.preprocess_features(samples)
        samples = self.feature_preprocessor.remove_duplicates(samples)

        # intersect samples and labels
        samples, labels = intersect_oids_in_dataframes(samples, labels)

        self.feature_list = samples.columns
        samples_np_array = samples.values
        labels_np_array = labels['classALeRCE'].loc[samples.index].values
        self.random_forest_classifier.fit(samples_np_array, labels_np_array)

    def predict_proba(self, samples: pd.DataFrame) -> pd.DataFrame:
        """Return class probabilities, one row per sample, one column per class.

        The result is indexed by the sample index and the index is named
        ``'oid'``.
        """
        samples = self.feature_preprocessor.preprocess_features(samples)
        # Select the columns in the order used during training.
        samples_np_array = samples[self.feature_list].values
        predicted_probs = self.random_forest_classifier.predict_proba(
            samples_np_array)
        predicted_probs_df = pd.DataFrame(predicted_probs,
                                          columns=self.get_list_of_classes(),
                                          index=samples.index.values)
        predicted_probs_df.index.name = 'oid'
        return predicted_probs_df

    def get_list_of_classes(self) -> list:
        """Return the ``classes_`` attribute of the fitted forest."""
        return self.random_forest_classifier.classes_

    def save_model(self, directory: str) -> None:
        """Pickle the forest and the feature list into ``directory``."""
        with open(os.path.join(directory, self.model_filename), 'wb') as f:
            pickle.dump(self.random_forest_classifier, f,
                        pickle.HIGHEST_PROTOCOL)
        with open(os.path.join(directory, 'feature_list.pkl'), 'wb') as f:
            pickle.dump(self.feature_list, f, pickle.HIGHEST_PROTOCOL)

    def load_model(self, directory: str) -> None:
        """Restore the forest and the feature list written by save_model().

        NOTE: unpickling executes arbitrary code; only load files from a
        trusted ``directory``.
        """
        rf = pd.read_pickle(os.path.join(directory, self.model_filename))
        self.random_forest_classifier = rf
        self.feature_list = pd.read_pickle(
            os.path.join(directory, 'feature_list.pkl'))

示例#18

0

显示文件

文件： test_forest.py 项目： matfonseca/TP2DATOS

def test_balanced_random_forest_oob(imbalanced_dataset):
    """Check the out-of-bag score and the warning for too few estimators.

    The forest is fitted on the first half of the data; its OOB score must
    be within 0.1 of the accuracy on the second half.  Fitting a single-tree
    forest with ``oob_score=True`` must emit a ``UserWarning``.
    """
    X, y = imbalanced_dataset
    est = BalancedRandomForestClassifier(oob_score=True, random_state=0)

    n_samples = X.shape[0]
    est.fit(X[:n_samples // 2, :], y[:n_samples // 2])
    test_score = est.score(X[n_samples // 2:, :], y[n_samples // 2:])

    assert abs(test_score - est.oob_score_) < 0.1

    # Check warning if not enough estimators
    est = BalancedRandomForestClassifier(oob_score=True, random_state=0,
                                         n_estimators=1, bootstrap=True)
    # The two context managers are separated by a comma: `a and b` evaluates
    # to a single object, so only one of them would actually be entered.
    with pytest.warns(UserWarning), np.errstate(divide="ignore",
                                                invalid="ignore"):
        est.fit(X, y)

示例#19

0

显示文件

文件： IrregularDatasets.py 项目： mblaszczyk97/Irregular-Datasets

def random_forest(X_train, y_train, X_test, y_test, X_train_res, y_train_res):
    """Compare three random-forest variants by their recall on class 1.

    A plain forest on the raw training data, a plain forest on the resampled
    training data and a balanced random forest on the raw training data are
    fitted and scored on the same test set.  The scores, the feature
    importances of the balanced forest and two bar charts are printed/shown.

    Args:
        X_train, y_train: raw training data; ``y_train`` must expose
            ``.values``.
        X_test, y_test: test data.
        X_train_res, y_train_res: resampled training data; ``y_train_res``
            must expose ``.ravel()``.

    Returns:
        ``(without, within)``: the score of the plain forest on raw data and
        the score of the balanced forest, both in percent.
    """

    def class_one_recall_percent(cnf_matrix):
        # Row 1 of the confusion matrix holds the samples whose true label is
        # the second class; [1, 1] are the correctly predicted ones.
        return 100*cnf_matrix[1,1]/(cnf_matrix[1,0]+cnf_matrix[1,1])

    rf = RandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1)
    rf.fit(X_train, y_train.values.ravel())
    y_train_rf = rf.predict(X_test)
    cnf_matrix_tra = confusion_matrix(y_test, y_train_rf)
    without = class_one_recall_percent(cnf_matrix_tra)
    print("Random Forest (niezbalansowany): {}%".format(without))
    print(cnf_matrix_tra[0,0],cnf_matrix_tra[1,1])

    rf_oversampling = RandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1)
    rf_oversampling.fit(X_train_res, y_train_res.ravel())
    y_train_rf = rf_oversampling.predict(X_test)
    cnf_matrix_tra = confusion_matrix(y_test, y_train_rf)
    with_oversampling = class_one_recall_percent(cnf_matrix_tra)
    # Report the score of the oversampled model, not the one of the first
    # model.
    print("Random Forest (z oversamplingiem): {}%".format(with_oversampling))
    print(cnf_matrix_tra[0,0],cnf_matrix_tra[1,1])

    brf = BalancedRandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1)
    brf.fit(X_train, y_train.values.ravel())
    y_train_brf = brf.predict(X_test)
    cnf_matrix_tra = confusion_matrix(y_test, y_train_brf)
    within = class_one_recall_percent(cnf_matrix_tra)
    print("Random Forest (zbalansowany - undersampling): {}%".format(within))
    print(cnf_matrix_tra[0,0],cnf_matrix_tra[1,1])
    print(brf.feature_importances_)

    # Bar chart of the balanced forest's feature importances; the five
    # hard-coded labels assume exactly five input features.
    objects = ('country','gender', 'age', 'visiting Wuhan', 'from Wuhan')
    y_pos = np.arange(len(objects))
    performance = brf.feature_importances_*100
    plt.bar(y_pos, performance, align='center', alpha=0.5)
    plt.xticks(y_pos, objects)
    plt.ylabel('Procent zależności')
    plt.title('Zależność poszczególnych atrybutów')
    plt.show()

    # Bar chart comparing the three scores.
    objects = ('Random Forest niezbalansowany','Random Forest z oversamplingiem', 'Random Forest zbalansowany')
    y_pos = np.arange(len(objects))
    performance = [without, with_oversampling, within]
    plt.bar(y_pos, performance, align='center', alpha=0.5)
    plt.xticks(y_pos, objects)
    plt.ylabel('Procent dokładności')
    plt.title('Dokładność Random Forest')
    plt.show()

    return without, within

示例#20

0

显示文件

文件： multi-class.py 项目： uaauaguga/cfRNA-analysis

def evaluate(X_train, y_train, X_test, y_test):
    """Return confusion matrices for the most and second most likely class.

    A balanced random forest (500 trees, seeded with the module-level
    ``seed``) is fitted on the training data.  For every test sample the
    classes are ranked by predicted probability.

    Returns:
        ``(cm_top1, cm_top2)``: confusion matrices of ``y_test`` against the
        most probable and the second most probable class.
    """
    global seed
    clf = BalancedRandomForestClassifier(n_estimators=500, random_state=seed)
    clf = clf.fit(X_train, y_train)
    # argsort yields column positions of predict_proba; they are mapped back
    # to the actual labels through classes_, so the result is also correct
    # when the labels are not the integers 0..k-1.
    ranked_columns = clf.predict_proba(X_test).argsort(axis=1)
    y_pred1 = clf.classes_[ranked_columns[:, -1]]
    y_pred2 = clf.classes_[ranked_columns[:, -2]]
    return metrics.confusion_matrix(y_test, y_pred1), metrics.confusion_matrix(
        y_test, y_pred2)

示例#21

0

显示文件

def balanced_random_forest(train_features,
                           train_labels,
                           test_features,
                           feature_list=None,
                           hfo_type_name=None):
    """Fit a balanced random forest and predict the test features.

    Args:
        train_features, train_labels: training data.
        test_features: samples to predict.
        feature_list: currently unused.
        hfo_type_name: currently unused.

    Returns:
        ``(predictions, probabilities, model)`` where ``probabilities`` is
        column 1 of ``predict_proba`` and ``model`` is the fitted classifier.
    """
    # n_jobs=-1 uses all available processors.
    model = BalancedRandomForestClassifier(random_state=32, n_jobs=-1)
    model.fit(train_features, train_labels)

    predictions = model.predict(test_features)
    class_probabilities = model.predict_proba(test_features)
    second_column_probs = class_probabilities[:, 1]

    # Feature-importance reporting is disabled in this function.
    return predictions, second_column_probs, model

示例#22

0

显示文件

文件： apply_classifier_model.py 项目： popkdodge/Submission

def apply_balanced_RF_classifier(X_train, y_train, model_path):
    '''
    Train a 50-tree balanced random forest and persist it.

    Args:
        X_train: features used for training
        y_train: labels, one per row of X_train
        model_path: destination passed to pickle_models together with the
            fitted model

    Output:
        the fitted balanced random forest model
    '''
    forest_options = dict(n_estimators=50, random_state=0, n_jobs=-1)
    BRF_model = BalancedRandomForestClassifier(**forest_options)
    BRF_model.fit(X_train, y_train)

    # Persist the fitted model before handing it back to the caller.
    pickle_models(BRF_model, model_path)
    return BRF_model

示例#23

0

显示文件

文件： test_forest.py 项目： chkoar/imbalanced-learn

def test_balanced_random_forest_error_warning_warm_start(imbalanced_dataset):
    """Check the error and the warning raised by warm-start refits.

    Shrinking ``n_estimators`` under ``warm_start=True`` must raise a
    ``ValueError``; refitting without adding estimators must emit a
    ``UserWarning``.
    """
    brf = BalancedRandomForestClassifier(n_estimators=5)
    brf.fit(*imbalanced_dataset)

    # `match` checks the exception text; the former `message` keyword never
    # did that and is rejected by current pytest releases.
    with pytest.raises(ValueError, match="must be larger or equal to"):
        brf.set_params(warm_start=True, n_estimators=2)
        brf.fit(*imbalanced_dataset)

    brf.set_params(n_estimators=10)
    brf.fit(*imbalanced_dataset)

    with pytest.warns(UserWarning, match="Warm-start fitting without"):
        brf.fit(*imbalanced_dataset)

示例#24

0

显示文件

文件： test_forest.py 项目： vishalbelsare/imbalanced-learn

def test_balanced_random_forest_error_warning_warm_start(imbalanced_dataset):
    """Warm-start refits: fewer estimators fail, no new estimators warn."""
    fit_args = imbalanced_dataset
    forest = BalancedRandomForestClassifier(n_estimators=5)
    forest.fit(*fit_args)

    # Asking for fewer trees than already fitted is an error.
    with pytest.raises(ValueError, match="must be larger or equal to"):
        forest.set_params(warm_start=True, n_estimators=2)
        forest.fit(*fit_args)

    # Growing the ensemble is allowed.
    forest.set_params(n_estimators=10)
    forest.fit(*fit_args)

    # Refitting with an unchanged number of trees only warns.
    with pytest.warns(UserWarning, match="Warm-start fitting without"):
        forest.fit(*fit_args)

示例#25

0

显示文件

文件： balanced_random_forest.py 项目： thomas-young-2013/soln-ml

    def fit(self, X, Y, sample_weight=None):
        """Build and fit the balanced random forest from the stored settings.

        Args:
            X: training features.
            Y: training labels.
            sample_weight: optional per-sample weights, forwarded to the
                underlying estimator's ``fit``.

        Returns:
            ``self``, with the fitted model stored in ``self.estimator``.
        """
        # Imported lazily so the dependency is only needed when fitting.
        from imblearn.ensemble import BalancedRandomForestClassifier
        estimator = BalancedRandomForestClassifier(
            n_estimators=self.n_estimators,
            criterion=self.criterion,
            max_features=self.max_features,
            min_samples_split=self.min_samples_split,
            min_samples_leaf=self.min_samples_leaf,
            min_weight_fraction_leaf=self.min_weight_fraction_leaf,
            bootstrap=self.bootstrap,
            min_impurity_decrease=self.min_impurity_decrease,
            random_state=self.random_state,
            n_jobs=self.n_jobs,
            class_weight=self.class_weight,
            sampling_strategy=self.sampling_strategy,
            replacement=self.replacement)

        # The weights were previously accepted but silently ignored.
        estimator.fit(X, Y, sample_weight=sample_weight)

        self.estimator = estimator
        return self

示例#26

0

显示文件

文件： test_forest.py 项目： chkoar/imbalanced-learn

def test_balanced_random_forest_attributes(imbalanced_dataset):
    """Each fitted pipeline must agree with its sampler/estimator pair.

    For every ensemble member, the sampler stored in ``samplers_`` and the
    ``'randomundersampler'`` step of the matching pipeline must resample
    identically, and refitting the bare estimator on the resampled data must
    give the same predictions as refitting the whole pipeline.
    """
    X, y = imbalanced_dataset
    n_estimators = 10
    brf = BalancedRandomForestClassifier(n_estimators=n_estimators,
                                         random_state=0)
    brf.fit(X, y)

    for idx in range(n_estimators):
        sampler = brf.samplers_[idx]
        pipeline = brf.pipelines_[idx]
        tree = brf.estimators_[idx]

        X_res, y_res = sampler.fit_resample(X, y)
        pipeline_sampler = pipeline.named_steps['randomundersampler']
        X_res_2, y_res_2 = pipeline_sampler.fit_resample(X, y)
        assert_allclose(X_res, X_res_2)
        assert_array_equal(y_res, y_res_2)

        # Hard labels first, then class probabilities; both are refit each
        # time, exactly as in a hand-written sequence of calls.
        for method in ('predict', 'predict_proba'):
            from_tree = getattr(tree.fit(X_res, y_res), method)(X)
            from_pipeline = getattr(pipeline.fit(X, y), method)(X)
            assert_array_equal(from_tree, from_pipeline)

示例#27

0

显示文件

文件： RF.py 项目： frnbs/Covid_Classific

        def objective(trial):
            """Score one hyper-parameter draw of a balanced random forest.

            Hyper-parameters are requested from ``trial``, the model is
            trained on ``self.df_train_media`` and the ROC AUC on
            ``self.df_validation_media`` is returned. The ROC AUC on
            ``self.df_test_media`` is printed for information only.
            """
            label_column = '41'

            def features_and_labels(frame):
                # Everything except the label column is a feature.
                is_feature = frame.columns != label_column
                return frame.loc[:, is_feature].values, frame[label_column].values

            train_X, train_y = features_and_labels(self.df_train_media)
            val_X, val_y = features_and_labels(self.df_validation_media)
            test_X, test_y = features_and_labels(self.df_test_media)

            list_trees = [250, 500, 1000, 1500, 3000, 3500, 4000]

            # The order of the suggestions is kept stable on purpose.
            n_estimators = trial.suggest_categorical('n_estimators',
                                                     list_trees)
            max_features = trial.suggest_uniform('max_features', 0.15, 1.0)
            min_samples_split = trial.suggest_int('min_samples_split', 2, 16)
            min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 16)
            min_weight_fraction_leaf = trial.suggest_uniform(
                'min_weight_fraction_leaf', 0, 0.5)
            max_depth = trial.suggest_int('max_depth', 2, 32)

            brfmodel = BalancedRandomForestClassifier(
                n_estimators=n_estimators,
                max_features=max_features,
                min_samples_split=min_samples_split,
                min_samples_leaf=min_samples_leaf,
                max_depth=max_depth,
                min_weight_fraction_leaf=min_weight_fraction_leaf,
                bootstrap=True)
            brfmodel.fit(train_X, train_y)

            # Column 1 of ``predict_proba`` is used as the score for both sets.
            val_scores = brfmodel.predict_proba(val_X)[:, 1]
            aucbrf = roc_auc_score(val_y, val_scores)

            test_scores = brfmodel.predict_proba(test_X)[:, 1]
            print("Test AUC: " + str(roc_auc_score(test_y, test_scores)))

            return aucbrf

示例#28

0

显示文件

def train_model(data):
    """Train five classifiers on the loan data and report each one's results.

    ``data`` is one-hot encoded on ``Employment.Type``, ``Driving_flag`` and
    ``Bureau_bin`` (first level dropped), split 80/20 with stratification on
    ``loan_default``, and used to fit every model. Each fitted model is then
    passed to ``predict_evaluate_classifier`` together with the held-out
    split.

    Returns
    -------
    tuple
        The fitted ``(rfc, lr, xgb, brfc, bbc)`` models, in that order.
    """
    target = 'loan_default'
    encoded = pd.get_dummies(
        data,
        columns=['Employment.Type', 'Driving_flag', 'Bureau_bin'],
        drop_first=True)
    X, y = encoded.drop(target, axis=1), encoded[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=0, stratify=y)

    # (display name, estimator) pairs; the order drives fitting, reporting
    # and the returned tuple.
    classifiers = [
        ('RandomForestClassifier',
         RandomForestClassifier(class_weight='balanced', n_estimators=100)),
        ('LogisticRegression',
         LogisticRegression(class_weight='balanced')),
        ('XGBClassifier',
         XGBClassifier(scale_pos_weight=3.4)),
        ('BalancedRandomForestClassifier',
         BalancedRandomForestClassifier(max_depth=4, random_state=0)),
        ('BalancedBaggingClassifier',
         BalancedBaggingClassifier(n_estimators=100, random_state=42)),
    ]

    for _, classifier in classifiers:
        classifier.fit(X_train, y_train)

    for label, classifier in classifiers:
        print('Classifier: ' + label)
        predict_evaluate_classifier(X_test, y_test, classifier)

    rfc, lr, xgb, brfc, bbc = (classifier for _, classifier in classifiers)
    return rfc, lr, xgb, brfc, bbc

示例#29

0

显示文件

文件： test_forest.py 项目： vishalbelsare/imbalanced-learn

def test_balanced_random_forest_error(imbalanced_dataset, forest_params,
                                      err_msg):
    """Fitting a forest built from ``forest_params`` must raise ``ValueError``.

    The error text has to match ``err_msg``.
    """
    classifier = BalancedRandomForestClassifier(**forest_params)
    expected_failure = pytest.raises(ValueError, match=err_msg)
    with expected_failure:
        classifier.fit(*imbalanced_dataset)

示例#30

0

显示文件

文件： plot_comparison_ensemble_classifier.py 项目： chkoar/imbalanced-learn

plot_confusion_matrix(cm_balanced_bagging, classes=np.unique(satimage.target),
                      ax=ax[1], title='Balanced bagging')

###############################################################################
# Classification using random forest classifier with and without sampling
###############################################################################
# Random forest is another popular ensemble method, and it usually
# outperforms bagging. Here, we use a vanilla random forest and its balanced
# counterpart, in which each bootstrap sample is balanced.

# Vanilla random forest: fit, then predict on the held-out split.
rf = RandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Balanced counterpart, configured with the same settings.
brf = BalancedRandomForestClassifier(
    n_estimators=50, random_state=0, n_jobs=-1)
brf.fit(X_train, y_train)
y_pred_brf = brf.predict(X_test)

# As in the previous experiment, the balanced classifier outperforms the
# classifier that learns from imbalanced bootstrap samples. In addition, the
# random forest outperforms the bagging classifier.

print('Random Forest classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'.format(
    balanced_accuracy_score(y_test, y_pred_rf),
    geometric_mean_score(y_test, y_pred_rf)))
cm_rf = confusion_matrix(y_test, y_pred_rf)
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(cm_rf, classes=np.unique(satimage.target), ax=ax[0],

示例#31

0

显示文件

    else:
        finite_idx = np.where(np.isfinite(column))[0]
    x = vectors[finite_idx, :]
    y = column[finite_idx]
    if y.sum() == 0 or y.sum() == len(y):
        print("%15s: undefined" % (name))
        continue
    train_x, test_x, train_y, test_y = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        stratify=y)

    if args.brf:
        rf = BalancedRandomForestClassifier(n_estimators=100, n_jobs=4)
    else:
        rf = RandomForestClassifier(n_estimators=100, n_jobs=4)

    rf.fit(train_x, train_y)
    p_te = rf.predict_proba(test_x)
    auc_te = roc_auc_score(test_y, p_te[:, 1])
    bacc = balanced_accuracy_score(test_y, p_te[:, 1].round(0))
    print("%15s: %3.5f %3.5f" % (name, auc_te, bacc))
    bacc_av += bacc
    auc_av += auc_te

    if not (args.save is None):
        gzpickle(args.save + '_%i.pkz' % i, rf)

print('Averages:')
print('AUC: %8.3f   BAcc: %8.3f' % (auc_av / (i + 1), bacc_av / (i + 1)))

示例#32

0

显示文件

文件： test_forest.py 项目： vishalbelsare/imbalanced-learn

def test_balanced_random_forest_sample_weight(imbalanced_dataset):
    """Smoke test: fitting with per-sample weights must not fail."""
    X, y = imbalanced_dataset
    n_samples = y.shape[0]
    # Seeded generator so the weights are reproducible.
    sample_weight = np.random.RandomState(42).rand(n_samples)

    brf = BalancedRandomForestClassifier(n_estimators=5, random_state=0)
    brf.fit(X, y, sample_weight)

示例#33

0

显示文件

# %% [markdown]
''' 
## Train a random forest classifier

*Note: this may take a while*
'''

# %%
# Build and train the balanced forest in one expression; ``fit`` returns the
# fitted estimator itself.
clf = BalancedRandomForestClassifier(
    n_estimators=2000,
    replacement=True,
    sampling_strategy='not minority',
    n_jobs=4,
    random_state=42,
    verbose=1,
).fit(X_train, Y_train)

# Report on the held-out data.
Y_test_pred = clf.predict(X_test)
print('\nClassifier performance')
print(
    'Out of sample:\n',
    metrics.classification_report(Y_test, Y_test_pred, zero_division=0),
)

# %% [markdown]
'''
## Robustness to unforeseen scenarios

What if the subjects in the test set wore the device differently from
those in the training set? For example, suppose that all the subjects in the
training set were right-handed, but the test subjects are left-handed.
This would more or less result in the device being rotated.

示例#34

0

显示文件

文件： test_forest.py 项目： chkoar/imbalanced-learn

def test_balanced_random_forest_sample_weight(imbalanced_dataset):
    """Smoke test: a balanced forest accepts one random weight per sample."""
    X, y = imbalanced_dataset
    weight_rng = np.random.RandomState(42)
    sample_weight = weight_rng.rand(y.shape[0])

    forest = BalancedRandomForestClassifier(n_estimators=5, random_state=0)
    forest.fit(X, y, sample_weight)

示例#35

0

显示文件

文件： test_forest.py 项目： chkoar/imbalanced-learn

def test_balanced_random_forest_error(imbalanced_dataset, forest_params,
                                      err_msg):
    """Fitting a forest built from ``forest_params`` must raise ``ValueError``.

    The raised error's text is checked against ``err_msg``.
    """
    brf = BalancedRandomForestClassifier(**forest_params)
    # ``match`` checks the error text; the old ``message`` keyword only set
    # the failure message and has been removed from pytest.
    with pytest.raises(ValueError, match=err_msg):
        brf.fit(*imbalanced_dataset)

示例#36

0

显示文件

文件： funcionesInvestments.py 项目： jluza/investments

def Clasificar(database, new, path):
    """Score new companies with a classifier trained on the labelled database.

    A random 40% sample of ``database`` is used for training. The ``Category``
    and ``Description`` texts are lower-cased, stripped of punctuation and
    English stop words, converted to bag-of-words counts and fed to a
    ``BalancedRandomForestClassifier`` that separates rows whose
    ``Category.1`` is ``'Rejected'`` (label 0) from all other rows (label 1).
    Training-set metrics are printed, then the rows of ``new`` are scored.

    Parameters
    ----------
    database : pandas.DataFrame
        Labelled history. Needs the columns ``Operation``, ``Investee``,
        ``Category``, ``Description``, ``Category.1`` and ``Area of Focus``.
        The caller's frame is not modified.
    new : pandas.DataFrame
        Companies to score. After ``Categories`` and ``Organization Name``
        are renamed to ``Category`` and ``Investee``, it needs the columns
        ``Transaction Name``, ``Investee``, ``Category`` and ``Description``.
        The caller's frame is not modified.
    path
        Unused; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        ``new`` with renamed columns plus ``Prediction``,
        ``Prob. of being rejected`` and ``Prob. of being of interest``.

    Notes
    -----
    The training sample is drawn without a seed, so results vary from run to
    run. The pandas chained-assignment option and the warnings filter set
    below are process-wide and remain in effect after the call.
    """
    pd.options.mode.chained_assignment = None
    if 'Response by Category' in database.columns:
        database = database.drop(
            ['Response by Category', 'Response by Description'], axis=1)
    database = database.sample(frac=0.4, replace=False)

    # Normalise known misspellings of the category labels.
    database["Category.1"] = database["Category.1"].replace(
        {"rejected": "Rejected", "B2C ": "B2C", "FIntech": "Fintech"})

    # Discard any stale prediction column without touching the caller's frame.
    new = new.drop(columns=['Prediction'], errors='ignore')

    # CLASSIFIER

    warnings.filterwarnings('ignore')

    print('Importando bases de datos')

    new = new.rename(
        columns={'Categories': 'Category', 'Organization Name': 'Investee'})
    train = database[['Operation', 'Investee', 'Category', 'Description',
                      'Category.1', 'Area of Focus']].dropna()
    # Explicit copy: columns are rewritten and added below.
    newdata = new[['Transaction Name', 'Investee', 'Category',
                   'Description']].copy()

    print('Preprocesamiento del texto')

    # A set gives constant-time membership tests in the filter below.
    stop_words = set(stopwords.words('english'))

    for column in ['Category', 'Description']:
        for frame in (train, newdata):
            # Lower-case every token (also collapses runs of whitespace).
            text = frame[column].apply(
                lambda value: " ".join(
                    token.lower() for token in str(value).split()))
            # Replace punctuation with spaces. ``regex=True`` is required:
            # recent pandas versions treat the pattern literally by default.
            text = text.str.replace(r'[^\w\s]', ' ', regex=True)
            # Drop stop words.
            frame[column] = text.apply(
                lambda value: " ".join(
                    token for token in str(value).split()
                    if token not in stop_words))

    train_src1 = train[['Category', 'Description', 'Category.1']].copy()
    # 'Rejected?' is 0 for rejected rows and 1 for everything else.
    train_src1['Rejected?'] = 0
    train_src1.loc[train_src1['Category.1'] != 'Rejected', 'Rejected?'] = 1

    new_src1 = newdata[['Category', 'Description']]

    # Bag-of-words encoding: fit on the training text of each column, then
    # apply the same vocabulary to the new data.
    train_parts = []
    new_parts = []
    for column in ['Category', 'Description']:
        vectorizer = CountVectorizer()
        train_parts.append(pd.DataFrame(
            vectorizer.fit_transform(train_src1[column]).toarray()))
        new_parts.append(pd.DataFrame(
            vectorizer.transform(new_src1[column]).toarray()))

    vectorI = pd.concat(train_parts, axis=1)
    vectorI_new = pd.concat(new_parts, axis=1)

    print('Entrenamiento')

    # Binary classification: rejected vs. not rejected
    # (resampling + random forest).
    brf = BalancedRandomForestClassifier(n_estimators=100, random_state=0)
    brf.fit(vectorI, train_src1['Rejected?'])
    y_train_pred = brf.predict(vectorI)
    print('Confusion matrix: \n', confusion_matrix(train_src1['Rejected?'], y_train_pred))
    print('Accuracy: \n', accuracy_score(train_src1['Rejected?'], y_train_pred))
    print('Recall: \n', recall_score(train_src1['Rejected?'], y_train_pred))

    print('Clasificacion y exportacion')
    # Apply the fitted model to the new data.
    y_new_predict = brf.predict(vectorI_new)
    y_new_predict_proba = brf.predict_proba(vectorI_new)

    newdata['Prediction'] = y_new_predict
    # Column 0 is the score for label 0 (rejected), column 1 for label 1.
    newdata['Prob. of being rejected'] = y_new_predict_proba[:, 0]
    newdata['Prob. of being of interest'] = y_new_predict_proba[:, 1]

    # Attach the predictions to the renamed input and return it.
    new = pd.concat(
        [new, newdata[['Prediction', 'Prob. of being rejected',
                       'Prob. of being of interest']]],
        axis=1, sort=False)

    return new

示例#37

0

显示文件

文件： lstm.py 项目： apcreagh/Oxford_Wearables_Activity_Recognition

## Train a random forest classifier

*Note: this may take a while*
'''

# %%
# Build and train the balanced forest in one expression; ``fit`` returns the
# fitted estimator itself. ``oob_score=True`` makes the out-of-bag decision
# function used further down available.
clf = BalancedRandomForestClassifier(
    n_estimators=2000,
    replacement=True,
    sampling_strategy='not minority',
    oob_score=True,
    n_jobs=4,
    random_state=42,
    verbose=1,
).fit(X_train, Y_train)

# Report on the held-out data.
Y_test_pred = clf.predict(X_test)
print('\nClassifier performance')
print(
    'Out of sample:\n',
    metrics.classification_report(Y_test, Y_test_pred, zero_division=0),
)

# Training set for the next stage: out-of-bag outputs on the training data.
Y_in_train = clf.oob_decision_function_.astype('float32')
# Test set for the next stage: predicted probabilities on the test data.
Y_in_test = clf.predict_proba(X_test).astype('float32')

# %% [markdown]
'''

## Architecture design
As a baseline, let's use a single-layer bidirectional LSTM.