예제 #1
0
def study_fergus(features, target, preprocessing=StandardScaler(), grid=True, random_seed=42, output_file='fergus_results.json'):
    """Compare AUC estimates of an MLP classifier under four evaluation schemes.

    The schemes are run in this order:

    1. 10-fold stratified cross-validation without oversampling.
    2. 10-fold stratified cross-validation with SMOTE wrapped around the
       classifier ("correct" oversampling).
    3. In-sample evaluation: the bare ``MLPClassifier`` is fitted on the raw
       features (no preprocessing, no grid search) and scored on the same
       data.
    4. 10-fold stratified cross-validation on data that was oversampled with
       SMOTE *before* being split into folds ("incorrect" oversampling).

    Args:
        features: Feature table; must expose ``.values`` and ``.columns``
            (presumably a pandas DataFrame).
        target: Labels; must expose ``.values`` (presumably a pandas Series).
        preprocessing: Transformer placed in front of the classifier in a
            ``Pipeline``. Any falsy value disables preprocessing. The default
            instance is created once, when the function is defined, and is
            shared between calls.
        grid: If truthy, the classifier is tuned with ``GridSearchCV``
            (scored by ROC AUC) in the cross-validated schemes.
        random_seed: Retained for interface compatibility. The folds are not
            shuffled, so the splitter is deterministic without a seed.
        output_file: Path of the JSON file the results are written to.

    Returns:
        dict: For each scheme an ``<scheme>_auc`` entry with the ROC AUC and
        an ``<scheme>_details`` entry with the predictions as a dict.
    """
    from sklearn.neural_network import MLPClassifier

    results = {}
    base_classifier = MLPClassifier()
    grid_search_params = {
        # Every entry is a tuple of layer widths; "(200)" without the
        # trailing comma would be the bare integer 200.
        'hidden_layer_sizes': [(100,), (50,), (200,)],
        'activation': ['logistic', 'tanh', 'relu'],
        'alpha': [0.0001, 0.001, 0.01, 0.1],
    }

    def build_classifier():
        # Bare classifier, or a grid search over it when tuning is requested.
        if not grid:
            return base_classifier
        return GridSearchCV(base_classifier, grid_search_params, scoring='roc_auc')

    def add_preprocessing(classifier):
        # Put the preprocessing step in front of the classifier, if any.
        if not preprocessing:
            return classifier
        return Pipeline([('preprocessing', preprocessing), ('classifier', classifier)])

    def build_validator():
        # No random_state: without shuffle=True it has no effect, and
        # scikit-learn >= 0.24 raises ValueError when it is passed anyway.
        return StratifiedKFold(n_splits=10)

    def record(key, label, preds):
        # Store AUC and per-sample details under `key` and report the AUC.
        results[key + '_auc'] = roc_auc_score(preds['label'].values, preds['prediction'].values)
        results[key + '_details'] = preds.to_dict()
        print(label, results[key + '_auc'])

    # without oversampling
    pipeline = add_preprocessing(build_classifier())
    preds = evaluate(pipeline, features, target, build_validator())
    record('without_oversampling', 'without oversampling: ', preds)

    # with correct oversampling: SMOTE is part of the estimator, so
    # presumably it is applied to the training part of each fold only
    classifier = build_classifier()
    classifier = OversamplingClassifier(SMOTE(), classifier)
    pipeline = add_preprocessing(classifier)
    # Evaluate the full pipeline; evaluating `classifier` here would silently
    # skip the preprocessing step used by the other schemes.
    preds = evaluate(pipeline, features, target, build_validator())
    record('with_oversampling', 'with oversampling: ', preds)

    # in-sample evaluation (fits base_classifier in place)
    in_sample_scores = base_classifier.fit(features.values, target.values).predict_proba(features.values)[:, 1]
    preds = pd.DataFrame({'fold': 0, 'label': target.values, 'prediction': in_sample_scores})
    record('in_sample', 'in sample: ', preds)

    # with incorrect oversampling: synthetic samples are generated from the
    # whole data set before it is split into folds
    X, y = SMOTE().sample(features.values, target.values)
    X = pd.DataFrame(X, columns=features.columns)
    y = pd.Series(y)

    pipeline = add_preprocessing(build_classifier())
    preds = evaluate(pipeline, X, y, build_validator())
    record('incorrect_oversampling', 'incorrect oversampling: ', preds)

    # Context manager so the file is flushed and closed deterministically.
    with open(output_file, 'w') as handle:
        json.dump(results, handle)
    return results
예제 #2
0
    def fit(self, X, y):
        """Fit an MLP classifier with SMOTE oversampling.

        An ``MLPClassifier`` is created and, when ``self.grid`` is truthy,
        tuned with ``GridSearchCV`` (scored by ROC AUC). The estimator is
        wrapped in an ``OversamplingClassifier`` together with ``SMOTE`` and,
        when ``self.preprocessing`` is truthy, placed behind it in a
        ``Pipeline``. The result is stored in ``self.pipeline`` and fitted.

        Args:
            X: Training features, forwarded unchanged to the pipeline's ``fit``.
            y: Training labels, forwarded unchanged to the pipeline's ``fit``.

        Returns:
            This instance.
        """
        base_classifier = MLPClassifier()
        grid_search_params = {
            # Every entry is a tuple of layer widths; "(200)" without the
            # trailing comma would be the bare integer 200.
            'hidden_layer_sizes': [(100,), (50,), (200,)],
            'activation': ['logistic', 'tanh', 'relu'],
            'alpha': [0.0001, 0.001, 0.01, 0.1],
        }

        if self.grid:
            classifier = GridSearchCV(base_classifier, grid_search_params, scoring='roc_auc')
        else:
            classifier = base_classifier

        # presumably the wrapper applies SMOTE to the training data at fit
        # time only -- confirm against OversamplingClassifier
        classifier = OversamplingClassifier(SMOTE(), classifier)

        if self.preprocessing:
            self.pipeline = Pipeline([('preprocessing', self.preprocessing), ('classifier', classifier)])
        else:
            self.pipeline = classifier
        self.pipeline.fit(X, y)

        return self
예제 #3
0
    def fit(self, X, y):
        """Fit an SVM with SMOTE oversampling.

        An ``SVC`` with probability estimates enabled is created; when
        ``self.grid`` is truthy it is tuned with ``GridSearchCV`` over ``C``
        in ``10**-4 .. 10**4``, scored by ROC AUC. The estimator is wrapped
        in an ``OversamplingClassifier`` together with ``SMOTE`` and, when
        ``self.preprocessing`` is truthy, placed behind it in a ``Pipeline``.
        The result is stored in ``self.pipeline`` and fitted.

        Args:
            X: Training features, forwarded unchanged to the pipeline's ``fit``.
            y: Training labels, forwarded unchanged to the pipeline's ``fit``.

        Returns:
            This instance.
        """
        model = SVC(random_state=self.random_state, probability=True)
        c_values = [10**exponent for exponent in range(-4, 5)]

        if self.grid:
            model = GridSearchCV(model, {'C': c_values}, scoring='roc_auc')

        # presumably the wrapper applies SMOTE to the training data at fit
        # time only -- confirm against OversamplingClassifier
        oversampler = SMOTE(random_state=self.random_state)
        model = OversamplingClassifier(oversampler, model)

        if self.preprocessing:
            steps = [
                ('preprocessing', self.preprocessing),
                ('classifier', model),
            ]
            model = Pipeline(steps)

        self.pipeline = model
        self.pipeline.fit(X, y)
        return self
예제 #4
0
    def fit(self, X, y):
        """Fit an AdaBoost classifier with SMOTE oversampling.

        An ``AdaBoostClassifier`` is created; when ``self.grid`` is truthy it
        is tuned with ``GridSearchCV`` over ``n_estimators`` and
        ``learning_rate``, scored by ROC AUC. The estimator is wrapped in an
        ``OversamplingClassifier`` together with ``SMOTE`` and, when
        ``self.preprocessing`` is truthy, placed behind it in a ``Pipeline``.
        The result is stored in ``self.pipeline`` and fitted.

        Args:
            X: Training features, forwarded unchanged to the pipeline's ``fit``.
            y: Training labels, forwarded unchanged to the pipeline's ``fit``.

        Returns:
            This instance.
        """
        model = AdaBoostClassifier(random_state=self.random_state)
        search_space = dict(
            n_estimators=[10, 50, 100, 500],
            learning_rate=[0.1, 0.5, 1.0],
        )

        if self.grid:
            model = GridSearchCV(model, search_space, scoring='roc_auc')

        # presumably the wrapper applies SMOTE to the training data at fit
        # time only -- confirm against OversamplingClassifier
        oversampler = SMOTE(random_state=self.random_state)
        model = OversamplingClassifier(oversampler, model)

        if self.preprocessing:
            steps = [
                ('preprocessing', self.preprocessing),
                ('classifier', model),
            ]
            model = Pipeline(steps)

        self.pipeline = model
        self.pipeline.fit(X, y)
        return self
예제 #5
0
    def fit(self, X, y):
        """Fit a random forest with SMOTE oversampling.

        A ``RandomForestClassifier`` is created; when ``self.grid`` is truthy
        it is tuned with ``GridSearchCV`` over ``max_depth`` and
        ``min_samples_leaf``, scored by ROC AUC. The estimator is wrapped in
        an ``OversamplingClassifier`` together with ``SMOTE`` and, when
        ``self.preprocessing`` is truthy, placed behind it in a ``Pipeline``.
        The result is stored in ``self.pipeline`` and fitted.

        Args:
            X: Training features, forwarded unchanged to the pipeline's ``fit``.
            y: Training labels, forwarded unchanged to the pipeline's ``fit``.

        Returns:
            This instance.
        """
        model = RandomForestClassifier(random_state=self.random_state)
        search_space = dict(
            max_depth=[3, 5, 10, None],
            min_samples_leaf=[1, 3, 5],
        )

        if self.grid:
            model = GridSearchCV(model, search_space, scoring='roc_auc')

        # presumably the wrapper applies SMOTE to the training data at fit
        # time only -- confirm against OversamplingClassifier
        oversampler = SMOTE(random_state=self.random_state)
        model = OversamplingClassifier(oversampler, model)

        if self.preprocessing:
            steps = [
                ('preprocessing', self.preprocessing),
                ('classifier', model),
            ]
            model = Pipeline(steps)

        self.pipeline = model
        self.pipeline.fit(X, y)
        return self
예제 #6
0
def study_idowu(features,
                target,
                preprocessing=StandardScaler(),
                grid=True,
                random_seed=42,
                output_file='idowu.json'):
    """Compare AUC estimates of a random forest under four evaluation schemes.

    The schemes are run in this order:

    1. 5-fold stratified cross-validation without oversampling.
    2. 5-fold stratified cross-validation with SMOTE wrapped around the
       classifier ("correct" oversampling).
    3. In-sample evaluation: the bare ``RandomForestClassifier`` is fitted on
       the raw features (no preprocessing, no grid search) and scored on the
       same data.
    4. 5-fold stratified cross-validation on data that was oversampled with
       SMOTE *before* being split into folds ("incorrect" oversampling).

    Args:
        features: Feature table; must expose ``.values`` and ``.columns``
            (presumably a pandas DataFrame).
        target: Labels; must expose ``.values`` (presumably a pandas Series).
        preprocessing: Transformer placed in front of the classifier in a
            ``Pipeline``. Any falsy value disables preprocessing. The default
            instance is created once, when the function is defined, and is
            shared between calls.
        grid: If truthy, the classifier is tuned with ``GridSearchCV``
            (scored by ROC AUC) in the cross-validated schemes.
        random_seed: Seed for the random forest, the SMOTE samplers and
            NumPy's global random state (which this function reseeds).
        output_file: Path of the JSON file the results are written to.

    Returns:
        dict: For each scheme an ``<scheme>_auc`` entry with the ROC AUC and
        an ``<scheme>_details`` entry with the predictions as a dict.
    """
    results = {}
    base_classifier = RandomForestClassifier(random_state=random_seed)
    grid_search_params = {
        'max_depth': [3, 5, 10, None],
        'min_samples_leaf': [1, 3, 5],
    }

    np.random.seed(random_seed)

    def build_classifier():
        # Bare classifier, or a grid search over it when tuning is requested.
        if not grid:
            return base_classifier
        return GridSearchCV(base_classifier,
                            grid_search_params,
                            scoring='roc_auc')

    def add_preprocessing(classifier):
        # Put the preprocessing step in front of the classifier, if any.
        if not preprocessing:
            return classifier
        return Pipeline([('preprocessing', preprocessing),
                         ('classifier', classifier)])

    def build_validator():
        # No random_state: without shuffle=True it has no effect, and
        # scikit-learn >= 0.24 raises ValueError when it is passed anyway.
        return StratifiedKFold(n_splits=5)

    def record(key, label, preds):
        # Store AUC and per-sample details under `key` and report the AUC.
        results[key + '_auc'] = roc_auc_score(preds['label'].values,
                                              preds['prediction'].values)
        results[key + '_details'] = preds.to_dict()
        print(label, results[key + '_auc'])

    # without oversampling
    pipeline = add_preprocessing(build_classifier())
    preds = evaluate(pipeline, features, target, build_validator())
    record('without_oversampling', 'without oversampling: ', preds)

    # with correct oversampling: SMOTE is part of the estimator, so
    # presumably it is applied to the training part of each fold only
    classifier = build_classifier()
    classifier = OversamplingClassifier(SMOTE(random_state=random_seed),
                                        classifier)
    pipeline = add_preprocessing(classifier)
    # Evaluate the full pipeline; evaluating `classifier` here would silently
    # skip the preprocessing step used by the other schemes.
    preds = evaluate(pipeline, features, target, build_validator())
    record('with_oversampling', 'with oversampling: ', preds)

    # in-sample evaluation (fits base_classifier in place)
    fitted = base_classifier.fit(features.values, target.values)
    in_sample_scores = fitted.predict_proba(features.values)[:, 1]
    preds = pd.DataFrame({
        'fold': 0,
        'label': target.values,
        'prediction': in_sample_scores,
    })
    record('in_sample', 'in sample: ', preds)

    # with incorrect oversampling: synthetic samples are generated from the
    # whole data set before it is split into folds
    X, y = SMOTE(random_state=random_seed).sample(features.values,
                                                  target.values)
    X = pd.DataFrame(X, columns=features.columns)
    y = pd.Series(y)

    pipeline = add_preprocessing(build_classifier())
    preds = evaluate(pipeline, X, y, build_validator())
    record('incorrect_oversampling', 'incorrect oversampling: ', preds)

    # Context manager so the file is flushed and closed deterministically.
    with open(output_file, 'w') as handle:
        json.dump(results, handle)
    return results
예제 #7
0
# Demonstration of how the placement of SMOTE oversampling relative to the
# train/test split changes the measured ROC AUC. The labels below are drawn
# independently of the features, so there is no real signal to learn and an
# honest AUC estimate should be close to 0.5.
np.random.seed(42)

# Generate random data: 10000 samples with 5 uniform features and binary
# labels drawn with probabilities 0.9 (class 0) and 0.1 (class 1)
X = np.random.rand(10000, 5)
y = np.random.choice([0, 1], size=(10000, ), p=[0.9, 0.1])

# Scenario 1: measure the ROC AUC on the test set with no oversampling
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)
# Column 1 of predict_proba is used as the score for roc_auc_score
preds = rf.predict_proba(X_test)[:, 1]
print('AUC no oversampling: {}'.format(roc_auc_score(y_test, preds)))

# Scenario 2: oversample the training set only, then measure the ROC AUC on
# the untouched test set from the split above
smote = SMOTE()
X_train_s, y_train_s = smote.sample(X_train, y_train)
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train_s, y_train_s)
preds = rf.predict_proba(X_test)[:, 1]
print('AUC with oversampling after partitioning: {}'.format(
    roc_auc_score(y_test, preds)))

# Scenario 3: apply SMOTE to the whole data set first, then partition and
# measure the ROC AUC. The test set now comes from the oversampled data, and
# X_train/X_test/y_train/y_test are rebound to this new split.
smote = SMOTE()
X_s, y_s = smote.sample(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_s, y_s, random_state=42)
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)
preds = rf.predict_proba(X_test)[:, 1]
print('AUC with oversampling before partitioning: {}'.format(