def study_fergus(features, target, preprocessing=StandardScaler(), grid=True,
                 random_seed=42, output_file='fergus_results.json'):
    """Compare the AUC of an MLP classifier under four evaluation protocols.

    The protocols, run in this order, are:

    1. cross-validation without oversampling,
    2. cross-validation with SMOTE applied inside each training fold
       (via ``OversamplingClassifier``),
    3. in-sample evaluation (fit and score on the same data),
    4. cross-validation on data that was oversampled *before* splitting
       (the "incorrect" protocol).

    Args:
        features: pandas DataFrame of predictors (``.values`` and
            ``.columns`` are used).
        target: pandas Series of labels (``.values`` is used).
        preprocessing: transformer placed in front of the classifier; any
            falsy value disables preprocessing. NOTE: the default instance
            is created once at definition time and is shared between calls.
        grid: when truthy, wrap the classifier in a ``GridSearchCV`` scored
            by ROC AUC.
        random_seed: seed for the fold shuffling, the MLP and SMOTE.
        output_file: path of the JSON file the results are written to.

    Returns:
        dict with ``<scenario>_auc`` and ``<scenario>_details`` entries for
        the scenarios ``without_oversampling``, ``with_oversampling``,
        ``in_sample`` and ``incorrect_oversampling``.
    """
    from sklearn.neural_network import MLPClassifier

    results = {}

    # Seeded so that repeated runs with the same random_seed are comparable.
    base_classifier = MLPClassifier(random_state=random_seed)
    grid_search_params = {
        # Every entry is a tuple; a bare ``(200)`` would be the int 200.
        'hidden_layer_sizes': [(100,), (50,), (200,)],
        'activation': ['logistic', 'tanh', 'relu'],
        'alpha': [0.0001, 0.001, 0.01, 0.1],
    }

    def build_estimator(oversampler=None):
        # Assemble classifier -> optional grid search -> optional
        # oversampling wrapper -> optional preprocessing pipeline.
        classifier = base_classifier
        if grid:
            classifier = GridSearchCV(base_classifier, grid_search_params,
                                      scoring='roc_auc')
        if oversampler is not None:
            classifier = OversamplingClassifier(oversampler, classifier)
        if not preprocessing:
            return classifier
        return Pipeline([('preprocessing', preprocessing),
                         ('classifier', classifier)])

    def build_validator():
        # random_state only takes effect together with shuffle=True; recent
        # scikit-learn releases raise ValueError if it is set without it.
        return StratifiedKFold(n_splits=10, shuffle=True,
                               random_state=random_seed)

    def record(scenario, message, preds):
        # Store the AUC and the raw predictions of one scenario, and echo it.
        auc_key = scenario + '_auc'
        results[auc_key] = roc_auc_score(preds['label'].values,
                                         preds['prediction'].values)
        results[scenario + '_details'] = preds.to_dict()
        print(message, results[auc_key])

    # without oversampling
    preds = evaluate(build_estimator(), features, target, build_validator())
    record('without_oversampling', 'without oversampling: ', preds)

    # with correct oversampling: SMOTE only ever sees the training folds.
    # The full pipeline (including preprocessing) is evaluated so that this
    # scenario is comparable with the other cross-validated ones.
    pipeline = build_estimator(SMOTE(random_state=random_seed))
    preds = evaluate(pipeline, features, target, build_validator())
    record('with_oversampling', 'with oversampling: ', preds)

    # in-sample evaluation: the bare classifier is fitted and scored on the
    # very same samples, without grid search or preprocessing.
    fitted = base_classifier.fit(features.values, target.values)
    scores = fitted.predict_proba(features.values)[:, 1]
    preds = pd.DataFrame({'fold': 0,
                          'label': target.values,
                          'prediction': scores})
    record('in_sample', 'in sample: ', preds)

    # with incorrect oversampling: SMOTE runs on the whole data set before
    # the folds are drawn.
    X, y = SMOTE(random_state=random_seed).sample(features.values,
                                                  target.values)
    X = pd.DataFrame(X, columns=features.columns)
    y = pd.Series(y)
    preds = evaluate(build_estimator(), X, y, build_validator())
    record('incorrect_oversampling', 'incorrect oversampling: ', preds)

    # Context manager guarantees the file is flushed and closed.
    with open(output_file, 'w') as handle:
        json.dump(results, handle)

    return results
def fit(self, X, y):
    """Build the MLP-based estimator stack and fit it on ``X`` and ``y``.

    The stack is assembled from the inside out: an ``MLPClassifier``,
    optionally wrapped in a ``GridSearchCV`` scored by ROC AUC (when
    ``self.grid`` is truthy), wrapped in an ``OversamplingClassifier``
    using ``SMOTE``, and optionally preceded by ``self.preprocessing``
    inside a ``Pipeline`` (when ``self.preprocessing`` is truthy).

    Args:
        X: training features, forwarded unchanged to the pipeline's ``fit``.
        y: training labels, forwarded unchanged to the pipeline's ``fit``.

    Returns:
        self, with the fitted estimator stored in ``self.pipeline``.
    """
    base_classifier = MLPClassifier()
    grid_search_params = {
        # Every entry is a tuple; a bare ``(200)`` would be the int 200 and
        # would show up with a different type in ``best_params_``.
        'hidden_layer_sizes': [(100,), (50,), (200,)],
        'activation': ['logistic', 'tanh', 'relu'],
        'alpha': [0.0001, 0.001, 0.01, 0.1],
    }

    classifier = base_classifier
    if self.grid:
        classifier = GridSearchCV(base_classifier, grid_search_params,
                                  scoring='roc_auc')

    # NOTE(review): OversamplingClassifier is defined elsewhere; presumably
    # it oversamples the training data before fitting the wrapped model.
    classifier = OversamplingClassifier(SMOTE(), classifier)

    if self.preprocessing:
        self.pipeline = Pipeline([('preprocessing', self.preprocessing),
                                  ('classifier', classifier)])
    else:
        self.pipeline = classifier

    self.pipeline.fit(X, y)
    return self
def fit(self, X, y):
    """Assemble the SVC-based estimator stack and fit it on ``X`` and ``y``.

    Layers, innermost first: an ``SVC`` with probability estimates enabled;
    a ``GridSearchCV`` over ``C`` scored by ROC AUC when ``self.grid`` is
    truthy; an ``OversamplingClassifier`` driven by ``SMOTE``; and, when
    ``self.preprocessing`` is truthy, a ``Pipeline`` that runs the
    preprocessing step first.

    Args:
        X: training features, handed straight to the pipeline's ``fit``.
        y: training labels, handed straight to the pipeline's ``fit``.

    Returns:
        self, with the fitted estimator available as ``self.pipeline``.
    """
    model = SVC(random_state=self.random_state, probability=True)
    # Powers of ten from 1e-4 up to 1e4.
    c_grid = {'C': [10**exponent for exponent in range(-4, 5)]}
    if self.grid:
        model = GridSearchCV(model, c_grid, scoring='roc_auc')

    oversampler = SMOTE(random_state=self.random_state)
    model = OversamplingClassifier(oversampler, model)

    if self.preprocessing:
        model = Pipeline([
            ('preprocessing', self.preprocessing),
            ('classifier', model),
        ])

    self.pipeline = model
    self.pipeline.fit(X, y)
    return self
def fit(self, X, y):
    """Assemble the AdaBoost-based estimator stack and fit it on ``X``, ``y``.

    Layers, innermost first: an ``AdaBoostClassifier``; a ``GridSearchCV``
    over ``n_estimators`` and ``learning_rate`` scored by ROC AUC when
    ``self.grid`` is truthy; an ``OversamplingClassifier`` driven by
    ``SMOTE``; and, when ``self.preprocessing`` is truthy, a ``Pipeline``
    that runs the preprocessing step first.

    Args:
        X: training features, handed straight to the pipeline's ``fit``.
        y: training labels, handed straight to the pipeline's ``fit``.

    Returns:
        self, with the fitted estimator available as ``self.pipeline``.
    """
    model = AdaBoostClassifier(random_state=self.random_state)
    search_space = {
        'n_estimators': [10, 50, 100, 500],
        'learning_rate': [0.1, 0.5, 1.0],
    }
    if self.grid:
        model = GridSearchCV(model, search_space, scoring='roc_auc')

    oversampler = SMOTE(random_state=self.random_state)
    model = OversamplingClassifier(oversampler, model)

    if self.preprocessing:
        model = Pipeline([
            ('preprocessing', self.preprocessing),
            ('classifier', model),
        ])

    self.pipeline = model
    self.pipeline.fit(X, y)
    return self
def fit(self, X, y):
    """Assemble the random-forest estimator stack and fit it on ``X``, ``y``.

    Layers, innermost first: a ``RandomForestClassifier``; a
    ``GridSearchCV`` over ``max_depth`` and ``min_samples_leaf`` scored by
    ROC AUC when ``self.grid`` is truthy; an ``OversamplingClassifier``
    driven by ``SMOTE``; and, when ``self.preprocessing`` is truthy, a
    ``Pipeline`` that runs the preprocessing step first.

    Args:
        X: training features, handed straight to the pipeline's ``fit``.
        y: training labels, handed straight to the pipeline's ``fit``.

    Returns:
        self, with the fitted estimator available as ``self.pipeline``.
    """
    model = RandomForestClassifier(random_state=self.random_state)
    search_space = {
        'max_depth': [3, 5, 10, None],
        'min_samples_leaf': [1, 3, 5],
    }
    if self.grid:
        model = GridSearchCV(model, search_space, scoring='roc_auc')

    oversampler = SMOTE(random_state=self.random_state)
    model = OversamplingClassifier(oversampler, model)

    if self.preprocessing:
        model = Pipeline([
            ('preprocessing', self.preprocessing),
            ('classifier', model),
        ])

    self.pipeline = model
    self.pipeline.fit(X, y)
    return self
def study_idowu(features, target, preprocessing=StandardScaler(), grid=True,
                random_seed=42, output_file='idowu.json'):
    """Compare the AUC of a random forest under four evaluation protocols.

    The protocols, run in this order, are:

    1. cross-validation without oversampling,
    2. cross-validation with SMOTE applied inside each training fold
       (via ``OversamplingClassifier``),
    3. in-sample evaluation (fit and score on the same data),
    4. cross-validation on data that was oversampled *before* splitting
       (the "incorrect" protocol).

    Args:
        features: pandas DataFrame of predictors (``.values`` and
            ``.columns`` are used).
        target: pandas Series of labels (``.values`` is used).
        preprocessing: transformer placed in front of the classifier; any
            falsy value disables preprocessing. NOTE: the default instance
            is created once at definition time and is shared between calls.
        grid: when truthy, wrap the classifier in a ``GridSearchCV`` scored
            by ROC AUC.
        random_seed: seed for NumPy's global RNG, the forest, SMOTE and the
            fold shuffling.
        output_file: path of the JSON file the results are written to.

    Returns:
        dict with ``<scenario>_auc`` and ``<scenario>_details`` entries for
        the scenarios ``without_oversampling``, ``with_oversampling``,
        ``in_sample`` and ``incorrect_oversampling``.
    """
    results = {}

    base_classifier = RandomForestClassifier(random_state=random_seed)
    grid_search_params = {
        'max_depth': [3, 5, 10, None],
        'min_samples_leaf': [1, 3, 5],
    }

    np.random.seed(random_seed)

    def build_estimator(oversampler=None):
        # Assemble classifier -> optional grid search -> optional
        # oversampling wrapper -> optional preprocessing pipeline.
        classifier = base_classifier
        if grid:
            classifier = GridSearchCV(base_classifier, grid_search_params,
                                      scoring='roc_auc')
        if oversampler is not None:
            classifier = OversamplingClassifier(oversampler, classifier)
        if not preprocessing:
            return classifier
        return Pipeline([('preprocessing', preprocessing),
                         ('classifier', classifier)])

    def build_validator():
        # random_state only takes effect together with shuffle=True; recent
        # scikit-learn releases raise ValueError if it is set without it.
        return StratifiedKFold(n_splits=5, shuffle=True,
                               random_state=random_seed)

    def record(scenario, message, preds):
        # Store the AUC and the raw predictions of one scenario, and echo it.
        auc_key = scenario + '_auc'
        results[auc_key] = roc_auc_score(preds['label'].values,
                                         preds['prediction'].values)
        results[scenario + '_details'] = preds.to_dict()
        print(message, results[auc_key])

    # without oversampling
    preds = evaluate(build_estimator(), features, target, build_validator())
    record('without_oversampling', 'without oversampling: ', preds)

    # with correct oversampling: SMOTE only ever sees the training folds.
    # The full pipeline (including preprocessing) is evaluated so that this
    # scenario is comparable with the other cross-validated ones.
    pipeline = build_estimator(SMOTE(random_state=random_seed))
    preds = evaluate(pipeline, features, target, build_validator())
    record('with_oversampling', 'with oversampling: ', preds)

    # in-sample evaluation: the bare classifier is fitted and scored on the
    # very same samples, without grid search or preprocessing.
    fitted = base_classifier.fit(features.values, target.values)
    scores = fitted.predict_proba(features.values)[:, 1]
    preds = pd.DataFrame({
        'fold': 0,
        'label': target.values,
        'prediction': scores,
    })
    record('in_sample', 'in sample: ', preds)

    # with incorrect oversampling: SMOTE runs on the whole data set before
    # the folds are drawn.
    X, y = SMOTE(random_state=random_seed).sample(features.values,
                                                  target.values)
    X = pd.DataFrame(X, columns=features.columns)
    y = pd.Series(y)
    preds = evaluate(build_estimator(), X, y, build_validator())
    record('incorrect_oversampling', 'incorrect oversampling: ', preds)

    # Context manager guarantees the file is flushed and closed.
    with open(output_file, 'w') as handle:
        json.dump(results, handle)

    return results
np.random.seed(42) # Generate random data X = np.random.rand(10000, 5) y = np.random.choice([0, 1], size=(10000, ), p=[0.9, 0.1]) # Let's measure accuracy score on test set with no oversampling X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42) rf = RandomForestClassifier(n_estimators=100) rf.fit(X_train, y_train) preds = rf.predict_proba(X_test)[:, 1] print('AUC no oversampling: {}'.format(roc_auc_score(y_test, preds))) # Let's apply over_sampling on our train set and measure accuracy smote = SMOTE() X_train_s, y_train_s = smote.sample(X_train, y_train) rf = RandomForestClassifier(n_estimators=100) rf.fit(X_train_s, y_train_s) preds = rf.predict_proba(X_test)[:, 1] print('AUC with oversampling after partitioning: {}'.format( roc_auc_score(y_test, preds))) # Now let's first apply smote, then partition and measure accuracy smote = SMOTE() X_s, y_s = smote.sample(X, y) X_train, X_test, y_train, y_test = train_test_split(X_s, y_s, random_state=42) rf = RandomForestClassifier(n_estimators=100) rf.fit(X_train, y_train) preds = rf.predict_proba(X_test)[:, 1] print('AUC with oversampling before partitioning: {}'.format(