def objective(trial):
    train_X, val_X, train_y, val_y = train_test_split(self.X, self.y, test_size=0.2)
    # Fit the imputer on the training fold only, then transform the other splits
    # (refitting it on each split would leak their statistics into the model).
    median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
    v_train_X = median_imputer.fit_transform(train_X)
    v_val_X = median_imputer.transform(val_X)
    train_X = pd.DataFrame(v_train_X, columns=train_X.columns, index=train_X.index)
    val_X = pd.DataFrame(v_val_X, columns=val_X.columns, index=val_X.index)
    v_test_X = median_imputer.transform(self.X_validation)
    test_X = pd.DataFrame(v_test_X, columns=self.X_validation.columns,
                          index=self.X_validation.index)

    # Hyper-parameter search space
    list_trees = [250, 500, 1000, 1500, 3000, 3500, 4000]
    brf_n_estimators = trial.suggest_categorical('n_estimators', list_trees)
    brf_max_features = trial.suggest_uniform('max_features', 0.15, 1.0)
    brf_min_samples_split = trial.suggest_int('min_samples_split', 2, 16)
    brf_min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 16)
    brf_min_weight_fraction_leaf = trial.suggest_uniform('min_weight_fraction_leaf', 0, 0.5)
    brf_max_depth = trial.suggest_int('max_depth', 2, 32)

    brfmodel = BalancedRandomForestClassifier(
        n_estimators=brf_n_estimators,
        max_features=brf_max_features,
        min_samples_split=brf_min_samples_split,
        min_samples_leaf=brf_min_samples_leaf,
        max_depth=brf_max_depth,
        min_weight_fraction_leaf=brf_min_weight_fraction_leaf,
        bootstrap=True)
    brfmodel.fit(train_X, train_y)

    # Optimise AUC on the validation split; the held-out set is only reported.
    aucbrf = roc_auc_score(val_y, brfmodel.predict_proba(val_X)[:, 1])
    aucbrf_test = roc_auc_score(self.y_validation, brfmodel.predict_proba(test_X)[:, 1])
    print('Accuracy test ' + str(accuracy_score(self.y_validation, brfmodel.predict(test_X))))
    plt.figure()
    plot_confusion_matrix(brfmodel, test_X, self.y_validation,
                          cmap=plt.cm.Blues, normalize=None)
    plt.show()
    print(aucbrf_test)
    return aucbrf
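# A minimal Optuna driver sketch for the objective above, assuming `objective`
# is in scope at this point; the trial count is an arbitrary example.
import optuna

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)
print('Best parameters:', study.best_params)
print('Best validation AUC:', study.best_value)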
def evaluate_on_validation_or_test(self, test=False): with open(self.result_folder + '/param_RF_{}.json'.format(self.epoch)) as f: dati = json.load(f) for data in dati: del data['value'] rf_model = BalancedRandomForestClassifier(**data) trainX = self.X trainy = self.y valx = self.X_validation valy = self.y_validation if test == True: testx = self.X_test testy = self.y_test median_imputer = SimpleImputer(missing_values=np.NaN, strategy='median') imputer = median_imputer.fit(trainX) vtrainX = imputer.transform(trainX) trainX = pd.DataFrame(vtrainX, columns=trainX.columns, index=trainX.index) vvalX = imputer.transform(valx) valx = pd.DataFrame(vvalX, columns=valx.columns, index=valx.index) if test == True: vtest = imputer.transform(testx) testx = pd.DataFrame(vtest, columns=testx.columns, index=testx.index) trainX = pd.concat([trainX, valx]) trainy = np.concatenate((trainy, valy)) rf_model.fit(trainX, trainy) if test == True: roc_rf = roc_auc_score(testy, rf_model.predict_proba(testx)[:, 1]) else: roc_rf = roc_auc_score(valy, rf_model.predict_proba(valx)[:, 1]) if test == False: print("Validation AUC: {}".format(str(roc_rf))) else: print("Test AUC: {}".format(str(roc_rf)))
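# A sketch of how a param_RF_<epoch>.json file in the format the loader above
# expects could be produced from an Optuna study; `study`, `result_folder` and
# `epoch` are assumed to exist in the tuning code and are not defined here.
import json

best = dict(study.best_params)
best['value'] = study.best_value  # stripped again by the loader before building the model
with open(result_folder + '/param_RF_{}.json'.format(epoch), 'w') as f:
    json.dump([best], f)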
def main(): """ Main entrance.""" print('Spliting challenges') split_challenges() print('Reading X...') X = pd.concat([pd.read_json(XY_PATH['X'].format(i), orient='records') for i in range(1, 163)]).set_index(['l0', 'l1']) print('Reading y...') y = pd.concat([pd.read_json(XY_PATH['y'].format(i), orient='records') for i in range(1, 163)]).set_index(['l0', 'l1']) print('\nTraining Inner sampler RFC') for i in range(10): print(f'Training 10-Fold CV #{i}', end='\r') X_train, X_test, y_train, y_test = get_train_test_Xy(X, y, i) balanced_rfc = BalancedRandomForestClassifier(n_estimators=100, random_state=0) balanced_rfc.fit(X_train.to_numpy(), y_train.to_numpy().ravel()) pd.DataFrame(balanced_rfc.predict_proba(X_test.to_numpy()), index=y_test.index).reset_index().to_json(os.path.join(RESULT_PATH, 'brf', f'y_prob_{i}.json'), orient='records') pd.Series(balanced_rfc.feature_importances_).to_json(os.path.join(RESULT_PATH, 'brf', f'feature_importance_{i}.json')) print('\nTraining RandomUnderSampler') for i in range(10): print(f'Training 10-Fold CV #{i}', end='\r') X_train, X_test, y_train, y_test = get_train_test_Xy(X, y, i) rfc = RandomForestClassifier(n_estimators=100, random_state=0) rus = RandomUnderSampler(random_state=0) X_resample, y_resample = rus.fit_resample(X_train.to_numpy(), y_train.to_numpy().ravel()) rfc.fit(X_resample, y_resample) pd.DataFrame(rfc.predict_proba(X_test.to_numpy()), index=y_test.index).reset_index().to_json(os.path.join(RESULT_PATH, 'rus', f'y_prob_{i}.json'), orient='records') pd.Series(rfc.feature_importances_).to_json(os.path.join(RESULT_PATH, 'rus', f'feature_importance_{i}.json'))
def evaluate(X_train, y_train, X_test, y_test):
    global seed
    clf = BalancedRandomForestClassifier(n_estimators=500, random_state=seed)
    clf = clf.fit(X_train, y_train)
    # argsort over the probability columns yields class indices ordered from
    # least to most probable; this equals the class labels only while the labels
    # are the integers 0..K-1.
    y_pred = clf.predict_proba(X_test).argsort(axis=1)
    y_pred1 = y_pred[:, -1]  # top-1 prediction
    y_pred2 = y_pred[:, -2]  # top-2 prediction
    return metrics.confusion_matrix(y_test, y_pred1), metrics.confusion_matrix(y_test, y_pred2)
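# argsort over predict_proba returns column positions, which coincide with the
# class labels only when clf.classes_ is exactly [0, 1, ..., K-1]; a label-safe
# variant, assuming a fitted clf and X_test as in evaluate above:
order = clf.predict_proba(X_test).argsort(axis=1)
top1 = clf.classes_[order[:, -1]]  # most probable class label
top2 = clf.classes_[order[:, -2]]  # second most probable class label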
def evaluate_model(self):
    with open(self.result_folder + '/param_RF_{}.json'.format(self.epoch)) as f:
        dati = json.load(f)
    for data in dati:
        del data['value']
        rf_model = BalancedRandomForestClassifier(**data)
    rf_auc = []
    for i in tqdm(range(20)):
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=i + 187462)
        for train_index, test_index in cv.split(self.X, self.y):
            trainX = self.X.iloc[train_index]
            testX = self.X.iloc[test_index]
            trainy = np.take(self.y, train_index)
            testy = np.take(self.y, test_index)
            # Fit the imputer on the training fold and reuse it for the test fold
            # (fitting a second imputer on the test fold would leak its medians).
            median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
            imputer = median_imputer.fit(trainX)
            vtrainX = imputer.transform(trainX)
            vtestX = imputer.transform(testX)
            trainX = pd.DataFrame(vtrainX, columns=trainX.columns, index=trainX.index)
            testX = pd.DataFrame(vtestX, columns=testX.columns, index=testX.index)
            # Compute the AUC for the best results from the tuning step
            rf_model.fit(trainX, trainy)
            roc_rf = roc_auc_score(testy, rf_model.predict_proba(testX)[:, 1])
            rf_auc.append(roc_rf)
            print(roc_rf)
    print(statistics.mean(rf_auc))
    return rf_auc
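# The 20 x 5-fold loop above can also be written with scikit-learn's
# RepeatedStratifiedKFold; a minimal sketch meant to live inside evaluate_model,
# with the fold body (imputation, fit, AUC) kept exactly as above.
from sklearn.model_selection import RepeatedStratifiedKFold

rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=20, random_state=187462)
for train_index, test_index in rskf.split(self.X, self.y):
    pass  # impute on the training fold, fit rf_model and collect roc_auc_score as above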
def run_best_estimator(self, train_x, train_y, test_x, test_y, estimator, params, clf_type, question): estimator_scores = {} if estimator == 'BalancedRandomForestClassifier': clf = BalancedRandomForestClassifier( n_estimators=params['n_estimators'], sampling_strategy=params['sampling_strategy'], random_state=42) elif estimator == 'BalancedBaggingClassifier': clf = BalancedBaggingClassifier( n_estimators=params['n_estimators'], bootstrap=params['bootstrap'], max_samples=params['max_samples'], sampling_strategy=params['sampling_strategy'], random_state=42) elif estimator == 'EasyEnsembleClassifier': clf = EasyEnsembleClassifier( n_estimators=params['n_estimators'], sampling_strategy=params['sampling_strategy'], random_state=42) clf.fit(train_x, train_y) cross_val_scores = self.calc_cross_val_scores(clf, train_x, train_y, clf_type, question) predicted_labels = clf.predict(test_x) tn, fp, fn, tp = confusion_matrix(test_y, predicted_labels).ravel() specificity = round((tn / (tn + fp)) * 100, 2) predicted_prob = clf.predict_proba(test_x) predicted_prob_true = [p[1] for p in predicted_prob] estimator_scores['Question'] = question estimator_scores['Accuracy'] = round( accuracy_score(test_y, predicted_labels) * 100, 2) estimator_scores['Balanced Accuracy'] = round( balanced_accuracy_score(test_y, predicted_labels) * 100, 2) estimator_scores['Precision'] = round( precision_score(test_y, predicted_labels) * 100, 2) estimator_scores['Recall'] = round( recall_score(test_y, predicted_labels) * 100, 2) estimator_scores['Specificity'] = specificity estimator_scores['F1'] = round(f1_score(test_y, predicted_labels), 2) estimator_scores['ROC AUC'] = round( roc_auc_score(test_y, predicted_prob_true), 2) # print('Perfect Confusion Matrix for Q-%s is: ' % (str(question).zfill(2))) # perfect_labels = train_y # print(confusion_matrix(train_y, perfect_labels)) return cross_val_scores, estimator_scores
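# Example call on an instance of the surrounding class, with a hypothetical
# parameter dictionary whose keys mirror the ones read above for
# BalancedRandomForestClassifier; `model`, `clf_type` and `question` are
# placeholders, not values from the original code.
params = {'n_estimators': 200, 'sampling_strategy': 'auto'}
cv_scores, test_scores = model.run_best_estimator(train_x, train_y, test_x, test_y,
                                                  'BalancedRandomForestClassifier',
                                                  params, clf_type='rf', question=1)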
class BaselineRandomForest(BaseClassifier): def __init__(self): self.random_forest_classifier = RandomForestClassifier( n_estimators=500, max_features='auto', max_depth=None, n_jobs=1, class_weight=None, criterion='entropy', min_samples_split=2, min_samples_leaf=1) self.feature_preprocessor = FeaturePreprocessor() self.feature_list = None self.model_filename = 'baseline_rf.pkl' def fit(self, samples: pd.DataFrame, labels: pd.DataFrame): samples = self.feature_preprocessor.preprocess_features(samples) samples = self.feature_preprocessor.remove_duplicates(samples) # intersect samples and labels samples, labels = intersect_oids_in_dataframes(samples, labels) self.feature_list = samples.columns samples_np_array = samples.values labels_np_array = labels['classALeRCE'].loc[samples.index].values self.random_forest_classifier.fit(samples_np_array, labels_np_array) def predict_proba(self, samples: pd.DataFrame) -> pd.DataFrame: samples = self.feature_preprocessor.preprocess_features(samples) samples_np_array = samples[self.feature_list].values predicted_probs = self.random_forest_classifier.predict_proba( samples_np_array) predicted_probs_df = pd.DataFrame(predicted_probs, columns=self.get_list_of_classes(), index=samples.index.values) predicted_probs_df.index.name = 'oid' return predicted_probs_df def get_list_of_classes(self) -> list: return self.random_forest_classifier.classes_ def save_model(self, directory: str) -> None: with open(os.path.join(directory, self.model_filename), 'wb') as f: pickle.dump(self.random_forest_classifier, f, pickle.HIGHEST_PROTOCOL) with open(os.path.join(directory, 'feature_list.pkl'), 'wb') as f: pickle.dump(self.feature_list, f, pickle.HIGHEST_PROTOCOL) def load_model(self, directory: str) -> None: rf = pd.read_pickle(os.path.join(directory, self.model_filename)) self.random_forest_classifier = rf self.feature_list = pd.read_pickle( os.path.join(directory, 'feature_list.pkl'))
def objective(trial):
    train_X = self.df_train_media.loc[:, self.df_train_media.columns != '41'].values
    val_X = self.df_validation_media.loc[:, self.df_validation_media.columns != '41'].values
    train_y = self.df_train_media['41'].values
    val_y = self.df_validation_media['41'].values
    test_X = self.df_test_media.loc[:, self.df_test_media.columns != '41'].values
    test_y = self.df_test_media['41'].values

    list_trees = [250, 500, 1000, 1500, 3000, 3500, 4000]
    n_estimators = trial.suggest_categorical('n_estimators', list_trees)
    max_features = trial.suggest_uniform('max_features', 0.15, 1.0)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 16)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 16)
    min_weight_fraction_leaf = trial.suggest_uniform('min_weight_fraction_leaf', 0, 0.5)
    max_depth = trial.suggest_int('max_depth', 2, 32)

    brfmodel = BalancedRandomForestClassifier(
        n_estimators=n_estimators,
        max_features=max_features,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_depth=max_depth,
        min_weight_fraction_leaf=min_weight_fraction_leaf,
        bootstrap=True)
    brfmodel.fit(train_X, train_y)

    aucbrf = roc_auc_score(val_y, brfmodel.predict_proba(val_X)[:, 1])
    print("Test AUC: " + str(roc_auc_score(test_y, brfmodel.predict_proba(test_X)[:, 1])))
    return aucbrf
def balanced_random_forest(train_features, train_labels, test_features, feature_list=None, hfo_type_name=None): rf = BalancedRandomForestClassifier( random_state=32, n_jobs=-1, # use all available processors # class_weight='balanced_subsample' ) rf.fit(train_features, train_labels) # Predict over test rf_predictions = rf.predict(test_features) rf_probs = rf.predict_proba(test_features)[:, 1] # IF FEATURE IMPORTANCE FIGS NOT EXISTS # print_feature_importances(rf, feature_list) # graphics.feature_importances(feature_list, rf.feature_importances_, hfo_type_name) return rf_predictions, rf_probs, rf
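# Example use of the helper above; the train/test arrays and labels are
# placeholders standing in for the HFO feature matrices used elsewhere.
from sklearn.metrics import roc_auc_score

predictions, probs, model = balanced_random_forest(train_features, train_labels, test_features)
print('Balanced RF test ROC AUC:', roc_auc_score(test_labels, probs))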
from sklearn.metrics import classification_report cr_rf = classification_report(y_test, y_pred_rf) print(cr_rf) visualizer = ClassificationReport(rf, classes=['Buy', 'No Buy'], support=True) visualizer.fit(X_train, y_train) visualizer.score(X_test, y_test) g = visualizer.poof() #--------------------Balanced Random Forest------------------------------ from imblearn.ensemble import BalancedRandomForestClassifier brf = BalancedRandomForestClassifier(class_weight= "balanced", random_state = 0) brf.fit(X_train, y_train) # ROC predicted_probas = brf.predict_proba(X_test) import scikitplot as skplt skplt.metrics.plot_roc(y_test, predicted_probas) plt.show() #Precision Recall Curve skplt.metrics.plot_precision_recall_curve(y_test, predicted_probas) plt.show() # Predicting the Test set results y_pred_brf = brf.predict(X_test) predictions = [round(value) for value in y_pred_brf] from sklearn.metrics import f1_score f1_brf = f1_score(y_test, y_pred_brf)
def train_classifier( _train_df_x, train_df_y, _val_df_x, val_df_y, lcset_info, nan_mode=NAN_MODE, ): class_names = lcset_info['class_names'] train_df_x, mean_train_df_x, null_cols = clean_df_nans(_train_df_x, mode=NAN_MODE) features = list(train_df_x.columns) best_rf = None best_rf_metric = -np.inf for criterion in ['gini', 'entropy']: # for criterion in ['entropy']: for max_depth in [1, 2, 3, 4, 5][::-1]: # for max_depth in [1, 2, 4, 8, 16][::-1]: for max_samples in np.linspace(.1, .9, 6): # for max_samples in [None]: rf = BalancedRandomForestClassifier( # BalancedRandomForestClassifier RandomForestClassifier n_jobs=N_JOBS, criterion=criterion, max_depth=max_depth, n_estimators=512, # 16 256 512 1024 2048 max_samples=max_samples, max_features='auto', # None auto # min_samples_split=min_samples_split, bootstrap=True, #verbose=1, ) rf.fit(train_df_x.values, train_df_y[['_y']].values[..., 0]) val_df_x, _, _ = clean_df_nans(_val_df_x, mode=NAN_MODE, df_values=mean_train_df_x) y_pred_p = rf.predict_proba(val_df_x.values) y_true = val_df_y[['_y']].values[..., 0] metrics_cdict, metrics_dict, cm = get_multiclass_metrics( y_pred_p, y_true, class_names) rf_metric = metrics_dict['b-f1score'] # recall f1score recall = {c: metrics_cdict[c]['recall'] for c in class_names} print( f'samples={len(train_df_y)}: features={len(features)}; criterion={criterion}; max_depth={max_depth}; max_samples={max_samples}; rf_metric={rf_metric}; best_rf_metric={best_rf_metric}; recall={recall}' ) if rf_metric > best_rf_metric: best_rf = rf best_rf_metric = rf_metric ### save best rank = TopRank('features', n=30) rank.add_list(features, best_rf.feature_importances_) rank.calcule() print(rank) d = { 'rf': best_rf, 'mean_train_df_x': mean_train_df_x, 'null_cols': null_cols, 'features': features, 'rank': rank, } return d
else: finite_idx = np.where(np.isfinite(column))[0] x = vectors[finite_idx, :] y = column[finite_idx] if y.sum() == 0 or y.sum() == len(y): print("%15s: undefined" % (name)) continue train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, stratify=y) if args.brf: rf = BalancedRandomForestClassifier(n_estimators=100, n_jobs=4) else: rf = RandomForestClassifier(n_estimators=100, n_jobs=4) rf.fit(train_x, train_y) p_te = rf.predict_proba(test_x) auc_te = roc_auc_score(test_y, p_te[:, 1]) bacc = balanced_accuracy_score(test_y, p_te[:, 1].round(0)) print("%15s: %3.5f %3.5f" % (name, auc_te, bacc)) bacc_av += bacc auc_av += auc_te if not (args.save is None): gzpickle(args.save + '_%i.pkz' % i, rf) print('Averages:') print('AUC: %8.3f BAcc: %8.3f' % (auc_av / (i + 1), bacc_av / (i + 1)))
def main(): f = open("trainingData/featuresall_train.txt") data = [] label = [] for lineNumber, line in enumerate(f): if lineNumber != 0: newLine = line.rstrip() entries = newLine.split('\t') data.append(list(map(float, entries[2:]))) label.append(int(entries[1])) f.close() f2 = open("testingData/featuresall_test.txt") extractTest = [] extractIds = [] for lineNumber, line in enumerate(f2): if lineNumber != 0: newLine = line.rstrip() entries = newLine.split('\t') extractTest.append(list(map(float, entries[1:]))) extractIds.append(entries[0]) f2.close() trainLabel = np.array(label) trainData = np.array(data) testData = np.asarray(extractTest) ########################################################### X_train, X_test, y_train, y_test = model_selection.train_test_split( trainData, trainLabel, test_size=0.33) X_train, y_train = BorderlineSMOTE().fit_resample(X_train, y_train) clf_train = BalancedRandomForestClassifier(n_estimators=100, max_depth=5) clf_train = clf_train.fit(X_train, y_train) y_dt_pred_train = clf_train.predict_proba(X_test) onlyPKpredictions_train = y_dt_pred_train[:, 1] fpr, tpr, thresholds = metrics.roc_curve(y_test, onlyPKpredictions_train) print("accuracy BRFC AUC:", metrics.auc(fpr, tpr)) ########################################################### X_train, y_train = BorderlineSMOTE().fit_resample(trainData, trainLabel) clf_test = BalancedRandomForestClassifier(n_estimators=100, max_depth=5) clf_test = clf_test.fit(X_train, y_train) y_dt_pred_test = clf_test.predict_proba(testData) onlyPKpredictions_test = y_dt_pred_test[:, 1] ########################################################### o = open('featuresall_pred3.txt', 'w') for i in range(len(extractIds)): entry = extractIds[i] + "," + str(onlyPKpredictions_test[i]) o.write(entry) o.write('\n') o.close()
y_pred2 = y_pred[:, -2] return metrics.confusion_matrix(y_test, y_pred1), metrics.confusion_matrix( y_test, y_pred2) fields = list(label2int.index) + ["rank"] records = [] loo = LeaveOneOut() y_cv_pred1 = [] y_cv_pred2 = [] y_cv = [] for train_idx, test_idx in tqdm(loo.split(X_train)): clf = BalancedRandomForestClassifier(n_estimators=500, random_state=seed) clf.fit(X_train[train_idx, :], y_train[train_idx]) y_p = clf.predict_proba(X_train[test_idx, :]).argsort(axis=1) y_cv_pred1.append(y_p[:, -1][0]) y_cv_pred2.append(y_p[:, -2][0]) y_cv.append(y_train[test_idx][0]) y_cv_pred1 = np.array(y_cv_pred1) y_cv_pred2 = np.array(y_cv_pred2) y_cv = np.array(y_cv) c1 = metrics.confusion_matrix(y_cv, y_cv_pred1) c2 = metrics.confusion_matrix(y_cv, y_cv_pred2) df1 = pd.DataFrame(data=c1, index=label2int.index, columns=label2int.index) df2 = pd.DataFrame(data=c2, index=label2int.index, columns=label2int.index) acc1 = df1 / df1.sum(axis=1).values.reshape((-1, 1)) acc2 = df2 / df2.sum(axis=1).values.reshape((-1, 1)) acc1.to_csv(args.confusion_loo, sep="\t") top1 = list(np.diagonal(acc1.values)) + ["top1-loo"] top2 = list(np.diagonal((acc1 + acc2).values)) + ["top2-loo"]
class HierarchicalRandomForest(BaseClassifier): MODEL_NAME = "hierarchical_random_forest" MODEL_VERSION = "1.0.0" MODEL_VERSION_NAME = f"{MODEL_NAME}_{MODEL_VERSION}" MODEL_PICKLE_PATH = os.path.join(PICKLE_PATH, f"{MODEL_VERSION_NAME}") def __init__(self, taxonomy_dictionary, non_used_features=None): n_trees = 500 self.top_classifier = RandomForestClassifier(n_estimators=n_trees, max_depth=None, max_features='auto') self.stochastic_classifier = RandomForestClassifier( n_estimators=n_trees, max_depth=None, max_features=0.2) self.periodic_classifier = RandomForestClassifier(n_estimators=n_trees, max_depth=None, max_features='auto') self.transient_classifier = RandomForestClassifier( n_estimators=n_trees, max_depth=None, max_features='auto') self.feature_preprocessor = FeaturePreprocessor( non_used_features=non_used_features) self.taxonomy_dictionary = taxonomy_dictionary self.feature_list = None self.inverted_dictionary = invert_dictionary(self.taxonomy_dictionary) self.pickles = { "features_list": "features_RF_model.pkl", "top_rf": "hierarchical_level_RF_model.pkl", "periodic_rf": "periodic_level_RF_model.pkl", "stochastic_rf": "stochastic_level_RF_model.pkl", "transient_rf": "transient_level_RF_model.pkl" } self.url_model = f"https://assets.alerce.online/pipeline/hierarchical_rf_{self.MODEL_VERSION}/" def fit(self, samples: pd.DataFrame, labels: pd.DataFrame) -> None: labels = labels.copy() # Check that the received labels are in the taxonomy feeded_labels = labels.classALeRCE.unique() expected_labels = self.inverted_dictionary.keys() for label in feeded_labels: if label not in expected_labels: raise Exception(f'{label} is not in the taxonomy dictionary') # Create top class labels['top_class'] = labels['classALeRCE'].map( self.inverted_dictionary) # Preprocessing samples = self.feature_preprocessor.preprocess_features(samples) samples = self.feature_preprocessor.remove_duplicates(samples) samples, labels = intersect_oids_in_dataframes(samples, labels) # Save list of features to know their order self.feature_list = samples.columns # Train top classifier self.top_classifier.fit(samples.values, labels['top_class'].values) # Train specialized classifiers is_stochastic = labels['top_class'] == 'Stochastic' self.stochastic_classifier.fit( samples[is_stochastic].values, labels[is_stochastic]['classALeRCE'].values) is_periodic = labels['top_class'] == 'Periodic' self.periodic_classifier.fit(samples[is_periodic].values, labels[is_periodic]['classALeRCE'].values) is_transient = labels['top_class'] == 'Transient' self.transient_classifier.fit( samples[is_transient].values, labels[is_transient]['classALeRCE'].values) def check_missing_features(self, columns, feature_list): missing = set(feature_list).difference(set(columns)) return missing def predict_proba(self, samples: pd.DataFrame) -> pd.DataFrame: missing = self.check_missing_features(samples.columns, self.feature_list) if len(missing) > 0: raise Exception(f"Missing features: {missing}") samples = samples[self.feature_list] samples = self.feature_preprocessor.preprocess_features(samples) top_probs = self.top_classifier.predict_proba(samples.values) stochastic_probs = self.stochastic_classifier.predict_proba( samples.values) periodic_probs = self.periodic_classifier.predict_proba(samples.values) transient_probs = self.transient_classifier.predict_proba( samples.values) stochastic_index = self.top_classifier.classes_.tolist().index( 'Stochastic') periodic_index = self.top_classifier.classes_.tolist().index( 'Periodic') transient_index = 
self.top_classifier.classes_.tolist().index( 'Transient') stochastic_probs = stochastic_probs * top_probs[:, stochastic_index].reshape( [-1, 1]) periodic_probs = periodic_probs * top_probs[:, periodic_index].reshape( [-1, 1]) transient_probs = transient_probs * top_probs[:, transient_index].reshape( [-1, 1]) final_probs = np.concatenate( [stochastic_probs, periodic_probs, transient_probs], axis=1) df = pd.DataFrame(data=final_probs, index=samples.index, columns=self.get_list_of_classes()) df.index.name = samples.index.name return df def get_list_of_classes(self) -> list: final_columns = (self.stochastic_classifier.classes_.tolist() + self.periodic_classifier.classes_.tolist() + self.transient_classifier.classes_.tolist()) return final_columns def save_model(self, directory: str) -> None: with open(os.path.join(directory, self.pickles['top_rf']), 'wb') as f: pickle.dump(self.top_classifier, f, pickle.HIGHEST_PROTOCOL) with open(os.path.join(directory, self.pickles['stochastic_rf']), 'wb') as f: pickle.dump(self.stochastic_classifier, f, pickle.HIGHEST_PROTOCOL) with open(os.path.join(directory, self.pickles['periodic_rf']), 'wb') as f: pickle.dump(self.periodic_classifier, f, pickle.HIGHEST_PROTOCOL) with open(os.path.join(directory, self.pickles['transient_rf']), 'wb') as f: pickle.dump(self.transient_classifier, f, pickle.HIGHEST_PROTOCOL) with open(os.path.join(directory, self.pickles['features_list']), 'wb') as f: pickle.dump(self.feature_list, f, pickle.HIGHEST_PROTOCOL) def load_model(self, directory: str) -> None: self.top_classifier = pd.read_pickle( os.path.join(directory, self.pickles['top_rf'])) self.stochastic_classifier = pd.read_pickle( os.path.join(directory, self.pickles['stochastic_rf'])) self.periodic_classifier = pd.read_pickle( os.path.join(directory, self.pickles['periodic_rf'])) self.transient_classifier = pd.read_pickle( os.path.join(directory, self.pickles['transient_rf'])) self.feature_list = pd.read_pickle( os.path.join(directory, self.pickles['features_list'])) def download_model(self): if not os.path.exists(self.MODEL_PICKLE_PATH): os.makedirs(self.MODEL_PICKLE_PATH) for pkl in self.pickles.values(): tmp_path = os.path.join(self.MODEL_PICKLE_PATH, pkl) if not os.path.exists(tmp_path): command = f"wget {self.url_model}{pkl} -O {tmp_path}" wget.download(os.path.join(self.url_model, pkl), tmp_path) def predict_in_pipeline(self, input_features: pd.DataFrame) -> dict: if isinstance(input_features, pd.Series): input_features = input_features.to_frame().transpose() if len(input_features) != 1: raise ValueError( 'predict_in_pipeline receives features one by one') missing = self.check_missing_features(input_features.columns, self.feature_list) if len(missing) > 0: raise Exception(f"Missing features: {missing}") input_features = input_features[self.feature_list] input_features = self.feature_preprocessor.preprocess_features( input_features) prob_root = pd.DataFrame( self.top_classifier.predict_proba(input_features), columns=self.top_classifier.classes_, index=input_features.index) prob_children = [] resp_children = {} child_models = [ self.stochastic_classifier, self.periodic_classifier, self.transient_classifier ] child_names = ['Stochastic', 'Periodic', 'Transient'] for name, model in zip(child_names, child_models): prob_child = pd.DataFrame(model.predict_proba(input_features), columns=model.classes_, index=input_features.index) resp_children[name] = prob_child.iloc[0].to_dict() prob_child = prob_child.mul(prob_root[name].values, axis="rows") 
prob_children.append(prob_child) prob_all = pd.concat(prob_children, axis=1, sort=False) return { "hierarchical": { "top": prob_root.iloc[0].to_dict(), "children": resp_children }, "probabilities": prob_all.iloc[0].to_dict(), "class": prob_all.idxmax(axis=1).iloc[0] }
def Improved_BRF_low(x_train,y_train,x_test,y_test,threshold1_low,threshold2_low,threshold3_low): clf1 = BalancedRandomForestClassifier(max_leaf_nodes=20,\ n_estimators = 60,criterion = 'entropy',min_samples_leaf=20,min_samples_split=50,\ max_depth=7, oob_score = True,random_state=10) clf2 = BalancedRandomForestClassifier(max_leaf_nodes=20,max_features = 10,\ n_estimators = 60,criterion = 'entropy',min_samples_leaf=10,min_samples_split=30,\ max_depth=9, oob_score = True,random_state=10) clf3 = BalancedRandomForestClassifier(max_leaf_nodes=20,max_features = 14,\ n_estimators = 40,criterion = 'entropy',min_samples_leaf=10,min_samples_split=50,\ max_depth=7, oob_score = True,random_state=10) ################################################## Data frist Classifier print('################################################## Data frist Classifier') print('Train Clients %s'%Counter(y_train)) print('Test Clients %s'%Counter(y_test)) clf1.fit(x_train,y_train) with open('BRF_clf1_low.pkl', 'wb') as f: pickle.dump(clf1, f, pickle.HIGHEST_PROTOCOL) y_pred1 = clf1.predict(x_test) y_prob1 = clf1.predict_proba(x_test)[:,1] y_prob1_train = clf1.predict_proba(x_train)[:,1] Plot_Prob_Distribution.Plot_probability(y_test,y_prob1,threshold1_low,threshold1_low) Prediction = np.zeros(y_test.shape) for i in range(len(y_test)): if y_prob1[i] <= threshold1_low: Prediction[i] = -1 else: Prediction[i] = clf1.predict(x_test[i,:].reshape(1, -1)) ################################################## Data second Classifier print('################################################## Data second Classifier') train_choix_bool = (y_prob1_train > threshold1_low) test_choix_bool = (y_prob1 > threshold1_low) print('Train Clients %s'%Counter(y_train[train_choix_bool])) print('Test Clients %s'%Counter(y_test[test_choix_bool])) clf2.fit(x_train[train_choix_bool],y_train[train_choix_bool]) with open('BRF_clf2_low.pkl', 'wb') as f: pickle.dump(clf2, f, pickle.HIGHEST_PROTOCOL) y_prob2 = clf2.predict_proba(x_test[test_choix_bool])[:,1] y_prob2_train = np.zeros(len(x_train)) for i in range(len(x_train)): if (y_prob1_train[i] > threshold1_low): y_prob2_train[i] = clf2.predict_proba(x_train[i,:].reshape(1,-1))[:,1] Plot_Prob_Distribution.Plot_probability(y_test[test_choix_bool],y_prob2,threshold2_low,threshold2_low) y_prob2 = np.zeros(len(x_test)) for i in range(len(x_test)): if (y_prob1[i] > threshold1_low): y_prob2[i] = clf2.predict_proba(x_test[i,:].reshape(1,-1))[:,1] for i in range(len(y_test)): if (y_prob1[i]+y_prob2[i])/2 <= threshold2_low: Prediction[i] = -1 else: Prediction[i] = clf2.predict(x_test[i,:].reshape(1, -1)) ################################################## Data third Classifier print('################################################## Data third Classifier') train_choix_bool = (y_prob1_train>threshold1_low) & (y_prob2_train>threshold2_low) test_choix_bool = (y_prob1>threshold1_low) & (y_prob2>threshold2_low) print('Train Clients %s'%Counter(y_train[train_choix_bool])) print('Test Clients %s'%Counter(y_test[test_choix_bool])) clf3.fit(x_train[train_choix_bool],y_train[train_choix_bool]) with open('BRF_clf3_low.pkl', 'wb') as f: pickle.dump(clf3, f, pickle.HIGHEST_PROTOCOL) y_prob3 = clf3.predict_proba(x_test[test_choix_bool])[:,1] y_prob3_train = np.zeros(len(x_train)) for i in range(len(x_train)): if (y_prob1_train[i]>threshold1_low) & (y_prob2_train[i]>threshold2_low) : y_prob3_train[i] = clf3.predict_proba(x_train[i,:].reshape(1,-1))[:,1] 
Plot_Prob_Distribution.Plot_probability(y_test[test_choix_bool],y_prob3,threshold3_low,threshold3_low) y_prob3 = np.zeros(len(x_test)) for i in range(len(x_test)): if (y_prob1[i]>threshold1_low) & (y_prob2[i]>threshold2_low) : y_prob3[i] = clf3.predict_proba(x_test[i,:].reshape(1,-1))[:,1] ########## Model 1 for i in range(len(y_test)): if y_prob3[i] <= threshold3_low: Prediction[i] = -1 else: Prediction[i] = clf3.predict(x_test[i,:].reshape(1, -1)) ########## Model 2 y_Prob = np.zeros(len(x_test)) for i in range(len(x_test)): if (y_prob1[i]<threshold1_low) : y_Prob[i] = -1 else: if (y_prob1[i]+y_prob2[i])/2 < threshold2_low: y_Prob[i] = -1 else: y_Prob[i] = (y_prob1[i]+y_prob2[i]+y_prob3[i])/3 y_Pred = np.sign(y_Prob-0.5) return y_pred1, y_Pred
def random_boruta(self):
    with open(self.result_folder + '/param_RF_{}.json'.format(self.epoch)) as f:
        dati = json.load(f)
    for data in dati:
        del data['value']
        brfmodel = BalancedRandomForestClassifier(**data)
    cv = StratifiedKFold(n_splits=5, shuffle=True)
    for train_index, test_index in cv.split(self.X, self.y):
        X_train = self.X.iloc[train_index]
        X_test = self.X.iloc[test_index]
        y_train = np.take(self.y, train_index)
        y_test = np.take(self.y, test_index)
        # Fit the median imputer on the training fold only and reuse it for the
        # test fold, the held-out test set and the validation set (avoids
        # leaking their medians into the model).
        median_imputer = SimpleImputer(missing_values=np.nan, strategy='median')
        imputer = median_imputer.fit(X_train)
        vX_train = imputer.transform(X_train)
        vX_test = imputer.transform(X_test)
        X_train = pd.DataFrame(vX_train, columns=X_train.columns, index=X_train.index)
        X_test = pd.DataFrame(vX_test, columns=X_test.columns, index=X_test.index)

        Feature_Selector = BorutaShap(model=brfmodel,
                                      importance_measure='shap',
                                      percentile=85, pvalue=0.08,
                                      classification=True)
        Feature_Selector.fit(X_train, y_train, n_trials=200, random_state=0)
        Feature_Selector.TentativeRoughFix()
        Feature_Selector.plot(X_size=12, figsize=(12, 8), y_scale='log', which_features='all')
        Xstrain = Feature_Selector.Subset()
        selected = [x for x in Xstrain.columns]
        print('features selected', selected)

        v_test_X = imputer.transform(self.X_test)
        test_X = pd.DataFrame(v_test_X, columns=self.X_test.columns, index=self.X_test.index)
        valx = self.X_validation
        valy = self.y_validation
        vvalX = imputer.transform(valx)
        valx = pd.DataFrame(vvalX, columns=valx.columns, index=valx.index)

        print('AUC')
        brfmodel.fit(X_train, y_train)
        roc = roc_auc_score(y_test, brfmodel.predict_proba(X_test)[:, 1])
        print(roc)
        print('AUC Validation')
        roc_test = roc_auc_score(self.y_validation, brfmodel.predict_proba(valx)[:, 1])
        print(roc_test)
        print('AUC with the reduced feature set')
        brfmodel.fit(Xstrain, y_train)
        roc = roc_auc_score(y_test, brfmodel.predict_proba(X_test[selected])[:, 1])
        print(roc)
        roc_test = roc_auc_score(self.y_validation, brfmodel.predict_proba(valx[selected])[:, 1])
        print(roc_test)
weighted_clf.classes_
sample.to_csv('weighted_rfc.csv', index=False)

# balanced rfc
from imblearn.ensemble import BalancedRandomForestClassifier

brfc = BalancedRandomForestClassifier(n_estimators=500, random_state=0).fit(X_tr, y_tr)
roc_auc_score(y_tst, brfc.predict(X_tst))
sample['risk_flag'] = brfc.predict(test_df.drop(columns=['risk_flag']))
sample['risk_flag_proba'] = brfc.predict_proba(test_df.drop(columns=['risk_flag']))[:, 1]
weighted_clf.classes_
print("F1 Score for Balanced Random Forest Classifier is ", f1_score(y_test, brfc.predict(X_test)))
print("Accuracy Score for Balanced Random Forest Classifier is ", accuracy_score(y_test, brfc.predict(X_test)))

# catboost
submission = pd.read_csv('catboost_+_feats_+15000itrs.csv')
sample['risk_flag_cat'] = submission['risk_flag']
class BalancedBinaryClassifier(BaseEstimator, ClassifierMixin): def __init__(self, max_depth=None, n_features=10, selector=ranksum, trend="both", space_mask=None): self.max_depth = max_depth self.n_features = n_features self.selector = selector self.model_ = BalancedRandomForestClassifier(max_depth=max_depth, n_estimators=100, random_state=777) self.trend = trend self.space_mask = space_mask def fit(self, X, y): X, y = check_X_y(X, y) self.mask = self.trend_mask = np.zeros(X.shape[1]) self.classes_ = unique_labels(y) if self.classes_.shape[0] != 2: raise Exception( 'Current implementation only support binary classification') self.importance = self.selector(X, y) flag1 = flag2 = False mean_diff = X[y == 1, :].mean(axis=0) - X[y == 0, :].mean(axis=0) if self.trend == "up": flag1 = True self.trend_mask[mean_diff <= 0] = 1 print("Trend mask: {}/{}".format(int(self.trend_mask.sum()), X.shape[1])) if self.space_mask is not None: flag2 = True self.space_mask = np.array(self.space_mask).astype(int) print("Space mask: {}/{}".format(int(self.space_mask.sum()), X.shape[1])) else: self.space_mask = np.zeros(X.shape[1]) if flag1 or flag2: self.mask = self.trend_mask + self.space_mask self.mask[self.mask > 1] = 1 print("Remained: {}/{}".format(X.shape[1] - int(self.mask.sum()), X.shape[1])) self.importance[self.mask.astype(bool)] = self.importance.min() - 1 if self.trend == "both_balance": n_up = int(self.n_features / 2) n_down = self.n_features - n_up up_importance = copy(self.importance) down_importance = copy(self.importance) up_importance[mean_diff < 0] = up_importance.min() - 1 down_importance[mean_diff > 0] = down_importance.min() - 1 up_order = np.argsort(up_importance)[::-1] down_order = np.argsort(down_importance)[::-1] features = np.array( list(up_order[:n_up]) + list(down_order[:n_down])) print(features) else: order = np.argsort(self.importance)[::-1] features = order[:self.n_features] self.features = features self.model_.fit(X[:, self.features], y) def predict(self, X): check_is_fitted(self) return self.model_.predict(X[:, self.features]) def predict_proba(self, X): check_is_fitted(self) return self.model_.predict_proba(X[:, self.features])
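# Minimal usage sketch for BalancedBinaryClassifier; the selector below is a
# stand-in (absolute difference of class means) because the default `ranksum`
# selector is defined elsewhere in the project, and X_train/y_train/X_test are
# placeholders.
import numpy as np

def mean_diff_selector(X, y):
    # Per-feature score: absolute difference between the two class means.
    return np.abs(X[y == 1].mean(axis=0) - X[y == 0].mean(axis=0))

clf = BalancedBinaryClassifier(n_features=10, selector=mean_diff_selector)
clf.fit(X_train, y_train)
positive_probs = clf.predict_proba(X_test)[:, 1]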
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.31,
                                                    random_state=0, stratify=y)
instance = X_train.sample(n=50000).values  # instance randomized to avoid RAM error

# Scale data (standardize it); keep the transformed arrays, otherwise the
# scaling has no effect on the data actually used below.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

from imblearn.metrics import classification_report_imbalanced

clf = BalancedRandomForestClassifier(max_depth=None, random_state=0,
                                     oob_score=True, n_estimators=100)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
y_pred = pd.DataFrame(y_pred)
print(classification_report_imbalanced(y_test, y_pred, target_names=["0", "1"]))

clf_probs = clf.predict_proba(X_test)[:, 1]
clf_probs0 = clf.predict_proba(X_test)[:, 0]
print(roc_auc_score(y_test, clf_probs, average="weighted"))
print(roc_auc_score(y_test, clf_probs0, average="weighted"))

np.set_printoptions(precision=2)
from sklearn.utils.multiclass import unique_labels
# classes_spr = list(unique_labels(y_test, y_pred))
classes_spr = ["0", "1"]
plot_confusion_matrix(y_test, y_pred, classes=classes_spr, normalize=False,
                      title='confusion matrix for Balanced Random Forest')
plt.show()

from treeinterpreter import treeinterpreter as ti
from collections import defaultdict
import random
names = [train.columns[i] for i in indices] # Barplot: Add bars plt.bar(range(train.shape[1]), importances[indices]) # Add feature names as x-axis labels plt.xticks(range(train.shape[1]), names, rotation=20, fontsize=8) plt.yticks(range(0, 35, 5), fontsize=12) plt.grid(b=None, axis='x') # Create plot title plt.title("Feature Importances") # Show plot plt.show() #Training data prediction train_rf_predictions = model.predict(train) train_rf_probs = model.predict_proba(train)[:, 1] # Testing predictions (to determine performance) rf_predictions = model.predict(test) rf_probs = model.predict_proba(test)[:, 1] #Combine predicted train data odds with team name and year train_1 = pd.concat([train_year, train_team, train], axis=1) train_1.reset_index(drop=True, inplace=True) train_1 = pd.concat([train_1, pd.DataFrame(train_labels)], axis=1) train_1 = train_1.rename(columns={0: 'Champion'}) train_1 = pd.concat([train_1, pd.DataFrame(train_rf_probs)], axis=1) train_1 = train_1.rename(columns={'Year': 'Year', 'Team': 'Team', 0: 'Probs'}) #train_1.sort_values(by=['Probs'],ascending=False)
X_train = X.loc[pos_ids_train+neg_ids_train,:].values y_train = np.array([1]*len(pos_ids_train)+[0]*len(neg_ids_train)) X_test = X.loc[pos_ids_test+neg_ids_test,:].values y_test = np.array([1]*len(pos_ids_test)+[0]*len(neg_ids_test)) print(pos_ids_test[:4]) print(neg_ids_test[:4]) print(X_test[:4,:]) clf2 = BalancedRandomForestClassifier(n_estimators=100,random_state=777) #666 grid = {"max_depth":[2,4,8,16,None]} clf2 = autoTune(clf2,X_train,y_train,grid) #y_pred = clf2.predict_proba(X.values)[:,1] prob = pd.DataFrame(index=sample_ids,columns=["probability","dataset"]) prob.loc[pos_ids_test+neg_ids_test,"probability"]=clf2.predict_proba(X_test)[:,1] prob.loc[pos_ids_train+neg_ids_train,"probability"]=clf2.predict_proba(X_train)[:,1] prob.loc[pos_ids_train+neg_ids_train,"dataset"]="train" prob.loc[pos_ids_test+neg_ids_test,"dataset"]="test" prob.to_csv(args.probability,sep="\t") y_pred = prob.loc[pos_ids_test+neg_ids_test,"probability"] print("ROC-AUC on test set") print("full-training-set\ttest_set\t{}".format(metrics.roc_auc_score(y_test,y_pred)),sep="\t") auroc=metrics.roc_auc_score(y_test,y_pred) with open(args.auroc,"w") as f: f.write(str(auroc)) fpr, tpr, thresholds = metrics.roc_curve(y_test,y_pred) roc_data = {} roc_data["fpr"] = fpr roc_data["tpr"] = tpr roc_data["thresholds"] = thresholds
sampling_strategy='not minority', oob_score=True, n_jobs=4, random_state=42, verbose=1 ) clf.fit(X_train, Y_train) Y_test_pred = clf.predict(X_test) print('\nClassifier performance') print('Out of sample:\n', metrics.classification_report(Y_test, Y_test_pred, zero_division=0)) # This will be the training set Y_in_train = clf.oob_decision_function_.astype('float32') # This will be the test set Y_in_test = clf.predict_proba(X_test).astype('float32') # %% [markdown] ''' ## Architecture design As a baseline, let's use a single-layer bidirectional LSTM. PyTorch uses a sligtly unintuitive array format for the input and output of its LSTM module. The array shape for both input and output is `(seq_length,N,num_labels)`, corresponding to `N` sequences of `seq_length` elements of size `num_labels`. Here, each element is a vector of label probabilities/logits. ''' # %% class LSTM(nn.Module):
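# Shape check for the (seq_length, N, num_labels) convention described in the
# markdown cell above, using PyTorch's default batch_first=False layout; the
# sizes below are arbitrary examples, not values from the notebook.
import torch
import torch.nn as nn

seq_length, N, num_labels = 30, 8, 5
lstm = nn.LSTM(input_size=num_labels, hidden_size=16, bidirectional=True)
out, _ = lstm(torch.randn(seq_length, N, num_labels))
print(out.shape)  # torch.Size([30, 8, 32]): (seq_length, N, 2 * hidden_size)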
def feature_importance(self, n_rows, n_cols, X_train, y_train, X_valid, y_valid): '''Calculate feature importance from Logistic, Random Forest, CatBoost, XGB, LGBM''' # train classifiers lr = LogisticRegression(max_iter=100, random_state=42) lr.fit(X_train, y_train) lr_prob = lr.predict_proba(X_valid) rfc = RandomForestClassifier(n_jobs=2, random_state=42) rfc.fit(X_train, y_train) rfc_prob = rfc.predict_proba(X_valid) brfc = BalancedRandomForestClassifier(random_state=42) brfc.fit(X_train, y_train) brfc_prob = brfc.predict_proba(X_valid) cb = CatBoostClassifier(random_state=42, verbose=False) cb.fit(X_train, y_train) cb_prob = cb.predict_proba(X_valid) xgb = XGBClassifier(random_state=42) xgb.fit(X_train, y_train) xgb_prob = xgb.predict_proba(X_valid) lgbm = LGBMClassifier(random_state=42, n_jobs=-1) lgbm.fit(X_train, y_train) lgbm_prob = lgbm.predict_proba(X_valid) feat_importance_list = [ lr.coef_[0], rfc.feature_importances_, brfc.feature_importances_, cb.feature_importances_, xgb.feature_importances_, lgbm.feature_importances_ ] model_name = [ 'Logistic Regression', 'Random Forest Classifier', 'Balanced Random Forest Classifier', 'CatBoost Classifier', 'XGB Classifier', 'LGBM Classifier' ] # generate feature importance plots fig, ax = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(18, 20)) sns.set(font_scale=1.5) for feature, name, n, ax in zip(feat_importance_list, model_name, list(range(n_rows * n_cols)), ax.flatten()): # get feature importance importance = feature # create dataframe df_imp = pd.DataFrame() # calculate importance of each variable df_imp['importance'] = pd.Series(importance, index=list(X_train.columns)) # transform dataframe long_df = pd.melt(df_imp.T) # plot barplot plt.subplot(n_rows, n_cols, n + 1) sns.barplot(y=long_df.variable, x=long_df.value, order=long_df.sort_values( 'value', ascending=False)['variable'].to_list()) plt.title(f'{name}') # adjusts subplot plt.tight_layout() # displays the plot plt.show()
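# Impurity-based feature_importances_ (plotted above) can favour high-cardinality
# features; a model-agnostic cross-check with permutation importance, assuming
# one of the fitted models from above (here brfc) and the X_valid/y_valid split:
from sklearn.inspection import permutation_importance

perm = permutation_importance(brfc, X_valid, y_valid, n_repeats=10, random_state=42)
top10 = sorted(zip(X_valid.columns, perm.importances_mean), key=lambda t: -t[1])[:10]
print(top10)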
def Clasificar(database, new, path): pd.options.mode.chained_assignment = None if 'Response by Category' in list(database.columns): database = database.drop(['Response by Category','Response by Description'], axis = 1) database = database.sample(frac= 0.4, replace = False) #Chequeo las companias que ya estaban clasificadas #d = new.merge(database, how ='left', left_on='Organization Name', right_on = 'Investee')[['Investee','Category.1','Area of Focus']] #new = new.merge(d, how = "left", left_on = "Organization Name", right_on = "Investee") #new = new.drop(columns=["Investee"]) database["Category.1"] = database["Category.1"].replace("rejected", "Rejected") database["Category.1"] = database["Category.1"].replace("B2C ", "B2C") database["Category.1"] = database["Category.1"].replace("FIntech", "Fintech") database['Prediction'] = np.nan new['Prediction'] = np.nan new = new.drop(['Prediction'], axis=1) #CLASIFICADOR warnings.filterwarnings('ignore') print('Importando bases de datos') new = new.rename(columns = {'Categories':'Category','Organization Name':'Investee'}) train = database[['Operation','Investee', 'Category', 'Description', 'Category.1', 'Area of Focus']].dropna() newdata = new[['Transaction Name','Investee', 'Category', 'Description']] print('Preprocesamiento del texto') stop_words = stopwords.words('english') for column in ['Category','Description']: train[column] = train[column].apply(lambda x: (" ".join(str(x).lower() for x in str(x).split())).encode('utf-8').decode('utf-8')) # lower case train[column] = train[column].str.replace('[^\w\s]', ' ') # removing punctuation train[column] = train[column].apply(lambda x: " ".join(str(x) for x in str(x).split() if x not in stop_words)) # removing stop words newdata[column] = newdata[column].apply(lambda x: (" ".join(x.lower() for x in str(x).split()))) # lower case newdata[column] = newdata[column].str.replace('[^\w\s]', ' ') # removing punctuation newdata[column] = newdata[column].apply(lambda x: " ".join(str(x) for x in str(x).split() if x not in stop_words)) # removing stop words train_src1 = train[['Category','Description','Category.1']] train_src1['Rejected?'] = 0 train_src1.loc[train_src1['Category.1'] != 'Rejected', 'Rejected?'] = 1 new_src1 = newdata[['Category','Description']] #new_src1['Rejected?'] = 0 #new_src1.loc[new_src1['Category.1'] != 'Rejected', 'Rejected?'] = 1 #Binarizacion vectorizer = CountVectorizer() vectorI = pd.DataFrame(vectorizer.fit_transform(train_src1['Category']).toarray()) vectorI_new = pd.DataFrame(vectorizer.transform(new_src1['Category']).toarray()) vectorIdes = pd.DataFrame(vectorizer.fit_transform(train_src1['Description']).toarray()) vectorIdes_new = pd.DataFrame(vectorizer.transform(new_src1['Description']).toarray()) vectorI = pd.concat([vectorI, vectorIdes], axis = 1) vectorI_new = pd.concat([vectorI_new, vectorIdes_new], axis = 1) print('Entrenamiento') #Clasificacion binaria: Rechazadas vs no rechazadas #Resampling + Random Forest brf = BalancedRandomForestClassifier(n_estimators=100, random_state=0) brf.fit(vectorI, train_src1['Rejected?']) y_train_pred = brf.predict(vectorI) print('Confusion matrix: \n' , confusion_matrix(train_src1['Rejected?'], y_train_pred)) print('Accuracy: \n' , accuracy_score(train_src1['Rejected?'], y_train_pred)) print('Recall: \n' , recall_score(train_src1['Rejected?'], y_train_pred)) print('Clasificacion y exportacion') #Ajustando modelo a nuevos datos y_new_predict = brf.predict(vectorI_new) y_new_predict_proba = brf.predict_proba(vectorI_new) newdata['Prediction'] 
= y_new_predict newdata['Prob. of being rejected'] = y_new_predict_proba[:,0] newdata['Prob. of being of interest'] = y_new_predict_proba[:,1] #Creamos archivo Companies y exportamos new = pd.concat([new, newdata[['Prediction','Prob. of being rejected','Prob. of being of interest']]], axis=1, sort=False) return new
def main(): f = open("trainingData/featuresall_train.txt") data = [] floatData = [] label = [] for lineNumber, line in enumerate(f): if lineNumber != 0: entries = line.split('\t') data.append(list(map(float, entries[2:]))) label.append(int(entries[1])) #data.append((list(map(float, entries[2:])), int(entries[1]))) f.close() # f2 = open("testingData/features103_test.txt") # extractTest = [] # extractTestLabels = [] # for lineNumber, line in enumerate(f2): # if lineNumber != 0: # entries = line.split('\t') # extractTest.append(list(map(float, entries[1:]))) # # extractTestLabels.append(float(entries[1])) # testData = np.asarray(extractTest) # # testLabels = np.asarray(extractTestLabels) classLabel = np.array(label) trainData = np.array(data) # ros = RandomOverSampler(random_state=0) # X_resampled, y_resampled = SMOTE().fit_resample(trainData, classLabel) X_train, X_test, y_train, y_test = model_selection.train_test_split( trainData, classLabel, test_size=0.33) # X_train, X_test, y_train, y_test = model_selection.train_test_split(X_resampled, y_resampled, test_size = 0.33) # X_train, y_train = ros.fit_resample(X_train, y_train) X_train, y_train = SMOTE().fit_resample(X_train, y_train) # Gaussian Naive Bayes SUCKS don't use it nb = GaussianNB() nb.fit(X_train, y_train) y_nb_pred = nb.predict_proba(X_test) # print(y_nb_pred) onlyPKpredictions = y_nb_pred[:, 1] # print(onlyPKpredictions) # print("accuracy KAPPA:",metrics.cohen_kappa_score(y_test, y_nb_pred)) fpr, tpr, thresholds = metrics.roc_curve(y_test, onlyPKpredictions) print("accuracy NB AUC:", metrics.auc(fpr, tpr)) # print("confusion:\n",metrics.confusion_matrix(y_test, y_nb_pred)) # We are blessed with DT clf = tree.DecisionTreeClassifier(criterion='entropy', max_depth=5) #, max_depth=20) # clf = tree.DecisionTreeRegressor() clf = clf.fit(X_train, y_train) y_dt_pred = clf.predict_proba(X_test) # print(y_dt_pred[:,0]) onlyPKpredictions = y_dt_pred[:, 1] fpr, tpr, thresholds = metrics.roc_curve(y_test, onlyPKpredictions) print("accuracy DT AUC:", metrics.auc(fpr, tpr)) # print(y_dt_pred) # print("accuracy KAPPA:",metrics.cohen_kappa_score(y_test, y_dt_pred)) # print("accuracy AUC:",metrics.roc_auc_score(y_test, y_dt_pred)) # print("DT:\n",metrics.confusion_matrix(y_test, y_dt_pred)) model = BalancedRandomForestClassifier(n_estimators=100, max_depth=5) model = model.fit(X_train, y_train) y_rfc_pred = model.predict_proba(X_test) onlyPKpredictions = y_rfc_pred[:, 1] fpr, tpr, thresholds = metrics.roc_curve(y_test, onlyPKpredictions) print("accuracy RFC AUC:", metrics.auc(fpr, tpr)) # print(y_rfc_pred) # print("accuracy AUC:",metrics.roc_auc_score(y_test, y_rfc_pred)) f3 = open('results_train103.txt') grab = [] for line in f3: spl = line.split('\t') grab.append(float(spl[1])) nn = np.array(grab) fpr, tpr, thresholds = metrics.roc_curve(classLabel, nn) print("accuracy NN AUC:", metrics.auc(fpr, tpr)) f3.close()