def random_forest(df, drop, target, show, model_name):
    # Split the table into features and outcomes.
    x_cols = [i for i in df.columns if i not in drop]
    X = df[x_cols]
    y = df[target]
    # Split features and outcomes into train and test data.
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    brf = BalancedRandomForestClassifier(n_estimators=100, random_state=0)
    brf.fit(X_train, y_train)
    y_predictions = brf.predict(X_test)
    feature_importance = sorted(
        zip(brf.feature_importances_, X.columns.tolist()))[::-1]
    # Calculate the balanced accuracy score.
    acc_score = balanced_accuracy_score(y_test, y_predictions)
    # Display results.
    if show:
        print(f"Feature Importance: {model_name}")
        for i in feature_importance:
            print(i)
        print("\n")
    return acc_score * 100
def objective(trial):
    train_X, val_X, train_y, val_y = train_test_split(self.X, self.y, test_size=0.2)
    # Fit the imputer on the training split only, then apply it to the
    # validation and held-out data to avoid leakage.
    median_imputer = SimpleImputer(missing_values=np.NaN, strategy='median')
    v_train_X = median_imputer.fit_transform(train_X)
    v_val_X = median_imputer.transform(val_X)
    train_X = pd.DataFrame(v_train_X, columns=train_X.columns, index=train_X.index)
    val_X = pd.DataFrame(v_val_X, columns=val_X.columns, index=val_X.index)
    v_test_X = median_imputer.transform(self.X_validation)
    test_X = pd.DataFrame(v_test_X, columns=self.X_validation.columns,
                          index=self.X_validation.index)

    # Hyperparameter search space.
    list_trees = [250, 500, 1000, 1500, 3000, 3500, 4000]
    brf_n_estimators = trial.suggest_categorical('n_estimators', list_trees)
    brf_max_features = trial.suggest_uniform('max_features', 0.15, 1.0)
    brf_min_samples_split = trial.suggest_int('min_samples_split', 2, 16)
    brf_min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 16)
    brf_min_weight_fraction_leaf = trial.suggest_uniform('min_weight_fraction_leaf', 0, 0.5)
    brf_max_depth = trial.suggest_int('max_depth', 2, 32)

    brfmodel = BalancedRandomForestClassifier(
        n_estimators=brf_n_estimators,
        max_features=brf_max_features,
        min_samples_split=brf_min_samples_split,
        min_samples_leaf=brf_min_samples_leaf,
        max_depth=brf_max_depth,
        min_weight_fraction_leaf=brf_min_weight_fraction_leaf,
        bootstrap=True)
    brfmodel.fit(train_X, train_y)

    aucbrf = roc_auc_score(val_y, brfmodel.predict_proba(val_X)[:, 1])
    aucbrf_test = roc_auc_score(self.y_validation,
                                brfmodel.predict_proba(test_X)[:, 1])
    print('Accuracy test ' +
          str(accuracy_score(self.y_validation, brfmodel.predict(test_X))))
    plt.figure()
    plot_confusion_matrix(brfmodel, test_X, self.y_validation,
                          cmap=plt.cm.Blues, normalize=None)
    plt.show()
    print(aucbrf_test)
    return aucbrf
def run_best_estimator(self, train_x, train_y, test_x, test_y, estimator,
                       params, clf_type, question):
    estimator_scores = {}
    if estimator == 'BalancedRandomForestClassifier':
        clf = BalancedRandomForestClassifier(
            n_estimators=params['n_estimators'],
            sampling_strategy=params['sampling_strategy'],
            random_state=42)
    elif estimator == 'BalancedBaggingClassifier':
        clf = BalancedBaggingClassifier(
            n_estimators=params['n_estimators'],
            bootstrap=params['bootstrap'],
            max_samples=params['max_samples'],
            sampling_strategy=params['sampling_strategy'],
            random_state=42)
    elif estimator == 'EasyEnsembleClassifier':
        clf = EasyEnsembleClassifier(
            n_estimators=params['n_estimators'],
            sampling_strategy=params['sampling_strategy'],
            random_state=42)
    clf.fit(train_x, train_y)
    cross_val_scores = self.calc_cross_val_scores(clf, train_x, train_y,
                                                  clf_type, question)

    predicted_labels = clf.predict(test_x)
    tn, fp, fn, tp = confusion_matrix(test_y, predicted_labels).ravel()
    specificity = round((tn / (tn + fp)) * 100, 2)
    predicted_prob = clf.predict_proba(test_x)
    predicted_prob_true = [p[1] for p in predicted_prob]

    estimator_scores['Question'] = question
    estimator_scores['Accuracy'] = round(
        accuracy_score(test_y, predicted_labels) * 100, 2)
    estimator_scores['Balanced Accuracy'] = round(
        balanced_accuracy_score(test_y, predicted_labels) * 100, 2)
    estimator_scores['Precision'] = round(
        precision_score(test_y, predicted_labels) * 100, 2)
    estimator_scores['Recall'] = round(
        recall_score(test_y, predicted_labels) * 100, 2)
    estimator_scores['Specificity'] = specificity
    estimator_scores['F1'] = round(f1_score(test_y, predicted_labels), 2)
    estimator_scores['ROC AUC'] = round(
        roc_auc_score(test_y, predicted_prob_true), 2)

    # print('Perfect Confusion Matrix for Q-%s is: ' % (str(question).zfill(2)))
    # perfect_labels = train_y
    # print(confusion_matrix(train_y, perfect_labels))

    return cross_val_scores, estimator_scores
def random_forest(X_train, y_train, X_test, y_test, X_train_res, y_train_res):
    # Plain random forest on the imbalanced training data.
    rf = RandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1)
    rf.fit(X_train, y_train.values.ravel())
    y_train_rf = rf.predict(X_test)
    cnf_matrix_tra = confusion_matrix(y_test, y_train_rf)
    without = 100 * cnf_matrix_tra[1, 1] / (cnf_matrix_tra[1, 0] + cnf_matrix_tra[1, 1])
    print("Random Forest (imbalanced): {}%".format(without))
    print(cnf_matrix_tra[0, 0], cnf_matrix_tra[1, 1])

    # Random forest trained on the oversampled data.
    rf_oversampling = RandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1)
    rf_oversampling.fit(X_train_res, y_train_res.ravel())
    y_train_rf = rf_oversampling.predict(X_test)
    cnf_matrix_tra = confusion_matrix(y_test, y_train_rf)
    with_oversampling = 100 * cnf_matrix_tra[1, 1] / (cnf_matrix_tra[1, 0] + cnf_matrix_tra[1, 1])
    print("Random Forest (with oversampling): {}%".format(with_oversampling))
    print(cnf_matrix_tra[0, 0], cnf_matrix_tra[1, 1])

    # Balanced random forest (undersampling within each bootstrap sample).
    brf = BalancedRandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1)
    brf.fit(X_train, y_train.values.ravel())
    y_train_brf = brf.predict(X_test)
    cnf_matrix_tra = confusion_matrix(y_test, y_train_brf)
    within = 100 * cnf_matrix_tra[1, 1] / (cnf_matrix_tra[1, 0] + cnf_matrix_tra[1, 1])
    print("Random Forest (balanced - undersampling): {}%".format(within))
    print(cnf_matrix_tra[0, 0], cnf_matrix_tra[1, 1])
    print(brf.feature_importances_)

    # Feature importance bar chart.
    objects = ('country', 'gender', 'age', 'visiting Wuhan', 'from Wuhan')
    y_pos = np.arange(len(objects))
    performance = brf.feature_importances_ * 100
    plt.bar(y_pos, performance, align='center', alpha=0.5)
    plt.xticks(y_pos, objects)
    plt.ylabel('Importance (%)')
    plt.title('Importance of individual attributes')
    plt.show()

    # Score comparison of the three variants.
    objects = ('Random Forest (imbalanced)', 'Random Forest (oversampling)',
               'Random Forest (balanced)')
    y_pos = np.arange(len(objects))
    performance = [without, with_oversampling, within]
    plt.bar(y_pos, performance, align='center', alpha=0.5)
    plt.xticks(y_pos, objects)
    plt.ylabel('Accuracy (%)')
    plt.title('Random Forest accuracy')
    plt.show()

    return without, within
def balanced_random_forest(train_features, train_labels, test_features,
                           feature_list=None, hfo_type_name=None):
    rf = BalancedRandomForestClassifier(
        random_state=32,
        n_jobs=-1,  # use all available processors
        # class_weight='balanced_subsample'
    )
    rf.fit(train_features, train_labels)
    # Predict over the test set.
    rf_predictions = rf.predict(test_features)
    rf_probs = rf.predict_proba(test_features)[:, 1]
    # IF FEATURE IMPORTANCE FIGS NOT EXISTS
    # print_feature_importances(rf, feature_list)
    # graphics.feature_importances(feature_list, rf.feature_importances_, hfo_type_name)
    return rf_predictions, rf_probs, rf
classification_balanced_RF = BalancedRandomForestClassifier(
    n_estimators=10, n_jobs=1, oob_score=False, random_state=1,
    replacement=False, sampling_strategy='auto', verbose=0, warm_start=False)

# In[266]:

classification_balanced_RF.fit(X_train, Y_train)

# In[267]:

Y_pred_IBRF = classification_balanced_RF.predict(X_test)

# In[268]:

# Balanced accuracy, precision and recall
print(balanced_accuracy_score(Y_test, Y_pred_IBRF),
      average_precision_score(Y_test, Y_pred_IBRF),
      recall_score(Y_test, Y_pred_IBRF))

# In[269]:

# Confusion matrix
matrix_BRF = confusion_matrix(Y_test, Y_pred_IBRF)
matrix_BRF
class Model_Finder:
    """This class is used to find the best model."""

    def __init__(self):
        self.file_object = open("../logs/modeltune/log.txt", 'a+')
        self.saved_best_model_path = '../saved_model/best_model.sav'
        self.logger = App_Logger()
        self.transformed_data = dataTransform()
        self.df = self.transformed_data.trainingData()
        self.data = self.df.iloc[:, :-1]
        self.label = self.df.iloc[:, -1]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.data, self.label, test_size=0.2, random_state=0,
            stratify=self.label)
        self.BRF = BalancedRandomForestClassifier(n_jobs=-1)
        self.EEC = EasyEnsembleClassifier(n_jobs=-1)

    def f2_make(self, y_true, y_pred):
        return fbeta_score(y_true, y_pred, beta=2)

    def get_best_params_for_balanced_random_forest(self, X_train, y_train):
        self.logger.log(
            self.file_object,
            'Entered the get_best_params_for_balanced_random_forest method of the Model_Finder class')
        print('in RF')
        f2 = make_scorer(self.f2_make)
        try:
            # Number of trees in the random forest
            n_estimators = [80, 100, 130, 160]
            criterion = ['gini', 'entropy']
            # Number of features to consider at every split
            max_features = ['log2', 'sqrt']
            # Maximum number of levels in a tree
            max_depth = [5, 8, 10, 15]
            max_depth.append(None)
            # Minimum number of samples required to split a node
            min_samples_split = [2, 5, 8]
            # Minimum number of samples required at each leaf node
            min_samples_leaf = [2, 4]
            # Method of selecting samples for training each tree
            bootstrap = [True, False]
            replacement = [True, False]
            class_weight = ['balanced', None]
            # Create the random grid
            self.param_grid = {
                'brf__n_estimators': n_estimators,
                'brf__criterion': criterion,
                'brf__max_features': max_features,
                'brf__max_depth': max_depth,
                'brf__min_samples_split': min_samples_split,
                'brf__min_samples_leaf': min_samples_leaf,
                'brf__bootstrap': bootstrap,
                'brf__replacement': replacement,
                'brf__class_weight': class_weight
            }
            self.estimators = []
            # self.estimators.append(('standardize', StandardScaler()))
            self.estimators.append(('brf', self.BRF))
            self.pipeline_imlearn = Pipeline(self.estimators)
            self.brf_random = RandomizedSearchCV(
                estimator=self.pipeline_imlearn,
                param_distributions=self.param_grid,
                n_iter=80, cv=5, verbose=0, random_state=42,
                scoring=f2, n_jobs=-1)
            self.brf_random.fit(X_train, y_train)
            self.n_estimators = self.brf_random.best_params_['brf__n_estimators']
            self.criterion = self.brf_random.best_params_['brf__criterion']
            self.max_features = self.brf_random.best_params_['brf__max_features']
            self.max_depth = self.brf_random.best_params_['brf__max_depth']
            self.min_samples_split = self.brf_random.best_params_['brf__min_samples_split']
            self.min_samples_leaf = self.brf_random.best_params_['brf__min_samples_leaf']
            self.bootstrap = self.brf_random.best_params_['brf__bootstrap']
            self.replacement = self.brf_random.best_params_['brf__replacement']
            self.class_weight = self.brf_random.best_params_['brf__class_weight']
            self.brf = BalancedRandomForestClassifier(
                n_estimators=self.n_estimators,
                criterion=self.criterion,
                max_features=self.max_features,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                bootstrap=self.bootstrap,
                replacement=self.replacement,
                class_weight=self.class_weight)
            self.brf.fit(X_train, y_train)
            self.logger.log(
                self.file_object,
                'Balanced Random Forest best params: ' +
                str(self.brf_random.best_params_) + '\t' +
                str(self.brf_random.best_score_) +
                '. Exited the get_best_params_for_balanced_random_forest method of the Model_Finder class')
            print('RF done and exited')
            return self.brf
        except Exception as e:
            self.logger.log(
                self.file_object,
                'Exception occurred in get_best_params_for_balanced_random_forest method of the Model_Finder class. Exception message: ' + str(e))
            self.logger.log(
                self.file_object,
                'Balanced Random Forest parameter tuning failed. Exited the get_best_params_for_balanced_random_forest method of the Model_Finder class')
            raise Exception()

    def get_best_params_for_balanced_adaBoost(self, X_train, y_train):
        self.logger.log(
            self.file_object,
            'Entered the get_best_params_for_balanced_adaBoost method of the Model_Finder class')
        print('enter ada boost')
        f2 = make_scorer(self.f2_make)
        try:
            n_estimators = [10, 15, 20, 25]
            warm_start = [True, False]
            sampling_strategy = ['auto', 'majority']
            replacement = [True, False]
            # Create the random grid
            self.param_grid = {
                'eec__n_estimators': n_estimators,
                'eec__warm_start': warm_start,
                'eec__sampling_strategy': sampling_strategy,
                'eec__replacement': replacement
            }
            self.estimators = []
            # self.estimators.append(('standardize', StandardScaler()))
            self.estimators.append(('eec', self.EEC))
            self.pipeline_imlearn = Pipeline(self.estimators)
            self.eec_random = RandomizedSearchCV(
                estimator=self.pipeline_imlearn,
                param_distributions=self.param_grid,
                n_iter=32, cv=5, verbose=0, random_state=42,
                scoring=f2, n_jobs=-1)
            self.eec_random.fit(X_train, y_train)
            self.n_estimators = self.eec_random.best_params_['eec__n_estimators']
            self.warm_start = self.eec_random.best_params_['eec__warm_start']
            self.sampling_strategy = self.eec_random.best_params_['eec__sampling_strategy']
            self.replacement = self.eec_random.best_params_['eec__replacement']
            self.eec = EasyEnsembleClassifier(
                n_estimators=self.n_estimators,
                warm_start=self.warm_start,
                sampling_strategy=self.sampling_strategy,
                replacement=self.replacement)
            self.eec.fit(X_train, y_train)
            self.logger.log(
                self.file_object,
                'Balanced Ada Boost params: ' +
                str(self.eec_random.best_params_) + '\t' +
                str(self.eec_random.best_score_) +
                '. Exited the get_best_params_for_balanced_adaBoost method of the Model_Finder class')
            print('ada boost done and exited')
            return self.eec
        except Exception as e:
            self.logger.log(
                self.file_object,
                'Exception occurred in get_best_params_for_balanced_adaBoost method of the Model_Finder class. Exception message: ' + str(e))
            self.logger.log(
                self.file_object,
                'Balanced Ada Boost tuning failed. Exited the get_best_params_for_balanced_adaBoost method of the Model_Finder class')
            raise Exception()

    def get_best_model(self, X_train, X_test, y_train, y_test):
        self.logger.log(
            self.file_object,
            'Entered the get_best_model method of the Model_Finder class')
        print('in get best model')
        try:
            self.brf = self.get_best_params_for_balanced_random_forest(X_train, y_train)
            self.y_pred_brf = self.brf.predict(X_test)
            self.brf_f2 = self.f2_make(y_test, self.y_pred_brf)

            self.eec = self.get_best_params_for_balanced_adaBoost(X_train, y_train)
            self.y_pred_eec = self.eec.predict(X_test)
            self.eec_f2 = self.f2_make(y_test, self.y_pred_eec)

            # Compare the two models and persist the better one.
            if self.brf_f2 > self.eec_f2:
                print('best model exited')
                joblib.dump(self.brf, self.saved_best_model_path)
                return 'BalancedRandomForestClassifier', self.brf
            else:
                print('best model exited')
                joblib.dump(self.eec, self.saved_best_model_path)
                return 'EasyEnsembleClassifier', self.eec
        except Exception as e:
            self.logger.log(
                self.file_object,
                'Exception occurred in get_best_model method of the Model_Finder class. Exception message: ' + str(e))
            self.logger.log(
                self.file_object,
                'Model Selection Failed. Exited the get_best_model method of the Model_Finder class')
            raise Exception()
def Improved_BRF_low(x_train, y_train, x_test, y_test,
                     threshold1_low, threshold2_low, threshold3_low):
    clf1 = BalancedRandomForestClassifier(
        max_leaf_nodes=20, n_estimators=60, criterion='entropy',
        min_samples_leaf=20, min_samples_split=50, max_depth=7,
        oob_score=True, random_state=10)
    clf2 = BalancedRandomForestClassifier(
        max_leaf_nodes=20, max_features=10, n_estimators=60,
        criterion='entropy', min_samples_leaf=10, min_samples_split=30,
        max_depth=9, oob_score=True, random_state=10)
    clf3 = BalancedRandomForestClassifier(
        max_leaf_nodes=20, max_features=14, n_estimators=40,
        criterion='entropy', min_samples_leaf=10, min_samples_split=50,
        max_depth=7, oob_score=True, random_state=10)

    # ################################################## first classifier
    print('################################################## Data first Classifier')
    print('Train Clients %s' % Counter(y_train))
    print('Test Clients %s' % Counter(y_test))
    clf1.fit(x_train, y_train)
    with open('BRF_clf1_low.pkl', 'wb') as f:
        pickle.dump(clf1, f, pickle.HIGHEST_PROTOCOL)
    y_pred1 = clf1.predict(x_test)
    y_prob1 = clf1.predict_proba(x_test)[:, 1]
    y_prob1_train = clf1.predict_proba(x_train)[:, 1]
    Plot_Prob_Distribution.Plot_probability(y_test, y_prob1, threshold1_low, threshold1_low)

    Prediction = np.zeros(y_test.shape)
    for i in range(len(y_test)):
        if y_prob1[i] <= threshold1_low:
            Prediction[i] = -1
        else:
            Prediction[i] = clf1.predict(x_test[i, :].reshape(1, -1))

    # ################################################## second classifier
    print('################################################## Data second Classifier')
    train_choix_bool = (y_prob1_train > threshold1_low)
    test_choix_bool = (y_prob1 > threshold1_low)
    print('Train Clients %s' % Counter(y_train[train_choix_bool]))
    print('Test Clients %s' % Counter(y_test[test_choix_bool]))
    clf2.fit(x_train[train_choix_bool], y_train[train_choix_bool])
    with open('BRF_clf2_low.pkl', 'wb') as f:
        pickle.dump(clf2, f, pickle.HIGHEST_PROTOCOL)
    y_prob2 = clf2.predict_proba(x_test[test_choix_bool])[:, 1]
    y_prob2_train = np.zeros(len(x_train))
    for i in range(len(x_train)):
        if y_prob1_train[i] > threshold1_low:
            y_prob2_train[i] = clf2.predict_proba(x_train[i, :].reshape(1, -1))[:, 1]
    Plot_Prob_Distribution.Plot_probability(y_test[test_choix_bool], y_prob2, threshold2_low, threshold2_low)
    y_prob2 = np.zeros(len(x_test))
    for i in range(len(x_test)):
        if y_prob1[i] > threshold1_low:
            y_prob2[i] = clf2.predict_proba(x_test[i, :].reshape(1, -1))[:, 1]
    for i in range(len(y_test)):
        if (y_prob1[i] + y_prob2[i]) / 2 <= threshold2_low:
            Prediction[i] = -1
        else:
            Prediction[i] = clf2.predict(x_test[i, :].reshape(1, -1))

    # ################################################## third classifier
    print('################################################## Data third Classifier')
    train_choix_bool = (y_prob1_train > threshold1_low) & (y_prob2_train > threshold2_low)
    test_choix_bool = (y_prob1 > threshold1_low) & (y_prob2 > threshold2_low)
    print('Train Clients %s' % Counter(y_train[train_choix_bool]))
    print('Test Clients %s' % Counter(y_test[test_choix_bool]))
    clf3.fit(x_train[train_choix_bool], y_train[train_choix_bool])
    with open('BRF_clf3_low.pkl', 'wb') as f:
        pickle.dump(clf3, f, pickle.HIGHEST_PROTOCOL)
    y_prob3 = clf3.predict_proba(x_test[test_choix_bool])[:, 1]
    y_prob3_train = np.zeros(len(x_train))
    for i in range(len(x_train)):
        if (y_prob1_train[i] > threshold1_low) & (y_prob2_train[i] > threshold2_low):
            y_prob3_train[i] = clf3.predict_proba(x_train[i, :].reshape(1, -1))[:, 1]
    Plot_Prob_Distribution.Plot_probability(y_test[test_choix_bool], y_prob3, threshold3_low, threshold3_low)
    y_prob3 = np.zeros(len(x_test))
    for i in range(len(x_test)):
        if (y_prob1[i] > threshold1_low) & (y_prob2[i] > threshold2_low):
            y_prob3[i] = clf3.predict_proba(x_test[i, :].reshape(1, -1))[:, 1]

    # ########## Model 1: cascade decision on the third-stage probability
    for i in range(len(y_test)):
        if y_prob3[i] <= threshold3_low:
            Prediction[i] = -1
        else:
            Prediction[i] = clf3.predict(x_test[i, :].reshape(1, -1))

    # ########## Model 2: averaged probabilities across the cascade
    y_Prob = np.zeros(len(x_test))
    for i in range(len(x_test)):
        if y_prob1[i] < threshold1_low:
            y_Prob[i] = -1
        else:
            if (y_prob1[i] + y_prob2[i]) / 2 < threshold2_low:
                y_Prob[i] = -1
            else:
                y_Prob[i] = (y_prob1[i] + y_prob2[i] + y_prob3[i]) / 3
    y_Pred = np.sign(y_Prob - 0.5)

    return y_pred1, y_Pred
clf = BalancedRandomForestClassifier(n_estimators=2000,
                                     replacement=True,
                                     sampling_strategy='not minority',
                                     oob_score=True,
                                     n_jobs=4,
                                     random_state=42,
                                     verbose=1)
clf.fit(X_train, Y_train)

# %% [markdown]
'''
## Model performance
'''

# %%
Y_train_pred = clf.predict(X_train)
Y_test_pred = clf.predict(X_test)
print('\nClassifier performance')
print('In sample:\n', metrics.classification_report(Y_train, Y_train_pred))
print('Out of sample:\n', metrics.classification_report(Y_test, Y_test_pred))

# %% [markdown]
'''
Overall, the model seems to do well in distinguishing between very inactive
periods ("sit-stand" and "sleep") and very active ones ("bicycling"), but
there seems to be confusion between the remaining activities.

## Plot predicted vs. true activity profiles

Using our utility function, let's plot the activity profile for participant
`006`. Here we also pass the acceleration mean for plotting purposes.
'''
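The notebook's own plotting utility is not shown in this excerpt, so the following is only a minimal sketch of what such a comparison plot could look like. It assumes a time index, arrays of true and predicted labels aligned to that index, and an optional acceleration-mean series; the helper name and all argument names are illustrative, not taken from the original.

import matplotlib.pyplot as plt

def plot_activity_profile(time_index, y_true, y_pred, acc_mean=None):
    """Plot true vs. predicted activity labels over time (hypothetical helper)."""
    labels = sorted(set(y_true) | set(y_pred))
    to_code = {lab: i for i, lab in enumerate(labels)}
    fig, ax = plt.subplots(figsize=(12, 4))
    ax.step(time_index, [to_code[l] for l in y_true], where='post', label='true')
    ax.step(time_index, [to_code[l] for l in y_pred], where='post',
            label='predicted', alpha=0.7)
    if acc_mean is not None:
        # Overlay the acceleration mean on a secondary axis for context.
        ax2 = ax.twinx()
        ax2.plot(time_index, acc_mean, color='grey', lw=0.5)
        ax2.set_ylabel('acceleration mean')
    ax.set_yticks(range(len(labels)))
    ax.set_yticklabels(labels)
    ax.legend(loc='upper right')
    fig.tight_layout()
    plt.show()

A call along the lines of plot_activity_profile(data006.index, Y_true_006, Y_pred_006, acc_mean=data006['acc']) would reproduce the kind of figure the text describes, assuming the participant's frame carries those columns.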
                          random_state=0)
bbc_score = []
brfc_score = []
eec_score = []
rbc_score = []
for train_index, test_index in kf.split(X, y):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    bbc.fit(X_train, y_train)
    brfc.fit(X_train, y_train)
    eec.fit(X_train, y_train)
    rbc.fit(X_train, y_train)
    y_pred_bbc = bbc.predict(X_test)
    y_pred_brfc = brfc.predict(X_test)
    y_pred_eec = eec.predict(X_test)
    y_pred_rbc = rbc.predict(X_test)
    bbc_score.append(balanced_accuracy_score(y_test, y_pred_bbc))
    brfc_score.append(balanced_accuracy_score(y_test, y_pred_brfc))
    eec_score.append(balanced_accuracy_score(y_test, y_pred_eec))
    rbc_score.append(balanced_accuracy_score(y_test, y_pred_rbc))

print("\t Average score:\t\t Standard deviation:")
print("bbc\t", sum(bbc_score) / float(len(bbc_score)), "\t",
      statistics.stdev(bbc_score))
print("brfc\t", sum(brfc_score) / float(len(brfc_score)), "\t",
      statistics.stdev(brfc_score))
print("eec\t",
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.pipeline import Pipeline

df_X_train = pd.read_csv('../data/raw/X_train.csv')
df_y_train = pd.read_csv('../data/raw/y_train.csv')
df_X_test = pd.read_csv('../data/raw/X_test.csv').set_index('id', drop=True)
df_train = pd.merge(df_y_train, df_X_train, on='id').set_index('id', drop=True)
# df_train = pd.concat([df_train.loc[df_train['y'] == 1].sample(n=600),
#                       df_train.loc[df_train['y'] != 1]]).sample(frac=1).reset_index(drop=True)
print(df_train)

X = df_train
Y = X['y'].values
X = X.drop('y', axis=1).values
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20,
                                                    random_state=42)
scorer = make_scorer(accuracy_score)

# Train on the full data set; the held-out split above is not used below.
X_train = X
y_train = Y
X_test = df_X_test.values

scaler = RobustScaler().fit(X_train)
rescaled_X_train = scaler.transform(X_train)
rescaled_X_test = scaler.transform(X_test)

model = BalancedRandomForestClassifier(random_state=42, n_estimators=156)
model.fit(rescaled_X_train, y_train)
y_pred = model.predict(rescaled_X_test)

# Predicted values without an index column
df_y_pred = pd.DataFrame({'id': np.arange(np.size(y_pred)), 'y': y_pred})
df_y_pred.to_csv('../data/processed/y_pred.csv', index=False)
'''
## Train a random forest classifier

*Note: this may take a while*
'''

# %%
clf = BalancedRandomForestClassifier(n_estimators=2000,
                                     replacement=True,
                                     sampling_strategy='not minority',
                                     n_jobs=4,
                                     random_state=42,
                                     verbose=1)
clf.fit(X_train, Y_train)
Y_test_pred = clf.predict(X_test)
print('\nClassifier performance')
print('Out of sample:\n',
      metrics.classification_report(Y_test, Y_test_pred, zero_division=0))

# %% [markdown]
'''
## Robustness to unforeseen scenarios

What if the subjects in the test set wore the device differently from those in
the training set? For example, suppose that all the subjects in the training
set were right-handed, but the test subjects are left-handed. This would more
or less result in the device being rotated.

<img src="wrist_accelerometer.jpg" width="200"/>
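The rotation experiment itself is not shown in this excerpt. The snippet below is a minimal sketch of how such a scenario could be simulated; the axis convention of the device, the raw-window array X_test_raw, and the extract_features helper are all assumptions rather than names from the original notebook.

import numpy as np

def simulate_left_hand_wear(X_raw):
    """Approximate wearing the device on the other wrist by flipping two axes.

    Which axes flip depends on the device orientation convention; negating
    x and y here is only an illustrative choice.
    """
    X_rot = X_raw.copy()
    X_rot[..., 0] *= -1  # flip the x axis
    X_rot[..., 1] *= -1  # flip the y axis
    return X_rot

# Hypothetical usage: re-extract features from the rotated signal and
# re-score the already-trained classifier.
# X_test_rot = np.stack([extract_features(w) for w in simulate_left_hand_wear(X_test_raw)])
# print(metrics.classification_report(Y_test, clf.predict(X_test_rot), zero_division=0))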
class BalancedBinaryClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, max_depth=None, n_features=10, selector=ranksum,
                 trend="both", space_mask=None):
        self.max_depth = max_depth
        self.n_features = n_features
        self.selector = selector
        self.model_ = BalancedRandomForestClassifier(max_depth=max_depth,
                                                     n_estimators=100,
                                                     random_state=777)
        self.trend = trend
        self.space_mask = space_mask

    def fit(self, X, y):
        X, y = check_X_y(X, y)
        self.mask = self.trend_mask = np.zeros(X.shape[1])
        self.classes_ = unique_labels(y)
        if self.classes_.shape[0] != 2:
            raise Exception(
                'Current implementation only support binary classification')
        self.importance = self.selector(X, y)
        flag1 = flag2 = False
        mean_diff = X[y == 1, :].mean(axis=0) - X[y == 0, :].mean(axis=0)
        if self.trend == "up":
            flag1 = True
            self.trend_mask[mean_diff <= 0] = 1
            print("Trend mask: {}/{}".format(int(self.trend_mask.sum()), X.shape[1]))
        if self.space_mask is not None:
            flag2 = True
            self.space_mask = np.array(self.space_mask).astype(int)
            print("Space mask: {}/{}".format(int(self.space_mask.sum()), X.shape[1]))
        else:
            self.space_mask = np.zeros(X.shape[1])
        if flag1 or flag2:
            self.mask = self.trend_mask + self.space_mask
            self.mask[self.mask > 1] = 1
            print("Remained: {}/{}".format(X.shape[1] - int(self.mask.sum()), X.shape[1]))
            self.importance[self.mask.astype(bool)] = self.importance.min() - 1
        if self.trend == "both_balance":
            n_up = int(self.n_features / 2)
            n_down = self.n_features - n_up
            up_importance = copy(self.importance)
            down_importance = copy(self.importance)
            up_importance[mean_diff < 0] = up_importance.min() - 1
            down_importance[mean_diff > 0] = down_importance.min() - 1
            up_order = np.argsort(up_importance)[::-1]
            down_order = np.argsort(down_importance)[::-1]
            features = np.array(list(up_order[:n_up]) + list(down_order[:n_down]))
            print(features)
        else:
            order = np.argsort(self.importance)[::-1]
            features = order[:self.n_features]
        self.features = features
        self.model_.fit(X[:, self.features], y)

    def predict(self, X):
        check_is_fitted(self)
        return self.model_.predict(X[:, self.features])

    def predict_proba(self, X):
        check_is_fitted(self)
        return self.model_.predict_proba(X[:, self.features])
indices = np.argsort(importances)[::-1]
names = [train.columns[i] for i in indices]

# Bar plot: add bars
plt.bar(range(train.shape[1]), importances[indices])
# Add feature names as x-axis labels
plt.xticks(range(train.shape[1]), names, rotation=20, fontsize=8)
plt.yticks(range(0, 35, 5), fontsize=12)
plt.grid(b=None, axis='x')
# Create plot title
plt.title("Feature Importances")
# Show plot
plt.show()

# Training data predictions
train_rf_predictions = model.predict(train)
train_rf_probs = model.predict_proba(train)[:, 1]
# Testing predictions (to determine performance)
rf_predictions = model.predict(test)
rf_probs = model.predict_proba(test)[:, 1]

# Combine predicted train-data probabilities with team name and year
train_1 = pd.concat([train_year, train_team, train], axis=1)
train_1.reset_index(drop=True, inplace=True)
train_1 = pd.concat([train_1, pd.DataFrame(train_labels)], axis=1)
train_1 = train_1.rename(columns={0: 'Champion'})
train_1 = pd.concat([train_1, pd.DataFrame(train_rf_probs)], axis=1)
train_1 = train_1.rename(columns={'Year': 'Year', 'Team': 'Team', 0: 'Probs'})
# train_1.sort_values(by=['Probs'], ascending=False)
[`imbalanced-learn`](https://imbalanced-learn.org/stable/) package, which has
better support for imbalanced datasets.
'''

# %%
clf = BalancedRandomForestClassifier(
    n_estimators=1000,
    replacement=True,
    sampling_strategy='not minority',
    n_jobs=4,
    random_state=42,
)
clf.fit(X_feats, Y)

print('\nClassifier performance in training set')
print(metrics.classification_report(Y, clf.predict(X_feats), zero_division=0))

# %% [markdown]
'''
The in-sample classification is just acceptable. This suggests that we might
need to add more discriminative features. Let's load another subject to test
and get the true (out-of-sample) performance.
'''

# %%
# Load another participant's data
data2 = pd.read_pickle(CAPTURE24_PATH + '077.pkl').dropna()
# Translate annotations
data2['label'] = anno_label_dict.loc[data2['annotation'], 'label:Willetts2018'].values
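The excerpt stops after loading the second participant. A minimal sketch of the remaining out-of-sample step is given below; it assumes the new participant's windows have already been turned into a feature matrix with the same feature extractor used for training (that step is not shown in the original, and the function name here is illustrative only).

from sklearn import metrics

def evaluate_out_of_sample(clf, X_new_feats, y_new):
    """Score an already-fitted classifier on a held-out participant."""
    y_pred = clf.predict(X_new_feats)
    print(metrics.classification_report(y_new, y_pred, zero_division=0))
    return metrics.balanced_accuracy_score(y_new, y_pred)

# Hypothetical usage once data2 has been windowed and featurized:
# evaluate_out_of_sample(clf, X2_feats, Y2)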
def callAI(claim):
    x = pd.read_csv('x.csv')
    y = pd.read_csv('y.csv')

    # random.choice avoids the off-by-one IndexError that
    # seq[random.randint(0, len(seq))] could raise.
    months_as_customer = random.randint(8, 40)
    age = claim['age']
    policy_state = random.choice(['IL', 'IN', 'OH'])
    policy_csl = random.choice(('500/1000', '100/300', '250/500'))
    policy_deductable = 1000 * random.randint(0, 10)
    policy_annual_premium = 500 * random.randint(0, 10)
    umbrella_limit = 10000 * random.randint(0, 3)
    insured_zip = random.randint(111111, 999999)
    insured_sex = claim['insured_sex'].upper()
    edu = ('Masters', 'High School', 'Associate', 'JD', 'College', 'MD', 'PhD')
    insured_education_level = random.choice(edu)
    occupation = ('other-service', 'priv-house-serv', 'adm-clerical',
                  'handlers-cleaners', 'prof-specialty', 'protective-serv',
                  'machine-op-inspct', 'armed-forces', 'sales', 'tech-support',
                  'transport-moving', 'craft-repair', 'farming-fishing',
                  'exec-managerial')
    insured_occupation = random.choice(occupation)
    hobbies = ('camping', 'kayaking', 'golf', 'dancing', 'bungie-jumping',
               'movies', 'basketball', 'exercise', 'sleeping', 'video-games',
               'skydiving', 'paintball', 'hiking', 'base-jumping', 'reading',
               'polo', 'board-games', 'yachting', 'cross-fit', 'chess')
    insured_hobbies = random.choice(hobbies)
    insured_relationship = claim['insured_relationship']
    capital_gains = 500 * random.randint(0, 10)
    capital_loss = 500 * random.randint(0, 10)
    type_of_admission = claim['type_of_admission']
    type_of_visit = claim['type_of_visit']
    incident_severity = claim['incident_severity']
    source_of_admission = claim['source_of_admission']
    h_state = ('WV', 'NY', 'VA', 'PA', 'SC', 'NC', 'OH')
    hospital_state = random.choice(h_state)
    h_city = ('Northbrook', 'Riverwood', 'Northbend', 'Springfield',
              'Hillsdale', 'Columbus', 'Arlington')
    hospital_city = random.choice(h_city)
    service_provider = "Long Island Medical Arts"
    hospitalized_hour_of_the_day = random.randint(0, 25)
    status_when_brought_in = claim['status_when_brought_in']
    survival_status = claim['survival_status']
    duration_of_hospitalization = claim['duration_of_hospitalization']
    medical_staff = claim['medical_staff']
    total_claim_amount = claim['total_claim']
    board_claim = claim['board_claim']
    pharmacy_claim = claim['pharmacy_claim']
    doctor_consultation_claim = claim['doctor_claim']
    rsn = ('GORD', 'Appendectomy', 'Hemorrhoidectomy', 'Kidney', 'Cataract',
           'Delivery', 'Liver', 'Cancer', 'Lungs', 'Brain', 'Prosthetics',
           'Heart', 'Stones', 'ALS')
    reason = random.choice(rsn)
    r_type = ('B123', 'RSX', 'L1', 'J5', 'A12', 'H763', 'H445', 'CR362', 'D2',
              'L14', 'C93', 'TL', 'A3', 'MDX', 'C736', 'J1', 'S9', 'E400',
              'H1', 'P1', 'S2', '92x', 'A1', 'D1', 'X5', 'L72', 'M5', 'S1',
              'A5', 'C633', 'LN142', 'F150', 'C300', 'ML350', 'LN132', 'X6')
    reason_type = random.choice(r_type)
    diagnosed_year = random.randint(2007, 2020)
    hospitalized_month = datetime.now().month
    hospitalized_day = datetime.now().day

    x_test = pd.DataFrame(columns=[
        'months_as_customer', 'age', 'policy_state', 'policy_csl',
        'policy_deductable', 'policy_annual_premium', 'umbrella_limit',
        'insured_zip', 'insured_sex', 'insured_education_level',
        'insured_occupation', 'insured_hobbies', 'insured_relationship',
        'capital-gains', 'capital-loss', 'type_of_admission', 'type_of_visit',
        'incident_severity', 'source_of_admission', 'hospital_state',
        'hospital_city', 'service_provider', 'hospitalized_hour_of_the_day',
        'status_when_brought_in', 'survival_status',
        'duration_of_hospitalization', 'medical_staff', 'total_claim_amount',
        'board_claim', 'pharmacy_claim', 'doctor_consultation_claim', 'reason',
        'reason_type', 'diagnosed_year', 'hospitalized_month',
        'hospitalized_day'
    ], index=['a'])
    x_test.loc['a'] = [
        months_as_customer, age, policy_state, policy_csl, policy_deductable,
        policy_annual_premium, umbrella_limit, insured_zip, insured_sex,
        insured_education_level, insured_occupation, insured_hobbies,
        insured_relationship, capital_gains, capital_loss, type_of_admission,
        type_of_visit, incident_severity, source_of_admission, hospital_state,
        hospital_city, service_provider, hospitalized_hour_of_the_day,
        status_when_brought_in, survival_status, duration_of_hospitalization,
        medical_staff, total_claim_amount, board_claim, pharmacy_claim,
        doctor_consultation_claim, reason, reason_type, diagnosed_year,
        hospitalized_month, hospitalized_day
    ]

    # Encode categorical values with the same lookup tables used in training.
    x_test['reason_type'] = x_test['reason_type'].replace(
        ('B123', 'RSX', 'L1', 'J5', 'A12', 'H763', 'H445', 'CR362', 'D2',
         'L14', 'C93', 'TL', 'A3', 'MDX', 'C736', 'J1', 'S9', 'E400', 'H1',
         'P1', 'S2', '92x', 'A1', 'D1', 'X5', 'L72', 'M5', 'S1', 'A5', 'C633',
         'LN142', 'F150', 'C300', 'ML350', 'LN132', 'X6'),
        (0.95, 0.91, 0.90, 0.88, 0.87, 0.86, 0.85, 0.85, 0.84, 0.83, 0.81,
         0.80, 0.78, 0.77, 0.77, 0.76, 0.75, 0.74, 0.73, 0.72, 0.71, 0.71,
         0.71, 0.71, 0.70, 0.68, 0.67, 0.67, 0.66, 0.64, 0.62, 0.62, 0.61,
         0.60, 0.59, 0.56))
    x_test['reason'] = x_test['reason'].replace(
        ('GORD', 'Appendectomy', 'Hemorrhoidectomy', 'Kidney', 'Cataract',
         'Delivery', 'Liver', 'Cancer', 'Lungs', 'Brain', 'Prosthetics',
         'Heart', 'Stones', 'ALS'),
        (0.84, 0.82, 0.81, 0.80, 0.77, 0.76, 0.75, 0.74, 0.73, 0.72, 0.71,
         0.69, 0.69, 0.66))
    x_test['survival_status'] = x_test['survival_status'].replace(
        ('NO', 'YES'), (0.76, 0.74))
    x_test['hospital_city'] = x_test['hospital_city'].replace(
        ('Northbrook', 'Riverwood', 'Northbend', 'Springfield', 'Hillsdale',
         'Columbus', 'Arlington'),
        (0.78, 0.77, 0.76, 0.75, 0.74, 0.73, 0.71))
    x_test['hospital_state'] = x_test['hospital_state'].replace(
        ('WV', 'NY', 'VA', 'PA', 'SC', 'NC', 'OH'),
        (0.82, 0.77, 0.76, 0.73, 0.70, 0.69, 0.56))
    x_test['source_of_admission'] = x_test['source_of_admission'].replace(
        ('None', 'Self', 'Neighbor', 'Family', 'Ambulance', 'Other'),
        (1.0, 0.93, 0.79, 0.73, 0.70, 0.68))
    x_test['incident_severity'] = x_test['incident_severity'].replace(
        (1, 3, 5, 4), (0.94, 0.89, 0.87, 0.39))
    x_test['type_of_visit'] = x_test['type_of_visit'].replace(
        ('V67', 'V55', 'V73'), (0.78, 0.74, 0.72))
    x_test['type_of_admission'] = x_test['type_of_admission'].replace(
        ('AD3', 'AD6', 'AD8', 'AD1'), (0.91, 0.90, 0.72, 0.70))
    x_test['insured_relationship'] = x_test['insured_relationship'].replace(
        ('husband', 'own-child', 'unmarried', 'not-in-family', 'wife',
         'other-relative'),
        (0.79, 0.78, 0.75, 0.74, 0.72, 0.70))
    x_test['insured_hobbies'] = x_test['insured_hobbies'].replace(
        ('camping', 'kayaking', 'golf', 'dancing', 'bungie-jumping', 'movies',
         'basketball', 'exercise', 'sleeping', 'video-games', 'skydiving',
         'paintball', 'hiking', 'base-jumping', 'reading', 'polo',
         'board-games', 'yachting', 'cross-fit', 'chess'),
        (0.91, 0.90, 0.89, 0.88, 0.84, 0.83, 0.82, 0.81, 0.805, 0.80, 0.78,
         0.77, 0.76, 0.73, 0.73, 0.72, 0.70, 0.69, 0.25, 0.17))
    x_test['insured_occupation'] = x_test['insured_occupation'].replace(
        ('other-service', 'priv-house-serv', 'adm-clerical',
         'handlers-cleaners', 'prof-specialty', 'protective-serv',
         'machine-op-inspct', 'armed-forces', 'sales', 'tech-support',
         'transport-moving', 'craft-repair', 'farming-fishing',
         'exec-managerial'),
        (0.84, 0.84, 0.83, 0.79, 0.78, 0.77, 0.76, 0.75, 0.72, 0.71, 0.705,
         0.70, 0.69, 0.63))
    x_test['insured_education_level'] = x_test['insured_education_level'].replace(
        ('Masters', 'High School', 'Associate', 'JD', 'College', 'MD', 'PhD'),
        (0.78, 0.77, 0.76, 0.74, 0.73, 0.72, 0.71))
    x_test['insured_sex'] = x_test['insured_sex'].replace(
        ('FEMALE', 'MALE'), (0.76, 0.73))
    x_test['policy_csl'] = x_test['policy_csl'].replace(
        ('500/1000', '100/300', '250/500'), (0.78, 0.74, 0.73))
    x_test['policy_state'] = x_test['policy_state'].replace(
        ('IL', 'IN', 'OH'), (0.77, 0.745, 0.74))
    x_test['service_provider'] = x_test['service_provider'].replace(
        ('Long Island Medical Arts', 'Francis W Iacobellis',
         'Lenox Hill Hospital', 'Otis M Jones', 'Ms St Lukes And Roosevelt',
         'Mount Sinai Hospital', 'Nyp-Weill Cornell'),
        (0.778, 0.776, 0.765, 0.757, 0.751, 0.74, 0.71))

    model = BalancedRandomForestClassifier(n_estimators=100, random_state=0)
    model.fit(x, y)
    y_pred_rf = model.predict(x_test)
    return y_pred_rf[0]
def main(data_path_list):
    df_all = get_all_seq(data_path_list, 'COURSE_ACCESS')
    df_all['09_weekday_seq'] = df_all['09_day_list'].apply(get_weekday)
    df_all['09_weekend_seq'] = df_all['09_day_list'].apply(get_weekday)
    df_all['10_weekday_seq'] = df_all['10_day_list'].apply(get_weekday)
    df_all['10_weekend_seq'] = df_all['10_day_list'].apply(get_weekday)
    df_all['11_weekday_seq'] = df_all['11_day_list'].apply(get_weekday)
    df_all['11_weekend_seq'] = df_all['11_day_list'].apply(get_weekday)
    df_all['12_weekday_seq'] = df_all['12_day_list'].apply(get_weekday)
    df_all['12_weekend_seq'] = df_all['12_day_list'].apply(get_weekday)
    df_all['total_weekday_seq'] = df_all['total_list'].apply(get_weekday)
    df_all['total_weekend_seq'] = df_all['total_list'].apply(get_weekday)
    df_all = get_weekday_seq_entropy(df_all, 5)
    df_all = get_weekend_seq_entropy(df_all, 2)
    df_all = add_at_risk_label(df_all)

    n_list = list(df_all.columns)
    pattern = re.compile('.*_entropy_.*')
    entropy_list = ['De-id']
    for i in n_list:
        if pattern.match(i):
            entropy_list.append(i)
    df_all_entropy = df_all[entropy_list]
    df_all_entropy = df_all_entropy.rename(columns={'De-id': 'MASKED_STUDENT_ID'})
    '''
    Up to here: all sequence-entropy features
    '''
    lib_se1 = pd.read_csv('Std_Lib_features_2016_se1.csv')
    his_2015_se1 = pd.read_csv('Std_list_atRist_2015_se1.csv')
    his_2015_se2 = pd.read_csv('Std_list_atRist_2015_se2.csv')
    his_2015_se1.columns = ['MASKED_STUDENT_ID', '2015_se1_CUM_GPA']
    his_2015_se2.columns = ['MASKED_STUDENT_ID', '2015_se2_CUM_GPA']
    his_lib = pd.merge(lib_se1, his_2015_se1, on='MASKED_STUDENT_ID', how='left').fillna(0)
    his_lib = pd.merge(his_lib, his_2015_se2, on='MASKED_STUDENT_ID', how='left').fillna(0)
    df_se1 = pd.merge(df_all_entropy, his_lib, on='MASKED_STUDENT_ID', how='left').fillna(0)
    '''
    Add historical grades for one year
    '''
    # lib_se1 = pd.read_csv('Std_Lib_features_2016_se1.csv')
    # df_se1 = lib_se1

    df = pd.read_csv('DR0008_activity_accumulator_2016_09.csv', sep=' ')
    df['weekday'] = pd.to_datetime(df['timestamp']).dt.dayofweek
    # NOTE: dayofweek 5 is Saturday, so this cut-off also counts Saturday as a weekday.
    df['is_weekday'] = df['weekday'].apply(lambda x: 1 if x <= 5 else 0)
    df_weekday = df[df['is_weekday'] == 1]
    df_weekend = df[df['is_weekday'] == 0]
    PRE_FIX = '09_weekday_'
    df_weekday_one_month = extract_one_month(df_weekday, PRE_FIX)
    PRE_FIX = '09_weekend_'
    df_weekend_one_month = extract_one_month(df_weekend, PRE_FIX)
    df_se1 = pd.merge(df_se1, df_weekday_one_month, on=['MASKED_STUDENT_ID'], how='left').fillna(0)
    df_se1 = pd.merge(df_se1, df_weekend_one_month, on=['MASKED_STUDENT_ID'], how='left').fillna(0)
    del df
    del df_weekday_one_month
    del df_weekend_one_month

    df = pd.read_csv('DR0008_activity_accumulator_2016-10.csv', sep=' ')
    df['weekday'] = pd.to_datetime(df['timestamp']).dt.dayofweek
    df['is_weekday'] = df['weekday'].apply(lambda x: 1 if x <= 5 else 0)
    df_weekday = df[df['is_weekday'] == 1]
    df_weekend = df[df['is_weekday'] == 0]
    PRE_FIX = '10_weekday_'
    df_weekday_one_month = extract_one_month(df_weekday, PRE_FIX)
    PRE_FIX = '10_weekend_'
    df_weekend_one_month = extract_one_month(df_weekend, PRE_FIX)
    df_se1 = pd.merge(df_se1, df_weekday_one_month, on=['MASKED_STUDENT_ID'], how='left').fillna(0)
    df_se1 = pd.merge(df_se1, df_weekend_one_month, on=['MASKED_STUDENT_ID'], how='left').fillna(0)
    del df
    del df_weekday_one_month
    del df_weekend_one_month

    df = pd.read_csv('DR0008_activity_accumulator_2016-11.csv', sep=' ')
    df['weekday'] = pd.to_datetime(df['timestamp']).dt.dayofweek
    df['is_weekday'] = df['weekday'].apply(lambda x: 1 if x <= 5 else 0)
    df_weekday = df[df['is_weekday'] == 1]
    df_weekend = df[df['is_weekday'] == 0]
    PRE_FIX = '11_weekday_'
    df_weekday_one_month = extract_one_month(df_weekday, PRE_FIX)
    PRE_FIX = '11_weekend_'
    df_weekend_one_month = extract_one_month(df_weekend, PRE_FIX)
    df_se1 = pd.merge(df_se1, df_weekday_one_month, on=['MASKED_STUDENT_ID'], how='left').fillna(0)
    df_se1 = pd.merge(df_se1, df_weekend_one_month, on=['MASKED_STUDENT_ID'], how='left').fillna(0)
    del df
    del df_weekday_one_month
    del df_weekend_one_month

    df = pd.read_csv('DR0008_activity_accumulator_2016-12.csv', sep=' ')
    df['weekday'] = pd.to_datetime(df['timestamp']).dt.dayofweek
    df['is_weekday'] = df['weekday'].apply(lambda x: 1 if x <= 5 else 0)
    df_weekday = df[df['is_weekday'] == 1]
    df_weekend = df[df['is_weekday'] == 0]
    PRE_FIX = '12_weekday_'
    df_weekday_one_month = extract_one_month(df_weekday, PRE_FIX)
    PRE_FIX = '12_weekend_'
    df_weekend_one_month = extract_one_month(df_weekend, PRE_FIX)
    df_se1 = pd.merge(df_se1, df_weekday_one_month, on=['MASKED_STUDENT_ID'], how='left').fillna(0)
    df_se1 = pd.merge(df_se1, df_weekend_one_month, on=['MASKED_STUDENT_ID'], how='left').fillna(0)
    del df
    del df_weekday_one_month
    del df_weekend_one_month
    '''
    Up to here: LMS weekly statistical features
    '''
    # Merge features
    df_se1 = pd.merge(df_se1, df_all_entropy, on='MASKED_STUDENT_ID', how='left').fillna(0)
    df_se1_features = df_se1[[
        i for i in df_se1.columns
        if i != 'label_atRist' and i != 'MASKED_STUDENT_ID'
    ]]
    df_se1_labels = df_se1['label_atRist']

    # Classification
    X_train, X_test, y_train, y_test = train_test_split(df_se1_features,
                                                        df_se1_labels,
                                                        test_size=0.2,
                                                        stratify=df_se1_labels)
    brf = BalancedRandomForestClassifier(n_estimators=300, criterion='gini',
                                         random_state=0)
    brf.fit(X_train, y_train)
    y_pred = brf.predict(X_test)
    # imp_feature = brf.feature_importances_
    print(confusion_matrix(y_test, y_pred))
    print(accuracy_score(y_test, y_pred))
    print(balanced_accuracy_score(y_test, y_pred))
    '''
print("_____________________________________ \n Balanced Random Forest") # all features clf_brf_all = BalancedRandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1, max_depth=4, min_samples_split=0.05).fit( X_train, y_train.values.ravel()) print(f"All features results: \n", f"{list(loss_intensity.columns.values)[0]} - All training score is", clf_brf_all.score(X_train, y_train.values.ravel())) print(f"{list(loss_intensity.columns.values)[0]} - All test score is", clf_brf_all.score(X_test, y_test.values.ravel())) y_pred = clf_brf_all.predict(X_test) #select most important ones sel = SelectFromModel(BalancedRandomForestClassifier(n_estimators=1000, random_state=0), max_features=5) sel.fit(X_train, y_train.values.ravel()) selected_feat = X_train.columns[(sel.get_support())] print("\n Balanced Random Forest \n The selected features are", len(selected_feat), selected_feat.values) # transform X_train_selected = sel.transform(X_train) X_test_selected = sel.transform(X_test) # select features clf_brf = BalancedRandomForestClassifier(n_estimators=1000, random_state=0,
# time: 2021/04/16
import scipy.io as scio
from imblearn.ensemble import BalancedRandomForestClassifier
import scipy
from sklearn.metrics import balanced_accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

brf = BalancedRandomForestClassifier(n_estimators=100, random_state=0)

year = 15
while year < 19:
    year_str = str(year)
    year += 1
    train_data = scio.loadmat("/data/file/classification_data/SJ" + year_str +
                              "/trainData.mat")["x_train"]
    train_label = scio.loadmat("/data/file/classification_data/SJ" + year_str +
                               "/trainlabel.mat")["trainlabel"].ravel()
    test_data = scio.loadmat("/data/file/classification_data/SJ" + year_str +
                             "/testData.mat")["x_test"]
    test_label = scio.loadmat("/data/file/classification_data/SJ" + year_str +
                              "/testlabel.mat")["testlabel"].ravel()
    brf.fit(train_data, train_label)
    label_pred = brf.predict(test_data).reshape(-1, 1)
    print(label_pred)
    scipy.io.savemat(
        "/data/file/classification_data/pre/forest_of_random/SJ" + year_str +
        "/label_pred.mat", {'label_pred': label_pred})
###############################################################################
# Classification using random forest classifier with and without sampling
###############################################################################
# Random forest is another popular ensemble method and it usually
# outperforms bagging. Here, we use a vanilla random forest and its balanced
# counterpart, in which each bootstrap sample is balanced.

rf = RandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1)
brf = BalancedRandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1)
rf.fit(X_train, y_train)
brf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
y_pred_brf = brf.predict(X_test)

# As in the previous experiment, the balanced classifier outperforms the
# classifier that learns from imbalanced bootstrap samples. In addition,
# random forest outperforms the bagging classifier.
print('Random Forest classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_rf),
              geometric_mean_score(y_test, y_pred_rf)))
cm_rf = confusion_matrix(y_test, y_pred_rf)
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(cm_rf, classes=np.unique(satimage.target), ax=ax[0],
                      title='Random forest')

print('Balanced Random Forest classifier performance:')
X_train, X_test, y_train, y_test = train_test_split(working_df['lyrics'], y,
                                                    test_size=.40)
vectorizer = TfidfVectorizer(
    stop_words=all_stop_words,
    ngram_range=(1, 3),
    # max_df=.8,
    # min_df=.2,
    max_features=10000)
vectorizer.fit(X_train)
X_train_vec = vectorizer.transform(X_train)
features = vectorizer.get_feature_names()

imbrf = BalancedRandomForestClassifier(n_estimators=5000,
                                       max_features='auto',
                                       sampling_strategy=0.5).fit(X_train_vec,
                                                                  y_train)
X_test_vec = vectorizer.transform(X_test)
y_pred = imbrf.predict(X_test_vec)

# Accumulate feature importances per term across runs.
for score, term in zip(imbrf.feature_importances_, features):
    if term not in aggr_feat_imp_dict:
        aggr_feat_imp_dict[term] = score
    else:
        aggr_feat_imp_dict[term] += score

with open('feat_ranks_dict.json', 'w') as fp:
    json.dump(aggr_feat_imp_dict, fp)
'''
Up to here: LMS weekly statistical features
'''
# Merge features
df_se1 = pd.merge(df_se1, df_all_entropy, on='MASKED_STUDENT_ID',
                  how='left').fillna(0)
df_se1_features = df_se1[[i for i in df_se1.columns
                          if i != 'label_atRist' and i != 'MASKED_STUDENT_ID']]
df_se1_labels = df_se1['label_atRist']

# Classification
X_train, X_test, y_train, y_test = train_test_split(df_se1_features,
                                                    df_se1_labels,
                                                    test_size=0.2,
                                                    stratify=df_se1_labels)
brf = BalancedRandomForestClassifier(n_estimators=300, criterion='gini',
                                     random_state=0)
brf.fit(X_train, y_train)
y_pred = brf.predict(X_test)
imp_feature = brf.feature_importances_
print(confusion_matrix(y_test, y_pred))
print(accuracy_score(y_test, y_pred))
print(balanced_accuracy_score(y_test, y_pred))
'''
2019.5.30
1. Added LMS statistical features (weekday and weekend) and sequence
   statistical features (weekday and weekend).
2. The train/test split now keeps the same label distribution (stratified).
Best acc now: 0.7309003914745542
Next steps:
1. Add historical grade features.
2. Look into how the weekday and weekend data distributions actually differ.
3. Test early prediction using half a semester of behavioral data.
def BalancedRF_classifier(df, y_column, feature_columns, test_rate):
    # Random forest for imbalanced-class classification.
    # Visualizes the confusion matrix and the most important variables.

    # Build the explanatory (X) and target (Y) variables.
    X = df.loc[:, feature_columns].values
    Y = df.loc[:, y_column].values

    # Split into training and validation data.
    (X_train, X_test, Y_train, Y_test) = train_test_split(X, Y,
                                                          test_size=test_rate,
                                                          random_state=123,
                                                          shuffle=True)
    '''
    # Model building via grid search (parameters otherwise default).
    parameters = {
        'n_estimators': [5, 10, 20, 30, 50],
        'max_features': [3, 5, 10, 15, 20],
        'random_state': [0],
        'n_jobs': [2],
        'min_samples_split': [3, 5, 10, 15, 20, 25, 30],
        'max_depth': [3, 5, 10, 15, 20, 25, 30, 50, 100]
    }
    clf = GridSearchCV(RandomForestClassifier(), parameters)
    clf.fit(X_train, Y_train)
    print(clf.best_estimator_)
    '''
    model = BalancedRandomForestClassifier(n_jobs=1, n_estimators=30,
                                           sampling_strategy='not minority')
    print(model.get_params())
    model.fit(X_train, Y_train)

    # Accuracy
    print("Test accuracy: " + str(model.score(X_test, Y_test) * 100) + "%")
    print("Training accuracy: " + str(model.score(X_train, Y_train) * 100) + "%")

    # Check the confusion matrix.
    print("confusion matrix")
    prediction = model.predict(X_test)
    labels = list(set(Y))
    print_cmx(Y_test, prediction, labels)

    # Examine which variables matter: average feature importances over trees.
    avg_i = np.array([e.feature_importances_ for e in model.estimators_]).mean(axis=0)
    importances = pd.DataFrame({
        'variable': feature_columns,
        'importance': avg_i
    }).sort_values('importance', ascending=False).reset_index(drop=True)
    display(importances)

    IMP = importances.copy()
    plt.figure(figsize=(5, 7))
    plt.plot(IMP.importance,
             sorted([i + 1 for i in range(IMP.shape[0])], reverse=True), 'o-')
    plt.yticks(sorted([i + 1 for i in range(IMP.shape[0])], reverse=True),
               IMP.variable)
    plt.xlabel('importance')
    plt.show()

    return model, importances, (X_train, X_test, Y_train, Y_test)
*Note: this may take a while*
'''

# %%
clf = BalancedRandomForestClassifier(
    n_estimators=2000,
    replacement=True,
    sampling_strategy='not minority',
    oob_score=True,
    n_jobs=4,
    random_state=42,
    verbose=1
)
clf.fit(X_train, Y_train)
Y_test_pred = clf.predict(X_test)
print('\nClassifier performance')
print('Out of sample:\n',
      metrics.classification_report(Y_test, Y_test_pred, zero_division=0))

# This will be the training set
Y_in_train = clf.oob_decision_function_.astype('float32')
# This will be the test set
Y_in_test = clf.predict_proba(X_test).astype('float32')

# %% [markdown]
'''
## Architecture design

As a baseline, let's use a single-layer bidirectional LSTM.
PyTorch uses a slightly unintuitive array format for the input and output of
its LSTM module.
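The LSTM itself is not shown in this excerpt. The sketch below is a rough illustration of the single-layer bidirectional baseline the text describes, not the notebook's actual implementation; the hidden size, the class count, and the idea of feeding the random forest's per-window class probabilities as inputs are assumptions.

import torch
import torch.nn as nn

class SequenceSmoother(nn.Module):
    """Single-layer bidirectional LSTM over a sequence of per-window inputs.

    By default nn.LSTM expects input of shape (seq_len, batch, input_size)
    and, with bidirectional=True, returns (seq_len, batch, 2 * hidden_size).
    """

    def __init__(self, input_size, num_classes, hidden_size=128):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=1,
                            bidirectional=True)
        self.fc = nn.Linear(2 * hidden_size, num_classes)

    def forward(self, x):           # x: (seq_len, batch, input_size)
        out, _ = self.lstm(x)       # (seq_len, batch, 2 * hidden_size)
        return self.fc(out)         # per-step class scores

# Shape check with dummy data: 4 sequences of 100 windows, 5 input features.
# model = SequenceSmoother(input_size=5, num_classes=5)
# scores = model(torch.randn(100, 4, 5))   # -> (100, 4, 5)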
sample['risk_flag_weighted_rfc'] = weighted_clf.predict(test_df.drop(columns=['risk_flag']))
sample['risk_flag_proba_weighted_rfc'] = weighted_clf.predict_proba(test_df.drop(columns=['risk_flag']))[:, 1]
weighted_clf.classes_
sample.to_csv('weighted_rfc.csv', index=False)

# Balanced random forest classifier
from imblearn.ensemble import BalancedRandomForestClassifier

brfc = BalancedRandomForestClassifier(n_estimators=500, random_state=0).fit(X_tr, y_tr)
roc_auc_score(y_tst, brfc.predict(X_tst))
sample['risk_flag'] = brfc.predict(test_df.drop(columns=['risk_flag']))
sample['risk_flag_proba'] = brfc.predict_proba(test_df.drop(columns=['risk_flag']))[:, 1]
weighted_clf.classes_
print("F1 Score for Balanced Random Forest Classifier is ",
      f1_score(y_test, brfc.predict(X_test)))
print("Accuracy Score for Balanced Random Forest Classifier is ",
      accuracy_score(y_test, brfc.predict(X_test)))

# catboost
def Clasificar(database, new, path):
    pd.options.mode.chained_assignment = None
    if 'Response by Category' in list(database.columns):
        database = database.drop(['Response by Category',
                                  'Response by Description'], axis=1)
    database = database.sample(frac=0.4, replace=False)

    # Check the companies that were already classified
    # d = new.merge(database, how='left', left_on='Organization Name',
    #               right_on='Investee')[['Investee', 'Category.1', 'Area of Focus']]
    # new = new.merge(d, how="left", left_on="Organization Name", right_on="Investee")
    # new = new.drop(columns=["Investee"])

    database["Category.1"] = database["Category.1"].replace("rejected", "Rejected")
    database["Category.1"] = database["Category.1"].replace("B2C ", "B2C")
    database["Category.1"] = database["Category.1"].replace("FIntech", "Fintech")
    database['Prediction'] = np.nan
    new['Prediction'] = np.nan
    new = new.drop(['Prediction'], axis=1)

    # CLASSIFIER
    warnings.filterwarnings('ignore')
    print('Importing databases')
    new = new.rename(columns={'Categories': 'Category',
                              'Organization Name': 'Investee'})
    train = database[['Operation', 'Investee', 'Category', 'Description',
                      'Category.1', 'Area of Focus']].dropna()
    newdata = new[['Transaction Name', 'Investee', 'Category', 'Description']]

    print('Preprocessing text')
    stop_words = stopwords.words('english')
    for column in ['Category', 'Description']:
        # lower case
        train[column] = train[column].apply(
            lambda x: (" ".join(str(x).lower() for x in str(x).split())).encode('utf-8').decode('utf-8'))
        # remove punctuation
        train[column] = train[column].str.replace('[^\w\s]', ' ')
        # remove stop words
        train[column] = train[column].apply(
            lambda x: " ".join(str(x) for x in str(x).split() if x not in stop_words))
        # lower case
        newdata[column] = newdata[column].apply(
            lambda x: (" ".join(x.lower() for x in str(x).split())))
        # remove punctuation
        newdata[column] = newdata[column].str.replace('[^\w\s]', ' ')
        # remove stop words
        newdata[column] = newdata[column].apply(
            lambda x: " ".join(str(x) for x in str(x).split() if x not in stop_words))

    train_src1 = train[['Category', 'Description', 'Category.1']]
    train_src1['Rejected?'] = 0
    train_src1.loc[train_src1['Category.1'] != 'Rejected', 'Rejected?'] = 1
    new_src1 = newdata[['Category', 'Description']]
    # new_src1['Rejected?'] = 0
    # new_src1.loc[new_src1['Category.1'] != 'Rejected', 'Rejected?'] = 1

    # Bag-of-words vectorization
    vectorizer = CountVectorizer()
    vectorI = pd.DataFrame(vectorizer.fit_transform(train_src1['Category']).toarray())
    vectorI_new = pd.DataFrame(vectorizer.transform(new_src1['Category']).toarray())
    vectorIdes = pd.DataFrame(vectorizer.fit_transform(train_src1['Description']).toarray())
    vectorIdes_new = pd.DataFrame(vectorizer.transform(new_src1['Description']).toarray())
    vectorI = pd.concat([vectorI, vectorIdes], axis=1)
    vectorI_new = pd.concat([vectorI_new, vectorIdes_new], axis=1)

    print('Training')
    # Binary classification: rejected vs. not rejected
    # Resampling + random forest
    brf = BalancedRandomForestClassifier(n_estimators=100, random_state=0)
    brf.fit(vectorI, train_src1['Rejected?'])
    y_train_pred = brf.predict(vectorI)
    print('Confusion matrix: \n', confusion_matrix(train_src1['Rejected?'], y_train_pred))
    print('Accuracy: \n', accuracy_score(train_src1['Rejected?'], y_train_pred))
    print('Recall: \n', recall_score(train_src1['Rejected?'], y_train_pred))

    print('Classifying and exporting')
    # Apply the fitted model to the new data
    y_new_predict = brf.predict(vectorI_new)
    y_new_predict_proba = brf.predict_proba(vectorI_new)
    newdata['Prediction'] = y_new_predict
    newdata['Prob. of being rejected'] = y_new_predict_proba[:, 0]
    newdata['Prob. of being of interest'] = y_new_predict_proba[:, 1]

    # Build the Companies output file and export it
    new = pd.concat([new, newdata[['Prediction', 'Prob. of being rejected',
                                   'Prob. of being of interest']]],
                    axis=1, sort=False)
    return new
X = data[columns]
Y = data['upgrd_customer_class']

newDF = DataFrameImputer().fit_transform(X)
missing = newDF.columns[newDF.isnull().any()]
newDF = newDF.drop([
    'REMOTE_START_PARKING_ASSIST_CD', 'NEAR_FIELD_COMMUNICATION_FLG',
    'TIRE_MOBILE_KIT_FLG', 'PREFERRED_CHANNEL_CD', 'PERSONICX_CATEGORY_CD'
], axis=1)

le = MultiColumnLabelEncoder()
X = le.fit_transform(X.astype(str))
transformer = RobustScaler().fit(X)
X = transformer.transform(X)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.33,
                                                    random_state=42, stratify=Y)

# Balanced Random Forest
brf = BalancedRandomForestClassifier(n_estimators=300, random_state=0)
brf.fit(X_train, y_train)
print(f1_score(y_test, brf.predict(X_test)))