def random_forest(df, drop, target, show, model_name):
    """Fit a balanced random forest on ``df`` and return its balanced accuracy.

    Args:
        df: Table holding both the feature columns and the target column.
        drop: Column names to exclude from the feature set.
        target: Column selector for the outcome passed to ``df[...]``.
        show: When equal to ``True``, the ranked feature importances are
            printed.
        model_name: Label printed in the feature-importance header.

    Returns:
        The balanced accuracy on the held-out split, multiplied by 100.
    """
    # Every column not listed in `drop` is treated as a feature.
    feature_names = [name for name in df.columns if name not in drop]
    features = df[feature_names]
    outcomes = df[target]

    # Hold out a test split (default proportions, fixed seed).
    split = train_test_split(features, outcomes, random_state=1)
    X_train, X_test, y_train, y_test = split

    forest = BalancedRandomForestClassifier(n_estimators=100, random_state=0)
    forest.fit(X_train, y_train)
    predicted = forest.predict(X_test)

    # (importance, column) pairs, most important first.
    ranked = list(zip(forest.feature_importances_, features.columns.tolist()))
    ranked.sort()
    ranked.reverse()

    score = balanced_accuracy_score(y_test, predicted)

    # Explicit comparison kept on purpose: only values equal to True print.
    if show == True:
        print(f"Feature Importance: {model_name}")
        for pair in ranked:
            print(pair)
        print("\n")

    return score * 100
示例#2
0
        def objective(trial):
            """Score one hyper-parameter trial of a balanced random forest.

            A fresh 80/20 train/validation split is drawn from ``self.X`` /
            ``self.y``, missing values are median-imputed, a
            ``BalancedRandomForestClassifier`` is fitted with the parameters
            suggested by ``trial``, and the ROC AUC on the validation split
            is returned. Accuracy, a confusion matrix plot and the ROC AUC on
            ``self.X_validation`` / ``self.y_validation`` are reported as a
            side effect.

            Args:
                trial: Object exposing ``suggest_categorical``,
                    ``suggest_uniform`` and ``suggest_int`` (presumably an
                    Optuna trial -- confirm against the caller).

            Returns:
                ROC AUC of the fitted model on the validation split.
            """
            train_X, val_X, train_y, val_y = train_test_split(self.X,
                                                              self.y,
                                                              test_size=0.2)

            # Learn the medians from the training split only and reuse them
            # for the other sets. Re-fitting the imputer on validation/test
            # data would impute each set with its own statistics, leaking
            # information and making the preprocessing inconsistent.
            median_imputer = SimpleImputer(missing_values=np.nan,
                                           strategy='median')
            v_train_X = median_imputer.fit_transform(train_X)
            v_val_X = median_imputer.transform(val_X)
            v_test_X = median_imputer.transform(self.X_validation)

            # Imputation returns bare arrays; restore column/index labels.
            train_X = pd.DataFrame(v_train_X,
                                   columns=train_X.columns,
                                   index=train_X.index)
            val_X = pd.DataFrame(v_val_X,
                                 columns=val_X.columns,
                                 index=val_X.index)
            test_X = pd.DataFrame(v_test_X,
                                  columns=self.X_validation.columns,
                                  index=self.X_validation.index)

            list_trees = [250, 500, 1000, 1500, 3000, 3500, 4000]

            # Search space for this trial.
            brf_n_estimators = trial.suggest_categorical(
                'n_estimators', list_trees)
            brf_max_features = trial.suggest_uniform('max_features', 0.15, 1.0)
            brf_min_samples_split = trial.suggest_int('min_samples_split', 2,
                                                      16)
            brf_min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 16)
            brf_min_weight_fraction_leaf = trial.suggest_uniform(
                'min_weight_fraction_leaf', 0, 0.5)
            brf_max_depth = trial.suggest_int('max_depth', 2, 32)

            brfmodel = BalancedRandomForestClassifier(
                n_estimators=brf_n_estimators,
                max_features=brf_max_features,
                min_samples_split=brf_min_samples_split,
                min_samples_leaf=brf_min_samples_leaf,
                max_depth=brf_max_depth,
                min_weight_fraction_leaf=brf_min_weight_fraction_leaf,
                bootstrap=True)

            brfmodel.fit(train_X, train_y)

            # The validation AUC is the optimisation target; the figures on
            # the separate validation set are only reported.
            aucbrf = roc_auc_score(val_y, brfmodel.predict_proba(val_X)[:, 1])
            aucbrf_test = roc_auc_score(self.y_validation,
                                        brfmodel.predict_proba(test_X)[:, 1])
            print('Accuracy test ' + str(
                accuracy_score(self.y_validation, brfmodel.predict(test_X))))

            plt.figure()
            plot_confusion_matrix(brfmodel,
                                  test_X,
                                  self.y_validation,
                                  cmap=plt.cm.Blues,
                                  normalize=None)
            plt.show()
            print(aucbrf_test)

            return aucbrf
    def run_best_estimator(self, train_x, train_y, test_x, test_y, estimator,
                           params, clf_type, question):
        """Fit the named ensemble with ``params`` and score it on the test set.

        Args:
            train_x: Training features.
            train_y: Training labels.
            test_x: Test features.
            test_y: Test labels (binary; the confusion matrix is unpacked
                into exactly four cells).
            estimator: One of ``'BalancedRandomForestClassifier'``,
                ``'BalancedBaggingClassifier'`` or
                ``'EasyEnsembleClassifier'``.
            params: Mapping with the constructor arguments read for the
                chosen estimator (``n_estimators``, ``sampling_strategy``
                and, for bagging, ``bootstrap`` and ``max_samples``).
            clf_type: Forwarded to ``self.calc_cross_val_scores``.
            question: Identifier stored in the result and forwarded to
                ``self.calc_cross_val_scores``.

        Returns:
            A ``(cross_val_scores, estimator_scores)`` tuple, where
            ``estimator_scores`` is a dict of test-set metrics.

        Raises:
            ValueError: If ``estimator`` is not one of the supported names.
        """
        if estimator == 'BalancedRandomForestClassifier':
            clf = BalancedRandomForestClassifier(
                n_estimators=params['n_estimators'],
                sampling_strategy=params['sampling_strategy'],
                random_state=42)
        elif estimator == 'BalancedBaggingClassifier':
            clf = BalancedBaggingClassifier(
                n_estimators=params['n_estimators'],
                bootstrap=params['bootstrap'],
                max_samples=params['max_samples'],
                sampling_strategy=params['sampling_strategy'],
                random_state=42)
        elif estimator == 'EasyEnsembleClassifier':
            clf = EasyEnsembleClassifier(
                n_estimators=params['n_estimators'],
                sampling_strategy=params['sampling_strategy'],
                random_state=42)
        else:
            # Fail loudly here instead of hitting an unbound `clf` below.
            raise ValueError(
                'Unsupported estimator: {!r}'.format(estimator))

        clf.fit(train_x, train_y)
        cross_val_scores = self.calc_cross_val_scores(clf, train_x, train_y,
                                                      clf_type, question)

        predicted_labels = clf.predict(test_x)

        # Specificity = TN / (TN + FP), expressed as a percentage.
        tn, fp, fn, tp = confusion_matrix(test_y, predicted_labels).ravel()
        specificity = round((tn / (tn + fp)) * 100, 2)

        # Probability of the second class (column 1) for the ROC AUC.
        predicted_prob_true = clf.predict_proba(test_x)[:, 1]

        # Percent-scaled metrics are multiplied by 100; F1 and ROC AUC are
        # stored on their native 0-1 scale, as in the original output.
        estimator_scores = {
            'Question': question,
            'Accuracy': round(
                accuracy_score(test_y, predicted_labels) * 100, 2),
            'Balanced Accuracy': round(
                balanced_accuracy_score(test_y, predicted_labels) * 100, 2),
            'Precision': round(
                precision_score(test_y, predicted_labels) * 100, 2),
            'Recall': round(
                recall_score(test_y, predicted_labels) * 100, 2),
            'Specificity': specificity,
            'F1': round(f1_score(test_y, predicted_labels), 2),
            'ROC AUC': round(
                roc_auc_score(test_y, predicted_prob_true), 2),
        }

        return cross_val_scores, estimator_scores
def random_forest(X_train, y_train, X_test, y_test, X_train_res, y_train_res):
    """Compare plain, oversampled and balanced random forests on one test set.

    Three forests are trained: a ``RandomForestClassifier`` on the original
    training data, the same classifier on the resampled data
    (``X_train_res`` / ``y_train_res``), and a
    ``BalancedRandomForestClassifier`` on the original training data. For
    each, the percentage ``cm[1, 1] / (cm[1, 0] + cm[1, 1])`` of the test
    confusion matrix (recall of the class in row 1) is printed, followed by
    the two diagonal counts. Two bar charts are shown: the balanced forest's
    feature importances and the three recall percentages.

    Args:
        X_train: Training features.
        y_train: Training labels; must expose ``.values`` (e.g. a pandas
            object).
        X_test: Test features.
        y_test: Test labels.
        X_train_res: Resampled training features.
        y_train_res: Resampled training labels; must expose ``.ravel()``.

    Returns:
        ``(without, within)``: the recall percentage of the plain forest and
        of the balanced forest.
    """

    def _report(model, label):
        # Evaluate `model` on the test set, print its summary, return recall %.
        cm = confusion_matrix(y_test, model.predict(X_test))
        recall_pct = 100 * cm[1, 1] / (cm[1, 0] + cm[1, 1])
        print("Random Forest ({}): {}%".format(label, recall_pct))
        print(cm[0, 0], cm[1, 1])
        return recall_pct

    rf = RandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1)
    rf.fit(X_train, y_train.values.ravel())
    without = _report(rf, "niezbalansowany")

    rf_oversampling = RandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1)
    rf_oversampling.fit(X_train_res, y_train_res.ravel())
    # Previously this line reported the unbalanced model's score by mistake.
    with_oversampling = _report(rf_oversampling, "z oversamplingiem")

    brf = BalancedRandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1)
    brf.fit(X_train, y_train.values.ravel())
    within = _report(brf, "zbalansowany - undersampling")
    print(brf.feature_importances_)

    # Feature-importance chart. The labels are hard-coded, so this assumes
    # the model was trained on exactly these five features, in this order.
    objects = ('country', 'gender', 'age', 'visiting Wuhan', 'from Wuhan')
    y_pos = np.arange(len(objects))
    performance = brf.feature_importances_ * 100
    plt.bar(y_pos, performance, align='center', alpha=0.5)
    plt.xticks(y_pos, objects)
    plt.ylabel('Procent zależności')
    plt.title('Zależność poszczególnych atrybutów')
    plt.show()

    # Side-by-side comparison of the three recall percentages.
    objects = ('Random Forest niezbalansowany', 'Random Forest z oversamplingiem', 'Random Forest zbalansowany')
    y_pos = np.arange(len(objects))
    performance = [without, with_oversampling, within]
    plt.bar(y_pos, performance, align='center', alpha=0.5)
    plt.xticks(y_pos, objects)
    plt.ylabel('Procent dokładności')
    plt.title('Dokładność Random Forest')
    plt.show()

    return without, within
示例#5
0
def balanced_random_forest(train_features,
                           train_labels,
                           test_features,
                           feature_list=None,
                           hfo_type_name=None):
    """Train a balanced random forest and predict on ``test_features``.

    Args:
        train_features: Training feature matrix.
        train_labels: Training labels.
        test_features: Feature matrix to predict on.
        feature_list: Unused by the current body.
        hfo_type_name: Unused by the current body.

    Returns:
        ``(predictions, probabilities, model)``: hard class predictions,
        the probability column at index 1, and the fitted classifier.
    """
    # n_jobs=-1: use all available processors.
    model = BalancedRandomForestClassifier(random_state=32, n_jobs=-1)
    model.fit(train_features, train_labels)

    predictions = model.predict(test_features)
    class_probabilities = model.predict_proba(test_features)
    positive_probabilities = class_probabilities[:, 1]

    return predictions, positive_probabilities, model
示例#6
0
    n_estimators=10,
    n_jobs=1,
    oob_score=False,
    random_state=1,
    replacement=False,
    sampling_strategy='auto',
    verbose=0,
    warm_start=False)

# In[266]:

# Train the classifier configured above on the training split.
classification_balanced_RF.fit(X_train, Y_train)

# In[267]:

# Hard class predictions for the held-out split.
Y_pred_IBRF = classification_balanced_RF.predict(X_test)

# In[268]:

# Balanced accuracy, Precision and Recall
# NOTE(review): the second value comes from average_precision_score fed with
# hard predictions rather than scores/probabilities; if plain precision was
# intended, precision_score is the matching metric -- confirm.

print(balanced_accuracy_score(Y_test, Y_pred_IBRF),
      average_precision_score(Y_test, Y_pred_IBRF),
      recall_score(Y_test, Y_pred_IBRF))

# In[269]:

# Confusion matrix

matrix_BRF = confusion_matrix(Y_test, Y_pred_IBRF)
# Bare expression: only displays the matrix when run as a notebook cell.
matrix_BRF
示例#7
0
class Model_Finder:
    """Tune two imbalanced-learning ensembles and keep the better one.

    A ``BalancedRandomForestClassifier`` and an ``EasyEnsembleClassifier``
    are tuned with ``RandomizedSearchCV`` using an F2 scorer, then compared
    on a test set. The winner is persisted to ``saved_best_model_path``.
    """

    def __init__(self):
        # NOTE(review): this handle is never closed in the visible code.
        self.file_object = open("../logs/modeltune/log.txt", 'a+')
        self.saved_best_model_path = '../saved_model/best_model.sav'
        self.logger = App_Logger()
        self.transformed_data = dataTransform()
        self.df = self.transformed_data.trainingData()
        # The last column is the label; everything before it is a feature.
        self.data = self.df.iloc[:, :-1]
        self.label = self.df.iloc[:, -1]
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.data,
            self.label,
            test_size=0.2,
            random_state=0,
            stratify=self.label)
        self.BRF = BalancedRandomForestClassifier(n_jobs=-1)
        self.EEC = EasyEnsembleClassifier(n_jobs=-1)

    def f2_make(self, y_true, y_pred):
        """Return the F-beta score with ``beta=2`` for the given labels."""
        return fbeta_score(y_true, y_pred, beta=2)

    def get_best_params_for_balanced_random_forest(self, X_train, y_train):
        """Tune and fit a ``BalancedRandomForestClassifier``.

        Runs an 80-iteration, 5-fold randomized search scored by F2, stores
        the best value of every searched parameter on ``self``, and fits a
        new classifier with those values on the full training data.

        Args:
            X_train: Training features.
            y_train: Training labels.

        Returns:
            The fitted ``BalancedRandomForestClassifier``.

        Raises:
            Exception: If the search or the final fit fails. The original
                error is logged and attached as ``__cause__``.
        """
        self.logger.log(
            self.file_object,
            'Entered the get_best_params_for_balanced_random_forest method of the Model_Finder class'
        )

        print('in RF')
        f2 = make_scorer(self.f2_make)
        try:
            # Keys carry the 'brf__' prefix because the classifier is
            # wrapped in a one-step pipeline named 'brf'.
            self.param_grid = {
                # Number of trees in the forest
                'brf__n_estimators': [80, 100, 130, 160],
                'brf__criterion': ['gini', 'entropy'],
                # Number of features to consider at every split
                'brf__max_features': ['log2', 'sqrt'],
                # Maximum number of levels in a tree (None = unlimited)
                'brf__max_depth': [5, 8, 10, 15, None],
                # Minimum number of samples required to split a node
                'brf__min_samples_split': [2, 5, 8],
                # Minimum number of samples required at each leaf node
                'brf__min_samples_leaf': [2, 4],
                # Method of selecting samples for training each tree
                'brf__bootstrap': [True, False],
                'brf__replacement': [True, False],
                'brf__class_weight': ['balanced', None]
            }
            self.estimators = []
            self.estimators.append(('brf', self.BRF))
            self.pipeline_imlearn = Pipeline(self.estimators)
            self.brf_random = RandomizedSearchCV(
                estimator=self.pipeline_imlearn,
                param_distributions=self.param_grid,
                n_iter=80,
                cv=5,
                verbose=0,
                random_state=42,
                scoring=f2,
                n_jobs=-1)
            self.brf_random.fit(X_train, y_train)

            best = self.brf_random.best_params_
            self.n_estimators = best['brf__n_estimators']
            self.criterion = best['brf__criterion']
            self.max_features = best['brf__max_features']
            self.max_depth = best['brf__max_depth']
            self.min_samples_split = best['brf__min_samples_split']
            self.min_samples_leaf = best['brf__min_samples_leaf']
            self.bootstrap = best['brf__bootstrap']
            self.replacement = best['brf__replacement']
            self.class_weight = best['brf__class_weight']

            # Refit a standalone classifier with the winning parameters.
            self.brf = BalancedRandomForestClassifier(
                n_estimators=self.n_estimators,
                criterion=self.criterion,
                max_features=self.max_features,
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split,
                min_samples_leaf=self.min_samples_leaf,
                bootstrap=self.bootstrap,
                replacement=self.replacement,
                class_weight=self.class_weight)
            self.brf.fit(X_train, y_train)
            self.logger.log(
                self.file_object, 'Balanced Random Forest best params: ' +
                str(self.brf_random.best_params_) + '\t' +
                str(self.brf_random.best_score_) +
                '. Exited the get_best_params_for_random_forest method of the Model_Finder class'
            )
            print('RF done and exited')
            return self.brf
        except Exception as e:
            self.logger.log(
                self.file_object,
                'Exception occured in get_best_params_for_balanced_random_forest method of the Model_Finder class. Exception message:  '
                + str(e))
            self.logger.log(
                self.file_object,
                'Balance Random Forest Parameter tuning  failed. Exited the get_best_params_for_balanced_random_forest method of the Model_Finder class'
            )
            # Keep the generic type callers expect, but carry the message
            # and chain the cause so the failure is not lost.
            raise Exception(
                'Balanced Random Forest parameter tuning failed: ' +
                str(e)) from e

    def get_best_params_for_balanced_adaBoost(self, X_train, y_train):
        """Tune and fit an ``EasyEnsembleClassifier``.

        Runs a 32-iteration, 5-fold randomized search scored by F2, stores
        the best value of every searched parameter on ``self``, and fits a
        new classifier with those values on the full training data.

        Args:
            X_train: Training features.
            y_train: Training labels.

        Returns:
            The fitted ``EasyEnsembleClassifier``.

        Raises:
            Exception: If the search or the final fit fails. The original
                error is logged and attached as ``__cause__``.
        """
        self.logger.log(
            self.file_object,
            'Entered the get_best_params_for_balanced_adaBoost method of the Model_Finder class'
        )

        print('enter ada boost')
        f2 = make_scorer(self.f2_make)
        try:
            # Keys carry the 'eec__' prefix because the classifier is
            # wrapped in a one-step pipeline named 'eec'.
            self.param_grid = {
                'eec__n_estimators': [10, 15, 20, 25],
                'eec__warm_start': [True, False],
                'eec__sampling_strategy': ['auto', 'majority'],
                'eec__replacement': [True, False]
            }

            self.estimators = []
            self.estimators.append(('eec', self.EEC))
            self.pipeline_imlearn = Pipeline(self.estimators)
            self.eec_random = RandomizedSearchCV(
                estimator=self.pipeline_imlearn,
                param_distributions=self.param_grid,
                n_iter=32,
                cv=5,
                verbose=0,
                random_state=42,
                scoring=f2,
                n_jobs=-1)
            self.eec_random.fit(X_train, y_train)

            best = self.eec_random.best_params_
            self.n_estimators = best['eec__n_estimators']
            self.warm_start = best['eec__warm_start']
            self.sampling_strategy = best['eec__sampling_strategy']
            self.replacement = best['eec__replacement']

            # Refit a standalone classifier with the winning parameters.
            self.eec = EasyEnsembleClassifier(
                n_estimators=self.n_estimators,
                warm_start=self.warm_start,
                sampling_strategy=self.sampling_strategy,
                replacement=self.replacement)
            self.eec.fit(X_train, y_train)
            self.logger.log(
                self.file_object, 'Balanced Ada Boost params: ' +
                str(self.eec_random.best_params_) + '\t' +
                str(self.eec_random.best_score_) +
                '. Exited the get_best_params_for_AdaBoost method of the Model_Finder class'
            )
            print('aba boost done and exited')
            return self.eec
        except Exception as e:
            self.logger.log(
                self.file_object,
                'Exception occured in get_best_params_for_balanced_adaBoost method of the Model_Finder class. Exception message:  '
                + str(e))
            self.logger.log(
                self.file_object,
                'Balance Ada Boost tuning  failed. Exited the get_best_params_for_balanced_AdaBoost method of the Model_Finder class'
            )
            # Keep the generic type callers expect, but carry the message
            # and chain the cause so the failure is not lost.
            raise Exception(
                'Balanced Ada Boost parameter tuning failed: ' +
                str(e)) from e

    def get_best_model(self, X_train, X_test, y_train, y_test):
        """Tune both ensembles, compare their test F2 and persist the winner.

        Args:
            X_train: Training features.
            X_test: Test features used for the comparison.
            y_train: Training labels.
            y_test: Test labels used for the comparison.

        Returns:
            ``(name, model)`` for the classifier with the higher F2 score.
            On a tie the ``EasyEnsembleClassifier`` is returned.

        Raises:
            Exception: If tuning, prediction or saving fails. The original
                error is logged and attached as ``__cause__``.
        """
        self.logger.log(
            self.file_object,
            'Entered the get_best_model method of the Model_Finder class')

        print('in get best model')
        try:
            self.brf = self.get_best_params_for_balanced_random_forest(
                X_train, y_train)
            self.y_pred_brf = self.brf.predict(X_test)
            self.brf_f2 = self.f2_make(y_test, self.y_pred_brf)

            self.eec = self.get_best_params_for_balanced_adaBoost(
                X_train, y_train)
            self.y_pred_eec = self.eec.predict(X_test)
            self.eec_f2 = self.f2_make(y_test, self.y_pred_eec)

            # Compare the two models; the strict '>' sends ties to EEC.
            if self.brf_f2 > self.eec_f2:
                print('best model exited')
                joblib.dump(self.brf, self.saved_best_model_path)
                return 'BalancedRandomForestClassifier', self.brf
            else:
                print('best model exited')
                joblib.dump(self.eec, self.saved_best_model_path)
                return 'EasyEnsembleClassifier', self.eec

        except Exception as e:
            self.logger.log(
                self.file_object,
                'Exception occured in get_best_model method of the Model_Finder class. Exception message:  '
                + str(e))
            self.logger.log(
                self.file_object,
                'Model Selection Failed. Exited the get_best_model method of the Model_Finder class'
            )
            # Keep the generic type callers expect, but carry the message
            # and chain the cause so the failure is not lost.
            raise Exception('Model selection failed: ' + str(e)) from e
示例#8
0
def Improved_BRF_low(x_train, y_train, x_test, y_test,
                     threshold1_low, threshold2_low, threshold3_low):
    """Run a three-stage cascade of balanced random forests.

    Stage 1 is trained on all training rows. Stage 2 is trained on the
    training rows whose stage-1 probability exceeds ``threshold1_low``.
    Stage 3 is trained on the rows that also exceed ``threshold2_low`` at
    stage 2. Each fitted model is pickled to ``BRF_clf{1,2,3}_low.pkl`` and
    its test probability distribution is plotted via
    ``Plot_Prob_Distribution.Plot_probability``.

    Predictions are made in masked batches rather than one row at a time.
    This avoids assigning size-1 arrays into scalar slots (deprecated by
    NumPy) and one forest call per sample. Intermediate results that were
    never returned or read are no longer computed.

    Args:
        x_train: Training features, indexable by a boolean mask.
        y_train: Training labels, indexable by a boolean mask.
        x_test: Test features, indexable by a boolean mask.
        y_test: Test labels, indexable by a boolean mask.
        threshold1_low: Stage-1 probability cut-off.
        threshold2_low: Stage-2 probability cut-off.
        threshold3_low: Threshold handed to the stage-3 probability plot.

    Returns:
        ``(y_pred1, y_Pred)``: the stage-1 class predictions for
        ``x_test``, and the cascade decision ``sign(score - 0.5)``, where
        the score is -1 for rejected rows and the mean of the three stage
        probabilities otherwise.
    """
    clf1 = BalancedRandomForestClassifier(
        max_leaf_nodes=20,
        n_estimators=60, criterion='entropy', min_samples_leaf=20, min_samples_split=50,
        max_depth=7, oob_score=True, random_state=10)

    clf2 = BalancedRandomForestClassifier(
        max_leaf_nodes=20, max_features=10,
        n_estimators=60, criterion='entropy', min_samples_leaf=10, min_samples_split=30,
        max_depth=9, oob_score=True, random_state=10)

    clf3 = BalancedRandomForestClassifier(
        max_leaf_nodes=20, max_features=14,
        n_estimators=40, criterion='entropy', min_samples_leaf=10, min_samples_split=50,
        max_depth=7, oob_score=True, random_state=10)

    # ---------------------------------------------------------- stage 1
    print('################################################## Data frist Classifier')
    print('Train Clients %s'%Counter(y_train))
    print('Test Clients %s'%Counter(y_test))

    clf1.fit(x_train, y_train)
    with open('BRF_clf1_low.pkl', 'wb') as f:
        pickle.dump(clf1, f, pickle.HIGHEST_PROTOCOL)

    y_pred1 = clf1.predict(x_test)
    y_prob1 = clf1.predict_proba(x_test)[:, 1]
    y_prob1_train = clf1.predict_proba(x_train)[:, 1]

    Plot_Prob_Distribution.Plot_probability(y_test, y_prob1, threshold1_low, threshold1_low)

    # ---------------------------------------------------------- stage 2
    print('################################################## Data second Classifier')

    # Only rows that clear the stage-1 threshold move on.
    train_choix_bool = (y_prob1_train > threshold1_low)
    test_choix_bool = (y_prob1 > threshold1_low)
    print('Train Clients %s'%Counter(y_train[train_choix_bool]))
    print('Test Clients %s'%Counter(y_test[test_choix_bool]))

    clf2.fit(x_train[train_choix_bool], y_train[train_choix_bool])
    with open('BRF_clf2_low.pkl', 'wb') as f:
        pickle.dump(clf2, f, pickle.HIGHEST_PROTOCOL)

    y_prob2_selected = clf2.predict_proba(x_test[test_choix_bool])[:, 1]

    # Full-length stage-2 probabilities; rows filtered out earlier stay 0.
    y_prob2_train = np.zeros(len(x_train))
    y_prob2_train[train_choix_bool] = clf2.predict_proba(
        x_train[train_choix_bool])[:, 1]

    Plot_Prob_Distribution.Plot_probability(
        y_test[test_choix_bool], y_prob2_selected, threshold2_low, threshold2_low)

    y_prob2 = np.zeros(len(x_test))
    y_prob2[test_choix_bool] = y_prob2_selected

    # ---------------------------------------------------------- stage 3
    print('################################################## Data third Classifier')

    train_choix_bool = (y_prob1_train > threshold1_low) & (y_prob2_train > threshold2_low)
    test_choix_bool = (y_prob1 > threshold1_low) & (y_prob2 > threshold2_low)

    print('Train Clients %s'%Counter(y_train[train_choix_bool]))
    print('Test Clients %s'%Counter(y_test[test_choix_bool]))

    clf3.fit(x_train[train_choix_bool], y_train[train_choix_bool])
    with open('BRF_clf3_low.pkl', 'wb') as f:
        pickle.dump(clf3, f, pickle.HIGHEST_PROTOCOL)

    y_prob3_selected = clf3.predict_proba(x_test[test_choix_bool])[:, 1]

    Plot_Prob_Distribution.Plot_probability(
        y_test[test_choix_bool], y_prob3_selected, threshold3_low, threshold3_low)

    y_prob3 = np.zeros(len(x_test))
    y_prob3[test_choix_bool] = y_prob3_selected

    # ---------------------------------------------------- final decision
    # A row is rejected (score -1) when stage 1 is below its threshold or
    # when the mean of stages 1 and 2 is below the stage-2 threshold.
    # Otherwise the score is the mean of all three stage probabilities.
    rejected = (y_prob1 < threshold1_low) | (
        (y_prob1 + y_prob2) / 2 < threshold2_low)
    y_Prob = np.where(rejected, -1.0, (y_prob1 + y_prob2 + y_prob3) / 3)

    y_Pred = np.sign(y_Prob - 0.5)

    return y_pred1, y_Pred
# Balanced random forest: 2000 trees, sampling with replacement from every
# class except the minority one, out-of-bag scoring enabled, 4 parallel jobs.
clf = BalancedRandomForestClassifier(n_estimators=2000,
                                     replacement=True,
                                     sampling_strategy='not minority',
                                     oob_score=True,
                                     n_jobs=4,
                                     random_state=42,
                                     verbose=1)
clf.fit(X_train, Y_train)

# %% [markdown]
'''
## Model performance
'''
# %%

# Predict on both splits so in-sample and out-of-sample reports can be
# compared side by side.
Y_train_pred = clf.predict(X_train)
Y_test_pred = clf.predict(X_test)
print('\nClassifier performance')
print('In sample:\n', metrics.classification_report(Y_train, Y_train_pred))
print('Out of sample:\n', metrics.classification_report(Y_test, Y_test_pred))

# %% [markdown]
'''
Overall, the model seems to do well in distinguishing between very inactive
periods ("sit-stand" and "sleep") and very active ones ("bicycling"), but there
seems to be confusion between the remaining activities.

## Plot predicted vs. true activity profiles

Using our utility function, let's plot the activity profile for participant
`006`. Here we also pass the acceleration mean for plotting purposes.
示例#10
0
                         random_state=0)

# One list of per-fold balanced-accuracy scores for each classifier.
bbc_score = []
brfc_score = []
eec_score = []
rbc_score = []

# Evaluate all four classifiers on the same folds so their scores are
# directly comparable.
for train_index, test_index in kf.split(X, y):
    # Positional indexing: assumes X and y are array-like (e.g. NumPy
    # arrays) -- TODO confirm.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    # Refit every model from scratch on this fold's training part.
    bbc.fit(X_train, y_train)
    brfc.fit(X_train, y_train)
    eec.fit(X_train, y_train)
    rbc.fit(X_train, y_train)
    y_pred_bbc = bbc.predict(X_test)
    y_pred_brfc = brfc.predict(X_test)
    y_pred_eec = eec.predict(X_test)
    y_pred_rbc = rbc.predict(X_test)
    bbc_score.append(balanced_accuracy_score(y_test, y_pred_bbc))
    brfc_score.append(balanced_accuracy_score(y_test, y_pred_brfc))
    eec_score.append(balanced_accuracy_score(y_test, y_pred_eec))
    rbc_score.append(balanced_accuracy_score(y_test, y_pred_rbc))

# Summary table: mean and sample standard deviation across folds.
print("\t Average score:\t\t Standard deviation:")
print("bbc\t",
      sum(bbc_score) / float(len(bbc_score)), "\t",
      statistics.stdev(bbc_score))
print("brfc\t",
      sum(brfc_score) / float(len(brfc_score)), "\t",
      statistics.stdev(brfc_score))
print("eec\t",
示例#11
0
from sklearn.model_selection import KFold
from sklearn.metrics import make_scorer, accuracy_score
from sklearn.pipeline import Pipeline

# Load the raw training features/labels and the unlabeled test features.
df_X_train = pd.read_csv('../data/raw/X_train.csv')
df_y_train = pd.read_csv('../data/raw/y_train.csv')
df_X_test = pd.read_csv('../data/raw/X_test.csv').set_index('id', drop=True)

# Join labels and features on 'id', then use 'id' as the index.
df_train = pd.merge(df_y_train, df_X_train, on='id').set_index('id', drop=True)
#df_train = pd.concat([df_train.loc[df_train['y'] == 1].sample(n=600), df_train.loc[df_train['y'] != 1]]).sample(frac=1).reset_index(drop=True)
print(df_train)
# Separate the target column 'y' from the features, as NumPy arrays.
X = df_train
Y = X['y'].values
X = X.drop('y', axis=1).values
# NOTE(review): X_train and X_test from this hold-out split are overwritten
# just below, and Y_train / Y_test are not used again in this fragment.
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.20,
                                                    random_state=42)
scorer = make_scorer(accuracy_score)
# Train on the full labeled data and predict on the unlabeled test file.
X_train = X
y_train = Y
X_test = df_X_test.values

# Fit the scaler on the training data only, then apply it to both sets.
scaler = RobustScaler().fit(X_train)
rescaled_X_train = scaler.transform(X_train)
rescaled_X_test = scaler.transform(X_test)
model = BalancedRandomForestClassifier(random_state=42, n_estimators=156)
model.fit(rescaled_X_train, y_train)
y_pred = model.predict(rescaled_X_test)  #predicted values without index column
# NOTE(review): 'id' is generated as 0..n-1 instead of being taken from
# df_X_test.index -- presumably the test ids are exactly that range; verify.
df_y_pred = pd.DataFrame({'id': np.arange(np.size(y_pred)), 'y': y_pred})
df_y_pred.to_csv('../data/processed/y_pred.csv', index=False)
示例#12
0
# Classification using random forest classifier with and without sampling
###############################################################################
# Random forest is another popular ensemble method and it usually
# outperforms bagging. Here, we use a vanilla random forest and its balanced
# counterpart, in which each bootstrap sample is balanced.

# Both forests share the same size and seed so that only the balancing
# differs between them.
rf = RandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1)
brf = BalancedRandomForestClassifier(n_estimators=50,
                                     random_state=0,
                                     n_jobs=-1)

rf.fit(X_train, y_train)
brf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
y_pred_brf = brf.predict(X_test)

# Similarly to the previous experiment, the balanced classifier outperforms
# the classifier which learns from imbalanced bootstrap samples. In addition,
# random forest outperforms the bagging classifier.

print('Random Forest classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'.format(
    balanced_accuracy_score(y_test, y_pred_rf),
    geometric_mean_score(y_test, y_pred_rf)))
cm_rf = confusion_matrix(y_test, y_pred_rf)
# Two side-by-side axes; the left one receives the plain forest's matrix.
fig, ax = plt.subplots(ncols=2)
# NOTE(review): called with a precomputed matrix plus `classes`/`title`, so
# this is presumably a helper defined elsewhere in the script -- confirm.
plot_confusion_matrix(cm_rf,
                      classes=np.unique(satimage.target),
                      ax=ax[0],
                      title='Random forest')
示例#13
0
''' 
## Train a random forest classifier

*Note: this may take a while*
'''

# %%
# Balanced random forest: 2000 trees, sampling with replacement from every
# class except the minority one, 4 parallel jobs.
clf = BalancedRandomForestClassifier(n_estimators=2000,
                                     replacement=True,
                                     sampling_strategy='not minority',
                                     n_jobs=4,
                                     random_state=42,
                                     verbose=1)
clf.fit(X_train, Y_train)

# Out-of-sample report; zero_division=0 scores classes that were never
# predicted as 0 instead of emitting a warning.
Y_test_pred = clf.predict(X_test)
print('\nClassifier performance')
print('Out of sample:\n',
      metrics.classification_report(Y_test, Y_test_pred, zero_division=0))

# %% [markdown]
'''
## Robustness to unforeseen scenarios

What if the subjects in the test set wore the device differently from
those in the training set? For example, suppose that all the subjects in the
training set were right-handed, but the test subjects are left-handed.
This would more or less result in the device being rotated.

<img src="wrist_accelerometer.jpg" width="200"/>
class BalancedBinaryClassifier(BaseEstimator, ClassifierMixin):
    """Binary classifier: score features, keep the best, fit a balanced forest.

    ``fit`` scores every feature with ``selector(X, y)``, optionally pushes
    masked features to the bottom of the ranking, keeps ``n_features`` of
    them and trains a ``BalancedRandomForestClassifier`` on those columns
    only. ``predict`` / ``predict_proba`` apply the same column selection.

    Parameters
    ----------
    max_depth : passed to the underlying forest.
    n_features : number of feature columns to keep.
    selector : callable ``selector(X, y)`` returning one score per column;
        higher scores are ranked first.
    trend : ``"up"`` masks columns whose class-1 mean minus class-0 mean is
        not positive; ``"both_balance"`` takes half of the columns from the
        positive-difference side and the rest from the negative side; any
        other value simply takes the top-scoring columns.
    space_mask : optional array-like with one entry per column; non-zero
        entries mark columns that must not be selected.

    Notes
    -----
    The class-mean difference is computed from rows with ``y == 1`` and
    ``y == 0``, so labels are expected to be encoded as 0/1.
    ``fit`` overwrites ``self.space_mask`` with an array (kept as in the
    original implementation for backward compatibility).
    """

    def __init__(self,
                 max_depth=None,
                 n_features=10,
                 selector=ranksum,
                 trend="both",
                 space_mask=None):
        self.max_depth = max_depth
        self.n_features = n_features
        self.selector = selector
        self.model_ = BalancedRandomForestClassifier(max_depth=max_depth,
                                                     n_estimators=100,
                                                     random_state=777)
        self.trend = trend
        self.space_mask = space_mask

    def fit(self, X, y):
        """Select feature columns and fit the forest on them.

        Returns ``self`` so the estimator can be chained and used by
        scikit-learn utilities.

        Raises
        ------
        ValueError
            If ``y`` does not contain exactly two classes.
        """
        X, y = check_X_y(X, y)
        n_columns = X.shape[1]
        # Two separate arrays: a chained assignment would make both names
        # refer to the same buffer.
        self.mask = np.zeros(n_columns)
        self.trend_mask = np.zeros(n_columns)
        self.classes_ = unique_labels(y)
        if self.classes_.shape[0] != 2:
            raise ValueError(
                'Current implementation only support binary classification')
        self.importance = self.selector(X, y)
        mean_diff = X[y == 1, :].mean(axis=0) - X[y == 0, :].mean(axis=0)

        use_trend_mask = self.trend == "up"
        if use_trend_mask:
            # Exclude columns that do not increase from class 0 to class 1.
            self.trend_mask[mean_diff <= 0] = 1
            print("Trend mask: {}/{}".format(int(self.trend_mask.sum()),
                                             X.shape[1]))

        use_space_mask = self.space_mask is not None
        if use_space_mask:
            self.space_mask = np.array(self.space_mask).astype(int)
            print("Space mask: {}/{}".format(int(self.space_mask.sum()),
                                             X.shape[1]))
        else:
            self.space_mask = np.zeros(n_columns)

        if use_trend_mask or use_space_mask:
            # Union of both masks, clipped back to 0/1.
            self.mask = self.trend_mask + self.space_mask
            self.mask[self.mask > 1] = 1
            print("Remained: {}/{}".format(X.shape[1] - int(self.mask.sum()),
                                           X.shape[1]))
            # Masked columns get a score below every real score so they
            # sort last.
            self.importance[self.mask.astype(bool)] = self.importance.min() - 1

        if self.trend == "both_balance":
            n_up = int(self.n_features / 2)
            n_down = self.n_features - n_up
            up_importance = copy(self.importance)
            down_importance = copy(self.importance)
            # Demote the columns that move in the opposite direction.
            up_importance[mean_diff < 0] = up_importance.min() - 1
            down_importance[mean_diff > 0] = down_importance.min() - 1
            up_order = np.argsort(up_importance)[::-1]
            down_order = np.argsort(down_importance)[::-1]
            features = np.array(
                list(up_order[:n_up]) + list(down_order[:n_down]))
            print(features)
        else:
            order = np.argsort(self.importance)[::-1]
            features = order[:self.n_features]
        self.features = features
        self.model_.fit(X[:, self.features], y)
        return self

    def predict(self, X):
        """Predict class labels using only the columns chosen by ``fit``."""
        # ``model_`` exists from construction, so fittedness is checked on
        # ``features``, which only ``fit`` sets.
        check_is_fitted(self, "features")
        return self.model_.predict(X[:, self.features])

    def predict_proba(self, X):
        """Predict class probabilities using the columns chosen by ``fit``."""
        check_is_fitted(self, "features")
        return self.model_.predict_proba(X[:, self.features])
示例#15
0
# Plot feature importances in descending order, then score the train and test
# sets and assemble a table of per-row training probabilities.
# `importances`, `model`, `train`, `test`, `train_year`, `train_team` and
# `train_labels` are defined before this fragment.

# Column positions sorted from most to least important, and their names.
indices = np.argsort(importances)[::-1]
names = [train.columns[i] for i in indices]

# Barplot: Add bars
plt.bar(range(train.shape[1]), importances[indices])
# Add feature names as x-axis labels
plt.xticks(range(train.shape[1]), names, rotation=20, fontsize=8)
# NOTE(review): ticks run 0..30 in steps of 5, which presumes importances are
# on a percentage-like scale — confirm against how `importances` is built.
plt.yticks(range(0, 35, 5), fontsize=12)
# NOTE(review): newer Matplotlib releases name this keyword `visible` instead
# of `b` — confirm against the installed version.
plt.grid(b=None, axis='x')
# Create plot title
plt.title("Feature Importances")
# Show plot
plt.show()

# Training data predictions: hard labels and the probability of the class in
# column 1 of predict_proba.
train_rf_predictions = model.predict(train)
train_rf_probs = model.predict_proba(train)[:, 1]

# Testing predictions (to determine performance)
rf_predictions = model.predict(test)
rf_probs = model.predict_proba(test)[:, 1]

# Combine predicted train data odds with team name and year.
# The index is reset before the later concats, which add freshly built frames
# (default integer index) column-wise; each new frame has a single column
# named 0, which is renamed right after it is appended.
train_1 = pd.concat([train_year, train_team, train], axis=1)
train_1.reset_index(drop=True, inplace=True)
train_1 = pd.concat([train_1, pd.DataFrame(train_labels)], axis=1)
train_1 = train_1.rename(columns={0: 'Champion'})
train_1 = pd.concat([train_1, pd.DataFrame(train_rf_probs)], axis=1)
train_1 = train_1.rename(columns={'Year': 'Year', 'Team': 'Team', 0: 'Probs'})

#train_1.sort_values(by=['Probs'],ascending=False)
[`imbalanced-learn`](https://imbalanced-learn.org/stable/) package, which has
better support for imbalanced datasets.
'''

# %%
# Balanced forest trained on the hand-crafted features of the whole set.
brf_settings = {
    'n_estimators': 1000,
    'replacement': True,
    'sampling_strategy': 'not minority',
    'n_jobs': 4,
    'random_state': 42,
}
clf = BalancedRandomForestClassifier(**brf_settings)
clf.fit(X_feats, Y)

# Report how well the model reproduces the labels it was trained on.
print('\nClassifier performance in training set')
Y_insample_pred = clf.predict(X_feats)
insample_report = metrics.classification_report(Y,
                                                Y_insample_pred,
                                                zero_division=0)
print(insample_report)

# %% [markdown]
'''
The classification in-sample is just acceptable. This suggests
that we might need to add more discriminative features. Let's load another
subject to test and get the true (out-of-sample) performance.
'''

# %%

# Load another participant's data and drop rows containing missing values.
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
data2 = pd.read_pickle(CAPTURE24_PATH + '077.pkl').dropna()
# Translate annotations: look up each row's raw annotation in
# `anno_label_dict` and store the value of its 'label:Willetts2018' column.
data2['label'] = anno_label_dict.loc[data2['annotation'],
                                     'label:Willetts2018'].values
示例#17
0
def callAI(claim):
    """Score one claim with a freshly trained balanced random forest.

    The training data is read from ``x.csv`` / ``y.csv`` in the current
    working directory on every call. A single-row feature frame is built
    from the fields of *claim* plus randomly generated placeholder values
    for the attributes *claim* does not carry, categorical values are
    replaced by fixed numeric scores, and the model's prediction for that
    row is returned.

    Args:
        claim: Mapping providing the keys ``age``, ``insured_sex``,
            ``insured_relationship``, ``type_of_admission``,
            ``type_of_visit``, ``incident_severity``,
            ``source_of_admission``, ``status_when_brought_in``,
            ``survival_status``, ``duration_of_hospitalization``,
            ``medical_staff``, ``total_claim``, ``board_claim``,
            ``pharmacy_claim`` and ``doctor_claim``.

    Returns:
        The first (and only) element of the model's prediction array.
    """
    x = pd.read_csv('x.csv')
    y = pd.read_csv('y.csv')

    # Category labels that are both sampled at random below and later
    # replaced by numeric scores; defined once so the two uses cannot drift.
    policy_states = ('IL', 'IN', 'OH')
    policy_csls = ('500/1000', '100/300', '250/500')
    education_levels = ('Masters', 'High School', 'Associate', 'JD',
                        'College', 'MD', 'PhD')
    occupations = ('other-service', 'priv-house-serv', 'adm-clerical',
                   'handlers-cleaners', 'prof-specialty', 'protective-serv',
                   'machine-op-inspct', 'armed-forces', 'sales',
                   'tech-support', 'transport-moving', 'craft-repair',
                   'farming-fishing', 'exec-managerial')
    hobbies = ('camping', 'kayaking', 'golf', 'dancing', 'bungie-jumping',
               'movies', 'basketball', 'exercise', 'sleeping', 'video-games',
               'skydiving', 'paintball', 'hiking', 'base-jumping', 'reading',
               'polo', 'board-games', 'yachting', 'cross-fit', 'chess')
    hospital_states = ('WV', 'NY', 'VA', 'PA', 'SC', 'NC', 'OH')
    hospital_cities = ('Northbrook', 'Riverwood', 'Northbend', 'Springfield',
                       'Hillsdale', 'Columbus', 'Arlington')
    reasons = ('GORD', 'Appendectomy', 'Hemorrhoidectomy', 'Kidney',
               'Cataract', 'Delivery', 'Liver', 'Cancer', 'Lungs', 'Brain',
               'Prosthetics', 'Heart', 'Stones', 'ALS')
    reason_types = ('B123', 'RSX', 'L1', 'J5', 'A12', 'H763', 'H445', 'CR362',
                    'D2', 'L14', 'C93', 'TL', 'A3', 'MDX', 'C736', 'J1', 'S9',
                    'E400', 'H1', 'P1', 'S2', '92x', 'A1', 'D1', 'X5', 'L72',
                    'M5', 'S1', 'A5', 'C633', 'LN142', 'F150', 'C300',
                    'ML350', 'LN132', 'X6')

    now = datetime.now()

    # Column name -> value, in the column order of the feature frame.
    # random.choice is used for the categorical placeholders because
    # random.randint(0, len(seq)) includes len(seq) and would intermittently
    # index one past the end of the sequence.
    row = {
        'months_as_customer': random.randint(8, 40),
        'age': claim['age'],
        'policy_state': random.choice(policy_states),
        'policy_csl': random.choice(policy_csls),
        'policy_deductable': 1000 * random.randint(0, 10),
        'policy_annual_premium': 500 * random.randint(0, 10),
        'umbrella_limit': 10000 * random.randint(0, 3),
        'insured_zip': random.randint(111111, 999999),
        'insured_sex': claim['insured_sex'].upper(),
        'insured_education_level': random.choice(education_levels),
        'insured_occupation': random.choice(occupations),
        'insured_hobbies': random.choice(hobbies),
        'insured_relationship': claim['insured_relationship'],
        'capital-gains': 500 * random.randint(0, 10),
        'capital-loss': 500 * random.randint(0, 10),
        'type_of_admission': claim['type_of_admission'],
        'type_of_visit': claim['type_of_visit'],
        'incident_severity': claim['incident_severity'],
        'source_of_admission': claim['source_of_admission'],
        'hospital_state': random.choice(hospital_states),
        'hospital_city': random.choice(hospital_cities),
        'service_provider': "Long Island Medical Arts",
        # Valid hours are 0..23; the previous upper bound of 25 was out of
        # range for an hour of the day.
        'hospitalized_hour_of_the_day': random.randrange(24),
        'status_when_brought_in': claim['status_when_brought_in'],
        'survival_status': claim['survival_status'],
        'duration_of_hospitalization': claim['duration_of_hospitalization'],
        'medical_staff': claim['medical_staff'],
        'total_claim_amount': claim['total_claim'],
        'board_claim': claim['board_claim'],
        'pharmacy_claim': claim['pharmacy_claim'],
        'doctor_consultation_claim': claim['doctor_claim'],
        'reason': random.choice(reasons),
        'reason_type': random.choice(reason_types),
        'diagnosed_year': random.randint(2007, 2020),
        # Month and day come from one timestamp so they always belong to
        # the same date.
        'hospitalized_month': now.month,
        'hospitalized_day': now.day,
    }

    x_test = pd.DataFrame(columns=list(row), index=['a'])
    x_test.loc['a'] = list(row.values())

    # (column, labels, scores): each label is replaced by the score at the
    # same position.
    encodings = (
        ('reason_type', reason_types,
         (0.95, 0.91, 0.90, 0.88, 0.87, 0.86, 0.85, 0.85, 0.84, 0.83, 0.81,
          0.80, 0.78, 0.77, 0.77, 0.76, 0.75, 0.74, 0.73, 0.72, 0.71, 0.71,
          0.71, 0.71, 0.70, 0.68, 0.67, 0.67, 0.66, 0.64, 0.62, 0.62, 0.61,
          0.60, 0.59, 0.56)),
        ('reason', reasons,
         (0.84, 0.82, 0.81, 0.80, 0.77, 0.76, 0.75, 0.74, 0.73, 0.72, 0.71,
          0.69, 0.69, 0.66)),
        ('survival_status', ('NO', 'YES'), (0.76, 0.74)),
        ('hospital_city', hospital_cities,
         (0.78, 0.77, 0.76, 0.75, 0.74, 0.73, 0.71)),
        ('hospital_state', hospital_states,
         (0.82, 0.77, 0.76, 0.73, 0.70, 0.69, 0.56)),
        ('source_of_admission',
         ('None', 'Self', 'Neighbor', 'Family', 'Ambulance', 'Other'),
         (1.0, 0.93, 0.79, 0.73, 0.70, 0.68)),
        ('incident_severity', (1, 3, 5, 4), (0.94, 0.89, 0.87, 0.39)),
        ('type_of_visit', ('V67', 'V55', 'V73'), (0.78, 0.74, 0.72)),
        ('type_of_admission', ('AD3', 'AD6', 'AD8', 'AD1'),
         (0.91, 0.90, 0.72, 0.70)),
        ('insured_relationship',
         ('husband', 'own-child', 'unmarried', 'not-in-family', 'wife',
          'other-relative'), (0.79, 0.78, 0.75, 0.74, 0.72, 0.70)),
        ('insured_hobbies', hobbies,
         (0.91, 0.90, 0.89, 0.88, 0.84, 0.83, 0.82, 0.81, 0.805, 0.80, 0.78,
          0.77, 0.76, 0.73, 0.73, 0.72, 0.70, 0.69, 0.25, 0.17)),
        ('insured_occupation', occupations,
         (0.84, 0.84, 0.83, 0.79, 0.78, 0.77, 0.76, 0.75, 0.72, 0.71, 0.705,
          0.70, 0.69, 0.63)),
        ('insured_education_level', education_levels,
         (0.78, 0.77, 0.76, 0.74, 0.73, 0.72, 0.71)),
        ('insured_sex', ('FEMALE', 'MALE'), (0.76, 0.73)),
        ('policy_csl', policy_csls, (0.78, 0.74, 0.73)),
        ('policy_state', policy_states, (0.77, 0.745, 0.74)),
        ('service_provider',
         ('Long Island Medical Arts', 'Francis W Iacobellis',
          'Lenox Hill Hospital', 'Otis M Jones', 'Ms St Lukes And Roosevelt',
          'Mount Sinai Hospital', 'Nyp-Weill Cornell'),
         (0.778, 0.776, 0.765, 0.757, 0.751, 0.74, 0.71)),
    )
    for column, labels, scores in encodings:
        x_test[column] = x_test[column].replace(labels, scores)

    model = BalancedRandomForestClassifier(n_estimators=100, random_state=0)

    model.fit(x, y)
    y_pred_rf = model.predict(x_test)

    return y_pred_rf[0]
示例#18
0
def _merge_month_activity(df_se1, csv_path, month):
    """Merge one month's weekday and weekend LMS statistics into *df_se1*.

    Reads the activity log at *csv_path*, splits its rows into weekday and
    weekend rows by timestamp, summarises each part with
    ``extract_one_month`` under the prefixes ``'<month>_weekday_'`` and
    ``'<month>_weekend_'``, and left-merges both summaries on
    ``MASKED_STUDENT_ID`` (missing values become 0).
    """
    df = pd.read_csv(csv_path, sep='    ')
    df['weekday'] = pd.to_datetime(df['timestamp']).dt.dayofweek
    # dayofweek is Monday=0 ... Sunday=6, so weekdays are 0..4. The previous
    # test `x <= 5` also counted Saturday as a weekday.
    df['is_weekday'] = (df['weekday'] < 5).astype(int)

    for part, flag in (('weekday', 1), ('weekend', 0)):
        one_month = extract_one_month(df[df['is_weekday'] == flag],
                                      month + '_' + part + '_')
        df_se1 = pd.merge(df_se1,
                          one_month,
                          on=['MASKED_STUDENT_ID'],
                          how='left').fillna(0)
    return df_se1


def main(data_path_list):
    """Build the student feature table and evaluate a balanced random forest.

    Combines sequence-entropy features, library / historical-GPA features
    read from CSV files and monthly LMS activity statistics, then trains a
    ``BalancedRandomForestClassifier`` on a stratified 80/20 split and
    prints the confusion matrix, accuracy and balanced accuracy.

    Args:
        data_path_list: passed unchanged to ``get_all_seq``.
    """
    df_all = get_all_seq(data_path_list, 'COURSE_ACCESS')

    seq_sources = (
        ('09', '09_day_list'),
        ('10', '10_day_list'),
        ('11', '11_day_list'),
        ('12', '12_day_list'),
        ('total', 'total_list'),
    )
    for prefix, source in seq_sources:
        df_all[prefix + '_weekday_seq'] = df_all[source].apply(get_weekday)
        # NOTE(review): the weekend sequence is also built with get_weekday,
        # exactly as before; this looks like a copy-paste slip — confirm
        # whether a weekend counterpart was intended.
        df_all[prefix + '_weekend_seq'] = df_all[source].apply(get_weekday)

    df_all = get_weekday_seq_entropy(df_all, 5)
    df_all = get_weekend_seq_entropy(df_all, 2)
    df_all = add_at_risk_label(df_all)

    # Keep the student id plus every column whose name contains '_entropy_'.
    pattern = re.compile('.*_entropy_.*')
    entropy_list = ['De-id'] + [
        name for name in df_all.columns if pattern.match(name)
    ]

    df_all_entropy = df_all[entropy_list]
    df_all_entropy = df_all_entropy.rename(
        columns={'De-id': 'MASKED_STUDENT_ID'})
    # --- all sequence-entropy features are available from here on ---

    lib_se1 = pd.read_csv('Std_Lib_features_2016_se1.csv')
    his_2015_se1 = pd.read_csv('Std_list_atRist_2015_se1.csv')
    his_2015_se2 = pd.read_csv('Std_list_atRist_2015_se2.csv')
    his_2015_se1.columns = ['MASKED_STUDENT_ID', '2015_se1_CUM_GPA']
    his_2015_se2.columns = ['MASKED_STUDENT_ID', '2015_se2_CUM_GPA']

    his_lib = lib_se1
    for history in (his_2015_se1, his_2015_se2):
        his_lib = pd.merge(his_lib,
                           history,
                           on='MASKED_STUDENT_ID',
                           how='left').fillna(0)

    df_se1 = pd.merge(df_all_entropy,
                      his_lib,
                      on='MASKED_STUDENT_ID',
                      how='left').fillna(0)
    # --- one year of historical grades has been added ---

    # NOTE(review): the September file name uses '2016_09' while the other
    # months use '2016-MM'; the names are kept exactly as they were —
    # confirm against the files on disk.
    monthly_logs = (
        ('09', 'DR0008_activity_accumulator_2016_09.csv'),
        ('10', 'DR0008_activity_accumulator_2016-10.csv'),
        ('11', 'DR0008_activity_accumulator_2016-11.csv'),
        ('12', 'DR0008_activity_accumulator_2016-12.csv'),
    )
    for month, csv_path in monthly_logs:
        # Each month's raw log is loaded and released inside the helper.
        df_se1 = _merge_month_activity(df_se1, csv_path, month)
    # --- LMS weekly statistical features are available from here on ---

    # merge feature
    # NOTE(review): df_all_entropy was already merged in when df_se1 was
    # first built, so this second merge yields suffixed duplicate entropy
    # columns; preserved as-is — confirm whether it is intended.
    df_se1 = pd.merge(df_se1,
                      df_all_entropy,
                      on='MASKED_STUDENT_ID',
                      how='left').fillna(0)
    excluded = ('label_atRist', 'MASKED_STUDENT_ID')
    df_se1_features = df_se1[[
        name for name in df_se1.columns if name not in excluded
    ]]
    df_se1_labels = df_se1['label_atRist']

    # classification
    X_train, X_test, y_train, y_test = train_test_split(df_se1_features,
                                                        df_se1_labels,
                                                        test_size=0.2,
                                                        stratify=df_se1_labels)

    brf = BalancedRandomForestClassifier(n_estimators=300,
                                         criterion='gini',
                                         random_state=0)
    brf.fit(X_train, y_train)
    y_pred = brf.predict(X_test)
    print(confusion_matrix(y_test, y_pred))
    print(accuracy_score(y_test, y_pred))
    print(balanced_accuracy_score(y_test, y_pred))
    '''
# Balanced random forest, first on every feature and then on a reduced set
# chosen by model-based feature selection.
# `X_train`, `X_test`, `y_train`, `y_test` and `loss_intensity` are defined
# before this fragment; the y frames are flattened with `.values.ravel()`.
print("_____________________________________ \n Balanced Random Forest")
# all features
clf_brf_all = BalancedRandomForestClassifier(n_estimators=1000,
                                             random_state=0,
                                             n_jobs=-1,
                                             max_depth=4,
                                             min_samples_split=0.05).fit(
                                                 X_train,
                                                 y_train.values.ravel())
# The first column name of `loss_intensity` is used as the label in the
# printed messages.
print(f"All features results: \n",
      f"{list(loss_intensity.columns.values)[0]} - All training score is",
      clf_brf_all.score(X_train, y_train.values.ravel()))
print(f"{list(loss_intensity.columns.values)[0]} - All test score is",
      clf_brf_all.score(X_test, y_test.values.ravel()))
y_pred = clf_brf_all.predict(X_test)

# Select the most important features with a second, separately fitted forest;
# `max_features=5` caps how many columns the selector keeps.
sel = SelectFromModel(BalancedRandomForestClassifier(n_estimators=1000,
                                                     random_state=0),
                      max_features=5)
sel.fit(X_train, y_train.values.ravel())
# Names of the columns the selector kept.
selected_feat = X_train.columns[(sel.get_support())]
print("\n Balanced Random Forest \n The selected features are",
      len(selected_feat), selected_feat.values)
# Reduce both splits to the selected columns.
X_train_selected = sel.transform(X_train)
X_test_selected = sel.transform(X_test)
# select features
clf_brf = BalancedRandomForestClassifier(n_estimators=1000,
                                         random_state=0,
示例#20
0
# time: 2021/04/16
import scipy.io as scio
from imblearn.ensemble import BalancedRandomForestClassifier
import scipy
from sklearn.metrics import balanced_accuracy_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

# One classifier object is reused for every yearly data set; it is refitted
# from that year's training data on each pass.
brf = BalancedRandomForestClassifier(n_estimators=100, random_state=0)
year = 15

while year < 19:
    year_str = str(year)
    year += 1

    # Input folder for this year's .mat files.
    data_dir = "/data/file/classification_data/SJ" + year_str

    train_mat = scio.loadmat(data_dir + "/trainData.mat")
    train_data = train_mat["x_train"]
    train_label_mat = scio.loadmat(data_dir + "/trainlabel.mat")
    train_label = train_label_mat["trainlabel"].ravel()

    test_mat = scio.loadmat(data_dir + "/testData.mat")
    test_data = test_mat["x_test"]
    test_label_mat = scio.loadmat(data_dir + "/testlabel.mat")
    test_label = test_label_mat["testlabel"].ravel()

    brf.fit(train_data, train_label)
    # Predictions are stored as a column vector.
    label_pred = brf.predict(test_data).reshape(-1, 1)
    print(label_pred)

    out_path = ("/data/file/classification_data/pre/forest_of_random/SJ" +
                year_str + "/label_pred.mat")
    scio.savemat(out_path, {'label_pred': label_pred})
###############################################################################
# Classification using random forest classifier with and without sampling
###############################################################################
# Random forest is another popular ensemble method and it is usually
# outperforming bagging. Here, we used a vanilla random forest and its balanced
# counterpart in which each bootstrap sample is balanced.

# Build both forests with the same size, seed and parallelism so that they
# can be compared on equal terms.
rf = RandomForestClassifier(n_estimators=50, random_state=0, n_jobs=-1)
brf = BalancedRandomForestClassifier(n_estimators=50, random_state=0,
                                     n_jobs=-1)

rf.fit(X_train, y_train)
brf.fit(X_train, y_train)

y_pred_rf = rf.predict(X_test)
y_pred_brf = brf.predict(X_test)

# Similarly to the previous experiment, the balanced classifier outperforms
# the classifier which learns from imbalanced bootstrap samples. In addition,
# random forest outperforms the bagging classifier.

print('Random Forest classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_rf),
              geometric_mean_score(y_test, y_pred_rf)))
# Confusion matrix of the plain forest, drawn on the first of two axes.
cm_rf = confusion_matrix(y_test, y_pred_rf)
fig, ax = plt.subplots(ncols=2)
plot_confusion_matrix(cm_rf, classes=np.unique(satimage.target), ax=ax[0],
                      title='Random forest')

print('Balanced Random Forest classifier performance:')
示例#22
0
    # Body of a repeated experiment (the enclosing loop header is outside
    # this fragment): split the lyrics 60/40, fit TF-IDF on the training
    # texts, train a balanced forest and accumulate per-term importances.
    X_train, X_test, y_train, y_test = train_test_split(working_df['lyrics'],
                                                        y,
                                                        test_size=.40)

    # Uni- to tri-gram TF-IDF limited to 10000 features.
    vectorizer = TfidfVectorizer(
        stop_words=all_stop_words,
        ngram_range=(1, 3),
        #                             , max_df=.8,
        #                              min_df=.2,
        max_features=10000)
    # The vocabulary is learned from the training texts only.
    vectorizer.fit(X_train)
    X_train_vec = vectorizer.transform(X_train)
    # NOTE(review): newer scikit-learn releases expose
    # `get_feature_names_out` instead — confirm the installed version.
    features = vectorizer.get_feature_names()

    # NOTE(review): `max_features='auto'` is not accepted by every
    # scikit-learn / imbalanced-learn release — confirm the installed version.
    imbrf = BalancedRandomForestClassifier(n_estimators=5000,
                                           max_features='auto',
                                           sampling_strategy=0.5).fit(
                                               X_train_vec, y_train)

    X_test_vec = vectorizer.transform(X_test)

    # Not used further within this fragment.
    y_pred = imbrf.predict(X_test_vec)

    # Sum each term's importance into `aggr_feat_imp_dict`, which is created
    # outside this fragment.
    for score, term in zip(imbrf.feature_importances_, features):
        if term not in aggr_feat_imp_dict:
            aggr_feat_imp_dict[term] = score
        else:
            aggr_feat_imp_dict[term] += score

# Persist the accumulated importances as JSON.
with open('feat_ranks_dict.json', 'w') as fp:
    json.dump(aggr_feat_imp_dict, fp)
示例#23
0
    '''
        Till this got LMS week statistical features
    
    '''

    # merge feature
    df_se1 = pd.merge(df_se1, df_all_entropy, on='MASKED_STUDENT_ID', how='left').fillna(0)
    df_se1_features = df_se1[[i for i in df_se1.columns if i != 'label_atRist' and i != 'MASKED_STUDENT_ID']]
    df_se1_labels = df_se1['label_atRist']

    # classification
    X_train, X_test, y_train, y_test = train_test_split(df_se1_features, df_se1_labels, test_size = 0.2, stratify=df_se1_labels)

    brf = BalancedRandomForestClassifier(n_estimators=300, criterion = 'gini', random_state=0)
    brf.fit(X_train, y_train) 
    y_pred = brf.predict(X_test)
    imp_feature = brf.feature_importances_
    print(confusion_matrix(y_test, y_pred))
    print(accuracy_score(y_test, y_pred))
    print(balanced_accuracy_score(y_test, y_pred))

    '''
        2019.5.30
        1. 加了lms统计特征(weekday and weekend)和seq统计特征(weekday and weekend)
        2. 划分训练集采用同分布
        Best acc now: 0.7309003914745542
        
        Next Step:
        1. 加入历史成绩特征
        2. 观察具体 weekday 和 weekend 两类数据分布具体有何不同
        3. 测试用半学期行为数据early predict
示例#24
0
def BalancedRF_classifier(df, y_column, feature_columns, test_rate):
    """Train a balanced random forest for imbalanced classes and report on it.

    Prints the model parameters, the test and training accuracy and a
    confusion matrix, shows a table and a plot of the per-feature
    importances, and returns the fitted model together with the data.

    Args:
        df: DataFrame holding both the feature columns and the target.
        y_column: label of the target column.
        feature_columns: labels of the explanatory columns.
        test_rate: passed to ``train_test_split`` as ``test_size``.

    Returns:
        ``(model, importances, (X_train, X_test, Y_train, Y_test))`` where
        ``importances`` is a DataFrame with the columns ``variable`` and
        ``importance``, sorted by decreasing importance.
    """
    # Build the explanatory and target arrays.
    X = df.loc[:, feature_columns].values
    Y = df.loc[:, y_column].values

    # Split into training and validation data (fixed seed).
    X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                        Y,
                                                        test_size=test_rate,
                                                        random_state=123,
                                                        shuffle=True)

    # A grid search over RandomForestClassifier (n_estimators, max_features,
    # min_samples_split, max_depth) used to be kept here as disabled code;
    # the model below is built with fixed settings instead.
    model = BalancedRandomForestClassifier(n_jobs=1,
                                           n_estimators=30,
                                           sampling_strategy='not minority')

    print(model.get_params())
    model.fit(X_train, Y_train)

    # Accuracy on the validation data, then on the training data.
    print("正解率 : " + str(model.score(X_test, Y_test) * 100) + "%")
    print("訓練データの正解率 : " + str(model.score(X_train, Y_train) * 100) + "%")

    # Confusion matrix. np.unique gives the labels in sorted order, so the
    # matrix layout no longer depends on set iteration order.
    print("confusion matrix")
    prediction = model.predict(X_test)
    labels = list(np.unique(Y))
    print_cmx(Y_test, prediction, labels)

    # Feature importance: mean of the per-estimator importances.
    avg_importance = np.array(
        [tree.feature_importances_ for tree in model.estimators_]).mean(axis=0)

    importances = pd.DataFrame({
        'variable': feature_columns,
        'importance': avg_importance
    }).sort_values('importance', ascending=False).reset_index(drop=True)
    display(importances)

    # Plot with the most important variable at the top: y positions run
    # from the number of variables down to 1.
    positions = list(range(importances.shape[0], 0, -1))
    plt.figure(figsize=(5, 7))
    plt.plot(importances.importance, positions, 'o-')
    plt.yticks(positions, importances.variable)
    plt.xlabel('importance')
    plt.show()

    return model, importances, (X_train, X_test, Y_train, Y_test)
*Note: this may take a while*
'''

# %%
# Balanced forest with out-of-bag scoring switched on, so that
# `oob_decision_function_` is available after fitting.
brf_settings = {
    'n_estimators': 2000,
    'replacement': True,
    'sampling_strategy': 'not minority',
    'oob_score': True,
    'n_jobs': 4,
    'random_state': 42,
    'verbose': 1,
}
clf = BalancedRandomForestClassifier(**brf_settings)
clf.fit(X_train, Y_train)

# Held-out evaluation.
Y_test_pred = clf.predict(X_test)
print('\nClassifier performance')
test_report = metrics.classification_report(Y_test,
                                            Y_test_pred,
                                            zero_division=0)
print('Out of sample:\n', test_report)

# This will be the training set: out-of-bag class probabilities.
oob_probabilities = clf.oob_decision_function_
Y_in_train = oob_probabilities.astype('float32')
# This will be the test set: predicted class probabilities.
test_probabilities = clf.predict_proba(X_test)
Y_in_test = test_probabilities.astype('float32')

# %% [markdown]
'''

## Architecture design
As a baseline, let's use a single-layer bidirectional LSTM.
PyTorch uses a slightly unintuitive array format for the input and output of
its LSTM module.
示例#26
0

# Notebook-style fragment: write the weighted forest's predictions, then
# train a balanced random forest and store its predictions as well.
# `sample`, `test_df`, `weighted_clf` and the train/test splits are defined
# before this fragment.

# Hard labels and the probability from column 1 of predict_proba, computed on
# the test frame without its 'risk_flag' column.
sample['risk_flag_weighted_rfc'] = weighted_clf.predict(test_df.drop(columns = ['risk_flag']))
sample['risk_flag_proba_weighted_rfc'] = weighted_clf.predict_proba(test_df.drop(columns = ['risk_flag']))[:,1]
# Bare expression: only shows a value in an interactive session.
weighted_clf.classes_

sample.to_csv('weighted_rfc.csv',index = False)




#balancedrfc

from imblearn.ensemble import BalancedRandomForestClassifier
brfc = BalancedRandomForestClassifier(n_estimators=500,random_state=0).fit(X_tr,y_tr)
# Bare expression: the score is computed from hard predictions and is not
# stored or printed.
roc_auc_score(y_tst,brfc.predict(X_tst))


sample['risk_flag'] = brfc.predict(test_df.drop(columns = ['risk_flag']))
sample['risk_flag_proba'] = brfc.predict_proba(test_df.drop(columns = ['risk_flag']))[:,1]
# NOTE(review): this inspects `weighted_clf` although the predictions above
# come from `brfc` — presumably `brfc.classes_` was meant; confirm.
weighted_clf.classes_




# NOTE(review): this uses `y_test` / `X_test` while the ROC AUC above uses
# `y_tst` / `X_tst` — confirm both refer to the intended split.
print("F1 Score for Balanced Random Forest Classifier is ", f1_score(y_test,brfc.predict(X_test)))
print("Accuracy  Score for Balanced Random Fo
      
      
#catboost
示例#27
0
def _clean_text_column(series, stop_words):
    """Normalise a free-text column for bag-of-words vectorisation.

    Every value is stringified and lower-cased, each character that is
    neither a word character nor whitespace is replaced by a space, and
    tokens contained in ``stop_words`` are dropped.  Remaining tokens are
    re-joined with single spaces.
    """
    lowered = series.apply(
        lambda text: " ".join(token.lower() for token in str(text).split()))
    # regex=True must be explicit: pandas >= 2.0 treats the pattern as a
    # literal string by default, which would silently keep all punctuation.
    without_punctuation = lowered.str.replace(r'[^\w\s]', ' ', regex=True)
    return without_punctuation.apply(
        lambda text: " ".join(
            token for token in str(text).split() if token not in stop_words))


def Clasificar(database, new, path, *, sample_frac=0.4, random_state=None):
    """Predict whether new companies would be rejected or are of interest.

    A balanced random forest is trained on a random sample of the already
    classified companies in ``database`` (bag-of-words of the ``Category``
    and ``Description`` text) and then applied to the rows of ``new``.
    Progress messages and training-set metrics are printed to stdout.

    Parameters
    ----------
    database : pandas.DataFrame
        Labelled data.  Must contain the columns ``Operation``,
        ``Investee``, ``Category``, ``Description``, ``Category.1`` and
        ``Area of Focus``; rows with a missing value in any of them are
        discarded.  A ``Category.1`` value of ``"Rejected"`` marks a
        rejected company.
    new : pandas.DataFrame
        Companies to classify.  Must contain ``Transaction Name``,
        ``Organization Name``, ``Categories`` and ``Description``.  The
        caller's frame is not modified.
    path
        Unused; kept for interface compatibility.
    sample_frac : float, optional
        Fraction of ``database`` rows sampled (without replacement) for
        training.  Defaults to the historical value ``0.4``.
    random_state : optional
        Forwarded to ``DataFrame.sample`` to make the training sample
        reproducible.  ``None`` (default) keeps the sample random.

    Returns
    -------
    pandas.DataFrame
        ``new`` with ``Categories`` / ``Organization Name`` renamed to
        ``Category`` / ``Investee``, any pre-existing ``Prediction`` column
        removed, and three columns appended: ``Prediction`` (0 = rejected,
        1 = not rejected), ``Prob. of being rejected`` and
        ``Prob. of being of interest``.
    """
    # Process-wide switches kept from the original implementation so that
    # callers observe the same (silenced) warning behaviour afterwards.
    pd.options.mode.chained_assignment = None
    warnings.filterwarnings('ignore')

    if 'Response by Category' in database.columns:
        database = database.drop(
            ['Response by Category', 'Response by Description'], axis=1)
    database = database.sample(
        frac=sample_frac, replace=False, random_state=random_state)

    # Harmonise known spelling variants of the label.
    database["Category.1"] = database["Category.1"].replace(
        {"rejected": "Rejected", "B2C ": "B2C", "FIntech": "Fintech"})

    # Make sure the output has no stale 'Prediction' column, without
    # touching the DataFrame owned by the caller.
    new = new.drop(columns=['Prediction'], errors='ignore')

    # CLASSIFIER

    print('Importando bases de datos')

    new = new.rename(
        columns={'Categories': 'Category', 'Organization Name': 'Investee'})
    train = database[['Operation', 'Investee', 'Category', 'Description',
                      'Category.1', 'Area of Focus']].dropna()
    # Explicit copy: columns are rewritten and added below.
    newdata = new[['Transaction Name', 'Investee', 'Category',
                   'Description']].copy()

    print('Preprocesamiento del texto')

    # A set gives O(1) stop-word membership tests per token.
    stop_words = set(stopwords.words('english'))

    for column in ('Category', 'Description'):
        train[column] = _clean_text_column(train[column], stop_words)
        newdata[column] = _clean_text_column(newdata[column], stop_words)

    train_src1 = train[['Category', 'Description', 'Category.1']].copy()
    # Binary target: 0 = rejected, 1 = anything else.
    train_src1['Rejected?'] = (
        train_src1['Category.1'] != 'Rejected').astype(int)
    labels = train_src1['Rejected?']

    # Bag-of-words encoding; each text column gets its own vocabulary,
    # fitted on the training data only.
    category_vectorizer = CountVectorizer()
    description_vectorizer = CountVectorizer()

    train_features = pd.concat(
        [
            pd.DataFrame(category_vectorizer.fit_transform(
                train_src1['Category']).toarray()),
            pd.DataFrame(description_vectorizer.fit_transform(
                train_src1['Description']).toarray()),
        ],
        axis=1)
    new_features = pd.concat(
        [
            pd.DataFrame(category_vectorizer.transform(
                newdata['Category']).toarray()),
            pd.DataFrame(description_vectorizer.transform(
                newdata['Description']).toarray()),
        ],
        axis=1)

    print('Entrenamiento')

    # Binary classification: rejected vs. not rejected
    # (resampling + random forest).
    brf = BalancedRandomForestClassifier(n_estimators=100, random_state=0)
    brf.fit(train_features, labels)
    # These metrics are computed on the training data itself.
    y_train_pred = brf.predict(train_features)
    print('Confusion matrix: \n' , confusion_matrix(labels, y_train_pred))
    print('Accuracy: \n' , accuracy_score(labels, y_train_pred))
    print('Recall: \n' , recall_score(labels, y_train_pred))

    print('Clasificacion y exportacion')
    # Apply the fitted model to the new data.
    y_new_predict = brf.predict(new_features)
    y_new_predict_proba = brf.predict_proba(new_features)

    newdata['Prediction'] = y_new_predict
    # Column 0 / 1 are indexed as in the original code: label 0 is
    # "rejected", label 1 is "not rejected".
    newdata['Prob. of being rejected'] = y_new_predict_proba[:, 0]
    newdata['Prob. of being of interest'] = y_new_predict_proba[:, 1]

    # Attach the prediction columns to the full `new` table and return it.
    new = pd.concat(
        [new, newdata[['Prediction', 'Prob. of being rejected',
                       'Prob. of being of interest']]],
        axis=1, sort=False)

    return new
示例#28
0
# Feature matrix: the columns of `data` listed in `columns`.
X = data[columns]

# Target: the 'upgrd_customer_class' column.
Y = data['upgrd_customer_class']

# DataFrameImputer is defined outside this view -- presumably it fills
# missing values per column; verify against its definition.
newDF = DataFrameImputer().fit_transform(X)

# Names of the columns of newDF that still contain nulls.
# NOTE(review): `missing` is not used again in the visible code.
missing = newDF.columns[newDF.isnull().any()]

# Remove five columns from the imputed frame.
newDF = newDF.drop([
    'REMOTE_START_PARKING_ASSIST_CD', 'NEAR_FIELD_COMMUNICATION_FLG',
    'TIRE_MOBILE_KIT_FLG', 'PREFERRED_CHANNEL_CD', 'PERSONICX_CATEGORY_CD'
],
                   axis=1)

# NOTE(review): the encoder is fitted on the original X (cast to str), not
# on the imputed/pruned newDF built above, so in the visible code newDF
# never reaches the model -- confirm whether `newDF` was intended here.
le = MultiColumnLabelEncoder()
X = le.fit_transform(X.astype(str))

# Scale the encoded features; the scaler is fitted on the full X before the
# train/test split below.
transformer = RobustScaler().fit(X)
X = transformer.transform(X)

# Hold out 33% of the rows, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.33,
                                                    random_state=42,
                                                    stratify=Y)

# Balanced random forest: fit on the training split and print the F1 score
# on the held-out split.
# NOTE(review): f1_score is called with its default averaging -- confirm the
# target is binary.
brf = BalancedRandomForestClassifier(n_estimators=300, random_state=0)
brf.fit(X_train, y_train)
print(f1_score(y_test, brf.predict(X_test)))