示例#1
0
        def objective(trial):
            """Optuna objective for tuning a BalancedRandomForestClassifier.

            A random 80/20 split of ``self.X`` / ``self.y`` is drawn on every
            call. Missing values are median-imputed with statistics learned
            from the training part only, and the model is scored by ROC AUC on
            the held-out 20%. Accuracy, a confusion matrix and ROC AUC on
            ``self.X_validation`` / ``self.y_validation`` are printed or
            plotted for monitoring only; they do not affect the returned value.

            Args:
                trial: Optuna trial used to sample the hyper-parameters.

            Returns:
                ROC AUC of the fitted model on the 20% hold-out split.
            """
            train_X, val_X, train_y, val_y = train_test_split(self.X,
                                                              self.y,
                                                              test_size=0.2)

            # Fit the imputer on the training split only. Re-fitting it on the
            # hold-out or validation frames would impute them with their own
            # medians and leak evaluation data into preprocessing.
            # (np.nan instead of np.NaN: the alias was removed in NumPy 2.0.)
            median_imputer = SimpleImputer(missing_values=np.nan,
                                           strategy='median')
            v_train_X = median_imputer.fit_transform(train_X)
            v_val_X = median_imputer.transform(val_X)
            train_X = pd.DataFrame(v_train_X,
                                   columns=train_X.columns,
                                   index=train_X.index)
            val_X = pd.DataFrame(v_val_X,
                                 columns=val_X.columns,
                                 index=val_X.index)

            v_test_X = median_imputer.transform(self.X_validation)
            test_X = pd.DataFrame(v_test_X,
                                  columns=self.X_validation.columns,
                                  index=self.X_validation.index)

            list_trees = [250, 500, 1000, 1500, 3000, 3500, 4000]

            # Keep the suggestion order stable: samplers draw parameters in
            # the order they are requested.
            brf_n_estimators = trial.suggest_categorical(
                'n_estimators', list_trees)
            brf_max_features = trial.suggest_uniform('max_features', 0.15, 1.0)
            brf_min_samples_split = trial.suggest_int('min_samples_split', 2,
                                                      16)
            brf_min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 16)
            brf_min_weight_fraction_leaf = trial.suggest_uniform(
                'min_weight_fraction_leaf', 0, 0.5)
            brf_max_depth = trial.suggest_int('max_depth', 2, 32)

            brfmodel = BalancedRandomForestClassifier(
                n_estimators=brf_n_estimators,
                max_features=brf_max_features,
                min_samples_split=brf_min_samples_split,
                min_samples_leaf=brf_min_samples_leaf,
                max_depth=brf_max_depth,
                min_weight_fraction_leaf=brf_min_weight_fraction_leaf,
                bootstrap=True)

            brfmodel.fit(train_X, train_y)

            # Column 1 of predict_proba is used as the positive-class score.
            aucbrf = roc_auc_score(val_y, brfmodel.predict_proba(val_X)[:, 1])
            aucbrf_test = roc_auc_score(self.y_validation,
                                        brfmodel.predict_proba(test_X)[:, 1])
            print('Accuracy test ' + str(
                accuracy_score(self.y_validation, brfmodel.predict(test_X))))

            plt.figure()
            plot_confusion_matrix(brfmodel,
                                  test_X,
                                  self.y_validation,
                                  cmap=plt.cm.Blues,
                                  normalize=None)
            plt.show()
            print(aucbrf_test)

            return aucbrf
示例#2
0
    def evaluate_on_validation_or_test(self, test=False):
        """Refit each stored parameter set and print its ROC AUC.

        Parameter sets are read from
        ``<result_folder>/param_RF_<epoch>.json``, a list of dicts whose
        ``'value'`` entry is dropped before the rest is passed to
        ``BalancedRandomForestClassifier``. Missing values are median-imputed
        using statistics learned from ``self.X`` only.

        Args:
            test: When falsy, train on ``self.X`` and report AUC on the
                validation set. When truthy, train on train + validation and
                report AUC on ``self.X_test`` / ``self.y_test``.
        """
        params_path = self.result_folder + '/param_RF_{}.json'.format(
            self.epoch)
        # Only the read needs the file handle; close it before training.
        with open(params_path) as f:
            dati = json.load(f)

        for data in dati:
            # 'value' is not an estimator argument. pop() tolerates entries
            # that lack it, where `del` would raise KeyError.
            data.pop('value', None)

            rf_model = BalancedRandomForestClassifier(**data)

            trainX = self.X
            trainy = self.y
            valx = self.X_validation
            valy = self.y_validation

            # np.nan instead of np.NaN: the alias was removed in NumPy 2.0.
            median_imputer = SimpleImputer(missing_values=np.nan,
                                           strategy='median')
            imputer = median_imputer.fit(trainX)
            trainX = pd.DataFrame(imputer.transform(trainX),
                                  columns=trainX.columns,
                                  index=trainX.index)
            valx = pd.DataFrame(imputer.transform(valx),
                                columns=valx.columns,
                                index=valx.index)

            if test:
                eval_X = pd.DataFrame(imputer.transform(self.X_test),
                                      columns=self.X_test.columns,
                                      index=self.X_test.index)
                eval_y = self.y_test
                label = "Test"
                # For the final test score the model is trained on
                # train + validation.
                trainX = pd.concat([trainX, valx])
                trainy = np.concatenate((trainy, valy))
            else:
                eval_X = valx
                eval_y = valy
                label = "Validation"

            rf_model.fit(trainX, trainy)

            # Column 1 of predict_proba is used as the positive-class score.
            roc_rf = roc_auc_score(eval_y,
                                   rf_model.predict_proba(eval_X)[:, 1])
            print("{} AUC: {}".format(label, str(roc_rf)))
def main():
    """Run the 10-fold comparison of two under-sampling random forests.

    Splits the challenges, loads the 162 JSON shards of X and y (indexed by
    ``l0``/``l1``), then for each of the 10 folds trains

    * a ``BalancedRandomForestClassifier`` (results under ``brf``), and
    * a plain ``RandomForestClassifier`` fitted on data resampled by
      ``RandomUnderSampler`` (results under ``rus``),

    writing the predicted probabilities and the feature importances of every
    fold as JSON below ``RESULT_PATH``.
    """

    def load_frames(kind):
        # Concatenate shards 1..162 of the requested kind ('X' or 'y').
        shards = []
        for shard_no in range(1, 163):
            shards.append(
                pd.read_json(XY_PATH[kind].format(shard_no), orient='records'))
        return pd.concat(shards).set_index(['l0', 'l1'])

    def dump_fold(model, X_eval, y_eval, subdir, fold):
        # Persist per-fold probabilities (keyed by the y index) and importances.
        prob_frame = pd.DataFrame(model.predict_proba(X_eval.to_numpy()),
                                  index=y_eval.index)
        prob_path = os.path.join(RESULT_PATH, subdir, f'y_prob_{fold}.json')
        prob_frame.reset_index().to_json(prob_path, orient='records')

        importance_path = os.path.join(RESULT_PATH, subdir,
                                       f'feature_importance_{fold}.json')
        pd.Series(model.feature_importances_).to_json(importance_path)

    print('Spliting challenges')
    split_challenges()
    print('Reading X...')
    X = load_frames('X')
    print('Reading y...')
    y = load_frames('y')

    print('\nTraining Inner sampler RFC')
    for fold in range(10):
        print(f'Training 10-Fold CV #{fold}', end='\r')
        X_train, X_test, y_train, y_test = get_train_test_Xy(X, y, fold)

        forest = BalancedRandomForestClassifier(n_estimators=100,
                                                random_state=0)
        forest.fit(X_train.to_numpy(), y_train.to_numpy().ravel())
        dump_fold(forest, X_test, y_test, 'brf', fold)

    print('\nTraining RandomUnderSampler')
    for fold in range(10):
        print(f'Training 10-Fold CV #{fold}', end='\r')
        X_train, X_test, y_train, y_test = get_train_test_Xy(X, y, fold)

        forest = RandomForestClassifier(n_estimators=100, random_state=0)
        sampler = RandomUnderSampler(random_state=0)

        # Under-sample first, then fit the ordinary forest on the result.
        X_balanced, y_balanced = sampler.fit_resample(
            X_train.to_numpy(), y_train.to_numpy().ravel())
        forest.fit(X_balanced, y_balanced)
        dump_fold(forest, X_test, y_test, 'rus', fold)
示例#4
0
def evaluate(X_train, y_train, X_test, y_test):
    """Fit a balanced random forest and score its first and second guesses.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: True test labels.

    Returns:
        A pair ``(cm_top1, cm_top2)`` of confusion matrices comparing
        ``y_test`` with the most probable and the second most probable
        predicted class respectively.
    """
    # `seed` is a module-level value that is only read here, so no `global`
    # declaration is needed.
    clf = BalancedRandomForestClassifier(n_estimators=500, random_state=seed)
    clf.fit(X_train, y_train)

    # argsort yields column positions of predict_proba, ordered by ascending
    # probability. Those positions index clf.classes_ and only coincide with
    # the labels when the training labels are exactly 0..k-1, so translate
    # them explicitly.
    ranked_labels = clf.classes_[clf.predict_proba(X_test).argsort(axis=1)]
    y_pred1 = ranked_labels[:, -1]
    y_pred2 = ranked_labels[:, -2]
    return metrics.confusion_matrix(y_test, y_pred1), metrics.confusion_matrix(
        y_test, y_pred2)
示例#5
0
    def evaluate_model(self):
        """Estimate ROC AUC of the stored parameter sets by repeated CV.

        Parameter sets are read from
        ``<result_folder>/param_RF_<epoch>.json`` (a list of dicts; the
        ``'value'`` entry is dropped). Each set is evaluated with 20
        repetitions of shuffled, stratified 5-fold cross-validation on
        ``self.X`` / ``self.y``. Every fold's AUC is printed, followed by the
        mean.

        Returns:
            The list of per-fold AUC values for the last parameter set in the
            file (empty if the file holds no parameter sets).
        """
        with open(self.result_folder +
                  '/param_RF_{}.json'.format(self.epoch)) as f:
            dati = json.load(f)

        # Bound even when `dati` is empty, so the return below cannot fail.
        rf_auc = []

        for data in dati:
            # 'value' is not an estimator argument.
            data.pop('value', None)

            rf_model = BalancedRandomForestClassifier(**data)

            # Scores are collected per parameter set.
            rf_auc = []

            for i in tqdm(range(20)):
                # A different but reproducible shuffle for every repetition.
                cv = StratifiedKFold(n_splits=5,
                                     shuffle=True,
                                     random_state=i + 187462)

                for train_index, test_index in cv.split(self.X, self.y):
                    trainX = self.X.iloc[train_index]
                    testX = self.X.iloc[test_index]

                    trainy = np.take(self.y, train_index)
                    testy = np.take(self.y, test_index)

                    # Learn the medians on the training fold only and apply
                    # them to both folds. Re-fitting on the test fold would
                    # leak test statistics into preprocessing.
                    # (np.nan: the np.NaN alias was removed in NumPy 2.0.)
                    imputer = SimpleImputer(missing_values=np.nan,
                                            strategy='median').fit(trainX)
                    trainX = pd.DataFrame(imputer.transform(trainX),
                                          columns=trainX.columns,
                                          index=trainX.index)
                    testX = pd.DataFrame(imputer.transform(testX),
                                         columns=testX.columns,
                                         index=testX.index)

                    # AUC of this parameter set on the current fold.
                    rf_model.fit(trainX, trainy)
                    roc_rf = roc_auc_score(
                        testy,
                        rf_model.predict_proba(testX)[:, 1])
                    rf_auc.append(roc_rf)

                    print(roc_rf)

        if rf_auc:
            print(statistics.mean(rf_auc))
        return rf_auc
    def run_best_estimator(self, train_x, train_y, test_x, test_y, estimator,
                           params, clf_type, question):
        """Fit the named ensemble and report CV and hold-out scores.

        Args:
            train_x: Training features.
            train_y: Training labels.
            test_x: Hold-out features.
            test_y: Hold-out labels. The confusion matrix is unpacked into
                four cells, so the task must be binary.
            estimator: One of ``'BalancedRandomForestClassifier'``,
                ``'BalancedBaggingClassifier'`` or
                ``'EasyEnsembleClassifier'``.
            params: Dict with the constructor arguments the chosen estimator
                reads (``n_estimators`` and ``sampling_strategy``, plus
                ``bootstrap`` and ``max_samples`` for bagging).
            clf_type: Forwarded to ``self.calc_cross_val_scores``.
            question: Question identifier; forwarded to
                ``self.calc_cross_val_scores`` and stored in the scores.

        Returns:
            ``(cross_val_scores, estimator_scores)``, where
            ``estimator_scores`` is a dict of hold-out metrics. All are
            percentages rounded to two decimals except ``F1`` and
            ``ROC AUC``, which stay on the 0-1 scale.

        Raises:
            ValueError: If ``estimator`` is not one of the supported names.
        """
        estimator_scores = {}

        if estimator == 'BalancedRandomForestClassifier':
            clf = BalancedRandomForestClassifier(
                n_estimators=params['n_estimators'],
                sampling_strategy=params['sampling_strategy'],
                random_state=42)
        elif estimator == 'BalancedBaggingClassifier':
            clf = BalancedBaggingClassifier(
                n_estimators=params['n_estimators'],
                bootstrap=params['bootstrap'],
                max_samples=params['max_samples'],
                sampling_strategy=params['sampling_strategy'],
                random_state=42)
        elif estimator == 'EasyEnsembleClassifier':
            clf = EasyEnsembleClassifier(
                n_estimators=params['n_estimators'],
                sampling_strategy=params['sampling_strategy'],
                random_state=42)
        else:
            # Fail early with a clear message instead of hitting an unbound
            # `clf` a few lines below.
            raise ValueError('Unknown estimator: {!r}'.format(estimator))

        clf.fit(train_x, train_y)
        cross_val_scores = self.calc_cross_val_scores(clf, train_x, train_y,
                                                      clf_type, question)

        predicted_labels = clf.predict(test_x)

        tn, fp, fn, tp = confusion_matrix(test_y, predicted_labels).ravel()
        specificity = round((tn / (tn + fp)) * 100, 2)

        # Second column of predict_proba, used as the positive-class score.
        predicted_prob = clf.predict_proba(test_x)
        predicted_prob_true = [p[1] for p in predicted_prob]

        estimator_scores['Question'] = question
        estimator_scores['Accuracy'] = round(
            accuracy_score(test_y, predicted_labels) * 100, 2)
        estimator_scores['Balanced Accuracy'] = round(
            balanced_accuracy_score(test_y, predicted_labels) * 100, 2)
        estimator_scores['Precision'] = round(
            precision_score(test_y, predicted_labels) * 100, 2)
        estimator_scores['Recall'] = round(
            recall_score(test_y, predicted_labels) * 100, 2)
        estimator_scores['Specificity'] = specificity
        estimator_scores['F1'] = round(f1_score(test_y, predicted_labels), 2)
        estimator_scores['ROC AUC'] = round(
            roc_auc_score(test_y, predicted_prob_true), 2)

        return cross_val_scores, estimator_scores
示例#7
0
class BaselineRandomForest(BaseClassifier):
    """Single flat random forest over the preprocessed features.

    The forest is trained on the ``classALeRCE`` column of the labels frame.
    The column order seen at fit time is remembered in ``feature_list`` and
    re-applied at prediction time.
    """

    def __init__(self):
        # max_features='sqrt' is what the former 'auto' alias meant for
        # classifiers; 'auto' itself was removed in scikit-learn 1.3.
        self.random_forest_classifier = RandomForestClassifier(
            n_estimators=500,
            max_features='sqrt',
            max_depth=None,
            n_jobs=1,
            class_weight=None,
            criterion='entropy',
            min_samples_split=2,
            min_samples_leaf=1)
        self.feature_preprocessor = FeaturePreprocessor()
        # Set by fit() / load_model(): column order the forest was trained on.
        self.feature_list = None
        self.model_filename = 'baseline_rf.pkl'

    def fit(self, samples: pd.DataFrame, labels: pd.DataFrame):
        """Preprocess ``samples`` and train the forest on matching labels.

        Args:
            samples: Feature frame, one row per object.
            labels: Frame with a ``classALeRCE`` column, indexed like
                ``samples``.
        """
        samples = self.feature_preprocessor.preprocess_features(samples)
        samples = self.feature_preprocessor.remove_duplicates(samples)

        # Keep only the objects present in both frames.
        samples, labels = intersect_oids_in_dataframes(samples, labels)

        self.feature_list = samples.columns
        samples_np_array = samples.values
        # Re-align labels to the sample order before training.
        labels_np_array = labels['classALeRCE'].loc[samples.index].values
        self.random_forest_classifier.fit(samples_np_array, labels_np_array)

    def predict_proba(self, samples: pd.DataFrame) -> pd.DataFrame:
        """Return class probabilities, one row per sample.

        Args:
            samples: Feature frame containing at least the columns in
                ``feature_list``.

        Returns:
            Frame indexed by the sample index (index name ``'oid'``) with one
            column per class known to the forest.
        """
        samples = self.feature_preprocessor.preprocess_features(samples)
        # Select the training columns in their training order.
        samples_np_array = samples[self.feature_list].values
        predicted_probs = self.random_forest_classifier.predict_proba(
            samples_np_array)
        predicted_probs_df = pd.DataFrame(predicted_probs,
                                          columns=self.get_list_of_classes(),
                                          index=samples.index.values)
        predicted_probs_df.index.name = 'oid'
        return predicted_probs_df

    def get_list_of_classes(self) -> list:
        """Return the forest's ``classes_`` (column order of predict_proba)."""
        return self.random_forest_classifier.classes_

    def save_model(self, directory: str) -> None:
        """Pickle the forest and the feature list into ``directory``."""
        with open(os.path.join(directory, self.model_filename), 'wb') as f:
            pickle.dump(self.random_forest_classifier, f,
                        pickle.HIGHEST_PROTOCOL)
        with open(os.path.join(directory, 'feature_list.pkl'), 'wb') as f:
            pickle.dump(self.feature_list, f, pickle.HIGHEST_PROTOCOL)

    def load_model(self, directory: str) -> None:
        """Restore the forest and the feature list saved by ``save_model``.

        NOTE(security): unpickling executes arbitrary code; only load model
        files from a trusted location.
        """
        rf = pd.read_pickle(os.path.join(directory, self.model_filename))
        self.random_forest_classifier = rf
        self.feature_list = pd.read_pickle(
            os.path.join(directory, 'feature_list.pkl'))
示例#8
0
        def objective(trial):
            """Optuna objective: validation ROC AUC of a balanced forest.

            Column ``'41'`` of the train / validation / test frames is the
            target; every other column is a feature. The model is trained on
            the training frame, its ROC AUC on the test frame is printed, and
            its ROC AUC on the validation frame is returned.

            Args:
                trial: Optuna trial used to sample the hyper-parameters.

            Returns:
                ROC AUC on the validation frame.
            """
            target_column = '41'

            def split_features_target(frame):
                # Everything except the target column is a feature.
                feature_mask = frame.columns != target_column
                return (frame.loc[:, feature_mask].values,
                        frame[target_column].values)

            train_X, train_y = split_features_target(self.df_train_media)
            val_X, val_y = split_features_target(self.df_validation_media)
            test_X, test_y = split_features_target(self.df_test_media)

            tree_counts = [250, 500, 1000, 1500, 3000, 3500, 4000]

            # Parameters are requested in a fixed order; a dict literal
            # evaluates its values top to bottom.
            sampled = {
                'n_estimators':
                trial.suggest_categorical('n_estimators', tree_counts),
                'max_features':
                trial.suggest_uniform('max_features', 0.15, 1.0),
                'min_samples_split':
                trial.suggest_int('min_samples_split', 2, 16),
                'min_samples_leaf':
                trial.suggest_int('min_samples_leaf', 1, 16),
                'min_weight_fraction_leaf':
                trial.suggest_uniform('min_weight_fraction_leaf', 0, 0.5),
                'max_depth':
                trial.suggest_int('max_depth', 2, 32),
            }

            brfmodel = BalancedRandomForestClassifier(bootstrap=True,
                                                      **sampled)
            brfmodel.fit(train_X, train_y)

            val_scores = brfmodel.predict_proba(val_X)[:, 1]
            aucbrf = roc_auc_score(val_y, val_scores)

            test_scores = brfmodel.predict_proba(test_X)[:, 1]
            print("Test AUC: " + str(roc_auc_score(test_y, test_scores)))

            return aucbrf
示例#9
0
def balanced_random_forest(train_features,
                           train_labels,
                           test_features,
                           feature_list=None,
                           hfo_type_name=None):
    """Train a balanced random forest and predict on the test features.

    Args:
        train_features: Training feature matrix.
        train_labels: Training labels.
        test_features: Feature matrix to predict on.
        feature_list: Currently unused by this function.
        hfo_type_name: Currently unused by this function.

    Returns:
        ``(predictions, probabilities, model)``: the predicted labels,
        column 1 of ``predict_proba`` for the test features, and the fitted
        classifier.
    """
    # Fixed seed for reproducibility; n_jobs=-1 uses every available core.
    forest = BalancedRandomForestClassifier(random_state=32, n_jobs=-1)
    forest.fit(train_features, train_labels)

    predicted_labels = forest.predict(test_features)
    class_probabilities = forest.predict_proba(test_features)
    positive_probabilities = class_probabilities[:, 1]

    return predicted_labels, positive_probabilities, forest
示例#10
0
# Text report (precision / recall / F1 / support) for predictions `y_pred_rf`,
# which are produced outside this excerpt.
from sklearn.metrics import classification_report
cr_rf = classification_report(y_test, y_pred_rf)
print(cr_rf)

# Visual classification report for the model `rf` (defined outside this
# excerpt): fit on the training data, score on the test data, then render.
visualizer = ClassificationReport(rf, classes=['Buy', 'No Buy'], support=True)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
g = visualizer.poof()

# -------------------- Balanced Random Forest --------------------
# Fixed random_state so repeated runs give the same forest.
from imblearn.ensemble import BalancedRandomForestClassifier
brf = BalancedRandomForestClassifier(class_weight= "balanced", random_state = 0)
brf.fit(X_train, y_train)

# ROC curves from the per-class probabilities on the test set.
predicted_probas = brf.predict_proba(X_test)
import scikitplot as skplt
skplt.metrics.plot_roc(y_test, predicted_probas)
plt.show()

# Precision-recall curves from the same probabilities.
skplt.metrics.plot_precision_recall_curve(y_test, predicted_probas)
plt.show()

# Hard label predictions on the test set.
y_pred_brf = brf.predict(X_test)
# NOTE(review): `predictions` is not used anywhere in this excerpt; confirm
# whether later code needs the rounded values.
predictions = [round(value) for value in y_pred_brf]

# F1 score of the balanced forest on the test set.
from sklearn.metrics import f1_score
f1_brf = f1_score(y_test, y_pred_brf)
示例#11
0
def train_classifier(
    _train_df_x,
    train_df_y,
    _val_df_x,
    val_df_y,
    lcset_info,
    nan_mode=NAN_MODE,
):
    """Grid-search a balanced random forest and keep the best model.

    The grid covers ``criterion`` (gini, entropy), ``max_depth`` (5 down to
    1) and ``max_samples`` (6 values from 0.1 to 0.9). Every candidate is
    trained on the cleaned training frame and scored on the cleaned
    validation frame. The model with the highest ``'b-f1score'`` is kept.

    Args:
        _train_df_x: Raw training features; cleaned with ``clean_df_nans``.
        train_df_y: Training labels frame with a ``'_y'`` column.
        _val_df_x: Raw validation features; cleaned with the values returned
            for the training frame.
        val_df_y: Validation labels frame with a ``'_y'`` column.
        lcset_info: Dict providing ``'class_names'``.
        nan_mode: NaN handling mode forwarded to ``clean_df_nans``.

    Returns:
        Dict with the best model (``'rf'``), the training fill values
        (``'mean_train_df_x'``), ``'null_cols'``, the feature names
        (``'features'``) and the feature-importance ranking (``'rank'``).
    """
    class_names = lcset_info['class_names']
    # Honour the caller's nan_mode rather than the module-level default.
    train_df_x, mean_train_df_x, null_cols = clean_df_nans(_train_df_x,
                                                           mode=nan_mode)
    features = list(train_df_x.columns)

    # None of these inputs change across grid iterations, so prepare the
    # training arrays and the cleaned validation set once, up front.
    val_df_x, _, _ = clean_df_nans(_val_df_x,
                                   mode=nan_mode,
                                   df_values=mean_train_df_x)
    x_train = train_df_x.values
    y_train = train_df_y[['_y']].values[..., 0]
    x_val = val_df_x.values
    y_true = val_df_y[['_y']].values[..., 0]

    best_rf = None
    best_rf_metric = -np.inf
    for criterion in ['gini', 'entropy']:
        for max_depth in [1, 2, 3, 4, 5][::-1]:
            for max_samples in np.linspace(.1, .9, 6):
                rf = BalancedRandomForestClassifier(
                    n_jobs=N_JOBS,
                    criterion=criterion,
                    max_depth=max_depth,
                    n_estimators=512,
                    max_samples=max_samples,
                    # 'sqrt' is what the former 'auto' alias meant for
                    # classifiers; 'auto' was removed in scikit-learn 1.3.
                    max_features='sqrt',
                    bootstrap=True,
                )
                rf.fit(x_train, y_train)
                y_pred_p = rf.predict_proba(x_val)
                metrics_cdict, metrics_dict, cm = get_multiclass_metrics(
                    y_pred_p, y_true, class_names)
                rf_metric = metrics_dict['b-f1score']  # model-selection metric
                recall = {c: metrics_cdict[c]['recall'] for c in class_names}
                print(
                    f'samples={len(train_df_y)}: features={len(features)}; criterion={criterion}; max_depth={max_depth}; max_samples={max_samples}; rf_metric={rf_metric}; best_rf_metric={best_rf_metric}; recall={recall}'
                )
                # Strict '>' keeps the earliest candidate on ties.
                if rf_metric > best_rf_metric:
                    best_rf = rf
                    best_rf_metric = rf_metric

    # Rank the features of the winning model by importance.
    rank = TopRank('features', n=30)
    rank.add_list(features, best_rf.feature_importances_)
    rank.calcule()
    print(rank)
    d = {
        'rf': best_rf,
        'mean_train_df_x': mean_train_df_x,
        'null_cols': null_cols,
        'features': features,
        'rank': rank,
    }
    return d
示例#12
0
    else:
        finite_idx = np.where(np.isfinite(column))[0]
    x = vectors[finite_idx, :]
    y = column[finite_idx]
    if y.sum() == 0 or y.sum() == len(y):
        print("%15s: undefined" % (name))
        continue
    train_x, test_x, train_y, test_y = train_test_split(x,
                                                        y,
                                                        test_size=0.2,
                                                        stratify=y)

    if args.brf:
        rf = BalancedRandomForestClassifier(n_estimators=100, n_jobs=4)
    else:
        rf = RandomForestClassifier(n_estimators=100, n_jobs=4)

    rf.fit(train_x, train_y)
    p_te = rf.predict_proba(test_x)
    auc_te = roc_auc_score(test_y, p_te[:, 1])
    bacc = balanced_accuracy_score(test_y, p_te[:, 1].round(0))
    print("%15s: %3.5f %3.5f" % (name, auc_te, bacc))
    bacc_av += bacc
    auc_av += auc_te

    if not (args.save is None):
        gzpickle(args.save + '_%i.pkz' % i, rf)

print('Averages:')
print('AUC: %8.3f   BAcc: %8.3f' % (auc_av / (i + 1), bacc_av / (i + 1)))
示例#13
0
def _read_feature_table(path, first_value_col):
    """Parse a tab-separated feature file, skipping its first (header) line.

    Returns ``(leading, values)``: for every data row, the raw string columns
    before ``first_value_col`` and the remaining columns converted to float.
    """
    leading = []
    values = []
    # The context manager closes the file even if a row fails to parse.
    with open(path) as handle:
        next(handle, None)  # skip the header line
        for line in handle:
            entries = line.rstrip().split('\t')
            leading.append(entries[:first_value_col])
            values.append(list(map(float, entries[first_value_col:])))
    return leading, values


def main():
    """Train balanced random forests and write test-set probabilities.

    Steps:
      1. Load the training table (column 1 = integer label, columns 2+ =
         features) and the test table (column 0 = id, columns 1+ = features).
      2. Hold out 33% of the training data, oversample the rest with
         BorderlineSMOTE, fit a forest and print its ROC AUC on the hold-out.
      3. Oversample the full training data, fit a second forest and write
         ``<id>,<probability from column 1>`` per test row to
         ``featuresall_pred3.txt``.
    """
    train_meta, data = _read_feature_table(
        "trainingData/featuresall_train.txt", 2)
    label = [int(columns[1]) for columns in train_meta]

    test_meta, extractTest = _read_feature_table(
        "testingData/featuresall_test.txt", 1)
    extractIds = [columns[0] for columns in test_meta]

    trainLabel = np.array(label)
    trainData = np.array(data)

    testData = np.asarray(extractTest)

    ###########################################################
    # Hold-out estimate of the model quality.

    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        trainData, trainLabel, test_size=0.33)

    # Oversample only the training part; the hold-out stays untouched.
    X_train, y_train = BorderlineSMOTE().fit_resample(X_train, y_train)

    clf_train = BalancedRandomForestClassifier(n_estimators=100, max_depth=5)
    clf_train = clf_train.fit(X_train, y_train)

    y_dt_pred_train = clf_train.predict_proba(X_test)

    onlyPKpredictions_train = y_dt_pred_train[:, 1]
    fpr, tpr, thresholds = metrics.roc_curve(y_test, onlyPKpredictions_train)
    print("accuracy BRFC AUC:", metrics.auc(fpr, tpr))

    ###########################################################
    # Final model on all training data, applied to the test table.

    X_train, y_train = BorderlineSMOTE().fit_resample(trainData, trainLabel)

    clf_test = BalancedRandomForestClassifier(n_estimators=100, max_depth=5)
    clf_test = clf_test.fit(X_train, y_train)

    y_dt_pred_test = clf_test.predict_proba(testData)

    onlyPKpredictions_test = y_dt_pred_test[:, 1]

    ###########################################################

    with open('featuresall_pred3.txt', 'w') as o:
        for sample_id, probability in zip(extractIds, onlyPKpredictions_test):
            o.write(sample_id + "," + str(probability))
            o.write('\n')
示例#14
0
    y_pred2 = y_pred[:, -2]
    return metrics.confusion_matrix(y_test, y_pred1), metrics.confusion_matrix(
        y_test, y_pred2)


fields = list(label2int.index) + ["rank"]
records = []

# Leave-one-out cross-validation on the training set: for every sample,
# train on all the others and record the most probable and second most
# probable class for the held-out sample.
loo = LeaveOneOut()
y_cv_pred1 = []
y_cv_pred2 = []
y_cv = []
for train_idx, test_idx in tqdm(loo.split(X_train)):
    clf = BalancedRandomForestClassifier(n_estimators=500, random_state=seed)
    clf.fit(X_train[train_idx, :], y_train[train_idx])
    # argsort yields column positions of predict_proba (ascending
    # probability). They index clf.classes_, which lacks any class missing
    # from this fold's training data (e.g. a class whose only sample is the
    # one held out), so translate positions into labels explicitly.
    order = clf.predict_proba(X_train[test_idx, :]).argsort(axis=1)
    ranked_labels = clf.classes_[order]
    y_cv_pred1.append(ranked_labels[0, -1])
    y_cv_pred2.append(ranked_labels[0, -2])
    y_cv.append(y_train[test_idx][0])
y_cv_pred1 = np.array(y_cv_pred1)
y_cv_pred2 = np.array(y_cv_pred2)
y_cv = np.array(y_cv)

# Confusion matrices for the first and second guess, labelled by class name.
c1 = metrics.confusion_matrix(y_cv, y_cv_pred1)
c2 = metrics.confusion_matrix(y_cv, y_cv_pred2)
df1 = pd.DataFrame(data=c1, index=label2int.index, columns=label2int.index)
df2 = pd.DataFrame(data=c2, index=label2int.index, columns=label2int.index)

# Normalise each row by its total so that cells are per-true-class rates.
acc1 = df1 / df1.sum(axis=1).values.reshape((-1, 1))
acc2 = df2 / df2.sum(axis=1).values.reshape((-1, 1))
acc1.to_csv(args.confusion_loo, sep="\t")

# Diagonals: per-class top-1 rate, and top-2 rate (first or second guess).
top1 = list(np.diagonal(acc1.values)) + ["top1-loo"]
top2 = list(np.diagonal((acc1 + acc2).values)) + ["top2-loo"]
示例#15
0
class HierarchicalRandomForest(BaseClassifier):
    """Two-level random forest: a top classifier plus three specialists.

    The top forest predicts one of the top classes ``'Stochastic'``,
    ``'Periodic'`` or ``'Transient'``. One specialist forest per top class
    predicts the ``classALeRCE`` label among the samples of that branch. The
    final probability of a class is the specialist's probability multiplied
    by the top forest's probability of the branch.
    """
    MODEL_NAME = "hierarchical_random_forest"
    MODEL_VERSION = "1.0.0"
    MODEL_VERSION_NAME = f"{MODEL_NAME}_{MODEL_VERSION}"
    MODEL_PICKLE_PATH = os.path.join(PICKLE_PATH, f"{MODEL_VERSION_NAME}")

    def __init__(self, taxonomy_dictionary, non_used_features=None):
        """Build the four untrained forests.

        Args:
            taxonomy_dictionary: Taxonomy passed to ``invert_dictionary``;
                the inverted form is used to map each ``classALeRCE`` label
                to its top class.
            non_used_features: Forwarded to ``FeaturePreprocessor``.
        """
        n_trees = 500
        # max_features='sqrt' is what the former 'auto' alias meant for
        # classifiers; 'auto' itself was removed in scikit-learn 1.3.
        self.top_classifier = RandomForestClassifier(n_estimators=n_trees,
                                                     max_depth=None,
                                                     max_features='sqrt')

        self.stochastic_classifier = RandomForestClassifier(
            n_estimators=n_trees, max_depth=None, max_features=0.2)

        self.periodic_classifier = RandomForestClassifier(n_estimators=n_trees,
                                                          max_depth=None,
                                                          max_features='sqrt')

        self.transient_classifier = RandomForestClassifier(
            n_estimators=n_trees, max_depth=None, max_features='sqrt')

        self.feature_preprocessor = FeaturePreprocessor(
            non_used_features=non_used_features)

        self.taxonomy_dictionary = taxonomy_dictionary
        # Set by fit() / load_model(): column order the forests were trained on.
        self.feature_list = None
        self.inverted_dictionary = invert_dictionary(self.taxonomy_dictionary)
        # File names used by save_model / load_model / download_model.
        self.pickles = {
            "features_list": "features_RF_model.pkl",
            "top_rf": "hierarchical_level_RF_model.pkl",
            "periodic_rf": "periodic_level_RF_model.pkl",
            "stochastic_rf": "stochastic_level_RF_model.pkl",
            "transient_rf": "transient_level_RF_model.pkl"
        }
        self.url_model = f"https://assets.alerce.online/pipeline/hierarchical_rf_{self.MODEL_VERSION}/"

    def fit(self, samples: pd.DataFrame, labels: pd.DataFrame) -> None:
        """Train the top forest and the three specialist forests.

        Args:
            samples: Feature frame, one row per object.
            labels: Frame with a ``classALeRCE`` column.

        Raises:
            Exception: If a label is not a key of the inverted taxonomy.
        """
        # Work on a copy: a 'top_class' column is added below.
        labels = labels.copy()

        # Check that the received labels are in the taxonomy
        feeded_labels = labels.classALeRCE.unique()
        expected_labels = self.inverted_dictionary.keys()

        for label in feeded_labels:
            if label not in expected_labels:
                raise Exception(f'{label} is not in the taxonomy dictionary')

        # Create top class
        labels['top_class'] = labels['classALeRCE'].map(
            self.inverted_dictionary)

        # Preprocessing
        samples = self.feature_preprocessor.preprocess_features(samples)
        samples = self.feature_preprocessor.remove_duplicates(samples)
        samples, labels = intersect_oids_in_dataframes(samples, labels)

        # Save list of features to know their order
        self.feature_list = samples.columns

        # Train top classifier
        self.top_classifier.fit(samples.values, labels['top_class'].values)

        # Train specialized classifiers, each on the rows of its own branch
        is_stochastic = labels['top_class'] == 'Stochastic'
        self.stochastic_classifier.fit(
            samples[is_stochastic].values,
            labels[is_stochastic]['classALeRCE'].values)

        is_periodic = labels['top_class'] == 'Periodic'
        self.periodic_classifier.fit(samples[is_periodic].values,
                                     labels[is_periodic]['classALeRCE'].values)

        is_transient = labels['top_class'] == 'Transient'
        self.transient_classifier.fit(
            samples[is_transient].values,
            labels[is_transient]['classALeRCE'].values)

    def check_missing_features(self, columns, feature_list):
        """Return the set of names in ``feature_list`` absent from ``columns``."""
        missing = set(feature_list).difference(set(columns))
        return missing

    def predict_proba(self, samples: pd.DataFrame) -> pd.DataFrame:
        """Return the combined class probabilities for every sample.

        Args:
            samples: Feature frame containing every trained feature.

        Returns:
            Frame indexed like ``samples`` with one column per class, in the
            order given by ``get_list_of_classes``.

        Raises:
            Exception: If any trained feature is missing from ``samples``.
        """
        missing = self.check_missing_features(samples.columns,
                                              self.feature_list)
        if len(missing) > 0:
            raise Exception(f"Missing features: {missing}")

        # Select the training columns in their training order.
        samples = samples[self.feature_list]
        samples = self.feature_preprocessor.preprocess_features(samples)

        top_probs = self.top_classifier.predict_proba(samples.values)

        stochastic_probs = self.stochastic_classifier.predict_proba(
            samples.values)
        periodic_probs = self.periodic_classifier.predict_proba(samples.values)
        transient_probs = self.transient_classifier.predict_proba(
            samples.values)

        # Locate each branch's column in the top forest's output.
        top_classes = self.top_classifier.classes_.tolist()
        stochastic_index = top_classes.index('Stochastic')
        periodic_index = top_classes.index('Periodic')
        transient_index = top_classes.index('Transient')

        # Weight each specialist's output by the probability of its branch;
        # the reshape to a column broadcasts over the specialist's classes.
        stochastic_probs = stochastic_probs * top_probs[:, stochastic_index].reshape(
            [-1, 1])
        periodic_probs = periodic_probs * top_probs[:, periodic_index].reshape(
            [-1, 1])
        transient_probs = transient_probs * top_probs[:, transient_index].reshape(
            [-1, 1])

        # Same branch order as get_list_of_classes().
        final_probs = np.concatenate(
            [stochastic_probs, periodic_probs, transient_probs], axis=1)

        df = pd.DataFrame(data=final_probs,
                          index=samples.index,
                          columns=self.get_list_of_classes())
        df.index.name = samples.index.name
        return df

    def get_list_of_classes(self) -> list:
        """Return all classes: stochastic, then periodic, then transient."""
        final_columns = (self.stochastic_classifier.classes_.tolist() +
                         self.periodic_classifier.classes_.tolist() +
                         self.transient_classifier.classes_.tolist())
        return final_columns

    def save_model(self, directory: str) -> None:
        """Pickle the four forests and the feature list into ``directory``."""
        with open(os.path.join(directory, self.pickles['top_rf']), 'wb') as f:
            pickle.dump(self.top_classifier, f, pickle.HIGHEST_PROTOCOL)

        with open(os.path.join(directory, self.pickles['stochastic_rf']),
                  'wb') as f:
            pickle.dump(self.stochastic_classifier, f, pickle.HIGHEST_PROTOCOL)

        with open(os.path.join(directory, self.pickles['periodic_rf']),
                  'wb') as f:
            pickle.dump(self.periodic_classifier, f, pickle.HIGHEST_PROTOCOL)

        with open(os.path.join(directory, self.pickles['transient_rf']),
                  'wb') as f:
            pickle.dump(self.transient_classifier, f, pickle.HIGHEST_PROTOCOL)

        with open(os.path.join(directory, self.pickles['features_list']),
                  'wb') as f:
            pickle.dump(self.feature_list, f, pickle.HIGHEST_PROTOCOL)

    def load_model(self, directory: str) -> None:
        """Restore the forests and the feature list from ``directory``.

        NOTE(security): unpickling executes arbitrary code; only load model
        files from a trusted location.
        """
        self.top_classifier = pd.read_pickle(
            os.path.join(directory, self.pickles['top_rf']))
        self.stochastic_classifier = pd.read_pickle(
            os.path.join(directory, self.pickles['stochastic_rf']))
        self.periodic_classifier = pd.read_pickle(
            os.path.join(directory, self.pickles['periodic_rf']))
        self.transient_classifier = pd.read_pickle(
            os.path.join(directory, self.pickles['transient_rf']))
        self.feature_list = pd.read_pickle(
            os.path.join(directory, self.pickles['features_list']))

    def download_model(self):
        """Download any pickle not yet present in ``MODEL_PICKLE_PATH``."""
        if not os.path.exists(self.MODEL_PICKLE_PATH):
            os.makedirs(self.MODEL_PICKLE_PATH)
        for pkl in self.pickles.values():
            tmp_path = os.path.join(self.MODEL_PICKLE_PATH, pkl)
            if not os.path.exists(tmp_path):
                # url_model ends with '/', so plain concatenation yields the
                # file URL; URLs should not be built with filesystem joins.
                wget.download(f"{self.url_model}{pkl}", tmp_path)

    def predict_in_pipeline(self, input_features: pd.DataFrame) -> dict:
        """Classify exactly one object and return a detailed response.

        Args:
            input_features: One-row frame, or a Series that is converted to
                a one-row frame.

        Returns:
            Dict with the keys ``'hierarchical'`` (top-level probabilities
            under ``'top'`` and each specialist's unweighted probabilities
            under ``'children'``), ``'probabilities'`` (branch-weighted
            probability of every class) and ``'class'`` (the class with the
            highest weighted probability).

        Raises:
            ValueError: If more or fewer than one row is received.
            Exception: If any trained feature is missing.
        """
        if isinstance(input_features, pd.Series):
            input_features = input_features.to_frame().transpose()
        if len(input_features) != 1:
            raise ValueError(
                'predict_in_pipeline receives features one by one')

        missing = self.check_missing_features(input_features.columns,
                                              self.feature_list)
        if len(missing) > 0:
            raise Exception(f"Missing features: {missing}")

        # Select the training columns in their training order.
        input_features = input_features[self.feature_list]

        input_features = self.feature_preprocessor.preprocess_features(
            input_features)
        prob_root = pd.DataFrame(
            self.top_classifier.predict_proba(input_features),
            columns=self.top_classifier.classes_,
            index=input_features.index)

        prob_children = []
        resp_children = {}

        child_models = [
            self.stochastic_classifier, self.periodic_classifier,
            self.transient_classifier
        ]
        child_names = ['Stochastic', 'Periodic', 'Transient']
        for name, model in zip(child_names, child_models):
            prob_child = pd.DataFrame(model.predict_proba(input_features),
                                      columns=model.classes_,
                                      index=input_features.index)

            # Report the specialist's own probabilities before weighting.
            resp_children[name] = prob_child.iloc[0].to_dict()
            # Weight by the top forest's probability of this branch.
            prob_child = prob_child.mul(prob_root[name].values, axis="rows")
            prob_children.append(prob_child)
        prob_all = pd.concat(prob_children, axis=1, sort=False)

        return {
            "hierarchical": {
                "top": prob_root.iloc[0].to_dict(),
                "children": resp_children
            },
            "probabilities": prob_all.iloc[0].to_dict(),
            "class": prob_all.idxmax(axis=1).iloc[0]
        }
示例#16
0
def Improved_BRF_low(x_train,y_train,x_test,y_test,threshold1_low,threshold2_low,threshold3_low):
    """Train a three-stage cascade of balanced random forests ("low" variant).

    Stage 1 is fitted on all training samples.  Stage 2 is fitted only on the
    training samples whose stage-1 probability exceeds ``threshold1_low``;
    stage 3 only on those that additionally exceed ``threshold2_low`` at
    stage 2.  Each fitted model is pickled to ``BRF_clf{1,2,3}_low.pkl`` in
    the current working directory and a probability plot is produced per
    stage through ``Plot_Prob_Distribution.Plot_probability``.

    Compared with the previous implementation, the per-sample ``predict`` /
    ``predict_proba`` loops were replaced by batch calls on the boolean
    selections, and two intermediate results that were never used
    (a ``Prediction`` array and the stage-3 training probabilities) are no
    longer computed.  The returned values are unchanged.

    Args:
        x_train, x_test: feature matrices; indexed with boolean masks, so
            they are assumed to be NumPy arrays -- TODO confirm with callers.
        y_train, y_test: label vectors supporting boolean-mask indexing.
        threshold1_low, threshold2_low: probability cut-offs that decide
            which samples reach stage 2 and stage 3 respectively.
        threshold3_low: only forwarded to the stage-3 probability plot.

    Returns:
        tuple ``(y_pred1, y_Pred)`` where ``y_pred1`` is the plain stage-1
        prediction for ``x_test`` and ``y_Pred`` is
        ``sign(score - 0.5)`` of the cascade score: ``-1`` for samples
        rejected by the cascade or scoring below 0.5, ``+1`` above 0.5 and
        ``0`` at exactly 0.5.
    """
    clf1 = BalancedRandomForestClassifier(
        max_leaf_nodes=20, n_estimators=60, criterion='entropy',
        min_samples_leaf=20, min_samples_split=50, max_depth=7,
        oob_score=True, random_state=10)

    clf2 = BalancedRandomForestClassifier(
        max_leaf_nodes=20, max_features=10, n_estimators=60,
        criterion='entropy', min_samples_leaf=10, min_samples_split=30,
        max_depth=9, oob_score=True, random_state=10)

    clf3 = BalancedRandomForestClassifier(
        max_leaf_nodes=20, max_features=14, n_estimators=40,
        criterion='entropy', min_samples_leaf=10, min_samples_split=50,
        max_depth=7, oob_score=True, random_state=10)

    # ---------------------------------------------------------- stage 1
    print('################################################## Data frist Classifier')
    print('Train Clients %s'%Counter(y_train))
    print('Test Clients %s'%Counter(y_test))

    clf1.fit(x_train, y_train)
    with open('BRF_clf1_low.pkl', 'wb') as f:
        pickle.dump(clf1, f, pickle.HIGHEST_PROTOCOL)

    y_pred1 = clf1.predict(x_test)
    y_prob1 = clf1.predict_proba(x_test)[:, 1]
    y_prob1_train = clf1.predict_proba(x_train)[:, 1]

    Plot_Prob_Distribution.Plot_probability(y_test, y_prob1, threshold1_low, threshold1_low)

    # ---------------------------------------------------------- stage 2
    print('################################################## Data second Classifier')

    train_choix_bool = y_prob1_train > threshold1_low
    test_choix_bool = y_prob1 > threshold1_low
    print('Train Clients %s'%Counter(y_train[train_choix_bool]))
    print('Test Clients %s'%Counter(y_test[test_choix_bool]))

    clf2.fit(x_train[train_choix_bool], y_train[train_choix_bool])
    with open('BRF_clf2_low.pkl', 'wb') as f:
        pickle.dump(clf2, f, pickle.HIGHEST_PROTOCOL)

    y_prob2_selected = clf2.predict_proba(x_test[test_choix_bool])[:, 1]
    Plot_Prob_Distribution.Plot_probability(y_test[test_choix_bool], y_prob2_selected, threshold2_low, threshold2_low)

    # Full-length stage-2 probabilities: 0 for samples that never reached
    # stage 2, the model output otherwise.
    y_prob2_train = np.zeros(len(x_train))
    y_prob2_train[train_choix_bool] = clf2.predict_proba(x_train[train_choix_bool])[:, 1]
    y_prob2 = np.zeros(len(x_test))
    y_prob2[test_choix_bool] = y_prob2_selected

    # ---------------------------------------------------------- stage 3
    print('################################################## Data third Classifier')

    train_choix_bool = (y_prob1_train > threshold1_low) & (y_prob2_train > threshold2_low)
    test_choix_bool = (y_prob1 > threshold1_low) & (y_prob2 > threshold2_low)

    print('Train Clients %s'%Counter(y_train[train_choix_bool]))
    print('Test Clients %s'%Counter(y_test[test_choix_bool]))

    clf3.fit(x_train[train_choix_bool], y_train[train_choix_bool])
    with open('BRF_clf3_low.pkl', 'wb') as f:
        pickle.dump(clf3, f, pickle.HIGHEST_PROTOCOL)

    y_prob3_selected = clf3.predict_proba(x_test[test_choix_bool])[:, 1]
    Plot_Prob_Distribution.Plot_probability(y_test[test_choix_bool], y_prob3_selected, threshold3_low, threshold3_low)

    y_prob3 = np.zeros(len(x_test))
    y_prob3[test_choix_bool] = y_prob3_selected

    # ---------------------------------------------------- cascade score
    # A sample is rejected (score -1) when stage 1 is below its threshold or
    # the mean of stages 1-2 is below the second threshold; otherwise the
    # score is the mean of the three stage probabilities.
    # NOTE(review): these tests use strict '<' while the stage selections
    # above use '>', so a probability exactly equal to a threshold is not
    # rejected here although it was not forwarded to the next stage -- kept
    # as in the original, confirm whether that is intended.
    rejected = (y_prob1 < threshold1_low) | ((y_prob1 + y_prob2) / 2 < threshold2_low)
    y_Prob = np.where(rejected, -1.0, (y_prob1 + y_prob2 + y_prob3) / 3)

    y_Pred = np.sign(y_Prob - 0.5)

    return y_pred1, y_Pred
示例#17
0
    def random_boruta(self):
        """Run BorutaShap feature selection for every stored RF parameter set.

        Reads ``<result_folder>/param_RF_<epoch>.json`` (a list of parameter
        dicts, each carrying a ``'value'`` entry that is discarded), builds a
        ``BalancedRandomForestClassifier`` per dict and, for each of 5
        stratified shuffled folds of ``self.X`` / ``self.y``:

        * imputes missing values with medians learned on the training fold,
        * runs BorutaShap on the training fold and plots the result,
        * prints ROC-AUC on the held-out fold and on
          ``self.X_validation`` / ``self.y_validation``, first with all
          features and then with the selected subset only.

        The imputer is fitted once per fold on the training part and reused
        for the held-out fold and the validation set.  Previously a single
        imputer object was refitted on the held-out fold and on
        ``self.X_test`` before being applied to the validation set, so the
        validation data was not imputed with the training medians.

        Nothing is returned; results are printed and plotted.
        """
        params_path = self.result_folder + '/param_RF_{}.json'.format(
            self.epoch)
        # Load everything up front so the file is not held open while the
        # (long) selection runs.
        with open(params_path) as f:
            dati = json.load(f)

        for data in dati:
            # 'value' is not a classifier parameter (presumably the score
            # stored with the parameter set -- confirm against the writer of
            # this file).
            del data['value']

            brfmodel = BalancedRandomForestClassifier(**data)

            cv = StratifiedKFold(n_splits=5, shuffle=True)

            for train_index, test_index in cv.split(self.X, self.y):

                X_train = self.X.iloc[train_index]
                X_test = self.X.iloc[test_index]
                y_train = np.take(self.y, train_index)
                y_test = np.take(self.y, test_index)

                # np.nan rather than np.NaN: the latter alias was removed in
                # NumPy 2.0.
                imputer = SimpleImputer(missing_values=np.nan,
                                        strategy='median')
                imputer.fit(X_train)

                X_train = pd.DataFrame(imputer.transform(X_train),
                                       columns=X_train.columns,
                                       index=X_train.index)
                X_test = pd.DataFrame(imputer.transform(X_test),
                                      columns=X_test.columns,
                                      index=X_test.index)
                valx = pd.DataFrame(imputer.transform(self.X_validation),
                                    columns=self.X_validation.columns,
                                    index=self.X_validation.index)

                Feature_Selector = BorutaShap(model=brfmodel,
                                              importance_measure='shap',
                                              percentile=85,
                                              pvalue=0.08,
                                              classification=True)

                Feature_Selector.fit(X_train,
                                     y_train,
                                     n_trials=200,
                                     random_state=0)
                Feature_Selector.TentativeRoughFix()

                Feature_Selector.plot(X_size=12,
                                      figsize=(12, 8),
                                      y_scale='log',
                                      which_features='all')

                Xstrain = Feature_Selector.Subset()
                selected = list(Xstrain.columns)
                print('features selected', selected)

                # Baseline: all features.
                print('AUC')
                brfmodel.fit(X_train, y_train)
                roc = roc_auc_score(y_test,
                                    brfmodel.predict_proba(X_test)[:, 1])
                print(roc)

                print('AUC Validation')
                roc_test = roc_auc_score(
                    self.y_validation,
                    brfmodel.predict_proba(valx)[:, 1])
                print(roc_test)

                # Same model refitted on the selected features only.
                print('AUC ridotte')
                brfmodel.fit(Xstrain, y_train)
                roc = roc_auc_score(
                    y_test,
                    brfmodel.predict_proba(X_test[selected])[:, 1])
                print(roc)

                roc_test = roc_auc_score(
                    self.y_validation,
                    brfmodel.predict_proba(valx[selected])[:, 1])
                print(roc_test)
示例#18
0
# Notebook-style fragment: `weighted_clf`, `sample`, `X_tr`, `y_tr`, `X_tst`,
# `y_tst`, `test_df`, `X_test`, `y_test`, `roc_auc_score`, `f1_score` and `pd`
# are all defined outside this fragment.

# Bare expression: only displays a value in an interactive session.
weighted_clf.classes_

sample.to_csv('weighted_rfc.csv',index = False)




# Balanced random forest classifier

from imblearn.ensemble import BalancedRandomForestClassifier
brfc = BalancedRandomForestClassifier(n_estimators=500,random_state=0).fit(X_tr,y_tr)
# NOTE(review): the ROC-AUC below is computed from hard class predictions
# (`predict`), not from probabilities, and its result is neither stored nor
# printed.
roc_auc_score(y_tst,brfc.predict(X_tst))


# Predicted label and probability of the second class (column 1 of
# predict_proba) for the rows of `test_df`, without its 'risk_flag' column.
sample['risk_flag'] = brfc.predict(test_df.drop(columns = ['risk_flag']))
sample['risk_flag_proba'] = brfc.predict_proba(test_df.drop(columns = ['risk_flag']))[:,1]
weighted_clf.classes_




print("F1 Score for Balanced Random Forest Classifier is ", f1_score(y_test,brfc.predict(X_test)))
# NOTE(review): the next statement is truncated in this file (the string
# literal is never closed), so this fragment does not parse as it stands.
print("Accuracy  Score for Balanced Random Fo
      
      
# CatBoost: reuse the labels of a previously exported submission file.



submission = pd.read_csv('catboost_+_feats_+15000itrs.csv')
sample['risk_flag_cat'] = submission['risk_flag']
class BalancedBinaryClassifier(BaseEstimator, ClassifierMixin):
    """Binary classifier: feature ranking followed by a balanced random forest.

    ``fit`` scores every feature with ``selector(X, y)``, optionally masks
    features out (by trend and/or by an explicit mask), keeps the
    ``n_features`` best-scoring ones and trains a
    ``BalancedRandomForestClassifier`` on those columns only.

    Parameters:
        max_depth: ``max_depth`` of the underlying forest.
        n_features: number of features kept for the forest.
        selector: callable ``selector(X, y)`` returning one importance score
            per column (higher is better; it must return a mutable NumPy
            array because masked entries are overwritten in place).
        trend: ``"up"`` masks out features whose class-1 mean is not larger
            than the class-0 mean; ``"both_balance"`` takes half of the
            features from each direction; any other value (default
            ``"both"``) applies no trend handling.
        space_mask: optional per-feature sequence; non-zero entries mark
            features that must not be selected.

    Attributes set by ``fit``:
        classes_, importance, trend_mask, space_mask_, mask, features.
    """

    def __init__(self,
                 max_depth=None,
                 n_features=10,
                 selector=ranksum,
                 trend="both",
                 space_mask=None):
        self.max_depth = max_depth
        self.n_features = n_features
        self.selector = selector
        # Kept here (rather than in fit) so that existing code reading
        # ``model_`` on an unfitted instance keeps working.
        self.model_ = BalancedRandomForestClassifier(max_depth=max_depth,
                                                     n_estimators=100,
                                                     random_state=777)
        self.trend = trend
        self.space_mask = space_mask

    def fit(self, X, y):
        """Select features and fit the forest on them.

        Args:
            X: 2-D array-like of shape (n_samples, n_features_in).
            y: labels; the class means are computed for ``y == 1`` and
               ``y == 0``, so labels are assumed to be 0/1.

        Returns:
            self, to allow ``clf.fit(X, y).predict(X)``.

        Raises:
            ValueError: if ``y`` does not contain exactly two classes.
        """
        X, y = check_X_y(X, y)
        n_total = X.shape[1]
        self.classes_ = unique_labels(y)
        if self.classes_.shape[0] != 2:
            raise ValueError(
                'Current implementation only support binary classification')
        self.importance = self.selector(X, y)
        mean_diff = X[y == 1, :].mean(axis=0) - X[y == 0, :].mean(axis=0)

        # Trend mask: 1 marks a feature excluded because of its direction.
        self.trend_mask = np.zeros(n_total)
        use_trend = self.trend == "up"
        if use_trend:
            self.trend_mask[mean_diff <= 0] = 1
            print("Trend mask: {}/{}".format(int(self.trend_mask.sum()),
                                             n_total))

        # The constructor parameter is left untouched so that the estimator
        # can be refitted (also on data of another width) and cloned; the
        # normalised mask is stored as a fitted attribute instead.
        use_space = self.space_mask is not None
        if use_space:
            space_mask = np.array(self.space_mask).astype(int)
            print("Space mask: {}/{}".format(int(space_mask.sum()),
                                             n_total))
        else:
            space_mask = np.zeros(n_total)
        self.space_mask_ = space_mask

        if use_trend or use_space:
            self.mask = self.trend_mask + space_mask
            self.mask[self.mask > 1] = 1
            print("Remained: {}/{}".format(n_total - int(self.mask.sum()),
                                           n_total))
            # Push masked features below every real score so they sort last.
            self.importance[self.mask.astype(bool)] = self.importance.min() - 1
        else:
            self.mask = np.zeros(n_total)

        if self.trend == "both_balance":
            n_up = int(self.n_features / 2)
            n_down = self.n_features - n_up
            up_importance = copy(self.importance)
            down_importance = copy(self.importance)
            # Rank each direction separately by demoting the other one.
            up_importance[mean_diff < 0] = up_importance.min() - 1
            down_importance[mean_diff > 0] = down_importance.min() - 1
            up_order = np.argsort(up_importance)[::-1]
            down_order = np.argsort(down_importance)[::-1]
            features = np.array(
                list(up_order[:n_up]) + list(down_order[:n_down]))
            print(features)
        else:
            order = np.argsort(self.importance)[::-1]
            features = order[:self.n_features]
        self.features = features
        self.model_.fit(X[:, self.features], y)
        return self

    def predict(self, X):
        """Predict class labels using the columns selected during ``fit``."""
        # ``model_`` exists from construction on, so the fitted state has to
        # be checked on an attribute that only ``fit`` creates.
        check_is_fitted(self, "features")
        return self.model_.predict(np.asarray(X)[:, self.features])

    def predict_proba(self, X):
        """Predict class probabilities using the columns selected in ``fit``."""
        check_is_fitted(self, "features")
        return self.model_.predict_proba(np.asarray(X)[:, self.features])
示例#20
0
# Script fragment: trains and evaluates a balanced random forest.  `X`, `y`,
# `scaler`, `train_test_split`, `BalancedRandomForestClassifier`,
# `roc_auc_score`, `plot_confusion_matrix`, `pd`, `np` and `plt` come from
# outside this fragment.

# Stratified split with 31% of the rows held out for testing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.31, random_state=0,stratify=y)  
instance = X_train.sample(n=50000).values # random sample of 50000 training rows, taken to avoid a RAM error
# Scale data (standardize it)
# NOTE(review): the return values of both calls are discarded; unless
# `scaler` works in place, the classifier below is trained and evaluated on
# the unscaled X_train / X_test -- confirm whether that is intended.
scaler.fit_transform(X_train)
scaler.transform(X_test)

from imblearn.metrics import classification_report_imbalanced
clf = BalancedRandomForestClassifier(max_depth=None, random_state=0,oob_score=True,n_estimators=100)
clf.fit(X_train,y_train)


y_pred = clf.predict(X_test)
# Wrap the predictions in a single-column DataFrame before reporting.
y_pred= pd.DataFrame(y_pred)
print(classification_report_imbalanced(y_test, y_pred,     target_names=["0","1"]))

# Column 1 / column 0 of predict_proba (one column per entry of
# clf.classes_); the AUC is printed once for each column.
clf_probs = clf.predict_proba(X_test)[:,1]
clf_probs0 = clf.predict_proba(X_test)[:,0]
print(roc_auc_score(y_test, clf_probs,average="weighted"))
print(roc_auc_score(y_test, clf_probs0,average="weighted"))

np.set_printoptions(precision=2)
from sklearn.utils.multiclass import unique_labels
#classes_spr = list(unique_labels(y_test, y_pred))
# Class names are hard-coded instead of being derived from the data.
classes_spr = ["0","1"]
plot_confusion_matrix(y_test, y_pred, classes=classes_spr, normalize=False,
                      title='confusion matrix for Balanced Random Forest')
plt.show()

# Imports for code that follows this fragment (not used above).
from treeinterpreter import treeinterpreter as ti
from collections import defaultdict 
import random
示例#21
0
# Script fragment: plots feature importances and collects train-set odds.
# `train`, `test`, `indices`, `importances`, `model`, `train_year`,
# `train_team`, `train_labels`, `pd` and `plt` come from outside this
# fragment.

# Feature names in the order given by `indices`.
names = [train.columns[i] for i in indices]

# Barplot: one bar per feature, in the same order as `names`
plt.bar(range(train.shape[1]), importances[indices])
# Add feature names as x-axis labels
plt.xticks(range(train.shape[1]), names, rotation=20, fontsize=8)
plt.yticks(range(0, 35, 5), fontsize=12)
# The visibility flag is passed positionally: Matplotlib renamed this
# keyword from `b` to `visible`, so `b=None` fails on current releases
# while the positional form works on old and new ones alike.
plt.grid(None, axis='x')
# Create plot title
plt.title("Feature Importances")
# Show plot
plt.show()

# Training data predictions (labels and probability of column 1)
train_rf_predictions = model.predict(train)
train_rf_probs = model.predict_proba(train)[:, 1]

# Testing predictions (to determine performance)
rf_predictions = model.predict(test)
rf_probs = model.predict_proba(test)[:, 1]

# Combine predicted train data odds with team name and year.  The index is
# reset first so that the positionally-indexed frames built from
# `train_labels` and `train_rf_probs` line up row by row.
train_1 = pd.concat([train_year, train_team, train], axis=1)
train_1.reset_index(drop=True, inplace=True)
train_1 = pd.concat([train_1, pd.DataFrame(train_labels)], axis=1)
train_1 = train_1.rename(columns={0: 'Champion'})
train_1 = pd.concat([train_1, pd.DataFrame(train_rf_probs)], axis=1)
# Only the freshly appended column (named 0) needs a new name.
train_1 = train_1.rename(columns={0: 'Probs'})

#train_1.sort_values(by=['Probs'],ascending=False)
# Script fragment: tunes a balanced random forest, exports per-sample
# probabilities and the test ROC.  `X`, `pos_ids_*`, `neg_ids_*`,
# `sample_ids`, `args`, `autoTune`, `metrics`, `np` and `pd` come from
# outside this fragment.

# Positives first, negatives second; the label vectors are built in the
# same order as the rows.
X_train = X.loc[pos_ids_train+neg_ids_train,:].values
y_train = np.array([1]*len(pos_ids_train)+[0]*len(neg_ids_train))
X_test = X.loc[pos_ids_test+neg_ids_test,:].values
y_test = np.array([1]*len(pos_ids_test)+[0]*len(neg_ids_test))

# Quick look at the first few test ids and rows.
print(pos_ids_test[:4])
print(neg_ids_test[:4])
print(X_test[:4,:])

clf2 = BalancedRandomForestClassifier(n_estimators=100,random_state=777)  #666
# Candidate values for max_depth handed to autoTune (defined elsewhere;
# presumably a grid search returning the fitted estimator -- confirm).
grid = {"max_depth":[2,4,8,16,None]}
clf2 = autoTune(clf2,X_train,y_train,grid) 
#y_pred = clf2.predict_proba(X.values)[:,1]
# One row per sample id: probability of column 1 and the split it belongs to.
prob = pd.DataFrame(index=sample_ids,columns=["probability","dataset"])
prob.loc[pos_ids_test+neg_ids_test,"probability"]=clf2.predict_proba(X_test)[:,1]
prob.loc[pos_ids_train+neg_ids_train,"probability"]=clf2.predict_proba(X_train)[:,1]
prob.loc[pos_ids_train+neg_ids_train,"dataset"]="train"
prob.loc[pos_ids_test+neg_ids_test,"dataset"]="test"
prob.to_csv(args.probability,sep="\t")
# Test-set probabilities, read back in the same order as y_test.
y_pred = prob.loc[pos_ids_test+neg_ids_test,"probability"]
print("ROC-AUC on test set")
print("full-training-set\ttest_set\t{}".format(metrics.roc_auc_score(y_test,y_pred)),sep="\t")
# Same score as printed above, recomputed and written to the file named by
# args.auroc.
auroc=metrics.roc_auc_score(y_test,y_pred)
with open(args.auroc,"w") as f:
    f.write(str(auroc))
# ROC curve points collected into a dict.
fpr, tpr, thresholds = metrics.roc_curve(y_test,y_pred)
roc_data = {}
roc_data["fpr"] = fpr
roc_data["tpr"] = tpr
roc_data["thresholds"] = thresholds
    sampling_strategy='not minority',
    oob_score=True,
    n_jobs=4,
    random_state=42,
    verbose=1
)
# Fit the classifier configured above (its constructor call starts outside
# this fragment) and report its out-of-sample performance.
clf.fit(X_train, Y_train)

Y_test_pred = clf.predict(X_test)
print('\nClassifier performance')
print('Out of sample:\n', metrics.classification_report(Y_test, Y_test_pred, zero_division=0)) 

# This will be the training set: the out-of-bag class probabilities of the
# training samples, cast to float32.
Y_in_train = clf.oob_decision_function_.astype('float32')
# This will be the test set: the predicted class probabilities for X_test,
# cast to float32.
Y_in_test = clf.predict_proba(X_test).astype('float32')

# %% [markdown]
'''

## Architecture design
As a baseline, let's use a single-layer bidirectional LSTM.
PyTorch uses a slightly unintuitive array format for the input and output of
its LSTM module.
The array shape for both input and output is `(seq_length,N,num_labels)`, corresponding to 
`N` sequences of `seq_length` elements of size `num_labels`. 
Here, each element is a vector of label probabilities/logits.
'''

# %%
class LSTM(nn.Module):
    def feature_importance(self, n_rows, n_cols, X_train, y_train, X_valid,
                           y_valid):
        '''Fit six classifiers and plot their feature importances side by side.

        The models are Logistic Regression, Random Forest, Balanced Random
        Forest, CatBoost, XGB and LGBM, each fitted on (X_train, y_train).
        For Logistic Regression the first row of ``coef_`` is plotted; for
        all others ``feature_importances_``.  Each model gets one bar plot,
        sorted by decreasing value, in an ``n_rows`` x ``n_cols`` grid; if the
        grid has fewer cells than models, the surplus models are not plotted.

        Args:
            n_rows, n_cols: shape of the subplot grid.
            X_train: training features; must expose ``.columns`` (DataFrame).
            y_train: training labels.
            X_valid, y_valid: accepted for interface compatibility; they are
                not needed to compute the importances and are not used.

        Returns:
            None.  The figure is shown with ``plt.show()``.
        '''
        # train classifiers (same order as the panels in the figure)
        lr = LogisticRegression(max_iter=100, random_state=42)
        lr.fit(X_train, y_train)
        panels = [('Logistic Regression', lr.coef_[0])]

        tree_models = [
            ('Random Forest Classifier',
             RandomForestClassifier(n_jobs=2, random_state=42)),
            ('Balanced Random Forest Classifier',
             BalancedRandomForestClassifier(random_state=42)),
            ('CatBoost Classifier',
             CatBoostClassifier(random_state=42, verbose=False)),
            ('XGB Classifier', XGBClassifier(random_state=42)),
            ('LGBM Classifier', LGBMClassifier(random_state=42, n_jobs=-1)),
        ]
        for name, model in tree_models:
            model.fit(X_train, y_train)
            panels.append((name, model.feature_importances_))

        # generate feature importance plots; squeeze=False keeps `axes` a 2-D
        # array even for a 1x1 grid, so flatten() always works
        _, axes = plt.subplots(nrows=n_rows,
                               ncols=n_cols,
                               figsize=(18, 20),
                               squeeze=False)
        sns.set(font_scale=1.5)
        feature_names = list(X_train.columns)
        for (name, importance), axis in zip(panels, axes.flatten()):
            # one-column frame: importance per feature name
            df_imp = pd.DataFrame()
            df_imp['importance'] = pd.Series(importance, index=feature_names)

            # long format with columns 'variable' (feature) and 'value'
            long_df = pd.melt(df_imp.T)

            # bar plot drawn on this panel's own axes
            sns.barplot(y=long_df.variable,
                        x=long_df.value,
                        order=long_df.sort_values(
                            'value', ascending=False)['variable'].to_list(),
                        ax=axis)
            axis.set_title(f'{name}')

        # adjusts subplot
        plt.tight_layout()

        # displays the plot
        plt.show()
示例#25
0
def Clasificar(database, new, path):
    """Score the companies in `new` as rejected / of interest.

    A random 40% sample of `database` is used as training data.  The
    'Category' and 'Description' texts are lower-cased, stripped of
    punctuation and English stop words, turned into bag-of-words counts and
    fed to a ``BalancedRandomForestClassifier`` that separates rows whose
    'Category.1' is 'Rejected' (target 0) from all others (target 1).
    Training-set confusion matrix, accuracy and recall are printed.

    Args:
        database: DataFrame of already classified companies with at least
            the columns 'Operation', 'Investee', 'Category', 'Description',
            'Category.1' and 'Area of Focus'.
        new: DataFrame of companies to score with at least the columns
            'Transaction Name', 'Organization Name', 'Categories' and
            'Description'.  It is not modified.
        path: unused; kept for interface compatibility.

    Returns:
        A copy of `new` (with 'Categories' / 'Organization Name' renamed to
        'Category' / 'Investee') extended by the columns 'Prediction',
        'Prob. of being rejected' and 'Prob. of being of interest'.  The two
        probability columns are columns 0 and 1 of ``predict_proba``, which
        assumes both targets occur in the training sample.

    Side effects:
        Disables pandas' chained-assignment warning and silences all
        warnings process-wide, as the original implementation did.
    """
    pd.options.mode.chained_assignment = None
    if 'Response by Category' in list(database.columns):
        database = database.drop(['Response by Category','Response by Description'], axis = 1)
    database = database.sample(frac= 0.4, replace = False)

    # Normalise the spelling variants of the target labels.
    database["Category.1"] = database["Category.1"].replace(
        {"rejected": "Rejected", "B2C ": "B2C", "FIntech": "Fintech"})

    # A 'Prediction' column left over from a previous run would otherwise be
    # duplicated by the final concat.  drop() returns a new frame, so the
    # caller's DataFrame is left untouched.
    new = new.drop(columns=['Prediction'], errors='ignore')

    # CLASSIFIER

    warnings.filterwarnings('ignore')

    print('Importando bases de datos')

    new = new.rename(columns = {'Categories':'Category','Organization Name':'Investee'})
    train = database[['Operation','Investee', 'Category', 'Description', 'Category.1', 'Area of Focus']].dropna()
    # Explicit copy: columns are assigned on this frame below.
    newdata = new[['Transaction Name','Investee', 'Category', 'Description']].copy()

    print('Preprocesamiento del texto')

    # A set gives O(1) membership tests for the per-token filter.
    stop_words = set(stopwords.words('english'))

    def clean_text(series):
        """Lower-case, replace punctuation by spaces, drop stop words."""
        lowered = series.apply(
            lambda text: " ".join(tok.lower() for tok in str(text).split()))
        # regex=True is required: since pandas 2.0 str.replace treats the
        # pattern as a literal string by default, which silently left all
        # punctuation in place.
        no_punct = lowered.str.replace(r'[^\w\s]', ' ', regex=True)
        return no_punct.apply(
            lambda text: " ".join(
                tok for tok in str(text).split() if tok not in stop_words))

    for column in ['Category','Description']:
        train[column] = clean_text(train[column])
        newdata[column] = clean_text(newdata[column])

    # Binary target: 0 = rejected, 1 = anything else.
    train_src1 = train[['Category','Description','Category.1']].copy()
    train_src1['Rejected?'] = (train_src1['Category.1'] != 'Rejected').astype(int)

    new_src1 = newdata[['Category','Description']]

    # Bag-of-words encoding.  The same vectorizer object is refitted for the
    # second column, so each transform() must directly follow the
    # fit_transform() of its own column.
    vectorizer = CountVectorizer()

    vectorI = pd.DataFrame(vectorizer.fit_transform(train_src1['Category']).toarray())
    vectorI_new = pd.DataFrame(vectorizer.transform(new_src1['Category']).toarray())
    vectorIdes = pd.DataFrame(vectorizer.fit_transform(train_src1['Description']).toarray())
    vectorIdes_new = pd.DataFrame(vectorizer.transform(new_src1['Description']).toarray())

    vectorI = pd.concat([vectorI, vectorIdes], axis = 1)
    vectorI_new = pd.concat([vectorI_new, vectorIdes_new], axis = 1)

    print('Entrenamiento')

    # Binary classification (rejected vs. not rejected):
    # resampling + random forest.
    brf = BalancedRandomForestClassifier(n_estimators=100, random_state=0)
    brf.fit(vectorI, train_src1['Rejected?'])
    y_train_pred = brf.predict(vectorI)
    print('Confusion matrix: \n' , confusion_matrix(train_src1['Rejected?'], y_train_pred))
    print('Accuracy: \n' , accuracy_score(train_src1['Rejected?'], y_train_pred))
    print('Recall: \n' , recall_score(train_src1['Rejected?'], y_train_pred))

    print('Clasificacion y exportacion')
    # Apply the fitted model to the new data.
    y_new_predict = brf.predict(vectorI_new)
    y_new_predict_proba = brf.predict_proba(vectorI_new)

    newdata['Prediction'] = y_new_predict
    newdata['Prob. of being rejected'] = y_new_predict_proba[:,0]
    newdata['Prob. of being of interest'] = y_new_predict_proba[:,1]

    # Attach the scores to the full record of each company.
    new = pd.concat([new, newdata[['Prediction','Prob. of being rejected','Prob. of being of interest']]], axis=1, sort=False)

    return new
def main():
    """Compare several classifiers by ROC-AUC on a tab-separated feature file.

    Reads ``trainingData/featuresall_train.txt`` (first line is a header;
    column 1 is the integer label, columns 2.. are float features), holds
    out 33% of the rows for testing, oversamples the training part with
    SMOTE and prints the test AUC of Gaussian naive Bayes, a decision tree
    and a balanced random forest.  Finally the scores stored in column 1 of
    ``results_train103.txt`` are evaluated against the labels of the whole
    data set and printed as the "NN" AUC.
    """
    data = []
    label = []
    # Context manager: the file is closed even if a line fails to parse.
    with open("trainingData/featuresall_train.txt") as f:
        next(f, None)  # skip the header line
        for line in f:
            if not line.strip():
                continue  # tolerate blank lines, e.g. at the end of file
            entries = line.split('\t')
            data.append(list(map(float, entries[2:])))
            label.append(int(entries[1]))

    classLabel = np.array(label)
    trainData = np.array(data)

    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        trainData, classLabel, test_size=0.33)

    # Oversample after the split so that the held-out rows stay untouched.
    X_train, y_train = SMOTE().fit_resample(X_train, y_train)

    # Gaussian naive Bayes (kept as a baseline; performs poorly here).
    nb = GaussianNB()
    nb.fit(X_train, y_train)
    y_nb_pred = nb.predict_proba(X_test)
    onlyPKpredictions = y_nb_pred[:, 1]
    fpr, tpr, thresholds = metrics.roc_curve(y_test, onlyPKpredictions)
    print("accuracy NB AUC:", metrics.auc(fpr, tpr))

    # Decision tree.
    clf = tree.DecisionTreeClassifier(criterion='entropy',
                                      max_depth=5)
    clf = clf.fit(X_train, y_train)

    y_dt_pred = clf.predict_proba(X_test)
    onlyPKpredictions = y_dt_pred[:, 1]
    fpr, tpr, thresholds = metrics.roc_curve(y_test, onlyPKpredictions)
    print("accuracy DT AUC:", metrics.auc(fpr, tpr))

    # Balanced random forest.
    model = BalancedRandomForestClassifier(n_estimators=100, max_depth=5)
    model = model.fit(X_train, y_train)
    y_rfc_pred = model.predict_proba(X_test)
    onlyPKpredictions = y_rfc_pred[:, 1]
    fpr, tpr, thresholds = metrics.roc_curve(y_test, onlyPKpredictions)
    print("accuracy RFC AUC:", metrics.auc(fpr, tpr))

    # Externally produced scores, one per row of the full data set
    # (presumably neural-network outputs, judging by the printed label --
    # confirm against the producer of this file).
    with open('results_train103.txt') as f3:
        grab = [float(line.split('\t')[1]) for line in f3 if line.strip()]

    nn = np.array(grab)

    fpr, tpr, thresholds = metrics.roc_curve(classLabel, nn)
    print("accuracy NN AUC:", metrics.auc(fpr, tpr))