Example #1
                ncols = df_for_fs.shape[1]

                if nrows > 100 and ncols > 1:

                    data_cols2 = [
                        x for x in df_for_fs.columns if x not in [
                            'GEOID_DKU', P_TARGET, P_COLUMN_STATES, P_ID_COL,
                            P_CENSUS_LEVEL_COLUMN
                        ]
                    ]

                    if len(data_cols2) > 0:

                        try:
                            pval = feature_selection.univariate_feature_selection(
                                P_SUPERVISION_ALGO, df_for_fs[data_cols2],
                                df_for_fs[P_TARGET])

                            for predictor, p_value in zip(data_cols2, pval):
                                dict_features['predictor'].append(predictor)
                                dict_features['pvalue'].append(p_value)
                                #dict_features['state'].append(state)
                                dict_features['segment_number'].append(
                                    segment_number)
                                dict_features['created_at'].append(
                                    process_date)
                                #dict_features['feature_catalog'].append('') ## future usage
                                dict_features['nrows'].append(nrows)
                        except Exception:
                            print('Warning - feature selection failed for this '
                                  'segment; consider another imputation strategy')
                            params = {
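
The listing is cut off after `params = {`; the rest of the handler is not shown. `feature_selection.univariate_feature_selection` is a project-specific helper, but judging from how its return value is zipped against the column names, it returns one p-value per predictor. A minimal sketch of such a helper, assuming it simply wraps scikit-learn's `f_classif` (the parameter `algo` and the `"chi2"` constant below are illustrative, not the project's actual names):

import numpy as np
import pandas as pd
from sklearn.feature_selection import chi2, f_classif


def univariate_feature_selection(algo, X, y):
    # Hypothetical reimplementation: pick a score function and return
    # only the per-column p-values.
    score_func = chi2 if algo == "chi2" else f_classif
    _, pvalues = score_func(X, y)
    return pvalues


# Usage on a toy frame:
rng = np.random.default_rng(0)
X = pd.DataFrame({"a": rng.random(200), "b": rng.random(200)})
y = rng.integers(0, 2, size=200)
print(univariate_feature_selection("anova", X, y))
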
Example #2
# Imports needed by this example (TabularPreprocessor,
# TabularIntraFoldPreprocessor, univariate_feature_selection, and the
# metrics module come from the project's own code base and are assumed
# to be importable here).
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import f_classif
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import SMOTE


def main():
    # Set experiment parameters
    num_folds = 100     # number of Monte Carlo CV repetitions
    label_name = "1"    # name of the label column in the CSV

    # Specify data location
    data_path = "Data/test_data.csv"

    # Load data to table
    df = pd.read_csv(data_path, sep=";", index_col=0)

    # Check for missing values in any column
    print("Number of missing values:\n", df.isnull().sum())
    print()

    # Only keep first instance if multiple instances have the same key
    num_instances_before = len(df)
    df = df[~df.index.duplicated(keep="first")]
    num_instances_diff = num_instances_before - len(df)
    if num_instances_diff > 0:
        print(
            "Warning: {} instances removed due to duplicate keys - only keeping first occurrence!"
            .format(num_instances_diff))

    # Perform standardized preprocessing
    preprocessor = TabularPreprocessor()
    df = preprocessor.fit_transform(df)

    # Display bar chart with number of samples per class
    # seaborn.countplot(x=label_name, data=df)
    # plt.title("Original class frequencies")
    # plt.savefig("Results/original_class_frequencies.png")
    # plt.close()

    # Separate data into training and test
    y = df[label_name]
    x = df.drop(label_name, axis="columns")

    # Get samples per class
    print("Samples per class")
    for (label, count) in zip(*np.unique(y, return_counts=True)):
        print("{}: {}".format(label, count))
    print()

    # Get number of classes
    num_classes = len(np.unique(df[label_name].values))

    # Setup classifiers
    knn = KNeighborsClassifier(weights="distance")
    # Candidate neighbor counts are centered around sqrt(num_features)
    sqrt_n_features = int(np.round(np.sqrt(x.shape[1])))
    knn_param_grid = {
        "n_neighbors":
            [sqrt_n_features + offset for offset in range(1, 6)] +
            [sqrt_n_features - offset for offset in range(5)
             if sqrt_n_features - offset >= 1],
        # Minkowski distance power (1 = Manhattan, 2 = Euclidean)
        "p": np.arange(1, 5)
    }

    dt = DecisionTreeClassifier()
    dt_param_grid = {
        "criterion": ["gini", "entropy"],
        "splitter": ["best", "random"],
        "max_depth": np.arange(1, 20),
        "min_samples_split": [2, 4, 6],
        "min_samples_leaf": [1, 3, 5, 6],
        "max_features": ["auto", "sqrt", "log2"]
    }

    rf = RandomForestClassifier(n_estimators=100,
                                criterion="entropy",
                                max_depth=5,
                                min_samples_split=5,
                                min_samples_leaf=2)
    rf_param_grid = {}

    nn = MLPClassifier(hidden_layer_sizes=(32, 64, 32), activation="relu")
    nn_param_grid = {}

    clfs = {
        "knn": {
            "classifier": knn,
            "parameters": knn_param_grid
        },
        "dt": {
            "classifier": dt,
            "parameters": dt_param_grid
        },
        "rf": {
            "classifier": rf,
            "parameters": rf_param_grid
        },
        "nn": {
            "classifier": nn,
            "parameters": nn_param_grid
        }
    }

    clfs_performance = {"acc": [], "sns": [], "spc": [], "auc": []}

    # Initialize result table
    results = pd.DataFrame(index=list(clfs.keys()))

    # Iterate over classifiers
    for clf in clfs:

        # Initialize cumulated confusion matrix and fold-wise performance containers
        cms = np.zeros((num_classes, num_classes))
        performance_foldwise = {"acc": [], "sns": [], "spc": [], "auc": []}

        # Iterate over MCCV
        for fold_index in np.arange(num_folds):

            # Split into training and test data
            x_train, x_test, y_train, y_test = train_test_split(
                x, y, test_size=0.15, stratify=y, random_state=fold_index)

            # Perform standardization and feature imputation
            intra_fold_preprocessor = TabularIntraFoldPreprocessor(
                k="automated", normalization="standardize")
            intra_fold_preprocessor = intra_fold_preprocessor.fit(x_train)
            x_train = intra_fold_preprocessor.transform(x_train)
            x_test = intra_fold_preprocessor.transform(x_test)

            # Perform (ANOVA) feature selection
            selected_indices, x_train, x_test = univariate_feature_selection(
                x_train.values,
                y_train.values,
                x_test.values,
                score_func=f_classif,
                num_features="log2n")

            # # Random undersampling
            # rus = RandomUnderSampler(random_state=fold_index, sampling_strategy=0.3)
            # x_train, y_train = rus.fit_resample(x_train, y_train)

            # SMOTE oversampling; sampling_strategy=1 balances the minority
            # class to a 1:1 ratio with the majority class
            smote = SMOTE(random_state=fold_index, sampling_strategy=1)
            x_train, y_train = smote.fit_resample(x_train, y_train)

            # Setup model
            model = clfs[clf]["classifier"]
            model.random_state = fold_index

            # Tune hyperparameters and keep the model refit with the best set found
            optimized_model = RandomizedSearchCV(
                model,
                param_distributions=clfs[clf]["parameters"],
                cv=5,
                random_state=fold_index)
            optimized_model.fit(x_train, y_train)

            # Predict test data using trained model
            y_pred = optimized_model.predict(x_test)

            # Compute performance
            cm = confusion_matrix(y_test, y_pred)
            acc = accuracy_score(y_test, y_pred)
            sns = metrics.sensitivity(y_test, y_pred)
            spc = metrics.specificity(y_test, y_pred)
            auc = metrics.roc_auc(y_test, y_pred)

            # Append performance to fold-wise and overall containers
            cms += cm
            performance_foldwise["acc"].append(acc)
            performance_foldwise["sns"].append(sns)
            performance_foldwise["spc"].append(spc)
            performance_foldwise["auc"].append(auc)

        # Calculate overall performance as the mean across folds
        for metric in performance_foldwise:
            avg_metric = np.round(np.mean(performance_foldwise[metric]), 2)
            clfs_performance[metric].append(avg_metric)

        # Display overall performances
        print("== {} ==".format(clf))
        print("Cumulative CM:\n", cms)
        for metric in clfs_performance:
            print("Avg {}: {}".format(metric, clfs_performance[metric][-1]))
        print()

        # Display confusion matrix
        # sns.heatmap(cms, annot=True, cmap="Blues", fmt="g")
        # plt.xlabel("Predicted")
        # plt.ylabel("Actual")
        # plt.title("{} - Confusion matrix".format(clf))
        # plt.savefig("Results/confusion_matrix-{}.png".format(clf))
        # plt.close()

    # Append performance to result table
    for metric in clfs_performance:
        results[metric] = clfs_performance[metric]

    # Save result table
    results.to_csv("performances.csv", sep=";")
    results.plot.bar(rot=45).legend(loc="upper right")
    plt.savefig("performance.png")
    plt.show()
    plt.close()
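
`metrics.sensitivity`, `metrics.specificity`, and `metrics.roc_auc` come from the project's own metrics module and are not scikit-learn functions. A minimal sketch of how the first two could be implemented for the binary case, assuming the positive label is 1 and deriving both rates from the confusion matrix:

import numpy as np
from sklearn.metrics import confusion_matrix


def sensitivity(y_true, y_pred):
    # TP / (TP + FN): fraction of actual positives correctly identified
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return tp / (tp + fn) if (tp + fn) > 0 else np.nan


def specificity(y_true, y_pred):
    # TN / (TN + FP): fraction of actual negatives correctly identified
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    return tn / (tn + fp) if (tn + fp) > 0 else np.nan


# Usage:
print(sensitivity([0, 0, 1, 1, 1], [0, 1, 1, 1, 0]))  # 0.666...
print(specificity([0, 0, 1, 1, 1], [0, 1, 1, 1, 0]))  # 0.5

Note also that the local variable `sns` inside the fold loop shadows the conventional seaborn alias, which is why the commented-out `sns.heatmap` call would fail if re-enabled without renaming one of the two.
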