def run_bs_adaboost():
    """Bootstrap-estimate AdaBoost with decision stumps on the training descriptors.

    Loads the training CSV, maps class labels P/N to 1/0, removes outliers
    (IQR-iterative, rows with <20 non-NA values dropped), standardizes the
    features, then hands a class-weighted stump booster to ``main`` with
    bootstrap estimation enabled and prints the result.
    """
    frame = pd.read_csv('Files/csv_result-Descriptors_Training.csv', sep=',')
    # Drop the synthetic id column and encode labels: P -> 1 (positive), N -> 0.
    frame = frame.drop(['id'], axis=1).replace(['P', 'N'], [1, 0])
    frame = prc.handle_outlier(
        prc.detect_outlier_iterative_IQR(frame).dropna(thresh=20))
    frame = prc.standarize(frame)  # or normalize
    # Heavy weight on the positive class (20:1) to counter class imbalance.
    stump = DecisionTreeClassifier(max_depth=1, class_weight={1: 20, 0: 1})
    booster = AdaBoostClassifier(stump, n_estimators=20)
    print(main(frame, "AdaBoost", booster, bs_estimate=True, verbose=True))
# NOTE(review): this is a *duplicate* definition of run_bs_adaboost — an
# identical copy appears earlier in this file; at import time this later
# definition silently shadows the earlier one. Kept intact so callers are
# unaffected, but one of the two copies should be removed.
def run_bs_adaboost():
    """Bootstrap-estimate AdaBoost with decision stumps on the training descriptors.

    Loads the training CSV, maps class labels P/N to 1/0, removes outliers
    (IQR-iterative, rows with <20 non-NA values dropped), standardizes the
    features, then hands a class-weighted stump booster to ``main`` with
    bootstrap estimation enabled and prints the result.
    """
    df = pd.read_csv('Files/csv_result-Descriptors_Training.csv', sep=',')
    # Drop the synthetic id column and encode labels: P -> 1 (positive), N -> 0.
    df = df.drop(['id'], axis=1).replace(['P', 'N'], [1, 0])
    df = prc.handle_outlier(
        prc.detect_outlier_iterative_IQR(df).dropna(thresh=20))
    df = prc.standarize(df)  # or normalize
    # Heavy weight on the positive class (20:1) to counter class imbalance.
    dt = AdaBoostClassifier(
        DecisionTreeClassifier(max_depth=1, class_weight={1: 20, 0: 1}),
        n_estimators=20)
    print(main(df, "AdaBoost", dt, bs_estimate=True, verbose=True))


# Driver calls — enable one as needed:
# run_depth_test()
# run_bs_dt()
# run_bs_adaboost()
def run_depth_test():
    """Sweep decision-tree depth on the training descriptors and plot results.

    Loads the training CSV, maps class labels P/N to 1/0, removes outliers
    (IQR-iterative, rows with <20 non-NA values dropped), standardizes the
    features, then runs ``test_tree_depth`` and shows two figures: the PR
    curves accumulated during the sweep, and the Pr@Re>50 metric per depth.
    """
    df = pd.read_csv('Files/csv_result-Descriptors_Training.csv', sep=',')
    # Drop the synthetic id column and encode labels: P -> 1 (positive), N -> 0.
    df = df.drop(['id'], axis=1).replace(['P', 'N'], [1, 0])
    df = prc.handle_outlier(
        prc.detect_outlier_iterative_IQR(df).dropna(thresh=20))
    df = prc.standarize(df)  # or normalize
    rslt = test_tree_depth(df)
    print("Run Time: " + str(datetime.now() - startTime))
    # Print PR Curves from test (curves were added to the active figure by
    # test_tree_depth).
    plt.legend(loc=1)
    plt.title("Precision Recall Curve")
    plt.show()
    # Metric per depth. x is the tree depth (range(2, len(rslt))) and y is the
    # Pr@Re>50 score (rslt[2:]) — the original code had the axis labels swapped.
    plt.plot(list(range(2, len(rslt))), rslt[2:])
    plt.xlabel("Depth of Tree")
    plt.ylabel("Pr@Re>50")
    plt.title("Testing Decision Tree Depth")
    plt.xticks(list(range(2, len(rslt))))
    plt.show()
import numpy as np
import pandas as pd
# NOTE(review): fetch_mldata was removed from modern scikit-learn (use
# fetch_openml instead); it is unused below — confirm before relying on it.
from sklearn.datasets import fetch_mldata
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif

import preprocessing as prc
import feature_selection as fs

# Load the training descriptors. Forward-slash path (was 'Files\...') so the
# script also runs on non-Windows systems.
df = pd.read_csv('Files/csv_result-Descriptors_Training.csv', sep=',')
# Drop the synthetic id column and encode labels: P -> 1 (positive), N -> 0.
df = df.drop(['id'], axis=1).replace(['P', 'N'], [1, 0])
df = prc.handle_outlier(prc.detect_outlier_iterative_IQR(df).dropna(thresh=20))
df_norm = prc.normalize(df)  # normalize

features = df_norm.iloc[:, :-1]
target = df_norm.iloc[:, -1]

# Keep the 2 features scoring highest on the ANOVA F-test, plus the label.
f_anova = fs.select_k_best(features, target, f_classif, 2)
# pd.Series.append was removed in pandas 2.0 — build the column list with
# pd.concat instead (behavior-identical to the old .append call).
selected_cols = pd.concat([f_anova.iloc[:, 0], pd.Series(['class'])],
                          ignore_index=True)
df = df_norm[selected_cols]

# Example distribution plot (disabled):
# sns.set()
# plt.title("Distribution of Feature 15")
# sns.distplot(df['Pb_NO_sideR35_S'])
# plt.show()
# plt.figure(figsize=(16,7))
df = fs.select_k_best_ANOVA(data, k=n_features) out = old_main.test_tree_depth(df, class_weight="balanced") summary_balance.append([data_str_name + '-ANOVA', i, out.index(max(out)), max(out)]) df = fs.RFECV_DT(data, min_features_to_select=n_features, max_depth=max_dapth) out = old_main.test_tree_depth(df, class_weight="balanced") summary_balance.append([data_str_name + '-RFECV', i, out.index(max(out)), max(out)]) return summary_balance summary_balance = [] df = pd.read_csv('Files\csv_result-Descriptors_Training.csv', sep=',') df = df.drop(['id'], axis=1).replace(['P', 'N'], [1, 0]) df = prc.handle_outlier(prc.detect_outlier_iterative_IQR(df)) df = prc.standarize(df) # or normalize # ============================================================================= # Unsupervised optimal feature selection | optimal tree depth # ============================================================================= vt = fs.variance_threshold(df, threshold=1) rslt_vt = main.test_tree_depth(vt, class_weight="balanced") summary_balance.append(['variance-threshold', rslt_vt.index(max(rslt_vt)), max(rslt_vt)]) pca_2 = fs.pca_linear(df, n=2) # n_c9 is 9, based VarianceThreshold results, axis to gain most information rslt_pca = main.test_tree_depth(pca_2, class_weight="balanced") summary_balance.append(['pca-2', rslt_pca.index(max(rslt_pca)), max(rslt_pca)]) pca_7 = fs.pca_linear(df, n=7) # n_c9 is 9, based VarianceThreshold results, axis to gain most information