def get_data(fillna=True):
    """Load the APS-failure train/test CSVs and return model-ready splits.

    Parameters
    ----------
    fillna : bool, default True
        When True, impute every missing value with the sentinel -1 before
        splitting features from the class column; when False, impute each
        feature with its own median (computed separately on the train and
        test frames) after the split.

    Returns
    -------
    tuple
        ``(X, X_test, y, y_test)`` — float64 feature frames and integer
        labels ('pos' -> 1, 'neg' -> 0).
    """
    # na_values='na' parses the dataset's missing-value sentinel at read
    # time; this is equivalent to the previous read-then-
    # .replace('na', np.nan) pass, and matches the loader style used
    # elsewhere in this file.
    aps_train = pd.read_csv('./aps_failure_training_set.csv', skiprows=20,
                            keep_default_na=False, na_values='na')
    aps_test = pd.read_csv('./aps_failure_test_set.csv', skiprows=20,
                           keep_default_na=False, na_values='na')

    # Columns dropped by the exploratory analysis elsewhere in this file:
    # 'cd_000' has zero variance, the others have too many missing values.
    columns_to_remove = [
        'br_000', 'bq_000', 'bp_000', 'bo_000', 'ab_000',
        'cr_000', 'bn_000', 'bm_000', 'cd_000',
    ]
    aps_train = aps_train.drop(columns=columns_to_remove)
    aps_test = aps_test.drop(columns=columns_to_remove)

    # Sentinel imputation happens on the raw frames, before the split.
    if fillna:
        aps_train = aps_train.fillna(-1)
        aps_test = aps_test.fillna(-1)

    X, y = split_dataset(aps_train, CLASS)
    X_test, y_test = split_dataset(aps_test, CLASS)
    X = X.astype('float64')
    X_test = X_test.astype('float64')

    # Median imputation needs numeric dtypes, hence it runs after astype.
    if not fillna:
        X = X.apply(lambda col: col.fillna(col.median()))
        X_test = X_test.apply(lambda col: col.fillna(col.median()))

    y = y.map({'pos': 1, 'neg': 0})
    y_test = y_test.map({'pos': 1, 'neg': 0})
    return X, X_test, y, y_test
# NOTE(review): this chunk begins mid-statement — the line below is the tail
# of a pd.read_csv(...) call for orig_train whose opening is outside this view.
keep_default_na=False, na_values='na')
orig_test = pd.read_csv('./aps_failure_test_set.csv', skiprows=20,
                        keep_default_na=False, na_values='na')
""" Let's first analyze the baseline we are working with, we are going to evaluate Naive Bayes, Knn and Random Forest with no preprocessing and checck if the results between them are similar or not. """
X_train, y_train = split_dataset(orig_train, CLASS)
X_test, y_test = split_dataset(orig_test, CLASS)
# Baseline imputation: every missing value becomes 0 (no per-column strategy).
X_train, X_test = X_train.fillna(0), X_test.fillna(0)
# Encode the binary class label as integers: 'pos' -> 1, 'neg' -> 0.
y_train = y_train.map({'pos': 1, 'neg': 0})
y_test = y_test.map({'pos': 1, 'neg': 0})
# Module-level flag — presumably toggled once the data is rebalanced; the
# code that flips it is not visible in this view.
balanced = False
print('Balanced data: {}'.format(collections.Counter(y_train)))


def get_data(X_train=X_train, X_test=X_test):
    # NOTE(review): the defaults bind the module-level frames as they exist
    # at *definition* time, not at call time.
    global y_train
    global balanced
    # get_high_correlated_cols is a project helper defined outside this view;
    # presumably returns column names to drop for high correlation — confirm.
    high_corr = get_high_correlated_cols()
    X_train = X_train.drop(columns=high_corr)
    X_test = X_test.drop(columns=high_corr)
    # NOTE(review): definition truncated here — the rest of the body (and its
    # return) is outside this view.
# NOTE(review): fragment — green_data and hinselmann_data are loaded before
# this view begins.
schiller_data = pd.read_csv('../schiller.csv')
data = [[green_data, 'green_data'], [hinselmann_data, 'hinselmann_data'],
        [schiller_data, 'schiller_data']]
# One-hot flags identifying which modality each row came from
# (green is the implicit 0/0 baseline).
green_data['hinselmann'] = 0
green_data['schiller'] = 0
hinselmann_data['hinselmann'] = 1
hinselmann_data['schiller'] = 0
schiller_data['hinselmann'] = 0
schiller_data['schiller'] = 1
# Stack the three modality tables into one frame.
# NOTE(review): DataFrame.append is deprecated in pandas >= 1.4 (removed in
# 2.0); pd.concat is the replacement — confirm the pinned pandas version.
super_table = green_data.append(hinselmann_data)
super_table = super_table.append(schiller_data)
X, y = split_dataset(super_table, CLASS)
# getKBest is a project helper (outside this view); presumably performs
# univariate feature selection — confirm.
X = getKBest(X, y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=0)
# Oversample the minority class on the training split only.
sm = SMOTE(random_state=2)
# NOTE(review): fit_sample was renamed fit_resample in imbalanced-learn
# (fit_sample removed in 0.6) — confirm the pinned imblearn version.
X_train_res, y_train_res = sm.fit_sample(X_train, y_train.ravel())
results = {}
for clf in base_clfs:
    clf_name = type(clf).__name__
    # NOTE(review): loop body truncated here — the rest is outside this view.
def explore():
    """Exploratory analysis of the APS failure dataset: loading, missing
    values, and class balance (definition continues past this view)."""
    sns.set()
    sns.set_style("darkgrid")
    # NOTE(review): despine() is called before any axes exist — likely a no-op
    # here; confirm intent.
    sns.despine()
    """ Problem description: Minimize the costs of repairing APS. Try to predict when a failure is going to occur. Predicting a failure when it's not cost: 10 Missing a failure cost: 500 (this is available in the txt that comes with the dataset) """
    aps_train = pd.read_csv('./aps_failure_training_set.csv', skiprows=20,
                            keep_default_na=False)
    aps_test = pd.read_csv('./aps_failure_test_set.csv', skiprows=20,
                           keep_default_na=False)
    """ Problem: Missing values are represented as 'na' need to replace those with nulls Explore: Different strategies of replacing the nans values and comparing the results """
    aps_train.replace('na', np.nan, inplace=True)
    aps_test.replace('na', np.nan, inplace=True)
    """ Problem: Data balancing, the classes distribution is not balanced this is normal since this is a failure thing and it's normal that it doesn't fail as much as it works """
    print(aps_train[CLASS].value_counts())
    # BUG(review): duplicated line — the second print presumably was meant to
    # show aps_test's class distribution.
    print(aps_train[CLASS].value_counts())
    """ train_hist = sns.countplot(x=CLASS, data=aps_train) train_hist.set(xlabel='Failure', ylabel='Count') #plt.savefig('images/unbalanced_train.pdf') plt.clf() test_hist = sns.countplot(x=CLASS, data=aps_test) test_hist.set(xlabel='Failure', ylabel='Count') #plt.savefig('images/unbalanced_test.pdf') plt.clf() """
    X_train, y_train = split_dataset(aps_train, CLASS)
    # BUG(review): splits aps_train again — the test split should almost
    # certainly come from aps_test.
    X_test, y_test = split_dataset(aps_train, CLASS)
    y_train = y_train.map({'pos': 1, 'neg': 0})
    y_test = y_test.map({'pos': 1, 'neg': 0})
    """ Good thing: no class attributes, no need for dummy transformations only on the class itself """
    X_train = X_train.astype('float64')
    X_test = X_test.astype('float64')
    """ Data exploration: There are 170 columns, all of them are numeric values. talk about training and test dataset instances There is no information about the attributes real meaning because they are anonymized for proprietary issues. We can't inffer attributes because of this. 
There are attributes that correspond to intervals, "bins", discretize (?) this attributes. """
    """ Problem: Missing data. """
    # How many rows would survive a naive drop-all-missing approach.
    print('Number of rows after removing training missing values: {}'.format(
        aps_train.dropna().shape[0]))
    print('Number of rows after removing test missing values: {}'.format(
        aps_test.dropna().shape[0]))
    """ the number of instances available after removing the missing values is very low comparing to the original dataset. Try to explore the dataset to remove columns that have to many missing values or values that don't vary accross instances I think it makes more sense to work with both datasets concatenated, so we can see the overall thing. Check if instances are 'pos' before removing them. """
    # Analyze train+test together to get per-column global statistics.
    aps = pd.concat([aps_train, aps_test])
    X, y = split_dataset(aps, CLASS)
    X = X.astype('float64')
    num_instances = X.shape[0]
    cols_missing = {}   # column name -> percentage of missing values
    no_std_dev = []     # columns with zero standard deviation (constant)
    for col in X:
        #print('Analyzing col {}'.format(col))
        #print()
        # standard_deviation is a project helper defined outside this view;
        # presumably the column's std — confirm.
        std_dev = standard_deviation(X[col])
        #print('Standard deviation of attribute: {}'.format(std_dev))
        if std_dev == 0:
            no_std_dev.append(col)
        num_missing = X[col].isna().sum()
        percentage_missing = (num_missing / num_instances) * 100
        cols_missing[col] = percentage_missing
        #print('Percentage of missing: {}'.format(percentage_missing))
        #print()
        #print()
    ATTRIBUTE = 'Attribute'
    PERCENTAGE = 'Percentage'
    # Tabulate missing percentages, worst columns first.
    missing_df = pd.DataFrame(list(cols_missing.items()))
    missing_df.columns = [ATTRIBUTE, PERCENTAGE]
    missing_df = missing_df.sort_values([PERCENTAGE],
                                        ascending=False).reset_index(drop=True)
    # filter by high percentage
    missing_df = missing_df.loc[missing_df[PERCENTAGE] > 30]
    """ plt.figure(figsize=(missing_df.shape[1] + 5, 10)) ax = sns.barplot(missing_df.index, missing_df[PERCENTAGE], color="steelblue") ax.set(xlabel=ATTRIBUTE, ylabel=PERCENTAGE) ax.set_xticklabels(missing_df[ATTRIBUTE]) for item in ax.get_xticklabels(): item.set_rotation(45) plt.savefig('images/missing_values.pdf') plt.clf() """
    """ There is a column, 'cd_000', which has no 
variance in it's values, this is, the values are all the same so it isn't representative of the data or something like that. Drop this column """
    # Columns with more than 60% of their values missing — candidates to drop.
    high_percentage_missing = missing_df.loc[
        missing_df[PERCENTAGE] > 60][ATTRIBUTE].values
    print(high_percentage_missing)
    """ There are 7 columns with a missing percentage of over 70% Consider to drop this columns because they have too many different missing values. With this parameters there are 26535 rows left, it's better. """
    """ Before removing the attributes one should analyse if the attribute are important to determine the class. "discriminative (supervised learning) or informative (unsupervised learning)" """
    """ Analyze distributions """
    """ sns.pairplot(aps, hue=CLASS) plt.savefig('images/pair_plot.pdf') plt.clf() plt.figure(figsize=(140,140)) sns.heatmap(data=X.corr()) plt.savefig('images/correlations.pdf') plt.clf() """
    # NOTE(review): the triple quote below opens a string that does not close
    # within this view — the chunk is truncated here.
    """
# NOTE(review): fragment — the line below is the tail of a pd.read_csv(...)
# call for aps_train whose opening is outside this view.
keep_default_na=False)
aps_test = pd.read_csv('./aps_failure_test_set.csv', skiprows=20,
                       keep_default_na=False)
aps_train.replace('na', np.nan, inplace=True)
aps_test.replace('na', np.nan, inplace=True)
#drop NaN values
# problem -> we go from 60000 rows to only 591
aps_train = aps_train.dropna()
aps_test = aps_test.dropna()
X_train, y_train = split_dataset(aps_train, 'class')
# BUG(review): splits aps_train again — the test split should almost
# certainly come from aps_test. Also note the 'class' literal here where
# other chunks use the CLASS constant.
X_test, y_test = split_dataset(aps_train, 'class')
y_train = y_train.map({'pos': 1, 'neg': 0})
y_test = y_test.map({'pos': 1, 'neg': 0})
X_train = X_train.astype('float64')
X_test = X_test.astype('float64')
# Baseline KNN (k=10) evaluated via the project's classifier_statistics
# helper (defined outside this view).
neigh = KNeighborsClassifier(n_neighbors=10)
res = classifier_statistics(neigh, X_train, X_test, y_train, y_test)
pprint(res)
""" k=2