Пример #1
0
__author__ = 'Aaron Yang'
__email__ = '*****@*****.**'
__date__ = '10/4/2019 10:10 PM'

import pandas as pd

from ay_hw_4._global import ROOT_PATH, APS_TRAIN, APS_TEST, APS_FULL_COLUMNS
from ay_hw_4.util_data import load_data
from ay_hw_4.util_stratistic import count_neg_and_pos

if __name__ == "__main__":
    pd.set_option('display.max_columns', 100)

    # The train and test files are read with exactly the same loader options,
    # so the options are written once and reused for both calls.
    aps_load_options = {
        'skip_first_row': 21,
        'y_column_index': 0,
        'assignedColumnNames': APS_FULL_COLUMNS,
        'missingSymbol': 'na',
        'needImpute': True,
        'dropOrNot': False,
    }
    X_train, y_train = load_data(ROOT_PATH + APS_TRAIN, **aps_load_options)
    X_test, y_test = load_data(ROOT_PATH + APS_TEST, **aps_load_options)

    # count_neg_and_pos yields two values per split; the first of each pair is
    # treated as the positive count here.
    # NOTE(review): the helper's name lists "neg" before "pos" - confirm the
    # order of its return values against util_stratistic.
    train_num_pos, train_num_neg = count_neg_and_pos(y_train)
    test_num_pos, test_num_neg = count_neg_and_pos(y_test)

    # Report the positive count over train and test combined.
    total_num_pos = train_num_pos + test_num_pos
    print("the number of pos data is : ", total_num_pos)
Пример #2
0
#
__author__ = 'Aaron Yang'
__email__ = '*****@*****.**'
__date__ = '10/2/2019 9:10 AM'

from ay_hw_4._global import ROOT_PATH, CRIME
from ay_hw_4.util_data import load_data, train_test_split_by_size

if __name__ == "__main__":
    X_data, y_data = load_data(ROOT_PATH + CRIME, y_column_index=-1)

    # Shapes of the complete data set, before it is split.
    for caption, table in (("X Row Data Shape: ", X_data),
                           ("y Row Data Shape: ", y_data)):
        print(caption, table.shape)

    X_train, X_test, y_train, y_test = train_test_split_by_size(
        X_data, y_data, train_size=1495, random_state=2333)

    # Shapes of the four pieces returned by the split, in the same order the
    # script has always reported them.
    split_report = (
        ("X_Train Data Shape: ", X_train),
        ("y_Train Data Shape: ", y_train),
        ("X_test Data Shape: ", X_test),
        ("y_test Data Shape: ", y_test),
    )
    for caption, table in split_report:
        print(caption, table.shape)
Пример #3
0
#
__author__ = 'Aaron Yang'
__email__ = '*****@*****.**'
__date__ = '10/4/2019 5:30 PM'

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from ay_hw_4._global import ROOT_PATH, APS_TRAIN, APS_FULL_COLUMNS
from ay_hw_4.util_data import load_data, to_binary_numeric

if __name__ == "__main__":
    pd.set_option('display.max_columns', 100)

    X_data, y_data = load_data(
        ROOT_PATH + APS_TRAIN,
        skip_first_row=21,
        y_column_index=0,
        assignedColumnNames=APS_FULL_COLUMNS,
        missingSymbol='na',
        needImpute=True,
        dropOrNot=True,
    )

    # to_binary_numeric is called with classNeg="neg"; presumably it encodes
    # the labels as numbers so corr() can include them - confirm in util_data.
    y_data = to_binary_numeric(y_data, classNeg="neg")

    # Put the label column first and the features after it, then compute the
    # pairwise correlation matrix of all columns.
    correlation = pd.concat([y_data, X_data], axis=1).corr()

    # Draw the matrix as a heatmap on a large canvas, with the colour scale
    # fixed to the range [-1, 1].
    fig = plt.figure(figsize=(20, 15))
    blue_palette = sns.color_palette("Blues")
    sns.heatmap(correlation, vmin=-1, vmax=1, cmap=blue_palette)
    plt.show()

# Turning dropOrNot on raises an error, because 10 columns in the data contain NaN
Пример #4
0
#
__author__ = 'Aaron Yang'
__email__ = '*****@*****.**'
__date__ = '10/2/2019 10:36 PM'

import pandas as pd
import numpy as np

from ay_hw_4._global import ROOT_PATH, CRIME
from ay_hw_4.util_data import load_data

if __name__ == "__main__":
    # Load the crime data; the exact meaning of skip_first_column,
    # y_column_index and needImpute is defined by load_data in util_data.
    X_data, y_data = load_data(ROOT_PATH + CRIME,
                               skip_first_column=5,
                               y_column_index=-1,
                               needImpute=True)

    # Features and target side by side, so every column gets a CV value.
    data = pd.concat([X_data, y_data], axis=1)

    def cvFormula(x):
        """Return the coefficient of variation (std / mean) of one column.

        np.std is used with its default ddof=0, i.e. the population
        standard deviation.
        """
        return np.std(x) / np.mean(x)

    # Apply the formula to each column (axis=0) of the underlying array.
    cvResult = np.apply_along_axis(cvFormula, axis=0, arr=data.to_numpy())

    # The message announces the total count plus the first 20 values, so only
    # the first 20 are printed (previously the whole array was printed,
    # contradicting the message).
    print("The total {} features CV are: (first 20 rows)\n {}".format(
        len(cvResult), cvResult[:20]))
Пример #5
0
#
__author__ = 'Aaron Yang'
__email__ = '*****@*****.**'
__date__ = '10/2/2019 6:39 PM'

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

from ay_hw_4._global import ROOT_PATH, CRIME
from ay_hw_4.util_data import load_data

if __name__ == "__main__":
    pd.set_option('display.max_columns', 100)
    X_data, y_data = load_data(ROOT_PATH + CRIME,
                               y_column_index=-1,
                               skip_first_column=5)
    print("X_data Row Data Shape: ", X_data.shape)
    print("y Row Data Shape: ", y_data.shape)
    X_data = X_data.replace('?', np.nan)
    missingValueColumnIndex = X_data.columns[X_data.isnull().any()]

    print(
        "In the beginning, there are total {} columns has missing value in the dataset "
        .format(missingValueColumnIndex.shape[0]))
    print(
        "------------------------------------------------------------------------------"
    )
    print(X_data[missingValueColumnIndex].describe())

    # so we can impute only one column (index=25)