Example #1
def main():
    ####################################################################################################################
    # Main used for house price data set

    #  At this point, once the data has been explored, train_Y should sit in its own variable separate from
    #  train_X before pre-processing. train_X and test_X should not be combined at any point: train_X should be
    #  preprocessed in one go, but in a real-world scenario test_X may not arrive as one large dataset.
    model_house = DataModeler(pd.read_csv("Data_In/House_Prices/train.csv"),
                              pd.read_csv("Data_In/House_Prices/test.csv"))

    # model_house.box_plot('SalePrice', 'YearBuilt')
    # model_house.bar_graph_attribute('YrSold')
    # model_house.line_graph_percentage_difference('YrSold')

    # dropped after looking at a scatter plot of the two attributes
    # model_house.scatter_plot('SalePrice', 'GrLivArea')
    model_house.drop_outliers_target_less_y_attribute_greater_x(
        'SalePrice', 300000, 'GrLivArea', 4000)
    model_house.drop_outliers_target_greater_y_attribute_greater_x(
        'SalePrice', 200000, 'LotFrontage', 300)
    # model_house.scatter_plot('SalePrice', 'GrLivArea')

    print(model_house._train_data_set.shape)

    model_house.switch_na_to_median_other_attribute('LotFrontage',
                                                    'Neighborhood')
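    # A minimal sketch of what the neighbourhood-median imputation above presumably does in plain pandas
    # (the DataModeler internals are not shown in this listing, so this is an assumption):
    '''
    df = model_house._train_data_set
    df['LotFrontage'] = df.groupby('Neighborhood')['LotFrontage'].transform(
        lambda s: s.fillna(s.median()))
    '''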
    # 'Utilities' is 'AllPub' for every record except one 'NoSeWa' and 2 NAs; since the only non-'AllPub' value
    # sits in the training set, the attribute doesn't help the model in any way, so it is dropped later
    attributes_to_none = [
        "PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu", "GarageCond",
        "GarageQual", "GarageFinish", "GarageYrBlt", "GarageType",
        "BsmtFinType2", "BsmtExposure", "BsmtFinType1", "BsmtCond", "BsmtQual",
        "MasVnrType"
    ]

    attributes_to_zero = [
        'MasVnrArea', 'BsmtHalfBath', 'BsmtFullBath', 'GarageArea',
        'GarageCars', 'TotalBsmtSF', 'BsmtUnfSF', 'BsmtFinSF2', 'BsmtFinSF1'
    ]

    attributes_to_mode = [
        'Electrical', 'MSZoning', 'Functional', 'SaleType', 'KitchenQual',
        'Exterior2nd', 'Exterior1st'
    ]

    attributes_to_categorical = [
        'MSSubClass', 'OverallCond', 'YrSold', 'MoSold'
    ]

    for x in attributes_to_none:
        model_house.impute_none(x)
    for x in attributes_to_zero:
        model_house.impute_zero(x)
    for x in attributes_to_mode:
        model_house.impute_mode(x)
    for x in attributes_to_categorical:
        model_house.convert_attribute_to_categorical(x)
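    # For reference, the four helpers above map onto pandas one-liners; a sketch under the assumption that the
    # underlying frames are plain DataFrames (df and col are hypothetical names):
    '''
    df[col] = df[col].fillna('None')             # impute_none
    df[col] = df[col].fillna(0)                  # impute_zero
    df[col] = df[col].fillna(df[col].mode()[0])  # impute_mode
    df[col] = df[col].astype(str)                # convert_attribute_to_categorical
    '''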

    # drop the Id column from train_X and test_X, moving it to _test_y_id
    model_house.index_column_drop_and_move_to_pred_y('Id')
    model_house.move_target_to_train_y('SalePrice')  # moves SalePrice to train_Y
    model_house.train_missing_data_ratio_print()
    model_house.test_missing_data_ratio_print()
    ####################################################################################################################
    # all the missing values are imputed!
    ####################################################################################################################

    # print('The dimension of the train is', model_house._train_data_set.shape)
    # print('The dimension of the test is', model_house._test_data_set.shape)

    ####################################################################################################################
    #  dropped for now; need to add these back in later
    attributes_to_drop = [
        'Utilities', 'YearBuilt', 'MoSold', 'YrSold', 'GarageYrBlt',
        'YearRemodAdd'
    ]
    for x in attributes_to_drop:
        model_house.drop_attribute(x)
    ####################################################################################################################

    attributes_to_normalise = [
        'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
        'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
        'GrLivArea', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
        'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal'
    ]

    for x in attributes_to_normalise:
        model_house.normalise_attribute(x)
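    # normalise_attribute presumably standardises a column to zero mean and unit variance; a hedged sketch of
    # the equivalent in plain pandas (df and col are hypothetical names):
    '''
    df[col] = (df[col] - df[col].mean()) / df[col].std()
    '''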

    attributes_to_one_hot_encode = [
        'MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
        'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
        'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
        'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
        'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
        'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
        'Functional', 'GarageType', 'FireplaceQu', 'GarageFinish',
        'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence',
        'MiscFeature', 'SaleType', 'OverallCond', 'SaleCondition',
        'OverallQual', 'FullBath', 'BedroomAbvGr', 'TotRmsAbvGrd',
        'GarageCars', 'BsmtFullBath', 'KitchenAbvGr', 'BsmtHalfBath',
        'HalfBath', 'Fireplaces'
    ]

    for x in attributes_to_one_hot_encode:
        model_house.one_hot_encode_attribute(x)

    model_house.delete_unnecessary_one_hot_encoded_columns()
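    # One hot encoding plus the clean-up above can be sketched with pd.get_dummies and an align, so train and
    # test end up with identical dummy columns; this is an assumption about what these methods do, not a quote
    # of the class:
    '''
    train = pd.get_dummies(train, columns=attributes_to_one_hot_encode)
    test = pd.get_dummies(test, columns=attributes_to_one_hot_encode)
    train, test = train.align(test, join='left', axis=1, fill_value=0)
    '''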

    # print('The dimension of the train is', model_house._train_data_set.shape)
    # print('The dimension of the test is', model_house._test_data_set.shape)

    # the target could be transformed here and the transform inverted after prediction
    # model_house.box_cox_target(0.1)
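    # A common pattern is to transform the skewed target before fitting and invert the transform on the
    # predictions; a sketch with scipy, assuming a Box-Cox-style transform with lambda = 0.1 as the commented
    # call suggests (train_y, model and test_X are hypothetical names):
    '''
    from scipy.special import boxcox1p, inv_boxcox1p
    train_y = boxcox1p(train_y, 0.1)                   # transform before fitting
    preds = inv_boxcox1p(model.predict(test_X), 0.1)   # invert for the submission
    '''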

    ####################################################################################################################
    #  Lasso
    model_house.lasso_compare_alpha([75, 100, 200, 300]).to_csv(
        'Data_Out/Lasso_model_alpha_75_100_200_300.csv', index=False)

    lasso_model_grid_parameters = [{'alpha': [75, 100, 200, 300]}]
    model_house.regression_model_grid_search(Lasso,
                                             lasso_model_grid_parameters, 10)
    lasso_model_tuned_parameters = {'alpha': 1000, 'random_state': 1}
    model_house.regression_model_submission(Lasso, 'SalePrice',
                                            lasso_model_tuned_parameters)
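    # regression_model_grid_search presumably wraps scikit-learn's GridSearchCV; a minimal stand-alone sketch
    # of the same search (train_X and train_y are hypothetical names):
    '''
    from sklearn.linear_model import Lasso
    from sklearn.model_selection import GridSearchCV
    search = GridSearchCV(Lasso(), {'alpha': [75, 100, 200, 300]},
                          cv=10, scoring='neg_mean_squared_error')
    search.fit(train_X, train_y)
    print(search.best_params_, search.best_score_)
    '''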

    ####################################################################################################################
    # ridge regression optimised

    ridge_model_grid_parameters = [{'alpha': [1, 5, 7, 10]}]
    model_house.regression_model_grid_search(Ridge,
                                             ridge_model_grid_parameters, 10)
    ridge_model_tuned_parameters = {'alpha': 10.0}
    model_house.regression_model_submission(Ridge, 'SalePrice',
                                            ridge_model_tuned_parameters)

    ####################################################################################################################
    # kernel ridge regression gridsearch

    kernel_ridge_model_grid_parameters = {
        'alpha': [5, 9, 10, 11],
        'kernel': ['linear'],
        'degree': [1, 2, 3]
    }
    model_house.regression_model_grid_search(
        KernelRidge, kernel_ridge_model_grid_parameters, 10)
    kernel_ridge_model_fine_tuned_parameters = {
        'alpha': 9,
        'kernel': 'linear',
        'degree': 1
    }
    model_house.regression_model_submission(
        KernelRidge, 'SalePrice', kernel_ridge_model_fine_tuned_parameters)

    ####################################################################################################################
    # linear optimised

    linear_model_grid_parameters = {
        'fit_intercept': [True, False],
        # note: the 'normalize' option was deprecated and later removed from LinearRegression in newer scikit-learn
        'normalize': [True, False],
        'copy_X': [True, False]
    }
    model_house.regression_model_grid_search(LinearRegression,
                                             linear_model_grid_parameters, 10)
    linear_model_fine_tuned_parameters = {
        'fit_intercept': True,
        'normalize': True,
        'copy_X': False
    }
    model_house.regression_model_submission(
        LinearRegression, 'SalePrice', linear_model_fine_tuned_parameters)
Example #2
def main():
    ####################################################################################################################
    # Main used for iris data set
    iris_data = pd.read_csv('Data_In/Iris/iris.txt', header=None)
    iris_data.columns = [
        'sepal_length', 'sepal_width', 'petal_length', 'petal_width',
        'classification'
    ]

    model_iris = DataModeler(iris_data, 0)
    model_iris.split_data_set_if_test_not_split('classification', 0.7, 0)

    model_iris.heat_map()
    print(model_iris._train_data_set.head())
    print(model_iris._test_data_set.head())

    model_iris.describe_attribute('sepal_length')
    model_iris.histogram_and_q_q('sepal_length')

    tuned_parameters_svm = {
        'kernel': 'rbf',
        'gamma': 1,
        'C': 10,
        'decision_function_shape': 'ovo'
    }
    model_iris.classification_model(SVC, tuned_parameters_svm, 10)
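    # classification_model presumably fits the estimator and reports k-fold accuracy; an equivalent sketch in
    # plain scikit-learn (x_train and y_train are hypothetical names):
    '''
    from sklearn.model_selection import cross_val_score
    from sklearn.svm import SVC
    scores = cross_val_score(SVC(**tuned_parameters_svm), x_train, y_train, cv=10)
    print(scores.mean())
    '''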
Example #3
def main():
    # construct the car_insurance_model object by loading the data into the DataModeler class; there is no
    # separate test set yet, so an empty placeholder DataFrame is passed
    car_insurance_model = DataModeler(
        pd.read_csv("Data_In/Car_Insurance/DS_Assessment.csv"),
        pd.DataFrame({'null_df': []}))

    ####################################################################################################################
    # EXPLORATION
    ####################################################################################################################

    # Get a very broad understanding of the data from the dimension and first 5 rows

    # Prints the dimension of the data after successfully storing the data set as a Pandas data frame
    print("The dimension of the car insurance data is: ",
          car_insurance_model._train_data_set.shape)
    print(car_insurance_model._train_data_set.head())  # prints the first 5 rows of the data set

    ####################################################################################################################
    # Example of some of the graphs used to explore the data for the attribute: Age
    # These methods can and were used for all the attributes
    '''
    # counts the number of occurrences of each Age
    car_insurance_model.attribute_value_count('Age')
    # counts Sale vs NoSale for each value of Age
    car_insurance_model.attribute_value_count_by_classification('Age', 'Sale')
    # displays and saves a bar graph showing the percentage of each value for the attribute Age in the data set
    car_insurance_model.bar_graph_attribute('Age')
    # displays a stacked Sale and NoSale bar graph for each attribute in Age in the data set
    car_insurance_model.bar_graph_attribute_by_classification('Age', 'Sale')
    # prints a summary of the distribution of the column 'Age' such as mean, standard deviation etc
    car_insurance_model.describe_attribute('Age')
    # plots a histogram of the attribute Age and also a quantile quantile plot
    # car_insurance_model.histogram_and_q_q('Age')
    # plots a scatter plot of Age and Price
    car_insurance_model.scatter_plot('Age', 'Price')
    # plots a scatter plot of Age and Price for Sale and NoSale
    car_insurance_model.scatter_plot_by_classification('Age', 'Price', 'Sale')

    car_insurance_model.histogram_and_q_q('Credit_Score')

    ####################################################################################################################
    # Observe how much data is missing for each attribute
    car_insurance_model.train_missing_data_ratio_print()
    # displays and saves a bar graph of the percentage of missing values
    car_insurance_model.missing_data_ratio_bar_graph()

    car_insurance_model.heat_map()
    '''

    ####################################################################################################################
    # PROCESSING
    ####################################################################################################################
    # Attempted to log- and sqrt-transform some skewed attributes; however, the models performed worse, so I
    # decided instead to normalise the attributes to a mean of 0 and a standard deviation of 1. The code below
    # demonstrates some of the attempts to better fit the data to a normal distribution:
    '''
    car_insurance_model.histogram_and_q_q('Credit_Score')
    # max_price = car_insurance_model._data_set['Price'].max()
    # car_insurance_model._data_set['Price'] = max_price + 1 - car_insurance_model._data_set['Price']
    car_insurance_model.boxcox_trans_attribute('Credit_Score', 0.1)
    # car_insurance_model._data_set['Price'] = np.sqrt(car_insurance_model._data_set['Price'])
    car_insurance_model.histogram_and_q_q('Credit_Score')
    '''

    ####################################################################################################################

    # Normalise attributes to a mean of zero and standard deviation of 1 before imputing
    attributes_to_normalise = [
        'Veh_Mileage', 'Credit_Score', 'License_Length', 'Veh_Value', 'Price',
        'Age', 'Tax'
    ]

    for i in attributes_to_normalise:
        car_insurance_model.normalise_attribute(i)

    ####################################################################################################################
    # creating new features from the attribute date

    # decided to add a days_of_the_week column to see if any information can be extracted
    ####################################################################################################################
    # convert the type of the column Date from object to datetime64
    car_insurance_model._train_data_set['Date'] = pd.to_datetime(
        car_insurance_model._train_data_set['Date'],
        infer_datetime_format=True)
    # add a new column named days_of_the_week holding the day name
    # (.dt.day_name() replaces the deprecated .dt.weekday_name)
    car_insurance_model._train_data_set = car_insurance_model._train_data_set.assign(
        days_of_the_week=car_insurance_model._train_data_set['Date'].dt.day_name())
    ####################################################################################################################
    # bar graph of new column to see if any new information can be obtained
    # car_insurance_model.bar_graph_attribute_by_classification('days_of_the_week', 'Sale')
    # can see that there are typically fewer sales on Fridays, hence the decision to create the new column

    # a similar method was used to extract month and year; month would have added too many attributes when one hot
    # encoding, and year showed no significant difference between 2015 and 2016

    # one hot encodes the column days_of_the_week by adding 7 new attributes
    car_insurance_model.one_hot_encode_attribute('days_of_the_week')
    # drop Date as it has too many distinct values to be useful
    car_insurance_model.drop_attribute('Date')

    ####################################################################################################################
    # Dealing with the attributes Tax and Price

    # scatter plot the two attributes as they appear very highly correlated and could be used to impute the data
    # car_insurance_model.scatter_plot_by_classification("Tax", "Price")
    # found that tax and price follow two linear equations using car_insurance_model.scatter_plot("Tax", "Price")
    # the cutoff between the two equations lies where Tax is between 32 and 35, found by looking through the data set:
    # typically when tax < 34, tax = 0.05 * price and when tax > 34, tax = 0.1 * price
    # hence this can be used to impute missing values more accurately

    # check how much data is still missing before imputing with this method
    car_insurance_model.train_missing_data_ratio_print()
    ####################################################################################################################
    # impute Price from Tax using the linear relationship above (the ~5 rows where both Price and Tax are
    # missing are mean-imputed later)

    # create a list of the index of the missing values in the price attribute
    price_missing_value_index = car_insurance_model._train_data_set[
        car_insurance_model._train_data_set['Price'].isnull()].index.tolist()

    # loop through the missing value indices for Price
    for i in price_missing_value_index:
        # if the value for Tax in the same row is greater than 33
        if car_insurance_model._train_data_set['Tax'].values[i] > 33:
            # set Price equal to ten times the Tax in the same row
            car_insurance_model._train_data_set['Price'].values[
                i] = car_insurance_model._train_data_set['Tax'].values[i] * 10
        else:
            # otherwise set Price to five times the Tax
            car_insurance_model._train_data_set['Price'].values[
                i] = car_insurance_model._train_data_set['Tax'].values[i] * 5

    print('The number of price values imputed is ',
          len(price_missing_value_index))
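    # The same rule can be applied without an explicit loop; a hedged sketch using boolean masks and .loc,
    # equivalent in intent to the loop above:
    '''
    df = car_insurance_model._train_data_set
    missing = df['Price'].isnull()
    df.loc[missing & (df['Tax'] > 33), 'Price'] = df['Tax'] * 10
    df.loc[missing & (df['Tax'] <= 33), 'Price'] = df['Tax'] * 5
    '''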
    ####################################################################################################################
    # impute tax
    # create a list of the index of the missing values in the tax attribute
    tax_missing_value_index = car_insurance_model._train_data_set[
        car_insurance_model._train_data_set['Tax'].isnull()].index.tolist()
    # loop through the missing value indices for Tax
    for i in tax_missing_value_index:
        # if the value for Price in the same row is greater than 330
        if car_insurance_model._train_data_set['Price'].values[i] > 330:
            # set Tax equal to a tenth of the Price in the same row
            car_insurance_model._train_data_set['Tax'].values[
                i] = car_insurance_model._train_data_set['Price'].values[
                    i] * 0.1
        else:
            # otherwise set Tax to a twentieth of the Price
            car_insurance_model._train_data_set['Tax'].values[
                i] = car_insurance_model._train_data_set['Price'].values[
                    i] * 0.05

    print('The number of tax values imputed is ', len(tax_missing_value_index))
    ####################################################################################################################
    car_insurance_model.train_missing_data_ratio_print()

    # the few rows where both Price and Tax were missing remain; impute these with the mean
    car_insurance_model.impute_mean('Price')
    car_insurance_model.impute_mean('Tax')

    ####################################################################################################################

    # one hot encoding certain attributes
    car_insurance_model.one_hot_encode_attribute(
        'Marital_Status')  # one hot encodes Marital_Status

    ####################################################################################################################

    # found Credit_Score to have an interesting value of 9999 for some customers; I attempted to one hot encode
    # the customers with this score into a new column, but found it to make no significant difference to the
    # model. I decided to leave the code in the class DataPreprocessor:

    # create a list of the indices of the credit score with a score of 9999
    credit_score_9999_index = car_insurance_model._train_data_set[
        car_insurance_model._train_data_set['Credit_Score'] ==
        9999].index.tolist()

    # create a new column for the credit scores with 9999 so they can be put into a different attribute
    car_insurance_model._train_data_set['Infinite_Credit_Score'] = 0

    for i in credit_score_9999_index:
        # set the new column values to 1 (one hot encoding)
        car_insurance_model._train_data_set['Infinite_Credit_Score'].values[
            i] = 1
        # zero out the 9999 credit score in the attribute Credit_Score
        car_insurance_model._train_data_set['Credit_Score'].values[i] = 0
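    # The loop above can also be written as two vectorised assignments; a sketch (flag first, then zero out):
    '''
    df = car_insurance_model._train_data_set
    df['Infinite_Credit_Score'] = (df['Credit_Score'] == 9999).astype(int)
    df.loc[df['Infinite_Credit_Score'] == 1, 'Credit_Score'] = 0
    '''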

    ####################################################################################################################

    # attempted to impute using knn from a package known as fancyimpute; however, I found this to be extremely
    # inefficient and instead used standard methods. The code is left in the class DataPreprocessor and can be
    # called as on the next line:
    # car_insurance_model.impute_knn(3)
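    # scikit-learn now ships a KNN imputer that avoids the fancyimpute dependency; a sketch over the numeric
    # columns only (n_neighbors=3 to match the commented call above):
    '''
    from sklearn.impute import KNNImputer
    df = car_insurance_model._train_data_set
    numeric = df.select_dtypes(include='number')
    df[numeric.columns] = KNNImputer(n_neighbors=3).fit_transform(numeric)
    '''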

    ####################################################################################################################

    # Impute the other attributes using standard methods
    car_insurance_model.impute_median('Credit_Score')
    car_insurance_model.impute_mode('Veh_Mileage')
    car_insurance_model.impute_median(
        'License_Length'
    )  # could try imputing by first grouping on Marital_Status
    car_insurance_model.impute_mode('Veh_Value')  # should use a better method
    car_insurance_model.impute_median('Age')

    ####################################################################################################################

    # check all values have been imputed
    print(
        'After imputing all the attributes, the missing ratio is found to be:')
    car_insurance_model.train_missing_data_ratio_print()

    ####################################################################################################################
    # MODELS
    ####################################################################################################################

    car_insurance_model.shuffle_data_set()  # shuffle the data set before splitting

    # split the data into 75% training and 25% test with the seed set to 2 (to get the same split on every run)
    car_insurance_model.split_data_set_if_test_not_split('Sale', 0.25, 2)

    ####################################################################################################################
    # Knn model
    # grid search for knn over a small range of k
    grid_parameters_knn = [{'n_neighbors': [5, 15, 19]}]
    car_insurance_model.classification_model_grid_search(
        KNeighborsClassifier, grid_parameters_knn, 2)

    # fit a knn with k=19 and print the percentage accuracy for 10-fold cross-validation and the confusion
    # matrix against the test set
    tuned_parameters_knn = {'n_neighbors': 19}
    car_insurance_model.classification_model(KNeighborsClassifier,
                                             tuned_parameters_knn, 10)
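    # classification_model presumably reports cross-validated accuracy plus a confusion matrix; a stand-alone
    # sketch of that evaluation (x_train, y_train, x_test and y_test are hypothetical names):
    '''
    from sklearn.metrics import confusion_matrix
    from sklearn.model_selection import cross_val_score
    from sklearn.neighbors import KNeighborsClassifier
    knn = KNeighborsClassifier(n_neighbors=19).fit(x_train, y_train)
    print(cross_val_score(knn, x_train, y_train, cv=10).mean())
    print(confusion_matrix(y_test, knn.predict(x_test)))
    '''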

    ####################################################################################################################
    # SVM model
    # this set of parameter ranges was found to be the most promising when performing a grid search
    tuned_parameters_svm = [{
        'kernel': ['rbf'],
        'gamma': [1 / 15, 1 / 16, 1 / 17],
        'C': [11, 10, 12],
        'decision_function_shape': ['ovo']
    }]
    car_insurance_model.classification_model_grid_search(
        SVC, tuned_parameters_svm, 2)

    # fit an SVM and print the percentage accuracy for 10-fold cross-validation, showing the confusion matrix
    # for the best hyper-parameters found by the grid search

    # k-fold cross validation for optimum hyper-parameters to validate SVM model
    tuned_parameters_svm = {
        'C': 10,
        'decision_function_shape': 'ovo',
        'degree': 3,
        'gamma': 1 / 16,
        'kernel': 'rbf'
    }
    car_insurance_model.classification_model(SVC, tuned_parameters_svm, 10)
Example #4
def main():
    ####################################################################################################################
    # Main used for iris data set
    iris_data = pd.read_csv('Data_In/Iris/iris.txt', header=None)
    iris_data.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'classification']

    model_iris = DataModeler(iris_data, 0)
    model_iris.split_data_set_if_test_not_split('classification', 0.7, 0)

    model_iris.heat_map()
    print(model_iris._x_train.head())
    print(model_iris._x_test.head())

    model_iris.describe_attribute('sepal_length')
    model_iris.histogram_and_q_q('sepal_length')

    model_iris.split_data_data_set_X_data_set_y('classification')

    model_iris.svm_model('auto', 1, 10)
    model_iris.neural_network_model(1e-5, (25, 12, 5), 10)
Example #5
def main():
    ####################################################################################################################
    # Main used for the adult income data set

    #  At this point, once the data has been explored, train_Y should sit in its own variable separate from
    #  train_X before pre-processing. train_X and test_X should not be combined at any point: train_X should be
    #  preprocessed in one go, but in a real-world scenario test_X may not arrive as one large dataset.

    model_adult = DataModeler(
        pd.read_csv("Data_In/Adult/adult.data.txt", header=None, sep=r",\s",
                    na_values=["?"]),
        pd.read_csv("Data_In/Adult/adult.test.txt", header=None, sep=r",\s",
                    na_values=["?"]))
    adult_columns = ["age", "workclass", "fnlwgt", "education", "education-num",
                     "marital-status", "occupation", "relationship", "race", "sex",
                     "capital-gain", "capital-loss", "hours-per-week",
                     "native-country", "salary"]
    model_adult._train_data_set.columns = adult_columns
    print(model_adult._train_data_set)
    model_adult._test_data_set.columns = adult_columns
    # model_adult.box_plot("age", "salary")
    # model_adult.missing_data_ratio_bar_graph()
    # model_adult.scatter_plot("hours-per-week", "age")
    # model_adult.train_missing_data_ratio_print()
    # model_adult.histogram_and_q_q("age")
    # model_adult.histogram_and_q_q("hours-per-week")
    # model_adult.histogram_and_q_q("capital-gain")
    # model_adult.histogram_and_q_q("capital-loss")
    # model_adult.bar_graph_attribute("occupation")
    # model_adult.bar_graph_attribute("workclass")

    # print(model_adult._train_data_set["capital-loss"])
    #
    # model_adult.normalise_attribute("capital-loss")
    # model_adult.box_cox_trans_attribute("capital-loss", 5)
    # model_adult.normalise_attribute("capital-loss")
    # print(model_adult._train_data_set["capital-loss"])
    # model_adult.histogram_and_q_q("capital-loss")
    model_adult.move_target_to_train_y("salary")
    model_adult.move_target_to_test_y('salary')

    # print(model_adult._x_train)
    # model_adult.random_forest()
    model_adult.drop_attribute("fnlwgt")
    model_adult.drop_attribute("workclass")
    model_adult.drop_attribute("education-num")
    model_adult.drop_attribute("marital-status")
    model_adult.drop_attribute("occupation")
    model_adult.one_hot_encode_attribute("relationship")
    model_adult.one_hot_encode_attribute("education")
    model_adult.drop_attribute("capital-gain")
    model_adult.drop_attribute('capital-loss')
    model_adult.drop_attribute("hours-per-week")
    model_adult.one_hot_encode_attribute("race")
    model_adult.one_hot_encode_attribute("sex")
    model_adult.drop_attribute("native-country")
    model_adult.drop_attribute('age')
    print(model_adult._test_data_set)

    #model_adult.random_forest()
    model_adult.shuffle_data_set()

    #print(model_adult._test_data_set)
    model_adult.delete_unnecessary_one_hot_encoded_columns()

    # labels in adult.test carry a trailing '.' (e.g. '>50K.'); strip it so they match the training labels
    for i in range(len(model_adult._y_test.values)):
        model_adult._y_test.values[i] = model_adult._y_test.values[i].strip('.')
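    # the loop above could also use pandas' vectorised string method, assuming _y_test is a Series of strings:
    '''
    model_adult._y_test = model_adult._y_test.str.rstrip('.')
    '''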

    my_random_forest_model = model_adult.random_forest()