# Assumed imports for the examples below: pandas and scikit-learn's SVC are used directly;
# DataModeler is the project's own class, so the module path here is a hypothetical placeholder
import pandas as pd
from sklearn.svm import SVC

from data_modeler import DataModeler  # hypothetical module name


def main():
    ####################################################################################################################
    # Main used for iris data set
    iris_data = pd.read_csv('Data_In/Iris/iris.txt', header=None)
    iris_data.columns = [
        'sepal_length', 'sepal_width', 'petal_length', 'petal_width',
        'classification'
    ]

    model_iris = DataModeler(iris_data, 0)
    model_iris.split_data_set_if_test_not_split('classification', 0.7, 0)

    model_iris.heat_map()
    print(model_iris._train_data_set.head())
    print(model_iris._test_data_set.head())

    model_iris.describe_attribute('sepal_length')
    model_iris.histogram_and_q_q('sepal_length')

    tuned_parameters_svm = {
        'kernel': 'rbf',
        'gamma': 1,
        'C': 10,
        'decision_function_shape': 'ovo'
    }
    model_iris.classification_model(SVC, tuned_parameters_svm, 10)
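    # For reference, a minimal plain scikit-learn sketch of what the call above is assumed to
    # wrap: a 10-fold cross-validated fit of an SVC. x_train/y_train are placeholder names for
    # the modeler's internal feature/label split, not confirmed attributes of this class.
    # from sklearn.model_selection import cross_val_score
    # scores = cross_val_score(SVC(**tuned_parameters_svm), x_train, y_train, cv=10)
    # print('mean 10-fold accuracy:', scores.mean())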
def main():
    ####################################################################################################################
    # Main used for iris data set
    iris_data = pd.read_csv('Data_In/Iris/iris.txt', header=None)
    iris_data.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'classification']

    model_iris = DataModeler(iris_data, 0)
    model_iris.split_data_set_if_test_not_split('classification', 0.7, 0)

    model_iris.heat_map()
    print(model_iris._x_train.head())
    print(model_iris._x_test.head())

    model_iris.describe_attribute('sepal_length')
    model_iris.histogram_and_q_q('sepal_length')

    model_iris.split_data_data_set_X_data_set_y('classification')

    model_iris.svm_model('auto', 1, 10)
    model_iris.neural_network_model(1e-5, (25, 12, 5), 10)
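    # The neural network call above closely matches the signature of scikit-learn's
    # MLPClassifier; a hedged sketch of an assumed equivalent, reading 1e-5 as the L2 penalty
    # alpha and (25, 12, 5) as the hidden layer sizes, with 10-fold cross validation:
    # from sklearn.neural_network import MLPClassifier
    # from sklearn.model_selection import cross_val_score
    # mlp = MLPClassifier(alpha=1e-5, hidden_layer_sizes=(25, 12, 5))
    # print(cross_val_score(mlp, x_train, y_train, cv=10).mean())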
def main():
    # constructs the object car_insurance_model by loading in the data to the class DataModeler
    car_insurance_model = DataModeler(pd.read_csv("Data_In/Car_Insurance/DS_Assessment.csv"), 0)

    ####################################################################################################################
    # EXPLORATION
    ####################################################################################################################

    # Get a very broad understanding of the data from the dimension and first 5 rows

    # Prints the dimension of the data after successfully storing the data set as a Pandas data frame
    print("The dimension of the car insurance data is: ", car_insurance_model._train_data_set.shape)
    print(car_insurance_model._train_data_set.head())  # prints the first 5 rows of the data set

    ####################################################################################################################
    # Example of some of the graphs used to explore the data for the attribute: Age
    # These methods can and were used for all the attributes

    # counts the number of occurrences of each age
    car_insurance_model.attribute_value_count('Age')
    # counts Sale and NoSale occurrences for each age value
    car_insurance_model.attribute_value_count_by_classification('Age', 'Sale')
    # displays and saves a bar graph showing the percentage of each value for the attribute Age in the data set
    car_insurance_model.bar_graph_attribute('Age')
    # displays a stacked Sale and NoSale bar graph for each value of Age in the data set
    car_insurance_model.bar_graph_attribute_by_classification('Age', 'Sale')
    # prints a summary of the distribution of the column 'Age', such as mean, standard deviation, etc.
    car_insurance_model.describe_attribute('Age')
    # plots a histogram of the attribute Age and also a quantile quantile plot
    car_insurance_model.histogram_and_q_q('Age')
    # plots a scatter plot of Age and Price
    car_insurance_model.scatter_plot('Age', 'Price')
    # plots a scatter plot of Age and Price for Sale and NoSale
    car_insurance_model.scatter_plot_by_classification('Age', 'Price', 'Sale')
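    # histogram_and_q_q presumably pairs a histogram with a normal quantile-quantile plot;
    # a minimal sketch of that pairing with matplotlib and scipy (not the class's confirmed
    # implementation):
    # import matplotlib.pyplot as plt
    # from scipy import stats
    # age = car_insurance_model._train_data_set['Age'].dropna()
    # plt.hist(age, bins=30)
    # plt.show()
    # stats.probplot(age, dist='norm', plot=plt)
    # plt.show()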

    car_insurance_model.histogram_and_q_q('Credit_Score')

    ####################################################################################################################
    # Observe how much data is missing for each attribute
    car_insurance_model.missing_data_ratio_print()
    # displays and saves a bar graph of the percentage of missing values
    car_insurance_model.missing_data_ratio_bar_graph()
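    # missing_data_ratio_print is assumed to boil down to a one-liner along these lines:
    # print(car_insurance_model._train_data_set.isna().mean().sort_values(ascending=False))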

    car_insurance_model.heat_map()

    ####################################################################################################################
    # PROCESSING
    ####################################################################################################################
    # Attempted to log- and sqrt-transform some skewed attributes; however, the models performed
    # worse, so I decided instead to normalise the attributes to a mean of 0 and a standard
    # deviation of 1. The commented-out code below demonstrates some of my attempts to better
    # fit the data to a normal distribution

    '''
    car_insurance_model.histogram_and_q_q('Credit_Score')
    # max_price = car_insurance_model._data_set['Price'].max()
    # car_insurance_model._data_set['Price'] = max_price + 1 - car_insurance_model._data_set['Price']
    car_insurance_model.boxcox_trans_attribute('Credit_Score', 0.1)
    # car_insurance_model._data_set['Price'] = np.sqrt(car_insurance_model._data_set['Price'])
    car_insurance_model.histogram_and_q_q('Credit_Score')
    '''
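    # boxcox_trans_attribute above presumably applies a Box-Cox transform with a fixed lambda;
    # with scipy this is a one-liner, since stats.boxcox returns the transformed array directly
    # when lmbda is given (a sketch, not the class's confirmed implementation):
    # from scipy import stats
    # df = car_insurance_model._train_data_set
    # df['Credit_Score'] = stats.boxcox(df['Credit_Score'], lmbda=0.1)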

    ####################################################################################################################

    # Normalise attributes to a mean of zero and standard deviation of 1 before imputing
    attributes_to_normalise = ['Veh_Mileage', 'Credit_Score', 'License_Length', 'Veh_Value', 'Price', 'Age', 'Tax']

    for i in attributes_to_normalise:
        car_insurance_model.normalise_attribute(i)
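    # normalise_attribute is assumed to be a z-score transform, i.e. roughly (attr being the
    # column name):
    # df = car_insurance_model._train_data_set
    # df[attr] = (df[attr] - df[attr].mean()) / df[attr].std()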

    ####################################################################################################################
    # creating new features from the attribute date

    # decided to add day_of_the_week column to see if any information can be extracted
    car_insurance_model.add_day_of_week_attribute()
    # bar graph of new column to see if any new information can be obtained
    car_insurance_model.bar_graph_attribute_by_classification('days_of_the_week', 'Sale')
    # it can be seen that Fridays typically have fewer sales, hence the decision to keep this
    # new column

    # a similar method was used to extract month and year; month would have added too many
    # attributes when one hot encoding, and year showed no significant difference between
    # 2015 and 2016, so neither was kept

    # one hot encodes the column days_of_the_week by adding 7 new attributes
    car_insurance_model.one_hot_encode_attribute('days_of_the_week')
    # drop Date as there are too many distinct days to encode directly
    car_insurance_model.drop_attribute('Date')
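    # In plain pandas, the day-of-week, one hot encoding and drop steps above could look like
    # this sketch (the class's actual handling may differ):
    # df = car_insurance_model._train_data_set
    # df['days_of_the_week'] = pd.to_datetime(df['Date']).dt.day_name()
    # df = pd.get_dummies(df, columns=['days_of_the_week'])
    # df = df.drop(columns='Date')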

    ####################################################################################################################
    # Dealing with the attributes Tax and Price

    # scatter plot the two attributes: they appear very highly correlated and so could be used
    # to impute one another
    # car_insurance_model.scatter_plot_by_classification("Tax", "Price", "Sale")
    # car_insurance_model.scatter_plot("Tax", "Price") shows that Tax and Price follow two
    # linear equations; the cutoff between the two lies at a tax value between 32 and 35,
    # found by looking through the data set:
    # typically when tax < 34, tax = 0.05 * price, and when tax > 34, tax = 0.1 * price,
    # hence missing values can be imputed more accurately from this rule (see the sketch below)
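    # A sketch of that rule as pandas code, on the raw (pre-normalisation) scale; the threshold
    # of 34 and the 0.05/0.1 ratios come from the observations above, while impute_price and
    # impute_tax are the class's own methods:
    # df = car_insurance_model._train_data_set
    # df.loc[df['Price'].isna() & (df['Tax'] < 34), 'Price'] = df['Tax'] / 0.05
    # df.loc[df['Price'].isna() & (df['Tax'] >= 34), 'Price'] = df['Tax'] / 0.10
    # (imputing a missing Tax from Price works the same way in reverse)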

    # compare how many values are imputed using this method
    car_insurance_model.missing_data_ratio_print()
    car_insurance_model.impute_price()
    car_insurance_model.impute_tax()
    car_insurance_model.missing_data_ratio_print()

    # as only 5 values are missing for both Price and Tax, the mean is imputed for these values
    car_insurance_model.impute_mean('Price')
    car_insurance_model.impute_mean('Tax')

    ####################################################################################################################

    # one hot encoding certain attributes
    car_insurance_model.one_hot_encode_attribute('Marital_Status')  # one hot encodes Marital_Status

    ####################################################################################################################

    # Credit_Score takes an interesting value of 9999 for some customers. I attempted to flag
    # the customers with this score in a new one hot encoded column, but found it made no
    # significant difference to the model; the code is left in the class DataPreprocessor:

    # car_insurance_model.new_column_infinite_credit_score()

    ####################################################################################################################

    # attempted to impute using kNN from the package fancyimpute; however, this proved extremely
    # inefficient, so standard methods were used instead. The code is left in the class
    # DataPreprocessor and can be called as on the next line:
    # car_insurance_model.impute_knn(3)

    ####################################################################################################################

    # Impute the other attributes using standard methods
    car_insurance_model.impute_median('Credit_Score')
    car_insurance_model.impute_mode('Veh_Mileage')
    car_insurance_model.impute_median('License_Length')  # could impute by first grouping on Marital_Status (see sketch below)
    car_insurance_model.impute_mode('Veh_Value')  # should use a better method
    car_insurance_model.impute_median('Age')
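    # The Marital_Status-aware imputation suggested above could be a groupby-transform, run
    # before that column is one hot encoded (a sketch, not part of the current pipeline):
    # df = car_insurance_model._train_data_set
    # df['License_Length'] = df['License_Length'].fillna(
    #     df.groupby('Marital_Status')['License_Length'].transform('median'))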

    ####################################################################################################################

    # check all values have been imputed
    print('After imputing all the attributes, the missing ratio is found to be:')
    car_insurance_model.missing_data_ratio_print()

    ####################################################################################################################
    # MODELS
    ####################################################################################################################

    car_insurance_model.shuffle_data_set()  # shuffle the data set before splitting

    # split data into 75% training, 25% test with the seed set to 2 (to get the same split on
    # every run)
    car_insurance_model.split_data_set_if_test_not_split('Sale', 0.25, 2)
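    # split_data_set_if_test_not_split is assumed to behave like scikit-learn's
    # train_test_split with a fixed seed (a sketch; the argument mapping is a guess):
    # from sklearn.model_selection import train_test_split
    # train_df, test_df = train_test_split(df, test_size=0.25, random_state=2)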

    ####################################################################################################################
    # Knn model
    # grid search for knn

    # uncomment to run grid search
    # tuned_parameters_knn = [{'n_neighbors': [5, 15, 19]}]
    # car_insurance_model.knn_model_grid_search(tuned_parameters_knn, 3)

    # fit a knn with k=15 and print the percentage accuracy for 10-fold cross validation and the
    # confusion matrix against the test set
    car_insurance_model.knn_model(15, 10)
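    # knn_model(15, 10) is assumed to cross-validate a 15-nearest-neighbour classifier and
    # score it on the held-out split; roughly, in plain scikit-learn (x_train/y_train/x_test/
    # y_test are placeholder names for the modeler's internal split):
    # from sklearn.neighbors import KNeighborsClassifier
    # from sklearn.model_selection import cross_val_score
    # from sklearn.metrics import confusion_matrix
    # knn = KNeighborsClassifier(n_neighbors=15)
    # print(cross_val_score(knn, x_train, y_train, cv=10).mean())
    # print(confusion_matrix(y_test, knn.fit(x_train, y_train).predict(x_test)))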

    ####################################################################################################################
    # SVM model
    # found this set of parameters to be optimal when performing a grid search

    # uncomment to run grid search
    # tuned_parameters_svm = [{'kernel': ['rbf'], 'gamma': [1/15, 1/16, 1/17], 'C': [11, 10, 12]}]
    # car_insurance_model.svm_model_grid_search(tuned_parameters_svm, 3)
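    # GridSearchCV is the standard scikit-learn way to run such a search; a sketch of what
    # svm_model_grid_search presumably wraps:
    # from sklearn.svm import SVC
    # from sklearn.model_selection import GridSearchCV
    # grid = GridSearchCV(SVC(), tuned_parameters_svm, cv=3)
    # grid.fit(x_train, y_train)
    # print(grid.best_params_)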

    # fit an svm and print the percentage accuracy for 10-fold cross validation, and show the
    # confusion matrix, using the best hyper-parameters found by the grid search

    # k-fold cross validation with the optimum hyper-parameters to validate the SVM model
    car_insurance_model.svm_model(1/16, 10, 10)
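    # svm_model(1/16, 10, 10) is assumed to map to SVC(kernel='rbf', gamma=1/16, C=10) with
    # 10-fold cross validation, mirroring the knn sketch above:
    # print(cross_val_score(SVC(kernel='rbf', gamma=1/16, C=10), x_train, y_train, cv=10).mean())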