def find_best_model(x_clean, y_raw, variables=9, pricing=False):
    """Random-search a linear ClaimClassifier and keep the best by ROC AUC.

    The data is split 80/20 into a training and a validation part. Rows of
    the training part labelled 1 are re-sampled with replacement up to the
    number of rows labelled 0, then ten networks are trained with a randomly
    drawn learning rate and epoch count. Each network is scored on the
    validation part with ``roc_auc_score``.

    Parameters
    ----------
    x_clean : array-like, shape (n_samples, n_features)
        Numeric feature matrix.
    y_raw : array-like
        Binary labels (0 / 1), either one dimensional or a single column.
    variables : int, optional
        Unused; kept for call compatibility. The network width is taken
        from ``x_clean``.
    pricing : bool, optional
        Unused; kept for call compatibility. Validation always calls
        ``predict_probabilities(..., pricing=True)``.

    Returns
    -------
    ClaimClassifier
        The trained network with the highest validation ROC AUC.
    """
    train_x, valid_x, train_y, valid_y = train_test_split(x_clean,
                                                          y_raw,
                                                          test_size=0.2)

    # Labels are forced into one column so they can be stacked next to the
    # features whether y_raw was 1-D or already shaped (n, 1).
    train_x = np.asarray(train_x)
    train_y = np.asarray(train_y).reshape(-1, 1)
    total_train = pd.DataFrame(np.append(train_x, train_y, axis=1))

    df_class_0 = total_train[total_train.iloc[:, -1] == 0]
    df_class_1 = total_train[total_train.iloc[:, -1] == 1]

    # Up-sample class 1 to the size of class 0
    total_train_class_1_over = df_class_1.sample(len(df_class_0),
                                                 replace=True)
    total_train = np.array(
        pd.concat([df_class_0, total_train_class_1_over], axis=0))

    new_train_x = total_train[:, :-1]
    new_train_y = np.expand_dims(total_train[:, -1], 1)

    # The training tensors never change, so build them once
    X = torch.Tensor(new_train_x)
    Y = torch.Tensor(new_train_y)
    loss = nn.BCELoss()
    n_features = train_x.shape[1]

    best_net = None
    max_metric = -np.inf
    searches = 10

    for i in range(searches):
        new_net = ClaimClassifier(variables=n_features, linear=True)
        lrn_rate = np.random.uniform(0.0001, 1)
        epochs = round(np.random.uniform(50, 150))
        new_net.train()
        optimizer = optim.SGD(new_net.parameters(), lr=lrn_rate)
        for _ in range(epochs):
            new_net.zero_grad()
            loss_obj = loss(new_net(X), Y)
            loss_obj.backward()
            optimizer.step()

        new_net.eval()
        print("Model (" + str(i + 1) + ") out of " + str(searches))
        pred, probabilities = new_net.predict_probabilities(valid_x,
                                                            pricing=True)
        metric = roc_auc_score(valid_y, probabilities)
        print("Roc Score:" + str(metric))
        # The first model is always kept so a result exists even when no
        # score compares greater (e.g. every score is 0 or NaN).
        if best_net is None or metric > max_metric:
            max_metric = metric
            best_net = new_net

    return best_net
 def __init__(self, epoch=100, batchsize=64, learnrate=0.0001, neurons=9, num_features=13, calibrate_probabilities=False):
     """
     Create the model with empty state and a fresh ClaimClassifier.

     ``epoch``, ``batchsize``, ``learnrate``, ``neurons`` and
     ``num_features`` are passed positionally, in that order, to
     ``ClaimClassifier``. ``calibrate_probabilities`` is stored on
     ``self.calibrate``.
     """
     self.y_median = None
     self.calibrate = calibrate_probabilities
     self.trained = False
     # starts out as an empty dict
     self.label_binarizer = {}
     self.base_classifier = ClaimClassifier(epoch, batchsize, learnrate, neurons, num_features)
 def __init__(self, calibrate_probabilities=False):
     """
     Create the model with no claim mean and a default ClaimClassifier.

     ``calibrate_probabilities`` is stored on ``self.calibrate``.
     """
     self.y_mean = None
     self.calibrate = calibrate_probabilities
     # =============================================================
     # READ ONLY IF WANTING TO CALIBRATE
     # Place your base classifier here
     # NOTE: The base estimator must have:
     #    1. A .fit method that takes two arguments, X, y
     #    2. Either a .predict_proba method or a decision
     #       function method that returns classification scores
     #
     # Note that almost every classifier you can find has both.
     # If the one you wish to use does not then speak to one of the TAs
     #
     # If you wish to use the classifier in part 2, you will need
     # to implement a predict_proba for it before use
     # =============================================================
     self.base_classifier = ClaimClassifier()
Пример #4
0
class PricingModel():
    # YOU ARE ALLOWED TO ADD MORE ARGUMENTS AS NECESSARY
    def __init__(self, calibrate_probabilities=False):
        """Create an untrained pricing model.

        Parameters
        ----------
        calibrate_probabilities : bool, optional
            Stored on ``self.calibrate``; ``fit`` reads it to choose between
            the calibrated classifier and the network found by the search.
        """
        # mean of the non-zero claim amounts, set by fit()
        self.y_mean = None
        self.calibrate = calibrate_probabilities
        # =============================================================
        # READ ONLY IF WANTING TO CALIBRATE
        # Place your base classifier here
        # NOTE: The base estimator must have:
        #    1. A .fit method that takes two arguments, X, y
        #    2. Either a .predict_proba method or a decision
        #       function method that returns classification scores
        #
        # Note that almost every classifier you can find has both.
        # If the one you wish to use does not then speak to one of the TAs
        #
        # If you wish to use the classifier in part 2, you will need
        # to implement a predict_proba for it before use
        # =============================================================
        self.base_classifier = ClaimClassifier(
        )  # ADD YOUR BASE CLASSIFIER HERE

    # YOU ARE ALLOWED TO ADD MORE ARGUMENTS AS NECESSARY TO THE _preprocessor METHOD
    def _preprocessor(self, X_raw, training=False):
        """Turn the raw table into a numeric feature matrix.

        Thirteen numeric columns are standardised with stored means and
        standard deviations, and five categorical columns are encoded with
        one ``LabelBinarizer`` each. When a categorical column yields more
        than one indicator column, its last indicator is dropped.

        Earlier versions re-fitted a single shared binarizer for every
        column and overwrote the result on each pass, so only the last
        categorical column reached the output. Every column is now kept.

        Parameters
        ----------
        X_raw : pandas.DataFrame
            Raw data containing at least the columns listed below.
        training : bool, optional
            When True, the means, standard deviations and binarizers are
            fitted on ``X_raw`` and stored on the instance. When False the
            stored values are reused, so a training call must come first.

        Returns
        -------
        numpy.ndarray
            The standardised numeric columns followed by the indicator
            columns of each categorical column, in ``headers`` order.
        """
        part2_headers = [
            'drv_age1', 'vh_age', 'vh_cyl', 'vh_din', 'pol_bonus',
            'vh_sale_begin', 'vh_sale_end', 'vh_value', 'vh_speed',
            'drv_age_lic1', 'pol_duration', 'pol_sit_duration', 'drv_age2'
        ]
        headers = [
            'drv_sex1', 'vh_type', 'pol_coverage', 'pol_usage', 'pol_payd'
        ]

        required_attributes = np.array(X_raw[part2_headers])

        if training:
            self.means = np.mean(required_attributes, axis=0)
            self.std_dev = np.std(required_attributes, axis=0)
            # One independent binarizer per column, so each keeps the
            # classes of its own column for later prediction calls.
            self.saved_binarizers = [
                LabelBinarizer().fit(X_raw[header]) for header in headers
            ]

        blocks = [(required_attributes - self.means) / self.std_dev]
        for binarizer, header in zip(self.saved_binarizers, headers):
            binarized = np.asarray(binarizer.transform(X_raw[header]))
            if binarized.shape[1] > 1:
                # drop the last indicator column
                binarized = binarized[:, :-1]
            blocks.append(binarized)

        return np.hstack(blocks)

    def fit(self, X_raw, y_raw, claims_raw):
        """Train the claim classifier and record the mean claim size.

        The pre-processed data is split 80/20. Training rows labelled 1 are
        re-sampled with replacement up to the number of rows labelled 0,
        and ``part2.ClaimClassifierHyperParameterSearch`` is run on the
        result with the validation part held out.

        Parameters
        ----------
        X_raw : pandas.DataFrame
            Raw data as accepted by ``_preprocessor``.
        y_raw : array-like
            Binary target, either one dimensional or a single column.
        claims_raw : ndarray
            Claim amounts; the mean of the non-zero entries is stored on
            ``self.y_mean``.

        Returns
        -------
        object
            ``self.base_classifier`` after training: the calibrated
            classifier when ``self.calibrate`` is set, otherwise the
            network returned by the search.
        """
        nnz = np.where(claims_raw != 0)[0]
        self.y_mean = np.mean(claims_raw[nnz])

        X_clean = self._preprocessor(X_raw, training=True)

        # Split into training/validation
        training_x, validation_x, training_y, validation_y = train_test_split(
            X_clean, y_raw, test_size=0.2)

        # Labels are forced into one column so they can be stacked next to
        # the features whether y_raw was 1-D or already shaped (n, 1).
        training_y = np.asarray(training_y).reshape(-1, 1)
        total_train = pd.DataFrame(np.append(training_x, training_y, axis=1))

        df_class_0 = total_train[total_train.iloc[:, -1] == 0]
        df_class_1 = total_train[total_train.iloc[:, -1] == 1]

        # Up-sample class 1 to the size of class 0
        total_train_class_1_over = df_class_1.sample(len(df_class_0),
                                                     replace=True)
        total_train = np.array(
            pd.concat([df_class_0, total_train_class_1_over], axis=0))

        new_train_x = total_train[:, :-1]
        new_train_y = np.expand_dims(total_train[:, -1], 1)
        variables = new_train_x.shape[1]

        validation_x = np.array(validation_x)
        validation_y = np.array(validation_y)

        # Find best parameters and best classifier
        best_lr, best_epochs, multiplier, best_net = \
            part2.ClaimClassifierHyperParameterSearch(new_train_x, new_train_y, validation_x, validation_y, variables,
                                                      pricing=True)
        print("Best lr = " + str(best_lr))
        print("Best epochs = " + str(best_epochs))
        print("Multiplier = " + str(multiplier))

        if self.calibrate:
            self.base_classifier = fit_and_calibrate_classifier(
                self.base_classifier, X_clean, y_raw)
        else:
            self.base_classifier = best_net  # Set classifier to model found

        return self.base_classifier

    def predict_claim_probability(self, X_raw):
        """Classifier probability prediction function.

        Here you will implement the predict function for your classifier.

        Parameters
        ----------
        X_raw : ndarray
            This is the raw data as downloaded

        Returns
        -------
        ndarray
            A one dimensional array of the same length as the input with
            values corresponding to the probability of beloning to the
            POSITIVE class (that had accidents)
        """
        # =============================================================
        # REMEMBER TO A SIMILAR LINE TO THE FOLLOWING SOMEWHERE IN THE CODE
        copyOfData = X_raw
        X_clean = self._preprocessor(copyOfData)
        self.base_classifier.eval()
        X = torch.Tensor(X_clean)
        oupt = self.base_classifier(X)
        prob_y = oupt.detach().numpy()
        #pred_y, prob_y = self.base_classifier.predict_probabilities(X_clean, pricing=True)

        return prob_y

    def predict_premium(self, X_raw):
        """Predicts premiums based on the pricing model.

        Here you will implement the predict function for your classifier.

        Parameters
        ----------
        X_raw : numpy.ndarray
            A numpy array, this is the raw data as downloaded

        Returns
        -------
        numpy.ndarray
            A one dimensional array of the same length as the input with
            values corresponding to the probability of belonging to the
            POSITIVE class (that had accidents)
        """
        # =============================================================
        # REMEMBER TO INCLUDE ANY PRICING STRATEGY HERE.
        # For example you could scale all your prices down by a factor
        premium_factor = 0.20
        premiums = self.predict_claim_probability(
            X_raw) * self.y_mean * premium_factor
        premiums = np.array(premiums)
        premiums = premiums.flatten()

        return premiums

    def save_model(self):
        """Pickle this instance to ``part3_pricing_model.pickle``.

        The path is relative, and the file is opened in ``'wb'`` mode, so
        an existing file of that name is overwritten.
        """
        # =============================================================
        with open('part3_pricing_model.pickle', 'wb') as target:
            pickle.dump(self, target)
Пример #5
0
import pandas as pd
import numpy as np
from part2_claim_classifier import ClaimClassifier
from sklearn.metrics import accuracy_score
# Demo script: train a ClaimClassifier on the part 2 data, then check how it
# predicts the rows whose last column equals 1.
dataset = pd.read_csv("part2_data.csv").values
# first nine columns are the inputs, the last column is the target
X = dataset[:, 0:9]
Y = dataset[:, -1]
nn = ClaimClassifier()
nn.fit(X, Y)
nn.evaluate_architecture(X, Y)
# nn.save_model()
# print(nn.predict(X))
#model = nn.fit_skl(X,Y)
# keep only the rows whose last column is 1; X and Y are rebound to them
data_test = dataset[np.where(dataset[:, -1] == 1)]
X = data_test[:, 0:9]
Y = data_test[:, -1]
y_pred = nn.predict(X)
print(y_pred)
# every Y here is 1, so this prints the share of those rows predicted as 1
print(accuracy_score(Y, y_pred))
Пример #6
0
class PricingModelLinear():
    # YOU ARE ALLOWED TO ADD MORE ARGUMENTS AS NECESSARY
    def __init__(self, calibrate_probabilities=False):
        """Create an untrained pricing model around ``Insurance_NN_4``.

        Parameters
        ----------
        calibrate_probabilities : bool, optional
            Stored on ``self.calibrate``; ``fit`` reads it to decide
            whether to calibrate the classifier.
        """
        # mean of the non-zero claim amounts, set by fit()
        self.y_mean = None
        self.calibrate = calibrate_probabilities
        # =============================================================
        # READ ONLY IF WANTING TO CALIBRATE
        # Place your base classifier here
        # NOTE: The base estimator must have:
        #    1. A .fit method that takes two arguments, X, y
        #    2. Either a .predict_proba method or a decision
        #       function method that returns classification scores
        #
        # Note that almost every classifier you can find has both.
        # If the one you wish to use does not then speak to one of the TAs
        #
        # If you wish to use the classifier in part 2, you will need
        # to implement a predict_proba for it before use
        # =============================================================
        self.base_classifier = ClaimClassifier(
            Insurance_NN_4())  # ADD YOUR BASE CLASSIFIER HERE

    # YOU ARE ALLOWED TO ADD MORE ARGUMENTS AS NECESSARY TO THE _preprocessor METHOD
    def _preprocessor(self, X_raw):
        """Select, encode and min-max scale nine feature columns.

        Parameters
        ----------
        X_raw : pandas.DataFrame
            Raw data containing at least the nine columns selected below.

        Returns
        -------
        numpy.ndarray
            ``float32`` array with one column per selected feature.
        """
        # Work on a deep copy so the caller's frame is never modified
        X_raw = copy.deepcopy(X_raw[[
            'pol_coverage', 'vh_age', 'vh_din', 'vh_fuel', 'vh_sale_begin',
            'vh_sale_end', 'vh_speed', 'vh_value', 'vh_weight'
        ]])

        # NOTE(review): rows with a missing value are removed, so the result
        # can have fewer rows than the input and no longer line up with the
        # labels handed to fit() - confirm this is intended.
        X_raw.dropna(how="any", inplace=True)
        X_raw = self.integer_encode(X_raw)

        if not isinstance(X_raw, np.ndarray):
            # np.float was only an alias of the builtin float and was
            # removed in NumPy 1.24.
            X_raw = X_raw.to_numpy(dtype=float)

        # NOTE(review): the scaler is fitted on every call, so prediction
        # data is scaled by its own range rather than the training range.
        min_max_scaler = preprocessing.MinMaxScaler()
        X_raw = min_max_scaler.fit_transform(X_raw)

        return X_raw.astype(np.float32)

    def fit(self, X_raw, y_raw, claims_raw, prepro=True):
        """Classifier training function.

        Here you will use the fit function for your classifier.

        Parameters
        ----------
        X_raw : ndarray
            This is the raw data as downloaded
        y_raw : ndarray
            A one dimensional array, this is the binary target variable
        claims_raw: ndarray
            A one dimensional array which records the severity of claims

        Returns
        -------
        self: (optional)
            an instance of the fitted model

        """
        nnz = np.where(claims_raw != 0)[0]
        self.y_mean = np.mean(claims_raw[nnz])
        # =============================================================
        # REMEMBER TO A SIMILAR LINE TO THE FOLLOWING SOMEWHERE IN THE CODE
        if prepro:
            X_clean = self._preprocessor(X_raw)
        else:
            X_clean = X_raw

        # THE FOLLOWING GETS CALLED IF YOU WISH TO CALIBRATE YOUR PROBABILITES
        if self.calibrate:
            self.base_classifier = fit_and_calibrate_classifier(
                self.base_classifier, X_clean, y_raw)
            self.save_model()
        else:
            self.base_classifier = self.base_classifier.fit(X_clean, y_raw)
            self.save_model()
        return self.base_classifier

    def predict_claim_probability(self, X_raw):
        """Classifier probability prediction function.

        Here you will implement the predict function for your classifier.

        Parameters
        ----------
        X_raw : ndarray
            This is the raw data as downloaded

        Returns
        -------
        ndarray
            A one dimensional array of the same length as the input with
            values corresponding to the probability of beloning to the
            POSITIVE class (that had accidents)
        """
        # =============================================================
        # REMEMBER TO A SIMILAR LINE TO THE FOLLOWING SOMEWHERE IN THE CODE
        X_clean = self._preprocessor(X_raw)
        #X_clean = X_raw
        # return probabilities for the positive class (label 1)
        return self.base_classifier.predict_proba(X_clean)[:, 1]

    def predict_premium(self, X_raw):
        """Predicts premiums based on the pricing model.

        Here you will implement the predict function for your classifier.

        Parameters
        ----------
        X_raw : numpy.ndarray
            A numpy array, this is the raw data as downloaded

        Returns
        -------
        numpy.ndarray
            A one dimensional array of the same length as the input with
            values corresponding to the probability of beloning to the
            POSITIVE class (that had accidents)
        """
        # =============================================================
        # REMEMBER TO INCLUDE ANY PRICING STRATEGY HERE.
        # For example you could scale all your prices down by a factor

        return self.predict_claim_probability(X_raw) * self.y_mean * 0.2725

    def save_model(self):
        """Pickle this instance to ``part3_pricing_model_linear.pickle``.

        The path is relative, and the file is opened in ``'wb'`` mode, so
        an existing file of that name is overwritten.
        """
        # =============================================================
        with open('part3_pricing_model_linear.pickle', 'wb') as target:
            pickle.dump(self, target)

    def load_data(self, filename):
        """
        Function to load data from file
        Args:
            filename (str) - name of .txt file you are loading data from
        Output:
            (x, y) (tuple) - x: 2D array of training data where each row
            corresponds to a different sample and each column corresponds to a
            different attribute.
                            y: 1D array where each index corresponds to the
            ground truth label of the sample x[index][]
        """

        dat = pd.read_csv("part3_training_data.csv")
        #dat.drop(columns=["drv_sex2"], inplace=True)
        #dat.dropna(how="any", inplace=True)
        x = dat.drop(columns=["claim_amount", "made_claim"])
        y = dat["made_claim"]
        y1 = dat["claim_amount"]
        y2 = y1[y1 != 0]

        return x, y, y2.to_numpy(), y1

    def separate_pos_neg(self, x, y):
        """Split samples into those labelled 1 and all the others.

        Rows are gathered once per class rather than re-stacked inside the
        loop, which copied the growing array on every sample.

        Parameters
        ----------
        x : ndarray, shape (n_samples, n_features)
            Feature rows, indexed as ``x[i]``.
        y : ndarray, shape (n_samples,)
            Labels, indexed as ``y[i]``.

        Returns
        -------
        tuple
            ``((neg_x, neg_y), (pos_x, pos_y))``. The label arrays are
            ``float32``. A class without samples yields a ``float32``
            array of shape ``(0, n_features)``.
        """
        n_features = x.shape[1]

        pos_idx, neg_idx = [], []
        for i in range(y.shape[0]):
            (pos_idx if y[i] == 1 else neg_idx).append(i)

        def _gather(indices):
            # The leading empty float32 block keeps the (0, n_features)
            # shape for an empty class and the same dtype promotion as
            # stacking the rows one at a time.
            rows = [np.empty((0, n_features), np.float32)]
            rows.extend(x[i] for i in indices)
            labels = np.array([y[i] for i in indices], dtype=np.float32)
            return np.vstack(rows), labels

        return _gather(neg_idx), _gather(pos_idx)

    def integer_encode(self, x):
        """
        Encode all columns containing strings with unique numbers for every
        category type
        """
        x = x.to_numpy(dtype=str)
        for att_i in range(x.shape[1]):
            try:
                float(x[0, att_i])

            except ValueError:
                values = x[:, att_i]
                # integer encode
                label_encoder = LabelEncoder()
                integer_encoded = label_encoder.fit_transform(values)
                x[:, att_i] = integer_encoded
        return x.astype(float)
class PricingModel():
    def __init__(self, epoch=100, batchsize=64, learnrate=0.0001, neurons=9, num_features=13, calibrate_probabilities=False):
        """
        Create an untrained pricing model around a fresh ClaimClassifier.

        ``epoch``, ``batchsize``, ``learnrate``, ``neurons`` and
        ``num_features`` are passed positionally, in that order, to
        ``ClaimClassifier``. ``calibrate_probabilities`` is stored on
        ``self.calibrate``.
        """
        # median of the non-zero claim amounts, set by fit()
        self.y_median = None
        self.calibrate = calibrate_probabilities
        # set to True at the end of fit(); _preprocessor reads it to choose
        # between fitting and reusing the binarizers
        self.trained = False
        # column name -> LabelBinarizer, filled by _preprocessor
        self.label_binarizer = {}
        self.base_classifier = ClaimClassifier(epoch, batchsize, learnrate, neurons, num_features)

        # =============================================================
        # READ ONLY IF WANTING TO CALIBRATE
        # Place your base classifier here

        # NOTE: The base estimator must have:
        #    1. A .fit method that takes two arguments, X, y
        #    2. Either a .predict_proba method or a decision
        #       function method that returns classification scores
        #
        # Note that almost every classifier you can find has both.
        # If the one you wish to use does not then speak to one of the TAs
        #
        # If you wish to use the classifier in part 2, you will need
        # to implement a predict_proba for it before use
        # =============================================================


    def _balance_dataset(self, X_y_raw):
        """Function to balance dataset used for training/validation/testing

        This function balances the dataset so it contains an equal number of
        Class 0 and Class 1 events

        Parameters
        ----------
        X_y_raw : ndarray
            An array, this is the raw data

        Returns
        -------
        X_y_balanced: ndarray
            An array, but balanced for each Class
        """
        # Seperate dataset into Class 0 and Class 1 events
        class_0 = X_y_raw[X_y_raw[:,-1] == 0]
        class_1 = X_y_raw[X_y_raw[:,-1] == 1]

        # Shuffle Class_0 events
        np.random.shuffle(class_0)

        # Take Subset of Class_0 events of equal size to Class 1 events
        class_1_size = class_1.shape[0]
        class_0_subset = class_0[:class_1_size,]
        X_y_balanced = np.vstack((class_0_subset,class_1))

        # Shuffle combined balanced dataset before returning
        np.random.shuffle(X_y_balanced)

        return X_y_balanced


    def _preprocessor(self, X_raw):
        """Data preprocessing function.

        This function prepares the features of the data for training,
        evaluation, and prediction.

        Parameters
        ----------
        X_raw : ndarray
            An array, this is the raw data as downloaded

        Returns
        -------
        X: ndarray
            A clean data set that is used for training and prediction.
        """

        features_to_keep = ['pol_coverage', 'vh_age', 'vh_din', 'vh_fuel', 'vh_sale_begin', 'vh_sale_end', 'vh_speed', 'vh_weight']
        X_pre = X_raw[features_to_keep]

        for col in features_to_keep:

            if X_pre.dtypes[col] != 'float64' and X_pre.dtypes[col] != 'int64':

                X_pre[col].fillna("empty")

                if col not in self.label_binarizer.keys():
                    self.label_binarizer[col] = LabelBinarizer()

                if self.trained == False:
                    X_pre = X_pre.join(pd.DataFrame(self.label_binarizer[col].fit_transform(X_pre[col]),
                                                    columns=self.label_binarizer[col].classes_,
                                                    index=X_pre.index))
                else:
                    X_pre = X_pre.join(pd.DataFrame(self.label_binarizer[col].transform(X_pre[col]),
                                                    columns=self.label_binarizer[col].classes_,
                                                    index=X_pre.index))

                X_pre = X_pre.drop(columns=col)
            else:
                mean = np.nanmean(X_pre[col].values)
                X_pre[col].fillna(mean)

        return X_pre

    def fit(self, X_raw, y_raw, claims_raw):
        """Classifier training function.

        Here you will use the fit function for your classifier.

        Parameters
        ----------
        X_raw : ndarray
            This is the raw data as downloaded
        y_raw : ndarray
            A one dimensional array, this is the binary target variable
        claims_raw: ndarray
            A one dimensional array which records the severity of claims

        Returns
        -------
        self: (optional)
            an instance of the fitted model

        """
        nnz = np.where(claims_raw != 0)[0]
        self.y_median = np.median(claims_raw[nnz])

        X_clean = self._preprocessor(X_raw)
        X_Y_pandas = pd.concat([X_clean, y_raw], axis=1).reindex(X_clean.index)
        X_Y_clean = X_Y_pandas.to_numpy()

        X_Y_clean_balanced = self._balance_dataset(X_Y_clean)

        X_clean_balanced = pd.DataFrame(X_Y_clean_balanced[:,:-1])
        y_clean_balanced = pd.DataFrame(X_Y_clean_balanced[:,-1:])

        X_clean = X_clean_balanced
        y_raw = y_clean_balanced

        if self.calibrate:
            self.base_classifier = fit_and_calibrate_classifier(
                self.base_classifier, X_clean, y_raw)
        else:
            self.base_classifier = self.base_classifier.fit(X_clean, y_raw)

        self.trained = True
        return self.base_classifier

    def predict_claim_probability(self, X_raw):
        """Classifier probability prediction function.

        Here you will implement the predict function for your classifier.

        Parameters
        ----------
        X_raw : ndarray
            This is the raw data as downloaded

        Returns
        -------
        ndarray
            A one dimensional array of the same length as the input with
            values corresponding to the probability of beloning to the
            POSITIVE class (that had accidents)
        """

        X_clean = self._preprocessor(X_raw)
        return self.base_classifier.predict(X_clean)


    def predict_premium(self, X_raw):
        """Predicts premiums based on the pricing model.

        Here you will implement the predict function for your classifier.

        Parameters
        ----------
        X_raw : numpy.ndarray
            A numpy array, this is the raw data as downloaded

        Returns
        -------
        numpy.ndarray
            A one dimensional array of the same length as the input with
            values corresponding to the probability of beloning to the
            POSITIVE class (that had accidents)
        """

        factor = 0.8 # 0.8 has taken account of both the inflation and investment returns expected
        return self.predict_claim_probability(X_raw) * self.y_median * factor

    def save_model(self):
        """Pickle this instance to ``part3_pricing_model.pickle``.

        The path is relative, and the file is opened in ``'wb'`` mode, so
        an existing file of that name is overwritten.
        """
        # =============================================================
        with open('part3_pricing_model.pickle', 'wb') as target:
            pickle.dump(self, target)


    def evaluate_architecture(self, X_test, Y_test):
        X = self._preprocessor(X_test)
        return self.base_classifier.evaluate_architecture(X, Y_test)
class PricingModelLinear():
    # YOU ARE ALLOWED TO ADD MORE ARGUMENTS AS NECESSARY
    def __init__(self, calibrate_probabilities=False):
        """Create an untrained pricing model with a default ClaimClassifier.

        Parameters
        ----------
        calibrate_probabilities : bool, optional
            Stored on ``self.calibrate``; ``fit`` reads it to choose between
            calibration and the model search.
        """
        # mean of the non-zero claim amounts, set by fit()
        self.y_mean = None
        self.calibrate = calibrate_probabilities
        # =============================================================
        # READ ONLY IF WANTING TO CALIBRATE
        # Place your base classifier here
        # NOTE: The base estimator must have:
        #    1. A .fit method that takes two arguments, X, y
        #    2. Either a .predict_proba method or a decision
        #       function method that returns classification scores
        #
        # Note that almost every classifier you can find has both.
        # If the one you wish to use does not then speak to one of the TAs
        #
        # If you wish to use the classifier in part 2, you will need
        # to implement a predict_proba for it before use
        # =============================================================
        self.base_classifier = ClaimClassifier()

    # YOU ARE ALLOWED TO ADD MORE ARGUMENTS AS NECESSARY TO THE _preprocessor METHOD
    def _preprocessor(self, X_raw, training=False):
        """Turn the raw table into a numeric feature matrix.

        Thirteen numeric columns are standardised with stored means and
        standard deviations, and four categorical columns are encoded with
        one ``LabelBinarizer`` each. When a categorical column yields more
        than one indicator column, its last indicator is dropped.

        Earlier versions re-fitted a single shared binarizer for every
        column and overwrote the result on each pass, so only the last
        categorical column reached the output. Every column is now kept.

        Parameters
        ----------
        X_raw : pandas.DataFrame
            Raw data containing at least the columns listed below.
        training : bool, optional
            When True, the means, standard deviations and binarizers are
            fitted on ``X_raw`` and stored on the instance. When False the
            stored values are reused, so a training call must come first.

        Returns
        -------
        numpy.ndarray
            The standardised numeric columns followed by the indicator
            columns of each categorical column, in ``headers`` order.
        """
        part2_headers = [
            "drv_age1", 'vh_age', 'vh_cyl', 'vh_din', 'pol_bonus',
            'vh_sale_begin', 'vh_sale_end', 'vh_value', 'vh_speed',
            'drv_age_lic1', 'pol_duration', 'pol_sit_duration', 'drv_age2'
        ]
        headers = ['drv_sex1', 'vh_type', 'pol_coverage', 'pol_usage']

        required_attributes = np.array(X_raw[part2_headers])

        if training:
            self.means = np.mean(required_attributes, axis=0)
            self.std_dev = np.std(required_attributes, axis=0)
            # One independent binarizer per column, so each keeps the
            # classes of its own column for later prediction calls.
            self.saved_binarizers = [
                LabelBinarizer().fit(X_raw[header]) for header in headers
            ]

        blocks = [(required_attributes - self.means) / self.std_dev]
        for binarizer, header in zip(self.saved_binarizers, headers):
            binarized = np.asarray(binarizer.transform(X_raw[header]))
            if binarized.shape[1] > 1:
                # drop the last indicator column
                binarized = binarized[:, :-1]
            blocks.append(binarized)

        return np.hstack(blocks)

    def fit(self, X_raw, y_raw, claims_raw):
        """Train the claim classifier and record the mean claim size.

        Parameters
        ----------
        X_raw : pandas.DataFrame
            Raw data as accepted by ``_preprocessor``.
        y_raw : array-like
            Binary target passed on to the training routine.
        claims_raw : ndarray
            Claim amounts; the mean of the non-zero entries is stored on
            ``self.y_mean``.

        Returns
        -------
        object
            ``self.base_classifier`` after training: the calibrated
            classifier when ``self.calibrate`` is set, otherwise the model
            returned by ``find_best_model``.
        """
        claim_rows = np.where(claims_raw != 0)[0]
        self.y_mean = np.mean(claims_raw[claim_rows])

        # training=True fits the scaling statistics and the binarizers
        X_clean = self._preprocessor(X_raw, training=True)

        if not self.calibrate:
            self.base_classifier = find_best_model(X_clean, y_raw)
            return self.base_classifier

        self.base_classifier = fit_and_calibrate_classifier(
            self.base_classifier, X_clean, y_raw)
        return self.base_classifier

    def predict_claim_probability(self, X_raw):
        """Classifier probability prediction function.

        Here you will implement the predict function for your classifier.

        Parameters
        ----------
        X_raw : ndarray
            This is the raw data as downloaded

        Returns
        -------
        ndarray
            A one dimensional array of the same length as the input with
            values corresponding to the probability of beloning to the
            POSITIVE class (that had accidents)
        """
        # =============================================================
        # REMEMBER TO A SIMILAR LINE TO THE FOLLOWING SOMEWHERE IN THE CODE
        X_clean = self._preprocessor(X_raw)
        pred, prob_y = self.base_classifier.predict_probabilities(X_clean,
                                                                  pricing=True)

        return prob_y

    def predict_premium(self, X_raw):
        """Predicts premiums based on the pricing model.

        Here you will implement the predict function for your classifier.

        Parameters
        ----------
        X_raw : numpy.ndarray
            A numpy array, this is the raw data as downloaded

        Returns
        -------
        numpy.ndarray
            A one dimensional array of the same length as the input with
            values corresponding to the probability of beloning to the
            POSITIVE class (that had accidents)
        """
        # =============================================================
        # REMEMBER TO INCLUDE ANY PRICING STRATEGY HERE.
        # For example you could scale all your prices down by a factor

        premium_factor = 0.27
        premiums = self.predict_claim_probability(
            X_raw) * self.y_mean * premium_factor
        premiums = np.array(premiums)
        premiums = premiums.flatten()

        return premiums

    def save_model(self):
        """Pickle this instance to ``part3_pricing_model_linear.pickle``.

        The path is relative, and the file is opened in ``'wb'`` mode, so
        an existing file of that name is overwritten.
        """
        # =============================================================
        with open('part3_pricing_model_linear.pickle', 'wb') as target:
            pickle.dump(self, target)
class PricingModel():
    # YOU ARE ALLOWED TO ADD MORE ARGUMENTS AS NECESSARY
    def __init__(self, calibrate_probabilities=False):
        """Create an unfitted pricing model.

        Parameters
        ----------
        calibrate_probabilities : bool
            Stored as ``self.calibrate``. When true, ``fit`` trains the base
            classifier through ``fit_and_calibrate_classifier`` instead of
            calling its ``fit`` method directly.
        """
        self.calibrate = calibrate_probabilities

        # Claim statistics; they stay None until ``fit`` fills them in.
        self.y_mean = None
        self.y_std = None

        # Base classifier used for the claim probability.
        # For calibration the estimator must offer:
        #    1. a .fit method taking two arguments, X and y
        #    2. either a .predict_proba method or a decision function that
        #       returns classification scores
        network = Insurance_NN_3()
        self.base_classifier = ClaimClassifier(network)

    # YOU ARE ALLOWED TO ADD MORE ARGUMENTS AS NECESSARY TO THE _preprocessor METHOD
    def _preprocessor(self, X_raw):
        """Data preprocessing function.

        Keeps nine vehicle/policy columns, drops incomplete rows, integer
        encodes the categorical columns and min-max scales every column.

        Parameters
        ----------
        X_raw : pandas.DataFrame
            The raw data as downloaded. It must support selection by the
            column labels listed below, so a plain ndarray will not work.

        Returns
        -------
        X: ndarray
            A float32 array with one column per selected feature, used for
            training and prediction.
        """
        feature_columns = [
            'pol_coverage', 'vh_age', 'vh_din', 'vh_fuel', 'vh_sale_begin',
            'vh_sale_end', 'vh_speed', 'vh_value', 'vh_weight'
        ]

        # Selecting a column list yields a new frame, so the caller's data is
        # never modified by the row drop.
        # NOTE(review): dropping rows means the output can have fewer rows
        # than the input, while `fit` passes `y_raw` through unfiltered.
        features = X_raw[feature_columns].dropna(how="any")

        # `np.float` was removed from NumPy; the builtin `float` is the
        # equivalent dtype. `asarray` is a no-op for the float ndarray that
        # `integer_encode` returns.
        encoded = np.asarray(self.integer_encode(features), dtype=float)

        # NOTE(review): the scaler is re-fitted on every call, so data passed
        # at prediction time is scaled by its own min/max rather than by the
        # training range.
        min_max_scaler = preprocessing.MinMaxScaler()
        scaled = min_max_scaler.fit_transform(encoded)

        return scaled.astype(np.float32)

    def fit(self, X_raw, y_raw, claims_raw, prepro=True):
        """Classifier training function.

        Here you will use the fit function for your classifier.

        Parameters
        ----------
        X_raw : ndarray
            This is the raw data as downloaded
        y_raw : ndarray
            A one dimensional array, this is the binary target variable
        claims_raw: ndarray
            A one dimensional array which records the severity of claims

        Returns
        -------
        self: (optional)
            an instance of the fitted model

        """
        nnz = np.where(claims_raw != 0)[0]
        self.y_mean = np.mean(claims_raw[nnz])
        self.y_std = np.std(claims_raw[nnz])
        print(self.y_mean, self.y_std)
        # =============================================================
        # REMEMBER TO A SIMILAR LINE TO THE FOLLOWING SOMEWHERE IN THE CODE
        if prepro:
            X_clean = self._preprocessor(X_raw)
        else:
            X_clean = X_raw

        # THE FOLLOWING GETS CALLED IF YOU WISH TO CALIBRATE YOUR PROBABILITES
        if self.calibrate:
            self.base_classifier = fit_and_calibrate_classifier(
                self.base_classifier, X_clean, y_raw)
            self.save_model()
        else:
            self.base_classifier.fit(X_clean, y_raw)
            self.save_model()
        return self

    def predict_claim_probability(self, X_raw):
        """Classifier probability prediction function.

        Here you will implement the predict function for your classifier.

        Parameters
        ----------
        X_raw : ndarray
            This is the raw data as downloaded

        Returns
        -------
        ndarray
            A one dimensional array of the same length as the input with
            values corresponding to the probability of beloning to the
            POSITIVE class (that had accidents)
        """
        # =============================================================
        # REMEMBER TO A SIMILAR LINE TO THE FOLLOWING SOMEWHERE IN THE CODE
        X_clean = self._preprocessor(X_raw)

        # return probabilities for the positive class (label 1)
        return self.base_classifier.predict_proba(X_clean)[:, 1]

    def predict_premium(self, X_raw):
        """Predicts premiums based on the pricing model.

        Here you will implement the predict function for your classifier.

        Parameters
        ----------
        X_raw : numpy.ndarray
            A numpy array, this is the raw data as downloaded

        Returns
        -------
        numpy.ndarray
            A one dimensional array of the same length as the input with
            values corresponding to the probability of beloning to the
            POSITIVE class (that had accidents)
        """
        # =============================================================
        # REMEMBER TO INCLUDE ANY PRICING STRATEGY HERE.
        # For example you could scale all your prices down by a factor

        return self.predict_claim_probability(X_raw) * self.y_mean * 0.2775

    def save_model(self):
        """Pickle this instance to ``part3_pricing_model.pickle``.

        The path is relative, so the file is written to the current working
        directory and silently replaces any previous file of that name.
        """
        pickle_path = 'part3_pricing_model.pickle'
        with open(pickle_path, 'wb') as handle:
            pickle.dump(self, handle)

    # -------- NEW FUNCTIONS -----------

    def load_data(self, filename):
        """
        Function to load data from file
        Args:
            filename (str) - path of the CSV file to load. It must contain
            the columns "claim_amount" and "made_claim".
        Output:
            (x, y, nonzero_claims, claim_amount) (tuple) -
                x: DataFrame holding every column except "claim_amount" and
            "made_claim"; each row corresponds to a different sample.
                y: Series with the "made_claim" column.
                nonzero_claims: ndarray with the "claim_amount" values that
            are different from zero.
                claim_amount: Series with the full "claim_amount" column.
        """
        # Read the file the caller asked for rather than a fixed path.
        dat = pd.read_csv(filename)

        x = dat.drop(columns=["claim_amount", "made_claim"])
        y = dat["made_claim"]
        claim_amount = dat["claim_amount"]
        nonzero_claims = claim_amount[claim_amount != 0]

        return x, y, nonzero_claims.to_numpy(), claim_amount

    def set_axis_style(self, ax, labels):
        """Label the x axis of ``ax`` with one tick per entry of ``labels``.

        Ticks are placed at 1..len(labels), point outwards and sit on the
        bottom edge; the axis label is set to 'Sample name'.
        """
        count = len(labels)

        x_axis = ax.get_xaxis()
        x_axis.set_tick_params(direction='out')
        ax.xaxis.set_ticks_position('bottom')

        tick_positions = np.arange(1, count + 1)
        ax.set_xticks(tick_positions)
        ax.set_xticklabels(labels)

        # Leave a margin around the first and the last tick.
        ax.set_xlim(0.25, count + 0.75)
        ax.set_xlabel('Sample name')

    def evaluate_input1(self, X_raw):
        """
        Plot the distribution of every column of X_raw.

        A box plot is saved to "box_3.pdf", then the axes are cleared and a
        violin plot of the same columns is saved to "violin_3.pdf".
        """
        attributes = [X_raw[:, col] for col in range(np.shape(X_raw)[1])]

        _, axes = plt.subplots(figsize=(18, 4))

        # NOTE(review): the labels skip 14 and 23 and repeat 24, giving 34
        # entries; presumably X_raw is expected to have 34 columns -- confirm.
        labels = [
            1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 15, 16, 17, 18, 19, 20,
            21, 22, 24, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35
        ]

        def _decorate_and_save(path):
            # Shared finishing steps for both plots; the axis captions set
            # here replace the x label chosen by set_axis_style.
            self.set_axis_style(axes, labels)
            plt.subplots_adjust(bottom=0.15, wspace=0.05)
            plt.xlabel("Attribute Type")
            plt.ylabel("Attribute Value")
            plt.savefig(path, bbox_inches='tight')

        axes.boxplot(attributes)
        _decorate_and_save("box_3.pdf")

        # Reuse the same axes for the violin plot.
        plt.cla()
        axes.violinplot(attributes)
        _decorate_and_save("violin_3.pdf")

    def evaluate_input2(self, x, y):
        """
        Compare the column distributions of samples with and without a claim.

        The samples are split with separate_pos_neg and drawn as two stacked
        box plots ("No Claim" on top, "Claim" below), saved to
        "compare_box_3.pdf".
        """
        (neg_x, _), (pos_x, _) = self.separate_pos_neg(x, y)

        # The column count of the negative samples drives both lists.
        columns = range(np.shape(neg_x)[1])
        neg_columns = [neg_x[:, col] for col in columns]
        pos_columns = [pos_x[:, col] for col in columns]

        _, axs = plt.subplots(2, figsize=(11, 11))
        panels = ((axs[0], neg_columns, "No Claim"),
                  (axs[1], pos_columns, "Claim"))

        for axes, data, _ in panels:
            axes.boxplot(data)

        # Tick labels come from the header row of the training CSV.
        labels = np.genfromtxt("part3_training_data.csv",
                               dtype=str,
                               delimiter=',',
                               max_rows=1)

        for axes, _, _ in panels:
            self.set_axis_style(axes, labels)

        for axes, _, title in panels:
            axes.set(xlabel="Attribute Type", ylabel="Attribute Value")
            axes.set_title(title)

        plt.subplots_adjust(bottom=0.15, wspace=0.05)
        plt.savefig("compare_box_3.pdf", bbox_inches='tight')

    def evaluate_input3(self, x, y, split=0):
        """
        Print per-column statistics comparing negative and positive samples.

        For every column the mean of each group, the percentage change of the
        positive mean relative to the negative mean and the two-sample
        Kolmogorov-Smirnov result are collected and printed. Columns whose KS
        statistic exceeds 0.1 with a p-value below 0.001 are printed again
        at the end.

        ``split`` may carry a ready-made ``((neg_x, neg_y), (pos_x, pos_y))``
        pair; with the default of 0 the split is computed from ``x`` and
        ``y``.
        """
        if split != 0:
            (neg_x, _), (pos_x, _) = split
            print(split[0][0].shape, split[1][0].shape)
        else:
            (neg_x, _), (pos_x, _) = self.separate_pos_neg(x, y)

        neg_means = []
        pos_means = []
        pct_change = []
        ks_results = []
        for col in range(np.shape(neg_x)[1]):
            neg_mean = np.mean(neg_x[:, col])
            pos_mean = np.mean(pos_x[:, col])
            neg_means.append(neg_mean)
            pos_means.append(pos_mean)
            # Change of the positive mean in percent of the negative mean.
            pct_change.append(((pos_mean - neg_mean) * 100) / neg_mean)
            ks_results.append(stats.ks_2samp(neg_x[:, col], pos_x[:, col]))
            print(col)

        for summary in (neg_means, pos_means, pct_change, ks_results):
            print(summary)

        # Highlight columns whose distributions differ clearly.
        for col, ks in enumerate(ks_results):
            if ks[0] > 0.1 and ks[1] < 0.001:
                print(col, ks)

    def separate_pos_neg(self, x, y):
        """Split the samples into negative and positive groups.

        A sample is positive when its label equals 1; every other label is
        treated as negative.

        Parameters
        ----------
        x : ndarray
            Two dimensional array with one row per sample.
        y : ndarray or pandas.Series
            One label per row of ``x``. Labels are matched to rows by
            position, never by index label.

        Returns
        -------
        tuple
            ``((neg_x, neg_y), (pos_x, pos_y))``. The label arrays are
            float32; a group without samples yields a float32 array of shape
            ``(0, n_columns)`` and an empty label array.
        """
        features = np.asarray(x)
        labels = np.asarray(y)
        # One flag per row; an (n, 1) label column is accepted as well.
        is_positive = (labels == 1).reshape(labels.shape[0])

        def _take(mask):
            # Pick every flagged row in a single pass. Stacking onto an
            # empty float32 block gives the same dtype promotion as growing
            # the array one row at a time, without the quadratic copying.
            rows = np.empty((0, features.shape[1]), np.float32)
            if mask.any():
                rows = np.vstack((rows, features[mask]))
            targets = np.array(labels[mask].tolist(), dtype=np.float32)
            return rows, targets

        return _take(~is_positive), _take(is_positive)

    def integer_encode(self, x):
        """
        Encode all columns containing strings with unique numbers for every
        category type

        Parameters
        ----------
        x : pandas.DataFrame
            Data whose columns are either numeric or categorical.

        Returns
        -------
        ndarray
            Float array of the same shape. Columns in which every value
            parses as a number keep their values; any other column is
            replaced by integer codes numbered in the sorted order of its
            distinct values.
        """
        raw = x.to_numpy(dtype=str)
        # Writing into a separate float array keeps the codes from being
        # squeezed into the fixed width of the string array.
        encoded = np.empty(raw.shape, dtype=float)

        for att_i in range(raw.shape[1]):
            values = raw[:, att_i]
            try:
                # Judge the whole column, not only its first row, so a single
                # stray string cannot break the final float conversion.
                encoded[:, att_i] = values.astype(float)
            except ValueError:
                # integer encode: position of each value among the sorted
                # distinct values of the column
                _, codes = np.unique(values, return_inverse=True)
                encoded[:, att_i] = codes

        return encoded