Exemplo n.º 1
0
    def _preprocessor(self, X_raw):
        """Prepare raw features for training, evaluation, and prediction.

        A new ``nn.Preprocessor`` is constructed from ``X_raw`` on every call
        and immediately applied to that same array, so whatever parameters
        the preprocessor derives come from the data passed in here.

        Parameters
        ----------
        X_raw : numpy.ndarray
            The raw feature data as downloaded. (The assignment template
            suggested pandas input would be welcome; nothing here converts
            it.)

        Returns
        -------
        numpy.ndarray
            Whatever ``nn.Preprocessor.apply`` returns for ``X_raw`` --
            presumably the cleaned/normalised features; confirm against
            ``nn.Preprocessor``.
        """
        return nn.Preprocessor(X_raw).apply(X_raw)
Exemplo n.º 2
0
def load_model():
    """Load the saved part-3 Keras model and attach it to a ``PricingModel``.

    The preprocessing objects (label encoder, mean imputer and
    ``nn_lib.Preprocessor``) are rebuilt from ``part3_data.csv`` rather than
    loaded from disk, then passed to ``PricingModel`` together with the
    feature-name lists.

    Returns
    -------
    PricingModel
        A model whose ``y_mean`` is the mean of the non-zero entries of the
        second-to-last CSV column, whose ``median_vh_value`` is the median of
        the ``vh_value`` column, and which has been handed the loaded Keras
        model through ``PricingModel.warp``.
    """
    model = tf.keras.models.load_model('part3_pricing_model.h5')
    # NOTE(review): the shuffle mirrors train_model; the statistics computed
    # below do not look order-dependent, so it is probably unnecessary here.
    df = pd.read_csv('part3_data.csv').sample(frac=1)

    numerical_features_names_sel = ['pol_bonus', 'pol_duration', 'pol_sit_duration', 'drv_age1', 'drv_age2',
                                    'vh_age', 'vh_cyl', 'vh_value', 'town_mean_altitude', 'population', 'vh_speed',
                                    'vh_weight']
    categorical_feature_names_sel = ['pol_coverage', 'pol_usage', 'drv_drv2', 'vh_make', 'vh_type',
                                     'vh_fuel', ]

    # NOTE(review): one LabelEncoder is re-fitted for every categorical
    # column, so after this call ``le`` only remembers the classes of the
    # last column it saw. Confirm PricingModel does not rely on ``le`` being
    # fitted for the other columns.
    le = LabelEncoder()
    df2 = df[categorical_feature_names_sel].apply(le.fit_transform)

    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp.fit(df[numerical_features_names_sel])

    # Imputed numerical columns followed by integer-encoded categorical
    # columns; this matrix is what nn_lib.Preprocessor is constructed from.
    num_features = np.concatenate((imp.transform(df[numerical_features_names_sel].to_numpy(dtype=np.float64)),
                                   df2.to_numpy(dtype=np.float64)),
                                  axis=1)
    preprocessor = nn_lib.Preprocessor(num_features)
    pm = PricingModel(preprocessor=preprocessor, imputer=imp, encoder=le, categorical=categorical_feature_names_sel,
                      numerical=numerical_features_names_sel)

    # Mean over the rows whose claim value (second-to-last column) is not 0.
    claims = df[df.columns[-2]].to_numpy()
    pm.y_mean = np.mean(claims[claims != 0])
    pm.median_vh_value = df['vh_value'].median()
    pm.warp(model)
    return pm
Exemplo n.º 3
0
def load_model():
    """Load the saved part-2 Keras model and attach it to a ``ClaimClassifier``.

    The ``nn_lib.Preprocessor`` is rebuilt from the first nine columns of the
    leading 80% of rows in ``part2_data.csv``.

    Returns
    -------
    ClaimClassifier
        A classifier constructed with that preprocessor and handed the loaded
        Keras model through ``ClaimClassifier.warp``.
    """
    model = tf.keras.models.load_model('part2_model.h5')
    data = np.genfromtxt('part2_data.csv', delimiter=',')
    split_index = int(0.8 * data.shape[0])
    # Row 0 is skipped -- presumably the CSV header line, which genfromtxt
    # would parse as NaN; confirm against the data file.
    x_train = data[1:split_index, :9]
    preprocessor = nn_lib.Preprocessor(x_train)
    m = ClaimClassifier(preprocessor)
    m.warp(model)
    return m
Exemplo n.º 4
0
def train_model():
    """Fit a ``PricingModel`` on all of ``part3_data.csv`` and save it.

    The CSV rows are shuffled, then split by column: everything except the
    last two columns is passed to ``fit`` as the features, the last column as
    the second argument and the second-to-last column as the third (claim)
    argument. A label encoder, a mean imputer and an ``nn_lib.Preprocessor``
    are built from the same frame and passed to the ``PricingModel``
    constructor.
    """
    numeric_cols = ['pol_bonus', 'pol_duration', 'pol_sit_duration', 'drv_age1', 'drv_age2',
                    'vh_age', 'vh_cyl', 'vh_value', 'town_mean_altitude', 'population', 'vh_speed',
                    'vh_weight']
    categorical_cols = ['pol_coverage', 'pol_usage', 'drv_drv2', 'vh_make', 'vh_type',
                        'vh_fuel', ]

    frame = pd.read_csv('part3_data.csv').sample(frac=1)
    columns = frame.columns
    features = frame[columns[:-2]].to_numpy()
    targets = frame[columns[-1]].to_numpy()
    claims = frame[columns[-2]].to_numpy()

    # Integer-encode each categorical column with the shared encoder.
    encoder = LabelEncoder()
    encoded_categoricals = frame[categorical_cols].apply(encoder.fit_transform)

    # Mean-impute the numerical columns.
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    imputer.fit(frame[numeric_cols])
    imputed_numericals = imputer.transform(frame[numeric_cols].to_numpy(dtype=np.float64))

    # Numerical columns first, encoded categorical columns after them.
    fit_matrix = np.concatenate(
        (imputed_numericals, encoded_categoricals.to_numpy(dtype=np.float64)),
        axis=1,
    )

    pricing_model = PricingModel(
        preprocessor=nn_lib.Preprocessor(fit_matrix),
        imputer=imputer,
        encoder=encoder,
        categorical=categorical_cols,
        numerical=numeric_cols,
    )
    pricing_model.fit(features, targets, claims)
    pricing_model.save_model()
Exemplo n.º 5
0
def evaluate_model():
    """Train a ``PricingModel`` on 80% of the data and report on the other 20%.

    ``part3_data.csv`` is shuffled and split 80/20 by row. The model is
    fitted on the training rows; the predicted claim probabilities, the score
    returned by ``evaluate_architecture`` and the predicted premiums for the
    test rows are printed.

    The imputer and the matrix given to ``nn_lib.Preprocessor`` are built
    from the training rows only, so no statistics of the held-out rows leak
    into the preprocessing that is being evaluated.
    """
    df = pd.read_csv('part3_data.csv').sample(frac=1)
    split_index = int(0.8 * df.shape[0])
    df_train = df.iloc[:split_index, :]
    df_test = df.iloc[split_index:, :]

    # Features are every column but the last two; the last column is passed
    # as y and the second-to-last as the claim argument.
    x_train = df_train[df_train.columns[:-2]].to_numpy()
    y_train = df_train[df_train.columns[-1]].to_numpy()
    claim_train = df_train[df_train.columns[-2]].to_numpy()
    x_test = df_test[df_test.columns[:-2]].to_numpy()
    y_test = df_test[df_test.columns[-1]].to_numpy()

    numerical_features_names_sel = ['pol_bonus', 'pol_duration', 'pol_sit_duration', 'drv_age1', 'drv_age2',
                                    'vh_age', 'vh_cyl', 'vh_value',  'town_mean_altitude', 'population', 'vh_speed',
                                    'vh_weight']
    categorical_feature_names_sel = ['pol_coverage', 'pol_usage', 'drv_drv2', 'vh_make', 'vh_type',
                                     'vh_fuel', ]

    # The encoder is still fitted on the full frame so that categories which
    # only occur in the test rows receive a code; only the training rows of
    # the encoded frame are used below.
    le = LabelEncoder()
    df2 = df[categorical_feature_names_sel].apply(le.fit_transform)

    # Imputation means come from the training rows only.
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    imp.fit(df_train[numerical_features_names_sel])

    num_features = np.concatenate(
        (imp.transform(df_train[numerical_features_names_sel].to_numpy(dtype=np.float64)),
         df2.iloc[:split_index].to_numpy(dtype=np.float64)),
        axis=1)
    preprocessor = nn_lib.Preprocessor(num_features)
    pm = PricingModel(preprocessor=preprocessor, imputer=imp, encoder=le, categorical=categorical_feature_names_sel,
                      numerical=numerical_features_names_sel)

    pm.fit(x_train, y_train, claim_train)
    predicted_y = pm.predict_claim_probability(x_test)
    print(predicted_y)
    print("AUC score is %.3f" % pm.evaluate_architecture(x_test, y_test, predicted_y))
    print('Premioum', pm.predict_premium(x_test))