class DFCatBoostEncoder(BaseEstimator, TransformerMixin):
    """DataFrame-friendly wrapper around ``CatBoostEncoder``.

    Encodes only the selected columns and re-attaches the untouched
    remainder, so callers get a DataFrame back instead of a bare array.
    """

    def __init__(self, columns=None, **kwargs):
        # ``columns is None`` means "encode every column seen at fit time".
        self.columns = columns
        self.model = CatBoostEncoder(**kwargs)
        self.transform_cols = None

    def fit(self, X, y):
        if self.columns is None:
            self.columns = X.columns
        # Restrict to the requested columns while keeping X's column order.
        self.transform_cols = [col for col in X.columns if col in self.columns]
        self.model.fit(X[self.transform_cols], y)
        return self

    def transform(self, X):
        return self.__encode(X)

    def __encode(self, X, y=None):
        # Guard: mirror sklearn's behaviour when transform precedes fit.
        if self.transform_cols is None:
            raise NotFittedError(
                f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator."
            )
        if y is None:
            encoded = self.model.transform(X[self.transform_cols])
        else:
            encoded = self.model.fit_transform(X[self.transform_cols], y)
        remainder = X.drop(columns=self.transform_cols)
        return pd.concat([remainder, encoded], axis=1)

    def fit_transform(self, X, y):
        # NOTE: Result of fit_transform() is different from fit() + transform()
        # (CatBoost encoding is ordered/target-aware on the fitting data).
        return self.fit(X, y).__encode(X, y)
def encode_cat_features(self, X, y, cat_features, train_mask, val_mask, test_mask):
    """CatBoost-encode the categorical columns of ``X``.

    The encoder is fitted on the training rows only and then applied to
    the validation + test rows; the result is a fully numeric DataFrame
    with the original column labels.
    """
    from category_encoders import CatBoostEncoder

    encoder = CatBoostEncoder()
    values = X.to_numpy(copy=True)
    targets = y.to_numpy(copy=True)

    train_idx = np.ix_(train_mask, cat_features)
    # `val_mask + test_mask` is kept verbatim: it concatenates index lists,
    # or acts as element-wise OR for boolean masks — whichever the caller uses.
    eval_idx = np.ix_(val_mask + test_mask, cat_features)

    values[train_idx] = encoder.fit_transform(values[train_idx], targets[train_mask])
    values[eval_idx] = encoder.transform(values[eval_idx])

    return pd.DataFrame(values.astype(float), columns=X.columns)
def _strip_eur(series):
    """Strip a trailing " EUR" suffix and coerce to numeric (bad values -> NaN).

    Vectorized replacement for the old ``map(lambda s: s.replace(...))``
    approach, which crashed when the column already contained NaN floats.
    """
    return pandas.to_numeric(
        series.astype(str).str.replace(" EUR", "", regex=False),
        errors="coerce")


def reg_model(labelled_data, unlabelled_data):
    """
    Parameters: training dataframe, unknown dataframe
    Returns: results dataframe (Instance, Income)
    ffill on NaN from training data, Replaces NaN in test data with ffill,
    cat-encodes non-numeric fields, scales values, 80/20 splits data to
    help verify model, uses LightGBM
    """
    clean_labelled = labelled_data.copy()
    clean_unlabelled = unlabelled_data.copy()

    work_col = "Work Experience in Current Job [years]"
    add_income_col = "Yearly Income in addition to Salary (e.g. Rental Income)"

    print("cleaning data...")
    # get rid of weird value: force the mixed-type column to numeric
    clean_labelled.loc[:, work_col] = pandas.to_numeric(
        labelled_data[work_col], errors="coerce")
    clean_unlabelled.loc[:, work_col] = pandas.to_numeric(
        unlabelled_data[work_col], errors="coerce")
    print("mixed type issue fixed..")

    # fix additional income field ("1234 EUR" -> 1234.0).
    # BUG FIX: the old code built the array with dtype=np.float, an alias
    # removed in NumPy 1.24 — this crashed on modern NumPy.
    clean_labelled.loc[:, add_income_col] = _strip_eur(clean_labelled[add_income_col])
    clean_unlabelled.loc[:, add_income_col] = _strip_eur(clean_unlabelled[add_income_col])

    # dropping useless columns
    drop_columns(clean_unlabelled)
    drop_columns(clean_labelled)

    # removing NaN values. ffill() replaces the deprecated
    # fillna(method="ffill"); assigning (not inplace=True) avoids the
    # SettingWithCopy hazard on the sliced unlabelled frame.
    clean_labelled = clean_labelled.ffill()
    clean_unlabelled = clean_unlabelled[all_columns].ffill()

    # input data for final predictions
    unknown_data = clean_unlabelled.drop(["Instance"], axis=1)

    print("splitting data into train and test...")
    # 80/20 split, and separating targets
    train_data, train_target, test_data, test_target = split_data(clean_labelled)

    print("encoding categorical data...")
    # categorical encoding: fit on train only, apply to test/unknown
    cat = CatBoostEncoder()
    train_data = cat.fit_transform(train_data, train_target)
    test_data = cat.transform(test_data)
    unknown_data = cat.transform(unknown_data)

    # separate additional income; it is added back verbatim after prediction,
    # so the model only has to learn the salary component
    train_add_income = train_data[add_income_col].values
    test_add_income = test_data[add_income_col].values
    unknown_add_income = unknown_data[add_income_col].values
    train_data = train_data[no_income_columns]
    test_data = test_data[no_income_columns]
    unknown_data = unknown_data[no_income_columns]
    train_target = train_target[
        "Total Yearly Income [EUR]"].values - train_add_income
    test_target = test_target["Total Yearly Income [EUR]"].values

    print("scaling values...")
    # scaling values
    scaler = StandardScaler()
    train_data = scaler.fit_transform(train_data)
    test_data = scaler.transform(test_data)
    unknown_data = scaler.transform(unknown_data)

    print("fitting model...")
    # fit model
    reg = LGBMRegressor()
    reg.fit(train_data, train_target)

    print("predicting test data...")
    # BUG FIX: predict() was called with num_iterations=15000, a kwarg the
    # LightGBM sklearn API does not define (the real name is num_iteration),
    # and the model is trained with the default number of rounds anyway.
    test_result = reg.predict(test_data)
    # add additional income
    test_result = test_result + test_add_income

    print("analysing test results...")
    # validate
    error = mean_absolute_error(test_target, test_result)
    score = explained_variance_score(test_target, test_result)
    print("Mean absolute error of test data: ", error)
    print("Score: ", score)

    print("predicting unknown data...")
    # predict and format
    values = reg.predict(unknown_data)
    values = values + unknown_add_income
    results = pandas.DataFrame({
        "Instance": clean_unlabelled["Instance"].values,
        "Total Yearly Income [EUR]": values
    })
    print("Finished.")
    return results
def _ml_data_prep(self):
    """Prepares datasets for ML

    This does one hot encoding, cat boost encoding, and train test split
    (if necessary). Results are stored on the instance as
    X_train_prior/y_train_prior, X_test_prior, y_test, X_train_post/
    y_train_post and X_test_post.
    """
    df_post = copy.deepcopy(self.df_post)
    train_prior = copy.deepcopy(self.df_prior)

    # create test data if not provided
    if self.test_data is None:
        # BUG FIX: the old call passed the message fragments as extra
        # positional args, which logging treats as %-format arguments —
        # with no placeholders they were silently dropped.
        logger.info(
            "No test data was provided. Test data will be created with "
            "a %s-%s shuffle split from the post data set.",
            self.train_size * 100, (1 - self.train_size) * 100)
        # Seed the split with the instance random_state so the prep is
        # reproducible, consistent with the encoders below.
        df_post = shuffle(df_post, random_state=self.random_state)
        n_split = int(len(df_post) * self.train_size)
        train_post = df_post.iloc[:n_split]
        test = df_post.iloc[n_split:]
    else:
        test = copy.deepcopy(self.test_data)
        train_post = df_post

    # determine columns for OHE & CatBoost (never encode the target itself)
    OHE_columns = [col for col in self.OHE_columns
                   if col != self.target_column]
    high_cardinality_columns = [col for col in self.high_cardinality_columns
                                if col != self.target_column]
    if len(OHE_columns) > 0:
        logger.info("One hot encoded columns: %s", OHE_columns)
    if len(high_cardinality_columns) > 0:
        logger.info("Cat boost encoded columns: %s", high_cardinality_columns)

    # concat and then OHE to ensure columns match across the three frames
    train_prior['source'] = "Train Prior"
    test['source'] = "Test"
    train_post['source'] = "Train Post"
    df = pd.concat([train_prior, test, train_post])
    df = pd.get_dummies(data=df, columns=OHE_columns)
    train_prior = df[df.source == 'Train Prior'].drop('source', axis=1)
    test = df[df.source == 'Test'].drop('source', axis=1)
    train_post = df[df.source == 'Train Post'].drop('source', axis=1)

    # CatBoostEncoder for high cardinality columns: fit on each training
    # frame, then transform its copy of the test frame.
    test_prior = copy.deepcopy(test)
    test_post = copy.deepcopy(test)
    tf_prior = CatBoostEncoder(cols=high_cardinality_columns,
                               random_state=self.random_state)
    tf_post = CatBoostEncoder(cols=high_cardinality_columns,
                              random_state=self.random_state)
    train_prior[high_cardinality_columns] = (
        tf_prior.fit_transform(train_prior[high_cardinality_columns],
                               train_prior[self.target_column])
    )
    # BUG FIX: the old code passed the test-set target to transform();
    # category_encoders uses y when given, so the encoder peeked at the
    # evaluation labels (target leakage). Transform without y.
    test_prior[high_cardinality_columns] = (
        tf_prior.transform(test_prior[high_cardinality_columns])
    )
    train_post[high_cardinality_columns] = (
        tf_post.fit_transform(train_post[high_cardinality_columns],
                              train_post[self.target_column])
    )
    test_post[high_cardinality_columns] = (
        tf_post.transform(test_post[high_cardinality_columns])
    )

    X_train_prior = train_prior.drop(self.target_column, axis=1).astype(float)
    y_train_prior = train_prior[self.target_column].astype(float)
    X_test_prior = test_prior.drop(self.target_column, axis=1).astype(float)
    y_test = test[self.target_column].astype(float)
    X_train_post = train_post.drop(self.target_column, axis=1).astype(float)
    y_train_post = train_post[self.target_column].astype(float)
    X_test_post = test_post.drop(self.target_column, axis=1).astype(float)

    self.X_train_prior = X_train_prior
    self.y_train_prior = y_train_prior
    self.X_test_prior = X_test_prior
    self.y_test = y_test
    self.X_train_post = X_train_post
    self.y_train_post = y_train_post
    self.X_test_post = X_test_post