# Experiment script: random-forest classifier on the census income dataset.
# Variant: categorical variables are kept as single label-encoded columns
# (no dummification) — per the banner below, the numeric codes are ordered
# by the proportion of the ">$50.000" class in each category.
from census.utils import cols_name_to_indices
from sklearn.ensemble import RandomForestClassifier
from census.transformer import MyScaler
from sklearn.pipeline import Pipeline
from census.shortcuts import load_datas

print(""" Loading data from disk and transform it with : - categorical variables are label so that the corresponding numerical values are growing with the proportion of +50.000$ """)

# Use every available row; 199523 is the size of the full training file.
limit_to_rows = 199523  # corresponds to the whole dataset

# Load train/test splits plus column metadata from the project helper.
# NOTE(review): load_datas semantics are not visible here — presumably
# sort_mapping=True orders the label codes by target proportion (matching
# the banner above) and booleans=False skips the derived boolean columns;
# confirm against census.shortcuts.
X, y, X_test, y_test, columns, cols_interpret = load_datas(
    nrows=limit_to_rows,
    sort_mapping=True,
    booleans=False
    # dummify=True
)

# SET MODEL TYPE IN A PIPELINE
# Continuous columns to be rescaled by MyScaler before fitting the forest.
scale_cols = ['AGE', 'INSTANCE_WEIGHT', 'WWORKY', 'WPERH', 'CAP_GAINS', 'CAP_LOSSES', 'STOCK_DIV']
# Translate column names into positional indices for the scaler.
idx_scale_cols = cols_name_to_indices(columns, scale_cols)
ms = MyScaler(idx_scale_cols)
# n_jobs=-1: train the trees on all available CPU cores.
rf = RandomForestClassifier(n_jobs=-1)
# Scaling step first, then the classifier.
predictor = Pipeline([('my_scaling', ms), ('rf', rf)])

# NOTE(review): the string below is truncated in this chunk — its closing
# quotes lie past the end of the visible source.
print(""" ---------------------------------
# Experiment script: random-forest classifier on the census income dataset.
# Variant: categorical variables are dummified (one-hot), a few columns are
# dropped, and dual-meaning numeric columns get an extra boolean indicator
# (e.g. WWORKY: "did the instance work at all?" plus "how many weeks?").
# NOTE(review): cols_name_to_indices, RandomForestClassifier, MyScaler and
# Pipeline are used below without visible imports — they must be imported
# earlier in the full file; confirm when viewing the whole source.
from census.shortcuts import load_datas

print(""" Loading data from disk and transform it with : - dummification of categorical variables - removing of some columns (see below) - add boolean variables when numerical inputs code in fact two variables eg: WWORKY weeks worked per year : have instance worked in term of weeks ? if yes, how much ? """)

# Use every available row; 199523 is the size of the full training file.
limit_to_rows = 199523  # corresponds to the whole dataset

# Load train/test splits plus column metadata from the project helper.
# remove_cols drops the year and residence/migration columns before
# modelling; booleans=True adds the indicator columns announced above.
X, y, X_test, y_test, columns, cols_interpret = load_datas(
    nrows=limit_to_rows,
    remove_cols=['YEAR', 'RESI_REGION', 'RESI_PREV', 'RESI_1YEAR'],
    sort_mapping=False,
    booleans=True,
    dummify=True
)

# SET MODEL TYPE IN A PIPELINE
# Continuous columns to be rescaled by MyScaler before fitting the forest.
scale_cols = ['AGE', 'INSTANCE_WEIGHT', 'WWORKY', 'WPERH', 'CAP_GAINS', 'CAP_LOSSES', 'STOCK_DIV']
# Translate column names into positional indices for the scaler.
idx_scale_cols = cols_name_to_indices(columns, scale_cols)
ms = MyScaler(idx_scale_cols)
# n_jobs=-1: train the trees on all available CPU cores.
rf = RandomForestClassifier(n_jobs=-1)
# Scaling step first, then the classifier.
predictor = Pipeline([('my_scaling', ms), ('rf', rf)])

# NOTE(review): the string below is truncated in this chunk — its closing
# quotes lie past the end of the visible source.
print("""
# Experiment script: L1-regularised logistic regression on the census income
# dataset, with dummified categoricals and a few columns removed.
# NOTE(review): FindBest / most_influential / report_influential /
# score_report are imported but not used in this visible chunk — they are
# presumably used further down the file, so the imports are kept.
from census.learn import FindBest, most_influential, report_influential, score_report
from census.utils import cols_name_to_indices
from sklearn.linear_model import LogisticRegression
from census.transformer import MyScaler
from sklearn.pipeline import Pipeline
from census.shortcuts import load_datas

# LOAD DATA
# Use every available row; 199523 is the size of the full training file.
limit_to_rows = 199523  # corresponds to the whole dataset
X, y, X_test, y_test, columns, cols_interpret = load_datas(
    nrows=limit_to_rows,
    remove_cols=[
        "YEAR",
        "RESI_REGION",
        "RESI_PREV",
        "RESI_1YEAR",
        # 'MIG_REGION', 'MIG_MOVE', 'MIG_MSA', 'INSTANCE_WEIGHT',
        # 'ORIG_MOTHER'
    ],
    dummify=True,
)

# SET MODEL TYPE IN A PIPELINE
# Continuous columns to be rescaled by MyScaler before the linear model —
# unscaled inputs would distort the L1 penalty across features.
scale_cols = ["AGE", "INSTANCE_WEIGHT", "WWORKY", "WPERH", "CAP_GAINS", "CAP_LOSSES", "STOCK_DIV"]
# Translate column names into positional indices for the scaler.
idx_scale_cols = cols_name_to_indices(columns, scale_cols)
ms = MyScaler(idx_scale_cols)
# FIX: penalty="l1" needs a solver that supports L1 regularisation.  The
# historical default solver (liblinear) did, but scikit-learn >= 0.22
# defaults to lbfgs, which raises ValueError for penalty="l1".  Pinning
# solver="liblinear" keeps the original behaviour on every version.
lr = LogisticRegression(penalty="l1", solver="liblinear")
# Scaling step first, then the classifier.
predictor = Pipeline([("my_scaling", ms), ("lr", lr)])