# Example 1
# Train a random forest on the census data, with categorical variables
# label-encoded so the numeric codes grow with the proportion of +50.000$.
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

from census.shortcuts import load_datas
from census.transformer import MyScaler
from census.utils import cols_name_to_indices

print("""
Loading data from disk and transform it with :
  - categorical variables are label so that the corresponding
  numerical values are growing with the proportion of +50.000$
""")

# 199523 rows is the full training set; lower it for quick experiments.
limit_to_rows = 199523  # corresponds to the whole dataset
X, y, X_test, y_test, columns, cols_interpret = load_datas(
    booleans=False,
    sort_mapping=True,
    nrows=limit_to_rows,
)

# SET MODEL TYPE IN A PIPELINE

# Continuous columns to standardize before fitting.
scale_cols = [
    'AGE',
    'INSTANCE_WEIGHT',
    'WWORKY',
    'WPERH',
    'CAP_GAINS',
    'CAP_LOSSES',
    'STOCK_DIV',
]
idx_scale_cols = cols_name_to_indices(columns, scale_cols)
ms = MyScaler(idx_scale_cols)
rf = RandomForestClassifier(n_jobs=-1)  # -1: use every available core
predictor = Pipeline([('my_scaling', ms), ('rf', rf)])


# ---------------------------------
# Example 2
# Train a random forest on the census data with dummified categoricals
# and extra boolean indicator columns.
#
# FIX: the original snippet imported only `load_datas` but also used
# cols_name_to_indices, MyScaler, RandomForestClassifier and Pipeline,
# so it failed with NameError when run standalone.
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline

from census.shortcuts import load_datas
from census.transformer import MyScaler
from census.utils import cols_name_to_indices

print("""
Loading data from disk and transform it with :
    - dummification of categorical variables
    - removing of some columns (see below)
    - add boolean variables when numerical inputs code in fact two variables
    eg: WWORKY weeks worked per year : have instance worked in term of weeks ?
                                       if yes, how much ?
""")

limit_to_rows = 199523  # corresponds to the whole dataset
X, y, X_test, y_test, columns, cols_interpret = load_datas(
    nrows=limit_to_rows,
    # NOTE(review): presumably dropped as low-signal columns — confirm
    # against the data-exploration step.
    remove_cols=['YEAR', 'RESI_REGION', 'RESI_PREV', 'RESI_1YEAR'],
    sort_mapping=False,
    booleans=True,
    dummify=True,
    )

# SET MODEL TYPE IN A PIPELINE

# Continuous columns to standardize before fitting.
scale_cols = ['AGE', 'INSTANCE_WEIGHT', 'WWORKY', 'WPERH',
              'CAP_GAINS', 'CAP_LOSSES', 'STOCK_DIV']
idx_scale_cols = cols_name_to_indices(columns, scale_cols)
ms = MyScaler(idx_scale_cols)
rf = RandomForestClassifier(n_jobs=-1)  # -1: use every available core
predictor = Pipeline([('my_scaling', ms), ('rf', rf)])


# ---------------------------------
# Example 3
# Train an L1-penalized logistic regression on the dummified census data.
from census.learn import FindBest, most_influential, report_influential, score_report
from census.utils import cols_name_to_indices
from sklearn.linear_model import LogisticRegression
from census.transformer import MyScaler
from sklearn.pipeline import Pipeline
from census.shortcuts import load_datas

# LOAD DATA

limit_to_rows = 199523  # corresponds to the whole dataset
X, y, X_test, y_test, columns, cols_interpret = load_datas(
    nrows=limit_to_rows,
    # NOTE(review): presumably dropped as low-signal columns — confirm
    # against the data-exploration step.
    remove_cols=[
        "YEAR",
        "RESI_REGION",
        "RESI_PREV",
        "RESI_1YEAR",
        # 'MIG_REGION', 'MIG_MOVE', 'MIG_MSA', 'INSTANCE_WEIGHT',
        # 'ORIG_MOTHER'
    ],
    dummify=True,
)


# SET MODEL TYPE IN A PIPELINE

# Continuous columns standardized by MyScaler before the linear model.
scale_cols = ["AGE", "INSTANCE_WEIGHT", "WWORKY", "WPERH", "CAP_GAINS", "CAP_LOSSES", "STOCK_DIV"]
idx_scale_cols = cols_name_to_indices(columns, scale_cols)
ms = MyScaler(idx_scale_cols)
# FIX: since scikit-learn 0.22 the default solver is "lbfgs", which does not
# support the L1 penalty and raises ValueError at fit time. "liblinear"
# supports L1 and matches the pre-0.22 default behavior.
lr = LogisticRegression(penalty="l1", solver="liblinear")
predictor = Pipeline([("my_scaling", ms), ("lr", lr)])