Example #1
# impute numerical variable (assignment target assumed; the snippet starts mid-call)
X_train[config.NUMERICAL_TO_IMPUTE] = pf.impute_na(
    X_train, config.NUMERICAL_TO_IMPUTE, replacement=config.LOTFRONTAGE_MODE)

# elapsed time intervals
X_train[config.YEAR_VARIABLE] = pf.elapsed_years(X_train,
                                                 config.YEAR_VARIABLE,
                                                 ref_var='YrSold')

# log transformation
for var in config.NUMERICAL_LOG:
    X_train[var] = pf.log_transform(X_train, var)

# group rare categories
for var in config.CATEGORICAL_ENCODE:
    X_train[var] = pf.remove_rare_labels(X_train, var,
                                         config.FREQUENT_LABELS[var])

# encode categorical variables
for var in config.CATEGORICAL_ENCODE:
    X_train[var] = pf.encode_categorical(X_train, var,
                                         config.ENCODING_MAPPINGS[var])

# train scaler and save
scaler = pf.train_scaler(X_train[config.FEATURES], config.OUTPUT_SCALER_PATH)

# scale variables
X_train = scaler.transform(X_train[config.FEATURES])

# train model and save
pf.train_model(X_train, np.log(y_train), config.OUTPUT_MODEL_PATH)

print('Finished training')
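The pf module itself is not shown on this page. A minimal sketch of helpers with the call signatures used in this example might look like the following; the names match the calls above, but the bodies are assumptions, not the repository's actual code.

import numpy as np

def impute_na(df, var, replacement='Missing'):
    # return the column with missing values replaced by `replacement`
    return df[var].fillna(replacement)

def elapsed_years(df, var, ref_var='YrSold'):
    # years elapsed between the reference year and the year captured in `var`
    return df[ref_var] - df[var]

def log_transform(df, var):
    # natural log of a strictly positive numerical variable
    return np.log(df[var])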
Example #2
# add missing indicators and impute numerical variables
# (loop header assumed; the snippet starts mid-loop)
for var in config.IMPUTATION_DICT:
    X_train[var + '_na'] = pf.add_missing_indicator(X_train, var)
    X_train[var] = pf.impute_na(X_train, var, config.IMPUTATION_DICT[var])
    X_test[var + '_na'] = pf.add_missing_indicator(X_test, var)
    X_test[var] = pf.impute_na(X_test, var, config.IMPUTATION_DICT[var])

# Group rare labels
for var in config.CATEGORICAL_VARS:
    X_train[var] = pf.remove_rare_labels(X_train, var,
                                         config.FREQUENT_LABELS[var])
    X_test[var] = pf.remove_rare_labels(X_test, var,
                                        config.FREQUENT_LABELS[var])

# encode categorical variables
for var in config.CATEGORICAL_VARS:
    X_train = pf.encode_categorical(X_train, var)
    X_test = pf.encode_categorical(X_test, var)

# check all dummies were added
X_train = pf.check_dummy_variables(X_train, config.DUMMY_VARIABLES)
X_test = pf.check_dummy_variables(X_test, config.DUMMY_VARIABLES)

# train scaler and save
scaler = pf.train_scaler(X_train, config.OUTPUT_SCALER_PATH)

# scale train set
X_train = pf.scale_features(X_train, config.OUTPUT_SCALER_PATH)

# train model and save
pf.train_model(X_train, y_train, config.OUTPUT_MODEL_PATH)

print('Finished training')
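For reference, the train/test preprocessing above relies on a few small helpers. A possible sketch, assuming the return values implied by the calls (a binary indicator, a relabelled column, and a DataFrame padded with any missing dummy columns):

import numpy as np

def add_missing_indicator(df, var):
    # binary flag: 1 where the value is missing, 0 otherwise
    return np.where(df[var].isnull(), 1, 0)

def remove_rare_labels(df, var, frequent_labels):
    # group categories not in the frequent list under a single 'Rare' label
    return np.where(df[var].isin(frequent_labels), df[var], 'Rare')

def check_dummy_variables(df, dummy_list):
    # add, as all-zero columns, any expected dummy that encoding did not create
    for col in dummy_list:
        if col not in df.columns:
            df[col] = 0
    return df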
Example #3
# impute numerical variables
medians = config_file[1]['Parameters'].get('imputation_dict')
for var in num_vars:
    X_train = pf.add_missing_indicator(X_train, var)
    X_train[var] = pf.impute_na(X_train, var, medians.get(var))

# Group rare labels
frequent_list = config_file[1]['Parameters'].get('frequent_labels')
for var in cat_vars:
    X_train[var] = pf.remove_rare_labels(X_train, var, frequent_list)

# encode categorical variables
dummies = config_file[1]['Parameters'].get('dummy_variables')
for var in cat_vars:
    X_train = pf.encode_categorical(X_train, var)

# check all dummies were added
X_train = pf.check_dummy_variables(X_train, dummies)

# train scaler and save
output_path = config_file[0]['Paths'].get('output_scaler_path')
output_model_path = config_file[0]['Paths'].get('output_model_path')
scaler = pf.train_scaler(X_train, output_path)

# scale train set
X_train = scaler.transform(X_train)
y_train = y_train.astype(int)
# train model and save
pf.train_model(X_train, y_train, output_model_path)
print('Finished training')
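In this example config_file is indexed as a list whose first element holds a 'Paths' section and whose second holds 'Parameters'. One way such a structure could be produced is from a YAML file with a top-level list; the file name and keys below are hypothetical:

import yaml

# hypothetical config.yml, so that config_file[0] holds 'Paths'
# and config_file[1] holds 'Parameters':
#
# - Paths:
#     output_scaler_path: scaler.pkl
#     output_model_path: model.pkl
# - Parameters:
#     imputation_dict: {...}
#     frequent_labels: [...]
#     dummy_variables: [...]

with open('config.yml') as stream:
    config_file = yaml.safe_load(stream)  # -> list of dicts, as indexed above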
Example #4
xtrain['cabin'] = pf.extract_cabin_letter(xtrain, 'cabin')

# impute categorical variables
xtrain[config.CATEGORICAL_VARS] = pf.impute_na(xtrain[config.CATEGORICAL_VARS],
                                               'Missing')

# impute numerical variable
xtrain[config.NUMERICAL_TO_IMPUTE] = pf.impute_na(
    xtrain[config.NUMERICAL_TO_IMPUTE], 'Numerical')

# Group rare labels
for var in config.CATEGORICAL_VARS:
    xtrain[var] = pf.remove_rare_labels(xtrain, var,
                                        config.FREQUENT_LABELS[var])

# encode categorical variables
xtrain = pf.encode_categorical(xtrain, config.CATEGORICAL_VARS)

# check all dummies were added
xtrain = pf.check_dummy_variables(xtrain, config.DUMMY_VARIABLES)

# train scaler and save
scaler = pf.train_scaler(xtrain, config.OUTPUT_SCALER_PATH)

# scale train set
xtrain = pf.scale_features(xtrain, config.OUTPUT_SCALER_PATH)

# train model and save
pf.train_model(xtrain, ytrain, config.OUTPUT_MODEL_PATH)

print('Finished training')
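A rough sketch of the two Titanic-specific helpers used above, extract_cabin_letter and the list-based encode_categorical; both bodies are assumptions based only on how they are called:

import pandas as pd

def extract_cabin_letter(df, var):
    # keep only the first letter of the cabin code (e.g. 'C123' -> 'C')
    return df[var].str[0]

def encode_categorical(df, var_list):
    # one-hot encode the listed variables, dropping the first level of each
    return pd.get_dummies(df, columns=var_list, drop_first=True)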
Example #5
# log transform numerical variables
for var in config.NUMERICAL_LOG:
    X_train[var] = pf.log_transform(X_train, var)


# Group rare labels
for var in config.CATEGORICAL_ENCODE:
    X_train[var] = pf.remove_rare_labels(X_train, var, config.FREQUENT_LABELS[var])


# encode categorical variables
for var in config.CATEGORICAL_ENCODE:
    X_train[var] = pf.encode_categorical(X_train, var,
                                         config.ENCODING_MAPPINGS[var])


# train scaler and save
scaler = pf.train_scaler(X_train[config.FEATURES],
                         config.OUTPUT_SCALER_PATH)

# scale train set
X_train = scaler.transform(X_train[config.FEATURES])

# train model and save
pf.train_model(X_train,
               np.log(y_train),  # the target is trained on the log scale,
                                 # consistent with the log-transformed features
               config.OUTPUT_MODEL_PATH)

print('Finished training')
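Because the model above is fit on np.log(y_train), its predictions come back on the log scale. A hypothetical scoring helper (the function name and joblib persistence are assumptions) that inverts the transform:

import numpy as np
import joblib

def predict_price(model_path, X_scaled):
    # the model was fit on np.log(y_train), so its output is on the log scale;
    # np.exp brings the prediction back to the original price units
    model = joblib.load(model_path)
    return np.exp(model.predict(X_scaled))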
Example #6
# add missing indicator (added so train.py matches the notebook)
for var in ['age', 'fare']:
    X_train = pf.add_missing_indicator(X_train, var)


# Group rare labels
for var in config.CATEGORICAL_VARS:
    X_train = pf.remove_rare_labels(X_train, config.FREQUENT_LABELS, var)


# encode categorical variables
for var in config.CATEGORICAL_VARS:
    X_train = pf.encode_categorical(X_train, var)


# check all dummies were added
X_train = pf.check_dummy_variables(X_train, config.DUMMY_VARIABLES)

# train scaler and save
pf.train_scaler(X_train, config.ORDERED_COLUMNS, config.OUTPUT_SCALER_PATH)


# scale train set
X_train = pf.scale_features(X_train, config.ORDERED_COLUMNS, config.OUTPUT_SCALER_PATH)


# train model and save
pf.train_model(X_train, config.ORDERED_COLUMNS, y_train, config.OUTPUT_MODEL_PATH)


print('Finished training')
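This example passes config.ORDERED_COLUMNS to the scaler helpers, which suggests the columns are selected in a fixed order before fitting and transforming. A sketch under that assumption; MinMaxScaler and joblib persistence are guesses, not confirmed by the snippet:

import joblib
from sklearn.preprocessing import MinMaxScaler

def train_scaler(df, ordered_columns, output_path):
    # fit a scaler on the training columns in a fixed order and persist it
    scaler = MinMaxScaler()
    scaler.fit(df[ordered_columns])
    joblib.dump(scaler, output_path)
    return scaler

def scale_features(df, ordered_columns, scaler_path):
    # reload the persisted scaler and apply it, keeping the same column order
    scaler = joblib.load(scaler_path)
    return scaler.transform(df[ordered_columns])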
Example #7
# Group rare labels
X_train = pf.remove_rare_labels(X_train, config.FREQUENT_LABELS)
X_test = pf.remove_rare_labels(X_test, config.FREQUENT_LABELS)

# encode categorical variables
for var in config.CATEGORICAL_VARS:
    X_train = pf.encode_categorical(X_train, var)
    X_test = pf.encode_categorical(X_test, var)
X_train.drop(labels=config.CATEGORICAL_VARS, axis=1, inplace=True)
X_test.drop(labels=config.CATEGORICAL_VARS, axis=1, inplace=True)

# check all dummies were added
X_train = pf.check_dummy_variables(X_train, config.DUMMY_VARIABLES)
X_test = pf.check_dummy_variables(X_test, config.DUMMY_VARIABLES)

# train scaler and save
pf.train_scaler(X_train, config.OUTPUT_SCALER_PATH)

# scale train set
X_train = pf.scale_features(X_train, config.OUTPUT_SCALER_PATH)
X_test = pf.scale_features(X_test, config.OUTPUT_SCALER_PATH)

# train model and save
pf.train_model(X_train,
               y_train,
               config.OUTPUT_MODEL_PATH,
               seed=config.GLOBAL_SEED,
               C=config.NORM_CONSTANT)

print('Finished training')
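The seed= and C= keywords passed to pf.train_model suggest a regularised linear classifier. A sketch assuming scikit-learn's LogisticRegression and joblib persistence; the repository's actual estimator may differ:

import joblib
from sklearn.linear_model import LogisticRegression

def train_model(df, target, output_path, seed=0, C=1.0):
    # a regularised logistic regression is assumed here because of the C= and
    # seed= keywords; fit on the prepared features and persist the model
    model = LogisticRegression(C=C, random_state=seed)
    model.fit(df, target)
    joblib.dump(model, output_path)
    return model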