Example #1
def predict(data):

    # extract first letter from cabin
    data['cabin'] = pf.extract_cabin_letter(data, 'cabin')

    # impute NA categorical
    for var in config.CATEGORICAL_VARS:
        data[var] = pf.impute_na(data, var, replacement='Missing')

    # impute NA numerical
    for var in config.NUMERICAL_TO_IMPUTE:

        # add missing indicator (0, 1)
        data[var + 'NA'] = pf.add_missing_indicator(data, var)
        # impute NA
        data[var] = pf.impute_na(data,
                                 var,
                                 replacement=config.IMPUTATION_DICT[var])

    # Group rare labels
    for var in config.CATEGORICAL_VARS:
        data[var] = pf.remove_rare_labels(data, var,
                                          config.FREQUENT_LABELS[var])

    # encode variables
    for var in config.CATEGORICAL_VARS:
        data = pf.encode_categorical(data, var)

    # check all dummies were added
    data = pf.check_dummy_variables(data, config.DUMMY_VARIABLES)

    # scale variables
    data = pf.scale_features(data, config.OUTPUT_SCALER_PATH)

    # make predictions
    predictions = pf.predict(data, config.OUTPUT_MODEL_PATH)

    return predictions
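
The `pf` module itself is not shown in these listings. Judging only from how Example #1 calls it, a minimal sketch of `extract_cabin_letter` and `impute_na` could look like the following; the signatures and behaviour are assumptions inferred from the call sites, not the project's actual source.

def extract_cabin_letter(df, var):
    # keep only the first character of the cabin code, e.g. 'C123' -> 'C'
    # (missing cabins stay NaN and are imputed afterwards)
    return df[var].str[0]

def impute_na(df, var, replacement='Missing'):
    # fill missing values with a constant: a string for categoricals,
    # a pre-computed number (e.g. the median) for numericals
    return df[var].fillna(replacement)
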
Example #2
def predict(data):
    
    # impute NA
    for var in config.CATEGORICAL_TO_IMPUTE:
        data[var] = pf.impute_na(data, var, replacement='Missing')
    
    data[config.NUMERICAL_TO_IMPUTE] = pf.impute_na(data,
                                                    config.NUMERICAL_TO_IMPUTE,
                                                    replacement=config.LOTFRONTAGE_MODE)
    
    
    # capture elapsed time
    data[config.YEAR_VARIABLE] = pf.elapsed_years(data,
                                                  config.YEAR_VARIABLE,
                                                  ref_var='YrSold')
    
    
    # log transform numerical variables
    for var in config.NUMERICAL_LOG:
        data[var] = pf.log_transform(data, var)
    
    
    # Group rare labels
    for var in config.CATEGORICAL_ENCODE:
        data[var] = pf.remove_rare_labels(data, var, config.FREQUENT_LABELS[var])
    
    # encode variables
    for var in config.CATEGORICAL_ENCODE:
        data[var] = pf.encode_categorical(data, var, config.ENCODING_MAPPINGS[var])
    
    
    # scale variables
    data = pf.scale_features(data[config.FEATURES], config.OUTPUT_SCALER_PATH)
    
    # make predictions
    predictions = pf.predict(data, config.OUTPUT_MODEL_PATH)
    
    return predictions
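
Example #2 appears to be the house-price variant of the same scoring function. A minimal sketch of the two helpers it adds, `elapsed_years` and `log_transform`, again inferred from the call sites rather than taken from the actual module:

import numpy as np

def elapsed_years(df, var, ref_var='YrSold'):
    # years elapsed between the reference year (year sold) and the year variable
    return df[ref_var] - df[var]

def log_transform(df, var):
    # natural log transform; assumes the variable is strictly positive
    return np.log(df[var])
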
Example #3
for var in config.NUMERICAL_TO_IMPUTE:
    X_train[var + '_na'] = pf.add_missing_indicator(X_train, var)
    X_train[var] = pf.impute_na(X_train, var, config.IMPUTATION_DICT[var])
    X_test[var + '_na'] = pf.add_missing_indicator(X_test, var)
    X_test[var] = pf.impute_na(X_test, var, config.IMPUTATION_DICT[var])

# Group rare labels
for var in config.CATEGORICAL_VARS:
    X_train[var] = pf.remove_rare_labels(X_train, var,
                                         config.FREQUENT_LABELS[var])
    X_test[var] = pf.remove_rare_labels(X_test, var,
                                        config.FREQUENT_LABELS[var])

# encode categorical variables
for var in config.CATEGORICAL_VARS:
    X_train = pf.encode_categorical(X_train, var)
    X_test = pf.encode_categorical(X_test, var)

# check all dummies were added
X_train = pf.check_dummy_variables(X_train, config.DUMMY_VARIABLES)
X_test = pf.check_dummy_variables(X_test, config.DUMMY_VARIABLES)

# train scaler and save
scaler = pf.train_scaler(X_train, config.OUTPUT_SCALER_PATH)

# scale train set
X_train = pf.scale_features(X_train, config.OUTPUT_SCALER_PATH)

# train model and save
pf.train_model(X_train, y_train, config.OUTPUT_MODEL_PATH)
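
Example #3 processes the train and test sets side by side with the same helpers. A sketch of `add_missing_indicator` and `remove_rare_labels` consistent with these calls (assumed behaviour, not the original implementation):

import numpy as np

def add_missing_indicator(df, var):
    # binary flag: 1 where the value is missing, 0 otherwise
    return np.where(df[var].isnull(), 1, 0)

def remove_rare_labels(df, var, frequent_labels):
    # collapse labels outside the pre-computed frequent list into a 'Rare' bucket
    return np.where(df[var].isin(frequent_labels), df[var], 'Rare')
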
Example #4
File: train.py  Project: gonzalo71/TestGit
for var in config.NUMERICAL_TO_IMPUTE:
    # add missing indicator
    X_train[var + '_NA'] = pf.add_missing_indicator(X_train, var)

    # replace NaN by median
    X_train[var] = pf.impute_na(X_train,
                                var,
                                replacement=config.IMPUTATION_DICT[var])

# Group rare labels
for var in config.CATEGORICAL_VARS:
    X_train[var] = pf.remove_rare_labels(X_train, var,
                                         config.FREQUENT_LABELS[var])

# encode categorical variables
for var in config.CATEGORICAL_VARS:
    X_train = pf.encode_categorical(X_train, var)

# check all dummies were added
X_train = pf.check_dummy_variables(X_train, config.DUMMY_VARIABLES)

# train scaler and save
scaler = pf.train_scaler(X_train, config.OUTPUT_SCALER_PATH)

# scale train set
X_train = scaler.transform(X_train)

# train model and save
pf.train_model(X_train, y_train, config.OUTPUT_MODEL_PATH)

print('Finished training')
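
Example #4 fits the scaler in the training script, and the prediction scripts reload it by path. A sketch of `train_scaler` and `scale_features` assuming a persisted scikit-learn StandardScaler (the estimator choice and the joblib persistence are assumptions):

import joblib
from sklearn.preprocessing import StandardScaler

def train_scaler(df, output_path):
    # fit the scaler on the training data and persist it for scoring time
    scaler = StandardScaler()
    scaler.fit(df)
    joblib.dump(scaler, output_path)
    return scaler

def scale_features(df, scaler_path):
    # reload the persisted scaler and transform new data with it
    scaler = joblib.load(scaler_path)
    return scaler.transform(df)
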
Example #5
# impute numerical variable
X_train[config.NUMERICAL_TO_IMPUTE] = pf.impute_na(
    X_train, config.NUMERICAL_TO_IMPUTE, replacement=config.LOTFRONTAGE_MODE)

# elapsed time intervals
X_train[config.YEAR_VARIABLE] = pf.elapsed_years(X_train,
                                                 config.YEAR_VARIABLE,
                                                 ref_var='YrSold')

# log transform of numerical variables
for var in config.NUMERICAL_LOG:
    X_train[var] = pf.log_transform(X_train, var)

# group rare labels
for var in config.CATEGORICAL_ENCODE:
    X_train[var] = pf.remove_rare_labels(X_train, var,
                                         config.FREQUENT_LABELS[var])

# encode categorical variables
for var in config.CATEGORICAL_ENCODE:
    X_train[var] = pf.encode_categorical(X_train, var,
                                         config.ENCODING_MAPPINGS[var])

# train and save the scaler
scaler = pf.train_scaler(X_train[config.FEATURES], config.OUTPUT_SCALER_PATH)

# scale variables
X_train = scaler.transform(X_train[config.FEATURES])

# train and save the model
pf.train_model(X_train, np.log(y_train), config.OUTPUT_MODEL_PATH)

print('Finished training')
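
Examples #2 and #5 call a three-argument `encode_categorical` that takes a per-variable mapping from the config. A sketch of that variant, assuming `config.ENCODING_MAPPINGS[var]` is a plain dict from category label to number:

def encode_categorical(df, var, mappings):
    # replace each category with its pre-computed numeric encoding;
    # categories not present in the mapping become NaN and should be caught upstream
    return df[var].map(mappings)
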
Example #6
# Apply cube-root transformation
for var in config.CBRT_TRANSFORM:
    X_train[var] = pf.cbrt_transform(X_train, var)
y_train = pf.cbrt_transform(y_train)

# Train standard scaler on numerical variables only
scaled = X_train[config.NUM_VARS].copy()
scaler = pf.train_scaler(scaled, config.SCALER_PATH)

# Scale the numerical data
scaled.iloc[:,:] = pf.scale_features(scaled, config.SCALER_PATH)

# One-hot encode all the categorical variables
categoricals = []
for var in config.CAT_VARS:
    categoricals.append(pf.encode_categorical(X_train, var))

# Final design matrix for training
X_train = pf.concat_dfs(scaled, categoricals)

# Assert we have the desired features
assert X_train.columns.tolist() == config.FEATURES

# Train the default linear regression model
pf.train_linreg_model(X_train, y_train, config.LINEAR_REG_MODEL_PATH)

# Train the linear regression model on the auxiliary (speed) target
pf.train_linreg_model(X_train, y_train_aux, config.LINEAR_REG_SPEED_MODEL_PATH)

# Train the neural network
pf.train_nn_model(X_train, y_train, config.NET_ARCHITECTURE_AND_PARAMETERS, config.NEURAL_NET_MODEL_PATH)
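
Example #6 is a different project: it cube-root transforms skewed variables (including the target) and one-hot encodes categoricals before concatenating everything back together. A sketch of `cbrt_transform` and `concat_dfs` consistent with how they are called here; note that `cbrt_transform` is invoked both as `(df, var)` and on `y_train` alone, so the sketch makes the column name optional (all of this is inferred, not the project's code):

import numpy as np
import pandas as pd

def cbrt_transform(data, var=None):
    # cube-root transform of one DataFrame column, or of a Series/array directly
    values = data[var] if var is not None else data
    return np.cbrt(values)

def concat_dfs(numeric_df, categorical_parts):
    # column-wise concatenation of the scaled numerical block
    # and the one-hot encoded categorical blocks
    return pd.concat([numeric_df] + list(categorical_parts), axis=1)
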
Example #7
for var in config.NUMERICAL_TO_IMPUTE:
    # add missing indicator
    X_train = pf.add_missing_indicator(X_train, var)
    X_train[var] = pf.impute_na(X_train,
                                var,
                                value=config.IMPUTATION_DICT[var])

# Group rare labels

for col in config.CATEGORICAL_VARS:
    X_train[col] = pf.remove_rare_labels(
        X_train, col, freq_labels=config.FREQUENT_LABELS[col])

# encode categorical variables

oh = pf.train_encoder(X_train, config.CATEGORICAL_VARS,
                      config.OUTPUT_ENCODER_PATH)
X_train = pf.encode_categorical(X_train, config.CATEGORICAL_VARS,
                                config.OUTPUT_ENCODER_PATH)
print(X_train.shape)
print(X_train.head())
# check all dummies were added

X_train = pf.check_dummy_variables(X_train, config.DUMMY_VARIABLES)

# train scaler and save

scaler = pf.train_scaler(X_train, config.OUTPUT_SCALER_PATH)

# scale train set

X_train = scaler.transform(X_train)
# train model and save
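
Example #7 (which cuts off before the training call) persists a fitted encoder and reuses it through a three-argument `encode_categorical`. A sketch assuming a scikit-learn OneHotEncoder saved with joblib (the encoder type, the `sparse_output` flag, which needs scikit-learn >= 1.2, and the persistence format are all assumptions):

import joblib
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

def train_encoder(df, cat_vars, output_path):
    # fit a one-hot encoder on the categorical columns and save it to disk
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
    encoder.fit(df[cat_vars])
    joblib.dump(encoder, output_path)
    return encoder

def encode_categorical(df, cat_vars, encoder_path):
    # replace the categorical columns with the encoder's dummy columns
    encoder = joblib.load(encoder_path)
    dummies = pd.DataFrame(encoder.transform(df[cat_vars]),
                           columns=encoder.get_feature_names_out(cat_vars),
                           index=df.index)
    return pd.concat([df.drop(columns=cat_vars), dummies], axis=1)
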
Example #8
xtrain['cabin'] = pf.extract_cabin_letter(xtrain, 'cabin')

# impute categorical variables
xtrain[config.CATEGORICAL_VARS] = pf.impute_na(xtrain[config.CATEGORICAL_VARS],
                                               'Missing')

# impute numerical variable
xtrain[config.NUMERICAL_TO_IMPUTE] = pf.impute_na(
    xtrain[config.NUMERICAL_TO_IMPUTE], 'Numerical')

# Group rare labels
for var in config.CATEGORICAL_VARS:
    xtrain[var] = pf.remove_rare_labels(xtrain, var,
                                        config.FREQUENT_LABELS[var])

# encode categorical variables
xtrain = pf.encode_categorical(xtrain, config.CATEGORICAL_VARS)

# check all dummies were added
xtrain = pf.check_dummy_variables(xtrain, config.DUMMY_VARIABLES)

# train scaler and save
scaler = pf.train_scaler(xtrain, config.OUTPUT_SCALER_PATH)

# scale train set
xtrain = pf.scale_features(xtrain, config.OUTPUT_SCALER_PATH)

# train model and save
pf.train_model(xtrain, ytrain, config.OUTPUT_MODEL_PATH)

print('Finished training')
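
Several of the Titanic examples finish preprocessing with `check_dummy_variables`. Its purpose is to guarantee the data has exactly the dummy columns the scaler and model were fitted on; a minimal sketch (assumed behaviour: missing dummies are added as zero columns):

def check_dummy_variables(df, dummy_list):
    # add any expected dummy column that the encoding step did not create for this data
    for col in dummy_list:
        if col not in df.columns:
            df[col] = 0
    return df
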
Example #9
# get first letter from cabin variable
X_train = pf.extract_cabin_letter(X_train, 'cabin')

# impute categorical variables
X_train = pf.add_missing_indicator(X_train, config.CATEGORICAL_VARS)
X_train = pf.impute_na(X_train, config.CATEGORICAL_VARS)

# impute numerical variable
X_train = pf.add_missing_indicator(X_train, config.NUMERICAL_TO_IMPUTE)
X_train = pf.impute_num(X_train, config.NUMERICAL_TO_IMPUTE)

# Group rare labels
X_train = pf.remove_rare_labels(X_train, config.CATEGORICAL_VARS)

# encode categorical variables
X_train, X_train_features = pf.encode_categorical(X_train,
                                                  config.CATEGORICAL_VARS)

# check dummy variables
X_train = pf.check_dummy_variables(X_train, config.DUMMY_VARIABLES)

# train scaler and save
pf.train_scaler(X_train, config.OUTPUT_SCALER_PATH)

# scale train set
X_train = pf.scale_features(X_train, config.OUTPUT_SCALER_PATH)

# train model and save
pf.train_model(X_train, y_train, config.OUTPUT_MODEL_PATH)

print('Finished training')
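
The training scripts all end with `pf.train_model(...)` and the scoring scripts with `pf.predict(...)`. A sketch of the pair assuming a scikit-learn classifier persisted with joblib (LogisticRegression is an assumption; any estimator with fit/predict would do):

import joblib
from sklearn.linear_model import LogisticRegression

def train_model(df, target, output_path):
    # fit the model on the fully preprocessed training set and persist it
    model = LogisticRegression(random_state=0)
    model.fit(df, target)
    joblib.dump(model, output_path)
    return model

def predict(df, model_path):
    # reload the persisted model and score new, preprocessed data
    model = joblib.load(model_path)
    return model.predict(df)
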
Example #10
# impute categorical variables
for var in config.CATEGORICAL_VARS:
    X_train[var] = pf.impute_na(X_train, var, replacement='Missing')
    X_test[var] = pf.impute_na(X_test, var, replacement='Missing')

# impute numerical variable
for var in config.NUMERICAL_TO_IMPUTE:
    # add missing indicator, then impute NA (replacement value assumed from config.IMPUTATION_DICT)
    X_train[var + '_NA'] = pf.add_missing_indicator(df=X_train, var=var)
    X_test[var + '_NA'] = pf.add_missing_indicator(df=X_test, var=var)
    X_train[var] = pf.impute_na(X_train, var, replacement=config.IMPUTATION_DICT[var])
    X_test[var] = pf.impute_na(X_test, var, replacement=config.IMPUTATION_DICT[var])

# Group rare labels
for var in config.CATEGORICAL_VARS:
    X_train[var] = pf.remove_rare_labels(X_train, var,
                                         config.FREQUENT_LABELS[var])
    X_test[var] = pf.remove_rare_labels(X_test, var,
                                        config.FREQUENT_LABELS[var])

# encode categorical variables
X_train = pf.encode_categorical(df=X_train, var=config.CATEGORICAL_VARS)
X_test = pf.encode_categorical(df=X_test, var=config.CATEGORICAL_VARS)

# check all dummies were added
X_train = pf.check_dummy_variables(X_train, config.DUMMY_VARIABLES)
X_test = pf.check_dummy_variables(df=X_test, dummy_list=config.DUMMY_VARIABLES)

# train scaler and save
scaler = pf.train_scaler(X_train[config.FEATURES], config.OUTPUT_SCALER_PATH)

# scale train set
X_train = scaler.transform(X_train[config.FEATURES])
X_test = scaler.transform(X_test[config.FEATURES])

# train model and save
pf.train_model(X_train, y_train, config.OUTPUT_MODEL_PATH)
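
Finally, every example imports a `config` module that is never shown. The attribute names below are collected from the calls in the listings; all values are purely illustrative placeholders, not any project's real settings:

# Illustrative sketch of config.py; attribute names come from the calls above,
# every value is a placeholder.
CATEGORICAL_VARS = ['cabin', 'cat_var_1']                           # placeholder column names
NUMERICAL_TO_IMPUTE = ['num_var_1']                                 # placeholder column names
IMPUTATION_DICT = {'num_var_1': 0.0}                                # placeholder value
FREQUENT_LABELS = {'cabin': ['Missing'], 'cat_var_1': ['A', 'B']}   # placeholder labels
DUMMY_VARIABLES = ['cabin_Missing', 'cat_var_1_A']                  # placeholder dummy names
FEATURES = ['num_var_1', 'num_var_1_NA'] + DUMMY_VARIABLES          # placeholder feature list
OUTPUT_SCALER_PATH = 'scaler.pkl'                                   # placeholder path
OUTPUT_MODEL_PATH = 'model.pkl'                                     # placeholder path
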