def predict(data):
    """Run the saved preprocessing pipeline over *data* and return predictions.

    NOTE(review): the return values of the first three preprocessing helpers
    are discarded; whether ``pf`` mutates ``data`` in place is not visible
    from this file — confirm against the preprocessing-functions module.
    """
    # extract first letter from cabin
    pf.extract_cabin_letter(data, 'cabin')

    # add missing indicators for the numerical variables
    # (the original comment said "categorical"; the loop covers age/fare)
    for var in ['age', 'fare']:
        pf.add_missing_indicator(data, var)

    # impute NA in the categorical variables
    # (the original comment said "numerical"; the loop iterates CATEGORICAL_VARS)
    for var in config.CATEGORICAL_VARS:
        pf.impute_na(data, var)

    # Group rare labels
    # NOTE(review): sibling examples pass config.FREQUENT_LABELS[var];
    # passing the whole mapping here may be wrong — verify pf's signature.
    for var in config.CATEGORICAL_VARS:
        pf.remove_rare_labels(data, var, config.FREQUENT_LABELS)

    # encode variables
    # BUG FIX: the original referenced the undefined name ``df`` here,
    # which raises NameError; ``data`` is the frame being processed.
    data = pf.encode_categorical(data, config.CATEGORICAL_VARS)

    # scale variables
    data = pf.scale_features(data, config.OUTPUT_SCALER_PATH)

    # make predictions
    predictions, _ = pf.predict(data, config.OUTPUT_MODEL_PATH)

    return predictions
示例#2
0
文件: score.py 项目: NGYB/Courses
def predict(data):
    """Apply the persisted preprocessing steps to ``data`` and score it."""
    # cabin: keep only the leading letter
    data['cabin'] = pf.extract_cabin_letter(data, 'cabin')

    # categorical variables: fill missing values with the 'Missing' label
    for col in config.CATEGORICAL_VARS:
        data[col] = pf.impute_na(data, col, replacement='Missing')

    # numerical variables: flag missingness, then impute the stored value
    for col in config.NUMERICAL_TO_IMPUTE:
        data[col + '_NA'] = pf.add_missing_indicator(data, col)
        data[col] = pf.impute_na(data, col,
                                 replacement=config.IMPUTATION_DICT[col])

    # collapse infrequent categories into a shared bucket
    for col in config.CATEGORICAL_VARS:
        data[col] = pf.remove_rare_labels(data, col,
                                          config.FREQUENT_LABELS[col])

    # one-hot encode the categorical variables
    for col in config.CATEGORICAL_VARS:
        data = pf.encode_categorical(data, col)

    # make sure every dummy column expected by the model is present
    data = pf.check_dummy_variables(data, config.DUMMY_VARIABLES)

    # scale with the scaler fitted at training time
    data = pf.scale_features(data, config.OUTPUT_SCALER_PATH)

    # score with the persisted model
    return pf.predict(data, config.OUTPUT_MODEL_PATH)
def predict(data):
    """Preprocess ``data`` with the saved artefacts and return predictions."""
    # extract first letter from cabin
    data[config.EXTRACT_VARIABLE] = pf.extract_cabin_letter(
        data, config.EXTRACT_VARIABLE)

    # fill missing categorical values with an explicit 'Missing' label
    for col in config.CATEGORICAL_TO_ENCODE:
        data[col] = pf.impute_na(data, col, replacement='Missing')

    # numerical imputation: pick the training-time median per variable
    for col in config.NUMERICAL_TO_IMPUTE:
        median = config.AGE_MEDIAN if col == 'age' else config.FARE_MEDIAN
        data[col] = pf.add_missing_indicator(data, col, median)

    # bucket rare categories under the configured rare value
    for col in config.CATEGORICAL_TO_ENCODE:
        data[col] = pf.remove_rare_labels(data, col, config.RARE_VALUE)

    # one-hot encode the categoricals
    for col in config.CATEGORICAL_TO_ENCODE:
        data = pf.encode_categorical(data, col)

    # verify the expected dummy columns exist (return value intentionally
    # unused, exactly as in the original)
    pf.check_dummy_variables(data, config.DUMMY_VARIABLE)

    # scale only the model features, using the persisted scaler
    data = pf.scale_features(data[config.FEATURES], config.OUTPUT_SCALER_PATH)

    # score with the persisted model
    return pf.predict(data, config.OUTPUT_MODEL_PATH)
示例#4
0
def predict(data):
    """Score ``data`` using the persisted preprocessing artefacts and model.

    NOTE(review): two leftover debug ``print(data.shape)`` calls were
    removed — they leak noise into stdout on every scoring call.
    """
    # extract first letter from cabin
    data['cabin'] = pf.extract_cabin_letter(data, 'cabin')

    # impute NA categorical
    # NOTE(review): this variant passes the sub-frame and a replacement
    # string positionally — confirm pf.impute_na supports this signature.
    data[config.CATEGORICAL_VARS] = pf.impute_na(data[config.CATEGORICAL_VARS],
                                                 'Missing')

    # impute NA numerical
    data[config.NUMERICAL_TO_IMPUTE] = pf.impute_na(
        data[config.NUMERICAL_TO_IMPUTE], 'Numerical')

    # Group rare labels
    for var in config.CATEGORICAL_VARS:
        data[var] = pf.remove_rare_labels(data, var,
                                          config.FREQUENT_LABELS[var])

    # encode variables
    data = pf.encode_categorical(data, config.CATEGORICAL_VARS)

    # check all dummies were added
    data = pf.check_dummy_variables(data, config.DUMMY_VARIABLES)

    # scale variables
    data = pf.scale_features(data, config.OUTPUT_SCALER_PATH)

    # make predictions
    predictions = pf.predict(data, config.OUTPUT_MODEL_PATH)

    return predictions
示例#5
0
def predict(df):
    """Transform ``df`` through the feature pipeline and return predictions.

    NOTE(review): removed two leftover debug prints of ``df.shape``.
    """
    # derive time-based features
    df = pf.extract_time(df)

    # log-transform the configured variables
    df = pf.log_transform(df, config.LOG_VARS)

    # cast selected columns to string
    df = pf.to_str(df, config.VAR_TO_STR)

    # NOTE(review): passing ``df`` as both arguments looks suspicious —
    # confirm pf.reduce_cardinality's intended second argument.
    df = pf.reduce_cardinality(df, df)

    df = pf.cat_to_str(df)

    # WARNING(review): fitting a fresh OneHotEncoder at prediction time can
    # yield columns inconsistent with training — a persisted encoder should
    # be loaded instead; left unchanged pending the training-side code.
    encoder = ce.OneHotEncoder(use_cat_names=True)
    df = encoder.fit_transform(df)

    # scale variables with the persisted scaler
    df = pf.scale_features(df[config.FEATURES], config.OUTPUT_SCALER_PATH)

    # make predictions
    predictions = pf.predict(df, config.OUTPUT_MODEL_PATH)

    return predictions
def predict(data):
    """Run the saved house-price preprocessing pipeline and score ``data``."""
    # categorical NAs become an explicit 'Missing' category
    for feature in config.CATEGORICAL_TO_IMPUTE:
        data[feature] = pf.impute_na(data, feature, replacement='Missing')

    # numerical NAs are filled with the mode captured at training time
    data[config.NUMERICAL_TO_IMPUTE] = pf.impute_na(
        data, config.NUMERICAL_TO_IMPUTE, replacement=config.LOTFRONTAGE_MODE)

    # turn the year variable into elapsed years relative to the sale year
    data[config.YEAR_VARIABLE] = pf.elapsed_years(data,
                                                  config.YEAR_VARIABLE,
                                                  ref_var='YrSold')

    # log-transform the skewed numerical variables
    for feature in config.NUMERICAL_LOG:
        data[feature] = pf.log_transform(data, feature)

    # collapse infrequent labels into a shared bucket
    for feature in config.CATEGORICAL_ENCODE:
        data[feature] = pf.remove_rare_labels(data, feature,
                                              config.FREQUENT_LABELS[feature])

    # map categories to the numeric codes learned at training time
    for feature in config.CATEGORICAL_ENCODE:
        data[feature] = pf.encode_categorical(data, feature,
                                              config.ENCODING_MAPPINGS[feature])

    # scale the model features with the persisted scaler
    data = pf.scale_features(data[config.FEATURES], config.OUTPUT_SCALER_PATH)

    # score with the persisted model
    return pf.predict(data, config.OUTPUT_MODEL_PATH)
示例#7
0
def predict(data):
    """Preprocess ``data`` as at training time and return model predictions."""
    # cabin: keep only the leading letter
    features = pf.extract_cabin_letter(
        data, config.IMPUTATION_DICT['cabin_variable'])

    # flag missingness in the categorical variables
    features = pf.add_missing_indicator(features, config.CATEGORICAL_VARS)

    # numerical variables: impute the stored value and add NA columns
    for col in config.NUMERICAL_TO_IMPUTE:
        features = pf.impute_na(features, col,
                                replace_by=config.IMPUTATION_DICT[col],
                                add_na_columns=True)

    # bucket rare categories
    features = pf.remove_rare_labels(features, config.FREQUENT_LABELS)

    # one-hot encode, then drop the raw categorical columns
    for col in config.CATEGORICAL_VARS:
        features = pf.encode_categorical(features, col)
    features.drop(labels=config.CATEGORICAL_VARS, axis=1, inplace=True)

    # ensure every dummy column expected by the model is present
    features = pf.check_dummy_variables(features, config.DUMMY_VARIABLES)

    # scale with the persisted scaler
    features = pf.scale_features(features, config.OUTPUT_SCALER_PATH)

    # score with the persisted model
    return pf.predict(features, config.OUTPUT_MODEL_PATH)
示例#8
0
def predict(data):
    """Score ``data`` through the saved titanic preprocessing pipeline.

    BUG FIX(review): the missing-value indicators for 'age'/'fare' were
    originally added *after* pf.impute_na had filled the NAs, so the
    indicator columns could never flag a missing value; the indicator step
    now runs before imputation. Confirm against the training pipeline that
    the resulting column order still matches config.ORDERED_COLUMNS.
    """
    # extract first letter from cabin
    data = pf.extract_cabin_letter(data, 'cabin')

    # add missing-value indicator variables BEFORE numerical imputation
    for var in ['age', 'fare']:
        data = pf.add_missing_indicator(data, var)

    # impute NA categorical
    for var in config.CATEGORICAL_VARS:
        data = pf.impute_na(data, var, config.IMPUTATION_DICT)

    # impute NA numerical
    for var in ['age', 'fare']:
        data = pf.impute_na(data, var, config.IMPUTATION_DICT)

    # Group rare labels
    for var in config.CATEGORICAL_VARS:
        data = pf.remove_rare_labels(data, config.FREQUENT_LABELS, var)

    # encode variables
    for var in config.CATEGORICAL_VARS:
        data = pf.encode_categorical(data, var)

    # check all dummies were added
    data = pf.check_dummy_variables(data, config.DUMMY_VARIABLES)

    # scale variables
    data = pf.scale_features(data, config.ORDERED_COLUMNS,
                             config.OUTPUT_SCALER_PATH)

    # make predictions
    predictions = pf.predict(data, config.ORDERED_COLUMNS,
                             config.OUTPUT_MODEL_PATH)

    return predictions
示例#9
0
def predict(data):
    """Clean ``data``, engineer features, and return model predictions."""
    # drop duplicated rows
    data = pf.drop_duplicate(data)

    # body-mass index engineered from weight and height
    data['bmi'] = pf.bmi_feature(data, 'weight', 'height')

    # clip extreme blood-pressure readings (systolic first, then diastolic)
    for pressure_col in ('ap_hi', 'ap_lo'):
        data[pressure_col] = pf.remove_outlier(data, pressure_col)

    # scale the model features with the persisted scaler
    data = pf.scale_features(data[config.FEATURES], config.OUTPUT_SCALER_PATH)

    # score with the persisted model
    return pf.predict(data, config.OUTPUT_MODEL_PATH)
示例#10
0
def predict(data):
    """Score the supplied ``data`` with the persisted pipeline and model.

    BUG FIX(review): the original ignored its ``data`` argument — it
    reloaded the full dataset from disk and re-split it, scoring a fresh
    test split instead of the caller's input. The load/split is removed so
    the function honours its contract; a leftover debug print is gone too.

    NOTE(review): ``pf.impute_num`` and the argument-less
    ``pf.remove_rare_labels``/``pf.impute_na`` calls differ from the
    signatures used by sibling examples — verify against the
    preprocessing-functions module.
    """
    # add missingness indicators for the categorical variables
    data = pf.add_missing_indicator(data, config.CATEGORICAL_VARS)

    # extract first letter from cabin
    data = pf.extract_cabin_letter(data, 'cabin')

    # impute NA categorical
    data = pf.impute_na(data, config.CATEGORICAL_VARS)

    # impute NA numerical (indicator first, then the fill value)
    data = pf.add_missing_indicator(data, config.NUMERICAL_TO_IMPUTE)
    data = pf.impute_num(data, config.NUMERICAL_TO_IMPUTE)

    # Group rare labels
    data = pf.remove_rare_labels(data, config.CATEGORICAL_VARS)

    # encode variables
    data, data_features = pf.encode_categorical(data, config.CATEGORICAL_VARS)

    # check all dummies were added
    data = pf.check_dummy_variables(data, config.DUMMY_VARIABLES)

    # scale variables
    data = pf.scale_features(data, config.OUTPUT_SCALER_PATH)

    # make predictions (predicted class and score/probability)
    class_, pred = pf.predict(data, config.OUTPUT_MODEL_PATH)

    return class_
示例#11
0
def predict(data):
    """Apply the persisted preprocessing steps and score ``data``."""
    # fill categorical NAs with an explicit 'Missing' label
    for feature in config.CATEGORICAL_TO_IMPUTE:
        data[feature] = pf.impute_na(data, feature, replacement='Missing')

    # fill numerical NAs with the mode captured during training
    data[config.NUMERICAL_TO_IMPUTE] = pf.impute_na(
        data, config.NUMERICAL_TO_IMPUTE, replacement=config.LOTFRONTAGE_MODE)

    # convert the year variable into elapsed time up to the sale year
    data[config.YEAR_VARIABLE] = pf.elapsed_years(
        data, config.YEAR_VARIABLE, ref_var='YrSold')

    # log-transform the skewed numerical variables
    for feature in config.NUMERICAL_LOG:
        data[feature] = pf.log_transform(data, feature)

    # group infrequent labels into a shared bucket
    for feature in config.CATEGORICAL_ENCODE:
        data[feature] = pf.remove_rare_labels(
            data, feature, config.FREQUENT_LABELS[feature])

    # encode categorical variables with the training-time mappings
    for feature in config.CATEGORICAL_ENCODE:
        data[feature] = pf.encode_categorical(
            data, feature, config.ENCODING_MAPPINGS[feature])

    # scale the model features with the persisted scaler
    data = pf.scale_features(data[config.FEATURES],
                             config.OUTPUT_SCALER_PATH)

    # return the model's predictions
    return pf.predict(data, config.OUTPUT_MODEL_PATH)
示例#12
0
    X_train[var + '_na'] = pf.add_missing_indicator(X_train, var)
    X_train[var] = pf.impute_na(X_train, var, config.IMPUTATION_DICT[var])
    X_test[var + '_na'] = pf.add_missing_indicator(X_test, var)
    X_test[var] = pf.impute_na(X_test, var, config.IMPUTATION_DICT[var])

# Group rare labels
# NOTE(review): this fragment sits below a truncated loop and relies on
# X_train/X_test/y_train defined outside the visible excerpt.
for var in config.CATEGORICAL_VARS:
    # collapse labels not seen frequently at training time, on both splits
    X_train[var] = pf.remove_rare_labels(X_train, var,
                                         config.FREQUENT_LABELS[var])
    X_test[var] = pf.remove_rare_labels(X_test, var,
                                        config.FREQUENT_LABELS[var])

# encode categorical variables
for var in config.CATEGORICAL_VARS:
    X_train = pf.encode_categorical(X_train, var)
    X_test = pf.encode_categorical(X_test, var)

# check all dummies were added
X_train = pf.check_dummy_variables(X_train, config.DUMMY_VARIABLES)
X_test = pf.check_dummy_variables(X_test, config.DUMMY_VARIABLES)

# train scaler and save
scaler = pf.train_scaler(X_train, config.OUTPUT_SCALER_PATH)

# scale train set
# NOTE(review): X_test is prepared above but never scaled or scored here —
# presumably evaluated elsewhere; confirm against the rest of the project.
X_train = pf.scale_features(X_train, config.OUTPUT_SCALER_PATH)

# train model and save
pf.train_model(X_train, y_train, config.OUTPUT_MODEL_PATH)

print('Finished training')
# NOTE(review): this fragment assumes X_train/y_train exist from earlier,
# non-visible lines of the scraped example.
X_train = pf.add_features(X_train)

# Add the region column
X_train = pf.add_region(X_train, config.REGION_BOUNDS)

# Apply cube-root transformation
for var in config.CBRT_TRANSFORM:
    X_train[var] = pf.cbrt_transform(X_train, var)
# the target is cube-root transformed as well
y_train = pf.cbrt_transform(y_train)

# Train standard scaler on numerical variables only
scaled = X_train[config.NUM_VARS].copy()
scaler = pf.train_scaler(scaled, config.SCALER_PATH)

# Scale the numerical data
scaled.iloc[:,:] = pf.scale_features(scaled, config.SCALER_PATH)

# One-hot encode all the categorical variables
categoricals = []
for var in config.CAT_VARS:
    categoricals.append(pf.encode_categorical(X_train, var))

# Final design matrix for training
X_train = pf.concat_dfs(scaled, categoricals)

# Assert we have the desired features
# NOTE(review): `assert` is stripped under `python -O`; an explicit check
# raising an error would be safer in production.
assert X_train.columns.tolist() == config.FEATURES

# Train the default linear regression model
pf.train_linreg_model(X_train, y_train, config.LINEAR_REG_MODEL_PATH)
示例#14
0
# add missing indicator #Note that I added this to conform train.py with notebook.
# NOTE(review): X_train/y_train come from earlier, non-visible lines.
for var in ['age', 'fare']:
    X_train = pf.add_missing_indicator(X_train, var)


# Group rare labels
# NOTE(review): the argument order (labels mapping before the variable name)
# differs from other examples in this file — verify pf's signature.
for var in config.CATEGORICAL_VARS:
    X_train = pf.remove_rare_labels(X_train, config.FREQUENT_LABELS, var)


# encode categorical variables
for var in config.CATEGORICAL_VARS:
    X_train = pf.encode_categorical(X_train, var)


# check all dummies were added
X_train = pf.check_dummy_variables(X_train, config.DUMMY_VARIABLES)

# train scaler and save
pf.train_scaler(X_train, config.ORDERED_COLUMNS, config.OUTPUT_SCALER_PATH)


# scale train set
X_train = pf.scale_features(X_train, config.ORDERED_COLUMNS, config.OUTPUT_SCALER_PATH)


# train model and save
pf.train_model(X_train, config.ORDERED_COLUMNS, y_train, config.OUTPUT_MODEL_PATH)


print('Finished training')