Example #1
File: score.py Project: annplaube/mldeploy
def predict(data):

    # extract first letter from cabin
    pf.extract_cabin_letter(data, 'cabin')

    # impute NA numerical
    for var in config.NUMERICAL_TO_IMPUTE:
        pf.add_missing_indicator(data, var)
        pf.impute_na(data, var, config.IMPUTATION_DICT[var])

    # impute NA categorical
    for var in config.CATEGORICAL_VARS:
        pf.impute_na(data, var)

    # Group rare labels
    for var, labels in config.FREQUENT_LABELS.items():
        pf.remove_rare_labels(data, var, labels)

    # encode variables
    for var in config.CATEGORICAL_VARS:
        data = pf.encode_categorical(data, var)

    # check all dummies were added
    pf.check_dummy_variables(data, config.DUMMY_VARIABLES)

    data = data[config.ALL_VARS]

    # scale variables
    data = pf.scale_features(data, config.OUTPUT_SCALER_PATH)

    # make predictions
    predictions = pf.predict(data, config.OUTPUT_MODEL_PATH)

    return predictions
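Note: every example on this page assumes two project-local modules that are not shown here, preprocessing_functions (imported as pf) and config. As a rough, purely illustrative guess at what Example #1's config might contain (the constant names come from the call sites above; the values are made-up assumptions, not the project's real settings):

# Hypothetical sketch of the config module used by Example #1.
# Constant names are taken from the calls above; values are illustrative only.
NUMERICAL_TO_IMPUTE = ['age', 'fare']
IMPUTATION_DICT = {'age': 28.0, 'fare': 14.45}        # e.g. train-set medians
CATEGORICAL_VARS = ['sex', 'cabin', 'embarked', 'title']
FREQUENT_LABELS = {'sex': ['male', 'female'],
                   'cabin': ['Missing', 'C'],
                   'embarked': ['S', 'C', 'Q'],
                   'title': ['Mr', 'Miss', 'Mrs']}
DUMMY_VARIABLES = ['sex_male', 'cabin_Missing', 'embarked_S', 'title_Mr']
ALL_VARS = ['age', 'fare'] + DUMMY_VARIABLES
OUTPUT_SCALER_PATH = 'scaler.pkl'
OUTPUT_MODEL_PATH = 'logistic_regression.pkl'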
Example #2
def predict(data):

    # extract first letter from cabin
    data[config.EXTRACT_VARIABLE] = pf.extract_cabin_letter(
        data, config.EXTRACT_VARIABLE)

    # impute NA categorical
    for var in config.CATEGORICAL_TO_ENCODE:
        data[var] = pf.impute_na(data, var, replacement='Missing')

    # impute NA numerical
    for var in config.NUMERICAL_TO_IMPUTE:
        if var == 'age':
            data[var] = pf.add_missing_indicator(data, var, config.AGE_MEDIAN)
        else:
            data[var] = pf.add_missing_indicator(data, var, config.FARE_MEDIAN)

    # Group rare labels
    for var in config.CATEGORICAL_TO_ENCODE:
        data[var] = pf.remove_rare_labels(data, var, config.RARE_VALUE)

    # encode variables
    for var in config.CATEGORICAL_TO_ENCODE:
        data = pf.encode_categorical(data, var)

    # check all dummies were added
    pf.check_dummy_variables(data, config.DUMMY_VARIABLE)

    # scale variables
    data = pf.scale_features(data[config.FEATURES], config.OUTPUT_SCALER_PATH)

    # make predictions
    predictions = pf.predict(data, config.OUTPUT_MODEL_PATH)

    return predictions
Example #3
def predict(data):
    # impute categorical variables
    for var in config.CATEGORICAL_VARS:
        data[var] = pf.impute_na(data, var, replacement='Missing')

    # impute numerical variables
    for var in config.NUMERICAL_TO_IMPUTE:

        # add missing indicator first
        data[var + '_na'] = pf.add_missing_indicator(data, var)

        # impute NA
        data[var] = pf.impute_na(data, var,
                                 replacement=config.IMPUTATION_DICT[var])

    # Group rare labels
    for var in config.CATEGORICAL_VARS:
        data[var] = pf.remove_rare_labels(data, var,
                                          config.FREQUENT_LABELS[var])
    # encode categorical variables
    for var in config.CATEGORICAL_VARS:
        data = pf.encode_categorical(data, var)

    # check all dummies were added
    data = pf.check_dummy_variables(data, config.DUMMY_VARIABLES)

    # scale variables
    data = pf.scale_features(data, config.OUTPUT_SCALER_PATH)
    # make predictions
    predictions = pf.predict(data, config.OUTPUT_MODEL_PATH)

    return predictions
Example #4
def predict(data):

    # extract first letter from cabin
    data['cabin'] = pf.extract_cabin_letter(data, 'cabin')

    # impute NA categorical
    data[config.CATEGORICAL_VARS] = pf.impute_na(data[config.CATEGORICAL_VARS],
                                                 'Missing')

    # impute NA numerical
    data[config.NUMERICAL_TO_IMPUTE] = pf.impute_na(
        data[config.NUMERICAL_TO_IMPUTE], 'Numerical')

    # Group rare labels
    for var in config.CATEGORICAL_VARS:
        data[var] = pf.remove_rare_labels(data, var,
                                          config.FREQUENT_LABELS[var])

    # encode variables
    data = pf.encode_categorical(data, config.CATEGORICAL_VARS)
    print(data.shape)

    # check all dummies were added
    data = pf.check_dummy_variables(data, config.DUMMY_VARIABLES)
    print(data.shape)

    # scale variables
    data = pf.scale_features(data, config.OUTPUT_SCALER_PATH)

    # make predictions
    predictions = pf.predict(data, config.OUTPUT_MODEL_PATH)

    return predictions
Example #5
def predict(data):

    # extract first letter from cabin
    data["cabin"] = pf.extract_cabin_letter(data, "cabin")

    # impute NA categorical
    for var in config.CATEGORICAL_VARS:
        data[var] = pf.impute_na(data, var, replacement="Missing")

    # impute NA numerical
    for var in config.NUMERICAL_TO_IMPUTE:
        median_val = data[var].median()
        data[var] = pf.impute_na(data, var, replacement=median_val)

    # Group rare labels
    for var in config.CATEGORICAL_VARS:
        data[var] = pf.remove_rare_labels(data, var, config.FREQUENT_LABELS[var])

    # encode variables
    for var in config.CATEGORICAL_VARS:
        data = pf.encode_categorical(data, var)

    # check all dummies were added
    data = pf.check_dummy_variables(data, config.DUMMY_VARIABLES)

    # scale variables
    data = pf.scale_features(data, config.OUTPUT_SCALER_PATH)

    # make predictions
    predictions = pf.predict(data, config.OUTPUT_MODEL_PATH)
    return predictions
Example #6
def predict(data):

    # extract first letter from cabin
    data["cabin"] = pf.extract_cabin_letter(data, "cabin")

    # impute NA categorical
    for var in config.CATEGORICAL_VARS:
        data[var] = pf.impute_na(data, var, replacement='Missing')

    # impute NA numerical
    for var in config.NUMERICAL_TO_IMPUTE:
        data[var + "_na"] = pf.add_missing_indicator(data, var)
        median_train_var = config.IMPUTATION_DICT[var]
        data[var] = pf.impute_na(data, var, replacement=median_train_var)

    # Group rare labels
    for var in config.CATEGORICAL_VARS:
        freq_labels = config.FREQUENT_LABELS[var]
        # Remove rare labels from both train and test set
        data[var] = pf.remove_rare_labels(data, var, freq_labels)

    # encode variables
    for var in config.CATEGORICAL_VARS:
        data = pf.encode_categorical(data, var)

    # check all dummies were added
    data = pf.check_dummy_variables(data, config.DUMMY_VARIABLES)

    # scale variables
    data = pf.scale_features(data, config.OUTPUT_SCALER_PATH)

    # make predictions
    predictions = pf.predict(data, config.OUTPUT_MODEL_PATH)

    return predictions
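The snippets above all call pf.impute_na and pf.add_missing_indicator without showing them, and their signatures clearly vary from project to project. A minimal, hypothetical pandas version matching the most common call pattern here (data[var] = pf.impute_na(data, var, replacement=...)) could look like the sketch below; it is an assumption, not any of these projects' actual code.

import numpy as np

def add_missing_indicator(df, var):
    # 1 where the column is missing, 0 otherwise
    return np.where(df[var].isnull(), 1, 0)

def impute_na(df, var, replacement='Missing'):
    # return the column with missing values replaced by a constant
    return df[var].fillna(replacement)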
Example #7
def predict(data):
    
    # extract first letter from cabin
    data['cabin'] = pf.extract_cabin_letter(data, var='cabin')

    # impute NA categorical
    for var in config.CATEGORICAL_VARS:
        data[var] = pf.impute_na(data, var, replacement='Missing')

    # impute NA numerical
    for var in config.NUMERICAL_TO_IMPUTE:
        data[var] = pf.add_missing_indicator(data, var)

    # Group rare labels
    for var in config.CATEGORICAL_VARS:
        data[var] = pf.remove_rare_labels(
            data, var, frequent_labels=config.FREQUENT_LABELS[var])

    # encode variables
    data = pf.encode_categorical(data, var=config.CATEGORICAL_VARS)

    # check all dummies were added
    data = pf.check_dummy_variables(data, dummy_list=config.DUMMY_VARIABLES)

    # scale variables
    data = pf.scale_features(data[config.FEATURES],
                             config.OUTPUT_SCALER_PATH)

    # make predictions
    predictions = pf.predict(data, config.OUTPUT_MODEL_PATH)

    return predictions
Example #8
def predict(data):
    
    # extract first letter from cabin
    X_test = pf.extract_cabin_letter(data, config.IMPUTATION_DICT['cabin_variable'])

    # impute NA categorical
    X_test = pf.add_missing_indicator(X_test, config.CATEGORICAL_VARS)

    # impute NA numerical
    for var in config.NUMERICAL_TO_IMPUTE:
        X_test = pf.impute_na(X_test, var,
                              replace_by=config.IMPUTATION_DICT[var],
                              add_na_columns=True)

    # Group rare labels
    X_test = pf.remove_rare_labels(X_test, config.FREQUENT_LABELS)

    # encode variables
    for var in config.CATEGORICAL_VARS:
        X_test = pf.encode_categorical(X_test, var)
    X_test.drop(labels=config.CATEGORICAL_VARS, axis=1, inplace=True)

    # check all dummies were added
    X_test = pf.check_dummy_variables(X_test, config.DUMMY_VARIABLES)

    # scale variables
    X_test = pf.scale_features(X_test, config.OUTPUT_SCALER_PATH)

    # make predictions
    predictions = pf.predict(X_test, config.OUTPUT_MODEL_PATH)

    return predictions
Example #9
def predict(data):
    
    # extract first letter from cabin
    data = pf.extract_cabin_letter(data, 'cabin')

    # impute NA categorical
    for var in config.CATEGORICAL_VARS:
        data = pf.impute_na(data, var, config.IMPUTATION_DICT)

    # impute NA numerical
    for var in ['age', 'fare']:
        data = pf.impute_na(data, var, config.IMPUTATION_DICT)

    # add indicator variables
    for var in ['age', 'fare']:
        data = pf.add_missing_indicator(data, var)

    # Group rare labels
    for var in config.CATEGORICAL_VARS:
        data = pf.remove_rare_labels(data, config.FREQUENT_LABELS, var)

    # encode variables
    for var in config.CATEGORICAL_VARS:
        data = pf.encode_categorical(data, var)

    # check all dummies were added
    data = pf.check_dummy_variables(data, config.DUMMY_VARIABLES)

    # scale variables
    data = pf.scale_features(data, config.ORDERED_COLUMNS, config.OUTPUT_SCALER_PATH)

    # make predictions
    predictions = pf.predict(data, config.ORDERED_COLUMNS, config.OUTPUT_MODEL_PATH)

    return predictions
Example #10
def predict(data):

    data = pf.load_data(config.PATH_TO_DATASET)

    X_train, X_test, y_train, y_test = pf.divide_train_test(data, config.TARGET)
    data = X_test.copy()

    # impute categorical variables
    data = pf.add_missing_indicator(data, config.CATEGORICAL_VARS)

    # extract first letter from cabin
    data = pf.extract_cabin_letter(data, 'cabin')

    # impute NA categorical
    data = pf.impute_na(data, config.CATEGORICAL_VARS)

    # impute NA numerical
    data = pf.add_missing_indicator(data, config.NUMERICAL_TO_IMPUTE)
    data = pf.impute_num(data, config.NUMERICAL_TO_IMPUTE)

    # Group rare labels
    data = pf.remove_rare_labels(data, config.CATEGORICAL_VARS)

    # encode variables
    data, data_features = pf.encode_categorical(data, config.CATEGORICAL_VARS)

    print(data.head(1))
    
    # check all dummies were added
    data = pf.check_dummy_variables(data, config.DUMMY_VARIABLES)
    
    # scale variables
    data = pf.scale_features(data, config.OUTPUT_SCALER_PATH)

    # make predictions
    class_, pred = pf.predict(data, config.OUTPUT_MODEL_PATH)

    
    return class_
Example #11
def predict(data):

    # extract first letter from cabin
    data['cabin'] = pf.extract_cabin_letter(data, 'cabin')

    # impute NA categorical
    for el in config.CATEGORICAL_VARS:
        data[el] = pf.impute_na(data, el, replacement='Missing')

    # impute NA numerical
    for el in config.NUMERICAL_TO_IMPUTE:

        # add missing indicator first
        data[el + '_NA'] = pf.add_missing_indicator(data, el)

        # impute NA
        data[el] = pf.impute_na(data,
                                el,
                                replacement=config.IMPUTATION_DICT[el])

    # Group rare labels
    for el in config.CATEGORICAL_VARS:
        data[el] = pf.remove_rare_labels(data, el, config.FREQUENT_LABELS[el])

    # encode variables
    for el in config.CATEGORICAL_VARS:
        data = pf.encode_categorical(data, el)

    # check all dummies were added
    data = pf.check_dummy_variables(data, config.DUMMY_VARIABLES)

    # scale variables
    data = pf.scale_features(data, config.OUTPUT_SCALER_PATH)

    # make predictions
    predictions = pf.predict(data, config.OUTPUT_MODEL_PATH)

    return predictions
Example #12
# impute numerical variables
for var in config.NUMERICAL_TO_IMPUTE:
    X_train[var + '_na'] = pf.add_missing_indicator(X_train, var)
    X_train[var] = pf.impute_na(X_train, var, config.IMPUTATION_DICT[var])
    X_test[var + '_na'] = pf.add_missing_indicator(X_test, var)
    X_test[var] = pf.impute_na(X_test, var, config.IMPUTATION_DICT[var])

# Group rare labels
for var in config.CATEGORICAL_VARS:
    X_train[var] = pf.remove_rare_labels(X_train, var,
                                         config.FREQUENT_LABELS[var])
    X_test[var] = pf.remove_rare_labels(X_test, var,
                                        config.FREQUENT_LABELS[var])

# encode categorical variables
for var in config.CATEGORICAL_VARS:
    X_train = pf.encode_categorical(X_train, var)
    X_test = pf.encode_categorical(X_test, var)

# check all dummies were added
X_train = pf.check_dummy_variables(X_train, config.DUMMY_VARIABLES)
X_test = pf.check_dummy_variables(X_test, config.DUMMY_VARIABLES)

# train scaler and save
scaler = pf.train_scaler(X_train, config.OUTPUT_SCALER_PATH)

# scale train set
X_train = pf.scale_features(X_train, config.OUTPUT_SCALER_PATH)

# train model and save
pf.train_model(X_train, y_train, config.OUTPUT_MODEL_PATH)

print('Finished training')
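Examples #12 through #15 switch from scoring to training and rely on pf.train_scaler (fit and persist a scaler) plus pf.scale_features (reload it and transform). Those helpers are not shown on this page; under the assumption that they wrap scikit-learn and joblib, a minimal sketch might be:

import joblib
from sklearn.preprocessing import StandardScaler

def train_scaler(df, output_path):
    # fit a scaler on the training features and save it for scoring time
    scaler = StandardScaler()
    scaler.fit(df)
    joblib.dump(scaler, output_path)
    return scaler

def scale_features(df, scaler_path):
    # reload the saved scaler and apply the same transformation
    scaler = joblib.load(scaler_path)
    return scaler.transform(df)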
Example #13
for var in config.NUMERICAL_TO_IMPUTE:
    # add missing indicator
    pf.add_missing_indicator(X_train, var)

    # replace NaN by median
    median_val = X_train[var].median()
    X_train[var] = pf.impute_na(X_train, var, value=median_val)

# Group rare labels
for var in config.CATEGORICAL_VARS:
    X_train[var] = pf.remove_rare_labels(X_train, var,
                                         config.FREQUENT_LABELS[var])

# encode categorical variables
for var in config.CATEGORICAL_VARS:
    X_train = pf.encode_categorical(X_train, var)

# check all dummies were added
for var in config.CATEGORICAL_VARS:
    pf.check_dummy_variables(X_train, config.DUMMY_VARIABLES[var])

# train scaler and save
scaler = pf.train_scaler(X_train, config.OUTPUT_SCALER_PATH)

# scale train set
X_train = scaler.transform(X_train)

# train model and save
pf.train_model(X_train, y_train, config.OUTPUT_MODEL_PATH)

print('Finished training')
Example #14
# impute numerical variables
medians = config_file[1]['Parameters'].get('imputation_dict')
for var in num_vars:
    X_train = pf.add_missing_indicator(X_train, var)
    X_train[var] = pf.impute_na(X_train, var, medians.get(var))

# Group rare labels
frequent_list = config_file[1]['Parameters'].get('frequent_labels')
for var in cat_vars:
    X_train[var] = pf.remove_rare_labels(X_train, var, frequent_list)

# encode categorical variables
dummies = config_file[1]['Parameters'].get('dummy_variables')
for var in cat_vars:
    X_train = pf.encode_categorical(X_train, var)

# check all dummies were added
X_train = pf.check_dummy_variables(X_train, dummies)

# train scaler and save
output_path = config_file[0]['Paths'].get('output_scaler_path')
output_model_path = config_file[0]['Paths'].get('output_model_path')
scaler = pf.train_scaler(X_train, output_path)

# scale train set
X_train = scaler.transform(X_train)
y_train = y_train.astype(int)
# train model and save
pf.train_model(X_train, y_train, output_model_path)
print('Finished training')
Example #15
xtrain['cabin'] = pf.extract_cabin_letter(xtrain, 'cabin')

# impute categorical variables
xtrain[config.CATEGORICAL_VARS] = pf.impute_na(xtrain[config.CATEGORICAL_VARS],
                                               'Missing')

# impute numerical variable
xtrain[config.NUMERICAL_TO_IMPUTE] = pf.impute_na(
    xtrain[config.NUMERICAL_TO_IMPUTE], 'Numerical')

# Group rare labels
for var in config.CATEGORICAL_VARS:
    xtrain[var] = pf.remove_rare_labels(xtrain, var,
                                        config.FREQUENT_LABELS[var])

# encode categorical variables
xtrain = pf.encode_categorical(xtrain, config.CATEGORICAL_VARS)

# check all dummies were added
xtrain = pf.check_dummy_variables(xtrain, config.DUMMY_VARIABLES)

# train scaler and save
scaler = pf.train_scaler(xtrain, config.OUTPUT_SCALER_PATH)

# scale train set
xtrain = pf.scale_features(xtrain, config.OUTPUT_SCALER_PATH)

# train model and save
pf.train_model(xtrain, ytrain, config.OUTPUT_MODEL_PATH)

print('Finished training')
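All of the predict(data) examples above take a pandas DataFrame of raw passenger data and return the loaded model's predictions for it. A hypothetical call site might look like the following (the CSV name is made up; score.py is the file name given for Example #1):

import pandas as pd

import score  # e.g. the score.py from Example #1

if __name__ == '__main__':
    # score a batch of previously unseen passengers
    new_data = pd.read_csv('new_passengers.csv')   # illustrative file name
    print(score.predict(new_data))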