def predict(data):
    """Run the full preprocessing pipeline on raw input and return model predictions.

    Applies imputation, elapsed-time conversion, log transforms, rare-label
    grouping, categorical encoding and scaling, then scores with the
    persisted model. Mutates/overwrites columns of ``data`` in place.
    """
    # fill missing categorical values with an explicit 'Missing' label
    for col in config.CATEGORICAL_TO_IMPUTE:
        data[col] = pf.impute_na(data, col, replacement='Missing')

    # fill missing numerical values with the stored LotFrontage mode
    data[config.NUMERICAL_TO_IMPUTE] = pf.impute_na(
        data,
        config.NUMERICAL_TO_IMPUTE,
        replacement=config.LOTFRONTAGE_MODE,
    )

    # replace the year variable with elapsed years relative to the sale year
    data[config.YEAR_VARIABLE] = pf.elapsed_years(
        data, config.YEAR_VARIABLE, ref_var='YrSold',
    )

    # log-transform skewed numerical features
    for col in config.NUMERICAL_LOG:
        data[col] = pf.log_transform(data, col)

    # collapse infrequent categories into a shared label
    for col in config.CATEGORICAL_ENCODE:
        data[col] = pf.remove_rare_labels(data, col, config.FREQUENT_LABELS[col])

    # map categories to their numeric encodings
    for col in config.CATEGORICAL_ENCODE:
        data[col] = pf.encode_categorical(data, col, config.ENCODING_MAPPINGS[col])

    # scale the selected feature set with the persisted scaler
    data = pf.scale_features(data[config.FEATURES], config.OUTPUT_SCALER_PATH)

    # score with the persisted model
    return pf.predict(data, config.OUTPUT_MODEL_PATH)
def predict(df):
    """Preprocess a raw trips DataFrame and return model predictions.

    Fix: removed two leftover duplicate debug ``print`` statements that
    dumped ``df.shape`` to stdout on every call.

    NOTE(review): the one-hot encoder is *fit* on the data being scored, so
    the generated columns depend on the categories present at serving time;
    if a training-time category is absent here, the ``df[config.FEATURES]``
    selection below will raise. The encoder fitted during training should be
    persisted and re-used here — TODO confirm against the training pipeline.
    """
    # derive time-based features from the raw timestamps
    df = pf.extract_time(df)
    # log-transform the configured skewed variables
    df = pf.log_transform(df, config.LOG_VARS)
    # cast selected columns to strings ahead of categorical handling
    df = pf.to_str(df, config.VAR_TO_STR)
    # NOTE(review): the same frame is passed twice — the second argument is
    # presumably a reference/training frame; verify against pf.reduce_cardinality.
    df = pf.reduce_cardinality(df, df)
    df = pf.cat_to_str(df)

    # one-hot encode categorical variables (see NOTE above about fitting here)
    encoder = ce.OneHotEncoder(use_cat_names=True)
    df = encoder.fit_transform(df)

    # scale the model features with the persisted scaler
    df = pf.scale_features(df[config.FEATURES], config.OUTPUT_SCALER_PATH)

    # score with the persisted model
    predictions = pf.predict(df, config.OUTPUT_MODEL_PATH)

    return predictions
def predict(data):
    """Apply the preprocessing pipeline to ``data`` and return model predictions."""
    # impute missing values (categorical: explicit 'Missing' label)
    for var in config.CATEGORICAL_TO_IMPUTE:
        data[var] = pf.impute_na(data, var, replacement='Missing')
    # numerical: impute with the stored LotFrontage mode
    data[config.NUMERICAL_TO_IMPUTE] = pf.impute_na(data,
        config.NUMERICAL_TO_IMPUTE,
        replacement=config.LOTFRONTAGE_MODE)

    # convert the year variable to elapsed time relative to the sale year
    data[config.YEAR_VARIABLE] = pf.elapsed_years(data,
        config.YEAR_VARIABLE, ref_var='YrSold')

    # log-transform skewed numerical variables
    for var in config.NUMERICAL_LOG:
        data[var] = pf.log_transform(data, var)

    # group infrequent labels into a shared category
    for var in config.CATEGORICAL_ENCODE:
        data[var] = pf.remove_rare_labels(data, var,
            config.FREQUENT_LABELS[var])

    # encode categorical variables with the stored mappings
    for var in config.CATEGORICAL_ENCODE:
        data[var] = pf.encode_categorical(data, var,
            config.ENCODING_MAPPINGS[var])

    # scale variables with the persisted scaler
    data = pf.scale_features(data[config.FEATURES],
        config.OUTPUT_SCALER_PATH)

    # obtain predictions from the persisted model
    predictions = pf.predict(data, config.OUTPUT_MODEL_PATH)

    return predictions
# imputar variables categóricas for var in config.CATEGORICAL_TO_IMPUTE: X_train[var] = pf.impute_na(X_train, var, replacement='Missing') # imputar variables numéricas X_train[config.NUMERICAL_TO_IMPUTE] = pf.impute_na( X_train, config.NUMERICAL_TO_IMPUTE, replacement=config.LOTFRONTAGE_MODE) # intervalos de tiempo X_train[config.YEAR_VARIABLE] = pf.elapsed_years(X_train, config.YEAR_VARIABLE, ref_var='YrSold') # transformación logarítmica for var in config.NUMERICAL_LOG: X_train[var] = pf.log_transform(X_train, var) # agrupación de categorías poco frecuentes for var in config.CATEGORICAL_ENCODE: X_train[var] = pf.remove_rare_labels(X_train, var, config.FREQUENT_LABELS[var]) # codificación de variables categóricas for var in config.CATEGORICAL_ENCODE: X_train[var] = pf.encode_categorical(X_train, var, config.ENCODING_MAPPINGS[var]) # entrenear y guardar el escalador scaler = pf.train_scaler(X_train[config.FEATURES], config.OUTPUT_SCALER_PATH) # escalar variables
# Fix: the original assigned the trip-duration/cleaning results to `train`
# and then immediately overwrote `train` from the raw `df`, silently
# discarding that work. Apply the cleaning to `df` BEFORE splitting so all
# three splits (train/val/test) carry the 'trip_seconds' column and the
# non-positive-duration rows are removed.
df = pf.trip_length(df, 'tpep_pickup_datetime', 'tpep_dropoff_datetime')
df = pf.remove_zero_or_neg_time(df, 'trip_seconds')

# divide data into train, val, test by drop-off date
train = df[df.tpep_dropoff_datetime <= pd.to_datetime('2017-06-30')]
test = df[df.tpep_dropoff_datetime >= pd.to_datetime('2017-11-01')]
val = train[train.tpep_dropoff_datetime >= pd.to_datetime('2017-06-01')]
train = train[train.tpep_dropoff_datetime < pd.to_datetime('2017-06-01')]

# continue preprocessing on the training split
train = pf.extract_time(train)
train = pf.log_transform(train, config.LOG_VARS)
train = pf.to_str(train, config.VAR_TO_STR)
# NOTE(review): same frame passed twice — second arg is presumably the
# reference frame for cardinality reduction; verify against pf.reduce_cardinality.
train = pf.reduce_cardinality(train, train)
train = pf.cat_to_str(train)

# one-hot encode categorical variables
encoder = ce.OneHotEncoder(use_cat_names=True)
X_train = encoder.fit_transform(train)

# y_train
y_train = train[config.TARGET]

# fit the scaler on the model features and persist it
scaler = pf.train_scaler(X_train[config.FEATURES], config.OUTPUT_SCALER_PATH)