def main(input_train, input_test, output_train, output_test):
    """Build the final train and test data sets from the raw inputs.

    Loads both splits through ``DataSet(input_train, input_test)`` and then
    passes each split, paired with its output location, to
    ``TitanicPreProcessing``.

    Args:
        input_train: Location of the raw training data, forwarded to
            ``DataSet``.
        input_test: Location of the raw test data, forwarded to ``DataSet``.
        output_train: Second argument given to ``TitanicPreProcessing`` for
            the training split (presumably the destination of the cleaned
            data; confirm against ``TitanicPreProcessing``).
        output_test: Same as ``output_train``, for the test split.
    """
    logging.getLogger(__name__).info('making final data set from raw data')

    source = DataSet(input_train, input_test)

    # Both splits are fetched before any preprocessing starts.
    train_frame = source.get_train_set()
    test_frame = source.get_test_set()

    # Training split is processed first, then the test split.
    for frame, destination in (
        (train_frame, output_train),
        (test_frame, output_test),
    ):
        TitanicPreProcessing(frame, destination)
def main(input_data, output_model):
    """Tune a model on the training data and save the result.

    Loads the training set through ``DataSet(train_dir=input_data)``, splits
    it into features and label with the data set's own helpers, tunes the
    estimator at index 4 of ``models`` against the grid at index 4 of
    ``params`` via ``Model.tune``, and calls ``save`` on the returned model.

    Args:
        input_data: Location of the training data, forwarded to ``DataSet``
            as ``train_dir``.
        output_model: Prefix for the saved model's location; the model's
            ``name`` is appended to it directly.
    """
    logging.getLogger(__name__).info('training model')

    dataset = DataSet(train_dir=input_data)
    train_frame = dataset.get_train_set()
    features = dataset.get_features(train_frame)
    labels = dataset.get_label(train_frame)

    # Estimator and parameter grid are selected by the same fixed index.
    estimator, grid = models[4], params[4]
    tuned = Model.tune(estimator, features, labels, grid)

    # NOTE(review): plain concatenation, no separator is inserted, so
    # output_model presumably has to end with a path separator; confirm
    # against the callers.
    destination = output_model + tuned.name
    tuned.save(destination)
def main(input_filepath, output_filepath):
    """Clean the raw train and test sets and hand them to the wrangler.

    Reads ``train.csv`` and ``test.csv`` locations under ``input_filepath``
    through ``DataSet``, applies ``DataWrangling.apply_preprocessing`` (with
    ``target='Survived'``) to each split, and passes the results to
    ``processed_train_data`` / ``processed_test_data``.

    Args:
        input_filepath: Directory prefix of the raw data; ``'/train.csv'``
            and ``'/test.csv'`` are appended to it.
        output_filepath: Directory prefix given to ``DataWrangling``;
            ``'/train_clean.csv'`` and ``'/test_clean.csv'`` are appended
            to it.
    """
    logging.getLogger(__name__).info('making final data set from raw data')

    raw_train_path = input_filepath + '/train.csv'
    raw_test_path = input_filepath + '/test.csv'
    source = DataSet(train_dir=raw_train_path, test_dir=raw_test_path)

    clean_train_path = output_filepath + '/train_clean.csv'
    clean_test_path = output_filepath + '/test_clean.csv'
    wrangler = DataWrangling(
        train_dir=clean_train_path,
        test_dir=clean_test_path,
    )

    # Load both splits before any preprocessing is applied.
    raw_train = source.get_train_set()
    raw_test = source.get_test_set()

    # Both splits go through the same preprocessing call with the same target.
    clean_train = wrangler.apply_preprocessing(raw_train, target='Survived')
    clean_test = wrangler.apply_preprocessing(raw_test, target='Survived')

    wrangler.processed_train_data(clean_train)
    wrangler.processed_test_data(clean_test)