def main(input_train, input_test, output_train, output_test):
    """Turn the raw train/test data into processed data sets.

    Runs data processing scripts to turn raw data from (../raw) into
    cleaned data ready to be analyzed (saved in ../processed).

    Args:
        input_train: Location of the raw training data, handed to ``DataSet``.
        input_test: Location of the raw test data, handed to ``DataSet``.
        output_train: Destination handed to ``TitanicPreProcessing`` together
            with the training frame.
        output_test: Destination handed to ``TitanicPreProcessing`` together
            with the test frame.
    """
    logging.getLogger(__name__).info('making final data set from raw data')

    source = DataSet(input_train, input_test)
    # Both frames are fetched before any preprocessing starts.
    train_frame = source.get_train_set()
    test_frame = source.get_test_set()

    # The constructed objects are discarded; presumably instantiating
    # TitanicPreProcessing performs the cleaning and writes the result
    # (TODO confirm against the class definition).
    jobs = ((train_frame, output_train), (test_frame, output_test))
    for frame, destination in jobs:
        TitanicPreProcessing(frame, destination)
def main(input_filepath, output_filepath):
    """Clean the raw train/test CSV files and store the processed versions.

    Runs data processing scripts to turn raw data from (../raw) into
    cleaned data ready to be analyzed (saved in ../processed).

    Args:
        input_filepath: Directory holding ``train.csv`` and ``test.csv``.
        output_filepath: Directory under which ``train_clean.csv`` and
            ``test_clean.csv`` are addressed.
    """
    logging.getLogger(__name__).info('making final data set from raw data')

    # Raw inputs.
    raw_train_path = input_filepath + '/train.csv'
    raw_test_path = input_filepath + '/test.csv'
    loader = DataSet(train_dir=raw_train_path, test_dir=raw_test_path)

    # Cleaned outputs.
    clean_train_path = output_filepath + '/train_clean.csv'
    clean_test_path = output_filepath + '/test_clean.csv'
    wrangler = DataWrangling(train_dir=clean_train_path, test_dir=clean_test_path)

    train_frame = loader.get_train_set()
    test_frame = loader.get_test_set()

    # Both splits are preprocessed before either one is handed to the
    # processed_*_data methods.
    train_clean = wrangler.apply_preprocessing(train_frame, target='Survived')
    test_clean = wrangler.apply_preprocessing(test_frame, target='Survived')

    wrangler.processed_train_data(train_clean)
    wrangler.processed_test_data(test_clean)
def main(input_train, input_test, input_model, output_prediction):
    """Predict outcomes for the test set and write them to a CSV file.

    Runs modeling scripts using model pickle (../models) to predict
    outcomes. Outcomes file is saved as .csv (saved in ../models).

    Args:
        input_train: Location of the training data, handed to ``DataSet``.
        input_test: Location of the test data, handed to ``DataSet``.
        input_model: Directory containing the stored ``XGBClassifier`` model.
            A trailing path separator is optional.
        output_prediction: Directory in which ``submission_<model name>.csv``
            is written. A trailing path separator is optional.
    """
    # Function-scope import keeps this block self-contained.
    import os

    logger = logging.getLogger(__name__)
    logger.info('predicting outcomes')

    data = DataSet(train_dir=input_train, test_dir=input_test)
    test = data.get_test_set()
    X_test = data.get_features(test)

    # os.path.join inserts a separator only when one is missing, so both
    # 'models/' and 'models' resolve to the same file. Plain string
    # concatenation produced 'modelsXGBClassifier' for the latter.
    model = Model.load(os.path.join(input_model, 'XGBClassifier'))
    y_pred = model.predict(X_test)

    output = pd.DataFrame({
        'PassengerId': test['PassengerId'],
        'Survived': y_pred,
    })

    submission_name = 'submission_{}.csv'.format(model.name)
    output.to_csv(os.path.join(output_prediction, submission_name), index=False)