Пример #1
0
def main(input_train, input_test, output_train, output_test):
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).
    """
    logger = logging.getLogger(__name__)
    logger.info('making final data set from raw data')

    raw_data = DataSet(input_train, input_test)

    df_train = raw_data.get_train_set()
    df_test = raw_data.get_test_set()

    TitanicPreProcessing(df_train, output_train)
    TitanicPreProcessing(df_test, output_test)
Пример #2
0
def main(input_data, output_model):
    """ Runs modeling scripts using processed data (../raw) to
        create model. Model is saved as pickle (saved in ../models).
    """
    logger = logging.getLogger(__name__)
    logger.info('training model')

    data = DataSet(train_dir=input_data)
    train = data.get_train_set()
    X_train = data.get_features(train)
    y = data.get_label(train)

    clf = models[4]
    param_grid = params[4]

    model = Model.tune(clf, X_train, y, param_grid)
    model.save(output_model + model.name)
Пример #3
0
def main(input_filepath, output_filepath):
    """ Runs data processing scripts to turn raw data from (../raw) into
        cleaned data ready to be analyzed (saved in ../processed).
    """
    logger = logging.getLogger(__name__)
    logger.info('making final data set from raw data')

    raw_data = DataSet(train_dir=input_filepath + '/train.csv',
                       test_dir=input_filepath + '/test.csv')
    cleaning = DataWrangling(train_dir=output_filepath + '/train_clean.csv',
                             test_dir=output_filepath + '/test_clean.csv')

    df_train = raw_data.get_train_set()
    df_test = raw_data.get_test_set()
    df_train_clean = cleaning.apply_preprocessing(df_train, target='Survived')
    df_test_clean = cleaning.apply_preprocessing(df_test, target='Survived')
    cleaning.processed_train_data(df_train_clean)
    cleaning.processed_test_data(df_test_clean)