Exemplo n.º 1
0
def test_single_prediction(trained_model):
    """Check that predicting on a single test row gives the expected value.

    ``trained_model`` is not used in the body; it is presumably a pytest
    fixture that makes sure a trained model is available -- TODO confirm
    against the fixture definition.
    """

    test_data = load_dataset(file_name=config.TESTING_DATA_FILE)
    # slice (rather than index) so the first row is kept as a one-row frame
    single_test_input = test_data.iloc[0:1]

    pred = make_prediction(input_data=single_test_input)

    assert pred is not None
    # The comparison must be written as ``actual == approx(expected)``.
    # Passing the actual value *into* approx() only builds an approx object
    # (with 0.0285 taken as the relative tolerance) and never compares
    # anything.
    assert pred.get('predictions')[0] == pytest.approx(0.0285, abs=1e-3)
def run_testing(file_name=config.TESTING_DATA_FILE) -> Tuple[float, float]:
    """
    Score the saved pipeline of the current version on held out data

    Parameters
    ----------

    file_name:
        data file handed to ``load_dataset``
        (default: ``config.TESTING_DATA_FILE``)

    Returns
    -------

    The pair ``(test_mape, test_99per)`` produced by ``get_accuracy``;
    both values are also written to the log at INFO level.
    """

    held_out = load_dataset(file_name=file_name)

    # pipeline file name = configured prefix + version + '.pkl'
    pipeline = load_pipeline(
        file_name=f'{config.PIPELINE_SAVE_FILE}{_version}.pkl')

    # the 'wall_time' column of the loaded data is passed as the target
    scores = get_accuracy(pipeline, held_out, held_out['wall_time'])
    test_mape, test_99per = scores

    logger.info(f'Testing Mean absolute % error: {test_mape}')
    logger.info(f'Testing 99th Percentile % error: {test_99per}')

    return test_mape, test_99per
Exemplo n.º 3
0
def test_multiple_predictions(trained_model):
    """Predict on the whole test file and check count and error bounds.

    ``trained_model`` is not referenced in the body; presumably a pytest
    fixture that provides the trained model -- TODO confirm.
    """

    test_data = load_dataset(file_name=config.TESTING_DATA_FILE)

    pred = make_prediction(input_data=test_data)
    assert pred is not None

    # one prediction is expected per input row
    n_predictions = len(pred.get('predictions'))
    assert n_predictions == test_data.shape[0]

    # score the persisted pipeline of the current version on the same data,
    # with the 'wall_time' column passed as the target
    curr_model = load_pipeline(
        file_name=f'{config.PIPELINE_SAVE_FILE}{_version}.pkl')
    test_mape, percentile_99 = get_accuracy(
        curr_model,
        test_data,
        test_data['wall_time'],
    )
    print(f'Test MAPE score: {test_mape}, 99th Percentile: {percentile_99}')

    # Current Model expected MAPE accuracy is ~18.0
    mape_limit = 30.0
    assert test_mape < mape_limit

    percentile_limit = 150.0
    assert percentile_99 < percentile_limit
    'nn_model__input_dim': [
        22,
    ],
    'nn_model__nodes_per_layer': [(10, 10, 5), (10, 10, 7, 5)],
    'nn_model__dropout': [0, 0.05, 0.1, 0.015, 0.2],
    'nn_model__batch_size': [64, 128, 256, 512],
    'nn_model__epochs': [100, 200, 300, 400],
    'nn_model__optimizer': ['adam'],  #, 'rmsprop'],  # adam is better
    'nn_model__learning_rate': [0.0001, 0.0005, 0.001, 0.005, 0.01],
}

if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected block

    # change max_rows, None means all data
    data = load_dataset(file_name=config.TRAINING_DATA_FILE, nrows=None)

    X_train, X_test, y_train, y_test = get_train_test_split(data,
                                                            test_size=0.2)

    grid_search = GridSearchCV(
        qc_time_nn,
        parameters,
        scoring={
            'percentile99': make_scorer(percentile_rel_90,
                                        greater_is_better=False),
            'MAPE': make_scorer(mape, greater_is_better=False),
        },
        refit='percentile99',
        n_jobs=-1,  # -2 to use all CPUs except one
        return_train_score=True,
Exemplo n.º 5
0
def test_load_dataset():
    """Loading the training file with ``nrows=5`` must give five rows."""

    requested_rows = 5
    frame = load_dataset(file_name=config.TRAINING_DATA_FILE,
                         nrows=requested_rows)

    row_count = frame.shape[0]
    assert row_count == requested_rows
def run_training(with_accuracy=True,
                 overwrite=True,
                 use_all_data=False) -> Union[Tuple[float, float], None]:
    """
    Run training using the data and params in the config file
    Saves the fitted model through ``save_pipeline``
    Also saves the transformed test split to ``config.TESTING_DATA_FILE``
    Optionally: calculate the train and test accuracy (Mean Absolute Percent
    Error and 99th percentile error)

    Parameters
    ----------

    with_accuracy: bool, default True
        If true, calculate and log the training and test accuracy and
        return the two MAPE values

    overwrite: bool, default True
        If false and ``current_model_exists()`` is true, training is
        skipped and nothing is saved

    use_all_data: bool, default False
        Split with ``test_size=None, train_size=0.99`` instead of the sizes
        from the config (used ONLY for out of sample prediction in
        production)

    Returns
    -------

    ``(train_mape, test_mape)`` when ``with_accuracy`` is true; ``None``
    when training is skipped or ``with_accuracy`` is false.
    """

    skip_training = not overwrite and current_model_exists()
    if skip_training:
        logger.info("Model is already saved. Skipping training")
        return None

    logger.info('Reading training data.')
    data = load_dataset(file_name=config.TRAINING_DATA_FILE)

    logger.debug(f'Training data columns: \n{data.columns}')

    # configured split sizes, replaced when all data should go to training
    test_size = config.TEST_SIZE
    train_size = config.TRAIN_SIZE
    if use_all_data:
        test_size = None
        train_size = 0.99

    split = get_train_test_split(data,
                                 test_size=test_size,
                                 train_size=train_size)
    X_train, X_test, y_train, y_test = split

    logger.info('Start fitting model...')

    # Persist a transformed copy of the test split before fitting the model.
    # NOTE(review): the feature pipeline is fitted on the test split here;
    # confirm this is intended.
    formatted_test = input_features_pipeline.fit_transform(X_test, y_test)
    save_data(X=formatted_test,
              y=y_test,
              file_name=config.TESTING_DATA_FILE)

    # apply the configured best parameters, then fit and persist the model
    model.set_params(**config.BEST_MODEL_PARAMS)
    model.fit(X_train, y_train)

    logger.info(f'Saving model version: {_version}')
    save_pipeline(pipeline_to_persist=model)

    if not with_accuracy:
        return None

    train_mape, train_99per = get_accuracy(model, X_train, y_train)
    test_mape, test_99per = get_accuracy(model, X_test, y_test)

    logger.info(f'Training Mean absolute % error: {train_mape}')
    logger.info(f'Testing Mean absolute % error: {test_mape}')

    logger.info(f'Training 99th Percentile % error: {train_99per}')
    logger.info(f'Testing 99th Percentile % error: {test_99per}')

    return train_mape, test_mape