示例#1
0
def validate(pipeline_params, model, encoder, track, date_cutoff, max_date):
    """Score *model* on the validation slice and log/persist the results.

    Streams the data once, keeps the rows passing ``validate_filter``,
    predicts on the encoded rows, computes r2 against ``unit_sales``,
    logs metrics and artifacts, writes the model, and renders a
    validation plot.
    """
    target_name = 'unit_sales'

    # Materialize the filtered validation rows once instead of streaming
    # the full dataset twice (the original read it once for predictions
    # and a second time for targets).
    validation_rows = [row for row in stream_data(pipeline_params)
                       if validate_filter(row, date_cutoff, max_date)]

    # Pull targets before encoding in case the encoder mutates rows
    # in place — TODO(review): confirm encode_data_stream is non-mutating.
    target = [row[target_name] for row in validation_rows]

    encoded_validate_stream = encoder.encode_data_stream(validation_rows)
    validation_predictions = [float(model.predict([row]))
                              for row in encoded_validate_stream]

    print("Calculating metrics")
    validation_metrics = {'r2_score': metrics.r2_score(
        y_true=target, y_pred=validation_predictions)}

    track.log_metrics(validation_metrics)
    fluentd_logger.log('validation_metrics', validation_metrics)

    write_predictions_and_score(validation_metrics)

    print("Evaluation done with metrics {}.".format(
        json.dumps(validation_metrics)))

    write_model(model)

    track.log_artifact(file_names['model'])

    make_validation_plot(target, validation_predictions, track)
示例#2
0
def run_ml_model(pipeline_params, encoder, track, date_cutoff, seed=None):
    """Train a model on the training slice of the data stream.

    Reads the filtered training rows, encodes them, logs the model and
    pipeline parameters to *track*, and trains the model named by
    ``pipeline_params['model_name']``.

    Returns:
        (trained_model, params) as produced by ``train_model``.
    """
    target_name = 'unit_sales'

    # Read the filtered training rows once instead of streaming the full
    # dataset twice (the original read it once for features and a second
    # time for targets).
    train_rows = [row for row in stream_data(pipeline_params)
                  if train_filter(row, date_cutoff)]

    print('Getting target')
    # Pull targets before encoding in case the encoder mutates rows
    # in place — TODO(review): confirm encode_data_stream is non-mutating.
    target = [row[target_name] for row in train_rows]

    print('Encoding data')
    # batch step, read it all in
    encoded_train_data = list(encoder.encode_data_stream(train_rows))

    model_name = pipeline_params['model_name']
    params = pipeline_params['model_params'][model_name]

    track.log_ml_params(params)
    track.log_pipeline_params(pipeline_params)

    trained_model, params = train_model(encoded_train_data,
                                        target,
                                        model_name,
                                        params,
                                        seed=seed)

    return trained_model, params
示例#3
0
def test_get_encoder_from_stream():
    """Fit an encoder from a bounded sample of the stream and check
    encode/decode round-trips, including an unseen categorical level."""
    from itertools import islice

    # islice stops cleanly at the end of a short stream; the original
    # `(next(stream) for _ in range(100))` raises RuntimeError under
    # PEP 479 if the stream has fewer than 100 rows.
    stream_small = islice(stream_data(pipeline_params), 100)
    encoder = get_encoder_from_stream(stream_small)
    assert isinstance(encoder, OneHotEncoder)

    stream_small = islice(stream_data(pipeline_params), 100)
    encoded = list(encoder.encode_data_stream(stream_small))
    assert len(encoded) == 100

    row_in = {
        'id': '88219279',
        'date': '2016-08-16',
        'item_nbr': '103520',
        'unit_sales': '10.0',
        'family': 'GROCERY I',
        'class': '1028',
        'perishable': '0',
        'year': '2016',
        'month': '8',
        'day': '16',
        'dayofweek': '1',
        'days_til_end_of_data': '364',
        'dayoff': 'False'
    }

    encoded = encoder.encode_data([row_in])
    decoded = encoder.decode_data(encoded)

    print(decoded[0].keys())
    print(row_in.keys())
    # id/date/unit_sales are not encoded features, so drop them before
    # comparing key sets.
    del row_in['date'], row_in['id'], row_in['unit_sales']

    assert decoded[0].keys() == row_in.keys()

    #  make a level not seen
    row_in['class'] = 'FOO'

    encoded = encoder.encode_data([row_in])
    decoded = encoder.decode_data(encoded)

    print(decoded[0].keys())
    print(row_in.keys())

    assert decoded[0].keys() == row_in.keys()
    print(decoded)
    assert decoded[0]['class'] == 'UNKNOWN_CATEGORICAL_LEVEL'
示例#4
0
def test_stream_data():
    """The data stream yields dict rows with an int 'perishable' field."""
    first_row = next(stream_data(pipeline_params))

    assert isinstance(first_row, dict)
    assert 'perishable' in first_row
    assert isinstance(first_row['perishable'], int)
示例#5
0
def get_max_date(pipeline_params):
    """Scan the full data stream and return the latest row date.

    Returns:
        The maximum date as a 'YYYY-MM-DD' string (e.g. '2017-08-15').
    """
    print('Getting max date')
    # Batch step: lazily parse every row's date and take the maximum.
    parsed_dates = (date_string_to_date(row["date"])
                    for row in stream_data(pipeline_params))
    max_date = max(parsed_dates)
    print('Max date: %s' % max_date)
    return max_date.strftime('%Y-%m-%d')