Example No. 1
import boto
import joblib
import numpy as np
import pandas as pd
import s3fs
import s3io
from fastparquet import ParquetFile, write
from statsmodels.tsa.arima_model import ARMA


def fit_model_and_forecast(id_list, config):

    # Cast collection of distinct time series IDs into Python list
    id_list = list(id_list)

    # Open handles to S3: an s3fs opener for Parquet I/O and a boto
    # connection for pickle uploads
    s3 = s3fs.S3FileSystem()
    s3_open = s3.open
    s3_conn = boto.connect_s3(host=config['s3_host'])

    # Loop over time series IDs
    for ts_id in id_list:

        # Resolve the S3 paths of this ID's Parquet partition and load the
        # data into a pandas dataframe
        file_paths = s3.glob(config['path_training_data_parquet'] + 'ID=' + str(ts_id) +
                             '/*.parquet')
        df_data = ParquetFile(file_paths, open_with=s3_open).to_pandas()

        # Sort the series into its original ordering and reset the index so
        # that the integer-range lookups below follow the sorted order
        df_data = df_data.sort_values('ORDER').reset_index(drop=True)

        # Initialize dataframe to store forecast
        df_forecasts = pd.DataFrame(np.nan, index=range(0, config['len_eval']),
                                    columns=['FORECAST'])

        # Add columns with ID, true data and ordering information
        df_forecasts.insert(0, 'ID', ts_id, allow_duplicates=True)
        df_forecasts.insert(1, 'ORDER', np.arange(1, config['len_eval'] + 1))
        df_forecasts.insert(2, 'DATA',
                            df_data['DATA'][range((config['len_series'] - config['len_eval']),
                                                  config['len_series'])].values,
                            allow_duplicates=True)

        # Loop over successive estimation windows
        for j, train_end in enumerate(range((config['len_series'] - config['len_eval'] - 1),
                                            (config['len_series'] - 1))):

            # Fit ARMA(2,2) model and forecast one-step ahead
            model = ARMA(df_data['DATA'][range(0, train_end+1)], (2, 2)).fit(disp=False)
            df_forecasts.at[j, 'FORECAST'] = model.predict(train_end+1, train_end+1)

        # Write dataframe with forecast to S3 in Parquet file format
        path = config['path_forecasts'] + 'ID=' + str(ts_id) + '.parquet'
        write(path, df_forecasts, write_index=False, append=False, open_with=s3_open)

        # Save the ARMA model from the final window to S3 in pickle file format
        path = config['path_models'] + 'ID=' + str(ts_id) + '.model'
        with s3io.open(path, mode='w', s3_connection=s3_conn) as s3_file:
            joblib.dump(model, s3_file)
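
The function only reads a handful of keys from config. A minimal sketch of a call is shown below; every path, host and length is a hypothetical placeholder for illustration, not a value from the original project:

    # Hypothetical configuration; all values are illustrative placeholders
    config = {
        's3_host': 's3.amazonaws.com',
        'path_training_data_parquet': 'my-bucket/training/',  # expects trailing '/'
        'path_forecasts': 'my-bucket/forecasts/',
        'path_models': 'my-bucket/models/',
        'len_series': 120,  # total observations per series
        'len_eval': 12,     # forecasts produced per series
    }

    fit_model_and_forecast([1, 2, 3], config)

With these lengths, each series is fit on an expanding window and a one-step-ahead ARMA(2,2) forecast is produced for each of the last 12 observations.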
Example No. 2
import pandas as pd
from fastparquet import ParquetFile, write


def test_mixed_partition_types(tempdir, partitions):
    # 'tempdir' and 'partitions' are pytest fixtures: a temporary directory
    # and the partition values under test
    df = pd.DataFrame({'partitions': partitions, 'x': [1, 2]})
    write(tempdir, df, file_scheme='hive', partition_on=['partitions'])
    out = ParquetFile(tempdir).to_pandas()
    # Partition values must survive the round trip once rows are realigned on 'x'
    assert (out.sort_values('x').set_index('x').partitions
            == df.sort_values('x').set_index('x').partitions).all()