# Imports for the pipelines below. NB: this is Python 2 era code
# (dict.iteritems(), StringIO); the paths(), qualify(), publish() and
# execute() helpers and the TIMEZONE, DAYS_BLACK_LIST, DAYS_PLOT and
# DAYS_PLOT_DEBUG constants are defined elsewhere in the asystem-amodel
# module.
import datetime
import os
import re
import shutil
import sys
import tempfile
import time
from StringIO import StringIO

import dill
import joblib
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession


def pipeline():
    remote_data_path = sys.argv[1] if len(sys.argv) > 1 else \
        "s3a://asystem-astore-staging"
    remote_model_path = sys.argv[2] if len(sys.argv) > 2 else \
        "s3a://asystem-amodel-staging/asystem/amodel/energyforecastintraday"
    local_model_path = sys.argv[3] if len(sys.argv) > 3 else \
        tempfile.mkdtemp()
    print("Pipeline starting on [{}]\n".format(remote_data_path))
    time_start = int(round(time.time()))
    spark = SparkSession.builder \
        .appName("asystem-amodel-energyforecastintraday").getOrCreate()
    print("Session created ...")
    ds_energy = spark.read.parquet(*paths(
        qualify(remote_data_path +
                "/[0-9]/asystem/astore/processed/canonical/parquet/dict/snappy"),
        ["/*/*/*/*/astore_metric=energy"], "/*.snappy.parquet"))
    ds_sun = spark.read.parquet(*paths(
        qualify(remote_data_path +
                "/[0-9]/asystem/astore/processed/canonical/parquet/dict/snappy"),
        ["/*/*/*/*/astore_metric=sun"], "/*.snappy.parquet"))
    print("Listing finished ...")
    ds_energy.createOrReplaceTempView('energy')
    ds_energy.cache()
    # Daily cumulative production, plus each day's sunrise/sunset epochs.
    df_energy = spark.sql("""
        SELECT bin_timestamp, data_value / data_scale AS bin_energy
        FROM energy
        WHERE data_metric='energy__production__inverter'
          AND data_type='integral'
          AND bin_width=1 AND bin_unit='day'
        ORDER BY bin_timestamp ASC
    """).toPandas()
    ds_sun.createOrReplaceTempView('sun')
    ds_sun.cache()
    df_sun_rise = spark.sql("""
        SELECT bin_timestamp, data_value / data_scale AS bin_sunrise
        FROM sun
        WHERE data_metric='sun__outdoor__rise'
          AND data_type='epoch'
          AND bin_width=1 AND bin_unit='day'
        ORDER BY bin_timestamp ASC
    """).toPandas()
    df_sun_set = spark.sql("""
        SELECT bin_timestamp, data_value / data_scale AS bin_sunset
        FROM sun
        WHERE data_metric='sun__outdoor__set'
          AND data_type='epoch'
          AND bin_width=1 AND bin_unit='day'
        ORDER BY bin_timestamp ASC
    """).toPandas()
    spark.catalog.clearCache()
    print("Dataframes collected ...")
    # Key each reading by local calendar date, then join on the daily
    # maximum (the day's total) and the day's sunrise/sunset.
    df = df_energy.set_index(
        pd.to_datetime(df_energy['bin_timestamp'], unit='s')
            .dt.tz_localize('UTC').dt.tz_convert(TIMEZONE))
    df['bin_date'] = df.index.date
    df.set_index('bin_date', inplace=True)
    df_energy_day = df.groupby(df.index)['bin_energy'].max().to_frame() \
        .rename(columns={'bin_energy': 'bin_energy_day'})
    df = df.merge(df_energy_day, how='inner',
                  left_index=True, right_index=True)
    df_sun_rise.set_index(
        pd.to_datetime(df_sun_rise['bin_timestamp'], unit='s')
            .dt.tz_localize('UTC').dt.tz_convert(TIMEZONE), inplace=True)
    df_sun_rise['bin_date'] = df_sun_rise.index.date
    df_sun_rise.set_index('bin_date', inplace=True)
    df = df.merge(
        df_sun_rise.groupby(df_sun_rise.index)['bin_sunrise'].max().to_frame(),
        how='inner', left_index=True, right_index=True)
    df_sun_set.set_index(
        pd.to_datetime(df_sun_set['bin_timestamp'], unit='s')
            .dt.tz_localize('UTC').dt.tz_convert(TIMEZONE), inplace=True)
    df_sun_set['bin_date'] = df_sun_set.index.date
    df_sun_set.set_index('bin_date', inplace=True)
    df = df.merge(
        df_sun_set.groupby(df_sun_set.index)['bin_sunset'].max().to_frame(),
        how='inner', left_index=True, right_index=True)
    df.set_index(
        pd.to_datetime(df['bin_timestamp'], unit='s')
            .dt.tz_localize('UTC').dt.tz_convert(TIMEZONE), inplace=True)
    df.sort_index(inplace=True)
    print("Output compiled ...")
    print("\nTraining data:\n{}\n\n".format(df.describe()))
    # Partition days: PURGED (blacklisted), TOVETT (today or later, not
    # yet complete) and VETTED (usable for training).
    dfvs = {'VETTED': {}, 'PURGED': {}, 'TOVETT': {}}
    for dfs in df.groupby(df.index.date):
        day = dfs[0].strftime('%Y/%m/%d')
        dfvs['PURGED' if day in DAYS_BLACK_LIST else
             ('TOVETT' if day >= datetime.datetime.now()
                 .strftime("%Y/%m/%d") else 'VETTED')][day] = dfs[1]
    for vetting in dfvs:
        for day, dfv in sorted(dfvs[vetting].iteritems()):
            dfv.set_index(
                pd.to_datetime(dfv['bin_timestamp'], unit='s')
                    .dt.tz_localize('UTC').dt.tz_convert(TIMEZONE),
                inplace=True)
            if DAYS_PLOT and DAYS_PLOT_DEBUG:
                dfv.plot(title="Energy ({}) - {}".format(day, vetting),
                         y=['bin_energy', 'bin_energy_day'])
    for vetting in dfvs:
        print("Processed {} {} days ...".format(
            len(dfvs[vetting]), vetting.lower()))
    # Normalise each vetted day's cumulative energy to [0, 1] of its total
    # and standardise its timestamps to 1000 bins between sunrise and
    # sunset, yielding comparable daylight-progress curves.
    dfnss = []
    bins = 1000
    for day, dfv in sorted(dfvs['VETTED'].iteritems()):
        dfv['normalised'] = dfv['bin_energy'] / dfv['bin_energy_day']
        dfv['standardised'] = bins * \
            (dfv['bin_timestamp'] - dfv['bin_sunrise']) / \
            (dfv['bin_sunset'] - dfv['bin_sunrise'])
        dfv['standardised'] = dfv['standardised'].clip(0, bins).astype(int)
        dfns = dfv.drop(
            ['bin_timestamp', 'bin_energy', 'bin_energy_day',
             'bin_sunrise', 'bin_sunset'], axis=1).drop_duplicates()
        dfns.set_index('standardised', inplace=True)
        dfns.sort_index(inplace=True)
        dfns = dfns[~dfns.index.duplicated(keep='first')]
        dfns = dfns.reindex(np.arange(0, bins + 1)).ffill()
        # Pin the ends of the curve: no production before sunrise, all of
        # the day's production by sunset.
        dfns.loc[0:10] = 0
        dfns.loc[990:1000] = 1
        dfnss.append(dfns)
        if DAYS_PLOT and DAYS_PLOT_DEBUG:
            dfns.plot(title="Energy ({}) - VETTED".format(day))
    dfnsa = pd.concat(dfnss, axis=1, ignore_index=True)
    if DAYS_PLOT:
        dfnsa.plot(title="Energy Normalised/Standardised (All) - VETTED",
                   legend=False)
    # The model is simply the mean curve across all vetted days.
    dfnsa = pd.concat(dfnss)
    dfnsa = dfnsa.groupby(dfnsa.index).mean()
    if DAYS_PLOT:
        dfnsa.plot(title="Energy Normalised/Standardised (Mean) - VETTED",
                   legend=False)
    print("Model built ...")
    # The ${...} tokens are substituted at build time.
    model_file = '/model/pickle/joblib/none/' \
                 'amodel_version=${project.version}/' \
                 'amodel_model=' \
                 '${asystem-model-energyforecast-intraday.build.version}/' \
                 'model.pkl'
    local_model_file = local_model_path + model_file
    remote_model_file = remote_model_path + model_file
    if os.path.exists(os.path.dirname(local_model_file)):
        shutil.rmtree(os.path.dirname(local_model_file))
    os.makedirs(os.path.dirname(local_model_file))
    # Serialise the execute() closure with dill so the scoring logic ships
    # inside the joblib bundle alongside the curve.
    pickled_execute = StringIO()
    dill.dump(execute, pickled_execute)
    pickled_execute.flush()
    joblib.dump({'pipeline': dfnsa, 'execute': pickled_execute},
                local_model_file, compress=True)
    print("Model serialised ...")
    # Round-trip the bundle as a smoke test before publishing.
    model = joblib.load(local_model_file)
    dfi = pd.DataFrame([
        {"energy__production_Dforecast_Ddaylight__inverter": 0},
        {"energy__production_Dforecast_Ddaylight__inverter": 250},
        {"energy__production_Dforecast_Ddaylight__inverter": 500},
        {"energy__production_Dforecast_Ddaylight__inverter": 750},
        {"energy__production_Dforecast_Ddaylight__inverter": 1000}
    ]).apply(pd.to_numeric, errors='ignore')
    dfo = dill.load(StringIO(model['execute'].getvalue()))(
        model=model, features=dfi, prediction=True)
    print("Model de-serialised ...")
    print("\nEnergy Mean Input:\n{}\n\nEnergy Mean Output:\n{}\n".format(
        dfi, dfo))
    publish(local_model_file, remote_model_file)
    shutil.rmtree(local_model_path)
    print("Model published ...")
    print("\nPipeline finished in [{}] s".format(
        int(round(time.time())) - time_start))
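
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the pipeline above): how a consumer might
# read the serialised bundle's mean curve. The curve maps a daylight-progress
# bin (0 at sunrise, 1000 at sunset) to the fraction of the day's total energy
# produced so far. The linear toy curve and the forecast_at() helper are
# invented for illustration; only the {'pipeline': ..., 'execute': ...} bundle
# layout comes from the code above.
import numpy as np
import pandas as pd

bins = 1000
toy_curve = pd.DataFrame({'normalised': np.linspace(0, 1, bins + 1)},
                         index=np.arange(0, bins + 1))


def forecast_at(curve, daylight_bin, predicted_day_total):
    # Expected cumulative energy by this point in daylight, given a
    # (separately modelled) forecast of the day's total production.
    fraction = curve.loc[int(round(daylight_bin)), 'normalised']
    return fraction * predicted_day_total


print(forecast_at(toy_curve, 500, 20.0))  # -> 10.0 with the linear toy curve
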
# A second, standalone pipeline (appName "asystem-amodel-dataset") that
# extracts a cleaned per-sensor temperature training set to CSV.
def pipeline():
    remote_data_path = sys.argv[1] if len(sys.argv) > 1 else \
        "s3a://asystem-astore"
    print("Pipeline starting on [{}]\n".format(remote_data_path))
    time_start = int(round(time.time()))
    spark = SparkSession.builder.appName(
        "asystem-amodel-dataset").getOrCreate()
    print("Session created ...")
    dataset = spark.read.parquet(*paths(
        qualify(remote_data_path +
                "/[0-9]/asystem/astore/processed/canonical/parquet/dict/snappy"),
        ["/*/*/*/*/astore_metric=temperature"], "/*.snappy.parquet"))
    print("Listing finished ...")
    dataset.createOrReplaceTempView('dataset')
    dataset.cache()
    # Current point-in-time readings only, excluding forecast metrics and
    # the parents/shed/roof sensors.
    dataset = spark.sql("""
        SELECT
          bin_timestamp AS timestamp,
          data_metric AS metric,
          data_temporal AS temporal,
          data_value / data_scale AS temperature
        FROM dataset
        WHERE data_temporal='current'
          AND data_type='point'
          AND data_version=2
          AND data_metric NOT LIKE '%forecast%'
          AND data_metric NOT LIKE '%parents'
          AND data_metric NOT LIKE '%shed'
          AND data_metric NOT LIKE '%roof'
        ORDER BY timestamp
    """)
    dataframe = dataset.toPandas()
    spark.catalog.clearCache()
    print("Dataframe collected ...")
    # One column per metric, indexed by local (Perth) time.
    dataframe = dataframe.pivot_table(values='temperature',
                                      index='timestamp', columns='metric')
    dataframe = dataframe.set_index(
        pd.to_datetime(dataframe.index, unit='s')
            .tz_localize('UTC').tz_convert('Australia/Perth'))
    dataframe = dataframe.loc[
        dataframe.index.strftime('%Y-%m-%d') >= '2018-07-19']
    # Plug sensor gaps, resample to 5 minute bins, plug again, then keep
    # only rows where every reading lies between -10 and 50 degrees C.
    dataframe = dataframe.fillna(method='bfill')
    dataframe = dataframe.fillna(method='ffill')
    dataframe = dataframe.resample('300S').mean()
    dataframe = dataframe.fillna(method='bfill')
    dataframe = dataframe.fillna(method='ffill')
    dataframe = dataframe.round(1)
    dataframe = dataframe.loc[(dataframe < 50).all(axis=1), :]
    dataframe = dataframe.loc[(dataframe > -10).all(axis=1), :]
    # Shorten 'domain__location__sensor' metric names to their final
    # component.
    dataframe.columns = dataframe.columns.map(
        lambda name: re.compile('.*__.*__(.*)').sub('\\1', name))
    print("Output compiled ...")
    print("\nTraining data:\n{}\n\n".format(dataframe.describe()))
    output = tempfile.NamedTemporaryFile(prefix='asystem-temperature-',
                                         suffix='.csv', delete=False).name
    dataframe.to_csv(output)
    print("Wrote output to [{}]".format(output))
    print("\nPipeline finished in [{}] s".format(
        int(round(time.time())) - time_start))
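
# ---------------------------------------------------------------------------
# Illustrative sketch (not part of the pipeline above): the pivot/resample/
# fill cleaning sequence on invented readings, mirroring the steps applied to
# the temperature dataframe. The metric names and values are made up.
import pandas as pd

raw = pd.DataFrame({
    'timestamp': [0, 0, 400, 700],
    'metric': ['t__indoor__lounge', 't__outdoor__garden',
               't__indoor__lounge', 't__outdoor__garden'],
    'temperature': [21.0, 15.5, 21.4, 14.9],
})

toy = raw.pivot_table(values='temperature', index='timestamp',
                      columns='metric')
toy = toy.set_index(pd.to_datetime(toy.index, unit='s')
                    .tz_localize('UTC').tz_convert('Australia/Perth'))
toy = toy.fillna(method='bfill').fillna(method='ffill')  # plug sensor gaps
toy = toy.resample('300S').mean()                        # 5 minute bins
toy = toy.fillna(method='bfill').fillna(method='ffill').round(1)
print(toy)  # one column per metric, evenly spaced, no NaNs
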