def pipeline():
    remote_data_path = sys.argv[1] if len(sys.argv) > 1 else \
        "s3a://asystem-amodel-staging/asystem/amodel/energyforecastinterday"
    remote_model_path = sys.argv[2] if len(sys.argv) > 2 else \
        "s3a://asystem-amodel-staging/asystem/amodel/energyforecastinterday"
    local_model_path = sys.argv[3] if len(sys.argv) > 3 else \
        tempfile.mkdtemp()
    spark = SparkSession.builder.appName(
        "asystem-amodel-energyforecast").getOrCreate()
    # noinspection PyStringFormat
    print(
        "Session started:\n Model version: "
        "[${asystem-model-energyforecast-interday.build.version}]\n "
        "ASystem version: [${project.version}]\n Local path: [{}]\n "
        "Data URI: [{}]\n Model URI: [{}]\n".format(
            local_model_path, remote_data_path, remote_model_path))

    # Load the training set and apply feature engineering.
    training_uri = nearest(
        qualify(
            remote_data_path + "/train/text/csv/none/" +
            "amodel_version=${project.version}/"
            "amodel_model=${asystem-model-energyforecast-interday.build.version}"
        ))
    print("Training:\n URI: [{}]".format(training_uri))
    df = spark.read.csv(training_uri, header=True).toPandas() \
        .apply(pd.to_numeric, errors='ignore')
    df2 = execute(features=df, engineering=True)
    print(" Dataframe:\n{}\n\n".format(df2.describe()))

    # Load the test set and apply feature engineering.
    test_uri = nearest(
        qualify(
            remote_data_path + "/test/text/csv/none/" +
            "amodel_version=${project.version}/"
            "amodel_model=${asystem-model-energyforecast-interday.build.version}"
        ))
    print("Testing:\n URI: [{}]".format(test_uri))
    dfv = spark.read.csv(test_uri, header=True).toPandas() \
        .apply(pd.to_numeric, errors='ignore')
    dfv2 = execute(features=dfv, engineering=True)
    print(" Dataframe:\n{}\n".format(dfv2.describe()))

    features_statistics = execute(features=pd.concat([df2, dfv2]),
                                  statistics=True)

    # Plot the pairplot to discover correlations between power generation and
    # the other variables.
    # Plot${TEMPLATE.PRE-PROCESSOR.OPEN} sns.set(style="ticks")
    # Plot${TEMPLATE.PRE-PROCESSOR.OPEN} sns.pairplot(df2, hue="condition")
    # Plot${TEMPLATE.PRE-PROCESSOR.OPEN} plt.show(block=False)

    def rmse(actual, predicted):
        from sklearn.metrics import mean_squared_error
        from math import sqrt
        return sqrt(mean_squared_error(actual, predicted))

    def train_model(_regr, train_dict, target):
        estimators = [('vectorizer', DictVectorizer(sparse=False)),
                      ('regr', _regr)]
        _pl = Pipeline(estimators)
        _pl.fit(train_dict, target)
        return _pl

    def prepare_data(raw_df, predictor_columns=execute(labels=True)[0]):
        predictors = raw_df[predictor_columns]
        target = None
        if 'energy' in raw_df.columns:
            target = raw_df.energy
        return predictors, target

    #
    # Build predictive models with linear regression:
    #
    # 1. Split the train and test data sets
    # 2. Filter the predictor columns
    # 3. Create dummy variables for the categorical variables
    #    (illustrated just below)
    # 4. Build a model with linear regression
    # 5. Evaluate
    #
    # Steps 1 to 4 should be handled by the pipeline model.
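
    # A hypothetical illustration of step 3 (values are not from this data
    # set): DictVectorizer one-hot encodes string-valued keys and passes
    # numeric keys through unchanged, so
    #   [{'condition': 'sunny', 'temp': 20.0},
    #    {'condition': 'cloudy', 'temp': 15.0}]
    # is vectorised, with columns ['condition=cloudy', 'condition=sunny',
    # 'temp'], into
    #   [[0., 1., 20.], [1., 0., 15.]]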

    # Split the development data and test data. The training data covers the
    # earlier portion of the series.
    dev_data = df2
    model_test_data = dfv2
    energies_train, energies_target = prepare_data(dev_data)

    # ## Encode condition from a category into dummy variables
    vectorizer = DictVectorizer(sparse=False)
    energies_cat_train = vectorizer.fit_transform(
        energies_train.to_dict(orient='record'))

    # ## Build a model with linear regression
    def evaluate_by_loo(energies_train, energies_target,
                        regr=LinearRegression()):
        loo = LeaveOneOut()
        loo.get_n_splits(energies_train)
        train_r2_scores = np.array([])
        test_r2_scores = np.array([])
        train_rmse_scores = np.array([])
        test_rmse_scores = np.array([])
        predicted_powers = np.array([])
        actual_powers = np.array([])

        # Train the linear regression model; the data set is small, so
        # leave-one-out cross-validation is affordable.
        for train_index, test_index in loo.split(energies_train):
            # print("Test index:{}".format(test_index))
            # print("TRAIN:", train_index, "TEST:", test_index)
            # regr = LinearRegression()
            x_train, x_test = energies_train[train_index], \
                energies_train[test_index]
            y_train, y_test = energies_target.iloc[train_index], \
                energies_target.iloc[test_index]
            regr.fit(x_train, y_train)
            # print(x_test, y_test)
            y_train_pred = execute(
                {
                    'pipeline': regr,
                    'statistics': features_statistics
                },
                features=x_train,
                prediction=True)
            y_test_pred = execute(
                {
                    'pipeline': regr,
                    'statistics': features_statistics
                },
                features=x_test,
                prediction=True)
            # print(y_test.values, y_test_pred)
            train_r2_score = regr.score(x_train, y_train)
            train_r2_scores = np.append(train_r2_scores, train_r2_score)
            test_r2_score = r2_score(y_test.values, y_test_pred)
            test_r2_scores = np.append(test_r2_scores, test_r2_score)
            train_rmse_score = rmse(y_train, y_train_pred)
            train_rmse_scores = np.append(train_rmse_scores, train_rmse_score)
            test_rmse_score = rmse(y_test.values, y_test_pred)
            test_rmse_scores = np.append(test_rmse_scores, test_rmse_score)
            actual_powers = np.append(actual_powers, y_test.values[0])
            predicted_powers = np.append(predicted_powers, y_test_pred[0])
            # print("Actual energy generation: {}\tPredicted energy "
            #       "generation: {}".format(y_test.values[0], y_test_pred[0]))
            # print("Train R^2 score: {}\tTest R^2 score: {}"
            #       .format(train_r2_score, test_r2_score))
            # print("Train RMSE: {}\tTest RMSE: {}\n"
            #       .format(train_rmse_score, test_rmse_score))

        # The standard deviation of the training data is the baseline for RMSE.
        # print("Standard deviation: {}".format(
        #     pd.DataFrame.std(energies_target)))
        print("Train average RMSE: {}\tTest average RMSE: {}".format(
            np.average(train_rmse_scores), np.average(test_rmse_scores)))
        print("Train average R^2: {}\tTest average R^2: {}".format(
            np.average(train_r2_scores), np.average(test_r2_scores)))
        return actual_powers, predicted_powers

    # Plot the LOO predictions, as per
    # http://scikit-learn.org/stable/auto_examples/plot_cv_predict.html
    actual_powers, predicted_powers = evaluate_by_loo(energies_cat_train,
                                                      energies_target)

    def plot_predict_actual(actual, predicted):
        import matplotlib.pyplot as plt
        fig, ax = plt.subplots()
        ax.scatter(actual, predicted, edgecolors=(0, 0, 0))
        ax.plot([actual.min(), actual.max()],
                [actual.min(), actual.max()], 'k--', lw=4)
        ax.set_xlabel('Measured')
        ax.set_ylabel('Predicted')
        plt.show(block=False)

    # Plot${TEMPLATE.PRE-PROCESSOR.OPEN} plot_predict_actual(actual_powers, predicted_powers)

    # Create the model with the dev data.
    def train_and_predict(regr, cat_train, target, cat_test, test_target):
        regr.fit(cat_train, target)
        pred_train = execute(
            {
                'pipeline': regr,
                'statistics': features_statistics
            },
            features=cat_train,
            prediction=True)
        pred = execute(
            {
                'pipeline': regr,
                'statistics': features_statistics
            },
            features=cat_test,
            prediction=True)
        dev_rmse = rmse(target.values, pred_train)
        test_rmse = rmse(test_target.values, pred)
        print("Dev RMSE: {}\tDev R^2 score: {}".format(
            dev_rmse, r2_score(target.values, pred_train)))
        print("Test RMSE: {}\tTest R^2 score: {}".format(
            test_rmse, r2_score(test_target.values, pred)))
        return regr, dev_rmse, test_rmse

    energies_test, energies_test_target = prepare_data(model_test_data)
    energies_cat_test = vectorizer.transform(
        energies_test.to_dict(orient='record'))

    # Train the candidate regressors and keep the one with the lowest dev RMSE.
    min_rmse = 10000000000
    best_model_test_rmse = 10000000000
    best_model = None
    for _regr in [
            LinearRegression(),
            ElasticNetCV(cv=4),
            RidgeCV(cv=4),
            LassoCV(cv=4)
    ]:
        print(type(_regr).__name__)
        _model, _rmse, _test_rmse = train_and_predict(_regr,
                                                      energies_cat_train,
                                                      energies_target,
                                                      energies_cat_test,
                                                      energies_test_target)
        if min_rmse > _rmse:
            best_model = _model
            min_rmse = _rmse
            best_model_test_rmse = _test_rmse
    print("Best model: {}\tMin Dev RMSE: {}\tTest RMSE: {}".format(
        type(best_model).__name__, min_rmse, best_model_test_rmse))

    model_file = '/model/pickle/joblib/none/amodel_version=${project.version}' \
                 '/amodel_model=${asystem-model-energyforecast-interday.build.version}/model.pkl'
    local_model_file = local_model_path + model_file
    remote_model_file = remote_model_path + model_file
    if os.path.exists(os.path.dirname(local_model_file)):
        shutil.rmtree(os.path.dirname(local_model_file))
    os.makedirs(os.path.dirname(local_model_file))

    # Serialise the vectoriser, best pipeline, feature statistics and the
    # dill-pickled execute function into a single joblib bundle.
    import dill
    from StringIO import StringIO
    from sklearn.externals import joblib
    pickled_execute = StringIO()
    dill.dump(execute, pickled_execute)
    pickled_execute.flush()
    joblib.dump(
        {
            'vectorizer': vectorizer,
            'pipeline': best_model,
            'statistics': features_statistics,
            'execute': pickled_execute
        },
        local_model_file,
        compress=True)

    # Example of serialized model usage.
    model = joblib.load(local_model_file)
    model['execute'] = dill.load(StringIO(model['execute'].getvalue()))
    energy_production_actual = dfv['energy__production__inverter'].iloc[0]
    energy_production_prediction = round(
        model['execute'](model=model,
                         features=model['execute'](features=dfv,
                                                   engineering=True),
                         prediction=True)[0], 1)
    energy_production_accuracy = int(
        round(energy_production_prediction / energy_production_actual * 100))
    print("Model prediction [{}] versus actual [{}] at accuracy [{}%]".format(
        energy_production_prediction, energy_production_actual,
        energy_production_accuracy))

    print("Model copy: {} -> {}".format(local_model_file, remote_model_file))
    publish(local_model_file, remote_model_file)
    shutil.rmtree(local_model_path)
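
# A minimal sketch of how a downstream consumer might load the published
# interday model bundle; it mirrors the in-line "serialized model usage"
# example above, but the local path and the raw feature dataframe (raw_df)
# are hypothetical:
#
#   import dill
#   from StringIO import StringIO
#   from sklearn.externals import joblib
#
#   model = joblib.load('/tmp/model.pkl')  # hypothetical local copy
#   model['execute'] = dill.load(StringIO(model['execute'].getvalue()))
#   prediction = model['execute'](
#       model=model,
#       features=model['execute'](features=raw_df, engineering=True),
#       prediction=True)[0]
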
def pipeline():
    remote_data_path = sys.argv[1] if len(sys.argv) > 1 else \
        "s3a://asystem-astore-staging"
    remote_model_path = sys.argv[2] if len(sys.argv) > 2 else \
        "s3a://asystem-amodel-staging/asystem/amodel/energyforecastintraday"
    local_model_path = sys.argv[3] if len(sys.argv) > 3 else \
        tempfile.mkdtemp()
    print("Pipeline starting on [{}]\n".format(remote_data_path))
    time_start = int(round(time.time()))
    spark = SparkSession.builder \
        .appName("asystem-amodel-energyforecastintraday").getOrCreate()
    print("Session created ...")
    ds_energy = spark.read.parquet(*paths(
        qualify(remote_data_path +
                "/[0-9]/asystem/astore/processed/canonical/parquet/dict/snappy"),
        ["/*/*/*/*/astore_metric=energy"], "/*.snappy.parquet"))
    ds_sun = spark.read.parquet(*paths(
        qualify(remote_data_path +
                "/[0-9]/asystem/astore/processed/canonical/parquet/dict/snappy"),
        ["/*/*/*/*/astore_metric=sun"], "/*.snappy.parquet"))
    print("Listing finished ...")
    ds_energy.createOrReplaceTempView('energy')
    ds_energy.cache()
    df_energy = spark.sql("""
        SELECT bin_timestamp, data_value / data_scale AS bin_energy
        FROM energy
        WHERE data_metric = 'energy__production__inverter'
          AND data_type = 'integral'
          AND bin_width = 1
          AND bin_unit = 'day'
        ORDER BY bin_timestamp ASC
    """).toPandas()
    ds_sun.createOrReplaceTempView('sun')
    ds_sun.cache()
    df_sun_rise = spark.sql("""
        SELECT bin_timestamp, data_value / data_scale AS bin_sunrise
        FROM sun
        WHERE data_metric = 'sun__outdoor__rise'
          AND data_type = 'epoch'
          AND bin_width = 1
          AND bin_unit = 'day'
        ORDER BY bin_timestamp ASC
    """).toPandas()
    df_sun_set = spark.sql("""
        SELECT bin_timestamp, data_value / data_scale AS bin_sunset
        FROM sun
        WHERE data_metric = 'sun__outdoor__set'
          AND data_type = 'epoch'
          AND bin_width = 1
          AND bin_unit = 'day'
        ORDER BY bin_timestamp ASC
    """).toPandas()
    spark.catalog.clearCache()
    print("Dataframes collected ...")
    df = df_energy.set_index(
        pd.to_datetime(df_energy['bin_timestamp'], unit='s')
        .dt.tz_localize('UTC').dt.tz_convert(TIMEZONE))
    df['bin_date'] = df.index.date
    df.set_index('bin_date', inplace=True)
    df_energy_day = df.groupby(df.index)['bin_energy'].max().to_frame() \
        .rename(columns={'bin_energy': 'bin_energy_day'})
    df = df.merge(df_energy_day, how='inner',
                  left_index=True, right_index=True)
    df_sun_rise.set_index(
        pd.to_datetime(df_sun_rise['bin_timestamp'], unit='s')
        .dt.tz_localize('UTC').dt.tz_convert(TIMEZONE), inplace=True)
    df_sun_rise['bin_date'] = df_sun_rise.index.date
    df_sun_rise.set_index('bin_date', inplace=True)
    df = df.merge(df_sun_rise.groupby(
        df_sun_rise.index)['bin_sunrise'].max().to_frame(),
        how='inner', left_index=True, right_index=True)
    df_sun_set.set_index(
        pd.to_datetime(df_sun_set['bin_timestamp'], unit='s')
        .dt.tz_localize('UTC').dt.tz_convert(TIMEZONE), inplace=True)
    df_sun_set['bin_date'] = df_sun_set.index.date
    df_sun_set.set_index('bin_date', inplace=True)
    df = df.merge(df_sun_set.groupby(
        df_sun_set.index)['bin_sunset'].max().to_frame(),
        how='inner', left_index=True, right_index=True)
    df.set_index(
        pd.to_datetime(df['bin_timestamp'], unit='s')
        .dt.tz_localize('UTC').dt.tz_convert(TIMEZONE), inplace=True)
    df.sort_index(inplace=True)
    print("Output compiled ...")
    print("\nTraining data:\n{}\n\n".format(df.describe()))
    dfvs = {'VETTED': {}, 'PURGED': {}, 'TOVETT': {}}
    for dfs in df.groupby(df.index.date):
        day = dfs[0].strftime('%Y/%m/%d')
        dfvs[('PURGED' if day in DAYS_BLACK_LIST else
              ('TOVETT' if day >= datetime.datetime.now().strftime("%Y/%m/%d")
               else 'VETTED'))][day] = dfs[1]
    for vetting in dfvs:
        for day, dfv in sorted(dfvs[vetting].iteritems()):
            dfv.set_index(
                pd.to_datetime(dfv['bin_timestamp'], unit='s')
                .dt.tz_localize('UTC').dt.tz_convert(TIMEZONE), inplace=True)
            if DAYS_PLOT and DAYS_PLOT_DEBUG:
                dfv.plot(title="Energy ({}) - {}".format(day, vetting),
                         y=['bin_energy', 'bin_energy_day'])
    for vetting in dfvs:
        print("Processed {} {} days ...".format(len(dfvs[vetting]),
                                                vetting.lower()))
    dfnss = []
    bins = 1000
    for day, dfv in sorted(dfvs['VETTED'].iteritems()):
        dfv['normalised'] = dfv['bin_energy'] / dfv['bin_energy_day']
        dfv['standardised'] = bins * \
            (dfv['bin_timestamp'] - dfv['bin_sunrise']) / \
            (dfv['bin_sunset'] - dfv['bin_sunrise'])
        dfv['standardised'] = dfv['standardised'].clip(0, bins).astype(int)
        dfns = dfv.drop([
            'bin_timestamp', 'bin_energy', 'bin_energy_day', 'bin_sunrise',
            'bin_sunset'
        ], axis=1).drop_duplicates()
        dfns.set_index('standardised', inplace=True)
        dfns.sort_index(inplace=True)
        dfns = dfns[~dfns.index.duplicated(keep='first')]
        dfns = dfns.reindex(np.arange(0, bins + 1)).ffill()
        dfns.loc[0:10] = 0
        dfns.loc[990:1000] = 1
        dfnss.append(dfns)
        if DAYS_PLOT and DAYS_PLOT_DEBUG:
            dfns.plot(title="Energy ({}) - VETTED".format(day))
    dfnsa = pd.concat(dfnss, axis=1, ignore_index=True)
    if DAYS_PLOT:
        dfnsa.plot(title="Energy Normalised/Standardised (All) - VETTED",
                   legend=False)
    dfnsa = pd.concat(dfnss)
    dfnsa = dfnsa.groupby(dfnsa.index).mean()
    if DAYS_PLOT:
        dfnsa.plot(title="Energy Normalised/Standardised (Mean) - VETTED",
                   legend=False)
    print("Model built ...")
    model_file = '/model/pickle/joblib/none/' \
                 'amodel_version=${project.version}/' \
                 'amodel_model=${asystem-model-energyforecast-intraday.build.version}/model.pkl'
    local_model_file = local_model_path + model_file
    remote_model_file = remote_model_path + model_file
    if os.path.exists(os.path.dirname(local_model_file)):
        shutil.rmtree(os.path.dirname(local_model_file))
    os.makedirs(os.path.dirname(local_model_file))
    pickled_execute = StringIO()
    dill.dump(execute, pickled_execute)
    pickled_execute.flush()
    joblib.dump({
        'pipeline': dfnsa,
        'execute': pickled_execute
    }, local_model_file, compress=True)
    print("Model serialised ...")
    model = joblib.load(local_model_file)
    dfi = pd.DataFrame([{
        "energy__production_Dforecast_Ddaylight__inverter": 0
    }, {
        "energy__production_Dforecast_Ddaylight__inverter": 250
    }, {
        "energy__production_Dforecast_Ddaylight__inverter": 500
    }, {
        "energy__production_Dforecast_Ddaylight__inverter": 750
    }, {
        "energy__production_Dforecast_Ddaylight__inverter": 1000
    }]).apply(pd.to_numeric, errors='ignore')
    dfo = dill.load(StringIO(model['execute'].getvalue())) \
        (model=model, features=dfi, prediction=True)
    print("Model de-serialised ...")
    print("\nEnergy Mean Input:\n{}\n\nEnergy Mean Output:\n{}\n".format(
        dfi, dfo))
    publish(local_model_file, remote_model_file)
    shutil.rmtree(local_model_path)
    print("Model published ...")
    print("\nPipeline finished in [{}] s".format(
        int(round(time.time())) - time_start))
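
# Worked example of the normalise/standardise step above, using illustrative
# numbers rather than values from the astore: for a day whose sunset is 12
# hours (43200 s) after sunrise, a sample taken 6 hours (21600 s) after
# sunrise maps to bin
#   1000 * 21600 / 43200 = 500
# i.e. the midpoint of the 0..1000 daylight axis, while its energy reading is
# divided by that day's maximum (bin_energy_day) to give a 0..1 normalised
# value; averaging these curves across vetted days yields the intraday model.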