def pipeline():

    remote_data_path = sys.argv[1] if len(sys.argv) > 1 else \
        "s3a://asystem-amodel-staging/asystem/amodel/energyforecastinterday"
    remote_model_path = sys.argv[2] if len(sys.argv) > 2 else \
        "s3a://asystem-amodel-staging/asystem/amodel/energyforecastinterday"
    local_model_path = sys.argv[3] if len(sys.argv) > 3 else \
        tempfile.mkdtemp()

    spark = SparkSession.builder.appName(
        "asystem-amodel-energyforecast").getOrCreate()
    # noinspection PyStringFormat
    print(
        "Session started:\n  Model version: [${asystem-model-energyforecast-interday.build.version}]\n  "
        "ASystem version: [${project.version}]\n  Local path: [{}]\n  "
        "Data URI: [{}]\n  Model URI: [{}]\n".format(local_model_path,
                                                     remote_data_path,
                                                     remote_model_path))

    training_uri = nearest(
        qualify(
            remote_data_path + "/train/text/csv/none/" +
            "amodel_version=${project.version}/amodel_model=${asystem-model-energyforecast-interday.build.version}"
        ))
    print("Training:\n  URI: [{}]   ".format(training_uri))
    df = spark.read.csv(training_uri, header=True).toPandas() \
        .apply(pd.to_numeric, errors='ignore')
    df2 = execute(features=df, engineering=True)
    print("  Dataframe:\n{}\n\n".format(df2.describe()))

    test_uri = nearest(
        qualify(
            remote_data_path + "/test/text/csv/none/" +
            "amodel_version=${project.version}/amodel_model=${asystem-model-energyforecast-interday.build.version}"
        ))
    print("Testing:\n  URI: [{}]".format(test_uri))
    dfv = spark.read.csv(test_uri, header=True).toPandas() \
        .apply(pd.to_numeric, errors='ignore')
    dfv2 = execute(features=dfv, engineering=True)
    print("  Dataframe:\n{}\n".format(dfv2.describe()))

    features_statistics = execute(features=pd.concat([df2, dfv2]),
                                  statistics=True)

    # Plot a pairplot to explore correlations between power generation and the other variables.
    # Plot${TEMPLATE.PRE-PROCESSOR.OPEN}    sns.set(style="ticks")
    # Plot${TEMPLATE.PRE-PROCESSOR.OPEN}    sns.pairplot(df2, hue="condition")
    # Plot${TEMPLATE.PRE-PROCESSOR.OPEN}    plt.show(block=False)

    def rmse(actual, predicted):
        from sklearn.metrics import mean_squared_error
        from math import sqrt

        return sqrt(mean_squared_error(actual, predicted))
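
    # A quick sanity check on rmse (hypothetical values): a perfect
    # prediction scores 0, a constant miss reduces to the root mean square:
    #   rmse([1.0, 2.0], [1.0, 2.0])  # -> 0.0
    #   rmse([0.0, 0.0], [3.0, 4.0])  # -> sqrt((9 + 16) / 2) ~= 3.54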

    def train_model(_regr, train_dict, target):
        estimators = [('vectorizer', DictVectorizer(sparse=False)),
                      ('regr', _regr)]
        _pl = Pipeline(estimators)
        _pl.fit(train_dict, target)

        return _pl
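
    # train_model is a convenience wrapper; a hypothetical call, assuming df2
    # carries the predictor columns from execute(labels=True)[0] and an
    # 'energy' target, would be:
    #   _pl = train_model(LinearRegression(),
    #                     df2[execute(labels=True)[0]].to_dict(orient='records'),
    #                     df2.energy)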

    def prepare_data(raw_df, predictor_columns=execute(labels=True)[0]):
        predictors = raw_df[predictor_columns]
        target = None
        if 'energy' in raw_df.columns:
            target = raw_df.energy

        return predictors, target

    # # Build predictive models with linear regression
    #
    # 1. Split train and test data set
    # 2. Filter predictor columns
    # 3. Create dummy variables for categorical variables
    # 4. Build model with Linear Regression
    # 5. Evaluate
    #
    # Steps 1 to 4 should be handled by the pipeline model.

    # Split into development and test data;
    # the training data covers the earlier date range

    dev_data = df2
    model_test_data = dfv2

    energies_train, energies_target = prepare_data(dev_data)

    # ## Encode condition from category to dummy variable
    vectorizer = DictVectorizer(sparse=False)
    energies_cat_train = vectorizer.fit_transform(
        energies_train.to_dict(orient='records'))
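
    # Sketch of what DictVectorizer does (field names hypothetical): string
    # values are one-hot encoded, numeric values pass through unchanged.
    #   _dv = DictVectorizer(sparse=False)
    #   _dv.fit_transform([{'condition': 'sunny', 'temp': 21.0},
    #                      {'condition': 'cloudy', 'temp': 18.5}])
    #   # -> [[0., 1., 21.], [1., 0., 18.5]] with feature names
    #   #    ['condition=cloudy', 'condition=sunny', 'temp']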

    # ## Build a model with linear regression

    def evaluate_by_loo(energies_train,
                        energies_target,
                        regr=LinearRegression()):
        loo = LeaveOneOut()
        loo.get_n_splits(energies_train)

        train_r2_scores = np.array([])
        test_r2_scores = np.array([])
        train_rmse_scores = np.array([])
        test_rmse_scores = np.array([])
        predicted_powers = np.array([])
        actual_powers = np.array([])

        # Train the linear regression model with leave-one-out cross
        # validation; the dataset is small, so LOO is affordable
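        # For illustration, LeaveOneOut on three samples yields these
        # train/test index pairs:
        #   for tr, te in LeaveOneOut().split(np.arange(3).reshape(-1, 1)):
        #       print(tr, te)
        #   # [1 2] [0] / [0 2] [1] / [0 1] [2]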
        for train_index, test_index in loo.split(energies_train):
            # print("Test index:{}".format(test_index))
            # print("TRAIN:", train_index, "TEST:", test_index)
            # regr = LinearRegression()

            x_train, x_test = energies_train[train_index], \
                              energies_train[test_index]
            y_train, y_test = energies_target.iloc[train_index], \
                              energies_target.iloc[test_index]
            regr.fit(x_train, y_train)
            # print(X_test, y_test)

            y_train_pred = execute(
                {
                    'pipeline': regr,
                    'statistics': features_statistics
                },
                features=x_train,
                prediction=True)
            y_test_pred = execute(
                {
                    'pipeline': regr,
                    'statistics': features_statistics
                },
                features=x_test,
                prediction=True)

            # print(y_test.values, y_test_pred)

            train_r2_score = regr.score(x_train, y_train)
            train_r2_scores = np.append(train_r2_scores, train_r2_score)
            test_r2_score = r2_score(y_test.values, y_test_pred)
            test_r2_scores = np.append(test_r2_scores, test_r2_score)

            train_rmse_score = rmse(y_train, y_train_pred)
            train_rmse_scores = np.append(train_rmse_scores, train_rmse_score)
            test_rmse_score = rmse(y_test.values, y_test_pred)
            test_rmse_scores = np.append(test_rmse_scores, test_rmse_score)

            actual_powers = np.append(actual_powers, y_test.values[0])
            predicted_powers = np.append(predicted_powers, y_test_pred[0])
            # print("Actual energy generation: {}\tPredicted energy generation: {}"
            #      .format(y_test.values[0], y_test_pred[0]))
            # print("Train R^2 score: {}\tTest R^2 score:{}"
            #      .format(train_r2_score, test_r2_score))
            # print("Train RMSE: {}\tTest RMSE:{}\n"
            #      .format(train_rmse_score, test_rmse_score))

        # The standard deviation of the training target is the baseline for RMSE
        # print("Standard deviation: {}".format(pd.DataFrame.std(energies_target)))

        print("Train average RMSE: {}\tTest average RMSE:{}".format(
            np.average(train_rmse_scores), np.average(test_rmse_scores)))
        print("Train average R^2: {}\tTest average R^2:{}".format(
            np.average(train_r2_scores), np.average(test_r2_scores)))

        return actual_powers, predicted_powers

    # Plotting LOO predictions
    # http://scikit-learn.org/stable/auto_examples/plot_cv_predict.html

    actual_powers, predicted_powers = evaluate_by_loo(energies_cat_train,
                                                      energies_target)

    def plot_predict_actual(actual, predicted):
        import matplotlib.pyplot as plt
        fig, ax = plt.subplots()
        ax.scatter(actual, predicted, edgecolors=(0, 0, 0))
        ax.plot([actual.min(), actual.max()],
                [actual.min(), actual.max()],
                'k--',
                lw=4)
        ax.set_xlabel('Measured')
        ax.set_ylabel('Predicted')
        plt.show(block=False)

    # Plot${TEMPLATE.PRE-PROCESSOR.OPEN}    plot_predict_actual(actual_powers, predicted_powers)

    # Create model with dev data

    def train_and_predict(regr, cat_train, target, cat_test, test_target):
        regr.fit(cat_train, target)

        pred_train = execute(
            {
                'pipeline': regr,
                'statistics': features_statistics
            },
            features=cat_train,
            prediction=True)
        pred = execute(
            {
                'pipeline': regr,
                'statistics': features_statistics
            },
            features=cat_test,
            prediction=True)

        dev_rmse = rmse(target.values, pred_train)
        test_rmse = rmse(test_target.values, pred)
        print("Dev RMSE: {}\tDev R^2 score: {}".format(
            dev_rmse, r2_score(target.values, pred_train)))
        print("Test RMSE: {}\tTest R^2 score: {}".format(
            test_rmse, r2_score(test_target.values, pred)))
        return regr, dev_rmse, test_rmse

    energies_test, energies_test_target = prepare_data(model_test_data)
    energies_cat_test = vectorizer.transform(
        energies_test.to_dict(orient='records'))
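    # Note: transform (not fit_transform) keeps the test encoding aligned
    # with the dummy columns learned from the training set; categories unseen
    # during fit are silently ignored.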

    min_rmse = float('inf')
    best_model_test_rmse = float('inf')
    best_model = None
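
    # Compare an unregularised baseline (LinearRegression) against regressors
    # that tune their regularisation strength via internal 4-fold cross
    # validation, keeping the model with the lowest dev RMSE.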

    for _regr in [
            LinearRegression(),
            ElasticNetCV(cv=4),
            RidgeCV(cv=4),
            LassoCV(cv=4)
    ]:
        print(type(_regr).__name__)
        _model, _rmse, _test_rmse = train_and_predict(_regr,
                                                      energies_cat_train,
                                                      energies_target,
                                                      energies_cat_test,
                                                      energies_test_target)
        if min_rmse > _rmse:
            best_model = _model
            min_rmse = _rmse
            best_model_test_rmse = _test_rmse

    print("Best model: {}\tMin Dev RMSE: {}\tTest RMSE: {}".format(
        type(best_model).__name__, min_rmse, best_model_test_rmse))

    model_file = '/model/pickle/joblib/none/amodel_version=${project.version}' \
                 '/amodel_model=${asystem-model-energyforecast-interday.build.version}/model.pkl'
    local_model_file = local_model_path + model_file
    remote_model_file = remote_model_path + model_file
    if os.path.exists(os.path.dirname(local_model_file)):
        shutil.rmtree(os.path.dirname(local_model_file))
    os.makedirs(os.path.dirname(local_model_file))

    import dill
    from StringIO import StringIO
    from sklearn.externals import joblib

    pickled_execute = StringIO()
    dill.dump(execute, pickled_execute)
    pickled_execute.flush()

    joblib.dump(
        {
            'vectorizer': vectorizer,
            'pipeline': best_model,
            'statistics': features_statistics,
            'execute': pickled_execute
        },
        local_model_file,
        compress=True)

    # Example of serialized model usage
    model = joblib.load(local_model_file)
    model['execute'] = dill.load(StringIO(model['execute'].getvalue()))
    energy_production_actual = dfv['energy__production__inverter'].iloc[0]
    energy_production_prediction = round(
        model['execute'](model=model,
                         features=model['execute'](features=dfv,
                                                   engineering=True),
                         prediction=True)[0], 1)
    energy_production_accuracy = int(
        round(energy_production_prediction / energy_production_actual * 100))
    print("Model prediction [{}] versus actual [{}] at accuracy [{}%]".format(
        energy_production_prediction, energy_production_actual,
        energy_production_accuracy))

    print("Model copy: {} -> {}".format(local_model_file, remote_model_file))
    publish(local_model_file, remote_model_file)
    shutil.rmtree(local_model_path)

# Example 2

def pipeline():

    remote_data_path = sys.argv[1] if len(sys.argv) > 1 else \
        "s3a://asystem-astore-staging"
    remote_model_path = sys.argv[2] if len(sys.argv) > 2 else \
        "s3a://asystem-amodel-staging/asystem/amodel/energyforecastintraday"
    local_model_path = sys.argv[3] if len(sys.argv) > 3 else \
        tempfile.mkdtemp()
    print("Pipeline starting on [{}]\n".format(remote_data_path))

    time_start = int(round(time.time()))
    spark = SparkSession.builder \
        .appName("asystem-amodel-energyforecastintraday").getOrCreate()
    print("Session created ...")

    ds_energy = spark.read.parquet(*paths(
        qualify(remote_data_path +
                "/[0-9]/asystem/astore/processed/canonical/parquet/dict/snappy"
                ), ["/*/*/*/*/astore_metric=energy"], "/*.snappy.parquet"))
    ds_sun = spark.read.parquet(*paths(
        qualify(remote_data_path +
                "/[0-9]/asystem/astore/processed/canonical/parquet/dict/snappy"
                ), ["/*/*/*/*/astore_metric=sun"], "/*.snappy.parquet"))
    print("Listing finished ...")

    ds_energy.createOrReplaceTempView('energy')
    ds_energy.cache()
    df_energy = spark.sql("""
        SELECT
          bin_timestamp,
          data_value / data_scale AS bin_energy
        FROM energy
        WHERE
          data_metric='energy__production__inverter' AND 
          data_type='integral' AND
          bin_width=1 AND
          bin_unit='day'
        ORDER BY bin_timestamp ASC
    """).toPandas()
    ds_sun.createOrReplaceTempView('sun')
    ds_sun.cache()
    df_sun_rise = spark.sql("""
        SELECT
          bin_timestamp,
          data_value / data_scale AS bin_sunrise
        FROM sun
        WHERE          
          data_metric='sun__outdoor__rise' AND
          data_type='epoch' AND
          bin_width=1 AND
          bin_unit='day'
        ORDER BY bin_timestamp ASC
    """).toPandas()
    df_sun_set = spark.sql("""
        SELECT
          bin_timestamp,
          data_value / data_scale AS bin_sunset
        FROM sun
        WHERE          
          data_metric='sun__outdoor__set' AND
          data_type='epoch' AND
          bin_width=1 AND
          bin_unit='day'
        ORDER BY bin_timestamp ASC
    """).toPandas()
    spark.catalog.clearCache()
    print("Dataframes collected ...")

    df = df_energy.set_index(
        pd.to_datetime(df_energy['bin_timestamp'],
                       unit='s').dt.tz_localize('UTC').dt.tz_convert(TIMEZONE))
    df['bin_date'] = df.index.date
    df.set_index('bin_date', inplace=True)
    df_energy_day = df.groupby(df.index)['bin_energy'].max().to_frame() \
        .rename(columns={'bin_energy': 'bin_energy_day'})
    df = df.merge(df_energy_day,
                  how='inner',
                  left_index=True,
                  right_index=True)
    df_sun_rise.set_index(
        pd.to_datetime(df_sun_rise['bin_timestamp'],
                       unit='s').dt.tz_localize('UTC').dt.tz_convert(TIMEZONE),
        inplace=True)
    df_sun_rise['bin_date'] = df_sun_rise.index.date
    df_sun_rise.set_index('bin_date', inplace=True)
    df = df.merge(df_sun_rise.groupby(
        df_sun_rise.index)['bin_sunrise'].max().to_frame(),
                  how='inner',
                  left_index=True,
                  right_index=True)
    df_sun_set.set_index(
        pd.to_datetime(df_sun_set['bin_timestamp'],
                       unit='s').dt.tz_localize('UTC').dt.tz_convert(TIMEZONE),
        inplace=True)
    df_sun_set['bin_date'] = df_sun_set.index.date
    df_sun_set.set_index('bin_date', inplace=True)
    df = df.merge(df_sun_set.groupby(
        df_sun_set.index)['bin_sunset'].max().to_frame(),
                  how='inner',
                  left_index=True,
                  right_index=True)
    df.set_index(
        pd.to_datetime(df['bin_timestamp'],
                       unit='s').dt.tz_localize('UTC').dt.tz_convert(TIMEZONE),
        inplace=True)
    df.sort_index(inplace=True)
    print("Output compiled ...")
    print("\nTraining data:\n{}\n\n".format(df.describe()))

    dfvs = {'VETTED': {}, 'PURGED': {}, 'TOVETT': {}}
    for dfs in df.groupby(df.index.date):
        day = dfs[0].strftime('%Y/%m/%d')
        dfvs[('PURGED' if day in DAYS_BLACK_LIST else
              ('TOVETT' if day >= datetime.datetime.now().strftime("%Y/%m/%d")
               else 'VETTED'))][day] = dfs[1]

    for vetting in dfvs:
        for day, dfv in sorted(dfvs[vetting].iteritems()):
            dfv.set_index(pd.to_datetime(
                dfv['bin_timestamp'],
                unit='s').dt.tz_localize('UTC').dt.tz_convert(TIMEZONE),
                          inplace=True)
            if DAYS_PLOT and DAYS_PLOT_DEBUG:
                dfv.plot(title="Energy ({}) - {}".format(day, vetting),
                         y=['bin_energy', 'bin_energy_day'])

    for vetting in dfvs:
        print("Processed {} {} days ...".format(len(dfvs[vetting]),
                                                vetting.lower()))

    dfnss = []
    bins = 1000
    for day, dfv in sorted(dfvs['VETTED'].iteritems()):
        dfv['normalised'] = dfv['bin_energy'] / dfv['bin_energy_day']
        dfv['standardised'] = bins * (
                dfv['bin_timestamp'] - dfv['bin_sunrise']) / \
                              (dfv['bin_sunset'] - dfv['bin_sunrise'])
        dfv['standardised'] = dfv['standardised'].clip(0, bins).astype(int)
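        # Worked example (hypothetical times): with sunrise at 21600 s
        # (06:00) and sunset at 64800 s (18:00), a reading at 43200 s (noon)
        # maps to bin 1000 * (43200 - 21600) / (64800 - 21600) = 500, the
        # midpoint of the daylight axis.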
        dfns = dfv.drop([
            'bin_timestamp', 'bin_energy', 'bin_energy_day', 'bin_sunrise',
            'bin_sunset'
        ],
                        axis=1).drop_duplicates()
        dfns.set_index('standardised', inplace=True)
        dfns.sort_index(inplace=True)
        dfns = dfns[~dfns.index.duplicated(keep='first')]
        dfns = dfns.reindex(np.arange(0, bins + 1)).ffill()
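        # Pin the head and tail of the curve (bins 0-10 and 990-1000) to 0
        # and 1, anchoring the normalised profile at dawn and dusk.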
        dfns.loc[0:10] = 0
        dfns.loc[990:1000] = 1
        dfnss.append(dfns)
        if DAYS_PLOT and DAYS_PLOT_DEBUG:
            dfns.plot(title="Energy ({}) - VETTED".format(day))
    dfnsa = pd.concat(dfnss, axis=1, ignore_index=True)
    if DAYS_PLOT:
        dfnsa.plot(title="Energy Normalised/Standardised (All) - VETTED",
                   legend=False)
    dfnsa = pd.concat(dfnss)
    dfnsa = dfnsa.groupby(dfnsa.index).mean()
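    # The intraday "model" is this mean normalised production curve indexed
    # by daylight bin; the serialised execute() closure evaluates features
    # against it (see the de-serialisation example below).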
    if DAYS_PLOT:
        dfnsa.plot(title="Energy Normalised/Standardised (Mean) - VETTED",
                   legend=False)
    print("Model built ...")

    model_file = '/model/pickle/joblib/none/' \
                 'amodel_version=${project.version}/amodel_model=${asystem-model-energyforecast-intraday.build.version}/model.pkl'
    local_model_file = local_model_path + model_file
    remote_model_file = remote_model_path + model_file
    if os.path.exists(os.path.dirname(local_model_file)):
        shutil.rmtree(os.path.dirname(local_model_file))
    os.makedirs(os.path.dirname(local_model_file))
    import dill
    from StringIO import StringIO
    from sklearn.externals import joblib

    pickled_execute = StringIO()
    dill.dump(execute, pickled_execute)
    pickled_execute.flush()
    joblib.dump(
        {
            'pipeline': dfnsa,
            'execute': pickled_execute
        },
        local_model_file,
        compress=True)
    print("Model serialised ...")

    model = joblib.load(local_model_file)
    dfi = pd.DataFrame(
        [{"energy__production_Dforecast_Ddaylight__inverter": value}
         for value in [0, 250, 500, 750, 1000]]).apply(pd.to_numeric,
                                                       errors='ignore')
    dfo = dill.load(StringIO(model['execute'].getvalue())) \
        (model=model, features=dfi, prediction=True)
    print("Model de-serialised ...")
    print("\nEnergy Mean Input:\n{}\n\nEnergy Mean Output:\n{}\n".format(
        dfi, dfo))

    publish(local_model_file, remote_model_file)
    shutil.rmtree(local_model_path)
    print("Model published ...")

    print("\nPipeline finished in [{}] s".format(
        int(round(time.time())) - time_start))