Exemplo n.º 1
0
def forecastSingleFrame(frame: pd.DataFrame, options: dict) -> pd.DataFrame:
    """
    Basically the equivalent of splitFramesAndForecast() but for a single frame

    Parameters
    ----------
    frame : pd.DataFrame
        Data needed to forecast
    options : dict
        Instructions as to how to handle 'frame'

    Returns
    -------
    frame : pd.DataFrame
        The original 'frame' parameter but with additional columns for the
        forecast.

    """
    try:
        method = params.getParam('method', options)
        currentOptions = options.copy()

        model = Forecast()

        if (method == 'MLR'):
            if params.getParam('seasonality', options) == 'Auto':
                currentOptions['seasonality'] = findOptimalSeasonality(
                    frame.copy(), options.copy())

            fdict = model.forecastMLR(frame, currentOptions.copy())
        elif method.lower() == 'Prophet'.lower():
            fdict = model.forecastProphet(frame, options.copy())
        else:
            fdict = model.forecastARIMA(frame, currentOptions.copy())

        frame = fdict['frame']
        frame['X_ERROR'] = None
        frame['X_METHOD'] = method

    except Exception as e:
        ed = str(traceback.format_exc()).replace('\n', ' ')
        frame['X_ERROR'] = e
        frame['X_METHOD'] = method

    return frame
Exemplo n.º 2
0
    def predictTrend(self, fdict: dict) -> dict:
        """
        Makes a prediction on the X_TREND column using the predictorColumns
        as found in the options dictionary stored under the 'options' key in
        fdict. It is trained on historical data but is then predicted using
        all available data.

        Parameters
        ----------
        fdict : dict
            Dictionary object that must include at least a 'frame' key holding
            the data columns specified in the options index, an 'options' key
            holding the options index, and a 'hisotircalIdx' key specifying
            the index of the historical data v the future data

        Returns
        -------
        dict
            Same as the fdict passed into the model, except that we now have
            X_PREDICTORS, X_COEFFICIENTS, X_INTERCEPT, and X_TREND_PREDICTED
            as columns in the dataframe stored in 'frame'

        """
        frame = fdict['frame']
        historicalData = frame[fdict['historicalIdx']]
        options = fdict['options']

        x = historicalData[params.getParam('predictorColumns', options)]
        y = historicalData['X_TREND']

        model = Ridge(alpha=params.getParam('ridgeAlpha', options))
        model.fit(x, y)

        xscore = frame[params.getParam('predictorColumns', options)]
        yhat = model.predict(xscore)

        frame['X_PREDICTORS'] = ','.join(
            params.getParam('predictorColumns', options))
        frame['X_COEFFICIENTS'] = ','.join(map(str, model.coef_))
        frame['X_INTERCEPT'] = str(model.intercept_)

        frame['X_TREND_PREDICTED'] = yhat

        return fdict
Exemplo n.º 3
0
def findOptimalSeasonality(frame: pd.DataFrame, options: dict) -> str:
    """
    Compares the MAPE as calculated in findMAPE() to determine whether
    'additive', 'multiplicative', or 'none' is the best seasonality
    qualifier to apply

    Parameters
    ----------
    frame : pd.DataFrame
        Data to use in forecast
    options : dict
        Instructions as to how to read the data

    Returns
    -------
    str
        Will be "None", "Additive", or "Multiplicative"

    """
    nonNullRowCount = frame[params.getParam('targetColumn', options)].count()
    periodicity = params.getParam('periodicity', options)

    if nonNullRowCount < periodicity:
        return "None"

    noSeasonalityMAPE = findMAPE(frame, options.copy(), 'None')
    additiveMAPE = findMAPE(frame, options.copy(), 'Additive')
    multiplicativeMAPE = findMAPE(frame, options.copy(), 'Multiplicative')

    minMAPE = min(noSeasonalityMAPE, additiveMAPE, multiplicativeMAPE)

    # in case of ties this returns a hierarchy of simple to complex
    if (noSeasonalityMAPE == minMAPE):
        return "None"
    elif (additiveMAPE == minMAPE):
        return "Additive"
    else:
        return "Multiplicative"
Exemplo n.º 4
0
    def forecastARIMA(self, frame: pd.DataFrame, options: dict) -> dict:
        """
        This function makes a forecast based on the ARIMA model. There is some
        built in autotuning of the parameters for that model, but more info
        can be gleaned from the pmdarimo package documentation.

        Parameters
        ----------
        frame : pd.DataFrame
            pandas dataframe with the requisite data
        options : dict
            dictionary with the necessary keys and values

        Returns
        -------
        dict
            dictionary that includes new data forecast and original data

        """
        targetColumn = params.getParam('targetColumn', options)
        newTargetColumn = 'X_' + targetColumn

        if params.getParam('autoDetectOutliers', options):
            fdict = self.preOutlierDetection(frame, options)
            frame = fdict['frame']

        # if we have done outlier detection there will be an interpolated column that has the interpolated actuals
        if 'X_INTERPOLATED' in frame:
            frame[newTargetColumn] = list(
                map(lambda x: (x if x != 0.0 else random.random() / 1E5),
                    frame['X_INTERPOLATED']))
        else:
            frame[newTargetColumn] = list(
                map(lambda x: (x if x != 0.0 else random.random() / 1E5),
                    frame[params.getParam('targetColumn', options)]))

        options['targetColumn'] = newTargetColumn

        frame['X_INDEX'] = frame.index.values

        # split the data into past/future based on null in target column
        lastNonNullIdx = self.lastNonNullIndex(frame[targetColumn])
        numHoldoutRows = params.getParam('numHoldoutRows', options)
        lastNonNullIdx = lastNonNullIdx - numHoldoutRows
        historicalIdx = frame['X_INDEX'] <= lastNonNullIdx
        historicalData = frame[historicalIdx]
        futureIdx = frame['X_INDEX'] > lastNonNullIdx
        futureData = frame[futureIdx]
        if (numHoldoutRows > 0):
            evalIdx = list(
                map(
                    lambda x: x > lastNonNullIdx and x <=
                    (lastNonNullIdx + numHoldoutRows), frame['X_INDEX']))
        else:
            evalIdx = historicalIdx

        y = np.asarray(historicalData[newTargetColumn].tolist())

        if len(params.getParam('predictorColumns', options)) > 0:
            x = historicalData[params.getParam('predictorColumns', options)]
            model = pm.auto_arima(
                y,
                exogenous=x,
                seasonal=True,
                stepwise=not (params.getParam('hypertune', options)),
                suppress_warnings=True,
                error_action='ignore')

            histPreds, histConf_int = model.predict_in_sample(
                exogenous=x, return_conf_int=True)

            x = futureData[params.getParam('predictorColumns', options)]
            preds, conf_int = model.predict(exogenous=x,
                                            n_periods=len(futureData),
                                            return_conf_int=True)
        else:
            model = pm.auto_arima(y,
                                  seasonal=True,
                                  stepwise=False,
                                  suppress_warnings=True,
                                  error_action='ignore')

            histPreds, histConf_int = model.predict_in_sample(
                return_conf_int=True)

            preds, conf_int = model.predict(n_periods=len(futureData),
                                            return_conf_int=True)

        forecast = np.concatenate((histPreds, preds))
        lpi = np.concatenate(
            (list(map(lambda x: x[0],
                      histConf_int)), list(map(lambda x: x[0], conf_int))))
        upi = np.concatenate(
            (list(map(lambda x: x[1],
                      histConf_int)), list(map(lambda x: x[1], conf_int))))

        frame['X_LPI'] = lpi
        frame['X_FORECAST'] = forecast
        frame['X_UPI'] = upi

        evalFrame = frame[evalIdx]
        mape = calcMAPE(evalFrame['X_FORECAST'], evalFrame[targetColumn])
        frame['X_MAPE'] = mape

        frame['X_RESIDUAL'] = frame['X_FORECAST'] - frame[targetColumn]

        frame['X_APE'] = np.nan
        for index, row in frame.iterrows():
            frame['X_APE'][index] = (
                abs(row['X_FORECAST'] - row[targetColumn]) /
                row[targetColumn] * 100.0) if row[targetColumn] != 0 else None

        if params.getParam('forceNonNegative', options):
            frame.loc[frame['X_FORECAST'] < 0, 'X_FORECAST'] = 0
            frame.loc[frame['X_UPI'] < 0, 'X_UPI'] = 0
            frame.loc[frame['X_LPI'] < 0, 'X_LPI'] = 0

        # add columns for consistency with other methods
        frame['X_SEASONALITY'] = np.nan
        frame['X_SEASONALITY_TYPE'] = None
        frame['X_TREND_PREDICTED'] = np.nan
        frame['X_TREND_RATIO'] = np.nan
        frame['X_HYPERTUNE'] = None
        frame['X_PREDICTORS'] = None
        frame['X_COEFFICIENTS'] = None
        frame['X_INTERCEPT'] = np.nan

        fdict = dict()
        fdict['frame'] = frame
        return fdict
Exemplo n.º 5
0
    def preOutlierDetection(self, frame: pd.DataFrame, options: dict) -> dict:
        """
        This function utilizes the loess method to strip the seasonality from
        the target column and determine a trend. Based on the difference
        between the trend and the actual target column, outliers are 
        identified as a function of the options['outlierStdevMultiplier']
        value, which should be an int or a float.

        Parameters
        ----------
        frame : pd.DataFrame
            pandas dataframe that includes the data to be forecast
        options : dict
            dictionary that includes at least 'seasonalityBandwidth', 
            'targetColumn', 'outlierStdevMultiplier'

        Returns
        -------
        dict
            Will return with two keys, 'frame' which will be the original
            pandas dataframe but now with the X_INTERPOLATED and X_OUTLIER
            columns, and 'options', the value of which is whatever dictionary
            was originally passed through the options parameter.

        """
        targetColumn = options['targetColumn']

        frame['X_INDEX'] = frame.index.values
        frame['X_INTERPOLATED'] = frame[targetColumn]

        # split the data into past/future based on null in target column
        nullIdx = frame[targetColumn].isnull()
        futureData = frame[nullIdx]
        historicalIdx = list(map(operator.not_, nullIdx))
        historicalData = frame[historicalIdx]

        x = np.asarray(historicalData['X_INDEX'].tolist())
        y = np.asarray(historicalData[params.getParam('targetColumn',
                                                      options)].tolist())
        bandwidth = params.getParam('seasonalityBandwidth', options)
        xout, yout, weights = lo.loess_1d(x, y, frac=bandwidth, degree=2)

        frame['X_TREND'] = np.append(
            yout, np.asarray(futureData[targetColumn].tolist()))
        frame['X_TREND_DIFF'] = frame[targetColumn] - frame['X_TREND']

        stdev = frame['X_TREND_DIFF'].std()
        avg = frame['X_TREND_DIFF'].mean()

        mult = params.getParam('outlierStdevMultiplier', options)

        #identifies outliers based on the number of standard deviations
        #from the mean as specified in line 100. It is thus not the strict
        #mean of the target column, but the mean of the difference
        #between the target column and the loess trend calculated in line 94.
        frame['X_OUTLIER'] = 0
        for index, row in frame.iterrows():
            diff = abs(frame['X_TREND_DIFF'][index])
            if diff > avg + mult * stdev:
                if index > 0 and index <= frame.shape[0] - 1:
                    frame['X_INTERPOLATED'][index] = mean([
                        frame['X_INTERPOLATED'][index - 1],
                        frame['X_INTERPOLATED'][index + 1]
                    ])
                    frame['X_OUTLIER'][index] = 1
                else:
                    frame['X_INTERPOLATED'][index] = frame['X_TREND'][index]
                    frame['X_OUTLIER'][index] = 1

        frame.drop(columns=['X_TREND', 'X_TREND_DIFF', 'X_INDEX'])

        fdict = dict()
        fdict['frame'] = frame
        fdict['options'] = options

        return fdict
Exemplo n.º 6
0
    def forecastProphet(self, frame: pd.DataFrame, options: dict) -> dict:
        """
        This is the function where the forecast produced by 
        forecastProphetInternal() gets integrated into the data we work
        with. It has two functionalities based on the options['hyptertune']
        option. When that value is False, it runs forecastProphetInternal()
        with "vanilla" settings for Prophet(), if True it works through a 
        grid to get an optimal answer.

        Parameters
        ----------
        frame : pd.DataFrame
            pandas dataframe with all the data we want to use to forecast
        options : dict
            dictionary with the necessary specifications

        Returns
        -------
        dict
            dictionary including the MAPE, the data, and the options used
            to get there

        """
        targetColumn = params.getParam('targetColumn', options)
        newTargetColumn = 'X_' + targetColumn

        if params.getParam('autoDetectOutliers', options):
            fdict = self.preOutlierDetection(frame, options)
            frame = fdict['frame']

        # if we have done outlier detection there will be an interpolated column that has the interpolated actuals
        if 'X_INTERPOLATED' in frame:
            frame[newTargetColumn] = list(
                map(lambda x: (x if x != 0.0 else random.random() / 1E5),
                    frame['X_INTERPOLATED']))
        else:
            frame[newTargetColumn] = list(
                map(lambda x: (x if x != 0.0 else random.random() / 1E5),
                    frame[params.getParam('targetColumn', options)]))

        options['targetColumn'] = newTargetColumn

        frame['X_INDEX'] = frame.index.values

        # split the data into past/future based on null in target column
        lastNonNullIdx = self.lastNonNullIndex(frame[targetColumn])
        numHoldoutRows = params.getParam('numHoldoutRows', options)
        lastNonNullIdx = lastNonNullIdx - numHoldoutRows
        historicalIdx = frame['X_INDEX'] <= lastNonNullIdx
        historicalData = frame[historicalIdx]
        futureIdx = frame['X_INDEX'] > lastNonNullIdx
        futureData = frame[futureIdx]
        if (numHoldoutRows > 0):
            evalIdx = list(
                map(
                    lambda x: x > lastNonNullIdx and x <=
                    (lastNonNullIdx + numHoldoutRows), frame['X_INDEX']))
        else:
            evalIdx = historicalIdx

        pframe = pd.DataFrame()
        pframe['ds'] = historicalData[params.getParam('timestampColumn',
                                                      options)]
        pframe['y'] = historicalData[params.getParam('targetColumn', options)]

        championPJSON = None
        if not (params.getParam('hypertune', options)):
            interval_width = params.getParam('interval_width', options)
            changepoint_prior_scale = params.getParam(
                'changepoint_prior_scale', options)
            holidays_prior_scale = params.getParam('holidays_prior_scale',
                                                   options)
            changepoints_fraction = params.getParam('changepoints_fraction',
                                                    options)

            forecast = self.forecastProphetInternal(
                frame, options, pframe, historicalData, futureData,
                'multiplicative', interval_width, changepoint_prior_scale,
                holidays_prior_scale, changepoints_fraction)
        else:
            forecast = None
            championP = None
            championMAPE = 1E20

            # find best model through hyper-tuning
            pvals = {
                'seasonality_mode': ('multiplicative', 'additive'),
                'changepoint_prior_scale': [0.05, 0.15, 0.25],
                'holidays_prior_scale': [0.1, 1.0, 10.0],
                'changepoint_fraction': [0.05, 0.1, 0.2]
            }

            pgrid = ParameterGrid(pvals)

            interval_width = params.getParam('interval_width', options)

            for p in pgrid:
                challengerForecast = self.forecastProphetInternal(
                    frame, options, pframe, historicalData, futureData,
                    p['seasonality_mode'], interval_width,
                    p['changepoint_prior_scale'], p['holidays_prior_scale'],
                    p['changepoint_fraction'])
                evalFrame = challengerForecast[evalIdx]
                actualsFrame = frame[evalIdx]
                mape = calcMAPE(
                    evalFrame['yhat'],
                    actualsFrame[params.getParam('targetColumn', options)])

                if mape < championMAPE:
                    championP = p
                    championMAPE = mape
                    forecast = challengerForecast

            championPJSON = json.dumps(championP)

            print('Champion parameters: ', championPJSON, 'Champion MAPE: ',
                  championMAPE)

        frame['X_LPI'] = forecast['yhat_lower']
        frame['X_FORECAST'] = forecast['yhat']
        frame['X_UPI'] = forecast['yhat_upper']

        evalFrame = frame[evalIdx]
        mape = calcMAPE(evalFrame['X_FORECAST'],
                        evalFrame[params.getParam('targetColumn', options)])
        frame['X_MAPE'] = mape

        frame['X_RESIDUAL'] = frame['X_FORECAST'] - frame[params.getParam(
            'targetColumn', options)]

        targetColumn = params.getParam('targetColumn', options)
        frame['X_APE'] = np.nan
        for index, row in frame.iterrows():
            frame['X_APE'][index] = (
                abs(row['X_FORECAST'] - row[targetColumn]) /
                row[targetColumn] * 100.0) if row[targetColumn] != 0 else None

        if 'forceNonNegative' in options and params.getParam(
                'forceNonNegative', options):
            frame.loc[frame['X_FORECAST'] < 0, 'X_FORECAST'] = 0
            frame.loc[frame['X_UPI'] < 0, 'X_UPI'] = 0
            frame.loc[frame['X_LPI'] < 0, 'X_LPI'] = 0

        # add columns for consistency with other methods
        frame['X_SEASONALITY'] = np.nan
        frame['X_SEASONALITY_TYPE'] = None
        frame['X_TREND_PREDICTED'] = np.nan
        frame['X_TREND_RATIO'] = np.nan
        frame['X_PREDICTORS'] = None
        frame['X_COEFFICIENTS'] = None
        frame['X_INTERCEPT'] = np.nan
        frame['X_HYPERTUNE'] = championPJSON

        fdict = dict()
        fdict['MAPE'] = mape
        fdict['frame'] = frame

        return fdict
Exemplo n.º 7
0
    def forecastProphetInternal(self, frame: pd.DataFrame, options: dict,
                                pframe: pd.DataFrame,
                                historicalData: pd.DataFrame,
                                futureData: pd.DataFrame, seasonalityMode: str,
                                intervalWidth: float,
                                changePointPriorScale: float,
                                holidayPriorScale: float,
                                changePointFraction: float) -> pd.DataFrame:
        """
        This function handles the actual running of the Prophet() function and
        its interaction with the option dictionary and the frame dataframe. It
        is called in the forecastProphet() method.
        
        ****Add explanation of the "changepoints" concept

        Parameters
        ----------
        frame : pd.DataFrame
            Has all the info necessary for the forecast
        options : dict
            Has parameters determining how forecast is to be run
        pframe : pd.DataFrame
            Dataframe slice to be the one that the model is trained on
        historicalData : pd.DataFrame
            Should be a subset of the frame parameter
        futureData : pd.DataFrame
            Should be a subset of the frame parameter, should be the 
            difference between frame and historicalData
        seasonalityMode : str
            In this package this defaults to "multiplicative"
        intervalWidth : float
            Width of the uncertainty intervals provided in the forecast
        changePointPriorScale : float
            Determines the limit on changepoints. Higher values mean more
            changepoints will be allowed.
        holidayPriorScale : float
            See Prophet() documentation
        changePointFraction : float
            Used to determine the number of changepoints

        Returns
        -------
        forecast : TYPE
            Forecast returned by the model

        """
        nChangePoints = math.ceil(len(historicalData) * changePointFraction)

        periodicity = params.getParam('periodicity', options)

        weeklySeasonality = (periodicity == 52 or periodicity == 53)
        dailySeasonality = (periodicity == 356)

        model = Prophet(yearly_seasonality=True,
                        daily_seasonality=dailySeasonality,
                        weekly_seasonality=weeklySeasonality,
                        interval_width=intervalWidth,
                        seasonality_mode=seasonalityMode,
                        changepoint_prior_scale=changePointPriorScale,
                        holidays_prior_scale=holidayPriorScale,
                        n_changepoints=nChangePoints)

        for pred in params.getParam('predictorColumns', options):
            model.add_regressor(pred)
            pframe[pred] = historicalData[pred]

        model.fit(pframe)

        #
        # Frequencies are defined here:
        # https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-offset-aliases
        #
        freq = None
        if (periodicity == 12):
            freq = 'MS'  # monthly start
        elif (periodicity == 365):
            freq = 'D'  # daily
        elif (periodicity == 4):
            freq = 'QS'  # quarterly start
        elif (periodicity == 52 or periodicity == 53):
            freq = 'W'  # weekly

        future = model.make_future_dataframe(periods=len(futureData),
                                             freq=freq)

        for pred in params.getParam('predictorColumns', options):
            future[pred] = frame[pred]

        forecast = model.predict(future)

        return forecast
Exemplo n.º 8
0
    def forecastMLR(self, frame: pd.DataFrame, options: dict) -> dict:
        """
        Calculates the multiple linear regression (MLR) forecast for the
        data provided in frame and the parameters specified in options.

        Parameters
        ----------
        frame : pd.DataFrame
            Data to be used to forecast
        options : dict
            dictionary that must include at least the predictors and the
            target column specifications

        Returns
        -------
        dict
            Returns a dictionary that has, as its main components, the 
            dataframe passed to frame but now with the forecast from MLR, 
            as well as other MLR-specific columns like lower and upper bounds
            on the forecast, etc etc

        """
        predCols = params.getParam('predictorColumns', options)
        if not ('X_INDEX' in predCols):
            predCols.append('X_INDEX')
            options['predictorColumns'] = predCols

        if params.getParam('autoDetectOutliers', options):
            fdict = self.preOutlierDetection(frame, options)
            frame = fdict['frame']

        fdict = self.prepare(frame, options)

        fdict = self.predictTrend(fdict)

        fdict = self.calcSeasonality(fdict)

        options = fdict['options']
        frame['X_SEASONALITY_TYPE'] = params.getParam('seasonality', options)

        #construct forecast from columns calculated by prepare(),
        #predictTrend() and calcSeasonality() functions
        if (params.getParam('seasonality', options) == 'Additive'):
            frame['X_FORECAST'] = frame['X_SEASONALITY'] + frame[
                'X_TREND_PREDICTED']
        elif (params.getParam('seasonality', options) == 'Multiplicative'):
            frame['X_FORECAST'] = frame['X_SEASONALITY'] * frame[
                'X_TREND_PREDICTED']
        else:
            frame['X_FORECAST'] = frame['X_TREND_PREDICTED']

        evalIdx = fdict['evalIdx']
        targetColumn = params.getParam('targetColumn', options)
        evalFrame = frame[evalIdx]
        mape = calcMAPE(evalFrame['X_FORECAST'], evalFrame[targetColumn])
        frame['X_MAPE'] = mape
        fdict['MAPE'] = mape

        frame['X_RESIDUAL'] = frame['X_FORECAST'] - frame[params.getParam(
            'targetColumn', options)]

        predictionInterval = calcPredictionInterval(frame['X_RESIDUAL'])

        frame['X_LPI'] = frame['X_FORECAST'] - predictionInterval
        frame['X_UPI'] = frame['X_FORECAST'] + predictionInterval

        targetColumn = params.getParam('targetColumn', options)
        frame['X_APE'] = np.nan
        for index, row in frame.iterrows():
            frame['X_APE'][index] = (
                abs(row['X_FORECAST'] - row[targetColumn]) /
                row[targetColumn] * 100.0) if row[targetColumn] != 0 else None

        if params.getParam('forceNonNegative', options):
            frame.loc[frame['X_FORECAST'] < 0, 'X_FORECAST'] = 0
            frame.loc[frame['X_UPI'] < 0, 'X_UPI'] = 0
            frame.loc[frame['X_LPI'] < 0, 'X_LPI'] = 0

        frame['X_HYPERTUNE'] = None

        return fdict
Exemplo n.º 9
0
    def calcSeasonality(self, fdict: dict) -> dict:
        """
        Calculates the additive or multiplicative seasonality of the data 
        contained in fdict['frame'] unless otherwise specified by 
        fdict['periodicity'].

        Parameters
        ----------
        fdict : dict
            A dictionary that has gone through the prepare() function

        Returns
        -------
        dict
            equal to fdict parameter except for the addition of the 
            X_SEASONALITY column in the dataframe stored in fdict['frame']

        """
        frame = fdict['frame']
        historicalData = frame[fdict['historicalIdx']]
        options = fdict['options']
        periodicity = params.getParam('periodicity', options)

        # short-circuit if none is chosen
        if params.getParam('seasonality', options) == 'None':
            frame['X_SEASONALITY'] = np.nan
            fdict['frame'] = frame
            return fdict

        diffData = historicalData['X_TREND_DIFF' if params.
                                  getParam('seasonality', options) ==
                                  'Additive' else 'X_TREND_RATIO']

        trendData = historicalData['X_TREND']

        buckets = []  # holds trend (diff/ratio) data indexed by periodicity
        tbuckets = []  # holds corresponding trend data
        for idx in range(len(diffData)):
            diffVal = diffData.iloc[idx]
            trendVal = trendData.iloc[idx]

            bucketIdx = math.floor(idx % periodicity)
            if (len(buckets) < (bucketIdx + 1)):
                tbuckets.append([trendVal])
                buckets.append([diffVal])
            else:
                tbuckets[bucketIdx].append(trendVal)
                buckets[bucketIdx].append(diffVal)

        seasonality = []
        rowCount = frame.shape[0]

        medianVals = []
        for diffs in buckets:
            medianVals.append(median(diffs))

        for idx in range(rowCount):
            diffIdx = math.floor(idx % periodicity)
            seasonality.append(medianVals[diffIdx])

        frame['X_SEASONALITY'] = seasonality
        fdict['frame'] = frame

        return fdict
Exemplo n.º 10
0
    def prepare(self, frame: pd.DataFrame, options: dict) -> dict:
        """
        This function does a few things in an attempt to prepare the data for
        forecasting. First, if specified in the 'options' dictionary, it will
        scale all of the variables to between 0 and 1. It will then add to the
        dataframe 'frame' an X_TREND, X_TREND_DIFF, and X_TREND_RATIO column
        to be used in later modeling/prediction. It will also create, as 
        specified in the 'options' dictionary, indexes that will be passed
        through in the return to identify which parts of the model should be
        used for training, evaluation, etc.

        Parameters
        ----------
        frame : pd.DataFrame
            pandas dataframe that includes all the information necessary to
            forecast
        options : dict
            dictionary that includes at least the keys 'scalePredictors',
            'predictorColumns', 'targetColumn', 'numHoldoutRows', and 
            'seasonalityBandwidth'

        Returns
        -------
        dict
            dictionary that includes the dataframe passed through the frame
            parameter with additions, the options dictionary, and keys 
            'historicalIdx', 'futureIdx', and 'evalIdx', corresponding to
            the training, forecasting, and holdout periods as measured by
            the index of the dataframe stored under the 'frame' key

        """
        # create copy of target for modification (fill zeros with very small number)
        random.seed(158923)
        targetColumn = params.getParam('targetColumn', options)

        frame['X_INDEX'] = frame.index.values

        # scale predictors between 0 and 1
        try:
            newPredCols = []
            if params.getParam('scalePredictors', options):
                for predCol in params.getParam('predictorColumns', options):
                    newCol = 'X_' + predCol
                    frame[newCol] = (frame[predCol] - frame[predCol].min()) / (
                        frame[predCol].max() - frame[predCol].min())
                    newPredCols.append(newCol)
                options['predictorColumns'] = newPredCols
        except Exception as e:
            print("Unable to scale predictors: ", e)

        # ensure predictors and target are float
        frame[targetColumn] = frame[targetColumn].astype(float)
        frame[params.getParam('predictorColumns',
                              options)] = frame[params.getParam(
                                  'predictorColumns', options)].astype(float)

        newTargetColumn = 'X_' + targetColumn

        # if we have done outlier detection there will be an interpolated column that has the interpolated actuals
        if 'X_INTERPOLATED' in frame:
            frame[newTargetColumn] = list(
                map(lambda x: (x if x != 0.0 else random.random() / 1E5),
                    frame['X_INTERPOLATED']))
        else:
            frame[newTargetColumn] = list(
                map(lambda x: (x if x != 0.0 else random.random() / 1E5),
                    frame[params.getParam('targetColumn', options)]))

        options['targetColumn'] = newTargetColumn

        # split the data into past/future based on null in target column
        lastNonNullIdx = self.lastNonNullIndex(frame[targetColumn])
        fullHistoricalIdx = frame['X_INDEX'] <= lastNonNullIdx

        # we use full history for trending/smoothing (but NOT modeling in the future)
        fullHistoricalData = frame[fullHistoricalIdx]
        numHoldoutRows = params.getParam('numHoldoutRows', options)
        fullFutureIdx = frame['X_INDEX'] > lastNonNullIdx
        fullFutureData = frame[fullFutureIdx]

        # we store history minus hold-out for future modeling

        # could these variables potentially be renamed?
        # the subtraction of numHoldoutRows really changes the "concept"
        # of what's being discussed here
        lastNonNullIdx = lastNonNullIdx - numHoldoutRows
        historicalIdx = frame['X_INDEX'] <= lastNonNullIdx
        #historicalData = frame[historicalIdx]
        futureIdx = frame['X_INDEX'] > lastNonNullIdx
        #futureData = frame[futureIdx]

        if (numHoldoutRows > 0):
            # if variable names are switched as discussed above it would avoid
            # some of the awkward constructions here
            evalIdx = list(
                map(
                    lambda x: x > lastNonNullIdx and x <=
                    (lastNonNullIdx + numHoldoutRows), frame['X_INDEX']))
        else:
            evalIdx = historicalIdx

        x = np.asarray(fullHistoricalData['X_INDEX'].tolist())
        y = np.asarray(fullHistoricalData[newTargetColumn].tolist())
        bandwidth = params.getParam('seasonalityBandwidth', options)
        xout, yout, weights = lo.loess_1d(x, y, frac=bandwidth, degree=2)

        frame['X_TREND'] = np.append(
            yout, np.asarray(fullFutureData[targetColumn].tolist()))
        #for use with additive seasonality?
        frame['X_TREND_DIFF'] = frame[targetColumn] - frame['X_TREND']
        #for use with multiplicative seasonality?
        frame['X_TREND_RATIO'] = frame[targetColumn] / frame['X_TREND']

        fdict = dict()
        fdict['historicalIdx'] = historicalIdx
        fdict['futureIdx'] = futureIdx
        fdict['evalIdx'] = evalIdx
        fdict['frame'] = frame
        fdict['options'] = options

        return fdict
Exemplo n.º 11
0
def splitFramesAndForecast(frame: pd.DataFrame, options: dict) -> pd.DataFrame:
    """
    Automates forecasting for each subframe defined by the sortColumns options
    parameter. Will return the forecast using the method with the best MAPE for
    each subframe. Will also print the MAPEs of each method for each forecast
    made.

    Parameters
    ----------
    frame : pd.DataFrame
        Data needed for forecast
    options : dict
        Instructions on how to read 'frame'

    Returns
    -------
    outputFrame : pd.DataFrame
        Same as original 'frame' but with all the columns associated with a
        forecast added.

    """
    #creates a list of frames, each of which will correspond to a different
    #forecast
    frame.sort_values(by=params.getParam('sortColumns', options),
                      ascending=True,
                      inplace=True)

    frames = list(frame.groupby(by=params.getParam('splitColumns', options)))

    outputFrame = None

    for frame in frames:
        frame = frame[1]
        frame.reset_index(drop=True, inplace=True)

        method = params.getParam('method', options)

        #specifies actions if the forecast method is set to "Auto" in
        #the options dictionary
        if method == 'Auto':
            opts = options.copy()
            opts['method'] = 'ARIMA'
            arimaFrame = forecastSingleFrame(frame.copy(), opts)
            arimaMAPE = 1E6 if 'X_MAPE' not in arimaFrame else arimaFrame[
                'X_MAPE'][0]

            opts = options.copy()
            opts['method'] = 'Prophet'
            prophetFrame = forecastSingleFrame(frame.copy(), opts)
            prophetMAPE = 1E6 if 'X_MAPE' not in prophetFrame else prophetFrame[
                'X_MAPE'][0]

            opts = options.copy()
            opts['method'] = 'MLR'
            mlrFrame = forecastSingleFrame(frame.copy(), opts)
            mlrMAPE = 1E6 if 'X_MAPE' not in mlrFrame else mlrFrame['X_MAPE'][0]

            if 'X_FORECAST' in mlrFrame and 'X_FORECAST' in prophetFrame and 'X_FORECAST' in arimaFrame:
                ensembleFrame = mlrFrame.copy()

                # we calculate MAPE using original data column
                targetColumn = params.getParam('targetColumn', options)
                if (targetColumn.startswith('X_')):
                    targetColumn = targetColumn[2:]

                # split the data into past/future based on null in target column
                numHoldoutRows = params.getParam('numHoldoutRows', options)
                lastNonNullIdx = Forecast().lastNonNullIndex(
                    ensembleFrame[targetColumn])
                lastNonNullIdx = lastNonNullIdx - numHoldoutRows

                if (numHoldoutRows > 0):
                    evalIdx = list(
                        map(
                            lambda x: x > lastNonNullIdx and x <=
                            (lastNonNullIdx + numHoldoutRows),
                            ensembleFrame['X_INDEX']))
                else:
                    evalIdx = ensembleFrame['X_INDEX'] <= lastNonNullIdx

                ensembleFrame['X_FORECAST'] = list(
                    map(lambda x, y, z: median([x, y, z]),
                        mlrFrame['X_FORECAST'], arimaFrame['X_FORECAST'],
                        prophetFrame['X_FORECAST']))
                ensembleFrame['X_LPI'] = list(
                    map(lambda x, y, z: median([x, y, z]), mlrFrame['X_LPI'],
                        arimaFrame['X_LPI'], prophetFrame['X_LPI']))
                ensembleFrame['X_UPI'] = list(
                    map(lambda x, y, z: median([x, y, z]), mlrFrame['X_UPI'],
                        arimaFrame['X_UPI'], prophetFrame['X_UPI']))

                evalFrame = ensembleFrame[evalIdx]
                try:
                    ensembleMAPE = calcMAPE(evalFrame['X_FORECAST'],
                                            evalFrame[targetColumn])
                    ensembleFrame['X_MAPE'] = ensembleMAPE
                    for index, row in ensembleFrame.iterrows():
                        ensembleFrame['X_APE'][index] = (
                            abs(row['X_FORECAST'] - row[targetColumn]) /
                            row[targetColumn] *
                            100.0) if row[targetColumn] != 0 else None
                except:
                    # this may be needed if all forecasts frame and MAPE, APE cannot be calculated
                    if (not ('X_MAPE' in ensembleFrame)):
                        ensembleFrame['X_MAPE'] = 1E6
                    if (not ('X_APE' in ensembleFrame)):
                        ensembleFrame['X_APE'] = 1E6

                mapes = [mlrMAPE, arimaMAPE, prophetMAPE, ensembleMAPE]
            else:
                mapes = [mlrMAPE, arimaMAPE, prophetMAPE]

            print("Auto MAPEs (MLR, ARIMA, Prophet, Ensemble): ", mapes)

            minMAPE = min(mapes)

            if (mlrMAPE <= minMAPE):
                frame = mlrFrame
                frame['X_METHOD'] = 'MLR'
            elif (prophetMAPE <= minMAPE):
                frame = prophetFrame
                frame['X_METHOD'] = 'Prophet'
            elif (arimaMAPE <= minMAPE):
                frame = arimaFrame
                frame['X_METHOD'] = 'ARIMA'
            else:
                frame = ensembleFrame
                frame['X_METHOD'] = 'Ensemble'

        else:
            frame = forecastSingleFrame(frame, options.copy())

        outputFrame = frame if outputFrame is None else outputFrame.append(
            frame, ignore_index=True)

    return outputFrame