Exemplo n.º 1
0
class ProphetForQlik:
    """
    A class to provide Facebook Prophet functions for Qlik.
    """
    
    # Counter used to name log files for instances of the class
    log_no = 0
    
    # Dates in Qlik are stored as a serial number equal to the number of days since December 30, 1899.
    # This variable is used in correctly translating dates.
    qlik_cal_start = pd.Timestamp('1899-12-30')
    # This variable denotes the unit of time used in Qlik for numerical representation of datetime values
    qlik_cal_unit = 'D'
    
    def __init__(self, request):
        """
        Class initializer.

        Builds the request, holidays and input data frames from the request and
        reads the optional key word arguments through _set_params().

        :param request: an iterable sequence of bundled rows. Each row exposes its
            values through ``duals``: column 0 is the date as a Qlik serial number,
            column 1 is the value, followed by an optional holiday column and the
            key word argument string.
        """
        
        # Set the request variable for this object instance
        self.request = request

        # Create a Pandas Data Frame with column ds for the dates and column y for values
        self.request_df = pd.DataFrame([(row.duals[0].numData, row.duals[1].numData)
                                        for request_rows in self.request
                                        for row in request_rows.rows],
                                       columns=['ds', 'y'])
        
        # Rows without a date cannot be forecast. They are set aside here and
        # added back to the response by _forecast()
        self.NaT_df = self.request_df.loc[self.request_df.ds.isnull()].copy()
        
        if len(self.NaT_df) > 0:
            self.NaT_df.loc[:, 'y'] = 0
            # Take an explicit copy so that later column assignments do not write to a slice
            self.request_df = self.request_df.loc[self.request_df.ds.notnull()].copy()
        
        # Get additional arguments from the third (or fourth) column in the request data
        # Arguments should take the form of a comma separated string: 'arg1=value1, arg2=value2'
        self._set_params()
        
        # If the request contains holidays create a holidays data frame
        if self.has_holidays:
            self.holidays_df = pd.DataFrame([(row.duals[0].numData, row.duals[2].strData)
                                             for request_rows in self.request
                                             for row in request_rows.rows],
                                            columns=['ds', 'holiday'])
            
            if self.lower_window is not None:
                self.holidays_df['lower_window'] = self.lower_window
                
            if self.upper_window is not None:
                self.holidays_df['upper_window'] = self.upper_window
        
        # Additional information is printed to the terminal and logs if the parameter debug = true
        if self.debug:
            self._print_log(1)
        
        # Convert numerical date values to datetime.
        # The column is replaced as a whole because its dtype changes from float to datetime
        self.request_df['ds'] = pd.to_datetime(self.request_df['ds'], unit=self.qlik_cal_unit,
                                               origin=self.qlik_cal_start)
        
        # If the request contains holidays update the ds column for it as well
        if self.has_holidays:
            # Assignment aligns on the index, so rows that were dropped from
            # request_df (null dates) end up with a null ds here
            self.holidays_df['ds'] = self.request_df['ds']
            
            # Remove rows from the holidays data frame where the holiday or ds column is empty
            populated = (self.holidays_df.holiday != '') & self.holidays_df.ds.notnull()
            self.holidays_df = self.holidays_df.loc[populated].copy()
            
            # Make the holiday names lower case to avoid the return argument becoming case sensitive,
            # and remove spaces and apostrophes
            holiday_names = self.holidays_df['holiday'].str.lower()
            holiday_names = holiday_names.str.replace(" ", "_")
            holiday_names = holiday_names.str.replace("'", "")
            self.holidays_df['holiday'] = holiday_names
            
            # Sort by the ds column and reset indexes
            self.holidays_df = self.holidays_df.sort_values('ds').reset_index(drop=True)
            
            # Finally add this to the key word arguments for Prophet
            self.prophet_kwargs['holidays'] = self.holidays_df
        
        # Sort the Request Data Frame based on dates, as Qlik may send unordered data
        self.request_df = self.request_df.sort_values('ds')
        
        # Store the original indexes for re-ordering output later
        self.request_index = self.request_df.loc[:, 'ds']
        
        # Ignore the placeholder rows which will be filled with forecasted figures later.
        # The end position is computed explicitly: a slice of [:-periods] would be
        # empty when periods is 0, discarding the whole history
        history_rows = max(len(self.request_df) - self.periods, 0)
        self.input_df = self.request_df.iloc[:history_rows].copy()
        
        # Reset the indexes for the input data frame. 
        # Not doing this interferes with correct ordering of the output from Prophet
        self.input_df = self.input_df.reset_index(drop=True)
        
        # If take_log = true take logarithm of relevant input values.
        # This is usually to make the timeseries more stationary
        if self.take_log:
            self.input_df['y'] = np.log(self.input_df['y'])
            
            if self.cap is not None:
                self.cap = np.log(self.cap)
            
                # A floor is only meaningful together with a cap
                if self.floor is not None:
                    self.floor = np.log(self.floor)
        
        # If a logistic growth model is applied add the cap and floor columns to the input data frame
        if self.cap is not None:
            self.input_df['cap'] = self.cap
            
            if self.floor is not None:
                self.input_df['floor'] = self.floor
            
        if self.debug:
            self._print_log(2)
    
    @classmethod
    def init_seasonality(cls, request):
        """
        Alternative initialization method for this class.

        Used when the request contains the timeseries as a concatenated string, repeated for every row.
        This is used when the number of input data points differs from the output rows required
        for seasonality plots.

        The first row supplies all inputs: column 1 holds the timeseries as 'ds:y;ds:y;...',
        column 2 holds the holidays as 'ds:holiday;...' (may be empty) and column 3 holds
        the key word arguments. Column 0 of every row holds the seasonality value used to
        order the response.

        :param request: an indexable sequence of bundled rows
        :return: an initialized ProphetForQlik instance flagged as a seasonality request
        """
        
        def to_number(text):
            # Empty strings become nulls; any other text is parsed using the current locale
            return locale.atof(text) if text else np.nan
        
        # The rows are duplicates in this kind of request, so inputs are simply taken from the first row
        first_row = request[0].rows[0]
        
        # Store the correct number of rows to be output
        request_row_count = sum(1 for request_rows in request for _ in request_rows.rows)
        
        # The timeseries is accepted as a string from the second column of the first row
        timeseries = first_row.duals[1].strData
        # The holidays are taken from the third column of the first row
        holidays = first_row.duals[2].strData
        # The key word arguments are taken from the fourth column of the first row (kept as a Dual)
        args = first_row.duals[3]
        
        # The data may be sent unsorted by Qlik, so we have to store the order to use when sending the results
        sort_order = pd.DataFrame([(row.duals[0].numData, row.duals[0].strData)
                                   for request_rows in request
                                   for row in request_rows.rows],
                                  columns=['seasonality_num', 'seasonality_str'])
        
        # We ignore Null values here as these are handled separately in the response
        sort_order = sort_order.loc[sort_order.seasonality_num.notnull()]
        
        # The correct sort order is based on the data frame's index after sorting on the seasonality field
        sort_order = sort_order.sort_values('seasonality_num')
        
        # Re-create the request with ds and y columns
        request_df = pd.DataFrame([pair.split(":") for pair in timeseries.split(";")],
                                  columns=['ds', 'y'])
        
        # Convert strings to numeric values, replacing empty strings with Null values
        request_df = request_df.applymap(to_number)
        
        # Check if the holidays column is populated
        if len(holidays) > 0:
            # Create a holidays data frame
            holiday_df = pd.DataFrame([pair.split(":") for pair in holidays.split(";")],
                                      columns=['ds', 'holiday'])
            
            # The merge key must have the same type on both sides: request_df.ds is
            # numeric at this point, so the holiday dates are converted the same way
            holiday_df['ds'] = holiday_df['ds'].apply(to_number)
            
            # Merge the holidays with the request data frame using column ds as key
            request_df = pd.merge(request_df, holiday_df, on='ds', how='left')
            
            # Replace null values in the holiday column with empty strings
            request_df = request_df.fillna(value={'holiday': ''})
        
        # Values in the data frame are converted to type SSE.Dual
        request_df['ds'] = request_df['ds'].apply(lambda result: SSE.Dual(numData=result))
        request_df['y'] = request_df['y'].apply(lambda result: SSE.Dual(numData=result))
        if 'holiday' in request_df.columns:
            request_df['holiday'] = request_df['holiday'].apply(lambda result: SSE.Dual(strData=result))
        
        # Add the keyword arguments to the data frame as well, already of type SSE.Dual
        request_df.loc[:, 'args'] = args
        
        # Create the updated request list and convert to SSE data types
        request_rows = [SSE.Row(duals=duals) for duals in request_df.values.tolist()]
        updated_request = [SSE.BundledRows(rows=request_rows)]
                
        # Call the default initialization method
        instance = ProphetForQlik(updated_request)
        
        # Handle null value row in the request dataset
        # NOTE(review): request_df.ds holds SSE.Dual objects at this point rather than
        # numbers, so this selection presumably always comes back empty - confirm intent
        instance.NaT_df = request_df.loc[request_df.ds.isnull()].copy()
        
        # If such a row exists it will be sliced off and then added back to the response
        if len(instance.NaT_df) > 0:
            instance.NaT_df.loc[:, 'y'] = 0
        
        # Set a property that lets us know this instance was created for seasonality forecasts
        instance.is_seasonality_request = True
        
        # Set a property that lets us know the row count in the original request as this will be different from request_df
        instance.request_row_count = request_row_count
        
        # Update the default result type if this was not passed in arguments
        if instance.result_type == 'yhat':
            instance.result_type = instance.seasonality
        
        # Set the sort order to be used when returning the results
        instance.sort_order = sort_order
        
        # Return the initialized ProphetForQlik instance
        return instance
    
    def predict(self):
        """
        Calculate forecasted values using the Prophet library.
        """
        
        # If the input data frame contains less than 2 non-Null rows, prediction is not possible
        if len(self.input_df) - self.input_df.y.isnull().sum() <= 2:
            
            if self.debug:
                self._print_log(3)
            
            # A series of null values is returned to avoid an error in Qlik
            return pd.Series([np.NaN for y in range(self.request_row_count)])
        
        # Instantiate a Prophet object and fit the input data frame:
        
        if len(self.prophet_kwargs) > 0:
            self.model = Prophet(**self.prophet_kwargs)
        else:
            self.model = Prophet()
        
        # Add custom seasonalities if defined in the arguments
        if self.name is not None and len(self.add_seasonality_kwargs) > 0:
            self.model.add_seasonality(**self.add_seasonality_kwargs)
        
        self.model.fit(self.input_df)
             
        # Create a data frame for future values
        self.future_df = self.model.make_future_dataframe(**self.make_kwargs)
        
        # If a logistic growth model is applied add the cap and floor columns to the future data frame
        if self.cap is not None:
            self.future_df.loc[:,'cap'] = self.cap
            
            if self.floor is not None:
                self.future_df.loc[:,'floor'] = self.floor
        
        # Prepare the forecast
        self._forecast()
        
        if self.debug:
            self._print_log(4)
        
        return self.forecast.loc[:,self.result_type]
    
    def _set_params(self):
        """
        Set input parameters based on the request.
        Parameters implemented for the Prophet() function are: growth, cap, floor, changepoint_prior_scale, interval_width 
        Parameters implemented for the make_future_dataframe() function are: freq, periods
        Parameters implemented for seasonality are: add_seasonality, seasonality_period, seasonality_fourier, seasonality_prior_scale
        Parameters implemented for holidays are: holidays_prior_scale, lower_window, upper_window
        Additional parameters for seasonality requests are: weekly_start, yearly_start
        Additional parameters used are: return, take_log, seasonality, debug

        The argument string is read from the fourth column of the first row if it exists
        (the third column then carries holidays), otherwise from the third column.

        :raises ValueError: if a non-empty argument is not of the form name=value, or a
            value cannot be converted to the type its parameter requires
        """
        
        def as_bool(value):
            # Valid values are: true, false
            return 'true' == value.lower()
        
        # Calculate the forecast periods based on the number of placeholders in the data
        self.periods = utils.count_placeholders(self.request_df.loc[:, 'y'])
        
        # Set the row count in the original request
        self.request_row_count = len(self.request_df) + len(self.NaT_df)
        
        # Set default values which will be used if an argument is not passed
        self.result_type = 'yhat'
        self.take_log = False
        self.seasonality = 'yearly'
        self.debug = False
        self.freq = 'D'
        self.cap = None
        self.floor = None
        self.growth = None
        self.changepoint_prior_scale = None
        self.interval_width = None
        self.name = None
        self.period = None
        self.fourier_order = None
        self.seasonality_prior_scale = None
        self.holidays_prior_scale = None
        self.is_seasonality_request = False
        self.weekly_start = 6 # Defaulting to a Monday start for the week as used in Qlik
        self.yearly_start = 0
        self.lower_window = None
        self.upper_window = None
        
        # The parsed key word arguments; stays empty when the request carries none
        self.kwargs = {}
        
        # Locate the argument string
        try:
            # If there is a fourth column, it is assumed to contain the key word arguments.
            # The third column should then provide the holiday name or null for each row
            args = self.request[0].rows[0].duals[3].strData
            self.has_holidays = True
        except IndexError:
            # If there is no fourth column, the request does not include holidays
            # and we try again with the third column
            self.has_holidays = False
            try:
                args = self.request[0].rows[0].duals[2].strData
            except IndexError:
                args = None
        
        # If the key word arguments were included in the request, get the parameters and values
        if args is not None:
            
            # Strip all whitespace, then split into name=value tokens
            compact = args.translate(str.maketrans('', '', string.whitespace))
            
            for token in compact.split(","):
                # An empty argument string or a stray comma produces empty tokens
                if not token:
                    continue
                
                key, separator, value = token.partition("=")
                if not separator:
                    raise ValueError("Argument '{0}' is not of the form name=value".format(token))
                
                # Make sure the key words are in lower case
                self.kwargs[key.lower()] = value
            
            # Each entry is: (key word in the request, attribute on this object, converter)
            simple_params = (
                # Return type. Valid values are: yhat, trend, seasonal, seasonalities.
                # Add _lower or _upper to the series name to get lower or upper limits.
                ('return', 'result_type', str.lower),
                # Take a logarithm of y values before forecast calculations
                ('take_log', 'take_log', as_bool),
                # Type of seasonality requested. Used only for seasonality requests
                # Valid values are: yearly, weekly, monthly, holidays
                ('seasonality', 'seasonality', str.lower),
                # Debug option for generating execution logs
                ('debug', 'debug', as_bool),
                # Frequency of the timeseries: any valid frequency for pd.date_range, such as 'D' or 'M'
                # For options see: http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases
                ('freq', 'freq', str),
                # Trend flexibility. Default value is 0.05. Increasing it will make the trend more flexible.
                ('changepoint_prior_scale', 'changepoint_prior_scale', float),
                # Width for the uncertainty intervals. Default value is 0.8 (i.e. 80%)
                ('interval_width', 'interval_width', float),
                # Name of an additional seasonality to be added to the model
                ('add_seasonality', 'name', str.lower),
                # Seasonality period, e.g. 30.5 for 'monthly' seasonality
                ('seasonality_period', 'period', float),
                # Seasonality fourier terms. More terms fit faster changing cycles but can overfit
                ('seasonality_fourier', 'fourier_order', int),
                # Seasonality prior scale. Reducing this parameter dampens seasonal effects
                ('seasonality_prior_scale', 'seasonality_prior_scale', float),
                # Holiday prior scale. Reducing this parameter dampens holiday effects
                ('holidays_prior_scale', 'holidays_prior_scale', float),
                # Offset, in days, applied to the start of the week for 'weekly' seasonality requests
                ('weekly_start', 'weekly_start', int),
                # Offset, in days, applied to the start of the year for 'yearly' seasonality requests
                ('yearly_start', 'yearly_start', int),
                # Extend the holidays by lower_window number of days before the date
                ('lower_window', 'lower_window', int),
                # Extend the holidays by upper_window number of days after the date
                ('upper_window', 'upper_window', int),
            )
            
            for key, attribute, convert in simple_params:
                if key in self.kwargs:
                    setattr(self, attribute, convert(self.kwargs[key]))
            
            # Set the cap which adds an upper limit at which the forecast will saturate
            # This changes the default linear growth model to a logistic growth model
            if 'cap' in self.kwargs:
                self.cap = float(self.kwargs['cap'])
                self.growth = 'logistic'
            
                # Set the floor which adds a lower limit at which the forecast will saturate
                # To use a logistic growth trend with a floor, a cap must also be specified
                if 'floor' in self.kwargs:
                    self.floor = float(self.kwargs['floor'])
        
        # Create dictionaries of key word arguments, holding only the parameters that were set:
        
        # For the Prophet() function
        self.prophet_kwargs = self._populate_dict(['growth', 'changepoint_prior_scale', 'interval_width',
                                                   'seasonality_prior_scale', 'holidays_prior_scale'])
        
        # For the make_future_dataframe() function
        self.make_kwargs = self._populate_dict(['periods', 'freq'])
        
        # For the add_seasonality() function
        self.add_seasonality_kwargs = self._populate_dict(['name', 'period', 'fourier_order'])
            
    def _populate_dict(self, params):
        """
        Populate a dictionary based on a list of parameters. 
        The parameters should already exist in this object.
        """
        
        output_dict = {}
        
        for prop in params:
            if getattr(self, prop) is not None:
                output_dict[prop] = getattr(self, prop)
        
        return output_dict
    
    def _forecast(self):
        """
        Execute the forecast algorithm according to the request type
        """
        
        # If this is a seasonality request, we need to return the relevant seasonlity component
        if self.is_seasonality_request:

            if self.seasonality == 'weekly':
                # Prepare the seasonality data frame
                # Parameter start needs to be any arbitrary week starting on a Sunday
                days = (pd.date_range(start='2017-01-01', periods=7) + pd.Timedelta(days=self.weekly_start))
                df_w = self.model.seasonality_plot_df(days)

                # Calculate seasonal components 
                self.forecast = self.model.predict_seasonal_components(df_w)

            elif self.seasonality == 'yearly':
                # Prepare the seasonality data frame
                # Parameter start needs to be 1st January for any arbitrary year
                days = (pd.date_range(start='2017-01-01', periods=365) + pd.Timedelta(days=self.yearly_start))
                df_y = self.model.seasonality_plot_df(days)

                # Calculate seasonal components 
                self.forecast = self.model.predict_seasonal_components(df_y)

            else:
                # Prepare the seasonality data frame
                start = pd.to_datetime('2017-01-01 0000')
                period = self.model.seasonalities[self.seasonality]['period']
                
                end = start + pd.Timedelta(days=period)
                # plot_points = 200
                # plot_points is used instead of period below in fbprophet/forecaster.py. 
                # However, it seems to make more sense to use period given the expected usage in Qlik
                intervals = pd.to_datetime(np.linspace(start.value, end.value, period)) 
                
                df_x = self.model.seasonality_plot_df(intervals)

                # Calculate seasonal components 
                self.forecast = self.model.predict_seasonal_components(df_x)
            
            # Set the correct sort order for the response
            self.forecast = self.forecast.reindex(self.sort_order.index)
        
        # For standard forecast the output rows equal the input rows
        else:
            # Prepare the forecast
            self.forecast = self.model.predict(self.future_df)
            
            # For return=y_then_yhat[_upper / _lower] we return y values followed by relevant results for the forecast periods
            if 'y_then_yhat' in self.result_type:
                relevant_result = self.result_type.replace('y_then_', '')

                # Copy yhat / yhat_upper / yhat_lower values to the new column
                self.forecast.loc[:, self.result_type] = self.forecast.loc[:, relevant_result]

                if 'upper' in self.result_type or 'lower' in self.result_type:
                    # Overwrite historic values with Nulls
                    self.forecast.loc[:len(self.forecast) - self.periods - 1, self.result_type] \
                    = np.NaN
                else:
                    # Overwrite with y values for historic data
                    self.forecast.loc[:len(self.forecast) - self.periods - 1, self.result_type] \
                    = self.request_df.loc[:len(self.request_df) - self.periods - 1, 'y']
            
            # Update to the original index from the request data frame
            self.forecast.index = self.request_index.index
            
            # Reset to the original sort order of the data sent by Qlik
            self.forecast = self.forecast.sort_index()

        # Undo the logarithmic conversion if it was applied during initialization
        if self.take_log:
            self.forecast.loc[:,self.result_type] = np.exp(self.forecast.loc[:,self.result_type])

        # Add back the null row if it was received in the request
        if len(self.NaT_df) > 0:
            self.NaT_df = self.NaT_df.rename({'y': self.result_type}, axis='columns')
            self.forecast = self.forecast.append(self.NaT_df)
        
    def _print_log(self, step):
        """
        Output useful information to stdout and the log file if debugging is required.
        step: Print the corresponding step in the log
        """
        
        if step == 1:
            # Increment log counter for the class. Each instance of the class generates a new log.
            self.__class__.log_no += 1
             
            # Create a log file for the instance
            # Logs will be stored in ..\logs\Prophet Log <n>.txt
            self.logfile = os.path.join(os.getcwd(), 'logs', 'Prophet Log {}.txt'.format(self.log_no))
            
            # Output log header
            sys.stdout.write("ProphetForQlik Log: {0} \n\n".format(time.ctime(time.time())))
            with open(self.logfile,'w') as f:
                f.write("ProphetForQlik Log: {0} \n\n".format(time.ctime(time.time())))
        
        elif step == 2:
            # Output the request and input data frames to the terminal
            sys.stdout.write("Prophet parameters: {0}\n\n".format(self.kwargs))
            sys.stdout.write("Instance creation parameters: {0}\n\n".format(self.prophet_kwargs))
            sys.stdout.write("Make future data frame parameters: {0}\n\n".format(self.make_kwargs))
            sys.stdout.write("Add seasonality parameters: {0}\n\n".format(self.add_seasonality_kwargs))
            sys.stdout.write("REQUEST DATA FRAME: {0} rows x cols\n\n".format(self.request_df.shape))
            sys.stdout.write("{0} \n\n".format(self.request_df.to_string()))
            if len(self.NaT_df) > 0:
                sys.stdout.write("REQUEST NULL VALUES DATA FRAME: {0} rows x cols\n\n".format(self.NaT_df.shape))
                sys.stdout.write("{0} \n\n".format(self.NaT_df.to_string()))
            sys.stdout.write("INPUT DATA FRAME: {0} rows x cols\n\n".format(self.input_df.shape))
            sys.stdout.write("{} \n\n".format(self.input_df.to_string()))
            if self.has_holidays:
                sys.stdout.write("HOLIDAYS DATA FRAME: {0} rows x cols\n\n".format(self.holidays_df.shape))
                sys.stdout.write("{0} \n\n".format(self.holidays_df.to_string()))
            
            # Output the request and input data frames to the log file 
            with open(self.logfile,'a') as f:
                f.write("Prophet parameters: {0}\n\n".format(self.kwargs))
                f.write("Instance creation parameters: {0}\n\n".format(self.prophet_kwargs))
                f.write("Make future data frame parameters: {0}\n\n".format(self.make_kwargs))
                f.write("Add seasonality parameters: {0}\n\n".format(self.add_seasonality_kwargs))
                f.write("REQUEST DATA FRAME: {0} rows x cols\n\n".format(self.request_df.shape))
                f.write("{0} \n\n".format(self.request_df.to_string()))
                if len(self.NaT_df) > 0:
                    f.write("REQUEST NULL VALUES DATA FRAME: {0} rows x cols\n\n".format(self.NaT_df.shape))
                    f.write("{0} \n\n".format(self.NaT_df.to_string()))
                f.write("INPUT DATA FRAME: {0} rows x cols\n\n".format(self.input_df.shape))
                f.write("{0} \n\n".format(self.input_df.to_string()))
                if self.has_holidays:
                    f.write("HOLIDAYS DATA FRAME: {0} rows x cols\n\n".format(self.holidays_df.shape))
                    f.write("{0} \n\n".format(self.holidays_df.to_string()))
        
        elif step == 3:
            # Output in case the input contains less than 2 non-Null rows
            sys.stdout.write("\nForecast cannot be generated as the request contains less than two non-Null rows\n\n")
            with open(self.logfile,'a') as f:
                f.write("\nForecast cannot be generated as the request contains less than two non-Null rows\n\n")
        
        elif step == 4:         
            # Output the forecast data frame and returned series to the terminal
            sys.stdout.write("\nFORECAST DATA FRAME: {0} rows x cols\n\n".format(self.forecast.shape))
            sys.stdout.write("RESULT COLUMNS:\n\n")
            [sys.stdout.write("{}\n".format(col)) for col in self.forecast]
            sys.stdout.write("\nSAMPLE RESULTS:\n{0} \n\n".format(self.forecast.tail(self.periods).to_string()))
            sys.stdout.write("FORECAST RETURNED:\n{0}\n\n".format(self.forecast.loc[:,self.result_type].to_string()))
            
            # Output the forecast data frame and returned series to the log file
            with open(self.logfile,'a') as f:
                f.write("\nFORECAST DATA FRAME: {0} rows x cols\n\n".format(self.forecast.shape))
                f.write("RESULT COLUMNS:\n\n")
                [f.write("{}\n".format(col)) for col in self.forecast]
                f.write("\nSAMPLE RESULTS:\n{0} \n\n".format(self.forecast.tail(self.periods).to_string()))
                f.write("FORECAST RETURNED:\n{0}\n\n".format(self.forecast.loc[:,self.result_type].to_string()))
    
    @staticmethod
    def timeit(request):
        """
        Time the different components of the forecast.

        Each of the four stages (instance creation, forecast calculation, conversion to
        SSE.Dual and conversion to SSE.Row) is run once under a timer. The time taken is
        written to stdout and appended to the performance log file in the logs folder.

        :param request: an iterable sequence of RowData, as passed to the class initializer
        """
        
        import timeit
        import ServerSideExtension_pb2 as SSE
        
        # Create a log file for the performance figures
        log_dir = os.path.join(os.getcwd(), 'logs')
        # The directory may not exist the first time a log is requested
        os.makedirs(log_dir, exist_ok=True)
        logfile = os.path.join(log_dir, 'Prophet Performance Log.txt')

        def to_duals(series):
            return series.apply(lambda result: iter([SSE.Dual(numData=result)]))

        def to_rows(duals_series):
            return duals_series.apply(lambda duals: SSE.Row(duals=duals)).tolist()
        
        def report(template, stage):
            # The callable is handed to the timer directly, so the names it uses are
            # resolved through the closure and nothing has to be injected into builtins
            elapsed = timeit.Timer(stage).timeit(1)
            message = template.format(elapsed)
            sys.stdout.write(message)
            with open(logfile, 'a') as f:
                f.write(message)
        
        # Prepare the inputs for every stage up front, so that each measurement
        # covers its own stage only
        predictor = ProphetForQlik(request)
        forecast = predictor.predict()
        response_rows = to_duals(forecast)

        report("Time taken to create an instance of ProphetForQlik: {}\n",
               lambda: ProphetForQlik(request))
        report("Time taken to calculate the forecast: {}\n",
               lambda: predictor.predict())
        report("Time taken to convert results to SSE.Dual: {}\n",
               lambda: to_duals(forecast))
        report("Time taken to convert duals to SSE.Row: {}\n",
               lambda: to_rows(response_rows))
Exemplo n.º 2
0
class ProphetForQlik:
    """
    A class to provide Facebook Prophet functions for Qlik.
    """

    # Counter used to name log files for instances of the class
    log_no = 0

    # Dates in Qlik are stored as serial number that equals the number of days since December 30, 1899.
    # This variable is used in correctly translating dates.
    qlik_cal_start = pd.Timestamp('1899-12-30')
    # This variable denotes the unit of time used in Qlik for numerical representation of datetime values
    qlik_cal_unit = 'D'

    def __init__(self, request, context):
        """
        Class initializer.

        :param request: an iterable sequence of RowData. It is also accessed as request[0]
            when reading the key word arguments, so it must support indexing.
        :param context: stored on the instance as self.context
        :Sets up the input data frame and parameters based on the request
        """

        # Set the request and context variables for this object instance
        self.request = request
        self.context = context

        # Create a Pandas Data Frame with column ds for the dates and column y for values
        self.request_df = pd.DataFrame(
            [(row.duals[0].numData, row.duals[1].numData)
             for request_rows in self.request
             for row in request_rows.rows],
            columns=['ds', 'y'])

        # Handle null value rows in the request dataset
        self.NaT_df = self.request_df.loc[self.request_df.ds.isnull()].copy()

        # If such a row exists it will be sliced off and then added back to the response
        if len(self.NaT_df) > 0:
            self.NaT_df.loc[:, 'y'] = 0
            # Take a copy so that later column assignments do not write to a slice of the original frame
            self.request_df = self.request_df.loc[self.request_df.ds.notnull()].copy()

        # Get additional arguments from the last column in the request data
        # Arguments should take the form of a comma separated string: 'arg1=value1, arg2=value2'
        self._set_params()

        # Additional information is printed to the terminal and logs if the paramater debug = true
        if self.debug:
            self._print_log(1)

        # Convert numerical date values to datetime.
        # The column is replaced rather than set in place through .loc, as the dtype changes
        # from float to datetime and in-place setting of an incompatible dtype is deprecated in pandas.
        self.request_df['ds'] = pd.to_datetime(self.request_df['ds'],
                                               unit=self.qlik_cal_unit,
                                               origin=self.qlik_cal_start)

        # If the request contains holidays, prepare a holidays data frame
        if self.has_holidays:
            self._prep_holidays()

        # If the request contains additional regressors, add them to a regressors data frame
        if self.has_regressors:
            self._prep_regressors()

        # Sort the Request Data Frame based on dates, as Qlik may send unordered data
        self.request_df = self.request_df.sort_values('ds')

        # Store the original indexes for re-ordering output later
        self.request_index = self.request_df.loc[:, 'ds']

        # Add additional regressors to the request data frame
        if self.has_regressors:
            self.request_df = self.request_df.merge(self.regressors_df,
                                                    how='left',
                                                    left_index=True,
                                                    right_index=True)

        # Ignore the placeholder rows which will be filled with forecasted figures later.
        # An explicit end position is used because iloc[:-0] would return an empty frame
        # when the request contains no placeholders.
        history_rows = max(len(self.request_df) - self.periods, 0)
        self.input_df = self.request_df.iloc[:history_rows].copy()

        # Reset the indexes for the input data frame.
        # Not doing this interferes with correct ordering of the output from Prophet
        self.input_df = self.input_df.reset_index(drop=True)

        # If the input data frame contains less than 2 non-Null rows, prediction is not possible
        if len(self.input_df) - self.input_df.y.isnull().sum() >= 2:
            # If take_log = true take logarithm of relevant input values.
            # This is usually to make the timeseries more stationary
            if self.take_log:
                self.input_df.loc[:, 'y'] = np.log(self.input_df.loc[:, 'y'])

                # The cap and floor have to be on the same scale as the transformed values.
                # A floor is only read from the arguments when a cap is supplied as well.
                if self.cap is not None:
                    self.cap = np.log(self.cap)

                    if self.floor is not None:
                        self.floor = np.log(self.floor)

            # If a logistic growth model is applied add the cap and floor columns to the input data frame
            if self.cap is not None:
                self.input_df.loc[:, 'cap'] = self.cap

                if self.floor is not None:
                    self.input_df.loc[:, 'floor'] = self.floor

        if self.debug:
            self._print_log(2)

    @classmethod
    def init_seasonality(cls, request, context):
        """
        Alternative initialization method for this class
        Used when the request contains the timeseries as a contatenated string, repeated for every row
        This is used when the number of input data points differs from the output rows required for seasonality plots

        The first row of the request is expected to hold, by column position:
        seasonality, timeseries ('ds:y' pairs separated by ';'), holidays ('ds:holiday' pairs separated by ';'),
        optionally regressors ('ds:regressors' pairs separated by ';') and regressor arguments,
        and finally the key word arguments in the last column.

        :param request: an indexable sequence of bundled rows
        :param context: passed on unchanged to the default initializer
        :return: an initialized instance flagged as a seasonality request
        """

        # The rows are duplicates in this kind of request, so inputs are simply taken from the first row
        first_row = request[0].rows[0]

        # First we store the correct number of rows to be output.
        request_row_count = sum(1 for request_rows in request
                                for _ in request_rows.rows)
        # The timeseries is accepted as a string from the second column of the first row
        timeseries = first_row.duals[1].strData
        # The holidays are taken from the third column of the first row
        holidays = first_row.duals[2].strData

        # Get the number of columns in the request
        cols = len(first_row.duals)

        # If additional regressors are included we extract them from the request as well
        has_regressors = cols > 4
        if has_regressors:
            regressors = first_row.duals[3].strData
            # Kept as an SSE.Dual as it is passed through to the rebuilt request unchanged
            regressor_args = first_row.duals[4]

        # The key word arguments are taken from the last column of the first row (kept as an SSE.Dual)
        args = first_row.duals[cols - 1]

        # The data may be sent unsorted by Qlik, so we have to store the order to use when sending the results
        sort_order = pd.DataFrame(
            [(row.duals[0].numData, row.duals[0].strData)
             for request_rows in request
             for row in request_rows.rows],
            columns=['seasonality_num', 'seasonality_str'])

        # We ignore Null values here as these are handled separately in the response
        sort_order = sort_order.loc[sort_order.seasonality_num.notnull()]

        # Re-create the request with ds and y columns
        pairs = timeseries.split(";")
        request_df = pd.DataFrame([p.split(":") for p in pairs],
                                  columns=['ds', 'y'])

        # Convert strings to numeric values, replace empty values with Null values.
        # A column-wise map is used as DataFrame.applymap is deprecated in recent pandas versions.
        request_df = request_df.apply(lambda column: column.map(
            lambda s: utils.atof(s) if s else np.nan))

        # Check if the holidays column is populated
        if len(holidays) > 0:
            # Create a holidays data frame
            pairs = holidays.split(";")
            holiday_df = pd.DataFrame([p.split(":") for p in pairs],
                                      columns=['ds', 'holiday'])

            # The ds values are strings at this point; convert them to floats to match request_df
            holiday_df['ds'] = holiday_df['ds'].astype('float64')

            # Merge the holidays with the request data frame using column ds as key
            request_df = pd.merge(request_df, holiday_df, on='ds', how='left')

            # Replace null values in the holiday column with empty strings
            request_df = request_df.fillna(value={'holiday': ''})
        elif has_regressors:
            # The default initializer only recognises regressors in a six column request,
            # so an empty holiday column is kept in place to preserve the column positions.
            request_df['holiday'] = ''

        # If additional regressors are included in the request
        if has_regressors:
            # Create a regressors data frame
            pairs = regressors.split(";")
            regressors_df = pd.DataFrame([p.split(":") for p in pairs],
                                         columns=['ds', 'regressors'])

            # As with the holidays, the ds key must be numeric to match the ds column of request_df
            regressors_df['ds'] = regressors_df['ds'].astype('float64')

            # Merge the regressors with the request data frame using column ds as key
            request_df = pd.merge(request_df,
                                  regressors_df,
                                  on='ds',
                                  how='left')

            # Replace null values in the regressors column with empty strings
            request_df = request_df.fillna(value={'regressors': ''})

            # Add keyword arguments for the additional regressors to the request data frame as well
            request_df.loc[:, 'regressor_args'] = regressor_args

        # Values in the data frame are converted to type SSE.Dual.
        # Columns are replaced rather than set in place, as their dtype changes to object.
        request_df['ds'] = request_df['ds'].apply(
            lambda result: SSE.Dual(numData=result))
        request_df['y'] = request_df['y'].apply(
            lambda result: SSE.Dual(numData=result))
        if 'holiday' in request_df.columns:
            request_df['holiday'] = request_df['holiday'].apply(
                lambda result: SSE.Dual(strData=result))
        if 'regressors' in request_df.columns:
            request_df['regressors'] = request_df['regressors'].apply(
                lambda result: SSE.Dual(strData=result))

        # Add the keyword arguments to the data frame as well, already of type SSE.Dual
        request_df.loc[:, 'args'] = args

        # Create the updated request list and convert to SSE data types
        request_list = request_df.values.tolist()
        request_list = [SSE.Row(duals=duals) for duals in request_list]
        updated_request = [SSE.BundledRows(rows=request_list)]

        # Call the default initialization method
        instance = cls(updated_request, context)

        # Handle null value row in the request dataset
        # NOTE(review): the ds column holds SSE.Dual objects at this point, so this mask
        # looks like it can never select a row - confirm whether sort_order was intended.
        instance.NaT_df = request_df.loc[request_df.ds.isnull()].copy()

        # If such a row exists it will be sliced off and then added back to the response
        if len(instance.NaT_df) > 0:
            instance.NaT_df.loc[:, 'y'] = 0

        # Set a property that lets us know this instance was created for seasonality forecasts
        instance.is_seasonality_request = True

        # Set a property that lets us know the row count in the original request as this will be different from request_df
        instance.request_row_count = request_row_count

        # Update the default result type if this was not passed in arguments
        if instance.result_type == 'yhat':
            instance.result_type = instance.seasonality

        if instance.seasonality == 'weekly':
            # For weekly seasonlity the return sort order is based on the day number from 0-6, with 0 being Monday
            instance.sort_order = sort_order.set_index(
                sort_order.seasonality_num)
        else:
            # Else the return sort order is based on the data frame's index after sorting on the seasonality field
            instance.sort_order = sort_order.sort_values('seasonality_num')

        # Return the initialized ProphetForQlik instance
        return instance

    def predict(self):
        """
        Calculate forecasted values using the Prophet library.
        """

        # If the input data frame contains less than 2 non-Null rows, prediction is not possible
        if len(self.input_df) - self.input_df.y.isnull().sum() <= 2:

            if self.debug:
                self._print_log(3)

            # A series of null values is returned to avoid an error in Qlik
            return pd.Series([np.NaN for y in range(self.request_row_count)])

        # Instantiate a Prophet object and fit the input data frame:

        if len(self.prophet_kwargs) > 0:
            self.model = Prophet(**self.prophet_kwargs)
        else:
            self.model = Prophet()

        # Add custom seasonalities if defined in the arguments
        if self.name is not None and len(self.add_seasonality_kwargs) > 0:
            self.model.add_seasonality(**self.add_seasonality_kwargs)

        # Add additional regressors if defined in the arguments
        if self.has_regressors:
            i = 0
            for regressor in self.regressors_df.columns:
                self.model.add_regressor(regressor, **self.regressor_kwargs[i])
                i += 1

        self.model.fit(self.input_df, **self.fit_kwargs)

        # Create a data frame for future values
        self.future_df = self.model.make_future_dataframe(**self.make_kwargs)

        # If a logistic growth model is applied add the cap and floor columns to the future data frame
        if self.cap is not None:
            self.future_df.loc[:, 'cap'] = self.cap

            if self.floor is not None:
                self.future_df.loc[:, 'floor'] = self.floor

        # Add additional regressors to the future data frame
        if self.has_regressors:
            # index_slice = self.regressors_df.shape[0] - self.periods
            for regressor in self.regressors_df.columns:
                self.future_df[regressor] = self.regressors_df.loc[:,
                                                                   regressor]

        if self.debug:
            self._print_log(4)

        # Prepare the forecast
        self._forecast()

        # If the function was called through the load script we return a Data Frame
        if self.load_script:
            # If the response is the seasonality plot we return all seasonality components
            if self.is_seasonality_request:
                # Add an index column to the response
                self.response = self.forecast.reset_index()
            # Otherwise we add dates to the response
            else:
                # Set up the response data frame
                self.response = self.forecast if self.result_type == 'all' else self.forecast.loc[:, [
                    'ds', self.result_type
                ]]
                # Update the ds column as formatted strings
                self.response['ds'] = self.request_df['ds'].dt.strftime(
                    '%Y-%m-%d %r')

            if self.debug:
                self._print_log(5)

            # Send meta data on the response to Qlik
            self._send_table_description()

            return self.response
        else:
            if self.debug:
                self._print_log(5)

            return self.forecast.loc[:, self.result_type]

    def _set_params(self):
        """
        Set input parameters based on the request.

        The key word arguments are read from the last column of the first row in the request, provided
        the request has at least three columns. They should take the form of a comma separated string:
        'arg1=value1, arg2=value2'. Whitespace is ignored and the key words are not case sensitive.

        Parameters implemented for the Prophet() function are: cap, floor, seasonality_mode, changepoint_prior_scale,
        interval_width, seasonality_prior_scale, holidays_prior_scale, mcmc_samples, n_changepoints,
        changepoint_range, uncertainty_samples. Passing a cap sets growth to 'logistic'.
        Parameters implemented for the make_future_dataframe() function are: freq. The periods are derived from the data.
        Parameters implemented for seasonality are: add_seasonality, add_seasonality_mode, seasonality_period, seasonality_fourier
        Parameters implemented for holidays are: lower_window, upper_window
        Additional parameters for seasonlity requests are: weekly_start, yearly_start
        Additional parameters used are: return, take_log, seasonality, debug, load_script, is_seasonality_request, random_seed

        Also sets the has_holidays and has_regressors flags based on the number of columns in the request,
        and builds the dictionaries prophet_kwargs, make_kwargs, add_seasonality_kwargs and fit_kwargs.
        """

        # Calculate the forecast periods based on the number of placeholders in the data
        self.periods = utils.count_placeholders(self.request_df.loc[:, 'y'])

        # Set the row count in the original request
        self.request_row_count = len(self.request_df) + len(self.NaT_df)

        # Set default values which will be used if an argument is not passed
        self.load_script = False
        self.result_type = 'yhat'
        self.take_log = False
        self.seasonality = 'yearly'
        self.seasonality_mode = None
        self.debug = False
        self.freq = 'D'
        self.cap = None
        self.floor = None
        self.growth = None
        self.changepoint_prior_scale = None
        self.interval_width = None
        self.name = None
        self.period = None
        self.fourier_order = None
        self.mode = None
        self.seasonality_prior_scale = None
        self.holidays_prior_scale = None
        self.mcmc_samples = None
        self.seed = None
        self.n_changepoints = None
        self.changepoint_range = None
        self.uncertainty_samples = None
        self.is_seasonality_request = False
        self.weekly_start = 1  # Defaulting to a Monday start for the week as used in Qlik
        self.yearly_start = 0
        self.lower_window = None
        self.upper_window = None

        # The parsed key word arguments. This stays empty if no arguments are passed in the request.
        self.kwargs = {}

        # Set optional parameters

        # Check the number of columns in the request to determine whether we have holidays and/or added regressors
        cols = len(self.request[0].rows[0].duals)
        self.has_holidays = False
        self.has_regressors = False

        # If we receive six columns (ds, y, holidays, regressors, regressor arguments and key word arguments),
        # we expect both holidays and additional regressors
        if cols == 6:
            self.has_regressors = True
        # For a request with four or more columns, we expect holidays
        if cols >= 4:
            self.has_holidays = True

        # If there are three or more columns, the last column should contain the key word arguments
        if cols < 3:
            args = None
        else:
            args = self.request[0].rows[0].duals[cols - 1].strData

        # If the key word arguments were included in the request, get the parameters and values
        if args is not None:

            # Remove all whitespace and split the string into 'key=value' entries
            entries = args.translate(str.maketrans('', '',
                                                   string.whitespace)).split(",")

            # The entries are transformed into key value pairs, with the key words in lower case.
            # Empty entries (an empty argument string or a trailing comma) are skipped,
            # and only the first '=' separates the key word from its value.
            self.kwargs = {
                key.lower(): value
                for key, value in (entry.split("=", 1)
                                   for entry in entries if entry)
            }
            kw = self.kwargs

            # Options that are switched on with the value 'true'. Any other value switches them off.
            flag_options = (
                # Determines the output format. Set to 'true' if calling the functions from the load script in the Qlik app
                'load_script',
                # Take a logarithm of y values before forecast calculations
                'take_log',
                # Generate execution logs
                'debug',
            )
            for key in flag_options:
                if key in kw:
                    setattr(self, key, 'true' == kw[key].lower())

            # Set a flag to return the seasonality plot instead
            # Only usable through the load script as the result will have a different cardinality to the request,
            # so load_script is switched on whenever this key word is passed
            if 'is_seasonality_request' in kw:
                self.is_seasonality_request = 'true' == kw[
                    'is_seasonality_request'].lower()
                self.load_script = True

            # Options stored as lower case strings: key word -> attribute
            lower_case_options = {
                # The return type. Valid values are: yhat, trend, seasonal, seasonalities, all, y_then_yhat, residual.
                # Add _lower or _upper to the series name to get lower or upper limits.
                # The special case of 'all' returns all output columns from Prophet. This can only be used with 'load_script=true'.
                # 'y_then_yhat' returns actual values for historical periods and forecast values for future periods
                # 'residual' returns y - yhat for historical periods
                'return': 'result_type',
                # The type of seasonlity requested. Used only for seasonality requests
                # Valid values are: yearly, weekly, monthly, holidays
                'seasonality': 'seasonality',
                # Useful if the seasonality is not a constant additive factor as assumed by Prophet
                # Valid values are: additive, multiplicative
                'seasonality_mode': 'seasonality_mode',
                # Name of an additional seasonality to be added to the model
                # Default seasonalities are yearly and weekly, as well as daily for sub daily data
                'add_seasonality': 'name',
                # 'additive' or 'multiplicative' mode for the additional seasonality
                # Default value follows the seasonality_mode parameter
                'add_seasonality_mode': 'mode',
            }
            for key, attribute in lower_case_options.items():
                if key in kw:
                    setattr(self, attribute, kw[key].lower())

            # Set the frequency of the timeseries. The value is case sensitive so it is used as passed.
            # Any valid frequency for pd.date_range, such as 'D' or 'M'
            # For options see: http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases
            if 'freq' in kw:
                self.freq = kw['freq']

            # Set the cap which adds an upper limit at which the forecast will saturate
            # This changes the default linear growth model to a logistic growth model
            if 'cap' in kw:
                self.cap = utils.atof(kw['cap'])
                self.growth = 'logistic'

                # Set the floor which adds a lower limit at which the forecast will saturate
                # To use a logistic growth trend with a floor, a cap must also be specified
                if 'floor' in kw:
                    self.floor = utils.atof(kw['floor'])

            # Options converted to floating point numbers: key word -> attribute
            float_options = {
                # Adjusts the trend flexibility. Default value is 0.05. Increasing it will make the trend more flexible.
                'changepoint_prior_scale': 'changepoint_prior_scale',
                # The width for the uncertainty intervals. Default value is 0.8 (i.e. 80%)
                'interval_width': 'interval_width',
                # The period of the additional seasonality, e.g. 30.5 for 'monthly' seasonality
                'seasonality_period': 'period',
                # Smooths seasonality effects. Reducing this parameter dampens seasonal effects
                'seasonality_prior_scale': 'seasonality_prior_scale',
                # Smooths holiday effects. Reducing this parameter dampens holiday effects.
                # Default is 10, which provides very little regularization.
                'holidays_prior_scale': 'holidays_prior_scale',
                # Proportion of history in which trend changepoints will be estimated.
                # Defaults to 0.8 for the first 80%.
                'changepoint_range': 'changepoint_range',
            }
            for key, attribute in float_options.items():
                if key in kw:
                    setattr(self, attribute, utils.atof(kw[key]))

            # Set the seasonality fourier terms
            # Increasing the number of Fourier terms allows the seasonality to fit faster changing cycles,
            # but can also lead to overfitting
            if 'seasonality_fourier' in kw:
                self.fourier_order = int(kw['seasonality_fourier'])

            # Options converted to integers: key word -> attribute
            integer_options = {
                # If greater than 0, Prophet will do full Bayesian inference with the specified number of MCMC samples.
                # If 0, Prophet will do MAP estimation. Default is 0.
                'mcmc_samples': 'mcmc_samples',
                # Random seed that can be used to control stochasticity.
                'random_seed': 'seed',
                # Number of potential changepoints to include. Default value is 25.
                'n_changepoints': 'n_changepoints',
                # Number of simulated draws used to estimate uncertainty intervals.
                'uncertainty_samples': 'uncertainty_samples',
                # Offset for the week start in 'weekly' seasonality requests
                'weekly_start': 'weekly_start',
                # Offset for the year start in 'yearly' seasonality requests
                'yearly_start': 'yearly_start',
                # Extends the holidays by lower_window number of days before the date
                'lower_window': 'lower_window',
                # Extends the holidays by upper_window number of days after the date
                'upper_window': 'upper_window',
            }
            for key, attribute in integer_options.items():
                if key in kw:
                    setattr(self, attribute, utils.atoi(kw[key]))

            # Set the random seed for numpy if one was passed
            if 'random_seed' in kw:
                np.random.seed(self.seed)

        # Populate the key word arguments for the Prophet(), make_future_dataframe(), add_seasonality()
        # and fit() functions. Only parameters that have been set (i.e. are not None) are included.

        # Key word arguments for the Prophet() function
        prophet_params = [
            'seasonality_mode', 'growth', 'changepoint_prior_scale',
            'interval_width', 'seasonality_prior_scale',
            'holidays_prior_scale', 'mcmc_samples', 'n_changepoints',
            'changepoint_range', 'uncertainty_samples'
        ]
        self.prophet_kwargs = self._populate_dict(prophet_params)

        # Key word arguments for the make_future_dataframe() function
        make_params = ['periods', 'freq']
        self.make_kwargs = self._populate_dict(make_params)

        # Key word arguments for the add_seasonality() function
        seasonality_params = ['name', 'period', 'fourier_order', 'mode']
        self.add_seasonality_kwargs = self._populate_dict(seasonality_params)

        # Key word arguments for the fit() function
        # The random seed is only passed to the fit method if MCMC is being used
        self.fit_kwargs = {}
        if self.mcmc_samples is not None and self.mcmc_samples > 0:
            fit_params = ['seed']
            self.fit_kwargs = self._populate_dict(fit_params)

    def _populate_dict(self, params):
        """
        Populate a dictionary based on a list of parameters. 
        The parameters should already exist in this object.
        """

        output_dict = {}

        for prop in params:
            if getattr(self, prop) is not None:
                output_dict[prop] = getattr(self, prop)

        return output_dict

    def _prep_holidays(self):
        """
        Prepare the holidays data frame.
        The request should contain a holiday column which provides the holidays for past and future dates.
        The column provides holiday names, while the ds column provides the holiday's date. 
        Rows without a holiday name are considered non-holidays and not part of the holiday data frame.
        """

        # Create a holidays data frame
        self.holidays_df = pd.DataFrame([(row.duals[0].numData, row.duals[2].strData)\
                                            for request_rows in self.request\
                                            for row in request_rows.rows],\
                                        columns=['ds','holiday'])

        # Add upper and lower window for the holidays if applicable
        if self.lower_window is not None:
            self.holidays_df.loc[:, 'lower_window'] = self.lower_window
        if self.upper_window is not None:
            self.holidays_df.loc[:, 'upper_window'] = self.upper_window

        # Copy dates from the request_df
        self.holidays_df.loc[:, 'ds'] = self.request_df.loc[:, 'ds'].copy()

        # Remove rows from the holidays data frame where the holiday or ds column is empty
        self.holidays_df = self.holidays_df.loc[self.holidays_df.holiday != '']
        self.holidays_df = self.holidays_df.loc[self.holidays_df.ds.notnull()]

        # If the holidays data frame is empty we don't need to add it to the key word arguments for prophet
        if self.holidays_df.empty:
            self.has_holidays = False
            return

        # Make the holidays names lower case to avoid the return argument becoming case sensitive
        self.holidays_df.loc[:,
                             'holiday'] = self.holidays_df.holiday.str.lower()
        # Also remove spaces and apostrophes
        self.holidays_df.loc[:,
                             'holiday'] = self.holidays_df.holiday.str.replace(
                                 " ", "_")
        self.holidays_df.loc[:,
                             'holiday'] = self.holidays_df.holiday.str.replace(
                                 "'", "")

        # Sort by the ds column and reset indexes
        self.holidays_df = self.holidays_df.sort_values('ds').reset_index(
            drop=True)

        # Finally add this to the key word argumemnts for Prophet
        self.prophet_kwargs['holidays'] = self.holidays_df

    def _prep_regressors(self):
        """
        Parse the request for additional regressors and arguments.
        The regressors are expected as a string of pipe separated values.
        e.g. a single entry with three regressors could be '1.2|200|3'
        
        Arguments for the regressors can be passed in a separate string of keyword arguments.
        The keyword and the value should be separated by equals signs, different keywords by commas, and arguments for different regressors by pipe.
        If a single set of arguments is provided (i.e. no pipe characters are found), we apply the same arguments to all regressors.
        e.g. 'prior_scale=10, mode=additive| mode=multiplicative| mode=multiplicative' for specifying different arguments per regressor
              or 'mode=additive' for using the same arguments for all regressors.

        Returns a data frame with the additional regressors.
        """

        # Create a Pandas Data Frame with additional regressors and their keyword arguments
        self.regressors_df = pd.DataFrame([(row.duals[0].numData, row.duals[3].strData, row.duals[4].strData) \
            for request_rows in self.request \
                for row in request_rows.rows], \
                    columns=['ds', 'regressors', 'kwargs'])

        # Handle null value rows in the request dataset
        self.regressors_df = self.regressors_df.loc[
            self.regressors_df.ds.notnull()]

        # Check if the regressors column is empty
        if len(self.regressors_df.regressors.unique()) == 1:
            # Return without further processing
            self.has_regressors = False
            if self.debug:
                self._print_log(7)
            return None

        # Get the regressor arguments as a string
        arg_string = self.regressors_df.loc[0, 'kwargs']

        # Add kwargs for regressors to a list of dictionaries
        self.regressor_kwargs = []
        for kwargs_string in arg_string.replace(' ', '').split('|'):
            if len(kwargs_string) > 0:
                kwargs = {}
                for kv in kwargs_string.split(','):
                    pair = kv.split('=')
                    if 'prior_scale' in pair[0]:
                        pair[1] = utils.atof(pair[1])
                    if 'standardize' in pair[0] and pair[1].lower() != 'auto':
                        pair[1] = 'true' == pair[1].lower()
                    kwargs[pair[0]] = pair[1]
                self.regressor_kwargs.append(kwargs)

        # Split up the additional regressors into multiple columns
        self.regressors_df = pd.DataFrame(self.regressors_df.regressors.str.split('|', expand=True).values, \
            index=self.regressors_df.index).add_prefix('regressor_')

        # Convert the strings to floats
        self.regressors_df = self.regressors_df.applymap(utils.atof)

        # Copy dates from the request_df
        self.regressors_df.loc[:, 'ds'] = self.request_df.loc[:, 'ds'].copy()

        # Sort by the ds column and reset indexes
        self.regressors_df = self.regressors_df.sort_values('ds').reset_index(
            drop=True).drop(columns=['ds'])

        # If there are no regressor kwargs add empty dictionaries
        if len(self.regressor_kwargs) == 0:
            self.regressor_kwargs = [{} for c in self.regressors_df.columns]
        # If there is just 1 dictionary, replicate it for each regressor
        elif len(self.regressor_kwargs) == 1:
            kwargs = self.regressor_kwargs[0].copy()
            self.regressor_kwargs = [
                kwargs for c in self.regressors_df.columns
            ]
        elif len(self.regressor_kwargs) != len(self.regressors_df.columns):
            err = "The number of additional regressors does not match the keyword arguments provided for the regressors."
            raise IndexError(err)

        return self.regressors_df

    def _forecast(self):
        """
        Execute the forecast algorithm according to the request type
        """

        # If this is a seasonality request, we need to return the relevant seasonlity component
        if self.is_seasonality_request:

            if self.seasonality == 'weekly':
                # Prepare the seasonality data frame
                # Parameter start needs to be any arbitrary week starting on a Sunday
                days = (pd.date_range(start='2017-01-01', periods=7) +
                        pd.Timedelta(days=self.weekly_start))
                df_w = plot.seasonality_plot_df(self.model, days)

                # Calculate seasonal components
                self.forecast = self.model.predict_seasonal_components(df_w)

            elif self.seasonality == 'yearly':
                # Prepare the seasonality data frame
                # Parameter start needs to be 1st January for any arbitrary year
                days = (pd.date_range(start='2017-01-01', periods=365) +
                        pd.Timedelta(days=self.yearly_start))
                df_y = plot.seasonality_plot_df(self.model, days)

                # Calculate seasonal components
                self.forecast = self.model.predict_seasonal_components(df_y)

            else:
                # Prepare the seasonality data frame
                start = pd.to_datetime('2017-01-01 0000')
                period = self.model.seasonalities[self.seasonality]['period']

                end = start + pd.Timedelta(days=period)
                # plot_points = 200
                # plot_points is used instead of period below in fbprophet/forecaster.py.
                # However, it seems to make more sense to use period given the expected usage in Qlik
                intervals = pd.to_datetime(
                    np.linspace(start.value, end.value, period))

                df_x = plot.seasonality_plot_df(self.model, intervals)

                # Calculate seasonal components
                self.forecast = self.model.predict_seasonal_components(df_x)

            # Set the correct sort order for the response
            try:
                self.forecast = self.forecast.reindex(self.sort_order.index)
            except AttributeError:
                pass

        # For standard forecast the output rows equal the input rows
        else:
            # Prepare the forecast
            self.forecast = self.model.predict(self.future_df)

            # For return=y_then_yhat[_upper / _lower] we return y values followed by relevant results for the forecast periods
            if 'y_then_yhat' in self.result_type:
                relevant_result = self.result_type.replace('y_then_', '')

                # Copy yhat / yhat_upper / yhat_lower values to the new column
                self.forecast.loc[:, self.
                                  result_type] = self.forecast.loc[:,
                                                                   relevant_result]

                if 'upper' in self.result_type or 'lower' in self.result_type:
                    # Overwrite historic values with Nulls
                    self.forecast.loc[:len(self.forecast) - self.periods - 1, self.result_type] \
                    = np.NaN
                else:
                    # Overwrite with y values for historic data
                    self.forecast.loc[:len(self.forecast) - self.periods - 1, self.result_type] \
                    = self.input_df.loc[:len(self.request_df) - self.periods - 1, 'y']

            # For return=residual we return y - yhat for historical periods and Null for future periods
            elif 'residual' in self.result_type:
                # Create the residuals for historical periods by subtracting yhat from y
                self.forecast.loc[:len(
                    self.request_df
                ) - self.periods - 1, self.result_type] = self.input_df.loc[:len(
                    self.request_df
                ) - self.periods - 1, 'y'] - self.forecast.loc[:len(
                    self.request_df) - self.periods - 1, 'yhat']

            # Update to the original index from the request data frame
            self.forecast.index = self.request_index.index

            # Reset to the original sort order of the data sent by Qlik
            self.forecast = self.forecast.sort_index()

        # Undo the logarithmic conversion if it was applied during initialization
        if self.take_log:
            if self.result_type == 'all':
                self.forecast.loc[:, self.forecast.columns != 'ds'] = np.exp(
                    self.forecast.loc[:, self.forecast.columns != 'ds'])
            else:
                self.forecast.loc[:, self.result_type] = np.exp(
                    self.forecast.loc[:, self.result_type])

        # Add back the null row if it was received in the request
        if len(self.NaT_df) > 0:
            if self.result_type == 'all':
                col = 'yhat'
            else:
                col = self.result_type
            self.NaT_df = self.NaT_df.rename({'y': col}, axis='columns')
            self.forecast = self.forecast.append(self.NaT_df)

    def _send_table_description(self):
        """
        Describe the response table and send the description to Qlik as initial meta data.
        Only used when the SSE is called from the Qlik load script.

        The description is named "ProphetForecast", has as many rows as self.response and
        is stored in self.table before being sent through self.context.
        """

        # Set up the table description to send as metadata to Qlik
        description = SSE.TableDescription()
        self.table = description
        description.name = "ProphetForecast"
        description.numberOfRows = len(self.response)

        # Work out the (name, dataType) pair for each field of the table.
        # NOTE(review): dataType 0 is only used for 'ds' and 1 for everything else;
        # presumably these are the string and numeric types of the SSE protocol - confirm.
        if self.is_seasonality_request:
            field_specs = [(col, 1) for col in self.response.columns]
        elif self.result_type == 'all':
            field_specs = [(col, 0 if col == 'ds' else 1)
                           for col in self.response.columns]
        else:
            field_specs = [("ds", 0), (self.result_type, 1)]

        # Set up fields for the table
        for field_name, field_type in field_specs:
            description.fields.add(name=field_name, dataType=field_type)

        if self.debug:
            self._print_log(6)

        # Send table description
        serialized = description.SerializeToString()
        self.context.send_initial_metadata(
            (('qlik-tabledescription-bin', serialized), ))

    def _print_log(self, step):
        """
        Output useful information to stdout and the log file if debugging is required.
        step: Print the corresponding step in the log
        """

        # Set mode to append to log file
        mode = 'a'

        if step == 1:
            # Increment log counter for the class. Each instance of the class generates a new log.
            self.__class__.log_no += 1

            # Create a log file for the instance
            # Logs will be stored in ..\logs\Prophet Log <n>.txt
            self.logfile = os.path.join(
                os.getcwd(), 'logs', 'Prophet Log {}.txt'.format(self.log_no))

            # Output log header
            output = "ProphetForQlik Log: {0} \n\n".format(
                time.ctime(time.time()))
            # Set mode to write new log file
            mode = 'w'

        elif step == 2:
            # Output the request and input data frames
            output = "Prophet parameters: {0}\n\n".format(self.kwargs)
            output += "Instance creation parameters: {0}\n\n".format(
                self.prophet_kwargs)
            output += "Make future data frame parameters: {0}\n\n".format(
                self.make_kwargs)
            output += "Add seasonality parameters: {0}\n\n".format(
                self.add_seasonality_kwargs)
            output += "Fit parameters: {0}\n\n".format(self.fit_kwargs)
            if self.has_regressors and len(self.regressor_kwargs):
                output += "Additional regressor parameters: {0}\n\n".format(
                    self.regressor_kwargs)
            output += "REQUEST DATA FRAME: {0} rows x cols\n\n".format(
                self.request_df.shape)
            output += "{0}\n...\n{1}\n\n".format(
                self.request_df.head(5).to_string(),
                self.request_df.tail(5).to_string())
            if len(self.NaT_df) > 0:
                output += "REQUEST NULL VALUES DATA FRAME: {0} rows x cols\n\n".format(
                    self.NaT_df.shape)
                output += "{0} \n\n".format(self.NaT_df.to_string())
            output += "INPUT DATA FRAME: {0} rows x cols\n\n".format(
                self.input_df.shape)
            output += "{0}\n...\n{1}\n\n".format(
                self.input_df.head(5).to_string(),
                self.input_df.tail(5).to_string())
            if self.has_holidays:
                output += "HOLIDAYS DATA FRAME: {0} rows x cols\n\n".format(
                    self.holidays_df.shape)
                output += "{0} \n\n".format(self.holidays_df.to_string())

        elif step == 3:
            # Output in case the input contains less than 2 non-Null rows
            output = "\nForecast cannot be generated as the request contains less than two non-Null rows\n\n"

        elif step == 4:
            # Output the future data frame
            output = "\nFUTURE DATA FRAME: {0} rows x cols\n\n".format(
                self.future_df.shape)
            output += "{0}\n...\n{1}\n\n".format(
                self.future_df.head(5).to_string(),
                self.future_df.tail(5).to_string())

        elif step == 5:
            # Output the forecast data frame and returned series
            output = "\nFORECAST DATA FRAME: {0} rows x cols\n\n".format(
                self.forecast.shape)
            output += "RESULT COLUMNS:\n\n"
            for col in self.forecast:
                output += "{}\n".format(col)

            output += "\nSAMPLE RESULTS:\n{0} \n\n".format(
                self.forecast.tail(5).to_string())

            result = self.response if self.load_script else self.forecast
            cols = result.columns if self.result_type == 'all' else [
                'ds', self.result_type
            ]
            output += "FORECAST RETURNED:\n{0}\n...\n{1}\n\n".format(result.loc[:, cols].head(5).to_string(),\
                result.loc[:, cols].tail(5).to_string())

        elif step == 6:
            # Print the table description if the call was made from the load script
            output = "\nTABLE DESCRIPTION SENT TO QLIK:\n\n{0} \n\n".format(
                self.table)

        elif step == 7:
            # Inform of fall back when additional regressors are incorrect
            output = "\nAdditional regressors have not been passed correctly. Falling back to a basic model.\n\n"

        sys.stdout.write(output)
        with open(self.logfile, mode, encoding='utf-8') as f:
            f.write(output)

    @staticmethod
    def timeit(request):
        """
        Time the different components of the forecast.

        Each of the following is executed once and its duration is written to stdout and
        appended to logs\\Prophet Performance Log.txt in the current working directory:
            - creating an instance of ProphetForQlik from the request
            - calculating the forecast
            - converting the results to SSE.Dual
            - converting the duals to SSE.Row

        :param request: the request as passed to the ProphetForQlik initializer.
            NOTE(review): the request is consumed more than once here, so it presumably
            needs to be re-iterable - confirm against the caller.
        """

        import timeit
        import ServerSideExtension_pb2 as SSE

        # Create a log file for the performance results
        log_dir = os.path.join(os.getcwd(), 'logs')
        os.makedirs(log_dir, exist_ok=True)
        logfile = os.path.join(log_dir, 'Prophet Performance Log.txt')

        def to_duals(forecast):
            # Wrap each result in an iterator over a single numerical dual
            return forecast.apply(
                lambda result: iter([SSE.Dual(numData=result)]))

        def to_rows(response_rows):
            # Wrap each set of duals in a row and collect the rows into a list
            return response_rows.apply(
                lambda duals: SSE.Row(duals=duals)).tolist()

        def report(label, func):
            # Run func once and record the time taken on stdout and in the log file.
            # A callable is passed to the timer so that no names have to be made
            # visible to it through the builtins module.
            elapsed = timeit.Timer(func).timeit(1)
            message = "{}: {}\n".format(label, elapsed)
            sys.stdout.write(message)
            with open(logfile, 'a', encoding='utf-8') as f:
                f.write(message)

        # Prepare the inputs required by the later steps before any timing takes place
        predictor = ProphetForQlik(request)
        forecast = predictor.predict()
        response_rows = to_duals(forecast)

        report("Time taken to create an instance of ProphetForQlik",
               lambda: ProphetForQlik(request))
        report("Time taken to calculate the forecast",
               lambda: predictor.predict())
        report("Time taken to convert results to SSE.Dual",
               lambda: to_duals(forecast))
        report("Time taken to convert duals to SSE.Row",
               lambda: to_rows(response_rows))