def __init__(self, request, context, variant="standard"):
    """
    Class initializer.
    :param request: an iterable sequence of RowData
    :param context:
    :param variant: a string to indicate the request format
    :Sets up the input data frame and parameters based on the request
    """

    # Set the request, context and variant variables for this object instance
    self.request = request
    self.context = context
    self.variant = variant

    if variant == "two_dims":
        row_template = ['strData', 'strData', 'numData', 'strData']
        col_headers = ['key', 'dim', 'measure', 'kwargs']
    elif variant == "lat_long":
        row_template = ['strData', 'numData', 'numData', 'strData']
        col_headers = ['key', 'lat', 'long', 'kwargs']
    else:
        row_template = ['strData', 'strData', 'strData']
        col_headers = ['key', 'measures', 'kwargs']

    # Create a Pandas Data Frame for the request data
    self.request_df = utils.request_df(request, row_template, col_headers)

    # Handle null value rows in the request dataset
    self.NaN_df = self.request_df.loc[self.request_df['key'].str.len() == 0].copy()

    # If null rows exist they will be sliced off and then added back to the response
    if len(self.NaN_df) > 0:
        self.request_df = self.request_df.loc[self.request_df['key'].str.len() != 0]

    # Get additional arguments from the 'kwargs' column in the request data
    # Arguments should take the form of a comma separated string: 'arg1=value1, arg2=value2'
    kwargs = self.request_df.loc[0, 'kwargs']
    self._set_params(kwargs)

    # Additional information is printed to the terminal and logs if the parameter debug = true
    if self.debug:
        # Increment the log counter for the class. Each instance of the class generates a new log.
        self.__class__.log_no += 1

        # Create a log file for the instance
        # Logs will be stored in ..\logs\Cluster Log <n>.txt
        self.logfile = os.path.join(os.getcwd(), 'logs', 'Cluster Log {}.txt'.format(self.log_no))

        self._print_log(1)

    # Set up an input Data Frame, excluding the arguments column
    self.input_df = self.request_df.loc[:, self.request_df.columns.difference(['kwargs'])]

    # For the two_dims variant we pivot the data to change dim into columns, with key as the index
    if variant == "two_dims":
        self.input_df = self.input_df.pivot(index='key', columns='dim')
    # For the other two variants we also set the index to the 'key' column
    else:
        self.input_df = self.input_df.set_index('key')

    # For the standard variant we split the measures string into multiple columns and make the values numeric
    if variant == "standard":
        self.input_df = pd.DataFrame([s.split(';') for r in self.input_df.values for s in r],
                                     index=self.input_df.index)

        # Convert strings to numbers using locale settings
        self.input_df = self.input_df.applymap(lambda s: utils.atof(s) if s else np.NaN)

    # Finally we prepare the data for the clustering algorithm:

    # If scaling does not need to be applied, we just fill in missing values
    if self.scaler == "none":
        self.input_df = utils.fillna(self.input_df, method=self.missing)
    # Otherwise we apply strategies for both filling missing values and then scaling the data
    else:
        self.input_df = utils.scale(self.input_df, missing=self.missing, scaler=self.scaler,
                                    **self.scaler_kwargs)

    # For the lat_long variant we do some additional transformations
    if self.variant == "lat_long":
        # The input values are converted to radians
        self.input_df = self.input_df.apply(np.radians)

    if self.debug:
        self._print_log(2)
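# --- Illustrative sketch (not part of the SSE implementation) ---
# A minimal, self-contained sketch of how the standard variant parses each row's
# semicolon-delimited measures string into a numeric frame. The sample data below is
# hypothetical, and float() stands in for the locale-aware utils.atof used above.
import numpy as np
import pandas as pd

sample = pd.DataFrame({'measures': ['1.5;2;3', '4;;6']}, index=['a', 'b'])
parsed = pd.DataFrame([s.split(';') for r in sample.values for s in r], index=sample.index)
# Empty strings become NaN, mirroring the applymap(... if s else np.NaN) step above
parsed = parsed.applymap(lambda s: float(s) if s else np.NaN)
print(parsed)
#      0    1    2
# a  1.5  2.0  3.0
# b  4.0  NaN  6.0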
def _correlation(request, context):
    """
    Calculate the correlation coefficient for two columns. Scalar function.
    :param request: an iterable sequence of RowData
    :param context:
    :return: the correlation coefficient for each row
    :Qlik expression examples:
    :<AAI Connection Name>.Pearson('1;NA;3;4;5;6.9', ';11;12;;14;')
    :<AAI Connection Name>.Correlation('1;NA;3;4;5;6.9', ';11;12;;14;', 'pearson')
    :Possible values for the third argument are 'pearson', 'kendall' or 'spearman'
    """
    # Iterate over bundled rows
    for request_rows in request:
        response_rows = []

        # Set to True for additional info in terminal and log file
        debug = False

        if debug:
            # Create a log file for the function
            logfile = os.path.join(os.getcwd(), 'logs', 'Correlation Log.txt')

            sys.stdout.write("Function Call: {0} \n\n".format(time.ctime(time.time())))
            with open(logfile, 'a') as f:
                f.write("Function Call: {0} \n\n".format(time.ctime(time.time())))

        # Iterate over rows
        for row in request_rows.rows:
            # Retrieve the values of the parameters
            # Two or three columns are sent from the client, hence the length of params will be 2 or 3
            params = [col.strData for col in row.duals]

            if debug:
                sys.stdout.write("\nPARAMETERS:\n\n{0}\n".format("\n\n".join(str(x) for x in params)))
                with open(logfile, 'a') as f:
                    f.write("\nPARAMETERS:\n\n{0}\n".format("\n\n".join(str(x) for x in params)))

            # Create lists for the two series
            x = params[0].split(";")
            y = params[1].split(";")

            # Set the correlation type based on the third argument.
            # Default is Pearson if the arg is missing.
            try:
                corr_type = params[2].lower()
            except IndexError:
                corr_type = 'pearson'

            if debug:
                sys.stdout.write("\n\nx ({0:d} data points):\n{1}\n".format(len(x), " ".join(str(v) for v in x)))
                sys.stdout.write("\ny ({0:d} data points):\n{1}\n".format(len(y), " ".join(str(v) for v in y)))
                sys.stdout.write("\nCorrelation Type: {0}\n\n".format(corr_type))
                with open(logfile, 'a') as f:
                    f.write("\n\nx ({0:d} data points):\n{1}\n".format(len(x), " ".join(str(v) for v in x)))
                    f.write("\ny ({0:d} data points):\n{1}\n".format(len(y), " ".join(str(v) for v in y)))
                    f.write("\nCorrelation Type: {0}\n\n".format(corr_type))

            # Check that the lists are of equal length
            if len(x) == len(y) and len(x) > 0:
                # Create a Pandas data frame using the lists
                df = pd.DataFrame({'x': [utils.atof(d) for d in x],
                                   'y': [utils.atof(d) for d in y]})

                # Calculate the correlation matrix for the two series in the data frame
                corr_matrix = df.corr(method=corr_type)

                if debug:
                    sys.stdout.write("\n\nCorrelation Matrix:\n{}\n".format(corr_matrix.to_string()))
                    with open(logfile, 'a') as f:
                        f.write("\n\nCorrelation Matrix:\n{}\n".format(corr_matrix.to_string()))

                # Prepare the result
                if corr_matrix.size > 1:
                    result = corr_matrix.iloc[0, 1]
                else:
                    result = None
            else:
                result = None

            # Create an iterable of Dual with a numerical value
            duals = iter([SSE.Dual(numData=result)])

            # Append the constructed row data to response_rows
            response_rows.append(SSE.Row(duals=duals))

        # Yield row data as bundled rows
        yield SSE.BundledRows(rows=response_rows)
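# --- Illustrative sketch (not part of the SSE implementation) ---
# A small standalone version of the correlation logic above, using the hypothetical
# inputs from the docstring's Qlik expression examples. The _to_float helper stands in
# for the locale-aware utils.atof, mapping 'NA' and empty strings to missing values.
import pandas as pd

def _to_float(s):
    try:
        return float(s)
    except ValueError:
        return None

x = '1;NA;3;4;5;6.9'.split(';')
y = ';11;12;;14;'.split(';')
df = pd.DataFrame({'x': [_to_float(d) for d in x], 'y': [_to_float(d) for d in y]})
# pandas computes the coefficient over pairwise-complete observations only
print(df.corr(method='pearson').iloc[0, 1])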
def _set_params(self, kwargs):
    """
    Set input parameters based on the request.
    :
    :Parameters implemented for the HDBSCAN() function are: algorithm, metric, min_cluster_size, min_samples,
    :p, alpha, cluster_selection_method, allow_single_cluster, match_reference_implementation.
    :More information here: https://hdbscan.readthedocs.io/en/latest/api.html#hdbscan
    :
    :Scaler types implemented for preprocessing data are: StandardScaler, MinMaxScaler, MaxAbsScaler,
    :RobustScaler and QuantileTransformer.
    :More information here: http://scikit-learn.org/stable/modules/preprocessing.html
    :
    :Additional parameters used are: load_script, return, missing, scaler, debug
    """

    # Set the row count in the original request
    self.request_row_count = len(self.request_df) + len(self.NaN_df)

    # Set default values which will be used if arguments are not passed

    # SSE parameters:
    self.load_script = False
    self.result_type = 'labels_'
    self.missing = 'zeros'
    self.scaler = 'robust'
    self.debug = False

    # HDBSCAN parameters:
    self.algorithm = None
    self.metric = None
    self.min_cluster_size = None
    self.min_samples = None
    self.p = None
    self.alpha = None
    self.cluster_selection_method = None
    self.allow_single_cluster = None
    self.match_reference_implementation = None

    # Standard scaler parameters:
    self.with_mean = None
    self.with_std = None

    # MinMax scaler parameters:
    self.feature_range = None

    # Robust scaler parameters:
    self.with_centering = None
    self.with_scaling = None
    self.quantile_range = None

    # Quantile Transformer parameters:
    self.n_quantiles = None
    self.output_distribution = None
    self.ignore_implicit_zeros = None
    self.subsample = None
    self.random_state = None

    # Adjust default options if the variant is two_dims
    if self.variant == "two_dims":
        self.load_script = True
    # Adjust default options if the variant is lat_long
    elif self.variant == "lat_long":
        self.scaler = "none"
        self.metric = "haversine"

    # Set optional parameters

    # If keyword arguments were included in the request, get the parameters and values
    if len(kwargs) > 0:
        # The parameters and values are transformed into key-value pairs
        args = kwargs.translate(str.maketrans('', '', string.whitespace)).split(",")
        self.kwargs = dict([arg.split("=") for arg in args])

        # Make sure the key words are in lower case
        self.kwargs = {k.lower(): v for k, v in self.kwargs.items()}

        # Set the load_script parameter to determine the output format
        # Set to 'true' if calling the functions from the load script in the Qlik app
        if 'load_script' in self.kwargs:
            self.load_script = 'true' == self.kwargs['load_script'].lower()

        # Set the return type
        # Valid values are: labels, probabilities, cluster_persistence, outlier_scores
        if 'return' in self.kwargs:
            self.result_type = self.kwargs['return'].lower() + '_'

        # Set the strategy for missing data
        # Valid values are: zeros, mean, median, mode
        if 'missing' in self.kwargs:
            self.missing = self.kwargs['missing'].lower()

        # Set the standardization strategy for the data
        # Valid values are: standard, minmax, maxabs, robust, quantile, none
        if 'scaler' in self.kwargs:
            self.scaler = self.kwargs['scaler'].lower()

        # Set the debug option for generating execution logs
        # Valid values are: true, false
        if 'debug' in self.kwargs:
            self.debug = 'true' == self.kwargs['debug'].lower()

        # Set optional parameters for the HDBSCAN algorithm
        # For documentation see here: https://hdbscan.readthedocs.io/en/latest/api.html#id20

        # Options are: best, generic, prims_kdtree, prims_balltree, boruvka_kdtree, boruvka_balltree
        # Default is 'best'.
        if 'algorithm' in self.kwargs:
            self.algorithm = self.kwargs['algorithm'].lower()

        # The metric to use when calculating distance between instances in a feature array.
        # More information here: https://hdbscan.readthedocs.io/en/latest/basic_hdbscan.html#what-about-different-metrics
        # And here: http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.DistanceMetric.html
        # Default is 'euclidean' for the 'standard' and 'two_dims' variants, and 'haversine' for the lat_long variant.
        if 'metric' in self.kwargs:
            self.metric = self.kwargs['metric'].lower()

        # The minimum size of clusters.
        # The default value is 5.
        if 'min_cluster_size' in self.kwargs:
            self.min_cluster_size = utils.atoi(self.kwargs['min_cluster_size'])

        # The number of samples in a neighbourhood for a point to be considered a core point.
        if 'min_samples' in self.kwargs:
            self.min_samples = utils.atoi(self.kwargs['min_samples'])

        # The p value to use if using the minkowski metric.
        if 'p' in self.kwargs:
            self.p = utils.atoi(self.kwargs['p'])

        # A distance scaling parameter as used in robust single linkage.
        if 'alpha' in self.kwargs:
            self.alpha = utils.atof(self.kwargs['alpha'])

        # The method used to select clusters from the condensed tree.
        # Options are: eom, leaf.
        if 'cluster_selection_method' in self.kwargs:
            self.cluster_selection_method = self.kwargs['cluster_selection_method'].lower()

        # By default HDBSCAN* will not produce a single cluster.
        # Setting this to True will override that behavior and allow single cluster results.
        if 'allow_single_cluster' in self.kwargs:
            self.allow_single_cluster = 'true' == self.kwargs['allow_single_cluster'].lower()

        # There exist some interpretational differences between this HDBSCAN implementation
        # and the original author's reference implementation in Java.
        # Note that there is a performance cost for setting this to True.
        if 'match_reference_implementation' in self.kwargs:
            self.match_reference_implementation = 'true' == self.kwargs['match_reference_implementation'].lower()

        # Set optional parameters for the scaler functions

        # Parameters for the Standard scaler
        # http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
        if self.scaler == 'standard':
            if 'with_mean' in self.kwargs:
                self.with_mean = 'true' == self.kwargs['with_mean'].lower()

            if 'with_std' in self.kwargs:
                self.with_std = 'true' == self.kwargs['with_std'].lower()

        # Parameters for the MinMax scaler
        # http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
        if self.scaler == 'minmax':
            if 'feature_range' in self.kwargs:
                self.feature_range = ''.join(c for c in self.kwargs['feature_range'] if c not in '()').split(';')
                self.feature_range = (utils.atoi(self.feature_range[0]), utils.atoi(self.feature_range[1]))

        # Parameters for the Robust scaler
        # http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html
        if self.scaler == 'robust':
            if 'with_centering' in self.kwargs:
                self.with_centering = 'true' == self.kwargs['with_centering'].lower()

            if 'with_scaling' in self.kwargs:
                self.with_scaling = 'true' == self.kwargs['with_scaling'].lower()

            if 'quantile_range' in self.kwargs:
                self.quantile_range = ''.join(c for c in self.kwargs['quantile_range'] if c not in '()').split(';')
                self.quantile_range = (utils.atof(self.quantile_range[0]), utils.atof(self.quantile_range[1]))

        # Parameters for the Quantile Transformer
        # http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html
        if self.scaler == 'quantile':
            if 'n_quantiles' in self.kwargs:
                self.n_quantiles = utils.atoi(self.kwargs['n_quantiles'])

            if 'output_distribution' in self.kwargs:
                self.output_distribution = self.kwargs['output_distribution'].lower()

            if 'ignore_implicit_zeros' in self.kwargs:
                self.ignore_implicit_zeros = 'true' == self.kwargs['ignore_implicit_zeros'].lower()

            if 'subsample' in self.kwargs:
                self.subsample = utils.atoi(self.kwargs['subsample'])

            if 'random_state' in self.kwargs:
                self.random_state = utils.atoi(self.kwargs['random_state'])

    # Set up a list of possible keyword arguments for the HDBSCAN() function
    hdbscan_params = ['algorithm', 'metric', 'min_cluster_size', 'min_samples', 'p', 'alpha',
                      'cluster_selection_method', 'allow_single_cluster', 'match_reference_implementation']

    # Create a dictionary of keyword arguments for the HDBSCAN() function
    self.hdbscan_kwargs = self._populate_dict(hdbscan_params)

    # Set up a list of possible keyword arguments for the sklearn preprocessing functions
    scaler_params = ['with_mean', 'with_std', 'feature_range', 'with_centering', 'with_scaling',
                     'quantile_range', 'n_quantiles', 'output_distribution', 'ignore_implicit_zeros',
                     'subsample', 'random_state']

    # Create a dictionary of keyword arguments for the scaler functions
    self.scaler_kwargs = self._populate_dict(scaler_params)
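# --- Illustrative sketch (not part of the SSE implementation) ---
# A quick standalone illustration (with a made-up argument string) of how the
# comma-separated kwargs column is turned into the dictionary consumed above.
# Note that stripping string.whitespace means values themselves cannot contain spaces.
import string

raw = 'scaler=standard, min_cluster_size=10, debug=true'
args = raw.translate(str.maketrans('', '', string.whitespace)).split(',')
kwargs = dict(arg.split('=') for arg in args)
kwargs = {k.lower(): v for k, v in kwargs.items()}
print(kwargs)  # {'scaler': 'standard', 'min_cluster_size': '10', 'debug': 'true'}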
def _set_params(self):
    """
    Set input parameters based on the request.
    Parameters implemented for the Prophet() function are: growth, cap, floor, changepoint_prior_scale, interval_width
    Parameters implemented for the make_future_dataframe() function are: freq, periods
    Parameters implemented for seasonality are: add_seasonality, seasonality_period, seasonality_fourier, seasonality_prior_scale
    Parameters implemented for holidays are: holidays_prior_scale, lower_window, upper_window
    Additional parameters for seasonality requests are: weekly_start, yearly_start
    Additional parameters used are: return, take_log, seasonality, debug
    """

    # Calculate the forecast periods based on the number of placeholders in the data
    self.periods = utils.count_placeholders(self.request_df.loc[:, 'y'])

    # Set the row count in the original request
    self.request_row_count = len(self.request_df) + len(self.NaT_df)

    # Set default values which will be used if an argument is not passed
    self.load_script = False
    self.result_type = 'yhat'
    self.take_log = False
    self.seasonality = 'yearly'
    self.seasonality_mode = None
    self.debug = False
    self.freq = 'D'
    self.cap = None
    self.floor = None
    self.growth = None
    self.changepoint_prior_scale = None
    self.interval_width = None
    self.name = None
    self.period = None
    self.fourier_order = None
    self.mode = None
    self.seasonality_prior_scale = None
    self.holidays_prior_scale = None
    self.mcmc_samples = None
    self.seed = None
    self.n_changepoints = None
    self.changepoint_range = None
    self.uncertainty_samples = None
    self.is_seasonality_request = False
    self.weekly_start = 6  # Defaulting to a Monday start for the week as used in Qlik
    self.yearly_start = 0
    self.lower_window = None
    self.upper_window = None

    # Set optional parameters

    # Check if there is a fourth column in the request
    try:
        # If there is a fourth column, it is assumed to contain the keyword arguments
        args = self.request[0].rows[0].duals[3].strData

        # The third column should then provide the holiday name or null for each row
        self.has_holidays = True
    except IndexError:
        # If there is no fourth column, the request does not include holidays
        self.has_holidays = False

    # If the fourth column did not exist, we try again with the third column
    if not self.has_holidays:
        try:
            args = self.request[0].rows[0].duals[2].strData
        except IndexError:
            args = None

    # If keyword arguments were included in the request, get the parameters and values
    if args is not None:
        # The parameters and values are transformed into key-value pairs
        args = args.translate(str.maketrans('', '', string.whitespace)).split(",")
        self.kwargs = dict([arg.split("=") for arg in args])

        # Make sure the key words are in lower case
        self.kwargs = {k.lower(): v for k, v in self.kwargs.items()}

        # Set the load_script parameter to determine the output format
        # Set to 'true' if calling the functions from the load script in the Qlik app
        if 'load_script' in self.kwargs:
            self.load_script = 'true' == self.kwargs['load_script'].lower()

        # Set the return type
        # Valid values are: yhat, trend, seasonal, seasonalities.
        # Add _lower or _upper to the series name to get lower or upper limits.
        if 'return' in self.kwargs:
            self.result_type = self.kwargs['return'].lower()

        # Set the option to take a logarithm of y values before forecast calculations
        # Valid values are: true, false
        if 'take_log' in self.kwargs:
            self.take_log = 'true' == self.kwargs['take_log'].lower()

        # Set the type of seasonality requested. Used only for seasonality requests
        # Valid values are: yearly, weekly, monthly, holidays
        if 'seasonality' in self.kwargs:
            self.seasonality = self.kwargs['seasonality'].lower()

        # Set the seasonality mode. Useful if the seasonality is not a constant additive factor as assumed by Prophet
        # Valid values are: additive, multiplicative
        if 'seasonality_mode' in self.kwargs:
            self.seasonality_mode = self.kwargs['seasonality_mode'].lower()

        # Set the debug option for generating execution logs
        # Valid values are: true, false
        if 'debug' in self.kwargs:
            self.debug = 'true' == self.kwargs['debug'].lower()

        # Set the frequency of the timeseries
        # Any valid frequency for pd.date_range, such as 'D' or 'M'
        # For options see: http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases
        if 'freq' in self.kwargs:
            self.freq = self.kwargs['freq']

        # Set the cap which adds an upper limit at which the forecast will saturate
        # This changes the default linear growth model to a logistic growth model
        if 'cap' in self.kwargs:
            self.cap = utils.atof(self.kwargs['cap'])
            self.growth = 'logistic'

            # Set the floor which adds a lower limit at which the forecast will saturate
            # To use a logistic growth trend with a floor, a cap must also be specified
            if 'floor' in self.kwargs:
                self.floor = utils.atof(self.kwargs['floor'])

        # Set the changepoint_prior_scale to adjust the trend flexibility
        # If the trend changes are being overfit (too much flexibility) or underfit (not enough flexibility),
        # you can adjust the strength of the sparse prior.
        # Default value is 0.05. Increasing it will make the trend more flexible.
        if 'changepoint_prior_scale' in self.kwargs:
            self.changepoint_prior_scale = utils.atof(self.kwargs['changepoint_prior_scale'])

        # Set the width for the uncertainty intervals
        # Default value is 0.8 (i.e. 80%)
        if 'interval_width' in self.kwargs:
            self.interval_width = utils.atof(self.kwargs['interval_width'])

        # Set additional seasonality to be added to the model
        # Default seasonalities are yearly and weekly, as well as daily for sub daily data
        if 'add_seasonality' in self.kwargs:
            self.name = self.kwargs['add_seasonality'].lower()

        # Set 'additive' or 'multiplicative' mode for the additional seasonality
        # Default value follows the seasonality_mode parameter
        if 'add_seasonality_mode' in self.kwargs:
            self.mode = self.kwargs['add_seasonality_mode'].lower()

        # Set the seasonality period
        # e.g. 30.5 for 'monthly' seasonality
        if 'seasonality_period' in self.kwargs:
            self.period = utils.atof(self.kwargs['seasonality_period'])

        # Set the seasonality fourier terms
        # Increasing the number of Fourier terms allows the seasonality to fit faster changing cycles,
        # but can also lead to overfitting
        if 'seasonality_fourier' in self.kwargs:
            self.fourier_order = int(self.kwargs['seasonality_fourier'])

        # Set the seasonality prior scale to smooth seasonality effects.
        # Reducing this parameter dampens seasonal effects
        if 'seasonality_prior_scale' in self.kwargs:
            self.seasonality_prior_scale = utils.atof(self.kwargs['seasonality_prior_scale'])

        # Set the holiday prior scale to smooth holiday effects.
        # Reducing this parameter dampens holiday effects. Default is 10, which provides very little regularization.
        if 'holidays_prior_scale' in self.kwargs:
            self.holidays_prior_scale = utils.atof(self.kwargs['holidays_prior_scale'])

        # Set the number of MCMC samples.
        # If greater than 0, Prophet will do full Bayesian inference with the specified number of MCMC samples.
        # If 0, Prophet will do MAP estimation. Default is 0.
        if 'mcmc_samples' in self.kwargs:
            self.mcmc_samples = utils.atoi(self.kwargs['mcmc_samples'])

        # Random seed that can be used to control stochasticity.
        # Used for setting the numpy random seed in predict, and also for pystan when using mcmc_samples > 0.
        if 'random_seed' in self.kwargs:
            self.seed = utils.atoi(self.kwargs['random_seed'])

            # Set the random seed for numpy
            np.random.seed(self.seed)

        # Number of potential changepoints to include. Default value is 25.
        # Potential changepoints are selected uniformly from the first `changepoint_range` proportion of the history.
        if 'n_changepoints' in self.kwargs:
            self.n_changepoints = utils.atoi(self.kwargs['n_changepoints'])

        # Proportion of the history in which trend changepoints will be estimated.
        # Defaults to 0.8 for the first 80%.
        if 'changepoint_range' in self.kwargs:
            self.changepoint_range = utils.atof(self.kwargs['changepoint_range'])

        # Number of simulated draws used to estimate uncertainty intervals.
        if 'uncertainty_samples' in self.kwargs:
            self.uncertainty_samples = utils.atoi(self.kwargs['uncertainty_samples'])

        # Set the weekly start for 'weekly' seasonality requests
        # Default week start is 0, which represents Sunday. Add an offset as required.
        if 'weekly_start' in self.kwargs:
            self.weekly_start = utils.atoi(self.kwargs['weekly_start'])

        # Set the yearly start for 'yearly' seasonality requests
        # Default yearly start is 0, which represents the 1st of Jan. Add an offset as required.
        if 'yearly_start' in self.kwargs:
            self.yearly_start = utils.atoi(self.kwargs['yearly_start'])

        # Set a period to extend the holidays by lower_window number of days before the date.
        # This can be used to extend the holiday effect
        if 'lower_window' in self.kwargs:
            self.lower_window = utils.atoi(self.kwargs['lower_window'])

        # Set a period to extend the holidays by upper_window number of days after the date.
        # This can be used to extend the holiday effect
        if 'upper_window' in self.kwargs:
            self.upper_window = utils.atoi(self.kwargs['upper_window'])

    # Create dictionaries of arguments for the Prophet(), make_future_dataframe(), add_seasonality() and fit() functions
    self.prophet_kwargs = {}
    self.make_kwargs = {}
    self.add_seasonality_kwargs = {}
    self.fit_kwargs = {}

    # Populate the parameters in the corresponding dictionary:

    # Set up a list of possible keyword arguments for the Prophet() function
    prophet_params = ['seasonality_mode', 'growth', 'changepoint_prior_scale', 'interval_width',
                      'seasonality_prior_scale', 'holidays_prior_scale', 'mcmc_samples', 'n_changepoints',
                      'changepoint_range', 'uncertainty_samples']

    # Create a dictionary of keyword arguments for the Prophet() function
    self.prophet_kwargs = self._populate_dict(prophet_params)

    # Set up a list of possible keyword arguments for the make_future_dataframe() function
    make_params = ['periods', 'freq']

    # Create a dictionary of keyword arguments for the make_future_dataframe() function
    self.make_kwargs = self._populate_dict(make_params)

    # Set up a list of possible keyword arguments for the add_seasonality() function
    seasonality_params = ['name', 'period', 'fourier_order', 'mode']

    # Create a dictionary of keyword arguments for the add_seasonality() function
    self.add_seasonality_kwargs = self._populate_dict(seasonality_params)

    # Pass the random seed to the fit method if MCMC is being used
    if self.mcmc_samples is not None and self.mcmc_samples > 0:
        # Set up a list of possible keyword arguments for the fit() function
        fit_params = ['seed']

        # Create a dictionary of keyword arguments for the fit() function
        self.fit_kwargs = self._populate_dict(fit_params)
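# --- Illustrative sketch (not part of the SSE implementation) ---
# A hedged sketch of the dictionary-building pattern above. _populate_dict is assumed
# to collect only those attributes that were explicitly set (i.e. are not None), so
# that Prophet's own defaults apply for everything else. The _Params class is a
# hypothetical stand-in for the real one.
class _Params:
    def __init__(self):
        self.changepoint_prior_scale = 0.5
        self.interval_width = None  # left at default, so it is omitted below

    def _populate_dict(self, param_names):
        # Keep only parameters that were explicitly provided in the request
        return {name: getattr(self, name) for name in param_names
                if getattr(self, name) is not None}

p = _Params()
print(p._populate_dict(['changepoint_prior_scale', 'interval_width']))
# {'changepoint_prior_scale': 0.5} — ready to splat into Prophet(**kwargs)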
@classmethod
def init_seasonality(cls, request, context):
    """
    Alternative initialization method for this class.
    Used when the request contains the timeseries as a concatenated string, repeated for every row.
    This is used when the number of input data points differs from the output rows required for seasonality plots.
    """

    # The rows are duplicates in this kind of request, so inputs are simply taken from the first row
    # First we store the correct number of rows to be output.
    request_row_count = len([row for request_rows in request for row in request_rows.rows])

    # The timeseries is accepted as a string from the second column of the first row
    timeseries = request[0].rows[0].duals[1].strData

    # The holidays are taken from the third column of the first row
    holidays = request[0].rows[0].duals[2].strData

    # The keyword arguments are taken from the fourth column of the first row
    args = request[0].rows[0].duals[3]

    # The data may be sent unsorted by Qlik, so we have to store the order to use when sending the results
    sort_order = pd.DataFrame([(row.duals[0].numData, row.duals[0].strData)
                               for request_rows in request
                               for row in request_rows.rows],
                              columns=['seasonality_num', 'seasonality_str'])

    # We ignore Null values here as these are handled separately in the response
    sort_order = sort_order.loc[sort_order.seasonality_num.notnull()]

    # The correct sort order is based on the data frame's index after sorting on the seasonality field
    sort_order = sort_order.sort_values('seasonality_num')

    # Re-create the request with ds and y columns
    pairs = timeseries.split(";")
    request_df = pd.DataFrame([p.split(":") for p in pairs], columns=['ds', 'y'])

    # Convert strings to numeric values, replacing conversion errors with Null values
    request_df = request_df.applymap(lambda s: utils.atof(s) if s else np.NaN)

    # Check if the holidays column is populated
    if len(holidays) > 0:
        # Create a holidays data frame
        pairs = holidays.split(";")
        holiday_df = pd.DataFrame([p.split(":") for p in pairs], columns=['ds', 'holiday'])

        # Merge the holidays with the request data frame using column ds as the key
        request_df = pd.merge(request_df, holiday_df, on='ds', how='left')

        # Replace null values in the holiday column with empty strings
        request_df = request_df.fillna(value={'holiday': ''})

    # Values in the data frame are converted to type SSE.Dual
    request_df.loc[:, 'ds'] = request_df.loc[:, 'ds'].apply(lambda result: SSE.Dual(numData=result))
    request_df.loc[:, 'y'] = request_df.loc[:, 'y'].apply(lambda result: SSE.Dual(numData=result))
    if 'holiday' in request_df.columns:
        request_df.loc[:, 'holiday'] = request_df.loc[:, 'holiday'].apply(lambda result: SSE.Dual(strData=result))

    # Add the keyword arguments to the data frame as well, already of type SSE.Dual
    request_df.loc[:, 'args'] = args

    # Create the updated request list and convert to SSE data types
    request_list = request_df.values.tolist()
    request_list = [SSE.Row(duals=duals) for duals in request_list]
    updated_request = [SSE.BundledRows(rows=request_list)]

    # Call the default initialization method
    instance = ProphetForQlik(updated_request, context)

    # Handle null value rows in the request dataset
    instance.NaT_df = request_df.loc[request_df.ds.isnull()].copy()

    # If such rows exist they will be sliced off and then added back to the response
    if len(instance.NaT_df) > 0:
        instance.NaT_df.loc[:, 'y'] = 0

    # Set a property that lets us know this instance was created for seasonality forecasts
    instance.is_seasonality_request = True

    # Set a property that lets us know the row count in the original request, as this will be different from request_df
    instance.request_row_count = request_row_count

    # Update the default result type if this was not passed in arguments
    if instance.result_type == 'yhat':
        instance.result_type = instance.seasonality

    # Set the sort order to be used when returning the results
    instance.sort_order = sort_order

    # Return the initialized ProphetForQlik instance
    return instance
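# --- Illustrative sketch (not part of the SSE implementation) ---
# A standalone illustration, with hypothetical data, of re-creating the request frame
# from the concatenated 'ds:y' timeseries string handled above. float() stands in for
# the locale-aware utils.atof used by the SSE.
import numpy as np
import pandas as pd

timeseries = '43466:10.5;43467:11.2;43468:'
pairs = timeseries.split(';')
request_df = pd.DataFrame([p.split(':') for p in pairs], columns=['ds', 'y'])
# Empty strings become NaN, mirroring the conversion-error handling above
request_df = request_df.applymap(lambda s: float(s) if s else np.NaN)
print(request_df)
#         ds     y
# 0  43466.0  10.5
# 1  43467.0  11.2
# 2  43468.0   NaN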
def _set_params(self, kwargs):
    """
    Set input parameters based on the request.
    :
    :For details refer to the GitHub project: https://github.com/nabeel-oz/qlik-py-tools
    """

    # Set default values which will be used if execution arguments are not passed

    # Default parameters:
    self.debug = False
    self.model = 'en_core_web_sm'
    self.custom = False
    self.base_model = 'en_core_web_sm'
    self.blank = False
    self.epochs = 100
    self.batch_size = compounding(4.0, 32.0, 1.001)
    self.drop = 0.25
    self.test = 0

    # Extract the model name if required
    try:
        # Get the model name from the first row in the request_df
        self.model = self.request_df.loc[0, 'model_name']

        # Remove the model_name column from the request_df
        self.request_df = self.request_df.drop(['model_name'], axis=1)
    except KeyError:
        pass

    # If keyword arguments were included in the request, get the parameters and values
    if len(kwargs) > 0:
        # Transform the string of arguments into a dictionary
        self.kwargs = utils.get_kwargs(kwargs)

        # Set the debug option for generating execution logs
        # Valid values are: true, false
        if 'debug' in self.kwargs:
            self.debug = 'true' == self.kwargs['debug'].lower()

            # Additional information is printed to the terminal and logs if the parameter debug = true
            if self.debug:
                # Increment the log counter for the class. Each instance of the class generates a new log.
                self.__class__.log_no += 1

                # Create a log file for the instance
                # Logs will be stored in ..\logs\SpaCy Log <n>.txt
                self.logfile = os.path.join(os.getcwd(), 'logs', 'SpaCy Log {}.txt'.format(self.log_no))

                self._print_log(1)

        # Set whether the model (if getting named entities) or base model (if retraining) is a custom model,
        # i.e. not one of the pre-trained models provided by spaCy
        if 'custom' in self.kwargs:
            self.custom = 'true' == self.kwargs['custom'].lower()

        # Set the base model, i.e. an existing spaCy model to be retrained.
        if 'base_model' in self.kwargs:
            self.base_model = self.kwargs['base_model'].lower()

        # Set the retraining to be done on a blank Language class
        if 'blank' in self.kwargs:
            self.blank = 'true' == self.kwargs['blank'].lower()

        # Set the epochs for training the model.
        # This is the number of times that the learning algorithm will work through the entire training dataset.
        # Valid values are an integer e.g. 200
        if 'epochs' in self.kwargs:
            self.epochs = utils.atoi(self.kwargs['epochs'])

        # Set the batch size to be used during model training.
        # The model's internal parameters will be updated at the end of each batch.
        # Valid values are a single integer or compounding or decaying parameters.
        if 'batch_size' in self.kwargs:
            # The batch size may be a single integer
            try:
                self.batch_size = utils.atoi(self.kwargs['batch_size'])
            # Or a list of floats
            except ValueError:
                sizes = utils.get_kwargs_by_type(self.kwargs['batch_size'])

                # If start < end, batch sizes will be compounded
                if sizes[0] < sizes[1]:
                    self.batch_size = compounding(sizes[0], sizes[1], sizes[2])
                # Otherwise batch sizes will decay during training
                else:
                    self.batch_size = decaying(sizes[0], sizes[1], sizes[2])

        # Set the dropout rate for retraining the model
        # This determines the likelihood that a feature or internal representation in the model will be dropped,
        # making it harder for the model to memorize the training data.
        # Valid values are a float less than 1.0 e.g. 0.35
        if 'drop' in self.kwargs:
            self.drop = utils.atof(self.kwargs['drop'])

        # Set the ratio of data to be used for testing.
        # This data will be held out from training and just used to provide evaluation metrics.
        # Valid values are a float >= zero and < 1.0 e.g. 0.3
        if 'test' in self.kwargs:
            self.test = utils.atof(self.kwargs['test'])

    # Debug information is printed to the terminal and logs if the parameter debug = true
    if self.debug:
        self._print_log(2)

    # Remove the kwargs column from the request_df
    self.request_df = self.request_df.drop(['kwargs'], axis=1)
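# --- Illustrative sketch (not part of the SSE implementation) ---
# A short illustration of the batch size schedules used above, assuming spaCy 2.x
# where compounding() and decaying() are infinite generators in spacy.util.
from spacy.util import compounding, decaying

grow = compounding(4.0, 32.0, 1.001)   # start < stop: each batch is 1.001x the last
shrink = decaying(32.0, 4.0, 0.001)    # start > stop: batch sizes shrink over time
print([round(next(grow), 3) for _ in range(3)])  # [4.0, 4.004, 4.008]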
def _prep_regressors(self):
    """
    Parse the request for additional regressors and arguments.
    The regressors are expected as a string of pipe separated values.
    e.g. a single entry with three regressors could be '1.2|200|3'
    Arguments for the regressors can be passed in a separate string of keyword arguments.
    The keyword and the value should be separated by equals signs, different keywords by commas,
    and arguments for different regressors by pipe.
    If a single set of arguments is provided (i.e. no pipe characters are found), we apply the same arguments to all regressors.
    e.g. 'prior_scale=10, mode=additive| mode=multiplicative| mode=multiplicative' for specifying different arguments per regressor
    or 'mode=additive' for using the same arguments for all regressors.
    Returns a data frame with the additional regressors.
    """

    # Create a Pandas Data Frame with additional regressors and their keyword arguments
    self.regressors_df = pd.DataFrame([(row.duals[0].numData, row.duals[3].strData, row.duals[4].strData)
                                       for request_rows in self.request
                                       for row in request_rows.rows],
                                      columns=['ds', 'regressors', 'kwargs'])

    # Handle null value rows in the request dataset
    self.regressors_df = self.regressors_df.loc[self.regressors_df.ds.notnull()]

    # Check if the regressors column is empty
    if len(self.regressors_df.regressors.unique()) == 1:
        # Return without further processing
        self.has_regressors = False

        if self.debug:
            self._print_log(7)

        return None

    # Get the regressor arguments as a string
    arg_string = self.regressors_df.loc[0, 'kwargs']

    # Add kwargs for regressors to a list of dictionaries
    self.regressor_kwargs = []
    for kwargs_string in arg_string.replace(' ', '').split('|'):
        if len(kwargs_string) > 0:
            kwargs = {}
            for kv in kwargs_string.split(','):
                pair = kv.split('=')
                if 'prior_scale' in pair[0]:
                    pair[1] = utils.atof(pair[1])
                if 'standardize' in pair[0] and pair[1].lower() != 'auto':
                    pair[1] = 'true' == pair[1].lower()
                kwargs[pair[0]] = pair[1]
            self.regressor_kwargs.append(kwargs)

    # Split up the additional regressors into multiple columns
    self.regressors_df = pd.DataFrame(self.regressors_df.regressors.str.split('|', expand=True).values,
                                      index=self.regressors_df.index).add_prefix('regressor_')

    # Convert the strings to floats
    self.regressors_df = self.regressors_df.applymap(utils.atof)

    # Copy dates from the request_df
    self.regressors_df.loc[:, 'ds'] = self.request_df.loc[:, 'ds'].copy()

    # Sort by the ds column and reset the index
    self.regressors_df = self.regressors_df.sort_values('ds').reset_index(drop=True).drop(columns=['ds'])

    # If there are no regressor kwargs add empty dictionaries
    if len(self.regressor_kwargs) == 0:
        self.regressor_kwargs = [{} for c in self.regressors_df.columns]
    # If there is just 1 dictionary, replicate it for each regressor
    elif len(self.regressor_kwargs) == 1:
        kwargs = self.regressor_kwargs[0].copy()
        self.regressor_kwargs = [kwargs for c in self.regressors_df.columns]
    elif len(self.regressor_kwargs) != len(self.regressors_df.columns):
        err = "The number of additional regressors does not match the keyword arguments provided for the regressors."
        raise IndexError(err)

    return self.regressors_df
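# --- Illustrative sketch (not part of the SSE implementation) ---
# A self-contained sketch (hypothetical input) of the per-regressor argument parsing
# above: pipes separate regressors, commas separate keywords within one regressor.
# float() stands in for the locale-aware utils.atof.
arg_string = 'prior_scale=10, mode=additive| mode=multiplicative| standardize=false'
regressor_kwargs = []
for kwargs_string in arg_string.replace(' ', '').split('|'):
    if kwargs_string:
        kwargs = {}
        for kv in kwargs_string.split(','):
            key, value = kv.split('=')
            if 'prior_scale' in key:
                value = float(value)
            if 'standardize' in key and value != 'auto':
                value = value.lower() == 'true'
            kwargs[key] = value
        regressor_kwargs.append(kwargs)
print(regressor_kwargs)
# [{'prior_scale': 10.0, 'mode': 'additive'}, {'mode': 'multiplicative'}, {'standardize': False}]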