def _set_params(self, kwargs): """ Set input parameters based on the request. : :Parameters implemented for the HDBSCAN() function are: algorithm, metric, min_cluster_size, min_samples, :p, alpha, cluster_selection_method, allow_single_cluster, match_reference_implementation. :More information here: https://hdbscan.readthedocs.io/en/latest/api.html#hdbscan : :Scaler types implemented for preprocessing data are: StandardScaler, MinMaxScaler, MaxAbsScaler, :RobustScaler and QuantileTransformer. :More information here: http://scikit-learn.org/stable/modules/preprocessing.html : :Additional parameters used are: load_script, return, missing, scaler, debug """ # Set the row count in the original request self.request_row_count = len(self.request_df) + len(self.NaN_df) # Set default values which will be used if arguments are not passed # SSE parameters: self.load_script = False self.result_type = 'labels_' self.missing = 'zeros' self.scaler = 'robust' self.debug = False # HDBSCAN parameters: self.algorithm = None self.metric = None self.min_cluster_size = None self.min_samples = None self.p = None self.alpha = None self.cluster_selection_method = None self.allow_single_cluster = None self.match_reference_implementation = None # Standard scaler parameters: self.with_mean = None self.with_std = None # MinMaxScaler scaler parameters: self.feature_range = None # Robust scaler parameters: self.with_centering = None self.with_scaling = None self.quantile_range = None # Quantile Transformer parameters: self.n_quantiles = None self.output_distribution = None self.ignore_implicit_zeros = None self.subsample = None self.random_state = None # Adjust default options if variant is two_dims if self.variant == "two_dims": self.load_script = True # Adjust default options if variant is lat_long elif self.variant == "lat_long": self.scaler = "none" self.metric = "haversine" # Set optional parameters # If the key word arguments were included in the request, get the parameters and values if 
len(kwargs) > 0: # The parameter and values are transformed into key value pairs args = kwargs.translate(str.maketrans( '', '', string.whitespace)).split(",") self.kwargs = dict([arg.split("=") for arg in args]) # Make sure the key words are in lower case self.kwargs = {k.lower(): v for k, v in self.kwargs.items()} # Set the load_script parameter to determine the output format # Set to 'true' if calling the functions from the load script in the Qlik app if 'load_script' in self.kwargs: self.load_script = 'true' == self.kwargs['load_script'].lower() # Set the return type # Valid values are: labels, probabilities, cluster_persistence, outlier_scores if 'return' in self.kwargs: self.result_type = self.kwargs['return'].lower() + '_' # Set the strategy for missing data # Valid values are: zeros, mean, median, mode if 'missing' in self.kwargs: self.missing = self.kwargs['missing'].lower() # Set the standardization strategy for the data # Valid values are: standard, minmax, maxabs, robust, quantile, none if 'scaler' in self.kwargs: self.scaler = self.kwargs['scaler'].lower() # Set the debug option for generating execution logs # Valid values are: true, false if 'debug' in self.kwargs: self.debug = 'true' == self.kwargs['debug'].lower() # Set optional parameters for the HDBSCAN algorithmn # For documentation see here: https://hdbscan.readthedocs.io/en/latest/api.html#id20 # Options are: best, generic, prims_kdtree, prims_balltree, boruvka_kdtree, boruvka_balltree # Default is 'best'. if 'algorithm' in self.kwargs: self.algorithm = self.kwargs['algorithm'].lower() # The metric to use when calculating distance between instances in a feature array. # More information here: https://hdbscan.readthedocs.io/en/latest/basic_hdbscan.html#what-about-different-metrics # And here: http://scikit-learn.org/stable/modules/generated/sklearn.neighbors.DistanceMetric.html # Default is 'euclidean' for 'standard' and 'two_dims' variants, and 'haversine' for the lat_long variant. 
if 'metric' in self.kwargs: self.metric = self.kwargs['metric'].lower() # The minimum size of clusters. # The default value is 5. if 'min_cluster_size' in self.kwargs: self.min_cluster_size = utils.atoi( self.kwargs['min_cluster_size']) # The number of samples in a neighbourhood for a point to be considered a core point. if 'min_samples' in self.kwargs: self.min_samples = utils.atoi(self.kwargs['min_samples']) # p value to use if using the minkowski metric. if 'p' in self.kwargs: self.p = utils.atoi(self.kwargs['p']) # A distance scaling parameter as used in robust single linkage. if 'alpha' in self.kwargs: self.alpha = utils.atof(self.kwargs['alpha']) # The method used to select clusters from the condensed tree. # Options are: eom, leaf. if 'cluster_selection_method' in self.kwargs: self.cluster_selection_method = self.kwargs[ 'cluster_selection_method'].lower() # By default HDBSCAN* will not produce a single cluster. # Setting this to True will override this and allow single cluster results. if 'allow_single_cluster' in self.kwargs: self.allow_single_cluster = 'true' == self.kwargs[ 'allow_single_cluster'].lower() # There exist some interpretational differences between this HDBSCAN implementation # and the original authors reference implementation in Java. # Note that there is a performance cost for setting this to True. 
if 'match_reference_implementation' in self.kwargs: self.match_reference_implementation = 'true' == self.kwargs[ 'match_reference_implementation'] # Set optional parameters for the scaler functions # Parameters for the Standard scaler # http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html if self.scaler == 'standard': if 'with_mean' in self.kwargs: self.with_mean = 'true' == self.kwargs['with_mean'].lower() if 'with_std' in self.kwargs: self.with_std = 'true' == self.kwargs['with_std'].lower() # Parameters for the MinMax scaler # http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html if self.scaler == 'minmax': if 'feature_range' in self.kwargs: self.feature_range = ''.join( c for c in self.kwargs['feature_range'] if c not in '()').split(';') self.feature_range = (utils.atoi(self.feature_range[0]), utils.atoi(self.feature_range[1])) # Parameters for the Robust scaler # http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.RobustScaler.html if self.scaler == 'robust': if 'with_centering' in self.kwargs: self.with_centering = 'true' == self.kwargs[ 'with_centering'].lower() if 'with_scaling' in self.kwargs: self.with_scaling = 'true' == self.kwargs[ 'with_scaling'].lower() if 'quantile_range' in self.kwargs: self.quantile_range = ''.join( c for c in self.kwargs['quantile_range'] if c not in '()').split(';') self.quantile_range = (utils.atof(self.quantile_range[0]), utils.atof(self.quantile_range[1])) # Parameters for the Quantile Transformer # http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.QuantileTransformer.html if self.scaler == 'quantile': if 'n_quantiles' in self.kwargs: self.n_quantiles = utils.atoi(self.kwargs['n_quantiles']) if 'output_distribution' in self.kwargs: self.output_distribution = self.kwargs[ 'output_distribution'].lower() if 'ignore_implicit_zeros' in self.kwargs: self.ignore_implicit_zeros = 'true' == self.kwargs[ 
'ignore_implicit_zeros'].lower() if 'subsample' in self.kwargs: self.subsample = utils.atoi(self.kwargs['subsample']) if 'random_state' in self.kwargs: self.random_state = utils.atoi(self.kwargs['random_state']) # Set up a list of possible key word arguments for the HDBSCAN() function hdbscan_params = ['algorithm', 'metric', 'min_cluster_size', 'min_samples', 'p', 'alpha',\ 'cluster_selection_method', 'allow_single_cluster', 'match_reference_implementation'] # Create dictionary of key word arguments for the HDBSCAN() function self.hdbscan_kwargs = self._populate_dict(hdbscan_params) # Set up a list of possible key word arguments for the sklearn preprocessing functions scaler_params = ['with_mean', 'with_std', 'feature_range', 'with_centering', 'with_scaling',\ 'quantile_range', 'n_quantiles', 'output_distribution', 'ignore_implicit_zeros',\ 'subsample', 'random_state'] # Create dictionary of key word arguments for the scaler functions self.scaler_kwargs = self._populate_dict(scaler_params)
def _set_params(self, kwargs):
    """
    Set input parameters based on the request.
    :
    :Defaults are applied first and then overridden by any key word arguments passed in the request.
    :The model name is taken from the 'model_name' column of request_df if that column exists.
    :Arguments read here are: debug, custom, base_model, blank, epochs, batch_size, drop, test
    :
    :For details refer to the GitHub project: https://github.com/nabeel-oz/qlik-py-tools
    """

    # Defaults that apply when an execution argument is not supplied
    self.debug = False
    self.model = 'en_core_web_sm'
    self.custom = False
    self.base_model = 'en_core_web_sm'
    self.blank = False
    self.epochs = 100
    self.batch_size = compounding(4.0, 32.0, 1.001)
    self.drop = 0.25
    self.test = 0

    # Pick up the model name from the first row of the request, if the column is present
    try:
        self.model = self.request_df.loc[0, 'model_name']
        # The model_name column is not needed in the request_df after this point
        self.request_df = self.request_df.drop(['model_name'], axis=1)
    except KeyError:
        # No model_name column, so the default model is kept
        pass

    # The remaining options only apply when key word arguments were sent with the request
    if len(kwargs) > 0:

        # Convert the argument string into a dictionary
        self.kwargs = utils.get_kwargs(kwargs)

        # Option for generating execution logs. Valid values are: true, false
        if 'debug' in self.kwargs:
            self.debug = self.kwargs['debug'].lower() == 'true'

        # With debug = true, extra information goes to the terminal and a log file
        if self.debug:
            # Each instance of the class writes to its own log, numbered by a class level counter
            self.__class__.log_no += 1

            # Logs will be stored in ..\logs\SpaCy Log <n>.txt
            log_name = 'SpaCy Log {}.txt'.format(self.log_no)
            self.logfile = os.path.join(os.getcwd(), 'logs', log_name)

            self._print_log(1)

        # Whether the model (if getting named entities) or the base model (if retraining) is a custom model,
        # i.e. not one of the pre-trained models provided by spaCy
        if 'custom' in self.kwargs:
            self.custom = self.kwargs['custom'].lower() == 'true'

        # The base model, i.e. an existing spaCy model to be retrained
        if 'base_model' in self.kwargs:
            self.base_model = self.kwargs['base_model'].lower()

        # Whether retraining is to be done on a blank Language class
        if 'blank' in self.kwargs:
            self.blank = self.kwargs['blank'].lower() == 'true'

        # The epochs for training the model, i.e. the number of times the learning algorithm
        # works through the entire training dataset. Valid values are an integer e.g. 200
        if 'epochs' in self.kwargs:
            self.epochs = utils.atoi(self.kwargs['epochs'])

        # The batch size to be used during model training.
        # The model's internal parameters will be updated at the end of each batch.
        # Valid values are a single integer or compounding or decaying parameters.
        if 'batch_size' in self.kwargs:
            raw_batch_size = self.kwargs['batch_size']
            try:
                # First attempt to read the value as a single integer
                self.batch_size = utils.atoi(raw_batch_size)
            except ValueError:
                # Otherwise the value is read as a list of three values
                sizes = utils.get_kwargs_by_type(raw_batch_size)
                # Batch sizes compound when the first value < the second value, else they decay
                schedule = compounding if sizes[0] < sizes[1] else decaying
                self.batch_size = schedule(sizes[0], sizes[1], sizes[2])

        # The dropout rate for retraining the model.
        # This determines the likelihood that a feature or internal representation in the model will be dropped,
        # making it harder for the model to memorize the training data.
        # Valid values are a float less than 1.0 e.g. 0.35
        if 'drop' in self.kwargs:
            self.drop = utils.atof(self.kwargs['drop'])

        # The ratio of data to be used for testing.
        # This data will be held out from training and just used to provide evaluation metrics.
        # Valid values are a float >= zero and < 1.0 e.g. 0.3
        if 'test' in self.kwargs:
            self.test = utils.atof(self.kwargs['test'])

    # With debug = true, the parameters are printed to the terminal and logs
    if self.debug:
        self._print_log(2)

    # The kwargs column is not needed in the request_df after this point
    self.request_df = self.request_df.drop(['kwargs'], axis=1)
def _set_params(self):
    """
    Set input parameters based on the request.

    The key word arguments are read from the fourth column of the first row in the request if it exists,
    in which case the request is flagged as including holidays. Otherwise they are read from the third column.
    The arguments are a string of comma separated key=value pairs. Whitespace is ignored and keys are
    case insensitive. A missing, empty or whitespace-only argument string leaves all defaults in place.

    Parameters implemented for the Prophet() function are: seasonality_mode, growth (set to 'logistic'
    when a cap is passed), changepoint_prior_scale, interval_width, seasonality_prior_scale,
    holidays_prior_scale, mcmc_samples, n_changepoints, changepoint_range, uncertainty_samples
    Parameters implemented for the make_future_dataframe() function are: freq, periods
    Parameters implemented for seasonality are: add_seasonality, add_seasonality_mode, seasonality_period,
    seasonality_fourier
    Parameters implemented for holidays are: holidays_prior_scale, lower_window, upper_window
    Additional parameters for seasonality requests are: weekly_start, yearly_start
    Additional parameters used are: return, take_log, seasonality, debug, load_script, cap, floor, random_seed

    The parameters are stored as attributes on the instance. The dictionaries prophet_kwargs, make_kwargs,
    add_seasonality_kwargs and fit_kwargs are built at the end. Nothing is returned.
    """

    # Calculate the forecast periods based on the number of placeholders in the data
    self.periods = utils.count_placeholders(self.request_df.loc[:, 'y'])

    # Set the row count in the original request
    self.request_row_count = len(self.request_df) + len(self.NaT_df)

    # Set default values which will be used if an argument is not passed
    self.load_script = False
    self.result_type = 'yhat'
    self.take_log = False
    self.seasonality = 'yearly'
    self.seasonality_mode = None
    self.debug = False
    self.freq = 'D'
    self.cap = None
    self.floor = None
    self.growth = None
    self.changepoint_prior_scale = None
    self.interval_width = None
    self.name = None
    self.period = None
    self.fourier_order = None
    self.mode = None
    self.seasonality_prior_scale = None
    self.holidays_prior_scale = None
    self.mcmc_samples = None
    self.seed = None
    self.n_changepoints = None
    self.changepoint_range = None
    self.uncertainty_samples = None
    self.is_seasonality_request = False
    self.weekly_start = 6  # Defaulting to a Monday start for the week as used in Qlik
    self.yearly_start = 0
    self.lower_window = None
    self.upper_window = None

    # Set optional parameters

    # Check if there is a fourth column in the request
    try:
        # If there is a fourth column, it is assumed to contain the key word arguments
        args = self.request[0].rows[0].duals[3].strData

        # The third column should then provide the holiday name or null for each row
        self.has_holidays = True
    except IndexError:
        # If there is no fourth column, the request does not include holidays
        self.has_holidays = False

    # If the fourth column did not exist, we try again with the third column
    if not self.has_holidays:
        try:
            args = self.request[0].rows[0].duals[2].strData
        except IndexError:
            args = None

    # Remove all whitespace up front so that a blank argument string can be detected
    if args is not None:
        args = args.translate(str.maketrans('', '', string.whitespace))

    # If the key word arguments were included in the request, get the parameters and values
    # An empty string is skipped as it can not be split into key value pairs
    if args:

        # The parameter and values are transformed into key value pairs
        self.kwargs = dict(arg.split("=") for arg in args.split(","))

        # Make sure the key words are in lower case
        self.kwargs = {k.lower(): v for k, v in self.kwargs.items()}

        # Set the load_script parameter to determine the output format
        # Set to 'true' if calling the functions from the load script in the Qlik app
        if 'load_script' in self.kwargs:
            self.load_script = 'true' == self.kwargs['load_script'].lower()

        # Set the return type
        # Valid values are: yhat, trend, seasonal, seasonalities.
        # Add _lower or _upper to the series name to get lower or upper limits.
        if 'return' in self.kwargs:
            self.result_type = self.kwargs['return'].lower()

        # Set the option to take a logarithm of y values before forecast calculations
        # Valid values are: true, false
        if 'take_log' in self.kwargs:
            self.take_log = 'true' == self.kwargs['take_log'].lower()

        # Set the type of seasonality requested. Used only for seasonality requests
        # Valid values are: yearly, weekly, monthly, holidays
        if 'seasonality' in self.kwargs:
            self.seasonality = self.kwargs['seasonality'].lower()

        # Set the seasonality mode. Useful if the seasonality is not a constant additive factor as assumed by Prophet
        # Valid values are: additive, multiplicative
        if 'seasonality_mode' in self.kwargs:
            self.seasonality_mode = self.kwargs['seasonality_mode'].lower()

        # Set the debug option for generating execution logs
        # Valid values are: true, false
        if 'debug' in self.kwargs:
            self.debug = 'true' == self.kwargs['debug'].lower()

        # Set the frequency of the timeseries
        # Any valid frequency for pd.date_range, such as 'D' or 'M'
        # For options see: http://pandas.pydata.org/pandas-docs/stable/timeseries.html#offset-aliases
        # The value is case sensitive so it is not converted to lower case
        if 'freq' in self.kwargs:
            self.freq = self.kwargs['freq']

        # Set the cap which adds an upper limit at which the forecast will saturate
        # This changes the default linear growth model to a logistic growth model
        if 'cap' in self.kwargs:
            self.cap = utils.atof(self.kwargs['cap'])
            self.growth = 'logistic'

        # Set the floor which adds a lower limit at which the forecast will saturate
        # To use a logistic growth trend with a floor, a cap must also be specified
        if 'floor' in self.kwargs:
            self.floor = utils.atof(self.kwargs['floor'])

        # Set the changepoint_prior_scale to adjust the trend flexibility
        # If the trend changes are being overfit (too much flexibility) or underfit (not enough flexibility),
        # you can adjust the strength of the sparse prior.
        # Default value is 0.05. Increasing it will make the trend more flexible.
        if 'changepoint_prior_scale' in self.kwargs:
            self.changepoint_prior_scale = utils.atof(self.kwargs['changepoint_prior_scale'])

        # Set the width for the uncertainty intervals
        # Default value is 0.8 (i.e. 80%)
        if 'interval_width' in self.kwargs:
            self.interval_width = utils.atof(self.kwargs['interval_width'])

        # Set additional seasonality to be added to the model
        # Default seasonalities are yearly and weekly, as well as daily for sub daily data
        if 'add_seasonality' in self.kwargs:
            self.name = self.kwargs['add_seasonality'].lower()

        # Set 'additive' or 'multiplicative' mode for the additional seasonality
        # Default value follows the seasonality_mode parameter
        if 'add_seasonality_mode' in self.kwargs:
            self.mode = self.kwargs['add_seasonality_mode'].lower()

        # Set the seasonality period
        # e.g. 30.5 for 'monthly' seasonality
        if 'seasonality_period' in self.kwargs:
            self.period = utils.atof(self.kwargs['seasonality_period'])

        # Set the seasonality fourier terms
        # Increasing the number of Fourier terms allows the seasonality to fit faster changing cycles,
        # but can also lead to overfitting
        if 'seasonality_fourier' in self.kwargs:
            self.fourier_order = int(self.kwargs['seasonality_fourier'])

        # Set the seasonality prior scale to smooth seasonality effects.
        # Reducing this parameter dampens seasonal effects
        if 'seasonality_prior_scale' in self.kwargs:
            self.seasonality_prior_scale = utils.atof(self.kwargs['seasonality_prior_scale'])

        # Set the holiday prior scale to smooth holiday effects.
        # Reducing this parameter dampens holiday effects. Default is 10, which provides very little regularization.
        if 'holidays_prior_scale' in self.kwargs:
            self.holidays_prior_scale = utils.atof(self.kwargs['holidays_prior_scale'])

        # Set the number of MCMC samples.
        # If greater than 0, Prophet will do full Bayesian inference with the specified number of MCMC samples.
        # If 0, Prophet will do MAP estimation. Default is 0.
        if 'mcmc_samples' in self.kwargs:
            self.mcmc_samples = utils.atoi(self.kwargs['mcmc_samples'])

        # Random seed that can be used to control stochasticity.
        # Used for setting the numpy random seed used in predict and also for pystan when using mcmc_samples>0.
        if 'random_seed' in self.kwargs:
            self.seed = utils.atoi(self.kwargs['random_seed'])

            # Set the random seed for numpy
            np.random.seed(self.seed)

        # Number of potential changepoints to include. Default value is 25.
        # Potential changepoints are selected uniformly from the first `changepoint_range` proportion of the history.
        if 'n_changepoints' in self.kwargs:
            self.n_changepoints = utils.atoi(self.kwargs['n_changepoints'])

        # Proportion of history in which trend changepoints will be estimated.
        # Defaults to 0.8 for the first 80%.
        if 'changepoint_range' in self.kwargs:
            self.changepoint_range = utils.atof(self.kwargs['changepoint_range'])

        # Number of simulated draws used to estimate uncertainty intervals.
        if 'uncertainty_samples' in self.kwargs:
            self.uncertainty_samples = utils.atoi(self.kwargs['uncertainty_samples'])

        # Set the weekly start for 'weekly' seasonality requests
        # A week start of 0 represents Sunday. Add offset as required.
        # If this argument is not passed the value of 6 set above is used.
        if 'weekly_start' in self.kwargs:
            self.weekly_start = utils.atoi(self.kwargs['weekly_start'])

        # Set the yearly start for 'yearly' seasonality requests
        # Default yearly start is 0 which represents 1st of Jan. Add offset as required.
        if 'yearly_start' in self.kwargs:
            self.yearly_start = utils.atoi(self.kwargs['yearly_start'])

        # Set a period to extend the holidays by lower_window number of days before the date.
        # This can be used to extend the holiday effect
        if 'lower_window' in self.kwargs:
            self.lower_window = utils.atoi(self.kwargs['lower_window'])

        # Set a period to extend the holidays by upper_window number of days after the date.
        # This can be used to extend the holiday effect
        if 'upper_window' in self.kwargs:
            self.upper_window = utils.atoi(self.kwargs['upper_window'])

    # Create dictionary of arguments for the Prophet(), make_future_dataframe(), add_seasonality() and fit() functions
    self.prophet_kwargs = {}
    self.make_kwargs = {}
    self.add_seasonality_kwargs = {}
    self.fit_kwargs = {}

    # Populate the parameters in the corresponding dictionary:

    # Set up a list of possible key word arguments for the Prophet() function
    prophet_params = ['seasonality_mode', 'growth', 'changepoint_prior_scale', 'interval_width',
                      'seasonality_prior_scale', 'holidays_prior_scale', 'mcmc_samples', 'n_changepoints',
                      'changepoint_range', 'uncertainty_samples']

    # Create dictionary of key word arguments for the Prophet() function
    self.prophet_kwargs = self._populate_dict(prophet_params)

    # Set up a list of possible key word arguments for the make_future_dataframe() function
    make_params = ['periods', 'freq']

    # Create dictionary of key word arguments for the make_future_dataframe() function
    self.make_kwargs = self._populate_dict(make_params)

    # Set up a list of possible key word arguments for the add_seasonality() function
    seasonality_params = ['name', 'period', 'fourier_order', 'mode']

    # Create dictionary of key word arguments for the add_seasonality() function
    self.add_seasonality_kwargs = self._populate_dict(seasonality_params)

    # Pass the random seed to the fit method if MCMC is being used
    # Otherwise fit_kwargs is left as an empty dictionary
    if self.mcmc_samples is not None and self.mcmc_samples > 0:
        # Set up a list of possible key word arguments for the fit() function
        fit_params = ['seed']
        # Create dictionary of key word arguments for the fit() function
        self.fit_kwargs = self._populate_dict(fit_params)
def _set_params(self, kwargs): """ Set input parameters based on the request. : :For details refer to the GitHub project: https://github.com/nabeel-oz/qlik-py-tools """ # If key word arguments were included in the request, get the parameters and values, # by transforming the string of arguments into a dictionary self.kwargs = {} if len(kwargs) == 0 else utils.get_kwargs(kwargs) # Set the debug option for generating execution logs # Valid values are: true, false self.debug = False if 'debug' not in self.kwargs else ( 'true' == self.kwargs.pop('debug').lower()) # Additional information is printed to the terminal and logs if the paramater debug = true if self.debug: # Increment log counter for the class. Each instance of the class generates a new log. self.__class__.log_no += 1 # Create a log file for the instance # Logs will be stored in ..\logs\Common Functions Log <n>.txt self.logfile = os.path.join( os.getcwd(), 'logs', 'Common Functions Log {}.txt'.format(self.log_no)) self._print_log(1) # Set the name of the function to be called on the model # By default this is the 'predict' function, but could be other functions such as 'predict_proba' if supported by the model self.prediction_func = 'predict' if 'return' not in self.kwargs else self.kwargs.pop( 'return') # Certain models may need sorted data for predictions # A feature can be specified for use in sorting using the identifier argument. self.identifier = None if 'identifier' not in self.kwargs else self.kwargs.pop( 'identifier') # The identifier can be excluded from the inputs to the model using the exclude_identifier argument. 
self.exclude_identifier = False if 'exclude_identifier' not in self.kwargs else ( self.kwargs.pop('exclude_identifier').lower() == 'true') # Number of seconds to wait if a Keras model is being loaded by another thread self.wait = 2 if 'keras_wait' not in self.kwargs else utils.atoi( self.kwargs.pop('keras_wait')) # Number of retries if a Keras model is being loaded by another thread self.retries = 5 if 'keras_retries' not in self.kwargs else utils.atoi( self.kwargs.pop('keras_retries')) # Get the rest of the parameters, converting values to the correct data type self.pass_on_kwargs = {} if len( self.kwargs) == 0 else utils.get_kwargs_by_type(self.kwargs) # The predictions may need to be decoded in case of classification labels # The labels can be passed as a dictionary using the 'labels' argument. self.labels = None if 'labels' not in self.pass_on_kwargs else self.pass_on_kwargs.pop( 'labels') # Debug information is printed to the terminal and logs if the paramater debug = true if self.debug: self._print_log(2) # Remove the kwargs column from the request_df self.request_df = self.request_df.drop(['kwargs'], axis=1)