def discretize(self, discretization_type=Variable.TYPE_EQUALFREQ, nb_bins=10):
    """
    Args:
        discretization_type (str): "equal-freq" or "equal-width", default is "equal-freq"
        nb_bins (int): number of bins to target after discretization, default is 10

    Returns:
        ContinuousVariable: the variable that has been discretized
    """
    # Any existing discretization will be overridden by the new one
    if not isinstance(nb_bins, int):
        raise ApiException('Number of bins must be an integer')

    if discretization_type == Variable.TYPE_EQUALFREQ:
        data = [{'name': self.name, 'equalFreqBins': nb_bins}]
    elif discretization_type == Variable.TYPE_EQUALWIDTH:
        data = [{'name': self.name, 'equalWidthBins': nb_bins}]
    else:
        raise ApiException('Discretization Type does not exist: {}'.format(discretization_type))

    creation_json = self.__api.Datasets.discretize(project_ID=self.project_id,
                                                   dataset_ID=self.dataset_id,
                                                   json=data)
    try:
        self.__api.handle_work_states(self.project_id,
                                      work_type='discretization',
                                      work_id=creation_json.get('_id'))
    except Exception as E:
        raise ApiException('Unable to get the discretization status', str(E))

    self.__json_returned = self._update()
    return self
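# Usage sketch (illustrative, not from the source): re-discretize a continuous variable into
# 20 equal-width bins; the variable name 'age' and the `dataset` argument are assumptions.
def _example_discretize(dataset):
    age = next(v for v in dataset.variables if v.name == 'age')
    return age.discretize(discretization_type=Variable.TYPE_EQUALWIDTH, nb_bins=20)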
def __prepare_kpi_data(self, target):
    kpi_data = {
        "kpiName": target.name,
        "kpiType": target.indicator_type,
        "output": target.variable_name,
        "kpiFamily": target.indicator_family,
        "scoreType": "Shift" if target.indicator_type == TargetFactory.KPI_TYPE_CONTINUOUS else "Lift",
        "isMainKey": False,
    }
    if isinstance(target, Target):
        dictionary = dict(zip(target.scores, target.score_ids))
        if target.indicator_type == TargetFactory.KPI_TYPE_DISCRETE or \
                target.indicator_type == TargetFactory.KPI_TYPE_DISCRETE_MODALITY:
            kpi_data['_id'] = dictionary[TargetFactory.KPI_SCORE_PURITY]
            kpi_data["omodality"] = target.modality
        elif target.indicator_type == TargetFactory.KPI_TYPE_CONTINUOUS:
            kpi_data['_id'] = dictionary[TargetFactory.KPI_SCORE_AVERAGE_VALUE]
        else:
            raise ApiException('Unexpected target indicator type')
    elif isinstance(target, Description):
        kpi_data['_id'] = target.score_id
    else:
        raise ApiException('Unexpected target structure')
    return kpi_data
def create(self, dataset, name, target=None, targets=None, quantiles=10,
           enable_custom_discretizations=True):
    """
    Args:
        dataset (Dataset): dataset on which the Xray will be created
        name (str): name of the Xray to create
        target (Target or Description): single target used to generate the Xray
        targets (list of Target or Description): targets used to generate the Xray
            (ignored if the 'target' parameter is defined)
        quantiles (int): number of intervals the continuous variables are quantized in, default is 10
        enable_custom_discretizations (boolean): use the custom discretizations of the dataset,
            falling back to the 'quantiles' parameter for the remaining variables, default is True

    Returns:
        Xray
    """
    if enable_custom_discretizations is True:
        discretizations = dataset._discretizations
    else:
        discretizations = {}

    if target is not None:
        kpis = [self.__prepare_kpi_data(target)]
    elif targets is not None:
        kpis = [self.__prepare_kpi_data(target) for target in targets]
    else:
        raise ApiException('A target should be defined')

    data = {
        "projectId": self.__project_id,
        "task": {
            "type": "simplelift",
            "datasetName": dataset.name,
            "datasetId": dataset.dataset_id,
            "projectId": self.__project_id,
            "params": {
                "source": dataset.source_file_name,
                "kpis": kpis,
                "name": name,
                "quantileOrder": quantiles,
                "separator": dataset.separator,
                "discretizations": discretizations
            }
        }
    }

    creation_json = self.__api.SimpleLift.newsimplelift(project_ID=self.__project_id, json=data)
    try:
        self.__api.handle_work_states(self.__project_id,
                                      work_type='simplelift',
                                      work_id=creation_json.get('_id'))
    except Exception as E:
        raise ApiException('Unable to get the X-ray status', str(E))

    return Xray(self.__api, creation_json)
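# Usage sketch (illustrative, not from the source): build an Xray on a dataset for a single
# target; `xray_factory`, `dataset` and `target` are assumed to belong to the same project.
def _example_create_xray(xray_factory, dataset, target):
    return xray_factory.create(dataset, 'Churn Xray', target=target, quantiles=10)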
def split(self, train_ratio=0.7, random_state=42, keep_proportion_variable=None,
          train_dataset_name=None, train_dataset_desc=None,
          test_dataset_name=None, test_dataset_desc=None):
    """
    Split the dataset into two subsets, one for training and one for testing models.

    Args:
        train_ratio (float): ratio between the training set size and the original dataset size
        random_state (int): seed used by the random number generator
        keep_proportion_variable (Variable): discrete variable whose modalities keep
            similar proportions in the training and test sets
        train_dataset_name (str): name of the training set
        train_dataset_desc (str): description of the training set
        test_dataset_name (str): name of the test set
        test_dataset_desc (str): description of the test set

    Returns:
        The new training and test datasets
    """
    if not self._is_deleted:
        if not 0 < train_ratio < 1:
            raise ApiException('train_ratio must be greater than 0 and lower than 1')
        if not 0 < random_state < 1001:
            raise ApiException('random_state must be greater than 0 and lower than 1001')
        if keep_proportion_variable and not keep_proportion_variable.is_discrete:
            raise ApiException('keep_proportion_variable must be a discrete variable')

        train_name = train_dataset_name or self.name + '_train'
        test_name = test_dataset_name or self.name + '_test'
        train_name, test_name = self.__get_unique_names(train_name, test_name)

        data = {
            'charactInvalidTest': '',
            'charactInvalidTrain': '',
            'dataset': self.__json_returned,
            'datasetId': self.dataset_id,
            'projectId': self.project_id,
            'randomState': random_state,
            'target': keep_proportion_variable._json if keep_proportion_variable else '',
            'testDescription': test_dataset_desc or 'Test set of dataset ' + self.name,
            'testName': test_name,
            'train': train_ratio,
            'trainDescription': train_dataset_desc or 'Train set of dataset ' + self.name,
            'trainName': train_name
        }
        json = {'project_ID': self.project_id, 'dataset_ID': self.dataset_id, 'json': data}
        split_json = self.__api.Datasets.split(**json)
        try:
            self.__api.handle_work_states(self.project_id,
                                          work_type='datasetSplit',
                                          work_id=split_json.get('id'))
        except Exception as E:
            raise ApiException('Unable to get the split status', str(E))

        factory = DatasetFactory(self.__api, self.project_id)
        return factory.get(train_name), factory.get(test_name)
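# Usage sketch (illustrative, not from the source): split 80/20 while keeping the proportions
# of a discrete variable stable in both subsets; the variable name 'Churn' is an assumption.
def _example_split(dataset):
    churn_var = next(v for v in dataset.variables if v.name == 'Churn')
    return dataset.split(train_ratio=0.8, keep_proportion_variable=churn_var)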
def create_from_sql(self, name, connection_string, query, description='', modalities=2,
                    continuous_threshold=0.95, missing_threshold=0.95):
    """
    Create a Dataset from a SQL database.

    Supported systems: PostgreSQL

    Args:
        name (str): The name of the dataset
        connection_string (str): The connection string to the database
            (format: 'postgresql://*****:*****@host:port/database')
        query (str): The query to execute to fetch the data (example: 'SELECT * FROM data_table')
        description (str): The dataset description, default is ''
        modalities (int): Modality threshold for discrete variables, default is 2
        continuous_threshold (float): Percentage of continuous values threshold for
            continuous variables, default is 0.95
        missing_threshold (float): Percentage of missing values threshold for ignored
            variables, default is 0.95

    Returns:
        Dataset
    """
    project_id = self.__project_id
    SEPARATOR = "semicolon"
    ENCODING = "utf-8"

    dataset_data = {
        'datasetName': name,
        'description': description,
        'cached': True,
        'separator': SEPARATOR,
        'encoding': ENCODING,
        'type': 'dbAccess',
        'dbSystem': 'pgsql',
        'query': query,
        'connectionString': connection_string
    }

    json = {'project_ID': project_id, 'json': dataset_data}
    creation_json = self.__api.Datasets.createdataset(**json)
    try:
        self.__api.handle_work_states(project_id,
                                      work_type='datasetValidation',
                                      query={"datasetId": creation_json.get('_id')})
    except Exception as E:
        raise ApiException('Unable to get the dataset validation status', str(E))
    try:
        self.__api.handle_work_states(project_id,
                                      work_type='datasetDescription',
                                      query={"datasetId": creation_json.get('_id')})
    except Exception as E:
        raise ApiException('Unable to get the dataset description status', str(E))

    returned_json = self.__api.Datasets.getadataset(project_ID=project_id,
                                                    dataset_ID=creation_json.get('_id'))
    return Dataset(self.__api, json, returned_json)
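# Usage sketch (illustrative, not from the source): import a PostgreSQL table as a dataset;
# the connection string and table name are placeholders.
def _example_create_from_sql(dataset_factory):
    return dataset_factory.create_from_sql(
        name='sales_pg',
        connection_string='postgresql://user:password@localhost:5432/sales',
        query='SELECT * FROM transactions')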
def export_scores(self, path, variables=None):
    """
    Export the scores of this model in a csv file

    Args:
        path (str): the destination path for the exported scores
        variables (list of Variable): the variables of the dataset to add in the file. Default is None
    """
    data = {
        'datasetId': self.__json_returned.get('datasetId'),
        'columns': [variable.name for variable in variables] if variables else []
    }
    json = {'project_ID': self.project_id, 'model_ID': self.id, 'json': data}
    json_returned = self.__api.Prediction.postexportscores(**json)
    try:
        self.__api.handle_work_states(self.project_id,
                                      work_type=json_returned.get('type'),
                                      work_id=json_returned.get('_id'))
    except Exception as E:
        raise ApiException('Unable to export the model scores for ' + self.name, str(E))

    outputFile = json_returned.get('workParams').get('outputFile').split("_")[1]
    data = {
        'outputFile': outputFile
    }
    json = {'project_ID': self.project_id, 'model_ID': self.id, 'params': data}
    to_export = self.__api.Prediction.getexportscores(**json)
    with open(path, 'wb') as FILE_OUT:
        FILE_OUT.write(to_export)
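# Usage sketch (illustrative, not from the source): export model scores together with an
# identifier column; the variable name 'customer_id' is an assumption.
def _example_export_scores(model, dataset):
    id_var = next(v for v in dataset.variables if v.name == 'customer_id')
    model.export_scores('./scores.csv', variables=[id_var])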
def get_required_module(module_name):
    try:
        return importlib.import_module(module_name)
    except ModuleNotFoundError:
        warn_msg = 'The module {md} is missing and required for this function.\n'
        warn_msg = warn_msg + 'To install it on a notebook, execute "!pip install {md}" and restart the kernel'
        logging.warning(warn_msg.format(md=module_name))
        raise ApiException(f'Missing module for this function : {module_name}')
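# Usage sketch (illustrative, not from the source): lazily import an optional dependency and
# fail with a consistent ApiException when it is missing.
def _example_optional_import():
    go = get_required_module('plotly.graph_objs')
    return go.Scatter(x=[0, 1], y=[0, 1])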
def create(self, variable, modality=None, scoreTypes=None):
    """
    create(variable, modality=None, scoreTypes=None)

    Create one target for the given variable.

    Args:
        variable (Variable): the variable defining the target
        modality (int or str or float): modality of the target if the variable is discrete.
            Default is the most frequent modality.
        scoreTypes (list of str): score types to be defined for the target.
            Default is [TargetFactory.KPI_SCORE_PURITY, TargetFactory.KPI_SCORE_COVERAGE]
            if the variable is discrete and [TargetFactory.KPI_SCORE_AVERAGE_VALUE]
            if the variable is continuous.

    Returns:
        Target: The new target
    """
    if variable.is_discrete:
        # checking that the modality exists
        if modality is None:
            kpiModality = variable.top_modality
        else:
            if modality in variable.modalities:
                kpiModality = modality
            else:
                raise ApiException('Modality {} does not exist for variable {}'.format(
                    modality, variable.name))
        kpiScoreTypes = set(scoreTypes or []).union({self.KPI_SCORE_PURITY, self.KPI_SCORE_COVERAGE})
    else:
        kpiModality = None
        kpiScoreTypes = set(scoreTypes or []).union({self.KPI_SCORE_AVERAGE_VALUE})

    targetData = self.__get_target_data(variable, [kpiModality, ], kpiScoreTypes)
    data = {"kpis": targetData}
    json = {'project_ID': self.__project_id, 'json': data}
    returned_json = self.__api.Kpi.addkpi(**json)

    if variable.is_discrete:
        targets_returned_json = [
            kpi for kpi in returned_json['kpis']
            if kpi['kpiFamily'] == 'target'
            and kpi['variable'] == variable.name
            and kpi['modality'] == kpiModality
        ]
    else:
        targets_returned_json = [
            kpi for kpi in returned_json['kpis']
            if kpi['kpiFamily'] == 'target' and kpi['variable'] == variable.name
        ]
    unique_targets_returned_json = unique_list(targets_returned_json)
    target_json = unique_targets_returned_json[0]
    return Target(self.__api, json, target_json)
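# Usage sketch (illustrative, not from the source): define a target on the modality 'yes' of a
# discrete variable; the variable name 'Churn' and the factory argument are assumptions.
def _example_create_target(target_factory, dataset):
    churn_var = next(v for v in dataset.variables if v.name == 'Churn')
    return target_factory.create(churn_var, modality='yes')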
def get_confusion_matrix(self, top_score_ratio):
    """
    Return the ConfusionMatrix of this model when the given ratio of top-scored samples
    is predicted positive.
    """
    if not 0 <= top_score_ratio <= 1:
        raise ApiException('top_score_ratio must be greater or equal to 0 and lower or equal to 1')
    self.__load_confusion_matrix()
    index = self.__get_index(top_score_ratio)
    values = self.__json_confusion_matrix['Lift curve'][index]
    return ConfusionMatrix(true_positives=values['TP'],
                           false_positives=values['FP'],
                           true_negatives=values['TN'],
                           false_negatives=values['FN'])
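# Usage sketch (illustrative, not from the source): confusion matrix obtained when the top 10%
# of scored samples are predicted positive; the attribute names are assumptions based on the
# ConfusionMatrix constructor above.
def _example_confusion_matrix(model):
    cm = model.get_confusion_matrix(top_score_ratio=0.1)
    return cm.true_positives, cm.false_positives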
def predict_scores(self, dataset, keep_applied_model=False):
    """
    Predict target scores for the input dataset

    Args:
        dataset (Dataset): the dataset containing the input samples.
        keep_applied_model (boolean): a HyperCube applied model is temporarily created to
            compute these scores; set this parameter to True if you want this model to be
            persisted. Default is False.

    Returns:
        a NumPy array of shape [n_samples,] where n_samples is the number of samples
        in the input dataset
    """
    applied_model = self.apply(dataset, '{}_applied_{}'.format(
        self.name, datetime.now().strftime("%Y-%m-%d_%H-%M-%S")))

    data = {
        'datasetId': dataset.dataset_id,
        'columns': []
    }
    json = {'project_ID': applied_model.project_id, 'model_ID': applied_model.id, 'json': data}
    json_returned = self.__api.Prediction.postexportscores(**json)
    try:
        self.__api.handle_work_states(applied_model.project_id,
                                      work_type=json_returned.get('type'),
                                      work_id=json_returned.get('_id'))
    except Exception as E:
        raise ApiException('Unable to get the model scores for {}'.format(self.name), str(E))

    outputFile = json_returned.get('workParams').get('outputFile').split("_")[1]
    data = {
        'outputFile': outputFile
    }
    json = {'project_ID': applied_model.project_id, 'model_ID': applied_model.id, 'params': data}
    scores = self.__api.Prediction.getexportscores(**json)
    scoreIO = StringIO(scores.decode('utf-8'))
    try:
        df = read_csv(scoreIO, sep=';', skiprows=1, usecols=[1])
    except Exception as E:
        raise ApiException('Unable to read the model scores for {}'.format(self.name), str(E))

    if not keep_applied_model:
        applied_model.delete()

    return reshape(df.values, (df.values.shape[0]))
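# Usage sketch (illustrative, not from the source): score a held-out dataset without keeping
# the intermediate applied model.
def _example_predict_scores(model, test_dataset):
    scores = model.predict_scores(test_dataset)
    return scores.shape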
def export_dataframe(self):
    """
    Export the dataset to a Pandas DataFrame

    Returns:
        DataFrame
    """
    if not self._is_deleted:
        try:
            import pandas
        except ImportError as E:
            raise ApiException(
                'Pandas is required for this operation, please execute "!pip install pandas" and restart the kernel',
                str(E))
        _data = io.StringIO(self._export().decode('utf-8'))
        return pandas.read_csv(_data, sep=";")
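# Usage sketch (illustrative, not from the source): pull the dataset back into pandas for
# local inspection.
def _example_export_dataframe(dataset):
    df = dataset.export_dataframe()
    return df.head()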
def apply(self, dataset, applied_model_name, add_score_to_dataset=False, score_column_name=None):
    """
    Apply the HyperCube classifier model on a selected dataset

    Args:
        dataset (Dataset): Dataset the model is applied on
        applied_model_name (str): Name of the new applied model
        add_score_to_dataset (boolean): if set to True, a new column containing the scores
            is added to the dataset. Default is False.
        score_column_name (str): name of the score column, used only if add_score_to_dataset
            is set to True

    Returns:
        the applied Model
    """
    params = dict(self.__json_returned)
    params['modelName'] = applied_model_name

    data = {
        'datasetId': dataset.dataset_id,
        'datasetName': dataset.name,
        'fromDatasetId': self.__json_returned.get('datasetId'),
        'modelId': self.__json_returned.get('_id'),
        'params': params,
        'projectId': dataset.project_id,
        'spark': False,
        'type': 'applyPrediction'
    }
    if add_score_to_dataset:
        data['saveScore'] = score_column_name or 'score_' + applied_model_name

    json = {'project_ID': dataset.project_id, 'json': data}
    json_returned = self.__api.Task.createtask(**json)
    try:
        self.__api.handle_work_states(dataset.project_id,
                                      work_type=json_returned.get('type'),
                                      work_id=json_returned.get('_id'))
    except Exception as E:
        raise ApiException('Unable to create the applied HyperCube model ' + applied_model_name, str(E))

    return HyperCube(self.__api, json_returned)
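# Usage sketch (illustrative, not from the source): apply a trained HyperCube model on a test
# dataset and store the scores in a new column; the column name is an assumption.
def _example_apply(model, test_dataset):
    return model.apply(test_dataset, 'churn_model_on_test',
                       add_score_to_dataset=True, score_column_name='churn_score')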
def create_hypercube(self, dataset, name, target, purity_min=None, coverage_min=None,
                     rule_complexity=2, quantiles=10, min_marginal_contribution=None,
                     max_complexity=3, nb_minimizations=1, coverage_increment=0.01,
                     split_ratio=0.7, nb_iterations=1, purity_tolerance=0.1,
                     enable_custom_discretizations=True, save_all_rules=False):
    """
    Create a HyperCube classifier model

    Args:
        dataset (Dataset): Dataset the model is fitted on
        name (str): Name of the new model
        target (Target): Target used to generate the model
        purity_min (float): Minimum purity of rules, default is the purity of the target
            on the entire dataset
        coverage_min (int): Minimum coverage of the target population for each rule, default is 10
        rule_complexity (int): Maximum number of variables in rules, default is 2
        quantiles (int): Number of bins for all continuous numeric variables during
            quantization, default is 10
        min_marginal_contribution (float): a new rule R', created by adding a new constraint
            to an existing rule R (and thus increasing its complexity), is added to the ruleset
            if and only if it increases the original purity of R by at least the minimum
            marginal contribution. Default is 0.1
        max_complexity (int): maximum number of variables contained in the rules created
            during the local complexity increase phase. Default is 3
        nb_minimizations (int): Number of minimizations to perform on the ruleset, default is 1
        coverage_increment (float): Percentage increment of target samples that a new rule
            must bring to be added to the minimized ruleset, default is 0.01
        split_ratio (float): the first step of the model generation is a random split of the
            original dataset into a learning (train) dataset, representing by default 70% of
            the original dataset, and a validation (test) dataset containing the remaining 30%.
            Default is 0.7
        nb_iterations (int): the final model is the result of several models based on different
            splits of the original dataset, using a bootstrap method; this parameter is the
            number of splits that are made. Default is 1
        purity_tolerance (float): maximum spread between the purities of the rules applied to
            the learning and validation datasets
        enable_custom_discretizations (boolean): when True, use the custom discretizations
            linked to the selected dataset, falling back to the 'quantiles' parameter for the
            remaining variables. Default is True
        save_all_rules (boolean): save all generated rules in a new ruleset. Default is False

    Returns:
        the created model
    """
    variable = next(variable for variable in dataset.variables
                    if variable.name == target.variable_name)
    index = variable.modalities.index(target.modality)
    datasetPurity = variable.purities[index]
    score_purity_min = purity_min or round(datasetPurity, 3)

    if min_marginal_contribution is None:
        if score_purity_min > 0.99:
            min_marginal_contribution = round(1 / score_purity_min - 1, 3)
        elif score_purity_min > 0.9:
            min_marginal_contribution = round(0.99 / score_purity_min - 1, 3)
        else:
            min_marginal_contribution = 0.1

    coverage_min = coverage_min or (10 if (variable.frequencies[index] < 1000) else 0.01)

    if enable_custom_discretizations is True:
        discretizations = dataset._discretizations
    else:
        discretizations = {}

    scores = []
    for score_id, score_type in zip(target.score_ids, target.scores):
        score = {
            'deleted': False,
            'kpiFamily': target.indicator_family,
            'kpiName': target.name,
            'kpiType': target.indicator_type,
            'omodality': target.modality,
            'output': target.variable_name,
            'projectId': target.project_id,
            'scoreType': score_type,
            '_id': score_id
        }
        if score_type == self._PURITY:
            score['minValue'] = score_purity_min
        elif score_type == self._COVERAGE:
            score['minValue'] = coverage_min
        elif score_type == self._LIFT:
            score['minValue'] = 1
        scores.append(score)

    kpisel = {
        'datasetPurity': datasetPurity,
        'kpiFamily': target.indicator_family,
        'kpiName': target.name,
        'kpiType': target.indicator_type,
        'projectId': target.project_id,
        'scores': scores,
        'selectedBy': 'target'
    }

    params = {
        'algoType': 'HyperCube',
        'complexityExhaustive': rule_complexity,
        'countQuantiles': quantiles,
        'coverageIncrement': coverage_increment,
        'coverageThreshold': 10,
        'delimiter': 'semicolon',
        'discretizations': discretizations,
        'dtmaxdepth': 4,
        'elasticNetParam': 0,
        'enableCustomDiscretizations': enable_custom_discretizations,
        'featureSubsetStrategy': 'sqrt',
        'gbmaxdepth': 3,
        'gbn_estimators': 100,
        'kpis': [],
        'learning_rate': 0.1,
        'lrCost': 1,
        'maxComplexity': max_complexity,
        'maxDepth': 3,
        'maxIter': 100,
        'minInfoGain': 0,
        'minInstance': 1,
        'minMarginalContribution': min_marginal_contribution,
        'minObservation': 3,
        'missingValues': 0.1,
        'modelName': name,
        'nbMaxModality': 50,
        'nbMinObservation': 10,
        'nbMinimizations': nb_minimizations,
        'nbModels': nb_iterations,
        'numTrees': 10,
        'percentageSplit': split_ratio,
        'purityThreshold': score_purity_min,
        'purityTolerance': purity_tolerance,
        'regParam': 0,
        'replaceMissingValues': 'Median',
        'rfmaxdepth': 2,
        'rfn_estimators': 100,
        'saveAllRules': 1 if save_all_rules else 0,
        'sourceFileName': dataset.source_file_name,
        'splitRatio': split_ratio,
        'stepSize': 0.1,
        'subsamplingRate': 1,
        'target': [score for score in scores
                   if score['scoreType'] == self._PURITY or score['scoreType'] == self._COVERAGE],
        'tol': 0.000001
    }

    data = {
        'algo': 'HyperCube',
        'algolist': ['HyperCube', 'LogisticRegression', 'DecisionTree', 'RandomForest', 'GradientBoosting'],
        'datasetId': dataset.dataset_id,
        'datasetName': dataset.name,
        'dtcr': 'gini',
        'enableCustomDiscretizations': enable_custom_discretizations,
        'gbloss': 'deviance',
        'keyIndicators': [],
        'kpisel': kpisel,
        'lrPenalty': 'l2',
        'lrSolver': 'liblinear',
        'modelFile': '',
        'modelName': name,
        'params': params,
        'projectId': dataset.project_id,
        'rfcr': 'gini',
        'saveAllRules': save_all_rules,
        'selectedDataset': dataset._json,
        'sourceFileName': '',
        'spark': False,
        'type': 'hypercubePrediction',
        'validTarget': True
    }

    json = {'project_ID': dataset.project_id, 'json': data}
    json_returned = self.__api.Task.createtask(**json)
    try:
        self.__api.handle_work_states(dataset.project_id,
                                      work_type=json_returned.get('type'),
                                      work_id=json_returned.get('_id'))
    except Exception as E:
        raise ApiException('Unable to create the HyperCube model ' + name, str(E))

    return HyperCube(self.__api, json_returned)
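# Usage sketch (illustrative, not from the source): train a HyperCube classifier with slightly
# stricter rule constraints than the defaults; the factory, dataset and target are assumptions.
def _example_create_hypercube(model_factory, train_dataset, target):
    return model_factory.create_hypercube(
        dataset=train_dataset,
        name='churn_hypercube',
        target=target,
        purity_min=0.8,
        coverage_min=50,
        rule_complexity=3,
        nb_minimizations=2)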
def create(self, dataset, target, params):
    """
    Private method. Create a classifier or regressor Scikit-learn model

    Args:
        dataset (Dataset): Dataset the model is fitted on
        target (Target): Target used to generate the model
        params (dict): parameters used by the HyperWorker

    Returns:
        the created model
    """
    if params['modelType'] not in ModelTypes.LIST:
        print('Unexpected model type: {}, valid options are: {}'.format(
            params['modelType'], ', '.join(ModelTypes.LIST)))
        return
    if params['algoType'] not in AlgoTypes.LIST:
        print('Unexpected algo type: {}, valid options are: {}'.format(
            params['algoType'], ', '.join(AlgoTypes.LIST)))
        return

    if target.indicator_type == self._INDICATOR_DISCRETE_WITH_MODALITY:
        variable = next(variable for variable in dataset.variables
                        if variable.name == target.variable_name)
        index = variable.modalities.index(target.modality)
        datasetPurity = variable.purities[index]
        score_purity_min = round(datasetPurity, 3)
        coverage_min = 10 if (variable.frequencies[index] < 1000) else 0.01

    scores = []
    for score_id, score_type in zip(target.score_ids, target.scores):
        score = {
            'deleted': False,
            'kpiFamily': target.indicator_family,
            'kpiName': target.name,
            'kpiType': target.indicator_type,
            'output': target.variable_name,
            'projectId': target.project_id,
            'scoreType': score_type,
            '_id': score_id
        }
        if target.indicator_type == self._INDICATOR_DISCRETE_WITH_MODALITY:
            score['omodality'] = target.modality
            if score_type == self._PURITY:
                score['minValue'] = score_purity_min
            elif score_type == self._COVERAGE:
                score['minValue'] = coverage_min
            elif score_type == self._LIFT:
                score['minValue'] = 1
        scores.append(score)

    if target.indicator_type == self._INDICATOR_DISCRETE_WITH_MODALITY:
        # for a discrete target, only the purity and coverage scores are kept
        scores = [
            score for score in scores
            if score['scoreType'] == self._PURITY or score['scoreType'] == self._COVERAGE
        ]

    kpisel = {
        'kpiFamily': target.indicator_family,
        'kpiName': target.name,
        'kpiType': target.indicator_type,
        'projectId': target.project_id,
        'selectedBy': 'target'
    }
    if target.indicator_type == self._INDICATOR_DISCRETE_WITH_MODALITY:
        kpisel['datasetPurity'] = datasetPurity

    params['sourceFileName'] = dataset.source_file_name
    params['target'] = scores

    data = {
        'datasetId': dataset.dataset_id,
        'datasetName': dataset.name,
        'kpi': kpisel,
        'modelName': params['modelName'],
        'params': params,
        'projectId': dataset.project_id,
        'selectedDataset': dataset._json,
        'type': 'automatedModels',
        'validTarget': True,
    }

    new_automodel = self.__api.AutomatedPrediction.createautomatedmodel(
        project_ID=self.__project_id, json=data)
    try:
        self.__api.handle_work_states(self.__project_id,
                                      work_type='automatedModels',
                                      work_id=new_automodel.get('workId'))
    except Exception as E:
        raise ApiException('Unable to get the automated model status', str(E))

    return AutomatedModel(self.__api, new_automodel)
def create(self, name, file_path, decimal='.', delimiter='semicolon', encoding='UTF-8',
           selectedSheet=1, description='', modalities=2, continuous_threshold=0.95,
           missing_threshold=0.95, metadata_file_path=None, discreteDict_file_path=None,
           keepVariableName=None):
    """
    Create a Dataset from a file (csv, Excel)

    Args:
        name (str): The name of the dataset
        file_path (str): The origin path of the file
        decimal (str): Decimal separator - csv files only, default is '.'
        delimiter (str): The csv field delimiter - csv files only, default is 'semicolon'
        encoding (str): The file encoding - csv files only, default is 'UTF-8'
        selectedSheet (int): The worksheet to use (starts at 1 like in the HyperCube
            User Interface) - Excel files only, default is 1
        description (str): The dataset description, default is ''
        modalities (int): Modality threshold for discrete variables, default is 2
        continuous_threshold (float): Percentage of continuous values threshold for
            continuous variables, default is 0.95
        missing_threshold (float): Percentage of missing values threshold for ignored
            variables, default is 0.95
        metadata_file_path (str): Path to a JSON metadata file uploaded along with the
            dataset, default is None
        discreteDict_file_path (str): Path to a JSON discrete dictionary file uploaded
            along with the dataset, default is None
        keepVariableName: Passed through to the dataset creation payload when provided,
            default is None

    Returns:
        Dataset
    """
    project_id = self.__project_id
    _, file_name = split(file_path)

    if metadata_file_path:
        _, metadata_file_name = split(metadata_file_path)
    else:
        metadata_file_name = None

    if discreteDict_file_path:
        _, discreteDict_file_name = split(discreteDict_file_path)
    else:
        discreteDict_file_name = None

    selectedSheet = max(1, selectedSheet)

    # Historically, delimiter/separator were stored as explicit strings in our database
    # (ex: "semicolon"); we want to keep it that way
    if delimiter in self.char_delimiters:
        delimiter = self.string_delimiters[self.char_delimiters.index(delimiter)]
    elif delimiter not in self.string_delimiters:
        raise ApiException(f'Unsupported value for delimiter: {delimiter}',
                           f'Supported values: {self.string_delimiters}')

    data = {
        'name': name,
        'fileName': file_name,
        'decimalDelimiter': decimal,
        'delimiter': delimiter,
        'separator': delimiter,
        'encoding': encoding,
        'usePython': description,
        'useSpark': 'False',
        'sourceFileName': file_name,
        'selectedSheet': str(selectedSheet),
        'description': description,
        'size': '{}'.format(getsize(file_path)),
        'nbModalitiesThreshold': str(modalities),
        'percentageContinuousThreshold': str(continuous_threshold),
        'percentageMissingThreshold': str(missing_threshold)
    }
    if keepVariableName:
        data['keepVariableName'] = keepVariableName

    def apihandle():
        json = {'project_ID': project_id, 'data': data, 'streaming': True}
        creation_json = self.__api.Datasets.uploaddatasets(**json)
        print('\n')
        try:
            self.__api.handle_work_states(project_id,
                                          work_type='datasetValidation',
                                          query={"datasetId": creation_json.get('_id')})
        except Exception as E:
            raise ApiException('Unable to get the dataset validation status', str(E))
        try:
            self.__api.handle_work_states(project_id,
                                          work_type='datasetDescription',
                                          query={"datasetId": creation_json.get('_id')})
        except Exception as E:
            raise ApiException('Unable to get the dataset description status', str(E))
        returned_json = self.__api.Datasets.getadataset(project_ID=project_id,
                                                        dataset_ID=creation_json.get('_id'))
        return json, returned_json

    if metadata_file_name and discreteDict_file_name:
        data['metadataFileName'] = metadata_file_name
        data['discreteDictFileName'] = discreteDict_file_name
        with open(file_path, 'rb') as FILE:
            with open(metadata_file_path, 'rb') as METADATA:
                with open(discreteDict_file_path, 'rb') as DISCRETEDICT:
                    data['file[0]'] = (file_name, FILE, 'application/vnd.ms-excel',)
                    data['file[1]'] = (metadata_file_name, METADATA, 'application/json',)
                    data['file[2]'] = (discreteDict_file_name, DISCRETEDICT, 'application/json',)
                    json, returned_json = apihandle()
    elif metadata_file_name:
        data['metadataFileName'] = metadata_file_name
        with open(file_path, 'rb') as FILE:
            with open(metadata_file_path, 'rb') as METADATA:
                data['file[0]'] = (file_name, FILE, 'application/vnd.ms-excel',)
                data['file[1]'] = (metadata_file_name, METADATA, 'application/json',)
                json, returned_json = apihandle()
    else:
        with open(file_path, 'rb') as FILE:
            data['file[0]'] = (file_name, FILE, 'application/vnd.ms-excel',)
            json, returned_json = apihandle()

    return Dataset(self.__api, json, returned_json)
def create_from_dataframe(self, name, dataframe, description='', modalities=2,
                          continuous_threshold=0.95, missing_threshold=0.95,
                          metadata=None, discreteDict=None, keepVariableName=None):
    """
    Create a Dataset from a Pandas DataFrame

    Args:
        name (str): The name of the dataset
        dataframe (pandas.DataFrame): The dataframe to import
        description (str): The dataset description, default is ''
        modalities (int): Modality threshold for discrete variables, default is 2
        continuous_threshold (float): Percentage of continuous values threshold for
            continuous variables, default is 0.95
        missing_threshold (float): Percentage of missing values threshold for ignored
            variables, default is 0.95
        metadata (dict): Metadata uploaded as a JSON file along with the dataset, default is None
        discreteDict (dict): Discrete dictionary uploaded as a JSON file along with the
            dataset, default is None
        keepVariableName: Passed through to the dataset creation payload when provided,
            default is None

    Returns:
        Dataset
    """
    project_id = self.__project_id
    file_name = '{}.csv'.format(uuid.uuid4())
    metadata_file_name = '{}.json'.format(uuid.uuid4())
    discreteDict_file_name = '{}.json'.format(uuid.uuid4())

    DECIMAL = "."
    SEPARATOR = "semicolon"
    ENCODING = "utf-8"

    sep = self.char_delimiters[self.string_delimiters.index(SEPARATOR)]
    stream_df = io.StringIO(dataframe.to_csv(sep=sep, index=False))

    import json

    if metadata:
        stream_metadata = io.StringIO()
        json.dump(metadata, stream_metadata)

    if discreteDict:
        stream_discreteDict = io.StringIO()
        json.dump(discreteDict, stream_discreteDict)

    data = {
        'name': name,
        'fileName': file_name,
        'decimalDelimiter': DECIMAL,
        'delimiter': SEPARATOR,
        'separator': SEPARATOR,
        'encoding': ENCODING,
        'usePython': description,
        'useSpark': 'False',
        'sourceFileName': file_name,
        'description': description,
        'size': '{}'.format(sys.getsizeof(dataframe)),
        'nbModalitiesThreshold': str(modalities),
        'percentageContinuousThreshold': str(continuous_threshold),
        'percentageMissingThreshold': str(missing_threshold)
    }
    if keepVariableName:
        data['keepVariableName'] = keepVariableName

    data['file[0]'] = (file_name, stream_df, 'application/vnd.ms-excel',)

    if metadata:
        data['metadataFileName'] = metadata_file_name
        data['file[1]'] = (metadata_file_name, stream_metadata, 'application/json',)

    if discreteDict:
        data['discreteDictFileName'] = discreteDict_file_name
        data['file[2]'] = (discreteDict_file_name, stream_discreteDict, 'application/json',)

    json_ = {'project_ID': project_id, 'data': data, 'streaming': True}
    creation_json = self.__api.Datasets.uploaddatasets(**json_)
    try:
        self.__api.handle_work_states(project_id,
                                      work_type='datasetValidation',
                                      query={"datasetId": creation_json.get('_id')})
    except Exception as E:
        raise ApiException('Unable to get the dataset validation status', str(E))
    try:
        self.__api.handle_work_states(project_id,
                                      work_type='datasetDescription',
                                      query={"datasetId": creation_json.get('_id')})
    except Exception as E:
        raise ApiException('Unable to get the dataset description status', str(E))

    returned_json = self.__api.Datasets.getadataset(project_ID=project_id,
                                                    dataset_ID=creation_json.get('_id'))
    return Dataset(self.__api, json_, returned_json)
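# Usage sketch (illustrative, not from the source): upload an in-memory pandas DataFrame as a
# new dataset; the column names are placeholders.
def _example_create_from_dataframe(dataset_factory):
    import pandas as pd
    df = pd.DataFrame({'age': [22, 35, 58], 'churn': ['no', 'yes', 'no']})
    return dataset_factory.create_from_dataframe('toy_churn', df, description='toy example')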
def display_curve(self, curve='ROC curve', title=None, model_line=None, random_line=None, legend=None):
    """
    Plot the selected curve of this model

    Args:
        curve (str): curve to be displayed, options are 'ROC curve', 'Gain curve', 'Lift curve',
            'Purity curve' and 'Precision Recall'. Default is 'ROC curve'.
        title (str): Title of the diagram. Default is a custom model name
        model_line (dict): display options of the model line,
            ex: dict(color=('rgb(205, 12, 24)'), dash='dash', width=1). Default is a blue line.
            See https://plot.ly/python/line-and-scatter/
        random_line (dict): display options of the random line. Default is a red dashed line.
        legend (dict): legend options, ex: dict(orientation="h") or dict(x=-.1, y=1.2).
            Default is at the right of the diagram. See https://plot.ly/python/legend/

    Returns:
        plot of the curve
    """
    try:
        import plotly.graph_objs as go
        import plotly.offline as py
        from plotly.offline import init_notebook_mode
    except ImportError as E:
        raise ApiException(
            'Plotly external package is required for this operation, please execute "!pip install plotly" and restart the kernel',
            str(E))

    if curve not in ['ROC curve', 'Gain curve', 'Lift curve', 'Purity curve', 'Precision Recall']:
        print('Unexpected curve type : {}, valid options are : {}'.format(
            curve, "'ROC curve', 'Gain curve', 'Lift curve', 'Purity curve', 'Precision Recall'"))
        return

    self.__load_confusion_matrix()
    x, y, x_name, y_name = self.__get_x_y_info(curve)
    init_notebook_mode(connected=False)

    if model_line:
        roc = go.Scatter(x=x, y=y, name='{}'.format(self.name), mode='lines', line=model_line)
    else:
        roc = go.Scatter(x=x, y=y, name='{}'.format(self.name), mode='lines')
    data = [roc]

    random_line_arg = random_line or dict(color=('rgb(205, 12, 24)'), dash='dash', width=1)
    if curve == 'ROC curve' or curve == 'Gain curve':
        random = go.Scatter(x=[0, 1], y=[0, 1], name='Random', mode='lines', line=random_line_arg)
    elif curve == 'Lift curve':
        random = go.Scatter(x=[0, 1], y=[1, 1], name='Random', mode='lines', line=random_line_arg)
    else:
        random = None
    if random:
        data.append(random)

    default_title = '{} of {}'.format(curve, self.name)
    if curve == 'ROC curve' or curve == 'Precision Recall':
        default_title = '{} (AUC = {:0.2f})'.format(default_title,
                                                    self.__json_confusion_matrix[curve][-1]['auc'])
    curve_title = title or default_title

    layout = dict(title=curve_title,
                  xaxis=dict(title=x_name, range=[0, 1]),
                  yaxis=dict(title=y_name, range=[0, max(y) + 0.05]))
    if legend:
        layout['legend'] = legend

    fig = dict(data=data, layout=layout)
    py.iplot(fig, validate=False)
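# Usage sketch (illustrative, not from the source): display the Gain curve with a custom title
# and a horizontal legend in a notebook.
def _example_display_curve(model):
    model.display_curve(curve='Gain curve',
                        title='Gain curve - churn model',
                        legend=dict(orientation='h'))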
def minimize(self, ruleset, minimization_name, score_to_minimize='Purity', increment_threshold=0.01):
    """
    minimize(ruleset, minimization_name, score_to_minimize='Purity', increment_threshold=0.01)

    Perform a minimization on a given ruleset.

    Args:
        ruleset (Ruleset): Ruleset to minimize
        minimization_name (str): Name of the new ruleset
        score_to_minimize (str): Score used for the minimization, default is 'Purity'
        increment_threshold (float): Percentage increment of target samples that a new rule
            must bring to be added to the minimized ruleset, default is 0.01

    Returns:
        Ruleset: Minimized ruleset
    """
    kpisList = ruleset.kpis.copy()
    json = {
        "type": "minimization",
        "datasetId": ruleset.dataset_id,
        "projectId": ruleset.project_id,
        "params": {
            "query": "tagsfilter={}".format(urllib.parse.quote(ruleset.name)),
            "taglist": [ruleset.name],
            "incrementThreshold": increment_threshold,
            "tag": minimization_name
        }
    }

    _kpiId = decode_kpiname_to_id(ruleset.kpis, score_to_minimize)
    if _kpiId != score_to_minimize:
        json['params']['kpiId'] = _kpiId

    _kpis_corr = self.__api.Kpi.getkpicorrelation(project_ID=ruleset.project_id)
    for _kpi in kpisList:
        _kpi_corr = next((_kpi_corr for _kpi_corr in _kpis_corr
                          if _kpi_corr.get('_id') == _kpi.get('kpiId')), {})
        _kpi.update(_kpi_corr)

    if kpisList[0].get('kpiType') in [RulesetFactory._CONTINUOUS, RulesetFactory._CONTINUOUS_RATIO]:
        raise ApiException(
            f'Unsupported target type in ruleset minimization: {kpisList[0].get("kpiType")}',
            f'Supported types: {RulesetFactory._DISCRETE_MODALITY}')

    json['params']['kpisList'] = kpisList
    _ruleset = self.__api.Task.createtask(project_ID=ruleset.project_id, json=json)
    self.__api.handle_work_states(ruleset.project_id,
                                  work_type='minimization',
                                  work_id=_ruleset.get('_id'))
    return self.get(minimization_name)
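# Usage sketch (illustrative, not from the source): minimize an existing ruleset on Purity with
# a 2% increment threshold.
def _example_minimize(ruleset_factory, ruleset):
    return ruleset_factory.minimize(ruleset, 'churn_rules_minimized',
                                    score_to_minimize='Purity',
                                    increment_threshold=0.02)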
def create(self, dataset, name, target, purity_min=None, coverage_min=None, lift_min=None,
           zscore_min=None, average_value_min=None, standard_deviation_max=None, shift_min=None,
           rule_complexity=2, quantiles=10, enable_custom_discretizations=True,
           min_marginal_contribution=None, compute_other_key_indicators=None,
           locally_increase_complexity=False, max_complexity=3, nb_minimizations=1,
           coverage_increment=0.01, validate_stability=False, split_ratio=0.7, nb_iterations=1,
           purity_tolerance=0.1):
    """
    Create a new ruleset

    Args:
        dataset (Dataset): Dataset used to generate the ruleset
        name (str): Name of the new ruleset
        target (Target): Target used to generate the ruleset
        purity_min (float): Minimum purity of rules, default is the purity of the target on
            the entire dataset (discrete target only)
        coverage_min (int): Minimum coverage of the target population for each rule,
            default is 10 (discrete target only)
        lift_min (float): Minimum lift, default is 1 (discrete target only)
        zscore_min (float): Minimum Z-score, default is None (discrete target only)
        average_value_min (float): Minimum average value, default is the average value of the
            target on the whole dataset (continuous target only)
        standard_deviation_max (float): Maximum standard deviation, default is None
            (continuous target only)
        shift_min (float): Minimum shift, default is None (continuous target only)
        rule_complexity (int): Maximum number of variables in rules, default is 2
        quantiles (int): Number of intervals the continuous variables are quantized in,
            default is 10
        enable_custom_discretizations (boolean): use custom discretizations, falling back to
            the 'quantiles' parameter for the remaining variables, default is True
        min_marginal_contribution (float): a new rule R', created by adding a new constraint
            to an existing rule R (and thus increasing its complexity), is added to the ruleset
            if and only if it increases the original purity of R by at least the minimum
            marginal contribution. Default is 0.1
        compute_other_key_indicators (list of KeyIndicatorOption): Compute other key indicators
        locally_increase_complexity (bool): Enable the local complexity increase when set to
            True. Default is False
        max_complexity (int): Maximum number of features per rule. Default is 3
        nb_minimizations (int): Iterate the minimization process. Default is 1
        coverage_increment (float): Percentage increment of target samples that a new rule
            must bring to be added to the minimized ruleset. Default is 0.01
        validate_stability (bool): Enable splitting the dataset, adding iterations and setting
            a purity tolerance when set to True. Default is False
        split_ratio (float): The percentage for the split (between 0 and 1). Default is 0.7
        nb_iterations (int): Number of iterations wanted. Default is 1
        purity_tolerance (float): Purity tolerance allowed (between 0 and 1). Default is 0.1

    Returns:
        Ruleset
    """
    variable = next(variable for variable in dataset.variables
                    if variable.name == target.variable_name)
    score_purity_min = None

    if variable.is_discrete:
        index = variable.modalities.index(target.modality)
        datasetPurity = variable.purities[index]
        score_purity_min = purity_min or round(datasetPurity, 3)
        if min_marginal_contribution is None:
            if score_purity_min > 0.99:
                min_marginal_contribution = round(1 / score_purity_min - 1, 3)
            elif score_purity_min > 0.9:
                min_marginal_contribution = round(0.99 / score_purity_min - 1, 3)
            else:
                min_marginal_contribution = 0.1
        coverage_min = coverage_min or (10 if (variable.frequencies[index] < 1000) else 0.01)
    else:
        min_marginal_contribution = 0.1

    if enable_custom_discretizations is True:
        discretizations = dataset._discretizations
    else:
        discretizations = {}

    if not compute_other_key_indicators:
        compute_other_key_indicators = []

    if not target:
        raise ApiException('You need a target to create a ruleset')
    if isinstance(target, Description):
        raise ApiException('Cannot perform a ruleset with a description kpi')

    data = {
        "projectId": self.__project_id,
        "task": {
            "type": "learning",
            "datasetId": dataset.dataset_id,
            "projectId": self.__project_id,
            "params": {
                "learningName": name,
                "datasetName": dataset.name,
                "buildPredictiveModel": 0,
                "sourceFileName": dataset.source_file_name,
                "delimiter": dataset.separator,
                "complexityExhaustive": rule_complexity,
                "countQuantiles": quantiles,
                "discretizations": discretizations,
                "minMarginalContribution": min_marginal_contribution,
                "target": [],
                "kpis": []
            }
        }
    }

    for _id, _type in zip(target.score_ids, target.scores):
        _kpiData = {
            "kpiId": _id,
            "type": _type,
            "kpiFamily": target.indicator_family,
            "scoreType": _type,
            "kpiType": target.indicator_type,
            "output": target.variable_name,
            "kpiName": target.name,
            "omodality": target.modality
        }
        if _type == self._PURITY and score_purity_min is not None:
            _kpiData['minValue'] = score_purity_min
        elif _type == self._COVERAGE and coverage_min is not None:
            _kpiData['minValue'] = coverage_min
        elif _type == self._LIFT and lift_min is not None:
            _kpiData['minValue'] = lift_min
        elif _type == self._ZSCORE and zscore_min is not None:
            _kpiData['minValue'] = zscore_min
        elif _type == self._AVERAGE_VALUE and average_value_min is not None:
            _kpiData['minValue'] = average_value_min
        elif _type == self._STANDARD_DEVIATION and standard_deviation_max is not None:
            _kpiData['maxValue'] = standard_deviation_max
        elif _type == self._SHIFT and shift_min is not None:
            _kpiData['minValue'] = shift_min
        data['task']['params']['target'].append(_kpiData)

    msg = "Ruleset settings: \n\t- Target: {}".format(target.name) + \
        ("\n\t- Min Purity: {}".format(score_purity_min) if score_purity_min is not None else "") + \
        ("\n\t- Min Coverage: {}".format(coverage_min) if coverage_min is not None else "") + \
        ("\n\t- Min Lift: {}".format(lift_min) if lift_min is not None else "") + \
        ("\n\t- Min Z-score: {}".format(zscore_min) if zscore_min is not None else "") + \
        ("\n\t- Min Average value: {}".format(average_value_min) if average_value_min is not None else "") + \
        ("\n\t- Max Standard deviation: {}".format(standard_deviation_max) if standard_deviation_max is not None else "") + \
        ("\n\t- Min Shift: {}".format(shift_min) if shift_min is not None else "") + \
        "\n\t- Rule Complexity: {}\n\t- Default Number of Bins: {}\n\t- Enable custom discretizations: {}\n\t- Min Marginal contribution: {}".format(
            rule_complexity, quantiles, enable_custom_discretizations, min_marginal_contribution)

    if len(compute_other_key_indicators) > 0:
        for key_indicator in compute_other_key_indicators:
            for _id, _type in zip(key_indicator.target.score_ids, key_indicator.target.scores):
                _kpiKI = {
                    "kpiId": _id,
                    "type": _type,
                    "kpiFamily": key_indicator.target.indicator_family,
                    "scoreType": _type,
                    "kpiType": key_indicator.target.indicator_type,
                    "output": key_indicator.target.variable_name,
                    "kpiName": key_indicator.target.name,
                    "omodality": key_indicator.target.modality
                }
                if (key_indicator.target.indicator_type == self._DISCRETE_MODALITY
                        or key_indicator.target.indicator_type == self._DISCRETE):
                    if _type == self._PURITY:
                        if key_indicator.purity_min is not None:
                            _kpiKI['minValue'] = key_indicator.purity_min
                        if key_indicator.purity_max is not None:
                            _kpiKI['maxValue'] = key_indicator.purity_max
                    elif _type == self._COVERAGE:
                        if key_indicator.coverage_min is not None:
                            _kpiKI['minValue'] = key_indicator.coverage_min
                        if key_indicator.coverage_max is not None:
                            _kpiKI['maxValue'] = key_indicator.coverage_max
                    elif _type == self._LIFT:
                        if key_indicator.lift_min is not None:
                            _kpiKI['minValue'] = key_indicator.lift_min
                        if key_indicator.lift_max is not None:
                            _kpiKI['maxValue'] = key_indicator.lift_max
                    elif _type == self._ZSCORE:
                        if key_indicator.zscore_min is not None:
                            _kpiKI['minValue'] = key_indicator.zscore_min
                        if key_indicator.zscore_max is not None:
                            _kpiKI['maxValue'] = key_indicator.zscore_max
                else:
                    if _type == self._AVERAGE_VALUE:
                        if key_indicator.average_value_min is not None:
                            _kpiKI['minValue'] = key_indicator.average_value_min
                        if key_indicator.average_value_max is not None:
                            _kpiKI['maxValue'] = key_indicator.average_value_max
                    elif _type == self._STANDARD_DEVIATION:
                        if key_indicator.standard_deviation_min is not None:
                            _kpiKI['minValue'] = key_indicator.standard_deviation_min
                        if key_indicator.standard_deviation_max is not None:
                            _kpiKI['maxValue'] = key_indicator.standard_deviation_max
                    elif _type == self._SHIFT:
                        if key_indicator.shift_min is not None:
                            _kpiKI['minValue'] = key_indicator.shift_min
                        if key_indicator.shift_max is not None:
                            _kpiKI['maxValue'] = key_indicator.shift_max
                if 'kpis' not in data['task']['params']:
                    data['task']['params']['kpis'] = []
                data['task']['params']['kpis'].append(_kpiKI)

    if locally_increase_complexity:
        data['task']['params']['maxComplexity'] = max_complexity
        data['task']['params']['nbMinimizations'] = nb_minimizations
        data['task']['params']['coverageIncrement'] = coverage_increment
        msg += "\n\t- Max complexity: {} \n\t- Number of Minimizations: {} \n\t- Minimization Coverage Increment: {}".format(
            max_complexity, nb_minimizations, coverage_increment)

    if validate_stability:
        data['task']['params']['percentageSplit'] = split_ratio
        data['task']['params']['nbModels'] = nb_iterations
        data['task']['params']['purityTolerance'] = purity_tolerance
        msg += "\n\t- Percentage split: {} \n\t- Number of Iterations: {} \n\t- Purity Tolerance: {}".format(
            split_ratio, nb_iterations, purity_tolerance)

    print(msg)
    _ruleset = self.__api.Task.createtask(project_ID=self.__project_id, json=data)
    self.__api.handle_work_states(self.__project_id,
                                  work_type='learning',
                                  work_id=_ruleset.get('_id'))
    return self.get(name)
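# Usage sketch (illustrative, not from the source): generate a ruleset with minimum purity and
# coverage constraints and a local complexity increase; the factory, dataset and target are
# assumptions.
def _example_create_ruleset(ruleset_factory, train_dataset, target):
    return ruleset_factory.create(
        dataset=train_dataset,
        name='churn_rules',
        target=target,
        purity_min=0.75,
        coverage_min=100,
        locally_increase_complexity=True,
        max_complexity=3,
        nb_minimizations=2)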
def create(self, name, file_path, decimal='.', delimiter=';', encoding='UTF-8', selectedSheet=1,
           description='', modalities=2, continuous_threshold=0.95, missing_threshold=0.95):
    """
    Create a Dataset from a file (csv, Excel)

    Args:
        name (str): The name of the dataset
        file_path (str): The origin path of the file
        decimal (str): Decimal separator - csv files only, default is '.'
        delimiter (str): The csv field delimiter - csv files only, default is ';'
        encoding (str): The file encoding - csv files only, default is 'UTF-8'
        selectedSheet (int): The worksheet to use (starts at 1 like in the HyperCube
            User Interface) - Excel files only, default is 1
        description (str): The dataset description, default is ''
        modalities (int): Modality threshold for discrete variables, default is 2
        continuous_threshold (float): Percentage of continuous values threshold for
            continuous variables, default is 0.95
        missing_threshold (float): Percentage of missing values threshold for ignored
            variables, default is 0.95

    Returns:
        Dataset
    """
    project_id = self.__project_id
    dataset_path, file_name = split(file_path)
    selectedSheet = max(1, selectedSheet)

    data = {
        'name': name,
        'fileName': file_name,
        'decimalDelimiter': decimal,
        'delimiter': delimiter,
        'separator': delimiter,
        'encoding': encoding,
        'usePython': description,
        'useSpark': 'False',
        'sourceFileName': file_name,
        'selectedSheet': str(selectedSheet),
        'description': description,
        'size': '{}'.format(getsize(file_path)),
        'nbModalitiesThreshold': str(modalities),
        'percentageContinuousThreshold': str(continuous_threshold),
        'percentageMissingThreshold': str(missing_threshold)
    }

    with open(file_path, 'rb') as FILE:
        data['file[0]'] = (file_name, FILE, 'application/vnd.ms-excel',)
        json = {'project_ID': project_id, 'data': data, 'streaming': True}
        creation_json = self.__api.Datasets.uploaddatasets(**json)
        print('\n')

    try:
        self.__api.handle_work_states(project_id,
                                      work_type='datasetValidation',
                                      query={"datasetId": creation_json.get('_id')})
        self.__api.handle_work_states(project_id,
                                      work_type='datasetDescription',
                                      query={"datasetId": creation_json.get('_id')})
    except Exception as E:
        raise ApiException('Unable to get the dataset status', str(E))

    returned_json = self.__api.Datasets.getadataset(project_ID=project_id,
                                                    dataset_ID=creation_json.get('_id'))
    return Dataset(self.__api, json, returned_json)
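# Usage sketch (illustrative, not from the source): create a dataset from a local CSV file that
# uses a comma as field delimiter; the file path is a placeholder.
def _example_create_dataset(dataset_factory):
    return dataset_factory.create(name='transactions',
                                  file_path='./data/transactions.csv',
                                  delimiter=',',
                                  encoding='UTF-8')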