Example #1
    def discretize(self, discretization_type=Variable.TYPE_EQUALFREQ, nb_bins=10):
        """
        Args:
            discretization_type (str): "equal-freq" or "equal-width", default is "equal-freq"
            nb_bins (int): number of bins to target after discretization, default is 10

        Returns:
            ContinuousVariable: variable that has been discretized
        """
        # Any existing discretization will be overridden by the new one
        if not isinstance(nb_bins, int):
            raise ApiException('Number of bins must be an integer')
        if discretization_type == Variable.TYPE_EQUALFREQ:
            data = [{'name': self.name, 'equalFreqBins': nb_bins}]
        elif discretization_type == Variable.TYPE_EQUALWIDTH:
            data = [{'name': self.name, 'equalWidthBins': nb_bins}]
        else:
            raise ApiException('Discretization Type does not exist: {}'.format(discretization_type))
        creation_json = self.__api.Datasets.discretize(project_ID=self.project_id, dataset_ID=self.dataset_id, json=data)

        try:
            self.__api.handle_work_states(self.project_id, work_type='discretization', work_id=creation_json.get('_id'))
        except Exception as E:
            raise ApiException('Unable to get the discretization status', str(E))

        self.__json_returned = self._update()
        return self
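
# Usage sketch (added for illustration, not from the source): assuming
# `dataset` is an existing Dataset and 'age' one of its continuous
# variables, re-discretize it into 5 equal-width bins.
age = next(v for v in dataset.variables if v.name == 'age')
age = age.discretize(discretization_type=Variable.TYPE_EQUALWIDTH, nb_bins=5)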
Example #2
        def apihandle():
            json = {'project_ID': project_id, 'data': data, 'streaming': True}

            creation_json = self.__api.Datasets.uploaddatasets(**json)
            print('\n')

            try:
                self.__api.handle_work_states(
                    project_id,
                    work_type='datasetValidation',
                    query={"datasetId": creation_json.get('_id')})
            except Exception as E:
                raise ApiException(
                    'Unable to get the dataset validation status', str(E))
            try:
                self.__api.handle_work_states(
                    project_id,
                    work_type='datasetDescription',
                    query={"datasetId": creation_json.get('_id')})
            except Exception as E:
                raise ApiException(
                    'Unable to get the dataset description status', str(E))

            returned_json = self.__api.Datasets.getadataset(
                project_ID=project_id, dataset_ID=creation_json.get('_id'))
            return json, returned_json
Example #3
    def __prepare_kpi_data(self, target):
        kpi_data = {
            "kpiName": target.name,
            "kpiType": target.indicator_type,
            "output": target.variable_name,
            "kpiFamily": target.indicator_family,
            "scoreType": "Shift" if target.indicator_type == TargetFactory.KPI_TYPE_CONTINUOUS else "Lift",
            "isMainKey": False,
        }

        if isinstance(target, Target):
            dictionary = dict(zip(target.scores, target.score_ids))
            if target.indicator_type in (TargetFactory.KPI_TYPE_DISCRETE, TargetFactory.KPI_TYPE_DISCRETE_MODALITY):
                kpi_data['_id'] = dictionary[TargetFactory.KPI_SCORE_PURITY]
                kpi_data["omodality"] = target.modality
            elif target.indicator_type == TargetFactory.KPI_TYPE_CONTINUOUS:
                kpi_data['_id'] = dictionary[TargetFactory.KPI_SCORE_AVERAGE_VALUE]
            else:
                raise ApiException('Unexpected target indicator type')

        elif isinstance(target, Description):
            kpi_data['_id'] = target.score_id
        else:
            raise ApiException('Unexpected target Structure')

        return kpi_data
Example #4
    def create(self,
               dataset,
               name,
               target=None,
               targets=None,
               quantiles=10,
               enable_custom_discretizations=True):
        """
        Args:
            dataset (Dataset) : dataset on which Xray will be created
            name (str): name of the Xray to create
            target (Target or Description): one target to generate the Xray
            targets (list of Target or Description): targets to generate the Xray (ignored if the 'target' parameter is defined)
            quantiles (int): Number of intervals the continuous variables are quantized in, default is 10
            enable_custom_discretizations (boolean): use the custom discretizations linked to the dataset; the "quantiles" parameter applies to the remaining variables, default is True

        Returns:
            Xray
        """
        if enable_custom_discretizations is True:
            discretizations = dataset._discretizations
        else:
            discretizations = {}

        if target is not None:
            kpis = [self.__prepare_kpi_data(target)]
        elif targets is not None:
            kpis = [self.__prepare_kpi_data(t) for t in targets]
        else:
            raise ApiException("Either 'target' or 'targets' must be defined")

        data = {
            "projectId": self.__project_id,
            "task": {
                "type": "simplelift",
                "datasetName": dataset.name,
                "datasetId": dataset.dataset_id,
                "projectId": self.__project_id,
                "params": {
                    "source": dataset.source_file_name,
                    "kpis": kpis,
                    "name": name,
                    "quantileOrder": quantiles,
                    "separator": dataset.separator,
                    "discretizations": discretizations
                }
            }
        }
        creation_json = self.__api.SimpleLift.newsimplelift(
            project_ID=self.__project_id, json=data)

        try:
            self.__api.handle_work_states(self.__project_id,
                                          work_type='simplelift',
                                          work_id=creation_json.get('_id'))
        except Exception as E:
            raise ApiException('Unable to get the X-ray status', str(E))

        return Xray(self.__api, creation_json)
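
# Usage sketch (added for illustration, not from the source): assuming
# `xray_factory` is an instance of this factory and `target` was created
# beforehand, build an Xray with 20 quantiles and no custom discretizations.
xray = xray_factory.create(dataset, 'churn_xray', target=target,
                           quantiles=20, enable_custom_discretizations=False)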
Example #5
    def split(self, train_ratio=0.7, random_state=42, keep_proportion_variable=None, train_dataset_name=None,
              train_dataset_desc=None, test_dataset_name=None, test_dataset_desc=None):
        """
        Split the dataset into two subsets for training and testing models.

        Args:
            train_ratio (float): ratio between training set size and original data set size
            random_state (int): seed used by the random number generator
            keep_proportion_variable (Variable): discrete variable whose modalities
                keep similar proportions in the training and test sets
            train_dataset_name (str): name of the training set
            train_dataset_desc (str): description of the training set
            test_dataset_name (str): name of the test set
            test_dataset_desc (str): description of the test set

        Returns:
            The new training and test datasets
        """
        if not self._is_deleted:
            if not 0 < train_ratio < 1:
                raise ApiException('train_ratio must be greater than 0 and lower than 1')

            if not 0 < random_state < 1001:
                raise ApiException('random_state must be greater than 0 and lower than 1001')

            if keep_proportion_variable and not keep_proportion_variable.is_discrete:
                raise ApiException('keep_proportion_variable must be a discrete variable')

            train_name = train_dataset_name or self.name + '_train'
            test_name = test_dataset_name or self.name + '_test'
            train_name, test_name = self.__get_unique_names(train_name, test_name)

            data = {
                'charactInvalidTest': '',
                'charactInvalidTrain': '',
                'dataset': self.__json_returned,
                'datasetId': self.dataset_id,
                'projectId': self.project_id,
                'randomState': random_state,
                'target': keep_proportion_variable._json if keep_proportion_variable else '',
                'testDescription': test_dataset_desc or 'Test set of dataset ' + self.name,
                'testName': test_name,
                'train': train_ratio,
                'trainDescription': train_dataset_desc or 'Train set of dataset ' + self.name,
                'trainName': train_name
            }
            json = {'project_ID': self.project_id, 'dataset_ID': self.dataset_id, 'json': data}
            split_json = self.__api.Datasets.split(**json)

            try:
                self.__api.handle_work_states(self.project_id, work_type='datasetSplit', work_id=split_json.get('id'))
            except Exception as E:
                raise ApiException('Unable to get the split status', str(E))

            factory = DatasetFactory(self.__api, self.project_id)
            return factory.get(train_name), factory.get(test_name)
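
# Usage sketch (added for illustration, not from the source): split a dataset
# 80/20 while preserving the modality proportions of a discrete variable;
# 'churn' is an illustrative variable name.
churn = next(v for v in dataset.variables if v.name == 'churn')
train, test = dataset.split(train_ratio=0.8, keep_proportion_variable=churn,
                            train_dataset_name='churn_train',
                            test_dataset_name='churn_test')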
Example #6
    def create_from_sql(self, name, connection_string, query, description='', modalities=2,
                        continuous_threshold=0.95, missing_threshold=0.95):
        """
        Create a Dataset from a sql database.
        Supported systems: PostgreSQL

        Args:
            name (str): The name of the dataset
            connection_string (str): The connection string to the database (format : 'postgresql://*****:*****@host:port/database')
            query (str): The query to execute to fetch the data (example: 'SELECT * FROM data_table')
            description (str): The dataset description, default is ''
            modalities (int): Modality threshold for discrete variables, default is 2
            continuous_threshold (float): % of continuous values threshold for continuous variables, default is 0.95
            missing_threshold (float): % of missing values threshold for ignored variables, default is 0.95

        Returns:
            Dataset
        """
        project_id = self.__project_id
        SEPARATOR = "semicolon"
        ENCODING = "utf-8"

        dataset_data = {
            'datasetName': name,
            'description': description,
            'cached': True,
            'separator': SEPARATOR,
            'encoding': ENCODING,
            'type': 'dbAccess',
            'dbSystem': 'pgsql',
            'query': query,
            'connectionString': connection_string
        }
        json = {'project_ID': project_id, 'json': dataset_data}
        creation_json = self.__api.Datasets.createdataset(**json)

        try:
            self.__api.handle_work_states(project_id, work_type='datasetValidation', query={"datasetId": creation_json.get('_id')})
        except Exception as E:
            raise ApiException('Unable to get the dataset validation status', str(E))
        try:
            self.__api.handle_work_states(project_id, work_type='datasetDescription', query={"datasetId": creation_json.get('_id')})
        except Exception as E:
            raise ApiException('Unable to get the dataset description status', str(E))

        returned_json = self.__api.Datasets.getadataset(project_ID=project_id, dataset_ID=creation_json.get('_id'))

        return Dataset(self.__api, json, returned_json)
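
# Usage sketch (added for illustration, not from the source): assuming
# `dataset_factory` is an instance of this factory; the connection string is
# a placeholder, not real credentials.
dataset = dataset_factory.create_from_sql(
    name='sales',
    connection_string='postgresql://user:password@localhost:5432/warehouse',
    query='SELECT * FROM sales_table',
    description='Sales data imported from PostgreSQL')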
Example #7
    def export_scores(self, path, variables=None):
        """
        Export the scores of this model to a csv file

        Args:
            path (str): the destination path for the exported scores
            variables (list of Variable): the variables of the dataset to add to the file. Default is None
        """
        data = {
            'datasetId': self.__json_returned.get('datasetId'),
            'columns': [variable.name for variable in variables] if variables else []
        }
        json = {'project_ID': self.project_id, 'model_ID': self.id, 'json': data}
        json_returned = self.__api.Prediction.postexportscores(**json)

        try:
            self.__api.handle_work_states(self.project_id, work_type=json_returned.get('type'), work_id=json_returned.get('_id'))
        except Exception as E:
            raise ApiException('Unable to export the model scores for ' + self.name, str(E))

        outputFile = json_returned.get('workParams').get('outputFile').split("_")[1]
        data = {
            'outputFile': outputFile
        }
        json = {'project_ID': self.project_id, 'model_ID': self.id, 'params': data}
        to_export = self.__api.Prediction.getexportscores(**json)

        with open(path, 'wb') as FILE_OUT:
            FILE_OUT.write(to_export)
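
# Usage sketch (added for illustration, not from the source): assuming
# `model` is a fitted model and `dataset` the dataset it was built on, export
# its scores together with two identifying variables (illustrative names).
id_vars = [v for v in dataset.variables if v.name in ('customer_id', 'churn')]
model.export_scores('/tmp/model_scores.csv', variables=id_vars)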
Example #8
def get_required_module(module_name):
    try:
        return importlib.import_module(module_name)
    except ModuleNotFoundError:
        warn_msg = 'The module {md} is missing and required for this function.\n'
        warn_msg = warn_msg + 'To install it on a notebook, execute "!pip install {md}" and restart the kernel'
        logging.warning(warn_msg.format(md=module_name))
        raise ApiException(f'Missing module for this function: {module_name}')
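
# Usage sketch (added for illustration, not from the source): lazily import
# an optional dependency, failing with a consistent ApiException if absent.
np = get_required_module('numpy')
print(np.arange(3))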
Example #9
    def create(self, variable, modality=None, scoreTypes=None):
        """
        create(variable, modality=None, scoreTypes=None)

        Create one target for the given variable.

        Args:
            variable (Variable): the variable defining the target
            modality (int or str or float): modality of the target if variable is discrete.
                Default is most frequent modality.
            scoreTypes (list of str): score types to be defined for the target.
                Default is [TargetFactory.KPI_SCORE_PURITY, TargetFactory.KPI_SCORE_COVERAGE] if variable is discrete and
                [TargetFactory.KPI_SCORE_AVERAGE_VALUE] if variable is continuous.

        Returns:
            Target: The new target
        """

        if variable.is_discrete:
            # checking if modality exists
            if modality is None:
                kpiModality = variable.top_modality
            else:
                if modality in variable.modalities:
                    kpiModality = modality
                else:
                    raise ApiException(
                        'Modality {} does not exist for variable {}'.format(
                            modality, variable.name))
            kpiScoreTypes = set(scoreTypes or []).union(
                {self.KPI_SCORE_PURITY, self.KPI_SCORE_COVERAGE})
        else:
            kpiModality = None
            kpiScoreTypes = set(scoreTypes
                                or []).union({self.KPI_SCORE_AVERAGE_VALUE})

        targetData = self.__get_target_data(variable, [kpiModality],
                                            kpiScoreTypes)
        data = {"kpis": targetData}
        json = {'project_ID': self.__project_id, 'json': data}
        returned_json = self.__api.Kpi.addkpi(**json)
        if variable.is_discrete:
            targets_returned_json = [
                kpi for kpi in returned_json['kpis']
                if kpi['kpiFamily'] == 'target' and kpi['variable'] ==
                variable.name and kpi['modality'] == kpiModality
            ]
        else:
            targets_returned_json = [
                kpi for kpi in returned_json['kpis']
                if kpi['kpiFamily'] == 'target'
                and kpi['variable'] == variable.name
            ]
        unique_targets_returned_json = unique_list(targets_returned_json)
        target_json = unique_targets_returned_json[0]

        return Target(self.__api, json, target_json)
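
# Usage sketch (added for illustration, not from the source): assuming
# `target_factory` is an instance of this factory, create a target on a
# discrete variable for a specific modality (illustrative names).
churn = next(v for v in dataset.variables if v.name == 'churn')
target = target_factory.create(churn, modality='yes')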
Example #10
    def get_confusion_matrix(self, top_score_ratio):
        if not 0 <= top_score_ratio <= 1:
            raise ApiException('top_score_ratio must be between 0 and 1 inclusive')

        self.__load_confusion_matrix()
        index = self.__get_index(top_score_ratio)
        values = self.__json_confusion_matrix['Lift curve'][index]
        return ConfusionMatrix(true_positives=values['TP'], false_positives=values['FP'],
                               true_negatives=values['TN'], false_negatives=values['FN'])
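
# Usage sketch (added for illustration, not from the source): confusion
# matrix for the top 10% of scored samples, assuming the ConfusionMatrix
# fields mirror its constructor arguments.
cm = model.get_confusion_matrix(0.1)
print(cm.true_positives, cm.false_positives)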
Example #11
    def predict_scores(self, dataset, keep_applied_model=False):
        """
        Predict target scores for input dataset

        Args:
            dataset (Dataset): the dataset containing the input samples.
            keep_applied_model (boolean): A HyperCube applied model is temporarily created to compute these scores,
                set this parameter to True if you want this model to be persisted. Default is False.

        Returns:
            a NumPy array of shape [n_samples,] where n_samples is the number of samples in the input dataset
        """
        applied_model = self.apply(dataset, '{}_applied_{}'.format(self.name, datetime.now().strftime("%Y-%m-%d_%H-%M-%S")))
        data = {
            'datasetId': dataset.dataset_id,
            'columns': []
        }
        json = {'project_ID': applied_model.project_id, 'model_ID': applied_model.id, 'json': data}
        json_returned = self.__api.Prediction.postexportscores(**json)

        try:
            self.__api.handle_work_states(applied_model.project_id, work_type=json_returned.get('type'), work_id=json_returned.get('_id'))
        except Exception as E:
            raise ApiException('Unable to get the model scores for {}'.format(self.name), str(E))

        outputFile = json_returned.get('workParams').get('outputFile').split("_")[1]
        data = {
            'outputFile': outputFile
        }
        json = {'project_ID': applied_model.project_id, 'model_ID': applied_model.id, 'params': data}
        scores = self.__api.Prediction.getexportscores(**json)
        scoreIO = StringIO(scores.decode('utf-8'))

        try:
            df = read_csv(scoreIO, sep=';', skiprows=1, usecols=[1])
        except Exception as E:
            raise ApiException('Unable to read the model scores for {}'.format(self.name), str(E))

        if not keep_applied_model:
            applied_model.delete()

        return reshape(df.values, (df.values.shape[0],))
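
# Usage sketch (added for illustration, not from the source): score a
# held-out dataset and keep the temporary applied model for later inspection;
# `test` is an illustrative Dataset.
scores = model.predict_scores(test, keep_applied_model=True)
print(scores.shape)  # (n_samples,)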
Example #12
    def export_dataframe(self):
        """
        Export the dataset to a Pandas DataFrame

        Returns:
            DataFrame
        """
        if not self._is_deleted:
            try:
                import pandas
            except ImportError as E:
                raise ApiException(
                    'Pandas is required for this operation, please execute "!pip install pandas" and restart the kernel',
                    str(E))
            _data = io.StringIO(self._export().decode('utf-8'))
            return pandas.read_csv(_data, sep=";")
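
# Usage sketch (added for illustration, not from the source): pull the
# dataset into pandas for local exploration.
df = dataset.export_dataframe()
print(df.head())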
Example #13
    def apply(self, dataset, applied_model_name, add_score_to_dataset=False, score_column_name=None):
        """
        Apply the HyperCube classifier model on a selected dataset

        Args:
            dataset (Dataset): Dataset the model is applied on
            applied_model_name (str): Name of the new applied model
            add_score_to_dataset (boolean): if set to True a new column containing the scores is added to the dataset.
                Default is False.
            score_column_name (str): name of the score column, used only if add_score_to_dataset is set to True

        Returns:
            the applied Model
        """

        params = dict(self.__json_returned)
        params['modelName'] = applied_model_name

        data = {
            'datasetId': dataset.dataset_id,
            'datasetName': dataset.name,
            'fromDatasetId': self.__json_returned.get('datasetId'),
            'modelId': self.__json_returned.get('_id'),
            'params': params,
            'projectId': dataset.project_id,
            'spark': False,
            'type': 'applyPrediction'
        }

        if add_score_to_dataset:
            data['saveScore'] = score_column_name or 'score_' + applied_model_name

        json = {'project_ID': dataset.project_id, 'json': data}
        json_returned = self.__api.Task.createtask(**json)
        try:
            self.__api.handle_work_states(dataset.project_id, work_type=json_returned.get('type'), work_id=json_returned.get('_id'))
        except Exception as E:
            raise ApiException('Unable to create the applied HyperCube model ' + applied_model_name, str(E))

        return HyperCube(self.__api, json_returned)
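
# Usage sketch (added for illustration, not from the source): apply a fitted
# HyperCube model to a new dataset and persist the scores as an extra column
# (illustrative names).
applied = model.apply(test, 'churn_model_applied',
                      add_score_to_dataset=True,
                      score_column_name='churn_score')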
Example #14
    def create_hypercube(self, dataset, name, target, purity_min=None, coverage_min=None, rule_complexity=2, quantiles=10, min_marginal_contribution=None,
                         max_complexity=3, nb_minimizations=1, coverage_increment=0.01, split_ratio=0.7, nb_iterations=1,
                         purity_tolerance=0.1, enable_custom_discretizations=True, save_all_rules=False):
        """
        Create a HyperCube classifier model

        Args:
            dataset (Dataset): Dataset the model is fitted on
            name (str): Name of the new model
            target (Target): Target used to generate the model
            purity_min (float): Minimum purity of rules, default is the entire dataset purity
            coverage_min (int): Minimum coverage of the target population for each rule, default is 10
            rule_complexity (int): Maximum number of variables in rules, default is 2
            quantiles (int): Number of bins for all continuous numeric variables during quantization, default is 10
            min_marginal_contribution (float): a new rule R', created by adding a new constraint to an existing rule R (and thus increasing its complexity),
                is added to the ruleset if and only if it increases the original purity of R by the minimum marginal contribution or more. Default is 0.1
            max_complexity (int): maximum number of variables contained in the rules created during the local complexity increase phase. Default is 3
            nb_minimizations (int): Number of minimizations to perform on the ruleset, default is 1
            coverage_increment (float): Percentage increment of target samples that a new rule must bring to be added to the minimized ruleset,
                default is 0.01
            split_ratio (float): the first step in the model generation is the random split of the original dataset into a learning (or train) dataset
                representing by default 70% of the original dataset, and a validation (or test) dataset containing the remaining 30%. Default is 0.7
            nb_iterations (int): The final model is the result of several models based on different splits of the original dataset, using a bootstrap method.
                The parameter "Number of iterations" corresponds to the number of these splits that are made. Default is 1
            purity_tolerance (float): maximum spread between the purities of the rules applied to the learning and validation datasets, default is 0.1
            enable_custom_discretizations (boolean): use the custom discretizations linked to the selected dataset;
                the "quantiles" parameter applies to the remaining variables. Default is True
            save_all_rules (boolean): save all generated rules in a new ruleset. Default is False

        Returns:
            the created model
        """
        variable = next(variable for variable in dataset.variables if variable.name == target.variable_name)
        index = variable.modalities.index(target.modality)
        datasetPurity = variable.purities[index]
        score_purity_min = purity_min or round(datasetPurity, 3)

        if min_marginal_contribution is None:
            if score_purity_min > 0.99:
                min_marginal_contribution = round(1 / score_purity_min - 1, 3)
            elif score_purity_min > 0.9:
                min_marginal_contribution = round(0.99 / score_purity_min - 1, 3)
            else:
                min_marginal_contribution = 0.1

        # parenthesized so a user-supplied coverage_min is always kept
        coverage_min = coverage_min or (10 if variable.frequencies[index] < 1000 else 0.01)

        if enable_custom_discretizations is True:
            discretizations = dataset._discretizations
        else:
            discretizations = {}

        scores = []
        for score_id, score_type in zip(target.score_ids, target.scores):
            score = {
                'deleted': False,
                'kpiFamily': target.indicator_family,
                'kpiName': target.name,
                'kpiType': target.indicator_type,
                'omodality': target.modality,
                'output': target.variable_name,
                'projectId': target.project_id,
                'scoreType': score_type,
                '_id': score_id
            }
            if score_type == self._PURITY:
                score['minValue'] = score_purity_min
            elif score_type == self._COVERAGE:
                score['minValue'] = coverage_min
            elif score_type == self._LIFT:
                score['minValue'] = 1
            scores.append(score)

        kpisel = {
            'datasetPurity': datasetPurity,
            'kpiFamily': target.indicator_family,
            'kpiName': target.name,
            'kpiType': target.indicator_type,
            'projectId': target.project_id,
            'scores': scores,
            'selectedBy': 'target'
        }

        params = {
            'algoType': 'HyperCube',
            'complexityExhaustive': rule_complexity,
            'countQuantiles': quantiles,
            'coverageIncrement': coverage_increment,
            'coverageThreshold': 10,
            'delimiter': 'semicolon',
            'discretizations': discretizations,
            'dtmaxdepth': 4,
            'elasticNetParam': 0,
            'enableCustomDiscretizations': enable_custom_discretizations,
            'featureSubsetStrategy': 'sqrt',
            'gbmaxdepth': 3,
            'gbn_estimators': 100,
            'kpis': [],
            'learning_rate': 0.1,
            'lrCost': 1,
            'maxComplexity': max_complexity,
            'maxDepth': 3,
            'maxIter': 100,
            'minInfoGain': 0,
            'minInstance': 1,
            'minMarginalContribution': min_marginal_contribution,
            'minObservation': 3,
            'missingValues': 0.1,
            'modelName': name,
            'nbMaxModality': 50,
            'nbMinObservation': 10,
            'nbMinimizations': nb_minimizations,
            'nbModels': nb_iterations,
            'numTrees': 10,
            'percentageSplit': split_ratio,
            'purityThreshold': score_purity_min,
            'purityTolerance': purity_tolerance,
            'regParam': 0,
            'replaceMissingValues': 'Median',
            'rfmaxdepth': 2,
            'rfn_estimators': 100,
            'saveAllRules': 1 if save_all_rules else 0,
            'sourceFileName': dataset.source_file_name,
            'splitRatio': split_ratio,
            'stepSize': 0.1,
            'subsamplingRate': 1,
            'target': [score for score in scores if score['scoreType'] == self._PURITY or score['scoreType'] == self._COVERAGE],
            'tol': 0.000001
        }

        data = {
            'algo': 'HyperCube',
            'algolist': ['HyperCube', 'LogisticRegression', 'DecisionTree', 'RandomForest', 'GradientBoosting'],
            'datasetId': dataset.dataset_id,
            'datasetName': dataset.name,
            'dtcr': 'gini',
            'enableCustomDiscretizations': enable_custom_discretizations,
            'gbloss': 'deviance',
            'keyIndicators': [],
            'kpisel': kpisel,
            'lrPenalty': 'l2',
            'lrSolver': 'liblinear',
            'modelFile': '',
            'modelName': name,
            'params': params,
            'projectId': dataset.project_id,
            'rfcr': 'gini',
            'saveAllRules': save_all_rules,
            'selectedDataset': dataset._json,
            'sourceFileName': '',
            'spark': False,
            'type': 'hypercubePrediction',
            'validTarget': True
        }

        json = {'project_ID': dataset.project_id, 'json': data}
        json_returned = self.__api.Task.createtask(**json)

        try:
            self.__api.handle_work_states(dataset.project_id, work_type=json_returned.get('type'), work_id=json_returned.get('_id'))
        except Exception as E:
            raise ApiException('Unable to create the HyperCube model ' + name, str(E))

        return HyperCube(self.__api, json_returned)
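
# Usage sketch (added for illustration, not from the source): assuming
# `model_factory` is an instance of this factory, fit a HyperCube classifier
# with slightly more complex rules and two minimization passes.
model = model_factory.create_hypercube(
    dataset, 'churn_hypercube', target,
    rule_complexity=3, nb_minimizations=2, split_ratio=0.8)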
Example #15
    def create(self, dataset, target, params):
        """
        Private method. Create a classifier or regressor Scikit-learn model

        Args:
            dataset (Dataset): Dataset the model is fitted on
            target (Target): Target used to generate the model
            params (dict): parameters used by the HyperWorker
        Returns:
            the created model
        """
        if params['modelType'] not in ModelTypes.LIST:
            print('Unexpected model type: {}, valid options are: {}'.format(
                params['modelType'], ', '.join(ModelTypes.LIST)))
            return
        if params['algoType'] not in AlgoTypes.LIST:
            print('Unexpected algo type: {}, valid options are: {}'.format(
                params['algoType'], ', '.join(AlgoTypes.LIST)))
            return

        if target.indicator_type == self._INDICATOR_DISCRETE_WITH_MODALITY:
            variable = next(variable for variable in dataset.variables
                            if variable.name == target.variable_name)
            index = variable.modalities.index(target.modality)
            datasetPurity = variable.purities[index]
            score_purity_min = round(datasetPurity, 3)
            coverage_min = 10 if (variable.frequencies[index] < 1000) else 0.01

        scores = []
        for score_id, score_type in zip(target.score_ids, target.scores):
            score = {
                'deleted': False,
                'kpiFamily': target.indicator_family,
                'kpiName': target.name,
                'kpiType': target.indicator_type,
                'output': target.variable_name,
                'projectId': target.project_id,
                'scoreType': score_type,
                '_id': score_id
            }
            if target.indicator_type == self._INDICATOR_DISCRETE_WITH_MODALITY:
                score['omodality'] = target.modality
                if score_type == self._PURITY:
                    score['minValue'] = score_purity_min
                elif score_type == self._COVERAGE:
                    score['minValue'] = coverage_min
                elif score_type == self._LIFT:
                    score['minValue'] = 1
                scores.append(score)
                scores = [
                    score for score in scores
                    if score['scoreType'] == self._PURITY
                    or score['scoreType'] == self._COVERAGE
                ]
            else:
                scores.append(score)

        kpisel = {
            'kpiFamily': target.indicator_family,
            'kpiName': target.name,
            'kpiType': target.indicator_type,
            'projectId': target.project_id,
            'selectedBy': 'target'
        }
        if target.indicator_type == self._INDICATOR_DISCRETE_WITH_MODALITY:
            kpisel['datasetPurity'] = datasetPurity

        params['sourceFileName'] = dataset.source_file_name
        params['target'] = scores
        data = {
            'datasetId': dataset.dataset_id,
            'datasetName': dataset.name,
            'kpi': kpisel,
            'modelName': params['modelName'],
            'params': params,
            'projectId': dataset.project_id,
            'selectedDataset': dataset._json,
            'type': 'automatedModels',
            'validTarget': True,
        }
        new_automodel = self.__api.AutomatedPrediction.createautomatedmodel(
            project_ID=self.__project_id, json=data)
        try:
            self.__api.handle_work_states(self.__project_id,
                                          work_type='automatedModels',
                                          work_id=new_automodel.get('workId'))
        except Exception as E:
            raise ApiException('Unable to get the automated model status',
                               str(E))

        return AutomatedModel(self.__api, new_automodel)
Example #16
    def create(self, name, file_path, decimal='.',
               delimiter='semicolon', encoding='UTF-8', selectedSheet=1,
               description='', modalities=2, continuous_threshold=0.95, missing_threshold=0.95,
               metadata_file_path=None, discreteDict_file_path=None, keepVariableName=None):
        """
        Create a Dataset from a file (csv, Excel)
        Args:
            name (str): The name of the dataset
            file_path (str): The origin path of the file
            decimal (str): Decimal separator - csv files only, default is '.'
            delimiter (str): The csv field delimiter - csv files only, default is 'semicolon'
            encoding (str): The file encoding - csv files only, default is 'UTF-8'
            selectedSheet (int): The worksheet to use (starts at 1 like in Hypercube User Interface) - Excel files only, default is 1
            description (str): The dataset description, default is ''
            modalities (int): Modality threshold for discrete variables, default is 2
            continuous_threshold (float): % of continuous values threshold for continuous variables, default is 0.95
            missing_threshold (float): % of missing values threshold for ignored variables, default is 0.95
            metadata_file_path (str): Optional path to a metadata json file describing the variables, default is None
            discreteDict_file_path (str): Optional path to a discretizations dictionary json file, default is None
            keepVariableName: Optional flag forwarded to the API to keep original variable names, default is None
        Returns:
            Dataset
        """

        project_id = self.__project_id
        _, file_name = split(file_path)
        if metadata_file_path:
            _, metadata_file_name = split(metadata_file_path)
        else:
            metadata_file_name = None
        if discreteDict_file_path:
            _, discreteDict_file_name = split(discreteDict_file_path)
        else:
            discreteDict_file_name = None
        selectedSheet = max(1, selectedSheet)

        # historically, delimiter/separator were stored as explicit strings in our database (ex: "semicolon")
        # we want to keep it that way
        if delimiter in self.char_delimiters:
            delimiter = self.string_delimiters[self.char_delimiters.index(delimiter)]
        elif delimiter not in self.string_delimiters:
            raise ApiException(f'Unsupported value for delimiter: {delimiter}', f'Supported values: {self.string_delimiters}')

        data = {
            'name': name,
            'fileName': file_name,
            'decimalDelimiter': decimal,
            'delimiter': delimiter,
            'separator': delimiter,
            'encoding': encoding,
            'usePython': description,
            'useSpark': 'False',
            'sourceFileName': file_name,
            'selectedSheet': str(selectedSheet),
            'description': description,
            'size': '{}'.format(getsize(file_path)),
            'nbModalitiesThreshold': str(modalities),
            'percentageContinuousThreshold': str(continuous_threshold),
            'percentageMissingThreshold': str(missing_threshold)
        }

        if keepVariableName:
            data['keepVariableName'] = keepVariableName

        def apihandle():
            json = {'project_ID': project_id, 'data': data, 'streaming': True}

            creation_json = self.__api.Datasets.uploaddatasets(**json)
            print('\n')

            try:
                self.__api.handle_work_states(project_id, work_type='datasetValidation', query={"datasetId": creation_json.get('_id')})
            except Exception as E:
                raise ApiException('Unable to get the dataset validation status', str(E))
            try:
                self.__api.handle_work_states(project_id, work_type='datasetDescription', query={"datasetId": creation_json.get('_id')})
            except Exception as E:
                raise ApiException('Unable to get the dataset description status', str(E))

            returned_json = self.__api.Datasets.getadataset(project_ID=project_id, dataset_ID=creation_json.get('_id'))
            return json, returned_json

        if metadata_file_name and discreteDict_file_name:
            data['metadataFileName'] = metadata_file_name
            data['discreteDictFileName'] = discreteDict_file_name
            with open(file_path, 'rb') as FILE:
                with open(metadata_file_path, 'rb') as METADATA:
                    with open(discreteDict_file_path, 'rb') as DISCRETEDICT:
                        data['file[0]'] = (
                            file_name,
                            FILE,
                            'application/vnd.ms-excel',
                        )
                        data['file[1]'] = (
                            metadata_file_name,
                            METADATA,
                            'application/json',
                        )
                        data['file[2]'] = (
                            discreteDict_file_name,
                            DISCRETEDICT,
                            'application/json',
                        )
                        json, returned_json = apihandle()
        elif metadata_file_name:
            data['metadataFileName'] = metadata_file_name
            with open(file_path, 'rb') as FILE:
                with open(metadata_file_path, 'rb') as METADATA:
                    data['file[0]'] = (
                        file_name,
                        FILE,
                        'application/vnd.ms-excel',
                    )
                    data['file[1]'] = (
                        metadata_file_name,
                        METADATA,
                        'application/json',
                    )
                    json, returned_json = apihandle()
        else:
            with open(file_path, 'rb') as FILE:
                data['file[0]'] = (
                    file_name,
                    FILE,
                    'application/vnd.ms-excel',
                )
                json, returned_json = apihandle()

        return Dataset(self.__api, json, returned_json)
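
# Usage sketch (added for illustration, not from the source): assuming
# `dataset_factory` is an instance of this factory, import a local csv file
# using the default semicolon delimiter; the path is a placeholder.
dataset = dataset_factory.create(
    name='titanic',
    file_path='/data/titanic.csv',
    description='Titanic passengers')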
Example #17
    def create_from_dataframe(self, name, dataframe, description='', modalities=2,
                              continuous_threshold=0.95, missing_threshold=0.95,
                              metadata=None, discreteDict=None, keepVariableName=None):
        """
        Create a Dataset from a Pandas DataFrame
        Args:
            name (str): The name of the dataset
            dataframe (pandas.DataFrame): The dataframe to import
            description (str): The dataset description, default is ''
            modalities (int): Modality threshold for discrete variables, default is 2
            continuous_threshold (float): % of continuous values threshold for continuous variables, default is 0.95
            missing_threshold (float): % of missing values threshold for ignored variables, default is 0.95
            metadata (dict): Optional metadata describing the variables, default is None
            discreteDict (dict): Optional discretizations dictionary, default is None
            keepVariableName: Optional flag forwarded to the API to keep original variable names, default is None
        Returns:
            Dataset
        """
        project_id = self.__project_id
        file_name = '{}.csv'.format(uuid.uuid4())
        metadata_file_name = '{}.json'.format(uuid.uuid4())
        discreteDict_file_name = '{}.json'.format(uuid.uuid4())
        DECIMAL = "."
        SEPARATOR = "semicolon"
        ENCODING = "utf-8"

        sep = self.char_delimiters[self.string_delimiters.index(SEPARATOR)]
        stream_df = io.StringIO(dataframe.to_csv(sep=sep, index=False))
        if metadata:
            import json
            stream_metadata = io.StringIO()
            json.dump(metadata, stream_metadata)
            if discreteDict:
                stream_discreteDict = io.StringIO()
                json.dump(discreteDict, stream_discreteDict)

        data = {
            'name': name,
            'fileName': file_name,
            'decimalDelimiter': DECIMAL,
            'delimiter': SEPARATOR,
            'separator': SEPARATOR,
            'encoding': ENCODING,
            'usePython': description,
            'useSpark': 'False',
            'sourceFileName': file_name,
            'description': description,
            'size': '{}'.format(sys.getsizeof(dataframe)),
            'nbModalitiesThreshold': str(modalities),
            'percentageContinuousThreshold': str(continuous_threshold),
            'percentageMissingThreshold': str(missing_threshold)
        }

        if keepVariableName:
            data['keepVariableName'] = keepVariableName

        data['file[0]'] = (
            file_name,
            stream_df,
            'application/vnd.ms-excel',
        )
        if metadata:
            data['metadataFileName'] = metadata_file_name
            data['file[1]'] = (
                metadata_file_name,
                stream_metadata,
                'application/json',
            )
            if discreteDict:
                data['discreteDictFileName'] = discreteDict_file_name
                data['file[2]'] = (
                    discreteDict_file_name,
                    stream_discreteDict,
                    'application/json',
                )
        json_ = {'project_ID': project_id, 'data': data, 'streaming': True}

        creation_json = self.__api.Datasets.uploaddatasets(**json_)
        try:
            self.__api.handle_work_states(project_id, work_type='datasetValidation', query={"datasetId": creation_json.get('_id')})
        except Exception as E:
            raise ApiException('Unable to get the dataset validation status', str(E))
        try:
            self.__api.handle_work_states(project_id, work_type='datasetDescription', query={"datasetId": creation_json.get('_id')})
        except Exception as E:
            raise ApiException('Unable to get the dataset description status', str(E))
        returned_json = self.__api.Datasets.getadataset(project_ID=project_id, dataset_ID=creation_json.get('_id'))

        return Dataset(self.__api, json_, returned_json)
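
# Usage sketch (added for illustration, not from the source): import a small
# pandas DataFrame as a new dataset (illustrative names).
import pandas as pd

frame = pd.DataFrame({'age': [22, 35, 58], 'churn': ['no', 'yes', 'no']})
dataset = dataset_factory.create_from_dataframe('mini', frame,
                                                description='toy example')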
Example #18
    def display_curve(self, curve='ROC curve', title=None, model_line=None, random_line=None, legend=None):
        """
        Plot the selected curve of this model

        Args:
            curve (str): curve to be displayed, options are 'ROC curve', 'Gain curve', 'Lift curve', 'Purity curve' and
                'Precision Recall'. Default is 'ROC curve'.
            title (str): Title of the diagram. Default is a generated title including the curve and model names
            model_line (dict): display options of model line, ex: dict(color=('rgb(205, 12, 24)'), dash='dash', width=1).
                Default is a blue line. see https://plot.ly/python/line-and-scatter/
            random_line (dict): display options of random line. Default is a red dash line.
            legend (dict): legend options, ex: dict(orientation="h") or dict(x=-.1, y=1.2).
                Default is at the right of the diagram. see https://plot.ly/python/legend/

        Returns:
            plot of the curve
        """

        try:
            import plotly.graph_objs as go
            import plotly.offline as py
            from plotly.offline import init_notebook_mode
        except ImportError as E:
            raise ApiException('Plotly external package is required for this operation, please execute "!pip install plotly" and restart the kernel', str(E))

        if curve not in ['ROC curve', 'Gain curve', 'Lift curve', 'Purity curve', 'Precision Recall']:
            print('Unexpected curve type : {}, valid options are : {}'.format(curve,
                  "'ROC curve', 'Gain curve', 'Lift curve', 'Purity curve', 'Precision Recall'"))
            return

        self.__load_confusion_matrix()

        x, y, x_name, y_name = self.__get_x_y_info(curve)

        init_notebook_mode(connected=False)
        if model_line:
            roc = go.Scatter(x=x, y=y, name='{}'.format(self.name), mode='lines', line=model_line)
        else:
            roc = go.Scatter(x=x, y=y, name='{}'.format(self.name), mode='lines')

        data = [roc]
        random_line_arg = random_line or dict(color=('rgb(205, 12, 24)'), dash='dash', width=1)
        if curve == 'ROC curve' or curve == 'Gain curve':
            random = go.Scatter(x=[0, 1], y=[0, 1], name='Random', mode='lines', line=random_line_arg)
        elif curve == 'Lift curve':
            random = go.Scatter(x=[0, 1], y=[1, 1], name='Random', mode='lines', line=random_line_arg)
        else:
            random = None

        if random:
            data.append(random)

        default_title = '{} of {}'.format(curve, self.name)
        if curve == 'ROC curve' or curve == 'Precision Recall':
            default_title = '{} (AUC = {:0.2f})'.format(default_title,
                                                        self.__json_confusion_matrix[curve][-1]['auc'])
        curve_title = title or default_title

        layout = dict(title=curve_title, xaxis=dict(title=x_name, range=[0, 1]),
                      yaxis=dict(title=y_name, range=[0, max(y) + 0.05]),)
        if legend:
            layout['legend'] = legend

        fig = dict(data=data, layout=layout)
        py.iplot(fig, validate=False)
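
# Usage sketch (added for illustration, not from the source): plot the Lift
# curve with a custom model line and a horizontal legend, assuming `model`
# exposes this method.
model.display_curve(curve='Lift curve',
                    model_line=dict(color='rgb(22, 96, 167)', width=2),
                    legend=dict(orientation='h'))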
Example #19
    def minimize(self,
                 ruleset,
                 minimization_name,
                 score_to_minimize='Purity',
                 increment_threshold=0.01):
        """
        minimize(ruleset, minimization_name, score_to_minimize='Purity', increment_threshold=0.01)

        Perform a minimization on a given ruleset.

        Args:
            ruleset (Ruleset): Ruleset to minimize
            minimization_name (str): Name of the new ruleset
            score_to_minimize (str): Score to apply the minimization, default is 'Purity'
            increment_threshold (float): Percentage increment of target samples that a new rule must bring to be added to the minimized ruleset, default is 0.01

        Returns:
            Ruleset: Minimized ruleset
        """
        kpisList = ruleset.kpis.copy()

        json = {
            "type": "minimization",
            "datasetId": ruleset.dataset_id,
            "projectId": ruleset.project_id,
            "params": {
                "query":
                "tagsfilter={}".format(urllib.parse.quote(ruleset.name)),
                "taglist": [ruleset.name],
                "incrementThreshold": increment_threshold,
                "tag": minimization_name
            }
        }
        _kpiId = decode_kpiname_to_id(ruleset.kpis, score_to_minimize)
        if _kpiId != score_to_minimize:
            json['params']['kpiId'] = _kpiId

        _kpis_corr = self.__api.Kpi.getkpicorrelation(
            project_ID=ruleset.project_id)

        for _kpi in kpisList:
            _kpi_corr = next((_kpi_corr for _kpi_corr in _kpis_corr
                              if _kpi_corr.get('_id') == _kpi.get('kpiId')),
                             {})
            _kpi.update(_kpi_corr)

        if kpisList[0].get('kpiType') in [
                RulesetFactory._CONTINUOUS, RulesetFactory._CONTINUOUS_RATIO
        ]:
            raise ApiException(
                f'Unsupported target type in ruleset minimization: {kpisList[0].get("kpiType")}',
                f'Supported types: {RulesetFactory._DISCRETE_MODALITY}')

        json['params']['kpisList'] = kpisList

        _ruleset = self.__api.Task.createtask(project_ID=ruleset.project_id,
                                              json=json)
        self.__api.handle_work_states(ruleset.project_id,
                                      work_type='minimization',
                                      work_id=_ruleset.get('_id'))
        return self.get(minimization_name)
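
# Usage sketch (added for illustration, not from the source): assuming
# `ruleset_factory` is an instance of this factory and `ruleset` an existing
# Ruleset, build a minimized ruleset with a stricter increment threshold.
minimized = ruleset_factory.minimize(ruleset, 'churn_rules_min',
                                     increment_threshold=0.05)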
Example #20
    def create(self,
               dataset,
               name,
               target,
               purity_min=None,
               coverage_min=None,
               lift_min=None,
               zscore_min=None,
               average_value_min=None,
               standard_deviation_max=None,
               shift_min=None,
               rule_complexity=2,
               quantiles=10,
               enable_custom_discretizations=True,
               min_marginal_contribution=None,
               compute_other_key_indicators=None,
               locally_increase_complexity=False,
               max_complexity=3,
               nb_minimizations=1,
               coverage_increment=0.01,
               validate_stability=False,
               split_ratio=0.7,
               nb_iterations=1,
               purity_tolerance=0.1):
        """
        Create a new ruleset

        Args:
            dataset (Dataset): Dataset used to generate the ruleset
            name (str): Name of the new ruleset
            target (Target): Target to generate the ruleset
            purity_min (float): Minimum purity of rules, default is the entire dataset purity (discrete target only)
            coverage_min (int): Minimum coverage of the target population for each rule, default is 10 (discrete target only)
            lift_min (float): Minimum lift, default is 1 (discrete target only)
            zscore_min (float): Minimum Z-score, default is None (discrete target only)
            average_value_min (float): Minimum average value, default is average value of the target on the whole dataset (continuous target only)
            standard_deviation_max (float) : Maximum standard deviation, default is None (continuous target only)
            shift_min (float): Minimum shift, default is None (continuous target only)
            rule_complexity (int): Maximum number of variables in rules, default is 2
            quantiles (int): Number of intervals the continuous variables are quantized in, default is 10
            enable_custom_discretizations (boolean): use custom discretizations, eventually use "quantiles" parameter for remaining variables, default is True
            min_marginal_contribution (float): a new rule R', created by adding a new constraint to an existing rule R (and thus increasing its complexity),
                is added to the ruleset if and only if it increases the original purity of R by the minimum marginal contribution or more. Default is 0.1
            compute_other_key_indicators (list of KeyIndicatorOption): Compute other Key Indicators.
            locally_increase_complexity (bool): Enable the local complexity increase when set to True. Default is False
            max_complexity (int): Maximum number of features per rule. Default is 3
            nb_minimizations (int): Number of iterations of the minimization process. Default is 1
            coverage_increment (float): Percentage increment of target samples that a new rule must bring to be added to the minimized ruleset.
                Default is 0.01
            validate_stability (bool): Split the dataset, add iterations and check a purity tolerance when set to True. Default is False
            split_ratio (float): The percentage for the split (between 0 and 1). Default is 0.7
            nb_iterations (int): Number of iterations wanted. Default is 1
            purity_tolerance (float): Purity tolerance allowed (between 0 and 1). Default is 0.1

        Returns:
            Ruleset
        """
        variable = next(variable for variable in dataset.variables
                        if variable.name == target.variable_name)
        score_purity_min = None
        if (variable.is_discrete):
            index = variable.modalities.index(target.modality)
            datasetPurity = variable.purities[index]
            score_purity_min = purity_min or round(datasetPurity, 3)

            if min_marginal_contribution is None:
                if score_purity_min > 0.99:
                    min_marginal_contribution = round(1 / score_purity_min - 1,
                                                      3)
                elif score_purity_min > 0.9:
                    min_marginal_contribution = round(
                        0.99 / score_purity_min - 1, 3)
                else:
                    min_marginal_contribution = 0.1

            # parenthesized so a user-supplied coverage_min is always kept
            coverage_min = coverage_min or (
                10 if variable.frequencies[index] < 1000 else 0.01)
        else:
            min_marginal_contribution = 0.1

        if enable_custom_discretizations is True:
            discretizations = dataset._discretizations
        else:
            discretizations = {}

        if not compute_other_key_indicators:
            compute_other_key_indicators = []

        if not target:
            raise ApiException('You need a target to create a ruleset')
        if isinstance(target, Description):
            raise ApiException(
                'Cannot perform a ruleset with a description kpi')

        data = {
            "projectId": self.__project_id,
            "task": {
                "type": "learning",
                "datasetId": dataset.dataset_id,
                "projectId": self.__project_id,
                "params": {
                    "learningName": name,
                    "datasetName": dataset.name,
                    "buildPredictiveModel": 0,
                    "sourceFileName": dataset.source_file_name,
                    "delimiter": dataset.separator,
                    "complexityExhaustive": rule_complexity,
                    "countQuantiles": quantiles,
                    "discretizations": discretizations,
                    "minMarginalContribution": min_marginal_contribution,
                    "target": [],
                    "kpis": []
                }
            }
        }

        for _id, _type in zip(target.score_ids, target.scores):
            _kpiData = {
                "kpiId": _id,
                "type": _type,
                "kpiFamily": target.indicator_family,
                "scoreType": _type,
                "kpiType": target.indicator_type,
                "output": target.variable_name,
                "kpiName": target.name,
                "omodality": target.modality
            }
            if _type == self._PURITY and score_purity_min is not None:
                _kpiData['minValue'] = score_purity_min
            elif _type == self._COVERAGE and coverage_min is not None:
                _kpiData['minValue'] = coverage_min
            elif _type == self._LIFT and lift_min is not None:
                _kpiData['minValue'] = lift_min
            elif _type == self._ZSCORE and zscore_min is not None:
                _kpiData['minValue'] = zscore_min
            elif _type == self._AVERAGE_VALUE and average_value_min is not None:
                _kpiData['minValue'] = average_value_min
            elif _type == self._STANDARD_DEVIATION and standard_deviation_max is not None:
                _kpiData['maxValue'] = standard_deviation_max
            elif _type == self._SHIFT and shift_min is not None:
                _kpiData['minValue'] = shift_min
            data['task']['params']['target'].append(_kpiData)

        msg = "Ruleset settings: \n\t- Target: {}".format(target.name) + \
              ("\n\t- Min Purity: {}".format(score_purity_min) if score_purity_min is not None else "") + \
              ("\n\t- Min Coverage: {}".format(coverage_min) if coverage_min is not None else "") + \
              ("\n\t- Min Lift: {}".format(lift_min) if lift_min is not None else "") + \
              ("\n\t- Min Z-score: {}".format(zscore_min) if zscore_min is not None else "") + \
              ("\n\t- Min Average value: {}".format(average_value_min) if average_value_min is not None else "") + \
              ("\n\t- Max Standard deviation: {}".format(standard_deviation_max) if standard_deviation_max is not None else "") + \
              ("\n\t- Min Shift: {}".format(shift_min) if shift_min is not None else "") + \
              "\n\t- Rule Complexity: {}\n\t- Default Number of Bins: {} \n\t- Enable custom discretizations: {}  \n\t- Min Marginal contribution: \
{}"   .format(rule_complexity, quantiles, enable_custom_discretizations, min_marginal_contribution)

        if (len(compute_other_key_indicators) > 0):
            for key_indicator in compute_other_key_indicators:
                for _id, _type in zip(key_indicator.target.score_ids,
                                      key_indicator.target.scores):
                    _kpiKI = {
                        "kpiId": _id,
                        "type": _type,
                        "kpiFamily": key_indicator.target.indicator_family,
                        "scoreType": _type,
                        "kpiType": key_indicator.target.indicator_type,
                        "output": key_indicator.target.variable_name,
                        "kpiName": key_indicator.target.name,
                        "omodality": key_indicator.target.modality
                    }
                    if key_indicator.target.indicator_type in (
                            self._DISCRETE_MODALITY, self._DISCRETE):
                        if _type == self._PURITY:
                            if key_indicator.purity_min is not None:
                                _kpiKI['minValue'] = key_indicator.purity_min
                            if key_indicator.purity_max is not None:
                                _kpiKI['maxValue'] = key_indicator.purity_max
                        elif _type == self._COVERAGE:
                            if key_indicator.coverage_min is not None:
                                _kpiKI['minValue'] = key_indicator.coverage_min
                            if key_indicator.coverage_max is not None:
                                _kpiKI['maxValue'] = key_indicator.coverage_max
                        elif _type == self._LIFT:
                            if key_indicator.lift_min is not None:
                                _kpiKI['minValue'] = key_indicator.lift_min
                            if key_indicator.lift_max is not None:
                                _kpiKI['maxValue'] = key_indicator.lift_max
                        elif _type == self._ZSCORE:
                            if key_indicator.zscore_min is not None:
                                _kpiKI['minValue'] = key_indicator.zscore_min
                            if key_indicator.zscore_max is not None:
                                _kpiKI['maxValue'] = key_indicator.zscore_max
                    else:
                        if _type == self._AVERAGE_VALUE:
                            if key_indicator.average_value_min is not None:
                                _kpiKI['minValue'] = key_indicator.average_value_min
                            if key_indicator.average_value_max is not None:
                                _kpiKI['maxValue'] = key_indicator.average_value_max
                        elif _type == self._STANDARD_DEVIATION:
                            if key_indicator.standard_deviation_min is not None:
                                _kpiKI['minValue'] = key_indicator.standard_deviation_min
                            if key_indicator.standard_deviation_max is not None:
                                _kpiKI['maxValue'] = key_indicator.standard_deviation_max
                        elif _type == self._SHIFT:
                            if key_indicator.shift_min is not None:
                                _kpiKI['minValue'] = key_indicator.shift_min
                            if key_indicator.shift_max is not None:
                                _kpiKI['maxValue'] = key_indicator.shift_max
                    data['task']['params']['kpis'].append(_kpiKI)

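        # Optional advanced parameters: a locally increased rule complexity
        # (with minimization settings) and a stability validation step
        # (percentage split, number of iterations, purity tolerance).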
        if locally_increase_complexity:
            data['task']['params']['maxComplexity'] = max_complexity
            data['task']['params']['nbMinimizations'] = nb_minimizations
            data['task']['params']['coverageIncrement'] = coverage_increment
            msg += "\n\t- Max complexity: {} \n\t- Number of Minimizations: {} \n\t- Minimization \
            Coverage Increment: {}".format(max_complexity, nb_minimizations,
                                           coverage_increment)
        if validate_stability:
            data['task']['params']['percentageSplit'] = split_ratio
            data['task']['params']['nbModels'] = nb_iterations
            data['task']['params']['purityTolerance'] = purity_tolerance
            msg += "\n\t- Percentage split: {} \n\t- Number of Iterations: {} \n\t- Purity Tolerance: {}".format(
                split_ratio, nb_iterations, purity_tolerance)

        print(msg)
        _ruleset = self.__api.Task.createtask(project_ID=self.__project_id,
                                              json=data)
        self.__api.handle_work_states(self.__project_id,
                                      work_type='learning',
                                      work_id=_ruleset.get('_id'))
        return self.get(name)
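    # Usage sketch (hypothetical entry points -- the factory instance, target
    # object and parameter values below are assumptions, not part of the SDK
    # excerpt above; the keyword names mirror the method's own parameters):
    #
    #   ruleset = ruleset_factory.create(
    #       name='high_churn_rules',
    #       target=churn_target,        # a Target with scores/score_ids set
    #       score_purity_min=0.7,
    #       coverage_min=100,
    #       rule_complexity=2,
    #       quantiles=10)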
Example #21
0
    def create(self,
               name,
               file_path,
               decimal='.',
               delimiter=';',
               encoding='UTF-8',
               selectedSheet=1,
               description='',
               modalities=2,
               continuous_threshold=0.95,
               missing_threshold=0.95):
        """
        Create a Dataset from a file (csv, Excel)

        Args:
            name (str): The name of the dataset
            file_path (str): The origin path of the file
            decimal (str): Decimal separator - csv files only, default is '.'
            delimiter (str): The csv field delimiter - csv files only, default is ';'
            encoding (str): The file encoding - csv files only, default is 'UTF-8'
            selectedSheet (int): The worksheet to use (starts at 1 like in Hypercube User Interface) - Excel files only, default is 1
            description (str): The dataset description, default is ''
            modalities (int): Modality threshold for discrete variables, default is 2
            continuous_threshold (float): Minimum proportion of continuous values for a variable to be typed as continuous, default is 0.95
            missing_threshold (float): Minimum proportion of missing values for a variable to be ignored, default is 0.95

        Returns:
            Dataset
        """

        project_id = self.__project_id
        dataset_path, file_name = split(file_path)
        selectedSheet = max(1, selectedSheet)

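        # Multipart form fields expected by the upload endpoint; every value
        # is serialized as a string.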
        data = {
            'name': name,
            'fileName': file_name,
            'decimalDelimiter': decimal,
            'delimiter': delimiter,
            'separator': delimiter,
            'encoding': encoding,
            'usePython': 'False',  # assumed boolean-string flag, mirroring 'useSpark' below
            'useSpark': 'False',
            'sourceFileName': file_name,
            'selectedSheet': str(selectedSheet),
            'description': description,
            'size': '{}'.format(getsize(file_path)),
            'nbModalitiesThreshold': str(modalities),
            'percentageContinuousThreshold': str(continuous_threshold),
            'percentageMissingThreshold': str(missing_threshold)
        }

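        # The file itself is attached as a multipart tuple (filename, stream,
        # MIME type) and streamed to the server while the handle is open.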
        with open(file_path, 'rb') as FILE:
            data['file[0]'] = (
                file_name,
                FILE,
                'application/vnd.ms-excel',
            )
            json = {'project_ID': project_id, 'data': data, 'streaming': True}

            creation_json = self.__api.Datasets.uploaddatasets(**json)
            print('\n')

            try:
                self.__api.handle_work_states(
                    project_id,
                    work_type='datasetValidation',
                    query={"datasetId": creation_json.get('_id')})
                self.__api.handle_work_states(
                    project_id,
                    work_type='datasetDescription',
                    query={"datasetId": creation_json.get('_id')})
            except Exception as E:
                raise ApiException('Unable to get the dataset status', str(E))

            returned_json = self.__api.Datasets.getadataset(
                project_ID=project_id, dataset_ID=creation_json.get('_id'))

        return Dataset(self.__api, json, returned_json)
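    # Usage sketch: assuming `datasets` is an instance of this factory (how it
    # is obtained depends on the SDK entry point), a csv upload might look like
    # this; the name and path are placeholders:
    #
    #   dataset = datasets.create(
    #       name='sales_2019',
    #       file_path='/tmp/sales_2019.csv',
    #       delimiter=',',
    #       encoding='UTF-8')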