Example no. 1
    def _setup(self, query, host='localhost', user='******', password=None,
               port=8123, protocol='http'):

        if protocol not in ('https', 'http'):
            raise ValueError('Unexpected protocol {}'.format(protocol))

        if ' format ' in query.lower():
            err_msg = 'Please refrain from adding a "FORMAT" statement to the query'
            log.error(err_msg)
            raise Exception(err_msg)
        
        query = '{} FORMAT JSON'.format(query.rstrip(" ;\n"))
        log.info(f'Getting data via the query: "{query}"')

        params = {'user': user}
        if password is not None:
            params['password'] = password

        response = requests.post(f'{protocol}://{host}:{port}', data=query, params=params)
        
        try:
            data = response.json()['data']
        except (ValueError, KeyError):
            log.error(f'Got an invalid response from the database: {response.text}')
            raise Exception(response.text)

        df = pd.DataFrame(data)
        
        # identity column map: every column keeps its original name
        col_map = {col: col for col in df.columns}

        return df, col_map
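
For context, this method leans on ClickHouse's HTTP interface: appending FORMAT JSON makes the server answer with a JSON document whose 'data' key holds the rows. A self-contained sketch of that round trip, assuming a default-configured ClickHouse server listening on localhost:8123 (the query and user here are illustrative, not taken from this codebase):

import requests
import pandas as pd

# Hedged sketch: assumes a local ClickHouse with the stock 'default' user.
query = 'SELECT number AS n FROM system.numbers LIMIT 3 FORMAT JSON'
response = requests.post('http://localhost:8123', data=query,
                         params={'user': 'default'})
df = pd.DataFrame(response.json()['data'])
print(df)  # three rows with a single column 'n'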
Example no. 2
def getDS(from_data):
    '''
    Get a datasource given the input

    :param from_data: a string, a DataFrame, or a DataSource
    :return: a datasource
    '''

    if isinstance(from_data, DataSource):
        from_ds = from_data

    elif isinstance(from_data, DataFrame):
        from_ds = DataSource(from_data)

    else:  # assume it's a file path
        from_ds = FileDS(from_data)
        if from_ds is None:
            log.error('No data matched the input data')

    return from_ds
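
The function is a small type dispatcher: an existing DataSource passes through, a pandas DataFrame gets wrapped, and anything else is treated as a file path. A sketch of the three call shapes, where the CSV path is hypothetical and DataSource/FileDS come from the surrounding package:

import pandas as pd

ds_from_frame = getDS(pd.DataFrame({'a': [1, 2, 3]}))  # wrapped in a DataSource
ds_from_file = getDS('data/train.csv')                 # hypothetical path, routed to FileDS
ds_passthrough = getDS(ds_from_frame)                  # DataSource instances pass through unchanged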
Example no. 3
    def run(self):
        """
        Run the model on the validation set in order to fit a probabilistic
        model that will evaluate the accuracy of future predictions.
        """
        np.seterr(divide='warn', invalid='warn')

        output_columns = self.transaction.lmd['predict_columns']
        input_columns = [
            col for col in self.transaction.lmd['columns']
            if col not in output_columns
            and col not in self.transaction.lmd['columns_to_ignore']
        ]

        # Make predictions on the validation dataset normally and with various columns missing
        normal_predictions = self.transaction.model_backend.predict('validate')

        normal_predictions_test = self.transaction.model_backend.predict(
            'test')
        normal_accuracy = evaluate_accuracy(
            normal_predictions,
            self.transaction.input_data.validation_df,
            self.transaction.lmd['stats_v2'],
            output_columns,
            backend=self.transaction.model_backend)

        for col in output_columns:
            reals = self.transaction.input_data.validation_df[col]
            preds = normal_predictions[col]

            fails = False

            data_type = self.transaction.lmd['stats_v2'][col]['typing'][
                'data_type']
            data_subtype = self.transaction.lmd['stats_v2'][col]['typing'][
                'data_subtype']

            if data_type == DATA_TYPES.CATEGORICAL:
                guess_prob = self.transaction.lmd['stats_v2'][col]['guess_probability']
                if data_subtype == DATA_SUBTYPES.TAGS:
                    encoder = self.transaction.model_backend.predictor._mixer.encoders[col]
                    if accuracy_score(encoder.encode(reals),
                                      encoder.encode(preds)) <= guess_prob:
                        fails = True
                else:
                    if accuracy_score(reals, preds) <= guess_prob:
                        fails = True
            elif data_type == DATA_TYPES.NUMERIC:
                if r2_score(reals, preds) < 0:
                    fails = True
            else:
                pass  # other data types are not sanity-checked here

            if fails:
                if not self.transaction.lmd['force_predict']:

                    def predict_wrapper(*args, **kwargs):
                        raise Exception('Failed to train model')

                    self.session.predict = predict_wrapper
                log.error('Failed to train model to predict {}'.format(col))

        empty_input_predictions = {}
        empty_input_accuracy = {}
        empty_input_predictions_test = {}

        ignorable_input_columns = [
            x for x in input_columns
            if self.transaction.lmd['stats_v2'][x]['typing']['data_type'] != DATA_TYPES.FILE_PATH
            and x not in [y[0] for y in self.transaction.lmd['model_order_by']]
        ]

        for col in ignorable_input_columns:
            empty_input_predictions[col] = self.transaction.model_backend.predict(
                'validate', ignore_columns=[col])
            empty_input_predictions_test[col] = self.transaction.model_backend.predict(
                'test', ignore_columns=[col])
            empty_input_accuracy[col] = evaluate_accuracy(
                empty_input_predictions[col],
                self.transaction.input_data.validation_df,
                self.transaction.lmd['stats_v2'],
                output_columns,
                backend=self.transaction.model_backend)

        # Get some information about the importance of each column
        self.transaction.lmd['column_importances'] = {}
        for col in ignorable_input_columns:
            accuracy_increase = (normal_accuracy - empty_input_accuracy[col])
            # normalize from 0 to 10
            self.transaction.lmd['column_importances'][col] = 10 * max(
                0, accuracy_increase)

        # Run Probabilistic Validator
        overall_accuracy_arr = []
        self.transaction.lmd['accuracy_histogram'] = {}
        self.transaction.lmd['confusion_matrices'] = {}
        self.transaction.lmd['accuracy_samples'] = {}
        self.transaction.hmd['probabilistic_validators'] = {}

        self.transaction.lmd['train_data_accuracy'] = {}
        self.transaction.lmd['test_data_accuracy'] = {}
        self.transaction.lmd['valid_data_accuracy'] = {}

        for col in output_columns:

            # Training data accuracy
            predictions = self.transaction.model_backend.predict(
                'predict_on_train_data',
                ignore_columns=self.transaction.lmd['stats_v2']
                ['columns_to_ignore'])
            self.transaction.lmd['train_data_accuracy'][
                col] = evaluate_accuracy(
                    predictions,
                    self.transaction.input_data.train_df,
                    self.transaction.lmd['stats_v2'], [col],
                    backend=self.transaction.model_backend)

            # Testing data accuracy
            predictions = self.transaction.model_backend.predict(
                'test',
                ignore_columns=self.transaction.lmd['stats_v2']
                ['columns_to_ignore'])
            self.transaction.lmd['test_data_accuracy'][
                col] = evaluate_accuracy(
                    predictions,
                    self.transaction.input_data.test_df,
                    self.transaction.lmd['stats_v2'], [col],
                    backend=self.transaction.model_backend)

            # Validation data accuracy
            predictions = self.transaction.model_backend.predict(
                'validate',
                ignore_columns=self.transaction.lmd['stats_v2']
                ['columns_to_ignore'])
            self.transaction.lmd['valid_data_accuracy'][
                col] = evaluate_accuracy(
                    predictions,
                    self.transaction.input_data.validation_df,
                    self.transaction.lmd['stats_v2'], [col],
                    backend=self.transaction.model_backend)

        for col in output_columns:
            pval = ProbabilisticValidator(
                col_stats=self.transaction.lmd['stats_v2'][col],
                col_name=col,
                input_columns=input_columns)
            predictions_arr = [normal_predictions_test] + list(
                empty_input_predictions_test.values())

            pval.fit(self.transaction.input_data.test_df, predictions_arr,
                     [[ignored_column]
                      for ignored_column in empty_input_predictions_test])
            (overall_accuracy, accuracy_histogram, cm,
             accuracy_samples) = pval.get_accuracy_stats()
            overall_accuracy_arr.append(overall_accuracy)

            self.transaction.lmd['accuracy_histogram'][
                col] = accuracy_histogram
            self.transaction.lmd['confusion_matrices'][col] = cm
            self.transaction.lmd['accuracy_samples'][col] = accuracy_samples
            self.transaction.hmd['probabilistic_validators'][col] = pickle_obj(
                pval)

        self.transaction.lmd['validation_set_accuracy'] = sum(
            overall_accuracy_arr) / len(overall_accuracy_arr)
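
The column-importance rule above is simple: drop one input column, re-score on the validation set, and scale the accuracy loss into a 0-10 range. A toy sketch with made-up accuracy numbers:

# Toy numbers only; in the real code these come from evaluate_accuracy.
normal_accuracy = 0.82
empty_input_accuracy = {'age': 0.47, 'zipcode': 0.81}

column_importances = {
    col: 10 * max(0, normal_accuracy - acc)
    for col, acc in empty_input_accuracy.items()
}
print(column_importances)  # roughly {'age': 3.5, 'zipcode': 0.1}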
Example no. 4
    def run(self):
        """
        Run the model on the validation set in order to fit a probabilistic
        model that will evaluate the accuracy of future predictions.
        """
        np.seterr(divide='warn', invalid='warn')

        output_columns = self.transaction.lmd['predict_columns']
        input_columns = [
            col for col in self.transaction.lmd['columns']
            if col not in output_columns
            and col not in self.transaction.lmd['columns_to_ignore']
        ]

        # Make predictions on the validation dataset normally and with various columns missing
        normal_predictions = self.transaction.model_backend.predict('validate')

        normal_predictions_test = self.transaction.model_backend.predict(
            'test')
        normal_accuracy = evaluate_accuracy(
            normal_predictions,
            self.transaction.input_data.validation_df,
            self.transaction.lmd['stats_v2'],
            output_columns,
            backend=self.transaction.model_backend)

        for col in output_columns:
            if self.transaction.lmd['tss']['is_timeseries']:
                val_df = self.transaction.input_data.validation_df
                reals = list(val_df[val_df['make_predictions'] == True][col])
            else:
                reals = self.transaction.input_data.validation_df[col]
            preds = normal_predictions[col]

            fails = False

            data_type = self.transaction.lmd['stats_v2'][col]['typing'][
                'data_type']
            data_subtype = self.transaction.lmd['stats_v2'][col]['typing'][
                'data_subtype']

            if data_type == DATA_TYPES.CATEGORICAL:
                balanced_guess_prob = self.transaction.lmd['stats_v2'][col][
                    'balanced_guess_probability']
                if data_subtype == DATA_SUBTYPES.TAGS:
                    encoder = self.transaction.model_backend.predictor._mixer.encoders[col]
                    if balanced_accuracy_score(
                            encoder.encode(reals).argmax(axis=1),
                            encoder.encode(preds).argmax(axis=1)) <= balanced_guess_prob:
                        fails = True
                else:
                    if balanced_accuracy_score(reals, preds) <= balanced_guess_prob:
                        fails = True
            elif data_type == DATA_TYPES.NUMERIC:
                if r2_score(reals, preds) < 0:
                    fails = True
            else:
                pass  # other data types are not sanity-checked here

            if fails:
                if not self.transaction.lmd['force_predict']:

                    def predict_wrapper(*args, **kwargs):
                        raise Exception('Failed to train model')

                    self.session.predict = predict_wrapper
                log.error('Failed to train model to predict {}'.format(col))

        empty_input_predictions = {}
        empty_input_accuracy = {}
        empty_input_predictions_test = {}

        ignorable_input_columns = [
            x for x in input_columns
            if self.transaction.lmd['stats_v2'][x]['typing']['data_type'] != DATA_TYPES.FILE_PATH
            and (not self.transaction.lmd['tss']['is_timeseries']
                 or x not in self.transaction.lmd['tss']['order_by'])
        ]

        for col in ignorable_input_columns:
            empty_input_predictions[col] = self.transaction.model_backend.predict(
                'validate', ignore_columns=[col])
            empty_input_predictions_test[col] = self.transaction.model_backend.predict(
                'test', ignore_columns=[col])
            empty_input_accuracy[col] = evaluate_accuracy(
                empty_input_predictions[col],
                self.transaction.input_data.validation_df,
                self.transaction.lmd['stats_v2'],
                output_columns,
                backend=self.transaction.model_backend)

        # Get some information about the importance of each column
        self.transaction.lmd['column_importances'] = {}
        for col in ignorable_input_columns:
            accuracy_increase = (normal_accuracy - empty_input_accuracy[col])
            # normalize from 0 to 10
            self.transaction.lmd['column_importances'][col] = 10 * max(
                0, accuracy_increase)

        # Run Probabilistic Validator
        overall_accuracy_arr = []
        self.transaction.lmd['accuracy_histogram'] = {}
        self.transaction.lmd['confusion_matrices'] = {}
        self.transaction.lmd['accuracy_samples'] = {}
        self.transaction.hmd['probabilistic_validators'] = {}

        self.transaction.lmd['train_data_accuracy'] = {}
        self.transaction.lmd['test_data_accuracy'] = {}
        self.transaction.lmd['valid_data_accuracy'] = {}

        for col in output_columns:

            # Training data accuracy
            predictions = self.transaction.model_backend.predict(
                'predict_on_train_data',
                ignore_columns=self.transaction.lmd['stats_v2']
                ['columns_to_ignore'])
            self.transaction.lmd['train_data_accuracy'][
                col] = evaluate_accuracy(
                    predictions,
                    self.transaction.input_data.train_df,
                    self.transaction.lmd['stats_v2'], [col],
                    backend=self.transaction.model_backend)

            # Testing data accuracy
            predictions = self.transaction.model_backend.predict(
                'test',
                ignore_columns=self.transaction.lmd['stats_v2']
                ['columns_to_ignore'])
            self.transaction.lmd['test_data_accuracy'][
                col] = evaluate_accuracy(
                    predictions,
                    self.transaction.input_data.test_df,
                    self.transaction.lmd['stats_v2'], [col],
                    backend=self.transaction.model_backend)

            # Validation data accuracy
            predictions = self.transaction.model_backend.predict(
                'validate',
                ignore_columns=self.transaction.lmd['stats_v2']
                ['columns_to_ignore'])
            self.transaction.lmd['valid_data_accuracy'][
                col] = evaluate_accuracy(
                    predictions,
                    self.transaction.input_data.validation_df,
                    self.transaction.lmd['stats_v2'], [col],
                    backend=self.transaction.model_backend)

        for col in output_columns:
            pval = ProbabilisticValidator(
                col_stats=self.transaction.lmd['stats_v2'][col],
                col_name=col,
                input_columns=input_columns)
            predictions_arr = [normal_predictions_test] + list(
                empty_input_predictions_test.values())

            pval.fit(self.transaction.input_data.test_df, predictions_arr,
                     [[ignored_column]
                      for ignored_column in empty_input_predictions_test])
            (overall_accuracy, accuracy_histogram, cm,
             accuracy_samples) = pval.get_accuracy_stats()
            overall_accuracy_arr.append(overall_accuracy)

            self.transaction.lmd['accuracy_histogram'][
                col] = accuracy_histogram
            self.transaction.lmd['confusion_matrices'][col] = cm
            self.transaction.lmd['accuracy_samples'][col] = accuracy_samples
            self.transaction.hmd['probabilistic_validators'][col] = pickle_obj(
                pval)

        self.transaction.lmd['validation_set_accuracy'] = sum(
            overall_accuracy_arr) / len(overall_accuracy_arr)

        # conformal prediction confidence estimation
        self.transaction.lmd['stats_v2']['train_std_dev'] = {}
        self.transaction.hmd['label_encoders'] = {}
        self.transaction.hmd['icp'] = {'active': False}

        for target in output_columns:
            data_type = self.transaction.lmd['stats_v2'][target]['typing'][
                'data_type']
            data_subtype = self.transaction.lmd['stats_v2'][target]['typing'][
                'data_subtype']
            is_classification = data_type == DATA_TYPES.CATEGORICAL

            fit_params = {
                'target': target,
                'all_columns': self.transaction.lmd['columns'],
                'columns_to_ignore': []
            }
            fit_params['columns_to_ignore'].extend(
                self.transaction.lmd['columns_to_ignore'])
            fit_params['columns_to_ignore'].extend(
                [col for col in output_columns if col != target])

            if is_classification:
                if data_subtype != DATA_SUBTYPES.TAGS:
                    # gather every class observed across the three splits
                    all_targets = [
                        getattr(self.transaction.input_data, df_name)[target].values
                        for df_name in ('train_df', 'test_df', 'validation_df')
                    ]
                    all_classes = np.unique(
                        np.concatenate([np.unique(arr) for arr in all_targets]))

                    enc = OneHotEncoder(sparse=False, handle_unknown='ignore')
                    enc.fit(all_classes.reshape(-1, 1))
                    fit_params['one_hot_enc'] = enc
                    self.transaction.hmd['label_encoders'][target] = enc
                else:
                    fit_params['one_hot_enc'] = None
                    self.transaction.hmd['label_encoders'][target] = None

                adapter = ConformalClassifierAdapter
                # MarginErrFunc is preferred over IPS here, as IPS would need
                # the complete distribution over all classes
                nc_function = MarginErrFunc()
                nc_class = ClassifierNc
                icp_class = IcpClassifier

            else:
                adapter = ConformalRegressorAdapter
                nc_function = AbsErrorErrFunc()
                nc_class = RegressorNc
                icp_class = IcpRegressor

            if (data_type == DATA_TYPES.NUMERIC or
                (is_classification and data_subtype != DATA_SUBTYPES.TAGS)
                ) and not self.transaction.lmd['tss']['is_timeseries']:
                model = adapter(self.transaction.model_backend.predictor,
                                fit_params=fit_params)
                nc = nc_class(model, nc_function)

                X = deepcopy(self.transaction.input_data.train_df)
                y = X.pop(target)

                if is_classification:
                    self.transaction.hmd['icp'][target] = icp_class(
                        nc, smoothing=False)
                else:
                    self.transaction.hmd['icp'][target] = icp_class(nc)
                    self.transaction.lmd['stats_v2']['train_std_dev'][
                        target] = self.transaction.input_data.train_df[
                            target].std()

                X = clean_df(X, self.transaction.lmd['stats_v2'],
                             output_columns)
                self.transaction.hmd['icp'][target].fit(X.values, y.values)
                self.transaction.hmd['icp']['active'] = True

                # calibrate conformal estimator on test set
                X = deepcopy(self.transaction.input_data.validation_df)
                y = X.pop(target).values

                if is_classification:
                    if isinstance(enc.categories_[0][0], str):
                        cats = enc.categories_[0].tolist()
                        y = np.array([cats.index(i) for i in y])
                    y = y.astype(int)

                X = clean_df(X, self.transaction.lmd['stats_v2'],
                             output_columns)
                self.transaction.hmd['icp'][target].calibrate(X.values, y)
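
For orientation, the fit/calibrate split above is the standard inductive conformal prediction (ICP) workflow from the nonconformist library: fit the underlying model on a proper training set, then calibrate the nonconformity scores on held-out data. A self-contained regression sketch with a plain scikit-learn model (the synthetic data and model choice are assumptions for illustration, not what this codebase uses):

import numpy as np
from sklearn.linear_model import LinearRegression
from nonconformist.base import RegressorAdapter
from nonconformist.nc import RegressorNc, AbsErrorErrFunc
from nonconformist.icp import IcpRegressor

rng = np.random.RandomState(0)
X = rng.rand(300, 2)
y = 3 * X[:, 0] + 0.1 * rng.randn(300)

icp = IcpRegressor(RegressorNc(RegressorAdapter(LinearRegression()),
                               AbsErrorErrFunc()))
icp.fit(X[:200], y[:200])        # proper training set
icp.calibrate(X[200:], y[200:])  # calibration set
print(icp.predict(X[:5], significance=0.1))  # 90% prediction intervals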
Example no. 5
    def _getDataIo(self, file):
        """
        This gets a file either url or local file and defiens what the format is as well as dialect
        :param file: file path or url
        :return: data_io, format, dialect
        """

        ############
        # get file as io object
        ############

        data = BytesIO()

        # load the data into memory, from either a URL or a local file
        if file.startswith('http:') or file.startswith('https:'):
            r = requests.get(file, stream=True)
            if r.status_code == 200:
                for chunk in r:
                    data.write(chunk)
            data.seek(0)

        # else read file from local file system
        else:
            try:
                data = open(file, 'rb')
            except Exception as e:
                error = 'Could not load file, possible exception : {exception}'.format(
                    exception=e)
                log.error(error)
                raise ValueError(error)

        dialect = None

        ############
        # check for file type
        ############

        # try to guess whether it's an Excel file
        xlsx_sig = b'\x50\x4B\x05\x06'   # ZIP end-of-central-directory record
        xlsx_sig2 = b'\x50\x4B\x03\x04'  # ZIP local file header (unused below)
        xls_sig = b'\x09\x08\x10\x00\x00\x06\x05\x00'  # legacy .xls marker at offset 512

        # different whence, offset, size for different types
        excel_meta = [('xls', 0, 512, 8), ('xlsx', 2, -22, 4)]

        for file_format, whence, offset, size in excel_meta:

            try:
                data.seek(offset, whence)  # seek to the signature position
                sig_bytes = data.read(size)  # capture the signature bytes
                data.seek(0)

                if sig_bytes == xls_sig:
                    return data, 'xls', dialect
                elif sig_bytes == xlsx_sig:
                    return data, 'xlsx', dialect

            except Exception:
                data.seek(0)

        # if it's not Excel, it may be JSON or CSV; convert from bytes to StringIO

        byte_str = data.read()
        # Move it to StringIO
        try:
            # Handle Microsoft's BOM "special" UTF-8 encoding
            if byte_str.startswith(codecs.BOM_UTF8):
                data = StringIO(byte_str.decode('utf-8-sig'))
            else:
                data = StringIO(byte_str.decode('utf-8'))

        except Exception:
            log.error(traceback.format_exc())
            log.error('Could not load into string')

        # see if its JSON
        buffer = data.read(100)
        data.seek(0)
        text = buffer.strip()
        # analyze the first characters
        if len(text) > 0:
            # if it looks like JSON, try to parse it
            if text.startswith('{') or text.startswith('['):
                try:
                    json.loads(data.read())
                    data.seek(0)
                    return data, 'json', dialect
                except ValueError:
                    data.seek(0)
                    return data, None, dialect

        # let's try to figure out whether it's a CSV
        try:
            data.seek(0)
            first_few_lines = []
            i = 0
            for line in data:
                if line in ['\r\n', '\n']:
                    continue
                first_few_lines.append(line)
                i += 1
                if i >= 5:  # a few non-empty lines are enough for sniffing
                    break

            accepted_delimiters = [',', '\t', ';']
            dialect = csv.Sniffer().sniff(''.join(first_few_lines),
                                          delimiters=accepted_delimiters)
            data.seek(0)
            # if csv dialect identified then return csv
            if dialect:
                return data, 'csv', dialect
            else:
                return data, None, dialect
        except Exception:
            data.seek(0)
            log.error('Could not detect format for this file')
            log.error(traceback.format_exc())
            # No file type identified
            return data, None, dialect
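
One detail worth calling out: the xlsx branch above works because .xlsx files are ZIP archives, whose end-of-central-directory record occupies the last 22 bytes (absent a ZIP comment) and starts with the signature PK\x05\x06; that is exactly what the seek with whence=2 and offset -22 reads. A standalone sketch of just that check (the file name in the usage line is hypothetical):

from io import BytesIO

def looks_like_xlsx(data: BytesIO) -> bool:
    """Check for the ZIP end-of-central-directory signature near the end."""
    data.seek(-22, 2)  # 22 bytes back from the end of the stream
    sig = data.read(4)
    data.seek(0)
    return sig == b'\x50\x4B\x05\x06'

with open('book.xlsx', 'rb') as f:  # hypothetical file
    print(looks_like_xlsx(BytesIO(f.read())))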