Пример #1
0
    def approxi_quantile(data_instances, params, cols_dict, abnormal_list,
                         header, is_sparse):
        """
        Build one compressed quantile summary per selected column.

        Parameters
        ----------
        data_instances : DTable
            The input data. It is handed as-is to
            ``QuantileBinning.insert_datas``.

        params : FeatureBinningParam object
            User-set parameters. Only ``compress_thres``, ``head_size`` and
            ``error`` are read here.

        cols_dict : dict
            Maps each selected column name to its column index. Only the
            names are used in this function; the mapping itself is forwarded
            to ``QuantileBinning.insert_datas``.

        abnormal_list : list
            Forwarded to every summary under the ``'abnormal_list'`` key.
            Presumably the entries that should be skipped while collecting
            statistics -- confirm against the summary implementation.

        header : list
            Header information, forwarded to ``QuantileBinning.insert_datas``.

        is_sparse : bool
            Whether ``data_instances`` is in sparse format. Selects the
            summary flavour created by ``quantile_summary_factory``.

        Returns
        -------
        summary_dict : dict
            {'col_name1': summary1,
             'col_name2': summary2,
             ...
             }
            Every summary has had ``compress()`` called on it.

        """

        # One shared settings dict; each column still gets its own summary.
        factory_kwargs = {
            'compress_thres': params.compress_thres,
            'head_size': params.head_size,
            'error': params.error,
            'abnormal_list': abnormal_list
        }

        summary_dict = {
            name: quantile_summary_factory(is_sparse=is_sparse,
                                           param_dict=factory_kwargs)
            for name in cols_dict
        }

        # Feed the data into the freshly created summaries.
        QuantileBinning.insert_datas(data_instances, summary_dict, cols_dict,
                                     header, is_sparse)

        for column_summary in summary_dict.values():
            column_summary.compress()

        return summary_dict
Пример #2
0
    def feature_summary(data_iter, params, cols_dict, abnormal_list, header,
                        is_sparse):
        """
        Build compressed quantile summaries from an iterator of rows.

        Parameters
        ----------
        data_iter : iterable
            Yields ``(key, instance)`` pairs. The key is ignored.
            In dense mode, ``instance`` is either an object whose class is
            named ``Instance`` (its ``features`` attribute is indexed) or a
            plain indexable sequence of feature values.
            In sparse mode, ``instance.features.get_all_data()`` must yield
            ``(col_idx, col_value)`` pairs.

        params : FeatureBinningParam object
            User-set parameters. Only ``compress_thres``, ``head_size`` and
            ``error`` are read here.

        cols_dict : dict
            Maps each selected column name to its column index.

        abnormal_list : list
            Forwarded to every summary under the ``'abnormal_list'`` key.
            Presumably the entries to skip while collecting statistics --
            confirm against the summary implementation.

        header : list
            Column names by index. Only consulted in sparse mode, to turn a
            column index into a column name.

        is_sparse : bool
            Whether the instances are in sparse format.

        Returns
        -------
        list
            ``[(col_name, summary), ...]`` in ``cols_dict`` iteration order.
            Every summary has had ``compress()`` called on it.
        """
        summary_param = {
            'compress_thres': params.compress_thres,
            'head_size': params.head_size,
            'error': params.error,
            'abnormal_list': abnormal_list
        }

        # Each column gets its own summary object; they share the settings.
        summary_dict = {
            col_name: quantile_summary_factory(is_sparse=is_sparse,
                                               param_dict=summary_param)
            for col_name in cols_dict
        }

        for _, instant in data_iter:
            if is_sparse:
                for col_idx, col_value in instant.features.get_all_data():
                    col_name = header[col_idx]
                    # Columns outside the selection are not summarised.
                    if col_name in cols_dict:
                        summary_dict[col_name].insert(col_value)
            else:
                # Matched by class name, as in the original code, so that
                # raw feature sequences are accepted as well.
                if type(instant).__name__ == 'Instance':
                    features = instant.features
                else:
                    features = instant
                for col_name, summary in summary_dict.items():
                    summary.insert(features[cols_dict[col_name]])

        for summary_obj in summary_dict.values():
            summary_obj.compress()

        return list(summary_dict.items())