Exemplo n.º 1
0
    def generate_header(self, input_data, input_data_feature):
        """Populate header, sid name and label name, then attach the schema.

        The "header" and "sid" metadata entries are looked up for
        ``input_data``. Without header metadata, feature names are generated
        as ``fid0``, ``fid1``, ... using the shape of ``input_data_feature``,
        and the defaults "sid" and (when ``self.with_label`` is set) "label"
        are used. With header metadata, the header string is split on
        ``self.delimitor``; when ``self.with_label`` is set the column at
        ``self.label_idx`` is taken out of the header and stored as the label
        name.

        Returns:
            The schema produced by ``make_schema``; it is also set on
            ``input_data_feature`` through ``set_schema``.
        """
        header = storage.get_data_table_meta_by_instance("header", input_data)
        sid_name = storage.get_data_table_meta_by_instance("sid", input_data)
        LOGGER.debug("header is {}".format(header))
        LOGGER.debug("sid_name is {}".format(sid_name))

        if header:
            # Stored sid name wins; fall back to the default otherwise.
            self.sid_name = sid_name if sid_name else "sid"

            columns = header.split(self.delimitor, -1)
            if self.with_label:
                # Everything except the label column is a feature name.
                self.header = columns[:self.label_idx] + \
                    columns[self.label_idx + 1:]
                self.label_name = columns[self.label_idx]
            else:
                self.header = columns
        else:
            # No stored header: synthesise positional feature names.
            feature_shape = data_overview.get_data_shape(input_data_feature)
            self.header = ["fid" + str(i) for i in range(feature_shape)]
            self.sid_name = "sid"

            if self.with_label:
                self.label_name = "label"

        schema = make_schema(self.header, self.sid_name, self.label_name)
        set_schema(input_data_feature, schema)

        return schema
Exemplo n.º 2
0
    def read_data(self, input_data, mode="fit"):
        """Split dense rows into features/labels and turn them into instances.

        Each value of ``input_data`` is a string of columns separated by
        ``self.delimitor``. When ``self.with_label`` is set, the column at
        ``self.label_idx`` becomes the label and the remaining columns the
        features; otherwise every column is a feature and there are no labels.

        Args:
            input_data: table whose values are delimited strings; must support
                ``mapValues``.
            mode: ``"fit"`` calls ``self.fit``; any other value calls
                ``self.transform``.

        Returns:
            Whatever ``self.fit`` / ``self.transform`` returns.

        Raises:
            ValueError: if labels are expected but the data shape is empty or
                ``self.label_idx`` is not smaller than the data shape.
        """
        LOGGER.info("start to read dense data and change data to instance")

        abnormal_detection.empty_table_detection(input_data)

        input_data_labels = None

        self.generate_header(input_data, mode=mode)

        # Bind plain values so the row functions below do not close over
        # ``self`` (only a string, an int and a bool travel with them).
        delimitor = self.delimitor

        if self.with_label:
            label_idx = self.label_idx
            data_shape = data_overview.get_data_shape(input_data)
            if not data_shape or label_idx >= data_shape:
                raise ValueError(
                    "input data's value is empty, it does not contain a label")

            # A single column means the row holds nothing but the label.
            label_only = data_shape == 1

            def extract_features(value):
                if label_only:
                    return []
                # Split once per row and drop the label column.
                columns = value.split(delimitor, -1)
                return columns[:label_idx] + columns[label_idx + 1:]

            def extract_label(value):
                return value.split(delimitor, -1)[label_idx]

            input_data_features = input_data.mapValues(extract_features)
            input_data_labels = input_data.mapValues(extract_label)

        else:

            def extract_features(value):
                # Empty values yield no features rather than [""].
                return value.split(delimitor, -1) if value else []

            input_data_features = input_data.mapValues(extract_features)

        if mode == "fit":
            data_instance = self.fit(input_data, input_data_features,
                                     input_data_labels)
        else:
            data_instance = self.transform(input_data_features,
                                           input_data_labels)

        return data_instance
Exemplo n.º 3
0
    def read_data(self, table_name, namespace, mode="fit"):
        """Load a dense table, split it into features/labels, build instances.

        The table is fetched with ``storage.get_data_table``. Each value is a
        string of columns separated by ``self.delimitor``. When
        ``self.with_label`` is set, the column at ``self.label_idx`` becomes
        the label and the remaining columns the features; otherwise every
        column is a feature and there are no labels.

        Args:
            table_name: name of the table to read.
            namespace: namespace of the table to read.
            mode: ``"fit"`` calls ``self.fit``; any other value calls
                ``self.transform``.

        Returns:
            The result of ``self.fit`` / ``self.transform``, after
            ``set_schema`` has been applied to it with ``self.header``.

        Raises:
            ValueError: if labels are expected and ``self.label_idx`` is not
                an integer, the data shape is empty, or ``self.label_idx`` is
                not smaller than the data shape.
        """
        input_data = storage.get_data_table(table_name, namespace)
        LOGGER.info("start to read dense data and change data to instance")

        abnormal_detection.empty_table_detection(input_data)

        input_data_labels = None

        # Bind plain values so the row functions below do not close over
        # ``self`` (only a string, an int and a bool travel with them).
        delimitor = self.delimitor

        if self.with_label:
            label_idx = self.label_idx
            # Real type check instead of comparing the class name string;
            # bool is excluded explicitly because it is a subclass of int.
            if isinstance(label_idx, bool) or not isinstance(label_idx, int):
                raise ValueError("label index should be integer")

            data_shape = data_overview.get_data_shape(input_data)
            if not data_shape or label_idx >= data_shape:
                raise ValueError("input data's value is empty, it does not contain a label")

            # A single column means the row holds nothing but the label.
            label_only = data_shape == 1

            def extract_features(value):
                if label_only:
                    return []
                # Split once per row and drop the label column.
                columns = value.split(delimitor, -1)
                return columns[:label_idx] + columns[label_idx + 1:]

            def extract_label(value):
                return value.split(delimitor, -1)[label_idx]

            input_data_features = input_data.mapValues(extract_features)
            input_data_labels = input_data.mapValues(extract_label)

        else:

            def extract_features(value):
                # Empty values yield no features rather than [""].
                return value.split(delimitor, -1) if value else []

            input_data_features = input_data.mapValues(extract_features)

        if mode == "fit":
            data_instance = self.fit(input_data_features, input_data_labels, table_name, namespace)
        else:
            data_instance = self.transform(input_data_features, input_data_labels)

        set_schema(data_instance, self.header)

        return data_instance
Exemplo n.º 4
0
    def generate_header(self, input_data_feature, table_name, namespace):
        """Set ``self.header`` to the list of feature names for a table.

        The "header" metadata of the table is looked up through
        ``storage.get_data_table_meta``. When none is stored, names are
        generated as ``fid0``, ``fid1``, ... using the shape of
        ``input_data_feature``. Otherwise the stored string is split on
        ``self.delimitor`` and, when ``self.with_label`` is set, the column at
        ``self.label_idx`` is removed.

        ``self.header`` is always a list afterwards. Previously a stored
        header was left as the raw delimited string when ``self.with_label``
        was false, unlike every other path.
        """
        self.header = storage.get_data_table_meta("header", table_name, namespace)

        if not self.header:
            # No stored header: synthesise positional feature names.
            feature_shape = data_overview.get_data_shape(input_data_feature)
            self.header = ["fid" + str(i) for i in range(feature_shape)]
        else:
            columns = self.header.split(self.delimitor, -1)
            if self.with_label:
                # Drop the label column; only feature names stay in the header.
                self.header = columns[:self.label_idx] + \
                    columns[self.label_idx + 1:]
            else:
                self.header = columns
Exemplo n.º 5
0
    def __fit_replace(self,
                      data,
                      replace_method,
                      replace_value=None,
                      output_format=None,
                      quantile=None):
        """Replace missing values in ``data`` and report the values used.

        Two strategies exist:

        * ``replace_method`` is set and is not ``consts.DESIGNATED``: the
          per-column values come from ``__get_cols_transform_value``.
        * otherwise: the caller-supplied ``replace_value`` is used for every
          column.

        In both cases the ``*_format`` replacer variant is used when
        ``output_format`` is not None.

        Returns:
            A ``(transform_data, values)`` tuple. ``values`` is the result of
            ``__get_cols_transform_value`` in the first strategy, and a list
            repeating ``replace_value`` once per column (per
            ``data_overview.get_data_shape``) in the second.

        Raises:
            ValueError: if the second strategy applies and ``replace_value``
                is None.
        """
        by_column_value = (replace_method is not None
                           and replace_method != consts.DESIGNATED)

        if not by_column_value:
            if replace_value is None:
                raise ValueError("Replace value should not be None")

            # Pick the replacer first, then assemble its bound arguments.
            if output_format is None:
                replacer = Imputer.__replace_missing_value_with_replace_value
                extra_kwargs = {}
            else:
                replacer = Imputer.__replace_missing_value_with_replace_value_format
                extra_kwargs = {"output_format": output_format}

            bound_replacer = functools.partial(
                replacer,
                replace_value=replace_value,
                missing_value_list=self.missing_value_list,
                **extra_kwargs)
            transform_data = data.mapValues(bound_replacer)
            LOGGER.info(
                "finish replace missing value with replace value {}, replace method is:{}"
                .format(replace_value, replace_method))

            # Report one replacement value per column.
            shape = data_overview.get_data_shape(data)
            return transform_data, [replace_value for _ in range(shape)]

        cols_transform_value = self.__get_cols_transform_value(
            data, replace_method, quantile=quantile)

        if output_format is None:
            replacer = Imputer.__replace_missing_value_with_cols_transform_value
            extra_kwargs = {}
        else:
            replacer = Imputer.__replace_missing_value_with_cols_transform_value_format
            extra_kwargs = {"output_format": output_format}

        bound_replacer = functools.partial(
            replacer,
            transform_list=cols_transform_value,
            missing_value_list=self.missing_value_list,
            **extra_kwargs)
        transform_data = data.mapValues(bound_replacer)
        LOGGER.info(
            "finish replace missing value with cols transform value, replace method is {}"
            .format(replace_method))
        return transform_data, cols_transform_value
Exemplo n.º 6
0
    def read_data(self, input_data, mode="fit"):
        """Turn a sparse input table into instances and attach its schema.

        Args:
            input_data: the table to convert.
            mode: ``"fit"`` routes to ``self.fit``; any other value routes to
                ``self.transform``.

        Returns:
            The result of ``self.fit`` / ``self.transform``, with the schema
            built from ``self.header``, ``self.sid_name`` and
            ``self.label_name`` set on it.

        Raises:
            ValueError: if ``data_overview.get_data_shape`` reports an empty
                shape for ``input_data``.
        """
        LOGGER.info("start to read sparse data and change data to instance")

        abnormal_detection.empty_table_detection(input_data)

        data_shape = data_overview.get_data_shape(input_data)
        if not data_shape:
            raise ValueError("input data's value is empty, it does not contain a label")

        # Both handlers take the raw table; only the choice depends on mode.
        handler = self.fit if mode == "fit" else self.transform
        data_instance = handler(input_data)

        set_schema(data_instance,
                   make_schema(self.header, self.sid_name, self.label_name))
        return data_instance