def generate_header(self, input_data, input_data_feature):
    """Populate header, sid name and label name, then attach the schema.

    The "header" and "sid" metadata are read from ``input_data``.  When no
    header metadata exists, feature names ``fid0 .. fidN-1`` are generated
    from the shape of ``input_data_feature`` and the default names ``"sid"``
    and ``"label"`` are used.  Otherwise the header string is split on
    ``self.delimitor`` and, when ``self.with_label`` is set, the column at
    ``self.label_idx`` becomes ``self.label_name`` and is removed from the
    feature header.

    Returns:
        The schema produced by ``make_schema``; it is also set on
        ``input_data_feature`` via ``set_schema``.

    Raises:
        IndexError: if ``self.label_idx`` is outside the header columns.
    """
    header = storage.get_data_table_meta_by_instance("header", input_data)
    sid_name = storage.get_data_table_meta_by_instance("sid", input_data)
    LOGGER.debug("header is {}".format(header))
    LOGGER.debug("sid_name is {}".format(sid_name))

    if not header:
        feature_shape = data_overview.get_data_shape(input_data_feature)
        self.header = ["fid" + str(i) for i in range(feature_shape)]
        self.sid_name = "sid"
        if self.with_label:
            self.label_name = "label"
    else:
        self.sid_name = sid_name if sid_name else "sid"
        # Split once instead of once per slice.
        columns = header.split(self.delimitor, -1)
        if self.with_label:
            # pop() removes exactly the element that indexing selects.  The
            # previous ``cols[:idx] + cols[idx + 1:]`` form kept the label
            # column (and duplicated the others) when label_idx was -1,
            # because ``idx + 1`` became 0.
            self.label_name = columns.pop(self.label_idx)
        self.header = columns

    # NOTE(review): self.label_name is only assigned here when with_label is
    # set; presumably it is initialised elsewhere -- confirm.
    schema = make_schema(self.header, self.sid_name, self.label_name)
    set_schema(input_data_feature, schema)
    return schema
def read_data(self, input_data, mode="fit"):
    """Split dense string records into features (and labels) and build instances.

    Every value of ``input_data`` is split on ``self.delimitor``.  When
    ``self.with_label`` is set, the element at ``self.label_idx`` is the
    label and the remaining elements are the features; otherwise the whole
    split value is the feature list (an empty value yields ``[]``).

    Args:
        input_data: table whose values are delimiter-separated strings.
        mode: ``"fit"`` calls ``self.fit``; anything else calls
            ``self.transform``.

    Returns:
        Whatever ``self.fit`` / ``self.transform`` returns.

    Raises:
        ValueError: if labels are expected but the data shape is empty or
            ``self.label_idx`` is not smaller than the data shape.
    """
    LOGGER.info("start to read dense data and change data to instance")
    abnormal_detection.empty_table_detection(input_data)

    input_data_labels = None
    self.generate_header(input_data, mode=mode)

    # Bound once so the mapped functions work on plain local values.
    delimitor = self.delimitor

    if self.with_label:
        label_idx = self.label_idx
        data_shape = data_overview.get_data_shape(input_data)
        if not data_shape or label_idx >= data_shape:
            raise ValueError(
                "input data's value is empty, it does not contain a label")

        def split_features(value):
            if data_shape == 1:
                return []
            columns = value.split(delimitor, -1)
            # Remove exactly the element the label pick selects.  The old
            # ``cols[:idx] + cols[idx + 1:]`` form leaked the label into the
            # features for label_idx == -1 (``idx + 1`` is 0 there).  An
            # out-of-range index leaves the record untouched, as before.
            if -len(columns) <= label_idx < len(columns):
                del columns[label_idx]
            return columns

        def split_label(value):
            return value.split(delimitor, -1)[label_idx]

        input_data_features = input_data.mapValues(split_features)
        input_data_labels = input_data.mapValues(split_label)
    else:
        input_data_features = input_data.mapValues(
            lambda value: value.split(delimitor, -1) if value else [])

    if mode == "fit":
        data_instance = self.fit(input_data, input_data_features,
                                 input_data_labels)
    else:
        data_instance = self.transform(input_data_features, input_data_labels)

    return data_instance
def read_data(self, table_name, namespace, mode="fit"):
    """Load a dense table, split it into features/labels and build instances.

    The table is fetched with ``storage.get_data_table``.  Every value is
    split on ``self.delimitor``; when ``self.with_label`` is set, the element
    at ``self.label_idx`` is the label and the remaining elements are the
    features, otherwise the whole split value is the feature list (an empty
    value yields ``[]``).  ``self.header`` is set as the schema of the
    result.

    Args:
        table_name: name passed to ``storage.get_data_table``.
        namespace: namespace passed to ``storage.get_data_table``.
        mode: ``"fit"`` calls ``self.fit``; anything else calls
            ``self.transform``.

    Returns:
        Whatever ``self.fit`` / ``self.transform`` returns.

    Raises:
        ValueError: if labels are expected and ``self.label_idx`` is not an
            integer, the data shape is empty, or ``self.label_idx`` is not
            smaller than the data shape.
    """
    input_data = storage.get_data_table(table_name, namespace)
    LOGGER.info("start to read dense data and change data to instance")
    abnormal_detection.empty_table_detection(input_data)

    input_data_labels = None
    # Bound once so the mapped functions work on plain local values.
    delimitor = self.delimitor

    if self.with_label:
        label_idx = self.label_idx
        # isinstance instead of comparing the type's name; bool is excluded
        # explicitly because it was rejected before as well.
        if not isinstance(label_idx, int) or isinstance(label_idx, bool):
            raise ValueError("label index should be integer")

        data_shape = data_overview.get_data_shape(input_data)
        if not data_shape or label_idx >= data_shape:
            raise ValueError("input data's value is empty, it does not contain a label")

        def split_features(value):
            if data_shape == 1:
                return []
            columns = value.split(delimitor, -1)
            # Remove exactly the element the label pick selects.  The old
            # ``cols[:idx] + cols[idx + 1:]`` form leaked the label into the
            # features for label_idx == -1 (``idx + 1`` is 0 there).  An
            # out-of-range index leaves the record untouched, as before.
            if -len(columns) <= label_idx < len(columns):
                del columns[label_idx]
            return columns

        def split_label(value):
            return value.split(delimitor, -1)[label_idx]

        input_data_features = input_data.mapValues(split_features)
        input_data_labels = input_data.mapValues(split_label)
    else:
        input_data_features = input_data.mapValues(
            lambda value: value.split(delimitor, -1) if value else [])

    if mode == "fit":
        data_instance = self.fit(input_data_features, input_data_labels, table_name, namespace)
    else:
        data_instance = self.transform(input_data_features, input_data_labels)

    set_schema(data_instance, self.header)
    return data_instance
def generate_header(self, input_data_feature, table_name, namespace):
    """Set ``self.header`` to the list of feature column names.

    The "header" metadata of the table is read through
    ``storage.get_data_table_meta``.  When it is missing or empty, names
    ``fid0 .. fidN-1`` are generated from the shape of
    ``input_data_feature``.  Otherwise the header string is split on
    ``self.delimitor`` and, when ``self.with_label`` is set, the column at
    ``self.label_idx`` is dropped.

    ``self.header`` is a list on every path.  Previously a stored header was
    left as the raw delimited string when ``with_label`` was false.
    """
    header = storage.get_data_table_meta("header", table_name, namespace)

    if not header:
        feature_shape = data_overview.get_data_shape(input_data_feature)
        self.header = ["fid" + str(i) for i in range(feature_shape)]
        return

    # Split once; the result is used on both the labelled and unlabelled path.
    columns = header.split(self.delimitor, -1)
    if self.with_label:
        label_idx = self.label_idx
        # Drop the label column by position.  The old
        # ``cols[:idx] + cols[idx + 1:]`` form kept the label for
        # label_idx == -1 because ``idx + 1`` became 0.  An out-of-range
        # index leaves the header untouched, as before.
        if -len(columns) <= label_idx < len(columns):
            del columns[label_idx]
    self.header = columns
def __fit_replace(self, data, replace_method, replace_value=None, output_format=None, quantile=None):
    """Replace missing values in ``data`` and report the replacement per column.

    Two strategies exist:

    * ``replace_method`` is ``None`` or ``consts.DESIGNATED``: every missing
      value is replaced by ``replace_value``.
    * any other ``replace_method``: per-column values are obtained from
      ``self.__get_cols_transform_value`` and used as replacements.

    When ``output_format`` is given, the ``*_format`` variant of the
    replacement function is used and receives ``output_format``.

    Returns:
        A tuple ``(transform_data, values)`` where ``values`` is either the
        per-column transform values or ``replace_value`` repeated once per
        column of ``data``.

    Raises:
        ValueError: if the designated strategy is used without a
            ``replace_value``.
    """
    formatted = output_format is not None

    if replace_method is None or replace_method == consts.DESIGNATED:
        if replace_value is None:
            raise ValueError("Replace value should not be None")

        if formatted:
            replacer = functools.partial(
                Imputer.__replace_missing_value_with_replace_value_format,
                replace_value=replace_value,
                missing_value_list=self.missing_value_list,
                output_format=output_format)
        else:
            replacer = functools.partial(
                Imputer.__replace_missing_value_with_replace_value,
                replace_value=replace_value,
                missing_value_list=self.missing_value_list)

        transform_data = data.mapValues(replacer)
        LOGGER.info(
            "finish replace missing value with replace value {}, replace method is:{}"
            .format(replace_value, replace_method))

        # One copy of the designated value per column of the input.
        shape = data_overview.get_data_shape(data)
        return transform_data, [replace_value] * shape

    cols_transform_value = self.__get_cols_transform_value(
        data, replace_method, quantile=quantile)

    if formatted:
        replacer = functools.partial(
            Imputer.__replace_missing_value_with_cols_transform_value_format,
            transform_list=cols_transform_value,
            missing_value_list=self.missing_value_list,
            output_format=output_format)
    else:
        replacer = functools.partial(
            Imputer.__replace_missing_value_with_cols_transform_value,
            transform_list=cols_transform_value,
            missing_value_list=self.missing_value_list)

    transform_data = data.mapValues(replacer)
    LOGGER.info(
        "finish replace missing value with cols transform value, replace method is {}"
        .format(replace_method))
    return transform_data, cols_transform_value
def read_data(self, input_data, mode="fit"):
    """Convert sparse input data to instances and attach the schema.

    Args:
        input_data: table to convert; it must be non-empty and have a
            non-empty data shape.
        mode: ``"fit"`` calls ``self.fit``; anything else calls
            ``self.transform``.

    Returns:
        The result of ``self.fit`` / ``self.transform`` with the schema built
        from ``self.header``, ``self.sid_name`` and ``self.label_name`` set
        on it.

    Raises:
        ValueError: if ``data_overview.get_data_shape`` reports an empty
            shape.
    """
    LOGGER.info("start to read sparse data and change data to instance")
    abnormal_detection.empty_table_detection(input_data)

    data_shape = data_overview.get_data_shape(input_data)
    if not data_shape:
        raise ValueError("input data's value is empty, it does not contain a label")

    # Pick the handler first, then run it on the raw input.
    convert = self.fit if mode == "fit" else self.transform
    data_instance = convert(input_data)

    set_schema(data_instance,
               make_schema(self.header, self.sid_name, self.label_name))
    return data_instance