def empty_feature_detection(data_instances):
    """Raise when ``data_instances`` is reported as having no features.

    The emptiness decision is delegated to ``data_overview.is_empty_feature``.

    Args:
        data_instances: table to inspect; ``get_name()`` and
            ``get_namespace()`` are called on it to build the error message.

    Raises:
        ValueError: if the table is reported as having zero features.
    """
    if not data_overview.is_empty_feature(data_instances):
        return

    name = data_instances.get_name()
    space = data_instances.get_namespace()
    message = (
        "Number of features of DTable is 0., table_name: {}, namespace: {}"
        .format(name, space))
    raise ValueError(message)
def fit(self, data):
    """Union every table in ``data`` into one table and report row counts.

    Tables are visited in the iteration order of ``data``. For each table
    the row count is recorded as a metric and as a summary entry. A table
    with zero rows is counted and skipped; if it is seen before any other
    table it becomes the starting combined table (together with its
    schema). Each non-empty table is checked for kind (DataInstance or
    not), optionally for schema content, and for id / label name / header
    agreement with the combined table before being unioned in with
    ``self._keep_first`` as the conflict resolver.

    When ``self.keep_duplicate`` is truthy, ids present in both the
    combined table and the incoming table are stored in
    ``self.repeated_ids``, ``self.key`` is set to the incoming table name
    and the incoming table is passed through ``self._renew_id`` before the
    union.

    Args:
        data: dict mapping a table name to a table object.

    Returns:
        The combined table, or ``None`` when ``data`` holds no tables.

    Raises:
        ValueError: if ``data`` is not a dict, or if DataInstance and
            non-DataInstance tables are mixed.
    """
    LOGGER.debug(f"fit receives data is {data}")
    if not isinstance(data, dict):
        raise ValueError(
            "Union module must receive more than one table as input.")

    empty_count = 0
    combined_table = None
    combined_schema = None
    metrics = []
    # Tracks whether self.is_data_instance has been set during THIS call.
    # ``combined_table is None`` cannot be used for that: a leading empty
    # table already occupies combined_table, which would leave the flag at
    # its previous/initial value and make the mismatch check below fail for
    # perfectly valid input.
    kind_established = False

    for (key, local_table) in data.items():
        LOGGER.debug("table to combine name: {}".format(key))
        num_data = local_table.count()
        LOGGER.debug("table count: {}".format(num_data))
        metrics.append(Metric(key, num_data))
        self.add_summary(key, num_data)

        if num_data == 0:
            LOGGER.warning("Table {} is empty.".format(key))
            if combined_table is None:
                # Keep the first empty table so a result (and schema) exists
                # even if nothing else follows.
                combined_table = local_table
                combined_schema = local_table.schema
            empty_count += 1
            continue

        local_is_data_instance = self.check_is_data_instance(local_table)
        if not kind_established:
            # The first non-empty table decides the kind for the whole union.
            self.is_data_instance = local_is_data_instance
            kind_established = True
        LOGGER.debug(f"self.is_data_instance is {self.is_data_instance}, "
                     f"local_is_data_instance is {local_is_data_instance}")
        if self.is_data_instance != local_is_data_instance:
            raise ValueError(
                "Cannot combine DataInstance and non-DataInstance object. Union aborted."
            )

        if self.is_data_instance:
            self.is_empty_feature = data_overview.is_empty_feature(
                local_table)
            if self.is_empty_feature:
                LOGGER.warning("Table {} has empty feature.".format(key))
            else:
                self.check_schema_content(local_table.schema)

        if combined_table is None:
            # first table to combine
            combined_table = local_table
            combined_schema = local_table.schema
        else:
            self.check_id(local_table, combined_table)
            self.check_label_name(local_table, combined_table)
            self.check_header(local_table, combined_table)
            if self.keep_duplicate:
                # The join value is irrelevant; only the shared keys matter.
                repeated_ids = combined_table.join(local_table,
                                                   lambda v1, v2: 1)
                self.repeated_ids = [v[0] for v in repeated_ids.collect()]
                self.key = key
                local_table = local_table.flatMap(self._renew_id)
            combined_table = combined_table.union(local_table,
                                                  self._keep_first)

        # Re-attach the schema after every step; the union result is a new
        # table object.
        combined_table.schema = combined_schema

        # only check feature length if not empty
        if self.is_data_instance and not self.is_empty_feature:
            self.feature_count = len(combined_schema.get("header"))
            LOGGER.debug("feature count: {}".format(self.feature_count))
            # NOTE(review): the mapped table is discarded, so this validates
            # rows only if mapValues evaluates eagerly - confirm against the
            # table backend in use.
            combined_table.mapValues(self.check_feature_length)

    if combined_table is None:
        num_data = 0
        LOGGER.warning(
            "All tables provided are empty or have empty features.")
    else:
        num_data = combined_table.count()

    metrics.append(Metric("Total", num_data))
    self.add_summary("Total", num_data)
    self.callback_metric(metric_name=self.metric_name,
                         metric_namespace=self.metric_namespace,
                         metric_data=metrics)
    self.tracker.set_metric_meta(metric_namespace=self.metric_namespace,
                                 metric_name=self.metric_name,
                                 metric_meta=MetricMeta(
                                     name=self.metric_name,
                                     metric_type=self.metric_type))
    LOGGER.info(
        "Union operation finished. Total {} empty tables encountered.".
        format(empty_count))
    return combined_table
def empty_feature_detection(data_instances):
    """Raise when ``data_instances`` is reported as having no features.

    The emptiness decision is delegated to ``data_overview.is_empty_feature``.

    Args:
        data_instances: table to inspect; it is interpolated into the error
            message as-is.

    Raises:
        ValueError: if the table is reported as having zero features.
    """
    if not data_overview.is_empty_feature(data_instances):
        return

    message = f"Number of features of Table is 0: {data_instances}"
    raise ValueError(message)
def fit(self, data):
    """Union the non-empty tables in ``data`` and report row-count metrics.

    Tables are visited in the iteration order of ``data``. Every table's row
    count is recorded as a metric; tables with zero rows are counted and
    skipped. The first non-empty table seeds the result (and, for
    DataInstance data, the reference schema). Each later table has its
    schema id, label name and header checked against the reference schema
    and is then unioned in with ``self._keep_first`` as the conflict
    resolver.

    Args:
        data: dict mapping a table name to a table object.

    Returns:
        The combined table, or ``None`` if no non-empty table was provided.

    Raises:
        ValueError: if ``data`` is not a dict.
    """
    if not isinstance(data, dict):
        raise ValueError(
            "Union module must receive more than one table as input.")

    skipped_empty = 0
    merged = None
    merged_schema = None
    row_metrics = []

    for name, table in data.items():
        LOGGER.debug("table to combine name: {}".format(name))
        row_total = table.count()
        LOGGER.debug("table count: {}".format(row_total))
        schema = table.schema
        row_metrics.append(Metric(name, row_total))

        if row_total == 0:
            LOGGER.warning("Table {} is empty.".format(name))
            skipped_empty += 1
            continue

        is_first = merged is None
        if is_first:
            # Only the first non-empty table is inspected for its kind.
            self.check_is_data_instance(table)

        if self.is_data_instance:
            self.is_empty_feature = data_overview.is_empty_feature(table)
            if self.is_empty_feature:
                LOGGER.warning("Table {} has empty feature.".format(name))

        if not is_first:
            self.check_schema_id(schema, merged_schema)
            self.check_schema_label_name(schema, merged_schema)
            self.check_schema_header(schema, merged_schema)
            merged = merged.union(table, self._keep_first)
        else:
            # first table to combine
            merged = table
            if self.is_data_instance:
                merged_schema = table.schema
                merged.schema = merged_schema

        # only check feature length if not empty
        if self.is_data_instance and not self.is_empty_feature:
            header = merged_schema.get("header")
            self.feature_count = len(header)
            LOGGER.debug("feature count: {}".format(self.feature_count))
            merged.mapValues(self.check_feature_length)

    if merged is not None:
        total_rows = merged.count()
    else:
        total_rows = 0
        LOGGER.warning(
            "All tables provided are empty or have empty features.")

    row_metrics.append(Metric("Total", total_rows))
    self.callback_metric(metric_name=self.metric_name,
                         metric_namespace=self.metric_namespace,
                         metric_data=row_metrics)

    meta = MetricMeta(name=self.metric_name, metric_type=self.metric_type)
    self.tracker.set_metric_meta(metric_namespace=self.metric_namespace,
                                 metric_name=self.metric_name,
                                 metric_meta=meta)

    LOGGER.info(
        "Union operation finished. Total {} empty tables encountered.".
        format(skipped_empty))
    return merged