def get_new_schema(original_data, feature_mask):
    """Build a schema that keeps only the header entries selected by a mask.

    Args:
        original_data: Object whose ``schema`` exposes ``get`` for the keys
            ``"header"``, ``"sid_name"`` and ``"label_name"``.
        feature_mask: Mask compared elementwise with ``> 0``; positions
            where the comparison holds are kept. Presumably a numpy array
            aligned with the header -- verify against callers.

    Returns:
        The result of ``make_schema`` for the filtered header together with
        the original sid name and label name.
    """
    source_schema = original_data.schema
    full_header = source_schema.get("header")
    sid_name = source_schema.get("sid_name")
    label_name = source_schema.get("label_name")

    # np.where with a single argument returns a tuple of index arrays;
    # the first entry holds the selected positions.
    selected_positions = np.where(feature_mask > 0)[0]
    kept_header = []
    for position in selected_positions:
        kept_header.append(full_header[position])

    return make_schema(kept_header, sid_name, label_name)
def prepare_data(self, data_num, feature_num, header, sid_name, label_name):
    """Create a keyed table of synthetic instances with a schema attached.

    Row ``k`` (for ``k`` in ``range(data_num)``) is keyed by ``k`` and holds
    an ``Instance`` whose id is ``k``, whose label is ``0`` and whose
    features are a length-``feature_num`` array filled with ``k``.

    Args:
        data_num: Number of rows to generate.
        feature_num: Length of each feature vector.
        header: Header passed to ``data_io.make_schema``.
        sid_name: Sid name passed to ``data_io.make_schema``.
        label_name: Label name passed to ``data_io.make_schema``.

    Returns:
        The table produced by ``self.session.parallelize`` (3 partitions,
        keys included) with its ``schema`` attribute set.
    """
    keyed_rows = [
        (
            row_id,
            Instance(
                inst_id=row_id,
                features=row_id * np.ones(feature_num),
                label=0,
            ),
        )
        for row_id in range(data_num)
    ]
    result_table = self.session.parallelize(
        keyed_rows, include_key=True, partition=3
    )
    result_table.schema = data_io.make_schema(header, sid_name, label_name)
    return result_table
def fit(self, data):
    """Union all non-empty input tables into one table and report counts.

    Each table's entry count is recorded as a metric under its key, and a
    final ``"Total"`` metric records the size of the combined table (``0``
    when nothing was combined). Tables with zero entries are skipped and
    counted. For every subsequent non-empty table the schema id, label name
    and header are checked against the current combined schema before the
    tables are merged with ``self._keep_first``.

    Args:
        data: Mapping of table name to table, iterated with ``items()``.
            Each table must provide ``count()``, ``schema``, ``union()``
            and ``mapValues()``.

    Returns:
        The combined table, or ``None`` when ``data`` is empty or every
        table in it has zero entries.
    """
    if len(data) <= 0:
        LOGGER.warning("Union receives no data input.")
        return

    empty_count = 0
    combined_table = None
    combined_schema = None
    metrics = []

    for key, local_table in data.items():
        LOGGER.debug("table to combine name: {}".format(key))
        num_data = local_table.count()
        LOGGER.debug("table count: {}".format(num_data))
        local_schema = local_table.schema
        metrics.append(Metric(key, num_data))

        if num_data == 0:
            LOGGER.warning("Table {} has no entries.".format(key))
            empty_count += 1
            continue

        # The data-instance check is only run on the first non-empty table.
        if combined_table is None:
            self.check_is_data_instance(local_table)
        if self.is_data_instance:
            self.is_empty_feature = data_overview.is_empty_feature(
                local_table)
            if self.is_empty_feature:
                # This branch is only reached when the table has entries,
                # so report empty features rather than "no entries".
                LOGGER.warning("Table {} has empty features.".format(key))

        if combined_table is None:
            # first table to combine
            combined_table = local_table
        else:
            self.check_schema_id(local_schema, combined_schema)
            self.check_schema_label_name(local_schema, combined_schema)
            self.check_schema_header(local_schema, combined_schema)
            combined_table = combined_table.union(local_table,
                                                  self._keep_first)

        # Rebuild the combined schema from the table that was just merged.
        combined_schema = make_schema(local_schema.get("header"),
                                      local_schema.get("sid"),
                                      local_schema.get("label_name"))
        combined_table.schema = combined_schema

        # only check feature length if not empty
        if self.is_data_instance and not self.is_empty_feature:
            self.feature_count = len(combined_schema.get("header"))
            LOGGER.debug("feature count: {}".format(self.feature_count))
            # NOTE(review): the mapped result is discarded; this relies on
            # mapValues evaluating eagerly so that check_feature_length
            # actually runs -- confirm for the table backend in use.
            combined_table.mapValues(self.check_feature_length)

    if combined_table is None:
        num_data = 0
        LOGGER.warning(
            "All tables provided are empty or have empty features.")
    else:
        num_data = combined_table.count()
    metrics.append(Metric("Total", num_data))

    self.callback_metric(metric_name=self.metric_name,
                         metric_namespace=self.metric_namespace,
                         metric_data=metrics)
    self.tracker.set_metric_meta(metric_namespace=self.metric_namespace,
                                 metric_name=self.metric_name,
                                 metric_meta=MetricMeta(
                                     name=self.metric_name,
                                     metric_type=self.metric_type))

    # combined_table stays None when every input table was empty; reading
    # its schema unguarded would raise AttributeError before returning.
    if combined_table is not None:
        LOGGER.debug("after union schema: {}".format(combined_table.schema))
    LOGGER.info(
        "Union operation finished. Total {} empty tables encountered.".
        format(empty_count))
    return combined_table