def synthesis(self):
    """Compute (max - min) of a numerical destination column aggregated
    along the relation path, and attach it to the path's source table.
    """
    feature_name = "{}MaxDiffMin_{}_{}".format(
        feature_types.aggregate_processed_numerical.prefix,
        self.__path, self.__col)

    target_table = self.dataset.tables[self.__path.dst]
    column_values = target_table.df[self.__col].values

    # Per-table time metadata, keyed by position on the path; only
    # tables that actually carry a time column contribute entries.
    hour_time_by_idx = {}
    sorted_index_by_idx = {}
    for idx, table_name in enumerate(self.__path.table_names):
        table = self.dataset.tables[table_name]
        if table.has_time:
            hour_time_by_idx[idx] = table.hour_time_data
            sorted_index_by_idx[idx] = table.sorted_time_index

    # Join-key arrays and uniqueness flags, one entry per relation.
    src_ids = []
    dst_ids = []
    src_unique = []
    dst_unique = []
    for rel in self.__path.relations:
        src_ids.append(self.dataset.tables[rel.src].df[rel.src_id].values)
        dst_ids.append(self.dataset.tables[rel.dst].df[rel.dst_id].values)
        src_unique.append(rel.type.src_is_unique)
        dst_unique.append(rel.type.dst_is_unique)

    aggregated_max = _core.Aggregator().aggregate(
        column_values, hour_time_by_idx, sorted_index_by_idx,
        src_ids, dst_ids, src_unique, dst_unique, "max", "max")
    aggregated_min = _core.Aggregator().aggregate(
        column_values, hour_time_by_idx, sorted_index_by_idx,
        src_ids, dst_ids, src_unique, dst_unique, "min", "min")

    # Per-source-row range, stored as float32.
    new_data = (aggregated_max - aggregated_min).astype(np.float32)
    self.__dataset.tables[self.__path.src].set_new_data(
        new_data, feature_name)
def synthesis(self):
    """Materialize ("identify") a destination column on the path's
    source table via last-value aggregation along the relation path.

    No-op when the path is empty.
    """
    if len(self.__path) == 0:
        return

    # The feature prefix records whether the identified column is
    # categorical or numerical.
    if self.__is_cat:
        prefix = feature_types.aggregate_processed_categorical.prefix
    else:
        prefix = feature_types.aggregate_processed_numerical.prefix
    new_data_name = "{}Identify_{}_{}".format(
        prefix, self.__path, self.__col)

    dst_values = self.dataset.tables[self.__path.dst].df[self.__col].values

    # Time metadata for every timed table on the path, keyed by position.
    hour_time_by_idx = {}
    sorted_index_by_idx = {}
    for idx, table_name in enumerate(self.__path.table_names):
        table = self.dataset.tables[table_name]
        if table.has_time:
            hour_time_by_idx[idx] = table.hour_time_data
            sorted_index_by_idx[idx] = table.sorted_time_index

    # Join keys and uniqueness flags, one entry per relation.
    src_ids = []
    dst_ids = []
    src_unique = []
    dst_unique = []
    for rel in self.__path.relations:
        src_ids.append(self.dataset.tables[rel.src].df[rel.src_id].values)
        dst_ids.append(self.dataset.tables[rel.dst].df[rel.dst_id].values)
        src_unique.append(rel.type.src_is_unique)
        dst_unique.append(rel.type.dst_is_unique)

    new_data = _core.Aggregator().aggregate(
        dst_values, hour_time_by_idx, sorted_index_by_idx,
        src_ids, dst_ids, src_unique, dst_unique, "last", "last")

    # NOTE: an adversarial-AUC filter for numerical features used to run
    # here (numerical_adversarial_auc_select) but is currently disabled.
    self.__dataset.tables[self.__path.src].set_new_data(
        new_data, new_data_name)
def __recursive_synthesis(self, path):
    """Sum, along the relation path, a one-hot indicator for
    ``self.__value`` of a categorical column, attaching the result to
    the path's source table.  No-op when the path is empty.
    """
    if len(self.__path) == 0:
        return
    # NOTE(review): the feature name is built from the ``path`` argument
    # while the data flows along self.__path — confirm they coincide.
    new_data_name = "{}OneHotSum_{}_{}_{}".format(
        feature_types.aggregate_processed_numerical.prefix,
        path, self.__col, self.__value)

    dst_table = self.dataset.tables[self.__path.dst]

    # Reuse a cached CategoricalManager for this column when available.
    cache_key = ("categorical_manager", self.__col)
    if dst_table.has_cache(cache_key):
        categorical_manager = dst_table.get_cache(cache_key)
    else:
        raw_values = dst_table.df[self.__col].fillna("").astype(str).values
        categorical_manager = _core.CategoricalManager(raw_values)
        dst_table.set_cache(cache_key, categorical_manager)

    # 0/1 indicator of rows whose category equals the target value.
    indicator = categorical_manager.is_array(self.__value)

    hour_time_by_idx = {}
    sorted_index_by_idx = {}
    for idx, table_name in enumerate(self.__path.table_names):
        table = self.dataset.tables[table_name]
        if table.has_time:
            hour_time_by_idx[idx] = table.hour_time_data
            sorted_index_by_idx[idx] = table.sorted_time_index

    src_ids = []
    dst_ids = []
    src_unique = []
    dst_unique = []
    for rel in self.__path.relations:
        src_ids.append(self.dataset.tables[rel.src].df[rel.src_id].values)
        dst_ids.append(self.dataset.tables[rel.dst].df[rel.dst_id].values)
        src_unique.append(rel.type.src_is_unique)
        dst_unique.append(rel.type.dst_is_unique)

    new_data = _core.Aggregator().aggregate(
        indicator, hour_time_by_idx, sorted_index_by_idx,
        src_ids, dst_ids, src_unique, dst_unique, "sum", "sum")
    self.__dataset.tables[self.__path.src].set_new_data(
        new_data, new_data_name)
def __recursive_synthesis(self, path):
    """Count destination rows reachable along the relation path for
    each source row (sum of a vector of ones), attaching the count to
    the path's source table.  No-op when the path is empty.
    """
    if len(self.__path) == 0:
        return
    new_data_name = "{}Count_{}".format(
        feature_types.aggregate_processed_numerical.prefix, path)

    dst_table = self.dataset.tables[self.__path.dst]
    # A float32 vector of ones: summing it along the path yields counts.
    ones = np.ones(len(dst_table.df), dtype=np.float32)

    hour_time_by_idx = {}
    sorted_index_by_idx = {}
    for idx, table_name in enumerate(self.__path.table_names):
        table = self.dataset.tables[table_name]
        if table.has_time:
            hour_time_by_idx[idx] = table.hour_time_data
            sorted_index_by_idx[idx] = table.sorted_time_index

    src_ids = []
    dst_ids = []
    src_unique = []
    dst_unique = []
    for rel in self.__path.relations:
        src_ids.append(self.dataset.tables[rel.src].df[rel.src_id].values)
        dst_ids.append(self.dataset.tables[rel.dst].df[rel.dst_id].values)
        src_unique.append(rel.type.src_is_unique)
        dst_unique.append(rel.type.dst_is_unique)

    new_data = _core.Aggregator().aggregate(
        ones, hour_time_by_idx, sorted_index_by_idx,
        src_ids, dst_ids, src_unique, dst_unique, "sum", "sum")
    self.__dataset.tables[self.__path.src].set_new_data(
        new_data, new_data_name)
def __recursive_synthesis(self, path):
    """Mean, along the relation path, of a one-hot indicator for
    ``self.__value`` of a categorical column; the feature is kept only
    when its train AUC is distinguishable from random (0.5).
    No-op when the path is empty.
    """
    if len(self.__path) == 0:
        return
    # NOTE(review): the feature name is built from the ``path`` argument
    # while the data flows along self.__path — confirm they coincide.
    new_data_name = "{}OneHotMean_{}_{}_{}".format(
        feature_types.aggregate_processed_numerical.prefix,
        path, self.__col, self.__value)

    dst_table = self.dataset.tables[self.__path.dst]

    # Reuse a cached CategoricalManager for this column when available.
    cache_key = ("categorical_manager", self.__col)
    if dst_table.has_cache(cache_key):
        categorical_manager = dst_table.get_cache(cache_key)
    else:
        raw_values = dst_table.df[self.__col].fillna("").astype(str).values
        categorical_manager = _core.CategoricalManager(raw_values)
        dst_table.set_cache(cache_key, categorical_manager)

    # 0/1 indicator of rows whose category equals the target value.
    indicator = categorical_manager.is_array(self.__value)

    hour_time_by_idx = {}
    sorted_index_by_idx = {}
    for idx, table_name in enumerate(self.__path.table_names):
        table = self.dataset.tables[table_name]
        if table.has_time:
            hour_time_by_idx[idx] = table.hour_time_data
            sorted_index_by_idx[idx] = table.sorted_time_index

    src_ids = []
    dst_ids = []
    src_unique = []
    dst_unique = []
    for rel in self.__path.relations:
        src_ids.append(self.dataset.tables[rel.src].df[rel.src_id].values)
        dst_ids.append(self.dataset.tables[rel.dst].df[rel.dst_id].values)
        src_unique.append(rel.type.src_is_unique)
        dst_unique.append(rel.type.dst_is_unique)

    new_data = _core.Aggregator().aggregate(
        indicator, hour_time_by_idx, sorted_index_by_idx,
        src_ids, dst_ids, src_unique, dst_unique, "mean", "mean")

    # Feature selection: rows with a finite target are the train split.
    train_size = np.isfinite(self.__dataset.target).sum()
    finite_mask = np.isfinite(new_data[:train_size])
    train_values = new_data[:train_size][finite_mask]
    # A constant feature carries no signal.
    if len(np.unique(train_values)) <= 1:
        return
    auc = metrics.roc_auc_score(
        self.__dataset.target[:train_size][finite_mask], train_values)
    # Reject features whose AUC is indistinguishable from 0.5.
    if 0.4999 < auc < 0.5001:
        return
    self.__dataset.tables[self.__path.src].set_new_data(
        new_data, new_data_name)
def synthesis(self):
    """Target-encode a multi-categorical column along the relation path.

    The path is split into a trailing "to-one" segment (used to identify,
    per row, which destination row holds the multi-categorical value) and
    a leading "to-many" segment (over which the encoded values are
    mean-aggregated).  The result is attached to the to-many segment's
    source table.  Returns early when no pseudo target is available.
    """
    # NOTE(review): uses self.path / self.col here but self.__path /
    # self.__col elsewhere — presumably equivalent properties exist on
    # the class; confirm against the class definition.
    dst_table = self.dataset.tables[self.path.dst]
    # Build (or fetch from the table cache) the manager that tokenizes
    # the multi-categorical string column.
    if dst_table.has_cache(("multi_categorical_manager", self.__col)):
        multi_categorical_manager = dst_table.get_cache(
            ("multi_categorical_manager", self.__col))
    else:
        multi_categorical_string_values = \
            dst_table.df[self.col].fillna("").values
        multi_categorical_manager = \
            _core.MultiCategoricalManager(multi_categorical_string_values)
        dst_table.set_cache(("multi_categorical_manager", self.__col),
                            multi_categorical_manager)
    # Find the split point: path[i:] must be substantially to-one for
    # this column.  There is no break, so the last matching i (the
    # smallest, i.e. the longest qualifying to-one suffix) wins.
    to_one_path = None
    to_many_path = None
    for i in range(len(self.__path), -1, -1):
        if self.__path[i:].is_substance_to_one_with_col(
                self.__dataset, self.__col):
            to_one_path = self.__path[i:]
            to_many_path = self.__path[:i]
    # to_one identify of dst_induces
    if len(to_one_path) > 0:
        # Row indices of the destination table, pulled through the
        # to-one segment by last-value aggregation.
        dst_induces = np.arange(len(dst_table.df))
        time_for_each_table = {
            table_idx: self.dataset.tables[table_name].hour_time_data
            for table_idx, table_name in enumerate(to_one_path.table_names)
            if self.dataset.tables[table_name].has_time
        }
        sorted_index_for_each_table = {
            table_idx: self.dataset.tables[table_name].sorted_time_index
            for table_idx, table_name in enumerate(to_one_path.table_names)
            if self.dataset.tables[table_name].has_time
        }
        src_id_for_each_relation = [
            self.dataset.tables[rel.src].df[rel.src_id].values
            for rel in to_one_path.relations
        ]
        dst_id_for_each_relation = [
            self.dataset.tables[rel.dst].df[rel.dst_id].values
            for rel in to_one_path.relations
        ]
        src_is_unique_for_each_relation = [
            rel.type.src_is_unique for rel in to_one_path.relations
        ]
        dst_is_unique_for_each_relation = [
            rel.type.dst_is_unique for rel in to_one_path.relations
        ]
        dst_induces = _core.Aggregator().aggregate(
            dst_induces, time_for_each_table, sorted_index_for_each_table,
            src_id_for_each_relation, dst_id_for_each_relation,
            src_is_unique_for_each_relation,
            dst_is_unique_for_each_relation, "last", "last")
        dst_induces = dst_induces.astype(np.int32)
        # Negative values mark rows with no match.
        dst_induces[dst_induces < 0] = -1
    else:
        # Empty to-one segment: every row maps to itself.
        # NOTE(review): to_one_path is empty here, so to_one_path.dst is
        # presumably the same table as the full path's dst — confirm.
        dst_table = self.dataset.tables[to_one_path.dst]
        dst_induces = np.arange(len(dst_table.df))
        dst_induces = dst_induces.astype(np.int32)
        dst_induces[dst_induces < 0] = -1
    # target encoding
    dst_table = self.dataset.tables[to_many_path.dst]
    if not dst_table.has_pseudo_target:
        return
    if dst_table.has_time:
        # Time-aware encoding; prefers historical time data when present.
        sorted_index = dst_table.sorted_time_index
        if dst_table.has_hist_time_data:
            time_data = dst_table.hist_time_data
        else:
            time_data = dst_table.time_data
        new_data = multi_categorical_manager \
            .temporal_target_encode_with_dst_induces(
                dst_table.pseudo_target,
                dst_induces,
                time_data,
                sorted_index,
                multi_categorical_manager.unique_word_num,
                multi_categorical_manager.row_num
            )
    else:
        new_data = multi_categorical_manager \
            .target_encode_with_dst_induces(
                dst_table.pseudo_target,
                dst_induces,
                multi_categorical_manager.unique_word_num,
                multi_categorical_manager.row_num
            )
    # to_many_aggregate or set
    if len(to_many_path) == 0:
        # No to-many segment: attach the encoded values directly.
        new_data_name = "{}MultiCategoricalTargetEncoding_{}_{}".format(
            feature_types.aggregate_processed_numerical.prefix,
            self.__path, self.__col)
        self.__dataset.tables[to_many_path.src].set_new_data(
            new_data, new_data_name)
    else:
        # to_many_aggregate
        dst_data = new_data
        time_for_each_table = {
            table_idx: self.dataset.tables[table_name].hour_time_data
            for table_idx, table_name in enumerate(
                to_many_path.table_names)
            if self.dataset.tables[table_name].has_time
        }
        sorted_index_for_each_table = {
            table_idx: self.dataset.tables[table_name].sorted_time_index
            for table_idx, table_name in enumerate(
                to_many_path.table_names)
            if self.dataset.tables[table_name].has_time
        }
        src_id_for_each_relation = [
            self.dataset.tables[rel.src].df[rel.src_id].values
            for rel in to_many_path.relations
        ]
        dst_id_for_each_relation = [
            self.dataset.tables[rel.dst].df[rel.dst_id].values
            for rel in to_many_path.relations
        ]
        src_is_unique_for_each_relation = [
            rel.type.src_is_unique for rel in to_many_path.relations
        ]
        dst_is_unique_for_each_relation = [
            rel.type.dst_is_unique for rel in to_many_path.relations
        ]
        new_data = _core.Aggregator().aggregate(
            dst_data, time_for_each_table, sorted_index_for_each_table,
            src_id_for_each_relation, dst_id_for_each_relation,
            src_is_unique_for_each_relation,
            dst_is_unique_for_each_relation, "mean", "mean")
        new_data_name = "{}MultiCategoricalTargetEncoding_{}_{}".format(
            feature_types.aggregate_processed_numerical.prefix,
            self.__path, self.__col)
        self.__dataset.tables[to_many_path.src].set_new_data(
            new_data, new_data_name)
def synthesis(self):
    """Factorized target encoding of the column pair (col1, col2).

    The path is split into a trailing "to-one" segment, used to pull
    both id columns up to one value per row, and a leading "to-many"
    segment over which the encoded values are mean-aggregated before
    being attached to the source table.  Returns early when no pseudo
    target is available.

    Fix: removed leftover debug output from the to-many branch (an
    inline ``from sklearn import metrics``, a ``print("Factorized")``
    and a printed train AUC) that ran unconditionally on every call.
    """
    # Find the split point: path[i:] must be substantially to-one for
    # BOTH columns.  No break, so the last matching i (the longest
    # qualifying to-one suffix) wins.
    to_one_path = None
    to_many_path = None
    for i in range(len(self.__path), -1, -1):
        if self.__path[i:].is_substance_to_one_with_col(
                self.__dataset, self.__col1) \
                and self.__path[i:].is_substance_to_one_with_col(
                    self.__dataset, self.__col2):
            to_one_path = self.__path[i:]
            to_many_path = self.__path[:i]
    # to_one identify
    if len(to_one_path) > 0:
        dst_table = self.dataset.tables[to_one_path.dst]
        dst_data1 = dst_table.df[self.col1].values
        dst_data2 = dst_table.df[self.col2].values
        time_for_each_table = {
            table_idx: self.dataset.tables[table_name].hour_time_data
            for table_idx, table_name in enumerate(to_one_path.table_names)
            if self.dataset.tables[table_name].has_time
        }
        sorted_index_for_each_table = {
            table_idx: self.dataset.tables[table_name].sorted_time_index
            for table_idx, table_name in enumerate(to_one_path.table_names)
            if self.dataset.tables[table_name].has_time
        }
        src_id_for_each_relation = [
            self.dataset.tables[rel.src].df[rel.src_id].values
            for rel in to_one_path.relations
        ]
        dst_id_for_each_relation = [
            self.dataset.tables[rel.dst].df[rel.dst_id].values
            for rel in to_one_path.relations
        ]
        src_is_unique_for_each_relation = [
            rel.type.src_is_unique for rel in to_one_path.relations
        ]
        dst_is_unique_for_each_relation = [
            rel.type.dst_is_unique for rel in to_one_path.relations
        ]
        # Pull both id columns up through the to-one segment.
        ids1 = _core.Aggregator().aggregate(
            dst_data1, time_for_each_table, sorted_index_for_each_table,
            src_id_for_each_relation, dst_id_for_each_relation,
            src_is_unique_for_each_relation,
            dst_is_unique_for_each_relation, "last", "last")
        ids2 = _core.Aggregator().aggregate(
            dst_data2, time_for_each_table, sorted_index_for_each_table,
            src_id_for_each_relation, dst_id_for_each_relation,
            src_is_unique_for_each_relation,
            dst_is_unique_for_each_relation, "last", "last")
        ids1 = ids1.astype(np.int32)
        ids1[ids1 < 0] = -1  # negative ids mark unmatched rows
        ids2 = ids2.astype(np.int32)
        ids2[ids2 < 0] = -1
    else:
        # Empty to-one segment: read the id columns directly.
        # NOTE(review): to_one_path is empty here, so to_one_path.dst is
        # presumably the same table as the full path's dst — confirm.
        dst_table = self.dataset.tables[to_one_path.dst]
        ids1 = dst_table.df[self.col1].values
        ids2 = dst_table.df[self.col2].values
        ids1 = ids1.astype(np.int32)
        ids1[ids1 < 0] = -1
        ids2 = ids2.astype(np.int32)
        ids2[ids2 < 0] = -1
    # target encoding
    dst_table = self.dataset.tables[to_many_path.dst]
    if not dst_table.has_pseudo_target:
        return
    targets = dst_table.pseudo_target
    encoder = _core.FactorizedTargetEncoder()
    # Cardinalities of each id column and of the combined key space.
    k1 = len(np.unique(ids1))
    k2 = len(np.unique(ids2))
    k0 = k1 * k2
    if dst_table.has_hist_time_data:
        # Prefer historical time data when present.
        sorted_index = dst_table.sorted_time_index
        time_data = dst_table.hist_time_data
        new_data = encoder.temporal_encode(targets, ids1, ids2, time_data,
                                           sorted_index, k0, k1, k2)
    elif dst_table.has_time:
        sorted_index = dst_table.sorted_time_index
        time_data = dst_table.time_data
        new_data = encoder.temporal_encode(targets, ids1, ids2, time_data,
                                           sorted_index, k0, k1, k2)
    else:
        new_data = encoder.encode(targets, ids1, ids2, k0, k1, k2)
    if len(to_many_path) == 0:
        # No to-many segment: attach the encoded values directly.
        new_data_name = "{}FactorizedTargetEncoding_{}_{}_{}".format(
            feature_types.aggregate_processed_numerical.prefix,
            self.__path, self.__col1, self.__col2)
        self.__dataset.tables[to_many_path.src].set_new_data(
            new_data, new_data_name)
    else:
        # to_many_aggregate: mean the encoded values up to the source.
        dst_data = new_data
        time_for_each_table = {
            table_idx: self.dataset.tables[table_name].hour_time_data
            for table_idx, table_name in enumerate(
                to_many_path.table_names)
            if self.dataset.tables[table_name].has_time
        }
        sorted_index_for_each_table = {
            table_idx: self.dataset.tables[table_name].sorted_time_index
            for table_idx, table_name in enumerate(
                to_many_path.table_names)
            if self.dataset.tables[table_name].has_time
        }
        src_id_for_each_relation = [
            self.dataset.tables[rel.src].df[rel.src_id].values
            for rel in to_many_path.relations
        ]
        dst_id_for_each_relation = [
            self.dataset.tables[rel.dst].df[rel.dst_id].values
            for rel in to_many_path.relations
        ]
        src_is_unique_for_each_relation = [
            rel.type.src_is_unique for rel in to_many_path.relations
        ]
        dst_is_unique_for_each_relation = [
            rel.type.dst_is_unique for rel in to_many_path.relations
        ]
        new_data = _core.Aggregator().aggregate(
            dst_data, time_for_each_table, sorted_index_for_each_table,
            src_id_for_each_relation, dst_id_for_each_relation,
            src_is_unique_for_each_relation,
            dst_is_unique_for_each_relation, "mean", "mean")
        new_data_name = "{}FactorizedTargetEncoding_{}_{}_{}".format(
            feature_types.aggregate_processed_numerical.prefix,
            self.__path, self.__col1, self.__col2)
        self.__dataset.tables[to_many_path.src].set_new_data(
            new_data, new_data_name)
def synthesis(self):
    """Aggregate self.__col along the path using the configured
    (last_agg, other_agg) pair and attach the result to the source
    table.  Numerical features are kept only when their train AUC
    deviates from 0.5; categorical features are always kept.
    No-op when the path is empty.
    """
    if len(self.__path) == 0:
        return

    if self.__is_cat:
        prefix = feature_types.aggregate_processed_categorical.prefix
    else:
        prefix = feature_types.aggregate_processed_numerical.prefix
    new_data_name = "{}{}_{}_{}".format(
        prefix, self.__name, self.__path, self.__col)

    dst_values = self.dataset.tables[self.__path.dst].df[self.__col].values

    hour_time_by_idx = {}
    sorted_index_by_idx = {}
    for idx, table_name in enumerate(self.__path.table_names):
        table = self.dataset.tables[table_name]
        if table.has_time:
            hour_time_by_idx[idx] = table.hour_time_data
            sorted_index_by_idx[idx] = table.sorted_time_index

    src_ids = []
    dst_ids = []
    src_unique = []
    for rel in self.__path.relations:
        src_ids.append(self.dataset.tables[rel.src].df[rel.src_id].values)
        dst_ids.append(self.dataset.tables[rel.dst].df[rel.dst_id].values)
        src_unique.append(rel.type.src_is_unique)
    # Destination-side uniqueness is judged from the column content
    # ("substance") rather than taken from the declared relation type,
    # which previously supplied these flags.
    dst_unique = get_dst_is_substance_unique(
        self.__path, self.__dataset, self.__col)

    new_data = _core.Aggregator().aggregate(
        dst_values, hour_time_by_idx, sorted_index_by_idx,
        src_ids, dst_ids, src_unique, dst_unique,
        self.__last_agg, self.__other_agg)

    if self.__is_cat:
        self.__dataset.tables[self.__path.src].set_new_data(
            new_data, new_data_name)
        return

    # Numerical feature gate: rows with a finite target form the train
    # split; require some finite values and |AUC - 0.5| > 0.001.
    train_size = np.isfinite(self.__dataset.target).sum()
    finite_mask = np.isfinite(new_data[:train_size])
    if not finite_mask.any():
        return
    auc = metrics.roc_auc_score(
        self.__dataset.target[:train_size][finite_mask],
        new_data[:train_size][finite_mask])
    if np.abs(auc - 0.5) > 0.001:
        self.__dataset.tables[self.__path.src].set_new_data(
            new_data, new_data_name)
def synthesis(self):
    """Target-encode a categorical column along the relation path.

    The path is split into a trailing "to-one" segment (used to identify,
    per row, which destination row holds the categorical value) and a
    leading "to-many" segment (over which the encoded values are
    mean-aggregated).  The feature is attached to the to-many segment's
    source table only if it has more than one distinct finite train
    value and survives the adversarial-AUC (train/test drift) check.
    """
    # NOTE(review): uses self.path here but self.__path elsewhere —
    # presumably an equivalent property exists; confirm against the
    # class definition.
    dst_table = self.dataset.tables[self.path.dst]
    # Build (or fetch from the table cache) the categorical manager.
    if dst_table.has_cache(("categorical_manager", self.__col)):
        categorical_manager = dst_table.get_cache(
            ("categorical_manager", self.__col))
    else:
        processing_data = \
            dst_table.df[self.__col].fillna("").astype(str).values
        categorical_manager = \
            _core.CategoricalManager(processing_data)
        dst_table.set_cache(("categorical_manager", self.__col),
                            categorical_manager)
    # Find the split point: path[i:] must be substantially to-one for
    # this column.  There is no break, so the last matching i (the
    # longest qualifying to-one suffix) wins.
    to_one_path = None
    to_many_path = None
    for i in range(len(self.__path), -1, -1):
        if self.__path[i:].is_substance_to_one_with_col(
                self.__dataset, self.__col):
            to_one_path = self.__path[i:]
            to_many_path = self.__path[:i]
    # to_one identify
    if len(to_one_path) > 0:
        # Row indices of the destination table, pulled through the
        # to-one segment by last-value aggregation.
        dst_induces = np.arange(len(dst_table.df))
        time_for_each_table = {
            table_idx: self.dataset.tables[table_name].hour_time_data
            for table_idx, table_name in enumerate(to_one_path.table_names)
            if self.dataset.tables[table_name].has_time
        }
        sorted_index_for_each_table = {
            table_idx: self.dataset.tables[table_name].sorted_time_index
            for table_idx, table_name in enumerate(to_one_path.table_names)
            if self.dataset.tables[table_name].has_time
        }
        src_id_for_each_relation = [
            self.dataset.tables[rel.src].df[rel.src_id].values
            for rel in to_one_path.relations
        ]
        dst_id_for_each_relation = [
            self.dataset.tables[rel.dst].df[rel.dst_id].values
            for rel in to_one_path.relations
        ]
        src_is_unique_for_each_relation = [
            rel.type.src_is_unique for rel in to_one_path.relations
        ]
        dst_is_unique_for_each_relation = [
            rel.type.dst_is_unique for rel in to_one_path.relations
        ]
        dst_induces = _core.Aggregator().aggregate(
            dst_induces, time_for_each_table, sorted_index_for_each_table,
            src_id_for_each_relation, dst_id_for_each_relation,
            src_is_unique_for_each_relation,
            dst_is_unique_for_each_relation, "last", "last")
        dst_induces = dst_induces.astype(np.int32)
        # Negative values mark rows with no match.
        dst_induces[dst_induces < 0] = -1
    else:
        # Empty to-one segment: every row maps to itself.
        # NOTE(review): to_one_path is empty here, so to_one_path.dst is
        # presumably the same table as the full path's dst — confirm.
        dst_table = self.dataset.tables[to_one_path.dst]
        dst_induces = np.arange(len(dst_table.df))
        dst_induces = dst_induces.astype(np.int32)
        dst_induces[dst_induces < 0] = -1
    # target encoding
    dst_table = self.dataset.tables[to_many_path.dst]
    if not dst_table.has_pseudo_target:
        return
    targets = dst_table.pseudo_target
    if dst_table.has_time:
        # Time-aware encoding; prefers historical time data when present.
        sorted_index = dst_table.sorted_time_index
        if dst_table.has_hist_time_data:
            time_data = dst_table.hist_time_data
        else:
            time_data = dst_table.time_data
        new_data = categorical_manager \
            .temporal_target_encode_with_dst_induces(
                targets,
                dst_induces,
                time_data,
                sorted_index,
                categorical_manager.unique_num
            )
    else:
        new_data = \
            categorical_manager.target_encode_with_dst_induces(
                targets,
                dst_induces,
                categorical_manager.unique_num
            )
    if len(to_many_path) > 0:
        # to_many_aggregate: mean the encoded values up to the source.
        dst_data = new_data
        time_for_each_table = {
            table_idx: self.dataset.tables[table_name].hour_time_data
            for table_idx, table_name in enumerate(
                to_many_path.table_names)
            if self.dataset.tables[table_name].has_time
        }
        sorted_index_for_each_table = {
            table_idx: self.dataset.tables[table_name].sorted_time_index
            for table_idx, table_name in enumerate(
                to_many_path.table_names)
            if self.dataset.tables[table_name].has_time
        }
        src_id_for_each_relation = [
            self.dataset.tables[rel.src].df[rel.src_id].values
            for rel in to_many_path.relations
        ]
        dst_id_for_each_relation = [
            self.dataset.tables[rel.dst].df[rel.dst_id].values
            for rel in to_many_path.relations
        ]
        src_is_unique_for_each_relation = [
            rel.type.src_is_unique for rel in to_many_path.relations
        ]
        dst_is_unique_for_each_relation = [
            rel.type.dst_is_unique for rel in to_many_path.relations
        ]
        new_data = _core.Aggregator().aggregate(
            dst_data, time_for_each_table, sorted_index_for_each_table,
            src_id_for_each_relation, dst_id_for_each_relation,
            src_is_unique_for_each_relation,
            dst_is_unique_for_each_relation, "mean", "mean")
    new_data_name = "{}TargetEncoding_{}_{}".format(
        feature_types.aggregate_processed_numerical.prefix,
        self.__path, self.__col)
    # Feature gates: rows with a finite target form the train split.
    train_size = np.isfinite(self.__dataset.target).sum()
    train_isfinite = np.isfinite(new_data[:train_size])
    # A (near-)constant feature carries no signal.
    if (len(np.unique(new_data[:train_size][train_isfinite])) <= 1):
        return
    # Drop features that distinguish train from test too well (drift).
    if not auc_selection.numerical_adversarial_auc_select(
            self.__dataset, new_data, 0.2):
        return
    self.__dataset.tables[to_many_path.src].set_new_data(
        new_data, new_data_name)