def sequentialize_data_no_padding(self, train_contents, feature_mode, val_contents=None,
                                  tokenizer=None, max_length=None, Max_Vocab_Size=None):
    if Max_Vocab_Size is None:
        Vocab_Size = MAX_VOCAB_SIZE
    else:
        Vocab_Size = Max_Vocab_Size
    info("Max Vocab Size is {}".format(Vocab_Size))
    if tokenizer is None:
        if feature_mode == 0:
            # Char-level features with an explicit out-of-vocabulary token.
            tokenizer = text.Tokenizer(num_words=Vocab_Size, char_level=True, oov_token="UNK")
        elif feature_mode == 1:
            # Word-level features.
            tokenizer = text.Tokenizer(num_words=Vocab_Size)
        tokenizer.fit_on_texts(train_contents)
    _max_length = max_length
    word_index = tokenizer.word_index
    num_features = min(len(word_index) + 1, Vocab_Size)
    info("vocab_word: {}".format(len(word_index)))
    return word_index, num_features, tokenizer, _max_length
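
# Usage sketch (illustrative; `docs` is hypothetical and MAX_VOCAB_SIZE is
# assumed to be defined at module level):
#
#   docs = ["hello world", "hello auto nlp"]
#   word_index, num_features, tok, max_len = self.sequentialize_data_no_padding(
#       docs, feature_mode=1)                # word-level tokenizer
#   seqs = tok.texts_to_sequences(docs)      # e.g. [[1, 2], [1, 3, 4]]
#
# feature_mode=0 would instead fit a char-level tokenizer with "UNK" as the
# out-of-vocabulary token.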
def get_nlp_test_numpy_prefetch(self):
    if self.test_tfds is None:
        error("Error: test_tfds is None.")
        return self.accum_test_x, self.accum_test_y
    X, Y = [], []
    if len(self.accum_test_x) == 0:
        time_test_np_start = time.time()
        tfds_test_os_iterator = self.test_tfds.make_one_shot_iterator()
        tfds_test_iter_next = tfds_test_os_iterator.get_next()
        with tf.Session(config=tf.ConfigProto(log_device_placement=False)) as sess:
            while True:
                try:
                    example, labels = sess.run(tfds_test_iter_next)
                    # (batch, seq_len, 1, 1, 1) -> (batch, seq_len) integer tokens.
                    example = np.squeeze(example, (2, 3))
                    example = np.squeeze(example, axis=-1)
                    example = example.astype(int)
                    X.extend(example)
                    Y.extend(labels)
                    self.accum_test_x.extend(example)
                    self.accum_test_y.extend(labels)
                    self.accm_test_cnt += example.shape[0]
                except tf.errors.OutOfRangeError:
                    break
        time_test_np_end = time.time()
        info("note: now take test accm_test_cnt={}, cost_time={}s".format(
            self.accm_test_cnt, round(time_test_np_end - time_test_np_start, 3)))
    # Return the accumulated test set: identical to X/Y on the first call,
    # and still populated when the data was already prefetched earlier.
    return self.accum_test_x, self.accum_test_y
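
# Shape sketch (the 5-D layout is inferred from the squeeze calls above):
# batched AutoDL text examples arrive as (batch, seq_len, 1, 1, 1) dense
# tensors, and the two squeezes reduce them to (batch, seq_len) int matrices:
#
#   x = np.zeros((32, 300, 1, 1, 1))
#   x = np.squeeze(x, (2, 3))    # -> (32, 300, 1)
#   x = np.squeeze(x, axis=-1)   # -> (32, 300)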
def sample_train_index_add(self, add_index):
    # Per-label distribution of the incremental data.
    train_label_distribution = np.sum(np.array(self.update_y), 0)
    print("new sample train_distribution: ", train_label_distribution)
    self.max_sample_num_per_class = int(np.max(train_label_distribution) * 4 / 5)
    if self.sample_num_per_class is None:
        if self.num_samples_train < MAX_SAMPLE_TRIAN:
            self.sample_num_per_class = self.max_sample_num_per_class
        else:
            self.sample_num_per_class = min(self.max_sample_num_per_class,
                                            self.MAX_TRAIN_PERCLASS_SAMPLE)
    info("start sample data")
    max_sample_num = min(self.sample_num_per_class, int(np.mean(train_label_distribution)))
    if self.imbalance_flg:
        max_sample_num = int(max_sample_num * self.normal_std)
    print("max_sample_num is {}".format(max_sample_num))
    meta_train_add_index = []
    for i in range(self.num_classes):
        # Sample per label class.
        if len(add_index[i]) == 0:
            continue
        elif len(add_index[i]) < self.sample_num_per_class:
            if self.imbalance_flg:
                if len(add_index[i]) < max_sample_num:
                    # Oversample: repeat the whole index list, then draw the
                    # shortfall at random.
                    tmp = add_index[i] * int(max_sample_num / len(add_index[i]))
                    tmp += random.sample(add_index[i], max_sample_num - len(tmp))
                else:
                    tmp = random.sample(add_index[i], max_sample_num)
            else:
                # Oversample: repeat the whole index list, then draw n more
                # samples, where n is the shortfall.
                tmp = add_index[i] * int(self.sample_num_per_class / len(add_index[i]))
                tmp += random.sample(add_index[i], self.sample_num_per_class - len(tmp))
            meta_train_add_index += tmp
        else:
            # Enough samples in this class: draw randomly.
            if self.imbalance_flg:
                meta_train_add_index += random.sample(add_index[i], max_sample_num)
            else:
                meta_train_add_index += random.sample(add_index[i], self.sample_num_per_class)
    info("end sample data")
    random.shuffle(meta_train_add_index)
    self.meta_train_add_index = meta_train_add_index
    return meta_train_add_index
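
# Oversampling sketch (hypothetical numbers): with sample_num_per_class = 10
# and only 3 indices available for a class, `add_index[i] * int(10 / 3)`
# repeats the list 3 times (9 items) and `random.sample(add_index[i], 1)`
# tops it up to exactly 10, so every non-empty class contributes the same
# number of training indices.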
def get_train_numpy(self, update_train_num):
    if self.train_tfds is None:
        error("Error: train_tfds is None.")
        return self.accum_train_x, self.accum_train_y
    if self.tfds_train_os_iterator is None:
        self.tfds_train_os_iterator = self.train_tfds.make_one_shot_iterator()
        self.tfds_train_iter_next = self.tfds_train_os_iterator.get_next()
    X, Y = [], []
    cur_get_cnt = 0
    cur_data_y = list()
    if self.accm_train_cnt < self.train_num:
        info("show the accum_train_cnt {}".format(self.accm_train_cnt))
        time_train_np_start = time.time()
        while True:
            try:
                example, labels = self.tfds_convertor_sess.run(self.tfds_train_iter_next)
                # (batch, seq_len, 1, 1, 1) -> (batch, seq_len) integer tokens.
                example = np.squeeze(example, (2, 3))
                example = np.squeeze(example, axis=-1)
                example = example.astype(int)
                self.accum_train_x.extend(example)
                cur_data_y.extend(labels)
                X.extend(example)
                Y.extend(labels)
                cur_get_cnt += example.shape[0]
                self.accm_train_cnt += example.shape[0]
                if cur_get_cnt >= update_train_num or self.accm_train_cnt >= self.train_num:
                    time_train_np_end = time.time()
                    info("note: now take train update={}, accm_train_cnt={}, cost_time={}s".format(
                        cur_get_cnt, self.accm_train_cnt,
                        time_train_np_end - time_train_np_start))
                    break
            except tf.errors.OutOfRangeError:
                break
        if self.accum_train_y is None:
            self.accum_train_y = np.array(cur_data_y)
        else:
            print("check accum_train_y shape {}, cur_data_y shape {}".format(
                self.accum_train_y.shape, np.array(cur_data_y).shape))
            self.accum_train_y = np.concatenate((self.accum_train_y, np.array(cur_data_y)), axis=0)
    else:
        self.tfds_convertor_sess.close()
    return X, Y
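
# Streaming sketch: each call pulls roughly `update_train_num` more examples
# from the one-shot iterator and appends them to the accumulators, so
# repeated calls convert the TFDS stream to numpy incrementally, e.g.
# (hypothetical sizes):
#
#   x1, y1 = self.get_train_numpy(update_train_num=5000)   # first ~5000
#   x2, y2 = self.get_train_numpy(update_train_num=5000)   # next ~5000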
def __init__(self, x_train, y_train, metadata, imbalance_level=-1, multi_label=False):
    self.meta_data_x, self.meta_data_y = x_train, y_train
    # The first incremental batch is simply the initial data.
    self.update_x, self.update_y = x_train, y_train
    self.metadata = metadata
    self.num_classes = self.metadata['class_num']
    self.num_samples_train = self.metadata['train_num']
    self.language = metadata['language']
    self.multi_label = multi_label
    print("num_samples_train:", self.num_samples_train)
    print("num_class_train:", self.num_classes)
    self.val_index = None
    self.tokenizer = None
    self.max_length = None
    self.sample_num_per_class = None
    self.data_feature = {}
    self.eda_feature = {}
    self.pseudo_x_train_size = 0
    self.full_x = []
    self.full_y = np.array([])
    self.x_dict = {i: [] for i in range(self.num_classes)}
    self.imbalance_flg = False
    self.do_generate_sample = False
    self.empty_class_ = []
    self.meta_train_x = []
    self.meta_train_y = np.array([])
    self.full_index = None
    self.imbalance_level = imbalance_level
    self.MAX_TRAIN_PERCLASS_SAMPLE = MAX_TRAIN_PERCLASS_SAMPLE
    info("Init Data Manager! Imbalance_level is {}".format(self.imbalance_level))
    if 2 < self.num_classes <= 5 and self.imbalance_level <= 1:
        self.MAX_TRAIN_PERCLASS_SAMPLE = 3000
    elif self.num_classes == 2 and self.imbalance_level <= 1:
        self.MAX_TRAIN_PERCLASS_SAMPLE = 3500
    if self.multi_label:
        if self.num_classes < 50:
            # Not too many classes: take 100 samples per class.
            self.MAX_TRAIN_PERCLASS_SAMPLE = 100
        elif self.num_classes < 100:
            self.MAX_TRAIN_PERCLASS_SAMPLE = 50
        else:
            self.MAX_TRAIN_PERCLASS_SAMPLE = 20
    info("Init Data Manager! MAX_TRAIN_PERCLASS_SAMPLE is {}".format(self.MAX_TRAIN_PERCLASS_SAMPLE))
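
# Construction sketch (the class name `DataManager` and the metadata values
# are hypothetical; the real dict comes from the AutoDL ingestion program):
#
#   metadata = {"class_num": 4, "train_num": 20000, "language": "EN"}
#   dm = DataManager(x_train, y_train, metadata, imbalance_level=0)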
def generate_pseudo_samples(self, train_x, train_y):
    info("Randomly create pseudo samples!")
    for i in range(self.num_classes):
        new_samples = self.new_generate_samples_idx[i]
        if len(new_samples) == 0:
            continue
        train_x.extend(new_samples)
        # One-hot labels for the newly generated samples of class i.
        new_label = np.zeros((len(new_samples), self.num_classes))
        new_label[:, i] = 1
        train_y = np.concatenate([train_y, new_label], axis=0)
    return train_x, train_y
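
# Label sketch: for 3 classes and 2 pseudo samples of class 1, the generated
# one-hot block is
#
#   new_label = np.zeros((2, 3)); new_label[:, 1] = 1
#   # -> [[0., 1., 0.], [0., 1., 0.]]
#
# which is concatenated onto the existing one-hot train_y.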
def sample_dataset_pipeline(self, use_val=False, update_train=True, data_x=None, data_y=None):
    """
    Global sampling pipeline.
    :param use_val: whether to use the validation data for training
    :param update_train: whether to update the full training set
    :param data_x: sampling source x: incremental data or the full raw data
    :param data_y: sampling source y: incremental data or the full raw data
    :return: the balance-sampled train/eval sets; the eval set is empty when use_val is True
    """
    val_diff_x, val_diff_y = None, None
    # ############################ sampling preparation ############################
    if update_train:
        # Incremental update (the first batch of samples is itself an increment).
        self.add_index, self.add_val_index = self.sample_val_index(data_y)
        val_diff_x, val_diff_y = map_x_y(self.add_val_index, data_x, data_y)
        # The training set has not been sampled yet at this point.
        train_diff_x, train_diff_y = flat_map_x_y(index=self.add_index, x=data_x, y=data_y)
        if use_val:
            # With use_val, no train/valid split: all data updates meta_train.
            info(color_msg(msg="use val is True", color='blue'))
            train_diff_x = train_diff_x + val_diff_x
            train_diff_y = np.concatenate([train_diff_y, val_diff_y], axis=0)
            val_diff_x = None
            val_diff_y = None
        self._update_train_meta(train_diff_x, train_diff_y)
    if val_diff_x:
        val_label_distribution = np.sum(np.array(val_diff_y), 0)
        info("val_distribution: {}".format(val_label_distribution))
        info("Check val_diff_x size {}, val_diff_y size {}".format(len(val_diff_x), val_diff_y.shape[0]))
    info("Check meta_train_x size {}, meta_train_y size {}".format(
        len(self.meta_train_x), self.meta_train_y.shape[0]))
    info("Check meta_data_x size {}, meta_data_y size {}".format(
        len(self.meta_data_x), self.meta_data_y.shape[0]))
    # ############################ sampling stage ############################
    train_x, train_y = self.get_sampling_data_frm_full_train()
    return train_x, train_y, val_diff_x, val_diff_y
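
# Pipeline sketch (hypothetical call): feed the latest incremental batch and
# get class-balanced, ready-to-train arrays back:
#
#   train_x, train_y, val_x, val_y = self.sample_dataset_pipeline(
#       use_val=False, update_train=True,
#       data_x=self.update_x, data_y=self.update_y)
#
# With use_val=True the validation split is folded back into training and
# val_x/val_y come back as None.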
def get_sampling_data_frm_full_train(self):
    """
    Sample from the global train data, looking only at the current
    meta_train_x / meta_train_y.
    """
    sample_index = get_sample_index(self.meta_train_y, self.num_classes)
    # Per-label distribution of the current training pool.
    train_label_distribution = np.sum(np.array(self.meta_train_y), 0)
    info(color_msg("before sampling--train_distribution: {}".format(train_label_distribution),
                   color='yellow'))
    self.balance_sampling_index(sample_index, train_label_distribution)
    # Each round, only check whether the data about to be sampled is balanced
    # and whether pseudo samples need to be generated.
    self.normal_std, self.empty_class_ = get_imbalance_statistic(train_label_distribution)
    self.check_imbalance_level(train_label_distribution)
    self.new_generate_samples_idx = self.generate_presudo_samples(sample_index)
    self.show_data_info()
    self.imbalance_flg = False
    train_x, train_y = self.extend_train_data(x=self.meta_train_x, y=self.meta_train_y)
    # Per-label distribution after sampling.
    train_label_distribution = np.sum(np.array(train_y), 0)
    info(color_msg("after sampling--train_distribution: {}".format(train_label_distribution),
                   color='yellow'))
    return train_x, train_y
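
# Distribution sketch: with one-hot labels, summing over axis 0 gives the
# per-class counts used by every balance check here:
#
#   y = np.array([[1, 0], [1, 0], [0, 1]])
#   np.sum(y, 0)   # -> array([2, 1]): two samples of class 0, one of class 1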
def _set_max_train_sample_num(self, train_label_distribution):
    self.max_sample_num_per_class = int(np.max(train_label_distribution) * 4 / 5)
    if self.sample_num_per_class is None:
        if self.num_samples_train < MAX_SAMPLE_TRIAN:
            self.sample_num_per_class = self.max_sample_num_per_class
        else:
            self.sample_num_per_class = min(self.max_sample_num_per_class,
                                            self.MAX_TRAIN_PERCLASS_SAMPLE)
    else:
        # With many classes, a small first batch would otherwise keep the
        # per-class sample size too low for all later sampling rounds.
        self.sample_num_per_class = max(self.max_sample_num_per_class,
                                        int(np.mean(train_label_distribution)))
    info("check sample_num_per_class:{}".format(self.sample_num_per_class))
    if self.imbalance_flg:
        max_sample_num = min(self.sample_num_per_class, int(np.mean(train_label_distribution)))
        max_sample_num = min(max_sample_num, self.MAX_TRAIN_PERCLASS_SAMPLE)
    else:
        max_sample_num = min(self.sample_num_per_class, self.MAX_TRAIN_PERCLASS_SAMPLE)
    info("max_sample_num is {}".format(max_sample_num))
    return max_sample_num
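
# Budget sketch (hypothetical counts): for a distribution [100, 40, 10],
# max_sample_num_per_class = int(100 * 4 / 5) = 80; on an imbalanced round
# the budget is further capped by the mean count (50) and by
# MAX_TRAIN_PERCLASS_SAMPLE, whichever is smaller.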
def show_data_info(self):
    info("check empty class {}, imbalance_flg is {}, normalized std is {} and do generate sample flg is {}".format(
        self.empty_class_, self.imbalance_flg, round(self.normal_std, 6), self.do_generate_sample))