def save_params(self):
    """Persist model hyper-parameters and the predicate/subject/object
    vocabulary mappings to the params JSON file for later reload."""
    # Everything a predict-time reload needs to rebuild the label space.
    self.params.update({
        'num_classes': self.num_classes,
        'p2s_dict': self.p2s_dict,
        'i2p_dict': self.i2p_dict,
        'p2o_dict': self.p2o_dict,
        'max_len': self.max_len,
    })
    save_json(jsons=self.params, json_path=self.params_path)
def save_params(self):
    """Persist classifier hyper-parameters and the label vocabularies
    (label list plus both index mappings) to the params JSON file."""
    # One atomic update keeps the saved config in sync with the trained model.
    self.params.update({
        'num_classes': self.num_classes,
        'labels': self.labels,
        'index2label': self.index2label,
        'label2index': self.label2index,
        'max_len': self.max_len,
    })
    save_json(jsons=self.params, json_path=self.params_path)
def fit_generator(self):
    """Train the model from lazy batch generators and log final metrics.

    Saves the index->label map and the (inference-ready) hyper-parameters
    first, then feeds Keras ``fit_generator`` from memory-friendly
    generators over the train/valid splits.
    """
    # Freeze training-only flags before persisting, so the saved config
    # can be reloaded directly for prediction.
    env = self.parameters['model_env_parameters']
    env['is_training'] = False
    env['trainable'] = False
    save_json(jsons=self.i2l, json_path=self.index2label_path)
    save_json(jsons=self.parameters, json_path=self.path_parameters)

    # Generators keep memory usage flat regardless of corpus size.
    train_gen = MyDataGenerator(self.train_data, self.l2i, self.tokenizer,
                                self.categories, self.max_len,
                                self.batch_size, shuffle=True)
    valid_gen = MyDataGenerator(self.valid_data, self.l2i, self.tokenizer,
                                self.categories, self.max_len,
                                self.batch_size, shuffle=True)

    history = self.model.fit_generator(
        train_gen.__iter__(),
        steps_per_epoch=len(train_gen),
        epochs=self.epoch,
        validation_data=valid_gen.__iter__(),
        validation_steps=len(valid_gen),
        callbacks=self.callback(),
    )

    # Report the last epoch's accuracy figures.
    last_epoch = history.epoch[-1] + 1
    train_acc = history.history['acc'][-1]
    valid_acc = history.history['val_acc'][-1]
    logger.info("model:{} last_epoch:{} train_acc{} val_acc{}".format(
        self.model_code, last_epoch, train_acc, valid_acc))
def fit_process(self, embedding_type, path, embed, rate=1, shuffle=True):
    """Load a labelled CSV and convert it into model-ready (x, y) arrays.

    Args:
        embedding_type: embedding family ('bert'/'albert'/'xlnet'/other);
            controls how encoded inputs are unpacked into the input list.
        path: CSV file with 'ques' and 'label' columns.
        embed: embedding wrapper exposing ``sentence2idx`` and
            ``trainable`` (project type — assumed interface, confirm).
        rate: fraction of the corpus to keep after shuffling.
        shuffle: randomize sample order before slicing.

    Returns:
        Tuple ``(x, y)``: network inputs (a list of arrays for
        multi-input embeddings) and one-hot label matrix.

    Fixes vs. original: removed leftover debug code in the xlnet branch
    (``count`` counter, unused ``x_0``, stray ``print``) and the shadowed
    comprehension variable; idiomatic index-list construction.
    """
    data = pd.read_csv(path)
    ques = [str(q).upper() for q in data['ques'].tolist()]
    label = [str(l).upper() for l in data['label'].tolist()]
    if shuffle:
        ques, label = np.array(ques), np.array(label)
        indexs = list(range(len(label)))
        random.shuffle(indexs)
        ques, label = ques[indexs].tolist(), label[indexs].tolist()

    # Build the label<->index mapping once; reload it on later runs so
    # the class indices stay stable across train/predict.
    if not os.path.exists(self.path_fast_text_model_l2i_i2l):
        label2index = {}
        index2label = {}
        for count, label_one in enumerate(set(label)):
            label2index[label_one] = count
            index2label[count] = label_one
        l2i_i2l = {'l2i': label2index, 'i2l': index2label}
        save_json(l2i_i2l, self.path_fast_text_model_l2i_i2l)
    else:
        l2i_i2l = load_json(self.path_fast_text_model_l2i_i2l)

    len_ql = int(rate * len(ques))
    if len_ql <= 500:  # tiny sample: keep whole corpus so training is viable
        len_ql = len(ques)

    x = []
    print("ques to index start!")
    for que in tqdm(ques[0:len_ql]):
        x.append(embed.sentence2idx(que))

    label_zo = []
    print("label to onehot start!")
    num_classes = len(l2i_i2l['l2i'])
    for label_one in tqdm(label[0:len_ql]):
        label_zeros = [0] * num_classes
        label_zeros[l2i_i2l['l2i'][label_one]] = 1
        label_zo.append(label_zeros)

    if embedding_type in ['bert', 'albert']:
        # BERT-style encoders yield (token_ids, segment_ids) pairs.
        x_, y_ = np.array(x), np.array(label_zo)
        x_1 = np.array([sample[0] for sample in x_])
        x_2 = np.array([sample[1] for sample in x_])
        return [x_1, x_2], y_
    elif embedding_type == 'xlnet':
        # XLNet yields 3 inputs (4 when the embedding is trainable),
        # each wrapped in an extra leading dimension.
        y_ = np.array(label_zo)
        x_1 = np.array([sample[0][0] for sample in x])
        x_2 = np.array([sample[1][0] for sample in x])
        x_3 = np.array([sample[2][0] for sample in x])
        if embed.trainable:
            x_4 = np.array([sample[3][0] for sample in x])
            x_all = [x_1, x_2, x_3, x_4]
        else:
            x_all = [x_1, x_2, x_3]
        return x_all, y_
    else:
        # Single-input embeddings: plain arrays.
        x_, y_ = np.array(x), np.array(label_zo)
        return x_, y_
def fit_generator(self):
    """Train the model from generator-fed batches and log final metrics.

    Persists the index->label map and inference-ready hyper-parameters,
    then trains via Keras ``fit_generator`` over ``MyDataGenerator``
    instances for the train/valid splits.

    NOTE(review): the original body defined a local ``DataGenerator``
    class (and captured ``model_code`` for it) that was never
    instantiated — batches actually come from ``MyDataGenerator``.
    That dead code is removed here; runtime behavior is unchanged.
    """
    # Freeze training-only flags so the saved config reloads cleanly
    # at prediction time.
    self.parameters['model_env_parameters']['is_training'] = False
    self.parameters['model_env_parameters']['trainable'] = False
    save_json(jsons=self.i2l, json_path=self.index2label_path)
    save_json(jsons=self.parameters, json_path=self.path_parameters)

    train_D = MyDataGenerator(self.train_data, self.l2i, self.tokenizer,
                              self.categories, self.max_len,
                              self.batch_size, shuffle=True)
    valid_D = MyDataGenerator(self.valid_data, self.l2i, self.tokenizer,
                              self.categories, self.max_len,
                              self.batch_size, shuffle=True)

    history = self.model.fit_generator(
        train_D.__iter__(),
        steps_per_epoch=len(train_D),
        epochs=self.epoch,
        validation_data=valid_D.__iter__(),
        validation_steps=len(valid_D),
        callbacks=self.callback(),
    )
    epoch = history.epoch[-1] + 1
    acc = history.history['acc'][-1]
    val_acc = history.history['val_acc'][-1]
    logger.info("model:{} last_epoch:{} train_acc{} val_acc{}".format(
        self.model_code, epoch, acc, val_acc))
def save_params(self):
    """Record the sequence length cap in the params JSON for reload."""
    params = self.params
    params['max_len'] = self.max_len
    save_json(jsons=params, json_path=self.params_path)