from flyai.dataset import Dataset
from model import Model

data = Dataset()
model = Model(data)

# p = model.predict(age=53, sex=1, cp=3, trestbps=130, chol=246, fbs=1, restecg=2,
#                   thalach=173, exang=0, oldpeak=0.0, slope=1, ca=3, thal=3)
# print(p)

tData = data.get_all_data()
preds = model.predict_all(tData[0])

y_test = [label['label'] for label in tData[1]]

rCount = 0.0
for i in range(len(preds)):
    if preds[i] == y_test[i]:
        rCount += 1.
test_acc = rCount / len(preds)
print('accuracy %g' % test_acc)
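# Optional sketch: the counting loop above collapses into a single call where
# scikit-learn happens to be available (an assumption; this script does not
# otherwise depend on it).
#   from sklearn.metrics import accuracy_score
#   test_acc = accuracy_score(y_test, preds)
#   print('accuracy %g' % test_acc)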
class Instructor(object):
    """
    Uses flyai's get_all_data and does its own train/dev split instead of
    relying on next_batch.
    """

    def __init__(self, arguments):
        self.arguments = arguments

    def train(self, train_category, dev_category, train_news, dev_news, tokenizer, Net=None, model=None):
        if os.path.exists(self.arguments.output_config_file) is True:
            os.remove(self.arguments.output_config_file)

        logger.info('>>train.shape: {} | dev.shape: {}'.format(train_category.shape, dev_category.shape))

        train_dataloader, train_examples_len = Util.load_data(
            news=train_news, category=train_category, data_type='train',
            label_list=self.arguments.label_list, max_length=self.arguments.max_seq_length,
            tokenizer=tokenizer, batch_size=self.arguments.BATCH)
        dev_dataloader, dev_examples_len = Util.load_data(
            news=dev_news, category=dev_category, data_type='dev',
            label_list=self.arguments.label_list, max_length=self.arguments.max_seq_length,
            tokenizer=tokenizer, batch_size=self.arguments.BATCH)

        num_train_optimization_steps = int(
            train_examples_len / self.arguments.BATCH /
            self.arguments.gradient_accumulation_steps) * self.arguments.EPOCHS

        # Model preparation
        logger.info("model name is {}".format(self.arguments.model_name))
        if model is None:
            if self.arguments.model_name == "BertOrigin":
                model = Net.from_pretrained(
                    pretrained_model_name_or_path=self.arguments.bert_model_dir,
                    num_labels=self.arguments.num_labels,
                    cache_dir=self.arguments.cache_dir)
            elif self.arguments.model_name == 'BertHAN':
                model = Net.from_pretrained(
                    pretrained_model_name_or_path=self.arguments.bert_model_dir,
                    num_labels=self.arguments.num_labels,
                    cache_dir=self.arguments.cache_dir)
            elif self.arguments.model_name == "BertCNN":
                filter_sizes = [int(val) for val in self.arguments.filter_sizes.split()]
                model = Net.from_pretrained(
                    pretrained_model_name_or_path=self.arguments.bert_model_dir,
                    num_labels=self.arguments.num_labels,
                    n_filters=self.arguments.filter_num,
                    filter_sizes=filter_sizes,
                    cache_dir=self.arguments.cache_dir)
            elif self.arguments.model_name == "BertATT":
                model = Net.from_pretrained(
                    pretrained_model_name_or_path=self.arguments.bert_model_dir,
                    num_labels=self.arguments.num_labels,
                    cache_dir=self.arguments.cache_dir)
            elif self.arguments.model_name == "BertRCNN":
                model = Net.from_pretrained(
                    pretrained_model_name_or_path=self.arguments.bert_model_dir,
                    num_labels=self.arguments.num_labels,
                    cache_dir=self.arguments.cache_dir,
                    rnn_hidden_size=self.arguments.rnn_hidden_size,
                    num_layers=self.arguments.num_layers,
                    bidirectional=self.arguments.bidirectional,
                    dropout=self.arguments.dropout)
            elif self.arguments.model_name == "BertCNNPlus":
                filter_sizes = [int(val) for val in self.arguments.filter_sizes.split()]
                model = Net.from_pretrained(
                    pretrained_model_name_or_path=self.arguments.bert_model_dir,
                    num_labels=self.arguments.num_labels,
                    cache_dir=self.arguments.cache_dir,
                    n_filters=self.arguments.filter_num,
                    filter_sizes=filter_sizes)
        model.to(DEVICE)

        # Optimizer preparation: no weight decay on bias and LayerNorm parameters
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        # To reproduce BertAdam specific behavior set correct_bias=False
        optimizer = AdamW(params=optimizer_grouped_parameters,
                          lr=self.arguments.learning_rate,
                          correct_bias=False)
        # PyTorch scheduler
        scheduler = WarmupLinearSchedule(optimizer=optimizer,
                                         warmup_steps=self.arguments.warmup_proportion,
                                         t_total=num_train_optimization_steps)

        # Loss function preparation
        if self.arguments.use_label_smoothing:
            criterion = NMTCriterion(label_smoothing=self.arguments.label_smoothing)
        else:
            criterion = nn.CrossEntropyLoss()
        criterion = criterion.to(DEVICE)

        best_auc, best_acc, global_step, early_stop_times = 0, 0, 0, 0
        for epoch in range(int(self.arguments.EPOCHS)):
            if early_stop_times >= self.arguments.early_stop * (train_examples_len // self.arguments.BATCH):
                break
            logger.info(f'---------------- Epoch: {epoch + 1:02} ----------')
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                model.train()
                if self.arguments.label_smoothing:
                    criterion.train()
                batch = tuple(t.to(DEVICE) for t in batch)
                _, input_ids, input_mask, segment_ids, label_ids = batch
                logits = model(input_ids, segment_ids, input_mask, labels=None)
                loss = criterion(inputs=logits, labels=label_ids, normalization=1.0, reduce=False)

                # Scale the loss when accumulating gradients over several steps
                if self.arguments.gradient_accumulation_steps > 1:
                    loss = loss / self.arguments.gradient_accumulation_steps
                loss.backward(torch.ones_like(loss))
                scheduler.step()
                if (step + 1) % self.arguments.gradient_accumulation_steps == 0:
                    optimizer.step()
                    optimizer.zero_grad()
                    global_step += 1

                if global_step % self.arguments.print_step == 0 and global_step != 0:
                    dev_loss, dev_acc, dev_report, dev_auc = Util.evaluate(
                        model, dev_dataloader, criterion, DEVICE,
                        self.arguments.label_list, args=self.arguments)
                    logger.info('\n>>>dev report: \n{}'.format(dev_report))
                    # track the best accuracy
                    if dev_acc > best_acc:
                        best_acc = dev_acc
                    # track the best AUC and save the model when it improves
                    if dev_auc > best_auc:
                        best_auc = dev_auc
                        # save the model
                        model_to_save = model.module if hasattr(model, 'module') else model
                        torch.save(model_to_save.state_dict(), self.arguments.output_model_file)
                        with open(self.arguments.output_config_file, 'w') as f:
                            f.write(model_to_save.config.to_json_string())
                        early_stop_times = 0
                    else:
                        early_stop_times += 1

        # If no checkpoint was ever written, save the final model
        if os.path.exists(self.arguments.output_config_file) is False:
            model_to_save = model.module if hasattr(model, 'module') else model
            torch.save(model_to_save.state_dict(), self.arguments.output_model_file)
            with open(self.arguments.output_config_file, 'w') as f:
                f.write(model_to_save.config.to_json_string())

    def generate(self):
        self.dataset = Dataset(epochs=self.arguments.EPOCHS,
                               batch=self.arguments.BATCH,
                               val_batch=self.arguments.BATCH)
        news, category, _, _ = self.dataset.get_all_data()
        news = np.asarray([i['news'] for i in news])
        category = np.asarray([i['category'] for i in category])

        # np.random.shuffle(np.asarray(index)) only shuffled a temporary copy and
        # left the indices in order; a permutation makes the 90/10 split random.
        index = np.random.permutation(len(news))
        train_news, dev_news = news[index[0:int(len(index) * 0.9)]], news[index[int(len(index) * 0.9):]]
        train_category, dev_category = category[index[0:int(len(index) * 0.9)]], category[index[int(len(index) * 0.9):]]
        return train_news, train_category, dev_news, dev_category

    def run(self):
        remote_helper.get_remote_date("https://www.flyai.com/m/chinese_base.zip")
        before_vocab_dir = os.path.join(os.getcwd(), 'vocab.txt')
        after_vocab_dir = os.path.join(args.bert_model_dir, 'vocab.txt')
        logger.info('>before_vocab_dir:{}'.format(before_vocab_dir))
        logger.info('>after_vocab_dir:{}'.format(after_vocab_dir))
        shutil.copyfile(before_vocab_dir, after_vocab_dir)

        if not os.path.exists(self.arguments.output_dir):
            os.mkdir(self.arguments.output_dir)
        self.arguments.BATCH = self.arguments.BATCH // self.arguments.gradient_accumulation_steps

        # Data preparation: tokenizer selection (from_pretrained is a classmethod,
        # so call it on the class rather than on a throwaway instance)
        tokenizer = BertTokenizer.from_pretrained(self.arguments.bert_model_dir,
                                                  do_lower_case=self.arguments.do_lower_case)
        # Fetch the data: news / categories
        train_news, train_category, dev_news, dev_category = self.generate()
        self.train(Net=Net,
                   train_category=train_category, dev_category=dev_category,
                   train_news=train_news, dev_news=dev_news,
                   tokenizer=tokenizer)
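# Rough loading sketch for the artifacts written by Instructor.train(): a
# state_dict at output_model_file plus a config JSON at output_config_file.
# `load_trained_model` is a hypothetical helper, and the BertConfig /
# Net(config, num_labels=...) signatures are assumptions to verify against the
# actual model classes used here.
def load_trained_model():
    from pytorch_transformers import BertConfig

    config = BertConfig.from_json_file(args.output_config_file)   # config saved by train()
    model = Net(config, num_labels=args.num_labels)               # hypothetical signature
    model.load_state_dict(torch.load(args.output_model_file, map_location='cpu'))
    model.to(DEVICE)
    model.eval()
    return model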
import argparse
import re
import json

import jieba
import gensim

from flyai.dataset import Dataset
from model import Model

# --------- Hyperparameters
parser = argparse.ArgumentParser()
parser.add_argument("-e", "--EPOCHS", default=5, type=int, help="train epochs")
parser.add_argument("-b", "--BATCH", default=2, type=int, help="batch size")
args = parser.parse_args()

# --------- Data access helper
dataset = Dataset(epochs=args.EPOCHS, batch=args.BATCH)
# --------- Model helper
modelpp = Model(dataset)

# Collect per-topic statistics of the stance labels
train_x, train_label, val_x, val_label = dataset.get_all_data()
topic_a = {
    'topic': '深圳禁摩限电',  # Shenzhen motorcycle / e-bike ban
    'total_number': 0,
    'None': 0,
    'Favor': 0,
    'Agan': 0,
    'text': []
}
topic_b = {
    'topic': '春节放鞭炮',  # setting off firecrackers at Spring Festival
    'total_number': 0,
    'None': 0,
    'Favor': 0,
    'Agan': 0,
    'text': []
}
# -*- coding: utf-8 -*-
from flyai.source.source import Source
from flyai.utils.yaml_helper import Yaml
from flyai.dataset import Dataset
from model import Model

print('predict was called')

data = Dataset()
model = Model(data)
p = model.predict_all(data.get_all_data()[0])
print(p)
    The argument is one row of the CSV used as input x. This method is called
    repeatedly by dataset.next_train_batch() and dataset.next_validation_batch(),
    and is also used for data processing during evaluation. Its fields correspond
    to input: -> columns: in app.yaml.
    '''

    def output_x(self, TARGET, TEXT):
        text2vec = self.input_x(TARGET, TEXT)
        return text2vec

    '''
    The output result; consumed by dataset.to_categorys(data).
    '''

    def output_y(self, data):
        index = np.argmax(data)
        return index


if __name__ == '__main__':
    from flyai.dataset import Dataset

    dataset = Dataset(10, 32)
    train_x, train_y, val_x, val_y = dataset.get_all_data()
    preTrainedEmbedding = PreTrainedEmbedding()
    contents = [x['TEXT'] for x in train_x]
    unfounds = []
    for words in contents:
        print(words)
        vector, unfound = preTrainedEmbedding.turnToVectors(words)
        unfounds.append(unfound)
    # average fraction of tokens not found in the pre-trained embedding
    print("unfound probability is: %f" % np.mean(unfounds))
class Instructor(object):
    """
    Uses flyai's get_all_data and does its own train/dev split instead of
    relying on next_batch.
    """

    def __init__(self, args):
        self.args = args
        self.tag_map = {label: i for i, label in enumerate(self.args.labels)}

    def train(self, train_source, train_target, dev_source, dev_target):
        if os.path.exists(self.args.output_dir) is True:
            shutil.rmtree(self.args.output_dir)

        train_dataloader = create_batch_iter(mode='train', X=train_source, y=train_target,
                                             batch_size=self.args.BATCH)
        dev_dataloader = create_batch_iter(mode='dev', X=dev_source, y=dev_target,
                                           batch_size=self.args.BATCH)
        self.model.to(DEVICE)

        # Optimizer preparation: no weight decay on bias and LayerNorm parameters
        param_optimizer = list(self.model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
             'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
             'weight_decay': 0.0}
        ]
        optimizer = AdamW(params=optimizer_grouped_parameters, lr=self.args.learning_rate)

        total_size = math.ceil(len(train_source) / self.args.BATCH)
        best_acc = 0
        for epoch in range(self.args.EPOCHS):
            for train_step, train_batch in enumerate(tqdm(train_dataloader, desc='Train_Iteration')):
                self.model.train()
                self.model.zero_grad()
                train_batch = tuple(t.to(DEVICE) for t in train_batch)
                t_input_ids, t_input_mask, t_labels, t_out_masks = train_batch
                t_bert_encode = self.model(t_input_ids, t_input_mask)
                loss = self.model.loss_fn(bert_encode=t_bert_encode, tags=t_labels,
                                          output_mask=t_out_masks)
                loss.backward()
                # gradient clipping
                # torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1)
                optimizer.step()

                if train_step % 10 == 0:
                    self.model.eval()
                    eval_loss = 0
                    for dev_step, dev_batch in enumerate(dev_dataloader):
                        dev_batch = tuple(t.to(DEVICE) for t in dev_batch)
                        d_input_ids, d_input_mask, d_label_ids, d_output_mask = dev_batch
                        with torch.no_grad():
                            d_bert_encode = self.model(d_input_ids, d_input_mask)
                            eval_loss += self.model.loss_fn(bert_encode=d_bert_encode,
                                                            tags=d_label_ids,
                                                            output_mask=d_output_mask)
                            predicts = self.model.predict(d_bert_encode, d_output_mask)
                            d_label_ids = d_label_ids.view(1, -1)
                            d_label_ids = d_label_ids[d_label_ids != -1]
                            eval_acc, eval_f1 = self.model.acc_f1(predicts, d_label_ids)
                            if eval_acc > best_acc:
                                best_acc = eval_acc
                                save_model(self.model, self.args.output_dir)
                            self.model.class_report(predicts, d_label_ids)
                    logger.info("\n>step {}".format(train_step))
                    logger.info("\n>epoch [{}] {}/{}\n\tloss {:.2f}".format(
                        epoch, train_step, total_size, loss.item()))

        # If no checkpoint was saved during training, save the final model
        if os.path.exists(self.args.output_dir) is False:
            save_model(self.model, self.args.output_dir)

    def generate(self):
        self.dataset = Dataset(epochs=self.args.EPOCHS, batch=self.args.BATCH,
                               val_batch=self.args.BATCH)
        source, target, _, _ = self.dataset.get_all_data()
        source = np.asarray([i['source'].split(' ') for i in source])
        target = np.asarray([i['target'].split(' ') for i in target])

        # Use a permutation; shuffling np.asarray(index) in place would only
        # shuffle a temporary copy and leave the indices in order.
        index = np.random.permutation(len(source))
        train_source, dev_source = source[index[0:int(len(index) * 0.9)]], source[index[int(len(index) * 0.9):]]
        train_target, dev_target = target[index[0:int(len(index) * 0.9)]], target[index[int(len(index) * 0.9):]]
        return train_source, train_target, dev_source, dev_target

    def run(self):
        # ## albert-base
        # remote_helper.get_remote_date('https://www.flyai.com/m/albert_base_zh_tensorflow.zip')
        # convert_tf_checkpoint_to_pytorch(
        #     tf_checkpoint_path="./data/input/model",
        #     bert_config_file="./data/input/model/albert_config_base.json",
        #     pytorch_dump_path="./data/input/model/pytorch_model.bin",
        #     share_type="all")

        # ## albert-large
        remote_helper.get_remote_date('https://www.flyai.com/m/albert_large_zh.zip')
        convert_tf_checkpoint_to_pytorch(
            tf_checkpoint_path="./data/input/model",
            bert_config_file="./data/input/model/albert_config_large.json",
            pytorch_dump_path="./data/input/model/pytorch_model.bin",
            share_type="all")

        # ## albert-xlarge
        # remote_helper.get_remote_date('https://www.flyai.com/m/albert_xlarge_zh_183k.zip')
        # convert_tf_checkpoint_to_pytorch(tf_checkpoint_path="./data/input/model",
        #                                  bert_config_file="./data/input/model/albert_config_xlarge.json",
        #                                  pytorch_dump_path="./data/input/model/pytorch_model.bin",
        #                                  share_type="all")

        self.model = Net(
            tag_map=self.tag_map,
            batch_size=self.args.BATCH,
            dropout=self.args.dropout,
            embedding_dim=self.args.embedding_size,
            hidden_dim=self.args.hidden_size,
        )
        train_source, train_target, dev_source, dev_target = self.generate()
        self.train(train_source, train_target, dev_source, dev_target)
# -*- coding: utf-8 -*-
from flyai.dataset import Dataset
from model import Model
from processor import Processor

# Data access helper
dataset = Dataset()
# Model helper
model = Model(dataset)

result = model.predict(
    text="您好!我们这边是施华洛世奇鄞州万达店!您是我们尊贵的会员,特意邀请您参加我们x.x-x.x的三八女人节活动!满xxxx元享晶璨花漾丝巾")
print(result)

tData = dataset.get_all_data()
preds = model.predict_all(tData[0])

y_test = [label['label'] for label in tData[1]]

rCount = 0.0
for i in range(len(preds)):
    if preds[i] == y_test[i]:
        rCount += 1.
test_acc = rCount / len(preds)
print('accuracy %g' % test_acc)
        target["labels"] = labels
        target["image_id"] = image_id
        target["area"] = areas
        target["iscrowd"] = iscrowd
        return torchvision.transforms.ToTensor()(img), target

    def __len__(self):
        return len(self.img_path_list)


def collate_fn(batch):
    return tuple(zip(*batch))


# Fetch all raw data
# example: [{'img_path': 'img/019646.jpg'}, ...] [{'label_path': 'label/019646.jpg.txt'}, ...]
x_train, y_train, x_val, y_val = dataset.get_all_data()

# Build our own data loaders
train_dataset = MaskDataset(x_train, y_train)
valid_dataset = MaskDataset(x_val, y_val)

# Batch sizes
train_batch_size = args.BATCH
valid_batch_size = 1

train_data_loader = torch.utils.data.DataLoader(train_dataset,
                                                batch_size=train_batch_size,
                                                shuffle=True,
                                                num_workers=0,
                                                collate_fn=collate_fn)
valid_data_loader = torch.utils.data.DataLoader(valid_dataset,
                                                batch_size=valid_batch_size,
                                                shuffle=False,
                                                num_workers=0,
                                                collate_fn=collate_fn)
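# Why the zip-based collate_fn: each image carries a variable number of boxes,
# so the default collate cannot stack the target dicts into a single tensor.
# The zip simply regroups the batch into (images, targets) tuples. A tiny
# self-contained illustration with hypothetical shapes:
_demo_batch = [
    (torch.rand(3, 224, 224), {'boxes': torch.rand(2, 4), 'labels': torch.tensor([1, 1])}),
    (torch.rand(3, 224, 224), {'boxes': torch.rand(5, 4), 'labels': torch.tensor([1, 2, 1, 2, 1])}),
]
_demo_images, _demo_targets = collate_fn(_demo_batch)
assert len(_demo_images) == len(_demo_targets) == 2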
class Run(object):
    def __init__(self):
        self.args = args
        self.dataset = Dataset(epochs=self.args.total_epochs,
                               batch=self.args.batch_size,
                               val_batch=self.args.batch_size)

    def train(self):
        self.audio_paths, self.labels, _, _ = self.dataset.get_all_data()

        # unit2idx
        unit2idx = {}
        with open(self.args.vocab_txt_path, 'r', encoding='utf-8') as fr:
            for line in fr:
                unit, idx = line.strip().split()
                unit2idx[unit] = int(idx)

        # Model definition
        model = Transformer(input_size=self.args.input_size,
                            vocab_size=self.args.vocab_size,
                            d_model=self.args.model_size,
                            n_heads=self.args.n_heads,
                            d_ff=self.args.model_size * 4,
                            num_enc_blocks=self.args.num_enc_blocks,
                            num_dec_blocks=self.args.num_dec_blocks,
                            residual_dropout_rate=self.args.residual_dropout_rate,
                            share_embedding=self.args.share_embedding)
        if torch.cuda.is_available():
            model.cuda()  # move the model to the GPU

        # vocabulary size taken from the generated vocab
        vocab_size = len(unit2idx)
        print('Set the size of vocab: %d' % vocab_size)

        # Build the dataset and dataloader
        dataset = AudioDataset(audios_list=[i['audio_path'] for i in self.audio_paths],
                               labels_list=[i['label'] for i in self.labels],
                               unit2idx=unit2idx)
        dataloader = DataLoader(dataset,
                                batch_size=self.args.batch_size,
                                shuffle=True,
                                num_workers=0,
                                pin_memory=False,
                                collate_fn=Util.collate_fn)

        # lr = Util.get_learning_rate(step=1)
        lr = self.args.lr_factor
        optimizer = torch.optim.Adam(model.parameters(), lr=lr, betas=(0.9, 0.98), eps=1e-9)

        if not os.path.exists(self.args.data_model_dir):
            os.makedirs(self.args.data_model_dir)

        global_step = 1
        step_loss = 0
        print('Begin to Train...')
        for epoch in range(self.args.total_epochs):
            print('***** epoch: %d *****' % epoch)
            for step, (inputs, targets) in enumerate(dataloader):
                # move inputs to the GPU
                if torch.cuda.is_available():
                    inputs = inputs.cuda()
                    targets = targets.cuda()
                loss = model(inputs, targets)
                loss.backward()
                step_loss += loss.item()

                if (step + 1) % self.args.accu_grads_steps == 0:
                    # gradient clipping
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)
                    optimizer.step()
                    optimizer.zero_grad()

                    if global_step % 10 == 0:
                        print('-Training-Epoch-%d, Global Step:%d, lr:%.8f, Loss:%.5f' %
                              (epoch, global_step, lr, step_loss / self.args.accu_grads_steps))
                    global_step += 1
                    step_loss = 0

                    # learning-rate update
                    # lr = Util.get_learning_rate(global_step)
                    lr = self.args.lr_factor
                    for param_group in optimizer.param_groups:
                        param_group['lr'] = lr

            # save the model after each epoch
            checkpoint = model.state_dict()
            torch.save(checkpoint,
                       os.path.join(self.args.data_model_dir, 'model.epoch.%d.pt' % epoch))
        print('Done!')
        transforms.RandomVerticalFlip(),
        transforms.RandomAffine(degrees=30, translate=(0.1, 0.1), scale=(0.9, 1.1)),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
        transforms.RandomErasing()
    ]),
    'val': transforms.Compose([
        transforms.Resize(crop_size),
        transforms.ToTensor(),
        transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
    ]),
}

# Load the data
dataset = Dataset()
x_train, y_train, x_val, y_val = dataset.get_all_data()

# Shuffle the data
all_x = x_train + x_val
all_y = y_train + y_val
length = len(all_x)
split = int(length * 0.1)
random.seed(0)
samples = random.sample(range(length), length)
# a plain list cannot be fancy-indexed, so convert to numpy arrays first
all_x, all_y = np.array(all_x), np.array(all_y)
all_x = all_x[samples]
all_y = all_y[samples]
x_train, y_train, x_val, y_val = all_x[:-split], all_y[:-split], all_x[-split:], all_y[-split:]
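# The same reproducible 90/10 split could also be written with scikit-learn's
# train_test_split (assuming scikit-learn is available; this script does not
# otherwise require it):
#   from sklearn.model_selection import train_test_split
#   x_train, x_val, y_train, y_val = train_test_split(
#       all_x, all_y, test_size=0.1, random_state=0, shuffle=True)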
class StanceDetection(object): def __init__(self, exec_type='train'): # 项目的超参 parser = argparse.ArgumentParser() parser.add_argument("-e", "--EPOCHS", default=50, type=int, help="train epochs") parser.add_argument("-b", "--BATCH", default=2, type=int, help="batch size") self.args = parser.parse_args() self.dataset = Dataset(epochs=self.args.EPOCHS, batch=self.args.BATCH) self.model_dir = os.path.join(os.getcwd(), arguments.model_dir) # 1. Split the data, read into defined format label2idx = dict((arguments.labels[i], i) for i in range(len(arguments.labels))) target_text, stance, _, _ = self.dataset.get_all_data() indexes = [" ".join(jieba.cut(i['TARGET'].lower(), cut_all=False)) for i in target_text] questions = [" ".join(jieba.cut(i['TEXT'].lower(), cut_all=False)) for i in target_text] labels = [i['STANCE'] for i in stance] data = [indexes, questions, labels] assert len(data[0]) == len(data[1]) == len(data[2]) # 2. Data follows this order: train, test train_num = int(len(data[0]) * arguments.portion) train_data = [d[:train_num] for d in data] dev_data = [d[train_num:] for d in data] # 3. Read the vocab text file and get VOCAB dictionary vocab = read_emb(filename=os.path.join(os.getcwd(), arguments.sgns_dir), stat_lines=1) # 4. Transform text into indexes self.datasets, word2idx, embeddings = make_datasets(vocab=vocab, raw_data={'training': train_data, 'validation': dev_data}, label2idx=label2idx, big_voc=arguments.big_voc, feat_names=arguments.feat_names) self.datasets_train = load_tvt(tvt_set=self.datasets['training'], max_lens=[arguments.ans_len, arguments.ask_len], feat_names=arguments.feat_names) self.datasets_dev = load_tvt(tvt_set=self.datasets['validation'], max_lens=[arguments.ans_len, arguments.ask_len], feat_names=arguments.feat_names) idx2word = dict((v, k) for k, v in word2idx.items()) self.datasets["word2idx"] = word2idx self.datasets["idx2word"] = idx2word self.embeddings = torch.from_numpy(np.asarray(embeddings, dtype=np.float32)) if exec_type == 'train': self.main() else: model = load_torch_model(self.model_dir) test(model=model, dataset=self.datasets, test_set=None) def main(self): """ continue training or not """ if arguments.proceed: if os.path.exists(self.model_dir): with open(self.model_dir, "rb") as saved_model: model = torch.load(saved_model) else: models = {"Net": Net} model = models[arguments.model](embeddings=self.embeddings, input_dim=self.embeddings.size(1), hidden_dim=arguments.nhid, num_layers=arguments.nlayers, output_dim=arguments.nclass, max_step=[arguments.ans_len, arguments.ask_len], dropout=arguments.dropout) if arguments.model in ["Net"]: model.nhops = arguments.nhops # train model.to(device=DEVICE) # 优化器 optimizer = optim.Adam(model.parameters(), lr=arguments.lr, weight_decay=5e-5) # 损失函数 criterion = nn.CrossEntropyLoss() best_f1_test, best_p_valid, best_f1_valid = -np.inf, -np.inf, -np.inf epoch_f1_test, epoch_f1_valid, epoch_f1_cur = 0, 0, 0 batches_per_epoch = len(self.datasets_train) // self.args.BATCH max_train_steps = int(self.args.EPOCHS * batches_per_epoch) print("--------------\nEpoch 0 begins!") bar = Bar(" Processing", max=max_train_steps) print(max_train_steps, self.args.EPOCHS, len(self.datasets_train), self.args.BATCH) for step in range(max_train_steps): bar.next() training_batch = self.datasets_train.next_batch(self.args.BATCH) features, seq_lens, mask_matrice, labels = training_batch (answers, answers_seqlen, answers_mask), (questions, questions_seqlen, questions_mask) \ = zip(features, seq_lens, mask_matrice) assert self.args.BATCH 
== len(labels) == len(questions) == len(answers) # Prepare data and prediction labels_ = Variable(torch.LongTensor(labels)).to(DEVICE) # necessary for Room model torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=0.25) # zero grad model.train() model.zero_grad() outputs = classify_batch(model=model, features=[answers, answers_seqlen, answers_mask, questions, questions_seqlen, questions_mask], max_lens=(arguments.ans_len, arguments.ask_len)) loss = criterion(outputs[0].view(len(labels_), -1), labels_) loss.backward() optimizer.step() # Test after each epoch if (step + 1) % batches_per_epoch == 0: tic = time.time() f1_score, p_score = test(model=model, log_result=False, dataset=self.datasets, test_set=self.datasets_dev, batch_size=self.args.BATCH) print("\n Begin to predict the results on Valid") print(" using %.5fs" % (time.time() - tic)) print(" ----Old best F1 on Valid is %f on epoch %d" % (best_f1_valid, epoch_f1_valid)) print(" ----Old best F1 on Test is %f on epoch %d" % (best_f1_test, epoch_f1_test)) if f1_score > best_f1_valid: with open(self.model_dir, 'wb') as to_save: torch.save(model, to_save) best_f1_valid = f1_score print(" ----New best F1 on Valid is %f" % f1_score) epoch_f1_valid = self.datasets_train.epochs_completed print("--------------\nEpoch %d begins!" % (self.datasets_train.epochs_completed + 1)) bar.finish()
class Instructor(object): """ 特点:使用flyai字典的get all data | 自己进行划分next batch | 按照不同的topic进行单独训练 """ def __init__(self, arguments): # 项目的超参 parser = argparse.ArgumentParser() parser.add_argument("-e", "--EPOCHS", default=5, type=int, help="train epochs") parser.add_argument("-b", "--BATCH", default=2, type=int, help="batch size") self.args = parser.parse_args() self.arguments = arguments self.dataset = Dataset(epochs=self.args.EPOCHS, batch=self.args.BATCH, val_batch=self.args.BATCH) if 'bert' in self.arguments.model_name: self.tokenizer = Tokenizer4Bert(max_seq_len=self.arguments.max_seq_len, pretrained_bert_name=os.path.join(os.getcwd(), self.arguments.pretrained_bert_name)) bert = BertModel.from_pretrained(pretrained_model_name_or_path=self.arguments.pretrained_bert_name) self.model = self.arguments.model_class(bert, self.arguments).to(self.arguments.device) else: self.tokenizer = Util.bulid_tokenizer( fnames=[self.arguments.dataset_file['train'], self.arguments.dataset_file['test']], max_seq_len=self.arguments.max_seq_len, dat_fname='{0}_tokenizer.dat'.format(self.arguments.dataset) ) embedding_matrix = Util.build_embedding_matrix( word2idx=self.tokenizer.word2idx, embed_dim=self.arguments.embed_dim, dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(self.arguments.embed_dim), self.arguments.dataset) ) self.model = self.arguments.model_class(embedding_matrix, self.arguments).to(self.arguments.device) if self.arguments.device.type == 'cuda': logger.info( 'cuda memory allocated: {}'.format(torch.cuda.memory_allocated(device=self.arguments.device.index))) Util.print_args(model=self.model, logger=logger, args=self.arguments) target_text, stance, _, _ = self.dataset.get_all_data() self.target = np.asarray([i['TARGET'].lower() for i in target_text]) text = np.asarray([i['TEXT'].lower() for i in target_text]) self.stance = np.asarray([i['STANCE'] for i in stance]) self.target_set = set() for tar in self.target: self.target_set.add(tar) self.text = PreProcessing(text).get_file_text() def run(self): # loss and optimizer criterion = nn.CrossEntropyLoss() _params = filter(lambda x: x.requires_grad, self.model.parameters()) optimizer = self.arguments.optimizer(_params, lr=self.arguments.learning_rate, weight_decay=self.arguments.l2reg) for topic in self.arguments.topics: logger.info('>' * 100) logger.info('topic: {}'.format(topic)) index = np.where(self.target == topic.lower()) self.trainset = ABSADataset(data_type=None, fname=(self.target[index], self.text[index], self.stance[index]), tokenizer=self.tokenizer) self.valset_len = int(len(self.trainset) * self.arguments.valset_ratio) self.trainset, self.valset = random_split(self.trainset, (len(self.trainset) - self.valset_len, self.valset_len)) train_data_loader = DataLoader(dataset=self.trainset, batch_size=self.args.BATCH, shuffle=True) val_data_loader = DataLoader(dataset=self.valset, batch_size=self.args.BATCH, shuffle=False) # 训练 max_val_acc = 0 max_val_f1 = 0 global_step = 0 best_model_path = None Util.reset_params(model=self.model, args=self.arguments) for epoch in range(self.args.EPOCHS): logger.info('>>') logger.info('epoch: {}'.format(epoch)) n_correct, n_total, loss_total = 0, 0, 0 self.model.train() for i_batch, sample_batched in enumerate(train_data_loader): global_step += 1 optimizer.zero_grad() inputs = [sample_batched[col].to(self.arguments.device) for col in self.arguments.inputs_cols] outputs = self.model(inputs) targets = torch.tensor(sample_batched['polarity']).to(self.arguments.device) loss = criterion(outputs, targets) 
loss.backward() optimizer.step() n_correct += (torch.argmax(outputs, -1) == targets).sum().item() n_total += len(outputs) loss_total += loss.item() * len(outputs) if global_step % self.arguments.log_step == 0: train_acc = n_correct / n_total train_loss = loss_total / n_total logger.info('loss: {:.4f}, acc: {:.4f}'.format(train_loss, train_acc)) val_acc, val_f1 = Util.evaluate_acc_f1(model=self.model, args=self.arguments, data_loader=val_data_loader) logger.info('> val_acc: {:.4f}, val_f1: {:.4f}'.format(val_acc, val_f1)) if val_acc > max_val_acc: max_val_acc = val_acc best_model_path = os.path.join(os.getcwd(), self.arguments.best_model_path, topic) if os.path.exists(best_model_path) is False: os.mkdir(best_model_path) Util.save_model(model=self.model, output_dir=best_model_path) logger.info('>> saved: {}'.format(best_model_path)) if val_f1 > max_val_f1: max_val_f1 = val_f1 Util.save_model(model=self.model, output_dir=best_model_path) logger.info('>>> target: {}'.format(self.target_set)) logger.info('> max_val_acc: {0} max_val_f1: {1}'.format(max_val_acc, max_val_f1)) logger.info('> train save model path: {}'.format(best_model_path))
class Instructor(object): """ 特点:使用flyai字典的get all data | 自己进行划分next batch """ def __init__(self, args): self.args = args self.sortedDict = SortedByCountsDict(dump_dir=self.args.vocab_dump_dir) self.acoustic_vocab_size, self.acoustic_vocab = Util.get_acoustic_vocab_list( ) self.language_vocab_size, self.language_vocab = Util.get_language_vocab_list( ) def generate(self): self.data = Dataset(epochs=self.args.EPOCHS, batch=self.args.BATCH, val_batch=self.args.BATCH) audio_paths, labels, _, _ = self.data.get_all_data() # wav文件路径 audio_paths = [i['audio_path'] for i in audio_paths] # wav文本数据 TODO:此处包含空格, 测试去掉空格能否提升模型性能 audio_labels = [] # wav文本拼音 audio_pinyins = [] for label in labels: label = label['label'].split(' ') audio_labels.append(''.join(label)) audio_pinyins.append(' '.join([ ' '.join([ ' '.join(j) for j in pinyin(i, style=Style.TONE3, heteronym=False) ]) for i in label ])) # 构建字典 for label in labels: self.sortedDict.append_tokens(label) self.sortedDict.dump_pkl() # 划分训练/验证 audio_paths = np.asarray(audio_paths) audio_labels = np.asarray(audio_labels) audio_pinyins = np.asarray(audio_pinyins) index = [i for i in range(len(audio_paths))] np.random.shuffle(np.asarray(index)) train_audio_paths, dev_audio_paths = audio_paths[ index[0:int(len(index) * 0.9)]], audio_paths[index[int(len(index) * 0.9):]] train_labels, dev_labels = audio_labels[ index[0:int(len(index) * 0.9)]], audio_labels[index[int(len(index) * 0.9):]] train_pinyins, dev_pinyins = audio_pinyins[ index[0:int(len(index) * 0.9)]], audio_pinyins[index[int(len(index) * 0.9):]] return train_audio_paths.tolist(), train_labels.tolist( ), train_pinyins.tolist(), dev_audio_paths.tolist(), dev_labels.tolist( ), dev_pinyins.tolist() def train_am(self, train_audio_paths, train_labels, train_pinyins, dev_audio_paths, dev_labels, dev_pinyins): """ 训练声学模型 :param train_audio_paths: :param train_labels: :param train_pinyins: :param dev_audio_paths: :param dev_labels: :param dev_pinyins: :return: """ model = CNNCTCModel(args=self.args, vocab_size=self.acoustic_vocab_size) # model = CNNRNNCTCModel(args=self.args, vocab_size=self.acoustic_vocab_size) hp = self.args hp.batch_size = self.args.am_batch_size hp.epochs = self.args.am_epochs hp.data_path = self.args.wav_dir hp.data_type = 'train' hp.feature_max_length = hp.am_feature_max_length train_generator = DataGenerator(audio_paths=train_audio_paths, labels=train_labels, pinyins=train_pinyins, hp=hp, acoustic_vocab=self.acoustic_vocab) hp.data_type = 'dev' dev_generator = DataGenerator(audio_paths=dev_audio_paths, labels=dev_labels, pinyins=dev_pinyins, hp=hp, acoustic_vocab=self.acoustic_vocab) cpCallBack = ModelCheckpoint(os.path.join(self.args.AmModelFolder, hp.am_ckpt), verbose=1, save_best_only=True) tbCallBack = keras.callbacks.TensorBoard( log_dir=self.args.AmModelTensorBoard, histogram_freq=0, write_graph=True, write_images=True, update_freq='epoch') select_model = '0' if os.path.exists(hp.AmModelFolder + select_model + '.hdf5'): print('load acoustic model...') model.load_model(select_model) model.ctc_model.fit_generator(train_generator, steps_per_epoch=len(train_pinyins) // hp.batch_size, validation_data=dev_generator, validation_steps=20, epochs=hp.epochs, workers=10, use_multiprocessing=True, callbacks=[cpCallBack, tbCallBack]) def train_lm(self, train_labels, train_pinyins): """ 训练语言学模型 :param train_labels: :param train_pinyins: :param dev_audio_paths: :param dev_labels: :param dev_pinyins: :return: """ hp = self.args hp.batch_size = self.args.lm_batch_size hp.epochs = 
self.args.lm_epochs hp.data_type = 'train' hp.max_len = self.args.lm_max_len hp.hidden_units = self.args.lm_hidden_units hp.is_training = self.args.lm_is_training hp.feature_dim = self.args.lm_feature_dim hp.num_heads = self.args.lm_num_heads hp.num_blocks = self.args.lm_num_blocks hp.position_max_length = self.args.lm_position_max_length hp.lr = self.args.lm_lr hp.dropout_rate = self.args.lm_dropout_rate epochs = hp.epochs lm_model = TransformerModel( arg=hp, acoustic_vocab_size=self.acoustic_vocab_size, language_vocab_size=self.language_vocab_size) batch_num = len(train_pinyins) // hp.batch_size with lm_model.graph.as_default(): saver = tf.train.Saver(max_to_keep=50) config = tf.ConfigProto() # 占用GPU90%的显存 config.gpu_options.per_process_gpu_memory_fraction = 0.9 with tf.Session(graph=lm_model.graph, config=config) as sess: merged = tf.summary.merge_all() sess.run(tf.global_variables_initializer()) if os.path.exists(hp.LmModelFolder): print('loading language model...') latest = tf.train.latest_checkpoint(hp.LmModelFolder) if latest is not None: saver.restore(sess, latest) writer = tf.summary.FileWriter(hp.LmModelTensorboard, tf.get_default_graph()) for k in range(epochs): total_loss = 0 batch = Util.get_lm_batch(args=hp, pny_lst=train_pinyins, han_lst=train_labels, acoustic_vocab=self.acoustic_vocab, language_vocab=self.language_vocab) for i in range(batch_num): input_batch, label_batch = next(batch) feed = {lm_model.x: input_batch, lm_model.y: label_batch} cost, _ = sess.run([lm_model.mean_loss, lm_model.train_op], feed_dict=feed) total_loss += cost if i % 10 == 0: print("epoch: %d step: %d/%d train loss=6%f" % (k + 1, i, batch_num, cost)) if i % 5000 == 0: rs = sess.run(merged, feed_dict=feed) writer.add_summary(rs, k * batch_num + i) print('epochs', k + 1, ': average loss = ', total_loss / batch_num) saver.save(sess, hp.LmModelFolder + hp.lm_ckpt) writer.close() pass def run(self): # 拷贝文件 for name, after_dir in zip( ['dict.txt', 'hanzi.txt', 'mixdict.txt'], [self.args.dict_dir, self.args.hanzi_dir, self.args.mixdict_dir]): before_dir = os.path.join(os.getcwd(), 'attach_data', name) logger.info('>>>name:{}'.format(name)) logger.info('>before_dir:{}'.format(before_dir)) logger.info('>after_dir:{}'.format(after_dir)) shutil.copyfile(before_dir, after_dir) train_audio_paths, train_labels, train_pinyins, dev_audio_paths, dev_labels, dev_pinyins = self.generate( ) logger.info('start train am model!') self.train_am(train_audio_paths, train_labels, train_pinyins, dev_audio_paths, dev_labels, dev_pinyins) logger.info('end train am model!') logger.info('start train lm model!') self.train_lm(train_labels=train_labels, train_pinyins=train_pinyins) logger.info('end train lm model!')