def __init__(self, args):
    self.args = args
    cfg = train.Config.from_json(args.train_cfg)
    model_cfg = models.Config.from_json(args.model_cfg)
    set_seeds(cfg.seed)

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab, do_lower_case=True)
    tokenize = lambda x: tokenizer.tokenize(tokenizer.convert_to_unicode(x))

    pipeline = [Preprocess4Pretrain(args.max_pred,
                                    args.mask_prob,
                                    list(tokenizer.vocab.keys()),
                                    tokenizer.convert_tokens_to_ids,
                                    model_cfg.max_len,
                                    args.mask_alpha,
                                    args.mask_beta,
                                    args.max_gram)]
    data_iter = DataLoader(SentPairDataset(args.data_file,
                                           cfg.batch_size,
                                           tokenize,
                                           model_cfg.max_len,
                                           pipeline=pipeline),
                           batch_size=cfg.batch_size,
                           collate_fn=seq_collate,
                           num_workers=mp.cpu_count())

    model = Generator(model_cfg)
    self.optimizer = optim.optim4GPU(cfg, model)
    self.trainer = train.MLMTrainer(cfg, model, data_iter, self.optimizer,
                                    args.save_dir, get_device())
    os.makedirs(os.path.join(args.log_dir, args.name), exist_ok=True)
    self.writer = SummaryWriter(log_dir=os.path.join(args.log_dir, args.name))  # for tensorboardX
def main(task='mrpc',
         train_cfg='config/train_mrpc.json',
         model_cfg='config/bert_base.json',
         data_file='../glue/MRPC/train.tsv',
         model_file=None,
         pretrain_file='../uncased_L-12_H-768_A-12/bert_model.ckpt',
         data_parallel=True,
         vocab='../uncased_L-12_H-768_A-12/vocab.txt',
         save_dir='../exp/bert/mrpc',
         max_len=128,
         mode='train'):

    cfg = train.Config.from_json(train_cfg)
    model_cfg = models.Config.from_json(model_cfg)

    set_seeds(cfg.seed)

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab, do_lower_case=True)
    TaskDataset = dataset_class(task)  # task dataset class according to the task
    pipeline = [Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
                AddSpecialTokensWithTruncation(max_len),
                TokenIndexing(tokenizer.convert_tokens_to_ids,
                              TaskDataset.labels, max_len)]
    dataset = TaskDataset(data_file, pipeline)
    data_iter = DataLoader(dataset, batch_size=cfg.batch_size, shuffle=True)

    model = Classifier(model_cfg, len(TaskDataset.labels))
    criterion = nn.CrossEntropyLoss()

    trainer = train.Trainer(cfg,
                            model,
                            data_iter,
                            optim.optim4GPU(cfg, model),
                            save_dir, get_device())

    if mode == 'train':
        def get_loss(model, batch, global_step):  # make sure loss is a scalar tensor
            input_ids, segment_ids, input_mask, label_id = batch
            logits = model(input_ids, segment_ids, input_mask)
            loss = criterion(logits, label_id)
            return loss

        trainer.train(get_loss, model_file, pretrain_file, data_parallel)

    elif mode == 'eval':
        def evaluate(model, batch):
            input_ids, segment_ids, input_mask, label_id = batch
            logits = model(input_ids, segment_ids, input_mask)
            _, label_pred = logits.max(1)
            result = (label_pred == label_id).float()  # .cpu().numpy()
            accuracy = result.mean()
            return accuracy, result

        results = trainer.eval(evaluate, model_file, data_parallel)
        total_accuracy = torch.cat(results).mean().item()
        print('Accuracy:', total_accuracy)
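# A minimal entry-point sketch for invoking the fine-tuning main() above.
# It assumes the `fire` package is used for command-line parsing, as in similar
# BERT example scripts; adjust if this repository wires up its CLI differently.
if __name__ == '__main__':
    import fire
    fire.Fire(main)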
def main(config='config/finetune/agnews/train.json'):
    cfg = Config(**json.load(open(config, "r")))
    cfg_data = data.Config(**json.load(open(cfg.cfg_data, "r")))
    cfg_model = models.Config(**json.load(open(cfg.cfg_model, "r")))
    cfg_optim = trainer.Config(**json.load(open(cfg.cfg_optim, "r")))

    set_seeds(cfg.seed)

    TaskDataset = data.get_class(cfg_data.task)  # task dataset class according to the task
    tokenizer = tokenization.FullTokenizer(vocab_file=cfg_data.vocab_file, do_lower_case=True)
    dataset = TaskDataset(cfg_data.data_file[cfg.mode],
                          pipelines=[data.RemoveSymbols('\\'),
                                     data.Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
                                     data.AddSpecialTokensWithTruncation(cfg_data.max_len),
                                     data.TokenIndexing(tokenizer.convert_tokens_to_ids,
                                                        TaskDataset.labels,
                                                        cfg_data.max_len)],
                          n_data=None)
    dataset = TensorDataset(*dataset.get_tensors())  # To Tensors
    data_iter = DataLoader(dataset, batch_size=cfg_optim.batch_size, shuffle=True)

    classifier = models.Classifier4Transformer(cfg_model, len(TaskDataset.labels))
    optimizer = optim.optim4GPU(cfg_optim, classifier)

    train_loop = trainer.TrainLoop(cfg_optim, classifier, data_iter, optimizer,
                                   cfg.save_dir, get_device())

    def get_loss(model, batch, global_step):  # make sure loss is a scalar tensor
        input_ids, segment_ids, input_mask, label_id = batch
        logits = model(input_ids, segment_ids, input_mask)
        loss = nn.CrossEntropyLoss()(logits, label_id)
        return loss

    def evaluate(model, batch):
        input_ids, segment_ids, input_mask, label_id = batch
        logits = model(input_ids, segment_ids, input_mask)
        _, label_pred = logits.max(1)
        result = (label_pred == label_id).float()  # .cpu().numpy()
        accuracy = result.mean()
        return accuracy, result

    if cfg.mode == "train":
        train_loop.train(get_loss, cfg.model_file, cfg.pretrain_file)
        print("Training has been done properly.")
    elif cfg.mode == "eval":
        results = train_loop.eval(evaluate, cfg.model_file)
        total_accuracy = torch.cat(results).mean().item()
        print(f"Accuracy: {total_accuracy}")
def main(train_cfg='config/bert_pretrain.json',
         model_cfg='config/bert_base.json',
         data_file='../tbc/books_large_all.txt',
         model_file=None,
         data_parallel=True,
         vocab='../uncased_L-12_H-768_A-12/vocab.txt',
         save_dir='../exp/bert/pretrain',
         log_dir='../exp/bert/pretrain/runs',
         max_len=512,
         max_pred=20,
         mask_prob=0.15):

    train_cfg = BertTrainConfig.from_json(train_cfg)
    model_cfg = BertModelConfig.from_json(model_cfg)

    set_seeds(train_cfg.seed)

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab, do_lower_case=True)
    tokenize = lambda x: tokenizer.tokenize(tokenizer.convert_to_unicode(x))

    pipeline = [Preprocess4Pretrain(max_pred,
                                    mask_prob,
                                    list(tokenizer.vocab.keys()),
                                    tokenizer.convert_tokens_to_ids,
                                    max_len)]
    data_iter = SentPairDataLoader(data_file,
                                   train_cfg.batch_size,
                                   tokenize,
                                   max_len,
                                   pipeline=pipeline)

    model = BertModel4Pretrain(model_cfg)
    criterion1 = nn.CrossEntropyLoss(reduction='none')
    criterion2 = nn.CrossEntropyLoss()

    optimizer = optim.optim4GPU(train_cfg, model)
    trainer = train.Trainer(train_cfg, model_cfg, model, data_iter, optimizer,
                            save_dir, get_device())

    writer = SummaryWriter(log_dir=log_dir)  # for tensorboardX

    def get_loss(model, batch, global_step, train_cfg, model_cfg):  # make sure loss is a tensor
        input_ids, segment_ids, input_mask, masked_ids, masked_pos, masked_weights, is_next = batch
        logits_lm, logits_clsf = model(input_ids, segment_ids, input_mask, masked_pos)
        loss_lm = criterion1(logits_lm.transpose(1, 2), masked_ids)  # for masked LM
        loss_lm = (loss_lm * masked_weights.float()).mean()
        loss_clsf = criterion2(logits_clsf, is_next)  # for sentence classification
        writer.add_scalars('data/scalar_group',
                           {'loss_lm': loss_lm.item(),
                            'loss_clsf': loss_clsf.item(),
                            'loss_total': (loss_lm + loss_clsf).item(),
                            'lr': optimizer.get_lr()[0]},
                           global_step)
        return loss_lm + loss_clsf

    trainer.train(get_loss, model_file, None, data_parallel)
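# Shape sketch for the masked-LM loss used in get_loss above (toy sizes; B/P/V are
# assumed illustrative values, not taken from the config files). logits_lm is
# [batch, max_pred, vocab]; transpose(1, 2) gives [batch, vocab, max_pred], which is
# the layout nn.CrossEntropyLoss expects for per-position targets of shape [batch, max_pred].
import torch
import torch.nn as nn

B, P, V = 2, 20, 30522                      # batch, max_pred, vocab size (toy values)
logits_lm = torch.randn(B, P, V)            # model output at the masked positions
masked_ids = torch.randint(0, V, (B, P))    # original ids of the masked tokens
masked_weights = torch.ones(B, P)           # 1 for real masked slots, 0 for padding

loss_per_pos = nn.CrossEntropyLoss(reduction='none')(logits_lm.transpose(1, 2), masked_ids)
loss_lm = (loss_per_pos * masked_weights.float()).mean()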
def main(train_cfg='config/electra_pretrain.json',
         model_cfg='config/electra_small.json',
         data_file='../tbc/books_large_all.txt',
         model_file=None,
         data_parallel=True,
         vocab='../uncased_L-12_H-768_A-12/vocab.txt',
         log_dir='../exp/electra/pretrain/runs',
         save_dir='../exp/electra/pretrain',
         max_len=128,
         max_pred=20,
         mask_prob=0.15,
         quantize=False):

    check_dirs_exist([log_dir, save_dir])

    train_cfg = ElectraConfig().from_json_file(train_cfg)
    model_cfg = ElectraConfig().from_json_file(model_cfg)

    set_seeds(train_cfg.seed)

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab, do_lower_case=True)
    tokenize = lambda x: tokenizer.tokenize(tokenizer.convert_to_unicode(x))

    pipeline = [Preprocess4Pretrain(max_pred,
                                    mask_prob,
                                    list(tokenizer.vocab.keys()),
                                    tokenizer.convert_tokens_to_ids,
                                    max_len)]
    data_iter = SentPairDataLoader(data_file,
                                   train_cfg.batch_size,
                                   tokenize,
                                   max_len,
                                   pipeline=pipeline)

    # Get distilled-electra and quantized-distilled-electra
    generator = ElectraForMaskedLM.from_pretrained('google/electra-small-generator')
    t_discriminator = ElectraForPreTraining.from_pretrained('google/electra-base-discriminator')
    s_discriminator = QuantizedElectraForPreTraining(model_cfg) if quantize else ElectraForPreTraining
    s_discriminator = s_discriminator.from_pretrained('google/electra-small-discriminator',
                                                      config=model_cfg)  # config is used for "QuantizedElectraForPreTraining"
    model = DistillElectraForPreTraining(generator, t_discriminator, s_discriminator, model_cfg)

    optimizer = optim.optim4GPU(train_cfg, model)
    writer = SummaryWriter(log_dir=log_dir)  # for tensorboardX
    base_trainer_args = (train_cfg, model_cfg, model, data_iter, None, optimizer,
                         save_dir, get_device())
    trainer = QuantizedDistillElectraTrainer(writer, *base_trainer_args)
    trainer.train(model_file, None, data_parallel)
    trainer._eval()
def main(task='mrpc',
         base_train_cfg='config/QDElectra_pretrain.json',
         train_cfg='config/train_mrpc.json',
         model_cfg='config/QDElectra_base.json',
         data_file='../glue/MRPC/train.tsv',
         model_file=None,
         data_parallel=True,
         vocab='../uncased_L-12_H-768_A-12/vocab.txt',
         log_dir='../exp/electra/pretrain/runs',
         save_dir='../exp/bert/mrpc',
         mode='train',
         pred_distill=True):

    train_cfg_dict = json.load(open(base_train_cfg, "r"))
    train_cfg_dict.update(json.load(open(train_cfg, "r")))
    train_cfg = ElectraConfig().from_dict(train_cfg_dict)
    # train_cfg = ElectraConfig().from_json_file(train_cfg)
    model_cfg = ElectraConfig().from_json_file(model_cfg)
    output_mode, train_cfg.n_epochs, max_len = get_task_params(task)

    set_seeds(train_cfg.seed)

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab, do_lower_case=True)
    TaskDataset = dataset_class(task)  # task dataset class according to the task
    num_labels = len(TaskDataset.labels)
    pipeline = [Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
                AddSpecialTokensWithTruncation(max_len),
                TokenIndexing(tokenizer.convert_tokens_to_ids,
                              TaskDataset.labels, output_mode, max_len)]
    data_set = TaskDataset(data_file, pipeline)
    data_iter = DataLoader(data_set, batch_size=train_cfg.batch_size, shuffle=True)

    t_discriminator = ElectraForSequenceClassification.from_pretrained(
        'google/electra-base-discriminator'
    )
    s_discriminator = QuantizedElectraForSequenceClassification.from_pretrained(
        'google/electra-small-discriminator', config=model_cfg
    )
    model = DistillElectraForSequenceClassification(t_discriminator, s_discriminator, model_cfg)

    optimizer = optim.optim4GPU(train_cfg, model)
    writer = SummaryWriter(log_dir=log_dir)  # for tensorboardX
    base_trainer_args = (train_cfg, model_cfg, model, data_iter, optimizer,
                         save_dir, get_device())
    trainer = QuantizedDistillElectraTrainer(writer, *base_trainer_args)

    if mode == 'train':
        trainer.train(model_file, None, data_parallel)
    elif mode == 'eval':
        input_ids, attention_mask, token_type_ids, label_ids = TokenIndexing(
            tokenizer.convert_tokens_to_ids, TaskDataset.labels, output_mode, max_len)
        _, eval_labels = get_tensor_data(output_mode, input_ids, attention_mask,
                                         token_type_ids, label_ids)
        results = trainer.eval(model_file, output_mode, eval_labels, num_labels, data_parallel)
        total_accuracy = torch.cat(results).mean().item()
        print('Accuracy:', total_accuracy)
def main(args):
    cfg = train.Config.from_json(args.train_cfg)
    model_cfg = models.Config.from_json(args.model_cfg)

    set_seeds(cfg.seed)

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab, do_lower_case=True)
    tokenize = lambda x: tokenizer.tokenize(tokenizer.convert_to_unicode(x))

    pipeline = [Preprocess4Pretrain(args.max_pred,
                                    args.mask_prob,
                                    list(tokenizer.vocab.keys()),
                                    tokenizer.convert_tokens_to_ids,
                                    model_cfg.max_len,
                                    args.mask_alpha,
                                    args.mask_beta,
                                    args.max_gram)]
    data_iter = SentPairDataLoader(args.data_file,
                                   cfg.batch_size,
                                   tokenize,
                                   model_cfg.max_len,
                                   pipeline=pipeline)

    model = BertModel4Pretrain(model_cfg)
    criterion1 = nn.CrossEntropyLoss(reduction='none')
    criterion2 = nn.CrossEntropyLoss()

    optimizer = optim.optim4GPU(cfg, model)
    trainer = train.Trainer(cfg, model, data_iter, optimizer, args.save_dir, get_device())
    writer = SummaryWriter(log_dir=args.log_dir)  # for tensorboardX

    def get_loss(model, batch, global_step):  # make sure loss is a tensor
        input_ids, segment_ids, input_mask, masked_ids, masked_pos, masked_weights, is_next = batch
        logits_lm, logits_clsf = model(input_ids, segment_ids, input_mask, masked_pos)
        loss_lm = criterion1(logits_lm.transpose(1, 2), masked_ids)  # for masked LM
        loss_lm = (loss_lm * masked_weights.float()).mean()
        loss_sop = criterion2(logits_clsf, is_next)  # for sentence classification
        writer.add_scalars('data/scalar_group',
                           {'loss_lm': loss_lm.item(),
                            'loss_sop': loss_sop.item(),
                            'loss_total': (loss_lm + loss_sop).item(),
                            'lr': optimizer.get_lr()[0]},
                           global_step)
        return loss_lm + loss_sop

    trainer.train(get_loss, model_file=None, data_parallel=True)
def main(args):
    cfg = train.Config.from_json(args.train_cfg)
    model_cfg = models.Config.from_json(args.model_cfg)

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab, do_lower_case=True)
    tokenize = lambda x: tokenizer.tokenize(tokenizer.convert_to_unicode(x))

    pipeline = [Preprocess4Pretrain(args)]
    data_iter = SentPairDataLoader(args.data_file,
                                   cfg.batch_size,
                                   tokenize,
                                   model_cfg.max_len,
                                   pipeline=pipeline)

    model = BertModel4Pretrain(model_cfg)
    criterion1 = nn.CrossEntropyLoss(reduction='none')
    criterion2 = nn.CrossEntropyLoss()

    optimizer = optim.optim4GPU(cfg, model)
    trainer = train.Trainer(cfg, model, data_iter, optimizer, args.save_dir, get_device())
    writer = SummaryWriter(log_dir=args.log_dir)  # for tensorboardX (assumed; get_loss below logs through it)

    def get_loss(model, batch, global_step):  # make sure loss is a tensor
        input_ids, segment_ids, input_mask, masked_ids, masked_pos, masked_weights, is_next = batch
        # input_ids      : ids of the whole (masked) input sequence
        # segment_ids    : segment ids ([0, 1]) separating the two sentences
        # input_mask     : marks the ids actually used (excludes zero-padding)
        # masked_ids     : original ids of the masked tokens (zero-padded)
        # masked_pos     : positions of the masked tokens
        # masked_weights : array with a 1 for every masked token
        # is_next        : boolean label created when the instance was built
        logits_lm, logits_clsf = model(input_ids, segment_ids, input_mask, masked_pos)
        # logits_lm   : [B, mS, V]
        # logits_clsf : [B, 1, 2]
        loss_lm = criterion1(logits_lm.transpose(1, 2), masked_ids)  # for masked LM
        loss_lm = (loss_lm * masked_weights.float()).mean()
        loss_sop = criterion2(logits_clsf, is_next)  # for sentence classification
        writer.add_scalars('data/scalar_group',
                           {'loss_lm': loss_lm.item(),
                            'loss_sop': loss_sop.item(),
                            'loss_total': (loss_lm + loss_sop).item(),
                            'lr': optimizer.get_lr()[0]},
                           global_step)
        return loss_lm + loss_sop

    trainer.train(get_loss, model_file=None, data_parallel=True)
def train(self, model_file, pretrain_file, get_loss_CNN, get_loss_Attn_LSTM, evalute_CNN_SSL,
          pseudo_labeling, evalute_Attn_LSTM, evalute_CNN, evalute_Attn_LSTM_SSL,
          generating_lexiocn, data_parallel=False):
    """ Train Loop """
    self.model.train()  # train mode
    self.load3(model_file, pretrain_file)
    self.model2.train()  # train mode
    model = self.model.to(self.device)
    model2 = self.model2.to(self.device)
    t = self.kkk

    if self.dataName == 'IMDB':
        rnn_save_name = "./IMDB_model_save/checkpoint_RNN" + str(t) + ".pt"
        cnn_save_name = "./IMDB_model_save/checkpoint_CNN" + str(t) + ".pt"
        result_name = "./result/result_IMDB.txt"
        pseudo_name = "./result/pseudo_train_set_IMDB.txt"
    elif self.dataName == "AGNews":
        rnn_save_name = "./AGNews_model_save/checkpoint_RNN" + str(t) + ".pt"
        cnn_save_name = "./AGNews_model_save/checkpoint_CNN" + str(t) + ".pt"
        result_name = "./result/result_AGNews.txt"
        pseudo_name = "./result/pseudo_train_set_AGNews.txt"
    elif self.dataName == "DBpedia":
        rnn_save_name = "./DBpedia_model_save/checkpoint_RNN" + str(t) + ".pt"
        cnn_save_name = "./DBpedia_model_save/checkpoint_CNN" + str(t) + ".pt"
        result_name = "./result/result_DBpedia.txt"
        pseudo_name = "./result/pseudo_train_set_DBpedia.txt"
    elif self.dataName == "yahoo":
        rnn_save_name = "./yahoo_model_save/checkpoint_RNN" + str(t) + ".pt"
        cnn_save_name = "./yahoo_model_save/checkpoint_CNN" + str(t) + ".pt"
        result_name = "./result/result_yahoo.txt"
        pseudo_name = "./result/pseudo_train_set_yahoo.txt"

    num_a = 0
    global_step = 0  # global iteration steps regardless of epochs
    global_step3 = 0
    before = -50
    curTemp = 0
    print("self.cfg.n_epochs#:", self.cfg.n_epochs)

    ddf = open(result_name, 'a', encoding='UTF8')
    ddf.write("############################################" + str(t) + ": random_sampling###########################################" + '\n')
    ddf.close()
    ddf = open(pseudo_name, 'a', encoding='UTF8')
    ddf.write("############################################" + str(t) + ": random_sampling###########################################" + '\n')
    ddf.close()

    for e in range(self.cfg.n_epochs):
        if e == 0:
            temp = 987654321
            early_stopping = EarlyStopping(patience=10, verbose=True)
            valid_losses = []
            while 1:
                self.optimizer = optim.optim4GPU(self.cfg, model, len(self.data_iter3_b))
                global_step = 0  # global iteration steps regardless of epochs
                global_step3 = 0
                loss_sum = 0.  # the sum of iteration losses to get average loss in every epoch
                iter_bar = tqdm(self.data_iter3_b, desc='Iter (loss=X.XXX)')
                model.train()
                for i, batch in enumerate(iter_bar):
                    batch = [t.to(self.device) for t in batch]
                    self.optimizer.zero_grad()
                    loss = get_loss_CNN(model, batch, global_step).mean()  # mean() for Data Parallelism
                    loss.backward()
                    self.optimizer.step()
                    global_step += 1
                    loss_sum += loss.item()
                    iter_bar.set_description('Iter (loss=%5.3f)' % loss.item())
                print('Epoch %d/%d : Average Loss %5.3f' % (e + 1, self.cfg.n_epochs, loss_sum / (i + 1)))

                model.eval()  # evaluation mode
                loss_sum = 0.
                global_step3 = 0
                iter_bar_dev = tqdm(self.dataset_dev_b, desc='Iter (loss=X.XXX)')
                self.optimizer = optim.optim4GPU(self.cfg, model, len(self.dataset_dev_b))
                for i, batch in enumerate(iter_bar_dev):
                    batch = [t.to(self.device) for t in batch]
                    loss = get_loss_CNN(model, batch, global_step3).mean()  # mean() for Data Parallelism
                    valid_losses.append(loss.item())
                    global_step3 += 1
                    loss_sum += loss.item()
                    iter_bar_dev.set_description('Iter (loss=%5.3f)' % loss.item())
                print('Epoch %d/%d : Average Loss %5.3f' % (e + 1, self.cfg.n_epochs, loss_sum / (i + 1)))

                valid_loss = np.average(valid_losses)
                loss_min = early_stopping(valid_loss, model, "./model_save/checkpoint_BERT_real.pt")
                valid_losses = []
                if early_stopping.early_stop:
                    print("Early stopping")
                    break

            model.load_state_dict(torch.load("./model_save/checkpoint_BERT_real.pt"))
            print("Early stopping")
            model.eval()  # evaluation mode
            p = []
            l = []
            p3 = []
            p2 = []
            iter_bar = tqdm(self.data_iter2_b, desc='Iter (f1-score=X.XXX)')
            for batch in iter_bar:
                batch = [t.to(self.device) for t in batch]
                with torch.no_grad():  # evaluation without gradient calculation
                    label_id, y_pred1 = evalute_CNN(model, batch)  # accuracy to print
                    softmax = nn.Softmax()
                    y_pred3 = softmax(y_pred1)
                    #print("y_pred3#:", y_pred3)
                    y_pred33, y_pred1 = torch.max(y_pred3, 1)
                    print(y_pred1)
                    p2.append(np.ndarray.flatten(y_pred3[:, 1].data.cpu().numpy()))
                    p.append(np.ndarray.flatten(y_pred1.data.cpu().numpy()))
                    l.append(np.ndarray.flatten(label_id.data.cpu().numpy()))
                result2 = 0
                iter_bar.set_description('Iter(roc=%5.3f)' % result2)

            p2 = [item for sublist in p2 for item in sublist]
            p = [item for sublist in p for item in sublist]
            l = [item for sublist in l for item in sublist]
            p = np.array(p)
            l = np.array(l)
            F1score = f1_score(l, p, average='micro')
            accur = accuracy_score(l, p)
            ddf = open(result_name, 'a', encoding='UTF8')
            ddf.write(str(t) + ": " + str(num_a) + "aucr: " + str(accur) + "f1-score: " + str(F1score) + '\n')
            ddf.close()
            num_a += 1

            temp = 987654321
            early_stopping = EarlyStopping(patience=30, verbose=True)
            valid_losses = []
            while 1:
                model2.train()
                loss_sum = 0
                global_step3 = 0
                iter_bar3 = tqdm(self.data_iter3, desc='Iter (loss=X.XXX)')
                for i, batch in enumerate(iter_bar3):
                    batch = [t.to(self.device) for t in batch]
                    loss = get_loss_Attn_LSTM(model2, batch, global_step3).mean()  # mean() for Data Parallelism
                    self.optimizer2.zero_grad()
                    loss.backward()
                    self.optimizer2.step()
                    global_step3 += 1
                    loss_sum += loss.item()
                    iter_bar3.set_description('Iter (loss=%5.3f)' % loss.item())
                    if global_step3 % self.cfg.save_steps == 0:  # save
                        self.save(global_step3)
                    if self.cfg.total_steps and self.cfg.total_steps < global_step3:
                        print('Epoch %d/%d : Average Loss %5.3f' % (e + 1, self.cfg.n_epochs, loss_sum / (i + 1)))
                        print('The Total Steps have been reached.')
                        self.save(global_step3)  # save and finish when global_steps reach total_steps
                        return
                print('Epoch %d/%d : Average Loss %5.3f' % (e + 1, self.cfg.n_epochs, loss_sum / (i + 1)))

                model2.eval()
                loss_sum = 0.
                global_step3 = 0
                iter_bar_dev = tqdm(self.dataset_dev, desc='Iter (loss=X.XXX)')
                for i, batch in enumerate(iter_bar_dev):
                    batch = [t.to(self.device) for t in batch]
                    loss = get_loss_Attn_LSTM(model2, batch, global_step3).mean()  # mean() for Data Parallelism
                    valid_losses.append(loss.item())
                    global_step3 += 1
                    loss_sum += loss.item()
                    iter_bar_dev.set_description('Iter (loss=%5.3f)' % loss.item())
                    if global_step3 % self.cfg.save_steps == 0:  # save
                        self.save(global_step3)
                    if self.cfg.total_steps and self.cfg.total_steps < global_step3:
                        print('Epoch %d/%d : Average Loss %5.3f' % (e + 1, self.cfg.n_epochs, loss_sum / (i + 1)))
                        print('The Total Steps have been reached.')
                        self.save(global_step3)  # save and finish when global_steps reach total_steps
                        return
                print('Epoch %d/%d : Average Loss %5.3f' % (e + 1, self.cfg.n_epochs, loss_sum / (i + 1)))

                valid_loss = np.average(valid_losses)
                loss_min = early_stopping(valid_loss, model2, "./model_save/checkpoint_LSTM_real.pt")
                valid_losses = []
                if early_stopping.early_stop:
                    print("Early stopping")
                    break

            model2.eval()
            p = []
            l = []
            p3 = []
            iter_bar4 = tqdm(self.data_iter2, desc='Iter (f1-score=X.XXX)')
            global_step3 = 0
            for batch in iter_bar4:
                batch = [t.to(self.device) for t in batch]
                with torch.no_grad():  # evaluation without gradient calculation
                    label_id, y_pred1 = evalute_Attn_LSTM(model2, batch, global_step3, len(iter_bar4))  # accuracy to print
                    _, y_pred3 = y_pred1.max(1)
                    global_step3 += 1
                p2 = []
                l2 = []
                for i in range(0, len(y_pred3)):
                    p3.append(np.ndarray.flatten(y_pred3[i].data.cpu().numpy()))
                    l.append(np.ndarray.flatten(label_id[i].data.cpu().numpy()))
                    p2.append(np.ndarray.flatten(y_pred3[i].data.cpu().numpy()))
                    l2.append(np.ndarray.flatten(label_id[i].data.cpu().numpy()))
                p2 = [item for sublist in p2 for item in sublist]
                l2 = [item for sublist in l2 for item in sublist]
                result2 = f1_score(l2, p2, average='micro')
                iter_bar4.set_description('Iter(roc=%5.3f)' % result2)

            p3 = [item for sublist in p3 for item in sublist]
            l = [item for sublist in l for item in sublist]
            p = np.array(p)
            l = np.array(l)
            results2 = accuracy_score(l, p3)
            F1score = f1_score(l, p3, average='micro')
            ddf = open(result_name, 'a', encoding='UTF8')
            ddf.write(str(t) + ": " + str(num_a) + "aucr: " + str(results2) + "f1-score: " + str(F1score) + '\n')
            ddf.close()
            num_a += 1

        elif e % 2 == 1:
            global_step1 = 0
            model2.eval()
            labell = []
            iter_bar = tqdm(self.data_iter, desc='Iter (loss=X.XXX)')
            for batch in iter_bar:
                batch = [t.to(self.device) for t in batch]
                with torch.no_grad():  # evaluation without gradient calculation
                    label_id, y_pred1 = generating_lexiocn(model2, batch, global_step1, len(iter_bar), e)  # accuracy to print
                    global_step1 += 1

            global_step1 = 0
            model.eval()
            labell = []
            iter_bar = tqdm(self.data_iter_b, desc='Iter (loss=X.XXX)')
            for batch in iter_bar:
                batch = [t.to(self.device) for t in batch]
                with torch.no_grad():  # evaluation without gradient calculation
                    label_id, y_pred1 = evalute_CNN_SSL(model, batch, global_step1)  # accuracy to print
                    global_step1 += 1

            global_step1 = 0
            model2.eval()
            sen = []
            labell = []
            iter_bar = tqdm(self.data_iter, desc='Iter (loss=X.XXX)')
            for batch in iter_bar:
                batch = [t.to(self.device) for t in batch]
                with torch.no_grad():  # evaluation without gradient calculation
                    label_id, y_pred1, result_label, result3, data_temp, data_temp_b, data_iter_temp_na, data_iter_temp_na_b = pseudo_labeling(
                        model2, batch, global_step1, len(iter_bar), e)  # accuracy to print
                    global_step1 += 1

            self.data_iter_temp = data_temp
            self.data_iter_temp_b = data_temp_b
            self.data_iter = data_iter_temp_na
            self.data_iter_b = data_iter_temp_na_b
            #print(result3)

            num_good = 0
            num_label = 0
            num_label1 = 0
            ddf = open(pseudo_name, 'a', encoding='UTF8')
            for i in range(0, len(result3)):
                sen.append(result3[i])
            num_label = 0
            num_label1 = 0
            num_good = 0
            for i in range(0, len(result3)):
                if result3[i] != -1:
                    num_good += 1
                    if result3[i] == result_label[i]:
                        num_label += 1
            ddf.write(str(t) + " " + "number of good :" + str(num_good) + " ")
            ddf.write("number of label :" + str(num_label) + " ")
            ddf.write("\n")
            ddf.close()
            print("num_good#:", num_good)
            print("before#:", before)

            if num_good < self.stopNum:
                curTemp += 1
            else:
                curTemp = 0
            if curTemp >= 2:
                break

        elif e % 2 == 0:
            self.model.train()  # train mode
            self.load3(model_file, pretrain_file)
            model = self.model.to(self.device)
            b = 0
            early_stopping = EarlyStopping(patience=1, verbose=True)
            valid_losses = []
            bb = 987654321
            while 1:
                self.optimizer = optim.optim4GPU(self.cfg, model, len(self.data_iter_temp_b))
                iter_bar = tqdm(self.data_iter_temp_b, desc='Iter (loss=X.XXX)')
                model.train()
                global_step = 0
                global_step3 = 0
                valid_losses2 = []
                for i, batch in enumerate(iter_bar):
                    batch = [t.to(self.device) for t in batch]
                    self.optimizer.zero_grad()
                    loss = get_loss_CNN(model, batch, global_step).mean()  # mean() for Data Parallelism
                    valid_losses2.append(loss.item())
                    loss.backward()
                    self.optimizer.step()
                    global_step += 1
                    loss_sum += loss.item()
                    iter_bar.set_description('Iter (loss=%5.3f)' % loss.item())
                print('Epoch %d/%d : Average Loss %5.3f' % (e + 1, self.cfg.n_epochs, loss_sum / (i + 1)))
                valid_loss2 = np.average(valid_losses2)
                bb = min(bb, valid_loss2.item())
                valid_losses2 = []

                model.eval()  # evaluation mode
                loss_sum = 0.
                global_step3 = 0
                iter_bar_dev = tqdm(self.dataset_dev_b, desc='Iter (loss=X.XXX)')
                self.optimizer = optim.optim4GPU(self.cfg, model, len(self.dataset_dev_b))
                for i, batch in enumerate(iter_bar_dev):
                    batch = [t.to(self.device) for t in batch]
                    loss = get_loss_CNN(model, batch, global_step3).mean()  # mean() for Data Parallelism
                    valid_losses.append(loss.item())
                    global_step3 += 1
                    loss_sum += loss.item()
                    iter_bar_dev.set_description('Iter (loss=%5.3f)' % loss.item())
                print('Epoch %d/%d : Average Loss %5.3f' % (e + 1, self.cfg.n_epochs, loss_sum / (i + 1)))

                valid_loss = np.average(valid_losses)
                loss_min = early_stopping(valid_loss, model, cnn_save_name)
                valid_losses = []
                if early_stopping.early_stop:
                    print("Early stopping")
                    break

            model.load_state_dict(torch.load(cnn_save_name))
            model.eval()  # evaluation mode
            self.model.eval()  # evaluation mode
            p = []
            l = []
            p3 = []
            p2 = []
            iter_bar = tqdm(self.data_iter2_b, desc='Iter (f1-score=X.XXX)')
            for batch in iter_bar:
                batch = [t.to(self.device) for t in batch]
                with torch.no_grad():  # evaluation without gradient calculation
                    label_id, y_pred1 = evalute_CNN(model, batch)  # accuracy to print
                    softmax = nn.Softmax()
                    y_pred3 = softmax(y_pred1)
                    y_pred33, y_pred1 = torch.max(y_pred3, 1)
                    p2.append(np.ndarray.flatten(y_pred3[:, 1].data.cpu().numpy()))
                    p.append(np.ndarray.flatten(y_pred1.data.cpu().numpy()))
                    l.append(np.ndarray.flatten(label_id.data.cpu().numpy()))
                result2 = 0
                iter_bar.set_description('Iter(roc=%5.3f)' % result2)

            p2 = [item for sublist in p2 for item in sublist]
            p = [item for sublist in p for item in sublist]
            l = [item for sublist in l for item in sublist]
            p = np.array(p)
            l = np.array(l)
            F1score = f1_score(l, p, average='micro')
            accur = accuracy_score(l, p)
            ddf = open(result_name, 'a', encoding='UTF8')
            ddf.write(str(t) + ": " + str(num_a) + "aucr: " + str(accur) + "f1-score: " + str(F1score) + '\n')
            ddf.close()
            num_a += 1

            valid_losses = []
            temp = 987654321
            early_stopping = EarlyStopping(patience=10, verbose=True)
            while 1:
                model2.train()
                l = 0
                l_sum = 0
                loss_sum = 0
                global_step3 = 0
                iter_bar3 = tqdm(self.data_iter_temp, desc='Iter (loss=X.XXX)')
                for i, batch in enumerate(iter_bar3):
                    batch = [t.to(self.device) for t in batch]
                    loss = get_loss_Attn_LSTM(model2, batch, global_step3).mean()  # mean() for Data Parallelism
                    self.optimizer2.zero_grad()
                    loss.backward()
                    self.optimizer2.step()
                    global_step3 += 1
                    loss_sum += loss.item()
                    iter_bar3.set_description('Iter (loss=%5.3f)' % loss.item())
                print('Epoch %d/%d : Average Loss %5.3f' % (e + 1, self.cfg.n_epochs, loss_sum / (i + 1)))

                model2.eval()
                loss_sum = 0.
                global_step3 = 0
                iter_bar_dev = tqdm(self.dataset_dev, desc='Iter (loss=X.XXX)')
                for i, batch in enumerate(iter_bar_dev):
                    batch = [t.to(self.device) for t in batch]
                    loss = get_loss_Attn_LSTM(model2, batch, global_step3).mean()  # mean() for Data Parallelism
                    valid_losses.append(loss.item())
                    global_step3 += 1
                    loss_sum += loss.item()
                    iter_bar_dev.set_description('Iter (loss=%5.3f)' % loss.item())
                print('Epoch %d/%d : Average Loss %5.3f' % (e + 1, self.cfg.n_epochs, loss_sum / (i + 1)))

                valid_loss = np.average(valid_losses)
                loss_min = early_stopping(valid_loss, model2, rnn_save_name)
                valid_losses = []
                if early_stopping.early_stop:
                    print("Early stopping")
                    break

            model2.load_state_dict(torch.load(rnn_save_name))
            model2.eval()
            p = []
            l = []
            p3 = []
            iter_bar4 = tqdm(self.data_iter2, desc='Iter (f1-score=X.XXX)')
            for batch in iter_bar4:
                batch = [t.to(self.device) for t in batch]
                with torch.no_grad():
                    label_id, y_pred1 = evalute_Attn_LSTM_SSL(model2, batch)
                    _, y_pred3 = y_pred1.max(1)
                p2 = []
                l2 = []
                for i in range(0, len(y_pred3)):
                    p3.append(np.ndarray.flatten(y_pred3[i].data.cpu().numpy()))
                    l.append(np.ndarray.flatten(label_id[i].data.cpu().numpy()))
                    p2.append(np.ndarray.flatten(y_pred3[i].data.cpu().numpy()))
                    l2.append(np.ndarray.flatten(label_id[i].data.cpu().numpy()))
                p2 = [item for sublist in p2 for item in sublist]
                l2 = [item for sublist in l2 for item in sublist]
                result2 = f1_score(l2, p2, average='micro')
                iter_bar4.set_description('Iter(roc=%5.3f)' % result2)

            p3 = [item for sublist in p3 for item in sublist]
            l = [item for sublist in l for item in sublist]
            p = np.array(p)
            l = np.array(l)
            results2 = accuracy_score(l, p3)
            F1score = f1_score(l, p3, average='micro')
            ddf = open(result_name, 'a', encoding='UTF8')
            ddf.write(str(t) + ": " + str(num_a) + "aucr: " + str(results2) + "f1-score: " + str(F1score) + '\n')
            ddf.close()
            num_a += 1
def main(task='mrpc',
         train_cfg='./model/config/train_mrpc.json',
         model_cfg='./model/config/bert_base.json',
         data_train_file='total_data/imdbtrain.tsv',
         data_test_file='total_data/IMDB_test.tsv',
         model_file=None,
         pretrain_file='./model/uncased_L-12_H-768_A-12/bert_model.ckpt',
         data_parallel=False,
         vocab='./model/uncased_L-12_H-768_A-12/vocab.txt',
         dataName='IMDB',
         stopNum=250,
         max_len=300,
         mode='train'):

    if mode == 'train':
        def get_loss_CNN(model, batch, global_step):  # make sure loss is a scalar tensor
            input_ids, segment_ids, input_mask, label_id, seq_lengths = batch
            logits = model(input_ids, segment_ids, input_mask)
            loss = criterion(logits, label_id)
            return loss

        def evalute_CNN(model, batch):
            input_ids, segment_ids, input_mask, label_id, seq_lengths = batch
            logits = model(input_ids, segment_ids, input_mask)
            return label_id, logits

        def get_loss_Attn_LSTM(model, batch, global_step):  # make sure loss is a scalar tensor
            input_ids, segment_ids, input_mask, label_id, seq_lengths = batch
            seq_lengths, perm_idx = seq_lengths.sort(0, descending=True)
            input_ids = input_ids[perm_idx]
            label_id = label_id[perm_idx]
            token1 = embedding(input_ids.long())
            logits, attention_score = model(token1.cuda(), input_ids, segment_ids, input_mask, seq_lengths)
            loss1 = criterion(logits, label_id)
            return loss1

        def evalute_Attn_LSTM(model, batch, global_step, ls):
            input_ids, segment_ids, input_mask, label_id, seq_lengths = batch
            seq_lengths, perm_idx = seq_lengths.sort(0, descending=True)
            input_ids = input_ids[perm_idx]
            label_id = label_id[perm_idx]
            token1 = embedding(input_ids.long())
            logits, attention_score = model(token1.cuda(), input_ids, segment_ids, input_mask, seq_lengths)
            logits = F.softmax(logits)
            y_pred11, y_pred1 = logits.max(1)
            return label_id, logits

        def generating_lexiocn(model2, batch, global_step, ls, e):
            if global_step == 0:
                result3.clear()
                result_label.clear()
                bb_11.clear()
                bb_22.clear()
            input_ids, segment_ids, input_mask, label_id, seq_lengths = batch
            seq_lengths, perm_idx = seq_lengths.sort(0, descending=True)
            input_ids = input_ids[perm_idx]
            label_id = label_id[perm_idx]
            token1 = embedding(input_ids.long())
            #logits = model(input_ids, segment_ids, input_mask)
            logits2, attention_score2 = model2(token1.cuda(), input_ids, segment_ids, input_mask, seq_lengths)
            #logits = F.softmax(logits)
            logits = F.softmax(logits2)
            # y_pred11, y_pred1 = logits.max(1)
            y_pred22, y_pred2 = logits2.max(1)
            atten, attn_s1 = attention_score2.max(1)
            atte2, attn_s2 = torch.topk(attention_score2, 4)

            # Function words / uninformative tokens skipped when building the lexicon
            # (same members as the original long equality chain, written as a set).
            skip_words = {
                '', ' ', ',', '.', 'from', 'are', 'is', 'and', 'with', 'may', 'would', 'could',
                'have', 'has', 'had', 'was', 'were', 'this', 'who', 'that', 'www', 'http', 'com',
                'those', 'your', 'not', 'seem', 'too', 'lol', 'but', 'these', 'their', 'can',
                'there', 'gave', 'his', 'etc', 'thats', 'though', 'off', 'she', 'them', 'huh',
                'why', 'wont', 'any', 'some', 'its', 'yeah', 'yes', 'you', 'should', 'dont',
                'anybody', 'than', 'where', 'for', 'more', 'will', 'him', 'wii', 'having', 'just',
                'help', 'helps', 'all', 'they', 'take', 'the', 'what', 'need', 'make', 'about',
                'then', 'when', 'does', 'ask', 'much', 'man', 'know', 'how', 'look', 'like',
                'one', 'think', 'tell', 'find', 'cant', 'now', 'try', 'give', 'answer', 'her',
                'out', 'get', 'because', 'myself', 'wants', 'movie', 'film', 'films'
            }

            for i in range(0, len(input_ids)):
                split_tokens = []
                att_index = []
                for token in tokenizer.tokenize(data0[global_step * 64 + perm_idx[i]]):
                    split_tokens.append(token)
                if len(split_tokens) <= attn_s1[i].item():
                    attn_index3 = attention_score2[i][:len(split_tokens) - 1]
                    attn_num, attn_index2 = attn_index3.max(0)
                    attn_index = attn_index2.item()
                else:
                    for j in range(0, 4):
                        att_index.append(attn_s2[i][j].item())

                tok = []
                if atten[i].item() <= 0:
                    token_ab = split_tokens[0]
                else:
                    for j in range(0, len(att_index)):
                        if att_index[j] >= len(split_tokens):
                            continue
                        tok.append(split_tokens[att_index[j]])

                token_temp = data0[global_step * 64 + perm_idx[i]].split(' ')
                token2 = []
                for kk in range(0, len(tok)):
                    token_ab = tok[kk]
                    token_ab = token_ab.replace(".", "")
                    token_ab = token_ab.replace(",", "")
                    token_ab = token_ab.replace("'", "")
                    token_ab = token_ab.replace("!", "")
                    token_ab = token_ab.replace("?", "")
                    token_ab = token_ab.replace("'", "")
                    token_ab = token_ab.replace('"', "")
                    if token_ab in skip_words:
                        continue
                    if len(token_ab) < 2:
                        continue
                    for gge, input_word in enumerate(token_temp):
                        if token_ab.lower() in input_word.lower():
                            input_word = input_word.replace(".", "")
                            input_word = input_word.replace(",", "")
                            input_word = input_word.replace("'", "")
                            input_word = input_word.replace("!", "")
                            input_word = input_word.replace("?", "")
                            input_word = input_word.replace("'", "")
                            input_word = input_word.replace('"', "")
                            token2.append(input_word.lower())
                            break

                token2 = list(set(token2))
                if len(token2) < 3:
                    continue
                #print(token2)
                sen = ""
                for l in range(0, len(token2) - 1):
                    sen += token2[l] + ' '
                sen += token2[len(token2) - 1]

                if y_pred2[i] == 0:
                    try:
                        bb_11[sen] += y_pred22[i]
                    except KeyError:
                        bb_11[sen] = y_pred22[i]
                if y_pred2[i] == 1:
                    try:
                        bb_22[sen] += y_pred22[i]
                    except KeyError:
                        bb_22[sen] = y_pred22[i]

            if global_step == ls - 1:
                abusive_11.clear()
                abusive_22.clear()
                bb_11_up = sorted(bb_11.items(), key=lambda x: x[1], reverse=True)
                bb_22_up = sorted(bb_22.items(), key=lambda x: x[1], reverse=True)
                lexicon_size = 50
                bb_11_up = bb_11_up[:lexicon_size]
                bb_22_up = bb_22_up[:lexicon_size]

                for i in bb_11_up:
                    flag = 0
                    for j in bb_22_up:
                        if (i[0].lower() in j[0].lower()) or (j[0].lower() in i[0].lower()):
                            if i[1] < j[1]:
                                flag = 1
                                break
                    if flag == 0:
                        abusive_11.append(i[0])

                for i in bb_22_up:
                    flag = 0
                    for j in bb_11_up:
                        if (i[0].lower() in j[0].lower()) or (j[0].lower() in i[0].lower()):
                            if i[1] < j[1]:
                                flag = 1
                                break
                    if flag == 0:
                        abusive_22.append(i[0])

                ddf = open("./IMDB_Lexicon/imdbLexicon_1.txt", 'w', encoding='UTF8')
                for i in range(0, len(abusive_11)):
                    ddf.write(abusive_11[i] + '\n')
                ddf.close()
                ddf = open("./IMDB_Lexicon/imdbLexicon_2.txt", 'w', encoding='UTF8')
                for i in range(0, len(abusive_22)):
                    ddf.write(abusive_22[i] + '\n')
                ddf.close()

            return label_id, logits

        def evalute_CNN_SSL(model, batch, global_step):
            if global_step == 0:
                result5.clear()
            input_ids, segment_ids, input_mask, label_id, seq_lengths = batch
            logits = model(input_ids, segment_ids, input_mask)
            logits = F.softmax(logits)
            y_pred11, y_pred1 = logits.max(1)
            for i in range(0, len(input_ids)):
                result5.append([y_pred1[i].item(), y_pred11[i].item()])
            return label_id, logits

        def pseudo_labeling(model2, batch, global_step, ls, e):
            if global_step == 0:
                result3.clear()
                result4.clear()
                label_0.clear()
                label_1.clear()
                result_label.clear()
                abusive_11.clear()
                abusive_22.clear()
                abusive_dic_file = open("./IMDB_Lexicon/imdbLexicon_1.txt", 'r', encoding='UTF8')
                for line in abusive_dic_file.read().split('\n'):
                    if len(line) <= 3:
                        continue
                    abusive_11.append(line)
                abusive_dic_file.close()
                abusive_dic_file = open("./IMDB_Lexicon/imdbLexicon_2.txt", 'r', encoding='UTF8')
                for line in abusive_dic_file.read().split('\n'):
                    if len(line) <= 3:
                        continue
                    abusive_22.append(line)
                abusive_dic_file.close()

            input_ids, segment_ids, input_mask, label_id, seq_lengths = batch
            seq_lengths, perm_idx = seq_lengths.sort(0, descending=True)
            input_ids = input_ids[perm_idx]
            label_id = label_id[perm_idx]
            token1 = embedding(input_ids.long())
            logits2, attention_score2 = model2(token1.cuda(), input_ids, segment_ids, input_mask, seq_lengths)
            logits2 = F.softmax(logits2)
            y_pred22, y_pred2 = logits2.max(1)
            label_id2 = []

            for i in range(0, len(input_ids)):
                input_sentence = data0[global_step * 64 + perm_idx[i]]
                input_sentence = re.sub("[!@#$%^&*().?\"~/<>:;'{}]", "", input_sentence)
                matching_word1 = 3
                matching_word2 = 4

                abusive_word_list_neg11 = list()
                abusive_word_list_neg11 += matching_blacklist2(abusive_11, input_sentence, matching_word1)
                abusive_word_list_neg11 = list((set(abusive_word_list_neg11)))
                abusive_word_list_neg22 = list()
                abusive_word_list_neg22 += matching_blacklist2(abusive_22, input_sentence, matching_word1)
                abusive_word_list_neg22 = list((set(abusive_word_list_neg22)))
                abusive_word_list_neg111 = list()
                abusive_word_list_neg111 += matching_blacklist2(abusive_11, input_sentence, matching_word2)
                abusive_word_list_neg111 = list((set(abusive_word_list_neg111)))
                abusive_word_list_neg222 = list()
                abusive_word_list_neg222 += matching_blacklist2(abusive_22, input_sentence, matching_word2)
                abusive_word_list_neg222 = list((set(abusive_word_list_neg222)))

                a = max(len(abusive_word_list_neg11), len(abusive_word_list_neg22))
                aa = max(len(abusive_word_list_neg111), len(abusive_word_list_neg222))

                if ((len(abusive_word_list_neg11) > len(abusive_word_list_neg22)
                     and result5[global_step * 64 + perm_idx[i]][0] == 0
                     and result5[global_step * 64 + perm_idx[i]][1] >= 0.9)
                        or (len(abusive_word_list_neg11) > len(abusive_word_list_neg22)
                            and y_pred2[i].item() == 0 and y_pred22[i].item() >= 0.9)):
                    label_0.append(0)
                    result4.append([global_step * 64 + perm_idx[i], 0,
                                    data0[global_step * 64 + perm_idx[i]],
                                    label_id[perm_idx[i]].item()])
                elif ((len(abusive_word_list_neg11) < len(abusive_word_list_neg22)
                       and result5[global_step * 64 + perm_idx[i]][0] == 1
                       and result5[global_step * 64 + perm_idx[i]][1] >= 0.9)
                        or (len(abusive_word_list_neg11) < len(abusive_word_list_neg22)
                            and y_pred2[i].item() == 1 and y_pred22[i].item() >= 0.9)):
                    label_1.append(1)
                    result4.append([global_step * 64 + perm_idx[i], 1,
                                    data0[global_step * 64 + perm_idx[i]],
                                    label_id[perm_idx[i]].item()])
                elif aa >= 1 and len(abusive_word_list_neg111) > len(abusive_word_list_neg222):
                    label_0.append(0)
                    result4.append([global_step * 64 + perm_idx[i], 0,
                                    data0[global_step * 64 + perm_idx[i]],
                                    label_id[perm_idx[i]].item()])
                elif aa >= 1 and len(abusive_word_list_neg111) < len(abusive_word_list_neg222):
                    label_1.append(1)
                    result4.append([global_step * 64 + perm_idx[i], 1,
                                    data0[global_step * 64 + perm_idx[i]],
                                    label_id[perm_idx[i]].item()])
                elif (result5[global_step * 64 + perm_idx[i]][1] and y_pred22[i].item() >= 0.9
                      and result5[global_step * 64 + perm_idx[i]][0] == y_pred2[i].item()):
                    if result5[global_step * 64 + perm_idx[i]][0] == 0:
                        label_0.append(0)
                        result4.append([global_step * 64 + perm_idx[i], 0,
                                        data0[global_step * 64 + perm_idx[i]],
                                        label_id[perm_idx[i]].item()])
                    elif result5[global_step * 64 + perm_idx[i]][0] == 1:
                        label_1.append(1)
                        result4.append([global_step * 64 + perm_idx[i], 1,
                                        data0[global_step * 64 + perm_idx[i]],
                                        label_id[perm_idx[i]].item()])
                else:
                    result4.append([global_step * 64 + perm_idx[i], -1,
                                    data0[global_step * 64 + perm_idx[i]],
                                    label_id[perm_idx[i]].item()])

            if global_step == ls - 1:
                result_label.clear()
                result3.clear()
                print("###result3[i] ###:", len(result3))
                a = min(len(label_0), len(label_1))
                la_0 = 0
                la_1 = 0
                la_2 = 0
                la_3 = 0
                random.shuffle(result4)
                for i in range(0, len(result4)):
                    if result4[i][1] == 0 and la_0 < a:
                        if temp_check[result4[i][0]][0] == 0:
                            temp_check[result4[i][0]][0] = 1
                            temp_check[result4[i][0]][1] = 0
                            la_0 += 1
                        continue
                    elif result4[i][1] == 1 and la_1 < a:
                        if temp_check[result4[i][0]][0] == 0:
                            temp_check[result4[i][0]][0] = 1
                            temp_check[result4[i][0]][1] = 1
                            la_1 += 1
                        continue

                result_label.clear()
                result3.clear()
                fw = open('./temp_data/temp_train_IMDB.tsv', 'a', encoding='utf-8', newline='')
                wr = csv.writer(fw, delimiter='\t')
                fww = open('./temp_data/temp_train_na_IMDB.tsv', 'w', encoding='utf-8', newline='')
                wrr = csv.writer(fww, delimiter='\t')
                for i in range(0, len(temp_check)):
                    if temp_check[i][0] == 1:
                        result_label.append(str(temp_check[i][3]))
                        result3.append(str(temp_check[i][1]))
                        wr.writerow([str(temp_check[i][1]), str(temp_check[i][2])])
                    else:
                        wrr.writerow([str(temp_check[i][3]), str(temp_check[i][2])])
                fw.close()
                fww.close()

                data0.clear()
                temp_check.clear()
                with open('./temp_data/temp_train_na_IMDB.tsv', "r", encoding='utf-8') as f:
                    lines = csv.reader(f, delimiter='\t')
                    for i in lines:
                        a = ''
                        lines2 = i[1].split(' ')
                        b = 0
                        for j in range(0, len(lines2)):
                            a += lines2[j] + ' '
                            b += 1
                        data0.append(a)
                        temp_check.append([0, -1, a, i[0]])
                print("################;", len(data0))
                f.close()

                dataset_temp = TaskDataset('./temp_data/temp_train_IMDB.tsv', pipeline)
                data_iter_temp = DataLoader(dataset_temp, batch_size=64, shuffle=True)
                dataset_temp_b = TaskDataset('./temp_data/temp_train_IMDB.tsv', pipeline1)
                data_iter_temp_b = DataLoader(dataset_temp_b, batch_size=64, shuffle=True)
                dataset_temp_na = TaskDataset('./temp_data/temp_train_na_IMDB.tsv', pipeline)
                data_iter_temp_na = DataLoader(dataset_temp_na, batch_size=64, shuffle=False)
                dataset_temp_na_b = TaskDataset('./temp_data/temp_train_na_IMDB.tsv', pipeline1)
                data_iter_temp_na_b = DataLoader(dataset_temp_na_b, batch_size=64, shuffle=False)

            if global_step != ls - 1:
                dataset_temp = TaskDataset(data_dev_file, pipeline)
                data_iter_temp = DataLoader(dataset_temp, batch_size=cfg.batch_size, shuffle=True)
                dataset_temp_b = TaskDataset(data_dev_file, pipeline1)
                data_iter_temp_b = DataLoader(dataset_temp_b, batch_size=64, shuffle=True)
                dataset_temp_na = TaskDataset(data_dev_file, pipeline)
                data_iter_temp_na = DataLoader(dataset_temp_na, batch_size=cfg.batch_size, shuffle=False)
                dataset_temp_na_b = TaskDataset(data_dev_file, pipeline1)
                data_iter_temp_na_b = DataLoader(dataset_temp_na_b, batch_size=64, shuffle=False)

            return label_id, logits2, result_label, result3, data_iter_temp, data_iter_temp_b, data_iter_temp_na, data_iter_temp_na_b

        def evalute_Attn_LSTM_SSL(model, batch):
            input_ids, segment_ids, input_mask, label_id, seq_lengths = batch
            seq_lengths, perm_idx = seq_lengths.sort(0, descending=True)
            input_ids = input_ids[perm_idx]
            label_id = label_id[perm_idx]
            token1 = embedding(input_ids.long())
            logits, attention_score = model2(token1.cuda(), input_ids, segment_ids, input_mask, seq_lengths)
            return label_id, logits

        curNum = 1
        print("###########################################")
        print(model_cfg)
        print(model_cfg)
        #kkk += 1

        cfg = train.Config.from_json(train_cfg)
        model_cfg = models.Config.from_json(model_cfg)

        for kkk in range(0, 5):
            print("###########################################")
            tokenizer = tokenization.FullTokenizer(do_lower_case=True)
            tokenizer1 = tokenization.FullTokenizer1(vocab_file=vocab, do_lower_case=True)
            TaskDataset = dataset_class(task)  # task dataset class according to the task
            pipeline = [Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
                        AddSpecialTokensWithTruncation(max_len),
                        TokenIndexing(tokenizer.convert_tokens_to_ids, TaskDataset.labels, max_len)]
            pipeline1 = [Tokenizing(tokenizer1.convert_to_unicode, tokenizer1.tokenize),
                         AddSpecialTokensWithTruncation(max_len),
                         TokenIndexing(tokenizer1.convert_tokens_to_ids1, TaskDataset.labels, max_len)]

            fd = open("./total_data/imdbtrain.tsv", 'r', encoding='utf-8')
            rdr = csv.reader(fd, delimiter='\t')
            res = []
            num_a = 0
            num_b = 0
            for line in rdr:
                #print(line)
                num_a += 1
                res.append([line[0], line[1]])
            print("curNum#:", curNum)
            #print(res)

            fw = open('./data/IMDB_temp_short.tsv', 'w', encoding='utf-8', newline='')
            wr = csv.writer(fw, delimiter='\t')
            for i in range(0, curNum):
                random.shuffle(res)
            #print(res[1][0])
            print("########")
            curNum += 100

            num_data = len(res)
            num_data_dev_temp = int(num_data * 0.01)
            num_data_dev = int(num_data_dev_temp * 0.15)
            num_data_short = int(num_data_dev_temp * 0.85)
            num_data_train = num_data - num_data_dev_temp
            fd.close()
            num = 0
            data_train_file = "./data/IMDB_train" + str(kkk + 1) + ".tsv"
            data_dev_file = "./data/IMDB_dev" + str(kkk + 1) + ".tsv"
            data_short_file = "./data/IMDB_short" + str(kkk + 1) + ".tsv"
            print("num_data_dev#:", num_data_dev)
            print("num_data_short#:", num_data_short)
            print("num_data_train#:", num_data_train)

            fw = open('./data/IMDB_temp_short.tsv', 'w', encoding='utf-8', newline='')
            wr = csv.writer(fw, delimiter='\t')
            fe = open(data_train_file, 'w', encoding='utf-8', newline='')
            we = csv.writer(fe, delimiter='\t')
            res2 = []
            num_pos = 0
            num_neg = 0
            for line in res:
                #print(line[0])
                #print(line[1])
                if line[0] == '0' and num_pos <= (num_data_dev_temp / 2):
                    num_pos += 1
                    wr.writerow(['0', line[1]])
                elif line[0] == '1' and num_neg <= (num_data_dev_temp / 2):
                    num_neg += 1
                    wr.writerow(['1', line[1]])
                else:
                    num += 1
                    we.writerow([line[0], line[1]])
            fw.close()
            fe.close()
            print("num_pos #:", num_pos, " num_neg:", num_neg)

            f = open('./data/IMDB_temp_short.tsv', 'r', encoding='utf-8')
            rdr = csv.reader(f, delimiter='\t')
            num_pos = 0
            num_neg = 0
            num = 0
            fw = open(data_dev_file, 'w', encoding='utf-8', newline='')
            wr = csv.writer(fw, delimiter='\t')
            fe = open(data_short_file, 'w', encoding='utf-8', newline='')
            we = csv.writer(fe, delimiter='\t')
            for line in rdr:
                #print(line[0])
                if line[0] == '0' and num_pos <= (num_data_dev / 2):
                    num_pos += 1
                    wr.writerow(['0', line[1]])
                elif line[0] == '1' and num_neg <= (num_data_dev / 2):
                    num_neg += 1
                    wr.writerow(['1', line[1]])
                else:
                    num += 1
                    we.writerow([line[0], line[1]])
            print("num_pos #:", num_pos, " num_neg:", num_neg)
            f.close()
            fw.close()
            fe.close()

            dataset = TaskDataset(data_train_file, pipeline)
            data_iter = DataLoader(dataset, batch_size=64, shuffle=False)
            dataset_b = TaskDataset(data_train_file, pipeline1)
            data_iter_b = DataLoader(dataset_b, batch_size=64, shuffle=False)
            dataset2 = TaskDataset(data_test_file, pipeline)
            data_iter2 = DataLoader(dataset2, batch_size=64, shuffle=False)
            dataset2_b = TaskDataset(data_test_file, pipeline1)
            data_iter2_b = DataLoader(dataset2_b, batch_size=64, shuffle=False)
            dataset_dev = TaskDataset(data_dev_file, pipeline)
            data_iter_dev = DataLoader(dataset_dev, batch_size=64, shuffle=False)
            dataset_dev_b = TaskDataset(data_dev_file, pipeline1)
            data_iter_dev_b = DataLoader(dataset_dev_b, batch_size=64, shuffle=False)
            dataset3 = TaskDataset(data_short_file, pipeline)
            data_iter3 = DataLoader(dataset3, batch_size=64, shuffle=True)
            dataset3_b = TaskDataset(data_short_file, pipeline1)
            data_iter3_b = DataLoader(dataset3_b, batch_size=64, shuffle=True)

            print("###########################################")
            print(model_cfg)
            weights = tokenization.embed_lookup2()
            print("#train_set:", len(data_iter))
            print("#test_set:", len(data_iter2))
            print("#short_set:", len(data_iter3))
            print("#dev_set:", len(data_iter_dev))
            curNum += 1

            embedding = nn.Embedding.from_pretrained(weights).cuda()
            criterion = nn.CrossEntropyLoss()
            model = Classifier(model_cfg, 2)
            model2 = Classifier_Attention_LSTM(2)

            trainer = train.Trainer(cfg,
                                    dataName,
                                    stopNum,
                                    model,
                                    model2,
                                    data_iter,
                                    data_iter_b,
                                    data_iter2,
                                    data_iter2_b,
                                    data_iter3,
                                    data_iter3_b,
                                    data_iter_dev,
                                    data_iter_dev_b,
                                    optim.optim4GPU(cfg, model, len(data_iter) * 10),
                                    torch.optim.Adam(model2.parameters(), lr=0.005),
                                    get_device(), kkk + 1)

            label_0 = []
            label_1 = []
            result3 = []
            result4 = []
            result5 = []
            bb_11 = {}
            bb_22 = {}
            abusive_11 = []
            abusive_22 = []
            result_label = []

            fw = open('./temp_data/temp_train_IMDB.tsv', 'w', encoding='utf-8', newline='')
            wr = csv.writer(fw, delimiter='\t')
            fr = open(data_short_file, 'r', encoding='utf-8')
            rdrr = csv.reader(fr, delimiter='\t')
            for line in rdrr:
                wr.writerow([line[0], line[1]])
            fw.close()
            fr.close()

            data0 = []
            temp_check = []
            temp_label = []
            with open(data_train_file, "r", encoding='utf-8') as f:
                lines = csv.reader(f, delimiter='\t')
                for i in lines:
                    a = ''
                    lines2 = i[1].split(' ')
                    for j in range(0, len(lines2)):
                        a += lines2[j] + ' '
                    data0.append(a)
                    temp_check.append([0, -1, a, i[0]])
                    temp_label.append([0, 0])
            f.close()

            trainer.train(model_file, pretrain_file, get_loss_CNN, get_loss_Attn_LSTM,
                          evalute_CNN_SSL, pseudo_labeling, evalute_Attn_LSTM, evalute_CNN,
                          evalute_Attn_LSTM_SSL, generating_lexiocn, data_parallel)

    elif mode == 'eval':
        def evalute_Attn_LSTM_SSL(model, batch):
            input_ids, segment_ids, input_mask, label_id, seq_lengths = batch
            seq_lengths, perm_idx = seq_lengths.sort(0, descending=True)
            input_ids = input_ids[perm_idx]
            label_id = label_id[perm_idx]
            token1 = embedding(input_ids.long())
            logits, attention_score = model2(token1.cuda(), input_ids, segment_ids, input_mask, seq_lengths)
            return label_id, logits

        def evalute_CNN_SSL(model, batch):
            input_ids, segment_ids, input_mask, label_id, seq_lengths = batch
            token1 = embedding(input_ids.long())
            logits, attention_score = model(token1.cuda(), input_ids, segment_ids, input_mask)
            return label_id, logits

        weights = tokenization.embed_lookup2()
        embedding = nn.Embedding.from_pretrained(weights).cuda()
        criterion = nn.CrossEntropyLoss()
        model = Classifier_CNN(2)
        model2 = Classifier_Attention_LSTM(2)
        trainer = train.Eval(cfg, model, model2, data_iter, save_dir, get_device())
        embedding = nn.Embedding.from_pretrained(weights).cuda()
        results = trainer.eval(evalute_CNN_SSL, evalute_Attn_LSTM_SSL, data_parallel)
def main(task='sim',
         train_cfg='config/train_mrpc.json',
         model_cfg='config/bert_base.json',
         data_file='../glue/MRPC/train.tsv',
         model_file=None,
         pretrain_file=pretrain_file,
         data_parallel=True,
         vocab='../uncased_L-12_H-768_A-12/vocab.txt',
         save_dir='../exp/bert/mrpc',
         max_len=128,
         batch_size=2,
         pretrained_type='local',
         mode='train'):

    cfg = train.Config.from_json(train_cfg)
    model_cfg = models.Config.from_json(model_cfg)

    #set_seeds(cfg.seed)

    local_pretrained = (pretrained_type != 'google')

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab, do_lower_case=True)
    TaskDataset = dataset_class(task)  # task dataset class according to the task
    pipeline = [Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
                AddSpecialTokensWithTruncation(max_len),
                TokenIndexing(tokenizer.convert_tokens_to_ids,
                              TaskDataset.labels, max_len)]
    dataset = TaskDataset(data_file, pipeline)

    # batch_size
    #data_iter = DataLoader(dataset, batch_size=cfg.batch_size, shuffle=True)
    data_iter = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    #model = Classifier(model_cfg, len(TaskDataset.labels))
    model = SentEmbedding(model_cfg, len(TaskDataset.labels), local_pretrained)

    #trainer = train.Trainer(cfg, ...)
    evaluator = SentEvaluator(cfg,
                              model,
                              data_iter,
                              optim.optim4GPU(cfg, model),
                              save_dir, get_device(), local_pretrained)

    def evaluate(model, batch):
        input_ids, segment_ids, input_mask, label_id = batch
        #logits = model(input_ids, segment_ids, input_mask)
        if local_pretrained:
            print(np.shape(input_ids), np.shape(segment_ids), np.shape(input_mask))
            embed = model(input_ids, segment_ids, input_mask)
        else:
            #input_ids = torch.LongTensor(input_ids)
            #segment_ids = torch.LongTensor(segment_ids)
            #input_mask = torch.LongTensor(input_mask)
            print(np.shape(input_ids), np.shape(segment_ids), np.shape(input_mask))
            print(input_ids.shape, segment_ids.shape, input_mask.shape)
            embed = model(input_ids, segment_ids, input_mask)
        print('evaluate(embed) : ', embed.shape)
        return embed

    #results = trainer.eval(evaluate, model_file, data_parallel)
    results = evaluator.eval(evaluate, model_file, data_parallel)
    print(np.shape(results))

    similarities = []
    for svec in results:
        sim = cosine_similarity(results[0], svec)
        print(sim)
        similarities.append(sim.cpu().tolist())
    print(similarities)
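# A minimal sketch of the similarity loop at the end of main above, assuming each
# element of `results` is a [1, hidden] sentence-embedding tensor and that
# `cosine_similarity` behaves like torch.nn.functional.cosine_similarity
# (the helper name and shapes are illustrative assumptions).
import torch
import torch.nn.functional as F

def similarities_to_first(embeddings):
    """Cosine similarity of every sentence embedding against the first one."""
    anchor = embeddings[0]
    return [F.cosine_similarity(anchor, e).item() for e in embeddings]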
def main(config='config/finetune/agnews/train.json'):
    cfg = Config(**json.load(open(config, "r")))
    cfg_data = data.Config(**json.load(open(cfg.cfg_data, "r")))
    cfg_model = models.Config(**json.load(open(cfg.cfg_model, "r")))
    cfg_optim = trainer.Config(**json.load(open(cfg.cfg_optim, "r")))

    set_seeds(cfg.seed)

    ### Prepare Dataset and Preprocessing ###
    TaskDataset = data.get_class(cfg_data.task)  # task dataset class according to the task
    tokenizer = tokenization.FullTokenizer(vocab_file=cfg_data.vocab_file, do_lower_case=True)
    dataset = TaskDataset(cfg_data.data_file[cfg.mode],
                          pipelines=[data.RemoveSymbols('\\'),
                                     data.Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
                                     data.AddSpecialTokensWithTruncation(cfg_data.max_len),
                                     data.TokenIndexing(tokenizer.convert_tokens_to_ids,
                                                        TaskDataset.labels,
                                                        cfg_data.max_len)],
                          n_data=None)
    tensors = TensorDataset(*dataset.get_tensors())  # To Tensors
    data_iter = DataLoader(tensors, batch_size=cfg_optim.batch_size, shuffle=False)

    ### Fetch Teacher's output and put it into the dataset ###
    def fetch_logits(model):
        def get_logits(model, batch):
            input_ids, segment_ids, input_mask, label_id = batch
            logits = model(input_ids, segment_ids, input_mask)
            return 0.0, logits

        train_loop = trainer.TrainLoop(cfg_optim, model, data_iter, None, None, get_device())
        results = torch.cat(train_loop.eval(get_logits, cfg.model_file))
        return results

    if cfg.mode == "train":
        print("Fetching teacher's output...")
        teacher = models.Classifier4Transformer(cfg_model, len(TaskDataset.labels))
        teacher.load_state_dict(torch.load(cfg.model_file))  # use trained model
        with torch.no_grad():
            teacher_logits = fetch_logits(teacher)

        tensors = TensorDataset(teacher_logits, *dataset.get_tensors())  # To Tensors
        data_iter = DataLoader(tensors, batch_size=cfg_optim.batch_size, shuffle=False)

    ### Models ###
    model = models.BlendCNN(cfg_model, len(TaskDataset.labels))
    checkpoint.load_embedding(model.embed, cfg.pretrain_file)

    optimizer = optim.optim4GPU(cfg_optim, model)

    train_loop = trainer.TrainLoop(cfg_optim, model, data_iter, optimizer,
                                   cfg.save_dir, get_device())

    def get_loss(model, batch, global_step):  # make sure loss is a scalar tensor
        teacher_logits, input_ids, segment_ids, input_mask, label_id = batch
        T = 1.0
        logits = model(input_ids, segment_ids, input_mask)
        loss = 0.1 * nn.CrossEntropyLoss()(logits, label_id)
        loss += 0.9 * nn.KLDivLoss()(F.log_softmax(logits / T, dim=1),
                                     F.softmax(teacher_logits / T, dim=1))
        #loss = 0.9*nn.MSELoss()(logits, teacher_logits)
        return loss

    def evaluate(model, batch):
        input_ids, segment_ids, input_mask, label_id = batch
        logits = model(input_ids, segment_ids, input_mask)
        _, label_pred = logits.max(1)
        result = (label_pred == label_id).float()  # .cpu().numpy()
        accuracy = result.mean()
        return accuracy, result

    if cfg.mode == "train":
        train_loop.train(get_loss, None, None)  # do not use pretrain file
        print("Training has been done properly.")
    elif cfg.mode == "eval":
        results = train_loop.eval(evaluate, cfg.model_file)
        total_accuracy = torch.cat(results).mean().item()
        print(f"Accuracy: {total_accuracy}")
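# Worked sketch of the distillation loss in get_loss above, on toy tensors. The
# 0.1/0.9 weights and temperature T follow the code; batch and class sizes are
# illustrative assumptions. Note that nn.KLDivLoss expects log-probabilities as
# its input and probabilities as its target, which is why the student side goes
# through log_softmax and the teacher side through softmax.
import torch
import torch.nn as nn
import torch.nn.functional as F

T = 1.0
logits = torch.randn(8, 4)            # student logits (toy)
teacher_logits = torch.randn(8, 4)    # teacher logits fetched beforehand (toy)
label_id = torch.randint(0, 4, (8,))  # gold labels (toy)

loss = 0.1 * nn.CrossEntropyLoss()(logits, label_id)
loss += 0.9 * nn.KLDivLoss()(F.log_softmax(logits / T, dim=1),
                             F.softmax(teacher_logits / T, dim=1))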
def main(train_cfg='config/pretrain.json',
         model_cfg='config/bert_base.json',
         data_file='/root/voucher/dataset/tifu/bert/train.tsv',
         model_file=None,
         data_parallel=True,
         word_vocab='/root/voucher/dataset/tifu/bert/word_vocab.txt',
         pos_vocab='/root/voucher/dataset/tifu/bert/pos_vocab.txt',
         dep_vocab='/root/voucher/dataset/tifu/bert/dep_vocab.txt',
         pos_dep_word_vocab='/root/voucher/dataset/tifu/bert/pos_dep_word.pkl',
         save_dir='../exp/bert/pretrain',
         log_dir='../exp/bert/pretrain/runs',
         max_len=384,
         max_pred=20,
         mask_prob=0.15,
         mode='train'):
    if mode == 'train':
        pass
    elif mode == 'eval':
        pass
        # max_pred = max_len
        # mask_prob = 1
    else:
        print("please select correct mode")
        exit(1)

    cfg = train.Config.from_json(train_cfg)
    model_cfg = models.Config.from_json(model_cfg)
    set_seeds(cfg.seed)

    custom_tokenizer = CustomVocabTokenizer(word_vocab_file=word_vocab,
                                            pos_vocab_file=pos_vocab,
                                            dep_vocab_file=dep_vocab,
                                            pos_dep_word_vocab_file=pos_dep_word_vocab)
    custom_tokenize = lambda word, pos, dep: custom_tokenizer.tokenize(
        custom_tokenizer.convert_to_unicode(word),
        custom_tokenizer.convert_to_unicode(pos),
        custom_tokenizer.convert_to_unicode(dep))

    pipeline = [Preprocess4Pretrain(max_pred, mask_prob,
                                    list(custom_tokenizer.word_tokenizer.vocab.keys()),
                                    list(custom_tokenizer.pos_tokenizer.vocab.keys()),
                                    list(custom_tokenizer.dep_tokenizer.vocab.keys()),
                                    custom_tokenizer.convert_tokens_to_ids, max_len)]
    data_iter = TifuDataLoader(data_file, cfg.batch_size, custom_tokenize, max_len, pipeline=pipeline)

    model = BertModel4Pretrain(model_cfg,
                               custom_tokenizer.get_word_vocab_size(),
                               custom_tokenizer.get_pos_vocab_size(),
                               custom_tokenizer.get_dep_vocab_size())

    criterion1 = nn.CrossEntropyLoss(reduction='none')
    criterion2 = nn.CrossEntropyLoss(reduction='none')
    criterion3 = nn.CrossEntropyLoss(reduction='none')

    optimizer = optim.optim4GPU(cfg, model)
    trainer = train.Trainer(cfg, model, data_iter, optimizer, save_dir, get_device())
    writer = SummaryWriter(log_dir=log_dir)  # for tensorboardX

    if mode == 'train':
        def get_loss(model, batch, global_step):  # make sure loss is a tensor
            (input_word_ids, input_pos_ids, input_dep_ids, input_segment_ids, input_mask,
             masked_word_ids, masked_pos_ids, masked_dep_ids, masked_pos, masked_weights,
             target_word_ids, target_pos_ids, target_dep_ids, target_mask) = batch
            logits_pos, logits_dep, logits_word = model(input_word_ids, input_segment_ids,
                                                        masked_pos, input_mask,
                                                        target_word_ids, target_mask)
            loss_pos = criterion1(logits_pos.transpose(1, 2), masked_pos_ids)  # for masked pos
            loss_pos = (loss_pos * masked_weights.float()).mean()
            loss_dep = criterion2(logits_dep.transpose(1, 2), masked_dep_ids)  # for masked dep
            loss_dep = (loss_dep * masked_weights.float()).mean()
            loss_word = criterion3(logits_word.transpose(1, 2), masked_word_ids)  # for masked word
            loss_word = (loss_word * masked_weights.float()).mean()
            print(loss_pos.item(), loss_dep.item(), loss_word.item())
            writer.add_scalars('data/scalar_group',
                               {'loss_pos': loss_pos.item(),
                                'loss_dep': loss_dep.item(),
                                'loss_word': loss_word.item(),
                                'loss_total': (loss_pos + loss_dep + loss_word).item(),
                                'lr': optimizer.get_lr()[0]},
                               global_step)
            return loss_pos + loss_dep + loss_word

        trainer.train(get_loss, model_file, None, data_parallel)

    elif mode == 'eval':
        def evaluate(model, batch):
            (input_word_ids, input_pos_ids, input_dep_ids, input_segment_ids, input_mask,
             masked_word_ids, masked_pos_ids, masked_dep_ids, masked_pos, masked_weights,
             target_word_ids, target_pos_ids, target_dep_ids, target_mask) = batch
            logits_pos, logits_dep, logits_word = model(input_word_ids, input_segment_ids,
                                                        masked_pos, input_mask,
                                                        target_word_ids, target_mask)
            _, label_pos = logits_pos.max(-1)
            result_pos = (label_pos == masked_pos_ids).float()  #.cpu().numpy()
            pos_accuracy = result_pos.mean()
            _, label_dep = logits_dep.max(-1)
            result_dep = (label_dep == masked_dep_ids).float()
            dep_accuracy = result_dep.mean()
            _, label_word = logits_word.max(-1)
            result_word = (label_word == masked_word_ids).float()
            word_accuracy = result_word.mean()
            accuracies = [pos_accuracy, dep_accuracy, word_accuracy]
            results = [result_pos, result_dep, result_word]
            return accuracies, results

        results = trainer.eval(evaluate, model_file, data_parallel,
                               eval_kind_names=["PosTagging", "SyntaxParsing", "Word"])
        print(results)
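# The masked losses above compute per-token cross entropy (reduction='none'), zero out
# non-masked positions with masked_weights, and then take a plain .mean() over the whole
# tensor. A sketch of the same idea that instead normalizes by the number of masked
# positions (masked_token_loss is a hypothetical helper, not this repo's exact behaviour):

import torch
import torch.nn as nn

def masked_token_loss(logits, target_ids, mask):
    """logits: (batch, seq, vocab); target_ids, mask: (batch, seq)."""
    per_token = nn.CrossEntropyLoss(reduction='none')(logits.transpose(1, 2), target_ids)
    mask = mask.float()
    # dividing by mask.sum() averages over predicted tokens only,
    # so sequences with few masked tokens are not diluted by padding
    return (per_token * mask).sum() / mask.sum().clamp(min=1.0)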
def main(train_cfg='config/pretrain.json',
         model_cfg='config/bert_base.json',
         data_file='/root/voucher/dataset/tifu/bert/train.tsv',
         model_file=None,
         data_parallel=True,
         word_vocab='/root/voucher/dataset/tifu/bert/word_vocab.txt',
         pos_vocab='/root/voucher/dataset/tifu/bert/pos_vocab.txt',
         dep_vocab='/root/voucher/dataset/tifu/bert/dep_vocab.txt',
         pos_dep_word_vocab='/root/voucher/dataset/tifu/bert/pos_dep_word.pkl',
         save_dir='../exp/bert/pretrain',
         log_dir='../exp/bert/pretrain/runs',
         max_len=384,
         max_pred=20,
         mask_prob=0.15,
         mode='train'):
    cfg = train.Config.from_json(train_cfg)
    model_cfg = models.Config.from_json(model_cfg)
    set_seeds(cfg.seed)

    custom_tokenizer = CustomVocabTokenizer(word_vocab_file=word_vocab,
                                            pos_vocab_file=pos_vocab,
                                            dep_vocab_file=dep_vocab,
                                            pos_dep_word_vocab_file=pos_dep_word_vocab)
    custom_tokenize = lambda word, pos, dep: custom_tokenizer.tokenize(
        custom_tokenizer.convert_to_unicode(word),
        custom_tokenizer.convert_to_unicode(pos),
        custom_tokenizer.convert_to_unicode(dep))

    pipeline = [Preprocess4Pretrain(max_pred, mask_prob,
                                    list(custom_tokenizer.word_tokenizer.vocab.keys()),
                                    list(custom_tokenizer.pos_tokenizer.vocab.keys()),
                                    list(custom_tokenizer.dep_tokenizer.vocab.keys()),
                                    custom_tokenizer.convert_tokens_to_ids, max_len)]
    data_iter = TifuDataLoader(data_file, cfg.batch_size, custom_tokenize, max_len, pipeline=pipeline)

    model = BertModel4Pretrain(model_cfg,
                               custom_tokenizer.get_word_vocab_size(),
                               custom_tokenizer.get_pos_vocab_size(),
                               custom_tokenizer.get_dep_vocab_size())

    criterion1 = nn.CrossEntropyLoss(reduction='none')
    criterion2 = nn.CrossEntropyLoss(reduction='none')
    criterion3 = nn.CrossEntropyLoss(reduction='none')

    optimizer = optim.optim4GPU(cfg, model)
    trainer = train.Trainer(cfg, model, data_iter, optimizer, save_dir, get_device())
    writer = SummaryWriter(log_dir=log_dir)  # for tensorboardX

    if mode == 'train':
        def get_loss(model, batch, global_step):  # make sure loss is a tensor
            (origin_input_word_ids, input_word_ids, input_pos_ids, input_dep_ids,
             input_segment_ids, input_mask, target_word_ids, target_pos_ids,
             target_dep_ids, target_mask, input_len, target_len) = batch
            logits_pos, logits_dep, logits_word = model(input_word_ids, input_segment_ids,
                                                        input_pos_ids, input_dep_ids,
                                                        input_mask, target_mask)
            loss_pos = criterion1(logits_pos.transpose(1, 2), input_pos_ids)  # for masked pos
            loss_pos = (loss_pos * input_mask.float()).mean()
            loss_dep = criterion2(logits_dep.transpose(1, 2), input_dep_ids)  # for masked dep
            loss_dep = (loss_dep * input_mask.float()).mean()
            loss_word = criterion3(logits_word.transpose(1, 2), origin_input_word_ids)  # for masked word
            loss_word = (loss_word * input_mask.float()).mean()
            print(loss_pos.item(), loss_dep.item(), loss_word.item())
            writer.add_scalars('data/scalar_group',
                               {'loss_pos': loss_pos.item(),
                                'loss_dep': loss_dep.item(),
                                'loss_word': loss_word.item(),
                                'loss_total': (loss_pos + loss_dep + loss_word).item(),
                                'lr': optimizer.get_lr()[0]},
                               global_step)
            return loss_pos + loss_dep + loss_word

        trainer.train(get_loss, model_file, None, data_parallel)

    elif mode == 'eval':
        def evaluate(model, batch):
            (origin_input_word_ids, input_word_ids, input_pos_ids, input_dep_ids,
             input_segment_ids, input_mask, target_word_ids, target_pos_ids,
             target_dep_ids, target_mask, input_len, target_len) = batch
            logits_pos, logits_dep, logits_word = model(input_word_ids, input_segment_ids,
                                                        input_pos_ids, input_dep_ids,
                                                        input_mask, target_mask)
            _, label_pos = logits_pos.max(-1)
            result_pos = (label_pos == target_pos_ids).float()  #.cpu().numpy()
            pos_accuracy = result_pos.mean()
            _, label_dep = logits_dep.max(-1)
            result_dep = (label_dep == target_dep_ids).float()
            dep_accuracy = result_dep.mean()
            _, label_word = logits_word.max(-1)
            result_word = (label_word == target_word_ids).float()
            word_accuracy = result_word.mean()
            accuracies = [pos_accuracy, dep_accuracy, word_accuracy]
            results = [result_pos, result_dep, result_word]
            return accuracies, results

        results = trainer.eval(evaluate, model_file, data_parallel,
                               eval_kind_names=["PosTagging", "SyntaxParsing", "Word"])
        print(results)

    elif mode == 'sim':
        def sim(model, batch):
            (origin_input_word_ids, input_word_ids, input_pos_ids, input_dep_ids,
             input_segment_ids, input_mask, target_word_ids, target_pos_ids,
             target_dep_ids, target_mask, input_len, target_len) = batch
            logits_pos, logits_dep, logits_word = model(input_word_ids, input_segment_ids,
                                                        input_pos_ids, input_dep_ids,
                                                        input_mask, target_mask)
            input_len = input_len.tolist()
            target_len = target_len.tolist()
            for i in range(len(input_len)):
                logits = torch.squeeze(logits_word.narrow(0, i, 1), dim=0)
                logits_input = logits.narrow(0, 0, input_len[i])
                logits_target = logits.narrow(0, input_len[i], target_len[i])
                _, input_ids = logits_input.max(-1)
                _, target_ids = logits_target.max(-1)
                input_tokens = custom_tokenizer.word_tokenizer.convert_ids_to_tokens(input_ids.tolist())
                target_tokens = custom_tokenizer.word_tokenizer.convert_ids_to_tokens(target_ids.tolist())
                input_tokens2 = custom_tokenizer.word_tokenizer.convert_ids_to_tokens(input_word_ids[i].tolist())
                target_tokens2 = custom_tokenizer.word_tokenizer.convert_ids_to_tokens(target_word_ids[i].tolist())

                results = []
                input_norm = logits_input / logits_input.norm(dim=1)[:, None]
                target_norm = logits_target / logits_target.norm(dim=1)[:, None]
                # target_len x input_len
                res = torch.mm(target_norm, input_norm.transpose(0, 1))
                # target_len x 1
                _, sim_idxs = res.max(-1)
                for j, sim_idx in enumerate(sim_idxs.tolist()):
                    results.append([target_tokens[j], input_tokens[sim_idx],
                                    target_tokens2[j], input_tokens2[sim_idx]])
                print(results)
            accuracies = [0]
            results = [0]
            return accuracies, results

        results = trainer.eval(sim, model_file, data_parallel, eval_kind_names=["Word"])
        print(results)
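# The 'sim' branch above L2-normalizes the target and input logit vectors and uses a
# matrix product to find, for each target token, the most similar input token. The same
# pattern in isolation (most_similar_tokens is a hypothetical helper):

import torch
import torch.nn.functional as F

def most_similar_tokens(target_vecs, input_vecs):
    """target_vecs: (T, D), input_vecs: (I, D); returns (similarities, indices), each of shape (T,)."""
    t = F.normalize(target_vecs, dim=1)
    i = F.normalize(input_vecs, dim=1)
    scores = torch.mm(t, i.t())   # (T, I) cosine-similarity matrix
    return scores.max(dim=-1)     # best similarity and its input index per target token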
def main(train_cfg='config/pretrain.json',
         model_cfg='config/bert_base.json',
         data_file='/root/voucher/dataset/tifu/bert/train.tsv',
         model_file=None,
         pretrain_file=None,
         data_parallel=True,
         word_vocab='/root/voucher/dataset/tifu/bert/word_vocab.txt',
         pos_vocab='/root/voucher/dataset/tifu/bert/pos_vocab.txt',
         dep_vocab='/root/voucher/dataset/tifu/bert/dep_vocab.txt',
         pos_dep_word_vocab='/root/voucher/dataset/tifu/bert/pos_dep_word.pkl',
         save_dir='../exp/bert/pretrain',
         log_dir='../exp/bert/pretrain/runs',
         max_len=384,
         max_pred=20,
         mask_prob=0.15,
         mode='train'):
    if mode == 'train':
        pass
    elif mode == 'eval':
        pass
        # max_pred = max_len
        # mask_prob = 1
    else:
        print("please select correct mode")
        exit(1)

    cfg = train.Config.from_json(train_cfg)
    model_cfg = models.Config.from_json(model_cfg)
    set_seeds(cfg.seed)

    custom_tokenizer = CustomVocabTokenizer(word_vocab_file=word_vocab,
                                            pos_vocab_file=pos_vocab,
                                            dep_vocab_file=dep_vocab,
                                            pos_dep_word_vocab_file=pos_dep_word_vocab)
    custom_tokenize = lambda word, pos, dep: custom_tokenizer.tokenize(
        custom_tokenizer.convert_to_unicode(word),
        custom_tokenizer.convert_to_unicode(pos),
        custom_tokenizer.convert_to_unicode(dep))

    pipeline = [Preprocess4Pretrain(max_pred, mask_prob,
                                    list(custom_tokenizer.word_tokenizer.vocab.keys()),
                                    list(custom_tokenizer.pos_tokenizer.vocab.keys()),
                                    list(custom_tokenizer.dep_tokenizer.vocab.keys()),
                                    custom_tokenizer.convert_tokens_to_ids, max_len)]
    data_iter = TifuDataLoader(data_file, cfg.batch_size, custom_tokenize, max_len, pipeline=pipeline)

    model = BertModel4Pretrain(model_cfg)
    optimizer = optim.optim4GPU(cfg, model)
    trainer = train.Trainer(cfg, model, data_iter, optimizer, save_dir, get_device())

    if mode == 'eval':
        def evaluate(model, batch):
            (input_word_ids, input_segment_ids, input_mask,
             target_word_ids, target_mask, input_len, target_len) = batch
            logits_word = model(input_word_ids, input_segment_ids, input_mask, target_mask)
            input_len = input_len.tolist()
            target_len = target_len.tolist()
            for i in range(len(input_len)):
                logits = torch.squeeze(logits_word.narrow(0, i, 1), dim=0)
                logits_input = logits.narrow(0, 0, input_len[i])
                logits_target = logits.narrow(0, input_len[i], target_len[i])
                _, input_ids = logits_input.max(-1)
                _, target_ids = logits_target.max(-1)
                input_tokens = custom_tokenizer.word_tokenizer.convert_ids_to_tokens(input_ids.tolist())
                target_tokens = custom_tokenizer.word_tokenizer.convert_ids_to_tokens(target_ids.tolist())

                results = []
                input_norm = logits_input / logits_input.norm(dim=1)[:, None]
                target_norm = logits_target / logits_target.norm(dim=1)[:, None]
                # target_len x input_len
                res = torch.mm(target_norm, input_norm.transpose(0, 1))
                # target_len x 1
                _, sim_idxs = res.max(-1)
                for j, sim_idx in enumerate(sim_idxs.tolist()):
                    results.append([target_tokens[j], input_tokens[sim_idx]])
                print(results)
            accuracies = [0]
            results = [0]
            return accuracies, results

        results = trainer.eval(evaluate, None, pretrain_file, data_parallel, eval_kind_names=["Word"])
        print(results)
def main(config='config/blendcnn/mrpc/eval.json', args=None):
    cfg = Config(**json.load(open(config, "r")))
    cfg_data = data.Config(**json.load(open(cfg.cfg_data, "r")))
    cfg_model = models.Config(**json.load(open(cfg.cfg_model, "r")))
    cfg_optim = trainer.Config(**json.load(open(cfg.cfg_optim, "r")))
    set_seeds(cfg.seed)

    TaskDataset = data.get_class(cfg_data.task)  # task dataset class according to the task
    tokenizer = tokenization.FullTokenizer(vocab_file=cfg_data.vocab_file, do_lower_case=True)
    dataset = TaskDataset(args.dataset_location,
                          pipelines=[data.RemoveSymbols('\\'),
                                     data.Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
                                     data.AddSpecialTokensWithTruncation(cfg_data.max_len),
                                     data.TokenIndexing(tokenizer.convert_tokens_to_ids,
                                                        TaskDataset.labels, cfg_data.max_len)],
                          n_data=None)
    dataset = TensorDataset(*dataset.get_tensors())  # To Tensors
    data_iter = DataLoader(dataset, batch_size=args.batch_size, shuffle=False)

    model = models.BlendCNN(cfg_model, len(TaskDataset.labels))
    checkpoint.load_embedding(model.embed, cfg.pretrain_file)

    optimizer = optim.optim4GPU(cfg_optim, model)
    train_loop = trainer.TrainLoop(cfg_optim, model, data_iter, optimizer, cfg.save_dir, get_device())

    def get_loss(model, batch, global_step):  # make sure loss is a scalar tensor
        input_ids, segment_ids, input_mask, label_id = batch
        logits = model(input_ids, segment_ids, input_mask)
        loss = nn.CrossEntropyLoss()(logits, label_id)
        return loss

    def evaluate(model, batch):
        input_ids, segment_ids, input_mask, label_id = batch
        logits = model(input_ids, segment_ids, input_mask)
        _, label_pred = logits.max(1)
        result = (label_pred == label_id).float()  #.cpu().numpy()
        accuracy = result.mean()
        return accuracy, result

    class Bert_DataLoader(object):
        def __init__(self, loader=None, model_type=None, device='cpu', batch_size=1):
            self.loader = loader
            self.model_type = model_type
            self.device = device
            self.batch_size = batch_size

        def __iter__(self):
            for batch in self.loader:
                batch = tuple(t.to(self.device) for t in batch)
                outputs = {'output_all': (batch[0], batch[1], batch[2]), 'labels': batch[3]}
                yield outputs['output_all'], outputs['labels']

    def benchmark(model):
        total_samples = 0
        total_time = 0
        index = 0

        class RandomDataset(object):
            def __init__(self, size, shape):
                self.len = size
                self.input_ids = torch.randint(low=0, high=30522, size=(size, shape), dtype=torch.int64)
                self.segment_ids = torch.randint(low=0, high=1, size=(size, shape), dtype=torch.int64)
                self.input_mask = torch.randint(low=0, high=1, size=(size, shape), dtype=torch.int64)
                self.data = (self.input_ids, self.segment_ids, self.input_mask)

            def __getitem__(self, index):
                return (self.data[0][index], self.data[1][index], self.data[2][index])

            def __len__(self):
                return self.len

        rand_loader = DataLoader(dataset=RandomDataset(size=5000, shape=128),
                                 batch_size=args.batch_size, shuffle=True)
        for batch in rand_loader:
            index += 1
            tic = time.time()
            if os.environ.get('BLENDCNN_PROFILING') is not None:
                with profiler.profile(record_shapes=True) as prof:
                    with torch.no_grad():
                        input_ids, segment_ids, input_mask = batch
                        _ = model(*batch)
            else:
                with torch.no_grad():  # evaluation without gradient calculation
                    input_ids, segment_ids, input_mask = batch
                    _ = model(*batch)
            if index > args.warmup:
                total_samples += batch[0].size()[0]
                total_time += time.time() - tic

        throughput = total_samples / total_time
        print('Latency: %.3f ms' % (1 / throughput * 1000))
        print('Throughput: %.3f images/sec' % (throughput))
        if os.environ.get('BLENDCNN_PROFILING') is not None:
            print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))

    def eval_func(model):
        results = []  # prediction results
        total_samples = 0
        total_time = 0
        index = 0
        model.eval()
        eval_dataloader = Bert_DataLoader(loader=data_iter, batch_size=args.batch_size)
        for batch, label in eval_dataloader:
            index += 1
            tic = time.time()
            if os.environ.get('BLENDCNN_PROFILING') is not None:
                with profiler.profile(record_shapes=True) as prof:
                    with torch.no_grad():
                        accuracy, result = evaluate(model, (*batch, label))
            else:
                with torch.no_grad():  # evaluation without gradient calculation
                    accuracy, result = evaluate(model, (*batch, label))
            results.append(result)
            if index > args.warmup:
                total_samples += batch[0].size()[0]
                total_time += time.time() - tic

        total_accuracy = torch.cat(results).mean().item()
        throughput = total_samples / total_time
        print('Latency: %.3f ms' % (1 / throughput * 1000))
        print('Throughput: %.3f samples/sec' % (throughput))
        print('Accuracy: %.3f ' % (total_accuracy))
        if os.environ.get('BLENDCNN_PROFILING') is not None:
            print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))
        return total_accuracy

    if cfg.mode == "train":
        train_loop.train(get_loss, cfg.model_file, None)  # not use pretrain_file
        print("Training has been done properly.")
    elif cfg.mode == "eval":
        # results = train_loop.eval(evaluate, cfg.model_file)
        # total_accuracy = torch.cat(results).mean().item()
        # print(f"Accuracy: {total_accuracy}")
        if args.tune:
            import lpot
            from lpot import common
            # lpot tune
            model.load_state_dict(torch.load(args.input_model))
            eval_dataloader = Bert_DataLoader(loader=data_iter, batch_size=args.batch_size)
            quantizer = lpot.Quantization(args.tuned_yaml)
            quantizer.model = common.Model(model)
            quantizer.calib_dataloader = eval_dataloader
            quantizer.eval_func = eval_func
            q_model = quantizer()
            q_model.save(args.tuned_checkpoint)
        elif args.int8:
            from lpot.utils.pytorch import load
            int8_model = load(os.path.abspath(os.path.expanduser(args.tuned_checkpoint)), model)
            print(int8_model)
            if args.accuracy_only:
                eval_func(int8_model)
            elif args.benchmark:
                benchmark(int8_model)
        else:
            model.load_state_dict(torch.load(args.input_model))
            print(model)
            if args.accuracy_only:
                eval_func(model)
            elif args.benchmark:
                benchmark(model)
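# benchmark() and eval_func() above both time each batch and skip the first args.warmup
# iterations before accumulating totals. The same pattern in isolation, as a rough sketch
# (measure_throughput is a hypothetical helper; real numbers depend on hardware and batch size):

import time
import torch

def measure_throughput(model, loader, warmup=10):
    """Average latency/throughput over a DataLoader, ignoring warm-up batches."""
    model.eval()
    n_samples, elapsed = 0, 0.0
    with torch.no_grad():
        for step, batch in enumerate(loader):
            tic = time.time()
            model(*batch)
            if step >= warmup:  # the first iterations are skewed by caches and allocator warm-up
                elapsed += time.time() - tic
                n_samples += batch[0].size(0)
    throughput = n_samples / elapsed
    print('Latency: %.3f ms' % (1000.0 / throughput))
    print('Throughput: %.3f samples/sec' % throughput)
    return throughput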
def main(task_name='qqp',
         base_train_cfg='config/QDElectra_pretrain.json',
         train_cfg='config/train_mrpc.json',
         model_cfg='config/QDElectra_base.json',
         train_data_file='GLUE/glue_data/QQP/train.tsv',
         eval_data_file='GLUE/glue_data/QQP/eval.tsv',
         model_file=None,
         data_parallel=True,
         vocab='../uncased_L-12_H-768_A-12/vocab.txt',
         log_dir='../exp/electra/pretrain/runs',
         save_dir='../exp/bert/mrpc',
         distill=True,
         quantize=True,
         gradually_distill=False,
         imitate_tinybert=False,
         pred_distill=True):
    check_dirs_exist([log_dir, save_dir])

    train_cfg_dict = json.load(open(base_train_cfg, "r"))
    train_cfg_dict.update(json.load(open(train_cfg, "r")))
    train_cfg = ElectraConfig().from_dict(train_cfg_dict)
    model_cfg = ElectraConfig().from_json_file(model_cfg)
    output_mode, train_cfg.n_epochs, max_len = get_task_params(task_name)
    set_seeds(train_cfg.seed)

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab, do_lower_case=True)
    TaskDataset = dataset_class(task_name)  # task dataset class according to the task name
    model_cfg.num_labels = len(TaskDataset.labels)
    pipeline = [Tokenizing(task_name, tokenizer.convert_to_unicode, tokenizer.tokenize),
                AddSpecialTokensWithTruncation(max_len),
                TokenIndexing(tokenizer.convert_tokens_to_ids, TaskDataset.labels, output_mode, max_len)]
    train_data_set = TaskDataset(train_data_file, pipeline)
    eval_data_set = TaskDataset(eval_data_file, pipeline)
    train_data_iter = DataLoader(train_data_set, batch_size=train_cfg.batch_size, shuffle=True)
    eval_data_iter = DataLoader(eval_data_set, batch_size=train_cfg.batch_size, shuffle=False)

    generator = ElectraForSequenceClassification.from_pretrained('google/electra-small-generator')
    t_discriminator = ElectraForSequenceClassification.from_pretrained('google/electra-base-discriminator')
    s_discriminator = QuantizedElectraForSequenceClassification if quantize else ElectraForSequenceClassification
    s_discriminator = s_discriminator.from_pretrained('google/electra-small-discriminator', config=model_cfg)
    model = DistillElectraForSequenceClassification(generator, t_discriminator, s_discriminator, model_cfg)

    optimizer = optim.optim4GPU(train_cfg, model)
    writer = SummaryWriter(log_dir=log_dir)  # for tensorboardX

    base_trainer_args = (train_cfg, model_cfg, model, train_data_iter, eval_data_iter,
                         optimizer, save_dir, get_device())
    trainer = QuantizedDistillElectraTrainer(task_name, output_mode, distill, gradually_distill,
                                             imitate_tinybert, pred_distill, len(TaskDataset.labels),
                                             writer, *base_trainer_args)
    trainer.train(model_file, None, data_parallel)
    trainer.eval(model_file, data_parallel)