예제 #1
0
    def __init__(self, args):
        """Set up MLM pretraining: configs, tokenizer, data pipeline, model,
        optimizer, trainer and the tensorboard writer.

        Args:
            args: parsed command-line namespace; must provide train_cfg,
                model_cfg, vocab, data_file, max_pred, mask_prob, mask_alpha,
                mask_beta, max_gram, save_dir, log_dir and name.
        """
        self.args = args
        cfg = train.Config.from_json(args.train_cfg)
        model_cfg = models.Config.from_json(args.model_cfg)
        set_seeds(cfg.seed)  # reproducibility

        tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab,
                                               do_lower_case=True)

        def tokenize(text):
            # Normalize to unicode first, then run WordPiece tokenization
            # (was a lambda; a def is the idiomatic named callable).
            return tokenizer.tokenize(tokenizer.convert_to_unicode(text))

        pipeline = [
            Preprocess4Pretrain(args.max_pred, args.mask_prob,
                                list(tokenizer.vocab.keys()),
                                tokenizer.convert_tokens_to_ids,
                                model_cfg.max_len, args.mask_alpha,
                                args.mask_beta, args.max_gram)
        ]
        data_iter = DataLoader(SentPairDataset(args.data_file,
                                               cfg.batch_size,
                                               tokenize,
                                               model_cfg.max_len,
                                               pipeline=pipeline),
                               batch_size=cfg.batch_size,
                               collate_fn=seq_collate,
                               num_workers=mp.cpu_count())

        model = Generator(model_cfg)

        self.optimizer = optim.optim4GPU(cfg, model)
        self.trainer = train.MLMTrainer(cfg, model, data_iter, self.optimizer,
                                        args.save_dir, get_device())

        # Compute the log path once (it was joined twice in the original).
        log_path = os.path.join(args.log_dir, args.name)
        os.makedirs(log_path, exist_ok=True)
        self.writer = SummaryWriter(log_dir=log_path)  # for tensorboardX
예제 #2
0
def main(task='mrpc',
         train_cfg='config/train_mrpc.json',
         model_cfg='config/bert_base.json',
         data_file='../glue/MRPC/train.tsv',
         model_file=None,
         pretrain_file='../uncased_L-12_H-768_A-12/bert_model.ckpt',
         data_parallel=True,
         vocab='../uncased_L-12_H-768_A-12/vocab.txt',
         save_dir='../exp/bert/mrpc',
         max_len=128,
         mode='train'):
    """Fine-tune or evaluate a BERT classifier on a GLUE-style task.

    Args:
        task: task name used to look up the dataset class (e.g. 'mrpc').
        train_cfg / model_cfg: paths to JSON config files.
        data_file: TSV file with the task data.
        model_file: checkpoint to resume from or evaluate (None = fresh).
        pretrain_file: pretrained BERT checkpoint used when training.
        data_parallel: enable DataParallel in the trainer.
        vocab: WordPiece vocabulary file.
        save_dir: directory for saved checkpoints.
        max_len: maximum sequence length including special tokens.
        mode: 'train' or 'eval'.

    Raises:
        ValueError: if mode is neither 'train' nor 'eval' (previously an
            unknown mode returned silently, hiding typos).
    """
    cfg = train.Config.from_json(train_cfg)
    model_cfg = models.Config.from_json(model_cfg)

    set_seeds(cfg.seed)  # reproducibility

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab,
                                           do_lower_case=True)
    TaskDataset = dataset_class(
        task)  # task dataset class according to the task
    # Text -> tokens -> [CLS]/[SEP]-framed, truncated -> padded id tensors.
    pipeline = [
        Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
        AddSpecialTokensWithTruncation(max_len),
        TokenIndexing(tokenizer.convert_tokens_to_ids, TaskDataset.labels,
                      max_len)
    ]
    dataset = TaskDataset(data_file, pipeline)
    data_iter = DataLoader(dataset, batch_size=cfg.batch_size, shuffle=True)

    model = Classifier(model_cfg, len(TaskDataset.labels))
    criterion = nn.CrossEntropyLoss()

    trainer = train.Trainer(cfg, model, data_iter, optim.optim4GPU(cfg, model),
                            save_dir, get_device())

    if mode == 'train':

        def get_loss(model, batch,
                     global_step):  # make sure loss is a scalar tensor
            input_ids, segment_ids, input_mask, label_id = batch
            logits = model(input_ids, segment_ids, input_mask)
            return criterion(logits, label_id)

        trainer.train(get_loss, model_file, pretrain_file, data_parallel)

    elif mode == 'eval':

        def evaluate(model, batch):
            # Returns (batch accuracy, per-example 0/1 correctness tensor).
            input_ids, segment_ids, input_mask, label_id = batch
            logits = model(input_ids, segment_ids, input_mask)
            _, label_pred = logits.max(1)
            result = (label_pred == label_id).float()
            accuracy = result.mean()
            return accuracy, result

        results = trainer.eval(evaluate, model_file, data_parallel)
        total_accuracy = torch.cat(results).mean().item()
        print('Accuracy:', total_accuracy)

    else:
        raise ValueError(f"unknown mode: {mode!r} (expected 'train' or 'eval')")
예제 #3
0
def main(config='config/finetune/agnews/train.json'):
    """Fine-tune or evaluate a transformer classifier, driven by JSON configs.

    The top-level config file points at three sub-configs (data, model,
    optimizer); cfg.mode selects the 'train' or 'eval' branch.
    """

    def _load_json(path):
        # Use a with-block so the handle is closed promptly; the original
        # json.load(open(...)) calls leaked open file handles.
        with open(path, "r") as f:
            return json.load(f)

    cfg = Config(**_load_json(config))

    cfg_data = data.Config(**_load_json(cfg.cfg_data))
    cfg_model = models.Config(**_load_json(cfg.cfg_model))
    cfg_optim = trainer.Config(**_load_json(cfg.cfg_optim))

    set_seeds(cfg.seed)  # reproducibility

    TaskDataset = data.get_class(
        cfg_data.task)  # task dataset class according to the task
    tokenizer = tokenization.FullTokenizer(vocab_file=cfg_data.vocab_file,
                                           do_lower_case=True)
    # Raw text -> cleaned -> tokens -> [CLS]/[SEP]-framed -> padded id tensors.
    dataset = TaskDataset(
        cfg_data.data_file[cfg.mode],
        pipelines=[
            data.RemoveSymbols('\\'),
            data.Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
            data.AddSpecialTokensWithTruncation(cfg_data.max_len),
            data.TokenIndexing(tokenizer.convert_tokens_to_ids,
                               TaskDataset.labels, cfg_data.max_len)
        ],
        n_data=None)
    dataset = TensorDataset(*dataset.get_tensors())  # To Tensors
    data_iter = DataLoader(dataset,
                           batch_size=cfg_optim.batch_size,
                           shuffle=True)

    classifier = models.Classifier4Transformer(cfg_model,
                                               len(TaskDataset.labels))
    optimizer = optim.optim4GPU(cfg_optim, classifier)

    train_loop = trainer.TrainLoop(cfg_optim, classifier, data_iter, optimizer,
                                   cfg.save_dir, get_device())

    def get_loss(model, batch,
                 global_step):  # make sure loss is a scalar tensor
        input_ids, segment_ids, input_mask, label_id = batch
        logits = model(input_ids, segment_ids, input_mask)
        return nn.CrossEntropyLoss()(logits, label_id)

    def evaluate(model, batch):
        # Returns (batch accuracy, per-example 0/1 correctness tensor).
        input_ids, segment_ids, input_mask, label_id = batch
        logits = model(input_ids, segment_ids, input_mask)
        _, label_pred = logits.max(1)
        result = (label_pred == label_id).float()
        accuracy = result.mean()
        return accuracy, result

    if cfg.mode == "train":
        train_loop.train(get_loss, cfg.model_file, cfg.pretrain_file)
        print("Training has been done properly.")

    elif cfg.mode == "eval":
        results = train_loop.eval(evaluate, cfg.model_file)
        total_accuracy = torch.cat(results).mean().item()
        print(f"Accuracy: {total_accuracy}")
예제 #4
0
def main(train_cfg='config/bert_pretrain.json',
         model_cfg='config/bert_base.json',
         data_file='../tbc/books_large_all.txt',
         model_file=None,
         data_parallel=True,
         vocab='../uncased_L-12_H-768_A-12/vocab.txt',
         save_dir='../exp/bert/pretrain',
         log_dir='../exp/bert/pretrain/runs',
         max_len=512,
         max_pred=20,
         mask_prob=0.15):
    """Pretrain BERT with masked-LM and next-sentence objectives.

    Args:
        train_cfg / model_cfg: paths to JSON config files.
        data_file: plain-text corpus of sentence pairs.
        model_file: checkpoint to resume from (None = fresh).
        data_parallel: enable DataParallel in the trainer.
        vocab: WordPiece vocabulary file.
        save_dir: directory for saved checkpoints.
        log_dir: tensorboard log directory.
        max_len: maximum sequence length.
        max_pred: maximum number of masked positions per sequence.
        mask_prob: probability of masking each token.
    """
    train_cfg = BertTrainConfig.from_json(train_cfg)
    model_cfg = BertModelConfig.from_json(model_cfg)

    set_seeds(train_cfg.seed)  # reproducibility

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab, do_lower_case=True)

    def tokenize(text):
        # Normalize to unicode first, then run WordPiece tokenization
        # (was a lambda; a def is the idiomatic named callable).
        return tokenizer.tokenize(tokenizer.convert_to_unicode(text))

    pipeline = [Preprocess4Pretrain(max_pred,
                                    mask_prob,
                                    list(tokenizer.vocab.keys()),
                                    tokenizer.convert_tokens_to_ids,
                                    max_len)]
    data_iter = SentPairDataLoader(data_file,
                                   train_cfg.batch_size,
                                   tokenize,
                                   max_len,
                                   pipeline=pipeline)

    model = BertModel4Pretrain(model_cfg)
    criterion1 = nn.CrossEntropyLoss(reduction='none')  # per-token masked-LM loss
    criterion2 = nn.CrossEntropyLoss()                  # next-sentence loss

    optimizer = optim.optim4GPU(train_cfg, model)
    trainer = train.Trainer(train_cfg, model_cfg, model, data_iter, optimizer, save_dir, get_device())

    writer = SummaryWriter(log_dir=log_dir)  # for tensorboardX

    def get_loss(model, batch, global_step, train_cfg, model_cfg):  # make sure loss is tensor
        # NOTE(review): train_cfg/model_cfg shadow the enclosing values and are
        # unused here — presumably this Trainer passes them when calling;
        # confirm against train.Trainer before changing the signature.
        input_ids, segment_ids, input_mask, masked_ids, masked_pos, masked_weights, is_next = batch

        logits_lm, logits_clsf = model(input_ids, segment_ids, input_mask, masked_pos)
        loss_lm = criterion1(logits_lm.transpose(1, 2), masked_ids)  # for masked LM
        # Zero out padded mask slots before averaging.
        loss_lm = (loss_lm * masked_weights.float()).mean()
        loss_clsf = criterion2(logits_clsf, is_next)  # for sentence classification
        writer.add_scalars('data/scalar_group',
                           {'loss_lm': loss_lm.item(),
                            'loss_clsf': loss_clsf.item(),
                            'loss_total': (loss_lm + loss_clsf).item(),
                            'lr': optimizer.get_lr()[0],
                           },
                           global_step)
        return loss_lm + loss_clsf

    trainer.train(get_loss, model_file, None, data_parallel)
예제 #5
0
def main(train_cfg='config/electra_pretrain.json',
         model_cfg='config/electra_small.json',
         data_file='../tbc/books_large_all.txt',
         model_file=None,
         data_parallel=True,
         vocab='../uncased_L-12_H-768_A-12/vocab.txt',
         log_dir='../exp/electra/pretrain/runs',
         save_dir='../exp/electra/pretrain',
         max_len=128,
         max_pred=20,
         mask_prob=0.15,
         quantize=False):
    """Pretrain a (optionally quantized) distilled ELECTRA discriminator.

    A pretrained generator and a base-size teacher discriminator drive
    distillation into a small student discriminator; `quantize` selects the
    quantized student variant.
    """
    check_dirs_exist([log_dir, save_dir])

    train_cfg = ElectraConfig().from_json_file(train_cfg)
    model_cfg = ElectraConfig().from_json_file(model_cfg)

    set_seeds(train_cfg.seed)  # reproducibility

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab,
                                           do_lower_case=True)

    def tokenize(text):
        # Normalize to unicode first, then run WordPiece tokenization.
        return tokenizer.tokenize(tokenizer.convert_to_unicode(text))

    pipeline = [
        Preprocess4Pretrain(max_pred, mask_prob, list(tokenizer.vocab.keys()),
                            tokenizer.convert_tokens_to_ids, max_len)
    ]

    data_iter = SentPairDataLoader(data_file,
                                   train_cfg.batch_size,
                                   tokenize,
                                   max_len,
                                   pipeline=pipeline)

    # Get distilled-electra and quantized-distilled-electra
    generator = ElectraForMaskedLM.from_pretrained(
        'google/electra-small-generator')
    t_discriminator = ElectraForPreTraining.from_pretrained(
        'google/electra-base-discriminator')
    # Select the student CLASS, then load weights once via the classmethod.
    # (The original instantiated QuantizedElectraForPreTraining(model_cfg)
    # only to call from_pretrained on the throwaway instance.)
    student_cls = (QuantizedElectraForPreTraining if quantize
                   else ElectraForPreTraining)
    s_discriminator = student_cls.from_pretrained(
        'google/electra-small-discriminator', config=model_cfg)  # model
    # config is used for model "QuantizedElectraForPreTraining"
    model = DistillElectraForPreTraining(generator, t_discriminator,
                                         s_discriminator, model_cfg)

    optimizer = optim.optim4GPU(train_cfg, model)
    writer = SummaryWriter(log_dir=log_dir)  # for tensorboardX

    base_trainer_args = (train_cfg, model_cfg, model, data_iter, None,
                         optimizer, save_dir, get_device())
    trainer = QuantizedDistillElectraTrainer(writer, *base_trainer_args)
    trainer.train(model_file, None, data_parallel)
    trainer._eval()
예제 #6
0
def main(task='mrpc',
         base_train_cfg='config/QDElectra_pretrain.json',
         train_cfg='config/train_mrpc.json',
         model_cfg='config/QDElectra_base.json',
         data_file='../glue/MRPC/train.tsv',
         model_file=None,
         data_parallel=True,
         vocab='../uncased_L-12_H-768_A-12/vocab.txt',
         log_dir='../exp/electra/pretrain/runs',
         save_dir='../exp/bert/mrpc',
         mode='train',
         pred_distill=True):
    """Train or evaluate a quantized-distilled ELECTRA classifier on a GLUE task.

    The base pretraining config is overlaid with the task-specific training
    config (task values win on key collisions) before parsing into an
    ElectraConfig.
    """
    # Merge base + task training configs; the task config overrides.
    train_cfg_dict = json.load(open(base_train_cfg, "r"))
    train_cfg_dict.update(json.load(open(train_cfg, "r")))
    train_cfg = ElectraConfig().from_dict(train_cfg_dict)
    # train_cfg = ElectraConfig().from_json_file(train_cfg)
    model_cfg = ElectraConfig().from_json_file(model_cfg)
    # Task decides the output mode, epoch count and max sequence length.
    output_mode, train_cfg.n_epochs, max_len = get_task_params(task)
    set_seeds(train_cfg.seed)  # reproducibility

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab, do_lower_case=True)
    TaskDataset = dataset_class(task) # task dataset class according to the task
    num_labels = len(TaskDataset.labels)
    # Text -> tokens -> [CLS]/[SEP]-framed, truncated -> padded id tensors.
    pipeline = [
        Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
        AddSpecialTokensWithTruncation(max_len),
        TokenIndexing(tokenizer.convert_tokens_to_ids, TaskDataset.labels, output_mode, max_len)
    ]
    data_set = TaskDataset(data_file, pipeline)
    data_iter = DataLoader(data_set, batch_size=train_cfg.batch_size, shuffle=True)

    # Teacher: full-size pretrained discriminator; student: quantized small one.
    t_discriminator = ElectraForSequenceClassification.from_pretrained(
        'google/electra-base-discriminator'
    )
    s_discriminator = QuantizedElectraForSequenceClassification.from_pretrained(
        'google/electra-small-discriminator', config=model_cfg
    )
    model = DistillElectraForSequenceClassification(t_discriminator, s_discriminator, model_cfg)

    optimizer = optim.optim4GPU(train_cfg, model)
    writer = SummaryWriter(log_dir=log_dir) # for tensorboardX

    base_trainer_args = (train_cfg, model_cfg, model, data_iter, optimizer, save_dir, get_device())
    trainer = QuantizedDistillElectraTrainer(writer, *base_trainer_args)

    if mode == 'train':
        trainer.train(model_file, None, data_parallel)
    elif mode == 'eval':
        # NOTE(review): TokenIndexing(...) constructs a pipeline step, yet it is
        # unpacked here into four tensors and never applied to any data — this
        # looks broken (it would fail or yield garbage at runtime). Confirm the
        # intended eval-data path against the original project before relying
        # on this branch.
        input_ids, attention_mask, token_type_ids, label_ids = TokenIndexing(tokenizer.convert_tokens_to_ids,
                                                                            TaskDataset.labels,
                                                                            output_mode,
                                                                            max_len)
        _, eval_labels = get_tensor_data(output_mode, input_ids, attention_mask, token_type_ids, label_ids)
        results = trainer.eval(model_file, output_mode, eval_labels, num_labels, data_parallel)
        total_accuracy = torch.cat(results).mean().item()
        print('Accuracy:', total_accuracy)
예제 #7
0
def main(args):
    """Pretrain BERT with masked-LM and sentence-order objectives.

    Args:
        args: parsed command-line namespace; must provide train_cfg,
            model_cfg, vocab, data_file, max_pred, mask_prob, mask_alpha,
            mask_beta, max_gram, save_dir and log_dir.
    """
    cfg = train.Config.from_json(args.train_cfg)
    model_cfg = models.Config.from_json(args.model_cfg)

    set_seeds(cfg.seed)  # reproducibility

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab,
                                           do_lower_case=True)

    def tokenize(text):
        # Normalize to unicode first, then run WordPiece tokenization
        # (was a lambda; a def is the idiomatic named callable).
        return tokenizer.tokenize(tokenizer.convert_to_unicode(text))

    pipeline = [
        Preprocess4Pretrain(args.max_pred, args.mask_prob,
                            list(tokenizer.vocab.keys()),
                            tokenizer.convert_tokens_to_ids, model_cfg.max_len,
                            args.mask_alpha, args.mask_beta, args.max_gram)
    ]
    data_iter = SentPairDataLoader(args.data_file,
                                   cfg.batch_size,
                                   tokenize,
                                   model_cfg.max_len,
                                   pipeline=pipeline)

    model = BertModel4Pretrain(model_cfg)
    criterion1 = nn.CrossEntropyLoss(reduction='none')  # per-token masked-LM loss
    criterion2 = nn.CrossEntropyLoss()                  # sentence-order loss

    optimizer = optim.optim4GPU(cfg, model)
    trainer = train.Trainer(cfg, model, data_iter, optimizer, args.save_dir,
                            get_device())

    writer = SummaryWriter(log_dir=args.log_dir)  # for tensorboardX

    def get_loss(model, batch, global_step):  # make sure loss is tensor
        input_ids, segment_ids, input_mask, masked_ids, masked_pos, masked_weights, is_next = batch

        logits_lm, logits_clsf = model(input_ids, segment_ids, input_mask,
                                       masked_pos)
        loss_lm = criterion1(logits_lm.transpose(1, 2),
                             masked_ids)  # for masked LM
        # Zero out padded mask slots before averaging.
        loss_lm = (loss_lm * masked_weights.float()).mean()
        loss_sop = criterion2(logits_clsf,
                              is_next)  # for sentence classification
        writer.add_scalars(
            'data/scalar_group', {
                'loss_lm': loss_lm.item(),
                'loss_sop': loss_sop.item(),
                'loss_total': (loss_lm + loss_sop).item(),
                'lr': optimizer.get_lr()[0],
            }, global_step)
        return loss_lm + loss_sop

    trainer.train(get_loss, model_file=None, data_parallel=True)
예제 #8
0
def main(args):
    """Pretrain BERT with masked-LM and sentence-order objectives.

    Bug fixed: the original `get_loss` referenced a `writer` that was never
    created in this scope, so the first training step raised NameError; the
    writer is now created here as the sibling pretraining scripts do.

    Args:
        args: parsed command-line namespace; must provide train_cfg,
            model_cfg, vocab, data_file, save_dir, log_dir, plus whatever
            Preprocess4Pretrain reads from it.
    """
    cfg = train.Config.from_json(args.train_cfg)
    model_cfg = models.Config.from_json(args.model_cfg)
    # NOTE(review): unlike the sibling scripts, no set_seeds() call here —
    # runs are not reproducible; confirm whether that is intentional.

    tokenizer = tokenization.FullTokenizer(vocab_file=args.vocab,
                                           do_lower_case=True)

    def tokenize(text):
        # Normalize to unicode first, then run WordPiece tokenization
        # (was a lambda; a def is the idiomatic named callable).
        return tokenizer.tokenize(tokenizer.convert_to_unicode(text))

    pipeline = [Preprocess4Pretrain(args)]
    data_iter = SentPairDataLoader(args.data_file,
                                   cfg.batch_size,
                                   tokenize,
                                   model_cfg.max_len,
                                   pipeline=pipeline)

    model = BertModel4Pretrain(model_cfg)
    criterion1 = nn.CrossEntropyLoss(reduction='none')  # per-token masked-LM loss
    criterion2 = nn.CrossEntropyLoss()                  # sentence-order loss

    optimizer = optim.optim4GPU(cfg, model)
    trainer = train.Trainer(cfg, model, data_iter, optimizer, args.save_dir,
                            get_device())

    # Fix: create the tensorboard writer used by get_loss below
    # (previously undefined -> NameError on the first step).
    writer = SummaryWriter(log_dir=args.log_dir)  # for tensorboardX

    def get_loss(model, batch, global_step):  # make sure loss is tensor
        input_ids, segment_ids, input_mask, masked_ids, masked_pos, masked_weights, is_next = batch
        # input_ids      : token ids of the full (masked) sequence
        # segment_ids    : sentence-A/B segment ids (0/1)
        # input_mask     : 1 for real tokens, 0 for zero-padding
        # masked_ids     : original ids of the masked tokens (zero-padded)
        # masked_pos     : positions of the masked tokens
        # masked_weights : 1 per real masked token, 0 for padding
        # is_next        : boolean label produced during instance creation
        logits_lm, logits_clsf = model(input_ids, segment_ids, input_mask,
                                       masked_pos)
        # logits_lm   : [B, mS, V]
        # logits_clsf : [B, 1, 2]
        loss_lm = criterion1(logits_lm.transpose(1, 2),
                             masked_ids)  # for masked LM
        # Zero out padded mask slots before averaging.
        loss_lm = (loss_lm * masked_weights.float()).mean()
        loss_sop = criterion2(logits_clsf,
                              is_next)  # for sentence classification
        writer.add_scalars(
            'data/scalar_group', {
                'loss_lm': loss_lm.item(),
                'loss_sop': loss_sop.item(),
                'loss_total': (loss_lm + loss_sop).item(),
                'lr': optimizer.get_lr()[0],
            }, global_step)
        return loss_lm + loss_sop

    trainer.train(get_loss, model_file=None, data_parallel=True)
예제 #9
0
File: train.py  Project: HiitLee/SALNet
    def train(self, model_file, pretrain_file, get_loss_CNN, get_loss_Attn_LSTM, evalute_CNN_SSL, pseudo_labeling,evalute_Attn_LSTM,evalute_CNN,evalute_Attn_LSTM_SSL, generating_lexiocn, data_parallel=False):
     
        """ Train Loop """
        self.model.train() # train mode
        self.load3(model_file, pretrain_file)
        
        
        self.model2.train() # train mode
        model = self.model.to(self.device)
        model2 = self.model2.to(self.device)
        t =  self.kkk
        
        if(self.dataName == 'IMDB'):
            rnn_save_name = "./IMDB_model_save/checkpoint_RNN"+str(t)+".pt"
            cnn_save_name = "./IMDB_model_save/checkpoint_CNN"+str(t)+".pt"
            result_name = "./result/result_IMDB.txt"
            pseudo_name = "./result/pseudo_train_set_IMDB.txt"
        elif(self.dataName == "AGNews"):
            rnn_save_name = "./AGNews_model_save/checkpoint_RNN"+str(t)+".pt"
            cnn_save_name = "./AGNews_model_save/checkpoint_CNN"+str(t)+".pt"
            result_name = "./result/result_AGNews.txt"
            pseudo_name = "./result/pseudo_train_set_AGNews.txt"
        elif(self.dataName == "DBpedia"):
            rnn_save_name = "./DBpedia_model_save/checkpoint_RNN"+str(t)+".pt"
            cnn_save_name = "./DBpedia_model_save/checkpoint_CNN"+str(t)+".pt"
            result_name = "./result/result_DBpedia.txt"
            pseudo_name = "./result/pseudo_train_set_DBpedia.txt"
        elif(self.dataName == "yahoo"):
            rnn_save_name = "./yahoo_model_save/checkpoint_RNN"+str(t)+".pt"
            cnn_save_name = "./yahoo_model_save/checkpoint_CNN"+str(t)+".pt"
            result_name = "./result/result_yahoo.txt"
            pseudo_name = "./result/pseudo_train_set_yahoo.txt"

        
        
        num_a=0
        global_step = 0 # global iteration steps regardless of epochs
        global_step3 = 0

        before = -50
        curTemp=0
        print("self.cfg.n_epochs#:", self.cfg.n_epochs)
        ddf = open(result_name,'a', encoding='UTF8')
        ddf.write("############################################"+str(t)+": ramdom_samplimg###########################################"+'\n')
        ddf.close()
        
        ddf = open(pseudo_name,'a', encoding='UTF8')
        ddf.write("############################################"+str(t)+": ramdom_samplimg###########################################"+'\n')
        ddf.close()
                
        for e in range(self.cfg.n_epochs):
            if(e==0):
                temp=987654321
                early_stopping = EarlyStopping(patience=10, verbose=True)
                valid_losses = []
                
                while(1):
                    self.optimizer = optim.optim4GPU(self.cfg, model, len(self.data_iter3_b))
                    global_step = 0 # global iteration steps regardless of epochs
                    global_step3 = 0
                    loss_sum = 0. # the sum of iteration losses to get average loss in every epoch
                    iter_bar = tqdm(self.data_iter3_b, desc='Iter (loss=X.XXX)')
                    model.train()
                    for i, batch in enumerate(iter_bar):
                        batch = [t.to(self.device) for t in batch]

                        self.optimizer.zero_grad()
                        loss = get_loss_CNN(model, batch, global_step).mean() # mean() for Data Parallelism
                        loss.backward()
                        self.optimizer.step()

                        global_step += 1
                        loss_sum += loss.item()
                        iter_bar.set_description('Iter (loss=%5.3f)'%loss.item())



                    print('Epoch %d/%d : Average Loss %5.3f'%(e+1, self.cfg.n_epochs, loss_sum/(i+1)))
                    model.eval()# evaluation mode

                    loss_sum = 0.
                    global_step3 = 0
                    iter_bar_dev = tqdm(self.dataset_dev_b, desc='Iter (loss=X.XXX)')
                    self.optimizer = optim.optim4GPU(self.cfg, model, len(self.dataset_dev_b))
            
                    for i, batch in enumerate(iter_bar_dev):
                        batch = [t.to(self.device) for t in batch]
                        loss = get_loss_CNN(model, batch,global_step3).mean() # mean() for Data Parallelism
                        valid_losses.append(loss.item())
                        global_step3 += 1
                        loss_sum += loss.item()
                        iter_bar_dev.set_description('Iter (loss=%5.3f)'%loss.item())



                    print('Epoch %d/%d : Average Loss %5.3f'%(e+1, self.cfg.n_epochs, loss_sum/(i+1)))

                    valid_loss = np.average(valid_losses)
                    loss_min=early_stopping(valid_loss, model,"./model_save/checkpoint_BERT_real.pt")
                    valid_losses = []

                    if early_stopping.early_stop:
                        print("Early stopping")
                        break

 
                        
                model.load_state_dict(torch.load("./model_save/checkpoint_BERT_real.pt"))
                print("Early stopping")
                model.eval()# evaluation mode
                
                p=[]
                l=[]
                p3=[]
                p2=[]
                iter_bar = tqdm(self.data_iter2_b, desc='Iter (f1-score=X.XXX)')
                for batch in iter_bar:
                    batch = [t.to(self.device) for t in batch]
                    with torch.no_grad(): # evaluation without gradient calculation
                        label_id, y_pred1 = evalute_CNN(model, batch) # accuracy to print
                        softmax = nn.Softmax()
                        y_pred3 = softmax(y_pred1)
                        #print("y_pred3#:", y_pred3)
                        y_pred33, y_pred1 = torch.max(y_pred3, 1)
                        print(y_pred1)
                        p2.append(np.ndarray.flatten(y_pred3[:, 1].data.cpu().numpy()))
                        p.append(np.ndarray.flatten(y_pred1.data.cpu().numpy()))
                        l.append(np.ndarray.flatten(label_id.data.cpu().numpy()))
                    result2  = 0
                    iter_bar.set_description('Iter(roc=%5.3f)'%result2)
                p2 = [item for sublist in p2 for item in sublist]
                p = [item for sublist in p for item in sublist]
                l = [item for sublist in l for item in sublist]
                p=np.array(p)
                l=np.array(l)
                F1score = f1_score(l,p,average='micro')
                accur = accuracy_score(l,p)
                ddf = open(result_name,'a', encoding='UTF8')
                ddf.write(str(t)+": "+ str(num_a)+"aucr: "+str(accur)+"f1-score: "+str(F1score)+'\n')
                ddf.close()
                num_a+=1
                
  
                temp=987654321
                early_stopping = EarlyStopping(patience=30, verbose=True)
                valid_losses = []
                while(1):
                    model2.train()
                    loss_sum = 0
                    global_step3 = 0
                    iter_bar3 = tqdm(self.data_iter3, desc='Iter (loss=X.XXX)')
                    for i, batch in enumerate(iter_bar3):
                        batch = [t.to(self.device) for t in batch]
                        loss = get_loss_Attn_LSTM(model2, batch, global_step3).mean() # mean() for Data Parallelism
                        self.optimizer2.zero_grad()
                        loss.backward()
                        self.optimizer2.step()
                        global_step3 += 1
                        loss_sum += loss.item()
                        iter_bar3.set_description('Iter (loss=%5.3f)'%loss.item())

                        if global_step3 % self.cfg.save_steps == 0: # save
                            self.save(global_step3)

                        if self.cfg.total_steps and self.cfg.total_steps < global_step3:
                            print('Epoch %d/%d : Average Loss %5.3f'%(e+1, self.cfg.n_epochs, loss_sum/(i+1)))
                            print('The Total Steps have been reached.')
                            self.save(global_step3) # save and finish when global_steps reach total_steps
                            return
                        
                    print('Epoch %d/%d : Average Loss %5.3f'%(e+1, self.cfg.n_epochs, loss_sum/(i+1)))
                    model2.eval()
                    loss_sum = 0.
                    global_step3 = 0
                    iter_bar_dev = tqdm(self.dataset_dev, desc='Iter (loss=X.XXX)')
                    for i, batch in enumerate(iter_bar_dev):
                        batch = [t.to(self.device) for t in batch]
                        loss = get_loss_Attn_LSTM(model2, batch, global_step3).mean() # mean() for Data Parallelism
                        valid_losses.append(loss.item())
                        global_step3 += 1
                        loss_sum += loss.item()
                        iter_bar_dev.set_description('Iter (loss=%5.3f)'%loss.item())

                        if global_step3 % self.cfg.save_steps == 0: # save
                            self.save(global_step3)

                        if self.cfg.total_steps and self.cfg.total_steps < global_step3:
                            print('Epoch %d/%d : Average Loss %5.3f'%(e+1, self.cfg.n_epochs, loss_sum/(i+1)))
                            print('The Total Steps have been reached.')
                            self.save(global_step3) # save and finish when global_steps reach total_steps
                            return

                    print('Epoch %d/%d : Average Loss %5.3f'%(e+1, self.cfg.n_epochs, loss_sum/(i+1)))
                    valid_loss = np.average(valid_losses)
                    loss_min=early_stopping(valid_loss, model2,"./model_save/checkpoint_LSTM_real.pt")
                    valid_losses = []
                    if early_stopping.early_stop:
                        print("Early stopping")
                        break

                
                model2.eval()
                p=[]
                l=[]
                p3=[]
                iter_bar4 = tqdm(self.data_iter2, desc='Iter (f1-score=X.XXX)')
                global_step3=0
                for batch in iter_bar4:
                    batch = [t.to(self.device) for t in batch]
                    with torch.no_grad(): # evaluation without gradient calculation
                        label_id, y_pred1 = evalute_Attn_LSTM(model2, batch, global_step3,len(iter_bar4))# accuracy to print
                        _, y_pred3 = y_pred1.max(1)
                        global_step3+=1
                        p2=[]
                        l2=[]
                        for i in range(0,len(y_pred3)):
                            p3.append(np.ndarray.flatten(y_pred3[i].data.cpu().numpy()))
                            l.append(np.ndarray.flatten(label_id[i].data.cpu().numpy()))
                            p2.append(np.ndarray.flatten(y_pred3[i].data.cpu().numpy()))
                            l2.append(np.ndarray.flatten(label_id[i].data.cpu().numpy()))
                    p2 = [item for sublist in p2 for item in sublist]
                    l2 = [item for sublist in l2 for item in sublist]
                    result2  = f1_score(l2, p2,average='micro')
                    iter_bar4.set_description('Iter(roc=%5.3f)'%result2)
                p3 = [item for sublist in p3 for item in sublist]
                l = [item for sublist in l for item in sublist]
                p=np.array(p)
                l=np.array(l)
                results2  = accuracy_score(l, p3)
                F1score = f1_score(l,p3,average='micro')
                ddf = open(result_name,'a', encoding='UTF8')
                ddf.write(str(t)+": "+str(num_a)+"aucr: "+str(results2)+"f1-score: "+str(F1score)+'\n')
                ddf.close()
                num_a+=1
                

                
            elif(e%2==1):
                global_step1 = 0
                model2.eval()
                labell=[]
                iter_bar = tqdm(self.data_iter, desc='Iter (loss=X.XXX)')
                for batch in iter_bar:
                    batch = [t.to(self.device) for t in batch]
                    with torch.no_grad(): # evaluation without gradient calculation
                        label_id, y_pred1 = generating_lexiocn(model2, batch,global_step1,len(iter_bar),e) # accuracy to print
                        global_step1+=1
                        
                        
                        
                        
                global_step1 = 0
                model.eval()
                labell=[]
                iter_bar = tqdm(self.data_iter_b, desc='Iter (loss=X.XXX)')
                for batch in iter_bar:
                    batch = [t.to(self.device) for t in batch]
                    with torch.no_grad(): # evaluation without gradient calculation
                        label_id, y_pred1 = evalute_CNN_SSL(model, batch,global_step1) # accuracy to print
                        global_step1+=1
                        
                        
                
                global_step1 = 0

                model2.eval()
                sen = []
                labell=[]
                iter_bar = tqdm(self.data_iter, desc='Iter (loss=X.XXX)')
                for batch in iter_bar:
                    batch = [t.to(self.device) for t in batch]
                    with torch.no_grad(): # evaluation without gradient calculation
                        label_id, y_pred1,result_label,result3,data_temp, data_temp_b, data_iter_temp_na, data_iter_temp_na_b = pseudo_labeling( model2,batch,global_step1,len(iter_bar),e) # accuracy to print
                        global_step1+=1
        
                self.data_iter_temp = data_temp
                self.data_iter_temp_b = data_temp_b
                self.data_iter = data_iter_temp_na
                self.data_iter_b = data_iter_temp_na_b
                #print(result3)
                num_good=0
                num_label=0
                num_label1=0
                ddf = open(pseudo_name,'a', encoding='UTF8')
                
                for i in range(0, len(result3)):
                    sen.append(result3[i])
                
                num_label=0
                num_label1=0
                num_good = 0
                for i in range(0, len(result3)):
                    if(result3[i] != -1):
                        num_good +=1
                        if(result3[i] == result_label[i]):
                            num_label+=1
                
                ddf.write(str(t)+"  " +"number of good :"+str(num_good)+" ")
                ddf.write("number of label :"+str(num_label)+" ")
                ddf.write("\n")
                ddf.close()
                print("num_good#:", num_good)
                print("before#:", before)
                if(num_good  < self.stopNum):
                    curTemp+=1
                else:
                    curTemp=0
                if(curTemp>=2):
                    break
          

                    

            elif(e%2==0 ):
                self.model.train() # train mode
                self.load3(model_file, pretrain_file)
                model = self.model.to(self.device)
        
                b=0
                early_stopping = EarlyStopping(patience=1, verbose=True)
                valid_losses = []
                bb=987654321
                
                
                while(1):
                    self.optimizer = optim.optim4GPU(self.cfg, model, len(self.data_iter_temp_b))
                    iter_bar = tqdm(self.data_iter_temp_b, desc='Iter (loss=X.XXX)')
                    model.train()
                    global_step = 0 
                    global_step3 = 0
                    valid_losses2 = []
                    for i, batch in enumerate(iter_bar):
                        batch = [t.to(self.device) for t in batch]
                        self.optimizer.zero_grad()
                        loss = get_loss_CNN(model, batch, global_step).mean() # mean() for Data Parallelism
                        valid_losses2.append(loss.item())
                        loss.backward()
                        self.optimizer.step()
                        global_step += 1
                        loss_sum += loss.item()
                        iter_bar.set_description('Iter (loss=%5.3f)'%loss.item())

                    print('Epoch %d/%d : Average Loss %5.3f'%(e+1, self.cfg.n_epochs, loss_sum/(i+1)))
                    valid_loss2 = np.average(valid_losses2)
                    bb= min(bb, valid_loss2.item())
                               
                    valid_losses2 = []
                    model.eval()# evaluation mode
                    loss_sum = 0.
                    global_step3 = 0
                    iter_bar_dev = tqdm(self.dataset_dev_b, desc='Iter (loss=X.XXX)')
                    self.optimizer = optim.optim4GPU(self.cfg, model, len(self.dataset_dev_b))
            
                    for i, batch in enumerate(iter_bar_dev):
                        batch = [t.to(self.device) for t in batch]
                        loss = get_loss_CNN(model, batch,global_step3).mean() # mean() for Data Parallelism
                        valid_losses.append(loss.item())
                        global_step3 += 1
                        loss_sum += loss.item()
                        iter_bar_dev.set_description('Iter (loss=%5.3f)'%loss.item())

                    print('Epoch %d/%d : Average Loss %5.3f'%(e+1, self.cfg.n_epochs, loss_sum/(i+1)))

                    valid_loss = np.average(valid_losses)
                    loss_min=early_stopping(valid_loss, model,cnn_save_name)
                    valid_losses = []

                    if early_stopping.early_stop:
                        print("Early stopping")
                        break
   
                model.load_state_dict(torch.load(cnn_save_name))
                model.eval()# evaluation mode
                self.model.eval()# evaluation mode
                
                p=[]
                l=[]
                p3=[]
                p2=[]
                iter_bar = tqdm(self.data_iter2_b, desc='Iter (f1-score=X.XXX)')
                for batch in iter_bar:
                    batch = [t.to(self.device) for t in batch]
                    with torch.no_grad(): # evaluation without gradient calculation
                        label_id, y_pred1 = evalute_CNN(model, batch) # accuracy to print
                        softmax = nn.Softmax()
                        y_pred3 = softmax(y_pred1)
                        y_pred33, y_pred1 = torch.max(y_pred3, 1)
                        p2.append(np.ndarray.flatten(y_pred3[:, 1].data.cpu().numpy()))
                        p.append(np.ndarray.flatten(y_pred1.data.cpu().numpy()))
                        l.append(np.ndarray.flatten(label_id.data.cpu().numpy()))
                    result2  = 0
                    iter_bar.set_description('Iter(roc=%5.3f)'%result2)
                p2 = [item for sublist in p2 for item in sublist]
                p = [item for sublist in p for item in sublist]
                l = [item for sublist in l for item in sublist]
                p=np.array(p)
                l=np.array(l)
                F1score = f1_score(l,p,average='micro')
                accur = accuracy_score(l,p)

                ddf = open(result_name,'a', encoding='UTF8')
                ddf.write(str(t)+": "+str(num_a)+"aucr: "+str(accur)+"f1-score: "+str(F1score)+'\n')
                ddf.close()
                num_a+=1
               
     
                valid_losses = []            
                temp = 987654321
                early_stopping = EarlyStopping(patience=10, verbose=True)
                while(1):
                    model2.train()
                    l=0
                    l_sum=0
                    loss_sum = 0
                    global_step3 = 0
                    iter_bar3 = tqdm(self.data_iter_temp, desc='Iter (loss=X.XXX)')
                    for i, batch in enumerate(iter_bar3):
                        batch = [t.to(self.device) for t in batch]
                        loss = get_loss_Attn_LSTM(model2, batch, global_step3).mean() # mean() for Data Parallelism
                        self.optimizer2.zero_grad()
                        loss.backward()
                        self.optimizer2.step()
                        global_step3 += 1
                        loss_sum += loss.item()
                        iter_bar3.set_description('Iter (loss=%5.3f)'%loss.item())

                     
                        
                    print('Epoch %d/%d : Average Loss %5.3f'%(e+1, self.cfg.n_epochs, loss_sum/(i+1)))
                    model2.eval()
                    loss_sum = 0.
                    global_step3 = 0
                    iter_bar_dev = tqdm(self.dataset_dev, desc='Iter (loss=X.XXX)')
                    for i, batch in enumerate(iter_bar_dev):
                        batch = [t.to(self.device) for t in batch]
                        loss = get_loss_Attn_LSTM(model2, batch, global_step3).mean() # mean() for Data Parallelism
                        valid_losses.append(loss.item())
                        global_step3 += 1
                        loss_sum += loss.item()
                        iter_bar_dev.set_description('Iter (loss=%5.3f)'%loss.item())

                       

                    print('Epoch %d/%d : Average Loss %5.3f'%(e+1, self.cfg.n_epochs, loss_sum/(i+1)))
                    valid_loss = np.average(valid_losses)
                    loss_min=early_stopping(valid_loss, model2,rnn_save_name)
                    valid_losses = []

                    if early_stopping.early_stop:
                        print("Early stopping")
                        break

                model2.load_state_dict(torch.load(rnn_save_name))   
                model2.eval()
                p=[]
                l=[]
                p3=[]
                
                iter_bar4 = tqdm(self.data_iter2, desc='Iter (f1-score=X.XXX)')
                for batch in iter_bar4:
                    batch = [t.to(self.device) for t in batch]
                    with torch.no_grad(): 
                        label_id, y_pred1 = evalute_Attn_LSTM_SSL(model2, batch) 
                        _, y_pred3 = y_pred1.max(1)
                        p2=[]
                        l2=[]
                        
                        for i in range(0,len(y_pred3)):
                            p3.append(np.ndarray.flatten(y_pred3[i].data.cpu().numpy()))
                            l.append(np.ndarray.flatten(label_id[i].data.cpu().numpy()))
                            p2.append(np.ndarray.flatten(y_pred3[i].data.cpu().numpy()))
                            l2.append(np.ndarray.flatten(label_id[i].data.cpu().numpy()))
                    p2 = [item for sublist in p2 for item in sublist]
                    l2 = [item for sublist in l2 for item in sublist]
      
                    result2  = f1_score(l2, p2,average='micro')
                    iter_bar4.set_description('Iter(roc=%5.3f)'%result2)
                p3 = [item for sublist in p3 for item in sublist]
                l = [item for sublist in l for item in sublist]
                p=np.array(p)
                l=np.array(l)
                results2  = accuracy_score(l, p3)
                F1score = f1_score(l,p3,average='micro')
                ddf = open(result_name,'a', encoding='UTF8')
                ddf.write(str(t)+": "+str(num_a)+"aucr: "+str(results2)+"f1-score: "+str(F1score)+'\n')
                ddf.close()
                num_a+=1
Example #10
0
def main(task='mrpc',
         train_cfg='./model/config/train_mrpc.json',
         model_cfg='./model/config/bert_base.json',
         data_train_file='total_data/imdbtrain.tsv',
         data_test_file='total_data/IMDB_test.tsv',
         model_file=None,
         pretrain_file='./model/uncased_L-12_H-768_A-12/bert_model.ckpt',
         data_parallel=False,
         vocab='./model/uncased_L-12_H-768_A-12/vocab.txt',
         dataName='IMDB',
         stopNum=250,
         max_len=300,
         mode='train'):

    if mode == 'train':

        def get_loss_CNN(model, batch,
                         global_step):  # make sure loss is a scalar tensor
            """Forward one batch through the CNN and return its scalar loss.

            ``criterion`` is taken from the enclosing scope; ``global_step``
            is accepted for interface parity with the other loss helpers but
            is unused here.
            """
            ids, seg, mask, gold, _lengths = batch
            scores = model(ids, seg, mask)
            return criterion(scores, gold)

        def evalute_CNN(model, batch):
            """Run the CNN on one batch and return (gold labels, raw logits).

            The logits are returned un-normalized so the caller can apply
            softmax/argmax as needed.
            """
            ids, seg, mask, gold, _lengths = batch
            scores = model(ids, seg, mask)
            return gold, scores

        def get_loss_Attn_LSTM(
                model, batch,
                global_step):  # make sure loss is a scalar tensor
            """Compute the attention-LSTM loss for one batch.

            Sequences are reordered by descending length (as required for
            packed LSTM input); ids and labels are permuted to match.
            NOTE(review): segment_ids/input_mask are NOT permuted — presumably
            the model ignores them; confirm against the model definition.
            ``embedding`` and ``criterion`` come from the enclosing scope;
            ``global_step`` is unused.
            """
            ids, seg, mask, gold, lengths = batch

            lengths, order = lengths.sort(0, descending=True)
            ids, gold = ids[order], gold[order]
            embedded = embedding(ids.long())

            scores, _attn = model(embedded.cuda(), ids, seg, mask, lengths)
            return criterion(scores, gold)

        def evalute_Attn_LSTM(model, batch, global_step, ls):
            """Evaluate the attention-LSTM on one batch.

            Sequences are sorted by descending length (packed-LSTM
            requirement) and labels are permuted to match, so the returned
            labels line up with the returned predictions.
            ``embedding`` comes from the enclosing scope; ``global_step`` and
            ``ls`` are accepted for interface parity but unused here.

            Returns:
                (label_id, probabilities): labels in the permuted order and
                class probabilities (softmax over the class dimension).
            """
            input_ids, segment_ids, input_mask, label_id, seq_lengths = batch

            seq_lengths, perm_idx = seq_lengths.sort(0, descending=True)
            input_ids = input_ids[perm_idx]
            label_id = label_id[perm_idx]
            token1 = embedding(input_ids.long())

            logits, attention_score = model(token1.cuda(), input_ids,
                                            segment_ids, input_mask,
                                            seq_lengths)
            # Explicit dim=1: logits are (batch, num_classes). The bare
            # F.softmax(logits) form is deprecated and relied on the implicit
            # dim selection.
            logits = F.softmax(logits, dim=1)

            # The previous per-example argmax here was dead code (its results
            # were never used); removed.
            return label_id, logits

        def generating_lexiocn(model2, batch, global_step, ls, e):
            """Accumulate an attention-based lexicon for each predicted class.

            Called once per batch: ``global_step`` is the batch index, ``ls``
            the total number of batches, ``e`` the epoch (unused here). On the
            first batch the shared accumulators are reset; on the last batch
            the top-scoring phrases per class are written to
            ./IMDB_Lexicon/imdbLexicon_1.txt and imdbLexicon_2.txt.

            Uses from the enclosing scope: result3, result_label, bb_11,
            bb_22, abusive_11, abusive_22, embedding, tokenizer, data0.
            Raw sentences are looked up as data0[global_step * 64 + perm_idx]
            -- assumes a fixed batch size of 64; TODO confirm against the
            DataLoader.

            Returns (label_id, softmaxed logits), like the other eval helpers.
            """
            # First batch of the pass: reset the shared accumulators.
            if (global_step == 0):
                result3.clear()
                result_label.clear()
                bb_11.clear()
                bb_22.clear()

            input_ids, segment_ids, input_mask, label_id, seq_lengths = batch

            # Sort by descending sequence length (packed-LSTM requirement);
            # ids and labels are permuted to match. segment_ids/input_mask
            # are not permuted -- presumably ignored by model2; verify.
            seq_lengths, perm_idx = seq_lengths.sort(0, descending=True)
            input_ids = input_ids[perm_idx]
            label_id = label_id[perm_idx]
            token1 = embedding(input_ids.long())
            #logits = model(input_ids, segment_ids, input_mask)
            logits2, attention_score2 = model2(token1.cuda(), input_ids,
                                               segment_ids, input_mask,
                                               seq_lengths)

            #logits=F.softmax(logits)
            # NOTE(review): implicit-dim softmax is deprecated; left as-is.
            logits = F.softmax(logits2)
            # y_pred11, y_pred1 = logits.max(1)
            # Predicted class and its confidence (taken from the RAW logits,
            # not the softmaxed probabilities) for each example.
            y_pred22, y_pred2 = logits2.max(1)
            # Best single attention position, and the top-4 positions.
            atten, attn_s1 = attention_score2.max(1)
            atte2, attn_s2 = torch.topk(attention_score2, 4)

            for i in range(0, len(input_ids)):

                # Re-tokenize the raw sentence so attention indices can be
                # mapped back to word pieces.
                split_tokens = []
                att_index = []
                for token in tokenizer.tokenize(data0[global_step * 64 +
                                                      perm_idx[i]]):
                    split_tokens.append(token)

                if (len(split_tokens) <= attn_s1[i].item()):
                    # Top attention index falls past the re-tokenized length:
                    # fall back to the best in-range position.
                    attn_index3 = attention_score2[i][:len(split_tokens) - 1]
                    attn_num, attn_index2 = attn_index3.max(0)
                    attn_index = attn_index2.item()
                else:
                    # Otherwise keep the top-4 attended positions.
                    for j in range(0, 4):
                        att_index.append(attn_s2[i][j].item())

                tok = []
                if (atten[i].item() <= 0):
                    token_ab = split_tokens[0]
                else:
                    # Gather the attended word pieces, skipping indices past
                    # the end of the tokenized sentence.
                    for j in range(0, len(att_index)):
                        if (att_index[j] >= len(split_tokens)):
                            continue
                        tok.append(split_tokens[att_index[j]])

                token_temp = data0[global_step * 64 + perm_idx[i]].split(' ')
                token2 = []
                # Strip punctuation from each attended piece and map it back
                # to the original whitespace word containing it.
                for kk in range(0, len(tok)):
                    token_ab = tok[kk]
                    token_ab = token_ab.replace(".", "")
                    token_ab = token_ab.replace(",", "")
                    token_ab = token_ab.replace("'", "")
                    token_ab = token_ab.replace("!", "")
                    token_ab = token_ab.replace("?", "")
                    token_ab = token_ab.replace("'", "")
                    token_ab = token_ab.replace('"', "")
                    # Hand-rolled stopword / boilerplate filter.
                    if (token_ab == '' or token_ab == ' ' or token_ab == ','
                            or token_ab == '.' or token_ab == 'from'
                            or token_ab == 'are' or token_ab == 'is'
                            or token_ab == 'and' or token_ab == 'with'
                            or token_ab == 'may' or token_ab == 'would'
                            or token_ab == 'could' or token_ab == 'have'
                            or token_ab == 'has' or token_ab == 'had'
                            or token_ab == 'was' or token_ab == 'were'
                            or token_ab == 'this' or token_ab == 'who'
                            or token_ab == 'that' or token_ab == 'www'
                            or token_ab == 'http' or token_ab == 'com'
                            or token_ab == 'those' or token_ab == 'your'
                            or token_ab == 'not' or token_ab == 'seem'
                            or token_ab == 'too' or token_ab == 'lol'
                            or token_ab == 'but' or token_ab == 'these'
                            or token_ab == 'their' or token_ab == 'can'
                            or token_ab == 'there' or token_ab == 'gave'
                            or token_ab == 'his' or token_ab == 'etc'
                            or token_ab == 'thats' or token_ab == 'though'
                            or token_ab == 'off' or token_ab == 'she'
                            or token_ab == 'them' or token_ab == 'huh'
                            or token_ab == 'why' or token_ab == 'wont'
                            or token_ab == 'any' or token_ab == 'some'
                            or token_ab == 'its' or token_ab == 'yeah'
                            or token_ab == 'yes' or token_ab == 'you'
                            or token_ab == 'should' or token_ab == 'dont'
                            or token_ab == 'anybody' or token_ab == 'than'
                            or token_ab == 'where' or token_ab == 'for'
                            or token_ab == 'more' or token_ab == 'will'
                            or token_ab == 'him' or token_ab == 'its'
                            or token_ab == 'your' or token_ab == 'wii'
                            or token_ab == 'having' or token_ab == 'just'
                            or token_ab == 'help' or token_ab == 'helps'
                            or token_ab == 'all' or token_ab == 'they'
                            or token_ab == 'take' or token_ab == 'the'
                            or token_ab == 'what' or token_ab == 'need'
                            or token_ab == 'make' or token_ab == 'about'
                            or token_ab == 'then' or token_ab == 'when'
                            or token_ab == 'does' or token_ab == 'ask'
                            or token_ab == 'much' or token_ab == 'man'
                            or token_ab == 'know' or token_ab == 'how'
                            or token_ab == 'look' or token_ab == 'like'
                            or token_ab == 'one' or token_ab == 'think'
                            or token_ab == 'tell' or token_ab == 'find'
                            or token_ab == 'cant' or token_ab == 'now'
                            or token_ab == 'try' or token_ab == 'give'
                            or token_ab == 'answer' or token_ab == 'her'
                            or token_ab == 'out' or token_ab == 'get'
                            or token_ab == 'because' or token_ab == 'myself'
                            or token_ab == 'wants' or token_ab == 'movie'
                            or token_ab == 'film' or token_ab == 'films'):
                        continue

                    # Single-character pieces are too noisy to keep.
                    if (len(token_ab) < 2):
                        continue

                    # First original word containing the piece wins; keep its
                    # cleaned lowercase form.
                    for gge, input_word in enumerate(token_temp):

                        if (token_ab.lower() in input_word.lower()):
                            input_word = input_word.replace(".", "")
                            input_word = input_word.replace(",", "")
                            input_word = input_word.replace("'", "")
                            input_word = input_word.replace("!", "")
                            input_word = input_word.replace("?", "")
                            input_word = input_word.replace("'", "")
                            input_word = input_word.replace('"', "")

                            token2.append(input_word.lower())
                            break
                token2 = list(set(token2))

                # Need at least 3 distinct words to form a lexicon phrase.
                if (len(token2) < 3):
                    continue
            #print(token2)
                # Join the words into one space-separated phrase and add this
                # example's confidence to the dict of its predicted class.
                sen = ""
                for l in range(0, len(token2) - 1):
                    sen += token2[l] + ' '
                sen += token2[len(token2) - 1]
                if (y_pred2[i] == 0):
                    try:
                        bb_11[sen] += y_pred22[i]
                    except KeyError:
                        bb_11[sen] = y_pred22[i]

                if (y_pred2[i] == 1):
                    try:
                        bb_22[sen] += y_pred22[i]
                    except KeyError:
                        bb_22[sen] = y_pred22[i]

            if (global_step == ls - 1):
                # Last batch: finalize the two per-class lexicons.
                abusive_11.clear()
                abusive_22.clear()

                # Rank phrases by accumulated confidence, keep the top 50
                # per class.
                bb_11_up = sorted(bb_11.items(),
                                  key=lambda x: x[1],
                                  reverse=True)
                bb_22_up = sorted(bb_22.items(),
                                  key=lambda x: x[1],
                                  reverse=True)

                lexicon_size = 50
                bb_11_up = bb_11_up[:lexicon_size]
                bb_22_up = bb_22_up[:lexicon_size]

                # Drop a phrase if it also appears (as a sub/superstring) in
                # the other class with a higher score.
                for i in bb_11_up:
                    flag = 0
                    for j in bb_22_up:
                        if ((i[0].lower() in j[0].lower())
                                or (j[0].lower() in i[0].lower())):
                            if (i[1] < j[1]):
                                flag = 1
                                break

                    if (flag == 0):
                        abusive_11.append(i[0])

                for i in bb_22_up:
                    flag = 0
                    for j in bb_11_up:
                        if ((i[0].lower() in j[0].lower())
                                or (j[0].lower() in i[0].lower())):
                            if (i[1] < j[1]):
                                flag = 1
                                break

                    if (flag == 0):
                        abusive_22.append(i[0])

                # Persist one lexicon file per class (overwritten each pass).
                ddf = open("./IMDB_Lexicon/imdbLexicon_1.txt",
                           'w',
                           encoding='UTF8')

                for i in range(0, len(abusive_11)):
                    ddf.write(abusive_11[i] + '\n')

                ddf.close()

                ddf = open("./IMDB_Lexicon/imdbLexicon_2.txt",
                           'w',
                           encoding='UTF8')

                for i in range(0, len(abusive_22)):
                    ddf.write(abusive_22[i] + '\n')

                ddf.close()
            return label_id, logits

        def evalute_CNN_SSL(model, batch, global_step):
            """Evaluate the CNN for self-training and record its predictions.

            Side effect: appends ``[predicted_class, confidence]`` for every
            example to the enclosing-scope list ``result5``, which is cleared
            at the first step (``global_step == 0``) of each pass.

            Returns:
                (label_id, probabilities): gold labels and class
                probabilities (softmax over the class dimension).
            """
            if (global_step == 0):
                result5.clear()

            input_ids, segment_ids, input_mask, label_id, seq_lengths = batch

            logits = model(input_ids, segment_ids, input_mask)

            # Explicit dim=1: logits are (batch, num_classes). The bare
            # F.softmax(logits) form is deprecated and relied on the implicit
            # dim selection.
            logits = F.softmax(logits, dim=1)
            y_pred11, y_pred1 = logits.max(1)

            for i in range(0, len(input_ids)):
                result5.append([y_pred1[i].item(), y_pred11[i].item()])

            return label_id, logits

        def pseudo_labeling(model2, batch, global_step, ls, e):
            if (global_step == 0):
                result3.clear()
                result4.clear()

                label_0.clear()
                label_1.clear()

                result_label.clear()

                abusive_11.clear()
                abusive_22.clear()

                abusive_dic_file = open("./IMDB_Lexicon/imdbLexicon_1.txt",
                                        'r',
                                        encoding='UTF8')
                for line in abusive_dic_file.read().split('\n'):
                    if (len(line) <= 3):
                        continue
                    abusive_11.append(line)
                abusive_dic_file.close()

                abusive_dic_file = open("./IMDB_Lexicon/imdbLexicon_2.txt",
                                        'r',
                                        encoding='UTF8')
                for line in abusive_dic_file.read().split('\n'):
                    if (len(line) <= 3):
                        continue
                    abusive_22.append(line)
                abusive_dic_file.close()

            input_ids, segment_ids, input_mask, label_id, seq_lengths = batch

            seq_lengths, perm_idx = seq_lengths.sort(0, descending=True)
            input_ids = input_ids[perm_idx]
            label_id = label_id[perm_idx]
            token1 = embedding(input_ids.long())

            logits2, attention_score2 = model2(token1.cuda(), input_ids,
                                               segment_ids, input_mask,
                                               seq_lengths)

            logits2 = F.softmax(logits2)

            y_pred22, y_pred2 = logits2.max(1)

            label_id2 = []

            for i in range(0, len(input_ids)):
                input_sentence = data0[global_step * 64 + perm_idx[i]]
                input_sentence = re.sub("[!@#$%^&*().?\"~/<>:;'{}]", "",
                                        input_sentence)

                matching_word1 = 3
                matching_word2 = 4
                abusive_word_list_neg11 = list()
                abusive_word_list_neg11 += matching_blacklist2(
                    abusive_11, input_sentence, matching_word1)
                abusive_word_list_neg11 = list((set(abusive_word_list_neg11)))

                abusive_word_list_neg22 = list()
                abusive_word_list_neg22 += matching_blacklist2(
                    abusive_22, input_sentence, matching_word1)
                abusive_word_list_neg22 = list((set(abusive_word_list_neg22)))

                abusive_word_list_neg111 = list()
                abusive_word_list_neg111 += matching_blacklist2(
                    abusive_11, input_sentence, matching_word2)
                abusive_word_list_neg111 = list(
                    (set(abusive_word_list_neg111)))

                abusive_word_list_neg222 = list()
                abusive_word_list_neg222 += matching_blacklist2(
                    abusive_22, input_sentence, matching_word2)
                abusive_word_list_neg222 = list(
                    (set(abusive_word_list_neg222)))

                a = max(len(abusive_word_list_neg11),
                        len(abusive_word_list_neg22))
                aa = max(len(abusive_word_list_neg111),
                         len(abusive_word_list_neg222))

                if ((len(abusive_word_list_neg11) >
                     len(abusive_word_list_neg22)
                     and result5[global_step * 64 + perm_idx[i]][0] == 0
                     and result5[global_step * 64 + perm_idx[i]][1] >= 0.9) or
                    (len(abusive_word_list_neg11) >
                     len(abusive_word_list_neg22) and y_pred2[i].item() == 0
                     and y_pred22[i].item() >= 0.9)):
                    label_0.append(0)
                    result4.append([
                        global_step * 64 + perm_idx[i], 0,
                        data0[global_step * 64 + perm_idx[i]],
                        label_id[perm_idx[i]].item()
                    ])
                elif ((len(abusive_word_list_neg11) <
                       len(abusive_word_list_neg22)
                       and result5[global_step * 64 + perm_idx[i]][0] == 1
                       and result5[global_step * 64 + perm_idx[i]][1] >= 0.9)
                      or
                      (len(abusive_word_list_neg11) <
                       len(abusive_word_list_neg22) and y_pred2[i].item() == 1
                       and y_pred22[i].item() >= 0.9)):
                    label_1.append(1)
                    result4.append([
                        global_step * 64 + perm_idx[i], 1,
                        data0[global_step * 64 + perm_idx[i]],
                        label_id[perm_idx[i]].item()
                    ])

                elif (aa >= 1 and len(abusive_word_list_neg111) >
                      len(abusive_word_list_neg222)):
                    label_0.append(0)
                    result4.append([
                        global_step * 64 + perm_idx[i], 0,
                        data0[global_step * 64 + perm_idx[i]],
                        label_id[perm_idx[i]].item()
                    ])
                elif (aa >= 1 and len(abusive_word_list_neg111) <
                      len(abusive_word_list_neg222)):
                    label_1.append(1)
                    result4.append([
                        global_step * 64 + perm_idx[i], 1,
                        data0[global_step * 64 + perm_idx[i]],
                        label_id[perm_idx[i]].item()
                    ])
                elif (result5[global_step * 64 + perm_idx[i]][1]
                      and y_pred22[i].item() >= 0.9
                      and result5[global_step * 64 + perm_idx[i]][0]
                      == y_pred2[i].item()):
                    if (result5[global_step * 64 + perm_idx[i]][0] == 0):
                        label_0.append(0)
                        result4.append([
                            global_step * 64 + perm_idx[i], 0,
                            data0[global_step * 64 + perm_idx[i]],
                            label_id[perm_idx[i]].item()
                        ])
                    elif (result5[global_step * 64 + perm_idx[i]][0] == 1):
                        label_1.append(1)
                        result4.append([
                            global_step * 64 + perm_idx[i], 1,
                            data0[global_step * 64 + perm_idx[i]],
                            label_id[perm_idx[i]].item()
                        ])

                else:
                    result4.append([
                        global_step * 64 + perm_idx[i], -1,
                        data0[global_step * 64 + perm_idx[i]],
                        label_id[perm_idx[i]].item()
                    ])

            if (global_step == ls - 1):

                result_label.clear()
                result3.clear()

                print("###result3[i] ###:", len(result3))
                a = min(len(label_0), len(label_1))

                la_0 = 0
                la_1 = 0
                la_2 = 0
                la_3 = 0

                random.shuffle(result4)

                for i in range(0, len(result4)):

                    if (result4[i][1] == 0 and la_0 < a):
                        if (temp_check[result4[i][0]][0] == 0):
                            temp_check[result4[i][0]][0] = 1
                            temp_check[result4[i][0]][1] = 0
                            la_0 += 1
                            continue

                    elif (result4[i][1] == 1 and la_1 < a):
                        if (temp_check[result4[i][0]][0] == 0):
                            temp_check[result4[i][0]][0] = 1
                            temp_check[result4[i][0]][1] = 1
                            la_1 += 1
                            continue

                result_label.clear()
                result3.clear()

                fw = open('./temp_data/temp_train_IMDB.tsv',
                          'a',
                          encoding='utf-8',
                          newline='')
                wr = csv.writer(fw, delimiter='\t')

                fww = open('./temp_data/temp_train_na_IMDB.tsv',
                           'w',
                           encoding='utf-8',
                           newline='')
                wrr = csv.writer(fww, delimiter='\t')

                for i in range(0, len(temp_check)):
                    if (temp_check[i][0] == 1):
                        result_label.append(str(temp_check[i][3]))
                        result3.append(str(temp_check[i][1]))
                        wr.writerow(
                            [str(temp_check[i][1]),
                             str(temp_check[i][2])])
                    else:
                        wrr.writerow(
                            [str(temp_check[i][3]),
                             str(temp_check[i][2])])

                fw.close()
                fww.close()
                data0.clear()
                temp_check.clear()
                with open('./temp_data/temp_train_na_IMDB.tsv',
                          "r",
                          encoding='utf-8') as f:
                    lines = csv.reader(f, delimiter='\t')

                    for i in lines:
                        a = ''
                        lines2 = i[1].split(' ')
                        b = 0
                        for j in range(0, len(lines2)):
                            a += lines2[j] + ' '
                            b += 1

                        data0.append(a)
                        temp_check.append([0, -1, a, i[0]])
                print("################;", len(data0))
                f.close()

                dataset_temp = TaskDataset('./temp_data/temp_train_IMDB.tsv',
                                           pipeline)
                data_iter_temp = DataLoader(dataset_temp,
                                            batch_size=64,
                                            shuffle=True)

                dataset_temp_b = TaskDataset('./temp_data/temp_train_IMDB.tsv',
                                             pipeline1)
                data_iter_temp_b = DataLoader(dataset_temp_b,
                                              batch_size=64,
                                              shuffle=True)

                dataset_temp_na = TaskDataset(
                    './temp_data/temp_train_na_IMDB.tsv', pipeline)
                data_iter_temp_na = DataLoader(dataset_temp_na,
                                               batch_size=64,
                                               shuffle=False)

                dataset_temp_na_b = TaskDataset(
                    './temp_data/temp_train_na_IMDB.tsv', pipeline1)
                data_iter_temp_na_b = DataLoader(dataset_temp_na_b,
                                                 batch_size=64,
                                                 shuffle=False)

            if (global_step != ls - 1):
                dataset_temp = TaskDataset(data_dev_file, pipeline)
                data_iter_temp = DataLoader(dataset_temp,
                                            batch_size=cfg.batch_size,
                                            shuffle=True)

                dataset_temp_b = TaskDataset(data_dev_file, pipeline1)
                data_iter_temp_b = DataLoader(dataset_temp_b,
                                              batch_size=64,
                                              shuffle=True)

                dataset_temp_na = TaskDataset(data_dev_file, pipeline)
                data_iter_temp_na = DataLoader(dataset_temp_na,
                                               batch_size=cfg.batch_size,
                                               shuffle=False)

                dataset_temp_na_b = TaskDataset(data_dev_file, pipeline1)
                data_iter_temp_na_b = DataLoader(dataset_temp_na_b,
                                                 batch_size=64,
                                                 shuffle=False)

            return label_id, logits2, result_label, result3, data_iter_temp, data_iter_temp_b, data_iter_temp_na, data_iter_temp_na_b

        def evalute_Attn_LSTM_SSL(model, batch):
            """Run one evaluation batch through the attention-LSTM classifier.

            NOTE(review): the `model` parameter is ignored; the closure
            variables `model2` and `embedding` do the actual work — confirm
            this is intentional.  Returns (label_id, logits) with both
            reordered by descending sequence length.
            """
            input_ids, segment_ids, input_mask, label_id, seq_lengths = batch

            # Sort the batch longest-first and keep the permutation so that
            # labels stay aligned with the reordered inputs.
            seq_lengths, order = seq_lengths.sort(0, descending=True)
            sorted_ids = input_ids[order]
            sorted_labels = label_id[order]

            embedded = embedding(sorted_ids.long())
            logits, _attention = model2(embedded.cuda(), sorted_ids,
                                        segment_ids, input_mask, seq_lengths)

            return sorted_labels, logits

        curNum = 1

        print("###########################################")
        print(model_cfg)
        print(model_cfg)
        #kkk+=1

        cfg = train.Config.from_json(train_cfg)
        model_cfg = models.Config.from_json(model_cfg)

        for kkk in range(0, 5):
            print("###########################################")

            tokenizer = tokenization.FullTokenizer(do_lower_case=True)
            tokenizer1 = tokenization.FullTokenizer1(vocab_file=vocab,
                                                     do_lower_case=True)

            TaskDataset = dataset_class(
                task)  # task dataset class according to the task

            pipeline = [
                Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
                AddSpecialTokensWithTruncation(max_len),
                TokenIndexing(tokenizer.convert_tokens_to_ids,
                              TaskDataset.labels, max_len)
            ]

            pipeline1 = [
                Tokenizing(tokenizer1.convert_to_unicode, tokenizer1.tokenize),
                AddSpecialTokensWithTruncation(max_len),
                TokenIndexing(tokenizer1.convert_tokens_to_ids1,
                              TaskDataset.labels, max_len)
            ]

            fd = open("./total_data/imdbtrain.tsv", 'r', encoding='utf-8')
            rdr = csv.reader(fd, delimiter='\t')

            res = []
            num_a = 0
            num_b = 0
            for line in rdr:
                #print(line)
                num_a += 1
                res.append([line[0], line[1]])

            print("curNum#:", curNum)
            #print(res)
            fw = open('./data/IMDB_temp_short.tsv',
                      'w',
                      encoding='utf-8',
                      newline='')
            wr = csv.writer(fw, delimiter='\t')

            for i in range(0, curNum):
                random.shuffle(res)
                #print(res[1][0])
                print("########")
            curNum += 100
            num_data = len(res)
            num_data_dev_temp = int(num_data * 0.01)
            num_data_dev = int(num_data_dev_temp * 0.15)
            num_data_short = int(num_data_dev_temp * 0.85)
            num_data_train = num_data - num_data_dev_temp
            fd.close()

            num = 0

            data_train_file = "./data/IMDB_train" + str(kkk + 1) + ".tsv"
            data_dev_file = "./data/IMDB_dev" + str(kkk + 1) + ".tsv"
            data_short_file = "./data/IMDB_short" + str(kkk + 1) + ".tsv"

            print("num_data_dev#:", num_data_dev)
            print("num_data_short#:", num_data_short)
            print("num_data_train#:", num_data_train)
            fw = open('./data/IMDB_temp_short.tsv',
                      'w',
                      encoding='utf-8',
                      newline='')
            wr = csv.writer(fw, delimiter='\t')

            fe = open(data_train_file, 'w', encoding='utf-8', newline='')
            we = csv.writer(fe, delimiter='\t')

            res2 = []
            num_pos = 0
            num_neg = 0
            for line in res:
                #print(line[0])
                #print(line[1])
                if (line[0] == '0' and num_pos <= (num_data_dev_temp / 2)):
                    num_pos += 1
                    wr.writerow(['0', line[1]])
                elif (line[0] == '1' and num_neg <= (num_data_dev_temp / 2)):
                    num_neg += 1
                    wr.writerow(['1', line[1]])
                else:
                    num += 1
                    we.writerow([line[0], line[1]])

            fw.close()
            fe.close()

            print("num_pos #:", num_pos, " num_neg:", num_neg)

            f = open('./data/IMDB_temp_short.tsv', 'r', encoding='utf-8')
            rdr = csv.reader(f, delimiter='\t')
            num_pos = 0
            num_neg = 0
            num = 0

            fw = open(data_dev_file, 'w', encoding='utf-8', newline='')
            wr = csv.writer(fw, delimiter='\t')

            fe = open(data_short_file, 'w', encoding='utf-8', newline='')
            we = csv.writer(fe, delimiter='\t')

            for line in rdr:
                #print(line[0])
                if (line[0] == '0' and num_pos <= (num_data_dev / 2)):
                    num_pos += 1
                    wr.writerow(['0', line[1]])
                elif (line[0] == '1' and num_neg <= (num_data_dev / 2)):
                    num_neg += 1
                    wr.writerow(['1', line[1]])
                else:
                    num += 1
                    we.writerow([line[0], line[1]])

            print("num_pos #:", num_pos, " num_neg:", num_neg)
            f.close()
            fw.close()
            fe.close()

            dataset = TaskDataset(data_train_file, pipeline)
            data_iter = DataLoader(dataset, batch_size=64, shuffle=False)

            dataset_b = TaskDataset(data_train_file, pipeline1)
            data_iter_b = DataLoader(dataset_b, batch_size=64, shuffle=False)

            dataset2 = TaskDataset(data_test_file, pipeline)
            data_iter2 = DataLoader(dataset2, batch_size=64, shuffle=False)

            dataset2_b = TaskDataset(data_test_file, pipeline1)
            data_iter2_b = DataLoader(dataset2_b, batch_size=64, shuffle=False)

            dataset_dev = TaskDataset(data_dev_file, pipeline)
            data_iter_dev = DataLoader(dataset_dev,
                                       batch_size=64,
                                       shuffle=False)

            dataset_dev_b = TaskDataset(data_dev_file, pipeline1)
            data_iter_dev_b = DataLoader(dataset_dev_b,
                                         batch_size=64,
                                         shuffle=False)

            dataset3 = TaskDataset(data_short_file, pipeline)
            data_iter3 = DataLoader(dataset3, batch_size=64, shuffle=True)

            dataset3_b = TaskDataset(data_short_file, pipeline1)
            data_iter3_b = DataLoader(dataset3_b, batch_size=64, shuffle=True)

            print("###########################################")
            print(model_cfg)
            weights = tokenization.embed_lookup2()

            print("#train_set:", len(data_iter))
            print("#test_set:", len(data_iter2))
            print("#short_set:", len(data_iter3))
            print("#dev_set:", len(data_iter_dev))
            curNum += 1

            embedding = nn.Embedding.from_pretrained(weights).cuda()
            criterion = nn.CrossEntropyLoss()

            model = Classifier(model_cfg, 2)
            model2 = Classifier_Attention_LSTM(2)

            trainer = train.Trainer(
                cfg, dataName, stopNum, model, model2, data_iter, data_iter_b,
                data_iter2, data_iter2_b, data_iter3, data_iter3_b,
                data_iter_dev, data_iter_dev_b,
                optim.optim4GPU(cfg, model,
                                len(data_iter) * 10),
                torch.optim.Adam(model2.parameters(),
                                 lr=0.005), get_device(), kkk + 1)

            label_0 = []
            label_1 = []

            result3 = []
            result4 = []
            result5 = []

            bb_11 = {}
            bb_22 = {}

            abusive_11 = []
            abusive_22 = []

            result_label = []

            fw = open('./temp_data/temp_train_IMDB.tsv',
                      'w',
                      encoding='utf-8',
                      newline='')
            wr = csv.writer(fw, delimiter='\t')

            fr = open(data_short_file, 'r', encoding='utf-8')
            rdrr = csv.reader(fr, delimiter='\t')
            for line in rdrr:
                wr.writerow([line[0], line[1]])

            fw.close()
            fr.close()

            data0 = []
            temp_check = []
            temp_label = []

            with open(data_train_file, "r", encoding='utf-8') as f:
                lines = csv.reader(f, delimiter='\t')

                for i in lines:
                    a = ''
                    lines2 = i[1].split(' ')
                    for j in range(0, len(lines2)):
                        a += lines2[j] + ' '

                    data0.append(a)
                    temp_check.append([0, -1, a, i[0]])
                    temp_label.append([0, 0])
            f.close()

            trainer.train(model_file, pretrain_file, get_loss_CNN,
                          get_loss_Attn_LSTM, evalute_CNN_SSL, pseudo_labeling,
                          evalute_Attn_LSTM, evalute_CNN,
                          evalute_Attn_LSTM_SSL, generating_lexiocn,
                          data_parallel)

    elif mode == 'eval':

        def evalute_Attn_LSTM_SSL(model, batch):
            """Evaluate one batch with the attention-LSTM classifier.

            NOTE(review): ignores the `model` argument and uses the closure
            variables `model2` and `embedding` instead — confirm intended.
            Returns (label_id, logits), both reordered by descending length.
            """

            input_ids, segment_ids, input_mask, label_id, seq_lengths = batch

            # Sort by sequence length (longest first) and apply the same
            # permutation to inputs and labels so they stay aligned.
            seq_lengths, perm_idx = seq_lengths.sort(0, descending=True)
            input_ids = input_ids[perm_idx]
            label_id = label_id[perm_idx]
            token1 = embedding(input_ids.long())

            logits, attention_score = model2(token1.cuda(), input_ids,
                                             segment_ids, input_mask,
                                             seq_lengths)

            return label_id, logits

        def evalute_CNN_SSL(model, batch):
            """Evaluate one batch with the CNN classifier.

            Embeds the raw token ids via the closure `embedding`, then runs
            `model`.  Returns (label_id, logits); `seq_lengths` is unpacked
            but unused by this path.
            """
            input_ids, segment_ids, input_mask, label_id, _seq_lengths = batch
            embedded = embedding(input_ids.long())
            logits, _attention = model(embedded.cuda(), input_ids,
                                       segment_ids, input_mask)
            return label_id, logits

        weights = tokenization.embed_lookup2()

        embedding = nn.Embedding.from_pretrained(weights).cuda()
        criterion = nn.CrossEntropyLoss()

        model = Classifier_CNN(2)
        model2 = Classifier_Attention_LSTM(2)

        trainer = train.Eval(cfg, model, model2, data_iter, save_dir,
                             get_device())

        embedding = nn.Embedding.from_pretrained(weights).cuda()
        results = trainer.eval(evalute_CNN_SSL, evalute_Attn_LSTM_SSL,
                               data_parallel)
예제 #11
0
def main(task='sim',
         train_cfg='config/train_mrpc.json',
         model_cfg='config/bert_base.json',
         data_file='../glue/MRPC/train.tsv',
         model_file=None,
         # NOTE(review): this default references a module-level name
         # `pretrain_file`; it raises NameError at definition time unless
         # that global is defined earlier in the file — confirm.
         pretrain_file=pretrain_file,
         data_parallel=True,
         vocab='../uncased_L-12_H-768_A-12/vocab.txt',
         save_dir='../exp/bert/mrpc',
         max_len=128,
         batch_size=2,
         pretrained_type='local',
         mode='train'):
    """Embed every sentence in `data_file` and print pairwise cosine
    similarities of each sentence against the first one.

    `pretrained_type` selects whether the model loads a locally trained
    checkpoint ('local') or Google's released weights ('google').
    """

    cfg = train.Config.from_json(train_cfg)
    model_cfg = models.Config.from_json(model_cfg)

    #set_seeds(cfg.seed)

    # 'google' means the released TF checkpoint; anything else is treated
    # as a locally pretrained model.
    if (pretrained_type == 'google'):
        local_pretrained = False
    else:
        local_pretrained = True

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab,
                                           do_lower_case=True)
    TaskDataset = dataset_class(
        task)  # task dataset class according to the task
    # Standard BERT preprocessing: tokenize, add [CLS]/[SEP] with
    # truncation, then map tokens and labels to ids.
    pipeline = [
        Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
        AddSpecialTokensWithTruncation(max_len),
        TokenIndexing(tokenizer.convert_tokens_to_ids, TaskDataset.labels,
                      max_len)
    ]
    dataset = TaskDataset(data_file, pipeline)
    # batch_size
    #data_iter = DataLoader(dataset, batch_size=cfg.batch_size, shuffle=True)
    data_iter = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    #model = Classifier(model_cfg, len(TaskDataset.labels))
    model = SentEmbedding(model_cfg, len(TaskDataset.labels), local_pretrained)

    #trainer = train.Trainer(cfg,
    evaluator = SentEvaluator(cfg, model, data_iter,
                              optim.optim4GPU(cfg, model), save_dir,
                              get_device(), local_pretrained)

    if (True):

        def evaluate(model, batch):
            # Returns the sentence embedding for one batch; label_id is
            # unpacked but unused here.
            input_ids, segment_ids, input_mask, label_id = batch
            #logits = model(input_ids, segment_ids, input_mask)
            if (local_pretrained):
                print(np.shape(input_ids), np.shape(segment_ids),
                      np.shape(input_mask))
                embed = model(input_ids, segment_ids, input_mask)
            else:
                #input_ids = torch.LongTensor(input_ids)
                #segment_ids = torch.LongTensor(segment_ids)
                #input_mask = torch.LongTensor(input_mask)
                print(np.shape(input_ids), np.shape(segment_ids),
                      np.shape(input_mask))
                print(input_ids.shape, segment_ids.shape, input_mask.shape)
                embed = model(input_ids, segment_ids, input_mask)

            print('evaluate(embed) : ', embed.shape)
            return embed

        #results = trainer.eval(evaluate, model_file, data_parallel)
        results = evaluator.eval(evaluate, model_file, data_parallel)
        print(np.shape(results))

    # Compare every embedding against the first one.
    similarities = []
    for svec in results:
        sim = cosine_similarity(results[0], svec)
        print(sim)
        similarities.append(sim.cpu().tolist())

    print(similarities)
예제 #12
0
파일: distill.py 프로젝트: mbasnet1/lpot
def main(config='config/finetune/agnews/train.json'):
    """Distill a fine-tuned Transformer teacher into a BlendCNN student.

    In "train" mode the teacher's logits are pre-computed over the whole
    dataset and prepended to every batch, so the student is trained with a
    soft-target KL term plus a hard-label cross-entropy term.  In "eval"
    mode the trained student is evaluated for accuracy.
    """
    # Load the experiment config, then the three sub-configs it points at.
    # Context managers close the handles deterministically (the original
    # `json.load(open(...))` pattern leaked them).
    with open(config, "r") as f:
        cfg = Config(**json.load(f))

    with open(cfg.cfg_data, "r") as f:
        cfg_data = data.Config(**json.load(f))
    with open(cfg.cfg_model, "r") as f:
        cfg_model = models.Config(**json.load(f))
    with open(cfg.cfg_optim, "r") as f:
        cfg_optim = trainer.Config(**json.load(f))

    set_seeds(cfg.seed)

    ### Prepare Dataset and Preprocessing ###

    TaskDataset = data.get_class(cfg_data.task) # task dataset class according to the task
    tokenizer = tokenization.FullTokenizer(vocab_file=cfg_data.vocab_file, do_lower_case=True)
    dataset = TaskDataset(cfg_data.data_file[cfg.mode], pipelines=[
        data.RemoveSymbols('\\'),
        data.Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
        data.AddSpecialTokensWithTruncation(cfg_data.max_len),
        data.TokenIndexing(tokenizer.convert_tokens_to_ids,
                           TaskDataset.labels,
                           cfg_data.max_len)
    ], n_data=None)
    tensors = TensorDataset(*dataset.get_tensors()) # To Tensors
    data_iter = DataLoader(tensors, batch_size=cfg_optim.batch_size, shuffle=False)

    ### Fetch Teacher's output and put it into the dataset ###

    def fetch_logits(model):
        """Run `model` over `data_iter` once and return concatenated logits."""
        def get_logits(model, batch):
            input_ids, segment_ids, input_mask, label_id = batch
            logits = model(input_ids, segment_ids, input_mask)
            return 0.0, logits

        train_loop = trainer.TrainLoop(cfg_optim, model, data_iter, None, None, get_device())
        results = torch.cat(train_loop.eval(get_logits, cfg.model_file))
        return results


    if cfg.mode == "train":
        print("Fetching teacher's output...")
        teacher = models.Classifier4Transformer(cfg_model, len(TaskDataset.labels))
        teacher.load_state_dict(torch.load(cfg.model_file)) # use trained model
        with torch.no_grad():  # teacher is frozen; no gradients needed
            teacher_logits = fetch_logits(teacher)

        # Each batch becomes (teacher_logits, input_ids, segment_ids,
        # input_mask, label_id).
        tensors = TensorDataset(teacher_logits, *dataset.get_tensors()) # To Tensors
        data_iter = DataLoader(tensors, batch_size=cfg_optim.batch_size, shuffle=False)

    ### Models ###

    model = models.BlendCNN(cfg_model, len(TaskDataset.labels))
    checkpoint.load_embedding(model.embed, cfg.pretrain_file)

    optimizer = optim.optim4GPU(cfg_optim, model)

    train_loop = trainer.TrainLoop(
        cfg_optim, model, data_iter, optimizer, cfg.save_dir, get_device()
    )

    def get_loss(model, batch, global_step): # make sure loss is a scalar tensor
        """Distillation loss: 0.1 * hard-label CE + 0.9 * soft-target KL."""
        teacher_logits, input_ids, segment_ids, input_mask, label_id = batch
        T = 1.0  # softmax temperature (1.0 = plain softmax)
        logits = model(input_ids, segment_ids, input_mask)
        loss = 0.1*nn.CrossEntropyLoss()(logits, label_id)
        # BUG FIX: KLDivLoss defaults to reduction='mean', which averages
        # over every element and does not compute the true KL divergence;
        # PyTorch documents 'batchmean' as the correct setting for
        # distribution targets.
        loss += 0.9*nn.KLDivLoss(reduction='batchmean')(
            F.log_softmax(logits/T, dim=1),
            F.softmax(teacher_logits/T, dim=1)
        )
        #loss = 0.9*nn.MSELoss()(logits, teacher_logits)
        return loss

    def evaluate(model, batch):
        """Return (mean accuracy, per-example 0/1 correctness) for a batch."""
        input_ids, segment_ids, input_mask, label_id = batch
        logits = model(input_ids, segment_ids, input_mask)
        _, label_pred = logits.max(1)
        result = (label_pred == label_id).float() #.cpu().numpy()
        accuracy = result.mean()
        return accuracy, result

    if cfg.mode == "train":
        train_loop.train(get_loss, None, None) # not use pretrain file
        print("Training has been done properly.")

    elif cfg.mode == "eval":
        results = train_loop.eval(evaluate, cfg.model_file)
        total_accuracy = torch.cat(results).mean().item()
        print(f"Accuracy: {total_accuracy}")
예제 #13
0
def main(train_cfg='config/pretrain.json',
         model_cfg='config/bert_base.json',
         data_file='/root/voucher/dataset/tifu/bert/train.tsv',
         model_file=None,
         data_parallel=True,
         word_vocab='/root/voucher/dataset/tifu/bert/word_vocab.txt',
         pos_vocab='/root/voucher/dataset/tifu/bert/pos_vocab.txt',
         dep_vocab='/root/voucher/dataset/tifu/bert/dep_vocab.txt',
         pos_dep_word_vocab='/root/voucher/dataset/tifu/bert/pos_dep_word.pkl',
         save_dir='../exp/bert/pretrain',
         log_dir='../exp/bert/pretrain/runs',
         max_len=384,
         max_pred=20,
         mask_prob=0.15,
         mode='train'):
    """Pretrain or evaluate a BERT-style model with masked word / POS /
    dependency objectives on three separate vocabularies.

    BUG FIX: the default was `mode=train`, which bound the imported `train`
    module object instead of the string 'train'; with that default the mode
    check below always fell through to the error branch and exited.
    """
    # Validate the requested mode up front.
    if mode == 'train':
        pass
    elif mode == 'eval':
        pass
    #    max_pred = max_len
    #    mask_prob = 1
    else:
        print("please select correct mode")
        exit(1)

    cfg = train.Config.from_json(train_cfg)
    model_cfg = models.Config.from_json(model_cfg)

    set_seeds(cfg.seed)

    # Tokenizer with three vocabularies: surface word, POS tag, dep label.
    custom_tokenizer = CustomVocabTokenizer(
        word_vocab_file=word_vocab,
        pos_vocab_file=pos_vocab,
        dep_vocab_file=dep_vocab,
        pos_dep_word_vocab_file=pos_dep_word_vocab)
    custom_tokenize = lambda word, pos, dep: custom_tokenizer.tokenize(
        custom_tokenizer.convert_to_unicode(word),
        custom_tokenizer.convert_to_unicode(pos),
        custom_tokenizer.convert_to_unicode(dep))

    pipeline = [
        Preprocess4Pretrain(max_pred, mask_prob,
                            list(custom_tokenizer.word_tokenizer.vocab.keys()),
                            list(custom_tokenizer.pos_tokenizer.vocab.keys()),
                            list(custom_tokenizer.dep_tokenizer.vocab.keys()),
                            custom_tokenizer.convert_tokens_to_ids, max_len)
    ]
    data_iter = TifuDataLoader(data_file,
                               cfg.batch_size,
                               custom_tokenize,
                               max_len,
                               pipeline=pipeline)

    model = BertModel4Pretrain(model_cfg,
                               custom_tokenizer.get_word_vocab_size(),
                               custom_tokenizer.get_pos_vocab_size(),
                               custom_tokenizer.get_dep_vocab_size())
    # One per-token criterion per objective; reduction='none' so losses can
    # be weighted by the mask positions before averaging.
    criterion1 = nn.CrossEntropyLoss(reduction='none')
    criterion2 = nn.CrossEntropyLoss(reduction='none')
    criterion3 = nn.CrossEntropyLoss(reduction='none')

    optimizer = optim.optim4GPU(cfg, model)
    trainer = train.Trainer(cfg, model, data_iter, optimizer, save_dir,
                            get_device())

    writer = SummaryWriter(log_dir=log_dir)  # for tensorboardX

    if mode == 'train':

        def get_loss(model, batch, global_step):  # make sure loss is tensor
            """Sum of masked-POS, masked-dep and masked-word CE losses,
            each weighted by `masked_weights`, logged to tensorboard."""
            input_word_ids,\
            input_pos_ids,\
            input_dep_ids,\
            input_segment_ids,\
            input_mask,\
            masked_word_ids,\
            masked_pos_ids,\
            masked_dep_ids,\
            masked_pos,\
            masked_weights,\
            target_word_ids,\
            target_pos_ids,\
            target_dep_ids,\
            target_mask = batch

            logits_pos, logits_dep, logits_word = model(
                input_word_ids, input_segment_ids, masked_pos, input_mask,
                target_word_ids, target_mask)

            loss_pos = criterion1(logits_pos.transpose(1, 2),
                                  masked_pos_ids)  # for masked pos
            loss_pos = (loss_pos * masked_weights.float()).mean()

            loss_dep = criterion2(logits_dep.transpose(1, 2),
                                  masked_dep_ids)  # for masked dep
            loss_dep = (loss_dep * masked_weights.float()).mean()

            loss_word = criterion3(logits_word.transpose(1, 2),
                                   masked_word_ids)  # for masked word
            loss_word = (loss_word * masked_weights.float()).mean()
            print(loss_pos.item(), loss_dep.item(), loss_word.item())
            writer.add_scalars(
                'data/scalar_group', {
                    'loss_pos': loss_pos.item(),
                    'loss_dep': loss_dep.item(),
                    'loss_word': loss_word.item(),
                    'loss_total': (loss_pos + loss_dep + loss_word).item(),
                    'lr': optimizer.get_lr()[0],
                }, global_step)

            return loss_pos + loss_dep + loss_word

        trainer.train(get_loss, model_file, None, data_parallel)
    elif mode == 'eval':

        def evaluate(model, batch):
            """Masked-token accuracy for the POS, dependency and word heads.
            Returns ([pos_acc, dep_acc, word_acc], per-token 0/1 results)."""
            input_word_ids,\
            input_pos_ids,\
            input_dep_ids,\
            input_segment_ids,\
            input_mask,\
            masked_word_ids,\
            masked_pos_ids,\
            masked_dep_ids,\
            masked_pos,\
            masked_weights,\
            target_word_ids,\
            target_pos_ids,\
            target_dep_ids,\
            target_mask = batch
            logits_pos, logits_dep, logits_word = model(
                input_word_ids, input_segment_ids, masked_pos, input_mask,
                target_word_ids, target_mask)
            _, label_pos = logits_pos.max(-1)
            result_pos = (label_pos == masked_pos_ids).float()  #.cpu().numpy()
            pos_accuracy = result_pos.mean()

            _, label_dep = logits_dep.max(-1)
            result_dep = (label_dep == masked_dep_ids).float()
            dep_accuracy = result_dep.mean()

            _, label_word = logits_word.max(-1)
            result_word = (label_word == masked_word_ids).float()
            word_accuracy = result_word.mean()

            accuracies = [pos_accuracy, dep_accuracy, word_accuracy]
            results = [result_pos, result_dep, result_word]
            return accuracies, results

        results = trainer.eval(
            evaluate,
            model_file,
            data_parallel,
            eval_kind_names=["PosTagging", "SyntaxParsing", "Word"])
        print(results)
def main(train_cfg='config/pretrain.json',
         model_cfg='config/bert_base.json',
         data_file='/root/voucher/dataset/tifu/bert/train.tsv',
         model_file=None,
         data_parallel=True,
         word_vocab='/root/voucher/dataset/tifu/bert/word_vocab.txt',
         pos_vocab='/root/voucher/dataset/tifu/bert/pos_vocab.txt',
         dep_vocab='/root/voucher/dataset/tifu/bert/dep_vocab.txt',
         pos_dep_word_vocab='/root/voucher/dataset/tifu/bert/pos_dep_word.pkl',
         save_dir='../exp/bert/pretrain',
         log_dir='../exp/bert/pretrain/runs',
         max_len=384,
         max_pred=20,
         mask_prob=0.15,
         mode='train'):
    """Train, evaluate, or run word-alignment similarity ('sim' mode) for a
    BERT-style model with word / POS / dependency objectives.

    BUG FIX: the default was `mode=train`, which bound the imported `train`
    module object instead of the string 'train', so no branch below ever
    matched with the default argument.
    """
    cfg = train.Config.from_json(train_cfg)
    model_cfg = models.Config.from_json(model_cfg)

    set_seeds(cfg.seed)

    # Tokenizer with three vocabularies: surface word, POS tag, dep label.
    custom_tokenizer = CustomVocabTokenizer(
        word_vocab_file=word_vocab,
        pos_vocab_file=pos_vocab,
        dep_vocab_file=dep_vocab,
        pos_dep_word_vocab_file=pos_dep_word_vocab)
    custom_tokenize = lambda word, pos, dep: custom_tokenizer.tokenize(
        custom_tokenizer.convert_to_unicode(word),
        custom_tokenizer.convert_to_unicode(pos),
        custom_tokenizer.convert_to_unicode(dep))

    pipeline = [
        Preprocess4Pretrain(max_pred, mask_prob,
                            list(custom_tokenizer.word_tokenizer.vocab.keys()),
                            list(custom_tokenizer.pos_tokenizer.vocab.keys()),
                            list(custom_tokenizer.dep_tokenizer.vocab.keys()),
                            custom_tokenizer.convert_tokens_to_ids, max_len)
    ]
    data_iter = TifuDataLoader(data_file,
                               cfg.batch_size,
                               custom_tokenize,
                               max_len,
                               pipeline=pipeline)

    model = BertModel4Pretrain(model_cfg,
                               custom_tokenizer.get_word_vocab_size(),
                               custom_tokenizer.get_pos_vocab_size(),
                               custom_tokenizer.get_dep_vocab_size())
    # Per-token criteria (reduction='none') so each loss can be weighted by
    # the input mask before averaging.
    criterion1 = nn.CrossEntropyLoss(reduction='none')
    criterion2 = nn.CrossEntropyLoss(reduction='none')
    criterion3 = nn.CrossEntropyLoss(reduction='none')

    optimizer = optim.optim4GPU(cfg, model)
    trainer = train.Trainer(cfg, model, data_iter, optimizer, save_dir,
                            get_device())

    writer = SummaryWriter(log_dir=log_dir)  # for tensorboardX

    if mode == 'train':

        def get_loss(model, batch, global_step):  # make sure loss is tensor
            """Sum of POS, dep and word CE losses over all input positions,
            weighted by `input_mask`, logged to tensorboard."""
            origin_input_word_ids,\
            input_word_ids,\
            input_pos_ids,\
            input_dep_ids,\
            input_segment_ids,\
            input_mask,\
            target_word_ids,\
            target_pos_ids,\
            target_dep_ids,\
            target_mask,\
            input_len,\
            target_len = batch

            logits_pos, logits_dep, logits_word = model(
                input_word_ids, input_segment_ids, input_pos_ids,
                input_dep_ids, input_mask, target_mask)

            loss_pos = criterion1(logits_pos.transpose(1, 2),
                                  input_pos_ids)  # for masked pos
            loss_pos = (loss_pos * input_mask.float()).mean()

            loss_dep = criterion2(logits_dep.transpose(1, 2),
                                  input_dep_ids)  # for masked dep
            loss_dep = (loss_dep * input_mask.float()).mean()

            loss_word = criterion3(logits_word.transpose(1, 2),
                                   origin_input_word_ids)  # for masked word
            loss_word = (loss_word * input_mask.float()).mean()
            print(loss_pos.item(), loss_dep.item(), loss_word.item())
            writer.add_scalars(
                'data/scalar_group', {
                    'loss_pos': loss_pos.item(),
                    'loss_dep': loss_dep.item(),
                    'loss_word': loss_word.item(),
                    'loss_total': (loss_pos + loss_dep + loss_word).item(),
                    'lr': optimizer.get_lr()[0],
                }, global_step)

            return loss_pos + loss_dep + loss_word

        trainer.train(get_loss, model_file, None, data_parallel)
    elif mode == 'eval':

        def evaluate(model, batch):
            """Accuracy of the POS, dep and word heads against the target ids.
            Returns ([pos_acc, dep_acc, word_acc], per-token 0/1 results)."""
            origin_input_word_ids, \
            input_word_ids,\
            input_pos_ids,\
            input_dep_ids,\
            input_segment_ids,\
            input_mask,\
            target_word_ids,\
            target_pos_ids,\
            target_dep_ids,\
            target_mask,\
            input_len,\
            target_len = batch

            logits_pos, logits_dep, logits_word = model(
                input_word_ids, input_segment_ids, input_pos_ids,
                input_dep_ids, input_mask, target_mask)
            _, label_pos = logits_pos.max(-1)
            result_pos = (label_pos == target_pos_ids).float()  #.cpu().numpy()
            pos_accuracy = result_pos.mean()

            _, label_dep = logits_dep.max(-1)
            result_dep = (label_dep == target_dep_ids).float()
            dep_accuracy = result_dep.mean()

            _, label_word = logits_word.max(-1)
            result_word = (label_word == target_word_ids).float()
            word_accuracy = result_word.mean()

            accuracies = [pos_accuracy, dep_accuracy, word_accuracy]
            results = [result_pos, result_dep, result_word]
            return accuracies, results

        results = trainer.eval(
            evaluate,
            model_file,
            data_parallel,
            eval_kind_names=["PosTagging", "SyntaxParsing", "Word"])
        print(results)
    elif mode == 'sim':

        def sim(model, batch):
            """For each example, align every target token to its most similar
            input token via cosine similarity of the word-head logits, and
            print the resulting token pairs."""
            origin_input_word_ids, \
            input_word_ids,\
            input_pos_ids,\
            input_dep_ids,\
            input_segment_ids,\
            input_mask,\
            target_word_ids,\
            target_pos_ids,\
            target_dep_ids,\
            target_mask,\
            input_len,\
            target_len = batch

            logits_pos, logits_dep, logits_word = model(
                input_word_ids, input_segment_ids, input_pos_ids,
                input_dep_ids, input_mask, target_mask)

            input_len = input_len.tolist()
            target_len = target_len.tolist()

            for i in range(len(input_len)):
                # Slice out example i, then split into input/target spans.
                logits = torch.squeeze(logits_word.narrow(0, i, 1), dim=0)
                logits_input = logits.narrow(0, 0, input_len[i])
                logits_target = logits.narrow(0, input_len[i], target_len[i])

                _, input_ids = logits_input.max(-1)
                _, target_ids = logits_target.max(-1)
                input_tokens = custom_tokenizer.word_tokenizer.convert_ids_to_tokens(
                    input_ids.tolist())
                target_tokens = custom_tokenizer.word_tokenizer.convert_ids_to_tokens(
                    target_ids.tolist())
                input_tokens2 = custom_tokenizer.word_tokenizer.convert_ids_to_tokens(
                    input_word_ids[i].tolist())
                target_tokens2 = custom_tokenizer.word_tokenizer.convert_ids_to_tokens(
                    target_word_ids[i].tolist())

                # NOTE(review): `results` is reset on every iteration, so the
                # print below only shows the last example of the batch (and
                # raises NameError on an empty batch) — confirm intended.
                results = []
                # Row-normalize so the matrix product below is cosine sim.
                input_norm = logits_input / logits_input.norm(dim=1)[:, None]
                target_norm = logits_target / logits_target.norm(dim=1)[:,
                                                                        None]

                #target_len x input_len
                res = torch.mm(target_norm, input_norm.transpose(0, 1))

                #target_len x 1
                _, sim_idxs = res.max(-1)
                for j, sim_idx in enumerate(sim_idxs.tolist()):
                    results.append([
                        target_tokens[j], input_tokens[sim_idx],
                        target_tokens2[j], input_tokens2[sim_idx]
                    ])

            print(results)
            accuracies = [0]
            results = [0]
            return accuracies, results

        results = trainer.eval(sim,
                               model_file,
                               data_parallel,
                               eval_kind_names=["Word"])
        print(results)
예제 #15
0
def main(train_cfg='config/pretrain.json',
         model_cfg='config/bert_base.json',
         data_file='/root/voucher/dataset/tifu/bert/train.tsv',
         model_file=None,
         pretrain_file=None,
         data_parallel=True,
         word_vocab='/root/voucher/dataset/tifu/bert/word_vocab.txt',
         pos_vocab='/root/voucher/dataset/tifu/bert/pos_vocab.txt',
         dep_vocab='/root/voucher/dataset/tifu/bert/dep_vocab.txt',
         pos_dep_word_vocab='/root/voucher/dataset/tifu/bert/pos_dep_word.pkl',
         save_dir='../exp/bert/pretrain',
         log_dir='../exp/bert/pretrain/runs',
         max_len=384,
         max_pred=20,
         mask_prob=0.15,
         mode='train'):
    """Set up BERT pretraining/evaluation on the TIFU data.

    Args:
        train_cfg: path to the trainer JSON config.
        model_cfg: path to the model JSON config.
        data_file: TSV file of input/target sequences.
        model_file: checkpoint to resume from, or None.
        pretrain_file: pretrained weights used in eval mode, or None.
        data_parallel: whether the trainer may wrap the model in DataParallel.
        word_vocab / pos_vocab / dep_vocab / pos_dep_word_vocab: vocabulary
            files for the custom word/POS/dependency tokenizer.
        save_dir: directory for checkpoints.
        log_dir: directory for logs.
        max_len: maximum sequence length.
        max_pred: maximum number of masked positions per sequence.
        mask_prob: masking probability for pretraining.
        mode: 'train' or 'eval'; anything else exits with an error message.
    """
    # BUG FIX: the default used to be `mode=train` -- the imported `train`
    # *module*, not the string 'train' -- so neither branch below ever
    # matched and the function always exited with "please select correct
    # mode" when called with the default.
    if mode == 'train':
        pass
    elif mode == 'eval':
        pass
    #    max_pred = max_len
    #    mask_prob = 1
    else:
        print("please select correct mode")
        exit(1)

    cfg = train.Config.from_json(train_cfg)
    model_cfg = models.Config.from_json(model_cfg)

    set_seeds(cfg.seed)

    # Tokenizer over three parallel vocabularies: surface word, POS tag and
    # dependency label.
    custom_tokenizer = CustomVocabTokenizer(word_vocab_file=word_vocab,
                                            pos_vocab_file=pos_vocab,
                                            dep_vocab_file=dep_vocab,
                                            pos_dep_word_vocab_file=pos_dep_word_vocab)
    custom_tokenize = lambda word, pos, dep: custom_tokenizer.tokenize(
        custom_tokenizer.convert_to_unicode(word),
        custom_tokenizer.convert_to_unicode(pos),
        custom_tokenizer.convert_to_unicode(dep))

    pipeline = [Preprocess4Pretrain(max_pred,
                                    mask_prob,
                                    list(custom_tokenizer.word_tokenizer.vocab.keys()),
                                    list(custom_tokenizer.pos_tokenizer.vocab.keys()),
                                    list(custom_tokenizer.dep_tokenizer.vocab.keys()),
                                    custom_tokenizer.convert_tokens_to_ids,
                                    max_len)]
    data_iter = TifuDataLoader(data_file,
                               cfg.batch_size,
                               custom_tokenize,
                               max_len,
                               pipeline=pipeline)

    model = BertModel4Pretrain(model_cfg)

    optimizer = optim.optim4GPU(cfg, model)
    trainer = train.Trainer(cfg, model, data_iter, optimizer, save_dir,
                            get_device())
    # NOTE(review): in 'train' mode nothing runs after setup -- there is no
    # trainer.train(...) call here. Presumably intentional (eval-only entry
    # point) or the training call lives elsewhere; confirm with the caller.

    if mode == 'eval':

        def evaluate(model, batch):
            """Greedy-decode word logits and, for every target position,
            print the most cosine-similar input position's token."""
            (input_word_ids, input_segment_ids, input_mask, target_word_ids,
             target_mask, input_len, target_len) = batch

            logits_word = model(input_word_ids, input_segment_ids, input_mask,
                                target_mask)

            input_len = input_len.tolist()
            target_len = target_len.tolist()

            for i in range(len(input_len)):
                # Per-sample logits: rows [0, input_len) are the input span,
                # rows [input_len, input_len + target_len) the target span.
                logits = torch.squeeze(logits_word.narrow(0, i, 1), dim=0)
                logits_input = logits.narrow(0, 0, input_len[i])
                logits_target = logits.narrow(0, input_len[i], target_len[i])

                _, input_ids = logits_input.max(-1)
                _, target_ids = logits_target.max(-1)
                input_tokens = custom_tokenizer.word_tokenizer.convert_ids_to_tokens(
                    input_ids.tolist())
                target_tokens = custom_tokenizer.word_tokenizer.convert_ids_to_tokens(
                    target_ids.tolist())

                # NOTE(review): `results` is reset on every sample, so the
                # print below only shows the batch's last sample -- verify
                # this is the intended behavior.
                results = []
                # L2-normalize rows so the matmul below computes cosine
                # similarity between target and input positions.
                input_norm = logits_input / logits_input.norm(dim=1)[:, None]
                target_norm = logits_target / logits_target.norm(dim=1)[:, None]

                # target_len x input_len similarity matrix
                res = torch.mm(target_norm, input_norm.transpose(0, 1))

                # For each target row, the index of the most similar input row.
                _, sim_idxs = res.max(-1)
                for j, sim_idx in enumerate(sim_idxs.tolist()):
                    results.append([target_tokens[j], input_tokens[sim_idx]])

            print(results)
            # Placeholder metrics: this evaluation only prints alignments.
            accuracies = [0]
            results = [0]
            return accuracies, results

        results = trainer.eval(evaluate, None, pretrain_file, data_parallel,
                               eval_kind_names=["Word"])
        print(results)
Example #16
0
File: classify.py  Project: mbasnet1/lpot
def main(config='config/blendcnn/mrpc/eval.json', args=None):
    """Train, evaluate, LPOT-tune, or benchmark a BlendCNN text classifier.

    Behavior is selected by `cfg.mode` ('train'/'eval') and, in eval mode,
    by flags on `args` (tune / int8 / accuracy_only / benchmark).

    NOTE(review): `args` is assumed to be an argparse.Namespace providing
    dataset_location, batch_size, warmup, input_model, tuned_yaml,
    tuned_checkpoint and the boolean flags above -- confirm against caller.
    """
    cfg = Config(**json.load(open(config, "r")))

    # Each sub-config (data / model / optimizer) lives in its own JSON file.
    cfg_data = data.Config(**json.load(open(cfg.cfg_data, "r")))
    cfg_model = models.Config(**json.load(open(cfg.cfg_model, "r")))
    cfg_optim = trainer.Config(**json.load(open(cfg.cfg_optim, "r")))

    set_seeds(cfg.seed)

    TaskDataset = data.get_class(
        cfg_data.task)  # task dataset class according to the task
    tokenizer = tokenization.FullTokenizer(vocab_file=cfg_data.vocab_file,
                                           do_lower_case=True)
    # Text -> tensor pipeline: strip symbols, tokenize, add special tokens
    # with truncation, then index tokens and labels.
    dataset = TaskDataset(
        args.dataset_location,
        pipelines=[
            data.RemoveSymbols('\\'),
            data.Tokenizing(tokenizer.convert_to_unicode, tokenizer.tokenize),
            data.AddSpecialTokensWithTruncation(cfg_data.max_len),
            data.TokenIndexing(tokenizer.convert_tokens_to_ids,
                               TaskDataset.labels, cfg_data.max_len)
        ],
        n_data=None)
    dataset = TensorDataset(*dataset.get_tensors())  # To Tensors
    data_iter = DataLoader(dataset, batch_size=args.batch_size, shuffle=False)

    model = models.BlendCNN(cfg_model, len(TaskDataset.labels))
    checkpoint.load_embedding(model.embed, cfg.pretrain_file)

    optimizer = optim.optim4GPU(cfg_optim, model)

    train_loop = trainer.TrainLoop(cfg_optim, model, data_iter, optimizer,
                                   cfg.save_dir, get_device())

    def get_loss(model, batch,
                 global_step):  # make sure loss is a scalar tensor
        """Cross-entropy loss for one batch (training objective)."""
        input_ids, segment_ids, input_mask, label_id = batch
        logits = model(input_ids, segment_ids, input_mask)
        loss = nn.CrossEntropyLoss()(logits, label_id)
        return loss

    def evaluate(model, batch):
        """Return (mean accuracy, per-example 0/1 correctness) for a batch."""
        input_ids, segment_ids, input_mask, label_id = batch
        logits = model(input_ids, segment_ids, input_mask)
        _, label_pred = logits.max(1)
        result = (label_pred == label_id).float()  #.cpu().numpy()
        accuracy = result.mean()
        return accuracy, result

    class Bert_DataLoader(object):
        """Wraps a DataLoader to yield ((ids, segments, mask), labels) tuples
        with all tensors moved to `device`."""

        def __init__(self,
                     loader=None,
                     model_type=None,
                     device='cpu',
                     batch_size=1):
            self.loader = loader          # underlying torch DataLoader
            self.model_type = model_type  # unused here; kept for interface parity
            self.device = device
            self.batch_size = batch_size

        def __iter__(self):
            for batch in self.loader:
                batch = tuple(t.to(self.device) for t in batch)
                outputs = {
                    'output_all': (batch[0], batch[1], batch[2]),
                    'labels': batch[3]
                }

                yield outputs['output_all'], outputs['labels']

    def benchmark(model):
        """Measure latency/throughput on synthetic batches; optionally
        profile (last batch only) when BLENDCNN_PROFILING is set."""
        total_samples = 0
        total_time = 0
        index = 0

        class RandomDataset(object):
            """Fixed-size dataset of random token ids for benchmarking."""

            def __init__(self, size, shape):
                self.len = size
                # high=30522 matches the BERT-base uncased vocabulary size.
                self.input_ids = torch.randint(low=0,
                                               high=30522,
                                               size=(size, shape),
                                               dtype=torch.int64)
                # NOTE(review): randint's `high` is exclusive, so segment_ids
                # and input_mask are all zeros -- presumably intentional for a
                # single-segment benchmark input; confirm.
                self.segment_ids = torch.randint(low=0,
                                                 high=1,
                                                 size=(size, shape),
                                                 dtype=torch.int64)
                self.input_mask = torch.randint(low=0,
                                                high=1,
                                                size=(size, shape),
                                                dtype=torch.int64)
                self.data = (self.input_ids, self.segment_ids, self.input_mask)

            def __getitem__(self, index):
                return (self.data[0][index], self.data[1][index],
                        self.data[2][index])

            def __len__(self):
                return self.len

        rand_loader = DataLoader(dataset=RandomDataset(size=5000, shape=128),
                                 batch_size=args.batch_size,
                                 shuffle=True)

        for batch in rand_loader:
            index += 1
            tic = time.time()
            if os.environ.get('BLENDCNN_PROFILING') is not None:
                with profiler.profile(record_shapes=True) as prof:
                    with torch.no_grad():
                        input_ids, segment_ids, input_mask = batch
                        _ = model(*batch)
            else:
                with torch.no_grad(
                ):  # evaluation without gradient calculation
                    input_ids, segment_ids, input_mask = batch
                    _ = model(*batch)
            # Skip the first `args.warmup` batches when accumulating timings.
            if index > args.warmup:
                total_samples += batch[0].size()[0]
                total_time += time.time() - tic
        throughput = total_samples / total_time
        print('Latency: %.3f ms' % (1 / throughput * 1000))
        print('Throughput: %.3f images/sec' % (throughput))

        if os.environ.get('BLENDCNN_PROFILING') is not None:
            # `prof` holds the profile of the last iteration only.
            print(prof.key_averages().table(sort_by="cpu_time_total",
                                            row_limit=10))

    def eval_func(model):
        """Evaluate `model` on the real eval set, print latency/throughput
        (post-warmup) and accuracy; return overall accuracy."""
        results = []  # prediction results
        total_samples = 0
        total_time = 0
        index = 0
        model.eval()
        eval_dataloader = Bert_DataLoader(loader=data_iter,
                                          batch_size=args.batch_size)
        for batch, label in eval_dataloader:
            index += 1
            tic = time.time()
            if os.environ.get('BLENDCNN_PROFILING') is not None:
                with profiler.profile(record_shapes=True) as prof:
                    with torch.no_grad():
                        accuracy, result = evaluate(model, (*batch, label))
            else:
                with torch.no_grad(
                ):  # evaluation without gradient calculation
                    accuracy, result = evaluate(model, (*batch, label))
            results.append(result)
            if index > args.warmup:
                total_samples += batch[0].size()[0]
                total_time += time.time() - tic
        total_accuracy = torch.cat(results).mean().item()
        throughput = total_samples / total_time
        print('Latency: %.3f ms' % (1 / throughput * 1000))
        print('Throughput: %.3f samples/sec' % (throughput))
        print('Accuracy: %.3f ' % (total_accuracy))

        if os.environ.get('BLENDCNN_PROFILING') is not None:
            print(prof.key_averages().table(sort_by="cpu_time_total",
                                            row_limit=10))
        return total_accuracy

    if cfg.mode == "train":
        train_loop.train(get_loss, cfg.model_file,
                         None)  # not use pretrain_file
        print("Training has been done properly.")

    elif cfg.mode == "eval":
        # results = train_loop.eval(evaluate, cfg.model_file)
        # total_accuracy = torch.cat(results).mean().item()
        # print(f"Accuracy: {total_accuracy}")

        if args.tune:
            import lpot
            from lpot import common
            # lpot tune
            model.load_state_dict(torch.load(args.input_model))
            eval_dataloader = Bert_DataLoader(loader=data_iter,
                                              batch_size=args.batch_size)

            quantizer = lpot.Quantization(args.tuned_yaml)
            quantizer.model = common.Model(model)
            quantizer.calib_dataloader = eval_dataloader
            quantizer.eval_func = eval_func
            q_model = quantizer()
            q_model.save(args.tuned_checkpoint)

        elif args.int8:
            # Load the previously tuned int8 checkpoint on top of the fp32
            # model skeleton.
            from lpot.utils.pytorch import load
            int8_model = load(
                os.path.abspath(os.path.expanduser(args.tuned_checkpoint)),
                model)
            print(int8_model)
            if args.accuracy_only:
                eval_func(int8_model)
            elif args.benchmark:
                benchmark(int8_model)

        else:
            # Plain fp32 evaluation or benchmark.
            model.load_state_dict(torch.load(args.input_model))
            print(model)
            if args.accuracy_only:
                eval_func(model)
            elif args.benchmark:
                benchmark(model)
Example #17
0
def main(task_name='qqp',
         base_train_cfg='config/QDElectra_pretrain.json',
         train_cfg='config/train_mrpc.json',
         model_cfg='config/QDElectra_base.json',
         train_data_file='GLUE/glue_data/QQP/train.tsv',
         eval_data_file='GLUE/glue_data/QQP/eval.tsv',
         model_file=None,
         data_parallel=True,
         vocab='../uncased_L-12_H-768_A-12/vocab.txt',
         log_dir='../exp/electra/pretrain/runs',
         save_dir='../exp/bert/mrpc',
         distill=True,
         quantize=True,
         gradually_distill=False,
         imitate_tinybert=False,
         pred_distill=True):
    """Fine-tune a (optionally quantized) distilled ELECTRA student on a
    GLUE task, then evaluate it.

    The base pretraining config is overlaid with the task-specific training
    config, tokenization pipelines are built from the BERT vocabulary, and a
    generator / teacher-discriminator / student-discriminator triple feeds a
    distillation trainer.
    """
    check_dirs_exist([log_dir, save_dir])

    # Task-specific training settings override the base pretraining config.
    merged_cfg = json.load(open(base_train_cfg, "r"))
    merged_cfg.update(json.load(open(train_cfg, "r")))
    train_cfg = ElectraConfig().from_dict(merged_cfg)
    model_cfg = ElectraConfig().from_json_file(model_cfg)
    output_mode, train_cfg.n_epochs, max_len = get_task_params(task_name)

    set_seeds(train_cfg.seed)

    tokenizer = tokenization.FullTokenizer(vocab_file=vocab,
                                           do_lower_case=True)
    # Dataset class for this task; its label set sizes the classifier head.
    DatasetCls = dataset_class(task_name)
    model_cfg.num_labels = len(DatasetCls.labels)

    # Shared preprocessing: tokenize, add [CLS]/[SEP] with truncation, index.
    preprocess = [
        Tokenizing(task_name, tokenizer.convert_to_unicode,
                   tokenizer.tokenize),
        AddSpecialTokensWithTruncation(max_len),
        TokenIndexing(tokenizer.convert_tokens_to_ids, DatasetCls.labels,
                      output_mode, max_len),
    ]
    train_iter = DataLoader(DatasetCls(train_data_file, preprocess),
                            batch_size=train_cfg.batch_size,
                            shuffle=True)
    eval_iter = DataLoader(DatasetCls(eval_data_file, preprocess),
                           batch_size=train_cfg.batch_size,
                           shuffle=False)

    # Generator + teacher discriminator + (quantizable) student discriminator.
    generator = ElectraForSequenceClassification.from_pretrained(
        'google/electra-small-generator')
    teacher = ElectraForSequenceClassification.from_pretrained(
        'google/electra-base-discriminator')
    student_cls = (QuantizedElectraForSequenceClassification
                   if quantize else ElectraForSequenceClassification)
    student = student_cls.from_pretrained(
        'google/electra-small-discriminator', config=model_cfg)
    model = DistillElectraForSequenceClassification(generator, teacher,
                                                    student, model_cfg)

    optimizer = optim.optim4GPU(train_cfg, model)
    writer = SummaryWriter(log_dir=log_dir)  # for tensorboardX

    trainer = QuantizedDistillElectraTrainer(
        task_name, output_mode, distill, gradually_distill, imitate_tinybert,
        pred_distill, len(DatasetCls.labels), writer, train_cfg, model_cfg,
        model, train_iter, eval_iter, optimizer, save_dir, get_device())

    trainer.train(model_file, None, data_parallel)
    trainer.eval(model_file, data_parallel)