def Eval_phase(params,which_files='test',model=None,test_dataloader=None,device=None):
    if(params['is_model']==True):
        print("model previously passed")
        model.eval()
    else:
        return 1
#         ### Have to modify in the final run
#         model=select_model(params['what_bert'],params['path_files'],params['weights'])
#         model.cuda()
#         model.eval()


    print("Running eval on ",which_files,"...")
    t0 = time.time()

    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    # Tracking variables 
    
    true_labels=[]
    pred_labels=[]
    logits_all=[]
    # Evaluate data for one epoch
    for step, batch in tqdm(enumerate(test_dataloader)):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)


        # `batch` contains four pytorch tensors:
        #   [0]: input ids
        #   [1]: attention vals
        #   [2]: attention mask
        #   [3]: labels
        b_input_ids = batch[0].to(device)
        b_att_val = batch[1].to(device)
        b_input_mask = batch[2].to(device)
        b_labels = batch[3].to(device)


        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()        
        outputs = model(b_input_ids,
            attention_vals=b_att_val,
            attention_mask=b_input_mask, 
            labels=None,device=device)
        logits = outputs[0]
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Collect the predicted labels, true labels and raw logits for this batch.
        pred_labels+=list(np.argmax(logits, axis=1).flatten())
        true_labels+=list(label_ids.flatten())
        logits_all+=list(logits)
    
    
    
    # Convert the raw logits to class probabilities (softmax is assumed to be
    # scipy.special.softmax or an equivalent helper imported elsewhere).
    logits_all_final=[]
    for logits in logits_all:
        logits_all_final.append(softmax(logits))
    
    testf1=f1_score(true_labels, pred_labels, average='macro')
    testacc=accuracy_score(true_labels,pred_labels)
    if(params['num_classes']==3):
        testrocauc=roc_auc_score(true_labels, logits_all_final,multi_class='ovo',average='macro')
    else:
        #testrocauc=roc_auc_score(true_labels, logits_all_final,multi_class='ovo',average='macro')
        testrocauc=0
    testprecision=precision_score(true_labels, pred_labels, average='macro')
    testrecall=recall_score(true_labels, pred_labels, average='macro')
    
    if(params['logging']!='neptune' or params['is_model'] == True):
        # Report the final accuracy for this validation run.
        print(" Accuracy: {0:.2f}".format(testacc))
        print(" Fscore: {0:.2f}".format(testf1))
        print(" Precision: {0:.2f}".format(testprecision))
        print(" Recall: {0:.2f}".format(testrecall))
        print(" Roc Auc: {0:.2f}".format(testrocauc))
        print(" Test took: {:}".format(format_time(time.time() - t0)))
        #print(ConfusionMatrix(true_labels,pred_labels))
    else:
        bert_model = params['path_files']
        language  = params['language']
        name_one=bert_model+"_"+language
        neptune.create_experiment(name_one,params=params,send_hardware_metrics=False,run_monitoring_thread=False)
        neptune.append_tag(bert_model)
        neptune.append_tag(language)
        neptune.append_tag('test')
        neptune.log_metric('test_f1score',testf1)
        neptune.log_metric('test_accuracy',testacc)
        neptune.log_metric('test_precision',testprecision)
        neptune.log_metric('test_recall',testrecall)
        neptune.log_metric('test_rocauc',testrocauc)
        neptune.stop()

    return testf1,testacc,testprecision,testrecall,testrocauc,logits_all_final
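
# Illustrative sketch (hypothetical helper, not part of the original pipeline):
# the metric calls in Eval_phase in miniature -- the label-based metrics consume
# argmax predictions, while roc_auc_score consumes the softmaxed logits directly.
# The toy logits and labels below are made up.
def _eval_metrics_sketch():
    import numpy as np
    from scipy.special import softmax
    from sklearn.metrics import (accuracy_score, f1_score, precision_score,
                                 recall_score, roc_auc_score)

    logits_all = np.array([[ 2.0, 0.1, -1.0],
                           [ 0.2, 1.5,  0.3],
                           [-0.5, 0.0,  2.2],
                           [ 1.8, 0.4,  0.1],
                           [ 0.1, 2.0,  0.2],
                           [ 0.0, 0.3,  1.9]])
    true_labels = [0, 1, 2, 0, 1, 2]

    probs = softmax(logits_all, axis=1)           # each row sums to 1
    pred_labels = list(np.argmax(probs, axis=1))  # -> [0, 1, 2, 0, 1, 2]

    return {'f1': f1_score(true_labels, pred_labels, average='macro'),
            'accuracy': accuracy_score(true_labels, pred_labels),
            'precision': precision_score(true_labels, pred_labels, average='macro'),
            'recall': recall_score(true_labels, pred_labels, average='macro'),
            'rocauc': roc_auc_score(true_labels, probs,
                                    multi_class='ovo', average='macro')}
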
def standaloneEval_with_rational(params,
                                 test_data=None,
                                 extra_data_path=None,
                                 topk=2,
                                 use_ext_df=False):
    #     device = torch.device("cpu")
    if torch.cuda.is_available() and params['device'] == 'cuda':
        # Tell PyTorch to use the GPU.
        device = torch.device("cuda")
        deviceID = get_gpu(params)
        torch.cuda.set_device(deviceID[0])
    else:
        print("Since you don't want to use the GPU, using the CPU instead.")
        device = torch.device("cpu")

    embeddings = None
    if (params['bert_tokens']):
        train, val, test = createDatasetSplit(params)
        vocab_own = None
        vocab_size = 0
        padding_idx = 0
    else:
        train, val, test, vocab_own = createDatasetSplit(params)
        params['embed_size'] = vocab_own.embeddings.shape[1]
        params['vocab_size'] = vocab_own.embeddings.shape[0]
        embeddings = vocab_own.embeddings
    if (params['auto_weights']):
        y_test = [ele[2] for ele in test]
        encoder = LabelEncoder()
        encoder.classes_ = np.load('Data/classes.npy')
        # Keyword arguments keep this call compatible with newer scikit-learn releases.
        params['weights'] = class_weight.compute_class_weight(
            'balanced', classes=np.unique(y_test), y=y_test).astype('float32')
    if (extra_data_path != None):
        params_dash = {}
        params_dash['num_classes'] = 3
        params_dash['data_file'] = extra_data_path
        params_dash['class_names'] = dict_data_folder[str(
            params['num_classes'])]['class_label']
        temp_read = get_annotated_data(params_dash)
        with open('Data/post_id_divisions.json', 'r') as fp:
            post_id_dict = json.load(fp)
        temp_read = temp_read[
            temp_read['post_id'].isin(post_id_dict['test'])
            & (temp_read['final_label'].isin(['hatespeech', 'offensive']))]
        test_data = get_test_data(temp_read, params, message='text')
        test_extra = encodeData(test_data, vocab_own, params)
        test_dataloader = combine_features(test_extra, params, is_train=False)
    elif (use_ext_df):
        test_extra = encodeData(test_data, vocab_own, params)
        test_dataloader = combine_features(test_extra, params, is_train=False)
    else:
        test_dataloader = combine_features(test, params, is_train=False)

    model = select_model(params, embeddings)
    if (params['bert_tokens'] == False):
        model = load_model(model, params)
    if (params["device"] == 'cuda'):
        model.cuda()
    model.eval()
    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    # Tracking variables
    if ((extra_data_path != None) or (use_ext_df == True)):
        post_id_all = list(test_data['Post_id'])
    else:
        post_id_all = list(test['Post_id'])

    print("Running eval on test data...")
    t0 = time.time()
    true_labels = []
    pred_labels = []
    logits_all = []
    attention_all = []
    input_mask_all = []

    # Evaluate data for one epoch
    for step, batch in tqdm(enumerate(test_dataloader),
                            total=len(test_dataloader)):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)

        # `batch` contains four pytorch tensors:
        #   [0]: input ids
        #   [1]: attention vals
        #   [2]: attention mask
        #   [3]: labels
        b_input_ids = batch[0].to(device)
        b_att_val = batch[1].to(device)
        b_input_mask = batch[2].to(device)
        b_labels = batch[3].to(device)

        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        #model.zero_grad()
        outputs = model(b_input_ids,
                        attention_vals=b_att_val,
                        attention_mask=b_input_mask,
                        labels=None,
                        device=device)
        #         m = nn.Softmax(dim=1)
        logits = outputs[0]

        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.detach().cpu().numpy()

        if (params['bert_tokens']):
            # outputs[1] is assumed to hold the per-layer attention tensors of
            # shape (batch, heads, seq, seq); take the last layer's attention
            # from the [CLS] position (query index 0) and average it over the
            # heads to get one soft weight per token.
            attention_vectors = np.mean(
                outputs[1][11][:, :, 0, :].detach().cpu().numpy(), axis=1)
        else:
            attention_vectors = outputs[1].detach().cpu().numpy()

        # Collect the predicted labels, true labels, logits, attention vectors
        # and input masks for this batch.
        pred_labels += list(np.argmax(logits, axis=1).flatten())
        true_labels += list(label_ids.flatten())
        logits_all += list(logits)
        attention_all += list(attention_vectors)
        input_mask_all += list(batch[2].detach().cpu().numpy())

    logits_all_final = []
    for logits in logits_all:
        logits_all_final.append(softmax(logits))

    if (use_ext_df == False):
        testf1 = f1_score(true_labels, pred_labels, average='macro')
        testacc = accuracy_score(true_labels, pred_labels)
        #testrocauc=roc_auc_score(true_labels, logits_all_final,multi_class='ovo',average='macro')
        testprecision = precision_score(true_labels,
                                        pred_labels,
                                        average='macro')
        testrecall = recall_score(true_labels, pred_labels, average='macro')

        # Report the final accuracy for this validation run.
        print(" Accuracy: {0:.3f}".format(testacc))
        print(" Fscore: {0:.3f}".format(testf1))
        print(" Precision: {0:.3f}".format(testprecision))
        print(" Recall: {0:.3f}".format(testrecall))
        #print(" Roc Auc: {0:.3f}".format(testrocauc))
        print(" Test took: {:}".format(format_time(time.time() - t0)))

    attention_vector_final = []
    for x, y in zip(attention_all, input_mask_all):
        temp = []
        for x_ele, y_ele in zip(x, y):
            if (y_ele == 1):
                temp.append(x_ele)
        attention_vector_final.append(temp)

    list_dict = []

    for post_id, attention, logits, pred, ground_truth in zip(
            post_id_all, attention_vector_final, logits_all_final, pred_labels,
            true_labels):
        #         if(ground_truth==1):
        #             continue
        temp = {}
        encoder = LabelEncoder()
        encoder.classes_ = np.load('Data/classes.npy')
        pred_label = encoder.inverse_transform([pred])[0]
        ground_label = encoder.inverse_transform([ground_truth])[0]
        temp["annotation_id"] = post_id
        temp["classification"] = pred_label
        temp["classification_scores"] = {
            "hatespeech": logits[0],
            "normal": logits[1],
            "offensive": logits[2]
        }

        topk_indices = sorted(range(len(attention)),
                              key=lambda i: attention[i])[-topk:]

        temp_hard_rationales = []
        for ind in topk_indices:
            temp_hard_rationales.append({
                'end_token': ind + 1,
                'start_token': ind
            })

        temp["rationales"] = [{
            "docid": post_id,
            "hard_rationale_predictions": temp_hard_rationales,
            "soft_rationale_predictions": attention,
            #"soft_sentence_predictions":[1.0],
            "truth": ground_truth
        }]
        list_dict.append(temp)

    return list_dict, test_data
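
# Illustrative sketch (hypothetical helpers, not part of the original code): how
# standaloneEval_with_rational filters attention down to non-padding tokens and
# turns the top-k attention positions into ERASER-style hard rationale spans.
def _mask_attention_sketch(attention, input_mask):
    # keep only the attention weights that fall on real (non-padding) tokens
    return [a for a, m in zip(attention, input_mask) if m == 1]

def _topk_rationales_sketch(attention, topk=2):
    # indices of the `topk` largest attention weights
    topk_indices = sorted(range(len(attention)), key=lambda i: attention[i])[-topk:]
    # each selected token becomes a one-token [start_token, end_token) span
    return [{'start_token': ind, 'end_token': ind + 1} for ind in topk_indices]

# Example:
#   _topk_rationales_sketch([0.10, 0.50, 0.05, 0.30], topk=2)
#   -> [{'start_token': 3, 'end_token': 4}, {'start_token': 1, 'end_token': 2}]
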
def train_model(params,device):
    embeddings=None
    if(params['bert_tokens']):
        train,val,test=createDatasetSplit(params)
    else:
        train,val,test,vocab_own=createDatasetSplit(params)
        params['embed_size']=vocab_own.embeddings.shape[1]
        params['vocab_size']=vocab_own.embeddings.shape[0]
        embeddings=vocab_own.embeddings
    if(params['auto_weights']):
        y_test = [ele[2] for ele in test] 
#         print(y_test)
        encoder = LabelEncoder()
        encoder.classes_ = np.load(params['class_names'],allow_pickle=True)
        # Keyword arguments keep this call compatible with newer scikit-learn releases.
        params['weights']=class_weight.compute_class_weight('balanced',classes=np.unique(y_test),y=y_test).astype('float32')
        #params['weights']=np.array([len(y_test)/y_test.count(encoder.classes_[0]),len(y_test)/y_test.count(encoder.classes_[1]),len(y_test)/y_test.count(encoder.classes_[2])]).astype('float32') 
        
        
    print(params['weights'])
    train_dataloader =combine_features(train,params,is_train=True)   
    validation_dataloader=combine_features(val,params,is_train=False)
    test_dataloader=combine_features(test,params,is_train=False)
    
   
    model=select_model(params,embeddings)
    
    if(params["device"]=='cuda'):
        model.cuda()
    optimizer = AdamW(model.parameters(),
                  lr = params['learning_rate'], # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = params['epsilon'] # args.adam_epsilon  - default is 1e-8.
                )


    # Number of training epochs (authors recommend between 2 and 4)
    # Total number of training steps is number of batches * number of epochs.
    total_steps = len(train_dataloader) * params['epochs']

    # Create the learning rate scheduler.
    if(params['bert_tokens']):
        scheduler = get_linear_schedule_with_warmup(optimizer,
                                                    num_warmup_steps=int(total_steps/10),
                                                    num_training_steps=total_steps)

    # Set the seed value all over the place to make this reproducible.
    fix_the_random(seed_val = params['random_seed'])
    # Store the average loss after each epoch so we can plot them.
    loss_values = []
    if(params['bert_tokens']):
        bert_model = params['path_files']
        name_one=bert_model
    else:
        name_one=params['model_name']
        
    if(params['logging']=='neptune'):
        neptune.create_experiment(name_one,params=params,send_hardware_metrics=False,run_monitoring_thread=False)
        
        neptune.append_tag(name_one)
        if(params['best_params']):
            neptune.append_tag('AAAI final best')
        else:
            neptune.append_tag('AAAI final')
        
    best_val_fscore=0
    best_test_fscore=0

    best_val_roc_auc=0
    best_test_roc_auc=0
    
    best_val_precision=0
    best_test_precision=0
    
    best_val_recall=0
    best_test_recall=0
    
    
    for epoch_i in range(0, params['epochs']):
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, params['epochs']))
        print('Training...')

        # Measure how long the training epoch takes.
        t0 = time.time()

        # Reset the total loss for this epoch.
        total_loss = 0
        model.train()

        # For each batch of training data...
        for step, batch in tqdm(enumerate(train_dataloader)):

            # Progress update every 40 batches.
            if step % 40 == 0 and not step == 0:
                # Calculate elapsed time in minutes.
                elapsed = format_time(time.time() - t0)
                
                
            # `batch` contains four pytorch tensors:
            #   [0]: input ids
            #   [1]: attention vals
            #   [2]: attention mask
            #   [3]: labels
            b_input_ids = batch[0].to(device)
            b_att_val = batch[1].to(device)
            b_input_mask = batch[2].to(device)
            b_labels = batch[3].to(device)

            # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
            model.zero_grad()        
            outputs = model(b_input_ids, 
                attention_vals=b_att_val,
                attention_mask=b_input_mask, 
                labels=b_labels,
                device=device)

            # The call to `model` always returns a tuple, so we need to pull the 
            # loss value out of the tuple.
            
            loss = outputs[0]
           
            if(params['logging']=='neptune'):
                neptune.log_metric('batch_loss',loss.item())
            # Accumulate the training loss over all of the batches so that we can
            # calculate the average loss at the end. `loss` is a Tensor containing a
            # single value; the `.item()` function just returns the Python value 
            # from the tensor.
            total_loss += loss.item()

            # Perform a backward pass to calculate the gradients.
            loss.backward()

            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # Update parameters and take a step using the computed gradient.
            # The optimizer dictates the "update rule"--how the parameters are
            # modified based on their gradients, the learning rate, etc.
            optimizer.step()
            # Update the learning rate.
            if(params['bert_tokens']):
                scheduler.step()
        # Calculate the average loss over the training data.
        avg_train_loss = total_loss / len(train_dataloader)
        if(params['logging']=='neptune'):
            neptune.log_metric('avg_train_loss',avg_train_loss)
        else:
            print('avg_train_loss',avg_train_loss)

        # Store the loss value for plotting the learning curve.
        loss_values.append(avg_train_loss)
        train_fscore,train_accuracy,train_precision,train_recall,train_roc_auc,_=Eval_phase(params,'train',model,train_dataloader,device)
        val_fscore,val_accuracy,val_precision,val_recall,val_roc_auc,_=Eval_phase(params,'val',model,validation_dataloader,device)
        test_fscore,test_accuracy,test_precision,test_recall,test_roc_auc,logits_all_final=Eval_phase(params,'test',model,test_dataloader,device)

        #Report the final accuracy for this validation run.
        if(params['logging']=='neptune'):	
            neptune.log_metric('test_fscore',test_fscore)
            neptune.log_metric('test_accuracy',test_accuracy)
            neptune.log_metric('test_precision',test_precision)
            neptune.log_metric('test_recall',test_recall)
            neptune.log_metric('test_rocauc',test_roc_auc)
            
            neptune.log_metric('val_fscore',val_fscore)
            neptune.log_metric('val_accuracy',val_accuracy)
            neptune.log_metric('val_precision',val_precision)
            neptune.log_metric('val_recall',val_recall)
            neptune.log_metric('val_rocauc',val_roc_auc)
    
            neptune.log_metric('train_fscore',train_fscore)
            neptune.log_metric('train_accuracy',train_accuracy)
            neptune.log_metric('train_precision',train_precision)
            neptune.log_metric('train_recall',train_recall)
            neptune.log_metric('train_rocauc',train_roc_auc)

            
        
    
        if(val_fscore > best_val_fscore):
            print(val_fscore,best_val_fscore)
            best_val_fscore=val_fscore
            best_test_fscore=test_fscore
            best_val_roc_auc = val_roc_auc
            best_test_roc_auc = test_roc_auc
            
            
            best_val_precision = val_precision
            best_test_precision = test_precision
            best_val_recall = val_recall
            best_test_recall = test_recall
            
            if(params['bert_tokens']):
                print('Loading BERT tokenizer...')
                tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=False)
                save_bert_model(model,tokenizer,params)
            else:
                print("Saving model")
                save_normal_model(model,params)

    if(params['logging']=='neptune'):
        neptune.log_metric('best_val_fscore',best_val_fscore)
        neptune.log_metric('best_test_fscore',best_test_fscore)
        neptune.log_metric('best_val_rocauc',best_val_roc_auc)
        neptune.log_metric('best_test_rocauc',best_test_roc_auc)
        neptune.log_metric('best_val_precision',best_val_precision)
        neptune.log_metric('best_test_precision',best_test_precision)
        neptune.log_metric('best_val_recall',best_val_recall)
        neptune.log_metric('best_test_recall',best_test_recall)
        
        neptune.stop()
    else:
        print('best_val_fscore',best_val_fscore)
        print('best_test_fscore',best_test_fscore)
        print('best_val_rocauc',best_val_roc_auc)
        print('best_test_rocauc',best_test_roc_auc)
        print('best_val_precision',best_val_precision)
        print('best_test_precision',best_test_precision)
        print('best_val_recall',best_val_recall)
        print('best_test_recall',best_test_recall)
        
#     del model
#     torch.cuda.empty_cache()
    return model
    # Note: return_probab is a method of a model-wrapper class (hence `self`); it
    # relies on self.params, self.device, self.test, self.vocab and self.model
    # being set up by that class.
    def return_probab(self, sentences_list):
        """Input: a list of sentences. Output: probability values."""
        params = self.params
        device = self.device

        if (params['auto_weights']):
            y_test = [ele[2] for ele in self.test]
            encoder = LabelEncoder()
            encoder.classes_ = np.load('Data/classes.npy')
            # Keyword arguments keep this call compatible with newer scikit-learn releases.
            params['weights'] = class_weight.compute_class_weight(
                'balanced', classes=np.unique(y_test), y=y_test).astype('float32')

        temp_read = transform_dummy_data(sentences_list)
        test_data = get_test_data(temp_read, params, message='text')
        test_extra = encodeData(test_data, self.vocab, params)
        test_dataloader = combine_features(test_extra, params, is_train=False)

        # Put the model in evaluation mode--the dropout layers behave differently
        # during evaluation.
        # Tracking variables
        post_id_all = list(test_data['Post_id'])

        print("Running eval on test data...")
        t0 = time.time()
        true_labels = []
        pred_labels = []
        logits_all = []
        #attention_all=[]
        input_mask_all = []

        # Evaluate data for one epoch
        for step, batch in enumerate(test_dataloader):

            # Progress update every 40 batches.
            if step % 40 == 0 and not step == 0:
                # Calculate elapsed time in minutes.
                elapsed = format_time(time.time() - t0)

            # `batch` contains four pytorch tensors:
            #   [0]: input ids
            #   [1]: attention vals
            #   [2]: attention mask
            #   [3]: labels
            b_input_ids = batch[0].to(device)
            b_att_val = batch[1].to(device)
            b_input_mask = batch[2].to(device)
            b_labels = batch[3].to(device)

            # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
            #model.zero_grad()
            outputs = self.model(b_input_ids,
                                 attention_vals=b_att_val,
                                 attention_mask=b_input_mask,
                                 labels=None,
                                 device=device)
            logits = outputs[0]
            #print(logits)
            # Move logits and labels to CPU
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.detach().cpu().numpy()

            # Collect the predicted labels, true labels, logits and input masks
            # for this batch.
            pred_labels += list(np.argmax(logits, axis=1).flatten())
            true_labels += list(label_ids.flatten())
            logits_all += list(logits)
            #attention_all+=list(attention_vectors)
            input_mask_all += list(batch[2].detach().cpu().numpy())

        logits_all_final = []
        for logits in logits_all:
            logits_all_final.append(list(softmax(logits)))

        return np.array(logits_all_final)
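
# Illustrative sketch (stand-alone, hypothetical): the optimizer and warmup
# scheduler setup performed inside train_model above, shown on a tiny dummy
# model. The layer sizes, learning rate, epsilon and step counts are arbitrary,
# and torch.optim.AdamW stands in for whichever AdamW the training code imports.
def _optimizer_scheduler_sketch():
    import torch
    from transformers import get_linear_schedule_with_warmup

    dummy_model = torch.nn.Linear(10, 3)
    optimizer = torch.optim.AdamW(dummy_model.parameters(), lr=2e-5, eps=1e-8)

    total_steps = 100  # len(train_dataloader) * params['epochs'] in the real code
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(total_steps / 10),  # 10% linear warmup, as above
        num_training_steps=total_steps)          # then linear decay to zero
    return optimizer, scheduler


# Illustrative sketch (hypothetical, stand-alone): converting a batch of raw
# logits into the row-normalised probability matrix that return_probab returns,
# plus the predicted class per sentence. The logits below are made up.
def _probability_matrix_sketch():
    import numpy as np
    from scipy.special import softmax

    logits_all = np.array([[2.0, 0.5, -1.0],
                           [0.1, 0.2, 3.0]])
    probs = np.array([softmax(row) for row in logits_all])  # each row sums to 1
    preds = np.argmax(probs, axis=1)                        # -> array([0, 2])
    return probs, preds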