Example #1
def encoder(sentence, argument):
    """Encode a sentence and tag its argument spans with role labels."""
    label2id, id2label, num_labels = tools.load_schema()
    encode_dict = tokenizer.encode_plus(sentence,
                                        max_length=args.max_length,
                                        pad_to_max_length=True)
    encode_sent = encode_dict['input_ids']
    token_type_ids = encode_dict['token_type_ids']
    attention_mask = encode_dict['attention_mask']
    label = [0 for _ in range(args.max_length)]
    for key, value in argument.items():
        # Encode the argument text, strip the [CLS]/[SEP] tokens added by
        # tokenizer.encode, and locate the span inside the encoded sentence.
        encode_arg = tokenizer.encode(value)
        start_idx = tools.search(encode_arg[1:-1], encode_sent)
        # B-style tag on the first token of the span, I-style tag on the rest.
        label[start_idx] = label2id[key] * 2 + 1
        for i in range(1, len(encode_arg[1:-1])):
            label[start_idx + i] = label2id[key] * 2 + 2
    return encode_sent, token_type_ids, attention_mask, label
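A minimal usage sketch for encoder, assuming tools.load_schema() maps each role name to an id and tools.search() returns the start index of a sub-sequence inside the encoded sentence (both are helpers from this project's data_preprocessing package); the sentence and role values below are hypothetical:

# Hypothetical inputs, for illustration only.
sentence = '小明于2020年在北京加入了示例公司'
argument = {'时间': '2020年', '地点': '北京'}

input_ids, token_type_ids, attention_mask, label = encoder(sentence, argument)
# label holds label2id[role] * 2 + 1 at each span start and
# label2id[role] * 2 + 2 on the remaining span tokens, i.e. a BIO-style
# tag sequence aligned with input_ids.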
Example #2
def load_data(file_path):
    """Build one MRC-style query sentence per (event type, role) pair."""
    event_type_dict = tools.load_schema()
    with open(file_path, 'r', encoding='utf8') as f:
        lines = f.readlines()
        sentences = []
        for line in lines:
            data = json.loads(line)
            text = data['text']
            title = data['title']
            if 'event_list' in data and data['event_list'] != []:
                for event in data['event_list']:
                    event_type = event['event_type']
                    if event_type != '无事件':  # skip the "no event" placeholder type
                        role_list = event_type_dict[event_type]
                        for role in role_list:
                            # Query format: event_type [unused1] role [SEP] original text
                            sent = event_type + '[unused1]' + role + '[SEP]' + text
                            sentences.append(sent)
        return sentences
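The query strings built above embed the literal markers [unused1] and [SEP], so the tokenizer has to keep [unused1] as a single token rather than splitting it; example #5 below registers it via additional_special_tokens. A short sketch of that behavior, with a placeholder model name and made-up schema values:

from transformers import BertTokenizer

# 'bert-base-chinese' is a placeholder for the project's pretrained_model_path.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-chinese',
    additional_special_tokens=['[unused1]', '[unused2]'])

# Made-up event type / role, in the same "type[unused1]role[SEP]text" format.
query = '组织关系-裁员' + '[unused1]' + '裁员方' + '[SEP]' + '某公司宣布裁员百分之十'
print(tokenizer.tokenize(query))  # '[unused1]' and '[SEP]' survive as single tokens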
Example #3
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from utils.arguments_parse import args
import json
import torch
from torch import nn
from torch.utils.data import DataLoader, Dataset
import unicodedata, re
from data_preprocessing import tools
from tqdm import tqdm
from sklearn.utils import shuffle

tokenizer = tools.get_tokenizer()
predicate2id, id2predicate, s_entity_type, o_entity_type, _, _ = tools.load_schema()


def load_data(file_path):
    with open(file_path, 'r', encoding='utf8') as f:
        lines = f.readlines()
        sentences = []
        result = []
        for line in tqdm(lines):
            data = json.loads(line)
            text = data['text']
            # Per-line lookups for subjects, objects and (subject, object) pairs.
            s_dict = {}
            o_dict = {}
            spo_dict = {}
            for spo in data['spo_list']:
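The listing cuts example #3 off inside the spo loop. Purely as a hedged sketch of how the three dictionaries could be filled, with the 'predicate' / 'subject' / 'object' field names assumed from the DuIE-style schema loaded above rather than taken from the source:

def collect_spo(data, s_dict, o_dict, spo_dict):
    # Assumed field names; group subjects, objects and pairs by predicate.
    for spo in data['spo_list']:
        predicate = spo['predicate']
        s_dict.setdefault(predicate, []).append(spo['subject'])
        o_dict.setdefault(predicate, []).append(spo['object'])
        spo_dict.setdefault(predicate, []).append((spo['subject'], spo['object']))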
Example #4
import os
import sys
from typing import Any
from transformers import BertTokenizer, BertModel
import torch
from torch import nn
import pickle
from torch.utils.data import DataLoader, Dataset
from torch import optim
import numpy as np
from data_preprocessing import tools

label2id, id2label, num_labels = tools.load_schema()
num_label = num_labels + 1
tokenizer = tools.get_tokenizer()


class biaffine(nn.Module):
    def __init__(self, in_size, out_size, bias_x=True, bias_y=True):
        super().__init__()
        self.bias_x = bias_x
        self.bias_y = bias_y
        self.out_size = out_size
        # Biaffine scoring tensor; each side gains one extra dimension when the
        # corresponding bias flag is set, so
        # U.shape = [in_size (+1), out_size, in_size (+1)].
        self.U = torch.nn.Parameter(
            torch.Tensor(in_size + int(bias_x), out_size,
                         in_size + int(bias_y)))

    def forward(self, x, y):
        if self.bias_x:
            # Append a constant 1 feature so the bias is absorbed into U.
            x = torch.cat((x, torch.ones_like(x[..., :1])), dim=-1)
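The snippet truncates forward after the x bias concatenation. For completeness, a sketch of the usual biaffine forward pass, under the assumption that the layer should return span scores of shape [batch, seq_len, seq_len, out_size]:

    def forward(self, x, y):
        # x, y: [batch, seq_len, in_size] token representations.
        if self.bias_x:
            x = torch.cat((x, torch.ones_like(x[..., :1])), dim=-1)
        if self.bias_y:
            y = torch.cat((y, torch.ones_like(y[..., :1])), dim=-1)
        # Bilinear map over every (start, end) token pair:
        # [b, x, i] x [i, o, j] x [b, y, j] -> [b, x, y, o]
        bilinear_mapping = torch.einsum('bxi,ioj,byj->bxyo', x, self.U, y)
        return bilinear_mapping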
Example #5
from model.loss_function import multilabel_cross_entropy
from model.metrics import metrics
from data_preprocessing import *
from data_preprocessing import predict_data_prepro
from data_preprocessing import tools
from utils.arguments_parse import args
from transformers import BertTokenizer
import torch
import json
import unicodedata, re
from tqdm import tqdm

device = torch.device('cuda')

# Register the query-separator tokens used when the MRC-style inputs were built.
added_token = ['[unused1]', '[unused2]']
tokenizer = BertTokenizer.from_pretrained(
    args.pretrained_model_path, additional_special_tokens=added_token)
predicate2id, id2predicate = tools.load_schema()

# Load the trained weights and switch to inference mode; bertMRC is the
# project's MRC model class (its import is omitted in the original snippet).
model = bertMRC(pre_train_dir=args.pretrained_model_path,
                dropout_rate=0.5).to(device)
model.load_state_dict(torch.load(args.checkpoints))
model.eval()


def load_data(file_path):
    with open(file_path, 'r', encoding='utf8') as f:
        lines = f.readlines()
        sentences = []
        for line in lines:
            data = json.loads(line)
            sentences.append(data['text'])
        return sentences
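Example #5 sets the model up for inference but the prediction loop itself is not shown. A minimal sketch, assuming bertMRC accepts the usual input_ids / attention_mask / token_type_ids tensors (its real signature lives in the project's model module) and that args.max_length is defined as in example #1:

def predict(sentences):
    # Hedged sketch: the exact bertMRC call signature is an assumption.
    results = []
    with torch.no_grad():
        for sent in tqdm(sentences):
            encode_dict = tokenizer.encode_plus(sent,
                                                max_length=args.max_length,
                                                pad_to_max_length=True)
            input_ids = torch.tensor([encode_dict['input_ids']]).to(device)
            attention_mask = torch.tensor([encode_dict['attention_mask']]).to(device)
            token_type_ids = torch.tensor([encode_dict['token_type_ids']]).to(device)
            logits = model(input_ids, attention_mask, token_type_ids)
            results.append(logits.argmax(dim=-1).cpu().tolist())
    return results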