Example #1
    def __init__(self, eval_path):
        self.tokenizer = Tokenizer(VocabPath)
        self.eval_path = eval_path
        self.eval_lines = []
        self.label_lines = []

        self.__load_data()
Example #2
    def __init__(self):
        self.tokenizer = Tokenizer(VocabPath)
        self.seg = pkuseg.pkuseg()
        self.vocab_size = self.tokenizer._vocab_size
        self.token_pad_id = self.tokenizer._token_pad_id
        self.token_cls_id = self.tokenizer._token_start_id
        self.token_sep_id = self.tokenizer._token_end_id
        self.token_mask_id = self.tokenizer._token_mask_id
Example #3
def random_wrong(text):
    tokenizer = Tokenizer(VocabPath)
    length = len(text)
    position = random.randint(0, length - 1)
    number = random.randint(672, 7992)
    text = list(text)
    text[position] = tokenizer.id_to_token(number)
    text = ''.join(text)
    return text
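
A hedged usage sketch for random_wrong: it assumes Tokenizer and VocabPath are importable from this project, and that the id range 672-7992 used above covers ordinary single-character tokens in the vocabulary.

# Hypothetical usage: corrupt one randomly chosen character.
original = '今天天气很好'           # made-up sample sentence
corrupted = random_wrong(original)
print(original, '->', corrupted)   # one position replaced by a random vocabulary token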
Example #4
    def __init__(self, test_path):
        self.tokenizer = Tokenizer(VocabPath)
        self.test_path = test_path
        self.test_lines = []
        self.label_lines = []
        # Read the data
        with open(self.test_path, 'r', encoding='utf-8') as f:
            for line in f:
                if line:
                    line = line.strip()
                    line_list = line.split('-***-')
                    self.test_lines.append(line_list[1])
                    self.label_lines.append(line_list[0])
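
This snippet expects one sample per line in test_path, with the correct (label) sentence first and the test sentence second, joined by the '-***-' separator. A minimal sketch of that line format, with made-up placeholder text:

# Hypothetical line from test_path; the real data layout is project-specific.
sample_line = 'correct sentence-***-corrupted sentence'
label_text, test_text = sample_line.strip().split('-***-')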
Example #5
    def __init__(self,
                 vocab_size=VocabSize,
                 hidden=HiddenSize,
                 max_len=SentenceLength,
                 num_hidden_layers=HiddenLayerNum,
                 attention_heads=AttentionHeadNum,
                 dropout_prob=DropOut,
                 intermediate_size=IntermediateSize):
        super(SMBertMlm, self).__init__()
        self.vocab_size = vocab_size
        self.hidden_size = hidden
        self.max_len = max_len
        self.num_hidden_layers = num_hidden_layers
        self.attention_head_num = attention_heads
        self.dropout_prob = dropout_prob
        self.attention_head_size = hidden // attention_heads
        self.tokenizer = Tokenizer(VocabPath)
        self.intermediate_size = intermediate_size

        # Declare the network layers
        self.smbert_emd = SMBbertEmbeddings(vocab_size=self.vocab_size,
                                            max_len=self.max_len,
                                            hidden_size=self.hidden_size)
        self.bi_gru = BiGRU(self.hidden_size, self.hidden_size)
        self.sigmoid = nn.Sigmoid()
        self.transformer_blocks = nn.ModuleList(
            Transformer(hidden_size=self.hidden_size,
                        attention_head_num=self.attention_head_num,
                        attention_head_size=self.attention_head_size,
                        intermediate_size=self.intermediate_size).to(device)
            for _ in range(self.num_hidden_layers))
        self.mlm = Mlm(self.hidden_size, self.vocab_size)
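
A minimal construction sketch for the model above, assuming the default constants (VocabSize, HiddenSize, SentenceLength, and so on) and the device object come from the project's config, as they do in the snippet itself:

# Hypothetical instantiation with the configured defaults.
model = SMBertMlm().to(device)
total_params = sum(p.numel() for p in model.parameters())
print('parameters:', total_params)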
Example #6
    def __init__(self, test_path):
        self.tokenizer = Tokenizer(VocabPath)
        self.test_path = test_path
        self.test_lines = []
        self.label_lines = []
        self.labels = []
        # Read the data
        with open(self.test_path, 'r', encoding='utf-8') as f:
            for line in f:
                if line:
                    line_list = line.strip().split('-***-')

                    self.test_lines.append(line_list[0].strip())
                    self.label_lines.append(line_list[1].strip())
                    label = line_list[2].strip()
                    labels = [int(i) for i in label if i != ' ']
                    labels = [0] + labels[:min(len(labels), SentenceLength - 2)] + [0]
                    pad_label_len = SentenceLength - len(labels)
                    labels = labels + [0] * pad_label_len
                    self.labels.append(labels)
Example #7
class RobertaTestSet(Dataset):
    def __init__(self, test_path):
        self.tokenizer = Tokenizer(VocabPath)
        self.test_path = test_path
        self.test_lines = []
        self.label_lines = []
        self.labels = []
        # Read the data
        with open(self.test_path, 'r', encoding='utf-8') as f:
            for line in f:
                if line:
                    line_list = line.strip().split('-***-')

                    self.test_lines.append(line_list[0].strip())
                    self.label_lines.append(line_list[1].strip())
                    label = line_list[2].strip()
                    labels = [int(i) for i in label if i != ' ']
                    labels = [0] + labels[:min(len(labels), SentenceLength - 2)] + [0]
                    pad_label_len = SentenceLength - len(labels)
                    labels = labels + [0] * pad_label_len
                    self.labels.append(labels)

    def __len__(self):
        return len(self.label_lines)

    def __getitem__(self, item):
        output = {}
        test_text = self.test_lines[item]
        label_text = self.label_lines[item]
        labels = self.labels[item]
        test_token = self.__gen_token(test_text)
        label_token = self.__gen_token(label_text)
        segment_ids = [0] * len(label_token)  # single-sentence input: every segment id is 0
        output['input_token_ids'] = test_token
        output['token_ids_labels'] = label_token
        output['segment_ids'] = segment_ids
        output['label'] = labels
        instance = {k: torch.tensor(v, dtype=torch.long) for k, v in output.items()}
        return instance

    def __gen_token(self, tokens):
        tar_token_ids = [101]
        tokens = list(tokens)
        tokens = tokens[:(SentenceLength - 2)]
        for token in tokens:
            token_id = self.tokenizer.token_to_id(token)
            tar_token_ids.append(token_id)
        tar_token_ids.append(102)
        if len(tar_token_ids) < SentenceLength:
            for i in range(SentenceLength - len(tar_token_ids)):
                tar_token_ids.append(0)
        return tar_token_ids
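
Because RobertaTestSet pads every sequence to SentenceLength, it can be batched with a plain DataLoader. A minimal sketch, assuming a test file in the three-field '-***-' format that __init__ expects (the path is hypothetical):

from torch.utils.data import DataLoader

test_set = RobertaTestSet('data/test.txt')            # hypothetical path
test_loader = DataLoader(test_set, batch_size=16, shuffle=False)
for batch in test_loader:
    input_ids = batch['input_token_ids']              # (batch, SentenceLength)
    char_labels = batch['label']                      # (batch, SentenceLength)
    break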
Example #8
class SMBertEvalSet(Dataset):
    def __init__(self, eval_path):
        self.tokenizer = Tokenizer(VocabPath)
        self.eval_path = eval_path
        self.eval_lines = []
        self.label_lines = []

        self.__load_data()

    def __load_data(self):
        # Read the data
        with open(self.eval_path, 'r', encoding='utf-8') as f:
            for line in f:
                if line:
                    line = line.strip()
                    line_list = line.split('-***-')
                    self.eval_lines.append(line_list[1])
                    self.label_lines.append(line_list[0])

    def __gen_token(self, tokens):
        tar_token_ids = [101]
        tokens = list(tokens)
        tokens = tokens[:(SentenceLength - 2)]
        for token in tokens:
            token_id = self.tokenizer.token_to_id(token)
            tar_token_ids.append(token_id)
        tar_token_ids.append(102)
        return tar_token_ids

    def __len__(self):
        return len(self.label_lines)

    def __getitem__(self, item):
        output = {}
        eval_text = self.eval_lines[item]
        label_text = self.label_lines[item]
        eval_token = self.__gen_token(eval_text)
        label_token = self.__gen_token(label_text)
        position_ids = [i for i in range(len(eval_token))]
        segment_ids = [1 if x else 0 for x in label_token]
        output['eval_token'] = eval_token
        output['eval_position'] = position_ids
        output['eval_segment'] = segment_ids
        output['eval_label'] = label_token
        instance = {
            k: torch.tensor(v, dtype=torch.long)
            for k, v in output.items()
        }
        return instance
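
Note that SMBertEvalSet's __gen_token does not pad to SentenceLength, so items in a batch may have different lengths and the default collate function would fail; iterating with batch_size=1 sidesteps that. A minimal sketch with a hypothetical eval path:

from torch.utils.data import DataLoader

eval_set = SMBertEvalSet('data/eval.txt')             # hypothetical path
eval_loader = DataLoader(eval_set, batch_size=1, shuffle=False)
for instance in eval_loader:
    eval_token = instance['eval_token']               # shape (1, seq_len), unpadded
    break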
class DataFactory(object):
    def __init__(self):
        self.tokenizer = Tokenizer(VocabPath)
        self.seg = pkuseg.pkuseg()
        self.vocab_size = self.tokenizer._vocab_size
        self.token_pad_id = self.tokenizer._token_pad_id
        self.token_cls_id = self.tokenizer._token_start_id
        self.token_sep_id = self.tokenizer._token_end_id
        self.token_mask_id = self.tokenizer._token_mask_id

    def __token_process(self, token_id):
        """
        With probability 0.8 the token is replaced by [MASK], with probability 0.1 it is kept
        unchanged, and with probability 0.1 it is replaced by a random token.
        """
        rand = np.random.random()
        if rand <= 0.8:
            return self.token_mask_id
        elif rand <= 0.9:
            return token_id
        else:
            return np.random.randint(0, self.vocab_size)

    def texts_to_ids(self, texts):
        texts_ids = []
        for text in texts:
            # Process each sentence
            for word in text:
                # The first and last tokens are cls and sep; strip them here
                word_tokes = self.tokenizer.tokenize(text=word)[1:-1]
                words_ids = self.tokenizer.tokens_to_ids(word_tokes)
                texts_ids.append(words_ids)
        return texts_ids

    def ids_to_mask(self, texts_ids):
        instances = []
        total_ids = []
        total_masks = []
        # Generate one probability per character/word, used to decide whether to mask it
        mask_rates = np.random.random(len(texts_ids))

        for i, word_id in enumerate(texts_ids):
            # Use the probability generated for this character/word
            total_ids.extend(word_id)
            if mask_rates[i] < MaskRate:
                # word_id may be a single character or a whole word
                for sub_id in word_id:
                    total_masks.append(self.__token_process(sub_id))
            else:
                total_masks.extend([0] * len(word_id))

        # Each instance is at most 512 tokens long, so the paragraph is split into chunks
        # 510 = 512 - 2: two positions are reserved for cls and sep
        for i in range(math.ceil(len(total_ids) / (SentenceLength - 2))):
            tmp_ids = [self.token_cls_id]
            tmp_masks = [self.token_pad_id]
            tmp_ids.extend(
                total_ids[i * (SentenceLength - 2):min((i + 1) *
                                                       (SentenceLength -
                                                        2), len(total_ids))])
            tmp_masks.extend(total_masks[i * (SentenceLength - 2):min(
                (i + 1) * (SentenceLength - 2), len(total_masks))])
            # Pad sequences shorter than 512
            diff = SentenceLength - len(tmp_ids)
            if diff == 1:
                tmp_ids.append(self.token_sep_id)
                tmp_masks.append(self.token_pad_id)
            else:
                # Append the end token
                tmp_ids.append(self.token_sep_id)
                tmp_masks.append(self.token_pad_id)
                # Pad the remainder
                tmp_ids.extend([self.token_pad_id] * (diff - 1))
                tmp_masks.extend([self.token_pad_id] * (diff - 1))
            instances.append([tmp_ids, tmp_masks])
        return instances

    def ids_all_mask(self, texts_ids, tokenid2count):
        instances = []
        tmp_ids = [101]

        # Format the data
        for token_ids in texts_ids:
            if isinstance(token_ids, list):
                for token_id in token_ids:
                    tmp_ids.append(token_id)
                    if len(tmp_ids) == SentenceLength - 1:
                        break
            else:
                tmp_ids.append(token_ids)
                if len(tmp_ids) == SentenceLength - 1:
                    break
            if len(tmp_ids) == SentenceLength - 1:
                break

        tmp_ids.append(102)
        input_length = len(tmp_ids) - 2
        if len(tmp_ids) < SentenceLength:
            for i in range(SentenceLength - len(tmp_ids)):
                tmp_ids.append(0)

        for i in range(1, input_length + 1):
            # If a character occurs rarely, forcibly generate extra training instances for it
            if tokenid2count[tmp_ids[i]] < WordGenTimes:
                for j in range(WordGenTimes - tokenid2count[tmp_ids[i]]):
                    tmp_masks = [0] * SentenceLength
                    rand_num = np.random.randint(672, 7992)
                    tmp_masks[i] = rand_num
                    instances.append([tmp_ids, tmp_masks])
            tmp_masks = [0] * SentenceLength
            if random.random() < RanWrongDivisor:
                rand_num = np.random.randint(672, 7992)
                tmp_masks[i] = rand_num
            else:
                tmp_masks[i] = tmp_ids[i]
            instances.append([tmp_ids, tmp_masks])
        return instances
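
A rough end-to-end sketch of how DataFactory might be driven: raw sentences go through texts_to_ids, and ids_to_mask turns the resulting ids into [token_ids, mask_ids] pairs clipped and padded to SentenceLength. The sample sentences and the call pattern are assumptions; only the method signatures come from the code above.

# Hypothetical driver code; in practice the texts come from a corpus file.
factory = DataFactory()
texts = ['今天天气很好', '明天也不错']                 # made-up sample sentences
texts_ids = factory.texts_to_ids(texts)
instances = factory.ids_to_mask(texts_ids)
for token_ids, mask_ids in instances:
    assert len(token_ids) == SentenceLength and len(mask_ids) == SentenceLength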