Example #1
  # Classmethod of POSCorpus; relies on the conllu and chinese_converter packages.
  @classmethod
  def create_from_ud(cls, data_file_list, split_chars=True):
    """Initialize a corpus from a list of paths to files in CoNLL-U format.

    split_chars: if True, split multisyllabic words into individual characters.
    """
    corpus = POSCorpus()
    corpus.sentences = []

    for data_file_path in data_file_list:
      with open(data_file_path, "r", encoding="utf-8") as data_file:
        data = data_file.read()
        data = conllu.parse(data)

      for token_list in data:
        sentence = []
        for token in token_list:
          pos = token['upostag']
          word = token['form']

          if split_chars:
            for char in word:
              sentence.append({'char': chinese_converter.to_simplified(char), 'pos': pos})
          else:
            sentence.append({'word': chinese_converter.to_simplified(word), 'pos': pos})
        if len(sentence) > 0:
          corpus.sentences.append(sentence)

    return corpus
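
A minimal usage sketch for Example #1 (not from the original source): it assumes POSCorpus exposes create_from_ud as a classmethod and uses hypothetical CoNLL-U file paths.

# Hypothetical usage; the .conllu paths are placeholders.
ud_files = ["zh_gsd-ud-train.conllu", "zh_gsd-ud-dev.conllu"]
corpus = POSCorpus.create_from_ud(ud_files, split_chars=True)
print(len(corpus.sentences))     # number of parsed sentences
print(corpus.sentences[0][:3])   # e.g. [{'char': '...', 'pos': 'NOUN'}, ...]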
Example #2
    def test_two_way(self):
        with self.subTest("should get same result when translated both ways"):
            trad = '皇后與國王在後面共同候車'
            result = to_traditional(to_simplified(trad))

            self.assertEqual(trad, result)

            simp = '皇后与国王在后面共同候车后'
            result = to_simplified(to_traditional(simp))

            self.assertEqual(simp, result)
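
The test above calls to_simplified and to_traditional as bare names; a minimal sketch of the import it presumably relies on (both functions are part of the chinese_converter package):

from chinese_converter import to_simplified, to_traditional

print(to_simplified('車'))    # expected: 车
print(to_traditional('车'))   # expected: 車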
Example #3
def process_data(label, text):
    # Split `text` into chunks shorter than 511 characters, tokenize each chunk
    # with bert-base-chinese, and build per-token 0/1 labels marking where the
    # keywords listed in `label` occur.
    # Requires: re, numpy as np, torch, chinese_converter, transformers.BertTokenizer.
    data = []
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese',
                                              do_lower_case=True)
    text = re.sub(
        '[a-zA-Z0-9 ’!"#$%&\'()*+,-./:;<=>?@?★…【】《》?“”‘’![\\]^_`{|}~]', '',
        text)
    # Split on '。' and re-join sentences into chunks under 511 characters
    # (BERT accepts at most 512 tokens per input).
    text_split = re.split('。', text)
    text_split_join = []
    temp = ''
    for i in text_split:
        if len(temp) + len(i) < 511:
            temp += i
        else:
            if len(temp) < 511:
                text_split_join.append(temp)
            temp = i
    if temp and len(temp) < 511:
        text_split_join.append(temp)

    for text in text_split_join:
        # Normalize the chunk to simplified Chinese before tokenizing.
        text = chinese_converter.to_simplified(text)

        encoded_text = tokenizer(text,
                                 padding=True,
                                 return_tensors="pt",
                                 add_special_tokens=True)
        encoded_text = np.array(encoded_text["input_ids"])[0]

        # An empty label list ("[]") means no keyword tokens: every label stays 0.
        if label[0:2] == '[]':
            label_cls = [0] * len(encoded_text)
        else:
            if isinstance(label, str):
                label = label.split(', ')
            label_cls = [0] * len(encoded_text)
            # Each entry looks like "'keyword'": take the text between the quotes
            # and convert it to simplified Chinese.
            for i in label:
                i = i.split('\'')
                i = chinese_converter.to_simplified(i[1])
                encode_i = tokenizer(i,
                                     padding=True,
                                     return_tensors="pt",
                                     add_special_tokens=True)
                encode_i = np.array(encode_i['input_ids'][0][1:-1])
                # Mark every token covered by an occurrence of the keyword's id sequence.
                for j in range(0, len(encoded_text) - len(encode_i)):
                    if (encoded_text[j:j + len(encode_i)] == encode_i).all():
                        for k in range(0, len(encode_i)):
                            label_cls[j + k] = 1
        data.append(
            (torch.tensor(encoded_text), torch.tensor(np.array(label_cls))))
    return data
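
A hedged usage sketch for process_data (not from the original source); the label string format "['...', '...']" is inferred from how the function splits it, and the sample text is illustrative.

# Hypothetical call; `label` mimics the quoted-list string the function parses.
label = "['台北', '車站']"
text = "我在台北車站等車。今天人很多。"
pairs = process_data(label, text)
for input_ids, token_labels in pairs:
    print(input_ids.shape, token_labels.sum().item())  # count of tokens flagged as keyword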
Example #4
def make_data(a, b):
    # Same preprocessing as process_data above, but reads the label and text from
    # files training_set/a.txt .. training_set/(b-1).txt.
    # Requires: re, numpy as np, torch, chinese_converter,
    # transformers.BertModel and transformers.BertTokenizer.
    data = []
    model = BertModel.from_pretrained('bert-base-chinese')  # note: loaded but not used below
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese', do_lower_case=True)
    for i in range(a, b):
        f = open("training_set/" + str(i) + ".txt", "r")
        lines = f.readlines()

        label = lines[0]
        text = lines[1] if len(lines) == 2 else ''.join(lines[2:])
        f.close()
        text = re.sub('[a-zA-Z0-9 ’!"#$%&\'()*+,-./:;<=>?@?★…【】《》?“”‘’![\\]^_`{|}~]','',text)
        text_split = re.split('。', text)
        text_split_join = []
        temp = ''
        for i in text_split:
            if len(temp) + len(i) < 511:
                temp += i
            else:
                if len(temp) < 511:
                    text_split_join.append(temp)
                temp = i
        if temp and len(temp) < 511:
            text_split_join.append(temp)

        for text in text_split_join:
            text = chinese_converter.to_simplified(text)

            encoded_text = tokenizer(text, padding=True, return_tensors="pt", add_special_tokens=True)  # automatically adds the special tokens ERNIE needs, e.g. [CLS], [SEP]
            encoded_text = np.array(encoded_text["input_ids"])[0]  # [:MAX_SEQLEN]
            # encoded_text = np.pad(encoded_text, (0, MAX_SEQLEN-len(encoded_text)), mode='constant')  # pad every sentence to 11000, which wastes a lot of GPU memory

            if label[0:2] == '[]':
                label_cls = [0] * len(encoded_text)
            else:
                if isinstance(label, str):
                    label = label.split(', ')
                label_cls = [0] * len(encoded_text)
                for i in label:
                    i = i.split('\'')
                    i = chinese_converter.to_simplified(i[1])
                    encode_i = tokenizer(i, padding=True, return_tensors="pt", add_special_tokens=True)
                    encode_i = np.array(encode_i['input_ids'][0][1:-1])
                    for j in range(0, len(encoded_text)-len(encode_i)):
                        if (encoded_text[j:j+len(encode_i)] == encode_i).all():
                            for k in range(0, len(encode_i)):
                                label_cls[j+k] = 1
            data.append((torch.tensor(encoded_text), torch.tensor(np.array(label_cls))))
    return data
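
make_data reads training_set/a.txt through training_set/(b-1).txt; a hedged sketch of the expected file layout and a call, with all contents hypothetical.

# Illustrative layout of training_set/1.txt:
#   line 1: ['台北', '車站']        <- label string
#   line 2: (ignored when the file has more than two lines)
#   line 3+: 我在台北車站等車。...   <- article text
#
# Hypothetical call covering files 1.txt, 2.txt and 3.txt:
train_pairs = make_data(1, 4)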
Example #5
def inference(text, model):
    # Chunk `text`, run the token-classification `model` on each chunk, and collect
    # contiguous character spans whose tokens are predicted positive.
    # Requires: re, numpy as np, torch, chinese_converter, transformers.BertTokenizer.
    data = []
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese',
                                              do_lower_case=True)
    text_split = re.split('。', text)
    text_split_join = []
    temp = ''
    for i in text_split:
        if len(temp) + len(i) < 511:
            temp += i
        else:
            if len(temp) < 511:
                text_split_join.append(temp)
            temp = i
    if temp and len(temp) < 511:
        text_split_join.append(temp)
    for text in text_split_join:
        text = re.sub(
            '[a-zA-Z0-9 ’!"#$%&\'()*+,-./:;<=>?@?★…【】《》?“”‘’![\\]^_`{|}~]', '',
            text)

        text_convert = chinese_converter.to_simplified(text)
        encoded_text = tokenizer(text_convert,
                                 padding=True,
                                 return_tensors="pt",
                                 add_special_tokens=True)
        encoded_text = np.array(encoded_text["input_ids"])[0]
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        outputs = model(input_ids=torch.tensor([encoded_text]).to((device)))
        logits = outputs[0][0][1:-1]  # drop the [CLS] and [SEP] positions

        # Extend the current span while consecutive tokens are predicted positive;
        # otherwise start a new candidate span.
        index = 0
        for i in range(0, len(logits)):
            if logits[i][1] > logits[i][0]:
                if data and i == index + 1:
                    data[-1] += text[i]
                else:
                    data.append(text[i])
                index = i
    # Deduplicate and keep only spans of 2-4 characters.
    data = set(data)
    res = []
    for i in data:
        if 2 <= len(i) <= 4:
            res.append(i)
    return res
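
A hedged usage sketch for inference (not from the original source): the model is assumed to be a two-label token-classification head (e.g. BertForTokenClassification), which is a guess consistent with how outputs[0] is indexed above, and the checkpoint file name is made up.

import torch
from transformers import BertForTokenClassification

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = BertForTokenClassification.from_pretrained("bert-base-chinese", num_labels=2)
model.load_state_dict(torch.load("keyword_model.pt", map_location=device))  # hypothetical checkpoint
model.to(device).eval()

keywords = inference("我在台北車站等車。今天人很多。", model)
print(keywords)   # spans of 2-4 characters predicted as keywords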
Example #6
def testing(text, model):
    # Same flow as inference above, but hard-codes the 'cuda:0' device, prints the
    # length-filtered spans, and returns the unfiltered set of predicted spans.
    data = []
    tokenizer = BertTokenizer.from_pretrained('bert-base-chinese', do_lower_case=True)
    text_split = re.split('。', text)
    text_split_join = []
    temp = ''
    for i in text_split:
        if len(temp) + len(i) < 511:
            temp += i
        else:
            if len(temp) < 511:
                text_split_join.append(temp)
            temp = i
    if temp and len(temp) < 511:
        text_split_join.append(temp)
    for text in text_split_join:
        text = re.sub('[a-zA-Z0-9 ’!"#$%&\'()*+,-./:;<=>?@?★…【】《》?“”‘’![\\]^_`{|}~]','',text)
        # print(len(text),text)
        text_convert = chinese_converter.to_simplified(text)
        encoded_text = tokenizer(text_convert, padding=True, return_tensors="pt", add_special_tokens=True)
        encoded_text = np.array(encoded_text["input_ids"])[0]
        # print(encoded_text)
        # predicted_index = encoded_text
        # predicted_token = [tokenizer.convert_ids_to_tokens([predicted_index[x]])[0] for x in
        #            range(1, (len(encoded_text) - 1))]
        # print(predicted_token)
        outputs = model(input_ids=torch.tensor([encoded_text]).to('cuda:0'))
        logits = outputs[0][0][1:-1]
        # print(len(logits))
        index = 0
        for i in range(0, len(logits)):
            if logits[i][1] > logits[i][0]:
                if data and i == index + 1:
                    data[-1] += text[i]
                else:
                    data.append(text[i])
                index = i
    data = set(data)
    res = []
    for i in data:
        if 2 <= len(i) <= 4:
            res.append(i)
    print(res)  # spans of 2-4 characters
    return set(data)  # note: returns all predicted spans, not the filtered `res`
Example #7
    # Assumes the alias `import chinese_converter as cc`.
    def _to_simplified(self, x):
        return [cc.to_simplified(proc) for proc in x]
    def test_to_simplified(self):
        result = to_simplified('皇后與國王在後面共同候車吃麵')

        self.assertEqual(result, '皇后与国王在后面共同候车吃面')
    def test_other_language(self):
        text = "this is a book."

        with self.subTest("should not change text in other languages"):
            self.assertEqual(text, to_simplified(text))
            self.assertEqual(text, to_traditional(text))
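
For reference, a minimal standalone sketch of the two chinese_converter call styles used across these examples (module-level calls and direct imports); the sample strings are illustrative.

import chinese_converter
from chinese_converter import to_simplified, to_traditional

print(chinese_converter.to_simplified("國王"))   # expected: 国王
print(to_traditional("国王"))                    # expected: 國王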