Example No. 1
from xml.dom import minidom

# segment is imported as elsewhere in these examples; the import path for
# split_2_short_text is assumed here
from pycorrector.tokenizer import segment, split_2_short_text


def parse_xml_file(path):
    print('Parse data from %s' % path)
    data_list = []
    dom_tree = minidom.parse(path)
    docs = dom_tree.documentElement.getElementsByTagName('DOC')
    for doc in docs:
        # Read the original text
        text = doc.getElementsByTagName('TEXT')[0]. \
            childNodes[0].data.strip()
        # Read the corrected text
        correction = doc.getElementsByTagName('CORRECTION')[0]. \
            childNodes[0].data.strip()

        texts = split_2_short_text(text)
        corrections = split_2_short_text(correction)
        if len(texts) != len(corrections):
            print('error:' + text + '\t' + correction)
            continue
        for short_text, short_correction in zip(texts, corrections):
            if len(short_text) > 40:
                print('error:' + short_text + '\t' + short_correction)
                continue
            source = segment(short_text, cut_type='char')
            target = segment(short_correction, cut_type='char')
            pair = [source, target]
            # Skip exact duplicates
            if pair not in data_list:
                data_list.append(pair)
    return data_list
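
For reference, this parser expects each DOC element to hold a TEXT node and its CORRECTION; the root tag name is never inspected. A minimal usage sketch, where the root tag and temp-file handling are illustrative assumptions:

import tempfile

sample = '''<?xml version="1.0" encoding="utf-8"?>
<DOCS>
  <DOC>
    <TEXT>少先队员因该为老人让坐</TEXT>
    <CORRECTION>少先队员应该为老人让座</CORRECTION>
  </DOC>
</DOCS>'''

# Write the sample to a temporary file and parse it
with tempfile.NamedTemporaryFile('w', suffix='.xml', encoding='utf-8',
                                 delete=False) as tmp:
    tmp.write(sample)

pairs = parse_xml_file(tmp.name)  # [[source_chars, target_chars], ...]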
Example No. 2
from xml.dom import minidom

from pycorrector.tokenizer import segment


def parse_xml_file(path):
    print('Parse data from %s' % path)
    data_list = []
    dom_tree = minidom.parse(path)
    docs = dom_tree.documentElement.getElementsByTagName('DOC')
    for doc in docs:
        # Read the original text
        text = doc.getElementsByTagName('TEXT')[0]. \
            childNodes[0].data.strip()
        # Read the corrected text
        correction = doc.getElementsByTagName('CORRECTION')[0]. \
            childNodes[0].data.strip()
        # Segment
        source = segment(text, cut_type='char')
        target = segment(correction, cut_type='char')
        data_list.append([source, target])
    return data_list
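
This variant keeps every TEXT/CORRECTION pair as-is, without the sentence splitting, length filtering, and deduplication that Example No. 1 applies; the XML shape sketched there works here unchanged.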
Example No. 3
from pycorrector.tokenizer import segment


def parse_txt_file(input_path, truth_path):
    print('Parse data from %s and %s' % (input_path, truth_path))
    id_lst, word_lst, label_lst = [], [], []
    # read truth file
    truth_dict = {}
    with open(truth_path, 'r', encoding='utf-8') as truth_f:
        for line in truth_f:
            parts = line.strip().split(',')
            # Locate the error position
            locate_dict = {}
            if len(parts) == 4:
                text_id = parts[0]
                start_off = int(parts[1])
                end_off = int(parts[2])
                error_type = parts[3].strip()
                # Offsets are 1-based and inclusive; label each covered
                # character index with the raw error type. (A BIO scheme
                # would use 'B-'/'I-' prefixes instead.)
                for i in range(start_off - 1, end_off):
                    locate_dict[i] = error_type
                if text_id in truth_dict:
                    truth_dict[text_id].update(locate_dict)
                else:
                    truth_dict[text_id] = locate_dict

    # read input file and get token
    with open(input_path, 'r', encoding='utf-8') as input_f:
        for line in input_f:
            parts = line.strip().split('\t')
            text_id = parts[0].replace('(sid=', '').replace(')', '')
            text = parts[1]
            # segment with pos
            word_seq, pos_seq = segment(text, cut_type='char', pos=True)
            word_arr, label_arr = [], []
            if text_id in truth_dict:
                locate_dict = truth_dict[text_id]
                for i, word in enumerate(word_seq):
                    word_arr.append(word)
                    # Use the error type where one is annotated, else the POS tag
                    label_arr.append(locate_dict.get(i, pos_seq[i]))
            else:
                word_arr = word_seq
                label_arr = pos_seq
            id_lst.append(text_id)
            word_lst.append(word_arr)
            label_lst.append(label_arr)
    return id_lst, word_lst, label_lst
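
The two files follow the plain-text conventions the parser splits on: each truth line is text_id,start_off,end_off,error_type and each input line is (sid=text_id)<TAB>text, with 1-based inclusive offsets. A hedged usage sketch, where the id, offsets, and error type 'S' are illustrative only:

import tempfile

# Hypothetical truth file: chars 5-6 (因该) carry error type 'S'
with tempfile.NamedTemporaryFile('w', suffix='.txt', encoding='utf-8',
                                 delete=False) as truth_f:
    truth_f.write('A2-0001-1,5,6,S\n')
# Hypothetical input file: (sid=...)<TAB>sentence
with tempfile.NamedTemporaryFile('w', suffix='.txt', encoding='utf-8',
                                 delete=False) as input_f:
    input_f.write('(sid=A2-0001-1)\t少先队员因该为老人让坐\n')

ids, words, labels = parse_txt_file(input_f.name, truth_f.name)
print(ids[0], list(zip(words[0], labels[0])))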
Example No. 4
from xml.dom import minidom

from pycorrector.tokenizer import segment


def parse_xml_file(path):
    print('Parse data from %s' % path)
    word_arr = []
    with open(path, 'r', encoding='utf-8') as f:
        dom_tree = minidom.parse(f)
    docs = dom_tree.documentElement.getElementsByTagName('DOC')
    for doc in docs:
        # Read the corrected text
        text = doc.getElementsByTagName('CORRECTION')[0]. \
            childNodes[0].data.strip()
        # Segment
        word_seq = segment(text, cut_type='char', pos=False)
        word_arr.append(word_seq)
    return word_arr
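
Note that this variant reads only the CORRECTION nodes, so it yields the corrected sentences alone (as character sequences) rather than source/target pairs.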
Example No. 5
from xml.dom import minidom

from pycorrector.tokenizer import segment


def parse_xml_file(path):
    print('Parse data from %s' % path)
    id_lst, word_lst, label_lst = [], [], []
    with open(path, 'r', encoding='utf-8') as f:
        dom_tree = minidom.parse(f)
    docs = dom_tree.documentElement.getElementsByTagName('DOC')
    for doc in docs:
        # Read the original text
        text = doc.getElementsByTagName('TEXT')[0]. \
            childNodes[0].data.strip()
        text_id = doc.getElementsByTagName('TEXT')[0].getAttribute('id')
        errors = doc.getElementsByTagName('ERROR')
        # Locate the error position and error type
        locate_dict = {}
        for error in errors:
            start_off = error.getAttribute('start_off')
            end_off = error.getAttribute('end_off')
            error_type = error.getAttribute('type')
            # Offsets are 1-based and inclusive; label each covered character
            # index with the raw error type. (A BIO scheme would use
            # 'B-'/'I-' prefixes instead.)
            for i in range(int(start_off) - 1, int(end_off)):
                locate_dict[i] = error_type
        # Segment with pos
        word_seq, pos_seq = segment(text, cut_type='char', pos=True)
        word_arr, label_arr = [], []
        for i, word in enumerate(word_seq):
            word_arr.append(word)
            # Use the error type where one is annotated, else the POS tag
            label_arr.append(locate_dict.get(i, pos_seq[i]))
        id_lst.append(text_id)
        word_lst.append(word_arr)
        label_lst.append(label_arr)
    return id_lst, word_lst, label_lst
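
This parser expects annotated XML where TEXT carries an id attribute and each ERROR element supplies 1-based start_off/end_off offsets plus a type. A minimal sketch of such input, with the root tag, id, and error type as illustrative assumptions:

import tempfile

sample = '''<?xml version="1.0" encoding="utf-8"?>
<DOCS>
  <DOC>
    <TEXT id="A2-0001-1">少先队员因该为老人让坐</TEXT>
    <ERROR start_off="5" end_off="6" type="S"></ERROR>
  </DOC>
</DOCS>'''

with tempfile.NamedTemporaryFile('w', suffix='.xml', encoding='utf-8',
                                 delete=False) as tmp:
    tmp.write(sample)

ids, words, labels = parse_xml_file(tmp.name)
print(ids[0], list(zip(words[0], labels[0])))  # 因该 labeled 'S', rest POS tags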
Example No. 6
from pycorrector.detector import Detector
from pycorrector.tokenizer import segment

error_sentences = [
    '我对于宠物出租得事非常认同,因为其实很多人喜欢宠物',  # correction: 出租得事 => 出租的事
    '有了宠物出租地方另一方面还可以题高人类对动物的了解,因为那些专业人氏可以指导我们对于动物的习惯。',
    # corrections: 题高 => 提高, 专业人氏 => 专业人士
    '三个凑皮匠胜过一个诸葛亮也有道理。',  # expected detection: [['三个凑皮匠', '三个臭皮匠', 0, 5]]
    '还有广告业是只要桌子前面坐者工作未必产生出来好的成果。',
]


d = Detector(enable_rnnlm=False)
for sent in error_sentences:
    print(sent, d.detect(sent))


sent1 = '少先队员应该为老人让座'
sent_seg = segment(sent1)
ppl = d.ppl_score(sent_seg)
print(sent1, 'ppl_score:', ppl)

# The misspelled variant should score a higher (worse) perplexity
sent2 = '少先队员因该为老人让坐'
sent_seg = segment(sent2)
ppl = d.ppl_score(sent_seg)
print(sent2, 'ppl_score:', ppl)

print(sent1, d.detect(sent1))
print(sent2, d.detect(sent2))

freq = d.word_frequency('龟龙麟凤')
print('freq:', freq)

Example No. 7
# -*- coding: utf-8 -*-
# Author: XuMing <*****@*****.**>
# Brief: demo of pycorrector's detection and correction APIs
import pycorrector
from pycorrector.tokenizer import segment

c = pycorrector.get_same_pinyin('长')
print('get_same_pinyin:', c)

c = pycorrector.get_same_stroke('长')
print('get_same_stroke:', c)

freq = pycorrector.word_frequency('龟龙麟凤')
print('freq:', freq)

sent = '少先队员应该为老人让座'
sent_seg = segment(sent)
ppl = pycorrector.ppl_score(sent_seg)
print('ppl_score:', ppl)

sent = '少先队员因该为老人让坐'
sent_seg = segment(sent)
ppl = pycorrector.ppl_score(sent_seg)
print('ppl_score:', ppl)

print(pycorrector.detect(sent))

corrected_sent, detail = pycorrector.correct('少先队员因该为老人让坐')
print(corrected_sent, detail)
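
The detail list reports each fix as (wrong, right, begin_idx, end_idx), the same layout as the detection annotation in Example No. 6, so this call should flag 因该 => 应该 and 坐 => 座.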
Example No. 8
# -*- coding: utf-8 -*-
# Author: XuMing <*****@*****.**>
# Brief: demo of pycorrector's text utilities (conversion, pinyin, homophones)
from pypinyin import lazy_pinyin

from pycorrector.utils.text_utils import traditional2simplified, simplified2traditional
from pycorrector.utils.text_utils import get_homophones_by_char, get_homophones_by_pinyin
from pycorrector.tokenizer import segment

traditional_sentence = '憂郁的臺灣烏龜'
simplified_sentence = traditional2simplified(traditional_sentence)
print(simplified_sentence)

simplified_sentence = '忧郁的台湾乌龟'
traditional_sentence = simplified2traditional(simplified_sentence)
print(traditional_sentence)

print(lazy_pinyin('中心'))  # pinyin without tone marks

print(segment('小姑娘蹦蹦跳跳的去了她外公家'))

pron = get_homophones_by_char('长')
print('get_homophones_by_char:', pron)

pron = get_homophones_by_pinyin('zha1ng')  # pinyin with the tone digit inline
print('get_homophones_by_pinyin:', pron)