from xml.dom import minidom

from pycorrector.tokenizer import segment


def parse_xml_file(path):
    print('Parse data from %s' % path)
    data_list = []
    dom_tree = minidom.parse(path)
    docs = dom_tree.documentElement.getElementsByTagName('DOC')
    for doc in docs:
        # Input the text
        text = doc.getElementsByTagName('TEXT')[0]. \
            childNodes[0].data.strip()
        # Input the correct text
        correction = doc.getElementsByTagName('CORRECTION')[0]. \
            childNodes[0].data.strip()
        # Split long sentences into short clauses (see the sketch below)
        texts = split_2_short_text(text)
        corrections = split_2_short_text(correction)
        if len(texts) != len(corrections):
            print('error:' + text + '\t' + correction)
            continue
        for i in range(len(texts)):
            # Skip overly long clauses
            if len(texts[i]) > 40:
                print('error:' + texts[i] + '\t' + corrections[i])
                continue
            # Character-level segmentation of source and target
            source = segment(texts[i], cut_type='char')
            target = segment(corrections[i], cut_type='char')
            pair = [source, target]
            # Deduplicate pairs
            if pair not in data_list:
                data_list.append(pair)
    return data_list
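# parse_xml_file above calls a split_2_short_text helper that is not shown in
# this section. A minimal sketch of such a splitter, assuming it simply cuts on
# common clause-ending punctuation (the project's own helper may differ):
import re


def split_2_short_text(text):
    """Split a long sentence into short clauses on Chinese/ASCII punctuation.

    Hypothetical re-implementation for illustration only.
    """
    return [s for s in re.split(r'[，。！？；,.!?;]', text) if s]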
from xml.dom import minidom

from pycorrector.tokenizer import segment


def parse_xml_file(path):
    print('Parse data from %s' % path)
    data_list = []
    dom_tree = minidom.parse(path)
    docs = dom_tree.documentElement.getElementsByTagName('DOC')
    for doc in docs:
        # Input the text
        text = doc.getElementsByTagName('TEXT')[0]. \
            childNodes[0].data.strip()
        # Input the correct text
        correction = doc.getElementsByTagName('CORRECTION')[0]. \
            childNodes[0].data.strip()
        # Character-level segmentation of source and target
        source = segment(text, cut_type='char')
        target = segment(correction, cut_type='char')
        data_list.append([source, target])
    return data_list
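# Both parsers above return [source_tokens, target_tokens] pairs. A minimal
# sketch of dumping them to parallel src/tgt files for a seq2seq trainer; the
# function name, file names, and space-joined layout are assumptions, not the
# project's own output format:
def save_pairs(data_list, src_path='train.src', tgt_path='train.tgt'):
    """Write one sentence per line, tokens separated by spaces."""
    with open(src_path, 'w', encoding='utf-8') as fs, \
            open(tgt_path, 'w', encoding='utf-8') as ft:
        for source, target in data_list:
            fs.write(' '.join(source) + '\n')
            ft.write(' '.join(target) + '\n')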
from pycorrector.tokenizer import segment


def parse_txt_file(input_path, truth_path):
    print('Parse data from %s and %s' % (input_path, truth_path))
    id_lst, word_lst, label_lst = [], [], []
    # Read the truth file
    truth_dict = {}
    with open(truth_path, 'r', encoding='utf-8') as truth_f:
        for line in truth_f:
            parts = line.strip().split(',')
            # Locate the error position
            locate_dict = {}
            if len(parts) == 4:
                text_id = parts[0]
                start_off = parts[1]
                end_off = parts[2]
                error_type = parts[3].strip()
                # Alternatively use BIO tags: 'B-' + error_type for the first
                # character and 'I-' + error_type for the rest
                for i in range(int(start_off) - 1, int(end_off)):
                    locate_dict[i] = error_type
                if text_id in truth_dict:
                    truth_dict[text_id].update(locate_dict)
                else:
                    truth_dict[text_id] = locate_dict
    # Read the input file and get tokens
    with open(input_path, 'r', encoding='utf-8') as input_f:
        for line in input_f:
            parts = line.strip().split('\t')
            text_id = parts[0].replace('(sid=', '').replace(')', '')
            text = parts[1]
            # Segment with POS tags
            word_seq, pos_seq = segment(text, cut_type='char', pos=True)
            word_arr, label_arr = [], []
            if text_id in truth_dict:
                locate_dict = truth_dict[text_id]
                for i in range(len(word_seq)):
                    word_arr.append(word_seq[i])
                    if i in locate_dict:
                        # Fill with the error type
                        label_arr.append(locate_dict[i])
                    else:
                        # Fill with the POS tag
                        label_arr.append(pos_seq[i])
            else:
                word_arr = word_seq
                label_arr = pos_seq
            id_lst.append(text_id)
            word_lst.append(word_arr)
            label_lst.append(label_arr)
    return id_lst, word_lst, label_lst
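# parse_txt_file (and the XML parsers below) return aligned id / character /
# label lists for a sequence-labeling trainer. A minimal sketch of writing them
# out in a CoNLL-like one-character-per-line layout; the function name and file
# format are assumptions, not part of the project:
def save_label_data(word_lst, label_lst, out_path):
    """Write 'char<TAB>label' lines with a blank line between sentences."""
    with open(out_path, 'w', encoding='utf-8') as f:
        for words, labels in zip(word_lst, label_lst):
            for w, t in zip(words, labels):
                f.write('%s\t%s\n' % (w, t))
            f.write('\n')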
from xml.dom import minidom

from pycorrector.tokenizer import segment


def parse_xml_file(path):
    print('Parse data from %s' % path)
    word_arr = []
    with open(path, 'r', encoding='utf-8') as f:
        dom_tree = minidom.parse(f)
    docs = dom_tree.documentElement.getElementsByTagName('DOC')
    for doc in docs:
        # Input the correct text
        text = doc.getElementsByTagName('CORRECTION')[0]. \
            childNodes[0].data.strip()
        # Character-level segmentation
        word_seq = segment(text, cut_type='char', pos=False)
        word_arr.append(word_seq)
    return word_arr
from xml.dom import minidom

from pycorrector.tokenizer import segment


def parse_xml_file(path):
    print('Parse data from %s' % path)
    id_lst, word_lst, label_lst = [], [], []
    with open(path, 'r', encoding='utf-8') as f:
        dom_tree = minidom.parse(f)
    docs = dom_tree.documentElement.getElementsByTagName('DOC')
    for doc in docs:
        # Input the text
        text = doc.getElementsByTagName('TEXT')[0]. \
            childNodes[0].data.strip()
        text_id = doc.getElementsByTagName('TEXT')[0].getAttribute('id')
        errors = doc.getElementsByTagName('ERROR')
        # Locate the error position and error type
        locate_dict = {}
        for error in errors:
            start_off = error.getAttribute('start_off')
            end_off = error.getAttribute('end_off')
            error_type = error.getAttribute('type')
            # Alternatively use BIO tags: 'B-' + error_type for the first
            # character and 'I-' + error_type for the rest
            for i in range(int(start_off) - 1, int(end_off)):
                locate_dict[i] = error_type
        # Segment with POS tags
        word_seq, pos_seq = segment(text, cut_type='char', pos=True)
        word_arr, label_arr = [], []
        for i in range(len(word_seq)):
            word_arr.append(word_seq[i])
            if i in locate_dict:
                # Fill with the error type
                label_arr.append(locate_dict[i])
            else:
                # Fill with the POS tag
                label_arr.append(pos_seq[i])
        id_lst.append(text_id)
        word_lst.append(word_arr)
        label_lst.append(label_arr)
    return id_lst, word_lst, label_lst
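# A quick sanity check of the XML parser; the file name is a placeholder for a
# CGED-style XML file, not a path from the project:
if __name__ == '__main__':
    ids, words, labels = parse_xml_file('sample.xml')
    for text_id, word_arr, label_arr in zip(ids[:3], words[:3], labels[:3]):
        print(text_id)
        print(list(zip(word_arr, label_arr)))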
from pycorrector.detector import Detector  # assumed import path
from pycorrector.tokenizer import segment

error_sentences = [
    '我对于宠物出租得事非常认同,因为其实很多人喜欢宠物',  # 出租得事 => 出租的事
    '有了宠物出租地方另一方面还可以题高人类对动物的了解,因为那些专业人氏可以指导我们对于动物的习惯。',
    # 题高 => 提高, 专业人氏 => 专业人士 (right); [['宠', '重', 2, 3], ['方面', '方便', 10, 12]] (error)
    '三个凑皮匠胜过一个诸葛亮也有道理。',  # [['三个凑皮匠', '三个臭皮匠', 0, 5]]
    '还有广告业是只要桌子前面坐者工作未必产生出来好的成果。',
]

d = Detector(enable_rnnlm=False)
for i in error_sentences:
    print(i, d.detect(i))

sent1 = '少先队员应该为老人让座'
sent_seg = segment(sent1)
ppl = d.ppl_score(sent_seg)
print(sent1, 'ppl_score:', ppl)

sent2 = '少先队员因该为老人让坐'
sent_seg = segment(sent2)
ppl = d.ppl_score(sent_seg)
print(sent2, 'ppl_score:', ppl)

print(sent1, d.detect(sent1))
print(sent2, d.detect(sent2))

freq = d.word_frequency('龟龙麟凤')
print('freq:', freq)
# -*- coding: utf-8 -*-
# Author: XuMing <*****@*****.**>
# Brief:
import pycorrector
from pycorrector.tokenizer import segment

c = pycorrector.get_same_pinyin('长')
print('get_same_pinyin:', c)

c = pycorrector.get_same_stroke('长')
print('get_same_stroke:', c)

freq = pycorrector.word_frequency('龟龙麟凤')
print('freq:', freq)

sent = '少先队员应该为老人让座'
sent_seg = segment(sent)
ppl = pycorrector.ppl_score(sent_seg)
print('ppl_score:', ppl)

sent = '少先队员因该为老人让坐'
sent_seg = segment(sent)
ppl = pycorrector.ppl_score(sent_seg)
print('ppl_score:', ppl)

print(pycorrector.detect(sent))

corrected_sent, detail = pycorrector.correct('少先队员因该为老人让坐')
print(corrected_sent, detail)
# -*- coding: utf-8 -*-
# Author: XuMing <*****@*****.**>
# Brief:
from pypinyin import lazy_pinyin

from pycorrector.utils.text_utils import traditional2simplified, simplified2traditional
from pycorrector.utils.text_utils import get_homophones_by_char, get_homophones_by_pinyin
from pycorrector.tokenizer import segment

traditional_sentence = '憂郁的臺灣烏龜'
simplified_sentence = traditional2simplified(traditional_sentence)
print(simplified_sentence)

simplified_sentence = '忧郁的台湾乌龟'
traditional_sentence = simplified2traditional(simplified_sentence)
print(traditional_sentence)

print(lazy_pinyin('中心'))  # without tone marks
print(segment('小姑娘蹦蹦跳跳的去了她外公家'))

pron = get_homophones_by_char('长')
print('get_homophones_by_char:', pron)

pron = get_homophones_by_pinyin('zha1ng')
print('get_homophones_by_pinyin:', pron)