from lxml import etree

from pycorrector import traditional2simplified  # package-level conversion helper


def proc_item(item):
    """
    Process one record of the training set.
    Args:
        item: XML string of a single ESSAY record
    Returns:
        list
    """
    root = etree.XML(item)
    passages = dict()
    mistakes = []
    for passage in root.xpath('/ESSAY/TEXT/PASSAGE'):
        passages[passage.get('id')] = traditional2simplified(passage.text)
    for mistake in root.xpath('/ESSAY/MISTAKE'):
        mistakes.append({'id': mistake.get('id'),
                         'location': int(mistake.get('location')) - 1,
                         'wrong': traditional2simplified(mistake.xpath('./WRONG/text()')[0].strip()),
                         'correction': traditional2simplified(mistake.xpath('./CORRECTION/text()')[0].strip())})

    rst_items = dict()

    def get_passages_by_id(pgs, _id):
        p = pgs.get(_id)
        if p:
            return p
        _id = _id[:-1] + str(int(_id[-1]) + 1)
        p = pgs.get(_id)
        if p:
            return p
        raise ValueError(f'passage not found by {_id}')

    for mistake in mistakes:
        if mistake['id'] not in rst_items.keys():
            rst_items[mistake['id']] = {'original_text': get_passages_by_id(passages, mistake['id']),
                                        'wrong_ids': [],
                                        'correct_text': get_passages_by_id(passages, mistake['id'])}

        # TODO: verify that traditional-to-simplified conversion changes neither character count nor positions
        ori_text = rst_items[mistake['id']]['original_text']
        cor_text = rst_items[mistake['id']]['correct_text']
        if len(ori_text) == len(cor_text):
            if ori_text[mistake['location']] in mistake['wrong']:
                rst_items[mistake['id']]['wrong_ids'].append(mistake['location'])
                wrong_char_idx = mistake['wrong'].index(ori_text[mistake['location']])
                start = mistake['location'] - wrong_char_idx
                end = start + len(mistake['wrong'])
                rst_items[mistake['id']][
                    'correct_text'] = f'{cor_text[:start]}{mistake["correction"]}{cor_text[end:]}'
        else:
            print(f'error line:\n{mistake["id"]}\n{ori_text}\n{cor_text}')

    rst = []
    for k in rst_items.keys():
        if len(rst_items[k]['correct_text']) == len(rst_items[k]['original_text']):
            rst.append({'id': k, **rst_items[k]})
        else:
            text = rst_items[k]['correct_text']
            rst.append({'id': k, 'correct_text': text, 'original_text': text, 'wrong_ids': []})
    return rst
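# Usage sketch (illustrative): the record below is a hypothetical SIGHAN-style
# training entry shaped to match the XPath expressions above; the id and sentence
# are made up and not taken from the real dataset.
sample = '''<ESSAY title="sample">
<TEXT><PASSAGE id="B1-0001-1">我的朋友很聪名</PASSAGE></TEXT>
<MISTAKE id="B1-0001-1" location="7"><WRONG>聪名</WRONG><CORRECTION>聪明</CORRECTION></MISTAKE>
</ESSAY>'''

for record in proc_item(sample):
    # expected output: B1-0001-1 [6] 我的朋友很聪明
    print(record['id'], record['wrong_ids'], record['correct_text'])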
import os

from pycorrector import traditional2simplified


def proc_test_set(fp):
    """
    Build the SIGHAN-15 test set.
    Args:
        fp: directory holding the SIGHAN15 test files
    Returns:
        list
    """
    inputs = dict()
    with open(os.path.join(fp, 'SIGHAN15_CSC_TestInput.txt'), 'r', encoding='utf-8') as f:
        for line in f:
            pid = line[5:14]
            text = line[16:].strip()
            inputs[pid] = text

    rst = []
    with open(os.path.join(fp, 'SIGHAN15_CSC_TestTruth.txt'), 'r', encoding='utf-8') as f:
        for line in f:
            pid = line[0:9]
            mistakes = line[11:].strip().split(', ')
            if len(mistakes) <= 1:
                text = traditional2simplified(inputs[pid])
                rst.append({
                    'id': pid,
                    'original_text': text,
                    'wrong_ids': [],
                    'correct_text': text
                })
            else:
                wrong_ids = []
                original_text = inputs[pid]
                cor_text = inputs[pid]
                for i in range(len(mistakes) // 2):
                    idx = int(mistakes[2 * i]) - 1
                    cor_char = mistakes[2 * i + 1]
                    wrong_ids.append(idx)
                    cor_text = f'{cor_text[:idx]}{cor_char}{cor_text[idx + 1:]}'
                original_text = traditional2simplified(original_text)
                cor_text = traditional2simplified(cor_text)
                if len(original_text) != len(cor_text):
                    print('error line:\n', pid)
                    print(original_text)
                    print(cor_text)
                    continue
                rst.append({
                    'id': pid,
                    'original_text': original_text,
                    'wrong_ids': wrong_ids,
                    'correct_text': cor_text
                })
    return rst
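# Usage sketch (illustrative): dump the parsed test set to JSON. The 'sighan15'
# input directory and the output filename are placeholders, not from the source.
if __name__ == '__main__':
    import json

    test_items = proc_test_set('sighan15')
    with open('sighan15_test.json', 'w', encoding='utf-8') as f:
        json.dump(test_items, f, ensure_ascii=False, indent=2)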
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description:
"""
import sys

sys.path.append("..")
import pycorrector

if __name__ == '__main__':
    traditional_sentence = '憂郁的臺灣烏龜'
    simplified_sentence = pycorrector.traditional2simplified(traditional_sentence)
    print(traditional_sentence, '=>', simplified_sentence)

    simplified_sentence = '忧郁的台湾乌龟'
    traditional_sentence = pycorrector.simplified2traditional(simplified_sentence)
    print(simplified_sentence, '=>', traditional_sentence)
import os
import re
import time

import portalocker
from pytesseract import image_to_string

# Assumed imports: correct() and the conversion helpers are taken from pycorrector,
# matching their use in the other scripts here.
from pycorrector import correct, simplified2traditional, traditional2simplified


def handler(self):
    """Main worker loop; a method of the scanner class this snippet belongs to."""
    self.initialize()  # initialization
    while True:
        retry_times = 100  # number of retries when grabbing the lock fails
        for i in range(retry_times):
            try:
                # Open the shared file to load the queued filenames.
                fh = open(f'{self.scan_path}/share_temp_file.log', 'a+', encoding='utf-8')
                portalocker.lock(fh, portalocker.LOCK_EX)  # take an exclusive lock on the file
                fh.seek(0)
                filenames = list(filter(lambda x: x != '', fh.read().split('\n')))
                fh.truncate(0)  # clear the file once its contents are loaded
                fh.flush()
                os.fsync(fh.fileno())
                portalocker.unlock(fh)  # release the lock
                fh.close()

                # Open the filename queue maintained solely by this process.
                with open(f'{self.scan_path}/temp_file_queue.log', 'a+', encoding='utf-8') as f:
                    if len(filenames) != 0:  # write only when the shared file held filenames
                        f.write('\n'.join(filenames) + '\n')  # append the newly loaded filenames
                    f.seek(0)
                    filenames = list(filter(lambda x: x != '', f.read().split('\n')))
                    if len(filenames) != 0:  # proceed only when the filename queue is non-empty
                        tra_text = ''
                        word_list, keywords_list = list(), list()
                        filename = filenames[0]  # process only the first file per pass
                        pattern = re.match(
                            r'^(.+)_(\d{4}-\d{2}-\d{2})_(\d{2}-\d{2}-\d{2})\.png$', filename)
                        username = pattern.group(1)
                        snapshot_date = pattern.group(2)
                        snapshot_time = pattern.group(3)
                        try:
                            pil_image = self.image_preprocessing(
                                f'{self.scan_path}/{snapshot_date}/{username}/{filename}')
                            rcn_result = image_to_string(
                                pil_image, lang='ase_chi_tra_3',
                                config=r'--oem 3 --psm 6')  # OCR recognition
                            sim_text = traditional2simplified(
                                rcn_result.replace(' ', '').replace('\n', ''))  # traditional -> simplified
                            corrected_sent, _ = correct(sim_text)  # fix typos
                            tra_text = simplified2traditional(corrected_sent)  # simplified -> traditional
                            word_list = self.get_wordlist(tra_text)
                            keywords_list = self.get_keywordlist(word_list)
                        except Exception as e:
                            print(f'[Recognition Fail] Message:{e}')
                            continue
                        self.save_to_json({
                            'computer_id': username,
                            'snapshot_date': snapshot_date,
                            'snapshot_time': snapshot_time,
                            'address': f'{snapshot_date}/{username}/{filename}',
                            'keywords': keywords_list,
                            'wordlist': word_list,
                            'rawtext': tra_text
                        })
                        f.seek(0)
                        f.truncate(0)  # clear the queue only after the save succeeded
                        f.write('\n'.join(filenames[1:]) + '\n')  # drop the finished first file and rewrite
                break  # this pass succeeded; restart the outer loop for the next file
            except Exception as e:
                print(f'[Get Lock Fail] To read temp_file.log (Message:{e})')
                time.sleep(1)
                continue
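# Design note (a sketch, not part of the original class): the shared log file acts
# as a cross-process handoff queue. The lock/read/clear step above can be isolated
# as below, using portalocker's low-level API; the path argument is a placeholder.
def drain_shared_file(path):
    """Atomically read and clear a line-oriented queue file."""
    with open(path, 'a+', encoding='utf-8') as fh:
        portalocker.lock(fh, portalocker.LOCK_EX)  # block until this process owns the file
        fh.seek(0)
        lines = [x for x in fh.read().split('\n') if x]
        fh.truncate(0)  # consume everything that was read
        fh.flush()
        os.fsync(fh.fileno())
        portalocker.unlock(fh)
    return lines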
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description:
"""
import pycorrector

with open('eng_chi.txt', encoding='utf-8') as f1, \
        open('a.txt', 'w', encoding='utf-8') as f2:
    for line in f1:
        line = line.strip()
        parts = line.split('\t')
        eng = parts[0]
        chi = parts[1]
        f2.write('src: ' + eng + '\n')
        f2.write('dst: ' + pycorrector.traditional2simplified(chi) + '\n')