def pretranslatexliff(self, input_source, template_source=None):
    """helper that converts strings to xliff source without requiring files"""
    input_file = wStringIO.StringIO(input_source)
    if template_source:
        template_file = wStringIO.StringIO(template_source)
    else:
        template_file = None
    output_file = wStringIO.StringIO()
    pretranslate.pretranslate_file(input_file, output_file, template_file)
    output_file.seek(0)
    return xliff.xlifffile(output_file.read())
def pretranslatepo(self, input_source, template_source=None):
    """helper that converts strings to po source without requiring files"""
    input_file = BytesIO(input_source.encode())
    if template_source:
        template_file = BytesIO(template_source.encode())
    else:
        template_file = None
    output_file = BytesIO()
    pretranslate.pretranslate_file(input_file, output_file, template_file)
    output_file.seek(0)
    return po.pofile(output_file.read())
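# A minimal usage sketch of the helper above (a hypothetical test, not part
# of the original suite): pretranslate an untranslated unit against a
# template that already carries a translation, then check that the
# translation was copied across. units[-1] is used to pick the real unit,
# since the parsed file may or may not start with a header unit.
def test_pretranslatepo_copies_template_translation(self):
    input_source = '#: simple.label\nmsgid "A simple string"\nmsgstr ""\n'
    template_source = '#: simple.label\nmsgid "A simple string"\nmsgstr "Una cadena simple"\n'
    newpo = self.pretranslatepo(input_source, template_source)
    assert newpo.units[-1].target == "Una cadena simple"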
import os
import re

from translate.tools.pretranslate import pretranslate_file

# Assumed module-level regexes (defined elsewhere in this module):
#   comma_replacer - matches the field-separating commas so they can be
#                    swapped for tabs while the entries are manipulated
#   isJp           - matches Japanese characters


def merge_translation(old_path, new_path, output_path, tmp_path):
    tmp_old_path = os.path.join(tmp_path, 'old.csv')
    tmp_new_path = os.path.join(tmp_path, 'new.csv')
    lines = ''
    if os.path.isfile(old_path):
        # Normalise the columns and quoting of the old file so entries can be
        # matched exactly, then write both files out as temporary comparison
        # files.
        with open(old_path, "r", encoding="utf_8_sig", newline="\n") as f_csv:
            old_lines = f_csv.read()
        old_lines = comma_replacer.sub(r'\t', old_lines)
        old_lines = re.sub(r'(^[^"\r\n])', r'"\1', old_lines, flags=re.MULTILINE)
        old_lines = re.sub(r'([^"\r\n]$)', r'\1"', old_lines, flags=re.MULTILINE)
        old_lines = re.sub(r'(["]{0,1})\t(["]{0,1})', r'"\t"', old_lines, flags=re.MULTILINE)
        old_lines = re.sub(r'^(([^\t\r\n]+\t?){2})$', r'\1\t""', old_lines, flags=re.MULTILINE)
        old_lines = re.sub(r'\t', r',', old_lines)

        # Keep only the first three columns of the new file.
        with open(new_path, "r", encoding="utf_8_sig", newline="\n") as f_csv:
            new_lines = f_csv.read()
        new_lines = comma_replacer.sub(r'\t', new_lines)
        new_lines = re.sub(r'^(([^\t\r\n]+\t?){0,3})\t[^\r\n]+', r'\1', new_lines, flags=re.MULTILINE)
        new_lines = re.sub(r'\t', r',', new_lines)

        with open(tmp_old_path, "w+", encoding="utf_8_sig", newline="\n") as f_csv:
            f_csv.write(old_lines)
        with open(tmp_new_path, "w+", encoding="utf_8_sig", newline="\n") as f_csv:
            f_csv.write(new_lines)

        os.makedirs(os.path.split(output_path)[0], exist_ok=True)

        # Exact-match pass
        with open(tmp_new_path, 'rb') as f_input_csv, \
                open(output_path, 'wb+') as f_output_csv, \
                open(tmp_old_path, 'rb') as f_tm:
            pretranslate_file(f_input_csv, f_output_csv, f_tm, fuzzymatching=False)
        with open(output_path, "r", encoding="utf_8_sig") as f:
            no_fuzzy_lines = f.read()

        # Strip tags from the source text before the fuzzy-match pass.
        new_lines = comma_replacer.sub(r'\t', new_lines)
        notags_lines = []
        for line in new_lines.splitlines(False):
            if line.count('"') % 2 != 0:
                # Unbalanced quotes mean a corrupted entry: keep it unchanged.
                notags_lines.append(line)
                continue
            s = line.split('\t')
            if len(s) >= 2:
                s[1] = re.sub(r'(<[^>]+>)|(\{[^\}]+\})', '', s[1])
            notags_lines.append(','.join(s))
        with open(tmp_new_path, "w+", encoding="utf_8_sig", newline="\n") as f_csv:
            f_csv.write('\n'.join(notags_lines))

        old_lines = comma_replacer.sub(r'\t', old_lines)
        notags_lines = []
        for line in old_lines.splitlines(False):
            if line.count('"') % 2 != 0:
                # Unbalanced quotes mean a corrupted entry: keep it unchanged.
                notags_lines.append(line)
                continue
            s = line.split('\t')
            if len(s) >= 2:
                s[1] = re.sub(r'(<[^>]+>)|(\{[^\}]+\})', '', s[1])
            notags_lines.append(','.join(s))
        with open(tmp_old_path, "w+", encoding="utf_8_sig", newline="\n") as f_csv:
            f_csv.write('\n'.join(notags_lines))

        # Fuzzy-match pass
        with open(tmp_new_path, 'rb') as f_input_csv, \
                open(output_path, 'wb+') as f_output_csv, \
                open(tmp_old_path, 'rb') as f_tm:
            pretranslate_file(f_input_csv, f_output_csv, f_tm,
                              min_similarity=75, fuzzymatching=True)
        with open(output_path, "r", encoding="utf_8_sig") as f:
            fuzzy_lines = f.read()

        # Compare the exact-match and fuzzy-match results, and prefix the
        # fuzzy-matched lines with a marker keyword.
        no_fuzzy_lines = comma_replacer.sub('\t', no_fuzzy_lines)
        fuzzy_lines = comma_replacer.sub('\t', fuzzy_lines)
        no_fuzzy_lines = no_fuzzy_lines.splitlines(False)
        fuzzy_lines = fuzzy_lines.splitlines(False)
        # Drop corrupted entries.
        no_fuzzy_lines = [line for line in no_fuzzy_lines if '\t' in line]
        fuzzy_lines = [line for line in fuzzy_lines if '\t' in line]
        if len(no_fuzzy_lines) == len(fuzzy_lines):
            # Map each old translation back to its source text.
            old_dict = {}
            for line in old_lines.splitlines(False):
                s = line.split('\t')
                if len(s) > 2 and s[2] != '""':
                    old_dict[s[2]] = s[1]
            for idx, line in enumerate(no_fuzzy_lines):
                s = line.split('\t')
                nt_s = fuzzy_lines[idx].split('\t')
                if len(s) > 2 and s[2] == '""':
                    # The exact pass left this entry untranslated: adopt the
                    # fuzzy result if it contains Japanese and does not
                    # already carry a '(あいまい一致_' ("fuzzy match ...") marker.
                    if isJp.findall(nt_s[2]) and '(あいまい一致_' not in nt_s[2]:
                        isdiff_source = (re.sub(r'<[^>]+>', '', s[1])
                                         != re.sub(r'<[^>]+>', '', old_dict.get(nt_s[2], '')))
                        isdiff_tag = (''.join(re.findall(r'<[^>]+>', s[1]))
                                      != ''.join(re.findall(r'<[^>]+>', old_dict.get(nt_s[2], ''))))
                        if isdiff_tag and not isdiff_source:
                            # (あいまい一致_タグに差異) = "fuzzy match, tags differ"
                            s[2] = re.sub(r'^("?)', r'\1(あいまい一致_タグに差異)', nt_s[2])
                        elif isdiff_source and not isdiff_tag:
                            # (あいまい一致_原文に差異) = "fuzzy match, source differs"
                            s[2] = re.sub(r'^("?)', r'\1(あいまい一致_原文に差異)', nt_s[2])
                        else:
                            # (あいまい一致) = "fuzzy match"
                            s[2] = re.sub(r'^("?)', r'\1(あいまい一致)', nt_s[2])
                    else:
                        nt_s[2] = '""'
                no_fuzzy_lines[idx] = '\t'.join(s)
        # Trailing empty element so the joined text ends with a newline.
        no_fuzzy_lines.append('')
        lines = '\n'.join(no_fuzzy_lines)
        lines = lines.replace('\t', ',')
    return lines
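# A minimal usage sketch (all paths hypothetical, not from the original
# code): merge the previous translation CSV into a freshly extracted one,
# using ./tmp for the intermediate normalised files, and write the returned
# marker-annotated text over the merged output file.
if __name__ == '__main__':
    os.makedirs('tmp', exist_ok=True)
    merged = merge_translation(
        old_path='translation/old/dialog.csv',
        new_path='translation/new/dialog.csv',
        output_path='translation/merged/dialog.csv',
        tmp_path='tmp',
    )
    with open('translation/merged/dialog.csv', 'w',
              encoding='utf_8_sig', newline='\n') as f:
        f.write(merged)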