import re

import mytools
from tqdm import tqdm
from nltk.tokenize import word_tokenize


def replace_all_blank(value):
    # Signature inferred from the calls below; `pat1` is a regex assumed to be
    # defined elsewhere in the original script.
    # \W matches any character that is not a letter, digit, or underscore.
    result = re.sub(r'\W+', ' ', value).replace("_", ' ').replace(r"\u", " ")
    result = re.sub(pat1, '', result)
    # Keep the cleaned text only if at most 30% of the original characters were removed.
    if (len(value) - len(result)) <= 0.3 * len(value):
        return result
    else:
        return "_"


# Build a word-frequency vocabulary over the Italian corpus.
ctxs = mytools.load_from_txt("../filter_data/sorted_data/sorted_it.txt")
counts = {}
for ctx in tqdm(ctxs):
    html, location, text = ctx.replace("\n", "").split(" | ")
    text = replace_all_blank(text)
    word_tokens = word_tokenize(text)  # tokenization
    # filtered_sentence = [w for w in word_tokens if w not in stop_words]  # remove stopwords
    # filtered_sentence = [word.lower() for word in filtered_sentence]
    # stem_words = [ps.stem(w) for w in filtered_sentence]
    stem_words = [word.lower() for word in word_tokens]
    for w in stem_words:
        counts[w] = counts.get(w, 0) + 1

# Sort by descending frequency and write "word | count" lines.
cw = sorted([(count, w) for w, count in counts.items()], reverse=True)
cw = [w[1] + " | " + str(w[0]) + "\n" for w in cw]
mytools.log_to_txt(cw, "all_it_vocab.txt")
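# The scripts in this collection rely on a small project module `mytools` that
# is not included here. The sketch below is an assumption reconstructed from how
# the helpers are called (load_from_txt returns raw lines, log_to_txt appends a
# string or a list of strings, load_from_json parses a JSON file); the real
# implementation may differ.
import json


def load_from_txt(path, encoding="UTF-8"):
    # Return every line of the file, newline characters included.
    with open(path, encoding=encoding) as f:
        return f.readlines()


def log_to_txt(content, path, encoding="UTF-8"):
    # Append a single string or a list of strings to the file.
    with open(path, "a", encoding=encoding) as f:
        if isinstance(content, str):
            f.write(content)
        else:
            f.writelines(content)


def load_from_json(path, encoding="UTF-8"):
    # Parse the JSON file and return the resulting object.
    with open(path, encoding=encoding) as f:
        return json.load(f)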
# `utils`, `Dict`, `mytools`, `replace_all_blank`, and `read_text_from_file`
# are defined elsewhere in the original project.
def compute_column_score(item):
    """
    Given one row of data, return at most 1000 result rows.
    :param item: one row of the data
    :return: list of similarity results
    """
    zh_route = "../0_data/zhit-0825/" + item[0]
    ja_route = "../0_data/zhit-0825/" + item[2]
    # Character offsets of each text inside its file.
    zh_start, zh_end = int(item[1].split(":")[0]), int(item[1].split(":")[1])
    ja_start, ja_end = int(item[3].split(":")[0]), int(item[3].split(":")[1])

    # search
    searched_results = []
    zh_all = read_text_from_file(zh_route, zh_start, zh_end)
    zh_all_filter = replace_all_blank(zh_all)
    ja_all = read_text_from_file(ja_route, ja_start, ja_end)
    ja_translate = utils.querySentence(Dict, ja_all)
    score = utils.compute_item_score(ja_translate, zh_all_filter)

    if len(zh_all_filter) == 5:
        # The Chinese side is already minimal: only slide a window over the other side.
        for ja_windows_lens in range(len(ja_all), 5, -1):
            for start_bias in range(ja_start, ja_end - ja_windows_lens):
                ja_tmp = ja_all[start_bias - ja_start:start_bias - ja_start + ja_windows_lens]
                ja_translate = utils.querySentence(Dict, ja_tmp)
                score = utils.compute_item_score(ja_translate, zh_all_filter)
                if score > 0.5:
                    searched_results.append([
                        score, item[0], item[1], item[2],
                        str(start_bias) + ":" + str(start_bias + ja_windows_lens)
                    ])
        log_info = " ".join(item) + "\n"
        mytools.log_to_txt(log_info, "file/already_finished.txt")
        return searched_results
    else:
        if score > 0.5:
            searched_results.append(
                [score, item[0], item[1], item[2], item[3]])
        # Slide windows over both sides and keep every pair scoring above 0.5.
        for ja_windows_lens in range(len(ja_all), 5, -1):
            for start_bias in range(ja_start, ja_end - ja_windows_lens):
                ja_tmp = ja_all[start_bias - ja_start:start_bias - ja_start + ja_windows_lens]
                ja_translate = utils.querySentence(Dict, ja_tmp)
                for zh_windows_lens in range(len(zh_all), 4, -1):
                    for start_bias_zh in range(zh_start, zh_end - zh_windows_lens):
                        zh_tmp = zh_all[start_bias_zh - zh_start:start_bias_zh - zh_start + zh_windows_lens]
                        zh_tmp_filter = replace_all_blank(zh_tmp)
                        if len(zh_tmp_filter) <= 4:
                            continue
                        score = utils.compute_item_score(
                            ja_translate, zh_tmp_filter)
                        if score > 0.5:
                            searched_results.append([
                                score, item[0],
                                str(start_bias_zh) + ":" + str(start_bias_zh + zh_windows_lens),
                                item[2],
                                str(start_bias) + ":" + str(start_bias + ja_windows_lens)
                            ])
        # Record that this row has been processed.
        log_info = " ".join(item) + "\n"
        mytools.log_to_txt(log_info, "file/already_finished.txt")
        return searched_results
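# compute_column_score assumes a helper read_text_from_file(path, start, end)
# that is not shown here. Given that start/end are character offsets into the
# raw file (see calcIndexForPiece below) and the result is sliced by position,
# a minimal version would simply slice the file content; this is an assumption,
# not the original implementation.
def read_text_from_file(path, start, end, encoding="UTF-8"):
    # Read the whole file and return the characters in [start, end).
    with open(path, encoding=encoding) as f:
        return f.read()[start:end]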
root = "../../data/processedData/zh/" for file in tqdm(os.listdir(root)): ctxs = mytools.load_from_txt(root+file) new_ctxs = [] for ctx in ctxs: # print("-----------------------------------------") # print(ctx) try: html, location, texts = ctx.replace("\n","").split(" | ") except: continue start,end = int(location.split(":")[0]),int(location.split(":")[1]) while len(texts) > 11: new_texts = texts[:11] new_ctxs.append( html + " | " + str(start)+":"+str(start+11) + " | " +new_texts + "\n" ) # print(html + " | " + str(start)+":"+str(start+11) + " | " +new_texts) texts = texts[7:] start = start + 7 new_ctxs.append(html + " | " + str(start)+":"+str(end) + " | " + texts + "\n") # print(html + " | " + str(start)+":"+str(end) + " | " + texts) # exit(0) mytools.log_to_txt(new_ctxs, root.replace("zh","optim_zh/")+file) new_ctxs = []
return respData["data"]["result"]["trans_result"]['dst'] def translate_init(from_lang='zh', to_lang='en'): gClass = get_result("ntrans.xfyun.cn", from_lang=from_lang, to_lang=to_lang) return gClass import jieba time.sleep(1) import nltk from nltk.tokenize import word_tokenize #分词 from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction, corpus_bleu sf7 = SmoothingFunction().method7 import mytools from nltk.corpus import stopwords #停用词 from nltk.stem import PorterStemmer # 词干化 if __name__ == '__main__': import mytools from tqdm import tqdm goodwords = mytools.load_from_json("good_words.json")[584:] for goodword in tqdm(goodwords): gClass = translate_init(from_lang='zh', to_lang='it') trans_text = translate(goodword, gClass) ctx = goodword + " | " + trans_text + "\n" mytools.log_to_txt(ctx, "goodwords_dict.txt")
import mytools, os
from tqdm import tqdm

# Rewrite "text ||| html | location || ..." lines as "html | location | text",
# keeping only the first position recorded for each text.
ctxs = mytools.load_from_txt("sorted_data/zh.txt")
for ctx in tqdm(ctxs):
    text, pos = ctx.split(" ||| ")
    html, location = pos.replace("\n", "").split(" || ")[0].split(" | ")
    ctx = html + " | " + location + " | " + text + "\n"
    mytools.log_to_txt(ctx, "sorted_data/sorted_zh.txt")
import mytools
import os

# Drop vocabulary entries that contain digits or punctuation, or are longer
# than 18 characters.
ctxs = mytools.load_from_txt("all_it_vocab.txt")
remove_words = "0123456789,./;'[]\\-=<>?:\"{}|~!@#$%^&*()_+"
smooth = []
for ctx in ctxs:
    try:
        word, freq = ctx.replace("\n", "").split(" | ")
        flag = True
        for tmp in word:
            if tmp in remove_words:
                flag = False
                break
        if len(word) > 18:
            flag = False
        if flag:
            smooth.append(word + " | " + freq + "\n")
    except:
        print("error")
mytools.log_to_txt(smooth, "smooth_it_vocab.txt")
import os

import mytools
from tqdm import tqdm

# root_path is assumed to be defined earlier in the original script
# (the directory holding the processed text files).

# Group identical texts: map each text to every "html | location" it occurs at.
saved_results = {}
all_idx = 0
for file in tqdm(os.listdir(root_path)):
    try:
        ctxs = mytools.load_from_txt(root_path + file)
        for ctx in ctxs:
            all_idx += 1
            html, location, text = ctx.replace("\n", "").split(" | ")
            if text not in saved_results:
                saved_results[text] = [html + " | " + location]
            else:
                saved_results[text].append(html + " | " + location)
    except:
        pass

print(all_idx)
print(len(saved_results.keys()))

# One output line per unique text: "text ||| html | location || html | location || ..."
for k in tqdm(saved_results.keys()):
    mytools.log_to_txt(k + " ||| " + " || ".join(saved_results[k]) + "\n",
                       "saved_results/zh.txt")

"""
1202420
582393
3215128
1162976
"""
import os

import mytools
import utilize

# rootpaths is assumed to be defined earlier in the original script: a list of
# source directories under the "zhit-0825" data root.
for rootpath in rootpaths:
    for idx, item in enumerate(os.listdir(rootpath)):
        try:
            with open(rootpath + item, encoding='UTF-8') as f:
                s = f.read()
            for line in s.split("\n"):  # process the file line by line
                texts = utilize.filterForLine(line)
                if texts != []:
                    for text in texts:
                        if text != "":
                            # Compute the start and end character offsets of this text piece.
                            start, end = utilize.calcIndexForPiece(text, s)
                            # Build the item and append it to the corresponding output file.
                            ctx = rootpath + item + " | " + str(start) + ":" + str(end) + " | " + text
                            mytools.log_to_txt(
                                ctx,
                                rootpath.replace("zhit-0825", "processedData")
                                + str(int(idx / 1000)) + rootpath.replace("/", "") + ".txt",
                                encoding="UTF-8")
        except:
            pass
        print(rootpath + " Finished:{}/{}".format(idx, len(os.listdir(rootpath))))
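# The script above depends on a project module `utilize` that is not included
# here. A plausible minimal sketch of calcIndexForPiece, consistent with how its
# return value is used (character offsets of the piece inside the full file
# content), is shown below; this is an assumption, and filterForLine is
# project-specific so it is not sketched.
def calcIndexForPiece(text, s):
    # Locate `text` inside the full file string `s` and return its
    # [start, end) character offsets (start is -1 if the piece is not found).
    start = s.find(text)
    return start, start + len(text)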
import os
import mytools
import numpy as np
import random

# Build index files listing the training and validation samples as
# "_ | relative_path | label" lines.
root_path = "../../data/train/"
paths = os.listdir(root_path)
items = []
for path in paths:
    sub_path = root_path + path + "/"
    for file in os.listdir(sub_path):
        item = "_ | " + "train/" + path + "/" + file + " | " + path + "\n"
        items.append(item)
mytools.log_to_txt(items, "trainB.txt")

root_path = "../../../data/val/"
items = []
for file in os.listdir(root_path):
    # For validation files, the label is taken from the 10th character of the filename.
    item = "_ | " + "val/" + file + " | " + file[9] + "\n"
    items.append(item)
mytools.log_to_txt(items, "valB.txt")
# text = "hello world" # respData = gClass.call_url(text=text) # print(respData["data"]["result"]["trans_result"]['dst']) ctxs = mytools.load_from_txt("smooth_it_vocab.txt") i = 0 for ctx in ctxs: try: # print(ctx.replace("\n","").split(" | ")[0]) # ans = translate(fromLang='zh', toLang='it', q=ctx.replace("\n","").split(" | ")[0]) ans = gClass.call_url(text=ctx.replace("\n", "").split(" | ") [0])["data"]["result"]["trans_result"]['dst'] # print(ans) if ans == None: ans = "unknown" except: ans = "unknown" tmp = ctx.replace("\n", "").split(" | ")[0] + " | " + ans + "\n" mytools.log_to_txt(tmp, "smooth_xunfei_dict.txt") i += 1 print(i, "/", len(ctxs)) # time.sleep(0.01) if i % 500 == 499: # 初始化类 gClass = WebOTS.get_result(host)
import mytools

# Normalize the bilingual dictionary: lowercase the Italian side and strip a
# few unwanted characters.
ctxs = mytools.load_from_txt("goodwords_dict.txt")
remove_ = "|»,# ?."
for ctx in ctxs:
    try:
        zh_, it_ = ctx.replace("\n", "").split(" | ")
    except:
        # Lines containing extra " | " separators: keep only the first two fields.
        items = ctx.replace("\n", "").split(" | ")
        zh_, it_ = items[0], items[1]
    it_ = it_.lower()
    for i in remove_:
        it_ = it_.replace(i, "")
    mytools.log_to_txt(zh_ + " | " + it_ + "\n", "new_dict_lower_.txt")