Example #1
    # \W matches any character that is not a letter, digit, or underscore
    result = re.sub(r'\W+', ' ', value).replace("_", ' ').replace(r"\u", " ")

    result = re.sub(pat1, '', result)

    if (len(value) - len(result)) <= 0.3 * len(value):
        return result
    else:
        return "_"


ctxs = mytools.load_from_txt("../filter_data/sorted_data/sorted_it.txt")

counts = {}
for ctx in tqdm(ctxs):
    html, location, text = ctx.replace("\n", "").split(" | ")

    text = replace_all_blank(text)
    word_tokens = word_tokenize(text)  # tokenize
    # filtered_sentence = [w for w in word_tokens if w not in stop_words]  # remove stopwords
    # filtered_sentence = [word.lower() for word in filtered_sentence]
    # stem_words = [ps.stem(w) for w in filtered_sentence]

    stem_words = [word.lower() for word in word_tokens]

    for w in stem_words:
        counts[w] = counts.get(w, 0) + 1
cw = sorted([(count, w) for w, count in counts.items()], reverse=True)
cw = [w[1] + " | " + str(w[0]) + "\n" for w in cw]
mytools.log_to_txt(cw, "all_it_vocab.txt")
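
# For reference, the frequency count above can be written more compactly with
# collections.Counter; this is an equivalent sketch of the same loop (ties may
# be ordered differently than with the (count, word) sort used above).
from collections import Counter

counts = Counter()
for ctx in tqdm(ctxs):
    html, location, text = ctx.replace("\n", "").split(" | ")
    counts.update(w.lower() for w in word_tokenize(replace_all_blank(text)))

# most_common() already returns (word, count) pairs sorted by descending count
cw = [w + " | " + str(c) + "\n" for w, c in counts.most_common()]
mytools.log_to_txt(cw, "all_it_vocab.txt")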
Example #2
def compute_column_score(item):
    """
    给定一行数据,返回至多1000行结果
    :param item:data中的一行
    :return: 相似度数组
    """
    zh_route = "../0_data/zhit-0825/" + item[0]
    ja_route = "../0_data/zhit-0825/" + item[2]

    # parse the line range (start:end) where each text is located
    zh_start, zh_end = int(item[1].split(":")[0]), int(item[1].split(":")[1])
    ja_start, ja_end = int(item[3].split(":")[0]), int(item[3].split(":")[1])

    # search
    searched_results = []
    zh_all = read_text_from_file(zh_route, zh_start, zh_end)
    zh_all_filter = replace_all_blank(zh_all)

    ja_all = read_text_from_file(ja_route, ja_start, ja_end)
    ja_translate = utils.querySentence(Dict, ja_all)

    score = utils.compute_item_score(ja_translate, zh_all_filter)

    if len(zh_all_filter) == 5:
        for ja_windows_lens in range(len(ja_all), 5, -1):
            for start_bias in range(ja_start, ja_end - ja_windows_lens):
                ja_tmp = ja_all[start_bias - ja_start:start_bias - ja_start +
                                ja_windows_lens]
                ja_translate = utils.querySentence(Dict, ja_tmp)

                score = utils.compute_item_score(ja_translate, zh_all_filter)

                if score > 0.5:
                    searched_results.append([
                        score, item[0], item[1], item[2],
                        str(start_bias) + ":" +
                        str(start_bias + ja_windows_lens)
                    ])
        log_info = " ".join(item) + "\n"
        mytools.log_to_txt(log_info, "file/already_finished.txt")
        return searched_results

    else:
        if score > 0.5:
            searched_results.append(
                [score, item[0], item[1], item[2], item[3]])

        for ja_windows_lens in range(len(ja_all), 5, -1):
            for start_bias in range(ja_start, ja_end - ja_windows_lens):
                ja_tmp = ja_all[start_bias - ja_start:start_bias - ja_start +
                                ja_windows_lens]
                ja_translate = utils.querySentence(Dict, ja_tmp)

                for zh_windows_lens in range(len(zh_all), 4, -1):
                    for start_bias_zh in range(zh_start,
                                               zh_end - zh_windows_lens):
                        zh_tmp = zh_all[start_bias_zh -
                                        zh_start:start_bias_zh - zh_start +
                                        zh_windows_lens]
                        zh_tmp_filter = replace_all_blank(zh_tmp)
                        if len(zh_tmp_filter) <= 4:
                            continue

                        score = utils.compute_item_score(
                            ja_translate, zh_tmp_filter)

                        if score > 0.5:
                            searched_results.append([
                                score, item[0],
                                str(start_bias_zh) + ":" +
                                str(start_bias_zh + zh_windows_lens), item[2],
                                str(start_bias) + ":" +
                                str(start_bias + ja_windows_lens)
                            ])
        # log this item as finished
        log_info = " ".join(item) + "\n"
        mytools.log_to_txt(log_info, "file/already_finished.txt")

        return searched_results
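
# compute_column_score() handles one 4-field row at a time, so it can be fanned
# out over a process pool. The driver below is only a sketch under assumptions:
# the input file name, its space-separated field layout
# (zh_file zh_start:zh_end it_file it_start:it_end) and the output path are
# hypothetical, not taken from the original code.
from multiprocessing import Pool

if __name__ == "__main__":
    rows = [line.split() for line in
            mytools.load_from_txt("file/candidate_pairs.txt")]  # hypothetical input

    with Pool(processes=8) as pool:
        for results in pool.imap_unordered(compute_column_score, rows):
            for r in results:
                mytools.log_to_txt(" | ".join(str(x) for x in r) + "\n",
                                   "file/searched_results.txt")  # hypothetical output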
Example #3
root = "../../data/processedData/zh/"
for file in tqdm(os.listdir(root)):
    ctxs = mytools.load_from_txt(root+file)

    new_ctxs = []
    for ctx in ctxs:
        # print("-----------------------------------------")
        # print(ctx)
        try:
            html, location, texts = ctx.replace("\n", "").split(" | ")
        except:
            continue

        start, end = int(location.split(":")[0]), int(location.split(":")[1])

        while len(texts) > 11:
            new_texts = texts[:11]
            new_ctxs.append(
                html + " | " + str(start) + ":" + str(start + 11) + " | " + new_texts + "\n"
            )
            # print(html + " | " + str(start) + ":" + str(start + 11) + " | " + new_texts)
            texts = texts[7:]

            start = start + 7
        new_ctxs.append(html + " | " + str(start) + ":" + str(end) + " | " + texts + "\n")
        # print(html + " | " + str(start) + ":" + str(end) + " | " + texts)
    # exit(0)
    mytools.log_to_txt(new_ctxs, root.replace("zh", "optim_zh/") + file)
    new_ctxs = []
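
# The loop above slices each text into windows of 11 characters with a stride
# of 7, so consecutive chunks overlap by 4 characters and the final, shorter
# tail keeps the original end index. A small standalone illustration of the
# same windowing (chunk() is a hypothetical helper, not part of the original):
def chunk(text, start, window=11, stride=7):
    pieces = []
    while len(text) > window:
        pieces.append((str(start) + ":" + str(start + window), text[:window]))
        text = text[stride:]
        start += stride
    pieces.append((str(start) + ":" + str(start + len(text)), text))
    return pieces

# A 20-character string yields two overlapping windows plus a shorter tail:
# [('0:11', 'abcdefghijk'), ('7:18', 'hijklmnopqr'), ('14:20', 'opqrst')]
print(chunk("abcdefghijklmnopqrst", 0))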
Example #4
    return respData["data"]["result"]["trans_result"]['dst']


def translate_init(from_lang='zh', to_lang='en'):
    gClass = get_result("ntrans.xfyun.cn",
                        from_lang=from_lang,
                        to_lang=to_lang)
    return gClass


import jieba
time.sleep(1)
import nltk
from nltk.tokenize import word_tokenize  # tokenization
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction, corpus_bleu
sf7 = SmoothingFunction().method7
import mytools
from nltk.corpus import stopwords  # stopwords
from nltk.stem import PorterStemmer  # stemming

if __name__ == '__main__':
    import mytools
    from tqdm import tqdm
    goodwords = mytools.load_from_json("good_words.json")[584:]

    for goodword in tqdm(goodwords):
        gClass = translate_init(from_lang='zh', to_lang='it')
        trans_text = translate(goodword, gClass)
        ctx = goodword + " | " + trans_text + "\n"
        mytools.log_to_txt(ctx, "goodwords_dict.txt")
Example #5
import mytools, os
from tqdm import tqdm

ctxs = mytools.load_from_txt("sorted_data/zh.txt")

for ctx in tqdm(ctxs):
    text, pos = ctx.split(" ||| ")
    html, location = pos.replace("\n", "").split(" || ")[0].split(" | ")
    ctx = html + " | " + location + " | " + text + "\n"
    mytools.log_to_txt(ctx, "sorted_data/sorted_zh.txt")
Example #6
import mytools
import os

ctxs = mytools.load_from_txt("all_it_vocab.txt")

remove_words = "0123456789,./;'[]\\-=<>?:\"{}|~!@#$%^&*()_+"
smooth = []
for ctx in ctxs:
    try:
        word, freq = ctx.replace("\n", "").split(" | ")

        flag = True
        for tmp in word:
            if tmp in remove_words:
                flag = False
                break
        if len(word) > 18:
            flag = False

        if flag:
            smooth.append(word + " | " + freq + "\n")
    except:
        print("error")
mytools.log_to_txt(smooth, "smooth_it_vocab.txt")
Example #7
saved_results = {}
all_idx = 0
for file in tqdm(os.listdir(root_path)):
    try:
        ctxs = mytools.load_from_txt(root_path + file)

        # try:
        for ctx in ctxs:
            all_idx += 1
            html, location, text = ctx.replace("\n", "").split(" | ")

            if text not in saved_results:
                saved_results[text] = [html + " | " + location]
            else:
                saved_results[text].append(html + " | " + location)
    except:
        pass

print(all_idx)
print(len(saved_results.keys()))

for k in tqdm(saved_results.keys()):
    mytools.log_to_txt(k + " ||| " + " || ".join(saved_results[k]) + "\n",
                       "saved_results/zh.txt")
"""
1202420
582393

3215128
1162976
"""
Example #8
for rootpath in rootpaths:
    for idx, item in enumerate(os.listdir(rootpath)):

        try:
            with open(rootpath + item, encoding='UTF-8') as f:
                s = f.read()
            for line in s.split("\n"):
                # split each line into candidate text pieces
                texts = utilize.filterForLine(line)
                if texts != []:
                    for text in texts:
                        if text != "":
                            # compute the start and end position of each text piece
                            start, end = utilize.calcIndexForPiece(text, s)

                            # build the item and write it to file
                            ctx = rootpath + item + " | " + str(
                                start) + ":" + str(end) + " | " + text
                            mytools.log_to_txt(
                                ctx,
                                rootpath.replace("zhit-0825", "processedData")
                                + str(int(idx / 1000)) +
                                rootpath.replace("/", "") + ".txt",
                                encoding="UTF-8")
        except:
            pass

        print(rootpath +
              " Finished:{}/{}".format(idx, len(os.listdir(rootpath))))
Example #9
import os
import mytools
import numpy as np
import random

root_path = "../../data/train/"


paths = os.listdir(root_path)

items = []
for path in paths:
    sub_path = root_path + path + "/"
    for file in os.listdir(sub_path):
        item = "_ | " + "train/" + path + "/" + file + " | " + path + "\n"
        items.append(item)

mytools.log_to_txt(items, "trainB.txt")


root_path = "../../../data/val/"

items = []
for file in os.listdir(root_path):
    item = "_ | "+"val/"+file+" | "+file[9]+"\n"
    items.append(item)

mytools.log_to_txt(items,"valB.txt")

Example #10
# text = "hello world"
# respData = gClass.call_url(text=text)
# print(respData["data"]["result"]["trans_result"]['dst'])

ctxs = mytools.load_from_txt("smooth_it_vocab.txt")

i = 0
for ctx in ctxs:
    try:
        # print(ctx.replace("\n","").split(" | ")[0])
        # ans = translate(fromLang='zh', toLang='it', q=ctx.replace("\n","").split(" | ")[0])
        ans = gClass.call_url(
            text=ctx.replace("\n", "").split(" | ")[0]
        )["data"]["result"]["trans_result"]['dst']
        # print(ans)
        if ans is None:
            ans = "unknown"
    except:
        ans = "unknown"

    tmp = ctx.replace("\n", "").split(" | ")[0] + " | " + ans + "\n"
    mytools.log_to_txt(tmp, "smooth_xunfei_dict.txt")

    i += 1

    print(i, "/", len(ctxs))
    # time.sleep(0.01)

    if i % 500 == 499:
        # re-initialize the translation client class
        gClass = WebOTS.get_result(host)
Example #11
import mytools

ctxs = mytools.load_from_txt("goodwords_dict.txt")

remove_ = "|»,# ?."

for ctx in ctxs:
    try:
        zh_, it_ = ctx.replace("\n", "").split(" | ")
    except:
        items = ctx.replace("\n", "").split(" | ")
        zh_, it_ = items[0], items[1]

    it_ = it_.lower()
    for i in remove_:
        it_ = it_.replace(i, "")
    mytools.log_to_txt(zh_ + " | " + it_ + "\n", "new_dict_lower_.txt")