示例#1
0
def gloss_clean(path):
    '''
    Keep only the glossary rows whose two columns do not leak into each other.

    Rows are loaded with getData(path). A row survives when its first column
    (the Chinese side) contains no character in U+0E00-U+0E7F and its second
    column (the Thai side) contains no character in U+4E00-U+9FA5. The
    surviving rows are handed to saveData under the name "cleaned.xlsx".

    :param path: passed unchanged to getData.
    :return: None
    '''
    thai_run = re.compile(r"[\u0E00-\u0E7F]+")
    chinese_run = re.compile(r"[\u4e00-\u9fa5]+")

    def columns_are_pure(entry):
        # Stringify both cells up front so a short row fails the same way
        # regardless of what the first cell contains.
        chinese_side = str(entry[0])
        thai_side = str(entry[1])
        if thai_run.search(chinese_side) is not None:
            return False
        return chinese_run.search(thai_side) is None

    all_rows = getData(path)
    kept_rows = [entry for entry in all_rows if columns_are_pure(entry)]

    saveData("cleaned.xlsx", kept_rows)
示例#2
0
def gloss_tokenize(path):
    '''
    Segment the first column of every glossary row with jieba.

    Rows are loaded with getData(path). The first cell of each row is cut
    with jieba in accurate mode (cut_all=False); only tokens containing at
    least one character in U+4E00-U+9FA5 are kept. Each row yields a
    single-cell output row holding the kept tokens joined by "|" (an empty
    string when no token qualifies). The result is handed to saveData under
    the name "tokenize_cleaned.xlsx".

    :param path: passed unchanged to getData.
    :return: None
    '''
    # Compiled once instead of being looked up for every token.
    has_chinese = re.compile(r"[\u4e00-\u9fa5]+").search

    output_list = []
    for row in getData(path):
        # Only the first column is needed; a second column is not required,
        # so single-column rows are tokenized instead of raising IndexError.
        CN_col = str(row[0])

        token_list = [
            token for token in jieba.cut(CN_col, cut_all=False)
            if has_chinese(token)
        ]
        output_list.append(["|".join(token_list)])

    saveData("tokenize_cleaned.xlsx", output_list)
示例#3
0
def gloss_search(path):
    '''
    Search all translated files in <path> and save the merged rows in one
    .xlsx file.

    Every entry of <path> whose name ends in ".xlsx" or ".xls" is read with
    get_source_target. gloss_filter is given the CN values accumulated so far
    and returns the rows to keep plus the updated accumulator. The merged
    rows are written with saveData to "./all_xlsx_search_translatePair.xlsx"
    periodically and once more after the last file, so an interrupted run
    still leaves a recent snapshot.

    :param path: directory to scan; a trailing separator is optional.
    :return: None
    '''
    filepath_list = [
        # os.path.join keeps the path valid whether or not <path> ends with
        # a separator.
        os.path.join(path, filename)
        for filename in os.listdir(path)
        if filename.endswith(('.xlsx', '.xls'))
    ]
    filepath_list_n = len(filepath_list)
    save_filename = './all_xlsx_search_translatePair.xlsx'

    gloss_data = []
    saved_CN = []
    for file_index, filepath in enumerate(filepath_list):
        # Get CN/TH column by auto-search function "get_source_target"
        cur_gloss_data = get_source_target(filepath)

        # Keep only the rows gloss_filter reports as not seen before and
        # carry its updated accumulator into the next file.
        unrepeat_row_list, saved_CN = gloss_filter(saved_CN, cur_gloss_data)
        print("--%s unrepeat: %d" % (str(filepath), len(unrepeat_row_list)))
        gloss_data += unrepeat_row_list

        # Checkpoint at indices 10, 20, ... and always after the final file.
        is_checkpoint = file_index != 0 and file_index % 10 == 0
        is_last_file = file_index == filepath_list_n - 1
        if is_checkpoint or is_last_file:
            print("-----(%d/%d)Save temp file-----" %
                  (file_index, filepath_list_n))
            saveData(save_filename, gloss_data)
示例#4
0
 def export_to_xlsx(self, xlsx_name="corpus_%s.xlsx"):
     '''
     Hand the source/translation corpus pairs to saveData.

     Each output row is a two-item list [source, translation] built by
     zipping self.source_corpus_list with self.trans_corpus_list, so the
     longer list is truncated to the length of the shorter one.

     :param xlsx_name: %-format template for the output name; it is filled
         in with self.corpus_name.
     :return: None
     '''
     paired_rows = [
         list(pair)
         for pair in zip(self.source_corpus_list, self.trans_corpus_list)
     ]
     target_name = xlsx_name % self.corpus_name
     saveData(target_name, paired_rows)
示例#5
0
 def export_to_xlsx(self):
     '''
     Hand the source/translation glossary pairs to saveData.

     Each output row is a two-item list [source, translation] built by
     zipping self.source_gloss_list with self.trans_gloss_list. The output
     name is self.gloss_name followed by ".xlsx".

     :return: None
     '''
     paired_rows = []
     for source_term, translated_term in zip(self.source_gloss_list,
                                             self.trans_gloss_list):
         paired_rows.append([source_term, translated_term])
     saveData("%s.xlsx" % self.gloss_name, paired_rows)