def gloss_clean(path):
    '''
    Drop glossary rows whose two columns contain each other's script and
    save the remaining rows to "cleaned.xlsx".

    A row is discarded when its first column (treated as the Chinese side)
    contains any character from the Thai block U+0E00-U+0E7F, or when its
    second column (treated as the Thai side) contains any character in the
    range U+4E00-U+9FA5. Rows are kept unmodified and in their original order.

    :param path: location handed to getData to load the glossary rows
    :return: None
    '''
    thai_chars = re.compile(r"[\u0E00-\u0E7F]+")
    han_chars = re.compile(r"[\u4e00-\u9fa5]+")

    kept_rows = []
    for entry in getData(path):
        # Both cells are stringified up front so non-string cells (numbers,
        # empty cells) can be scanned like text.
        chinese_text, thai_text = str(entry[0]), str(entry[1])
        if thai_chars.search(chinese_text) or han_chars.search(thai_text):
            continue
        kept_rows.append(entry)

    saveData("cleaned.xlsx", kept_rows)
def gloss_tokenize(path):
    '''
    Segment the first column of every glossary row with jieba and save the
    result to "tokenize_cleaned.xlsx".

    Each output row is a single cell holding the row's tokens joined by "|".
    Only tokens containing at least one character in the range U+4E00-U+9FA5
    are kept, so punctuation, digits and Latin-only tokens are dropped. A row
    with no such token yields an empty cell, which keeps output rows aligned
    one-to-one with input rows.

    Only the first column is read; rows are not required to have a second
    column.

    :param path: location handed to getData to load the glossary rows
    :return: None
    '''
    # Compiled once instead of being looked up again for every token.
    han_chars = re.compile(r"[\u4e00-\u9fa5]+")

    output_list = []
    for row in getData(path):
        # cut_all=False is passed through unchanged from the original call.
        tokens = jieba.cut(str(row[0]), cut_all=False)
        han_tokens = [token for token in tokens if han_chars.search(token)]
        output_list.append(["|".join(han_tokens)])

    saveData("tokenize_cleaned.xlsx", output_list)
def gloss_search(path):
    '''
    Collect the translation pairs of every Excel file found directly inside
    <path> and save them in a single .xlsx file.

    Each file whose name ends in ".xlsx" or ".xls" is passed to
    get_source_target. The rows it returns go through gloss_filter together
    with the entries collected so far, and the rows gloss_filter reports as
    not yet seen are appended to the running result.
    NOTE(review): the exact de-duplication rule lives in gloss_filter, which
    is defined elsewhere - confirm there.

    The accumulated rows are written to "./all_xlsx_search_translatePair.xlsx"
    after every tenth file and once more after the last file. Each write
    replaces the previous one with the full result so far.

    :param path: directory to scan (not recursive); a trailing path separator
        is optional
    :return: None
    '''
    # os.path.join keeps the paths valid whether or not `path` ends with a
    # separator; endswith avoids matching an extension-less file that is
    # literally named "xlsx" or "xls".
    filepath_list = [
        os.path.join(path, filename)
        for filename in os.listdir(path)
        if filename.endswith(('.xlsx', '.xls'))
    ]
    filepath_list_n = len(filepath_list)

    gloss_data = []
    saved_CN = []
    for file_index, filepath in enumerate(filepath_list):
        # Get CN/TH column by auto-search function "get_source_target"
        cur_gloss_data = get_source_target(filepath)

        # Keep only rows not collected before and carry the updated record
        # of collected entries into the next iteration.
        unrepeat_row_list, saved_CN = gloss_filter(saved_CN, cur_gloss_data)
        print("--%s unrepeat: %d" % (str(filepath), len(unrepeat_row_list)))
        gloss_data += unrepeat_row_list

        # Save every 10 files, and always after the final file.
        is_checkpoint = file_index != 0 and file_index % 10 == 0
        is_last_file = file_index == filepath_list_n - 1
        if is_checkpoint or is_last_file:
            print("-----(%d/%d)Save temp file-----" %
                  (file_index, filepath_list_n))
            saveData('./all_xlsx_search_translatePair.xlsx', gloss_data)
def export_to_xlsx(self, xlsx_name="corpus_%s.xlsx"):
    '''
    Write the source/translation corpus pairs to an .xlsx file.

    Entries of self.source_corpus_list and self.trans_corpus_list are paired
    by position into two-cell rows; pairing stops at the end of the shorter
    list.

    :param xlsx_name: %-style template for the output file name, filled in
        with self.corpus_name
    :return: None
    '''
    paired_rows = list(
        map(list, zip(self.source_corpus_list, self.trans_corpus_list)))
    target_file = xlsx_name % self.corpus_name
    saveData(target_file, paired_rows)
def export_to_xlsx(self):
    '''
    Write the source/translation gloss pairs to "<gloss_name>.xlsx".

    Entries of self.source_gloss_list and self.trans_gloss_list are paired by
    position into two-cell rows; pairing stops at the end of the shorter
    list.

    :return: None
    '''
    rows = []
    for source_entry, translated_entry in zip(self.source_gloss_list,
                                              self.trans_gloss_list):
        rows.append([source_entry, translated_entry])
    saveData("%s.xlsx" % self.gloss_name, rows)