def gen_tokens(
    text: Union[str, List[str]],
    label: str = "",
    lang: Optional[str] = None,
    gen_para: Union[int, bool] = False,
    gen_sent: Union[int, bool] = True,
    gen_phrase: Union[int, bool] = False,
) -> Iterator[Tuple[str, str, int, str]]:
    # fmt: on
    """Generate tokens from text or a list of texts."""
    if isinstance(text, str):
        text = [elm.strip() for elm in text.splitlines() if elm.strip()]

    if lang is None:
        lang = Detector(" ".join(text)).language.code
        logger.debug("Detected lang: %s", lang)

    if gen_para:
        for idx, para in enumerate(text):
            yield para, label, idx + 1, 'para'

    if gen_sent:
        for idx, para in enumerate(text):
            for sent in _sent_tokenizer(para, lang):
                yield sent, label, idx + 1, 'sent'

    if gen_phrase:
        raise Exception(
            "need to install phrase_tokenizer, "
            "which is dependent on benepar"
        )
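# Hedged usage sketch (added for illustration, not part of the original
# source): a minimal demo of gen_tokens; lang is passed explicitly so the
# polyglot Detector is never invoked. Each yielded tuple is
# (token, label, para_index, kind).
def _demo_gen_tokens() -> None:
    sample = "One sentence here. And another.\nA second paragraph."
    for tok, label, idx, kind in gen_tokens(sample, label="demo", lang="en", gen_para=True):
        # e.g. ('One sentence here. And another.', 'demo', 1, 'para')
        #      ('One sentence here.', 'demo', 1, 'sent') ...
        print(kind, idx, label, tok)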
def _sent_tokenizer(
    text: Union[str, List[str]],
    lang: Optional[str] = None,
    debug: bool = False,  # when True, disable joblib.Memory.cache
) -> List[str]:
    # fmt: on
    """Tokenize str | List[str] to sentences."""
    if isinstance(text, str):
        text = [text]

    if lang is None:
        try:
            lang = Detector(" ".join(text)).language.code
        except Exception as exc:
            logger.warning("polyglot.text.Detector exc: %s, setting to 'en'", exc)
            logger.info(" Try to pass lang (e.g. lang='en') to sent_tokenizer")
            lang = 'en'

    res = []
    for elm in text:
        res.extend(seg_text(elm, lang=lang))

    return res
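# Illustrative sketch (not from the original source): passing lang explicitly
# skips the polyglot Detector call and its possible failure on very short
# input, assuming 'en' is covered by sentence_splitter (it is in LANG_S).
def _demo_sent_tokenizer() -> None:
    sents = _sent_tokenizer(["First line. Second one.", "Third."], lang="en")
    print(sents)  # ['First line.', 'Second one.', 'Third.']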
def seg_text(text: str, lang: Optional[str] = None) -> List[str]:
    """Split text into sentences.

    Use sentence_splitter if the language is supported,
    else fall back to polyglot.text.Text.
    """
    if lang is None:
        lang = Detector(text).language.code
    if lang in LANG_S:
        return split_text_into_sentences(text, lang)
    return [elm.string for elm in Text(text, lang).sentences]
def seg_text(
    text: str,
    lang: Optional[str] = None,
    qmode: bool = False,
    maxlines: int = 1000,
) -> List[str]:
    # fmt: on
    """
    Split text into sentences.

    Use sentence_splitter if the language is supported,
    else fall back to polyglot.text.Text.sentences.

    qmode: skip split_text_into_sentences if True, default False;
        vectors for all books are based on qmode=False.
        qmode=True is for quick tests only.
    maxlines: threshold (default 1000) for turning on the tqdm
        progress bar; set to <1 or a very large number to turn it off.
    """
    if lang is None:
        try:
            lang = Detector(text).language.code
        except Exception as exc:
            logger.warning("polyglot.text.Detector exc: %s, setting to 'en'", exc)
            lang = "en"

    if not qmode and lang in LANG_S:
        res = []
        lines = text.splitlines()
        # if maxlines > 1 and len(lines) > maxlines:
        if len(lines) > maxlines > 1:
            for para in tqdm(lines):
                if para.strip():
                    res.extend(split_text_into_sentences(para, lang))
        else:
            for para in lines:
                if para.strip():
                    res.extend(split_text_into_sentences(para, lang))
        return res
        # return split_text_into_sentences(text, lang)

    return [elm.string for elm in Text(text, lang).sentences]
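# Usage sketch (illustrative, not from the original source): the tqdm branch
# kicks in once the line count exceeds maxlines, which can be forced on a
# small input by lowering maxlines.
def _demo_seg_text() -> None:
    short = seg_text("Hello there. How are you?", lang="en")
    print(short)  # ['Hello there.', 'How are you?']
    long_ = seg_text("A line.\n" * 10, lang="en", maxlines=5)  # tqdm branch
    print(len(long_))  # 10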
def lists_to_tmx(
    srclist: List[str],
    tgtlist: List[str],
    srclang: Optional[str] = None,  # "en-US",
    tgtlang: Optional[str] = None,  # "zh-CN",
    encoding: Optional[str] = None,
    # method: str = "xml",
    xml_declaration: bool = True,
    pretty_print: bool = True,
    doctype: str = '<!DOCTYPE tmx SYSTEM "tmx14a.dtd">',
) -> str:
    # fmt: on
    """
    Convert two aligned lists to a TMX string.

    lists_to_tmx(srclist, tgtlist, srclang='en-US', tgtlang='zh-CN',
        encoding=None, method="xml", xml_declaration=True,
        pretty_print=False, doctype='<!DOCTYPE tmx SYSTEM "tmx14a.dtd">')

    Returns a str (the decoded output of et.tostring), cf.
    et.tostring(element_or_tree, encoding=None, method="xml",
        xml_declaration=None, pretty_print=False, with_tail=True,
        standalone=None, doctype=None, exclusive=False,
        with_comments=True, inclusive_ns_prefixes=None)

    Write out with:
        with open('test2tu.tmx', 'w') as fh:
            fh.write(tmx)
    """
    if len(srclist) != len(tgtlist):
        logger.warning(" len(srclist) != len(tgtlist), we proceed anyway...")
        # raise Exception(" len(srclist) != len(tgtlist) ")

    if srclang is None:
        lc1 = Detector(" ".join(srclist)[:5000], quiet=True).language.code
        srclang = langcode_to_tmxcode(lc1)
    if tgtlang is None:
        lc2 = Detector(" ".join(tgtlist)[:5000], quiet=True).language.code
        tgtlang = langcode_to_tmxcode(lc2)
    if encoding is None:
        encoding = "utf-8"

    root = et.Element("tmx", attrib={"version": "1.4"})  # type: ignore

    # generate header
    et.SubElement(  # type: ignore
        root, "header", attrib={"adminlang": srclang, "srclang": srclang})
    body = et.SubElement(root, "body")  # type: ignore

    # tuv_en = et.SubElement(tu, "tuv", xml:lang="en")  # 'xml:lang' gets error
    # tuv_zh = et.SubElement(tu, "tuv", xml:lang="zh")
    len0 = min(len(srclist), len(tgtlist))
    # for itrange in tqdm.trange(len0):
    for itrange in range(len0):
        tu = et.SubElement(body, "tu")  # type: ignore
        tuv_en = et.SubElement(tu, "tuv", attrib={"lang": srclang})  # type: ignore
        tuv_zh = et.SubElement(tu, "tuv", attrib={"lang": tgtlang})  # type: ignore

        # attach a seg to each tuv
        et.SubElement(tuv_en, "seg").text = srclist[itrange]  # type: ignore
        et.SubElement(tuv_zh, "seg").text = tgtlist[itrange]  # type: ignore

    tree = et.ElementTree(root)  # type: ignore
    treestr = et.tostring(  # type: ignore
        tree,
        encoding=encoding,
        pretty_print=pretty_print,
        xml_declaration=xml_declaration,
        doctype=doctype,
    )

    return treestr.decode()
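# Usage sketch (hedged: assumes lxml is installed; srclang/tgtlang are passed
# explicitly here so neither the polyglot Detector nor langcode_to_tmxcode is
# exercised). Not part of the original source.
def _demo_lists_to_tmx() -> None:
    tmx = lists_to_tmx(
        ["Hello world."],
        ["Bonjour le monde."],
        srclang="en-US",
        tgtlang="fr-FR",
    )
    with open("demo.tmx", "w", encoding="utf-8") as fh:
        fh.write(tmx)  # lists_to_tmx already returns a decoded str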
def main():
    """main."""
    front_cover()

    p_list = None
    p_list1 = None
    file1_flag, file2_flag = False, False

    # fetch file contents
    if src_fileio is not None:
        file1_flag = True
        src_file = src_fileio.getvalue()
        if isinstance(src_file, bytes):
            src_file = src_file.decode("utf8")
        lang1 = Detector(src_file).language.code
        src_text = split_text(src_file)

        # s_or_d = single_or_dual(src_file)
        # st.info(["1st file: single or dual lang? ", str(s_or_d), len(s_or_d)])
        # if len(s_or_d) < 2:  # single-lang file

        tgt_fileio = st.sidebar.file_uploader(
            "Choose another file", type=["txt"], key="tgt_text")

        if tgt_fileio is not None:
            file2_flag = True
            tgt_file = tgt_fileio.getvalue()
            if isinstance(tgt_file, bytes):
                tgt_file = tgt_file.decode("utf8")
            lang2 = Detector(tgt_file).language.code
            tgt_text = split_text(tgt_file)

            len_ = len(src_text) + len(tgt_text)
            if len_ > 300:
                # st.markdown("Sorry, this will likely hog the petite server to its virtual death (total number of paragraphs limited to **300**). We'll trim both texts to 50 and have a test run.")
                st.warning(" This is likely to take a long time...")
                # src_text = src_text[:150]
                # tgt_text = tgt_text[:150]
                len_ = len(src_text) + len(tgt_text)

            logger.info("total paras: %s", len_)

            _ = 3
            st.subheader(f"first {_} paras in file1 and file2, respectively")
            st.write(f" {src_text[:_]} ")
            st.write(f" {tgt_text[:_]} ")
            st.subheader(f"last {_} paras in file1 and file2, respectively")
            st.write(f" {src_text[-_:]} ")
            st.write(f" {tgt_text[-_:]} ")

            len1 = len(src_text)
            len2 = len(tgt_text)
            # batches of 32 paras, ~13 s per batch
            est_time = len1 // 32 + bool(len1 % 32)
            est_time += len2 // 32 + bool(len2 % 32)
            est_time *= 13 / 60

            st.info([
                f" file1: {len(src_text)} paras",
                f" file2: {len(tgt_text)} paras",
            ])

    _ = '''
    else:  # dual-lang file
        st.write("""
        It looks like a dual-lang file. We'll implement something to handle this, soon.
        """)
        # assert 0, ""
    # '''

    # align paras
    # if st.sidebar.checkbox(
    #     "tick to proceed with para alignment",
    #     value=False,
    #     key="para-align",
    # ):
    if op_selectbox in ["Para/Sent Align"]:  # ("Para/Sent Align", "Simple Sent Align")
        if not (file1_flag and file2_flag):
            # st.info("Pick two files first")
            instruction1()
        else:
            # st.write(f" Processing... first run can take a while, ~{2 * len_ // 100 + 1}-{3 * len_ // 100 + 1} min. Please wait...")
            st.write(
                f" Processing... first run can take a while, ~{est_time:.1f} min. Please wait..."
            )
            try:
                cos_mat = np.asarray(bee_corr(src_text, tgt_text, url=model_url))
            except Exception as exc:
                st.write("exc: %s" % exc)
                st.stop()
                raise SystemExit(1) from exc

            st.markdown("### cosine similarity matrix")
            st.dataframe(pd.DataFrame(cos_mat).style.highlight_max(axis=0))

            fig, ax = plt.subplots()
            # fig = plt.figure()  # (figsize=(10, 7))
            # cmap = sns.diverging_palette(20, 220, as_cmap=True)
            sns.heatmap(cos_mat, vmin=0, vmax=1)
            # plt.xlabel("file2")
            # plt.ylabel("file1")
            plt.xlabel(f"{tgt_fileio.name}")
            plt.ylabel(f"{src_fileio.name}")
            plt.title("cosine similarity heatmap")
            st.pyplot(fig)
            # plt.close()

            # st.markdown("## fine-tune alignment")
            st.header("fine-tune alignment")
            thr = st.slider(
                "threshold (0...1) -- drag or click to adjust threshold value",
                min_value=0.,
                max_value=1.,
                value=0.5,
                step=0.05,
                key="thr-slider",
            )

            p_list = bee_aligner(src_text, tgt_text, cos_mat=cos_mat, thr=thr)

            if p_list is not None:
                df = pd.DataFrame(p_list)
                st.markdown("#### para alignment at a glance")
                st.info("(hover over a cell to display full text)")
                st.dataframe(df)
                # st.dataframe(df.style.highlight_max(axis=0))

                st.subheader("para alignment in detail")
                if st.checkbox(
                        f"tick to show detailed alignment (threshold={thr})",
                        value=0,
                        key="para_df"):
                    st.table(df)

                s_df = color_table_applymap(df)

                st.sidebar.subheader("aligned paras xlsx file for downloading")
                st.sidebar.markdown(get_table_download_link(s_df),
                                    unsafe_allow_html=True)

            # para-sent
            if st.sidebar.checkbox(
                    " align sent within paras already aligned",
                    value=0,
                    key="para-sent-align",
            ):
                if p_list is None:
                    st.info(" align paras first")
                else:
                    st.subheader(" Aligning sents ")
                    # st.info(" TODO ")

                    # ====
                    then = default_timer()
                    s_list = plist_to_slist(p_list)
                    thr_s = [""] * len(s_list)
                    c_mat = [""] * len(s_list)

                    # final_aligned = [[], [], []]
                    final_aligned = []
                    # for elm in range(2):
                    for elm in range(len(c_mat)):
                        _ = """
                        thr_s[elm] = st.slider(
                            "",
                            min_value=0.,
                            max_value=1.,
                            value=0.5,
                            step=0.05,
                            key=f"thr-slider-{elm}",
                        )
                        st.write(f"{elm+1}/{len(c_mat)}", thr_s[elm])
                        # """
                        st.write(f"{elm + 1}/{len(c_mat)}")
                        thr_s[elm] = 0.5

                        # c_mat[elm] = bee_corr(s_list[elm][0], s_list[elm][1])
                        c_mat[elm] = bee_corr(
                            s_list[elm][0],
                            s_list[elm][1],
                            # src_lang=lang1,
                            # tgt_lang=lang2,
                            url=model_url)
                        s_list_aligned = bee_aligner(
                            s_list[elm][0],
                            s_list[elm][1],
                            cos_mat=c_mat[elm],
                            thr=thr_s[elm],
                        )
                        st.table(pd.DataFrame(s_list_aligned))

                        # final_aligned[0].extend([elm[0] for elm in s_list_aligned])
                        # final_aligned[1].extend([elm[1] for elm in s_list_aligned])
                        # final_aligned[2].extend([elm[2] for elm in s_list_aligned])
                        # [*zip(final_aligned[0], final_aligned[1], final_aligned[2])]
                        final_aligned.extend(s_list_aligned)

                    logger.debug("total sents: %s", len(final_aligned))
                    st.write(
                        f"Time spent for sent alignment: {(default_timer() - then) / 60:.2f} min"
                    )
                    st.write(f"Total sent pairs: {len(final_aligned)}")

                    st.subheader("aligned sentences in one batch")
                    df_sents = pd.DataFrame(final_aligned)
                    # s_df_sents = color_table_applymap(df_sents)
                    s_df_sents = df_sents
                    if st.checkbox("Tick to show", value=0, key="final align sentences"):
                        # st.table(final_aligned)
                        st.table(s_df_sents)

                    logger.debug("aligned sents ready for downloading")
                    st.sidebar.subheader("Aligned sents in xlsx for downloading")
                    st.sidebar.markdown(
                        get_table_download_link_sents(s_df_sents),
                        unsafe_allow_html=True)
                    # ====

    # align sents
    # if st.sidebar.checkbox(
    #     "tick to proceed with sent alignment (w/o para align)",
    #     value=False,
    #     key="sent-align",
    # ):
    if op_selectbox in ["Simple Sent Align"]:  # ("Para/Sent Align", "Simple Sent Align")
"Simple Sent Align") if not (file1_flag and file2_flag): # st.info("Pick two files first") instruction1() else: # st.info(" sent alignment to be implemented ") sents1 = [] for elm in src_text: if elm.strip(): sents1.extend(seg_text(elm, lang1)) st.info(sents1[:3]) sents2 = [] for elm in tgt_text: if elm.strip(): sents2.extend(seg_text(elm, lang2)) st.info(sents2[:3]) len1s = len(sents1) len2s = len(sents2) st.info([ f"file1: {len1s} sents", f"{lang1}", f"file2: {len2s} sents", f"{lang2}", ]) est_time1 = len1s // 32 + bool(len1s % 32) est_time1 += len2s // 32 + bool(len2s % 32) est_time1 *= 7 / 60 st.info( f"The first run may take a while, about {est_time1:.1f} min") try: # cos_mat = np.asarray(bee_corr(src_text, tgt_text)) cos_mat1 = np.asarray(bee_corr(sents1, sents2, url=model_url)) except Exception as exc: # st.write("exc: ", exc) logger.error("exc: %s" % exc) st.stop() raise SystemExit(1) from Exception st.markdown("### cosine similarity matrix (sents)") st.dataframe( pd.DataFrame(cos_mat1.round(2)).style.highlight_max(axis=0)) mean1 = cos_mat1.mean() var1 = cos_mat1.var() fig, ax = plt.subplots() # fig = plt.figure() # (figsize= (10,7)) # cmap = sns.diverging_palette(20, 220, as_cmap=True) sns.heatmap(cos_mat1, vmin=0, vmax=1) plt.xlabel(f"{tgt_fileio.name}") plt.ylabel(f"{src_fileio.name}") plt.title( f"mean={mean1.round(2)} var={var1.round(2)} cosine similarity (sents) heatmap" ) st.pyplot(fig) # st.markdown("## fine-tune alignment") st.header("fine-tune sent alignment") thr_i = float(round(min(mean1 + 30 * var1, 0.5), 2)) st.info(f"inital threshold={thr_i:.2f}") thr1 = st.slider( "threshold (0...1) -- drag or click to adjust threshold value", min_value=0., max_value=1., value=thr_i, step=0.05, # key="sliderkey", ) p_list1 = bee_aligner(sents1, sents2, cos_mat=cos_mat1, thr=thr1) if p_list1 is not None: df1 = pd.DataFrame(p_list1) st.markdown("#### sent alignment at a glance") st.info("(hove over a cell to disply full text)") st.dataframe(df1) st.subheader("sent alignment in detail") if st.checkbox( f"tick to show detailed sent alignment (threhold={thr1})", value=0, key="para_df"): st.table(df1) s_df1 = color_table_applymap(df1) st.sidebar.subheader("aligned sent xlsx file for downloading") st.sidebar.markdown(get_table_download_link_sents(s_df1), unsafe_allow_html=True) back_cover()
def plist_to_slist(
    p_list: List[Tuple[str, str, float]],
    lang0: str = "",
    lang1: str = "",
) -> List[List[List[str]]]:
    # fmt: on
    """Convert a para_list to a sent_list."""
    c_th = currentThread()

    # logzero.loglevel(10)
    logzero.loglevel(20)

    # convert the 3rd column's nonempty str entries to float
    prob = [elm[2] if elm[2] == "" else float(elm[2]) for elm in p_list]
    logger.debug("prob: %s", prob)

    if lang0 in [""]:
        lang0 = Detector(" ".join([elm[0] for elm in p_list[:100]])).language.code
    if lang1 in [""]:
        lang1 = Detector(" ".join([elm[1] for elm in p_list[:100]])).language.code

    # group rows: each float-merit row starts a new group; following
    # empty-merit rows are merged into it
    idx_lst = [[]]
    # t_f = [*map(lambda x: not isinstance(x, float), prob[:])]; idx, elm = i, t_f[i]
    for idx, elm in enumerate(map(lambda x: not isinstance(x, float), prob[:])):
        idx_lst = (idx_lst[:-1] + [idx_lst[-1] + [idx]]) if elm else (idx_lst + [[idx]])
    logger.debug("idx_lst: %s", str(idx_lst))

    # p_list[idx_lst[3][0]: idx_lst[3][-1] + 1]
    # p_list[idx_lst[idx][0]: idx_lst[idx][-1] + 1]
    sent_lst = []
    for elm in idx_lst:
        if not elm:  # bypass a possible first empty list
            continue
        left, right = [], []
        for idx in range(elm[0], elm[-1] + 1):
            if p_list[idx][0]:
                # left0 = [sent.string for sent in Text(p_list[idx][0], lang0).sentences]
                left0 = split_text(p_list[idx][0], lang0)
                left.extend([s.strip() for s in left0 if s.strip()])
            if p_list[idx][1]:
                # right0 = [sent.string for sent in Text(p_list[idx][1], lang1).sentences]
                right0 = split_text(p_list[idx][1], lang1)
                right.extend([s.strip() for s in right0 if s.strip()])

        # supply an "" entry if nothing exists
        if not left:
            left = [""]
        if not right:
            right = [""]

        sent_lst.append([left, right])

    c_th.sent_lst = sent_lst  # type: ignore
    # SIG_ALIGNER.send("plist_to_slist", **{"sent_lst": sent_lst})
    # SIG_ALIGNER.send("plist_to_slist", sent_lst=sent_lst)

    return sent_lst
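# Illustrative sketch (hypothetical data, assuming this module's split_text
# handles these short English/French strings): how the merit column drives
# the grouping in plist_to_slist. Float merits start a new group; empty
# merits are merged into the preceding group before sentence-splitting.
def _demo_plist_to_slist() -> None:
    p_list = [
        ("One. Two.", "Un. Deux.", 0.8),  # starts group 0
        ("Three.", "", ""),               # merged into group 0
        ("Four.", "Quatre.", 0.7),        # starts group 1
    ]
    for left, right in plist_to_slist(p_list, lang0="en", lang1="fr"):
        print(left, "<->", right)
    # ['One.', 'Two.', 'Three.'] <-> ['Un.', 'Deux.']
    # ['Four.'] <-> ['Quatre.']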
def text_to_plist(
    text_dual: Union[str, List[str]],
    langs: Optional[List[str]] = None,
) -> List[Tuple[str, str, str]]:
    # fmt: on
    """Convert text_dual to a p_list, given a two-tuple langs, e.g., ['en', 'zh']."""
    c_thr = currentThread()

    if isinstance(text_dual, list):
        text_dual = "\n".join(text_dual)

    if langs is None:
        # langs = ['en', 'zh']
        langs = single_or_dual(text_dual)
    langid.set_languages(langs)
    # if polyglot's Detector yields a lang not in langs, fall back to langid.classify

    # remove "\u3000"
    paras = [
        elm.strip() for elm in text_dual.replace("\u3000", " ").splitlines()
        if elm.strip()
    ]

    if len(langs) == 1:
        _ = [*zip_longest(paras, [""], [""], fillvalue="")]
        c_thr.p_list = _
        return _

    # with timeme():  # 2094 ms
    langs_info = []
    for para in paras:
        lang = Detector(para, True).language.code
        if lang not in langs:
            lang = langid.classify(para)[0]
        langs_info.append(lang)

    if len(langs_info) < 2:
        logger.warning(
            "langs_info: %s, nothing to separate, returning the original text "
            "as the first column with two empty columns", langs_info)
        _ = [*zip_longest(paras, "", "", fillvalue="")]
        c_thr.p_list = _
        return _

    # mark positions where the detected language switches
    binary_info = [1]
    for idx, elm in enumerate(langs_info[1:], 1):
        if elm == langs_info[idx - 1]:
            binary_info.append(0)
        else:
            binary_info.append(1)

    left = []
    right = []
    l_or_r = 1
    for idx, para in enumerate(paras):
        if binary_info[idx]:  # switch
            l_or_r = (l_or_r + 1) % 2
            if l_or_r:  # right
                right.append([])
            else:
                left.append([])
        if l_or_r:  # right
            right[-1].append(para)
        else:
            left[-1].append(para)

    left = ["\n".join(elm) for elm in left]
    right = ["\n".join(elm) for elm in right]

    _ = 5
    # corr0 = bee_aligner.bee_corr.bee_corr(left[1: _ + 1], right[:_]).diagonal()  # skip possible junk at the beginning
    # corr1 = bee_aligner.bee_corr.bee_corr(left[1: _ + 1], right[1: _ + 1]).diagonal()
    corr0 = bee_corr(
        left[1:_ + 1], right[:_]).diagonal()  # skip possible junk at the beginning
    corr1 = bee_corr(left[1:_ + 1], right[1:_ + 1]).diagonal()

    # decide whether left and right are offset by one paragraph
    if np.sum(corr0) > np.sum(corr1):
        p_list = [*zip_longest(left, [''] + right, fillvalue='')]
    else:
        p_list = [*zip_longest(left, right, fillvalue='')]

    _ = p_list[:]
    p_list = []
    for para in _:
        len0, len1 = len(para[0]), len(para[1])
        if len0 > 20 * len1 or len1 > 20 * len0:
            entry = [para[0], para[1], '']
        else:
            entry = [para[0], para[1], '0.66']
        p_list.append(entry)

    logger.info(" update table via SIG_TABLE.send(df=p_list)")
    SIG_TABLE.send("text_to_plist", df=p_list)

    c_thr.p_list = p_list  # type: ignore

    return p_list
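# Minimal sketch (hypothetical input; requires langid, the polyglot models
# and a reachable embedding backend behind bee_corr) of the dual-language
# separation: paragraphs are tagged by detected language, split into
# left/right runs at each language switch, then zipped into a p_list.
def _demo_text_to_plist() -> None:
    dual = "\n".join([
        "This is English.",
        "这是中文。",
        "More English here.",
        "更多中文。",
    ])
    for row in text_to_plist(dual, langs=["en", "zh"]):
        print(row)  # [english_para, chinese_para, merit-or-'']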
def start_command(self, event=None):
    """Handle Start.

    Needs a QUEUE_PS (paras-or-sents flag), set when activating palign or salign:
        flag = QUEUE_PS.get_nowait()
        queue1_put(QUEUE_PS, flag)
    """
    try:
        flag = QUEUE_PS.get_nowait()
        queue1_put(QUEUE_PS, flag)
    except Empty:
        flag = ""

    # ######### SENTS ###########
    # 358 - 475 salign
    if flag in ["s"]:
        logger.debug("salign myprogressbar.start_command")
        logger.info("handle salign...")

        # check_thread_update
        # pbar (self) TButton1: Start, 2: Cancel, 3: Back
        self.TButton1.config(state=tk.DISABLED)
        self.TButton2.config(state=tk.NORMAL)
        self.TButton3.config(state=tk.DISABLED)

        if not QUEUE_PA.qsize():
            messagebox.showwarning(
                title="Not ready",
                message=" Paras not aligned yet, align paras first (Ctrl-P) ")
            # restore button state 1: Start, 2: Cancel, 3: Back
            self.TButton1.config(state=tk.DISABLED)
            self.TButton2.config(state=tk.DISABLED)
            self.TButton3.config(state=tk.NORMAL)
            return None

        try:
            paras1 = QUEUE_P1.get_nowait()
            logger.debug(" paras1[:3]: %s", paras1[:3])
            queue1_put(QUEUE_P1, paras1)
        except Exception as exc:
            logger.error(" QUEUE_P1.get_nowait() exc: %s", exc)
            return None
        try:
            paras2 = QUEUE_P2.get_nowait()
            queue1_put(QUEUE_P2, paras2)
        except Exception as exc:
            logger.error(" QUEUE_P2.get_nowait() exc: %s", exc)
            return None
        try:
            parasm = QUEUE_PM.get_nowait()
            queue1_put(QUEUE_PM, parasm)
        except Exception as exc:
            logger.error(" QUEUE_PM.get_nowait() exc: %s", exc)
            return None

        lang0 = Detector(" ".join(paras1)).language.code
        lang1 = Detector(" ".join(paras2)).language.code

        # plist = [*zip_longest(paras1, paras2, parasm)]
        # QUEUE_DF, model.df
        try:
            # plist = QUEUE_DF[0]
            qdf = QUEUE_DF[0]
        except IndexError:
            logger.error(" qdf = QUEUE_DF[0] IndexError")
            messagebox.showwarning(" Oh no!", "Nothing in QUEUE_DF")
            return None
        except Exception as exc:
            logger.error(" qdf = QUEUE_DF[0] exc: %s", exc)
            messagebox.showwarning(
                " Oh no!", "unable to obtain data from QUEUE_DF exc: %s" % exc)
            return None

        # slist = plist_to_slist(plist, lang0, lang1)
        # tot = len(slist) * 2
        tot = len(qdf) * 2
        if tot > 300:
            msg = f"This can take about {get_time(tot)}. Continue?"
            logger.debug(msg)
            res = messagebox.askyesnocancel("Continue?", message=msg)
            if not res:
                return

        # peek at QUEUE_PS and QUEUE_PS0; if both are 's', set split=False
        try:
            qps0 = QUEUE_PS0[0]
        except IndexError:
            qps0 = ""
        # fetch QUEUE_PS and restore
        qps = fetch_queue1(QUEUE_PS)
        logger.debug(" qps0: %s, qps: %s", qps0, qps)
        split = True
        if qps0 == qps:
            split = False

        plist = [*zip_longest(
            qdf["text1"],
            qdf["text2"],
            qdf["merit"],
        )]
        logger.debug("split: %s, plist[:5]: %s", split, plist[:5])
        thr = Thread(
            target=plist_to_flist,
            args=(plist, ),
            kwargs={
                "lang0": lang0,
                "lang1": lang1,
                "split": split
            })
        thr.stop = False  # type: ignore
        thr.start()
        self.TProgressbar1.start()  # for mode="indeterminate"
        check_thread_update(self, thr)

        signal = {
            "PAlign": False,
            "SAlign": False,
            "pbtoplevel": True,  # pbar grab_set, prevent editing Pad
        }
        logger.debug(
            """pbar send blinker.signal to aligner slot, signal: %s """, signal)
        SIG_ALIGNER.send("check_thread_update1", **signal)

        logger.debug(" pbar-s exit ")
        return None

    # ######### PARAS ###########
    logger.debug("palign myprogressbar.start_command")
    if event:
        # print(event)
        logger.debug("event: %s", event)

    # if tkMessageBox.askokcancel
    # self.top.destroy();
    # self.start_command()

    # pbar = Mypbar(top): self.top = top => top = Aligner
    # self == pbar?
    # self.top: TopLevel  # needs Aligner
    # self.top.align_command()

    # pbar (self) TButton1: Start, 2: Cancel, 3: Back
    self.TButton1.config(state=tk.DISABLED)
    self.TButton2.config(state=tk.NORMAL)
    self.TButton3.config(state=tk.DISABLED)

    _ = """
    thr = Thread(
        target=longtime_job,
        # name='job_thr',
        kwargs={"counter": 10},
    )
    # """

    # src_text  # type: Union[str, List[str]]
    # tgt_text  # type: Union[str, List[str]]
    try:
        src_text = QUEUE_T1.get_nowait()
        queue1_put(QUEUE_T1, src_text)
    except Empty:
        src_text = ""
    try:
        tgt_text = QUEUE_T2.get_nowait()
        queue1_put(QUEUE_T2, tgt_text)
    except Empty:
        tgt_text = ""

    if not (src_text and tgt_text):
        messagebox.showwarning(
            title="Not ready",
            message=" Empty src_text or tgt_text, load files first. ")
        # restore button state 1: Start, 2: Cancel, 3: Back
        self.TButton1.config(state=tk.DISABLED)
        self.TButton3.config(state=tk.NORMAL)
        self.TButton2.config(state=tk.DISABLED)
        return None

    logger.debug("src_text[:5]: %s", src_text[:5])
    logger.debug("tgt_text[:5]: %s", tgt_text[:5])

    try:
        src_lang = Detector("\n".join(src_text)).language.code
    except Exception as exc:
        # messagebox.showerror(" Oh no! ", str(exc))
        # self.TButton1.config(state=tk.DISABLED)
        # self.TButton3.config(state=tk.NORMAL)
        # self.TButton2.config(state=tk.DISABLED)
        # return None
        logger.error(
            """src_lang = Detector("\n".join(src_text)).language.code exc: %s""",
            exc)
        src_lang = "en"
    try:
        tgt_lang = Detector("\n".join(tgt_text)).language.code
    except Exception as exc:
        # messagebox.showerror(" Oh no! ", str(exc))
        # self.TButton1.config(state=tk.DISABLED)
        # self.TButton3.config(state=tk.NORMAL)
        # self.TButton2.config(state=tk.DISABLED)
        # return None
        logger.error(
            """tgt_lang = Detector("\n".join(tgt_text)).language.code exc: %s""",
            exc)
        tgt_lang = "en"

    # no need to convert to lists: already lists
    # src_text = text_to_paras(src_text)
    # tgt_text = text_to_paras(tgt_text)

    tot = len(src_text) + len(tgt_text)
    logger.debug(
        "tot: %s, len(src_text): %s, len(tgt_text): %s, src_text[:3]: %s, tgt_text[:3]: %s",
        tot, len(src_text), len(tgt_text), src_text[:3], tgt_text[:3])
    if tot > 300:
        msg = f"This can take about {get_time(tot)}. Continue?"
        logger.debug(msg)
        res = messagebox.askyesnocancel("Continue?", message=msg)
        if not res:
            self.TButton1.config(state=tk.DISABLED)
            self.TButton3.config(state=tk.NORMAL)
            self.TButton2.config(state=tk.DISABLED)
            return

    # cos_mat = bee_corr(src_text, tgt_text, src_lang, tgt_lang)
    # plist = bee_aligner(src_text, tgt_text, cos_mat)
    thr = Thread(
        target=bee_aligner,
        args=(src_text, tgt_text),
        kwargs={"thr": self.spinbox_value},
    )
    logger.debug(" self.spinbox_value: %.2f, %s", self.spinbox_value,
                 type(self.spinbox_value))
    thr.stop = False  # type: ignore
    thr.start()
    logger.debug("*job* thr_name: %s", thr.name)
    # thr.value = 0
    # check_thread(thr)  # self: pbar

    # reset pbar
    self.TProgressbar1["value"] = 0
    self.TProgressbar1.step(100)
    self.TProgressbar1.start()  # for mode="indeterminate"

    check_thread_update(self, thr)

    # thr.join()? fetch result from longtime_job
    # thr.join() would block, making the UI nonresponsive

    signal = {
        "PAlign": False,
        # "SAlign": True,
        "pbtoplevel": True,  # pbar grab_set, prevent editing Pad
    }
    logger.debug("""send blinker.signal to aligner slot, signal: %s """, signal)
    SIG_ALIGNER.send("check_thread_update", **signal)

    logger.debug(" pbar-p exit ")
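# The repeated get_nowait-then-queue1_put pairs above implement a "peek" on a
# one-slot queue. A hedged sketch of the same idea as a standalone helper
# (hypothetical, added for illustration; uses only the stdlib queue API):
def _peek_queue1(q, default=""):
    """Return the single item of a one-slot queue without consuming it."""
    try:
        item = q.get_nowait()
    except Empty:
        return default
    q.put_nowait(item)  # restore, so later readers still see the item
    return item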