Example #1
def gen_tokens(
    text: Union[str, List[str]],
    label: str = "",
    lang: Optional[str] = None,
    gen_para: Union[int, bool] = False,
    gen_sent: Union[int, bool] = True,
    gen_phrase: Union[int, bool] = False,
) -> Iterator[Tuple[str, str, int, str]]:
    # fmt: on
    """Genereate tokens from text/list of text."""
    if isinstance(text, str):
        text = [elm.strip() for elm in text.splitlines() if elm.strip()]

    if lang is None:
        lang = Detector(" ".join(text)).language.code
        logger.debug("Deteced lang: %s", lang)

    if gen_para:
        for idx, para in enumerate(text):
            yield para, label, idx + 1, 'para'
    if gen_sent:
        for idx, para in enumerate(text):
            for sent in _sent_tokenizer(para, lang):
                yield sent, label, idx + 1, 'sent'
    if gen_phrase:
        for idx, para in enumerate(text):
            for sent in _sent_tokenizer(para, lang):
                raise Exception("need to install phrase_tokenizer"
                                "which is dependant of benepar")
Example #2
def _sent_tokenizer(
        text: Union[str, List[str]],
        lang: Optional[str] = None,
        debug: bool = False,  # when True, disable joblib.Memory.cache
) -> List[str]:
    # fmt: on
    """Tokenize str|List[str] to sents."""
    if isinstance(text, str):
        text = [text]

    if lang is None:
        try:
            lang = Detector(" ".join(text)).language.code
        except Exception as exc:
            logger.warning(
                "polyglot.text.Detector exc: %s, setting to 'en'", exc
            )
            logger.info(" Try to pass lang (e.g. lang='en') to sent_tokenizer")
            lang = 'en'

    res = []
    for elm in text:
        res.extend(seg_text(elm, lang=lang))

    return res
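The try/except around Detector is the reusable part; a standalone sketch of the same fallback pattern (assuming only that polyglot is installed):

from polyglot.text import Detector

def detect_lang(text: str, default: str = "en") -> str:
    # best-effort language detection; polyglot may raise on very short
    # or ambiguous input, in which case fall back to the default
    try:
        return Detector(text).language.code
    except Exception:
        return default

print(detect_lang("How are you today, my friend?"))  # likely 'en'
print(detect_lang("12 3"))  # may raise inside polyglot -> 'en' via fallback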
Example #3
def seg_text(text: str, lang: Optional[str] = None) -> List[str]:
    """split text to sentences.

    use sentence_splitter if supported,
    else use polyglot.text.Text
    """
    if lang is None:
        lang = Detector("testt 12 3").language.code

    if lang in LANG_S:
        return split_text_into_sentences(text, lang)

    return [elm.string for elm in Text(text, lang).sentences]
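Hypothetical usage (assumes sentence_splitter and polyglot are installed, and that LANG_S contains 'en'):

print(seg_text("One sentence. Another one.", lang="en"))
# -> ['One sentence.', 'Another one.']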
Example #4
def seg_text(
        text: str,
        lang: Optional[str] = None,
        qmode: bool = False,
        maxlines: int = 1000
) -> List[str]:
    # fmt: on
    """
    Split text to sentences.

    Use sentence_splitter if supported,
    else use polyglot.text.Text.sentences

    qmode: skip split_text_into_sentences if True, default False
        vectors for all books are based on qmode=False.
        qmode=True is for quick test purpose only

    maxlines (default 1000), threhold for turn on tqdm progressbar
        set to <1 or a large number to turn it off
    """
    if lang is None:
        try:
            lang = Detector(text).language.code
        except Exception as exc:
            logger.warning(
                "polyglot.text.Detector exc: %s, setting to 'en'", exc
            )
            lang = "en"

    if not qmode and lang in LANG_S:
        res = []
        lines = text.splitlines()
        if len(lines) > maxlines > 1:
            for para in tqdm(lines):
                if para.strip():
                    res.extend(split_text_into_sentences(para, lang))
        else:
            for para in lines:
                if para.strip():
                    res.extend(split_text_into_sentences(para, lang))
        return res

    return [elm.string for elm in Text(text, lang).sentences]
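A usage sketch contrasting the two paths (hypothetical; qmode=True bypasses sentence_splitter and always uses polyglot):

text = "Para one. Two sentences here.\nPara two."
print(seg_text(text, lang="en"))               # sentence_splitter, per line
print(seg_text(text, lang="en", qmode=True))   # polyglot.text.Text.sentences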
Example #5
def lists_to_tmx(
    srclist: List[str],
    tgtlist: List[str],
    srclang: Optional[str] = None,  # "en-US",
    tgtlang: Optional[str] = None,  # "zh-CN",
    encoding: Optional[str] = None,
    # method: str = "xml",
    xml_declaration: bool = True,
    pretty_print: bool = True,
    doctype: str = '<!DOCTYPE tmx SYSTEM "tmx14a.dtd">',
) -> str:
    # fmt: on
    """
    lists_to_tmx(srclist, tgtlist, srclang='en-US',
    tgtlang='zh-CN',
    encoding=None, method="xml", xml_declaration=True,
    pretty_print=False, doctype='<!DOCTYPE tmx SYSTEM "tmx14a.dtd">')

    return: bytes

    et.tostring(tostring(element_or_tree, encoding=None, method="xml",
             xml_declaration=None, pretty_print=False, with_tail=True,
             standalone=None, doctype=None,
             exclusive=False, with_comments=True, inclusive_ns_prefixes=None)
    wite out with:
    with open('test2tu.tmx','w') as fh:
   .....:     fh.write(tmx.decode())
    """

    if len(srclist) != len(tgtlist):
        logger.warning(" len(srclist) != len(tgtlist), we proceed anyway...")
        # raise Exception(" len(srclist) != len(tgtlist) ")

    if srclang is None:
        lc1 = Detector(" ".join(srclist)[:5000], quiet=True).language.code
        srclang = langcode_to_tmxcode(lc1)
    if tgtlang is None:
        lc2 = Detector(" ".join(tgtlist)[:5000], quiet=True).language.code
        tgtlang = langcode_to_tmxcode(lc2)

    if encoding is None:
        encoding = "utf-8"

    root = et.Element("tmx", attrib={"version": "1.4"})  # type: ignore

    # header =  # gen header
    et.SubElement(root,
                  "header",
                  attrib={
                      "amdinlang": srclang,
                      "srclang": srclang
                  })  # type: ignore

    body = et.SubElement(root, "body")  # type: ignore

    # tuv_en = et.SubElement(tu, "tuv", xml:lang="en")  # 'xml:lang' gets error
    # tuv_zh = et.SubElement(tu, "tuv", xml:lang="zh")

    len0 = min(len(srclist), len(tgtlist))

    # for itrange in tqdm.trange(len0):
    for itrange in range(len0):
        tu = et.SubElement(body, "tu")  # type: ignore
        tuv_en = et.SubElement(tu, "tuv", attrib={"lang":
                                                  srclang})  # type: ignore
        tuv_zh = et.SubElement(tu, "tuv", attrib={"lang":
                                                  tgtlang})  # type: ignore
        # attach tuv to tree
        et.SubElement(tuv_en, "seg").text = srclist[itrange]  # type: ignore
        et.SubElement(tuv_zh, "seg").text = tgtlist[itrange]  # type: ignore

    tree = et.ElementTree(root)  # type: ignore
    treestr = et.tostring(  # type: ignore
        tree,
        encoding=encoding,
        pretty_print=pretty_print,
        xml_declaration=xml_declaration,
        doctype=doctype,
    )

    return treestr.decode()
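A usage sketch (hypothetical) that builds a two-TU TMX document and writes it out; the return value is already a decoded str:

src = ["Hello.", "How are you?"]
tgt = ["你好。", "你好吗？"]
tmx = lists_to_tmx(src, tgt, srclang="en-US", tgtlang="zh-CN")
with open("test2tu.tmx", "w", encoding="utf-8") as fh:
    fh.write(tmx)
# tmx begins roughly with:
# <?xml version='1.0' encoding='utf-8'?>
# <!DOCTYPE tmx SYSTEM "tmx14a.dtd">
# <tmx version="1.4">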
Example #6
def main():
    """ main.
    """

    front_cover()

    p_list = None
    p_list1 = None
    file1_flag, file2_flag = False, False

    # fetch file contents
    if src_fileio is not None:
        file1_flag = True
        src_file = src_fileio.getvalue()
        if isinstance(src_file, bytes):
            src_file = src_file.decode("utf8")

        lang1 = Detector(src_file).language.code

        src_text = split_text(src_file)

        # s_or_d = single_or_dual(src_file)
        # st.info(["1st file: single or dual lang? ", str(s_or_d), len(s_or_d)])

        # if len(s_or_d) < 2:  # single dual-lang file

        tgt_fileio = st.sidebar.file_uploader("Choose another file",
                                              type=[
                                                  'txt',
                                              ],
                                              key="tgt_text")

        if tgt_fileio is not None:
            file2_flag = True
            tgt_file = tgt_fileio.getvalue()
            if isinstance(tgt_file, bytes):
                tgt_file = tgt_file.decode("utf8")

            lang2 = Detector(tgt_file).language.code

            tgt_text = split_text(tgt_file)

            len_ = len(src_text) + len(tgt_text)
            if len_ > 300:
                # st.markdown("Sorry, this will likely hog the petite server to it's virtual death (total number paragraphs limited to **300** ). We'll trim both texts to 50 and have a testrun. ")
                st.warning(" This is likely to take a long time...")
                # src_text = src_text[:150]
                # tgt_text = tgt_text[:150]
                len_ = len(src_text) + len(tgt_text)

            logger.info("total paras: %s", len_)

            _ = 3
            st.subheader(f"first {_} paras in file1 and file 2, respectively")
            st.write(f" {src_text[:_]} ")
            st.write(f" {tgt_text[:_]} ")
            st.subheader(f"last {_} paras in file1 and file 2, respectively")
            st.write(f" {src_text[-_:]} ")
            st.write(f" {tgt_text[-_:]} ")

            len1 = len(src_text)
            len2 = len(tgt_text)
            est_time = len1 // 32 + bool(len1 % 32)
            est_time += len2 // 32 + bool(len2 % 32)
            est_time *= 13 / 60
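            # worked example of the estimate above: 100 paras per file ->
            # ceil(100/32) = 4 batches each, 8 batches total, at ~13 s per
            # batch (an empirical constant) -> 8 * 13 / 60 ≈ 1.7 min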

            st.info([
                f" file1: {len(src_text)} paras",
                f" file2: {len(tgt_text)} paras",
            ])

        _ = '''
        else:  # dual-lang file
            st.write("""
        It looks like a dual-lang file.
        We'll implement something to handle this, soon.
            """)
            # assert 0, ""
        # '''

    # align paras
    # if st.sidebar.checkbox(
    # "tick to proceed with para alignment",
    # value=False,
    # key="para-align",
    # ):
    if op_selectbox in ["Para/Sent Align"
                        ]:  # ("Para/Sent Align", "Simple Sent Align")
        if not (file1_flag and file2_flag):
            # st.info("Pick two files first")
            instruction1()
        else:
            # st.write(f" Processing... first run can take a while, ~{2 * len_ // 100 + 1}-{3 * len_ // 100 + 1}  min. Please wait...")
            st.write(
                f" Processing... first run can take a while, ~{est_time:.1f}  min. Please wait..."
            )

            try:
                cos_mat = np.asarray(
                    bee_corr(src_text, tgt_text, url=model_url))
            except Exception as exc:
                st.write("exc: %s" % exc)
                st.stop()
                raise SystemExit(1) from exc

            st.markdown("### cosine similarity matrix")
            st.dataframe(pd.DataFrame(cos_mat).style.highlight_max(axis=0))

            fig, ax = plt.subplots()
            # fig = plt.figure()  # (figsize= (10,7))

            # cmap = sns.diverging_palette(20, 220, as_cmap=True)
            sns.heatmap(cos_mat, vmin=0, vmax=1)

            # plt.xlabel("file2")
            # plt.ylabel("file1")
            plt.xlabel(f"{tgt_fileio.name}")
            plt.ylabel(f"{src_fileio.name}")
            plt.title("cosine similarity heatmap")
            st.pyplot(fig)

            # plt.close()

            # st.markdown("## fine-tune alignment")
            st.header("fine-tune alignment")

            thr = st.slider(
                "threshold (0...1) -- drag or click to adjust threshold value",
                min_value=0.,
                max_value=1.,
                value=0.5,
                step=0.05,
                key="thr-slider",
            )

            p_list = bee_aligner(src_text, tgt_text, cos_mat=cos_mat, thr=thr)

            if p_list is not None:
                df = pd.DataFrame(p_list)
                st.markdown("#### para alignment at a glance")
                st.info("(hove over a cell to disply full text)")

                st.dataframe(df)
                # st.dataframe(df.style.highlight_max(axis=0))

                st.subheader("para alignment in detail")
                if st.checkbox(
                        f"tick to show detailed alignment (threhold={thr})",
                        value=0,
                        key="para_df"):
                    st.table(df)

                s_df = color_table_applymap(df)

                st.sidebar.subheader("aligned paras xlsx file for downloading")
                st.sidebar.markdown(get_table_download_link(s_df),
                                    unsafe_allow_html=True)

            # para-sent
            if st.sidebar.checkbox(
                    " align sent within paras already aligned",
                    value=0,
                    key="para-sent-align",
            ):
                if p_list is None:
                    st.info(" align paras first")
                else:
                    st.subheader(" Aligning sents ")
                    # st.info(" TODO ")

                    # ====
                    then = default_timer()
                    s_list = plist_to_slist(p_list)
                    thr_s = [""] * len(s_list)
                    c_mat = [""] * len(s_list)

                    # final_aligned = [[], [], []]
                    final_aligned = []

                    # for elm in range(2):
                    for elm in range(len(c_mat)):
                        _ = """
                        thr_s[elm] = st.slider(
                            "",
                            min_value=0.,
                            max_value=1.,
                            value=0.5,
                            step=0.05,
                            key=f"thr-slider-{elm}",
                        )
                        st.write(f"{elm+1}/{len(c_mat)}", thr_s[elm])
                        # """

                        st.write(f"{elm + 1}/{len(c_mat)}")
                        thr_s[elm] = 0.5

                        # c_mat[elm] = bee_corr(s_list[elm][0], s_list[elm][1])
                        c_mat[elm] = bee_corr(
                            s_list[elm][0],
                            s_list[elm][1],
                            # src_lang=lang1,
                            # tgt_lang=lang2,
                            url=model_url)
                        s_list_aligned = bee_aligner(
                            s_list[elm][0],
                            s_list[elm][1],
                            cos_mat=c_mat[elm],
                            thr=thr_s[elm],
                        )

                        st.table(pd.DataFrame(s_list_aligned))

                        # final_aligned[0].extend([elm[0] for elm in s_list_aligned])
                        # final_aligned[1].extend([elm[1] for elm in s_list_aligned])
                        # final_aligned[2].extend([elm[2] for elm in s_list_aligned])
                        # [*zip(final_aligned[0], final_aligned[1], final_aligned[2])]
                        final_aligned.extend(s_list_aligned)

                    logger.debug("total sents: %s", len(final_aligned))

                    st.write(
                        f"Time spent for sent alignment: {(default_timer() - then) / 60:.2f} min"
                    )
                    st.write(f"Total sent pairs: {len(final_aligned)}")

                    st.subheader("aligned sentences in one batch")
                    df_sents = pd.DataFrame(final_aligned)
                    # s_df_sents = color_table_applymap(df_sents)
                    s_df_sents = df_sents

                    if st.checkbox("Tick to show",
                                   value=0,
                                   key="finall align sentences"):
                        # st.table(final_aligned)
                        st.table(s_df_sents)

                    logger.debug("aligned sents ready for downloading")

                    st.sidebar.subheader(
                        "Aligned sents in xlsx for downloading")
                    st.sidebar.markdown(
                        get_table_download_link_sents(s_df_sents),
                        unsafe_allow_html=True)

                    # ====

    # align sents
    # if st.sidebar.checkbox(
    # "tick to proceed with sent alignment (w/o para align)",
    # value=False,
    # key="sent-align",
    # ):
    if op_selectbox in ["Simple Sent Align"
                        ]:  # ("Para/Sent Align", "Simple Sent Align")
        if not (file1_flag and file2_flag):
            # st.info("Pick two files first")
            instruction1()
        else:
            # st.info(" sent alignment to be implemented ")
            sents1 = []
            for elm in src_text:
                if elm.strip():
                    sents1.extend(seg_text(elm, lang1))
            st.info(sents1[:3])
            sents2 = []
            for elm in tgt_text:
                if elm.strip():
                    sents2.extend(seg_text(elm, lang2))
            st.info(sents2[:3])
            len1s = len(sents1)
            len2s = len(sents2)
            st.info([
                f"file1: {len1s} sents",
                f"{lang1}",
                f"file2: {len2s} sents",
                f"{lang2}",
            ])

            est_time1 = len1s // 32 + bool(len1s % 32)
            est_time1 += len2s // 32 + bool(len2s % 32)
            est_time1 *= 7 / 60

            st.info(
                f"The first run may take a while, about {est_time1:.1f} min")
            try:
                #     cos_mat = np.asarray(bee_corr(src_text, tgt_text))
                cos_mat1 = np.asarray(bee_corr(sents1, sents2, url=model_url))
            except Exception as exc:
                # st.write("exc: ", exc)
                logger.error("exc: %s", exc)
                st.stop()
                raise SystemExit(1) from exc

            st.markdown("### cosine similarity matrix (sents)")
            st.dataframe(
                pd.DataFrame(cos_mat1.round(2)).style.highlight_max(axis=0))

            mean1 = cos_mat1.mean()
            var1 = cos_mat1.var()

            fig, ax = plt.subplots()
            # fig = plt.figure()  # (figsize= (10,7))

            # cmap = sns.diverging_palette(20, 220, as_cmap=True)
            sns.heatmap(cos_mat1, vmin=0, vmax=1)

            plt.xlabel(f"{tgt_fileio.name}")
            plt.ylabel(f"{src_fileio.name}")
            plt.title(
                f"mean={mean1.round(2)} var={var1.round(2)} cosine similarity (sents) heatmap"
            )
            st.pyplot(fig)

            # st.markdown("## fine-tune alignment")
            st.header("fine-tune sent alignment")

            thr_i = float(round(min(mean1 + 30 * var1, 0.5), 2))
            st.info(f"inital threshold={thr_i:.2f}")

            thr1 = st.slider(
                "threshold (0...1) -- drag or click to adjust threshold value",
                min_value=0.,
                max_value=1.,
                value=thr_i,
                step=0.05,
                # key="sliderkey",
            )

            p_list1 = bee_aligner(sents1, sents2, cos_mat=cos_mat1, thr=thr1)

            if p_list1 is not None:
                df1 = pd.DataFrame(p_list1)
                st.markdown("#### sent alignment at a glance")
                st.info("(hove over a cell to disply full text)")

                st.dataframe(df1)
                st.subheader("sent alignment in detail")
                if st.checkbox(
                        f"tick to show detailed sent alignment (threshold={thr1})",
                        value=0,
                        key="sent_df"):
                    st.table(df1)

                s_df1 = color_table_applymap(df1)

                st.sidebar.subheader("aligned sent xlsx file for downloading")
                st.sidebar.markdown(get_table_download_link_sents(s_df1),
                                    unsafe_allow_html=True)

    back_cover()
Example #7
def plist_to_slist(
        p_list: List[Tuple[str, str, float]],
        lang0: str = "",
        lang1: str = "",
) -> List[Tuple[str, str, float]]:
    # ):
    # fmt: on
    """ para_list to sent_list. """

    c_th = currentThread()

    # logzero.loglevel(10)
    logzero.loglevel(20)

    # convert 3rd column's nonempty str to float
    prob = [elm[2] if elm[2] == "" else float(elm[2]) for elm in p_list]

    logger.debug("prob: %s", prob)

    if lang0 in [""]:
        lang0 = Detector(" ".join([elm[0] for elm in p_list[:100]])).language.code
    if lang1 in [""]:
        lang1 = Detector(" ".join([elm[1] for elm in p_list[:100]])).language.code

    # lang0 = Detector(" ".join([elm[0] for elm in p_list[:100]])).language.code
    # lang1 = Detector(" ".join([elm[1] for elm in p_list[:100]])).language.code

    idx_lst = [[]]
    # t_f = [*map(lambda x: not isinstance(x, float), prob[:])]; idx, elm = i, t_f[i]
    for idx, elm in enumerate(map(lambda x: not isinstance(x, float), prob[:])):
        idx_lst = (idx_lst[:-1] + [idx_lst[-1] + [idx]]) if elm else (idx_lst + [[idx]])

    logger.debug("idx_lst: %s", str(idx_lst))

    # p_list[idx_lst[3][0]: idx_lst[3][-1]+1]
    # p_list[idx_lst[idx][0]: idx_lst[idx][-1]+1]

    sent_lst = []
    for elm in idx_lst:
        if not elm:  # bypass possible first empty list
            continue

        left, right = [], []
        for idx in range(elm[0], elm[-1] + 1):
            if p_list[idx][0]:
                # left0 = [sent.string for sent in Text(p_list[idx][0], lang0).sentences]
                left0 = split_text(p_list[idx][0], lang0)
                left.extend([s.strip() for s in left0 if s.strip()])
            if p_list[idx][1]:
                # right0 = [sent.string for sent in Text(p_list[idx][1], lang1).sentences]
                right0 = split_text(p_list[idx][1], lang1)
                right.extend([s.strip() for s in right0 if s.strip()])

            # supply "" string if nothing exists
            if not left:
                left = [""]
            if not right:
                right = [""]

        sent_lst.append([left, right])

    c_th.sent_lst = sent_lst  # type: ignore
    # SIG_ALIGNER.send("plist_to_slist", **{"sent_lst":sent_lst})
    # SIG_ALIGNER.send("plist_to_slist", sent_lst=sent_lst)

    return sent_lst
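The index-grouping one-liner above is dense; a self-contained sketch of what it does (rows whose merit is a float start a new group, rows with "" stay attached to the current group):

prob = [0.9, "", 0.7, 0.8, ""]
idx_lst = [[]]
for idx, no_merit in enumerate(not isinstance(x, float) for x in prob):
    idx_lst = (idx_lst[:-1] + [idx_lst[-1] + [idx]]) if no_merit else (idx_lst + [[idx]])
print(idx_lst)  # [[], [0, 1], [2], [3, 4]] -- the leading [] is bypassed later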
Example #8
def text_to_plist(
    text_dual: Union[str, List[str]],
    langs: Optional[List[str]] = None,
) -> List[Tuple[str, str, str]]:
    # fmt: on
    """Convert text_dual to p_list, given a two-tuple langs, e.g., ['en', 'zh']."""

    c_thr = currentThread()

    if isinstance(text_dual, list):
        text_dual = "\n".join(text_dual)

    if langs is None:
        # langs = ['en', 'zh']
        langs = single_or_dual(text_dual)

    langid.set_languages(langs)

    # if polyglot's Detector result is not in langs, fall back to langid.classify

    # remove "\u3000"
    paras = [
        elm.strip() for elm in text_dual.replace("\u3000", " ").splitlines()
        if elm.strip()
    ]

    if len(langs) == 1:
        _ = [*zip_longest(paras, [""], [""], fillvalue="")]
        c_thr.p_list = _
        return _

    # with timeme():  # 2094 ms
    langs_info = []
    for para in paras:
        lang = Detector(para, True).language.code
        if lang not in langs:
            lang = langid.classify(para)[0]
        langs_info.append(lang)

    if len(langs_info) < 2:
        logger.warning(
            "langs_info: %s, nothing to separate, returning original text as the first column with two empty columns",
            langs_info)

        _ = [*zip_longest(paras, "", "", fillvalue="")]
        c_thr.p_list = _
        return _

    binary_info = [1]
    for idx, elm in enumerate(langs_info[1:], 1):
        if elm == langs_info[idx - 1]:
            binary_info.append(0)
        else:
            binary_info.append(1)

    left = []
    right = []
    l_or_r = 1
    for idx, para in enumerate(paras):
        if binary_info[idx]:
            # switch
            l_or_r = (l_or_r + 1) % 2
            if l_or_r:  # right
                right.append([])
            else:
                left.append([])
        if l_or_r:  # right
            right[-1].append(para)
        else:
            left[-1].append(para)

    left = ["\n".join(elm) for elm in left]
    right = ["\n".join(elm) for elm in right]

    _ = 5
    # corr0 = bee_aligner.bee_corr.bee_corr(left[1: _ + 1], right[:_]).diagonal()  # skip possible junk at the beginning
    # corr1 = bee_aligner.bee_corr.bee_corr(left[1: _ + 1], right[1:_ +1]).diagonal()

    corr0 = bee_corr(
        left[1:_ + 1],
        right[:_]).diagonal()  # skip possible junk at the beginning
    corr1 = bee_corr(left[1:_ + 1], right[1:_ + 1]).diagonal()

    if np.sum(corr0) > np.sum(corr1):
        p_list = [*zip_longest(left, [''] + right, fillvalue='')]
    else:
        p_list = [*zip_longest(left, right, fillvalue='')]

    _ = p_list[:]
    p_list = []
    for para in _:
        len0, len1 = len(para[0]), len(para[1])
        if len0 > 20 * len1 or len1 > 20 * len0:
            entry = [para[0], para[1], '']
        else:
            entry = [para[0], para[1], '0.66']
        p_list.append(entry)

    logger.info(" update table via SIG_TABLE.send(df=p_list)")
    SIG_TABLE.send("text_to_plist", df=p_list)

    c_thr.p_list = p_list  # type: ignore

    return p_list
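A toy sketch of the language-alternation split above (self-contained; langs_info stands in for the per-para detection results):

langs_info = ["en", "en", "zh", "en", "zh", "zh"]
paras = ["E1", "E2", "Z1", "E3", "Z2", "Z3"]
# 1 marks a language switch relative to the previous para
binary_info = [1] + [int(langs_info[i] != langs_info[i - 1]) for i in range(1, len(langs_info))]
left, right, l_or_r = [], [], 1
for idx, para in enumerate(paras):
    if binary_info[idx]:  # language switch: open a new cell on the other side
        l_or_r = (l_or_r + 1) % 2
        (right if l_or_r else left).append([])
    (right[-1] if l_or_r else left[-1]).append(para)
print(left)   # [['E1', 'E2'], ['E3']]
print(right)  # [['Z1'], ['Z2', 'Z3']]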
Example #9
    def start_command(self, event=None):
        """ need a QUEUE_PS (paras or sents flag).

        set when ativating palign or salign

        flag = QUEUE_PS.get_nowait()
        queue1_put(QUEUE_PS, flag)
        """

        try:
            flag = QUEUE_PS.get_nowait()
            queue1_put(QUEUE_PS, flag)
        except Empty:
            flag = ""

        # ######### SENTS ###########
        # 358 - 475 salign
        if flag in ["s"]:
            logger.debug("salign myprogressbar.start_command")
            logger.info("handle salign...")
            # check_thread_update
            # pbar (self) TButton1: Start, 2: Cancel,3: Back
            self.TButton1.config(state=tk.DISABLED)
            self.TButton2.config(state=tk.NORMAL)
            self.TButton3.config(state=tk.DISABLED)

            if not QUEUE_PA.qsize():
                messagebox.showwarning(
                    title="Not ready",
                    message=
                    " Paras not aligned yet, align paras first (Ctrl-P) ")

                # restore button state 1: Start, 2: Cancel, 3: Back
                self.TButton1.config(state=tk.DISABLED)
                self.TButton2.config(state=tk.DISABLED)
                self.TButton3.config(state=tk.NORMAL)

                return None

            try:
                paras1 = QUEUE_P1.get_nowait()
                logger.debug(" paras1[:3]: %s", paras1[:3])
                queue1_put(QUEUE_P1, paras1)
            except Exception as exc:
                logger.error(" QUEUE_P1.get_nowait() exc: %s", exc)
                return None
            try:
                paras2 = QUEUE_P2.get_nowait()
                queue1_put(QUEUE_P2, paras2)
            except Exception as exc:
                logger.error(" QUEUE_P2.get_nowait() exc: %s", exc)
                return None
            try:
                parasm = QUEUE_PM.get_nowait()
                queue1_put(QUEUE_PM, parasm)
            except Exception as exc:
                logger.error(" QUEUE_PM.get_nowait() exc: %s", exc)
                return None

            lang0 = Detector(" ".join(paras1)).language.code
            lang1 = Detector(" ".join(paras2)).language.code

            # plist = [*zip_longest(paras1, paras2, parasm)]
            # QUEUE_DF, model.df

            try:
                # plist = QUEUE_DF[0]
                qdf = QUEUE_DF[0]
            except IndexError:
                logger.error(" qdf = QUEUE_DF[0] IndexError")
                messagebox.showwarning(" Oh no!", "Nothing in QUEUE_DF")
                return None
            except Exception as exc:
                logger.error(" qdf = QUEUE_DF[0] exc: %s", exc)
                messagebox.showwarning(
                    " Oh no!",
                    "unable to obtain data frmo QUEUE_DF exc: %s" % exc)
                return None

            # slist = plist_to_slist(plist, lang0, lang1)
            # tot = len(slist) * 2
            tot = len(qdf) * 2

            if tot > 300:
                msg = f"This can take about {get_time(tot)}. Continue?"
                logger.debug(msg)
                res = messagebox.askyesnocancel("Continue?", message=msg)
                if not res:
                    return

            # get QUEUE_PS and QUEUE_PS0; if both 's', set split=False
            try:
                qps0 = QUEUE_PS0[0]
            except IndexError:
                qps0 = ""
            # fetch QUEUE_PS and restore
            qps = fetch_queue1(QUEUE_PS)

            logger.debug(" qps0: %s qps0, qps: %s", qps0, qps)
            split = True
            if qps0 == qps:
                split = False

            plist = [*zip_longest(
                qdf["text1"],
                qdf["text2"],
                qdf["merit"],
            )]

            logger.debug("split: %s, plist[:5]: %s", split, plist[:5])

            thr = Thread(target=plist_to_flist,
                         args=(plist, ),
                         kwargs={
                             "lang0": lang0,
                             "lang1": lang1,
                             "split": split
                         })
            thr.stop = False  # type: ignore
            thr.start()

            self.TProgressbar1.start()  # for mode="indeterminate"

            check_thread_update(self, thr)
            signal = {
                "PAlign": False,
                "SAlign": False,
                "pbtoplevel": True,  # pbar grab_set, prevent editing Pad
            }
            logger.debug(
                """pbar send blinker.signal to aligner slot, signal: %s """,
                signal)
            SIG_ALIGNER.send("check_thread_update1", **signal)

            logger.debug(" pbar-s exit ")

            return None

        # ######### PARAS ###########
        logger.debug("palign myprogressbar.start_command")
        if event:
            # print(event)
            logger.debug("event: %s", event)

        # if tkMessageBox.askokcancel
        # self.top.destroy();

        # self.start_command()

        # pbar = Mypbar(top): self.top = top => top = Aligner
        # self == pbar?
        # self.top: TopLevel
        # needs Aligner
        # self.top.align_command()

        # pbar (self) TButton1: Start, 2: Cancel,3: Back
        self.TButton1.config(state=tk.DISABLED)
        self.TButton2.config(state=tk.NORMAL)
        self.TButton3.config(state=tk.DISABLED)

        _ = """
        thr = Thread(
            target=longtime_job,
            # name='job_thr',
            kwargs={"counter": 10},
        )
        # """

        # src_text  # type: Union[str, List[str]
        # tgt_text  # type: Union[str, List[str]

        try:
            src_text = QUEUE_T1.get_nowait()
            queue1_put(QUEUE_T1, src_text)
        except Empty:
            src_text = ""
        try:
            tgt_text = QUEUE_T2.get_nowait()
            queue1_put(QUEUE_T2, tgt_text)
        except Empty:
            tgt_text = ""

        if not (src_text and tgt_text):
            messagebox.showwarning(
                title="Not ready",
                message=" Empty src_text or tgt_text, load files first. ")

            # restore button state 1: Start, 2: Cancel, 3: Back
            self.TButton1.config(state=tk.DISABLED)
            self.TButton3.config(state=tk.NORMAL)
            self.TButton2.config(state=tk.DISABLED)

            return None

        logger.debug("src_text[:5]: %s", src_text[:5])
        logger.debug("tgt_text[:5]: %s", tgt_text[:5])

        try:
            src_lang = Detector("\n".join(src_text)).language.code
        except Exception as exc:
            # messagebox.showerror(" Oh no! ", str(exc))
            # self.TButton1.config(state=tk.DISABLED)
            # self.TButton3.config(state=tk.NORMAL)
            # self.TButton2.config(state=tk.DISABLED)
            # return None
            logger.error(
                'src_lang = Detector("\\n".join(src_text)).language.code exc: %s',
                exc)
            src_lang = "en"

        try:
            tgt_lang = Detector("\n".join(tgt_text)).language.code
        except Exception as exc:
            # messagebox.showerror(" Oh no! ", str(exc))
            # self.TButton1.config(state=tk.DISABLED)
            # self.TButton3.config(state=tk.NORMAL)
            # self.TButton2.config(state=tk.DISABLED)
            # return None
            logger.error(
                'tgt_lang = Detector("\\n".join(tgt_text)).language.code exc: %s',
                exc)
            tgt_lang = "en"

        # convert to lists, already lists
        # src_text = text_to_paras(src_text)
        # tgt_text = text_to_paras(tgt_text)

        tot = len(src_text) + len(tgt_text)
        logger.debug(
            "tot: %s, len(src_text): %s, len(tgt_text): %s, src_text[:3]: %s, tgt_text[:3]: %s",
            tot, len(src_text), len(tgt_text), src_text[:3], tgt_text[:3])
        if tot > 300:
            msg = f"This can take about {get_time(tot)}. Continue?"
            logger.debug(msg)
            res = messagebox.askyesnocancel("Continue?", message=msg)
            if not res:
                self.TButton1.config(state=tk.DISABLED)
                self.TButton3.config(state=tk.NORMAL)
                self.TButton2.config(state=tk.DISABLED)
                return

        # cos_mat = bee_corr(src_text, tgt_text, src_lang, tgt_lang)
        # plist = bee_aligner(src_text, tgt_text, cos_mat)

        thr = Thread(
            target=bee_aligner,
            args=(src_text, tgt_text),
            kwargs={"thr": self.spinbox_value},
        )

        logger.debug(" self.spinbox_value: %.2f, %s", self.spinbox_value,
                     type(self.spinbox_value))

        thr.stop = False  # type: ignore
        thr.start()

        logger.debug("*job* thr_name: %s", thr.name)

        # thr.value = 0
        # check_thread(thr)

        # self: pbar
        # reset pbar
        self.TProgressbar1["value"] = 0
        self.TProgressbar1.step(100)

        self.TProgressbar1.start()  # for mode="indeterminate"

        check_thread_update(self, thr)

        # thr.join()? fetch result from longtime_job
        # thr.join() will block, hence nonresponsive

        signal = {
            "PAlign": False,
            # "SAlign": True,
            "pbtoplevel": True,  # pbar grab_set, prevent editing Pad
        }
        logger.debug("""send blinker.signal to aligner slot, signal: %s """,
                     signal)
        SIG_ALIGNER.send("check_thread_update", **signal)

        logger.debug(" pbar-p exit ")