Example #1
    def make_utterance_markup(self, doc, utt_index, translation, budget,
                              stem=False):
        """Render one translated utterance as HTML, marking query matches
        (stem or exact) and truncating to the word budget."""
        score_key = translation + (".stem_match" if stem else ".exact_match")
        match_style = "rel_close_match" if stem else "rel_exact_match"
        utt = doc.utterances[utt_index]["translations"][translation]
        token_scores = np.array(
            doc.annotations[score_key]["annotation"][utt_index]
                ["word"]["matches"]
        )
        # Exactly one column (query word) is expected; collapse to a vector.
        assert token_scores.ndim == 2 and token_scores.shape[1] == 1
        token_scores = token_scores.ravel()

        line_items = []
        for t, token in enumerate(utt.tokens):
            if token_scores[t] > 0:
                # "span_class" and "[STYLE]" are placeholders so the
                # detokenizer treats each tag as a single word; both are
                # rewritten after detokenization.
                line_items.append(
                    '<span_class="[STYLE]">{}</span>'.format(token.word)
                )
            else:
                line_items.append(token.word)

        line = detokenize(" ".join(line_items))
        wc = len(line.split())
        if wc > budget:
            wc = budget
            line = " ".join(line.split()[:wc]) + "..."
        line = re.sub(r"span_class", "span class", line)
        line = re.sub(r"\[STYLE\]", match_style, line)
        return "<p>{}</p>\n".format(line), wc
Example #2
    def make_utterance_markup(self, utt, budget, exact_matches, close_matches):
        """Mark exact and close matches from precomputed token sets and trim
        the utterance to a window around the matches."""

        line_items = []
        matches = []
        for token in utt.tokens:
            if token in exact_matches:
                line_items.append('<span_class="[EXACTREL]">' + token.word +
                                  '</span>')
                matches.append(1)
            elif token in close_matches:
                line_items.append('<span_class="[REL]">' + token.word +
                                  '</span>')
                matches.append(1)
            else:
                line_items.append(token.word)
                matches.append(0)

        # If the utterance exceeds the budget, try to keep a window spanning
        # all matched tokens instead of truncating from the front.
        l, r = None, None
        new_line_items = line_items
        if len(line_items) > budget:
            match_inds = [i for i in range(len(matches)) if matches[i] == 1]
            if len(match_inds) > 0:
                l = match_inds[0]
                r = match_inds[-1]

                if r - l + 1 > budget:
                    # The match span itself exceeds the budget; fall back to
                    # plain front-truncation below.
                    new_line_items = line_items
                    l, r = None, None
                else:
                    # Grow the window outwards until it fills the budget.
                    new_line_items = line_items[l:r + 1]
                    while len(new_line_items) < budget:
                        if l > 0:
                            l = l - 1
                            new_line_items.insert(0, line_items[l])
                            if len(new_line_items) == budget:
                                break
                        if r < len(line_items) - 1:
                            r = r + 1
                            new_line_items.append(line_items[r])

        line = detokenize(" ".join(new_line_items))
        wc = len(line.split())
        if wc > budget:
            wc = budget
            line = " ".join(line.split()[:wc]) + "..."

        # Ellipses mark text trimmed away on either side of the window.
        if l is not None and l > 0:
            line = '...' + line
        if r is not None and r < len(line_items) - 1:
            if not line.endswith('...'):
                line = line + '...'

        line = re.sub(r"span_class", "span class", line)
        line = re.sub(r"\[EXACTREL\]", "rel_exact_match", line)
        line = re.sub(r"\[REL\]", "rel_close_match", line)
        return "<p>{}</p>\n".format(line), wc
Example #3
    def make_utterance_markup(self, doc, utt_index, translation, budget, query,
                              stem=False):
        """Like Example #1, but also marks the tokens most similar to
        unmatched query words using embedding similarity."""
        score_key = translation + (".stem_match" if stem else ".exact_match")
        match_style = "rel_close_match" if stem else "rel_exact_match"
        rel_style = "rel_close" if stem else "rel_exact"
        utt = doc.utterances[utt_index]["translations"][translation]
        raw_token_scores = np.array(
            doc.annotations[score_key]["annotation"][utt_index]
                ["word"]["matches"]
        )
        
        query_tokens = [x.word for x in query.content.tokens
                        if x.word.lower() not in en_stopwords]
        assert len(query_tokens) == raw_token_scores.shape[1]

        # Record which query words were matched anywhere in the utterance.
        found_words = set()
        for qword, qscore in zip(query_tokens, raw_token_scores.sum(axis=0)):
            if qscore > 0:
                found_words.add(qword)

        # Collapse to one score per utterance token.
        token_scores = raw_token_scores.sum(axis=1)
        token_scores = token_scores.ravel()
        mark_sims = set()
        if any(raw_token_scores.sum(axis=0) == 0):
            # Some query words matched nowhere; fall back to embedding
            # similarity and mark the two most similar unmatched tokens.
            sim_key = translation + ".glove42Bsim.content_semcons"
            sim_scores = np.array(
                doc.annotations[sim_key]["annotation"][utt_index]
                    ["word"]["sims"]
            ).ravel()

            for top_sim in np.argsort(sim_scores)[::-1]:
                if token_scores[top_sim] == 0:
                    mark_sims.add(top_sim)
                if len(mark_sims) >= 2:
                    break

        line_items = []
        for t, token in enumerate(utt.tokens):
            if token_scores[t] > 0:
                line_items.append(
                    '<span_class="[STYLE]">{}</span>'.format(token.word)
                )
            elif t in mark_sims:
                line_items.append(
                    '<span_class="[REL]">{}</span>'.format(token.word)
                )
            else:
                line_items.append(token.word)

        line = detokenize(" ".join(line_items))
        wc = len(line.split())
        if wc > budget:
            wc = budget
            line = " ".join(line.split()[:wc]) + "..."
        line = re.sub(r"span_class", "span class", line)
        line = re.sub(r"\[STYLE\]", match_style, line)
        line = re.sub(r"\[REL\]", rel_style, line)
        return "<p>{}</p>\n".format(line), wc
Example #4
    def make_utterance_markup(self, utt, budget, exact_matches, close_matches):
        """Simplest variant: style tokens from precomputed match sets and
        truncate to the budget, with no windowing."""

        line_items = []
        for token in utt.tokens:
            if token in exact_matches:
                line_items.append('<span_class="[EXACTREL]">' + token.word +
                                  '</span>')
            elif token in close_matches:
                line_items.append('<span_class="[REL]">' + token.word +
                                  '</span>')
            else:
                line_items.append(token.word)

        line = detokenize(" ".join(line_items))
        wc = len(line.split())
        if wc > budget:
            wc = budget
            line = " ".join(line.split()[:wc]) + "..."

        line = re.sub(r"span_class", "span class", line)
        line = re.sub(r"\[EXACTREL\]", "rel_exact_match", line)
        line = re.sub(r"\[REL\]", "relevant", line)
        return "<p>{}</p>\n".format(line), wc
Example #5
    def __call__(self, doc, budget=100, make_header=True):
        query = doc.annotations["QUERY"]

        best_translations = []
        scores = []
        for i in range(len(doc.utterances)):
            trans, trans_score = self.get_best_translation(doc.annotations, i)
            best_translations.append(trans)
            scores.append(trans_score)

        # One ranking per scoring method; merge them into a single order.
        scores = np.stack(scores).T
        ranks = np.argsort(scores, axis=1)[:, ::-1]
        merged_ranks = merge_rankings(ranks)

        markup_lines = []
        if make_header:
            header, header_size = make_relevant_header(query)
            markup_lines.append(header)
            size = header_size
        else:
            size = 0

        # Greedily take top-ranked utterances until the budget is filled,
        # then restore document order for display.
        ranked_utterances = []
        for i in merged_ranks:
            best_trans = best_translations[i]
            utt = doc.utterances[i]["translations"][best_trans]
            num_words = len(
                detokenize(" ".join([x.word for x in utt.tokens])).split())
            size += num_words
            ranked_utterances.append({
                "index": i,
                "utt": utt,
                "trans": best_trans
            })
            if size > budget:
                break
        ranked_utterances.sort(key=lambda x: x["index"])
        size = header_size if make_header else 0

        # Collect exact-match tokens across the selected utterances, then take
        # the most query-similar nouns/verbs (up to six) as close matches.
        exact_matches = set()
        close_matches = set()
        t2s = {}
        for x in ranked_utterances:
            tokens = x["utt"].tokens
            x_matches = doc.annotations[x["trans"] +
                                        ".exact_match"]["annotation"][
                                            x['index']]["word"]["matches"]
            c_matches = doc.annotations[
                x["trans"] + ".glove42Bsim.content_semcons"]["annotation"][
                    x['index']]["word"]["sims"]
            for t, m, sim in zip(tokens, x_matches, c_matches):
                if np.sum(m) > 0:
                    exact_matches.add(t)
                if t.pos in ["NN", "VB"]:
                    t2s[t] = sim[0]
        sim_toks = sorted(t2s, key=lambda x: t2s[x], reverse=True)
        for t in sim_toks:
            if t in exact_matches: continue
            close_matches.add(t)
            if len(close_matches) > 5:
                break

        for x in ranked_utterances:
            line, wc = self.make_utterance_markup(x["utt"], budget - size,
                                                  exact_matches, close_matches)
            size += wc
            markup_lines.append(line)
            if size >= budget:
                break

        found_terms = set([t.word.lower() for t in exact_matches])
        missing_terms = set([
            t.word.lower() for t in query.content.tokens
            if t.word.lower() not in found_terms
        ])

        instr = get_instructions(query.string, found_terms, missing_terms)
        return "\n".join(markup_lines), instr
Example #6
    def __call__(self, doc, budget=100):
        query = doc.annotations["QUERY"]

        best_translations = self.get_best_translations(doc)
        scores = self.get_scores(doc)

        # If nothing matched at all, fall back to the ConceptV2 markup.
        I = np.argsort(scores)[::-1]
        if scores[I[0]] == 0:
            return ConceptV2(*self.default_args,
                             **self.default_kwargs)(doc, budget=budget)

        if self.header:
            header = "Match found for {}, check number/tense/meaning:".format(
                " ".join([t.word for t in query.content.tokens]))
            size = len(header.split())
            markup_lines = ["<h1>{}</h1>".format(header)]
        else:
            markup_lines = []
            size = 0
        meta = {
            "translation": [],
            "markup": "morphv1",
            "utterance_ids": [],
            "source_offsets": [],
            "mode": doc.mode,
            "source_md5": doc.md5
        }
        for idx in I:
            score = scores[idx]
            if score == 0:
                break
            trans = best_translations[idx]
            sent = doc.utterances[idx]["translations"][trans]
            src_utt = doc.utterances[idx]["source"]
            tokens = [token.word for token in sent.tokens]
            # Each annotation covers a token span; enumerate match_quality
            # starting at the span's first token position.
            mname = self.translation_annotators[trans][0][0]
            for m in doc.annotations[mname]["annotation"][idx]:
                for j, s in enumerate(m["match_quality"], m["token_position"]):
                    if s >= 1:
                        tokens[j] = '<span_class="RELEXACTMATCH">' \
                            + tokens[j] + '</span>'
                    else:
                        tokens[j] = '<span_class="RELEXACT">' \
                            + tokens[j] + '</span>'

            line = detokenize(" ".join(tokens))
            wc = len(line.split())
            if wc + size > budget:
                wc = budget - size
                line = " ".join(line.split()[:wc]) + "..."

            size += wc
            line = line.replace("RELEXACTMATCH", "rel_exact_match")
            line = line.replace("RELEXACT", "rel_close_match")
            line = line.replace("span_class", "span class")
            markup_lines.append("<p>{}</p>".format(line))
            meta["translation"].append(trans)
            meta["utterance_ids"].append(int(idx))
            meta["source_offsets"].append(src_utt.offsets)
            if size >= budget:
                break

        found_terms = self.get_found_words(doc, best_translations, query)
        missing_terms = [t.word.lower() for t in query.content.tokens
                         if t.word.lower() not in found_terms
                         and t.word.lower() not in en_stopwords]

        instructions = get_instructions(query.string, found_terms,
                                        missing_terms)
        return "\n".join(markup_lines), instructions, meta

        #        if query.morphological_constraint.morph.pos != "NN":
        #            return "<p>SKIP</p>"
        pos = "np"

        matches = []
        match_qualities = []
        for i, utt in enumerate(doc):

            # Score each translation by its best morphological match.
            tr2ann = {}
            tr2score = {}
            for trans in translations:
                k = trans + ".morph_match_" + pos
                ann = doc.annotations[k]["annotations"][i]
                if len(ann) > 0:
                    tr2ann[trans] = ann
                    tr2score[trans] = max([x["match_score"] for x in ann])

            if len(tr2ann) == 0:
                continue

            srt_trans = sorted(tr2ann.keys(),
                               key=lambda x: tr2score[x],
                               reverse=True)
            # On a score tie, prefer the NMT translation.
            if len(srt_trans) > 1 \
                    and tr2score[srt_trans[0]] == tr2score[srt_trans[1]]:
                if "nmt" in srt_trans[0]:
                    best_trans = srt_trans[0]
                else:
                    best_trans = srt_trans[1]
            else:
                best_trans = srt_trans[0]

            for ann in tr2ann[best_trans]:
                match_qualities.append([x >= 1 for x in ann["match_quality"]])
            matches.append({
                "sent": i,
                "trans": best_trans,
                "anns": tr2ann[best_trans],
                "score": tr2score[best_trans],
                "exact_morph": any(x["exact_morph"]
                                   for x in tr2ann[best_trans])
            })

        # Both sorts are stable, so sorting by score and then by exact_morph
        # puts exact matches first (highest score first), followed by soft
        # matches, also by highest score. (A quick demonstration of this
        # two-pass idiom follows the example.)
        matches.sort(key=lambda x: x["score"], reverse=True)
        matches.sort(key=lambda x: x["exact_morph"], reverse=True)

        if len(match_qualities) == 0:
            result = ConceptV2(*self.default_args,
                               **self.default_kwargs)(doc, budget=budget)
            result[2]["markup"] = "morph-backoff-conceptv2"
            return result

        found_term_ind = np.array(match_qualities).sum(axis=0)
        found_terms = [
            q.word for q, ind in zip(query.content.tokens, found_term_ind)
            if ind
        ]

        markup_lines = []

        header, size = make_word_match_header(query, found_terms)
        markup_lines.append(header)

        for match in matches:
            sent = doc.utterances[match["sent"]]["translations"][
                match["trans"]]
            tokens = [token.word for token in sent.tokens]
            for m in match["anns"]:
                for j, s in enumerate(m["match_quality"], m["token_position"]):
                    if s >= 1:
                        tokens[j] = '<span_class="RELEXACTMATCH">' \
                            + tokens[j] + '</span>'
                    else:
                        tokens[j] = '<span_class="RELEXACT">' \
                            + tokens[j] + '</span>'

            line = detokenize(" ".join(tokens))
            wc = len(line.split())
            if wc + size > budget:
                wc = budget - size
                line = " ".join(line.split()[:wc]) + "..."

            size += wc
            line = line.replace("RELEXACTMATCH", "rel_exact_match")
            line = line.replace("RELEXACT", "rel_exact")
            line = line.replace("span_class", "span class")
            markup_lines.append("<p>{}</p>".format(line))
            if size >= budget:
                break

        missing_terms = [
            t.word.lower() for t in query.content.tokens
            if t.word.lower() not in found_terms
        ]

        instructions = get_instructions(query.string, found_terms,
                                        missing_terms)
        return "\n".join(markup_lines), instructions