Example #1
0
def process_html_structured(classifier, html, ranker, parId):
    """
    Transforms HTML source, enriching simplified text spans with core markup by
    separating markup from text and sending pure text to simplification class.

    :param classifier: Simplification classifier instance
    :param html: Input HTML source
    :param ranker: personalized ranker, passed through to the classifier
    :param parId: Paragraph identifier to disambiguate core simplification
    targets across multiple calls to this method
    :return: a tuple containing (1) the enriched HTML output and (2) a dict
    that maps core simplification target IDs to dicts of the form:
    ```
    {
        "original": original,
        "simple": simple,
        "bad_feedback": False,
        "is_simplified": False
    }
    ```
    """
    simplifications = {}
    # Early exit on blank input, BEFORE the lazy parser initialization.
    # Return the same (html, simplifications) tuple shape as the regular
    # path so callers can always unpack two values (the original returned
    # a bare string here, breaking tuple-unpacking callers).
    if not html.strip():
        return html, simplifications
    # Lazily initialize the module-level link-grammar parser on first use;
    # the import is deferred because pylinkgrammar is expensive to load.
    global parser
    if not parser:
        from pylinkgrammar.linkgrammar import Parser
        parser = Parser()
    html_out = []
    spanId = 0
    output_sents = classifier.simplify_text(html, ranker)
    # output_sents is a pair of parallel sequences: originals and their
    # simplified counterparts.
    for original, simple in zip(*output_sents):
        simple_parsed = parser.parse_sent(simple)
        logger.debug([simple_parsed, simple.replace('\n', ''), parser])
        logger.debug(original)
        if original == simple:
            # Nothing was simplified; emit the sentence unmarked.
            html_out.append(original)
        # elif not simple_parsed:
        #     out.append(original)
        else:
            original = util.escape(original)
            simple = util.escape(simple)
            spanId += 1
            elemId = "lexi_{}_{}".format(parId, spanId)
            html_out.append(
                "<span id='{}' class='lexi-simplify'>{}</span>".format(
                    elemId, original))
            simplifications.update({
                elemId: {
                    "original": original,
                    "simple": simple,
                    "bad_feedback": False,
                    "is_simplified": False,
                    # "sentence": sentence,  # uncomment if ever needing this
                    # "index": word_index
                }
            })
    return " ".join(html_out), simplifications
Example #2
0
def process_html_lexical(pipeline,
                         html,
                         startOffset,
                         endOffset,
                         cwi,
                         ranker,
                         requestId=0,
                         min_similarity=0.7,
                         blacklist=None):
    """
    Transforms HTML source, enriching simplified words with core markup by
    separating markup from text and sending pure text to simplification class.

    :param pipeline: Simplification pipeline instance
    :param html: Input HTML source
    :param startOffset: offset after which simplifications are solicited
    :param endOffset: offset until which simplifications are solicited
    :param cwi: personalized CWI module
    :param ranker: personalized ranker
    :param requestId: Request identifier to disambiguate core simplification
    targets across multiple calls to this method
    :param min_similarity: minimum similarity score for candidates, if
    applicable
    :param blacklist: list of words not to be simplified
    :return: a tuple containing (1) the enriched HTML output and (2) a set of
    dicts that map core simplification target IDs to dicts of the form:
    ```
    {
        "original": original word,
        "simple": simplified word,
        "choices": list of word alternatives available for display,
        "bad_feedback": boolean, will be filled by frontend indicating bad
                        feedback,
        "selection": integer, will be filled by frontend, storing the number of
                     clicks by user to arrive at ultimately selected alternative
        "sentence": the original sentence string (without markup),
        "word_index": integer, index of the target word in the whitespace-
                      separated sentence string (counting from 0)
    }
    ```
    """
    def get_local_hyperlink_balance(tags):
        # Count <a ...> openers minus </a> closers in this offset's tag
        # list; a positive running balance means we are inside a hyperlink.
        local_hyperlink_balance = 0
        for tag in tags:
            if tag.startswith("<a "):
                local_hyperlink_balance += 1
            elif tag == "</a>":
                local_hyperlink_balance -= 1
        return local_hyperlink_balance

    simplifications = {}
    html_out = ""
    spanId = 0
    if not html.strip():
        return html, simplifications
    # offset2html maps character offsets in the pure text back to the HTML
    # tags that were stripped at that position; pure_text is the markup-free
    # text that the pipeline operates on.
    offset2html, pure_text = util.filter_html(html)

    # check if this is a single-word request. If so, do not provide CWI module
    # such that the word will be regarded as difficult
    if ' ' not in pure_text[startOffset:endOffset]:
        logger.info("Single-word request: '{}', at character offsets {} to {} "
                    "in text: {}".format(pure_text[startOffset:endOffset],
                                         startOffset, endOffset, pure_text))
        cwi = None

    # offset2simplification maps a character offset in pure_text to the
    # simplification found for the word starting there (if any).
    offset2simplification = pipeline.simplify_text(
        pure_text,
        startOffset,
        endOffset,
        cwi=cwi,
        ranker=ranker,
        min_similarity=min_similarity,
        blacklist=blacklist)
    logger.debug("Simplifying text between character offsets {} "
                 "and {}: {}".format(startOffset, endOffset, pure_text))
    logger.debug(offset2simplification)
    # Walk the pure text character by character, re-inserting the stripped
    # markup at its original offsets and wrapping simplified words in spans.
    i = 0
    open_hyperlinks_count = 0
    while i < len(pure_text):
        tags_at_offset = offset2html.get(i, [])
        open_hyperlinks_count += get_local_hyperlink_balance(tags_at_offset)
        # insert any HTML markup that belongs here
        if i in offset2html:
            html_out += "".join(offset2html[i])
        if i in offset2simplification and not open_hyperlinks_count > 0:
            # checking for hyperlinks because we don't want to simplify those
            original, replacements, sentence, \
                word_offset_start, word_offset_end = \
                offset2simplification[i]
            # in future, possibly get more alternatives, and possibly return
            # in some other order
            replacements = [util.escape(r) for r in replacements]
            choices = [original] + replacements
            spanId += 1
            elemId = "lexi_{}_{}".format(requestId, spanId)
            # choices[0] is always original here, so this is currently
            # always "true"; kept as an expression for frontend state sync.
            displaying_original = "true" if choices[0] == original else "false"
            span_out = "<span id='{}' " \
                       "class='lexi-simplify' " \
                       "data-displaying-original='{}'>" \
                       "{}" \
                       "</span>"\
                .format(elemId, displaying_original, original)
            html_out += span_out
            simplifications.update({
                elemId: {
                    "request_id": requestId,
                    "original": original,
                    "simple": replacements,  # legacy for frontend v. <= 0.2
                    "choices": choices,
                    "bad_feedback": False,
                    "selection": 0,
                    "sentence": sentence,
                    "word_offset_start": word_offset_start,
                    "word_offset_end": word_offset_end
                }
            })
            # Skip past the rest of the original word (the final += 1 below
            # advances over its last character). NOTE(review): any markup
            # recorded at offsets inside the skipped word would not be
            # re-inserted — presumably filter_html never splits a word this
            # way; confirm against util.filter_html.
            i += len(original) - 1
        else:
            html_out += pure_text[i]
        i += 1
    # Append possible trailing markup recorded at offset len(pure_text)
    # (i equals len(pure_text) after the loop).
    if i in offset2html:
        html_out += "".join(offset2html[i])
    return html_out, simplifications