Пример #1
0
def test_html_fromstring_too_big(fromstring):
    """Expect ``u.html_fromstring`` to give ``None`` without using the parser.

    NOTE(review): ``fromstring`` is presumably a mock injected by a patch
    decorator that is not visible here, as is whatever setup makes this
    small document count as "too big" -- confirm against the full file.
    """
    parsed = u.html_fromstring("<html></html>")
    eq_(None, parsed)
    # The injected parser stand-in must not have been invoked.
    assert_false(fromstring.called)
Пример #2
0
def test_html_fromstring_exception():
    """Expect ``u.html_fromstring`` to give ``None`` for this document.

    NOTE(review): the name suggests the underlying parser is made to raise;
    that setup is not visible here -- confirm against the full file.
    """
    outcome = u.html_fromstring("<html></html>")
    eq_(None, outcome)
Пример #3
0
def test_html_fromstring_exception():
    """Expect ``u.html_fromstring`` to give ``None`` for this document.

    NOTE(review): the name suggests the underlying parser is made to raise;
    that setup is not visible here -- confirm against the full file.
    """
    outcome = u.html_fromstring("<html></html>")
    eq_(None, outcome)
Пример #4
0
def test_html_fromstring_too_big(fromstring):
    """Expect ``u.html_fromstring`` to give ``None`` without using the parser.

    NOTE(review): ``fromstring`` is presumably a mock injected by a patch
    decorator that is not visible here, as is whatever setup makes this
    small document count as "too big" -- confirm against the full file.
    """
    parsed = u.html_fromstring("<html></html>")
    eq_(None, parsed)
    # The injected parser stand-in must not have been invoked.
    assert_false(fromstring.called)
Пример #5
0
def _extract_from_html(msg_body):
    """
    Extract not quoted message from provided html message body
    using tags and plain text algorithm.

    Cut out the 'blockquote', 'gmail_quote' tags.
    Cut Microsoft quotations.

    Then use plain text algorithm to cut out splitter or
    leftover quotation.
    This works by adding checkpoint text to all html tags,
    then converting html to text,
    then extracting quotations from text,
    then checking deleted checkpoints,
    then deleting necessary tags.
    """
    if msg_body.strip() == b'':
        return msg_body

    msg_body = msg_body.replace(b'\r\n', b'\n')
    html_tree = html_fromstring(msg_body)

    if html_tree is None:
        return msg_body

    cut_quotations = (html_quotations.cut_gmail_quote(html_tree)
                      or html_quotations.cut_zimbra_quote(html_tree)
                      or html_quotations.cut_blockquote(html_tree)
                      or html_quotations.cut_microsoft_quote(html_tree)
                      or html_quotations.cut_by_id(html_tree)
                      or html_quotations.cut_from_block(html_tree))
    html_tree_copy = deepcopy(html_tree)

    number_of_checkpoints = html_quotations.add_checkpoint(html_tree, 0)
    quotation_checkpoints = [False] * number_of_checkpoints
    plain_text = html_tree_to_text(html_tree)
    plain_text = preprocess(plain_text, '\n', content_type='text/html')
    lines = plain_text.splitlines()

    # Don't process too long messages
    if len(lines) > MAX_LINES_COUNT:
        return msg_body

    # Collect checkpoints on each line
    line_checkpoints = [
        [
            int(i[4:-4])  # Only checkpoint number
            for i in re.findall(html_quotations.CHECKPOINT_PATTERN, line)
        ] for line in lines
    ]

    # Remove checkpoints
    lines = [
        re.sub(html_quotations.CHECKPOINT_PATTERN, '', line) for line in lines
    ]

    # Use plain text quotation extracting algorithm
    markers = remove_initial_spaces_and_mark_message_lines(lines)
    return_flags = []
    process_marked_lines(lines, markers, return_flags)
    lines_were_deleted, first_deleted, last_deleted = return_flags

    if not lines_were_deleted and not cut_quotations:
        return msg_body

    if lines_were_deleted:
        #collect checkpoints from deleted lines
        for i in range(first_deleted, last_deleted):
            for checkpoint in line_checkpoints[i]:
                quotation_checkpoints[checkpoint] = True

        # Remove tags with quotation checkpoints
        html_quotations.delete_quotation_tags(html_tree_copy, 0,
                                              quotation_checkpoints)

    if _readable_text_empty(html_tree_copy):
        return msg_body

    return _html_tostring(html_tree_copy)