Exemplo n.º 1
0
def get_article(doc, options):
    try:
        ruthless = True
        while True:
            for i in tags(doc, 'script', 'style'):
                i.drop_tree()
            for i in tags(doc, 'body'):
                i.set('id', 'readabilityBody')
            if ruthless:
                remove_unlikely_candidates(doc)
            transform_double_breaks_into_paragraphs(doc)
            transform_misused_divs_into_paragraphs(doc)
            candidates = score_paragraphs(doc, options)

            best_candidate = select_best_candidate(candidates)
            if best_candidate:
                confidence = best_candidate['content_score']
                article = get_raw_article(candidates, best_candidate)
            else:
                if ruthless:
                    logging.debug("ruthless removal did not work. ")
                    ruthless = False
                    logging.debug(
                        "ended up stripping too much - going for a safer parse"
                    )
                    # try again
                    continue
                else:
                    logging.debug(
                        "Ruthless and lenient parsing did not work. Returning raw html"
                    )
                    return Summary(0, None)

            unicode_cleaned_article = sanitize(article, candidates, options)
            cleaned_doc = fragment_fromstring(unicode_cleaned_article)
            cleaned_article = tostring(cleaned_doc)

            of_acceptable_length = len(cleaned_article
                                       or '') >= options['retry_length']
            if ruthless and not of_acceptable_length:
                ruthless = False
                continue  # try again
            else:
                return Summary(confidence, cleaned_article)
    except StandardError as e:
        #logging.exception('error getting summary: ' + str(traceback.format_exception(*sys.exc_info())))
        logging.exception('error getting summary: ')
        raise Unparseable(str(e)), None, sys.exc_info()[2]
Exemplo n.º 2
0
def transform_misused_divs_into_paragraphs(doc):
    '''
    Retag as <p> every <div> in doc whose serialized children do not match
    REGEXES['divToPElementsRe'].
    '''
    for div in tags(doc, 'div'):
        # Serialize only the children, not the <div> itself, before matching.
        children_markup = unicode(''.join([tostring(child) for child in div]))
        if REGEXES['divToPElementsRe'].search(children_markup):
            continue
        logging.debug("Altering %s to p" % (describe(div)))
        div.tag = "p"
Exemplo n.º 3
0
def find_next_page_url(parsed_urls, url, elem):
    '''
    Pick the best "next page" link among the <a> elements below elem.

    Every link is handed to eval_possible_next_page_link(), which fills a
    mapping from URLs to candidate objects.  The candidate with the highest
    score of at least 50 wins; its href is added to parsed_urls and
    returned.  Returns None when no candidate reaches that score.
    '''
    anchors = tags(elem, 'a')
    base_url = find_base_url(url)
    # Maps URLs to the candidate objects used to judge whether a URL points
    # to the next page of the article.
    candidates = {}
    for anchor in anchors:
        logging.debug('link: %s' % tostring(anchor))
        eval_possible_next_page_link(parsed_urls, url, base_url, candidates,
                                     anchor)

    best = None
    for candidate_url, candidate in candidates.items():
        score = candidate.score
        logging.debug('next page score of %s: %s' %
                      (candidate_url, candidate.score))
        if 50 <= score:
            if not best or best.score < score:
                best = candidate

    if not best:
        return None
    logging.debug('next page link found: %s' % best.href)
    parsed_urls.add(best.href)
    return best.href
Exemplo n.º 4
0
def get_article(doc, options):
    try:
        ruthless = True
        while True:
            for i in tags(doc, 'script', 'style'):
                i.drop_tree()
            for i in tags(doc, 'body'):
                i.set('id', 'readabilityBody')
            if ruthless: 
                remove_unlikely_candidates(doc)
            transform_double_breaks_into_paragraphs(doc)
            transform_misused_divs_into_paragraphs(doc)
            candidates = score_paragraphs(doc, options)
            
            best_candidate = select_best_candidate(candidates)
            if best_candidate:
                confidence = best_candidate['content_score']
                article = get_raw_article(candidates, best_candidate)
            else:
                if ruthless:
                    logging.debug("ruthless removal did not work. ")
                    ruthless = False
                    logging.debug("ended up stripping too much - going for a safer parse")
                    # try again
                    continue
                else:
                    logging.debug("Ruthless and lenient parsing did not work. Returning raw html")
                    return Summary(0, None)

            unicode_cleaned_article = sanitize(article, candidates, options)
            cleaned_doc = fragment_fromstring(unicode_cleaned_article)
            cleaned_article = tounicode(cleaned_doc)

            of_acceptable_length = len(cleaned_article or '') >= options['retry_length']
            if ruthless and not of_acceptable_length:
                ruthless = False
                continue # try again
            else:
                return Summary(confidence, cleaned_article)
    except StandardError as e:
        #logging.exception('error getting summary: ' + str(traceback.format_exception(*sys.exc_info())))
        logging.exception('error getting summary: ' )
        raise Unparseable(str(e)), None, sys.exc_info()[2]
Exemplo n.º 5
0
def transform_double_breaks_into_paragraphs(doc):
    '''
    Modify doc in place so that double line breaks (<br><br>) inside a
    <div> delineate paragraphs.

    Some pages separate paragraphs with double breaks instead of <p> tags:

        <div>
            First block of text.

            <br><br>

            Second block of text.

            <br><br>

            Third block of text.
        </div>

    This routine would transform that into:

        <div>
            <p>
            First block of text.
            </p>

            <p>
            Second block of text.
            </p>

            <p>
            Third block of text.
            </p>
        </div>

    The per-element work is delegated to
    transform_double_breaks_into_paragraphs_elem(), which is called once
    for every <div> yielded by tags(doc, 'div').
    '''
    for container in tags(doc, 'div'):
        transform_double_breaks_into_paragraphs_elem(container)
Exemplo n.º 6
0
def score_paragraphs(doc, options):
    '''
    Score the parents and grandparents of text-bearing elements in doc.

    Every <p>, <pre> and <td> whose cleaned text is at least
    options['min_text_len'] characters long contributes points to its
    parent (full amount) and, when present, its grandparent (half amount).
    Each candidate's accumulated score is finally multiplied by
    (1 - link density).

    Returns a dict mapping candidate elements to the objects produced by
    score_node(); their 'content_score' entry holds the final score.
    '''
    candidates = {}

    # Candidates in the order they were first seen, so the link-density
    # pass below walks them in a stable order.
    ordered = []
    for elem in tags(doc, "p", "pre", "td"):
        logging.debug('Scoring %s' % describe(elem))
        parent_node = elem.getparent()
        if parent_node is None:
            continue
        grand_parent_node = parent_node.getparent()

        inner_text = clean(elem.text_content() or "")
        inner_text_len = len(inner_text)

        # Paragraphs shorter than the configured minimum are not counted.
        if inner_text_len < options['min_text_len']:
            continue

        if parent_node not in candidates:
            candidates[parent_node] = score_node(parent_node)
            ordered.append(parent_node)

        if grand_parent_node is not None and grand_parent_node not in candidates:
            candidates[grand_parent_node] = score_node(grand_parent_node)
            ordered.append(grand_parent_node)

        # One base point, one point per comma-separated segment, and one
        # point per full 100 characters of text (at most 3).  Floor division
        # keeps the length bonus a whole number even when true division is
        # in effect.
        content_score = 1
        content_score += len(inner_text.split(','))
        content_score += min(inner_text_len // 100, 3)

        candidates[parent_node]['content_score'] += content_score
        if grand_parent_node is not None:
            candidates[grand_parent_node][
                'content_score'] += content_score / 2.0

    # Scale the final candidates score based on link density. Good content should have a
    # relatively small link density (5% or less) and be mostly unaffected by this operation.
    for elem in ordered:
        candidate = candidates[elem]
        ld = get_link_density(elem)
        score = candidate['content_score']
        logging.debug("Candid: %6.3f %s link density %.3f -> %6.3f" %
                      (score, describe(elem), ld, score * (1 - ld)))
        candidate['content_score'] *= (1 - ld)

    return candidates
Exemplo n.º 7
0
def transform_double_breaks_into_paragraphs(doc):
    '''
    Modify doc in place so that double line breaks (<br><br>) inside a
    <div> delineate paragraphs.

    Some pages separate paragraphs with double breaks instead of <p> tags:

        <div>
            First block of text.

            <br><br>

            Second block of text.

            <br><br>

            Third block of text.
        </div>

    This routine would transform that into:

        <div>
            <p>
            First block of text.
            </p>

            <p>
            Second block of text.
            </p>

            <p>
            Third block of text.
            </p>
        </div>

    The per-element work is delegated to
    transform_double_breaks_into_paragraphs_elem(), which is called once
    for every <div> yielded by tags(doc, 'div').
    '''
    for container in tags(doc, 'div'):
        transform_double_breaks_into_paragraphs_elem(container)
Exemplo n.º 8
0
def score_paragraphs(doc, options):
    '''
    Score the parents and grandparents of text-bearing elements in doc.

    Every <p>, <pre> and <td> whose cleaned text is at least
    options['min_text_len'] characters long contributes points to its
    parent (full amount) and, when present, its grandparent (half amount).
    Each candidate's accumulated score is finally multiplied by
    (1 - link density).

    Returns a dict mapping candidate elements to the objects produced by
    score_node(); their 'content_score' entry holds the final score.
    '''
    candidates = {}

    # Candidates in the order they were first seen, so the link-density
    # pass below walks them in a stable order.
    ordered = []
    for elem in tags(doc, "p", "pre", "td"):
        logging.debug('Scoring %s' % describe(elem))
        parent_node = elem.getparent()
        if parent_node is None:
            continue
        grand_parent_node = parent_node.getparent()

        inner_text = clean(elem.text_content() or "")
        inner_text_len = len(inner_text)

        # Paragraphs shorter than the configured minimum are not counted.
        if inner_text_len < options['min_text_len']:
            continue

        if parent_node not in candidates:
            candidates[parent_node] = score_node(parent_node)
            ordered.append(parent_node)

        if grand_parent_node is not None and grand_parent_node not in candidates:
            candidates[grand_parent_node] = score_node(grand_parent_node)
            ordered.append(grand_parent_node)

        # One base point, one point per comma-separated segment, and one
        # point per full 100 characters of text (at most 3).  Floor division
        # keeps the length bonus a whole number even when true division is
        # in effect.
        content_score = 1
        content_score += len(inner_text.split(','))
        content_score += min(inner_text_len // 100, 3)

        candidates[parent_node]['content_score'] += content_score
        if grand_parent_node is not None:
            candidates[grand_parent_node]['content_score'] += content_score / 2.0

    # Scale the final candidates score based on link density. Good content should have a
    # relatively small link density (5% or less) and be mostly unaffected by this operation.
    for elem in ordered:
        candidate = candidates[elem]
        ld = get_link_density(elem)
        score = candidate['content_score']
        logging.debug("Candid: %6.3f %s link density %.3f -> %6.3f" % (score, describe(elem), ld, score*(1-ld)))
        candidate['content_score'] *= (1 - ld)

    return candidates
Exemplo n.º 9
0
def find_next_page_url(parsed_urls, url, elem):
    '''
    Pick the best "next page" link among the <a> elements below elem.

    Every link is handed to eval_possible_next_page_link(), which fills a
    mapping from URLs to candidate objects.  The candidate with the highest
    score of at least 50 wins; its href is added to parsed_urls and
    returned.  Returns None when no candidate reaches that score.
    '''
    anchors = tags(elem, 'a')
    base_url = find_base_url(url)
    # Maps URLs to the candidate objects used to judge whether a URL points
    # to the next page of the article.
    candidates = {}
    for anchor in anchors:
        logging.debug('link: %s' % tostring(anchor))
        eval_possible_next_page_link(
            parsed_urls,
            url,
            base_url,
            candidates,
            anchor,
        )

    best = None
    for candidate_url, candidate in candidates.items():
        score = candidate.score
        logging.debug('next page score of %s: %s' %
                      (candidate_url, candidate.score))
        if 50 <= score:
            if not best or best.score < score:
                best = candidate

    if not best:
        return None
    logging.debug('next page link found: %s' % best.href)
    parsed_urls.add(best.href)
    return best.href
Exemplo n.º 10
0
def sanitize(node, candidates, options):
    '''
    Strip likely non-content elements from node and return cleaned markup.

    Steps, in order:

      1. Drop headings (<h1>..<h6>) whose class weight is negative or whose
         link density exceeds 0.33.
      2. Drop every <form>, <iframe> and <textarea>.
      3. Walk the <table>, <ul> and <div> elements yielded by
         reverse_tags() and drop those that the heuristics below judge to
         be boilerplate.

    candidates maps elements to score objects with a 'content_score' entry.
    options['min_text_length'] is the text length below which an element
    with zero or more than two images is removed.

    Returns clean_attributes(tounicode(node)).
    '''
    for header in tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
        if class_weight(header) < 0 or get_link_density(header) > 0.33:
            header.drop_tree()

    for elem in tags(node, "form", "iframe", "textarea"):
        elem.drop_tree()

    # Elements recorded here are skipped by the cleaning loop below.
    allowed = {}
    # Conditionally clean <table>s, <ul>s, and <div>s
    for el in reverse_tags(node, "table", "ul", "div"):
        if el in allowed:
            continue
        weight = class_weight(el)
        if el in candidates:
            content_score = candidates[el]['content_score']
        else:
            content_score = 0
        tag = el.tag

        if weight + content_score < 0:
            logging.debug("Cleaned %s with score %6.3f and weight %-3s" %
                (describe(el), content_score, weight, ))
            el.drop_tree()
        elif el.text_content().count(",") < 10:
            # Few commas: apply the structural heuristics.
            counts = {}
            for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
                counts[kind] = len(el.findall('.//%s' % kind))
            # Bias the <li> count so that it only exceeds the <p> count when
            # there are over 100 more <li>s than <p>s.
            counts["li"] -= 100

            # Count the text length excluding any surrounding whitespace
            content_length = text_length(el)
            link_density = get_link_density(el)
            parent_node = el.getparent()
            # From here on content_score is only used in the final debug
            # message, where it reports the parent's score when a parent
            # exists.
            if parent_node is not None:
                if parent_node in candidates:
                    content_score = candidates[parent_node]['content_score']
                else:
                    content_score = 0
            to_remove = False
            reason = ""

            # NOTE(review): this branch reads options['min_text_length']
            # while score_paragraphs reads options['min_text_len'] - confirm
            # both keys are really provided by the caller.
            if counts["p"] and counts["img"] > counts["p"]:
                reason = "too many images (%s)" % counts["img"]
                to_remove = True
            elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
                reason = "more <li>s than <p>s"
                to_remove = True
            elif counts["input"] > (counts["p"] // 3):
                # Floor division: compare against whole thirds of the <p>
                # count even when true division is in effect.
                reason = "less than 3x <p>s than <input>s"
                to_remove = True
            elif content_length < options['min_text_length'] and (counts["img"] == 0 or counts["img"] > 2):
                reason = "too short content length %s without a single image" % content_length
                to_remove = True
            elif weight < 25 and link_density > 0.2:
                reason = "too many links %.3f for its weight %s" % (link_density, weight)
                to_remove = True
            elif weight >= 25 and link_density > 0.5:
                reason = "too many links %.3f for its weight %s" % (link_density, weight)
                to_remove = True
            elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1:
                reason = "<embed>s with too short content length, or too many <embed>s"
                to_remove = True

                # NOTE(review): the sibling rescue below is nested in the
                # <embed> branch only, exactly as in the original code;
                # confirm whether it was meant to apply to every removal
                # reason.
                # Find x non-empty following and preceding siblings.
                i, j = 0, 0
                x = 1
                siblings = []
                for sib in el.itersiblings():
                    sib_content_length = text_length(sib)
                    if sib_content_length:
                        # Was "i =+ 1", which assigns +1 instead of
                        # incrementing and only worked because x == 1.
                        i += 1
                        siblings.append(sib_content_length)
                        if i == x:
                            break
                for sib in el.itersiblings(preceding=True):
                    sib_content_length = text_length(sib)
                    if sib_content_length:
                        j += 1
                        siblings.append(sib_content_length)
                        if j == x:
                            break
                # Keep the element when its nearest non-empty neighbours
                # carry a lot of text.
                if siblings and sum(siblings) > 1000:
                    to_remove = False
                    logging.debug("Allowing %s" % describe(el))
                    for desnode in tags(el, "table", "ul", "div"):
                        allowed[desnode] = True

            if to_remove:
                logging.debug("Cleaned %6.3f %s with weight %s cause it has %s." %
                    (content_score, describe(el), weight, reason))
                el.drop_tree()

    return clean_attributes(tounicode(node))
Exemplo n.º 11
0
def transform_misused_divs_into_paragraphs(doc):
    '''
    Retag as <p> every <div> in doc whose serialized children do not match
    REGEXES['divToPElementsRe'].
    '''
    for div in tags(doc, 'div'):
        # Serialize only the children, not the <div> itself, before matching.
        children_markup = unicode(''.join([tostring(child) for child in div]))
        if REGEXES['divToPElementsRe'].search(children_markup):
            continue
        logging.debug("Altering %s to p" % (describe(div)))
        div.tag = "p"
Exemplo n.º 12
0
def sanitize(node, candidates, options):
    '''
    Strip likely non-content elements from node and return cleaned markup.

    Steps, in order:

      1. Drop headings (<h1>..<h6>) whose class weight is negative or whose
         link density exceeds 0.33.
      2. Drop every <form>, <iframe> and <textarea>.
      3. Walk the <table>, <ul> and <div> elements yielded by
         reverse_tags() and drop those that the heuristics below judge to
         be boilerplate.

    candidates maps elements to score objects with a 'content_score' entry.
    options['min_text_length'] is the text length below which an element
    with zero or more than two images is removed.

    Returns clean_attributes(tounicode(node)).
    '''
    for header in tags(node, "h1", "h2", "h3", "h4", "h5", "h6"):
        if class_weight(header) < 0 or get_link_density(header) > 0.33:
            header.drop_tree()

    for elem in tags(node, "form", "iframe", "textarea"):
        elem.drop_tree()

    # Elements recorded here are skipped by the cleaning loop below.
    allowed = {}
    # Conditionally clean <table>s, <ul>s, and <div>s
    for el in reverse_tags(node, "table", "ul", "div"):
        if el in allowed:
            continue
        weight = class_weight(el)
        if el in candidates:
            content_score = candidates[el]['content_score']
        else:
            content_score = 0
        tag = el.tag

        if weight + content_score < 0:
            logging.debug("Cleaned %s with score %6.3f and weight %-3s" % (
                describe(el),
                content_score,
                weight,
            ))
            el.drop_tree()
        elif el.text_content().count(",") < 10:
            # Few commas: apply the structural heuristics.
            counts = {}
            for kind in ['p', 'img', 'li', 'a', 'embed', 'input']:
                counts[kind] = len(el.findall('.//%s' % kind))
            # Bias the <li> count so that it only exceeds the <p> count when
            # there are over 100 more <li>s than <p>s.
            counts["li"] -= 100

            # Count the text length excluding any surrounding whitespace
            content_length = text_length(el)
            link_density = get_link_density(el)
            parent_node = el.getparent()
            # From here on content_score is only used in the final debug
            # message, where it reports the parent's score when a parent
            # exists.
            if parent_node is not None:
                if parent_node in candidates:
                    content_score = candidates[parent_node]['content_score']
                else:
                    content_score = 0
            to_remove = False
            reason = ""

            # NOTE(review): this branch reads options['min_text_length']
            # while score_paragraphs reads options['min_text_len'] - confirm
            # both keys are really provided by the caller.
            if counts["p"] and counts["img"] > counts["p"]:
                reason = "too many images (%s)" % counts["img"]
                to_remove = True
            elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
                reason = "more <li>s than <p>s"
                to_remove = True
            elif counts["input"] > (counts["p"] // 3):
                # Floor division: compare against whole thirds of the <p>
                # count even when true division is in effect.
                reason = "less than 3x <p>s than <input>s"
                to_remove = True
            elif content_length < options['min_text_length'] and (
                    counts["img"] == 0 or counts["img"] > 2):
                reason = "too short content length %s without a single image" % content_length
                to_remove = True
            elif weight < 25 and link_density > 0.2:
                reason = "too many links %.3f for its weight %s" % (
                    link_density, weight)
                to_remove = True
            elif weight >= 25 and link_density > 0.5:
                reason = "too many links %.3f for its weight %s" % (
                    link_density, weight)
                to_remove = True
            elif (counts["embed"] == 1
                  and content_length < 75) or counts["embed"] > 1:
                reason = "<embed>s with too short content length, or too many <embed>s"
                to_remove = True

                # NOTE(review): the sibling rescue below is nested in the
                # <embed> branch only, exactly as in the original code;
                # confirm whether it was meant to apply to every removal
                # reason.
                # Find x non-empty following and preceding siblings.
                i, j = 0, 0
                x = 1
                siblings = []
                for sib in el.itersiblings():
                    sib_content_length = text_length(sib)
                    if sib_content_length:
                        # Was "i = +1", which assigns +1 instead of
                        # incrementing and only worked because x == 1.
                        i += 1
                        siblings.append(sib_content_length)
                        if i == x:
                            break
                for sib in el.itersiblings(preceding=True):
                    sib_content_length = text_length(sib)
                    if sib_content_length:
                        j += 1
                        siblings.append(sib_content_length)
                        if j == x:
                            break
                # Keep the element when its nearest non-empty neighbours
                # carry a lot of text.
                if siblings and sum(siblings) > 1000:
                    to_remove = False
                    logging.debug("Allowing %s" % describe(el))
                    for desnode in tags(el, "table", "ul", "div"):
                        allowed[desnode] = True

            if to_remove:
                logging.debug(
                    "Cleaned %6.3f %s with weight %s cause it has %s." %
                    (content_score, describe(el), weight, reason))
                el.drop_tree()

    return clean_attributes(tounicode(node))