Example #1
def clean_em_tags(doc):
    """Unwrap <em> tags that contain no image, keeping their text."""
    ems = parser.get_elements_by_tag(doc, tag='em')
    for node in ems:
        images = parser.get_elements_by_tag(node, tag='img')
        if len(images) == 0:
            node.drop_tag()
    return doc
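These snippets rely on a module-level parser object (goose-style, wrapping lxml). As an illustration only, here is the same <em>-unwrapping idea in plain lxml.html; the demo markup is made up:

import lxml.html

# Illustration: unwrap <em> tags that contain no <img>, as above.
doc = lxml.html.fromstring(
    '<div><em>plain emphasis</em> and <em><img src="pic.png"/></em></div>')
for em in doc.findall('.//em'):
    if not em.findall('.//img'):
        em.drop_tag()  # keep the text and children, drop the tag itself
print(lxml.html.tostring(doc))
# b'<div>plain emphasis and <em><img src="pic.png"></em></div>'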
Example #2
def is_table_tag_and_no_paragraphs_exist(e):
    """Return True if e is a non-cell element left with no substantial <p> children."""
    # prune short sub-paragraphs (under 25 characters) before re-checking
    sub_paragraphs = parser.get_elements_by_tag(e, tag='p')
    for p in sub_paragraphs:
        txt = parser.get_text(p)
        if len(txt) < 25:
            parser.remove(p)
    sub_paragraphs2 = parser.get_elements_by_tag(e, tag='p')
    # 'is not' tests identity, not equality; strings need !=
    if len(sub_paragraphs2) == 0 and e.tag != 'td':
        return True
    return False
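In plain terms: short sub-paragraphs are pruned first, and the element qualifies only if no paragraphs remain and it is not itself a table cell. The same check in plain lxml, as an illustration (demo markup is made up):

import lxml.html

e = lxml.html.fromstring('<table><tr><td><p>short</p></td></tr></table>')
for p in e.findall('.//p'):
    if len(p.text_content()) < 25:
        p.drop_tree()  # remove the element and its subtree
print(len(e.findall('.//p')) == 0 and e.tag != 'td')  # True: qualifies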
Example #3
def remove_script_and_style(doc):
    # remove scripts
    scripts = parser.get_elements_by_tag(doc, tag='script')
    for item in scripts:
        parser.remove(item)
    # remove styles
    styles = parser.get_elements_by_tag(doc, tag='style')
    for item in styles:
        parser.remove(item)
    # remove comments
    comments = parser.get_comments(doc)
    for item in comments:
        parser.remove(item)
    return doc
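For comparison, lxml ships its own Cleaner covering the same ground; a minimal sketch (note that in recent lxml releases this import moved to the separate lxml_html_clean package):

import lxml.html
from lxml.html.clean import Cleaner

cleaner = Cleaner(scripts=True, style=True, comments=True,
                  page_structure=False)
doc = lxml.html.fromstring(
    '<div><script>x()</script><!-- note --><p>kept</p></div>')
print(lxml.html.tostring(cleaner.clean_html(doc)))
# b'<div><p>kept</p></div>'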
Example #4
def sibling_content(current_sibling, sibling_paragraphs_baseline_score):
    """
    Returns any sibling paragraphs that score well enough to be added
    to this node.
    """
    if current_sibling.tag == 'p' \
            and len(parser.get_text(current_sibling)) > 0:
        e0 = current_sibling
        if e0.tail:
            # copy the node so the tail text can be stripped without
            # mutating the original tree
            e0 = copy.deepcopy(e0)
            e0.tail = ''
        return [e0]
    else:
        potential_paragraphs = parser.get_elements_by_tag(current_sibling, tag='p')
        if potential_paragraphs is None:
            return None
        else:
            ps = []
            for first_paragraph in potential_paragraphs:
                text = parser.get_text(first_paragraph)
                if len(text) > 0:
                    word_stats = StopWords().get_stop_word_count(text)
                    paragraph_score = word_stats.get_stop_word_count()
                    sibling_baseline_score = 0.30
                    high_link_density = is_high_link_density(first_paragraph)
                    # keep the paragraph only if it scores above 30% of
                    # the baseline and is not mostly link text
                    score = sibling_paragraphs_baseline_score * sibling_baseline_score
                    if score < paragraph_score and not high_link_density:
                        p = parser.create_element(tag='p', text=text, tail=None)
                        ps.append(p)
            return ps
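To make the 0.30 coefficient concrete, the filter's arithmetic on its own (plain numbers; the baseline value is assumed for the demo):

sibling_paragraphs_baseline_score = 100  # e.g. the average paragraph score
threshold = sibling_paragraphs_baseline_score * 0.30
paragraph_score = 42                     # stop words counted in a sibling <p>
print(paragraph_score > threshold)       # True -> this sibling is kept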
Example #5
def title(doc):
    """Return the document's title."""
    title = ''
    title_elem = parser.get_elements_by_tag(doc, tag='title')
    # no title found
    if title_elem is None or len(title_elem) == 0:
        return title
    # title elem found
    title_text = parser.get_text(title_elem[0])
    used_delimiter = False
    # split title with |
    if '|' in title_text:
        title_text = _split_title(title_text, "\\|")
        used_delimiter = True
    # split title with -
    if not used_delimiter and '-' in title_text:
        title_text = _split_title(title_text, " - ")
        used_delimiter = True
    # split title with »
    if not used_delimiter and u'»' in title_text:
        title_text = _split_title(title_text, "»")
        used_delimiter = True
    # split title with :
    if not used_delimiter and ':' in title_text:
        title_text = _split_title(title_text, ":")
        used_delimiter = True
    # strip the entity for U+FFFD, the Unicode replacement character
    title = title_text.replace("&#65533;", "")
    return title
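The _split_title helper is not shown here; the regex-escaped "\\|" suggests it splits on a regular expression. A hypothetical sketch of such a helper (the keep-the-longest-piece rule is an assumption, not confirmed behavior):

import re

def _split_title(title_text, delimiter):
    # hypothetical: split on the regex delimiter and keep the longest
    # piece, on the theory that site names are shorter than headlines
    pieces = re.split(delimiter, title_text)
    return max(pieces, key=len).strip()

print(_split_title("Breaking News Story | Example Site", "\\|"))
# Breaking News Story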
Example #6
def meta_favicon(doc):
    """Return the document's meta favicon."""
    kwargs = {'tag': 'link', 'attr': 'rel', 'value': 'icon'}
    meta = parser.get_elements_by_tag(doc, **kwargs)
    if meta:
        favicon = meta[0].attrib.get('href')
        return favicon
    return ''
Example #7
def nodes_to_check(doc):
    """
    Returns a list of nodes we want to search, such as paragraphs and
    table cells.
    """
    nodes_to_check = []
    for tag in ['p', 'pre', 'td']:
        items = parser.get_elements_by_tag(doc, tag=tag)
        nodes_to_check += items
    return nodes_to_check
Example #8
def canonical_link(doc):
    """Return document's canonical link."""
    kwargs = {'tag': 'link', 'attr': 'rel', 'value': 'canonical'}
    meta = parser.get_elements_by_tag(doc, **kwargs)
    if meta is not None and len(meta) > 0:
        href = meta[0].attrib.get('href')
        if href:
            href = href.strip()
            return href
    return ''
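The same lookup in plain lxml, showing what get_elements_by_tag with attr/value resolves to (the favicon lookup in Example #6 works the same way with rel="icon"; demo markup is made up):

import lxml.html

doc = lxml.html.fromstring(
    '<html><head><link rel="canonical" '
    'href="https://example.com/article"/></head><body/></html>')
hrefs = doc.xpath('//link[@rel="canonical"]/@href')
print(hrefs[0].strip() if hrefs else '')
# https://example.com/article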
Example #9
def remove_paragraphs_with_few_words(top_node):
    """
    Remove paragraphs that have fewer than a minimum number of words,
    which would indicate that they are some sort of link.
    """
    all_nodes = parser.get_elements_by_tags(top_node, ['*'])
    all_nodes.reverse()
    for el in all_nodes:
        text = parser.get_text(el)
        stop_words = StopWords().get_stop_word_count(text)
        if stop_words.get_stop_word_count() < 3 \
            and len(parser.get_elements_by_tag(el, tag='object')) == 0 \
            and len(parser.get_elements_by_tag(el, tag='embed')) == 0:
            parser.remove(el)
        # TODO: Check if it is in the right place.
        else:
            trimmed = parser.get_text(el)
            if trimmed.startswith("(") and trimmed.endswith(")"):
                parser.remove(el)
Example #10
def meta_lang(doc):
    """Extract content language from meta."""
    # check the lang attribute on the <html> element first
    attr = parser.get_attribute(doc, attr='lang')
    if attr is None:
        # fall back to a Content-Language meta tag
        kwargs = {'tag': 'meta', 'attr': 'http-equiv',
                  'value': 'content-language'}
        meta = parser.get_elements_by_tag(doc, **kwargs)
        if meta:
            attr = parser.get_attribute(meta[0], attr='content')
    if attr:
        value = attr[:2]
        if re.search('^[A-Za-z]{2}$', value):
            return value.lower()
    return None
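An illustration in plain lxml of the two lookups the function performs, the html lang attribute first and the http-equiv meta tag as a fallback (demo markup is made up):

import lxml.html

doc = lxml.html.fromstring(
    '<html lang="en-US"><head><meta http-equiv="content-language" '
    'content="en-US"/></head><body/></html>')
print(doc.get('lang'))  # en-US -> meta_lang would return 'en'
print(doc.xpath('//meta[@http-equiv="content-language"]/@content'))
# ['en-US']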
Example #11
def convert_div_to_p(doc, dom_type):
    """Convert wrapper elements that hold only bare text into <p> tags."""
    bad_divs = 0
    else_divs = 0
    divs = parser.get_elements_by_tag(doc, tag=dom_type)
    # tags whose presence means the div is a real container, not bare text
    tags = ['a', 'blockquote', 'dl', 'div', 'img', 'ol', 'p', 'pre',
            'table', 'ul']
    for div in divs:
        items = parser.get_elements_by_tags(div, tags)
        if div is not None and len(items) == 0:
            # nothing nested: treat the whole div as a paragraph
            replace_elements_with_p(doc, div)
            bad_divs += 1
        elif div is not None:
            # mixed content: rebuild the div from replacement nodes
            replace_nodes = get_replacement_nodes(doc, div)
            div.clear()
            for c, n in enumerate(replace_nodes):
                div.insert(c, n)
            else_divs += 1
    return doc
Example #12
def is_high_link_density(e):
    """
    Checks the density of links within a node, is there not much text and
    most of it contains linky shit? if so it's no good.
    """
    links = parser.get_elements_by_tag(e, tag='a')
    if links is None or len(links) == 0:
        return False
    text = parser.get_text(e)
    words = text.split(' ')
    number_of_words = float(len(words))
    sb = []
    for link in links:
        sb.append(parser.get_text(link))
    link_text = ''.join(sb)  # note: anchor texts are joined without spaces
    link_words = link_text.split(' ')
    number_of_link_words = float(len(link_words))
    number_of_links = float(len(links))
    link_divisor = float(number_of_link_words / number_of_words)
    score = float(link_divisor * number_of_links)
    if score >= 1.0:
        return True
    return False
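A worked example of the score (pure arithmetic, no parser needed):

# 20 words of text containing 2 links whose anchor text totals 12 words:
number_of_words = 20.0
number_of_link_words = 12.0
number_of_links = 2.0
score = (number_of_link_words / number_of_words) * number_of_links
print(score >= 1.0)  # True (score = 1.2) -> the node is link-heavy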
Example #13
def siblings_baseline_score(top_node):
    """
    Long articles can have tons of paragraphs, so calculating the base
    score against the total text score of all of them would be unfair.
    Instead we normalize against the average score of the paragraphs
    within the top node. For example, if the total score of 10
    paragraphs is 1000 and each averages 100, then 100 is our base.
    """
    base = 100000  # high default used when no qualifying paragraphs exist
    number_of_paragraphs = 0
    score_of_paragraphs = 0
    nodes_to_check = parser.get_elements_by_tag(top_node, tag='p')
    for node in nodes_to_check:
        node_text = parser.get_text(node)
        word_stats = StopWords().get_stop_word_count(node_text)
        high_link_density = is_high_link_density(node)
        if word_stats.get_stop_word_count() > 2 and not high_link_density:
            number_of_paragraphs += 1
            score_of_paragraphs += word_stats.get_stop_word_count()
    if number_of_paragraphs > 0:
        base = score_of_paragraphs / number_of_paragraphs
    return base
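And the docstring's averaging in numbers, showing how the base feeds sibling_content in Example #4:

score_of_paragraphs = 1000   # combined stop-word score of 10 paragraphs
number_of_paragraphs = 10
base = score_of_paragraphs / number_of_paragraphs
print(base)         # 100 -> passed to sibling_content as its baseline
print(base * 0.30)  # 30.0 -> the per-sibling keep threshold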