Exemplo n.º 1
0
def _get_normal_with_extra_link(el,url):
    """Parse a normal listing element that carries an extra link.

    Returns a dict with heading, href, subheading and the extra link,
    or an empty dict when the element yields no usable content.
    """
    heading = _base_get_content(el)
    if not heading:
        return {}
    # the anchor usually sits in contents[1]; when that node has no
    # .get, fall back to the href on contents[0]
    anchor = el.contents[1]
    if hasattr(anchor, 'get'):
        href = anchor.get('href')
    else:
        href = unicode(el.contents[0].get('href'))
    return {
        'item_link_heading': unicode(heading),  # in progress
        'item_link_href': urljoin(url, href),
        'item_link_subheading': strip_tags(unicode(el.contents[3])).strip(),
        'extra_link': unicode(el.contents[2]).strip(),
    }
Exemplo n.º 2
0
def _get_content_item_content(elements):
    """Collect the article text spread across *elements*.

    The content is spread out through many sibling elements; element
    boundaries and explicit ``<br/>`` tags both become line breaks.
    Returns a single newline-joined string.
    """
    # tags whose content we keep; hoisted out of the loop (it is invariant)
    good_tags = ('p', 'b')
    to_return = []
    for el in elements:
        for content in el.contents:
            # we want the string, not the bs node from soup
            content = unicode(content)
            # all-caps strings are boilerplate/bullshit — skip them
            if content.upper() == content: continue
            if content.strip() == '': continue # don't need blank lines
            # reject strings that open with a tag outside good_tags;
            # content[1:2] (not content[1]) avoids IndexError on a lone '<'
            if content[0] == '<' and content[1:2] not in good_tags: continue
            if ''.join(content.strip().split()).lower() == '<br/>':
                # this is a line break
                to_return.append('')
            else:
                to_return.append(strip_tags(content).strip())
    return "\n".join(to_return)
Exemplo n.º 3
0
def _get_content_item_author(elements):
    """Return the author string from the first matched element.

    Uses ``unicode`` (not ``str``) when stringifying the soup node so a
    non-ASCII author name cannot raise UnicodeEncodeError — consistent
    with the other parsers in this module.
    """
    if not elements:
        return ''
    return strip_tags(unicode(elements[0].contents[1])).strip()
Exemplo n.º 4
0
    # classes:
    #  blackbig = heading
    #  blackit = subheading # only the first is the subheading
    #  blackbasic = content
    #  blackitalic = author
    # (tag, css class, item key) triples driving the extraction below
    classes = [('td','blackbig','item_heading'),
               ('td','blackit','item_subheading'),
               ('td','blackbasic','item_content'),
               ('div','blackitalic','item_author')]

    # we are going to pull the item's info
    # dispatch to a parser named _get_content_<key>, falling back to the
    # generic '_get_content_' helper when no key-specific one exists
    item = {'item_url':url}
    function_base = '_get_content_'
    for t,c,n in classes:
        elements = soup.findAll(t,{'class':c})
        fn = globals().get(function_base+n) or globals().get(function_base)
        content = fn(elements)
        item[n] = content

    # there might be articles with multiple pages
    # NOTE(review): 'Page %s' is a literal '%s' — this substring will
    # almost never occur in html, so multi-page articles are never
    # followed; the branch body is a stub anyway. Confirm intent.
    page = 1
    if 'Page %s' in html:
        # follow to the next page and grab the content
        pass

    # if the item doesn't have an author, than the tag is probably malformed
    # fall back to scraping a '>-...</' span straight out of the raw html;
    # [3:-2] trims the surrounding '>- ' prefix and '</' suffix residue
    if not item.get('item_author'):
        item['item_author'] = strip_tags(re.findall(r'(>-.*</)',html)[0])[3:-2].strip()

    return item
Exemplo n.º 5
0
def _get_feature(el,url):
    """Parse a feature listing element into an item-link dict.

    Uses ``unicode`` instead of ``str`` when stringifying soup nodes so
    non-ASCII content cannot raise UnicodeEncodeError — consistent with
    the other parsers in this module.
    """
    # NOTE(review): subheading and date both read contents[4] — confirm
    # the markup really carries both values in that one node.
    return { 'is_feature':1,
             'item_link_href': urljoin(url,el.contents[3].get('href')),
             'item_link_subheading': unicode(el.contents[4].strip()),
             'item_link_heading': strip_tags(unicode(el.contents[3]).strip()),
             'item_link_date': strip_tags(unicode(el.contents[4])).strip() }
Exemplo n.º 6
0
def _get_normal(el,url):
    """Parse a normal (non-feature) listing element.

    Returns a dict with heading, href and subheading, or an empty dict
    when the element yields no usable content.
    """
    content = _base_get_content(el)
    if not content:
        return {}
    # the anchor usually sits in contents[1]; when that node has no
    # .get, fall back to the href on contents[0]
    second = el.contents[1]
    if hasattr(second, 'get'):
        href = second.get('href')
    else:
        href = unicode(el.contents[0].get('href'))
    return {
        'item_link_heading': content,
        'item_link_href': urljoin(url, href),
        'item_link_subheading': strip_tags(unicode(el.contents[2])).strip(),
    }