示例#1
0
def _get_normal_with_extra_link(el,url):
    """Build the field dict for a link cell that also carries an extra link.

    ``el`` is the cell element (presumably a BeautifulSoup tag -- it must
    expose ``contents``) and ``url`` is the base URL the href is joined
    against.

    Returns an empty dict when ``_base_get_content(el)`` yields nothing,
    otherwise a dict with the keys ``item_link_heading``,
    ``item_link_href``, ``item_link_subheading`` and ``extra_link``.
    """
    heading = _base_get_content(el)
    if not heading:
        return {}

    fields = {}
    fields['item_link_heading'] = unicode(heading)  # in progress

    children = el.contents
    # The second child is used when it offers ``get`` (i.e. it can be asked
    # for attributes); otherwise the href is read from the first child.
    if hasattr(children[1], 'get'):
        raw_href = children[1].get('href')
    else:
        raw_href = unicode(children[0].get('href'))
    fields['item_link_href'] = urljoin(url, raw_href)

    # Child positions are fixed: index 3 is the subheading markup,
    # index 2 is the extra link.
    subheading_markup = unicode(children[3])
    fields['item_link_subheading'] = strip_tags(subheading_markup).strip()
    fields['extra_link'] = unicode(children[2]).strip()
    return fields
示例#2
0
def get_archive_list(url):
    """Download the archive page at ``url`` and return its link entries.

    The page is fetched with ``urlopen``, passed through ``massage_html``
    and parsed with ``BS``.  Every ``<td>`` whose class is ``greybold`` or
    ``blackbasic`` is then visited in document order:

    * ``greybold`` cells hold the month / year of a group (once per group).
      Their content is remembered and attached to every following entry
      under ``item_link_group_date``.
    * ``blackbasic`` cells hold a link.  The lower-cased cell content
      selects which ``_get_*`` helper extracts the entry's fields.

    Returns a list of dicts, one per link cell for which the helper
    produced at least one field; cells yielding no fields are skipped.
    """
    # grab our html; close the response even if reading fails so the
    # underlying connection is not leaked
    response = urlopen(url)
    try:
        html = ''.join(response)
    finally:
        response.close()
    soup = BS(massage_html(html))

    # classes:
    #  greybold = month / year (once per group)
    #  blackbasic = link
    classes = ['greybold','blackbasic']
    base_item = {'item_link_group_date':None}
    items = []
    for el in soup.findAll('td', {'class':lambda a: a in classes}):
        if el.get('class') == 'greybold':
            # it's the date -- applies to every link cell until the next one
            base_item['item_link_group_date'] = _base_get_content(el)
            continue

        # could be a pic line or could be a normal
        item = copy(base_item)
        all_content = ''.join([unicode(x).lower() for x in el.contents])
        # if picture
        if 'in pictures:' in all_content:
            item.update(_get_in_pictures(el,url))
        # if feature
        elif 'feature:' in all_content:
            item.update(_get_feature(el,url))
        # if extra link
        elif '[pictures]' in all_content:
            item.update(_get_normal_with_extra_link(el,url))
        # if normal
        else:
            item.update(_get_normal(el,url))

        # must have @ least some data beyond the inherited group date
        if len(item) != len(base_item):
            items.append(item)

    return items
示例#3
0
def _get_normal(el,url):
    """Build the field dict for a plain link cell.

    ``el`` is the cell element (presumably a BeautifulSoup tag -- it must
    expose ``contents``) and ``url`` is the base URL the href is joined
    against.

    Returns an empty dict when ``_base_get_content(el)`` yields nothing,
    otherwise a dict with the keys ``item_link_heading``,
    ``item_link_href`` and ``item_link_subheading``.
    """
    heading = _base_get_content(el)
    if not heading:
        return {}

    children = el.contents
    # The second child is used when it offers ``get`` (i.e. it can be asked
    # for attributes); otherwise the href is read from the first child.
    if hasattr(children[1], 'get'):
        raw_href = children[1].get('href')
    else:
        raw_href = unicode(children[0].get('href'))

    fields = {}
    fields['item_link_heading'] = heading
    fields['item_link_href'] = urljoin(url, raw_href)
    # Index 2 is the subheading markup; tags are stripped, then whitespace.
    subheading_markup = unicode(children[2])
    fields['item_link_subheading'] = strip_tags(subheading_markup).strip()
    return fields