Пример #1
0
    def gen_content(self):
        '''Fetch the weekly issue page and populate this spider from it.

        Downloads ``self.url`` via ``infra.get_html_content``, parses it with
        BeautifulSoup and stores the results on the instance; nothing is
        returned:

        * ``self.title``    -- ``unicode(soup.title.next_element)``.
        * ``self.pub_date`` -- ``date`` built from groups 1, 2 and 3 of the
          ``self._re_weekly_date`` match against the first ``<h2>`` string.
        * ``self.items``    -- one ``item.WeeklyItem`` appended per ``<h4>``
          (the last two ``<h4>`` are ignored), skipping entries for which
          ``infra.is_item_job(link)`` is true and headings without a link.

        Raises AttributeError if the page has no ``<title>``/``<h2>`` or the
        date pattern does not match (unchanged from the original behaviour).
        '''
        html = infra.get_html_content(self.url)
        soup = BeautifulSoup(html)

        self.title = unicode(soup.title.next_element)
        infra.print_u(self.title)

        # Publish date of the weekly issue, parsed from the first <h2>.
        h2 = soup.h2.string
        matched_date = self._re_weekly_date.search(h2)
        self.pub_date = date(int(matched_date.group(1)),
            int(matched_date.group(2)),
            int(matched_date.group(3)))

        # Every item is headed by an <h4>. The last two <h4> elements are
        # dropped (presumably trailing non-item headings -- confirm against
        # the page layout). Slicing, unlike two `del raw_items[-1]`, does not
        # raise IndexError when fewer than two <h4> elements are present.
        for raw_item in soup.find_all('h4')[:-2]:
            # The heading text is the second node after the <h4> tag itself.
            head = raw_item.next_element.next_element
            infra.print_u(head)

            # A heading without an <a href=...> cannot become an item; skip
            # it instead of aborting the scrape with self.items half-filled.
            anchor = raw_item.a
            link = anchor.get('href') if anchor is not None else None
            if link is None:
                continue

            if infra.is_item_job(link):
                continue

            matched_origin = self._re_origin.search(head)
            origin = matched_origin.group(1) if matched_origin else None

            # NOTE(review): unicode(None) is u'None', so an item with no
            # matched origin carries the literal text 'None'. Kept as-is
            # because consumers of WeeklyItem may rely on it -- confirm.
            new_item = item.WeeklyItem(unicode(head),
                unicode(origin),
                unicode(link))

            infra.print_u(new_item.head)

            self.items.append(new_item)
Пример #2
0
                continue

            #infra.print_u(link)
            #link = infra.get_redir_url(link)
            #infra.print_u(link)

            origin = None
            matched_origin = self._re_origin.search(head)
            if matched_origin:
                origin = matched_origin.group(1)
                #infra.print_u(origin)

            new_item = item.WeeklyItem(unicode(head),
                unicode(origin),
                unicode(link))

            infra.print_u(new_item.head)

            self.items.append(new_item)

if __name__ == '__main__':
    # Manual run: scrape issue 58 and pass every collected item's link to
    # infra.print_u.
    weekly = WeeklySpider(58, 'http://weekly.manong.io/issues/58')
    weekly.gen_content()
    for entry in weekly.items:
        infra.print_u(entry.link)