Example #1
    # Requires: from bs4 import BeautifulSoup, plus the project's utils
    # module (get_site / make_book_id / make_novel_id).
    def parse_novel_page(self, url, html):
        """Parse a novel detail page into a dict of metadata, or None."""
        soup = BeautifulSoup(html, 'html.parser')
        box_intro = soup.find('div', {'class': 'box_intro'})
        if box_intro is None:
            print("box_intro not found: %s" % url)
            return None

        novel = {'url': url}
        site = utils.get_site(url)
        novel['site'] = site

        # Cover image sits inside the 'pic' div.
        pic = box_intro.find('div', {'class': 'pic'})
        if pic is not None and pic.img is not None:
            novel['image'] = pic.img.get('src')

        info = box_intro.find('div', {'class': 'box_info'})
        if info is None:
            return None

        # Review count, when present, lives in <em id="ren">.
        rank = {}
        ren = info.find('em', {'id': 'ren'})
        if ren is not None:
            rank['review_count'] = int(ren.text)
        novel['rank'] = rank

        # The <h1> holds "<title>作者:<author>"; split on the marker.
        h1 = info.find('h1')
        if h1 is None:
            return None

        name_author = h1.text
        if "作者:" not in name_author:
            return None
        name, author = name_author.split("作者:", 1)
        novel['name'] = name.strip()
        novel['author'] = author.strip()

        novel['bid'] = utils.make_book_id(name, author)
        novel['nid'] = utils.make_novel_id(name, author, site)

        # Description: drop non-breaking spaces and surrounding whitespace.
        desc = info.find('div', {'class': 'intro'})
        if desc is not None:
            novel['desc'] = desc.text.replace('\xa0', '').strip()

        # Category and status are rendered as "label:value" table cells.
        infos = info.find('tr', {'valign': 'top'})
        if infos is not None:
            for td in infos.find_all('td'):
                tmp = td.text.split(':', 1)
                if len(tmp) != 2:
                    continue
                if tmp[0] == '文章分类':
                    novel['category'] = tmp[1]
                elif tmp[0] == '文章状态':
                    novel['status'] = tmp[1]

        # Link to the chapter-list page.
        options = info.find('div', {'class': 'option'})
        if options is not None:
            btopt = options.find('span', {'class': 'btopt'})
            if btopt is not None and btopt.a is not None:
                novel['list_url'] = btopt.a.get('href')
        return novel
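
To exercise the parser, fetch a page and hand both the URL and the HTML to the method. A minimal sketch, assuming the method lives on a crawler class (the NovelCrawler name and the URL below are illustrative, not from the source) and that requests is available:

import requests

# Hypothetical wiring: the class name and URL are placeholders.
crawler = NovelCrawler()
url = 'http://www.example.com/novel/12345'
resp = requests.get(url)
resp.encoding = resp.apparent_encoding  # many novel sites do not serve UTF-8
novel = crawler.parse_novel_page(url, resp.text)
if novel is not None:
    print(novel['name'], novel['author'], novel.get('list_url'))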
Example #2
    def __get_crawler__(self, url):
        """Return the crawler registered for the URL's site, or None."""
        site = utils.get_site(url)
        if site is None:
            logger.debug("Unsupported site: %s", url)
            return None

        crawler = self.crawlers.get(site)
        if crawler is None:
            logger.debug("No crawler registered for site: %s", site)
        return crawler
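
The lookup depends on self.crawlers, a dict keyed by the site identifiers that utils.get_site() returns. A self-contained sketch of the same registry-dispatch pattern, with get_site stubbed out and a placeholder crawler class (all names here are assumptions for illustration):

import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)


def get_site(url):
    # Stub standing in for utils.get_site: URL -> site key, or None.
    return 'example' if 'example.com' in url else None


class ExampleCrawler:
    pass


class Manager:
    def __init__(self):
        # Registry: site key -> crawler instance.
        self.crawlers = {'example': ExampleCrawler()}

    def get_crawler(self, url):
        site = get_site(url)
        if site is None:
            logger.debug("Unsupported site: %s", url)
            return None
        crawler = self.crawlers.get(site)
        if crawler is None:
            logger.debug("No crawler registered for site: %s", site)
        return crawler


print(Manager().get_crawler('http://example.com/book/1'))  # an ExampleCrawler
print(Manager().get_crawler('http://other.net/book/1'))    # None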
Example #3
def _get_profile_manager(site):
    """Return a UserProfileManager bound to the given site."""
    # Normalize the raw URL to a site identifier, then open a server
    # context for it.
    site = utils.get_site(site)
    context = ServerContext.GetContext(site)
    return UserProfileManager(context)
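
Callers pass a site URL and get back a manager bound to that site's server context; since the helper runs its input through utils.get_site() first, a raw URL appears to be acceptable. A hedged usage sketch (the URL is a placeholder, and what the returned UserProfileManager exposes depends on the project's API):

# Illustrative call only; error handling omitted.
manager = _get_profile_manager('http://sharepoint.example.com/sites/team')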