Пример #1
0
    def get_result_page_by_page_num(self, search_url, cookieJar = None, ua = None, proxy = None):
        search_header = zhilian_crawler_data.get_search_url_header()

        try:
            html_src = webutil.request(search_url, headers = search_header, cookie = cookieJar, ua = ua, encoding = 'utf-8', retry = 5, timeout = 60, proxy = proxy)
            if len(html_src) < 100 or len(html_src) > 1024 * 1024 * 10:
                raise Exception(u'下载结果页太大或太小')
            return html_src
        except Exception as e:
            print u'下载结果页异常 %s' % e
            raise Exception(u'下载结果页异常')
    def crawl(self, url):
        if url == None or len(url) < 1:
            return

        ua = webutil.get_user_agent()
        cookieJar = cookielib.MozillaCookieJar()

        data_dict = {}
        data_dict['type'] = 'zhilian'
        data_dict['version'] = 1
        data_dict['url'] = url

        try:
            html_src = webutil.request(url, headers = zhilian_crawler_data.get_search_url_header(), ua = ua, cookie = cookieJar, timeout = 60, retry = 5, encoding = 'utf-8', proxy = None)
            if len(html_src) < 100 or len(html_src) > 1024 * 1024 * 10:
                raise Exception(u'下载详情页异常')
            data_dict['html'] = html_src
            self.parse_html(html_src, data_dict)
            self.save_data(url, data_dict)
        except Exception as e:
            print u'下载详情页异常%s' % e
            raise Exception(u'下载详情页异常')