def get_result_page_by_page_num(self, search_url, cookieJar=None, ua=None, proxy=None):
    """Download a search-result page and return its HTML source.

    The page is fetched through ``webutil.request`` using the headers
    supplied by ``zhilian_crawler_data.get_search_url_header()``.

    Args:
        search_url: URL of the result page to download.
        cookieJar: Cookie container forwarded to ``webutil.request``.
        ua: User-agent value forwarded to ``webutil.request``.
        proxy: Proxy setting forwarded to ``webutil.request``.

    Returns:
        The downloaded page source when its length lies between 100 and
        10 * 1024 * 1024 (both inclusive).

    Raises:
        Exception: With the fixed message u'下载结果页异常' whenever the
            download fails or the page length is outside the accepted
            range. The underlying error is printed before re-raising.
    """
    request_headers = zhilian_crawler_data.get_search_url_header()
    min_length = 100
    max_length = 1024 * 1024 * 10
    try:
        page = webutil.request(
            search_url,
            headers=request_headers,
            cookie=cookieJar,
            ua=ua,
            encoding='utf-8',
            retry=5,
            timeout=60,
            proxy=proxy
        )
        if min_length <= len(page) <= max_length:
            return page
        # Raised inside the try on purpose: the handler below reports it
        # and converts it into the generic download error.
        raise Exception(u'下载结果页太大或太小')
    except Exception as err:
        print(u'下载结果页异常 %s' % err)
        raise Exception(u'下载结果页异常')
def crawl(self, url):
    """Download a detail page, parse it and persist the result.

    A record dict is built with the keys ``type`` ('zhilian'),
    ``version`` (1), ``url`` and, once downloaded, ``html``. The record
    is handed to ``self.parse_html`` and then to ``self.save_data``.

    Args:
        url: URL of the detail page. A missing or empty value makes the
            method return immediately without doing any work.

    Returns:
        None.

    Raises:
        Exception: With the fixed message u'下载详情页异常' when the
            download, the size check, parsing or saving fails. The
            underlying error is printed before re-raising.
    """
    # Truthiness covers both None and the empty string in one test.
    if not url:
        return

    ua = webutil.get_user_agent()
    cookieJar = cookielib.MozillaCookieJar()
    data_dict = {
        'type': 'zhilian',
        'version': 1,
        'url': url,
    }
    min_length = 100
    max_length = 1024 * 1024 * 10
    try:
        html_src = webutil.request(
            url,
            headers=zhilian_crawler_data.get_search_url_header(),
            ua=ua,
            cookie=cookieJar,
            timeout=60,
            retry=5,
            encoding='utf-8',
            proxy=None
        )
        # Reject responses whose length is outside the accepted range.
        if len(html_src) < min_length or len(html_src) > max_length:
            raise Exception(u'下载详情页异常')
        data_dict['html'] = html_src
        self.parse_html(html_src, data_dict)
        self.save_data(url, data_dict)
    except Exception as e:
        # Single-argument print(...) behaves the same on Python 2 and
        # also parses on Python 3.
        print(u'下载详情页异常%s' % e)
        raise Exception(u'下载详情页异常')