def test_start_urls():
    skips = [
        "www.chinabidding.com",
        "cz.fjzfcg.gov.cn",
    ]
    fixes = [
        'cqjbjyzx',
        'fjjyzx'
    ]
    urls = skip_urls(page_start.start_urls, skips)
    # urls = fix_urls(urls, fixes)
    urls = filter_with(urls, {ORIGIN_REGION: u"重庆"})
    # urls = filter_without(urls, {ORIGIN_REGION: u"重庆"})
    queue_name = "test_start_urls"
    success_list, fail_list = [], []
    for i, item in enumerate(urls):
        print "%d:\t" % i, "test start url: ", show_start_url(item)
        clear_queue(queue_name)
        publish_message(queue_name, item)
        bid_crawler = BidCrawler(queue_name, skip_parse_failure=True)
        results = []
        for k in range(10):
            res, info = bid_crawler.crawl()
            if res == "NO_MESSAGE":
                break
            if res != "SUCCESS":
                results.append("%s: %s" % (k, res))
        if len(results) != 0:
            print_red("crawl start url failed: ")
            print_green(results)
            fail_list.append(item)
        else:
            success_list.append(item)
    print "\nTotal success urls are %s: " % len(success_list)
    for i, item in enumerate(success_list):
        print "%3d: %s" % (i, show_start_url(item))
    print_red("\nTotal fail urls are %s: " % len(fail_list))
    for i, item in enumerate(fail_list):
        print_red("%3d: %s" % (i, show_start_url(item)))
    import util
    hosts = set()
    for item in success_list:
        host = util.get_host_address(item.get("url"))
        hosts.add(host)
    hosts = sorted(hosts)
    print "\nAvailable Hosts:"
    for host in hosts:
        print '"%s",' % host
def test_crawler():
    info = [{
        ORIGIN_REGION: u"重庆>>巴南区",
        URL: "http://jy.bnzw.gov.cn/LBv3/n_newslist_zz.aspx?Item=100026",
        ANNOUNCE_TYPE: u"中标公示",
        PROJECT_TYPE: u"工程建设",
        WEBSITE: u"重庆市巴南区行政服务和公共资源交易中心",
        NOTE: u"重庆市重庆綦江公共资源综合交易网巴南区行政服务和公共资源交易中心-中标公示"
    }]
    queue.publish_message("test", info)
    bid_crawler = crawler.BidCrawler("test")
    bid_crawler.run()
def run(self):
    is_monitor = (self.policy.CRAWLER_TYPE == 2)   # whether this is a scheduled monitoring run
    schedule_seconds = self.policy.SCHEDULE_TIME   # scheduled start times (seconds of the day)
    for idx, sec in enumerate(schedule_seconds):
        schedule_seconds[idx] += random.randint(-1700, 1700)
    schedule_seconds = sorted(schedule_seconds)
    schedule_seconds.append(24 * 3600 + schedule_seconds[0])
    while True:
        thread_list = []
        queue.publish_message(self.queue_name, self.msg)
        crawler_number = self.policy.CRAWLER_NUMBER
        for m in self.msg:
            if m.get(MAX_CRAWLER_NUMBER) and m.get(MAX_CRAWLER_NUMBER) < crawler_number:
                crawler_number = m.get(MAX_CRAWLER_NUMBER)
        try:
            for i in range(crawler_number):
                crawler = BidCrawler(
                    msg_queue=self.queue_name,
                    is_monitor=is_monitor,
                    check_published_ts=self.policy.APPLY_TIME_INTERVAL,
                    start_ts=self.policy.TIME_INTERVAL_ST,
                    end_ts=self.policy.TIME_INTERVAL_ED,
                    skip_parse_failure=True)
                thread = Thread(target=crawler.run)
                thread.start()
                thread_list.append(thread)
            for thread in thread_list:
                thread.join()
            if is_monitor:
                now = datetime.datetime.now()
                seconds_of_day = (now.hour * 3600 + now.minute * 60 + now.second)
                for second in schedule_seconds:
                    if second > seconds_of_day:
                        sleep_seconds = second - seconds_of_day
                        logger.info(
                            "The crawler will be started after %s seconds." % sleep_seconds)
                        time.sleep(sleep_seconds)
                        break
            else:
                break
        except KeyboardInterrupt as e:
            print e.message
            break
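run() above relies on a self.policy object and a self.msg list that are not defined in this section. The stub below is a minimal sketch, assuming a plain class that exposes just the attributes run() reads; the name DemoPolicy and the sample values are hypothetical and not the project's actual policy type.

# Hypothetical stand-in for self.policy; only the attributes read by run() are listed.
class DemoPolicy(object):
    CRAWLER_TYPE = 2                       # 2 means "scheduled monitoring" in run()
    SCHEDULE_TIME = [2 * 3600, 14 * 3600]  # start times as seconds of the day (jittered by +/-1700 s)
    CRAWLER_NUMBER = 4                     # default number of BidCrawler threads per round
    APPLY_TIME_INTERVAL = True             # passed to BidCrawler.check_published_ts
    TIME_INTERVAL_ST = "2000-01-01"        # passed to BidCrawler.start_ts
    TIME_INTERVAL_ED = ""                  # passed to BidCrawler.end_ts

Each dict in self.msg may additionally carry MAX_CRAWLER_NUMBER, which run() uses to cap CRAWLER_NUMBER for that round.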
def test_crawler():
    info = [{
        URL: "http://www.hljcg.gov.cn/xwzs!index.action"
    }, {
        ORIGIN_REGION: u"黑龙江",
        URL: "http://www.hljcg.gov.cn/xwzs!queryXwxxqx.action?lbbh=5&xwzsPage.pageNo=1",
        ANNOUNCE_TYPE: u"成交公告",
        NOTE: u"黑龙江省政府采购网-成交公告"
    }]
    queue.publish_message("test", info)
    bid_crawler = crawler.BidCrawler("test")
    bid_crawler.run()
class BidCrawler(BaseCrawler):
    sleep_time = 1  # sleep time between requests

    def __init__(self, msg_queue,
                 is_monitor=False,          # whether this is a scheduled monitoring run
                 check_published_ts=False,  # whether to check the page's publish time
                 start_ts="2000-01-01",     # earliest publish time to crawl
                 end_ts="",                 # latest publish time to crawl
                 skip_parse_failure=False,  # whether to skip parse failures
                 flip_page_type=0):         # paging mode (0: follow next-page links one by one, mainly for monitoring; 1: generate all pages at once, mainly for full crawls)
        super(BidCrawler, self).__init__()
        self.msg_queue = msg_queue
        self.is_monitor = is_monitor
        self.check_published_ts = check_published_ts
        self.start_ts = start_ts
        self.end_ts = end_ts
        self.skip_parse_failure = skip_parse_failure
        self.flip_page_type = flip_page_type

    def crawl(self):
        # Fetch one message from the queue
        pre_information = get_message(self.msg_queue)
        if pre_information is None:
            return "NO_MESSAGE", {}
        # Get the URL to crawl
        if DATA_URL in pre_information:
            url = pre_information[DATA_URL]
        else:
            url = pre_information[URL]
        # Set how the crawler pages through listing pages
        if self.flip_page_type == 1 and not pre_information.get(GENERATE_ALL_PAGE):
            pre_information[GENERATE_ALL_PAGE] = True
        # Find the parser that matches the url
        matched_parser = get_matched_parser(url)
        if matched_parser is None:
            return "NO_PARSER", pre_information
        re_url, func = matched_parser
        # Sleep if the message asks for a delay
        sleep_second = pre_information.get(SLEEP_SECOND, 0)
        sleep_second += pre_information.get(SLEEP_FOR_CONTENT, 0)
        if sleep_second > 0:
            time.sleep(sleep_second)
        # Fetch the page content
        method = pre_information.get(METHOD, "GET").upper()
        if method == "POST":
            params = pre_information.get(PARAMS, {})
            html = self.post(url, data=params, timeout=40)
        else:
            html = self.get(url, timeout=60)
        if html is None:
            if NO_CONTENT_TIMES not in pre_information:
                pre_information[NO_CONTENT_TIMES] = 0
                pre_information[SLEEP_FOR_CONTENT] = 0
            if pre_information[NO_CONTENT_TIMES] < 3:
                pre_information[NO_CONTENT_TIMES] += 1
                pre_information[SLEEP_FOR_CONTENT] += 4
                publish_message(self.msg_queue, pre_information)
            return "NO_CONTENT", pre_information
        else:
            pre_information[NO_CONTENT_TIMES] = 0
            pre_information[SLEEP_FOR_CONTENT] = 0
        # Parse the page into follow-up links and successfully parsed announcements
        try:
            links, contents = func(html, pre_information)
        except KeyboardInterrupt as e:
            raise e
        except Exception as exception:
            import traceback
            traceback.print_exc()
            logger.error("Parse html failed. %s\n%s" % (
                exception.message,
                json.dumps(pre_information).decode("unicode-escape")))
            return "PARSER_ERROR", pre_information
        # Process the parse results
        if self.is_monitor:
            for cont in contents:
                URL_POOL.add_url(cont[UNI_ORIGIN_ID], cont[URL])
        links = _check_links(links)
        if self.check_published_ts:
            links = _check_published_ts(links, self.start_ts, self.end_ts)
        links = _filter_links(links)
        _store_contents(contents)
        publish_message(self.msg_queue, links)
        return "SUCCESS", pre_information
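For reference, a minimal sketch of driving crawl() directly and reacting to each status code it can return ("NO_MESSAGE", "NO_PARSER", "NO_CONTENT", "PARSER_ERROR", "SUCCESS"), modeled on test_start_urls above. The function name drain_queue, the queue name "demo_queue", and the round limit are hypothetical; print_red and the URL constant are the same helpers the tests use.

# Illustrative sketch only, not part of the original module.
def drain_queue(queue_name="demo_queue", max_rounds=100):
    crawler = BidCrawler(queue_name, skip_parse_failure=True)
    for _ in range(max_rounds):
        status, info = crawler.crawl()
        if status == "NO_MESSAGE":  # queue is empty, nothing left to do
            break
        if status == "NO_CONTENT":  # fetch failed; crawl() already requeued the
            continue                # message with extra sleep (at most 3 retries)
        if status in ("NO_PARSER", "PARSER_ERROR"):
            print_red("%s: %s" % (status, info.get(URL)))
        # "SUCCESS": parsed contents were stored and follow-up links republished
    return crawler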