def init_spider(module_name):
    """
    :param module_name: like spider.booking.hotel_list_spider
    :return: in theory the module contains only one spider
    """
    print(module_name)
    spider_module = importlib.import_module('.' + module_name, 'mioji')
    spider_list = []
    for attr in inspect.getmembers(spider_module):
        if inspect.isclass(attr[1]) and attr[1].__module__.endswith(
                '_spider') and attr[1].__module__.endswith(module_name):
            if issubclass(attr[1].__bases__[0], Spider):
                # load only classes derived from (or equal to) Spider
                try:
                    spider_clazz = getattr(spider_module, attr[0])
                    spider = spider_clazz()
                    if isinstance(spider, Spider):
                        spider_desc = {
                            'source_type': spider.source_type,
                            'spider_class': spider_clazz,
                            'targets': spider.targets.keys(),
                        }
                        spider_list.append(spider_desc)
                except Exception:
                    logger.exception('failed to instantiate spider[%s]', attr[1])
    return spider_list

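
# Usage sketch (an assumption, not part of the original module): the descriptors
# returned by init_spider can be folded into a source_type -> spider_class lookup,
# similar to what g_spider_factory.get_spider() appears to provide in doTask below.
# The module path and the 'booking' source name are purely illustrative.
def build_spider_map(module_names):
    spider_map = {}
    for name in module_names:
        for desc in init_spider(name):
            spider_map[desc['source_type']] = desc['spider_class']
    return spider_map

# spider_map = build_spider_map(['spider.booking.hotel_list_spider'])
# spider_clazz = spider_map.get('booking')
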
def crawl(self):
    """
    External entry point that starts the spider.
    Crawling only begins when this method is called.
    :return:
    """
    # todo self.__create_browser()
    cur_id = str(uuid.uuid1())
    if hasattr(self.task, 'new_task_id'):
        cur_id = self.task.new_task_id
    self.spider_taskinfo = {'task_id': cur_id}
    for k, v in self.task.__dict__.items():
        self.spider_taskinfo[k] = v
        try:
            logger.info(current_log_tag() + '[task info][%s][%s]' % (k, json.dumps(v)))
        except Exception:
            continue

    chains = self.targets_request()
    try:
        self.code = self.__crawl_by_chain(chains)
    except parser_except.ParserException as e:
        logger.exception(e)
        self.code = e.code
        self.exception = e.msg
        if e.retry_from_first:
            raise e

    # derive the final error code from all collected results
    self.check_all_result()
    return self.code

def call(*args, **kwargs):
    try:
        return func(*args, **kwargs), True
    except Exception as exc:
        logger.exception('[new framework][page parse exception][ {0} ]'.format(
            traceback.format_exc().replace('\n', '\t')))
        return (args, kwargs, exc), False

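
# Hedged sketch (assumption, not taken from the original source): `call` reads like
# the inner wrapper of an exception-trapping decorator that turns any parser into a
# (payload, ok) pair. The decorator name `safe_parse` and the parser `parse_room_list`
# are hypothetical.
def safe_parse(func):
    def call(*args, **kwargs):
        try:
            return func(*args, **kwargs), True
        except Exception as exc:
            logger.exception('[page parse exception][ {0} ]'.format(
                traceback.format_exc().replace('\n', '\t')))
            return (args, kwargs, exc), False
    return call

# usage: parsed, ok = safe_parse(parse_room_list)(response)
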
def write_message(max_try):
    """
    Publish the verify-task callback message to RabbitMQ, retrying recursively.
    :param max_try: number of publish attempts still allowed
    :return: remaining attempt budget
    """
    try:
        max_try -= 1
        msg = json.dumps({
            'qid': task.req_qid,
            'type': task.callback_type,
            'uid': task.req_uid,
            'query': json.dumps(query),
            'status': spider_status
        })
        credentials = pika.PlainCredentials(
            username=task.master_info['spider_mq_user'],
            password=task.master_info['spider_mq_passwd'])
        connection = pika.BlockingConnection(
            pika.ConnectionParameters(
                host=task.master_info['spider_mq_host'],
                virtual_host=task.master_info['spider_mq_vhost'],
                credentials=credentials,
                # heartbeat_interval=0
            ))
        channel = connection.channel()
        res = channel.basic_publish(
            exchange=task.master_info['spider_mq_exchange'],
            routing_key=task.master_info['spider_mq_routerKey'],
            properties=pika.BasicProperties(delivery_mode=2),
            body=msg,
        )
        connection.process_data_events()
        connection.close()
        if not res:
            warn_msg = 'RabbitMQ Result False: {0}'.format(msg)
            info = warn(str(task.req_qid), 'ex_RabbitMQ', get_local_ip(), warn_msg)
            logger.debug("\n" + info)
            raise Exception('RabbitMQ Result False')
        logger.debug(
            '[callback a verifytask done] qid:{}, source: {}, task_info: {}, status: {}'
            .format(str(task.req_qid), str(task.source), task.content, spider_status))
        return max_try
    except Exception as exc:
        if max_try > 0:
            return write_message(max_try)
        else:
            warn_msg = 'RabbitMQ Result False qid : {}, e_info: {}, msg: {}'.format(
                task.req_qid, traceback.format_exc(), msg)
            info = warn(task.req_qid, 'ex_SpiderMQ', get_local_ip(), warn_msg)
            logger.exception("\n" + info)
            return max_try

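
# Usage sketch (assumption): the publisher is invoked with a small retry budget and
# returns whatever budget is left. A budget of 3 is only illustrative; a remaining
# value of 0 may also mean success on the very last attempt, hence the soft wording.
remaining = write_message(3)
if remaining <= 0:
    logger.warning('callback message may not have reached RabbitMQ')
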
def __crawl_by_chain(self, chains):
    """
    Walk the request chain and crawl each step according to the type of
    request it produces.
    :param chains:
    :return:
    """
    code = 0
    browser = None
    try:
        for reqParse in chains:
            # gevent.sleep(0)
            new_result = None
            browser = self.__create_browser(reqParse.new_session)
            reqParse.spider = self
            t_req = reqParse.request()
            if isinstance(t_req, dict):
                # single request
                new_result = self.__single_crawl(reqParse, browser, t_req, 0)
            elif isinstance(t_req, list):
                # the spider may return an empty list!
                if t_req:
                    if reqParse.asynchronous:
                        # parallel crawl
                        list_result = self.__async_crawl_list(
                            reqParse, browser, t_req)
                    else:
                        # serial crawl
                        list_result = self.__crawl_list(
                            reqParse, browser, t_req)
                    new_result, code = self.check_list_result(
                        list_result, code)  # $$$ could be optimised
            elif isinstance(t_req, types.GeneratorType):
                # requests produced lazily via yield
                list_result = self.__crawl_list(reqParse, browser, t_req)
                new_result, code = self.check_list_result(
                    list_result, code)

            # an empty request list leaves new_result unset; skip appending then
            if new_result is not None:
                self.__spider_append_result(new_result)
            if self.use_selenium and browser.br:
                browser.close()
    except parser_except.ParserException as e:
        if self.use_selenium and browser and browser.br:
            browser.close()
        logger.error(e)
        raise e
    except Exception:
        if self.use_selenium and browser and browser.br:
            browser.close()
        logger.exception(current_log_tag() +
                         '[new framework][chained request][unknown error][ {0} ]'.format(
                             traceback.format_exc().replace('\n', '\t')))
        raise parser_except.ParserException(
            parser_except.UNKNOWN_ERROR,
            'e:{0}'.format(traceback.format_exc()))
    return code

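
# Illustrative sketch (assumption): the generator branch above covers request
# builders that yield page requests lazily instead of returning a full list. The
# shape of the request dict and the field names below are hypothetical.
def request(self):
    for page in range(1, 6):
        yield {
            'req': {'url': 'https://example.com/hotel/list?page=%d' % page},
            'page': page,
        }
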
def doCallback(self, task, error_code, spider, result_type):
    """ Run the callback step. """

    def get_ticket_num():
        ticket_num = 0
        for per_data_type in spider.crawl_targets_required:
            ticket_num += len(spider._asy_temp_result[per_data_type])
        return ticket_num

    def get_result(_result):
        _proxy_or_ticket = []
        for per_data_type in spider.crawl_targets_required:
            _proxy_or_ticket.extend(_result[per_data_type])
        return _proxy_or_ticket

    # while still RUNNING, wait a second and re-check before reporting
    if result_type == "RUNNING":
        num1 = get_ticket_num()
        time.sleep(1)
        # ticket count after the buffer interval
        num2 = get_ticket_num()
        if num1 != num2 or spider.spider_frame_status:
            return

    task.other_info['parser_error'] = int(error_code)
    query = {"other_info": task.other_info}
    result = None
    redis_mq_logger = RedisMQCostLogger()
    extra = {}
    if spider:
        result = spider._asy_temp_result if result_type == 'RUNNING' else spider.result
        result = get_result(result)
        extra = spider.extra
        redis_mq_logger.ticket_num = len(spider._asy_temp_result)
    try:
        redis_mq_logger.qid = task.req_qid
        redis_mq_logger.source = task.source
        redis_mq_logger.task_id = task.new_task_id
        redis_mq_logger.task_info = task.content
        redis_mq_logger.error_code = error_code
        if result_type == 'END':
            redis_mq_logger.is_end = 1
        # write the result into redis
        redis_cost = self.write_redis_ticket(task, result, error_code, extra)
        if isinstance(redis_cost, tuple):
            redis_mq_logger.conn_redis = redis_cost[0]
            redis_mq_logger.write_redis = redis_cost[1]
        else:
            redis_mq_logger.exception = redis_cost
    except Exception as e:
        logger.exception('not redis con' + str(e))

def __crawl_list(self, reqParse, browser, req_list):
    """ Crawl paginated requests serially. """
    result = defaultdict(list)
    all_except = True
    all_ok = True
    one_exception = None
    total_count = 0
    success_count = 0
    error_req = []
    for req in req_list:
        # flip-page limit disabled for the serial path
        # if NEED_FLIP_LIMIT:
        #     if total_count >= MAX_FLIP:
        #         break
        total_count += 1
        try:
            res = self.__single_crawl(reqParse, browser, req,
                                      page_count=total_count)
            self.__target_append_result(result, res)
            # count the page as successful only after crawl and append succeed
            success_count += 1
            all_except = False
        except Exception as e:
            all_ok = False
            one_exception = e
            error_req.append((req, one_exception.message))
            logger.exception(
                current_log_tag() + '[new framework][page parse exception][ {0} ]'.format(
                    traceback.format_exc().replace('\n', '\t')))
            # re-raise exceptions coming from generator-produced requests
            if isinstance(req, types.GeneratorType):
                raise e
    if reqParse.binding:
        self.success_count = success_count
        self.all_count = total_count
    logger.debug(
        current_log_tag() +
        '[paging crawl][serial][ success {0} / {1} ]'.format(success_count, total_count))
    if error_req:
        logger.debug(current_log_tag() +
                     '[paging crawl][serial][ failed page requests {0} ]'.format(str(error_req)))
    return result, all_except, all_ok, one_exception

def write_redis_ticket(self, task, result, error_code, extra):
    try:
        begin = time.time()
        params = (task.redis_host, task.redis_port, int(task.redis_db),
                  task.redis_passwd)
        rds = self.get_redis_pool(params)
        conn_cost_time = round(time.time() - begin, 3)

        # Temporary compatibility compromise for the retrieval side: force the
        # Google API response into the expected format.
        if task.source == "Realtraffic" and result:
            result = json.dumps(result[0])

        result = {"err_code": error_code, "data": result, "extra": extra}
        begin = time.time()
        if task.ticket_info.get("auth"):
            rds.setex(task.redis_key, json.dumps(result), 1800)
        else:
            rds.setex(task.redis_key, json.dumps(result), 600)
        write_cost_time = round(time.time() - begin, 3)
        return conn_cost_time, write_cost_time
    except Exception as e:
        warn_msg = 'redis_host:' + task.redis_host + ' ' + str(e)
        info = warn(task.req_qid, 'ex_SpiderRedis', msg=warn_msg)
        logger.exception("\n" + info)
        return str(e)

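
# Read-back sketch (assumption, not part of the original code): a consumer loads
# the JSON blob written above and inspects err_code / data / extra. The direct
# redis.StrictRedis connection stands in for whatever pool get_redis_pool wraps,
# and the task fields mirror the ones used in write_redis_ticket.
import json
import redis

rds = redis.StrictRedis(host=task.redis_host, port=task.redis_port,
                        db=int(task.redis_db), password=task.redis_passwd)
payload = json.loads(rds.get(task.redis_key))
err_code, data, extra = payload['err_code'], payload['data'], payload['extra']
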
def doTask(task):
    """ Invoke the spider for a task and collect its final code and result. """
    spider = g_spider_factory.get_spider(task.source)
    if not spider:
        logger.error('no spider found for source [%s]' % (task.source))
        callback.CallbackResult(task=task, error_code=11, result_type="END")
        code = 11
    else:
        spider = spider(task)
        spider.task = task
        print(task)
        spider.debug = False
        spider.process_callback = callback.CallbackResult            # callback hook
        spider.task_post_process_queue = g_task_post_process_queue   # pool used for ucloud uploads
        spider.need_proxy = g_config.need_proxy
        spider.machine_type = g_config.machine_type
        spider.env = g_config.env
        spider.local_ip = g_config.local_ip
        spider.is_verify = g_config.need_post_process
        crawl_time = time.time()
        try:
            # run the crawl, retrying from the first request when required
            spider = spider_crawl(spider, task)
        except ParserException as e:
            error_info = e.msg
            error = e.code
            logger.exception('new framework spider raised: task:{0}, error:{1}, msg: {2}'.format(
                task, error, error_info))
        except Exception as e:
            logger.exception('new framework spider raised: task_data:{0} error:{1}'.format(task, e))
            error = SLAVE_ERROR
        spider.last_time = int((time.time() - crawl_time) * 1000)
        # final check over everything the spider returned
        check_all_result(spider)
        spider.spider_frame_status = 1
        # run the callback; an END result triggers the synchronous callback path
        callback.CallbackResult(task=task, error_code=spider.code, spider=spider,
                                result_type="END")
        # write the error log
        error_logger(spider)
        code = spider.code

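
# Worker-loop sketch (assumption, names hypothetical): tasks arrive from some queue
# and are handed to doTask one at a time; doTask performs the callbacks itself.
while True:
    task = g_task_queue.get()   # g_task_queue is a hypothetical task source
    doTask(task)
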