def work(resp, stat):
    pipe = None
    urlPacker = None
    try:
        logDelayTime(resp)
        if checkResp(resp):
            pipe = stat.getPipeByName(resp['trespassing_field']['pipe'])
            log.notice("got result of pipe: {}, result: {}".format(
                pipe.name, resp['result']))
            urlPacker = cPickle.loads(
                base64.b64decode(resp['trespassing_field']['urlPack']))
            root = json.loads(resp['html_body'])
            saveResult = pipe.saver.start(root, urlPacker)
            if not saveResult:
                raise RuntimeError("saver_error: pipe={}, resp={}".format(
                    pipe.name, resp))
            incrPipeSaverStatus(pipe.name, "ok")
    except Exception as e:
        traceback.print_exc()
        log.fatal("handle_spider_result_worker_err: error={}, resp={}".format(
            e, resp))
        # Guard on urlPacker too: the exception may have been raised before
        # the urlPack was unpickled.
        if pipe is not None and urlPacker is not None:
            try:
                msg = urlPacker.msg
                msg.retry += 1
                if msg.retry > 5:
                    log.debug("retry num > 5, push to trash")
                    pipe.pushToTrashList(base64.b64encode(cPickle.dumps(msg)))
                    incrPipeSaverStatus(pipe.name, "error")
                else:
                    log.debug("push to retry list {}".format(msg.retry))
                    pipe.pushToRetryList(base64.b64encode(cPickle.dumps(msg)))
            except Exception as e:
                log.fatal("unexpected_error_on_csrev_work", e)
def handle_spider_result(packet_queue, stat):
    """
    Receive crawl results from the spider.
    :param packet_queue: queue holding the payloads received via nshead
    :return:
    """
    while True:
        try:
            receive_buf = packet_queue.get()
            try:
                # Payload encoding varies; try gb18030 first, then utf8.
                result_info = tryMcpackLoad(receive_buf, "gb18030")
                if result_info is False:
                    result_info = tryMcpackLoad(receive_buf, "utf8")
                if result_info is False:
                    log.fatal("mcpack_loads_error")
                    continue
                work(result_info, stat)
            except Exception as e:
                traceback.print_exc()
                log.fatal("handle_spider_result_err", e)
                continue
        except Exception:
            traceback.print_exc()
            continue
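# Shape of the decoded result, inferred from the field accesses in work() and
# checkResp() (a hedged sketch, not an authoritative schema; the values are
# illustrative):
#
# result_info = {
#     'result': 200,                  # fetch status; 501 + empty body triggers retry()
#     'content_length': 12345,
#     'html_body': '{"...": "..."}',  # JSON document passed to the pipe saver
#     'trespassing_field': {
#         'pipe': 'DouyinAuthorDetailPipeCspub',  # pipe name registered in stat
#         'urlPack': '<base64(cPickle.dumps(urlPack))>',
#     },
# }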
def _real_worker(self, urlPack):
    for _ in range(2):
        try:
            json_data = {}
            json_data['target_url'] = urlPack.url
            json_data['method'] = 'POST'
            json_data['request_header'] = (
                "Content-Type: application/x-www-form-urlencoded\r\n"
                "User-Agent: kwai-android")
            json_data['post_data'] = mcpack.RAW(urlPack.form)
            bypass = urlPack.getExtra()
            bypass['submitTime'] = time.time()
            bypass['urlPack'] = base64.b64encode(
                cPickle.dumps(urlPack, protocol=-1))
            cspubutil.patch(json_data, bypass, urlPack=urlPack)
            failedList = cspubutil.send2cspub([json_data])
            if len(failedList) > 0:
                log.fatal("send2cspub_error:{}".format(failedList))
            self.incrStatKey('sub2cspub')
            log.debug("submit2cspub: {}, bypass: {}".format(urlPack, bypass))
            return True
        except Exception as e:
            traceback.print_exc()
            log.fatal("crawlerkuaishou_cspubmodel_real_worker_error:{},{}"
                      .format(e, urlPack))
    return False
def _work(self):
    """ Run work() and render the result, mapping exceptions to error responses. """
    try:
        self.work()
        self._render_result(self.errno, self.errmsg, self.response_data)
    except error.BaseError as e:
        self._render_result(e.errno, e.errmsg, {})
        warning = {
            "uri": self.request.uri,
            "logid": self.logid,
            "errno": e.errno,
            "errmsg": e.errmsg,
            "args": str(e.args),
            "trace": traceback.format_exc(),
            "ex_type": type(e)
        }
        log.warning(warning)
        sys.stderr.write(pprint.pformat(warning))
    except Exception as e:
        errno = error.ERRNO_UNKNOWN
        self._render_result(errno, str(e), "")
        warning = {
            "uri": self.request.uri,
            "logid": self.logid,
            "errno": errno,
            "errmsg": str(e),
            "args": str(e.args),
            "trace": traceback.format_exc(),
            "ex_type": type(e)
        }
        log.fatal("internal_error", warning)
        sys.stderr.write(pprint.pformat(warning))
def needRetry(self, pipeName, root):
    try:
        if "error_msg" in root:
            log.fatal("kuaishou_fatal_error, pipe={}, error_msg={}".format(
                pipeName, root['error_msg']))
    except Exception:
        pass
    # Only logs the upstream error; a retry is never requested here.
    return False
def checkTemplate(self, root, exprs, urlPack):
    for e in exprs:
        if len(e.find(root)) == 0:
            log.fatal("{} precheck_error, url={}, resp={}".format(
                self.pipe.name, urlPack, root), str(e))
            self.pipe.incTemplateError()
            return False
    return True
def transformAuthorDetail(data):
    resp = {}
    for f in funcsAuthor:
        try:
            r = f(data)
            resp.update(r)
        except Exception as e:
            log.fatal("error_in_transform_author_detail, func={}, err={}"
                      .format(f, e))
    resp['3rdRawData'] = data
    return resp
def _real_worker(self, urlPack):
    for _ in range(2):
        try:
            log.debug("Fetching: {}".format(urlPack))
            resp = requests.get(urlPack.url, headers=self.HEADERS)
            return json.loads(resp.text)
        except Exception as e:
            log.fatal("crawler_default_real_worker_error:{},{}".format(
                e, urlPack))
    return False
def addDouyinAuthorDetailJob(uid, priority=10):
    for _ in range(3):
        try:
            uid = int(uid)
            host = conftool.randomChoice(SUBMIT_HOST, SUBMIT_PORT)
            r = requests.get(
                "http://{}/spider/add?priority={}"
                "&pipe=DouyinAuthorDetailPipeCspub"
                "&msg_type=AUTHOR&msg_data={}".format(host, priority, uid))
            log.notice("addDouyinAuthorDetailJob:{},{}".format(uid, r.text))
            return True
        except Exception:
            pass
    log.fatal("addDouyinAuthorDetailJob_error:{}".format(uid))
    return False
def transformTopicDetail(data):
    resp = {}
    for f in funcsTopicDetail:
        try:
            r = f(data)
            resp.update(r)
        except Exception as e:
            log.fatal(
                "error_in_transfrom_topic_detail, func={}, err={}, key={}"
                .format(f, e, data.get('_key_')))
            raise e
    return resp
def retry(fun, *param, **kwargs):
    CNT = 3
    for i in range(CNT):
        try:
            resp = fun(*param, **kwargs)
            return resp
        except Exception as e:
            traceback.print_exc()
            log.fatal("{}, error: {}, retrying {}".format(fun, e, i))
            if i < CNT - 1:
                time.sleep(1)
    return False
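# Usage sketch (hedged; the URL is hypothetical, `requests` is already used
# elsewhere in this codebase): wrap any flaky call and fall back when all
# CNT attempts fail.
#
# resp = retry(requests.get, "http://example.com/api", timeout=5)
# if resp is False:
#     log.warning("giving up after retries")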
def run(self):
    log.debug("pipe {} priority: {}".format(self.name, self.priority))
    try:
        self.running = True
        while not self.urlGenerator.end() and self.running:
            urlPacker = self.urlGenerator.next()
            self.pool.apply_async(self._real_worker, (urlPacker, ))
            self.pending.add(urlPacker)
            self.numToWorker += 1
    except Exception as e:
        log.fatal("{} end with error {}".format(self.name, e))
    self.running = False
    return
def addMp4Job(itemKey, priority=10000):
    itemKey = itemKey.strip()
    for _ in range(3):
        try:
            params = {"cmd": "add", "_key_": itemKey, "priority": priority}
            host = conftool.randomChoice(SUBMIT_HOST, SUBMIT_PORT)
            resp = requests.get("http://{}/job".format(host), params=params)
            return json.loads(resp.text)
        except Exception as e:
            log.warning("addMp4Job", e)
            time.sleep(1)
    log.fatal("submiter.addMp4Job fail")
    return False
def checkResp(resp):
    try:
        if resp['content_length'] == 0:
            if resp['result'] == 501:
                log.debug("content_length is 0, result is 501")
                retry(resp)
                return False
            # An empty body without a 501 result is still not usable; the
            # original fell through and returned None here, which callers
            # treated as a failure.
            return False
        else:
            return True
    except Exception as e:
        log.fatal("checkResp err: error={}, resp={}".format(e, resp))
        return False
def load(self):
    fn = self.saveDir + "/" + self.name
    num = 0
    try:
        if os.path.exists(fn):
            with open(fn, "rb") as f:
                tmpSet = cPickle.load(f)
            num = len(tmpSet)
            for urlPack in tmpSet:
                self.urlGenerator.add(urlPack)
            log.notice("load {} queue, num={}".format(self.name, num))
    except Exception as e:
        log.fatal("load_queue_error_{}".format(self.name), e)
    return num
def getByVideoId(videoId):
    for _ in range(10):
        try:
            col_videos = 'mvideo_meta'
            where = {}
            where['video_id'] = videoId
            where = json.dumps(where)
            where = urllib.quote(where)
            get_url_videos = ('http://' + mongo_server + '/get/' + col_videos
                              + '?where=' + where + "&limit=10&order=@id")
            result = _req_get(get_url_videos)
            result = json.loads(result)
            result['data'] = json.loads(result['data'])
            return result
        except Exception as e:
            log.fatal("getByVideoId_error", e)
    return None
def transfromVideoDetail(data):
    resp = {}
    # Flatten the nested author info so downstream funcs see a uniform shape.
    if '_authorInfo_' in data and 'user' in data['_authorInfo_']:
        data['_authorInfo_'].update(data['_authorInfo_']['user'])
    if 'author' not in data:
        data['author'] = data['_authorInfo_']
    for f in funcsVideoDetail:
        try:
            r = f(data)
            resp.update(r)
        except Exception as e:
            log.fatal(
                "error_in_transfrom_video_detail, func={}, err={}, key={}"
                .format(f, e, data.get('_key_')))
            raise e
    return resp
def retry(resp):
    # Mirrors the failure path in work(): bump the retry count and requeue.
    pipe = stat.getPipeByName(resp['trespassing_field']['pipe'])
    urlPacker = cPickle.loads(
        base64.b64decode(resp['trespassing_field']['urlPack']))
    if pipe is not None:
        try:
            msg = urlPacker.msg
            msg.retry += 1
            if msg.retry > 5:
                log.debug("retry num > 5, push to trash")
                pipe.pushToTrashList(base64.b64encode(cPickle.dumps(msg)))
                incrPipeSaverStatus(pipe.name, "error")
            else:
                log.debug("push to retry list {}".format(msg.retry))
                pipe.pushToRetryList(base64.b64encode(cPickle.dumps(msg)))
        except Exception as e:
            log.fatal("unexpected_error_on_csrev_work", e)
def _real_worker(self, urlPack):
    log.debug("_real_worker")
    for _ in range(2):
        try:
            json_data = {}
            json_data['target_url'] = urlPack.url
            json_data['method'] = 'GET'
            json_data['request_header'] = (
                "Content-Type: application/x-www-form-urlencoded\r\n"
                "User-Agent: kwai-android")
            # GET requests carry no post_data.
            bypass = urlPack.getExtra()
            bypass['submitTime'] = time.time()
            bypass['urlPack'] = base64.b64encode(
                cPickle.dumps(urlPack, protocol=-1))
            cspubutil.patch(json_data, bypass, urlPack=urlPack)
            cspubutil.send2cspub([json_data])
            log.debug("submit2cspub: {}, bypass: {}".format(urlPack, bypass))
            return True
        except Exception as e:
            traceback.print_exc()
            log.fatal("crawlerhuoshan_cspub_real_worker_error:{},{}".format(
                e, urlPack))
    return False
def _execute_action(self, path_actions, root, urlPack):
    """ execute xpath actions """
    for xa in path_actions:
        pattern = xa[0]
        func = xa[1]
        if pattern is None:
            func(root, None)
        else:
            r = pattern.find(root)
            for match in r:
                try:
                    func(root, match.value, urlPack)
                except Exception as e:
                    traceback.print_exc()
                    log.fatal(
                        "_execute_action_error:{}, match.value:{}".format(
                            func, match.value), e)
            if len(r) == 0:
                log.warning("pattern {} match empty!".format(pattern))
def _real_worker(self, urlPack):
    for _ in range(2):
        try:
            log.debug("Fetching: {}".format(urlPack))
            resp = urllib2.urlopen(
                urllib2.Request(urlPack.url, urlPack.form, {
                    'Content-Type': 'application/x-www-form-urlencoded',
                    'User-Agent': 'kwai-android'}),
                timeout=30)
            data = resp.read()
            data = json.loads(data)
            return data
        except Exception as e:
            log.fatal("crawlerkuaishou_syncmodel_real_worker_error:{},{}"
                      .format(e, urlPack))
    return False
def _real_worker(self, urlPacker):
    try:
        self.numFetchAll += 1
        log.notice("{},{}".format(threading.current_thread().name,
                                  urlPacker))
        resp = self.crawler.fetch(urlPacker)
        if resp is False:
            log.fatal("{} fetch_error:{}".format(self.name, urlPacker))
            self.numFetchErr += 1
            return
        if isinstance(resp, bool):
            # cspub model: the fetch was submitted asynchronously and the
            # result arrives later via the callback receiver.
            return
        self.saver.start(resp, urlPacker)
        self.urlGenerator.done(urlPacker)
    except Exception as e:
        self.running = False
        log.fatal(e)
        traceback.print_exc()
    finally:
        self.pending.discard(urlPacker)
def handler(self, root, data, urlPack):  # @UnusedVariable
    log.debug("huoshan main feed saver handler, len={}".format(
        len(data["data"])))
    for info in data["data"]:
        try:
            vid = str(info['data']['id'])
            uid = str(info['data']['author']['id'])
        except Exception as e:
            log.fatal("get_huoshan_id_error:{},{}".format(info, e))
            continue
        # Store the video directly.
        obj = dbtools.MongoObject()
        obj.setMeta("VIDEO", const_huoshan.DATA_PROVIDER, vid)
        obj.setUserId(uid)
        obj.setData(info)
        if not self.db.isItemUpdatedRecently(obj.key):
            obj.save()
            log.debug("Inserting obj from HuoshanMainFeed video: {}".format(
                obj.getLastObjectId()))
        else:
            log.debug("HuoshanMainFeed video: {} already inserted".format(
                obj.getLastObjectId()))
        # If the author has not been updated for three days or more,
        # publish the uid.
        authorKey = dbtools.gen_object_key('AUTHOR',
                                           const_huoshan.DATA_PROVIDER, uid)
        if not self.db.isItemUpdatedRecently(authorKey, 3 * 86400):
            objAuthor = dbtools.MongoObject()
            objAuthor.setMeta("AUTHOR", const_huoshan.DATA_PROVIDER, uid)
            objAuthor.save()
            self.addStatObject(authorKey, const_huoshan.DATA_TYPE_AUTHOR)
            msg = Message(const_huoshan.DATA_TYPE_AUTHOR, uid)
            self.pipe.publish(msg)
        else:
            log.debug("author updated recently")
    return
def send2cspub(data_list, host=CSPUB_HOST, port=CSPUB_PORT):
    if not data_list:
        return []
    mcpack.set_default_version(2)
    server = (host, port)
    try:
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    except Exception as err:
        log.fatal('Creating socket error:{}'.format(err))
        return data_list
    try:
        sock.connect(server)
    except Exception as err:
        log.fatal('Connecting to server error:{}'.format(err))
        return data_list
    data_failed = []
    for line in data_list:
        # Accept either a dict or a JSON string.
        if isinstance(line, dict):
            send_dict = line
        else:
            send_dict = json.loads(line)
        send_pack = mcpack.dumps(send_dict)
        try:
            nshead.nshead_write(sock, send_pack)
        except Exception as err:
            log.fatal('Sending data error:{}'.format(err))
            data_failed.append(line)
    sock.close()
    return data_failed
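# Usage sketch (hedged; mirrors how the cspub workers above build their
# payloads -- target_url/method/request_header are the fields they set):
#
# job = {'target_url': 'http://example.com/feed',
#        'method': 'GET',
#        'request_header': 'User-Agent: kwai-android'}
# failed = send2cspub([job])
# if failed:
#     log.fatal('send2cspub_error:{}'.format(failed))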
def work(self):
    """ main worker """
    log.notice("in ItemInfoHandler handler")
    key = self.checkParamAsString('key')
    db = mongo.DB()
    table = const.getTable(key)
    itemType, provider, thirdId, version = dbtools.get_key_info(key)
    resp = db.getOne(table, dbtools.get_object_id_by_key(key))
    if resp is None:
        self.response_data = resp
        return
    adapter = __import__('libs.adapter.adapter_' + str(key.split("-")[1]),
                         fromlist=["libs.adapter"])
    if itemType == "VIDEO":
        uid = adaptertool.getUid(resp)
        authorKey = "AUTHOR-{}-{}-1".format(provider, uid)
        authorInfo = db.getOne(const.getTable(const.DATA_TYPE_AUTHOR),
                               authorKey, '_key_')
        if authorInfo is None:
            log.fatal("no author info for key:{}".format(key))
            raise ValueError("no author meta")
        resp['_authorInfo_'] = authorInfo
        resp['_callback_'] = ("http://" + conftool.randomChoice(
            CALLBACK_HOST, CALLBACK_PORT) + "/job?cmd=callback&_key_=" + key)
        resp = adaptertool.transform(key, resp)
    elif itemType == "AUTHOR":
        resp = adapter.transformAuthorDetail(resp)
    else:
        raise ValueError("Invalid itemType")
    self.response_data = resp
    log.notice("get iteminfo: {},{},{},{}".format(itemType, provider,
                                                  thirdId, version))
def handleUserDetail(self, root, data, urlPack):  # @UnusedVariable
    for info in data:
        try:
            user = info["author"]
            uid = user["uid"]
            obj = dbtools.MongoObject()
            obj.setMeta(const_douyin.DATA_TYPE_AUTHOR,
                        const_douyin.DATA_PROVIDER, uid,
                        version=const_douyin.DATA_VERSION)
            obj.setData(user)
            if not obj.db.isItemUpdatedRecently(obj.key):
                obj.save(const_douyin.MONGO_TABLE_AUTHOR)
                log.debug("DouyinAuthorDetailSaver Inserting obj {}".format(
                    obj.getLastObjectId()))
                self.addStatObject(obj.getLastObjectId(),
                                   const_douyin.DATA_TYPE_AUTHOR)
            else:
                log.debug("uid:{} is already inserted".format(uid))
        except Exception as e:
            log.fatal("{}".format(e))
            raise e
    return
def work(self):
    """ main worker """
    log.notice("in JobHandler handler")
    cmd = self.getParamAsString('cmd')
    if cmd == "get":
        # Pop one item from the queue.
        itemKey = None
        try:
            q = queue.JobPriorityQueue()
            itemKey, priority = q.deQueue(True)
            if itemKey is False:
                self.response_data = {"notice": "queue empty"}
                return
            self.response_data = {"_key_": itemKey}
            queueBack = queue.JobBackupQueue()
            queueBack.enQueue(itemKey, time.time())
            _, provider, thirdId, _ = dbtools.get_key_info(itemKey)
            isCrawled = spider_ucptool.isVideoCrawled(
                "{}_{}".format(provider, thirdId))
            db = mongo.DB()
            if isCrawled:
                insertVal = {}
                insertVal["_crawl_"] = const.CRAWL_STATUS_OK
                insertVal["_utime_"] = int(time.time())
                db.updateByKey(const.getTable(itemKey), itemKey, insertVal)
                self.response_data = {"_key_": itemKey,
                                      "_crawl_": const.CRAWL_STATUS_OK}
                return
            data = db.getOne(const.getTable(itemKey), itemKey, '_key_')
            uid = adaptertool.getUid(data)
            authorKey = "AUTHOR-{}-{}-1".format(provider, uid)
            data['_authorInfo_'] = db.getOne(
                const.getTable(const.DATA_TYPE_AUTHOR), authorKey, '_key_')
            data['_callback_'] = ("http://" + conftool.randomChoice(
                CALLBACK_HOST, CALLBACK_PORT)
                + "/job?cmd=callback&_key_=" + itemKey)
            data['_priority_'] = priority
            if len(data.get('_topic3rdId_', '')) > 0:
                try:
                    topicKey = "TOPIC-{}-{}-1".format(
                        provider, data['_topic3rdId_'])
                    topicInfo = db.getOne(const.getTable('TOPIC'), topicKey,
                                          '_key_')
                    data['microVideoTopic'] = adaptertool.transform(
                        topicKey, topicInfo)['microVideoTopic']
                except Exception as e:
                    log.warning("error_get_microVideoTopic", e)
            self.response_data = data
            log.notice("pop one not crawled:{}".format(itemKey))
        except Exception as e:
            log.fatal("error_get_job_fromqueue={}, _key_={}".format(
                e, itemKey))
            self.response_data = {"_key_": itemKey, "error": str(e)}
        return
    if cmd == "add":
        itemKey = self.checkParamAsString('_key_')
        priority = self.getParamAsInt('priority', 10000)
        q = queue.JobPriorityQueue()
        resp = q.enQueue(itemKey, priority)
        self.response_data = resp
        return
    if cmd == "callback":
        itemKey = self.checkParamAsString('_key_')
        log.notice("got a callback:{}".format(itemKey))
        db = mongo.DB()
        stat = statistics.Statistics()
        value = {}
        value["_crawl_"] = 1
        value["_utime_"] = int(time.time())
        if self.getParamAsString('from') == 'mimod':
            value['_cspubResult_'] = self.getParamAsString('result', '')
            stat.incrCspubResult(value['_cspubResult_'])
        resp = db.updateByKey(const.getTable(itemKey), itemKey, value)
        self.response_data = {"_key_": itemKey, "_crawl_": 1, 'resp': resp}
        stat.incrSenderCallback()
        return
    raise ValueError("invalid cmd: {}".format(cmd))
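# The /job endpoint this handler serves, as exercised by addMp4Job() and the
# _callback_ URLs built above (a summary of the code, not extra behavior):
#
#   GET /job?cmd=get                           -> pop one item, return its data
#   GET /job?cmd=add&_key_=<key>&priority=<n>  -> enqueue an item
#   GET /job?cmd=callback&_key_=<key>[&from=mimod&result=...]
#                                              -> mark an item as crawled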
def _run(self, port=8000, handler=None):
    """
    Service receive loop.
    :param port: port the service listens on
    :param handler: message handler function
    :return:
    """
    if handler is None:
        raise Exception('receiver function without handler')
    packet_queue = Queue.Queue()
    try:
        pool = ThreadPool(processes=20)
    except Exception:
        traceback.print_exc()
        raise Exception('process pool create failed')
    for _ in range(5):
        pool.apply_async(handler, (packet_queue, self.stat))
    server_addr = ("0.0.0.0", port)
    server_sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    server_sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    server_sock.bind(server_addr)
    server_sock.listen(5)
    server_sock.setblocking(0)
    epoll = select.epoll()  # @UndefinedVariable
    epoll.register(server_sock.fileno(), select.EPOLLIN)  # @UndefinedVariable
    connections = {}
    requests = {}
    log.debug("Starting callback at port {}".format(port))
    while self.running:
        events = epoll.poll()
        for fileno, event in events:
            if fileno == server_sock.fileno():
                connection, addr = server_sock.accept()
                log.debug("accepted socket from {}".format(addr))
                connection.setblocking(0)
                epoll.register(connection.fileno(),
                               select.EPOLLIN)  # @UndefinedVariable
                connections[connection.fileno()] = connection
                requests[connection.fileno()] = b''
            elif event & select.EPOLLIN:  # @UndefinedVariable
                recv_data = connections[fileno].recv(104857600)
                need_close_conn = False
                if not recv_data:
                    need_close_conn = True
                else:
                    requests[fileno] += recv_data
                    # Reassemble nshead-framed packets: a fixed-size header
                    # carrying body_len, followed by the mcpack body.
                    while len(requests[fileno]) >= nshead.nsead_body_len:
                        receive_nshead = nshead.nshead()
                        try:
                            receive_nshead.load(
                                requests[fileno][:nshead.nsead_body_len])
                            packet_length = (nshead.nsead_body_len +
                                             receive_nshead.head['body_len'])
                            if len(requests[fileno]) >= packet_length:
                                data = requests[fileno][
                                    nshead.nsead_body_len:packet_length]
                                packet_queue.put(data)
                                requests[fileno] = requests[fileno][
                                    packet_length:]
                            else:
                                break
                        except Exception as e:
                            need_close_conn = True
                            traceback.print_exc()
                            log.fatal("cspub_receiver_error", e)
                            break
                if need_close_conn:
                    epoll.unregister(fileno)
                    connections[fileno].close()
                    del connections[fileno]
                    del requests[fileno]
                    log.debug("connection closed")
            elif event & select.EPOLLHUP:  # @UndefinedVariable
                epoll.unregister(fileno)
                connections[fileno].close()
                del connections[fileno]
                del requests[fileno]
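# Loopback sketch (hedged; the port is hypothetical and assumes the receiver
# above is running locally): send2cspub() emits nshead+mcpack frames of the
# same wire format this epoll loop reassembles, so it can double as a simple
# test client for the framing logic.
#
# failed = send2cspub([{'target_url': 'http://example.com', 'method': 'GET'}],
#                     host='127.0.0.1', port=8000)
# assert failed == []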