def save(self, table=None):
    """
    Write this object's data to the database through ``self.db.updateByKey``.

    Any third-party id that is set (user / video / topic) is first copied
    into ``self.data`` as a string, so ``self.data`` is modified in place.

    :param table: target table; when None it is resolved with
        ``const.getTable(self.dataType)``.
    """
    record = dict(
        _key_=self.key,
        _utime_=int(time.time()),
        _dataSource_=self.dataSource,
    )

    # Adapter fields: mirror the optional third-party ids into the payload.
    third_party_fields = (
        ("_user3rdId_", "user3rdId"),
        ("_video3rdId_", "video3rdId"),
        ("_topic3rdId_", "topic3rdId"),
    )
    for field, attr in third_party_fields:
        third_id = getattr(self, attr)
        if third_id is not None:
            self.data[field] = str(third_id)

    # Entries of self.data win over the meta fields set above.
    record.update(self.data)

    # Passed as a separate argument to updateByKey; presumably only applied
    # when the row is first created -- confirm against db.updateByKey.
    on_insert = {"_insertTime_": int(time.time()), "_crawl_": 0}

    target = const.getTable(self.dataType) if table is None else table
    self.db.updateByKey(target, self.key, record, on_insert)
def handler(self, root, users, urlPack):  # @UnusedVariable
    """
    Handle one page of user-search results.

    For every user whose AUTHOR record was not updated within the last
    365 days: publish an AUTHOR message, save the user as a MongoObject,
    record a stat entry and queue the user on the author-detail pipe (and,
    while the page cursor is <= 10, on the author-videos pipe as well).
    When the page is a non-empty list, sleep 10 seconds and publish a
    keyword message for the next page.

    :param root: not used by this handler.
    :param users: iterable of user dicts, each carrying a 'user_id'.
    :param urlPack: request descriptor; read through getKey('pcursor') and
        extra['keyword'] / extra['pcursor'].
    """
    log.debug("return users len:[{}]".format(len(users)))

    for user in users:
        user_id = user['user_id']
        author_key = dbtools.gen_object_key('AUTHOR', 'kuaishou', user_id)
        if self.db.isObjectUpdatedRecently(const.getTable('AUTHOR'), author_key, 365 * 86400):
            continue

        log.debug("search result, pcursor={}, user_id={}".format(urlPack.getKey('pcursor'), user_id))
        self.publish(Message(const.DATA_TYPE_AUTHOR, user_id))

        author = dbtools.MongoObject()
        author.setMeta(const.DATA_TYPE_AUTHOR, const_kuaishou.DATA_PROVIDER, user_id)
        author.setData(user)
        author.save()
        log.debug("KuaiShouSearchUserSaver Inserting obj {}".format(author.getLastObjectId()))
        # NOTE(review): the saved object is an AUTHOR, yet the stat is recorded
        # under DATA_TYPE_VIDEO -- confirm this is intended.
        self.addStatObject(author.getLastObjectId(), const_kuaishou.DATA_TYPE_VIDEO)

        # authorDetail
        detail_pipe = self.pipe.stat.getPipeByName('KuaiShouAuthorDetailPipeCspub')
        detail_pipe.addMessageObject(Message('AUTHOR', user_id))

        # Author videos are only requested for the first pages of results.
        if int(urlPack.getKey('pcursor')) > 10:
            continue
        videos_pipe = self.pipe.stat.getPipeByName('KuaiShouAuthorVideosPipeCspub')
        videos_pipe.addMessageObject(Message('AUTHOR', user_id))

    # Only a non-empty list page triggers a request for the following page.
    if type(users) is not list or len(users) == 0:
        return

    time.sleep(10)
    next_page = Message(const_kuaishou.DATA_TYPE_KEYWORD, urlPack.extra['keyword'])
    next_page.addKey('pcursor', int(urlPack.extra['pcursor']) + 1)
    self.publish(next_page)
    log.debug("publish to next page: {}".format(self.pipe.name))
    return
def work(self):
    """
    Load the item named by the 'key' request parameter and return it in
    author-detail form.

    The record is read from the table given by ``const.getTable(key)`` and
    passed through ``transformAuthorDetail`` of the adapter module
    ``libs.adapter.adapter_<second "-"-separated part of key>``. The result
    is stored in ``self.response_data``. When no record exists,
    ``response_data`` is set to None and no adapter is loaded.
    """
    log.notice("in ItemInfoHandler handler")
    key = self.checkParamAsString('key')
    db = mongo.DB()
    table = const.getTable(key)
    resp = db.getOne(table, dbtools.get_object_id_by_key(key))
    if resp is None:
        # Nothing stored for this key: report that instead of handing None
        # to the adapter.
        self.response_data = None
        return

    # The adapter module is chosen by the second "-"-separated part of the key.
    adapter = __import__('libs.adapter.adapter_' + str(key.split("-")[1]),
                         fromlist=["libs.adapter"])
    self.response_data = adapter.transformAuthorDetail(resp)
def handler(self, root, data, urlPack):  # @UnusedVariable
    """
    Save author detail record(s) and register each saved object for stats.

    :param root: not used by this handler.
    :param data: a single user mapping, or an iterable of user mappings;
        each one must carry an "id".
    :param urlPack: not used by this handler.
    """
    # A lone record is wrapped so both input shapes share one loop.
    # isinstance also covers dict subclasses, which would otherwise be
    # iterated key by key and fail on user["id"].
    if isinstance(data, dict):
        data = [data]

    for user in data:
        obj = dbtools.MongoObject()
        obj.setMeta(const.DATA_TYPE_AUTHOR, const_huoshan.DATA_PROVIDER, user["id"])
        obj.setData(user)
        obj.save(const.getTable(const.DATA_TYPE_AUTHOR))
        log.debug("HuoShanAuthorDetailSaver Inserting obj {}".format(
            obj.getLastObjectId()))
        self.addStatObject(obj.getLastObjectId(), const.DATA_TYPE_AUTHOR)
    return
def work(self):
    """
    Return the stored item named by the 'key' request parameter,
    transformed for output.

    * VIDEO items get '_authorInfo_' (the author record looked up by
      '_key_') and a '_callback_' URL attached, then go through
      ``adaptertool.transform``.
    * AUTHOR items go through ``transformAuthorDetail`` of the adapter
      module ``libs.adapter.adapter_<second "-"-separated part of key>``.

    The result is stored in ``self.response_data``; it is None when no
    record exists for the key.

    :raises ValueError: when a VIDEO has no author record, or when the item
        type is neither VIDEO nor AUTHOR.
    """
    log.notice("in ItemInfoHandler handler")
    key = self.checkParamAsString('key')
    db = mongo.DB()
    table = const.getTable(key)
    itemType, provider, thirdId, version = dbtools.get_key_info(key)
    resp = db.getOne(table, dbtools.get_object_id_by_key(key))
    if resp is None:
        self.response_data = resp
        return

    # Imported before the type check, so a key without a matching adapter
    # module fails here regardless of item type.
    adapter = __import__('libs.adapter.adapter_' + str(key.split("-")[1]),
                         fromlist=["libs.adapter"])

    if itemType == "VIDEO":
        uid = adaptertool.getUid(resp)
        authorKey = "AUTHOR-{}-{}-1".format(provider, uid)
        authorInfo = db.getOne(const.getTable(const.DATA_TYPE_AUTHOR), authorKey, '_key_')
        if authorInfo is None:
            log.fatal("no author info for key:{}".format(key))
            raise ValueError("no author meta")
        resp['_authorInfo_'] = authorInfo
        resp['_callback_'] = "http://" + conftool.randomChoice(
            CALLBACK_HOST, CALLBACK_PORT) + "/job?cmd=callback&_key_=" + key
        resp = adaptertool.transform(key, resp)
    elif itemType == "AUTHOR":
        resp = adapter.transformAuthorDetail(resp)
    else:
        raise ValueError("Invalid itemType")

    self.response_data = resp
    log.notice("get iteminfo: {},{},{},{}".format(itemType, provider, thirdId, version))
def onReceiveMsg(self, msg):
    """
    Queue a video-detail request for a VIDEO message whose video is not
    stored yet.

    :param msg: incoming message; its msgType and msgData are read.
    :return: True when a request was queued through ``self.add``,
        False otherwise.
    """
    log.debug("KuaiShouVideoDetailProvider receive {}".format(msg))

    # Only VIDEO messages are of interest here.
    if msg.msgType != const_kuaishou.DATA_TYPE_VIDEO:
        return False

    video_key = dbtools.gen_object_key(const.DATA_TYPE_VIDEO, 'kuaishou', msg.msgData)
    stored = self.db.getOne(const.getTable('VIDEO'), video_key, '_key_')
    if stored is not None:
        log.debug("vid:{} has already inserted".format(msg.msgData))
        return False

    request = urlprovider.UrlPack(priority=0, url=self.url)
    request.setForm(self.form.format(vid=msg.msgData))
    request.fillMsg(msg, self.pipe)
    self.add(request)
    return True
def handler(self, root, data, urlPack):  # @UnusedVariable
    """
    Handle one page of a tag feed.

    For every feed entry: stamp it with the current time under the pipe's
    name, save the author (and publish an AUTHOR message) unless the author
    item was updated within the last 3 days, and publish a VIDEO message
    when the video is not stored yet. Unless the cursor is "no_more", a
    TAG_NAME message for the next page is published, followed by a
    60-second sleep.

    :param root: not used by this handler.
    :param data: mapping with "feeds" (list of entries carrying 'userId'
        and "photoId") and "pcursor".
    :param urlPack: request descriptor; read through getKey("tag") and
        getKey("topic_id").
    """
    feeds = data["feeds"]
    pcursor = data["pcursor"]
    tag = urlPack.getKey("tag")
    log.debug("KuaiShouShareTagSaver tag:{}, feed length: {}, pcursor: {}".
              format(tag, len(feeds), pcursor))

    for info in feeds:
        # Record when this pipe last saw the entry.
        info[self.pipe.name] = int(time.time())

        authorId = info['userId']
        author_obj = dbtools.MongoObject(db=self.db)
        author_obj.setMeta(const_kuaishou.DATA_TYPE_AUTHOR,
                           const_kuaishou.DATA_PROVIDER, authorId)
        author_obj.setData(info)
        if not self.db.isItemUpdatedRecently(author_obj.key, 3 * 86400):
            author_obj.save()
            msg = Message(const_kuaishou.DATA_TYPE_AUTHOR, authorId)
            self.publish(msg)
        else:
            log.debug("skip user_id:{}".format(authorId))

        videoId = info["photoId"]
        videoId_obj = dbtools.MongoObject(db=self.db)
        videoId_obj.setMeta(const_kuaishou.DATA_TYPE_VIDEO,
                            const_kuaishou.DATA_PROVIDER, videoId)
        # Look the video up by its '_key_' field, as the other key-based
        # getOne calls in this file do; without the field name the key was
        # passed where those calls supply an object id.
        if not self.db.getOne(const.getTable(const.DATA_TYPE_VIDEO),
                              videoId_obj.key, '_key_'):
            msg = Message(const_kuaishou.DATA_TYPE_VIDEO, videoId)
            self.publish(msg)

    if pcursor != "no_more":
        msg = Message(const_kuaishou.DATA_TYPE_TAG_NAME, tag)
        msg.setExtra("topic_id", urlPack.getKey("topic_id"))
        msg.setExtra("pcursor", pcursor)
        self.publish(msg)
        time.sleep(60)
    return
def isItemUpdatedRecently(self, itemKey, recentSeconds=3 * 86400):
    """
    Run ``isObjectUpdatedRecently`` for an item, resolving its table from
    the key.

    :param itemKey: item key; also used to pick the table through
        ``const.getTable``.
    :param recentSeconds: forwarded unchanged; defaults to 3 * 86400.
    :return: whatever ``isObjectUpdatedRecently`` returns.
    """
    table = const.getTable(itemKey)
    return self.isObjectUpdatedRecently(table, itemKey, recentSeconds)
def work(self):
    """
    Dispatch a job request according to the 'cmd' request parameter.

    * ``get``: pop one item key from the priority queue, record it in the
      backup queue, and respond either with a "crawled" marker (when
      ``spider_ucptool.isVideoCrawled`` says so) or with the stored record
      enriched with '_authorInfo_', '_callback_', '_priority_' and, when a
      topic id is present, 'microVideoTopic'. Any failure is logged and
      reported as ``{"_key_": ..., "error": ...}``.
    * ``add``: enqueue '_key_' with 'priority' (default 10000) and respond
      with the queue's result.
    * ``callback``: mark '_key_' as crawled, optionally store the
      'result' parameter when 'from' is 'mimod', and update statistics.

    The response is stored in ``self.response_data``.

    :raises ValueError: when 'cmd' is none of the commands above.
    """
    log.notice("in JobHandler handler")
    cmd = self.getParamAsString('cmd')

    if cmd == "get":
        # Pull one item from the queue.
        # Bound up front so the except handler can always refer to it, even
        # when the failure happens before an item was dequeued.
        itemKey = None
        try:
            q = queue.JobPriorityQueue()
            itemKey, priority = q.deQueue(True)
            if itemKey is False:
                self.response_data = {"notice": "queue empty"}
                return
            self.response_data = {"_key_": itemKey}

            queueBack = queue.JobBackupQueue()
            queueBack.enQueue(itemKey, time.time())

            _, provider, thirdId, _ = dbtools.get_key_info(itemKey)
            isCrawled = spider_ucptool.isVideoCrawled("{}_{}".format(provider, thirdId))
            db = mongo.DB()
            if isCrawled:
                # Already crawled: just mark the record and report the status.
                insertVal = {}
                insertVal["_crawl_"] = const.CRAWL_STATUS_OK
                insertVal["_utime_"] = int(time.time())
                db.updateByKey(const.getTable(itemKey), itemKey, insertVal)
                self.response_data = {"_key_": itemKey, "_crawl_": const.CRAWL_STATUS_OK}
                return

            data = db.getOne(const.getTable(itemKey), itemKey, '_key_')
            uid = adaptertool.getUid(data)
            authorKey = "AUTHOR-{}-{}-1".format(provider, uid)
            data['_authorInfo_'] = db.getOne(const.getTable(const.DATA_TYPE_AUTHOR), authorKey, '_key_')
            data['_callback_'] = "http://" + conftool.randomChoice(CALLBACK_HOST, CALLBACK_PORT) + "/job?cmd=callback&_key_=" + itemKey
            data['_priority_'] = priority

            if len(data.get('_topic3rdId_', '')) > 0:
                # Topic enrichment is best effort: a failure is logged and
                # the job is still returned.
                try:
                    topicKey = "TOPIC-{}-{}-1".format(provider, data['_topic3rdId_'])
                    topicInfo = db.getOne(const.getTable('TOPIC'), topicKey, '_key_')
                    data['microVideoTopic'] = adaptertool.transform(topicKey, topicInfo)['microVideoTopic']
                except Exception as e:
                    # NOTE(review): the exception is passed as a second
                    # positional argument -- confirm log.warning accepts it.
                    log.warning("error_get_microVideoTopic", e)

            self.response_data = data
            log.notice("pop one not crawled:{}".format(itemKey))
        except Exception as e:
            log.fatal("error_get_job_fromqueue={}, _key_={}".format(e, itemKey))
            self.response_data = {"_key_": itemKey, "error": str(e)}
        return

    if cmd == "add":
        itemKey = self.checkParamAsString('_key_')
        priority = self.getParamAsInt('priority', 10000)
        q = queue.JobPriorityQueue()
        resp = q.enQueue(itemKey, priority)
        self.response_data = resp
        return

    if cmd == "callback":
        itemKey = self.checkParamAsString('_key_')
        log.notice("got a callback:{}".format(itemKey))
        db = mongo.DB()
        stat = statistics.Statistics()
        value = {}
        value["_crawl_"] = 1
        value["_utime_"] = int(time.time())
        if self.getParamAsString('from') == 'mimod':
            value['_cspubResult_'] = self.getParamAsString('result', '')
            stat.incrCspubResult(value['_cspubResult_'])
        resp = db.updateByKey(const.getTable(itemKey), itemKey, value)
        self.response_data = {"_key_": itemKey, "_crawl_": 1, 'resp': resp}
        stat.incrSenderCallback()
        return

    # The placeholder was missing, so the offending command never showed up
    # in the error message.
    raise ValueError("invalid cmd: {}".format(cmd))