def handleUserDetail(self, root, data, urlPack):  # @UnusedVariable
        """Save every video of an author's video-list page and queue the next page.

        Args:
            root: not used by this handler.
            data: mapping that provides "max_cursor", "aweme_list" and
                "has_more"; every aweme entry must provide "aweme_id" and
                "author_user_id".
            urlPack: not used by this handler.

        When "has_more" equals 1, an AUTHOR message carrying the next cursor
        is published for the author of the last video on the page. If the
        page holds no videos there is no author id to continue with, so the
        follow-up message is skipped instead of failing on an unbound name.
        """
        cursor = data["max_cursor"]
        aweme_list = data["aweme_list"]
        # Stays None when the page carries no videos at all.
        uid = None
        for aweme in aweme_list:
            vid = aweme["aweme_id"]
            uid = aweme["author_user_id"]
            obj = dbtools.MongoObject()
            obj.setMeta(const_douyin.DATA_TYPE_VIDEO,
                        const_douyin.DATA_PROVIDER,
                        vid,
                        version=const_douyin.DATA_VERSION)
            obj.setData(aweme)
            obj.setUserId(uid)
            obj.save(const_douyin.MONGO_TABLE_VIDEO)
            log.debug("DouyinAuthorVideoSaver Inserting obj {}".format(
                obj.getLastObjectId()))
            self.addStatObject(obj.getLastObjectId(),
                               const_douyin.DATA_TYPE_VIDEO)

        if data["has_more"] != 1:
            log.debug("DouyinAuthorVideoSaver: no more!")
            return

        if uid is None:
            # The response claims more pages but carried no video to take the
            # author id from; nothing sensible can be published.
            log.debug("DouyinAuthorVideoSaver: has_more set but aweme_list "
                      "is empty, skip next page")
            return

        msg = Message(const_douyin.DATA_TYPE_AUTHOR, uid)
        msg.setExtra("cursor", cursor)
        self.publish(msg)
        return
 def handler(self, root, users, urlPack):  # @UnusedVariable
     """Store newly seen users from a search result page and fan out follow-up work.

     For each user whose AUTHOR object was not updated within 365 days:
     publish an AUTHOR message, save the user record, register a stat
     object, and enqueue the user on the 'KuaiShouAuthorDetailPipeCspub'
     pipe. While the page cursor is at most 10 the user is also enqueued
     on the 'KuaiShouAuthorVideosPipeCspub' pipe.

     If ``users`` is a non-empty list, the handler sleeps 10 seconds and
     then publishes a keyword message for the next page.

     Args:
         root: not used by this handler.
         users: iterable of user mappings, each providing 'user_id'.
         urlPack: request context; supplies 'pcursor' and 'keyword'.
     """
     log.debug("return users len:[{}]".format(len(users)))
     for user in users:
         user_id = user['user_id']
         author_key = dbtools.gen_object_key('AUTHOR', 'kuaishou', user_id)
         if self.db.isObjectUpdatedRecently(const.getTable('AUTHOR'), author_key, 365 * 86400):
             continue

         log.debug("search result, pcursor={}, user_id={}".format(urlPack.getKey('pcursor'), user_id))
         self.publish(Message(const.DATA_TYPE_AUTHOR, user_id))

         record = dbtools.MongoObject()
         record.setMeta(const.DATA_TYPE_AUTHOR, const_kuaishou.DATA_PROVIDER, user_id)
         record.setData(user)
         record.save()
         log.debug("KuaiShouSearchUserSaver Inserting obj {}".format(record.getLastObjectId()))
         self.addStatObject(record.getLastObjectId(), const_kuaishou.DATA_TYPE_VIDEO)

         # Author detail pipe.
         detail_pipe = self.pipe.stat.getPipeByName('KuaiShouAuthorDetailPipeCspub')
         detail_pipe.addMessageObject(Message('AUTHOR', user_id))

         # Author videos pipe, only for the first pages of the search.
         if int(urlPack.getKey('pcursor')) <= 10:
             videos_pipe = self.pipe.stat.getPipeByName('KuaiShouAuthorVideosPipeCspub')
             videos_pipe.addMessageObject(Message('AUTHOR', user_id))

     if not (type(users) is list and len(users) > 0):
         return

     time.sleep(10)
     next_page = Message(const_kuaishou.DATA_TYPE_KEYWORD, urlPack.extra['keyword'])
     next_page.addKey('pcursor', int(urlPack.extra['pcursor']) + 1)
     self.publish(next_page)
     log.debug("publish to next page: {}".format(self.pipe.name))
     return
示例#3
0
    def work(self):
        """Handle a spider control request: save/load a pipe or enqueue a message.

        The target pipe, message type, message data and priority come either
        from a stored sample (request parameter 's', looked up in
        ./data/spider_add.samples) or directly from the request parameters
        'pipe', 'msg_type', 'msg_data' and 'priority'.

        The outcome is written to ``self.response_data``:
          * unknown pipe: an error mapping with "added": 0 and "errmsg";
          * cmd == "save" / "load": whatever ``pipe.save()`` / ``pipe.load()``
            returns;
          * otherwise: the value returned by ``pipe.addMessageObject`` under
            "added", together with the message type and data.
        """
        log.notice("in SpiderControlHandler handler")
        sampleId = self.getParamAsString('s')
        if sampleId:
            samples = util.load_file_asdict("./data/spider_add.samples", 0,
                                            ":")
            params = util.qs_to_dict(samples[sampleId][0][1])
            pipeName = params["pipe"]
            msgType = params["msg_type"]
            msgData = params["msg_data"]
            # NOTE(review): a priority taken from the sample is whatever
            # qs_to_dict yields (possibly a string), while the request path
            # below yields an int -- confirm consumers accept both.
            priority = params.get("priority", 0)

        else:
            pipeName = self.checkParamAsString("pipe")
            msgType = self.checkParamAsString("msg_type")
            msgData = self.checkParamAsString("msg_data")
            priority = self.getParamAsInt("priority", 0)

        pipe = self.statistics.getPipeByName(pipeName)
        cmd = self.getParamAsString("cmd")

        # Validate the pipe before any command touches it; otherwise
        # cmd=save/load on an unknown pipe would fail with AttributeError
        # instead of producing the error response.
        if pipe is None:
            self.response_data = {
                "added": 0,
                "errmsg": "pipe {} not exist".format(pipeName),
                "msg_type": msgType,
                "msg_data": msgData
            }
            return

        if cmd == "save":
            self.response_data = pipe.save()
            return
        if cmd == "load":
            self.response_data = pipe.load()
            return

        pipeLine = self.getParamAsInt('pipeline', 0)
        msg = Message(msgType, msgData)
        msg.setExtra('priority', priority)
        msg.setExtra('pipeLine', pipeLine)
        qsize = pipe.addMessageObject(msg)
        self.response_data = {
            "added": qsize,
            "msg_type": msgType,
            "msg_data": msgData
        }
    def handleChallengeInfo(self, root, dataDict, urlPack):  # @UnusedVariable
        """Store topics found by a keyword search and request the next result page.

        Challenges with fewer than 10 users are ignored. A remaining topic is
        saved, published and counted only when its object was not updated
        within the last day (86400 seconds).

        Args:
            root: not used by this handler.
            dataDict: mapping providing 'challenge_list' (entries with a
                'challenge_info' mapping holding 'user_count' and 'cid') and
                'has_more'.
            urlPack: request context; supplies 'keyword' and 'cursor'.
        """
        for entry in dataDict['challenge_list']:
            info = entry['challenge_info']
            if info['user_count'] < 10:
                continue

            topic = dbtools.MongoObject(self.db)
            topic.setMeta(const_douyin.DATA_TYPE_TOPIC,
                          const_douyin.DATA_PROVIDER, info["cid"])
            topic.setData(info)

            # Skip topics that were already updated within the last day.
            if self.db.isObjectUpdatedRecently(
                    const_douyin.MONGO_TABLE_TOPIC, topic.key, 86400):
                continue

            topic.save(const_douyin.MONGO_TABLE_TOPIC)
            log.debug(
                "DouyinTopicByKeywordSaver Inserting obj _key_={}, user_count={}"
                .format(topic.key, info['user_count']))
            self.publish(Message(const_douyin.DATA_TYPE_TOPIC, info["cid"]))
            self.addStatObject(topic.getLastObjectId(), "TOPIC")

        if not dataDict['has_more'] > 0:
            return

        keyword = urlPack.getKey('keyword')
        next_page = Message(const_douyin.DATA_TYPE_TOPIC_KEYWORD, keyword)
        next_page.setExtra('keyword', keyword)
        # The next page starts 20 entries after the current cursor.
        next_page.setExtra('cursor', urlPack.getKey('cursor', 0) + 20)
        self.publish(next_page)
        return
示例#5
0
    def handler(self, root, data, urlPack):  # @UnusedVariable
        """Save videos from a Huoshan main-feed response and queue stale authors.

        Each feed entry is stored as a VIDEO object unless
        ``self.db.isItemUpdatedRecently`` reports its key as recently
        updated. The entry's author is published as an AUTHOR message when
        the author key was not updated within three days.

        Args:
            root: not used by this handler.
            data: mapping whose "data" list holds entries shaped like
                {'data': {'id': ..., 'author': {'id': ...}}}.
            urlPack: not used by this handler.
        """
        log.debug("huoshan main_feed_up saver handler, len={}".format(
            len(data["data"])))
        for info in data["data"]:
            vid = str(info['data']['id'])
            uid = str(info['data']['author']['id'])
            # Store the video directly.
            obj = dbtools.MongoObject()
            obj.setMeta("VIDEO", const_huoshan.DATA_PROVIDER, vid)
            obj.setUserId(uid)
            obj.setData(info)
            if not self.db.isItemUpdatedRecently(obj.key):
                obj.save()
                log.debug(
                    "Inserting obj from HuoshanMainFeedUp video: {}".format(
                        obj.getLastObjectId()))
            else:
                # Nothing was saved in this branch, so identify the video by
                # its key rather than by a last-saved object id.
                log.debug(
                    "HuoshanMainFeedUp video: {} already inserted".format(
                        obj.key))

            # Publish the uid when the author has not been updated for three
            # days or more.
            authorKey = dbtools.gen_object_key('AUTHOR',
                                               const_huoshan.DATA_PROVIDER,
                                               uid)
            if not self.db.isItemUpdatedRecently(authorKey, 3 * 86400):
                self.addStatObject(authorKey, const_huoshan.DATA_TYPE_AUTHOR)
                msg = Message(const_huoshan.DATA_TYPE_AUTHOR, uid)
                self.pipe.publish(msg)
            else:
                log.debug("author updated recently")

        return
 def handleChallengeInfo(self, root, data, urlPack):  # @UnusedVariable
     """Publish a TOPIC message for a challenge that was not updated recently.

     Args:
         root: not used by this handler.
         data: challenge mapping; must provide "cid".
         urlPack: not used by this handler.
     """
     # Format the payload into the message, as every other log call in this
     # file does, instead of passing it as a stray positional argument.
     log.debug("handleChallengeInfo {}".format(data))
     obj = dbtools.MongoObject(self.db)
     obj.setMeta(const_douyin.DATA_TYPE_TOPIC, const_douyin.DATA_PROVIDER, data["cid"])
     if not self.db.isObjectUpdatedRecently(const_douyin.MONGO_TABLE_TOPIC, obj.key):
         self.publish(Message(const_douyin.DATA_TYPE_TOPIC, data["cid"]))
     return
示例#7
0
    def handler(self, root, data, urlPack):  # @UnusedVariable
        """Save videos from a Kuaishou main-feed response and refresh stale authors.

        Each feed is stored as a VIDEO object unless its key was updated
        recently. When the feed's author key was not updated within three
        days, a bare AUTHOR object is saved, a stat is recorded and an AUTHOR
        message is published on the pipe.

        Args:
            root: not used by this handler.
            data: mapping whose "feeds" list holds entries providing
                'photo_id' and 'user_id'.
            urlPack: not used by this handler.
        """
        feeds = data["feeds"]
        log.debug("kuaishou main feed saver handler, len={}".format(
            len(feeds)))
        for feed in feeds:
            video_id = str(feed['photo_id'])
            user_id = str(feed['user_id'])

            # Store the video directly.
            video = dbtools.MongoObject()
            video.setMeta("VIDEO", const_kuaishou.DATA_PROVIDER, video_id)
            video.setUserId(user_id)
            video.setData(feed)
            if not self.db.isItemUpdatedRecently(video.key):
                video.save()
                log.debug("Inserting obj from KuaishouMainFeed: {}".format(
                    video.getLastObjectId()))

            # Publish the uid when the author has not been updated for three
            # days or more.
            author_key = dbtools.gen_object_key(
                'AUTHOR', const_kuaishou.DATA_PROVIDER, user_id)
            if self.db.isItemUpdatedRecently(author_key, 3 * 86400):
                log.notice("kuaishou author updated recently")
                continue

            author = dbtools.MongoObject()
            author.setMeta("AUTHOR", const_kuaishou.DATA_PROVIDER, user_id)
            author.save()
            self.addStatObject(author_key, const_kuaishou.DATA_TYPE_AUTHOR)
            self.pipe.publish(Message(const_kuaishou.DATA_TYPE_AUTHOR, user_id))

        return
 def handler(self, root, dataList, urlPack):  # @UnusedVariable
     """Save every video of a Huoshan author's video list and queue the next page.

     Args:
         root: full response; ``root['extra']`` provides 'has_more' and
             'max_time'.
         dataList: iterable of entities whose 'data' mapping provides 'id'
             and 'author' (with 'id'). The 'author' entry is removed from
             each mapping before it is stored.
         urlPack: not used by this handler.

     When more pages exist, an AUTHOR message carrying 'max_time' is
     published for the author of the last entity. If the page holds no
     entities there is no author id to continue with, so the follow-up
     message is skipped instead of failing on an unbound name.
     """
     # Stays None when the page carries no entities at all.
     authorId = None
     for entity in dataList:
         data = entity['data']
         authorId = data['author']['id']
         # The author details are not stored with the video.
         del data["author"]
         obj = dbtools.MongoObject()
         obj.setMeta(const.DATA_TYPE_VIDEO, const_huoshan.DATA_PROVIDER, data["id"])
         obj.setData(data)
         obj.setUserId(authorId)
         obj.save()
         log.debug("HuoshanAuthorVideoListSaver Inserting obj {}".format(obj.getLastObjectId()))
         self.addStatObject(obj.getLastObjectId(), const.DATA_TYPE_VIDEO)
     if root['extra']['has_more']:
         if authorId is None:
             log.debug("HuoshanAuthorVideoListSaver: has_more set but data list is empty, skip next page")
             return
         msg = Message(const.DATA_TYPE_AUTHOR, authorId)
         msg.setExtra('max_time', root['extra']['max_time'])
         self.publish(msg)
     return
 def handler(self, root, data, urlPack):  # @UnusedVariable
     """Store every tag of a tag-search response and publish one message per tag.

     Each tag is saved as a TOPIC object keyed by the MD5 of its stripped
     name, then published as a tag-name message carrying that key as
     "topic_id". The handler sleeps 40 seconds after every tag except the
     last one.

     Args:
         root: not used by this handler.
         data: mapping whose "tags" list holds entries providing "tag".
         urlPack: not used by this handler.
     """
     tags = data['tags']
     log.debug("got tag len={}".format(len(tags)))
     last_index = len(tags) - 1
     for index, tag_info in enumerate(tags):
         tag_name = tag_info["tag"].strip()
         topic_id = util.md5(tag_name)

         topic = dbtools.MongoObject()
         topic.setMeta(const.DATA_TYPE_TOPIC, const_kuaishou.DATA_PROVIDER, topic_id, version=const_kuaishou.DATA_VERSION)
         topic.setData(tag_info)
         topic.save()
         log.debug("KuaiShouSearchTagSaver Inserting obj {}, tag={}".format(topic.getLastObjectId(), tag_name))
         self.addStatObject(topic.getLastObjectId(), const.DATA_TYPE_TOPIC)

         # The message carries the tag exactly as received (not stripped).
         tag_msg = Message(const_kuaishou.DATA_TYPE_TAG_NAME, tag_info["tag"])
         tag_msg.setExtra("topic_id", topic_id)
         self.publish(tag_msg)

         # Pause between tags, but not after the final one.
         if index != last_index:
             time.sleep(40)
     return
 def handler(self, root, data, urlPack):  # @UnusedVariable
     """Save every video of a Kuaishou author's feed page and queue the next page.

     Each feed is stamped with the current time under the pipe's name and
     stored as a VIDEO object. When the page is not empty, an AUTHOR message
     carrying the response's "pcursor" is published for the author of the
     last feed.

     Args:
         root: not used by this handler.
         data: mapping providing "feeds" (entries with "photo_id" and
             'user_id') and "pcursor".
         urlPack: not used by this handler.
     """
     feeds = data["feeds"]
     pcursor = data["pcursor"]
     for feed in feeds:
         # Record when this pipe processed the feed.
         feed[self.pipe.name] = int(time.time())
         video = dbtools.MongoObject()
         video.setMeta(const_kuaishou.DATA_TYPE_VIDEO, const_kuaishou.DATA_PROVIDER, feed["photo_id"])
         video.setData(feed)
         authorId = feed['user_id']
         video.setUserId(authorId)
         video.save(const_kuaishou.MONGO_TABLE_VIDEO)
         log.debug("KuaiShouAuthorVideoListSaver Inserting obj {}".format(video.getLastObjectId()))
         self.addStatObject(video.getLastObjectId(), const_kuaishou.DATA_TYPE_VIDEO)
     log.debug("KuaiShouAuthorVideoListSaver feed length: {}, pcursor: {}".format(len(feeds), pcursor))
     if len(feeds) == 0:
         return
     next_page = Message(const_kuaishou.DATA_TYPE_AUTHOR, authorId)
     next_page.setExtra("pcursor", pcursor)
     self.publish(next_page)
     return
 def handleAwemeList(self, root, data, urlPack):  # @UnusedVariable
     """Save each aweme in ``data`` as a VIDEO object and publish a VIDEO message for it.

     Args:
         root: not used by this handler.
         data: iterable of aweme mappings, each providing "aweme_id".
         urlPack: not used by this handler.
     """
     for aweme in data:
         aweme_id = aweme["aweme_id"]
         video = dbtools.MongoObject()
         video.setMeta(const_douyin.DATA_TYPE_VIDEO, const_douyin.DATA_PROVIDER, aweme_id, version=self.DATA_VERSION)
         video.setData(aweme)
         video.save()
         saved_id = video.getLastObjectId()
         log.debug("DouyinTopicSaver Insert obj {}".format(saved_id))
         self.addStatObject(video.getLastObjectId(), const_douyin.DATA_TYPE_VIDEO)
         follow_up = Message(const_douyin.DATA_TYPE_VIDEO, aweme["aweme_id"])
         self.publish(follow_up)
     return
示例#12
0
    def handler(self, root, data, urlPack):  # @UnusedVariable
        """Process one page of a shared-tag feed: refresh authors, queue unknown videos.

        For every feed: the author object is saved and published when its key
        was not updated within three days, and a VIDEO message is published
        when ``self.db.getOne`` finds no stored object for the video key.
        Unless the cursor is "no_more", a tag-name message for the next page
        is published and the handler then sleeps 60 seconds.

        Args:
            root: not used by this handler.
            data: mapping providing "feeds" (entries with 'userId' and
                "photoId") and "pcursor".
            urlPack: request context; supplies "tag" and "topic_id".
        """
        feeds = data["feeds"]
        pcursor = data["pcursor"]
        tag = urlPack.getKey("tag")
        log.debug("KuaiShouShareTagSaver tag:{}, feed length: {}, pcursor: {}".
                  format(tag, len(feeds), pcursor))
        for feed in feeds:
            # Record when this pipe processed the feed.
            feed[self.pipe.name] = int(time.time())

            author_id = feed['userId']
            author = dbtools.MongoObject(db=self.db)
            author.setMeta(const_kuaishou.DATA_TYPE_AUTHOR,
                           const_kuaishou.DATA_PROVIDER, author_id)
            author.setData(feed)
            if self.db.isItemUpdatedRecently(author.key, 3 * 86400):
                log.debug("skip user_id:{}".format(author_id))
            else:
                author.save()
                self.publish(Message(const_kuaishou.DATA_TYPE_AUTHOR, author_id))

            # The video object is only built to derive its key for the lookup.
            video_id = feed["photoId"]
            video = dbtools.MongoObject(db=self.db)
            video.setMeta(const_kuaishou.DATA_TYPE_VIDEO,
                          const_kuaishou.DATA_PROVIDER, video_id)
            stored = self.db.getOne(const.getTable(const.DATA_TYPE_VIDEO),
                                    video.key)
            if not stored:
                self.publish(Message(const_kuaishou.DATA_TYPE_VIDEO, video_id))

        if pcursor == "no_more":
            return

        next_page = Message(const_kuaishou.DATA_TYPE_TAG_NAME, tag)
        next_page.setExtra("topic_id", urlPack.getKey("topic_id"))
        next_page.setExtra("pcursor", pcursor)
        self.publish(next_page)
        time.sleep(60)

        return
 def handleUserDetail(self, root, data, urlPack):  # @UnusedVariable
     """Save a Douyin author's detail record and publish an AUTHOR message for it.

     Args:
         root: not used by this handler.
         data: mapping whose "user" entry provides "uid".
         urlPack: not used by this handler.
     """
     uid = data["user"]["uid"]

     author = dbtools.MongoObject()
     author.setMeta(const_douyin.DATA_TYPE_AUTHOR,
                    const_douyin.DATA_PROVIDER,
                    uid,
                    version=const_douyin.DATA_VERSION)
     author.setData(data["user"])
     author.save(const_douyin.MONGO_TABLE_AUTHOR)

     log.debug("DouyinAuthorDetailSaver Inserting obj {}".format(
         author.getLastObjectId()))
     self.addStatObject(author.getLastObjectId(),
                        const_douyin.DATA_TYPE_AUTHOR)

     follow_up = Message(const_douyin.DATA_TYPE_AUTHOR, uid)
     self.pipe.publish(follow_up)
     return
    def handler(self, root, data, urlPack):  # @UnusedVariable
        """Save one page of a tag's video feed and queue stale authors and the next page.

        Every feed is stamped with the current time under the pipe's name and
        stored as a VIDEO object tagged with the request's topic id. The
        feed's author is published as an AUTHOR message when the author key
        was not updated within three days. Unless the cursor is "no_more", a
        tag-name message for the next page is published and the handler then
        sleeps 60 seconds.

        Args:
            root: not used by this handler.
            data: mapping providing "feeds" (entries with "photo_id" and
                'user_id') and "pcursor".
            urlPack: request context; supplies "tag" and "topic_id".
        """
        feeds = data["feeds"]
        pcursor = data["pcursor"]
        tag = urlPack.getKey("tag")
        log.debug(
            "KuaiShouVideoListSaver tag:{}, feed length: {}, pcursor: {}".
            format(tag, len(feeds), pcursor))
        for feed in feeds:
            # Record when this pipe processed the feed.
            feed[self.pipe.name] = int(time.time())

            video = dbtools.MongoObject()
            video.setMeta(const_kuaishou.DATA_TYPE_VIDEO,
                          const_kuaishou.DATA_PROVIDER,
                          feed["photo_id"],
                          version=const_kuaishou.DATA_VERSION)
            video.setData(feed)
            author_id = feed['user_id']
            video.setUserId(author_id)
            video.setTopicId(urlPack.getKey("topic_id"))
            video.save()
            log.debug("KuaiShouTagFeedSaver Inserting obj {}, tag={}".format(
                video.getLastObjectId(), tag))
            self.addStatObject(video.getLastObjectId(),
                               const_kuaishou.DATA_TYPE_VIDEO)

            # The author object is only built to derive its key for the check.
            author = dbtools.MongoObject(db=self.db)
            author.setMeta(const_kuaishou.DATA_TYPE_AUTHOR,
                           const_kuaishou.DATA_PROVIDER, author_id)
            if self.db.isItemUpdatedRecently(author.key, 3 * 86400):
                log.debug("skip user_id:{}".format(author_id))
            else:
                self.publish(Message(const_kuaishou.DATA_TYPE_AUTHOR, author_id))

        if pcursor == "no_more":
            return

        next_page = Message(const_kuaishou.DATA_TYPE_TAG_NAME, tag)
        next_page.setExtra("topic_id", urlPack.getKey("topic_id"))
        next_page.setExtra("pcursor", pcursor)
        self.publish(next_page)
        time.sleep(60)

        return
示例#15
0
    def handler(self, root, data, urlPack):  # @UnusedVariable
        """Queue stale commenters of a Huoshan video and request the next comment page.

        Every commenter whose AUTHOR key was not updated within three days is
        published as an AUTHOR message. When the response reports more
        comments, a VIDEO message with the offset increased by one is
        published.

        Args:
            root: not used by this handler.
            data: mapping providing data["data"]["comments"] (entries with
                comment["user"]["id"]) and data['extra']['has_more'].
            urlPack: request context; supplies "vid" and "offset".
        """
        comment_list = data["data"]["comments"]
        video_id = urlPack.getKey("vid")
        page_offset = urlPack.getKey("offset")

        for item in comment_list:
            commenter_id = item["user"]["id"]
            log.debug("HuoshanVideoComments get one uid: {}".format(commenter_id))
            author_key = dbtools.gen_object_key(
                'AUTHOR', const_huoshan.DATA_PROVIDER, commenter_id)
            if self.db.isItemUpdatedRecently(author_key, 3 * 86400):
                log.debug("huoshan user_id:{} has already updated".format(
                    commenter_id))
            else:
                self.publish(Message(const_huoshan.DATA_TYPE_AUTHOR, commenter_id))

        if not data['extra']['has_more']:
            return

        next_page = Message(const.DATA_TYPE_VIDEO, video_id)
        next_page.setExtra('offset', page_offset + 1)
        self.publish(next_page)
        return