示例#1
0
 def __init__(self, db=None):
     """
     Store the database handle and register the "$.ch_info" action.

     db: database object to use; a new mongo.DB() is created when None.
     """
     self.db = mongo.DB() if db is None else db
     self.xpath_actions = []
     # Route the "$.ch_info" path to the challenge-detail handler.
     self.register_action("$.ch_info", self.handleChallangeDetail)
示例#2
0
 def __init__(self, db=None):
     """
     Store the database handle and register the root ("$") action.

     db: database object to use; a new mongo.DB() is created when None.
     """
     self.db = mongo.DB() if db is None else db
     self.xpath_actions = []
     # The root path "$" is routed to self.handler.
     self.register_action("$", self.handler)
 def __init__(self, db=None):
     """
     Store the database handle and register the category-list actions.

     db: database object to use; a new mongo.DB() is created when None.
     """
     self.db = mongo.DB() if db is None else db
     self.xpath_actions = []
     register = self.register_action
     # Disabled registration, kept for reference:
     #register("$.category_list[*].aweme_list[*].video.play_addr.url_list", self.replace_https)
     register("$.category_list[*].challenge_info", self.handleChallengeInfo)
     register("$.category_list[*].aweme_list", self.handleAwemeList)
     # Disabled registration, kept for reference:
     #register(None, self.add_extend_fileds)
示例#4
0
 def work(self):
     """
         Fetch the record addressed by the 'key' request parameter and
         respond with its author-detail transformation.

         The adapter module is chosen from the second '-'-separated part
         of the key ('libs.adapter.adapter_<part>').
     """
     log.notice("in ItemInfoHandler handler")
     key = self.checkParamAsString('key')
     database = mongo.DB()
     table_name = const.getTable(key)
     object_id = dbtools.get_object_id_by_key(key)
     record = database.getOne(table_name, object_id)
     # Resolve the per-provider adapter module dynamically.
     module_suffix = str(key.split("-")[1])
     adapter_module = __import__('libs.adapter.adapter_' + module_suffix,
                                 fromlist=["libs.adapter"])
     self.response_data = adapter_module.transformAuthorDetail(record)
示例#5
0
 def __init__(self, db=None):
     """
     Constructor.

     db: database object to use; a new mongo.DB() is created when None.
     All other attributes start out empty ('' / None / {}).
     """
     self.db = mongo.DB() if db is None else db
     # Source and type labels start as empty strings.
     self.dataSource = self.dataType = ''
     # Third-party ids are unknown until assigned.
     self.user3rdId = self.video3rdId = self.topic3rdId = None
     self.data = {}
示例#6
0
 def work(self):
     """
         Look up an author by the 'ksid' request parameter and respond with
         the author record plus the number of its 'kuaishou' videos.

         When no author matches, the response is a single 'notice' entry.
     """
     log.notice("in CmsHandler handler")
     ksid = self.checkParamAsString('ksid')
     db = mongo.DB()
     matches = db.find('m_author', {'profile.kwaiId': ksid}, 1)
     if matches is not None and len(matches) != 0:
         author = matches[0]
         uid = adaptertool.getUid(author)
         videos = db.getCollection('m_video')
         # NOTE(review): if this is a PyMongo cursor, Cursor.count() is
         # deprecated/removed in newer driver versions - confirm the version.
         cursor = videos.find({
             '_dataSource_': 'kuaishou',
             '_user3rdId_': str(uid)
         })
         video_total = cursor.count()
         self.response_data = {
             "_video_count_": video_total,
             '_authorInfo_': author
         }
     else:
         self.response_data = {"notice": "未收录作者信息"}
示例#7
0
    def work(self):
        """
            Respond with the stored item addressed by the 'key' parameter.

            The key is split by dbtools.get_key_info into
            (itemType, provider, thirdId, version).

            - If no record is found, the response is None.
            - VIDEO items get the author's record attached as
              '_authorInfo_' and a '_callback_' URL, and are then passed
              through adaptertool.transform.
            - AUTHOR items are passed through the provider adapter's
              transformAuthorDetail.

            Raises:
                ValueError: when a VIDEO item has no author record, or when
                    itemType is neither "VIDEO" nor "AUTHOR".
        """
        log.notice("in ItemInfoHandler handler")
        key = self.checkParamAsString('key')
        db = mongo.DB()
        table = const.getTable(key)
        itemType, provider, thirdId, version = dbtools.get_key_info(key)
        resp = db.getOne(table, dbtools.get_object_id_by_key(key))
        if resp is None:
            self.response_data = resp
            return
        # The adapter module is selected by the second '-'-separated key part.
        adapter = __import__('libs.adapter.adapter_' + str(key.split("-")[1]),
                             fromlist=["libs.adapter"])
        if itemType == "VIDEO":
            uid = adaptertool.getUid(resp)
            authorKey = "AUTHOR-{}-{}-1".format(provider, uid)
            authorInfo = db.getOne(const.getTable(const.DATA_TYPE_AUTHOR),
                                   authorKey, '_key_')
            if authorInfo is None:
                # A video without its author record cannot be served.
                log.fatal("no author info for key:{}".format(key))
                raise ValueError("no author meta")
            resp['_authorInfo_'] = authorInfo
            resp['_callback_'] = "http://" + conftool.randomChoice(
                CALLBACK_HOST,
                CALLBACK_PORT) + "/job?cmd=callback&_key_=" + key
            resp = adaptertool.transform(key, resp)
        elif itemType == "AUTHOR":
            resp = adapter.transformAuthorDetail(resp)
        else:
            raise ValueError("Invalid itemType")

        self.response_data = resp
        log.notice("get iteminfo: {},{},{},{}".format(itemType, provider,
                                                      thirdId, version))
示例#8
0
 def __init__(self):
     """Run the UrlProvider initializer, then attach a new mongo.DB() as self.db."""
     urlprovider.UrlProvider.__init__(self)
     self.db = mongo.DB()
示例#9
0
 def __init__(self, table, condition, fields=None):
     """
     Remember the query description and open a database handle.

     table:     stored as self.table.
     condition: stored as self.condition.
     fields:    stored as self.fields (None by default).
     """
     self.condition, self.table, self.fields = condition, table, fields
     self.db = mongo.DB()
     # No id has been recorded yet.
     self.lastId = None
示例#10
0
 def work(self):
     """
         Dispatch a job request according to the 'cmd' parameter.

         cmd == "get":
             Pop one item key from the priority queue, record it in the
             backup queue, and respond with the item's stored data
             (author info, callback URL, priority and, when available,
             topic info attached). If the queue is empty the response is
             a 'notice'. If the video is already crawled, the record is
             marked as such and a short status is returned. Any failure
             is logged and reported as {"_key_": ..., "error": ...};
             '_key_' is None when the failure happened before a key was
             dequeued.
         cmd == "add":
             Enqueue '_key_' with 'priority' (default 10000).
         cmd == "callback":
             Mark '_key_' as crawled and update the statistics counters.

         Raises:
             ValueError: for any other cmd value.
     """
     log.notice("in JobHandler handler")
     cmd = self.getParamAsString('cmd')
     if cmd == "get":
         # Pop one item from the queue.
         # Bound before the try so the except handler can always report
         # it, even when the queue itself fails before returning a key.
         itemKey = None
         try:
             q = queue.JobPriorityQueue()
             itemKey, priority = q.deQueue(True)
             if itemKey is False:
                 self.response_data = {"notice": "queue empty"}
                 return
             self.response_data = {"_key_": itemKey}
             queueBack = queue.JobBackupQueue()
             queueBack.enQueue(itemKey, time.time())
             _, provider, thirdId, _ = dbtools.get_key_info(itemKey)
             isCrawled = spider_ucptool.isVideoCrawled("{}_{}".format(provider, thirdId))
             db = mongo.DB()
             if isCrawled:
                 # Already crawled: only refresh the status and time.
                 insertVal = {}
                 insertVal["_crawl_"] = const.CRAWL_STATUS_OK
                 insertVal["_utime_"] = int(time.time())
                 db.updateByKey(const.getTable(itemKey), itemKey, insertVal)
                 self.response_data = {"_key_": itemKey, "_crawl_": const.CRAWL_STATUS_OK}
                 return
             data = db.getOne(const.getTable(itemKey), itemKey, '_key_')
             uid = adaptertool.getUid(data)
             authorKey = "AUTHOR-{}-{}-1".format(provider, uid)
             data['_authorInfo_'] = db.getOne(const.getTable(const.DATA_TYPE_AUTHOR), authorKey, '_key_')
             data['_callback_'] = "http://" + conftool.randomChoice(CALLBACK_HOST, CALLBACK_PORT) + "/job?cmd=callback&_key_=" + itemKey
             data['_priority_'] = priority
             if len(data.get('_topic3rdId_', '')) > 0:
                 # Topic info is best-effort: a failure here must not
                 # prevent the job from being handed out.
                 try:
                     topicKey = "TOPIC-{}-{}-1".format(provider, data['_topic3rdId_'])
                     topicInfo = db.getOne(const.getTable('TOPIC'), topicKey, '_key_')
                     data['microVideoTopic'] = adaptertool.transform(topicKey, topicInfo)['microVideoTopic']
                 except Exception as e:
                     # NOTE(review): other log calls here pass one
                     # pre-formatted string; confirm log.warning accepts
                     # the exception as a second positional argument.
                     log.warning("error_get_microVideoTopic", e)

             self.response_data = data
             log.notice("pop one not crawled:{}".format(itemKey))
         except Exception as e:
             log.fatal("error_get_job_fromqueue={}, _key_={}".format(e, itemKey))
             self.response_data = {"_key_": itemKey, "error": str(e)}
         return
     if cmd == "add":
         itemKey = self.checkParamAsString('_key_')
         priority = self.getParamAsInt('priority', 10000)
         q = queue.JobPriorityQueue()
         resp = q.enQueue(itemKey, priority)
         self.response_data = resp
         return
     if cmd == "callback":
         itemKey = self.checkParamAsString('_key_')
         log.notice("got a callback:{}".format(itemKey))
         db = mongo.DB()
         stat = statistics.Statistics()
         value = {}
         value["_crawl_"] = 1
         value["_utime_"] = int(time.time())
         if self.getParamAsString('from') == 'mimod':
             value['_cspubResult_'] = self.getParamAsString('result', '')
             stat.incrCspubResult(value['_cspubResult_'])
         resp = db.updateByKey(const.getTable(itemKey), itemKey, value)
         self.response_data = {"_key_": itemKey, "_crawl_": 1, 'resp': resp}
         stat.incrSenderCallback()
         return
     # The placeholder was missing, so the bad cmd never reached the message.
     raise ValueError("invalid cmd: {}".format(cmd))