예제 #1
0
 def findPendingUrls4JinghuaByStatusAndSpiderName(self, spiderName):
     '''
     Look up URLs of featured ("jinghua") travel notes for a spider.

     Per the original note, only the URL is wanted here; the item itself
     is not downloaded.

     Selects documents with 300 <= status < 1001 and priority == 1 from
     the collection mapped to ``spiderName``, passing
     ``sortField='priority'`` to ``mongoApt.find``.

     Returns whatever ``mongoApt.find`` returns (presumably a cursor --
     confirm against mongoApt).
     '''
     statusRange = {"$gte": 300, "$lt": 1001}
     query = {"status": statusRange, "priority": 1}
     dbKey = self.urlDbnamekey
     collection = self.urlCollectionsMap[spiderName]
     # NOTE: no limitNum is passed to this query.
     return mongoApt.find(dbKey,
                          collection,
                          whereJson=query,
                          sortField='priority')
예제 #2
0
 def findUrlsForDupfilter(self, spiderName):
     '''
     Load the URL records used for duplicate filtering.

     The original note says the MD5 values of the URLs are what the
     de-duplication uses. This method itself just queries documents
     with status < 400 from the collection mapped to ``spiderName``,
     passing ``sortField='status'``, and returns the result of
     ``mongoApt.find`` unchanged.
     '''
     query = {"status": {"$lt": 400}}
     dbKey = self.urlDbnamekey
     collection = self.urlCollectionsMap[spiderName]
     return mongoApt.find(dbKey,
                          collection,
                          whereJson=query,
                          sortField='status')
예제 #3
0
 def findUnparsedPageByStatus(self, spiderName):
     '''
     Query pages that are still waiting to be parsed, by status.

     Looks in the 'Page' collection for documents with 0 < status < 200.
     ``spiderName`` is passed as the first argument to
     ``mongoApt.find``. The result of that call is returned as-is.
     '''
     unparsedStatus = {'$lt': 200, '$gt': 0}
     query = {'status': unparsedStatus}
     return mongoApt.find(spiderName, 'Page', whereJson=query)
예제 #4
0
 def findPendingUrlsByStatusAndSpiderName(
         self, spiderName, statusBegin=400, statusEnd=800):
     '''
     Find URLs that were not downloaded or whose download failed, so
     the corresponding spider can be resumed.

     Selects documents with statusBegin <= status < statusEnd
     (defaults: 400 and 800) from the collection mapped to
     ``spiderName``, passing ``sortField='priority'``. Returns the
     result of ``mongoApt.find`` unchanged.
     '''
     statusWindow = {"$gte": statusBegin, "$lt": statusEnd}
     query = {"status": statusWindow}
     dbKey = self.urlDbnamekey
     collection = self.urlCollectionsMap[spiderName]
     # NOTE: no limitNum is passed to this query.
     return mongoApt.find(dbKey,
                          collection,
                          whereJson=query,
                          sortField='priority')
예제 #5
0
 def getRequestsToSupplyPendingreqeust(self, spiderName):
     """
     Load new URLs from the database and turn them into requests that
     replenish the pending-request pool.

     Queries documents with status == 1000 from the collection mapped
     to ``spiderName`` (``sortField='priority'``,
     ``limitNum=self.urlIncreasement``) and builds one request per
     document via ``self.makeRequest``.

     Returns a list of the created requests.
     """
     query = {"status": 1000}
     dbKey = self.urlDbnamekey
     collection = self.urlCollectionsMap[spiderName]
     docs = mongoApt.find(dbKey,
                          collection,
                          whereJson=query,
                          sortField='priority',
                          limitNum=self.urlIncreasement)
     created = []
     for doc in docs:
         created.append(
             self.makeRequest(doc["url"],
                              callBackFunctionName=doc["callBack"],
                              urlId=doc['_id'],
                              priority=doc["priority"]))
         # Second cap on top of limitNum: stop once the batch is full.
         batchFull = len(created) >= self.urlIncreasement
         if batchFull:
             break
     return created
예제 #6
0
 def getRequestWithUpdateStrategy(self, spiderName):
     """
     Build re-crawl requests according to the update strategy.

     Queries documents with status < 400, a matching ``spiderName``
     field and an existing ``updateInterval`` field, passing
     ``sortField='status'``. A request is created for a document only
     when its status is 200 or 304 and its ``dateTime`` is earlier than
     "now minus ``updateInterval`` days".

     When the document contains the key named by
     ``self.updateStrategy``, that value is copied into the request
     meta and an ``If-Modified-Since`` header is set from the
     document's ``dateTime``.

     Returns a list of the created requests.
     """
     query = {
         "status": {"$lt": 400},
         "spiderName": spiderName,
         'updateInterval': {'$exists': True},
     }
     dbKey = self.urlDbnamekey
     collection = self.urlCollectionsMap[spiderName]
     docs = mongoApt.find(dbKey,
                          collection,
                          whereJson=query,
                          sortField='status')
     created = []
     for doc in docs:
         if 'updateInterval' not in doc:
             continue
         if doc['status'] not in (200, 304):
             continue
         # Due only when the last recorded dateTime is older than the
         # cutoff (now - updateInterval days).
         cutoff = datetime.datetime.now() - datetime.timedelta(
             days=doc["updateInterval"])
         if not (cutoff > doc["dateTime"]):
             continue

         meta = {}
         headers = {}
         if 'reference' in doc:
             meta['reference'] = doc['reference']
         strategyKey = self.updateStrategy
         if strategyKey in doc:
             meta[strategyKey] = doc[strategyKey]
             headers['If-Modified-Since'] = self.getGMTFormatDate(
                 doc['dateTime'])
         created.append(
             self.makeRequest(doc["url"],
                              callBackFunctionName=doc["callBack"],
                              meta=meta,
                              urlId=doc['_id'],
                              priority=doc["priority"],
                              headers=headers))
     return created
예제 #7
0
 def findKerwordsForSespider(self):
     '''
     Load the search keywords for the Se spider.

     Calls ``mongoApt.find`` on the 'keyword' collection with no filter
     arguments and returns its result unchanged.
     '''
     dbKey = self.urlDbnamekey
     return mongoApt.find(dbKey, 'keyword')