示例#1
0
def searchRange(casperScript, dep, arr, dateRange, retryTimes):
    """Search one route for each of the next ``dateRange`` days, starting tomorrow.

    Every day is attempted up to ``retryTimes`` times via ``searchOne``; the
    first successful attempt is logged at info level and ends the retries for
    that day.  If the last attempt for a day did not return ``ER_SUCC`` an
    error is logged and the search moves on to the next day.

    Args:
        casperScript: Forwarded unchanged to ``searchOne``.
        dep: Departure descriptor; forwarded to ``searchOne``, ``dep[0]`` is
            used in log messages.
        arr: Arrival descriptor; forwarded to ``searchOne``, ``arr[0]`` is
            used in log messages.
        dateRange: Number of consecutive days to search.
        retryTimes: Maximum number of attempts per day.
    """
    step = datetime.timedelta(days=1)
    day = datetime.datetime.today()
    for _ in range(dateRange):
        day = day + step
        # The formatted date is reused for the query and for both log lines.
        dayText = day.strftime("%Y-%m-%d")

        # Starts as success so that zero attempts does not report a failure.
        status = ER_SUCC
        for _attempt in range(retryTimes):
            status = searchOne(casperScript, dep, arr, dayText)
            if status != ER_SUCC:
                continue
            L.info("{} -> {}  {}", dep[0], arr[0], dayText)
            break

        if status == ER_SUCC:
            continue
        L.error("retry {} times, {} -> {}  {} failed", retryTimes, dep[0], arr[0], dayText)
示例#2
0
def searchRange(casperScript, dep, arr, dateRange, retryTimes):
    """Search one route for each of the next ``dateRange`` days, starting tomorrow.

    Every day is attempted up to ``retryTimes`` times via ``searchOne``; the
    first successful attempt is logged at info level and ends the retries for
    that day.  If the last attempt for a day did not return ``ER_SUCC`` an
    error is logged and the search moves on to the next day.

    Args:
        casperScript: Forwarded unchanged to ``searchOne``.
        dep: Departure descriptor; forwarded to ``searchOne``, ``dep[0]`` is
            used in log messages.
        arr: Arrival descriptor; forwarded to ``searchOne``, ``arr[0]`` is
            used in log messages.
        dateRange: Number of consecutive days to search.
        retryTimes: Maximum number of attempts per day.
    """
    step = datetime.timedelta(days=1)
    day = datetime.datetime.today()
    for _ in range(dateRange):
        day = day + step
        # The formatted date is reused for the query and for both log lines.
        dayText = day.strftime("%Y-%m-%d")

        # Starts as success so that zero attempts does not report a failure.
        status = ER_SUCC
        for _attempt in range(retryTimes):
            status = searchOne(casperScript, dep, arr, dayText)
            if status != ER_SUCC:
                continue
            L.info("{} -> {}  {}", dep[0], arr[0], dayText)
            break

        if status == ER_SUCC:
            continue
        L.error("retry {} times, {} -> {}  {} failed", retryTimes, dep[0],
                arr[0], dayText)
示例#3
0
 def crawlAllAirlinesWithRetry(self, retryTime):
     """Crawl all airlines once, then retry the failed routes round by round.

     After ``self.crawlAllAirlines()`` runs, the routes found in
     ``self.lstFailAirline`` are retried for at most ``retryTime`` rounds.
     Each entry is indexed as ``(depInfo, arrInfo)`` and is re-crawled with
     tomorrow as the start date.

     Args:
         retryTime: Maximum number of retry rounds.

     Returns:
         ``ER_SUCC`` once no failed route remains.  If routes are still
         failing after the last round, each one is logged as an error and
         the method falls through, returning ``None``.
     """
     self.crawlAllAirlines()

     # Work on a copy so the retry bookkeeping never aliases the instance list.
     pending = list(self.lstFailAirline)
     for attempt in range(1, retryTime + 1):
         if not pending:
             return ER_SUCC

         # A fresh list per round is essential: reusing one list across
         # rounds would append to the very list being iterated (endless loop
         # while a route keeps failing) and carry over stale failures.
         stillFailing = []
         for route in pending:
             L.info("retry[{}] {} -> {}", attempt, route[0][0], route[1][0])
             startDate = datetime.datetime.today() + datetime.timedelta(days=1)
             if ER_SUCC != self.crawlOneAirline(route[0], route[1], startDate):
                 stillFailing.append(route)

         pending = stillFailing

     # The final round may have cleared every remaining failure.
     if not pending:
         return ER_SUCC

     for route in pending:
         L.error("{} -> {} retry {} times failed", route[0][0], route[1][0], retryTime)
示例#4
0
    def crawlOneAirline(self, depInfo, arrInfo, startDate):
        """Fetch the lowest-price forecast for one route and store each complete record.

        Args:
            depInfo: Departure descriptor; ``depInfo[0]`` is used in logs and
                stored records, ``depInfo[1]`` is placed in the request URL.
            arrInfo: Arrival descriptor, used the same way as ``depInfo``.
            startDate: Value formatted into the ``departureDate`` query
                parameter.

        Returns:
            ``ER_REQUEST_TIMEOUT`` if the HTTP request raises,
            ``ER_RESPONSE_FAIL`` if the status code is not 200 or the
            response has no ``ResultData`` element, otherwise ``ER_SUCC``.
        """
        curDateTime = time.localtime(time.time())
        queryDate = time.strftime('%Y-%m-%d', curDateTime)
        queryTime = time.strftime('%H:%M:%S', curDateTime)
        urlBase = "http://flight.qunar.com/twell/flight/farecast.jsp?departureCity={}&arrivalCity={}&nextNDays=0&departureDate={}&searchType=OnewayFlight&searchLangs=zh&locale=zh&serverIP=twell4&allowOld=true&queryID=127.0.0.1%3A1c1ea29%3A113aed2be0b%3A-7bfb&dayNum={}&pageNum=0"
        url = urlBase.format(depInfo[1], arrInfo[1], startDate, self.dateRange)
        try:
            r = requests.get(url, timeout=10)
        except Exception as e:
            # Any request failure is reported with the same code so that the
            # caller can schedule a retry.
            L.error(e)
            L.error("{} -> {} timeout, url={}", depInfo[0], arrInfo[0], url)
            return ER_REQUEST_TIMEOUT

        if r.status_code != 200:
            L.error("{} -> {} failed, url={}", depInfo[0], arrInfo[0], url)
            return ER_RESPONSE_FAIL

        L.info("{} -> {}", depInfo[0], arrInfo[0])
        bs = BeautifulSoup(r.text, 'lxml-xml')
        resultData = bs.find('ResultData')
        if resultData is None:
            # find() returns None when the element is missing; report a
            # failed response instead of crashing on ``.children``.
            L.error("{} -> {} failed, no ResultData in response, url={}",
                    depInfo[0], arrInfo[0], url)
            return ER_RESPONSE_FAIL

        # Attributes every <lowestPrice> element must carry to be stored;
        # the order is the order handed to FlightLowestPriceInfo.
        allAttrs = ('date', 'code', 'depTime', 'arrTime', 'carrier',
                    'vendorName', 'price')
        for airline in resultData.children:
            if airline.name != 'lowestPrice':
                continue

            d = airline.attrs
            # Skip incomplete entries rather than storing partial records.
            if not all(attr in d for attr in allAttrs):
                continue

            info = FlightLowestPriceInfo(queryDate, queryTime, depInfo[0],
                                         arrInfo[0],
                                         [d[x] for x in allAttrs])
            self.dbHandle.insertOneRec(info)

        return ER_SUCC
示例#5
0
    def crawlAllAirlinesWithRetry(self, retryTime):
        """Crawl all airlines once, then retry the failed routes round by round.

        After ``self.crawlAllAirlines()`` runs, the routes found in
        ``self.lstFailAirline`` are retried for at most ``retryTime`` rounds.
        Each entry is indexed as ``(depInfo, arrInfo)`` and is re-crawled with
        tomorrow as the start date.

        Args:
            retryTime: Maximum number of retry rounds.

        Returns:
            ``ER_SUCC`` once no failed route remains.  If routes are still
            failing after the last round, each one is logged as an error and
            the method falls through, returning ``None``.
        """
        self.crawlAllAirlines()

        # Work on a copy so the retry bookkeeping never aliases the instance
        # list.
        pending = list(self.lstFailAirline)
        for attempt in range(1, retryTime + 1):
            if not pending:
                return ER_SUCC

            # A fresh list per round is essential: reusing one list across
            # rounds would append to the very list being iterated (endless
            # loop while a route keeps failing) and carry over stale failures.
            stillFailing = []
            for route in pending:
                L.info("retry[{}] {} -> {}", attempt, route[0][0], route[1][0])
                startDate = datetime.datetime.today() + datetime.timedelta(
                    days=1)
                if ER_SUCC != self.crawlOneAirline(route[0], route[1],
                                                   startDate):
                    stillFailing.append(route)

            pending = stillFailing

        # The final round may have cleared every remaining failure.
        if not pending:
            return ER_SUCC

        for route in pending:
            L.error("{} -> {} retry {} times failed", route[0][0], route[1][0],
                    retryTime)
示例#6
0
    def crawlOneAirline(self, depInfo, arrInfo, startDate):
        """Fetch the lowest-price forecast for one route and store each complete record.

        Args:
            depInfo: Departure descriptor; ``depInfo[0]`` is used in logs and
                stored records, ``depInfo[1]`` is placed in the request URL.
            arrInfo: Arrival descriptor, used the same way as ``depInfo``.
            startDate: Value formatted into the ``departureDate`` query
                parameter.

        Returns:
            ``ER_REQUEST_TIMEOUT`` if the HTTP request raises,
            ``ER_RESPONSE_FAIL`` if the status code is not 200 or the
            response has no ``ResultData`` element, otherwise ``ER_SUCC``.
        """
        curDateTime = time.localtime(time.time())
        queryDate = time.strftime('%Y-%m-%d', curDateTime)
        queryTime = time.strftime('%H:%M:%S', curDateTime)
        urlBase = "http://flight.qunar.com/twell/flight/farecast.jsp?departureCity={}&arrivalCity={}&nextNDays=0&departureDate={}&searchType=OnewayFlight&searchLangs=zh&locale=zh&serverIP=twell4&allowOld=true&queryID=127.0.0.1%3A1c1ea29%3A113aed2be0b%3A-7bfb&dayNum={}&pageNum=0"
        url = urlBase.format(depInfo[1], arrInfo[1], startDate, self.dateRange)
        try:
            r = requests.get(url, timeout=10)
        except Exception as e:
            # Any request failure is reported with the same code so that the
            # caller can schedule a retry.
            L.error(e)
            L.error("{} -> {} timeout, url={}", depInfo[0], arrInfo[0], url)
            return ER_REQUEST_TIMEOUT

        if r.status_code != 200:
            L.error("{} -> {} failed, url={}", depInfo[0], arrInfo[0], url)
            return ER_RESPONSE_FAIL

        L.info("{} -> {}", depInfo[0], arrInfo[0])
        bs = BeautifulSoup(r.text, 'lxml-xml')
        resultData = bs.find('ResultData')
        if resultData is None:
            # find() returns None when the element is missing; report a
            # failed response instead of crashing on ``.children``.
            L.error("{} -> {} failed, no ResultData in response, url={}",
                    depInfo[0], arrInfo[0], url)
            return ER_RESPONSE_FAIL

        # Attributes every <lowestPrice> element must carry to be stored;
        # the order is the order handed to FlightLowestPriceInfo.
        allAttrs = ('date', 'code', 'depTime', 'arrTime', 'carrier', 'vendorName', 'price')
        for airline in resultData.children:
            if airline.name != 'lowestPrice':
                continue

            d = airline.attrs
            # Skip incomplete entries rather than storing partial records.
            if not all(attr in d for attr in allAttrs):
                continue

            info = FlightLowestPriceInfo(queryDate, queryTime, depInfo[0], arrInfo[0], [d[x] for x in allAttrs])
            self.dbHandle.insertOneRec(info)

        return ER_SUCC