def searchRange(casperScript, dep, arr, dateRange, retryTimes):
    """Search one route for each of the next ``dateRange`` days, retrying failures.

    For every day from tomorrow through ``dateRange`` days ahead, ``searchOne``
    is called up to ``retryTimes`` times until it returns ``ER_SUCC``. A success
    is logged at info level; a day whose attempts all fail is logged as an error.

    Args:
        casperScript: Passed through unchanged to ``searchOne``.
        dep: Departure descriptor; element 0 is used in log messages.
        arr: Arrival descriptor; element 0 is used in log messages.
        dateRange: Number of consecutive days to search, starting tomorrow.
        retryTimes: Maximum number of ``searchOne`` attempts per day.
    """
    today = datetime.datetime.today()
    for dayOffset in range(1, dateRange + 1):
        dayText = (today + datetime.timedelta(days=dayOffset)).strftime("%Y-%m-%d")
        status = ER_SUCC
        for _ in range(retryTimes):
            status = searchOne(casperScript, dep, arr, dayText)
            if status == ER_SUCC:
                L.info("{} -> {} {}", dep[0], arr[0], dayText)
                break
        else:
            # Reached only when no attempt succeeded; status is still ER_SUCC
            # when no attempt was made at all, which is not reported.
            if status != ER_SUCC:
                L.error("retry {} times, {} -> {} {} failed", retryTimes, dep[0], arr[0], dayText)
def crawlAllAirlinesWithRetry(self, retryTime):
    """Crawl every route once, then retry the failed routes for up to ``retryTime`` rounds.

    After ``self.crawlAllAirlines()`` runs, the routes listed in
    ``self.lstFailAirline`` are re-crawled with ``self.crawlOneAirline``. Each
    round retries only the routes that failed in the previous round.

    Args:
        retryTime: Maximum number of retry rounds.

    Returns:
        ``ER_SUCC`` as soon as no failed route remains. Otherwise an error is
        logged for every route still failing and ``None`` is returned.
    """
    self.crawlAllAirlines()
    lstCur = self.lstFailAirline
    for i in range(1, retryTime + 1):
        if not lstCur:
            return ER_SUCC
        # Start each round with a fresh list. Reusing one list across rounds
        # makes lstCur and lstNext the same object, so failures would be
        # appended to the list being iterated and recovered routes never dropped.
        lstNext = []
        for it in lstCur:
            L.info("retry[{}] {} -> {}", i, it[0][0], it[1][0])
            startDate = datetime.datetime.today() + datetime.timedelta(days=1)
            if ER_SUCC != self.crawlOneAirline(it[0], it[1], startDate):
                lstNext.append(it)
        lstCur = lstNext
    if not lstCur:
        # The final round (or the initial crawl, when retryTime is 0) left
        # nothing to retry.
        return ER_SUCC
    for it in lstCur:
        L.error("{} -> {} retry {} times failed", it[0][0], it[1][0], retryTime)
def crawlOneAirline(self, depInfo, arrInfo, startDate):
    """Fetch the lowest-price forecast for one route and store every complete record.

    Args:
        depInfo: Departure descriptor; element 0 is used for logging and in the
            stored record, element 1 is sent as ``departureCity``.
        arrInfo: Arrival descriptor with the same layout, sent as ``arrivalCity``.
        startDate: First departure date to query. A value with a ``strftime``
            method (date/datetime) is sent as ``YYYY-MM-DD``; anything else is
            placed in the URL as-is.

    Returns:
        ``ER_SUCC`` when the response was processed, ``ER_REQUEST_TIMEOUT`` when
        the HTTP request raised, and ``ER_RESPONSE_FAIL`` when the status code
        is not 200 or the body has no ``ResultData`` element.
    """
    curDateTime = time.localtime(time.time())
    queryDate = time.strftime('%Y-%m-%d', curDateTime)
    queryTime = time.strftime('%H:%M:%S', curDateTime)

    # A datetime formatted with "{}" carries the time of day and microseconds.
    # Send the date only, matching the "%Y-%m-%d" format used for queryDate.
    # NOTE(review): assumes the service expects a date-only departureDate - confirm.
    if hasattr(startDate, 'strftime'):
        startDate = startDate.strftime('%Y-%m-%d')

    urlBase = "http://flight.qunar.com/twell/flight/farecast.jsp?departureCity={}&arrivalCity={}&nextNDays=0&departureDate={}&searchType=OnewayFlight&searchLangs=zh&locale=zh&serverIP=twell4&allowOld=true&queryID=127.0.0.1%3A1c1ea29%3A113aed2be0b%3A-7bfb&dayNum={}&pageNum=0"
    url = urlBase.format(depInfo[1], arrInfo[1], startDate, self.dateRange)

    try:
        r = requests.get(url, timeout=10)
    except Exception as e:
        # Best effort: any request failure is reported as a timeout so the
        # caller can retry the route.
        L.error(e)
        L.error("{} -> {} timeout, url={}", depInfo[0], arrInfo[0], url)
        return ER_REQUEST_TIMEOUT

    if r.status_code != 200:
        L.error("{} -> {} failed, url={}", depInfo[0], arrInfo[0], url)
        return ER_RESPONSE_FAIL

    L.info("{} -> {}", depInfo[0], arrInfo[0])
    bs = BeautifulSoup(r.text, 'lxml-xml')
    resultData = bs.find('ResultData')
    if resultData is None:
        # find() returns None when the element is absent; report a failed
        # response instead of raising AttributeError on .children.
        L.error("{} -> {} no ResultData in response, url={}", depInfo[0], arrInfo[0], url)
        return ER_RESPONSE_FAIL

    allAttrs = ('date', 'code', 'depTime', 'arrTime', 'carrier', 'vendorName', 'price')
    for airline in resultData.children:
        if airline.name != 'lowestPrice':
            continue
        d = airline.attrs
        # Skip records that lack any of the required attributes.
        if not all(attr in d for attr in allAttrs):
            continue
        info = FlightLowestPriceInfo(queryDate, queryTime, depInfo[0], arrInfo[0], [d[x] for x in allAttrs])
        self.dbHandle.insertOneRec(info)
    return ER_SUCC
def crawlAllAirlinesWithRetry(self, retryTime):
    """Crawl every route once, then retry the failed routes for up to ``retryTime`` rounds.

    After ``self.crawlAllAirlines()`` runs, the routes listed in
    ``self.lstFailAirline`` are re-crawled with ``self.crawlOneAirline``. Each
    round retries only the routes that failed in the previous round.

    Args:
        retryTime: Maximum number of retry rounds.

    Returns:
        ``ER_SUCC`` as soon as no failed route remains. Otherwise an error is
        logged for every route still failing and ``None`` is returned.
    """
    self.crawlAllAirlines()
    lstCur = self.lstFailAirline
    for i in range(1, retryTime + 1):
        if not lstCur:
            return ER_SUCC
        # Start each round with a fresh list. Reusing one list across rounds
        # makes lstCur and lstNext the same object, so failures would be
        # appended to the list being iterated and recovered routes never dropped.
        lstNext = []
        for it in lstCur:
            L.info("retry[{}] {} -> {}", i, it[0][0], it[1][0])
            startDate = datetime.datetime.today() + datetime.timedelta(days=1)
            if ER_SUCC != self.crawlOneAirline(it[0], it[1], startDate):
                lstNext.append(it)
        lstCur = lstNext
    if not lstCur:
        # The final round (or the initial crawl, when retryTime is 0) left
        # nothing to retry.
        return ER_SUCC
    for it in lstCur:
        L.error("{} -> {} retry {} times failed", it[0][0], it[1][0], retryTime)