def start_requests(self):
    permins = 0
    print(pubUtil.heartbeat(self.host_name, self.name, self.num, permins, self.version))
    result_iter = None
    while True:
        if hasattr(self, 'local'):
            if not result_iter:
                result_iter = pubUtil.get_task(self.name, 60)
            result = next(result_iter)
        else:
            result = pubUtil.getUrl(self.carrier, 1)
        if not result:
            time.sleep(3)
            continue
        for data in result:
            (dt_st, dep, arr, days) = vyUtil.analysisData(data)
            for i in range(int(days)):
                dt = vyUtil.get_real_date(dt_st, i)
                # build the search payload; named separately to avoid shadowing the task string `data`
                payload = dict(beginCity=dep, endCity=arr, beginDate=dt)
                payload.update(self.custom_settings.get('DEFAULT_DATA'))
                yield scrapy.Request(
                    url=self.start_urls,
                    method="POST",
                    body=json.dumps(payload),
                    meta=dict(data=payload),
                    dont_filter=True,
                    callback=self.parse,
                    errback=self.errback,
                )
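# The spiders in this file lean on a couple of small task-parsing helpers from
# vyUtil/pubUtil. A minimal sketch of what they might look like, inferred purely
# from how they are called here (assumption: task strings have the form
# 'DEP-ARR:YYYYMMDD:days', as suggested by the split(':') / split('-') handling
# in the spiders below); the real implementations may differ.
from datetime import datetime, timedelta


def analysis_data_sketch(task):
    """Split 'SGN-HAN:20190301:3' into ('20190301', 'SGN', 'HAN', '3')."""
    airports, dt_st, days = task.split(':')
    dep, arr = airports.split('-')
    return dt_st, dep, arr, days


def get_real_date_sketch(dt_st, offset):
    """Return dt_st ('YYYYMMDD') shifted by `offset` days, formatted 'YYYY-MM-DD'."""
    return (datetime.strptime(dt_st, '%Y%m%d') + timedelta(days=offset)).strftime('%Y-%m-%d')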
def start_requests(self):
    permins = 0
    logging.info(pubUtil.heartbeat(self.host_name, self.name, self.num, permins, self.version))
    result_iter = None
    while True:
        if hasattr(self, 'local'):
            if not result_iter:
                result_iter = pubUtil.get_task(self.name, days=1)
            result = next(result_iter)
        else:
            result = pubUtil.getUrl('aq', 1)
        if not result:
            logging.info('get task error')
            time.sleep(20)
            continue
        for data in result:
            # logging.info("###input data: " + data)
            (dt, dep, to) = pubUtil.analysisData(data)
            # dt, dep, to = '2019-03-28', 'PVG', 'TPE'
            # ua = UserAgent()
            # self.headers['user-agent'] = ua.random
            post_data = 'B_LOCATION_1=' + dep + '&E_LOCATION_1=' + to + '&B_DATE_1=' + dt.replace('-', '') + '0000&B_ANY_TIME_1=True&EMBEDDED_TRANSACTION=FlexPricerAvailability&ARRANGE_BY=D&DISPLAY_TYPE=2&PRICING_TYPE=O&SO_SITE_MATRIX_CALENDAR=FALSE&SO_SITE_RUI_CAL_AVAI_NO_RECO=TRUE&SO_SITE_RUI_FP_AVAI_PRESEL=FALSE&COMMERCIAL_FARE_FAMILY_1=NEWECOOW&COMMERCIAL_FARE_FAMILY_2=NEWDELOW&COMMERCIAL_FARE_FAMILY_3=NEWBIZOW&SO_SITE_RUI_AX_CAL_ENABLED=TRUE&SO_SITE_CAL_CHANGE_WEEK=TRUE&SO_SITE_RUI_HIDE_MDF_SRC=FALSE&EXTERNAL_ID%236=OW&TRAVELLER_TYPE_1=ADT&TRIP_TYPE=O&TRIP_FLOW=YES&SO_SITE_EXPORT_CONFIRM=TRUE&SO_SITE_EXPORT_CONF_URL=https%3A%2F%2Fbooking.evaair.com%2Fexporttripplan%2Fwebservice.aspx&SO_SITE_THREEDS_USE=N&SO_SITE_BILLING_NOT_REQUIRED=Y&SO_SITE_BILL_ADD_OPTIONS=BILL_ADD_HIDDEN&SO_SITE_PREBOOK_CANCELLATION=TRUE&SO_GL=%3C%3Fxml+version%3D%221.0%22+encoding%3D%22iso-8859-1%22%3F%3E%0D%0A%3CSO_GL%3E%0D%0A%3CGLOBAL_LIST+mode%3D%22partial%22%3E%0D%0A%3CNAME%3ESL_AIR_MOP%3C%2FNAME%3E%0D%0A%3CLIST_ELEMENT%3E%0D%0A%3CCODE%3ECC%3C%2FCODE%3E%0D%0A%3CLIST_VALUE%3ECredit+Card%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%3EY%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%3ECC%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%3ECryptic%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%3ECC%25T%25I%2F%25E%2F%25C%25F%2FN%25A%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%3EN%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%2F%3E%0D%0A%3CLIST_VALUE%3EN%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%3ECC%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%3EN%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%3EN%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%3EN%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%3EN%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%3EN%3C%2FLIST_VALUE%3E%0D%0A%3C%2FLIST_ELEMENT%3E%0D%0A%3C%2FGLOBAL_LIST%3E%0D%0A%3C%2FSO_GL%3E&SO_SITE_FD_DISPLAY_MODE=1&SO_SITE_CURRENCY_FORMAT_JAVA=0&SO_SITE_ENABLE_SRV_POLICY=BAG%2CCOA&SO_SITE_ALLOW_SPEC_REQ_SERV=FALSE&SO_SITE_SD_TRUE_OP_CARRIER=TRUE&SO_SITE_BARCODE_ENABLE=TRUE&SO_SITE_ALLOW_CS_CODE_SHARE=FALSE&SO_SITE_USE_PAYMENT_ACTION=TRUE&EXTERNAL_ID=AIBS&EXTERNAL_ID%232=&EXTERNAL_ID%233=&EXTERNAL_ID%234=NEWECOOW&EXTERNAL_ID%235=&EXTERNAL_ID%2314=N&EXTERNAL_ID%2312=&EXTERNAL_ID%2313=zh_CN&EXTERNAL_ID%2399=C5WBKT102%23%23flyeva&DIRECT_LOGIN=NO&SO_SITE_RUI_MULTIDEV_ENABLED=TRUE&SO_SITE_RUI_TABLET_PG_LIST=ALL&SO_SITE_RUI_MOBILE_PG_LIST=ALL&SO_SITE_RUI_DISP_FF_TABLE=TRUE&SO_SITE_RUI_UPSLL_T_MDL=TRUE&SO_SITE_RUI_UPSLL_T_MDL_ATC=TRUE&SO_SITE_RUI_DPICKER_NATIVE=TABLET%2CMOBILE&MC_FORCE_DEVICE_TYPE=MOBILE&SO_SITE_RUI_MOBILE_FLOW=ALL&SO_SITE_RUI_TABLET_FLOW=ALL&SO_SITE_RUI_COLLAPSE_BOUND_T=TWO_STEPS&SO_SITE_RUI_UPSLL_HIDE_BTNS=FALSE&SO_SITE_OFFICE_ID=SHABR08AA&LANGUAGE=CN&SITE=CAWXCNEW'
            url_data = {
                "ENCT": "1",
                "ENC": "990572D723A7BC83F77B4C6C03C696340674137066140FF11D721B8765E55FF8DC0562E080CE4BD1CD01272028CBBA89",
                # timestamp of the current query
                "ENC_TIME": time.strftime("%Y%m%d%H%M%S", time.localtime())
            }
            # invalidation info for this query
            invalid = {
                'date': dt.replace('-', ''),
                'depAirport': dep,
                'arrAirport': to,
                'mins': self.custom_settings.get('INVALID_TIME')
            }
            url_data = urllib.urlencode(url_data)
            self.url = self.start_urls[0] + '?' + url_data
            # print '# url: ', url
            # print '# url_data: ', url_data
            # ip = '127.0.0.1:8888'
            # ip = '127.0.0.1:1080'
            yield scrapy.Request(
                self.url,
                headers=self.headers,
                body=post_data,
                callback=self.parse,
                dont_filter=True,
                # meta={'invalid': invalid, 'proxy': ip},
                meta={'invalid': invalid},
                method='POST',
                errback=self.errback)
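# The hand-concatenated FlexPricer body above is easy to get wrong if the dynamic
# values ever need escaping. A hypothetical helper (not used by the spider above)
# showing how the dynamic prefix could be built with urlencode; the long static
# tail from EMBEDDED_TRANSACTION onward would simply be appended unchanged.
import urllib


def build_flexpricer_prefix(dep, to, dt):
    """dep/to are IATA codes, dt is 'YYYY-MM-DD'; returns the dynamic query prefix."""
    return urllib.urlencode([
        ('B_LOCATION_1', dep),
        ('E_LOCATION_1', to),
        ('B_DATE_1', dt.replace('-', '') + '0000'),
        ('B_ANY_TIME_1', 'True'),
    ])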
def start_requests(self):
    permins = 0
    print(pubUtil.heartbeat(self.host_name, self.name, self.num, permins, self.version))
    result_iter = None
    while True:
        if hasattr(self, 'local'):
            if not result_iter:
                result_iter = pubUtil.get_task(self.name)
            result = next(result_iter)
        else:
            result = pubUtil.getUrl(self.carrier, 1)
        if not result:
            time.sleep(6)
            continue
        for data in result:
            (_dt, dep, to, days) = vyUtil.analysisData(data)
            for i in range(int(days)):
                dt = (datetime.strptime(_dt, '%Y%m%d') + timedelta(days=i)).strftime('%Y-%m-%d')
                # dt, dep, to = '2019-02-28', 'BLR', 'BKK'
                post_data = self.custom_settings.get('POST_DATA_FORMAT').copy()
                post_data['query'] = post_data.get('query') % (self.seats, to, dep, dt)
                yield scrapy.Request(
                    url=self.start_urls,
                    method="POST",
                    body=json.dumps(post_data),
                    meta={'post_data': post_data},
                    dont_filter=True,
                )
def heart_task(self):
    result_iter = None
    # TODO: add a check on the query result
    while True:
        try:
            if self.local:
                if not result_iter:
                    result_iter = pubUtil.get_task(self.name, days=10)
                result = next(result_iter)
            else:
                result = json.loads(
                    requests.get(settings.GET_TASK_URL + 'carrier=' + self.name,
                                 timeout=60).text).get('data')
        except Exception as e:
            logging.error(e)
            result = None
        if result is None:
            logging.info('Date is None!')
            logging.info('Waiting...')
            time.sleep(16)
            continue
        for data in result:
            # kick off the requests for this task
            self.start_requests(data)
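# For reference: the task endpoints used above (settings.GET_TASK_URL + 'carrier=...')
# appear to return JSON whose 'data' field is a list of task strings shaped like
# 'DEP-ARR:YYYYMMDD:days' (see the split(':') / split('-') handling in the spiders
# below). A hedged sketch of fetching and parsing one task, assuming that shape:
import json
import requests


def fetch_one_task(base_url, carrier):
    """Return (dep, arr, 'YYYYMMDD', days) for the first task, or None if there is none."""
    resp = requests.get(base_url + 'carrier=' + carrier, timeout=60)
    tasks = json.loads(resp.text).get('data') or []
    if not tasks:
        return None
    airports, day, num = tasks[0].split(':')
    dep, arr = airports.split('-')
    return dep, arr, day, int(num)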
def start_requests(self):
    permins = 0
    print(pubUtil.heartbeat(self.host_name, self.name, self.num, permins, self.version))
    result_iter = None
    while True:
        if hasattr(self, 'local'):
            if not result_iter:
                result_iter = pubUtil.get_task(self.name)
            result = next(result_iter)
        else:
            result = pubUtil.getUrl(self.carrier, 1)
        if not result:
            time.sleep(3)
            continue
        for data in result:
            (dt_st, dep, arr, days) = vyUtil.analysisData(data)
            for i in range(int(days)):
                dt = vyUtil.get_real_date(dt_st, i)
                pay_load = dict(
                    depCity1=dep,
                    arrCity1=arr,
                    depDate1=dt,
                )
                pay_load.update(self.custom_settings.get('PAY_LOAD'))
                yield scrapy.FormRequest(
                    self.start_url,
                    formdata=pay_load,
                    meta={'payload': pay_load},
                    callback=self.parse,
                    dont_filter=True,
                    errback=self.err_back,
                )
def start_requests(self):
    permins = 0
    print(pubUtil.heartbeat(self.host_name, self.name, self.num, permins, self.version))
    result_iter = None
    while True:
        if hasattr(self, 'local'):
            if not result_iter:
                result_iter = pubUtil.get_task(self.name)
            result = next(result_iter)
        else:
            result = pubUtil.getUrl(self.carrier, 1)
        if not result:
            time.sleep(3)
            continue
        for data in result:
            (dt_st, dep, arr, days) = vyUtil.analysisData(data)
            for i in range(int(days)):
                dt = vyUtil.get_real_date(dt_st, i)
                params = {
                    'origination-airport': dep,
                    'destination-airport': arr,
                    'departure-date': dt,
                    'number-adult-passengers': self.custom_settings.get('SEAT_SEARCH'),
                    'number-senior-passengers': 0,
                    'currency': 'USD',
                }
                total_url = self.start_urls + parse.urlencode(params)
                yield scrapy.Request(
                    url=total_url,
                    method="GET",
                    dont_filter=True,
                    callback=self.parse,
                    errback=self.errback,
                )
def start_requests(self):
    permins = 0
    print(LamudatechDevPipeline.heartbeat(self.host_name, self.spider_name, self.num, permins, self.version))
    result_iter = None
    while True:
        if hasattr(self, 'local'):
            if not result_iter:
                result_iter = pubUtil.get_task(self.name, step=7, st=2)
            result = next(result_iter)
        else:
            try:
                data_api = settings.GET_TASK_URL + 'carrier=VJ'
                result = json.loads(requests.get(data_api, timeout=180).text).get('data')
            except Exception as e:
                self.log(e)
                result = None
        if not result:
            logging.info('Date is None!')
            logging.info('Waiting...')
            continue
        airports, _day, _num = result[0].split(':')
        FROM, TO = airports.split('-')
        _day = re.sub(r'(\d{4})(\d{2})(\d{2})', r'\1-\2-\3', _day)
        # for airports in get_airports(u'越捷航空.csv'):
        #     FROM = airports.get('DepartureAirportCode')
        #     TO = airports.get('ArrivalAirportCode')
        #
        #     _day = "{:%Y-%m-%d}".format(datetime.today())
        for _day in self._get_dates(_day, int(_num)):
            data = {
                'OutboundDate': _day,
                'DaysBefore': '0',
                'DaysAfter': '0',
                'AdultCount': '1',
                'ChildCount': '0',
                'InfantCount': '0',
                'DepartureAirportCode': FROM,
                'ArrivalAirportCode': TO,
                'CurrencyCode': 'VND',
                'PromoCode': ''
            }
            yield scrapy.FormRequest(
                self.start_urls,
                formdata=data,
                meta={'FROM': FROM, 'TO': TO},
                callback=self.parse,
                dont_filter=True,
                # errback=lambda x: self.download_errback(x, FROM, TO)
                errback=self.errback,
            )
def start_requests(self):
    permins = 0
    print(LamudatechDevPipeline.heartbeat(self.host_name, self.spider_name, self.num, permins, self.version))
    result_iter = None
    while True:
        if hasattr(self, 'local'):
            if not result_iter:
                result_iter = pubUtil.get_task(self.name, step=1)
            result = next(result_iter)
        else:
            try:
                data_api = settings.GET_TASK_URL + 'carrier=DY'
                result = json.loads(requests.get(data_api, timeout=180).text).get('data')
            except Exception as e:
                self.log(e)
                result = None
        if not result:
            self.log('Date is None!', 40)
            self.log('Waiting...', 40)
            time.sleep(16)
            continue
        airports, _day, _num = result[0].split(':')
        FROM, TO = airports.split('-')
        currency_market = self.currency_info.get(FROM)
        if currency_market:
            currencyCode = currency_market.get('currency')
            marketCode = currency_market.get('marketCode')
        else:
            currencyCode = 'EUR'
            marketCode = 'en'
        for _day in self._get_dates(_day, int(_num)):
            params = parse.urlencode(dict(
                adultCount=3,
                childCount=0,
                infantCount=0,
                # culture='en-GB',
                currencyCode=currencyCode,
                marketCode=marketCode,
                origin=FROM,
                destination=TO,
                inboundDate=_day,
                outboundDate=_day,
                includeTransit='true',
                isRoundTrip='false',
                isSsrNeeded='false',
            ))
            total_url = self.start_urls + params
            yield scrapy.Request(total_url,
                                 meta={'FROM': FROM, 'TO': TO, '_day': _day},
                                 callback=self.parse,
                                 errback=None,
                                 dont_filter=True)
def start_requests(self):
    permins = 0
    print(pubUtil.heartbeat(self.host_name, self.name, self.num, permins, self.version))
    result_iter = None
    while True:
        if hasattr(self, 'local'):
            if not result_iter:
                result_iter = pubUtil.get_task(self.name, days=30)
            result = next(result_iter)
        else:
            result = pubUtil.getUrl(self.name)
        if not result:
            time.sleep(6)
            continue
        for data in result:
            # task format: [u'TLN-CFE:20181110:1']
            task_data_list = data.split(':')
            count = int(task_data_list[2])
            (dt, dep, arr) = pubUtil.analysisData(task_data_list[0] + ':' + task_data_list[1])
            _date = datetime.strptime(dt, '%Y-%m-%d')
            for i in range(count):
                date = _date + timedelta(days=i)
                date = date.strftime('%Y%m%d0000')
                dep = self.port_city.get(dep, dep)
                arr = self.port_city.get(arr, arr)
                # logging.info('# input data: ' + dep + '' + arr + '' + date)
                # invalidation info
                invalid = {
                    'date': date.replace('-', ''),
                    'depAirport': dep,
                    'arrAirport': arr,
                    'mins': self.custom_settings.get('INVALID_TIME')
                }
                post_data = urllib.urlencode(
                    ly_post_data.second_post_data(dep, arr, date, self.ADT))
                yield scrapy.Request(self.start_urls[0],
                                     body=post_data,
                                     callback=self.parse,
                                     dont_filter=True,
                                     meta={'invalid': invalid},
                                     errback=self.errback,
                                     method='POST')
def start_requests(self):
    permins = 0
    print(pubUtil.heartbeat(self.host_name, self.name, self.num, permins, self.version))
    result_iter = None
    while True:
        if hasattr(self, 'local'):
            if not result_iter:
                result_iter = pubUtil.get_task(self.name)
            result = next(result_iter)
        else:
            result = pubUtil.getUrl(self.carrier, 1)
        if not result:
            time.sleep(6)
            continue
        hour = datetime.now().hour + 2
        self.cookie = self.cookies[hour % len(self.cookies)]
        installid = self.InstallationID[hour % len(self.InstallationID)]
        for data in result:
            # parse the fetched task string
            (dt_st, dep, to, days) = vyUtil.analysisData(data)
            # dep, to = 'CDG', 'VIE'
            for i in range(int(days)):
                dt = vyUtil.get_real_date(dt_st, i)
                # dt = '2018-11-01'
                self.task.append({
                    'date': dt.replace('-', ''),
                    'depAirport': dep,
                    'arrAirport': to,
                    'mins': settings.INVALID_TIME
                })
                dt = dt + 'T00:00:00'
                data_list = {
                    'InstallationID': installid,
                    'AirportDateTimeList': [{
                        'MarketDateDeparture': dt,
                        'DepartureStation': dep,
                        'ArrivalStation': to,
                    }]
                }
                data_list.update(self.custom_settings.get('DEFAULT_DATA'))
                yield scrapy.Request(
                    method='POST',
                    url=self.start_url,
                    headers={'Cookie': self.cookie},
                    body=json.dumps(data_list),
                    meta={'data_list': data_list},
                    callback=self.parse,
                    dont_filter=True,
                    # bind data_list now; a plain closure would only ever see the last task
                    errback=lambda x, data_list=data_list: self.download_errback(x, data_list))
def start_requests(self):
    permins = 0
    logging.info(pubUtil.heartbeat(self.host_name, self.name, self.num, permins, self.version))
    result_iter = None
    while True:
        if hasattr(self, 'local'):
            if not result_iter:
                result_iter = pubUtil.get_task(self.name, days=30)
            result = next(result_iter)
        else:
            result = pubUtil.getUrl(self.name, 1)
        if not result:
            logging.info('get task error')
            time.sleep(20)
            continue
        for data in result:
            (dt, dep, to) = pubUtil.analysisData(data)
            # dep, to, dt = 'FUK', 'YNT', '2019-03-27'
            post_data = {
                "tripType": "OW",
                "orgCode": dep,
                "dstCode": to,
                "takeoffdate1": dt,
            }
            # pick a random User-Agent
            ua = UserAgent()
            self.headers['User-Agent'] = ua.random
            # post_data = urllib.urlencode(post_data)
            # logging.info("###input data: " + dep + to + dt)
            # invalidation info
            invalid = {
                'date': dt.replace('-', ''),
                'depAirport': dep,
                'arrAirport': to,
                'mins': self.custom_settings.get('INVALID_TIME')
            }
            yield scrapy.Request(
                self.start_urls[0],
                headers=self.headers,
                body=json.dumps(post_data),
                # body=post_data,
                callback=self.parse,
                dont_filter=True,
                # meta={'invalid': invalid, 'proxy': 'http://127.0.0.1:8888'},
                meta={'invalid': invalid},
                method='POST',
                errback=self.errback)
def start_requests(self):
    permins = 0
    logging.info(pubUtil.heartbeat(self.host_name, self.name, self.num, permins, self.version))
    result_iter = None
    while True:
        if hasattr(self, 'local'):
            if not result_iter:
                result_iter = pubUtil.get_task(self.name, step=7)
            result = next(result_iter)
        else:
            result = pubUtil.getUrl(self.name, 1)
        if not result:
            logging.info('get task error')
            time.sleep(10)
            continue
        # loop over tasks (currently there is only one by default)
        for data in result:
            # task format: BVE-LYS-201812030000-15
            count = int(data.split(':')[-1])
            (date, dep, arr) = pubUtil.analysisData(data[:-2])
            date = date.replace('-', '') + '0000'
            # logging.info('# input data: ' + dep + '-' + arr + '-' + date + '-' + str(count))
            task_data = {
                'dep': dep,
                'arr': arr,
                'date': date,
                'count': count
            }
            post_data = urllib.urlencode(
                a5_post_data.first_post_data(dep, arr, date, self.ADT))
            # fetch a session first
            yield scrapy.Request(
                self.get_session_url[0],
                body=post_data,
                callback=self.get_session,
                dont_filter=True,
                meta={
                    'post_data': post_data,
                    'task_data': task_data
                },
                method='POST',
                errback=self.errback,
            )
def start_requests(self):
    permins = 0
    logging.info(pubUtil.heartbeat(self.host_name, self.name, self.num, permins, self.version))
    result_iter = None
    while True:
        if hasattr(self, 'local'):
            if not result_iter:
                result_iter = pubUtil.get_task(self.name, days=30)
            result = next(result_iter)
        else:
            result = pubUtil.getUrl(self.name, 1)
        if not result:
            logging.info('get task error')
            time.sleep(20)
            continue
        self.session_data['tck'] = random.choice(self.id_pool)
        for data in result:
            # logging.info("###input data: " + data)
            (dt, dep, to) = pubUtil.analysisData(data)
            # dt, dep, to = '2019-02-28', 'CAN', 'RGN'
            post_data = {
                'traveldate': dt,
                'ori': dep,
                'currency': 'CNY',
                'dest': to
            }
            # invalidation info
            invalid = {
                'date': dt.replace('-', ''),
                'depAirport': dep,
                'arrAirport': to,
                'mins': self.custom_settings.get('INVALID_TIME')
            }
            post_data = urllib.urlencode(post_data)
            yield scrapy.Request(self.start_urls[0],
                                 headers=self.headers,
                                 body=post_data,
                                 callback=self.parse,
                                 dont_filter=True,
                                 meta={'invalid': invalid},
                                 method='POST',
                                 errback=self.errback)
def start_requests(self):
    permins = 0
    self.proxy = True
    print(LamudatechDevPipeline.heartbeat(self.host_name, self.spider_name, self.num, permins, self.version))
    result_iter = None
    while True:
        if hasattr(self, 'local'):
            if not result_iter:
                result_iter = pubUtil.get_task(self.name, step=7)
            result = next(result_iter)
        else:
            try:
                data_api = settings.GET_TASK_URL + 'carrier=U2'
                result = json.loads(requests.get(data_api, timeout=180).text).get('data')
            except Exception as e:
                self.log(e)
                result = None
        if not result:
            self.log('Date is None!', level=20)
            self.log('Waiting...', level=20)
            time.sleep(16)
            continue
        airports, _day, _num = result[0].split(':')
        FROM, TO = airports.split('-')
        # FROM, TO = 'TXL', 'ARN'
        lowfares_url = 'https://www.easyjet.com/ejcms/cache15m/api/routedates/get/?'
        lowfares_total_url = lowfares_url + parse.urlencode({
            'originIata': FROM,
            'destinationIata': TO
        })
        yield scrapy.Request(lowfares_total_url,
                             meta={'FROM': FROM, 'TO': TO},
                             callback=self.date_parse,
                             errback=self.errback,
                             dont_filter=True)
def start_requests(self):
    permins = 0
    print(pubUtil.heartbeat(self.host_name, self.name, self.num, permins, self.version))
    result_iter = None
    while True:
        if hasattr(self, 'local'):
            if not result_iter:
                result_iter = pubUtil.get_task(self.name, days=30)
            result = next(result_iter)
        else:
            result = pubUtil.getUrl('TW', 5)
        if not result:
            logging.info('get task error')
            time.sleep(10)
            continue
        today = datetime.now().strftime('%Y%m%d')
        for data in result:
            # parse the fetched task string
            (dt, dep, to) = pubUtil.analysisData(data)
            # dt, dep, to = '20180722', 'ICN', 'KIX'  # sample record for testing
            params = urllib.urlencode(dict(
                origin=dep,
                destination=to,
                onwardDateStr=dt.replace('-', ''),
                # pointOfPurchase='KR',
                paxTypeCountStr='3,0,0',
                today=today,
                travelType='OW',
                searchType='byDate',
                # domesticYn='Y',
                bundleAmountOW=0,
                bundleAmountRT=0,
                routeCls='AS',
                _=int(time.time() * 1000)
            ))
            total_url = self.start_urls[0] + params
            yield scrapy.Request(url=total_url,
                                 callback=self.transit,
                                 meta={'params': params, 'flag': 1},
                                 dont_filter=True)
def start_requests(self):
    permins = 0
    print(LamudatechDevPipeline.heartbeat(self.host_name, self.spider_name, self.num, permins, self.version))
    result_iter = None
    result = None  # initialised so the first `not result` check below cannot raise NameError
    while True:
        if hasattr(self, 'local'):
            if not result_iter or not result:
                result_iter = pubUtil.get_task(self.name)
            result = next(result_iter)
        else:
            try:
                data_api = settings.GET_TASK_URL + 'carrier=EW'
                result = json.loads(requests.get(data_api, timeout=180).text).get('data')
            except Exception as e:
                self.log(e)
                result = None
        if not result:
            self.log('Date is None!', level=20)
            self.log('Waiting...', level=20)
            time.sleep(16)
            continue
        airports, _day, _num = result[0].split(':')
        FROM, TO = airports.split('-')
        # FROM, TO = 'HAM', 'CDG'
        # query the low-fare calendar to find dates that actually have prices
        for record in self.get_data(FROM, TO):
            # sort the records by date
            order_list = [
                x.get('date') for x in sorted(record, key=lambda x: x.get('date'))
            ]
            # keep only nearby dates to reduce the crawl volume
            lowfares = [
                x for x in record if x.get('date') in self.get_list(order_list)
            ]
            for data in lowfares:
                _from = data.get('origin')
                _to = data.get('destination')
                # currency = data.get('currency')
                # currency = self.currency_cache.get(currency, currency)
                _date = data.get('date')
                # _date = '2018-11-06'
                params = urllib.urlencode(dict(
                    o=_from,
                    d=_to,
                    t='o',
                    od=_date,
                    adt='5',
                    lng='en-GB',
                    appvi='2D53F50C85034ECF-6000119C00002033',
                    adobe_mc='TS=%s|MCAID=2D791936852A6702-40000129C00FCFCA' % int(time.time()),
                    screen='Search',
                    culture='en-GB',
                ))
                total_url = self.start_urls + params
                yield scrapy.Request(
                    total_url,
                    meta={
                        'FROM': _from,
                        'TO': _to,
                        '_day': _date,
                        # 'currency': currency
                    },
                    dont_filter=True,
                    callback=self.parse,
                    errback=self.download_errback)
def start_requests(self):
    permins = 0
    logging.info(pubUtil.heartbeat(self.host_name, self.name, self.num, permins, self.version))
    result_iter = None
    while True:
        if hasattr(self, 'local'):
            if not result_iter:
                result_iter = pubUtil.get_task(self.name, days=30)
            result = next(result_iter)
        else:
            result = pubUtil.getUrl(self.name, 1)
        if not result:
            logging.info('get task error')
            time.sleep(5)
            continue
        for data in result:
            # logging.info("## input data: " + data)
            # task format: [u'TLN-CFE:20181110:1']
            count = int(data.split(':')[-1])
            (date, dep, arr) = pubUtil.analysisData(data[:-2])
            _date = datetime.strptime(date, '%Y-%m-%d')
            for i in range(count):
                # keep the base date unchanged; reassigning _date here would skip days
                temp_date = _date + timedelta(days=i)
                date = temp_date.strftime('%Y%m%d')
                # dep = 'KIX'
                # arr = 'ICN'
                # logging.info('# input data: ' + dep + '-' + arr + '-' + date)
                city_code = self.city_dict.get(dep)
                if city_code is None:
                    logging.info('# not found city: ' + dep)
                body = json.dumps(
                    ze_post_data.get_data(dep, arr, date, self.ADT, city_code))
                # invalidation info
                invalid = {
                    'date': date.replace('-', ''),
                    'depAirport': dep,
                    'arrAirport': arr,
                    'mins': self.custom_settings.get('INVALID_TIME')
                }
                task_data = {
                    'dep': dep,
                    'arr': arr,
                    'date': date,
                    'city_code': city_code,
                    'body': body
                }
                yield scrapy.Request(self.start_urls[0],
                                     headers=self.headers,
                                     body=body,
                                     callback=self.parse,
                                     dont_filter=True,
                                     meta={
                                         'invalid': invalid,
                                         'task_data': task_data
                                     },
                                     errback=self.errback,
                                     method='POST')
def start_request(self):
    result_iter = None
    # TODO: add a check on the query result
    while True:
        # if not timeUtil.time_is_valid(self.st_time, self.en_time):
        #     logging.info('Waiting to 07:30:00.....')
        #     time.sleep(5 * 60)
        #     continue
        # data_api = 'http://dx.redis.jiaoan100.com/buddha/gettask?carrier=JX'
        data_api = 'http://task.jiaoan100.com/buddha/gettask?carrier=jx'
        try:
            if self.local:
                if not result_iter:
                    result_iter = pubUtil.get_task('JQ', days=10)
                result = next(result_iter)
            else:
                result = json.loads(requests.get(data_api, timeout=60).text).get('data')
        except Exception as e:
            logging.error(e)
            result = None
        if result is None:
            logging.info('Date is None!')
            logging.info('Waiting...')
            time.sleep(16)
            continue
        airports, _day, day_num = result[0].split(':')
        # day_num = '1'
        # print('airports, _day, day_num', airports, _day, day_num)
        FROM, TO = airports.split('-')
        # FROM, TO = ('DAD', 'HKG')
        _day = re.sub(r'(\d{4})(\d{2})(\d{2})', r'\1-\2-\3', _day)
        days = self._get_dates(_day, int(day_num))
        # print(days)
        # days = ['2019-01-11', '2019-01-12', '2019-01-13']
        for day in days:
            # FROM, TO, day = 'RGN', 'SIN', '2019-01-17'
            query = urlencode({
                'origin1': FROM,
                'destination1': TO,
                # 'flight-type': '1',
                'departuredate1': day,
                'adults': str(settings.ADULT_NUM),
                'children': '0',
                'infants': '0',
            })
            print(query)
            # set_invalid('JX', FROM, TO, day)
            total_url = self.start_url + query
            # invalidation info
            invalid = {
                'date': day.replace('-', ''),
                'depAirport': FROM,
                'arrAirport': TO,
                'mins': settings.INVALID_TIME
            }
            # total_url = 'https://www.jetstar.com/au/en/home?origin=CBR&destination=HNL&flight-type=1&selected-departure-date=02-02-2019&adult=1&flexible=1&currency=AUD'
            # yield total_url, invalid
            yield [total_url, invalid]
def start_requests(self):
    permins = 0
    logging.info(pubUtil.heartbeat(self.host_name, self.name, self.num, permins, self.version))
    result_iter = None
    while True:
        if hasattr(self, 'local'):
            if not result_iter:
                result_iter = pubUtil.get_task(self.name, days=30)
            result = next(result_iter)
        else:
            result = pubUtil.getUrl(self.name, 1)
        if not result:
            logging.info('get task error')
            time.sleep(5)
            continue
        for data in result:
            # logging.info("## input data: " + data)
            # task format: [u'TLN-CFE:20181110:1']
            count = int(data.split(':')[-1])
            (date, dep, arr) = pubUtil.analysisData(data[:-2])
            _date = datetime.strptime(date, '%Y-%m-%d')
            for i in range(count):
                temp_date = _date + timedelta(days=i)
                date = temp_date.strftime('%Y-%m-%d')
                # logging.info('# input data: ' + dep + '-' + arr + '-' + date)
                # dep, arr, date = 'MNL', 'SIN', '2019-01-04'
                post_data = {
                    "originIata": dep,
                    "destinationIata": arr,
                    "departureDate": date + "T00:00:00+08:00",
                    "passengerComposition": {
                        "adult": self.ADT,
                        "children": 0,
                        "infant": 0
                    }
                }
                body = json.dumps(post_data)
                # invalidation info
                invalid = {
                    'date': date[:10].replace('-', ''),
                    'depAirport': dep,
                    'arrAirport': arr,
                    'mins': self.custom_settings.get('INVALID_TIME')
                }
                task_data = {
                    'dep': dep,
                    'arr': arr,
                    'date': date,
                    'body': body
                }
                yield scrapy.Request(url=self.start_urls[0],
                                     body=body,
                                     callback=self.parse,
                                     dont_filter=True,
                                     meta={
                                         'invalid': invalid,
                                         'task_data': task_data
                                     },
                                     errback=self.errback,
                                     method='POST')
def start_requests(self):
    permins = 0
    logging.info(pubUtil.heartbeat(self.host_name, self.name, self.num, permins, self.version))
    result_iter = None
    # timer for how long the current IP has been in use
    # start_time = time.time()
    while True:
        if hasattr(self, 'local'):
            if not result_iter:
                result_iter = pubUtil.get_task(self.name, days=10)
            result = next(result_iter)
        else:
            result = pubUtil.getUrl(self.name, 1)
        if not result:
            logging.info('get task error')
            time.sleep(20)
            continue
        for data in result:
            # logging.info("## input data: " + data)
            # task format: [u'TLN-CFE:20181110:1']
            count = int(data.split(':')[-1])
            (date, dep, arr) = pubUtil.analysisData(data[:-2])
            _date = datetime.strptime(date, '%Y-%m-%d')
            for i in range(count):
                temp_date = _date + timedelta(days=i)
                date = temp_date.strftime('%m/%d/%Y')
                invalid_date = temp_date.strftime('%Y%m%d')
                # logging.info('# input data: ' + dep + '-' + arr + '-' + date)
                # dep, arr, date = 'FLL', 'LAS', '2019-01-13'
                # force a proxy change once the IP exceeds its allowed usage time
                # logging.info('ip used time: ' + str(time.time() - start_time))
                # if time.time() - start_time > self.use_time:
                #     self.proxy_flag = True
                #     logging.info('### ip invalid:' + self.proxy)
                if self.proxy_flag:
                    while True:
                        # Russian proxy pool
                        # self.proxy = pubUtil.nk_get_ip()
                        # small proxy pool
                        self.proxy = pubUtil.get_proxy(self.name)
                        if self.proxy is None:
                            logging.info('# no get proxy, continue')
                            # time.sleep(60)
                            continue
                        logging.info('# get a new ip: ' + self.proxy)
                        ip_proxies = {"https": "https://" + self.proxy}
                        # fetch a fresh session with the new proxy
                        try:
                            response = requests.get(self.get_session_url, proxies=ip_proxies, timeout=15)
                            self.cookies_str = json.dumps(
                                requests.utils.dict_from_cookiejar(response.cookies))[1:-1].replace(
                                '\"', '').replace(':', '=').replace(' ', '').replace(',', '; ')
                        except Exception as e:
                            logging.info(e)
                            self.proxy_flag = True
                            logging.info('# get session error')
                            continue
                        # the IP works; restart the usage timer
                        # start_time = time.time()
                        self.proxy_flag = False
                        break
                headers = {
                    'Content-Type': 'application/x-www-form-urlencoded',
                    'Cookie': self.cookies_str
                }
                post_data = {
                    'from': dep,
                    'to': arr,
                    # 'from': 'AXM',
                    # 'to': 'ATL',
                    'departDate': date,
                    'departDateDisplay': date,
                    'ADT': self.ADT
                }
                post_data.update(self.custom_settings.get('POST_DATA'))
                post_data = urllib.urlencode(post_data)
                # invalidation info
                invalid = {
                    'date': invalid_date,
                    'depAirport': dep,
                    'arrAirport': arr,
                    'mins': self.custom_settings.get('INVALID_TIME')
                }
                yield scrapy.Request(url=self.start_urls[0],
                                     body=post_data,
                                     headers=headers,
                                     callback=self.parse,
                                     dont_filter=True,
                                     meta={'invalid': invalid, 'proxy': self.proxy},
                                     errback=self.errback,
                                     method='POST')
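# The Cookie header above is assembled by json.dumps-ing the cookie dict and then
# chaining .replace() calls. A more direct sketch (not what the spider above uses)
# that should produce the same 'name=value; name=value' string from a requests
# response, assuming only cookie names and values are needed:
import requests


def cookie_header_from_response(response):
    """Build a 'k=v; k=v' Cookie header string from response.cookies."""
    jar = requests.utils.dict_from_cookiejar(response.cookies)
    return '; '.join('%s=%s' % (name, value) for name, value in jar.items())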