def process_item(self, item):
    """Buffer scraped items and collected cookies, pushing each in batches.

    Also emits a heartbeat log line (carrying the number of items seen
    since the last beat) at most once every 60 seconds.
    """
    self.buffer.append(item)
    self.count += 1
    this_time = time.time()
    # Heartbeat at most once a minute; the per-minute item count resets after each beat.
    if this_time - self.st_time >= 60:
        self.st_time = this_time
        logging.info(
            pubUtil.heartbeat(self.host_name, self.carrier, self.num,
                              self.count, self.version))
        self.count = 0
    # Push buffered result items once more than 5 have accumulated;
    # the buffer is cleared only when the push reports success.
    if len(self.buffer) > 5:
        add_success = pubUtil.addData('add', self.buffer, self.push_url,
                                      self.host_name, carrier=self.carrier)
        if add_success:
            logging.info(add_success)
            self.buffer.clear()
    # Collected cookies/headers are pushed with the same batch-of-5 policy.
    if len(self.headers) > 5:
        add_success = pubUtil.push_cookies(self.headers, self.carrier)
        if add_success:
            logging.info(add_success)
            self.headers.clear()
def start_requests(self):
    """Endlessly pull crawl tasks and yield one POST request per travel date.

    With a `local` attribute set, tasks come from the local task iterator;
    otherwise they are fetched remotely via pubUtil.getUrl.
    """
    permins = 0
    print(
        pubUtil.heartbeat(self.host_name, self.name, self.num, permins,
                          self.version))
    result_iter = None
    while True:
        if hasattr(self, 'local'):
            # Lazily create the local task iterator on first use.
            if not result_iter:
                result_iter = pubUtil.get_task(self.name, 60)
            result = next(result_iter)
        else:
            result = pubUtil.getUrl(self.carrier, 1)
        if not result:
            time.sleep(3)
            continue
        for data in result:
            # Task tuple: start date, departure, arrival, number of days to expand.
            (dt_st, dep, arr, days) = vyUtil.analysisData(data)
            for i in range(int(days)):
                dt = vyUtil.get_real_date(dt_st, i)
                # NOTE(review): `data` here rebinds the outer loop variable
                # (harmless, since `result` is iterated independently, but confusing).
                data = dict(beginCity=dep, endCity=arr, beginDate=dt)
                data.update(self.custom_settings.get('DEFAULT_DATA'))
                yield scrapy.Request(
                    url=self.start_urls,  # assumes start_urls is a single URL string — TODO confirm
                    method="POST",
                    body=json.dumps(data),
                    meta=dict(data=data),
                    dont_filter=True,
                    callback=self.parse,
                    errback=self.errback,
                )
def start_requests(self):
    """Endlessly pull tasks and yield one POST request per (date, dep, arr).

    Builds the Amadeus-style form body for the 'aq' carrier; the large
    constant query string below is an opaque site payload and must not
    be edited.
    """
    permins = 0
    logging.info(
        pubUtil.heartbeat(self.host_name, self.name, self.num, permins,
                          self.version))
    result_iter = None
    while True:
        if hasattr(self, 'local'):
            if not result_iter:
                result_iter = pubUtil.get_task(self.name, days=1)
            result = next(result_iter)
        else:
            result = pubUtil.getUrl('aq', 1)
        if not result:
            logging.info('get task error')
            time.sleep(20)
            continue
        for data in result:
            # Task tuple: date ('YYYY-MM-DD'), departure, arrival.
            (dt, dep, to) = pubUtil.analysisData(data)
            # dt,dep,to='2019-03-28','PVG','TPE'
            # Opaque site payload: only the route and date vary; the rest is fixed.
            post_data = 'B_LOCATION_1=' + dep + '&E_LOCATION_1=' + to + '&B_DATE_1=' + dt.replace(
                '-', ''
            ) + '0000&B_ANY_TIME_1=True&EMBEDDED_TRANSACTION=FlexPricerAvailability&ARRANGE_BY=D&DISPLAY_TYPE=2&PRICING_TYPE=O&SO_SITE_MATRIX_CALENDAR=FALSE&SO_SITE_RUI_CAL_AVAI_NO_RECO=TRUE&SO_SITE_RUI_FP_AVAI_PRESEL=FALSE&COMMERCIAL_FARE_FAMILY_1=NEWECOOW&COMMERCIAL_FARE_FAMILY_2=NEWDELOW&COMMERCIAL_FARE_FAMILY_3=NEWBIZOW&SO_SITE_RUI_AX_CAL_ENABLED=TRUE&SO_SITE_CAL_CHANGE_WEEK=TRUE&SO_SITE_RUI_HIDE_MDF_SRC=FALSE&EXTERNAL_ID%236=OW&TRAVELLER_TYPE_1=ADT&TRIP_TYPE=O&TRIP_FLOW=YES&SO_SITE_EXPORT_CONFIRM=TRUE&SO_SITE_EXPORT_CONF_URL=https%3A%2F%2Fbooking.evaair.com%2Fexporttripplan%2Fwebservice.aspx&SO_SITE_THREEDS_USE=N&SO_SITE_BILLING_NOT_REQUIRED=Y&SO_SITE_BILL_ADD_OPTIONS=BILL_ADD_HIDDEN&SO_SITE_PREBOOK_CANCELLATION=TRUE&SO_GL=%3C%3Fxml+version%3D%221.0%22+encoding%3D%22iso-8859-1%22%3F%3E%0D%0A%3CSO_GL%3E%0D%0A%3CGLOBAL_LIST+mode%3D%22partial%22%3E%0D%0A%3CNAME%3ESL_AIR_MOP%3C%2FNAME%3E%0D%0A%3CLIST_ELEMENT%3E%0D%0A%3CCODE%3ECC%3C%2FCODE%3E%0D%0A%3CLIST_VALUE%3ECredit+Card%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%3EY%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%3ECC%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%3ECryptic%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%3ECC%25T%25I%2F%25E%2F%25C%25F%2FN%25A%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%3EN%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%2F%3E%0D%0A%3CLIST_VALUE%3EN%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%3ECC%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%3EN%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%3EN%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%3EN%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%3EN%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%3EN%3C%2FLIST_VALUE%3E%0D%0A%3C%2FLIST_ELEMENT%3E%0D%0A%3C%2FGLOBAL_LIST%3E%0D%0A%3C%2FSO_GL%3E&SO_SITE_FD_DISPLAY_MODE=1&SO_SITE_CURRENCY_FORMAT_JAVA=0&SO_SITE_ENABLE_SRV_POLICY=BAG%2CCOA&SO_SITE_ALLOW_SPEC_REQ_SERV=FALSE&SO_SITE_SD_TRUE_OP_CARRIER=TRUE&SO_SITE_BARCODE_ENABLE=TRUE&SO_SITE_ALLOW_CS_CODE_SHARE=FALSE&SO_SITE_USE_PAYMENT_ACTION=TRUE&EXTERNAL_ID=AIBS&EXTERNAL_ID%232=&EXTERNAL_ID%233=&EXTERNAL_ID%234=NEWECOOW&EXTERNAL_ID%235=&EXTERNAL_ID%2314=N&EXTERNAL_ID%2312=&EXTERNAL_ID%2313=zh_CN&EXTERNAL_ID%2399=C5WBKT102%23%23flyeva&DIRECT_LOGIN=NO&SO_SITE_RUI_MULTIDEV_ENABLED=TRUE&SO_SITE_RUI_TABLET_PG_LIST=ALL&SO_SITE_RUI_MOBILE_PG_LIST=ALL&SO_SITE_RUI_DISP_FF_TABLE=TRUE&SO_SITE_RUI_UPSLL_T_MDL=TRUE&SO_SITE_RUI_UPSLL_T_MDL_ATC=TRUE&SO_SITE_RUI_DPICKER_NATIVE=TABLET%2CMOBILE&MC_FORCE_DEVICE_TYPE=MOBILE&SO_SITE_RUI_MOBILE_FLOW=ALL&SO_SITE_RUI_TABLET_FLOW=ALL&SO_SITE_RUI_COLLAPSE_BOUND_T=TWO_STEPS&SO_SITE_RUI_UPSLL_HIDE_BTNS=FALSE&SO_SITE_OFFICE_ID=SHABR08AA&LANGUAGE=CN&SITE=CAWXCNEW'
            url_data = {
                "ENCT": "1",
                "ENC": "990572D723A7BC83F77B4C6C03C696340674137066140FF11D721B8765E55FF8DC0562E080CE4BD1CD01272028CBBA89",
                # Current query timestamp, passed to the site.
                "ENC_TIME": time.strftime("%Y%m%d%H%M%S", time.localtime())
            }
            # Invalidation marker for this (date, dep, arr) task.
            invalid = {
                'date': dt.replace('-', ''),
                'depAirport': dep,
                'arrAirport': to,
                'mins': self.custom_settings.get('INVALID_TIME')
            }
            url_data = urllib.urlencode(url_data)
            self.url = self.start_urls[0] + '?' + url_data
            yield scrapy.Request(
                self.url,
                headers=self.headers,
                body=post_data,
                callback=self.parse,
                dont_filter=True,
                # meta={'invalid': invalid, 'proxy': ip},
                meta={'invalid': invalid},
                method='POST',
                errback=self.errback)
def start_requests(self):
    """Endlessly pull tasks and yield first_request calls per date window.

    When the [start, start+days] window crosses a month boundary, the
    window is split into two requests at the first day of the next month.
    """
    permins = 0
    print(
        pubUtil.heartbeat(self.host_name, self.name, self.num, permins,
                          self.version))
    while True:
        result = pubUtil.getUrl(self.name, 1)
        if not result:
            # BUG FIX: was `self.log('get task error'), 20` — a tuple
            # expression that evaluated and discarded 20 instead of
            # passing it as the log level.
            self.log('get task error', 20)
            time.sleep(10)
            continue
        for data in result:
            # Task tuple: start date ('YYYYMMDD'), departure, arrival, day span.
            (dt, dep, to, days) = vyUtil.analysisData(data)
            # dep, to, dt, days= 'RHO', 'PMO', '2018-08-15', 30
            dt_datetime = datetime.strptime(dt, '%Y%m%d')
            end_date = dt_datetime + timedelta(days=int(days))
            dt = dt_datetime.strftime('%Y-%m-%d')
            if dt_datetime.month != end_date.month:
                # Window crosses a month boundary: split at the 1st of the
                # end month so each request stays within one month.
                next_fday = datetime(end_date.year, end_date.month, 1)
                days_before = (next_fday - dt_datetime).days
                next_fday_str = next_fday.strftime('%Y-%m-%d')
                yield self.first_request(dep, to, dt, days_before + 1)
                yield self.first_request(dep, to, next_fday_str,
                                         int(days) - days_before)
            else:
                yield self.first_request(dep, to, dt, days)
def process_item(self, item, spider):
    """Normalize items, heartbeat once a minute, and push to the TEST endpoint.

    On a successful push the invalidation list accumulated on the spider
    is flushed as well.
    """
    # item['segments'] = '[]'
    item = dataUtil.strip_item(item)
    item = dataUtil.keys_for_short(item)
    self.store.append(dict(item))
    run_time = time.time()
    # Heartbeat at most once per minute.
    if run_time - self.interval >= 60:
        self.interval = run_time
        permins = spider.crawler.stats.get_value('permins')
        print(
            pubUtil.heartbeat(spider.host_name, spider.name, spider.num,
                              permins, spider.version))
    # NOTE(review): `1 or ...` forces a push on EVERY item — looks like a
    # debug leftover that bypasses the PUSH_DATA_NUM batching; confirm intent.
    if 1 or len(self.store) >= settings.PUSH_DATA_NUM:
        add_success = pubUtil.addData('add', self.store,
                                      settings.PUSH_DATA_URL_TEST,
                                      spider.host_name,
                                      carrier=spider.name.upper())
        if add_success:
            self.store = []
            invalid_success = pubUtil.invalidData(
                'invalid', spider.task,
                settings.PUSH_DATA_URL_TEST + 'carrier=%s' % spider.name,
                spider.host_name)
            if invalid_success:
                spider.task = []
def process_item(self, item, spider):
    """Normalize items, heartbeat once a minute, and push batches.

    Batch size comes from `spider.push_data_num` when present, otherwise
    from settings.PUSH_DATA_NUM; a successful push also flushes the
    spider's invalidation list.
    """
    item = dataUtil.strip_item(item)
    item = dataUtil.keys_for_short(item)
    run_time = time.time()
    # Heartbeat at most once per minute.
    if run_time - self.interval >= 60:
        self.interval = run_time
        permins = spider.crawler.stats.get_value('permins')
        print(pubUtil.heartbeat(spider.host_name, spider.name, spider.num,
                                permins, spider.version))
    self.store.append(dict(item))
    # Per-spider override of the push batch size.
    if hasattr(spider, 'push_data_num'):
        num = spider.push_data_num
    else:
        num = settings.PUSH_DATA_NUM
    if len(self.store) >= num:
        url = dataUtil.get_random_url(settings.PUSH_DATA_URL)
        add_success = pubUtil.addData('add', self.store, url,
                                      spider.host_name,
                                      carrier=spider.carrier)
        if add_success:
            self.store = []
            if len(spider.task):
                # Small pause before the follow-up invalidation call.
                time.sleep(0.5)
                invalid_success = pubUtil.invalidData(
                    'invalid', spider.task,
                    url + 'carrier=%s' % spider.name, spider.host_name)
                if invalid_success:
                    spider.task = []
def start_requests(self):
    """Endlessly pull tasks and yield one FormRequest per travel date."""
    permins = 0
    print(
        pubUtil.heartbeat(self.host_name, self.name, self.num, permins,
                          self.version))
    result_iter = None
    while True:
        if hasattr(self, 'local'):
            # Lazily create the local task iterator on first use.
            if not result_iter:
                result_iter = pubUtil.get_task(self.name)
            result = next(result_iter)
        else:
            result = pubUtil.getUrl(self.carrier, 1)
        if not result:
            time.sleep(3)
            continue
        for data in result:
            # Task tuple: start date, departure, arrival, day span.
            (dt_st, dep, arr, days) = vyUtil.analysisData(data)
            for i in range(int(days)):
                dt = vyUtil.get_real_date(dt_st, i)
                pay_load = dict(
                    depCity1=dep,
                    arrCity1=arr,
                    depDate1=dt,
                )
                # Merge in the carrier's static form fields.
                pay_load.update(self.custom_settings.get('PAY_LOAD'))
                yield scrapy.FormRequest(
                    self.start_url,
                    formdata=pay_load,
                    meta={'payload': pay_load},
                    callback=self.parse,
                    dont_filter=True,
                    errback=self.err_back,
                )
def start_requests(self):
    """Endlessly pull tasks and yield one GET request per travel date."""
    permins = 0
    print(pubUtil.heartbeat(self.host_name, self.name, self.num, permins,
                            self.version))
    result_iter = None
    while True:
        if hasattr(self, 'local'):
            if not result_iter:
                result_iter = pubUtil.get_task(self.name)
            result = next(result_iter)
        else:
            result = pubUtil.getUrl(self.carrier, 1)
        if not result:
            time.sleep(3)
            continue
        for data in result:
            # Task tuple: start date, departure, arrival, day span.
            (dt_st, dep, arr, days) = vyUtil.analysisData(data)
            for i in range(int(days)):
                dt = vyUtil.get_real_date(dt_st, i)
                params = {
                    'origination-airport': dep,
                    'destination-airport': arr,
                    'departure-date': dt,
                    'number-adult-passengers':
                        self.custom_settings.get('SEAT_SEARCH'),
                    'number-senior-passengers': 0,
                    'currency': 'USD',
                }
                # start_urls is used as a string prefix here — TODO confirm.
                total_url = self.start_urls + parse.urlencode(params)
                yield scrapy.Request(
                    url=total_url,
                    method="GET",
                    dont_filter=True,
                    callback=self.parse,
                    errback=self.errback,
                )
def start_requests(self):
    """Endlessly pull tasks and yield one GraphQL-style POST per travel date."""
    permins = 0
    print(
        pubUtil.heartbeat(self.host_name, self.name, self.num, permins,
                          self.version))
    result_iter = None
    while True:
        if hasattr(self, 'local'):
            if not result_iter:
                result_iter = pubUtil.get_task(self.name)
            result = next(result_iter)
        else:
            result = pubUtil.getUrl(self.carrier, 1)
        if not result:
            time.sleep(6)
            continue
        for data in result:
            # Task tuple: start date ('YYYYMMDD'), departure, arrival, day span.
            (_dt, dep, to, days) = vyUtil.analysisData(data)
            for i in range(int(days)):
                dt = (datetime.strptime(_dt, '%Y%m%d') +
                      timedelta(days=i)).strftime('%Y-%m-%d')
                # dt, dep, to = '2019-02-28', 'BLR', 'BKK'
                # Copy the template so the %-substitution doesn't mutate settings.
                post_data = self.custom_settings.get(
                    'POST_DATA_FORMAT').copy()
                post_data['query'] = post_data.get('query') % (self.seats, to,
                                                               dep, dt)
                # NOTE(review): no callback/errback given — relies on Scrapy's
                # default of routing responses to self.parse.
                yield scrapy.Request(
                    url=self.start_urls,
                    method="POST",
                    body=json.dumps(post_data),
                    meta={'post_data': post_data},
                    dont_filter=True,
                )
def start_requests(self):
    """Endlessly pull 'je' tasks and yield one JSON POST per (date, dep, arr)."""
    permins = 0
    logging.info(
        pubUtil.heartbeat(self.host_name, self.name, self.num, permins,
                          self.version))
    while True:
        result = pubUtil.getUrl('je', 1)
        if not result:
            logging.info('get task error')
            time.sleep(10)
            continue
        for data in result:
            # logging.info("###input data: " + data)
            (dt, dep, to) = pubUtil.analysisData(data)
            # Payload for the target availability endpoint; booleans are
            # string literals as the site expects.
            post_data = {
                "AgencyCode": "",
                "AirportFrom": dep,
                "AirportTo": to,
                "BoardDate": dt,
                "CarPackage": 'false',
                "ReturnDate": "",
                "SearchType": "Normal",
                "AvailType": "",
                "IsReturnFlight": 'false',
                "IsBusiness": 'false',
                "Adults": self.ADT,
                "Children": "0",
                "Infants": "0",
                "FareDesignator": "",
                "EdgarsClubCard": "",
                "VoyagerState": '0',
                "HaveErrors": 'false',
                "IsChangeBooking": 'false',
                "MomentumClientNumber": "",
                "OutSegmentKeyFromRedirect": "",
                "InSegmentKeyFromRedirect": "",
                "isMobile": 'false',
                "CriteriaSearchType": "Day"
            }
            # Invalidation marker for this task.
            invalid = {
                'date': dt.replace('-', ''),
                'depAirport': dep,
                'arrAirport': to,
                'mins': self.custom_settings.get('INVALID_TIME')
            }
            PayLoad = json.dumps(post_data)
            yield scrapy.Request(self.start_urls[0],
                                 body=PayLoad,
                                 callback=self.parse,
                                 dont_filter=True,
                                 meta={'invalid': invalid},
                                 method='POST',
                                 errback=self.errback)
def start_requests(self):
    """Endlessly pull 'FZ' tasks and yield one JSON POST per (date, dep, arr)."""
    permins = 0
    print(pubUtil.heartbeat(self.host_name, self.name, self.num, permins,
                            self.version))
    while True:
        result = pubUtil.getUrl('FZ', 5)
        if not result:
            logging.info('get task error')
            time.sleep(10)
            continue
        for data in result:
            (dt, dep, to) = pubUtil.analysisData(data)
            # for data in self.get_task():
            #     dep, to, dt = data
            # dt,dep,to= '2018-09-13','DXB','BEY'
            # Site expects 'MM/DD/YYYY' in the search criteria.
            dt_change = datetime.strptime(dt, '%Y-%m-%d').strftime('%m/%d/%Y')
            print(dep, to, dt)
            seat = self.custom_settings.get('SEAT')
            payload = {
                "journeyType": "ow",
                "isOriginMetro": False,
                "isDestMetro": False,
                "variant": "0",
                "searchCriteria": [{
                    "origin": dep,
                    "dest": to,
                    "originDesc": "",
                    "destDesc": "",
                    "isOriginMetro": False,
                    "isDestMetro": False,
                    "direction": "outBound",
                    "date": "%s 12:00 AM" % dt_change
                }],
                "paxInfo": {
                    "adultCount": seat,
                    "infantCount": 0,
                    "childCount": 0
                }
            }
            # Invalidation marker for this task.
            invalid = {
                'date': dt.replace('-', ''),
                'depAirport': dep,
                'arrAirport': to,
                'mins': self.custom_settings.get('INVALID_TIME')
            }
            body = json.dumps(payload)
            meta_data = dict(
                invalid=invalid,
                payload=body,
                aaa=(dep, to, dt)
            )
            yield scrapy.Request(self.start_urls[0],
                                 callback=self.parse,
                                 method='POST',
                                 headers=self.custom_settings.get('HEADERS'),
                                 meta={'meta_data': meta_data},
                                 body=body,
                                 errback=self.errback
                                 )
def start_requests(self):
    """Endlessly pull 'BJ' tasks and POST one search per DURATION-day slice."""
    permins = 0
    print(pubUtil.heartbeat(self.host_name, self.name, self.num, permins,
                            self.version))
    while True:
        result = pubUtil.getUrl('BJ', 5)
        if not result:
            logging.info('get task error')
            time.sleep(10)
            continue
        for data in result:
            # Task tuple: date, departure, arrival, day span.
            (dt, dep, to, days) = pubUtil.analysisData_5j(data)
            # dt,dep,to= '2018-11-16','TUN','MRS'
            print(dep, to, dt)
            seat = self.custom_settings.get('SEAT')
            duration = self.custom_settings.get("DURATION")
            # Slice the [dt, dt+days] window into DURATION-day searches.
            for i in range(0, int(days), duration):
                begin_dt, end_dt = pubUtil.time_add_5j(dt, i, duration)
                payload = {
                    'adultes': seat,
                    'aller': begin_dt,
                    'bebes': '0',
                    'devise': 'TND',
                    'enfants': '0',
                    'felxibilite': '3',
                    'retour': '',
                    'sens': '1'
                }
                # Hand-built form body (note: relies on dict iteration order
                # and leaves a trailing '&' — apparently accepted by the site).
                body = ''
                for key in payload:
                    body = body + key + '=' + str(payload.get(key)) + '&'
                url = self.start_urls[0] + '%s/%s' % (dep, to)
                # Invalidation marker for this task.
                invalid = {
                    'date': dt.replace('-', ''),
                    'depAirport': dep,
                    'arrAirport': to,
                    'mins': self.custom_settings.get('INVALID_TIME'),
                }
                meta_data = dict(
                    invalid=invalid,
                    payload=body,
                    aaa=(dep, to, dt),
                    url=url,
                )
                yield scrapy.Request(url,
                                     callback=self.parse,
                                     method='POST',
                                     headers=self.custom_settings.get('HEADERS'),
                                     meta={'meta_data': meta_data},
                                     body=body,
                                     errback=self.errback
                                     )
def start_requests(self):
    """Endlessly pull tasks and POST one availability query per travel date.

    Rotates cookies and InstallationIDs by the current hour, and records
    an invalidation marker on self.task for every date requested.
    """
    permins = 0
    print(
        pubUtil.heartbeat(self.host_name, self.name, self.num, permins,
                          self.version))
    result_iter = None
    while True:
        if hasattr(self, 'local'):
            if not result_iter:
                result_iter = pubUtil.get_task(self.name)
            result = next(result_iter)
        else:
            result = pubUtil.getUrl(self.carrier, 1)
        if not result:
            time.sleep(6)
            continue
        # Rotate credentials hourly (offset by 2).
        hour = datetime.now().hour + 2
        self.cookie = self.cookies[hour % len(self.cookies)]
        installid = self.InstallationID[hour % len(self.InstallationID)]
        for data in result:
            # Task tuple: start date, departure, arrival, day span.
            (dt_st, dep, to, days) = vyUtil.analysisData(data)
            # dep, to = 'CDG', 'VIE'
            for i in range(int(days)):
                dt = vyUtil.get_real_date(dt_st, i)
                # dt = '2018-11-01'
                self.task.append({
                    'date': dt.replace('-', ''),
                    'depAirport': dep,
                    'arrAirport': to,
                    'mins': settings.INVALID_TIME
                })
                dt = dt + 'T00:00:00'
                data_list = {
                    'InstallationID': installid,
                    'AirportDateTimeList': [{
                        'MarketDateDeparture': dt,
                        'DepartureStation': dep,
                        'ArrivalStation': to,
                    }]
                }
                data_list.update(self.custom_settings.get('DEFAULT_DATA'))
                yield scrapy.Request(
                    method='POST',
                    url=self.start_url,
                    headers={'Cookie': self.cookie},
                    body=json.dumps(data_list),
                    meta={'data_list': data_list},
                    callback=self.parse,
                    dont_filter=True,
                    # BUG FIX: bind data_list as a default argument. The
                    # original `lambda x: self.download_errback(x, data_list)`
                    # captured the loop variable by reference, so every
                    # errback fired with the LAST payload generated.
                    errback=lambda x, data_list=data_list: self.download_errback(
                        x, data_list))
def start_requests(self):
    """Endlessly pull tasks of the form 'DEP-ARR:YYYYMMDD:count' and POST
    one search per day in the count window."""
    permins = 0
    print(
        pubUtil.heartbeat(self.host_name, self.name, self.num, permins,
                          self.version))
    result_iter = None
    while True:
        if hasattr(self, 'local'):
            if not result_iter:
                result_iter = pubUtil.get_task(self.name, days=30)
            result = next(result_iter)
        else:
            result = pubUtil.getUrl(self.name)
        if not result:
            time.sleep(6)
            continue
        for data in result:
            # Task format: [u'TLN-CFE:20181110:1'] — route, date, day count.
            task_data_list = data.split(':')
            count = int(task_data_list[2])
            (dt, dep, arr) = pubUtil.analysisData(task_data_list[0] + ':' +
                                                  task_data_list[1])
            _date = datetime.strptime(dt, '%Y-%m-%d')
            for i in range(count):
                date = _date + timedelta(days=i)
                date = date.strftime('%Y%m%d0000')
                # Map airport codes to city codes where a mapping exists.
                dep = self.port_city.get(dep, dep)
                arr = self.port_city.get(arr, arr)
                # logging.info('# input data: ' + dep + '' + arr + '' + date)
                # Invalidation marker for this task.
                invalid = {
                    'date': date.replace('-', ''),
                    'depAirport': dep,
                    'arrAirport': arr,
                    'mins': self.custom_settings.get('INVALID_TIME')
                }
                post_data = urllib.urlencode(
                    ly_post_data.second_post_data(dep, arr, date, self.ADT))
                yield scrapy.Request(self.start_urls[0],
                                     body=post_data,
                                     callback=self.parse,
                                     dont_filter=True,
                                     meta={'invalid': invalid},
                                     errback=self.errback,
                                     method='POST')
def process_item(self, item, spider):
    """Record the item's cities, buffer it, heartbeat periodically, and
    push the buffer once it reaches the configured batch size."""
    self.addCities(item)
    self.store.append(dict(item))

    # Emit a heartbeat no more often than HEARTBEAT_DURATION seconds.
    elapsed = time.time() - self.basic_time
    if elapsed >= settings.HEARTBEAT_DURATION:
        self.basic_time = time.time()
        rate = spider.crawler.stats.get_value('permins')
        print(pubUtil.heartbeat(spider.host_name, spider.carrier,
                                spider.num, rate, spider.version))

    # Ship a full batch and start a fresh buffer.
    if len(self.store) >= settings.PUSH_DATA_NUM:
        pubUtil.pushData('add', self.store)
        self.store = []
def start_requests(self):
    """Endlessly pull 'by' tasks and yield one GET search per (date, dep, arr)."""
    permins = 0
    logging.info(
        pubUtil.heartbeat(self.host_name, self.name, self.num, permins,
                          self.version))
    while True:
        result = pubUtil.getUrl('by', 1)
        if not result:
            logging.info('get task error')
            time.sleep(10)
            continue
        for data in result:
            logging.info("###input data: " + data)
            (dt, dep, to) = pubUtil.analysisData(data)
            # NOTE(review): current task is stashed on instance attributes;
            # with concurrent requests these would be shared state.
            self.dep = dep
            self.arr = to
            self.date = dt
            second_data = {
                'flyingFrom[]': self.dep,
                'flyingTo[]': self.arr,
                'depDate': self.date,
                'returnDate': '',
                'adults': self.ADT,
                'children': '0',
                'infants': '0',
                'infantAge': '',
                'isOneWay': 'true',
                'childAge': '',
                'searchType': 'selected',
                'tabId': dep,
                'cycleDates': dt,
                'duration': '0'
            }
            second_url = '%s%s' % (self.second_url[0],
                                   urllib.urlencode(second_data))
            # Invalidation marker for this task.
            invalid = {
                'date': self.date.replace('-', ''),
                'depAirport': self.dep,
                'arrAirport': self.arr,
                'mins': self.custom_settings.get('INVALID_TIME')
            }
            yield scrapy.Request(second_url,
                                 callback=self.parse,
                                 dont_filter=True,
                                 meta={'invalid': invalid},
                                 errback=self.errback)
def start_requests(self):
    """Endlessly pull 'TR' tasks and POST one form-encoded search per task.

    NOTE: uses a Python 2 print statement below — this module is Python 2.
    """
    permins = 0
    print(pubUtil.heartbeat(self.host_name, self.name, self.num, permins,
                            self.version))
    result_iter = None
    result = None
    while True:
        if hasattr(self, 'local'):
            if not result_iter or not result:
                # Local task source (iterator recreated when exhausted).
                result_iter = self.get_task()
                result = next(result_iter)
        else:
            result = pubUtil.getUrl('TR', 5)
        if not result:
            logging.info('get task error')
            time.sleep(10)
            continue
        for data in result:
            (dt, dep, to) = pubUtil.analysisData(data)
            # dt, dep, to = '2019-01-12', 'SIN', 'TAO'
            print dt, dep, to
            seat = self.custom_settings.get('SEAT')
            querystring = {
                'adt': seat,
                'arcity': to,
                'chd': '0',
                'dpcity': dep,
                'dpdate': dt,
                'inft': '0',
                'promo': '',
                'type': '1'
            }
            # Hand-built body; relies on dict iteration order and keeps a
            # trailing '&' (apparently accepted by the site).
            data = ''
            for key in querystring:
                data = data + key + '=' + str(querystring.get(key)) + '&'
            # Invalidation marker for this task.
            invalid = {
                'date': dt.replace('-', ''),
                'depAirport': dep,
                'arrAirport': to,
                'mins': self.custom_settings.get('INVALID_TIME')
            }
            meta_data = dict(
                invalid=invalid,
                data=data,
            )
            yield scrapy.Request(self.start_urls[0],
                                 callback=self.parse,
                                 headers=self.custom_settings.get('HEADERS'),
                                 method='POST',
                                 meta={'meta_data': meta_data},
                                 body=data,
                                 errback=self.errback)
def start_requests(self):
    """Endlessly pull 'TT' tasks and POST one JSON search per task,
    rotating the User-Agent per request."""
    permins = 0
    print(
        pubUtil.heartbeat(self.host_name, self.name, self.num, permins,
                          self.version))
    result_iter = None
    result = None
    while True:
        if hasattr(self, 'local'):
            if not result_iter or not result:
                result_iter = self.get_task()
                result = next(result_iter)
        else:
            result = pubUtil.getUrl('TT', 5)
        if not result:
            logging.info('get task error')
            time.sleep(10)
            continue
        for data in result:
            (dt, dep, to) = pubUtil.analysisData(data)
            # dt, dep, to = '2019-01-25', 'SYD', 'PER'
            seat = self.custom_settings.get('SEAT')
            payload = {
                'currencyCode': 'AUD',
                'departureDate': dt,
                'destination': to,
                'numAdults': seat,
                'numChildren': 0,
                'numInfants': 0,
                'origin': dep,
                'promoCode': ''
            }
            # Invalidation marker for this task.
            invalid = {
                'date': dt.replace('-', ''),
                'depAirport': dep,
                'arrAirport': to,
                'mins': self.custom_settings.get('INVALID_TIME')
            }
            meta_data = dict(
                invalid=invalid,
                payload=payload,
            )
            headers = self.custom_settings.get('HEADERS')
            # headers['User-Agent'] = random.choice(self.ua_data)[0]
            # Build a fresh User-Agent per request.
            headers['User-Agent'] = self.ua_construction()
            yield scrapy.Request(self.start_urls[0],
                                 callback=self.parse,
                                 headers=headers,
                                 method='POST',
                                 meta={'meta_data': meta_data},
                                 body=json.dumps(payload),
                                 errback=self.errback)
def start_requests(self):
    """Endlessly pull tasks and POST one one-way JSON search per task,
    randomizing the User-Agent each time."""
    permins = 0
    logging.info(
        pubUtil.heartbeat(self.host_name, self.name, self.num, permins,
                          self.version))
    result_iter = None
    while True:
        if hasattr(self, 'local'):
            if not result_iter:
                result_iter = pubUtil.get_task(self.name, days=30)
            result = next(result_iter)
        else:
            result = pubUtil.getUrl(self.name, 1)
        if not result:
            logging.info('get task error')
            time.sleep(20)
            continue
        for data in result:
            (dt, dep, to) = pubUtil.analysisData(data)
            # dep, to, dt = 'FUK', 'YNT', '2019-03-27'
            post_data = {
                "tripType": "OW",
                "orgCode": dep,
                "dstCode": to,
                "takeoffdate1": dt,
            }
            # Random User-Agent per request.
            ua = UserAgent()
            self.headers['User-Agent'] = ua.random
            # post_data = urllib.urlencode(post_data)
            # logging.info("###input data: " + dep + to + dt)
            # Invalidation marker for this task.
            invalid = {
                'date': dt.replace('-', ''),
                'depAirport': dep,
                'arrAirport': to,
                'mins': self.custom_settings.get('INVALID_TIME')
            }
            yield scrapy.Request(
                self.start_urls[0],
                headers=self.headers,
                body=json.dumps(post_data),
                # body=post_data,
                callback=self.parse,
                dont_filter=True,
                # meta={'invalid': invalid, 'proxy': 'http://127.0.0.1:8888'},
                meta={'invalid': invalid},
                method='POST',
                errback=self.errback)
def start_requests(self):
    """Endlessly pull tasks of the form 'DEP-ARR-YYYYMMDDhhmm-count' and
    request a session before the actual search (see get_session)."""
    permins = 0
    logging.info(
        pubUtil.heartbeat(self.host_name, self.name, self.num, permins,
                          self.version))
    result_iter = None
    while True:
        if hasattr(self, 'local'):
            if not result_iter:
                result_iter = pubUtil.get_task(self.name, step=7)
            result = next(result_iter)
        else:
            result = pubUtil.getUrl(self.name, 1)
        if not result:
            logging.info('get task error')
            time.sleep(10)
            continue
        # Loop over the fetched tasks (currently one per fetch by default).
        for data in result:
            # Task format: BVE-LYS-201812030000-15 — route, datetime, day count.
            count = int(data.split(':')[-1])
            (date, dep, arr) = pubUtil.analysisData(data[:-2])
            date = date.replace('-', '') + '0000'
            # logging.info('# input data: ' + dep + '-' + arr + '-' + date + '-' + str(count))
            task_data = {
                'dep': dep,
                'arr': arr,
                'date': date,
                'count': count
            }
            post_data = urllib.urlencode(
                a5_post_data.first_post_data(dep, arr, date, self.ADT))
            # Obtain a session first; the real search is issued in get_session.
            yield scrapy.Request(
                self.get_session_url[0],
                body=post_data,
                callback=self.get_session,
                dont_filter=True,
                meta={
                    'post_data': post_data,
                    'task_data': task_data
                },
                method='POST',
                errback=self.errback,
            )
def start_requests(self):
    """Endlessly pull tasks and POST one form-encoded search per task,
    rotating the session token from the id pool each batch."""
    permins = 0
    logging.info(
        pubUtil.heartbeat(self.host_name, self.name, self.num, permins,
                          self.version))
    result_iter = None
    while True:
        if hasattr(self, 'local'):
            if not result_iter:
                result_iter = pubUtil.get_task(self.name, days=30)
            result = next(result_iter)
        else:
            result = pubUtil.getUrl(self.name, 1)
        if not result:
            logging.info('get task error')
            time.sleep(20)
            continue
        # Pick a fresh session token for this batch of tasks.
        self.session_data['tck'] = random.choice(self.id_pool)
        for data in result:
            # logging.info("###input data: " + data)
            (dt, dep, to) = pubUtil.analysisData(data)
            # dt,dep,to='2019-02-28','CAN','RGN'
            post_data = {
                'traveldate': dt,
                'ori': dep,
                'currency': 'CNY',
                'dest': to
            }
            # Invalidation marker for this task.
            invalid = {
                'date': dt.replace('-', ''),
                'depAirport': dep,
                'arrAirport': to,
                'mins': self.custom_settings.get('INVALID_TIME')
            }
            post_data = urllib.urlencode(post_data)
            yield scrapy.Request(self.start_urls[0],
                                 headers=self.headers,
                                 body=post_data,
                                 callback=self.parse,
                                 dont_filter=True,
                                 meta={'invalid': invalid},
                                 method='POST',
                                 errback=self.errback)
def start_requests(self):
    """Endlessly pull 'LA' tasks and yield one GET search per (date, dep, arr)."""
    permins = 0
    print(
        pubUtil.heartbeat(self.host_name, self.name, self.num, permins,
                          self.version))
    while True:
        result = pubUtil.getUrl('LA', 5)
        if not result:
            logging.info('get task error')
            time.sleep(10)
            continue
        for data in result:
            (dt, dep, to) = pubUtil.analysisData(data)
            # dt,dep,to= '2018-10-16','CWB','SAO'
            # Site country code (variable name 'currency' is a misnomer).
            currency = "BR"
            seat = self.custom_settings.get('SEAT')
            querystring = {
                "country": currency,
                "origin": dep,
                "destination": to,
                "departure": dt,
                "adult": seat,
            }
            # Hand-built query string; relies on dict iteration order and
            # leaves a trailing '&' (apparently accepted by the site).
            url = self.start_urls[0] + '?'
            for key in querystring:
                url = url + key + '=' + str(querystring.get(key)) + '&'
            # Invalidation marker for this task.
            invalid = {
                'date': dt.replace('-', ''),
                'depAirport': dep,
                'arrAirport': to,
                'mins': self.custom_settings.get('INVALID_TIME')
            }
            meta_data = dict(invalid=invalid,
                             params=querystring,
                             aaa=(dep, to, dt),
                             flight_time=dt)
            yield scrapy.Request(
                url,
                callback=self.parse,
                method='GET',
                headers=self.custom_settings.get('HEADERS'),
                meta={'meta_data': meta_data},
                errback=self.errback)
def start_requests(self):
    """Endlessly pull 'KC' tasks, rewrite the date to dd.mm.yyyy, and POST
    a session request before the data request (see data_requests)."""
    permins = 0
    print(
        pubUtil.heartbeat(self.host_name, self.name, self.num, permins,
                          self.version))
    while True:
        result = pubUtil.getUrl('KC', 5)
        if not result:
            logging.info('get task error')
            time.sleep(10)
            continue
        for data in result:
            (dt, dep, arr) = pubUtil.analysisData(data)
            # Rewrite 'YYYY-MM-DD' to the site's 'DD.MM.YYYY' format.
            dt = re.sub(r'(\d{4})-(\d{2})-(\d{2})', r'\3.\2.\1', dt)
            # dt = time.strftime('%d.%m.%Y',dt)
            print(dep, arr, dt)
            payload = {
                'captchaResponse': '',
                'pReturnDateStr': '',
                'pFlightDateStr': dt,
                'pRequest': {
                    'TwoWayRoute': 'false',
                    'DateAreFlexible': 'true',
                    'Origin': dep,
                    'Destination': arr,
                    'Bookingclass': 'ECO',
                    'Adult': '3',
                    'Child': '0',
                    'Infant': '0',
                    'Resident': 'false'
                },
            }
            # NOTE(review): dt is now 'DD.MM.YYYY', so replace('-', '') is a
            # no-op and 'date' keeps the dots — unlike the other spiders'
            # 'YYYYMMDD' markers. Confirm the downstream expectation.
            invalid = {
                'date': dt.replace('-', ''),
                'depAirport': dep,
                'arrAirport': arr,
                'mins': self.custom_settings.get('INVALID_TIME')
            }
            meta_data = dict(invalid=invalid, payload=payload)
            yield scrapy.Request(
                self.custom_settings.get('sessionID_url'),
                callback=self.data_requests,
                method='POST',
                headers=self.custom_settings.get('start_headers'),
                meta={'meta_data': meta_data},
                body=json.dumps(payload),
                errback=self.errback)
def start_requests(self):
    """Endlessly pull tasks and POST one availability query per task."""
    permins = 0
    print(
        pubUtil.heartbeat(self.host_name, self.name, self.num, permins,
                          self.version))
    while True:
        result = pubUtil.getUrl(self.name, 1)
        if not result:
            logging.info('get task error')
            time.sleep(10)
            continue
        for data in result:
            # Task tuple: date, departure, arrival, day span.
            (dt, dep, to, days) = ddUtil.analysisData(data)
            # (dt_st, dep, to, days) = '20180510', 'DMK', 'AM1', 20
            data_dict = {
                'GetAvailabilityDetail': {
                    "Infant": 0,
                    "DepartureAirport": dep,
                    "ArrivalAirport": to,
                    "Child": 0,
                    "Currency": 'THB',
                    "RoundTripFlag": "0",
                    "Adult": 3,
                    "AgencyCode": "",
                    "ReturnDate": "",
                    "BoardDate": dt,
                    "PromotionCode": ""
                }
            }
            data_dict.update(self.custom_settings.get('DEFAULT_DATA'))
            yield scrapy.Request(
                method='POST',
                url=self.start_urls,
                body=json.dumps(data_dict),
                meta={'data_dict': data_dict},
                callback=self.parse,
                dont_filter=True,
                # BUG FIX: bind data_dict as a default argument. The original
                # `lambda x: self.download_errback(x, data_dict)` captured the
                # loop variable by reference, so every errback fired with the
                # LAST payload generated.
                errback=lambda x, data_dict=data_dict: self.download_errback(
                    x, data_dict))
def start_requests(self):
    """Endlessly pull tasks and yield one request per travel date, while
    recording an invalidation marker on self.task for each date."""
    permins = 0
    print(
        pubUtil.heartbeat(self.host_name, self.carrier, self.num, permins,
                          self.version))
    while True:
        result = pubUtil.getUrl(self.carrier, 5)
        if not result:
            logging.info('get task error')
            time.sleep(3)
            continue
        for data in result:
            # Task tuple: start date ('YYYYMMDD'), departure, arrival, day span.
            (_dt, dep, to, days) = vyUtil.analysisData(data)
            for i in range(int(days)):
                dt = (datetime.strptime(_dt, '%Y%m%d') +
                      timedelta(days=i)).strftime('%Y/%m/%d')
                self.task.append({
                    'date': dt.replace('/', ''),
                    'depAirport': dep,
                    'arrAirport': to,
                    'mins': settings.INVALID_TIME
                })
                post_data = {
                    "flight_search_parameter[0][departure_date]": dt,
                    "flight_search_parameter[0][departure_airport_code]": dep,
                    "flight_search_parameter[0][arrival_airport_code]": to,
                }
                post_data.update(
                    self.custom_settings.get('DEFAULT_POST_DATA'))
                # print(post_data)
                # NOTE(review): the request is issued as GET and post_data only
                # travels in meta (body/formdata lines are commented out) —
                # presumably the parse callback performs the real search.
                yield scrapy.Request(
                    url=self.start_urls[1],
                    method="GET",
                    # body=json.dumps(post_data),
                    # formdata=post_data,
                    meta={'post_data': post_data},
                    dont_filter=True,
                    callback=self.parse,
                    errback=self.errback,
                )
def start_requests(self):
    """Endlessly pull 'BE' tasks and yield one GET search per task,
    skipping dates that are already invalid."""
    permins = 0
    print(
        pubUtil.heartbeat(self.host_name, self.carrier, self.num, permins,
                          self.version))
    result_iter, result = None, None
    while True:
        if hasattr(self, 'local'):
            if not result_iter or not result:
                result_iter = self.get_task()
                result = next(result_iter)
        else:
            result = pubUtil.getUrl('BE', 10)
        if not result:
            time.sleep(60)
            continue
        for data in result:
            # Task tuple: date, departure, arrival.
            (dt, dep, to) = pubUtil.analysisData(data)
            # dt, dep, to = '2018-11-01', 'EXT', 'JER'
            # Skip dates already past the validity window.
            if pubUtil.dateIsInvalid(dt):
                continue
            temp = {
                'depart': dep,
                'arr': to,
                'departing': dt,
                'returning': '',
                'promo-code': '',
                'adults': 3,
                'teens': 0,
                'children': 0,
                'infants': 0
            }
            # Support both Python 3 (urllib.parse) and Python 2 (urllib).
            try:
                params = urllib.parse.urlencode(temp)
            except:
                params = urllib.urlencode(temp)
            url = '%s%s/%s?%s' % (self.start_urls, dep, to, params)
            yield scrapy.Request(url,
                                 callback=self.parse,
                                 dont_filter=True,
                                 errback=self.err_back)
def process_item(self, item, spider):
    """Normalize items, heartbeat once a minute, push batches, and (for
    'spe' spiders) log a summary of what was pushed."""
    # item['segments'] = '[]'
    item = dataUtil.strip_item(item)
    item = dataUtil.keys_for_short(item)
    run_time = time.time()
    # Heartbeat at most once per minute.
    if run_time - self.interval >= 60:
        self.interval = run_time
        permins = spider.crawler.stats.get_value('permins')
        print(
            pubUtil.heartbeat(spider.host_name, spider.name, spider.num,
                              permins, spider.version))
    self.store.append(dict(item))
    # Per-spider override of the push batch size.
    if hasattr(spider, 'push_data_num'):
        num = spider.push_data_num
    else:
        num = settings.PUSH_DATA_NUM
    if len(self.store) >= num:
        url = dataUtil.get_random_url(settings.PUSH_DATA_URL)
        add_success = pubUtil.addData('add', self.store, url,
                                      spider.host_name,
                                      carrier=spider.name.upper())
        if add_success:
            # Special spiders log a human-readable summary of the batch.
            if hasattr(spider, 'spe'):
                push_data_log = [
                    '%s->%s:%s %s' % (data.get('depAirport'),
                                      data.get('arrAirport'),
                                      time.strftime(
                                          '%Y-%m-%d',
                                          time.localtime(
                                              data.get('depTime'))),
                                      data.get('flightNumber'))
                    for data in self.store
                ]
                spider.log('push task :' + str(push_data_log), 20)
            self.store = []
            invalid_success = pubUtil.invalidData(
                'invalid', spider.task, url + 'carrier=%s' % spider.name,
                spider.host_name)
            if invalid_success:
                spider.task = []
def start_requests(self):
    """Endlessly pull 'TW' tasks and yield one GET request (via transit)
    per (date, dep, arr)."""
    permins = 0
    print(pubUtil.heartbeat(self.host_name, self.name, self.num, permins,
                            self.version))
    result_iter = None
    while True:
        if hasattr(self, 'local'):
            if not result_iter:
                result_iter = pubUtil.get_task(self.name, days=30)
            result = next(result_iter)
        else:
            result = pubUtil.getUrl('TW', 5)
        if not result:
            logging.info('get task error')
            time.sleep(10)
            continue
        today = datetime.now().strftime('%Y%m%d')
        for data in result:
            # Task tuple: date, departure, arrival.
            (dt, dep, to) = pubUtil.analysisData(data)
            # dt, dep, to = '20180722', 'ICN', 'KIX'
            params = urllib.urlencode(dict(
                origin=dep,
                destination=to,
                onwardDateStr=dt.replace('-', ''),
                # pointOfPurchase='KR',
                paxTypeCountStr='3,0,0',  # adults,children,infants
                today=today,
                travelType='OW',
                searchType='byDate',
                # domesticYn='Y',
                bundleAmountOW=0,
                bundleAmountRT=0,
                routeCls='AS',
                _=int(time.time() * 1000)  # cache-busting timestamp
            ))
            total_url = self.start_urls[0] + params
            yield scrapy.Request(url=total_url,
                                 callback=self.transit,
                                 meta={'params': params, 'flag': 1},
                                 dont_filter=True)
def start_requests(self):
    """Endlessly pull tasks and POST one date-availability query per task.

    The response is handled by date_parse, which receives the window end
    date via meta.
    """
    permins = 0
    print(
        pubUtil.heartbeat(self.host_name, self.name, self.num, permins,
                          self.version))
    # Prepare request headers once before the task loop.
    self.get_headers()
    while True:
        result = pubUtil.getUrl(self.name, 1)
        if not result:
            logging.info('get task error')
            time.sleep(10)
            continue
        for data in result:
            # Task tuple: start date ('YYYYMMDD'), departure, arrival, day span.
            (dt, dep, to, days) = vyUtil.analysisData(data)
            # (dt, dep, to, days) = ('20181026', 'LTN', 'IAS', 30)
            dt_datetime = datetime.strptime(dt, '%Y%m%d')
            end_date = dt_datetime + timedelta(days=int(days))
            dt = dt_datetime.strftime('%Y-%m-%d')
            data_post = dict(
                DepartureDate=dt,
                DepartureStation=dep,
                ArrivalStation=to,
            )
            data_post.update(self.custom_settings.get('GET_DATE_DATA'))
            yield scrapy.Request(
                method='POST',
                url=self.start_urls[0],
                # formdata=data_post,
                body=json.dumps(data_post),
                headers=self.custom_settings.get(
                    'DEFAULT_REQUEST_HEADERS'),
                meta={'end_date': end_date},
                dont_filter=True,
                callback=self.date_parse,
                # BUG FIX: bind data_post and end_date as default arguments.
                # The original `lambda x: self.download_errback(x, data_post,
                # end_date)` captured the loop variables by reference, so every
                # errback fired with the LAST task's payload and end date.
                errback=lambda x, data_post=data_post, end_date=end_date:
                    self.download_errback(x, data_post, end_date),
            )
def process_item(self, item):
    """Buffer items, push batches of 5+ for carrier 'JQ', and heartbeat
    once a minute with the number of items pushed in that window."""
    self.buffer.append(item)
    if len(self.buffer) >= 5:
        # # Test DB push, kept for reference:
        # url = '%scarrier=%s' % (settings.PUSH_DATA_URL_TEST, item["carrier"])
        # # Production DB:
        # # url = '%scarrier=%s' % (settings.PUSH_DATA_URL, item["carrier"])
        # data = {
        #     "action": "add",
        #     "data": self.buffer
        #
        # }
        # response = requests.post(url, data=json.dumps(data), timeout=2 * 60, verify=False)
        # logging.info("%s,%s" % (response.content, len(self.buffer)))
        url = dataUtil.get_random_url(settings.PUSH_DATA_URL)
        add_success = pubUtil.addData('add', self.buffer, url, self.name,
                                      'JQ')
        # Count items as pushed before the buffer is cleared.
        self.item_num += len(self.buffer)
        if add_success:
            self.buffer = []
            invalid_success = pubUtil.invalidData(
                'invalid', self.task, url + 'carrier=%s' % 'JQ', self.name)
            if invalid_success:
                self.task = []
    # Heartbeat at most once per minute; permins is the count since last beat.
    run_time = time.time()
    if run_time - self.now >= 60:
        permins = self.item_num
        self.item_num = 0
        print(
            pubUtil.heartbeat('%s' % (self.name), 'jq', '%s' % self.num,
                              permins, self.version))
        self.now = run_time