def start_requests(self):
    permins = 0
    # Initial heartbeat (log level 20 = INFO).
    self.log(
        pub_util.heartbeat(self.host_name, self.name, self.num, permins, self.version), 20)
    result_iter = None
    while True:
        # In local mode, pull tasks from the task generator; otherwise fetch one URL batch.
        if hasattr(self, 'local'):
            if not result_iter:
                result_iter = pub_util.get_task(self.name, days=10)
            result = next(result_iter)
        else:
            result = pub_util.get_url(self.name, 1)
        if not result:
            self.log('get task error', 30)
            time.sleep(10)
            continue
        for data in result:
            # Parse one task string, e.g. "JED-AHB:20181218:1".
            dt, dep, arr, days = data_util.parse_data(data)
            this_day = datetime.strptime(dt, '%Y%m%d')
            for diff_day in range(int(days)):
                dt_format = (this_day + timedelta(days=diff_day)).strftime('%Y-%m-%d')
                params = {
                    # Per-request query parameters.
                    "availabilityRequests[0].departureStation": dep,
                    "availabilityRequests[0].arrivalStation": arr,
                    "availabilityRequests[0].beginDate": dt_format,
                    "availabilityRequests[0].endDate": dt_format,
                }
                params.update(self.custom_settings.get('DEFAULT_PARAMS'))
                # urllib.urlencode is the Python 2 spelling; on Python 3 this is urllib.parse.urlencode.
                url = self.start_url + urllib.urlencode(params)
                yield scrapy.Request(
                    url=url,
                    dont_filter=True,
                    callback=self.parse,
                    errback=self.err_back,
                )
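# For reference, a minimal sketch of how a task string such as "JED-AHB:20181218:1"
# could be decomposed into the (dt, dep, arr, days) tuple unpacked above. The real
# data_util.parse_data may differ; this only illustrates the assumed
# "DEP-ARR:YYYYMMDD:days" task format.
def parse_data_sketch(data):
    route, dt, days = data.split(':')   # "JED-AHB", "20181218", "1"
    dep, arr = route.split('-')         # "JED", "AHB"
    return dt, dep, arr, days

# parse_data_sketch("JED-AHB:20181218:1") -> ("20181218", "JED", "AHB", "1")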
def process_item(self, item, spider):
    item = data_util.strip_item(item)
    item = data_util.keys_to_short(item)
    self.store.append(item)
    run_time = time.time()
    # Emit a heartbeat at most once per minute.
    if run_time - self.interval >= 60:
        self.interval = run_time
        permins = spider.crawler.stats.get_value('permins')
        print(pub_util.heartbeat(spider.host_name, spider.name, spider.num,
                                 permins, spider.version))
    num = settings.PUSH_DATA_NUM
    # "1 or" short-circuits the batch threshold, so every item is pushed immediately;
    # drop it to batch up to PUSH_DATA_NUM items per push.
    if 1 or len(self.store) >= num:
        add_success = pub_util.operate_data('add', self.store, self.url, spider.host_name,
                                            carrier=spider.name.upper())
        if add_success:
            self.store = []
        invalid_success = pub_util.operate_data('invalid', spider.task, self.url, spider.host_name,
                                                carrier=spider.name.upper())
        if invalid_success:
            spider.task = []
    # Keep the item flowing to any later pipelines.
    return item
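# The pipeline above is essentially a time-throttled heartbeat plus a batch pusher.
# A standalone sketch of the same pattern; the names (BatchPusher, push, batch_size)
# are illustrative, not the project's real API.
import time

class BatchPusher(object):
    def __init__(self, push, batch_size=100, heartbeat_every=60):
        self.push = push                      # callable that ships a list of items
        self.batch_size = batch_size
        self.heartbeat_every = heartbeat_every
        self.store = []
        self.last_beat = time.time()

    def add(self, item):
        self.store.append(item)
        now = time.time()
        if now - self.last_beat >= self.heartbeat_every:
            self.last_beat = now
            print('heartbeat: %d items buffered' % len(self.store))
        if len(self.store) >= self.batch_size:
            if self.push(self.store):         # only clear the buffer on success
                self.store = []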
def start_requests(self):
    permins = 0
    # Initial heartbeat (log level 20 = INFO).
    self.log(
        pub_util.heartbeat(self.host_name, self.name, self.num, permins, self.version), 20)
    result_iter = None
    while True:
        # In local mode, pull tasks from the task generator; otherwise fetch one URL batch.
        if hasattr(self, 'local'):
            if not result_iter:
                result_iter = pub_util.get_task(self.name, st=1, days=30)
            result = next(result_iter)
        else:
            result = pub_util.get_url(self.name, 1)
        if not result:
            self.log('get task error', 30)
            time.sleep(10)
            continue
        for data in result:
            dt, dep, arr, days = data_util.parse_data(data)
            this_day = datetime.strptime(dt, '%Y%m%d')
            for vary_day in range(int(days)):
                one_day = (this_day + timedelta(days=vary_day)).strftime('%Y-%m-%d')
                # Build the POST body and request metadata for one departure day.
                body1 = self.data_for_time.format(dep, arr, one_day, self.seats)
                flight_data = [dep, arr, one_day, self.seats]
                meta_data = {'flight_data': flight_data, 'oneday': one_day}
                yield scrapy.Request(
                    url=self.start_urls[0],
                    method='POST',
                    body=body1,
                    meta={'meta_data': meta_data},
                    headers=self.headers_for_time,
                    callback=self.parse_time,
                    errback=self.err_back,
                    dont_filter=True,
                )
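# data_for_time is a class attribute holding the raw POST payload template with four
# positional placeholders (departure, arrival, date, seats). Its real content is
# site-specific and not shown here; a purely hypothetical example with assumed field
# names might look like:
data_for_time_example = (
    '{{"departureStation":"{0}","arrivalStation":"{1}",'
    '"flightDate":"{2}","paxCount":{3}}}'
)
# data_for_time_example.format('KHH', 'MZG', '2019-03-23', 1)
# -> '{"departureStation":"KHH","arrivalStation":"MZG","flightDate":"2019-03-23","paxCount":1}'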