Example #1
    # Module-level imports assumed: time, urllib, scrapy, pub_util, data_util,
    # and `from datetime import datetime, timedelta`.
    def start_requests(self):
        permins = 0
        # Emit a startup heartbeat (log level 20 = INFO).
        self.log(
            pub_util.heartbeat(self.host_name, self.name, self.num, permins,
                               self.version), 20)
        result_iter = None
        while True:
            if hasattr(self, 'local'):
                if not result_iter:
                    result_iter = pub_util.get_task(self.name, days=10)
                result = next(result_iter)
            else:
                result = pub_util.get_url(self.name, 1)
            if not result:
                self.log('get task error', 30)  # log level 30 = WARNING
                time.sleep(10)
                continue
            for data in result:

                # Parse one task string, e.g. "JED-AHB:20181218:1"
                # (route, start date, number of days).
                dt, dep, arr, days = data_util.parse_data(data)
                this_day = datetime.strptime(dt, '%Y%m%d')
                for diff_day in range(int(days)):
                    dt_format = (this_day +
                                 timedelta(days=diff_day)).strftime('%Y-%m-%d')

                    params = {  # request-specific query parameters
                        "availabilityRequests[0].departureStation": dep,
                        "availabilityRequests[0].arrivalStation": arr,
                        "availabilityRequests[0].beginDate": dt_format,
                        "availabilityRequests[0].endDate": dt_format,
                    }
                    params.update(self.custom_settings.get('DEFAULT_PARAMS'))
                    url = self.start_url + urllib.urlencode(params)  # Python 2 API; urllib.parse.urlencode on Python 3
                    yield scrapy.Request(
                        url=url,
                        dont_filter=True,
                        callback=self.parse,
                        errback=self.err_back,
                    )
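The pub_util and data_util helpers are project-specific and not shown. Going by the inline comment, a task string looks like "JED-AHB:20181218:1", so data_util.parse_data presumably splits it as in the following sketch (the format and field order are assumptions inferred from that comment and the unpacking above):

    # Hypothetical sketch of data_util.parse_data, assuming the
    # "DEP-ARR:YYYYMMDD:days" task format from the comment above.
    def parse_data(data):
        route, dt, days = data.split(':')   # "JED-AHB", "20181218", "1"
        dep, arr = route.split('-')         # departure / arrival station codes
        return dt, dep, arr, days           # matches the unpacking order above

    # parse_data("JED-AHB:20181218:1") -> ("20181218", "JED", "AHB", "1")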
Example #2
    def process_item(self, item, spider):
        # Normalize the item and buffer it for a batched push.
        item = data_util.strip_item(item)
        item = data_util.keys_to_short(item)
        self.store.append(item)

        # At most once a minute, print a heartbeat with the crawl rate.
        run_time = time.time()
        if run_time - self.interval >= 60:
            self.interval = run_time
            permins = spider.crawler.stats.get_value('permins')
            print(pub_util.heartbeat(spider.host_name, spider.name,
                                     spider.num, permins, spider.version))

        num = settings.PUSH_DATA_NUM
        if len(self.store) >= num:  # flush once a full batch is buffered
            add_success = pub_util.operate_data('add', self.store, self.url, spider.host_name, carrier=spider.name.upper())
            if add_success:
                self.store = []
                # Mark the corresponding tasks as consumed.
                invalid_success = pub_util.operate_data('invalid', spider.task,
                                                        self.url, spider.host_name,
                                                        carrier=spider.name.upper())
                if invalid_success:
                    spider.task = []
        return item
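For context: a Scrapy item pipeline such as this one is enabled through the standard ITEM_PIPELINES setting. A minimal sketch, assuming the class is named PushDataPipeline and lives in myproject/pipelines.py (both names are hypothetical):

    # settings.py -- module path and class name are placeholders
    ITEM_PIPELINES = {
        'myproject.pipelines.PushDataPipeline': 300,  # lower value runs earlier
    }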
Example #3
    def start_requests(self):
        permins = 0
        self.log(
            pub_util.heartbeat(self.host_name, self.name, self.num, permins,
                               self.version), 20)  # heartbeat
        result_iter = None
        while True:
            if hasattr(self, 'local'):
                if not result_iter:
                    result_iter = pub_util.get_task(self.name, st=1, days=30)
                result = next(result_iter)
            else:
                result = pub_util.get_url(self.name, 1)
            if not result:
                self.log('get task error', 30)
                time.sleep(10)
                continue
            for data in result:
                dt, dep, arr, days = data_util.parse_data(data)
                this_day = datetime.strptime(dt, '%Y%m%d')
                for vary_day in range(int(days)):
                    one_day = (this_day +
                               timedelta(days=vary_day)).strftime('%Y-%m-%d')
                    # Fill the POST body template with route, date and seat count.
                    body1 = self.data_for_time.format(dep, arr, one_day,
                                                      self.seats)
                    flight_data = [dep, arr, one_day, self.seats]
                    meta_data = {'flight_data': flight_data, 'oneday': one_day}
                    yield scrapy.Request(
                        url=self.start_urls[0],
                        method='POST',
                        body=body1,
                        meta={'meta_data': meta_data},
                        headers=self.headers_for_time,
                        callback=self.parse_time,
                        errback=self.err_back,
                        dont_filter=True,
                    )
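Request meta travels with the request and is available on the response in the callback; this is standard Scrapy behavior. A hypothetical sketch of how parse_time (not shown in the example) might read it back:

    # Hypothetical parse_time callback; only the response.meta
    # round-trip is taken from the request built above.
    def parse_time(self, response):
        meta_data = response.meta['meta_data']
        dep, arr, one_day, seats = meta_data['flight_data']
        self.log('parsing %s-%s on %s' % (dep, arr, one_day), 20)
        # ... parse response.text and yield items or follow-up requests ...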