Example #1
    def start_requests(self):
        permins = 0
        print(
            pubUtil.heartbeat(self.host_name, self.name, self.num, permins,
                              self.version))
        result_iter = None
        while True:
            if hasattr(self, 'local'):
                if not result_iter:
                    result_iter = pubUtil.get_task(self.name, 60)
                result = next(result_iter)
            else:
                result = pubUtil.getUrl(self.carrier, 1)
            if not result:
                time.sleep(3)
                continue
            for data in result:
                (dt_st, dep, arr, days) = vyUtil.analysisData(data)
                for i in range(int(days)):
                    dt = vyUtil.get_real_date(dt_st, i)
                    data = dict(beginCity=dep, endCity=arr, beginDate=dt)
                    data.update(self.custom_settings.get('DEFAULT_DATA'))
                    yield scrapy.Request(
                        url=self.start_urls,
                        method="POST",
                        body=json.dumps(data),
                        meta=dict(data=data),
                        dont_filter=True,
                        callback=self.parse,
                        errback=self.errback,
                    )
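
All of these spiders poll tasks the same way: in 'local' mode, pubUtil.get_task appears to return an endless generator of task batches that is advanced with next(), while pubUtil.getUrl fetches one batch from the task service. A minimal stand-in for that assumed contract, useful for dry-running the examples (the task format 'TLN-CFE:20181110:1' is taken from comments in the later examples; the stub itself is hypothetical):

import time

def get_task(name, days=1, step=1, st=None):
    # Hypothetical stand-in for pubUtil.get_task: an endless generator of
    # task batches, each a list of 'DEP-ARR:YYYYMMDD:days' strings.
    while True:
        yield ['TLN-CFE:20181110:1']
        time.sleep(1)  # throttle callers that poll in a tight loop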
Example #2
    def start_requests(self):
        permins = 0
        logging.info(
            pubUtil.heartbeat(self.host_name, self.name, self.num, permins,
                              self.version))
        result_iter = None
        while True:
            if hasattr(self, 'local'):
                if not result_iter:
                    result_iter = pubUtil.get_task(self.name, days=1)
                result = next(result_iter)
            else:
                result = pubUtil.getUrl('aq', 1)
            if not result:
                logging.info('get task error')
                time.sleep(20)
                continue
            for data in result:
                # logging.info("###input data: " + data)
                (dt, dep, to) = pubUtil.analysisData(data)

                # dt,dep,to='2019-03-28','PVG','TPE'

                # ua = UserAgent()
                # self.headers['user-agent'] = ua.random
                post_data = 'B_LOCATION_1=' + dep + '&E_LOCATION_1=' + to + '&B_DATE_1=' + dt.replace(
                    '-', ''
                ) + '0000&B_ANY_TIME_1=True&EMBEDDED_TRANSACTION=FlexPricerAvailability&ARRANGE_BY=D&DISPLAY_TYPE=2&PRICING_TYPE=O&SO_SITE_MATRIX_CALENDAR=FALSE&SO_SITE_RUI_CAL_AVAI_NO_RECO=TRUE&SO_SITE_RUI_FP_AVAI_PRESEL=FALSE&COMMERCIAL_FARE_FAMILY_1=NEWECOOW&COMMERCIAL_FARE_FAMILY_2=NEWDELOW&COMMERCIAL_FARE_FAMILY_3=NEWBIZOW&SO_SITE_RUI_AX_CAL_ENABLED=TRUE&SO_SITE_CAL_CHANGE_WEEK=TRUE&SO_SITE_RUI_HIDE_MDF_SRC=FALSE&EXTERNAL_ID%236=OW&TRAVELLER_TYPE_1=ADT&TRIP_TYPE=O&TRIP_FLOW=YES&SO_SITE_EXPORT_CONFIRM=TRUE&SO_SITE_EXPORT_CONF_URL=https%3A%2F%2Fbooking.evaair.com%2Fexporttripplan%2Fwebservice.aspx&SO_SITE_THREEDS_USE=N&SO_SITE_BILLING_NOT_REQUIRED=Y&SO_SITE_BILL_ADD_OPTIONS=BILL_ADD_HIDDEN&SO_SITE_PREBOOK_CANCELLATION=TRUE&SO_GL=%3C%3Fxml+version%3D%221.0%22+encoding%3D%22iso-8859-1%22%3F%3E%0D%0A%3CSO_GL%3E%0D%0A%3CGLOBAL_LIST+mode%3D%22partial%22%3E%0D%0A%3CNAME%3ESL_AIR_MOP%3C%2FNAME%3E%0D%0A%3CLIST_ELEMENT%3E%0D%0A%3CCODE%3ECC%3C%2FCODE%3E%0D%0A%3CLIST_VALUE%3ECredit+Card%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%3EY%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%3ECC%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%3ECryptic%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%3ECC%25T%25I%2F%25E%2F%25C%25F%2FN%25A%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%3EN%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%2F%3E%0D%0A%3CLIST_VALUE%3EN%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%3ECC%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%3EN%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%3EN%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%3EN%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%3EN%3C%2FLIST_VALUE%3E%0D%0A%3CLIST_VALUE%3EN%3C%2FLIST_VALUE%3E%0D%0A%3C%2FLIST_ELEMENT%3E%0D%0A%3C%2FGLOBAL_LIST%3E%0D%0A%3C%2FSO_GL%3E&SO_SITE_FD_DISPLAY_MODE=1&SO_SITE_CURRENCY_FORMAT_JAVA=0&SO_SITE_ENABLE_SRV_POLICY=BAG%2CCOA&SO_SITE_ALLOW_SPEC_REQ_SERV=FALSE&SO_SITE_SD_TRUE_OP_CARRIER=TRUE&SO_SITE_BARCODE_ENABLE=TRUE&SO_SITE_ALLOW_CS_CODE_SHARE=FALSE&SO_SITE_USE_PAYMENT_ACTION=TRUE&EXTERNAL_ID=AIBS&EXTERNAL_ID%232=&EXTERNAL_ID%233=&EXTERNAL_ID%234=NEWECOOW&EXTERNAL_ID%235=&EXTERNAL_ID%2314=N&EXTERNAL_ID%2312=&EXTERNAL_ID%2313=zh_CN&EXTERNAL_ID%2399=C5WBKT102%23%23flyeva&DIRECT_LOGIN=NO&SO_SITE_RUI_MULTIDEV_ENABLED=TRUE&SO_SITE_RUI_TABLET_PG_LIST=ALL&SO_SITE_RUI_MOBILE_PG_LIST=ALL&SO_SITE_RUI_DISP_FF_TABLE=TRUE&SO_SITE_RUI_UPSLL_T_MDL=TRUE&SO_SITE_RUI_UPSLL_T_MDL_ATC=TRUE&SO_SITE_RUI_DPICKER_NATIVE=TABLET%2CMOBILE&MC_FORCE_DEVICE_TYPE=MOBILE&SO_SITE_RUI_MOBILE_FLOW=ALL&SO_SITE_RUI_TABLET_FLOW=ALL&SO_SITE_RUI_COLLAPSE_BOUND_T=TWO_STEPS&SO_SITE_RUI_UPSLL_HIDE_BTNS=FALSE&SO_SITE_OFFICE_ID=SHABR08AA&LANGUAGE=CN&SITE=CAWXCNEW'
                url_data = {
                    "ENCT": "1",
                    "ENC":
                    "990572D723A7BC83F77B4C6C03C696340674137066140FF11D721B8765E55FF8DC0562E080CE4BD1CD01272028CBBA89",
                    # pass in the current query time
                    "ENC_TIME": time.strftime("%Y%m%d%H%M%S", time.localtime())
                }

                # data used to mark this query invalid
                invalid = {
                    'date': dt.replace('-', ''),
                    'depAirport': dep,
                    'arrAirport': to,
                    'mins': self.custom_settings.get('INVALID_TIME')
                }
                url_data = urllib.urlencode(url_data)
                self.url = self.start_urls[0] + '?' + url_data
                # print '# url: ', url
                # print '# url_data: ', url_data

                # ip = '127.0.0.1:8888'
                # ip = '127.0.0.1:1080'
                yield scrapy.Request(
                    self.url,
                    headers=self.headers,
                    body=post_data,
                    callback=self.parse,
                    dont_filter=True,
                    # meta={'invalid': invalid, 'proxy': ip},
                    meta={'invalid': invalid},
                    method='POST',
                    errback=self.errback)
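
The monolithic post_data string above is hard to audit and easy to break when a value changes. A sketch of the same construction from a dict, shown for a representative subset of the fields (the snippet above is Python 2, where the call is urllib.urlencode rather than urllib.parse.urlencode):

from urllib.parse import urlencode

dep, to, dt = 'PVG', 'TPE', '2019-03-28'  # sample values from the commented-out test data

form = {
    'B_LOCATION_1': dep,
    'E_LOCATION_1': to,
    'B_DATE_1': dt.replace('-', '') + '0000',  # YYYYMMDD0000
    'B_ANY_TIME_1': 'True',
    'EMBEDDED_TRANSACTION': 'FlexPricerAvailability',
    'TRIP_TYPE': 'O',
    'LANGUAGE': 'CN',
    'SITE': 'CAWXCNEW',
}
post_data = urlencode(form)  # percent-encodes each value and joins with '&'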
Example #3
    def start_requests(self):
        permins = 0
        print(
            pubUtil.heartbeat(self.host_name, self.name, self.num, permins,
                              self.version))
        result_iter = None
        while True:
            if hasattr(self, 'local'):
                if not result_iter:
                    result_iter = pubUtil.get_task(self.name)
                result = next(result_iter)
            else:
                result = pubUtil.getUrl(self.carrier, 1)
            if not result:
                time.sleep(6)
                continue
            for data in result:
                (_dt, dep, to, days) = vyUtil.analysisData(data)
                for i in range(int(days)):
                    dt = (datetime.strptime(_dt, '%Y%m%d') +
                          timedelta(days=i)).strftime('%Y-%m-%d')
                    # dt, dep, to = '2019-02-28', 'BLR', 'BKK'
                    post_data = self.custom_settings.get(
                        'POST_DATA_FORMAT').copy()
                    post_data['query'] = post_data.get('query') % (self.seats,
                                                                   to, dep, dt)
                    yield scrapy.Request(
                        url=self.start_urls,
                        method="POST",
                        body=json.dumps(post_data),
                        meta={'post_data': post_data},
                        dont_filter=True,
                    )
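
POST_DATA_FORMAT is not shown, but the % substitution implies it holds a query template with four placeholders filled as (seats, to, dep, dt). A hypothetical illustration of that assumed shape (the real template lives in the spider's custom_settings):

POST_DATA_FORMAT = {
    # hypothetical template; the real one is defined in custom_settings
    'query': 'seats=%s&to=%s&from=%s&date=%s',
}

post_data = POST_DATA_FORMAT.copy()  # shallow copy keeps the template intact
post_data['query'] = post_data.get('query') % (3, 'BKK', 'BLR', '2019-02-28')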
Example #4
    def heart_task(self):

        result_iter = None
        # TODO: add a query check
        while True:

            try:
                if self.local:
                    if not result_iter:
                        result_iter = pubUtil.get_task(self.name, days=10)
                    result = next(result_iter)
                else:
                    result = json.loads(
                        requests.get(settings.GET_TASK_URL + 'carrier=' +
                                     self.name,
                                     timeout=60).text).get('data')
            except Exception as e:
                logging.error(e)
                result = None

            if result is None:
                logging.info('Date is None!')
                logging.info('Waiting...')
                time.sleep(16)
                continue
            for data in result:
                # kick off the request
                self.start_requests(data)
Example #5
    def start_requests(self):
        permins = 0
        print(
            pubUtil.heartbeat(self.host_name, self.name, self.num, permins,
                              self.version))
        result_iter = None
        while True:
            if hasattr(self, 'local'):
                if not result_iter:
                    result_iter = pubUtil.get_task(self.name)
                result = next(result_iter)
            else:
                result = pubUtil.getUrl(self.carrier, 1)
            if not result:
                time.sleep(3)
                continue
            for data in result:
                (dt_st, dep, arr, days) = vyUtil.analysisData(data)
                for i in range(int(days)):
                    dt = vyUtil.get_real_date(dt_st, i)
                    pay_load = dict(
                        depCity1=dep,
                        arrCity1=arr,
                        depDate1=dt,
                    )
                    pay_load.update(self.custom_settings.get('PAY_LOAD'))
                    yield scrapy.FormRequest(
                        self.start_url,
                        formdata=pay_load,
                        meta={'payload': pay_load},
                        callback=self.parse,
                        dont_filter=True,
                        errback=self.err_back,
                    )
Example #6
    def start_requests(self):
        permins = 0
        print(pubUtil.heartbeat(self.host_name, self.name, self.num, permins, self.version))
        result_iter = None
        while True:
            if hasattr(self, 'local'):
                if not result_iter:
                    result_iter = pubUtil.get_task(self.name)
                result = next(result_iter)
            else:
                result = pubUtil.getUrl(self.carrier, 1)
            if not result:
                time.sleep(3)
                continue
            for data in result:
                (dt_st, dep, arr, days) = vyUtil.analysisData(data)
                for i in range(int(days)):
                    dt = vyUtil.get_real_date(dt_st, i)
                    params = {
                        'origination-airport': dep,
                        'destination-airport': arr,
                        'departure-date': dt,
                        'number-adult-passengers': self.custom_settings.get('SEAT_SEARCH'),
                        'number-senior-passengers': 0,
                        'currency': 'USD',
                    }
                    total_url = self.start_urls + parse.urlencode(params)
                    yield scrapy.Request(
                        url=total_url,
                        method="GET",
                        dont_filter=True,
                        callback=self.parse,
                        errback=self.errback,
                    )
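
One detail worth noting: the hyphenated parameter names ('origination-airport' and friends) force a dict literal here, since hyphens are not legal in the dict(...) keyword style the other examples use to build their payloads. For instance:

from urllib.parse import urlencode

params = {'origination-airport': 'DAL'}  # hyphenated keys need a dict literal
# dict(origination-airport='DAL')       # SyntaxError: '-' is not valid in a keyword
print(urlencode(params))                 # origination-airport=DAL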
Example #7
    def start_requests(self):
        permins = 0
        print(
            LamudatechDevPipeline.heartbeat(self.host_name, self.spider_name,
                                            self.num, permins, self.version))
        result_iter = None
        while True:
            if hasattr(self, 'local'):
                if not result_iter:
                    result_iter = pubUtil.get_task(self.name, step=7, st=2)
                result = next(result_iter)
            else:
                try:
                    data_api = settings.GET_TASK_URL + 'carrier=VJ'
                    result = json.loads(
                        requests.get(data_api, timeout=180).text).get('data')
                except Exception as e:
                    self.log(e)
                    result = None
            if not result:
                logging.info('Date is None!')
                logging.info('Waiting...')
                time.sleep(16)
                continue
            airports, _day, _num = result[0].split(':')
            FROM, TO = airports.split('-')

            _day = re.sub(r'(\d{4})(\d{2})(\d{2})', r'\1-\2-\3', _day)

            # for airports in get_airports(u'越捷航空.csv'):
            #     FROM = airports.get('DepartureAirportCode')
            #     TO = airports.get('ArrivalAirportCode')
            #
            #     _day = "{:%Y-%m-%d}".format(datetime.today())

            for _day in self._get_dates(_day, int(_num)):
                data = {
                    'OutboundDate': _day,
                    'DaysBefore': '0',
                    'DaysAfter': '0',
                    'AdultCount': '1',
                    'ChildCount': '0',
                    'InfantCount': '0',
                    'DepartureAirportCode': FROM,
                    'ArrivalAirportCode': TO,
                    'CurrencyCode': 'VND',
                    'PromoCode': ''
                }
                yield scrapy.FormRequest(
                    self.start_urls,
                    formdata=data,
                    meta={
                        'FROM': FROM,
                        'TO': TO
                    },
                    callback=self.parse,
                    dont_filter=True,
                    # errback=lambda x: self.download_errback(x, FROM, TO)
                    errback=self.errback,
                )
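
The re.sub above converts YYYYMMDD to YYYY-MM-DD purely textually, so a malformed task date passes through silently. An equivalent sketch via datetime, which also validates the value:

from datetime import datetime

_day = '20181110'  # sample task date in YYYYMMDD form
_day = datetime.strptime(_day, '%Y%m%d').strftime('%Y-%m-%d')  # '2018-11-10'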
Example #8
    def start_requests(self):
        permins = 0
        print(LamudatechDevPipeline.heartbeat(self.host_name, self.spider_name, self.num, permins, self.version))
        result_iter = None
        while True:
            if hasattr(self, 'local'):
                if not result_iter:
                    result_iter = pubUtil.get_task(self.name, step=1)
                result = next(result_iter)
            else:
                try:
                    data_api = settings.GET_TASK_URL + 'carrier=DY'
                    result = json.loads(requests.get(data_api, timeout=180).text).get('data')
                except Exception as e:
                    self.log(e)
                    result = None

            if not result:
                self.log('Date is None!', 40)
                self.log('Waiting...', 40)
                time.sleep(16)
                continue

            airports, _day, _num = result[0].split(':')
            FROM, TO = airports.split('-')
            currency_market = self.currency_info.get(FROM)
            if currency_market:
                currencyCode = currency_market.get('currency')
                marketCode = currency_market.get('marketCode')
            else:
                currencyCode = 'EUR'
                marketCode = 'en'
            for _day in self._get_dates(_day, int(_num)):
                params = parse.urlencode(dict(
                    adultCount=3,
                    childCount=0,
                    infantCount=0,
                    # culture='en-GB',
                    currencyCode=currencyCode,
                    marketCode=marketCode,
                    origin=FROM,
                    destination=TO,
                    inboundDate=_day,
                    outboundDate=_day,
                    includeTransit='true',
                    isRoundTrip='false',
                    isSsrNeeded='false',
                ))

                total_url = self.start_urls + params
                yield scrapy.Request(total_url,
                                     meta={'FROM': FROM, 'TO': TO, '_day': _day},
                                     callback=self.parse,
                                     errback=None,
                                     dont_filter=True)
Example #9
    def start_requests(self):
        permins = 0
        print(
            pubUtil.heartbeat(self.host_name, self.name, self.num, permins,
                              self.version))
        result_iter = None
        while True:
            if hasattr(self, 'local'):
                if not result_iter:
                    result_iter = pubUtil.get_task(self.name, days=30)
                result = next(result_iter)
            else:
                result = pubUtil.getUrl(self.name)
            if not result:
                time.sleep(6)
                continue

            for data in result:

                # parse a task like [u'TLN-CFE:20181110:1']
                task_data_list = data.split(':')
                count = int(task_data_list[2])
                (dt, dep, arr) = pubUtil.analysisData(task_data_list[0] + ':' +
                                                      task_data_list[1])
                _date = datetime.strptime(dt, '%Y-%m-%d')

                for i in range(count):
                    date = _date + timedelta(days=i)
                    date = date.strftime('%Y%m%d0000')

                    dep = self.port_city.get(dep, dep)
                    arr = self.port_city.get(arr, arr)

                    # logging.info('# input data: ' + dep + '' + arr + '' + date)

                    # data used to mark this query invalid
                    invalid = {
                        'date': date.replace('-', ''),
                        'depAirport': dep,
                        'arrAirport': arr,
                        'mins': self.custom_settings.get('INVALID_TIME')
                    }

                    post_data = urllib.urlencode(
                        ly_post_data.second_post_data(dep, arr, date,
                                                      self.ADT))

                    yield scrapy.Request(self.start_urls[0],
                                         body=post_data,
                                         callback=self.parse,
                                         dont_filter=True,
                                         meta={'invalid': invalid},
                                         errback=self.errback,
                                         method='POST')
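
Splitting the task on ':' and then re-joining the first two pieces for pubUtil.analysisData is roundabout. Given the task shape [u'TLN-CFE:20181110:1'] noted in the comment, a direct parse in the style Examples #7, #8, and #18 use would be:

data = 'TLN-CFE:20181110:1'   # sample task string
airports, day, count = data.split(':')
dep, arr = airports.split('-')
count = int(count)            # number of consecutive days to crawl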
Example #10
    def start_requests(self):
        permins = 0
        print(
            pubUtil.heartbeat(self.host_name, self.name, self.num, permins,
                              self.version))
        result_iter = None
        while True:
            if hasattr(self, 'local'):
                if not result_iter:
                    result_iter = pubUtil.get_task(self.name)
                result = next(result_iter)
            else:
                result = pubUtil.getUrl(self.carrier, 1)
            if not result:
                time.sleep(6)
                continue
            hour = datetime.now().hour + 2
            self.cookie = self.cookies[hour % len(self.cookies)]
            installid = self.InstallationID[hour % len(self.InstallationID)]

            for data in result:
                (dt_st, dep, to,
                 days) = vyUtil.analysisData(data)  # normalize the fetched task data
                # dep, to = 'CDG', 'VIE'
                for i in range(int(days)):
                    dt = vyUtil.get_real_date(dt_st, i)
                    # dt = '2018-11-01'
                    self.task.append({
                        'date': dt.replace('-', ''),
                        'depAirport': dep,
                        'arrAirport': to,
                        'mins': settings.INVALID_TIME
                    })
                    dt = dt + 'T00:00:00'
                    data_list = {
                        'InstallationID':
                        installid,
                        'AirportDateTimeList': [{
                            'MarketDateDeparture': dt,
                            'DepartureStation': dep,
                            'ArrivalStation': to,
                        }]
                    }

                    data_list.update(self.custom_settings.get('DEFAULT_DATA'))
                    yield scrapy.Request(
                        method='POST',
                        url=self.start_url,
                        headers={'Cookie': self.cookie},
                        body=json.dumps(data_list),
                        meta={'data_list': data_list},
                        callback=self.parse,
                        dont_filter=True,
                        errback=lambda x: self.download_errback(x, data_list))
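
The rotation above derives one index from the current hour, so every request issued within the same hour reuses the same cookie and InstallationID pair. Condensed, with hypothetical pools:

from datetime import datetime

cookies = ['cookie-a', 'cookie-b', 'cookie-c']   # hypothetical pool
install_ids = ['inst-a', 'inst-b', 'inst-c']

hour = datetime.now().hour + 2
cookie = cookies[hour % len(cookies)]            # stable for the whole hour
installid = install_ids[hour % len(install_ids)]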
Example #11
    def start_requests(self):
        permins = 0
        logging.info(
            pubUtil.heartbeat(self.host_name, self.name, self.num, permins,
                              self.version))
        result_iter = None
        while True:
            if hasattr(self, 'local'):
                if not result_iter:
                    result_iter = pubUtil.get_task(self.name, days=30)
                result = next(result_iter)
            else:
                result = pubUtil.getUrl(self.name, 1)
            if not result:
                logging.info('get task error')
                time.sleep(20)
                continue
            for data in result:
                (dt, dep, to) = pubUtil.analysisData(data)

                # dep, to, dt = 'FUK', 'YNT', '2019-03-27'
                post_data = {
                    "tripType": "OW",
                    "orgCode": dep,
                    "dstCode": to,
                    "takeoffdate1": dt,
                }

                # random User-Agent
                ua = UserAgent()
                self.headers['User-Agent'] = ua.random
                # post_data = urllib.urlencode(post_data)
                # logging.info("###input data: " + dep + to + dt)
                # data used to mark this query invalid
                invalid = {
                    'date': dt.replace('-', ''),
                    'depAirport': dep,
                    'arrAirport': to,
                    'mins': self.custom_settings.get('INVALID_TIME')
                }

                yield scrapy.Request(
                    self.start_urls[0],
                    headers=self.headers,
                    body=json.dumps(post_data),
                    # body=post_data,
                    callback=self.parse,
                    dont_filter=True,
                    # meta={'invalid': invalid, 'proxy': 'http://127.0.0.1:8888'},
                    meta={'invalid': invalid},
                    method='POST',
                    errback=self.errback)
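
A small efficiency note: UserAgent() is rebuilt for every task, and constructing it is the expensive part of fake-useragent (the package this snippet appears to use). Building it once and drawing ua.random per request does the same job:

from fake_useragent import UserAgent

ua = UserAgent()                         # build once; loads the UA database
for _ in range(3):
    headers = {'User-Agent': ua.random}  # fresh random UA per request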
Example #12
    def start_requests(self):

        permins = 0
        logging.info(
            pubUtil.heartbeat(self.host_name, self.name, self.num, permins,
                              self.version))
        result_iter = None
        while True:
            if hasattr(self, 'local'):
                if not result_iter:
                    result_iter = pubUtil.get_task(self.name, step=7)
                result = next(result_iter)
            else:
                result = pubUtil.getUrl(self.name, 1)
            if not result:
                logging.info('get task error')
                time.sleep(10)
                continue

            # loop over the returned tasks (currently one by default)
            for data in result:
                # parse a task like BVE-LYS-201812030000-15
                count = int(data.split(':')[-1])
                (date, dep, arr) = pubUtil.analysisData(data[:-2])
                date = date.replace('-', '') + '0000'

                # logging.info('# input data: ' + dep + '-' + arr + '-' + date + '-' + str(count))

                task_data = {
                    'dep': dep,
                    'arr': arr,
                    'date': date,
                    'count': count
                }

                post_data = urllib.urlencode(
                    a5_post_data.first_post_data(dep, arr, date, self.ADT))
                # fetch a session first
                yield scrapy.Request(
                    self.get_session_url[0],
                    body=post_data,
                    callback=self.get_session,
                    dont_filter=True,
                    meta={
                        'post_data': post_data,
                        'task_data': task_data
                    },
                    method='POST',
                    errback=self.errback,
                )
Example #13
    def start_requests(self):
        permins = 0
        logging.info(
            pubUtil.heartbeat(self.host_name, self.name, self.num, permins,
                              self.version))
        result_iter = None
        while True:
            if hasattr(self, 'local'):
                if not result_iter:
                    result_iter = pubUtil.get_task(self.name, days=30)
                result = next(result_iter)
            else:
                result = pubUtil.getUrl(self.name, 1)
            if not result:
                logging.info('get task error')
                time.sleep(20)
                continue

            self.session_data['tck'] = random.choice(self.id_pool)
            for data in result:
                # logging.info("###input data: " + data)
                (dt, dep, to) = pubUtil.analysisData(data)

                # dt,dep,to='2019-02-28','CAN','RGN'
                post_data = {
                    'traveldate': dt,
                    'ori': dep,
                    'currency': 'CNY',
                    'dest': to
                }

                # data used to mark this query invalid
                invalid = {
                    'date': dt.replace('-', ''),
                    'depAirport': dep,
                    'arrAirport': to,
                    'mins': self.custom_settings.get('INVALID_TIME')
                }

                post_data = urllib.urlencode(post_data)

                yield scrapy.Request(self.start_urls[0],
                                     headers=self.headers,
                                     body=post_data,
                                     callback=self.parse,
                                     dont_filter=True,
                                     meta={'invalid': invalid},
                                     method='POST',
                                     errback=self.errback)
Example #14
    def start_requests(self):
        permins = 0
        self.proxy = True
        print(
            LamudatechDevPipeline.heartbeat(self.host_name, self.spider_name,
                                            self.num, permins, self.version))
        result_iter = None
        while True:
            if hasattr(self, 'local'):
                if not result_iter:
                    result_iter = pubUtil.get_task(self.name, step=7)
                result = next(result_iter)
            else:
                try:
                    data_api = settings.GET_TASK_URL + 'carrier=U2'
                    result = json.loads(
                        requests.get(data_api, timeout=180).text).get('data')
                except Exception as e:
                    self.log(e)
                    result = None

            if not result:
                self.log('Date is None!', level=20)
                self.log('Waiting...', level=20)
                time.sleep(16)
                continue

            airports, _day, _num = result[0].split(':')
            FROM, TO = airports.split('-')
            # FROM, TO = 'TXL', 'ARN'

            lowfares_url = 'https://www.easyjet.com/ejcms/cache15m/api/routedates/get/?'
            lowfares_total_url = lowfares_url + parse.urlencode(
                {
                    'originIata': FROM,
                    'destinationIata': TO
                })

            yield scrapy.Request(lowfares_total_url,
                                 meta={
                                     'FROM': FROM,
                                     'TO': TO
                                 },
                                 callback=self.date_parse,
                                 errback=self.errback,
                                 dont_filter=True)
Example #15
    def start_requests(self):
        permins = 0
        print(pubUtil.heartbeat(self.host_name, self.name, self.num, permins, self.version))
        result_iter = None
        while True:

            if hasattr(self, 'local'):
                if not result_iter:
                    result_iter = pubUtil.get_task(self.name, days=30)
                result = next(result_iter)
            else:
                result = pubUtil.getUrl('TW', 5)

            if not result:
                logging.info('get task error')
                time.sleep(10)
                continue
            today = datetime.now().strftime('%Y%m%d')
            for data in result:
                (dt, dep, to) = pubUtil.analysisData(data)  # normalize the fetched task data
                # dt, dep, to = '20180722', 'ICN', 'KIX' # test a specific record
                params = urllib.urlencode(dict(
                    origin=dep,
                    destination=to,
                    onwardDateStr=dt.replace('-', ''),
                    # pointOfPurchase='KR',
                    paxTypeCountStr='3,0,0',
                    today=today,
                    travelType='OW',
                    searchType='byDate',
                    # domesticYn='Y',
                    bundleAmountOW=0,
                    bundleAmountRT=0,
                    routeCls='AS',
                    _=int(time.time() * 1000)
                ))
                total_url = self.start_urls[0] + params
                yield scrapy.Request(url=total_url,
                                     callback=self.transit,
                                     meta={'params': params, 'flag': 1},
                                     dont_filter=True)
Example #16
    def start_requests(self):
        permins = 0
        print(
            LamudatechDevPipeline.heartbeat(self.host_name, self.spider_name,
                                            self.num, permins, self.version))
        result_iter = None
        result = None  # the loop condition below reads result before it is first assigned
        while True:
            if hasattr(self, 'local'):
                if not result_iter or not result:
                    result_iter = pubUtil.get_task(self.name)
                result = next(result_iter)
            else:
                try:
                    data_api = settings.GET_TASK_URL + 'carrier=EW'
                    result = json.loads(
                        requests.get(data_api, timeout=180).text).get('data')
                except Exception as e:
                    self.log(e)
                    result = None

            if not result:
                self.log('Date is None!', level=20)
                self.log('Waiting...', level=20)
                time.sleep(16)
                continue

            airports, _day, _num = result[0].split(':')
            FROM, TO = airports.split('-')
            # FROM, TO = 'HAM', 'CDG'

            # request the lowest fares to find dates that actually have prices
            for record in self.get_data(FROM, TO):
                # sort by date
                order_list = [
                    x.get('date')
                    for x in sorted(record, key=lambda x: x.get('date'))
                ]
                # keep only nearby dates to reduce the crawl volume
                lowfares = [
                    x for x in record
                    if x.get('date') in self.get_list(order_list)
                ]

                for data in lowfares:
                    _from = data.get('origin')
                    _to = data.get('destination')
                    # currency = data.get('currency')
                    # currency = self.currency_cache.get(currency, currency)
                    _date = data.get('date')
                    # _date = '2018-11-06'
                    params = urllib.urlencode(
                        dict(
                            o=_from,
                            d=_to,
                            t='o',
                            od=_date,
                            adt='5',
                            lng='en-GB',
                            appvi='2D53F50C85034ECF-6000119C00002033',
                            adobe_mc=
                            'TS=%s|MCAID=2D791936852A6702-40000129C00FCFCA' %
                            int(time.time()),
                            screen='Search',
                            culture='en-GB',
                        ))

                    total_url = self.start_urls + params

                    yield scrapy.Request(
                        total_url,
                        meta={
                            'FROM': _from,
                            'TO': _to,
                            '_day': _date,
                            # 'currency': currency
                        },
                        dont_filter=True,
                        callback=self.parse,
                        errback=self.download_errback)
Example #17
    def start_requests(self):
        permins = 0
        logging.info(
            pubUtil.heartbeat(self.host_name, self.name, self.num, permins,
                              self.version))
        result_iter = None
        while True:
            if hasattr(self, 'local'):
                if not result_iter:
                    result_iter = pubUtil.get_task(self.name, days=30)
                result = next(result_iter)
            else:
                result = pubUtil.getUrl(self.name, 1)
            if not result:
                logging.info('get task error')
                time.sleep(5)
                continue
            for data in result:
                # logging.info("## input data: " + data)

                # parse a task like [u'TLN-CFE:20181110:1']
                count = int(data.split(':')[-1])
                (date, dep, arr) = pubUtil.analysisData(data[:-2])
                _date = datetime.strptime(date, '%Y-%m-%d')

                for i in range(count):
                    temp_date = _date + timedelta(days=i)
                    date = temp_date.strftime('%Y%m%d')
                    # dep = 'KIX'
                    # arr = 'ICN'
                    # logging.info('# input data: ' + dep + '-' + arr + '-' + date)
                    city_code = self.city_dict.get(dep)
                    if city_code is None:
                        logging.info('# not found city: ' + dep)
                    body = json.dumps(
                        ze_post_data.get_data(dep, arr, date, self.ADT,
                                              city_code))

                    # data used to mark this query invalid
                    invalid = {
                        'date': date.replace('-', ''),
                        'depAirport': dep,
                        'arrAirport': arr,
                        'mins': self.custom_settings.get('INVALID_TIME')
                    }
                    task_data = {
                        'dep': dep,
                        'arr': arr,
                        'date': date,
                        'city_code': city_code,
                        'body': body
                    }

                    yield scrapy.Request(self.start_urls[0],
                                         headers=self.headers,
                                         body=body,
                                         callback=self.parse,
                                         dont_filter=True,
                                         meta={
                                             'invalid': invalid,
                                             'task_data': task_data
                                         },
                                         errback=self.errback,
                                         method='POST')
Example #18
    def start_request(self):
        result_iter = None
        # TODO: add a query check
        while True:
            # if not timeUtil.time_is_valid(self.st_time, self.en_time):
            #     logging.info('Waiting to 07:30:00.....')
            #     time.sleep(5 * 60)
            #     continue
            # data_api = 'http://dx.redis.jiaoan100.com/buddha/gettask?carrier=JX'
            data_api = 'http://task.jiaoan100.com/buddha/gettask?carrier=jx'
            try:
                if self.local:
                    if not result_iter:
                        result_iter = pubUtil.get_task('JQ', days=10)
                    result = next(result_iter)
                else:
                    result = json.loads(
                        requests.get(data_api, timeout=60).text).get('data')
            except Exception as e:
                logging.error(e)
                result = None

            if result is None:
                logging.info('Date is None!')
                logging.info('Waiting...')
                time.sleep(16)
                continue

            airports, _day, day_num = result[0].split(':')
            # day_num='1'
            # print('airports, _day, day_num',airports, _day, day_num)
            FROM, TO = airports.split('-')
            # FROM, TO = ('DAD', 'HKG')
            _day = re.sub(r'(\d{4})(\d{2})(\d{2})', r'\1-\2-\3', _day)
            days = self._get_dates(_day, int(day_num))
            # print(days)
            # days = ['2019-01-11', '2019-01-12', '2019-01-13']
            for day in days:
                # FROM, TO, day = 'RGN', 'SIN', '2019-01-17'
                query = urlencode({
                    'origin1': FROM,
                    'destination1': TO,
                    # 'flight-type': '1',
                    'departuredate1': day,
                    'adults': str(settings.ADULT_NUM),
                    'children': '0',
                    'infants': '0',
                })
                print(query)
                # set_invalid('JX', FROM, TO, day)
                total_url = self.start_url + query
                # data used to mark this query invalid
                invalid = {
                    'date': day.replace('-', ''),
                    'depAirport': FROM,
                    'arrAirport': TO,
                    'mins': settings.INVALID_TIME
                }
                # total_url = 'https://www.jetstar.com/au/en/home?origin=CBR&destination=HNL&flight-type=1&selected-departure-date=02-02-2019&adult=1&flexible=1&currency=AUD'
                # yield total_url,invalid
                yield [total_url, invalid]
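
Unlike the other examples, this generator yields [total_url, invalid] pairs rather than scrapy.Request objects, so a caller presumably turns each pair into a request. A hypothetical consumer:

import scrapy

def make_requests(self):
    # hypothetical wrapper: convert the (url, invalid) pairs into requests
    for total_url, invalid in self.start_request():
        yield scrapy.Request(total_url,
                             meta={'invalid': invalid},
                             callback=self.parse,
                             dont_filter=True)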
Example #19
    def start_requests(self):
        permins = 0
        logging.info(
            pubUtil.heartbeat(self.host_name, self.name, self.num, permins,
                              self.version))
        result_iter = None
        while True:
            if hasattr(self, 'local'):
                if not result_iter:
                    result_iter = pubUtil.get_task(self.name, days=30)
                result = next(result_iter)
            else:
                result = pubUtil.getUrl(self.name, 1)
            if not result:
                logging.info('get task error')
                time.sleep(5)
                continue
            for data in result:

                # logging.info("## input data: " + data)
                # parse a task like [u'TLN-CFE:20181110:1']
                count = int(data.split(':')[-1])
                (date, dep, arr) = pubUtil.analysisData(data[:-2])
                _date = datetime.strptime(date, '%Y-%m-%d')
                for i in range(count):
                    temp_date = _date + timedelta(days=i)
                    date = temp_date.strftime('%Y-%m-%d')

                    # logging.info('# input data: ' + dep + '-' + arr + '-' + date)
                    # dep, arr, date = 'MNL', 'SIN', '2019-01-04'
                    post_data = {
                        "originIata": dep,
                        "destinationIata": arr,
                        "departureDate": date + "T00:00:00+08:00",
                        "passengerComposition": {
                            "adult": self.ADT,
                            "children": 0,
                            "infant": 0
                        }
                    }
                    body = json.dumps(post_data)

                    # data used to mark this query invalid
                    invalid = {
                        'date': date[:10].replace('-', ''),
                        'depAirport': dep,
                        'arrAirport': arr,
                        'mins': self.custom_settings.get('INVALID_TIME')
                    }
                    task_data = {
                        'dep': dep,
                        'arr': arr,
                        'date': date,
                        'body': body
                    }
                    yield scrapy.Request(url=self.start_urls[0],
                                         body=body,
                                         callback=self.parse,
                                         dont_filter=True,
                                         meta={
                                             'invalid': invalid,
                                             'task_data': task_data
                                         },
                                         errback=self.errback,
                                         method='POST')
Example #20
    def start_requests(self):
        permins = 0
        logging.info(pubUtil.heartbeat(self.host_name, self.name, self.num, permins, self.version))
        result_iter = None
        # timer for how long the current IP has been in use
        # start_time = time.time()
        while True:
            if hasattr(self, 'local'):
                if not result_iter:
                    result_iter = pubUtil.get_task(self.name, days=10)
                result = next(result_iter)
            else:
                result = pubUtil.getUrl(self.name, 1)
            if not result:
                logging.info('get task error')
                time.sleep(20)
                continue
            for data in result:
                # logging.info("## input data: " + data)
                # parse a task like [u'TLN-CFE:20181110:1']
                count = int(data.split(':')[-1])
                (date, dep, arr) = pubUtil.analysisData(data[:-2])
                _date = datetime.strptime(date, '%Y-%m-%d')

                for i in range(count):
                    temp_date = _date + timedelta(days=i)
                    date = temp_date.strftime('%m/%d/%Y')
                    invalid_date = temp_date.strftime('%Y%m%d')

                    # logging.info('# input data: ' + dep + '-' + arr + '-' + date)
                    # dep, arr, date = 'FLL', 'LAS', '2019-01-13'

                    # force a proxy change once the IP exceeds its allowed usage time
                    # logging.info('ip used time: ' + str(time.time() - start_time))
                    # if time.time() - start_time > self.use_time:
                    #     self.proxy_flag = True
                    #     logging.info('### ip invalid:' + self.proxy)
                    if self.proxy_flag:
                        while True:
                            # Russian proxy pool
                            # self.proxy = pubUtil.nk_get_ip()
                            # small local proxy pool
                            self.proxy = pubUtil.get_proxy(self.name)
                            if self.proxy is None:
                                logging.info('# no get proxy, continue')
                                # time.sleep(60)
                                continue
                            logging.info('# get a new ip: ' + self.proxy)
                            ip_proxies = {"https": "https://" + self.proxy}
                            # fetch a session
                            try:
                                response = requests.get(self.get_session_url, proxies=ip_proxies, timeout=15)
                                self.cookies_str = json.dumps(requests.utils.dict_from_cookiejar(response.cookies))[
                                                   1:-1].replace(
                                    '\"',
                                    '').replace(
                                    ':', '=').replace(' ', '').replace(',', '; ')

                            except Exception as e:
                                logging.info(e)
                                self.proxy_flag = True
                                logging.info('# get session error')
                                continue
                            # the IP works; start the usage timer
                            # start_time = time.time()
                            self.proxy_flag = False

                            break
                    headers = {
                        'Content-Type': 'application/x-www-form-urlencoded',
                        'Cookie': self.cookies_str
                    }

                    post_data = {
                        'from': dep,
                        'to': arr,
                        # 'from': 'AXM',
                        # 'to': 'ATL',
                        'departDate': date,
                        'departDateDisplay': date,
                        'ADT': self.ADT
                    }

                    post_data.update(self.custom_settings.get('POST_DATA'))
                    post_data = urllib.urlencode(post_data)

                    # data used to mark this query invalid
                    invalid = {
                        'date': invalid_date,
                        'depAirport': dep,
                        'arrAirport': arr,
                        'mins': self.custom_settings.get('INVALID_TIME')
                    }
                    yield scrapy.Request(url=self.start_urls[0],
                                         body=post_data,
                                         headers=headers,
                                         callback=self.parse,
                                         dont_filter=True,
                                         meta={'invalid': invalid, 'proxy': self.proxy},
                                         errback=self.errback,
                                         method='POST')
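
The chained .replace calls above rebuild a Cookie header from the JSON-dumped cookie jar, which breaks if any cookie value contains the characters being replaced. The same header can be produced directly from the jar dict; a sketch against httpbin (used only for illustration):

import requests

response = requests.get('https://httpbin.org/cookies/set?a=1', allow_redirects=False)
jar = requests.utils.dict_from_cookiejar(response.cookies)
cookies_str = '; '.join('%s=%s' % (k, v) for k, v in jar.items())  # e.g. 'a=1'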