def process_request(self, request, spider):
    k = PyKeyboard()
    self.driver.get(request.url)
    x = request.url[:]
    #request.headers["Referer"] = "https://antirobot.tianyancha.com/captcha/verify?return_url=https%3A%2F%2Fwww.tianyancha.com%2Fcompany%" + str(request.url[35:]) + "&rnd="
    #print(request.headers["Referer"])
    time.sleep(random.random())
    try:
        # login link
        denglu = self.driver.find_element_by_xpath('//div[@id="J_TLoginInfoHd"]/a[1]')
        if not denglu:
            pass
        else:
            denglu.click()
            time.sleep(random.random())
            self.driver.find_element_by_xpath('//a[@class="forget-pwd J_Quick2Static"]').click()
            time.sleep(0.3)
            self.driver.find_element_by_xpath('//input[@id="TPL_username_1"]').send_keys('15123358380')
            self.driver.find_element_by_xpath('//input[@id="TPL_password_1"]').send_keys('a135792468')
            self.driver.find_element_by_xpath('//button[@id="J_SubmitStatic"]').click()
            # win32api.SetCursorPos([random.randint(1160, 1170), random.randint(590, 595)])
            # win32api.mouse_event(win32con.MOUSEEVENTF_LEFTUP | win32con.MOUSEEVENTF_LEFTDOWN, 0, 0, 0, 0)
            # self.mima()
            # try:
            #     while True:
            #         huakuai = self.driver.find_element_by_xpath('//*[@id="nc_1_n1z"]')
            #         if not huakuai:
            #             break
            #         else:
            #             x = random.randint(975, 1010)
            #             y = random.randint(505, 515)
            #             win32api.SetCursorPos([x, y])
            #             time.sleep(0.1)
            #             win32api.mouse_event(win32con.MOUSEEVENTF_LEFTDOWN, 0, 0, 0, 0)
            #             sum = x
            #             for i in range(9):
            #                 sum += random.randint(30, random.randint(40, 65))
            #                 win32api.SetCursorPos([sum, y])  # move the mouse cursor to the new position
            #                 time.sleep(0.1)
            #             win32api.mouse_event(win32con.MOUSEEVENTF_LEFTUP, 0, 0, 0, 0)
            #             win32api.SetCursorPos([random.randint(1000, 1200), random.randint(560, 565)])
            #             win32api.mouse_event(win32con.MOUSEEVENTF_LEFTUP | win32con.MOUSEEVENTF_LEFTDOWN, 0, 0, 0, 0)
            #             time.sleep(random.random())
            #             self.driver.refresh()
            #             time.sleep(5)
            #             try:
            #                 win32api.SetCursorPos([random.randint(1160, 1170), random.randint(590, 595)])
            #                 win32api.mouse_event(win32con.MOUSEEVENTF_LEFTUP | win32con.MOUSEEVENTF_LEFTDOWN, 0, 0, 0, 0)
            #             except:
            #                 pass
            #             self.mima()
            # except:
            #     pass
            time.sleep(10)
            # navigate back to the original request URL kept in x (request.url is read-only in Scrapy)
            self.driver.get(x)
    except:
        pass
    time.sleep(0.5 + random.random())
    page_src = self.driver.page_source
    a = 0
    month_sell = etree.HTML(page_src).xpath('//ul[@class="info-list"]/li[1]/em/text()')
    while month_sell == ['-'] or month_sell == []:
        if request.url == 'https://www.fliggy.com/dujia/?spm=181.11358650.0.0.78d5223eYq1rts':
            break
        # self.driver.switch_to.frame('sufei-dialog-content')
        try:
            y = random.randint(325, 335)
            x = random.randint(835, 845)
            win32api.SetCursorPos([x, y])
            time.sleep(0.3 + random.random())
            win32api.mouse_event(win32con.MOUSEEVENTF_LEFTDOWN, 0, 0, 0, 0)
            sum = x
            for i in range(9):
                sum += random.randint(random.randint(25, 30), random.randint(40, 65))
                win32api.SetCursorPos([sum, y])  # move the mouse cursor to the new position
                time.sleep(random.random() * 0.1)
            win32api.mouse_event(win32con.MOUSEEVENTF_LEFTUP, 0, 0, 0, 0)
            # self.driver.switch_to.default_content()
            page_src = self.driver.page_source
            month_sell = etree.HTML(page_src).xpath('//ul[@class="info-list"]/li[1]/em/text()')
            print(month_sell)
            if month_sell == ['-'] or month_sell == []:
                time.sleep(random.random())
                self.driver.refresh()
                a += 1
                time.sleep(random.random())
            if a >= 7:
                self.cursor.execute(self.sql, (request.url,))  # parameters must be a tuple
                self.conn.commit()
                break
        except:
            pass
    source = self.driver.page_source
    response = HtmlResponse(url=self.driver.current_url, body=source, request=request, encoding='utf-8')
    return response
def test_empty_page(self):
    spider, page, _ = open_spider_page_and_results('cars.com_nested.json')
    page = HtmlResponse(page.url, body=u'', encoding='utf-8')
    items = [i for i in spider.parse(page) if not isinstance(i, Request)]
    self.assertEqual(items, [])
def get_response_object(self, url):
    path_to_file = url.replace(FILE_SYSTEM_PREFIX, '')
    with open(path_to_file, 'rb') as f:
        body = f.read()
    # positional args: url, status, headers, body, flags, request
    return HtmlResponse(url, 200, self.generate_response_headers(), body, None, Request(url), encoding='utf-8')
def collect_all_tagets(self, driver, starturl: str):
    """
    Starting from the entry page `starturl`, walk every result page and collect the URLs of
    all auction items. The caller must already have executed `driver.get(starturl)` before
    calling this function.
    """
    try:
        total = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located(
                (By.CSS_SELECTOR, 'div.pagination.J_Pagination .page-total')))
    except TimeoutException:  # TimeoutException
        self.logger.debug("Broken navigation page, failed to get pagination: {}".format(starturl))
        return
    # parse the pagination info
    max_page_num = int(total.text)
    try:
        nav = WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located(
                (By.CSS_SELECTOR, 'div.pagination.J_Pagination a')))
    except TimeoutException:  # TimeoutException
        self.logger.debug("Broken navigation page, failed to get pagination: {}".format(starturl))
        return
    # jump to the first page (the second nav link points to page 1)
    driver.execute_script("arguments[0].click();", nav[1])
    # starting from the first page, collect all item links on each page, then move to the next page
    self.logger.info("Will collect {} pages, starturl {}".format(max_page_num, starturl))
    max_retry_times = 5  # retry up to 5 times
    retrys = max_retry_times
    for i in range(max_page_num):
        # progress logging
        if i < max_page_num - 1 and i % 50 == 0:
            self.logger.info("Started collecting page {}".format(i))
        elif i == max_page_num - 1:
            self.logger.info("Collected {} pages".format(i))
        try:
            _ = WebDriverWait(driver, 20).until(
                EC.presence_of_all_elements_located(
                    (By.CSS_SELECTOR, ".sf-content .sf-item-list li a")))
            # In practice the live DOM keeps changing, so snapshot the page source and parse the snapshot.
            response = HtmlResponse(driver.current_url,
                                    body=driver.page_source.encode('utf-8'),
                                    encoding='utf-8')
            yield list(
                set(
                    map(
                        lambda x: response.urljoin(x),
                        response.css(
                            ".sf-content .sf-item-list li a::attr(href)").getall())))
        except TimeoutException:
            self.logger.info("Failed to collect item URLs on page {}".format(i))
        # no need to jump to the next page if this is already the last one
        if i < max_page_num - 1:
            # jump to the next page
            if self._to_next_page(driver) == -1:
                retrys -= 1
                if retrys <= 0:
                    break  # stop if navigation keeps failing
            else:
                retrys = max_retry_times
    # return urls
    return
def process_request(self, request, spider):
    if spider.name == 'sohucaijing_Spider':
        request.headers['User-Agent'] = random_user_agent.give_a_head()
        # self.driver = Firefox(executable_path='geckodriver', firefox_options=self.options)
        # If geckodriver is on PATH the first argument can be omitted; otherwise pass an absolute path.
        # wait = WebDriverWait(self.driver, timeout=10)
        # Before the engine sends a request taken from the scheduler to the downloader, this
        # downloader middleware runs first. Selenium fetches the request here so the dynamically
        # rendered page can be handed back to the spider.
        # Use the URL from the spider:
        # self.driver.get(request)
        # wait.until(expected.visibility_of_element_located((By.NAME, 'q'))).send_keys(
        #     'headless firefox' + Keys.ENTER)
        # wait.until(expected.visibility_of_element_located((By.CSS_SELECTOR, '#ires a'))).click()
        # print(self.driver.page_source)

        # Integer: number of news items to crawl; ideally a multiple of 20.
        news_amount = 1000
        # Integer: number of extra data batches to fetch (one batch = 20 news items).
        # Set to 0 to keep only the initial 20. Losses caused by a flaky network are not compensated.
        ex_packages_amount = int(news_amount / 20) - 1
        url = request.url
        if 'https://' in request.url:
            # strip the scheme (the original sliced [9:], which dropped one extra character)
            url = request.url[len('https://'):]
        print("URL requested in the middleware: " + url)
        self.driver.get(url)
        if 'mp.sohu.com/profile?xpt=c29odWNqeWMyMDE3QHNvaHUuY29t' in url:
            for temp in range(0, ex_packages_amount):
                # for x in range(1, 12, 2):
                #     i = (float(x) / 11) / (temp + 1) + temp / (temp + 1)
                #     # scrollTop: scroll distance from the top of the page
                #     # print("middleware: about to run this scroll JS")
                #     js = 'document.body.scrollTop=document.body.scrollHeight * %f' % i
                #     time.sleep(1)
                #     self.driver.execute_script(js)
                self.driver.execute_script(
                    "document.documentElement.scrollTop=document.body.scrollHeight")
                time.sleep(1)
                print("Current feed item count: " + str(
                    len(self.driver.find_elements_by_class_name("feed-item"))))
                # float_list = []
                # for _ in range(0, 3):
                #     float_list.append(random.random())
                # float_list.sort()
                # print("random list:" + str(float_list))
                # for i in float_list:
                #     js = 'document.body.scrollTop=document.body.scrollHeight * %f' % i
                #     time.sleep(3)
                #     spider.driver.execute_script(js)
                # time.sleep(3)
        else:
            for x in range(1, 6, 2):
                i = float(x) / 5
                # scrollTop: scroll distance from the top of the page
                # print("middleware: about to run this scroll JS")
                js = 'document.body.scrollTop=document.body.scrollHeight * %f' % i
                time.sleep(1)
                self.driver.execute_script(js)
        response = HtmlResponse(url=url,
                                body=self.driver.page_source,
                                encoding='utf-8',
                                request=request)
        # print("middleware: about to return this response")
        # Only a Response object should be returned here: when a Response is returned, the real
        # network fetch is skipped and the response goes straight to the engine, which passes it
        # on to the spider for parsing.
        return response
def process_request(self, request, spider):
    self.driver.get(request.url)
    self.driver.implicitly_wait(50)
    source = self.driver.page_source
    response = HtmlResponse(url=self.driver.current_url, body=source, request=request, encoding='utf-8')
    return response
def test_display_response_without_return(self):
    response = HtmlResponse(body=b'<html>test</html>', url='http://some.url', status=200)
    resp = display(response)
    self.assertIsNone(resp)
def process_request(self, request, spider):
    self.driver.get(request.url)
    time.sleep(1)
    source = self.driver.page_source
    response = HtmlResponse(url=self.driver.current_url, body=source, request=request, encoding='utf-8')
    return response
def cached_page(site, url_path, spider_name='toc'):
    handle_client_ip()
    site = base64.standard_b64decode(site.encode()).decode()
    url_path = base64.standard_b64decode(url_path.encode()).decode()
    url_site = SiteSchemas.get(site).get(SSK.URL)
    url = url_site + url_path
    origin_encoding = SiteSchemas.get(site).get(SSK.ENCODING, 'utf-8')
    aid = request.args.get('aid', default=None, type=int)

    from moltspider.consts import Schemas
    from moltspider.parser import iter_items
    from scrapy.utils.misc import load_object
    from scrapy.utils.project import get_project_settings
    from scrapy.http.request import Request
    from scrapy.http.response.html import HtmlResponse
    from scrapy.utils.gz import gunzip
    from scrapy.downloadermiddlewares.httpcompression import ACCEPTED_ENCODINGS
    try:
        import brotli
    except ImportError:
        pass
    import zlib

    settings = get_project_settings()
    storage = load_object(settings['HTTPCACHE_STORAGE'])(settings)
    body = None
    spider_req = Request(url)
    if spider_name == Spiders.META:
        from moltspider.spiders.meta import MetaSpider
        spider = MetaSpider()
        schema_name = Schemas.META_PAGE
    elif spider_name == Spiders.TOC:
        from moltspider.spiders.toc import TocSpider
        spider = TocSpider
        schema_name = Schemas.TOC_PAGE
    else:
        raise Exception('No support for spider "%s"\'s cache page' % spider_name)

    cachedresponse = storage.retrieve_response(spider, spider_req)
    if cachedresponse:
        content_encoding = cachedresponse.headers.getlist('Content-Encoding')
        if content_encoding:
            encoding = content_encoding.pop()
            if encoding == b'gzip' or encoding == b'x-gzip':
                body = gunzip(cachedresponse.body)
            if encoding == b'deflate':
                try:
                    body = zlib.decompress(cachedresponse.body)
                except zlib.error:
                    # ugly hack to work with raw deflate content that may
                    # be sent by microsoft servers. For more information, see:
                    # http://carsten.codimi.de/gzip.yaws/
                    # http://www.port80software.com/200ok/archive/2005/10/31/868.aspx
                    # http://www.gzip.org/zlib/zlib_faq.html#faq38
                    body = zlib.decompress(cachedresponse.body, -15)
            if encoding == b'br' and b'br' in ACCEPTED_ENCODINGS:
                body = brotli.decompress(cachedresponse.body)
        else:
            # cached response was stored without a Content-Encoding header
            body = cachedresponse.body

    if body:
        if spider_name == Spiders.TOC and aid:
            sb = []
            colspan = 4
            i = 0
            scrapy_resp = HtmlResponse(url)
            scrapy_resp = scrapy_resp.replace(body=body, encoding=origin_encoding)
            sb.append('<table width="1000px" align="center"><tr>')
            for item in iter_items(spider, scrapy_resp, [site, ], schema_name):
                if i % colspan == 0:
                    sb.append('</tr><tr>')
                item['_'] = url_site
                sb.append('<td><a href="%(_)s%(url)s">%(name)s</a></td>' % item)
                del item['_']
                i += 1
            sb.append('</tr></table>')
            body = '\n'.join(sb)
            body = render_template_string(template_page, content=body)
        else:
            body = body.decode(encoding=origin_encoding)
    else:
        body = '%s (%s) not found in cache.' % (url, origin_encoding)

    resp = make_response(body)
    resp.headers['Content-Type'] = 'text/html; charset=utf-8'
    return resp
def load_response(url, filename):
    input_path = os.path.join(os.path.dirname(__file__), '_tests', filename)
    with open(input_path, 'rb') as input_file:
        return HtmlResponse(url, body=input_file.read())
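# A minimal usage sketch for load_response above (hypothetical: assumes a fixture file named
# 'example.html' exists under the _tests directory; the filename and URL are illustrative only,
# not part of the original project):
def example_load_response_usage():
    response = load_response('http://example.com/', 'example.html')
    # The returned HtmlResponse supports the usual Scrapy selector shortcuts.
    return response.xpath('//title/text()').extract_first()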
def process_request(self, request, spider):
    if 'origin' in request.meta or spider.errback_status is True:
        while True:
            # reset the proxy to None every 10 minutes
            now = time.time()
            if now - self.run_time > 60 * 10:
                self.close_err_driver(spider, 'set proxy is None')
                self.proxy = None
                self.run_time = now
            # initialize the browser
            if not self.driver:
                self.init_browser(spider)
            try:
                # load the initial page in the browser
                self.driver.get(request.url)
                if 'err504' in self.driver.current_url:
                    self.close_err_driver(spider, 'page err504')
                    # switch proxy
                    self.proxy = _get_proxy(spider)
                    continue
                # wait for the data to load
                WebDriverWait(self.driver, 40).until(
                    EC.presence_of_element_located(
                        (By.XPATH, '//*[@id="availabilityForm"]')))
            except:
                traceback.print_exc()
                # switch proxy
                self.proxy = _get_proxy(spider)
                self.close_err_driver(
                    spider, 'browser error, 3 seconds after the restart.')
            else:
                break
        # copy the browser cookies onto the request
        self.cookies = self.driver.get_cookies()
        request.cookies = self.cookies
        request.meta['proxy'] = self.proxy
        # clear cookies
        spider.log('delete_all_cookies', 20)
        self.driver.delete_all_cookies()
        # build a custom Response object
        current_url = self.driver.current_url
        body = self.driver.page_source.encode('utf-8')
        response = HtmlResponse(current_url, body=body, request=request, encoding='utf-8')
        # # left commented out so the browser is no longer closed periodically
        # if self.num % 6 == 0:
        #     spider.log('browser closed.', 20)
        #     self.driver.close()
        #     self.driver = None
        #
        # self.num += 1
        spider.errback_status = False
        return response
    else:
        request.meta['proxy'] = self.proxy
        request.cookies = self.cookies
def process_request(self, request, spider):
    if spider.name == 'uo' and 'origin' in request.meta and request.meta['origin'] == 1:
        meta = request.meta
        _from = meta['_from']
        _to = meta['_to']
        while True:
            # initialize the browser
            if not self.driver:
                self.init_browser(spider)
            try:
                # load the initial page in the browser
                self.driver.get(request.url)
                # handle the slider captcha
                if u"Please slide to verify that you're not a robot" in self.driver.page_source:
                    self.driver.maximize_window()
                    action = ActionChains(self.driver)
                    slider = self.driver.find_element_by_id('nc_1_n1z')
                    action.drag_and_drop_by_offset(slider, 233.51, 0).perform()
                    spider.log("Please slide to verify that you're not a robot", 40)
                    self.driver.delete_all_cookies()
                    try:
                        self.driver.close()
                        self.driver = None
                    except:
                        self.driver = None
                    time.sleep(8)
                    continue
                else:
                    # AJAX request
                    comment = '''var data=%s;
                        $.post("#", data, function(){
                            window.location.href = "/en-US/select?origin=%s&destination=%s"
                        })''' % (request.body, _from, _to)
                    self.driver.execute_script(comment)
                    # wait for the data to load
                    WebDriverWait(self.driver, 50).until(
                        EC.presence_of_element_located(
                            (By.XPATH, '//*[@name="__RequestVerificationToken"]')))
                    break
            except:
                traceback.print_exc()
                spider.log('browser error, 8 seconds after the restart.', 40)
                try:
                    self.driver.close()
                    self.driver = None
                except:
                    self.driver = None
                time.sleep(8)
        # build a custom Response object
        current_url = self.driver.current_url
        body = self.driver.page_source.encode('utf-8')
        response = HtmlResponse(current_url, body=body, encoding='utf-8')
        # set cookies
        request.cookies = self.driver.get_cookies()
        # set headers
        request.headers.setdefault('Referer', current_url)
        # clear cookies (guarded, because the browser may be closed below)
        if self.driver:
            spider.log('delete_all_cookies', 20)
            self.driver.delete_all_cookies()
        # comment this block out if the browser should no longer be closed periodically
        if self.num % 10 == 0:
            spider.log('browser closed.', 20)
            self.driver.close()
            self.driver = None
        self.num += 1
        return response
def process_request(self, request, spider):
    if spider.name == 'lx' and 'origin' in request.meta:
        while True:
            # initialize the browser
            if not self.driver:
                self.init_browser(spider)
            try:
                # load the initial page in the browser
                self.driver.get(request.url)
                if 'Schedule' in self.driver.current_url:
                    print('This route is offered together with other airlines.')
                    return None
                if 'False' in self.driver.current_url:
                    print('No flights could be found for the selected route.')
                    return None
                if 'distil_r_captcha' in self.driver.current_url:
                    spider.log('Captcha appeared, switching IP...')
                    if spider.proxy:
                        self.proxy = _get_proxy(spider)
                # wait for the data to load
                WebDriverWait(self.driver, 50).until(
                    EC.presence_of_element_located((By.ID, 'frm-matrix')))
            except:
                spider.log('browser error, 8 seconds after the restart.', 40)
                spider.log('exception, changing proxy...')
                if spider.proxy:
                    self.proxy = _get_proxy(spider)
                # traceback.print_exc()
                try:
                    self.driver.close()
                    self.driver = None
                except:
                    self.driver = None
                time.sleep(8)
            else:
                break
        # build a custom Response object
        current_url = self.driver.current_url
        body = self.driver.page_source.encode('utf-8')
        response = HtmlResponse(current_url, body=body, encoding='utf-8')
        # set proxy and cookies
        if spider.proxy:
            request.meta["proxy"] = self.proxy
        request.cookies = self.driver.get_cookies()
        # set headers
        request.headers.setdefault('Referer', current_url)
        # clear cookies
        self.driver.delete_all_cookies()
        spider.log('delete_all_cookies', 20)
        # pause the crawler engine
        # spider.crawler.engine.pause()
        # comment this block out if the browser should no longer be closed periodically
        if self.num % 6 == 0:
            spider.log('browser closed.', 20)
            self.driver.close()
            self.driver = None
        self.num += 1
        return response
    elif not spider.is_ok and spider.proxy:
        self.proxy = _get_proxy(spider)
def process_request(self, request, spider):
    # ua = UserAgent()
    # agent = ua.chrome
    # f = faker.Faker(locale='zh_cn')
    # agent = f.user_agent()
    agent = random.choice(useragent)
    same = get_lxsdk_cuid(agent)
    cook = '_lxsdk_cuid={}; _lxsdk={}; _hc.v={}; _lxsdk_s={}'.format(
        same, same, get_hc(), get_lxsdk_s())
    # cook = '_lxsdk_cuid={}; _lxsdk={}; _hc.v={}; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_s={}'.format(same, same, get_hc(), get_lxsdk_s())
    cook1 = 'cy=1236; cityid=1236; cye=huixian; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; _lxsdk_cuid=165c6b8e911c8-0383cc5ec3114e-37664109-144000-165c6b8e91289; _lxsdk=165c6b8e911c8-0383cc5ec3114e-37664109-144000-165c6b8e91289; _hc.v=0c84e8b5-c945-5c86-bb54-94e4936012e5.1536637332; s_ViewType=10; cye=beijing; _lxsdk_s=165cb7d7e23-268-18-f1%7C%7C87'
    # print(cook)
    headers = {
        'Host': 'www.dianping.com',
        'Upgrade-Insecure-Requests': '1',
        'Cookie': cook,
        'User-Agent': agent,
        # 'Proxy-Connection': 'keep-alive'
    }
    proxyHost = "http-dyn.abuyun.com"
    proxyPort = "9020"
    # proxy tunnel authentication
    proxyUser = "******"
    proxyPass = "******"
    proxyMeta = "http://%(user)s:%(pass)s@%(host)s:%(port)s" % {
        "host": proxyHost,
        "port": proxyPort,
        "user": proxyUser,
        "pass": proxyPass,
    }
    proxies = {
        # "http": proxyMeta,
        "https": proxyMeta,
    }
    proxiess = {
        "https": "http://140.255.6.45:5649",
        # "https": "http://118.79.54.90:6996",
        # "https": "http://117.42.201.221:6214",
    }
    import requests
    # s = requests.Session()
    # base = 'https://www.dianping.com/'
    try:
        # start_url = requests.get(base, headers=headers, proxies=proxies, timeout=15)
        # print(start_url.text)
        res = requests.get(request.url, headers=headers, proxies=proxies, timeout=15)
        if res.status_code != 200 or len(res.text) < 560:
            if res.status_code == 403 or res.status_code == 404:
                content = 'page not accessible'
            else:
                content = res.text
            print('URL: {}, status code: {}, content: {}'.format(request.url, res.status_code, content))
            key = getattr(spider, 'redis_key')
            db = RedisClient()
            print('URL {} needs to be re-queued'.format(request.url))
            db.add_value(key, request.url)
            raise IgnoreRequest
        else:
            from scrapy.http.response.html import HtmlResponse
            rs = res.content.decode('utf-8')
            # print(rs)
            response = HtmlResponse(url=request.url, body=rs, encoding="utf-8", request=request)
            return response
    except Exception as e:
        print('An error occurred, reason: {}'.format(e.args))
        key = getattr(spider, 'redis_key')
        db = RedisClient()
        print('URL {} needs to be re-queued'.format(request.url))
        db.add_value(key, request.url)
        raise IgnoreRequest
def test_display_selector_without_return(self):
    selector = HtmlResponse(body=b'<html>test</html>', url='http://some.url', status=200).xpath('//html')
    resp = display(selector)
    self.assertIsNone(resp)
def open_page(name):
    return HtmlResponse(url=name, body=open_spec(name), encoding='utf-8')
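# A minimal usage sketch for open_page above (hypothetical: assumes open_spec returns the raw
# HTML body of a stored test page; the spec name mirrors the fixture used in test_empty_page
# and is only an example):
def example_open_page_usage():
    page = open_page('cars.com_nested.json')
    # open_page wraps the fixture body in an HtmlResponse, so selectors work directly.
    return page.xpath('//a/@href').extract()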
def build_links(self):
    """
    Build a complete list of links from html in elasticsearch
    """
    def binary_search(array, key, low, high):
        """
        Fast search in a sorted array
        """
        if low > high:  # termination case
            return -1
        middle = (low + high) // 2  # integer midpoint of the search range
        if array[middle] == key:  # if the middle is our key
            return middle
        elif key < array[middle]:  # our key might be in the left sub-array
            return binary_search(array, key, low, middle - 1)
        else:  # our key might be in the right sub-array
            return binary_search(array, key, middle + 1, high)

    es_obj = ElasticSearchPipeline.from_crawler(self.crawler).es
    new_links = []
    hashes = sorted([
        url['_id'] for url in scan(
            es_obj,
            query={"query": {"exists": {"field": "url"}}},
            index=self.es_index,
            doc_type=self.settings['ELASTICSEARCH_TYPE'],
            _source_exclude=["*"])
    ])
    urls_iter = scan(
        es_obj,
        query={"query": {"exists": {"field": "content"}}},
        index=self.es_index,
        doc_type=self.settings['ELASTICSEARCH_TYPE'],
        _source_include=["content", "url"]
    )
    for hit in urls_iter:
        id_ = hit['_id']
        url = hit['_source']['url']
        content = hit['_source']['content']
        try:
            # response = HtmlResponse(url, encoding="utf-8", body=content)
            response = HtmlResponse(url, body=content)
            for request in self._requests_to_follow(response):
                hash_target = hashlib.sha1(request.url.encode('utf-8')).hexdigest()
                if binary_search(hashes, hash_target, 0, len(hashes) - 1) < 0:
                    continue
                new_links.append((id_, hash_target))
        except TypeError:
            pass
    return new_links
def body2hxs(ctx, encoding='utf-8', url='http://localhost'):
    r = HtmlResponse(url, body=ctx, encoding=encoding)
    return HtmlXPathSelector(r)
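# A minimal usage sketch for body2hxs above (hypothetical markup). HtmlXPathSelector is the
# legacy pre-1.0 Scrapy selector API, so .select() is used instead of .xpath():
def example_body2hxs_usage():
    hxs = body2hxs(b'<html><body><p>hello</p></body></html>')
    return hxs.select('//p/text()').extract()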
def parse(self, response):
    # meta = response.meta
    # _from = meta.get('_from')
    # _to = meta.get('_to')
    # _date = meta.get('_date')
    res = json.loads(response.text)
    if 'origin' in response.meta:
        multiDayAvailabilityOutbound = res['multiDayAvailabilityOutbound']
        # HtmlResponse needs a single URL string, so take the first start URL
        r = HtmlResponse(url=self.start_urls[0],
                         body=multiDayAvailabilityOutbound.encode('utf-8'),
                         encoding='utf-8')
        __RequestVerificationToken = r.xpath(
            '//div[@class="animation-container"]//input[@name="__RequestVerificationToken"]/@value'
        ).extract_first()
        li_days = r.xpath(
            '//*[@class="HV-gc bulletless days"]/li/div[@class="day day-with-availability"]'
        )
        for li in li_days:
            date_date = li.xpath('@data-date').extract_first()
            body = {
                'selectSingleDayAvailability.JourneyType': 'OutboundFlight',
                'selectSingleDayAvailability.Date.DateToParse': date_date[:10],
                'selectSingleDayAvailability.AutoSelect': False,
                '__RequestVerificationToken': __RequestVerificationToken
            }
            yield scrapy.Request(
                self.select_url,
                method='POST',
                body=parse.urlencode(body),
                headers=response.request.headers,
                cookies=response.request.cookies,
            )
    else:
        SingleDayOutbound = res['SingleDayOutbound']
        html = HtmlResponse(url='', body=SingleDayOutbound.encode('utf-8'), encoding='utf-8')
        buttons = html.xpath('//button[@class="flight-result-button"]')
        for button in buttons:
            # airports
            button_value = button.xpath('@value').extract_first()
            dep_airport, arr_airport = re.findall(r'~(\w{3})~', button_value)[:2]
            fromCity = self.city_airport.get(dep_airport, dep_airport)
            toCity = self.city_airport.get(arr_airport, arr_airport)
            # times
            div_times = button.xpath('div[@class="times"]')
            departure = div_times.xpath('time[@class="departure"]/@datetime').extract_first()
            departure_time = div_times.xpath('time[@class="departure"]/text()').extract_first().strip()
            dep_date = "%s %s:00" % (departure[:10], departure_time)
            arrival = div_times.xpath('time[@class="arrival"]/@datetime').extract_first()
            arrival_time = div_times.xpath('time[@class="arrival"]/text()').extract_first().strip()
            arr_date = "%s %s:00" % (arrival[:10], arrival_time)
            # flight number
            details = button.xpath('div[@class="details"]')
            flight_number_list = details.xpath('ul/li[@class="flight-number"]/text()').extract()
            flight_number = flight_number_list[1].strip()
            # price
            actions = button.xpath('div[@class="actions"]')
            price_div = actions.xpath('div[contains(@class, "price")]')
            currency = price_div.xpath('span[@class="currency"]/text()').extract_first().strip()
            currency = self.currency_cache.get(currency, currency)
            price = price_div.xpath('text()[2]').extract_first().strip()
            item = FlightsItem()
            item.update(
                dict(
                    flightNumber=flight_number,  # flight number
                    depTime=int(
                        time.mktime(
                            time.strptime(dep_date, "%d/%m/%Y %H:%M:%S"))),  # departure time
                    arrTime=int(
                        time.mktime(
                            time.strptime(arr_date, "%d/%m/%Y %H:%M:%S"))),  # arrival time
                    fromCity=fromCity,  # departure city
                    toCity=toCity,  # arrival city
                    depAirport=dep_airport,  # departure airport
                    arrAirport=arr_airport,  # arrival airport
                    currency=currency,  # currency code
                    adultPrice=float(price),  # adult fare
                    adultTax=0,  # tax
                    netFare=float(price),  # net fare
                    maxSeats=3,  # number of bookable seats
                    cabin='E',  # cabin class
                    carrier=flight_number[:2],  # carrier code
                    isChange=1,  # direct or transfer: 1 = direct, 2 = transfer
                    segments="NULL",  # per-segment info for transfer flights
                    getTime=int(time.time()),
                ))
            yield item