def parse_play_list(self, response):
    """Parse a country-level "play" list.

    Yields one raw ``LvmamaPoiItem`` snapshot of the AJAX payload, then one
    city-level ``Request`` per (destination link, content-type config) pair.
    """
    jsn = json.loads(response.text)
    item = LvmamaPoiItem()
    meta = response.request.meta
    item['raw'] = {'meta': meta, 'content': jsn}
    yield item
    meta['level'] = 'city'
    html = HTML(jsn.get('data').get('html'))
    a_list = html.xpath('.//dl//div[@class="item-info"]//strong/a')
    # Loop-invariant: compile the destination-id pattern once, not per pair.
    dest_id_pattern = re.compile(r'(\d+)')
    # `content_type` replaces the original loop name `type`, which shadowed
    # the builtin.
    for a, (content_type, value) in product(a_list, self.content.items()):
        url = get_text_by_xpath(a, '@href')
        dest = url.replace("http://www.lvmama.com/lvyou/", '')
        # Only follow on-site destination links; skip review anchors.
        if not url.startswith('http://www.lvmama.com/lvyou'
                              ) or url.endswith('#dianping'):
            continue
        meta['request_uri'] = url.replace("http://www.lvmama.com/lvyou/", '')
        dest_id = dest_id_pattern.findall(url)[-1]
        data = copy.deepcopy(value.get('data'))
        data['dest_id'] = dest_id
        meta['page_per_count'] = value.get('page_per_count')
        meta['type'] = content_type
        # NOTE(review): `data` is built and patched but never attached to the
        # outgoing Request — presumably consumed elsewhere via meta, or dead
        # code; confirm before removing.
        if 'request_uri' in data:
            data['request_uri'] = data['request_uri'] + dest
        yield Request(url=self.URL.format(type=content_type, country=dest),
                      meta=meta, callback=self.parse)
def parse_play_detail(self, html, meta):
    """Build one POI-detail ``Request`` per anchor under ``.//dl/dt/a``.

    NOTE(review): ``meta`` is one shared dict mutated per link; if Request
    stores it by reference, every returned request sees the last link's
    ``request_uri`` — confirm that is intended.
    """
    requests = []
    for anchor in html.xpath('.//dl/dt/a'):
        link = get_text_by_xpath(anchor, '@href')
        meta['request_uri'] = link.replace(
            "http://www.lvmama.com/lvyou/poi/", '')
        requests.append(Request(url=link, meta=meta,
                                callback=self.parse_poi))
    return requests
def parse_items(self, response):
    """Parse one page of the hotel list: schedule the remaining pages,
    archive the raw response, and yield one ``HaoqiaoItem`` per row."""
    meta = response.request.meta
    page = meta['page']
    jsn = json.loads(response.text)
    # Pagination: only page 1 knows the total, so it schedules pages 2..N.
    if page == 1:
        result_number = jsn['result_number']
        count_per_page = 10  # records per page (fixed by the site)
        pages = math.ceil(result_number / count_per_page)
        for page in range(2, pages + 1):
            meta['page'] = page
            yield Request(self.PAGE_URL.format(dest_id=meta.get('dest_id'),
                                               page=page),
                          meta=meta,
                          callback=self.parse_items)
    # Archive the compressed raw body together with its crawl meta.
    mdb_item = HaoqiaoMDBItem()
    mdb_item['raw'] = {
        'content': str(lzma.compress(response.body)),
        'meta': meta
    }
    yield mdb_item
    # Extract one record per <li class="J_hotel_list"> in the embedded HTML.
    html = HTML(jsn['list'])
    lis = html.xpath('.//li[@class="J_hotel_list"]')
    for li in lis:
        item = HaoqiaoItem()
        item['title'] = get_text_by_xpath(
            li, './/div[@class="hotel-l-t f20 t-333 fl"]/text()')
        item['title_en'] = get_text_by_xpath(
            li, './/div[@class="hotel-l-t f20 t-333 fl"]/span/text()')
        item['city_id'] = meta['city_id']
        item['city'] = meta['city']
        item['url'] = get_text_by_xpath(li, './/a[1]/@href')
        yield item
def parse(self, response):
    """Parse a destination's POI listing entry page.

    1. Extract the inline-JS PLACE object (PID / TYPE) from the page.
    2. Read the total page count.
    3. Yield the AJAX list-JSON requests.
    eg: https://place.qyer.com/dubai/sight/
    """
    url = 'https://place.qyer.com/poi.php?'
    # Raw string for the regex: '\d'/'\D' in a plain string literal raise
    # DeprecationWarning on newer Pythons.
    pattern = re.compile(r'var PLACE ([\d\D]+?);')
    place = pattern.findall(response.text)[0].replace('= PLACE || ', '')
    # NOTE(review): evaluates JS pulled straight from the page; acceptable
    # for this scraper, but never feed it input beyond this trusted pattern.
    place = execjs.eval(place)
    html = HTML(response.text)
    page_num = self.get_page_num(html)
    poi_sort = utils.get_text_by_xpath(
        html, './/p[@id="poiSort"]/a[@class="current"]/@data-id')
    # TODO
    if page_num < 100:
        # Few enough pages: walk them all directly.
        for i in range(1, page_num + 1):
            param = {'action': 'list_json', 'haslastm': 'false',
                     'isnominate': '-1', 'page': i, 'pid': place['PID'],
                     'rank': '6', 'sort': poi_sort, 'subsort': 'all',
                     'type': place['TYPE']}
            print('爬取第{} 页'.format(i))
            yield Request(url=url + urlencode(param),
                          callback=self.parse_poi_list)
    else:
        # Too many pages: split the crawl by sub-category instead.
        subsorts = html.xpath(
            './/li[@id="poiSubsort"]/p[@id="poiSortLabels"]/a/@data-id')
        for subsort in subsorts:
            # Skip subsort '0' — presumably the aggregate bucket; confirm.
            if subsort == '0':
                continue
            param = {'action': 'list_json', 'haslastm': 'false',
                     'isnominate': '-1', 'page': 1, 'pid': place['PID'],
                     'rank': '6', 'sort': poi_sort, 'subsort': subsort,
                     'type': place['TYPE']}
            yield Request(url=url + urlencode(param), meta={'param': param},
                          callback=self.pares_poi_subpage)
def parse_view_list(self, response):
    """Parse a scenic-spot (POI) list: archive the raw AJAX payload, then
    follow each qualifying POI link at the 'poi' level."""
    meta = response.request.meta
    payload = json.loads(response.text)
    raw_item = LvmamaPoiItem()
    raw_item['raw'] = {'meta': meta, 'content': payload}
    yield raw_item
    meta['level'] = 'poi'
    tree = HTML(payload.get('data'))
    for anchor in tree.xpath('.//dl/dd/div[@class="title"]/a'):
        link = get_text_by_xpath(anchor, '@href')
        # Follow only on-site POI links; skip review ("#dianping") anchors.
        if not link.startswith('http://www.lvmama.com/lvyou/poi'):
            continue
        if link.endswith('#dianping'):
            continue
        yield Request(url=link, meta=meta, callback=self.parse_poi)
def parse_poi_detail(self, response):
    """Parse a Qyer POI detail page.

    eg: https://place.qyer.com/poi/V2UJYVFkBzJTZVI9/
    """
    tree = HTML(response.text)
    item = items.PoiDetailItem()
    item['raw'] = {'html': str(lzma.compress(response.body))}
    item['url'] = response.request.url
    item['id'] = response.request.meta.get('id')
    item['catename'] = response.request.meta.get('catename')
    # Straightforward (field, xpath) extractions from the page header/body.
    simple_fields = (
        ('head', './/div[@class="qyer_head_crumb"]/span//text()'),
        ('title', './/div[@class="poi-largeTit"]/h1[@class="cn"]//text()'),
        ('title_en', './/div[@class="poi-largeTit"]/h1[@class="en"]//text()'),
        ('rank', './/div[@class="infos"]//ul/li[@class="rank"]/span//text()'),
        ('poi_detail',
         './/div[@class="compo-detail-info"]/div[@class="poi-detail"]//text()'),
        ('poi_tips',
         './/div[@class="compo-detail-info"]/ul[@class="poi-tips"]//text()'),
    )
    for field, xp in simple_fields:
        item[field] = utils.get_text_by_xpath(tree, xp)
    # Labelled tip rows: (label substring, item field), checked in order;
    # the first matching label wins, mirroring the original elif chain.
    label_to_field = (
        ('地址', 'address'),
        ('到达方式', 'arrive_method'),
        ('开放时间', 'open_time'),
        ('门票', 'ticket'),
        ('电话', 'phone'),
        ('网址', 'website'),
    )
    rows = tree.xpath(
        './/div[@class="compo-detail-info"]/ul[@class="poi-tips"]/li')
    for row in rows:
        label = utils.get_text_by_xpath(row, './/span[@class="title"]/text()')
        content = utils.get_text_by_xpath(row, './/div[@class="content"]//text()')
        for key, field in label_to_field:
            if key in label:
                item[field] = content
                break
    item['poi_tip_content'] = utils.get_text_by_xpath(
        tree,
        './/div[@class="compo-detail-info"]/div[@class="poi-tipContent"]//text()')
    yield item
def parse(self, response):
    """Compute the page count for a destination and schedule the matching
    AJAX listing requests (play / scenery), or hand off to the play-detail
    parser when the page already carries a #view_list block."""
    html = HTML(response.text)
    meta = response.request.meta
    # Raw string: avoids invalid-escape warnings for '\d'.
    num_pattern = re.compile(r'(\d+)')
    count_info = get_text_by_xpath(
        html, './/div[@class="wy_state_page"]/p//text()')
    total_count = num_pattern.findall(count_info)[-1]
    page_num = 1
    if total_count.isdigit():
        page_per_count = meta.get('page_per_count')
        page_num = math.ceil(int(total_count) / page_per_count)
    # A present #view_list means this page already lists the details.
    view_list = html.xpath('.//div[@id="view_list"]')
    if view_list:
        for request in self.parse_play_detail(html, meta):
            yield request
        return
    # Renamed local (was `type`, shadowing the builtin).
    content_type = meta.get('type')
    dest_id = meta.get('dest_id')
    if content_type == 'scenery':
        # Loop-invariant: extract base_id once instead of on every page.
        base_id = re.compile(r'base_id :"(\d+)",').findall(response.text)[-1]
    for i in range(1, page_num + 1):
        if content_type == 'play' and meta.get('level') == 'country':
            data = {'page': i, 'dest_id': dest_id}
            meta['ref'] = data
            yield Request(
                url="http://www.lvmama.com/lvyou/dest_content/AjaxGetPlayList?"
                    + urlencode(data),
                meta=meta,
                callback=self.parse_play_list)
        elif content_type == 'play' and meta.get('level') != 'country':
            data = {
                'page': i,
                'dest_id': dest_id,
                'search_key': '',
                'request_uri': '/lvyou/play/' + meta.get('request_uri'),
                'type': content_type
            }
            meta['ref'] = data
            yield Request(
                url="http://www.lvmama.com/lvyou/dest_content/AjaxGetViewSpotList?"
                    + urlencode(data),
                meta=meta,
                callback=self.parse_play_list2)
        elif content_type == 'scenery':
            data = {
                'page_num': i,
                'dest_id': dest_id,
                'base_id': base_id,
                'request_uri': '/lvyou/scenery/' + meta.get('request_uri')
            }
            meta['ref'] = data
            yield Request(
                url='http://www.lvmama.com/lvyou/ajax/getNewViewList?'
                    + urlencode(data),
                meta=meta,
                callback=self.parse_view_list)
def parse_poi(self, response):
    """Parse a Lvmama POI detail page into a ``LvmamaPoiDetailItem``.

    Two layouts are handled, distinguished by the URL: 'sight' pages
    (single attraction) and 'zone' pages (city/area). Each maps labelled
    <dt>/<dd> (or <p>) pairs onto item fields by label substring.
    """
    html = HTML(response.text)
    item = LvmamaPoiDetailItem()
    meta = response.request.meta
    # Archive the compressed raw page together with its crawl meta.
    item['raw'] = {'html': str(lzma.compress(response.body)), 'meta': meta}
    url = response.request.url
    item['url'] = url
    item['country'] = meta['country']
    if 'sight' in url:
        # Breadcrumb / titles for the sight layout.
        item['head'] = get_text_by_xpath(
            html, './/span[@class="crumbs_nav"]/span/a//text()', "|")
        item['title'] = get_text_by_xpath(
            html, './/div[@class="vtop-name-box"]/h2[@class="title"]/text()')
        item['title_en'] = get_text_by_xpath(
            html,
            './/div[@class="vtop-name-box"]/span[@class="title-eng"]/text()'
        )
        item['vcomon'] = get_text_by_xpath(
            html,
            './/div[@class="vtop-name-box"]/i[@class="vcomon-icon"]/text()'
        )
        # Labelled description sections; first matching label wins.
        dls = html.xpath('.//dl[@class="poi_bordernone"]')
        for dl in dls:
            dt = get_text_by_xpath(dl, './/dt//text()')
            dd = get_text_by_xpath(dl, './/dd//text()')
            if '简介' in dt:  # brief introduction
                item['poi_brief'] = dd
            elif '景点导览' in dt:  # attraction guide
                item['poi_detail'] = dd
            elif '交通信息' in dt:  # traffic information
                item['traffic'] = dd
            elif '小贴士' in dt:  # tips
                item['poi_tip_content'] = dd
        # Fact box: dt/dd pairs zipped positionally.
        dts = html.xpath('.//div[@class="vtop-comment-box fl"]/dl/dt')
        dds = html.xpath('.//div[@class="vtop-comment-box fl"]/dl/dd')
        for dt, dd in zip(dts, dds):
            dt = get_text_by_xpath(dt, './/text()')
            dd = get_text_by_xpath(dd, './/text()')
            if '地 址' in dt:  # address (site label contains a space)
                item['address'] = dd
            elif '游玩时间' in dt:  # suggested visit duration
                item['playtime'] = dd
            elif '联系电话' in dt:  # contact phone
                item['phone'] = dd
            elif '门票' in dt:  # ticket
                item['ticket'] = dd
            elif '开放时间' in dt:  # opening hours
                item['open_time'] = dd
            elif '网址' in dt:  # website
                item['website'] = dd
    elif 'zone' in url:
        item['head'] = get_text_by_xpath(
            html,
            './/div[@class="nav clearfix"]/span[@class="crumbs_nav fl"]/span/a//text()',
            '|')
        item['title'] = get_text_by_xpath(
            html,
            './/div[@class="nav_country clearfix"]/div[@class="countryBox fl"]/h1/text()'
        )
        item['title_en'] = get_text_by_xpath(
            html,
            './/div[@class="nav_country clearfix"]/div[@class="countryBox fl"]/h1/span/text()'
        )
        item['active'] = get_text_by_xpath(
            html,
            # NOTE(review): this literal was line-wrapped mid-string in the
            # original source; reconstructed as "countryBox fl" to match the
            # sibling xpaths above — confirm against the live page.
            './/div[@class="nav_country clearfix"]/div[@class="countryBox fl"]/p[@class="active"]/text()'
        )
        # Labelled description sections for the zone layout.
        dls = html.xpath(
            './/div[@class="city_viewBox"]/div[@class="city_view_model"]/div/dl'
        )
        for dl in dls:
            dt = get_text_by_xpath(dl, './/dt//text()')
            dd = get_text_by_xpath(dl, './/dd//text()')
            if '简介' in dt:  # brief introduction
                item['poi_brief'] = dd
            elif '景点导览' in dt:  # attraction guide
                item['poi_detail'] = dd
            elif '交通信息' in dt:  # traffic information
                item['traffic'] = dd
            elif '小贴士' in dt:  # tips
                item['poi_tip_content'] = dd
        # Fact box for the zone layout: label in p[1], value in p[2].
        divs = html.xpath('.//dl[@class="city_mapList clearfix"]/dd/div')
        for div in divs:
            dt = get_text_by_xpath(div, './/p[1]//text()')
            dd = get_text_by_xpath(div, './/p[2]//text()')
            # Spaces stripped so both "地址" and "地 址" match.
            if '地址' in dt.replace(' ', ''):
                item['address'] = dd
            elif '游玩时间' in dt:  # suggested visit duration
                item['playtime'] = dd
            elif '联系电话' in dt:  # contact phone
                item['phone'] = dd
            elif '门票' in dt:  # ticket
                item['ticket'] = dd
            elif '开放时间' in dt:  # opening hours
                item['open_time'] = dd
            elif '网址' in dt:  # website
                item['website'] = dd
    yield item