def parse(self, task, input_file):
    """Parse the attraction list page (JSON data).

    Args:
        task: HttpTask, the request task
        input_file: StringIO, page content as a StringIO

    Yields:
        task: HttpTask, new task
        item: Item, parse result
    """
    self.logger.debug("start parse attraction list page")
    try:
        json_data = json.load(input_file)
        city_name = json_data['data']['surl']
        current_page = int(json_data['data']['current_page'])
        scene_list = json_data['data']['scene_list']
        total_scene = int(json_data['data']['scene_total'])
        for index, scene in enumerate(scene_list):
            relate_path = scene['surl']
            sid = scene['sid']
            map_info = scene['ext']['map_info']
            seq_sort = \
                (current_page - 1) * EVERY_PAGE_SCENE_COUNT + index + 1
            # build the request for the attraction detail page
            http_request = HTTPRequest(build_scene_url(relate_path),
                                       connect_timeout=5,
                                       request_timeout=10)
            scene_task = HttpTask(http_request,
                                  callback="AttractionParser",
                                  max_fail_count=3,
                                  cookie_host=LVYOU_HOST,
                                  kwargs={
                                      "map_info": map_info,
                                      "seq_sort": seq_sort,
                                      "sid": sid,
                                      "relate_path": relate_path
                                  })
            yield scene_task

        # build the next-page task when more scenes remain
        if current_page * EVERY_PAGE_SCENE_COUNT < total_scene:
            next_request = build_next_page_request(city_name,
                                                   current_page + 1)
            next_page_task = HttpTask(next_request,
                                      callback="AttractionListParser",
                                      max_fail_count=5,
                                      cookie_host=LVYOU_HOST)
            yield next_page_task
    except Exception as e:
        self.logger.info("json load error:%s for url:%s"
                         % (e, task.request.url))
        raise
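# For reference, the JSON shape the parser above expects, reconstructed
# from the fields it reads; the values shown are illustrative, not real
# response data:
#
# {
#     "data": {
#         "surl": "shanghai",
#         "current_page": "1",
#         "scene_total": "500",
#         "scene_list": [
#             {"surl": "dongfangmingzhu",
#              "sid": "...",
#              "ext": {"map_info": "121.49,31.24"}}
#         ]
#     }
# }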
def _check_and_execute_picture(self, picture_url, cookie_host,
                               cookie_count):
    """Check whether the picture already exists; build a task and rewrite
    the path.

    Args:
        picture_url: str, url of the picture
        cookie_host: str, cookie host for the picture task
        cookie_count: int, cookie count for the picture task

    Returns:
        a 2-tuple, (picture_path, task)
    """
    picture_path = u""
    if picture_url:
        # the two patterns below are regular expressions, so apply them
        # with re.sub; str.replace (used originally) would treat them as
        # literal text and never match -- requires `import re` at module
        # level
        picture_path = picture_url.replace(u"http://", self._picture_host)
        picture_path = re.sub(ur"\s+", u"", picture_path)
        picture_path = re.sub(ur"\.jpg\\.*$", u".jpg", picture_path)
        picture_path = picture_path.lower()
    if len(picture_path) > 0 and \
            not os.path.exists(self._picture_dir + picture_path):
        picture_request = HTTPRequest(url=str(picture_url),
                                      connect_timeout=10,
                                      request_timeout=60)
        picture_task = HttpTask(
            picture_request, callback='PictureParser',
            cookie_host=cookie_host, cookie_count=cookie_count,
            max_fail_count=2,
            kwargs={'picturepath': self._picture_dir + picture_path})
        return picture_path, picture_task
    else:
        return picture_path, None
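# A minimal sketch of how a parser might consume the helper above,
# assuming it runs inside a parse() generator that yields tasks; the
# picture url is hypothetical:
#
# picture_path, picture_task = self._check_and_execute_picture(
#     u"http://img.example.com/deal/12345.jpg", cookie_host, cookie_count)
# if picture_task is not None:
#     yield picture_task  # download only when the file is not on disk yet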
def create_city_type_task(city_name, city_code, abbreviation, _type, tag,
                          page=1, j=1):
    """Build a CityTypeTask from the given parameters.

    Args:
        city_name: str, Chinese name of the city
        city_code: int, city code
        abbreviation: str, pinyin abbreviation of the city
        _type: str, type name
        tag: str, tag
        page: int, page number
        j: int, constant

    Returns:
        task: HttpTask, the task
    """
    url = "http://www.228.com.cn/s/%s-%s/?j=%s&p=%s" % (
        abbreviation, _type, j, page)
    cookie_host = "http://www.228.com.cn/%s/" % abbreviation
    http_request = HTTPRequest(url=url, connect_timeout=10,
                               request_timeout=25)
    task = HttpTask(http_request, callback="DealParser", max_fail_count=8,
                    cookie_host=cookie_host, cookie_count=20,
                    kwargs={'type': _type, 'abbreviation': abbreviation,
                            'city_code': city_code, 'city_name': city_name,
                            'tag': tag, 'current_page': page,
                            'cookie_host': cookie_host,
                            'cookie_count': 20})
    return task
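# A usage sketch for create_city_type_task; the city metadata and the
# type/tag values below are illustrative assumptions, not real project
# config:
#
# task = create_city_type_task(city_name=u"上海", city_code=289,
#                              abbreviation="shanghai",
#                              _type="yanchanghui", tag=u"演唱会")
# # task.request.url ==
# #     "http://www.228.com.cn/s/shanghai-yanchanghui/?j=1&p=1"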
def _check_and_execute_picture(self, picture_url):
    """Check the picture info and build a picture task.

    Args:
        picture_url: str, picture url

    Returns:
        (pictures, task): 2-tuple, (list of picture paths, task)
    """
    pictures = []
    if picture_url:
        # as in the helper above, these patterns are regexes, so they are
        # applied with re.sub rather than literal str.replace
        picture_path = picture_url.replace(u"http://", self._picture_host)
        picture_path = re.sub(ur"\s+", u"", picture_path)
        picture_path = re.sub(ur"\.jpg\\.*$", u".jpg", picture_path)
        pictures.append(picture_path.lower())
    if len(pictures) >= 1 and \
            not os.path.exists(self._picture_dir + pictures[0]):
        picture_request = HTTPRequest(url=str(picture_url),
                                      connect_timeout=10,
                                      request_timeout=40)
        picture_task = HttpTask(
            picture_request, callback='PictureParser',
            cookie_host='http://www.nuomi.com', cookie_count=15,
            kwargs={'picturepath': self._picture_dir + pictures[0]})
        return pictures, picture_task
    else:
        return pictures, None
def parse(self, task, input_file):
    """Parse the city list page.

    Args:
        task: Task, task description
        input_file: File, file object

    Yields:
        Item
        Task
    """
    tree = html.parse(input_file)
    citys = tree.xpath("//p[@id='citypid']/text()")
    # xpath always returns a list, so an emptiness check is enough
    citys = citys[0] if len(citys) > 0 else ""
    for city in citys.split(u","):
        city_english_name = remove_white(city)
        if len(city_english_name) > 0:
            city_item = CityItem("", city_english_name,
                                 get_city_code(city_english_name))
            if city_item.english_name and city_item.city_code:
                yield city_item
                http_request = HTTPRequest(
                    url=build_url_by_city_name(city_item.english_name),
                    connect_timeout=20, request_timeout=240)
                new_task = HttpTask(
                    http_request, callback='DealParser', max_fail_count=5,
                    kwargs={'citycode': city_item.city_code})
                yield new_task
def parse(self, task, input_file):
    """Parse a list page (JSON data).

    Args:
        task: HttpTask, the task object
        input_file: File, file object

    Yields:
        item: Item, extracted item
        task: new Task
    """
    # load the json data
    self.logger.info("deal parser start to handle")
    json_data = json.load(input_file)
    elems = json_data.get('products')
    page_size = json_data.get('pageSize', 1)

    # read the parameters passed in through the task
    city_name = task.kwargs.get('city_name')
    tag = task.kwargs.get('tag')
    current_page = task.kwargs.get('current_page')
    city_code = task.kwargs.get('city_code')
    _type = task.kwargs.get('type')
    abbreviation = task.kwargs.get('abbreviation')
    cookie_host = task.kwargs.get('cookie_host')
    cookie_count = task.kwargs.get('cookie_count')

    if elems is not None:
        for elem in elems:
            try:
                url, name, start_time, end_time, place_name = \
                    _extract_elem(elem)
                # store the Activity item
                yield ActivityItem(name, url, start_time, end_time,
                                   place_name, tag, city_code)
                # issue the detail-page request; use a fresh name so the
                # incoming `task` argument is not shadowed
                request = HTTPRequest(url, connect_timeout=10,
                                      request_timeout=15)
                activity_task = HttpTask(request,
                                         callback="ActivityParser",
                                         cookie_host=cookie_host,
                                         cookie_count=cookie_count,
                                         max_fail_count=3,
                                         kwargs={
                                             "url": url,
                                             "cookie_host": cookie_host,
                                             "cookie_count": cookie_count
                                         })
                yield activity_task
            except Exception as e:
                self.logger.warn("extract one element failed error:%s" % e)

    # issue the next-page request
    if current_page < int(page_size):
        next_page_task = create_city_type_task(city_name, city_code,
                                               abbreviation, _type, tag,
                                               page=current_page + 1)
        yield next_page_task
def build_hotels_task_for_city(ctrip_code, city_code, chinese_name,
                               avaliable="false"):
    """build task for hotel search

    Args:
        ctrip_code: str, city code for ctrip
        city_code: str, city code of tigerknows
        chinese_name: str, chinese name of city
        avaliable: str, "true" to search available hotels only

    Returns:
        task: HttpTask, new task
    """
    timestamp = int(time.time())
    request_xml = """<?xml version="1.0" encoding="utf-8"?>
<Request><Header AllianceID="%s" SID="%s" TimeStamp="%s"
RequestType="%s" Signature="%s" /><HotelRequest>
<RequestBody xmlns:ns="http://www.opentravel.org/OTA/2003/05"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:xsd="http://www.w3.org/2001/XMLSchema">
<ns:OTA_HotelSearchRQ Version="1.0" PrimaryLangID="zh"
xsi:schemaLocation="http://www.opentravel.org/OTA/2003/05
OTA_HotelSearchRQ.xsd" xmlns="http://www.opentravel.org/OTA/2003/05">
<ns:Criteria AvailableOnlyIndicator="%s"><ns:Criterion>
<ns:HotelRef HotelCityCode="%s"/>
<ns:Position PositionTypeCode="502" />
</ns:Criterion></ns:Criteria></ns:OTA_HotelSearchRQ>
</RequestBody></HotelRequest></Request>""" \
        % (ALLIANCE_ID, SID, timestamp, "OTA_HotelSearch",
           _create_signature(timestamp, ALLIANCE_ID, SID,
                             "OTA_HotelSearch", API_KEY),
           avaliable, ctrip_code,)
    post_xml = """<?xml version="1.0" encoding="utf-8"?>
<soap:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:xsd="http://www.w3.org/2001/XMLSchema"
xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
<soap:Body><Request xmlns="http://ctrip.com/">
<requestXML>%s</requestXML></Request></soap:Body></soap:Envelope>""" \
        % escape(request_xml)
    http_request = HTTPRequest("http://%s/Hotel/OTA_HotelSearch.asmx"
                               % API_URL,
                               method="POST", body=post_xml,
                               connect_timeout=20, request_timeout=240,
                               headers={
                                   "SOAPAction": "http://ctrip.com/Request",
                                   "Content-Type": "text/xml; charset=utf-8"
                               })
    return HttpTask(http_request, callback="HotelListParser",
                    max_fail_count=5,
                    kwargs={
                        "citycode": city_code,
                        "chinesename": chinese_name
                    })
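# Usage sketch for build_hotels_task_for_city; the Ctrip city code "2"
# and tigerknows code "001" are hypothetical placeholders. The returned
# task posts a SOAP envelope to /Hotel/OTA_HotelSearch.asmx and hands the
# response to HotelListParser:
#
# task = build_hotels_task_for_city("2", "001", u"上海")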
class Intro1Spider(BaseSpider):

    parsers = {
        u"ActivityParser": ActivityParser,
    }

    pipelines = {
        u"WebItem": WebItemPipeline,
    }

    start_tasks = [
        HttpTask(
            HTTPRequest(u"http://www.228.com.cn/ticket-49052202.html"),
            callback=u"ActivityParser")
    ]
def build_tag_tasks():
    """Build tasks for all tags.

    Returns:
        tasks: list, [HttpTask]
    """
    tasks = []
    for key, value in TAGS.iteritems():
        task = HttpTask(build_next_tag_page_request(key, 1, "shanghai"),
                        callback="TagListParser",
                        max_fail_count=5,
                        cookie_host=LVYOU_HOST,
                        kwargs={"tag": value})
        tasks.append(task)
    return tasks
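# TAGS is assumed to map a tag id (the cid passed as the first argument
# to build_next_tag_page_request) to a human-readable tag name; the
# entries below are hypothetical, not taken from the real module:
#
# TAGS = {u"4": u"公园", u"9": u"博物馆"}
#
# so that build_tag_tasks() seeds one TagListParser task per tag, e.g.
# start_tasks = build_tag_tasks()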
class LvYouDaoDaoSpider(BaseSpider):
    """Spider for crawling DaoDao travel information
    """

    parsers = {
        "AttractionListParser": AttractionListParser,
        "AttractionParser": AttractionParser,
        "DescriptionParser": DescriptionParser,
    }

    pipelines = {
        "AttractionItem": AttractionItemPipeline,
        "DescriptionItem": DescriptionItemPipeline,
    }

    start_tasks = [HttpTask(
        build_next_page_request(
            u"/Attractions-g308272-Activities-Shanghai.html"),
        callback="AttractionListParser")]
def parse(self, task, input_file):
    """Parse the attraction list page.

    Args:
        task: HttpTask, the task
        input_file: StringIO, page file

    Yields:
        task: HttpTask, new task
    """
    tree = html.parse(input_file)
    attraction_elems = tree.xpath(
        "//div[@id='ATTRACTION_OVERVIEW']"
        "/div[@class='attraction-list clearfix']")
    for attraction_elem in attraction_elems:
        try:
            info_elem = flist(
                attraction_elem.xpath(
                    "div[@class='clearfix']/div[@class='info']"), None)
            rank_elem = flist(
                attraction_elem.xpath(
                    "div[@class='clearfix']/div[@class='rank']"), None)
            relate_path = flist(
                info_elem.xpath("div[@class='title']/a/@href"), u"")
            name = flist(info_elem.xpath("div[@class='title']/a/text()"),
                         u"")
            address = _extract_address(info_elem)
            hot = flist(rank_elem.xpath("a/strong/text()"), u"")
            rank = flist(rank_elem.xpath("span[1]/strong/text()"), u"")
            # build the attraction task
            http_request = build_attraction_request(relate_path)
            attraction_task = HttpTask(http_request,
                                       callback="AttractionParser",
                                       max_fail_count=3,
                                       cookie_host=LVYOU_HOST,
                                       kwargs={
                                           "name": unicode(name).strip(),
                                           "address": unicode(address),
                                           "hot": unicode(hot),
                                           "rank": unicode(rank)
                                       })
            yield attraction_task
        except Exception as e:
            self.logger.warn("extract one attraction failed error:%s" % e)
class LvYouBaiDuSpider(BaseSpider):
    """Spider for crawling Baidu Lvyou (travel) data
    """

    parsers = {
        "AttractionListParser": AttractionListParser,
        "AttractionParser": AttractionParser,
        "CommentListParser": CommentListParser,
    }

    pipelines = {
        "AttractionItem": AttractionItemPipeline,
        "CommentListItem": CommentListItemPipeline,
    }

    start_tasks = [
        HttpTask(build_next_page_request("shanghai", 1),
                 callback="AttractionListParser",
                 max_fail_count=5,
                 cookie_host=LVYOU_HOST)
    ]
def parse(self, task, input_file):
    """Parse the tag list page (JSON data).

    Args:
        task: HttpTask, the request task
        input_file: StringIO, page content as a StringIO

    Yields:
        task: HttpTask, new task
        item: Item, parse result
    """
    self.logger.debug("start parse tag list page")
    try:
        json_data = json.load(input_file)
        city_name = json_data['data']['surl']
        current_page = int(json_data['data']['current_page'])
        scene_list = json_data['data']['scene_list']
        total_scene = int(json_data['data']['scene_total'])
        current_cid = json_data['data']['current_cid']
        tag = task.kwargs.get('tag', u"")
        for scene in scene_list:
            sid = scene['sid']
            yield TagItem(tag, current_cid, sid)

        # build the next-page task when more scenes remain
        if current_page * EVERY_PAGE_SCENE_COUNT < total_scene:
            next_request = build_next_tag_page_request(
                current_cid, current_page + 1, city_name)
            next_tag_task = HttpTask(next_request,
                                     callback="TagListParser",
                                     max_fail_count=5,
                                     cookie_host=LVYOU_HOST,
                                     kwargs={'tag': tag})
            yield next_tag_task
    except Exception as e:
        self.logger.info("json load error:%s for url:%s"
                         % (e, task.request.url))
        raise
class NuomiSpider(BaseSpider):
    """Spider for crawling group-buy deals from Nuomi
    """

    parsers = {
        'CityParser': CityParser,
        'DealParser': DealParser,
        'PictureParser': PictureParser,
    }

    pipelines = {
        'CityItem': EmptyPipeline,
        'DealItem': DealItemPipeline,
        'PictureItem': PictureItemPipeline,
    }

    start_tasks = [
        HttpTask(HTTPRequest(url='http://www.nuomi.com/help/api',
                             connect_timeout=10, request_timeout=20),
                 callback='CityParser', max_fail_count=8, kwargs={}),
    ]
def build_rooms_task_for_hotel(hotel_requests, city_code, chinese_name,
                               hotel_addresses):
    """build room task for hotel

    Args:
        hotel_requests: list, [hotel_code] (each entry is a single hotel
            code interpolated into one HotelDescriptiveInfo block)
        city_code: str, city code of tigerknows
        chinese_name: str, chinese name of city
        hotel_addresses: dict, hotel address dict

    Returns:
        task: HttpTask, new task for hotel search
    """
    timestamp = int(time.time())
    request_info_xml = "".join([
        """<HotelDescriptiveInfo HotelCode="%s" PositionTypeCode="502">
<HotelInfo SendData="true"/><FacilityInfo SendGuestRooms="true"/>
<AreaInfo SendAttractions="false" SendRecreations="false"/>
<ContactInfo SendData="false"/><MultimediaObjects SendData="true"/>
</HotelDescriptiveInfo>""" % hotel_code
        for hotel_code in hotel_requests
    ])
    request_xml = """<?xml version="1.0" encoding="utf-8"?><Request>
<Header AllianceID="%s" SID="%s" TimeStamp="%s" RequestType="%s"
Signature="%s" />
<HotelRequest><RequestBody
xmlns:ns="http://www.opentravel.org/OTA/2003/05"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:xsd="http://www.w3.org/2001/XMLSchema">
<OTA_HotelDescriptiveInfoRQ Version="1.0"
xsi:schemaLocation="http://www.opentravel.org/OTA/2003/05
OTA_HotelDescriptiveInfoRQ.xsd"
xmlns="http://www.opentravel.org/OTA/2003/05"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<HotelDescriptiveInfos>%s</HotelDescriptiveInfos></OTA_HotelDescriptiveInfoRQ>
</RequestBody></HotelRequest></Request>""" % (
        ALLIANCE_ID, SID, timestamp, "OTA_HotelDescriptiveInfo",
        _create_signature(timestamp, ALLIANCE_ID, SID,
                          "OTA_HotelDescriptiveInfo", API_KEY),
        request_info_xml)
    post_xml = """<?xml version="1.0" encoding="utf-8"?>
<soap:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xmlns:xsd="http://www.w3.org/2001/XMLSchema"
xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
<soap:Body><Request xmlns="http://ctrip.com/">
<requestXML>%s</requestXML></Request></soap:Body></soap:Envelope>""" \
        % escape(request_xml)
    http_request = HTTPRequest(
        "http://%s/Hotel/OTA_HotelDescriptiveInfo.asmx" % API_URL,
        method="POST", body=post_xml, connect_timeout=20,
        request_timeout=360,
        headers={
            "SOAPAction": "http://ctrip.com/Request",
            "Content-Type": "text/xml; charset=utf-8"
        })
    return HttpTask(http_request, callback="HotelParser",
                    max_fail_count=5,
                    kwargs={
                        "citycode": city_code,
                        "chinesename": chinese_name,
                        "address": hotel_addresses
                    })
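# Usage sketch with hypothetical hotel codes and address; one request
# batches several HotelDescriptiveInfo blocks, so a single task fetches
# room data for many hotels at once:
#
# task = build_rooms_task_for_hotel(["1234", "5678"], "001", u"上海",
#                                   {"1234": u"某某路1号"})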
def parse(self, task, input_file):
    """Parse the attraction detail page.

    Args:
        task: HttpTask, the task
        input_file: StringIO, page file

    Yields:
        item
    """
    self.logger.debug("attraction parser start to parse")
    content = input_file.read()
    tree = html.parse(StringIO(content))
    try:
        zip_code = flist(tree.xpath("//span[@class='postal-code']/text()"),
                         u"")
        play_spend, play_spend_unit = _extract_play_spend_and_unit(content)
        tel_phone = flist(
            tree.xpath("//div[@id='HEADING_GROUP']"
                       "/div[@class='wrap infoBox']"
                       "/div[@class='odcHotel blDetails']"
                       "/div/div[@class='fl']/text()"), u"")
        open_time = u""
        total_score = flist(
            tree.xpath("//div[@class='rs rating']"
                       "/span/img/@content"), u"")
        ticket_info = u""
        preview_relate_path = flist(
            tree.xpath("//div[@class='listing_description']/a/@href"),
            u"")
        lon, lat = _extract_lon_lat(
            flist(
                tree.xpath("//div[@class='js_mapThumb']"
                           "/div[@id='bmapContainer']/img[1]/@src"),
                u""))
        comments = _extract_comments(tree)
        # build the attraction item (description not included)
        attraction_item = AttractionItem(task.request.url,
                                         task.kwargs['name'],
                                         unicode(play_spend),
                                         play_spend_unit,
                                         task.kwargs['address'],
                                         unicode(tel_phone),
                                         unicode(open_time),
                                         unicode(total_score),
                                         unicode(ticket_info),
                                         task.kwargs['hot'],
                                         lon, lat,
                                         task.kwargs['rank'],
                                         comments,
                                         unicode(zip_code))
        yield attraction_item

        # build the description task
        if len(preview_relate_path) != 0:
            description_request = build_description_request(
                task.request.url, preview_relate_path)
            description_task = HttpTask(description_request,
                                        callback="DescriptionParser",
                                        max_fail_count=3,
                                        cookie_host=LVYOU_HOST,
                                        kwargs={'url': task.request.url})
            yield description_task
        else:
            yield DescriptionItem(task.request.url, u"")
    except Exception as e:
        # log instead of print so failures reach the crawler's log
        self.logger.error("error:%s" % e)
        self.logger.error("error traceback:%s" % traceback.format_exc())
class AttractionListParser(BaseParser):
    """Parser for attraction list pages
    """

    def __init__(self, namespace):
        BaseParser.__init__(self, namespace)
        self.logger.info("init attraction list parser finish")

    def parse(self, task, input_file):
        """Parse the attraction list page.

        Args:
            task: HttpTask, the task
            input_file: StringIO, page file

        Yields:
            task: HttpTask, new task
        """
        tree = html.parse(input_file)
        attraction_elems = tree.xpath(
            "//div[@id='ATTRACTION_OVERVIEW']"
            "/div[@class='attraction-list clearfix']")
        for attraction_elem in attraction_elems:
            try:
                info_elem = flist(
                    attraction_elem.xpath(
                        "div[@class='clearfix']/div[@class='info']"), None)
                rank_elem = flist(
                    attraction_elem.xpath(
                        "div[@class='clearfix']/div[@class='rank']"), None)
                relate_path = flist(
                    info_elem.xpath("div[@class='title']/a/@href"), u"")
                name = flist(
                    info_elem.xpath("div[@class='title']/a/text()"), u"")
                address = _extract_address(info_elem)
                hot = flist(rank_elem.xpath("a/strong/text()"), u"")
                rank = flist(rank_elem.xpath("span[1]/strong/text()"), u"")
                # build the attraction task
                http_request = build_attraction_request(relate_path)
                attraction_task = HttpTask(
                    http_request,
                    callback="AttractionParser",
                    max_fail_count=3,
                    cookie_host=LVYOU_HOST,
                    kwargs={
                        "name": unicode(name).strip(),
                        "address": unicode(address),
                        "hot": unicode(hot),
                        "rank": unicode(rank)
                    })
                yield attraction_task
            except Exception as e:
                self.logger.warn(
                    "extract one attraction failed error:%s" % e)

        # build the next-page task
        next_page_relate = flist(
            tree.xpath(
                "//div[@class='pagination']/div"
                "/a[@class='next sprite-arrow-right-green ml6 ']/@href"),
            u"")
        if len(next_page_relate) != 0:
            next_page_request = build_next_page_request(next_page_relate)
            next_page_task = HttpTask(next_page_request,
                                      callback="AttractionListParser",
                                      max_fail_count=5,
                                      cookie_host=LVYOU_HOST)
            yield next_page_task
def parse(self, task, input_file):
    """Parse the attraction detail page.

    Args:
        task: HttpTask, the task
        input_file: StringIO, page content

    Yields:
        task: HttpTask, new task
        item: Item, parse result
    """
    self.logger.debug("attraction parser start to parse")
    parser = html.HTMLParser(encoding='utf-8')
    tree = html.parse(input_file, parser)
    try:
        name = flist(
            tree.xpath("//header[@class='title-head']/a/p/text()"), u"")
        play_spend, play_spend_unit = _extract_play_spend(tree)
        address = flist(
            tree.xpath("//div[@id='J-aside-info-address']"
                       "/span[@class='val address-value']"
                       "/text()"), u"")
        tel_phone = flist(
            tree.xpath("//div[@id='J-aside-info-phone']"
                       "/span[@class='val phone-value']"
                       "/text()"), u"")
        time_elems = tree.xpath("//div[@id='J-aside-info-opening_hours']"
                                "/div[@class='val opening_hours-value']/p")
        time_list = []
        for time_elem in time_elems:
            time_list.append(time_elem.text)
        open_time = "".join(time_list)
        total_score = flist(
            tree.xpath("//div[@class='scene-rating']"
                       "/div/@content"), u"")
        ticket_info = flist(
            tree.xpath("//div[@id='J-aside-info-price']"
                       "/div[@class='val price-value']"
                       "/p/text()"), u"")
        preview = _extract_preview(tree)
        traffic = _extract_traffic(tree)
        tips = _extract_tips(tree)
        hot = flist(
            tree.xpath("//section[@id='remark-container']"
                       "/div[@class='remark-overall-rating']"
                       "/span[@class='remark-all-counts']"
                       "/text()"), u"")
        lon_lat = task.kwargs['map_info'].split(",")
        if len(lon_lat) <= 1:
            lon, lat = u"", u""
        else:
            lon, lat = lon_lat[0], lon_lat[1]
        seq_sort = task.kwargs['seq_sort']
        sid = task.kwargs['sid']
        attraction_item = AttractionItem(unicode(sid), unicode(name),
                                         unicode(play_spend),
                                         unicode(play_spend_unit),
                                         unicode(address),
                                         unicode(tel_phone),
                                         unicode(open_time),
                                         unicode(total_score),
                                         unicode(ticket_info),
                                         unicode(preview), unicode(hot),
                                         unicode(lon), unicode(lat),
                                         unicode(seq_sort),
                                         unicode(traffic), unicode(tips))
        yield attraction_item

        # yield the comment-list task
        comments_request = build_comment_list_request(
            sid, task.kwargs['relate_path'])
        comments_task = HttpTask(comments_request,
                                 callback="CommentListParser",
                                 max_fail_count=3,
                                 cookie_host=LVYOU_HOST,
                                 kwargs={'sid': sid})
        yield comments_task
    except Exception as e:
        self.logger.error("extract Attraction failed error:%s" % e)
        self.logger.error("error traceback:%s" % traceback.format_exc())
        raise