fl_shop2 = Fieldlist() page_shop_1 = Page( name='马蜂窝景点店铺列表页面', fieldlist=fl_shop1, listcssselector=ListCssSelector( list_css_selector= '#_j_search_result_left > div:nth-child(1) > div > ul > li', ), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection), is_save=True) page_shop_2 = Page() page_shop_2 = Page( name='马蜂窝景点店铺详情页面', fieldlist=fl_shop2, tabsetup=TabSetup(click_css_selector='div > div.ct-text > h3 > a'), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection)) def get_comment_grade(self, _str): return str(_str[-1]) def get_comment_time(self, _str): #时间格式统一为2018-12-08 return _str[0:10] def get_comment_year(self, _str): time = _str[0:10]
'comment_num_list': p( 'div:nth-child(3) > div.review-filter > ul.filter-list.clearfix').text().split('\n')[1:]}, ensure_ascii=False) fl_shop2 = Fieldlist( Field(fieldname=FieldName.SHOP_ADDRESS, css_selector='#hotel-page > div > div.hotel-box.hotel-baseinfo > div.info > div.base > p.address', offset=6, try_times=10, pause_time=5), Field(fieldname=FieldName.SHOP_ROOM_RECOMMEND_ALL, css_selector='#J_RoomList', attr='innerHTML', filter_func=get_room_all), Field(fieldname=FieldName.SHOP_INTRO, css_selector='#hotel-desc', attr='innerHTML', filter_func=get_shop_intro), Field(fieldname=FieldName.SHOP_FACILITIES, css_selector='#hotel-facility', attr='innerHTML', filter_func=get_shop_facility), Field(fieldname=FieldName.SHOP_TRAFFIC, css_selector='#rich-map-wrap', attr='innerHTML', filter_func=get_shop_traffic), Field(fieldname=FieldName.SHOP_STATISTICS, css_selector='#hotel-review', attr='innerHTML', filter_func=get_shop_statistics), ) page_shop_1 = Page(name='飞猪酒店店铺列表页面', fieldlist=fl_shop1, listcssselector=ListCssSelector(list_css_selector='#J_List > div'), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection)) page_shop_2 = Page(name='飞猪酒店店铺详情页面', fieldlist=fl_shop2, tabsetup=TabSetup(click_css_selector='div > div.row-center > div > h5 > a'),mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection), is_save=True) class FliggyHotelSpider(TravelDriver): def page_shop_2_func(self): try: self.move_to_element_by_css_selector(css_selector='#rich-map-wrap > div.J_RichCon > div.tabs > ul') self.vertical_scroll_by(offset=-200) for i in self.until_presence_of_all_elements_located_by_css_selector(css_selector='#rich-map-wrap > div.J_RichCon > div.tabs > ul > li.J_Tab'): i.click() except Exception: self.error_log(e='找不到元素') time.sleep(3) def get_shop_info(self): try:
# Page whose name literal reads "Tuniu scenic-spot shop list page".
page_shop_1 = Page(
    name='途牛景点店铺列表页面',
    fieldlist=fl_shop1,
    listcssselector=ListCssSelector(
        list_css_selector='#niuren_list > div.contentcontainer.clearfix > div.content_bottom > div.main.fl > div.thelist > ul > li',
    ),
    mongodb=Mongodb(
        db=TravelDriver.db,
        collection=TravelDriver.shop_collection,
    ),
    is_save=True,
)

# NOTE(review): this argument-less Page is rebound by the very next statement.
# It is kept because Page's constructor is not visible here and may have side
# effects -- confirm and drop it if it does not.
page_shop_2 = Page()
# Page whose name literal reads "Tuniu scenic-spot shop detail page"; it is
# configured with a TabSetup click selector (note the selector's leading
# space, preserved as written).
page_shop_2 = Page(
    name='途牛景点店铺详情页面',
    fieldlist=fl_shop2,
    tabsetup=TabSetup(
        click_css_selector=' div.theinfo.ticket.clearfix > a > dl > dt > p > span',
    ),
    mongodb=Mongodb(
        db=TravelDriver.db,
        collection=TravelDriver.shop_collection,
    ),
    is_save=True,
)


def get_comment_grade(self, _str):
    """Translate a rating token into a grade string.

    Args:
        self: Unused by this function.
        _str: Rating token to translate.

    Returns:
        ``'5'`` for ``'icon_manyi'``, ``'2.5'`` for ``'icon_yiban'`` and
        ``'0'`` for any other value.
    """
    if _str == 'icon_manyi':
        return '5'
    if _str == 'icon_yiban':
        return '2.5'
    return '0'
#正则表达式不一样 #mainHotelLeft > div:nth-child(2) > div > div:nth-child(2) > ul > li:nth-child(2) Field(fieldname=FieldName.SHOP_COMMENT_NUM,css_selector='div > div.hotel-brief.fl > div.comment > a > span', is_info=True), Field(fieldname=FieldName.SHOP_FEATURE, css_selector='div > div.hotel-info.fl > div.nameAndIcon > span.decorate_year',is_info=True), Field(fieldname=FieldName.SHOP_PRICE, css_selector='div > div.hotel-brief.fl > div.startPrice > span.digit', is_info=True), Field(fieldname=FieldName.SHOP_RATE, css_selector='',filter_func=get_shop_rate, is_info=True), ) fl_shop2 = Fieldlist() page_shop_1 = Page(name='途牛酒店店铺列表页面', fieldlist=fl_shop1, listcssselector=ListCssSelector(list_css_selector='#main > div.hotel-list > div'), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection),is_save=True) page_shop_2 = Page() page_shop_2 = Page(name='途牛酒店店铺详情页面', fieldlist=fl_shop2, tabsetup=TabSetup(click_css_selector='div > div.hotel-info.fl > div.nameAndIcon > a'), mongodb=Mongodb(db=TravelDriver.db,collection=TravelDriver.shop_collection)) def get_comment_user_name(self, _str): return _str.split(' ')[0] def get_comment_time(self, _str): return re.findall(r'([\d]{4}-[\d]{2}-[\d]{2} [\d]{2}:[\d]{2})',_str)[0] def get_comment_grade(self,_str): #判断如果含有好 高 if ('好' in _str) and ('但是' in _str) == False: return str("5.0") elif ('好' in _str) and ('但是' in _str) == True: return str("0.0") elif ('差' in _str) and ('但是' in _str) == False:
filter_func=get_around_facilities, is_focus=True), ) page_shop_1 = Page(name='携程酒店店铺列表页面', fieldlist=fl_shop1, listcssselector=ListCssSelector( list_css_selector='#hotel_list > div.hotel_new_list', item_css_selector='ul.hotel_item'), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection)) page_shop_2 = Page( name='携程酒店店铺详情页面', fieldlist=fl_shop2, tabsetup=TabSetup( click_css_selector='li.hotel_price_icon > div.action_info > p > a'), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection), is_save=True) fl_comment1 = Fieldlist( Field(fieldname=FieldName.COMMENT_USER_NAME, css_selector='div.user_info.J_ctrip_pop > p.name'), Field(fieldname=FieldName.COMMENT_TIME, css_selector= 'div.comment_main > div.comment_txt > div.comment_bar > p > span', regex=r'[^\d-]*'), Field(fieldname=FieldName.SHOP_NAME, css_selector='#J_htl_info > div.name > h2.cn_n', is_isolated=True), Field(
Field(fieldname=FieldName.SHOP_IMG, css_selector='div.h_info_pic > a > img', attr='big-src',is_info=True), #有些问题 Field(fieldname=FieldName.SHOP_ADDRESS, css_selector='div > div.h_info_text > div.h_info_base > p.h_info_b2',is_info=True), Field(fieldname=FieldName.SHOP_PRICE,css_selector='div > div.h_info_text > div.h_info_pri > p:nth-child(1) > a > span.h_pri_num',is_info=True), #稍许有些问题 Field(fieldname=FieldName.SHOP_COMMENT_NUM,css_selector='div > div.h_info_text > div.h_info_comt > a > span.c555.block.mt5'), Field(fieldname=FieldName.SHOP_GRADE, css_selector=' div > div.h_info_text > div.h_info_comt > a > span.h_info_comt_bg > i.c37e',is_info=True), Field(fieldname=FieldName.SHOP_RATE,css_selector='',filter_func=get_shop_rate, is_info=True), Field(fieldname=FieldName.SHOP_FEATURE,css_selector='',filter_func=get_shop_feature, is_info=True) ) fl_shop2 = Fieldlist() page_shop_1 = Page(name='艺龙酒店店铺列表页面', fieldlist=fl_shop1, listcssselector=ListCssSelector(list_css_selector='#hotelContainer > div > div'), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection),is_save=True) # page_shop_2 = Page() # page_shop_2 = Page(name='艺龙酒店店铺详情页面', fieldlist=fl_shop2, tabsetup=TabSetup(click_css_selector='div > div.h_info_text > div.h_info_base > p.h_info_b1 > a'), mongodb=Mongodb(db=TravelDriver.db,collection=TravelDriver.shop_collection)) fl_comment1 = Fieldlist( Field(fieldname=FieldName.COMMENT_USER_NAME, css_selector=' div.cmt_userinfo > div > p.cmt_un',is_info=True), Field(fieldname=FieldName.COMMENT_TIME, css_selector='div.cmt_info_mn > div > div.if_hd_r > span.cmt_con_time', is_info=True), Field(fieldname=FieldName.SHOP_NAME, css_selector='body > div.hdetail_rela_wrap > div > div.hrela_ns_wrap.clearfix > div.hdetail_main.hrela_name > div > h1', is_isolated=True, is_info=True), Field(fieldname=FieldName.COMMENT_CONTENT, css_selector='div.cmt_info_mn > p.cmt_txt',is_info=True), #有问题 Field(fieldname=FieldName.COMMENT_SCORE, css_selector='div.cmt_info_mn > 
div > div.if_hd > b',is_info=True), ) page_comment_1 = Page(name='艺龙酒店评论列表', fieldlist=fl_comment1, listcssselector=ListCssSelector(list_css_selector='#review > ul > li'), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.comments_collection), is_save=True) class ELongHotelSpider(TravelDriver):
from selenium.webdriver.remote.webelement import WebElement
from spider.driver.base.tabsetup import TabSetup
from spider.driver.base.field import Field, Fieldlist
from spider.driver.base.page import Page, PageGroup
from spider.driver.base.listcssselector import ListCssSelector
from spider.driver.base.mongodb import Mongodb

# Script that wires the base building blocks together and prints the first
# item produced by a PageGroup. The integer/short-string arguments look like
# placeholders rather than real selectors or names -- TODO confirm.
# NOTE(review): WebElement is imported but not used in the visible script.
fl = Fieldlist(Field(fieldname=12), Field(fieldname=13))
mongo = Mongodb(db='122', collection='12')
lcs = ListCssSelector(list_css_selector=12)
tab = TabSetup(url_name=12)

# Two pages that differ only in `name`; they are built in order (122, then 123)
# and share the same fieldlist / mongodb / list selector / tab setup objects.
p, p1 = (
    Page(
        name=page_name,
        fieldlist=fl,
        mongodb=mongo,
        listcssselector=lcs,
        tabsetup=tab,
    )
    for page_name in (122, 123)
)

pg = PageGroup(p, p1)
print(next(pg))
is_focus=True), ) page_shop_1 = Page(name='携程景点店铺列表页面', fieldlist=fl_shop1, listcssselector=ListCssSelector( list_css_selector='#searchResultContainer > div', item_css_selector='div'), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection), is_save=True) page_shop_2 = Page( name='携程景点店铺详情页面', fieldlist=fl_shop2, tabsetup=TabSetup(click_css_selector='div.search_ticket_title > h2 > a'), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection), is_save=True) def get_comment_user_name(self, _str): return _str.split(' ')[0] def get_comment_time(self, _str): return re.findall(r'([\d]{4}-[\d]{2}-[\d]{2} [\d]{2}:[\d]{2})', _str)[0] fl_comment1 = Fieldlist( Field(fieldname=FieldName.COMMENT_USER_NAME,
# Fields used by page_weixin_2 below: an article name and an article time.
fl_weixin2 = Fieldlist(
    Field(
        fieldname='article_name',
        css_selector='div > div > h4',
    ),
    Field(
        fieldname='article_time',
        css_selector='div > div > p.weui_media_extra_info',
    ),
)

# Page whose name literal reads "WeChat official-account list page".
page_weixin_1 = Page(
    name='微信公众号列表页面',
    fieldlist=fl_weixin1,
    listcssselector=ListCssSelector(
        list_css_selector='#main > div.news-box > ul > li',
    ),
)

# Page whose name literal reads "WeChat official-account article list page";
# it is configured with a TabSetup click selector.
page_weixin_2 = Page(
    name='微信公众号文章列表页面',
    fieldlist=fl_weixin2,
    tabsetup=TabSetup(
        click_css_selector='div > div.txt-box > p.tit > a',
    ),
    listcssselector=ListCssSelector(
        list_css_selector='#history > div',
    ),
)


class WeixinSpider(Driver):
    """Driver subclass used with the Weixin page definitions in this module."""

    def __init__(self, isheadless=False, ismobile=False, isvirtualdisplay=False, spider_id='', name=''):
        """Initialise the underlying Driver.

        Args:
            isheadless: Forwarded unchanged to ``Driver.__init__``.
            ismobile: Forwarded unchanged to ``Driver.__init__``.
            isvirtualdisplay: Forwarded unchanged to ``Driver.__init__``.
            spider_id: Passed to ``Driver.__init__`` as ``log_file_name``.
            name: Not used in the portion of the initializer visible here.
        """
        driver_options = {
            'log_file_name': spider_id,
            'ismobile': ismobile,
            'isvirtualdisplay': isvirtualdisplay,
            'isheadless': isheadless,
        }
        Driver.__init__(self, **driver_options)
fl_shop2 = Fieldlist() page_shop_1 = Page(name='驴妈妈景点店铺列表页面', fieldlist=fl_shop1, listcssselector=ListCssSelector( list_css_selector=' div.product-list > div', item_css_selector='div'), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection), is_save=True) page_shop_2 = Page() page_shop_2 = Page( name='驴妈妈景点店铺详情页面', fieldlist=fl_shop2, tabsetup=TabSetup( click_css_selector= 'div.product-regular.clearfix > div.product-section > h3 > a'), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection), is_save=False) def get_comment_year(self, _str): return _str[0:4] def get_comment_season(self, _str): time = _str[0:10] times = time.split('-')
'#poi-detail > div.container > div.sub-content.clearfix > div.main > div.user-comment-info', attr='innerHTML', filter_func=get_shop_statistics, is_focus=True), Field(fieldname=FieldName.SHOP_COMMENT_NUM, css_selector='#comment > div > h2 > a > span.count', regex=r'[^\d]*', is_focus=True, is_info=True), ) page_shop_2 = Page( name='大众点评酒店店铺详情页面', fieldlist=fl_shop2, tabsetup=TabSetup( click_css_selector= 'div.hotel-info-ctn > div.hotel-info-main > h2 > a.hotel-name-link'), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection), is_save=True) def get_rate(self, _str): return str(int(re.sub('[^\d]*', '', _str)) / 10) def get_comment_rate_tag(self, _str): p = PyQuery(_str) tag_list = [] for i in p('span.item').items(): tag_list.append(i.text().strip())
css_selector='#hotelUserComment', attr='innerHTML', filter_func=get_shop_statistics), ) page_shop_1 = Page(name='途牛酒店店铺列表页面', fieldlist=fl_shop1, listcssselector=ListCssSelector( list_css_selector='#main > div.hotel-list > div'), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection)) page_shop_2 = Page( name='途牛酒店店铺详情页面', fieldlist=fl_shop2, tabsetup=TabSetup( click_css_selector='div.hotel-brief.fl > div.hotelDetail > a'), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection), is_save=True) class TuniuHotelSpider(TravelDriver): def page_shop_2_func(self): try: for i in self.until_presence_of_all_elements_located_by_css_selector( css_selector='#hotelTraffic > div.detail_map > ul > li'): self.until_click_by_vertical_scroll_page_down(click_ele=i) except Exception: self.error_log(e='找不到元素') time.sleep(3)
Field(fieldname=FieldName.SHOP_FEATURE, css_selector='',is_info=True,filter_func=get_shop_feature), Field(fieldname=FieldName.SHOP_RATE,css_selector='',is_info=True,filter_func=get_shop_rate) ) fl_shop2 = Fieldlist( ) page_shop_1 = Page(name='途牛景点店铺列表页面', fieldlist=fl_shop1, listcssselector=ListCssSelector(list_css_selector='#niuren_list > div.contentcontainer.clearfix > div.content_bottom > div.main.fl > div.thelist > ul > li',), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection),is_save=True) page_shop_2 = Page() page_shop_2 = Page(name='途牛景点店铺详情页面', fieldlist=fl_shop2, tabsetup=TabSetup(click_css_selector=' div.theinfo.ticket.clearfix > a > dl > dt > p > span'), mongodb=Mongodb(db=TravelDriver.db,collection=TravelDriver.shop_collection), is_save=True) def get_comment_grade(self,_str): if _str == 'icon_manyi': return '5' elif _str =='icon_yiban': return '2.5' else: return '0' def get_comment_time(self,_str): #时间格式统一为2018-12-08 return _str[0:10] def get_comment_year(self,_str):
comment_num_list = [] for i in p('dl.rank').items('dd'): comment_num_list.append({re.sub(r'[^\u4e00-\u9fa5]*', '', i.text()): re.sub(r'[^\d]*', '', i.text())}) statistics.setdefault('comment_num_list', comment_num_list) return json.dumps(statistics, ensure_ascii=False) fl_shop2 = Fieldlist( Field(fieldname=FieldName.SHOP_ROOM_RECOMMEND_ALL, css_selector='div.m-room-tools-bd.js-roomtool-rooms.caculate-price', attr='innerHTML', filter_func=get_room_all, pause_time=5), Field(fieldname=FieldName.SHOP_TRAFFIC, css_selector='#js-neighbor', attr='innerHTML', filter_func=get_shop_traffic, pause_time=1), Field(fieldname=FieldName.SHOP_FACILITIES, css_selector='#descContent', attr='innerHTML', filter_func=get_shop_facilities), Field(fieldname=FieldName.SHOP_STATISTICS, css_selector='#comment_main > div > div.b_ugcheader > div.b_ugcfilter', attr='innerHTML', filter_func=get_shop_statistics), ) page_shop_1 = Page(name='去哪儿酒店店铺列表页面', fieldlist=fl_shop1, listcssselector=ListCssSelector(list_css_selector='div.b_result_box.js_list_block.b_result_commentbox'), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection)) page_shop_2 = Page(name='去哪儿酒店店铺详情页面', fieldlist=fl_shop2, tabsetup=TabSetup(click_css_selector='a.e_title.js_list_name'),mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection), is_save=True) class QunarHotelSpider(TravelDriver): def page_shop_2_func(self): try: for i in self.until_presence_of_all_elements_located_by_partial_link_text(link_text='查看其他'): i.click() except Exception: self.error_log(e='找不到元素') try: for i in self.until_presence_of_all_elements_located_by_partial_link_text(link_text='展开报价'): i.click() except Exception: self.error_log(e='找不到元素') try:
if len(item) >= 2: around.setdefault(item[0], (lambda x: x[1:] if len(x) >= 2 else [''])(item)) return json.dumps(around, ensure_ascii=False) fl_shop2 = Fieldlist( Field(fieldname=FieldName.SHOP_ROOM_RECOMMEND_ALL,css_selector='#hotelRoomBox', attr='innerHTML', filter_func=get_recommend_all_room_dict, pause_time=1, is_focus=True), Field(fieldname=FieldName.SHOP_ROOM_FAVOURABLE,css_selector='#divDetailMain > div.htl_room_table',attr='innerHTML', filter_func=get_favourable_room, is_focus=True), Field(fieldname=FieldName.SHOP_INTRO, css_selector='#hotel_info_comment > div',attr='innerHTML', filter_func=get_hotel_intro, is_focus=True), Field(fieldname=FieldName.SHOP_PHONE, css_selector='#J_realContact', attr='data-real', regex='^([^<]*).*$', repl=r'\1', is_focus=True), Field(fieldname=FieldName.SHOP_STATISTICS, css_selector='#commentList > div.detail_cmt_box',attr='innerHTML',filter_func=get_shop_statistics, is_focus=True), Field(fieldname=FieldName.SHOP_AROUND_FACILITIES, css_selector='#hotel_info_comment > div', attr='innerHTML',filter_func=get_around_facilities, is_focus=True), ) page_shop_1 = Page(name='携程酒店店铺列表页面', fieldlist=fl_shop1, listcssselector=ListCssSelector(list_css_selector='#hotel_list > div.hotel_new_list', item_css_selector='ul.hotel_item'), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection)) page_shop_2 = Page(name='携程酒店店铺详情页面', fieldlist=fl_shop2, tabsetup=TabSetup(click_css_selector='li.hotel_price_icon > div.action_info > p > a'), mongodb=Mongodb(db=TravelDriver.db,collection=TravelDriver.shop_collection), is_save=True) fl_comment1 = Fieldlist( Field(fieldname=FieldName.COMMENT_USER_NAME, css_selector='div.user_info.J_ctrip_pop > p.name'), Field(fieldname=FieldName.COMMENT_TIME, css_selector='div.comment_main > div.comment_txt > div.comment_bar > p > span', regex=r'[^\d-]*'), Field(fieldname=FieldName.SHOP_NAME, css_selector='#J_htl_info > div.name > h2.cn_n', is_isolated=True), Field(fieldname=FieldName.COMMENT_CONTENT, 
css_selector='div.comment_main > div.comment_txt > div.J_commentDetail'), Field(fieldname=FieldName.COMMENT_USER_IMG, css_selector='div.user_info.J_ctrip_pop > p.head > span > img', attr='src'), Field(fieldname=FieldName.COMMENT_USER_CHECK_IN, css_selector='div.comment_main > p > span.date'), Field(fieldname=FieldName.COMMENT_USER_ROOM, css_selector='div.comment_main > p > a'), Field(fieldname=FieldName.COMMENT_TYPE, css_selector='div.comment_main > p > span.type'), Field(fieldname=FieldName.COMMENT_SCORE, css_selector='div.comment_main > p > span.score', regex=r'[^\d.]*'), Field(fieldname=FieldName.COMMENT_SCORE_TEXT, css_selector='div.comment_main > p > span.small_c', attr='data-value'), Field(fieldname=FieldName.COMMENT_USER_NUM, css_selector='div.user_info.J_ctrip_pop > p.num'), Field(fieldname=FieldName.COMMENT_PIC_LIST, list_css_selector='div.comment_txt > div.comment_pic', item_css_selector='div.pic > img', attr='src', timeout=0), Field(fieldname=FieldName.COMMENT_REPLAY, css_selector='div.comment_main > div.htl_reply > p.text.text_other'),
filter_func=get_shop_rate, is_info=True), ) fl_shop2 = Fieldlist() page_shop_1 = Page(name='驴妈妈景点店铺列表页面', fieldlist=fl_shop1, listcssselector=ListCssSelector( list_css_selector='#mainHotelLeft > div', ), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection), is_save=True) page_shop_2 = Page() page_shop_2 = Page(name='驴妈妈景点店铺详情页面', fieldlist=fl_shop2, tabsetup=TabSetup(click_css_selector='dl > dt > a'), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection)) def get_comment_user_name(self, _str): return _str.split(' ')[0] def get_comment_time(self, _str): return re.findall(r'([\d]{4}-[\d]{2}-[\d]{2} [\d]{2}:[\d]{2})', _str)[0] def get_comment_grade(self, _str): return str(_str[-1])
fl_shop2 = Fieldlist( #phoenix_dom_3_0 > div > div.head-wrapper.c-title.c-color.c-flexbox.c-line-bottom > div.left > span #phoenix_dom_3_1 > div > div.head-wrapper.c-title.c-color.c-flexbox.c-line-bottom > div.left > span Field(fieldname=FieldName.SHOP_COMMENT_NUM, css_selector='div.card-box.special2-box.c-container >div.head-wrapper.c-title.c-color.c-flexbox.c-line-bottom > div.left > span',is_info=True), Field(fieldname=FieldName.SHOP_SCORE, css_selector='span.left-header-visit',is_info=True), Field(fieldname=FieldName.SHOP_CATEGORY_NAME,css_selector='span.left-header-stdtag',is_info=True), Field(fieldname=FieldName.SHOP_PRICE,css_selector='span.left-header-reference-price',is_info=True), Field(fieldname=FieldName.SHOP_NAME_SEARCH_KEY,css_selector='div.generalHead-left-header-title > span',filter_func=get_shop_name,is_info=True), Field(fieldname=FieldName.SHOP_PHONE,css_selector='#generalinfo > div.generalInfo-address-telnum > div.generalInfo-telnum.item > span.clampword.generalInfo-telnum-text',is_info=True) #generalheader > div.generalHead-left-header.animation-common > div.generalHead-left-header-title > span ) #card-1 > div > ul > li:nth-child(1) > div.cf.mb_5 > div.ml_30.mr_85 > div:nth-child(2) page_shop_1 = Page(name='百度餐饮店铺列表页面', fieldlist=fl_shop1, listcssselector=ListCssSelector(list_css_selector='ul.poilist > li'), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection),is_save=True) page_shop_2 = Page(name='百度餐饮店铺详情页面', fieldlist=fl_shop2, tabsetup=TabSetup(click_css_selector='div.cf > div.ml_30 > div:nth-child(1) > span > a'), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection),is_save=True) def get_shop_lng(self,_str): doc = _str.split(',') #再某一个经度范围内 if(float(doc[0]) <= 119.243071 and float(doc[0]) >= 118.650908): return doc[0] else: return 119.051491 def get_shop_lat(self,_str): doc = _str.split(',')
tag_star_dict.setdefault('star', stars) return json.dumps(tag_star_dict, ensure_ascii=False) fl_shop2 = Fieldlist( Field(fieldname=FieldName.SHOP_GRADE, css_selector='#poi-detail > div.container > div.base-info > div.main-detail.clearfix > div.main-detail-right > div.hotel-appraise > div.hotel-scope > span', pause_time=5, is_focus=True), Field(fieldname=FieldName.SHOP_PHONE, css_selector='#poi-detail > div.container > div.base-info > div.main-detail.clearfix > div.main-detail-left > div.main-detail-left-top.clearfix > div.hotel-detail-info > div > div.call-info > div > span.call-number', is_focus=True), Field(fieldname=FieldName.SHOP_ADDRESS, css_selector='#poi-detail > div.container > div.base-info > div.main-detail.clearfix > div.main-detail-left > div.main-detail-left-top.clearfix > div.hotel-detail-price > div.hotel-address-box.clearfix > span.hotel-address', is_focus=True), Field(fieldname=FieldName.SHOP_ROOM_RECOMMEND_ALL, css_selector='#deal', attr='innerHTML', filter_func=get_shop_room_all, is_focus=True), Field(fieldname=FieldName.SHOP_INTRO, css_selector='#poi-detail > div.container > div.sub-content.clearfix > div.main > div> div.hotel-info', attr='innerHTML', filter_func=get_shop_intro, is_focus=True), Field(fieldname=FieldName.SHOP_STATISTICS, css_selector='#poi-detail > div.container > div.sub-content.clearfix > div.main > div.user-comment-info', attr='innerHTML', filter_func=get_shop_statistics, is_focus=True), Field(fieldname=FieldName.SHOP_COMMENT_NUM, css_selector='#comment > div > h2 > a > span.count', regex=r'[^\d]*'), ) page_shop_1 = Page(name='大众点评酒店店铺列表页面', fieldlist=fl_shop1, listcssselector=ListCssSelector(list_css_selector='#poi-list > div.content-wrap > div > div.list-wrapper > div.content > ul > li',item_start=11,item_end=12), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection)) page_shop_2 = Page(name='大众点评酒店店铺详情页面', fieldlist=fl_shop2, tabsetup=TabSetup(click_css_selector='div.hotel-info-ctn > 
div.hotel-info-main > h2 > a.hotel-name-link'),mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection), is_save=True) def get_rate(self, _str): return str(int(re.sub('[^\d]*','',_str))/10) def get_comment_rate_tag(self, _str): p = PyQuery(_str) tag_list = [] for i in p('span.item').items(): tag_list.append(i.text().strip()) return json.dumps(tag_list, ensure_ascii=False) def get_comment_content(self, _str): return PyQuery(_str).text().replace('收起评论','') fl_comment1 = Fieldlist(
css_selector='', is_info=True)) fl_shop2 = Fieldlist() page_shop_1 = Page(name='去哪儿酒店店铺列表页面', fieldlist=fl_shop1, listcssselector=ListCssSelector( list_css_selector='#jxContentPanel > div', ), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection), is_save=True) page_shop_2 = Page( name='去哪儿酒店店铺详情页面', fieldlist=fl_shop2, tabsetup=TabSetup(click_css_selector='a.e_title.js_list_name'), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection)) def get_comment_grade(self, _str): return str(_str[-1]) def get_comment_time(self, _str): #时间格式统一为2018-12-08 print(_str) return _str[0:10] fl_comment1 = Fieldlist(
Field(fieldname=FieldName.SHOP_STATISTICS, css_selector='#_j_comment', attr='innerHTML', filter_func=get_shop_stattistics), ) page_shop_1 = Page( name='马蜂窝酒店店铺列表页面', fieldlist=fl_shop1, listcssselector=ListCssSelector(list_css_selector='#_j_hotel_list > div'), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection)) page_shop_2 = Page(name='马蜂窝酒店店铺详情页面', fieldlist=fl_shop2, tabsetup=TabSetup(click_css_selector='div.hotel-pic > a'), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection), is_save=True) class MafengwoHotelSpider(TravelDriver): def get_shop_info(self): try: shop_data_list = self.from_page_get_data_list(page=page_shop_1) self.from_page_add_data_to_data_list(page=page_shop_2, data_list=shop_data_list, pre_page=page_shop_1) except Exception as e: self.error_log(e=e)
is_info=True), Field(fieldname=FieldName.SHOP_GRADE,css_selector='',filter_func=get_shop_grade), #正则表达式不一样 Field(fieldname=FieldName.SHOP_COMMENT_NUM,css_selector='div > div.ct-text > ul > li:nth-child(2) > a',filter_func=get_comment_num, is_info=True), Field(fieldname=FieldName.SHOP_FEATURE, css_selector='div > div.ct-text > p',is_info=True), Field(fieldname=FieldName.SHOP_PRICE,css_selector= '',filter_func=get_shop_price, is_info=True) ) fl_shop2 = Fieldlist( ) page_shop_1 = Page(name='马蜂窝景点店铺列表页面', fieldlist=fl_shop1, listcssselector=ListCssSelector(list_css_selector='#_j_search_result_left > div:nth-child(1) > div > ul > li',), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection),is_save=True) page_shop_2 = Page() page_shop_2 = Page(name='马蜂窝景点店铺详情页面', fieldlist=fl_shop2, tabsetup=TabSetup(click_css_selector='div > div.ct-text > h3 > a'), mongodb=Mongodb(db=TravelDriver.db,collection=TravelDriver.shop_collection)) def get_comment_grade(self,_str): return str(_str[-1]) def get_comment_time(self,_str): #时间格式统一为2018-12-08 return _str[0:10] def get_comment_year(self,_str): time = _str[0:10] return time[0:4]; def get_comment_season(self, _str):
is_info=True), ) fl_shop2 = Fieldlist() page_shop_1 = Page( name='去哪儿景点店铺列表页面', fieldlist=fl_shop1, listcssselector=ListCssSelector(list_css_selector='#search-list > div'), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection), is_save=True) page_shop_2 = () page_shop_2 = Page( name='去哪儿景点店铺详情页面', fieldlist=fl_shop2, tabsetup=TabSetup( click_css_selector='div > div.sight_item_about > h3 > a'), mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection), is_save=True) def get_comment_grade(self, _str): groups = re.findall(r'[\d]{1,3}', _str) saveTo = (float(groups[0]) / 100 * 5) return str(saveTo) def get_comment_year(self, _str):