示例#1
0
from spider.driver.base.field import Fieldlist,Field,FieldName
from spider.driver.base.tabsetup import TabSetup
from spider.driver.base.page import Page,NextPageCssSelectorSetup,PageFunc,NextPageLinkTextSetup
from spider.driver.base.listcssselector import ListCssSelector
from spider.driver.base.mongodb import Mongodb
from spider.driver.travel.core.traveldriver import TravelDriver
import time
from pyquery import PyQuery
import json
import re
# Fields read for each shop entry: name, URL, image, address, comment count
# and feature text.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='div > div.ct-text > h3 > a',
        is_debug=True,
    ),
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='div > div.ct-text > h3 > a',
        attr='href',
    ),
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector=' div > div.flt1 > a > img',
        attr='src',
    ),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='div > div.ct-text > ul > li:nth-child(1) > a',
    ),
    # Disabled: Field(fieldname=FieldName.SHOP_GRADE,css_selector='div.search_ticket_assess > span.grades > em'),
    # NOTE: the regular expression differs here.
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='div > div.ct-text > ul > li:nth-child(2) > a',
        # Keep only the digits found inside the parentheses.
        regex=r'^[^\(]*\(([\d]+)[^\)\d]*\)$',
        repl=r'\1',
    ),
    Field(
        fieldname=FieldName.SHOP_FEATURE,
        css_selector='div > ul > li:nth-child(1) > div > div.ct-text > p',
    ),
)

def get_shop_ticket():
    """Print the marker value 111 to stdout; returns nothing."""
    print(111)
def get_shop_info():
    """Print the marker value 222 to stdout; returns nothing."""
    print(222)
fl_shop2 = Fieldlist(
    Field(fieldname=FieldName.SHOP_PRICE, css_selector='body > div.container > div:nth-child(6) > div.mod.mod-detail > dl:nth-child(4) > dd > div:nth-child(1) > div', pause_time=3, is_focus=True, is_info=True),
    Field(fieldname=FieldName.SHOP_TIME, css_selector='body > div.container > div:nth-child(6) > div.mod.mod-detail > dl:nth-child(5) > dd > div:nth-child(1)', is_focus=True),
    #Field(fieldname=FieldName.SHOP_SERVICE,css_selector='3) > div.main-bd > div > div.brief-box.clearfix > div.brief-right > ul > li.promise',attr='innerHTML', filter_func=get_shop_service, is_focus=True),
    #门票信息尚有问题
示例#2
0
# Fields read for each shop entry. FEATURE and RATE have no selector and are
# produced entirely by their filter functions.
fl_shop1 = Fieldlist(
    Field(fieldname=FieldName.SHOP_NAME,
          css_selector='div > div.mp-sight-info > a > div.mp-sight-detail > h3',
          is_info=True),
    # \31 302 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2)
    # \32 0808 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2)
    Field(fieldname=FieldName.SHOP_PRICE,
          css_selector='div > div.mp-sight-info > a > div.mp-sight-detail > div.mp-sight-pricecon > div.mp-sight-price > em',
          is_info=True),
    # Slightly problematic.
    Field(fieldname=FieldName.SHOP_URL,
          css_selector='div > div.mp-sight-info > a',
          attr='href',
          is_debug=True,
          is_info=True),
    # The image field still has some issues.
    # \33 6822720 > div:nth-child(1) > div
    Field(fieldname=FieldName.SHOP_IMG,
          css_selector='div > div.mp-sight-info > a > div.mp-sight-imgcon > img',
          attr='src',
          is_info=True),
    Field(fieldname=FieldName.SHOP_ADDRESS,
          css_selector='div > div.mp-sight-info > a > div.mp-sight-detail > div.mp-sight-pricecon > div.mp-sight-location > span',
          is_info=True),
    # A conversion should be applied here.
    # \34 187 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(2) > span:nth-child(1)
    Field(fieldname=FieldName.SHOP_GRADE,
          css_selector='div > div.mp-sight-info > a > div.mp-sight-detail > div.mp-sight-comments > span.mpf-starlevel > span.mpg-iconfont.mpf-starlevel-gain',
          attr='data-score',
          is_info=True),
    # The regex usage here is problematic.
    Field(fieldname=FieldName.SHOP_COMMENT_NUM,
          css_selector='div > div.mp-sight-info > a > div.mp-sight-detail > div.mp-sight-comments > span.mp-comments-totalnum',
          is_info=True),
    # No shop_feature is available.
    Field(fieldname=FieldName.SHOP_FEATURE,
          css_selector='',
          filter_func=get_shop_feature,
          is_info=True),
    Field(fieldname=FieldName.SHOP_RATE,
          css_selector='',
          filter_func=get_shop_rate,
          is_info=True),
)
    num = re.findall(r'[\d]{1,10}',_str)
    return str(num[0])
def get_shop_grade(self, _str):
    """Return the fixed placeholder grade ``"0.0"``; ``_str`` is ignored."""
    placeholder_grade = "0.0"
    return placeholder_grade
def get_shop_price(self, _str):
    """Return the fixed placeholder price ``"0.0"``; ``_str`` is ignored."""
    placeholder_price = "0.0"
    return placeholder_price
def get_shop_rate(self, _str):
    """Return an empty string as the shop rate; ``_str`` is ignored."""
    empty_rate = ""
    return empty_rate
# Fields read for each shop entry. RATE, GRADE and PRICE have no selector and
# are produced entirely by their filter functions.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='div > div.ct-text > h3 > a',
        is_debug=True,
    ),
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='',
        is_info=True,
        filter_func=get_shop_rate,
    ),
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='div > div.ct-text > h3 > a',
        attr='href',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector=' div > div.flt1 > a > img',
        attr='src',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='div > div.ct-text > ul > li:nth-child(1) > a',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='',
        filter_func=get_shop_grade,
    ),
    # NOTE: the regular expression differs here.
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='div > div.ct-text > ul > li:nth-child(2) > a',
        filter_func=get_comment_num,
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_FEATURE,
        css_selector='div > div.ct-text > p',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector='',
        filter_func=get_shop_price,
        is_info=True,
    ),
)


# The detail page declares no fields.
fl_shop2 = Fieldlist()

# List page: one record per <li> matched by the list selector; is_save=True
# is passed so the page is flagged for saving.
page_shop_1 = Page(
    name='马蜂窝景点店铺列表页面',
    fieldlist=fl_shop1,
    listcssselector=ListCssSelector(
        list_css_selector='#_j_search_result_left > div:nth-child(1) > div > ul > li',
    ),
    mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection),
    is_save=True,
)

# Detail page, reached via the shop title link (click_css_selector).
# The former bare `Page()` placeholder was dropped: it was overwritten
# immediately by this assignment and never used.
page_shop_2 = Page(
    name='马蜂窝景点店铺详情页面',
    fieldlist=fl_shop2,
    tabsetup=TabSetup(click_css_selector='div > div.ct-text > h3 > a'),
    mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection),
)
# Fields read for each shop entry. COMMENT_NUM, FEATURE and RATE have no
# selector and are produced entirely by their filter functions.
fl_shop1 = Fieldlist(
    Field(fieldname=FieldName.SHOP_NAME, css_selector='a > div.ml-pro-info > p'),
    # \31 302 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2)
    # \32 0808 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2)
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector=' a > div.ml-pro-info > div.ml-pro-price > span.price > i:nth-child(2)',
        is_info=True,
    ),
    # Slightly problematic.
    Field(fieldname=FieldName.SHOP_URL, css_selector='a', attr='href', is_debug=True, is_info=True),
    # The image field still has some issues.
    # \33 6822720 > div:nth-child(1) > div
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='a > div.ml-pro-img > img',
        attr='src',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='a > div.ml-pro-info > div.orderNum.adress > span:nth-child(1)',
        is_info=True,
    ),
    # A conversion should be applied here.
    # \34 187 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(2) > span:nth-child(1)
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='a > div.ml-pro-info > div:nth-child(3) > span',
        is_info=True,
    ),
    # The regex usage here is problematic.
    Field(fieldname=FieldName.SHOP_COMMENT_NUM, css_selector='', filter_func=get_comment_num, is_info=True),
    # No shop_feature is available.
    Field(fieldname=FieldName.SHOP_FEATURE, css_selector='', filter_func=get_shop_feature, is_info=True),
    Field(fieldname=FieldName.SHOP_RATE, css_selector='', filter_func=get_shop_rate, is_info=True),
    # The comment URL is built from the entry link's href by get_comment_url.
    Field(
        fieldname=FieldName.SHOP_COMMENT_URL,
        css_selector='a',
        attr='href',
        filter_func=get_comment_url,
        is_info=True,
    ),
)
示例#5
0
# Fields read for each shop entry. RATE and FEATURE have no selector and are
# produced entirely by their filter functions.
fl_shop1 = Fieldlist(
    Field(fieldname=FieldName.SHOP_NAME, css_selector=' a.e_title.js_list_name', is_debug=True),
    Field(fieldname=FieldName.SHOP_URL, css_selector='a.e_title.js_list_name', attr='href', is_info=True),
    Field(fieldname=FieldName.SHOP_IMG, css_selector='a > img:nth-child(1)', attr='src', is_info=True),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='div > div > div.clrfix > div.item_hotel_info > div.item_hotel_bsinfo > table > tbody > tr > td.item_hotel_name > div > p > span > em',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector=' div > div > div.clrfix > div.item_hotel_info > div.hotel_price >  div > div > div > p > a > b',
        is_info=True,
    ),
    # The regular expression differs here; minor issue.
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector=' div > div > div.clrfix > div.item_hotel_info > div.item_hotel_bsinfo > table > tbody > tr > td.item_hotel_name > div > div.level.levelmargin > a.level_comment.level_commentbd.js_list_usercomcount',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='div > div > div.clrfix > div.item_hotel_info > div.item_hotel_bsinfo > table > tbody > tr > td.item_hotel_name > div > div.level.levelmargin > a.level_score.js_list_score > strong',
        is_info=True,
    ),
    Field(fieldname=FieldName.SHOP_RATE, css_selector='', filter_func=get_shop_rate, is_info=True),
    Field(fieldname=FieldName.SHOP_FEATURE, filter_func=get_shop_feature, css_selector='', is_info=True),
)
示例#6
0
from spider.driver.base.tabsetup import TabSetup
from spider.driver.base.listcssselector import ListCssSelector
from spider.driver.base.mongodb import Mongodb
import re
import time
import json
from pyquery import PyQuery

# Fields read for each hotel entry. Several fields post-process the raw text
# with a regex (and, where given, a replacement template).
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='li.hotel_item_name > h2 > a',
        # Drop any leading digits from the name.
        regex=r'^[\d]*(.*)$',
        repl=r'\1',
    ),
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='li.hotel_item_name > h2 > a',
        attr='href',
        regex=r'^([^\?]*)?.*$',
        repl=r'\1',
    ),
    Field(
        fieldname=FieldName.SHOP_ID,
        css_selector='li.hotel_item_name > h2 > a',
        attr='href',
        regex=r'^[^\?\d]*([\d]*).html?.*$',
        repl=r'\1',
    ),
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='li.pic_medal > div > a > img',
        attr='src',
    ),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='li.hotel_item_name > p.hotel_item_htladdress',
    ),
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='li.hotel_item_judge.no_comment > div.hotelitem_judge_box > a > span.hotel_value',
    ),
    Field(
        fieldname=FieldName.SHOP_STATISFACTION_PERCENT,
        css_selector='li.hotel_item_judge.no_comment > div.hotelitem_judge_box > a > span.total_judgement_score > span',
    ),
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='li.hotel_item_name > span',
        attr='innerHTML',
        regex=r'[^\d]*',
    ),
    Field(
        fieldname=FieldName.SHOP_ACTIVE_STATUS,
        css_selector='li.hotel_item_name > p.hotel_item_last_book',
    ),
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector='span.J_price_lowList',
    ),
    Field(
        fieldname=FieldName.SHOP_CATEGORY_NAME,
        css_selector='li.hotel_item_name > p.medal_list > span',
    ),
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='li.hotel_item_judge.no_comment > div.hotelitem_judge_box > a > span.hotel_judgement > span',
    ),
    Field(
        fieldname=FieldName.SHOP_GRADE_TEXT,
        css_selector='li.hotel_item_judge.no_comment > div.hotelitem_judge_box > a > span.recommend',
    ),
)


def get_recommend_all_room_dict(self, _str):
    p = PyQuery(_str)
    item_list = []
    for each in p('tr').items():
        if each.attr('class'):
            item_list.append(each)
import math
import datetime
def get_zero(self, _str):
    """Return the float ``0.0`` regardless of ``_str``."""
    zero = 0.0
    return zero
def get_shop_area(self, _str):
    """Return the fixed area label for this list; ``_str`` is ignored."""
    area_label = '千岛湖东北湖区'
    return area_label
def get_baidu_spider_step(self, _str):
    """Return the fixed step marker ``"2"`` as a string; ``_str`` is ignored."""
    step_marker = "2"
    return step_marker

# Fields read for each search-result entry. LNG, LAT, AREA and
# BAIDU_SPIDER_STEP have no selector; their values come from filter functions
# that return constants.
fl_shop1 = Fieldlist(
    # card-56 > div > ul > li:nth-child(3) > div.cf.mb_5 > div.ml_30.mr_85 > div:nth-child(1) > span > a
    # card-56 > div > ul > li.search-item.base-item > div.cf > div.ml_30.mr_90 > div:nth-child(1) > span:nth-child(1) > a
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='div.cf > div.ml_30 > div:nth-child(1) > span > a',
        is_info=True,
    ),
    # card-56 > div > ul > li.search-item.base-item > div.cf > div.ml_30.mr_90 > div.row.addr > span
    # card-56 > div > ul > li:nth-child(3) > div.cf.mb_5 > div.ml_30.mr_85 > div.row.addr > span
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='div.cf > div.ml_30 > div.row.addr > span',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='div.cf > div.col-r > div.img-wrap > a > img',
        attr='src',
        is_info=True,
    ),
    Field(fieldname=FieldName.SHOP_LNG, css_selector='', filter_func=get_zero, is_info=True),
    Field(fieldname=FieldName.SHOP_LAT, css_selector='', filter_func=get_zero, is_info=True),
    Field(fieldname=FieldName.SHOP_AREA, css_selector='', filter_func=get_shop_area, is_info=True),
    Field(
        fieldname=FieldName.BAIDU_SPIDER_STEP,
        css_selector='',
        filter_func=get_baidu_spider_step,
        is_info=True,
    ),
)

def get_shop_name(self, _str):
    """Store ``_str`` on ``self.shop_name`` and return it unchanged."""
    setattr(self, 'shop_name', _str)
    return _str

fl_shop2 = Fieldlist(

#phoenix_dom_3_0 > div > div.head-wrapper.c-title.c-color.c-flexbox.c-line-bottom > div.left > span
#phoenix_dom_3_1 > div > div.head-wrapper.c-title.c-color.c-flexbox.c-line-bottom > div.left > span
示例#8
0
import re
import time
import json
from pyquery import PyQuery
import xmltodict

# Fields read for each ticket search-result entry.
fl_shop1 = Fieldlist(
    Field(fieldname=FieldName.SHOP_NAME, css_selector='div.search_ticket_title > h2 > a'),
    Field(fieldname=FieldName.SHOP_RATE, css_selector='div.search_ticket_title > h2 > span > span.rate'),
    Field(fieldname=FieldName.SHOP_URL, css_selector='div.search_ticket_title > h2 > a', attr='href'),
    Field(fieldname=FieldName.SHOP_IMG, css_selector='a > img', attr='src'),
    Field(fieldname=FieldName.SHOP_ADDRESS, css_selector='div.search_ticket_title > div.adress'),
    Field(fieldname=FieldName.SHOP_GRADE, css_selector='div.search_ticket_assess > span.grades > em'),
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='div.search_ticket_assess > span.grades',
        # Keep only the digits found inside the parentheses.
        regex=r'^[^\(]*\(([\d]+)[^\)\d]*\)$',
        repl=r'\1',
    ),
    Field(fieldname=FieldName.SHOP_FEATURE, css_selector='div.search_ticket_title > div.exercise'),
)


def get_shop_service(self, _str):
    p = PyQuery(_str)
    service_list = []
    for i in p('span').items():
示例#9
0
def get_shop_rate(self, _str):
    """Return an empty string as the shop rate; ``_str`` is ignored."""
    return str()
def get_shop_grade(self, _str):
    """Return the fixed placeholder grade ``"0.0"``; ``_str`` is ignored."""
    default_grade = "0.0"
    return default_grade
# Fields read for each search-result entry. GRADE and RATE have no selector
# and are produced entirely by their filter functions.
fl_shop1 = Fieldlist(
    # #_j_search_result_left > div:nth-child(1) > div > div:nth-child(1) > div.ct-text > h3 > a
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='div.ct-text > h3 > a',
        is_debug=True,
    ),
    # _j_search_result_left > div:nth-child(1) > div > div:nth-child(2) > div.ct-text > h3 > a
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='div > div.ct-text > h3 > a',
        attr='href',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='div.flt1 > a > img',
        attr='src',
        is_info=True,
    ),
    # _j_search_result_left > div:nth-child(1) > div > div:nth-child(1) > div.ct-text > div > p:nth-child(1)
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='div.ct-text > ul > li:nth-child(1) > a',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector='div.ct-text > ul > li.frt._j_hotel_ota > a > span.seg-price',
    ),
    # NOTE: the regular expression differs here.
    # _j_search_result_left > div:nth-child(1) > div > div:nth-child(2) > div.ct-text > ul > li:nth-child(2) > a
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='div.ct-text > ul > li:nth-child(2) > a',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_FEATURE,
        css_selector='div.ct-text > div > p:nth-child(1)',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='',
        filter_func=get_shop_grade,
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='',
        filter_func=get_shop_rate,
        is_info=True,
    ),
)



示例#10
0
# Fields read for each sight entry: name, rating level, URL, image, address,
# grade, price and feature text.
fl_shop1 = Fieldlist(
    Field(fieldname=FieldName.SHOP_NAME,
          css_selector='div.sight_item_detail.clrfix > div.sight_item_about > h3 > a'),
    # Scenic-area rating level (e.g. "5A").
    Field(fieldname=FieldName.SHOP_RATE,
          css_selector='div.sight_item_detail.clrfix > div.sight_item_about > div.sight_item_info > div.clrfix > span.level'),
    Field(fieldname=FieldName.SHOP_URL,
          css_selector='div.sight_item_detail.clrfix > div.sight_item_about > h3 > a',
          attr='href'),
    # Fixed selector: `div.show loading` selected a <loading> element nested
    # inside div.show, which cannot match an image wrapper. Matching on
    # `div.show` alone works whether or not a `loading` class is present.
    # NOTE(review): confirm against the live page markup.
    Field(fieldname=FieldName.SHOP_IMG,
          css_selector='div.sight_item_detail.clrfix > div.sight_item_show > div.show > a > img',
          attr='src'),
    Field(fieldname=FieldName.SHOP_ADDRESS,
          css_selector='div.sight_item_detail.clrfix > div.sight_item_about > div.sight_item_info > p.address.color999 > span'),
    Field(fieldname=FieldName.SHOP_GRADE,
          css_selector='div.sight_item_detail.clrfix > div.sight_item_about > div.sight_item_info > div.clrfix > div.sight_item_hot > span.product_star_level > em > span'),
    # Price.
    # Fixed selector: `tr-nthchild:(0)` is not valid CSS. The first table row
    # is `tr:nth-child(1)` because :nth-child() is 1-indexed.
    Field(fieldname=FieldName.SHOP_PRICE,
          css_selector='div.sight_item_detail.clrfix > div.sight_item_about > div.sight_item_pop > table > tbody > tr:nth-child(1) > td > span.sight_item_price > em'),
    Field(fieldname=FieldName.SHOP_FEATURE,
          css_selector='div.sight_item_detail.clrfix > div.sight_item_about > div.sight_item_info > div.intro.color999'),
)
import json


def get_shop_area(self, _str):
    """Return the fixed area label for this list; ``_str`` is ignored."""
    area_label = '千岛湖乡村游景点'
    return area_label


# Fields read for each list entry. SHOP_AREA reuses the phone selector, but its
# value comes from get_shop_area.
fl_shop1 = Fieldlist(
    Field(fieldname=FieldName.SHOP_NAME, css_selector='dl > dd > a > h2', is_info=True),
    Field(fieldname=FieldName.SHOP_URL, css_selector='dl > dd > a', attr='href', is_info=True),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='dl > dd > div.tourListLeftListMsg > span:nth-child(1)',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_PHONE,
        css_selector='dl > dd > div.tourListLeftListMsg > span:nth-child(2)',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_AREA,
        css_selector='dl > dd > div.tourListLeftListMsg > span:nth-child(2)',
        filter_func=get_shop_area,
        is_info=True,
    ),
)

page_shop_1 = Page(
    name='大众点评餐饮店铺列表页面',
    fieldlist=fl_shop1,
    listcssselector=ListCssSelector(
        list_css_selector=
        'body > div.mainLayout.newsMainLayout > div.newsLeftLayout.sceneRightLayout > div'
示例#12
0
# Fields read for each product entry. RATE has no selector and is produced by
# its filter function; GRADE is post-processed by get_shop_grade.
fl_shop1 = Fieldlist(
    Field(fieldname=FieldName.SHOP_NAME,
          css_selector='div.product-regular.clearfix > div.product-section > h3 > a',
          is_info=True),
    Field(fieldname=FieldName.SHOP_RATE, css_selector='', is_info=True, filter_func=get_shop_rate),
    Field(fieldname=FieldName.SHOP_URL,
          css_selector='div.product-regular.clearfix > div.product-section > h3 > a',
          attr='href',
          is_info=True),
    Field(fieldname=FieldName.SHOP_IMG,
          css_selector='div.product-regular.clearfix > div.product-left > a > img',
          attr='src',
          is_info=True),
    Field(fieldname=FieldName.SHOP_ADDRESS,
          css_selector=' div.product-regular.clearfix > div.product-section > dl:nth-child(3) > dd',
          is_info=True),
    Field(fieldname=FieldName.SHOP_PRICE,
          css_selector='div.product-regular.clearfix > div.product-info > div > em',
          is_info=True),
    Field(fieldname=FieldName.SHOP_COMMENT_NUM,
          css_selector=' div.product-regular.clearfix > div.product-info > ul > li:nth-child(2) > a '),
    Field(fieldname=FieldName.SHOP_FEATURE,
          css_selector=' div.product-regular.clearfix > div.product-section > dl:nth-child(6) > dd > div'),
    Field(fieldname=FieldName.SHOP_GRADE,
          css_selector='div.product-regular.clearfix > div.product-info > ul > li:nth-child(1) > b',
          filter_func=get_shop_grade,
          is_info=True),
)
示例#13
0
            result.setdefault('评分', float(re.sub(r'[^\d.]*', '', i.text())))
        elif '评价' in i.text():
            result.setdefault('评论数', int(re.sub(r'[^\d]*', '', i.text())))
        elif '游记' in i.text():
            result.setdefault('游记数', int(re.sub(r'[^\d]*', '', i.text())))
    return json.dumps(result, ensure_ascii=False)


# Fields read for each hotel entry: name, rate (taken from the element's class
# attribute) and an intro block post-processed by get_shop_grade.
fl_shop1 = Fieldlist(
    Field(fieldname=FieldName.SHOP_NAME,
          css_selector='div.hotel-title > div > h3 > a'),
    Field(fieldname=FieldName.SHOP_RATE,
          css_selector='div.hotel-title > div > span.hotel-rate.rate5',
          attr='class',
          # Raw string: "\d" in a plain literal is an invalid escape sequence
          # and triggers a warning on current Python versions. The pattern
          # value itself is unchanged.
          regex=r'[^\d]*',
          is_info=True),
    Field(fieldname=FieldName.SHOP_INTRO,
          css_selector='div.hotel-info > ul',
          attr="innerHTML",
          # Boolean flag, as in every other Field; previously the string 'True'.
          is_debug=True,
          filter_func=get_shop_grade,
          is_info=True),
)

# fl_shop2 = Fieldlist(
#     Field(fieldname=FieldName.SHOP_ADDRESS, css_selector='div.container > div.hotel-intro > div.intro-hd > div.location > span', attr='title', offset=6, try_times=10, pause_time=1),
#     Field(fieldname=FieldName.SHOP_ROOM_RECOMMEND_ALL, css_selector='#_j_booking_info', attr='innerHTML', filter_func=get_shop_room_all, offset=6, try_times=10, pause_time=2),
#     Field(fieldname=FieldName.SHOP_TRAFFIC, css_selector='#_j_map_poi_list > div.bd', attr='innerHTML', filter_func=get_shop_traffic, offset=6, try_times=10, pause_time=1),
#     Field(fieldname=FieldName.SHOP_FACILITIES, css_selector='#_j_hotel_info', attr='innerHTML', filter_func=get_shop_facilities, offset=6, try_times=10, pause_time=1),
#     Field(fieldname=FieldName.SHOP_STATISTICS, css_selector='#_j_comment', attr='innerHTML', filter_func=get_shop_stattistics),
# )
示例#14
0
    return ""
def get_shop_feature(self, _str):
    """Return an empty string as the shop feature; ``_str`` is ignored."""
    empty_feature = ""
    return empty_feature
def get_shop_rate(self, _str):
    """Return an empty string as the shop rate; ``_str`` is ignored."""
    empty_rate = ""
    return empty_rate
# Fields read for each scenic-spot entry. ADDRESS, GRADE, FEATURE and RATE have
# no selector and are produced entirely by their filter functions.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='a > div.search-scenic-content > h3',
    ),
    # \31 302 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2)
    # \32 0808 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2)
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector='a > div.search-scenic-content > div.search-scenic-wrapper > div.search-scenic-price > span',
        is_info=True,
    ),
    # Slightly problematic.
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='a',
        attr='href',
        is_debug=True,
        is_info=True,
    ),
    # The image field still has some issues.
    # \33 6822720 > div:nth-child(1) > div
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='a > div.img-container.lazy-img-box.fl > img',
        attr='src',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='',
        filter_func=get_shop_address,
        is_info=True,
    ),
    # A conversion should be applied here.
    # \34 187 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(2) > span:nth-child(1)
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='',
        filter_func=get_shop_grade,
        is_info=True,
    ),
    # The regex usage here is problematic.
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='a > div.search-scenic-content > div.search-scenic-wrapper > div.search-scenic-detail > p',
        is_info=True,
    ),
    # No shop_feature is available.
    Field(
        fieldname=FieldName.SHOP_FEATURE,
        css_selector='',
        filter_func=get_shop_feature,
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='',
        filter_func=get_shop_rate,
        is_info=True,
    ),
)
# List page: one record per <li> matched by the list selector; is_save=True
# is passed so the page is flagged for saving.
page_shop_1 = Page(
    name='途牛景点店铺列表页面',
    fieldlist=fl_shop1,
    listcssselector=ListCssSelector(
        list_css_selector='#search-container > section > div > ul > li',
    ),
    mongodb=Mongodb(
        db=TravelDriver.db,
        collection=TravelDriver.shop_collection,
    ),
    is_save=True,
)

def get_comment_grade(self,_str):
    doc = pq(_str)
    if (doc('.star-active').length) == 3:
示例#15
0
# Fields read for each ticket entry. FEATURE and RATE have no selector and are
# produced entirely by their filter functions.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='div.theinfo.ticket.clearfix > a > dl > dt > p > span',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector='div.theinfo.ticket.clearfix > a > div.priceinfo > span > em',
        is_info=False,
    ),
    # Slightly problematic.
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='div.theinfo.ticket.clearfix > a',
        attr='href',
        is_debug=True,
        is_info=False,
    ),
    # The image field still has some issues.
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='div.theinfo.ticket.clearfix > a > div.imgbox > div > img',
        attr='data-src',
        is_info=False,
    ),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='div.theinfo.ticket.clearfix > a > dl > dd:nth-child(2)',
        is_info=False,
    ),
    # A conversion should be applied here.
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='div.theinfo.ticket.clearfix > a > div.priceinfo > div > p > i',
        filter_func=get_shop_grade,
        is_info=False,
    ),
    # The regex usage here is problematic.
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='div.theinfo.ticket.clearfix > a > div.priceinfo > div > p > span',
        is_info=False,
    ),
    # No shop_feature is available.
    Field(
        fieldname=FieldName.SHOP_FEATURE,
        css_selector='',
        is_info=True,
        filter_func=get_shop_feature,
    ),
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='',
        is_info=True,
        filter_func=get_shop_rate,
    ),
)
示例#16
0
# -*- coding:utf-8 -*-
from spider.driver.base.driver import Driver
from spider.driver.base.mysql import Mysql
import time
from pyquery import PyQuery
from spider.driver.base.field import Field, FieldName, Fieldlist, FieldType
from spider.driver.base.page import Page
from spider.driver.base.listcssselector import ListCssSelector
from spider.driver.base.mongodb import Mongodb
from spider.driver.base.tabsetup import TabSetup

# Account list entry: the account name, with a regex applied to the raw text.
fl_weixin1 = Fieldlist(
    Field(
        fieldname='public_name',
        css_selector='div > div.txt-box > p.tit > a',
        regex=r'[^\u4e00-\u9fa5]*',
    ),
)

# Article list entry: title and the extra-info line used as the article time.
fl_weixin2 = Fieldlist(
    Field(fieldname='article_name', css_selector='div > div > h4'),
    Field(fieldname='article_time', css_selector='div > div > p.weui_media_extra_info'),
)

# Account list page: one record per <li> matched by the list selector.
page_weixin_1 = Page(
    name='微信公众号列表页面',
    fieldlist=fl_weixin1,
    listcssselector=ListCssSelector(list_css_selector='#main > div.news-box > ul > li'),
)

page_weixin_2 = Page(
    name='微信公众号文章列表页面',
    fieldlist=fl_weixin2,
    tabsetup=TabSetup(click_css_selector='div > div.txt-box > p.tit > a'),
示例#17
0
# -*- coding:utf-8 -*-

from spider.driver.travel.core.traveldriver import TravelDriver
from spider.driver.base.page import Page, NextPageCssSelectorSetup, PageFunc
from spider.driver.base.field import Fieldlist, Field, FieldName
from spider.driver.base.tabsetup import TabSetup
from spider.driver.base.listcssselector import ListCssSelector
from spider.driver.base.mongodb import Mongodb
import re
import time
import json
from pyquery import PyQuery

# Shop list entry: only the shop name is read.
fl_shop1 = Fieldlist(
    Field(fieldname=FieldName.SHOP_NAME, css_selector='div:nth-child(2) > span'),
)

# Shop detail page: the shop name, located with a selector rooted at <body>.
fl_shop2 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='body > div > div.rax-scrollview > div > div:nth-child(1) > div > div:nth-child(1) > span',
    ),
)

# Comment entry: only the commenting user's nickname is read.
fl_comment1 = Fieldlist(
    # Field(fieldname=FieldName.SHOP_NAME, css_selector='body > div > div.rax-scrollview > div > div:nth-child(1) > div > div:nth-child(1) > span'),
    Field(
        fieldname=FieldName.COMMENT_USER_NAME,
        css_selector='div.rate-info > div.avatar-info > div.user-nick',
    ),
)

page_shop_1 = Page(name='飞猪景点店铺列表页面',
                   fieldlist=fl_shop1,
示例#18
0
from spider.driver.base.tabsetup import TabSetup
from spider.driver.base.page import Page
from spider.driver.base.listcssselector import ListCssSelector
from spider.driver.base.mongodb import Mongodb
from spider.driver.travel.core.traveldriver import TravelDriver
import time
from pyquery import PyQuery
import json
import re

# Fields read for each hotel entry. GRADE and COMMENT_NUM are both cut out of
# the same element's text with different regexes.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='div > div.h_info_pic > a > img',
        attr='src',
    ),
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='div > div.h_info_text > div.h_info_base > p.h_info_b1 > a',
        attr='title',
    ),
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='div > div.h_info_text > div.h_info_base > p.h_info_b1 > a',
        attr='href',
    ),
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='div > div.h_info_text > div.h_info_base > p.h_info_b1 > b',
        attr='class',
        regex=r'[^\d]*',
    ),
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='div > div.h_info_text > div.h_info_comt',
        # Keep the leading run of digits and dots.
        regex=r'^([\d.]*).*$',
        repl=r'\1',
    ),
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='div > div.h_info_text > div.h_info_comt',
        # Keep the digit run that follows the leading grade number.
        regex=r'^[\d.]*[^\d]*([\d]*)[^\d]*$',
        repl=r'\1',
    ),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='div > div.h_info_text > div.h_info_base > p.h_info_b2',
    ),
    Field(
        fieldname=FieldName.SHOP_ACTIVE_STATUS,
        css_selector='div > div.h_info_text > div.h_info_base > p.lastt_book',
    ),
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector='div > div.h_info_text > div.h_info_pri',
        regex=r'[^\d.]*',
    ),
)

def get_shop_room(self, _str):
    p = PyQuery(_str)
    room_list = []
    for i in p('div.hdetail_type > div.htype_list > div.htype_item').items():
        info_list = i('div.htype_info').text().split('\n')
        detail = info_list[3].split('|')
        type_list = []
        for j in i('div.htype_info_list').items('tbody > tr'):
            type = j.text()
示例#19
0
    shop_comment_url = "https://m.tuniu.com/h5/tour/comment/" + shop_id + "/4"
    return shop_comment_url


# Fields read for each scenic-spot entry. ADDRESS, GRADE, FEATURE and RATE have
# no selector and are produced entirely by their filter functions.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='a > div.search-scenic-content > h3',
    ),
    # \31 302 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2)
    # \32 0808 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2)
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector='a > div.search-scenic-content > div.search-scenic-wrapper > div.search-scenic-price > span',
        is_info=True,
    ),
    # Slightly problematic.
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='a',
        attr='href',
        is_debug=True,
        is_info=True,
    ),
    # The image field still has some issues.
    # \33 6822720 > div:nth-child(1) > div
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='a > div.img-container.lazy-img-box.fl > img',
        attr='src',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='',
        filter_func=get_shop_address,
        is_info=True,
    ),
    # A conversion should be applied here.
    # \34 187 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(2) > span:nth-child(1)
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='',
        filter_func=get_shop_grade,
        is_info=True,
    ),
    # The regex usage here is problematic.
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='a > div.search-scenic-content > div.search-scenic-wrapper > div.search-scenic-detail > p',
        is_info=True,
    ),
    # No shop_feature is available.
    Field(
        fieldname=FieldName.SHOP_FEATURE,
        css_selector='',
        filter_func=get_shop_feature,
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='',
        filter_func=get_shop_rate,
        is_info=True,
    ),
    # The comment URL is built from the entry link's href by get_shop_comment_url.
    Field(
        fieldname=FieldName.SHOP_COMMENT_URL,
        css_selector='a',
        attr='href',
        filter_func=get_shop_comment_url,
        is_info=True,
    ),
)
# List page: one record per <li> matched by the list selector; is_save=True
# is passed so the page is flagged for saving.
page_shop_1 = Page(
    name='途牛景点店铺列表页面',
    fieldlist=fl_shop1,
    listcssselector=ListCssSelector(
        list_css_selector='#search-container > section > div > ul > li',
    ),
    mongodb=Mongodb(
        db=TravelDriver.db,
        collection=TravelDriver.shop_collection,
    ),
    is_save=True,
)
fl_shop2 = Fieldlist(
    Field(fieldname=FieldName.SHOP_NAME,
          css_selector='#main-page > div.mp-main > div.mp-headfigure > div.mp-headfeagure-info > div'),
示例#20
0
# Fields read for each product entry. SHOP_URL is derived from the link's
# onclick attribute by get_shop_url; RATE has no selector and comes from its
# filter function.
fl_shop1 = Fieldlist(
    Field(fieldname=FieldName.SHOP_NAME, css_selector=' dl > dt > a', is_debug=True),
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='dl > dt > a',
        attr='onclick',
        filter_func=get_shop_url,
        is_info=True,
    ),
    Field(fieldname=FieldName.SHOP_IMG, css_selector=' a > img', attr='src', is_info=True),
    Field(fieldname=FieldName.SHOP_ADDRESS, css_selector='dl > dd.proInfo-address > i', is_info=True),
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='  div > div:nth-child(2) > ul > li:nth-child(2) > a',
        is_info=True,
    ),
    Field(fieldname=FieldName.SHOP_FEATURE, css_selector=' dl > dd:nth-child(4)', is_info=True),
    Field(fieldname=FieldName.SHOP_PRICE, css_selector='div > div.priceInfo-price > dfn > span', is_info=True),
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='div > div:nth-child(2) > ul > li:nth-child(1) > a > b',
        filter_func=get_shop_grade,
        is_info=True,
    ),
    Field(fieldname=FieldName.SHOP_RATE, css_selector='', filter_func=get_shop_rate, is_info=True),
)
示例#21
0
import xmltodict


def get_shop_rate(self, _str):
    """Placeholder filter function: ignore ``_str`` and return an empty string."""
    return ''
def get_shop_feature(self, _str):
    """Placeholder filter function: ignore ``_str`` and return an empty string."""
    return ''
# Shop list fields. Most entries carry a CSS selector; SHOP_RATE and
# SHOP_FEATURE have an empty selector and rely on their filter functions.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector=' div > div.h_info > div.h_info_text > div.h_info_base > p.h_info_b1 > a > span.info_cn',
        attr='innerHTML',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='div > div.h_info_text > div.h_info_base > p.h_info_b1 > a',
        attr='href',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='div.h_info_pic > a > img',
        attr='big-src',
        is_info=True,
    ),
    # Original author's note: has some problems.
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='div > div.h_info_text > div.h_info_base > p.h_info_b2',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector='div > div.h_info_text > div.h_info_pri > p:nth-child(1) > a > span.h_pri_num',
        is_info=True,
    ),
    # Original author's note: slightly problematic.
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='div > div.h_info_text > div.h_info_comt > a > span.c555.block.mt5',
    ),
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector=' div > div.h_info_text > div.h_info_comt > a > span.h_info_comt_bg > i.c37e',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='',
        filter_func=get_shop_rate,
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_FEATURE,
        css_selector='',
        filter_func=get_shop_feature,
        is_info=True,
    ),
)
# The detail page currently declares no fields of its own.
fl_shop2 = Fieldlist()

# List page: fl_shop1 paired with the list-item selector; is_save=True is passed to Page.
page_shop_1 = Page(
    name='艺龙酒店店铺列表页面',
    fieldlist=fl_shop1,
    listcssselector=ListCssSelector(
        list_css_selector='#hotelContainer > div > div',
    ),
    mongodb=Mongodb(
        db=TravelDriver.db,
        collection=TravelDriver.shop_collection,
    ),
    is_save=True,
)

# Detail page: configured with a TabSetup whose click selector is the shop-name anchor.
page_shop_2 = Page(
    name='艺龙酒店店铺详情页面',
    fieldlist=fl_shop2,
    tabsetup=TabSetup(
        click_css_selector='div > div.h_info_text > div.h_info_base > p.h_info_b1 > a',
    ),
    mongodb=Mongodb(
        db=TravelDriver.db,
        collection=TravelDriver.shop_collection,
    ),
)
fl_comment1 = Fieldlist(
    Field(fieldname=FieldName.COMMENT_USER_NAME, css_selector=' div.cmt_userinfo > div > p.cmt_un',is_info=True),
    p = PyQuery(_str)
    tag_list = []
    for i in list(p('span').items())[1:]:
        tag_list.append(i.text())
    return json.dumps(tag_list, ensure_ascii=False)

def get_shop_rate(self, _str):
    """Scale an integer string down by ten and return it as a float string.

    For example ``'45'`` becomes ``'4.5'``. ``int()`` raises ``ValueError``
    when ``_str`` is not a valid integer literal (including the empty string).
    """
    raw_value = int(_str)
    scaled = float(raw_value / 10)
    return str(scaled)

# Shop list fields. The first three carry real selectors; the remaining
# entries are declared with an empty css_selector.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector='div.hotel-info-ctn > div.hotel-remark > div.price > p > strong',
    ),
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='div.hotel-info-ctn > div.hotel-info-main > h2 > a.hotel-name-link',
    ),
    # Configured on the span's class attribute with a non-digit regex and
    # get_shop_rate as the filter function.
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='div.hotel-info-ctn > div.hotel-remark > div.remark > div > div > span',
        attr='class',
        regex=r'[^\d]*',
        filter_func=get_shop_rate,
    ),
    # Disabled by the original author:
    # Field(fieldname=FieldName.SHOP_TAG, css_selector='div.hotel-info-ctn > div.hotel-info-main > p.hotel-tags', attr='innerHTML', filter_func=get_shop_tag, pause_time=3),
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='',
        attr='href',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='',
        attr='src',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_FEATURE,
        css_selector='',
        is_info=True,
    ),
)

def get_shop_room_all(self, _str):
    p = PyQuery(_str)
    sale_dict = {}
    room_list = []
    for i in p('div.hotel-rooms > div.hotel-rooms-list > div.hotel-rooms-list-cont > ul > li').items():
        room = {'room_name': i('div.title-info.clearfix.dph-col.dph-col1 > div.title > h3').text()}
        for j in i('div.h-item-more.h-hide').text().split('\n'):
            room.update((lambda x: {x[0].strip(): x[1].strip()} if len(x) == 2 else {})(j.split(':')))
示例#23
0
# Shop list fields. Entries with an empty css_selector are driven only by their
# filter function; several of them use get_zero (defined outside this excerpt).
fl_shop1 = Fieldlist(
    Field(fieldname=FieldName.SHOP_NAME, css_selector='div.txt > div.tit > a > h4', is_info=True),
    Field(fieldname=FieldName.SHOP_URL, css_selector='div.txt > div.tit > a', attr='href', is_info=True),
    Field(fieldname=FieldName.SHOP_COMMENT_NUM, css_selector='div.txt > div.comment > a.review-num', attr='innerHTML', filter_func=get_zero, is_info=True),
    Field(fieldname=FieldName.SHOP_PRICE, css_selector='div.txt > div.comment > a.mean-price > b', attr='innerHTML', filter_func=get_zero, is_info=True),
    Field(fieldname=FieldName.SHOP_ADDRESS, css_selector='div.txt > div.tag-addr > span.addr', is_info=True),
    Field(fieldname=FieldName.SHOP_IMG, css_selector='div.pic > a > img', attr='src', is_info=True),
    Field(fieldname=FieldName.SHOP_SCORE, css_selector='div.txt > div.comment > span', filter_func=get_zero, attr='class', is_info=True),
    Field(fieldname=FieldName.SHOP_COOK_STYLE, css_selector='', filter_func=get_shop_cookie_style, is_info=True),
    Field(fieldname=FieldName.SHOP_SITE, css_selector='', filter_func=get_shop_site, is_info=True),
    # Same anchor as SHOP_URL, with get_comment_url as the filter function.
    Field(fieldname=FieldName.SHOP_COMMENT_URL, css_selector='div.txt > div.tit > a', attr='href', filter_func=get_comment_url, is_info=True),
    Field(fieldname=FieldName.SHOP_LNG, css_selector='', filter_func=get_zero, is_info=True),
    Field(fieldname=FieldName.SHOP_LAT, css_selector='', filter_func=get_zero, is_info=True),
    Field(fieldname=FieldName.SHOP_SERVICE, css_selector='', filter_func=get_zero, is_info=True),
    Field(fieldname=FieldName.SHOP_TASTE, css_selector='', filter_func=get_zero, is_info=True),
    Field(fieldname=FieldName.SHOP_ENV, css_selector='', filter_func=get_zero, is_info=True),
)
from spider.driver.base.mongodb import Mongodb
from spider.driver.travel.core.traveldriver import TravelDriver
import time
from pyquery import PyQuery
import json

# Shop list fields: image, name, URL, rate, grade and comment count.
# SHOP_RATE and SHOP_COMMENT_NUM are configured with a non-digit regex.
fl_shop1 = Fieldlist(
    Field(fieldname=FieldName.SHOP_IMG, css_selector='div.hotel-pic > a > img', attr='src'),
    Field(fieldname=FieldName.SHOP_NAME, css_selector='div.hotel-title > div > h3 > a'),
    Field(fieldname=FieldName.SHOP_URL, css_selector='div.hotel-title > div > h3 > a', attr='href'),
    Field(fieldname=FieldName.SHOP_RATE, css_selector='div.hotel-title > div > span.hotel-rate', attr='class', regex=r'[^\d]*'),
    Field(fieldname=FieldName.SHOP_GRADE, css_selector='div.hotel-info > ul > li.rating > em'),
    Field(fieldname=FieldName.SHOP_COMMENT_NUM, css_selector='div.hotel-info > ul > li:nth-child(2) > a > em', regex=r'[^\d]*'),
)


def get_shop_room_all(self, _str):
    p = PyQuery(_str)
    room_list = []
    for i in p('a.item._j_booking_item').items():
示例#25
0
from selenium.webdriver.remote.webelement import WebElement

from spider.driver.base.tabsetup import TabSetup
from spider.driver.base.field import Field, Fieldlist
from spider.driver.base.page import Page, PageGroup
from spider.driver.base.listcssselector import ListCssSelector
from spider.driver.base.mongodb import Mongodb

# Small driver script: builds the spider primitives from placeholder values,
# groups two pages and prints the first item the PageGroup yields.
fl = Fieldlist(
    Field(fieldname=12),
    Field(fieldname=13),
)
mongo = Mongodb(db='122', collection='12')
lcs = ListCssSelector(list_css_selector=12)
tab = TabSetup(url_name=12)

# The two pages share every collaborator and differ only in their name.
p = Page(name=122, fieldlist=fl, mongodb=mongo, listcssselector=lcs, tabsetup=tab)
p1 = Page(name=123, fieldlist=fl, mongodb=mongo, listcssselector=lcs, tabsetup=tab)

pg = PageGroup(p, p1)
print(next(pg))
from spider.driver.base.field import Fieldlist,Field,FieldName
from spider.driver.base.tabsetup import TabSetup
from spider.driver.base.page import Page
from spider.driver.base.listcssselector import ListCssSelector
from spider.driver.base.mongodb import Mongodb
from spider.driver.travel.core.traveldriver import TravelDriver
import time
from pyquery import PyQuery
import json

# Shop list fields. SHOP_RATE and SHOP_PRICE are configured with a
# non-digit regex; the other fields carry only a selector (and attribute).
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='div > div.row-center > div > h5 > a',
    ),
    Field(
        fieldname=FieldName.SHOP_CURR_URL,
        css_selector='div > div.row-center > div > h5 > a',
        attr='href',
    ),
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='div > div.row-left.fleft > a > img',
        attr='src',
    ),
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='div > div.row-center > div > h5 > span.row-subtitle',
        attr='title',
        regex=r'[^\d]*',
    ),
    Field(
        fieldname=FieldName.SHOP_ACTIVE_STATUS,
        css_selector='div > div.row-center > div > p.row-someone-book > span',
    ),
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='div > div.row-sub-right.fright > a > p.score > span.value',
    ),
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='div > div.row-sub-right.fright > a > p.comment > span',
    ),
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector='div > div.row-right.fright > div.box-price > p > span.pi-price.pi-price-lg',
        regex=r'[^\d]*',
    ),
)

def get_room_all(self, _str):
    """Collect room rows from an HTML fragment and return them as a JSON array.

    Every element matching the room-row selector contributes one entry: its
    text split on whitespace with the first token dropped. Non-ASCII
    characters are kept as-is in the JSON output.
    """
    doc = PyQuery(_str)
    rows = doc('div.room-item-wrapper > div.room-item-inner > div:nth-child(1)').items()
    tokens_per_row = [row.text().split()[1:] for row in rows]
    return json.dumps(tokens_per_row, ensure_ascii=False)

def get_shop_intro(self, _str):
    p = PyQuery(_str)
    info_list = p.text().split('\n')
示例#27
0
# Shop list fields. The commented selectors below are sample full selectors
# left by the original author; the Chinese notes have been translated.
fl_shop1 = Fieldlist(
    Field(fieldname=FieldName.SHOP_NAME, css_selector='div:nth-child(2) > span'),
    #\31 302 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2)
    #\32 0808 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2)
    Field(fieldname=FieldName.SHOP_PRICE, css_selector='div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2)', is_info=True),
    # Original author's note: slightly problematic.
    Field(fieldname=FieldName.SHOP_URL, css_selector='', attr='id', filter_func=get_shop_url, is_debug=True, is_info=True),
    # Original author's note: the image still has some issues.
    #\33 6822720 > div:nth-child(1) > div
    Field(fieldname=FieldName.SHOP_IMG, css_selector='', attr='', filter_func=get_shop_img, is_info=True),
    Field(fieldname=FieldName.SHOP_ADDRESS, css_selector='', filter_func=get_shop_address, is_info=True),
    # Original author's note: a conversion should be applied here.
    #\34 187 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(2) > span:nth-child(1)
    Field(fieldname=FieldName.SHOP_GRADE, css_selector='div:nth-child(2) > div:nth-child(3) > div > div:nth-child(2) > span:nth-child(1)', is_info=True),
    # Original author's note: the regular-expression usage is problematic.
    Field(fieldname=FieldName.SHOP_COMMENT_NUM, css_selector='', filter_func=_get_shop_comment_num, is_info=True),
    # Original author's note: no shop_feature.
    Field(fieldname=FieldName.SHOP_FEATURE, css_selector='div:nth-child(2) > div:nth-child(2) > span', is_info=True),
    Field(fieldname=FieldName.SHOP_RATE, css_selector='div:nth-child(2) > div:nth-child(3) > span', is_info=True),
)
def get_shop_rate(self, _str):
    """Return ``int(_str) / 10`` rendered as a float string, e.g. ``'45'`` -> ``'4.5'``.

    ``int()`` raises ``ValueError`` when ``_str`` is not a valid integer literal.
    """
    tenths = int(_str)
    rating = float(tenths / 10)
    return str(rating)

def get_shop_subtype_name(self, _str):
    """Return ``_str`` with leading and trailing whitespace removed."""
    trimmed = _str.strip()
    return trimmed
def get_shop_feature(self, _str):
    """Placeholder filter function: ignore ``_str`` and return an empty string."""
    return ''
def get_comment_url(self, _str):
    """Append the ``/review_all`` path segment to ``_str`` and return the result."""
    review_suffix = "/review_all"
    return _str + review_suffix
# Shop list fields: name, URL, comment count, price, rate, address, image,
# feature, grade and comment URL.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='div.txt > div.tit > a > h4',
    ),
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='div.txt > div.tit > a',
        attr='href',
    ),
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='div.txt > div.comment > a.review-num',
    ),
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector='div.txt > div.comment > a.mean-price',
    ),
    # Configured on the span's class attribute with a non-digit regex and
    # get_shop_rate as the filter function.
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='div.txt > div.comment > span',
        attr='class',
        regex=r'[^\d]*',
        filter_func=get_shop_rate,
    ),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='div.txt > div.tag-addr > span.addr',
    ),
    # Fix: an <img> element has no text content, so the image location must be
    # read from its src attribute (previously no attr was given).
    # NOTE(review): assumes Field reads element text when attr is omitted — confirm.
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='div.pic > a > img',
        attr='src',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_FEATURE,
        css_selector='',
        filter_func=get_shop_feature,
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='div.txt > span > span:nth-child(1) > b',
        is_info=True,
    ),
    # Same anchor as SHOP_URL, with get_comment_url as the filter function.
    Field(
        fieldname=FieldName.SHOP_COMMENT_URL,
        css_selector='div.txt > div.tit > a',
        attr='href',
        filter_func=get_comment_url,
        is_info=True,
    ),
)

# Shop list page: pairs fl_shop1 with the list-item selector and the shop
# collection configured on TravelDriver; is_save=True is passed to Page.
page_shop_1 = Page(
    name='大众点评景点店铺列表页面',
    fieldlist=fl_shop1,
    listcssselector=ListCssSelector(
        list_css_selector='#shop-all-list > ul > li',
    ),
    mongodb=Mongodb(
        db=TravelDriver.db,
        collection=TravelDriver.shop_collection,
    ),
    is_save=True,
)

def get_shop_time(self, _str):
    try:
        p = PyQuery(_str)
        shop_time = ''
        for i in p('p.info.info-indent').items():
            if '营业时间' in i.text():
                shop_time = i.text()
示例#29
0
    return _str.strip()


# Shop list fields. SHOP_RATE, SHOP_TAG and SUBTYPE_NAME each name a filter
# function; the remaining fields carry only a selector (and attribute).
fl_shop1 = Fieldlist(
    Field(fieldname=FieldName.SHOP_NAME, css_selector='div.txt > div.tit > a > h4'),
    Field(fieldname=FieldName.SHOP_URL, css_selector='div.txt > div.tit > a', attr='href'),
    Field(fieldname=FieldName.SHOP_COMMENT_NUM, css_selector='div.txt > div.comment > a.review-num'),
    Field(fieldname=FieldName.SHOP_PRICE, css_selector='div.txt > div.comment > a.mean-price'),
    Field(fieldname=FieldName.SHOP_RATE, css_selector='div.txt > div.comment > span', attr='class', regex=r'[^\d]*', filter_func=get_shop_rate),
    Field(fieldname=FieldName.SHOP_TAG, css_selector='div.txt > span.comment-list', attr='innerHTML', filter_func=get_shop_tag, pause_time=1),
    Field(fieldname=FieldName.SUBTYPE_NAME, css_selector='div.txt > div.tag-addr > a:nth-child(1)', filter_func=get_shop_subtype_name),
    Field(fieldname=FieldName.SHOP_ADDRESS, css_selector='div.txt > div.tag-addr > span.addr'),
)

page_shop_1 = Page(name='大众点评爱车店铺列表页面',
                   fieldlist=fl_shop1,
示例#30
0
# Shop list fields: image, name, URL, rate, year, address, price, a second
# rate entry, comment count and active status.
# NOTE(review): FieldName.SHOP_RATE is declared twice below (once on the
# nameAndIcon div's class attribute, once on the satisfaction highlight span).
# Confirm whether the second entry was meant to use a different FieldName.
fl_shop1 = Fieldlist(
    Field(fieldname=FieldName.SHOP_IMG, css_selector='div.hotel-logo > img', attr='src'),
    Field(fieldname=FieldName.SHOP_NAME, css_selector='div.hotel-info > div.nameAndIcon > a'),
    Field(fieldname=FieldName.SHOP_URL, css_selector='div.hotel-info > div.nameAndIcon > a', attr='href'),
    Field(fieldname=FieldName.SHOP_RATE, css_selector='div.hotel-info.fl > div.nameAndIcon > div', attr='class', regex=r'[^\d]*'),
    Field(fieldname=FieldName.SHOP_YEAR, css_selector='div.hotel-info.fl > div.nameAndIcon > span.decorate_year'),
    Field(fieldname=FieldName.SHOP_ADDRESS, css_selector='div.hotel-info.fl > div.addressInfo'),
    Field(fieldname=FieldName.SHOP_PRICE, css_selector='div.hotel-brief.fl > div.startPrice > span.digit'),
    Field(fieldname=FieldName.SHOP_RATE, css_selector='div.hotel-brief.fl > div.satisfaction > span.highlight'),
    Field(fieldname=FieldName.SHOP_COMMENT_NUM, css_selector='div.hotel-brief.fl > div.comment > a > span'),
    Field(fieldname=FieldName.SHOP_ACTIVE_STATUS, css_selector='div.hotel-brief.fl > div.lastOrderTime'),
)