Exemplo n.º 1
0
import time
import json
from pyquery import PyQuery
import math
import datetime

def get_shop_address(self, _str):
    """Return an empty string; both ``self`` and ``_str`` are ignored."""
    placeholder = ""
    return placeholder
def get_shop_grade(self, _str):
    """Return an empty string; both ``self`` and ``_str`` are ignored."""
    placeholder = ""
    return placeholder
def get_shop_feature(self, _str):
    """Return an empty string; both ``self`` and ``_str`` are ignored."""
    placeholder = ""
    return placeholder
def get_shop_rate(self, _str):
    """Return an empty string; both ``self`` and ``_str`` are ignored."""
    placeholder = ""
    return placeholder
fl_shop1 = Fieldlist(
    Field(fieldname=FieldName.SHOP_NAME, css_selector='a > div.search-scenic-content > h3'),
#\31 302 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2)
#\32 0808 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2)
    Field(fieldname=FieldName.SHOP_PRICE, css_selector='a > div.search-scenic-content > div.search-scenic-wrapper > div.search-scenic-price > span',is_info=True),
    #稍微有点问题
    Field(fieldname=FieldName.SHOP_URL,css_selector='a',attr='href', is_debug=True,is_info=True),
    #img还有些许问题
#\33 6822720 > div:nth-child(1) > div
    Field(fieldname=FieldName.SHOP_IMG, css_selector='a > div.img-container.lazy-img-box.fl > img', attr='src', is_info=True),
    Field(fieldname=FieldName.SHOP_ADDRESS, css_selector= '',filter_func=get_shop_address, is_info=True),
    #这里应该做一个转换
#\34 187 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(2) > span:nth-child(1)
    Field(fieldname=FieldName.SHOP_GRADE,css_selector='',filter_func=get_shop_grade, is_info=True),
    #正则表达式的使用有问题
    Field(fieldname=FieldName.SHOP_COMMENT_NUM,css_selector='a > div.search-scenic-content > div.search-scenic-wrapper > div.search-scenic-detail > p',is_info=True),
    #无shop_feature
Exemplo n.º 2
0
def get_shop_rate(self, _str):
    """Return an empty string; the arguments are not used."""
    empty = ""
    return empty


def get_comment_url(self, _str):
    """Return ``_str`` with the suffix "/comment" appended; ``self`` is unused."""
    suffix = "/comment"
    return _str + suffix


def get_shop_name_search_key(self, _str):
    """Return ``self.shop_name_search_key(self.shop_name)``.

    ``_str`` is ignored; the result is derived only from attributes of
    ``self``.
    """
    current_name = self.shop_name
    return self.shop_name_search_key(current_name)


fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='div > div.mp-sight-info > a > div.mp-sight-detail > h3',
        is_info=True),
    #\31 302 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2)
    #\32 0808 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2)
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector=
        'div > div.mp-sight-info > a > div.mp-sight-detail > div.mp-sight-pricecon > div.mp-sight-price > em',
        is_info=True),
    #稍微有点问题
    Field(fieldname=FieldName.SHOP_URL,
          css_selector='div > div.mp-sight-info > a',
          attr='href',
          is_debug=True,
          is_info=True),
    #img还有些许问题
Exemplo n.º 3
0
import json
import re
import random
import datetime
import math
def get_comment_num(self, _str):
    """Return the first run of digits found in ``_str`` as a string.

    At most ten consecutive digits are taken, as the pattern dictates.
    ``self`` is unused.

    Returns "0" when ``_str`` is empty/None or contains no digit, instead
    of raising IndexError as the previous ``findall(...)[0]`` form did.
    """
    match = re.search(r'[\d]{1,10}', _str or "")
    if match is None:
        # No digits in the scraped text: report zero comments.
        return "0"
    return match.group(0)
def get_shop_grade(self, _str):
    """Return the constant string "0.0"; both arguments are ignored."""
    default_grade = "0.0"
    return default_grade
def get_shop_price(self, _str):
    """Return the constant string "0.0"; both arguments are ignored."""
    default_price = "0.0"
    return default_price
def get_shop_rate(self, _str):
    """Return an empty string; both arguments are ignored."""
    empty = ""
    return empty
# Shop fields for one list entry. SHOP_RATE, SHOP_GRADE and SHOP_PRICE are
# declared with an empty selector and a filter_func.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='div > div.ct-text > h3 > a',
        is_debug=True,
    ),
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='',
        is_info=True,
        filter_func=get_shop_rate,
    ),
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='div > div.ct-text > h3 > a',
        attr='href',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector=' div > div.flt1 > a > img',
        attr='src',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='div > div.ct-text > ul > li:nth-child(1) > a',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='',
        filter_func=get_shop_grade,
    ),
    # The regular expression differs here.
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='div > div.ct-text > ul > li:nth-child(2) > a',
        filter_func=get_comment_num,
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_FEATURE,
        css_selector='div > div.ct-text > p',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector='',
        filter_func=get_shop_price,
        is_info=True,
    ),
)


def get_shop_feature(self, _str):
    """Return an empty string; the arguments are not used."""
    empty = ""
    return empty


def get_shop_rate(self, _str):
    """Return an empty string; the arguments are not used."""
    empty = ""
    return empty


def get_comment_url(self, _str):
    """Return ``_str`` followed by "/comment"; ``self`` is unused."""
    suffix = "/comment"
    return _str + suffix


fl_shop1 = Fieldlist(
    Field(fieldname=FieldName.SHOP_NAME,
          css_selector='a > div.ml-pro-info > p'),
    #\31 302 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2)
    #\32 0808 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2)
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector=
        ' a > div.ml-pro-info > div.ml-pro-price > span.price > i:nth-child(2)',
        is_info=True),
    #稍微有点问题
    Field(fieldname=FieldName.SHOP_URL,
          css_selector='a',
          attr='href',
          is_debug=True,
          is_info=True),
    #img还有些许问题
    #\33 6822720 > div:nth-child(1) > div
Exemplo n.º 5
0
import json
import re
import random


def get_shop_rate(self, _str):
    """Return an empty string regardless of the arguments."""
    blank = ""
    return blank


def get_shop_feature(self, _str):
    """Return an empty string regardless of the arguments."""
    blank = ""
    return blank


fl_shop1 = Fieldlist(
    Field(fieldname=FieldName.SHOP_NAME,
          css_selector=' a.e_title.js_list_name',
          is_debug=True),
    Field(fieldname=FieldName.SHOP_URL,
          css_selector='a.e_title.js_list_name',
          attr='href',
          is_info=True),
    Field(fieldname=FieldName.SHOP_IMG,
          css_selector='a > img:nth-child(1)',
          attr='src',
          is_info=True),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector=
        'div > div > div.clrfix > div.item_hotel_info > div.item_hotel_bsinfo > table > tbody > tr > td.item_hotel_name > div > p > span > em',
        is_info=True),
    Field(
Exemplo n.º 6
0
# -*- coding:utf-8 -*-

from spider.driver.travel.core.traveldriver import TravelDriver
from spider.driver.base.page import Page
from spider.driver.base.field import Fieldlist,Field,FieldName
from spider.driver.base.tabsetup import TabSetup
from spider.driver.base.listcssselector import ListCssSelector
from spider.driver.base.mongodb import Mongodb
import re
import time
import json
from pyquery import PyQuery

# Shop fields for one hotel list entry; several fields carry a regex (and
# sometimes a repl) alongside the selector.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='li.hotel_item_name > h2 > a',
        regex=r'^[\d]*(.*)$',
        repl=r'\1',
    ),
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='li.hotel_item_name > h2 > a',
        attr='href',
        regex=r'^([^\?]*)?.*$',
        repl=r'\1',
    ),
    Field(
        fieldname=FieldName.SHOP_ID,
        css_selector='li.hotel_item_name > h2 > a',
        attr='href',
        regex=r'^[^\?\d]*([\d]*).html?.*$',
        repl=r'\1',
    ),
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='li.pic_medal > div > a > img',
        attr='src',
    ),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='li.hotel_item_name > p.hotel_item_htladdress',
    ),
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='li.hotel_item_judge.no_comment > div.hotelitem_judge_box > a > span.hotel_value',
    ),
    Field(
        fieldname=FieldName.SHOP_STATISFACTION_PERCENT,
        css_selector='li.hotel_item_judge.no_comment > div.hotelitem_judge_box > a > span.total_judgement_score > span',
    ),
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='li.hotel_item_name > span',
        attr='innerHTML',
        regex=r'[^\d]*',
    ),
    Field(
        fieldname=FieldName.SHOP_ACTIVE_STATUS,
        css_selector='li.hotel_item_name > p.hotel_item_last_book',
    ),
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector='span.J_price_lowList',
    ),
    Field(
        fieldname=FieldName.SHOP_CATEGORY_NAME,
        css_selector='li.hotel_item_name > p.medal_list > span',
    ),
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='li.hotel_item_judge.no_comment > div.hotelitem_judge_box > a > span.hotel_judgement > span',
    ),
    Field(
        fieldname=FieldName.SHOP_GRADE_TEXT,
        css_selector='li.hotel_item_judge.no_comment > div.hotelitem_judge_box > a > span.recommend',
    ),
)


def get_recommend_all_room_dict(self, _str):
import json
from pyquery import PyQuery
import xmltodict
import math
import datetime
def get_zero(self, _str):
    """Return the float 0.0; both arguments are ignored."""
    zero = 0.0
    return zero
def get_shop_area(self, _str):
    """Return a fixed area name; both arguments are ignored."""
    area_name = '千岛湖东北湖区'
    return area_name
def get_baidu_spider_step(self, _str):
    """Return the constant string "2"; both arguments are ignored."""
    step = "2"
    return step

# Shop fields for one search-result entry. SHOP_LNG, SHOP_LAT, SHOP_AREA and
# BAIDU_SPIDER_STEP are declared with an empty selector and a filter_func.
fl_shop1 = Fieldlist(
    # Presumably sample absolute selectors copied from the page:
    # #card-56 > div > ul > li:nth-child(3) > div.cf.mb_5 > div.ml_30.mr_85 > div:nth-child(1) > span > a
    # #card-56 > div > ul > li.search-item.base-item > div.cf > div.ml_30.mr_90 > div:nth-child(1) > span:nth-child(1) > a
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='div.cf > div.ml_30 > div:nth-child(1) > span > a',
        is_info=True,
    ),
    # #card-56 > div > ul > li.search-item.base-item > div.cf > div.ml_30.mr_90 > div.row.addr > span
    # #card-56 > div > ul > li:nth-child(3) > div.cf.mb_5 > div.ml_30.mr_85 > div.row.addr > span
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='div.cf > div.ml_30 > div.row.addr > span',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='div.cf > div.col-r > div.img-wrap > a > img',
        attr='src',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_LNG,
        css_selector='',
        filter_func=get_zero,
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_LAT,
        css_selector='',
        filter_func=get_zero,
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_AREA,
        css_selector='',
        filter_func=get_shop_area,
        is_info=True,
    ),
    Field(
        fieldname=FieldName.BAIDU_SPIDER_STEP,
        css_selector='',
        filter_func=get_baidu_spider_step,
        is_info=True,
    ),
)

def get_shop_name(self, _str):
    """Store ``_str`` on ``self.shop_name`` and return it unchanged."""
    self.shop_name = _str
    return _str

fl_shop2 = Fieldlist(
Exemplo n.º 8
0
# -*- coding:utf-8 -*-

from spider.driver.travel.core.traveldriver import TravelDriver
from spider.driver.base.page import Page, NextPageCssSelectorSetup, PageFunc
from spider.driver.base.field import Fieldlist, Field, FieldName
from spider.driver.base.tabsetup import TabSetup
from spider.driver.base.listcssselector import ListCssSelector
from spider.driver.base.mongodb import Mongodb
import re
import time
import json
from pyquery import PyQuery
import xmltodict

fl_shop1 = Fieldlist(
    Field(fieldname=FieldName.SHOP_NAME,
          css_selector='div.search_ticket_title > h2 > a'),
    Field(fieldname=FieldName.SHOP_RATE,
          css_selector='div.search_ticket_title > h2 > span > span.rate'),
    Field(fieldname=FieldName.SHOP_URL,
          css_selector='div.search_ticket_title > h2 > a',
          attr='href'),
    Field(fieldname=FieldName.SHOP_IMG, css_selector='a > img', attr='src'),
    Field(fieldname=FieldName.SHOP_ADDRESS,
          css_selector='div.search_ticket_title > div.adress'),
    Field(fieldname=FieldName.SHOP_GRADE,
          css_selector='div.search_ticket_assess > span.grades > em'),
    Field(fieldname=FieldName.SHOP_COMMENT_NUM,
          css_selector='div.search_ticket_assess > span.grades',
          regex=r'^[^\(]*\(([\d]+)[^\)\d]*\)$',
          repl=r'\1'),
    Field(fieldname=FieldName.SHOP_FEATURE,
Exemplo n.º 9
0
from spider.driver.base.page import Page,NextPageCssSelectorSetup,PageFunc,NextPageLinkTextSetup
from spider.driver.base.listcssselector import ListCssSelector
from spider.driver.base.mongodb import Mongodb
from spider.driver.travel.core.traveldriver import TravelDriver
import time
from pyquery import PyQuery
import json
import re
import random
def get_shop_rate(self, _str):
    """Return an empty string; neither argument is read."""
    blank = ""
    return blank
def get_shop_grade(self, _str):
    """Return the constant string "0.0"; neither argument is read."""
    default_grade = "0.0"
    return default_grade
fl_shop1 = Fieldlist(
    ##_j_search_result_left > div:nth-child(1) > div > div:nth-child(1) > div.ct-text > h3 > a
    Field(fieldname=FieldName.SHOP_NAME, css_selector='div.ct-text > h3 > a', is_debug=True),
#_j_search_result_left > div:nth-child(1) > div > div:nth-child(2) > div.ct-text > h3 > a

    Field(fieldname=FieldName.SHOP_URL, css_selector='div > div.ct-text > h3 > a', attr='href', is_info=True),
    Field(fieldname=FieldName.SHOP_IMG, css_selector='div.flt1 > a > img', attr='src', is_info=True),
#_j_search_result_left > div:nth-child(1) > div > div:nth-child(1) > div.ct-text > div > p:nth-child(1)
    Field(fieldname=FieldName.SHOP_ADDRESS, css_selector='div.ct-text > ul > li:nth-child(1) > a',
          is_info=True),

     Field(fieldname=FieldName.SHOP_PRICE,css_selector='div.ct-text > ul > li.frt._j_hotel_ota > a > span.seg-price'),
    # 正则表达式不一样
#_j_search_result_left > div:nth-child(1) > div > div:nth-child(2) > div.ct-text > ul > li:nth-child(2) > a
    Field(fieldname=FieldName.SHOP_COMMENT_NUM, css_selector='div.ct-text > ul > li:nth-child(2) > a',
          is_info=True),
    Field(fieldname=FieldName.SHOP_FEATURE, css_selector='div.ct-text > div > p:nth-child(1)',
          is_info=True),
Exemplo n.º 10
0
from spider.driver.base.page import Page, NextPageCssSelectorSetup, PageFunc
from spider.driver.base.field import Fieldlist, Field, FieldName
from spider.driver.base.tabsetup import TabSetup
from spider.driver.base.listcssselector import ListCssSelector
from spider.driver.base.mongodb import Mongodb
from selenium import webdriver
from spider.driver.base.driver import *
import re
import time
import json
from pyquery import PyQuery
import xmltodict
#
fl_shop1 = Fieldlist(
    Field(fieldname=FieldName.SHOP_NAME,
          css_selector=
          'div.sight_item_detail.clrfix > div.sight_item_about > h3 > a'),
    # 5A景区
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector=
        'div.sight_item_detail.clrfix > div.sight_item_about > div.sight_item_info > div.clrfix > span.level'
    ),
    Field(fieldname=FieldName.SHOP_URL,
          css_selector=
          'div.sight_item_detail.clrfix > div.sight_item_about > h3 > a',
          attr='href'),
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector=
        'div.sight_item_detail.clrfix > div.sight_item_show > div.show loading > a > img',
import re
import random
import datetime
import math
from urllib import request
import demjson
import json


def get_shop_area(self, _str):
    """Return a fixed area name; both arguments are ignored."""
    area_name = '千岛湖乡村游景点'
    return area_name


# Shop fields for one list entry. SHOP_AREA reuses the SHOP_PHONE selector
# but is declared with filter_func=get_shop_area.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='dl > dd > a > h2',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='dl > dd > a',
        attr='href',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='dl > dd > div.tourListLeftListMsg > span:nth-child(1)',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_PHONE,
        css_selector='dl > dd > div.tourListLeftListMsg > span:nth-child(2)',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_AREA,
        css_selector='dl > dd > div.tourListLeftListMsg > span:nth-child(2)',
        filter_func=get_shop_area,
        is_info=True,
    ),
)
Exemplo n.º 12
0
from spider.driver.travel.core.traveldriver import TravelDriver
from spider.driver.base.page import Page, NextPageCssSelectorSetup, PageFunc
from spider.driver.base.field import Fieldlist, Field, FieldName
from spider.driver.base.tabsetup import TabSetup
from spider.driver.base.listcssselector import ListCssSelector
from spider.driver.base.mongodb import Mongodb
import re
import time
import json
from pyquery import PyQuery
import xmltodict
import math
import datetime

fl_shop1 = Fieldlist(
    Field(fieldname=FieldName.SHOP_NAME,
          css_selector='div.search_ticket_title > h2 > a'),
    Field(fieldname=FieldName.SHOP_RATE,
          css_selector='div.search_ticket_title > h2 > span > span.rate'),
    Field(fieldname=FieldName.SHOP_URL,
          css_selector='div.search_ticket_title > h2 > a',
          attr='href'),
    Field(fieldname=FieldName.SHOP_IMG, css_selector='a > img', attr='src'),
    Field(fieldname=FieldName.SHOP_ADDRESS,
          css_selector='div.search_ticket_title > div.adress'),
    Field(fieldname=FieldName.SHOP_GRADE,
          css_selector='div.search_ticket_assess > span.grades > em'),
    Field(fieldname=FieldName.SHOP_COMMENT_NUM,
          css_selector='div.search_ticket_assess > span.grades',
          regex=r'^[^\(]*\(([\d]+)[^\)\d]*\)$',
          repl=r'\1'),
    Field(fieldname=FieldName.SHOP_FEATURE,
Exemplo n.º 13
0
import math
import datetime


def get_shop_grade(self, _str):
    """Convert a percentage string such as "96%" to a 0-5 grade string.

    The numeric part is divided by 100, multiplied by 5 and rounded to one
    decimal place, e.g. "96%" -> "4.8". ``self`` is unused.

    Surrounding whitespace is ignored and the trailing '%' is optional; the
    previous unconditional ``_str[0:-1]`` slice dropped a real digit when
    the sign was missing. Raises ValueError when no number remains.
    """
    percent_text = _str.strip().rstrip('%')
    grade = round(float(percent_text) / 100 * 5, 1)
    return str(grade)


def get_shop_rate(self, _str):
    """Return an empty string; neither argument is read."""
    blank = ""
    return blank


fl_shop1 = Fieldlist(
    Field(fieldname=FieldName.SHOP_NAME,
          css_selector=
          'div.product-regular.clearfix > div.product-section > h3 > a',
          is_info=True),
    Field(fieldname=FieldName.SHOP_RATE,
          css_selector='',
          is_info=True,
          filter_func=get_shop_rate),
    Field(fieldname=FieldName.SHOP_URL,
          css_selector=
          'div.product-regular.clearfix > div.product-section > h3 > a',
          attr='href',
          is_info=True),
    Field(fieldname=FieldName.SHOP_IMG,
          css_selector=
          'div.product-regular.clearfix > div.product-left > a > img',
          attr='src',
          is_info=True),
Exemplo n.º 14
0
def get_shop_grade(self, _str):
    """Summarise the ``<li>`` items of an HTML fragment as a JSON object.

    Each ``li`` is classified by its text: one containing '分' stores a float
    under '评分', one containing '评价' stores an int under '评论数', and one
    containing '游记' stores an int under '游记数'. Only the first value per
    key is kept. Non-numeric characters are stripped before conversion.

    Returns the JSON text with non-ASCII characters left unescaped.
    """
    document = pq(_str)
    summary = {}
    for item in document('li').items():
        text = item.text()
        if '分' in text:
            score = float(re.sub(r'[^\d.]*', '', text))
            summary.setdefault('评分', score)
        elif '评价' in text:
            comment_count = int(re.sub(r'[^\d]*', '', text))
            summary.setdefault('评论数', comment_count)
        elif '游记' in text:
            note_count = int(re.sub(r'[^\d]*', '', text))
            summary.setdefault('游记数', note_count)
    return json.dumps(summary, ensure_ascii=False)


# Shop fields for one hotel list entry. SHOP_INTRO reads the innerHTML of the
# info list and passes it through get_shop_grade.
fl_shop1 = Fieldlist(
    Field(fieldname=FieldName.SHOP_NAME,
          css_selector='div.hotel-title > div > h3 > a'),
    Field(fieldname=FieldName.SHOP_RATE,
          css_selector='div.hotel-title > div > span.hotel-rate.rate5',
          attr='class',
          # Raw string: '\d' in a plain literal is an invalid escape sequence
          # (warned about by newer Pythons); the pattern value is unchanged.
          regex=r'[^\d]*',
          is_info=True),
    Field(fieldname=FieldName.SHOP_INTRO,
          css_selector='div.hotel-info > ul',
          attr="innerHTML",
          # NOTE(review): this is the string 'True', not the bool used by other
          # Field declarations; left as-is because Field's handling is not
          # visible here - confirm and switch to True if appropriate.
          is_debug='True',
          filter_func=get_shop_grade,
          is_info=True),
)

# fl_shop2 = Fieldlist(
#     Field(fieldname=FieldName.SHOP_ADDRESS, css_selector='div.container > div.hotel-intro > div.intro-hd > div.location > span', attr='title', offset=6, try_times=10, pause_time=1),
Exemplo n.º 15
0
    saveTo = round(float(_str[0:-1]) / 100 * 5, 1)
    return str(saveTo)


def get_shop_feature(self, _str):
    """Return an empty string; neither argument is read."""
    blank = ""
    return blank


def get_shop_rate(self, _str):
    """Return an empty string; neither argument is read."""
    blank = ""
    return blank


fl_shop1 = Fieldlist(
    Field(fieldname=FieldName.SHOP_NAME,
          css_selector='div.theinfo.ticket.clearfix > a > dl > dt > p > span',
          is_info=True),
    Field(fieldname=FieldName.SHOP_PRICE,
          css_selector=
          'div.theinfo.ticket.clearfix > a > div.priceinfo > span > em',
          is_info=False),
    #稍微有点问题
    Field(fieldname=FieldName.SHOP_URL,
          css_selector='div.theinfo.ticket.clearfix > a',
          attr='href',
          is_debug=True,
          is_info=False),
    #img还有些许问题
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='div.theinfo.ticket.clearfix > a > div.imgbox > div > img',
Exemplo n.º 16
0
# -*- coding:utf-8 -*-
from spider.driver.base.driver import Driver
from spider.driver.base.mysql import Mysql
import time
from pyquery import PyQuery
from spider.driver.base.field import Field, FieldName, Fieldlist, FieldType
from spider.driver.base.page import Page
from spider.driver.base.listcssselector import ListCssSelector
from spider.driver.base.mongodb import Mongodb
from spider.driver.base.tabsetup import TabSetup

# Field for the account list page: the name link, declared with a regex over
# everything outside the \u4e00-\u9fa5 range.
fl_weixin1 = Fieldlist(
    Field(
        fieldname='public_name',
        css_selector='div > div.txt-box > p.tit > a',
        regex=r'[^\u4e00-\u9fa5]*',
    ),
)

# Fields for the article list page: article title and its extra-info line.
fl_weixin2 = Fieldlist(
    Field(
        fieldname='article_name',
        css_selector='div > div > h4',
    ),
    Field(
        fieldname='article_time',
        css_selector='div > div > p.weui_media_extra_info',
    ),
)

# Page description binding fl_weixin1 to the list items under #main.
page_weixin_1 = Page(
    name='微信公众号列表页面',
    fieldlist=fl_weixin1,
    listcssselector=ListCssSelector(
        list_css_selector='#main > div.news-box > ul > li',
    ),
)

page_weixin_2 = Page(
    name='微信公众号文章列表页面',
    fieldlist=fl_weixin2,
    tabsetup=TabSetup(click_css_selector='div > div.txt-box > p.tit > a'),
Exemplo n.º 17
0
# -*- coding:utf-8 -*-

from spider.driver.travel.core.traveldriver import TravelDriver
from spider.driver.base.page import Page, NextPageCssSelectorSetup, PageFunc
from spider.driver.base.field import Fieldlist, Field, FieldName
from spider.driver.base.tabsetup import TabSetup
from spider.driver.base.listcssselector import ListCssSelector
from spider.driver.base.mongodb import Mongodb
import re
import time
import json
from pyquery import PyQuery

# Shop name as it appears in a list entry.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='div:nth-child(2) > span',
    ),
)

# Shop name located through an absolute selector starting at <body>.
fl_shop2 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='body > div > div.rax-scrollview > div > div:nth-child(1) > div > div:nth-child(1) > span',
    ),
)

# Comment fields; only the commenter's nickname is active.
fl_comment1 = Fieldlist(
    # Field(fieldname=FieldName.SHOP_NAME, css_selector='body > div > div.rax-scrollview > div > div:nth-child(1) > div > div:nth-child(1) > span'),
    Field(
        fieldname=FieldName.COMMENT_USER_NAME,
        css_selector='div.rate-info > div.avatar-info > div.user-nick',
    ),
)

page_shop_1 = Page(name='飞猪景点店铺列表页面',
                   fieldlist=fl_shop1,
Exemplo n.º 18
0
# -*- coding:utf-8 -*-

from spider.driver.base.field import Fieldlist,Field,FieldName
from spider.driver.base.tabsetup import TabSetup
from spider.driver.base.page import Page
from spider.driver.base.listcssselector import ListCssSelector
from spider.driver.base.mongodb import Mongodb
from spider.driver.travel.core.traveldriver import TravelDriver
import time
from pyquery import PyQuery
import json
import re

# Shop fields for one hotel list entry. SHOP_GRADE and SHOP_COMMENT_NUM share
# the same element and differ only in their regex.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='div > div.h_info_pic > a > img',
        attr='src',
    ),
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='div > div.h_info_text > div.h_info_base > p.h_info_b1 > a',
        attr='title',
    ),
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='div > div.h_info_text > div.h_info_base > p.h_info_b1 > a',
        attr='href',
    ),
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='div > div.h_info_text > div.h_info_base > p.h_info_b1 > b',
        attr='class',
        regex=r'[^\d]*',
    ),
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='div > div.h_info_text > div.h_info_comt',
        regex=r'^([\d.]*).*$',
        repl=r'\1',
    ),
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='div > div.h_info_text > div.h_info_comt',
        regex=r'^[\d.]*[^\d]*([\d]*)[^\d]*$',
        repl=r'\1',
    ),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='div > div.h_info_text > div.h_info_base > p.h_info_b2',
    ),
    Field(
        fieldname=FieldName.SHOP_ACTIVE_STATUS,
        css_selector='div > div.h_info_text > div.h_info_base > p.lastt_book',
    ),
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector='div > div.h_info_text > div.h_info_pri',
        regex=r'[^\d.]*',
    ),
)

def get_shop_room(self, _str):
    p = PyQuery(_str)
    room_list = []
    for i in p('div.hdetail_type > div.htype_list > div.htype_item').items():
        info_list = i('div.htype_info').text().split('\n')
        detail = info_list[3].split('|')
Exemplo n.º 19
0
    return ""
def get_shop_grade(self, _str):
    """Return an empty string; both arguments are ignored."""
    empty = ""
    return empty
def get_shop_feature(self, _str):
    """Return an empty string; both arguments are ignored."""
    empty = ""
    return empty
def get_shop_rate(self, _str):
    """Return an empty string; both arguments are ignored."""
    empty = ""
    return empty

def get_shop_comment_url(self, _str):
    """Build the comment-page URL from the first digit run in ``_str``.

    The first run of one to ten digits is taken as the shop id and inserted
    into "https://m.tuniu.com/h5/tour/comment/<id>/4". ``self`` is unused.
    Raises IndexError when ``_str`` contains no digit.
    """
    digit_runs = re.findall(r'([\d]{1,10})', _str)
    first_id = digit_runs[0]
    return "https://m.tuniu.com/h5/tour/comment/" + first_id + "/4"


fl_shop1 = Fieldlist(
    Field(fieldname=FieldName.SHOP_NAME, css_selector='a > div.search-scenic-content > h3'),
#\31 302 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2)
#\32 0808 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2)
    Field(fieldname=FieldName.SHOP_PRICE, css_selector='a > div.search-scenic-content > div.search-scenic-wrapper > div.search-scenic-price > span',is_info=True),
    #稍微有点问题
    Field(fieldname=FieldName.SHOP_URL,css_selector='a',attr='href', is_debug=True,is_info=True),
    #img还有些许问题
#\33 6822720 > div:nth-child(1) > div
    Field(fieldname=FieldName.SHOP_IMG, css_selector='a > div.img-container.lazy-img-box.fl > img', attr='src', is_info=True),
    Field(fieldname=FieldName.SHOP_ADDRESS, css_selector= '',filter_func=get_shop_address, is_info=True),
    #这里应该做一个转换
#\34 187 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(2) > span:nth-child(1)
    Field(fieldname=FieldName.SHOP_GRADE,css_selector='',filter_func=get_shop_grade, is_info=True),
    #正则表达式的使用有问题
    Field(fieldname=FieldName.SHOP_COMMENT_NUM,css_selector='a > div.search-scenic-content > div.search-scenic-wrapper > div.search-scenic-detail > p',is_info=True),
    #无shop_feature
Exemplo n.º 20
0

def get_shop_url(self, _str):
    """Return the text in ``_str`` matched by the pattern ``http.*html``.

    The search is case-insensitive and greedy, so the match runs from the
    first "http" to the last "html" it can reach. ``self`` is unused.
    Raises AttributeError when nothing matches (``re.search`` returns None).
    """
    found = re.search(r'http.*html', _str, re.M | re.I)
    matched_text = found.group()
    return str(matched_text)


def get_shop_rate(self, _str):
    """Return an empty string; neither argument is read."""
    blank = ""
    return blank


fl_shop1 = Fieldlist(
    Field(fieldname=FieldName.SHOP_NAME,
          css_selector=' dl > dt > a',
          is_debug=True),
    Field(fieldname=FieldName.SHOP_URL,
          css_selector='dl > dt > a',
          attr='onclick',
          filter_func=get_shop_url,
          is_info=True),
    Field(fieldname=FieldName.SHOP_IMG,
          css_selector=' a > img',
          attr='src',
          is_info=True),
    Field(fieldname=FieldName.SHOP_ADDRESS,
          css_selector='dl > dd.proInfo-address > i',
          is_info=True),
    Field(fieldname=FieldName.SHOP_COMMENT_NUM,
          css_selector='  div > div:nth-child(2) > ul > li:nth-child(2) > a',
Exemplo n.º 21
0
import re
import time
import json
from pyquery import PyQuery
import xmltodict


def get_shop_rate(self, _str):
    """Return an empty string; both arguments are ignored."""
    empty = ""
    return empty
def get_shop_feature(self, _str):
    """Return an empty string; both arguments are ignored."""
    empty = ""
    return empty
# Shop fields for one hotel list entry. SHOP_RATE and SHOP_FEATURE are
# declared with an empty selector and a filter_func.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector=' div > div.h_info > div.h_info_text > div.h_info_base > p.h_info_b1 > a > span.info_cn',
        attr='innerHTML',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='div > div.h_info_text > div.h_info_base > p.h_info_b1 > a',
        attr='href',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='div.h_info_pic > a > img',
        attr='big-src',
        is_info=True,
    ),
    # Original author's note: this one has some problems.
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='div > div.h_info_text > div.h_info_base > p.h_info_b2',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector='div > div.h_info_text > div.h_info_pri > p:nth-child(1) > a > span.h_pri_num',
        is_info=True,
    ),
    # Original author's note: this one has slight problems.
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='div > div.h_info_text > div.h_info_comt > a > span.c555.block.mt5',
    ),
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector=' div > div.h_info_text > div.h_info_comt > a > span.h_info_comt_bg > i.c37e',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='',
        filter_func=get_shop_rate,
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_FEATURE,
        css_selector='',
        filter_func=get_shop_feature,
        is_info=True,
    ),
)
# Empty field list.
fl_shop2 = Fieldlist()
# List page: one entry per element under #hotelContainer, using fl_shop1 and
# TravelDriver's shop collection.
page_shop_1 = Page(
    name='艺龙酒店店铺列表页面',
    fieldlist=fl_shop1,
    listcssselector=ListCssSelector(
        list_css_selector='#hotelContainer > div > div',
    ),
    mongodb=Mongodb(
        db=TravelDriver.db,
        collection=TravelDriver.shop_collection,
    ),
    is_save=True,
)
Exemplo n.º 22
0
import json
import re
import random

def get_shop_tag(self, _str):
    """Return the texts of all ``<span>`` elements but the first as JSON.

    ``_str`` is parsed with PyQuery; the first ``span`` is skipped and the
    text of each remaining one is collected in document order. The list is
    serialised with non-ASCII characters left unescaped. ``self`` is unused.
    """
    document = PyQuery(_str)
    spans_after_first = list(document('span').items())[1:]
    tags = [span.text() for span in spans_after_first]
    return json.dumps(tags, ensure_ascii=False)

def get_shop_rate(self, _str):
    """Parse ``_str`` as an integer, divide by ten and return it as a string.

    For example "45" becomes "4.5". ``self`` is unused. Raises ValueError
    when ``_str`` is not an integer literal.
    """
    tenths = int(_str)
    rate = float(tenths / 10)
    return str(rate)

# Shop fields for one hotel list entry. The last six fields are declared with
# an empty selector.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector='div.hotel-info-ctn > div.hotel-remark > div.price > p > strong',
    ),
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='div.hotel-info-ctn > div.hotel-info-main > h2 > a.hotel-name-link',
    ),
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='div.hotel-info-ctn > div.hotel-remark > div.remark > div > div > span',
        attr='class',
        regex=r'[^\d]*',
        filter_func=get_shop_rate,
    ),
    # Field(fieldname=FieldName.SHOP_TAG, css_selector='div.hotel-info-ctn > div.hotel-info-main > p.hotel-tags', attr='innerHTML', filter_func=get_shop_tag, pause_time=3),
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='',
        attr='href',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='',
        attr='src',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_FEATURE,
        css_selector='',
        is_info=True,
    ),
)

def get_shop_room_all(self, _str):
    p = PyQuery(_str)
    sale_dict = {}
Exemplo n.º 23
0
def get_shop_score(self, _str):
    """Return the first digit run in ``_str`` divided by ten, as a float.

    At most four consecutive digits are taken. ``self`` is unused. Raises
    IndexError when ``_str`` contains no digit.
    """
    digit_runs = re.findall(r'([\d]{1,4})', _str)
    raw_score = float(digit_runs[0])
    return raw_score / 10


def get_zero(self, _str):
    """Return the float 0.0; neither argument is read."""
    zero = 0.0
    return zero


def get_shop_site(self, _str):
    """Return ``self.shop_site``; ``_str`` is ignored."""
    site = self.shop_site
    return site


fl_shop1 = Fieldlist(
    Field(fieldname=FieldName.SHOP_NAME,
          css_selector='div.txt > div.tit > a > h4',
          is_info=True),
    Field(fieldname=FieldName.SHOP_URL,
          css_selector='div.txt > div.tit > a',
          attr='href',
          is_info=True),
    Field(fieldname=FieldName.SHOP_COMMENT_NUM,
          css_selector='div.txt > div.comment > a.review-num',
          attr='innerHTML',
          filter_func=get_zero,
          is_info=True),
    Field(fieldname=FieldName.SHOP_PRICE,
          css_selector='div.txt > div.comment > a.mean-price > b',
          attr='innerHTML',
          filter_func=get_zero,
          is_info=True),
Exemplo n.º 24
0
# -*- coding:utf-8 -*-

from spider.driver.base.field import Fieldlist, Field, FieldName
from spider.driver.base.tabsetup import TabSetup
from spider.driver.base.page import Page
from spider.driver.base.listcssselector import ListCssSelector
from spider.driver.base.mongodb import Mongodb
from spider.driver.travel.core.traveldriver import TravelDriver
import time
from pyquery import PyQuery
import json

# Shop fields for one hotel list entry; SHOP_RATE and SHOP_COMMENT_NUM are
# declared with a regex over non-digit characters.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='div.hotel-pic > a > img',
        attr='src',
    ),
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='div.hotel-title > div > h3 > a',
    ),
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='div.hotel-title > div > h3 > a',
        attr='href',
    ),
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='div.hotel-title > div > span.hotel-rate',
        attr='class',
        regex=r'[^\d]*',
    ),
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='div.hotel-info > ul > li.rating > em',
    ),
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='div.hotel-info > ul > li:nth-child(2) > a > em',
        regex=r'[^\d]*',
    ),
)
Exemplo n.º 25
0
from selenium.webdriver.remote.webelement import WebElement

from spider.driver.base.tabsetup import TabSetup
from spider.driver.base.field import Field, Fieldlist
from spider.driver.base.page import Page, PageGroup
from spider.driver.base.listcssselector import ListCssSelector
from spider.driver.base.mongodb import Mongodb

# Demo script: build two pages that share the same components, group them,
# and print the first item the group yields.
fl = Fieldlist(
    Field(fieldname=12),
    Field(fieldname=13),
)
mongo = Mongodb(db='122', collection='12')
lcs = ListCssSelector(list_css_selector=12)
tab = TabSetup(url_name=12)
p = Page(
    name=122,
    fieldlist=fl,
    mongodb=mongo,
    listcssselector=lcs,
    tabsetup=tab,
)
p1 = Page(
    name=123,
    fieldlist=fl,
    mongodb=mongo,
    listcssselector=lcs,
    tabsetup=tab,
)
pg = PageGroup(p, p1)
print(next(pg))
Exemplo n.º 26
0
# -*- coding:utf-8 -*-

from spider.driver.base.field import Fieldlist,Field,FieldName
from spider.driver.base.tabsetup import TabSetup
from spider.driver.base.page import Page
from spider.driver.base.listcssselector import ListCssSelector
from spider.driver.base.mongodb import Mongodb
from spider.driver.travel.core.traveldriver import TravelDriver
import time
from pyquery import PyQuery
import json

# Fields scraped for one shop row of a listing page.
# NOTE(review): selectors look relative to a single list row — confirm
# against the ListCssSelector this Fieldlist is paired with.
fl_shop1 = Fieldlist(
    # Name and current URL share the title anchor (text vs. `href`).
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='div > div.row-center > div > h5 > a',
    ),
    Field(
        fieldname=FieldName.SHOP_CURR_URL,
        css_selector='div > div.row-center > div > h5 > a',
        attr='href',
    ),
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector='div > div.row-left.fleft > a > img',
        attr='src',
    ),
    # Rate comes from the subtitle's `title` attribute; the regex matches
    # runs of non-digits (presumably stripped by Field — TODO confirm).
    Field(
        fieldname=FieldName.SHOP_RATE,
        css_selector='div > div.row-center > div > h5 > span.row-subtitle',
        attr='title',
        regex=r'[^\d]*',
    ),
    Field(
        fieldname=FieldName.SHOP_ACTIVE_STATUS,
        css_selector='div > div.row-center > div > p.row-someone-book > span',
    ),
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='div > div.row-sub-right.fright > a > p.score > span.value',
    ),
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='div > div.row-sub-right.fright > a > p.comment > span',
    ),
    # Same non-digit regex as SHOP_RATE, applied to the price text.
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector='div > div.row-right.fright > div.box-price > p > span.pi-price.pi-price-lg',
        regex=r'[^\d]*',
    ),
)

def get_room_all(self, _str):
    """Serialize the room rows found in an HTML fragment as a JSON string.

    :param _str: markup handed to ``PyQuery`` for parsing.
    :return: JSON array with one entry per matched room element; each entry
        is the element's whitespace-split text without its first token.
        Non-ASCII characters are emitted unescaped.
    """
    doc = PyQuery(_str)
    rows = doc('div.room-item-wrapper > div.room-item-inner > div:nth-child(1)')
    # Drop the leading token of every row and keep the rest as a list.
    rooms = [row.text().split()[1:] for row in rows.items()]
    return json.dumps(rooms, ensure_ascii=False)

def get_shop_intro(self, _str):
Exemplo n.º 27
0
def _get_shop_comment_num(self, _str):
    """Placeholder filter: ignore ``_str`` and always return an empty string."""
    return ""


def get_shop_url(self, _str):
    """Build the Taobao travel-ticket detail URL for a scenic id.

    :param _str: scenic id; converted with ``str()`` and used for both the
        ``scenicId`` and ``gsCallback`` query parameters.
    :return: the detail page URL as a string.
    """
    scenic_id = str(_str)
    base = 'https://market.m.taobao.com/apps/market/travelticket/detail.html?wh_weex=true&scenicId='
    callback = '&gsCallback='
    return base + scenic_id + callback + scenic_id


def get_shop_img(self, _str):
    """Placeholder filter: ignore ``_str`` and always return an empty string."""
    return ""


fl_shop1 = Fieldlist(
    Field(fieldname=FieldName.SHOP_NAME,
          css_selector='div:nth-child(2) > span'),
    #\31 302 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2)
    #\32 0808 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2)
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector=
        'div:nth-child(2) > div:nth-child(3) > div > div:nth-child(1) > span:nth-child(2)',
        is_info=True),
    #稍微有点问题
    Field(fieldname=FieldName.SHOP_URL,
          css_selector='',
          attr='id',
          filter_func=get_shop_url,
          is_debug=True,
          is_info=True),
    #img还有些许问题
Exemplo n.º 28
0
        p = PyQuery(_str)
    except Exception:
        return None
    return json.dumps([i.text() for i in p('span').items()][1:], ensure_ascii=False)

def get_shop_rate(self, _str):
    """Convert a rate expressed in tenths (e.g. ``"45"``) to a decimal string.

    :param _str: value accepted by ``int()``.
    :return: ``int(_str) / 10`` rendered as a float string, e.g. ``"4.5"``.
    :raises ValueError: if ``_str`` cannot be parsed as an integer.
    """
    tenths = int(_str)
    rate = float(tenths / 10)
    return str(rate)

def get_shop_subtype_name(self, _str):
    """Return ``_str`` with leading and trailing whitespace removed."""
    trimmed = _str.strip()
    return trimmed
def get_shop_feature(self, _str):
    """Placeholder filter: ignore ``_str`` and always return an empty string."""
    return ""
def get_comment_url(self, _str):
    """Return ``_str`` with the ``/review_all`` path segment appended."""
    suffix = "/review_all"
    return _str + suffix
# Fields scraped for one shop entry of the listing page.
# NOTE(review): selectors look relative to a single list item — confirm
# against the ListCssSelector this Fieldlist is paired with.
fl_shop1 = Fieldlist(
    Field(fieldname=FieldName.SHOP_NAME, css_selector='div.txt > div.tit > a > h4'),
    Field(fieldname=FieldName.SHOP_URL, css_selector='div.txt > div.tit > a', attr='href'),
    Field(fieldname=FieldName.SHOP_COMMENT_NUM, css_selector='div.txt > div.comment > a.review-num'),
    Field(fieldname=FieldName.SHOP_PRICE, css_selector='div.txt > div.comment > a.mean-price'),
    # Rate is read from the span's `class` attribute, run through the
    # non-digit regex, then passed to get_shop_rate.
    Field(fieldname=FieldName.SHOP_RATE, css_selector='div.txt > div.comment > span', attr='class',
          regex=r'[^\d]*', filter_func=get_shop_rate),
    Field(fieldname=FieldName.SHOP_ADDRESS, css_selector='div.txt > div.tag-addr > span.addr'),
    # Fix: read the image URL from `src`, like every other SHOP_IMG field in
    # this file; without `attr` no attribute of the <img> was requested.
    Field(fieldname=FieldName.SHOP_IMG, css_selector='div.pic > a > img', attr='src', is_info=True),
    # No selector: the value comes solely from get_shop_feature.
    Field(fieldname=FieldName.SHOP_FEATURE, css_selector='', filter_func=get_shop_feature, is_info=True),
    Field(fieldname=FieldName.SHOP_GRADE, css_selector='div.txt > span > span:nth-child(1) > b', is_info=True),
    # Comment URL is the shop `href` passed through get_comment_url.
    Field(fieldname=FieldName.SHOP_COMMENT_URL, css_selector='div.txt > div.tit > a', attr='href',
          filter_func=get_comment_url, is_info=True),
)

# Listing page definition: applies fl_shop1 to every element matched by the
# list selector and targets the TravelDriver shop collection in MongoDB.
page_shop_1 = Page(
    name='大众点评景点店铺列表页面',
    fieldlist=fl_shop1,
    listcssselector=ListCssSelector(
        list_css_selector='#shop-all-list > ul > li',
    ),
    mongodb=Mongodb(
        db=TravelDriver.db,
        collection=TravelDriver.shop_collection,
    ),
    is_save=True,
)

def get_shop_time(self, _str):
    try:
Exemplo n.º 29
0
    except Exception:
        return None
    return json.dumps([i.text() for i in p('span').items()][1:],
                      ensure_ascii=False)


def get_shop_rate(self, _str):
    """Convert a rate expressed in tenths (e.g. ``"45"``) to a decimal string.

    :param _str: value accepted by ``int()``.
    :return: ``int(_str) / 10`` rendered as a float string, e.g. ``"4.5"``.
    :raises ValueError: if ``_str`` cannot be parsed as an integer.
    """
    tenths = int(_str)
    rate = float(tenths / 10)
    return str(rate)


def get_shop_subtype_name(self, _str):
    """Return ``_str`` with leading and trailing whitespace removed."""
    trimmed = _str.strip()
    return trimmed


fl_shop1 = Fieldlist(
    Field(fieldname=FieldName.SHOP_NAME,
          css_selector='div.txt > div.tit > a > h4'),
    Field(fieldname=FieldName.SHOP_URL,
          css_selector='div.txt > div.tit > a',
          attr='href'),
    Field(fieldname=FieldName.SHOP_COMMENT_NUM,
          css_selector='div.txt > div.comment > a.review-num'),
    Field(fieldname=FieldName.SHOP_PRICE,
          css_selector='div.txt > div.comment > a.mean-price'),
    Field(fieldname=FieldName.SHOP_RATE,
          css_selector='div.txt > div.comment > span',
          attr='class',
          regex=r'[^\d]*',
          filter_func=get_shop_rate),
    Field(fieldname=FieldName.SHOP_TAG,
          css_selector='div.txt > span.comment-list',
          attr='innerHTML',
Exemplo n.º 30
0
# -*- coding:utf-8 -*-

from spider.driver.base.field import Fieldlist,Field,FieldName
from spider.driver.base.tabsetup import TabSetup
from spider.driver.base.page import Page,NextPageCssSelectorSetup,PageFunc,NextPageLinkTextSetup
from spider.driver.base.listcssselector import ListCssSelector
from spider.driver.base.mongodb import Mongodb
from spider.driver.travel.core.traveldriver import TravelDriver
import time
from pyquery import PyQuery
import json
import re
# Fields scraped for one shop entry of a listing page.
# NOTE(review): most selectors look relative to a single list item, while
# SHOP_FEATURE starts at `div > ul > li:nth-child(1)` — confirm it is intended.
fl_shop1 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='div > div.ct-text > h3 > a',
        is_debug=True,
    ),
    Field(
        fieldname=FieldName.SHOP_URL,
        css_selector='div > div.ct-text > h3 > a',
        attr='href',
    ),
    Field(
        fieldname=FieldName.SHOP_IMG,
        css_selector=' div > div.flt1 > a > img',
        attr='src',
    ),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='div > div.ct-text > ul > li:nth-child(1) > a',
    ),
    # SHOP_GRADE is currently disabled:
    #   Field(fieldname=FieldName.SHOP_GRADE,css_selector='div.search_ticket_assess > span.grades > em'),
    # The regex differs here: it captures the digits inside the parentheses
    # and `repl` refers back to that group (presumably a substitution —
    # TODO confirm against Field).
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='div > div.ct-text > ul > li:nth-child(2) > a',
        regex=r'^[^\(]*\(([\d]+)[^\)\d]*\)$',
        repl=r'\1',
    ),
    Field(
        fieldname=FieldName.SHOP_FEATURE,
        css_selector='div > ul > li:nth-child(1) > div > div.ct-text > p',
    ),
)

def get_shop_ticket():
    """Print the marker value ``111`` to standard output; returns nothing."""
    print(111)
def get_shop_info():
    """Print the marker value ``222`` to standard output; returns nothing."""
    print(222)
fl_shop2 = Fieldlist(
    Field(fieldname=FieldName.SHOP_PRICE, css_selector='body > div.container > div:nth-child(6) > div.mod.mod-detail > dl:nth-child(4) > dd > div:nth-child(1) > div', pause_time=3, is_focus=True, is_info=True),
    Field(fieldname=FieldName.SHOP_TIME, css_selector='body > div.container > div:nth-child(6) > div.mod.mod-detail > dl:nth-child(5) > dd > div:nth-child(1)', is_focus=True),