Python SpiderPathUrl示例，gl.SpiderPathUrl Python示例

示例#1

0

显示文件

    def parse(self, response):
        """Unwrap the Longzhu JSONP response and forward its items.

        The body is expected to be JSON wrapped in the fixed callback
        ``_callbacks_._36bxu1( ... )``.  The wrapper is stripped, the payload
        is decoded with ``json.loads`` and ``data.items`` is handed to
        ``SpiderPathUrl.processJSON`` together with the key names describing
        each field.

        An older XPath-based variant built around ``SpiderPathUrl.process``
        was kept here as a disabled string; it never ran and was removed.

        Args:
            response: the downloaded response; only ``body_as_unicode()`` is
                used.

        Raises:
            ValueError: if the unwrapped body is not valid JSON.
            KeyError: if the payload has no ``data``/``items`` entry.
        """
        responseStr = response.body_as_unicode()
        tripResponse = responseStr.replace("_callbacks_._36bxu1(", '')
        index = tripResponse.rfind(")")
        # Debug trace (pinyin for "position of the last ')'"), kept as-is.
        print("zuihou ) weizhi : " + str(index) + "\n")
        # rfind() yields -1 when no closing parenthesis exists; slicing with
        # -1 would silently drop the last character of an unwrapped body.
        if index != -1:
            tripResponse = tripResponse[0:index]

        resultJson = json.loads(tripResponse)

        # Key names of the per-item fields.  Dotted names presumably address
        # nested values and are resolved by processJSON -- TODO confirm.
        title_j = "channel.name"
        href_j = "channel.url"
        img_j = "preview"
        author_j = "channel.status"
        audian_j = "viewers"
        vkey = "channel.domain"
        url_prefix = ""
        vType_j = "game.0.name"
        SpiderPathUrl.processJSON(resultJson["data"]["items"], title_j, href_j,
                                  img_j, author_j, audian_j, url_prefix,
                                  vType_j, vkey, "longzhu", "")

示例#2

0

显示文件

 def parse(self, response):
     """Decode the Zhanqi JSON room list and forward it to SpiderPathUrl.

     The response body is parsed as JSON and the list stored under
     ``data.rooms`` is passed to ``SpiderPathUrl.processJSON`` along with
     the key names of the fields to read from every room.

     An older XPath-based variant built around ``SpiderPathUrl.process``
     was kept here as a disabled string; it never ran and was removed.

     Args:
         response: the downloaded response; only ``body_as_unicode()`` is
             used.

     Raises:
         ValueError: if the body is not valid JSON.
         KeyError: if the payload has no ``data``/``rooms`` entry.
     """
     resultJson = json.loads(response.body_as_unicode())
     # Key of the room list inside "data"; previously assigned but unused
     # while the same literal was repeated in the lookup below.
     root_j = "rooms"
     title_j = "title"
     href_j = "url"
     img_j = "bpic"
     author_j = "nickname"
     audian_j = "online"
     vkey = "id"
     url_prefix = "http://www.zhanqi.tv"
     vType_j = "gameName"
     SpiderPathUrl.processJSON(resultJson["data"][root_j], title_j, href_j,
                               img_j, author_j, audian_j, url_prefix,
                               vType_j, vkey, "zhanqi", "")

示例#3

0

显示文件

    def parse(self, response):
        """Decode the Panda.tv JSON listing and pass its items to SpiderPathUrl.

        The body is parsed as JSON; the list under ``data.items`` is given to
        ``SpiderPathUrl.processJSON`` with the key names of each field.  The
        ``id`` key is supplied twice: once as the link field and once as the
        record key.

        The original carried a disabled XPath-based variant as a string
        literal; it was never executed and is not reproduced here.
        """
        payload = json.loads(response.body_as_unicode())
        entries = payload["data"]["items"]

        # Dotted names presumably address nested values inside an item and
        # are resolved by processJSON -- TODO confirm.
        room_id_key = "id"
        site_root = "http://www.panda.tv/"

        SpiderPathUrl.processJSON(
            entries,
            "name",                  # title
            room_id_key,             # link field
            "pictures.img",          # image
            "userinfo.nickName",     # author
            "person_num",            # audience count
            site_root,
            "classification.cname",  # category
            room_id_key,             # record key
            "panda",
            "",
        )

示例#4

0

显示文件

 def parse(self, response):
     """Decode the Quanmin JSON listing and pass it to SpiderPathUrl.

     The body is parsed as JSON and the value under ``data`` is handed to
     ``SpiderPathUrl.processJSON`` with the key names of each field.  The
     ``slug`` key is used both as the link field and as the record key.
     """
     payload = json.loads(response.body_as_unicode())

     # The original noted "uid" next to the record key, apparently as an
     # alternative to "slug" -- unverified.
     slug_key = "slug"
     link_base = "http://www.quanmin.tv/v/"

     SpiderPathUrl.processJSON(
         payload["data"],
         "title",          # title
         slug_key,         # link field
         "thumb",          # image
         "nick",           # author
         "view",           # audience count
         link_base,
         "category_name",  # category
         slug_key,         # record key
         "quanming",
         "",
     )

示例#5

0

显示文件

文件： huya.py 项目： ColaMachine/calendar

 def parse(self, response):
     """Scrape the Huya listing page via XPath and pass it to SpiderPathUrl.

     Each ``li`` of the ``video-list`` inside a ``video-unit`` div is one
     entry.  The relative XPath expressions below are given to
     ``SpiderPathUrl.process`` together with the response.  The link
     expression is also used as the record key.
     """
     entry_xpath = '//div[@class="video-unit"]/ul[@class="video-list"]/li'
     link_xpath = './/a/@href'
     # Both the image and the viewer count sit inside the same span.
     info_span = './/span[@class="txt all_live_txt"]'

     SpiderPathUrl.process(
         response,
         entry_xpath,
         './/div[@class="all_live_tit"]/a/text()',  # title
         link_xpath,                                # link
         info_span + '/span/img/@src',              # image
         './/a/img/@alt',                           # author
         info_span + '/span[@class="num"]/i/text()',  # audience count
         "",                                        # URL prefix
         './/a/@eid_desc',                          # category
         link_xpath,                                # record key
         "huya",
         "",
     )

示例#6

0

显示文件

文件： douyu.py 项目： ColaMachine/calendar

    def parse(self, response):
        """Scrape the Douyu listing page via XPath and pass it to SpiderPathUrl.

        Every ``li`` element in the document is treated as one entry.  The
        relative XPath expressions below are given to
        ``SpiderPathUrl.process`` together with the response and the site's
        URL prefix.
        """
        entry_xpath = '//li'
        site_root = "http://www.douyu.com/"

        SpiderPathUrl.process(
            response,
            entry_xpath,
            './/h3[@class="ellipsis"]/text()',               # title
            './/a/@href',                                    # link
            './/img/@data-original',                         # image
            './/span[@class="dy-name ellipsis fl"]/text()',  # author
            './/span[@class="dy-num fr"]/text()',            # audience count
            site_root,
            './/span[@class="tag ellipsis"]/text()',         # category
            './/@data-rid',                                  # record key
            "douyu",
            "",
        )

示例#7

0

显示文件

#     def parse(self, response):
#         for href in response.css('.question-summary h3 a::attr(href)'):
#             full_url = response.urljoin(href.extract())
#             yield scrapy.Request(full_url, callback=self.parse_question)

#     def parse_question(self, response):
#         yield {
#             'title': response.css('h1 a::text').extract()[0],
#             'votes': response.css('.question .vote-count-post::text').extract()[0],
#             'body': response.css('.question .post-text').extract()[0],
#             'tags': response.css('.question .post-tag::text').extract(),
#             'link': response.url,
#         }
# Driver: build one crawler process from the project settings, prepare the
# storage, schedule every site spider once and run them to completion.
process = CrawlerProcess(get_project_settings())

# Project helpers; presumably they open the database connection and remove
# the rows of the previous run -- TODO confirm against SpiderPathUrl.
SpiderPathUrl.opendb()
SpiderPathUrl.clearDb()

# NOTE: the douyuStar spider was already disabled in the original and is
# intentionally not scheduled here.
process.crawl(huya)
process.crawl(douyu)
process.crawl(panda)
process.crawl(zhanqi)
# Scheduled exactly once; the original queued this spider a second time,
# which ran the same crawl twice in a single launch.
process.crawl(quanming)
process.crawl(longzhu)

process.start()