def parse_dir_contents(self, response):
        str1 = response.url.split("/")[3]
        filename = 'output11/' + str1 + '.html'
        with open(filename, 'wb') as f:
            f.write(response.body)
        hxs = HtmlXPathSelector(response)

        # extract the cost for the new format
        HDcost1 = hxs.xpath('//*[@class="dv-button-inner"]/text()').extract()
        del HDcost1[0]  # drop the leading button label
        HDcost1 = [cost.encode('utf-8') for cost in HDcost1]

        # extract the title for the new format
        title1 = hxs.xpath('//*[@id="aiv-content-title"]/text()').extract()
        title1 = [t.encode('utf-8').strip() for t in title1]
        title1 = filter(None, title1)  # drop empty strings

        # extract the release year for the new format
        relyear = hxs.xpath('//*[@class="release-year"]/text()').extract()
        relyear1 = relyear[0].encode('utf-8')
        relyear1 = relyear1.strip()

        # extract the running time for the new format
        times = hxs.xpath(
            '//*[@id="dv-dp-left-content"]/div[2]/div[2]/dl/dd[2]/text()'
        ).extract()
        time1 = times[0].strip()
        time1 = time1.encode('utf-8')

        # extract the director for the new format
        dir1 = response.xpath(
            '//*[@id="dv-center-features"]/div[1]/div/table/tr[2]/td/a/text()'
        ).extract()
        dir1 = dir1[0].encode('utf-8')
        dir1 = dir1.strip()

        # extract the starring actors
        actors = hxs.xpath(
            '//*[@id="dv-dp-left-content"]/div[2]/div[2]/dl/dd[1]/text()'
        ).extract()
        actors = actors[0].encode('utf-8')
        actors = actors.strip()

        yield DmozItem(
            title=title1,
            time=time1,
            cost=HDcost1,
            year=relyear1,
            director=dir1,
            star=actors,
        )
Example #2
 def parse(self, response):
     for sel in response.xpath('//ul/li'):
         item = DmozItem()
         item['title'] = sel.xpath('a/text()').extract()
         item['link'] = sel.xpath('a/@href').extract()
         item['desc'] = sel.xpath('text()').extract()
         yield item
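Every example on this page fills a DmozItem, which each project defines in its items.py. A minimal sketch of such an item, assuming the title/link/desc fields used above (other examples declare extra fields the same way):

import scrapy

class DmozItem(scrapy.Item):
    # one Field per key the spider assigns; assigning to an
    # undeclared key raises a KeyError
    title = scrapy.Field()
    link = scrapy.Field()
    desc = scrapy.Field()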
Example #3
    def parse(self, response):
        filename = response.url.split('/')[-2]
        with open(filename, 'wb') as f:
            f.write(response.body)
        # save the raw response body to a file

        sel = scrapy.selector.Selector(response)

        items = []

        title = sel.xpath('//div[@class="site-title"]/text()').extract()
        link = sel.xpath('//div[@class="title-and-desc"]/a/@href').extract()
        desc = sel.xpath('//div[@class="site-descr "]/text()').extract()
        desclist = desc[::2]  # every other text node is a description

        for j in range(len(title)):
            item = DmozItem()
            #instantiation
            item['title'] = title[j]
            item['link'] = link[j]
            item['desc'] = desclist[j]

            items.append(item)

        return items
Example #4
 def parse_item2(self,response):
     l = ItemLoader(item=DmozItem(), response=response)
     l.add_xpath('type','//div[@class="location ask_main_location"]/span[@class="fl"]/a[last()]/text()')
     l.add_xpath('type','//div[@class="question"]/h2/text()')
     l.add_xpath('answer','//div[@class="anwser"]/h2/text()')
     l.add_value('answer', '牛逼')  # append a literal value ("awesome") to the scraped answers
     yield l.load_item()
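Note on Example #4: an ItemLoader accumulates values per field, so the two add_xpath('type', ...) calls merge their matches into a single 'type' list, and add_value('answer', ...) appends the literal string alongside whatever the 'answer' XPath scraped.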
Example #5
    def parse(self, response):
        sel = Selector(response)
        # sites = sel.xpath('//div[@class="name"]/a')
        sites = sel.css('div.product-grid > div')
        items = []
        for site in sites:
            item = DmozItem()
            title = site.css('div.name > a::text').extract()[0]
            link = site.css('div.name > a::attr("href")').extract()[0]
            des = site.css('div.description::text').extract()[0]
            price = site.css('div.price::text').extract()[0].replace(' ','').replace('\n','').replace('\r','')

            item['title'] = title

            item['link'] = link
            # item['desc'] = des
            item['price'] = price
            items.append(item)
            yield http.Request(url=item["link"], meta={'item': item}, callback=self.parseDetail, dont_filter=True)
            # yield item

        next_pages = sel.xpath('//div[@class="links"]/a/@href').extract()
        if len(next_pages) >= 2:  # guard against a missing or short pager
            yield http.Request(next_pages[-2], callback=self.parse)
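Example #5 passes each partially filled item to self.parseDetail through the request's meta dict; that callback is not shown here. A minimal sketch of what it might look like, with a hypothetical selector for the detail page:

 def parseDetail(self, response):
     item = response.meta['item']  # recover the item passed via meta
     # hypothetical: pull the full description from the detail page
     item['desc'] = response.css('div.description::text').extract_first()
     yield item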
Example #6
 def start_requests(self):
     for i in range(3618, len(self.next_url)):
         item = DmozItem()
         item['tencent_index'] = i + 1
         yield scrapy.Request(self.next_url[i],
                              meta={'item': item},
                              callback=self.parse,
                              dont_filter=True)
Example #7
 def parse(self, response):
     for sel in response.xpath('//div[@class="title-and-desc"]'):
         item = DmozItem()
         item['title'] = sel.xpath('a/div/text()').extract()
         item['link'] = sel.xpath('a/@href').extract()
         item['desc'] = sel.xpath('div/text()').extract()
         yield item
Example #8
    def parse_tag(self, response):
        # print response.url
        urllk = urllib.unquote(response.url.replace("\\x", "%"))
        clsname = urllk.split('/')[-1]
        filename = "tag_app_" + clsname
        f = open(filename, "w")
        rx1 = ur"data-install=\"(.*?)\".*data-name=\"(.*?)\".*data-pn=\"(.*?)\""
        rx2 = ur"<li class=\"(.*?)\" data-pn=\"(.*?)\""
        driver = webdriver.Chrome(os.environ['webdriver.chrome.driver'])
        driver.get(response.url)
        for i in range(1, 100):
            try:
                driver.find_element_by_id("j-refresh-btn").click()
                time.sleep(1)
            except Exception:
                break  # refresh button gone: nothing more to load
        sou = driver.page_source
        cnt = 0
        name = ""
        hxs = Selector(text=sou)
        for sel in hxs.xpath('//*[@id="j-tag-list"]/li'):
            item = DmozItem()
            item['cls'] = clsname
            print sel
            try:
                if len(sel.xpath('a[@class="install-btn"]').extract()) > 0:
                    ma1 = re.search(
                        rx1,
                        sel.xpath('a[@class="install-btn"]').extract()[0])
                elif len(sel.xpath('a[@class="install-btn "]').extract()) > 0:
                    ma1 = re.search(
                        rx1,
                        sel.xpath('a[@class="install-btn "]').extract()[0])
                else:
                    continue

                if ma1:
                    cnt += 1
                    #print "install:",ma1.group(1), "name:", ma1.group(2), " pn:", ma1.group(3)
                    item['pn'] = ma1.group(3)
                    item['isc'] = ma1.group(1)
                    name = ma1.group(2)
                    #print item['pn'], " | ", item['cls'][0]
                    data = u' '.join(
                        (item['pn'], item['isc'])).encode('utf-8').strip()
                    #print data
                    f.write(data)
                    f.write(u'\n')
                else:
                    print 'not found!'
                    continue
            except Exception:
                continue  # skip malformed entries

        data = u"total found %d packages\n" % cnt
        print "cate:", clsname, " total:", data, "last app:", name
        f.write(data)
        driver.close()
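Example #8 illustrates the usual workaround for JavaScript-heavy listings: Scrapy schedules the URL, Selenium re-fetches it and clicks the refresh button until no more entries load, and the rendered driver.page_source is wrapped in Selector(text=sou) so ordinary XPath extraction works on the fully loaded page.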
Example #9
 def parse(self, response):
     sel = Selector(response)
     sites = sel.xpath('//body')
     items = []
     for site in sites:
         item = DmozItem()
         item['content'] = site.xpath('h1/text()').extract()  # relative to the matched <body>
         items.append(item)
     return items
Example #10
 def parse(self, response):
     for sel in response.xpath(
             '//*[@id="block-system-main"]/article/div[7]/div[1]/table/tbody/tr'
     ):
         item = DmozItem()
         item['Mashup_Name'] = sel.xpath('td[1]/a/text()').extract()
         item['Description'] = sel.xpath('td[2]/text()').extract()
         item['Category'] = sel.xpath('td[3]/a/text()').extract()
         yield item
Example #11
    def parse(self, response):
        for sel in response.xpath('//*[@id="mainContent"]/div/div[*]/div[2]'):
            print("惺惺惜惺惺想寻寻寻寻寻寻寻寻寻")

            item = DmozItem()
            item['title'] = sel.xpath('a/text()').extract()
            item['link'] = sel.xpath('a/@href').extract()

            yield item
Example #12
 def page_2(self, response):
     item = DmozItem()
     items = []
     item['name'] = response.xpath(
         '//div[@class="substance"]/text()').extract()
     item['desc'] = response.xpath(
         '//div[@class="report-text-surround"]/text()').extract()
     items.append(item)
     return items
Example #13
 def parse(self, response):
     title = response.xpath('//title/text()').extract()
     desc = response.xpath('//meta[@name="description"]/@content').extract()
     links = response.xpath('//link[@rel="dns-prefetch"]/@href').extract()
     item = DmozItem()
     item['title'] = title
     item['desc'] = desc
     item['link'] = links
     yield item
Example #14
 def parse(self, response):
     for sel in response.xpath('//div[@class="title-and-desc"]'):
         item = DmozItem()
         item['title'] = sel.xpath(
             'a/div[@class="site-title"]/text()').extract()  #.strip()
         item['link'] = sel.xpath('a/@href').extract()
         item['desc'] = sel.xpath('div/text()').extract()  #.strip()
         #print(title,link,desc)
         yield item
Example #15
 def parse(self, response):
     for sel in response.xpath('//div[@class="clsShow"]/div'):
         item = DmozItem()
         item['price'] = sel.xpath('div[@class="gPrice"]/text()').extract()
         item['address'] = sel.xpath(
             'div[@class="gAddress"]/text()').extract()
         item['name'] = sel.xpath('div[@class="gStatn"]/a/text()').extract()
         log.msg(item['name'])
         yield item
Example #16
    def parse(self, response):
        # chapter 1: CSS selectors
        # print response.body
        # print response.css('meta::attr(content)')[0].extract()
        # CSS pseudo-elements
        # for jscript in response.css('script::text'):
        #     print jscript.extract()
        #
        # for metalabel in response.css('meta::attr(content)'):
        #     print metalabel.extract()

        # chapter 2: Selector / XPath
        # sel=Selector(text=response.body,type="html")
        # for labelp in sel.xpath('//div[@class="RichContent-inner"]/span/p/text()'):
        #     print labelp.extract().encode('utf-8')

        # for labela in sel.xpath('//a'):
        #     item=DmozItem()
        #     item['title']=labela.xpath('@class').extract()
        #     item['link']=labela.xpath('@href').extract()
        #     yield item

        # parse the HTML with BeautifulSoup
        soup = BeautifulSoup(response.body, "lxml")
        # print soup.body.a  # get the first tag

        # print soup.body.a.contents[0].name

        # find all tags of a given kind
        # for lablesvg in soup.find_all('svg'):
        #     print lablesvg
        # print soup.p.string
        # print type(soup.p.string)
        # for labelp in soup.find_all('p'):
        #     print labelp.string
        # for str in labelp.stripped_strings:
        #     print str
        # the BeautifulSoup object has a special .name attribute whose value is "[document]"
        # print soup.name

        #find_all
        # print soup.find_all('img')[0]['src']
        # print soup.find_all(role='navigation')[0].a
        # print soup.find_all(role='navigation')[0].a.attrs
        # print soup.find_all(role='navigation')[0].a['class']
        # print soup.find_all(role='navigation')[0].a.string

        # rules: emit every image link, then follow every anchor
        for img in soup.find_all('img'):
            item = DmozItem()
            item['link'] = img['src']
            yield item
        for url in response.xpath('//a/@href').extract():
            if url.find('http') == -1:  # crude relative-URL check
                url = 'https://www.zhihu.com' + url
            yield scrapy.Request(url, callback=self.parse)
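Example #16 mixes BeautifulSoup (for the image scrape) with Scrapy's own response.xpath (for link discovery); both simply consume response.body, so they coexist fine. The url.find('http') test is only a crude relative-URL check; response.urljoin(url) would absolutize links more robustly.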
Example #17
 def parse_page2(self, response):
     for sel in response.css('.site-item '):
         item = DmozItem()
         # use relative XPath (.//) so each field is scoped to this
         # .site-item instead of the first match in the whole document
         item['title'] = sel.xpath('.//*[@class="title-and-desc"]/a/div[@class="site-title"]/text()') \
             .extract_first().strip()
         item['link'] = sel.xpath('.//*[@class="title-and-desc"]/a/@href'
                                  ).extract_first().strip()
         item['desc'] = sel.css(
             '.site-descr ::text').extract_first().strip()
         yield item
Example #18
 def parse(self, response):  # necessary entry point
     # filename = response.url.split("/")[-2]  # the URL ends in "/", so [-2] is Books or Resources
     # with open(filename, 'wb') as f:
     #     f.write(response.body)
     for sel in response.xpath('//ul/li'):
         item = DmozItem()  # the project's custom, dict-like item
         item['title'] = sel.xpath('a/text()').extract()
         item['link'] = sel.xpath('a/@href').extract()
         item['desc'] = sel.xpath('text()').extract()
         yield item
Example #19
    def parse(self, response):
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class="griditem"]')
        items = []
        for each in sites:
            item = DmozItem()
            item['title'] = each.select('h2[@class="title"]').extract()[0]
            item['link'] = each.select('div[@class="oper clear-fix shoptext"]').extract()[0]
            items.append(item)
        return items
Example #20
 def parse_item(self, response):
     sel = Selector(response)
     items = []
     item = DmozItem()
     title = sel.xpath('//*[@id="wrap"]/div[2]/div[1]/div[1]/div[1]/text()').extract()
     desc1 = sel.xpath('//*[@id="wrap"]/div[2]/div[1]/div[3]/div[2]/div[1]/text()').extract()
     item['title'] = [t.encode('utf-8') for t in title]
     item['desc1'] = [d.encode('utf-8') for d in desc1]
     items.append(item)
     return items
Example #21
    def parse_item(self, response):
        items = []

        item = DmozItem()
        item['title'] = response.xpath('//title/text()').extract()
        item['desc'] = response.xpath('//meta[@name="description"]/@content').extract()
        item['link'] = response.url
        print item
        items.append(item)
        return items
Example #22
 def parse_review(self, response):
     # https://manga.mipcdn.com/i/s/https://mhimg.eshanyao.com/ManHuaKu/y/yirenzhixia/1jiejie1/2019300523.jpg
     try:
         item = DmozItem()
         # image links
         item['link'] = response.xpath(
             '//div[@id="images"]/image/@src').extract()
         yield item
     except Exception as error:
         self.logger.error(error)  # report extraction failures via the spider's logger
Example #23
	def parse(self, response):
		# filename = response.url.split("/")[-2]
		# with open(filename, "wb") as f:
		# 	f.write(response.body)
		for sel in response.xpath('//div[@class="results browse-content"]//div[@class="site-item "]/div[@class="title-and-desc"]'):
			item = DmozItem()
			item['title'] = sel.xpath('a/div[@class="site-title"]/text()').extract()
			item['link'] = sel.xpath('a/@href').extract()
			item['desc'] = sel.xpath('div[@class="site-descr "]/text()').extract()
			yield item
Example #24
 def parse(self, response):
     # filename = response.url.split("/")[-2] + '.html'
     # with open(filename, 'wb') as f:
     # 	f.write(response.body)
     for sel in response.xpath('//ul/li'):
         item = DmozItem()
         item['title'] = sel.xpath('a/text()').extract()
         item['link'] = sel.xpath('a/@href').extract()
         item['desc'] = sel.xpath('text()').extract()
         yield item
Example #25
 def parse_old(self, response):
     for sel in response.xpath('//ul[@class="mulu_list"]/li'):
         item = DmozItem()
         item['title'] = sel.xpath('a/text()').extract()
         item['link'] = sel.xpath('a/@href').extract()
         title_text = item['title'][0]  # avoid shadowing the built-in str
         href = item['link'][0]
         url = response.urljoin(href)
         print url.encode('utf8')
         yield item
Example #26
    def parse(self, response):
        sel = Selector(response)
        sites = sel.xpath('//div')
        items = []
        for site in sites:
            item = DmozItem()
            item['title'] = site.xpath('img/@src').extract()
            items.append(item)
        return items
Example #27
    def parse(self, response):
        
        hxs = HtmlXPathSelector(response)
        sites = hxs.select('//div[@class=" orddata"]/ul')
        items = []
        f = open('texto.txt', 'w')
        for site in sites:
            item = DmozItem()
            item['titleVaga'] = ''.join(site.select('li//h2/text()').extract())
            item['link'] = ''.join(site.select('li/a[@class="vagaTitle"]/@href').extract())
            item['desc'] = ''.join(site.select('li[@class="vagaDesc"]/text()').extract())
            item['location'] = ''.join(site.select('li[@class="location2"]/text()').extract())
            item['area'] = ''.join(site.select('li[@class="area "]/span/text()').extract())
            items.append(item)

            f.write(', '.join(item.values()).encode('utf-8'))
        f.close()
        return items
Example #28
File: dmoz_spider.py Project: xctn/scrapy
 def parse(self, response):  
     sel = Selector(response)  
     sites = sel.xpath('//div[@id="righ_list"]/ul/li')  
     items = []  
     for site in sites:  
         item = DmozItem()  
         item['title'] = site.xpath('a/text()').extract()  
         item['link'] = site.xpath('a/@href').extract()  
         item['desc'] = site.xpath('text()').extract()  
         items.append(item)  
     return items 
Example #29
        def parse(self, response):
            test = HtmlXPathSelector(response)
            # sites = test.xpath("//div[@class='subjects-wrapper clearfix']")
            sites = test.xpath("//ul[@class='tlst clearfix']/li[@class='ilst']")
            items = []
            for site in sites:
                item = DmozItem()
                item['title'] = site.xpath('a/@title').extract()
                # item['link'] = site.xpath('a/@href').extract()
                # item['desc'] = site.xpath('text()').extract()
                items.append(item)
            return items  # return all items, not just the last one
Example #30
 def parse_url2(self, response):
     item = DmozItem()  # instantiate an item
     selector = Selector(response)  # build a selector
     title = selector.xpath("//div[@class='title']/h1/text()").extract()[0]  # title
     content = selector.xpath("//div[@id='content']//text()").extract()  # article body
     item['article_url'] = response.url
     item['article_title'] = title
     item['article_content'] = "".join(content)
     yield item
Example #31
    def parse(self, response):
        aside_nodes = response.xpath('//aside')
        for aside_node in aside_nodes:
            item = DmozItem()
            top_cat = aside_node.xpath('.//h2//a/text()').extract()
            sub_cat = aside_node.xpath('.//h3//a/text()').extract()

            item['top_cat'] = top_cat
            item['sub_cat'] = sub_cat

            yield item