Example #1
    def parse(self, response):
        movies = response.xpath('//div[@class="info"]')

        for each in movies:
            # Create a fresh item per movie; reusing a single instance
            # would yield the same mutated object on every iteration.
            item = MyscrapyItem()
            # The paths must be relative ('./'): '//' would search the
            # whole document instead of the current node.
            title = each.xpath(
                './div[@class="hd"]/a/span[@class="title"]/text()').extract()
            content = each.xpath('./div[@class="bd"]/p/text()').extract()
            score = each.xpath(
                './div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
            ).extract()
            ratetotal = each.xpath(
                './div[@class="bd"]/div[@class="star"]/span[4]/text()'
            ).extract()
            info = each.xpath(
                './div[@class="bd"]/p[@class="quote"]/span[@class="inq"]/text()'
            ).extract()

            item['title'] = title[0]
            item['content'] = ';'.join(content)
            item['score'] = score[0]
            item['ratetotal'] = ratetotal[0]
            item['info'] = info[0]

            yield item

        # Paginate once per page, outside the item loop; inside it, a new
        # request would be yielded for every movie on the page.
        if self.start <= 225:
            self.start += 25
            yield scrapy.Request(self.url + str(self.start) + self.end,
                                 callback=self.parse)
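Every example on this page instantiates MyscrapyItem, but the class itself is never shown. A minimal items.py sketch, with field names inferred from Example #1 (the real project may declare more fields or different names):

import scrapy

class MyscrapyItem(scrapy.Item):
    # Field names inferred from Example #1; purely illustrative.
    title = scrapy.Field()
    content = scrapy.Field()
    score = scrapy.Field()
    ratetotal = scrapy.Field()
    info = scrapy.Field()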
Example #2
 def parse(self, response):
     i = 0
     for item in response.xpath(
             "//div[@class='seo-recommended-notes']/div"):
         i = i + 1
         myItem = MyscrapyItem()
         myItem['author'] = item.xpath(
             "./a[@class='author']/span/text()").extract()[0]
         print("作者:" + myItem['author'])
         myItem['author_icon_url'] = item.xpath(
             "./a[@class='author']/@href").extract()[0]
         myItem['blog_title'] = item.xpath(
             "./a[@class='title']/text()").extract()[0]
         print("标题:" + myItem['blog_title'])
         myItem['content_summary'] = item.xpath("./p/text()").extract()[0]
         print("内容:" + myItem['content_summary'])
         myItem['content_url'] = item.xpath(
             "./a[@class='title']/@href").extract()[0]
         print("url:" + self.base_url + myItem['content_url'])
         time.sleep(1)
         if myItem['content_url'] not in self.viewed:
             self.url = myItem['content_url']
             self.viewed.append(self.url)
             if (i == len(
                     response.xpath(
                         "//div[@class='seo-recommended-notes']/div"))):
                 self.page_order = self.page_order + 1
                 yield self.parse_more()
Example #3
    def parse(self, response):
        names = response.xpath(
            '//div[@class="channel-detail movie-item-title"]/@title').extract()
        scores = [
            score.xpath('string(.)').extract_first()
            for score in response.xpath(
                '//div[@class = "channel-detail channel-detail-orange"]')
        ]
        # for score in scores_div:
        #     scores.append(score.xpath('string(.)').extract_first())

        # use dictionary to push item
        # for name, score in zip(names, scores):
        #     # print(name, ':', score)
        #     yield {"name": name, "score": score}

        # use object to push item
        for name, score in zip(names, scores):
            # a fresh item per record, instead of mutating one shared instance
            item = MyscrapyItem()
            item['name'] = name
            item['score'] = score

            if response.url.find('catId=2') != -1:
                item['type'] = 'comedy'
            elif response.url.find('catId=3') != -1:
                item['type'] = 'romantic'
            yield item
Example #4
    def parse(self, response):
        # current_url = response.url  # the URL requested for this crawl
        # body = response.body  # the returned HTML
        # unicode_body = response.body_as_unicode()  # the returned HTML decoded to unicode
        selector = scrapy.Selector(response)
        # sites = hxs.select('//ul/li/div/a/img/@src').extract()
        books = selector.xpath('//div[@class="bd doulist-subject"]')
        for book in books:
            # A fresh item per book; a single shared instance would keep
            # being mutated after it was yielded.
            item = MyscrapyItem()
            title = book.xpath('div[@class="title"]/a/text()').extract()[0]
            rate = book.xpath('div[@class="rating"]/span[@class="rating_nums"]/text()').extract()[0]
            author = book.xpath('div[@class="abstract"]/text()').extract()[0]
            title = title.replace(' ', '').replace('\n', '')
            author = author.replace(' ', '').replace('\n', '')
            item['title'] = title
            item['rate'] = rate
            item['author'] = author
            # print(title)
            # print(rate)
            # print(author)
            yield item
        # Look up the next page once per response, not once per book.
        nextpage = selector.xpath('//span[@class="next"]/link/@href').extract()
        if nextpage:
            yield scrapy.http.Request(nextpage[0], callback=self.parse)
Example #5
    def parse(self, response):

        quotes = response.xpath("//div[@class='quote']/span[@class='text']")
        for x in quotes:
            item = MyscrapyItem()
            title = x.xpath(".//text()").get()
            item['title'] = title
            yield item
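Example #5 pulls the value with .get() where the other examples use .extract() / .extract_first(). On a selector list the two spellings below are equivalent; .get()/.getall() are the names recommended by recent Scrapy documentation:

            title = x.xpath(".//text()").get()            # first match or None
            title = x.xpath(".//text()").extract_first()  # same behaviour, older name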
Example #6
File: getinfo.py Project: fyandlyq/-
 def parse(self, response):
     movie = MyscrapyItem()
     movie['name'] = response.xpath(
         '//div[@id="content"]/h1/span[1]/text()')[0].extract()
     movie['director'] = response.xpath(
         '//div[@id="info"]/span[@class="attrs"]/text()')[0].extract()
     actor_list = []
     actor_list.extend(response.xpath('//span[@class="actor"]'))
     # The published snippet stops here without emitting anything;
     # yield the item so the parsed fields reach the pipeline.
     yield movie
Example #7
    def parse(self, response):
        # node_list rather than "list", which shadows the built-in
        node_list = response.xpath('//div[@class="li_txt"]')
        for element in node_list:
            item = MyscrapyItem()
            name = element.xpath('./h3/text()').extract()
            title = element.xpath('./h4/text()').extract()
            info = element.xpath('./p/text()').extract()

            item['name'] = name[0].strip()
            item['title'] = title[0].strip()
            item['info'] = info[0].strip()
            yield item
Example #8
    def parse(self, response):
        # print("请求:"+response.xpath("//ul[@class='note-list']/li").extract()[0])
        for i in response.xpath("//ul[@class='note-list']/li"):
            print("数量:" + str(
                len(response.xpath("//ul[@class='note-list']/li").extract())))
            item = MyscrapyItem()
            try:
                item['_id'] = i.xpath("./@id").extract()[0]
            except Exception as e:
                print(e)
                return
            try:
                # text()
                item['blog_title'] = i.xpath(
                    ".//div[@class='content']/a/text()").extract()[0]
            except Exception as e:
                print(e)
                return
            try:
                item['content_url'] = self.get_full_url(
                    i.xpath(".//div[@class='content']/a/@href").extract()[0])
            except Exception as e:
                item['content_url'] = ''

            try:
                item['content_summary'] = i.xpath(
                    ".//div[@class='content']/p/text()").extract()[0]

            except Exception as e:
                item['content_summary'] = ''
            try:
                item['content_figure_url'] = self.get_full_url(
                    i.xpath("./a/img/@src").extract()[0])
            except Exception as e:
                item['content_figure_url'] = ''
            try:
                item['author'] = i.xpath(
                    ".//div[@class='author']/div/a/text()").extract()[0]
            except Exception as e:
                item['author'] = ''
            try:
                item['date'] = i.xpath(
                    ".//div[@class='author']/div/span/@data-shared-at"
                ).extract()[0]
            except Exception as e:
                item['date'] = ''
            try:
                item['author_icon_url'] = self.get_full_url(
                    i.xpath(".//div[@class='author']/a/@href").extract()[0])
            except Exception as e:
                item['author_icon_url'] = ''
            yield item
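The per-field try/except blocks above guard against the IndexError raised by extract()[0] on an empty match. Since extract_first() returns None instead of raising, a small helper can collapse the pattern; safe_first is a hypothetical name, not part of the original spider:

    def safe_first(self, selector, xpath, default=''):
        # extract_first() returns None when nothing matches, so no
        # try/except is required; only the default needs handling.
        value = selector.xpath(xpath).extract_first()
        return value if value is not None else default

With it, each guarded block shrinks to a single line such as item['author'] = self.safe_first(i, ".//div[@class='author']/div/a/text()").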
Example #9
 def parse(self, response):
     qiubai = MyscrapyItem()
     # print 'start $$$$$$$$$$$$$$$$$$$$$$$'
     # print response.xpath('//div[@class="bai"]/a/@href|//div[@class="du"]/a/@href').extract()
     # print 'end $$$$$$$$$$$$$$$$$$$$$$$'
     a = 0
     for item in response.xpath(
             '//div[@class="bai"]/a/@href|//div[@class="du"]/a/@href'
     ).extract():
         if a > 1:
             break
         a = a + 1
         yield scrapy.Request(url=item, callback=self.second_parse)
Example #10
 def parse(self, response):
     items = []
     soup = BeautifulSoup(response.text, 'lxml')
     sectionList = soup.find_all(class_='stream-list__item')
     for section in sectionList:
         title = section.find(class_='title').a.get_text()
         quote = section.find(class_='excerpt wordbreak hidden-xs').get_text().strip()
         time = section.find(class_='col-xs-10').get_text().strip()[-12:-2]
         itemLoader = ItemLoader(item=MyscrapyItem(), response=response)
         itemLoader.add_value('title', title)
         itemLoader.add_value('quote', quote)
         itemLoader.add_value('time', time)
         items.append(itemLoader.load_item())
     return items
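ItemLoader.add_value collects every value into a list by default, so the items returned by Example #10 carry one-element lists rather than plain strings. A sketch of an item declaration that flattens them, assuming scalar fields are wanted (TakeFirst ships with Scrapy; older versions exposed it as scrapy.loader.processors.TakeFirst):

import scrapy
from itemloaders.processors import TakeFirst

class MyscrapyItem(scrapy.Item):
    # output_processor is read by ItemLoader when load_item() is called
    title = scrapy.Field(output_processor=TakeFirst())
    quote = scrapy.Field(output_processor=TakeFirst())
    time = scrapy.Field(output_processor=TakeFirst())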
Example #11
 def parse(self, response):
     tr_list = response.xpath(
         '//div[@class="greyframe"]/table[2]/tr/td/table/tr')
     for tr in tr_list:
         item = MyscrapyItem()
         item['num'] = tr.xpath('./td[1]/text()').extract_first()
         item['title'] = tr.xpath('./td[2]/a[2]/text()').extract_first()
         item['href'] = tr.xpath('./td[2]/a[2]/@href').extract_first()
         item['status'] = tr.xpath('./td[3]/span/text()').extract_first()
         item['name'] = tr.xpath('./td[4]/text()').extract_first()
         item['publish_date'] = tr.xpath('./td[5]/text()').extract_first()
         yield scrapy.Request(item['href'],
                              callback=self.parse_detail,
                              meta={'item': item})
     # Build the next-page request (pagination)
     next_url = response.xpath('//a[text()=">"]/@href').extract_first()
     if next_url is not None:
         yield scrapy.Request(next_url, callback=self.parse)
Example #12
File: mybots.py Project: jobs0414/TIL
    def parse(self, response):
        titles = response.xpath(
            '//*[@id="main_content"]/div[2]/ul/li/dl/dt[2]/a/text()').extract()
        authors = response.css('.writing::text').extract()
        previews = response.css('.lede::text').extract()

        items = []
        # loop
        for idx in range(len(titles)):
            item = MyscrapyItem()
            item['title'] = titles[idx]
            item['author'] = authors[idx]
            item['preview'] = previews[idx]

            items.append(item)

        return items
Example #13
    def parse_travellist(self, response):
        """得到每一个地点的游记列表"""
        # print("into parse_travellist************************************")
        if response.status == 200:
            print("enter")
            res = response.text
            content = json.loads(res)
            s = content.get('list', 0)
            page_info = content.get('page', 0)
            # Extract the request parameters for the next page
            next_re = re.compile(r'<a class="pi pg-next" href="/yj/(\d+)/1-0-(\d+).html" title')
            next_match = re.search(next_re, page_info) if page_info else None

            if s:
                s = "<html>" + s + "</html>"
                html = etree.HTML(s)
                lis = html.xpath('//div[@class="tn-item clearfix"]')
                for li in lis[:]:
                    href1 = li.xpath('.//a[@class="title-link"]//@href')[0]
                    title = li.xpath('.//a[@class="title-link"]/text()')#('./div[@class="tn-wrapper"]/dl/dt/a/text()')
                    content = li.xpath('./div[@class="tn-wrapper"]/dl/dd/a/text()')
                    zan = li.xpath('./div[@class="tn-wrapper"]/div/span[@class="tn-ding"]/em/text()')
                    user_name = li.xpath('./div[@class="tn-wrapper"]/div/span[@class="tn-user"]/a/text()')
                    item = MyscrapyItem()
                    item['title'] = title[0] if title else ''
                    item['content'] = content[0] if content else ''
                    item['zan'] = zan[0] if zan else ''
                    item['user_name'] = user_name[0] if user_name else ''
                    yield item
                    # url = 'http://www.mafengwo.cn' + href1
                    # yield Request(url, callback=self.parse_detail, dont_filter=False)
                if next_match:
                    next_page = next_match.group(1)  # value of the mddid parameter
                    next_num = next_match.group(2)  # value of the page parameter
                    every_page_params = self.params
                    every_page_params['mddid'] = next_page
                    every_page_params['page'] = next_num
                    yield FormRequest(self.travel_url, callback=self.parse_travellist,
                                      dont_filter=False, formdata=every_page_params)
            else:
                logging.warning(f"parse_travellist failed!: {response.status}")

    # def parse_detail(self, response):
    #     """每一个游记的详细内容"""
    #     print(response.text)
Example #14
 def parse(self, response):
     # Create an instance of the MyscrapyItem class
     item = MyscrapyItem()
     sectionList = response.xpath('//*[@id="all"]/div[1]/section').extract()
     for section in sectionList:
         bs = BeautifulSoup(section, 'lxml')
         articleDict = {}
         a = bs.find('a')
         articleDict['title'] = a.text
         articleDict['href'] = 'https://geekori.com/' + a.get('href')
         p = bs.find('p', class_='excerpt')
         articleDict['abstract'] = p.text
         # Assign values to the three fields of the MyscrapyItem object
         item['title'] = articleDict['title']
         item['href'] = articleDict['href']
         item['abstract'] = articleDict['abstract']
         # This example only saves the first scraped blog post's
         # information, so break out of the for loop after one iteration
         break
         # Return the MyscrapyItem object
     return item
Example #15
    def parse(self, response):
        items = []

        sectionList = response.xpath('//*[@id="all"]/div[1]/section').extract()
        for section in sectionList:
            bs = BeautifulSoup(section, 'lxml')
            articleDict = {}
            a = bs.find('a')

            articleDict['title'] = a.text
            articleDict['href'] = 'https://geekori.com/' + a.get('href')
            p = bs.find('p', class_='excerpt')
            articleDict['abstract'] = p.text
            itemLoader = ItemLoader(item=MyscrapyItem(), response=response)
            itemLoader.add_value('title', articleDict['title'])
            itemLoader.add_value('href', articleDict['href'])
            itemLoader.add_value('abstract', articleDict['abstract'])
            items.append(itemLoader.load_item())

        return items
Example #16
    def parse(self, response):
        # Start crawling
        print("spider start")
        # Select every <li> tag whose style attribute is _width:183px;
        node_list = response.xpath("//li[@style='_width:183px;']")

        # Iterate over the node list
        for node in node_list:
            # Create the item object used to store the information
            item = MyscrapyItem()
            # extract(): convert the XPath selectors to Unicode strings
            novelName = node.xpath("./a/@alt").extract()
            authorName = node.xpath("./a/label/text()").extract()
            novelContent = node.xpath("./a/@href").extract()

            # Lightly process the extracted values and put them into the item
            item['novelName'] = novelName[0].split(" ")[0]
            item['authorName'] = authorName[0]
            item['novelContent'] = "http://www.jjwxc.net/" + novelContent[0]

            yield item
Example #17
    def parse(self, response):
        item = MyscrapyItem()
        tr_list = response.xpath(
            "//div[@class='greyframe']/table[2]/tr/td/table/tr")
        for tr in tr_list:
            item["title"] = tr.xpath(
                "./td[2]/a[@class='news14']/@title").extract_first()
            item["href"] = tr.xpath(
                "./td[2]/a[@class='news14']/@href").extract_first()
            item["who"] = tr.xpath("./td[4]/text()").extract_first()
            item["time"] = tr.xpath("./td[5]/text()").extract_first()

            yield scrapy.Request(item["href"],
                                 callback=self.parse_detail,
                                 meta={"item": deepcopy(item)})
        next_url = response.xpath(
            "//div[@class='pagination']/a[text()='>']/@href").extract_first()
        if next_url is not None:
            yield scrapy.Request(next_url, callback=self.parse)
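Examples #11 and #17 hand the partially filled item to parse_detail via meta (the deepcopy in #17 protects the shared item instance from being mutated before the detail request runs). Neither snippet shows that callback; a hypothetical sketch, with the 'content' field and its XPath invented purely for illustration:

    def parse_detail(self, response):
        # Recover the item carried over from parse() through meta
        item = response.meta["item"]
        # Placeholder field and XPath, not from the original spider
        item["content"] = response.xpath("//div[@class='content']//text()").extract()
        yield item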
Example #18
File: hr.py Project: ATM0909/Spiders
    def parse(self, response):
        data = response.body
        soup = BeautifulSoup(data, "lxml")
        data = soup.find("table", class_="tablelist")

        trs = data.select("tr")

        for tr in trs[1:-2]:
            td = tr.select("td")
            item = MyscrapyItem()

            item["title"] = td[0].string
            item["position"] = td[3].string
            item["date"] = td[4].string

            yield item

        div = soup.find('div', class_="pagenav")
        next = div.find("a", id="next")
        next_url = next['href']
        # print(next_url)
        next_url = "https://hr.tencent.com/" + next_url
        yield scrapy.Request(next_url, callback=self.parse)
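Example #18 follows the "next" link unconditionally, which fails on the last page once the anchor is missing or inert. A defensive variant using the same selectors (the javascript-href check mirrors how such paginators commonly disable the link; not verified against this site):

        div = soup.find('div', class_="pagenav")
        next_a = div.find("a", id="next") if div else None
        if next_a and next_a.get('href') and not next_a['href'].startswith('javascript'):
            yield scrapy.Request("https://hr.tencent.com/" + next_a['href'],
                                 callback=self.parse)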
Example #19
    def parse(self, response):
        # print(response)
        content = response.body.decode('utf-8')
        # with open('youbain.html','w',encoding='utf-8')as fp:
        #     fp.write(content)
        # Extract the data:
        # print(content)
        tree = etree.HTML(content)

        cell_list = tree.xpath('//div[@class="cinema-cell"]')
        for cell in cell_list:
            # Cinema name:
            item = MyscrapyItem()

            name = cell.xpath('./div[@class="cinema-info"]/a/text()')[0]

            item['name'] = name

            # Address:
            adress = cell.xpath('./div[@class="cinema-info"]/p/text()')[0]

            item['adress'] = adress

            yield item
Example #20
    def parse(self, response):
        try:
            print('url :', response.url)
            urls = response.xpath(
                '/html/body/div[8]/div/div[3]/img/@src').extract()
            title = response.xpath(
                '/html/body/div[8]/div/div[3]/h1/text()').extract()

            item = MyscrapyItem()
            item['title'] = title
            item['urls'] = urls
            yield item

            page_urls = response.xpath(
                '/html/body/div[8]/div/ul/span/a/@href').extract()
            print(len(page_urls))
            for url in page_urls:
                if str(url).startswith('http') or str(url).startswith('HTTP'):
                    full_url = url
                else:
                    full_url = response.urljoin(url)
                yield Request(url=full_url, callback=self.parse)
        except Exception as e:
            print("ERROR :", e)
Example #21
 def parse(self, response):
     blogs = response.xpath("//div[@class='post_item_body']")
     for b_item in blogs:
         item = MyscrapyItem()
         item["title"] = b_item.xpath(
             "./h3/a[@class='titlelnk']/text()").extract()[0].strip()
         item["link"] = b_item.xpath(
             "./h3/a[@class='titlelnk']/@href").extract_first().strip()
         item["discribe"] = b_item.xpath(
             "./p[@class='post_item_summary']/text()").extract_first(
             ).strip()
         item["author"] = b_item.xpath(
             "./div[@class='post_item_foot']/a[@class='lightblue']/text()"
         ).extract_first().strip()
         item["comment"] = b_item.xpath(
             "./div[@class='post_item_foot']/span[@class='article_comment']/a[@class='gray']/text()"
         ).extract_first().strip()
         yield item
     next_links = response.xpath("//div[@class='pager']/a[last()]")
     for next_link in next_links:
         if next_link.xpath("./text()").extract_first() == 'Next >':
             next_link_href = next_link.xpath("./@href").extract()[0]
             yield self.make_requests_from_url(u"https://www.cnblogs.com" +
                                               next_link_href)
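make_requests_from_url was deprecated in Scrapy 1.4 and removed in later releases. A sketch of the same pagination using response.follow, which also resolves relative hrefs so the hard-coded host prefix becomes unnecessary:

     next_links = response.xpath("//div[@class='pager']/a[last()]")
     for next_link in next_links:
         if next_link.xpath("./text()").extract_first() == 'Next >':
             # response.follow accepts a relative href directly
             yield response.follow(next_link.xpath("./@href").extract_first(),
                                   callback=self.parse)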
Example #22
 def parse(self, response):
     for sel in response.xpath(
             '//div[@class="mod-info-flow"]/div/div[@class="mob-ctt"]'):
         item = MyscrapyItem()
         item['title'] = sel.xpath('h3/a/text()')[0].extract()
         print(item['title'])
         # The published snippet stops at the print; yield the item so
         # the scraped titles reach the pipeline.
         yield item