# Example #1
    def parse(self, response):
        """Parse a history-channel listing page.

        Extracts (url, title) from each ``div.box_list clearfix`` entry,
        fetches the article body synchronously, and yields an ``IfengItem``
        per new article plus one pagination request per page.
        """
        soup = BeautifulSoup(response.body, "lxml")
        divs = soup.findAll('div', {'class': 'box_list clearfix'})
        for div in divs:
            item = IfengItem()
            link = div.find('h2').find('a')
            item['url'] = link['href']
            item['title'] = link['title']
            # NOTE(review): a blocking urllib fetch inside a Scrapy callback
            # stalls the reactor; a scrapy.Request with a detail-page callback
            # would be the idiomatic fix, but is kept as-is to preserve flow.
            response2 = urllib.urlopen(item['url'])
            soup2 = BeautifulSoup(response2, "lxml")
            # Guard: the article container may be absent; default to empty
            # content instead of crashing with AttributeError on None.
            content_div = soup2.find('div', {'id': 'artical_real'})
            item['content'] = content_div.get_text() if content_div is not None else ''
            item['label'] = 'history'
            if self.check(item['url']):
                yield item
        # Fix: pagination was inside the for-loop, yielding one duplicate
        # Request per article div; hoist it so it fires once per page.
        next_url = response.xpath(
            "//*[@id='pagenext'] /@href").extract()  # next-page link
        if next_url:
            yield scrapy.Request(next_url[0], callback=self.parse)
    def parse(self, response):
        """Parse a history-channel listing page (``con_lis show`` layout).

        Extracts (url, title) from each ``div.con_lis show`` card, fetches
        the article body synchronously, and yields an ``IfengItem`` per new
        article plus one pagination request per page.
        """
        soup = BeautifulSoup(response.body, "lxml")
        divs = soup.findAll('div', {'class': 'con_lis show'})
        for div in divs:
            item = IfengItem()
            item['url'] = div.find('a')['href']
            item['title'] = div.find('h4').get_text()
            # NOTE(review): blocking fetch inside a Scrapy callback stalls the
            # reactor; consider a scrapy.Request with a detail-page callback.
            response2 = urllib.urlopen(item['url'])
            soup2 = BeautifulSoup(response2, "lxml")
            # Guard: the article container may be absent; default to empty
            # content instead of crashing with AttributeError on None.
            content_div = soup2.find('div', {'id': 'yc_con_txt'})
            item['content'] = content_div.get_text() if content_div is not None else ''
            item['label'] = 'history'
            if self.check(item['url']):
                yield item
        # Fix: pagination was inside the for-loop, yielding one duplicate
        # Request per article div; hoist it so it fires once per page.
        next_url = response.xpath(
            "//*[@id='pagenext'] /@href").extract()  # next-page link
        if next_url:
            yield scrapy.Request(next_url[0], callback=self.parse)
# Example #3
 def parse_news(self, response):
     """Build an IfengItem from a news detail page.

     Merges the article title, joined paragraph text, original source url
     and the response url into the data dict carried in ``response.meta``,
     then yields it wrapped in an ``IfengItem``.
     """
     meta_data = response.meta['data']
     title = response.css(
         'div#artical h1#artical_topic::text').extract_first()
     paragraphs = response.css(
         'div#main_content.js_selection_area p::text').extract()
     source_url = response.css(
         'div#artical_sth.clearfix span.ss03 a::attr(href)').extract_first()
     meta_data.update({
         'content': '\n'.join(paragraphs),
         'source_url': source_url,
         'title': title,
         'response_url': response.url,
     })
     yield IfengItem(meta_data)
    def parse(self, response):
        """Parse a military-channel listing page (``leftList`` layout).

        Extracts (url, title) from each ``li`` under ``div.leftList``,
        fetches the article body synchronously, and yields an ``IfengItem``
        for every article that passes ``self.check``.
        """
        soup = BeautifulSoup(response.body, "lxml")
        root = soup.find('div', {'class': 'leftList'})
        lis = root.findAll('li')
        for li in lis:
            item = IfengItem()
            item['url'] = li.find('a')['href']
            item['title'] = li.get_text()
            # NOTE(review): blocking fetch inside a Scrapy callback stalls the
            # reactor; consider a scrapy.Request with a detail-page callback.
            response2 = urllib.urlopen(item['url'])
            soup2 = BeautifulSoup(response2, "lxml")
            try:
                content = soup2.find('div', {'id': 'Cnt-Main-Article-QQ'}).get_text()
                item['content'] = content
            except AttributeError as e:
                # Fix: the original `print AttributeError.message` raised its
                # own AttributeError (the class has no .message attribute).
                # Print the caught instance, and default the field so yielded
                # items always carry 'content'.
                print(e)
                item['content'] = ''
            item['label'] = 'military'
            if self.check(item['url']):
                yield item