Example #1
# -*- coding: utf-8 -*-
# Python 2 code (urllib.urlopen, str.encode). The import paths for the
# project's own IfengItem and Database helpers below are assumptions.
import traceback
import urllib

import scrapy
from bs4 import BeautifulSoup

from ifeng.items import IfengItem      # assumed project module
from ifeng.database import Database    # assumed project module


class IfengSpider(scrapy.Spider):
    name = "tech"
    start_urls = [
        "http://tech.ifeng.com/listpage/26344/1/list.shtml",  #问题来了
        "http://tech.ifeng.com/listpage/26333/1/list.shtml",  #车科技
        "http://tech.ifeng.com/listpage/26335/1/list.shtml",  #可穿戴
        "http://tech.ifeng.com/listpage/26334/1/list.shtml",  #智慧家庭
        "http://digi.ifeng.com/listpage/4085/1/list.shtml",  #手机
        "http://digi.ifeng.com/listpage/11143/1/list.shtml",  #苹果
        "http://digi.ifeng.com/listpage/11148/1/list.shtml",  #平板
        "http://digi.ifeng.com/listpage/2689/1/list.shtml",  #笔记本
        "http://digi.ifeng.com/listpage/5098/1/list.shtml",  #影像
    ]

    def parse(self, response):
        # Parse the listing page with BeautifulSoup.
        soup = BeautifulSoup(response.body, "lxml")
        divs = soup.findAll('div', {'class': 'box_list clearfix'})
        for div in divs:
            # Extract title, url and content for each entry.
            item = IfengItem()
            h2 = div.find('h2')
            link = h2.find('a')
            item['url'] = link['href']
            item['title'] = link['title']
            # Fetch the article page itself (blocking urllib call).
            response2 = urllib.urlopen(item['url'])
            soup2 = BeautifulSoup(response2, "lxml")
            content = soup2.find('div', {'id': 'artical_real'}).get_text()
            item['content'] = content
            item['label'] = 'technology'
            if self.check(item['url']):  # skip URLs already in the database
                yield item
        # Follow the next-page (pagination) link; this belongs outside the
        # loop so that only one request is issued per listing page.
        next_url = response.xpath("//*[@id='pagenext']/@href").extract()
        if next_url:
            yield scrapy.Request(next_url[0], callback=self.parse)

    def check(self, url):
        """Return True if `url` is not yet stored in the news table."""
        self.database = Database()
        self.database.connect('crawl_data')
        sql = "SELECT * FROM news WHERE url=%s ORDER BY url"
        str_article_url = url.encode('utf-8')
        data = (str_article_url,)
        try:
            search_result = self.database.query(sql, data)
            if search_result == ():
                self.database.close()
                return True
        except Exception as e:
            print(e)
            traceback.print_exc()
        self.database.close()
        return False
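
A note on the pattern above: `urllib.urlopen` inside `parse` blocks Scrapy's event loop while each article downloads. Below is a minimal sketch of the non-blocking alternative, chaining a second `scrapy.Request` and passing the half-built item through `meta`; the spider and callback names are illustrative, not part of the original project.

import scrapy
from bs4 import BeautifulSoup


class TechSpiderNonBlocking(scrapy.Spider):
    # Illustrative variant of Example #1: Scrapy fetches the article too.
    name = "tech_nonblocking"
    start_urls = ["http://tech.ifeng.com/listpage/26344/1/list.shtml"]

    def parse(self, response):
        soup = BeautifulSoup(response.body, "lxml")
        for div in soup.findAll('div', {'class': 'box_list clearfix'}):
            link = div.find('h2').find('a')
            item = {'url': link['href'], 'title': link['title'],
                    'label': 'technology'}
            # Let Scrapy schedule the article download; carry the item along.
            yield scrapy.Request(link['href'], callback=self.parse_article,
                                 meta={'item': item})

    def parse_article(self, response):
        item = response.meta['item']
        soup = BeautifulSoup(response.body, "lxml")
        body = soup.find('div', {'id': 'artical_real'})
        if body is not None:
            item['content'] = body.get_text()
            yield item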

Example #2
# (imports as in Example #1)
class IfengSpider(scrapy.Spider):
    name = "cul"
    start_urls = [
        "http://culture.ifeng.com/listpage/59669/1/list.shtml",  #眼界
        "http://culture.ifeng.com/listpage/59668/1/list.shtml",  #艺文
        "http://culture.ifeng.com/listpage/59667/1/list.shtml",  #思想
        "http://culture.ifeng.com/listpage/59665/1/list.shtml",  #文学
        "http://culture.ifeng.com/listpage/59664/1/list.shtml",  #热点
    ]

    def parse(self, response):
        # Parse the listing page with BeautifulSoup.
        soup = BeautifulSoup(response.body, "lxml")
        divs = soup.findAll('div', {'class': 'box_list clearfix'})
        for div in divs:
            # Extract title, url and content for each entry.
            item = IfengItem()
            h2 = div.find('h2')
            link = h2.find('a')
            item['url'] = link['href']
            item['title'] = link['title']
            # Fetch the article page itself (blocking urllib call).
            response2 = urllib.urlopen(item['url'])
            soup2 = BeautifulSoup(response2, "lxml")
            content = soup2.find('div', {'id': 'main_content'}).get_text()
            item['content'] = content
            item['label'] = 'culture'
            if self.check(item['url']):  # skip URLs already in the database
                yield item
        # Follow the next-page (pagination) link; outside the loop so that
        # only one request is issued per listing page.
        next_url = response.xpath("//*[@id='pagenext']/@href").extract()
        if next_url:
            yield scrapy.Request(next_url[0], callback=self.parse)

    def check(self, url):
        """Return True if `url` is not yet stored in the news table."""
        self.database = Database()
        self.database.connect('crawl_data')
        sql = "SELECT * FROM news WHERE url=%s ORDER BY url"
        str_article_url = url.encode('utf-8')
        data = (str_article_url,)
        try:
            search_result = self.database.query(sql, data)
            if search_result == ():
                self.database.close()
                return True
        except Exception as e:
            print(e)
            traceback.print_exc()
        self.database.close()
        return False
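
Since every spider here is selected by its `name` attribute, several of them can also be driven from one script with Scrapy's `CrawlerProcess`. A hedged sketch, assuming the two classes above have been renamed `TechSpider` and `CultureSpider` so they can coexist in one module:

from scrapy.crawler import CrawlerProcess

# TechSpider and CultureSpider stand for the classes above (renamed,
# since two classes in one module cannot both be called IfengSpider).
process = CrawlerProcess(settings={
    "FEEDS": {"items.jl": {"format": "jsonlines"}},
})
process.crawl(TechSpider)
process.crawl(CultureSpider)
process.start()  # blocks until both crawls finish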

Example #3
# (imports as in Example #1)
class IfengSpider(scrapy.Spider):
    name = "history1"
    start_urls = [
        "http://news.ifeng.com/listpage/71096/1/list.shtml",  #假设历史
        "http://news.ifeng.com/listpage/41708/1/list.shtml",  #凤凰历史
        "http://news.ifeng.com/listpage/70296/1/list.shtml",  #兰台说史
    ]

    def parse(self, response):
        # Parse the listing page with BeautifulSoup.
        soup = BeautifulSoup(response.body, "lxml")
        divs = soup.findAll('div', {'class': 'con_lis show'})
        for div in divs:
            # Extract title, url and content for each entry.
            item = IfengItem()
            item['url'] = div.find('a')['href']
            item['title'] = div.find('h4').get_text()
            # Fetch the article page itself (blocking urllib call).
            response2 = urllib.urlopen(item['url'])
            soup2 = BeautifulSoup(response2, "lxml")
            content = soup2.find('div', {'id': 'yc_con_txt'}).get_text()
            item['content'] = content
            item['label'] = 'history'
            if self.check(item['url']):  # skip URLs already in the database
                yield item
        # Follow the next-page (pagination) link; outside the loop so that
        # only one request is issued per listing page.
        next_url = response.xpath("//*[@id='pagenext']/@href").extract()
        if next_url:
            yield scrapy.Request(next_url[0], callback=self.parse)

    def check(self, url):
        """Return True if `url` is not yet stored in the news table."""
        self.database = Database()
        self.database.connect('crawl_data')
        sql = "SELECT * FROM news WHERE url=%s ORDER BY url"
        str_article_url = url.encode('utf-8')
        data = (str_article_url,)
        try:
            search_result = self.database.query(sql, data)
            if search_result == ():
                self.database.close()
                return True
        except Exception as e:
            print(e)
            traceback.print_exc()
        self.database.close()
        return False
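
Note that `check` reconnects to MySQL once per article. A minimal sketch of connecting once per crawl instead, using the spider lifecycle; `Database` and its `connect`/`query`/`close` methods are the project's own helper exactly as used above, and the class name is illustrative:

import scrapy


class DedupSpider(scrapy.Spider):
    # Sketch: one DB connection for the whole crawl, not one per URL.
    name = "dedup_sketch"

    def __init__(self, *args, **kwargs):
        super(DedupSpider, self).__init__(*args, **kwargs)
        self.database = Database()           # project helper, as above
        self.database.connect('crawl_data')

    def closed(self, reason):
        # Scrapy calls closed() automatically when the spider finishes.
        self.database.close()

    def check(self, url):
        # An existence test only needs one row back.
        sql = "SELECT 1 FROM news WHERE url=%s LIMIT 1"
        return self.database.query(sql, (url.encode('utf-8'),)) == ()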
Example #4
    def check(self, url):
        """Return True if `url` is not yet stored in the news table."""
        self.database = Database()
        self.database.connect('crawl_data')
        sql = "SELECT * FROM news WHERE url=%s ORDER BY url"
        str_article_url = url.encode('utf-8')
        data = (str_article_url,)
        try:
            search_result = self.database.query(sql, data)
            if search_result == ():
                self.database.close()
                return True
        except Exception as e:
            print(e)
            traceback.print_exc()
        self.database.close()
        return False
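
The `%s` placeholder plus a separate parameter tuple is driver-level parameterization, so the URL is never spliced into the SQL string. For reference, the same existence check written directly against `pymysql`; the connection details and the `news` table layout are assumptions based on the code above:

import pymysql


def url_already_crawled(url):
    # Return True if `url` is already stored in crawl_data.news.
    conn = pymysql.connect(host='localhost', user='root', password='',
                           db='crawl_data', charset='utf8mb4')  # assumed
    try:
        with conn.cursor() as cur:
            cur.execute("SELECT 1 FROM news WHERE url=%s LIMIT 1", (url,))
            return cur.fetchone() is not None
    finally:
        conn.close()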

Example #5
# (imports as in Example #1)
class IfengSpider(scrapy.Spider):
    name = "sports"
    start_urls = [
        "http://sports.ifeng.com/listpage/11244/1/list.shtml", #中国足球
        "http://sports.ifeng.com/listpage/31190/1/list.shtml", #中超
        "http://sports.ifeng.com/listpage/35586/1/list.shtml", #亚冠
        "http://sports.ifeng.com/listpage/31186/1/list.shtml",#英超
        "http://sports.ifeng.com/listpage/31188/1/list.shtml", #西甲
        "http://sports.ifeng.com/listpage/31198/1/list.shtml",#欧冠
        "http://sports.ifeng.com/listpage/31419/1/list.shtml",#田径
        "http://sports.ifeng.com/listpage/31418/1/list.shtml",#网球
        "http://sports.ifeng.com/listpage/34120/1/list.shtml",#排球
        "http://sports.ifeng.com/listpage/11246/1/list.shtml",#国际足球
        "http://sports.ifeng.com/listpage/31194/1/list.shtml",#CBA
        "http://sports.ifeng.com/listpage/11247/1/list.shtml",#综合

    ]

    def parse(self, response):
        # Parse the listing page with BeautifulSoup.
        soup = BeautifulSoup(response.body, "lxml")
        divs = soup.findAll('div', {'class': 'box_list clearfix'})
        for div in divs:
            # Extract title, url and content for each entry.
            item = IfengItem()
            h2 = div.find('h2')
            link = h2.find('a')
            item['url'] = link['href']
            item['title'] = link['title']
            # Fetch the article page itself (blocking urllib call).
            response2 = urllib.urlopen(item['url'])
            soup2 = BeautifulSoup(response2, "lxml")
            content = soup2.find('div', {'id': 'artical_real'}).get_text()
            item['content'] = content
            item['label'] = 'sports'
            if self.check(item['url']):  # skip URLs already in the database
                yield item
        # Follow the next-page (pagination) link; outside the loop so that
        # only one request is issued per listing page.
        next_url = response.xpath("//*[@id='pagenext']/@href").extract()
        if next_url:
            yield scrapy.Request(next_url[0], callback=self.parse)

    def check(self, url):
        """Return True if `url` is not yet stored in the news table."""
        self.database = Database()
        self.database.connect('crawl_data')
        sql = "SELECT * FROM news WHERE url=%s ORDER BY url"
        str_article_url = url.encode('utf-8')
        data = (str_article_url,)
        try:
            search_result = self.database.query(sql, data)
            if search_result == ():
                self.database.close()
                return True
        except Exception as e:
            print(e)
            traceback.print_exc()
        self.database.close()
        return False
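
The tech, cul, and sports spiders differ only in `start_urls`, the id of the content `div`, and the label, so the shared logic could live in a base class. A sketch under that assumption; class and spider names are illustrative:

import scrapy
from bs4 import BeautifulSoup


class IfengListSpider(scrapy.Spider):
    # Shared list-page logic; subclasses override the varying pieces.
    content_div_id = 'artical_real'
    label = None

    def parse(self, response):
        soup = BeautifulSoup(response.body, "lxml")
        for div in soup.findAll('div', {'class': 'box_list clearfix'}):
            link = div.find('h2').find('a')
            yield scrapy.Request(link['href'], callback=self.parse_article,
                                 meta={'title': link['title']})
        next_url = response.xpath("//*[@id='pagenext']/@href").extract()
        if next_url:
            yield scrapy.Request(next_url[0], callback=self.parse)

    def parse_article(self, response):
        soup = BeautifulSoup(response.body, "lxml")
        yield {
            'url': response.url,
            'title': response.meta['title'],
            'content': soup.find('div', {'id': self.content_div_id}).get_text(),
            'label': self.label,
        }


class SportsSpider(IfengListSpider):
    name = "sports_v2"  # illustrative
    label = 'sports'
    start_urls = ["http://sports.ifeng.com/listpage/11244/1/list.shtml"]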

Example #6
# (imports as in Example #1)
class IfengSpider(scrapy.Spider):
    name = "mil3"
    start_urls = [
        "http://news.qq.com/l/milite/milgn/list2010122872223.htm", #国内军情
        "http://news.qq.com/l/milite/milhqj/list2010122872321.htm",#环球军情
        "http://news.qq.com/l/milite/junbei/list2012095132410.htm",#军备动态
    ]
    base = "http://news.qq.com/l/milite/junbei/list2012095132410_"#80.htm
    for i in range(2,51):
        url = base +str(i)+".htm"
        start_urls.append(url)
    base = "http://news.qq.com/l/milite/milhqj/list2010122872321_"#80.htm
    for i in range(2,333):
        url = base +str(i)+".htm"
        start_urls.append(url)
    base = "http://news.qq.com/l/milite/milgn/list2010122872223_"#80.htm
    for i in range(2,335):
        url = base +str(i)+".htm"
        start_urls.append(url)
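
Building `start_urls` in the class body works, but the idiomatic Scrapy hook for generated URLs is `start_requests`. The same three qq.com sections, sketched that way; the spider name is illustrative:

import scrapy


class QQMilSpider(scrapy.Spider):
    name = "mil3_generated"  # illustrative

    def start_requests(self):
        sections = [
            ("http://news.qq.com/l/milite/milgn/list2010122872223", 335),
            ("http://news.qq.com/l/milite/milhqj/list2010122872321", 333),
            ("http://news.qq.com/l/milite/junbei/list2012095132410", 51),
        ]
        for base, last in sections:
            yield scrapy.Request(base + ".htm")  # page 1
            for i in range(2, last):
                yield scrapy.Request("%s_%d.htm" % (base, i))

    def parse(self, response):
        # List-page parsing would go here, as in the mil3 spider above.
        pass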

    def parse(self, response):
        # Parse the qq.com listing page with BeautifulSoup.
        soup = BeautifulSoup(response.body, "lxml")
        root = soup.find('div', {'class': 'leftList'})
        lis = root.findAll('li')
        for li in lis:
            # Extract title, url and content for each entry.
            item = IfengItem()
            item['url'] = li.find('a')['href']
            item['title'] = li.get_text()
            # Fetch the article page itself (blocking urllib call).
            response2 = urllib.urlopen(item['url'])
            soup2 = BeautifulSoup(response2, "lxml")
            try:
                content = soup2.find('div', {'id': 'Cnt-Main-Article-QQ'}).get_text()
                item['content'] = content
            except AttributeError as e:
                # Article body div missing; skip this entry.
                print(e)
                continue
            item['label'] = 'military'
            if self.check(item['url']):  # skip URLs already in the database
                yield item
        # Pagination is covered by the generated start_urls above, so the
        # next-page link is not followed here:
        # next_url = response.xpath("//*[@class='f12']/@href").extract()
        # if next_url:
        #     yield scrapy.Request(next_url[0], callback=self.parse)

    def check(self, url):
        """Return True if `url` is not yet stored in the news table."""
        self.database = Database()
        self.database.connect('crawl_data')
        sql = "SELECT * FROM news WHERE url=%s ORDER BY url"
        str_article_url = url.encode('utf-8')
        data = (str_article_url,)
        try:
            search_result = self.database.query(sql, data)
            if search_result == ():
                self.database.close()
                return True
        except Exception as e:
            print(e)
            traceback.print_exc()
        self.database.close()
        return False
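
Finally, since every spider above repeats the same `check` method, the dedup logic is a natural fit for a Scrapy item pipeline, which runs once for items from all spiders. A minimal in-memory sketch; a persistent variant would query the `news` table the way `check` does:

from scrapy.exceptions import DropItem


class DuplicateUrlPipeline(object):
    # Drop any item whose URL has already been seen during this run.
    def __init__(self):
        self.seen = set()

    def process_item(self, item, spider):
        if item['url'] in self.seen:
            raise DropItem("duplicate url: %s" % item['url'])
        self.seen.add(item['url'])
        return item

Enabling it is one settings entry, e.g. ITEM_PIPELINES = {'myproject.pipelines.DuplicateUrlPipeline': 100} (module path assumed).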