Example #1
    def parse_content(self, response):
        try:
            item = NewsspiderItem()
            item_type = 'society'
            item_url = response.url
            item_crawl_time = GetCrawlTime.CrawlTime
            item_title = ' '.join(response.xpath('//h2[@id="article_title"]/text()').extract()).strip().replace(u'\xa0', u' ')
           
            item_time = ''.join(response.xpath('//span[@id="pubtime_baidu"]/text()').extract()).strip().replace(u'\xa0', u' ')
           
            item_content = '\n'.join(response.xpath('//div[@id="content"]//p//text()').extract()).strip().replace(u'\xa0', u' ')
           
            item_img = response.xpath('//div[@id="content"]//img/@src').extract()
           
            item_author = ''.join(response.xpath('//div[@class="m-editor"]/text()').extract()).strip().replace(u'\xa0', u' ')
           
            item_tags = ''
            item_source = u'南方网'
            item_summary = item_content[:100]

            item_img = str(CommonParse.array_uft8_parse(item_img))
            item_time = item_time[:16]
            item_time = TimeParse.time_parse(item_time)

            item = item.getItem(url=item_url, title=item_title, tags=item_tags, source=item_source,
                                publish_date=item_time, content=item_content, author=item_author,
                                type=item_type, img=item_img, summary=item_summary,
                                crawl_time=item_crawl_time)
            item.parseStr(item)
            yield item
        except Exception as e:
            print '=' * 100
            print e, response.url
            with open('%s.txt' % self.name, 'a') as f:
                f.write('%s - %s\n' % (e, response.url))
Example #2
    def parse_class(self, response):

        soup = BeautifulSoup(response.body, "lxml")
        meta = response.request.meta
        category = meta['category']
        current_url = response.url
        content = soup.find("div", class_="container cc-body-content")
        if content is not None:
            item = NewsspiderItem()
            item['category'] = category
            item['url'] = current_url
            item['page'] = content.decode()
            yield item

        else:
            logging.info("failed to get content: " + current_url)

        all_reviews = soup.find("div", class_="course-all-reviews")
        if all_reviews:

            for review_title in all_reviews.find_all(
                    "div", class_="review-title title-with-image"):
                a = review_title.find("a")
                if a is not None:
                    link = self.index_url + a['href']
                    if link not in self.user_dict.keys():
                        logging.info("add new user: " + link)
                        self.user_dict[link] = 'user'
Example #3
    def second_parse(self, response):

        # Get the value stored under the 'meta_1' key of the request meta
        meta_1 = response.meta['meta_1']
        items = []
        # Iterate over the child links inside the sub-category page
        for each in response.xpath('//a/@href'):
            # Keep only child links that start with the parent-category URL and end in .shtml
            if each.extract().encode('utf-8').startswith(
                    meta_1['parentUrl'].encode('utf-8')) and each.extract(
                    ).encode('utf-8').endswith('.shtml'.encode('utf-8')):
                item = NewsspiderItem()
                item['parentUrl'] = meta_1['parentUrl']
                item['parentTitle'] = meta_1['parentTitle']
                item['subUrl'] = meta_1['subUrl']
                item['subTitle'] = meta_1['subTitle']
                item['subpath'] = meta_1['subpath']
                item['sonUrl'] = each.extract()
                items.append(item)

        # Send a request for each collected child link
        for each in items:
            yield scrapy.Request(each['sonUrl'],
                                 meta={'meta_2': each},
                                 callback=self.detail_parse)
Example #4
 def parse(self, response):
     # filename = response.url.split("/")[-2]
     filename = "huxiu"
     with open(filename, 'w') as f:  # write the extracted titles to a local file
         for newsCell in response.css("div.mod-b"):               
             item = NewsspiderItem()
             item["title"] = (newsCell.css("h3 a").extract())[0].encode("utf-8")
             f.write(item["title"])
             yield item
Example #5
 def parse_item(self, response):
     i = NewsspiderItem()
     i['title'] = response.xpath(
         "/html/head/meta[@property='og:title']/@content").extract()
     i['keywords'] = response.xpath(
         "/html/head/meta[@name='keywords']/@content").extract()
     content = response.xpath("//div[@id='artibody']").extract()
     i['content'] = str(content[0])
     i['link'] = response.url
     i['source'] = 'sina.com.cn'
     i['author'] = response.xpath(
         "//p[@class='article-editor']/text()").extract()
     i['publish_time'] = response.xpath(
         "//meta[@property='article:published_time']/@content").extract()
     return i
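None of the examples above include the item class itself. For Example #5 the corresponding NewsspiderItem is presumably declared along the following lines (a minimal sketch assuming the standard scrapy.Item/Field pattern; the field names are simply the ones assigned above, and the project's real items.py may define more):

    import scrapy

    class NewsspiderItem(scrapy.Item):
        # Field names inferred from the assignments in Example #5;
        # the actual project may declare additional fields.
        title = scrapy.Field()
        keywords = scrapy.Field()
        content = scrapy.Field()
        link = scrapy.Field()
        source = scrapy.Field()
        author = scrapy.Field()
        publish_time = scrapy.Field()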
Example #6
    def parse_user(self, response):

        soup = BeautifulSoup(response.body, "lxml")
        meta = response.request.meta
        category = 'user'  #meta['category']
        current_url = response.url

        content = soup.find("div", class_="container cc-body-content")
        if content is not None:
            item = NewsspiderItem()
            item['category'] = category
            item['url'] = current_url
            item['page'] = content.decode()
            yield item
        else:
            logging.info("failed to get content: " + current_url)
Example #7
    def parse_detail(self, response):
        news_content = ''
        news_img = ''
        for p in response.xpath('//*[@id="newscontent"]/div/div[1]/div/div/p'):
            content = p.xpath('.//text()').extract()
            img = p.xpath('./img/@src').extract()
            # Check that the result is not empty first, otherwise an IndexError may be raised
            if content:
                # print("Body: " + content[0])
                news_content += content[0] + "\r\n"
            if img:
                # print("Image: " + img[0])
                news_img += "http://news.xmu.edu.cn" + img[0] + ";"

        item = NewsspiderItem()
        item["news_title"] = response.xpath(
            '//*[@id="mainContainer"]/div[3]/table/tr/td/span/span/span/text()'
        ).extract()[0]
        item["news_content"] = news_content
        item["news_source"] = '厦门大学新闻网'
        item["news_link"] = response.url
        item["news_release_time"] = response.xpath(
            '//*[@id="mainContainer"]/div[4]/table/tr[2]/td/span[1]/span/span/text()'
        ).extract()[0]
        item["news_read_status"] = '1'
        item["news_get_time"] = time.strftime('%Y-%m-%d',
                                              time.localtime(time.time()))
        item["news_imgs"] = news_img

        release_time = item['news_release_time']
        print("the latest_release_time from web_xmu_news is {0}".format(
            self.latest_release_time))

        st1 = time.mktime(time.strptime(release_time, "%Y-%m-%d"))
        st2 = time.mktime(time.strptime(self.latest_release_time, "%Y-%m-%d"))
        result = int(st1) - int(st2)
        print("发布时间:{0} 是否继续:{1}".format(release_time, result))

        # Stop crawling once the publish time is earlier than the latest time already in the database; if the record already exists, do not save it again
        if result <= 0:
            self.crawler.engine.close_spider(self, '厦门大学新闻网消息爬取完成!')
        elif myMysql.myMysql().columnExist(tableName="web_news",
                                           columnValue=item['news_link']):
            return
        else:
            yield item
Example #8
 def second_parse(self, response):
     meta_1 = response.meta['meta_1']
     items = []
     # Iterate over all links on the page and collect the article URLs
     for each in response.xpath('//a/@href'):
         if each.extract().encode('utf-8').startswith(
                 meta_1['first_url'].encode('utf-8')) and each.extract(
                 ).encode('utf-8').endswith('.shtml'.encode('utf-8')):
             item = NewsspiderItem()
             item['first_url'] = meta_1['first_url']
             item['second_url'] = meta_1['second_url']
             item['article_url'] = each.extract()
             items.append(item)

     # Request each collected article
     for article in items:
         yield scrapy.Request(article['article_url'],
                              meta={'meta_2': article},
                              callback=self.detail_parse)
Example #9
 def parse(self, response):
     # Loop over every first-level category URL and title (each div under tab01, excluding the 'citynav' local-stations block)
     for each in response.xpath(
             "//div[@id='tab01']/div[@data-sudaclick!='citynav']"):
         # Get the first-level category URL
         first_url = each.xpath('./h3/a/@href').extract()[0]
         # Iterate over the second-level category links
         for other in each.xpath("./ul/li/a"):
             # Get the second-level category URL (only those starting with the first-level URL)
             if other.xpath('./@href').extract()[0].startswith(first_url):
                 item = NewsspiderItem()
                 second_url = other.xpath('./@href').extract()[0]
                 item['first_url'] = first_url
                 item['second_url'] = second_url
                 # Request the second-level category page
                 yield scrapy.Request(url=item['second_url'],
                                      meta={'meta_1': item},
                                      callback=self.second_parse)
Example #10
 def parse_detail(self, response):
     print(response.status)
     print(response.xpath('//h1[@class="main-title"]/text()').extract()[0])  # title
     news = NewsspiderItem()
     news["url"] = response.url
     news["title"] = response.xpath(
         '//h1[@class="main-title"]/text()').extract()[0]
     news["time"] = response.xpath(
         '//*[@id="top_bar"]/div/div[2]/span/text()').extract()[0]
     news["origin"] = response.xpath(
         '//*[@id="top_bar"]/div/div[2]/a/text()').extract()
     news["origin_url"] = response.xpath(
         '//*[@id="top_bar"]/div/div[2]/a/@href').extract()[0]
     news["detail"] = "\n".join(response.xpath('//div[@class="article"]/div/p/text()').extract()) + \
         "\n".join(response.xpath('//div[@class="article"]/p/text()').extract()) + \
         "\n".join(response.xpath('//div[@class="article"]/div/div/text()').extract())
     yield news
Example #11
    def parse(self, response):

        # Iterate over the parent-category links, using this div as the root node
        for each in response.xpath(
                "//div[@id='tab01']/div[@data-sudaclick!='citynav']"):
            # Get the parent-category URL and title
            # encode('utf-8') converts a str to bytes
            parentUrl = each.xpath('./h3/a/@href').extract()[0]
            #parentTitle = each.xpath('./h3/a/text()').extract()[0].encode('utf-8')
            parentTitle = each.xpath('./h3/a/text()').extract()[0]
            # Set the storage path for this parent category
            parentpath = './data/' + parentTitle
            #parentpath = parentTitle
            if not os.path.exists(parentpath):
                os.makedirs(parentpath)

            # Iterate over the sub-category links
            for other in each.xpath("./ul/li/a"):

                # Keep only sub-category links that start with the parent-category URL
                if other.xpath('./@href').extract()[0].startswith(parentUrl):
                    # Note where the item is created: a different position gives different results.
                    # Avoid filling the item partly in the outer loop and partly in the inner loop;
                    # if that cannot be avoided, collect the items in a list and append to it instead.
                    item = NewsspiderItem()
                    subUrl = other.xpath('./@href').extract()[0]
                    subTitle = other.xpath('./text()').extract()[0]
                    subpath = parentpath + '/' + subTitle
                    item['parentUrl'] = parentUrl
                    item['parentTitle'] = parentTitle
                    item['subUrl'] = subUrl
                    item['subTitle'] = subTitle
                    item['subpath'] = subpath

                    if not os.path.exists(subpath):
                        os.makedirs(subpath)

                    # Request the sub-category page, passing the item to the callback via the
                    # meta parameter; the callback reads it back with response.meta['meta_1']
                    yield scrapy.Request(url=item['subUrl'],
                                         meta={'meta_1': item},
                                         callback=self.second_parse)
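The parse method above is evidently the first stage of the same spider as Example #3: each partially filled item is carried to the callback in Request.meta under the key 'meta_1', and second_parse (Example #3) reads it back with response.meta['meta_1'] before attaching the article links.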
Example #12
 def archive(self, response):
     title_tag = response.css('.noBorder::text').extract_first().strip()
     art_author = response.css('.art_author::text').extract_first().strip()
     art_authors = art_author.encode('utf-8').split(":")
     if len(art_authors) == 2:
         art_author = art_authors[1]
     else:
         art_author = u'佚名'
     art_publish = response.css(
         '.art_publish::text').extract_first().strip()
     art_publishs = art_publish.encode('utf-8').split(":")
     if len(art_publishs) == 2:
         art_publish = art_publishs[1]
     else:
         art_publish = '2001-01-01'
     art_con = response.css('.atr_con').extract_first()
     item = NewsspiderItem()
     item['title'] = title_tag
     item['author'] = art_author
     item['date'] = art_publish
     item['content'] = art_con
     item['url'] = response.url
     # print item['title']
     yield item
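Whichever spider produced them, the yielded NewsspiderItem objects are passed on to the project's item pipelines for storage. For a quick inspection they can also be written straight to a feed with Scrapy's built-in exporter, e.g. scrapy crawl <spider_name> -o items.json (assuming feed exports are enabled, which is Scrapy's default).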