Example #1
 def extract_comments(self, sel):
     assert type(sel) == Selector
     raw_comments = sel.xpath('//section[@class="comment-content comment"]')
     comments = []
     for comment in raw_comments:
         comments.append(mergeListElements(comment.xpath('./p/text()').extract()))
     return comments
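The helpers mergeListElements and safepop used throughout these examples come from the BlogCrawler project's toolbox module (Example #10 below calls it as toolbox.mergeListElements), but their implementations are not listed on this page. The following is only a minimal sketch consistent with how they are called in these examples; the project's real code may differ.

def safepop(lst, index, default=""):
    # Return lst[index], falling back to a default instead of raising IndexError
    # when the XPath query matched nothing and the list is empty or too short.
    try:
        return lst[index]
    except IndexError:
        return default

def mergeListElements(fragments):
    # Collapse the text fragments returned by .extract() into one string and wrap it
    # in a list, so callers can index the result with [0] or safepop(..., 0).
    merged = "".join(fragments).strip()
    return [merged] if merged else []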
Example #2
File: vox.py  Project: n-witt/BlogCrawler
 def parse_item(self, response):
     sel = Selector(response)
     item = BlogItem()
     item['blog_name'] = "Vox"
     item['url'] = response.url
     item['releasedate'] = dparser.parse(mergeListElements(sel.xpath('/html/body/div[2]/div/div[2]/p/text()').extract())[0], fuzzy=True)
     item['crawldate'] = datetime.now().isoformat()
     item['author'] = self.extract_authors(safepop(sel.xpath('/html/body/div[2]/div/div[2]/p/strong/text()').extract(), 0))
     item['headline'] = safepop(sel.xpath('/html/body/div[2]/div/div[2]/h1/text()').extract(), 0).strip()
     item['body'] = safepop(mergeListElements(sel.xpath('//div[@class="article-content"]/h1[contains(., "Reference")]/preceding-sibling::*/text()').extract()), 0)
     item['links'] = sel.xpath('//div[@class="format_text entry-content"]/p[@class="to_comments"]/preceding-sibling::*/descendant-or-self::*/text()').extract()
     item['references'] = self.extract_references(sel.xpath('//div[@class="article-content"]/h1[contains(., "Reference")]/following-sibling::*'))
     item['comments'] = ""
     item['tags'] = ""
     item["teaser"] = safepop(mergeListElements(sel.xpath('//div[contains(@class, "teaser")]/descendant-or-self::*/text()').extract()), 0).strip()
     self.log("parsed %s successfully" % response.url, level=log.INFO)
     
     return item
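Example #2 also relies on self.extract_authors, which is not listed on this page. A purely hypothetical sketch, assuming the <strong> byline text has a form like "by Alice and Bob" and the spider wants a list of author names:

import re

def extract_authors(self, byline):
    # Hypothetical helper: drop a leading "by" and split the rest on commas or "and".
    byline = re.sub(r'^\s*by\s+', '', byline or '', flags=re.IGNORECASE)
    return [name.strip() for name in re.split(r',|\band\b', byline) if name.strip()]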
Example #3
 def parse_item(self, response):
     self.log(response.url, level=log.DEBUG)
     sel = Selector(response)
     item = BlogItem()
     
     item['blog_name'] = "MarginalRevolution"
     item['url'] = response.url
     item['releasedate'] = dparser.parse(mergeListElements(sel.xpath('//p[@class="headline_meta"]/text()').extract())[0], fuzzy=True)
     item['crawldate'] = datetime.now().isoformat()
     item['author'] = safepop(sel.xpath('//a[@rel="author"]/text()').extract(), 0)
     item['headline'] = safepop(sel.xpath('//h1[@class="entry-title"]/text()').extract(), 0)
     item['body'] = safepop(mergeListElements(sel.xpath('//div[@class="format_text entry-content"]/p[@class="to_comments"]/preceding-sibling::*/descendant-or-self::text()').extract()), 0)
     item['links'] = sel.xpath('//div[@class="article-content"]/h1[contains(., "Reference")]/following-sibling::*/a/@href').extract()
     item['references'] = ""
     item['comments'] = ""
     item['tags'] = sel.xpath('//span[@class="post-footers"]/a[@rel="author"]/following-sibling::*/text()').extract()
     item["teaser"] = ""        
     self.log("parsed %s successfully" % response.url, level=log.INFO)
     return item
Example #4
    def parse_item(self, response):
        self.log(response.url, level=log.DEBUG)
        sel = Selector(response)
        item = BlogItem()

        item['blog_name'] = "Brad DeLong's Grasping Reality..."
        item['url'] = response.url
        item['releasedate'] = dparser.parse(mergeListElements(sel.xpath('//h2[@class="date-header"]/text()').extract())[0], fuzzy=True)
        item['crawldate'] = datetime.now().isoformat()
        item['author'] = safepop(sel.xpath('//a[@rel="author"]/text()').extract(), 0)
        item['headline'] = safepop(sel.xpath('//h3[@class="entry-header"]/text()').extract(), 0)
        item['body'] = safepop(mergeListElements(sel.xpath('//div[@class="entry-body"]/descendant-or-self::text()').extract()), 0)
        item['links'] = sel.xpath('//div[@class="entry-body"]/descendant-or-self::a/@href').extract()
        item['references'] = ""
        item['comments'] = ""
        item['tags'] = sel.xpath('//span[@class="post-footers"]/a[@rel="author"]/following-sibling::*/text()').extract()
        item["teaser"] = ""
        self.log("parsed %s successfully" % response.url, level=log.INFO)
        return item
Example #5
    def parse_item(self, response):
        self.log(response.url, level=log.DEBUG)
        sel = Selector(response)
        item = BlogItem()

        item['blog_name'] = "Economist's View"
        item['url'] = response.url
        item['releasedate'] = dparser.parse(mergeListElements(sel.xpath('//h2[@class="date-header"]/text()').extract())[0], fuzzy=True)
        item['crawldate'] = datetime.now().isoformat()
        item['author'] = safepop(sel.xpath('//a[@rel="author"]/text()').extract(), 0)
        item['headline'] = safepop(sel.xpath('//h3[@class="entry-header"]/a/text()').extract(), 0)
        item['body'] = safepop(mergeListElements(sel.xpath('//div[@class="entry-body"]/descendant-or-self::text()').extract()), 0)
        item['links'] = sel.xpath('//div[@class="entry-body"]/descendant-or-self::a/@href').extract()
        item['references'] = ""
        item['comments'] = ""
        # the last two elements are always "permalink" and "comment", hence they can be discarded
        item['tags'] = sel.xpath('//p[@class="posted"]/a[@rel="author"]/following-sibling::*/text()').extract()[0:-2]
        item["teaser"] = ""
        self.log("parsed %s successfully" % response.url, level=log.INFO)
        return item
Example #6
 def parse_item(self, response):
     self.log(response.url, level=log.INFO)
     sel = Selector(response)
     item = BlogItem()
     item['blog_name'] = "Zero Hedge"
     item['url'] = response.url
     item['releasedate'] = dparser.parse(sel.xpath('//span[@class="submitted"]/text()').extract()[-1], fuzzy=True)
     item['crawldate'] = datetime.now().isoformat()
     item['author'] = safepop(sel.xpath('//span[@class="submitted"]/a/text()').extract(), 0)
     item['headline'] = safepop(sel.xpath('//h1[@class="print-title"]/text()').extract(), 0)
     item['body'] = safepop(mergeListElements(sel.xpath('//div[@class="content"]/descendant-or-self::text()').extract()), 0)
     item['links'] = self.extract_links(sel.xpath('//div[@class="print-links"]/p/text()').extract())
     item['references'] = ""
     item['comments'] = ""
     item['tags'] = sel.xpath('//ul[@class="links"]/li/a/text()').extract()
     item["teaser"] = ""
     self.log("parsed %s successfully" % response.url, level=log.INFO)
     return item
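Example #6 calls self.extract_links on the plain text of the print-links block, but that helper is not listed here either. A hypothetical sketch, assuming each text line may contain one or more URLs that should be collected:

import re

def extract_links(self, lines):
    # Hypothetical helper: pull anything that looks like an http(s) URL out of the
    # text lines taken from the "print-links" div.
    urls = []
    for line in lines:
        urls.extend(re.findall(r'https?://\S+', line))
    return urls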
Example #7
 def parse_item(self, response):
     self.log(response.url, level=log.DEBUG)
     sel = Selector(response)
     item = BlogItem()
     item['blog_name'] = "The BIG Picture"
     item['url'] = response.url
     item['releasedate'] = dparser.parse(sel.xpath('//p[@class="byline"]/text()').extract()[-1], fuzzy=True)
     item['crawldate'] = datetime.now().isoformat()
     item['author'] = safepop(sel.xpath('//span[@class="author"]/text()').extract(), 0)
     item['headline'] = safepop(sel.xpath('//div[@class="headline"]/h2/a/text()').extract(), 0)
     item['body'] = safepop(mergeListElements(sel.xpath('//a[@rel="category tag"]/parent::*/preceding-sibling::*/descendant-or-self::*/text()').extract()), 0)
     item['links'] = sel.xpath('//a[@rel="category tag"]/parent::*/preceding-sibling::*/descendant-or-self::*/@href').extract()
     item['references'] = ""
     item['comments'] = sel.xpath('//div[@class="comment-meta commentmetadata"]/following-sibling::p/text()').extract() 
     item['tags'] = sel.xpath('//a[@rel="category tag"]/text()').extract()
     item["teaser"] = ""
     self.log("parsed %s successfully" % response.url, level=log.INFO)
     
     return item
Example #8
 def parse_item(self, response):
     self.log(response.url, level=log.DEBUG)
     sel = Selector(response)
     item = BlogItem()
     item['blog_name'] = "naked capitalism"
     item['url'] = response.url
     item['releasedate'] = dparser.parse(safepop(sel.xpath('//time[@class="entry-date date updated"]/text()').extract(), 0), fuzzy=True)
     item['crawldate'] = datetime.now().isoformat()
     item['author'] = safepop(sel.xpath('//a[@rel="author"]/text()').extract(), 0)
     item['headline'] = safepop(sel.xpath('//h1[@class="entry-title"]/descendant-or-self::text()').extract(), 0)
     item['body'] = safepop(mergeListElements(sel.xpath('//div[@class="pf-content"]/descendant-or-self::text()').extract()), 0)
     item['links'] = sel.xpath('//div[@class="pf-content"]/descendant-or-self::a/@href').extract()
     item['references'] = ""
     item['comments'] = self.extract_comments(sel) 
     item['tags'] = sel.xpath('//footer[@class="entry-meta"]/a/text()').extract()
     item["teaser"] = ""
     self.log("parsed %s successfully" % response.url, level=log.INFO)
     
     return item
Example #9
 def parse_item(self, response):
     self.log(response.url, level=log.DEBUG)
     sel = Selector(response)
     item = BlogItem()
     item['blog_name'] = "econbrowser"
     item['url'] = response.url
     item['releasedate'] = dparser.parse(safepop(sel.xpath('//time[@class="entry-date"]/@datetime').extract(), 0), fuzzy=True)
     item['crawldate'] = datetime.now().isoformat()
     item['author'] = safepop(sel.xpath('//a[@rel="author"]/text()').extract(), 0)
     item['headline'] = safepop(sel.xpath('//h1[@class="entry-title"]/descendant-or-self::text()').extract(), 0)
     item['body'] = safepop(mergeListElements(sel.xpath('//div[@class="entry-content"]/div[@class="addtoany_share_save_container addtoany_content_bottom"]/preceding-sibling::*/descendant-or-self::text()').extract()), 0)
     item['links'] = sel.xpath('//div[@class="entry-content"]/div[@class="addtoany_share_save_container addtoany_content_bottom"]/preceding-sibling::*/descendant-or-self::a/@href').extract()
     item['references'] = ""
     item['comments'] = self.extract_comments(sel) 
     item['tags'] = ""
     item["teaser"] = ""
     self.log("parsed %s successfully" % response.url, level=log.INFO)
     
     return item
Example #10
 def parse(self, response):
     sel = Selector(response)
     links = []
     item = BlogItem()
     item['blog_name'] = "The Conscience of a Liberal"
     item['url'] = safepop(sel.xpath('//div[@data-url]/attribute::data-url').extract(), 0)
     item['releasedate'] = dparser.parse(safepop(sel.xpath('//time/@datetime').extract(), 0)).replace(tzinfo=None)
     item['crawldate'] = datetime.datetime.now().isoformat()
     item['author'] = "Paul Krugman"
     item['headline'] = safepop(sel.xpath('//h1[@class="entry-title"]/text()').extract(), 0)
     item['body'] = safepop(toolbox.mergeListElements(sel.xpath('//p[@class="story-body-text"]/text() | //p[@class="story-body-text"]/a/text()').extract()), 0)
     for i in sel.xpath("//p[@class='story-body-text']/a/attribute::href").extract():
         links.append(i)
     item['links'] = links
     item['references'] = ""
     item['comments'] = ""
     item['tags'] = ""
     item['teaser'] = ""
     #self.log("parsed %s successfully" % response.url, level=log.INFO)
     return item
Example #11
 def parse(self, response):
     sel = Selector(response)
     item = BlogItem()
     links = []
     item['blog_name'] = "The Upshot"
     item['url'] = response.url
     item['releasedate'] = sel.xpath('//p[@class="byline-dateline"]/time[@class="dateline"]/attribute::datetime').extract()
     item['crawldate'] = datetime.datetime.now().isoformat()
     item['author'] = sel.xpath('//span[@class="byline-author"]/text()').extract()                                                                               
     item['headline'] = sel.xpath('//h1[@class="story-heading" and @itemprop="headline"]/text()').extract()
     item['body'] = safepop(mergeListElements(sel.xpath('//p[@class="story-body-text story-content"]/text() | //p[@class="story-body-text story-content"]/a/text()').extract()), 0)
     for i in sel.xpath("//p[@class='story-body-text story-content']/a/attribute::href").extract():
         links.append(i)
     item['links'] = links
     item['references'] = ""
     item['comments'] = ""
     item['tags'] = ""
     item['teaser'] = ""
     self.log("parsed %s successfully" % response.url, level=log.INFO) 
     return item
Example #12
File: vox.py  Project: n-witt/BlogCrawler
 def extract_references(self, selector):
     references = []
     for n in selector:
         references.append(mergeListElements(n.xpath(".//text()").extract()))
     return references
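All of these callbacks fill the same BlogItem, whose field names can be read directly from the assignments above. As a rough sketch of how such an item and a crawling spider might be declared with current Scrapy imports (the examples above use the older scrapy.log API; class names, domain, and the link pattern below are placeholders, not the project's actual code):

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class BlogItem(scrapy.Item):
    # One Field per key assigned in the parse_item examples above.
    blog_name = scrapy.Field()
    url = scrapy.Field()
    releasedate = scrapy.Field()
    crawldate = scrapy.Field()
    author = scrapy.Field()
    headline = scrapy.Field()
    body = scrapy.Field()
    links = scrapy.Field()
    references = scrapy.Field()
    comments = scrapy.Field()
    tags = scrapy.Field()
    teaser = scrapy.Field()

class ExampleBlogSpider(CrawlSpider):
    # Placeholder spider: CrawlSpider follows links matched by the rule and hands each
    # article page to parse_item, which is how callbacks like those above get invoked.
    name = "example_blog"
    allowed_domains = ["example.com"]           # placeholder domain
    start_urls = ["http://example.com/blog/"]   # placeholder start page
    rules = (
        Rule(LinkExtractor(allow=r"/\d{4}/\d{2}/"), callback="parse_item", follow=True),
    )

    def parse_item(self, response):
        # Any one of the parse_item implementations above would go here.
        pass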