Python safepop示例，Crawler.toolbox.safepop Python示例

示例#1

0

显示文件

文件： krugman.py 项目： n-witt/BlogCrawler

 def __init__(self, startDate, endDate, *args, **kwargs):
     """Fill ``self.start_urls`` with the posts published in a date range.

     The range is widened to whole months: ``startDate`` is moved to the
     first day of its month and ``endDate`` to the last day of its month
     plus 23:59:59. Listing pages are fetched one at a time through
     ``self.fetch_json_doc``; for each post whose ``<time datetime>``
     lies inside the range, the ``data-url`` attribute is collected and
     finally appended to ``self.start_urls`` (skipping URLs already
     present).

     :param startDate: start of the range, in a form accepted by
         ``toolbox.validate_date_range`` and ``dparser.parse``
     :param endDate: end of the range, same form as ``startDate``
     :param args: forwarded unchanged to the parent constructor
     :param kwargs: forwarded unchanged to the parent constructor
     :raises ValueError: propagated from ``toolbox.validate_date_range``
     :raises urllib2.HTTPError: if the first listing page cannot be fetched
     """
     urls = []
     # Validation errors propagate unchanged, keeping their traceback.
     toolbox.validate_date_range(startDate, endDate)
     startDate = dparser.parse(startDate).replace(day=1).replace(tzinfo=None)
     endDate = dparser.parse(endDate).replace(tzinfo=None)
     last_day = calendar.monthrange(endDate.year, endDate.month)[1]
     endDate = endDate.replace(day=last_day)
     endDate = endDate + datetime.timedelta(hours=23, minutes=59, seconds=59)

     try:
         json_page = self.fetch_json_doc(0)
     except urllib2.HTTPError as e:
         # HTTPError requires (url, code, msg, hdrs, fp); building it from a
         # single string would itself fail with a TypeError, so the details
         # of the original error are carried over.
         raise urllib2.HTTPError(
             e.filename,
             e.code,
             'The init-Page could not be fetched. Aborting. (%s)' % e.msg,
             e.hdrs,
             e.fp)
     i = 0
     # NOTE(review): a page whose 'more_posts_next_page' flag is false is
     # never scanned, so posts on the final listing page are skipped --
     # confirm this is intended.
     while json_page['more_posts_next_page'] == True:
         pubDate = datetime.datetime.now()  # placeholder in case the page has no posts
         for n in json_page['posts']:
             sel = Selector(text=n['html'])
             pubDate = dparser.parse(safepop(sel.xpath('//time/@datetime').extract(), 0)).replace(tzinfo=None)
             if startDate <= pubDate <= endDate:
                 url = safepop(sel.xpath('//div[@data-url]/attribute::data-url').extract(), 0)
                 urls.append(url)
         # Stop paging once the last post seen on this page predates the range.
         if pubDate < startDate:
             break
         i += 1
         json_page = self.fetch_json_doc(i)
     # Append only URLs that are not already registered as start URLs.
     for url in urls:
         if url not in self.start_urls:
             self.start_urls.append(url)

     super(MySpider, self).__init__(*args, **kwargs)

示例#2

0

显示文件

文件： vox.py 项目： n-witt/BlogCrawler

 def parse_item(self, response):
     """Build a ``BlogItem`` from a Vox article response.

     Each field is read from the page with an XPath query; ``comments``
     and ``tags`` are set to empty strings.

     :param response: the downloaded page handed to ``Selector``
     :return: the populated ``BlogItem``
     """
     sel = Selector(response)

     def texts(query):
         # Extracted results of one XPath query against this page.
         return sel.xpath(query).extract()

     item = BlogItem()
     item['blog_name'] = "Vox"
     item['url'] = response.url

     date_text = mergeListElements(texts('/html/body/div[2]/div/div[2]/p/text()'))[0]
     item['releasedate'] = dparser.parse(date_text, fuzzy=True)
     item['crawldate'] = datetime.now().isoformat()

     author_text = safepop(texts('/html/body/div[2]/div/div[2]/p/strong/text()'), 0)
     item['author'] = self.extract_authors(author_text)

     headline = safepop(texts('/html/body/div[2]/div/div[2]/h1/text()'), 0)
     item['headline'] = headline.strip()

     body_parts = mergeListElements(texts('//div[@class="article-content"]/h1[contains(., "Reference")]/preceding-sibling::*/text()'))
     item['body'] = safepop(body_parts, 0)

     # NOTE(review): this query targets "format_text entry-content" /
     # "to_comments" markup, unlike the "article-content" queries used for
     # the other fields -- confirm it is the intended source for links.
     item['links'] = texts('//div[@class="format_text entry-content"]/p[@class="to_comments"]/preceding-sibling::*/descendant-or-self::*/text()')

     reference_nodes = sel.xpath('//div[@class="article-content"]/h1[contains(., "Reference")]/following-sibling::*')
     item['references'] = self.extract_references(reference_nodes)
     item['comments'] = ""
     item['tags'] = ""

     teaser_parts = mergeListElements(texts('//div[contains(@class, "teaser")]/descendant-or-self::*/text()'))
     item["teaser"] = safepop(teaser_parts, 0).strip()

     self.log("parsed %s successfully" % response.url, level=log.INFO)
     return item

示例#3

0

显示文件

文件： marginal_revolution.py 项目： n-witt/BlogCrawler

 def parse_item(self, response):
     """Build a ``BlogItem`` from a MarginalRevolution post response.

     Each field is read from the page with an XPath query;
     ``references``, ``comments`` and ``teaser`` are set to empty strings.

     :param response: the downloaded page handed to ``Selector``
     :return: the populated ``BlogItem``
     """
     self.log(response.url, level=log.DEBUG)
     sel = Selector(response)

     def texts(query):
         # Extracted results of one XPath query against this page.
         return sel.xpath(query).extract()

     item = BlogItem()
     item['blog_name'] = "MarginalRevolution"
     item['url'] = response.url

     date_text = mergeListElements(texts('//p[@class="headline_meta"]/text()'))[0]
     item['releasedate'] = dparser.parse(date_text, fuzzy=True)
     item['crawldate'] = datetime.now().isoformat()
     item['author'] = safepop(texts('//a[@rel="author"]/text()'), 0)
     item['headline'] = safepop(texts('//h1[@class="entry-title"]/text()'), 0)

     body_parts = mergeListElements(texts('//div[@class="format_text entry-content"]/p[@class="to_comments"]/preceding-sibling::*/descendant-or-self::text()'))
     item['body'] = safepop(body_parts, 0)

     # NOTE(review): this query targets "article-content" markup, unlike
     # the "format_text entry-content" query used for the body -- confirm
     # it is the intended source for links.
     item['links'] = texts('//div[@class="article-content"]/h1[contains(., "Reference")]/following-sibling::*/a/@href')
     item['references'] = ""
     item['comments'] = ""
     item['tags'] = texts('//span[@class="post-footers"]/a[@rel="author"]/following-sibling::*/text()')
     item["teaser"] = ""

     self.log("parsed %s successfully" % response.url, level=log.INFO)
     return item

示例#4

0

显示文件

文件： zero_hedge.py 项目： n-witt/BlogCrawler

 def parse_item(self, response):
     """Build a ``BlogItem`` from a Zero Hedge article response.

     Each field is read from the page with an XPath query;
     ``references``, ``comments`` and ``teaser`` are set to empty strings.

     :param response: the downloaded page handed to ``Selector``
     :return: the populated ``BlogItem``
     """
     self.log(response.url, level=log.INFO)
     sel = Selector(response)

     def texts(query):
         # Extracted results of one XPath query against this page.
         return sel.xpath(query).extract()

     item = BlogItem()
     item['blog_name'] = "Zero Hedge"
     item['url'] = response.url

     # The date is parsed from the last text node of the "submitted" span.
     submitted_text = texts('//span[@class="submitted"]/text()')[-1]
     item['releasedate'] = dparser.parse(submitted_text, fuzzy=True)
     item['crawldate'] = datetime.now().isoformat()
     item['author'] = safepop(texts('//span[@class="submitted"]/a/text()'), 0)
     item['headline'] = safepop(texts('//h1[@class="print-title"]/text()'), 0)

     body_parts = mergeListElements(texts('//div[@class="content"]/descendant-or-self::text()'))
     item['body'] = safepop(body_parts, 0)

     link_texts = texts('//div[@class="print-links"]/p/text()')
     item['links'] = self.extract_links(link_texts)
     item['references'] = ""
     item['comments'] = ""
     # NOTE(review): an earlier comment here said the last two elements are
     # always "permalink" and "comment" and can be discarded, but no slice
     # is applied to this list -- confirm whether trimming is wanted.
     item['tags'] = texts('//ul[@class="links"]/li/a/text()')
     item["teaser"] = ""

     self.log("parsed %s successfully" % response.url, level=log.INFO)
     return item

示例#5

0

显示文件

文件： the_big_picture.py 项目： n-witt/BlogCrawler

 def parse_item(self, response):
     """Build a ``BlogItem`` from a The BIG Picture post response.

     Each field is read from the page with an XPath query;
     ``references`` and ``teaser`` are set to empty strings.

     :param response: the downloaded page handed to ``Selector``
     :return: the populated ``BlogItem``
     """
     self.log(response.url, level=log.DEBUG)
     sel = Selector(response)

     def texts(query):
         # Extracted results of one XPath query against this page.
         return sel.xpath(query).extract()

     item = BlogItem()
     item['blog_name'] = "The BIG Picture"
     item['url'] = response.url

     # The date is parsed from the last text node of the byline paragraph.
     byline_text = texts('//p[@class="byline"]/text()')[-1]
     item['releasedate'] = dparser.parse(byline_text, fuzzy=True)
     item['crawldate'] = datetime.now().isoformat()
     item['author'] = safepop(texts('//span[@class="author"]/text()'), 0)
     item['headline'] = safepop(texts('//div[@class="headline"]/h2/a/text()'), 0)

     body_parts = mergeListElements(texts('//a[@rel="category tag"]/parent::*/preceding-sibling::*/descendant-or-self::*/text()'))
     item['body'] = safepop(body_parts, 0)
     item['links'] = texts('//a[@rel="category tag"]/parent::*/preceding-sibling::*/descendant-or-self::*/@href')
     item['references'] = ""
     item['comments'] = texts('//div[@class="comment-meta commentmetadata"]/following-sibling::p/text()')
     item['tags'] = texts('//a[@rel="category tag"]/text()')
     item["teaser"] = ""

     self.log("parsed %s successfully" % response.url, level=log.INFO)
     return item

示例#6

0

显示文件

文件： naked_capitalism.py 项目： n-witt/BlogCrawler

 def parse_item(self, response):
     """Build a ``BlogItem`` from a naked capitalism post response.

     Each field is read from the page with an XPath query; comments come
     from ``self.extract_comments``. ``references`` and ``teaser`` are
     set to empty strings.

     :param response: the downloaded page handed to ``Selector``
     :return: the populated ``BlogItem``
     """
     self.log(response.url, level=log.DEBUG)
     sel = Selector(response)

     def texts(query):
         # Extracted results of one XPath query against this page.
         return sel.xpath(query).extract()

     item = BlogItem()
     item['blog_name'] = "naked capitalism"
     item['url'] = response.url

     date_text = safepop(texts('//time[@class="entry-date date updated"]/text()'), 0)
     item['releasedate'] = dparser.parse(date_text, fuzzy=True)
     item['crawldate'] = datetime.now().isoformat()
     item['author'] = safepop(texts('//a[@rel="author"]/text()'), 0)
     item['headline'] = safepop(texts('//h1[@class="entry-title"]/descendant-or-self::text()'), 0)

     body_parts = mergeListElements(texts('//div[@class="pf-content"]/descendant-or-self::text()'))
     item['body'] = safepop(body_parts, 0)
     item['links'] = texts('//div[@class="pf-content"]/descendant-or-self::a/@href')
     item['references'] = ""
     item['comments'] = self.extract_comments(sel)
     item['tags'] = texts('//footer[@class="entry-meta"]/a/text()')
     item["teaser"] = ""

     self.log("parsed %s successfully" % response.url, level=log.INFO)
     return item

示例#7

0

显示文件

文件： econbrowser.py 项目： n-witt/BlogCrawler

 def parse_item(self, response):
     """Build a ``BlogItem`` from an econbrowser post response.

     Each field is read from the page with an XPath query; comments come
     from ``self.extract_comments``. ``references``, ``tags`` and
     ``teaser`` are set to empty strings.

     :param response: the downloaded page handed to ``Selector``
     :return: the populated ``BlogItem``
     """
     self.log(response.url, level=log.DEBUG)
     sel = Selector(response)

     def texts(query):
         # Extracted results of one XPath query against this page.
         return sel.xpath(query).extract()

     item = BlogItem()
     item['blog_name'] = "econbrowser"
     item['url'] = response.url

     date_text = safepop(texts('//time[@class="entry-date"]/@datetime'), 0)
     item['releasedate'] = dparser.parse(date_text, fuzzy=True)
     item['crawldate'] = datetime.now().isoformat()
     item['author'] = safepop(texts('//a[@rel="author"]/text()'), 0)
     item['headline'] = safepop(texts('//h1[@class="entry-title"]/descendant-or-self::text()'), 0)

     # Body and links both select everything preceding the share container.
     body_parts = mergeListElements(texts('//div[@class="entry-content"]/div[@class="addtoany_share_save_container addtoany_content_bottom"]/preceding-sibling::*/descendant-or-self::text()'))
     item['body'] = safepop(body_parts, 0)
     item['links'] = texts('//div[@class="entry-content"]/div[@class="addtoany_share_save_container addtoany_content_bottom"]/preceding-sibling::*/descendant-or-self::a/@href')
     item['references'] = ""
     item['comments'] = self.extract_comments(sel)
     item['tags'] = ""
     item["teaser"] = ""

     self.log("parsed %s successfully" % response.url, level=log.INFO)
     return item

示例#8

0

显示文件

文件： brad_delong.py 项目： n-witt/BlogCrawler

    def parse_item(self, response):
        """Build a ``BlogItem`` from a Brad DeLong blog post response.

        Each field is read from the page with an XPath query;
        ``references``, ``comments`` and ``teaser`` are set to empty
        strings.

        :param response: the downloaded page handed to ``Selector``
        :return: the populated ``BlogItem``
        """
        self.log(response.url, level=log.DEBUG)
        sel = Selector(response)

        def texts(query):
            # Extracted results of one XPath query against this page.
            return sel.xpath(query).extract()

        item = BlogItem()
        item['blog_name'] = "Brad DeLong's Grasping Reality..."
        item['url'] = response.url

        date_text = mergeListElements(texts('//h2[@class="date-header"]/text()'))[0]
        item['releasedate'] = dparser.parse(date_text, fuzzy=True)
        item['crawldate'] = datetime.now().isoformat()
        item['author'] = safepop(texts('//a[@rel="author"]/text()'), 0)
        item['headline'] = safepop(texts('//h3[@class="entry-header"]/text()'), 0)

        body_parts = mergeListElements(texts('//div[@class="entry-body"]/descendant-or-self::text()'))
        item['body'] = safepop(body_parts, 0)
        item['links'] = texts('//div[@class="entry-body"]/descendant-or-self::a/@href')
        item['references'] = ""
        item['comments'] = ""
        item['tags'] = texts('//span[@class="post-footers"]/a[@rel="author"]/following-sibling::*/text()')
        item["teaser"] = ""

        self.log("parsed %s successfully" % response.url, level=log.INFO)
        return item

示例#9

0

显示文件

文件： krugman.py 项目： n-witt/BlogCrawler

 def parse(self, response):
     """Build a ``BlogItem`` from a "The Conscience of a Liberal" post.

     The URL is taken from the page's ``data-url`` attribute rather than
     from ``response.url``, and the author is fixed to "Paul Krugman".
     ``references``, ``comments``, ``tags`` and ``teaser`` are set to
     empty strings.

     :param response: the downloaded page handed to ``Selector``
     :return: the populated ``BlogItem``
     """
     sel = Selector(response)

     def texts(query):
         # Extracted results of one XPath query against this page.
         return sel.xpath(query).extract()

     item = BlogItem()
     item['blog_name'] = "The Conscience of a Liberal"
     item['url'] = safepop(texts('//div[@data-url]/attribute::data-url'), 0)

     published = dparser.parse(safepop(texts('//time/@datetime'), 0))
     item['releasedate'] = published.replace(tzinfo=None)
     item['crawldate'] = datetime.datetime.now().isoformat()
     item['author'] = "Paul Krugman"
     item['headline'] = safepop(texts('//h1[@class="entry-title"]/text()'), 0)

     body_parts = toolbox.mergeListElements(texts('//p[@class="story-body-text"]/text() | //p[@class="story-body-text"]/a/text()'))
     item['body'] = safepop(body_parts, 0)
     # Copy the hrefs of all anchors inside the story paragraphs.
     item['links'] = list(texts("//p[@class='story-body-text']/a/attribute::href"))
     item['references'] = ""
     item['comments'] = ""
     item['tags'] = ""
     item['teaser'] = ""
     #self.log("parsed %s successfully" % response.url, level=log.INFO)
     return item

示例#10

0

显示文件

文件： economists_view.py 项目： n-witt/BlogCrawler

    def parse_item(self, response):
        """Build a ``BlogItem`` from an Economist's View post response.

        Each field is read from the page with an XPath query;
        ``references``, ``comments`` and ``teaser`` are set to empty
        strings.

        :param response: the downloaded page handed to ``Selector``
        :return: the populated ``BlogItem``
        """
        self.log(response.url, level=log.DEBUG)
        sel = Selector(response)

        def texts(query):
            # Extracted results of one XPath query against this page.
            return sel.xpath(query).extract()

        item = BlogItem()
        item['blog_name'] = "Economist's View"
        item['url'] = response.url

        date_text = mergeListElements(texts('//h2[@class="date-header"]/text()'))[0]
        item['releasedate'] = dparser.parse(date_text, fuzzy=True)
        item['crawldate'] = datetime.now().isoformat()
        item['author'] = safepop(texts('//a[@rel="author"]/text()'), 0)
        item['headline'] = safepop(texts('//h3[@class="entry-header"]/a/text()'), 0)

        body_parts = mergeListElements(texts('//div[@class="entry-body"]/descendant-or-self::text()'))
        item['body'] = safepop(body_parts, 0)
        item['links'] = texts('//div[@class="entry-body"]/descendant-or-self::a/@href')
        item['references'] = ""
        item['comments'] = ""

        # The last two elements are always "permalink" and "comment",
        # so they are dropped from the tag list.
        footer_texts = texts('//p[@class="posted"]/a[@rel="author"]/following-sibling::*/text()')
        item['tags'] = footer_texts[:-2]
        item["teaser"] = ""

        self.log("parsed %s successfully" % response.url, level=log.INFO)
        return item

示例#11

0

显示文件

文件： upshot.py 项目： n-witt/BlogCrawler

 def parse(self, response):
     """Build a ``BlogItem`` from a The Upshot article response.

     ``releasedate``, ``author`` and ``headline`` are stored as the raw
     lists returned by ``extract()``; ``references``, ``comments``,
     ``tags`` and ``teaser`` are set to empty strings.

     :param response: the downloaded page handed to ``Selector``
     :return: the populated ``BlogItem``
     """
     sel = Selector(response)

     def texts(query):
         # Extracted results of one XPath query against this page.
         return sel.xpath(query).extract()

     item = BlogItem()
     item['blog_name'] = "The Upshot"
     item['url'] = response.url
     item['releasedate'] = texts('//p[@class="byline-dateline"]/time[@class="dateline"]/attribute::datetime')
     item['crawldate'] = datetime.datetime.now().isoformat()
     item['author'] = texts('//span[@class="byline-author"]/text()')
     item['headline'] = texts('//h1[@class="story-heading" and @itemprop="headline"]/text()')

     body_parts = mergeListElements(texts('//p[@class="story-body-text story-content"]/text() | //p[@class="story-body-text story-content"]/a/text()'))
     item['body'] = safepop(body_parts, 0)
     # Copy the hrefs of all anchors inside the story paragraphs.
     item['links'] = list(texts("//p[@class='story-body-text story-content']/a/attribute::href"))
     item['references'] = ""
     item['comments'] = ""
     item['tags'] = ""
     item['teaser'] = ""

     self.log("parsed %s successfully" % response.url, level=log.INFO)
     return item