def __init__(self, startDate, endDate, *args, **kwargs):
    """Collect post URLs published between startDate and endDate.

    Walks the blog's paginated JSON feed, extracts each post's
    publication date and URL, and appends the in-range URLs to
    self.start_urls without duplicates.

    :param startDate: inclusive start of the range (date-parseable string);
        snapped to the first day of its month.
    :param endDate: inclusive end of the range (date-parseable string);
        snapped to the last second of the last day of its month.
    :raises ValueError: if the date range fails validation.
    :raises urllib2.HTTPError: if the initial JSON page cannot be fetched.
    """
    urls = []
    # Let validation errors propagate directly.  The original wrapped this
    # in try/except and re-raised ValueError(e.message), which loses the
    # traceback and relies on the Py2-only ``message`` attribute.
    toolbox.validate_date_range(startDate, endDate)
    # Normalize both bounds to naive datetimes covering whole months.
    startDate = dparser.parse(startDate).replace(day=1).replace(tzinfo=None)
    endDate = dparser.parse(endDate).replace(tzinfo=None)
    last_day = calendar.monthrange(endDate.year, endDate.month)[1]
    endDate = endDate.replace(day=last_day) + datetime.timedelta(hours=23, minutes=59, seconds=59)
    try:
        json_page = self.fetch_json_doc(0)
    except urllib2.HTTPError:
        # Re-raise the original error.  Constructing a new
        # HTTPError('message') is invalid -- its constructor requires
        # (url, code, msg, hdrs, fp) -- and would mask the real failure.
        raise
    i = 0
    done = False
    while not done and json_page['more_posts_next_page']:
        for n in json_page['posts']:
            sel = Selector(text=n['html'])
            pubDate = dparser.parse(safepop(sel.xpath('//time/@datetime').extract(), 0)).replace(tzinfo=None)
            if startDate <= pubDate <= endDate:
                urls.append(safepop(sel.xpath('//div[@data-url]/attribute::data-url').extract(), 0))
            if pubDate < startDate:
                # The feed is presumably reverse-chronological (the
                # original already broke out of the inner loop here), so
                # every later page is older than the range: stop paging
                # instead of fetching the remaining pages for nothing.
                done = True
                break
        else:
            # Only fetch the next page when we did not hit the cutoff.
            i += 1
            json_page = self.fetch_json_doc(i)
    # Append collected URLs, skipping ones already queued.  A set lookup
    # replaces the original O(n^2) ``self.start_urls.count(url)`` scan.
    seen = set(self.start_urls)
    for url in urls:
        if url not in seen:
            seen.add(url)
            self.start_urls.append(url)
    super(MySpider, self).__init__(*args, **kwargs)
def parse_item(self, response):
    """Parse one Vox article page into a populated BlogItem."""
    selector = Selector(response)
    post = BlogItem()
    post['blog_name'] = "Vox"
    post['url'] = response.url
    # Release date is parsed fuzzily out of the byline paragraph.
    byline = mergeListElements(selector.xpath('/html/body/div[2]/div/div[2]/p/text()').extract())[0]
    post['releasedate'] = dparser.parse(byline, fuzzy=True)
    post['crawldate'] = datetime.now().isoformat()
    author_raw = safepop(selector.xpath('/html/body/div[2]/div/div[2]/p/strong/text()').extract(), 0)
    post['author'] = self.extract_authors(author_raw)
    post['headline'] = safepop(selector.xpath('/html/body/div[2]/div/div[2]/h1/text()').extract(), 0).strip()
    # Body is everything in the article content before the "Reference" heading.
    body_parts = selector.xpath('//div[@class="article-content"]/h1[contains(., "Reference")]/preceding-sibling::*/text()').extract()
    post['body'] = safepop(mergeListElements(body_parts), 0)
    post['links'] = selector.xpath('//div[@class="format_text entry-content"]/p[@class="to_comments"]/preceding-sibling::*/descendant-or-self::*/text()').extract()
    # References are everything after the "Reference" heading.
    post['references'] = self.extract_references(selector.xpath('//div[@class="article-content"]/h1[contains(., "Reference")]/following-sibling::*'))
    post['comments'] = ""
    post['tags'] = ""
    teaser_parts = selector.xpath('//div[contains(@class, "teaser")]/descendant-or-self::*/text()').extract()
    post["teaser"] = safepop(mergeListElements(teaser_parts), 0).strip()
    self.log("parsed %s successfully" % response.url, level=log.INFO)
    return post
def parse_item(self, response):
    """Parse one MarginalRevolution article page into a populated BlogItem."""
    self.log(response.url, level=log.DEBUG)
    selector = Selector(response)
    post = BlogItem()
    post['blog_name'] = "MarginalRevolution"
    post['url'] = response.url
    # The headline meta paragraph carries the date; parse it fuzzily.
    meta_text = mergeListElements(selector.xpath('//p[@class="headline_meta"]/text()').extract())[0]
    post['releasedate'] = dparser.parse(meta_text, fuzzy=True)
    post['crawldate'] = datetime.now().isoformat()
    post['author'] = safepop(selector.xpath('//a[@rel="author"]/text()').extract(), 0)
    post['headline'] = safepop(selector.xpath('//h1[@class="entry-title"]/text()').extract(), 0)
    # Body is everything before the "to_comments" paragraph.
    body_parts = selector.xpath('//div[@class="format_text entry-content"]/p[@class="to_comments"]/preceding-sibling::*/descendant-or-self::text()').extract()
    post['body'] = safepop(mergeListElements(body_parts), 0)
    post['links'] = selector.xpath('//div[@class="article-content"]/h1[contains(., "Reference")]/following-sibling::*/a/@href').extract()
    post['references'] = ""
    post['comments'] = ""
    post['tags'] = selector.xpath('//span[@class="post-footers"]/a[@rel="author"]/following-sibling::*/text()').extract()
    post["teaser"] = ""
    self.log("parsed %s successfully" % response.url, level=log.INFO)
    return post
def parse_item(self, response):
    """Parse one Zero Hedge article page into a populated BlogItem."""
    self.log(response.url, level=log.INFO)
    sel = Selector(response)
    item = BlogItem()
    item['blog_name'] = "Zero Hedge"
    item['url'] = response.url
    # The last text node of the "submitted" span carries the date.
    item['releasedate'] = dparser.parse(sel.xpath('//span[@class="submitted"]/text()').extract()[-1], fuzzy=True)
    item['crawldate'] = datetime.now().isoformat()
    item['author'] = safepop(sel.xpath('//span[@class="submitted"]/a/text()').extract(), 0)
    item['headline'] = safepop(sel.xpath('//h1[@class="print-title"]/text()').extract(), 0)
    item['body'] = safepop(mergeListElements(sel.xpath('//div[@class="content"]/descendant-or-self::text()').extract()), 0)
    item['links'] = self.extract_links(sel.xpath('//div[@class="print-links"]/p/text()').extract())
    item['references'] = ""
    item['comments'] = ""
    # The last two elements are always "permalink" and "comment", hence they
    # are discarded.  NOTE(review): the original comment said so but the code
    # kept them; the parallel Economist's View parser slices [0:-2], so the
    # missing slice is presumed an oversight -- confirm against live markup.
    item['tags'] = sel.xpath('//ul[@class="links"]/li/a/text()').extract()[0:-2]
    item["teaser"] = ""
    self.log("parsed %s successfully" % response.url, level=log.INFO)
    return item
def parse_item(self, response):
    """Parse one 'The Big Picture' article page into a populated BlogItem."""
    self.log(response.url, level=log.DEBUG)
    selector = Selector(response)
    post = BlogItem()
    post['blog_name'] = "The BIG Picture"
    post['url'] = response.url
    # The byline's last text node carries the date; parse it fuzzily.
    post['releasedate'] = dparser.parse(selector.xpath('//p[@class="byline"]/text()').extract()[-1], fuzzy=True)
    post['crawldate'] = datetime.now().isoformat()
    post['author'] = safepop(selector.xpath('//span[@class="author"]/text()').extract(), 0)
    post['headline'] = safepop(selector.xpath('//div[@class="headline"]/h2/a/text()').extract(), 0)
    # Body and links live in the siblings that precede the category-tag block.
    body_parts = selector.xpath('//a[@rel="category tag"]/parent::*/preceding-sibling::*/descendant-or-self::*/text()').extract()
    post['body'] = safepop(mergeListElements(body_parts), 0)
    post['links'] = selector.xpath('//a[@rel="category tag"]/parent::*/preceding-sibling::*/descendant-or-self::*/@href').extract()
    post['references'] = ""
    post['comments'] = selector.xpath('//div[@class="comment-meta commentmetadata"]/following-sibling::p/text()').extract()
    post['tags'] = selector.xpath('//a[@rel="category tag"]/text()').extract()
    post["teaser"] = ""
    self.log("parsed %s successfully" % response.url, level=log.INFO)
    return post
def parse_item(self, response):
    """Parse one naked capitalism article page into a populated BlogItem."""
    self.log(response.url, level=log.DEBUG)
    selector = Selector(response)
    post = BlogItem()
    post['blog_name'] = "naked capitalism"
    post['url'] = response.url
    date_text = safepop(selector.xpath('//time[@class="entry-date date updated"]/text()').extract(), 0)
    post['releasedate'] = dparser.parse(date_text, fuzzy=True)
    post['crawldate'] = datetime.now().isoformat()
    post['author'] = safepop(selector.xpath('//a[@rel="author"]/text()').extract(), 0)
    post['headline'] = safepop(selector.xpath('//h1[@class="entry-title"]/descendant-or-self::text()').extract(), 0)
    # Article text and outbound links both come from the pf-content div.
    body_parts = selector.xpath('//div[@class="pf-content"]/descendant-or-self::text()').extract()
    post['body'] = safepop(mergeListElements(body_parts), 0)
    post['links'] = selector.xpath('//div[@class="pf-content"]/descendant-or-self::a/@href').extract()
    post['references'] = ""
    post['comments'] = self.extract_comments(selector)
    post['tags'] = selector.xpath('//footer[@class="entry-meta"]/a/text()').extract()
    post["teaser"] = ""
    self.log("parsed %s successfully" % response.url, level=log.INFO)
    return post
def parse_item(self, response):
    """Parse one econbrowser article page into a populated BlogItem."""
    self.log(response.url, level=log.DEBUG)
    selector = Selector(response)
    post = BlogItem()
    post['blog_name'] = "econbrowser"
    post['url'] = response.url
    date_text = safepop(selector.xpath('//time[@class="entry-date"]/@datetime').extract(), 0)
    post['releasedate'] = dparser.parse(date_text, fuzzy=True)
    post['crawldate'] = datetime.now().isoformat()
    post['author'] = safepop(selector.xpath('//a[@rel="author"]/text()').extract(), 0)
    post['headline'] = safepop(selector.xpath('//h1[@class="entry-title"]/descendant-or-self::text()').extract(), 0)
    # Body and links are whatever precedes the addtoany share container.
    body_parts = selector.xpath('//div[@class="entry-content"]/div[@class="addtoany_share_save_container addtoany_content_bottom"]/preceding-sibling::*/descendant-or-self::text()').extract()
    post['body'] = safepop(mergeListElements(body_parts), 0)
    post['links'] = selector.xpath('//div[@class="entry-content"]/div[@class="addtoany_share_save_container addtoany_content_bottom"]/preceding-sibling::*/descendant-or-self::a/@href').extract()
    post['references'] = ""
    post['comments'] = self.extract_comments(selector)
    post['tags'] = ""
    post["teaser"] = ""
    self.log("parsed %s successfully" % response.url, level=log.INFO)
    return post
def parse_item(self, response):
    """Parse one Brad DeLong article page into a populated BlogItem."""
    self.log(response.url, level=log.DEBUG)
    selector = Selector(response)
    post = BlogItem()
    post['blog_name'] = "Brad DeLong's Grasping Reality..."
    post['url'] = response.url
    # The date-header h2 carries the publication date; parse it fuzzily.
    header_text = mergeListElements(selector.xpath('//h2[@class="date-header"]/text()').extract())[0]
    post['releasedate'] = dparser.parse(header_text, fuzzy=True)
    post['crawldate'] = datetime.now().isoformat()
    post['author'] = safepop(selector.xpath('//a[@rel="author"]/text()').extract(), 0)
    post['headline'] = safepop(selector.xpath('//h3[@class="entry-header"]/text()').extract(), 0)
    body_parts = selector.xpath('//div[@class="entry-body"]/descendant-or-self::text()').extract()
    post['body'] = safepop(mergeListElements(body_parts), 0)
    post['links'] = selector.xpath('//div[@class="entry-body"]/descendant-or-self::a/@href').extract()
    post['references'] = ""
    post['comments'] = ""
    post['tags'] = selector.xpath('//span[@class="post-footers"]/a[@rel="author"]/following-sibling::*/text()').extract()
    post["teaser"] = ""
    self.log("parsed %s successfully" % response.url, level=log.INFO)
    return post
def parse(self, response):
    """Parse one 'Conscience of a Liberal' post page into a populated BlogItem.

    :param response: the downloaded page response.
    :returns: a BlogItem with the post's metadata, body text and links.
    """
    sel = Selector(response)
    item = BlogItem()
    item['blog_name'] = "The Conscience of a Liberal"
    # NOTE(review): unlike the sibling parsers, the URL is taken from a
    # data-url attribute rather than response.url -- presumably deliberate.
    item['url'] = safepop(sel.xpath('//div[@data-url]/attribute::data-url').extract(), 0)
    item['releasedate'] = dparser.parse(safepop(sel.xpath('//time/@datetime').extract(), 0)).replace(tzinfo=None)
    item['crawldate'] = datetime.datetime.now().isoformat()
    item['author'] = "Paul Krugman"
    item['headline'] = safepop(sel.xpath('//h1[@class="entry-title"]/text()').extract(), 0)
    item['body'] = safepop(toolbox.mergeListElements(sel.xpath('//p[@class="story-body-text"]/text() | //p[@class="story-body-text"]/a/text()').extract()), 0)
    # Collect the outbound hrefs of the story body (comprehension replaces
    # the original manual append loop; a commented-out log call was removed).
    item['links'] = [href for href in sel.xpath("//p[@class='story-body-text']/a/attribute::href").extract()]
    item['references'] = ""
    item['comments'] = ""
    item['tags'] = ""
    item['teaser'] = ""
    return item
def parse_item(self, response):
    """Parse one Economist's View article page into a populated BlogItem."""
    self.log(response.url, level=log.DEBUG)
    selector = Selector(response)
    post = BlogItem()
    post['blog_name'] = "Economist's View"
    post['url'] = response.url
    # The date-header h2 carries the publication date; parse it fuzzily.
    header_text = mergeListElements(selector.xpath('//h2[@class="date-header"]/text()').extract())[0]
    post['releasedate'] = dparser.parse(header_text, fuzzy=True)
    post['crawldate'] = datetime.now().isoformat()
    post['author'] = safepop(selector.xpath('//a[@rel="author"]/text()').extract(), 0)
    post['headline'] = safepop(selector.xpath('//h3[@class="entry-header"]/a/text()').extract(), 0)
    body_parts = selector.xpath('//div[@class="entry-body"]/descendant-or-self::text()').extract()
    post['body'] = safepop(mergeListElements(body_parts), 0)
    post['links'] = selector.xpath('//div[@class="entry-body"]/descendant-or-self::a/@href').extract()
    post['references'] = ""
    post['comments'] = ""
    # The last two elements are always "permalink" and "comment",
    # hence they are discarded.
    post['tags'] = selector.xpath('//p[@class="posted"]/a[@rel="author"]/following-sibling::*/text()').extract()[0:-2]
    post["teaser"] = ""
    self.log("parsed %s successfully" % response.url, level=log.INFO)
    return post
def parse(self, response):
    """Parse one 'The Upshot' article page into a populated BlogItem."""
    selector = Selector(response)
    post = BlogItem()
    post['blog_name'] = "The Upshot"
    post['url'] = response.url
    post['releasedate'] = selector.xpath('//p[@class="byline-dateline"]/time[@class="dateline"]/attribute::datetime').extract()
    post['crawldate'] = datetime.datetime.now().isoformat()
    post['author'] = selector.xpath('//span[@class="byline-author"]/text()').extract()
    post['headline'] = selector.xpath('//h1[@class="story-heading" and @itemprop="headline"]/text()').extract()
    post['body'] = safepop(mergeListElements(selector.xpath('//p[@class="story-body-text story-content"]/text() | //p[@class="story-body-text story-content"]/a/text()').extract()), 0)
    # Outbound hrefs of the story body (comprehension instead of append loop).
    post['links'] = [href for href in selector.xpath("//p[@class='story-body-text story-content']/a/attribute::href").extract()]
    post['references'] = ""
    post['comments'] = ""
    post['tags'] = ""
    post['teaser'] = ""
    self.log("parsed %s successfully" % response.url, level=log.INFO)
    return post