def store_response(self, spider, request, response):
    data = {
        'status': response.status,
        'domain': get_domain(response.url),
        'url': response.url,
        'headers': self._clean_headers(response.headers),
        'html': response.body,
    }
    self.db[self.COLLECTION_NAME].insert_one(data)
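# The storage methods above lean on two helpers that are not shown in this
# listing. A minimal sketch of what they might look like — get_domain and
# _clean_headers exist in the original code, but these bodies are
# assumptions, not the original implementations:
from urllib.parse import urlparse


def get_domain(url):
    # assumed behaviour: return the bare host portion of the URL
    return urlparse(url).netloc


def _clean_headers(self, headers):
    # assumed behaviour: turn Scrapy's bytes-keyed Headers object into a
    # plain dict of strings, which BSON/JSON stores can serialise
    return {
        key.decode('utf-8'): [value.decode('utf-8') for value in values]
        for key, values in headers.items()
    }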
def store_response(self, spider, request, response):
    data = {
        'status': response.status,
        'domain': get_domain(response.url),
        'url': response.url,
        # decode the body with the response's declared encoding instead of
        # round-tripping through str() and hand-stripping the b'...' repr
        'html': response.text.replace('\n', '').replace('\t', ''),
        'created': datetime.now(),
    }
    data.update(
        self._flatten_headers(self._clean_headers(response.headers)))
    WebLink(meta={'id': get_urn(response.url)}, **data).save()
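# WebLink is an elasticsearch-dsl Document referenced above but not shown.
# A minimal sketch under that assumption — the field list and index name
# are guesses, as is the body of _flatten_headers:
from elasticsearch_dsl import Date, Document, Integer, Keyword, Text


class WebLink(Document):
    status = Integer()
    domain = Keyword()
    url = Keyword()
    html = Text()
    created = Date()

    class Index:
        name = 'weblinks'  # hypothetical index name


def _flatten_headers(self, headers):
    # assumed behaviour: hoist each header to a top-level field; the
    # underscore rewrite keeps the keys safe to pass as **kwargs
    return {'header_%s' % key.lower().replace('-', '_'): value
            for key, value in headers.items()}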
def parse_node(self, response, node):
    # node.select() was removed in Scrapy 1.0; xpath() is the replacement
    title = self.get_or_none(node.xpath('title/text()'))
    url = self.get_or_none(node.xpath('link/text()'))
    description = self.get_or_none(node.xpath('description/text()'))
    pub_date = self.get_or_none(node.xpath('pubDate/text()'))
    category = self.get_or_none(node.xpath('category/text()'))
    # image = node.xpath('item/media:content/url')

    item = {
        'title': title,
        'url': url,
        'pub_date': pub_date,
        'category': category,
        'description': description,
        'domain': get_domain(response.url),
    }
    return item
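# get_or_none is used above but not shown. A minimal sketch, assuming it
# returns the first extracted string from a selector, or None when the
# XML element is missing from the feed item:
def get_or_none(self, selector):
    if selector:
        values = selector.extract()
        if values:
            return values[0]
    return None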
def parse(self, response):
    # parse downloaded content with feedparser
    # (NOT re-downloading with feedparser)
    feed = self.parse_feed(response.body)
    if feed:
        # grab some feed elements
        # - https://pythonhosted.org/feedparser/common-rss-elements.html
        # - https://pythonhosted.org/feedparser/common-atom-elements.html
        # ns = feed.namespaces
        # feed_title = feed.feed.title
        # feed_link = feed.feed.link
        # feed_desc = feed.feed.description

        for entry in feed.entries:
            # have content?
            content = entry.get('content')
            if content:
                # content = content[0]
                content = content[0]['value']

            item = {
                # global feed data
                # 'feed_title': feed_title,
                # 'feed_link': feed_link,
                # 'feed_description': feed_desc,

                # item entry data
                # 'url': response.url,
                'url': entry.link,
                'title': entry.title,
                'domain': get_domain(response.url),
                'description': entry.description,
                # 'date': entry.published,
                # 'date': entry.published_parsed,
                'pub_date': entry.updated_parsed,

                # optional
                'content': content,
                'type': entry.get('dc_type'),
            }
            yield item
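# parse_feed wraps feedparser over the already-downloaded body. A minimal
# sketch, assuming it returns None for unusable payloads — feedparser
# itself never raises on bad input, it just sets the `bozo` flag:
import feedparser


def parse_feed(self, body):
    feed = feedparser.parse(body)
    if feed.bozo and not feed.entries:
        return None
    return feed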
def store_response(self, spider, request, response):
    data = {
        'status': response.status,
        'domain': get_domain(response.url),
        'url': response.url,
        # decode the body with the response's declared encoding instead of
        # hand-stripping the b'...' repr of the raw bytes
        'html': response.text.replace('\n', '').replace('\t', ''),
        'created': datetime.now(),
    }
    data.update(
        self._flatten_headers(self._clean_headers(response.headers)))
    data = self.map_to_solr_datatypes(data=data)
    data['id'] = get_urn(response.url)
    self.solr.add([data])
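# map_to_solr_datatypes and get_urn are referenced above but not shown.
# Minimal sketches under stated assumptions: a Solr schema that uses
# dynamic-field suffixes (*_i, *_s, *_dt), and a hash-based URN as the
# stable document id — both conventions are guesses:
import hashlib
from datetime import datetime


def get_urn(url):
    # hypothetical: a deterministic URN derived from the URL's hash
    return 'urn:weblink:%s' % hashlib.sha1(url.encode('utf-8')).hexdigest()


def map_to_solr_datatypes(self, data):
    # assumed convention: append the dynamic-field suffix matching each
    # value's Python type; anything unrecognised falls back to string
    suffixes = {int: '_i', str: '_s', datetime: '_dt'}
    return {key + suffixes.get(type(value), '_s'): value
            for key, value in data.items()}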