def get_node(text, namespaces=None):
    """Build an XML selector over ``text`` and register optional namespaces.

    Args:
        text: Markup passed to ``Selector`` as ``text``; it is parsed with
            ``type="xml"``.
        namespaces: Optional iterable of indexable entries. For each entry,
            item 0 and item 1 are handed to ``register_namespace`` in that
            order (prefix, then URI). A falsy value registers nothing.

    Returns:
        The ``Selector`` created for ``text``.
    """
    selector = Selector(text=text, type="xml")

    # Nothing to register: hand back the bare selector.
    if not namespaces:
        return selector

    for entry in namespaces:
        prefix = entry[0]
        uri = entry[1]
        selector.register_namespace(prefix, uri)
    return selector
def parse_node(self, response, node):
    """Yield one item dict for ``node`` when it is one of today's articles.

    Nodes rejected by ``is_todays_article`` produce nothing.

    Args:
        response: The feed response; used to build a response-wide selector
            for the ``wsj:articletype`` lookup.
        node: Selector for a single feed entry; ``title``, ``link``,
            ``description`` and ``pubDate`` are read from it.

    Yields:
        A dict with the keys ``title``, ``link``, ``description``, ``date``,
        ``categories``, ``source`` and ``sentiment``.
    """
    # Guard first so that the response is not re-wrapped in a Selector for
    # nodes that are going to be discarded anyway.
    if not is_todays_article(node):
        return

    sel = Selector(response)
    sel.register_namespace("wsj", "http://dowjones.net/rss/")

    # ``get()`` returns None when the element is missing, which made the
    # chained ``.strip()`` raise AttributeError; fall back to "" instead.
    title = node.xpath('title/text()').get(default="").strip()

    # Passed through unchanged (may be None): how ``remove_html`` treats a
    # missing description is not visible from this block.
    description = remove_html(node.xpath('description/text()').get())

    # NOTE(review): this XPath starts with ``//`` and runs on the selector
    # built from the whole response, so it collects every articletype in the
    # feed rather than only this node's -- confirm that is intended.
    article_types = sel.xpath('//wsj:articletype/text()').getall()

    yield {
        "title": title,
        "link": node.xpath('link/text()').get(default="").strip(),
        "description": description,
        "date": transform_date(node.xpath('pubDate/text()').get()),
        "categories": self.get_categories(
            article_types, title, self.category_classifier),
        "source": "Wallstreet Journal",
        "sentiment": self.sentiment_classifier.classify(
            "{} {}".format(title, description)),
    }