def extract_links(self, response):
    """Collect links from the configured locations, resolved to absolute URLs.

    Honours a ``<base href>`` tag when present, optionally de-duplicates
    (``self.unique``) and canonicalizes (``self.canonicalize``) the URLs.
    """
    selector = HtmlXPathSelector(response)
    base_candidates = selector.select('//base/@href').extract()
    if base_candidates:
        base_url = urljoin_rfc(response.url, base_candidates[0])
    else:
        base_url = response.url

    raw_links = []
    for location in self.locations:
        if isinstance(location, basestring):
            node_list = selector.select(location)
        elif isinstance(location, HtmlXPathSelector):
            node_list = [location]
        elif isinstance(location, XPathSelectorList):
            node_list = location
        else:
            # Unsupported location type: skip it silently.
            continue
        for node in node_list:
            raw_links.extend(self.extract_from_selector(node, response.encoding))

    seen_urls = set()
    result = []
    for link in raw_links:
        link.url = urljoin_rfc(base_url, link.url, response.encoding)
        if self.unique:
            if link.url in seen_urls:
                continue
            seen_urls.add(link.url)
        if self.canonicalize:
            link.url = canonicalize_url(link.url)
        result.append(link)
    return result
def extract_links(self, response):
    """Extract links from ``self.locations``, returning them with absolute URLs."""
    xs = HtmlXPathSelector(response)
    # Prefer an explicit <base href="..."> as the join base, else the response URL.
    base_url = xs.select('//base/@href').extract()
    base_url = urljoin_rfc(response.url, base_url[0]) if base_url else response.url
    links = []
    for location in self.locations:
        # A location may be an XPath string, a single selector, or a selector list.
        if isinstance(location, basestring):
            selectors = xs.select(location)
        elif isinstance(location, (XPathSelectorList, HtmlXPathSelector)):
            selectors = [location] if isinstance(location, HtmlXPathSelector) else location
        else:
            # Anything else is ignored.
            continue
        for selector in selectors:
            links.extend(self.extract_from_selector(selector, response.encoding))
    seen, ret = set(), []
    for link in links:
        # Make the URL absolute before de-duplication / canonicalization.
        link.url = urljoin_rfc(base_url, link.url, response.encoding)
        if self.unique:
            # Dedup happens on the absolute (pre-canonicalized) URL.
            if link.url in seen:
                continue
            else:
                seen.add(link.url)
        if self.canonicalize:
            link.url = canonicalize_url(link.url)
        ret.append(link)
    return ret
def parse_item(self, response): hxs = HtmlXPathSelector(response) # Assign given elements article = ArticleItem() article['source'] = self.source article['url'] = response.url # Parse Category try: category_str = sanitize(self.get_category(response)) article['category'] = capitalizeFirstCharInWord( self.normalize_category(self.parse_category(category_str))) except: article['category'] = '' # Parse Title try: article['title'] = sanitize( hxs.select(self.xpath_title).extract()[0]) except: article['title'] = '' # Parse Content paragraphs = hxs.select(self.xpath_content).extract() lines = [] for paragraph in paragraphs: line = sanitize(paragraph) if len(line) > 0: lines.append(line) article['content'] = '\n'.join(lines) # Parse Subtitle try: article['subtitle'] = capitalizeFirstCharInWord( sanitize(hxs.select(self.xpath_subtitle).extract()[0])) except: article['subtitle'] = '' # Parse Published_at try: date_str = sanitize( hxs.select(self.xpath_published_at).extract()[0]) article['published_at'] = self.parse_date(date_str) except: article['published_at'] = '' # Parse Place try: place_str = sanitize(hxs.select(self.xpath_place).extract()[0]) article['place'] = capitalizeFirstCharInWord( self.parse_place(place_str)) except: article['place'] = '' # Parse Author try: author_str = sanitize(hxs.select(self.xpath_author).extract()[0]) article['author'] = capitalizeFirstCharInWord( self.parse_author(author_str)) except: article['author'] = '' # Debug if self.debug == True: print article print '' else: return article
def get_category(self, response):
    """Return the first raw string matched by ``self.xpath_category``."""
    selector = HtmlXPathSelector(response)
    matches = selector.select(self.xpath_category).extract()
    # An IndexError on an empty match list is intentional: the caller
    # (parse_item) wraps this call in try/except and falls back to ''.
    return matches[0]
def parse_item(self, response): hxs = HtmlXPathSelector(response) # Assign given elements article = ArticleItem() article['source'] = self.source article['url'] = response.url # Parse Category try: category_str = sanitize(self.get_category(response)) article['category'] = capitalizeFirstCharInWord(self.normalize_category(self.parse_category(category_str))) except: article['category'] = '' # Parse Title try: article['title'] = sanitize(hxs.select(self.xpath_title).extract()[0]) except: article['title'] = '' # Parse Content paragraphs = hxs.select(self.xpath_content).extract() lines = [] for paragraph in paragraphs: line = sanitize(paragraph) if len(line) > 0: lines.append(line) article['content'] = '\n'.join(lines) # Parse Subtitle try: article['subtitle'] = capitalizeFirstCharInWord(sanitize(hxs.select(self.xpath_subtitle).extract()[0])) except: article['subtitle'] = '' # Parse Published_at try: date_str = sanitize(hxs.select(self.xpath_published_at).extract()[0]) article['published_at'] = self.parse_date(date_str) except: article['published_at'] = '' # Parse Place try: place_str = sanitize(hxs.select(self.xpath_place).extract()[0]) article['place'] = capitalizeFirstCharInWord(self.parse_place(place_str)) except: article['place'] = '' # Parse Author try: author_str = sanitize(hxs.select(self.xpath_author).extract()[0]) article['author'] = capitalizeFirstCharInWord(self.parse_author(author_str)) except: article['author'] = '' # Debug if self.debug == True: print article print '' else: return article