def get_link_content(link):
    """Fetch *link* and return its extracted, cleaned HTML body text.

    The function is deliberately forgiving: links that cannot yield usable
    HTML are logged as warnings and skipped by returning ``None`` rather
    than raising.

    Args:
        link: URL to fetch with ``requests.get``.

    Returns:
        The body produced by ``readability_extract`` after being passed
        through ``kill_control_characters``, or ``None`` when:

        * the URL is rejected by requests as invalid,
        * the server answers 400 or 404,
        * the response has no Content-Type header,
        * the content type is not ``text/html`` or ``text/xhtml``,
        * the response body is empty.

    Raises:
        Exception: if the server answers with any status other than
            200, 400 or 404.

    Other ``requests`` exceptions (connection errors, timeouts, ...) are
    not caught here and propagate to the caller.
    """
    # Only the request itself can raise InvalidURL, so keep the try minimal.
    try:
        response = requests.get(link)
    except requests.exceptions.InvalidURL as e:
        logging.warning(u"Invalid link {0}: {1}".format(link, unicode(e)))
        return None

    status = response.status_code
    # The original tested for 400 while logging "404"; treat both client
    # errors as "skip this link" and report the status actually received.
    if status in (400, 404):
        logging.warning(u"{0} {1}".format(status, link))
        return None
    if status != 200:
        raise Exception(u"Unable to fetch release content: {0}".format(link))

    content_type = response.headers.get('content-type')
    if not content_type:
        logging.warning(
            u"Response did not contain a Content-Type header: {0}".format(link))
        return None

    # NOTE(review): parse_mime_type is assumed to return a
    # (type, subtype, params) triple -- confirm against its definition.
    (mime_type, mime_subtype, _mt_params) = parse_mime_type(content_type)
    if mime_type != 'text' or mime_subtype not in ('html', 'xhtml'):
        logging.warning(u"Skipping non-HTML link: {0}".format(link))
        return None

    if not response.content:
        logging.warning(u"Server returned an empty body: {0}".format(link))
        return None

    # The extracted title is not needed here; only the body is returned.
    (_title, body) = readability_extract(response.content)
    return kill_control_characters(body)
def body(self):
    """Return the cleaned body text for ``self.url``, fetching it at most once.

    The result is cached on ``self._body``; while that attribute is not
    ``None`` no request is made. On a cache miss the URL is fetched,
    ``raise_for_status()`` is called on the response (so HTTP error
    statuses surface as exceptions from requests), and the body extracted
    by ``readability_extract`` is run through ``kill_control_characters``
    before being stored.
    """
    # Fast path: a previous call already populated the cache.
    if self._body is not None:
        return self._body

    page = requests.get(self.url)
    page.raise_for_status()

    # Only the body half of the extraction result is kept.
    extracted = readability_extract(page.content)[1]
    self._body = kill_control_characters(extracted)
    return self._body
def extract(self, link):
    """Download *link* and build a document dict describing it.

    The raw response content is run through ``readability_extract`` to get
    a title and body. The date comes from a method on ``self`` whose name
    is built from ``self.extra['leader']`` (``parse_<leader>_date``); it is
    called with the extracted body, the raw content and the link.

    Returns:
        A dict with the keys ``url``, ``title``, ``text``, ``date`` and
        ``source`` (the latter taken from ``self.sources[self.index]``).

    No HTTP status check is performed here; whatever content the server
    returned is parsed as-is.
    """
    raw_content = requests.get(link).content
    extracted = readability_extract(raw_content)
    title, text = extracted

    # Dispatch to the leader-specific date parser by name.
    parser_name = 'parse_%s_date'% self.extra['leader']
    parse_date = getattr(self, parser_name)
    published = parse_date(text, raw_content, link)

    return {
        'url': link,
        'title': title,
        'text': text,
        'date': published,
        'source': self.sources[self.index],
    }