def scrape_release(source, feed, entry, link):
    """Fetch the content behind `link` and create or update the Release row.

    Pulls the title and publication date from the feed `entry`, downloads
    the page body via get_link_content, and upserts a Release keyed on url.
    Returns nothing; silently skips entries whose body cannot be fetched.
    """
    title_text = entry.get('title') or u''  # entry may have no title; avoid AttributeError on None
    # BUG FIX: byte strings must be *decoded* to unicode. The original called
    # .encode('utf-8', 'ignore') on non-unicode input, which in Python 2
    # implicitly decodes as ascii first and raises UnicodeDecodeError on any
    # non-ascii title.
    if not isinstance(title_text, unicode):
        title_text = title_text.decode('utf-8', 'ignore')
    title = kill_control_characters(title_text)

    # Feeds disagree on which date field they populate; fall back to "now"
    # when none is present.
    date_text = (entry.get('published')
                 or entry.get('updated')
                 or entry.get('a10:updated'))
    date = dateutil.parser.parse(date_text) if date_text else now()

    body = get_link_content(link)
    if body is None:
        # Link was a 404 / non-HTML / otherwise unusable; nothing to store.
        return

    try:
        # Does not use get_or_create because the unique constraint is just
        # the url and we don't want the source foreign key field to ever be
        # null.
        release = Release.objects.get(url=link)
        release.title = title
        release.date = date
        release.body = body
        release.source = source
        release.save()
    except Release.DoesNotExist:
        release = Release.objects.create(url=link, source=source, title=title,
                                         date=date, body=body)
def body(self):
    """Return the cleaned article text for this object's url.

    The text is fetched and extracted lazily on first access and cached
    on the instance thereafter. Raises requests.HTTPError (via
    raise_for_status) if the fetch fails.
    """
    if self._body is not None:
        return self._body
    response = requests.get(self.url)
    response.raise_for_status()
    _discarded_title, extracted = readability_extract(response.content)
    self._body = kill_control_characters(extracted)
    return self._body
def get_link_content(link):
    """Fetch `link` and return its extracted, cleaned HTML body text.

    Returns None (after logging a warning) when the link should simply be
    skipped: 404 response, invalid URL, missing Content-Type, non-HTML
    content, or an empty body. Raises for any other non-200 status.
    """
    try:
        response = requests.get(link)
        # BUG FIX: the original tested status_code == 400 while the log
        # message (and the skip-it intent) is for 404. Real 404s fell
        # through to the exception below instead of being skipped.
        if response.status_code == 404:
            logging.warn(u"404 {}".format(link))
            return None
        if response.status_code != 200:
            raise Exception(u"Unable to fetch release content: {0}".format(link))
    except requests.exceptions.InvalidURL as e:
        logging.warn(u"Invalid link {0}: {1}".format(link, unicode(e)))
        return None

    content_type = response.headers.get('content-type')
    if not content_type:
        logging.warn(u"Response did not contain a Content-Type header: {0}".format(link))
        return None

    (mime_type, mime_subtype, mt_params) = parse_mime_type(content_type)
    if mime_type != 'text' or mime_subtype not in ('html', 'xhtml'):
        logging.warn(u"Skipping non-HTML link: {0}".format(link))
        return None

    if len(response.content) == 0:
        logging.warn(u"Server returned an empty body: {0}".format(link))
        return None

    (title, body) = readability_extract(response.content)
    return kill_control_characters(body)
def handle(self, *args, **options):
    """Management-command entry point: re-scrape the body of each release URL.

    For every HTTP(S) url in `args`, re-fetches the page content, cleans
    the stored title, and saves the Release. Non-HTTP links are skipped
    with a warning; any per-url failure is logged and does not abort the
    remaining urls.
    """
    if not hasattr(settings, "SUPERFASTMATCH"):
        raise CommandError("You must configure SUPERFASTMATCH in your project settings.")
    self.sfm = from_django_conf()
    for url in args:
        try:
            if url.startswith(("http://", "https://")):
                release = Release.objects.get(url=url)
                body = get_link_content(release.url)
                # get_link_content returns None for 404s / non-HTML pages;
                # don't overwrite the stored body with None in that case.
                if body is None:
                    logging.warning("Could not fetch content for {0}".format(url))
                    continue
                release.title = kill_control_characters(release.title)
                release.body = body
                release.updated = now()
                release.save()
                logging.info("Updated release {0}: {1}".format(release.id, release.url))
            else:
                # BUG FIX: the original referenced release.url here, but
                # `release` is never bound in this branch -- it raised
                # NameError (swallowed by the except below) instead of
                # logging the skipped link.
                logging.warning("Skipping non-HTTP link {0}".format(url))
        except Exception as e:
            logging.error("Failed to rescrape {0}: {1}".format(url, str(e)))