def save(self, *args, **kwargs):
    """Persist the source, then close out open failures if it is healthy.

    All positional and keyword arguments are forwarded unchanged to the
    parent class's ``save``. After saving, when ``last_failure`` is None,
    every related scrape failure whose ``resolved`` field is still null is
    stamped with the current time and saved.
    """
    super(Source, self).save(*args, **kwargs)

    # A recorded last_failure means there is nothing to resolve yet.
    if self.last_failure is not None:
        return

    unresolved = self.scrape_failures.filter(resolved__isnull=True)
    for open_failure in unresolved:
        # now() is evaluated per row, so each failure gets its own timestamp.
        open_failure.resolved = now()
        open_failure.save()
def scrape_release(source, feed, entry, link):
    """Create or update the Release stored for ``link`` from a feed entry.

    Parameters:
        source: object stored as the release's ``source``.
        feed: not used by this function; kept for the caller-facing signature.
        entry: mapping-like feed entry; read via ``entry.get`` for the keys
            'title', 'published', 'updated' and 'a10:updated'.
        link: URL of the release; used both to fetch the body and as the
            lookup key for the existing Release.

    Returns None. Nothing is written when ``get_link_content(link)`` returns
    None.
    """
    # A feed entry may lack a title entirely; fall back to an empty title
    # instead of failing on None.
    title_text = entry.get('title') or u''
    if not isinstance(title_text, unicode):
        # A non-unicode title is a byte string and must be *decoded*.
        # Calling .encode() on a Python 2 byte string first decodes it as
        # ASCII implicitly, which raises UnicodeDecodeError on any non-ASCII
        # byte.
        title_text = title_text.decode('utf-8', 'ignore')
    title = kill_control_characters(title_text)

    # First non-empty date field wins; without any, use the current time.
    date_text = (entry.get('published') or
                 entry.get('updated') or
                 entry.get('a10:updated'))
    date = dateutil.parser.parse(date_text) if date_text else now()

    body = get_link_content(link)
    if body is None:
        return

    try:
        # Does not use get_or_create because the unique constraint is just the url
        # and we don't want the source foreign key field to ever be null.
        release = Release.objects.get(url=link)
        release.title = title
        release.date = date
        release.body = body
        release.source = source
        release.save()
    except Release.DoesNotExist:
        Release.objects.create(url=link,
                               source=source,
                               title=title,
                               date=date,
                               body=body)
def is_stale(self, seconds=None):
    """Report whether this object is due to be retrieved again.

    Parameters:
        seconds: age threshold in seconds. Any falsy value (None, 0) is
            replaced by ``settings.SCRAPE_PERIOD``.

    Returns True when ``last_retrieved`` is None or when more than
    ``seconds`` seconds have elapsed since ``last_retrieved``; otherwise
    False.
    """
    threshold = seconds or settings.SCRAPE_PERIOD

    # Never retrieved: always stale.
    if self.last_retrieved is None:
        return True

    elapsed = now() - self.last_retrieved
    return elapsed.total_seconds() > threshold
def handle(self, *args, **options):
    """Re-fetch the body of stored releases identified by URL arguments.

    Each positional argument is treated as a release URL. For an http(s)
    URL the matching Release is looked up, its body is fetched again, its
    title is passed through ``kill_control_characters``, ``updated`` is set
    to the current time, and the release is saved. Other arguments are
    skipped with a warning.

    Raises:
        CommandError: if ``settings`` has no SUPERFASTMATCH attribute.

    Any exception raised while processing a single URL is logged and does
    not stop the remaining URLs from being processed.
    """
    if not hasattr(settings, "SUPERFASTMATCH"):
        raise CommandError("You must configure SUPERFASTMATCH in your project settings.")

    self.sfm = from_django_conf()

    for url in args:
        try:
            if not url.startswith(("http://", "https://")):
                # Log the argument itself: no release has been looked up on
                # this path, so there is no release.url to report.
                logging.warning("Skipping non-HTTP link {0}".format(url))
                continue

            release = Release.objects.get(url=url)
            body = get_link_content(release.url)
            if body is None:
                # No content came back; keep the stored body instead of
                # overwriting it with None.
                logging.error("Failed to rescrape {0}: {1}".format(url, "no content retrieved"))
                continue

            release.title = kill_control_characters(release.title)
            release.body = body
            release.updated = now()
            release.save()
            logging.info("Updated release {0}: {1}".format(release.id, release.url))
        except Exception as e:
            # Best-effort batch: report the failure and move on to the next URL.
            logging.error("Failed to rescrape {0}: {1}".format(url, str(e)))
def handle(self, *args, **options):
    """Scrape releases for sources with ``source_type=2``.

    With exactly one positional argument the source set is narrowed: an
    http(s) URL filters by ``url``, anything else is parsed as an integer
    source id. With any other number of arguments every source of that type
    is considered.

    A source is scraped when ``source.is_stale()`` is true or
    ``options['including_stale']`` is truthy. On success its
    ``last_retrieved`` is set to the current time and ``last_failure`` is
    cleared. A raised SourceScrapeFailure is saved; any other exception is
    recorded as a new SourceScrapeFailure carrying the traceback text.

    Raises:
        CommandError: if SUPERFASTMATCH or DEFAULT_DOCTYPE is missing from
            ``settings``, or if the single argument is neither an http(s)
            URL nor an integer.
    """
    if not hasattr(settings, 'SUPERFASTMATCH'):
        raise CommandError('You must configure SUPERFASTMATCH in your project settings.')
    if not hasattr(settings, 'DEFAULT_DOCTYPE'):
        raise CommandError('You must specify a DEFAULT_DOCTYPE in your project settings.')

    self.sfm = from_django_conf()

    candidates = Source.objects.filter(source_type=2)
    if len(args) == 1:
        selector = args[0]
        if selector.startswith(('http://', 'https://')):
            candidates = candidates.filter(url=selector)
        else:
            try:
                candidates = candidates.filter(id=int(selector))
            except ValueError:
                raise CommandError("Arguments must be source IDs or feed URLs")

    for source in candidates:
        try:
            if source.is_stale() or options['including_stale']:
                self.scrape_releases(source)
                source.last_retrieved = now()
                source.last_failure = None
                source.save()
        except SourceScrapeFailure as failure:
            # The raised object is itself saved as the failure record.
            failure.save()
        except Exception as e:
            # Capture the traceback text (limit of 1000 frames) for storage.
            trace_buf = StringIO()
            print_exc(1000, trace_buf)
            SourceScrapeFailure.objects.create(source=source,
                                               traceback=trace_buf.getvalue(),
                                               description=unicode(e))