def execute(self, transform_manager):
    """Download ``self.url`` and copy it into a transform-managed output file.

    The output file's extension is chosen, in order of preference, from
    ``self.extension``, ``self.mimetype_overrides`` (keyed by the response's
    media type), ``mimetypes.guess_extension`` (strict, then non-strict),
    and finally the literal ``'unknown'``.

    :param transform_manager: called as ``transform_manager(extension, name)``
        to obtain the output path; also supplies ``owner`` and ``start()``.
    :returns: the path of the file the download was copied to.
    :raises TransformException: if the retrieval headers carry an ``error``
        entry, or if no local filename was returned.
    """
    filename, headers = retrieve(url=self.url,
                                 user=transform_manager.owner,
                                 username=self.username,
                                 password=self.password,
                                 user_agent=self.user_agent)

    try:
        if headers.get('error'):
            raise TransformException("Failed to download %s" % self.url)
        if not filename:
            raise TransformException(headers.get('message'))

        # Drop any parameters (e.g. "; charset=utf-8") from the header value.
        content_type = headers.get('content-type', 'unknown/unknown')
        content_type = content_type.split(';')[0].strip()

        extension = (
            self.extension
            or self.mimetype_overrides.get(content_type)
            or (mimetypes.guess_extension(content_type) or '').lstrip('.')
            or (mimetypes.guess_extension(content_type, strict=False) or '').lstrip('.')
            or 'unknown'
        )

        logger.debug("Response had content-type %r; assigning extension %r",
                     content_type, extension)

        # Binary mode on both ends: the payload can be of any media type and
        # must be copied byte-for-byte, without newline translation.
        with open(transform_manager(extension, self.name), 'wb') as output:
            # NOTE(review): ``input`` is not a local here, so this passes the
            # ``input`` builtin as the transform's input -- confirm whether
            # an empty list (or the source URL) was intended.
            transform_manager.start(self, [input], type='identity')
            with open(filename, 'rb') as f:
                shutil.copyfileobj(f, output)

            logger.info("File from %r saved to %r", self.url, output.name)
            return output.name
    finally:
        # Guard on ``filename`` so that the no-file error path above does not
        # have its TransformException masked by a failing unlink.
        if filename and headers.get('delete-after'):
            os.unlink(filename)
def load_vocabulary(self, transform_manager, prefix, uri):
    """Fetch the document describing a vocabulary and upload it to the store.

    ``settings.VOCABULARY_URL_OVERRIDES`` (a mapping keyed by prefix, when
    present) may replace ``uri``. Nothing is fetched if the resulting URI is
    empty. The document is uploaded to the graph named
    ``settings.GRAPH_BASE + "vocabulary/" + prefix``.

    Failures (nothing retrieved, non-OK status, unexpected content type) are
    logged and cause an early return; no exception is raised for them.

    :param transform_manager: supplies the ``store`` to upload into.
    :param prefix: vocabulary prefix; used for overrides and the graph name.
    :param uri: default location of the vocabulary document.
    :returns: ``None``.
    """
    overrides = getattr(settings, "VOCABULARY_URL_OVERRIDES", {})
    uri = overrides.get(prefix, uri)
    if not uri:
        return

    # Logged before the request so the message matches what is happening.
    logger.debug("About to fetch %r for vocabulary %r", uri, prefix)

    filename, headers = retrieve(uri)
    if not filename:
        logger.error("Unable to retrieve: %s", headers.get("message"))
        return

    try:
        if headers["status"] != httplib.OK:
            logger.error("Failed to retrieve %r for vocabulary %r", uri, prefix,
                         extra={"headers": headers})
            return

        # A missing header becomes '' and is rejected below rather than
        # raising KeyError; parameters such as "; charset=..." are dropped.
        content_type = headers.get("content-type", "").split(";")[0].strip()
        if content_type not in ("application/rdf+xml", "text/n3",
                                "text/plain", "text/turtle"):
            logger.error("Unexpected content-type: %r", content_type)
            return

        graph_name = settings.GRAPH_BASE + "vocabulary/" + prefix
        Uploader.upload(stores=(transform_manager.store,),
                        graph_name=graph_name,
                        filename=filename,
                        mimetype=content_type)
    finally:
        # Remove the downloaded file when the retriever asks for it.
        if headers["delete-after"]:
            os.unlink(filename)
def process_new_pingback(pingback):
    """Validate a newly received pingback and decide its moderation state.

    Steps, in order:

    1. Reject pingbacks whose target host is not listed in
       ``settings.PINGBACK_TARGET_DOMAINS`` or whose source URL scheme is not
       http, https or ftp.
    2. Mark the pingback as ``'processing'`` and retrieve the source document.
    3. Extract a graph from the retrieved document and store its N3
       serialisation on the pingback.
    4. Look up ``AutomatedAction`` rules matching the remote IP, its reverse
       DNS hostname, or the source domain; accept or reject accordingly, or
       mark the pingback as pending when no rule applies.

    :param pingback: the pingback record to process; it is updated through
        its own ``mark_invalid``/``accept``/``reject``/``mark_pending``/``save``
        methods.
    :returns: ``None``.
    """
    source_url = urlparse.urlparse(pingback.source)
    source_domain = source_url.netloc.split(':')[0].lower()
    target_url = urlparse.urlparse(pingback.target)
    target_domain = target_url.netloc.split(':')[0].lower()

    if target_domain not in settings.PINGBACK_TARGET_DOMAINS:
        logger.warning('Pingback for non-targetable host: %r (%r)',
                       target_domain, pingback.target)
        pingback.mark_invalid('non-targetable-host')
        return
    if source_url.scheme not in ('http', 'https', 'ftp'):
        logger.warning('Unsupported scheme for pingback')
        pingback.mark_invalid('unsupported-source-scheme')
        return

    pingback.state = 'processing'
    pingback.save()

    # NOTE(review): if ``retrieve`` hands back a temporary file path together
    # with a 'delete-after' header, nothing removes that file here -- confirm
    # and add cleanup if so.
    response, headers = retrieve(
        pingback.source,
        headers={'Accept': 'application/rdf+xml, text/n3, text/turtle, '
                           'application/xhtml+xml;q=0.9, text/html;q=0.8'})
    if headers.get('error'):
        logger.warning("Failed to retrieve pingback source (%s): %s",
                       headers.get('status'), pingback.source)
        pingback.mark_invalid('http-error')
        return

    try:
        graph = extraction.extract(pingback, response)
    except extraction.InvalidPingback as e:
        pingback.mark_invalid(e.reason)
        return

    pingback.invalid_reason = ''
    pingback.data = graph.serialize(format='n3')

    # Reverse DNS is best-effort: any failure just means no hostname rule
    # can match.
    try:
        hostname = socket.gethostbyaddr(pingback.remote_addr)[0]
    except Exception:
        hostname = None

    # ``source_domain`` is the normalised value computed above (port removed,
    # lower-cased), so 'domain' rules match regardless of port or letter case.
    actions = models.AutomatedAction.objects.filter(
        Q(field='ip', value=pingback.remote_addr)
        | Q(field='hostname', value=hostname)
        | Q(field='domain', value=source_domain)
    ).order_by('action')

    # Rules are visited in 'action' order; the first accept/reject rule wins.
    for action in actions:
        if action.action == 'accepted':
            pingback.accept()
            break
        elif action.action == 'rejected':
            pingback.reject()
            break
    else:
        # No rule decided the outcome: leave it for manual moderation.
        pingback.mark_pending()
def load_vocabulary(self, transform_manager, prefix, uri):
    """Retrieve a vocabulary's document and upload it into the store.

    The URI may be replaced by an entry for ``prefix`` in
    ``settings.VOCABULARY_URL_OVERRIDES`` when that setting exists; an empty
    URI means there is nothing to do. On success the document is uploaded to
    the graph ``settings.GRAPH_BASE + 'vocabulary/' + prefix``.

    Retrieval problems, a non-OK status, or an unexpected content type are
    logged as errors and end the call early without raising.

    :param transform_manager: provides the ``store`` that receives the upload.
    :param prefix: vocabulary prefix, used for the override lookup and the
        graph name.
    :param uri: location to fetch unless overridden.
    :returns: ``None``.
    """
    overrides = getattr(settings, 'VOCABULARY_URL_OVERRIDES', {})
    uri = overrides.get(prefix, uri)
    if not uri:
        return

    # Emit the "about to" message before the request is actually made.
    logger.debug("About to fetch %r for vocabulary %r", uri, prefix)

    filename, headers = retrieve(uri)
    if not filename:
        logger.error("Unable to retrieve: %s", headers.get('message'))
        return

    try:
        if headers['status'] != httplib.OK:
            logger.error("Failed to retrieve %r for vocabulary %r", uri, prefix,
                         extra={'headers': headers})
            return

        # Tolerate an absent header (treated as '' and rejected below) and
        # strip parameters and surrounding whitespace from the media type.
        content_type = headers.get('content-type', '').split(';')[0].strip()
        if content_type not in ('application/rdf+xml', 'text/n3',
                                'text/plain', 'text/turtle'):
            logger.error('Unexpected content-type: %r', content_type)
            return

        graph_name = settings.GRAPH_BASE + 'vocabulary/' + prefix
        Uploader.upload(stores=(transform_manager.store,),
                        graph_name=graph_name,
                        filename=filename,
                        mimetype=content_type)
    finally:
        # Delete the downloaded file if the retriever flagged it as temporary.
        if headers['delete-after']:
            os.unlink(filename)