def open_file(cls, url, attribs):
    """ Open a local file for parsing.

    `url` may be an absolute filesystem path (leading slash or a
    drive letter such as ``C:``), a ``file:`` url, or a relative path.

    Sets `attribs.orig_mediatype` to a HeaderElement built from
    MediaTypes.guess_type(url), and sets both `attribs.orig_url` and
    `attribs.url` to `url`.

    Returns the binary file object, or None if the file could not be
    opened (the problem is reported through error()).

    """

    def open_file_from_path(path):
        """ Open `path` for binary reading; report and return None on failure. """
        try:
            return open(path, 'rb')
        except FileNotFoundError:
            error('Missing file: %s' % path)
        except IsADirectoryError:
            error('Missing file is a directory: %s' % path)
        return None

    # A drive letter or a leading slash means a plain filesystem path.
    # The class must be [a-zA-Z]: the range A-z would also match the
    # punctuation characters that sit between 'Z' and 'a' in ASCII.
    if re.search(r'^([a-zA-Z]:|/)', url):
        fp = open_file_from_path(url)
    else:
        try:
            # handles all the flavors of file: urls, including on windows
            fp = urllib.request.urlopen(url)
        except urllib.error.URLError as what:
            fp = None
            error('Missing file: %s' % what.reason)
        except ValueError:
            # urlopen could not parse it as a url: just a relative path?
            fp = open_file_from_path(url)

    attribs.orig_mediatype = attribs.HeaderElement(
        MediaTypes.guess_type(url))
    debug("... got mediatype %s from guess_type" % str(attribs.orig_mediatype))

    attribs.orig_url = attribs.url = url
    return fp
def is_included_mediatype(self, attribs):
    """ Return True if this document is eligible.

    The mediatype is read from `attribs.orig_mediatype`.  If that is
    None it is guessed from `attribs.url` and stored back into
    `attribs.orig_mediatype`; if nothing can be guessed the document
    is always included.

    A document is eligible when its mediatype matches at least one
    fnmatch pattern in `self.include_mediatypes` and none of the
    patterns in `self.exclude_mediatypes`.

    """

    if attribs.orig_mediatype is None:
        guessed = MediaTypes.guess_type(attribs.url)
        if not guessed:
            return True  # always include if mediatype unknown
        attribs.orig_mediatype = attribs.HeaderElement(guessed)

    mediatype = attribs.orig_mediatype.value

    # generator expressions let any() stop at the first matching pattern
    included = any(
        fnmatch.fnmatch(mediatype, pattern)
        for pattern in self.include_mediatypes)
    excluded = any(
        fnmatch.fnmatch(mediatype, pattern)
        for pattern in self.exclude_mediatypes)

    # both reasons are logged when both apply
    if excluded:
        debug("Dropping excluded mediatype %s" % mediatype)
    if not included:
        debug("Dropping not included mediatype %s" % mediatype)

    return included and not excluded
def get_mediatype(self, attribs):
    """ Return the mediatype value for attribs, guessing it if not yet set.

    When `attribs.orig_mediatype` is None the type is guessed from
    `attribs.url` and stored back as a HeaderElement.  Returns None if
    no mediatype can be guessed.

    """

    if attribs.orig_mediatype is not None:
        return attribs.orig_mediatype.value

    guessed = MediaTypes.guess_type(attribs.url)
    if not guessed:
        return None

    # remember the guess so later calls need not guess again
    attribs.orig_mediatype = attribs.HeaderElement(guessed)
    return attribs.orig_mediatype.value
def open_file (cls, orig_url, attribs):
    """ Open a local file for binary reading.

    A leading ``file://`` is cut off `orig_url` to obtain the path to
    open; anything else is used as the path unchanged.  The mediatype
    is guessed from the url and stored in `attribs.orig_mediatype`;
    `attribs.orig_url` and `attribs.url` are both set to the url as
    given.

    Returns the open file object.

    """

    url = orig_url

    scheme_prefix = 'file://'
    if url.startswith (scheme_prefix):
        path = url[len (scheme_prefix):]
    else:
        path = url
    fp = open (path, "rb")

    guessed = MediaTypes.guess_type (url)
    attribs.orig_mediatype = attribs.HeaderElement (guessed)
    debug ("... got mediatype %s from guess_type" % str (attribs.orig_mediatype))

    attribs.orig_url = orig_url
    attribs.url = url
    return fp
def open_resource (cls, orig_url, attribs):
    """ Open a python package resource file for parsing.

    `orig_url` has the form ``resource://python.package/filename.ext``.
    The network-location part names the package and the path names the
    resource inside it.

    Sets `attribs.orig_mediatype` from the resource filename and sets
    `attribs.orig_url` and `attribs.url` to `orig_url`.

    Returns whatever resource_stream() returns for the package and
    filename.

    """

    # resource://python.package/filename.ext

    # this function has no separate `url` argument: the url is the
    # one passed in as orig_url
    url = orig_url
    o = urllib.parse.urlsplit (url)

    # netloc keeps the case of the package name (hostname would
    # lowercase it); urlsplit results have no `host` attribute
    package = o.netloc

    # NOTE(review): presumably resource_stream is pkg_resources', which
    # wants a package-relative name without a leading slash -- confirm
    filename = o.path.lstrip ('/')

    fp = resource_stream (package, filename)

    attribs.orig_mediatype = attribs.HeaderElement (MediaTypes.guess_type (filename))
    debug ("... got mediatype %s from guess_type" % str (attribs.orig_mediatype))

    attribs.orig_url = orig_url
    attribs.url = url
    return fp
def create(cls, url, attribs=None):
    """ Return a parser for url, reusing a registered one when possible.

    The url is first passed through parsers.webify_url().  If a parser
    is already registered in `cls.parsers` under that url, it is
    updated with `attribs` and returned.  Otherwise the resource is
    opened according to its url scheme, and a parser registered under
    `attribs.url` is reused if there is one.  Failing that, a new
    parser is obtained from cls.get(), given the opened file object
    and registered under the url.

    """

    url = parsers.webify_url(url)
    if attribs is None:
        attribs = parsers.ParserAttributes()

    if url in cls.parsers:
        # asked for this url before: the parser may already hold data
        known = cls.parsers[url]
        known.attribs.update(attribs)
        return known

    # choose the opener matching the url scheme
    scheme = urllib.parse.urlsplit(url).scheme
    if scheme in ('http', 'https'):
        opener = cls.open_url
    elif scheme == 'resource':
        opener = cls.open_resource
    else:
        opener = cls.open_file
    fp = opener(url, attribs)

    if attribs.url in cls.parsers:
        # reuse parser because parsing may be expensive, eg. reST docs
        known = cls.parsers[attribs.url]
        known.attribs.update(attribs)
        return known

    # nothing to reuse, so build a fresh parser
    debug("... creating new parser for %s" % url)

    if options.mediatype_from_extension:
        guessed = MediaTypes.guess_type(url)
        attribs.orig_mediatype = attribs.HeaderElement(guessed)
        debug("... set mediatype %s from extension" %
              attribs.orig_mediatype.value)

    attribs.orig_url = url
    fresh = cls.get(attribs)
    fresh.fp = fp
    cls.parsers[url] = fresh

    return fresh