Exemplo n.º 1
0
    def open_file(cls, url, attribs):
        """ Open a local file for parsing.

        `url` may be an absolute path (POSIX `/...` or a Windows drive
        path such as `C:...`), a `file:` URL, or a relative path.

        Side effects on `attribs`: `orig_mediatype` is set from
        `MediaTypes.guess_type(url)` wrapped in `attribs.HeaderElement`,
        and both `orig_url` and `url` are set to `url`.

        Returns the opened binary stream, or None if the file could not
        be opened (the failure is reported through `error`).
        """
        def open_file_from_path(path):
            """ Open `path` for binary reading; report and return None on failure. """
            try:
                return open(path, 'rb')
            except FileNotFoundError:
                error('Missing file: %s' % path)
            except IsADirectoryError:
                error('Missing file is a directory: %s' % path)
            return None

        # Absolute path: a single drive letter followed by ':' or a leading
        # '/'. The class must be [a-zA-Z]; 'A-z' would also match the ASCII
        # punctuation between 'Z' and 'a' ('[', '\\', ']', '^', '_', '`').
        if re.search(r'^([a-zA-Z]:|/)', url):
            fp = open_file_from_path(url)
        else:
            try:
                # handles all the flavors of file: urls, including on windows
                fp = urllib.request.urlopen(url)
            except urllib.error.URLError as what:
                fp = None
                error('Missing file: %s' % what.reason)
            except ValueError:  # just a relative path?
                fp = open_file_from_path(url)

        # The mediatype is guessed even when opening failed, so `attribs`
        # is always populated for the caller.
        attribs.orig_mediatype = attribs.HeaderElement(
            MediaTypes.guess_type(url))

        debug("... got mediatype %s from guess_type" %
              str(attribs.orig_mediatype))
        attribs.orig_url = attribs.url = url
        return fp
Exemplo n.º 2
0
    def is_included_mediatype(self, attribs):
        """ Return True if this document is eligible.

        A document is eligible when its mediatype matches at least one
        glob in `self.include_mediatypes` and none in
        `self.exclude_mediatypes`. If `attribs.orig_mediatype` is unset,
        it is guessed from `attribs.url` and stored back on `attribs`;
        a document whose mediatype cannot be guessed is always eligible.
        """

        if attribs.orig_mediatype is None:
            guessed = MediaTypes.guess_type(attribs.url)
            if not guessed:
                # always include if mediatype unknown
                return True
            attribs.orig_mediatype = attribs.HeaderElement(guessed)

        mediatype = attribs.orig_mediatype.value

        def matches_any(patterns):
            """ True if the mediatype matches any of the glob patterns. """
            return any([fnmatch.fnmatch(mediatype, glob) for glob in patterns])

        included = matches_any(self.include_mediatypes)
        excluded = matches_any(self.exclude_mediatypes)

        # Both reasons are logged when both apply.
        if excluded:
            debug("Dropping excluded mediatype %s" % mediatype)
        if not included:
            debug("Dropping not included mediatype %s" % mediatype)

        return included and not excluded
Exemplo n.º 3
0
 def get_mediatype(self, attribs):
     """ Return the mediatype string for attribs, guessing it if unset.

     When `attribs.orig_mediatype` is None the type is guessed from
     `attribs.url` and cached on `attribs`. Returns None if no
     mediatype could be guessed.
     """
     if attribs.orig_mediatype is not None:
         return attribs.orig_mediatype.value

     guessed = MediaTypes.guess_type(attribs.url)
     if not guessed:
         return None

     attribs.orig_mediatype = attribs.HeaderElement(guessed)
     return attribs.orig_mediatype.value
Exemplo n.º 4
0
    def open_file (cls, orig_url, attribs):
        """ Open a local file for parsing.

        A leading 'file://' is stripped to obtain the filesystem path;
        any other value is used as the path unchanged. The file is opened
        in binary mode and returned.

        Side effects on `attribs`: `orig_mediatype` is set from
        `MediaTypes.guess_type` of the unmodified url, and both
        `orig_url` and `url` are set to that unmodified url.
        """

        scheme_prefix = 'file://'
        if orig_url.startswith (scheme_prefix):
            path = orig_url[len (scheme_prefix):]
        else:
            path = orig_url
        fp = open (path, "rb")

        guessed = MediaTypes.guess_type (orig_url)
        attribs.orig_mediatype = attribs.HeaderElement (guessed)
        debug ("... got mediatype %s from guess_type" % str (attribs.orig_mediatype))

        # attribs.url keeps the 'file://' prefix, exactly as passed in.
        attribs.orig_url = attribs.url = orig_url
        return fp
Exemplo n.º 5
0
    def open_resource (cls, orig_url, attribs):
        """ Open a python package resource file for parsing.

        `orig_url` has the form resource://python.package/filename.ext:
        the network-location part names the package and the path part
        names the file inside it.

        Side effects on `attribs`: `orig_mediatype` is set from
        `MediaTypes.guess_type` of the resource filename, and both
        `orig_url` and `url` are set to `orig_url`.

        Returns whatever `resource_stream` returns for the package and
        filename (presumably a binary stream -- confirm against the
        `resource_stream` actually imported by this file).
        """

        # resource://python.package/filename.ext

        # `url` was previously used without ever being bound (NameError).
        url = orig_url
        o = urllib.parse.urlsplit (url)
        # SplitResult has no `.host` attribute. `.netloc` is used instead of
        # `.hostname` because the latter is lower-cased, and package names
        # are case-sensitive.
        package = o.netloc
        filename = o.path
        fp = resource_stream (package, filename)
        attribs.orig_mediatype = attribs.HeaderElement (MediaTypes.guess_type (filename))

        debug ("... got mediatype %s from guess_type" % str (attribs.orig_mediatype))
        attribs.orig_url = orig_url
        attribs.url = url
        return fp
Exemplo n.º 6
0
    def create(cls, url, attribs=None):
        """ Create an appropriate parser.

        The url is normalized with `parsers.webify_url` and parsers are
        cached in `cls.parsers`. A cached parser is reused (after merging
        `attribs` into its own attributes) when either the normalized url
        or the url recorded in `attribs.url` by the opener is already
        known. Otherwise the resource is opened according to its scheme,
        a new parser is obtained from `cls.get`, given the open stream,
        and cached under the normalized url.
        """
        url = parsers.webify_url(url)
        if attribs is None:
            attribs = parsers.ParserAttributes()

        # Fast path: same url requested before, parser may already hold data.
        if url in cls.parsers:
            known = cls.parsers[url]
            known.attribs.update(attribs)
            return known

        # Pick the opener matching the url scheme; anything that is neither
        # a package resource nor http(s) is treated as a local file.
        scheme = urllib.parse.urlsplit(url).scheme
        if scheme == 'resource':
            opener = cls.open_resource
        elif scheme in ('http', 'https'):
            opener = cls.open_url
        else:
            opener = cls.open_file
        fp = opener(url, attribs)

        # The opener may have recorded a different url in attribs.url.
        # Reuse a parser cached under that url because parsing may be
        # expensive, eg. reST docs.
        if attribs.url in cls.parsers:
            known = cls.parsers[attribs.url]
            known.attribs.update(attribs)
            return known

        # ok. so we have to create a new parser
        debug("... creating new parser for %s" % url)

        if options.mediatype_from_extension:
            guessed = MediaTypes.guess_type(url)
            attribs.orig_mediatype = attribs.HeaderElement(guessed)
            debug("... set mediatype %s from extension" %
                  attribs.orig_mediatype.value)

        attribs.orig_url = url
        created = cls.get(attribs)
        created.fp = fp
        cls.parsers[url] = created

        return created