Exemplo n.º 1
0
    def __init__(self,
                 path_to_html_file,
                 level,
                 encoding,
                 verbose,
                 referrer=None):
        '''
        Load an HTML file, work out its encoding, and record its title and links.

        Only the first 4096 bytes are read up front. For files at
        ``level > 0`` that header is matched against the class HTML pattern;
        if it does not match, the file is flagged ``is_binary`` and is neither
        read further nor parsed. ``self.encoding`` is set only for files that
        are not flagged binary.

        :param path_to_html_file: Path of the file to load (normalised with
            ``unicode_path(..., abs=True)``).
        :param level: The level of this file. Should be 0 for the root file.
        :param encoding: Use `encoding` to decode HTML. If falsy, the encoding
            is detected from the file contents instead.
        :param verbose: Passed to ``detect_xml_encoding`` for the fallback
            detection on the first 4096 bytes.
        :param referrer: The :class:`HTMLFile` that first refers to this file.
        :raises OSError: If the root file (``level == 0``) cannot be read.
        :raises IgnoreFile: If a non-root file cannot be read.
        :raises ValueError: If the root file is empty.
        '''
        self.path = unicode_path(path_to_html_file, abs=True)
        # Fallback title: file name without extension. Replaced further down
        # when TITLE_PAT matches the decoded source.
        self.title = os.path.splitext(os.path.basename(self.path))[0]
        self.base = os.path.dirname(self.path)
        self.level = level
        self.referrer = referrer
        self.links = []

        try:
            with open(self.path, 'rb') as f:
                src = header = f.read(4096)
                # An explicit encoding from the caller takes precedence over
                # whatever can be sniffed from the file; only detect when
                # none was supplied.
                if not encoding:
                    encoding = detect_xml_encoding(src)[1]
                if encoding:
                    try:
                        header = header.decode(encoding, errors='replace')
                    except (ValueError, LookupError):
                        # Unusable codec name: leave the header as bytes so
                        # the bytes pattern is used for the check below.
                        pass
                self.is_binary = False
                if level > 0:
                    # Choose the pattern that matches the header's type.
                    pat = self.HTML_PAT_BIN if isinstance(
                        header, bytes) else self.HTML_PAT
                    self.is_binary = not bool(pat.search(header))
                if not self.is_binary:
                    # Looks like HTML (or is the root file): read the rest.
                    src += f.read()
        except OSError as err:
            msg = 'Could not read from file: %s with error: %s' % (
                self.path, as_unicode(err))
            if level == 0:
                raise OSError(msg)
            raise IgnoreFile(msg, err.errno)

        if not src:
            if level == 0:
                raise ValueError('The file %s is empty' % self.path)
            # An empty linked file is treated like a binary one: not parsed.
            self.is_binary = True

        if not self.is_binary:
            if not encoding:
                # Nothing supplied and nothing sniffed from the header pass:
                # detect again, this time honouring `verbose`.
                encoding = detect_xml_encoding(src[:4096], verbose=verbose)[1]
            self.encoding = encoding

            src = src.decode(encoding, 'replace')
            match = self.TITLE_PAT.search(src)
            if match is not None:
                self.title = match.group(1)
            self.find_links(src)
Exemplo n.º 2
0
    def __init__(self, path_to_html_file, level, encoding, verbose, referrer=None):
        '''
        Load an HTML file, work out its encoding, and record its title and links.

        Only the first 4096 bytes are read up front. For files at
        ``level > 0`` that header is matched against ``HTML_PAT``; if it does
        not match, the file is flagged ``is_binary`` and is neither read
        further nor parsed. ``self.encoding`` is set only for files that are
        not flagged binary.

        :param path_to_html_file: Path of the file to load (normalised with
            ``unicode_path(..., abs=True)``).
        :param level: The level of this file. Should be 0 for the root file.
        :param encoding: Use `encoding` to decode HTML. If falsy, the encoding
            is detected from the file contents instead.
        :param verbose: Passed to ``detect_xml_encoding`` for the fallback
            detection on the first 4096 bytes.
        :param referrer: The :class:`HTMLFile` that first refers to this file.
        :raises IOError: If the root file (``level == 0``) cannot be read.
        :raises IgnoreFile: If a non-root file cannot be read.
        :raises ValueError: If the root file is empty.
        '''
        self.path = unicode_path(path_to_html_file, abs=True)
        # Fallback title: file name without extension. Replaced further down
        # when TITLE_PAT matches the decoded source.
        self.title = os.path.splitext(os.path.basename(self.path))[0]
        self.base = os.path.dirname(self.path)
        self.level = level
        self.referrer = referrer
        self.links = []

        try:
            with open(self.path, 'rb') as f:
                src = header = f.read(4096)
                # An explicit encoding from the caller takes precedence over
                # whatever can be sniffed from the file; only detect when
                # none was supplied.
                if not encoding:
                    encoding = detect_xml_encoding(src)[1]
                if encoding:
                    try:
                        header = header.decode(encoding)
                    except (ValueError, LookupError):
                        # Undecodable header or unusable codec name: keep the
                        # raw header for the HTML check below.
                        pass
                # NOTE(review): if the decode above was skipped or failed,
                # `header` is still the raw data read from the file; confirm
                # HTML_PAT can be applied to that type on the target Python.
                self.is_binary = level > 0 and not bool(self.HTML_PAT.search(header))
                if not self.is_binary:
                    # Looks like HTML (or is the root file): read the rest.
                    src += f.read()
        except IOError as err:
            msg = 'Could not read from file: %s with error: %s'%(self.path, as_unicode(err))
            if level == 0:
                raise IOError(msg)
            raise IgnoreFile(msg, err.errno)

        if not src:
            if level == 0:
                raise ValueError('The file %s is empty'%self.path)
            # An empty linked file is treated like a binary one: not parsed.
            self.is_binary = True

        if not self.is_binary:
            if not encoding:
                # Nothing supplied and nothing sniffed from the header pass:
                # detect again, this time honouring `verbose`.
                encoding = detect_xml_encoding(src[:4096], verbose=verbose)[1]
            self.encoding = encoding

            src = src.decode(encoding, 'replace')
            match = self.TITLE_PAT.search(src)
            if match is not None:
                self.title = match.group(1)
            self.find_links(src)