示例#1
0
class StandardFormat(object):
    """
    Class for reading and processing standard format marker files and strings.
    """
    def __init__(self, filename=None, encoding=None):
        self._encoding = encoding
        if filename is not None:
            self.open(filename)

    def open(self, sfm_file):
        """
        Open a standard format marker file for sequential reading.

        :param sfm_file: name of the standard format marker input file
        :type sfm_file: str
        """
        if isinstance(sfm_file, PathPointer):
            # [xx] We don't use 'rU' mode here -- do we need to?
            #      (PathPointer.open doesn't take a mode option)
            self._file = sfm_file.open(self._encoding)
        else:
            self._file = codecs.open(sfm_file, 'rU', self._encoding)

    def open_string(self, s):
        """
        Open a standard format marker string for sequential reading.

        :param s: string to parse as a standard format marker input file
        :type s: str
        """
        self._file = StringIO(s)

    def raw_fields(self):
        """
        Return an iterator that returns the next field in a (marker, value)
        tuple. Linebreaks and trailing white space are preserved except
        for the final newline in each field.

        :rtype: iter(tuple(str, str))
        """
        join_string = '\n'
        line_regexp = r'^%s(?:\\(\S+)\s*)?(.*)$'
        # discard a BOM in the first line
        first_line_pat = re.compile(line_regexp % '(?:\xef\xbb\xbf)?')
        line_pat = re.compile(line_regexp % '')
        # need to get first line outside the loop for correct handling
        # of the first marker if it spans multiple lines
        file_iter = iter(self._file)
        line = next(file_iter)
        mobj = re.match(first_line_pat, line)
        mkr, line_value = mobj.groups()
        value_lines = [
            line_value,
        ]
        self.line_num = 0
        for line in file_iter:
            self.line_num += 1
            mobj = re.match(line_pat, line)
            line_mkr, line_value = mobj.groups()
            if line_mkr:
                yield (mkr, join_string.join(value_lines))
                mkr = line_mkr
                value_lines = [
                    line_value,
                ]
            else:
                value_lines.append(line_value)
        self.line_num += 1
        yield (mkr, join_string.join(value_lines))

    def fields(self,
               strip=True,
               unwrap=True,
               encoding=None,
               errors='strict',
               unicode_fields=None):
        """
        Return an iterator that returns the next field in a ``(marker, value)``
        tuple, where ``marker`` and ``value`` are unicode strings if an ``encoding``
        was specified in the ``fields()`` method. Otherwise they are non-unicode strings.

        :param strip: strip trailing whitespace from the last line of each field
        :type strip: bool
        :param unwrap: Convert newlines in a field to spaces.
        :type unwrap: bool
        :param encoding: Name of an encoding to use. If it is specified then
            the ``fields()`` method returns unicode strings rather than non
            unicode strings.
        :type encoding: str or None
        :param errors: Error handling scheme for codec. Same as the ``decode()``
            builtin string method.
        :type errors: str
        :param unicode_fields: Set of marker names whose values are UTF-8 encoded.
            Ignored if encoding is None. If the whole file is UTF-8 encoded set
            ``encoding='utf8'`` and leave ``unicode_fields`` with its default
            value of None.
        :type unicode_fields: sequence
        :rtype: iter(tuple(str, str))
        """
        if encoding is None and unicode_fields is not None:
            raise ValueError('unicode_fields is set but not encoding.')
        unwrap_pat = re.compile(r'\n+')
        for mkr, val in self.raw_fields():
            if encoding and not PY3:  # kludge - already decoded in PY3?
                if unicode_fields is not None and mkr in unicode_fields:
                    val = val.decode('utf8', errors)
                else:
                    val = val.decode(encoding, errors)
                mkr = mkr.decode(encoding, errors)
            if unwrap:
                val = unwrap_pat.sub(' ', val)
            if strip:
                val = val.rstrip()
            yield (mkr, val)

    def close(self):
        """Close a previously opened standard format marker file or string."""
        self._file.close()
        try:
            del self.line_num
        except AttributeError:
            pass
示例#2
0
class StandardFormat(object):
    """
    Class for reading and processing standard format marker files and strings.
    """
    def __init__(self, filename=None, encoding=None):
        self._encoding = encoding
        if filename is not None:
            self.open(filename)

    def open(self, sfm_file):
        """
        Open a standard format marker file for sequential reading.

        :param sfm_file: name of the standard format marker input file
        :type sfm_file: str
        """
        if isinstance(sfm_file, PathPointer):
            # [xx] We don't use 'rU' mode here -- do we need to?
            #      (PathPointer.open doesn't take a mode option)
            self._file = sfm_file.open(self._encoding)
        else:
            self._file = codecs.open(sfm_file, 'rU', self._encoding)

    def open_string(self, s):
        """
        Open a standard format marker string for sequential reading.

        :param s: string to parse as a standard format marker input file
        :type s: str
        """
        self._file = StringIO(s)

    def raw_fields(self):
        """
        Return an iterator that returns the next field in a (marker, value)
        tuple. Linebreaks and trailing white space are preserved except
        for the final newline in each field.

        :rtype: iter(tuple(str, str))
        """
        join_string = '\n'
        line_regexp = r'^%s(?:\\(\S+)\s*)?(.*)$'
        # discard a BOM in the first line
        first_line_pat = re.compile(line_regexp % '(?:\xef\xbb\xbf)?')
        line_pat = re.compile(line_regexp % '')
        # need to get first line outside the loop for correct handling
        # of the first marker if it spans multiple lines
        file_iter = iter(self._file)
        line = next(file_iter)
        mobj = re.match(first_line_pat, line)
        mkr, line_value = mobj.groups()
        value_lines = [line_value,]
        self.line_num = 0
        for line in file_iter:
            self.line_num += 1
            mobj = re.match(line_pat, line)
            line_mkr, line_value = mobj.groups()
            if line_mkr:
                yield (mkr, join_string.join(value_lines))
                mkr = line_mkr
                value_lines = [line_value,]
            else:
                value_lines.append(line_value)
        self.line_num += 1
        yield (mkr, join_string.join(value_lines))

    def fields(self, strip=True, unwrap=True, encoding=None, errors='strict', unicode_fields=None):
        """
        Return an iterator that returns the next field in a ``(marker, value)``
        tuple, where ``marker`` and ``value`` are unicode strings if an ``encoding``
        was specified in the ``fields()`` method. Otherwise they are non-unicode strings.

        :param strip: strip trailing whitespace from the last line of each field
        :type strip: bool
        :param unwrap: Convert newlines in a field to spaces.
        :type unwrap: bool
        :param encoding: Name of an encoding to use. If it is specified then
            the ``fields()`` method returns unicode strings rather than non
            unicode strings.
        :type encoding: str or None
        :param errors: Error handling scheme for codec. Same as the ``decode()``
            builtin string method.
        :type errors: str
        :param unicode_fields: Set of marker names whose values are UTF-8 encoded.
            Ignored if encoding is None. If the whole file is UTF-8 encoded set
            ``encoding='utf8'`` and leave ``unicode_fields`` with its default
            value of None.
        :type unicode_fields: sequence
        :rtype: iter(tuple(str, str))
        """
        if encoding is None and unicode_fields is not None:
            raise ValueError('unicode_fields is set but not encoding.')
        unwrap_pat = re.compile(r'\n+')
        for mkr, val in self.raw_fields():
            if encoding and not PY3: # kludge - already decoded in PY3?
                if unicode_fields is not None and mkr in unicode_fields:
                    val = val.decode('utf8', errors)
                else:
                    val = val.decode(encoding, errors)
                mkr = mkr.decode(encoding, errors)
            if unwrap:
                val = unwrap_pat.sub(' ', val)
            if strip:
                val = val.rstrip()
            yield (mkr, val)

    def close(self):
        """Close a previously opened standard format marker file or string."""
        self._file.close()
        try:
            del self.line_num
        except AttributeError:
            pass