Example #1
File: wikipedia.py Project: rsesha/textacy
    def __iter__(self):
        """
        Iterate over the pages of a Wikipedia articles database dump (*articles.xml.bz2),
        yielding one (page id, page title, page content) 3-tuple at a time.

        Yields:
            Tuple[str, str, str]: page id, title, content with wikimedia markup
        """
        if not self.filename:
            raise IOError('{} file not found'.format(self._filename))

        if not compat.is_python2:
            events = ('end', )
            f = fileio.open_sesame(self.filename, mode='rt')
        else:  # Python 2 can't open bzip in text mode :(
            events = (b'end', )
            f = fileio.open_sesame(self.filename, mode='rb')
        with f:

            elems = (elem for _, elem in iterparse(f, events=events))

            elem = next(elems)
            match = re.match('^{(.*?)}', elem.tag)
            namespace = match.group(1) if match else ''
            if not namespace.startswith('http://www.mediawiki.org/xml/export-'):
                raise ValueError(
                    'namespace "{}" not a valid MediaWiki dump namespace'.format(namespace))

            page_tag = '{%s}page' % namespace
            ns_path = './{%s}ns' % namespace
            page_id_path = './{%s}id' % namespace
            title_path = './{%s}title' % namespace
            text_path = './{%s}revision/{%s}text' % (namespace, namespace)

            for elem in elems:
                if elem.tag == page_tag:
                    page_id = elem.find(page_id_path).text
                    title = elem.find(title_path).text
                    ns = elem.find(ns_path).text
                    if ns != '0':
                        content = ''
                    else:
                        content = elem.find(text_path).text
                    if content is None:
                        content = ''
                    elif not isinstance(content, compat.unicode_):
                        content = compat.bytes_to_unicode(content,
                                                          errors='ignore')
                    yield page_id, title, content
                    elem.clear()
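A minimal usage sketch for this iterator; the ``Wikipedia`` class name and the dump path below are hypothetical stand-ins for whatever class wraps this ``__iter__``:

from itertools import islice

# Hypothetical: instantiate the reader class that defines __iter__ above
# with the path to a downloaded dump, then stream the first few pages.
wp = Wikipedia('enwiki-latest-pages-articles.xml.bz2')
for page_id, title, content in islice(wp, 3):
    print(page_id, title, len(content))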
Example #2
    def __iter__(self):
        """
        Iterate over the pages in a Wikipedia articles database dump (*articles.xml.bz2),
        yielding one (page id, title, page content) 3-tuple at a time.
        """
        if not PY2:
            for title, content, page_id in extract_pages(
                    open_sesame(self.wikicorpus.fname, mode='rt'),
                    self.wikicorpus.filter_namespaces):
                yield (page_id, title, content)
        else:  # Python 2 can't open bzip in text mode
            for title, content, page_id in extract_pages(
                    open_sesame(self.wikicorpus.fname, mode='rb'),
                    self.wikicorpus.filter_namespaces):
                yield (page_id, title, content)
Example #3
File: read.py Project: GregBowyer/textacy
def read_csv(filepath, encoding=None, dialect='excel', delimiter=','):
    """
    Iterate over a stream of rows, where each row is an iterable of strings
    and/or numbers with individual values separated by ``delimiter``.

    Args:
        filepath (str): /path/to/file on disk from which rows will be streamed
        encoding (str)
        dialect (str): a grouping of formatting parameters that determine how
            the tabular data is parsed when reading/writing; if 'infer', the
            first 1024 bytes of the file are analyzed, producing a best guess for
            the correct dialect
        delimiter (str): 1-character string used to separate fields in a row

    Yields:
        List[obj]: next row, whose elements are strings and/or numbers

    .. seealso:: https://docs.python.org/3/library/csv.html#csv.reader
    """
    with open_sesame(filepath, mode='rt', encoding=encoding, newline='') as f:
        if dialect == 'infer':
            dialect = csv.Sniffer().sniff(f.read(1024))
            f.seek(0)
        for row in csv.reader(f, dialect=dialect, delimiter=delimiter):
            yield row
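A brief sketch of the ``dialect='infer'`` path; the filename and column layout are hypothetical:

# Let csv.Sniffer guess the dialect from the first 1024 bytes, then
# stream rows; csv.reader yields strings, so convert numbers yourself.
for row in read_csv('reviews.csv', dialect='infer'):
    text, score = row[0], float(row[1])
    print(score, text)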
Example #4
File: read.py Project: GregBowyer/textacy
def read_file(filepath, mode='rt', encoding=None):
    """
    Read the full contents of a file. Files compressed with gzip, bz2, or lzma
    are handled automatically.
    """
    with open_sesame(filepath, mode=mode, encoding=encoding) as f:
        return f.read()
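For instance (hypothetical path), compression is inferred from the file extension by ``open_sesame``, so a ``.gz`` file comes back as decompressed text:

text = read_file('notes.txt.gz', encoding='utf-8')
print(len(text))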
Example #5
File: read.py Project: dchllngr/textacy
def read_json_mash(filepath, mode='rt', encoding=None, buffersize=2048):
    """
    Iterate over a stream of JSON objects, all of them mashed together, end-to-end,
    on a single line of a file. Bad form, but still manageable.

    Args:
        filepath (str): /path/to/file on disk from which json objects will be streamed,
            where all json objects are mashed together, end-to-end, on a single line;
            for example::

                {"title": "Harrison Bergeron", "text": "The year was 2081, and everybody was finally equal."}{"title": "2BR02B", "text": "Everything was perfectly swell."}

        mode (str, optional)
        encoding (str, optional)
        buffersize (int, optional): number of bytes to read in as a chunk

    Yields:
        dict: next valid JSON object, converted to native Python equivalent
    """
    with open_sesame(filepath, mode=mode, encoding=encoding) as f:
        buffer = ''
        for chunk in iter(partial(f.read, buffersize), ''):
            buffer += chunk
            while buffer:
                try:
                    result, index = JSON_DECODER.raw_decode(buffer)
                    yield result
                    buffer = buffer[index:]
                # not enough data to decode => read another chunk
                except ValueError:
                    break
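A small sketch, assuming ``JSON_DECODER`` is a module-level ``json.JSONDecoder()`` as the code above implies; ``mashed.json`` is a hypothetical file containing e.g. ``{"a": 1}{"b": 2}`` on one line:

import json

JSON_DECODER = json.JSONDecoder()  # module global assumed by read_json_mash

for obj in read_json_mash('mashed.json'):
    print(obj)  # {'a': 1}, then {'b': 2}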
Example #6
File: write.py Project: GregBowyer/textacy
def write_json(json_object, filepath, mode='wt', encoding=None,
               auto_make_dirs=False, ensure_ascii=False,
               indent=None, separators=(',', ':'), sort_keys=False):
    """
    Write JSON object all at once to disk at ``filepath``.

    Args:
        json_object (json): valid JSON object to be written
        filepath (str): /path/to/file on disk to which json object will be written,
            such as a JSON array; for example::

                [
                    {"title": "Harrison Bergeron", "text": "The year was 2081, and everybody was finally equal."},
                    {"title": "2BR02B", "text": "Everything was perfectly swell."}
                ]

        mode (str)
        encoding (str)
        auto_make_dirs (bool)
        indent (int or str)
        ensure_ascii (bool)
        separators (tuple[str])
        sort_keys (bool)

    .. seealso:: https://docs.python.org/3/library/json.html#json.dump
    """
    with open_sesame(filepath, mode=mode, encoding=encoding,
                     auto_make_dirs=auto_make_dirs) as f:
        f.write(json.dumps(json_object, indent=indent, ensure_ascii=ensure_ascii,
                           separators=separators, sort_keys=sort_keys))
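For example (hypothetical path and data):

docs = [{'title': 'Harrison Bergeron'}, {'title': '2BR02B'}]
# indent=2 pretty-prints; auto_make_dirs creates missing parent folders.
write_json(docs, 'out/docs.json', indent=2, auto_make_dirs=True)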
Example #7
File: read.py Project: dchllngr/textacy
def read_json(filepath, mode='rt', encoding=None, prefix=''):
    """
    Iterate over JSON objects matching the field given by ``prefix``.
    Useful for reading a large JSON array one item (with ``prefix='item'``)
    or sub-item (``prefix='item.fieldname'``) at a time.

    Args:
        filepath (str): /path/to/file on disk from which json items will be streamed,
            such as items in a JSON array; for example::

                [
                    {"title": "Harrison Bergeron", "text": "The year was 2081, and everybody was finally equal."},
                    {"title": "2BR02B", "text": "Everything was perfectly swell."}
                ]

        mode (str, optional)
        encoding (str, optional)
        prefix (str, optional): if '', the entire JSON object will be read in at once;
            if 'item', each item in a top-level array will be read in successively;
            if 'item.text', each array item's 'text' value will be read in successively

    Yields:
        next matching JSON object; could be a dict, list, int, float, str,
            depending on the value of ``prefix``

    Notes:
        Refer to ``ijson`` at https://pypi.python.org/pypi/ijson/ for usage details.
    """
    with open_sesame(filepath, mode=mode, encoding=encoding) as f:
        if prefix == '':
            yield json.load(f)
        else:
            for item in ijson.items(f, prefix):
                yield item
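A usage sketch against the ``out/docs.json`` array from the previous example:

# Stream one array item at a time rather than loading the whole file:
for item in read_json('out/docs.json', prefix='item'):
    print(item['title'])

# Or pull out a single field per item:
for title in read_json('out/docs.json', prefix='item.title'):
    print(title)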
Example #8
File: write.py Project: GregBowyer/textacy
def write_json_lines(json_objects, filepath, mode='wt', encoding=None,
                     auto_make_dirs=False, ensure_ascii=False,
                     separators=(',', ':'), sort_keys=False):
    """
    Iterate over a stream of JSON objects, writing each to a separate line in
    file ``filepath`` but without a top-level JSON object (e.g. array).

    Args:
        json_objects (iterable[json]): iterable of valid JSON objects to be written
        filepath (str): /path/to/file on disk to which JSON objects will be written,
            where each line in the file is its own json object; for example::

                {"title": "Harrison Bergeron", "text": "The year was 2081, and everybody was finally equal."}\n
                {"title": "2BR02B", "text": "Everything was perfectly swell."}

        mode (str)
        encoding (str)
        auto_make_dirs (bool)
        ensure_ascii (bool)
        separators (tuple[str])
        sort_keys (bool)

    .. seealso:: https://docs.python.org/3/library/json.html#json.dump
    """
    newline = '\n' if 't' in mode else unicode_to_bytes('\n')
    with open_sesame(filepath, mode=mode, encoding=encoding,
                     auto_make_dirs=auto_make_dirs) as f:
        for json_object in json_objects:
            f.write(json.dumps(json_object,
                               ensure_ascii=ensure_ascii,
                               separators=separators,
                               sort_keys=sort_keys) + newline)
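A sketch with hypothetical records; a ``.gz`` extension gets compressed automatically by ``open_sesame``:

records = ({'id': i, 'text': 'doc %d' % i} for i in range(1000))
write_json_lines(records, 'out/records.jsonl.gz', auto_make_dirs=True)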
Example #9
File: write.py Project: GregBowyer/textacy
def write_csv(rows, filepath, encoding=None, auto_make_dirs=False,
              dialect='excel', delimiter=','):
    """
    Iterate over a sequence of rows, where each row is an iterable of strings
    and/or numbers, writing each to a separate line in file ``filepath`` with
    individual values separated by ``delimiter``.

    Args:
        rows (Iterable[Iterable]): iterable of iterables of strings and/or
            numbers to write to disk; for example::

                [['That was a great movie!', 0.9],
                 ['The movie was okay, I guess.', 0.2],
                 ['Worst. Movie. Ever.', -1.0]]

        filepath (str): /path/to/file on disk where rows will be written
        encoding (str)
        auto_make_dirs (bool)
        dialect (str): a grouping of formatting parameters that determine how
            the tabular data is parsed when reading/writing
        delimiter (str): 1-character string used to separate fields in a row

    .. seealso:: https://docs.python.org/3/library/csv.html#csv.writer

    .. note:: Here, CSV is used as a catch-all term for *any* delimited file
        format, and ``delimiter=','`` is merely the function's default value.
        Other common delimited formats are TSV (tab-separated-value, with
        ``delimiter='\t'``) and PSV (pipe-separated-value, with ``delimiter='|'``).
    """
    with open_sesame(filepath, mode='wt', encoding=encoding, newline='') as f:
        csv_writer = csv.writer(f, dialect=dialect, delimiter=delimiter)
        csv_writer.writerows(rows)
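Per the note above, a TSV file is just a different delimiter (hypothetical path):

rows = [['That was a great movie!', 0.9],
        ['Worst. Movie. Ever.', -1.0]]
write_csv(rows, 'reviews.tsv', delimiter='\t')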
Example #10
File: read.py Project: dchllngr/textacy
def read_file_lines(filepath, mode='rt', encoding=None):
    """
    Read the contents of a file, line by line. Files compressed with gzip, bz2,
    or lzma are handled automatically.
    """
    with open_sesame(filepath, mode=mode, encoding=encoding) as f:
        for line in f:
            yield line
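For instance (hypothetical path):

# Lines keep their trailing newline, as with a plain file object.
for line in read_file_lines('corpus.txt.bz2'):
    print(line.rstrip('\n'))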
Example #11
    def __iter__(self):
        """
        Iterate over the pages of a Wikipedia articles database dump (*articles.xml.bz2),
        yielding one (page id, page title, page content) 3-tuple at a time.

        Yields:
            Tuple[str, str, str]: page id, title, content with wikimedia markup
        """
        if not PY2:
            events = ('end',)
            f = open_sesame(self.path, mode='rt')
        else:  # Python 2 can't open bzip in text mode :(
            events = (b'end',)
            f = open_sesame(self.path, mode='rb')
        with f:

            elems = (elem for _, elem in iterparse(f, events=events))

            elem = next(elems)
            match = re.match('^{(.*?)}', elem.tag)
            namespace = match.group(1) if match else ''
            if not namespace.startswith('http://www.mediawiki.org/xml/export-'):
                raise ValueError(
                    'namespace "{}" not a valid MediaWiki dump namespace'.format(namespace))

            page_tag = '{%s}page' % namespace
            ns_path = './{%s}ns' % namespace
            page_id_path = './{%s}id' % namespace
            title_path = './{%s}title' % namespace
            text_path = './{%s}revision/{%s}text' % (namespace, namespace)

            for elem in elems:
                if elem.tag == page_tag:
                    page_id = elem.find(page_id_path).text
                    title = elem.find(title_path).text
                    ns = elem.find(ns_path).text
                    if ns != '0':
                        content = ''
                    else:
                        content = elem.find(text_path).text
                    if content is None:
                        content = ''
                    elif not isinstance(content, unicode_type):
                        content = bytes_to_unicode(content, errors='ignore')
                    yield page_id, title, content
                    elem.clear()
Example #12
File: write.py Project: GregBowyer/textacy
def write_file(content, filepath, mode='wt', encoding=None,
               auto_make_dirs=False):
    """
    Write ``content`` to disk at ``filepath``. Files with appropriate extensions
    are compressed with gzip or bz2 automatically. Any intermediate folders
    not found on disk may automatically be created.
    """
    with open_sesame(filepath, mode=mode, encoding=encoding,
                     auto_make_dirs=auto_make_dirs) as f:
        f.write(content)
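For example, with a hypothetical output path (the ``.gz`` extension triggers compression):

write_file('The year was 2081, and everybody was finally equal.',
           'out/story.txt.gz', auto_make_dirs=True)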
Example #13
File: write.py Project: GregBowyer/textacy
def write_file_lines(lines, filepath, mode='wt', encoding=None,
                     auto_make_dirs=False):
    """
    Write the content in ``lines`` to disk at ``filepath``, line by line. Files
    with appropriate extensions are compressed with gzip or bz2 automatically.
    Any intermediate folders not found on disk may automatically be created.
    """
    newline = '\n' if 't' in mode else unicode_to_bytes('\n')
    with open_sesame(filepath, mode=mode, encoding=encoding,
                     auto_make_dirs=auto_make_dirs) as f:
        for line in lines:
            f.write(line + newline)
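A short sketch (hypothetical path); note that lines should not already end in a newline, since one is appended:

write_file_lines(['first line', 'second line'],
                 'out/lines.txt', auto_make_dirs=True)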
Example #14
File: read.py Project: dchllngr/textacy
def read_spacy_docs(filepath):
    """
    Stream ``spacy.Doc`` objects from disk at ``filepath``, where they were
    serialized via pickle.

    Args:
        filepath (str): /path/to/file on disk from which spacy docs will be streamed

    Yields:
        The next deserialized ``spacy.Doc``.
    """
    with open_sesame(filepath, mode='rb') as f:
        for spacy_doc in pickle.load(f):
            yield spacy_doc
Example #15
def read_spacy_docs(spacy_vocab, filepath):
    """
    Stream ``spacy.Doc`` objects from disk at ``filepath``, where they were
    serialized using spaCy's ``spacy.Doc.to_bytes()`` functionality.

    Args:
        spacy_vocab (``spacy.Vocab``): the spacy vocab object used to serialize
            the docs in ``filepath``
        filepath (str): /path/to/file on disk from which spacy docs will be streamed

    Yields:
        the next deserialized ``spacy.Doc``
    """
    with open_sesame(filepath, mode='rb') as f:
        for bytes_string in SpacyDoc.read_bytes(f):
            yield SpacyDoc(spacy_vocab).from_bytes(bytes_string)
Example #16
File: write.py Project: GregBowyer/textacy
def write_spacy_docs(spacy_docs, filepath, auto_make_dirs=False):
    """
    Serialize a sequence of ``spacy.Doc`` objects to disk at ``filepath`` using
    spaCy's ``spacy.Doc.to_bytes()`` functionality.

    Args:
        spacy_docs (``spacy.Doc`` or iterable(``spacy.Doc``)): a single spacy doc
            or a sequence of spacy docs to serialize to disk at ``filepath``
        filepath (str): /path/to/file on disk to which spacy docs will be streamed
        auto_make_dirs (bool)
    """
    if isinstance(spacy_docs, SpacyDoc):
        spacy_docs = (spacy_docs,)
    with open_sesame(filepath, mode='wb', auto_make_dirs=auto_make_dirs) as f:
        for doc in spacy_docs:
            f.write(doc.to_bytes())
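A round-trip sketch against the spaCy 1.x-era ``to_bytes()``/``read_bytes()`` API these helpers assume; the ``'en'`` model name and file path are assumptions:

import spacy

nlp = spacy.load('en')
docs = [nlp(u'The year was 2081.'), nlp(u'Everything was perfectly swell.')]
write_spacy_docs(docs, 'docs.bin')
for doc in read_spacy_docs(nlp.vocab, 'docs.bin'):  # Example #15's reader
    print(doc.text)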
Example #17
File: write.py Project: dchllngr/textacy
def write_spacy_docs(spacy_docs, filepath, auto_make_dirs=False):
    """
    Serialize a sequence of ``spacy.Doc`` objects to disk at ``filepath`` using pickle.

    Args:
        spacy_docs (``spacy.Doc`` or iterable(``spacy.Doc``)): a single spacy doc
            or a sequence of spacy docs to serialize to disk at ``filepath``
        filepath (str): /path/to/file on disk to which spacy docs will be streamed
        auto_make_dirs (bool)

    .. note:: The docs are pickled together, as a list, so they are all loaded
        into memory before saving. Mind your RAM usage!
    """
    if isinstance(spacy_docs, SpacyDoc):
        spacy_docs = [spacy_docs]
    with open_sesame(filepath, mode='wb', auto_make_dirs=auto_make_dirs) as f:
        pickle.dump(list(spacy_docs), f, protocol=-1)
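A matching round trip for the pickle-based variants, using Example #14's single-argument ``read_spacy_docs``; it assumes ``Doc`` objects are picklable in your spaCy version, with ``docs`` as in the previous sketch:

write_spacy_docs(docs, 'docs.pkl')
for doc in read_spacy_docs('docs.pkl'):
    print(doc.text)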
Example #18
File: read.py Project: GregBowyer/textacy
def read_json_lines(filepath, mode='rt', encoding=None):
    """
    Iterate over a stream of JSON objects, where each line of file ``filepath``
    is a valid JSON object but no JSON object (e.g. array) exists at the top level.

    Args:
        filepath (str): /path/to/file on disk from which json objects will be streamed,
            where each line in the file must be its own json object; for example::

                {"title": "Harrison Bergeron", "text": "The year was 2081, and everybody was finally equal."}\n
                {"title": "2BR02B", "text": "Everything was perfectly swell."}

        mode (str, optional)
        encoding (str, optional)

    Yields:
        dict: next valid JSON object, converted to native Python equivalent
    """
    with open_sesame(filepath, mode=mode, encoding=encoding) as f:
        for line in f:
            yield json.loads(line)
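A sketch reading back the ``out/records.jsonl.gz`` file written in Example #8:

for record in read_json_lines('out/records.jsonl.gz'):
    print(record['id'])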