def __iter__(self):
    """
    Iterate over the pages of a Wikipedia articles database dump
    (*articles.xml.bz2), yielding one (page id, page title, page content)
    3-tuple at a time.

    Yields:
        Tuple[str, str, str]: page id, title, content with wikimedia markup
    """
    if not self.filename:
        raise IOError('{} file not found'.format(self._filename))

    if compat.is_python2 is False:
        events = ('end',)
        f = fileio.open_sesame(self.filename, mode='rt')
    else:  # Python 2 can't open bzip in text mode :(
        events = (b'end',)
        f = fileio.open_sesame(self.filename, mode='rb')
    with f:
        elems = (elem for _, elem in iterparse(f, events=events))
        elem = next(elems)
        match = re.match('^{(.*?)}', elem.tag)
        namespace = match.group(1) if match else ''
        if not namespace.startswith('http://www.mediawiki.org/xml/export-'):
            raise ValueError(
                'namespace "{}" not a valid MediaWiki dump namespace'.format(namespace))

        page_tag = '{%s}page' % namespace
        ns_path = './{%s}ns' % namespace
        page_id_path = './{%s}id' % namespace
        title_path = './{%s}title' % namespace
        text_path = './{%s}revision/{%s}text' % (namespace, namespace)

        for elem in elems:
            if elem.tag == page_tag:
                page_id = elem.find(page_id_path).text
                title = elem.find(title_path).text
                ns = elem.find(ns_path).text
                if ns != '0':
                    content = ''
                else:
                    content = elem.find(text_path).text
                    if content is None:
                        content = ''
                    elif not isinstance(content, compat.unicode_):
                        content = compat.bytes_to_unicode(content, errors='ignore')
                yield page_id, title, content
                elem.clear()

def __iter__(self):
    """
    Iterate over the pages in a Wikipedia articles database dump
    (*articles.xml.bz2), yielding one (page id, title, page content)
    3-tuple at a time.
    """
    if PY2 is False:
        for title, content, page_id in extract_pages(
                open_sesame(self.wikicorpus.fname, mode='rt'),
                self.wikicorpus.filter_namespaces):
            yield (page_id, title, content)
    else:  # Python 2 sucks and can't open bzip in text mode
        for title, content, page_id in extract_pages(
                open_sesame(self.wikicorpus.fname, mode='rb'),
                self.wikicorpus.filter_namespaces):
            yield (page_id, title, content)

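# --- Usage sketch: consuming the Wikipedia page stream -----------------------
# A minimal illustration of how either ``__iter__`` above might be consumed.
# ``reader`` is assumed to be an instance of whichever class defines these
# methods; that wrapping class is not shown in this module.
def _example_iter_wiki_pages(reader):
    for page_id, title, content in reader:
        # each item is a (page id, title, content-with-wikimedia-markup) 3-tuple
        print(page_id, title, content[:80])
        break  # only the first page; this is just a sketch
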
def read_csv(filepath, encoding=None, dialect='excel', delimiter=','):
    """
    Iterate over a stream of rows, where each row is an iterable of strings
    and/or numbers with individual values separated by ``delimiter``.

    Args:
        filepath (str): /path/to/file on disk from which rows will be streamed
        encoding (str)
        dialect (str): a grouping of formatting parameters that determine how
            the tabular data is parsed when reading/writing; if 'infer', the
            first 1024 bytes of the file are analyzed, producing a best guess
            for the correct dialect
        delimiter (str): 1-character string used to separate fields in a row

    Yields:
        List[obj]: next row, whose elements are strings and/or numbers

    .. seealso:: https://docs.python.org/3/library/csv.html#csv.reader
    """
    with open_sesame(filepath, mode='rt', encoding=encoding, newline='') as f:
        if dialect == 'infer':
            dialect = csv.Sniffer().sniff(f.read(1024))
            f.seek(0)
        for row in csv.reader(f, dialect=dialect, delimiter=delimiter):
            yield row

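# --- Usage sketch: streaming rows from a delimited file ----------------------
# A minimal example of ``read_csv`` above. The file path and the tab delimiter
# are assumptions for illustration; ``dialect='infer'`` exercises the
# ``csv.Sniffer`` branch in the function body.
def _example_read_csv(filepath='reviews.tsv'):
    for row in read_csv(filepath, dialect='infer', delimiter='\t'):
        # each row is a list of field values
        print(row)
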
def read_file(filepath, mode='rt', encoding=None):
    """
    Read the full contents of a file. Files compressed with gzip, bz2, or lzma
    are handled automatically.
    """
    with open_sesame(filepath, mode=mode, encoding=encoding) as f:
        return f.read()

def read_json_mash(filepath, mode='rt', encoding=None, buffersize=2048):
    """
    Iterate over a stream of JSON objects, all of them mashed together,
    end-to-end, on a single line of a file. Bad form, but still manageable.

    Args:
        filepath (str): /path/to/file on disk from which json objects will be
            streamed, where all json objects are mashed together, end-to-end,
            on a single line; for example::

                {"title": "Harrison Bergeron", "text": "The year was 2081, and everybody was finally equal."}{"title": "2BR02B", "text": "Everything was perfectly swell."}

        mode (str, optional)
        encoding (str, optional)
        buffersize (int, optional): number of bytes to read in as a chunk

    Yields:
        dict: next valid JSON object, converted to native Python equivalent
    """
    with open_sesame(filepath, mode=mode, encoding=encoding) as f:
        buffer = ''
        for chunk in iter(partial(f.read, buffersize), ''):
            buffer += chunk
            while buffer:
                try:
                    result, index = JSON_DECODER.raw_decode(buffer)
                    yield result
                    buffer = buffer[index:]
                # not enough data to decode => read another chunk
                except ValueError:
                    break

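# --- Usage sketch: streaming concatenated JSON objects -----------------------
# A small, self-contained round trip for ``read_json_mash`` above. The file
# path is an assumption; the file is written first only so the example runs
# on its own.
def _example_read_json_mash(filepath='mashed.json'):
    with open(filepath, mode='wt') as f:
        f.write('{"title": "Harrison Bergeron"}{"title": "2BR02B"}')
    for obj in read_json_mash(filepath):
        print(obj['title'])
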
def write_json(json_object, filepath, mode='wt', encoding=None,
               auto_make_dirs=False, ensure_ascii=False, indent=None,
               separators=(',', ':'), sort_keys=False):
    """
    Write JSON object all at once to disk at ``filepath``.

    Args:
        json_object (json): valid JSON object to be written
        filepath (str): /path/to/file on disk to which json object will be
            written, such as a JSON array; for example::

                [
                    {"title": "Harrison Bergeron", "text": "The year was 2081, and everybody was finally equal."},
                    {"title": "2BR02B", "text": "Everything was perfectly swell."}
                ]

        mode (str)
        encoding (str)
        auto_make_dirs (bool)
        indent (int or str)
        ensure_ascii (bool)
        separators (tuple[str])
        sort_keys (bool)

    .. seealso:: https://docs.python.org/3/library/json.html#json.dump
    """
    with open_sesame(filepath, mode=mode, encoding=encoding,
                     auto_make_dirs=auto_make_dirs) as f:
        f.write(
            json.dumps(json_object, indent=indent, ensure_ascii=ensure_ascii,
                       separators=separators, sort_keys=sort_keys))

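# --- Usage sketch: writing a JSON array in one shot ---------------------------
# A minimal example for ``write_json`` above; the file path and records are
# illustrative only.
def _example_write_json(filepath='stories.json'):
    stories = [
        {'title': 'Harrison Bergeron', 'text': 'The year was 2081, and everybody was finally equal.'},
        {'title': '2BR02B', 'text': 'Everything was perfectly swell.'},
    ]
    # indent=2 makes the output human-readable; other params keep their defaults
    write_json(stories, filepath, indent=2, auto_make_dirs=True)
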
def read_json(filepath, mode='rt', encoding=None, prefix=''):
    """
    Iterate over JSON objects matching the field given by ``prefix``.
    Useful for reading a large JSON array one item (with ``prefix='item'``)
    or sub-item (``prefix='item.fieldname'``) at a time.

    Args:
        filepath (str): /path/to/file on disk from which json items will be
            streamed, such as items in a JSON array; for example::

                [
                    {"title": "Harrison Bergeron", "text": "The year was 2081, and everybody was finally equal."},
                    {"title": "2BR02B", "text": "Everything was perfectly swell."}
                ]

        mode (str, optional)
        encoding (str, optional)
        prefix (str, optional): if '', the entire JSON object will be read in
            at once; if 'item', each item in a top-level array will be read in
            successively; if 'item.text', each array item's 'text' value will
            be read in successively

    Yields:
        next matching JSON object; could be a dict, list, int, float, str,
        depending on the value of ``prefix``

    Notes:
        Refer to ``ijson`` at https://pypi.python.org/pypi/ijson/ for usage details.
    """
    with open_sesame(filepath, mode=mode, encoding=encoding) as f:
        if prefix == '':
            yield json.load(f)
        else:
            for item in ijson.items(f, prefix):
                yield item

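# --- Usage sketch: streaming items out of a large JSON array ------------------
# Illustrates the ``prefix`` argument of ``read_json`` above; the file path is
# an assumption. ``prefix='item'`` walks a top-level array one element at a
# time, while ``prefix='item.title'`` yields only each item's 'title' value.
def _example_read_json(filepath='stories.json'):
    for story in read_json(filepath, prefix='item'):
        print(story['title'])
    for title in read_json(filepath, prefix='item.title'):
        print(title)
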
def write_json_lines(json_objects, filepath, mode='wt', encoding=None,
                     auto_make_dirs=False, ensure_ascii=False,
                     separators=(',', ':'), sort_keys=False):
    """
    Iterate over a stream of JSON objects, writing each to a separate line in
    file ``filepath`` but without a top-level JSON object (e.g. array).

    Args:
        json_objects (iterable[json]): iterable of valid JSON objects to be written
        filepath (str): /path/to/file on disk to which JSON objects will be
            written, where each line in the file is its own json object; for example::

                {"title": "Harrison Bergeron", "text": "The year was 2081, and everybody was finally equal."}\n
                {"title": "2BR02B", "text": "Everything was perfectly swell."}

        mode (str)
        encoding (str)
        auto_make_dirs (bool)
        ensure_ascii (bool)
        separators (tuple[str])
        sort_keys (bool)

    .. seealso:: https://docs.python.org/3/library/json.html#json.dump
    """
    newline = '\n' if 't' in mode else unicode_to_bytes('\n')
    with open_sesame(filepath, mode=mode, encoding=encoding,
                     auto_make_dirs=auto_make_dirs) as f:
        for json_object in json_objects:
            f.write(
                json.dumps(json_object, ensure_ascii=ensure_ascii,
                           separators=separators, sort_keys=sort_keys) + newline)

def write_csv(rows, filepath, encoding=None, auto_make_dirs=False,
              dialect='excel', delimiter=','):
    """
    Iterate over a sequence of rows, where each row is an iterable of strings
    and/or numbers, writing each to a separate line in file ``filepath`` with
    individual values separated by ``delimiter``.

    Args:
        rows (Iterable[Iterable]): iterable of iterables of strings and/or
            numbers to write to disk; for example::

                [['That was a great movie!', 0.9],
                 ['The movie was okay, I guess.', 0.2],
                 ['Worst. Movie. Ever.', -1.0]]

        filepath (str): /path/to/file on disk where rows will be written
        encoding (str)
        auto_make_dirs (bool)
        dialect (str): a grouping of formatting parameters that determine how
            the tabular data is parsed when reading/writing
        delimiter (str): 1-character string used to separate fields in a row

    .. seealso:: https://docs.python.org/3/library/csv.html#csv.writer

    .. note:: Here, CSV is used as a catch-all term for *any* delimited file
        format, and ``delimiter=','`` is merely the function's default value.
        Other common delimited formats are TSV (tab-separated-value, with
        ``delimiter='\\t'``) and PSV (pipe-separated-value, with ``delimiter='|'``).
    """
    with open_sesame(filepath, mode='wt', encoding=encoding, newline='') as f:
        csv_writer = csv.writer(f, dialect=dialect, delimiter=delimiter)
        csv_writer.writerows(rows)

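# --- Usage sketch: writing rows to a delimited file ---------------------------
# A minimal example for ``write_csv`` above; the file path and rows are
# illustrative only. Passing delimiter='\t' writes a TSV rather than a CSV.
def _example_write_csv(filepath='reviews.tsv'):
    rows = [
        ['That was a great movie!', 0.9],
        ['The movie was okay, I guess.', 0.2],
        ['Worst. Movie. Ever.', -1.0],
    ]
    write_csv(rows, filepath, delimiter='\t')
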
def read_file_lines(filepath, mode='rt', encoding=None):
    """
    Read the contents of a file, line by line. Files compressed with gzip, bz2,
    or lzma are handled automatically.
    """
    with open_sesame(filepath, mode=mode, encoding=encoding) as f:
        for line in f:
            yield line

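# --- Usage sketch: reading a (possibly compressed) file -----------------------
# A minimal example pairing ``read_file`` and ``read_file_lines`` above; the
# .gz file path is an assumption, relying on the automatic decompression the
# docstrings describe.
def _example_read_file(filepath='notes.txt.gz'):
    full_text = read_file(filepath)
    n_lines = sum(1 for _ in read_file_lines(filepath))
    print(len(full_text), n_lines)
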
def __iter__(self):
    """
    Iterate over the pages of a Wikipedia articles database dump
    (*articles.xml.bz2), yielding one (page id, page title, page content)
    3-tuple at a time.

    Yields:
        Tuple[str, str, str]: page id, title, content with wikimedia markup
    """
    if PY2 is False:
        events = ('end',)
        f = open_sesame(self.path, mode='rt')
    else:  # Python 2 can't open bzip in text mode :(
        events = (b'end',)
        f = open_sesame(self.path, mode='rb')
    with f:
        elems = (elem for _, elem in iterparse(f, events=events))
        elem = next(elems)
        match = re.match('^{(.*?)}', elem.tag)
        namespace = match.group(1) if match else ''
        if not namespace.startswith('http://www.mediawiki.org/xml/export-'):
            raise ValueError(
                'namespace "{}" not a valid MediaWiki dump namespace'.format(namespace))

        page_tag = '{%s}page' % namespace
        ns_path = './{%s}ns' % namespace
        page_id_path = './{%s}id' % namespace
        title_path = './{%s}title' % namespace
        text_path = './{%s}revision/{%s}text' % (namespace, namespace)

        for elem in elems:
            if elem.tag == page_tag:
                page_id = elem.find(page_id_path).text
                title = elem.find(title_path).text
                ns = elem.find(ns_path).text
                if ns != '0':
                    content = ''
                else:
                    content = elem.find(text_path).text
                    if content is None:
                        # guard against empty <text> elements
                        content = ''
                    elif not isinstance(content, unicode_type):
                        content = bytes_to_unicode(content, errors='ignore')
                yield page_id, title, content
                elem.clear()

def write_file(content, filepath, mode='wt', encoding=None, auto_make_dirs=False):
    """
    Write ``content`` to disk at ``filepath``. Files with appropriate extensions
    are compressed with gzip or bz2 automatically. Any intermediate folders
    not found on disk may automatically be created.
    """
    with open_sesame(filepath, mode=mode, encoding=encoding,
                     auto_make_dirs=auto_make_dirs) as f:
        f.write(content)

def write_file_lines(lines, filepath, mode='wt', encoding=None, auto_make_dirs=False):
    """
    Write the content in ``lines`` to disk at ``filepath``, line by line. Files
    with appropriate extensions are compressed with gzip or bz2 automatically.
    Any intermediate folders not found on disk may automatically be created.
    """
    newline = '\n' if 't' in mode else unicode_to_bytes('\n')
    with open_sesame(filepath, mode=mode, encoding=encoding,
                     auto_make_dirs=auto_make_dirs) as f:
        for line in lines:
            f.write(line + newline)

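# --- Usage sketch: line-oriented write/read round trip ------------------------
# Pairs ``write_file_lines`` above with ``read_file_lines``; the .bz2 path is
# an assumption, relying on the automatic compression the docstrings describe.
def _example_write_file_lines(filepath='lines.txt.bz2'):
    write_file_lines(['first line', 'second line'], filepath, auto_make_dirs=True)
    for line in read_file_lines(filepath):
        print(line.rstrip('\n'))
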
def read_spacy_docs(filepath):
    """
    Stream ``spacy.Doc`` s from disk at ``filepath`` where they were serialized
    via pickle.

    Args:
        filepath (str): /path/to/file on disk from which spacy docs will be streamed

    Yields:
        The next deserialized ``spacy.Doc``.
    """
    with open_sesame(filepath, mode='rb') as f:
        for spacy_doc in pickle.load(f):
            yield spacy_doc

def read_spacy_docs(spacy_vocab, filepath):
    """
    Stream ``spacy.Doc`` s from disk at ``filepath`` where they were serialized
    using Spacy's ``spacy.Doc.to_bytes()`` functionality.

    Args:
        spacy_vocab (``spacy.Vocab``): the spacy vocab object used to serialize
            the docs in ``filepath``
        filepath (str): /path/to/file on disk from which spacy docs will be streamed

    Yields:
        the next deserialized ``spacy.Doc``
    """
    with open_sesame(filepath, mode='rb') as f:
        for bytes_string in SpacyDoc.read_bytes(f):
            yield SpacyDoc(spacy_vocab).from_bytes(bytes_string)

def write_spacy_docs(spacy_docs, filepath, auto_make_dirs=False):
    """
    Serialize a sequence of ``spacy.Doc`` s to disk at ``filepath`` using Spacy's
    ``spacy.Doc.to_bytes()`` functionality.

    Args:
        spacy_docs (``spacy.Doc`` or iterable(``spacy.Doc``)): a single spacy doc
            or a sequence of spacy docs to serialize to disk at ``filepath``
        filepath (str): /path/to/file on disk to which spacy docs will be streamed
        auto_make_dirs (bool)
    """
    if isinstance(spacy_docs, SpacyDoc):
        spacy_docs = (spacy_docs,)
    with open_sesame(filepath, mode='wb', auto_make_dirs=auto_make_dirs) as f:
        for doc in spacy_docs:
            f.write(doc.to_bytes())

def write_spacy_docs(spacy_docs, filepath, auto_make_dirs=False):
    """
    Serialize a sequence of ``spacy.Doc`` s to disk at ``filepath`` using pickle.

    Args:
        spacy_docs (``spacy.Doc`` or iterable(``spacy.Doc``)): a single spacy doc
            or a sequence of spacy docs to serialize to disk at ``filepath``
        filepath (str): /path/to/file on disk to which spacy docs will be streamed
        auto_make_dirs (bool)

    .. note:: The docs are pickled together, as a list, so they are all loaded
        into memory before saving. Mind your RAM usage!
    """
    if isinstance(spacy_docs, SpacyDoc):
        spacy_docs = [spacy_docs]
    with open_sesame(filepath, mode='wb', auto_make_dirs=auto_make_dirs) as f:
        pickle.dump(list(spacy_docs), f, protocol=-1)

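# --- Usage sketch: pickling spacy docs to disk ---------------------------------
# Pairs the pickle-based ``write_spacy_docs`` directly above with the
# pickle-based ``read_spacy_docs`` earlier in this module. The 'en' model name
# and file path are assumptions; any loaded spacy pipeline would do.
def _example_spacy_doc_roundtrip(filepath='docs.pkl'):
    import spacy
    nlp = spacy.load('en')  # assumed model name
    docs = [nlp('The year was 2081.'), nlp('Everything was perfectly swell.')]
    write_spacy_docs(docs, filepath, auto_make_dirs=True)
    for doc in read_spacy_docs(filepath):
        print(doc.text)
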
def read_json_lines(filepath, mode='rt', encoding=None):
    """
    Iterate over a stream of JSON objects, where each line of file ``filepath``
    is a valid JSON object but no JSON object (e.g. array) exists at the top level.

    Args:
        filepath (str): /path/to/file on disk from which json objects will be
            streamed, where each line in the file must be its own json object;
            for example::

                {"title": "Harrison Bergeron", "text": "The year was 2081, and everybody was finally equal."}\n
                {"title": "2BR02B", "text": "Everything was perfectly swell."}

        mode (str, optional)
        encoding (str, optional)

    Yields:
        dict: next valid JSON object, converted to native Python equivalent
    """
    with open_sesame(filepath, mode=mode, encoding=encoding) as f:
        for line in f:
            yield json.loads(line)

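# --- Usage sketch: JSON-lines write/read round trip ----------------------------
# Pairs ``read_json_lines`` above with ``write_json_lines`` earlier in this
# module; the file path and records are illustrative only.
def _example_json_lines_roundtrip(filepath='stories.jsonl'):
    stories = [
        {'title': 'Harrison Bergeron', 'text': 'The year was 2081, and everybody was finally equal.'},
        {'title': '2BR02B', 'text': 'Everything was perfectly swell.'},
    ]
    write_json_lines(stories, filepath, auto_make_dirs=True)
    for story in read_json_lines(filepath):
        print(story['title'])
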