def save(self, path, fname_prefix=None):
    """
    Serialize this TextCorpus's content and metadata to files on disk.

    Args:
        path (str): directory on disk where content + metadata will be saved
        fname_prefix (str, optional): prepend standard filenames 'spacy_docs.bin'
            and 'metadatas.json' with additional identifying information

    .. warn:: If the `spacy.Vocab` object used to save this corpus is not the
        same as the one used to load it, there will be problems! Consequently,
        this functionality is only useful as short-term but not long-term storage.
    """
    def _fname(basename):
        # prepend the optional prefix to each standard filename
        if fname_prefix:
            return os.path.join(path, '_'.join([fname_prefix, basename]))
        return os.path.join(path, basename)

    info_fname = _fname('info.json')
    meta_fname = _fname('metadatas.json')
    docs_fname = _fname('spacy_docs.bin')
    package_info = {'textacy_lang': self.lang,
                    'spacy_version': spacy.about.__version__}
    fileio.write_json(package_info, info_fname)
    fileio.write_json_lines((doc.metadata for doc in self), meta_fname)
    fileio.write_spacy_docs((doc.spacy_doc for doc in self), docs_fname)
def test_read_write_json_lines(self):
    """Round-trip check: records written as JSON lines read back unchanged."""
    filename = os.path.join(self.tempdir, 'test_read_write_json_lines.json')
    expected = [{'idx': i, 'sent': sent.text}
                for i, sent in enumerate(self.spacy_doc.sents)]
    fileio.write_json_lines(expected, filename)
    self.assertEqual(list(fileio.read_json_lines(filename)), expected)
def save(self, path, fname_prefix=None, compression=None):
    """
    Serialize this TextCorpus's content and metadata to files on disk.

    Args:
        path (str): directory on disk where content + metadata will be saved
        fname_prefix (str, optional): prepend standard filenames 'spacy_docs.bin'
            and 'metadatas.json' with additional identifying information
        compression ({'gzip', 'bz2', 'lzma'} or None): type of compression
            used to reduce size of metadatas json file

    .. warn:: If the `spacy.Vocab` object used to save this corpus is not the
        same as the one used to load it, there will be problems! Consequently,
        this functionality is only useful as short-term but not long-term storage.
    """
    if fname_prefix:
        info_fname = os.path.join(path, '_'.join([fname_prefix, 'info.json']))
        meta_fname = os.path.join(path, '_'.join([fname_prefix, 'metadatas.json']))
        docs_fname = os.path.join(path, '_'.join([fname_prefix, 'spacy_docs.bin']))
    else:
        info_fname = os.path.join(path, 'info.json')
        meta_fname = os.path.join(path, 'metadatas.json')
        docs_fname = os.path.join(path, 'spacy_docs.bin')
    # compressed metadatas get the matching file extension appended
    compression_exts = {'gzip': '.gz', 'bz2': '.bz2', 'lzma': '.xz'}
    meta_fname += compression_exts.get(compression, '')
    # py2 can only write compressed files in bytes mode
    if PY2 is False or compression is None:
        meta_mode = 'wt'
    else:
        meta_mode = 'wb'
    package_info = {'textacy_lang': self.lang,
                    'spacy_version': spacy.about.__version__}
    fileio.write_json(package_info, info_fname)
    fileio.write_json_lines(
        (doc.metadata for doc in self), meta_fname,
        mode=meta_mode, ensure_ascii=False, separators=(',', ':'))
    fileio.write_spacy_docs((doc.spacy_doc for doc in self), docs_fname)
def test_read_write_json_lines(self):
    """Writing records as JSON lines then reading them yields the originals."""
    records = [{'idx': idx, 'sent': span.text}
               for idx, span in enumerate(self.spacy_doc.sents)]
    out_path = os.path.join(self.tempdir, 'test_read_write_json_lines.json')
    fileio.write_json_lines(records, out_path)
    round_tripped = list(fileio.read_json_lines(out_path))
    self.assertEqual(round_tripped, records)
def setUp(self):
    """Create a temp dir and a bz2-compressed Reddit-comments fixture file."""
    self.tempdir = tempfile.mkdtemp(
        prefix='test_corpora', dir=os.path.dirname(os.path.abspath(__file__)))
    reddit_fname = os.path.join(self.tempdir, 'RC_test.bz2')
    # py2 can only write compressed files in bytes mode
    mode = 'wt' if PY2 is False else 'wb'
    write_json_lines(REDDIT_COMMENTS, reddit_fname, mode=mode,
                     auto_make_dirs=True)
    self.redditreader = RedditReader(reddit_fname)
def setUp(self):
    """Build a temporary bz2 Reddit-comments file and a reader over it."""
    here = os.path.dirname(os.path.abspath(__file__))
    self.tempdir = tempfile.mkdtemp(prefix='test_corpora', dir=here)
    reddit_fname = os.path.join(self.tempdir, 'RC_test.bz2')
    # bytes mode is required for compressed writes on py2
    write_mode = 'wt' if is_python2 is False else 'wb'
    write_json_lines(REDDIT_COMMENTS, reddit_fname, mode=write_mode,
                     auto_make_dirs=True)
    self.redditreader = RedditReader(reddit_fname)
def test_read_write_json_lines_unicode(self):
    """Text-mode JSON-lines round-trips across plain and compressed files."""
    expected = [{'idx': i, 'sent': sent.text}
                for i, sent in enumerate(self.spacy_doc.sents)]
    for ext in ('.json', '.json.gz', '.json.bz2', '.json.xz'):
        filename = os.path.join(
            self.tempdir, 'test_read_write_json_lines_unicode' + ext)
        if PY2 is True and ext != '.json':
            # py2 cannot open compressed files in text mode
            self.assertRaises(
                ValueError, fileio.open_sesame, filename, 'wt', None, True)
            continue
        fileio.write_json_lines(expected, filename, mode='wt',
                                auto_make_dirs=True)
        observed = list(fileio.read_json_lines(filename, mode='rt'))
        self.assertEqual(observed, expected)
def save(self, path, fname_prefix=None, compression=None):
    """
    Write this TextCorpus's serialized content and metadata to disk.

    Args:
        path (str): directory on disk where content + metadata will be saved
        fname_prefix (str, optional): prepend standard filenames 'spacy_docs.bin'
            and 'metadatas.json' with additional identifying information
        compression ({'gzip', 'bz2', 'lzma'} or None): type of compression
            used to reduce size of metadatas json file

    .. warn:: If the `spacy.Vocab` object used to save this corpus is not the
        same as the one used to load it, there will be problems! Consequently,
        this functionality is only useful as short-term but not long-term storage.
    """
    basenames = ('info.json', 'metadatas.json', 'spacy_docs.bin')
    if fname_prefix:
        info_fname, meta_fname, docs_fname = (
            os.path.join(path, '_'.join([fname_prefix, basename]))
            for basename in basenames)
    else:
        info_fname, meta_fname, docs_fname = (
            os.path.join(path, basename) for basename in basenames)
    # append the extension matching the requested compression, if any
    meta_fname += {'gzip': '.gz', 'bz2': '.bz2', 'lzma': '.xz'}.get(
        compression, '')
    # compressed writes on py2 must use bytes mode
    if PY2 is False or compression is None:
        meta_mode = 'wt'
    else:
        meta_mode = 'wb'
    package_info = {
        'textacy_lang': self.lang,
        'spacy_version': spacy.about.__version__
    }
    fileio.write_json(package_info, info_fname)
    fileio.write_json_lines((doc.metadata for doc in self), meta_fname,
                            mode=meta_mode, ensure_ascii=False,
                            separators=(',', ':'))
    fileio.write_spacy_docs((doc.spacy_doc for doc in self), docs_fname)
def save(self, path, fname_prefix=None):
    """
    Persist serialized TextCorpus content and metadata to disk.

    Args:
        path (str): directory on disk where content + metadata will be saved
        fname_prefix (str, optional): prepend standard filenames 'spacy_docs.bin'
            and 'metadatas.json' with additional identifying information
    """
    if fname_prefix:
        # prefix each standard filename with the identifying string
        fnames = ['_'.join([fname_prefix, basename])
                  for basename in ('info.json', 'metadatas.json', 'spacy_docs.bin')]
    else:
        fnames = ['info.json', 'metadatas.json', 'spacy_docs.bin']
    info_fname, meta_fname, docs_fname = (
        os.path.join(path, fname) for fname in fnames)
    fileio.write_json(
        {'textacy_lang': self.lang, 'spacy_version': spacy.about.__version__},
        info_fname)
    fileio.write_json_lines((doc.metadata for doc in self), meta_fname)
    fileio.write_spacy_docs((doc.spacy_doc for doc in self), docs_fname)
def save(self, path, name=None, compression=None):
    """
    Save ``Corpus`` content and metadata to disk.

    Args:
        path (str): Directory on disk where content + metadata will be saved.
        name (str): Prepend default filenames 'spacy_docs.bin', 'metadatas.json',
            and 'info.json' with a name to identify/uniquify this particular corpus.
        compression ({'gzip', 'bz2', 'lzma'} or None): Type of compression
            used to reduce size of 'metadatas.json' file, if any.

    .. warning:: If the ``spacy.Vocab`` object used to save this corpus is not the
        same as the one used to load it, there will be problems! Consequently,
        this functionality is only useful as short-term but not long-term storage.
    """
    if name:
        info_fname = os.path.join(path, '_'.join([name, 'info.json']))
        meta_fname = os.path.join(path, '_'.join([name, 'metadatas.json']))
        # BUG FIX: was 'spacy_docs.pkl', inconsistent with the else-branch,
        # the docstring, and the other save() implementations — a named
        # corpus would be written where a loader looking for
        # '<name>_spacy_docs.bin' could never find it
        docs_fname = os.path.join(path, '_'.join([name, 'spacy_docs.bin']))
    else:
        info_fname = os.path.join(path, 'info.json')
        meta_fname = os.path.join(path, 'metadatas.json')
        docs_fname = os.path.join(path, 'spacy_docs.bin')
    # append the extension matching the requested compression, if any
    meta_fname = meta_fname + ('.gz' if compression == 'gzip'
                               else '.bz2' if compression == 'bz2'
                               else '.xz' if compression == 'lzma'
                               else '')
    # py2 can only write compressed files in bytes mode
    meta_mode = 'wt' if is_python2 is False or compression is None else 'wb'
    package_info = {
        'textacy_lang': self.lang,
        'spacy_version': spacy.about.__version__
    }
    fileio.write_json(package_info, info_fname)
    fileio.write_json_lines((doc.metadata for doc in self), meta_fname,
                            mode=meta_mode, ensure_ascii=False,
                            separators=(',', ':'))
    fileio.write_spacy_docs((doc.spacy_doc for doc in self), docs_fname)
def test_read_write_json_lines_bytes(self):
    """Bytes-mode JSON-lines: allowed on py2 (except lzma), rejected on py3."""
    expected = [{'idx': i, 'sent': sent.text}
                for i, sent in enumerate(self.spacy_doc.sents)]
    for ext in ('.json', '.json.gz', '.json.bz2', '.json.xz'):
        filename = os.path.join(
            self.tempdir, 'test_read_write_json_lines_bytes' + ext)
        if is_python2 is not True:
            # py3 refuses to write json in bytes mode altogether
            self.assertRaises(TypeError, fileio.write_json_lines,
                              expected, filename, 'wb', None, True)
        elif ext == '.json.xz':
            # py2 has no bytes-mode support for lzma files
            self.assertRaises(ValueError, fileio.open_sesame,
                              filename, 'wb', 'utf-8', True)
        else:
            fileio.write_json_lines(expected, filename, mode='wb',
                                    auto_make_dirs=True)
            observed = list(fileio.read_json_lines(filename, mode='rb'))
            self.assertEqual(observed, expected)
def save(self, path, name=None, compression=None):
    """
    Write ``Corpus`` content and metadata to files on disk.

    Args:
        path (str): Directory on disk where content + metadata will be saved.
        name (str): Prepend default filenames 'spacy_docs.bin', 'metadatas.json',
            and 'info.json' with a name to identify/uniquify this particular corpus.
        compression ({'gzip', 'bz2', 'lzma'} or None): Type of compression
            used to reduce size of 'metadatas.json' file, if any.

    .. warning:: If the ``spacy.Vocab`` object used to save this corpus is not the
        same as the one used to load it, there will be problems! Consequently,
        this functionality is only useful as short-term but not long-term storage.
    """
    def _fname(basename):
        # prepend the optional corpus name to each default filename
        if name:
            return os.path.join(path, '_'.join([name, basename]))
        return os.path.join(path, basename)

    info_fname = _fname('info.json')
    meta_fname = _fname('metadatas.json')
    docs_fname = _fname('spacy_docs.bin')
    # tack on the file extension matching the compression type, if any
    meta_fname += {'gzip': '.gz', 'bz2': '.bz2', 'lzma': '.xz'}.get(
        compression, '')
    # compressed files on py2 can only be written in bytes mode
    if PY2 is False or compression is None:
        meta_mode = 'wt'
    else:
        meta_mode = 'wb'
    package_info = {'textacy_lang': self.lang,
                    'spacy_version': spacy.about.__version__}
    fileio.write_json(package_info, info_fname)
    fileio.write_json_lines((doc.metadata for doc in self), meta_fname,
                            mode=meta_mode, ensure_ascii=False,
                            separators=(',', ':'))
    fileio.write_spacy_docs((doc.spacy_doc for doc in self), docs_fname)