def test_read_write_spacy_doc(self):
    """Write one spaCy doc to disk, read it back, and compare token lemmas."""
    expected = [token.lemma_ for token in self.spacy_doc]
    filename = os.path.join(self.tempdir, 'test_read_write_spacy_doc.bin')
    fileio.write_spacy_docs(self.spacy_doc, filename)

    # flatten the lemmas of every doc read back from the file, in order
    observed = []
    for doc in fileio.read_spacy_docs(self.spacy_pipeline.vocab, filename):
        observed.extend(token.lemma_ for token in doc)

    self.assertEqual(observed, expected)
def load(cls, path, fname_prefix=None):
    """
    Build a TextDoc from content and metadata previously saved to disk.

    Args:
        path (str): directory on disk where content + metadata are saved
        fname_prefix (str, optional): additional identifying information
            that was prepended (joined with '_') to the standard filenames
            'spacy_doc.bin' and 'metadata.json' when saving to disk

    Returns:
        :class:`textacy.TextDoc`

    Note:
        A ``UserWarning`` is issued when the spaCy version recorded in the
        metadata file differs from ``spacy.about.__version__``.
    """
    meta_basename = 'metadata.json'
    docs_basename = 'spacy_doc.bin'
    if fname_prefix:
        meta_basename = '_'.join([fname_prefix, meta_basename])
        docs_basename = '_'.join([fname_prefix, docs_basename])
    meta_fname = os.path.join(path, meta_basename)
    docs_fname = os.path.join(path, docs_basename)

    # the metadata file also carries the language and spaCy version; they are
    # popped so that only the remaining entries are handed to the new doc
    metadata = list(fileio.read_json(meta_fname))[0]
    lang = metadata.pop('textacy_lang')
    saved_version = metadata.pop('spacy_version')
    installed_version = spacy.about.__version__
    if saved_version != installed_version:
        # leading/trailing whitespace of the message is kept as-is on purpose
        template = (
            " the spaCy version used to save this TextDoc to disk is not the "
            "same as the version currently installed ('{}' vs. '{}'); if the "
            "data underlying the associated `spacy.Vocab` has changed, this "
            "loaded TextDoc may not be valid! "
        )
        warnings.warn(
            template.format(saved_version, installed_version), UserWarning)

    spacy_vocab = data.load_spacy(lang).vocab
    first_doc = list(fileio.read_spacy_docs(spacy_vocab, docs_fname))[0]
    return cls(first_doc, lang=lang, metadata=metadata)
def load(cls, path, fname_prefix=None, compression=None):
    """
    Build a TextCorpus from content and metadata previously saved to disk.

    Args:
        path (str): directory on disk where content + metadata are saved
        fname_prefix (str, optional): additional identifying information
            that was prepended (joined with '_') to the standard filenames
            'spacy_docs.bin' and 'metadatas.json' when saving to disk
        compression ({'gzip', 'bz2', 'lzma'} or None): type of compression
            used to reduce size of metadatas json file; it selects the
            extension ('.gz', '.bz2', '.xz' or none) appended to that
            file's name

    Returns:
        :class:`textacy.TextCorpus`

    .. warning:: If the `spacy.Vocab` object used to save this corpus is not
        the same as the one used to load it, there will be problems!
        Consequently, this functionality is only useful as short-term but
        not long-term storage.
    """
    info_basename = 'info.json'
    meta_basename = 'metadatas.json'
    docs_basename = 'spacy_docs.bin'
    if fname_prefix:
        info_basename = '_'.join([fname_prefix, info_basename])
        meta_basename = '_'.join([fname_prefix, meta_basename])
        docs_basename = '_'.join([fname_prefix, docs_basename])
    info_fname = os.path.join(path, info_basename)
    meta_fname = os.path.join(path, meta_basename)
    docs_fname = os.path.join(path, docs_basename)

    # only the metadatas file name carries a compression extension
    if compression == 'gzip':
        meta_fname += '.gz'
    elif compression == 'bz2':
        meta_fname += '.bz2'
    elif compression == 'lzma':
        meta_fname += '.xz'

    if PY2 is False or compression is None:
        meta_mode = 'rt'
    else:
        meta_mode = 'rb'

    package_info = list(fileio.read_json(info_fname))[0]
    lang = package_info['textacy_lang']
    saved_version = package_info['spacy_version']
    installed_version = spacy.about.__version__
    if saved_version != installed_version:
        # leading/trailing whitespace of the message is kept as-is on purpose
        template = (
            " the spaCy version used to save this TextCorpus to disk is not "
            "the same as the version currently installed ('{}' vs. '{}'); if "
            "the data underlying the associated `spacy.Vocab` has changed, "
            "this loaded TextCorpus may not be valid! \n"
        )
        warnings.warn(
            template.format(saved_version, installed_version), UserWarning)

    textcorpus = TextCorpus(lang)
    metadata_stream = fileio.read_json_lines(meta_fname, mode=meta_mode)
    spacy_docs = fileio.read_spacy_docs(textcorpus.spacy_vocab, docs_fname)
    # zip pairs docs with metadata positionally and stops at the shorter one
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        textdoc = TextDoc(
            spacy_doc,
            spacy_pipeline=textcorpus.spacy_pipeline,
            lang=lang,
            metadata=metadata)
        textcorpus.add_doc(textdoc)
    return textcorpus
def test_read_write_spacy_doc(self):
    """Check that a spaCy doc survives a write/read round trip via lemmas."""
    expected = [tok.lemma_ for tok in self.spacy_doc]
    filename = os.path.join(self.tempdir, 'test_read_write_spacy_doc.bin')
    fileio.write_spacy_docs(self.spacy_doc, filename)

    docs_read = fileio.read_spacy_docs(self.spacy_pipeline.vocab, filename)
    observed = []
    for doc in docs_read:
        for tok in doc:
            observed.append(tok.lemma_)

    self.assertEqual(observed, expected)
def test_read_write_spacy_doc(self):
    """
    Write one spaCy doc to a file in a fresh temp directory, read it back,
    and check that the token lemmas are unchanged.

    The temp directory is removed in a ``finally`` clause, so it is cleaned
    up even when writing or reading raises (previously the directory and
    file were leaked in that case).
    """
    import shutil  # local import: only needed for cleanup in this test

    expected = [tok.lemma_ for tok in self.spacy_doc]
    tempdir = tempfile.mkdtemp()
    try:
        filename = os.path.join(tempdir, 'test_read_write_spacy_doc.bin')
        fileio.write_spacy_docs(self.spacy_doc, filename)
        observed = [
            tok.lemma_
            for doc in fileio.read_spacy_docs(
                self.spacy_pipeline.vocab, filename)
            for tok in doc
        ]
    finally:
        # removes the directory and whatever was written into it, whether
        # or not the output file was actually created
        shutil.rmtree(tempdir)
    self.assertEqual(observed, expected)
def load(cls, path, name=None, compression=None):
    """
    Read content and metadata back from disk and build a ``Corpus``.

    Args:
        path (str): Directory on disk where content + metadata are saved.
        name (str): Identifying/uniquifying name that was prepended (joined
            with '_') to the default filenames 'spacy_docs.bin',
            'metadatas.json', and 'info.json' when the corpus was saved to
            disk via :meth:`Corpus.save()`.
        compression ({'gzip', 'bz2', 'lzma'} or None): Type of compression
            used to reduce size of 'metadatas.json' file when saved, if any;
            it selects the extension ('.gz', '.bz2', '.xz' or none) appended
            to that file's name.

    Returns:
        :class:`textacy.Corpus <Corpus>`

    .. warning:: If the ``spacy.Vocab`` object used to save this document
        is not the same as the one used to load it, there will be problems!
        Consequently, this functionality is only useful as short-term but
        not long-term storage.
    """
    def _fname(basename):
        # prefix the default file name with `name` (if given), under `path`
        if name:
            basename = '_'.join([name, basename])
        return os.path.join(path, basename)

    info_fname = _fname('info.json')
    meta_fname = _fname('metadatas.json')
    docs_fname = _fname('spacy_docs.bin')

    # only the metadatas file name carries a compression extension
    if compression == 'gzip':
        meta_fname += '.gz'
    elif compression == 'bz2':
        meta_fname += '.bz2'
    elif compression == 'lzma':
        meta_fname += '.xz'

    if PY2 is False or compression is None:
        meta_mode = 'rt'
    else:
        meta_mode = 'rb'

    package_info = list(fileio.read_json(info_fname))[0]
    lang = package_info['textacy_lang']
    saved_version = package_info['spacy_version']
    installed_version = spacy.about.__version__
    if saved_version != installed_version:
        # leading/trailing whitespace of the message is kept as-is on purpose
        template = (
            " the spaCy version used to save this Corpus to disk is not the "
            "same as the version currently installed ('{}' vs. '{}'); if the "
            "data underlying the associated `spacy.Vocab` has changed, this "
            "loaded Corpus may not be valid! \n"
        )
        warnings.warn(
            template.format(saved_version, installed_version), UserWarning)

    corpus = Corpus(lang)
    metadata_stream = fileio.read_json_lines(meta_fname, mode=meta_mode)
    spacy_docs = fileio.read_spacy_docs(corpus.spacy_vocab, docs_fname)
    # zip pairs docs with metadata positionally and stops at the shorter one
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        doc = Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata)
        corpus.add_doc(doc)
    return corpus
def load(cls, path, fname_prefix=None, compression=None):
    """
    Read serialized content and metadata from disk into a new TextCorpus.

    Args:
        path (str): directory on disk where content + metadata are saved
        fname_prefix (str, optional): additional identifying information
            that was prepended (joined with '_') to the standard filenames
            'spacy_docs.bin' and 'metadatas.json' when saving to disk
        compression ({'gzip', 'bz2', 'lzma'} or None): type of compression
            used to reduce size of metadatas json file

    Returns:
        :class:`textacy.TextCorpus`

    .. warning:: If the `spacy.Vocab` object used to save this corpus is not
        the same as the one used to load it, there will be problems!
        Consequently, this functionality is only useful as short-term but
        not long-term storage.
    """
    def _fname(basename):
        # prefix the standard file name (if a prefix was given), under `path`
        if fname_prefix:
            basename = '_'.join([fname_prefix, basename])
        return os.path.join(path, basename)

    info_fname = _fname('info.json')
    meta_fname = _fname('metadatas.json')
    docs_fname = _fname('spacy_docs.bin')

    # extension of the metadatas file depends on the compression scheme
    if compression == 'gzip':
        meta_ext = '.gz'
    elif compression == 'bz2':
        meta_ext = '.bz2'
    elif compression == 'lzma':
        meta_ext = '.xz'
    else:
        meta_ext = ''
    meta_fname = meta_fname + meta_ext

    meta_mode = 'rb'
    if PY2 is False or compression is None:
        meta_mode = 'rt'

    package_info = list(fileio.read_json(info_fname))[0]
    lang = package_info['textacy_lang']
    spacy_version = package_info['spacy_version']
    if spacy_version != spacy.about.__version__:
        # leading/trailing whitespace of the message is kept as-is on purpose
        msg = (
            " the spaCy version used to save this TextCorpus to disk is not "
            "the same as the version currently installed ('{}' vs. '{}'); if "
            "the data underlying the associated `spacy.Vocab` has changed, "
            "this loaded TextCorpus may not be valid! \n"
        ).format(spacy_version, spacy.about.__version__)
        warnings.warn(msg, UserWarning)

    textcorpus = TextCorpus(lang)
    metadata_stream = fileio.read_json_lines(meta_fname, mode=meta_mode)
    spacy_docs = fileio.read_spacy_docs(textcorpus.spacy_vocab, docs_fname)
    # docs and metadata are matched up by position in their files
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        textcorpus.add_doc(
            TextDoc(
                spacy_doc,
                spacy_pipeline=textcorpus.spacy_pipeline,
                lang=lang,
                metadata=metadata,
            )
        )
    return textcorpus
def test_read_write_spacy_docs(self):
    """
    Round-trip the spaCy doc through each pickle file extension and compare
    token lemmas; under Python 2, opening the '.pkl.xz' file for writing is
    expected to raise ``ValueError`` instead.
    """
    expected = [token.lemma_ for token in self.spacy_doc]
    for ext in ('.pkl', '.pkl.gz', '.pkl.bz2', '.pkl.xz'):
        filename = os.path.join(
            self.tempdir, 'test_read_write_spacy_docs' + ext)

        if is_python2 is True and ext == '.pkl.xz':
            self.assertRaises(
                ValueError,
                fileio.open_sesame, filename, 'wb', None, True)
            continue

        fileio.write_spacy_docs(self.spacy_doc, filename, True)
        observed = []
        for doc in fileio.read_spacy_docs(filename):
            observed.extend(token.lemma_ for token in doc)
        self.assertEqual(observed, expected)
def test_read_write_spacy_docs(self):
    """
    Round-trip the spaCy doc through each binary file extension and compare
    token lemmas; under Python 2, writing the '.bin.gz' file is expected to
    raise ``TypeError`` instead.
    """
    expected = [token.lemma_ for token in self.spacy_doc]
    for ext in ('.bin', '.bin.gz', '.bin.bz2', '.bin.xz'):
        filename = os.path.join(
            self.tempdir, 'test_read_write_spacy_docs' + ext)

        # no idea why this is the case
        if PY2 is True and ext == '.bin.gz':
            self.assertRaises(
                TypeError,
                fileio.write_spacy_docs, self.spacy_doc, filename, True)
            continue

        fileio.write_spacy_docs(self.spacy_doc, filename, True)
        observed = []
        for doc in fileio.read_spacy_docs(self.spacy_pipeline.vocab, filename):
            observed.extend(token.lemma_ for token in doc)
        self.assertEqual(observed, expected)
def test_read_write_spacy_docs(self):
    """
    Round-trip the spaCy doc through each binary file extension and compare
    token lemmas. Under Python 2 two extensions are expected to fail
    instead: '.bin.xz' with ``ValueError`` when the file is opened for
    writing, and '.bin.gz' with ``TypeError`` when the docs are written.
    """
    expected = [token.lemma_ for token in self.spacy_doc]
    for ext in ('.bin', '.bin.gz', '.bin.bz2', '.bin.xz'):
        filename = os.path.join(
            self.tempdir, 'test_read_write_spacy_docs' + ext)

        if PY2 is True and ext == '.bin.xz':
            self.assertRaises(
                ValueError,
                fileio.open_sesame, filename, 'wb', None, True)
            continue

        # no idea why this is the case
        if PY2 is True and ext == '.bin.gz':
            self.assertRaises(
                TypeError,
                fileio.write_spacy_docs, self.spacy_doc, filename, True)
            continue

        fileio.write_spacy_docs(self.spacy_doc, filename, True)
        observed = []
        for doc in fileio.read_spacy_docs(self.spacy_lang.vocab, filename):
            observed.extend(token.lemma_ for token in doc)
        self.assertEqual(observed, expected)
def load(cls, path, fname_prefix=None):
    """
    Build a TextCorpus from content and metadata previously saved to disk.

    Args:
        path (str): directory on disk where content + metadata are saved
        fname_prefix (str, optional): additional identifying information
            that was prepended (joined with '_') to the standard filenames
            'spacy_docs.bin' and 'metadatas.json' when saving to disk

    Returns:
        :class:`textacy.TextCorpus`

    Note:
        A ``UserWarning`` is issued when the spaCy version recorded in the
        info file differs from ``spacy.about.__version__``.
    """
    info_basename = 'info.json'
    meta_basename = 'metadatas.json'
    docs_basename = 'spacy_docs.bin'
    if fname_prefix:
        info_basename = '_'.join([fname_prefix, info_basename])
        meta_basename = '_'.join([fname_prefix, meta_basename])
        docs_basename = '_'.join([fname_prefix, docs_basename])
    info_fname = os.path.join(path, info_basename)
    meta_fname = os.path.join(path, meta_basename)
    docs_fname = os.path.join(path, docs_basename)

    package_info = list(fileio.read_json(info_fname))[0]
    lang = package_info['textacy_lang']
    saved_version = package_info['spacy_version']
    installed_version = spacy.about.__version__
    if saved_version != installed_version:
        # leading/trailing whitespace of the message is kept as-is on purpose
        template = (
            " the spaCy version used to save this TextCorpus to disk is not "
            "the same as the version currently installed ('{}' vs. '{}'); if "
            "the data underlying the associated `spacy.Vocab` has changed, "
            "this loaded TextCorpus may not be valid! "
        )
        warnings.warn(
            template.format(saved_version, installed_version), UserWarning)

    textcorpus = TextCorpus(lang)
    metadata_stream = fileio.read_json_lines(meta_fname)
    spacy_docs = fileio.read_spacy_docs(textcorpus.spacy_vocab, docs_fname)
    # zip pairs docs with metadata positionally and stops at the shorter one
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        textdoc = TextDoc(
            spacy_doc,
            spacy_pipeline=textcorpus.spacy_pipeline,
            lang=lang,
            metadata=metadata)
        textcorpus.add_doc(textdoc)
    return textcorpus
def load(cls, path, fname_prefix=None):
    """
    Read serialized content and metadata from disk into a new TextDoc.

    Args:
        path (str): directory on disk where content + metadata are saved
        fname_prefix (str, optional): additional identifying information
            that was prepended (joined with '_') to the standard filenames
            'spacy_doc.bin' and 'metadata.json' when saving to disk

    Returns:
        :class:`textacy.TextDoc`

    .. warning:: If the `spacy.Vocab` object used to save this document is
        not the same as the one used to load it, there will be problems!
        Consequently, this functionality is only useful as short-term but
        not long-term storage.
    """
    def _fname(basename):
        # prefix the standard file name (if a prefix was given), under `path`
        if fname_prefix:
            basename = '_'.join([fname_prefix, basename])
        return os.path.join(path, basename)

    meta_fname = _fname('metadata.json')
    docs_fname = _fname('spacy_doc.bin')

    # language and spaCy version are stored alongside the metadata; remove
    # them so only the remaining entries are passed on to the new doc
    metadata = list(fileio.read_json(meta_fname))[0]
    lang = metadata.pop('textacy_lang')
    spacy_version = metadata.pop('spacy_version')
    if spacy_version != spacy.about.__version__:
        # leading/trailing whitespace of the message is kept as-is on purpose
        msg = (
            " the spaCy version used to save this TextDoc to disk is not the "
            "same as the version currently installed ('{}' vs. '{}'); if the "
            "data underlying the associated `spacy.Vocab` has changed, this "
            "loaded TextDoc may not be valid! "
        ).format(spacy_version, spacy.about.__version__)
        warnings.warn(msg, UserWarning)

    spacy_vocab = data.load_spacy(lang).vocab
    loaded_docs = list(fileio.read_spacy_docs(spacy_vocab, docs_fname))
    return cls(loaded_docs[0], lang=lang, metadata=metadata)
def load(cls, path, name=None):
    """
    Read content and metadata back from disk and build a ``Doc``.

    Args:
        path (str): Directory on disk where content and metadata are saved.
        name (str): Identifying/uniquifying name that was prepended (joined
            with '_') to the default filenames 'spacy_doc.bin' and
            'metadata.json' when the doc was saved to disk via
            :meth:`Doc.save()`.

    Returns:
        :class:`textacy.Doc <Doc>`

    .. warning:: If the ``spacy.Vocab`` object used to save this document
        is not the same as the one used to load it, there will be problems!
        Consequently, this functionality is only useful as short-term but
        not long-term storage.
    """
    meta_basename = 'metadata.json'
    docs_basename = 'spacy_doc.bin'
    if name:
        meta_basename = '_'.join([name, meta_basename])
        docs_basename = '_'.join([name, docs_basename])
    meta_fname = os.path.join(path, meta_basename)
    docs_fname = os.path.join(path, docs_basename)

    # language and spaCy version are stored alongside the metadata; remove
    # them so only the remaining entries are passed on to the new doc
    metadata = list(fileio.read_json(meta_fname))[0]
    lang = metadata.pop('textacy_lang')
    saved_version = metadata.pop('spacy_version')
    installed_version = spacy.about.__version__
    if saved_version != installed_version:
        # leading/trailing whitespace of the message is kept as-is on purpose
        template = (
            " the spaCy version used to save this Doc to disk is not the "
            "same as the version currently installed ('{}' vs. '{}'); if the "
            "data underlying the associated `spacy.Vocab` has changed, this "
            "loaded Doc may not be valid! "
        )
        warnings.warn(
            template.format(saved_version, installed_version), UserWarning)

    spacy_vocab = data.load_spacy(lang).vocab
    first_doc = list(fileio.read_spacy_docs(spacy_vocab, docs_fname))[0]
    return cls(first_doc, lang=lang, metadata=metadata)