Exemplo n.º 1
0
    def __init__(self, lang, texts=None, docs=None, metadatas=None):
        """
        Set up an empty corpus bound to a spaCy language pipeline, then
        optionally fill it from either ``texts`` or ``docs``.

        Args:
            lang: Either a unicode string, which is handed to
                ``data.load_spacy()`` to obtain the pipeline, or an
                already-loaded ``SpacyLang`` instance, which is used as-is.
            texts: Optional iterable of texts, forwarded (together with
                ``metadatas``) to :meth:`add_texts()`.
            docs: Optional iterable of documents, each one forwarded to
                :meth:`add_doc()`.
            metadatas: Optional iterable of metadata, paired positionally
                with ``texts`` or ``docs``.

        Raises:
            ValueError: If ``lang`` is neither of the accepted types, or if
                both ``texts`` and ``docs`` are given.
        """
        # Reject unsupported `lang` types up front.
        if not isinstance(lang, (unicode_type, SpacyLang)):
            raise ValueError(
                '`lang` must be {}, not "{}"'.format(
                    {unicode_type, SpacyLang}, type(lang)))

        if isinstance(lang, unicode_type):
            self.lang = lang
            self.spacy_lang = data.load_spacy(self.lang)
        else:
            self.lang = lang.lang
            self.spacy_lang = lang

        vocab = self.spacy_lang.vocab
        self.spacy_vocab = vocab
        self.spacy_stringstore = vocab.strings

        # Corpus-level bookkeeping starts out empty.
        self.docs = []
        self.n_docs = 0
        self.n_tokens = 0
        # The sentence count is only tracked when the pipeline has a parser.
        if self.spacy_lang.parser:
            self.n_sents = 0
        else:
            self.n_sents = None

        if texts and docs:
            raise ValueError(
                'Corpus may be initialized with either `texts` or `docs`, but not both.')

        if texts:
            self.add_texts(texts, metadatas=metadatas)
            return
        if not docs:
            return

        if metadatas:
            for doc, metadata in zip(docs, metadatas):
                self.add_doc(doc, metadata=metadata)
        else:
            for doc in docs:
                self.add_doc(doc)
Exemplo n.º 2
0
    def from_texts(cls, lang, texts, metadata=None, n_threads=2, batch_size=1000):
        """
        Build a :class:`TextCorpus <textacy.texts.TextCorpus>` by streaming an
        iterable of text strings through the new corpus' spaCy pipeline.

        Args:
            lang (str): Passed to the ``cls`` constructor and to every
                ``TextDoc`` that gets created.
            texts (iterable(str)): Texts to process.
            metadata (iterable(dict), optional): Paired positionally with
                ``texts``; when None, every document gets ``metadata=None``.
            n_threads (int, optional): Forwarded to the pipeline's ``pipe()``.
            batch_size (int, optional): Forwarded to the pipeline's ``pipe()``.

        Returns:
            :class:`TextCorpus <textacy.texts.TextCorpus>`
        """
        corpus = cls(lang=lang)
        parsed = corpus.spacy_pipeline.pipe(
            texts, n_threads=n_threads, batch_size=batch_size)

        # Normalise both cases to a stream of (spacy_doc, metadata) pairs so
        # that a single loop can add the documents.
        if metadata is None:
            pairs = ((spacy_doc, None) for spacy_doc in parsed)
        else:
            pairs = zip(parsed, metadata)

        for spacy_doc, md in pairs:
            text_doc = TextDoc(
                spacy_doc,
                lang=lang,
                spacy_pipeline=corpus.spacy_pipeline,
                metadata=md)
            corpus.add_doc(text_doc)
        return corpus
Exemplo n.º 3
0
    def add_texts(self, texts, metadatas=None, n_threads=4, batch_size=1000):
        """
        Run a stream of texts through ``self.spacy_lang.pipe()`` and add each
        result to the corpus as a :class:`textacy.Doc <textacy.doc.Doc>`,
        optionally attaching metadata taken from a parallel stream.

        Args:
            texts (Iterable[str]): Stream of texts to add to corpus as ``Doc`` s
            metadatas (Iterable[dict]): Stream of metadata dictionaries, paired
                with ``texts`` strictly by position: the first item goes with
                the first text, the second with the second, and so on.
                **Note:** if the two streams are not aligned, metadata ends up
                on the wrong documents. When falsy, every document is created
                with ``metadata=None``.
            n_threads (int): Forwarded to ``pipe()`` as ``n_threads``.
            batch_size (int): Forwarded to ``pipe()`` as ``batch_size``.

        See Also:
            :func:`fileio.split_record_fields()`
            http://spacy.io/docs/#multi-threaded
        """
        parsed = self.spacy_lang.pipe(
            texts, n_threads=n_threads, batch_size=batch_size)

        # Reduce both cases to one stream of (spacy_doc, metadata) pairs.
        if metadatas:
            paired = zip(parsed, metadatas)
        else:
            paired = ((spacy_doc, None) for spacy_doc in parsed)

        for spacy_doc, metadata in paired:
            doc = Doc(spacy_doc, lang=self.spacy_lang, metadata=metadata)
            self._add_textacy_doc(doc)
Exemplo n.º 4
0
    def add_texts(self, texts, metadatas=None, n_threads=4, batch_size=1000):
        """
        Feed ``texts`` through ``self.spacy_lang.pipe()`` and register every
        resulting document in the corpus as a
        :class:`textacy.Doc <textacy.doc.Doc>`.

        Args:
            texts (Iterable[str]): Stream of texts to add to corpus as ``Doc`` s
            metadatas (Iterable[dict]): Stream of per-document metadata
                dictionaries. Items are matched to ``texts`` purely by
                position, so the two streams must line up exactly; otherwise
                metadata is assigned to the wrong texts. When falsy, documents
                are created with ``metadata=None``.
            n_threads (int): Forwarded to ``pipe()`` as ``n_threads``.
            batch_size (int): Forwarded to ``pipe()`` as ``batch_size``.

        See Also:
            :func:`fileio.split_record_fields()`
            http://spacy.io/docs/#multi-threaded
        """
        def make_doc(spacy_doc, metadata):
            # Wrap a parsed spaCy doc using this corpus' language pipeline.
            return Doc(spacy_doc, lang=self.spacy_lang, metadata=metadata)

        stream = self.spacy_lang.pipe(
            texts, n_threads=n_threads, batch_size=batch_size)

        if not metadatas:
            for spacy_doc in stream:
                self._add_textacy_doc(make_doc(spacy_doc, None))
            return

        for spacy_doc, metadata in zip(stream, metadatas):
            self._add_textacy_doc(make_doc(spacy_doc, metadata))
Exemplo n.º 5
0
    def __init__(self, lang, texts=None, docs=None, metadatas=None):
        """
        Create an empty corpus tied to a spaCy language pipeline and, if
        requested, populate it from ``texts`` or from ``docs``.

        Args:
            lang: A unicode string (resolved to a pipeline through
                ``data.load_spacy()``) or a ready-made ``SpacyLang`` instance.
            texts: Optional iterable of texts; passed with ``metadatas`` to
                :meth:`add_texts()`.
            docs: Optional iterable of documents; each is passed to
                :meth:`add_doc()`.
            metadatas: Optional iterable of metadata, matched positionally to
                ``texts`` or ``docs``.

        Raises:
            ValueError: If ``lang`` has an unsupported type, or if ``texts``
                and ``docs`` are both given.
        """
        accepted = (unicode_type, SpacyLang)
        if not isinstance(lang, accepted):
            msg = '`lang` must be {}, not "{}"'.format(
                {unicode_type, SpacyLang}, type(lang))
            raise ValueError(msg)

        if isinstance(lang, unicode_type):
            self.lang = lang
            self.spacy_lang = data.load_spacy(self.lang)
        else:
            self.lang = lang.lang
            self.spacy_lang = lang

        self.spacy_vocab = self.spacy_lang.vocab
        self.spacy_stringstore = self.spacy_vocab.strings

        # Counters start at zero; the sentence count is None when the
        # pipeline has no parser.
        self.docs = []
        self.n_docs = 0
        self.n_tokens = 0
        self.n_sents = None if not self.spacy_lang.parser else 0

        if texts and docs:
            msg = 'Corpus may be initialized with either `texts` or `docs`, but not both.'
            raise ValueError(msg)

        if texts:
            self.add_texts(texts, metadatas=metadatas)
            return
        if not docs:
            return

        if not metadatas:
            for doc in docs:
                self.add_doc(doc)
        else:
            for doc, metadata in zip(docs, metadatas):
                self.add_doc(doc, metadata=metadata)
Exemplo n.º 6
0
    def load(cls, path, fname_prefix=None, compression=None):
        """
        Load serialized content and metadata from disk, and initialize a corpus.

        Three files are read from ``path``: 'info.json' (language and spaCy
        version used when saving), 'metadatas.json' (one metadata record per
        document) and 'spacy_docs.bin' (the serialized spaCy documents).

        Args:
            path (str): directory on disk where content + metadata are saved
            fname_prefix (str, optional): additional identifying information
                prepended, joined by '_', to the three standard filenames
            compression ({'gzip', 'bz2', 'lzma'} or None): type of compression
                used to reduce size of metadatas json file; selects the
                extension ('.gz', '.bz2', '.xz') appended to its filename

        Returns:
            An instance of ``cls`` (:class:`textacy.TextCorpus` or a subclass)
            holding one ``TextDoc`` per loaded (document, metadata) pair.

        Warns:
            UserWarning: If the spaCy version recorded in the info file differs
                from the installed one.

        .. warning:: If the `spacy.Vocab` object used to save this corpus is not the
            same as the one used to load it, there will be problems! Consequently,
            this functionality is only useful as short-term but not long-term storage.
        """
        def _fname(basename):
            # Apply the optional prefix, then anchor the name under ``path``.
            if fname_prefix:
                basename = '_'.join([fname_prefix, basename])
            return os.path.join(path, basename)

        info_fname = _fname('info.json')
        meta_fname = _fname('metadatas.json')
        docs_fname = _fname('spacy_docs.bin')

        # Only the metadata file may be compressed; unknown values add nothing.
        extensions = {'gzip': '.gz', 'bz2': '.bz2', 'lzma': '.xz'}
        meta_fname = meta_fname + extensions.get(compression, '')
        meta_mode = 'rt' if PY2 is False or compression is None else 'rb'

        # The info file is expected to hold a single record.
        package_info = list(fileio.read_json(info_fname))[0]
        lang = package_info['textacy_lang']
        spacy_version = package_info['spacy_version']
        if spacy_version != spacy.about.__version__:
            msg = """
                the spaCy version used to save this TextCorpus to disk is not the
                same as the version currently installed ('{}' vs. '{}'); if the
                data underlying the associated `spacy.Vocab` has changed, this
                loaded TextCorpus may not be valid!
                """.format(spacy_version, spacy.about.__version__)
            warnings.warn(msg, UserWarning)

        # Instantiate via ``cls`` so that subclasses get their own type back.
        textcorpus = cls(lang)
        metadata_stream = fileio.read_json_lines(meta_fname, mode=meta_mode)
        spacy_docs = fileio.read_spacy_docs(textcorpus.spacy_vocab, docs_fname)
        # Documents and metadata are paired by position; zip() stops at the
        # shorter of the two streams.
        for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
            textcorpus.add_doc(
                TextDoc(spacy_doc,
                        spacy_pipeline=textcorpus.spacy_pipeline,
                        lang=lang,
                        metadata=metadata))
        return textcorpus
Exemplo n.º 7
0
    def load(cls, path, name=None, compression=None):
        """
        Load content and metadata from disk, and initialize a ``Corpus``.

        Three files are read from ``path``: 'info.json' (language and spaCy
        version used when saving), 'metadatas.json' (one metadata record per
        document) and 'spacy_docs.bin' (the serialized spaCy documents).

        Args:
            path (str): Directory on disk where content + metadata are saved.
            name (str): Identifying/uniquifying name prepended to the default
                filenames 'spacy_docs.bin', 'metadatas.json', and 'info.json',
                used when corpus was saved to disk via :meth:`Corpus.save()`.
            compression ({'gzip', 'bz2', 'lzma'} or None): Type of compression
                used to reduce size of 'metadatas.json' file when saved, if any.
                Selects the extension ('.gz', '.bz2', '.xz') appended to that
                filename.

        Returns:
            An instance of ``cls`` (:class:`textacy.Corpus <Corpus>` or a
            subclass) holding one ``Doc`` per loaded (document, metadata) pair.

        Warns:
            UserWarning: If the spaCy version recorded in the info file differs
                from the installed one.

        .. warning:: If the ``spacy.Vocab`` object used to save this document is
            not the same as the one used to load it, there will be problems!
            Consequently, this functionality is only useful as short-term but
            not long-term storage.
        """
        def _fname(basename):
            # Apply the optional name prefix, then anchor under ``path``.
            if name:
                basename = '_'.join([name, basename])
            return os.path.join(path, basename)

        info_fname = _fname('info.json')
        meta_fname = _fname('metadatas.json')
        docs_fname = _fname('spacy_docs.bin')

        # Only the metadata file may be compressed; unknown values add nothing.
        extensions = {'gzip': '.gz', 'bz2': '.bz2', 'lzma': '.xz'}
        meta_fname = meta_fname + extensions.get(compression, '')
        meta_mode = 'rt' if PY2 is False or compression is None else 'rb'

        # The info file is expected to hold a single record.
        package_info = list(fileio.read_json(info_fname))[0]
        lang = package_info['textacy_lang']
        spacy_version = package_info['spacy_version']
        if spacy_version != spacy.about.__version__:
            msg = """
                the spaCy version used to save this Corpus to disk is not the
                same as the version currently installed ('{}' vs. '{}'); if the
                data underlying the associated `spacy.Vocab` has changed, this
                loaded Corpus may not be valid!
                """.format(spacy_version, spacy.about.__version__)
            warnings.warn(msg, UserWarning)

        # Instantiate via ``cls`` so that subclasses get their own type back.
        corpus = cls(lang)
        metadata_stream = fileio.read_json_lines(meta_fname, mode=meta_mode)
        spacy_docs = fileio.read_spacy_docs(corpus.spacy_vocab, docs_fname)
        # Documents and metadata are paired by position; zip() stops at the
        # shorter of the two streams.
        for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
            corpus.add_doc(
                Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
        return corpus
Exemplo n.º 8
0
    def load(cls, path, name=None, compression=None):
        """
        Load content and metadata from disk, and initialize a ``Corpus``.

        Reads 'info.json' (language and spaCy version used when saving),
        'metadatas.json' (one metadata record per document) and
        'spacy_docs.bin' (the serialized spaCy documents) from ``path``.

        Args:
            path (str): Directory on disk where content + metadata are saved.
            name (str): Identifying/uniquifying name prepended to the default
                filenames 'spacy_docs.bin', 'metadatas.json', and 'info.json',
                used when corpus was saved to disk via :meth:`Corpus.save()`.
            compression ({'gzip', 'bz2', 'lzma'} or None): Type of compression
                used to reduce size of 'metadatas.json' file when saved, if any.
                Selects the extension ('.gz', '.bz2', '.xz') appended to that
                filename.

        Returns:
            An instance of ``cls`` (:class:`textacy.Corpus <Corpus>` or a
            subclass) holding one ``Doc`` per loaded (document, metadata) pair.

        Warns:
            UserWarning: If the spaCy version recorded in the info file differs
                from the installed one.

        .. warning:: If the ``spacy.Vocab`` object used to save this document is
            not the same as the one used to load it, there will be problems!
            Consequently, this functionality is only useful as short-term but
            not long-term storage.
        """
        # Build the three filenames from one list so the prefixing rule is
        # stated exactly once.
        basenames = ['info.json', 'metadatas.json', 'spacy_docs.bin']
        if name:
            basenames = ['_'.join([name, basename]) for basename in basenames]
        info_fname, meta_fname, docs_fname = [
            os.path.join(path, basename) for basename in basenames]

        # Only the metadata file may be compressed; unknown values add nothing.
        extensions = {'gzip': '.gz', 'bz2': '.bz2', 'lzma': '.xz'}
        meta_fname = meta_fname + extensions.get(compression, '')
        meta_mode = 'rt' if PY2 is False or compression is None else 'rb'

        # The info file is expected to hold a single record.
        package_info = list(fileio.read_json(info_fname))[0]
        lang = package_info['textacy_lang']
        spacy_version = package_info['spacy_version']
        if spacy_version != spacy.about.__version__:
            msg = """
                the spaCy version used to save this Corpus to disk is not the
                same as the version currently installed ('{}' vs. '{}'); if the
                data underlying the associated `spacy.Vocab` has changed, this
                loaded Corpus may not be valid!
                """.format(spacy_version, spacy.about.__version__)
            warnings.warn(msg, UserWarning)

        # Instantiate via ``cls`` so that subclasses get their own type back.
        corpus = cls(lang)
        metadata_stream = fileio.read_json_lines(meta_fname, mode=meta_mode)
        spacy_docs = fileio.read_spacy_docs(corpus.spacy_vocab, docs_fname)
        # Documents and metadata are paired by position; zip() stops at the
        # shorter of the two streams.
        for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
            corpus.add_doc(
                Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
        return corpus
Exemplo n.º 9
0
    def load(cls, path, fname_prefix=None, compression=None):
        """
        Load serialized content and metadata from disk, and initialize a corpus.

        Reads 'info.json' (language and spaCy version used when saving),
        'metadatas.json' (one metadata record per document) and
        'spacy_docs.bin' (the serialized spaCy documents) from ``path``.

        Args:
            path (str): directory on disk where content + metadata are saved
            fname_prefix (str, optional): additional identifying information
                prepended, joined by '_', to the three standard filenames
            compression ({'gzip', 'bz2', 'lzma'} or None): type of compression
                used to reduce size of metadatas json file; selects the
                extension ('.gz', '.bz2', '.xz') appended to its filename

        Returns:
            An instance of ``cls`` (:class:`textacy.TextCorpus` or a subclass)
            holding one ``TextDoc`` per loaded (document, metadata) pair.

        Warns:
            UserWarning: If the spaCy version recorded in the info file differs
                from the installed one.

        .. warning:: If the `spacy.Vocab` object used to save this corpus is not the
            same as the one used to load it, there will be problems! Consequently,
            this functionality is only useful as short-term but not long-term storage.
        """
        # Build the three filenames from one list so the prefixing rule is
        # stated exactly once.
        basenames = ['info.json', 'metadatas.json', 'spacy_docs.bin']
        if fname_prefix:
            basenames = [
                '_'.join([fname_prefix, basename]) for basename in basenames]
        info_fname, meta_fname, docs_fname = [
            os.path.join(path, basename) for basename in basenames]

        # Only the metadata file may be compressed; unknown values add nothing.
        extensions = {'gzip': '.gz', 'bz2': '.bz2', 'lzma': '.xz'}
        meta_fname = meta_fname + extensions.get(compression, '')
        meta_mode = 'rt' if PY2 is False or compression is None else 'rb'

        # The info file is expected to hold a single record.
        package_info = list(fileio.read_json(info_fname))[0]
        lang = package_info['textacy_lang']
        spacy_version = package_info['spacy_version']
        if spacy_version != spacy.about.__version__:
            msg = """
                the spaCy version used to save this TextCorpus to disk is not the
                same as the version currently installed ('{}' vs. '{}'); if the
                data underlying the associated `spacy.Vocab` has changed, this
                loaded TextCorpus may not be valid!
                """.format(spacy_version, spacy.about.__version__)
            warnings.warn(msg, UserWarning)

        # Instantiate via ``cls`` so that subclasses get their own type back.
        textcorpus = cls(lang)
        metadata_stream = fileio.read_json_lines(meta_fname, mode=meta_mode)
        spacy_docs = fileio.read_spacy_docs(textcorpus.spacy_vocab, docs_fname)
        # Documents and metadata are paired by position; zip() stops at the
        # shorter of the two streams.
        for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
            textcorpus.add_doc(
                TextDoc(spacy_doc, spacy_pipeline=textcorpus.spacy_pipeline,
                        lang=lang, metadata=metadata))
        return textcorpus
Exemplo n.º 10
0
    def load(cls, path, fname_prefix=None):
        """
        Load serialized content and metadata from disk, and initialize a corpus.

        Reads 'info.json' (language and spaCy version used when saving),
        'metadatas.json' (one metadata record per document) and
        'spacy_docs.bin' (the serialized spaCy documents) from ``path``.

        Args:
            path (str): directory on disk where content + metadata are saved
            fname_prefix (str, optional): additional identifying information
                prepended, joined by '_', to the three standard filenames

        Returns:
            An instance of ``cls`` (:class:`textacy.TextCorpus` or a subclass)
            holding one ``TextDoc`` per loaded (document, metadata) pair.

        Warns:
            UserWarning: If the spaCy version recorded in the info file differs
                from the installed one.
        """
        def _fname(basename):
            # Apply the optional prefix, then anchor the name under ``path``.
            if fname_prefix:
                basename = '_'.join([fname_prefix, basename])
            return os.path.join(path, basename)

        info_fname = _fname('info.json')
        meta_fname = _fname('metadatas.json')
        docs_fname = _fname('spacy_docs.bin')

        # The info file is expected to hold a single record.
        package_info = list(fileio.read_json(info_fname))[0]
        lang = package_info['textacy_lang']
        spacy_version = package_info['spacy_version']
        if spacy_version != spacy.about.__version__:
            msg = """
                the spaCy version used to save this TextCorpus to disk is not the
                same as the version currently installed ('{}' vs. '{}'); if the
                data underlying the associated `spacy.Vocab` has changed, this
                loaded TextCorpus may not be valid!
                """.format(spacy_version, spacy.about.__version__)
            warnings.warn(msg, UserWarning)

        # Instantiate via ``cls`` so that subclasses get their own type back.
        textcorpus = cls(lang)
        metadata_stream = fileio.read_json_lines(meta_fname)
        spacy_docs = fileio.read_spacy_docs(textcorpus.spacy_vocab, docs_fname)
        # Documents and metadata are paired by position; zip() stops at the
        # shorter of the two streams.
        for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
            textcorpus.add_doc(
                TextDoc(spacy_doc, spacy_pipeline=textcorpus.spacy_pipeline,
                        lang=lang, metadata=metadata))
        return textcorpus
Exemplo n.º 11
0
    def from_texts(cls,
                   lang_or_pipeline,
                   texts,
                   metadata=None,
                   n_threads=2,
                   batch_size=1000):
        """
        Build a :class:`TextCorpus <textacy.texts.TextCorpus>` by streaming an
        iterable of text strings through the new corpus' spaCy pipeline.

        Args:
            lang_or_pipeline ({'en', 'de'} or :class:`spacy.<lang>.<Language>`):
                Passed as-is to the ``cls`` constructor.
            texts (iterable(str)): Texts to process.
            metadata (iterable(dict), optional): Paired positionally with
                ``texts``; when None, every document gets ``metadata=None``.
            n_threads (int, optional): Forwarded to the pipeline's ``pipe()``.
            batch_size (int, optional): Forwarded to the pipeline's ``pipe()``.

        Returns:
            :class:`TextCorpus <textacy.texts.TextCorpus>`
        """
        corpus = cls(lang_or_pipeline)
        parsed = corpus.spacy_pipeline.pipe(
            texts, n_threads=n_threads, batch_size=batch_size)

        # Normalise both cases to a stream of (spacy_doc, metadata) pairs so
        # that a single loop can add the documents.
        if metadata is None:
            pairs = ((spacy_doc, None) for spacy_doc in parsed)
        else:
            pairs = zip(parsed, metadata)

        for spacy_doc, md in pairs:
            text_doc = TextDoc(
                spacy_doc,
                lang=corpus.lang,
                spacy_pipeline=corpus.spacy_pipeline,
                metadata=md)
            corpus.add_doc(text_doc)
        return corpus