def add_texts(self, texts, metadatas=None, n_threads=4, batch_size=1000):
    """
    Process a stream of texts (and a corresponding stream of metadata dicts,
    optionally) in parallel with spaCy; add as
    :class:`textacy.Doc <textacy.doc.Doc>` s to the corpus.

    Args:
        texts (Iterable[str]): Stream of texts to add to corpus as ``Doc`` s
        metadatas (Iterable[dict]): Stream of dictionaries of relevant
            document metadata. **Note:** This stream must align exactly with
            ``texts``, or metadata will be mis-assigned to texts. More
            concretely, the first item in ``metadatas`` will be assigned to
            the first item in ``texts``, and so on from there.
        n_threads (int): Number of threads to use when processing ``texts``
            in parallel, if available.
        batch_size (int): Number of texts to process at a time.

    See Also:
        :func:`fileio.split_record_fields()`
        http://spacy.io/docs/#multi-threaded
    """
    spacy_docs = self.spacy_lang.pipe(
        texts, n_threads=n_threads, batch_size=batch_size)
    if metadatas:
        for spacy_doc, metadata in zip(spacy_docs, metadatas):
            self._add_textacy_doc(
                Doc(spacy_doc, lang=self.spacy_lang, metadata=metadata))
    else:
        for spacy_doc in spacy_docs:
            self._add_textacy_doc(
                Doc(spacy_doc, lang=self.spacy_lang, metadata=None))
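# Usage sketch for add_texts (hypothetical data; assumes a textacy 0.x
# Corpus instance created via Corpus('en')). Generators work here, since
# texts and metadatas are consumed as aligned streams:
corpus = Corpus('en')
texts = ('text of document {}'.format(i) for i in range(1000))
metadatas = ({'id': i} for i in range(1000))  # must align 1:1 with texts
corpus.add_texts(texts, metadatas=metadatas, n_threads=2, batch_size=100)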
import numpy as np

from textacy import lexicon_methods  # textacy 0.x home of the DepecheMood scorer
from textacy.doc import Doc


def emotional_valence(text):
    doc = Doc(text, lang='en')
    # call the lexicon scorer through its module: a bare `emotional_valence`
    # here would recurse into this identically-named wrapper
    scores = lexicon_methods.emotional_valence(
        doc.tokens, dm_data_dir='pretrained_models')
    return np.array([
        scores['AFRAID'], scores['AMUSED'], scores['ANGRY'],
        scores['ANNOYED'], scores['DONT_CARE'], scores['HAPPY'],
        scores['INSPIRED'], scores['SAD']
    ])
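# Usage sketch (hypothetical text; 'pretrained_models' must contain the
# DepecheMood data files that textacy's emotional_valence expects):
vec = emotional_valence('I am thrilled about this release!')
print(vec.shape)  # (8,): AFRAID, AMUSED, ANGRY, ANNOYED, DONT_CARE,
                  # HAPPY, INSPIRED, SAD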
def add_text(self, text, metadata=None):
    """
    Create a :class:`textacy.Doc <textacy.doc.Doc>` from ``text`` and
    ``metadata``, then add it to the corpus.

    Args:
        text (str): Document (text) content to add to corpus as a ``Doc``.
        metadata (dict): Dictionary of relevant document metadata.
    """
    self._add_textacy_doc(Doc(text, lang=self.spacy_lang, metadata=metadata))
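# Usage sketch for add_text (single document, in contrast to the streaming
# add_texts above; hypothetical corpus and metadata):
corpus = Corpus('en')
corpus.add_text('One short document.', metadata={'source': 'example'})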
def test_build_phrase_models(self, Phrases):
    from eea.corpus.processing.phrases.phrases import build_phrase_models
    from textacy.doc import Doc

    content = [Doc('hello'), Doc('world')]
    phrases = Phrases()
    Phrases.return_value = phrases

    build_phrase_models(content, '/corpus/some.csv.phras', {'level': 2})
    # build_phrase_models makes 1 Phrases() call, plus the explicit call above
    assert Phrases.call_count == 2
    assert phrases.save.call_args[0] == ('/corpus/some.csv.phras.2', )

    build_phrase_models(content, '/corpus/some.csv.phras', {'level': 3})
    # two more Phrases() calls here, accumulated with the 2 counted above
    assert Phrases.call_count == 4
    assert phrases.save.call_args[0] == ('/corpus/some.csv.phras.3', )
import numpy as np

from textacy.doc import Doc
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# assumption: `analyzer` is a VADER sentiment analyzer; the original snippet
# used the name without showing its construction
analyzer = SentimentIntensityAnalyzer()


def polarity(text):
    doc = Doc(text, lang='en')
    sentences = [span.text for span in doc.sents]
    scores = [analyzer.polarity_scores(sentence) for sentence in sentences]
    np_scores = [
        np.array([score['neg'], score['neu'], score['pos'], score['compound']])
        for score in scores
    ]
    # average per-sentence scores into a single 4-vector for the whole text
    return np.mean(np.stack(np_scores), axis=0)
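# Usage sketch (hypothetical text):
neg, neu, pos, compound = polarity('Great library. Sparse documentation.')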
def load(cls, path, name=None, compression=None):
    """
    Load content and metadata from disk, and initialize a ``Corpus``.

    Args:
        path (str): Directory on disk where content + metadata are saved.
        name (str): Identifying/uniquifying name prepended to the default
            filenames 'spacy_docs.bin', 'metadatas.json', and 'info.json',
            used when corpus was saved to disk via :meth:`Corpus.save()`.
        compression ({'gzip', 'bz2', 'lzma'} or None): Type of compression
            used to reduce size of 'metadatas.json' file when saved, if any.

    Returns:
        :class:`textacy.Corpus <Corpus>`

    .. warning:: If the ``spacy.Vocab`` object used to save this document
        is not the same as the one used to load it, there will be problems!
        Consequently, this functionality is only useful as short-term but
        not long-term storage.
    """
    if name:
        info_fname = os.path.join(path, '_'.join([name, 'info.json']))
        meta_fname = os.path.join(path, '_'.join([name, 'metadatas.json']))
        docs_fname = os.path.join(path, '_'.join([name, 'spacy_docs.bin']))
    else:
        info_fname = os.path.join(path, 'info.json')
        meta_fname = os.path.join(path, 'metadatas.json')
        docs_fname = os.path.join(path, 'spacy_docs.bin')
    meta_fname = meta_fname + ('.gz' if compression == 'gzip'
                               else '.bz2' if compression == 'bz2'
                               else '.xz' if compression == 'lzma'
                               else '')
    meta_mode = 'rt' if PY2 is False or compression is None else 'rb'
    package_info = list(fileio.read_json(info_fname))[0]
    lang = package_info['textacy_lang']
    spacy_version = package_info['spacy_version']
    if spacy_version != spacy.about.__version__:
        msg = """
            the spaCy version used to save this Corpus to disk is not the
            same as the version currently installed ('{}' vs. '{}'); if the
            data underlying the associated `spacy.Vocab` has changed, this
            loaded Corpus may not be valid!
            """.format(spacy_version, spacy.about.__version__)
        warnings.warn(msg, UserWarning)
    corpus = Corpus(lang)
    metadata_stream = fileio.read_json_lines(meta_fname, mode=meta_mode)
    spacy_docs = fileio.read_spacy_docs(corpus.spacy_vocab, docs_fname)
    for spacy_doc, metadata in zip(spacy_docs, metadata_stream):
        corpus.add_doc(
            Doc(spacy_doc, lang=corpus.spacy_lang, metadata=metadata))
    return corpus
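# Usage sketch (hypothetical path and name; assumes the corpus was written
# by the matching Corpus.save() with the same name and compression settings):
corpus = Corpus.load('/tmp/saved_corpus', name='articles', compression='gzip')
print(len(corpus))  # number of docs restored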
from textacy.doc import Doc
from textacy.preprocess import preprocess_text  # textacy 0.x module path


def preprocess_text_string(text):
    """Preprocesses text for feature extraction.

    Preprocessing tasks are as follows:
        - whitespace normalization
        - fixing broken unicode via ftfy
        - converting text to lowercase
        - replacing url strings with 'url'
        - replacing phone number strings with 'phone'
        - replacing currency symbols with their standard 3-letter abbreviations
        - stripping punctuation
        - replacing contractions with their unshortened forms
        - lemmatizing words

    Parameters
    ----------
    text : str
        The input text to be preprocessed.

    Returns
    -------
    preprocessed : str
        The preprocessed output text.
    """
    text = preprocess_text(text,
                           fix_unicode=True,
                           lowercase=True,
                           no_urls=True,
                           no_phone_numbers=True,
                           no_currency_symbols=True,
                           no_punct=True,
                           no_contractions=True)
    doc = Doc(text, lang='en')
    lemmatized_tokens = doc.to_terms_list(ngrams=1,
                                          named_entities=False,
                                          as_strings=True,
                                          normalize='lemma')
    return ' '.join(lemmatized_tokens)
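# Usage sketch (hypothetical input; expected to yield lowercased lemmas with
# 'url' / 'phone' placeholders substituted in):
clean = preprocess_text_string("Call +1-555-0100 or visit https://example.com!")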
def _get_quotes(self):
    quote_count = []
    for stance in tqdm.tqdm(self._stances):
        body = self._original_articles.get(stance['Body ID']).decode(
            'utf-8', 'replace')
        doc = Doc(content=body, lang=u'en')
        quotes = direct_quotations(doc)
        quote_counter = 0
        for q in quotes:
            # q is a (speaker, reporting verb, quotation) triple; count the
            # tokens inside the quotation span
            quote_counter = quote_counter + len(q[2])
        # normalize by the character length of the article body
        quote_counter = quote_counter / len(body)
        quote_count.append(quote_counter)
    return quote_count
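# Standalone sketch of the extraction used above, outside the class
# (hypothetical text; assumes textacy 0.x, where direct_quotations yields
# (speaker, reporting verb, quotation) triples):
from textacy.doc import Doc
from textacy.extract import direct_quotations

doc = Doc(content=u'"This is fine," she said.', lang=u'en')
for speaker, verb, quotation in direct_quotations(doc):
    print(speaker, verb, quotation)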
def add_doc(self, doc, metadata=None):
    """
    Add an existing :class:`textacy.Doc <textacy.doc.Doc>` or initialize a
    new one from a ``spacy.Doc`` to the corpus.

    Args:
        doc (``textacy.Doc`` or ``spacy.Doc``)
        metadata (dict): Dictionary of relevant document metadata. If ``doc``
            is a ``spacy.Doc``, it will be paired as usual; if ``doc`` is a
            ``textacy.Doc``, it will *overwrite* any existing metadata.

    .. warning:: If ``doc`` was already added to this or another ``Corpus``,
        it will be deep-copied and then added as if a new document. A warning
        message will be logged. This is probably not a thing you should do.
    """
    if isinstance(doc, Doc):
        if doc.spacy_vocab is not self.spacy_vocab:
            msg = 'Doc.spacy_vocab {} != Corpus.spacy_vocab {}'.format(
                doc.spacy_vocab, self.spacy_vocab)
            raise ValueError(msg)
        if hasattr(doc, 'corpus_index'):
            doc = copy.deepcopy(doc)
            # TODO: make this into a logging warning
            print('**WARNING: Doc already associated with a Corpus; '
                  'adding anyway...')
        if metadata is not None:
            doc.metadata = metadata
        self._add_textacy_doc(doc)
    elif isinstance(doc, SpacyDoc):
        if doc.vocab is not self.spacy_vocab:
            msg = 'SpacyDoc.vocab {} != Corpus.spacy_vocab {}'.format(
                doc.vocab, self.spacy_vocab)
            raise ValueError(msg)
        self._add_textacy_doc(
            Doc(doc, lang=self.spacy_lang, metadata=metadata))
    else:
        msg = '`doc` must be {}, not "{}"'.format({Doc, SpacyDoc}, type(doc))
        raise ValueError(msg)
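# Usage sketch (hypothetical docs; exercises both branches of add_doc, and
# builds both docs from the corpus's own pipeline so the vocab checks pass):
corpus = Corpus('en')
corpus.add_doc(Doc('a textacy Doc', lang=corpus.spacy_lang))
corpus.add_doc(corpus.spacy_lang('a raw spacy.Doc'), metadata={'id': 1})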
from textacy.doc import Doc
from textacy.preprocess import preprocess_text  # textacy 0.x module path


def doc_creator(text):
    text = preprocess_text(text,
                           fix_unicode=True,
                           lowercase=True,
                           no_numbers=True,
                           no_punct=True,
                           no_contractions=True,
                           no_accents=True)
    return Doc(text, lang="en_core_web_md")
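# Usage sketch (hypothetical text; requires the en_core_web_md spaCy model
# to be installed):
doc = doc_creator("Águas de março: 3 rainy days, don't you think?")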