# likely imports for this test (assuming the toolz test helpers)
from toolz.itertoolz import peek
from toolz.utils import raises


def test_peek():
    alist = ["Alice", "Bob", "Carol"]
    element, blist = peek(alist)
    assert element == alist[0]
    assert list(blist) == alist
    assert raises(StopIteration, lambda: peek([]))
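# A companion sketch (not from the original suite) showing why peek matters for
# lazy iterators: the peeked element is re-chained onto the returned iterator,
# so nothing is dropped downstream. Assumes toolz's peek; cytoolz behaves the same.
def test_peek_on_generator_sketch():
    squares = (n * n for n in range(5))
    first, squares = peek(squares)
    assert first == 0
    # the peeked element is still yielded by the returned iterator
    assert list(squares) == [0, 1, 4, 9, 16]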
def load(cls, filepath):
    """
    Load documents' pickled content and metadata from disk, and initialize
    a :class:`Corpus` with a spacy language pipeline equivalent to what was
    in use previously, when the corpus was saved.

    Args:
        filepath (str): Full path to file on disk where documents' content and
            metadata are saved.

    Returns:
        :class:`Corpus`

    See Also:
        :meth:`Corpus.save()`
    """
    spacy_docs = io.read_spacy_docs(filepath)
    # HACK: pop spacy language metadata from first doc's user_data
    # so we can (more or less...) re-instantiate the same language pipeline
    first_spacy_doc, spacy_docs = itertoolz.peek(spacy_docs)
    spacy_lang_meta = first_spacy_doc.user_data['textacy'].pop('spacy_lang_meta')
    # manually instantiate the spacy language pipeline and
    # hope that the spacy folks either make this easier or don't touch it
    spacy_lang = get_lang_class(spacy_lang_meta['lang'])(
        vocab=first_spacy_doc.vocab, meta=spacy_lang_meta)
    for name in spacy_lang_meta['pipeline']:
        spacy_lang.add_pipe(spacy_lang.create_pipe(name))
    return cls(spacy_lang, docs=spacy_docs)
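# Hypothetical round-trip sketch (path is illustrative; assumes Corpus.save()
# takes the same single filepath argument that load() reads from):
#
#     corpus.save("/path/to/corpus_docs.pkl")
#     restored = Corpus.load("/path/to/corpus_docs.pkl")
#
# peek() above lets load() read the pipeline metadata from the first doc
# without exhausting the stream of deserialized docs passed on to cls(...).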
def add(
    self,
    data: CorpusData,
    batch_size: int = 1000,
    n_process: int = 1,
) -> None:
    """
    Add one or a stream of texts, records, or :class:`spacy.tokens.Doc` s
    to the corpus, ensuring that all processing is or has already been done
    by the :attr:`Corpus.spacy_lang` pipeline.

    Args:
        data
        batch_size: Number of texts to buffer when processing with spaCy.
        n_process: Number of parallel processors to run when processing.
            If -1, this is set to ``multiprocessing.cpu_count()``.

            .. note:: This feature is only available in spaCy 2.2.2+,
               and only applies when ``data`` is a sequence of texts or records.

    See Also:
        * :meth:`Corpus.add_text()`
        * :meth:`Corpus.add_texts()`
        * :meth:`Corpus.add_record()`
        * :meth:`Corpus.add_records()`
        * :meth:`Corpus.add_doc()`
        * :meth:`Corpus.add_docs()`
    """
    if isinstance(data, str):
        self.add_text(data)
    elif isinstance(data, spacy.tokens.Doc):
        self.add_doc(data)
    elif utils.is_record(data):
        self.add_record(data)
    elif isinstance(data, collections.abc.Iterable):
        first, data = itertoolz.peek(data)
        if isinstance(first, str):
            self.add_texts(data, batch_size=batch_size, n_process=n_process)
        elif isinstance(first, spacy.tokens.Doc):
            self.add_docs(data)
        elif utils.is_record(first):
            self.add_records(data, batch_size=batch_size, n_process=n_process)
        else:
            raise TypeError(
                "data must be one of {} or an iterable thereof, not {}".format(
                    {str, spacy.tokens.Doc, tuple}, type(data)))
    else:
        raise TypeError(
            "data must be one of {} or an iterable thereof, not {}".format(
                {str, spacy.tokens.Doc, tuple}, type(data)))
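# The peek-then-dispatch pattern above generalizes: inspect the first item of a
# stream to pick a handler, then hand the reconstructed stream to that handler.
# A self-contained sketch with hypothetical names (upper-casing stands in for
# the real per-item processing):
def add_stream_sketch(data):
    from cytoolz import itertoolz

    if isinstance(data, str):
        return [data.upper()]
    first, data = itertoolz.peek(data)  # grab first item, keep the stream intact
    if not isinstance(first, str):
        raise TypeError("expected str items, got {}".format(type(first)))
    return [text.upper() for text in data]


assert add_stream_sketch("hello") == ["HELLO"]
assert add_stream_sketch(iter(["a", "b"])) == ["A", "B"]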
def add(self, data, batch_size=1000):
    """
    Add one or a stream of texts, records, or :class:`spacy.tokens.Doc` s
    to the corpus, ensuring that all processing is or has already been done
    by the :attr:`Corpus.spacy_lang` pipeline.

    Args:
        data (obj or Iterable[obj]):
            str or Iterable[str]
            Tuple[str, dict] or Iterable[Tuple[str, dict]]
            :class:`spacy.tokens.Doc` or Iterable[:class:`spacy.tokens.Doc`]
        batch_size (int)

    See Also:
        * :meth:`Corpus.add_text()`
        * :meth:`Corpus.add_texts()`
        * :meth:`Corpus.add_record()`
        * :meth:`Corpus.add_records()`
        * :meth:`Corpus.add_doc()`
        * :meth:`Corpus.add_docs()`
    """
    if isinstance(data, compat.unicode_):
        self.add_text(data)
    elif isinstance(data, spacy.tokens.Doc):
        self.add_doc(data)
    elif utils.is_record(data):
        self.add_record(data)
    elif isinstance(data, compat.Iterable):
        first, data = itertoolz.peek(data)
        if isinstance(first, compat.unicode_):
            self.add_texts(data, batch_size=batch_size)
        elif isinstance(first, spacy.tokens.Doc):
            self.add_docs(data)
        elif utils.is_record(first):
            self.add_records(data, batch_size=batch_size)
        else:
            raise TypeError(
                "data must be one of {} or an iterable thereof, not {}".format(
                    {compat.unicode_, spacy.tokens.Doc, tuple}, type(data)))
    else:
        raise TypeError(
            "data must be one of {} or an iterable thereof, not {}".format(
                {compat.unicode_, spacy.tokens.Doc, tuple}, type(data)))
def empty(it: Iterable[E]) -> Tuple[bool, Iterable[E]]:
    """
    Checks whether the sequence is empty or not.

    NOTE: This method modifies the original sequence (takes the first element);
    use the returned one, which contains the original items.

    >>> it_orig = iter([1, 2, 3])
    >>> is_empty, it_new = empty(it_orig)
    >>> is_empty, list(it_new)
    (False, [1, 2, 3])

    >>> is_empty, it_empty = empty(iter([]))
    >>> is_empty, list(it_empty)
    (True, [])
    """
    try:
        _, it = peek(it)
        return False, it
    except StopIteration:
        return True, iter([])
def head_tail(it: Iterable[E]) -> Tuple[E, Iterable[E]]:
    """
    Split provided iterable into head element and tail iterable.

    >>> head, tail = head_tail(iter([1, 2, 3]))
    >>> head, list(tail)
    (1, [2, 3])

    >>> head, tail = head_tail(iter([42]))
    >>> head, list(tail)
    (42, [])

    Raises :class:`StopIteration` if the original iterable is empty.

    >>> head_tail(iter([]))
    Traceback (most recent call last):
        ...
    StopIteration
    """
    head, seq = peek(it)
    tail = drop(1, seq)
    return head, tail
def build_graph_from_terms(terms, *, normalize="lemma", window_size=10, edge_weighting="count"):
    """
    Transform an ordered list of non-overlapping terms into a graph, where each
    term is represented by a node with weighted edges linking it to other terms
    that co-occur within ``window_size`` terms of itself.

    Args:
        terms (List[str] or List[:class:`spacy.tokens.Token` or :class:`spacy.tokens.Span`])
        normalize (str or Callable): If "lemma", lemmatize terms; if "lower",
            lowercase terms; if falsy, use the form of terms as they appear
            in ``terms``; if a callable, must accept a ``Token`` and return a str,
            e.g. :func:`textacy.spacier.utils.get_normalized_text()`.

            .. note:: This is applied to the elements of ``terms`` *only* if
               it's a list of ``Token`` or ``Span``.

        window_size (int): Size of sliding window over ``terms`` that determines
            which are said to co-occur. If 2, only immediately adjacent terms
            have edges in the returned network.
        edge_weighting ({"count", "binary"}): If "count", the nodes for all
            co-occurring terms are connected by edges with weight equal to the
            number of times they co-occurred within a sliding window; if "binary",
            all such edges have weight = 1.

    Returns:
        :class:`networkx.Graph`: Nodes in this network correspond to individual
        terms; those that co-occur are connected by edges with weights determined
        by ``edge_weighting``.
    """
    if window_size < 2:
        raise ValueError(
            "window_size = {} is invalid; value must be >= 2".format(window_size))
    if not terms:
        LOGGER.warning("input `terms` is empty, so output graph is also empty")
        return nx.Graph()
    # if len(terms) < window_size, cytoolz throws a StopIteration error; prevent it
    if len(terms) < window_size:
        LOGGER.info(
            "`terms` has fewer items (%s) than `window_size` (%s); "
            "setting window width to %s",
            len(terms), window_size, len(terms),
        )
        window_size = len(terms)
    first_term, terms = itertoolz.peek(terms)
    if isinstance(first_term, str):
        windows = itertoolz.sliding_window(window_size, terms)
    elif isinstance(first_term, (Span, Token)):
        windows = itertoolz.sliding_window(
            window_size, ke_utils.normalize_terms(terms, normalize))
    else:
        raise TypeError(
            "items in `terms` must be strings or spacy tokens, not {}".format(
                type(first_term)))
    graph = nx.Graph()
    if edge_weighting == "count":
        cooc_mat = collections.Counter(
            w1_w2
            for window in windows
            for w1_w2 in itertools.combinations(sorted(window), 2))
        graph.add_edges_from(
            (w1, w2, {"weight": weight})
            for (w1, w2), weight in cooc_mat.items())
    elif edge_weighting == "binary":
        graph.add_edges_from(
            w1_w2
            for window in windows
            for w1_w2 in itertools.combinations(window, 2))
    else:
        raise ValueError(
            "edge_weighting = {} is invalid; must be one of {}".format(
                edge_weighting, {"count", "binary"}))
    return graph
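# A short usage sketch, assuming build_graph_from_terms is importable as defined
# above; the term strings below are illustrative only.
def example_cooccurrence_graph():
    terms = ["network", "graph", "network", "weight", "graph", "network"]
    graph = build_graph_from_terms(terms, window_size=3, edge_weighting="count")
    # nodes are the distinct terms; edge weights count how often two terms
    # co-occurred within any sliding window of 3 consecutive terms
    print(sorted(graph.nodes()))
    print(graph.get_edge_data("graph", "network"))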