from __future__ import annotations

from typing import Callable, Literal, Mapping

from spacy import attrs
from spacy.tokens import Doc, Token


def token_count_by(
    *,
    doc: Doc,
    target: Literal['lemma', 'lower', 'orth', 'text'] = 'lemma',
    weighting: Literal['count', 'freq'] = 'count',
    include: Callable[[Token], bool] | None = None,
    n_min_count: int = 2,
    as_strings: bool = False,
) -> Mapping[str | int, int | float]:
    """Return frequency counts for `target` in `doc`."""
    # spacy.attrs has no TEXT constant; 'text' is treated as an alias for ORTH.
    target_keys = {
        'lemma': attrs.LEMMA,
        'lower': attrs.LOWER,
        'orth': attrs.ORTH,
        'text': attrs.ORTH,
    }
    # Stop words, punctuation, and whitespace tokens are always excluded; a custom
    # `include` predicate further restricts which tokens are counted.
    default_exclude: Callable[[Token], bool] = (
        lambda x: x.is_stop or x.is_punct or x.is_space
    )
    exclude: Callable[[Token], bool] = (
        default_exclude
        if include is None
        else lambda x: default_exclude(x) or not include(x)
    )
    target_weights: Mapping[str | int, int | float] = doc.count_by(
        target_keys[target], exclude=exclude
    )
    # Apply the min-count threshold to raw counts, before any normalization, so it
    # behaves the same for both output types and still works when weighting='freq'.
    target_weights = {
        id_: weight for id_, weight in target_weights.items() if weight >= n_min_count
    }
    if weighting == 'freq':
        n_tokens: int = len(doc)
        target_weights = {
            id_: weight / n_tokens for id_, weight in target_weights.items()
        }
    store = doc.vocab.strings
    if as_strings:
        bow = {store[word_id]: weight for word_id, weight in target_weights.items()}
    else:
        bow = target_weights
    return bow
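
# A minimal usage sketch for token_count_by (not from the original source). It assumes
# spaCy and the 'en_core_web_sm' pipeline are installed; the model name and sample
# text are illustrative only.
if __name__ == '__main__':
    import spacy

    nlp = spacy.load('en_core_web_sm')
    doc = nlp('The quick brown fox jumps over the lazy dog. The quick fox naps.')
    # All arguments are keyword-only, including `doc`.
    print(token_count_by(doc=doc, target='lemma', as_strings=True))
    # e.g. {'quick': 2, 'fox': 2} -- terms below n_min_count (default 2) are dropped,
    # as are stop words, punctuation, and whitespace tokens.
    print(token_count_by(doc=doc, target='lemma', weighting='freq', as_strings=True))
    # Same keys, with each raw count divided by len(doc).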
from typing import Dict, Union

import spacy
from spacy.tokens import Doc

# Assumed import: textacy's internal `errors` module, which provides value_invalid_msg().
from textacy import errors


def to_bag_of_words(
    doc: Doc,
    *,
    normalize: str = "lemma",
    weighting: str = "count",
    as_strings: bool = False,
    filter_stops: bool = True,
    filter_punct: bool = True,
    filter_nums: bool = False,
) -> Dict[Union[int, str], Union[int, float]]:
    """
    Transform ``Doc`` into a bag-of-words: the set of unique words in ``Doc``
    mapped to their absolute, relative, or binary frequency of occurrence.

    Args:
        doc: spaCy ``Doc`` from which to extract the bag-of-words.
        normalize: If "lemma", lemmatize words before counting; if "lower",
            lowercase words before counting; otherwise, words are counted using
            the form in which they appear in ``doc``.
        weighting ({"count", "freq", "binary"}): Type of weight to assign to words.
            If "count" (default), weights are the absolute number of occurrences
            (count) of each word in ``doc``. If "binary", all counts are set equal
            to 1. If "freq", word counts are normalized by the total token count,
            giving their relative frequency of occurrence.
            Note: The resulting set of frequencies won't (necessarily) sum to 1.0,
            since punctuation and stop words are filtered out after counts are
            normalized.
        as_strings (bool): If True, words are returned as strings; if False
            (default), words are returned as their unique integer ids.
        filter_stops (bool): If True (default), stop words are removed after counting.
        filter_punct (bool): If True (default), punctuation tokens are removed
            after counting.
        filter_nums (bool): If True, tokens consisting of digits are removed
            after counting.

    Returns:
        Mapping of a unique term id or string (depending on the value of
        ``as_strings``) to its absolute, relative, or binary frequency of
        occurrence (depending on the value of ``weighting``).
    """
    if weighting not in {"count", "freq", "binary"}:
        raise ValueError(
            errors.value_invalid_msg("weighting", weighting, {"count", "freq", "binary"})
        )
    count_by = (
        spacy.attrs.LEMMA
        if normalize == "lemma"
        else spacy.attrs.LOWER
        if normalize == "lower"
        else spacy.attrs.ORTH
    )
    wid_weights = doc.count_by(count_by)
    if weighting == "freq":
        n_tokens = len(doc)
        wid_weights = {wid: weight / n_tokens for wid, weight in wid_weights.items()}
    elif weighting == "binary":
        wid_weights = {wid: 1 for wid in wid_weights.keys()}

    bow = {}
    vocab = doc.vocab
    if as_strings is False:
        for wid, weight in wid_weights.items():
            lex = vocab[wid]
            if not (
                (lex.is_stop and filter_stops)
                or (lex.is_punct and filter_punct)
                or (lex.is_digit and filter_nums)
                or lex.is_space
            ):
                bow[wid] = weight
    else:
        ss = doc.vocab.strings
        for wid, weight in wid_weights.items():
            lex = vocab[wid]
            if not (
                (lex.is_stop and filter_stops)
                or (lex.is_punct and filter_punct)
                or (lex.is_digit and filter_nums)
                or lex.is_space
            ):
                bow[ss[wid]] = weight
    return bow
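
# A minimal usage sketch for to_bag_of_words (not from the original source). It assumes
# spaCy, textacy, and the "en_core_web_sm" pipeline are installed; the model name and
# sample text are illustrative only.
if __name__ == "__main__":
    nlp = spacy.load("en_core_web_sm")
    doc = nlp("The quick brown fox jumps over the lazy dog. The quick fox naps.")
    # Absolute counts keyed by the lemma string, with stop words/punctuation dropped.
    print(to_bag_of_words(doc, normalize="lemma", as_strings=True))
    # Relative frequencies: counts divided by len(doc). These need not sum to 1.0,
    # because filtering happens after the counts are normalized.
    print(to_bag_of_words(doc, weighting="freq", as_strings=True))
    # Binary indicators: every surviving term is mapped to 1.
    print(to_bag_of_words(doc, weighting="binary", as_strings=True))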