Example #1
from collections.abc import Callable, Mapping
from typing import Literal

from spacy import attrs
from spacy.tokens import Doc, Token


def token_count_by(
    *,
    doc: Doc,
    target: Literal['lemma', 'lower', 'orth', 'text'] = 'lemma',
    weighting: Literal['count', 'freq'] = 'count',
    include: Callable[[Token], bool] | None = None,
    n_min_count: int = 2,
    as_strings: bool = False,
) -> Mapping[str | int, int | float]:
    """Return frequency count for `target` in `doc`."""
    # spaCy has no TEXT attribute; ORTH holds the verbatim token text, so 'text' maps to ORTH too.
    target_keys = {'lemma': attrs.LEMMA, 'lower': attrs.LOWER, 'orth': attrs.ORTH, 'text': attrs.ORTH}

    # Always drop stop words, punctuation, and whitespace; when an `include`
    # predicate is given, additionally drop any token it rejects.
    default_exclude: Callable[[Token], bool] = lambda x: x.is_stop or x.is_punct or x.is_space
    exclude: Callable[[Token], bool] = (
        default_exclude if include is None else lambda x: default_exclude(x) or not include(x)
    )

    target_counts = doc.count_by(target_keys[target], exclude=exclude)

    # Apply the minimum-count threshold to the raw counts, before any normalization:
    # with `weighting='freq'` the weights fall below 1.0, so thresholding afterwards
    # (and only in the `as_strings` branch, as this code originally did) would
    # silently drop everything.
    target_counts = {id_: count for id_, count in target_counts.items() if count >= n_min_count}

    target_weights: Mapping[str | int, int | float] = target_counts
    if weighting == 'freq':
        n_tokens: int = len(doc)
        target_weights = {id_: weight / n_tokens for id_, weight in target_counts.items()}

    if as_strings:
        store = doc.vocab.strings
        return {store[id_]: weight for id_, weight in target_weights.items()}
    return target_weights
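
A quick usage sketch for the function above. The pipeline name (`en_core_web_sm`) and the sample text are illustrative assumptions, not part of the original snippet:

import spacy

nlp = spacy.load('en_core_web_sm')  # any pipeline with a lemmatizer works
doc = nlp('The quick brown fox jumps over the lazy dog. The dog sleeps.')

# Lemma counts as readable strings; 'the' and '.' are filtered out as stop
# word / punctuation, so only 'dog' reaches the default n_min_count of 2.
print(token_count_by(doc=doc, target='lemma', as_strings=True))  # expected: {'dog': 2}

# The `include` hook narrows counting further, e.g. to nouns only.
print(token_count_by(doc=doc, include=lambda t: t.pos_ == 'NOUN', n_min_count=1, as_strings=True))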
Example #2
from typing import Dict, Union

import spacy.attrs
from spacy.tokens import Doc
from textacy import errors  # assumed: textacy's errors module provides value_invalid_msg()


def to_bag_of_words(
    doc: Doc,
    *,
    normalize: str = "lemma",
    weighting: str = "count",
    as_strings: bool = False,
    filter_stops: bool = True,
    filter_punct: bool = True,
    filter_nums: bool = False,
) -> Dict[Union[int, str], Union[int, float]]:
    """
    Transform ``Doc`` into a bag-of-words: the set of unique words in ``Doc``
    mapped to their absolute, relative, or binary frequency of occurrence.

    Args:
        doc
        normalize: If "lemma", lemmatize words before counting;
            if "lower", lowercase words before counting; otherwise, words are
            counted using the form in which they appear in the doc.
        weighting ({"count", "freq", "binary"}): Type of weight to assign to
            words. If "count" (default), weights are the absolute number of
            occurrences (count) of word in doc. If "binary", all counts are
            set equal to 1. If "freq", word counts are normalized by the
            total token count, giving their relative frequency of occurrence.
            Note: The resulting set of frequencies won't (necessarily) sum
            to 1.0, since punctuation and stop words are filtered out after
            counts are normalized.
        as_strings (bool): If True, words are returned as strings; if False
            (default), words are returned as their unique integer ids.
        filter_stops (bool): If True (default), stop words are removed after
            counting.
        filter_punct (bool): If True (default), punctuation tokens are removed
            after counting.
        filter_nums (bool): If True, tokens consisting of digits are removed
            after counting.

    Returns:
        Mapping of a unique term id or string (depending on the value of ``as_strings``)
        to its absolute, relative, or binary frequency of occurrence
        (depending on the value of ``weighting``).
    """
    if weighting not in {"count", "freq", "binary"}:
        raise ValueError(
            errors.value_invalid_msg("weighting", weighting, {"count", "freq", "binary"})
        )
    count_by = {
        "lemma": spacy.attrs.LEMMA,
        "lower": spacy.attrs.LOWER,
    }.get(normalize, spacy.attrs.ORTH)

    wid_weights = doc.count_by(count_by)
    if weighting == "freq":
        n_tokens = len(doc)
        wid_weights = {wid: weight / n_tokens for wid, weight in wid_weights.items()}
    elif weighting == "binary":
        wid_weights = {wid: 1 for wid in wid_weights.keys()}

    # Filter after weighting, using lexeme-level flags. The key is either the
    # integer string-store id or the string itself, depending on `as_strings`.
    bow = {}
    vocab = doc.vocab
    strings = vocab.strings
    for wid, weight in wid_weights.items():
        lex = vocab[wid]
        if (
            (lex.is_stop and filter_stops)
            or (lex.is_punct and filter_punct)
            or (lex.is_digit and filter_nums)
            or lex.is_space
        ):
            continue
        bow[strings[wid] if as_strings else wid] = weight
    return bow
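
And a matching usage sketch for the second implementation (pipeline name and text again assumed for illustration). Note the design difference called out in the docstring: filtering happens after the "freq" normalization, so the returned relative frequencies need not sum to 1.0:

import spacy

nlp = spacy.load("en_core_web_sm")
doc = nlp("Burton writes code. Burton ships code daily.")

# Relative frequencies of lemmas, keyed by string; stop words and punctuation
# are dropped only after dividing by the full token count (punctuation included).
print(to_bag_of_words(doc, normalize="lemma", weighting="freq", as_strings=True))

# Binary presence indicators, keyed by integer string-store id.
print(to_bag_of_words(doc, weighting="binary"))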