from __future__ import annotations

from typing import Callable, Literal, Mapping

from spacy import attrs
from spacy.tokens import Doc, Token


def token_count_by(
    *,
    doc: Doc,
    target: Literal['lemma', 'lower', 'orth', 'text'] = 'lemma',
    weighting: Literal['count', 'freq'] = 'count',
    include: Callable[[Token], bool] | None = None,
    n_min_count: int = 2,
    as_strings: bool = False,
) -> Mapping[str | int, int | float]:
    """Return frequency counts for `target` in `doc`."""
    # spacy.attrs has no TEXT constant; 'text' is treated as an alias for ORTH.
    target_keys = {
        'lemma': attrs.LEMMA,
        'lower': attrs.LOWER,
        'orth': attrs.ORTH,
        'text': attrs.ORTH,
    }
    # Stop words, punctuation, and whitespace tokens are always excluded; a custom
    # `include` predicate further restricts which tokens are counted.
    default_exclude: Callable[[Token], bool] = (
        lambda x: x.is_stop or x.is_punct or x.is_space
    )
    exclude: Callable[[Token], bool] = (
        default_exclude
        if include is None
        else lambda x: default_exclude(x) or not include(x)
    )
    target_weights: Mapping[str | int, int | float] = doc.count_by(
        target_keys[target], exclude=exclude
    )
    # Apply the min-count threshold to raw counts, before any normalization, so it
    # behaves the same for both output types and still works when weighting='freq'.
    target_weights = {
        id_: weight for id_, weight in target_weights.items() if weight >= n_min_count
    }
    if weighting == 'freq':
        n_tokens: int = len(doc)
        target_weights = {
            id_: weight / n_tokens for id_, weight in target_weights.items()
        }
    store = doc.vocab.strings
    if as_strings:
        bow = {store[word_id]: weight for word_id, weight in target_weights.items()}
    else:
        bow = target_weights
    return bow
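
# A minimal usage sketch for token_count_by (not from the original source). It assumes
# spaCy and the 'en_core_web_sm' pipeline are installed; the model name and sample
# text are illustrative only.
if __name__ == '__main__':
    import spacy

    nlp = spacy.load('en_core_web_sm')
    doc = nlp('The quick brown fox jumps over the lazy dog. The quick fox naps.')
    # All arguments are keyword-only, including `doc`.
    print(token_count_by(doc=doc, target='lemma', as_strings=True))
    # e.g. {'quick': 2, 'fox': 2} -- terms below n_min_count (default 2) are dropped,
    # as are stop words, punctuation, and whitespace tokens.
    print(token_count_by(doc=doc, target='lemma', weighting='freq', as_strings=True))
    # Same keys, with each raw count divided by len(doc).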
from typing import Dict, Union

import spacy
from spacy.tokens import Doc

# Assumed import: textacy's internal `errors` module, which provides value_invalid_msg().
from textacy import errors


def to_bag_of_words(
    doc: Doc,
    *,
    normalize: str = "lemma",
    weighting: str = "count",
    as_strings: bool = False,
    filter_stops: bool = True,
    filter_punct: bool = True,
    filter_nums: bool = False,
) -> Dict[Union[int, str], Union[int, float]]:
    """
    Transform ``Doc`` into a bag-of-words: the set of unique words in ``Doc``
    mapped to their absolute, relative, or binary frequency of occurrence.

    Args:
        doc: spaCy ``Doc`` from which to extract the bag-of-words.
        normalize: If "lemma", lemmatize words before counting; if "lower",
            lowercase words before counting; otherwise, words are counted using
            the form in which they appear in ``doc``.
        weighting ({"count", "freq", "binary"}): Type of weight to assign to words.
            If "count" (default), weights are the absolute number of occurrences
            (count) of each word in ``doc``. If "binary", all counts are set equal
            to 1. If "freq", word counts are normalized by the total token count,
            giving their relative frequency of occurrence.
            Note: The resulting set of frequencies won't (necessarily) sum to 1.0,
            since punctuation and stop words are filtered out after counts are
            normalized.
        as_strings (bool): If True, words are returned as strings; if False
            (default), words are returned as their unique integer ids.
        filter_stops (bool): If True (default), stop words are removed after counting.
        filter_punct (bool): If True (default), punctuation tokens are removed
            after counting.
        filter_nums (bool): If True, tokens consisting of digits are removed
            after counting.

    Returns:
        Mapping of a unique term id or string (depending on the value of
        ``as_strings``) to its absolute, relative, or binary frequency of
        occurrence (depending on the value of ``weighting``).
    """
    if weighting not in {"count", "freq", "binary"}:
        raise ValueError(
            errors.value_invalid_msg("weighting", weighting, {"count", "freq", "binary"})
        )
    count_by = (
        spacy.attrs.LEMMA
        if normalize == "lemma"
        else spacy.attrs.LOWER
        if normalize == "lower"
        else spacy.attrs.ORTH
    )
    wid_weights = doc.count_by(count_by)
    if weighting == "freq":
        n_tokens = len(doc)
        wid_weights = {wid: weight / n_tokens for wid, weight in wid_weights.items()}
    elif weighting == "binary":
        wid_weights = {wid: 1 for wid in wid_weights.keys()}

    bow = {}
    vocab = doc.vocab
    if as_strings is False:
        for wid, weight in wid_weights.items():
            lex = vocab[wid]
            if not (
                (lex.is_stop and filter_stops)
                or (lex.is_punct and filter_punct)
                or (lex.is_digit and filter_nums)
                or lex.is_space
            ):
                bow[wid] = weight
    else:
        ss = doc.vocab.strings
        for wid, weight in wid_weights.items():
            lex = vocab[wid]
            if not (
                (lex.is_stop and filter_stops)
                or (lex.is_punct and filter_punct)
                or (lex.is_digit and filter_nums)
                or lex.is_space
            ):
                bow[ss[wid]] = weight
    return bow
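
# A minimal usage sketch for to_bag_of_words (not from the original source). It assumes
# spaCy, textacy, and the "en_core_web_sm" pipeline are installed; the model name and
# sample text are illustrative only.
if __name__ == "__main__":
    nlp = spacy.load("en_core_web_sm")
    doc = nlp("The quick brown fox jumps over the lazy dog. The quick fox naps.")
    # Absolute counts keyed by the lemma string, with stop words/punctuation dropped.
    print(to_bag_of_words(doc, normalize="lemma", as_strings=True))
    # Relative frequencies: counts divided by len(doc). These need not sum to 1.0,
    # because filtering happens after the counts are normalized.
    print(to_bag_of_words(doc, weighting="freq", as_strings=True))
    # Binary indicators: every surviving term is mapped to 1.
    print(to_bag_of_words(doc, weighting="binary", as_strings=True))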