Example #1
def test_TokenCount_single_co_occurrence():
    from text_models.dataset import TokenCount
    # single_co_ocurrence counts every token and every alphabetically
    # ordered pair of distinct tokens ("a~b") that co-occur in a line
    tcount = TokenCount.single_co_ocurrence()
    tcount.process_line("buenos xxx dias")
    assert tcount.counter["dias~xxx"] == 1
    assert tcount.counter["xxx"] == 1
Example #2
def test_TokenCount_process():
    from microtc.utils import tweet_iterator
    from text_models.dataset import TokenCount
    tcount = TokenCount.bigrams()
    # TWEETS is the path to the test corpus, defined elsewhere in the test module
    tcount.process(tweet_iterator(TWEETS))
    print(tcount.counter.most_common(10))
    assert tcount.counter["in~the"] == 313
Example #3
def test_TokenCount_process_line():
    from text_models.dataset import TokenCount
    tcount = TokenCount.bigrams()
    tcount.process_line("buenos dias xx la dias xx")
    counter = tcount.counter
    print(counter)
    assert counter["dias~xx"] == 2 and tcount.num_documents == 1
Example #4
def test_TokenCount_clean():
    from microtc.utils import tweet_iterator
    from text_models.dataset import TokenCount
    tcount = TokenCount.single_co_ocurrence()
    tcount.process(tweet_iterator(TWEETS))
    before = len(tcount.counter)
    tcount.clean()
    after = len(tcount.counter)
    # clean() prunes low-frequency tokens, so the vocabulary must shrink
    assert before > after
Example #5
import numpy as np
from text_models.dataset import TokenCount
from glob import glob
from text_models.utils import TStatistic, LikelihoodRatios
from wordcloud import WordCloud as WC
import matplotlib.pyplot as plt
from collections import defaultdict, Counter
from text_models import Vocabulary

tm = TokenCount.textModel(token_list=[-2, -1])
tm.tokenize("Good afternoon, we will make some nice word clouds.")
## SHOW - word bigrams "w_{i}~w_{i+1}" plus the individual words

token = TokenCount(tokenizer=tm.tokenize)
for fname in glob("books/*.txt"):
    with open(fname, encoding="utf-8") as fh:
        txt = fh.read()
    token.process([txt])
token.counter
## SHOW - number of times each bigram and word appears

bigrams = {k: v for k, v in token.counter.items() if "~" in k}
cnt = Counter(bigrams)
cnt.most_common(5)
# [('of~the', 14615),
#  ('in~the', 9913),
#  ...]
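
# These frequencies can already be turned into a word cloud with the
# imported WordCloud (WC); a minimal sketch, weighting each bigram by its
# raw count (an association score such as TStatistic could be used instead):
wc = WC().generate_from_frequencies(bigrams)
plt.imshow(wc, interpolation="bilinear")
plt.axis("off")
plt.show()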
Example #6
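import numpy as np
from collections import Counter

# The methods below are a fragment of an LM class whose constructor is not
# shown. A minimal __init__ consistent with log_prob/prob follows; it is a
# hypothetical reconstruction assuming `counter` holds the ngram counts and
# `words` flags a unigram model.
class LM:
    def __init__(self, counter, words: bool = False):
        self._data = counter
        self._words = words
        if words:
            # Unigram model: normalize by the total number of tokens
            self.N = sum(counter.values())
        else:
            # Conditional model: normalize by the (n-1)-gram prefix counts
            self.N = Counter()
            for ngram, n in counter.items():
                self.N["~".join(ngram.split("~")[:-1])] += n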
    def log_prob(self, ngram: str) -> float:
        c1 = self._data[ngram]
        if self._words:
            c2 = self.N
        else:
            # condition on the (n-1)-gram prefix of the ngram
            words = "~".join(ngram.split("~")[:-1])
            c2 = self.N[words]
        if c1 and c2:
            return np.log(c1) - np.log(c2)
        raise ValueError("ngram %s not found" % ngram)

    def prob(self, ngram: str) -> float:
        return np.exp(self.log_prob(ngram))


tm = TokenCount.textModel(token_list=[-3])
token = TokenCount(tokenizer=tm.tokenize)
# Read (defined earlier in the original script) streams the training text
# via read() and keeps a held-out test_set
read = Read(glob("books/*.txt"), n_gram=tm.token_list[0] * -1)
token.process(read.read())

lm = LM(token.counter, words=tm.token_list[0] == -1)
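
# Query the model directly (hypothetical trigram; it may be absent from the
# books, in which case log_prob raises ValueError):
try:
    print(lm.prob("out~of~the"))  # P(the | out~of) under the MLE model
except ValueError:
    pass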

logp = 0              # accumulated log-probability over the test set
max_logp, cnt = 0, 0  # worst (smallest) log-prob seen and OOV count
N = 0                 # total number of test ngrams
for txt in read.test_set:
    for ngram in tm.tokenize(txt):
        N += 1
        try:
            _ = lm.log_prob(ngram)
            if _ < max_logp:
                max_logp = _
            logp += _
        except ValueError:
            # the original snippet ends mid-loop; assuming unseen ngrams
            # are counted here and penalized afterwards (reconstruction)
            cnt += 1
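
# With logp accumulated over N test ngrams, perplexity follows from the
# standard definition exp(-(1/N) * sum log p). A sketch, assuming each OOV
# ngram is penalized with the worst observed log-probability:
logp += cnt * max_logp
perplexity = np.exp(-logp / N)
print(perplexity)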