Python TokenCount.textModel示例

编程语言: Python

命名空间/包名称: text_models.dataset

类/类型: TokenCount

方法/功能: textModel

hotexamples.com的示例: 2

Python TokenCount.textModel - 共找到 2 个示例。以下是从开源项目中提取的、评价最高的 text_models.dataset.TokenCount.textModel 真实 Python 代码示例。您可以为示例评分，以帮助我们提高示例质量。

常用方法

显示/隐藏

bigrams(2)

process(2)

single_co_ocurrence(2)

textModel(2)

TokenCount(1)

co_ocurrence(1)

示例#1

显示文件

# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from text_models.dataset import TokenCount
from glob import glob
from text_models.utils import TStatistic, LikelihoodRatios
from wordcloud import WordCloud as WC
from matplotlib import pylab as plt
from collections import defaultdict, Counter
from text_models import Vocabulary

# Text model configured with token_list=[-2, -1]; per the SHOW notes below it
# yields bigrams written as "w_{i}~w_{i+1}" alongside single words.
tm = TokenCount.textModel(token_list=[-2, -1])
tm.tokenize("Good afternoon, we will make some nice word clouds.")
## SHOW - bigrams "w_{i}~w_{i+1}"

# Feed every book under books/ through the counter, one whole file at a time.
token = TokenCount(tokenizer=tm.tokenize)
for fname in glob("books/*.txt"):
    # The context manager closes each file as soon as it has been read,
    # instead of leaving the handle open until garbage collection.
    with open(fname, encoding="utf-8") as book:
        txt = book.read()
    token.process([txt])
token.counter
## SHOW - number of times each bigram and word appear

# Keep only the entries whose key contains the "~" bigram separator.
bigrams = {k: v for k, v in token.counter.items() if "~" in k}
cnt = Counter(bigrams)
cnt.most_common(5)
# [('of~the', 14615),
#  ('in~the', 9913),

示例#2

显示文件

    def log_prob(self, ngram: str) -> float:
        """Return the natural logarithm of the relative frequency of *ngram*.

        The count stored under ``ngram`` in ``self._data`` is divided (in log
        space) by ``self.N`` itself when ``self._words`` is truthy, otherwise
        by ``self.N`` looked up with the n-gram minus its last "~"-separated
        token.

        Raises:
            ValueError: when either of the two counts is falsy (e.g. zero).
        """
        numerator = self._data[ngram]
        # rpartition yields everything before the final "~", or "" when the
        # n-gram contains no separator at all.
        denominator = (
            self.N if self._words else self.N[ngram.rpartition("~")[0]]
        )
        if not (numerator and denominator):
            raise ValueError("ngram %s not found" % ngram)
        return np.log(numerator) - np.log(denominator)

    def prob(self, ngram: str) -> float:
        """Return the probability of *ngram* by exponentiating ``log_prob``."""
        log_p = self.log_prob(ngram)
        return np.exp(log_p)


# Text model with a single token_list entry, -3
# (presumably selects 3-word n-grams — confirm against text_models docs).
tm = TokenCount.textModel(token_list=[-3])
token = TokenCount(tokenizer=tm.tokenize)
# n_gram receives the sign-flipped token_list entry, i.e. 3 for this setup.
read = Read(glob("books/*.txt"), n_gram=tm.token_list[0] * -1)
token.process(read.read())

# words is True only when token_list[0] == -1; with -3 it evaluates to False.
lm = LM(token.counter, words=tm.token_list[0] == -1)

# Running totals for the loop over read.test_set that follows: N is
# incremented once per tokenized n-gram and max_logp is compared against each
# log-probability. The roles of logp and cnt are not visible from this excerpt.
logp = 0
max_logp, cnt = 0, 0
N = 0
for txt in read.test_set:
    for ngram in tm.tokenize(txt):
        N += 1
        try:
            _ = lm.log_prob(ngram)
            if _ < max_logp: