示例#1
0
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from text_models.dataset import TokenCount
from glob import glob
from text_models.utils import TStatistic, LikelihoodRatios
from wordcloud import WordCloud as WC
from matplotlib import pylab as plt
from collections import defaultdict, Counter
from text_models import Vocabulary

# Text model configured with token_list=[-2, -1]; per the SHOW notes below this
# yields bigrams joined by "~" together with single words.
tm = TokenCount.textModel(token_list=[-2, -1])
tm.tokenize("Good afternoon, we will make some nice word clouds.")
## SHOW - bigrams "w_{i}~w_{i+1}"

token = TokenCount(tokenizer=tm.tokenize)
for fname in glob("books/*.txt"):
    # Use a context manager so each book's file handle is closed right after
    # reading instead of lingering until garbage collection.
    with open(fname, encoding="utf-8") as book:
        txt = book.read()
    token.process([txt])
token.counter
## SHOW - number of times each bigram and word appear

# Keep only the entries whose key contains the "~" separator, i.e. the bigrams.
bigrams = {k: v for k, v in token.counter.items() if "~" in k}
cnt = Counter(bigrams)
cnt.most_common(5)
# [('of~the', 14615),
#  ('in~the', 9913),
示例#2
0
    def log_prob(self, ngram: str) -> float:
        """Return the natural log of ``count(ngram) / normaliser``.

        The numerator is ``self._data[ngram]``.  When ``self._words`` is
        truthy the normaliser is ``self.N`` itself; otherwise it is
        ``self.N`` indexed by the n-gram with its last "~"-separated token
        removed.

        :param ngram: n-gram whose tokens are joined by "~"
        :raises ValueError: if the numerator or the normaliser is falsy
        """
        numerator = self._data[ngram]
        if self._words:
            denominator = self.N
        else:
            # Everything before the last "~" (empty string when there is none).
            context = ngram.rpartition("~")[0]
            denominator = self.N[context]
        if not numerator or not denominator:
            raise ValueError("ngram %s not found" % ngram)
        return np.log(numerator) - np.log(denominator)

    def prob(self, ngram: str) -> float:
        """Return the probability of ``ngram``: the exponential of ``log_prob``.

        :param ngram: n-gram passed unchanged to ``self.log_prob``
        """
        log_p = self.log_prob(ngram)
        return np.exp(log_p)


# Text model restricted to token_list=[-3] (presumably 3-word n-grams —
# confirm against the text_models documentation).
tm = TokenCount.textModel(token_list=[-3])
token = TokenCount(tokenizer=tm.tokenize)
# token_list stores the size as a negative number; Read receives it negated.
read = Read(glob("books/*.txt"), n_gram=-tm.token_list[0])
token.process(read.read())

# `words` is True only when the model was configured with token_list=[-1].
lm = LM(token.counter, words=(tm.token_list[0] == -1))

# Accumulators consumed by the evaluation loop that follows.
logp = 0
max_logp = 0
cnt = 0
N = 0
for txt in read.test_set:
    for ngram in tm.tokenize(txt):
        N += 1
        try:
            _ = lm.log_prob(ngram)
            if _ < max_logp: