def getTransitionProb(sm, sents, tagset):
    """Estimate tag-transition distributions from tagged sentences.

    P(nextTag|prevTag) = transitionProb[prevTag].prob(nextTag)

    Tag bigrams are collected per sentence, so no bigram spans a sentence
    boundary.

    Args:
        sm: Smoothing scheme name. "no" selects Lidstone with gamma 0,
            "laplace" selects Lidstone with gamma 1, "goodturing" selects
            SimpleGoodTuringProbDist; any other value selects
            WittenBellProbDist.
        sents: Iterable of sentences, each an iterable of (word, tag) pairs.
        tagset: Iterable of tags; one distribution is built for every tag in
            it, including tags that never occur as a predecessor.

    Returns:
        A dict mapping each tag in ``tagset`` to a probability distribution
        over the tags observed directly after it.
    """
    # Group successor tags by their predecessor in a single pass. The
    # previous implementation rescanned the full bigram list once per tag.
    # Per-tag insertion order is the same as before, so the resulting
    # frequency distributions are identical.
    followers = {}
    for s in sents:
        tags = [t for (w, t) in s]
        for prevTag, nextTag in ngrams(tags, 2):
            followers.setdefault(prevTag, []).append(nextTag)

    transitionProb = {}
    for tag in tagset:
        fdist = FreqDist(followers.get(tag, []))
        if sm == "no":
            transitionProb[tag] = LidstoneProbDist(fdist, 0, bins=1e5)
        elif sm == "laplace":
            transitionProb[tag] = LidstoneProbDist(fdist, 1, bins=1e5)
        elif sm == "goodturing":
            transitionProb[tag] = SimpleGoodTuringProbDist(fdist, bins=1e5)
        else:
            transitionProb[tag] = WittenBellProbDist(fdist, bins=1e5)
    return transitionProb
def raw_words(self, length=100):
    """Generate a list of words using an NLTK NgramModel.

    The bigram model is built from ``self.model`` on first use and cached on
    the instance as ``_ngram_model``. Generation is seeded with one word
    picked at random from ``self.words``.

    Args:
        length: Passed to the model's ``generate`` call as the requested
            length.

    Returns:
        The generated sequence with its first element dropped
        (presumably the random seed word -- confirm against NgramModel).
    """
    if not hasattr(self, '_ngram_model'):
        def lidstone(fdist, bins):
            # ``bins`` is accepted to satisfy the estimator call signature
            # but is not forwarded to the distribution.
            return LidstoneProbDist(fdist, 0.2)

        self._ngram_model = NgramModel(2, self.model, estimator=lidstone)

    generate = self._ngram_model.generate
    seed = [random.choice(self.words)]
    generated = generate(length, seed)
    return generated[1:]
def make_model(nst_infile, picklefile, protocol=-1):
    """Train a POS probability model on the NST lexicon and save it as a pickle file.

    The model is a LidstoneProbDist (NLTK) which has compounded POS tags
    (SUC set) as keys (e.g. "NN+NN") and smoothed probabilities as values.

    Args:
        nst_infile: Path to the tab-separated, UTF-8 encoded NST lexicon.
            Column 0 is read as the word, column 3 as the compound analysis
            and column 4 as the POS tag.
        picklefile: Path the pickled model is written to.
        protocol: Pickle protocol passed to ``pickle.dump``.

    Raises:
        IndexError: If a line has fewer than five tab-separated fields.
    """
    # Collect all compounds from nst data
    nst_full_compounds = set()
    with open(nst_infile, encoding='UTF-8') as f:
        for line in f:
            # Strip only the line terminator. Slicing off the last character
            # unconditionally would eat a real character from a final line
            # that has no trailing newline.
            fields = line.rstrip('\n').split('\t')
            word = fields[0]
            comp = fields[3].replace("!", "")
            pos = fields[4]
            # Keep full compounds only: the analysis must contain a "+",
            # the word must not contain "_", and the analysis must not
            # start with "+" or "-".
            if "+" in comp and "_" not in word and not comp.startswith(("+", "-")):
                nst_full_compounds.add((word, comp, pos))

    # Build POS probability model
    pos_fdist = FreqDist()
    for _w, _c, pos in nst_full_compounds:
        # Drop every "+LN" component from the compounded tag (a no-op for
        # tags that do not contain it).
        pos = pos.replace("+LN", "")
        pos_fdist[pos] += 1

    pd = LidstoneProbDist(pos_fdist, 0.001, pos_fdist.B())

    # Save probability model as pickle
    with open(picklefile, "wb") as f:
        pickle.dump(pd, f, protocol=protocol)
def _estimator(fdist, *estimator_args, **estimator_kwargs):
    """Default estimator function using a LidstoneProbDist.

    Args:
        fdist: Frequency distribution to smooth.
        *estimator_args: Extra positional arguments forwarded unchanged to
            ``LidstoneProbDist`` after ``fdist``.
        **estimator_kwargs: Extra keyword arguments forwarded unchanged to
            ``LidstoneProbDist``.

    Returns:
        The ``LidstoneProbDist`` built from ``fdist`` and the forwarded
        arguments.
    """
    # can't be an instance method of NgramModel as they
    # can't be pickled either.
    return LidstoneProbDist(fdist, *estimator_args, **estimator_kwargs)
def make_model(stats_infile, picklefile, smoothingparam=0.001, min_freq=3, protocol=-1):
    """Train a probability model on a korp statistics file and save it as a pickle file.

    The model is a LidstoneProbDist (NLTK) which has tuples (wordform, MSD-tag)
    as keys and smoothed probabilities as values.

    Args:
        stats_infile: Path to the tab-separated, UTF-8 encoded statistics
            file. Column 0 is read as the word form, column 1 as the MSD tag
            and column 4 as the frequency.
        picklefile: Path the pickled model is written to.
        smoothingparam: Gamma passed to ``LidstoneProbDist``.
        min_freq: Reading stops at the first line whose frequency is below
            this value.
        protocol: Pickle protocol passed to ``pickle.dump``.

    Raises:
        IndexError: If a line has fewer than five tab-separated fields.
        ValueError: If the frequency column is not an integer.
    """
    fdist = FreqDist()
    with open(stats_infile, encoding='utf-8') as f:
        for line in f:
            # Strip only the line terminator. Slicing off the last character
            # unconditionally would corrupt the final field of a last line
            # that has no trailing newline.
            fields = line.rstrip('\n').split('\t')
            word = fields[0]
            freq = int(fields[4])

            # Stop at the first word form that occurs fewer times than
            # min_freq; everything after it is ignored.
            # NOTE(review): this presumably relies on the file being sorted
            # by descending frequency -- confirm against the file producer.
            if freq < min_freq:
                break

            # Get rid of urls.
            # NOTE(review): only the "http://" prefix is filtered; "https://"
            # word forms are kept -- confirm whether that is intended.
            if word.startswith("http://"):
                continue

            # Keep only the part of the MSD tag before the first "."
            simple_msd = fields[1].partition('.')[0]
            fdist[(word, simple_msd)] += freq

    pd = LidstoneProbDist(fdist, smoothingparam, fdist.B())

    # Save probability model as pickle
    with open(picklefile, "wb") as p:
        pickle.dump(pd, p, protocol=protocol)
def getEmissionProb(sm, sents, tagset):
    """Estimate word-emission distributions from tagged sentences.

    P(word|tag) = emissionProb[tag].prob(word)

    Words are lower-cased before counting.

    Args:
        sm: Smoothing scheme name. "no" selects Lidstone with gamma 0,
            "laplace" selects Lidstone with gamma 1, "goodturing" selects
            SimpleGoodTuringProbDist; any other value selects
            WittenBellProbDist.
        sents: Iterable of sentences, each an iterable of (word, tag) pairs.
        tagset: Iterable of tags; one distribution is built for every tag in
            it, including tags that never occur in ``sents``.

    Returns:
        A dict mapping each tag in ``tagset`` to a probability distribution
        over the lower-cased words observed with that tag.
    """
    # Group words by tag in a single pass. The previous implementation
    # rescanned every (word, tag) pair once per tag. Per-tag insertion order
    # is the same as before, so the frequency distributions are identical.
    words_by_tag = {}
    for s in sents:
        for (w, t) in s:
            words_by_tag.setdefault(t, []).append(w.lower())

    emissionProb = {}
    for tag in tagset:
        fdist = FreqDist(words_by_tag.get(tag, []))
        if sm == "no":
            emissionProb[tag] = LidstoneProbDist(fdist, 0, bins=1e5)
        elif sm == "laplace":
            emissionProb[tag] = LidstoneProbDist(fdist, 1, bins=1e5)
        elif sm == "goodturing":
            emissionProb[tag] = SimpleGoodTuringProbDist(fdist, bins=1e5)
        else:
            emissionProb[tag] = WittenBellProbDist(fdist, bins=1e5)
    return emissionProb
def get_probs_dist(freq_dist, smoothing=1):
    """Build one Lidstone-smoothed distribution per n-gram prefix.

    Each n-gram key of ``freq_dist`` is split into its leading items and its
    last item. The leading items are concatenated with ``''.join`` to form
    the prefix key, and the n-gram's count is added to the last item under
    that prefix.

    Args:
        freq_dist: Mapping from n-gram (a sequence of strings) to its count.
        smoothing: Gamma passed to every ``LidstoneProbDist``.

    Returns:
        A ``defaultdict`` mapping each prefix string to its
        ``LidstoneProbDist``. Its default factory is ``LidstoneProbDist``
        called without arguments, so it cannot create an entry for a missing
        prefix.
    """
    counts_by_prefix = defaultdict(FreqDist)
    for ngram, count in freq_dist.items():
        *context, last = ngram
        counts_by_prefix[''.join(context)][last] += count

    smoothed = {
        key: LidstoneProbDist(fd, gamma=smoothing)
        for key, fd in counts_by_prefix.items()
    }
    return defaultdict(LidstoneProbDist, smoothed)
def demo_pos():
    """Demonstrate POS tagging using supervised training.

    Loads tagged sequences with ``load_pos(200)``, trains an HMM on all but
    the first ten using Lidstone smoothing (gamma 0.1), and evaluates it on
    those first ten with ``test_pos``.
    """
    # Single-argument parenthesised prints produce the same output whether
    # ``print`` is a statement (Python 2) or a function (Python 3). The
    # bare print statements used before are syntax errors on Python 3.
    print("")
    print("HMM POS tagging demo")
    print("")

    print('Training HMM...')
    labelled_sequences, tag_set, symbols = load_pos(200)
    trainer = HiddenMarkovModelTrainer(tag_set, symbols)
    hmm = trainer.train_supervised(
        labelled_sequences[10:],
        estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))

    print('Testing...')
    test_pos(hmm, labelled_sequences[:10], True)
def __init__(self, dataset, capitalize=False):
    """Build a bigram language model from newline-separated tweets.

    Tweets that contain "@" anywhere or start with "RT" are skipped
    entirely. From the remaining tweets, whitespace-separated words are kept
    unless they start with one of "@", "#", ":", "(", ")", "2" or contain
    "http://" or "https://".

    Args:
        dataset: A single string holding one tweet per line.
        capitalize: Stored on the instance as ``self.capitalize``.
    """
    self.capitalize = capitalize

    rejected_initials = ["@", "#", ":", "(", ")", "2"]

    def is_usable(word):
        # Words produced by str.split() are never empty, so word[0] is safe.
        if word[0] in rejected_initials:
            return False
        return "http://" not in word and "https://" not in word

    words = []
    for tweet in dataset.split("\n"):
        if "@" in tweet or tweet.startswith("RT"):
            continue
        words.extend(word for word in tweet.split() if is_usable(word))

    self.words = words
    self.model = nltk.Text(words)

    def lidstone(fdist, bins):
        # ``bins`` is accepted to satisfy the estimator call signature but
        # is not forwarded to the distribution.
        return LidstoneProbDist(fdist, 0.2)

    self._ngram_model = NgramModel(2, self.model, estimator=lidstone)
def demo_pos_bw():
    """Demonstrate the Baum-Welch algorithm in POS tagging.

    Loads tagged sentences with ``load_pos(210)``, trains an HMM in
    supervised mode on sentences 10-199, refines it with unsupervised
    training on the untagged sentences 200-209 (at most 5 iterations), and
    evaluates on the first ten sentences with ``test_pos``.
    """
    # Single-argument parenthesised prints produce the same output whether
    # ``print`` is a statement (Python 2) or a function (Python 3). The
    # bare print statements used before are syntax errors on Python 3.
    print("")
    print("Baum-Welch demo for POS tagging")
    print("")

    print('Training HMM (supervised)...')
    # The symbol collection returned by load_pos is not used; the vocabulary
    # is rebuilt below from the sentences that were actually loaded.
    sentences, tag_set, _unused_symbols = load_pos(210)

    symbols = set()
    for sentence in sentences:
        for token in sentence:
            symbols.add(token[_TEXT])

    trainer = HiddenMarkovModelTrainer(tag_set, list(symbols))
    hmm = trainer.train_supervised(
        sentences[10:200],
        estimator=lambda fd, bins: LidstoneProbDist(fd, 0.1, bins))

    print('Training (unsupervised)...')
    # it's rather slow - so only use 10 samples
    unlabeled = _untag(sentences[200:210])
    hmm = trainer.train_unsupervised(unlabeled, model=hmm, max_iterations=5)
    test_pos(hmm, sentences[:10], True)
def run(self):
    """Train a Lidstone-smoothed conditional n-gram model.

    Every window of ``self.n`` consecutive items in ``self.data_set`` is
    counted as (tuple of the first ``n - 1`` items) -> (last item). The
    resulting ``ConditionalProbDist`` is stored on ``self.model``.
    """
    data = self.data_set
    context_len = self.n - 1

    def context_target_pairs():
        # One (context, target) pair per full window of n items.
        for start in range(len(data) - context_len):
            split = start + context_len
            yield tuple(data[start:split]), data[split]

    def lidstone_estimator(fd):
        # One bin more than the number of observed outcomes; presumably this
        # reserves probability mass for an unseen item -- confirm intent.
        return LidstoneProbDist(fd, self.gamma, fd.B() + 1)

    cfd = ConditionalFreqDist(context_target_pairs())
    self.model = ConditionalProbDist(cfd, lidstone_estimator)