def __init__(self, fileid):
    """Build character-level n-gram statistics from one UDHR corpus file.

    The first 1000 characters of the file (stripped, lower-cased, runs of
    newlines collapsed to one space) become the training text.  The next
    100 characters are stored untouched as the dev text.

    :param fileid: UDHR file identifier handed to ``udhr.raw``.

    If the file cannot be read, a message is written to stderr and the
    process exits with status 1.
    """
    try:
        # Reads the UDHR file
        corpus = udhr.raw(fileid)
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that Ctrl-C and
        # SystemExit are not reported as a missing language file.
        print("UDHR language file " + fileid + " does not exist",
              file=sys.stderr)
        sys.exit(1)

    # Generate training dataset, lowercase and newlines converted to space
    self.train = re.sub(r'[\n]+', ' ', corpus[0:1000].strip().lower())
    # Generate dev dataset
    # NOTE(review): unlike the training text, the dev text is not
    # lower-cased or newline-normalised -- confirm this is intended.
    self.dev = corpus[1000:1100]

    # Convert training text to single characters
    tokens = list(self.train)
    self.unigram = tokens
    self.bigram = list(nltk.bigrams(tokens))
    self.trigram = list(nltk.trigrams(tokens))

    # Generate unigram frequency distribution
    self.unigramFreq = FreqDist(self.unigram)
    # Generate bigram frequency distribution (condition: first character)
    self.bigramFreq = ConditionalFreqDist(self.bigram)
    # Generate trigram frequency distribution (condition: first two characters)
    self.trigramFreq = ConditionalFreqDist(
        ((w0, w1), w2) for w0, w1, w2 in self.trigram)
def build_top_words(self):
    """Select the 1000 highest-scoring words and store them in ``self.top_words``.

    ``self.documents`` is read as ``(review, label)`` pairs where ``review``
    is an iterable of tokens.  Each distinct token is scored by the sum of
    its chi-square association with the 'pos' label and with the 'neg'
    label; the 1000 best-scoring tokens are kept as a set.
    """
    # Flatten the tokens of each class into one list per label.
    pos_words = [token
                 for (review, label) in self.documents if label == 'pos'
                 for token in review]
    neg_words = [token
                 for (review, label) in self.documents if label == 'neg'
                 for token in review]

    overall_fd = FreqDist(pos_words + neg_words)
    pos_cfd = ConditionalFreqDist(('pos', word) for word in pos_words)
    neg_cfd = ConditionalFreqDist(('neg', word) for word in neg_words)

    n_pos = len(pos_words)
    n_neg = len(neg_words)
    n_total = n_pos + n_neg

    scores = {}
    for word, freq in overall_fd.items():
        from_pos = BigramAssocMeasures.chi_sq(
            pos_cfd['pos'][word], (freq, n_pos), n_total)
        from_neg = BigramAssocMeasures.chi_sq(
            neg_cfd['neg'][word], (freq, n_neg), n_total)
        scores[word] = from_pos + from_neg

    # Highest score first; only the first 1000 entries survive.
    ranked = sorted(scores.items(), key=lambda pair: pair[1], reverse=True)
    self.top_words = {word for word, _score in ranked[:1000]}
def train_model():
    """Create ngram model from Project Gutenberg texts.

    Reads every path in ``CORPORA``, lower-cases the text, splits it into
    sentences and wraps each sentence in ``START`` / ``END`` marker tokens
    before word-tokenising.  Two Laplace-smoothed conditional probability
    distributions are built from the ``N_VAL``-grams of the token stream.

    :return: ``(bi_cpdist, cpdist)`` where ``bi_cpdist`` is conditioned on
        the first token of each n-gram (samples are its first two tokens)
        and ``cpdist`` is conditioned on the first ``N_MINUS1`` tokens
        (samples are the full n-gram).
    """
    pieces = []
    for corpus in CORPORA:
        with open(corpus, 'r') as file_:
            # Newlines become spaces (not ''), otherwise the last word of a
            # line is fused with the first word of the following line.
            pieces.append(file_.read().replace('\n', ' '))
    # Join with a space so the end of one file does not merge into the next.
    text = ' '.join(pieces)

    sents = sent_tokenize(text.lower())
    tokens = []
    # appends <start> and <end> tokens to each sentence
    for sent in sents:
        tokens.extend(word_tokenize('START ' + sent + ' END'))

    ngrams_ = tuple(ngrams(tokens, N_VAL))

    # bigram frequency distribution
    bi_cfdist = ConditionalFreqDist((ngram[0], ngram[:2]) for ngram in ngrams_)
    # bigram probability distribution
    bi_cpdist = ConditionalProbDist(bi_cfdist, LaplaceProbDist)

    # conditional frequency distribution
    cfdist = ConditionalFreqDist(
        (ngram[:N_MINUS1], ngram) for ngram in ngrams_)
    # conditional probability
    cpdist = ConditionalProbDist(cfdist, LaplaceProbDist)

    return bi_cpdist, cpdist
def __init__(self, file):
    """Collect character n-gram counts from the start of a UDHR file.

    :param file: UDHR file identifier handed to ``udhr.raw``.

    Only the first 1000 characters of the raw text are used.
    """
    raw_text = udhr.raw(file)
    self.training_set = raw_text[0:1000]

    # One token per character of the training text.
    chars = list(self.training_set)
    self.unigram = chars
    self.bigram = list(nltk.bigrams(chars))
    self.trigram = list(nltk.trigrams(chars))

    self.unigram_frequency = FreqDist(chars)
    self.bigram_frequency = ConditionalFreqDist(self.bigram)
    # Attribute name is spelled "trigam" here; kept as-is for existing callers.
    self.trigam_frequency = ConditionalFreqDist(
        [((first, second), third) for first, second, third in self.trigram])
def train_supervised(self, labelled_sequences, **kwargs):
    """
    Supervised training maximising the joint probability of the symbol and
    state sequences. This is done via collecting frequencies of
    transitions between states, symbol observations while within each
    state and which states start a sentence. These frequency
    distributions are then normalised into probability estimates, which
    can be smoothed if desired.

    As a side effect, every state and symbol seen in the training data is
    appended to C{self._states} / C{self._symbols} if not already present.

    @return: the trained model
    @rtype: HiddenMarkovModelTagger
    @param labelled_sequences: the training data, a set of
        labelled sequences of observations
    @type labelled_sequences: list
    @param kwargs: may include an 'estimator' parameter, a function taking
        a C{FreqDist} and a number of bins and returning a C{ProbDistI};
        otherwise a MLE estimate is used
    """

    # default to the MLE estimate
    estimator = kwargs.get('estimator')
    if estimator is None:
        estimator = lambda fdist, bins: MLEProbDist(fdist)

    # count occurrences of starting states, transitions out of each state
    # and output symbols observed in each state
    starting = FreqDist()
    transitions = ConditionalFreqDist()
    outputs = ConditionalFreqDist()
    for sequence in labelled_sequences:
        lasts = None
        for token in sequence:
            state = token[_TAG]
            symbol = token[_TEXT]
            # identity test: a state value must never be mistaken for
            # "no previous state" through a custom __eq__
            if lasts is None:
                starting.inc(state)
            else:
                transitions[lasts].inc(state)
            outputs[state].inc(symbol)
            lasts = state

            # update the state and symbol lists
            if state not in self._states:
                self._states.append(state)
            if symbol not in self._symbols:
                self._symbols.append(symbol)

    # create probability distributions (with smoothing)
    N = len(self._states)
    pi = estimator(starting, N)
    A = ConditionalProbDist(transitions, estimator, False, N)
    B = ConditionalProbDist(outputs, estimator, False, len(self._symbols))

    return HiddenMarkovModelTagger(self._symbols, self._states, A, B, pi)
def __init__(self, corpura):
    """Collect character n-gram counts from the start of a UDHR file.

    :param corpura: UDHR file identifier handed to ``udhr.raw``.

    Only the first 1000 characters of the raw text are used.
    """
    raw_text = udhr.raw(corpura)
    self.TrainingSet = raw_text[0:1000]

    # One token per character of the training text.
    chars = list(self.TrainingSet)
    self.Uni = chars
    self.Bi = list(nltk.bigrams(chars))
    self.Tri = list(nltk.trigrams(chars))

    self.UniFreq = FreqDist(chars)
    self.BiFreq = ConditionalFreqDist(self.Bi)
    # Trigrams are conditioned on their first two characters.
    self.TriFreq = ConditionalFreqDist(
        [((first, second), third) for first, second, third in self.Tri])
def buildTransitionMatrix(self, tagged_corpus: list, train_size):
    """Build a row-normalised tag-to-tag transition table.

    :param tagged_corpus: sequence of ``(tag1, tag2)`` pairs.
    :param train_size: fraction of ``tagged_corpus`` (taken from the
        front) that is used for counting.
    :return: the table, also stored in ``self.TRANSITION_MATRIX``; entry
        ``[tag1][tag2]`` is the share of ``tag1`` pairs whose second
        element is ``tag2``, rounded to 6 decimals.
    """
    cut = int(train_size * len(tagged_corpus))
    train = tagged_corpus[:cut]
    random.shuffle(train)

    # construction of the transition matrix: raw counts first
    transition = ConditionalFreqDist()
    for tag1, tag2 in train:
        if tag1 not in transition:
            transition[tag1] = FreqDist()
        row = transition[tag1]
        if tag2 not in row:
            row[tag2] = 0.0
        row[tag2] += 1

    # then every row is divided by its own total
    for tag in transition.keys():
        row = transition[tag]
        somme = sum(row.values(), 0.0)
        for successor in row.keys():
            share = row[successor] / somme
            row[successor] = round(float(format(share, ".6f")), 6)

    self.TRANSITION_MATRIX = transition
    return transition
def __init__(self, sentences):
    """Count trigram continuations over ``sentences``.

    Each sentence is expanded into left-padded trigrams; the first two
    items of a trigram form the condition and the third item is the
    counted sample.  The result is stored in ``self.cfd``.
    """
    # FIXME should use smoothing here. I tried SimpleGoodTuringProbDist but
    # it returns zero probability for event with freq=1. Possibly due to
    # too small test corpus
    counts = ConditionalFreqDist()
    for sentence in sentences:
        for ngram in ngrams(sentence, 3, pad_left=True):
            history, following = ngram[:-1], ngram[-1]
            counts[history][following] += 1
    self.cfd = counts
def get_bigrams(self, text):
    """Return the successor frequency distributions of ``text``'s bigrams.

    A conditional frequency distribution is built from the bigrams of
    ``text``; the returned list holds one distribution per condition
    (first bigram element), in the distribution's iteration order.
    """
    cfd = ConditionalFreqDist(bigrams(text))
    return [cfd[condition] for condition in cfd]
def constructTransitionMatrix(self, sourceFilesList: list):
    """Build a trigram transition table from tag files.

    Every non-blank line of every file is upper-cased.  Lines that do not
    already start with ``<S>`` are wrapped as ``<S> line <E>``.  The text
    of *all* files is then tokenised on whitespace.

    Side effects: sets ``self.initialProbabilities`` (counts of tokens
    that directly follow ``<S>``), ``self.tags`` (distinct tokens),
    ``self.bigramDist`` (bigram counts) and ``self.TRANSITION_MATRIX``.

    :param sourceFilesList: paths of the files to read (decoded as
        windows-1256).
    :return: the transition table; ``table[token][(prev2, prev1)]`` is the
        relative frequency, rounded to 6 decimals, of that preceding pair
        among all trigrams ending in ``token``.
    """
    # construction of the transition matrix
    sentences = []  # accumulated across every file, not reset per file
    for fileName in sourceFilesList:
        # ``with`` guarantees the handle is closed even if reading fails
        with open(fileName, 'r', encoding="windows-1256") as file:
            for line in file:
                # strip only the line terminator; a final line without one
                # must not lose its last real character
                line = line.upper().rstrip('\n')
                if not line:
                    continue
                if line.startswith("<S>"):
                    sentences.append(line)
                else:
                    sentences.append('<S> ' + line + ' <E>')

    fileFinal = '\n'.join(sentences)
    tokens = [el for el in re.split(r"[\s\n]+", fileFinal) if el != '']

    self.initialProbabilities = FreqDist(
        current for previous, current in zip(tokens, tokens[1:])
        if previous == '<S>')
    self.tags = list(set(tokens))
    self.bigramDist = FreqDist(list(bigrams(tokens)))

    Trigrams = list(trigrams(tokens))
    cfd = ConditionalFreqDist((el[2], (el[0], el[1])) for el in Trigrams)
    for word in cfd.conditions():
        dist = cfd[word]
        # Take the total BEFORE overwriting counts: normalising through
        # freq() while mutating the same distribution would shrink the
        # denominator after each assignment and skew later entries.
        total = dist.N()
        for bigram in list(dist):
            dist[bigram] = round(dist[bigram] / total, 6)

    self.TRANSITION_MATRIX = cfd
    return cfd
def train_model_get_cosine_matrix(statements):
    """Return a word-by-word matrix of clipped ``npmi_scorer`` values.

    Each statement is split on whitespace.  Two words co-occur once for
    every occurrence of the second word in a statement that contains the
    first.  Every vocabulary pair is scored with ``npmi_scorer``; infinite
    and negative scores are replaced by 0.

    :param statements: iterable of strings.
    :return: ``pandas.DataFrame`` whose index and columns are the sorted
        vocabulary.
    """
    tokenized = [statement.split() for statement in statements]
    frequencies = FreqDist(w for sentence in tokenized for w in sentence)

    # Loop invariants, computed once instead of once per matrix cell.
    vocab = sorted(frequencies.keys())
    vocab_size = len(vocab)
    corpus_size = sum(frequencies.values())

    # Same counts as scanning the whole vocabulary for every statement,
    # but only the words actually present in a statement are visited.
    conditionalFrequencies = ConditionalFreqDist()
    for sentence in tokenized:
        for key in dict.fromkeys(sentence):  # distinct words, stable order
            for word in sentence:
                conditionalFrequencies[key][word] += 1

    pmi = np.array([
        [npmi_scorer(frequencies[worda], frequencies[wordb],
                     conditionalFrequencies[worda][wordb],
                     vocab_size, 2, corpus_size)
         for wordb in vocab]
        for worda in vocab
    ])
    pmi[np.isinf(pmi)] = -1
    pmi[pmi < 0] = 0

    pmi = pd.DataFrame(pmi)
    pmi.columns = vocab
    pmi.index = vocab
    return pmi
def test_tabulate(self):
    """tabulate() on an unknown condition must raise and leave the
    distribution without any conditions."""
    cfd = ConditionalFreqDist()
    self.assertEqual(cfd.conditions(), [])

    with pytest.raises(ValueError):
        # nonexistent keys shouldn't be added
        cfd.tabulate(conditions="BUG")

    self.assertEqual(cfd.conditions(), [])
def train(self):
    """
    Train the majority-class baseline: count, for every pair produced by
    ``get_tag_word_tuples()`` on each training sequence, how often the
    second element occurs with the first.  Context is ignored.  The counts
    are stored in ``self.word_pos_cfd``.
    """
    counts = ConditionalFreqDist()
    for seq_list in self.corpus.train:
        for condition, sample in seq_list.get_tag_word_tuples():
            counts[condition][sample] += 1
    self.word_pos_cfd = counts
def test_plot(self):
    """plot() on an unknown condition must not add that condition."""
    empty = ConditionalFreqDist()
    self.assertEqual(empty.conditions(), [])
    try:
        empty.plot(conditions="BUG")  # nonexistent keys shouldn't be added
    except Exception:
        # Any failure of plot() itself is tolerated: only its side effect
        # on conditions() is under test.  ``Exception`` (not a bare
        # ``except``) lets KeyboardInterrupt / SystemExit abort the run.
        pass
    self.assertEqual(empty.conditions(), [])
def find_language(string):
    """Print the UDHR Latin1 language whose word list covers ``string`` best.

    ``string`` is split on single spaces and only purely alphabetic words
    are kept.  For each UDHR file id containing ``'Latin1'`` the number of
    kept words found in that file's word list is counted.  The language
    with the highest count is printed with the share of matched words.
    Nothing is returned.
    """
    text = [word for word in string.split(" ") if word.isalpha()]
    avail_langs = [fileid for fileid in udhr.fileids() if 'Latin1' in fileid]

    # Without words (division by zero) or languages (empty ranking) there
    # is nothing to rank.
    if not text or not avail_langs:
        print("Cannot determine the language: no alphabetic words "
              "or no Latin1 UDHR files available.")
        return

    scores = []
    for lang in avail_langs:
        # Read the word list once per language instead of once per input
        # word, and use a set for O(1) membership tests.
        vocabulary = set(udhr.words(lang))
        hits = sum(1 for word in text if word in vocabulary)
        scores.append((lang, hits))

    # Stable ascending sort + last element: on ties the language that comes
    # last in ``avail_langs`` wins, as before.
    best_lang, best_hits = sorted(scores, key=lambda pair: pair[1])[-1]
    print("The most probable language of the text is {0} with {1:3.3f}% probability.".format(
        best_lang.replace('-Latin1', ''), 100 * best_hits / len(text)))
def __init__(self, corpus):
    """Initializer of the BigramWordCandidateProvider.

    Builds a conditional frequency distribution over the bigrams of
    ``corpus`` and stores it in ``self._cfd``.

    Args:
        corpus: An iterable of word strings.
    """
    self._cfd = ConditionalFreqDist(bigrams(corpus))
def train(self):
    """
    Construct the conditional frequencies and probabilities.

    ``self.tagged_sents`` is read as ``(word, tag)`` pairs.  The tag
    sequence is captured before ``self.replaceUnique()`` runs; everything
    else is computed afterwards.
    """
    # extract tags from sentences
    tag_sequence = [tag for (_word, tag) in self.tagged_sents]

    self.replaceUnique()

    # emission - probability that a certain tag is a certain word
    # e.g. probability that a VB is 'race'
    emissions = ConditionalFreqDist(
        [pair[::-1] for pair in self.tagged_sents])
    self.emission_frequencies = emissions
    self.tagset_size = len(emissions.conditions())
    self.emission_probabilities = ConditionalProbDist(emissions, MLEProbDist)

    # transition - tag followed by tag
    transitions = ConditionalFreqDist(bigrams(tag_sequence))
    self.transition_frequencies = transitions
    self.transition_probabilities = ConditionalProbDist(
        transitions, MLEProbDist)

    # word conditioned counts of tags
    self.word_tag_frequencies = ConditionalFreqDist(self.tagged_sents)
def conditional_freq(self):
    """Flatten the bigram counts into ``(first, second, count)`` triples.

    A conditional frequency distribution is built from
    ``self.bigram_list``; one triple is emitted per (condition, sample)
    entry, in the distribution's iteration order.
    """
    cfd = ConditionalFreqDist(self.bigram_list)
    return [
        (condition, sample, count)
        for condition, samples in cfd.items()
        for sample, count in samples.items()
    ]
def suf_tag_freq(self):
    """Count tags per word suffix.

    Only words that are keys of ``self.wt_freq`` and are not in
    ``self.c_words`` are considered.  For every such word and each of its
    tags, each suffix is credited with the word/tag count from
    ``self.wt_freq``.  The empty suffix ``''`` is credited once per
    (word, tag) pair.

    :return: conditional frequency distribution mapping suffix -> tag counts
    """
    cfd = ConditionalFreqDist()
    candidates = set(self.wt_freq.keys()) - set(self.c_words)
    for w in candidates:
        for t in self.wt_freq[w].keys():
            count = self.wt_freq[w][t]
            # ``min`` (not ``max``): the bound must cap the suffix length.
            # With ``max`` the cap never applied, and for short words
            # ``w[-suf_len:]`` returned the whole word again and again,
            # inflating its count.
            # NOTE(review): the upper bound is exclusive, so suffixes of
            # exactly max_suf_len (or the full word) are not counted --
            # confirm that is intended.
            for suf_len in xrange(1, min(self.max_suf_len, len(w))):
                suf = w[-suf_len:]
                cfd[suf].inc(t, count)
            # NOTE(review): assumed to run once per (word, tag), outside the
            # suffix loop -- confirm against the original layout.
            cfd[''].inc(t)
    return cfd
def tabulateWordsInAllGeners(self, theWords):
    """
    Find the distribution of a word within all Brown corpus genres and
    print it as a table (one row per genre, one column per word).

    @params theWords: the word/list of words to find info about
    """
    # A bare string would otherwise be iterated character by character
    # when used as the sample list.
    if isinstance(theWords, str):
        theWords = [theWords]

    genres = brown.categories()
    cdf = ConditionalFreqDist(
        (genre, word)
        for genre in genres
        for word in brown.words(categories=genre))
    cdf.tabulate(samples=theWords, conditions=genres)
def __init__(self, n, training_data):
    """Create an n order model using training_data.

    ``_make_ngram_tuples`` supplies ``(context, event)`` pairs; their
    counts are stored in ``self._cfd``, and ``self._estimators`` maps each
    context to its frequency distribution of events.
    """
    # Set n and train
    self._n = n
    pairs = _make_ngram_tuples(training_data, self._n)

    counts = ConditionalFreqDist()
    for context, event in pairs:
        counts[context][event] += 1
    self._cfd = counts

    self._estimators = {
        context: self._cfd[context] for context in self._cfd.conditions()
    }
def language_model(collection):
    """Return an MLE bigram model for ``collection``.

    The collection is tokenised with ``tokenize_collection``; the result is
    a ``ConditionalProbDist`` giving, for each word, the maximum-likelihood
    distribution of the word that follows it.
    """
    from nltk import ConditionalProbDist
    from nltk import ConditionalFreqDist
    from nltk import bigrams
    from nltk import MLEProbDist

    tokens = tokenize_collection(collection)
    successor_counts = ConditionalFreqDist(bigrams(tokens))
    return ConditionalProbDist(successor_counts, MLEProbDist)
def _train(self, tagged_corpus, cutoff=0, verbose=False): """ Initialize this C{ContextTagger}'s L{_context_to_tag} table based on the given training data. In particular, for each context C{I{c}} in the training data, set C{_context_to_tag[I{c}]} to the most frequent tag for that context. However, exclude any contexts that are already tagged perfectly by the backoff tagger(s). The old value of C{self._context_to_tag} (if any) is discarded. @param tagged_corpus: A tagged corpus. Each item should be a C{list} of C{(word, tag)} tuples. @param cutoff: If the most likely tag for a context occurs fewer than C{cutoff} times, then exclude it from the context-to-tag table for the new tagger. """ token_count = hit_count = 0 # A context is considered 'useful' if it's not already tagged # perfectly by the backoff tagger. useful_contexts = set() # Count how many times each tag occurs in each context. fd = ConditionalFreqDist() for sentence in tagged_corpus: tokens, tags = zip(*sentence) for index, (token, tag) in enumerate(sentence): # Record the event. token_count += 1 context = self.context(tokens, index, tags[:index]) if context is None: continue fd[context].inc(tag) # If the backoff got it wrong, this context is useful: if (self.backoff is None or tag != self.backoff.tag_one(tokens, index, tags[:index])): useful_contexts.add(context) # Build the context_to_tag table -- for each context, figure # out what the most likely tag is. Only include contexts that # we've seen at least `cutoff` times. for context in useful_contexts: best_tag = fd[context].max() hits = fd[context][best_tag] if hits > cutoff: self._context_to_tag[context] = best_tag hit_count += hits # Display some stats, if requested. if verbose: size = len(self._context_to_tag) backoff = 100 - (hit_count * 100.0)/ token_count pruning = 100 - (size * 100.0) / len(fd.conditions()) print "[Trained Unigram tagger:", print "size=%d, backoff=%.2f%%, pruning=%.2f%%]" % ( size, backoff, pruning)
def get_lookup_tagger_accuracy(test_set, lookup_tagger_basis, corpus):
    """Evaluate a lookup tagger built from the 200 most common basis items.

    :param test_set: data passed to the tagger's ``evaluate``.
    :param lookup_tagger_basis: sentences whose items are counted; the
        first element of each item is used as the word (assumes items are
        ``(word, tag)`` pairs -- TODO confirm).
    :param corpus: corpus whose ``tagged_words()`` provide, per word, the
        tag counts from which the most frequent tag is taken.
    :return: the value returned by ``UnigramTagger.evaluate``.
    """
    item_counts = FreqDist(
        [item for sent in lookup_tagger_basis for item in sent])
    tag_counts = ConditionalFreqDist(corpus.tagged_words())

    likely_tags = {}
    for item, _count in item_counts.most_common(200):
        word = item[0]
        likely_tags[word] = tag_counts[word].max()

    baseline_tagger = UnigramTagger(model=likely_tags)
    return baseline_tagger.evaluate(test_set)
def __get_conditional_freq_dist(self):
    """Build a conditional frequency distribution from the stored n-grams.

    For every n-gram the leading items (as a tuple) are the condition and
    the final item is the sample.  Progress is shown through ``trange``.
    """
    progress = trange(
        len(self.__ngram),
        desc=
        f'Creating Conditional frequency distributions for {len(self.__ngram[0])}-gram'
    )
    condition_pairs = [
        (tuple(self.__ngram[position][:-1]), self.__ngram[position][-1])
        for position in progress
    ]
    return ConditionalFreqDist(condition_pairs)
def display():
    """Plot lookup-tagger performance against model size.

    Model sizes are the powers of two from 2**0 to 2**14.  For each size
    the most frequent Brown 'news' words up to that size are handed to
    ``performance`` together with the word -> tag counts, and the results
    are drawn with pylab.
    """
    import pylab

    news_word_counts = FreqDist(brown.words(categories='news'))
    words_by_freq = news_word_counts.most_common(2**15)
    cfd = ConditionalFreqDist(brown.tagged_words(categories='news'))

    sizes = 2 ** pylab.arange(15)
    perfs = []
    for size in sizes:
        perfs.append(performance(cfd, words_by_freq[:size]))

    pylab.plot(sizes, perfs, '-bo')
    pylab.title('Lookup Tagger Performance with Varying Model Size')
    pylab.xlabel('Model Size')
    pylab.ylabel('Performance')
    pylab.show()
def conditional_dist():
    """Count filtered words per Gutenberg file.

    Each Gutenberg file id is a condition; the words of that file, after
    passing through ``freq_dist_filter``, are the counted samples.

    :return: the resulting ``ConditionalFreqDist``.
    """
    cfdist = ConditionalFreqDist()
    for fileid in corpus.gutenberg.fileids():
        kept_words = freq_dist_filter(corpus.gutenberg.words(fileid))
        for word in kept_words:
            # a missing sample counts as 0, so no explicit initialisation
            cfdist[fileid][word] += 1
    return cfdist
def __init__(self):
    """Start with empty probability models.

    ``del_probs`` and ``ins_probs`` are MLE distributions over empty
    frequency distributions; ``sub_probs`` is a conditional probability
    distribution over an empty conditional frequency distribution.
    """
    no_deletions = FreqDist()
    no_insertions = FreqDist()
    no_substitutions = ConditionalFreqDist()

    # MLE distribution: how likely each character is to be deleted
    self.del_probs = MLEProbDist(no_deletions)
    # MLE distribution: how likely each character is to be inserted
    self.ins_probs = MLEProbDist(no_insertions)
    # conditional distribution: how likely a given character is to be
    # replaced by another character
    self.sub_probs = ConditionalProbDist(no_substitutions, MLEProbDist)
def tabulateWordsInPeriods(self, theWords):
    """
    Find the distribution of words over the Inaugural corpus files and
    print it as a table.  Files are grouped by the first four characters
    of their id (presumably the year -- confirm against the corpus).  A
    corpus word counts for a target when its lower-cased form starts or
    ends with that target.

    @params theWords: the word/list of words to find info about
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(theWords, str):
        theWords = [theWords]

    cdf = ConditionalFreqDist()
    for textid in inaugural.fileids():
        period = textid[:4]
        for word in inaugural.words(textid):
            # lower-case once per corpus word, not once per target
            lowered = word.lower()
            for target in theWords:
                if lowered.startswith(target) or lowered.endswith(target):
                    cdf[period][target] += 1
    cdf.tabulate()
def generateText(self, text, word, num=15):
    """
    Print a chain of words: starting from the seed, each next word is the
    most frequent successor of the current word among the bigrams of
    ``text``.  Words are printed separated by single spaces.

    Generation stops early when the current word has no recorded
    successor (for example a seed that never occurs in ``text``).

    @params text: the target text
    @params word: the seed word
    @params num: the length of the generated text, set to 15 as a default
    """
    cfdist = ConditionalFreqDist(nltk.bigrams(text))
    for _ in range(num):
        print(word, end=' ')
        # max() raises on an empty distribution, so stop instead of
        # crashing when nothing ever followed this word.
        if word not in cfdist:
            break
        word = cfdist[word].max()