Example No. 1
def distance_matrix(word_label_pairs):

    words, labels = zip(*word_label_pairs)
    unked_word_label_pairs = list(zip(unk(words), labels))  # materialise so the pairs can be re-iterated for each condition pair

    conditions  = set(labels)
    divergences = []
    for c1, c2 in pair_generator(conditions):

        fd1 = nltk.FreqDist([w for w, c in unked_word_label_pairs if c == c1])
        fd2 = nltk.FreqDist([w for w, c in unked_word_label_pairs if c == c2])

        P = nltk.MLEProbDist(fd1)
        Q = nltk.MLEProbDist(fd2)

        divergences.append(jensen_shannon_divergence(P, Q))

    n_conditions = len(conditions)
    distances = list(zip(divergences, pair_generator(conditions)))
    divergences = np.array(divergences).reshape((n_conditions, n_conditions))

    # plot that matrix
    cmap = plt.get_cmap('Blues')
    plt.pcolor(divergences, cmap=cmap)
    plt.xticks([x + .5 for x in range(n_conditions)], list(conditions), rotation=90)
    plt.yticks([x + .5 for x in range(n_conditions)], list(conditions))
    plt.title('Jensen-Shannon Divergence of conditional distributions')
    plt.ylabel('P()')
    plt.xlabel('Q()')
    plt.show()

    return distances
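Example No. 1 calls a jensen_shannon_divergence helper that is not shown. A minimal sketch of such a helper, assuming P and Q are NLTK probability distributions (for example nltk.MLEProbDist objects), might look like this:

import math

def jensen_shannon_divergence(P, Q):
    """Base-2 Jensen-Shannon divergence between two NLTK ProbDist objects."""
    support = set(P.samples()) | set(Q.samples())
    # mixture distribution M = (P + Q) / 2 over the joint support
    M = {x: 0.5 * (P.prob(x) + Q.prob(x)) for x in support}

    def kl(D):
        # KL(D || M); terms with D(x) == 0 contribute nothing
        return sum(D.prob(x) * math.log2(D.prob(x) / M[x])
                   for x in support if D.prob(x) > 0)

    return 0.5 * kl(P) + 0.5 * kl(Q)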
Example No. 2
    def _create_probabilities(self):
        self.probs_by_features = {
            features: nltk.MLEProbDist(freq_dist)
            for (features, freq_dist) in self.freqs_by_features.items()
        }
        if USE_NEXT_TAG:
            self.probs_by_tags = {
                features: nltk.MLEProbDist(freq_dist)
                for (features, freq_dist) in self.freqs_by_tags.items()
            }
        self.probs_by_prev_tag = {
            prev_features: nltk.MLEProbDist(freq_dist)
            for (prev_features, freq_dist) in self.freqs_by_prev_tag.items()
        }
        self.probs_by_tag = {
            tag: nltk.MLEProbDist(freq_dist)
            for (tag, freq_dist) in self.freqs_by_tag.items()
        }
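The freqs_by_* attributes are built elsewhere in the class. A self-contained sketch of the same pattern using NLTK primitives, with hypothetical (features, tag) observations:

import nltk

# hypothetical (features, tag) observations
observations = [(('suffix=ing',), 'VBG'),
                (('suffix=ing',), 'NN'),
                (('suffix=ed',), 'VBD')]

freqs_by_features = nltk.ConditionalFreqDist(observations)
probs_by_features = {feats: nltk.MLEProbDist(fd)
                     for feats, fd in freqs_by_features.items()}

print(probs_by_features[('suffix=ing',)].prob('VBG'))  # 0.5 under MLE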
Example No. 3
    def get_next(self, current, weighted_by_probability=False):

        next_freq = self.transition_probabilities[current]

        if weighted_by_probability:
            prob_dist = nltk.MLEProbDist(next_freq)
            return prob_dist.generate()
        else:
            prob_dist = nltk.UniformProbDist(next_freq)
            return prob_dist.generate()
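Both branches above draw a sample with ProbDistI.generate(); the only difference is whether the draw is frequency-weighted or uniform. A small sketch with hypothetical transition counts:

import nltk

next_freq = nltk.FreqDist({'b': 3, 'c': 1})  # hypothetical transition counts

weighted = nltk.MLEProbDist(next_freq)       # generate() returns 'b' about 75% of the time
uniform = nltk.UniformProbDist(next_freq)    # generate() returns 'b' and 'c' equally often

print(weighted.generate(), uniform.generate())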
Example No. 4
    def entropy(self, condition, base=None):
        """Return the entropy of the distribution of a given condition. If base
        is set as None (which it is as default), the log base of entropy is the
        number of possible outcomes in the distribution."""

        if condition == 'Ø' and 'Ø' not in self.conditions():
            condition = '#ALL#'

        prob_dist = nltk.MLEProbDist(self[condition])
        probs = [prob_dist.prob(bin_) for bin_ in prob_dist.samples()]
        if not base:
            base = len(self._possible_outcomes())
        return stats.entropy(probs, base=base)
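With the base set to the number of outcomes, the entropy is normalized so that a uniform distribution scores 1.0. A toy check with hypothetical counts, using scipy.stats as in the snippet and simply the number of observed outcomes as the base:

import nltk
from scipy import stats

fd = nltk.FreqDist({'a': 2, 'b': 2})          # hypothetical counts for one condition
prob_dist = nltk.MLEProbDist(fd)
probs = [prob_dist.prob(s) for s in prob_dist.samples()]

print(stats.entropy(probs, base=len(probs)))  # 1.0: uniform over two outcomes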
Example No. 5
    def rep(self):
        scores = self.get_scores()
        #sums = dict((i,sum([t[1] for t in scores[i]])) for i in scores.keys())
        sums = dict()
        for k in scores.keys():
            sums[k] = nltk.MLEProbDist(nltk.FreqDist(dict(scores[k])))

        def rate(cand):
            r = 0
            for ngramlength, dat in scores.items():
                for c, s in dat:
                    if c == cand:
                        r += ngramlength * sums[ngramlength].prob(c)
            return r

        return rate
Example No. 6
        def f(nick):
            bigram_frequency = defaultdict(nltk.FreqDist)
            # For each message
            for message in self.nicks[nick].messages:
                # Compute bigrams for the message
                bigrams = list(nltk.bigrams(nltk.word_tokenize(message)))

                if len(bigrams) < 1:
                    continue

                bigrams = [(0, bigrams[0][0])] + bigrams + [(bigrams[-1][1], 0)]

                # Put bigrams into frequency distribution
                for bigram in bigrams:
                    bigram_frequency[bigram[0]][bigram[1]] += 1

            for word, freq in bigram_frequency.items():
                self.nicks[nick].bigram_distribution[word] = nltk.MLEProbDist(freq)
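A hypothetical companion method (not part of the original snippet) showing how such a per-word bigram distribution could be used to generate a message, starting and stopping at the 0 sentinels:

    def generate_message(self, nick, max_words=20):
        """Hypothetical companion method: walk the per-word MLE bigram
        distributions, starting from the 0 start sentinel and stopping at
        the 0 end sentinel."""
        dist = self.nicks[nick].bigram_distribution
        word, out = 0, []
        while len(out) < max_words and word in dist:
            word = dist[word].generate()
            if word == 0:
                break
            out.append(word)
        return ' '.join(out)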
Example No. 7
def main():
    sents = create_tokens(CORPUS_FILENAME)
    train_corpus, test_corpus = train_test_split(sents)

    fd_1gram = ngram_freq_dist(train_corpus, ngram=1)
    cpd_1gram = nltk.MLEProbDist(fd_1gram)

    cfd_2gram = ngram_freq_dist(train_corpus, ngram=2)  # conditional frequency distribution for bigrams
    print('Nations:', cfd_2gram['nations'])

    cpd_2gram = nltk.ConditionalProbDist(cfd_2gram, nltk.MLEProbDist)  # conditional probability distribution for bigrams

    cfd_3gram = ngram_freq_dist(train_corpus, ngram=3)
    cpd_3gram = nltk.ConditionalProbDist(cfd_3gram, nltk.MLEProbDist)

    pws_2gram = probable_words('united states', cpd_2gram, 2)
    pws_3gram = probable_words('donald trump', cpd_3gram, 3)

    print('Probable words for "united states" using the 2-gram model:', pws_2gram)
    print('Probable words for "donald trump" using the 3-gram model:', pws_3gram)

    test_sent1 = 'donald president is trump'
    test_sent2 = 'donald trump is president'

    prob_sent1 = find_sent_prob(test_sent1, cpd_1gram, ngram=1)
    prob_sent2 = find_sent_prob(test_sent2, cpd_1gram, ngram=1)
    print('Sentence probability of {}:'.format(test_sent1), prob_sent1)
    print('Sentence probability of {}:'.format(test_sent2), prob_sent2)

    print('Entropy of 1 gram model', entropy(cpd_1gram, test_corpus, 1))
    print('Entropy of 2 gram model', entropy(cpd_2gram, test_corpus, 2))
    print('Entropy of 3 gram model', entropy(cpd_3gram, test_corpus, 3))

    print('Perplexity of 1 gram model', perplexity(cpd_1gram, test_corpus, 1))
    print('Perplexity of 2 gram model', perplexity(cpd_2gram, test_corpus, 2))
    print('Perplexity of 3 gram model', perplexity(cpd_3gram, test_corpus, 3))

    text_wiki = generate_txt_bigram_model(cpd_2gram, 'trump', numwords=10)
    print('Test sentence for trump:', text_wiki)
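The ngram_freq_dist, probable_words, find_sent_prob, entropy and perplexity helpers are not shown. Assuming the bigram ConditionalFreqDist is keyed on the preceding word, the resulting ConditionalProbDist can be queried like this (a hedged sketch):

p_states = cpd_2gram['united'].prob('states')  # P(states | united) under MLE
best_next = cpd_2gram['united'].max()          # most likely word to follow "united"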
Example No. 8
# Review the text files for cleanliness
# Describe this in the document
bbTokens = bbTokens[114:]
# no need to do this for King of the Wind

# Now we need to convert the tokens to all lowercase
bbWords = [w.lower() for w in bbTokens]
kwWords = [w.lower() for w in kwTokens]
# check the length of the list of words
bbDict['Tokens'] = len(bbWords)
kwDict['Tokens'] = len(kwWords)

# Bigram and trigram frequency / MLE probability distributions over the raw tokens
bbBigram = ngrams(bbTokens, 2)
freq_dist = nltk.FreqDist(bbBigram)
prob_dist = nltk.MLEProbDist(freq_dist)
numBigrams = freq_dist.N()

bbTrigram = ngrams(bbTokens, 3)
Tfreq_dist = nltk.FreqDist(bbTrigram)
Tprob_dist = nltk.MLEProbDist(Tfreq_dist)
numTrigrams = Tfreq_dist.N()

# Bigrams - Black Beauty
bbBigramList = list(nltk.bigrams(bbWords))
print(bbBigramList[:30])
# Bigrams - King of the Wind
kwBigramList = list(nltk.bigrams(kwWords))
print(kwBigramList[:30])

# Trigrams - Black Beauty
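Because the frequency distributions above are built over n-gram tuples, their MLE distributions are queried with tuples as well; a hedged usage note with hypothetical n-grams from the text:

p_bigram = prob_dist.prob(('Black', 'Beauty'))          # hypothetical bigram
p_trigram = Tprob_dist.prob(('the', 'black', 'horse'))  # hypothetical trigram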
Example No. 9
def main():
    with open(FILE_PATH, 'r') as f:
        data = f.read().lower().replace('\n', ' ')

    sents = tokenized_words(data)
    rev_sents = tokenized_rev_words(data)
    train_corpus = [word for sent in sents for word in sent]
    rev_train_corpus = [word for sent in rev_sents for word in sent]

    cfd_2gram = ngram_freq_dist(train_corpus, 2)
    cfd_2gram_rev = ngram_freq_dist(rev_train_corpus, 2)

    cpd_2gram = nltk.ConditionalProbDist(cfd_2gram, nltk.MLEProbDist)
    cpd_2gram_rev = nltk.ConditionalProbDist(cfd_2gram_rev, nltk.MLEProbDist)

    cfd_1gram = ngram_freq_dist(train_corpus)
    cpd_1gram = nltk.MLEProbDist(cfd_1gram)

    random_sentences = []
    random_pos_tags = []
    random_word_pos_tags = []

    # Generate 5000 candidate sentences at random
    for _ in range(5000):
        sent = generate_txt_bigram_model_random(cpd_2gram, cpd_2gram_rev,
                                                'education', 9)
        word_pos_tags = nltk.pos_tag(sent.split())
        pos_tags = [x[1] for x in word_pos_tags]

        random_word_pos_tags.append(word_pos_tags)
        random_sentences.append(sent)
        random_pos_tags.append(pos_tags)
    '''
    RULES:

    1. Determiner always comes before a noun.
    2. Noun can be followed by another noun phrase.
    3. Modals (could, will) can follow nouns.
    4. ..

    '''

    pos_template_dict = {
        'NN': ['NN', 'VB', 'VBD', 'MD', 'VBP', 'IN', 'VBZ', 'NNS'],
        'NNS': ['NN', 'VB', 'VBD', 'MD', 'VBP', 'IN', 'NN'],
        'NNP': ['NN', 'VB', 'VBD', 'MD', 'VBP', 'IN', 'NNS'],
        'NNPS': ['NN', 'VB', 'VBD', 'MD', 'VBP', 'IN'],
        'DT': ['NN', 'NNS', 'NNP', 'NNPS', 'VBP', 'JJ'],
        'JJ': ['CC'],
        'CC': ['NN', 'NNS', 'NNP', 'NNPS'],
        'VB': ['NN', 'DT', 'TO'],
        'VBD': ['NN', 'TO'],
        'VBG': ['IN', 'TO'],
        'VBP': ['VBG', 'RB', 'TO'],
        'VBN': ['RB', 'PRP', 'TO'],
        'VBZ': ['VBN'],
        'MD': ['VB', 'PRP'],
        'IN': ['DT', 'JJ'],
        'RB': ['NN', 'NNS'],
        'PRP': ['MD', 'VBD'],
        'TO': ['VB'],
    }

    filtered_sent = filter_sentences(random_pos_tags, random_sentences,
                                     pos_template_dict)
    #print_filtered_sent(filtered_sent)
    #print('------------------------------------------------------------------------------')

    dict_of_probs = sent_prob(filtered_sent, cpd_1gram, cpd_2gram)
    top_five = get_top_five(dict_of_probs)

    print('Top five tweets:\n')
    for tweet in top_five:
        print(tweet)
        print('=============')
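The filter_sentences helper is not shown. A hypothetical version consistent with the template dictionary above would keep a sentence only when every consecutive POS-tag pair is allowed:

def filter_sentences(pos_tag_lists, sentences, template):
    """Hypothetical version of the filter used above: keep a sentence only if
    every consecutive POS-tag pair is allowed by the template dictionary."""
    kept = []
    for tags, sent in zip(pos_tag_lists, sentences):
        if all(tags[i + 1] in template.get(tags[i], [])
               for i in range(len(tags) - 1)):
            kept.append(sent)
    return kept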
Example No. 10
    def assignProbabilities(self, person):
        fd = nltk.FreqDist(self.personMessageDict[person])
        probDist = nltk.MLEProbDist(fd)
        for y in probDist.samples():
            self.probList[y] = probDist.prob(y)
Example No. 11
        j = j.replace("--", "")
        j = j.replace("_", "")
        book = book + j
    print(len(book))
    print()

print(len(book))
print()

#region unigrams

words = nltk.word_tokenize(book)
unigram = nltk.ngrams(words, 1)

freq_dist_un = nltk.FreqDist(unigram)
prob_dist_un = nltk.MLEProbDist(freq_dist_un)
# number_of_unigrams = prob_dist_un.N()

if False:
    for i in freq_dist_un:
        print(i, " ", freq_dist_un[i], " ", prob_dist_un.prob(i))

#endregion

#region bigrams

sentences = nltk.sent_tokenize(book)
tokenized = map(nltk.tokenize.word_tokenize, sentences)

bigrams = map(ngrams_wrapper, tokenized)
bigram = list(itertools.chain.from_iterable(bigrams))
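The ngrams_wrapper helper is not defined in this excerpt; given how its output is flattened into bigram, it presumably returns the bigrams of one tokenized sentence. A hypothetical one-liner:

def ngrams_wrapper(tokens):
    # hypothetical helper matching the call above: the bigrams of one sentence
    return list(nltk.ngrams(tokens, 2))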
Example No. 12
    def train(self, training_data):
        '''
        Trains an n-gram model.
        '''
        if self.status != 0:
            self.clear()

        # parse training data, counting n-grams
        for alignment in training_data:
            graphs = ['<', '<', '<', '<']
            graphs.extend(alignment[0])
            graphs.append('>')
            phons = ['<', '<', '<', '<']
            phons.extend(alignment[1])
            phons.append('>')
            for i in range(4, len(phons)):
                self.uni[(graphs[i], phons[i])] += 1
                self.bi[((graphs[i - 1], graphs[i]), (phons[i - 1],
                                                      phons[i]))] += 1
                self.tri[((graphs[i - 2], graphs[i - 1], graphs[i]),
                          (phons[i - 2], phons[i - 1], phons[i]))] += 1
                self.quad[((graphs[i - 3], graphs[i - 2], graphs[i - 1],
                            graphs[i]), (phons[i - 3], phons[i - 2],
                                         phons[i - 1], phons[i]))] += 1
                self.quin[((graphs[i - 4], graphs[i - 3], graphs[i - 2],
                            graphs[i - 1], graphs[i]),
                           (phons[i - 4], phons[i - 3], phons[i - 2],
                            phons[i - 1], phons[i]))] += 1
                self.N[((graphs[i - 4], graphs[i - 3], graphs[i - 2],
                         graphs[i - 1], graphs[i]),
                        (phons[i - 4], phons[i - 3], phons[i - 2],
                         phons[i - 1], phons[i]))] += 1

        # convert the raw n-gram counts into (unsmoothed) MLE probability distributions
        self.uni = nltk.MLEProbDist(self.uni)
        self.bi = nltk.MLEProbDist(self.bi)
        self.tri = nltk.MLEProbDist(self.tri)
        self.quad = nltk.MLEProbDist(self.quad)
        self.quin = nltk.MLEProbDist(self.quin)

        # lambda estimation: deleted-interpolation-style weighting of the five n-gram orders
        for ngram in self.N:
            four_gram = ((ngram[0][1], ngram[0][2], ngram[0][3], ngram[0][4]),
                         (ngram[1][1], ngram[1][2], ngram[1][3], ngram[1][4]))
            three_gram = ((ngram[0][2], ngram[0][3], ngram[0][4]),
                          (ngram[1][2], ngram[1][3], ngram[1][4]))
            two_gram = ((ngram[0][3], ngram[0][4]), (ngram[1][3], ngram[1][4]))
            one_gram = (ngram[0][4], ngram[1][4])

            if self.quin.prob(ngram) >= self.quad.prob(
                    four_gram) and self.quin.prob(ngram) >= self.tri.prob(
                        three_gram) and self.quin.prob(ngram) >= self.bi.prob(
                            two_gram) and self.quin.prob(
                                ngram) >= self.uni.prob(one_gram):
                self.lambda5 += self.N.freq(ngram) * self.N.N()
            elif self.quad.prob(four_gram) >= self.tri.prob(
                    three_gram) and self.quad.prob(four_gram) >= self.bi.prob(
                        two_gram) and self.quad.prob(
                            four_gram) >= self.uni.prob(one_gram):
                self.lambda4 += self.N.freq(ngram) * self.N.N()
            elif self.tri.prob(three_gram) >= self.bi.prob(
                    two_gram) and self.tri.prob(three_gram) >= self.uni.prob(
                        one_gram):
                self.lambda3 += self.N.freq(ngram) * self.N.N()
            elif self.bi.prob(two_gram) >= self.uni.prob(one_gram):
                self.lambda2 += self.N.freq(ngram) * self.N.N()
            else:
                self.lambda1 += self.N.freq(ngram) * self.N.N()

        self.lambda5 = self.lambda5 / self.N.N()
        self.lambda4 = self.lambda4 / self.N.N()
        self.lambda3 = self.lambda3 / self.N.N()
        self.lambda2 = self.lambda2 / self.N.N()
        self.lambda1 = self.lambda1 / self.N.N()

        # set status
        self.status = 1
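The trained lambdas are presumably used to interpolate the five MLE estimates at prediction time; that code is not part of this excerpt. A hypothetical sketch, assuming graphs and phons are passed as tuples of the last five graphemes/phonemes:

    def interpolated_prob(self, graphs, phons):
        """Hypothetical companion to train(): linearly interpolate the five MLE
        estimates, with graphs and phons given as tuples of the last five
        graphemes/phonemes (g_{i-4}, ..., g_i) and (p_{i-4}, ..., p_i)."""
        return (self.lambda1 * self.uni.prob((graphs[4], phons[4]))
                + self.lambda2 * self.bi.prob((graphs[3:], phons[3:]))
                + self.lambda3 * self.tri.prob((graphs[2:], phons[2:]))
                + self.lambda4 * self.quad.prob((graphs[1:], phons[1:]))
                + self.lambda5 * self.quin.prob((graphs, phons)))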
Example No. 13
with open('{}/test_plain.txt'.format(args.data_id), 'r') as f:
    data = f.read()
test_plain = data.split('\n')
if '' in test_plain:
    test_plain.remove('')

test_data = []
for c_sent, p_sent in zip(test_cipher, test_plain):
    sent_tuples = [(c_sent[i], p_sent[i]) for i in range(len(c_sent))]
    test_data.append(sent_tuples)

if args.laplace:
    estim = lambda fd, bins: nltk.LaplaceProbDist(fd, bins)
else:
    estim = lambda fdist, bins: nltk.MLEProbDist(fdist)

# Train HMM on POS tagging instead of ciphers
if args.pos:
    # nltk.download('brown')
    # nltk.download('universal_tagset')
    from nltk.corpus import brown

    # list of (list of (str,str)), each top level list is a sentence, containing (word,tag) pairs
    brown_news_tagged = brown.tagged_sents(categories='news',
                                           tagset='universal')[:2000]
    n = len(brown_news_tagged)

    # Clean up sentences from brown and build sets of states and symbols
    tag_re = re.compile(r'[*]|--|[^+*-]+')
    tag_set = set()
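The excerpt stops before the HMM is actually trained. A hedged sketch of how the estim estimator is typically handed to NLTK's HMM trainer (symbol_set and train_data stand in for names from the omitted setup):

# hypothetical continuation: tag_set/symbol_set and train_data come from the omitted setup
trainer = nltk.HiddenMarkovModelTrainer(states=list(tag_set), symbols=list(symbol_set))
tagger = trainer.train_supervised(train_data, estimator=estim)
tagger.test(test_data)  # prints tagging accuracy on the held-out sentences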
Example No. 14
def get_word_dist(text):
    words = nltk.word_tokenize(text)
    # most_common() returns (word, count) pairs, which MLEProbDist cannot use directly;
    # rebuild a FreqDist restricted to the 200 most common words instead
    freq = nltk.FreqDist(dict(nltk.FreqDist(words).most_common(200)))
    dist = nltk.MLEProbDist(freq)
    return dist