Example #1
import cPickle as pickle
import random
from nltk.model import NgramModel
from nltk.probability import LidstoneProbDist
import nltk

print "... loading text"
text_train = list(nltk.corpus.gutenberg.words('austen-emma.txt'))
print len(set(text_train))
text_test = list(nltk.corpus.gutenberg.words('austen-sense.txt'))

#with open('./../datasets/t5_train') as f:
#    text_train =(' '.join(pickle.load(f))).split(' . ')
#    random.shuffle(text_train)
#    text_train = (' . '.join(text_train)).split(' ')
#    
#with open('./../datasets/t5_test') as f:
#    text_test =(' '.join(pickle.load(f))).split(' . ')
#    random.shuffle(text_test)
#    text_test = (' . '.join(text_test)).split(' ')

print "... training model"
estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2) 
lm = NgramModel(3, text_train, estimator=estimator)

print "... results"
print lm.generate(50, ['dog'])
print lm.perplexity(text_test)
print lm.entropy(text_test)
Example #2
def generateNgramModel(corpusPath, corpusName):
    corpusdir = 'corpora/'  # Default corpus directory (unused here; corpusPath comes from the caller).
    generatedCorpus = PlaintextCorpusReader(corpusPath, corpusName)
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    ngrammodel = NgramModel(2, generatedCorpus.sents(), True, False,
                            estimator)  # bigram model, padded on the left only
    return ngrammodel
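A quick usage sketch for the helper above. The directory and file name are illustrative placeholders (not from the original snippet), and the imports listed are the ones the helper itself relies on under the NLTK 2.x API used throughout these examples.

from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.probability import LidstoneProbDist
from nltk.model import NgramModel
from nltk.tokenize import word_tokenize

# Hypothetical corpus directory and file name, for illustration only.
lm = generateNgramModel('corpora/', 'healthy_tweets.txt')
print lm.entropy(word_tokenize("Remember when we were all diagnosed with Bieber fever ?"))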
Example #3
def train_model(fdist, listObj, n):
    """ 
        @n - size of ngram
        @fdist - frequency distribution
        @listObj - ngram data list
        
        """

    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    lm = NgramModel(n, listObj, estimator=estimator)  # train on the n-gram data list, as the docstring describes
    return lm
Example #4
  def train(self, corpus):
    """Trains a language using a trigram model with stupid backoff
    to a bigram model with stupid backoff to a unigram model
    with plus one smoothing"""
    for sentence in corpus.corpus:
      for i in xrange(0, len(sentence.data)):
        token = sentence.data[i].word
        self.unigramCounts[token] += 1
        self.total += 1
        if i + 1 < len(sentence.data): 
            next = sentence.data[i+1].word
            self.bigramCounts[(token, next)] += 1
        if i + 2 < len(sentence.data):
            third = sentence.data[i+2].word
            self.trigramCounts[(token, next, third)] += 1

    train_tokens = brown.words()
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    self.trilm = NgramModel(3, train_tokens, True, False, estimator)
    self.bilm = NgramModel(2, train_tokens, True, False, estimator)
    self.unilm = NgramModel(1, train_tokens, True, False, estimator)
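The docstring above promises stupid backoff, but the excerpt only shows the counting pass and the NLTK models. Below is a minimal sketch of the scoring side in that spirit, assuming the counts built in train() are dict-like objects that return 0 for unseen keys; the method name, the 0.4 backoff factor, and len(self.unigramCounts) as the vocabulary size are assumptions for illustration, not the author's code.

  def scoreTrigram(self, w1, w2, w3):
    """Sketch of stupid backoff: relative trigram frequency if seen, otherwise
    back off to the bigram, then to a plus-one-smoothed unigram, scaling by 0.4
    at each backoff step (the factor is a conventional choice, not from the source)."""
    if self.trigramCounts[(w1, w2, w3)] > 0 and self.bigramCounts[(w1, w2)] > 0:
      return float(self.trigramCounts[(w1, w2, w3)]) / self.bigramCounts[(w1, w2)]
    if self.bigramCounts[(w2, w3)] > 0 and self.unigramCounts[w2] > 0:
      return 0.4 * float(self.bigramCounts[(w2, w3)]) / self.unigramCounts[w2]
    # Plus-one smoothing at the unigram level, as the docstring describes.
    vocab_size = len(self.unigramCounts)
    return 0.4 * 0.4 * float(self.unigramCounts[w3] + 1) / (self.total + vocab_size)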
Example #5
    def generate(self, length=100):
        """
        Print random text, generated using a trigram language model.

        :param length: The length of text to generate (default=100)
        :type length: int
        :seealso: NgramModel
        """
        if '_trigram_model' not in self.__dict__:
            print("Building ngram index...")
            estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
            self._trigram_model = NgramModel(3, self, estimator=estimator)
        text = self._trigram_model.generate(length)
        print(tokenwrap(text))
Example #6
    def calcWordProb(self):

        word_seq = ['foo', 'foo', 'foo', 'foo', 'bar', 'baz']

        text = "They become more expensive already. Mine is like 25. So horrible and they did less things than I did last time."
        text = nltk.word_tokenize(text.translate(None, ',.'))

        print text
        #estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
        #lm = NgramModel(2, word_seq, estimator)

        est = lambda freqdist, bins: LidstoneProbDist(freqdist, 0.2, bins)
        model = NgramModel(3, text, True, True, est, 21)

        print model.prob("more", text)
Example #7
    def __init__(self):
        cess_sents = cess.tagged_sents()
        self.uni_tag = ut(cess_sents)

        self.model = NgramModel(3, brown.words())

        self.translation = []
        self.dictionary = collections.defaultdict(lambda: 0)
        dictionaryFile = open("../corpus/Dictionary.txt", 'r')
        for translation in dictionaryFile:
            spanish, english = translation.split(" - ")
            spanish = spanish.decode('utf-8')
            self.dictionary[spanish] = collections.defaultdict(lambda: [])
            english = english.rstrip(';\n').split('; ')
            for pos in english:
                pos = pos.split(': ')
                self.dictionary[spanish][pos[0]] = pos[1].split(', ')

        self.sentences = []
        sentencesFile = open("../corpus/TestSet.txt", 'r')
        for sentence in sentencesFile:
            self.sentences.append(sentence.rstrip('\n'))
Example #8
spans = [span for span in training_spans]
training_offsets = [span[0] for span in spans]
train = []
for s in spans:
    train.append(training_raw[s[0]:s[1]])

testing_spans = WhitespaceTokenizer().span_tokenize(testing_raw)
spans = [span for span in testing_spans]
testing_offsets = [span[0] for span in spans]
test = []
for s in spans:
    test.append(testing_raw[s[0]:s[1]])

estimator = lambda fdist, bins: LidstoneProbDist(fdist, args.estimator_probability)
lm = NgramModel(args.num_grams, train, estimator=estimator)

t0 = 0
t1 = 1
current_best = ''
while t1 < len(test):
    perplexity = lm.perplexity(test[t0:t1])
    if perplexity > args.cutoff_max_perplexity:
        if (len(current_best) > 1):
            print current_best + '.'
            current_best = ''
        t0 = t1 + 1
        t1 = t0 + 1
    else:
        t1 += 1
        if t1 - t0 > args.min_sentence_length and perplexity < args.output_max_perplexity:
Example #9
			if parsed.word_type == "stem":
				stemmer = Stemmer.Stemmer('russian')
				words += stemmer.stemWords([inp])
			elif parsed.word_type == "surface_all":
				words += nltk.word_tokenize(inp)
			elif parsed.word_type == "surface_no_pm" or parsed.word_type[:7] == "suffix_":
				inp = inp.translate(None, string.punctuation)
				words += nltk.word_tokenize(inp)
			else:
				words += nltk.word_tokenize(inp)
			

if parsed.word_type[:7] == "suffix_":
	l = int(parsed.word_type.split("_")[1])
	words = [x[-l:] for x in words]

if parsed.unknown_word_freq:
	unknown_words = []
	# print "Removing unknown words"
	fq = FreqDist(words)
	for w, count in fq.iteritems():
		if count < parsed.unknown_word_freq:
			unknown_words.append(w)  # append the whole word (+= would add its individual characters)

	words[:] = [x if x not in unknown_words else "<UNK>" for x in words]

lm = NgramModel(n, words, estimator=estimator)
outf = open(output, "wb")
dill.dump(lm, outf, protocol=2)
outf.close()
Example #10
#!/usr/bin/env python
import nltk
from nltk import bigrams
from nltk import trigrams
from nltk.probability import LidstoneProbDist
from nltk.model import NgramModel

with open('./austen/persuasion.txt', 'r') as training_file:
    raw = training_file.read()
tokens = nltk.word_tokenize(raw)

with open('./austen/sense_and_sensibility.txt', 'r') as test_file:
    test = test_file.read()
test_list = nltk.word_tokenize(test)

estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
model = NgramModel(3, tokens, True, False, estimator)
tri = model.entropy(test_list)
print "tri-gram: " + str(tri)

model = NgramModel(2, tokens, True, False, estimator)
bi = model.entropy(test_list)
print "bi-gram: " + str(bi)
Example #11
import nltk

print("... build")
brown = nltk.corpus.brown
corpus = [word.lower() for word in brown.words()]

# Train on 95% of the corpus and test on the rest
spl = 95*len(corpus)/100
train = corpus[:spl]
test = corpus[spl:]

# Remove rare words from the corpus
fdist = nltk.FreqDist(w for w in train)
vocabulary = set(map(lambda x: x[0], filter(lambda x: x[1] >= 5, fdist.iteritems())))

train = map(lambda x: x if x in vocabulary else "*unknown*", train)
test = map(lambda x: x if x in vocabulary else "*unknown*", test)

print("... train")


from nltk.model import NgramModel
from nltk.probability import LidstoneProbDist

estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
lm = NgramModel(5, train, estimator=estimator)

print("len(corpus) = %s, len(vocabulary) = %s, len(train) = %s, len(test) = %s" % (len(corpus), len(vocabulary), len(train), len(test)))
print("perplexity(test) =", lm.perplexity(test))
Example #12
def process_plaintext(dir_path):
    reader = CategorizedPlaintextCorpusReader(dir_path,
                                              r'.*\.txt',
                                              cat_pattern=r'.+_.+_(.*)\.txt')
    facilitator_files = reader.fileids(categories='facilitator')
    participant_files = reader.fileids(categories='participant')
    print facilitator_files, participant_files

    #print reader.categories()
    #print len(reader.words())
    #print len(reader.sents())

    fac_words = [word for word in reader.words(facilitator_files)]
    par_words = [word for word in reader.words(participant_files)]

    fac_words = edit_tokens(fac_words)
    par_words = edit_tokens(par_words)

    speakers = ([(word, 'facilitator')
                 for word in reader.words(facilitator_files)] +
                [(word, 'participant')
                 for word in reader.words(participant_files)])

    features = get_features(speakers)

    size = int(len(features) * 0.3)
    nb_train = features[size:]
    nb_test = features[:size]

    classifier = nltk.NaiveBayesClassifier.train(nb_train)
    print "Classifier labels:", classifier.labels()
    print classifier.show_most_informative_features()
    print "Clasify test:", nltk.classify.accuracy(classifier, nb_test)
    #print classifier.classify(get_features(["Yolo", "bag", "sp"], False))

    #random.shuffle(speakers)
    three_quarters = int(len(speakers) * 0.75)
    train = speakers[:three_quarters]
    test = speakers[three_quarters:]

    est = lambda fdist, bins: nltk.probability.LaplaceProbDist(fdist)
    un_lm = NgramModel(1, train, estimator=est)
    bi_lm = NgramModel(2, train, estimator=est)
    tr_lm = NgramModel(3, train, estimator=est)
    qu_lm = NgramModel(4, train, estimator=est)
    pe_lm = NgramModel(5, train, estimator=est)
    print un_lm
    print bi_lm
    print tr_lm
    print qu_lm
    print pe_lm
    print "1 gram Perplexity:", un_lm.perplexity(test)
    print "2 gram Perplexity:", bi_lm.perplexity(test)
    print "3 gram Perplexity:", tr_lm.perplexity(test)
    print "4 gram Perplexity:", qu_lm.perplexity(test)
    print "5 gram Perplexity:", pe_lm.perplexity(test)

    print bi_lm.generate(10, ["uh", "sp"])

    fd_fac = nltk.FreqDist(fac_words)
    vocab_fac = fd_fac.keys()

    fd_par = nltk.FreqDist(par_words)
    vocab_par = fd_par.keys()

    print "Fac Vocab: ", len(vocab_fac)
    print "Fac Tokens: ", len(fac_words)
    print vocab_fac[:20]
    print "Par Vocab: ", len(vocab_par)
    print "Par Tokens: ", len(par_words)
    print vocab_par[:20]
    fd_par.plot(50)
Example #13
    forbidden_words = forbidden_words_doc.readline()
    # list of prepositions and other function words
    prep_doc = codecs.open(scriptdir + u'prepos_list.txt',
                           u'r',
                           encoding='utf-8')
    prepositions = []
    for line in prep_doc:
        if len(line) > 0: prepositions.append(strip_string(line))

    # Represent the input text as a list of tokens
    all_tokens = tokenize(text)

    # build the language model
    ngrams = 10
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.1)
    model = NgramModel(ngrams, all_tokens, estimator=estimator)
    print 'Language model built.'

    # randomly generate the first word
    random_word = generate_first_word(all_tokens)
    meisterwerk = [random_word]

    # generate 4 lines; syllable counts per line: 9/8/9/8
    first_line = generate_line(meisterwerk, 9)
    for word in first_line:
        meisterwerk.append(word)
    print '1st line generated.'

    second_line = generate_line(meisterwerk, 8)
    for word in second_line:
        meisterwerk.append(word)
Example #14

# Tokens contains the words from the Simple English Wikipedia plain-text file
# (Genesis, Reuters and Brown alternatives are left commented out below)
tokens = tokenize_file("simple_wikipedia_plaintext.txt")
#tokens = brown.words(categories='news')
#print tokens[1:100]
#tokens = list(genesis.words('english-kjv.txt'))
#tokens.extend(list(reuters.words(categories = 'trade')))
#tokens.extend(list(brown.words(categories='news')))
#tokens.extend(list(reuters.words(categories = 'earn')))

# estimator for smoothing the N-gram model
est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

# N-gram language model with 3-grams
model = NgramModel(N, tokens, pad_left=True, pad_right=True, estimator=est)
#model = NgramModel(N, tokens, estimator=est)

# Apply the language model to generate 50 words in sequence
#text_words = model.generate(50)

# Concatenate all words generated in a string separating them by a space.
#text = ' '.join([word for word in text_words])

# print the text
#print text

sentence = "This is a sample sentence."
print sentence
print "p:", sentence_probability(sentence, model)
print "p_m:", sentence_probability_modified(sentence, model)
Example #15
    # Read file
    f = io.open('/veu4/usuaris30/speech00/corpus/train/spanishlit_ninc_v' +
                version + '_nlm/' + categ + '.txt',
                encoding='utf8')
    g = f.read().lower()
    # Obtain tokenized words
    train = nltk.word_tokenize(g)
    print "e"
    # Remove rare words from the corpus
    # fdist = nltk.FreqDist(w for w in train)
    # vocabulary = set(map(lambda x: x[0], filter(lambda x: x[1] >= 5, fdist.iteritems())))
    # train1 = map(lambda x: x if x in vocabulary else "*unknown*", train)

    # Obtain the Language Model using WittenBellProbDist to smooth unseen events
    estimator = lambda fdist, bins: WittenBellProbDist(fdist, 10)
    lm[categ] = NgramModel(N, train, estimator=estimator)

    print "> Obtain language model of", categ, "... Done!"
print "> Obtain all language models... Done!"

# Load dictionary with: {category:tests}
n_categ = []
test_corpus = dict()
for categ in all_categ:
    files = os.listdir(
        '/veu4/usuaris30/speech00/corpus/testc/spanishlit_ninc_v' + version +
        '_nlm/' + categ)
    n_categ.append(len(files))
    tests = []
    for fi in files:
        f = io.open('/veu4/usuaris30/speech00/corpus/testc/spanishlit_ninc_v' +
Example #16
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.probability import LidstoneProbDist, WittenBellProbDist
from nltk.model import NgramModel
from nltk.tokenize import sent_tokenize, word_tokenize


corpusdir = 'corpora/' # Directory of corpus.
SickCorpus = PlaintextCorpusReader(corpusdir, 'sick_tweets.txt')
HealthyCorpus = PlaintextCorpusReader(corpusdir, 'healthy_tweets.txt')
estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

sick_model_1 = NgramModel(1, SickCorpus.sents(), True, False, estimator)
sick_model_2 = NgramModel(2, SickCorpus.sents(), True, False, estimator)

healthy_model_1 = NgramModel(1, HealthyCorpus.sents(), True, False, estimator)
healthy_model_2 = NgramModel(2, HealthyCorpus.sents(), True, False, estimator)

tweet = "Remember when we were all diagnosed with Bieber fever ? Lol"

print "sick_model_1 is: " + str(sick_model_1.perplexity(word_tokenize(tweet)))
print "sick_model_2 is: " + str(sick_model_2.perplexity(word_tokenize(tweet)))
print "healthy_model_1 is: " + str(healthy_model_1.perplexity(word_tokenize(tweet)))
print "healthy_model_2 is: " + str(healthy_model_2.perplexity(word_tokenize(tweet)))
Example #17
def trainModel():
    totalwords = abc.words() #+ genesis.words() + gutenberg.words() + webtext.words()
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    # Pass the Lidstone estimator defined above so it is actually used for smoothing.
    BigramModel = NgramModel(2, totalwords, estimator=estimator)
    UnigramModel = NgramModel(1, totalwords, estimator=estimator)
    return (UnigramModel, BigramModel)
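A short usage sketch for trainModel() above, assuming the same NLTK 2.x perplexity API seen in the other examples; the test sentence is a placeholder, and the snippet expects abc, NgramModel and LidstoneProbDist to be imported as in the surrounding file.

from nltk.tokenize import word_tokenize

unigram_lm, bigram_lm = trainModel()
test_tokens = word_tokenize("The quick brown fox jumps over the lazy dog .")
print "unigram perplexity:", unigram_lm.perplexity(test_tokens)
print "bigram perplexity:", bigram_lm.perplexity(test_tokens)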