import cPickle as pickle
import random

import nltk
from nltk.model import NgramModel
from nltk.probability import LidstoneProbDist

print "... loading text"
text_train = list(nltk.corpus.gutenberg.words('austen-emma.txt'))
print len(set(text_train))
text_test = list(nltk.corpus.gutenberg.words('austen-sense.txt'))

#with open('./../datasets/t5_train') as f:
#    text_train = (' '.join(pickle.load(f))).split(' . ')
#    random.shuffle(text_train)
#    text_train = (' . '.join(text_train)).split(' ')
#
#with open('./../datasets/t5_test') as f:
#    text_test = (' '.join(pickle.load(f))).split(' . ')
#    random.shuffle(text_test)
#    text_test = (' . '.join(text_test)).split(' ')

print "... training model"
estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
lm = NgramModel(3, text_train, estimator=estimator)

print "... results"
print lm.generate(50, ['dog'])
print lm.perplexity(text_test)
print lm.entropy(text_test)
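# The last two numbers are directly related: in the old nltk.model.NgramModel,
# perplexity is defined as 2 ** entropy. A minimal sanity check (an addition
# for illustration, not part of the original script):
print abs(lm.perplexity(text_test) - 2 ** lm.entropy(text_test)) < 1e-6  # True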
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.model import NgramModel
from nltk.probability import LidstoneProbDist

def generateNgramModel(corpusPath, corpusName):
    generatedCorpus = PlaintextCorpusReader(corpusPath, corpusName)
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    # Bigram model, padded on the left only.
    ngrammodel = NgramModel(2, generatedCorpus.sents(), True, False, estimator)
    return ngrammodel
def train_model(fdist, listObj, n):
    """
    @n - size of ngram
    @fdist - frequency distribution
    @listObj - token list to train on
    """
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    # NgramModel is trained on the token list, not the frequency
    # distribution; the estimator builds its own fdist per context.
    lm = NgramModel(n, listObj, estimator=estimator)
    return lm
def train(self, corpus):
    """Trains a language model using a trigram model with stupid backoff
    to a bigram model with stupid backoff to a unigram model with
    plus-one smoothing."""
    for sentence in corpus.corpus:
        for i in xrange(0, len(sentence.data)):
            token = sentence.data[i].word
            self.unigramCounts[token] += 1
            self.total += 1
            if i + 1 < len(sentence.data):
                next = sentence.data[i + 1].word
                self.bigramCounts[(token, next)] += 1
                if i + 2 < len(sentence.data):
                    third = sentence.data[i + 2].word
                    self.trigramCounts[(token, next, third)] += 1
    train_tokens = brown.words()
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    self.trilm = NgramModel(3, train_tokens, True, False, estimator)
    self.bilm = NgramModel(2, train_tokens, True, False, estimator)
    self.unilm = NgramModel(1, train_tokens, True, False, estimator)
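# The docstring promises stupid backoff, but only the counting appears in this
# excerpt. A minimal sketch of a scoring method that could sit beside train
# (the name stupidBackoffScore and the conventional 0.4 backoff factor are
# assumptions, not taken from the original class):
def stupidBackoffScore(self, prev2, prev, token, alpha=0.4):
    # Trigram relative frequency if the trigram was seen.
    if self.trigramCounts[(prev2, prev, token)] > 0:
        return self.trigramCounts[(prev2, prev, token)] / float(self.bigramCounts[(prev2, prev)])
    # Back off to the bigram, discounted by alpha.
    if self.bigramCounts[(prev, token)] > 0:
        return alpha * self.bigramCounts[(prev, token)] / float(self.unigramCounts[prev])
    # Unigram fallback with plus-one smoothing, as the docstring describes.
    return alpha * alpha * (self.unigramCounts[token] + 1.0) / (self.total + len(self.unigramCounts))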
def generate(self, length=100):
    """
    Print random text, generated using a trigram language model.

    :param length: The length of text to generate (default=100)
    :type length: int
    :seealso: NgramModel
    """
    if '_trigram_model' not in self.__dict__:
        print("Building ngram index...")
        estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
        self._trigram_model = NgramModel(3, self, estimator=estimator)
    text = self._trigram_model.generate(length)
    print(tokenwrap(text))
def calcWordProb(self):
    word_seq = ['foo', 'foo', 'foo', 'foo', 'bar', 'baz']
    text = ("They become more expensive already. Mine is like 25. "
            "So horrible and they did less things than I did last time.")
    # Strip commas and periods (Python 2 str.translate), then tokenize.
    text = nltk.word_tokenize(text.translate(None, ',.'))
    print text
    #estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    #lm = NgramModel(2, word_seq, estimator)
    est = lambda freqdist, bins: LidstoneProbDist(freqdist, 0.2, bins)
    model = NgramModel(3, text, True, True, est, 21)
    # prob expects the (n-1)-word context, not the whole token list.
    print model.prob("more", ["They", "become"])
def __init__(self):
    cess_sents = cess.tagged_sents()
    self.uni_tag = ut(cess_sents)
    self.model = NgramModel(3, brown.words())
    self.translation = []
    self.dictionary = collections.defaultdict(lambda: 0)
    dictionaryFile = open("../corpus/Dictionary.txt", 'r')
    for translation in dictionaryFile:
        spanish, english = translation.split(" - ")
        spanish = spanish.decode('utf-8')
        self.dictionary[spanish] = collections.defaultdict(lambda: [])
        english = english.rstrip(';\n').split('; ')
        for pos in english:
            pos = pos.split(': ')
            self.dictionary[spanish][pos[0]] = pos[1].split(', ')
    self.sentences = []
    sentencesFile = open("../corpus/TestSet.txt", 'r')
    for sentence in sentencesFile:
        self.sentences.append(sentence.rstrip('\n'))
spans = [span for span in training_spans]
training_offsets = [span[0] for span in spans]
train = []
for s in spans:
    train.append(training_raw[s[0]:s[1]])

testing_spans = WhitespaceTokenizer().span_tokenize(testing_raw)
spans = [span for span in testing_spans]
testing_offsets = [span[0] for span in spans]
test = []
for s in spans:
    test.append(testing_raw[s[0]:s[1]])

estimator = lambda fdist, bins: LidstoneProbDist(fdist, args.estimator_probability)
lm = NgramModel(args.num_grams, train, estimator=estimator)

# Grow a token window until its perplexity exceeds the cutoff, then emit it.
t0 = 0
t1 = 1
current_best = ''
while t1 < len(test):
    perplexity = lm.perplexity(test[t0:t1])
    if perplexity > args.cutoff_max_perplexity:
        if len(current_best) > 1:
            print current_best + '.'
        current_best = ''
        t0 = t1 + 1
        t1 = t0 + 1
    else:
        t1 += 1
        if t1 - t0 > args.min_sentence_length and perplexity < args.output_max_perplexity:
if parsed.word_type == "stem":
    stemmer = Stemmer.Stemmer('russian')
    words += stemmer.stemWords([inp])
elif parsed.word_type == "surface_all":
    words += nltk.word_tokenize(inp)
elif parsed.word_type == "surface_no_pm" or parsed.word_type[:7] == "suffix_":
    inp = inp.translate(None, string.punctuation)
    words += nltk.word_tokenize(inp)
else:
    words += nltk.word_tokenize(inp)

if parsed.word_type[:7] == "suffix_":
    l = int(parsed.word_type.split("_")[1])
    words = [x[-l:] for x in words]

if parsed.unknown_word_freq:
    unknown_words = []
    # print "Removing unknown words"
    fq = FreqDist(words)
    for w, count in fq.iteritems():
        if count < parsed.unknown_word_freq:
            unknown_words.append(w)  # append the word itself, not its characters
    words[:] = [x if x not in unknown_words else "<UNK>" for x in words]

lm = NgramModel(n, words, estimator=estimator)

outf = open(output, "wb")
dill.dump(lm, outf, protocol=2)
outf.close()
#!/usr/bin/env python
import nltk
from nltk import bigrams
from nltk import trigrams
from nltk.probability import LidstoneProbDist
from nltk.model import NgramModel

with open('./austen/persuasion.txt', 'r') as training_file:
    raw = training_file.read()
tokens = nltk.word_tokenize(raw)

with open('./austen/sense_and_sensibility.txt', 'r') as test_file:
    test = test_file.read()
test_list = nltk.word_tokenize(test)

estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

model = NgramModel(3, tokens, True, False, estimator)
tri = model.entropy(test_list)
print "tri-gram: " + str(tri)

model = NgramModel(2, tokens, True, False, estimator)
bi = model.entropy(test_list)
print "bi-gram: " + str(bi)
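# Lower cross-entropy means the held-out novel is predicted better. For a
# baseline, the same pattern extends to a unigram model (an addition for
# comparison, not part of the original script):
model = NgramModel(1, tokens, True, False, estimator)
uni = model.entropy(test_list)
print "uni-gram: " + str(uni)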
import nltk

print("... build")
brown = nltk.corpus.brown
corpus = [word.lower() for word in brown.words()]

# Train on 95% of the corpus and test on the rest
spl = 95 * len(corpus) / 100
train = corpus[:spl]
test = corpus[spl:]

# Remove rare words from the corpus
fdist = nltk.FreqDist(w for w in train)
vocabulary = set(map(lambda x: x[0], filter(lambda x: x[1] >= 5, fdist.iteritems())))
train = map(lambda x: x if x in vocabulary else "*unknown*", train)
test = map(lambda x: x if x in vocabulary else "*unknown*", test)

print("... train")
from nltk.model import NgramModel
from nltk.probability import LidstoneProbDist
estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
lm = NgramModel(5, train, estimator=estimator)

print("len(corpus) = %s, len(vocabulary) = %s, len(train) = %s, len(test) = %s"
      % (len(corpus), len(vocabulary), len(train), len(test)))
print("perplexity(test) = %s" % lm.perplexity(test))
def process_plaintext(dir_path):
    reader = CategorizedPlaintextCorpusReader(dir_path, r'.*\.txt',
                                              cat_pattern=r'.+_.+_(.*)\.txt')
    facilitator_files = reader.fileids(categories='facilitator')
    participant_files = reader.fileids(categories='participant')
    print facilitator_files, participant_files
    #print reader.categories()
    #print len(reader.words())
    #print len(reader.sents())

    fac_words = [word for word in reader.words(facilitator_files)]
    par_words = [word for word in reader.words(participant_files)]
    fac_words = edit_tokens(fac_words)
    par_words = edit_tokens(par_words)

    speakers = ([(word, 'facilitator') for word in reader.words(facilitator_files)] +
                [(word, 'participant') for word in reader.words(participant_files)])
    features = get_features(speakers)

    size = int(len(features) * 0.3)
    nb_train = features[size:]
    nb_test = features[:size]
    classifier = nltk.NaiveBayesClassifier.train(nb_train)
    print "Classifier labels:", classifier.labels()
    print classifier.show_most_informative_features()
    print "Classify test:", nltk.classify.accuracy(classifier, nb_test)
    #print classifier.classify(get_features(["Yolo", "bag", "sp"], False))

    #random.shuffle(speakers)
    three_quarters = int(len(speakers) * 0.75)
    train = speakers[:three_quarters]
    test = speakers[three_quarters:]
    est = lambda fdist, bins: nltk.probability.LaplaceProbDist(fdist)
    un_lm = NgramModel(1, train, estimator=est)
    bi_lm = NgramModel(2, train, estimator=est)
    tr_lm = NgramModel(3, train, estimator=est)
    qu_lm = NgramModel(4, train, estimator=est)
    pe_lm = NgramModel(5, train, estimator=est)
    print un_lm
    print bi_lm
    print tr_lm
    print qu_lm
    print pe_lm
    print "1 gram Perplexity:", un_lm.perplexity(test)
    print "2 gram Perplexity:", bi_lm.perplexity(test)
    print "3 gram Perplexity:", tr_lm.perplexity(test)
    print "4 gram Perplexity:", qu_lm.perplexity(test)
    print "5 gram Perplexity:", pe_lm.perplexity(test)
    print bi_lm.generate(10, ["uh", "sp"])

    fd_fac = nltk.FreqDist(fac_words)
    vocab_fac = fd_fac.keys()
    fd_par = nltk.FreqDist(par_words)
    vocab_par = fd_par.keys()
    print "Fac Vocab: ", len(vocab_fac)
    print "Fac Tokens: ", len(fac_words)
    print vocab_fac[:20]
    print "Par Vocab: ", len(vocab_par)
    print "Par Tokens: ", len(par_words)
    print vocab_par[:20]
    fd_par.plot(50)
forbidden_words = forbidden_words_doc.readline()

# List of prepositions and other function words
prep_doc = codecs.open(scriptdir + u'prepos_list.txt', u'r', encoding='utf-8')
prepositions = []
for line in prep_doc:
    if len(line) > 0:
        prepositions.append(strip_string(line))

# Represent the input text as a list of tokens
all_tokens = tokenize(text)

# Build the language model
ngrams = 10
estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.1)
model = NgramModel(ngrams, all_tokens, estimator=estimator)
print 'Language model built.'

# Randomly generate the first word
random_word = generate_first_word(all_tokens)
meisterwerk = [random_word]

# Generate 4 lines; syllable counts per line: 9/8/9/8
first_line = generate_line(meisterwerk, 9)
for word in first_line:
    meisterwerk.append(word)
print '1st line generated.'

second_line = generate_line(meisterwerk, 8)
for word in second_line:
    meisterwerk.append(word)
# Tokens contains the words from the Simple Wikipedia plain-text dump
tokens = tokenize_file("simple_wikipedia_plaintext.txt")
#tokens = brown.words(categories='news')
#print tokens[1:100]
#tokens = list(genesis.words('english-kjv.txt'))
#tokens.extend(list(reuters.words(categories='trade')))
#tokens.extend(list(brown.words(categories='news')))
#tokens.extend(list(reuters.words(categories='earn')))

# Estimator for smoothing the N-gram model
est = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

# N-gram language model, padded on both sides
model = NgramModel(N, tokens, pad_left=True, pad_right=True, estimator=est)
#model = NgramModel(N, tokens, estimator=est)

# Apply the language model to generate 50 words in sequence
#text_words = model.generate(50)
# Concatenate all words generated in a string separating them by a space.
#text = ' '.join([word for word in text_words])
# print the text
#print text

sentence = "This is a sample sentence."
print sentence
print "p:", sentence_probability(sentence, model)
print "p_m:", sentence_probability_modified(sentence, model)
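# sentence_probability and sentence_probability_modified are not defined in
# this excerpt. A plausible minimal sketch of the former (the log-space
# chain-rule scoring is an assumption about the missing helper, not its
# actual code):
def sentence_probability(sentence, model, n=N):
    words = nltk.word_tokenize(sentence)
    neg_log2 = 0.0
    for i in range(n - 1, len(words)):
        # NgramModel.logprob returns -log2 p(word | context)
        neg_log2 += model.logprob(words[i], words[i - (n - 1):i])
    return 2 ** -neg_log2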
# Read file
f = io.open('/veu4/usuaris30/speech00/corpus/train/spanishlit_ninc_v' +
            version + '_nlm/' + categ + '.txt', encoding='utf8')
g = f.read().lower()

# Obtain tokenized words
train = nltk.word_tokenize(g)
print "e"

# Remove rare words from the corpus
# fdist = nltk.FreqDist(w for w in train)
# vocabulary = set(map(lambda x: x[0], filter(lambda x: x[1] >= 5, fdist.iteritems())))
# train1 = map(lambda x: x if x in vocabulary else "*unknown*", train)

# Obtain the language model, using WittenBellProbDist to smooth unseen events
estimator = lambda fdist, bins: WittenBellProbDist(fdist, 10)
lm[categ] = NgramModel(N, train, estimator=estimator)
print "> Obtain language model of", categ, "... Done!"

print "> Obtain all language models... Done!"

# Load dictionary with: {category: tests}
n_categ = []
test_corpus = dict()
for categ in all_categ:
    files = os.listdir('/veu4/usuaris30/speech00/corpus/testc/spanishlit_ninc_v' +
                       version + '_nlm/' + categ)
    n_categ.append(len(files))
    tests = []
    for fi in files:
        f = io.open('/veu4/usuaris30/speech00/corpus/testc/spanishlit_ninc_v' +
import os
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
from nltk.probability import LidstoneProbDist, WittenBellProbDist
from nltk.model import NgramModel
from nltk.tokenize import sent_tokenize, word_tokenize

corpusdir = 'corpora/'  # Directory of corpus.
SickCorpus = PlaintextCorpusReader(corpusdir, 'sick_tweets.txt')
HealthyCorpus = PlaintextCorpusReader(corpusdir, 'healthy_tweets.txt')

estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)

sick_model_1 = NgramModel(1, SickCorpus.sents(), True, False, estimator)
sick_model_2 = NgramModel(2, SickCorpus.sents(), True, False, estimator)
healthy_model_1 = NgramModel(1, HealthyCorpus.sents(), True, False, estimator)
healthy_model_2 = NgramModel(2, HealthyCorpus.sents(), True, False, estimator)

tweet = "Remember when we were all diagnosed with Bieber fever ? Lol"

print "sick_model_1 is: " + str(sick_model_1.perplexity(word_tokenize(tweet)))
print "sick_model_2 is: " + str(sick_model_2.perplexity(word_tokenize(tweet)))
print "healthy_model_1 is: " + str(healthy_model_1.perplexity(word_tokenize(tweet)))
print "healthy_model_2 is: " + str(healthy_model_2.perplexity(word_tokenize(tweet)))
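# A natural follow-up (an addition, not in the original snippet): label the
# tweet with whichever bigram model is less perplexed by it.
tweet_tokens = word_tokenize(tweet)
if sick_model_2.perplexity(tweet_tokens) < healthy_model_2.perplexity(tweet_tokens):
    print "bigram models label the tweet: sick"
else:
    print "bigram models label the tweet: healthy"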
def trainModel():
    totalwords = abc.words()  #+ genesis.words() + gutenberg.words() + webtext.words()
    estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
    # Pass the estimator so the models are smoothed rather than default MLE.
    BigramModel = NgramModel(2, totalwords, estimator=estimator)
    UnigramModel = NgramModel(1, totalwords, estimator=estimator)
    return (UnigramModel, BigramModel)
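# Illustrative usage of the returned pair (an assumption about how a caller
# might consume it, not taken from the original code):
unigram_model, bigram_model = trainModel()
print bigram_model.generate(20, ["the"])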