def shakespeare_words():
    """Lazily concatenate the words of every file in the Shakespeare corpus.

    :return: a single iterator over all words, file by file
    """
    per_file_words = (shakespeare.words(fid) for fid in shakespeare.fileids())
    return itertools.chain.from_iterable(per_file_words)
from nltk.corpus import brown, shakespeare from nltk.probability import LidstoneProbDist from nltk.model.ngram import NgramModel ##todo: try shakespeare corpus NGRAM_MODEL_N = 3 #TRAIN = brown.words(categories='lore') ## just a list of strings TRAIN = shakespeare.words() ESTIMATOR = lambda fdist, bins: LidstoneProbDist(fdist, 0.2) lm = NgramModel(NGRAM_MODEL_N, TRAIN, estimator=ESTIMATOR) print lm print lm.generate(40) print 'done'
def __init__(self, limit=1000):
    """Collect the ``limit`` highest-frequency bigrams from one play.

    Only the first file of the Shakespeare corpus is used, with
    punctuation stripped before counting.
    """
    # TODO: Read all of shakespeare into words?
    first_play = shakespeare.fileids()[0]
    cleaned_words = remove_punctuation(shakespeare.words(first_play))
    self.finder = BigramCollocationFinder.from_words(cleaned_words)
    self.bigrams = self.finder.nbest(bigram_measures.raw_freq, limit)
from nltk.corpus import brown
from nltk.corpus import shakespeare
import string

if __name__ == '__main__':
    num_simulations = 10000
    alphabet = list(string.ascii_lowercase)
    word_length = 5
    start_state = ['t']

    # words = [w.lower() for w in brown.words()]
    # IMPROVED: build the vocabulary in one pass with a set comprehension
    # instead of materializing an intermediate all_words list and then
    # wrapping it in set([...]). Set membership keeps eval_function O(1).
    words = {w.lower()
             for book in shakespeare.fileids()
             for w in shakespeare.words(book)}

    # Reward 1 iff the letter sequence spells a corpus word.
    eval_function = lambda word: 1 if ''.join(word) in words else 0
    mcts = TextMCTS(alphabet, word_length, eval_function)

    # Grow the word one letter per search until it reaches word_length.
    state = start_state
    while len(state) < word_length:
        state = mcts.search(state, num_simulations)
        print(state)

    generated_word = ''.join(state)
    print("generated word: %s" % generated_word)
    print("is in corpus: %s" % (generated_word in words))
hamlet_all = nltk.corpus.gutenberg.raw("shakespeare-macbeth.txt") #entire text print(hamlet) print(hamlet_sentences) print(hamlet_all) #bringing in your own text in as an NLTK text object from nltk.corpus import PlaintextCorpusReader dir = 'C:/Users/adam_/Documents/MSDA/Data Foundations/' file = 'IS6713_syllabus.txt' syllabus = PlaintextCorpusReader(dir, file) syllabus.words() #ERROR #conditional frequency distributions provide handy tables and plot from nltk import ConditionalFreqDist as CFD cfd=CFD((fileid, len(word)) for fileid in shakespeare.fileids() \ for word in shakespeare.words(fileid)[:20000] if len(word)>3) #loop through each file in the Shakespeare collection and return the frequency distribution for words greater than #3 characters long for each file truncating each file after the first 20,000 words cfd.tabulate() cfd.plot() cfd[u'a_and_c.xml'] #'u' indicates unicode parsing of the source file from nltk.corpus import stopwords #Brown stopword list stopwords.words('english') #note all stopwords are lowercase from nltk.tokenize import word_tokenize sentence = "This is an example showing off stop word filtration" stopwords = set(stopwords.words('english')) words = word_tokenize(sentence) print(words) filtered_sentences = [
def words():
    """Return every word of the Shakespeare corpus as one flat list."""
    collected = []
    for fileid in shakespeare.fileids():
        collected.extend(shakespeare.words(fileid))
    return collected