# Collect the tokens of hitler_speeches.txt that are neither bare symbol
# characters nor entries of the xkcd simple word list, and pickle the
# resulting list to hnotsimptoks.p.
import pickle
import textstats

# read in the source text
with open('hitler_speeches.txt', encoding='utf-8') as f:
    htxt = f.read()

# obtain new list of word tokens
htoks = textstats.getTokens(htxt)

# remove symbols: drop every token that is exactly one of these characters
symbols = list("~!@#$%^&*()_+-=`{}[]|\\:;\"',./<>?")
htoks_nosym = [t for t in htoks if t not in symbols]

# open a pickled version of the xkcd simple word list
# see https://xkcd.com/simplewriter/
# NOTE: unpickling can execute arbitrary code; only load a pickle file that
# comes from a trusted source.
with open('xkcd_simple_words.p', 'rb') as f:
    xkcd_simp = pickle.load(f)

# create new list of toks not in the xkcd_simp list
# NOTE(review): if xkcd_simp is a list, each membership test is a linear
# scan; converting it to a set would speed this up -- confirm its type first.
hnotsimptoks = [t for t in htoks_nosym if t not in xkcd_simp]

# save the result; protocol -1 selects the highest pickle protocol available
with open('hnotsimptoks.p', 'wb') as f:
    pickle.dump(hnotsimptoks, f, -1)
# Christian Clark, [email protected], 29 September 2014
#
# Writes token/type counts for the King James Bible to
# bigram_bible_austen_out.txt and builds a bigram frequency dictionary.
import pickle
import textstats as ts

# NOTE(review): outFile is not closed anywhere in the visible part of this
# script; make sure it is closed (or wrapped in `with`) after the last write.
outFile = open('bigram_bible_austen_out.txt', 'w')

# Part 1: The King James Bible

# (A) and (B) Create token and type lists from the text file
with open('../Ling 1330/gutenberg/gutenberg/bible-kjv.txt') as bInfile:
    bTxt = bInfile.read()
bToks = ts.getTokens(bTxt)
bTypes = ts.getTypes(bTxt)

# (C) Write out token and type counts to outFile
outFile.write(
    'There are a total of ' + str(len(bToks)) + ' word tokens and '
    + str(len(bTypes)) + ' word types in the King James Bible.' + '\n\n'
)

# (D) Create bigram frequency dictionary: maps each bigram returned by
# ts.getWordNGrams(bToks, 2) to the number of times it occurs
bBigrFreq = {}
for bigr in ts.getWordNGrams(bToks, 2):
    # .get supplies 0 the first time a bigram is seen
    bBigrFreq[bigr] = bBigrFreq.get(bigr, 0) + 1
# Collect the tokens of mussolini_speeches.txt that are neither bare symbol
# characters nor entries of the xkcd simple word list, and pickle the
# resulting list to mnotsimptoks.p.
import pickle
import textstats

# read in the source text
with open('mussolini_speeches.txt', encoding='utf-8') as f:
    mtxt = f.read()

# obtain new list of word tokens
mtoks = textstats.getTokens(mtxt)

# remove symbols: drop every token that is exactly one of these characters
symbols = list("~!@#$%^&*()_+-=`{}[]|\\:;\"',./<>?")
mtoks_nosym = [t for t in mtoks if t not in symbols]

# open a pickled version of the xkcd simple word list
# see https://xkcd.com/simplewriter/
# NOTE: unpickling can execute arbitrary code; only load a pickle file that
# comes from a trusted source.
with open('xkcd_simple_words.p', 'rb') as f:
    xkcd_simp = pickle.load(f)

# create new list of toks not in the xkcd_simp list
# NOTE(review): if xkcd_simp is a list, each membership test is a linear
# scan; converting it to a set would speed this up -- confirm its type first.
mnotsimptoks = [t for t in mtoks_nosym if t not in xkcd_simp]

# save the result; protocol -1 selects the highest pickle protocol available
with open('mnotsimptoks.p', 'wb') as f:
    pickle.dump(mnotsimptoks, f, -1)
# Christian ...
#
# Writes token/type counts for 2009-Obama.txt to 2009-Obama_out.txt and
# builds a bigram frequency dictionary.
import pickle
import textstats as ts

# NOTE(review): outFile is not closed anywhere in the visible part of this
# script; make sure it is closed (or wrapped in `with`) after the last write.
outFile = open('2009-Obama_out.txt', 'w')

# Part 1: Obama's 2009 speech
# (variable names keep the "b" prefix of the Bible script this was adapted from)

# (A) and (B) Create token and type lists from the text file
with open('2009-Obama.txt') as bInfile:
    bTxt = bInfile.read()
bToks = ts.getTokens(bTxt)
bTypes = ts.getTypes(bTxt)

# (C) Write out token and type counts to outFile
outFile.write(
    'There are a total of ' + str(len(bToks)) + ' word tokens and '
    + str(len(bTypes)) + ' word types in Obama\'s speech.' + '\n\n'
)

# (D) Create bigram frequency dictionary: maps each bigram returned by
# ts.getWordNGrams(bToks, 2) to the number of times it occurs
bBigrFreq = {}
for bigr in ts.getWordNGrams(bToks, 2):
    # .get supplies 0 the first time a bigram is seen
    bBigrFreq[bigr] = bBigrFreq.get(bigr, 0) + 1