예제 #1
0
import pickle
import textstats

# Build the list of tokens from hitler_speeches.txt that are neither one of
# the punctuation symbols below nor present in the xkcd simple word list,
# then pickle that list to hnotsimptoks.p.

# read in the source text; `with` closes the file even if the read raises
with open('hitler_speeches.txt', encoding='utf-8') as f:
    htxt = f.read()

# obtain new list of word tokens
htoks = textstats.getTokens(htxt)

# remove symbols: drop every token that equals one of these single characters
symbols = list("~!@#$%^&*()_+-=`{}[]|\\:;\"',./<>?")

htoks_nosym = [t for t in htoks if t not in symbols]

# open a pickled version of the xkcd simple word list
# see https://xkcd.com/simplewriter/
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
with open('xkcd_simple_words.p', 'rb') as f:
    xkcd_simp = pickle.load(f)

# create new list of toks not in the xkcd_simp list
hnotsimptoks = [t for t in htoks_nosym if t not in xkcd_simp]

# save the result; a negative protocol selects the highest pickle protocol
with open('hnotsimptoks.p', 'wb') as f:
    pickle.dump(hnotsimptoks, f, -1)
예제 #2
0
# Christian Clark, [email protected], 29 September 2014

import pickle, textstats as ts

# Results file. It is deliberately left open here because the steps below
# write to it.
# NOTE(review): no close() is visible in this section -- confirm that the
# rest of the script closes outFile.
outFile = open('bigram_bible_austen_out.txt', 'w')


# Part 1: The King James Bible
# (A) and (B) Create token and type lists from the text file

# `with` closes the input file even if the read raises
with open('../Ling 1330/gutenberg/gutenberg/bible-kjv.txt') as bInfile:
    bTxt = bInfile.read()

bToks = ts.getTokens(bTxt)
bTypes = ts.getTypes(bTxt)


# (C) Write out token and type counts to outFile

outFile.write(
    'There are a total of {} word tokens and {} word types '
    'in the King James Bible.\n\n'.format(len(bToks), len(bTypes))
)


# (D) Create bigram frequency dictionary
# Maps each item produced by ts.getWordNGrams(bToks, 2) to the number of
# times it occurs.

bBigrFreq = {}
for bigr in ts.getWordNGrams(bToks, 2):
    # get() supplies 0 the first time a bigram is seen
    bBigrFreq[bigr] = bBigrFreq.get(bigr, 0) + 1
예제 #3
0
import pickle
import textstats

# Build the list of tokens from mussolini_speeches.txt that are neither one
# of the punctuation symbols below nor present in the xkcd simple word list,
# then pickle that list to mnotsimptoks.p.

# read in the source text; `with` closes the file even if the read raises
with open('mussolini_speeches.txt', encoding='utf-8') as f:
    mtxt = f.read()

# obtain new list of word tokens
mtoks = textstats.getTokens(mtxt)

# remove symbols: drop every token that equals one of these single characters
symbols = list("~!@#$%^&*()_+-=`{}[]|\\:;\"',./<>?")

mtoks_nosym = [t for t in mtoks if t not in symbols]

# open a pickled version of the xkcd simple word list
# see https://xkcd.com/simplewriter/
# NOTE(review): pickle.load can execute arbitrary code while unpickling;
# only load this file if it comes from a trusted source.
with open('xkcd_simple_words.p', 'rb') as f:
    xkcd_simp = pickle.load(f)

# create new list of toks not in the xkcd_simp list
mnotsimptoks = [t for t in mtoks_nosym if t not in xkcd_simp]

# save the result; a negative protocol selects the highest pickle protocol
with open('mnotsimptoks.p', 'wb') as f:
    pickle.dump(mnotsimptoks, f, -1)
예제 #4
0
파일: Obama.py 프로젝트: cclark94/compLing
# Christian ...

import pickle, textstats as ts

# Results file. It is deliberately left open here because the steps below
# write to it.
# NOTE(review): no close() is visible in this section -- confirm that the
# rest of the script closes outFile.
outFile = open('2009-Obama_out.txt', 'w')


# Part 1: Obama's speech (2009-Obama.txt)
# (A) and (B) Create token and type lists from the text file

# `with` closes the input file even if the read raises
with open('2009-Obama.txt') as bInfile:
    bTxt = bInfile.read()

bToks = ts.getTokens(bTxt)
bTypes = ts.getTypes(bTxt)


# (C) Write out token and type counts to outFile

outFile.write(
    "There are a total of {} word tokens and {} word types "
    "in Obama's speech.\n\n".format(len(bToks), len(bTypes))
)


# (D) Create bigram frequency dictionary
# Maps each item produced by ts.getWordNGrams(bToks, 2) to the number of
# times it occurs.

bBigrFreq = {}
for bigr in ts.getWordNGrams(bToks, 2):
    # get() supplies 0 the first time a bigram is seen
    bBigrFreq[bigr] = bBigrFreq.get(bigr, 0) + 1