Python getTokens示例

编程语言: Python

命名空间/包名称: textstats

方法/功能: getTokens

hotexamples.com的示例: 4

Python getTokens - 共找到 4 个示例。以下是从开源项目中提取的、评价最高的 textstats.getTokens 真实 Python 代码示例。您可以对示例进行评分，帮助我们提高示例质量。

示例#1

显示文件

文件： hitler_analy.py 项目： knapppv94/knapp-synthesis-1

import pickle
import textstats

# Read in the source text. The context manager closes the file even if
# reading or decoding raises.
with open('hitler_speeches.txt', encoding='utf-8') as f:
    htxt = f.read()

# Obtain the list of word tokens.
htoks = textstats.getTokens(htxt)

# Drop every token that is exactly one of these single symbol characters.
symbols = list("~!@#$%^&*()_+-=`{}[]|\\:;\"',./<>?")

htoks_nosym = [t for t in htoks if t not in symbols]

# Open a pickled version of the xkcd simple word list,
# see https://xkcd.com/simplewriter/
# NOTE(review): unpickling can execute arbitrary code; only load a file
# from a trusted source.
with open('xkcd_simple_words.p', 'rb') as f:
    xkcd_simp = pickle.load(f)

# Keep only the tokens that are not in the xkcd simple word list.
hnotsimptoks = [t for t in htoks_nosym if t not in xkcd_simp]

# Persist the result with the highest pickle protocol available (the
# original passed -1, which selects the same protocol).
with open('hnotsimptoks.p', 'wb') as f:
    pickle.dump(hnotsimptoks, f, pickle.HIGHEST_PROTOCOL)

示例#2

显示文件

文件： bigrams_bible_austen.py 项目： cclark94/compLing

# Christian Clark, [email protected], 29 September 2014

import pickle, textstats as ts

# Report file for the results below.
# NOTE(review): outFile is not closed anywhere in this excerpt; presumably
# later parts of the script keep writing to it and close it -- confirm.
outFile = open('bigram_bible_austen_out.txt', 'w')


# Part 1: The King James Bible
# (A) and (B) Create token and type lists from the text file

# The context manager closes the input file even if reading raises.
with open('../Ling 1330/gutenberg/gutenberg/bible-kjv.txt') as bInfile:
    bTxt = bInfile.read()

bToks = ts.getTokens(bTxt)
bTypes = ts.getTypes(bTxt)


# (C) Write out token and type counts to outFile

outFile.write(
    'There are a total of {} word tokens and '
    '{} word types in the King James Bible.\n\n'.format(len(bToks),
                                                        len(bTypes)))


# (D) Create bigram frequency dictionary

# Maps each bigram yielded by getWordNGrams to the number of times it occurs.
bBigrFreq = {}
for bigr in ts.getWordNGrams(bToks, 2):
    bBigrFreq[bigr] = bBigrFreq.get(bigr, 0) + 1

示例#3

显示文件

文件： musso_analy.py 项目： knapppv94/knapp-synthesis-1

import pickle
import textstats

# Read in the source text. The context manager closes the file even if
# reading or decoding raises.
with open('mussolini_speeches.txt', encoding='utf-8') as f:
    mtxt = f.read()

# Obtain the list of word tokens.
mtoks = textstats.getTokens(mtxt)

# Drop every token that is exactly one of these single symbol characters.
symbols = list("~!@#$%^&*()_+-=`{}[]|\\:;\"',./<>?")

mtoks_nosym = [t for t in mtoks if t not in symbols]

# Open a pickled version of the xkcd simple word list,
# see https://xkcd.com/simplewriter/
# NOTE(review): unpickling can execute arbitrary code; only load a file
# from a trusted source.
with open('xkcd_simple_words.p', 'rb') as f:
    xkcd_simp = pickle.load(f)

# Keep only the tokens that are not in the xkcd simple word list.
mnotsimptoks = [t for t in mtoks_nosym if t not in xkcd_simp]

# Persist the result with the highest pickle protocol available (the
# original passed -1, which selects the same protocol).
with open('mnotsimptoks.p', 'wb') as f:
    pickle.dump(mnotsimptoks, f, pickle.HIGHEST_PROTOCOL)

示例#4

显示文件

文件： Obama.py 项目： cclark94/compLing

# Christian ...

import pickle, textstats as ts

# Report file for the results below.
# NOTE(review): outFile is not closed anywhere in this excerpt; presumably
# later parts of the script keep writing to it and close it -- confirm.
outFile = open('2009-Obama_out.txt', 'w')


# Part 1: Obama's speech (2009-Obama.txt)
# (A) and (B) Create token and type lists from the text file

# The context manager closes the input file even if reading raises.
with open('2009-Obama.txt') as bInfile:
    bTxt = bInfile.read()

bToks = ts.getTokens(bTxt)
bTypes = ts.getTypes(bTxt)


# (C) Write out token and type counts to outFile

outFile.write(
    "There are a total of {} word tokens and "
    "{} word types in Obama's speech.\n\n".format(len(bToks), len(bTypes)))


# (D) Create bigram frequency dictionary

# Maps each bigram yielded by getWordNGrams to the number of times it occurs.
bBigrFreq = {}
for bigr in ts.getWordNGrams(bToks, 2):
    bBigrFreq[bigr] = bBigrFreq.get(bigr, 0) + 1