Python gathterDocs示例

编程语言: Python

命名空间/包名称: util

方法/功能: gathterDocs

hotexamples.com的示例: 3

Python gathterDocs - 已找到3个示例。以下是从开源项目中提取的、评分最高的util.gathterDocs真实Python代码示例。您可以为示例评分，以帮助我们提高示例质量。

示例#1

显示文件

文件： distinctWordUse.py 项目： consciousgaze/toyingWithGutenberg

def buildGloVe(author, vocabulary = 'vocabulary.txt', rebuild = False):
	'''
		Build a GloVe model for one author by shelling out to the GloVe
		command-line tools (cooccur, shuffle, glove) located under ../glove/.

		author     -- author identifier; passed to util.gathterDocs and used
		              in the model file names
		vocabulary -- vocabulary file name, resolved as util.modelDir + vocabulary
		rebuild    -- when true, run the pipeline even if
		              util.modelDir + author + '.txt' already exists

		Returns None. The exit statuses of the shell commands are not checked.
		Tune the GloVe hyper-parameters (window size, x-max, iterations,
		vector size, ...) in the argument strings below.

		NOTE(review): the existence check looks at util.modelDir + author + '.txt'
		while glove is told to save to 'glove_ModelFiles/' + author -- confirm
		that both refer to the same location.
	'''
	# Guard clause: nothing to do when a model file is present and no rebuild was requested.
	if not rebuild and os.path.exists('%s%s.txt'%(util.modelDir, author)):
		return

	# Parenthesised so the line is valid both as a Python 2 statement and a Python 3 call.
	print('Building GloVe model for %s -----------' % author)

	corpusFile = 'corpus'
	# Presumably writes the author's documents into corpusFile (it is fed to cooccur below) -- verify in util.
	util.gathterDocs([author], corpusFile)

	# Step 1: co-occurrence counts, corpus on stdin -> coocur.bin
	cooccurArgs = '-memory 4.0 -vocab-file %s -verbose 2 -window-size 15 < %s > coocur.bin' % (util.modelDir+vocabulary, corpusFile)
	os.system(' '.join(['../glove/cooccur', cooccurArgs]))

	# Step 2: shuffle the co-occurrence records, coocur.bin -> shuffle.bin
	shuffleArgs = '-memory 4.0 -verbose 2 < coocur.bin > shuffle.bin'
	os.system(' '.join(['../glove/shuffle', shuffleArgs]))

	# Step 3: train the vectors from shuffle.bin
	gloveArgs = '-save-file glove_%s -threads 4 -input-file shuffle.bin -x-max 10 -iter 50 -vector-size 50 -binary 2 -vocab-file %s -verbose 2' % ("ModelFiles/"+author, util.modelDir+vocabulary)
	os.system(' '.join(['../glove/glove', gloveArgs]))

	# Remove the intermediate files, in the same order as before.
	for leftover in (corpusFile, 'shuffle.bin', 'coocur.bin'):
		os.remove(leftover)

示例#2

显示文件

文件： distinctWordUse.py 项目： consciousgaze/toyingWithGutenberg

def buildVocabulary(rebuild = False):
	'''
		Create util.modelDir + 'vocabulary.txt' with GloVe's vocab_count tool.

		rebuild -- when true, regenerate the vocabulary even if the file
		           already exists

		Returns None. The exit status of the shell command is not checked.
	'''
	# Guard clause: keep an existing vocabulary unless a rebuild was requested.
	if not rebuild and os.path.exists(util.modelDir+'vocabulary.txt'):
		return

	tmpCorpus = 'voc.txt'
	# Presumably dumps the documents of every author in util.authors into tmpCorpus -- verify in util.
	util.gathterDocs(util.authors, tmpCorpus)
	# Parenthesised so the line is valid both as a Python 2 statement and a Python 3 call.
	print('voc.txt done')

	# vocab_count reads the corpus on stdin and writes the vocabulary to stdout;
	# words occurring fewer than 5 times are dropped (-min-count 5).
	countArgs = '-min-count 5 -verbose 2 < %s > %svocabulary.txt' % (tmpCorpus, util.modelDir)
	os.system(' '.join(['../glove/vocab_count', countArgs]))

	# The temporary corpus is no longer needed once the vocabulary is written.
	os.remove(tmpCorpus)

示例#3

显示文件

文件： authorIdentifier.py 项目： consciousgaze/toyingWithGutenberg

def buildNgramModels():
	'''
		Train a 5-gram model for the first author in util.authors and pickle
		it to '<author>_ngram.model' in the current working directory.

		The author's text is gathered into the temporary file 'out.txt', read
		back as one string (it is not split into words), handed to NM together
		with a Lidstone estimator, and the resulting model is pickled.

		Returns None. The temporary file is removed even when training or
		pickling raises.
	'''
	outputFile = 'out.txt'
	# Loop-invariant, so built once. Presumably NLTK's LidstoneProbDist with an
	# additive smoothing constant of 0.2 -- confirm against the file's imports.
	# The bins argument is accepted but ignored.
	estimator = lambda fdist, bins: LidstoneProbDist(fdist, 0.2)
	# Only the first author is processed; the loop form is kept so the list
	# can be widened later.
	for author in [util.authors[0]]:
		# Parenthesised so the line is valid both as a Python 2 statement and a Python 3 call.
		print(util.authors[:1])
		util.gathterDocs([author], outputFile)
		try:
			# Context manager closes the handle even if read() fails.
			with open(outputFile) as f:
				train = f.read()#.split()
			ngrammodel = NM(5, train, estimator = estimator)
			# Pickle streams belong in binary-mode files, and the handle must be
			# closed so the data is flushed (the old inline open() never was).
			with open(author+'_ngram.model', 'wb') as modelFile:
				pickle.dump(ngrammodel, modelFile)
		finally:
			# Always drop the temporary corpus, including on failure.
			os.remove(outputFile)