Exemplo n.º 1
0
Arquivo: cli.py Projeto: smeylan/ngrok
def marginalizeNgramFile(inputfile, outputfile, n, sorttype):
    '''Produce lower-order aggregate counts from higher-order ngram file'''
    # Thin pass-through: forwards all four arguments, positionally and
    # unchanged, to ngrok.marginalizeNgramFile; nothing is returned here.
    # NOTE(review): valid values for n and sorttype are defined by the ngrok
    # library, not by this wrapper -- confirm there before changing callers.
    ngrok.marginalizeNgramFile(inputfile, outputfile, n, sorttype)
Exemplo n.º 2
0
Arquivo: main.py Projeto: smeylan/opus
# Per-language unigram pipeline. Each stage reads the file written by the
# previous stage: count -> rearrange -> clean -> sort -> collapse -> marginalize.
# Intermediate files live under directories['intermediatecountpath']; the
# final marginalized file is written under slowstoragedir.
for language in languages:
	# Stage 1: count unigrams (n=1) from the combined per-language file.
	print('Counting ngrams...')
	intermediate_dir = directories['intermediatecountpath']
	countfile = os.path.join(intermediate_dir, language + '_counted.txt')
	combinedfile = os.path.join(intermediate_dir, language + '_combined.txt')
	ngrok.countNgrams(combinedfile, countfile, n=1)

	# Stage 2: rearrange the counted file (reverse=False).
	print('Rearranging ngrams...')
	rearrangedFile = os.path.join(
		intermediate_dir, language + '_2013_rearrange_counts.txt')
	ngrok.rearrangeNgramFile(countfile, rearrangedFile, reverse=False)

	# Stage 3: clean the unigram counts; numItems and the language are
	# passed through to ngrok as-is.
	print('Cleaning ngrams...')
	cleanedFile = os.path.join(intermediate_dir, language + '_2013_cleaned.txt')
	ngrok.cleanUnigramCountFile(rearrangedFile, cleanedFile, numItems, language)

	# Stage 4: sort the cleaned file.
	print('Sorting ngrams...')
	sortedfile = os.path.join(intermediate_dir, language + '_2013_sorted.txt')
	ngrok.sortNgramFile(cleanedFile, sortedfile)

	# Stage 5: collapse the sorted file.
	print('Collapsing ngrams...')
	collapsedfile = os.path.join(
		intermediate_dir, language + '_2013_collapsed.txt')
	ngrok.collapseNgrams(sortedfile, collapsedfile)

	# Stage 6: marginalize (n=1, numeric sort) into slow storage.
	print('Marginalizing ngrams, first pass...')
	ngrok.marginalizeNgramFile(
		collapsedfile,
		os.path.join(slowstoragedir, language + '_2013.txt'),
		n=1,
		sorttype='numeric',
	)

	


# NOTE: numItems is deprecated.
# The cleaning step was moved earlier in the pipeline so that equivalent items are not produced at the end. This may increase the number of calls to Aspell.
Exemplo n.º 3
0
Arquivo: cli.py Projeto: smeylan/ngrok
def marginalizeNgramFile(inputfile, outputfile, n, sorttype):
	'''Produce lower-order aggregate counts from higher-order ngram file'''
	# Thin pass-through: forwards all four arguments, positionally and
	# unchanged, to ngrok.marginalizeNgramFile; nothing is returned here.
	# NOTE(review): valid values for n and sorttype are defined by the ngrok
	# library, not by this wrapper -- confirm there before changing callers.
	ngrok.marginalizeNgramFile(inputfile, outputfile, n, sorttype)