def sortNgramFile(inputfile, outputfile):
    '''Sort an ngram count file alphabetically.

    Thin wrapper: both arguments are forwarded unchanged, in the same
    order, to ``ngrok.sortNgramFile``, which does the actual work.

    Args:
        inputfile: first argument handed to ``ngrok.sortNgramFile``
            (presumably the path of the count file to sort).
        outputfile: second argument handed to ``ngrok.sortNgramFile``
            (presumably the path the sorted result is written to).

    Returns:
        None. Whatever ``ngrok.sortNgramFile`` returns is discarded.
    '''
    ngrok.sortNgramFile(inputfile, outputfile)
def _intermediate_file(lang, suffix):
    """Join directories['intermediatecountpath'] with ``lang + suffix``."""
    # The directory is looked up on every call, exactly as the inline
    # os.path.join(...) expressions did.
    return os.path.join(directories['intermediatecountpath'], lang + suffix)


# Per-language unigram pipeline:
#   count -> rearrange -> clean -> sort -> collapse -> marginalize.
# Each stage is handed the file produced by the stage before it.
for language in languages:
    print('Counting ngrams...')
    countfile = _intermediate_file(language, '_counted.txt')
    ngrok.countNgrams(
        _intermediate_file(language, '_combined.txt'),
        countfile,
        n=1,
    )

    print('Rearranging ngrams...')
    rearrangedFile = _intermediate_file(
        language, '_2013_rearrange_counts.txt')
    ngrok.rearrangeNgramFile(countfile, rearrangedFile, reverse=False)

    print('Cleaning ngrams...')
    cleanedFile = _intermediate_file(language, '_2013_cleaned.txt')
    # NOTE (from the original author): numItems is deprecated.
    # NOTE (from the original author): the cleaning step was moved up so
    # that equivalent items are not produced at the end; this may increase
    # the number of calls to Aspell.
    # NOTE(review): the original placement of these two notes was lost in
    # the collapsed source; they are kept here because they describe the
    # cleaning step -- confirm.
    ngrok.cleanUnigramCountFile(
        rearrangedFile, cleanedFile, numItems, language)

    print('Sorting ngrams...')
    sortedfile = _intermediate_file(language, '_2013_sorted.txt')
    ngrok.sortNgramFile(cleanedFile, sortedfile)

    print('Collapsing ngrams...')
    collapsedfile = _intermediate_file(language, '_2013_collapsed.txt')
    ngrok.collapseNgrams(sortedfile, collapsedfile)

    print('Marginalizing ngrams, first pass...')
    # The final output goes to slowstoragedir, not the intermediate directory.
    ngrok.marginalizeNgramFile(
        collapsedfile,
        os.path.join(slowstoragedir, language + '_2013.txt'),
        n=1,
        sorttype='numeric',
    )