def rearrangeNgramFile(inputfile, outputfile, reverse):
    """Rearrange an ngram txt file produced by AutoCorpus.

    Moves the count to the end and, if specified, reverses the order of
    the ngram. This is a thin wrapper: all three arguments are forwarded
    positionally, unchanged, to ``ngrok.rearrangeNgramFile``.

    Args:
        inputfile: ngram txt file to read (passed through as-is).
        outputfile: destination for the rearranged file (passed through
            as-is).
        reverse: whether the ngram order should be reversed (passed
            through as-is).

    Returns:
        None. The delegate's return value, if any, is not propagated.
    """
    ngrok.rearrangeNgramFile(inputfile, outputfile, reverse)
# Build the OPUS working-directory layout. The returned mapping is indexed
# below by 'downloadpath', 'expandpath', 'combinedpath' and
# 'intermediatecountpath'.
directories = opus.makeDirectoryStructure('/shared_hd0/corpora/OPUS/2013_OPUS')

# The three preparation stages run as separate passes over `languages`, in
# this order: every download finishes before any processing starts, and every
# language is processed before any is combined. Plain loops are used because
# the calls are made for their side effects only; their return values are
# discarded.
for language in languages:
    opus.downloadLanguage(
        language, directories['downloadpath'], directories['expandpath'])

for language in languages:
    opus.processLanguage(
        language, directories['expandpath'], directories['combinedpath'])

for language in languages:
    opus.combineLanguage(
        os.path.join(directories['combinedpath'], language),
        os.path.join(directories['intermediatecountpath'],
                     language + '_combined.txt'))

# Per-language counting pipeline. Each step reads the file written by the
# previous step and writes a new file under 'intermediatecountpath'.
for language in languages:
    print('Counting ngrams...')
    countfile = os.path.join(
        directories['intermediatecountpath'], language + '_counted.txt')
    # n=1 presumably requests unigram counts (the later cleaning step is
    # cleanUnigramCountFile) -- confirm against ngrok.countNgrams.
    ngrok.countNgrams(
        os.path.join(directories['intermediatecountpath'],
                     language + '_combined.txt'),
        countfile, n=1)

    print('Rearranging ngrams...')
    rearrangedFile = os.path.join(
        directories['intermediatecountpath'],
        language + '_2013_rearrange_counts.txt')
    ngrok.rearrangeNgramFile(countfile, rearrangedFile, reverse=False)

    print('Cleaning ngrams...')
    cleanedFile = os.path.join(
        directories['intermediatecountpath'], language + '_2013_cleaned.txt')
    # `numItems` is not defined in this section; it must already be bound at
    # module level before this loop runs.
    ngrok.cleanUnigramCountFile(rearrangedFile, cleanedFile, numItems, language)

    print('Sorting ngrams...')
    sortedfile = os.path.join(
        directories['intermediatecountpath'], language + '_2013_sorted.txt')
    ngrok.sortNgramFile(cleanedFile, sortedfile)

    print('Collapsing ngrams...')
    collapsedfile = os.path.join(
        directories['intermediatecountpath'], language + '_2013_collapsed.txt')
    ngrok.collapseNgrams(sortedfile, collapsedfile)

    print('Marginalizing ngrams, first pass...')