def iter(self, sorted=False, consolidated=False):
    """Return an iterator over the contents of every file in ``self.files``.

    ``self.flush()`` is called first, then one ``FileReader`` is opened per
    entry of ``self.files``.

    Args:
        sorted: If true, return the merged and consolidated stream.
        consolidated: If true, return the merged and consolidated stream.
            Both flags select the same code path.

    Returns:
        ``consolidate(mergeSort(readers))`` when either flag is set,
        otherwise the readers chained one after another in file order.
    """
    self.flush()
    readers = [FileReader(path) for path in self.files]

    # Plain concatenation is enough when no ordering guarantee is requested.
    if not (sorted or consolidated):
        return itertools.chain.from_iterable(readers)

    return consolidate(mergeSort(readers))
def iter(self, sorted=False, consolidated=False):
    """Iterate over everything stored in the files listed in ``self.files``.

    Calls ``self.flush()`` before any file is opened, then wraps each file
    name in a ``FileReader``.

    Args:
        sorted: Request the merge-sorted, consolidated stream.
        consolidated: Request the merge-sorted, consolidated stream
            (handled exactly like ``sorted``).

    Returns:
        The result of ``consolidate(mergeSort(...))`` over all readers if
        either flag is true; otherwise an ``itertools.chain`` over the
        readers in their listed order.
    """
    self.flush()
    sources = []
    for file_name in self.files:
        sources.append(FileReader(file_name))

    wants_merge = sorted or consolidated
    return (
        consolidate(mergeSort(sources))
        if wants_merge
        else itertools.chain(*sources)
    )
def main(options, args):
    """Collect m-gram counts and optionally map, write and summarise them.

    Counts are obtained either by counting m-grams in a text file
    (``options.text``) or by loading previously written count files
    (``options.read``).  Further steps depend on the remaining options.

    Args:
        options: Option object.  Attributes read here: ``vocabulary``,
            ``text``, ``order``, ``read``, ``map_oov``, ``write`` and
            ``counts_of_counts``.  The whole object is also passed to
            ``createStorage``.
        args: Positional arguments; not used by this function.

    Returns:
        None.  If neither ``options.text`` nor ``options.read`` is set, or
        if ``options.map_oov`` is set without ``options.vocabulary``, a
        message is printed to stderr and the function returns early.
    """
    if options.vocabulary:
        vocabulary = loadVocabulary(options.vocabulary)
    else:
        vocabulary = OpenVocabulary()

    if options.text:
        text = misc.gOpenIn(options.text)
        # Generator expression instead of itertools.imap, which does not
        # exist on Python 3.  Each sentence is materialised as a list, which
        # is what Python 2's map() produced.
        sentences = (
            [vocabulary.map(word) for word in line.split()] for line in text
        )
        grams = mGramsChainCount(sentences, options.order - 1)
        counts = createStorage(options)
        counts.addIter(grams)
    elif options.read:
        if len(options.read) > 1:
            # Several count files: merge them into one new storage.
            counts = createStorage(options)
            counts.addIter(
                consolidate(
                    mergeSort([TextStorage(fname) for fname in options.read])
                )
            )
        else:
            counts = TextStorage(options.read[0])
    else:
        print("no counts", file=sys.stderr)
        return

    if options.map_oov:
        if not options.vocabulary:
            print("you need to specify a vocabulary", file=sys.stderr)
            # Stop here, like the "no counts" case, instead of filtering
            # against a vocabulary that was never supplied.
            return
        filt = MapUnknownsFilter(
            counts, vocabulary.list, vocabulary.unknownSymbol
        )
        mappedCounts = createStorage(options)
        mappedCounts.addIter(filt.rawIter())
        counts = mappedCounts

    if options.write:
        countFile = misc.gOpenOut(options.write)
        TextStorage.write(countFile, counts)

    if options.counts_of_counts:
        coc = [
            countsOfCounts(mGramReduceToOrder(counts, order))
            for order in range(options.order)
        ]
        import pprint

        pprint.pprint(coc, misc.gOpenOut(options.counts_of_counts))
def main(options, args):
    """Collect m-gram counts and optionally map, write and summarise them.

    Counts come either from counting m-grams in a text file
    (``options.text``) or from previously written count files
    (``options.read``).  Further steps depend on the remaining options.

    Args:
        options: Option object.  Attributes read here: ``vocabulary``,
            ``text``, ``order``, ``read``, ``map_oov``, ``write`` and
            ``counts_of_counts``.  The whole object is also passed to
            ``createStorage``.
        args: Positional arguments; not used by this function.

    Returns:
        None.  If neither ``options.text`` nor ``options.read`` is set, or
        if ``options.map_oov`` is set without ``options.vocabulary``, a
        message is written to stderr and the function returns early.
    """
    if options.vocabulary:
        vocabulary = loadVocabulary(options.vocabulary)
    else:
        vocabulary = OpenVocabulary()

    if options.text:
        text = misc.gOpenIn(options.text)
        # Generator expression instead of itertools.imap, which does not
        # exist on Python 3.  Each sentence is materialised as a list, which
        # is what Python 2's map() produced.
        sentences = (
            [vocabulary.map(word) for word in line.split()] for line in text
        )
        grams = mGramsChainCount(sentences, options.order - 1)
        counts = createStorage(options)
        counts.addIter(grams)
    elif options.read:
        if len(options.read) > 1:
            # Several count files: merge them into one new storage.
            sources = [TextStorage(fname) for fname in options.read]
            counts = createStorage(options)
            counts.addIter(consolidate(mergeSort(sources)))
        else:
            counts = TextStorage(options.read[0])
    else:
        # sys.stderr.write emits the same bytes as the Python 2 print
        # chevron form, and also works on Python 3, where
        # ``print >> stream`` raises TypeError.
        sys.stderr.write('no counts\n')
        return

    if options.map_oov:
        if not options.vocabulary:
            sys.stderr.write('you need to specify a vocabulary\n')
            # Stop here, like the "no counts" case, instead of filtering
            # against a vocabulary that was never supplied.
            return
        filt = MapUnknownsFilter(
            counts, vocabulary.list, vocabulary.unknownSymbol
        )
        mappedCounts = createStorage(options)
        mappedCounts.addIter(filt.rawIter())
        counts = mappedCounts

    if options.write:
        countFile = misc.gOpenOut(options.write)
        TextStorage.write(countFile, counts)

    if options.counts_of_counts:
        coc = [
            countsOfCounts(mGramReduceToOrder(counts, order))
            for order in range(options.order)
        ]
        import pprint

        pprint.pprint(coc, misc.gOpenOut(options.counts_of_counts))