def iter(self, sorted=False, consolidated=False): self.flush() iters = [FileReader(fname) for fname in self.files] if sorted or consolidated: return consolidate(mergeSort(iters)) else: return itertools.chain(*iters)
def iter(self, sorted=False, consolidated=False):
    """Iterate over everything stored in ``self.files``.

    The in-memory buffer is flushed before any file is opened.  When
    ``sorted`` or ``consolidated`` is requested, the per-file readers are
    combined through ``mergeSort`` and ``consolidate``; otherwise the
    files are read back to back in the order they appear in
    ``self.files``.
    """
    self.flush()

    sources = []
    for path in self.files:
        sources.append(FileReader(path))

    wantMerged = sorted or consolidated
    if wantMerged:
        result = consolidate(mergeSort(sources))
    else:
        result = itertools.chain.from_iterable(sources)
    return result
def flush(self):
    """Write every buffered item to disk and reset the in-memory buffers.

    ``self.primary`` is sorted, passed through ``consolidate`` and written
    item by item to ``self.currentFile`` (opened on a new file obtained
    from ``self.newFile()`` if none is open); ``self.nStoredItems`` grows
    by one per written item.  An open ``self.currentFile`` is closed and
    cleared.  ``self.secondary`` is sorted, consolidated and handed to
    ``writeToFile`` together with another new file name; the value
    returned by ``writeToFile`` is added to ``self.nStoredItems``.
    Both buffers end up empty and ``self.isUnderfull`` is set to ``True``.
    """
    if self.primary:
        # Reuse the file that is already open; only start a new one when
        # there is none.
        if self.currentFile is None:
            self.currentFile = FileWriter(self.newFile())
        self.primary.sort()
        for item in consolidate(self.primary):
            self.currentFile.write(item)
            self.nStoredItems += 1
    if self.currentFile:
        self.currentFile.close()
        self.currentFile = None
    self.primary = []
    if self.secondary:
        self.secondary.sort()
        # The secondary buffer always goes to a file of its own.
        # presumably writeToFile returns the number of items written -- TODO confirm
        self.nStoredItems += writeToFile(self.newFile(), consolidate(self.secondary))
    self.secondary = []
    self.isUnderfull = True
def main(options, args):
    """Build a count storage from text or count files and emit the outputs.

    options -- object with the attributes read below:
        vocabulary       file name passed to ``loadVocabulary``; when
                         false an ``OpenVocabulary`` is used instead
        text             file name of the input text (takes precedence
                         over ``read``)
        order            ``order - 1`` is passed to ``mGramsChainCount``;
                         also the number of counts-of-counts tables
        read             list of count file names to load (merged when
                         more than one is given)
        map_oov          if true, pass the counts through
                         ``MapUnknownsFilter``; requires ``vocabulary``
        write            file name the counts are written to
        counts_of_counts file name the counts-of-counts are printed to
    args -- unused.

    Returns None.  Prints a message to stderr and returns early when
    neither ``text`` nor ``read`` is given, or when ``map_oov`` is
    requested without a vocabulary.
    """
    if options.vocabulary:
        vocabulary = loadVocabulary(options.vocabulary)
    else:
        vocabulary = OpenVocabulary()

    if options.text:
        text = misc.gOpenIn(options.text)
        # A generator expression keeps the per-line laziness of the former
        # itertools.imap pipeline but also runs on Python 3, where imap is
        # gone and a bare map() would hand a one-shot iterator (instead of
        # a list) to the counter.
        sentences = (
            [vocabulary.map(word) for word in line.split()]
            for line in text
        )
        grams = mGramsChainCount(sentences, options.order - 1)
        counts = createStorage(options)
        counts.addIter(grams)
    elif options.read:
        if len(options.read) > 1:
            counts = createStorage(options)
            sources = [TextStorage(fname) for fname in options.read]
            counts.addIter(consolidate(mergeSort(sources)))
        else:
            counts = TextStorage(options.read[0])
    else:
        print("no counts", file=sys.stderr)
        return

    if options.map_oov:
        if not options.vocabulary:
            print("you need to specify a vocabulary", file=sys.stderr)
            # Stop here: mapping unknowns needs the loaded vocabulary, so
            # carrying on would only fail (or do nothing useful) later.
            return
        filt = MapUnknownsFilter(counts, vocabulary.list, vocabulary.unknownSymbol)
        mappedCounts = createStorage(options)
        mappedCounts.addIter(filt.rawIter())
        counts = mappedCounts

    if options.write:
        countFile = misc.gOpenOut(options.write)
        TextStorage.write(countFile, counts)

    if options.counts_of_counts:
        import pprint

        coc = [
            countsOfCounts(mGramReduceToOrder(counts, order))
            for order in range(options.order)
        ]
        pprint.pprint(coc, misc.gOpenOut(options.counts_of_counts))
def main(options, args):
    """Build a count storage from text or count files and emit the outputs.

    options -- object with the attributes read below:
        vocabulary       file name passed to ``loadVocabulary``; when
                         false an ``OpenVocabulary`` is used instead
        text             file name of the input text (takes precedence
                         over ``read``)
        order            ``order - 1`` is passed to ``mGramsChainCount``;
                         also the number of counts-of-counts tables
        read             list of count file names to load (merged when
                         more than one is given)
        map_oov          if true, pass the counts through
                         ``MapUnknownsFilter``; requires ``vocabulary``
        write            file name the counts are written to
        counts_of_counts file name the counts-of-counts are printed to
    args -- unused.

    Returns None.  Writes a message to stderr and returns early when
    neither ``text`` nor ``read`` is given, or when ``map_oov`` is
    requested without a vocabulary.
    """
    if options.vocabulary:
        vocabulary = loadVocabulary(options.vocabulary)
    else:
        vocabulary = OpenVocabulary()

    if options.text:
        text = misc.gOpenIn(options.text)
        # A generator expression is as lazy as itertools.imap but exists on
        # both Python 2 and 3; the inner list comprehension yields a real
        # list per sentence on either version.
        sentences = (
            [vocabulary.map(word) for word in line.split()]
            for line in text
        )
        grams = mGramsChainCount(sentences, options.order - 1)
        counts = createStorage(options)
        counts.addIter(grams)
    elif options.read:
        if len(options.read) > 1:
            counts = createStorage(options)
            sources = [TextStorage(fname) for fname in options.read]
            counts.addIter(consolidate(mergeSort(sources)))
        else:
            counts = TextStorage(options.read[0])
    else:
        # sys.stderr.write instead of the "print >>" statement: same
        # output, but valid whether or not print is a function.
        sys.stderr.write('no counts\n')
        return

    if options.map_oov:
        if not options.vocabulary:
            sys.stderr.write('you need to specify a vocabulary\n')
            # Stop here: mapping unknowns needs the loaded vocabulary, so
            # carrying on would only fail (or do nothing useful) later.
            return
        filt = MapUnknownsFilter(counts, vocabulary.list, vocabulary.unknownSymbol)
        mappedCounts = createStorage(options)
        mappedCounts.addIter(filt.rawIter())
        counts = mappedCounts

    if options.write:
        countFile = misc.gOpenOut(options.write)
        TextStorage.write(countFile, counts)

    if options.counts_of_counts:
        import pprint

        coc = [
            countsOfCounts(mGramReduceToOrder(counts, order))
            for order in range(options.order)
        ]
        pprint.pprint(coc, misc.gOpenOut(options.counts_of_counts))
def flush(self):
    """Store the buffered items and start a fresh, empty buffer.

    Does nothing when ``self.current`` is empty.  Otherwise the buffer is
    sorted in place, passed through ``consolidate`` and handed to
    ``self.store``; ``self.current`` is then replaced by a new list.
    """
    pending = self.current
    if not len(pending):
        return
    pending.sort()
    self.store(consolidate(pending))
    self.current = []
def consolidate(self):
    """Sort and consolidate ``self.items`` unless that was already done.

    When ``self.isConsolidated`` is set the call is a no-op.  Otherwise
    ``self.sort()`` is invoked, ``self.items`` is replaced by the result of
    the module-level ``consolidate`` function, and the flag is set so that
    later calls return immediately.
    """
    if self.isConsolidated:
        return
    self.sort()
    merged = consolidate(self.items)
    self.items = merged
    self.isConsolidated = True