示例#1
0
 def iter(self, sorted=False, consolidated=False):
     """Return an iterator over the items stored in ``self.files``.

     Pending data is flushed first, then one ``FileReader`` is created per
     entry of ``self.files``.

     Args:
         sorted: If true, the readers are merged with ``mergeSort`` and the
             merged stream is passed through ``consolidate``.
         consolidated: Takes exactly the same path as ``sorted``; the two
             flags are not distinguished here.

     Returns:
         ``consolidate(mergeSort(readers))`` when either flag is true,
         otherwise the readers chained one after another in
         ``self.files`` order.
     """
     self.flush()
     readers = []
     for fname in self.files:
         readers.append(FileReader(fname))
     if not (sorted or consolidated):
         # Plain concatenation: file after file, no merging.
         return itertools.chain(*readers)
     return consolidate(mergeSort(readers))
示例#2
0
    def iter(self, sorted=False, consolidated=False):
        """Return an iterator over the items stored in ``self.files``.

        Pending data is flushed first, then one ``FileReader`` is created
        per entry of ``self.files``.

        Args:
            sorted: If true, the readers are merged with ``mergeSort`` and
                the merged stream is passed through ``consolidate``.
            consolidated: Takes exactly the same path as ``sorted``; the
                two flags are not distinguished here.

        Returns:
            ``consolidate(mergeSort(iters))`` when either flag is true,
            otherwise the readers chained one after another in
            ``self.files`` order.
        """
        # Indentation uses spaces only: mixing a space-indented ``def`` with
        # a tab-indented body is rejected by Python 3 (TabError).
        self.flush()
        iters = [FileReader(fname) for fname in self.files]
        if sorted or consolidated:
            return consolidate(mergeSort(iters))
        else:
            return itertools.chain(*iters)
示例#3
0
    def flush(self):
        """Write the buffered ``primary`` and ``secondary`` items to files.

        Non-empty ``primary`` data is sorted, consolidated and written to
        ``self.currentFile`` (a ``FileWriter`` on ``self.newFile()`` is
        created when there is none).  Any open ``currentFile`` is then
        closed and reset to ``None``.  Non-empty ``secondary`` data is
        sorted, consolidated and written to a separate new file through
        ``writeToFile``.  ``self.nStoredItems`` is increased by the number
        of items written, both buffers are reset to empty lists and
        ``self.isUnderfull`` is set to ``True``.
        """
        # Indentation uses spaces only: mixing a space-indented ``def`` with
        # a tab-indented body is rejected by Python 3 (TabError).
        if self.primary:
            if self.currentFile is None:
                self.currentFile = FileWriter(self.newFile())
            self.primary.sort()
            for item in consolidate(self.primary):
                self.currentFile.write(item)
                self.nStoredItems += 1
        # The current file is closed even when nothing was written above.
        if self.currentFile:
            self.currentFile.close()
            self.currentFile = None
        self.primary = []

        if self.secondary:
            self.secondary.sort()
            # writeToFile's return value is added to the stored-item count.
            self.nStoredItems += writeToFile(self.newFile(),
                                             consolidate(self.secondary))
        self.secondary = []

        self.isUnderfull = True
示例#4
0
    def flush(self):
        """Persist both in-memory buffers and reset them.

        Steps, in order:

        1. If ``self.primary`` holds items, sort it, consolidate it and
           write each resulting item to ``self.currentFile`` (creating a
           ``FileWriter`` on ``self.newFile()`` when none is open), bumping
           ``self.nStoredItems`` once per written item.
        2. Close ``self.currentFile`` if it is set, and reset it to ``None``.
        3. If ``self.secondary`` holds items, sort it, consolidate it and
           hand it to ``writeToFile`` together with a new file; the value
           returned by ``writeToFile`` is added to ``self.nStoredItems``.
        4. Replace both buffers with empty lists and set
           ``self.isUnderfull`` to ``True``.
        """
        buffered = self.primary
        if buffered:
            if self.currentFile is None:
                self.currentFile = FileWriter(self.newFile())
            buffered.sort()
            for entry in consolidate(buffered):
                self.currentFile.write(entry)
                self.nStoredItems += 1

        # An open writer is closed regardless of whether step 1 ran.
        if self.currentFile:
            self.currentFile.close()
            self.currentFile = None
        self.primary = []

        overflow = self.secondary
        if overflow:
            overflow.sort()
            self.nStoredItems += writeToFile(
                self.newFile(), consolidate(overflow)
            )
        self.secondary = []

        self.isUnderfull = True
示例#5
0
def main(options, args):
    """Collect m-gram counts and emit them as requested by ``options``.

    Args:
        options: Parsed option object.  Attributes read here:
            ``vocabulary``, ``text``, ``read``, ``order``, ``map_oov``,
            ``write`` and ``counts_of_counts`` (``createStorage`` also
            receives the whole object).
        args: Positional arguments; not used by this function.

    The counts come from exactly one source:

    * ``options.text``: each line is split on whitespace, every token is
      passed through ``vocabulary.map`` and the sentences are fed to
      ``mGramsChainCount`` with ``options.order - 1``.
    * ``options.read``: several files are merged with ``mergeSort`` and
      ``consolidate`` into a new storage; a single file is used directly
      as a ``TextStorage``.
    * otherwise ``"no counts"`` is printed to stderr and the function
      returns.

    Afterwards, depending on the options, unknown words are mapped
    (``map_oov``), the counts are written (``write``) and counts-of-counts
    are pretty-printed (``counts_of_counts``).
    """
    if options.vocabulary:
        vocabulary = loadVocabulary(options.vocabulary)
    else:
        vocabulary = OpenVocabulary()

    if options.text:
        text = misc.gOpenIn(options.text)
        # Generator expressions instead of itertools.imap: imap does not
        # exist on Python 3, and a generator stays lazy on every version so
        # the text is still streamed line by line.
        sentences = (line.split() for line in text)
        # Each sentence is materialised as a list (what Python 2's map()
        # produced), so downstream code may index or take its length.
        sentences = ([vocabulary.map(word) for word in sentence]
                     for sentence in sentences)
        grams = mGramsChainCount(sentences, options.order - 1)
        counts = createStorage(options)
        counts.addIter(grams)
    elif options.read:
        if len(options.read) > 1:
            counts = createStorage(options)
            counts.addIter(
                consolidate(
                    mergeSort([TextStorage(fname) for fname in options.read])))
        else:
            counts = TextStorage(options.read[0])
    else:
        print("no counts", file=sys.stderr)
        return

    if options.map_oov:
        if not options.vocabulary:
            print("you need to specify a vocabulary", file=sys.stderr)
            # Stop here: previously the error was reported but the
            # unknown-word mapping still ran against the fallback
            # vocabulary object.
            return
        filt = MapUnknownsFilter(counts, vocabulary.list,
                                 vocabulary.unknownSymbol)
        mappedCounts = createStorage(options)
        mappedCounts.addIter(filt.rawIter())
        counts = mappedCounts

    if options.write:
        countFile = misc.gOpenOut(options.write)
        TextStorage.write(countFile, counts)

    if options.counts_of_counts:
        # One counts-of-counts entry per order 0 .. options.order - 1.
        coc = [
            countsOfCounts(mGramReduceToOrder(counts, order))
            for order in range(options.order)
        ]
        import pprint

        pprint.pprint(coc, misc.gOpenOut(options.counts_of_counts))
示例#6
0
def main(options, args):
    """Collect m-gram counts and emit them as requested by ``options``.

    Args:
        options: Parsed option object.  Attributes read here:
            ``vocabulary``, ``text``, ``read``, ``order``, ``map_oov``,
            ``write`` and ``counts_of_counts`` (``createStorage`` also
            receives the whole object).
        args: Positional arguments; not used by this function.

    The counts come from exactly one source:

    * ``options.text``: each line is split on whitespace, every token is
      passed through ``vocabulary.map`` and the sentences are fed to
      ``mGramsChainCount`` with ``options.order - 1``.
    * ``options.read``: several files are merged with ``mergeSort`` and
      ``consolidate`` into a new storage; a single file is used directly
      as a ``TextStorage``.
    * otherwise ``no counts`` is written to stderr and the function
      returns.

    Afterwards, depending on the options, unknown words are mapped
    (``map_oov``), the counts are written (``write``) and counts-of-counts
    are pretty-printed (``counts_of_counts``).
    """
    # The body is indented with spaces only and avoids Python-2-only
    # constructs (``print >>``, ``itertools.imap``) so it runs unchanged on
    # both Python 2 and Python 3.
    if options.vocabulary:
        vocabulary = loadVocabulary(options.vocabulary)
    else:
        vocabulary = OpenVocabulary()

    if options.text:
        text = misc.gOpenIn(options.text)
        # Lazy on every Python version, so the text is streamed line by line.
        sentences = (line.split() for line in text)
        # Each sentence is materialised as a list, as Python 2's map() did.
        sentences = ([vocabulary.map(word) for word in sentence]
                     for sentence in sentences)
        grams = mGramsChainCount(sentences, options.order - 1)
        counts = createStorage(options)
        counts.addIter(grams)
    elif options.read:
        if len(options.read) > 1:
            counts = createStorage(options)
            counts.addIter(consolidate(mergeSort(
                [TextStorage(fname) for fname in options.read])))
        else:
            counts = TextStorage(options.read[0])
    else:
        sys.stderr.write('no counts' + '\n')
        return

    if options.map_oov:
        if not options.vocabulary:
            sys.stderr.write('you need to specify a vocabulary' + '\n')
            # Stop here: previously the error was reported but the
            # unknown-word mapping still ran against the fallback
            # vocabulary object.
            return
        filt = MapUnknownsFilter(counts, vocabulary.list,
                                 vocabulary.unknownSymbol)
        mappedCounts = createStorage(options)
        mappedCounts.addIter(filt.rawIter())
        counts = mappedCounts

    if options.write:
        countFile = misc.gOpenOut(options.write)
        TextStorage.write(countFile, counts)

    if options.counts_of_counts:
        # One counts-of-counts entry per order 0 .. options.order - 1.
        coc = [countsOfCounts(mGramReduceToOrder(counts, order))
               for order in range(options.order)]
        import pprint
        pprint.pprint(coc, misc.gOpenOut(options.counts_of_counts))
示例#7
0
    def flush(self):
        """Store the buffered items and empty the buffer.

        Does nothing when ``self.current`` is empty.  Otherwise the buffer
        is sorted in place, passed through ``consolidate`` and handed to
        ``self.store``; ``self.current`` is then replaced by a new empty
        list.
        """
        # Indentation uses spaces only: mixing a space-indented ``def`` with
        # a tab-indented body is rejected by Python 3 (TabError).
        if len(self.current) == 0:
            return
        self.current.sort()
        self.store(consolidate(self.current))
        self.current = []
示例#8
0
    def consolidate(self):
        """Bring ``self.items`` into consolidated form, at most once.

        When ``self.isConsolidated`` is false, the object is sorted via
        ``self.sort()`` and ``self.items`` is replaced by the result of the
        module-level ``consolidate`` function.  ``self.isConsolidated`` is
        set to ``True`` in every case.
        """
        # Indentation uses spaces only: mixing a space-indented ``def`` with
        # a tab-indented body is rejected by Python 3 (TabError).
        if not self.isConsolidated:
            self.sort()
            # Inside the method body this name resolves to the module-level
            # function, not to this method.
            self.items = consolidate(self.items)
        self.isConsolidated = True
示例#9
0
 def flush(self):
     """Store the buffered items and start a fresh buffer.

     An empty ``self.current`` makes this a no-op.  Otherwise the buffer is
     sorted in place, run through ``consolidate`` and passed to
     ``self.store``, after which ``self.current`` is rebound to a new empty
     list.
     """
     if not self.current:
         # Nothing buffered: leave the existing buffer object untouched.
         return
     self.current.sort()
     merged = consolidate(self.current)
     self.store(merged)
     self.current = []
示例#10
0
 def consolidate(self):
     """Ensure ``self.items`` has been consolidated.

     If ``self.isConsolidated`` is already true, only the flag is rewritten
     as ``True``.  Otherwise ``self.sort()`` is called, ``self.items`` is
     replaced by the result of the module-level ``consolidate`` function,
     and the flag is set to ``True``.
     """
     if self.isConsolidated:
         # The flag is always left as exactly True, even if it was merely
         # truthy before.
         self.isConsolidated = True
         return
     self.sort()
     # Inside the method body this name resolves to the module-level
     # function, not to this method.
     self.items = consolidate(self.items)
     self.isConsolidated = True