Пример #1
0
 def compressionRatio(self,encodingWordUsage, compressionUsage,tree = None):
     """Return Huffman-coded bits divided by fixed-width block-coded bits.

     encodingWordUsage -- a speech.Speech or a dictionary of word -> frequency;
         used to build the Huffman tree when ``tree`` is not supplied.
     compressionUsage -- a speech.Speech or a dictionary of word -> frequency
         describing the text whose encoded size is being measured.
     tree -- optional pre-built coding tree exposing ``bitsNeededDictionary``
         (word -> number of bits).  Passing one avoids rebuilding the tree on
         every call.

     Every stored frequency is reduced by one before it is used, so that the
     additive smoothing in the usage counts does not influence the ratio.
     Block coding charges each word ceil(log2(number of distinct words in
     compressionUsage)) bits.

     Returns a float ratio, or 0 when the block coding would need no bits
     (empty usage, a single distinct word, or no occurrences left after the
     smoothing is removed).  A word of compressionUsage that has no entry in
     ``tree.bitsNeededDictionary`` propagates the lookup error (a KeyError,
     assuming that attribute is a plain dict).
     """
     # The first two arguments can either be speeches or dictionaries with
     # words and frequencies; unwrap speeches to their dictionaries.
     if isinstance(encodingWordUsage, speech.Speech):
         encodingWordUsage = encodingWordUsage.wordUsage
     if isinstance(compressionUsage, speech.Speech):
         compressionUsage = compressionUsage.wordUsage

     # Nothing to encode: report 0 like the other "no bits" case, instead of
     # failing on log(0) and before paying for a tree that is not needed.
     distinctWords = len(compressionUsage)
     if distinctWords == 0:
         return 0

     # A tree can be provided to help performance; otherwise make one.
     if tree is None:
         tree = huffmanCoding.huffmanCodingTree(encodingWordUsage)

     # bitsNeededDictionary has the number of bits that each node of the tree
     # would encode.  This could be found by searching the tree, but that was
     # slow relative to the number of lookups required.
     bitsPerWord = tree.bitsNeededDictionary
     wordCount = 0
     bitsInHuffmanCoding = 0
     # .items() (rather than .iteritems()) runs on both Python 2 and 3.
     for word, storedCount in compressionUsage.items():
         # Note: don't use the additive smoothing for finding the ratio.
         occurrences = storedCount - 1
         bitsInHuffmanCoding += occurrences*bitsPerWord[word]
         wordCount += occurrences

     # (n - 1).bit_length() is exactly ceil(log2(n)) for n >= 1, without the
     # float rounding that math.log(n, 2) can show at exact powers of two.
     bitsInBlockCoding = wordCount*(distinctWords - 1).bit_length()
     if bitsInBlockCoding > 0:
         return float(bitsInHuffmanCoding)/float(bitsInBlockCoding)
     return 0
    if(len(sys.argv) >= 3):
        print "finding speech set"
        set = speechSet.SpeechSet(sys.argv[1])
        reverse = False
        if(len(sys.argv) == 4):
            reverse = True
        print "finding all filenames"
        if(reverse):#note that reversing the list puts it in chronological non-reverse
            listRecentFilenames = sorted(glob.glob(set.directoryPath+"/"+set.fileType),reverse = False)
        else:
            listRecentFilenames = sorted(glob.glob(set.directoryPath+"/"+set.fileType),reverse = True)
        print "finding word usage"
        wordUsage = determineWordUsage(set.wordUsage,int(sys.argv[2]))
        wordsSorted = sorted(wordUsage.iteritems(),key = operator.itemgetter(1))
        print "start finding coding tree "+str(time.clock())
        tree = huffmanCoding.huffmanCodingTree(wordUsage)
        print "bits of the longest codeword:"+str(wordsSorted[0])+", "+str(tree.bitsNeededDictionary[wordsSorted[0][0]])
        print "bits of the shortest codeword:"+str(wordsSorted[len(wordsSorted)-1])+", "+str(tree.bitsNeededDictionary[wordsSorted[len(wordsSorted)-1][0]])

        #print "bit the "+ str(tree.bitsNeededDictionary["the"])
        #print "bit a " + str(tree.bitsNeededDictionary["a"])
        #print "bit for " + str(tree.bitsNeededDictionary["for"])
        print "completed finding coding tree "+str(time.clock())
        file = None
        fileForWordCount = open("WordCount.txt","w+")
        if(reverse):
            file = open(sys.argv[2]+"LeastRecentSpeech.txt","w+")
        else:
            file = open(sys.argv[2]+"MostRecentSpeech.txt","w+")
        print "starting compression ratio, "+str(time.clock())
        for filename in listRecentFilenames: