示例#1
0
def buildTables(s):
        """Read one training file and update the bigram tables and the corpus trie.

        The file is opened at ``"train" + path_separator + s`` and processed
        line by line.  Each whitespace-separated token is sanitized with
        ``sanitizeWord`` and then:

        * every character is passed to ``addObservation`` and counted in
          ``charbigrams`` together with the character before it (a single
          space stands in for "start of word");
        * the word is added to ``corpus`` via ``incrementInTrie``;
        * the (previous word, word) pair is counted in ``wordbigrams``;
        * the (mirror of word, word) pair is counted in ``tranbigrams``.

        The previous-word context is reset to a single space at the start of
        every line and for the word that follows a token for which
        ``isEndofSentence`` is true.  Totals and elapsed time are printed when
        the file has been consumed.

        Args:
            s: Name of the training file inside the ``train`` directory.
        """
        startTime = time.time()
        lineCount = 0
        wordCount = 0

        # The with-block guarantees the file is closed even when one of the
        # table helpers raises part-way through the corpus.
        with open("train" + path_separator + s, "r") as infile:
                ## careful of string overflows!
                for line in infile:
                        lineCount += 1
                        line = line.replace('\x97', ' ') # remove long dashes
                        firstword = ' '
                        secondword = ' '
                        endOfSentence = False

                        for w in line.split():
                                wordCount += 1
                                word = sanitizeWord(w)

                                # Slide the two-word window forward.
                                firstword = secondword
                                secondword = word
                                if endOfSentence:
                                        # The first word of a new sentence has no predecessor.
                                        firstword = ' '
                                        endOfSentence = False
                                # Sentence ends are detected on the raw token,
                                # not on the sanitized word.
                                if isEndofSentence(w):
                                        endOfSentence = True

                                ## - charbigrams -
                                firstletter = ' '
                                secondletter = ' '
                                for character in word:
                                        ## - obsbigrams -
                                        addObservation(character)
                                        firstletter = secondletter
                                        secondletter = character
                                        incrementInTable(firstletter, secondletter, charbigrams)

                                ## - corpus -
                                incrementInTrie(word, corpus)

                                ## - wordbigrams -
                                incrementInTable(firstword, secondword, wordbigrams)

                                ## - tranbigrams -
                                incrementInTable(mirror_functions.mirror(secondword), secondword, tranbigrams)

        # Single-argument print(...) yields the same text as the old
        # comma-separated print statement and is valid on Python 2 and 3.
        print("%s %s" % ("Lines Processed: ", lineCount))
        print("%s %s" % ("Words Processed: ", wordCount))
        print("%s %s" % ("Seconds: ", time.time() - startTime))
示例#2
0
def runTest(s,n):
        """Run the mirror-prediction test over one file and write a CSV report.

        Reads ``"test" + path_separator + s`` and, for every
        whitespace-separated token, sanitizes it with ``build.sanitizeWord``,
        computes its mirror with ``mirror_functions.mirror`` and requests
        candidates from ``mirror_functions.getTopNext``.  One row per token is
        written to ``results.csv`` in the ``test`` directory with the columns
        ``word,mirror,length,numMirrors,predIndx,correct``.  A token counts as
        correct when the sanitized word appears in the returned candidates;
        ``predIndx`` is then its index in that sequence.

        Timing and accuracy totals are printed after both files are closed.

        Args:
            s: Name of the test file inside the ``test`` directory.
            n: Forwarded unchanged as the third argument of ``getTopNext``
               (presumably the number of candidates to return -- confirm
               against mirror_functions).
        """
        startTime = time.time()

        ## Initialize testing counts
        wordCount = 0
        correctWords = 0
        # Per "word|index" hit counts; filled through incrementDict but not
        # read again inside this function.
        allRslts = {}

        # Nested with-blocks close (and flush) both files even if a helper
        # raises while the report is being written.
        with open("test" + path_separator + s, "r") as infile:
                with open("test" + path_separator + "results.csv", "w") as outfile:
                        outfile.write("word,mirror,length,numMirrors,predIndx,correct\n\n")

                        for line in infile:
                                for w in line.split():
                                        wordCount += 1
                                        word = build.sanitizeWord(w)
                                        mirror = mirror_functions.mirror(word)

                                        #specify how to get these results
                                        results = mirror_functions.getTopNext(mirror+'|','tran',n,"",True)

                                        outfile.write(word + ',' + mirror + ',' + str(len(word)) + ',' +
                                                      str(mirror_functions.mirrorCount(word)) + ',')

                                        # Correct prediction
                                        if word in results:
                                                correctWords += 1
                                                indx = results.index(word)
                                                incrementDict(word + '|' + str(indx), allRslts)
                                                outfile.write(str(indx) + ",1,\n")
                                        else:
                                                outfile.write(" ,0,\n")

        ## Print results
        # Single-argument print(...) yields the same text as the old
        # comma-separated print statement and is valid on Python 2 and 3.
        print("%s %s" % ("Seconds: ", time.time() - startTime))
        print("%s %s" % ("Total words tested:", wordCount))
        print("%s %s" % ("Total correctly predicted:", correctWords))
示例#3
0
def addObservation(c):
        """Count the pair (mirror of ``c``, ``c``) in the ``obsbigrams`` table.

        Args:
            c: The value to record; it is passed to ``mirror_functions.mirror``
               and then, with the result, to ``incrementInTable``.
        """
        incrementInTable(mirror_functions.mirror(c), c, obsbigrams)