def buildTables(s):
    """Populate the language-model tables from one training file.

    Reads ``"train" + path_separator + s`` line by line and, for every
    whitespace-separated token, updates the module-level tables:

    * ``obsbigrams``  - through ``addObservation`` for each character,
    * ``charbigrams`` - (previous character, character) pairs per word,
    * ``corpus``      - the sanitized word, through ``incrementInTrie``,
    * ``wordbigrams`` - (previous word, word) pairs,
    * ``tranbigrams`` - (mirror of word, word) pairs.

    Word-bigram context starts as ``' '`` on every line and is reset to
    ``' '`` for the word following a token for which ``isEndofSentence``
    is true.  A short report (lines, words, elapsed seconds) is printed
    when the file has been consumed.

    Args:
        s: Name of the training file inside the ``train`` directory.

    Returns:
        None.

    Raises:
        IOError: If the training file cannot be opened.
    """
    startTime = time.time()
    lineCount = 0
    wordCount = 0

    # The context manager closes the file even when a helper raises
    # midway through the corpus.
    with open("train" + path_separator + s, "r") as infile:
        ## careful of string overflows!
        for line in infile:
            lineCount += 1
            line = line.replace('\x97', ' ')  # remove long dashes

            # Word-bigram context does not carry over between lines.
            secondword = ' '
            endOfSentence = False

            for w in line.split():
                wordCount += 1
                word = sanitizeWord(w)

                # The word after a sentence end is paired with ' '
                # rather than with the word that closed the sentence.
                firstword = ' ' if endOfSentence else secondword
                secondword = word
                # Sentence detection looks at the raw token, not the
                # sanitized word.
                endOfSentence = bool(isEndofSentence(w))

                ## - charbigrams -
                firstletter = ' '
                secondletter = ' '
                for character in word:
                    ## - obsbigrams -
                    addObservation(character)
                    firstletter = secondletter
                    secondletter = character
                    incrementInTable(firstletter, secondletter, charbigrams)

                ## - corpus -
                incrementInTrie(word, corpus)

                ## - wordbigrams -
                incrementInTable(firstword, secondword, wordbigrams)

                ## - tranbigrams -
                incrementInTable(mirror_functions.mirror(word), word,
                                 tranbigrams)

    # Single-argument print(...) emits the same text under the Python 2
    # print statement and the Python 3 print function.
    print("Lines Processed: " + " " + str(lineCount))
    print("Words Processed: " + " " + str(wordCount))
    print("Seconds: " + " " + str(time.time() - startTime))
def runTest(s, n):
    """Score mirror-based word prediction against one test file.

    Reads ``"test" + path_separator + s`` and writes one row per
    whitespace-separated token to ``"test" + path_separator +
    "results.csv"``.  Each row holds the sanitized word, its mirror, the
    word length, ``mirror_functions.mirrorCount(word)``, and then either
    the index of the word in the prediction list followed by ``1``, or a
    blank index followed by ``0``.  Totals are printed at the end.

    Args:
        s: Name of the test file inside the ``test`` directory.
        n: Passed through as the third argument of
            ``mirror_functions.getTopNext`` (presumably the number of
            candidates to return - confirm against that function).

    Returns:
        None.

    Raises:
        IOError: If the test file cannot be opened or the results file
            cannot be created.
    """
    startTime = time.time()

    ## Initialize testing counts
    wordCount = 0
    correctWords = 0
    # Tally keyed by "word|index"; it is filled in below but not read
    # again inside this function.
    allRslts = {}

    # Nested context managers close both files even if sanitizing,
    # predicting or writing raises.  The input is opened first so a
    # missing test file does not create/truncate results.csv.
    with open("test" + path_separator + s, "r") as infile:
        with open("test" + path_separator + "results.csv", "w") as outfile:
            outfile.write("word,mirror,length,numMirrors,predIndx,correct\n\n")

            for line in infile:
                for w in line.split():
                    wordCount += 1
                    word = build.sanitizeWord(w)
                    mirror = mirror_functions.mirror(word)

                    #specify how to get these results
                    results = mirror_functions.getTopNext(
                        mirror + '|', 'tran', n, "", True)

                    outfile.write(word + ',' + mirror + ',' +
                                  str(len(word)) + ',' +
                                  str(mirror_functions.mirrorCount(word)) +
                                  ',')

                    # Correct prediction
                    if word in results:
                        correctWords += 1
                        indx = results.index(word)
                        incrementDict(word + '|' + str(indx), allRslts)
                        outfile.write(str(indx) + ",1,\n")
                    else:
                        outfile.write(" ,0,\n")

    ## Print results
    # Single-argument print(...) emits the same text under the Python 2
    # print statement and the Python 3 print function.
    print("Seconds: " + " " + str(time.time() - startTime))
    print("Total words tested:" + " " + str(wordCount))
    print("Total correctly predicted:" + " " + str(correctWords))
def addObservation(c):
    """Count one (mirror of ``c``, ``c``) pair in ``obsbigrams``.

    Args:
        c: The value to record; it is handed to
            ``mirror_functions.mirror`` and used as the second key of
            the table entry.

    Returns:
        None.
    """
    incrementInTable(mirror_functions.mirror(c), c, obsbigrams)