def testReadWordsFromFile():
    """Exercise werthman_assgn1.readWordsFromFile against the small fixture files.

    The fixture files are (re)created through createWordAndHashtagFiles()
    (presumably it writes 'testwordlist.txt' and 'testhashtaglist.txt' --
    confirm against its definition). Both files are then read back and every
    returned entry is asserted to be one of the expected fixture values.
    Finally the word file is read again with a limit of 2 and the result is
    asserted to contain exactly two entries.

    NOTE: the membership check is one-directional; it does not verify that
    every expected value was actually returned.
    """
    createWordAndHashtagFiles()

    expected_words = ['moon', 'man', 'over', 'the', 'help', 'me', 't', 'fi']
    expected_hashtags = ['manoverthemoon', 'helpmeoverthere', 'findmefood', 'whaaaaat']

    # Hashtags are read first, then words, each capped at 20 entries.
    loaded_hashtags = werthman_assgn1.readWordsFromFile('testhashtaglist.txt', False, 20)
    loaded_words = werthman_assgn1.readWordsFromFile('testwordlist.txt', True, 20)

    # Everything that was loaded must come from the matching fixture list
    # (words are checked before hashtags).
    checks = (
        (loaded_words, expected_words),
        (loaded_hashtags, expected_hashtags),
    )
    for loaded, expected in checks:
        for entry in loaded:
            assertion(entry in expected, "Set contains {0}.".format(entry))

    # The third argument caps how many entries are returned.
    capped_words = werthman_assgn1.readWordsFromFile('testwordlist.txt', True, 2)
    assertion(len(capped_words) == 2, "Word list should be size 2.")
def testMinEditDistanceAlgo():
    """Check minEditDist / word error rate (WER) on the maxMatch test hashtags.

    The test word list and hashtags are read from disk, each hashtag is
    segmented with werthman_assgn1.maxMatch, and the result is compared word
    by word against the reference segmentation on the same line of
    'realtesthashtags.txt'. For the three known reference phrases the edit
    distance and the WER (edit distance divided by the number of reference
    words) are asserted, and finally the average WER over all reference
    lines is asserted.

    Reference lines that are not one of the three known phrases contribute
    nothing to the total but still count in the average's denominator.
    """
    # Retrieve the word list and hashtag list from the file system.
    wordlist = werthman_assgn1.readWordsFromFile("testwordlist.txt", True, 75000)
    hashtags = werthman_assgn1.readWordsFromFile("testhashtaglist.txt", False, 0)

    # Segment every hashtag with the maxmatch algorithm.
    maxmatchHashtags = [
        werthman_assgn1.maxMatch(hashtag, wordlist, "") for hashtag in hashtags
    ]

    # Read in what the hashtags should really look like, one per line.
    with open("realtesthashtags.txt", "r") as f:
        correctHashtags = [line.strip() for line in f]

    # reference phrase -> (expected distance, expected WER,
    #                      distance message, WER message)
    expectations = {
        # maxmatch output equals the reference, so no edits are needed.
        "man over the moon": (
            0,
            0,
            "Man over the moon should have a min edit distance of 0",
            "Man over the moon should have a WER of 0",
        ),
        # maxmatch output "fi ndmefood": two substitutions (fi -> find,
        # ndmefood -> me) and an insertion of food.
        "find me food": (
            7,
            7.0 / 3,
            "Find me food should have a min edit distance of 7",
            "Find me food should have a should have a WER of 7/3",
        ),
        # maxmatch output "help me over the re": a substitution
        # (the -> there) and a deletion of re.
        "help me over there": (
            3,
            3.0 / 4,
            "help me over there should have a min edit distance of 3",
            "help me over there should have a WER of 3/4",
        ),
    }

    # Compare each maxmatch hashtag to its reference hashtag word by word.
    totalWER = 0.0
    for maxmatchHashtag, correctHashtag in zip(maxmatchHashtags, correctHashtags):
        expected = expectations.get(correctHashtag)
        if expected is None:
            # Unknown reference phrase: nothing to assert, adds no WER.
            continue
        expectedDistance, expectedWER, distanceMessage, werMessage = expected

        # Convert each string to a list of its words.
        maxmatchHashtagAsList = maxmatchHashtag.split()
        correctHashtagAsList = correctHashtag.split()

        # Compute the distance once and reuse it for both assertions and the
        # running total.
        distance = werthman_assgn1.minEditDist(correctHashtagAsList, maxmatchHashtagAsList)
        # float() keeps the ratio from being truncated should this run under
        # Python 2 integer division with an integer distance.
        wer = float(distance) / len(correctHashtagAsList)

        assertion(distance == expectedDistance, distanceMessage)
        assertion(wer == expectedWER, werMessage)
        totalWER += wer

    # Average the WER across all of the reference hashtags.
    assertion(totalWER / len(correctHashtags) == ((7.0 / 3 + 3.0 / 4) / 3.0),
              "Average WER across test set should be (7/3 + 3/4)/3, about 1.03")
def finalTestMaxMatchAlgo():
    """Compare maxMatch output on the training hashtags with the reference file.

    Reads the big word list and the training hashtags, segments each hashtag
    with werthman_assgn1.maxMatch, and asserts that every result equals the
    line at the same position in 'hashtags-train-maxmatch.txt' (the
    professor-provided expected output).

    NOTE: the pairing uses zip(), so if the two sequences differ in length
    the surplus entries of the longer one are not compared.
    """
    # Load the dictionary and the hashtags to segment.
    dictionary = werthman_assgn1.readWordsFromFile('bigwordlist.txt', True, 75000)
    raw_hashtags = werthman_assgn1.readWordsFromFile('hashtags-train.txt', False, 0)

    # Run maxmatch over every hashtag.
    segmented = [werthman_assgn1.maxMatch(tag, dictionary, "") for tag in raw_hashtags]

    # Expected segmentations, one per line; strip() drops newlines and other
    # surrounding whitespace.
    with open('hashtags-train-maxmatch.txt', 'r') as reference_file:
        reference = [line.strip() for line in reference_file]

    message = "My maxmatch algo hashtag: {0} should be the same as the professor's maxmatch algo: {1}."
    for mine, theirs in zip(segmented, reference):
        assertion(mine == theirs, message.format(mine, theirs))