def initialTestMaxMatchAlgo():
    """Check maxMatch against four hand-written hashtags and a tiny word list.

    Each hashtag is segmented with ``werthman_assgn1.maxMatch`` (called with
    the hashtag, the word list and an empty starting string) and the result
    is compared with the expected segmentation through
    ``assertion(condition, message)``.
    """
    vocabulary = ['moon', 'man', 'over', 'the', 'help', 'me', 't', 'fi']

    # One row per hashtag: (input, expected segmentation, failure message).
    # Rows are listed in the order in which they are checked.
    cases = (
        # Every piece of the hashtag is a word from the vocabulary.
        (
            "manoverthemoon",
            "man over the moon",
            "manoverthemoon should be changed to man over the moon.",
        ),
        # Vocabulary words followed by leftover single letters.
        (
            "helpmeoverthere",
            "help me over the r e",
            "helpmeoverthere should be changed to help me over the r e.",
        ),
        # A mix of vocabulary words ('fi', 'me') and single letters.
        (
            "findmefood",
            "fi n d me f o o d",
            "findmefood should be changed to fi n d me f o o d.",
        ),
        # Expected to fall apart entirely into single letters.
        (
            "whaaaaat",
            "w h a a a a a t",
            "whaaaaat should be changed to w h a a a a a t.",
        ),
    )

    for hashtag, expected, failureMessage in cases:
        segmented = werthman_assgn1.maxMatch(hashtag, vocabulary, "")
        assertion(segmented == expected, failureMessage)
def testMinEditDistanceAlgo():
    """Check minEditDist and the derived word error rate (WER) on the small test set.

    Reads the word list from ``testwordlist.txt`` and the hashtags from
    ``testhashtaglist.txt``, segments every hashtag with ``maxMatch``, and
    compares each segmentation word by word with the reference segmentation
    on the corresponding line of ``realtesthashtags.txt``.  For the three
    known references the edit distance and the WER (distance divided by the
    number of reference words) are checked; finally the WER summed over those
    references, divided by the number of reference lines, is checked.

    All checks are reported through ``assertion(condition, message)``.
    """
    # Reference segmentation -> (expected distance, distance message, WER message).
    # NOTE(review): 7 and 3 are consistent with unit-cost edits against the
    # segmentations "fi n d me f o o d" and "help me over the r e"; this depends
    # on the contents of testwordlist.txt, which is not visible here -- confirm.
    expectations = {
        "man over the moon": (
            0,
            "Man over the moon should have a min edit distance of 0",
            "Man over the moon should have a WER of 0",
        ),
        "find me food": (
            7,
            "Find me food should have a min edit distance of 7",
            "Find me food should have a should have a WER of 7/3",
        ),
        "help me over there": (
            3,
            "help me over there should have a min edit distance of 3",
            "help me over there should have a WER of 3/4",
        ),
    }
    # WER values are computed floats, so compare with a small tolerance
    # instead of exact equality.
    tolerance = 1e-9

    # Retrieve the word list and the hashtag list from the file system.
    wordlist = werthman_assgn1.readWordsFromFile("testwordlist.txt", True, 75000)
    hashtags = werthman_assgn1.readWordsFromFile("testhashtaglist.txt", False, 0)

    # Segment every hashtag with the maxmatch algorithm.
    maxmatchHashtags = [
        werthman_assgn1.maxMatch(hashtag, wordlist, "") for hashtag in hashtags
    ]

    # Read what the hashtags should really look like, one per line.
    with open("realtesthashtags.txt", "r") as f:
        correctHashtags = [line.strip() for line in f]

    totalWER = 0.0
    for maxmatchHashtag, correctHashtag in zip(maxmatchHashtags, correctHashtags):
        # Only the three known references are checked and counted.
        if correctHashtag not in expectations:
            continue
        expectedDistance, distanceMessage, werMessage = expectations[correctHashtag]

        # Compare word by word, so convert each string to its list of words.
        correctWords = correctHashtag.split()
        maxmatchWords = maxmatchHashtag.split()

        # Compute the distance once and reuse it for both checks and the total.
        distance = werthman_assgn1.minEditDist(correctWords, maxmatchWords)
        # float() keeps this a true division even where "/" on two ints truncates.
        wer = float(distance) / len(correctWords)
        expectedWER = float(expectedDistance) / len(correctWords)

        assertion(distance == expectedDistance, distanceMessage)
        assertion(abs(wer - expectedWER) < tolerance, werMessage)
        totalWER += wer

    # Average the WER across all of the reference hashtags.
    expectedAverage = (7.0 / 3 + 3.0 / 4) / 3.0
    assertion(
        abs(totalWER / len(correctHashtags) - expectedAverage) < tolerance,
        "Average WER across test set should be (7/3 + 3/4)/3, about 1.03",
    )
def finalTestMaxMatchAlgo():
    """Compare maxMatch output on the training hashtags with the provided expected output.

    The word list comes from ``bigwordlist.txt`` and the hashtags from
    ``hashtags-train.txt``.  Each hashtag is segmented with
    ``werthman_assgn1.maxMatch`` and compared, pair by pair, with the
    whitespace-stripped lines of ``hashtags-train-maxmatch.txt``.  Every
    comparison is reported through ``assertion(condition, message)``.
    """
    # Load the word list and the raw hashtags from the file system.
    vocabulary = werthman_assgn1.readWordsFromFile('bigwordlist.txt', True, 75000)
    rawHashtags = werthman_assgn1.readWordsFromFile('hashtags-train.txt', False, 0)

    # Segment every hashtag with the maxmatch algorithm.
    produced = [werthman_assgn1.maxMatch(tag, vocabulary, "") for tag in rawHashtags]

    # Expected segmentations from the professor's file, without the
    # surrounding whitespace (such as the trailing newline).
    with open('hashtags-train-maxmatch.txt', 'r') as expectedFile:
        expected = [row.strip() for row in expectedFile]

    failureTemplate = "My maxmatch algo hashtag: {0} should be the same as the professor's maxmatch algo: {1}."

    # zip pairs the two lists positionally and stops at the shorter one.
    for mine, theirs in zip(produced, expected):
        assertion(mine == theirs, failureTemplate.format(mine, theirs))