def makeWordFreqList(filename):
    """Build a word -> occurrence-count table for the corpus in *filename*.

    The corpus is split into sentences with ``sensplit.sen_splitter`` and
    each sentence is split into tokens on whitespace.  Every token is
    lower-cased and passed through ``removePunctuation``; tokens that
    ``isStopWord`` rejects, or that are empty after stripping whitespace,
    are not counted.

    Returns a dict mapping each surviving token to the number of times it
    was seen.
    """
    counts = {}
    for sentence in sensplit.sen_splitter(filename):
        for rawWord in sentence.split():
            # Normalise first so the stop-word comparison is case- and
            # punctuation-insensitive.
            word = removePunctuation(rawWord.lower())
            # Skip stop words (function words, etc.) and tokens that were
            # nothing but punctuation/whitespace.
            if isStopWord(word) or len(word.strip()) == 0:
                continue
            counts[word] = counts.get(word, 0) + 1
    return counts
def makeBigramFreqList(filename):
    """Build a Good-Turing-adjusted bigram frequency table for *filename*.

    Bigrams are pairs of adjacent content words inside one sentence (they
    never cross a sentence boundary).  Words are lower-cased, run through
    ``removePunctuation`` and dropped when ``isStopWord`` rejects them or
    they are empty, exactly as in ``makeWordFreqList``.  Keys have the form
    ``"word1 word2"``.

    Every combination of two vocabulary words is seeded with a count of 0
    so that unseen bigrams also receive an adjusted frequency.

    Returns the dict of bigram -> adjusted frequency, or ``[]`` when the
    sentence splitter yields nothing.

    NOTE(review): a later definition with this same name exists in the file
    and will shadow this one at import time.
    NOTE(review): the empty-corpus return value is a list while the normal
    return value is a dict -- confirm what callers expect before changing.
    """
    sentences = sensplit.sen_splitter(filename)
    # Nothing to do if the splitter produced no sentences (the original
    # notes say this covers the file-not-found case).
    if len(sentences) == 0:
        return []

    vocabulary = makeWordFreqList(filename)

    # Seed the table with every ordered pair of vocabulary words at count 0.
    bigramCounts = {
        first + " " + second: 0
        for first in vocabulary
        for second in vocabulary
    }

    # Count observed bigrams with a one-word sliding window per sentence.
    for sentence in sentences:
        previous = None  # last content word seen in this sentence, if any
        for rawWord in sentence.split():
            word = removePunctuation(rawWord.lower())
            if isStopWord(word) or len(word.strip()) == 0:
                continue
            if previous is not None:
                key = previous + ' ' + word
                bigramCounts[key] = bigramCounts.get(key, 0) + 1
            # The current word becomes the first half of the next pair.
            previous = word

    # Good-Turing discounting:
    #     c* = (c + 1) * N(c + 1) / N(c)
    # where N(c) is the number of bigrams whose raw count is c.
    # Caveat carried over from the original notes: when N(c + 1) is 0 the
    # adjusted count collapses to 0, so tiny corpora give poor results.
    # Source: http://www.ee.ucla.edu/~weichu/htkbook/node214_mn.html
    # presumably stats[c] == N(c), indexed by raw count -- confirm against
    # queryBigramStats.
    stats = queryBigramStats(bigramCounts)
    adjusted = copy.deepcopy(stats)
    # Stop one short of the end because each step reads stats[c + 1].
    # NOTE(review): the final slot therefore keeps its copied raw statistic
    # rather than a smoothed value.
    for c in range(len(stats) - 1):
        if stats[c] == 0:
            # Avoid dividing by zero.
            adjusted[c] = 0
        else:
            adjusted[c] = float((c + 1) * stats[c + 1]) / stats[c]

    # Swap each raw count for its adjusted value; the raw count is the index.
    for key in bigramCounts:
        bigramCounts[key] = adjusted[bigramCounts[key]]

    return bigramCounts
def makeBigramFreqList(filename):
    """Return a bigram -> Good-Turing-adjusted frequency dict for *filename*.

    Sentences come from ``sensplit.sen_splitter``; bigrams are formed only
    from neighbouring content words within a single sentence.  A content
    word is a whitespace-delimited token that, after lower-casing and
    ``removePunctuation``, is non-empty and not flagged by ``isStopWord``.
    Keys are the two words joined by a single space.

    All ordered pairs of words from ``makeWordFreqList`` are pre-registered
    with a count of 0 so unseen pairs take part in the smoothing.

    Returns ``[]`` (not a dict) when the splitter yields no sentences.

    NOTE(review): an earlier definition with this same name exists in the
    file; this later one is the binding that remains after import.
    NOTE(review): list-vs-dict return types differ between the empty and
    normal paths -- confirm caller expectations before unifying them.
    """
    corpusSentences = sensplit.sen_splitter(filename)
    # Bail out early when there is no text (original notes: file not found).
    if len(corpusSentences) == 0:
        return []

    knownWords = makeWordFreqList(filename)

    # Register every ordered word pair up front with a zero count.
    table = {}
    for left in knownWords:
        for right in knownWords:
            table[left + " " + right] = 0

    for sen in corpusSentences:
        # First reduce the sentence to its content words ...
        contentWords = []
        for piece in sen.split():
            cleaned = removePunctuation(piece.lower())
            if not isStopWord(cleaned) and len(cleaned.strip()) > 0:
                contentWords.append(cleaned)
        # ... then count each pair of neighbours.
        for left, right in zip(contentWords, contentWords[1:]):
            pairKey = left + ' ' + right
            table[pairKey] = table.get(pairKey, 0) + 1

    # Smoothing with the Good-Turing discount formula:
    #     c* = (c + 1) * NumBigramsOfFreq(c + 1) / NumBigramsOfFreq(c)
    # Known weakness (from the original notes): if NumBigramsOfFreq(c + 1)
    # is 0 the adjusted frequency becomes 0, which hurts on small data sets.
    # Source: http://www.ee.ucla.edu/~weichu/htkbook/node214_mn.html
    # presumably freqOfFreqs[c] is the number of bigrams with raw count c --
    # confirm against queryBigramStats.
    freqOfFreqs = queryBigramStats(table)
    smoothed = copy.deepcopy(freqOfFreqs)
    lastIndex = len(freqOfFreqs) - 1
    # Only indices below lastIndex are recomputed because the formula needs
    # the entry at c + 1.
    # NOTE(review): smoothed[lastIndex] keeps the value copied from
    # freqOfFreqs rather than a discounted count.
    for c in range(0, lastIndex):
        denominator = freqOfFreqs[c]
        # Guard against division by zero.
        smoothed[c] = (
            float((c + 1) * freqOfFreqs[c + 1]) / denominator
            if denominator != 0
            else 0
        )

    # The old raw count doubles as the index of its replacement value.
    for pairKey, rawCount in list(table.items()):
        table[pairKey] = smoothed[rawCount]

    return table