# `sentences` keeps the ground-truth sentences that the algorithm's output
# is later compared against; splitting on '.' leaves an empty string after
# the final period, so it is dropped with [:-1].
sentences = cleantext.split('.')[:-1]

# Build the space-free version of the excerpt: keep only letters,
# apostrophes and periods.
nospaces = re.sub('[^a-zA-Z\'.]', '', text)
# Same trailing-empty-string trim as above.
nospace_sentences = nospaces.split('.')[:-1]

# Counters for sentences each scoring model segments correctly.
tallyNaiveProb = 0
tallyTransProb = 0

# Letter and letter-transition frequency tables estimated from the base corpus.
(freq_dict, normFactor) = helpers.getFreq("alphanumeric.txt")
(transition_freq_dict, transNormFactor) = helpers.getTransitionFreq("alphanumeric.txt")

# Segment every space-free sentence.
for (idx, sentence) in enumerate(nospace_sentences):
    # 15 is the default maximum word length considered by the segmenter.
    mytext = models.NoSpaceText(sentence, 15)
    # Attach the precomputed frequency tables to the model instance.
    mytext.freq_dict = freq_dict
    mytext.normFactor = normFactor
    mytext.transition_freq_dict = transition_freq_dict
    mytext.transNormFactor = transNormFactor
    # Segment using naive single-letter frequencies.
    mytext.dpGreedy()
def initalizeFrequencies(self, basetext="alphanumeric.txt"):
    """Load letter and transition frequency tables from *basetext*.

    Bug fix: the original signature omitted ``self`` even though the body
    assigns ``self.freq_dict`` etc., so every call raised ``NameError``.
    Adding ``self`` makes it usable as the instance method it was clearly
    meant to be. The (misspelled) public name is kept for compatibility.

    Args:
        basetext: Path of the corpus used to estimate frequencies.

    Returns:
        Tuple ``(freq_dict, normFactor, transition_freq_dict,
        transNormFactor)`` — the same objects stored on the instance.
    """
    # Single-letter frequencies plus their normalization factor.
    (self.freq_dict, self.normFactor) = helpers.getFreq(basetext)
    # Letter-to-letter transition frequencies plus normalization factor.
    (self.transition_freq_dict, self.transNormFactor) = helpers.getTransitionFreq(basetext)
    return (self.freq_dict, self.normFactor,
            self.transition_freq_dict, self.transNormFactor)