# testYourOwnText.py -- 73 lines (57 loc), 2.08 KB
import models
import re
import enchant
import helpers
import time
# Interactive accuracy check for the sentence segmenter (Python 2 script).
#
# Reads a text file named by the user, splits it into sentences on periods,
# strips the spaces out of every sentence, asks models.NoSpaceText to put the
# spaces back, and reports the fraction of sentences that helpers.compare
# accepts -- once using naive word frequencies and once using transition
# frequencies.

# Adjacent string literals inside the parentheses are joined into one string,
# so the prompt is printed as a single line.
print("What text would you like to segment? Please type a file name. "
      "This should be a normal text file, as we will remove the spaces"
      " and then put them back to test accuracy.")
basetext = raw_input()

# start timing (after the prompt, so time spent typing is not counted)
start_time = time.time()

# read the whole file and close the handle as soon as we are done with it
with open(basetext) as infile:
    text = infile.read()

# NOTE(review): called with no arguments and its return value is unused;
# presumably it prepares the corpus read below -- confirm in helpers.
helpers.parseBaseText()

print("Segmenting Text...")

# replace every character that is not a letter, newline, apostrophe or
# period with a space
cleantext = re.sub('[^a-zA-Z\n\'.]', ' ', text)

# parse excerpt into list of sentences
# sentences holds the correct sentences that we will then
# compare the output of our algorithm with
# drop the last piece: it is whatever follows the final period
sentences = cleantext.split('.')[:-1]

# remove spaces (and everything else that is not a letter, apostrophe or
# period) from excerpt
nospaces = re.sub('[^a-zA-Z\'.]', '', text)

# drop the piece after the final period here as well, so this list lines up
# one-to-one with sentences
nospace_sentences = nospaces.split('.')[:-1]

# tally up sentences that are correct
tallyNaiveProb = 0
tallyTransProb = 0

# get frequencies from the reference corpus file
(freq_dict, normFactor) = helpers.getFreq("alphanumeric.txt")
(transition_freq_dict, transNormFactor) = helpers.getTransitionFreq("alphanumeric.txt")

# iterate over sentences; both lists were split on the same periods, so they
# have the same length and can be walked in lockstep
for (nospace_sentence, expected) in zip(nospace_sentences, sentences):
    # default max word length as 15
    mytext = models.NoSpaceText(nospace_sentence, 15)

    # set frequency dictionaries
    mytext.freq_dict = freq_dict
    mytext.normFactor = normFactor
    mytext.transition_freq_dict = transition_freq_dict
    mytext.transNormFactor = transNormFactor

    # find segmentation using naive frequencies
    mytext.dpGreedy()
    bestSeg = mytext.getBestSeg()
    if helpers.compare(bestSeg, expected):
        tallyNaiveProb += 1

    # find segmentation using transition frequencies
    mytext.dpGreedy(transFreq = True)
    bestSeg = mytext.getBestSeg()
    if helpers.compare(bestSeg, expected):
        tallyTransProb += 1

# compute accuracies; a file without any period yields no sentences, so
# report 0.0 instead of dividing by zero
total = len(sentences)
if total:
    naiveAccuracy = float(tallyNaiveProb) / total
    transAccuracy = float(tallyTransProb) / total
else:
    naiveAccuracy = 0.0
    transAccuracy = 0.0

# print results
print("Using Naive Frequencies:")
print(naiveAccuracy)
print("Using Transition Frequencies:")
print(transAccuracy)

# print total time
print("--- %s total seconds ---\n" % (time.time() - start_time))