-
Notifications
You must be signed in to change notification settings - Fork 0
/
makeFeatureFile.py
executable file
·106 lines (99 loc) · 3.3 KB
/
makeFeatureFile.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
from CorpusReader import CorpusReader
from levenshtein import levenshtein as lev
from collections import Counter
from pdb import set_trace
from Header import Header
# Registry of feature names; the fixed features and every token are added to
# it further down with header.add(...).
header = Header()

# Verb endings that checkSuff matches with str.endswith.
# NOTE(review): 'ead ar' contains a space -- possibly meant to be 'eadar'.
# Tokens ending in 'eadar' are still matched through 'adar'; confirm intent.
SUFF = ('eas', 'is', 'ais', 'as', 'eamar', 'amar', 'eabhair', 'abhair', 'ead ar', 'adar')

# Mode switches: balance the three corpora by book count and/or by
# sentence count.
books = False
sentences = True

print("Loading Corpora...")

if books:
    # Munster is loaded first; its book count is passed to the other two
    # readers and used to truncate them.
    print("\tloading munster")
    M = CorpusReader('munster')

    print("\tloading connacht")
    C = CorpusReader('connacht', M.countBooks())
    C.truncateBooks(M.countBooks())

    print("\tloading ulster")
    U = CorpusReader('ulster', M.countBooks())
    U.truncateBooks(M.countBooks())

    l = [U, M, C]

if sentences:
    print("Creating Balanced Set of sentences")
    M = CorpusReader('munster')
    C = CorpusReader('connacht')
    U = CorpusReader('ulster')
    l = [U, M, C]

    # Cut every corpus down to the sentence count of the smallest one.
    MIN_LENG = min(corpus.countSentences() for corpus in l)
    for corpus in l:
        corpus.truncateSentences(MIN_LENG)

print("Done!")
def makeRow(cnt, header, dialect):
    """Format one sentence's feature counts as a sparse text row.

    The row is ``<label> <key>:<value> <key>:<value> ...`` followed by a
    newline, with the pairs ordered by the integer value of their key and
    each value rendered with ``%f``.

    cnt     -- mapping of feature key to count; keys must be int-convertible.
    header  -- accepted for interface compatibility; not used here.
    dialect -- 'M', 'C' or 'U' become the labels 1, 2 and 3; any other
               value is used as the label unchanged.
    """
    label = dialect
    for number, letter in enumerate(('M', 'C', 'U'), 1):
        if dialect == letter:
            label = number
            break

    ordered = sorted(cnt.items(), key=lambda pair: int(pair[0]))
    features = ["%s:%f" % (key, value) for key, value in ordered]
    return str(label) + " " + ' '.join(features) + '\n'
def isVerb(POS):
    """Return True if any '|'-separated alternative in POS starts with 'V'."""
    return any(tag.startswith('V') for tag in POS.split('|'))
def checkSuff(sentence):
    """Count the verbs in sentence whose token ends with one of SUFF.

    sentence is a sequence of word mappings with 'POS' and 'token' entries.
    A word counts when isVerb accepts its 'POS' and its 'token' ends with
    any of the endings in the module-level SUFF tuple.
    """
    return sum(
        1
        for word in sentence
        if isVerb(word['POS']) and word['token'].endswith(SUFF)
    )
def checkLenitedN(sentence):
    """Count initial mutations on the third word of S-T-N tag sequences.

    For every word whose 'POS' starts with 'S', followed by a word whose
    'POS' starts with 'T' and then one whose 'POS' starts with 'N'
    (presumably adposition, article, noun in the corpus tagset -- confirm),
    the third word's lower-cased token is compared with its lower-cased
    lemma stem (the part of 'lemma' before the first '-'):

    * first letters differ          -> counted as eclipsis
    * same first letter and the
      token's second letter is 'h'  -> counted as lenition

    Words whose token or lemma stem is empty are skipped, and a
    one-character token is never counted as lenited; these cases used to
    raise IndexError.

    sentence -- sequence of word mappings with 'POS', 'token' and 'lemma'.
    Returns the tuple (eclipsis_count, lenition_count).
    """
    eclip = 0
    lenited = 0
    leng = len(sentence)
    for i, word in enumerate(sentence):
        # Need an 'S' word with two more words after it.
        if not word["POS"].startswith('S') or i + 2 >= leng:
            continue
        if not sentence[i + 1]['POS'].startswith("T"):
            continue
        noun = sentence[i + 2]
        if not noun['POS'].startswith('N'):
            continue

        token = noun['token'].lower()
        stem = noun['lemma'].split('-')[0].lower()
        # Nothing to compare, or the surface form equals the lemma.
        if not token or not stem or token == stem:
            continue

        if token[0] != stem[0]:
            eclip += 1
        elif token[1:2] == 'h':
            # The slice is safe for one-character tokens.
            lenited += 1
    return (eclip, lenited)
# NOTE(review): featureFile is not read or written again in the visible part
# of this script; kept in case something else relies on the name.
featureFile = []
print("Creating Feature File")

# Register the fixed features before any token so they are added to the
# header first. (header presumably maps a feature name to the numeric key
# that makeRow sorts on -- confirm against Header.)
header.add("EMPTY")
header.add("ECLIPSIS")
header.add("LENITION")
header.add("SUFFIX_COUNT")

with open('featureFile.dat', 'w') as fo:
    i = 0  # sentences processed so far; echoed as progress output
    for dialect in l:
        for book in dialect:
            for sentence in book['sentences']:
                cnt = Counter()

                # Hand-crafted features.
                suff = checkSuff(sentence)
                cnt[header["SUFFIX_COUNT"]] = suff
                eclipsis, lenition = checkLenitedN(sentence)
                cnt[header["ECLIPSIS"]] = eclipsis
                cnt[header["LENITION"]] = lenition

                # One count feature per token, for every word in the sentence.
                for word in sentence:
                    header.add(word['token'])
                    cnt[header[word['token']]] += 1

                # The three assignments above always leave keys in cnt, so
                # this guard looks unreachable; kept as a safety net.
                if not cnt:
                    cnt[header['EMPTY']] = 0

                i += 1
                print(str(i))

                try:
                    s = makeRow(cnt, header, book['dialect'][0])
                except Exception:
                    # Debugging hook from the original. Re-raise afterwards:
                    # falling through would write the previous sentence's
                    # row again (or hit a NameError on the first sentence).
                    set_trace()
                    raise
                fo.write(s)
print("Done!")