-
Notifications
You must be signed in to change notification settings - Fork 0
/
cnf.py
139 lines (115 loc) · 3.94 KB
/
cnf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
import sys, re, random
import nltk
from nltk import grammar
# Module-level grammar state: filled in by get_cfg() and modified in place
# by the conversion steps further down in this file.
N = [] #nonterminal symbols seen so far (also used to keep generated names unique)
sigma = [] #terminal symbols
R = [] #production rules of the grammar being converted
marked_for_deletion = [] #rules queued for removal once the current pass is done
def get_cfg(filename):
    """Load the CFG stored at *filename* into the module-level globals.

    Every production is appended to ``R``.  The symbols on its left- and
    right-hand sides are collected into ``N`` (nonterminals) and ``sigma``
    (terminals); both lists are de-duplicated once all rules are read.
    """
    global N, sigma, R
    cfg = nltk.data.load(filename)
    for production in cfg.productions():
        N.append(production.lhs().symbol())
        for symbol in production.rhs():
            if not grammar.is_terminal(symbol):
                N.append(symbol.symbol())
            else:
                sigma.append(symbol)
        R.append(production)
    # Round-trip through a set to drop repeated symbols.
    N = list(set(N))
    sigma = list(set(sigma))
def is_cnf(production):
    """Return True when *production* already has a Chomsky Normal Form shape.

    Accepted shapes are a single terminal (``A -> a``) or exactly two
    nonterminals (``A -> B C``); anything else yields False.
    """
    symbols = production.rhs()
    count = len(symbols)
    if count == 1:
        return grammar.is_terminal(symbols[0])
    if count == 2:
        first, second = symbols
        return grammar.is_nonterminal(first) and grammar.is_nonterminal(second)
    return False
def is_unit(production):
    """Return True for a unit production, i.e. one of the form ``A -> B``."""
    body = production.rhs()
    return len(body) == 1 and grammar.is_nonterminal(body[0])
def repl_unit(A, B):
    """Expand the unit production ``A -> B`` into the global rule list ``R``.

    For every rule ``B -> rhs`` currently in ``R``, a rule ``A -> rhs`` is
    appended unless an equal rule is already present.  The unit production
    itself is not removed here.

    A: left-hand side nonterminal of the unit production.
    B: the nonterminal on its right-hand side.
    """
    if A == B:
        # A -> A contributes nothing, and copying A's own rules onto A would
        # make a scan over the live list chase its own appends forever.
        return
    # Iterate over a snapshot so the appends below cannot extend the loop.
    for rule in list(R):
        if rule.lhs() == B:
            replacement = grammar.Production(A, rule.rhs())
            # Re-adding an existing rule only bloats the grammar.
            if replacement not in R:
                R.append(replacement)
def remove_rhs_terminals(production):
    """Swap terminals in a multi-symbol right-hand side for dummy nonterminals.

    Example: ``A -> B b`` becomes ``A -> B C`` plus a new rule ``C -> b``.
    Productions whose right-hand side has at most one symbol are left alone.
    The rewritten production replaces the original inside the global ``R``.
    """
    symbols = production.rhs()
    if len(symbols) <= 1:
        return
    rewritten = []
    for symbol in symbols:
        if not grammar.is_terminal(symbol):
            rewritten.append(symbol)
            continue
        # Introduce a fresh nonterminal plus the rule deriving the terminal.
        stand_in = grammar.Nonterminal(create_nonterminal())
        rewritten.append(stand_in)
        R.append(grammar.Production(stand_in, (symbol,)))
    # Overwrite the original rule in place.
    updated = grammar.Production(production.lhs(), tuple(rewritten))
    position = R.index(production)
    R[position] = updated
def create_nonterminal():
    """Return a fresh nonterminal name and register it in the global ``N``.

    Names are an underscore followed by three random uppercase ASCII letters
    (e.g. ``_QXB``).  Candidates already present in ``N`` are rejected and
    redrawn, so the returned name is unique within ``N``.
    """
    while True:
        # chr() replaces the Python-2-only unichr(), which raises NameError
        # on Python 3; for 'A'..'Z' both yield the same one-letter string.
        letters = [chr(random.randint(ord('A'), ord('Z'))) for _ in range(3)]
        candidate = "_" + "".join(letters)
        if candidate not in N:
            N.append(candidate)
            return candidate
def shorten(production):
    """Split a production with more than two right-hand symbols in two.

    ``A -> X1 X2 ... Xn`` (n > 2) is queued in ``marked_for_deletion`` and
    the rules ``A -> X1 T`` and ``T -> X2 ... Xn`` are appended to ``R``,
    where ``T`` is a newly created nonterminal.  Productions with at most
    two right-hand symbols are ignored.
    """
    head = production.lhs()
    body = production.rhs()
    if len(body) <= 2:
        return
    helper = grammar.Nonterminal(create_nonterminal())
    first_half = grammar.Production(head, (body[0], helper))
    # Everything after the first symbol moves under the helper nonterminal.
    second_half = grammar.Production(helper, tuple(body[1:]))
    marked_for_deletion.append(production)
    R.extend([first_half, second_half])
def main():
    """Convert the grammar named by ``sys.argv[1]`` to CNF and print it.

    Steps: load the grammar, replace terminals inside multi-symbol
    right-hand sides with dummy nonterminals, eliminate unit productions,
    then split right-hand sides longer than two symbols.  The result is
    printed as a ``%start`` line followed by one production per line; a
    warning line is printed first when NLTK does not consider the result to
    be in Chomsky Normal Form.

    Exits with a usage message when no grammar file is given.
    """
    global R, marked_for_deletion
    if len(sys.argv) < 2:
        sys.exit("usage: cnf.py GRAMMAR_FILE")
    source = sys.argv[1]
    get_cfg(source)

    #replace rhs terminals with dummy nonterminals
    # (R grows during this loop; the appended rules have a one-symbol
    # right-hand side, which remove_rhs_terminals leaves untouched)
    for rule in R:
        remove_rhs_terminals(rule)

    #eliminate unit productions
    marked_for_deletion = []
    expanded = set()  # (A, B) pairs whose unit rule A -> B was already expanded
    for rule in R:
        # R grows during this loop on purpose: unit rules produced by an
        # expansion are appended and handled by a later iteration.
        if not is_unit(rule):
            continue
        marked_for_deletion.append(rule)
        A = rule.lhs()
        B = rule.rhs()[0]
        # Expanding A -> A, or the same pair twice, adds nothing new; with
        # cyclic unit rules (A -> B, B -> A) it would keep this loop running
        # forever.
        if A == B or (A, B) in expanded:
            continue
        expanded.add((A, B))
        repl_unit(A, B)
    for rule in marked_for_deletion:
        R.remove(rule)

    #shorten long rules
    marked_for_deletion = []
    for rule in R:
        # The remainder rule appended by shorten() is revisited later in this
        # same loop and split again if it is still too long.
        shorten(rule)
    for rule in marked_for_deletion:
        R.remove(rule)

    # get_cfg() does not hand back the grammar object, so load it again to
    # read the start symbol.
    start = nltk.data.load(source).start()
    new_grammar = grammar.CFG(start, R)
    if not new_grammar.is_chomsky_normal_form():
        print("warning!: it did not work!")
    print("%start "+new_grammar.start().symbol())
    for p in new_grammar.productions():
        print(p)
# Run the conversion only when executed as a script, so that importing this
# module (e.g. to reuse its helpers) does not read sys.argv or print anything.
if __name__ == "__main__":
    main()