def generate_events_grammar(attribute, parent, phase):
    """Build a CFG that generates a yes/no question about an incident's events.

    Parameters
    ----------
    attribute : str
        Terminal naming the attribute asked about.
    parent : str or None
        Terminal naming the parent event; when ``None`` the generic phrase
        ``'events that caused the incident'`` is used instead.
    phase : int
        ``1`` yields the affirmative form (``'included'``); any other value
        yields the negated form (``'did not include'``).

    Returns
    -------
    nltk.CFG
        Grammar whose start symbol ``S`` derives the full question.
    """
    gr = [
        Production(Nonterminal('S'), (Nonterminal('AUX1'), )),
        Production(Nonterminal('AUX1'), ('Do', Nonterminal('S1'))),
        Production(Nonterminal('S1'), ('you', Nonterminal('V1'))),
        Production(Nonterminal('V1'), ('think', Nonterminal('ART'))),
        Production(Nonterminal('ATTR'), (attribute, Nonterminal('END'))),
        Production(Nonterminal('END'), ('?', )),
        # The article production was duplicated identically in both branches of
        # the parent check below; it is unconditional, so build it once here.
        Production(Nonterminal('ART'), ('the', Nonterminal('PAR'))),
    ]
    if parent is not None:
        par = Production(Nonterminal('PAR'), (parent, Nonterminal('V2')))
    else:
        par = Production(
            Nonterminal('PAR'),
            ('events that caused the incident', Nonterminal('V2')))
    if phase == 1:
        v2 = Production(Nonterminal('V2'), ('included', Nonterminal('ATTR')))
    else:
        v2 = Production(Nonterminal('V2'),
                        ('did not include', Nonterminal('ATTR')))
    gr.append(par)
    gr.append(v2)
    return CFG(Nonterminal('S'), gr)
def generate_sources_grammar(attribute, parent, phase):
    """Build a CFG that generates a yes/no question about an incident's sources.

    Parameters
    ----------
    attribute : str
        Terminal naming the attribute asked about.
    parent : str or None
        Terminal naming the parent source; when ``None`` the generic terminal
        ``'sources'`` is used instead.
    phase : int
        ``1`` yields the affirmative form (``'included'``); any other value
        yields the negated form (``'did not include'``).

    Returns
    -------
    nltk.CFG
        Grammar whose start symbol ``S`` derives the full question.
    """
    gr = [
        Production(Nonterminal('S'), (Nonterminal('AUX1'), )),
        Production(Nonterminal('AUX1'), ('Do', Nonterminal('S1'))),
        Production(Nonterminal('S1'), ('you', Nonterminal('V1'))),
        Production(Nonterminal('V1'), ('think', Nonterminal('ART'))),
        Production(Nonterminal('ATTR'), (attribute, Nonterminal('END'))),
        Production(Nonterminal('END'), ('?', )),
    ]
    if phase == 1:
        v2 = Production(Nonterminal('V2'), ('included', Nonterminal('ATTR')))
    else:
        # BUG FIX: the original terminal was "didn´t include" — a mis-encoded
        # acute accent instead of an apostrophe — and was also inconsistent
        # with generate_events_grammar, which uses "did not include".
        v2 = Production(Nonterminal('V2'),
                        ('did not include', Nonterminal('ATTR')))
    # Renamed the local from `parent` so the parameter is no longer shadowed.
    if parent is None:
        article = Production(Nonterminal('ART'), ('the', Nonterminal('CLS')))
        subject = Production(Nonterminal('CLS'), ('sources', Nonterminal('V2')))
    else:
        article = Production(Nonterminal('ART'), ('the', Nonterminal('PAR')))
        subject = Production(Nonterminal('PAR'), (parent, Nonterminal('V2')))
    gr.append(v2)
    gr.append(article)
    gr.append(subject)
    return CFG(Nonterminal('S'), gr)
def train():
    """Train a PCFG parser from the Penn Treebank sample bundled with NLTK.

    Builds CNF parse trees, a frequency-based vocabulary, and relative-frequency
    rule probabilities (Katz-smoothed), then indexes the rules for CKY-style
    parsing.

    Returns
    -------
    dict
        Keys: ``'vocab'``, ``'left_rules'``, ``'right_rules'``,
        ``'unary_rules'``, ``'rules_to_prob'``, ``'terminal_nonterms'``.
    """
    print("Collecting sub-corpus from Penn Treebank (nltk.corpus)")
    # Prepare parse trees extracted from the treebank, converted to Chomsky
    # normal form in place.
    tbank_trees = []
    for sent in treebank.parsed_sents():
        sent.chomsky_normal_form()
        tbank_trees.append(sent)
    # Build the vocabulary from the most frequent treebank words.
    vocab_size = 10000  # cap the vocabulary at 10000 words
    words = [wrd.lower() for wrd in treebank.words()]
    # BUG FIX: count the lowercased words so the vocabulary matches the
    # lowercased terminals used in the production rules below. The original
    # counted the raw case-sensitive words and left `words` unused.
    vocab = [wrd for wrd, freq in Counter(words).most_common(vocab_size)]
    # Generate the grammar-rule list extracted from the treebank and compute
    # each rule's probability from its relative frequency.
    tbank_productions = set(production for tree in tbank_trees
                            for production in tree.productions())
    tbank_grammar = CFG(Nonterminal('S'), list(tbank_productions))
    production_rules = tbank_grammar.productions()
    rules_to_prob = defaultdict(int)
    nonterm_occurrence = defaultdict(int)
    # Count rule and LHS occurrences; lexical rules are lowercased so that
    # lookup at parse time is case-insensitive.
    for sent in tbank_trees:
        for production in sent.productions():
            if len(production.rhs()) == 1 and not isinstance(
                    production.rhs()[0], Nonterminal):
                production = Production(production.lhs(),
                                        [production.rhs()[0].lower()])
            nonterm_occurrence[production.lhs()] += 1
            rules_to_prob[production] += 1
    # Convert counts to conditional probabilities P(rule | lhs).
    for rule in rules_to_prob:
        rules_to_prob[rule] /= nonterm_occurrence[rule.lhs()]
    # Apply Katz smoothing (project-local helper) to handle unseen events.
    rules_to_prob, vocab = katz_smooth(rules_to_prob, vocab)
    rules = list(rules_to_prob.keys())
    rules_reverse_dict = dict((j, i) for i, j in enumerate(rules))
    left_rules = defaultdict(set)
    right_rules = defaultdict(set)
    unary_rules = defaultdict(set)
    # Index binary rules by their left and right RHS child (CNF guarantees at
    # most two children); everything else is unary.
    for rule in rules:
        if len(rule.rhs()) > 1:
            left_rules[rule.rhs()[0]].add(rule)
            right_rules[rule.rhs()[1]].add(rule)
        else:
            unary_rules[rule.rhs()[0]].add(rule)
    # Count how many lexical (terminal-producing) rules each nonterminal has.
    terminal_nonterms_rules = set(
        rule for rule in rules_to_prob
        if len(rule.rhs()) == 1 and isinstance(rule.rhs()[0], str))
    terminal_nonterms = defaultdict(int)
    for rule in terminal_nonterms_rules:
        terminal_nonterms[rule.lhs()] += 1
    pcfg_parser = {
        'vocab': vocab,
        'left_rules': left_rules,
        'right_rules': right_rules,
        'unary_rules': unary_rules,
        'rules_to_prob': rules_to_prob,
        'terminal_nonterms': terminal_nonterms
    }
    return pcfg_parser
def generate_impacts_grammar(attribute, phase):
    """Build a CFG asking whether the incident's impact was ``attribute``.

    ``phase == 1`` selects the affirmative verb (``'was'``); any other value
    selects the negated form (``'was not'``). Returns an ``nltk.CFG`` whose
    start symbol ``S`` derives the full question.
    """
    verb = 'was' if phase == 1 else 'was not'
    productions = [
        Production(Nonterminal('S'), (Nonterminal('AUX1'), )),
        Production(Nonterminal('AUX1'), ('Do', Nonterminal('S1'))),
        Production(Nonterminal('S1'), ('you', Nonterminal('V1'))),
        Production(Nonterminal('V1'), ('think', Nonterminal('ART'))),
        Production(Nonterminal('ART'),
                   ('the impact of the incident', Nonterminal('V2'))),
        Production(Nonterminal('END'), ('?', )),
        Production(Nonterminal('V2'), (verb, Nonterminal('ATTR'))),
        Production(Nonterminal('ATTR'), (attribute, Nonterminal('END'))),
    ]
    return CFG(Nonterminal('S'), productions)
def generate_entities_grammar(attribute, phase):
    """Build a CFG asking whether ``attribute`` entities were impacted/affected.

    ``phase == 1`` selects the affirmative linking verb (``'are'``); any other
    value selects the negated form (``'are not'``). Both ``'impacted'`` and
    ``'affected'`` alternatives are included. Returns an ``nltk.CFG`` whose
    start symbol ``S`` derives the full question.
    """
    linking_verb = 'are' if phase == 1 else 'are not'
    productions = [
        Production(Nonterminal('S'), (Nonterminal('AUX1'), )),
        Production(Nonterminal('AUX1'), ('Do', Nonterminal('S1'))),
        Production(Nonterminal('S1'), ('you', Nonterminal('V1'))),
        Production(Nonterminal('V1'), ('think', Nonterminal('ATTR'))),
        Production(Nonterminal('V3'), ('impacted', Nonterminal('OBJ'))),
        Production(Nonterminal('V3'), ('affected', Nonterminal('OBJ'))),
        Production(Nonterminal('OBJ'), ('by the incident', Nonterminal('END'))),
        Production(Nonterminal('END'), ('?', )),
        Production(Nonterminal('V2'), (linking_verb, Nonterminal('V3'))),
        Production(Nonterminal('ATTR'), (attribute, Nonterminal('V2'))),
    ]
    return CFG(Nonterminal('S'), productions)
def add_production(g, prod):
    """Add production to a grammar.

    Parameters
    ----------
    g : nltk.CFG
    prod : nltk.Production

    Returns
    -------
    nltk.CFG
        A new grammar with ``prod`` appended; ``g`` is not modified.

    See Also
    --------
    nltk.CFG, nltk.Production
    """
    prods = list(g.productions())
    prods.append(prod)
    # BUG FIX: nltk.CFG requires the start symbol as its first argument;
    # the original `CFG(prods)` omitted it and raised a TypeError.
    return CFG(g.start(), prods)
def remove_nonterminal(g, nont):
    """Remove nonterminal from a grammar.

    Drops every production that has ``nont`` on either its left- or
    right-hand side.

    Parameters
    ----------
    g : nltk.CFG
    nont : nltk.Nonterminal

    Returns
    -------
    nltk.CFG
        A new grammar without ``nont``; ``g`` is not modified.

    See Also
    --------
    nltk.CFG, nltk.Nonterminal
    """
    prods = [
        p for p in g.productions() if p.lhs() != nont and nont not in p.rhs()
    ]
    # BUG FIX: nltk.CFG requires the start symbol as its first argument;
    # the original `CFG(prods)` omitted it and raised a TypeError.
    return CFG(g.start(), prods)
def remove_production(g, prod):
    """Remove production from a grammar.

    If ``prod`` is the only production for its left-hand side, the whole
    nonterminal is removed (via ``remove_nonterminal``) so the grammar is
    left without dangling references.

    Parameters
    ----------
    g : nltk.CFG
    prod : nltk.Production

    Returns
    -------
    nltk.CFG
        A new grammar without ``prod``; ``g`` is not modified.

    See Also
    --------
    nltk.CFG, nltk.Production
    """
    if len(g.productions(prod.lhs())) == 1:
        return remove_nonterminal(g, prod.lhs())
    prods = [p for p in g.productions() if p != prod]
    # BUG FIX: nltk.CFG requires the start symbol as its first argument;
    # the original `CFG(prods)` omitted it and raised a TypeError.
    return CFG(g.start(), prods)
def create_templates():
    """Creates the templates from the grammar.

    Builds a CFG over module-level rule lists (V2, AVOID, NEAR_GOAL_START,
    NEAR_GOAL_END, GOAL, GOAL_END, MAIN_NO_V, MAIN, V1 — defined elsewhere in
    this file), generates every sentence the grammar derives, normalizes the
    punctuation/whitespace, saves the deduplicated sentences to
    'templates.csv', and flags which STREET_FEATURES each template mentions.

    Returns
    -------
    pandas.DataFrame
        One row per unique template; column 'sentence' plus one boolean
        column per entry of STREET_FEATURES.
    """
    prods = [
        # Specific verb with goal and the rest of instruction body.
        Production(Nonterminal('S'), (Nonterminal('V2'), Nonterminal('V2_BODY'))),
        # A verb and rest of the instruction body assuming goal already mentioned.
        Production(Nonterminal('V2_BODY'),
                   (Nonterminal('V1'), Nonterminal('M_G_ALREADY_V'))),
        # A verb and the rest of the instruction body assuming the goal wasn't
        # mentioned before.
        Production(Nonterminal('S'), (Nonterminal('V1'), Nonterminal('NO_GOAL'))),
        # The goal in the beginning and the rest of the instruction body assuming
        # goal already mentioned.
        Production(Nonterminal('S'),
                   (Nonterminal('V1_GOAL'), Nonterminal('WITH_GOAL'))),
        # A verb and 'to the' and then goal mention and the rest of the instruction
        # body.
        Production(Nonterminal('V1_GOAL'),
                   (Nonterminal('V1'), Nonterminal('V1_CON'))),
        # A goal mention and the rest of the instruction body.
        Production(Nonterminal('WITH_GOAL'),
                   (Nonterminal('GOAL'), Nonterminal('M_G'))),
        # Main part of the instruction without verb in beginning and resuming
        # sentence.
        Production(
            Nonterminal('M_G_ALREADY_V'),
            (Nonterminal('MAIN_NO_V'), Nonterminal('END_NEAR_GOAL_KNOWN'))),
        # Main part of the instruction, adding a new sentence.
        Production(Nonterminal('M_G'),
                   (Nonterminal('MAIN'), Nonterminal('END_NEAR_GOAL_KNOWN'))),
        # End part - (1) near pivot assuming goal already mentioned; and (2) avoid
        # sentence.
        Production(Nonterminal('END_NEAR_GOAL_KNOWN'),
                   (Nonterminal('NEAR_GOAL_START'), Nonterminal('AVOID'))),
        # End part - (1) near pivot assuming goal not mentioned yet; and (2) avoid
        # sentence.
        Production(Nonterminal('END_NEAR_GOAL_KNOWN'),
                   (Nonterminal('NEAR_GOAL_END'), Nonterminal('AVOID'))),
        # Main part of the instruction without verb in beginning and resuming
        # sentence assuming no goal mentioned before.
        Production(
            Nonterminal('NO_GOAL'),
            (Nonterminal('MAIN_NO_V'), Nonterminal('END_NEAR_GOAL_UNKNOWN'))),
        # Add Goal to main part and then resume instruction by adding an
        # ending (near+avoid).
        Production(
            Nonterminal('END_NEAR_GOAL_UNKNOWN'),
            (Nonterminal('GOAL_END'), Nonterminal('END_NEAR_GOAL_KNOWN'))),
        # Add Goal with near and then add an avoid sentence.
        Production(Nonterminal('END_NEAR_GOAL_UNKNOWN'),
                   (Nonterminal('NEAR_GOAL_END'), Nonterminal('AVOID'))),
        # Terminal for IN+DT after verb.
        Production(Nonterminal('V1_CON'), ('to the', )),
    ]
    # Expand each phrase-list constant into terminal productions for its
    # nonterminal (add_rules is a project-local helper defined elsewhere).
    prods += add_rules('V2', V2)
    prods += add_rules('AVOID', AVOID)
    prods += add_rules('NEAR_GOAL_START', NEAR_GOAL_START)
    prods += add_rules('NEAR_GOAL_END', NEAR_GOAL_END)
    prods += add_rules('GOAL', GOAL)
    prods += add_rules('GOAL_END', GOAL_END)
    prods += add_rules('MAIN_NO_V', MAIN_NO_V)
    prods += add_rules('MAIN', MAIN)
    prods += add_rules('V1', V1)
    grammar = CFG(Nonterminal('S'), prods)
    # Generate templates: every sentence derivable from the grammar, with
    # punctuation and whitespace normalized.
    templates = []
    for sentence in nltk.parse.generate.generate(grammar):
        sentence = ' '.join(sentence)
        if sentence[-1] != '.':
            sentence += '.'
        sentence = sentence.replace(" .", ".")
        sentence = sentence.replace(" ,", ",")
        sentence = sentence.replace("..", ".")
        re_space = re.compile(r'[\s]+')
        sentence = re_space.sub(r' ', sentence)
        templates.append(sentence)
    templates_df = pd.DataFrame(templates, columns=['sentence']).drop_duplicates()
    # Save templates
    templates_df.to_csv('templates.csv', index=False, header=False)
    # Flag features: one boolean column per street feature, true when the
    # upper-cased feature name appears in the template text.
    for column in STREET_FEATURES:
        templates_df[column] = templates_df['sentence'].apply(
            lambda x: column.upper() in x)
    return templates_df
# NOTE(review): notebook-cell fragment. `rule`, `newrules` and `allrules` are
# bound by code in a preceding cell (not visible here); this `if` appears to
# belong inside a loop over grammar rules, collecting the binary (CNF) rules
# into `newrules` — confirm against the preceding cell.
if (len(rule) == 2):
    newrules.append(rule)
    # NOTE(review): if the enclosing loop iterates `allrules` directly,
    # removing items here mutates the list mid-iteration — verify upstream.
    allrules.remove(rule)

# In[11]:

print(len(newrules))
print(len(allrules))
# print(newrules)
print(allrules)

# In[24]:

# Create CNF Grammar
cnf_ = CFG(start=Nonterminal('SIGMA'), productions=newrules)

# In[25]:

# Persist the CNF grammar for later reuse.
f = open('cnf_grammar.pkl', 'wb')
pickle.dump(cnf_, f)
f.close()

# In[26]:

# Check CNF
print(cnf_.is_chomsky_normal_form())

# In[27]: