def __init__(self, parse_file):
    counter = Counts()
    for l in open(parse_file):
        tree = json.loads(l)
        counter.count(tree)

    # N is 'Non-terminal' => N[symbol] = count
    self.N = counter.nonterm
    # binary_R is 'binary Rule' => binary_R[symbol, y1, y2] = count
    self.binary_R = counter.binary
    # unary is of the form => unary[symbol, word] = count
    self.unary_R = counter.unary
    # V is the 'vocabulary'; there are 245 words in it
    self.V = counter.vocabulary  # 245

    # 'pi' is a dictionary: 'i' is the index of a word in the sentence and
    # 'x' is a non-terminal symbol; pi[i, i, x] = 0.03 means the word at 'i'
    # is assigned to 'x' with probability 0.03
    self.pi = {}
    # 'bp' stands for 'back pointer': a dictionary mapping each cell to the
    # best rule and best split in the binary table, e.g. bp[i, i, x] = [(x, word), i]
    self.bp = {}
    # binary_table collects the binary rules derived from one symbol, e.g.
    # binary_table['S'] = [('NP', 'VP'), ('NP', 'VP+VERB'), ...]
    self.binary_table = defaultdict(set)
    self.initialize_binary_table()
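
# A hedged illustration (not from the original source) of the chart these
# tables represent, using the pi[i, j, x] notation from the comments above.
# For a 3-word sentence "the dog barks" (probabilities hypothetical):
#   pi[0, 0, 'DET'] = q(DET -> the) = 0.4      # diagonal cells: unary rules
#   pi[1, 2, 'VP']  = q(VP -> NOUN VERB) * pi[1, 1, 'NOUN'] * pi[2, 2, 'VERB']
#   bp[1, 2, 'VP']  records the split point and (NOUN, VERB), so the best tree
#                   can be rebuilt top-down from the root cell pi[0, 2, 'S'].
# In general the CKY recurrence is:
#   pi(i, j, X) = max over rules X -> Y Z and splits s in i..j-1 of
#                 q(X -> Y Z) * pi(i, s, Y) * pi(s+1, j, Z)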

import glob
import json
import os
import random
import sys
import xml.etree.ElementTree as ET
from collections import defaultdict
from copy import deepcopy

# Counts tallies non-terminal, unary-rule and binary-rule frequencies; assumed
# to come from the course-provided count_cfg_freq.py referenced below
from count_cfg_freq import Counts


class PCFG:
    def __init__(self, rare_count_threshold=5):
        self.rare_count_threshold = rare_count_threshold
        self.cfg_counter = Counts()
        self.word_freq = defaultdict(int)

    # The returned tree is in Penn TreeBank format, not in Chomsky Normal Form
    def create_parse_tree(self, elem_root_subtree):
        if 'null' in elem_root_subtree.attrib:
            return None
        subtree_array = [elem_root_subtree.attrib['cat']]
        if elem_root_subtree.tag == 'cons':
            for child in elem_root_subtree:
                child_subtree_array = self.create_parse_tree(child)
                if child_subtree_array is not None:
                    subtree_array.append(child_subtree_array)
        elif elem_root_subtree.tag == 'tok':
            subtree_array.append(elem_root_subtree.text)
        else:
            # TBD: print error message
            pass
        return subtree_array

    # right-factored
    # TBD: Provide an option for left-factored too
    # TBD: Handle null as mentioned in section 6.3 "Null Elements" of the GENIA corpus manual
    def convert_PTB_to_CNF(self, tree_array):
        if len(tree_array) > 3:
            # convert a multi-child tree into a tree with two children
            subtree_array = [tree_array[0]]
            for i in range(2, len(tree_array)):
                subtree_array.append(tree_array[i])
            tree_array[2] = subtree_array
            del tree_array[3:]
            self.convert_PTB_to_CNF(tree_array)
        elif len(tree_array) == 3:
            # the roots of both children should be non-terminals
            assert (type(tree_array[1]) is list), "expected list for left child: {0}".format(tree_array[1])
            assert (type(tree_array[2]) is list), "expected list for right child: {0}".format(tree_array[2])
            # convert each child into CNF form if it's not already in that form
            self.convert_PTB_to_CNF(tree_array[1])
            self.convert_PTB_to_CNF(tree_array[2])
        elif (len(tree_array) == 2) and (type(tree_array[1]) is list):
            # Form: X -> Y where X, Y are non-terminals; collapse into X+Y
            tree_array[0] = tree_array[0] + '+' + tree_array[1][0]
            if type(tree_array[1][1]) is list:
                subtree_array = tree_array[1][1:]
                del tree_array[1:]
                for subtree in subtree_array:
                    tree_array.append(subtree)
                self.convert_PTB_to_CNF(tree_array)
            else:
                # e.g. [NP [DET There]]
                tree_array[1] = tree_array[1][1]
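
    # A hedged worked example of convert_PTB_to_CNF (hand-traced; not part of
    # the original source). Right-factoring a three-child constituent:
    #   before: ['NP', ['DET', 'the'], ['ADJ', 'big'], ['NOUN', 'dog']]
    #   after:  ['NP', ['DET', 'the'], ['NP', ['ADJ', 'big'], ['NOUN', 'dog']]]
    # Collapsing a unary chain X -> Y (labels merged with '+'):
    #   before: ['NP', ['DET', 'There']]
    #   after:  ['NP+DET', 'There']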
    # Extract parse trees from the xml files (these parse trees are in Penn
    # TreeBank format), randomly assign the xml files into training and
    # validation sets, and write the parse trees after converting them into
    # Chomsky Normal Form.
    def create_train_and_test_parse_tree(self, treebank_folder, train_key_file, test_file, test_key_file,
                                         n_train_file):
        file_list = glob.glob(treebank_folder + '/' + '*.xml')
        # randomly select n_train_file files for training and keep the rest for testing
        train_index_list = random.sample(range(len(file_list)), n_train_file)
        train_index_set = set(train_index_list)  # for O(1) membership tests below

        # write the train file
        count_parse_tree = 0
        with open(train_key_file, 'w') as fd:
            for file_i in train_index_list:
                train_file = file_list[file_i]
                try:
                    count_parse_tree_in_xml = self.write_parse_tree(fd, train_file)
                    count_parse_tree += count_parse_tree_in_xml
                    print('train file: {0} :: parse tree count till now: {1}'.format(train_file, count_parse_tree))
                except:
                    err = sys.exc_info()[0]
                    print('Error in {0}: {1}'.format(train_file, err))

        # the test files actually serve as a validation set
        # write the test parse tree file
        failure_parse_file_list = []
        with open(test_key_file, 'w') as fd:
            for file_i in range(len(file_list)):
                if file_i not in train_index_set:
                    test_xml_file = file_list[file_i]
                    try:
                        self.write_parse_tree(fd, test_xml_file)
                    except:
                        err = sys.exc_info()[0]
                        print('Error in {0}: {1}'.format(test_xml_file, err))
                        parts_filename = os.path.split(test_xml_file)
                        failure_parse_file_list.append(parts_filename[1])

        # now write the test sentences
        with open(test_file, 'w') as fd:
            for file_i in range(len(file_list)):
                if file_i not in train_index_set:
                    test_xml_file = file_list[file_i]
                    parts_filename = os.path.split(test_xml_file)
                    if parts_filename[1] in failure_parse_file_list:
                        print('ignoring sentence extraction from {0}'.format(test_xml_file))
                        continue
                    try:
                        self.write_sentences(fd, test_xml_file)
                    except:
                        err = sys.exc_info()[0]
                        print('Error in extracting sentence from {0}: {1}'.format(test_xml_file, err))

    # Create parse trees from an xml file and write them to the train/test file (fd)
    def write_parse_tree(self, fd, xml_filename):
        tree = ET.parse(xml_filename)
        root = tree.getroot()
        count_parse_tree_in_xml = 0
        # read sentences only from the AbstractText section
        for abstractText in root.iter('AbstractText'):
            # iterate over each of the sentences
            for sentence in abstractText.iter('sentence'):
                # TBD: The following should have a single iteration.
                # Need to check if the root is an actual root tag, e.g. 'S'.
                for sentence_root in sentence:
                    tree_ptb_array = self.create_parse_tree(sentence_root)
                    tree_cnf_array = deepcopy(tree_ptb_array)
                    self.convert_PTB_to_CNF(tree_cnf_array)
                    # serialize to JSON (this converts single quotes to double quotes);
                    # required because the json load of the tree in count_cfg_freq.py fails otherwise
                    tree_cnf_json = json.dumps(tree_cnf_array)
                    fd.write('{0}\n'.format(tree_cnf_json))
                    count_parse_tree_in_xml += 1
        return count_parse_tree_in_xml

    # Extract sentences from an xml file and write them to fd
    @staticmethod
    def write_sentences(fd, xml_filename):
        tree = ET.parse(xml_filename)
        root = tree.getroot()
        for abstractText in root.iter('AbstractText'):
            for sentence in abstractText.iter('sentence'):
                token_array = []
                for token in sentence.iter('tok'):
                    token_array.append(token.text)
                fd.write('{0}\n'.format(' '.join(token_array)))

    def create_train_with_rare(self, orig_train_key_file, mod_train_key_file):
        # first check whether self.word_freq is already populated
        if len(self.cfg_counter.unary) == 0:
            self.compute_cfg_frequency_in_train_file(orig_train_key_file)
        if len(self.word_freq) == 0:
            self.compute_word_frequency_in_cfg()
        # now iterate through each parse tree, replace the rare words with the
        # rare symbol, and write the changed parse trees into the new train file
        count_parse_tree = 0
        with open(mod_train_key_file, 'w') as wfd:
            with open(orig_train_key_file, 'r') as rfd:
                for line in rfd:
                    count_parse_tree += 1
                    tree = json.loads(line)
                    try:
                        self.replace_infrequent_words_in_parse_tree(tree)
                        tree_json = json.dumps(tree)
                        wfd.write('{0}\n'.format(tree_json))
                    except:
                        print('Error: create_train_with_rare(): parse tree # {0} :: line: {1}\n'.format(
                            count_parse_tree, line))

    def compute_cfg_frequency_in_train_file(self, train_file):
        self.cfg_counter = Counts()
        with open(train_file, 'r') as fd:
            for line in fd:
                try:
                    cfg_tree = json.loads(line)
                    self.cfg_counter.count(cfg_tree)
                except:
                    print('Error: compute_cfg_frequency_in_train_file(): line: {0}'.format(line))

    def compute_word_frequency_in_cfg(self):
        self.word_freq = defaultdict(int)
        # a terminal word can be assigned to multiple non-terminals
        for (sym, word), count in self.cfg_counter.unary.iteritems():
            self.word_freq[word] += count

    def write_word_frequency_in_cfg(self, word_freq_file):
        word_freq_sorted = sorted(self.word_freq.items(), key=lambda x: x[1], reverse=True)
        with open(word_freq_file, 'w') as fd:
            for word_freq in word_freq_sorted:
                fd.write('{0} {1}\n'.format(word_freq[1], word_freq[0]))

    def is_rare_word(self, word):
        return self.word_freq[word.lower()] < self.rare_count_threshold

    def replace_infrequent_words_in_parse_tree(self, tree):
        if isinstance(tree, basestring):
            return
        if len(tree) == 3:
            # binary rule
            self.replace_infrequent_words_in_parse_tree(tree[1])
            self.replace_infrequent_words_in_parse_tree(tree[2])
        elif len(tree) == 2:
            # unary rule
            word = tree[1]
            assert (type(tree[1]) is not list), "expected string: {0}".format(tree[1])
            if self.is_rare_word(word):
                tree[1] = '_RARE_'

    # q(x -> y z), where x, y, z are non-terminals
    def compute_binary_parameter(self, x, y, z):
        key = (x, y, z)
        if key not in self.cfg_counter.binary:
            return 0.0
        else:
            return self.cfg_counter.binary[key] * 1.0 / self.cfg_counter.nonterm[x]

    # q(x -> w), where x is a non-terminal and w is a terminal
    def compute_unary_parameter(self, x, w):
        key = (x, w)
        if key not in self.cfg_counter.unary:
            return 0.0
        else:
            return self.cfg_counter.unary[key] * 1.0 / self.cfg_counter.nonterm[x]

    def compute_parse_tree_using_cky_algorithm(self, sentence):
        tokens = sentence.strip().split(' ')
        n = len(tokens)
        # replace the rare tokens with the rare symbol
        for i in range(n):
            if self.is_rare_word(tokens[i]):
                tokens[i] = '_RARE_'.lower()
            else:
                tokens[i] = tokens[i].lower()

        pi = {}  # store the chart in dict form
        bp = {}
        # initialize the chart
        for i in range(n):
            pi[i] = {}
            bp[i] = {}
            for j in range(i, n):
                pi[i][j] = {}
                bp[i][j] = {}
                if i == j:
                    # initialize pi with q(x -> w) for each of the words
                    for x in self.cfg_counter.nonterm:
                        pi[i][i][x] = self.compute_unary_parameter(x, tokens[i])
                        bp[i][i][x] = None
                else:
                    for x in self.cfg_counter.nonterm:
                        pi[i][j][x] = 0.0
                        bp[i][j][x] = None

        # now handle x -> y z where x, y, z are non-terminals
        for l in range(1, n):
            for i in range(n - l):
                j = i + l
                # here we only consider the (x, y, z) tuples seen in the training data
                for (x, y, z) in self.cfg_counter.binary.keys():
                    q_x_to_yz = self.compute_binary_parameter(x, y, z)
                    if q_x_to_yz <= pi[i][j][x]:
                        # since probabilities are <= 1, the current x -> y z can't beat
                        # the max probability already computed for non-terminal x
                        # spanning words i..j inclusive
                        continue
                    max_arg_s, max_val_s_i_j = self.compute_best_split(pi, i, j, y, z)
                    # check whether the current rule improves pi[i][j][x]
                    val = q_x_to_yz * max_val_s_i_j
                    if pi[i][j][x] <= val:
                        pi[i][j][x] = val
                        bp[i][j][x] = [max_arg_s, y, z]
        """
        # In the following we try all the (x, y, z) combinations.
        # This is a slower process.
        for x in self.cfg_counter.nonterm:
            max_pi_i_j_x = 0.0
            # [s, y, z]
            arg_max_pi = [None, None, None]
            for y in self.cfg_counter.nonterm:
                for z in self.cfg_counter.nonterm:
                    q_x_to_yz = self.compute_binary_parameter(x, y, z)
                    max_val_s_i_j = 0.0
                    max_arg_s = None
                    for s in range(i, j):
                        val = pi[i][s][y] * pi[s+1][j][z]
                        if max_val_s_i_j < val:
                            max_val_s_i_j = val
                            max_arg_s = s
                    val = q_x_to_yz * max_val_s_i_j
                    if max_pi_i_j_x < val:
                        max_pi_i_j_x = val
                        arg_max_pi = [max_arg_s, y, z]
            pi[i][j][x] = max_pi_i_j_x
            bp[i][j][x] = arg_max_pi
        """
        assert (pi[0][n-1]['S'] > 0), "pi[0][{0}]['S'] is zero".format(n-1)
        # split the sentence into tokens again: at the beginning of the function
        # the rare tokens were replaced with the rare symbol
        tokens = sentence.strip().split(' ')
        best_parse_tree = self.create_parse_tree_from_backpointers(tokens, bp, 0, n-1, 'S')
        best_prob = pi[0][n-1]['S']
        return best_parse_tree, best_prob

    # For the given x -> y z spanning words i..j inclusive, compute the best split:
    # select s which maximizes pi(i, s, y) * pi(s+1, j, z)
    @staticmethod
    def compute_best_split(pi, i, j, y, z):
        max_val_s_i_j = 0.0
        max_arg_s = None
        for s in range(i, j):
            val = pi[i][s][y] * pi[s + 1][j][z]
            if max_val_s_i_j <= val:
                max_val_s_i_j = val
                max_arg_s = s
        return max_arg_s, max_val_s_i_j

    def create_parse_tree_from_backpointers(self, tokens, bp, i, j, x):
        if i == j:
            return [x, tokens[i]]
        assert (bp[i][j][x] is not None), "bp[{0}][{1}][{2}] is None".format(i, j, x)
        split_index, y, z = bp[i][j][x]
        parse_tree_left_child = self.create_parse_tree_from_backpointers(tokens, bp, i, split_index, y)
        parse_tree_right_child = self.create_parse_tree_from_backpointers(tokens, bp, split_index + 1, j, z)
        parse_tree = [x, parse_tree_left_child, parse_tree_right_child]
        return parse_tree

    def compute_parse_tree_for_test_sentences(self, test_sentence_file, test_key_file):
        with open(test_key_file, 'w') as wfd:
            with open(test_sentence_file, 'r') as rfd:
                sent_i = 0
                for sentence in rfd:
                    sent_i += 1
                    if sent_i % 20 == 19:
                        print('sent_i: {0}'.format(sent_i))
                    try:
                        tree, prob = self.compute_parse_tree_using_cky_algorithm(sentence)
                        # convert into json
                        tree_json = json.dumps(tree)
                        wfd.write('{0}\n'.format(tree_json))
                        print('prob of sent_i #{0}: {1}'.format(sent_i, prob))
                    except:
                        err = sys.exc_info()[0]
                        print('Error: compute_parse_tree_for_test_sentences(): sent_i: {0} :: '
                              'sentence: {1} :: error: {2}\n'.format(sent_i, sentence, err))
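
# A minimal end-to-end sketch of driving the class above (hypothetical file
# names and train-file count; 'genia_treebank' is assumed to hold the GENIA
# treebank xml files):
if __name__ == '__main__':
    pcfg = PCFG(rare_count_threshold=5)
    # 1. extract CNF parse trees and test sentences from the treebank xml files
    pcfg.create_train_and_test_parse_tree('genia_treebank', 'parse_train.key', 'test_sentences.txt',
                                          'parse_test.key', n_train_file=400)
    # 2. replace infrequent words with _RARE_ in the training trees
    pcfg.create_train_with_rare('parse_train.key', 'parse_train_rare.key')
    # 3. recount rule frequencies on the rare-augmented training file
    pcfg.compute_cfg_frequency_in_train_file('parse_train_rare.key')
    pcfg.compute_word_frequency_in_cfg()
    # 4. parse the held-out sentences with CKY and write the best trees
    pcfg.compute_parse_tree_for_test_sentences('test_sentences.txt', 'parse_output.key')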

import json
import re
from collections import defaultdict

from count_cfg_freq import Counts  # assumed: the course-provided rule counter


class PCFG:
    def __init__(self, original_train_file, modified_train_file, count_threshold=5):
        self.count_threshold = count_threshold
        self.word_freq = defaultdict(int)
        self.cfg_counter = Counts()
        self.bp_table = {}

    def compute_cfg_frequency_in_train_file(self, train_file):
        self.cfg_counter = Counts()
        with open(train_file, 'r') as fd:
            for line in fd:
                cfg_tree = json.loads(line)
                self.cfg_counter.count(cfg_tree)

    def compute_word_frequency_in_cfg(self):
        self.word_freq = defaultdict(int)
        # case sensitive:
        # https://class.coursera.org/nlangp-001/forum/thread?thread_id=631
        # a terminal word can be assigned to multiple non-terminals:
        # https://class.coursera.org/nlangp-001/forum/thread?thread_id=620#post-2613
        for (sym, word), count in self.cfg_counter.unary.iteritems():
            self.word_freq[word] += count

    def is_rare_word(self, word):
        return self.word_freq[word] < self.count_threshold

    def replace_infrequent_words_in_parse_tree(self, tree):
        if isinstance(tree, basestring):
            return
        if len(tree) == 3:
            # binary rule
            self.replace_infrequent_words_in_parse_tree(tree[1])
            self.replace_infrequent_words_in_parse_tree(tree[2])
        elif len(tree) == 2:
            # unary rule
            word = tree[1]
            if self.is_rare_word(word):
                tree[1] = '_RARE_'

    def compute_binary_parameter(self, symbol, y1, y2):
        return self.cfg_counter.binary[(symbol, y1, y2)] * 1.0 / self.cfg_counter.nonterm[symbol]

    def compute_unary_parameter(self, symbol, word):
        return self.cfg_counter.unary[(symbol, word)] * 1.0 / self.cfg_counter.nonterm[symbol]

    def CKY_algorithm(self, sentence):
        sentence_tokens = re.split(r'[ ]+', sentence.rstrip())
        n_tokens = len(sentence_tokens)
        max_prob_table = defaultdict(float)  # the pi table
        self.bp_table = {}

        # build the dynamic programming table bottom-up:
        # first the diagonal cells (unary rules over single words)
        for symbol in self.cfg_counter.nonterm.iterkeys():
            for i in range(0, n_tokens):
                word = sentence_tokens[i]
                if self.is_rare_word(word):
                    word = '_RARE_'
                key = (symbol, word)
                if key in self.cfg_counter.unary:
                    max_prob_table[(i, i, symbol)] = self.compute_unary_parameter(symbol, word)

        # then spans of increasing length (binary rules)
        for step in range(1, n_tokens):
            for i in range(0, n_tokens - step):
                j = i + step
                for (sym, y1, y2) in self.cfg_counter.binary.iterkeys():
                    binary_param = self.compute_binary_parameter(sym, y1, y2)
                    max_prob_mult = 0
                    max_prob_s = None
                    for s in range(i, j):
                        prob_mult = max_prob_table[(i, s, y1)] * max_prob_table[(s+1, j, y2)]
                        if max_prob_mult < prob_mult:
                            max_prob_mult = prob_mult
                            max_prob_s = s
                    prob_with_current_binary_rule_over_i_j = binary_param * max_prob_mult
                    if max_prob_table[(i, j, sym)] < prob_with_current_binary_rule_over_i_j:
                        max_prob_table[(i, j, sym)] = prob_with_current_binary_rule_over_i_j
                        self.bp_table[(i, j, sym)] = (max_prob_s, y1, y2)

        parse_tree = self.create_parse_tree(0, n_tokens - 1, 'SBARQ', sentence_tokens)
        return parse_tree

    def create_parse_tree(self, i, j, sym, sentence_tokens):
        # returns [sym, subtree(i, s, y1), subtree(s+1, j, y2)]
        if i == j:
            parse_sub_tree = [sym, sentence_tokens[i]]
        else:
            s, y1, y2 = self.bp_table[(i, j, sym)]
            parse_sub_tree = [sym,
                              self.create_parse_tree(i, s, y1, sentence_tokens),
                              self.create_parse_tree(s+1, j, y2, sentence_tokens)]
        return parse_sub_tree
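
# A hedged usage sketch (hypothetical file names; the example question is only
# illustrative). Note that CKY_algorithm hard-codes 'SBARQ' as the root symbol,
# so the grammar is assumed to be trained on questions:
if __name__ == '__main__':
    pcfg = PCFG('parse_train.dat', 'parse_train_rare.dat', count_threshold=5)
    pcfg.compute_cfg_frequency_in_train_file('parse_train_rare.dat')
    pcfg.compute_word_frequency_in_cfg()
    tree = pcfg.CKY_algorithm('What was the monetary value of the Nobel Peace Prize in 1989 ?')
    print(json.dumps(tree))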

import json
import re

from count_cfg_freq import Counts  # assumed: the course-provided rule counter


class CKY:
    def __init__(self):
        self.binary_q = {}
        self.unary_q = {}
        self.counter = Counts()
        self.pi = []
        self.bp = []
        self.parser = {}
        self.use_vert = False

    def loadData(self, corpus_file):
        for l in corpus_file:
            t = json.loads(l)
            self.counter.count(t)

    def compute(self):
        # maximum-likelihood rule parameters: q(X -> alpha) = count(X -> alpha) / count(X);
        # multiply by 1.0 so Python 2 integer counts don't truncate to 0
        for (sym, word), count in self.counter.unary.iteritems():
            self.unary_q[(sym, word)] = count * 1.0 / self.counter.nonterm[sym]
        for (sym, y1, y2), count in self.counter.binary.iteritems():
            self.binary_q[(sym, y1, y2)] = count * 1.0 / self.counter.nonterm[sym]

    def buildNTDict(self):
        # group binary rules by their left-hand-side non-terminal
        self.binary_rule = {}
        for x, y1, y2 in self.binary_q.keys():
            if x not in self.binary_rule:
                self.binary_rule[x] = []
            self.binary_rule[x].append((y1, y2))

    def cky_init(self, wordList):
        self.pi = []
        self.bp = []
        sent_len = len(wordList)
        for i in range(0, sent_len):
            self.pi.append([])
            self.bp.append([])
            for j in range(0, sent_len):
                self.pi[i].append({})
                self.bp[i].append({})
            # diagonal cells: unary rules over the word at position i
            for (sym, word), q in self.unary_q.iteritems():
                if wordList[i] == word:
                    self.pi[i][i][sym] = q
                    self.bp[i][i][sym] = word

    def clean_sent(self, sentence):
        wordList = sentence.split()
        vocabDict = set(v for s, v in self.counter.unary.keys())
        for i, word in enumerate(wordList):
            if word not in vocabDict:
                wordList[i] = '_RARE_'
        return wordList

    def cky_algorithm(self, sentence):
        self.inputWordList = sentence.split()
        wordList = self.clean_sent(sentence)
        sent_len = len(wordList)
        self.cky_init(wordList)
        for l in range(1, sent_len):
            for i in range(0, sent_len - l):
                j = i + l
                for sym in self.binary_rule.keys():
                    for s in range(i, j):
                        derivations = self.binary_rule[sym]
                        for y1, y2 in derivations:
                            # both children must have a derivation over their spans
                            if y1 not in self.pi[i][s] or y2 not in self.pi[s + 1][j]:
                                continue
                            temp = self.binary_q.get((sym, y1, y2)) * self.pi[i][s].get(y1) * self.pi[s + 1][j].get(y2)
                            if temp > self.pi[i][j].get(sym, 0):
                                self.pi[i][j][sym] = temp
                                self.bp[i][j][sym] = (s, y1, y2)
        # prefer 'S' as the root; otherwise fall back to the highest-scoring symbol
        if 'S' in self.pi[0][sent_len - 1]:
            root_score = 'S', self.pi[0][sent_len - 1]['S']
        else:
            root_score = max(self.pi[0][sent_len - 1].iteritems(), key=lambda x: x[1])
        self.parse_res = self.buildTreeHelper(0, sent_len - 1, root_score[0])

    def buildTreeHelper(self, start, end, root):
        if start == end:
            return '["' + root + '", "' + self.inputWordList[start] + '"]'
        else:
            s, y1, y2 = self.bp[start][end][root]
            r1 = self.buildTreeHelper(start, s, y1)
            r2 = self.buildTreeHelper(s + 1, end, y2)
            return '["' + root + '", ' + r1 + ', ' + r2 + ']'

    def dev(self, dev_file, output_file):
        with open(output_file, 'w') as output:
            for l in open(dev_file):
                print(l.strip())
                self.parse_res = ""
                self.cky_algorithm(l.strip())
                if self.use_vert:
                    # strip vertical-markovization annotations such as ^[NP]
                    self.parse_res = re.sub(r"\^\[[A-Z]+\]", "", self.parse_res)
                output.write(self.parse_res)
                output.write('\n')
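
# A hedged usage sketch (hypothetical file names, not part of the original source):
if __name__ == '__main__':
    cky = CKY()
    # the training corpus holds one JSON-encoded CNF parse tree per line
    with open('parse_train.dat') as corpus_file:
        cky.loadData(corpus_file)
    cky.compute()      # maximum-likelihood estimates of q(X -> Y Z) and q(X -> w)
    cky.buildNTDict()  # index binary rules by their left-hand-side symbol
    cky.dev('dev_sentences.dat', 'dev_sentences.out')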