def create_lexicalized_tree(self, mrg, heads):
    """
    Creates a lexicalized syntax tree given an MRG-style parse and a
    Penn2Malt-style heads file.
    """
    t = LexicalizedTree.parse(mrg, leaf_pattern='(?<=\\s)[^\)\(]+')  # Vanessa's modification
    t.lexicalize(heads, from_string=True)

    return t

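# Hedged usage sketch for create_lexicalized_tree. The MRG string below is a
# hypothetical placeholder (not taken from this codebase), and the heads file
# name is likewise illustrative; the heads string must be in whatever
# Penn2Malt-style format LexicalizedTree.lexicalize expects.
def _example_create_lexicalized_tree(preprocesser):
    mrg = '(S (NP (DT The) (NN cat)) (VP (VBZ sleeps)))'
    heads = open('sample.heads').read()  # hypothetical Penn2Malt-style heads file
    return preprocesser.create_lexicalized_tree(mrg, heads)
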
def get_parsed_trees_from_string(self, tree_strings):
    """ Parses a list of MRG-style tree strings, one parse per line. """
    parsed_trees = []
    for line in tree_strings:
        line = line.strip()
        if line != '':
            parsed_trees.append(LexicalizedTree.parse(line, leaf_pattern='(?<=\\s)[^\)\(]+'))

    return parsed_trees

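# Illustrative sketch, assuming a file with one MRG-style parse per line (the
# filename argument is a placeholder). Blank lines are skipped by
# get_parsed_trees_from_string, so trailing newlines in the file are harmless.
def _example_load_parses(segmenter, parsed_filename):
    return segmenter.get_parsed_trees_from_string(open(parsed_filename).readlines())
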
def process_single_sentence(self, doc, raw_text, end_of_para):
    """ Parses one raw sentence and adds it, fully annotated, to the document. """
    sentence = Sentence(len(doc.sentences),
                        raw_text + ('<P>' if end_of_para else '<s>'),
                        doc)
    parse_tree_str, deps_str = self.parse_single_sentence(raw_text)

    parse = LexicalizedTree.parse(parse_tree_str, leaf_pattern='(?<=\\s)[^\)\(]+')
    sentence.set_unlexicalized_tree(parse)

    # Token ids are 1-based.
    for (token_id, word) in enumerate(parse.leaves()):
        token = Token(word, token_id + 1, sentence)
        sentence.add_token(token)

    heads = self.get_heads(sentence, deps_str.split('\n'))
    sentence.heads = heads
    sentence.set_lexicalized_tree(prep_utils.create_lexicalized_tree(parse, heads))

    doc.add_sentence(sentence)

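# Hedged driver sketch: process_single_sentence is called once per raw sentence
# string, with end_of_para set on the last sentence of each paragraph. The
# preprocesser and doc objects are assumed to follow the interfaces used above;
# paragraphs is a hypothetical list of lists of sentence strings.
def _example_process_document(preprocesser, doc, paragraphs):
    for para in paragraphs:
        for (i, raw_text) in enumerate(para):
            preprocesser.process_single_sentence(doc, raw_text, i == len(para) - 1)
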
def do_segment(self, text, input_edus=None, parsed_filename=None,
               heads_filename=None, deps_filename=None):
    """
    Segments a text into elementary discourse units.

    Assumes the text is pre-processed so that ends of sentences are
    marked with <s> and ends of paragraphs with <p>.

    Returns a list containing:
    - a list of lexicalized syntax trees, one per sentence
    - a list of dependency parses, one per sentence
    - for each tree, a list of pairs (m, n), each marking a discourse
      unit between tokens at indices m and n in that tree
    - a list of unescaped EDUs (including paragraph-boundary markers)
    """
    segmented_text = self.split_by_sentence(text)

    if parsed_filename and heads_filename and deps_filename:
        # Re-use pre-computed parses instead of re-running the syntax parser.
        unlexicalized_trees = self.get_parsed_trees_from_string(open(parsed_filename).readlines())
        heads = open(heads_filename).read().split("\r\n\r\n")[:-1]

        lexicalized_trees = []
        for line in open(parsed_filename).readlines():
            line = line.strip()
            if line != '':
                t = LexicalizedTree.parse(line, leaf_pattern='(?<=\\s)[^\)\(]+')
                t.lexicalize(heads[len(lexicalized_trees)], from_string=True)
                lexicalized_trees.append(t)

        dep_parses = self.get_deps(deps_filename)
    else:
        unlexicalized_trees, heads, dep_parses = self.syntax_parser.parse(map(lambda x: x[0], segmented_text))
        lexicalized_trees = map(lambda x, y: self.create_lexicalized_tree(x, y), unlexicalized_trees, heads)

    edus_intervals_pairs = []
    edus = []

    if input_edus:
        # Gold EDUs were supplied: align them with the syntax trees.
        edus, edus_intervals_pairs = utils.utils.align_edus_with_syntax_trees(
            input_edus, lexicalized_trees, segmented_text, self.penn_special_chars)
    else:
        for i in range(0, len(lexicalized_trees)):
            t = lexicalized_trees[i]
            t_words = map(lambda x: t.unescape(x), t.leaves())

            # Vanessa's modification: suppress a predicted boundary when the
            # next token consists only of punctuation, so that no EDU starts
            # with a bare punctuation token.
            exclude_words = ".`':;!?"
            eval_boundaries = self.segment_tree(t)
            for j in range(len(eval_boundaries) - 1):
                if eval_boundaries[j] == 1.0 and eval_boundaries[j + 1] == 1.0:
                    if not t_words[j + 1].strip(exclude_words):
                        eval_boundaries[j] = -1.0

            # Collect boundary positions, always including the sentence edges.
            cur_edus_intervals = [k + 1 for k in range(0, len(eval_boundaries))
                                  if eval_boundaries[k] == +1.0]
            if 0 not in cur_edus_intervals:
                cur_edus_intervals = [0] + cur_edus_intervals
            if len(eval_boundaries) not in cur_edus_intervals:
                cur_edus_intervals.append(len(eval_boundaries))

            cur_edus = []
            cur_edus_intervals_pairs = []
            for j in range(0, len(cur_edus_intervals) - 1):
                cur_edus.append(t_words[cur_edus_intervals[j]:cur_edus_intervals[j + 1]])
                cur_edus_intervals_pairs.append((cur_edus_intervals[j], cur_edus_intervals[j + 1]))

            # Append the sentence/paragraph boundary marker to the last EDU.
            cur_edus[len(cur_edus) - 1].append(segmented_text[i][1])

            edus_intervals_pairs.append(cur_edus_intervals_pairs)
            edus.extend(cur_edus)

    return [lexicalized_trees, dep_parses, edus_intervals_pairs, edus]

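# Minimal end-to-end sketch of do_segment, assuming a segmenter instance built
# elsewhere. The input text is hypothetical and follows the convention in the
# docstring (<s> after each sentence, <p> after each paragraph);
# edus_intervals_pairs holds one list of (m, n) token-index pairs per tree.
def _example_do_segment(segmenter):
    text = 'John went home because he was tired . <s> He slept . <p> '
    trees, dep_parses, edu_spans, edus = segmenter.do_segment(text)
    for (m, n) in edu_spans[0]:
        print 'EDU between tokens %d and %d of sentence 0' % (m, n)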