def create_lexicalized_tree(self, mrg, heads):
    """
    Creates a lexicalized syntax tree given an MRG-style parse and a Penn2Malt-style heads file.
    """
    t = LexicalizedTree.parse(mrg, leaf_pattern='(?<=\\s)[^\)\(]+')  # Vanessa's modification
    t.lexicalize(heads, from_string=True)

    return t
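For orientation, a minimal call sketch. The MRG string and the Penn2Malt-style heads block below are illustrative assumptions (the exact heads format expected by lexicalize is not shown in this snippet); prep_utils is the helper object used later in process_single_sentence. The leaf_pattern lookbehind treats each whitespace-preceded run of non-parenthesis characters as a leaf token.

# Hypothetical inputs: a one-sentence MRG parse and an assumed
# Penn2Malt-style heads block (word, POS tag, head index, relation).
mrg = '(S (NP (NNP John)) (VP (VBD slept)))'
heads = 'John\tNNP\t2\tSUB\nslept\tVBD\t0\tROOT'

tree = prep_utils.create_lexicalized_tree(mrg, heads)  # a LexicalizedTree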
def get_parsed_trees_from_string(self, tree_strings):
    # tree_strings: an iterable of lines (e.g. readlines() output), one parse per line
    parsed_trees = []
    for line in tree_strings:
        line = line.strip()
        if line != '':
            parsed_trees.append(LexicalizedTree.parse(line, leaf_pattern='(?<=\\s)[^\)\(]+'))

    return parsed_trees
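This helper expects readlines()-style input, as used in do_segment below. A hedged usage sketch; the file name and the segmenter instance are hypothetical:

# Hypothetical: one MRG parse per line in 'parses.mrg'
trees = segmenter.get_parsed_trees_from_string(open('parses.mrg').readlines())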
    def process_single_sentence(self, doc, raw_text, end_of_para):
        sentence = Sentence(len(doc.sentences), raw_text + ('<P>' if end_of_para else '<s>'), doc)
        parse_tree_str, deps_str = self.parse_single_sentence(raw_text)

        parse = LexicalizedTree.parse(parse_tree_str, leaf_pattern='(?<=\\s)[^\)\(]+')
        sentence.set_unlexicalized_tree(parse)

        # Token ids are 1-based, matching the dependency parser's numbering
        for (token_id, word) in enumerate(parse.leaves()):
            token = Token(word, token_id + 1, sentence)
            sentence.add_token(token)

        heads = self.get_heads(sentence, deps_str.split('\n'))
        sentence.heads = heads
        sentence.set_lexicalized_tree(prep_utils.create_lexicalized_tree(parse, heads))

        doc.add_sentence(sentence)
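A hedged sketch of the per-sentence flow this method drives. The Document constructor and the driver object name are assumptions beyond what the snippet shows; doc.sentences and sentence.heads do appear in the code above.

# Hypothetical driver code
doc = Document()
preprocessor.process_single_sentence(doc, 'John slept .', end_of_para=True)
sentence = doc.sentences[0]
print(sentence.heads)  # heads derived from the dependency output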
    def do_segment(self, text, input_edus=None, parsed_filename=None, heads_filename=None, deps_filename=None):
        """
        Segments a text into elementary discourse units (EDUs).
        Assumes the text is pre-processed so that sentence ends are marked with <s> and paragraph ends with <p>.

        Returns a list containing:
        - a list of lexicalized syntax trees, one per sentence
        - a list of dependency parses, one per sentence
        - a list of lists of (m, n) pairs, each marking a discourse unit spanning tokens m to n in the corresponding tree
        - a list of unescaped EDUs (including paragraph-boundary indications)
        """
        
        segmented_text = self.split_by_sentence(text)

        if parsed_filename and heads_filename and deps_filename:
            # Reuse pre-computed constituency parses, heads and dependencies
            parse_lines = open(parsed_filename).readlines()
            unlexicalized_trees = self.get_parsed_trees_from_string(parse_lines)
            heads = open(heads_filename).read().split("\r\n\r\n")[:-1]

            lexicalized_trees = []
            for line in parse_lines:
                line = line.strip()
                if line != '':
                    t = LexicalizedTree.parse(line, leaf_pattern='(?<=\\s)[^\)\(]+')
                    t.lexicalize(heads[len(lexicalized_trees)], from_string=True)
                    lexicalized_trees.append(t)

            dep_parses = self.get_deps(deps_filename)

        else:
            # Parse from scratch with the wrapped syntax parser
            unlexicalized_trees, heads, dep_parses = self.syntax_parser.parse([s[0] for s in segmented_text])
            lexicalized_trees = [self.create_lexicalized_tree(t, h) for (t, h) in zip(unlexicalized_trees, heads)]
        
        edus_intervals_pairs = []
        edus = []
        
        if input_edus:
            edus, edus_intervals_pairs = utils.utils.align_edus_with_syntax_trees(input_edus, lexicalized_trees, segmented_text, self.penn_special_chars)
        else:
            for i in range(len(lexicalized_trees)):
                t = lexicalized_trees[i]

                t_words = [t.unescape(word) for word in t.leaves()]

                # Vanessa's modification:
                # when two adjacent boundaries enclose a punctuation-only token,
                # cancel the first one so the punctuation stays attached to its EDU
                exclude_words = ".`':;!?"

                eval_boundaries = self.segment_tree(t)

                for j in range(len(eval_boundaries) - 1):
                    if eval_boundaries[j] == 1.0 and eval_boundaries[j + 1] == 1.0:
                        if not t_words[j + 1].strip(exclude_words):
                            eval_boundaries[j] = -1.0

                # Boundary positions, always including the sentence start and end
                cur_edus_intervals = [k + 1 for k in range(len(eval_boundaries)) if eval_boundaries[k] == 1.0]
                if 0 not in cur_edus_intervals:
                    cur_edus_intervals = [0] + cur_edus_intervals
                if len(eval_boundaries) not in cur_edus_intervals:
                    cur_edus_intervals.append(len(eval_boundaries))
   
                cur_edus = []
                cur_edus_intervals_pairs = []

                for j in range(len(cur_edus_intervals) - 1):
                    cur_edus.append(t_words[cur_edus_intervals[j]:cur_edus_intervals[j + 1]])
                    cur_edus_intervals_pairs.append((cur_edus_intervals[j], cur_edus_intervals[j + 1]))

                # Append the sentence/paragraph boundary marker to the last EDU
                cur_edus[-1].append(segmented_text[i][1])

                edus_intervals_pairs.append(cur_edus_intervals_pairs)
                edus.extend(cur_edus)
        
        return [lexicalized_trees, dep_parses, edus_intervals_pairs, edus]
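An end-to-end hedged sketch of calling do_segment on pre-marked text. The segmenter construction and the exact marker placement are assumptions; only the return contract above comes from the code.

# Hypothetical: text pre-processed with <s> / <p> markers as the docstring assumes
text = 'John went home . <s> He was tired . <s> <p>'
trees, dep_parses, intervals, edus = segmenter.do_segment(text)

for tree, sent_pairs in zip(trees, intervals):
    for (m, n) in sent_pairs:
        print(tree.leaves()[m:n])  # the tokens of one discourse unit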