def get_parsed_trees_from_string(tree_strings):
    """Parse one bracketed tree per non-empty line into LexicalizedTree objects.

    Args:
        tree_strings: either a single "\\n"-separated string of bracketed
            parse trees, or an iterable of individual tree strings.

    Returns:
        list of LexicalizedTree, one per non-blank line.
    """
    # The original comment promised a "\n"-separated string, but iterating
    # a plain string walks it character by character — split it first.
    # Iterables of lines (the other plausible calling convention) still work.
    if isinstance(tree_strings, str):
        tree_strings = tree_strings.split('\n')

    parsed_trees = []
    for line in tree_strings:
        line = line.strip()
        if line:
            parsed_trees.append(
                LexicalizedTree.fromstring(line,
                                           leaf_pattern='(?<=\\s)[^\)\(]+'))

    return parsed_trees
    def process_single_sentence(self, doc, raw_text):
        """Parse one raw sentence and append the resulting Sentence to doc."""
        new_sent = Sentence(len(doc.sentences), raw_text, doc)

        # One call yields both the constituency tree and the dependencies.
        tree_str, dep_output = self.parse_single_sentence(raw_text)

        unlex_tree = LexicalizedTree.fromstring(
            tree_str, leaf_pattern='(?<=\\s)[^\)\(]+')
        new_sent.set_unlexicalized_tree(unlex_tree)

        # Token ids are 1-based.
        for idx, leaf in enumerate(unlex_tree.leaves()):
            new_sent.add_token(Token(leaf, idx + 1, new_sent))

        head_list = self.get_heads(new_sent, dep_output.split('\n'))
        new_sent.heads = head_list
        new_sent.set_lexicalized_tree(
            prep_utils.create_lexicalized_tree(unlex_tree, head_list))

        doc.add_sentence(new_sent)
# Exemplo n.º 3
# 0
    def process_single_sentence(self, doc, raw_text, end_of_para):
        """Parse raw_text, tag it with a sentence ('<s>') or paragraph
        ('<P>') boundary marker, and append the Sentence to doc."""
        marker = '<P>' if end_of_para else '<s>'
        new_sent = Sentence(len(doc.sentences), raw_text + marker, doc)

        tree_str, dep_output = self.parse_single_sentence(raw_text)

        # NOTE(review): Python 2 idiom — this round-trip strips non-ASCII
        # bytes from the parser output; on Python 3, str has no .decode
        # and this line would raise. Confirm the target interpreter.
        tree_str = tree_str.decode('utf-8').encode(
            'ascii', 'ignore')

        unlex_tree = LexicalizedTree.fromstring(
            tree_str, leaf_pattern='(?<=\\s)[^\)\(]+')
        new_sent.set_unlexicalized_tree(unlex_tree)

        # Token ids are 1-based.
        for idx, leaf in enumerate(unlex_tree.leaves()):
            new_sent.add_token(Token(leaf, idx + 1, new_sent))

        head_list = self.get_heads(new_sent, dep_output.split('\n'))
        new_sent.heads = head_list
        new_sent.set_lexicalized_tree(
            prep_utils.create_lexicalized_tree(unlex_tree, head_list))

        doc.add_sentence(new_sent)
    def process_single_sentence(self, doc, raw_text, end_of_para):
        """Parse raw_text, tag it with a bytes sentence (b'<s>') or
        paragraph (b'<P>') boundary marker, and append the Sentence to doc.

        NOTE(review): the markers are bytes literals, so raw_text is
        presumably bytes here — on Python 3 a str raw_text would raise
        TypeError on concatenation; verify against callers.
        """
        marker = b'<P>' if end_of_para else b'<s>'
        new_sent = Sentence(len(doc.sentences), raw_text + marker, doc)

        # self.parse_single_sentence(raw_text) returns different result from
        # self.syntax_parser.parse_sentence(raw_text)
        tree_str, dep_output = self.parse_single_sentence(raw_text)

        unlex_tree = LexicalizedTree.fromstring(
            tree_str, leaf_pattern='(?<=\\s)[^\)\(]+')
        new_sent.set_unlexicalized_tree(unlex_tree)

        # Token ids are 1-based.
        for idx, leaf in enumerate(unlex_tree.leaves()):
            new_sent.add_token(Token(leaf, idx + 1, new_sent))

        head_list = self.get_heads(new_sent, dep_output.split('\n'))
        new_sent.heads = head_list
        new_sent.set_lexicalized_tree(
            prep_utils.create_lexicalized_tree(unlex_tree, head_list))

        doc.add_sentence(new_sent)