Пример #1
0
def parse_using_stanfordparser(tokenized_sent,
                               display_tree=False,
                               printNP=False,
                               printLeave=False):
    """Parse a sentence and report the short noun phrases of each parse.

    ``tokenized_sent`` is handed unchanged to
    ``stanford_parser.tagged_parse`` (presumably a list of ``(word, tag)``
    pairs -- confirm against the parser wrapper in use).

    For every parse tree returned:

    * ``display_tree`` -- draw the tree.
    * ``printNP`` -- visit every ``NP`` subtree of height <= 6.  Nothing
      else happens for a parse when this is false.
    * ``printLeave`` -- only consulted when ``printNP`` is true.  When
      true, the words of each NP are printed, space-joined; a single-word
      NP is printed only if it contains an upper-case ASCII letter, ``_``
      or ``-``.  When false, the NP's ``(word, tag)`` pairs are passed to
      ``regexp_ner_m2`` together with the module-level ``regexp_grammar``
      (the value returned by that call is discarded).

    Returns ``None``.
    """
    # Compiled once instead of once per single-word noun phrase.
    single_word_filter = re.compile(r'[A-Z_-]+', re.X)

    for parse in stanford_parser.tagged_parse(tokenized_sent):
        if display_tree:
            parse.draw()
        if not printNP:
            continue

        short_nps = parse.subtrees(
            filter=lambda t: t.label() == 'NP' and t.height() <= 6)
        for phrase in short_nps:
            if printLeave:
                words = phrase.leaves()
                text = ' '.join(words)
                if len(words) != 1 or single_word_filter.search(text):
                    print(text)
            else:
                # pos() pairs each leaf with the label of its own
                # pre-terminal.  Collecting the labels of all subtrees
                # other than NP/S/VP (the previous approach) also picked
                # up nested phrase labels such as PP or ADJP, which
                # shifted every following tag onto the wrong word.
                # NOTE(review): encode('gbk') is kept from the original;
                # on Python 3 it yields bytes rather than str -- confirm
                # this is what regexp_ner_m2 should receive.
                tagged = [(word, tag.encode('gbk'))
                          for word, tag in phrase.pos()]
                regexp_ner_m2(regexp_grammar, tagged)
Пример #2
0
def getSecondLvNPsOfParseTree(parse_tree, nps, display_tree=False):
    """Collect and print every ``NP`` subtree whose height is exactly 3.

    The tree is walked depth-first.  For each matching NP the function
    prints a ``NP: <words>`` header, then one ``word<TAB>label<TAB>flag``
    line for every subtree of the NP whose label is in the module-level
    ``penni_tags``; ``flag`` is ``B-NP`` for the first such line and
    ``I-NP`` afterwards.  The NP's list of leaves is appended to ``nps``.

    Args:
        parse_tree: tree (or subtree) whose children are inspected.
        nps: list that receives the leaves of each matching NP; it is
            mutated in place.
        display_tree: when true, pretty-print ``parse_tree`` first.
            Recursive calls always pass ``False``.

    Returns ``None``.
    """
    if display_tree:
        Tree.pretty_print(parse_tree)

    for subtree in parse_tree:
        if not isinstance(subtree, Tree):
            # Reached a terminal (a plain token): nothing to descend into.
            continue

        if subtree.label() != 'NP' or subtree.height() != 3:
            # Not a height-3 NP: keep searching below this node.
            getSecondLvNPsOfParseTree(subtree, nps, False)
            continue

        words = subtree.leaves()
        print('\nNP: ' + ' '.join(words))

        # No recursion is needed inside a match: every proper subtree of
        # a height-3 tree has height <= 2, so it cannot contain another
        # height-3 NP.
        start_flag = "B-NP"
        for node in subtree.subtrees():
            if node.label() in penni_tags:
                print(node.leaves()[0] + '\t' + node.label() + '\t' + start_flag)
                start_flag = "I-NP"
        nps.append(words)
Пример #3
0
def generate_chunks(tagged_sent, expression=r'CHUNK: {(<adj>* <n.*>+ <prp>)? <adj>* <n.*>+}'):
    """Return the leaves of every ``CHUNK`` subtree found in a tagged sentence.

    A ``RegexpParser`` is built from ``expression`` and applied to
    ``tagged_sent``; an empty sentence is represented by an empty ``S``
    tree instead of being parsed.  If a ``ValueError`` is raised while
    parsing or collecting, an empty list is returned.

    Args:
        tagged_sent: the tagged sentence handed to the chunk parser.
        expression: chunk grammar; the default defines a ``CHUNK`` rule.

    Returns:
        A list with one entry (the subtree's leaves) per ``CHUNK`` subtree.
    """
    parser = RegexpParser(expression)
    try:
        parsed = (Tree('S', []) if len(tagged_sent) == 0
                  else parser.parse(tagged_sent, trace=0))
        return [node.leaves()
                for node in parsed.subtrees()
                if node.label() == "CHUNK"]
    except ValueError:
        # Same outcome as before: any partial result is discarded.
        return []
Пример #4
0
def regexp_ner_m2(grammar_re, tagged_sentence):
    """Extract ``NE`` chunks from a tagged sentence using a regexp grammar.

    ``tagged_sentence`` is parsed with ``nltk.RegexpParser(grammar_re)``.
    Every subtree labelled ``NE`` with height <= 5 contributes one string:
    the first element of each of its leaves, joined with single spaces.

    Args:
        grammar_re: chunk grammar passed to ``nltk.RegexpParser``.
        tagged_sentence: the tagged sentence to chunk.

    Returns:
        A list of strings, one per matching ``NE`` subtree (empty when
        there is none).
    """
    chunker = nltk.RegexpParser(grammar_re)
    parsed = chunker.parse(tagged_sentence)
    entities = parsed.subtrees(
        filter=lambda t: t.label() == 'NE' and t.height() <= 5)
    # The former ``if nps is not []`` guard compared identity against a
    # brand-new list and was therefore always true; iterating an empty
    # result already does nothing, so no guard is needed.
    return [' '.join(leaf[0] for leaf in entity.leaves())
            for entity in entities]
Пример #5
0
def get_noun_phrases(text_list, tagger):
    """Return the noun phrases found in a list of texts.

    Each text is split on whitespace and tagged with ``tagger.tag``; the
    tagged sentence is then chunked with a ``RegexpParser`` using a
    ``NOUN_PHRASE`` grammar.  Every ``NOUN_PHRASE`` subtree contributes
    one list holding the first element of each of its leaves.

    A sentence whose chunking raises ``ValueError`` is skipped as a
    whole.  Previously the handler reset the accumulated list, which
    silently threw away the phrases of every sentence processed before
    the failing one while still keeping those of later sentences.

    Args:
        text_list: iterable of strings.
        tagger: object exposing ``tag(list_of_tokens)``.

    Returns:
        A list of noun phrases, each a list of leaf first-elements.
    """
    noun_phrases = []
    tagged_texts = [tagger.tag(text.split()) for text in text_list]

    expression = r'NOUN_PHRASE: {(<adj>* <n.*>+ <prp>)? <adj>* <n.*>+}'
    chunk_parser = RegexpParser(expression)

    for tagged_sent in tagged_texts:
        if not tagged_sent:
            # An empty sentence cannot contain a noun phrase.
            continue
        try:
            tree = chunk_parser.parse(tagged_sent, trace=0)
            # Collected separately so that a failure part-way through
            # leaves no partial output from this sentence behind.
            sentence_phrases = [
                [leaf[0] for leaf in subtree.leaves()]
                for subtree in tree.subtrees()
                if subtree.label() == "NOUN_PHRASE"
            ]
        except ValueError:
            continue
        noun_phrases.extend(sentence_phrases)
    return noun_phrases
Пример #6
0
def getFirstLvNPsOfParseTree(parse_tree, nps, display_tree=False):
    """Collect and print the outermost ``NP`` subtrees of a parse tree.

    The children of ``parse_tree`` are visited in order.  A child
    labelled ``NP`` is reported and not searched any further; any other
    tree child is searched recursively; non-tree children (terminals)
    are ignored.

    For each reported NP a ``NP: <words>`` header is printed, followed by
    one ``word<TAB>label<TAB>flag`` line for every subtree of the NP
    whose label is in the module-level ``penni_tags`` (``B-NP`` on the
    first line, ``I-NP`` afterwards).  The NP's leaves are appended to
    ``nps``, which is mutated in place.

    ``display_tree`` pretty-prints ``parse_tree`` before the walk;
    recursive calls always pass ``False``.
    """
    if display_tree:
        Tree.pretty_print(parse_tree)

    for child in parse_tree:
        if not isinstance(child, Tree):
            # Terminal reached.
            continue

        if child.label() != 'NP':
            getFirstLvNPsOfParseTree(child, nps, False)
            continue

        print('\nNP: ' + ' '.join(child.leaves()))
        bio_flag = "B-NP"
        # subtrees() also yields nodes that may or may not be
        # pre-terminals; only those labelled with a known tag are printed.
        for node in child.subtrees():
            if node.label() not in penni_tags:
                continue
            print(node.leaves()[0] + '\t' + node.label() + '\t' + bio_flag)
            bio_flag = "I-NP"
        nps.append(child.leaves())
Пример #7
0
    Tree('List', [
        Tree('Item', ['f1']),
        Tree('List',
             [Tree('Item', ['f2']),
              Tree('List', [Tree('Item', ['f3'])])])
    ]), 'to',
    Tree('Item', ['folder'])
])

# Show t2 itself, the result of flatten(), the type of that result, and
# whatever collapse_unary() returns.
# NOTE(review): if t2 is an nltk Tree, collapse_unary() presumably works in
# place and returns None, so the last print would show "None" -- confirm.
print(t2)
print(t2.flatten())
print(type(t2.flatten()))
print(t2.collapse_unary())

# Find the 'List'-labelled subtree of t2 whose flattened pos() list is the
# longest.  The search starts from an empty tree and uses a strict '>', so
# the first subtree wins on ties and the empty tree is kept if no candidate
# has a non-empty pos() list.
max_subtree = Tree('', [])
for subtree in t2.subtrees(filter=lambda x: x.label() == 'List'):
    if len(subtree.flatten().pos()) > len(max_subtree.pos()):
        max_subtree = subtree
print(max_subtree)
'''
tmp = t2
i = 
while tmp.label() != 'List':
	i += 1
	tmp = tmp.pos()
	print("the " + str(i) + " time:")
	print(tmp)
	print(type(tmp))
tmp = tmp.flatten()

print(t2)
Пример #8
0
	'copy', 
	Tree('List', [
		Tree('Item', ['f1']), 
		Tree('List', [
			Tree('Item', ['f2']),
			Tree('List', [Tree('Item', ['f3'])])])]),
	'to', 
	Tree('Item', ['folder'])])

# Show t2 itself, the result of flatten(), the type of that result, and
# whatever collapse_unary() returns.
# NOTE(review): if t2 is an nltk Tree, collapse_unary() presumably works in
# place and returns None, so the last print would show "None" -- confirm.
print(t2)
print(t2.flatten())
print(type(t2.flatten()))
print(t2.collapse_unary())

# Find the 'List'-labelled subtree of t2 whose flattened pos() list is the
# longest.  The search starts from an empty tree and uses a strict '>', so
# the first subtree wins on ties and the empty tree is kept if no candidate
# has a non-empty pos() list.
max_subtree = Tree('', [])
for subtree in t2.subtrees(filter = lambda x: x.label() == 'List'):
	if len(subtree.flatten().pos()) > len(max_subtree.pos()):
		max_subtree = subtree
print(max_subtree)


'''
tmp = t2
i = 
while tmp.label() != 'List':
	i += 1
	tmp = tmp.pos()
	print("the " + str(i) + " time:")
	print(tmp)
	print(type(tmp))
tmp = tmp.flatten()