file_name = '/home/momo/Dropbox/parse_trees/' + file_name
#file_name = "./"+file_name
"""
try:
    tree = pp.parse_sentence(s,'stat')
    #numerate_non_terminals(tree)
    dot_code = utils.nltk_tree_to_dot(tree)
    print dot_code
    utils.dot_to_image(dot_code, file_name + '_stat')
    print
except :
    print 'cannot parse with stat'
"""

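# Parse with the Stanford parser and render the constituency tree to an image named after
# file_name + '_stanford' (utils.nltk_tree_to_dot emits Graphviz DOT source, which
# utils.dot_to_image presumably renders to a .png).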
try:
    tree = pp.parse_sentence(s, 'stanford')
    #numerate_non_terminals(tree)
    tree = tree[0]
    dot_code = utils.nltk_tree_to_dot(tree)
    utils.dot_to_image(dot_code, file_name + '_stanford')

    #dot_code = utils.list_of_tripels_to_dot(dep)
    #utils.dot_to_image(dot_code, file_name + '_dep_stanford')

    print "stanford done"
except Exception:
    print 'cannot parse with stanford'
"""
try :
    tree = pp.parse_sentence(s,'berkeley')
    tree = tree[0]
Example #2
        else:
            print "====================================================================="

        pattern_dict = load_pattern_list()

        #for i in pattern_dict.items() :
        #    print i
        #raw_input()

        #s = "The Anaconda, or Water Boa, is one the world's largest snakes, when born they can be 3 feet (1m) long."
        #s = ' '.join(sys.argv[1:])

        sentences = sent_tokenize(s)
        for s in sentences:
            count += 1
            tree = pp.parse_sentence(s, parser)
            tree = tree[0]
            #tree = Tree('S', [Tree('NP', [Tree('NNP', ['Leon'])]), Tree('VP', [Tree('VBZ', ['hits']), Tree('NP', [Tree('NNP', ['Kai'])])]), Tree('.', ['.'])])

            path = utils.get_knoex_path()
            dot_code = utils.nltk_tree_to_dot(tree)
            utils.dot_to_image(dot_code, 'temptree_' + str(count))
            if show == 2:
                os.popen('gnome-open ' + 'temptree_' + str(count) + '.png')

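            # match the loaded patterns against this sentence's parse tree;
            # match_tree presumably yields a list of [subject, predicate, object] triples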
            g, _ = match_tree(tree, pattern_dict)
            graph += g

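        # drop empty placeholder triples before printing the extracted graph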
        while ['', '', ''] in graph:
            graph.remove(['', '', ''])
        print graph
Example #3
        return tree.leaves()

    for subtree in tree :
        terminals = get_terminals(subtree,node)
        if terminals != None :
            return terminals


if __name__ == '__main__':

    from nltk import Tree  
    import string
    import preprocessor

    tree = Tree('A',[Tree('A',['A','A']),'A'])
    tree = preprocessor.parse_sentence('Leon hits Kai.')
    print 'Tree:', tree
    print

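    # suffix every non-terminal label with an index (NP0, NP1, ...) so that
    # individual nodes can be addressed by name below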
    numerate_non_terminals(tree)
    print 'Num_Tree:', tree
    print

    print 'NP0:', get_terminals(tree,'NP0')
    print 'NP1:', get_terminals(tree,'NP1') 
    print 'NP2:', get_terminals(tree,'NP2')
    print 'NP3:', get_terminals(tree,'NP3')
    print 'NP4:', get_terminals(tree,'NP4')
    print

    combis = all_parsing_combinations(tree)
Example #4
def match_to_joined_terminals(match, parsetree):
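	# join the terminal words under each matched tree path into one string
	# (join() here is presumably string.join or a helper defined elsewhere in this module)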
	list_ = []
	for path in match :
		list_.append(join(parsetree.get_terminals(path)))
	return list_
	

def intersect(l1,l2):
	s = set(l1).intersection(set(l2))
	return list(s)


if __name__=="__main__":
	import preprocessor as pp
	tree = pp.parse_sentence('The python hits Kai.')
	tree = tree[0]
	pt = ParseTree(tree)
	print
	for item in pt.nt_dict.items() :
		print item
	print
	for path in pt.nodepaths :
		print path
	print

	print 'test get_subtree'
	print pt.get_subtree((1,))
	print 'test get_node'
	print pt.get_node((1,))
	print 'test get_terminals'
Example #5
        else :
            print "====================================================================="

        pattern_dict = load_pattern_list()

        #for i in pattern_dict.items() :
        #    print i
        #raw_input()

        #s = "The Anaconda, or Water Boa, is one the world's largest snakes, when born they can be 3 feet (1m) long."
        #s = ' '.join(sys.argv[1:])
        
        sentences = sent_tokenize(s)
        for s in sentences:
            count+=1
            tree = pp.parse_sentence(s,parser)
            tree = tree[0]
            #tree = Tree('S', [Tree('NP', [Tree('NNP', ['Leon'])]), Tree('VP', [Tree('VBZ', ['hits']), Tree('NP', [Tree('NNP', ['Kai'])])]), Tree('.', ['.'])])
            
            path = utils.get_knoex_path()
            dot_code = utils.nltk_tree_to_dot(tree)
            utils.dot_to_image(dot_code, 'temptree_'+str(count))
            if show == 2:
                os.popen('gnome-open ' + 'temptree_'+str(count)+'.png')

            g,_ = match_tree(tree, pattern_dict)
            graph += g

        while ['','',''] in graph:
            graph.remove(['','',''])
        print graph
def find_realation(text):

    # HEARST PATTERNS
    hp1 = r'NP\d+ such as ((NP\d+ ,\d+ )+(and |or ))?NP\d+'
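    # hp1 would match a numbered-label string like
    # "NP0 such as NP1 ,0 NP2 ,1 and NP3" (hypothetical example; ',0' is a numbered comma token)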

    def m2r_1(match,tree):
        NPs = re.findall(r'NP\d+', match)
        NPs = [string.join(tc.get_terminals(tree,NP)) for NP in NPs]
        return [(NP,'hyponym',NPs[0]) for NP in NPs[1:]]

    hp2 = r'NP0 VBZ0 NP1 \.0'

    def m2r_2(match,tree):
        subject = ' '.join(tc.get_terminals(tree,'NP0'))
        predicate = ' '.join(tc.get_terminals(tree,'VBZ0'))
        object_ = ' '.join(tc.get_terminals(tree,'NP1'))
        return [(subject,predicate,object_)]

    hp3 = r'NP\d+ is NP\d+'
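    # hp3 expects the literal word "is" to stay expanded in a combination string
    # while the surrounding noun phrases stay collapsed, e.g. "NP0 is NP1" (hypothetical)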

    def m2r_3(match,tree):
        NNs = re.findall(r'NP\d+', match)
        NNs = [string.join(tc.get_terminals(tree,NN)) for NN in NNs]
        return [(NN,'hyponym',NNs[0]) for NN in NNs[1:]]

    # m2r = [m2r_1, m2r_2, m2r_3]  # functions that map matches to relations
    # pattern_list = [hp1, hp2, hp3]
    m2r = [m2r_2]  # functions that map matches to relations
    pattern_list = [hp2]

    sentences = split_into_sentences(text)

    # The next part is rather unclean, but it should work: it produces strings in
    # which every noun phrase (and other non-terminal) is replaced by its numbered
    # label (NP0, ..., NPn), and then a regex search for the Hearst patterns is
    # applied to those strings to find hyponym relations.
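    # For example, 'Leon hits Kai .' parses to roughly
    # (S (NP (NNP Leon)) (VP (VBZ hits) (NP (NNP Kai))) (. .)); one of its
    # label/word combinations is "NP0 VBZ0 NP1 .0", which hp2 matches, and
    # m2r_2 then maps that match back to the triple ('Leon', 'hits', 'Kai').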

    relations = []
    for s in sentences:
        s = format_sentence(s)
        s += ' .'  # append a full stop so the Stanford parser sees a complete sentence
        tree = parse_sentence(s)
        print(type(tree))
        tc.numerate_non_terminals(tree) 

        combi = tc.all_parsing_combinations(tree)

        combi = [string.join(c) for c in combi]

        #for c in combi :
        #    print c

        #for c in combi : print c
        for i, pattern in enumerate(pattern_list) :
            pattern = re.compile(pattern)
            #open('combi','w').write(str(combi).replace(',','\n'))
            for c in combi :
                match = re.match(pattern,c)
                if match :
                    print 'match : ', match.group(), ' --> ', c
                    match = match.group()
                    if match :
                        tmp = m2r[i](match,tree)
                        print type(tmp), ' -- ', tmp
                        relations.extend(tmp)
    print 'relations', relations
    return set(relations)
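# A minimal usage sketch (not part of the original module): it assumes the helpers this
# function relies on (split_into_sentences, format_sentence, parse_sentence, tc) are
# importable here and that the underlying parser is configured.
if __name__ == '__main__':
    for subject, predicate, object_ in find_realation('Leon hits Kai'):
        print subject, '--', predicate, '-->', object_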