示例#1
0
def main():
    """Load the OntoNotes data and its head trees, then convert the trees.

    Builds a ``Conversion`` and a ``head`` helper, asks the former for the
    OntoNotes data plus the path of the head-tree file, loads the head trees
    from that path and hands both to ``Conversion.convertTrees``.
    """
    converter = Conversion()
    head_finder = head()
    ontonotes_data, trees_path = converter.loadOntonotes()
    # Earlier revisions called head_finder.loadHeadrules(...) on a
    # headrules.txt file here; only the head trees are loaded now.
    loaded_head_trees = head_finder.loadHeadTrees(trees_path)
    converter.convertTrees(ontonotes_data, loaded_head_trees)
示例#2
0
    def convert(self, head_trees, a_tree, amr):
        """
        Recursively convert a tree/subtree into amr.

        The head child of ``a_tree`` (as selected by
        ``head().getHeadNodeTree``) is converted first; its token index
        becomes the node that represents the whole subtree.  The remaining
        children are then converted and attached to that node.  How they are
        attached depends on ``modifier_structs``, the first value returned by
        ``getHeadNodeTree``:

        * empty dict            -- every non-head child is attached with the
                                   relation its own conversion returned;
        * only ``'CONJUNCTION'`` -- the head becomes an ``'and'`` node and the
                                   other children become ``op1``, ``op2``, ...;
        * anything else         -- the listed modifiers (``RELATIVE-CLAUSE``,
                                   ``CND``, ``BA``, ``DNP-NMOD``, ``PP*``) are
                                   handled specially, then the remaining
                                   children are attached as in the first case.

        Parameters:
            head_trees: passed through unchanged to ``getHeadNodeTree`` and to
                every recursive call.
            a_tree: the tree node to convert.  The code reads ``tag``,
                ``word``, ``children``, ``coreference_link``,
                ``coreference_chain`` and calls ``is_leaf()`` and
                ``get_token_index()`` on it.
            amr: the graph under construction; it is modified in place via
                indexing, ``del``, ``node_to_concepts`` and ``_add_triple``.

        Returns:
            A ``(rel, token_idx)`` pair.  ``token_idx`` is the token index of
            the node representing this subtree.  ``rel`` is the label the
            caller should use to attach it: ``-1`` for leaves that are
            dropped, ``None`` when no label is known, the ``"PRED-ARG"`` or
            ``"FUNCT"`` value from ``self.get_edge_labels``, ``'VPCOORD'`` or
            ``'prep-<TYPE>'``.  When the functional tag contains ``PN`` the
            result of ``self.convertPN`` is returned instead.

        Raises:
            TypeError: for a child relation of an unexpected type or an
                unknown modifier name.
            AssertionError: when one of the structural ``assert`` checks
                fails.
        """

        #if a_tree.tag == 'DNP':
        #    import pdb
        #    pdb.set_trace()
        def getDNPType(a_tr,h_id,sc_id):
            """Return the DNP child's tag pattern, e.g. 'PP+DEG'.

            The pattern joins, with '+', the part before the first '-' of
            each tag of the children of ``a_tr.children[sc_id]``.
            ``h_id`` is accepted but not used.
            """
            a_DNP_tree = a_tr.children[sc_id]
            tag_pattern = '+'.join(x.tag.split('-')[0] for x in a_DNP_tree.children)
            return tag_pattern
            
        def isPredicate(a_tr,h_idx):
            """check whether the current verb head is a predicate in propbank"""
            # NOTE(review): this helper is not called anywhere in convert().
            relative_idx = h_idx - a_tr.get_token_index()
            head_node = a_tr[relative_idx]
            if head_node.tag == 'VV':
                # NOTE(review): leftover debugger breakpoint; it would halt
                # execution here if this helper were ever called.
                import pdb
                pdb.set_trace()
                return head_node.proposition != None 
            else:
                return True

        def fix_predicate(amr,a_tree,h_idx):
            """fix the current amr by eliminating the non-predicate (in propbank) head
               here maybe a little confused since the predicates in propbank are not always
               the head in the treebank.
            """
            # Not implemented yet: placeholder only.
            pass
            
        def convertCompVerb(h_child,amr,h_idx):
            """Merge the parts of a compound verb into the single node h_idx.

            The node is deleted and looked up again, and its concept is set
            to the concatenation of the words of ``h_child``'s children.
            """
            del amr[h_idx]
            # presumably the lookup re-creates an empty node after the
            # delete; the value itself is unused -- TODO confirm
            foo = amr[h_idx]
            amr.node_to_concepts[h_idx] = ''.join(x.word for x in h_child.children)
            
        def isVPCoordination(a_tr):
            """Return True for a VP whose children start with VP, PU, VP.

            Intended shape: VP(VP(...)PU(,)VP(...)).
            """
            # re.match only anchors at the start, so any VP whose concatenated
            # child tags begin with 'VPPUVP' is accepted.
            if a_tr.tag == 'VP' and re.match('VPPUVP',''.join(x.tag for x in a_tr.children)):
                return True
            else:
                return False
        def recover_subject(amr,head_token_idx):
            """Copy an ARG0 found on one conjunct to the conjuncts lacking one.

            The conjuncts are the nodes referenced by the first element of
            each value of ``amr[head_token_idx]``.
            """
            arg0 = None
            # First pass: remember the ARG0 target of a conjunct that has one
            # (the last such conjunct wins).
            for op in amr[head_token_idx].values():
                subject = amr[op[0]].itemsfor('ARG0')
                if subject != []:
                    arg0 = subject[0][1]
            # Second pass: give that ARG0 to every conjunct without one.
            # NOTE(review): arg0 is still None here if no conjunct had an ARG0.
            for vp in amr[head_token_idx].values():
                subject = amr[vp[0]].itemsfor('ARG0')
                if subject == []:
                    amr._add_triple(vp[0],'ARG0',arg0)

        
        he = head()
        # modifier_structs maps a modifier name to a child index (see its use
        # below); head_child_id is the index of the head child in a_tree.children
        modifier_structs,head_child_id = he.getHeadNodeTree(head_trees, a_tree)
        
        #propbank args and functional tags for current tree node/leaf
        # NOTE(review): aftags is never read in this method.
        aftags = {}
        pred_args = self.get_edge_labels(a_tree)
        
        rel = None
        ftag = None
        if "PRED-ARG" in pred_args:
            rel = pred_args["PRED-ARG"]
        if "FUNCT" in pred_args:
            ftag = pred_args["FUNCT"]

        # fall back to the functional tag when there is no predicate-argument label
        if not rel and ftag:
            rel = ftag

        if a_tree.is_leaf():
            #if a_tree.word in [u'\u3002',u'\u300a',u'\uff01']: #punctuation
            #    return (None,)
            token_idx = a_tree.get_token_index()
            # leaves with these tags get no node; -1 tells the caller to skip them
            if a_tree.tag in ['PU','CS','SP','DEC','AS']:
                return (-1, token_idx)
            foo = amr[token_idx]  #add only the node with no children
            amr.node_to_concepts[token_idx] = a_tree.word #update the node_id->concept mapping
            return (rel,token_idx)
        else:
            # convert the head child first; its token index stands for this subtree
            head_rel, head_token_idx = self.convert(head_trees, a_tree.children[head_child_id], amr)        
            opNum = 1
            head_child = a_tree.children[head_child_id]
            

            # deal with compound verb
            if head_child.tag in ['VSB','VRD','VCD','VNV']:
                convertCompVerb(head_child,amr,head_token_idx)

            # the head itself was a dropped leaf (see the leaf case above)
            if head_rel == -1:
                if head_child.tag == 'PU':  # cannot drop conjunction head: the comma
                    foo = amr[head_token_idx]
                else:
                    # propagate -1 so that the caller drops this subtree's head too
                    rel = head_rel
            
            # coreference
            # record (head token, link) under the chain id for IDENT chains only
            if a_tree.coreference_link and a_tree.coreference_chain.type == 'IDENT':
                self.coref_chains[a_tree.coreference_chain.id].append((head_token_idx,a_tree.coreference_link))
            
            # functional tag containing PN: hand the whole subtree to convertPN
            if ftag and 'PN' in ftag.split('-'):
                return self.convertPN(a_tree,amr,rel,head_token_idx)
                
            # Case 1: no special modifiers -- attach every non-head child directly
            if modifier_structs == {}:
                for child_id, child in enumerate(a_tree.children):            
                    if child_id != head_child_id:
                        child_rel,child_token_idx = self.convert(head_trees, child, amr)   
                        if child_rel == None:
                            # no label known: use the placeholder relation "NA"
                            amr._add_triple(head_token_idx, "NA", child_token_idx)
                        elif isinstance(child_rel,list):
                            # a list of (pred, arg) pairs: attach the child to
                            # each pred with label arg rather than to the head
                            for pred,arg in child_rel:
                                amr._add_triple(pred, arg, child_token_idx)
                        elif isinstance(child_rel,unicode):
                            # map functional tags to argument labels:
                            # OBJ -> ARG1, SBJ (or anything ending in -SBJ) -> ARG0
                            if child_rel == u'OBJ':
                                child_rel = 'ARG1'
                            elif child_rel == u'SBJ' or child_rel.split('-')[-1] == u'SBJ':
                                child_rel = 'ARG0'
                            amr._add_triple(head_token_idx, child_rel, child_token_idx)
                        elif child_rel == -1:
                            # dropped leaf, nothing to attach
                            continue
                        else:
                            print a_tree.pretty_print()
                            raise TypeError('Wrong relation types: %s,%s'%(child_rel,child_rel.__class__.__name__))
                # the head was marked as a VP coordination by a deeper call
                if head_rel == 'VPCOORD':
                    recover_subject(amr,head_token_idx)

            # Case 2: coordination -- head becomes 'and', other children become op1..opN
            elif 'CONJUNCTION' in modifier_structs:
                assert len(modifier_structs) == 1
                for child_id, child in enumerate(a_tree.children):
                    if child_id != head_child_id:
                        child_rel,child_token_idx = self.convert(head_trees, child, amr)   
                        # if child_rel == PN convertPN
                        # NOTE(review): debug output, printed for every conjunct
                        print child_rel,child_token_idx

                        if child_rel == -1: # punctuation
                            continue
                        elif isinstance(child_rel,list):
                            for pred,arg in child_rel:
                                amr._add_triple(pred, arg, child_token_idx)
                        elif isinstance(child_rel,unicode):
                            pass
                        # NOTE(review): the -1 test below cannot be true here,
                        # it was already handled by the first branch
                        elif child_rel == None or child_rel == -1 or child_rel == 'PN' or child_rel == 'Q':
                            pass
                        else:
                            raise TypeError('Wrong relation types: %s,%s'%(child_rel,child_rel.__class__.__name__))
                        # every kept conjunct is attached as the next op<N>
                        amr._add_triple(head_token_idx, "op"+str(opNum), child_token_idx)
                        amr.node_to_concepts[head_token_idx] = 'and'
                        opNum+=1
                # mark VP coordination so the caller runs recover_subject
                if isVPCoordination(a_tree):
                    assert rel == None
                    rel = 'VPCOORD'
 
            # Case 3: one or more special modifiers
            else:  # current tree contains various kind of adjunction subtree
                # indexes of children already handled here; the generic loop
                # further down skips them
                spec_id_list = []
                # first deal with the adjunct subtrees
                for modifier in modifier_structs: 
                    spec_child_id = modifier_structs[modifier]
                    spec_id_list.append(spec_child_id)

                    if modifier == 'RELATIVE-CLAUSE':
                        spec_child = a_tree.children[spec_child_id]
                        spec_child_rel,spec_child_token_idx = self.convert(head_trees, spec_child, amr)

                        # a coordinated relative clause: apply convertRC to
                        # each op<N> target instead of to the 'and' node
                        if amr.node_to_concepts[spec_child_token_idx] == 'and': # conjunction
                            for relation,op in amr[spec_child_token_idx].items():
                                if relation.startswith('op'):
                                    self.convertRC(amr,a_tree,op[0],head_token_idx)
                        else:
                            self.convertRC(amr,a_tree,spec_child_token_idx,head_token_idx)

                        
                    elif modifier == 'CND':
                        spec_child = a_tree.children[spec_child_id]
                        spec_child_rel,spec_child_token_idx = self.convert(head_trees, spec_child, amr)
                        amr._add_triple(head_token_idx, 'condition', spec_child_token_idx)
                    elif modifier == 'BA':
                        spec_child = a_tree.children[spec_child_id]
                        # here we simply don't add the BA 
                    elif modifier == 'DNP-NMOD':
                        assert spec_child_id != -1
                        DNP_TYPE = getDNPType(a_tree,head_child_id,spec_child_id)
                        # different categories of complements in DNP
                        # (a DNP whose pattern matches neither branch below is
                        # not converted at all; it stays in spec_id_list)
                        if DNP_TYPE == 'PP+DEG':
                            spec_child = a_tree.children[spec_child_id]
                            spec_child_rel,spec_child_token_idx = self.convert(head_trees, spec_child, amr)
                            assert spec_child_rel == None
                            assert spec_child.children[-1].tag == 'DEG'
                            # the DNP node must have exactly one outgoing edge;
                            # re-attach that edge's target to the head with the
                            # same relation and remove the DNP node
                            rel_child_pairs = amr[spec_child_token_idx].items()
                            assert len(rel_child_pairs) == 1
                            pp_rel = rel_child_pairs[0][0]
                            pp_idx = rel_child_pairs[0][1]
                            del amr[spec_child_token_idx]
                            amr._add_triple(head_token_idx,pp_rel,pp_idx)
                            
                            # the siblings directly left and right of the DNP
                            # (if present and not the head) are attached as 'poss'
                            for poss_id in [spec_child_id - 1,spec_child_id + 1]:
                                if poss_id >= 0 and poss_id < len(a_tree.children) and poss_id != head_child_id:
                                    poss_child = a_tree.children[poss_id]
                                    poss_child_rel,poss_child_token_idx = self.convert(head_trees,poss_child, amr)
                                    assert poss_child_rel == None
                                    amr._add_triple(head_token_idx,'poss',poss_child_token_idx)
                                    spec_id_list.append(poss_id)

                        elif DNP_TYPE in ['NP+DEG','DP+DEG','ADJP+DEG']:

                            spec_child = a_tree.children[spec_child_id]
                            spec_child_rel,spec_child_token_idx = self.convert(head_trees,spec_child, amr)
                            assert spec_child_rel == None
                            assert spec_child.children[-1].tag == 'DEG'
                            rel_child_pairs = amr[spec_child_token_idx].items()
                            assert len(rel_child_pairs) == 1
                            xp_rel = rel_child_pairs[0][0]
                            # if xp_rel == 'PN': convertPN
                            assert xp_rel == None or xp_rel == -1 or xp_rel == 'PN' or xp_rel == 'NA'
                            xp_idx = rel_child_pairs[0][1]
                            # drop the DNP node and attach its single target
                            # to the head as 'mod'
                            del amr[spec_child_token_idx]
                            amr._add_triple(head_token_idx,'mod',xp_idx)
                                        
                    elif modifier.startswith('PP'): # preposition
                        # the PP subtype is whatever follows the last '-' in the modifier name
                        PP_TYPE = modifier.split('-')[-1]
                        
                        # the PP is a non-head child of a_tree
                        if spec_child_id != head_child_id:
                            spec_child = a_tree.children[spec_child_id]
                            spec_child_rel,spec_child_token_idx = self.convert(head_trees,spec_child, amr)
                            # print spec_child_rel
                            # NEED CHECK here:assert spec_child_rel ==  None
                            # get complement of preposition
                            #print spec_child.pretty_print()
                            #print spec_child_token_idx
                            #print amr[spec_child_token_idx].values()

                            # position of the PP's head token relative to the
                            # start of the PP subtree
                            relative_token_idx = spec_child_token_idx - spec_child.get_token_index()
                            if spec_child[relative_token_idx].tag == 'P':
                                # only one node(complement) follow preposition
                                # assert len(amr[spec_child_token_idx].values()) == 1 or len(amr[spec_child_token_idx].values()) == 0
                                # preposition without any complement: leave it alone
                                if len(amr[spec_child_token_idx].values()) == 0:
                                    continue

                                complement_idx = amr[spec_child_token_idx].values()[-1]
                                del amr[spec_child_token_idx] # here we eliminate the preposition within PP and replace it with :prep-x
                                amr._add_triple(head_token_idx,'prep-'+PP_TYPE,complement_idx)
                            else:
                                # add the child as normal 
                                # (pop removes the id appended at the top of
                                # this iteration, so the generic loop below
                                # picks the child up)
                                # NOTE(review): that loop calls convert() on
                                # this child a second time.
                                spec_id_list.pop()

                        # the PP is the head child itself
                        else:
                            relative_token_idx = head_token_idx - head_child.get_token_index()
                            if head_child[relative_token_idx].tag == 'P':
                                # only one node(complement) follow preposition
                                assert len(amr[head_token_idx].values()) == 1 or len(amr[head_token_idx].values()) == 0
                                if len(amr[head_token_idx].values()) == 0:
                                    continue
                                complement_idx = amr[head_token_idx].values()[0]
                                del amr[head_token_idx] # here we eliminate the preposition within PP and replace it with :prep-x
                                # the complement replaces the preposition as the
                                # node for this subtree, labelled prep-<TYPE>
                                # taken from the head child's own tag
                                head_token_idx = complement_idx[0]
                                PP_TYPE = head_child.tag.split('-')[-1]
                                rel = 'prep-'+PP_TYPE
                            else:
                                spec_id_list.pop()
                            

                            

                        
                    else:
                        raise TypeError('Wrong modifier_type: %s'%(modifier))
                
                # add other relation subtrees
                for child_id, child in enumerate(a_tree.children):
                    if child_id != head_child_id and child_id not in spec_id_list:
                        child_rel,child_token_idx = self.convert(head_trees, child, amr)   
                        
                        if child_rel == None:
                            amr._add_triple(head_token_idx, "NA", child_token_idx)
                        elif isinstance(child_rel,list):
                            for pred,arg in child_rel:
                                amr._add_triple(pred, arg, child_token_idx)
                        elif isinstance(child_rel,unicode):
                            # NOTE(review): unlike the loop for the empty
                            # modifier_structs case, only the exact tag SBJ is
                            # mapped to ARG0 here, not tags ending in -SBJ.
                            if child_rel == u'OBJ':
                                child_rel = 'ARG1'
                            elif child_rel == u'SBJ':
                                child_rel = 'ARG0'
                            amr._add_triple(head_token_idx, child_rel, child_token_idx)
                        elif child_rel == -1:
                            continue
                        else:
                            raise TypeError('Wrong relation types: %s,%s'%(child_rel,child_rel.__class__.__name__))
            
            # fix logical subject (LGS)
            # when the head has both a 'prep-LGS' edge and a non-empty ARG0,
            # ARG0 is replaced using the 'prep-LGS' value and the 'prep-LGS'
            # edge is removed
            if 'prep-LGS' in amr[head_token_idx] and amr[head_token_idx].itemsfor('ARG0') != []:
                amr[head_token_idx].replace('ARG0',amr[head_token_idx]['prep-LGS'])
                del amr[head_token_idx]['prep-LGS']
                        
            return rel, head_token_idx