def merge(treesFile, graphFile):
    """Merge dependency trees with their corresponding dependency graphs.

    Loads trees and graphs from the two files (assumed parallel: the i-th
    graph corresponds to the i-th tree), copies sentence-level metadata from
    each tree onto its graph, and enriches every graph node with features
    extracted from the matching tree node.

    @param treesFile: path to the dependency-trees file
    @param graphFile: path to the dependency-graphs file
    @return: list of enriched dependency graphs
    """
    trees = load_depTrees_from_file(treesFile)
    graphsFromFile = load_depGraphs_from_file(graphFile)
    graphs = []
    for i, t in enumerate(trees):
        curGraph, nodesMap = graphsFromFile[i]
        # copy sentence-level metadata from the tree root onto the graph
        # (the original assigned wsj_id/sent_id twice; once is enough)
        curGraph.wsj_id = t[0].wsj_id
        curGraph.sent_id = t[0].sent_id
        curGraph.originalSentence = t[0].original_sentence
        # drop the first line of the original format (presumably a header)
        curGraph.tree_str = "\n".join(
            t[0].to_original_format().split("\n")[1:])
        for node_id in nodesMap:
            # node ids may carry a trailing quote marker (e.g. "3'");
            # the numeric prefix indexes the tree node
            int_node_id = int(node_id.split("'")[0])
            treeNode = t[int_node_id]
            child_dic = treeNode._get_child_dic()
            if 'cc' in child_dic:
                # (joined cc words in sentence order, their ids as listed)
                conj_type = (" ".join([
                    cc.word
                    for cc in sorted(child_dic['cc'], key=lambda cc: cc.id)
                ]), [cc.id for cc in child_dic['cc']])
            else:
                conj_type = False
            # all graph nodes whose id shares this numeric prefix
            graphNodes = [
                nodesMap[n] for n in nodesMap
                if n.split("'")[0] == node_id
            ]
            for graphNode in graphNodes:
                graphNode.features = get_verbal_features(treeNode)
                if conj_type:
                    graphNode.features["conjType"] = conj_type
                graphNode.features["pos"] = treeNode.pos
                graphNode.isPredicate = treeNode.is_verbal_predicate()
                graphNode.original_text = treeNode.get_text()
                graphNode.surface_form += missing_children(treeNode, graphNode)
        curGraph.del_node(nodesMap['0'])  # delete artificial root node
        graphs.append(curGraph)
    return graphs
def parseVerbal(self, indexes, verbs, arguments, tree):
    """
    Add a verbal subgraph to the graph and return its head node.

    @type indexes: list [int]
    @param indexes: the index(es) of the verb in the sentence
    @type verbs: list [string]
    @param verbs: the string(s) representing the verb
    @type tree: DepTree
    @param tree: tree object from which to extract various features
    @type arguments: list
    @param arguments: list of DepTrees of arguments
    """
    # build the verbal head node, starting from its extracted features
    features = syntactic_item.get_verbal_features(tree)
    # a lemma identical to the surface verb carries no extra information
    if features['Lemma'] == verbs[0]:
        del features['Lemma']
    for featureName in features:
        self.types.add(featureName)

    headWords = [Word(index=ind, word=w) for ind, w in zip(indexes, verbs)]
    head = graph_representation.node.Node(isPredicate=True,
                                          text=headWords,
                                          features=features,
                                          valid=True)
    self.gr.add_node(head)

    # attach each argument subgraph under the verb, labeled by its relation
    for argTree in arguments:
        argNode = self.parse(argTree)
        self.gr.add_edge((head, argNode), argTree.parent_relation)

    # time expressions get a dedicated TimeNode between verb and subgraph
    timeSubtree, _ = tree._VERBAL_PREDICATE_SUBTREE_Time()
    if timeSubtree:
        timeNode = graph_representation.node.TimeNode.init(features={})
        self.gr.add_node(timeNode)
        timeSubGraph = self.parse(timeSubtree)
        self.gr.add_edge((head, timeNode))
        self.gr.add_edge((timeNode, timeSubGraph))
    return head
def treeNode_to_graphNode(treeNode, gr):
    """
    Convert a single dependency-tree node into a graph node.

    @type treeNode DepTree
    """
    features = get_verbal_features(treeNode)
    word = Word(index=treeNode.id, word=treeNode.word)
    graphNode = newNode.Node(text=[word],
                             isPredicate=treeNode.is_verbal_predicate(),
                             features=features,
                             gr=gr)
    graphNode.features["pos"] = treeNode.pos
    # keep a shallow copy of the text before any later mutation
    graphNode.original_text = copy(graphNode.text)
    return graphNode
def parseVerbal(self, indexes, verbs, arguments, tree):
    """
    Add a verbal subgraph to the graph and return its head node.

    @type indexes: list [int]
    @param indexes: the index(es) of the verb in the sentence
    @type verbs: list [string]
    @param verbs: the string(s) representing the verb
    @type tree: DepTree
    @param tree: tree object from which to extract various features
    @type arguments: list
    @param arguments: list of DepTrees of arguments
    """
    # extract the verb's features; drop a lemma that merely repeats the verb
    feats = syntactic_item.get_verbal_features(tree)
    if feats['Lemma'] == verbs[0]:
        del feats['Lemma']
    for key in feats:
        self.types.add(key)

    # create the verbal head node
    verbNode = graph_representation.node.Node(
        isPredicate=True,
        text=[Word(index=i, word=v) for i, v in zip(indexes, verbs)],
        features=feats,
        valid=True)
    self.gr.add_node(verbNode)

    # connect every argument to the verb via its parent relation
    for argument in arguments:
        argumentNode = self.parse(argument)
        self.gr.add_edge((verbNode, argumentNode), argument.parent_relation)

    # route time expressions through an intermediate TimeNode
    timeSubtree, _ = tree._VERBAL_PREDICATE_SUBTREE_Time()
    if timeSubtree:
        timeNode = graph_representation.node.TimeNode.init(features={})
        self.gr.add_node(timeNode)
        parsedTime = self.parse(timeSubtree)
        self.gr.add_edge((verbNode, timeNode))
        self.gr.add_edge((timeNode, parsedTime))
    return verbNode
def treeNode_to_graphNode(treeNode, gr):
    """
    Build a graph node mirroring one dependency-tree node.

    @type treeNode DepTree
    """
    feats = get_verbal_features(treeNode)
    ret = newNode.Node(
        text=[Word(index=treeNode.id, word=treeNode.word)],
        isPredicate=treeNode.is_verbal_predicate(),
        features=feats,
        gr=gr)
    ret.features["pos"] = treeNode.pos
    # snapshot the text so later surface-form edits don't lose the original
    ret.original_text = copy(ret.text)
    return ret
def read_dep_graphs_file(constituency_tree_fn, wsjInfo_exists=False, HOME_DIR="./", stanford_json_sent=None):
    """Read dependency graphs and enrich them from the parallel trees.

    Converts the input (JSON sentence if given, else the constituency-tree
    file) to a dependency-graph stream, then copies sentence metadata and
    per-node features from the corresponding trees onto each graph.

    @return: list of enriched dependency graphs
    """
    if stanford_json_sent:
        stream = convert_json_to_dep_graph(stanford_json_sent)
    else:
        stream = convert_to_dep_graph(constituency_tree_fn)
    graphsFromFile = create_dep_graphs_from_stream(stream, HOME_DIR)
    trees = read_trees_file(constituency_tree_fn, False, stanford_json_sent)
    graphs = []
    for idx, tree in enumerate(trees):
        curGraph, nodesMap = graphsFromFile[idx]
        curGraph.originalSentence = tree[0].original_sentence
        # drop the first line of the original format
        curGraph.tree_str = "\n".join(
            tree[0].to_original_format().split("\n")[1:])
        curGraph.dep_tree = tree
        for node_id in nodesMap:
            # the numeric prefix before any quote marker indexes the tree
            treeNode = tree[int(node_id.split("'")[0])]
            child_dic = treeNode._get_child_dic()
            if 'cc' in child_dic:
                orderedCC = sorted(child_dic['cc'], key=lambda cc: cc.id)
                conj_type = (" ".join([cc.word for cc in orderedCC]),
                             [cc.id for cc in child_dic['cc']])
            else:
                conj_type = False
            # every graph node sharing this numeric id prefix
            matchingNodes = [
                nodesMap[key] for key in nodesMap
                if key.split("'")[0] == node_id
            ]
            for graphNode in matchingNodes:
                graphNode.features = get_verbal_features(treeNode)
                if conj_type:
                    graphNode.features["conjType"] = conj_type
                graphNode.features["pos"] = treeNode.pos
                graphNode.isPredicate = treeNode.is_verbal_predicate()
                graphNode.original_text = treeNode.get_text()
                graphNode.surface_form += missing_children(treeNode,
                                                           graphNode)
        curGraph.del_node(nodesMap['0'])  # delete root
        graphs.append(curGraph)
    return graphs
def merge(treesFile, graphFile):
    """Merge dependency trees with their corresponding dependency graphs.

    Loads trees and graphs from the two files (assumed parallel: the i-th
    graph corresponds to the i-th tree), copies sentence-level metadata from
    each tree onto its graph, and enriches every graph node with features
    extracted from the matching tree node.

    @param treesFile: path to the dependency-trees file
    @param graphFile: path to the dependency-graphs file
    @return: list of enriched dependency graphs
    """
    trees = load_depTrees_from_file(treesFile)
    graphsFromFile = load_depGraphs_from_file(graphFile)
    graphs = []
    for i, t in enumerate(trees):
        curGraph, nodesMap = graphsFromFile[i]
        # sentence-level metadata from the tree root
        # (the original assigned wsj_id/sent_id twice; once is enough)
        curGraph.wsj_id = t[0].wsj_id
        curGraph.sent_id = t[0].sent_id
        curGraph.originalSentence = t[0].original_sentence
        # drop the first line of the original format (presumably a header)
        curGraph.tree_str = "\n".join(
            t[0].to_original_format().split("\n")[1:])
        for node_id in nodesMap:
            # node ids may carry a trailing quote marker (e.g. "3'");
            # the numeric prefix indexes the tree node
            int_node_id = int(node_id.split("'")[0])
            treeNode = t[int_node_id]
            child_dic = treeNode._get_child_dic()
            if 'cc' in child_dic:
                # (joined cc words in sentence order, their ids as listed)
                conj_type = (" ".join([
                    cc.word
                    for cc in sorted(child_dic['cc'], key=lambda cc: cc.id)
                ]), [cc.id for cc in child_dic['cc']])
            else:
                conj_type = False
            # all graph nodes whose id shares this numeric prefix
            graphNodes = [
                nodesMap[n] for n in nodesMap
                if n.split("'")[0] == node_id
            ]
            for graphNode in graphNodes:
                graphNode.features = get_verbal_features(treeNode)
                if conj_type:
                    graphNode.features["conjType"] = conj_type
                graphNode.features["pos"] = treeNode.pos
                graphNode.isPredicate = treeNode.is_verbal_predicate()
                graphNode.original_text = treeNode.get_text()
                graphNode.surface_form += missing_children(treeNode, graphNode)
        curGraph.del_node(nodesMap['0'])  # delete artificial root node
        graphs.append(curGraph)
    return graphs
def parse(self, t):
    """
    Get the graph representation from a syntactic representation.
    Returns through the graph parameter.

    Dispatches on the predicate type of the tree's top node; the order of
    the checks below is significant (a tree may satisfy several tests).

    @type t: DepTree
    @param t: syntactic tree to be converted

    @rtype: Node
    @return: the node in the graph corresponding to the top node in t
    """
    # order matters!
    if t.is_conditional_predicate():
        self.types.add(APPENDIX_COND)
        return self.parseConditional(outcome=t._CONDITIONAL_PREDICATE_FEATURE_Outcome()["Value"],
                                     condList=t.condPred)
    if t._VERBAL_PREDICATE_SUBTREE_Adv():
        advChildren = t.adverb_children
        advSubj = t.adverb_subj
        return self.parseAdverb(subj=advSubj, advChildren=advChildren)
    if t.is_conjunction_predicate():
        self.types.add(APPENDIX_CONJUNCTION)
        return self.parseConjunction(baseElm=t.baseElm, conjResult=t.conjResult)
    if t.is_appositional_predicate():
        self.types.add(APPENDIX_APPOS)
        firstEntity = t._APPOSITIONAL_PREDICATE_FEATURE_Left_Side()["Value"]
        secondEntity = t._APPOSITIONAL_PREDICATE_FEATURE_Right_Side()["Value"]
        return self.parseApposition(index=t.id, first_entity=firstEntity,
                                    second_entity=secondEntity)
    if t.is_relative_clause():
        self.types.add(APPENDIX_RCMOD)
        return self.parseRcmod(np=t._RELCLAUSE_PREDICATE_FEATURE_Rest()['Value'],
                               modList=t.rcmodPred)
    if t.is_prepositional_predicate():
        self.types.add(APPENDIX_PREP)
        return self.parsePreposition(psubj=t._PREPOSITIONAL_PREDICATE_FEATURE_psubj()["Value"],
                                     prepChildList=t.prepChildList)
    if t.is_copular_predicate():
        self.types.add(APPENDIX_COP)
        firstEntity = t._COPULAR_PREDICATE_FEATURE_Copular_Predicate()["Value"]
        secondEntity = t._COPULAR_PREDICATE_FEATURE_Copular_Object()["Value"]
        return self.parseCopular(index=t.id, first_entity=firstEntity,
                                 second_entity=secondEntity,
                                 features=syntactic_item.get_verbal_features(t))
    if t.is_possesive_predicate():
        self.types.add(APPENDIX_POSS)
        possessor = t._POSSESSIVE_PREDICATE_FEATURE_Possessor()["Value"]
        possessed = t._POSSESSIVE_PREDICATE_FEATURE_Possessed()["Value"]
        possessive = t._POSSESSIVE_PREDICATE_FEATURE_Possessive()["Value"]
        return self.parsePossessive(possessor=possessor, possessed=possessed,
                                    possessive=possessive)
    if t.is_adjectival_predicate():
        self.types.add(APPENDIX_ADJ)
        return self.parseProp(subject=t._ADJECTIVAL_PREDICATE_FEATURE_Subject()["Value"],
                              copulaIndex=NO_INDEX,
                              adjectiveChildList=t.adjectivalChildList,
                              propAsHead=False)
    if t.is_clausal_complement():
        self.types.add(APPENDIX_COMPLEMENT)
        return self.parseComplement(compSubj=t.compSubj, compChildren=t.compChildList)
    if t.unhandled_advcl():
        # put each unhandled advcl as a disconnected subgraph
        # NOTE(review): the trailing self.parse(t) assumes unhandled_advcl()
        # no longer holds on re-entry (otherwise this would recurse forever)
        # -- TODO confirm
        for c in t.advcl:
            self.parse(c)
        return self.parse(t)
    if t.is_verbal_predicate():
        self.types.add(APPENDIX_VERB)
        head_ret = t._VERBAL_PREDICATE_SUBTREE_Head()
        return self.parseVerbal(indexes=head_ret["Span"],
                                verbs=head_ret["Value"].split(" "),
                                arguments=t.collect_arguments(),
                                tree=t)
    else:
        # fall back - pack all the tree in a single node
        if len(t.children) == 1:
            if (t.children[0].parent_relation == "nn") and (t.word.endswith(",")) and (t.children[0].word.endswith(",")):
                # conjunction in disguise: "X, Y," joined by nn is treated
                # as an implicit "and" conjunction of the two nouns
                child = t.children[0]
                # temporarily detach the child so it is not re-packed,
                # then restore it after building the conjunction
                t.children = []
                ret = self.parseConjunction(cc=[(t.id, "and")],
                                            conjElements=[t, child])
                t.children = [child]
                return ret
        # collect the whole (filtered) subtree into one bottom node
        nodes = t._get_subtree(filter_labels_ban)
        text = [Word(index=index, word=nodes[index]) for index in sorted(nodes.keys())]
        topNode = self.parseBottom(text=sorted(text, key=lambda x: x.index),
                                   features=syntactic_item.get_verbal_features(t))
        return topNode
def parse(self, t):
    """
    Get the graph representation from a syntactic representation.
    Returns through the graph parameter.

    Dispatches on the predicate type of the tree's top node; the order of
    the checks below is significant (a tree may satisfy several tests).

    @type t: DepTree
    @param t: syntactic tree to be converted

    @rtype: Node
    @return: the node in the graph corresponding to the top node in t
    """
    # order matters!
    if t.is_conditional_predicate():
        self.types.add(APPENDIX_COND)
        return self.parseConditional(outcome=t._CONDITIONAL_PREDICATE_FEATURE_Outcome()["Value"],
                                     condList=t.condPred)
    if t._VERBAL_PREDICATE_SUBTREE_Adv():
        advChildren = t.adverb_children
        advSubj = t.adverb_subj
        return self.parseAdverb(subj=advSubj, advChildren=advChildren)
    if t.is_conjunction_predicate():
        self.types.add(APPENDIX_CONJUNCTION)
        return self.parseConjunction(baseElm=t.baseElm, conjResult=t.conjResult)
    if t.is_appositional_predicate():
        self.types.add(APPENDIX_APPOS)
        firstEntity = t._APPOSITIONAL_PREDICATE_FEATURE_Left_Side()["Value"]
        secondEntity = t._APPOSITIONAL_PREDICATE_FEATURE_Right_Side()["Value"]
        return self.parseApposition(index=t.id, first_entity=firstEntity,
                                    second_entity=secondEntity)
    if t.is_relative_clause():
        self.types.add(APPENDIX_RCMOD)
        return self.parseRcmod(np=t._RELCLAUSE_PREDICATE_FEATURE_Rest()['Value'],
                               modList=t.rcmodPred)
    if t.is_prepositional_predicate():
        self.types.add(APPENDIX_PREP)
        return self.parsePreposition(psubj=t._PREPOSITIONAL_PREDICATE_FEATURE_psubj()["Value"],
                                     prepChildList=t.prepChildList)
    if t.is_copular_predicate():
        self.types.add(APPENDIX_COP)
        firstEntity = t._COPULAR_PREDICATE_FEATURE_Copular_Predicate()["Value"]
        secondEntity = t._COPULAR_PREDICATE_FEATURE_Copular_Object()["Value"]
        return self.parseCopular(index=t.id, first_entity=firstEntity,
                                 second_entity=secondEntity,
                                 features=syntactic_item.get_verbal_features(t))
    if t.is_possesive_predicate():
        self.types.add(APPENDIX_POSS)
        possessor = t._POSSESSIVE_PREDICATE_FEATURE_Possessor()["Value"]
        possessed = t._POSSESSIVE_PREDICATE_FEATURE_Possessed()["Value"]
        possessive = t._POSSESSIVE_PREDICATE_FEATURE_Possessive()["Value"]
        return self.parsePossessive(possessor=possessor, possessed=possessed,
                                    possessive=possessive)
    if t.is_adjectival_predicate():
        self.types.add(APPENDIX_ADJ)
        return self.parseProp(subject=t._ADJECTIVAL_PREDICATE_FEATURE_Subject()["Value"],
                              copulaIndex=NO_INDEX,
                              adjectiveChildList=t.adjectivalChildList,
                              propAsHead=False)
    if t.is_clausal_complement():
        self.types.add(APPENDIX_COMPLEMENT)
        return self.parseComplement(compSubj=t.compSubj, compChildren=t.compChildList)
    if t.unhandled_advcl():
        # put each unhandled advcl as a disconnected subgraph
        # NOTE(review): the trailing self.parse(t) assumes unhandled_advcl()
        # no longer holds on re-entry (otherwise this would recurse forever)
        # -- TODO confirm
        for c in t.advcl:
            self.parse(c)
        return self.parse(t)
    if t.is_verbal_predicate():
        self.types.add(APPENDIX_VERB)
        head_ret = t._VERBAL_PREDICATE_SUBTREE_Head()
        return self.parseVerbal(indexes=head_ret["Span"],
                                verbs=head_ret["Value"].split(" "),
                                arguments=t.collect_arguments(),
                                tree=t)
    else:
        # fall back - pack all the tree in a single node
        if len(t.children) == 1:
            if (t.children[0].parent_relation == "nn") and (t.word.endswith(",")) and (t.children[0].word.endswith(",")):
                # conjunction in disguise: "X, Y," joined by nn is treated
                # as an implicit "and" conjunction of the two nouns
                child = t.children[0]
                # temporarily detach the child so it is not re-packed,
                # then restore it after building the conjunction
                t.children = []
                ret = self.parseConjunction(cc=[(t.id, "and")],
                                            conjElements=[t, child])
                t.children = [child]
                return ret
        # collect the whole (filtered) subtree into one bottom node
        nodes = t._get_subtree(filter_labels_ban)
        text = [Word(index=index, word=nodes[index]) for index in sorted(nodes.keys())]
        topNode = self.parseBottom(text=sorted(text, key=lambda x: x.index),
                                   features=syntactic_item.get_verbal_features(t))
        return topNode