def setArgs(self, art_id, sent_id, sent, idx):
    """Attach the dependency children of token ``idx`` as arguments, depth-first.

    For every ``(dependency, child_index)`` pair returned by
    ``sent.get_children(idx)`` an ``Argument`` is created on the Part rooted
    at ``idx``, the child Part is linked to its new parent, and the argument
    is assigned to an argument cluster of the parent's cluster (created when
    none exists for the argument type).  The child's own subtree is then
    processed recursively.

    Parameters:
        art_id:  article identifier used to build tree-node ids.
        sent_id: sentence identifier used to build tree-node ids.
        sent:    sentence object providing ``get_children``.
        idx:     index of the token whose children are attached.

    Returns:
        None.
    """
    this_part = Part.getPartByRootNodeId(genTreeNodeID(art_id, sent_id, idx))
    this_node = this_part.getRelTreeRoot()
    node_clust = Clust.getClust(this_part.getClustIdx())

    children = sent.get_children(idx)
    if children is None:
        return None

    for dependency, child_index in children:
        child_node_id = genTreeNodeID(art_id, sent_id, child_index)
        path = Path(dependency)
        argTypeIdx = path.getArgType()
        child_part = Part.getPartByRootNodeId(child_node_id)

        # Other callers in this file treat a missing Part (None) as a token
        # to skip; dereferencing it here would raise AttributeError.
        if child_part is None:
            continue
        # A child that already has a parent must not be attached twice.
        if child_part.getParPart() is not None:
            continue

        arg = Argument(this_node, path, child_part)
        argIdx = this_part.addArgument(arg)
        child_part.setParent(this_part, argIdx)

        argClustIdxs = node_clust.getArgClustIdxs(argTypeIdx)
        if argClustIdxs is None:
            argClustIdx = node_clust.createArgClust(argTypeIdx)
        else:
            argClustIdx = next(iter(argClustIdxs))
        this_part.setArgClust(argIdx, argClustIdx, clust_only=True)

        # Recurse through the bound method: the bare ``setArgs(...)`` call
        # omitted ``self`` and did not match this function's signature.
        self.setArgs(art_id, sent_id, sent, child_index)

    return None
def createAgenda(self, verbose=False):
    """Seed the agenda from every eligible cluster.

    Walks the cluster ids returned by ``Part.getClustPartRootNodeIds()``,
    skips clusters whose type is not ``'C'`` and clusters flagged as stop,
    and calls ``addAgendaForNewClust`` for each of the rest.  When
    ``verbose`` is true, progress is printed each time a new tenth
    (10%..90%) of the total cluster count has been processed.

    Returns:
        None.
    """
    if verbose:
        clust_cnt = len(Part.getClustPartRootNodeIds())
        # Each tenth is reported at most once.
        milestones = set(range(1, 10))
        i = 0

    for clust_id in Part.getClustPartRootNodeIds():
        candidate = Clust.getClust(clust_id)
        if candidate.getType() != 'C' or candidate.isStop():
            continue

        self.addAgendaForNewClust(clust_id, verbose)

        if not verbose:
            continue
        # Only clusters that were actually added count towards progress.
        i += 1
        done = math.floor(i * 10 / clust_cnt)
        if done in milestones:
            milestones.remove(done)
            print("{}% complete.".format(done * 10))

    return None
def part_from_node(ai, sj, sent, k, tok):
    """Create the Part for token ``k`` and place it in a cluster.

    Tokens for which ``Parse.isIgnore(sent, k)`` is true are skipped.
    Otherwise a ``TreeNode`` and ``Part`` are built for the token, and the
    part is assigned to the first cluster already holding its relation type,
    or to a newly created cluster when there is none.

    Returns:
        None.
    """
    if Parse.isIgnore(sent, k):
        return None

    node = TreeNode(genTreeNodeID(ai, sj, k), tok)
    new_part = Part(node)
    rel_type = new_part.getRelTypeIdx()

    candidates = Clust.getClustsWithRelType(rel_type)
    if candidates is None:
        target = Clust.createClust(rel_type)
    else:
        target = next(iter(candidates))

    new_part.setClust(target)
    return None
def mergeArg(self, clust, aci1, aci2):
    """Move every argument assigned to argument cluster ``aci2`` over to ``aci1``.

    Iterates a copy of the part ids recorded on ``clust._argClusts[aci2]``
    and, for each such part, reassigns each argument currently mapped to
    ``aci2`` by calling ``setArgClust(arg_id, aci1)``.

    Returns:
        None.
    """
    source = clust._argClusts[aci2]
    # Iterate a copy of the id collection, as the original does.
    for root_id in source._partRootTreeNodeIds.copy():
        owner = Part.getPartByRootNodeId(root_id)
        for arg_idx, current in owner._argIdx_argClustIdx.items():
            if current != aci2:
                continue
            owner.setArgClust(arg_idx, aci1)
    return None
def addAgendaForNewClust(self, newClustIdx, verbose=False):
    """Queue agenda updates for every unordered pair of parts in a cluster.

    Each pair of distinct part root-node ids in cluster ``newClustIdx`` is
    passed once to ``addAgendaAfterMergeClust`` as ``(larger id's part,
    smaller id's part)``.

    Parameters:
        newClustIdx: index of the cluster whose parts are paired up.
        verbose:     accepted for interface compatibility; unused here.

    Returns:
        None.
    """
    # The inner loop stops at the first id that is not smaller than the outer
    # one, which only enumerates every pair when ids are visited in ascending
    # order.  Sort explicitly so no pair is silently dropped if the underlying
    # container is unordered; sorting also snapshots the ids, so changes made
    # by addAgendaAfterMergeClust cannot disturb the iteration.
    ordered_ids = sorted(Part.getClustPartRootNodeIds()[newClustIdx])
    if len(ordered_ids) < 2:
        return None

    for node_id in ordered_ids:
        part_1 = Part.getPartByRootNodeId(node_id)
        for node_id2 in ordered_ids:
            if node_id <= node_id2:
                break
            part_2 = Part.getPartByRootNodeId(node_id2)
            self.addAgendaAfterMergeClust(part_1, part_2)

    return None
def initializeSent(self, ai, sj, sent, verbose=False):
    '''
    Create TreeNode, Part, and Clust for each token in a sentence, also
    adding/assigning RelTypes. Increment the root count for the cluster
    assigned to each root token (tokens that are children of index 0).
    Finally, run createArgs() on each root to define the parent-child
    relationships; that call is recursive and walks the dependency tree
    below the root.

    Parameters:
        ai:      article identifier used to build tree-node ids.
        sj:      sentence identifier used to build tree-node ids.
        sent:    sentence object (get_tokens / get_token / get_children).
        verbose: forwarded to createArgs().

    Returns:
        None.
    '''
    num_tokens = len(sent.get_tokens())
    # Index 0 is not counted as a token.
    self.numTkns += num_tokens - 1

    roots = sent.get_children(0)
    if roots is None or len(roots) == 0:
        return None

    for k in range(1, num_tokens):
        Parse.part_from_node(ai, sj, sent, k, sent.get_token(k))

    for _, idx in roots:
        sub_node_id = genTreeNodeID(ai, sj, idx)
        # Is this global set really necessary? I don't think it is...
        self.rootTreeNodeIds.add(sub_node_id)
        node_part = Part.getPartByRootNodeId(sub_node_id)
        if node_part is None:
            # No Part exists for this root token; nothing to attach.
            continue
        ncl = Clust.getClust(node_part.getClustIdx())
        ncl.incRootCnt()
        # Pass verbose by keyword: createArgs' fifth positional parameter is
        # `done`, so a positional `verbose` was silently bound to it.
        self.createArgs(ai, sj, sent, idx, verbose=verbose)

    return None
def execCompose(self, op):
    """Execute a compose operation between a parent and a child cluster.

    For every (parent part, child part) id pair recorded in
    ``Part.pairClustIdxs_pairPartRootNodeIds`` for the cluster pair named by
    ``op``, the child's relation tree is added under the parent's, the parent
    part is moved to (or re-typed within) the cluster of the resulting
    relation type, the child's arguments are re-attached to the parent, and
    the child part is destroyed.  The pair entry is deleted afterwards.

    Parameters:
        op: operation object providing ``_parClustIdx`` and ``_chdClustIdx``.

    Returns:
        The id of the cluster that received the composed parts, or -1 when
        either input cluster is missing.  -1 is also what is returned if no
        pair was processed, because ``new_clust_id`` is only set inside the
        loop.

    Raises:
        Exception: if more than one cluster already holds the new relation
            type.
    """
    parClustIdx = op._parClustIdx
    chdClustIdx = op._chdClustIdx
    new_clust_id = -1
    #
    # If either cluster is None, return -1
    #
    if Clust.getClust(parClustIdx) is None or Clust.getClust(
            chdClustIdx) is None:
        return -1
    new_clust = None
    parent_child_pair = (parClustIdx, chdClustIdx)
    # Work on a private copy of the pair ids, not the stored collection.
    part_ids = set()
    part_ids.update(
        Part.pairClustIdxs_pairPartRootNodeIds[parent_child_pair])
    # Root ids of child parts destroyed so far; pairs that mention one of
    # them are skipped.
    deleted_parts = []
    for parent_id, child_id in part_ids:
        if parent_id in deleted_parts or child_id in deleted_parts:
            continue
        parent_part = Part.getPartByRootNodeId(parent_id)
        child_part = Part.getPartByRootNodeId(child_id)
        # Dependency label of the argument through which the child hangs off
        # the parent.
        dep = parent_part.getArguments()[
            child_part._parArgIdx]._path.getDep()
        parent_part._relTreeRoot.addChild(dep, child_part._relTreeRoot)
        # Relation type of the parent's tree now that the child is included.
        nrti = RelType.getRelType(parent_part._relTreeRoot)
        if new_clust is None:
            # on first loop
            rel_clusts = Clust.getClustsWithRelType(nrti)
            if rel_clusts is None:
                new_clust = Clust.getClust(Clust.createClust(nrti))
            elif len(rel_clusts) > 1:
                raise Exception
            else:
                new_clust = Clust.getClust(next(iter(rel_clusts)))
        new_clust_id = new_clust.getId()
        # The child is absorbed, so it is no longer an argument of the parent.
        parent_part.removeArgument(child_part._parArgIdx)
        if parent_part.getClustIdx() != new_clust_id:
            # Parent changes cluster: detach its arguments, switch cluster,
            # then re-attach each argument to an argument cluster of the new
            # cluster.
            for argIdx in parent_part.getArguments():
                parent_part.unsetArgClust(argIdx)
                arg = parent_part.getArgument(argIdx)
                arg._argPart.unsetParent()
            parent_part.changeClust(new_clust_id, nrti)
            for argIdx, arg in parent_part.getArguments().items():
                arg_type = arg._path.getArgType()
                arg_clust_id = -1
                if arg_type not in new_clust._argTypeIdx_argClustIdxs:
                    arg_clust_id = new_clust.createArgClust(arg_type)
                elif len(
                        new_clust._argTypeIdx_argClustIdxs[arg_type]) == 0:
                    arg_clust_id = new_clust.createArgClust(arg_type)
                else:
                    arg_clust_id = next(
                        iter(new_clust._argTypeIdx_argClustIdxs[arg_type]))
                arg._argPart.setParent(parent_part, argIdx)
                parent_part.setArgClust(argIdx, arg_clust_id)
            parent_part.setRelTypeIdx(nrti)
        else:
            # Parent is already in the target cluster: only its relation
            # type is replaced.
            parent_part.unsetRelTypeIdx()
            parent_part.setRelTypeIdx(nrti)
        #
        # Connect the child part's arguments directly to the parent part now
        #
        for argIdx, arg in child_part.getArguments().items():
            child_part.unsetArgClust(argIdx)
            arg_type = arg._path.getArgType()
            arg_clust_id = -1
            if arg_type not in new_clust._argTypeIdx_argClustIdxs:
                arg_clust_id = new_clust.createArgClust(arg_type)
            elif len(new_clust._argTypeIdx_argClustIdxs[arg_type]) == 0:
                arg_clust_id = new_clust.createArgClust(arg_type)
            else:
                arg_clust_id = next(
                    iter(new_clust._argTypeIdx_argClustIdxs[arg_type]))
            newArgIdx = parent_part.addArgument(arg)
            arg._argPart.setParent(parent_part, newArgIdx)
            parent_part.setArgClust(newArgIdx, arg_clust_id)
        #
        # Remove the old child part
        #
        deleted_parts.append(child_part.getRelTreeRoot().getId())
        child_part.destroy()
    # Part.clustIdx_pairClustIdxs[parClustIdx].remove(pci)
    # Part.clustIdx_pairClustIdxs[chdClustIdx].remove(pci)
    del Part.pairClustIdxs_pairPartRootNodeIds[parent_child_pair]
    return new_clust_id
def execMC(self, op):
    """Execute a merge-clusters operation.

    The cluster with fewer argument clusters is merged into the other one:
    its argument clusters are mapped onto the target's (first via
    ``scoreMCForAlign``, then by argument type for anything left unmapped),
    its parts are remapped to the target cluster, and it is removed.

    Parameters:
        op: operation object providing ``_clustIdx1`` and ``_clustIdx2``.

    Returns:
        The id of the surviving cluster, or -1 when either cluster is
        missing.
    """
    cluster1 = Clust.getClust(op._clustIdx1)
    cluster2 = Clust.getClust(op._clustIdx2)
    if cluster1 is None or cluster2 is None:
        return -1

    # Keep the cluster with more argument clusters as the merge target.
    if len(cluster1._argClusts) < len(cluster2._argClusts):
        cluster1, cluster2 = cluster2, cluster1

    # Score-based alignment of argument clusters (cluster2 -> cluster1).
    scorer = self._parse.scorer
    _, aci2_aci1 = scorer.scoreMCForAlign(cluster1, cluster2, dict())

    # Anything the alignment left out is mapped through its first argument
    # type: reuse an argument cluster of that type or create one.
    for source_id in cluster2._argClusts:
        if source_id in aci2_aci1:
            continue
        source_arg_clust = cluster2._argClusts[source_id]
        type_iter = iter(source_arg_clust._argTypeIdx_cnt)
        try:
            first_type = next(type_iter)
        except StopIteration:
            # No argument types recorded: nothing to map.
            continue
        existing = cluster1.getArgClustIdxs(first_type)
        if existing is None:
            target_id = cluster1.createArgClust(first_type)
        else:
            target_id = next(iter(existing))
        aci2_aci1[source_id] = target_id

    # Remap the parts of cluster 2 onto cluster 1, detaching and then
    # re-attaching their arguments around the change.
    moving_ids = set(Part.getPartRootNodeIds(cluster2.getId()))
    for moving_id in moving_ids:
        moving = Part.getPartByRootNodeId(moving_id)
        for argument in moving.getArguments().values():
            argument._argPart.unsetParent()
        moving.changeClustRemap(cluster1.getId(), aci2_aci1)
        for arg_index, argument in moving.getArguments().items():
            argument._argPart.setParent(moving, arg_index)

    Clust.removeClust(cluster2)
    return cluster1.getId()
def scoreMergeArgs(self, clust, arg1, arg2):
    """Score merging argument clusters ``arg1`` and ``arg2`` of ``clust``.

    The score starts at ``-ParseParams.priorMerge`` and is adjusted with
    ``xlogx`` terms over part counts, argument counts, per-part argument
    number counts, argument-type counts and child-cluster counts of the two
    argument clusters, as they would look after the merge.

    Parameters:
        clust: cluster owning both argument clusters (``_ttlCnt``,
            ``_argClusts``).
        arg1:  index of the first argument cluster.
        arg2:  index of the second argument cluster.

    Returns:
        The numeric merge score.

    Raises:
        Exception: if either argument cluster records a zero count for some
            argument number.
    """
    score = 0
    score -= ParseParams.priorMerge

    total_part_cnt = clust._ttlCnt
    arg_clust1 = clust._argClusts[arg1]
    arg_clust2 = clust._argClusts[arg2]
    part_ids1 = arg_clust1._partRootTreeNodeIds
    part_ids2 = arg_clust2._partRootTreeNodeIds
    total_part_count1 = len(part_ids1)
    total_part_count2 = len(part_ids2)
    total_arg_count1 = arg_clust1._ttlArgCnt
    total_arg_count2 = arg_clust2._ttlArgCnt

    score -= (xlogx(total_part_cnt - total_part_count1)
              + xlogx(total_part_cnt - total_part_count2))
    score += xlogx(total_part_cnt)
    score -= (2 * Scorer.updateScore(total_arg_count1, total_arg_count2))

    # Remove the old argument-number terms and start the merged histogram
    # as the sum of both clusters' histograms.
    argNum_newCnt = dict()
    for arg_num_cnt in (arg_clust1._argNum_cnt, arg_clust2._argNum_cnt):
        for arg_num, count in arg_num_cnt.items():
            if count == 0:
                print("Zero arguments of type {}".format(arg_num))
                raise Exception(
                    "Zero arguments of type {}".format(arg_num))
            score -= xlogx(count)
            argNum_newCnt = inc_key(argNum_newCnt, arg_num, inc=count)

    # Parts that appear in both argument clusters collapse into one entry
    # after the merge.  A sorted set intersection visits them in ascending
    # id order (the order the previous hand-written merge walk produced for
    # sorted inputs) without requiring the inputs to be sorted, and it copes
    # with an empty argument cluster, where the initial next() calls used to
    # raise StopIteration.
    comb_part_cnt = total_part_count1 + total_part_count2
    for pid in sorted(set(part_ids1).intersection(part_ids2)):
        shared_part = Part.getPartByRootNodeId(pid)
        cnt1 = len(shared_part._argClustIdx_argIdxs[arg1])
        cnt2 = len(shared_part._argClustIdx_argIdxs[arg2])
        comb_part_cnt -= 1
        # One part with cnt1 + cnt2 arguments replaces two separate entries.
        argNum_newCnt = inc_key(argNum_newCnt, cnt1 + cnt2)
        argNum_newCnt = dec_key(argNum_newCnt, cnt1, remove=True)
        argNum_newCnt = dec_key(argNum_newCnt, cnt2, remove=True)

    score += xlogx(total_part_cnt - comb_part_cnt)

    for count in argNum_newCnt.values():
        score += xlogx(count)

    # Each histogram entry removed by the merge is credited with
    # priorNumParam.
    score += ((len(arg_clust1._argNum_cnt)
               + len(arg_clust2._argNum_cnt)
               - len(argNum_newCnt))
              * ParseParams.priorNumParam)

    score = Scorer.update_score_from_ds(score,
                                        arg_clust1._argTypeIdx_cnt,
                                        arg_clust2._argTypeIdx_cnt)
    score = Scorer.update_score_from_ds(score,
                                        arg_clust1._chdClustIdx_cnt,
                                        arg_clust2._chdClustIdx_cnt)

    return score
def scoreOpCompose(self, rcidx, acidx):
    """Score composing child cluster ``acidx`` into parent cluster ``rcidx``.

    For every (parent part, child part) pair recorded for the two clusters,
    the counts that would be removed from the parent cluster ("r..." names),
    removed from the child cluster ("a..." names) and added to the combined
    cluster ("ra..." / "...New..." names) are tallied.  The score is then
    assembled from ``xlogx`` terms over the old and new counts together with
    ``ParseParams.priorNumParam`` adjustments.

    Parameters:
        rcidx: index of the parent cluster.
        acidx: index of the child (argument) cluster.

    Returns:
        The numeric score, or -10000 when no parent-child part pairs are
        recorded for the cluster pair.
    """
    def update_score_from_dict(scr, d, orig_d):
        # For each key in d: remove the term for the original count and add
        # the term for the remaining count, or priorNumParam when nothing
        # remains.
        for key, cnt in d.items():
            origcnt = orig_d[key]
            # assert origcnt >= cnt
            scr -= xlogx(origcnt)
            if cnt > 0:
                scr += xlogx(cnt)
            else:
                scr += ParseParams.priorNumParam
        return scr
    # get parent and child root-node id numbers
    parChdNids = Part.getPairPartRootNodeIds(rcidx, acidx)
    if parChdNids is None:
        return -10000
    score = 0
    rcl = Clust.getClust(rcidx)
    acl = Clust.getClust(acidx)
    # Parent count, child count, and count of times they occur
    # together.
    rtc_new = rcl._ttlCnt
    atc_new = acl._ttlCnt
    ratc_new = 0
    raRootCnt = 0
    # Naming: r = parent cluster, a = child cluster; dicts without "New"
    # hold the counts left in the original clusters, dicts with "New" hold
    # the counts for the composed parts.
    parArg_cnt = dict()
    rRelTypeIdx_newcnt = dict()
    aRelTypeIdx_newcnt = dict()
    raRelTypeIdx_newcnt = dict()
    rArgClustIdx_argNum_cnt = dict()
    aArgClustIdx_argNum_cnt = dict()
    rNewArgClustIdx_argNum_cnt = dict()
    aNewArgClustIdx_argNum_cnt = dict()
    rArgClustIdx_argTypeIdx_cnt = dict()
    aArgClustIdx_argTypeIdx_cnt = dict()
    rNewArgClustIdx_argTypeIdx_cnt = dict()
    aNewArgClustIdx_argTypeIdx_cnt = dict()
    rArgClustIdx_chdClustIdx_cnt = dict()
    aArgClustIdx_chdClustIdx_cnt = dict()
    rNewArgClustIdx_chdClustIdx_cnt = dict()
    aNewArgClustIdx_chdClustIdx_cnt = dict()
    rArgClustIdx_partCnt = dict()
    aArgClustIdx_partCnt = dict()
    rNewArgClustIdx_partCnt = dict()
    aNewArgClustIdx_partCnt = dict()
    rArgClustIdx_argCnt = dict()
    aArgClustIdx_argCnt = dict()
    rNewArgClustIdx_argCnt = dict()
    aNewArgClustIdx_argCnt = dict()
    # For each parent-child pair:
    for pcnid in parChdNids:
        pp, cp = Part.getPartByRootNodeId(pcnid[0]), Part.getPartByRootNodeId(pcnid[1])
        rtc_new -= 1
        atc_new -= 1
        ratc_new += 1
        rrt = pp.getRelTypeIdx()
        art = cp.getRelTypeIdx()
        # Argument cluster (on the parent) through which the child is
        # attached.
        raArgClustidx = pp.getArgClust(cp._parArgIdx)
        # Decrement individual relType counts and increment the combined
        # relType count for this pair
        rRelTypeIdx_newcnt = dec_key(rRelTypeIdx_newcnt, rrt,
                                     base=rcl._relTypeIdx_cnt[rrt])
        aRelTypeIdx_newcnt = dec_key(aRelTypeIdx_newcnt, art,
                                     base=acl._relTypeIdx_cnt[art])
        raRelTypeIdx_newcnt = inc_key(raRelTypeIdx_newcnt, (rrt, art))
        pp_par = pp.getParPart()
        # If the parent has a parent, increment the parArg count, otherwise
        # increment the root count.
        if pp_par is not None:
            ai = pp.getParArgIdx()
            ppi = pp_par.getClustIdx()
            aci = pp_par.getArgClust(ai)
            parArg_cnt = inc_key(parArg_cnt, (ppi, aci))
        else:
            raRootCnt += 1
        # For each argClust on the parent part, decrement the old parent
        # part count and argClust count, and increment the new ones. The
        # trick is to not copy/increment the counts for the argument shared
        # by this pair.
        for arg_ci in pp._argClustIdx_argIdxs:
            an = len(pp._argClustIdx_argIdxs[arg_ci])
            ac = rcl._argClusts[arg_ci]
            rArgClustIdx_partCnt = dec_key(rArgClustIdx_partCnt, arg_ci,
                                           base=len(ac._partRootTreeNodeIds))
            if arg_ci not in rArgClustIdx_argNum_cnt:
                rArgClustIdx_argNum_cnt[arg_ci] = {}
            rArgClustIdx_argNum_cnt[arg_ci] = \
                dec_key(rArgClustIdx_argNum_cnt[arg_ci], an,
                        base=ac._argNum_cnt[an])
            newArgNum = an
            if arg_ci == raArgClustidx:
                # The absorbed child no longer counts as an argument.
                newArgNum -= 1
            if newArgNum == 0:
                continue
            if arg_ci not in rNewArgClustIdx_argNum_cnt:
                rNewArgClustIdx_argNum_cnt[arg_ci] = {}
            rNewArgClustIdx_argNum_cnt[arg_ci] = \
                inc_key(rNewArgClustIdx_argNum_cnt[arg_ci], newArgNum)
            rNewArgClustIdx_partCnt = inc_key(rNewArgClustIdx_partCnt, arg_ci)
        # Same as above, but for child part, and we don't skip anything.
        for arg_ci in cp._argClustIdx_argIdxs:
            an = len(cp._argClustIdx_argIdxs[arg_ci])
            ac = acl._argClusts[arg_ci]
            aArgClustIdx_partCnt = dec_key(aArgClustIdx_partCnt, arg_ci,
                                           base=len(ac._partRootTreeNodeIds))
            if arg_ci not in aArgClustIdx_argNum_cnt:
                aArgClustIdx_argNum_cnt[arg_ci] = {}
            aArgClustIdx_argNum_cnt[arg_ci] = \
                dec_key(aArgClustIdx_argNum_cnt[arg_ci], an,
                        base=ac._argNum_cnt[an])
            if arg_ci not in aNewArgClustIdx_argNum_cnt:
                aNewArgClustIdx_argNum_cnt[arg_ci] = {}
            aNewArgClustIdx_argNum_cnt[arg_ci] = \
                inc_key(aNewArgClustIdx_argNum_cnt[arg_ci], an)
            aNewArgClustIdx_partCnt = inc_key(aNewArgClustIdx_partCnt, arg_ci)
        args = pp.getArguments()
        # For all the parent's arguments
        for ai, arg in args.items():
            arg_part = arg._argPart
            child_clust_id = arg_part._clustIdx
            aci = pp.getArgClust(ai)
            ac = rcl._argClusts[aci]
            ati = arg._path.getArgType()
            # Drop the old arguments
            rArgClustIdx_argCnt = dec_key(rArgClustIdx_argCnt, aci,
                                          base=ac._ttlArgCnt)
            if aci not in rArgClustIdx_argTypeIdx_cnt:
                rArgClustIdx_argTypeIdx_cnt[aci] = {}
            rArgClustIdx_argTypeIdx_cnt[aci] = \
                dec_key(rArgClustIdx_argTypeIdx_cnt[aci], ati,
                        base=ac._argTypeIdx_cnt[ati])
            if aci not in rArgClustIdx_chdClustIdx_cnt:
                rArgClustIdx_chdClustIdx_cnt[aci] = {}
            rArgClustIdx_chdClustIdx_cnt[aci] = \
                dec_key(rArgClustIdx_chdClustIdx_cnt[aci], child_clust_id,
                        base=ac._chdClustIdx_cnt[child_clust_id])
            # Add the new arguments, except for the child part we're possibly
            # absorbing
            if arg_part.getRelTreeRoot().getId() != cp.getRelTreeRoot().getId():
                rNewArgClustIdx_argCnt = inc_key(rNewArgClustIdx_argCnt, aci)
                if aci not in rNewArgClustIdx_argTypeIdx_cnt:
                    rNewArgClustIdx_argTypeIdx_cnt[aci] = {}
                rNewArgClustIdx_argTypeIdx_cnt[aci] = \
                    inc_key(rNewArgClustIdx_argTypeIdx_cnt[aci], ati)
                if aci not in rNewArgClustIdx_chdClustIdx_cnt:
                    rNewArgClustIdx_chdClustIdx_cnt[aci] = {}
                rNewArgClustIdx_chdClustIdx_cnt[aci] = \
                    inc_key(rNewArgClustIdx_chdClustIdx_cnt[aci],
                            child_clust_id)
        args = cp.getArguments()
        # For all the child's arguments
        for ai, arg in args.items():
            ap = arg._argPart
            cci = ap._clustIdx
            aci = cp.getArgClust(ai)
            ac = acl._argClusts[aci]
            ati = arg._path.getArgType()
            # Drop the old arguments
            aArgClustIdx_argCnt = dec_key(aArgClustIdx_argCnt, aci,
                                          base=ac._ttlArgCnt)
            if aci not in aArgClustIdx_argTypeIdx_cnt:
                aArgClustIdx_argTypeIdx_cnt[aci] = {}
            aArgClustIdx_argTypeIdx_cnt[aci] = \
                dec_key(aArgClustIdx_argTypeIdx_cnt[aci], ati,
                        base=ac._argTypeIdx_cnt[ati])
            if aci not in aArgClustIdx_chdClustIdx_cnt:
                aArgClustIdx_chdClustIdx_cnt[aci] = dict()
            aArgClustIdx_chdClustIdx_cnt[aci] = \
                dec_key(aArgClustIdx_chdClustIdx_cnt[aci], cci,
                        base=ac._chdClustIdx_cnt[cci])
            # Add the new arguments
            aNewArgClustIdx_argCnt = inc_key(aNewArgClustIdx_argCnt, aci)
            if aci not in aNewArgClustIdx_argTypeIdx_cnt:
                aNewArgClustIdx_argTypeIdx_cnt[aci] = {}
            aNewArgClustIdx_argTypeIdx_cnt[aci] = \
                inc_key(aNewArgClustIdx_argTypeIdx_cnt[aci], ati)
            if aci not in aNewArgClustIdx_chdClustIdx_cnt:
                aNewArgClustIdx_chdClustIdx_cnt[aci] = {}
            aNewArgClustIdx_chdClustIdx_cnt[aci] = \
                inc_key(aNewArgClustIdx_chdClustIdx_cnt[aci], cci)
    # Root-count term: only applies when some, but not all, of the parent
    # cluster's root occurrences take part in the composition.
    if raRootCnt > 0:
        origRootCnt = Clust.clustIdx_rootCnt[rcidx]
        if origRootCnt > raRootCnt:
            score += xlogx(raRootCnt) \
                + xlogx(origRootCnt - raRootCnt) \
                - xlogx(origRootCnt)
            score -= ParseParams.priorNumParam
    # Relation-type terms for the parent cluster (old vs. remaining totals).
    denomor = xlogx(rcl._ttlCnt)
    denomnr = xlogx(rtc_new)
    score = update_score_from_dict(score, rRelTypeIdx_newcnt,
                                   rcl._relTypeIdx_cnt)
    score += denomor
    score -= denomnr
    # Relation-type terms for the child cluster.
    denomoa = xlogx(acl._ttlCnt)
    denomna = xlogx(atc_new)
    score = update_score_from_dict(score, aRelTypeIdx_newcnt,
                                   acl._relTypeIdx_cnt)
    score += denomoa
    score -= denomna
    # Relation-type terms for the combined (parent, child) relation types.
    for cnt in raRelTypeIdx_newcnt.values():
        score -= ParseParams.priorNumParam
        score += xlogx(cnt)
    denomra = xlogx(ratc_new)
    score -= denomra
    # Grandparent argument clusters: split their child-cluster count for
    # rcidx when only part of it moves to the composed cluster.
    for pi, cnt in parArg_cnt.items():
        pc = Clust.getClust(pi[0])
        ac = pc._argClusts[pi[1]]
        origcnt = ac._chdClustIdx_cnt[rcidx]
        if cnt == origcnt:
            continue
        score -= ParseParams.priorNumParam
        score += xlogx(cnt) + xlogx(origcnt-cnt) - xlogx(origcnt)
    # Argument clusters of the parent cluster: old terms out, remaining
    # terms in.
    for aci, ac in rcl._argClusts.items():
        origPartCnt = len(ac._partRootTreeNodeIds)
        score -= (xlogx(rcl._ttlCnt - origPartCnt) - denomor)
        if aci not in rArgClustIdx_partCnt:
            # Untouched by the composition: only the denominator changes.
            score += (xlogx(rtc_new - origPartCnt) - denomnr)
            continue
        if rArgClustIdx_partCnt[aci] > 0:
            score += (xlogx(rtc_new - rArgClustIdx_partCnt[aci]) - denomnr)
        score = update_score_from_dict(score, rArgClustIdx_argNum_cnt[aci],
                                       ac._argNum_cnt)
        score -= 2 * (xlogx(rArgClustIdx_argCnt[aci]) - xlogx(ac._ttlArgCnt))
        score = update_score_from_dict(score,
                                       rArgClustIdx_argTypeIdx_cnt[aci],
                                       ac._argTypeIdx_cnt)
        score = update_score_from_dict(score,
                                       rArgClustIdx_chdClustIdx_cnt[aci],
                                       ac._chdClustIdx_cnt)
    # line 570 in Scorer.java
    # Same treatment for the argument clusters of the child cluster.
    for aci, ac in acl._argClusts.items():
        origPartCnt = len(ac._partRootTreeNodeIds)
        score -= (xlogx(acl._ttlCnt - origPartCnt) - denomoa)
        if aci not in aArgClustIdx_partCnt:
            score += (xlogx(atc_new - origPartCnt) - denomna)
            continue
        if aArgClustIdx_partCnt[aci] > 0:
            score += (xlogx(atc_new - aArgClustIdx_partCnt[aci]) - denomna)
        score = update_score_from_dict(score, aArgClustIdx_argNum_cnt[aci],
                                       ac._argNum_cnt)
        score -= 2 * (xlogx(aArgClustIdx_argCnt[aci]) - xlogx(ac._ttlArgCnt))
        score = update_score_from_dict(score,
                                       aArgClustIdx_argTypeIdx_cnt[aci],
                                       ac._argTypeIdx_cnt)
        score = update_score_from_dict(score,
                                       aArgClustIdx_chdClustIdx_cnt[aci],
                                       ac._chdClustIdx_cnt)
    # New argument clusters of the composed cluster: part-count and
    # argument-number terms, first for those inherited from the parent, then
    # for those inherited from the child.
    for ds in [(rNewArgClustIdx_partCnt, rNewArgClustIdx_argNum_cnt),
               (aNewArgClustIdx_partCnt, aNewArgClustIdx_argNum_cnt)]:
        for aci, partCnt in ds[0].items():
            score += xlogx(ratc_new-partCnt) - denomra
            for idx, cnt in ds[1][aci].items():
                score += xlogx(cnt)
                score -= ParseParams.priorNumParam
    # New argument clusters: argument-count, argument-type and child-cluster
    # terms.
    for ds in [(rNewArgClustIdx_argCnt, rNewArgClustIdx_argTypeIdx_cnt,
                rNewArgClustIdx_chdClustIdx_cnt),
               (aNewArgClustIdx_argCnt, aNewArgClustIdx_argTypeIdx_cnt,
                aNewArgClustIdx_chdClustIdx_cnt)]:
        for aci, argCnt in ds[0].items():
            score -= 2 * xlogx(argCnt)
            for idx, cnt in ds[1][aci].items():
                score += xlogx(cnt)
                score -= ParseParams.priorNumParam
            for idx, cnt in ds[2][aci].items():
                score += xlogx(cnt)
                score -= ParseParams.priorNumParam
    return score
def reparse(self, aid, si):
    """Rebuild the parts of one sentence and greedily re-compose them.

    The existing parts of sentence ``si`` in article ``aid`` are removed
    from ``Part.rootTreeNodeId_part`` and remembered, new parts are created
    for every non-ignored token, arguments are set from the single root (when
    there is exactly one), and then the best-scoring part-level compose
    operation is executed repeatedly while its score is positive.  Finally
    cluster statistics are updated for the old and the new parts.

    Parameters:
        aid: key into ``id_article``.
        si:  index into the article's ``sentences``.

    Returns:
        None.
    """
    a = id_article[aid]
    sent = a.sentences[si]
    roots = sent.get_children(0)
    if roots is None:
        return None
    elif len(roots) == 0:
        return None
    else:
        # Detach the sentence's current parts, keeping them for the
        # statistics update at the end.
        old_nid_part = {}
        for ni in range(len(sent.get_tokens())):
            if Parse.isIgnore(sent, ni):
                continue
            nid = genTreeNodeID(aid, si, ni)
            np = Part.getPartByRootNodeId(nid)
            del Part.rootTreeNodeId_part[nid]
            old_nid_part[nid] = np
        # Create fresh parts for the same tokens.
        nid_part = {}
        for ni in range(len(sent.get_tokens())):
            if Parse.isIgnore(sent, ni):
                continue
            # NOTE(review): the part_from_node definition visible in this
            # file takes a fifth `tok` argument and returns None, so this
            # four-argument call and tuple unpacking look inconsistent with
            # it - confirm which variant is intended.
            part, clustIdx = Parse.part_from_node(aid, si, sent, ni)
            nid_part[genTreeNodeID(aid, si, ni)] = part
            part.setClust(clustIdx, clust_only=True)
        if len(roots) == 1:
            _, idx = next(iter(roots))
            nid = genTreeNodeID(aid, si, idx)
            np = Part.getPartByRootNodeId(nid)
            if np is not None:
                # NOTE(review): setArgs is defined in this file with a
                # leading `self` parameter; this bare call passes four
                # arguments - presumably should be self.setArgs(...), verify.
                setArgs(aid, si, sent, idx)
        # Greedy composition: keep executing the highest-scoring compose
        # while it improves the score.
        maxImp = 1
        while maxImp > 0:
            rp, ap = None, None
            maxImp = 0
            for prt in nid_part.values():
                for arg in prt.getArguments().values():
                    score = self.scorer.scoreOpComposePart(prt, arg)
                    if score > maxImp:
                        maxImp = score
                        rp, ap = prt, arg
            if maxImp <= 0:
                break
            self.executor.execComposePart(rp, ap)
            # NOTE(review): `ap` is a value from getArguments(); elsewhere in
            # this file the part of an argument is reached via
            # `arg._argPart` - confirm getRelTreeRoot() exists on it.
            del nid_part[ap.getRelTreeRoot().getId()]
        Clust.removePartAndUpdateStat(old_nid_part)
        Clust.updatePartStat(nid_part)
    return None
def createArgs(self, art_id, sent_id, sent, parent_id, done=None,
               verbose=False):
    '''
    For each token, get the TreeNode, Part, Cluster and (based on sentence
    dependencies) the children tokens. For each child token, use the
    dependency relationship to define a Path and then argument type and
    Argument defining the parent-child relationship. Then add/create an
    ArgClust before recursing on any grand-child tokens.

    Children without a Part, and children that already have a parent Part,
    are skipped; the latter also keeps the recursion from revisiting a
    token if the dependencies are malformed.

    Parameters:
        art_id:    article identifier used to build tree-node ids.
        sent_id:   sentence identifier used to build tree-node ids.
        sent:      sentence object providing ``get_children``.
        parent_id: index of the token whose children are attached.
        done:      unused; kept only so existing call signatures still work.
        verbose:   print a message for every skipped child.

    Returns:
        None.
    '''
    parent_node_id = genTreeNodeID(art_id, sent_id, parent_id)
    parent = TreeNode.getTreeNode(parent_node_id)
    parent_part = Part.getPartByRootNodeId(parent_node_id)
    parent_clust = Clust.getClust(parent_part.getClustIdx())
    children = sent.get_children(parent_id)

    if children is None:
        return None

    for relation, child_id in children:
        child_node_id = genTreeNodeID(art_id, sent_id, child_id)
        path = Path(relation)
        arg_type_id = path.getArgType()
        child_part = Part.getPartByRootNodeId(child_node_id)

        if child_part is None:
            if verbose:
                print("Child node id {} has no part".format(
                    child_node_id))
            # Without this the next check dereferenced None and raised
            # AttributeError.
            continue

        if child_part.getParPart() is not None:
            if verbose:
                print("Child node id {} already has "
                      "parent {}".format(
                          child_node_id,
                          child_part.getParPart().getRelTreeRoot().
                          getId()))
            continue

        arg = Argument(parent, path, child_part)
        arg_id = parent_part.addArgument(arg)
        child_part.setParent(parent_part, arg_id)

        arg_clust_ids = parent_clust.getArgClustIdxs(arg_type_id)
        if arg_clust_ids is None:
            arg_clust_id = parent_clust.createArgClust(arg_type_id)
        else:
            arg_clust_id = next(iter(arg_clust_ids))
        parent_part.setArgClust(arg_id, arg_clust_id)

        # Propagate verbose so diagnostics also cover deeper levels.
        self.createArgs(art_id, sent_id, sent, child_id, verbose=verbose)

    return None
def updateAgendaAfterExecMC(self, op, newClustIdx, verbose=False):
    """Refresh the agenda after a merge-clusters operation was executed.

    Every operation still queued for the cluster that was merged away is
    removed, rewritten to refer to ``newClustIdx`` and re-queued for
    scoring (merge operations that would merge the cluster with itself are
    dropped).  The old cluster's agenda entry is deleted, and
    ``addAgendaAfterMergeClust`` is then called for every (new-cluster part,
    old-cluster part) combination.

    Parameters:
        op:          the executed operation; must be a merge-clusters op.
        newClustIdx: index of the cluster that survived the merge.
        verbose:     print the number of part combinations considered.

    Returns:
        None.
    """
    assert op._op == SearchOp.OP_MERGE_CLUST

    # Whichever of the op's two clusters is not the survivor was merged away.
    if op._clustIdx2 == newClustIdx:
        oldClustIdx = op._clustIdx1
    else:
        oldClustIdx = op._clustIdx2

    def redirect(clust_idx):
        # Point references to the merged-away cluster at the survivor.
        return newClustIdx if clust_idx == oldClustIdx else clust_idx

    # Drain the old cluster's agenda one operation at a time.
    while len(self._clustIdx_agenda[oldClustIdx]) > 0:
        pending = next(iter(self._clustIdx_agenda[oldClustIdx]))
        self.removeAgenda(pending)

        if pending._op == SearchOp.OP_MERGE_CLUST:
            first = redirect(pending._clustIdx1)
            second = redirect(pending._clustIdx2)
            if first == second:
                # Would merge the surviving cluster with itself.
                continue
            pending._clustIdx1 = min((first, second))
            pending._clustIdx2 = max((first, second))
            pending.genString()
            self.addAgendaToScore(pending)
        elif pending._op == SearchOp.OP_COMPOSE:
            parent_idx = redirect(pending._parClustIdx)
            child_idx = redirect(pending._chdClustIdx)
            pending._parClustIdx = parent_idx
            pending._chdClustIdx = child_idx
            pending.genString()
            self.addAgendaToScore(pending)

    del self._clustIdx_agenda[oldClustIdx]

    num_parts_old = len(Part.getClustPartRootNodeIds()[oldClustIdx])
    num_parts_new = len(Part.getClustPartRootNodeIds()[newClustIdx])
    if verbose:
        print("Updating agenda: {} possible operations.".format(
            num_parts_new * (num_parts_old)))

    for new_root_id in Part.getClustPartRootNodeIds()[newClustIdx]:
        new_side = Part.getPartByRootNodeId(new_root_id)
        for old_root_id in Part.getClustPartRootNodeIds()[oldClustIdx]:
            old_side = Part.getPartByRootNodeId(old_root_id)
            self.addAgendaAfterMergeClust(new_side, old_side)

    return None