def intensional_proposal(best, A, B):
    """Draft an alignment from the tipward best matches.

    The tipward matches are indexed by their target node.  For each
    target y with a canonical match back : y -> x0, the number of matches
    arriving at y and at x0 decides what is proclaimed into the draft:
    an equality, a split/lump family of '<' articulations, or nothing.

    Returns the draft dict populated through art.proclaim and
    art.half_proclaim.
    """
    # NOTE(review): the nesting below was reconstructed from a source whose
    # line breaks and indentation were lost -- confirm, in particular, that
    # the bridge proclamation and the report belong after the sibling loop.
    matches = tipward(best, A, B)
    incoming = index_by_target(matches)
    result = {}

    def proclaim_equal(ar):
        # Record `ar` in the draft with its relation forced to '='.
        art.proclaim(result, art.set_relation(ar, rel.eq))

    # Suppose node x comes from the A checklist, and there is a split
    # such that x matches multiple nodes y1, y2 in the B checklist.
    # Modify the relation for all approximate-match nodes.
    for y in incoming:
        arts = incoming[y]              # many x's, one y
        # Canonical.  back.cod will be among the incoming, by construction.
        back = matches.get(y)           # back : y -> x
        if not back:
            continue
        x0 = back.cod                   # back match y -> x -> y
        revarts = incoming[x0]
        several_xs = len(arts) > 1
        several_ys = len(revarts) > 1

        if several_xs and several_ys:
            # N->M tangle: settle for equality on the canonical match.
            proclaim_equal(back)
            dribble.log("** Tangle:\n %s\n %s" %
                        ("\n ".join(map(art.express, arts)),
                         ("\n ".join(map(art.express, revarts)))))
            proclaim_equal(back)
        elif several_xs:
            # OK.  We're going to just throw away all non-sibling matches.
            rent = cl.get_parent(x0)
            # e.g. ar: x2 -> y
            sibs = [ar for ar in arts if cl.get_parent(ar.dom) == rent]
            if len(sibs) == 1:
                proclaim_equal(back)
            else:
                for sib in sibs:
                    ar = art.change_relation(sib, rel.lt, "merge", "split")
                    if sib.dom == x0:
                        art.proclaim(result, ar)        # gt
                    else:
                        art.half_proclaim(result, ar)
                art.half_proclaim(
                    result,
                    art.bridge(y, rent, rel.lt, "split", "merge"))
                # Report!
                sib_names = " + ".join(cl.get_unique(e.dom) for e in sibs)
                dribble.log("# Split/lump %s < %s < %s" %
                            (sib_names,
                             cl.get_unique(y),
                             cl.get_unique(rent)))
        elif several_ys:
            # Multiple y's: nothing is proclaimed from this side.
            pass
        else:
            # n.b. arts[0] is reverse of back
            proclaim_equal(back)
    return result
def find_changed_subtrees(roots, children, all_props):
    """Map merged nodes to True when something beneath them has changed.

    roots     -- iterable of merged nodes, each an (x, y) pair
    children  -- dict from merged node to a list of its child merged nodes
    all_props -- passed through to changes.differences

    A node counts as changed when either half of the pair is missing or
    when changes.same rejects the comparison of the two halves.  Interior
    nodes are recorded only when a descendant changed; a root is also
    recorded when the root itself changed.
    """
    any_descendant_differs = {}

    def process(node):
        (x, y) = node
        if x and y:
            comparison = changes.differences(x, y, all_props)
            node_changed = not changes.same(comparison)
        else:
            node_changed = True
        # Visit every child (no short-circuit) so all subtrees get recorded.
        child_flags = [bool(process(kid)) for kid in children.get(node, [])]
        descendant_changed = any(child_flags)
        if descendant_changed:
            any_descendant_differs[node] = True
        return descendant_changed or node_changed

    for root in roots:
        changed = process(root)
        if changed:
            any_descendant_differs[root] = changed

    dribble.log("# %s nodes in merge have some change in their descendants" %
                (len(any_descendant_differs)))
    return any_descendant_differs
def process(here, there):
    """Record a best intensional match for each accepted node of `here`.

    Nodes that are not accepted, or that already have an entry in the
    enclosing `best` map, are skipped.  A match that is found is stored
    with art.half_proclaim.
    """
    for node in here.get_all_nodes():
        if not cl.is_accepted(node) or node in best:
            continue
        ar = best_intensional_match(node, there)
        if dribble.watch(node):
            dribble.log("# Best: %s" % art.express(ar))
        if not ar:
            continue
        assert ar.dom == node
        assert cl.is_accepted(ar.cod)
        art.half_proclaim(best, ar)
def choose_best_match(arts):    # => art
    """Return the single best match among `arts`, or None.

    None is returned both when `arts` is empty and when skimming leaves
    more than one equally good candidate (the tie is logged).
    """
    assert is_matches(arts)
    if len(arts) == 0:
        return None
    finalists = skim_best_matches(arts)
    winner = finalists[0]
    if len(finalists) != 1:
        dribble.log("** Multiple least-bad matches. Need to find tie-breakers.")
        dribble.log(" %s -> %s" %
                    (cl.get_unique(winner.dom),
                     [cl.get_unique(a.cod) for a in finalists]))
        return None
    return winner
def express_proof(proof):
    """Log and return a textual rendering of a non-refinement proof.

    proof -- a triple (c, d, e); the rendering is produced only when all
             three members are truthy.

    Returns the formatted proof expression, or None when the proof is
    incomplete.  Reads `x`, `y` and `yk` from the enclosing scope.
    """
    (c, d, e) = proof
    # Assume resolution (x < y) until conflict is proven;
    # assume potential child until proven otherwise.
    if not (c and d and e):
        # Return None explicitly rather than reaching a `return` of a
        # name that was never bound.
        return None
    proof_expression = (">< %s [%s, %s, %s]" %
                        (cl.get_unique(yk),
                         cl.get_unique(c),
                         cl.get_unique(d),
                         cl.get_unique(e)))
    dribble.log(
        "** %s doesn't refine %s because\n %s\n yk [in x, in both, in yk]" %
        (cl.get_unique(x), cl.get_unique(y), proof_expression))
    # Should squirrel away the proof somewhere!
    return proof_expression
def align(B, A):
    """Align checklist B with checklist A.

    Returns a pair (alignment, xmrcas): the proposed alignment together
    with the cross-mrca map produced by infer_partners.
    """
    # Precompute all best matches.
    best_matches = intension.best_intensional_match_map(B, A)

    # Extensional analysis yields <= relationships between hierarchies
    # (written as the 'matches' relation ~).
    xmrcas = infer_partners(best_matches, A, B)
    xmrca_count = len(xmrcas)
    dribble.log("# Number of cross-mrcas: %s" % xmrca_count)

    # Turn tipward best matches into = or < articulations as appropriate.
    draft = intension.intensional_proposal(best_matches, A, B)

    # Add extensional matches to a draft that already has intensional ones.
    return (propose_alignment(draft, best_matches, xmrcas), xmrcas)
def write_report(A, B, al, xmrcas, format, outpath):
    """Write the alignment `al` to `outpath` in the requested format.

    format -- "eulerx" dumps via eulerx.dump_alignment; "diff" writes a
              diff set keyed by an id property of A; anything else
              produces the merged-checklist report followed by a
              collision report.
    """
    # NOTE(review): structure reconstructed from a whitespace-collapsed
    # source; report_on_collisions is assumed to belong to the default
    # (report) branch only -- confirm.
    if format == "eulerx":
        eulerx.dump_alignment(al, outpath)
        return

    if format == "diff":
        # The last candidate property present in A wins (there is no break).
        candidates = [cl.eol_page_id, cl.ncbi_id, cl.gbif_id]
        present = [prop for prop in candidates if prop in A.properties]
        keyprop = present[-1] if present else None
        diff.write_diff_set(A, B, al, keyprop, outpath)
        return

    with open(outpath, "w") as outfile:
        (parents, roots) = merge.merge_checklists(A, B, al)
        dribble.log("Merged. %s roots in merge, %s nodes with parents" %
                    (len(roots), len(parents)))
        report(A, B, al, roots, parents, outfile)
    report_on_collisions(A, B, al)
def align(B, A):
    """Align checklist B with checklist A.

    Returns a pair (alignment, cross_mrcas): the assembled alignment and
    the cross-mrca map computed along the way.
    """
    # Precompute all best matches.
    best_matches = intension.best_intensional_match_map(B, A)

    # Turn tipward best matches into = or < articulations as appropriate.
    tipward_matches = tipward(best_matches, A, B)
    tipwards = intension.intensional_alignment(tipward_matches)

    # Extensional analysis.
    cross_mrcas = analyze_cross_mrcas(B, A, tipwards)
    dribble.log("# Number of cross-mrcas: %s" % len(cross_mrcas))
    ext_map = extensional_match_map(A, B, tipwards, cross_mrcas)
    dribble.log("# Number of extensional relationships: %s" % len(ext_map))

    # Add extensional matches to a draft that already has intensional ones.
    return (assemble_alignment(tipwards, best_matches, ext_map), cross_mrcas)
def best_intensional_match_map(A, B):
    """Compute the best intensional match for accepted nodes, both ways.

    Nodes of A are matched against B first, then nodes of B against A;
    a node that already has an entry from the first pass is not
    reconsidered.  Returns the resulting dict.
    """
    best = {}
    for (here, there) in ((A, B), (B, A)):
        for node in here.get_all_nodes():
            if not cl.is_accepted(node) or node in best:
                continue
            ar = best_intensional_match(node, there)
            if dribble.watch(node):
                dribble.log("# Best: %s" % art.express(ar))
            if not ar:
                continue
            assert ar.dom == node
            assert cl.is_accepted(ar.cod)
            art.half_proclaim(best, ar)
    dribble.log("%s best matches" % len(best))
    return best
def process(node):
    """Place `node` and its descendants into the merged hierarchy.

    The node is injected into the merge; if the merged node has no parent
    entry yet, either its merged parent is recorded in `parents` or, when
    there is none, the merged node is appended to `roots`.  `al`,
    `parents` and `roots` come from the enclosing scope.
    """
    merged = inject(node, al)
    if merged not in parents:
        p = merged_parent(merged, al)
        watching = dribble.watch(node)
        if p:
            if watching:
                (x, y) = p
                dribble.log("# Merged parent(%s) = (%s, %s)" %
                            (cl.get_unique(node),
                             cl.get_unique(x),
                             cl.get_unique(y)))
            parents[merged] = p
        else:
            # No merged parent: it's a root.
            if watching:
                dribble.log("# No merge(%s)" % cl.get_unique(node))
            if merged not in roots:
                roots.append(merged)
    for child in cl.get_children(node):
        process(child)
def subinfer_partners(x, other):
    """Compute and record the cross-mrca articulation for `x`, recursively.

    The partner of x is the mrca (via cl.mrca) of the partners found for
    its children; when no child yields a partner, the mutual best match
    of x is used instead.  A resulting articulation is stored in the
    enclosing `xmrcas` map.

    x     -- node whose subtree is analyzed
    other -- passed down the recursion unchanged (not otherwise used here)

    Returns the articulation for x, or a falsy value when there is none.
    """
    y = None
    for child in cl.get_children(x):
        child_ar = subinfer_partners(child, other)    # an articulation
        if child_ar is None:
            continue
        child_y = child_ar.cod
        # None acts as the identity for mrca.
        y = child_y if y is None else cl.mrca(y, child_y)

    if y is not None:
        ar = art.extensional(x, y, rel.matches, "cross-mrca")
    else:
        ar = get_mutual(best, x)

    if ar:
        # The partner must live in the other checklist.
        assert cl.get_checklist(ar.cod) != cl.get_checklist(x)
        if dribble.watch(x):
            dribble.log("# Cross-mrca: %s" % (art.express(ar)))
        xmrcas[x] = ar
    return ar    # in B
def analyze_cross_mrcas(A, B, tipwards):
    """Compute cross-mrcas between checklists A and B, in both directions.

    tipwards -- dict from node to its tipward articulation; a node with an
                entry maps directly to that articulation's codomain.

    For a node without a tipward entry, the cross-mrca is the mrca
    (cl.mrca) of its children's cross-mrcas.  After both directions are
    done, a sanity pass asserts that a round trip lands back in the
    starting checklist and logs nodes whose partner has no return entry.

    Returns a dict from node to its cross-mrca node.
    """
    cross_mrcas = {}

    def half_analyze_cross_mrcas(checklist, other):
        def subanalyze_cross_mrcas(node, other):
            result = None
            probe = tipwards.get(node)
            if probe:
                # Could be: = < or >
                result = probe.cod
            else:
                children = cl.get_children(node)
                if children:
                    m = None    # None is the identity for mrca
                    for child in children:
                        m2 = subanalyze_cross_mrcas(child, other)
                        if m2 is not None:
                            m = m2 if m is None else cl.mrca(m, m2)
                    result = m
            if result:
                assert cl.get_checklist(result) != cl.get_checklist(node)
                if dribble.watch(node):
                    dribble.log("# Cross-mrca(%s) = %s" %
                                (cl.get_unique(node), cl.get_unique(result)))
                cross_mrcas[node] = result
            return result    # in B

        for root in cl.get_roots(checklist):
            subanalyze_cross_mrcas(root, other)

    half_analyze_cross_mrcas(A, B)
    half_analyze_cross_mrcas(B, A)

    # Sanity check
    for node in cross_mrcas:
        cross = cross_mrcas[node]
        probe = cross_mrcas.get(cross)
        if probe:
            assert cl.get_checklist(probe) == cl.get_checklist(node)
        else:
            dribble.log("# No return cross-MRCA for %s -> %s -> ..." %
                        (cl.get_unique(node), cl.get_unique(cross)))
    return cross_mrcas
def correct_children_mutexes(parent, parent_mutex):
    """Demote children whose mutex does not exceed their parent's.

    Each offending child is logged.  A container child is set to
    parent_mutex + 1 and its own children are then corrected recursively;
    any other offending child is set to parent_mutex + 10.
    """
    # NOTE(review): nesting reconstructed from a whitespace-collapsed
    # source; the demotion is assumed to apply to every offending child,
    # not only the "higher rank" case -- confirm.
    for child in get_children(parent):
        child_mutex = get_mutex(child)
        if not (child_mutex <= parent_mutex):
            continue

        if child_mutex == parent_mutex:
            complaint = ("# ** Child %s (%s) has same rank as parent %s" %
                         (get_unique(child),
                          get_nominal_rank(child),
                          get_unique(parent)))
        else:
            complaint = ("# ** Child %s (%s) is of higher rank than parent %s (%s)" %
                         (get_unique(child),
                          get_nominal_rank(child),
                          get_unique(parent),
                          get_nominal_rank(parent)))
        dribble.log(complaint)

        if is_container(child):
            demoted = parent_mutex + 1                  # demote!
            set_mutex(child, demoted)
            correct_children_mutexes(child, demoted)    # ?
        else:
            set_mutex(child, parent_mutex + 10)         # demote!
def report_on_collisions(A, B, al):
    """Log canonical names that denote different taxa in A and B.

    Only names carried by exactly one node in each checklist, with both
    nodes accepted, are considered.  A collision is reported when either
    node has an '=' articulation in `al` whose codomain is not the
    same-named node in the other checklist.
    """
    index = cl.index_by_value(A, canonical_name)
    for name in index:
        A_nodes = index[name]
        if len(A_nodes) != 1:
            continue
        B_nodes = cl.get_nodes_with_value(B, canonical_name, name)
        if not B_nodes or len(B_nodes) != 1:
            continue
        A_node = A_nodes[0]
        B_node = B_nodes[0]
        if not (cl.is_accepted(A_node) and cl.is_accepted(B_node)):
            continue

        ar1 = al.get(A_node)
        ar2 = al.get(B_node)
        ar1_bad = (ar1 and ar1.relation == rel.eq and ar1.cod != B_node)
        ar2_bad = (ar2 and ar2.relation == rel.eq and ar2.cod != A_node)
        if not (ar1_bad or ar2_bad):
            continue

        dribble.log(
            "# \"%s\" names different taxa in the two checklists" % name)
        for ar in (ar1, ar2):
            why = art.reason(ar) if ar else "-"
            dribble.log(" %s [%s]" % (art.express(ar), why))
def subanalyze_cross_mrcas(node, other):
    """Compute and record the cross-mrca of `node`, recursively.

    A node with an entry in the enclosing `tipwards` map takes that
    articulation's codomain.  Otherwise the result is the mrca (cl.mrca)
    of the cross-mrcas of its children.  A truthy result is stored in the
    enclosing `cross_mrcas` map.

    node  -- node whose subtree is analyzed
    other -- passed down the recursion unchanged (not otherwise used here)

    Returns the cross-mrca node, or None when there is none.
    """
    result = None
    probe = tipwards.get(node)
    if probe:
        # Could be: = < or >
        result = probe.cod
    else:
        children = cl.get_children(node)
        if children:
            m = None    # None is the identity for mrca
            for child in children:
                m2 = subanalyze_cross_mrcas(child, other)
                if m2 is not None:
                    m = m2 if m is None else cl.mrca(m, m2)
            result = m
    if result:
        # The partner must live in the other checklist.
        assert cl.get_checklist(result) != cl.get_checklist(node)
        if dribble.watch(node):
            dribble.log("# Cross-mrca(%s) = %s" %
                        (cl.get_unique(node), cl.get_unique(result)))
        cross_mrcas[node] = result
    return result    # in B
def validate(checklist):
    """Assert structural invariants on every node of `checklist`.

    A node with an accepted-taxon id is treated as a synonym: it must
    have no parent id, no raw children and no raw synonyms, and its
    accepted node must resolve.  Any other node is treated as accepted:
    its resolvable parent and all raw children must be accepted, and
    none of its raw synonyms may be.  Counts are logged at the end.
    """
    synonyms_seen = 0
    accepted_seen = 0
    for node in checklist.get_all_nodes():
        parent_id = get_value(node, parent_taxon_id)
        accepted_id = get_value(node, accepted_taxon_id)
        status = get_taxonomic_status(node)    # fetched but not checked
        if accepted_id:
            # assert is_synonym_status(status) - we actually don't know
            # It's a synonym.  No parent, children, or synonyms allowed.
            assert not parent_id
            assert len(get_raw_children(node)) == 0
            assert len(get_raw_synonyms(node)) == 0
            assert accepted_id
            target = get_raw_accepted(node)
            if not target:
                print("** %s (taxonID %s) has accepted id %s, which doesn't resolve" %
                      (get_unique(node), get_taxon_id(node), accepted_id))
            assert target
            synonyms_seen += 1
            continue

        # It's accepted.  Parent and children must all also be accepted,
        # and synonyms must not be.
        assert not accepted_id
        if parent_id:
            # N.b. parent of a root is simply an undefined id
            parent = get_raw_parent(node)
            if parent:
                assert is_accepted(parent)
        for child in get_raw_children(node):
            assert is_accepted(child)
        for syn in get_raw_synonyms(node):
            assert not is_accepted(syn)
        accepted_seen += 1

    dribble.log("# Validated %s accepted nodes, %s synonyms, total %s" %
                (accepted_seen, synonyms_seen, len(checklist.get_all_nodes())))
def main(c1, c1_tag, c2, c2_tag, out, format):
    """Read two checklists, align them, and write a report.

    c1, c2         -- checklist sources handed to cl.read_checklist
    c1_tag, c2_tag -- tags; each is suffixed with "." before being passed on
    out            -- report path; the log is written to out + ".log"
    format         -- report format, passed to write_report

    The dribble log file is installed for the duration of the run and is
    always uninstalled afterwards, even if alignment or reporting raises,
    so the dribble module is never left holding a closed file.
    """
    dribpath = out + ".log"
    with open(dribpath, "w") as dribfile:
        dribble.dribble_file = dribfile
        try:
            dribble.log("\nLogging to %s" % (dribpath, ))
            A = cl.read_checklist(c1, c1_tag + ".", "low-checklist")
            B = cl.read_checklist(c2, c2_tag + ".", "high-checklist")
            dribble.log("Node counts: %s %s" %
                        (len(A.get_all_nodes()), len(B.get_all_nodes())))
            # Map each B to a corresponding A
            dribble.log("Aligning ...")
            (al, xmrcas) = alignment.align(B, A)
            dribble.log(" ... finished aligning; %s articulations\n" % len(al))
            # Where do xmrcas come from?
            write_report(A, B, al, xmrcas, format, out)
        finally:
            # Detach the log before the file object is closed.
            dribble.dribble_file = None
def half_proclaim(draft, ar):
    """Store articulation `ar` in `draft` under its domain, if permitted.

    proclaimable(draft, ar) decides the outcome: MEH leaves the draft
    untouched, any other truthy verdict stores the articulation, and a
    falsy verdict is an error.

    Raises AssertionError when the articulation may not replace the
    existing entry.  The error is raised explicitly so that the check
    still fires when Python runs with assertions disabled (-O).
    """
    verdict = proclaimable(draft, ar)
    if verdict == MEH:
        if dribble.watch(ar.dom):
            dribble.log("# Meh: %s" % express(ar))
    elif verdict:
        if dribble.watch(ar.dom):
            dribble.log("# storing")
        draft[ar.dom] = ar
    else:
        complaint = ("** Not OK to replace %s\n with %s" %
                     (express(draft.get(ar.dom)), express(ar)))
        print(complaint)
        raise AssertionError(complaint)
    if dribble.watch(ar.dom):
        dribble.log("# Proclaim %s\n yields %s" %
                    (express(ar), express(draft.get(ar.dom))))
def filter(node):
    """Keep the match for `node` only if no descendant has one.

    Every child is visited.  If any descendant produced a match, the last
    such match is returned and nothing is kept for this node.  Otherwise,
    if the node has an entry in the enclosing `amap`, that articulation
    is stored in the enclosing `tw` map and returned.  Otherwise None.
    """
    debug = dribble.watch(node)

    found_match = None
    for child in cl.get_children(node):
        # Keep the most recent truthy result; still recurse into every child.
        found_match = filter(child) or found_match

    if found_match:
        # Some descendant is a particle
        if debug:
            dribble.log("# %s: descendant matches, not keeping: %s" %
                        (cl.get_unique(node), art.express(found_match)))
        return found_match

    if node in amap:
        ar = amap[node]
        tw[ar.dom] = ar
        if debug:
            dribble.log("# %s is a tipward match, keeping: %s" %
                        (cl.get_unique(node), art.express(ar)))
        return ar

    if debug:
        dribble.log("# %s is unmatched" % cl.get_unique(node))
    return None
def extensional_match(node, xmrcas):
    """Derive an extensional articulation for `node` from cross-mrcas.

    node   -- node to articulate
    xmrcas -- dict from node to its partner node in the other checklist

    The node's partner and the partner's own partner ('back') are looked
    up; how `node` relates to `back` (cl.how_related) selects the relation
    and the reason.  In the remaining ('refinement') case the partner's
    children are examined with cross_compare for evidence of a conflict.

    Returns the articulation, or None when the node has no partner or the
    round trip through the partner fails.
    """
    partner = xmrcas.get(node)    # node in other checklist; 'conode'
    if not partner:
        # Descendant of a particle
        if dribble.watch(node):
            dribble.log("# EM: %s is not tipward." % cl.get_unique(node))
        return None

    back = xmrcas.get(partner)    # 'bounce'
    if not back:
        # Not sure how this can happen but it does (NCBI vs. GBIF)
        dribble.log("%s <= %s <= nowhere" %
                    (cl.get_unique(node), cl.get_unique(partner)))
        if dribble.watch(node):
            dribble.log("# EM: %s killed because aborted round trip." %
                        cl.get_unique(node))
        return None

    # node <= partner <= back
    how = cl.how_related(node, back)    # Always rcc5
    if how == rel.eq:
        # Should end up being eq iff name match or unique match
        # Can test for unique match by looking at xmrca of parent
        # Could be part of a 'monotypic' chain; fix later
        how = rel.matches
        reason = "mutual-cross-mrca"
    elif how == rel.gt:
        how = rel.matches
        reason = "monotypic-inversion"
    elif how == rel.disjoint:
        reason = "particle-set-exclusion"
    else:
        # must be rel.lt
        # Assume resolution (node < partner) until conflict is proven
        reason = "refinement"
        # Look for an intersection between any partner-child and node
        # x is in A checklist, y is in B checklist
        for pchild in cl.get_children(partner):
            if xmrcas.get(pchild) is None:
                continue    # pchild ! node
            (d, e) = cross_compare(node, pchild, xmrcas)
            # d < node while e ! node
            if d and e:
                how = rel.conflict
                reason = ("%s is in; its sibling %s is not" %
                          (cl.get_unique(d), cl.get_unique(e)))
                dribble.log("** %s conflicts with %s because\n"
                            " %s ! %s\n (but sibling %s < %s)" %
                            (cl.get_unique(node),
                             cl.get_unique(partner),
                             cl.get_unique(e),
                             cl.get_unique(node),
                             cl.get_unique(d),
                             cl.get_unique(node)))
                break
            if e:
                reason = ("%s is not in it" % cl.get_unique(e))

    ar = art.extensional(node, partner, how, reason)
    if dribble.watch(node):
        dribble.log("# Extensional articulation %s" % art.express(ar))
    return ar
def by_name(name):
    """Look up a property by its pet name.

    Returns the entry from properties_by_pet_name, or the falsy lookup
    result (after logging) when there is no such property.  The name must
    not contain a tab character.
    """
    assert "\t" not in name
    found = properties_by_pet_name.get(name)
    if found:
        return found
    dribble.log("No such property: %s" % name)
    return found
def extensional(dom, cod, re, reason, revreason=None):
    """Create an extensional articulation from `dom` to `cod`.

    All arguments are handed to bridge() unchanged; the result is logged
    when `dom` is being watched, then returned.
    """
    ar = bridge(dom, cod, re, reason, revreason)
    if dribble.watch(dom):
        # Call express unqualified, like bridge above: this function sits
        # in the module that defines them, where the qualified
        # `art.express` spelling is presumably not bound.
        dribble.log("# Extensional articulation %s" % express(ar))
    return ar