예제 #1
0
def intensional_proposal(best, A, B):
    """Build a draft of articulations from the tipward best matches.

    Computes ``tipward(best, A, B)``, groups those matches by their target
    node with ``index_by_target``, and then, target by target, decides which
    articulations to record in the draft through ``art.proclaim`` and
    ``art.half_proclaim``.

    Args:
        best: best-match map; only passed through to ``tipward`` here.
        A: one checklist; only passed through to ``tipward`` here.
        B: the other checklist; only passed through to ``tipward`` here.

    Returns:
        The ``result`` dict handed to every ``art.proclaim`` /
        ``art.half_proclaim`` call made below.
    """
    matches = tipward(best, A, B)

    result = {}

    # Presumably maps each target node to the list of matches pointing at it
    # (it is iterated, indexed and len()'d that way below).
    incoming = index_by_target(matches)

    # Suppose `node` x comes from the A checklist, and there is a split
    # such that x matches multiple nodes y1, y2 in the B checklist.

    # Modify the relation for all approximate-match nodes.
    for y in incoming:  # Many x's, one y
        arts = incoming[y]

        # Canonical.  back.cod will be among the incoming, by construction.
        back = matches.get(y)  # back : y -> x
        if not back: continue  # y has no match of its own: record nothing
        x0 = back.cod  # Back match y -> x -> y

        # Plain indexing: a KeyError here would mean the "by construction"
        # assumption stated above does not hold.
        revarts = incoming[x0]

        if len(arts) > 1:  # multiple x's
            if len(revarts) > 1:
                # Several matches arrive at y AND several arrive at x0.
                # NOTE(review): the same proclaim is issued again right after
                # the log call; it looks redundant -- confirm that
                # art.proclaim is idempotent before removing either one.
                art.proclaim(result, art.set_relation(back, rel.eq))
                dribble.log("** Tangle:\n   %s\n   %s" %
                            ("\n   ".join(map(art.express, arts)),
                             ("\n   ".join(map(art.express, revarts)))))
                art.proclaim(result, art.set_relation(back, rel.eq))
            else:

                # OK.  We're going to just throw away all non-sibling matches.

                # Keep only the matches whose source shares x0's parent.
                rent = cl.get_parent(x0)
                sibs = [ar for ar in arts if cl.get_parent(ar.dom) == rent]
                # e.g. ar: x2 -> y
                # Don't even try to do anything with N->M node tangles.
                if len(sibs) == 1:
                    art.proclaim(result, art.set_relation(back, rel.eq))
                else:
                    # Note: this branch also runs when sibs is empty; then
                    # only the y -> rent bridge below is recorded.
                    for sib in sibs:
                        ar = art.change_relation(sib, rel.lt, "merge", "split")
                        if sib.dom == x0:
                            # The match from x0 itself is proclaimed; the
                            # other siblings are only half-proclaimed.
                            art.proclaim(result, ar)  # gt
                        else:
                            art.half_proclaim(result, ar)
                    # Also record y < rent (the siblings' shared parent).
                    art.half_proclaim(
                        result, art.bridge(y, rent, rel.lt, "split", "merge"))
                    # Report!
                    dribble.log(
                        "# Split/lump %s < %s < %s" %
                        (" + ".join(map(lambda e: cl.get_unique(e.dom), sibs)),
                         cl.get_unique(y), cl.get_unique(rent)))

        elif len(revarts) > 1:  # multiple y's
            # Deliberately nothing recorded for this y in this case.
            pass
        else:
            # n.b. arts[0] is reverse of back
            art.proclaim(result, art.set_relation(back, rel.eq))

    return result
예제 #2
0
def find_changed_subtrees(roots, children, all_props):
    """Flag the nodes of a merged tree that have a change beneath them.

    Each node is an ``(x, y)`` pair.  A node counts as changed when either
    side of the pair is falsy, or when ``changes.differences`` on the two
    sides is not ``changes.same``.

    Args:
        roots: iterable of root pairs to start from.
        children: dict from a pair to its child pairs (missing key means no
            children).
        all_props: passed through to ``changes.differences``.

    Returns:
        A dict whose keys are the pairs with at least one changed descendant,
        plus any root that is itself changed; values are ``True``.
    """
    flagged = {}

    def differs_here(pair):
        # A pair with a missing side is a change by definition.
        left, right = pair
        if not left or not right:
            return True
        return not changes.same(changes.differences(left, right, all_props))

    def walk(pair):
        changed_here = differs_here(pair)
        # Build the full list first so every subtree is visited: a lazy
        # any() over a generator would stop at the first changed child.
        below = [walk(kid) for kid in children.get(pair, [])]
        changed_below = any(below)
        if changed_below:
            flagged[pair] = True
        return changed_below or changed_here

    for root in roots:
        outcome = walk(root)
        if outcome:
            flagged[root] = outcome
    dribble.log("# %s nodes in merge have some change in their descendants" %
                (len(flagged)))
    return flagged
예제 #3
0
 def process(here, there):
     """Record a best match for each accepted, still-unmatched node of `here`.

     `best` is the draft from the enclosing scope; matches are looked up
     against the checklist `there`.
     """
     for candidate in here.get_all_nodes():
         # Skip non-accepted nodes and nodes that already have an entry.
         if not cl.is_accepted(candidate) or candidate in best:
             continue
         found = best_intensional_match(candidate, there)
         if dribble.watch(candidate):
             # Logged whether or not a match was found.
             dribble.log("# Best: %s" % art.express(found))
         if found:
             assert found.dom == candidate
             assert cl.is_accepted(found.cod)
             art.half_proclaim(best, found)
예제 #4
0
def choose_best_match(arts):  # => art
    """Pick the single best match out of `arts`, or None.

    Returns None when `arts` is empty, and also when
    ``skim_best_matches`` leaves more than one candidate (the tie is
    logged, not broken).
    """
    assert is_matches(arts)
    if len(arts) == 0:
        return None
    finalists = skim_best_matches(arts)
    leader = finalists[0]
    if len(finalists) != 1:
        # Unresolved tie: report the contenders and give up.
        dribble.log("** Multiple least-bad matches. Need to find tie-breakers.")
        contenders = [cl.get_unique(a.cod) for a in finalists]
        dribble.log("   %s -> %s" % (cl.get_unique(leader.dom), contenders))
        return None
    return leader
예제 #5
0
def express_proof(proof):
    """Format and log a refinement-conflict proof, if there is one.

    Args:
        proof: a 3-tuple ``(c, d, e)``.  A conflict is reported only when
            all three members are truthy.

    Returns:
        The formatted proof string when all of ``c``, ``d``, ``e`` are
        truthy, otherwise ``None``.  (Previously the no-conflict case hit
        an unbound local at the ``return`` and raised UnboundLocalError.)
    """
    (c, d, e) = proof
    # Assume resolution (x < y) until conflict is proven
    # assume potential child until proven otherwise
    proof_expression = None
    if c and d and e:
        # NOTE(review): `yk`, `x` and `y` are not parameters or locals of this
        # function; they must be supplied by an enclosing/module scope that
        # is not visible here -- confirm, otherwise this branch raises
        # NameError.
        proof_expression = (">< %s [%s, %s, %s]" %
                            (cl.get_unique(yk), cl.get_unique(c),
                             cl.get_unique(d), cl.get_unique(e)))
        dribble.log(
            "** %s doesn't refine %s because\n   %s\n   yk [in x, in both, in yk]"
            % (cl.get_unique(x), cl.get_unique(y), proof_expression))
    # Should squirrel away the proof somewhere!
    return proof_expression
예제 #6
0
def align(B, A):
  """Align two checklists and return ``(alignment, xmrcas)``.

  Runs, in order: the best intensional match map, ``infer_partners``
  (whose result is returned as the second tuple member), the intensional
  proposal, and finally ``propose_alignment`` combining all three.
  """
  # All best matches, computed once up front
  best_matches = intension.best_intensional_match_map(B, A)

  # Extensional analysis yields <= relationships between hierarchies
  # (written as the 'matches' relation ~)
  partners = infer_partners(best_matches, A, B)
  dribble.log("# Number of cross-mrcas: %s" % len(partners))

  # Tipward best matches become = or < articulations as appropriate
  draft = intension.intensional_proposal(best_matches, A, B)

  # Extensional matches are layered onto the intensional draft
  return (propose_alignment(draft, best_matches, partners), partners)
예제 #7
0
def write_report(A, B, al, xmrcas, format, outpath):
    """Write the alignment `al` to `outpath` in the requested `format`.

    Args:
        A, B: the two checklists.
        al: the alignment to report on.
        xmrcas: accepted but not used by this function.
        format: "eulerx", "diff", or anything else for the merged report.
        outpath: destination path.
    """
    if format == "eulerx":
        eulerx.dump_alignment(al, outpath)
        return

    if format == "diff":
        key = None
        # No break on purpose-or-accident: when A carries several of these
        # id properties, the last one in this list is the one used.
        for candidate in (cl.eol_page_id, cl.ncbi_id, cl.gbif_id):
            if candidate in A.properties:
                key = candidate
        diff.write_diff_set(A, B, al, key, outpath)
        return

    # Default: merge the checklists and write the merged report.
    with open(outpath, "w") as sink:
        merged = merge.merge_checklists(A, B, al)
        (parents, roots) = merged
        dribble.log("Merged.  %s roots in merge, %s nodes with parents" %
                    (len(roots), len(parents)))
        report(A, B, al, roots, parents, sink)
    report_on_collisions(A, B, al)
예제 #8
0
def align(B, A):
  """Align two checklists and return ``(alignment, cross_mrcas)``.

  Pipeline: best intensional matches -> tipward intensional alignment ->
  cross-mrca analysis -> extensional match map -> assembled alignment.
  """
  # All best matches, computed once up front
  best_matches = intension.best_intensional_match_map(B, A)

  # Tipward best matches become = or < articulations as appropriate
  tip_matches = tipward(best_matches, A, B)
  draft = intension.intensional_alignment(tip_matches)

  # Extensional analysis
  mrcas = analyze_cross_mrcas(B, A, draft)
  dribble.log("# Number of cross-mrcas: %s" % len(mrcas))
  extensional_map = extensional_match_map(A, B, draft, mrcas)
  dribble.log("# Number of extensional relationships: %s" %
              len(extensional_map))

  # Extensional matches are layered onto the intensional draft
  return (assemble_alignment(draft, best_matches, extensional_map), mrcas)
예제 #9
0
def best_intensional_match_map(A, B):
    """Collect the best intensional match for accepted nodes of A and B.

    Nodes of A are matched against B first, then nodes of B against A;
    a node that already has an entry in the draft is not revisited.

    Returns:
        The draft dict filled in through ``art.half_proclaim``.
    """
    best = {}

    def scan(source, target):
        for candidate in source.get_all_nodes():
            # Skip non-accepted nodes and nodes already in the draft.
            if not cl.is_accepted(candidate) or candidate in best:
                continue
            found = best_intensional_match(candidate, target)
            if dribble.watch(candidate):
                # Logged whether or not a match was found.
                dribble.log("# Best: %s" % art.express(found))
            if found:
                assert found.dom == candidate
                assert cl.is_accepted(found.cod)
                art.half_proclaim(best, found)

    for source, target in ((A, B), (B, A)):
        scan(source, target)
    dribble.log("%s best matches" % len(best))
    return best
예제 #10
0
 def process(node):
     """Place `node`'s merged counterpart under a parent or among the roots.

     Uses `al`, `parents` and `roots` from the enclosing scope, then
     recurses into every child of `node`.
     """
     merged = inject(node, al)
     if merged not in parents:
         up = merged_parent(merged, al)
         if not up:
             # No merged parent: this one is a root (recorded once only).
             if dribble.watch(node):
                 dribble.log("# No merge(%s)" % cl.get_unique(node))
             if merged not in roots:
                 roots.append(merged)
         else:
             if dribble.watch(node):
                 (up_x, up_y) = up
                 dribble.log("# Merged parent(%s) = (%s, %s)" %
                             (cl.get_unique(node), cl.get_unique(up_x),
                              cl.get_unique(up_y)))
             parents[merged] = up
     for kid in cl.get_children(node):
         process(kid)
예제 #11
0
 def subinfer_partners(x, other):
   """Compute and record the cross-mrca articulation for `x`, recursively.

   The partner of `x` is the ``cl.mrca`` of the partners (``.cod``) found
   for its children; when no child yields one, ``get_mutual(best, x)`` is
   used instead.  A truthy result is stored in `xmrcas` (enclosing scope).

   Args:
     x: node to process.
     other: passed unchanged to the recursive calls; not otherwise used.

   Returns:
     The articulation for `x`, or whatever falsy value ``get_mutual``
     returned when there is none.
   """
   y = None                  # None is the identity for the mrca fold
   for child in cl.get_children(x):
     child_ar = subinfer_partners(child, other) # an articulation
     if child_ar is not None:
       child_y = child_ar.cod
       y = child_y if y is None else cl.mrca(y, child_y)
   if y is not None:
     ar = art.extensional(x, y, rel.matches, "cross-mrca")
   else:
     ar = get_mutual(best, x)
   if ar:
     # The partner must live in the opposite checklist.
     assert cl.get_checklist(ar.cod) != cl.get_checklist(x)
     if dribble.watch(x):
       dribble.log("# Cross-mrca: %s" % (art.express(ar)))
     xmrcas[x] = ar
   return ar             # in B
예제 #12
0
def analyze_cross_mrcas(A, B, tipwards):
  """Map nodes of both checklists to a "cross-mrca" node in the other one.

  For a node with an entry in `tipwards`, the cross-mrca is that entry's
  ``.cod``.  Otherwise it is the ``cl.mrca`` of the cross-mrcas of its
  children, when any child has one.  Nodes with no result get no entry.

  Args:
    A: first checklist; its roots are walked first.
    B: second checklist; its roots are walked second.
    tipwards: mapping from node to an articulation (only ``.get`` and the
      articulation's ``.cod`` are used here).

  Returns:
    Dict from node to its cross-mrca node, covering both checklists.
  """
  cross_mrcas = {}
  def half_analyze_cross_mrcas(checklist, other):
    # `other` is threaded through the recursion but never read.
    def subanalyze_cross_mrcas(node, other):
      result = None
      probe = tipwards.get(node)
      if probe:
        # Could be: = < or >
        result = probe.cod
      else:
        children = cl.get_children(node)
        if children:
          m = None      # None is the identity for mrca
          for child in children:
            m2 = subanalyze_cross_mrcas(child, other)
            if m2 != None:
              m = cl.mrca(m, m2) if m != None else m2
          if m != None:
            result = m
      if result:
        # A cross-mrca must belong to the opposite checklist.
        assert cl.get_checklist(result) != cl.get_checklist(node)
        if dribble.watch(node):
          dribble.log("# Cross-mrca(%s) = %s" %
                      (cl.get_unique(node), cl.get_unique(result)))
        cross_mrcas[node] = result
      return result             # in B
    for root in cl.get_roots(checklist):
      subanalyze_cross_mrcas(root, other)
  half_analyze_cross_mrcas(A, B)
  half_analyze_cross_mrcas(B, A)

  # Sanity check: following a cross-mrca and then its own cross-mrca must
  # land back in the starting node's checklist; a missing return leg is
  # only logged.
  for node in cross_mrcas:
    cross = cross_mrcas[node]
    probe = cross_mrcas.get(cross)
    if probe:
      assert cl.get_checklist(probe) == cl.get_checklist(node)
    else:
      dribble.log("# No return cross-MRCA for %s -> %s -> ..." %\
                  (cl.get_unique(node), cl.get_unique(cross)))
  return cross_mrcas
예제 #13
0
def correct_children_mutexes(parent, parent_mutex):
  """Demote children whose mutex is not strictly greater than the parent's.

  Each offending child is logged.  A container child is set to
  ``parent_mutex + 1`` and its own children are then corrected
  recursively; any other child is set to ``parent_mutex + 10``.
  """
  for kid in get_children(parent):
    kid_mutex = get_mutex(kid)
    if not (kid_mutex <= parent_mutex):
      continue              # already ranked below the parent

    if kid_mutex == parent_mutex:
      complaint = ("# ** Child %s (%s) has same rank as parent %s" %
                   (get_unique(kid),
                    get_nominal_rank(kid),
                    get_unique(parent)))
    else:
      complaint = ("# ** Child %s (%s) is of higher rank than parent %s (%s)" %
                   (get_unique(kid),
                    get_nominal_rank(kid),
                    get_unique(parent),
                    get_nominal_rank(parent)))
    dribble.log(complaint)

    if not is_container(kid):
      set_mutex(kid, parent_mutex + 10)  # demote!
      continue
    demoted = parent_mutex + 1           # demote!
    set_mutex(kid, demoted)
    correct_children_mutexes(kid, demoted) # ?
예제 #14
0
def report_on_collisions(A, B, al):
    """Log canonical names that denote different taxa in A and B.

    Only names carried by exactly one node in each checklist are
    considered, and only when both nodes are accepted.  A collision is
    reported when either node has an ``rel.eq`` articulation in `al`
    whose codomain is not the other node.
    """
    name_index = cl.index_by_value(A, canonical_name)
    for name in name_index:
        in_A = name_index[name]
        if len(in_A) != 1:
            continue
        in_B = cl.get_nodes_with_value(B, canonical_name, name)
        if not in_B or len(in_B) != 1:
            continue
        node_a = in_A[0]
        node_b = in_B[0]
        if not (cl.is_accepted(node_a) and cl.is_accepted(node_b)):
            continue

        ar1 = al.get(node_a)
        ar2 = al.get(node_b)
        # Each side is "bad" when it is equated with something other than
        # its same-named counterpart.
        ar1_bad = (ar1 and ar1.relation == rel.eq
                   and ar1.cod != node_b)
        ar2_bad = (ar2 and ar2.relation == rel.eq
                   and ar2.cod != node_a)
        if not (ar1_bad or ar2_bad):
            continue

        dribble.log(
            "# \"%s\" names different taxa in the two checklists"
            % name)
        for ar in (ar1, ar2):
            dribble.log("  %s [%s]" %
                        (art.express(ar),
                         art.reason(ar) if ar else "-"))
예제 #15
0
 def subanalyze_cross_mrcas(node, other):
   """Compute, record and return the cross-mrca of `node` (or None).

   Uses `tipwards` and `cross_mrcas` from the enclosing scope.  `other` is
   only handed on to the recursive calls.
   """
   result = None
   probe = tipwards.get(node)
   if probe:
     # A tipward match settles it directly.
     # Could be: = < or >
     result = probe.cod
   else:
     # Otherwise fold the children's cross-mrcas together with cl.mrca.
     children = cl.get_children(node)
     if children:
       m = None      # None is the identity for mrca
       for child in children:
         m2 = subanalyze_cross_mrcas(child, other)
         if m2 != None:
           m = cl.mrca(m, m2) if m != None else m2
       if m != None:
         result = m
   if result:
     # A cross-mrca must belong to the opposite checklist.
     assert cl.get_checklist(result) != cl.get_checklist(node)
     if dribble.watch(node):
       dribble.log("# Cross-mrca(%s) = %s" %
                   (cl.get_unique(node), cl.get_unique(result)))
     cross_mrcas[node] = result
   return result             # in B
예제 #16
0
def validate(checklist):
  """Assert the synonym/accepted structure of every node in `checklist`.

  A node with an accepted-taxon id is treated as a synonym: it must have
  no parent id, no raw children, no raw synonyms, and its accepted node
  must resolve.  Any other node is treated as accepted: its resolved
  parent and all raw children must be accepted and its raw synonyms must
  not be.  Totals are logged at the end.
  """
  n_synonyms = 0
  n_accepted = 0
  for node in checklist.get_all_nodes():
    parent_id = get_value(node, parent_taxon_id)
    accepted_id = get_value(node, accepted_taxon_id)
    status = get_taxonomic_status(node)

    if not accepted_id:
      # It's accepted.  Parent and children must all
      # also be accepted, and synonyms must not be.
      assert not accepted_id
      if parent_id:
        # N.b. parent of a root is simply an undefined id
        resolved_parent = get_raw_parent(node)
        if resolved_parent:
          assert is_accepted(resolved_parent)
      for child in get_raw_children(node):
        assert is_accepted(child)
      for syn in get_raw_synonyms(node):
        assert not is_accepted(syn)
      n_accepted += 1
      continue

    # assert is_synonym_status(status)  - we actually don't know
    # It's a synonym.  No parent, children, or synonyms allowed.
    assert not parent_id
    assert len(get_raw_children(node)) == 0
    assert len(get_raw_synonyms(node)) == 0
    assert accepted_id
    target = get_raw_accepted(node)
    if not target:
      # Say which node is broken before the assertion fires.
      print("** %s (taxonID %s) has accepted id %s, which doesn't resolve" %
            (get_unique(node), get_taxon_id(node), accepted_id))
      assert target
    n_synonyms += 1

  dribble.log("# Validated %s accepted nodes, %s synonyms, total %s" %
              (n_accepted, n_synonyms, len(checklist.get_all_nodes())))
예제 #17
0
def main(c1, c1_tag, c2, c2_tag, out, format):
    """Read two checklists, align them, and write the report to `out`.

    Logging goes to ``out + ".log"`` for the duration of the run.

    Args:
        c1, c1_tag: source and tag of the first ("low") checklist; the tag
            gets a trailing "." before being passed to ``cl.read_checklist``.
        c2, c2_tag: likewise for the second ("high") checklist.
        out: report path; also the stem of the log path.
        format: report format, passed through to ``write_report``.
    """
    dribpath = out + ".log"
    with open(dribpath, "w") as dribfile:
        dribble.dribble_file = dribfile
        try:
            dribble.log("\nLogging to %s" % (dribpath, ))
            A = cl.read_checklist(c1, c1_tag + ".", "low-checklist")
            B = cl.read_checklist(c2, c2_tag + ".", "high-checklist")
            dribble.log("Node counts: %s %s" %
                        (len(A.get_all_nodes()), len(B.get_all_nodes())))
            # Map each B to a corresponding A
            dribble.log("Aligning ...")
            (al, xmrcas) = alignment.align(B, A)
            dribble.log("  ... finished aligning; %s articulations\n" % len(al))
            # Where do xmrcas come from?
            write_report(A, B, al, xmrcas, format, out)
        finally:
            # Reset even on failure, so dribble is never left holding a
            # file object that the `with` block is about to close.
            dribble.dribble_file = None
예제 #18
0
def half_proclaim(draft, ar):
    """Store `ar` in `draft` under ``ar.dom`` if ``proclaimable`` allows it.

    Outcomes, by the value of ``proclaimable(draft, ar)``:
      * ``MEH``: the draft is left untouched.
      * truthy: ``draft[ar.dom]`` is set to `ar`.
      * falsy: the clash is printed and AssertionError is raised.

    Raises:
        AssertionError: when `ar` may not replace the existing entry.
            Raised explicitly rather than via ``assert False`` so the check
            still fires when Python runs with ``-O``.
    """
    p = proclaimable(draft, ar)
    if p == MEH:
        if dribble.watch(ar.dom):
            dribble.log("# Meh: %s" % express(ar))
    elif p:
        if dribble.watch(ar.dom):
            dribble.log("# storing")
        draft[ar.dom] = ar
    else:
        print("** Not OK to replace %s\n   with %s" %
              (express(draft.get(ar.dom)), express(ar)))
        raise AssertionError(
            "half_proclaim: not OK to replace the existing articulation")

    if dribble.watch(ar.dom):
        dribble.log("# Proclaim %s\n  yields %s" %
                    (express(ar), express(draft.get(ar.dom))))
예제 #19
0
 def filter(node):
   """Keep `node`'s match only if nothing beneath it has a match.

   Recurses into every child first.  If any child returned a match, the
   last such match is returned and `node` itself is not kept.  Otherwise,
   if `node` is in `amap`, its articulation is stored in `tw` and
   returned.  Returns None for an unmatched node.  (`amap` and `tw` come
   from the enclosing scope.)
   """
   watching = dribble.watch(node)
   from_below = None
   for kid in cl.get_children(node):
     hit = filter(kid)
     if hit:
       from_below = hit
   if from_below:    # Some descendant is a particle
     if watching:
       dribble.log("# %s: descendant matches, not keeping: %s" %
                   (cl.get_unique(node), art.express(from_below)))
     return from_below
   if node not in amap:
     if watching:
       dribble.log("# %s is unmatched" % cl.get_unique(node))
     return None
   kept = amap[node]
   tw[kept.dom] = kept
   if watching:
     dribble.log("# %s is a tipward match, keeping: %s" %
                 (cl.get_unique(node), art.express(kept)))
   return kept
예제 #20
0
def extensional_match(node, xmrcas):
  """Derive an extensional articulation between `node` and its partner.

  The partner is ``xmrcas[node]``; the "bounce" is the partner's own entry
  in `xmrcas`.  The relation between `node` and the bounce, as reported by
  ``cl.how_related``, determines the relation and reason of the result.

  Args:
    node: the node to find an articulation for.
    xmrcas: mapping from node to its partner in the other checklist.

  Returns:
    ``art.extensional(node, partner, how, reason)``, or None when `node`
    has no partner or the partner has no entry of its own.
  """
  partner = xmrcas.get(node)      # node in other checklist; 'conode'
  if not partner:
    # Descendant of a particle
    if dribble.watch(node):
      dribble.log("# EM: %s is not tipward." % cl.get_unique(node))
    return None
  back = xmrcas.get(partner)    # 'bounce'
  if not back:
    # Not sure how this can happen but it does (NCBI vs. GBIF)
    dribble.log("%s <= %s <= nowhere" % (cl.get_unique(node),
                                         cl.get_unique(partner)))
    if dribble.watch(node):
      dribble.log("# EM: %s killed because aborted round trip." % cl.get_unique(node))
    return None
  # node <= partner <= back
  how = cl.how_related(node, back)    # Always rcc5
  if how == rel.eq:
    # Should end up being eq iff name match or unique match
    # Can test for unique match by looking at xmrca of parent

    # Could be part of a 'monotypic' chain; fix later
    how = rel.matches
    reason = "mutual-cross-mrca"
  elif how == rel.gt:
    how = rel.matches
    reason = "monotypic-inversion"
  elif how == rel.disjoint:
    # `how` stays rel.disjoint here.
    reason = "particle-set-exclusion"
  else:               # must be rel.lt
    # Assume resolution (node < partner) until conflict is proven
    reason = "refinement"
    # Look for an intersection between any partner-child and node
    # x is in A checklist, y is in B checklist
    for pchild in cl.get_children(partner):
      pchild_back = xmrcas.get(pchild)
      if pchild_back == None:
        # pchild ! node
        pass
      else:
        (d, e) = cross_compare(node, pchild, xmrcas)
        # d < node while e ! node
        if d and e:
          # Conflict found: record it and stop examining further children.
          how = rel.conflict
          reason = ("%s is in; its sibling %s is not" %
                    (cl.get_unique(d), cl.get_unique(e)))
          dribble.log("** %s conflicts with %s because\n"
                      "   %s ! %s\n   (but sibling %s < %s)" %
                      (cl.get_unique(node),
                       cl.get_unique(partner),
                       cl.get_unique(e),
                       cl.get_unique(node),
                       cl.get_unique(d),
                       cl.get_unique(node)))
          break
        elif e:
          # Not a conflict; just sharpen the reason.  A later child may
          # overwrite it again.
          reason = ("%s is not in it" % cl.get_unique(e))

  ar = art.extensional(node, partner, how, reason)
  if dribble.watch(node):
    dribble.log("# Extensional articulation %s" % art.express(ar))
  return ar
예제 #21
0
파일: property.py 프로젝트: jar398/cldiff
def by_name(name):
  """Look up a property by its pet name.

  Returns the entry from ``properties_by_pet_name``; when there is none
  (or it is falsy) the miss is logged and that falsy value is returned.
  """
  assert "\t" not in name
  found = properties_by_pet_name.get(name)
  if found:
    return found
  dribble.log("No such property: %s" % name)
  return found
예제 #22
0
def extensional(dom, cod, re, reason, revreason=None):
    """Create an articulation with ``bridge`` and log it if `dom` is watched.

    All five arguments are passed to ``bridge`` unchanged; its result is
    returned.
    """
    articulation = bridge(dom, cod, re, reason, revreason)
    watched = dribble.watch(dom)
    if watched:
        # NOTE(review): `bridge` is called unqualified but `express` is
        # reached through `art.` -- if this function lives in the `art`
        # module itself, `art` may be undefined here.  Confirm.
        dribble.log("# Extensional articulation %s" %
                    art.express(articulation))
    return articulation