Exemplo n.º 1
0
def add_taxonomy_metadata(non_monophyletic_taxa):
    """Annotate each broken taxon with its OTT name and rank.

    For every key of ``non_monophyletic_taxa['non_monophyletic_taxa']`` the
    literal text ``ott`` is stripped from the key, the remainder is parsed as
    an integer id, and the id is looked up in the name and rank tables of a
    freshly constructed ``ott.OTT()``.  The results are stored in place under
    the ``'name'`` and ``'rank'`` keys of that taxon's entry, and one
    ``oid name rank`` line per taxon is printed to stdout.

    Args:
        non_monophyletic_taxa: mapping with a ``'non_monophyletic_taxa'``
            entry that maps ids such as ``'ott123'`` to mutable mappings.

    Returns:
        The same ``non_monophyletic_taxa`` object, updated in place.

    Raises:
        ValueError: if a key is not an integer once ``ott`` is removed.
    """
    broken_taxa = non_monophyletic_taxa['non_monophyletic_taxa']
    taxonomy = ott.OTT()
    id2names = taxonomy.ott_id_to_names
    id2ranks = taxonomy.ott_id_to_ranks

    # The pattern does not depend on the taxon, so build it once up front
    # instead of on every iteration.
    ott_prefix = re.compile(r'ott')

    for oid in broken_taxa:
        int_id = int(ott_prefix.sub('', oid))

        name = "no name"
        if int_id in id2names:
            name = id2names[int_id]
            # Some entries hold a tuple of names; report only the first.
            if isinstance(name, tuple):
                name = name[0]

        # Looked up independently of the name so that a taxon with a known
        # rank is not reported as "no rank" merely because its name is absent.
        rank = "no rank"
        if int_id in id2ranks:
            rank = id2ranks[int_id]

        print(oid, name, rank)
        broken_taxa[oid]['name'] = name
        broken_taxa[oid]['rank'] = rank

    non_monophyletic_taxa['non_monophyletic_taxa'] = broken_taxa
    return non_monophyletic_taxa
Exemplo n.º 2
0
def newly_broken_taxa_report(run1, run2):
    """Report taxa that are broken in ``run2`` but not in ``run1``, by tree.

    Steps performed:

    1. Load the taxonomy via ``ott.OTT()`` and obtain ranks from
       ``rank_unranked_nodes``.
    2. Write one ``id,name,rank`` line per newly broken taxon (in
       ``run2.broken_taxa`` but not in ``run1.broken_taxa``) to
       ``broken_taxa_report.csv`` in the current directory.
    3. Walk ``run2.get_taxon_conflict_info()``, which is iterated here as
       ``{ott_node: {relation: {tree: ...}}}``, and tally per tree (and per
       tree and rank) which taxa the tree conflicts with, aligns to, or
       resolves, and which conflicts are newly broken.
    4. Print two colourised listings to stdout: trees that conflict with at
       least one newly broken taxon, then the remaining conflicting trees.

    Progress messages are written to stderr.

    Args:
        run1: baseline run; only its ``broken_taxa`` attribute is read.
        run2: run under examination; ``broken_taxa`` and
            ``get_taxon_conflict_info()`` are used.

    Raises:
        KeyError: if a node in the conflict info has no entry in the rank
            table, or a rank is missing from ``rank_of_rank``.
    """
    # load local copy of OTT
    print("\nAnalyzing broken taxa:", file=sys.stderr)
    print("  * Loading OTT ... ", end='', flush=True, file=sys.stderr)
    taxonomy = ott.OTT()
    print("done. (Using version {})".format(taxonomy.version),
          flush=True,
          file=sys.stderr)

    # Some entries hold a tuple of names; keep only the first one.  Only
    # existing keys are reassigned, so updating while iterating is safe.
    id2names = taxonomy.ott_id_to_names
    for ott_id in id2names:
        names = id2names[ott_id]
        if isinstance(names, tuple):
            id2names[ott_id] = names[0]

    id2ranks = rank_unranked_nodes(taxonomy)

    # print details of names in 2 but not in 1 (the 'newly broken names')
    bt1 = set(run1.broken_taxa)
    bt2 = set(run2.broken_taxa)
    diff = bt2.difference(bt1)
    broken_taxa_filename = 'broken_taxa_report.csv'
    # Progress bullet: goes to stderr like the other "  * ..." messages so
    # that stdout carries only the report itself.
    print("  * Printing details of {x} broken taxa to {f}".format(
        x=len(diff), f=broken_taxa_filename),
          file=sys.stderr)

    conflict_status2 = run2.get_taxon_conflict_info()

    with open(broken_taxa_filename, 'w') as f:
        for ottID in diff:
            int_id = get_id_from_ottnum(ottID)
            name = id2names[int_id] if int_id in id2names else "no name"
            rank = id2ranks[int_id] if int_id in id2ranks else "no rank"
            f.write("{i},{n},{r}\n".format(i=int_id, n=name, r=rank))

    # We want to know:
    #   For each tree,
    #     For each rank, and for all ranks put together
    #       How many taxa does this tree (a) conflict with, (b) resolve (c) align to
    # Then, for the conflict, we want to know how many of each rank are NEW with run2?
    # Maybe we also want to know how many trees from the same study are being used?

    # Relation names found in the conflict info -> category used in the report.
    # Relations not listed here are ignored.
    category_of_relation = {
        "conflicts_with": "conflicts_with",
        "supported_by": "aligns_to",
        "partial_path_of": "aligns_to",
        "resolved_by": "resolves",
    }

    victims = defaultdict(set)        # tree -> ids the tree conflicts with
    new_victims = defaultdict(set)    # tree -> newly broken ids among those

    tree_conflict = defaultdict(lambda: defaultdict(set))
    tree_conflict_at_rank = defaultdict(
        lambda: defaultdict(lambda: defaultdict(set)))
    for ott_node, node_conflict in conflict_status2.items():
        int_id = get_id_from_ottnum(ott_node)
        rank = id2ranks[int_id]
        is_newly_broken = ott_node in diff
        for rel, tree_nodes in node_conflict.items():
            # The category depends only on the relation, so decide it once
            # rather than once per tree.
            category = category_of_relation.get(rel)
            if category is None:
                continue
            for tree in tree_nodes:
                tree_conflict[tree][category].add(int_id)
                tree_conflict_at_rank[tree][rank][category].add(int_id)
                if category != "conflicts_with":
                    continue
                victims[tree].add(int_id)
                if is_newly_broken:
                    new_victims[tree].add(int_id)
                    tree_conflict[tree]["newly_broken"].add(int_id)
                    tree_conflict_at_rank[tree][rank]["newly_broken"].add(
                        int_id)

    # FIXME: write out number of duplicate trees per study
    # NEW: show aligns_to last

    def colour_if_positive(count, colour):
        """Return ``colour(count)`` for a positive count, else the bare count."""
        return colour(count) if count > 0 else count

    def tree_label(tree):
        """Return the tree name, in bold red when conflicts outnumber alignments."""
        totals = tree_conflict[tree]
        if len(totals["conflicts_with"]) > len(totals["aligns_to"]):
            return bold(red(tree))
        return tree

    def example_names(rank, ott_ids):
        """Format the names of ``ott_ids``; empty unless ``rank`` sorts before genus."""
        if rank_of_rank[rank] < rank_of_rank["genus"]:
            names = {id2names[ott_id] for ott_id in ott_ids}
            if names:
                return '{}'.format(names)
        return ''

    def print_rank_rows(tree, highlight):
        """Print one line per rank for ``tree``, ordered by ``rank_of_rank``.

        ``highlight`` names the category ("newly_broken" or "conflicts_with")
        that supplies the example names and decides whether the rank label is
        coloured.  The "(newly broken)" column is shown only when
        ``highlight`` is "newly_broken".
        """
        per_rank = tree_conflict_at_rank[tree]
        for rank in sorted(per_rank, key=lambda key: rank_of_rank[key]):
            conflict = per_rank[rank]
            examples = example_names(rank, conflict[highlight])
            crank = get_color_rank(rank) if conflict[highlight] else rank

            counts = "{} / {} / {}".format(
                colour_if_positive(len(conflict["conflicts_with"]), yellow),
                colour_if_positive(len(conflict["aligns_to"]), cyan),
                colour_if_positive(len(conflict["resolves"]), green))
            if highlight == "newly_broken":
                n_newly_broken = len(conflict["newly_broken"])
                if n_newly_broken > 0:
                    n_newly_broken = bold(yellow(n_newly_broken))
                counts = "({}) {}".format(n_newly_broken, counts)

            print("   {}: {}    {}".format(crank, counts, examples))

    print(
        "Here are the {} trees that broke NEW taxa, starting with the most newly-broken taxa:\n"
        .format(len(new_victims)))
    print("\n\n{}: ({}) {} / {} / {} ".format("tree",
                                              bold(yellow("newly_broken")),
                                              yellow("conflicts_with"),
                                              cyan("aligns_to"),
                                              green("resolves")))

    for tree in sorted(new_victims,
                       key=lambda x: len(new_victims[x]),
                       reverse=True):
        totals = tree_conflict[tree]
        print("\n\n{}: ({}) {} / {} / {} ".format(
            tree_label(tree), bold(yellow(len(totals["newly_broken"]))),
            yellow(len(totals["conflicts_with"])),
            cyan(len(totals["aligns_to"])),
            green(len(totals["resolves"]))))
        print_rank_rows(tree, "newly_broken")

    # These trees have no newly broken taxa, so they are ordered (and
    # described) by the total number of taxa they conflict with.
    print(
        "\n\n\nHere are the other {} trees that broke taxa, starting with the most broken taxa:\n"
        .format(len(victims) - len(new_victims)))
    for tree in sorted(victims, key=lambda x: len(victims[x]), reverse=True):
        if tree in new_victims:
            continue

        totals = tree_conflict[tree]
        print("\n\n{}: {} / {} / {} ".format(
            tree_label(tree), yellow(len(totals["conflicts_with"])),
            cyan(len(totals["aligns_to"])),
            green(len(totals["resolves"]))))
        print_rank_rows(tree, "conflicts_with")
Exemplo n.º 3
0
def newly_broken_taxa_report(run1, run2):
    """Compare the taxon conflicts of two runs and print a per-tree report.

    The function:

    * loads the taxonomy with ``ott.OTT()`` and gets ranks from
      ``rank_unranked_nodes``;
    * writes an ``id,name,rank`` line to ``broken_taxa_report.csv`` for each
      taxon in ``run2.broken_taxa`` that is not in ``run1.broken_taxa``;
    * prints, for every tree in run2's conflict info, a summary line produced
      by ``conflict_summary_line`` against the union of run1's conflicts,
      followed by one line per rank;
    * lists the studies that contribute more than one input tree to ``run2``.

    All output goes to stdout.

    Args:
        run1: baseline run (``broken_taxa``, ``get_taxon_conflict_info()``).
        run2: run under examination (``broken_taxa``,
            ``get_taxon_conflict_info()``, ``read_input_trees()``).
    """
    # --- load the local copy of OTT -------------------------------------
    print("\nAnalyzing broken taxa:")
    print("  * Loading OTT ... ", end='', flush=True)
    taxonomy = ott.OTT()
    print("done. (Using version {})".format(taxonomy.version), flush=True)

    # Collapse tuple-valued name entries to their first element.
    id2names = taxonomy.ott_id_to_names
    for ott_id in id2names:
        current = id2names[ott_id]
        if isinstance(current, tuple):
            id2names[ott_id] = current[0]

    id2ranks = rank_unranked_nodes(taxonomy)

    # --- names broken in run2 but not in run1 ("newly broken") ----------
    broken_before = set(run1.broken_taxa)
    broken_after = set(run2.broken_taxa)
    newly_broken = broken_after.difference(broken_before)
    report_path = 'broken_taxa_report.csv'
    print("  * Printing details of {x} broken taxa to {f}".format(
        x=len(newly_broken), f=report_path))

    conflict_status1 = run1.get_taxon_conflict_info()
    conflict_status2 = run2.get_taxon_conflict_info()

    with open(report_path, 'w') as report:
        for ott_label in newly_broken:
            int_id = get_id_from_ottnum(ott_label)
            name = id2names[int_id] if int_id in id2names else "no name"
            rank = id2ranks[int_id] if int_id in id2ranks else "no rank"
            report.write("{i},{n},{r}\n".format(i=int_id, n=name, r=rank))

    # Goal of the listing below:
    #   for each tree, for each rank and for all ranks together, how many
    #   taxa the tree conflicts with / aligns to / resolves, and how that
    #   differs from run1.
    by_tree1, by_tree_rank1 = get_conflict_info_by_tree(conflict_status1,
                                                        id2ranks)
    conflict1, conflict_at_rank1 = union_over_trees(by_tree1, by_tree_rank1)
    by_tree2, by_tree_rank2 = get_conflict_info_by_tree(conflict_status2,
                                                        id2ranks)

    print("Here are the trees in order of NEW broken taxa, then all broken taxa:\n")
    legend = "\n\n{}: ({}) {} / ({}) {} / ({}) {} ".format(
        "tree",
        bold(yellow("change")), yellow("conflicts_with"),
        bold(cyan("change")), cyan("aligns_to"),
        bold(green("change")), green("resolves"))
    print(legend)

    def breakage(tree_name):
        """Sort key: (conflicts absent from run1, all conflicts) for a tree."""
        broken = by_tree2[tree_name]["conflicts_with"]
        return (len(broken - conflict1["conflicts_with"]), len(broken))

    for tree in sorted(by_tree2, key=breakage, reverse=True):
        totals = by_tree2[tree]
        # Flag trees that conflict with more taxa than they align to.
        if len(totals["conflicts_with"]) > len(totals["aligns_to"]):
            ctree = bold(red(tree))
        else:
            ctree = tree
        print("\n\n{}: {}".format(ctree,
                                  conflict_summary_line(conflict1, totals)))

        per_rank = by_tree_rank2[tree]
        for rank in sorted(per_rank, key=lambda r: rank_of_rank[r]):
            c1 = conflict_at_rank1[rank]
            c2 = per_rank[rank]
            fresh = c2["conflicts_with"] - c1["conflicts_with"]

            # Example names are listed only for ranks sorting before genus.
            examples = ''
            if rank_of_rank[rank] < rank_of_rank["genus"]:
                example_names = {id2names[ott_id] for ott_id in fresh}
                if example_names:
                    examples = '{}'.format(example_names)

            crank = get_color_rank(rank) if fresh else rank

            print("   {}: {}    {}".format(crank,
                                           conflict_summary_line(c1, c2),
                                           examples))

    # --- studies that supply more than one input tree --------------------
    print ("Studies with duplicate trees:\n\n")
    trees_for_study = defaultdict(set)
    for study_tree in run2.read_input_trees()['input_trees']:
        study, tree = study_tree.split('@')
        trees_for_study[study].add(tree)
    for study, trees in trees_for_study.items():
        if len(trees) <= 1:
            continue
        print("{} : {} : {}".format(study, len(trees), trees))
Exemplo n.º 4
0
    parser.add_argument('-v',
                        dest='verbose',
                        action='store_true',
                        help='print details on diffs; default false')
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_cmdline()
    print("  * Synth output: {d}".format(d=args.synth), file=sys.stderr)

    taxon = args.taxon
    taxon_id = get_id_from_ottnum(taxon)

    print("  * Loading OTT ... ", end='', flush=True, file=sys.stderr)
    taxonomy = ott.OTT()
    print("done. (Using version {})".format(taxonomy.version),
          flush=True,
          file=sys.stderr)

    # get stats object for each run
    synth = runStatistics(args.synth)

    print("  * Finding nodes for taxon {}, id={}".format(args.taxon, taxon_id),
          file=sys.stderr)
    desc = get_all_descendants(taxonomy, taxon_id)
    print("    - Found {} descendants, including original taxon".format(
        len(desc)),
          file=sys.stderr)

    conflict = synth.get_taxon_conflict_info()