def __init__(self, config, taxonomy_fname):
    """Set up an evaluator bound to `config` and the taxonomy file `taxonomy_fname`.

    Loads the taxonomy, allocates one `StatEntry` per rank level
    (indices 0..max_level inclusive) and starts with empty result lists.
    """
    self.config = config

    # Reference taxonomy; its deepest rank level sizes the stats table.
    self.taxonomy = GGTaxonomyFile(taxonomy_fname)
    self.max_level = self.taxonomy.max_rank_level()
    level_count = self.max_level + 1
    self.stats = [StatEntry() for _ in range(level_count)]

    # Index of the extra "Total" entry; stays None until one is appended.
    self.totstat_lvl = None

    # Sequence-id collections filled in by the calc_* methods.
    self.miss_list = []
    self.tp = []
    self.tn = []
    self.fp = []
    self.fn = []
def __init__(self, config, taxonomy_fname):
    """Set up an evaluator bound to `config` and the taxonomy file `taxonomy_fname`.

    Loads the taxonomy, allocates one `StatEntry` per rank level
    (indices 0..max_level inclusive) and starts with empty result lists.
    """
    self.config = config

    # Reference taxonomy; its deepest rank level sizes the stats table.
    self.taxonomy = GGTaxonomyFile(taxonomy_fname)
    self.max_level = self.taxonomy.max_rank_level()
    level_count = self.max_level + 1
    self.stats = [StatEntry() for _ in range(level_count)]

    # Index of the extra "Total" entry; stays None until one is appended.
    self.totstat_lvl = None

    # Sequence-id collections filled in by the calc_* methods.
    self.miss_list = []
    self.tp = []
    self.tn = []
    self.fp = []
    self.fn = []
for mis_rec in mislabel_map.itervalues(): rank = mis_rec["mis_rank"] rank_mis_map[rank] = rank_mis_map.get(rank, 0) + 1 rank_mis_count = sorted(rank_mis_map.items(), key=itemgetter(0)) total = 0 for rank, cnt in rank_mis_count: total += cnt # print "%s:\t%d\t%d" % (rank,cnt,total) print "" if not args.gtruth_fname: sys.exit(0) true_taxonomy = GGTaxonomyFile(args.gtruth_fname) out_path, out_stem = os.path.split(args.mis_fname) if args.output_dir: out_path = args.output_dir if out_stem.endswith(".mis"): out_stem = out_stem[:-4] # mis_fname = os.path.join(out_path, out_stem + ".mis") e = EpataxEvaluator(config, args.taxonomy_fname) e.calc_mislabel_stats(mislabel_map, true_taxonomy.seq_ranks_map, False) fp_fname = os.path.join(out_path, out_stem + ".eval%d.fp" % int(args.min_conf * 100)) with open(fp_fname, "w") as fout: for sid in e.fp:
def set_taxonomy(self, taxonomy_fname):
    """Replace the evaluator's taxonomy with one loaded from `taxonomy_fname`."""
    reloaded = GGTaxonomyFile(taxonomy_fname)
    self.taxonomy = reloaded
class EpataxEvaluator:
    """Scores taxonomic assignments against a reference taxonomy.

    Per-rank-level counters live in `self.stats` (one `StatEntry` per level,
    plus an optional trailing "Total" entry whose index is `self.totstat_lvl`).
    Sequence ids are additionally collected in `miss_list`, `tp`, `fp`, `fn`.
    """

    # Exclusive upper bound on rank indices touched by cumulative mislabel
    # stats. This is a fixed value in the code, not derived from `max_level`;
    # presumably the seven standard ranks -- TODO confirm.
    _CUMULATIVE_LEVELS = 7

    def __init__(self, config, taxonomy_fname):
        """Load the taxonomy from `taxonomy_fname` and allocate empty stats."""
        self.config = config
        self.taxonomy = GGTaxonomyFile(taxonomy_fname)
        self.max_level = self.taxonomy.max_rank_level()
        self.stats = [StatEntry() for _ in range(self.max_level + 1)]
        self.totstat_lvl = None
        self.miss_list = []
        self.tp = []
        self.tn = []
        self.fp = []
        self.fn = []

    def set_taxonomy(self, taxonomy_fname):
        """Replace the reference taxonomy with one loaded from `taxonomy_fname`."""
        self.taxonomy = GGTaxonomyFile(taxonomy_fname)

    def _ignore_level(self, true_ranks):
        """Return the first rank level of `true_ranks` that must be ignored.

        For clades which were ignored during the reference tree construction,
        the true assignment is the empty one (i.e., if a genus G1 was excluded
        from the tree, the "right" assignment is the family it belongs to).
        The result is the smallest level listed in
        `config.eval_ignored_clades` whose name matches `true_ranks`, or
        `len(true_ranks)` when nothing matches.
        """
        ignore_level = len(true_ranks)
        for rank_level, rank_name in self.config.eval_ignored_clades:
            if rank_level < ignore_level and true_ranks[rank_level] == rank_name:
                ignore_level = rank_level
        return ignore_level

    def _recalc_stats(self):
        """Recompute the derived accuracy measures of every stats entry."""
        for entry in self.stats:
            entry.recalc()

    def _append_total_entry(self):
        """Append a "Total" stats entry and remember its index."""
        self.stats.append(StatEntry())
        self.totstat_lvl = len(self.stats) - 1

    def calc_stats(self, tax_assign_map, cumulative=True):
        """Count TP/TN/FP/FP2/FN per rank level for the given assignments.

        `tax_assign_map` maps a sequence id to its list of assigned rank
        names ("" meaning no assignment at that level).  When `cumulative`
        is false, a "Total" entry is appended to `self.stats` first.
        Sequences with at least one miss are appended to `self.miss_list`.
        """
        if not cumulative:
            self._append_total_entry()

        taxonomy = self.taxonomy

        # evaluate the classification results against the ground truth
        for sid, epa_ranks in tax_assign_map.items():
            true_ranks = taxonomy.get_seq_ranks(sid)
            ignore_level = self._ignore_level(true_ranks)
            miss = False

            for r in range(self.max_level, -1, -1):
                assigned = epa_ranks[r]
                stat = self.stats[r]

                # ranks from ignored clades are treated as "missing data"
                # in the ground truth
                if r >= ignore_level:
                    # NOTE: even though no assignment is technically a TN, we
                    # count it as a TP in order to calculate precision/recall
                    # as usual
                    if assigned == "":
                        stat.tp += 1
                    else:
                        stat.fp += 1
                    continue

                # "normal" clades
                expected = true_ranks[r]
                if assigned == "":
                    if expected == "":
                        stat.tn += 1
                    else:
                        stat.fn += 1
                        miss = True
                elif expected == "":
                    # TODO think about this case
                    # it's considered FP for now, but can be actually correct
                    # classification
                    stat.fp2 += 1
                    miss = True
                elif assigned == expected:
                    stat.tp += 1
                else:
                    stat.fp += 1
                    miss = True

            if miss:
                self.miss_list.append(sid)

        # calculate accuracy measures
        self._recalc_stats()

    def calc_ranktest_stats(self, tax_assign_map):
        """Like `calc_stats`, but a false positive propagates to upper ranks.

        Levels are visited from `max_level` down to 0; once a level is
        counted as FP, every remaining (lower-index) level of that sequence
        is counted as FP as well.
        """
        taxonomy = self.taxonomy

        # evaluate the classification results against the ground truth
        for sid, epa_ranks in tax_assign_map.items():
            true_ranks = taxonomy.get_seq_ranks(sid)
            ignore_level = self._ignore_level(true_ranks)
            miss = False
            fp = False

            for r in range(self.max_level, -1, -1):
                stat = self.stats[r]

                # ranks from ignored clades are treated as "missing data"
                # in the ground truth
                if r >= ignore_level:
                    # NOTE: even though no assignment is technically a TN, we
                    # count it as a TP in order to calculate precision/recall
                    # as usual
                    if epa_ranks[r] == "":
                        stat.tp += 1
                    else:
                        stat.fp += 1
                        fp = True
                # FP in lower rank -> FP in all upper ranks
                elif fp:
                    stat.fp += 1
                # "normal" clades
                elif epa_ranks[r] == "":
                    if true_ranks[r] == "":
                        stat.tn += 1
                    else:
                        stat.fn += 1
                        miss = True
                elif epa_ranks[r] != true_ranks[r]:
                    # NOTE(review): this also covers "assigned but truth is
                    # empty", so that case is counted as plain FP here (never
                    # FP2), unlike calc_stats.  An fp2 branch used to follow
                    # this test but could never be reached; confirm whether
                    # plain FP is the intended rank-test behaviour.
                    stat.fp += 1
                    miss = True
                    fp = True
                else:
                    stat.tp += 1

            if miss:
                self.miss_list.append(sid)

        # calculate accuracy measures
        self._recalc_stats()

    def get_mis_level(self, ranks, true_ranks):
        """Return the first level where `ranks` disagrees with `true_ranks`.

        Levels whose entry in `ranks` is "-" are skipped.  Only the common
        prefix of both lists is compared.  Returns -1 if no level differs.
        """
        for rank_lvl, (rank, true_rank) in enumerate(zip(ranks, true_ranks)):
            if rank != "-" and rank != true_rank:
                return rank_lvl
        return -1

    def calc_mislabel_stats(self, mislabel_map, true_map, cumulative=True):
        """Score detected mislabels against the known mislabels in `true_map`.

        `mislabel_map` maps sequence id -> record with a "mis_lvl" key;
        `true_map` maps sequence id -> true rank list.  Ids only in
        `mislabel_map` are FP, ids only in `true_map` are FN, ids in both are
        TP; the ids are collected in `self.fp`, `self.fn`, `self.tp`.

        In cumulative mode a hit at level L is counted on levels
        L.._CUMULATIVE_LEVELS-1; otherwise it is counted on level L and on the
        appended "Total" entry.
        """
        taxonomy = self.taxonomy
        if not cumulative:
            self._append_total_entry()
        top = self._CUMULATIVE_LEVELS

        for sid, mis_rec in mislabel_map.items():
            if sid in true_map:
                continue
            self.fp.append(sid)
            # "mis_lvl" is shifted down by one to obtain the stats index.
            lvl = mis_rec["mis_lvl"] - 1
            rlist = range(lvl, top) if cumulative else [lvl, self.totstat_lvl]
            for r in rlist:
                self.stats[r].fp += 1

        # evaluate the classification results against the ground truth
        for sid, true_ranks in true_map.items():
            lvl = self.get_mis_level(taxonomy.get_seq_ranks(sid), true_ranks)
            # NOTE(review): get_mis_level returns -1 when no rank differs;
            # that value is used as a list index below and therefore
            # addresses the last stats entry -- confirm this is intended.
            rlist = range(lvl, top) if cumulative else [lvl, self.totstat_lvl]
            if sid in mislabel_map:
                self.tp.append(sid)
                for r in rlist:
                    self.stats[r].tp += 1
            else:
                self.fn.append(sid)
                for r in rlist:
                    self.stats[r].fn += 1

        # calculate accuracy measures
        self._recalc_stats()

    def write_misclassified(self, tax_assign_map, tax_assign_conf, out_fname):
        """Write every sequence in `self.miss_list` to `out_fname`.

        Each record consists of the sequence id, the true lineage, the
        assigned lineage, the tab-separated rank confidences and a blank
        line.
        """
        taxonomy = self.taxonomy
        with open(out_fname, 'w') as fout:
            fout.write("; List of misclassified sequences.\n")
            fout.write("; Format: 4 lines per sequence\n")
            fout.write("; 1 sequence ID\n")
            fout.write("; 2 correct assignment\n")
            fout.write("; 3 epatax assignment\n")
            fout.write("; 4 rank confidence levels\n\n")

            for sid in self.miss_list:
                fout.write(sid + "\n")
                true_ranks = taxonomy.get_seq_ranks(sid)
                fout.write(Taxonomy.lineage_str(true_ranks) + "\n")
                epa_ranks = tax_assign_map[sid]
                fout.write(Taxonomy.lineage_str(epa_ranks) + "\n")
                rank_conf = tax_assign_conf[sid]
                fout.write("\t".join(["%.3f" % conf for conf in rank_conf]) + "\n")
                fout.write("\n")

    def write_stats(self, out_fname):
        """Write a header line plus one tab-separated line per stats entry.

        Rows are labelled with the taxonomy's rank level name, except the
        entry at `self.totstat_lvl`, which is labelled "Total".
        """
        taxonomy = self.taxonomy
        with open(out_fname, 'w') as fout:
            fout.write(StatEntry.desc_string() + "\n")
            for i, entry in enumerate(self.stats):
                if i == self.totstat_lvl:
                    rname = "Total"
                else:
                    rname = taxonomy.rank_level_name(i)
                fout.write(rname + "\t" + entry.to_string("\t") + "\n")
for mis_rec in mislabel_map.itervalues(): rank = mis_rec["mis_rank"] rank_mis_map[rank] = rank_mis_map.get(rank, 0) + 1 rank_mis_count = sorted(rank_mis_map.items(), key=itemgetter(0)) total = 0 for rank, cnt in rank_mis_count: total += cnt # print "%s:\t%d\t%d" % (rank,cnt,total) print "" if not args.gtruth_fname: sys.exit(0) true_taxonomy = GGTaxonomyFile(args.gtruth_fname) out_path, out_stem = os.path.split(args.mis_fname) if args.output_dir: out_path = args.output_dir if out_stem.endswith(".mis"): out_stem = out_stem[:-4] # mis_fname = os.path.join(out_path, out_stem + ".mis") e = EpataxEvaluator(config, args.taxonomy_fname) e.calc_mislabel_stats(mislabel_map, true_taxonomy.seq_ranks_map, False) fp_fname = os.path.join(out_path, out_stem + ".eval%d.fp" % int(args.min_conf * 100)) with open(fp_fname, "w") as fout: for sid in e.fp: ranks = mislabel_map[sid]["ranks"]
class EpataxEvaluator:
    """Scores taxonomic assignments against a reference taxonomy.

    Per-rank-level counters live in `self.stats` (one `StatEntry` per level,
    plus an optional trailing "Total" entry whose index is `self.totstat_lvl`).
    Sequence ids are additionally collected in `miss_list`, `tp`, `fp`, `fn`.
    """

    # Exclusive upper bound on rank indices touched by cumulative mislabel
    # stats. This is a fixed value in the code, not derived from `max_level`;
    # presumably the seven standard ranks -- TODO confirm.
    _CUMULATIVE_LEVELS = 7

    def __init__(self, config, taxonomy_fname):
        """Load the taxonomy from `taxonomy_fname` and allocate empty stats."""
        self.config = config
        self.taxonomy = GGTaxonomyFile(taxonomy_fname)
        self.max_level = self.taxonomy.max_rank_level()
        self.stats = [StatEntry() for _ in range(self.max_level + 1)]
        self.totstat_lvl = None
        self.miss_list = []
        self.tp = []
        self.tn = []
        self.fp = []
        self.fn = []

    def set_taxonomy(self, taxonomy_fname):
        """Replace the reference taxonomy with one loaded from `taxonomy_fname`."""
        self.taxonomy = GGTaxonomyFile(taxonomy_fname)

    def _ignore_level(self, true_ranks):
        """Return the first rank level of `true_ranks` that must be ignored.

        For clades which were ignored during the reference tree construction,
        the true assignment is the empty one (i.e., if a genus G1 was excluded
        from the tree, the "right" assignment is the family it belongs to).
        The result is the smallest level listed in
        `config.eval_ignored_clades` whose name matches `true_ranks`, or
        `len(true_ranks)` when nothing matches.
        """
        ignore_level = len(true_ranks)
        for rank_level, rank_name in self.config.eval_ignored_clades:
            if rank_level < ignore_level and true_ranks[rank_level] == rank_name:
                ignore_level = rank_level
        return ignore_level

    def _recalc_stats(self):
        """Recompute the derived accuracy measures of every stats entry."""
        for entry in self.stats:
            entry.recalc()

    def _append_total_entry(self):
        """Append a "Total" stats entry and remember its index."""
        self.stats.append(StatEntry())
        self.totstat_lvl = len(self.stats) - 1

    def calc_stats(self, tax_assign_map, cumulative=True):
        """Count TP/TN/FP/FP2/FN per rank level for the given assignments.

        `tax_assign_map` maps a sequence id to its list of assigned rank
        names ("" meaning no assignment at that level).  When `cumulative`
        is false, a "Total" entry is appended to `self.stats` first.
        Sequences with at least one miss are appended to `self.miss_list`.
        """
        if not cumulative:
            self._append_total_entry()

        taxonomy = self.taxonomy

        # evaluate the classification results against the ground truth
        for sid, epa_ranks in tax_assign_map.items():
            true_ranks = taxonomy.get_seq_ranks(sid)
            ignore_level = self._ignore_level(true_ranks)
            miss = False

            for r in range(self.max_level, -1, -1):
                assigned = epa_ranks[r]
                stat = self.stats[r]

                # ranks from ignored clades are treated as "missing data"
                # in the ground truth
                if r >= ignore_level:
                    # NOTE: even though no assignment is technically a TN, we
                    # count it as a TP in order to calculate precision/recall
                    # as usual
                    if assigned == "":
                        stat.tp += 1
                    else:
                        stat.fp += 1
                    continue

                # "normal" clades
                expected = true_ranks[r]
                if assigned == "":
                    if expected == "":
                        stat.tn += 1
                    else:
                        stat.fn += 1
                        miss = True
                elif expected == "":
                    # TODO think about this case
                    # it's considered FP for now, but can be actually correct
                    # classification
                    stat.fp2 += 1
                    miss = True
                elif assigned == expected:
                    stat.tp += 1
                else:
                    stat.fp += 1
                    miss = True

            if miss:
                self.miss_list.append(sid)

        # calculate accuracy measures
        self._recalc_stats()

    def calc_ranktest_stats(self, tax_assign_map):
        """Like `calc_stats`, but a false positive propagates to upper ranks.

        Levels are visited from `max_level` down to 0; once a level is
        counted as FP, every remaining (lower-index) level of that sequence
        is counted as FP as well.
        """
        taxonomy = self.taxonomy

        # evaluate the classification results against the ground truth
        for sid, epa_ranks in tax_assign_map.items():
            true_ranks = taxonomy.get_seq_ranks(sid)
            ignore_level = self._ignore_level(true_ranks)
            miss = False
            fp = False

            for r in range(self.max_level, -1, -1):
                stat = self.stats[r]

                # ranks from ignored clades are treated as "missing data"
                # in the ground truth
                if r >= ignore_level:
                    # NOTE: even though no assignment is technically a TN, we
                    # count it as a TP in order to calculate precision/recall
                    # as usual
                    if epa_ranks[r] == "":
                        stat.tp += 1
                    else:
                        stat.fp += 1
                        fp = True
                # FP in lower rank -> FP in all upper ranks
                elif fp:
                    stat.fp += 1
                # "normal" clades
                elif epa_ranks[r] == "":
                    if true_ranks[r] == "":
                        stat.tn += 1
                    else:
                        stat.fn += 1
                        miss = True
                elif epa_ranks[r] != true_ranks[r]:
                    # NOTE(review): this also covers "assigned but truth is
                    # empty", so that case is counted as plain FP here (never
                    # FP2), unlike calc_stats.  An fp2 branch used to follow
                    # this test but could never be reached; confirm whether
                    # plain FP is the intended rank-test behaviour.
                    stat.fp += 1
                    miss = True
                    fp = True
                else:
                    stat.tp += 1

            if miss:
                self.miss_list.append(sid)

        # calculate accuracy measures
        self._recalc_stats()

    def get_mis_level(self, ranks, true_ranks):
        """Return the first level where `ranks` disagrees with `true_ranks`.

        Levels whose entry in `ranks` is "-" are skipped.  Only the common
        prefix of both lists is compared.  Returns -1 if no level differs.
        """
        for rank_lvl, (rank, true_rank) in enumerate(zip(ranks, true_ranks)):
            if rank != "-" and rank != true_rank:
                return rank_lvl
        return -1

    def calc_mislabel_stats(self, mislabel_map, true_map, cumulative=True):
        """Score detected mislabels against the known mislabels in `true_map`.

        `mislabel_map` maps sequence id -> record with a "mis_lvl" key;
        `true_map` maps sequence id -> true rank list.  Ids only in
        `mislabel_map` are FP, ids only in `true_map` are FN, ids in both are
        TP; the ids are collected in `self.fp`, `self.fn`, `self.tp`.

        In cumulative mode a hit at level L is counted on levels
        L.._CUMULATIVE_LEVELS-1; otherwise it is counted on level L and on the
        appended "Total" entry.
        """
        taxonomy = self.taxonomy
        if not cumulative:
            self._append_total_entry()
        top = self._CUMULATIVE_LEVELS

        for sid, mis_rec in mislabel_map.items():
            if sid in true_map:
                continue
            self.fp.append(sid)
            # "mis_lvl" is shifted down by one to obtain the stats index.
            lvl = mis_rec["mis_lvl"] - 1
            rlist = range(lvl, top) if cumulative else [lvl, self.totstat_lvl]
            for r in rlist:
                self.stats[r].fp += 1

        # evaluate the classification results against the ground truth
        for sid, true_ranks in true_map.items():
            lvl = self.get_mis_level(taxonomy.get_seq_ranks(sid), true_ranks)
            # NOTE(review): get_mis_level returns -1 when no rank differs;
            # that value is used as a list index below and therefore
            # addresses the last stats entry -- confirm this is intended.
            rlist = range(lvl, top) if cumulative else [lvl, self.totstat_lvl]
            if sid in mislabel_map:
                self.tp.append(sid)
                for r in rlist:
                    self.stats[r].tp += 1
            else:
                self.fn.append(sid)
                for r in rlist:
                    self.stats[r].fn += 1

        # calculate accuracy measures
        self._recalc_stats()

    def write_misclassified(self, tax_assign_map, tax_assign_conf, out_fname):
        """Write every sequence in `self.miss_list` to `out_fname`.

        Each record consists of the sequence id, the true lineage, the
        assigned lineage, the tab-separated rank confidences and a blank
        line.
        """
        taxonomy = self.taxonomy
        with open(out_fname, 'w') as fout:
            fout.write("; List of misclassified sequences.\n")
            fout.write("; Format: 4 lines per sequence\n")
            fout.write("; 1 sequence ID\n")
            fout.write("; 2 correct assignment\n")
            fout.write("; 3 epatax assignment\n")
            fout.write("; 4 rank confidence levels\n\n")

            for sid in self.miss_list:
                fout.write(sid + "\n")
                true_ranks = taxonomy.get_seq_ranks(sid)
                fout.write(Taxonomy.lineage_str(true_ranks) + "\n")
                epa_ranks = tax_assign_map[sid]
                fout.write(Taxonomy.lineage_str(epa_ranks) + "\n")
                rank_conf = tax_assign_conf[sid]
                fout.write("\t".join(["%.3f" % conf for conf in rank_conf]) + "\n")
                fout.write("\n")

    def write_stats(self, out_fname):
        """Write a header line plus one tab-separated line per stats entry.

        Rows are labelled with the taxonomy's rank level name, except the
        entry at `self.totstat_lvl`, which is labelled "Total".
        """
        taxonomy = self.taxonomy
        with open(out_fname, 'w') as fout:
            fout.write(StatEntry.desc_string() + "\n")
            for i, entry in enumerate(self.stats):
                if i == self.totstat_lvl:
                    rname = "Total"
                else:
                    rname = taxonomy.rank_level_name(i)
                fout.write(rname + "\t" + entry.to_string("\t") + "\n")