def doit():
    """Merge the Ruggiero and OTT taxonomies and dump the union.

    Loads 'scratch/Ruggiero/' (idspace 'rug') and 'tax/ott/' (idspace
    'ott'), absorbs them in that order into a new union taxonomy, and
    writes the result under 'scratch/compare_Ruggiero/' with a tab
    separator.
    """
    sources = [
        Taxonomy.getTaxonomy('scratch/Ruggiero/', 'rug'),
        Taxonomy.getTaxonomy('tax/ott/', 'ott'),
    ]
    merged = UnionTaxonomy.newTaxonomy('ott')
    # Keep the original absorption order: Ruggiero first, then OTT.
    for source in sources:
        merged.absorb(source)
    merged.dump('scratch/compare_Ruggiero/', '\t')
def conflict(spec1, space1, spec2, space2): # Reference tree ref = Taxonomy.getTaxonomy(spec1, space1) # Input tree input = Taxonomy.getTaxonomy(spec2, space2) a = AlignmentByName(input, ref) a.align(); if False: for node in input.taxa(): print node, a.getTaxon(node) print 'Conflict analysis' ca = ConflictAnalysis(input, ref, a, False) print ' input root:', ca.inputRoot print ' ref root:', ca.refRoot print ' induced root:', ca.inducedRoot print ' ingroup:', ca.ingroup print ' induced ingroup:', ca.inducedIngroup print ' map size:', ca.map.size() print ' comap size:', ca.comap.size() mapped_tip_count = 0 unmapped_tip_count = 0 none_count = 0 rel_counts = {} if ca.inducedRoot != None: for node in ca.ingroup.descendants(True): if node.hasChildren(): art = ca.articulation(node) if art != None: n = art.disposition.name print node, n, art.witness rel_counts[n] = rel_counts.get(n, 0) + 1 else: print node, 'no articulation' none_count += 1 elif a.getTaxon(node) != None: mapped_tip_count += 1 else: unmapped_tip_count += 1 print node, 'unmapped' else: print 'no induced root!' print for n in rel_counts: print '%s: %s' % (n, rel_counts[n]) print 'Mapped tips:', mapped_tip_count print 'Unmapped tips:', unmapped_tip_count print 'Other:', none_count
def tst(noise, target, source): print '##', noise sep = Taxonomy.getRawTaxonomy('tax/skel/', 'ott') t = Taxonomy.getRawTaxonomy(target, 'target') s = Taxonomy.getRawTaxonomy(source, 'source') u = combine(sep, t, s, blustery) u.dumpChoices('/tmp/align_tests_choices.tsv') subprocess.call(['cat', '/tmp/align_tests_choices.tsv']) if False: u.dumpLog('/tmp/align_tests_log.tsv') subprocess.call(['cat', '/tmp/align_tests_log.tsv']) print
def load_tree(path): tree = Taxonomy.getTaxonomy(path, 'ott') count = 0 for id in tree.allIds(): count += 1 print count, 'ids' return tree
def doit(tax_path, ids_path): ott = Taxonomy.getRawTaxonomy(tax_path, 'ott') all_nodes = {} with open(ids_path, 'r') as infile: reader = csv.reader(infile, delimiter='\t') otu_count = 0 for row in reader: id = row[0] if otu_count % 50000 == 0: print otu_count, id otu_count += 1 node = ott.lookupId(id) if node != None: all_nodes[node.id] = node print 'OTT taxa assigned to OTUs:', len(all_nodes) prefix_to_count = {} ott_count = 0 for id in all_nodes: node = all_nodes[id] ott_count += 1 for qid in node.sourceIds: prefix = qid.prefix count = prefix_to_count.get(prefix, 0) prefix_to_count[prefix] = count + 1 print 'OTT ids assigned to OTUs:', otu_count for prefix in prefix_to_count: print prefix, prefix_to_count[prefix]
def load_fung(): fung = Taxonomy.getTaxonomy('tax/fung/', 'if') fung.analyzeMajorRankConflicts() # 2014-04-14 Bad Fungi homonyms in new version of IF. 90156 is the good one. # 90154 has no descendants if fung.maybeTaxon('90154') != None: print 'Removing Fungi 90154' fung.taxon('90154').prune(this_source) # 90155 is "Nom. inval." and has no descendants if fung.maybeTaxon('90155') != None: print 'Removing Fungi 90155' fung.taxon('90155').prune(this_source) fix_basal(fung) # smush folds sibling taxa that have the same name. # fung.smush() if True: patch_fung(fung) else: try: patch_fung(fung) except: print '**** Exception in patch_fung' fung.smush() return fung
def compare(t1, t2): print 'comparing', t1, 'to', t2 retired = 0 became_hidden = 0 became_unhidden = 0 became_extinct = 0 became_unextinct = 0 became_suppressed = 0 became_unsuppressed = 0 kept = 0 novel = 0 tax1 = Taxonomy.getTaxonomy(t1, 'x') tax1.inferFlags() tax2 = Taxonomy.getTaxonomy(t2, 'x') tax2.inferFlags() for taxon in tax1.taxa(): probe = tax2.lookupId(taxon.id) if probe == None: retired += 1 elif probe.isAnnotatedHidden() and not taxon.isAnnotatedHidden(): became_hidden += 1 elif not probe.isAnnotatedHidden() and taxon.isAnnotatedHidden(): became_unhidden += 1 elif probe.isExtinct() and not taxon.isExtinct(): became_extinct += 1 elif not probe.isExtinct() and taxon.isExtinct(): became_unextinct += 1 elif probe.isHidden() and not taxon.isHidden(): became_suppressed += 1 elif not probe.isHidden() and taxon.isHidden(): became_unsuppressed += 1 else: kept += 1 for taxon in tax2.taxa(): if tax1.lookupId(taxon.id) == None: novel += 1 print print 'id retired:', retired print 'newly hidden:', became_hidden print 'no longer hidden:', became_unhidden print 'newly extinct:', became_extinct print 'no longer extinct:', became_unextinct print 'newly otherwise suppressed:', became_suppressed print 'no longer otherwise suppressed:', became_unsuppressed print 'new:', novel print 'no change in status:', kept
def report(dir, idspace):
    """Write the homonym report for the taxonomy stored under tax/<dir>/.

    The report goes to <report_dir>/<dir>-homonym-report.csv; report_dir
    is a module-level setting and is created if it does not exist.
    """
    # Joining with '' leaves a trailing separator on the path.
    source_path = os.path.join('tax', dir, '')
    tax = Taxonomy.getRawTaxonomy(source_path, idspace)

    # tax.smush()
    # HomonymReport.homonymDensityReport(tax, dir + '-density-report.csv')
    # HomonymReport.homonymUncertaintyReport(tax, 'reports/' + dir + '-uncertainty-report.csv')

    if not os.path.isdir(report_dir):
        os.makedirs(report_dir)
    report_path = os.path.join(report_dir, dir + '-homonym-report.csv')
    HomonymReport.homonymReport(tax, report_path)
def report(dir, idspace):
    """Write the homonym report for the taxonomy stored under tax/<dir>/.

    The report goes to <report_dir>/<dir>-homonym-report.csv; report_dir
    is a module-level setting and is created if it does not exist.
    """
    # Joining with '' leaves a trailing separator on the path.
    source_path = os.path.join('tax', dir, '')
    tax = Taxonomy.getRawTaxonomy(source_path, idspace)

    # tax.smush()
    # HomonymReport.homonymDensityReport(tax, dir + '-density-report.csv')
    # HomonymReport.homonymUncertaintyReport(tax, 'reports/' + dir + '-uncertainty-report.csv')

    if not os.path.isdir(report_dir):
        os.makedirs(report_dir)
    report_path = os.path.join(report_dir, dir + '-homonym-report.csv')
    HomonymReport.homonymReport(tax, report_path)
def load_silva():
    """Load SILVA ('tax/silva/'), rename two taxa, apply patch_silva, return it."""
    silva = Taxonomy.getTaxonomy('tax/silva/', 'silva')

    # Used in studies pg_2448,pg_2783,pg_2753, seen deprecated on 2015-07-20
    renames = [
        ('AF364847', 'Pantoea ananatis LMG 20103'),  # ncbi:706191
        ('EF690403', 'Pantoea ananatis B1-9'),       # ncbi:1048262
    ]
    for accession, new_name in renames:
        silva.taxon(accession).rename(new_name)

    patch_silva(silva)
    return silva
def load_gbif():
    """Load GBIF ('tax/gbif/'), normalise it, apply patch_gbif, return it."""
    gbif = Taxonomy.getTaxonomy('tax/gbif/', 'gbif')
    gbif.smush()

    # In GBIF, if a rank is skipped for some children but not others, that
    # means the rank-skipped children are incertae sedis.  Mark them so.
    gbif.analyzeMajorRankConflicts()

    # creates a Eukaryota node
    fix_basal(gbif)

    animalia = gbif.taxon('Animalia')
    animalia.synonym('Metazoa')

    patch_gbif(gbif)
    return gbif
def load_ncbi():
    """Load NCBI ('tax/ncbi/'), fix and patch it, flag questionable taxa, return it."""
    ncbi = Taxonomy.getTaxonomy('tax/ncbi/', 'ncbi')
    fix_SAR(ncbi)

    viridiplantae = ncbi.taxon('Viridiplantae')
    viridiplantae.rename('Chloroplastida')

    patch_ncbi(ncbi)

    # analyzeOTUs sets flags on questionable taxa ("unclassified",
    # hybrids, and so on) to allow the option of suppression downstream
    ncbi.analyzeOTUs()
    ncbi.analyzeContainers()
    return ncbi
def doit(ott, sep, outpath, conpath):
    """Produce the overall table (outpath) and source breakdown (conpath) for ott.

    sep is accepted but not used in this function.  The Ruggiero conflict
    analysis is currently switched off.
    """
    # Formerly: os.path.isdir('out/ruggiero')
    include_ruggiero = False
    if include_ruggiero:
        ruggiero = Taxonomy.getRawTaxonomy('out/ruggiero/', 'rug')
        # Prepare for conflict analysis
        # oh no, we really need a separation taxonomy to do that.
        by_name = AlignmentByName(ruggiero, ott)
        by_name.align()
        # The resulting analysis object is not used further.
        ConflictAnalysis(ruggiero, ott, by_name, True)

    overall_table(ott, outpath)
    source_breakdown_table(ott, conpath)
def load_h2007():
    """Load the Hibbett 2007 tree from Newick, fix known names, return it."""
    h2007 = Taxonomy.getNewick('feed/h2007/tree.tre', 'h2007')

    # 2014-04-08 Misspelling
    if h2007.maybeTaxon('Chaetothryriomycetidae') is not None:
        h2007.taxon('Chaetothryriomycetidae').rename('Chaetothyriomycetidae')

    # Either correct the misspelled order, or, when it is not present,
    # record the misspelling as a synonym of the correct name.
    if h2007.maybeTaxon('Asteriniales') is not None:
        h2007.taxon('Asteriniales').rename('Asterinales')
    else:
        h2007.taxon('Asterinales').synonym('Asteriniales')

    # h2007/if synonym https://github.com/OpenTreeOfLife/reference-taxonomy/issues/40
    h2007.taxon('Urocystales').synonym('Urocystidales')
    return h2007
def load_worms():
    """Load WoRMS ('tax/worms/'), apply name fixes and flags, return it."""
    worms = Taxonomy.getTaxonomy('tax/worms/', 'worms')
    worms.smush()

    worms.taxon('Biota').rename('life')
    worms.taxon('Animalia').synonym('Metazoa')
    fix_basal(worms)

    # 2015-02-17 According to WoRMS web site.  Occurs in pg_1229
    if worms.maybeTaxon('Scenedesmus communis') is not None:
        worms.taxon('Scenedesmus communis').synonym('Scenedesmus caudata')

    # See NCBI
    worms.taxon('Millericrinida').extant()

    # Help to match up with IRMNG
    worms.taxon('Ochrophyta').synonym('Heterokontophyta')

    worms.smush()   # Gracilimesus gorbunovi, pg_1783
    return worms
# counts number of taxa with rank=family in a given taxon from org.opentreeoflife.taxa import Taxonomy, Rank import argparse parser = argparse.ArgumentParser(description='load nexsons into postgres') parser.add_argument('taxonname', help='name of taxon to count' ) args = parser.parse_args() name = args.taxonname ott_path = '/Users/karen/Documents/opentreeoflife/data/ott/ott2.9draft12/' ott = Taxonomy.getTaxonomy(ott_path, 'ott') def count_families(taxon): count = 0 with open('families.txt','w') as f: for t in taxon.descendants(False): if t.rank == Rank.FAMILY_RANK: f.write("{n}\n".format(n=t.name)) count += 1 f.close() return count print "number families: ",count_families(ott.taxon(name))
def tst(target, source, want):
    """Combine target and source taxonomies and queue the case in `tests`.

    target and source are paths for Taxonomy.getRawTaxonomy.  The tuple
    (target taxonomy, source taxonomy, combined result, want) is appended
    to the module-level list `tests`.
    """
    global tests
    target_tax = Taxonomy.getRawTaxonomy(target, 'target')
    source_tax = Taxonomy.getRawTaxonomy(source, 'source')
    combined = combine(target_tax, source_tax, blustery)
    tests.append((target_tax, source_tax, combined, want))
# Command line argument = file to write to

# Writes a row for every OTT id that
#  (a) occurs in tax/ott/,
#  (b) occurs as an OTU in phylesystem,
#  (c) is sourced only from IRMNG.

import csv, sys

from org.opentreeoflife.taxa import Taxonomy, Rank
from org.opentreeoflife.smasher import UnionTaxonomy

# The union taxonomy is used here only for its table of preferred ids.
union = UnionTaxonomy.newTaxonomy('ott')
union.loadPreferredIds('ids_that_are_otus.tsv', False)
union.loadPreferredIds('ids_in_synthesis.tsv', True)

ott = Taxonomy.getTaxonomy('tax/ott/', 'ott')
#ott = Taxonomy.getTaxonomy('t/tax/aster/', 'ott')

with open(sys.argv[1], 'w') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(['irmng', 'ott', 'name', 'synthesis'])
    for taxon in ott.taxa():
        # if (taxon.rank == Rank.SPECIES_RANK and ...)
        # Exactly one source id, and it is an IRMNG one.
        if (len(taxon.sourceIds) == 1 and
                taxon.sourceIds[0].prefix == 'irmng'):
            probe = union.importantIds.lookupId(taxon.id)
            if probe is not None:
                writer.writerow([taxon.sourceIds[0].id,
                                 taxon.id,
                                 taxon.name,
                                 'synthesis' if probe.inSynthesis else ''])
# Jython script to build the "model village" taxonomy.

from org.opentreeoflife.taxa import Taxonomy
from org.opentreeoflife.smasher import UnionTaxonomy
from claim import Has_child

# The model taxonomy that the sources below are absorbed into.
tax = UnionTaxonomy()

# Homonym-resolution skeleton (not really used here):
# skel = Taxonomy.getTaxonomy('tax/skel/', 'skel')
# tax.setSkeleton(skel)

# --- NCBI subset ---
ncbi = Taxonomy.getTaxonomy('t/tax/ncbi_aster/')
# analyzeOTUs flags questionable taxa ("unclassified" and so on) so that
# they can optionally be suppressed downstream.
ncbi.analyzeOTUs()
tax.absorb(ncbi)

# --- GBIF subset ---
gbif = Taxonomy.getTaxonomy('t/tax/gbif_aster/')
# analyzeMajorRankConflicts sets the "major_rank_conflict" flag when
# intermediate ranks are missing (e.g. a family that's a child of a class).
gbif.analyzeMajorRankConflicts()
tax.absorb(gbif)

# "Old" patch system with tab-delimited files
tax.edit('t/edits/')
import sys from org.opentreeoflife.taxa import Taxonomy, Newick source = sys.argv[1] # Name of directory containing original taxonomy (must end in /) ott = Taxonomy.getRawTaxonomy(source, 'ott') count = 0 grafts = 0 non_tip_grafts = 0 # Seen = seen idspaces among ancestors. # Returns set of seen idspaces. def recur(taxon, seen): global count, grafts, non_tip_grafts count += 1 # idspace (source) of taxon space = taxon.sourceIds.get(0).prefix all = empty() seen_child = adjoin(space, seen) for child in taxon.getChildren(): under = recur(child, seen_child) child_space = child.sourceIds.get(0).prefix if child_space != space: # A graft or resolution. if intersectp(under, seen): # A resolution. print 'resolve', child, taxon, child.rank
def load_irmng():
    """Load IRMNG ('tax/irmng/'), apply the accumulated fixes, return it.

    The body is a dated list of prunes, merges (absorb) and extant/extinct
    markings; each keeps its original rationale comment.
    """
    irmng = Taxonomy.getTaxonomy('tax/irmng/', 'irmng')
    irmng.smush()
    irmng.analyzeMajorRankConflicts()
    fix_basal(irmng)
    irmng.taxon('Animalia').synonym('Metazoa')

    # JAR 2014-04-26 Flush all 'Unaccepted' taxa
    irmng.taxon('Unaccepted', 'life').prune(this_source)

    # Fixes

    # Neopithecus (extinct) occurs in two places.  Flush one, mark the other
    irmng.taxon('1413316').prune(this_source)   #Neopithecus in Mammalia
    irmng.taxon('1413315').extinct()            #Neopithecus in Primates (Pongidae)

    # RR #50
    # irmng.taxon('Saxo-Fridericia').rename('Saxofridericia')
    # irmng.taxon('Saxofridericia').absorb(irmng.taxon('Saxo-fridericia'))
    saxo = irmng.maybeTaxon('1063899')
    if saxo is not None:
        saxo.absorb(irmng.taxon('1071613'))

    # Romina 2014-04-09
    # IRMNG has EIGHT different Trichodermas.  (Four are synonyms of other things.)
    # 1307461 = Trichoderma Persoon 1794, in Hypocreaceae
    # https://github.com/OpenTreeOfLife/reference-taxonomy/issues/86
    irmng.taxon('Hypocrea').absorb(irmng.taxon('1307461'))

    # JAR 2015-06-28
    # The synonym Ochrothallus multipetalus -> Niemeyera multipetala
    # is no good; it interferes with correct processing of Ochrothallus
    # multipetalus.  We could remove the synonym, but instead remove its
    # target because no synonym-removal command is available.
    irmng.taxon('Niemeyera multipetala').prune(this_source)

    tip = irmng.taxon('Tipuloidea', 'Hemiptera')    # irmng:1170022
    if tip is not None:
        tip.prune("about:blank#this-homonym-is-causing-too-much-trouble")
    oph = irmng.taxon('Ophiurina', 'Ophiurinidae')  # irmng:1346026
    if oph is not None:
        oph.prune("about:blank#this-homonym-is-causing-too-much-trouble")

    # NCBI synonymizes Pelecypoda = Bivalvia
    irmng.taxon('Bivalvia').absorb(irmng.taxon('Pelecypoda'))   # bogus order
    # hmm
    irmng.taxon('Bivalvia').extant()

    # This one was mapping to Blattodea, and making it extinct.
    # Caused me a couple of hours of grief.
    # My guess is it's because its unique child Sinogramma is in Blattodea in GBIF.
    # Wikipedia says it's paraphyletic.
    irmng.taxon('Blattoptera', 'Insecta').prune('https://en.wikipedia.org/wiki/Blattoptera')

    # 2015-07-25 Found while trying to figure out why Theraphosidae was marked extinct.
    # NCBI thinks that Theraphosidae and Aviculariidae are the same.
    irmng.taxon('Aviculariidae').extant()

    # 2015-07-25 Extra Dipteras are confusing new division logic.  Barren genus
    irmng.taxon('1323521').prune(this_source)

    # 2015-09-10 This one is unclassified (Diptera) and is leading to confusion with two other Steinias.
    irmng.taxon('1299622').prune(this_source)

    # 2015-09-11 https://github.com/OpenTreeOfLife/feedback/issues/74
    # Lymnea is a snail, not a shark
    irmng.taxon('1317416').prune(this_source)

    # 2015-10-12 JAR checked IRMNG online and this taxon (Ctenophora in Chelicerata) did not exist
    if irmng.maybeTaxon('1279363') is not None:
        irmng.taxon('1279363').prune(this_source)

    return irmng
def load_713():
    """Return the taxonomy stored under 'tax/713/' (idspace 'study713')."""
    return Taxonomy.getTaxonomy('tax/713/', 'study713')
* copied = total number of nodes originating from this source (copied) * aligned = number of source nodes aligned and copied * absorbed = number of source nodes absorbed (not copied) * conflict = number of inconsistent source nodes (not copied) """ def dump_table_as_csv(table, outfile): # Provide CSV form for Pensoft writer = csv.writer(outfile) for row in table: writer.writerow(row) def max_depth(node): m = 0 for child in node.getChildren(): d = max_depth(child) + 1 if d > m: m = d return m if __name__ == '__main__': taxpath = sys.argv[1] seppath = sys.argv[2] outpath = sys.argv[3] # general report, JSON conpath = sys.argv[4] # contributions, CSV sep = Taxonomy.getRawTaxonomy(seppath, 'ott') ott = Taxonomy.getRawTaxonomy(taxpath, 'ott') ott.inferFlags() doit(ott, sep, outpath, conpath)
import sys from org.opentreeoflife.taxa import Taxonomy, Rank ott = Taxonomy.getRawTaxonomy(sys.argv[1], 'ott') # Look for splitting: # Suppose X, Y are distinct in GBIF, but both align to X in NCBI, # because NCBI says Y a synonym of X. # Then we have X in NCBI with GBIF X and Y aligning to it, and # Y a synonym via NCBI but not via GBIF. # So GBIF X is a source for X, and GBIF Y is in sources for Y-synonym of X. for X in ott.taxa(): # Species only if X.rank != Rank.SPECIES_RANK: continue xid = X.sourceIds[0].id # Look for Y, a synonym of X... for Y in X.getSynonyms(): yids = [qid.id for qid in Y.sourceIds] # that has same source as X... if not xid in yids: continue # but, an alignment from Y for yid in yids:
# Command line argument = file to write to

# Writes a row for every OTT id that
#  (a) occurs in tax/ott/,
#  (b) occurs as an OTU in phylesystem,
#  (c) is sourced only from IRMNG.

import csv, sys

from org.opentreeoflife.taxa import Taxonomy, Rank
from org.opentreeoflife.smasher import UnionTaxonomy

# The union taxonomy is used here only for its table of preferred ids.
union = UnionTaxonomy.newTaxonomy('ott')
union.loadPreferredIds('ids_that_are_otus.tsv', False)
union.loadPreferredIds('ids_in_synthesis.tsv', True)

ott = Taxonomy.getTaxonomy('tax/ott/', 'ott')
#ott = Taxonomy.getTaxonomy('t/tax/aster/', 'ott')

with open(sys.argv[1], 'w') as outfile:
    writer = csv.writer(outfile)
    writer.writerow(['irmng','ott','name','synthesis'])
    for taxon in ott.taxa():
        # if (taxon.rank == Rank.SPECIES_RANK and ...)
        # Exactly one source id, and it is an IRMNG one.
        if (len(taxon.sourceIds) == 1 and
                taxon.sourceIds[0].prefix == 'irmng'):
            probe = union.importantIds.lookupId(taxon.id)
            if probe is not None:
                writer.writerow([taxon.sourceIds[0].id,
                                 taxon.id,
                                 taxon.name,
                                 'synthesis' if probe.inSynthesis else ''])
def assemble(): # Create model taxonomy tax = UnionTaxonomy.newTaxonomy('ott') for name in [ 'Pentaphragma ellipticum', 'Lachnophyllum', 'Sipolisia', 'Cicerbita bourgaei', 'Adenophora triphylla', 'Artemisia vulgaris', 'Carlina libanotica', ]: tax.watch(name) # Establish homonym-resolution skeleton (not really used here) # skel = Taxonomy.getTaxonomy('tax/skel/', 'skel') # tax.setSkeleton(skel) # Add NCBI subset to the model taxonomy ncbi = Taxonomy.getTaxonomy('t/tax/ncbi_aster/', 'ncbi') # analyzeOTUs sets flags on questionable taxa ("unclassified" and so on) # to allow the option of suppression downstream ncbi.analyzeOTUs() align_and_merge(tax.alignment(ncbi)) # Add GBIF subset fo the model taxonomy gbif = Taxonomy.getTaxonomy('t/tax/gbif_aster/', 'gbif') gbif.smush() # analyzeMajorRankConflicts sets the "major_rank_conflict" flag when # intermediate ranks are missing (e.g. a family that's a child of a # class) gbif.analyzeMajorRankConflicts() align_and_merge(tax.alignment(gbif)) # "Old" patch system with tab-delimited files TsvEdits.edit(tax, 't/edits/') props = [has_parent(taxon('Phellinaceae'), taxon('Asterales'), 'test:1')] for prop in props: print proclaim(tax, prop) gen = tax.newTaxon("Opentreeia", "genus", "data:testing") gen.take(tax.newTaxon("Opentreeia sp. C", "species", "data:testing")) gen.take(tax.newTaxon("Opentreeia sp. D", "species", "data:testing")) # Example of referring to a taxon fam = tax.maybeTaxon("Phellinaceae") if fam != None: # Example of how you might add a genus to the taxonomy fam.take(gen) # Test deletion feature sp = tax.newTaxon("Opentreeia sp. C", "species", "data:testing") gen.take(sp) sp.prune("aster.py") # tax.loadPreferredIds('ids-that-are-otus.tsv') additions_repo_path = 't/feed/amendments/amendments-0' new_taxa_path = 't/new_taxa' # Assign identifiers to the taxa in the model taxonomy. Identifiers # assigned in the previous version are carried over to this version. 
ids = Taxonomy.getTaxonomy('t/tax/prev_aster/', 'ott') tax.carryOverIds(ids) # performs alignment Addition.processAdditions(additions_repo_path, tax) if False: # too slow for everyday testing purposes. print '-- Checking id list' assign_ids_from_list(tax, 'ott_id_list/by_qid.csv') tax.assignNewIds(new_taxa_path) tax.check() # Write the model taxonomy out to a set of files tax.dump('t/tax/aster/', '\t|\t')
small, big, small_tax.id, small_id) show_interloper(small_node, small_id, ott) else: print '** More than one taxon named %s is in %s' % (small, big) print ' ', small_nodes infile.close() def show_interloper(small_node, small_id, ott): if small_node != small_node.taxon(): print ' %s is a synonym for %s' % (small_node.name, small_node.taxon().name) probe = ott.lookupId(small_id) if probe != None: print ' Id %s belongs to %s' % (small_id, probe) else: print ' (There is no taxon with id %s)' % small_id if __name__ == '__main__': if len(sys.argv) == 3: inclusions = sys.argv[1] taxname = sys.argv[2] else: print 'ignoring supplied args', sys.argv inclusions = 'inclusions.csv' taxname = 'tax/ott/' check(inclusions, Taxonomy.getTaxonomy(taxname, 'ott'))
import sys
from org.opentreeoflife.taxa import Taxonomy
from org.opentreeoflife.smasher import AlignmentByName
from org.opentreeoflife.conflict import ConflictAnalysis

# Load the Ruggiero taxonomy and save it as a single-line Newick file.
rug = Taxonomy.getTaxonomy('scratch/Ruggiero/', 'rug')

with open('scratch/Ruggiero.tre', 'w') as outfile:
    # Newick string followed by a terminating newline.
    outfile.write(rug.toNewick(False))
    outfile.write('\n')
def create_ott(): ott = UnionTaxonomy.newTaxonomy() # There ought to be tests for all of these... for name in names_of_interest: ott.eventlogger.namesOfInterest.add(name) # When lumping, prefer to use ids that have been used in OTU matching # This list could be used for all sorts of purposes... ott.loadPreferredIds('ids-that-are-otus.tsv', False) ott.loadPreferredIds('ids-in-synthesis.tsv', True) ott.setSkeleton(Taxonomy.getTaxonomy('tax/skel/', 'skel')) silva = prepare_silva(ott) ott.absorb(silva) check_invariants(ott) h2007 = prepare_h2007(ott) ott.absorb(h2007) (fungi, fungorum_sans_fungi) = prepare_fungorum(ott) ott.absorb(fungi) check_invariants(ott) # the non-Fungi from Index Fungorum get absorbed below lamiales = prepare_lamiales(ott) ott.absorb(lamiales) (malacostraca, worms_sans_malacostraca) = prepare_worms(ott) ott.absorb(malacostraca) ncbi = prepare_ncbi(ott) align_ncbi_to_silva(ncbi, silva, ott) ott.absorb(ncbi) check_invariants(ott) ott.absorb(worms_sans_malacostraca) ott.absorb(fungorum_sans_fungi) gbif = prepare_gbif(ott) ott.absorb(gbif) irmng = prepare_irmng(ott) ott.absorb(irmng) taxonomies.link_to_h2007(ott) get_default_extinct_info_from_gbif(gbif, ott) check_invariants(ott) # consider try: ... except: print '**** Exception in patch_ott' patch_ott(ott) # Experimental... unextinct_ncbi(ncbi, ott) # Remove all trees but the largest (or make them life incertae sedis) ott.deforestate() # ----------------------------------------------------------------------------- # OTT id assignment # Force some id assignments... will try to automate this in the future. # Most of these come from looking at the otu-deprecated.tsv file after a # series of smasher runs. 
for (inf, sup, id) in [ ('Tipuloidea', 'Diptera', '722875'), ('Saccharomycetes', 'Saccharomycotina', '989999'), ('Phaeosphaeria', 'Ascomycota', '5486272'), ('Synedra acus','Eukaryota','992764'), ('Epiphloea','Halymeniaceae','5342325'), ('Hessea','Archaeplastida','600099'), ('Morganella','Arthropoda','6400'), ('Rhynchonelloidea','Rhynchonellidae','5316010'), ('Epiphloea', 'Lichinales', '5342482'), ('Morganella', 'Fungi', '973932'), ('Parmeliaceae', 'Lecanorales', '305904'), ]: tax = ott.taxon(inf, sup) if tax != None: tax.setId(id) ott.taxonThatContains('Rhynchonelloidea', 'Sphenarina').setId('795939') # NCBI for (ncbi_id, ott_id, name) in ncbi_assignments_list: n = ncbi.maybeTaxon(ncbi_id) if n != None: im = ott.image(n) if im != None: im.setId(ott_id) else: print '** NCBI %s not mapped - %s' % (ncbi_id, name) else: print '** No NCBI taxon %s - %s' % (ncbi_id, name) # Cylindrocarpon is now Neonectria ott.image(gbif.taxon('2563163')).setId('51754') # Foo trich = fungi.maybeTaxon('Trichosporon') if trich != None: ott.image(trich).setId('364222') #ott.image(fungi.taxon('11060')).setId('4107132') #Cryptococcus - a total mess # Assign OTT ids to taxa that don't have them, re-using old ids when possible ids = Taxonomy.getTaxonomy('tax/prev_ott/') # Assign old ids to nodes in the new version ott.assignIds(ids) report_on_h2007(h2007, ott) return ott
import sys, codecs
from org.opentreeoflife.taxa import Taxonomy, Newick

# Usage: <source taxonomy dir> <taxon name> <destination>
source = sys.argv[1]    # Name of directory containing original taxonomy (must end in /)
name = sys.argv[2]      # Name of taxon to extract
dest = sys.argv[3]      # Directory to store result (must end in /)

# The destination is either a taxonomy directory ('/') or a Newick file ('.tre').
if not dest.endswith(('/', '.tre')):
    print >>sys.stderr, 'Invalid taxonomy destination (need / or .tre)', dest
    sys.exit(1)

selection = Taxonomy.getRawTaxonomy(source, 'foo').select(name)

if not dest.endswith('.tre'):
    selection.dump(dest)
else:
    with codecs.open(dest, 'w', 'utf-8') as outfile:
        outfile.write(Newick.toNewick(selection, Newick.USE_NAMES_AND_IDS))
        outfile.write('\n')
def assemble(): # Create model taxonomy tax = UnionTaxonomy.newTaxonomy('ott') for name in ['Pentaphragma ellipticum', 'Lachnophyllum', 'Sipolisia', 'Cicerbita bourgaei', 'Adenophora triphylla', 'Artemisia vulgaris', 'Carlina libanotica', ]: tax.watch(name) # Establish homonym-resolution skeleton (not really used here) # skel = Taxonomy.getTaxonomy('tax/skel/', 'skel') # tax.setSkeleton(skel) # Add NCBI subset to the model taxonomy ncbi = Taxonomy.getTaxonomy('t/tax/ncbi_aster/', 'ncbi') # analyzeOTUs sets flags on questionable taxa ("unclassified" and so on) # to allow the option of suppression downstream ncbi.analyzeOTUs() align_and_merge(tax.alignment(ncbi)) # Add GBIF subset fo the model taxonomy gbif = Taxonomy.getTaxonomy('t/tax/gbif_aster/', 'gbif') gbif.smush() # analyzeMajorRankConflicts sets the "major_rank_conflict" flag when # intermediate ranks are missing (e.g. a family that's a child of a # class) gbif.analyzeMajorRankConflicts() align_and_merge(tax.alignment(gbif)) # "Old" patch system with tab-delimited files TsvEdits.edit(tax, 't/edits/') props = [ has_parent(taxon('Phellinaceae'), taxon('Asterales'), 'test:1') ] for prop in props: print proclaim(tax, prop) gen = tax.newTaxon("Opentreeia", "genus", "data:testing") gen.take(tax.newTaxon("Opentreeia sp. C", "species", "data:testing")) gen.take(tax.newTaxon("Opentreeia sp. D", "species", "data:testing")) # Example of referring to a taxon fam = tax.maybeTaxon("Phellinaceae") if fam != None: # Example of how you might add a genus to the taxonomy fam.take(gen) # Test deletion feature sp = tax.newTaxon("Opentreeia sp. C", "species", "data:testing") gen.take(sp) sp.prune("aster.py") # tax.loadPreferredIds('ids-that-are-otus.tsv') additions_repo_path = 't/feed/amendments/amendments-0' new_taxa_path = 't/new_taxa' # Assign identifiers to the taxa in the model taxonomy. Identifiers # assigned in the previous version are carried over to this version. 
ids = Taxonomy.getTaxonomy('t/tax/prev_aster/', 'ott') tax.carryOverIds(ids) # performs alignment Addition.processAdditions(additions_repo_path, tax) if False: # too slow for everyday testing purposes. print '-- Checking id list' assign_ids_from_list(tax, 'ott_id_list/by_qid.csv') tax.assignNewIds(new_taxa_path) tax.check() # Write the model taxonomy out to a set of files tax.dump('t/tax/aster/', '\t|\t')
import sys, os, csv
from org.opentreeoflife.taxa import Taxonomy, SourceTaxonomy, Taxon
from org.opentreeoflife.smasher import UnionTaxonomy

dwh = UnionTaxonomy.newTaxonomy('dwh')

# Tell smasher which separation taxonomy to use.
separation = Taxonomy.getTaxonomy('tax/separation/', 'separation')
dwh.setSkeleton(separation)

# Source list, as numbered by the original author:
#  1. trunk
#  2. ictv
#  3. IOC
#  4. ASW
#  5. ODO
#  6. BOM
#  7. ERE
#  8. ONY
#  9. EET
# 10. NCBI
# 11. WOR
# 12. CLP
# 13. COL

# Load the source taxonomies.
trunk = Taxonomy.getTaxonomy(
    't/tax/2018_12/dynamichierarchytrunk2018-11-21/', 'trunk')
ictv = Taxonomy.getTaxonomy(
    't/tax/2018_12/ICTV-virus_taxonomy-with-higherClassification/', 'ictv')
IOC = Taxonomy.getTaxonomy(
    't/tax/2018_12/ioc-birdlist/', 'IOC')
# One-off script prepared to provide data to David Hibbett and Romina Gazis. # Lists numbers of species in each fungal order. from org.opentreeoflife.taxa import Taxonomy import csv, sys from taxonomies import load_fung, load_ncbi, load_gbif, load_irmng taxonomies = [('fung', load_fung(), 'Index Fungorum'), ('ncbi', load_ncbi(), 'NCBI'), ('gbif', load_gbif(), 'GBIF'), ('irmng', load_irmng(), 'IRMNG'), ('ott', Taxonomy.getTaxonomy('tax/ott/'), 'OTT 2.9'), ] def main(): infile = open('order-counts-orders.csv', 'r') reader = csv.reader(infile) reader.next() #header row taxa = ['Fungi'] for tuple in reader: taxa.append(tuple[0]) infile.close() write_counts(taxa) def write_counts(taxa): outfile = open('order-counts.csv', 'w') writer = csv.writer(outfile) header = ['order'] for (name, taxonomy, label) in taxonomies: header += [label + ' bin', label + ' sp', label + ' tip']
* conflict = number of inconsistent source nodes (not copied) """ def dump_table_as_csv(table, outfile): # Provide CSV form for Pensoft writer = csv.writer(outfile) for row in table: writer.writerow(row) def max_depth(node): m = 0 for child in node.getChildren(): d = max_depth(child) + 1 if d > m: m = d return m if __name__ == '__main__': taxpath = sys.argv[1] seppath = sys.argv[2] outpath = sys.argv[3] # general report, JSON conpath = sys.argv[4] # contributions, CSV sep = Taxonomy.getRawTaxonomy(seppath, 'ott') ott = Taxonomy.getRawTaxonomy(taxpath, 'ott') ott.inferFlags() doit(ott, sep, outpath, conpath)
def create_ott(ott_spec): # Fail fast additions_clone_path = os.path.join(access_head('amendments'), 'amendments-1') if not os.path.isdir(additions_clone_path): print '# cannot find', additions_clone_path sys.exit(1) with open(os.path.join(access_head('idlist'), 'by_qid.csv'), 'r') as infile: print '# can access idlist' ott_path = management.source_path(ott_spec) ott = UnionTaxonomy.newTaxonomy('ott') # Would be nice if there were tests for all of these... for name in names_of_interest: ott.eventLogger.namesOfInterest.add(name) ott.setSkeleton(Taxonomy.getTaxonomy('curation/separation/', 'separation')) # These are particularly hard cases; create alignment targets up front adjustments.deal_with_polysemies(ott) # Align and merge each source in sequence merge_sources(ott) # "Old" patch system TsvEdits.edit(ott, 'curation/edits/') # consider try: ... except: print '**** Exception in patch_ott' amendments.patch_ott(ott) # End of topology changes. Now assign ids. retain_ids(ott, access_source('ott-PREVIOUS'), os.path.join(access_head('idlist'), 'by_qid.csv')) # Apply the additions (which already have ids assigned). # This has to happen *after* ids are assigned, since additions use OTT # ids to identify parents. print '-- Processing additions --' Addition.processAdditions(additions_clone_path, ott) # Mint ids for new nodes print '-- Minting new ids --' ott.assignNewIds(new_taxa_path) # Remove all trees but the largest (or make them life incertae sedis) ott.deforestate() # data structure integrity checks ott.check() # For deprecated id report (dump) ott.loadPreferredIds('ids_that_are_otus.tsv', False) ott.loadPreferredIds('ids_in_synthesis.tsv', True) ott.dump(ott_path) record_ott_sources(ott_spec) return ott
import sys, os, csv
from org.opentreeoflife.taxa import Taxonomy, SourceTaxonomy, Taxon
from org.opentreeoflife.smasher import UnionTaxonomy

dwh = UnionTaxonomy.newTaxonomy('dwh')

# Tell smasher which separation taxonomy to use.
separation = Taxonomy.getTaxonomy('tax/separation/', 'separation')
dwh.setSkeleton(separation)

# Source list, as numbered by the original author:
#  1. trunk
#  2. ictv
#  3. IOC
#  4. ASW
#  5. ODO
#  6. BOM
#  7. ERE
#  8. ONY
#  9. EET
# 10. NCBI
# 11. WOR
# 12. CLP
# 13. COL

# Load the source taxonomies; each directory is named after its idspace.
trunk = Taxonomy.getTaxonomy('t/tax/2018_12/trunk/', 'trunk')
ictv = Taxonomy.getTaxonomy('t/tax/2018_12/ictv/', 'ictv')
IOC = Taxonomy.getTaxonomy('t/tax/2018_12/IOC/', 'IOC')
ASW = Taxonomy.getTaxonomy('t/tax/2018_12/ASW/', 'ASW')
ODO = Taxonomy.getTaxonomy('t/tax/2018_12/ODO/', 'ODO')
def merge_sources(ott):
    """Align each source taxonomy to ott and merge it in, one after another.

    Sources are processed in this order: SILVA, Hibbett 2007, the Fungi
    part of Index Fungorum, the Lamiales study tree, the Malacostraca,
    Cnidaria and Mollusca parts of WoRMS, NCBI, the rest of WoRMS, GBIF,
    and IRMNG.  Reporting and a few id fixups are interleaved with the
    merges.  ott is modified in place; nothing is returned.
    """

    # Genbank - this is a kludge to make sure it's in the dependencies list.
    # But eventually it ought to be handled in this file, not in the silva
    # import script.
    access_head('genbank')

    # SILVA
    silva = load_taxonomy('silva')
    adjustments.adjust_silva(silva)
    silva_to_ott = adjustments.align_silva(silva, ott)
    align_and_merge(silva_to_ott)

    # Hibbett 2007
    h2007 = Taxonomy.getTaxonomy('curation/h2007/tree.tre', 'h2007')
    adjustments.adjust_h2007(h2007)
    h2007_to_ott = ott.alignment(h2007)
    align_and_merge(h2007_to_ott)

    # Index Fungorum: only the Fungi subtree is merged at this point
    fungorum = load_taxonomy('fung')
    adjustments.adjust_fung(fungorum)
    (fungi, fungorum_sans_fungi) = split_taxonomy(fungorum, 'Fungi')
    align_and_merge(adjustments.align_fungi(fungi, ott))

    # Connect IF (Index Fungorum) families to Hibbett 2007 orders
    adjustments.link_to_h2007(ott)

    # Look for orders that have no children in OTT
    report_on_h2007(h2007, h2007_to_ott, '#')

    # The non-Fungi from Index Fungorum would be absorbed further down;
    # that merge is currently commented out (see below).

    # Lamiales study tree (id space 'study713')
    lamiales = Taxonomy.getTaxonomy('curation/lamiales/', 'study713')
    adjustments.adjust_lamiales(lamiales)
    align_and_merge(adjustments.align_lamiales(lamiales, ott))

    # WoRMS
    # WoRMS gets higher priority than NCBI for Malacostraca, Cnidaria and
    # Mollusca, so those clades are split out of WoRMS and absorbed
    # before NCBI.
    worms = load_taxonomy('worms')
    adjustments.adjust_worms(worms)

    # Malacostraca instead of Decapoda because Malacostraca is in the
    # separation taxonomy
    (malacostraca, worms_sans_malacostraca) = split_taxonomy(worms, 'Malacostraca')
    align_and_merge(ott.alignment(malacostraca))
    (cnidaria, worms_sans_cnidaria) = split_taxonomy(worms_sans_malacostraca, 'Cnidaria')
    align_and_merge(ott.alignment(cnidaria))
    (mollusca, low_priority_worms) = split_taxonomy(worms_sans_cnidaria, 'Mollusca')
    align_and_merge(ott.alignment(mollusca))

    # NCBI
    ncbi = load_taxonomy('ncbi')
    adjustments.adjust_ncbi(ncbi)

    # analyzeOTUs sets flags on questionable taxa (hybrid, metagenomes,
    # etc) to allow the option of suppression downstream
    ncbi.analyzeOTUs()

    ncbi_to_ott = adjustments.align_ncbi(ncbi, silva, ott)
    align_and_merge(ncbi_to_ott)

    # Look for orders that have no children in OTT (repeated now that
    # NCBI has been merged)
    report_on_h2007(h2007, h2007_to_ott, '#')

    # Reporting
    # Get mapping from NCBI to OTT, derived via SILVA and Genbank.
    mappings = load_ncbi_to_silva(
        os.path.join(management.resource_path('silva'), 'ncbi_to_silva.tsv'),
        ncbi, silva, silva_to_ott)
    compare_ncbi_to_silva(mappings, silva_to_ott)

    # Low-priority WoRMS (everything left after the three splits above)
    # This is suboptimal, but the names are confusing the division logic
    a = adjustments.align_worms(low_priority_worms, ott)
    align_and_merge(a)

    # The rest of Index Fungorum.  (Maybe not a good idea to use this.
    # These taxa are all in GBIF.)
    # align_and_merge(adjustments.align_fungorum_sans_fungi(fungorum_sans_fungi, ott))

    # GBIF
    gbif = load_taxonomy('gbif')
    adjustments.adjust_gbif(gbif)
    gbif_to_ott = adjustments.align_gbif(gbif, ott)
    align_and_merge(gbif_to_ott)

    # http://dx.doi.org/10.1016/j.ympev.2004.12.019 "Eccrinales
    # (Trichomycetes) are not fungi, but a clade of protists at the
    # early divergence of animals and fungi"
    debug_divisions('Enterobryus cingaloboli', gbif, ott)

    # Cylindrocarpon is now Neonectria: pin the id of the OTT node that
    # GBIF's Cylindrocarpon aligned to, if there is one
    cyl = gbif_to_ott.image(gbif.taxon('Cylindrocarpon', 'Ascomycota'))
    if cyl != None: cyl.setId('51754')

    # IRMNG
    # Note the order: the alignment is computed first, then hide_irmng
    # is applied to the source, and only then is it merged.
    irmng = load_taxonomy('irmng')
    adjustments.adjust_irmng(irmng)
    a = adjustments.align_irmng(irmng, ott)
    hide_irmng(irmng)
    align_and_merge(a)

    # Misc fixups
    report_on_h2007(h2007, h2007_to_ott, '**')

    # NOTE(review): going by the helper's name, this takes default extinct
    # flags from GBIF's paleo.tsv - confirm against its definition.
    get_default_extinct_info_from_gbif(
        os.path.join(management.resource_path('gbif'), 'paleo.tsv'),
        gbif, gbif_to_ott)
# Requires python.security.respectJavaAccessibility = false # on java command line or in .jython from org.opentreeoflife.taxa import Taxonomy from org.opentreeoflife.smasher import UnionTaxonomy, HomonymReport union = UnionTaxonomy() skel = Taxonomy.getTaxonomy('tax/skel/', 'skel') union.setSkeleton(skel) def report(tax, tag): union.markDivisionsFromSkeleton(tax, skel) HomonymReport.homonymReport(tax, 'reports/' + tag + '-homonym-report.tsv') if True: ott = Taxonomy.getTaxonomy('tax/ott/', 'ott') report(ott, 'ott') else: import taxonomies report(taxonomies.loadSilva(), 'silva') report(taxonomies.loadH2007(), 'h2007') report(taxonomies.loadFung(), 'worms') report(taxonomies.loadFung(), 'if') report(taxonomies.loadNcbi(), 'ncbi') report(taxonomies.loadGbif(), 'gbif') report(taxonomies.loadIrmng(), 'irmng')
# Counts the taxa with rank=family among the descendants of a given taxon,
# and writes their names, one per line, to families.txt.

from org.opentreeoflife.taxa import Taxonomy, Rank
import argparse

# The description used to read 'load nexsons into postgres', which was left
# over from another script and made --help misleading.
parser = argparse.ArgumentParser(
    description='count the taxa with rank=family within a given taxon')
parser.add_argument('taxonname', help='name of taxon to count')
args = parser.parse_args()

name = args.taxonname
ott_path = '/Users/karen/Documents/opentreeoflife/data/ott/ott2.9draft12/'
ott = Taxonomy.getTaxonomy(ott_path, 'ott')

def count_families(taxon):
    """Write family-rank descendants of taxon to families.txt; return how many.

    families.txt is created in the current directory and overwritten on
    every call.  taxon.descendants(False) supplies the nodes to examine
    (the False argument presumably excludes taxon itself - confirm).
    """
    count = 0
    # The with statement closes the file, so no explicit close() is needed.
    with open('families.txt', 'w') as f:
        for t in taxon.descendants(False):
            if t.rank == Rank.FAMILY_RANK:
                f.write("{n}\n".format(n=t.name))
                count += 1
    return count

root = ott.taxon(name)
if root is None:
    # Without this check a failed lookup surfaced as an AttributeError
    # inside count_families.
    parser.error('no unique taxon named %s in %s' % (name, ott_path))

# Same output as the old print statement (two spaces after the colon).
print("number families:  %s" % count_families(root))
def load_taxonomy(spec):
    """Load the source taxonomy identified by spec.

    The location comes from access_head(spec) and the id space from the
    spec's "ott_idspace" property.
    """
    location = access_head(spec)
    idspace = management.get_property(spec, "ott_idspace")
    return Taxonomy.getTaxonomy(location, idspace)
small = row[0] big = row[1] small_id = row[2] small_tax = ott.maybeTaxon(small_id) if small_tax == None: small_tax = ott.maybeTaxon(small) if small_tax == None: print '** No unique taxon with id %s or name %s' % (small_id, small) else: print '** %s is %s, not %s' % (small, small_tax.id, small_id) else: look = ott.maybeTaxon(small, big) if look == None: print '** %s=%s not under %s' % (small, small_id, big) small_tax.show() elif look != small_tax: print '** The %s that descends from %s is %s, not %s' % (small, big, look.id, small_id) if small_tax.isHidden(): print '%s (%s) is hidden' % (small, small_id) infile.close() if __name__ == '__main__': taxname = 'tax/ott/' if len(sys.argv) > 1: taxname = sys.argv[1] else: print sys.argv check(Taxonomy.getTaxonomy(taxname))
def _assign_ncbi_ids(ott):
    """Apply the ad hoc OTT id assignments that are keyed by NCBI id."""
    for (ncbi_id, ott_id, name) in ncbi_ott_assignments.ncbi_assignments_list:
        im = ott.lookupQid(QualifiedId('ncbi', ncbi_id))
        if im is None:
            print('* ncbi:%s not found in OTT - %s' % (ncbi_id, name))
            continue
        if im.name != name:
            # Report the mismatch but still assign the id.
            print('* ncbi:%s name is %s, but expected %s' % (ncbi_id, im.name, name))
        im.addId(ott_id)


def _force_ids(ott):
    """Pin OTT ids on specific taxa, identified by name plus an ancestor."""
    # Force some id assignments... will try to automate this in the future.
    # Most of these come from looking at the deprecated.tsv file after a
    # series of smasher runs.
    forced = [
        ('Tipuloidea', 'Diptera', '722875'),
        ('Saccharomycetes', 'Saccharomycotina', '989999'),
        ('Phaeosphaeria', 'Ascomycota', '5486272'),
        ('Synedra acus', 'Eukaryota', '992764'),
        ('Hessea', 'Archaeplastida', '600099'),
        ('Morganella', 'Arthropoda', '6400'),
        ('Rhynchonelloidea', 'Rhynchonellidae', '5316010'),
        ('Morganella', 'Fungi', '973932'),
        ('Parmeliaceae', 'Lecanorales', '305904'),
        ('Cordana', 'Ascomycota', '946160'),
        ('Pseudofusarium', 'Ascomycota', '655794'),
        ('Marssonina', 'Dermateaceae', '372158'),  # ncbi:324777
        ('Marssonia', 'Lamiales', '5512668'),  # gbif:7268388
        # ('Gloeosporium', 'Pezizomycotina', '75019'), # synonym for Marssonina
        ('Escherichia coli', 'Enterobacteriaceae', '474506'),  # ncbi:562
        # ('Dischloridium', 'Trichocomaceae', '895423'),
        ('Exaiptasia pallida', 'Cnidaria', '135923'),
        ('Choanoflagellida', 'Holozoa', '202765'),
        ('Billardiera', 'Lamiales', '798963'),
        ('Trachelomonas grandis', 'Bacteria', '58035'),  # study ot_91 Tr46259
        ('Hypomyzostoma', 'Myzostomida', '552744'),  # was incorrectly in Annelida
        ('Gyromitus', 'SAR', '696946'),
        ('Pseudogymnoascus destructans', 'Pezizomycotina', '428163'),
        # ('Amycolicicoccus subflavus', 'Mycobacteriaceae', '541768'), # ncbi:639313
        # ('Pohlia', 'Foraminifera', '5325989') - NO
        ('Pohlia', 'Amphibia', '5325989'),  # irmng:1311321
        ('Phyllanthus', 'Pentapetalae', '452944'),  # pg_25 @josephwb = 5509975
    ]
    for (inf, sup, forced_id) in forced:
        tax = ott.maybeTaxon(inf, sup)
        if tax is not None:
            tax.setId(forced_id)

    # Phyllanthus gets a second id.  Id 452944 exists only if the
    # Phyllanthus entry above was found, so guard the lookup instead of
    # dereferencing a possible None.
    phyllanthus = ott.taxon('452944')
    if phyllanthus is not None:
        phyllanthus.addId('5509975')
    else:
        print('* no taxon with id 452944, so id 5509975 was not added')

    # ott.taxon('474506') ...

    # NCBI.  Guarded the same way as the Trichosporon lookup below.
    rhynch = ott.taxonThatContains('Rhynchonelloidea', 'Sphenarina')
    if rhynch is not None:
        rhynch.setId('795939')
    else:
        print('* no Rhynchonelloidea containing Sphenarina, so id 795939 was not set')

    # Trichosporon is a mess, because it occurs 3 times in NCBI.
    trich = ott.taxonThatContains('Trichosporon', 'Trichosporon cutaneum')
    if trich is not None:
        trich.setId('364222')

    #ott.image(fungi.taxon('11060')).setId('4107132') #Cryptococcus - a total mess


def _load_previous_ids(prev_path):
    """Load the previous OTT as a raw taxonomy and patch it for id reuse."""
    ids = Taxonomy.getRawTaxonomy(prev_path, 'ott')

    # Edit the id source taxonomy to optimize id coverage

    # Kludge to undo lossage in OTT 2.9: where the first two source ids
    # are ncbi followed by silva, drop the leading ncbi one.
    for taxon in ids.taxa():
        if (len(taxon.sourceIds) >= 2 and
                taxon.sourceIds[0].prefix == "ncbi" and
                taxon.sourceIds[1].prefix == "silva"):
            taxon.sourceIds.remove(taxon.sourceIds[0])

    # OTT 2.9 has both Glaucophyta and Glaucophyceae...
    # this creates an ambiguity when aligning.
    # Need to review this; maybe they *should* be separate taxa.
    g1 = ids.maybeTaxon('Glaucophyta')
    g2 = ids.maybeTaxon('Glaucophyceae')
    if g1 is not None and g2 is not None and g1 != g2:
        g1.absorb(g2)

    return ids


def retain_ids(ott, prev_path, by_qid):
    """Assign OTT ids to the nodes of ott, reusing earlier ids where possible.

    ott       -- the union taxonomy being built; modified in place
    prev_path -- location of the previous OTT, loaded as the id source
    by_qid    -- path of the id list passed on to retain_ids_from_list

    The steps run in a fixed order: ad hoc assignments based on NCBI id,
    forced assignments by name, carry-over of ids from the previous OTT,
    and finally the check against the id list.
    """
    # ad hoc assignments specifically for NCBI taxa, based on NCBI id
    _assign_ncbi_ids(ott)

    _force_ids(ott)

    # --------------------
    # Assign OTT ids to taxa that don't have them, re-using old ids when possible
    ids = _load_previous_ids(prev_path)

    # Assign old ids to nodes in the new version
    ott.carryOverIds(ids)     # Align & copy ids

    print('-- Checking id list')
    retain_ids_from_list(ott, by_qid)
small_node = small_nodes[0] small_tax = small_node.taxon() if small_id != '' and small_tax != small_id_tax: print '** The id of %s in %s is %s (expected %s)' % (small, big, small_tax.id, small_id) show_interloper(small_node, small_id, ott) else: print '** More than one taxon named %s is in %s' % (small, big) print ' ', small_nodes infile.close() def show_interloper(small_node, small_id, ott): if small_node != small_node.taxon(): print ' %s is a synonym for %s' % (small_node.name, small_node.taxon().name) probe = ott.lookupId(small_id) if probe != None: print ' Id %s belongs to %s' % (small_id, probe) else: print ' (There is no taxon with id %s)' % small_id if __name__ == '__main__': if len(sys.argv) == 3: inclusions = sys.argv[1] taxname = sys.argv[2] else: print 'ignoring supplied args', sys.argv inclusions = 'inclusions.csv' taxname = 'tax/ott/' check(inclusions, Taxonomy.getTaxonomy(taxname, 'ott'))