def main():
    """Prune a tree so it only keeps taxa that occur in a Phyml dataset.

    Command line: ``[-v] input_file input_tree new_file``.

    The tree is loaded with ``stk.import_tree``; every taxon in it that is
    not returned by ``stk.get_all_taxa`` for the Phyml is handed to
    ``stk._sub_taxa_in_tree``.  The resulting tree is written to ``new_file``
    through ``writeNexus`` and the taxa left in it are printed, sorted, one
    per line.
    """
    arg_parser = argparse.ArgumentParser(
        prog="Remove taxa taxonomy",
        description="""Removes taxa from a taxonomy (or indeed any) tree that aren't in a dataset""",
    )
    arg_parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        help="Verbose output: mainly progress reports.",
        default=False,
    )
    arg_parser.add_argument(
        'input_file', metavar='input_file', nargs=1,
        help="Your input Phyml",
    )
    arg_parser.add_argument(
        'input_tree', metavar='input_tree', nargs=1,
        help="Your input tree files",
    )
    arg_parser.add_argument(
        'new_file', metavar='new_file', nargs=1,
        help="The new tree file",
    )

    options = arg_parser.parse_args()
    verbose = options.verbose  # accepted on the command line, not used below
    # nargs=1 wraps each positional value in a one-element list
    (phyml_path,) = options.input_file
    (tree_path,) = options.input_tree
    (output_path,) = options.new_file

    # taxa currently in the tree
    original_tree = stk.import_tree(tree_path)
    names_in_tree = stk._getTaxaFromNewick(original_tree)

    # taxa present in the dataset
    dataset_taxa = stk.get_all_taxa(stk.load_phyml(phyml_path))

    # anything in the tree that the dataset does not know about goes
    unwanted = [name for name in names_in_tree if name not in dataset_taxa]

    pruned_tree = stk._sub_taxa_in_tree(original_tree, unwanted)
    stk._parse_tree(pruned_tree).writeNexus(fName=output_path)

    # report what is left
    for name in sorted(stk._getTaxaFromNewick(pruned_tree)):
        print(name)
def main():
    """Write an iTOL colour-strip dataset that colours taxa by taxonomic group.

    Command line:
    ``[-v] [--scheme S] [--level L] [--tree] input_file input_taxonomy output_file``.

    ``input_file`` is read as a tree when ``--tree`` is given (taxa keep the
    order in which they appear in the tree string), as a Phyml when its
    extension is ``.phyml``, and otherwise as a plain list with one taxon per
    line (blank lines are ignored).

    ``input_taxonomy`` is a CSV file whose first row names the columns; the
    column matching ``--level`` (case-insensitive) supplies each taxon's
    group, keyed by the first column with spaces replaced by underscores.

    One colour per distinct group is requested from ``get_colours``.  Taxa
    that are missing from the taxonomy, or whose group is ``"-"``, are
    written in black.

    Exits with status 1 if the requested level is not a column of the
    taxonomy file.
    """
    parser = argparse.ArgumentParser(
        prog="Create colours for iTOL",
        description="Generate a asthetically pleasing colour scheme for iToL based"
        + " on a Phyml and a taxonomy csv file",
    )
    parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        help="Verbose output: mainly progress reports.",
        default=False,
    )
    parser.add_argument(
        '--scheme',
        choices=['pastel', 'bright', 'dark', 'faded'],
        default='pastel',
        help="Choose a colour scheme",
    )
    parser.add_argument(
        '--level',
        choices=[
            'Genus', 'Subfamily', 'Family', 'Superfamily', 'Infraorder',
            'Suborder', 'Order'
        ],
        default='Family',
        help="Which taxonomic level to colour at. Note that not all will return data. Family and Order will always work.",
    )
    parser.add_argument(
        '--tree',
        help="Give a tree to colour and the colour will go around the tree, rather than be sorted alphabetically",
        action='store_true',
        default=False,
    )
    parser.add_argument(
        'input_file', metavar='input_file', nargs=1,
        help="Your Phyml *or* a taxa lis *or* a tree file (use --tree in this case)t",
    )
    parser.add_argument(
        'input_taxonomy', metavar='input_taxonomy', nargs=1,
        help="Your taxonomy",
    )
    parser.add_argument(
        'output_file', metavar='output_file', nargs=1,
        help="The output file for iToL",
    )
    args = parser.parse_args()
    level = args.level
    scheme = args.scheme
    input_file = args.input_file[0]
    input_taxonomy = args.input_taxonomy[0]
    output_file = args.output_file[0]
    tree = args.tree

    # (saturation, value) handed to get_colours for each scheme
    scheme_settings = {
        'pastel': (0.5, 0.95),
        'bright': (0.99, 0.99),
        'dark': (0.6, 0.8),
        'faded': (0.25, 0.8),
    }
    saturation, value = scheme_settings.get(scheme, scheme_settings['pastel'])

    if tree:
        tree_data = stk.import_tree(input_file)
        # rather than simply grabbing taxa, go through in "tree order":
        # strip the tree punctuation and split on the commas that are left
        for symbol in "();":
            tree_data = tree_data.replace(symbol, "")
        taxa = [name.strip() for name in tree_data.split(",")]
    elif os.path.splitext(input_file)[1] == '.phyml':
        print("Parsing PHYML")
        XML = stk.load_phyml(input_file)
        taxa = stk.get_all_taxa(XML)
    else:
        # plain taxa list; blank lines would otherwise become empty taxa
        with open(input_file, "r") as handle:
            taxa = [line.strip() for line in handle if line.strip()]

    print(len(taxa))

    taxonomy = {}
    with open(input_taxonomy, 'r') as handle:
        reader = csv.reader(handle)
        header = next(reader, None)
        if header is not None:
            # find index of the level required
            wanted = level.lower()
            for index, column in enumerate(header):
                if column.lower() == wanted:
                    break
            else:
                print("Error finding the desired level in your taxonomy file.")
                print("You asked for: " + level.lower())
                print("Your taxonomy contains: " + " ".join(header))
                sys.exit(1)  # non-zero so callers can detect the failure
            for row in reader:
                if len(row) <= index:
                    # blank or short row: nothing to record for this level
                    continue
                # the replace is to make sure we're consistent across
                # PHYML, tree and taxonomy
                taxonomy[row[0].replace(" ", "_")] = row[index]

    print(len(taxonomy))

    # one colour per distinct group
    values = _uniquify(list(taxonomy.values()))
    colours = get_colours(len(values), format="HEX",
                          saturation=saturation, value=value)
    output_colours = {}
    for position, group in enumerate(values):
        output_colours[group] = colours[position]

    border = "#=================================================================#"
    header_lines = [
        "DATASET_COLORSTRIP",
        border,
        "#                    MANDATORY SETTINGS                           #",
        border,
        "SEPARATOR COMMA",
        "",
        "#label is used in the legend table (can be changed later)",
        "DATASET_LABEL," + level,
        "",
        "#dataset color (can be changed later)",
        "COLOR,#000000",
        "",
        border,
        "#                    OPTIONAL SETTINGS                            #",
        border,
        "COLOR_BRANCHES,1",
        "",
        border,
        "#       Actual data follows after the \"DATA\" keyword              #",
        border,
        "DATA",
    ]

    data_lines = []
    for t in taxa:
        group = taxonomy.get(t.replace(" ", "_"))
        colour = output_colours.get(group)
        if group is None or group == "-" or colour is None:
            # unknown taxon or no data at this level: black
            data_lines.append(t + ",#000000\n")
        else:
            data_lines.append(t + ",#" + colour + "," + group + "\n")

    with open(output_file, "w") as out:
        out.write("\n".join(header_lines) + "\n")
        out.writelines(data_lines)
def _sort_taxa_by_taxonomy(all_taxa, taxonomy, level):
    """Return the taxa regrouped so members of one taxonomic group are adjacent.

    ``taxonomy`` is indexed as ``taxonomy[name][level]`` where ``name`` is the
    taxon with underscores replaced by spaces.  Taxa whose entry at ``level``
    is the empty string are dropped; taxa that raise ``KeyError`` are
    reported and placed in the group ``'zzzzz'``.  Groups are emitted in
    sorted order.  The group names and the cumulative group sizes are
    printed.
    """
    tax_data = {}
    for t in all_taxa:
        try:
            group = taxonomy[t.replace("_", " ")][level]
        except KeyError:
            print("Couldn't find " + t + " in taxonomy. Adding as null data")
            group = 'zzzzz'  # placeholder chosen so it sorts at the end
        if group == "":
            # no data at this level: leave the taxon out of the plot
            continue
        tax_data[t] = group

    # collect the members of each group in a single pass
    members = {}
    for t in tax_data:
        members.setdefault(tax_data[t], []).append(t)

    unique = sorted(members)
    print("Groups are:")
    print(unique)

    ordered = []
    count_cumulate = []
    for u in unique:
        ordered.extend(members[u])
        # running total; also well defined when there are no groups at all
        count_cumulate.append(len(ordered))
    print(count_cumulate)
    return ordered


def main():
    """Plot which taxa occur in which source trees as a scatter matrix.

    Command line: ``[-v] [-t TAXONOMY] [--level L] input_file output_file``.

    Taxa run along the x axis, optionally grouped by the chosen level of an
    STK taxonomy file.  Trees run along the y axis, ordered by how many of
    the plotted taxa they contain, largest first.  The figure is saved to
    ``output_file``.
    """
    parser = argparse.ArgumentParser(
        prog="plot tree-taxa matrix",
        description="""Plot a matrix of trees against taxa""",
    )
    parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        help="Verbose output: mainly progress reports.",
        default=False,
    )
    parser.add_argument(
        '-t', '--taxonomy',
        help="Use taxonomy to sort the taxa on the axis. Supply a STK taxonomy file",
    )
    parser.add_argument(
        '--level',
        choices=['family', 'superfamily', 'infraorder', 'suborder', 'order'],
        default='family',
        help="""What level to group the taxonomy at. Default is family. Note data for a particular levelmay be missing in taxonomy.""",
    )
    parser.add_argument(
        'input_file', metavar='input_file', nargs=1,
        help="Your pyhml",
    )
    parser.add_argument(
        'output_file', metavar='output_file', nargs=1,
        help="The output graphics. .png, .pdf, or .svg",
    )
    args = parser.parse_args()
    input_file = args.input_file[0]
    output_file = args.output_file[0]
    taxonomy = args.taxonomy
    level = args.level

    XML = stk.load_phyml(input_file)
    if taxonomy is not None:
        taxonomy = stk.load_taxonomy(taxonomy)

    all_taxa = stk.get_all_taxa(XML)

    # taxon -> indices of the trees that contain it; built for every taxon
    # in the dataset, before any taxonomy filtering
    taxa_tree_matrix = {}
    for t in all_taxa:
        taxa_tree_matrix[t] = set()

    if taxonomy is not None:
        all_taxa = _sort_taxa_by_taxonomy(all_taxa, taxonomy, level)

    trees = stk.obtain_trees(XML)
    for i, t in enumerate(trees):
        for taxon in stk.get_taxa_from_tree(XML, t, sort=True):
            taxa_tree_matrix[taxon.replace(" ", "_")].add(i)

    # count, per tree, how many of the plotted taxa it contains
    y = []
    for name in all_taxa:
        y.extend(sorted(taxa_tree_matrix[name]))
    tree_dict = dict(Counter(y))
    tree_order = sorted(tree_dict.items(), key=lambda x: x[1], reverse=True)

    # one point per (taxon, tree) pair; y is the tree's rank in tree_order
    new_x = []
    new_y = []
    for i, name in enumerate(all_taxa):
        present = taxa_tree_matrix[name]
        for rank, entry in enumerate(tree_order):
            if entry[0] in present:
                new_x.append(i)
                new_y.append(rank)

    fig = figure(figsize=(22, 17), dpi=90)
    fig.subplots_adjust(left=0.3)
    ax = fig.add_subplot(1, 1, 1)
    ax.scatter(new_x, new_y, 50, marker='o', c='k', lw=0)
    ax.set_xlim(0, len(all_taxa))
    ax.set_ylim(0, len(trees))
    xlabel('Taxa')
    ylabel('Tree Number')
    savefig(output_file, dpi=90)
def _collect_source_trees(XML):
    """Return ``(tree_string, taxa)`` for every source tree in the Phyml.

    ``taxa`` is the set of names ``stk._getTaxaFromNewick`` reports for that
    tree string.  Doing this once up front avoids re-parsing the XML and
    every tree string for each taxon that is examined.
    """
    xml_root = stk._parse_xml(XML)
    collected = []
    # By getting source, we can then loop over each source_tree
    for source in etree.XPath("//source")(xml_root):
        for tr in source.xpath("source_tree/tree/tree_string"):
            tree = tr.xpath("string_value")[0].text
            collected.append((tree, set(stk._getTaxaFromNewick(tree))))
    return collected


def main():
    """Remove poorly constrained taxa from a supertree or from a Phyml.

    Command line:
    ``[-v] [--delete_list FILE] [--poly_only] [--tree_only]
    input_phyml input_tree output_tree``.

    For every taxon (taken from the supertree, or from the Phyml when
    ``input_tree`` is ``NULL``/``None``) the source trees containing it are
    counted, split into those where the taxon has more than three siblings
    ("polytomy") and the rest ("resolved").  A taxon is removed when:

    * ``--poly_only``: every occurrence is in a polytomy;
    * ``--tree_only``: it occurs in exactly one source tree;
    * default: every occurrence is in a polytomy, or exactly one occurrence
      is resolved.

    ``--poly_only`` wins if both flags are given.  The pruned supertree
    (nexus) or the pruned Phyml is written to ``output_tree``; with
    ``--delete_list`` the removed taxa are also written, sorted, one per
    line.
    """
    parser = argparse.ArgumentParser(
        prog="remove poorly contrained taxa",
        description="""Remove taxa that appea in one source tree only.""",
    )
    parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        help="Verbose output: mainly progress reports.",
        default=False,
    )
    parser.add_argument(
        '--delete_list',
        help="Produce a deleted taxa list. Give filename.",
    )
    parser.add_argument(
        '--poly_only',
        default=False,
        action='store_true',
        help="Restrict removal of taxa that are in polytomies only in source trees. Default" +
        " to removal those in polytomies *and* only in one other tree.",
    )
    parser.add_argument(
        '--tree_only',
        default=False,
        action='store_true',
        help="Restrict removal of taxa that only occur in one source tree. Default" +
        " to removal those in polytomies *and* only in one other tree.",
    )
    parser.add_argument(
        'input_phyml', metavar='input_phyml', nargs=1,
        help="Your input phyml",
    )
    parser.add_argument(
        'input_tree', metavar='input_tree', nargs=1,
        help="Your tree - can be NULL or None",
    )
    parser.add_argument(
        'output_tree', metavar='output_tree', nargs=1,
        help="Your output tree or phyml - if input_tree is none, this is the Phyml",
    )
    args = parser.parse_args()
    delete_list_file = args.delete_list
    poly_only = args.poly_only
    tree_only = args.tree_only
    input_tree = args.input_tree[0]
    if input_tree in ('NULL', 'None'):
        input_tree = None
    output_file = args.output_tree[0]
    input_phyml = args.input_phyml[0]

    XML = stk.load_phyml(input_phyml)
    # load tree
    if input_tree is not None:
        supertree = stk.import_tree(input_tree)
        taxa = stk._getTaxaFromNewick(supertree)
    else:
        supertree = None
        taxa = stk.get_all_taxa(XML)

    source_trees = _collect_source_trees(XML)

    delete_list = []
    # loop over taxa and get some stats
    for t in taxa:
        nTrees = 0
        nResolved = 0
        nPoly = 0
        for tree, tree_taxa in source_trees:
            if t not in tree_taxa:
                continue
            nTrees += 1
            tree_obj = stk._parse_tree(tree, fixDuplicateTaxa=True)
            siblings = stk._get_all_siblings(tree_obj.node(t))
            # check where it occurs - polytomies only?
            if len(siblings) > 3:
                nPoly += 1
            else:
                nResolved += 1

        # record stats for this taxon and decide if to delete it
        if poly_only:
            remove = (nPoly == nTrees)  # all in polytomies
        elif tree_only:
            # NOTE(review): the flag used to be parsed but ignored; this
            # reading ("occurs in one source tree") follows its help text
            remove = (nTrees == 1)
        else:
            remove = (
                nPoly == nTrees or  # all in polytomies
                # only 1 resolved and rest (if any) polytomies
                (nResolved == 1 and (nPoly + nResolved) == nTrees)
            )
        if remove:
            delete_list.append(t)

    print("Taxa: " + str(len(taxa)))
    print("Deleting: " + str(len(delete_list)))

    if supertree is not None:
        # done, so delete the problem taxa from the supertree
        for t in delete_list:
            supertree = stk._sub_taxa_in_tree(supertree, t)
        output = stk._amalgamate_trees({'Tree_1': supertree}, format='nexus')
    else:
        output = stk.substitute_taxa(XML, delete_list)

    with open(output_file, "w") as f:
        f.write(output)

    if delete_list_file is not None:
        delete_list.sort()
        with open(delete_list_file, "w") as f:
            f.write('\n'.join(delete_list))
def _sort_taxa_by_taxonomy(all_taxa, taxonomy, level):
    """Return the taxa regrouped so members of one taxonomic group are adjacent.

    ``taxonomy`` is indexed as ``taxonomy[name][level]`` where ``name`` is the
    taxon with underscores replaced by spaces.  Taxa whose entry at ``level``
    is the empty string are dropped; taxa that raise ``KeyError`` are
    reported and placed in the group ``'zzzzz'``.  Groups are emitted in
    sorted order.  The group names and the cumulative group sizes are
    printed.
    """
    tax_data = {}
    for t in all_taxa:
        try:
            group = taxonomy[t.replace("_", " ")][level]
        except KeyError:
            print("Couldn't find " + t + " in taxonomy. Adding as null data")
            group = 'zzzzz'  # placeholder chosen so it sorts at the end
        if group == "":
            # no data at this level: leave the taxon out of the plot
            continue
        tax_data[t] = group

    # collect the members of each group in a single pass
    members = {}
    for t in tax_data:
        members.setdefault(tax_data[t], []).append(t)

    unique = sorted(members)
    print("Groups are:")
    print(unique)

    ordered = []
    count_cumulate = []
    for u in unique:
        ordered.extend(members[u])
        # running total; also well defined when there are no groups at all
        count_cumulate.append(len(ordered))
    print(count_cumulate)
    return ordered


def _short_character_name(name):
    """Return the short axis label for a character name.

    Names are matched case-insensitively by prefix; a name that matches no
    known prefix is returned unchanged.
    """
    replacements = (
        ('mitochondrial carrier; adenine nucleotide translocator', "ANT"),
        ('mitochondrially encoded 12s', '12S'),
        ('complete mitochondrial genome', 'Mitogenome'),
        ('mtdna', "mtDNA restriction sites"),
        ('h3 histone', 'H3'),
        ('mitochondrially encoded cytochrome', 'COI'),
        ('rna, 28s', '28S'),
        ('rna, 18s', '18S'),
        ('mitochondrially encoded 16s', '16S'),
    )
    lowered = name.lower()
    for prefix, label in replacements:
        if lowered.startswith(prefix):
            return label
    return name


def main():
    """Plot which characters are available for which taxa as a scatter matrix.

    Command line: ``[-v] [-t TAXONOMY] [--level L] input_file output_file``.

    Taxa run along the x axis, optionally grouped by the chosen level of an
    STK taxonomy file.  Every character named in the Phyml gets a row on the
    y axis, labelled with a shortened name where one is known.  A point is
    drawn where at least one source tree containing the taxon used that
    character.  The figure is saved to ``output_file``.
    """
    parser = argparse.ArgumentParser(
        prog="plot chracter taxa matrix",
        description="""Plot a matrix of character availability against taxa""",
    )
    parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        help="Verbose output: mainly progress reports.",
        default=False,
    )
    parser.add_argument(
        '-t', '--taxonomy',
        help="Use taxonomy to sort the taxa on the axis. Supply a STK taxonomy file",
    )
    parser.add_argument(
        '--level',
        choices=['family', 'superfamily', 'infraorder', 'suborder', 'order'],
        default='family',
        help="""What level to group the taxonomy at. Default is family. Note data for a particular levelmay be missing in taxonomy.""",
    )
    parser.add_argument(
        'input_file', metavar='input_file', nargs=1,
        help="Your pyhml",
    )
    parser.add_argument(
        'output_file', metavar='output_file', nargs=1,
        help="The output graphics. .png, .pdf, or .svg",
    )
    args = parser.parse_args()
    input_file = args.input_file[0]
    output_file = args.output_file[0]
    taxonomy = args.taxonomy
    level = args.level

    XML = stk.load_phyml(input_file)
    if taxonomy is not None:
        taxonomy = stk.load_taxonomy(taxonomy)

    all_taxa = stk.get_all_taxa(XML)
    all_chars_d = stk.get_all_characters(XML)
    all_chars = []
    for c in all_chars_d:
        all_chars.extend(all_chars_d[c])

    if taxonomy is not None:
        all_taxa = _sort_taxa_by_taxonomy(all_taxa, taxonomy, level)

    # taxon -> set of characters seen in any tree that contains the taxon
    taxa_character_matrix = {}
    for t in all_taxa:
        taxa_character_matrix[t] = set()

    trees = stk.obtain_trees(XML)
    for t in trees:
        chars = stk.get_characters_from_tree(XML, t, sort=True)
        for taxon in stk.get_taxa_from_tree(XML, t, sort=True):
            taxon = taxon.replace(" ", "_")
            # taxa filtered out by the taxonomy step have no entry
            if taxon in taxa_character_matrix:
                taxa_character_matrix[taxon].update(chars)

    # create a map: one point per (taxon, character) pair
    x = []
    y = []
    for i, name in enumerate(all_taxa):
        available = taxa_character_matrix[name]
        for j, character in enumerate(all_chars):
            if character in available:
                x.append(i)
                y.append(j)

    # do a substitution of character names to tidy things up
    all_chars = [_short_character_name(c) for c in all_chars]

    fig = figure(figsize=(22, 17), dpi=90)
    fig.subplots_adjust(left=0.3)
    ax = fig.add_subplot(1, 1, 1)
    ax.scatter(x, y, 50, marker='o', c='r', lw=0)
    yticks(range(0, len(all_chars)), all_chars)
    ax.set_xlim(0, len(all_taxa))
    ax.set_ylim(0, len(all_chars))
    xlabel('Taxa')
    ylabel('Characters')
    savefig(output_file, dpi=90)
def _read_url(url):
    """Return the raw body served at ``url``, always closing the response."""
    response = urllib2.build_opener().open(urllib2.Request(url))
    try:
        return response.read()
    finally:
        response.close()


def main():
    """Build a CSV taxonomy for a set of taxa using the EOL and ITIS services.

    Command line: ``[-v] [--pref_db NAME] input_file output_file``.

    ``input_file`` is a Phyml (extension ``.phyml``) or a plain list with one
    taxon per line (blank lines are ignored).  For each taxon the first EOL
    search hit is taken and one of its taxon concepts is chosen: the first
    whose provider name contains ``--pref_db`` (case-insensitive) if that is
    given and matches, otherwise the first concept.  The ancestors of that
    hierarchy entry become the taxon's taxonomy.  Taxa for which EOL returns
    no results or no concepts get an empty entry.

    Afterwards every distinct genus is looked up in ITIS and the ranks from
    its full hierarchy that are not in ``current_taxonomy_levels`` are merged
    into each taxon of that genus.  A genus whose ITIS look-up fails is
    skipped.  The result is saved with ``stk.save_taxonomy``.
    """
    parser = argparse.ArgumentParser(
        prog="Create a taxonomy",
        description="Generate a taxonomy from Phyml. Fills in most taxonomic levels. Uses EOL and ITIS",
    )
    parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        help="Verbose output: mainly progress reports.",
        default=False,
    )
    parser.add_argument(
        '--pref_db',
        help="Preferred database. Need to be able to list avialable databases?",
    )
    parser.add_argument(
        'input_file', metavar='input_file', nargs=1,
        help="Your input taxa list or phyml",
    )
    parser.add_argument(
        'output_file', metavar='output_file', nargs=1,
        help="The output file. A CSV-based taxonomy",
    )
    args = parser.parse_args()
    verbose = args.verbose
    input_file = args.input_file[0]
    output_file = args.output_file[0]
    pref_db = args.pref_db

    # grab taxa in dataset
    if os.path.splitext(input_file)[1] == '.phyml':
        XML = stk.load_phyml(input_file)
        taxa = stk.get_all_taxa(XML)
    else:
        # a blank line would only trigger a pointless remote search
        with open(input_file, "r") as handle:
            taxa = [line.strip() for line in handle if line.strip()]

    taxonomy = {}
    for taxon in taxa:
        taxon = taxon.replace("_", " ")
        if verbose:
            print("Looking up  " + taxon)

        # get the data from EOL on taxon
        # What about synonyms?
        data = json.loads(_read_url(
            "http://eol.org/api/search/1.0.json?q=" + quote_plus(taxon)))
        # check if there's some data
        if len(data['results']) == 0:
            taxonomy[taxon] = {}
            continue
        ID = str(data['results'][0]['id'])  # take first hit

        # Now look for taxonomies
        data = json.loads(_read_url(
            "http://eol.org/api/pages/1.0/" + ID + ".json"))
        if len(data['taxonConcepts']) == 0:
            taxonomy[taxon] = {}
            continue
        first_concept = data['taxonConcepts'][0]  # take first hit
        TID = str(first_concept['identifier'])
        currentdb = str(first_concept['nameAccordingTo'])
        # loop through and get preferred one if specified
        if pref_db is not None:
            wanted = pref_db.lower()
            for db in data['taxonConcepts']:
                candidate = db['nameAccordingTo'].lower()
                if wanted in candidate:
                    TID = str(db['identifier'])
                    # only change the provider together with the
                    # identifier, so an unmatched preference keeps the
                    # first hit's own provider
                    currentdb = candidate
                    break

        # now get taxonomy
        data = json.loads(_read_url(
            "http://eol.org/api/hierarchy_entries/1.0/" + TID + ".json"))
        this_taxonomy = {'provider': currentdb}
        for a in data['ancestors']:
            try:
                this_taxonomy[a['taxonRank']] = a['scientificName']
            except KeyError:
                continue
        try:
            rank = data['taxonRank'].lower()
        except KeyError:
            # no rank for the entry itself: keep the ancestors collected
            # above rather than dropping the taxon
            rank = None
        if rank is not None and rank != 'species':
            # higher taxa, add it in to the taxonomy!
            this_taxonomy[rank] = taxon
        taxonomy[taxon] = this_taxonomy

    if verbose:
        print("Done basic taxonomy, getting more info from ITIS")

    # fill in the rest of the taxonomy
    # get all genera
    genera = []
    for t in taxonomy:
        try:
            genera.append(taxonomy[t]['genus'])
        except KeyError:
            continue
    genera = _uniquify(genera)

    for g in genera:
        if verbose:
            print("Looking up  " + g)
        try:
            URL = ("http://www.itis.gov/ITISWebService/jsonservice/searchByScientificName?srchKey="
                   + quote_plus(g.strip()))
        except Exception:
            # best effort: a genus name that cannot be quoted is skipped
            continue
        data = json.loads(unicode(_read_url(URL), "ISO-8859-1"))
        names = data['scientificNames']
        if not names or names[0] is None:
            continue
        tsn = names[0]["tsn"]

        URL = ("http://www.itis.gov/ITISWebService/jsonservice/getFullHierarchyFromTSN?tsn="
               + str(tsn))
        try:
            string = unicode(_read_url(URL), "ISO-8859-1")
        except Exception:
            # best effort: skip this genus if the hierarchy can't be read
            continue
        data = json.loads(string)

        this_taxonomy = {}
        for level in data['hierarchyList']:
            rank_name = level['rankName'].lower()
            if rank_name not in current_taxonomy_levels:
                this_taxonomy[rank_name] = level['taxonName']

        for t in taxonomy:
            try:
                if taxonomy[t]['genus'] == g:
                    taxonomy[t].update(this_taxonomy)
            except KeyError:
                continue

    stk.save_taxonomy(taxonomy, output_file)