def pull(self, options):
    """Create taxonomy file from a decorated tree.

    Parameters
    ----------
    options : object
        Command options. Reads ``input_tree``, ``no_validation`` and
        ``output_taxonomy``.

    When ``options.no_validation`` is set, the taxonomy of each leaf is
    assembled directly from the labels of its ancestors; otherwise the
    tree is read through ``Taxonomy().read_from_tree``.
    """

    check_file_exists(options.input_tree)

    if options.no_validation:
        tree = dendropy.Tree.get_from_path(options.input_tree,
                                           schema='newick',
                                           rooting="force-rooted",
                                           preserve_underscores=True)

        taxonomy = {}
        for leaf in tree.leaf_node_iter():
            taxon_id = leaf.taxon.label

            # walk from the leaf towards the root; taxa are therefore
            # collected in leaf-to-root order
            taxa = []
            node = leaf.parent_node
            while node:
                _support, taxon, _aux_info = parse_label(node.label)
                if taxon:
                    # a label may hold several ';'-separated taxa; build a
                    # real list so the reversal does not depend on map()
                    # returning a list
                    names = [t.strip() for t in taxon.split(';')]
                    taxa.extend(reversed(names))
                node = node.parent_node

            # flip to root-to-leaf order
            taxonomy[taxon_id] = taxa[::-1]
    else:
        taxonomy = Taxonomy().read_from_tree(options.input_tree)

    Taxonomy().write(taxonomy, options.output_taxonomy)

    # this command writes a taxonomy file, not a stripped tree
    self.logger.info('Taxonomy written to: %s' % options.output_taxonomy)
def lsu_tree(self, options):
    """Infer 23S tree spanning GTDB genomes.

    Verifies the external programs and input files, makes sure the
    output directory exists, then hands all filtering options to
    ``RNA_Workflow.run`` for the ``'lsu'`` gene.
    """

    required_programs = ['esl-sfetch', 'cmsearch', 'cmalign', 'esl-alimask', 'FastTreeMP', 'blastn']
    check_dependencies(required_programs)

    for input_file in (options.gtdb_metadata_file, options.gtdb_lsu_file):
        check_file_exists(input_file)
    make_sure_path_exists(options.output_dir)

    workflow = RNA_Workflow(options.cpus)
    apply_tax_filter = not options.disable_tax_filter
    workflow.run('lsu',
                 options.gtdb_metadata_file,
                 options.gtdb_lsu_file,
                 options.min_lsu_length,
                 options.min_scaffold_length,
                 options.min_quality,
                 options.max_contigs,
                 options.min_N50,
                 apply_tax_filter,
                 options.genome_list,
                 options.output_dir)

    message = 'Results written to: %s' % options.output_dir
    self.logger.info(message)
def strip(self, options):
    """Remove taxonomic labels from tree.

    Internal node labels of the form ``<support>:<taxa>`` are reduced to
    the part before the first colon. Any other non-empty internal label
    is cleared. The tree is written to ``options.output_tree`` in Newick
    format.

    Parameters
    ----------
    options : object
        Command options. Reads ``input_tree`` and ``output_tree``.
    """

    check_file_exists(options.input_tree)

    tree = dendropy.Tree.get_from_path(options.input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    for node in tree.internal_nodes():
        if not node.label:
            continue

        if ':' in node.label:
            # split on the first ':' only, so a taxon string that itself
            # contains a colon cannot break the two-way unpacking
            support, _taxa = node.label.split(':', 1)
            node.label = support
        else:
            node.label = None

    tree.write_to_path(options.output_tree,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)

    self.logger.info('Stripped tree written to: %s' % options.output_tree)
def cluster(self, options):
    """Partition the scaffolds of a bin into clusters.

    Logs a banner, loads the scaffold statistics file and passes it to
    ``Cluster.run`` along with the clustering options.
    """

    rule = '*******************************************************************************'
    for banner_line in ('',
                        rule,
                        ' [RefineM - cluster] Partitioning bin into clusters.',
                        rule):
        self.logger.info(banner_line)

    for input_file in (options.scaffold_stats_file, options.genome_file):
        check_file_exists(input_file)
    make_sure_path_exists(options.output_dir)

    self.logger.info('')
    self.logger.info('  Reading scaffold statistics.')
    stats = ScaffoldStats()
    stats.read(options.scaffold_stats_file)

    clusterer = Cluster(options.cpus)
    clusterer.run(stats,
                  options.num_clusters,
                  options.num_components,
                  options.K,
                  options.no_coverage,
                  options.no_pca,
                  options.iterations,
                  options.genome_file,
                  options.output_dir)

    self.logger.info('')
    self.logger.info('  Partitioned sequences written to: ' + options.output_dir)

    self.time_keeper.print_time_stamp()
def gene(self, options):
    """Calculate gene properties of a genome and write them to file.

    Two tab-separated files are written to ``options.output_dir``:
    ``metadata.genome_gene.tsv`` (field, value) and
    ``metadata.genome_gene.desc.tsv`` (field, description, upper-cased
    name of the value's Python type).

    Parameters
    ----------
    options : object
        Command options. Reads ``genome_file``, ``gff_file`` and
        ``output_dir``.
    """

    self.logger.info('Calculating gene properties of genome.')

    check_file_exists(options.genome_file)
    check_file_exists(options.gff_file)
    make_sure_path_exists(options.output_dir)

    meta_genes = MetadataGenes()
    metadata_values, metadata_desc = meta_genes.generate(options.genome_file,
                                                         options.gff_file)

    # write statistics to file; 'with' closes the handle even if a write fails
    output_file = os.path.join(options.output_dir, 'metadata.genome_gene.tsv')
    with open(output_file, 'w') as fout:
        for field in sorted(metadata_values):
            fout.write('%s\t%s\n' % (field, str(metadata_values[field])))

    # write description to file
    output_file = os.path.join(options.output_dir, 'metadata.genome_gene.desc.tsv')
    with open(output_file, 'w') as fout:
        for field in sorted(metadata_desc):
            fout.write('%s\t%s\t%s\n' % (field,
                                         metadata_desc[field],
                                         type(metadata_values[field]).__name__.upper()))
def reference(self, options):
    """Identify scaffolds similar to the given reference genome(s).

    Exits the program after a warning when any reference file fails the
    amino acid check; otherwise runs ``Reference.run`` and logs where the
    results were written.
    """

    rule = '*******************************************************************************'
    for banner_line in ('',
                        rule,
                        '[RefineM - reference] Identifying scaffolds similar to specific genome(s).',
                        rule):
        self.logger.info(banner_line)

    for input_file in (options.scaffold_prot_file, options.scaffold_stats_file):
        check_file_exists(input_file)
    make_sure_path_exists(options.output_dir)

    ref_gene_files = self._genome_files(options.ref_genome_prot_dir, options.protein_ext)
    proteins_ok = self._check_protein_seqs(ref_gene_files)
    if not proteins_ok:
        self.logger.warning('[Warning] All files must contain amino acid sequences.')
        sys.exit()

    ref_search = Reference(options.cpus, options.output_dir)
    results_path = ref_search.run(options.scaffold_prot_file,
                                  options.scaffold_stats_file,
                                  ref_gene_files,
                                  options.db_file,
                                  options.evalue,
                                  options.per_identity)

    self.logger.info('')
    self.logger.info('  Results written to: ' + results_path)

    self.time_keeper.print_time_stamp()
def phylogenetic_diversity_clade(self, options):
    """Calculate phylogenetic diversity of named groups.

    Checks that the decorated tree exists and forwards the tree, output
    file, taxa list and representative list to
    ``PhylogeneticDiversity.pd_clade``.
    """

    tree_file = options.decorated_tree
    check_file_exists(tree_file)

    PhylogeneticDiversity().pd_clade(tree_file,
                                     options.output_file,
                                     options.taxa_list,
                                     options.rep_list)
def append(self, options):
    """Append taxonomy strings to the leaf labels of a tree.

    Each leaf label becomes ``<label>|<taxon>; <taxon>; ...`` using the
    taxonomy read from ``options.input_taxonomy``. The program exits with
    status -1 if a leaf has no entry in the taxonomy file.

    Parameters
    ----------
    options : object
        Command options. Reads ``input_tree``, ``input_taxonomy`` and
        ``output_tree``.
    """

    check_file_exists(options.input_tree)
    check_file_exists(options.input_taxonomy)

    taxonomy = Taxonomy().read(options.input_taxonomy)

    tree = dendropy.Tree.get_from_path(options.input_tree,
                                       schema='newick',
                                       rooting='force-rooted',
                                       preserve_underscores=True)

    for leaf in tree.leaf_node_iter():
        leaf_id = leaf.taxon.label
        taxa = taxonomy.get(leaf_id)
        if taxa is None:
            # report the identifier that was looked up (the taxon label),
            # not the node label
            self.logger.error('Taxonomy file does not contain an entry for %s.' % leaf_id)
            sys.exit(-1)

        leaf.taxon.label = leaf_id + '|' + '; '.join(taxa)

    tree.write_to_path(options.output_tree,
                       schema='newick',
                       suppress_rooting=True,
                       unquoted_underscores=True)

    self.logger.info('Decorated tree written to: %s' % options.output_tree)
def cluster_stats(self, options):
    """Calculate statistics for species cluster.

    Checks the cluster and genome path files, then runs ``ClusterStats``
    built from the ANI cache file, CPU count and output directory.
    """

    for input_file in (options.cluster_file, options.genome_path_file):
        check_file_exists(input_file)

    stats_calculator = ClusterStats(options.ani_cache_file,
                                    options.cpus,
                                    options.output_dir)
    stats_calculator.run(options.cluster_file, options.genome_path_file)
def scaffold_stats(self, options):
    """Calculate statistics for scaffolds.

    Validates the scaffold and genome files as nucleotide sequences,
    obtains a coverage file and a tetranucleotide signature file
    (computing each when not supplied in ``options``), and writes
    ``scaffold_stats.tsv`` to the output directory.
    """

    # single-argument form prints the same text as the print statement
    print(options)

    rule = '*******************************************************************************'
    for banner_line in ('',
                        rule,
                        ' [RefineM - scaffold_stats] Calculating statistics for scaffolds.',
                        rule):
        self.logger.info(banner_line)

    check_file_exists(options.scaffold_file)

    scaffolds_ok = self._check_nuclotide_seqs([options.scaffold_file])
    if not scaffolds_ok:
        self.logger.warning('[Warning] Scaffold file must contain nucleotide sequences.')
        sys.exit()

    genome_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
    genomes_ok = self._check_nuclotide_seqs(genome_files)
    if not genomes_ok:
        self.logger.warning('[Warning] All files must contain nucleotide sequences.')
        sys.exit()

    make_sure_path_exists(options.output_dir)

    # coverage: use the supplied file, else compute from BAM files, else none
    if options.coverage_file:
        coverage_file = options.coverage_file
    elif options.bam_files:
        coverage_calc = Coverage(options.cpus)
        coverage_file = os.path.join(options.output_dir, 'coverage.tsv')
        coverage_calc.run(options.bam_files,
                          coverage_file,
                          options.cov_all_reads,
                          options.cov_min_align,
                          options.cov_max_edit_dist)
        self.logger.info('')
        self.logger.info('  Coverage profiles written to: %s' % coverage_file)
    else:
        self.logger.warning('\n  [Warning] One or more BAM files must be specified in order to calculate coverage profiles.')
        coverage_file = None

    # tetranucleotide signatures: use the supplied file or compute them
    if options.tetra_file:
        tetra_file = options.tetra_file
    else:
        self.logger.info('')
        tetra_calc = Tetranucleotide(options.cpus)
        tetra_file = os.path.join(options.output_dir, 'tetra.tsv')
        signatures = tetra_calc.run(options.scaffold_file)
        tetra_calc.write(signatures, tetra_file)
        self.logger.info('  Tetranucleotide signatures written to: %s' % tetra_file)

    # write out scaffold statistics
    stats_output = os.path.join(options.output_dir, 'scaffold_stats.tsv')
    stats_calc = ScaffoldStats(options.cpus)
    stats_calc.run(options.scaffold_file,
                   genome_files,
                   tetra_file,
                   coverage_file,
                   stats_output)

    self.logger.info('  Scaffold statistic written to: %s' % stats_output)

    self.time_keeper.print_time_stamp()
def arb_records(self, options):
    """Create an ARB records file from GTDB metadata.

    Only the metadata file is checked for existence; the remaining
    paths are passed to ``Arb.create_records`` as given.
    """

    metadata_file = options.metadata_file
    check_file_exists(metadata_file)

    Arb().create_records(metadata_file,
                         options.msa_file,
                         options.taxonomy_file,
                         options.genome_list,
                         options.output_file)
def pull(self, options):
    """Pull taxonomy strings from a tree and write them to file.

    Unless ``options.no_rank_fill`` is set, every taxonomy entry is
    passed through ``Taxonomy().fill_missing_ranks`` before writing.
    """

    check_file_exists(options.input_tree)

    taxonomy = Taxonomy().read_from_tree(options.input_tree)

    fill_ranks = not options.no_rank_fill
    if fill_ranks:
        # iterate over a snapshot of the keys; only values are replaced
        for taxon_id in list(taxonomy):
            taxonomy[taxon_id] = Taxonomy().fill_missing_ranks(taxonomy[taxon_id])

    Taxonomy().write(taxonomy, options.output_file)

    message = 'Taxonomy strings written to: %s' % options.output_file
    self.logger.info(message)
def rd_ranks(self, options):
    """Calculate number of taxa for specified rd thresholds.

    Checks the input tree, ensures the output directory exists and runs
    ``RdRanks`` with the tree, thresholds and output directory.
    """

    check_file_exists(options.input_tree)
    make_sure_path_exists(options.output_dir)

    RdRanks().run(options.input_tree,
                  options.thresholds,
                  options.output_dir)

    self.logger.info('Done.')
def bl_table(self, options):
    """Produce table with number of lineage for increasing mean branch lengths.

    Both the input tree and the taxon category file must exist; the
    table itself is produced by ``BranchLengthDistribution.table``.
    """

    for input_file in (options.input_tree, options.taxon_category):
        check_file_exists(input_file)

    bl_dist = BranchLengthDistribution()
    bl_dist.table(options.input_tree,
                  options.taxon_category,
                  options.step_size,
                  options.output_table)

    self.logger.info('Done.')
def run(self, genome_files, scaffold_file, min_seq_len):
    """Identify scaffolds that are not assigned to any genome bin.

    Parameters
    ----------
    genome_files : list of str
        Fasta files of genomes to process.
    scaffold_file : str
        Scaffolds binned to generate putative genomes.
    min_seq_len : int
        Ignore scaffolds shorter than the specified length.

    Returns
    -------
    dict : d[seq_id] -> seq
        Dictionary of unbinned sequences.
    """

    check_file_exists(scaffold_file)

    # get list of sequences in bins
    self.logger.info('')
    self.logger.info('  Reading binned scaffolds.')

    binned_seq_ids = set()
    total_binned_bases = 0
    for genome_file in genome_files:
        for seq_id, seq in seq_io.read_seq(genome_file):
            binned_seq_ids.add(seq_id)
            total_binned_bases += len(seq)

    self.logger.info('    Read %d (%.2f Mbp) binned scaffolds.' % (len(binned_seq_ids), float(total_binned_bases) / 1e6))

    # collect all unbinned sequences of sufficient length
    self.logger.info('')
    self.logger.info('  Identifying unbinned scaffolds >= %d bp.' % min_seq_len)

    unbinned_bases = 0
    unbinned_seqs = {}
    for seq_id, seq in seq_io.read_seq(scaffold_file):
        if seq_id not in binned_seq_ids and len(seq) >= min_seq_len:
            unbinned_seqs[seq_id] = seq
            unbinned_bases += len(seq)

    self.logger.info('    Identified %d (%.2f Mbp) unbinned scaffolds.' % (len(unbinned_seqs), float(unbinned_bases) / 1e6))

    # report 0% rather than dividing by zero when nothing was read at all
    total_seqs = len(unbinned_seqs) + len(binned_seq_ids)
    total_bases = unbinned_bases + total_binned_bases
    perc_unbinned_seqs = len(unbinned_seqs) * 100.0 / total_seqs if total_seqs else 0.0
    perc_unbinned_bases = unbinned_bases * 100.0 / total_bases if total_bases else 0.0

    self.logger.info('')
    self.logger.info('    Percentage of unbinned scaffolds: %.2f%%' % perc_unbinned_seqs)
    self.logger.info('    Percentage of unbinned bases: %.2f%%' % perc_unbinned_bases)

    return unbinned_seqs
def outgroup(self, options):
    """Reroot tree with outgroup.

    The outgroup is every genome whose taxonomy contains
    ``options.outgroup_taxon``.

    Parameters
    ----------
    options : object
        Command options. Reads ``taxonomy_file``, ``input_tree``,
        ``output_tree`` and ``outgroup_taxon``.
    """

    check_file_exists(options.taxonomy_file)
    # the tree is an input as well; fail early with the standard check
    check_file_exists(options.input_tree)

    self.logger.info('Identifying genomes from the specified outgroup.')
    outgroup = set()
    # items() works whether or not the dict provides iteritems()
    for genome_id, taxa in Taxonomy().read(options.taxonomy_file).items():
        if options.outgroup_taxon in taxa:
            outgroup.add(genome_id)
    self.logger.info('Identifying %d genomes in the outgroup.' % len(outgroup))

    reroot = RerootTree()
    reroot.root_with_outgroup(options.input_tree, options.output_tree, outgroup)
def bl_dist(self, options):
    """Calculate distribution of branch lengths at each taxonomic rank.

    Checks the input tree, ensures the output directory exists and
    passes the remaining options to ``BranchLengthDistribution.run``.
    """

    check_file_exists(options.input_tree)
    make_sure_path_exists(options.output_dir)

    distribution = BranchLengthDistribution()
    distribution.run(options.input_tree,
                     options.trusted_taxa_file,
                     options.min_children,
                     options.taxonomy_file,
                     options.output_dir)

    self.logger.info('Done.')
def tree_tax_diff(self, options):
    """Report taxonomy differences between two trees.

    Creates the output directory when it does not exist yet, then runs
    ``TaxDiff.tree_tax_diff`` on the two input trees.
    """

    for tree_file in (options.input_tree1, options.input_tree2):
        check_file_exists(tree_file)

    output_dir = options.output_dir
    dir_missing = not os.path.exists(output_dir)
    if dir_missing:
        os.makedirs(output_dir)

    TaxDiff().tree_tax_diff(options.input_tree1,
                            options.input_tree2,
                            output_dir)

    self.logger.info('Done.')
def decorate(self, options):
    """Place internal taxonomic labels on tree.

    Requires the input tree and taxonomy file to exist; the labelling
    itself is done by ``Decorate.run``.
    """

    for input_file in (options.input_tree, options.taxonomy_file):
        check_file_exists(input_file)

    decorator = Decorate()
    decorator.run(options.input_tree,
                  options.taxonomy_file,
                  options.trusted_taxa_file,
                  options.min_children,
                  options.min_support,
                  options.output_tree)

    self.logger.info('Finished decorating tree.')
def tax_diff(self, options):
    """Report differences between two taxonomy files.

    Creates the output directory when it does not exist yet, then runs
    ``TaxDiff.tax_diff`` on the two taxonomy files.
    """

    for taxonomy_file in (options.tax1_file, options.tax2_file):
        check_file_exists(taxonomy_file)

    output_dir = options.output_dir
    dir_missing = not os.path.exists(output_dir)
    if dir_missing:
        os.makedirs(output_dir)

    TaxDiff().tax_diff(options.tax1_file,
                       options.tax2_file,
                       options.include_user_taxa,
                       output_dir)

    self.logger.info('Done.')
def bl_decorate(self, options):
    """Decorate tree based using a mean branch length criterion.

    Only the input tree is checked for existence; all other options are
    forwarded to ``BranchLengthDistribution.decorate``.
    """

    check_file_exists(options.input_tree)

    bl_dist = BranchLengthDistribution()
    bl_dist.decorate(options.input_tree,
                     options.taxonomy_file,
                     options.threshold,
                     options.rank,
                     options.retain_named_lineages,
                     options.keep_labels,
                     options.prune,
                     options.output_tree)

    self.logger.info('Done.')
def jk_taxa(self, options):
    """Jackknife taxa.

    Runs ``JackknifeTaxa`` on the input tree and alignment and logs the
    path of the tree it returns.
    """

    for input_file in (options.input_tree, options.msa_file):
        check_file_exists(input_file)
    make_sure_path_exists(options.output_dir)

    jackknifer = JackknifeTaxa(options.cpus)
    jk_tree = jackknifer.run(options.input_tree,
                             options.msa_file,
                             options.outgroup_ids,
                             options.perc_taxa,
                             options.num_replicates,
                             options.model,
                             options.output_dir)

    self.logger.info('Jackknifed taxa tree written to: %s' % jk_tree)
def diss(self, options):
    """Calculate dissimilarity between usage profiles.

    The profile file is tab-separated with a header line; the first
    column is the genome identifier and the remaining columns are
    numeric profile values. Output is either a pairwise table (lower
    triangle) or, with ``options.full_matrix``, the full square matrix.

    Parameters
    ----------
    options : object
        Command options. Reads ``profile_file``, ``metric``,
        ``full_matrix`` and ``output_file``.
    """

    check_file_exists(options.profile_file)

    genome_ids = []
    profiles = []
    with open(options.profile_file) as f:
        f.readline()  # burn header

        for line in f:
            if not line.strip():
                # a blank line would give an empty profile row
                continue

            line_split = line.rstrip().split('\t')
            genome_ids.append(line_split[0])
            profiles.append([float(v) for v in line_split[1:]])

    # calculate dissimilarity between genomes
    d = scipy_pdist(profiles, metric=options.metric)
    num_genomes = len(genome_ids)

    with open(options.output_file, 'w') as fout:
        if not options.full_matrix:
            # write out lower triangle from condensed dissimilarity
            # matrix, in pairwise fashion
            fout.write('Genome A\tGenome B\tDissimilarity\n')
            for i in range(1, num_genomes):
                for j in range(i):
                    # position of pair (j, i), j < i, in the condensed
                    # vector; '//' keeps the index an integer (j*(j+1)
                    # is always even, so nothing is lost)
                    idx = num_genomes * j - j * (j + 1) // 2 + i - 1 - j
                    fout.write('%s\t%s\t%f\n' % (genome_ids[i],
                                                 genome_ids[j],
                                                 d[idx]))
        else:
            # write out full dissimilarity matrix
            ds = scipy_squareform(d)
            for genome_id in genome_ids:
                fout.write('\t' + genome_id)
            fout.write('\n')

            for i, genome_id in enumerate(genome_ids):
                fout.write(genome_id)
                for j in range(num_genomes):
                    fout.write('\t%f' % ds[i, j])
                fout.write('\n')

    self.logger.info('Dissimilarity values written to: %s' % options.output_file)
def mark_tree(self, options):
    """Mark nodes of a tree and write the marked tree.

    The three ``no_*`` switches in ``options`` are inverted before being
    passed to ``MarkTree.run``.
    """

    check_file_exists(options.input_tree)

    use_percentile = not options.no_percentile
    use_relative_divergence = not options.no_relative_divergence
    use_prediction = not options.no_prediction

    marker = MarkTree()
    marker.run(options.input_tree,
               options.output_tree,
               options.min_support,
               options.only_named_clades,
               options.min_length,
               use_percentile,
               use_relative_divergence,
               use_prediction,
               options.thresholds)

    self.logger.info('Marked tree written to: %s' % options.output_tree)
def tree_diff(self, options):
    """Report differences between two trees.

    Creates the output directory when it does not exist yet, then runs
    ``TreeDiff.run`` with the support, taxa and naming filters.
    """

    for tree_file in (options.input_tree1, options.input_tree2):
        check_file_exists(tree_file)

    output_dir = options.output_dir
    dir_missing = not os.path.exists(output_dir)
    if dir_missing:
        os.makedirs(output_dir)

    TreeDiff().run(options.input_tree1,
                   options.input_tree2,
                   output_dir,
                   options.min_support,
                   options.min_taxa,
                   options.named_only)

    self.logger.info('Done.')
def classify(self, options):
    """Classify genomes based on AAI values.

    Runs ``Classify`` on the sorted hit table and logs the results file
    it returns.
    """

    check_file_exists(options.sorted_hit_table)
    make_sure_path_exists(options.output_dir)

    classifier = Classify(options.cpus)
    classification_file = classifier.run(options.query_gene_file,
                                         options.target_gene_file,
                                         options.sorted_hit_table,
                                         options.evalue,
                                         options.per_identity,
                                         options.per_aln_len,
                                         options.num_top_targets,
                                         options.taxonomy_file,
                                         options.keep_rbhs,
                                         options.output_dir)

    self.logger.info('Classification results written to: %s' % classification_file)
def aai(self, options):
    """Calculate AAI between genomes from a sorted hit table.

    ``AAICalculator.run`` is called with ``None`` as its second argument
    and returns the AAI file and the reciprocal-best-hit file; the latter
    is only reported when it is truthy.
    """

    check_file_exists(options.sorted_hit_table)
    make_sure_path_exists(options.output_dir)

    calculator = AAICalculator(options.cpus)
    outputs = calculator.run(options.query_gene_file,
                             None,
                             options.sorted_hit_table,
                             options.evalue,
                             options.per_identity,
                             options.per_aln_len,
                             options.keep_rbhs,
                             options.output_dir)
    aai_file, rbh_file = outputs

    if rbh_file:
        self.logger.info('Identified reciprocal best hits written to: %s' % rbh_file)

    self.logger.info('AAI between genomes written to: %s' % aai_file)
def rna_dump(self, options):
    """Dump all 5S, 16S, and 23S sequences to files.

    Uses an ``RNA_Workflow`` constructed with a fixed argument of 1 and
    passes the minimum-length filters to its ``dump`` method.
    """

    check_file_exists(options.genomic_file)
    make_sure_path_exists(options.output_dir)

    workflow = RNA_Workflow(1)
    workflow.dump(options.genomic_file,
                  options.gtdb_taxonomy,
                  options.min_5S_len,
                  options.min_16S_ar_len,
                  options.min_16S_bac_len,
                  options.min_23S_len,
                  options.min_contig_len,
                  options.include_user,
                  options.genome_list,
                  options.output_dir)

    message = 'Results written to: %s' % options.output_dir
    self.logger.info(message)
def jk_markers(self, options):
    """Jackknife marker genes.

    The alignment file is only checked for existence when it is not the
    literal placeholder ``'NONE'``.
    """

    check_file_exists(options.input_tree)

    msa_given = options.msa_file != 'NONE'
    if msa_given:
        check_file_exists(options.msa_file)

    make_sure_path_exists(options.output_dir)

    jackknifer = JackknifeMarkers(options.cpus)
    jk_tree = jackknifer.run(options.input_tree,
                             options.msa_file,
                             options.marker_info_file,
                             options.mask_file,
                             options.perc_markers,
                             options.num_replicates,
                             options.model,
                             options.jk_dir,
                             options.output_dir)

    self.logger.info('Jackknifed marker tree written to: %s' % jk_tree)
def bootstrap(self, options):
    """Bootstrap multiple sequence alignment.

    The alignment file is only checked for existence when it is not the
    literal placeholder ``'NONE'``.
    """

    check_file_exists(options.input_tree)

    msa_given = options.msa_file != 'NONE'
    if msa_given:
        check_file_exists(options.msa_file)

    make_sure_path_exists(options.output_dir)

    bootstrapper = Bootstrap(options.cpus)
    bootstrap_tree = bootstrapper.run(options.input_tree,
                                      options.msa_file,
                                      options.num_replicates,
                                      options.model,
                                      options.gamma,
                                      options.base_type,
                                      options.fraction,
                                      options.boot_dir,
                                      options.output_dir)

    self.logger.info('Bootstrapped tree written to: %s' % bootstrap_tree)
def taxonomy_files(self, options):
    """Generate taxonomy files for GTDB website.

    Checks the three input files, ensures the output directory exists
    and calls ``WebsiteData.taxonomy_files``.
    """

    inputs = (options.metadata_file,
              options.gtdb_sp_clusters_file,
              options.user_gid_table)
    for input_file in inputs:
        check_file_exists(input_file)
    make_sure_path_exists(options.output_dir)

    website_data = WebsiteData(options.release_number, options.output_dir)
    website_data.taxonomy_files(options.metadata_file,
                                options.gtdb_sp_clusters_file,
                                options.user_gid_table)

    self.logger.info('Done.')
def sp_cluster_file(self, options):
    """Generate file indicating GTDB species clusters.

    Checks the three input files, ensures the output directory exists
    and calls ``WebsiteData.sp_cluster_file``.
    """

    inputs = (options.metadata_file,
              options.gtdb_sp_clusters_file,
              options.user_gid_table)
    for input_file in inputs:
        check_file_exists(input_file)
    make_sure_path_exists(options.output_dir)

    website_data = WebsiteData(options.release_number, options.output_dir)
    website_data.sp_cluster_file(options.metadata_file,
                                 options.gtdb_sp_clusters_file,
                                 options.user_gid_table)

    self.logger.info('Done.')
def reduce(self, options):
    """Infer tree for reduced set of genes.

    Checks the homolog, gene id and taxonomy files, ensures the output
    directory exists and runs ``Reduce`` with the alignment and tree
    options.
    """

    inputs = (options.homolog_file,
              options.gene_ids,
              options.taxonomy_file)
    for input_file in inputs:
        check_file_exists(input_file)
    make_sure_path_exists(options.output_dir)

    reducer = Reduce(options.cpus)
    reducer.run(options.homolog_file,
                options.gene_ids,
                options.taxonomy_file,
                options.min_per_taxa,
                options.consensus,
                options.min_per_bp,
                options.use_trimAl,
                options.msa_program,
                options.tree_program,
                options.prot_model,
                options.output_dir)
def outliers(self, options):
    """Create information for identifying taxonomic outliers.

    Returns early, after logging an error, when
    ``--highlight_polyphyly`` is given without ``--fmeasure_table``.
    Optional taxa files are only checked when they are set.
    """

    for required_file in (options.input_tree, options.taxonomy_file):
        check_file_exists(required_file)

    for optional_file in (options.plot_taxa_file, options.trusted_taxa_file):
        if optional_file:
            check_file_exists(optional_file)

    output_dir = options.output_dir
    dir_missing = not os.path.exists(output_dir)
    if dir_missing:
        os.makedirs(output_dir)

    missing_fmeasure = options.highlight_polyphyly and not options.fmeasure_table
    if missing_fmeasure:
        self.logger.error("The '--highlight_polyphyly' flag must be used with the '--fmeasure_table' flag.")
        return

    outlier_finder = Outliers(options.dpi)
    outlier_finder.run(options.input_tree,
                       options.taxonomy_file,
                       output_dir,
                       options.plot_taxa_file,
                       options.plot_dist_taxa_only,
                       options.plot_domain,
                       options.highlight_polyphyly,
                       options.highlight_taxa_file,
                       options.trusted_taxa_file,
                       options.fixed_root,
                       options.min_children,
                       options.min_support,
                       options.mblet,
                       options.fmeasure_table,
                       options.min_fmeasure,
                       options.fmeasure_mono,
                       options.verbose_table)

    self.logger.info('Done.')
def tree_gids(self, options):
    """Determine genome IDs for test/validation tree.

    A ``GenomeTreeTkError`` raised while running ``TreeGIDs`` is printed
    and turned into ``SystemExit``.
    """

    inputs = (options.qc_file,
              options.gtdb_metadata_file,
              options.gtdb_final_clusters)
    for input_file in inputs:
        check_file_exists(input_file)

    try:
        gid_selector = TreeGIDs()
        gid_selector.run(options.qc_file,
                         options.gtdb_metadata_file,
                         options.gtdb_final_clusters,
                         options.output_dir)
    except GenomeTreeTkError as e:
        # single-argument form prints the same text as the print statement
        print(e.message)
        raise SystemExit

    message = 'Results written to: %s' % options.output_dir
    self.logger.info(message)
def dist_plot(self, options):
    """Create a distribution plot from a tree.

    Optional taxa files are only checked for existence when they are
    set; plotting is done by ``DistributionPlot.run``.
    """

    check_file_exists(options.input_tree)

    for optional_file in (options.plot_taxa_file, options.trusted_taxa_file):
        if optional_file:
            check_file_exists(optional_file)

    plotter = DistributionPlot()
    plotter.run(options.input_tree,
                options.output_prefix,
                options.plot_taxa_file,
                options.trusted_taxa_file,
                options.min_children,
                options.min_support)

    self.logger.info('Done.')
def derep_tree(self, options):
    """Dereplicate tree.

    Checks the tree, metadata and alignment files, ensures the output
    directory exists and runs ``DereplicateTree``.
    """

    inputs = (options.input_tree,
              options.gtdb_metadata,
              options.msa_file)
    for input_file in inputs:
        check_file_exists(input_file)
    make_sure_path_exists(options.output_dir)

    dereplicator = DereplicateTree()
    dereplicator.run(options.input_tree,
                     options.lineage_of_interest,
                     options.outgroup,
                     options.gtdb_metadata,
                     options.taxa_to_retain,
                     options.msa_file,
                     options.keep_unclassified,
                     options.output_dir)
def cluster_user(self, args):
    """Cluster User genomes to GTDB species clusters.

    A ``GTDB_Error`` raised while constructing or running
    ``ClusterUser`` is printed and turned into ``SystemExit``.
    """

    inputs = (args.gtdb_metadata_file,
              args.genome_path_file,
              args.final_cluster_file)
    for input_file in inputs:
        check_file_exists(input_file)
    make_sure_path_exists(args.output_dir)

    try:
        clusterer = ClusterUser(args.ani_cache_file,
                                args.cpus,
                                args.output_dir)
        clusterer.run(args.gtdb_metadata_file,
                      args.genome_path_file,
                      args.final_cluster_file)
    except GTDB_Error as e:
        print(e.message)
        raise SystemExit

    message = 'Clustering results written to: %s' % args.output_dir
    self.logger.info(message)
def cluster(self, options):
    """Cluster remaining genomes based on Mash distances.

    A ``GenomeTreeTkError`` raised during clustering is printed and
    turned into ``SystemExit``; the success message is only logged when
    clustering completes.
    """

    inputs = (options.rep_genome_file,
              options.metadata_file,
              options.mash_pairwise_file)
    for input_file in inputs:
        check_file_exists(input_file)

    try:
        representatives = Representatives()
        representatives.cluster(options.rep_genome_file,
                                options.metadata_file,
                                options.mash_pairwise_file,
                                options.cluster_file)

        message = 'Clustering information written to: %s' % options.cluster_file
        self.logger.info(message)
    except GenomeTreeTkError as e:
        print(e.message)
        raise SystemExit
def outliers(self, options):
    """Create information for identifying taxonomic outliers.

    Optional taxa files are only checked for existence when they are
    set. The output directory is created when missing.
    """

    check_file_exists(options.input_tree)

    for optional_file in (options.plot_taxa_file, options.trusted_taxa_file):
        if optional_file:
            check_file_exists(optional_file)

    output_dir = options.output_dir
    dir_missing = not os.path.exists(output_dir)
    if dir_missing:
        os.makedirs(output_dir)

    outlier_finder = Outliers(options.dpi)
    outlier_finder.run(options.input_tree,
                       options.taxonomy_file,
                       output_dir,
                       options.plot_taxa_file,
                       options.plot_dist_taxa_only,
                       options.plot_domain,
                       options.trusted_taxa_file,
                       options.fixed_root,
                       options.min_children,
                       options.min_support,
                       options.verbose_table)

    self.logger.info('Done.')
def blast(self, options):
    """Infer gene tree using BLAST.

    Exits with status -1 when the ``'AUTO'`` protein model is requested
    together with a tree program other than ``'raxml'``.
    """

    inputs = (options.query_proteins,
              options.db_file,
              options.taxonomy_file)
    for input_file in inputs:
        check_file_exists(input_file)

    # sanity check arguments
    auto_model_without_raxml = (options.prot_model == 'AUTO'
                                and options.tree_program != 'raxml')
    if auto_model_without_raxml:
        self.logger.error("The 'AUTO' protein model can only be used with RAxML.")
        sys.exit(-1)

    workflow = BlastWorkflow(options.cpus)
    workflow.run(options.query_proteins,
                 options.db_file,
                 options.custom_db_file,
                 options.taxonomy_file,
                 options.custom_taxonomy_file,
                 options.evalue,
                 options.per_identity,
                 options.per_aln_len,
                 options.max_matches,
                 options.homology_search,
                 options.min_per_taxa,
                 options.consensus,
                 options.min_per_bp,
                 options.use_trimAl,
                 options.restrict_taxon,
                 options.msa_program,
                 options.tree_program,
                 options.prot_model,
                 options.skip_rooting,
                 options.output_dir)
def dereplicate(self, options):
    """Select representative genomes for named species.

    A ``GenomeTreeTkError`` raised during dereplication is printed and
    turned into ``SystemExit``.
    """

    inputs = (options.metadata_file,
              options.prev_rep_file,
              options.trusted_user_file)
    for input_file in inputs:
        check_file_exists(input_file)

    try:
        representatives = Representatives()
        representatives.dereplicate(options.metadata_file,
                                    options.prev_rep_file,
                                    options.exceptions_file,
                                    options.trusted_user_file,
                                    options.max_species,
                                    options.min_rep_comp,
                                    options.max_rep_cont,
                                    options.min_quality,
                                    options.max_contigs,
                                    options.min_N50,
                                    options.max_ambiguous,
                                    options.max_gap_length,
                                    options.strict_filtering,
                                    options.species_derep_file)
    except GenomeTreeTkError as e:
        print(e.message)
        raise SystemExit

    message = 'RefSeq representative genomes written to: %s' % options.species_derep_file
    self.logger.info(message)
def taxon_profile(self, options):
    """Run the taxon profile workflow over the protein files of genomes.

    Exits the program after a warning when any gene file fails the amino
    acid check.
    """

    make_sure_path_exists(options.output_dir)
    inputs = (options.scaffold_stats_file,
              options.taxonomy_file,
              options.db_file)
    for input_file in inputs:
        check_file_exists(input_file)

    gene_files = self._genome_files(options.genome_prot_dir, options.protein_ext)
    proteins_ok = self._check_protein_seqs(gene_files)
    if not proteins_ok:
        self.logger.warning('All files must contain amino acid sequences.')
        sys.exit()

    # build gene profile
    profiler = TaxonProfile(options.cpus, options.output_dir)
    profiler.run(gene_files,
                 options.scaffold_stats_file,
                 options.db_file,
                 options.taxonomy_file,
                 options.per_to_classify,
                 options.evalue,
                 options.per_identity,
                 options.per_aln_len,
                 options.tmpdir)

    message = 'Results written to: %s' % options.output_dir
    self.logger.info(message)
def rank_res(self, options):
    """Calculate taxonomic resolution at each rank.

    For every named internal node, the lowest rank in its label is
    counted against each rank prefix that occurs in the label. Genomes
    whose taxonomy is undefined at a rank (the field is just the bare
    rank prefix) are counted as species-level resolution for that rank.

    Parameters
    ----------
    options : object
        Command options. Reads ``input_tree``, ``taxonomy_file``,
        ``taxa_file`` (optional per-taxon report) and ``output_file``.
    """

    check_file_exists(options.input_tree)
    check_file_exists(options.taxonomy_file)

    taxa_out = None
    if options.taxa_file:
        taxa_out = open(options.taxa_file, 'w')

    # try/finally so the optional report is closed even if parsing fails
    try:
        if taxa_out is not None:
            taxa_out.write('Rank\tLowest Rank\tTaxon\n')

        # determine taxonomic resolution of named groups
        tree = dendropy.Tree.get_from_path(options.input_tree,
                                           schema='newick',
                                           rooting='force-rooted',
                                           preserve_underscores=True)

        rank_res = defaultdict(lambda: defaultdict(int))
        for node in tree.preorder_node_iter(lambda n: n != tree.seed_node):
            if not node.label or node.is_leaf():
                continue

            _support, taxon_name, _auxiliary_info = parse_label(node.label)
            if not taxon_name:
                continue

            lowest_rank = [x.strip() for x in taxon_name.split(';')][-1][0:3]
            for rank_prefix in Taxonomy.rank_prefixes:
                if rank_prefix in taxon_name:
                    rank_res[rank_prefix][lowest_rank] += 1
                    if taxa_out is not None:
                        rank_prefix_name = Taxonomy.rank_labels[Taxonomy.rank_index[rank_prefix]]
                        lowest_rank_name = Taxonomy.rank_labels[Taxonomy.rank_index[lowest_rank]]
                        taxa_out.write('%s\t%s\t%s\n' % (rank_prefix_name,
                                                         lowest_rank_name,
                                                         taxon_name))

        # identify any singleton taxa which are treated as having
        # species level resolution
        with open(options.taxonomy_file) as f:
            for line in f:
                if not line.strip():
                    continue

                # drop the line terminator first: otherwise the last rank
                # field keeps its newline and never equals the bare prefix
                line_split = line.rstrip('\r\n').split('\t')
                genome_id = line_split[0]
                taxonomy = [t.strip() for t in line_split[1].split(';')]

                for i, rank_prefix in enumerate(Taxonomy.rank_prefixes):
                    if taxonomy[i] == rank_prefix:
                        # this taxa is undefined at the specified rank so
                        # must be the sole representative; e.g., a p__
                        # indicates a taxon that represents a novel phyla
                        rank_res[rank_prefix]['s__'] += 1
                        if taxa_out is not None:
                            rank_prefix_name = Taxonomy.rank_labels[Taxonomy.rank_index[rank_prefix]]
                            taxa_out.write('%s\t%s\t%s (%s)\n' % (rank_prefix_name,
                                                                  'species',
                                                                  taxonomy[i],
                                                                  genome_id))
    finally:
        if taxa_out is not None:
            taxa_out.close()

    # write out results; rows are the lowest rank, columns the rank prefix
    with open(options.output_file, 'w') as fout:
        fout.write('Category')
        for rank in Taxonomy.rank_labels[1:]:
            fout.write('\t' + rank)
        fout.write('\n')

        for i, rank_prefix in enumerate(Taxonomy.rank_prefixes[1:]):
            fout.write(Taxonomy.rank_labels[i + 1])

            for j, r in enumerate(Taxonomy.rank_prefixes[1:]):
                if i >= j:
                    fout.write('\t' + str(rank_res[r].get(rank_prefix, 0)))
                else:
                    fout.write('\t-')
            fout.write('\n')

    self.logger.info('Done.')
def check_unique_strains(self, options):
    """Run the NCBI names/nodes parser over the metadata file.

    Checks the node, name and metadata files, then calls
    ``Tools.parse_ncbi_names_and_nodes`` (names file first, nodes file
    second).
    """

    inputs = (options.node,
              options.name,
              options.metadata_file)
    for input_file in inputs:
        check_file_exists(input_file)

    Tools().parse_ncbi_names_and_nodes(options.name,
                                       options.node,
                                       options.metadata_file,
                                       options.output_file)
def annoted_features(self, options):
    """Making annoted features matrix.

    For every feature abundance matrix listed in
    ``DefaultValues.FEATURES_ABUNDANCE_FILES``, sums the per-sample
    values of all features sharing an annotation and writes the result
    to the matching entry of ``DefaultValues.ANNOTATE_ABUNDANCE_FILES``
    inside ``options.features_dir``. Sample columns start at the fourth
    column of the input matrix.

    Parameters
    ----------
    options : object
        Command options. Reads ``features_annotation`` (feature id to
        annotation id, tab-separated), ``annotation_description``
        (annotation id and description, tab-separated), ``features_dir``
        and ``removed`` (skip annotations whose counts sum to zero).
    """

    # annotation ids already reported as missing, to warn only once
    missing = set()

    features2annotation = {}
    with open(options.features_annotation) as f:
        for line in f:
            features_id, annotation = line.rstrip().split('\t')
            features2annotation[features_id] = annotation

    counts = {}
    annotation_id_list = []
    with open(options.annotation_description) as f:
        for line in f:
            annotation_id, _description = line.rstrip().split('\t')
            annotation_id_list.append(annotation_id)
            counts[annotation_id] = {}
    annotation_id_list.append('hypothetical protein')
    counts['hypothetical protein'] = {}

    check_dir_exists(options.features_dir)
    input_matrices = DefaultValues.FEATURES_ABUNDANCE_FILES
    output_matrices = DefaultValues.ANNOTATE_ABUNDANCE_FILES

    for index, input_matrix in enumerate(input_matrices):
        input_matrix = os.path.join(options.features_dir, input_matrix)
        # parse the types from the file name only, so underscores in the
        # directory path cannot shift the fields
        count_type, abundance_type = os.path.basename(input_matrix).split('_')[1:3]

        check_file_exists(input_matrix)
        counts_all = {}
        header = []
        with open(input_matrix) as f:
            for line in f:
                line_list = line.rstrip().split('\t')
                if len(header) == 0:
                    # first line: reset the counters for every sample column
                    header = line_list
                    for sample in header[3:]:
                        for annotation_id in annotation_id_list:
                            counts[annotation_id][sample] = 0
                        counts_all[sample] = 0
                    continue

                features = line_list[0]
                annotation_id = features2annotation[features]
                if annotation_id not in counts:
                    if annotation_id not in missing:
                        self.logger.warning("'%s' not present in %s" % (annotation_id,
                                                                        options.annotation_description))
                        missing.add(annotation_id)
                    continue

                for i in range(3, len(header)):
                    sample = header[i]
                    value = float(line_list[i])
                    counts[annotation_id][sample] = counts[annotation_id][sample] + value
                    counts_all[sample] = counts_all[sample] + value

        output_matrix = os.path.join(options.features_dir, output_matrices[index])
        self.logger.info('Print %s %s abundance matrix in "%s"' % (count_type,
                                                                   abundance_type,
                                                                   output_matrix))

        # emit values in header order so each column lines up with its
        # sample name, and samples left over from a previous matrix are
        # not written; 'with' makes sure the file is flushed and closed
        samples = header[3:]
        with open(output_matrix, "w") as output_handle:
            output_handle.write('\t'.join(['Features'] + samples) + '\n')
            for annotation in annotation_id_list:
                values = [counts[annotation][s] for s in samples]
                if options.removed and sum(values) == 0:
                    continue
                output_handle.write('\t'.join([annotation] + [str(v) for v in values]) + '\n')

    self.logger.info('Printing matrices done')
def compare_red(self, options):
    """Compare RED values of taxa calculated over different trees.

    Reads two tab-separated RED tables (header line, then taxon, lineage and
    median RED in the first three columns) and writes one row per taxon found
    in either table to options.output_table with the columns: Taxon, Lineage,
    RED from table 1, RED from table 2, Difference, Abs. Difference and
    Changed rank. "Changed rank" reports whether the RED value from table 2
    lies more than 0.1 from the median RED of the taxon's own rank and is
    closer to the median of another rank.

    Parameters
    ----------
    options : object
        Must provide red_table1, red_table2, red_dict2 (file whose first line
        is a Python dict literal mapping rank label -> median RED),
        output_table and viral (selects viral rank labels and sorting).
    """

    check_file_exists(options.red_table1)
    check_file_exists(options.red_table2)
    check_file_exists(options.red_dict2)

    # NOTE(review): eval() executes whatever expression the file contains.
    # If red_dict2 can ever come from an untrusted source this should be
    # replaced with ast.literal_eval().
    with open(options.red_dict2) as f:
        median_reds = eval(f.readline())

    red1 = {}
    red2 = {}
    lineage = {}
    for red_values, red_file in [(red1, options.red_table1),
                                 (red2, options.red_table2)]:
        with open(red_file) as f:
            f.readline()  # skip header line
            for line in f:
                line_split = line.strip().split('\t')
                taxon = line_split[0]
                red_values[taxon] = float(line_split[2])

                # Lineages come from the first table only. Identity test: an
                # equality test would compare the full dict contents on
                # every line and could also match the second table.
                if red_values is red1:
                    lineage[taxon] = line_split[1]

    red1_label = os.path.splitext(os.path.basename(options.red_table1))[0]
    red2_label = os.path.splitext(os.path.basename(options.red_table2))[0]

    all_taxa = set(red1).union(red2)
    if options.viral:
        sorted_taxa = sort_viral_taxa(all_taxa)
    else:
        sorted_taxa = Taxonomy().sort_taxa(all_taxa)

    with open(options.output_table, 'w') as fout:
        fout.write(
            'Taxon\tLineage\t%s\t%s\tDifference\tAbs. Difference\tChanged rank\n'
            % (red1_label, red2_label))

        for taxon in sorted_taxa:
            r1 = red1.get(taxon, 'NA')
            r2 = red2.get(taxon, 'NA')

            if r1 == 'NA':
                # only in table 2; the rank column is appended below
                fout.write('%s\t%s\t%s\t%.3f\t%s\t%s' %
                           (taxon, 'NA', 'NA', r2, 'NA', 'NA'))
            elif r2 == 'NA':
                # only in table 1; no RED value to test, so the row is complete
                fout.write('%s\t%s\t%.3f\t%s\t%s\t%s\t%s\n' %
                           (taxon, lineage[taxon], r1, 'NA', 'NA', 'NA', 'NA'))
                continue
            else:
                fout.write('%s\t%s\t%.3f\t%.3f\t%.3f\t%.3f' %
                           (taxon, lineage[taxon], r1, r2, r1 - r2,
                            abs(r1 - r2)))

            rank_prefix = taxon[0:3]
            if rank_prefix == 'd__':
                # Domains are not tested for a rank change, but the row must
                # still be terminated or it runs into the next taxon's row.
                fout.write('\tNA\n')
                continue

            if options.viral:
                rank_label = VIRAL_RANK_LABELS[VIRAL_RANK_PREFIXES.index(
                    rank_prefix)]
            else:
                rank_label = Taxonomy.rank_labels[
                    Taxonomy.rank_prefixes.index(rank_prefix)]

            rank_median = median_reds[rank_label]
            closest_rank = rank_label
            closest_dist = 1e6
            if r2 < rank_median - 0.1 or r2 > rank_median + 0.1:
                # outside the expected interval: find the rank whose median
                # RED is nearest to the observed value
                for rank, median_red in median_reds.items():
                    dist = abs(r2 - median_red)
                    if dist < closest_dist:
                        closest_dist = dist
                        closest_rank = rank

            if rank_label != closest_rank:
                fout.write('\tTrue (%s: %.3f)' % (closest_rank, closest_dist))
            else:
                fout.write('\tFalse')
            fout.write('\n')
def u_synonyms(self, args):
    """Determine synonyms for validly or effectively published species.

    Passes every input path through check_file_exists(), makes sure the
    output directory exists and hands the paths to UpdateSynonyms.run().
    """

    # input files, in the positional order expected by UpdateSynonyms.run()
    input_files = (
        args.gtdb_clusters_file,
        args.cur_gtdb_metadata_file,
        args.uba_genome_paths,
        args.qc_passed_file,
        args.ncbi_misclassified_file,
        args.ncbi_genbank_assembly_file,
        args.untrustworthy_type_file,
        args.ani_af_rep_vs_nonrep,
        args.gtdb_type_strains_ledger,
        args.sp_priority_ledger,
        args.genus_priority_ledger,
        args.dsmz_bacnames_file,
    )
    for input_file in input_files:
        check_file_exists(input_file)
    make_sure_path_exists(args.output_dir)

    workflow = UpdateSynonyms(args.output_dir)
    workflow.run(*input_files)

    self.logger.info('Done.')
def u_ncbi_erroneous(self, args):
    """Identify genomes with erroneous NCBI species assignments.

    Passes every input path through check_file_exists(), makes sure the
    output directory exists and hands the paths to UpdateErroneousNCBI.run().
    """

    # input files, in the positional order expected by
    # UpdateErroneousNCBI.run()
    input_files = (
        args.gtdb_clusters_file,
        args.cur_gtdb_metadata_file,
        args.cur_genomic_path_file,
        args.uba_genome_paths,
        args.qc_passed_file,
        args.ncbi_genbank_assembly_file,
        args.untrustworthy_type_file,
        args.gtdb_type_strains_ledger,
        args.sp_priority_ledger,
        args.genus_priority_ledger,
        args.dsmz_bacnames_file,
    )
    for input_file in input_files:
        check_file_exists(input_file)
    make_sure_path_exists(args.output_dir)

    workflow = UpdateErroneousNCBI(args.ani_ncbi_erroneous,
                                   args.ani_cache_file,
                                   args.cpus,
                                   args.output_dir)
    workflow.run(*input_files)

    self.logger.info('Done.')
def u_genus_names(self, args):
    """Update genus names as a precursor to establishing binomial species names.

    Passes every input path through check_file_exists(), makes sure the
    output directory exists and hands the paths to UpdateGenusNames.run().
    """

    # input files, in the positional order expected by UpdateGenusNames.run()
    input_files = (
        args.gtdb_clusters_file,
        args.prev_gtdb_metadata_file,
        args.cur_gtdb_metadata_file,
        args.uba_genome_paths,
        args.qc_passed_file,
        args.gtdbtk_classify_file,
        args.ncbi_genbank_assembly_file,
        args.untrustworthy_type_file,
        args.gtdb_type_strains_ledger,
        args.sp_priority_ledger,
        args.gtdb_taxa_updates_ledger,
        args.dsmz_bacnames_file,
    )
    for input_file in input_files:
        check_file_exists(input_file)
    make_sure_path_exists(args.output_dir)

    workflow = UpdateGenusNames(args.output_dir)
    workflow.run(*input_files)

    self.logger.info('Done.')
def select_type_genomes(self, args):
    """Select representative genomes for named species.

    Passes the input paths through check_file_exists(), makes sure the
    output directory exists and runs SelectTypeGenomes. A GTDB_Error raised
    by the workflow is printed and turned into SystemExit.

    Parameters
    ----------
    args : object
        Must provide the input file attributes read below, plus
        ani_cache_file, cpus and output_dir. args.ltp_blast_file is passed
        to the workflow without an existence check.

    Raises
    ------
    SystemExit
        If the workflow raises GTDB_Error.
    """

    check_file_exists(args.qc_file)
    check_file_exists(args.gtdb_metadata_file)
    check_file_exists(args.genome_path_file)
    check_file_exists(args.prev_rep_file)
    check_file_exists(args.ncbi_refseq_assembly_file)
    check_file_exists(args.ncbi_genbank_assembly_file)
    check_file_exists(args.gtdb_domain_report)
    check_file_exists(args.species_exception_file)
    check_file_exists(args.gtdb_type_genome_file)
    make_sure_path_exists(args.output_dir)

    try:
        p = SelectTypeGenomes(args.ani_cache_file,
                              args.cpus,
                              args.output_dir)
        p.run(args.qc_file,
              args.gtdb_metadata_file,
              args.ltp_blast_file,
              args.genome_path_file,
              args.prev_rep_file,
              args.ncbi_refseq_assembly_file,
              args.ncbi_genbank_assembly_file,
              args.gtdb_domain_report,
              args.species_exception_file,
              args.gtdb_type_genome_file)
    except GTDB_Error as e:
        # Python 3 exceptions have no built-in `message` attribute; use it
        # when GTDB_Error defines one and fall back to str(e) otherwise, so
        # the handler cannot itself fail with AttributeError.
        print(getattr(e, 'message', str(e)))
        raise SystemExit

    self.logger.info('GTDB type genomes written to: %s' % args.output_dir)
def pmc_check_type_strains(self, args):
    """Check for agreement between GTDB species and genomes assembled from type strain of species.

    Passes every input path through check_file_exists(), makes sure the
    output directory exists and hands the paths to
    PMC_CheckTypeStrains.run().
    """

    # input files, in the positional order expected by
    # PMC_CheckTypeStrains.run()
    input_files = (
        args.manual_taxonomy,
        args.cur_gtdb_metadata_file,
        args.uba_genome_paths,
        args.qc_passed_file,
        args.ncbi_genbank_assembly_file,
        args.untrustworthy_type_file,
        args.synonym_file,
        args.gtdb_type_strains_ledger,
        args.sp_priority_ledger,
        args.genus_priority_ledger,
        args.dsmz_bacnames_file,
    )
    for input_file in input_files:
        check_file_exists(input_file)
    make_sure_path_exists(args.output_dir)

    workflow = PMC_CheckTypeStrains(args.output_dir)
    workflow.run(*input_files)

    self.logger.info('Done.')
def u_species_init(self, args):
    """Produce initial best guess at GTDB species clusters.

    Passes every input path through check_file_exists(), makes sure the
    output directory exists and hands the paths to UpdateSpeciesInit.run().
    """

    # input files, in the positional order expected by UpdateSpeciesInit.run()
    input_files = (
        args.gtdb_clusters_file,
        args.prev_gtdb_metadata_file,
        args.prev_genomic_path_file,
        args.cur_gtdb_metadata_file,
        args.cur_genomic_path_file,
        args.uba_genome_paths,
        args.qc_passed_file,
        args.gtdbtk_classify_file,
        args.ncbi_genbank_assembly_file,
        args.untrustworthy_type_file,
        args.synonym_file,
        args.gtdb_type_strains_ledger,
        args.sp_priority_ledger,
        args.genus_priority_ledger,
        args.gtdb_taxa_updates_ledger,
        args.dsmz_bacnames_file,
    )
    for input_file in input_files:
        check_file_exists(input_file)
    make_sure_path_exists(args.output_dir)

    workflow = UpdateSpeciesInit(args.ani_cache_file,
                                 args.cpus,
                                 args.output_dir)
    workflow.run(*input_files)

    self.logger.info('Done.')
def pmc_validate(self, args):
    """Validate final species names.

    Passes every input path through check_file_exists(), makes sure the
    output directory exists and hands the paths, followed by
    args.skip_genus_checks, to PMC_Validation.run().
    """

    # input files, in the positional order expected by PMC_Validation.run()
    input_files = (
        args.final_taxonomy,
        args.final_scaled_tree,
        args.manual_sp_names,
        args.pmc_custom_species,
        args.gtdb_clusters_file,
        args.prev_gtdb_metadata_file,
        args.cur_gtdb_metadata_file,
        args.uba_genome_paths,
        args.qc_passed_file,
        args.ncbi_misclassified_file,
        args.ncbi_genbank_assembly_file,
        args.untrustworthy_type_file,
        args.synonym_file,
        args.updated_species_reps,
        args.gtdb_type_strains_ledger,
        args.species_classification_ledger,
        args.sp_priority_ledger,
        args.genus_priority_ledger,
        args.specific_epithet_ledger,
        args.dsmz_bacnames_file,
        args.ground_truth_test_cases,
    )
    for input_file in input_files:
        check_file_exists(input_file)
    make_sure_path_exists(args.output_dir)

    workflow = PMC_Validation(args.output_dir)
    # the flag is the final positional argument, after all file paths
    workflow.run(*input_files, args.skip_genus_checks)

    self.logger.info('Done.')
def u_summary_stats(self, args):
    """Summary statistics indicating changes to GTDB species clusters.

    Passes every input path through check_file_exists(), makes sure the
    output directory exists and hands the paths to UpdateSummaryStats.run().
    """

    # input files, in the positional order expected by
    # UpdateSummaryStats.run()
    input_files = (
        args.updated_sp_rep_file,
        args.gtdb_clusters_file,
        args.prev_gtdb_metadata_file,
        args.cur_gtdb_metadata_file,
        args.uba_genome_paths,
        args.qc_passed_file,
        args.gtdbtk_classify_file,
        args.ncbi_genbank_assembly_file,
        args.untrustworthy_type_file,
        args.synonym_file,
        args.gtdb_type_strains_ledger,
    )
    for input_file in input_files:
        check_file_exists(input_file)
    make_sure_path_exists(args.output_dir)

    workflow = UpdateSummaryStats(args.output_dir)
    workflow.run(*input_files)

    self.logger.info('Done.')
def rep_compare(self, args):
    """Compare current and previous representatives.

    Reads two metadata CSV files whose header row contains the columns
    'gtdb_representative' and 'gtdb_taxonomy' and whose first column is the
    genome ID, then prints summary counts of representatives, species and
    genera that were gained or lost between the two files.

    A row is treated as a representative when its 'gtdb_representative'
    value is 't'. Rows with an empty 'gtdb_taxonomy' contribute only their
    genome ID.

    Parameters
    ----------
    args : object
        Must provide cur_metadata_file and prev_metadata_file.

    Raises
    ------
    ValueError
        If a required column is missing from a header row.
    """

    check_file_exists(args.cur_metadata_file)
    check_file_exists(args.prev_metadata_file)

    # get representatives in current taxonomy
    cur_gids = set()
    cur_species = set()
    cur_genera = set()
    cur_reps_taxa = {}
    cur_rep_species = set()
    cur_rep_genera = set()

    # newline='' is the csv module's documented way to open its input;
    # the with-block guarantees the handle is closed
    with open(args.cur_metadata_file, newline='') as f:
        reader = csv.reader(f)
        header = next(reader, None)  # None for an empty file
        if header is not None:
            gtdb_rep_index = header.index('gtdb_representative')
            gtdb_taxonomy_index = header.index('gtdb_taxonomy')

        for row in reader:
            gid = row[0]
            cur_gids.add(gid)

            gtdb_taxonomy = row[gtdb_taxonomy_index]
            if not gtdb_taxonomy:
                continue

            # index 5 is the genus entry, index 6 the species entry
            gtdb_taxa = [t.strip() for t in gtdb_taxonomy.split(';')]
            if gtdb_taxa[6] != 's__':
                cur_species.add(gtdb_taxa[6])
            if gtdb_taxa[5] != 'g__':
                cur_genera.add(gtdb_taxa[5])

            if row[gtdb_rep_index] == 't':
                cur_reps_taxa[gid] = gtdb_taxa
                if gtdb_taxa[6] != 's__':
                    cur_rep_species.add(gtdb_taxa[6])
                if gtdb_taxa[5] != 'g__':
                    cur_rep_genera.add(gtdb_taxa[5])

    # get representatives in previous taxonomy
    prev_reps_taxa = {}
    prev_rep_species = set()
    prev_rep_genera = set()
    with open(args.prev_metadata_file, newline='') as f:
        reader = csv.reader(f)
        header = next(reader, None)
        if header is not None:
            gtdb_rep_index = header.index('gtdb_representative')
            gtdb_taxonomy_index = header.index('gtdb_taxonomy')

        for row in reader:
            if row[gtdb_rep_index] != 't':
                continue

            gid = row[0]
            gtdb_taxonomy = row[gtdb_taxonomy_index]
            if gtdb_taxonomy:
                gtdb_taxa = [t.strip() for t in gtdb_taxonomy.split(';')]
                prev_reps_taxa[gid] = gtdb_taxa
                if gtdb_taxa[6] != 's__':
                    prev_rep_species.add(gtdb_taxa[6])
                if gtdb_taxa[5] != 'g__':
                    prev_rep_genera.add(gtdb_taxa[5])

    # summarize differences
    print('No. current representatives: %d' % len(cur_reps_taxa))
    print('No. previous representatives: %d' % len(prev_reps_taxa))

    print('')
    print('No. current species with representatives: %d' %
          len(cur_rep_species))
    print('No. previous species with representatives: %d' %
          len(prev_rep_species))

    print('')
    print('No. new representatives: %d' %
          len(set(cur_reps_taxa) - set(prev_reps_taxa)))
    print('No. retired representatives: %d' %
          len(set(prev_reps_taxa) - set(cur_reps_taxa)))

    print('')
    print('No. new species with representative: %d' %
          len(cur_rep_species - prev_rep_species))
    print('No. new genera with representative: %d' %
          len(cur_rep_genera - prev_rep_genera))

    # species still present in the current taxonomy that previously had a
    # representative but no longer do
    print('')
    missing_sp_reps = prev_rep_species.intersection(
        cur_species) - cur_rep_species
    print('No. species that no longer have a representative: %d' %
          len(missing_sp_reps))
    for sp in missing_sp_reps:
        print(' ' + sp)

    print('')
    missing_genera_reps = prev_rep_genera.intersection(
        cur_genera) - cur_rep_genera
    print('No. genera that no longer have a representative: %d' %
          len(missing_genera_reps))
    for g in missing_genera_reps:
        print(' ' + g)

    # previous representatives that are still in the current metadata file
    # but are no longer representatives
    print('')
    deprecated_reps = set(prev_reps_taxa).intersection(cur_gids) - set(
        cur_reps_taxa)
    print('No. deprecated previous representatives: %d' %
          len(deprecated_reps))
def u_cluster_named_reps(self, args):
    """Cluster genomes to selected GTDB representatives.

    Passes every input path through check_file_exists(), makes sure the
    output directory exists and hands the paths to
    UpdateClusterNamedReps.run().
    """

    # input files, in the positional order expected by
    # UpdateClusterNamedReps.run()
    input_files = (
        args.named_rep_file,
        args.cur_gtdb_metadata_file,
        args.cur_genomic_path_file,
        args.uba_genome_paths,
        args.qc_passed_file,
        args.ncbi_genbank_assembly_file,
        args.untrustworthy_type_file,
        args.rep_mash_sketch_file,
        args.rep_ani_file,
        args.gtdb_type_strains_ledger,
    )
    for input_file in input_files:
        check_file_exists(input_file)
    make_sure_path_exists(args.output_dir)

    workflow = UpdateClusterNamedReps(args.ani_sp,
                                      args.af_sp,
                                      args.ani_cache_file,
                                      args.cpus,
                                      args.output_dir)
    workflow.run(*input_files)

    self.logger.info('Done.')
def u_cluster_de_novo(self, args):
    """Infer de novo species clusters and representatives for remaining genomes.

    Passes every input path through check_file_exists(), makes sure the
    output directory exists and hands the paths to UpdateClusterDeNovo.run().
    """

    # input files, in the positional order expected by
    # UpdateClusterDeNovo.run()
    input_files = (
        args.named_cluster_file,
        args.cur_gtdb_metadata_file,
        args.cur_genomic_path_file,
        args.uba_genome_paths,
        args.qc_passed_file,
        args.gtdbtk_classify_file,
        args.ncbi_genbank_assembly_file,
        args.untrustworthy_type_file,
        args.ani_af_rep_vs_nonrep,
        args.gtdb_type_strains_ledger,
    )
    for input_file in input_files:
        check_file_exists(input_file)
    make_sure_path_exists(args.output_dir)

    workflow = UpdateClusterDeNovo(args.ani_sp,
                                   args.af_sp,
                                   args.ani_cache_file,
                                   args.cpus,
                                   args.output_dir)
    workflow.run(*input_files)

    self.logger.info('Done.')
def pmc_species_names(self, args):
    """Establish final species names based on manual curation.

    Passes every input path through check_file_exists(), makes sure the
    output directory exists and hands the paths to PMC_SpeciesNames.run().
    """

    # input files, in the positional order expected by PMC_SpeciesNames.run()
    input_files = (
        args.manual_taxonomy,
        args.manual_sp_names,
        args.pmc_custom_species,
        args.gtdb_clusters_file,
        args.prev_gtdb_metadata_file,
        args.cur_gtdb_metadata_file,
        args.uba_genome_paths,
        args.qc_passed_file,
        args.ncbi_misclassified_file,
        args.ncbi_genbank_assembly_file,
        args.untrustworthy_type_file,
        args.synonym_file,
        args.updated_species_reps,
        args.gtdb_type_strains_ledger,
        args.species_classification_ledger,
        args.sp_priority_ledger,
        args.genus_priority_ledger,
        args.specific_epithet_ledger,
        args.dsmz_bacnames_file,
    )
    for input_file in input_files:
        check_file_exists(input_file)
    make_sure_path_exists(args.output_dir)

    workflow = PMC_SpeciesNames(args.output_dir)
    workflow.run(*input_files)

    self.logger.info('Done.')
def propagate(self, options):
    """Propagate labels to all genomes in a cluster.

    Reads the taxonomy of representative genomes from
    options.input_taxonomy and writes a 'genome ID<TAB>taxonomy' line to
    options.output_taxonomy for each representative and for every genome
    listed in its 'gtdb_clustered_genomes' metadata field that is also
    present in the metadata file.

    Parameters
    ----------
    options : object
        Must provide input_taxonomy, metadata_file, output_taxonomy and
        uba_mapping_file (optional two-column TSV; when given, genome IDs
        found in its first column are replaced by the second column).

    Raises
    ------
    SystemExit
        If a representative in the metadata has no entry in the input
        taxonomy, or the input taxonomy contains a genome that is not a
        representative.
    KeyError
        If the input taxonomy contains a genome absent from the metadata.
    """

    check_file_exists(options.input_taxonomy)
    check_file_exists(options.metadata_file)

    user_to_uba = {}
    if options.uba_mapping_file:
        self.logger.info('Parsing genome ID mapping file.')
        with open(options.uba_mapping_file) as f:
            for line in f:
                tokens = line.strip().split('\t')
                if len(tokens) == 2:
                    user_to_uba[tokens[0]] = tokens[1]
        self.logger.info(' - found mappings for {:,} genomes.'.format(
            len(user_to_uba)))

    # get representative genome information, keyed by canonical genome ID
    # with any ID mapping applied
    rep_metadata = read_gtdb_metadata(
        options.metadata_file,
        ['gtdb_representative', 'gtdb_clustered_genomes'])
    rep_metadata = {
        canonical_gid(gid): values
        for gid, values in rep_metadata.items()
    }
    rep_metadata = {
        user_to_uba.get(gid, gid): values
        for gid, values in rep_metadata.items()
    }

    explict_tax = Taxonomy().read(options.input_taxonomy)
    self.logger.info(f' - identified {len(rep_metadata):,} genomes')

    # sanity check all representatives have a taxonomy string
    rep_count = 0
    for gid, (is_rep_genome, _clustered_genomes) in rep_metadata.items():
        if is_rep_genome:
            rep_count += 1
            if gid not in explict_tax:
                self.logger.error(
                    'Expected to find {} in input taxonomy as it is a GTDB representative.'
                    .format(gid))
                sys.exit(-1)

    self.logger.info(
        'Identified {:,} representatives in metadata file and {:,} genomes in input taxonomy file.'
        .format(rep_count, len(explict_tax)))

    # propagate taxonomy to genomes clustered with each representative;
    # the with-block flushes and closes the output even when sys.exit()
    # is triggered part-way through
    with open(options.output_taxonomy, 'w') as fout:
        for rid, taxon_list in explict_tax.items():
            taxonomy_str = ';'.join(taxon_list)
            rid = canonical_gid(rid)
            rid = user_to_uba.get(rid, rid)

            is_rep_genome, clustered_genomes = rep_metadata[rid]
            if not is_rep_genome:
                self.logger.error(
                    'Did not expected to find {} in input taxonomy as it is not a GTDB representative.'
                    .format(rid))
                sys.exit(-1)

            # assign taxonomy to representative and all genomes in the cluster
            fout.write('{}\t{}\n'.format(rid, taxonomy_str))
            for cid in [gid.strip() for gid in clustered_genomes.split(';')]:
                cid = canonical_gid(cid)
                cid = user_to_uba.get(cid, cid)
                if cid == rid:
                    # the representative itself was written above
                    continue

                if cid in rep_metadata:
                    fout.write('{}\t{}\n'.format(cid, taxonomy_str))
                else:
                    self.logger.warning(
                        'Skipping {} as it is not in GTDB metadata file.'
                        .format(cid))

    self.logger.info('Taxonomy written to: {}'.format(
        options.output_taxonomy))