Example #1
    def call_genes(self, options):
        """Call genes command"""

        make_sure_path_exists(options.output_dir)
        
        genome_files = self._input_files(options.input_genomes, options.file_ext)

        prodigal = Prodigal(options.cpus, not options.silent)
        summary_stats = prodigal.run(genome_files, 
                                        options.output_dir, 
                                        called_genes=False, 
                                        translation_table=options.force_table, 
                                        meta=False,
                                        closed_ends=True)

        # write gene calling summary
        fout = open(os.path.join(options.output_dir, 'call_genes.summary.tsv'), 'w')
        fout.write('Genome Id\tSelected translation table\tTable 4 coding density\tTable 11 coding density\n')
        for genome_id, stats in summary_stats.items():
            fout.write('%s\t%d\t%.2f%%\t%.2f%%\n' % (genome_id,
                                                     stats.best_translation_table,
                                                     stats.coding_density_4,
                                                     stats.coding_density_11))
        fout.close()

        self.logger.info('Identified genes written to: %s' % options.output_dir)
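
Every example below guards its inputs and outputs with the same small helpers (make_sure_path_exists, check_file_exists, check_dir_exists) from a shared biolib-style utility module. As a point of reference, here is a minimal sketch of what such helpers typically do; the exact messages and exit behavior in the original library may differ.

    import errno
    import os
    import sys

    def make_sure_path_exists(path):
        """Create the directory if it does not already exist."""
        if not path:
            return  # nothing to create for an empty path
        try:
            os.makedirs(path)
        except OSError as e:
            # tolerate the directory already existing (e.g., created by
            # another process between the check and the makedirs call)
            if e.errno != errno.EEXIST:
                print('Specified path could not be created: ' + path)
                sys.exit(1)

    def check_file_exists(input_file):
        """Exit with an error message if the given file is missing."""
        if not os.path.exists(input_file) or not os.path.isfile(input_file):
            print('Input file does not exist: ' + input_file)
            sys.exit(1)
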
Example #2
    def gene_profile(self, options):
        """Call genes command"""
        self.logger.info('')
        self.logger.info('*******************************************************************************')
        self.logger.info(' [RefineM - gene_profile] Generating taxonomic profiles from genes.')
        self.logger.info('*******************************************************************************')

        make_sure_path_exists(options.output_dir)
        check_file_exists(options.scaffold_stats_file)
        check_file_exists(options.taxonomy_file)
        check_file_exists(options.db_file)

        gene_files = self._genome_files(options.genome_prot_dir, options.protein_ext)
        if not self._check_protein_seqs(gene_files):
            self.logger.warning('[Warning] All files must contain amino acid sequences.')
            sys.exit()

        # build gene profile
        gene_profile = GeneProfile(options.cpus, options.output_dir)
        gene_profile.run(gene_files,
                            options.scaffold_stats_file,
                            options.db_file,
                            options.taxonomy_file,
                            options.per_to_classify,
                            options.evalue,
                            options.per_identity)

        self.logger.info('')
        self.logger.info('  Results written to: %s' % options.output_dir)

        self.time_keeper.print_time_stamp()
Example #3
    def reference(self, options):
        """Reference command"""
        self.logger.info('')
        self.logger.info('*******************************************************************************')
        self.logger.info('[RefineM - reference] Identifying scaffolds similar to specific genome(s).')
        self.logger.info('*******************************************************************************')

        check_file_exists(options.scaffold_prot_file)
        check_file_exists(options.scaffold_stats_file)
        make_sure_path_exists(options.output_dir)

        ref_gene_files = self._genome_files(options.ref_genome_prot_dir, options.protein_ext)
        if not self._check_protein_seqs(ref_gene_files):
            self.logger.warning('[Warning] All files must contain amino acid sequences.')
            sys.exit()

        reference = Reference(options.cpus, options.output_dir)
        reference_out = reference.run(options.scaffold_prot_file,
                                        options.scaffold_stats_file,
                                        ref_gene_files,
                                        options.db_file,
                                        options.evalue,
                                        options.per_identity)

        self.logger.info('')
        self.logger.info('  Results written to: ' + reference_out)

        self.time_keeper.print_time_stamp()
Example #4
    def cluster(self, options):
        """Cluster command"""
        self.logger.info('')
        self.logger.info('*******************************************************************************')
        self.logger.info(' [RefineM - cluster] Partitioning bin into clusters.')
        self.logger.info('*******************************************************************************')

        check_file_exists(options.scaffold_stats_file)
        check_file_exists(options.genome_file)
        make_sure_path_exists(options.output_dir)

        self.logger.info('')
        self.logger.info('  Reading scaffold statistics.')
        scaffold_stats = ScaffoldStats()
        scaffold_stats.read(options.scaffold_stats_file)

        cluster = Cluster(options.cpus)
        cluster.run(scaffold_stats,
                    options.num_clusters,
                    options.num_components,
                    options.K,
                    options.no_coverage,
                    options.no_pca,
                    options.iterations,
                    options.genome_file,
                    options.output_dir)

        self.logger.info('')
        self.logger.info('  Partitioned sequences written to: ' + options.output_dir)

        self.time_keeper.print_time_stamp()
Example #5
    def call_genes(self, options):
        """Call genes command"""
        self.logger.info('')
        self.logger.info('*******************************************************************************')
        self.logger.info(' [CompareM - call_genes] Identifying genes within genomes.')
        self.logger.info('*******************************************************************************')

        make_sure_path_exists(options.output_dir)

        genome_files = self._genome_files(options.genome_dir, options.genome_ext)
        if not genome_files:
            self.logger.warning('  [Warning] No genome files found. Check the --genome_ext flag used to identify genomes.')
            sys.exit()

        prodigal = Prodigal(options.cpus)
        summary_stats = prodigal.run(genome_files, False, options.force_table, False, options.output_dir)

        # write gene calling summary
        fout = open(os.path.join(options.output_dir, 'call_genes.summary.tsv'), 'w')
        fout.write('Genome Id\tSelected translation table\tTable 4 coding density\tTable 11 coding density\n')
        for genome_id, stats in summary_stats.items():
            fout.write('%s\t%d\t%.2f%%\t%.2f%%\n' % (genome_id,
                                                     stats.best_translation_table,
                                                     stats.coding_density_4,
                                                     stats.coding_density_11))
        fout.close()

        self.logger.info('')
        self.logger.info('  Identified genes written to: %s' % options.output_dir)

        self.time_keeper.print_time_stamp()
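
Handlers like call_genes receive a parsed argparse namespace, so they can also be exercised directly from a script or test by hand-building the options object. A minimal sketch, assuming the attribute names used in Example #5 and an already constructed OptionsParser-style instance (here hypothetically named cli):

    import argparse

    options = argparse.Namespace(
        genome_dir='genomes/',            # directory containing input genomes
        genome_ext='fna',                 # extension identifying genome files
        force_table=None,                 # let Prodigal select the translation table
        cpus=4,
        output_dir='output/called_genes')

    cli.call_genes(options)               # writes call_genes.summary.tsv to output_dir
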
Example #6
    def call_genes(self, options):
        """Call genes command"""
        self.logger.info('')
        self.logger.info('*******************************************************************************')
        self.logger.info(' [RefineM - call_genes] Identifying genes within genomes.')
        self.logger.info('*******************************************************************************')

        check_dir_exists(options.genome_nt_dir)
        make_sure_path_exists(options.output_dir)

        genome_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
        if not self._check_nuclotide_seqs(genome_files):
            self.logger.warning('[Warning] All files must contain nucleotide sequences.')
            sys.exit()

        # call genes in genomes
        prodigal = Prodigal(options.cpus)
        prodigal.run(genome_files, options.output_dir)
        self.logger.info('  Genes in genomes written to: %s' % options.output_dir)

        # call genes in unbinned scaffolds
        if options.unbinned_file:
            unbinned_output_dir = os.path.join(options.output_dir, 'unbinned')
            prodigal.run([options.unbinned_file], unbinned_output_dir, meta=True)
            self.logger.info('  Genes in unbinned scaffolds written to: %s' % unbinned_output_dir)

        self.time_keeper.print_time_stamp()
Example #7
    def lsu_tree(self, options):
        """Infer 23S tree spanning GTDB genomes."""

        check_dependencies(['esl-sfetch', 'cmsearch', 'cmalign', 'esl-alimask', 'FastTreeMP', 'blastn'])

        check_file_exists(options.gtdb_metadata_file)
        check_file_exists(options.gtdb_lsu_file)
        make_sure_path_exists(options.output_dir)

        rna_workflow = RNA_Workflow(options.cpus)
        rna_workflow.run('lsu',
                            options.gtdb_metadata_file,
                            options.gtdb_lsu_file,
                            options.min_lsu_length,
                            options.min_scaffold_length,
                            options.min_quality,
                            options.max_contigs,
                            options.min_N50,
                            not options.disable_tax_filter,
                            #options.reps_only,
                            #options.user_genomes,
                            options.genome_list,
                            options.output_dir)

        self.logger.info('Results written to: %s' % options.output_dir)
Example #8
    def select_type_genomes(self, options):
        """Select representative genomes for named species."""

        check_file_exists(options.qc_file)
        check_file_exists(options.gtdb_metadata_file)
        check_file_exists(options.genome_path_file)
        check_file_exists(options.prev_rep_file)
        check_file_exists(options.ncbi_refseq_assembly_file)
        check_file_exists(options.ncbi_genbank_assembly_file)
        check_file_exists(options.gtdb_domain_report)
        check_file_exists(options.species_exception_file)
        check_file_exists(options.gtdb_type_genome_file)
        make_sure_path_exists(options.output_dir)

        try:
            p = SelectTypeGenomes(options.ani_cache_file, options.cpus, options.output_dir)
            p.run(options.qc_file,
                        options.gtdb_metadata_file,
                        options.ltp_blast_file,
                        options.genome_path_file,
                        options.prev_rep_file,
                        options.ncbi_refseq_assembly_file,
                        options.ncbi_genbank_assembly_file,
                        options.gtdb_domain_report,
                        options.species_exception_file,
                        options.gtdb_type_genome_file)
        except GenomeTreeTkError as e:
            print(e)
            raise SystemExit

        self.logger.info('GTDB type genomes written to: %s' % options.output_dir)
Example #9
    def cluster_named_types(self, options):
        """Cluster genomes to selected GTDB type genomes."""

        check_file_exists(options.qc_file)
        check_file_exists(options.gtdb_metadata_file)
        check_file_exists(options.genome_path_file)
        check_file_exists(options.named_type_genome_file)
        check_file_exists(options.type_genome_ani_file)
        check_file_exists(options.species_exception_file)
        make_sure_path_exists(options.output_dir)

        try:
            p = ClusterNamedTypes(options.ani_sp,
                                    options.af_sp,
                                    options.ani_cache_file, 
                                    options.cpus,
                                    options.output_dir)
            p.run(options.qc_file,
                    options.gtdb_metadata_file,
                    options.genome_path_file,
                    options.named_type_genome_file,
                    options.type_genome_ani_file,
                    options.mash_sketch_file,
                    options.species_exception_file)
        except GenomeTreeTkError as e:
            print(e)
            raise SystemExit

        self.logger.info('Clustering results written to: %s' % options.output_dir)
Example #10
    def gene(self, options):
        self.logger.info('Calculating gene properties of genome.')

        check_file_exists(options.genome_file)
        check_file_exists(options.gff_file)
        make_sure_path_exists(options.output_dir)

        meta_genes = MetadataGenes()
        metadata_values, metadata_desc = meta_genes.generate(options.genome_file,
                                                                options.gff_file)

        # write statistics to file
        output_file = os.path.join(options.output_dir, 'metadata.genome_gene.tsv')
        fout = open(output_file, 'w')
        for field in sorted(metadata_values.keys()):
            fout.write('%s\t%s\n' % (field, str(metadata_values[field])))
        fout.close()

        # write description to file
        output_file = os.path.join(options.output_dir, 'metadata.genome_gene.desc.tsv')
        fout = open(output_file, 'w')
        for field in sorted(metadata_desc.keys()):
            fout.write('%s\t%s\t%s\n' % (field,
                                         metadata_desc[field],
                                         type(metadata_values[field]).__name__.upper()))
        fout.close()
Example #11
    def ani(self, options):
        """ANI command"""

        make_sure_path_exists(options.output_dir)

        genome_files = self._genome_files(options.genome_dir, options.file_ext)

        self.logger.info('Average nucleotide identity information written to: %s' % options.output_dir)
Example #12
    def scaffold_stats(self, options):
        """Scaffold statistics command"""
        self.logger.info('')
        self.logger.info('*******************************************************************************')
        self.logger.info(' [RefineM - scaffold_stats] Calculating statistics for scaffolds.')
        self.logger.info('*******************************************************************************')

        check_file_exists(options.scaffold_file)

        if not self._check_nuclotide_seqs([options.scaffold_file]):
            self.logger.warning('[Warning] Scaffold file must contain nucleotide sequences.')
            sys.exit()

        genome_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
        if not self._check_nuclotide_seqs(genome_files):
            self.logger.warning('[Warning] All files must contain nucleotide sequences.')
            sys.exit()

        make_sure_path_exists(options.output_dir)

        # get coverage information
        if not options.coverage_file:
            if not options.bam_files:
                self.logger.warning('\n  [Warning] One or more BAM files must be specified in order to calculate coverage profiles.')
                coverage_file = None
            else:
                coverage = Coverage(options.cpus)
                coverage_file = os.path.join(options.output_dir, 'coverage.tsv')
                coverage.run(options.bam_files, coverage_file, options.cov_all_reads, options.cov_min_align, options.cov_max_edit_dist)
                self.logger.info('')
                self.logger.info('  Coverage profiles written to: %s' % coverage_file)
        else:
            coverage_file = options.coverage_file

        # get tetranucleotide signatures
        if not options.tetra_file:
            self.logger.info('')
            tetra = Tetranucleotide(options.cpus)
            tetra_file = os.path.join(options.output_dir, 'tetra.tsv')
            signatures = tetra.run(options.scaffold_file)
            tetra.write(signatures, tetra_file)
            self.logger.info('  Tetranucleotide signatures written to: %s' % tetra_file)
        else:
            tetra_file = options.tetra_file

        # write out scaffold statistics
        stats_output = os.path.join(options.output_dir, 'scaffold_stats.tsv')
        stats = ScaffoldStats(options.cpus)
        stats.run(options.scaffold_file, genome_files, tetra_file, coverage_file, stats_output)

        self.logger.info('  Scaffold statistics written to: %s' % stats_output)

        self.time_keeper.print_time_stamp()
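
The tetranucleotide signatures above come from RefineM's Tetranucleotide class. To illustrate the underlying idea only (this is not the library's implementation), a minimal sketch that counts canonical 4-mers, i.e., a k-mer and its reverse complement share one count, and normalizes the counts to frequencies:

    import itertools
    from collections import Counter

    def tetra_signature(seq):
        """Return normalized canonical 4-mer frequencies for one sequence."""
        complement = str.maketrans('ACGT', 'TGCA')

        def canonical(kmer):
            rc = kmer.translate(complement)[::-1]
            return min(kmer, rc)  # one representative per reverse-complement pair

        counts = Counter()
        seq = seq.upper()
        for i in range(len(seq) - 3):
            kmer = seq[i:i + 4]
            if set(kmer) <= set('ACGT'):  # skip ambiguous bases such as N
                counts[canonical(kmer)] += 1

        total = sum(counts.values()) or 1
        kmers = sorted({canonical(''.join(p))
                        for p in itertools.product('ACGT', repeat=4)})
        return [counts[k] / total for k in kmers]
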
Example #13
    def modify(self, options):
        """Modify command"""
        self.logger.info('')
        self.logger.info('*******************************************************************************')
        self.logger.info(' [RefineM - modify] Modifying scaffolds in genome.')
        self.logger.info('*******************************************************************************')

        make_sure_path_exists(os.path.dirname(options.output_genome))

        if not (options.add or options.remove or options.outlier_file or options.compatible_file):
            self.logger.warning('  [Warning] No modification to bin requested.\n')
            sys.exit()

        if (options.add or options.remove) and (options.outlier_file or options.compatible_file):
            self.logger.warning("  [Warning] The 'outlier_file' and 'compatible_file' options cannot be specified with 'add' or 'remove'.\n")
            sys.exit()

        if options.outlier_file and options.compatible_file:
            self.logger.warning("  [Warning] The 'outlier_file' and 'compatible_file' options cannot be specified at the same time.\n")
            sys.exit()

        failed_to_add = []
        failed_to_remove = []
        if options.add or options.remove:
            failed_to_add, failed_to_remove = genome_tk.modify(options.genome_file,
                                                               options.scaffold_file,
                                                               options.add,
                                                               options.remove,
                                                               options.output_genome)
        elif options.outlier_file:
            outliers = Outliers()
            outliers.remove_outliers(options.genome_file, options.outlier_file, options.output_genome)
        elif options.compatible_file:
            outliers = Outliers()
            if options.unique_only:
                outliers.add_compatible_unique(options.scaffold_file, options.genome_file, options.compatible_file, options.output_genome)
            else:
                outliers.add_compatible_closest(options.scaffold_file, options.genome_file, options.compatible_file, options.output_genome)

        if failed_to_add:
            self.logger.warning('  [Warning] Failed to add the following sequence(s):')
            for seq_id in failed_to_add:
                self.logger.warning('    %s' % seq_id)

        if failed_to_remove:
            self.logger.warning('  [Warning] Failed to remove the following sequence(s):')
            for seq_id in failed_to_remove:
                self.logger.warning('    %s' % seq_id)

        self.logger.info('')
        self.logger.info('  Modified genome written to: ' + options.output_genome)

        self.time_keeper.print_time_stamp()
Example #14
    def rd_ranks(self, options):
        """Calculate number of taxa for specified rd thresholds."""

        check_file_exists(options.input_tree)
        make_sure_path_exists(options.output_dir)

        r = RdRanks()
        r.run(options.input_tree,
                options.thresholds,
                options.output_dir)

        self.logger.info('Done.')
Example #15
    def run(self, input_tree, msa_file, outgroup_file, perc_taxa_to_keep, num_replicates, model, output_dir):
        """Jackknife taxa.

        Parameters
        ----------
        input_tree : str
          Tree inferred with all data.
        msa_file : str
          File containing multiple sequence alignment for all taxa.
        outgroup_file : str
          File indicating labels of outgroup taxa.
        perc_taxa_to_keep : float
          Percentage of taxa to keep in each replicate.
        num_replicates : int
          Number of replicates to perform.
        model : str
          Desired model of evolution.
        output_dir : str
          Output directory for replicate trees.
        """

        assert(model in ['wag', 'jtt'])

        self.perc_taxa_to_keep = perc_taxa_to_keep
        self.model = model
        self.replicate_dir = os.path.join(output_dir, 'replicates')
        make_sure_path_exists(self.replicate_dir)
        # read outgroup taxa
        self.outgroup_ids = set()
        if outgroup_file:
            for line in open(outgroup_file):
                self.outgroup_ids.add(line.strip())

        # read full multiple sequence alignment
        self.msa = seq_io.read(msa_file)

        # calculate replicates
        #***self.logger.info('Calculating jackknife taxa replicates:')
        #***parallel = Parallel(self.cpus)
        #***parallel.run(self._producer, None, xrange(num_replicates), self._progress)

        # calculate support
        rep_tree_files = []
        for rep_index in range(num_replicates):
            rep_tree_files.append(os.path.join(self.replicate_dir, 'jk_taxa.tree.' + str(rep_index) + '.tre'))

        tree_support = TreeSupport()
        output_tree = os.path.join(output_dir, remove_extension(input_tree) + '.jk_taxa.tree')
        tree_support.subset_taxa(input_tree, rep_tree_files, output_tree)

        return output_tree
Example #16
    def bl_dist(self, options):
        """Calculate distribution of branch lengths at each taxonomic rank."""

        check_file_exists(options.input_tree)
        make_sure_path_exists(options.output_dir)

        b = BranchLengthDistribution()
        b.run(options.input_tree,
                options.trusted_taxa_file,
                options.min_children,
                options.taxonomy_file,
                options.output_dir)

        self.logger.info('Done.')
Example #17
    def ani(self, options):
        """ANI command"""
        self.logger.info('')
        self.logger.info('*******************************************************************************')
        self.logger.info(' [CompareM - ani] Calculating the ANI between genome pairs.')
        self.logger.info('*******************************************************************************')
        self.logger.info('')

        make_sure_path_exists(options.output_dir)

        genome_files = self._genome_files(options.genome_dir, options.genome_ext)

        self.logger.info('')
        self.logger.info('  Average nucleotide identity information written to: %s' % options.output_dir)

        self.time_keeper.print_time_stamp()
Example #18
    def derep_tree(self, options):
        """Dereplicate tree."""

        check_file_exists(options.input_tree)
        check_file_exists(options.gtdb_metadata)
        check_file_exists(options.msa_file)
        make_sure_path_exists(options.output_dir)

        derep_tree = DereplicateTree()
        derep_tree.run(options.input_tree,
                       options.lineage_of_interest,
                       options.outgroup,
                       options.gtdb_metadata,
                       options.taxa_to_retain,
                       options.msa_file,
                       options.keep_unclassified,
                       options.output_dir)
Example #19
    def jk_taxa(self, options):
        """Jackknife taxa."""

        check_file_exists(options.input_tree)
        check_file_exists(options.msa_file)
        make_sure_path_exists(options.output_dir)

        jackknife_taxa = JackknifeTaxa(options.cpus)
        output_tree = jackknife_taxa.run(options.input_tree,
                                            options.msa_file,
                                            options.outgroup_ids,
                                            options.perc_taxa,
                                            options.num_replicates,
                                            options.model,
                                            options.output_dir)

        self.logger.info('Jackknifed taxa tree written to: %s' % output_tree)
Example #20
    def classify(self, options):
        """Classify genomes based on AAI values."""
        check_file_exists(options.sorted_hit_table)
        make_sure_path_exists(options.output_dir)

        classify = Classify(options.cpus)
        results_file = classify.run(options.query_gene_file,
                                    options.target_gene_file,
                                    options.sorted_hit_table,
                                    options.evalue,
                                    options.per_identity,
                                    options.per_aln_len,
                                    options.num_top_targets,
                                    options.taxonomy_file,
                                    options.keep_rbhs,
                                    options.output_dir)

        self.logger.info('Classification results written to: %s' % results_file)
Example #21
    def rna_tree(self, options):
        """Infer 16S + 23S tree spanning GTDB genomes."""

        check_dependencies(['FastTreeMP'])

        check_file_exists(options.ssu_msa)
        check_file_exists(options.ssu_tree)
        check_file_exists(options.lsu_msa)
        check_file_exists(options.lsu_tree)
        make_sure_path_exists(options.output_dir)

        rna_workflow = RNA_Workflow(options.cpus)
        rna_workflow.combine(options.ssu_msa,
                                options.ssu_tree,
                                options.lsu_msa,
                                options.lsu_tree,
                                options.output_dir)

        self.logger.info('Results written to: %s' % options.output_dir)
Example #22
    def rna_dump(self, options):
        """Dump all 5S, 16S, and 23S sequences to files."""

        check_file_exists(options.genomic_file)
        make_sure_path_exists(options.output_dir)

        rna_workflow = RNA_Workflow(1)
        rna_workflow.dump(options.genomic_file,
                          options.gtdb_taxonomy,
                          options.min_5S_len,
                          options.min_16S_ar_len,
                          options.min_16S_bac_len,
                          options.min_23S_len,
                          options.min_contig_len,
                          options.include_user,
                          options.genome_list,
                          options.output_dir)

        self.logger.info('Results written to: %s' % options.output_dir)
Example #23
    def aai(self, options):
        """AAI command"""
        check_file_exists(options.sorted_hit_table)
        make_sure_path_exists(options.output_dir)

        aai_calculator = AAICalculator(options.cpus)
        aai_output_file, rbh_output_file = aai_calculator.run(options.query_gene_file,
                                                                None,
                                                                options.sorted_hit_table,
                                                                options.evalue,
                                                                options.per_identity,
                                                                options.per_aln_len,
                                                                options.keep_rbhs,
                                                                options.output_dir)

        if rbh_output_file:
            self.logger.info('Identified reciprocal best hits written to: %s' % rbh_output_file)
            
        self.logger.info('AAI between genomes written to: %s' % aai_output_file)
Example #24
    def cluster_user(self, options):
        """Cluster User genomes to GTDB species clusters."""

        check_file_exists(options.gtdb_metadata_file)
        check_file_exists(options.genome_path_file)
        check_file_exists(options.final_cluster_file)
        make_sure_path_exists(options.output_dir)

        try:
            p = ClusterUser(options.ani_cache_file, 
                                options.cpus,
                                options.output_dir)
            p.run(options.gtdb_metadata_file,
                        options.genome_path_file,
                        options.final_cluster_file)
        except GenomeTreeTkError as e:
            print(e)
            raise SystemExit

        self.logger.info('Clustering results written to: %s' % options.output_dir)
Example #25
    def jk_markers(self, options):
        """Jackknife marker genes."""

        check_file_exists(options.input_tree)
        if options.msa_file != 'NONE':
            check_file_exists(options.msa_file)
        make_sure_path_exists(options.output_dir)

        jackknife_markers = JackknifeMarkers(options.cpus)
        output_tree = jackknife_markers.run(options.input_tree,
                                                options.msa_file,
                                                options.marker_info_file,
                                                options.mask_file,
                                                options.perc_markers,
                                                options.num_replicates,
                                                options.model,
                                                options.jk_dir,
                                                options.output_dir)

        self.logger.info('Jackknifed marker tree written to: %s' % output_tree)
Example #26
    def bootstrap(self, options):
        """Bootstrap multiple sequence alignment."""

        check_file_exists(options.input_tree)
        if options.msa_file != 'NONE':
            check_file_exists(options.msa_file)
        make_sure_path_exists(options.output_dir)

        bootstrap = Bootstrap(options.cpus)
        output_tree = bootstrap.run(options.input_tree,
                                    options.msa_file,
                                    options.num_replicates,
                                    options.model,
                                    options.gamma,
                                    options.base_type,
                                    options.fraction,
                                    options.boot_dir,
                                    options.output_dir)

        self.logger.info('Bootstrapped tree written to: %s' % output_tree)
Example #27
    def assign(self, options):
        """Assign genomes to canonical genomes comprising GTDB reference tree."""

        check_file_exists(options.canonical_taxonomy_file)
        check_file_exists(options.full_taxonomy_file)
        check_file_exists(options.metadata_file)
        check_file_exists(options.genome_path_file)
        
        make_sure_path_exists(options.output_dir)

        try:
            assign = AssignGenomes(options.cpus, options.output_dir)
            assign.run(options.canonical_taxonomy_file,
                        options.full_taxonomy_file,
                        options.metadata_file,
                        options.genome_path_file,
                        options.user_genomes)

        except GenomeTreeTkError as e:
            print(e)
            raise SystemExit
Example #28
    def similarity(self, options):
        """Perform sequence similarity search between genes"""

        make_sure_path_exists(options.output_dir)
        
        query_gene_files = self._input_files(options.query_proteins, options.file_ext)
        target_gene_files = self._input_files(options.target_proteins, options.file_ext)
        
        ss = SimilaritySearch(options.cpus)
        ss.run(query_gene_files, 
                target_gene_files,
                options.evalue, 
                options.per_identity, 
                options.per_aln_len,
                True,
                options.tmp_dir,
                options.blastp,
                options.sensitive,
                options.keep_headers,
                options.output_dir)

        self.logger.info('Sequence similarity results written to: %s' % options.output_dir)
Example #29
    def compatible(self, options):
        """Compatible command"""
        self.logger.info('')
        self.logger.info('*******************************************************************************')
        self.logger.info('[RefineM - compatible] Identify scaffolds with compatible genomic statistics.')
        self.logger.info('*******************************************************************************')

        check_file_exists(options.reference_file)
        check_file_exists(options.scaffold_stats_file)
        make_sure_path_exists(options.output_dir)

        # read scaffold statistics and calculate genome stats
        self.logger.info('')
        self.logger.info('  Reading scaffold statistics.')
        scaffold_stats = ScaffoldStats()
        scaffold_stats.read(options.scaffold_stats_file)

        genome_stats = GenomeStats()
        genome_stats = genome_stats.run(scaffold_stats)

        # identify putative homologs to reference genomes
        reference = Reference(1, None)
        putative_homologs = reference.homology_check(options.reference_file,
                                                         options.min_genes,
                                                         float(options.perc_genes))

        # identify scaffolds compatible with bins
        outliers = Outliers()
        output_file = os.path.join(options.output_dir, 'compatible.tsv')
        outliers.compatible(putative_homologs, scaffold_stats, genome_stats,
                                      options.gc_perc, options.td_perc,
                                      options.cov_corr, options.cov_perc,
                                      options.report_type, output_file)

        self.logger.info('')
        self.logger.info('  Results written to: ' + output_file)

        self.time_keeper.print_time_stamp()
Example #30
    def cluster_de_novo(self, options):
        """Infer de novo species clusters and type genomes for remaining genomes."""

        check_file_exists(options.qc_file)
        check_file_exists(options.gtdb_metadata_file)
        check_file_exists(options.gtdb_user_genomes_file)
        check_file_exists(options.genome_path_file)
        check_file_exists(options.type_genome_cluster_file)
        check_file_exists(options.type_genome_synonym_file)
        check_file_exists(options.ncbi_refseq_assembly_file)
        check_file_exists(options.ncbi_genbank_assembly_file)
        check_file_exists(options.ani_af_nontype_vs_type)
        check_file_exists(options.species_exception_file)
        make_sure_path_exists(options.output_dir)

        try:
            p = ClusterDeNovo(options.ani_sp,
                                    options.af_sp,
                                    options.ani_cache_file, 
                                    options.cpus,
                                    options.output_dir)
            p.run(options.qc_file,
                        options.gtdb_metadata_file,
                        options.gtdb_user_genomes_file,
                        options.genome_path_file,
                        options.type_genome_cluster_file,
                        options.type_genome_synonym_file,
                        options.ncbi_refseq_assembly_file,
                        options.ncbi_genbank_assembly_file,
                        options.ani_af_nontype_vs_type,
                        options.species_exception_file,
                        options.rnd_type_genome)
        except GenomeTreeTkError as e:
            print(e)
            raise SystemExit

        self.logger.info('Clustering results written to: %s' % options.output_dir)
Example #31
    def run(self, input_tree, msa_file, num_replicates, model, base_type, frac,
            boot_dir, output_dir):
        """Bootstrap multiple sequence alignment.

        Parameters
        ----------
        input_tree : str
          Tree inferred with all data.
        msa_file : str
          File containing multiple sequence alignment for all taxa.
        num_replicates : int
          Number of replicates to perform.
        model : str
          Desired model of evolution.
        base_type : str
          Indicates if bases are nucleotides or amino acids.
        frac : float
          Fraction of alignment to subsample.
        boot_dir : str
          Directory containing precomputed replicate trees, or None to
          generate replicates from the MSA.
        output_dir : str
          Directory for bootstrap trees.
        """

        assert (model in ['wag', 'lg', 'jtt'])
        assert (base_type in ['nt', 'prot'])

        self.model = model
        self.base_type = base_type
        self.frac = frac

        rep_tree_files = []
        if not boot_dir:
            self.replicate_dir = os.path.join(output_dir, 'replicates')
            make_sure_path_exists(self.replicate_dir)

            # read full multiple sequence alignment
            self.msa = seq_io.read(msa_file)

            # calculate replicates
            self.logger.info('Calculating bootstrap replicates:')
            parallel = Parallel(self.cpus)
            parallel.run(self._producer, None, range(num_replicates),
                         self._progress)

            for rep_index in range(num_replicates):
                rep_tree_files.append(
                    os.path.join(
                        self.replicate_dir,
                        'bootstrap_tree.r_' + str(rep_index) + '.tree'))
        else:
            for f in os.listdir(boot_dir):
                if f.endswith('.tree') or f.endswith('.tre'):
                    rep_tree_files.append(os.path.join(boot_dir, f))
            self.logger.info('Read %d bootstrap replicates.' %
                             len(rep_tree_files))

        # calculate support values
        self.logger.info('Calculating bootstrap support values.')
        output_tree = os.path.join(
            output_dir,
            remove_extension(input_tree) + '.bootstrap.tree')
        bootstrap_support(input_tree, rep_tree_files, output_tree)

        return output_tree
Example #32
    def _calculate_fastani_distance(self, list_leaf, genomes):
        """ Calculate the FastANI distance between all user genomes and the reference to classfy them at the species level
        
        Parameters
        ----------
        list_leaf : List of leaves uncluding one or many user genomes and one reference genome.
        genomes : Dictionary of user genomes d[genome_id] -> FASTA file
    
        Returns
        -------
        dictionary
            dict_results[user_g]={"ref_genome":ref_genome,"mash_dist":mash_dist}
        
        """
        try:
            self.tmp_output_dir = tempfile.mkdtemp()
            query_list_file = open(
                os.path.join(self.tmp_output_dir, 'query_list.txt'), 'w')
            ref_list_file = open(
                os.path.join(self.tmp_output_dir, 'ref_list.txt'), 'w')
            make_sure_path_exists(self.tmp_output_dir)
            for leaf in list_leaf:
                if not leaf.startswith('GB_') and not leaf.startswith(
                        'RS_') and not leaf.startswith('UBA'):
                    query_list_file.write('{0}\n'.format(genomes.get(leaf)))
                else:
                    shortleaf = leaf
                    if leaf.startswith('GB_') or leaf.startswith('RS_'):
                        shortleaf = leaf[3:]
                    ref_list_file.write('{0}{1}{2}\n'.format(
                        Config.FASTANI_GENOMES, shortleaf,
                        Config.FASTANI_GENOMES_EXT))

            query_list_file.close()
            ref_list_file.close()

            if not os.path.isfile(
                    os.path.join(self.tmp_output_dir,
                                 'query_list.txt')) or not os.path.isfile(
                                     os.path.join(self.tmp_output_dir,
                                                  'ref_list.txt')):
                raise ValueError('Failed to create FastANI query and reference list files.')

            cmd = 'fastANI --ql {0} --rl {1} -o {2} > /dev/null 2>{3}'.format(
                os.path.join(self.tmp_output_dir, 'query_list.txt'),
                os.path.join(self.tmp_output_dir, 'ref_list.txt'),
                os.path.join(self.tmp_output_dir, 'results.tab'),
                os.path.join(self.tmp_output_dir, 'error.log'))
            os.system(cmd)

            if not os.path.isfile(
                    os.path.join(self.tmp_output_dir, 'results.tab')):
                errstr = 'FastANI has stopped:\n'
                if os.path.isfile(
                        os.path.join(self.tmp_output_dir, 'error.log')):
                    with open(os.path.join(self.tmp_output_dir,
                                           'error.log')) as debug:
                        for line in debug:
                            finalline = line
                        errstr += finalline
                raise ValueError(errstr)

            dict_parser_distance = self._parse_fastani_results(
                os.path.join(self.tmp_output_dir, 'results.tab'), list_leaf)
            shutil.rmtree(self.tmp_output_dir)
            return dict_parser_distance

        except Exception:
            # clean up the temporary directory before propagating any error
            if os.path.exists(self.tmp_output_dir):
                shutil.rmtree(self.tmp_output_dir)
            raise
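
The method above defers to _parse_fastani_results, which is not shown. fastANI itself writes a tab-separated results file with one row per query/reference pair: query path, reference path, ANI, count of bidirectional fragment mappings, and total query fragments. A hypothetical reconstruction of such a parser, written as a method on the same class (the real method may apply extra filtering or naming conventions):

    import os

    def _parse_fastani_results(self, results_file, list_leaf):
        """Map each user genome to its closest reference by ANI (sketch)."""
        results = {}
        with open(results_file) as f:
            for line in f:
                query, ref, ani, _mapped, _total = line.strip().split('\t')
                # reduce file paths to bare genome identifiers
                query_id = os.path.splitext(os.path.basename(query))[0]
                ref_id = os.path.splitext(os.path.basename(ref))[0]
                results[query_id] = {'ref_genome': ref_id, 'ani': float(ani)}
        return results
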
Example #33
def main(args=None):
    # initialize the options parser
    parser = argparse.ArgumentParser(add_help=False)
    subparsers = parser.add_subparsers(help="--", dest='subparser_name')

    # create table and plot useful for identifying taxonomic outliers.
    outliers_parser = subparsers.add_parser(
        'outliers',
        formatter_class=CustomHelpFormatter,
        description='Create information for identifying taxonomic outliers')

    outliers_parser.add_argument(
        'input_tree', help="decorated tree for inferring RED outliers")
    outliers_parser.add_argument(
        'taxonomy_file',
        help='taxonomy file for inferring RED outliers',
        default=None)
    outliers_parser.add_argument(
        'output_dir', help="desired output directory for generated files")
    outliers_parser.add_argument(
        '--viral',
        action="store_true",
        help='indicates a viral input tree and taxonomy')
    outliers_parser.add_argument(
        '--fixed_root',
        action="store_true",
        help='use single fixed root to infer outliers')
    outliers_parser.add_argument(
        '-t',
        '--trusted_taxa_file',
        help=
        "file indicating trusted taxonomic groups to use for inferring distribution (default: all taxa)",
        default=None)
    outliers_parser.add_argument(
        '-m',
        '--min_children',
        help=
        'minimum required child taxa to consider taxa when inferring distribution',
        type=int,
        default=2)
    outliers_parser.add_argument(
        '-s',
        '--min_support',
        help=
        "minimum support value to consider taxa when inferring distribution (default: 0)",
        type=float,
        default=0.0)
    outliers_parser.add_argument(
        '--fmeasure_table',
        help="table indicating F-measure score for each taxa")
    outliers_parser.add_argument(
        '--min_fmeasure',
        help="minimum F-measure to consider taxa when inferring distribution",
        type=float,
        default=0.95)
    outliers_parser.add_argument(
        '--fmeasure_mono',
        help="minimum F-measure to consider taxa monophyletic",
        type=float,
        default=0.95)
    outliers_parser.add_argument(
        '--highlight_polyphyly',
        help='highlight taxa with an F-measure less than --fmeasure_mono',
        action="store_true")
    outliers_parser.add_argument(
        '--mblet',
        action="store_true",
        help=
        "calculate 'mean branch length to extent taxa' instead of 'relative evolutionary distances'"
    )
    outliers_parser.add_argument(
        '-p',
        '--plot_taxa_file',
        help="file indicating taxonomic groups to plot (default: all taxa)",
        default=None)
    outliers_parser.add_argument('--plot_domain',
                                 action="store_true",
                                 help='show domain rank in plot')
    outliers_parser.add_argument(
        '--plot_dist_taxa_only',
        help='only plot taxa used to infer distribution',
        action="store_true")
    outliers_parser.add_argument('--highlight_taxa_file',
                                 help='file indicating taxa to highlight')
    outliers_parser.add_argument('--dpi',
                                 help='DPI of plots',
                                 type=int,
                                 default=96)
    outliers_parser.add_argument('--verbose_table',
                                 action="store_true",
                                 help='add additional columns to output table')
    outliers_parser.add_argument('--skip_mpld3',
                                 action="store_true",
                                 help='skip plots requiring mpld3')

    # create table and plot useful for identifying taxonomic outliers.
    scale_tree_parser = subparsers.add_parser(
        'scale_tree',
        formatter_class=CustomHelpFormatter,
        description='Scale a rooted tree based on RED')

    scale_tree_parser.add_argument('input_tree', help="rooted tree to scale")
    scale_tree_parser.add_argument('output_tree', help="tree scaled by RED")

    # Compare RED values of taxa calculated over different trees
    compare_red_parser = subparsers.add_parser(
        'compare_red',
        formatter_class=CustomHelpFormatter,
        description='Compare RED values of taxa calculated over different trees'
    )
    compare_red_parser.add_argument(
        'red_table1', help="RED table calculated by 'outliers' command.")
    compare_red_parser.add_argument(
        'red_table2', help="RED table calculated by 'outliers' command.")
    compare_red_parser.add_argument(
        'red_dict2',
        help="Median RED dictionary calculated by 'outliers' command.")
    compare_red_parser.add_argument('output_table', help='output table')
    compare_red_parser.add_argument(
        '--viral',
        action="store_true",
        help='indicates a viral input tree and taxonomy')

    # plot distribution of groups in each taxonomic rank
    dist_plot_parser = subparsers.add_parser(
        'dist_plot',
        formatter_class=CustomHelpFormatter,
        description='Plot distribution of taxa in each taxonomic rank')

    dist_plot_parser.add_argument(
        'input_tree',
        help="decorated tree for establishing relative divergence distributions"
    )
    dist_plot_parser.add_argument('output_prefix',
                                  help="output prefix for generated files")
    dist_plot_parser.add_argument(
        '-p',
        '--plot_taxa_file',
        help="file indicating taxonomic groups to plot (default: all taxa)",
        default=None)
    dist_plot_parser.add_argument(
        '-t',
        '--trusted_taxa_file',
        help=
        "file indicating trusted taxonomic groups to use for inferring distribution (default: all taxa)",
        default=None)
    dist_plot_parser.add_argument(
        '-m',
        '--min_children',
        help=
        'minimum required child taxa to consider taxa when inferring distribution  (default: 0)',
        type=int,
        default=0)
    dist_plot_parser.add_argument(
        '-s',
        '--min_support',
        help=
        "minimum support value to consider taxa when inferring distribution (default: 0)",
        type=float,
        default=0.0)

    # decorate nodes with inferred taxonomic ranks

    # ******** MAYBE THIS SHOULD JUST TAKE A 'DISTRIBUTIONS FILE' produce by 'dist_plot'
    # ************************************************************************************
    mark_tree_parser = subparsers.add_parser(
        'mark_tree',
        formatter_class=CustomHelpFormatter,
        description=
        'Mark nodes with distribution information and predicted taxonomic ranks.'
    )

    mark_tree_parser.add_argument('input_tree', help="input tree to mark")
    mark_tree_parser.add_argument(
        'output_tree', help="output tree with assigned taxonomic ranks")
    mark_tree_parser.add_argument(
        '-t',
        '--thresholds',
        help="relative divergence thresholds for taxonomic ranks",
        type=json.loads,
        default=
        '{"d": 0.33, "p": 0.56, "c": 0.65, "o": 0.78, "f": 0.92, "g": 0.99}')
    mark_tree_parser.add_argument(
        '-s',
        '--min_support',
        help="only mark nodes above the specified support value (default=0)",
        type=float,
        default=0)
    mark_tree_parser.add_argument(
        '-n',
        '--only_named_clades',
        help="only mark nodes with an existing label",
        action='store_true')
    mark_tree_parser.add_argument(
        '-l',
        '--min_length',
        help=
        "only mark nodes with a parent branch above the specified length (default=0)",
        type=float,
        default=0.0)
    mark_tree_parser.add_argument(
        '--no_percentile',
        action="store_true",
        help="do not mark with percentile information")
    mark_tree_parser.add_argument(
        '--no_relative_divergence',
        action="store_true",
        help="do not mark with relative divergence information")
    mark_tree_parser.add_argument(
        '--no_prediction',
        action="store_true",
        help="do not mark with predicted rank information")

    # rogue test
    rogue_test_parser = subparsers.add_parser(
        'rogue_test',
        formatter_class=CustomHelpFormatter,
        description=
        'Index indicating the incongruence of genomes over a set of trees.')

    rogue_test_parser.add_argument(
        'input_tree_dir',
        help="directory containing trees to assess incongruence over")
    rogue_test_parser.add_argument(
        'taxonomy_file', help='file indicating taxonomy of extant taxa')
    rogue_test_parser.add_argument(
        'output_dir', help="desired output directory for generated files")
    rogue_test_parser.add_argument(
        '--outgroup_taxon',
        help=
        'taxon to use as outgroup (e.g., d__Archaea); implies tree should be rooted'
    )
    rogue_test_parser.add_argument('--decorate',
                                   help='indicates trees should be decorated',
                                   action='store_true')

    # decorate tree
    decorate_parser = subparsers.add_parser(
        'decorate',
        formatter_class=CustomHelpFormatter,
        description='Place internal taxonomic labels on tree.')
    decorate_parser.add_argument('input_tree', help='tree to decorate')
    decorate_parser.add_argument(
        'taxonomy_file', help='file indicating taxonomy of extant taxa')
    decorate_parser.add_argument('output_tree', help='decorated tree')
    decorate_parser.add_argument(
        '--viral',
        action="store_true",
        help='indicates a viral input tree and taxonomy')
    decorate_parser.add_argument(
        '--skip_rd_refine',
        help=
        "skip refinement of taxonomy based on relative divergence information",
        action='store_true')
    decorate_parser.add_argument(
        '-t',
        '--trusted_taxa_file',
        help=
        "file indicating trusted taxonomic groups to use for inferring distribution (default: all taxa)",
        default=None)
    decorate_parser.add_argument(
        '-m',
        '--min_children',
        help=
        'minimum required child taxa to consider taxa when inferring distribution',
        type=int,
        default=2)
    decorate_parser.add_argument(
        '-s',
        '--min_support',
        help=
        "minimum support value to consider taxa when inferring distribution (default: 0)",
        type=float,
        default=0.0)

    # pull taxonomy strings from tree
    pull_parser = subparsers.add_parser(
        'pull',
        formatter_class=CustomHelpFormatter,
        description='Pull taxonomy information from tree.')

    pull_parser.add_argument('input_tree',
                             help="input tree to extract taxonomy from")
    pull_parser.add_argument(
        'output_file',
        help="file to contain taxonomy strings for each extant taxon")
    pull_parser.add_argument('--no_rank_fill',
                             action="store_true",
                             help="do not automatically fill in missing ranks")

    # validate consistency of taxonomy
    validate_parser = subparsers.add_parser(
        'validate',
        formatter_class=CustomHelpFormatter,
        description='Validate consistency of taxonomy.')

    validate_parser.add_argument('taxonomy_file',
                                 help="file with taxonomy for extant taxa")
    validate_parser.add_argument('--no_prefix',
                                 action="store_true",
                                 help="do not check taxon prefixes")
    validate_parser.add_argument(
        '--no_all_ranks',
        action="store_true",
        help="do not check for the presence of all ranks")
    validate_parser.add_argument(
        '--no_hierarhcy',
        action="store_true",
        help="do not check for inconsistencies in the taxonomic hierarchy")
    validate_parser.add_argument(
        '--no_species',
        action="store_true",
        help="do not check for hierarchical inconsistencies with named species"
    )

    # summary statistics of taxonomic groups
    taxon_stats_parser = subparsers.add_parser(
        'taxon_stats',
        formatter_class=CustomHelpFormatter,
        description='Summary statistics of taxonomic groups.')

    taxon_stats_parser.add_argument('taxonomy_file',
                                    help="file with taxonomy for extant taxa")
    taxon_stats_parser.add_argument('output_file',
                                    help="output file with summary statistics")

    # plot relative distance of groups across a set of trees.
    robustness_plot_parser = subparsers.add_parser(
        'robustness_plot',
        formatter_class=CustomHelpFormatter,
        description='Plot relative divergence of groups across a set of trees')

    robustness_plot_parser.add_argument(
        'rank',
        help="taxonomic rank of named groups to plot",
        type=int,
        choices=[1, 2, 3, 4, 5, 6])
    robustness_plot_parser.add_argument(
        'input_tree_dir',
        help="directory containing trees to inferred relative divergence across"
    )
    robustness_plot_parser.add_argument(
        'full_tree_file',
        help=
        "unmodified tree to include in plot; must be decorate with taxonomy")
    robustness_plot_parser.add_argument(
        'derep_tree_file', help="dereplicated tree to include in plot")
    robustness_plot_parser.add_argument(
        'taxonomy_file',
        help="file indicating taxonomy string for each genome")
    robustness_plot_parser.add_argument(
        'output_prefix', help="output prefix for generated files")
    robustness_plot_parser.add_argument(
        '-m',
        '--min_children',
        help='minimum named child taxa to consider taxa',
        type=int,
        default=2)
    robustness_plot_parser.add_argument('-t',
                                        '--title',
                                        help='title of plot',
                                        default=None)

    rd_ranks_parser = subparsers.add_parser(
        'rd_ranks',
        formatter_class=CustomHelpFormatter,
        description='Calculate number of taxa for specified rd thresholds.')

    rd_ranks_parser.add_argument('input_tree',
                                 help="input tree to calculate ranks over")
    rd_ranks_parser.add_argument(
        'output_dir', help="desired output directory for generated files")
    rd_ranks_parser.add_argument(
        '-t',
        '--thresholds',
        help="relative divergence thresholds for taxonomic ranks",
        type=json.loads,
        default=
        '{"p": 0.35, "c": 0.52, "o": 0.67, "f": 0.79, "g": 0.94, "s":0.996}')

    bl_dist_parser = subparsers.add_parser(
        'bl_dist',
        formatter_class=CustomHelpFormatter,
        description=
        'Calculate distribution of branch lengths at each taxonomic rank.')

    bl_dist_parser.add_argument(
        'input_tree',
        help="input tree to calculate branch length distributions")
    bl_dist_parser.add_argument(
        'output_dir', help="desired output directory for generated files")
    bl_dist_parser.add_argument(
        '-t',
        '--trusted_taxa_file',
        help=
        "file indicating trusted taxonomic groups to use for inferring distribution (default: all taxa)",
        default=None)
    bl_dist_parser.add_argument(
        '-m',
        '--min_children',
        help='minimum number of named child taxa required for a taxon to be used when inferring the distribution',
        type=int,
        default=2)
    bl_dist_parser.add_argument(
        '--taxonomy_file',
        help='read taxonomy from this file instead of directly from tree',
        default=None)

    bl_optimal_parser = subparsers.add_parser(
        'bl_optimal',
        formatter_class=CustomHelpFormatter,
        description=
        'Determine branch length for best congruency with existing taxonomy.')

    bl_optimal_parser.add_argument(
        'input_tree',
        help="input tree to calculate branch length distributions")
    bl_optimal_parser.add_argument('rank',
                                   help="rank of labels",
                                   type=int,
                                   choices=[1, 2, 3, 4, 5, 6])
    bl_optimal_parser.add_argument('output_table',
                                   help="desired name of output table")
    bl_optimal_parser.add_argument(
        '--min_dist',
        help='minimum mean branch length value to evaluate',
        type=float,
        default=0.5)
    bl_optimal_parser.add_argument(
        '--max_dist',
        help='maximum mean branch length value to evaluate',
        type=float,
        default=1.2)
    bl_optimal_parser.add_argument(
        '--step_size',
        help='step size of mean branch length values',
        type=float,
        default=0.025)

    bl_decorate_parser = subparsers.add_parser(
        'bl_decorate',
        formatter_class=CustomHelpFormatter,
        description='Decorate tree using a mean branch length criterion.')

    bl_decorate_parser.add_argument('input_tree',
                                    help="input tree to decorate")
    bl_decorate_parser.add_argument(
        'taxonomy_file', help="file with taxonomic information for each taxon")
    bl_decorate_parser.add_argument('threshold',
                                    help="mean branch length threshold",
                                    type=float)
    bl_decorate_parser.add_argument('rank',
                                    help="rank of labels",
                                    type=int,
                                    choices=[1, 2, 3, 4, 5, 6])
    bl_decorate_parser.add_argument('output_tree', help="decorated tree")
    bl_decorate_parser.add_argument(
        '--retain_named_lineages',
        action="store_true",
        help='retain existing named lineages at the specified rank')
    bl_decorate_parser.add_argument('--keep_labels',
                                    action="store_true",
                                    help='keep all existing internal labels')
    bl_decorate_parser.add_argument(
        '--prune',
        action="store_true",
        help=
        'prune tree to preserve only the shallowest and deepest taxa in each child lineage from newly decorated nodes'
    )

    bl_table_parser = subparsers.add_parser(
        'bl_table',
        formatter_class=CustomHelpFormatter,
        description='Produce table with number of lineages for increasing mean branch lengths.')

    bl_table_parser.add_argument(
        'input_tree',
        help="input tree to calculate branch length distributions")
    bl_table_parser.add_argument(
        'taxon_category',
        help="file indicating category for each taxon in the tree")
    bl_table_parser.add_argument('output_table',
                                 help="desired name of output table")
    bl_table_parser.add_argument(
        '--step_size',
        help="step size for mean branch length criterion",
        type=float,
        default=0.01)

    rank_res_parser = subparsers.add_parser(
        'rank_res',
        formatter_class=CustomHelpFormatter,
        description='Calculate taxonomic resolution at each rank.')

    rank_res_parser.add_argument('input_tree', help="decorated tree")
    rank_res_parser.add_argument('taxonomy_file',
                                 help="file with taxonomy for extant taxa")
    rank_res_parser.add_argument(
        'output_file', help="output file with resolution of taxa at each rank")
    rank_res_parser.add_argument(
        '--taxa_file',
        help="output file indicating taxa within each resolution category",
        default=None)

    # get and check options
    if len(sys.argv) == 1 or sys.argv[1] in {'-h', '--help'}:
        print_help()
        sys.exit(0)
    else:
        args = parser.parse_args()

    if hasattr(args, 'output_dir'):
        make_sure_path_exists(args.output_dir)
        logger_setup(os.path.join(args.output_dir, 'phylorank.log'), False)
    elif hasattr(args, 'output_prefix'):
        output_dir, output_prefix = os.path.split(args.output_prefix)
        if output_dir:
            make_sure_path_exists(output_dir)
        logger_setup(os.path.join(output_dir, 'phylorank.log'), False)
    else:
        logger_setup('phylorank.log', False)

    # do what we came here to do
    try:
        parser = OptionsParser()
        if False:
            # profiling hook; inspect results with:
            #   import pstats
            #   p = pstats.Stats('prof')
            #   p.sort_stats('cumulative').print_stats(10)
            #   p.sort_stats('time').print_stats(10)
            import cProfile
            cProfile.run('parser.parse_options(args)', 'prof')
        elif False:
            import pdb
            pdb.run('parser.parse_options(args)')  # pdb.run expects a command string
        else:
            parser.parse_options(args)
    except SystemExit:
        print(
            "\n  Controlled exit resulting from an unrecoverable error or warning."
        )
    except:
        print("\nUnexpected error:", sys.exc_info()[0])
        raise
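For reference, a minimal self-contained sketch of the subparser-plus-dispatch pattern used above; the 'taxon_stats' subcommand below is a simplified stand-in, not the full PhyloRank CLI:

import argparse
import sys

def main():
    parser = argparse.ArgumentParser(prog='phylorank')
    subparsers = parser.add_subparsers(dest='subparser_name')

    # one illustrative subcommand mirroring taxon_stats above
    stats = subparsers.add_parser('taxon_stats')
    stats.add_argument('taxonomy_file')
    stats.add_argument('output_file')

    if len(sys.argv) == 1 or sys.argv[1] in {'-h', '--help'}:
        parser.print_help()
        sys.exit(0)

    args = parser.parse_args()
    print('dispatching to handler for: %s' % args.subparser_name)

if __name__ == '__main__':
    main()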
Example #34
0
def logger_setup(log_dir, log_file, program_name, version, silent):
    """Setup loggers.
    
    Two logger are setup which both print to the stdout and a 
    log file when the log_dir is not None. The first logger is
    named 'timestamp' and provides a timestamp with each call,
    while the other is named 'no_timestamp' and does not prepend
    any information. The attribution 'is_silent' is also added
    to each logger to indicate if the silent flag is thrown.

    Parameters
    ----------
    log_dir : str
        Output directory for log file.
    log_file : str
        Desired name of log file.
    program_name : str
        Name of program.
    version : str
        Program version number.
    silent : boolean
        Flag indicating if output to stdout should be suppressed.
    """

    # setup general properties of loggers
    timestamp_logger = logging.getLogger('timestamp')
    timestamp_logger.setLevel(logging.DEBUG)
    log_format = logging.Formatter(
        fmt="[%(asctime)s] %(levelname)s: %(message)s",
        datefmt="%Y-%m-%d %H:%M:%S")

    no_timestamp_logger = logging.getLogger('no_timestamp')
    no_timestamp_logger.setLevel(logging.DEBUG)

    # setup logging to console
    timestamp_stream_logger = logging.StreamHandler(sys.stdout)
    timestamp_stream_logger.setFormatter(log_format)
    timestamp_logger.addHandler(timestamp_stream_logger)

    no_timestamp_stream_logger = logging.StreamHandler(sys.stdout)
    no_timestamp_stream_logger.setFormatter(None)
    no_timestamp_logger.addHandler(no_timestamp_stream_logger)

    timestamp_logger.is_silent = False
    no_timestamp_logger.is_silent = False
    if silent:
        timestamp_logger.is_silent = True
        timestamp_stream_logger.setLevel(logging.ERROR)
        no_timestamp_logger.is_silent = True
        no_timestamp_stream_logger.setLevel(logging.ERROR)

    if log_dir:
        make_sure_path_exists(log_dir)
        timestamp_file_logger = logging.FileHandler(
            os.path.join(log_dir, log_file), 'a')
        timestamp_file_logger.setFormatter(log_format)
        timestamp_logger.addHandler(timestamp_file_logger)

        no_timestamp_file_logger = logging.FileHandler(
            os.path.join(log_dir, log_file), 'a')
        no_timestamp_file_logger.setFormatter(None)
        no_timestamp_logger.addHandler(no_timestamp_file_logger)

    timestamp_logger.info('%s v%s' % (program_name, version))
    timestamp_logger.info(
        ntpath.basename(sys.argv[0]) + ' ' + ' '.join(sys.argv[1:]))
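A short usage sketch for the function above, assuming logger_setup and its module-level imports (logging, os, sys, ntpath) are in scope:

import logging

# configure both loggers; writes ./logs/example.log and echoes to stdout
logger_setup('./logs', 'example.log', 'ExampleTool', '1.0.0', silent=False)

logging.getLogger('timestamp').info('printed with a timestamp prefix')
logging.getLogger('no_timestamp').info('printed verbatim')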
Example #35
0
    def download_strains(self, options):
        make_sure_path_exists(options.output_dir)
        p = BacDive(options.output_dir, options.username, options.pwd)
        p.download_strains()
Example #36
0
    def run(self, genome_files, db_file, taxonomy_file, evalue, per_identity,
            window_size, step_size):
        """Create taxonomic profiles for a set of genomes.

        Parameters
        ----------
        genome_files : list of str
            Fasta files of genomes to process.
        db_file : str
            Database of reference genes.
        taxonomy_file : str
            File containing GreenGenes taxonomy strings for reference genomes.
        evalue : float
            E-value threshold used by blast.
        per_identity: float
            Percent identity threshold used by blast.
        window_size : int
            Size of each fragment.
        step_size : int
            Number of bases to move after each window.
        """

        # parse taxonomy file
        self.logger.info(
            '  Reading taxonomic assignment of reference genomes.')
        taxonomy = Taxonomy().read(taxonomy_file)

        # fragment each genome into fixed sizes windows
        self.logger.info('')
        self.logger.info('  Fragmenting sequences in each bin:')
        diamond_output_dir = os.path.join(self.output_dir, 'diamond')
        make_sure_path_exists(diamond_output_dir)

        fragment_file = os.path.join(diamond_output_dir, 'fragments.fna')
        fragment_out = open(fragment_file, 'w')
        contig_id_to_genome_id = {}
        for genome_file in genome_files:
            genome_id = remove_extension(genome_file)
            self.profiles[genome_id] = Profile(genome_id, taxonomy)
            self._fragment_genomes(genome_file, window_size, step_size,
                                   self.profiles[genome_id], fragment_out)

            for seq_id, _seq in seq_io.read_seq(genome_file):
                contig_id_to_genome_id[seq_id] = genome_id

        # run diamond
        self.logger.info('')
        self.logger.info(
            '  Running diamond blastx with %d processes (be patient!)' %
            self.cpus)

        diamond = Diamond(self.cpus)
        diamond_daa_out = os.path.join(diamond_output_dir, 'diamond_hits')
        diamond.blastx(fragment_file, db_file, evalue, per_identity, 1,
                       diamond_daa_out)

        diamond_table_out = os.path.join(diamond_output_dir,
                                         'diamond_hits.tsv')
        diamond.view(diamond_daa_out + '.daa', diamond_table_out)

        self.logger.info('')
        self.logger.info('  Creating taxonomic profile for each genome.')
        self._taxonomic_profiles(diamond_table_out, taxonomy,
                                 contig_id_to_genome_id)

        self.logger.info('')
        self.logger.info('  Writing taxonomic profile for each genome.')

        report_dir = os.path.join(self.output_dir, 'bin_reports')
        make_sure_path_exists(report_dir)

        for genome_id, profile in self.profiles.items():
            seq_summary_out = os.path.join(report_dir,
                                           genome_id + '.sequences.tsv')
            profile.write_seq_summary(seq_summary_out)

            genome_profile_out = os.path.join(report_dir,
                                              genome_id + '.profile.tsv')
            profile.write_genome_profile(genome_profile_out)

        genome_summary_out = os.path.join(self.output_dir,
                                          'genome_summary.tsv')
        self._write_genome_summary(genome_summary_out)

        # create Krona plot
        krona_profiles = defaultdict(lambda: defaultdict(int))
        for genome_id, profile in self.profiles.items():
            seq_assignments = profile.classify_seqs(taxonomy)

            for seq_id, classification in seq_assignments.items():
                taxa = []
                for r in range(0, len(profile.rank_labels)):
                    taxa.append(classification[r][0])

                krona_profiles[genome_id][';'.join(
                    taxa)] += profile.seq_len[seq_id]

        krona = Krona()
        krona_output_file = os.path.join(self.output_dir,
                                         'taxonomic_profiles.krona.html')
        krona.create(krona_profiles, krona_output_file)
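The _fragment_genomes helper is not shown above; a hedged sketch of fixed-window fragmentation consistent with the window_size/step_size semantics in the docstring (the function name is hypothetical):

def fragment_sequence(seq, window_size, step_size):
    """Yield (start, fragment) windows over seq, advancing step_size bases."""
    if len(seq) <= window_size:
        yield 0, seq
        return
    for start in range(0, len(seq) - window_size + 1, step_size):
        yield start, seq[start:start + window_size]

# list(fragment_sequence('ACGTACGTAC', window_size=4, step_size=2))
# -> [(0, 'ACGT'), (2, 'GTAC'), (4, 'ACGT'), (6, 'GTAC')]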
Example #37
0
    def run(self, scaffold_gene_file, stat_file, ref_genome_gene_files, db_file, evalue, per_identity, per_aln_len):
        """Create taxonomic profiles for a set of genomes.

        Parameters
        ----------
        scaffold_gene_file : str
            Fasta file of genes on scaffolds in amino acid space.
        stat_file : str
            File with statistics for individual scaffolds.
        ref_genome_gene_files : list of str
            Fasta files of called genes on reference genomes of interest.
        db_file : str
            Database of competing reference genes.
        evalue : float
            E-value threshold of valid hits.
        per_identity : float
            Percent identity threshold of valid hits [0,100].
        per_aln_len : float
            Percent query coverage of valid hits [0, 100].
        """

        # read statistics file
        self.logger.info('Reading scaffold statistics.')
        scaffold_stats = ScaffoldStats()
        scaffold_stats.read(stat_file)

        # perform homology searches
        self.logger.info('Creating diamond database for reference genomes.')
        ref_gene_file = os.path.join(self.output_dir, 'ref_genes.faa')
        concatenate_gene_files(ref_genome_gene_files, ref_gene_file)

        diamond = Diamond(self.cpus)
        ref_diamond_db = os.path.join(self.output_dir, 'ref_genes')
        diamond.make_database(ref_gene_file, ref_diamond_db)

        self.logger.info('Identifying homologs within reference genomes of interest (be patient!).')
        self.diamond_dir = os.path.join(self.output_dir, 'diamond')
        make_sure_path_exists(self.diamond_dir)
        hits_ref_genomes = os.path.join(self.diamond_dir, 'ref_hits.tsv')
        diamond.blastp(scaffold_gene_file, ref_diamond_db, evalue, per_identity, per_aln_len, 1, hits_ref_genomes)

        self.logger.info('Identifying homologs within competing reference genomes (be patient!).')
        hits_comp_ref_genomes = os.path.join(self.diamond_dir, 'competing_ref_hits.tsv')
        diamond.blastp(scaffold_gene_file, db_file, evalue, per_identity, per_aln_len, 1, hits_comp_ref_genomes)

        # get list of genes with a top hit to the reference genomes of interest
        hits_to_ref = self._top_hits_to_reference(hits_ref_genomes, hits_comp_ref_genomes)

        # get number of genes on each scaffold
        num_genes_on_scaffold = defaultdict(int)
        for seq_id, _seq in seq_io.read_seq(scaffold_gene_file):
            scaffold_id = seq_id[0:seq_id.rfind('_')]
            num_genes_on_scaffold[scaffold_id] += 1

        # get hits to each scaffold
        hits_to_scaffold = defaultdict(list)
        for query_id, hit in hits_to_ref.items():
            gene_id = query_id[0:query_id.rfind('~')]
            scaffold_id = gene_id[0:gene_id.rfind('_')]
            hits_to_scaffold[scaffold_id].append(hit)

        # report summary stats for each scaffold
        reference_out = os.path.join(self.output_dir, 'references.tsv')
        fout = open(reference_out, 'w')
        fout.write('Scaffold ID\tSubject genome IDs\tSubject scaffold IDs')
        fout.write('\tGenome ID\tLength (bp)\tGC\tMean coverage')
        fout.write('\t# genes\t# hits\t% genes\tAvg. align. length (bp)\tAvg. % identity\tAvg. e-value\tAvg. bitscore\n')

        for scaffold_id, hits in hits_to_scaffold.items():
            aln_len = []
            perc_iden = []
            evalues = []  # renamed from 'evalue' to avoid shadowing the parameter
            bitscore = []
            subject_scaffold_ids = defaultdict(int)
            subject_bin_ids = defaultdict(int)
            for hit in hits:
                aln_len.append(hit.aln_length)
                perc_iden.append(hit.perc_identity)
                evalues.append(hit.evalue)
                bitscore.append(hit.bitscore)

                subject_bin_id, subject_gene_id = hit.subject_id.split('~')
                subject_scaffold_id = subject_gene_id[0:subject_gene_id.rfind('_')]
                subject_scaffold_ids[subject_scaffold_id] += 1
                subject_bin_ids[subject_bin_id] += 1

            sorted_subject_bin_ids = sorted(subject_bin_ids.items(), 
                                                key=operator.itemgetter(1),
                                                reverse=True)
            subject_bin_id_str = []
            for bin_id, num_hits in sorted_subject_bin_ids:
                subject_bin_id_str.append(bin_id + ':' + str(num_hits))
            subject_bin_id_str = ','.join(subject_bin_id_str)

            sorted_subject_scaffold_ids = sorted(subject_scaffold_ids.items(), 
                                                    key=operator.itemgetter(1),
                                                    reverse=True)
            subject_scaffold_id_str = []
            for subject_id, num_hits in sorted_subject_scaffold_ids:
                subject_scaffold_id_str.append(subject_id + ':' + str(num_hits))
            subject_scaffold_id_str = ','.join(subject_scaffold_id_str)

            fout.write('%s\t%s\t%s\t%s\t%.2f\t%d\t%d\t%.2f\t%d\t%.2f\t%.2g\t%.2f\n' % (
                                                                        scaffold_id,
                                                                        subject_bin_id_str,
                                                                        subject_scaffold_id_str,
                                                                        scaffold_stats.print_stats(scaffold_id),
                                                                        mean(scaffold_stats.coverage(scaffold_id)),
                                                                        num_genes_on_scaffold[scaffold_id],
                                                                        len(hits),
                                                                        len(hits) * 100.0 / num_genes_on_scaffold[scaffold_id],
                                                                        mean(aln_len),
                                                                        mean(perc_iden),
                                                                        mean(evalues),
                                                                        mean(bitscore)))

        fout.close()

        return reference_out
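The id parsing above relies on a '<bin id>~<gene id>' convention in which gene ids end with '_<gene number>'; a toy illustration with hypothetical ids:

subject_id = 'genomeA~contig_12_3'          # '<bin id>~<gene id>'
bin_id, gene_id = subject_id.split('~')     # 'genomeA', 'contig_12_3'
scaffold_id = gene_id[:gene_id.rfind('_')]  # 'contig_12'
print(bin_id, scaffold_id)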
Example #38
0
    def full_lpsn_wf(self, options):
        """Full workflow to parse LPSN."""
        make_sure_path_exists(options.output_dir)
        p = LPSN(options.output_dir)
        p.full_lpsn_wf()
Example #39
0
    def parse_html(self, options):
        """Parse all html files."""
        make_sure_path_exists(options.output_dir)
        p = LPSN(options.output_dir)
        p.parse_html(options.input_dir)
Example #40
0
    def features(self, options):
        """Making bam features matrix"""

        make_sure_path_exists(options.output_dir)
        reads_abundance = os.path.join(
            options.output_dir, DefaultValues.FEATURES_ABUNDANCE_FILES[0])
        reads_normalised = os.path.join(
            options.output_dir, DefaultValues.FEATURES_ABUNDANCE_FILES[1])
        reads_relative = os.path.join(
            options.output_dir, DefaultValues.FEATURES_ABUNDANCE_FILES[2])
        base_abundance = os.path.join(
            options.output_dir, DefaultValues.FEATURES_ABUNDANCE_FILES[3])
        base_normalised = os.path.join(
            options.output_dir, DefaultValues.FEATURES_ABUNDANCE_FILES[4])
        base_relative = os.path.join(options.output_dir,
                                     DefaultValues.FEATURES_ABUNDANCE_FILES[5])

        features_size = {}
        counts = {}
        counts_base = {}

        self.logger.info('Get features and initialise matrix')
        with open(options.faidx) as f:
            for line in f:
                if not line.startswith('#'):
                    line_list = line.rstrip().split('\t')
                    features = line_list[0]
                    features_size[features] = line_list[1]
                    counts[features] = 0
                    counts_base[features] = 0

        counts_all = []
        counts_all_normalised = []
        counts_all_relative = []
        counts_base_all = []
        counts_base_all_normalised = []
        counts_base_all_relative = []

        header = ["Features", "Features_size"]
        self.logger.info('Browsing alignment file(s)')

        samtoolsexec = findEx('samtools')
        samtoolsthreads = ['-@', options.threads]
        samtoolsminqual = ['-q', options.mapQ]

        with open(options.bam_list, 'r') as b:
            for bam in b:
                if bam.startswith('#'):
                    continue
                i = 0
                alignementfile, librarysize = bam.rstrip('\n').split('\t')
                if librarysize == '' or librarysize == '0':
                    librarysize = 1
                samplename = remove_extension(os.path.basename(alignementfile),
                                              options.extension)
                header.append(samplename)
                self.logger.info('\t' + samplename)
                cmd = [samtoolsexec, 'view'] + samtoolsthreads + \
                    samtoolsminqual + [alignementfile]
                p = subprocess.Popen(cmd, stdout=subprocess.PIPE).stdout
                for line in p:
                    line = line.decode(sys.getdefaultencoding()).rstrip()
                    if i > 0 and i % 1000000 == 0:
                        self.logger.info("Alignment record %s processed" % i)
                    i += 1
                    line_list = line.split('\t')
                    features = line_list[2]
                    cigar = line_list[5]
                    base_mapped = 0
                    match = re.findall(r'(\d+)M', cigar)
                    read_len = len(line_list[9])  # column 10 (SEQ) holds the read sequence
                    for base_match in match:
                        base_mapped += int(base_match)
                    if read_len == 0:
                        self.logger.info(line_list)
                        continue

                    if base_mapped / read_len < float(options.id_cutoff):
                        continue

                    counts[features] += 1

                    if options.discard_gene_length_normalisation:
                        counts_base[features] += base_mapped
                    else:
                        counts_base[features] += base_mapped / int(
                            features_size[features])

                # raw reads count
                counts_all.append(counts.copy())

                # normalised reads count
                count_tmp = {}
                count_tmp = {
                    k: (v / int(librarysize)) * options.feature_normalisation
                    for k, v in counts.items()
                }
                counts_all_normalised.append(count_tmp.copy())

                # relative reads count
                count_tmp = {}
                count_tmp = {
                    k: v / total
                    for total in (sum(counts.values()), )
                    for k, v in counts.items()
                }
                counts_all_relative.append(count_tmp.copy())

                # raw bases count
                counts_base_all.append(counts_base.copy())

                # normalised bases count
                count_tmp = {}
                count_tmp = {
                    k: (v / int(librarysize)) * options.feature_normalisation
                    for k, v in counts_base.items()
                }
                counts_base_all_normalised.append(count_tmp.copy())

                # relative bases count
                count_tmp = {}
                count_tmp = {
                    k: v / total
                    for total in (sum(counts_base.values()), )
                    for k, v in counts_base.items()
                }
                counts_base_all_relative.append(count_tmp.copy())

                for fn in counts:
                    counts[fn] = 0
                    counts_base[fn] = 0

        self.logger.info('Print matrices')

        self.logger.info('Print raw reads abundance matrix in %s' %
                         reads_abundance)
        output_handle = open(reads_abundance, "w")
        output_handle.write('\t'.join(header) + '\n')
        for fn in counts.keys():
            if sum([c[fn] for c in counts_all]) == 0 and options.removed:
                continue
            else:
                output_handle.write('\t'.join([fn] + [features_size[fn]] +
                                              [str(c[fn])
                                               for c in counts_all]) + '\n')
        output_handle.close()

        self.logger.info('Print normalised reads abundance matrix in %s' %
                         reads_normalised)
        output_handle = open(reads_normalised, "w")
        output_handle.write('\t'.join(header) + '\n')
        for fn in counts.keys():
            if sum([c[fn]
                    for c in counts_all_normalised]) == 0 and options.removed:
                continue
            else:
                output_handle.write(
                    '\t'.join([fn] + [features_size[fn]] +
                              [str(c[fn])
                               for c in counts_all_normalised]) + '\n')
        output_handle.close()

        self.logger.info('Print relative reads abundance matrix in %s' %
                         reads_relative)
        output_handle = open(reads_relative, "w")
        output_handle.write('\t'.join(header) + '\n')
        for fn in counts.keys():
            if sum([c[fn]
                    for c in counts_all_relative]) == 0 and options.removed:
                continue
            else:
                output_handle.write(
                    '\t'.join([fn] + [features_size[fn]] +
                              [str(c[fn])
                               for c in counts_all_relative]) + '\n')
        output_handle.close()

        self.logger.info('Print raw base abundance matrix in %s' %
                         base_abundance)
        output_handle = open(base_abundance, "w")
        output_handle.write('\t'.join(header) + '\n')
        for fn in counts_base.keys():
            if sum([c[fn] for c in counts_base_all]) == 0 and options.removed:
                continue
            else:
                output_handle.write(
                    '\t'.join([fn] + [features_size[fn]] +
                              [str(c[fn]) for c in counts_base_all]) + '\n')
        output_handle.close()

        self.logger.info('Print normalised base abundance matrix in %s' %
                         base_normalised)
        output_handle = open(base_normalised, "w")
        output_handle.write('\t'.join(header) + '\n')
        for fn in counts_base.keys():
            if sum([c[fn] for c in counts_base_all_normalised
                    ]) == 0 and options.removed:
                continue
            else:
                output_handle.write(
                    '\t'.join([fn] + [features_size[fn]] +
                              [str(c[fn])
                               for c in counts_base_all_normalised]) + '\n')
        output_handle.close()

        self.logger.info('Print relative base abundance matrix in %s' %
                         base_relative)
        output_handle = open(base_relative, "w")
        output_handle.write('\t'.join(header) + '\n')
        for fn in counts_base.keys():
            if sum([c[fn] for c in counts_base_all_relative
                    ]) == 0 and options.removed:
                continue
            else:
                output_handle.write(
                    '\t'.join([fn] + [features_size[fn]] +
                              [str(c[fn])
                               for c in counts_base_all_relative]) + '\n')
        output_handle.close()

        self.logger.info('Matrices printed')
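For clarity, the three per-sample transformations applied above reduce to the following; the values are hypothetical and feature_normalisation stands in for the corresponding option:

counts = {'geneA': 120, 'geneB': 30}   # raw read counts for one sample
library_size = 1_000_000               # total reads in the library
feature_normalisation = 1_000_000      # scaling constant

normalised = {k: v / library_size * feature_normalisation
              for k, v in counts.items()}
total = sum(counts.values())
relative = {k: v / total for k, v in counts.items()}  # fractions summing to 1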
Example #41
0
    def modify_bin(self, options):
        """Modify bin command"""

        make_sure_path_exists(os.path.dirname(options.output_genome))

        if not (options.add or options.remove or options.outlier_file
                or options.compatible_file):
            self.logger.warning('No modification to bin requested.\n')
            sys.exit()

        if (options.add or options.remove) and (options.outlier_file
                                                or options.compatible_file):
            self.logger.warning(
                "The 'outlier_file' and 'compatible_file' options cannot be specified with 'add' or 'remove'.\n"
            )
            sys.exit()

        if options.outlier_file and options.compatible_file:
            self.logger.warning(
                "The 'outlier_file' and 'compatible_file' options cannot be specified at the same time.\n"
            )
            sys.exit()

        failed_to_add = []
        failed_to_remove = []
        if options.add or options.remove:
            failed_to_add, failed_to_remove = genome_tk.modify(
                options.genome_file, options.scaffold_file, options.add,
                options.remove, options.output_genome)
        elif options.outlier_file:
            outliers = Outliers()
            outliers.remove_outliers(options.genome_file, options.outlier_file,
                                     options.output_genome, False)
        elif options.compatible_file:
            outliers = Outliers()
            if options.unique_only:
                outliers.add_compatible_unique(options.scaffold_file,
                                               options.genome_file,
                                               options.compatible_file,
                                               options.min_len,
                                               options.output_genome)
            elif options.closest_only:
                outliers.add_compatible_closest(options.scaffold_file,
                                                options.genome_file,
                                                options.compatible_file,
                                                options.min_len,
                                                options.output_genome)
            else:
                outliers.add_compatible(options.scaffold_file,
                                        options.genome_file,
                                        options.compatible_file,
                                        options.min_len, options.output_genome)

        if failed_to_add:
            self.logger.warning('Failed to add the following sequence(s):')
            for seq_id in failed_to_add:
                print('    %s' % seq_id)

        if failed_to_remove:
            self.logger.warning('Failed to remove the following sequence(s):')
            for seq_id in failed_to_remove:
                print('    %s' % seq_id)

        self.logger.info('Modified genome written to: ' +
                         options.output_genome)
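The mutual-exclusion checks above are performed by hand; as a design alternative (not the tool's actual CLI), the outlier_file/compatible_file conflict could be declared directly with argparse:

import argparse

parser = argparse.ArgumentParser(prog='modify_bin')
group = parser.add_mutually_exclusive_group()
group.add_argument('--outlier_file')
group.add_argument('--compatible_file')
# argparse now rejects any invocation that passes both flags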
Example #42
0
    def run(self, query_gene_file, target_gene_file, sorted_hit_table,
            evalue_threshold, per_iden_threshold, per_aln_len_threshold,
            num_top_targets, taxonomy_file, keep_rbhs, output_dir):
        """Classify genomes based on AAI to reference genomes.

        Parameters
        ----------
        query_gene_file : str
            File with all query genes in FASTA format.
        target_gene_file : str
            File with all target genes in FASTA format.
        sorted_hit_table : str
            Sorted table indicating genes with sequence similarity.
        evalue_threshold : float
            Evalue threshold used to define a homologous gene.
        per_iden_threshold : float
            Percent identity threshold used to define a homologous gene.
        per_aln_len_threshold : float
            Alignment length threshold used to define a homologous gene.
        num_top_targets : int
            Number of top scoring target genomes to report per query genome.
        taxonomy_file : str
            File indicating taxonomic identification of all target genomes.
        keep_rbhs : boolean
            Flag indicating if RBH should be written to file.
        output_dir : str
            Directory to store AAI results.
        """

        # read taxonomic identification of each genome
        taxonomy = {}
        if taxonomy_file:
            for line in open(taxonomy_file):
                genome_id, taxa_str = line.rstrip().split('\t')
                taxonomy[genome_id] = taxa_str

        # calculate AAI between query and target genomes
        aai_output_dir = os.path.join(output_dir, 'aai')
        make_sure_path_exists(aai_output_dir)
        aai_calculator = AAICalculator(self.cpus)
        aai_output_file, rbh_output_file = aai_calculator.run(
            query_gene_file, target_gene_file, sorted_hit_table,
            evalue_threshold, per_iden_threshold, per_aln_len_threshold,
            keep_rbhs, aai_output_dir)

        # determine matches to each query genome
        aai_results_file = os.path.join(aai_output_dir, 'aai_summary.tsv')
        with open(aai_results_file) as f:
            f.readline()

            hits = defaultdict(list)
            for line in f:
                line_split = line.rstrip().split('\t')
                query_id = line_split[0]
                target_id = line_split[2]
                aai = float(line_split[5])
                of = float(line_split[7])

                hits[query_id].append([target_id, aai, of])

        # report top matches
        results_file = os.path.join(output_dir, 'classify.tsv')
        fout = open(results_file, 'w')
        fout.write('Query Id\tTarget Id\tAAI\tOF\tScore')
        if taxonomy:
            fout.write('\tTarget Taxonomy')
        fout.write('\n')

        for query_id, cur_hits in hits.items():
            cur_hits.sort(key=lambda x: x[1], reverse=True)
            for i in range(0, min(num_top_targets, len(cur_hits))):
                data = [query_id] + cur_hits[i]
                fout.write('%s\t%s\t%.2f\t%.2f' % tuple(data))

                aai = data[2]
                of = data[3]
                fout.write('\t%.2f' % (aai + of))

                target_id = cur_hits[i][0]
                if target_id in taxonomy:
                    fout.write('\t%s' % taxonomy[target_id])

                fout.write('\n')
        fout.close()

        return results_file
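A toy illustration of the ranking used above: hits are sorted by AAI and the reported score is AAI plus the orthologous fraction (OF); the values are hypothetical:

hits = [['target1', 78.5, 0.62],  # [target_id, aai, of]
        ['target2', 91.2, 0.70]]
hits.sort(key=lambda h: h[1], reverse=True)
best = hits[0]
score = best[1] + best[2]         # 91.2 + 0.70 = 91.9
print(best[0], score)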
Example #43
0
    def run(self,
                taxonomy_file, type_strains_file,
                genome_prot_dir, extension,
                max_taxa, rank,
                per_identity, per_aln_len,
                genomes_to_process, keep_all_genes,
                no_reformat_gene_ids,
                output_dir):
        """ Create dereplicate set of genes.

        Taxonomy file should have the following format:
            <genome_id>\t<taxonomy_str>

            where taxonomy_str is in GreenGenes format:
                d__Bacteria;p__Proteobacteria;...;s__Escherichia coli

        Type strain file should have the following format:
            <genome_id>\t<genome name>

        Parameters
        ----------
        taxonomy_file : str
            File indicating taxonomy string for all genomes of interest
        type_strains_file : str
            File indicating type strains.
        genome_prot_dir : str
            Directory containing amino acid genes for each genome.
        extension : str
            Extension of files with called genes.
        max_taxa : int
            Maximum taxa to retain in a named group.
        rank : int
            Taxonomic rank to perform dereplication (0 = domain, ..., 6 = species).
        per_identity : float
            Percent identity for subsampling similar genes.
        per_aln_len : float
            Percent alignment length for subsampling similar genes.
        genomes_to_process : str
            File with list of genomes to retain instead of performing taxon subsampling.
        keep_all_genes : boolean
            Flag indicating that no gene subsampling should be performed.
        no_reformat_gene_ids : boolean
            Flag indicating if gene ids should be reformatted to include scaffold names given by the GFF file.
        output_dir : str
            Desired output directory for storing results.
        """

        make_sure_path_exists(output_dir)
        self.logger.info('Dereplicating at the rank of %s.' % self.rank_labels[rank])

        # get taxonomy string for each genome
        taxonomy = {}
        if taxonomy_file:
            self.logger.info('Reading taxonomy file.')
            taxonomy = Taxonomy().read(taxonomy_file)
            self.logger.info('There are %d genomes with taxonomy strings.' % len(taxonomy))

        # get type strains; genomes which should never be dereplicated
        type_strains = set()
        if type_strains_file:
            self.logger.info('Reading type strain file.')
            type_strains = self.read_type_strain(type_strains_file)
            self.logger.info('There are %d type strains.' % len(type_strains))

        # get specific list of genomes to process
        genomes_to_retain = set()
        if genomes_to_process:
            self.logger.info('Reading genomes to retain.')
            for line in open(genomes_to_process):
                line_split = line.split()
                genomes_to_retain.add(line_split[0])
            self.logger.info('Retaining %d genomes.' % len(genomes_to_retain))
            
        # make sure extension filter starts with a '.'
        if not extension.startswith('.'):
            extension = '.' + extension

        # identify unique genes in each named group
        fout = open(os.path.join(output_dir, 'genomes_without_called_genes.tsv'), 'w')
        rank_genomes = defaultdict(list)
        genome_files = os.listdir(genome_prot_dir)
        underclassified_genomes = 0
        genomes_with_missing_data = 0
        for genome_file in genome_files:
            genome_id = remove_extension(genome_file, extension)

            if not genome_file.endswith(extension):
                continue

            if genomes_to_process and genome_id not in genomes_to_retain:
                continue

            genome_file = os.path.join(genome_prot_dir, genome_file)
            if not os.path.exists(genome_file):
                genomes_with_missing_data += 1
                fout.write(genome_id + '\t' + ';'.join(taxonomy[genome_id]) + '\n')
                continue

            t = taxonomy.get(genome_id, self.rank_prefixes)
            taxa = t[rank]
            if taxa[3:] == '':
                underclassified_genomes += 1
                rank_genomes[self.underclassified].append(genome_id)
            else:
                rank_genomes[taxa].append(genome_id)

            validate_seq_ids(genome_file)

        fout.close()

        total_genomes_to_process = sum([len(genome_list) for genome_list in rank_genomes.values()])
        if total_genomes_to_process == 0:
            self.logger.error('No genomes found in directory: %s. Check the --extension flag used to identify genomes.' % genome_prot_dir)
            sys.exit(-1)

        self.logger.info('Under-classified genomes automatically placed into the database: %d' % underclassified_genomes)
        self.logger.info('Genomes with missing sequence data: %d' % genomes_with_missing_data)
        self.logger.info('Total named groups: %d' % len(rank_genomes))
        self.logger.info('Total genomes to process: %d' % total_genomes_to_process)

        # process each named group
        gene_file = os.path.join(output_dir, 'custom_db.faa')
        gene_out = open(gene_file, 'w')

        taxonomy_out = open(os.path.join(output_dir, 'custom_taxonomy.tsv'), 'w')

        tmp_dir = tempfile.mkdtemp()
        total_genes_removed = 0
        total_genes_kept = 0
        total_genomes_kept = 0
        processed_genomes = 0
        for taxa, genome_list in rank_genomes.items():
            processed_genomes += len(genome_list)

            print('-------------------------------------------------------------------------------')
            self.logger.info('Processing %s | Finished %d of %d (%.2f%%) genomes.' % (taxa, processed_genomes, total_genomes_to_process, processed_genomes * 100.0 / total_genomes_to_process))

            # create directory with selected genomes
            taxon_dir = os.path.join(tmp_dir, 'taxon')
            os.mkdir(taxon_dir)

            reduced_genome_list = genome_list
            if not genomes_to_process and taxa != self.underclassified:  # perform taxon subsampling
                reduced_genome_list = self.select_taxa(genome_list, taxonomy, type_strains, max_taxa)
            total_genomes_kept += len(reduced_genome_list)

            gene_dir = os.path.join(taxon_dir, 'genes')
            os.mkdir(gene_dir)
            for genome_id in reduced_genome_list:
                taxonomy_out.write(genome_id + '\t' + ';'.join(taxonomy.get(genome_id, self.rank_prefixes)) + '\n')

                genome_gene_file = os.path.join(genome_prot_dir, genome_id + extension)
                gff_file = os.path.join(genome_prot_dir, genome_id + '.gff')
                output_gene_file = os.path.join(gene_dir, genome_id + '.faa')
                if not no_reformat_gene_ids:
                    self.reformat_gene_id_to_scaffold_id(genome_gene_file, gff_file, taxonomy, output_gene_file)
                else:
                    os.system('cp %s %s' % (genome_gene_file, output_gene_file))

            # filter genes based on amino acid identity
            genes_to_remove = []
            amended_gene_dir = os.path.join(taxon_dir, 'amended_genes')
            if keep_all_genes or taxa == self.underclassified:
                # modify gene identifiers to include genome ids
                self.amend_gene_identifies(gene_dir, amended_gene_dir)
            else:
                # filter genes on AAI
                genes_to_remove = self.filter_aai(taxon_dir, gene_dir, amended_gene_dir, per_identity, per_aln_len, self.cpus)

            self.logger.info('Writing unique genes from genomes in %s.' % taxa)
            genes_kept = self.write_gene_file(gene_out, amended_gene_dir, reduced_genome_list, taxonomy, genes_to_remove)

            self.logger.info('Retained %d of %d taxa.' % (len(reduced_genome_list), len(genome_list)))
            self.logger.info('Genes to keep: %d' % genes_kept)
            self.logger.info('Genes removed: %d' % len(genes_to_remove))

            total_genes_kept += genes_kept
            total_genes_removed += len(genes_to_remove)

            shutil.rmtree(taxon_dir)

        taxonomy_out.close()
        gene_out.close()

        self.logger.info('Retained %d of %d (%.1f%%) genomes' % (total_genomes_kept, total_genomes_to_process, total_genomes_kept * 100.0 / (total_genomes_to_process)))
        self.logger.info('Total genes kept: %d' % total_genes_kept)
        self.logger.info('Total genes removed: %d (%.1f%%)' % (total_genes_removed, total_genes_removed * 100.0 / (total_genes_kept + total_genes_removed)))

        self.logger.info('Creating BLAST database.')
        os.system('makeblastdb -dbtype prot -in %s' % gene_file)

        shutil.rmtree(tmp_dir)
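The select_taxa subsampling step is not shown above; a hedged sketch of one plausible policy consistent with the surrounding code (type strains are never dereplicated, then the group is topped up to max_taxa; the real implementation may instead sample randomly):

def select_taxa_sketch(genome_list, type_strains, max_taxa):
    """Keep all type strains, then top up to max_taxa with other genomes."""
    keep = [g for g in genome_list if g in type_strains]
    for g in genome_list:
        if len(keep) >= max_taxa:
            break
        if g not in keep:
            keep.append(g)
    return keep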
Example #44
0
    def deleteGenomes(self, batchfile=None, db_genome_ids=None, reason=None):
        '''
        Delete genomes.
        Returns True on success; raises GenomeDatabaseError on failure.

        Parameters:
        :param batchfile: text file listing a range of ids to delete
        :param db_genome_ids: a list of ids that can be given directly on the command line
        :param reason: reason for the deletion, recorded in the log
        '''

        self._loggerSetup()

        try:
            if db_genome_ids is False:
                raise GenomeDatabaseError(
                    "Unable to delete genomes. Unable to retrieve genome ids.")

            # restrict deletion to genomes owned by user
            has_permission, username, genomes_owners = self._hasPermissionToEditGenomes(
                db_genome_ids)

            if has_permission is None:
                raise GenomeDatabaseError(
                    "Unable to delete genomes. Unable to retrieve permissions for genomes."
                )

            if has_permission is False:
                raise GenomeDatabaseError(
                    "Unable to delete genomes. Insufficient permissions.")

            if db_genome_ids:
                if not confirm(
                        "Are you sure you want to delete %i genomes (this action cannot be undone)"
                        % len(db_genome_ids)):
                    raise GenomeDatabaseError("User aborted database action.")

                self.cur.execute(
                    "DELETE FROM aligned_markers " + "WHERE genome_id IN %s ",
                    (tuple(db_genome_ids), ))

                self.cur.execute(
                    "DELETE FROM genome_list_contents " +
                    "WHERE genome_id IN %s", (tuple(db_genome_ids), ))

                # Deletion of metadata

                self.cur.execute(
                    "DELETE FROM metadata_genes " + "WHERE id IN %s",
                    (tuple(db_genome_ids), ))
                self.cur.execute(
                    "DELETE FROM metadata_ncbi " + "WHERE id IN %s",
                    (tuple(db_genome_ids), ))
                self.cur.execute(
                    "DELETE FROM metadata_nucleotide " + "WHERE id IN %s",
                    (tuple(db_genome_ids), ))
                self.cur.execute(
                    "DELETE FROM metadata_taxonomy " + "WHERE id IN %s",
                    (tuple(db_genome_ids), ))
                self.cur.execute(
                    "DELETE FROM metadata_rna " + "WHERE id IN %s",
                    (tuple(db_genome_ids), ))
                self.cur.execute(
                    "DELETE FROM metadata_sequence " + "WHERE id IN %s",
                    (tuple(db_genome_ids), ))

                self.cur.execute("DELETE FROM genomes " + "WHERE id IN %s",
                                 (tuple(db_genome_ids), ))

                self.cur.execute(
                    "UPDATE metadata_taxonomy set gtdb_genome_representative = NULL where  "
                    + "gtdb_genome_representative in %s",
                    (tuple(genomes_owners.keys()), ))

                for genome, info in genomes_owners.items():
                    if str(username) != str(info.get("owner")):
                        logging.info(
                            '''Genome {0} has been deleted by {1} for the following reason '{2}'
                                          WARNING: {1} is not the owner of this {0} (real owner {3} )
                                          {0} needs to be moved manually to the deprecated folder'''
                            .format(genome, username, reason,
                                    info.get("owner")))
                    else:
                        if info.get("prefix") is "U":
                            target = os.path.dirname(
                                os.path.join(self.deprecatedUserDir,
                                             info.get("relative_path")))
                        elif info.get("prefix") is "GB":
                            target = os.path.join(self.deprecatedGBKDir,
                                                  info.get("relative_path"))
                        elif info.get("prefix") is "RS":
                            target = os.path.join(self.deprecatedRSQDir,
                                                  info.get("relative_path"))
                        make_sure_path_exists(target)
                        os.rename(
                            os.path.dirname(
                                Tools.fastaPathGenerator(
                                    info.get("relative_path"),
                                    info.get("prefix"))), target)
                        logging.info(
                            "Genome {0} has been deleted by {1} for the following reason '{2}'"
                            .format(genome, username, reason))
        except GenomeDatabaseError:
            raise

        return True
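The 'IN %s' pattern above relies on the database driver adapting a Python tuple to a SQL list, which psycopg2 does; a minimal sketch with a hypothetical connection string:

import psycopg2

conn = psycopg2.connect('dbname=gtdb')  # hypothetical DSN
cur = conn.cursor()
ids = (4, 8, 15)
# psycopg2 renders the tuple as (4, 8, 15), keeping the IN clause parameterised
cur.execute('DELETE FROM genomes WHERE id IN %s', (ids,))
conn.commit()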
Example #45
0
    def run(self, 
                genome_files, 
                output_dir, 
                called_genes=False, 
                translation_table=None, 
                meta=False,
                closed_ends=False):
        """Call genes with Prodigal.

        Call genes with Prodigal and store the results in the
        specified output directory. For convenience, the
        called_genes flag can be used to indicate genes have
        previously been called and simply need to be copied
        to the specified output directory.

        Parameters
        ----------
        genome_files : list of str
            Nucleotide fasta files to call genes on.
        called_genes : boolean
            Flag indicating if genes are already called.
        translation_table : int
            Specifies desired translation table, use None to automatically
            select between tables 4 and 11.
        meta : boolean
            Flag indicating if prodigal should call genes with the metagenomics procedure.
        closed_ends : boolean
            If True, do not allow genes to run off edges (passes the -c flag).
        output_dir : str
            Directory to store called genes.

        Returns
        -------
        d[genome_id] -> namedtuple(best_translation_table,
                                   coding_density_4,
                                   coding_density_11)
            Summary statistics of called genes for each genome.
        """

        self.called_genes = called_genes
        self.translation_table = translation_table
        self.meta = meta
        self.closed_ends = closed_ends
        self.output_dir = output_dir

        make_sure_path_exists(self.output_dir)

        progress_func = None
        if self.verbose:
            file_type = 'genomes'
            self.progress_str = '  Finished processing %d of %d (%.2f%%) genomes.'
            if meta:
                file_type = 'scaffolds'
                if len(genome_files):
                    file_type = ntpath.basename(genome_files[0])

                self.progress_str = '  Finished processing %d of %d (%.2f%%) files.'

            self.logger.info('Identifying genes within %s: ' % file_type)
            progress_func = self._progress

        parallel = Parallel(self.cpus)
        summary_stats = parallel.run(self._producer, self._consumer, genome_files, progress_func)

        return summary_stats
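The per-genome Prodigal invocation happens in the producer, which is not shown; a hedged sketch of a single call consistent with the flags described in the docstring (file names are hypothetical):

import subprocess

cmd = ['prodigal',
       '-i', 'genome.fna',  # input nucleotide FASTA
       '-a', 'genome.faa',  # translated genes
       '-g', '11',          # translation table; omit to let Prodigal choose
       '-c']                # closed ends: genes may not run off contig edges
subprocess.run(cmd, check=True)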
Example #46
0
    def pull_html(self, options):
        """Pull all genus.html files."""
        make_sure_path_exists(options.output_dir)
        p = LPSN(options.output_dir)
        p.pull_html()
Example #47
0
    def parse_options(self, options):
        """Parse user options and call the correct pipeline(s)"""

        # not every subcommand defines 'file'; default to None if absent
        if getattr(options, 'file', None) == "stdout":
            options.file = ''

        if (options.subparser_name == 'call_genes'):
            self.call_genes(options)
        elif (options.subparser_name == 'similarity'):
            self.similarity(options)
        elif (options.subparser_name == 'aai'):
            self.aai(options)
        elif (options.subparser_name == 'classify'):
            self.classify(options)
        elif (options.subparser_name == 'aai_wf'):
            root_dir = options.output_dir
            make_sure_path_exists(root_dir)

            if options.proteins:
                if options.file_ext == 'fna':
                    self.logger.warning(
                        "Changing file extension from 'fna' to 'faa' since 'proteins' flag was given."
                    )
                    options.file_ext = 'faa'
                options.query_proteins = options.input_files
                options.target_proteins = options.input_files
            else:
                options.input_genomes = options.input_files
                options.output_dir = os.path.join(root_dir, 'genes')
                self.call_genes(options)
                options.query_proteins = os.path.join(root_dir, 'genes')
                options.target_proteins = os.path.join(root_dir, 'genes')
                options.file_ext = 'faa'

            options.output_dir = os.path.join(root_dir, 'similarity')
            self.similarity(options)

            options.query_gene_file = os.path.join(options.output_dir,
                                                   'query_genes.faa')
            options.sorted_hit_table = os.path.join(options.output_dir,
                                                    'hits_sorted.tsv')
            options.output_dir = os.path.join(root_dir, 'aai')
            self.aai(options)
        elif (options.subparser_name == 'classify_wf'):
            root_dir = options.output_dir
            make_sure_path_exists(root_dir)

            if options.query_files == options.target_files:
                self.logger.error(
                    "The 'query_files' and 'target_files' arguments must be different."
                )
                sys.exit()

            if options.proteins:
                if options.file_ext == 'fna':
                    self.logger.warning(
                        "Changing file extension from 'fna' to 'faa' since 'proteins' flag was given."
                    )
                    options.file_ext = 'faa'
                options.query_proteins = options.query_files
                options.target_proteins = options.target_files
            else:
                options.input_genomes = options.query_files
                options.output_dir = os.path.join(root_dir, 'query_genes')
                self.call_genes(options)

                options.input_genomes = options.target_files
                options.output_dir = os.path.join(root_dir, 'target_genes')
                self.call_genes(options)

                options.query_proteins = os.path.join(root_dir, 'query_genes')
                options.target_proteins = os.path.join(root_dir,
                                                       'target_genes')
                options.file_ext = 'faa'

            options.output_dir = os.path.join(root_dir, 'similarity')
            self.similarity(options)

            options.query_gene_file = os.path.join(options.output_dir,
                                                   'query_genes.faa')
            options.target_gene_file = os.path.join(options.output_dir,
                                                    'target_genes.faa')
            options.sorted_hit_table = os.path.join(options.output_dir,
                                                    'hits_sorted.tsv')
            options.output_dir = os.path.join(root_dir, 'classify')
            self.classify(options)
        elif (options.subparser_name == 'aa_usage'):
            self.aa_usage(options)
        elif (options.subparser_name == 'codon_usage'):
            self.codon_usage(options)
        elif (options.subparser_name == 'kmer_usage'):
            self.kmer_usage(options)
        elif (options.subparser_name == 'stop_usage'):
            self.stop_usage(options)
        elif (options.subparser_name == 'lgt_di'):
            self.lgt_di(options)
        elif (options.subparser_name == 'lgt_codon'):
            self.lgt_codon(options)
        elif (options.subparser_name == 'diss'):
            self.diss(options)
        elif (options.subparser_name == 'hclust'):
            self.hclust(options)
        elif (options.subparser_name == 'pcoa_plot'):
            self.pcoa_plot(options)
        elif (options.subparser_name == 'heatmap'):
            self.heatmap(options)
        else:
            self.logger.error('  [Error] Unknown CompareM command: "' +
                              options.subparser_name + '"\n')
            sys.exit()

        return 0
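A usage sketch for the dispatcher above, assuming OptionsParser is importable; the namespace below is hypothetical, and a real run would need the full attribute set of the chosen subcommand:

import argparse

options = argparse.Namespace(subparser_name='aa_usage', file='stdout')
OptionsParser().parse_options(options)  # routes to self.aa_usage(options)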
Example #48
0
    def __tigrfam_worker(self, queue_in, queue_out):
        """Process each data item in parallel."""
        tigrfam_version = 'tigrfam_15.0'
        tigrfam_extension = f'_{tigrfam_version}.tsv'
        tigrfam_tophit_extension = f'_{tigrfam_version}_tophit.tsv'

        symlink_tigrfam_extension = '_tigrfam.tsv'
        symlink_tigrfam_tophit_extension = '_tigrfam_tophit.tsv'

        while True:
            gene_file = queue_in.get(block=True, timeout=None)
            if gene_file is None:  # sentinel indicating no more work
                break

            assembly_dir, filename = os.path.split(gene_file)
            make_sure_path_exists(os.path.join(assembly_dir, tigrfam_version))

            output_hit_file = os.path.join(
                assembly_dir, tigrfam_version,
                filename.replace(self.protein_file_ext, tigrfam_extension))
            hmmsearch_out = os.path.join(
                assembly_dir, tigrfam_version,
                filename.replace(self.protein_file_ext,
                                 f'_{tigrfam_version}.out'))
            # search genes against the TIGRFAM HMMs; --cut_nc applies the
            # curated noise cutoffs stored in each model
            cmd = 'hmmsearch -o %s --tblout %s --noali --notextw --cut_nc --cpu 1 %s %s' % (
                hmmsearch_out, output_hit_file, self.tigrfam_hmms, gene_file)
            os.system(cmd)

            # calculate checksum
            checksum = sha256(output_hit_file)
            fout = open(output_hit_file + '.sha256', 'w')
            fout.write(checksum)
            fout.close()

            # determine top hits
            tigrfam_tophit_file = os.path.join(
                assembly_dir, tigrfam_version,
                filename.replace(self.protein_file_ext,
                                 tigrfam_tophit_extension))
            self._tigr_top_hit(output_hit_file, tigrfam_tophit_file)

            # create symlink in prodigal_folder
            new_hit_link = os.path.join(
                assembly_dir,
                filename.replace(self.protein_file_ext,
                                 symlink_tigrfam_extension))
            new_tophit_link = os.path.join(
                assembly_dir,
                filename.replace(self.protein_file_ext,
                                 symlink_tigrfam_tophit_extension))

            os.symlink(output_hit_file, new_hit_link)
            os.symlink(tigrfam_tophit_file, new_tophit_link)

            # allow results to be processed or written to file
            queue_out.put(gene_file)
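
The worker above pulls gene files from `queue_in` until it receives a `None` sentinel. A minimal sketch of how such a worker might be driven with `multiprocessing` (the `worker` body and file names are hypothetical stand-ins for the real processing):

import multiprocessing as mp

def worker(queue_in, queue_out):
    while True:
        item = queue_in.get()
        if item is None:                 # sentinel: no more work
            break
        queue_out.put(item.upper())      # placeholder for real processing

if __name__ == '__main__':
    queue_in, queue_out = mp.Queue(), mp.Queue()
    procs = [mp.Process(target=worker, args=(queue_in, queue_out))
             for _ in range(2)]
    for p in procs:
        p.start()

    items = ['a.faa', 'b.faa', 'c.faa']
    for item in items:
        queue_in.put(item)
    for _ in procs:
        queue_in.put(None)               # one sentinel per worker

    results = [queue_out.get() for _ in items]
    for p in procs:
        p.join()
    print(results)
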
Example #49
0
    def _calculate_fastani_distance(self, user_genome, genome_reps):
        """Calculate the FastANI distance between a user genome and candidate
        reference genomes in order to classify it at the species level.

        Parameters
        ----------
        user_genome : str
          Identifier of the user genome.
        genome_reps : list of tuple
          Candidate representative genomes; the first element of each tuple is the genome id.

        """
        try:
            self.tmp_output_dir = tempfile.mkdtemp()  # mkdtemp already creates the directory

            # write the two fastANI input files: the query list and the
            # reference list
            query_list_file = open(os.path.join(
                self.tmp_output_dir, 'query_list.txt'), 'w')

            # rebuild the on-disk path for the unprocessed user genome
            genome_dirs_query = ("SELECT g.id, g.fasta_file_location, gs.external_id_prefix "
                                 "FROM genomes g "
                                 "LEFT JOIN genome_sources gs ON gs.id = g.genome_source_id "
                                 "WHERE g.id IN %s")
            self.cur.execute(genome_dirs_query,
                             (tuple([user_genome]),))
            raw_results = self.cur.fetchall()
            genome_dir_user = {a: fastaPathGenerator(b, c)
                               for a, b, c in raw_results}
            for _k, v in genome_dir_user.items():
                query_list_file.write('{}\n'.format(v))
            query_list_file.close()

            # rebuild the on-disk path for each candidate representative
            genome_dirs_query = ("SELECT g.id, g.fasta_file_location, gs.external_id_prefix "
                                 "FROM genomes g "
                                 "LEFT JOIN genome_sources gs ON gs.id = g.genome_source_id "
                                 "WHERE g.id IN %s")
            self.cur.execute(genome_dirs_query,
                             (tuple(list(zip(*genome_reps))[0]),))
            raw_results = self.cur.fetchall()
            genome_dirs = {a: fastaPathGenerator(b, c)
                           for a, b, c in raw_results}
            ref_list_file = open(os.path.join(
                self.tmp_output_dir, 'ref_list.txt'), 'w')
            for _k, v in genome_dirs.items():
                ref_list_file.write('{}\n'.format(v))
            ref_list_file.close()

            # run fastANI
            query_list_path = os.path.join(self.tmp_output_dir, 'query_list.txt')
            ref_list_path = os.path.join(self.tmp_output_dir, 'ref_list.txt')
            if not os.path.isfile(query_list_path) or not os.path.isfile(ref_list_path):
                # a bare 'raise' here is invalid outside an except block
                raise ValueError('fastANI input list files are missing.')

            cmd = 'fastANI --ql {0} --rl {1} -o {2} > /dev/null 2>{3}'.format(
                query_list_path,
                ref_list_path,
                os.path.join(self.tmp_output_dir, 'results.tab'),
                os.path.join(self.tmp_output_dir, 'error.log'))
            os.system(cmd)

            if not os.path.isfile(os.path.join(self.tmp_output_dir, 'results.tab')):
                errstr = 'FastANI has stopped:\n'
                error_log = os.path.join(self.tmp_output_dir, 'error.log')
                if os.path.isfile(error_log):
                    finalline = ''
                    with open(error_log) as debug:
                        for line in debug:
                            finalline = line  # keep only the last line
                    errstr += finalline
                raise ValueError(errstr)

            dict_parser_distance = self._parse_fastani_results(
                os.path.join(self.tmp_output_dir, 'results.tab'),
                genome_dirs, user_genome)
            if len(dict_parser_distance) == 0:
                return None
            # take the reference with the highest ANI to the user genome
            sorted_dict = sorted(dict_parser_distance.get(user_genome).items(),
                                 key=lambda kv: kv[1]['ani'], reverse=True)
            fastani_matching_reference = sorted_dict[0][0]
            shutil.rmtree(self.tmp_output_dir)
            return fastani_matching_reference

        except Exception:
            # clean up the temporary directory on any failure, then re-raise
            if os.path.exists(self.tmp_output_dir):
                shutil.rmtree(self.tmp_output_dir)
            raise
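
`_parse_fastani_results` is not shown in this snippet; the sketch below is a plausible reconstruction, assuming fastANI's standard output columns (query path, reference path, ANI, mapped fragments, total fragments) and the dict shape the caller above expects:

    def _parse_fastani_results(self, results_file, genome_dirs, user_genome):
        """Hypothetical sketch: map each reference genome id to its ANI."""
        # invert id -> FASTA path so paths reported by fastANI map back to ids
        path_to_id = {v: k for k, v in genome_dirs.items()}
        results = {user_genome: {}}
        with open(results_file) as f:
            for line in f:
                parts = line.split()
                if len(parts) < 5:
                    continue
                ref_id = path_to_id.get(parts[1], parts[1])
                results[user_genome][ref_id] = {'ani': float(parts[2])}
        return results if results[user_genome] else {}
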
Example #50
0
    def run(self, input_tree, msa_file, marker_info_file, mask_file,
            perc_markers_to_keep, num_replicates, model, output_dir):
        """Jackknife marker genes.

        Marker file should have the format:
          <marker id>\t<marker name>\t<marker desc>\t<length>\n

        Parameters
        ----------
        input_tree : str
          Tree inferred with all data.
        msa_file : str
          File containing multiple sequence alignment for all taxa.
        marker_info_file : str
          File indicating database id, HMM name, description and length of each marker in the alignment.
        mask_file : str
          File indicating masking of multiple sequence alignment.
        perc_markers_to_keep : float [0, 1]
          Percentage of marker genes to keep in each replicate.
        num_replicates : int
          Number of replicates to perform.
        model : str
          Desired model of evolution.
        output_dir : str
          Output directory for jackknife trees.
        """

        assert (model in ['wag', 'jtt'])

        self.model = model
        self.perc_markers_to_keep = perc_markers_to_keep
        self.replicate_dir = os.path.join(output_dir, 'replicates')
        make_sure_path_exists(self.replicate_dir)

        # determine length of each marker gene in alignment
        marker_lengths = []
        total_len = 0
        with open(marker_info_file) as f:
            f.readline()  # skip header line
            for line in f:
                line_split = line.split('\t')
                ml = int(line_split[3])
                marker_lengths.append(ml)
                total_len += ml

        self.logger.info('Concatenated length of markers: %d' % total_len)

        # read mask; a '0' marks an alignment column removed during filtering
        with open(mask_file) as f:
            mask = f.readline().strip()
        start = 0
        self.marker_lengths = []
        total_mask_len = 0
        for ml in marker_lengths:
            end = start + ml
            zeros = mask[start:end].count('0')
            start = end

            self.marker_lengths.append(ml - zeros)
            total_mask_len += ml - zeros

        self.logger.info('Concatenated length of filtered MSA: %d' %
                         total_mask_len)

        # read full multiple sequence alignment
        self.msa = seq_io.read(msa_file)

        if len(list(self.msa.values())[0]) != total_mask_len:
            self.logger.error('Length of MSA does not match length of mask.')
            sys.exit()

        # calculate replicates
        self.logger.info('Calculating jackknife marker replicates:')
        parallel = Parallel(self.cpus)
        parallel.run(self._producer, None, range(num_replicates),
                     self._progress)

        # calculate support
        self.logger.info('Calculating support for %d replicates.' %
                         num_replicates)
        rep_tree_files = []
        for rep_index in range(num_replicates):
            rep_tree_files.append(
                os.path.join(self.replicate_dir,
                             'jk_markers.tree.' + str(rep_index) + '.tre'))

        output_tree = os.path.join(
            output_dir,
            remove_extension(input_tree) + '.jk_markers.tree')
        bootstrap_support(input_tree, rep_tree_files, output_tree)

        return output_tree
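
`_producer` is referenced but not defined in this snippet; the sketch below shows what one jackknife replicate might look like, assuming markers are dropped at random and replicate trees are inferred with FastTree (the FastTree invocation is an assumption, not part of the original code):

    def _producer(self, rep_index):
        """Hypothetical sketch of a single jackknife replicate."""
        import random

        # choose the subset of markers to keep for this replicate
        num_markers = len(self.marker_lengths)
        num_keep = int(num_markers * self.perc_markers_to_keep)
        keep = set(random.sample(range(num_markers), num_keep))

        # column span of each marker within the filtered alignment
        spans, start = [], 0
        for ml in self.marker_lengths:
            spans.append((start, start + ml))
            start += ml

        # write the subsampled alignment
        rep_msa_file = os.path.join(self.replicate_dir,
                                    'jk_markers.msa.%d.faa' % rep_index)
        fout = open(rep_msa_file, 'w')
        for seq_id, seq in self.msa.items():
            sub_seq = ''.join(seq[s:e] for i, (s, e) in enumerate(spans)
                              if i in keep)
            fout.write('>%s\n%s\n' % (seq_id, sub_seq))
        fout.close()

        # infer the replicate tree; file name matches what run() expects
        rep_tree_file = os.path.join(self.replicate_dir,
                                     'jk_markers.tree.%d.tre' % rep_index)
        model_flag = '-wag' if self.model == 'wag' else ''
        os.system('FastTree %s %s > %s 2> /dev/null' %
                  (model_flag, rep_msa_file, rep_tree_file))
        return rep_index
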
Example #51
0
    def __init__(self, output_dir):
        self.outdir = output_dir
        make_sure_path_exists(self.outdir)
        self.outfile = os.path.join(self.outdir, 'existing_names.tsv')