예제 #1
0
파일: main.py 프로젝트: gsc0107/RefineM
    def compatible(self, options):
        """Compatible command.

        Loads the scaffold statistics, derives genome statistics from them,
        runs the reference homology check and passes the results to
        ``Outliers.compatible`` together with the output path
        ``<output_dir>/compatible.tsv``.
        """

        check_file_exists(options.reference_file)
        check_file_exists(options.scaffold_stats_file)
        make_sure_path_exists(options.output_dir)

        # per-scaffold statistics first, then the per-genome summary
        self.logger.info('Reading scaffold statistics.')
        stats_of_scaffolds = ScaffoldStats()
        stats_of_scaffolds.read(options.scaffold_stats_file)
        stats_of_genomes = GenomeStats().run(stats_of_scaffolds)

        # putative homologs to the reference genomes
        homology_search = Reference(1, None)
        homologs = homology_search.homology_check(
            options.reference_file,
            options.min_genes,
            float(options.perc_genes))

        # scaffolds compatible with bins
        detector = Outliers()
        results_path = os.path.join(options.output_dir, 'compatible.tsv')
        detector.compatible(
            homologs,
            stats_of_scaffolds,
            stats_of_genomes,
            options.gc_perc,
            options.td_perc,
            options.cov_corr,
            options.cov_perc,
            options.report_type,
            results_path)

        self.logger.info('Results written to: ' + results_path)
예제 #2
0
파일: main.py 프로젝트: AlexRBaker/RefineM
    def cluster(self, options):
        """Cluster command.

        Logs a banner, reads the scaffold statistics file and calls
        ``Cluster.run`` with the clustering options, the genome file and
        the output directory.
        """
        for banner_line in (
                '',
                '*******************************************************************************',
                ' [RefineM - cluster] Partitioning bin into clusters.',
                '*******************************************************************************'):
            self.logger.info(banner_line)

        check_file_exists(options.scaffold_stats_file)
        check_file_exists(options.genome_file)
        make_sure_path_exists(options.output_dir)

        self.logger.info('')
        self.logger.info('  Reading scaffold statistics.')
        stats = ScaffoldStats()
        stats.read(options.scaffold_stats_file)

        clusterer = Cluster(options.cpus)
        clusterer.run(stats, options.num_clusters, options.num_components,
                      options.K, options.no_coverage, options.no_pca,
                      options.iterations, options.genome_file,
                      options.output_dir)

        self.logger.info('')
        self.logger.info('  Partitioned sequences written to: ' + options.output_dir)

        self.time_keeper.print_time_stamp()
예제 #3
0
파일: main.py 프로젝트: gsc0107/RefineM
    def outliers(self, options):
        """Outlier command.

        Reads the scaffold statistics, derives genome statistics, asks
        ``Outliers.identify`` to report into ``<output_dir>/outliers.tsv``
        and, unless ``options.no_plots`` is set, calls ``Outliers.plot``
        with ``<output_dir>/plots`` as the plot directory.
        """

        check_file_exists(options.scaffold_stats_file)
        make_sure_path_exists(options.output_dir)

        self.logger.info('Reading scaffold statistics.')
        stats_of_scaffolds = ScaffoldStats()
        stats_of_scaffolds.read(options.scaffold_stats_file)
        stats_of_genomes = GenomeStats().run(stats_of_scaffolds)

        # outlier identification
        detector = Outliers()
        report_path = os.path.join(options.output_dir, 'outliers.tsv')
        detector.identify(
            stats_of_scaffolds,
            stats_of_genomes,
            options.gc_perc,
            options.td_perc,
            options.cov_corr,
            options.cov_perc,
            options.report_type,
            report_path)
        self.logger.info('Outlier information written to: ' + report_path)

        # plotting is optional
        if options.no_plots:
            return

        plots_path = os.path.join(options.output_dir, 'plots')
        make_sure_path_exists(plots_path)

        detector.plot(
            stats_of_scaffolds,
            stats_of_genomes,
            detector.gc_dist,
            detector.td_dist,
            options,
            options.highlight_file,
            options.links_file,
            options.individual_plots,
            plots_path)

        self.logger.info('Outlier plots written to: ' + plots_path)
예제 #4
0
파일: main.py 프로젝트: gsc0107/RefineM
    def scaffold_stats(self, options):
        """Scaffold statistics command.

        Validates the scaffold and genome files, obtains a coverage profile
        (user supplied, computed from BAM files, or none), obtains
        tetranucleotide signatures (user supplied or computed), and runs
        ``ScaffoldStats.run`` with ``<output_dir>/scaffold_stats.tsv`` as
        the output path.

        Exits the process with a non-zero status when the scaffold file or
        any genome file fails the nucleotide-sequence check.
        """

        check_file_exists(options.scaffold_file)

        if not self._check_nuclotide_seqs([options.scaffold_file]):
            self.logger.warning(
                'Scaffold file must contain nucleotide sequences.')
            # non-zero status so calling scripts can detect the failure
            sys.exit(1)

        genome_files = self._genome_files(options.genome_nt_dir,
                                          options.genome_ext)
        if not self._check_nuclotide_seqs(genome_files):
            self.logger.warning('All files must contain nucleotide sequences.')
            sys.exit(1)

        make_sure_path_exists(options.output_dir)

        # get coverage information
        if options.coverage_file:
            check_file_exists(options.coverage_file)
            coverage_file = options.coverage_file
        elif options.bam_files:
            coverage = Coverage(options.cpus)
            coverage_file = os.path.join(options.output_dir, 'coverage.tsv')
            coverage.run(options.bam_files, coverage_file,
                         options.cov_all_reads, options.cov_min_align,
                         options.cov_max_edit_dist)
            self.logger.info('Coverage profiles written to: %s' %
                             coverage_file)
        else:
            # coverage is optional: continue without a coverage profile
            self.logger.warning(
                'One or more BAM files must be specified in order to calculate coverage profiles.'
            )
            coverage_file = None

        # get tetranucleotide signatures
        if options.tetra_file:
            # validate a user-supplied file the same way as the coverage file
            check_file_exists(options.tetra_file)
            tetra_file = options.tetra_file
        else:
            tetra = Tetranucleotide(options.cpus)
            tetra_file = os.path.join(options.output_dir, 'tetra.tsv')
            signatures = tetra.run(options.scaffold_file)
            tetra.write(signatures, tetra_file)
            self.logger.info('Tetranucleotide signatures written to: %s' %
                             tetra_file)

        # write out scaffold statistics
        stats_output = os.path.join(options.output_dir, 'scaffold_stats.tsv')
        stats = ScaffoldStats(options.cpus)
        stats.run(options.scaffold_file, genome_files, tetra_file,
                  coverage_file, stats_output)

        self.logger.info('Scaffold statistic written to: %s' % stats_output)
예제 #5
0
파일: main.py 프로젝트: AlexRBaker/RefineM
    def scaffold_stats(self, options):
        """Scaffold statistics command.

        Validates the scaffold and genome files, obtains a coverage profile
        (user supplied, computed from BAM files, or none), obtains
        tetranucleotide signatures (user supplied or computed), and runs
        ``ScaffoldStats.run`` with ``<output_dir>/scaffold_stats.tsv`` as
        the output path.

        Exits the process with a non-zero status when the scaffold file or
        any genome file fails the nucleotide-sequence check.
        """
        self.logger.info('')
        self.logger.info('*******************************************************************************')
        self.logger.info(' [RefineM - scaffold_stats] Calculating statistics for scaffolds.')
        self.logger.info('*******************************************************************************')

        check_file_exists(options.scaffold_file)

        if not self._check_nuclotide_seqs([options.scaffold_file]):
            self.logger.warning('[Warning] Scaffold file must contain nucleotide sequences.')
            # non-zero status so calling scripts can detect the failure
            sys.exit(1)

        genome_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
        if not self._check_nuclotide_seqs(genome_files):
            self.logger.warning('[Warning] All files must contain nucleotide sequences.')
            sys.exit(1)

        make_sure_path_exists(options.output_dir)

        # get coverage information
        if options.coverage_file:
            # validate the user-supplied profile before it is used
            check_file_exists(options.coverage_file)
            coverage_file = options.coverage_file
        elif options.bam_files:
            coverage = Coverage(options.cpus)
            coverage_file = os.path.join(options.output_dir, 'coverage.tsv')
            coverage.run(options.bam_files,
                         coverage_file,
                         options.cov_all_reads,
                         options.cov_min_align,
                         options.cov_max_edit_dist)
            self.logger.info('')
            self.logger.info('  Coverage profiles written to: %s' % coverage_file)
        else:
            # coverage is optional: continue without a coverage profile
            self.logger.warning('\n  [Warning] One or more BAM files must be specified in order to calculate coverage profiles.')
            coverage_file = None

        # get tetranucleotide signatures
        if options.tetra_file:
            check_file_exists(options.tetra_file)
            tetra_file = options.tetra_file
        else:
            self.logger.info('')
            tetra = Tetranucleotide(options.cpus)
            tetra_file = os.path.join(options.output_dir, 'tetra.tsv')
            signatures = tetra.run(options.scaffold_file)
            tetra.write(signatures, tetra_file)
            self.logger.info('  Tetranucleotide signatures written to: %s' % tetra_file)

        # write out scaffold statistics
        stats_output = os.path.join(options.output_dir, 'scaffold_stats.tsv')
        stats = ScaffoldStats(options.cpus)
        stats.run(options.scaffold_file, genome_files, tetra_file, coverage_file, stats_output)

        self.logger.info('  Scaffold statistic written to: %s' % stats_output)

        self.time_keeper.print_time_stamp()
예제 #6
0
파일: main.py 프로젝트: gsc0107/RefineM
    def genome_stats(self, options):
        """Genomes statistics command.

        Reads ``options.scaffold_stats_file``, runs ``GenomeStats`` on the
        loaded scaffold statistics and writes the result to
        ``options.output_file``.
        """

        stats_path = options.scaffold_stats_file
        check_file_exists(stats_path)

        self.logger.info('Reading scaffold statistics.')
        per_scaffold = ScaffoldStats(options.cpus)
        per_scaffold.read(stats_path)

        per_genome = GenomeStats()
        per_genome.run(per_scaffold)
        per_genome.write(options.output_file)

        message = 'Genome statistic written to: %s' % options.output_file
        self.logger.info(message)
예제 #7
0
파일: main.py 프로젝트: gsc0107/RefineM
    def split(self, options):
        """Split command.

        Reads the scaffold statistics and calls ``Cluster.split`` with the
        two criteria, the genome file and the output directory.
        """

        check_file_exists(options.scaffold_stats_file)
        check_file_exists(options.genome_file)
        make_sure_path_exists(options.output_dir)

        self.logger.info('Reading scaffold statistics.')
        stats = ScaffoldStats()
        stats.read(options.scaffold_stats_file)

        # a single CPU is requested for splitting
        splitter = Cluster(1)
        splitter.split(
            stats,
            options.criteria1,
            options.criteria2,
            options.genome_file,
            options.output_dir)

        done_msg = 'Partitioned sequences written to: ' + options.output_dir
        self.logger.info(done_msg)
예제 #8
0
파일: main.py 프로젝트: gsc0107/RefineM
    def dbscan(self, options):
        """dbscan command.

        Reads the scaffold statistics and calls ``Cluster.dbscan`` with the
        clustering options, the genome file and the output directory.
        """

        check_file_exists(options.scaffold_stats_file)
        check_file_exists(options.genome_file)
        make_sure_path_exists(options.output_dir)

        self.logger.info('Reading scaffold statistics.')
        stats = ScaffoldStats()
        stats.read(options.scaffold_stats_file)

        clusterer = Cluster(options.cpus)
        clusterer.dbscan(
            stats,
            options.num_clusters,
            options.num_components,
            options.min_pts,
            options.dist_frac,
            options.no_coverage,
            options.no_pca,
            options.genome_file,
            options.output_dir)

        done_msg = 'Partitioned sequences written to: ' + options.output_dir
        self.logger.info(done_msg)
예제 #9
0
파일: main.py 프로젝트: AlexRBaker/RefineM
    def genome_stats(self, options):
        """Genomes statistics command.

        Logs a banner, reads ``options.scaffold_stats_file``, runs
        ``GenomeStats`` on the loaded scaffold statistics and writes the
        result to ``options.output_file``.
        """
        for banner_line in (
                '',
                '*******************************************************************************',
                ' [RefineM - genome_stats] Calculating statistics for genomes.',
                '*******************************************************************************'):
            self.logger.info(banner_line)

        stats_path = options.scaffold_stats_file
        check_file_exists(stats_path)

        self.logger.info('')
        self.logger.info('  Reading scaffold statistics.')
        per_scaffold = ScaffoldStats(options.cpus)
        per_scaffold.read(stats_path)

        per_genome = GenomeStats()
        per_genome.run(per_scaffold)
        per_genome.write(options.output_file)

        message = '  Genome statistic written to: %s' % options.output_file
        self.logger.info(message)

        self.time_keeper.print_time_stamp()
예제 #10
0
파일: main.py 프로젝트: AlexRBaker/RefineM
    def compatible(self, options):
        """Compatible command.

        Logs a banner, loads the scaffold statistics, derives genome
        statistics, runs the reference homology check and passes the results
        to ``Outliers.compatible`` together with the output path
        ``<output_dir>/compatible.tsv``.
        """
        for banner_line in (
                '',
                '*******************************************************************************',
                '[RefineM - compatible] Identify scaffolds with compatible genomic statistics.',
                '*******************************************************************************'):
            self.logger.info(banner_line)

        check_file_exists(options.reference_file)
        check_file_exists(options.scaffold_stats_file)
        make_sure_path_exists(options.output_dir)

        # per-scaffold statistics first, then the per-genome summary
        self.logger.info('')
        self.logger.info('  Reading scaffold statistics.')
        stats_of_scaffolds = ScaffoldStats()
        stats_of_scaffolds.read(options.scaffold_stats_file)
        stats_of_genomes = GenomeStats().run(stats_of_scaffolds)

        # putative homologs to the reference genomes
        homology_search = Reference(1, None)
        homologs = homology_search.homology_check(
            options.reference_file,
            options.min_genes,
            float(options.perc_genes))

        # scaffolds compatible with bins
        detector = Outliers()
        results_path = os.path.join(options.output_dir, 'compatible.tsv')
        detector.compatible(
            homologs,
            stats_of_scaffolds,
            stats_of_genomes,
            options.gc_perc,
            options.td_perc,
            options.cov_corr,
            options.cov_perc,
            options.report_type,
            results_path)

        self.logger.info('')
        self.logger.info('  Results written to: ' + results_path)

        self.time_keeper.print_time_stamp()
예제 #11
0
    def run(self, scaffold_gene_file, stat_file, ref_genome_gene_files, db_file, evalue, per_identity, per_aln_len):
        """Create taxonomic profiles for a set of genomes.

        Parameters
        ----------
        scaffold_gene_file : str
            Fasta file of genes on scaffolds in amino acid space.
        stat_file : str
            File with statistics for individual scaffolds.
        ref_genome_gene_files : list of str
            Fasta files of called genes on reference genomes of interest.
        db_file : str
            Database of competing reference genes.
        evalue : float
            E-value threshold of valid hits.
        per_identity : float
            Percent identity threshold of valid hits [0,100].
        per_aln_len : float
            Percent query coverage of valid hits [0, 100].

        Returns
        -------
        str
            Path of the summary table, ``references.tsv`` in the output
            directory.
        """

        # read statistics file
        self.logger.info('Reading scaffold statistics.')
        scaffold_stats = ScaffoldStats()
        scaffold_stats.read(stat_file)

        # perform homology searches
        self.logger.info('Creating DIAMOND database for reference genomes.')
        ref_gene_file = os.path.join(self.output_dir, 'ref_genes.faa')
        concatenate_gene_files(ref_genome_gene_files, ref_gene_file)

        diamond = Diamond(self.cpus)
        ref_diamond_db = os.path.join(self.output_dir, 'ref_genes')
        diamond.create_db(ref_gene_file, ref_diamond_db)

        self.logger.info('Identifying homologs within reference genomes of interest (be patient!).')
        self.diamond_dir = os.path.join(self.output_dir, 'diamond')
        make_sure_path_exists(self.diamond_dir)
        hits_ref_genomes = os.path.join(self.diamond_dir, 'ref_hits.tsv')
        diamond.blastp(scaffold_gene_file, ref_diamond_db, evalue, per_identity, per_aln_len, 1, False, hits_ref_genomes)

        self.logger.info('Identifying homologs within competing reference genomes (be patient!).')
        hits_comp_ref_genomes = os.path.join(self.diamond_dir, 'competing_ref_hits.tsv')
        diamond.blastp(scaffold_gene_file, db_file, evalue, per_identity, per_aln_len, 1, False, hits_comp_ref_genomes)

        # get list of genes with a top hit to the reference genomes of interest
        hits_to_ref = self._top_hits_to_reference(hits_ref_genomes, hits_comp_ref_genomes)

        # get number of genes on each scaffold; the scaffold id is the gene
        # id with its final '_'-separated suffix removed
        num_genes_on_scaffold = defaultdict(int)
        for seq_id, _seq in seq_io.read_seq(scaffold_gene_file):
            scaffold_id = seq_id[0:seq_id.rfind('_')]
            num_genes_on_scaffold[scaffold_id] += 1

        # get hits to each scaffold; the query id is cut at its last '~'
        # before the scaffold id is derived
        hits_to_scaffold = defaultdict(list)
        for query_id, hit in hits_to_ref.items():
            gene_id = query_id[0:query_id.rfind('~')]
            scaffold_id = gene_id[0:gene_id.rfind('_')]
            hits_to_scaffold[scaffold_id].append(hit)

        # report summary stats for each scaffold; the context manager closes
        # the table even if formatting a row raises
        reference_out = os.path.join(self.output_dir, 'references.tsv')
        with open(reference_out, 'w') as fout:
            fout.write('Scaffold ID\tSubject genome IDs\tSubject scaffold IDs')
            fout.write('\tGenome ID\tLength (bp)\tGC\tMean coverage')
            fout.write('\t# genes\t# hits\t% genes\tAvg. align. length (bp)\tAvg. % identity\tAvg. e-value\tAvg. bitscore\n')

            for scaffold_id, hits in hits_to_scaffold.items():
                # named so they do not shadow the `evalue` threshold parameter
                aln_lens = []
                perc_idens = []
                evalues = []
                bitscores = []
                subject_scaffold_ids = defaultdict(int)
                subject_bin_ids = defaultdict(int)
                for hit in hits:
                    aln_lens.append(hit.aln_length)
                    perc_idens.append(hit.perc_identity)
                    evalues.append(hit.evalue)
                    bitscores.append(hit.bitscore)

                    # subject ids have the form '<bin id>~<gene id>'
                    subject_bin_id, subject_gene_id = hit.subject_id.split('~')
                    subject_scaffold_id = subject_gene_id[0:subject_gene_id.rfind('_')]
                    subject_scaffold_ids[subject_scaffold_id] += 1
                    subject_bin_ids[subject_bin_id] += 1

                # '<id>:<hit count>' entries, most frequently hit first
                sorted_subject_bin_ids = sorted(subject_bin_ids.items(),
                                                key=operator.itemgetter(1),
                                                reverse=True)
                subject_bin_id_str = ','.join(bin_id + ':' + str(num_hits)
                                              for bin_id, num_hits in sorted_subject_bin_ids)

                sorted_subject_scaffold_ids = sorted(subject_scaffold_ids.items(),
                                                     key=operator.itemgetter(1),
                                                     reverse=True)
                subject_scaffold_id_str = ','.join(subject_id + ':' + str(num_hits)
                                                   for subject_id, num_hits in sorted_subject_scaffold_ids)

                num_genes = num_genes_on_scaffold[scaffold_id]
                fout.write('%s\t%s\t%s\t%s\t%.2f\t%d\t%d\t%.2f\t%d\t%.2f\t%.2g\t%.2f\n' % (
                    scaffold_id,
                    subject_bin_id_str,
                    subject_scaffold_id_str,
                    scaffold_stats.print_stats(scaffold_id),
                    mean(scaffold_stats.coverage(scaffold_id)),
                    num_genes,
                    len(hits),
                    len(hits) * 100.0 / num_genes,
                    mean(aln_lens),
                    mean(perc_idens),
                    mean(evalues),
                    mean(bitscores)))

        return reference_out
예제 #12
0
파일: main.py 프로젝트: AlexRBaker/RefineM
    def outliers(self, options):
        """Outlier command.

        Reads the scaffold statistics, derives genome statistics, asks
        ``Outliers.identify`` to report into ``<output_dir>/outliers.tsv``,
        then creates plots for every genome under ``<output_dir>/plots`` and
        an HTML index of them.

        ``options.highlight_file`` (optional) is tab separated: a scaffold id
        and, optionally, a comma-separated colour whose components are
        divided by 255; scaffolds without a colour get ``[1.0, 0, 0]``.
        ``options.links_file`` (optional) is tab separated; columns 0 and 2
        are kept as strings and every other column is parsed with
        ``ast.literal_eval``.
        """
        self.logger.info('')
        self.logger.info('*******************************************************************************')
        self.logger.info(' [RefineM - outliers] Identifying scaffolds with divergent characteristics.')
        self.logger.info('*******************************************************************************')

        check_file_exists(options.scaffold_stats_file)
        make_sure_path_exists(options.output_dir)

        self.logger.info('')
        self.logger.info('  Reading scaffold statistics.')
        scaffold_stats = ScaffoldStats()
        scaffold_stats.read(options.scaffold_stats_file)

        genome_stats = GenomeStats()
        genome_stats = genome_stats.run(scaffold_stats)

        # identify outliers
        outliers = Outliers()
        outlier_file = os.path.join(options.output_dir, 'outliers.tsv')
        outliers.identify(scaffold_stats, genome_stats,
                          options.gc_perc, options.td_perc,
                          options.cov_corr, options.cov_perc,
                          options.report_type, outlier_file)
        self.logger.info('  Outlier information written to: ' + outlier_file)

        # create outlier plots
        self.logger.info('')

        highlight_scaffolds_ids = {}
        if options.highlight_file:
            # context manager so the handle is closed deterministically,
            # matching how the links file is read below
            with open(options.highlight_file) as highlight_file:
                for line in highlight_file:
                    line_split = line.strip().split('\t')
                    if len(line_split) > 1:
                        highlight_scaffolds_ids[line_split[0]] = [float(x.strip()) / 255.0 for x in line_split[1].split(',')]
                    else:
                        highlight_scaffolds_ids[line_split[0]] = [1.0, 0, 0]

        link_scaffold_ids = []
        if options.links_file:
            with open(options.links_file) as links_file:
                for line in links_file:
                    fields = line.strip().split('\t')
                    # columns 0 and 2 stay strings; the rest are Python literals
                    link_scaffold_ids.append([item if i in (0, 2) else ast.literal_eval(item)
                                              for i, item in enumerate(fields)])

        # create plots
        plot_dir = os.path.join(options.output_dir, 'plots')
        make_sure_path_exists(plot_dir)
        genome_plots = defaultdict(list)
        num_genomes = len(genome_stats)
        for genomes_processed, (genome_id, gs) in enumerate(genome_stats.iteritems(), 1):
            sys.stdout.write('  Plotting scaffold distribution for %d of %d (%.1f%%) genomes.\r' %
                             (genomes_processed,
                              num_genomes,
                              genomes_processed * 100.0 / num_genomes))
            sys.stdout.flush()

            genome_scaffold_stats = {}
            for scaffold_id in scaffold_stats.scaffolds_in_genome[genome_id]:
                genome_scaffold_stats[scaffold_id] = scaffold_stats.stats[scaffold_id]

            # NOTE: every '#~' section below is a plot type that is currently
            # disabled; only the TD and tetranucleotide PCA plots are produced.
            if options.individual_plots:
                #~ # GC plot
                #~ gc_plots = GcPlots(options)
                #~ gc_plots.plot(genome_scaffold_stats, highlight_scaffolds_ids, link_scaffold_ids, gs.mean_gc, outliers.gc_dist, [options.gc_perc])
                #~
                #~ output_plot = os.path.join(plot_dir, genome_id + '.gc_plots.' + options.image_type)
                #~ gc_plots.save_plot(output_plot, dpi=options.dpi)
                #~ gc_plots.save_html(os.path.join(plot_dir, genome_id + '.gc_plots.html'))

                # TD plot
                td_plots = TdPlots(options)
                td_plots.plot(genome_scaffold_stats, highlight_scaffolds_ids, link_scaffold_ids, gs.mean_signature, outliers.td_dist, [options.td_perc])

                output_plot = os.path.join(plot_dir, genome_id + '.td_plots.' + options.image_type)
                td_plots.save_plot(output_plot, dpi=options.dpi)
                td_plots.save_html(os.path.join(plot_dir, genome_id + '.td_plots.html'))

                #~ # mean absolute deviation of coverage profiles
                #~ cov_perc_plots = CovPercPlots(options)
                #~ cov_perc_plots.plot(genome_scaffold_stats, highlight_scaffolds_ids, link_scaffold_ids, gs.mean_coverage, [options.cov_perc])
                #~
                #~ output_plot = os.path.join(plot_dir, genome_id + '.cov_perc.' + options.image_type)
                #~ cov_perc_plots.save_plot(output_plot, dpi=options.dpi)
                #~ cov_perc_plots.save_html(os.path.join(plot_dir, genome_id + '.cov_perc.html'))
                #~
                #~ # coverage correlation plots
                #~ if len(gs.mean_coverage) > 1:
                    #~ cov_corr_plots = CovCorrPlots(options)
                    #~ cov_corr_plots.plot(genome_scaffold_stats, highlight_scaffolds_ids, gs.mean_coverage, [options.cov_corr])
                    #~
                    #~ output_plot = os.path.join(plot_dir, genome_id + '.cov_corr.' + options.image_type)
                    #~ cov_corr_plots.save_plot(output_plot, dpi=options.dpi)
                    #~ cov_corr_plots.save_html(os.path.join(plot_dir, genome_id + '.cov_corr.html'))

            #~ # combined distribution, GC vs. coverage, and tetranucleotide signature plots
            #~ combined_plots = CombinedPlots(options)
            #~ combined_plots.plot(genome_scaffold_stats,
                            #~ highlight_scaffolds_ids, link_scaffold_ids, gs,
                            #~ outliers.gc_dist, outliers.td_dist,
                            #~ options.gc_perc, options.td_perc, options.cov_perc)
            #~
            #~ output_plot = os.path.join(plot_dir, genome_id + '.combined.' + options.image_type)
            #~ combined_plots.save_plot(output_plot, dpi=options.dpi)
            #~ combined_plots.save_html(os.path.join(plot_dir, genome_id + '.combined.html'))
            #~
            #~ genome_plots[genome_id].append(('Combined', genome_id + '.combined.html'))
            #~
            #~ # combined plot of distributions
            #~ dist_plots = DistributionPlots(options)
            #~ dist_plots.plot(genome_scaffold_stats,
                            #~ highlight_scaffolds_ids,
                            #~ link_scaffold_ids,
                            #~ gs,
                            #~ outliers.gc_dist, outliers.td_dist,
                            #~ options.gc_perc, options.td_perc, options.cov_perc)
            #~
            #~ output_plot = os.path.join(plot_dir, genome_id + '.dist_plot.' + options.image_type)
            #~ dist_plots.save_plot(output_plot, dpi=options.dpi)
            #~ dist_plots.save_html(os.path.join(plot_dir, genome_id + '.dist_plot.html'))
            #~
            #~ genome_plots[genome_id].append(('Distributions', genome_id + '.dist_plot.html'))
            #~
            #~ # GC vs. coverage plot
            #~ gc_cov_plot = GcCovPlot(options)
            #~ gc_cov_plot.plot(genome_scaffold_stats,
                             #~ highlight_scaffolds_ids, link_scaffold_ids,
                             #~ gs.mean_gc, gs.mean_coverage)
            #~
            #~ output_plot = os.path.join(plot_dir, genome_id + '.gc_coverge.' + options.image_type)
            #~ gc_cov_plot.save_plot(output_plot, dpi=options.dpi)
            #~ gc_cov_plot.save_html(os.path.join(plot_dir, genome_id + '.gc_coverge.html'))
            #~
            #~ genome_plots[genome_id].append(('GC vs. coverage', genome_id + '.gc_coverge.html'))

            # tetranucleotide signature PCA plot
            tetra = TetraPcaPlot(options)
            tetra.plot(genome_scaffold_stats, highlight_scaffolds_ids, link_scaffold_ids)

            output_plot = os.path.join(plot_dir, genome_id + '.tetra_pca.' + options.image_type)
            tetra.save_plot(output_plot, dpi=options.dpi)
            tetra.save_html(os.path.join(plot_dir, genome_id + '.tetra_pca.html'))

            genome_plots[genome_id].append(('Tetra PCA', genome_id + '.tetra_pca.html'))

        # terminate the '\r'-refreshed progress line
        sys.stdout.write('\n')

        outliers.create_html_index(plot_dir, genome_plots)

        self.logger.info('  Outlier plots written to: ' + plot_dir)

        self.time_keeper.print_time_stamp()
예제 #13
0
    def run(self, scaffold_gene_file, stat_file, ref_genome_gene_files, db_file, evalue, per_identity):
        """Create taxonomic profiles for a set of genomes.

        Parameters
        ----------
        scaffold_gene_file : str
            Fasta file of genes on scaffolds in amino acid space.
        stat_file : str
            File with statistics for individual scaffolds.
        ref_genome_gene_files : list of str
            Fasta files of called genes on reference genomes of interest.
        db_file : str
            Database of competing reference genes.
        evalue : float
            E-value threshold used by blast.
        per_identity: float
            Percent identity threshold used by blast.

        Returns
        -------
        str
            Path of the summary table, ``references.tsv`` in the output
            directory.
        """

        # read statistics file
        self.logger.info('')
        self.logger.info('  Reading scaffold statistics.')
        scaffold_stats = ScaffoldStats()
        scaffold_stats.read(stat_file)

        # perform homology searches
        self.logger.info('')
        self.logger.info('  Creating diamond database for reference genomes.')
        ref_gene_file = os.path.join(self.output_dir, 'ref_genes.faa')
        concatenate_gene_files(ref_genome_gene_files, ref_gene_file)

        diamond = Diamond(self.cpus)
        ref_diamond_db = os.path.join(self.output_dir, 'ref_genes')
        diamond.make_database(ref_gene_file, ref_diamond_db)

        self.logger.info('  Identifying homologs within reference genomes of interest (be patient!).')
        self.diamond_dir = os.path.join(self.output_dir, 'diamond')
        make_sure_path_exists(self.diamond_dir)
        hits_ref_genomes_daa = os.path.join(self.diamond_dir, 'ref_hits')
        diamond.blastp(scaffold_gene_file, ref_diamond_db, evalue, per_identity, 1, hits_ref_genomes_daa)

        # blastp is given a path without extension; the '.daa' result is then
        # converted to a tab-separated table
        hits_ref_genomes = os.path.join(self.diamond_dir, 'ref_hits.tsv')
        diamond.view(hits_ref_genomes_daa + '.daa', hits_ref_genomes)

        self.logger.info('  Identifying homologs within competing reference genomes (be patient!).')
        hits_comp_ref_genomes_daa = os.path.join(self.diamond_dir, 'competing_ref_hits')
        diamond.blastp(scaffold_gene_file, db_file, evalue, per_identity, 1, hits_comp_ref_genomes_daa)

        hits_comp_ref_genomes = os.path.join(self.diamond_dir, 'competing_ref_hits.tsv')
        diamond.view(hits_comp_ref_genomes_daa + '.daa', hits_comp_ref_genomes)

        # get list of genes with a top hit to the reference genomes of interest
        hits_to_ref = self._top_hits_to_reference(hits_ref_genomes, hits_comp_ref_genomes)

        # get number of genes on each scaffold; the scaffold id is the gene
        # id with its final '_'-separated suffix removed
        num_genes_on_scaffold = defaultdict(int)
        for seq_id, _seq in seq_io.read_seq(scaffold_gene_file):
            scaffold_id = seq_id[0:seq_id.rfind('_')]
            num_genes_on_scaffold[scaffold_id] += 1

        # get hits to each scaffold; the query id is cut at its last '~'
        # before the scaffold id is derived
        hits_to_scaffold = defaultdict(list)
        for query_id, hit in hits_to_ref.iteritems():
            gene_id = query_id[0:query_id.rfind('~')]
            scaffold_id = gene_id[0:gene_id.rfind('_')]
            hits_to_scaffold[scaffold_id].append(hit)

        # report summary stats for each scaffold; the context manager closes
        # the table even if formatting a row raises
        reference_out = os.path.join(self.output_dir, 'references.tsv')
        with open(reference_out, 'w') as fout:
            fout.write('Scaffold id\tSubject scaffold ids\tSubject genome ids')
            fout.write('\tGenome id\tLength (bp)\tGC\tMean coverage')
            fout.write('\t# genes\t# hits\t% genes\tAvg. align. length (bp)\tAvg. % identity\tAvg. e-value\tAvg. bitscore\n')

            for scaffold_id, hits in hits_to_scaffold.iteritems():
                # named so they do not shadow the `evalue` threshold parameter
                aln_lens = []
                perc_idens = []
                evalues = []
                bitscores = []
                subject_scaffold_ids = defaultdict(int)
                subject_bin_ids = defaultdict(int)
                for hit in hits:
                    aln_lens.append(hit.aln_length)
                    perc_idens.append(hit.perc_identity)
                    evalues.append(hit.evalue)
                    bitscores.append(hit.bitscore)

                    # subject ids have the form '<gene id>~<bin id>'
                    subject_gene_id, subject_bin_id = hit.subject_id.split('~')
                    subject_scaffold_id = subject_gene_id[0:subject_gene_id.rfind('_')]
                    subject_scaffold_ids[subject_scaffold_id] += 1
                    subject_bin_ids[subject_bin_id] += 1

                # '<id>:<hit count>' entries in dictionary iteration order
                subject_scaffold_id_str = ','.join(sid + ':' + str(num_hits)
                                                   for sid, num_hits in subject_scaffold_ids.iteritems())
                subject_bin_id_str = ','.join(bin_id + ':' + str(num_hits)
                                              for bin_id, num_hits in subject_bin_ids.iteritems())

                num_genes = num_genes_on_scaffold[scaffold_id]
                fout.write('%s\t%s\t%s\t%s\t%.2f\t%d\t%d\t%.2f\t%d\t%.2f\t%.2g\t%.2f\n' % (
                    scaffold_id,
                    subject_scaffold_id_str,
                    subject_bin_id_str,
                    scaffold_stats.print_stats(scaffold_id),
                    mean(scaffold_stats.coverage(scaffold_id)),
                    num_genes,
                    len(hits),
                    len(hits) * 100.0 / num_genes,
                    mean(aln_lens),
                    mean(perc_idens),
                    mean(evalues),
                    mean(bitscores)))

        return reference_out