Exemplo n.º 1
0
    def call_genes(self, options):
        """Call genes in each genome and, optionally, in unbinned scaffolds.

        Reads genome_nt_dir, genome_ext, output_dir, cpus and unbinned_file
        from `options`. The process exits (SystemExit) when the nucleotide
        check on the genome files fails.
        """
        banner = '*******************************************************************************'
        for message in ('',
                        banner,
                        ' [RefineM - call_genes] Identifying genes within genomes.',
                        banner):
            self.logger.info(message)

        check_dir_exists(options.genome_nt_dir)
        make_sure_path_exists(options.output_dir)

        nt_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
        seqs_are_nucleotide = self._check_nuclotide_seqs(nt_files)
        if not seqs_are_nucleotide:
            self.logger.warning('[Warning] All files must contain nucleotide sequences.')
            sys.exit()

        # gene calling for the genomes themselves
        gene_caller = Prodigal(options.cpus)
        gene_caller.run(nt_files, options.output_dir)
        self.logger.info('  Genes in genomes written to: %s' % options.output_dir)

        # gene calling for the unbinned scaffolds, only when a file was given;
        # results go to an 'unbinned' sub-directory and the run uses meta=True
        if options.unbinned_file:
            unbinned_dir = os.path.join(options.output_dir, 'unbinned')
            gene_caller.run([options.unbinned_file], unbinned_dir, meta=True)
            self.logger.info('  Genes in unbinned scaffolds written to: %s' % unbinned_dir)

        self.time_keeper.print_time_stamp()
Exemplo n.º 2
0
    def _genome_files(self, genome_dir, genome_ext):
        """Collect the genome files found in a directory.

        Parameters
        ----------
        genome_dir : str
            Directory to scan for genome files.
        genome_ext : str
            Suffix a file name must end with to count as a genome.

        Returns
        -------
        list
            Paths (directory joined with file name) of the matching files.

        Raises
        ------
        SystemExit
            If no file in the directory ends with `genome_ext`.
        """

        check_dir_exists(genome_dir)

        matching_paths = [os.path.join(genome_dir, name)
                          for name in os.listdir(genome_dir)
                          if name.endswith(genome_ext)]

        if len(matching_paths) == 0:
            self.logger.warning('  [Warning] No genomes found. Check the --genome_ext flag used to identify genomes.')
            sys.exit()

        return matching_paths
Exemplo n.º 3
0
    def lgt_codon(self, options):
        """Run the lgt_codon command: calculate codon usage of genes.

        Gene files are the entries of options.gene_dir whose names end with
        options.gene_ext. When none are found a warning is logged and the
        method returns without running the calculation.
        """
        banner = '*******************************************************************************'
        for message in ('',
                        banner,
                        ' [CompareM - lgt_codon] Calculating codon usage of genes.',
                        banner,
                        ''):
            self.logger.info(message)

        check_dir_exists(options.gene_dir)

        # files holding called genes
        called_gene_files = [os.path.join(options.gene_dir, name)
                             for name in os.listdir(options.gene_dir)
                             if name.endswith(options.gene_ext)]

        # nothing to process: warn the user and stop
        if not called_gene_files:
            self.logger.warning('  [Warning] No gene files found. Check the --gene_ext flag used to identify gene files.')
            return

        calculator = LgtCodon(options.cpus)
        calculator.run(called_gene_files, options.output_dir)

        self.logger.info('')
        self.logger.info('  Codon usage written to directory: %s' % options.output_dir)

        self.time_keeper.print_time_stamp()
Exemplo n.º 4
0
    def codon_usage(self, options):
        """Run the codon_usage command: codon usage within each genome.

        Gene files are the entries of options.gene_dir whose names end with
        options.gene_ext. The resulting usage profile is written to
        options.output_file via self._write_usage_profile. When no gene files
        are found a warning is logged and the method returns early.
        """
        banner = '*******************************************************************************'
        for message in ('',
                        banner,
                        ' [CompareM - codon_usage] Calculating codon usage within each genome.',
                        banner,
                        ''):
            self.logger.info(message)

        check_dir_exists(options.gene_dir)

        # files holding called genes
        called_gene_files = [os.path.join(options.gene_dir, name)
                             for name in os.listdir(options.gene_dir)
                             if name.endswith(options.gene_ext)]

        # nothing to process: warn the user and stop
        if not called_gene_files:
            self.logger.warning('  [Warning] No gene files found. Check the --gene_ext flag used to identify gene files.')
            return

        # compute codon usage; the third returned value is not needed here
        calculator = CodonUsage(options.cpus, options.keep_ambiguous)
        usage_by_genome, codons, _unused_mean_length = calculator.run(called_gene_files)

        # persist the usage profile
        self._write_usage_profile(usage_by_genome, codons, options.output_file)

        self.logger.info('')
        self.logger.info('  Codon usage written to: %s' % options.output_file)

        self.time_keeper.print_time_stamp()
Exemplo n.º 5
0
    def _genome_files(self, genome_dir, genome_ext):
        """Collect the genome files found in a directory.

        Parameters
        ----------
        genome_dir : str
            Directory to scan for genome files.
        genome_ext : str
            Suffix a file name must end with to count as a genome.

        Returns
        -------
        list
            Paths (directory joined with file name) of the matching files.

        Raises
        ------
        SystemExit
            If no file in the directory ends with `genome_ext`.
        """

        check_dir_exists(genome_dir)

        matching_paths = [os.path.join(genome_dir, name)
                          for name in os.listdir(genome_dir)
                          if name.endswith(genome_ext)]

        if len(matching_paths) == 0:
            self.logger.warning(
                'No genomes found. Check the --genome_ext or --protein_ext flag used to identify genomes.'
            )
            sys.exit()

        return matching_paths
Exemplo n.º 6
0
    def bin_compare(self, options):
        """Run the bin_compare command: compare two sets of genomes.

        Both genome directories must exist and every genome file must pass
        the nucleotide check, otherwise the process exits (SystemExit). The
        comparison is written to options.output_file.
        """
        banner = '*******************************************************************************'
        for message in ('',
                        banner,
                        '[RefineM - bin_compare] Comparing two sets of genomes.',
                        banner):
            self.logger.info(message)

        check_dir_exists(options.genome_nt_dir1)
        check_dir_exists(options.genome_nt_dir2)

        # first genome set
        first_set = self._genome_files(options.genome_nt_dir1, options.genome_ext1)
        first_ok = self._check_nuclotide_seqs(first_set)
        if not first_ok:
            self.logger.warning('[Warning] All files must contain nucleotide sequences.')
            sys.exit()

        # second genome set
        second_set = self._genome_files(options.genome_nt_dir2, options.genome_ext2)
        second_ok = self._check_nuclotide_seqs(second_set)
        if not second_ok:
            self.logger.warning('[Warning] All files must contain nucleotide sequences.')
            sys.exit()

        comparer = BinComparer()
        comparer.run(first_set, second_set, options.scaffold_file, options.output_file)

        self.logger.info('')
        self.logger.info('  Detailed bin comparison written to: ' + options.output_file)

        self.time_keeper.print_time_stamp()
Exemplo n.º 7
0
    def call_genes(self, options):
        """Call genes in each genome and, optionally, in unbinned scaffolds.

        Reads genome_nt_dir, genome_ext, output_dir, cpus and unbinned_file
        from `options`. The process exits (SystemExit) when the nucleotide
        check on the genome files fails.
        """

        check_dir_exists(options.genome_nt_dir)
        make_sure_path_exists(options.output_dir)

        nt_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
        seqs_are_nucleotide = self._check_nuclotide_seqs(nt_files)
        if not seqs_are_nucleotide:
            self.logger.warning('All files must contain nucleotide sequences.')
            sys.exit()

        # gene calling for the genomes themselves
        gene_caller = Prodigal(options.cpus)
        gene_caller.run(nt_files, options.output_dir)
        self.logger.info('Genes in genomes written to: %s' % options.output_dir)

        # gene calling for the unbinned scaffolds, only when a file was given;
        # results go to an 'unbinned' sub-directory and the run uses meta=True
        if options.unbinned_file:
            unbinned_dir = os.path.join(options.output_dir, 'unbinned')
            gene_caller.run([options.unbinned_file], unbinned_dir, meta=True)
            self.logger.info('Genes in unbinned scaffolds written to: %s' % unbinned_dir)
Exemplo n.º 8
0
    def unanimous(self, options):
        """Run the 'unanimous' command through the Ensemble class.

        The bin directories come from self._bin_dirs(options); results are
        written to options.output_dir. The inline notes on the positional
        flags below are carried over from the previous implementation and
        should be checked against Ensemble.run's signature.
        """

        check_dir_exists(options.profile_dir)
        make_sure_path_exists(options.output_dir)

        binner_dirs = self._bin_dirs(options)

        ensemble = Ensemble(options.bin_prefix)
        ensemble.run(options.profile_dir,
                     binner_dirs,
                     options.weight,
                     options.sel_min_quality,
                     options.sel_min_comp,
                     options.sel_max_cont,
                     None,
                     None,
                     None,
                     False,  # greedy bin selection flag
                     True,  # unanimous bin selection flag
                     False,  # merging disabled
                     None,  # no coverage file supplied
                     options.report_min_quality,
                     options.simple_headers,
                     options.output_dir)

        self.logger.info("UniteM 'unanimous' results written to: %s" % options.output_dir)
Exemplo n.º 9
0
    def unique(self, options):
        """Run the 'unique' command on the bins in options.bin_dir.

        Bin files are looked up with BinTools.bin_files using
        options.extension and then handed to BinTools.unique.
        """

        check_dir_exists(options.bin_dir)

        bin_tools = BinTools()
        files_to_check = bin_tools.bin_files(options.bin_dir, options.extension)
        bin_tools.unique(files_to_check)
Exemplo n.º 10
0
    def stop_usage(self, options):
        """Run the stop_usage command: stop codon usage within each genome.

        Gene files are the entries of options.gene_dir whose names end with
        options.gene_ext. A tab-separated table is written to
        options.output_file: one header column per codon (followed by an
        'avg. seq. length' column when mean gene lengths are available) and
        one row per genome. When no gene files are found a warning is logged
        and the method returns without writing anything.
        """
        self.logger.info('')
        self.logger.info('*******************************************************************************')
        self.logger.info(' [CompareM - stop_usage] Calculating stop codon usage within each genome.')
        self.logger.info('*******************************************************************************')
        self.logger.info('')

        check_dir_exists(options.gene_dir)

        # get list of files with called genes
        gene_files = [os.path.join(options.gene_dir, f)
                      for f in os.listdir(options.gene_dir)
                      if f.endswith(options.gene_ext)]

        # warn user if no files were found
        if not gene_files:
            self.logger.warning('  [Warning] No gene files found. Check the --gene_ext flag used to identify gene files.')
            return

        # calculate stop codon usage
        codon_usage = CodonUsage(options.cpus, keep_ambiguous=False, stop_codon_only=True)
        genome_codon_usage, codon_set, mean_gene_length = codon_usage.run(gene_files)

        # write out results; the context manager guarantees the file is
        # flushed and closed even if a write fails part-way through
        with open(options.output_file, 'w') as fout:
            for codon in codon_set:
                fout.write('\t' + codon)
                if mean_gene_length:
                    fout.write('\t' + codon + ': avg. seq. length')
            fout.write('\n')

            # items() is available on both Python 2 and Python 3 mappings
            for genome_id, codons in genome_codon_usage.items():
                fout.write(genome_id)

                for codon in codon_set:
                    fout.write('\t%d' % codons.get(codon, 0))

                    if mean_gene_length:
                        mean_len = mean_gene_length[genome_id].get(codon, None)
                        # a missing (or zero) mean length is reported as 'na'
                        if mean_len:
                            fout.write('\t%.1f' % mean_len)
                        else:
                            fout.write('\tna')
                fout.write('\n')

        self.logger.info('')
        self.logger.info('  Stop codon usage written to: %s' % options.output_file)

        self.time_keeper.print_time_stamp()
Exemplo n.º 11
0
    def marker_files(self, options):
        """Generate marker gene files through WebsiteData.

        Verifies that the bac120 and ar122 gene directories and the user
        genome id table exist, makes sure the output directory exists, then
        delegates to WebsiteData.marker_files.
        """

        for gene_dir in (options.bac120_gene_dir, options.ar122_gene_dir):
            check_dir_exists(gene_dir)
        check_file_exists(options.user_gid_table)
        make_sure_path_exists(options.output_dir)

        website_data = WebsiteData(options.release_number, options.output_dir)
        website_data.marker_files(options.bac120_gene_dir,
                                  options.ar122_gene_dir,
                                  options.user_gid_table)

        self.logger.info('Done.')
Exemplo n.º 12
0
    def rogue_test(self, options):
        """Run the rogue taxa test through RogueTest.

        Verifies the input tree directory and taxonomy file, makes sure the
        output directory exists and, only when options.decorate is set,
        checks that the 'genometreetk' dependency is available.
        """

        check_dir_exists(options.input_tree_dir)
        check_file_exists(options.taxonomy_file)
        make_sure_path_exists(options.output_dir)

        # the external dependency is checked only when decoration is requested
        if options.decorate:
            check_dependencies(['genometreetk'])

        rogue_tester = RogueTest()
        rogue_tester.run(options.input_tree_dir,
                         options.taxonomy_file,
                         options.outgroup_taxon,
                         options.decorate,
                         options.output_dir)

        self.logger.info('Finished rogue taxa test.')
Exemplo n.º 13
0
    def compare(self, options):
        """Run the 'compare' command on two directories of bins.

        Bin files of each directory are looked up with BinTools.bin_files
        (using extension1 / extension2) and compared with BinTools.compare;
        results are written to options.output_file.
        """

        check_dir_exists(options.bin_dir1)
        check_dir_exists(options.bin_dir2)

        bin_tools = BinTools()
        first_bins = bin_tools.bin_files(options.bin_dir1, options.extension1)
        second_bins = bin_tools.bin_files(options.bin_dir2, options.extension2)

        bin_tools.compare(first_bins,
                          second_bins,
                          options.assembly_file,
                          options.output_file)

        self.logger.info("UniteM 'compare' results written to: %s" % options.output_file)
Exemplo n.º 14
0
    def classify(self, options):
        """Determine the taxonomic classification of genomes via Classify.

        The genomes to process are resolved by self._genomes_to_process from
        options.genome_dir, options.batchfile and options.extension.
        """

        check_dir_exists(options.align_dir)
        make_sure_path_exists(options.out_dir)

        genomes_to_classify = self._genomes_to_process(options.genome_dir,
                                                       options.batchfile,
                                                       options.extension)

        classifier = Classify(options.cpus)
        classifier.run(genomes_to_classify,
                       options.align_dir,
                       options.out_dir,
                       options.prefix,
                       options.debug)

        self.logger.info('Done.')
Exemplo n.º 15
0
    def align(self, options):
        """Create a multiple sequence alignment from marker genes.

        Delegates to Markers.align. If `options` has no `outgroup_taxon`
        attribute, the attribute is added with value None before the call.
        """

        check_dir_exists(options.identify_dir)
        make_sure_path_exists(options.out_dir)

        # default the outgroup when the attribute is absent from options
        if not hasattr(options, 'outgroup_taxon'):
            options.outgroup_taxon = None

        marker_runner = Markers(options.cpus)
        marker_runner.align(options.identify_dir,
                            options.taxa_filter,
                            options.min_perc_aa,
                            options.custom_msa_filters,
                            options.consensus,
                            options.min_perc_taxa,
                            options.out_dir,
                            options.prefix,
                            options.outgroup_taxon)

        self.logger.info('Done.')
Exemplo n.º 16
0
    def unbinned(self, options):
        """Run the unbinned command and write the result as FASTA.

        The sequences returned by Unbinned.run for the genome files,
        options.scaffold_file and options.min_seq_len are written to
        options.output_file. The process exits (SystemExit) when the
        nucleotide check on the genome files fails.
        """

        check_dir_exists(options.genome_nt_dir)

        nt_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
        seqs_are_nucleotide = self._check_nuclotide_seqs(nt_files)
        if not seqs_are_nucleotide:
            self.logger.warning('All files must contain nucleotide sequences.')
            sys.exit()

        finder = Unbinned()
        leftover_seqs = finder.run(nt_files,
                                   options.scaffold_file,
                                   options.min_seq_len)

        seq_io.write_fasta(leftover_seqs, options.output_file)

        self.logger.info('Unbinned scaffolds written to: ' + options.output_file)
Exemplo n.º 17
0
    def _bin_dirs(self, options):
        """Get directories with bins from different binners.

        Directories may be supplied directly through options.bin_dirs, in
        which case the method id is the last path component of the
        directory, and/or through options.bin_file, a tab-separated file
        with one 'method id<TAB>directory' pair per line. Blank lines in the
        file are ignored and lines without exactly two fields are skipped
        with a warning. Either attribute may be absent or empty.

        Parameters
        ----------
        options : object
            Command options; bin_dirs and bin_file are read if present.

        Returns
        -------
        dict
            d[method id] -> (directory, bin extension) for every directory
            in which self._bin_extension found bins. A later entry with the
            same method id replaces an earlier one.
        """

        bin_dirs = {}

        def register(method_id, d):
            """Record directory `d` under `method_id` if it contains bins."""
            bin_ext, count = self._bin_extension(d)
            if not bin_ext:
                self.logger.warning('No bins identified for %s in %s.' %
                                    (method_id, d))
                return

            bin_dirs[method_id] = (d, bin_ext)
            self.logger.info(
                "Processing %d genomes from %s with extension '%s'." %
                (count, method_id, bin_ext))

        if getattr(options, 'bin_dirs', None):
            for d in options.bin_dirs:
                check_dir_exists(d)
                register(os.path.basename(os.path.normpath(d)), d)

        if getattr(options, 'bin_file', None):
            check_file_exists(options.bin_file)
            # context manager so the file handle is always released
            with open(options.bin_file) as bin_file:
                for line in bin_file:
                    if not line.strip():
                        continue

                    # a real list is needed: on Python 3 map() returns an
                    # iterator, which supports neither len() nor indexing
                    line_split = [token.strip() for token in line.split('\t')]
                    if len(line_split) != 2:
                        self.logger.warning("Skipping invalid line: %s" %
                                            line.strip())
                        continue

                    method_id, d = line_split
                    check_dir_exists(d)
                    register(method_id, d)

        return bin_dirs
Exemplo n.º 18
0
    def identify(self, options):
        """Identify marker genes in genomes via Markers.identify.

        options.genome_dir and options.batchfile are each validated only
        when set; the genomes to process are then resolved by
        self._genomes_to_process.
        """

        genome_dir = options.genome_dir
        if genome_dir:
            check_dir_exists(genome_dir)

        batchfile = options.batchfile
        if batchfile:
            check_file_exists(batchfile)

        make_sure_path_exists(options.out_dir)

        genomes_to_scan = self._genomes_to_process(options.genome_dir,
                                                   options.batchfile,
                                                   options.extension)

        marker_runner = Markers(options.cpus)
        marker_runner.identify(genomes_to_scan, options.out_dir, options.prefix)

        self.logger.info('Done.')
Exemplo n.º 19
0
    def unbinned(self, options):
        """Run the unbinned command and write the result as FASTA.

        The sequences returned by Unbinned.run for the genome files,
        options.scaffold_file and options.min_seq_len are written to
        options.output_file. The process exits (SystemExit) when the
        nucleotide check on the genome files fails.
        """
        banner = '*******************************************************************************'
        for message in ('',
                        banner,
                        ' [RefineM - unbinned] Identify unbinned scaffolds.',
                        banner):
            self.logger.info(message)

        check_dir_exists(options.genome_nt_dir)

        nt_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
        seqs_are_nucleotide = self._check_nuclotide_seqs(nt_files)
        if not seqs_are_nucleotide:
            self.logger.warning('[Warning] All files must contain nucleotide sequences.')
            sys.exit()

        finder = Unbinned()
        leftover_seqs = finder.run(nt_files, options.scaffold_file, options.min_seq_len)

        seq_io.write_fasta(leftover_seqs, options.output_file)

        self.logger.info('')
        self.logger.info('  Unbinned scaffolds written to: ' + options.output_file)

        self.time_keeper.print_time_stamp()
Exemplo n.º 20
0
    def ssu_erroneous(self, options):
        """Run the erroneous SSU command.

        Requires the 'nhmmer' and 'blastn' dependencies. Uses the SSU class
        to identify, extract and classify SSU sequences in the genome files
        and then to report erroneous ones; output goes to
        options.output_dir. The process exits (SystemExit) when the
        nucleotide check on the genome files fails.
        """

        check_dependencies(('nhmmer', 'blastn'))

        check_dir_exists(options.genome_nt_dir)
        check_dir_exists(options.taxon_profile_dir)

        make_sure_path_exists(options.output_dir)

        nt_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
        seqs_are_nucleotide = self._check_nuclotide_seqs(nt_files)
        if not seqs_are_nucleotide:
            self.logger.warning('All files must contain nucleotide sequences.')
            sys.exit()

        # locate, extract and classify the 16S sequences
        ssu_runner = SSU(options.cpus)
        hits = ssu_runner.identify(nt_files,
                                   options.evalue,
                                   options.concatenate,
                                   options.output_dir)
        extracted_seq_files = ssu_runner.extract(nt_files, hits, options.output_dir)
        classifications = ssu_runner.classify(extracted_seq_files,
                                              options.ssu_db,
                                              options.ssu_taxonomy_file,
                                              options.evalue,
                                              options.output_dir)

        # report on scaffolds carrying SSU genes
        self.logger.info(
            'Identifying scaffolds with 16S rRNA genes with divergent taxonomic classification.'
        )

        ssu_runner.erroneous(hits,
                             classifications,
                             options.taxon_profile_dir,
                             options.common_taxon,
                             options.ssu_min_len,
                             options.ssu_domain,
                             options.ssu_phylum,
                             options.ssu_class,
                             options.ssu_order,
                             options.ssu_family,
                             options.ssu_genus,
                             options.output_dir)

        self.logger.info('SSU information written to: ' + options.output_dir)
Exemplo n.º 21
0
    def aai(self, options):
        """Run the aai command: AAI between homologs in genome pairs.

        Genome ids are derived from the '.faa' files found in the 'genes'
        sub-directory of options.rblast_dir. The process exits (SystemExit)
        when no such file is found.
        """
        banner = '*******************************************************************************'
        for message in ('',
                        banner,
                        ' [CompareM - aai] Calculating the AAI between homologs in genome pairs.',
                        banner,
                        ''):
            self.logger.info(message)

        check_dir_exists(options.rblast_dir)
        make_sure_path_exists(options.output_dir)

        # one genome id per protein file in <rblast_dir>/genes
        protein_dir = os.path.join(options.rblast_dir, 'genes')
        genome_ids = [remove_extension(name, '.faa')
                      for name in os.listdir(protein_dir)
                      if name.endswith('.faa')]

        if len(genome_ids) == 0:
            self.logger.warning('  [Warning] No gene files found. Check the --protein_ext flag used to identify gene files.')
            sys.exit()

        calculator = AAICalculator(options.cpus)
        calculator.run(genome_ids,
                       protein_dir,
                       options.rblast_dir,
                       options.per_identity,
                       options.per_aln_len,
                       options.write_shared_genes,
                       options.output_dir)

        shared_dir = os.path.join(options.output_dir, calculator.shared_genes)
        self.logger.info('')
        self.logger.info('  Identified homologs between genome pairs written to: %s' % shared_dir)

        self.time_keeper.print_time_stamp()
Exemplo n.º 22
0
    def bin_compare(self, options):
        """Run the bin_compare command: compare two sets of genomes.

        Both genome directories must exist and every genome file must pass
        the nucleotide check, otherwise the process exits (SystemExit). The
        comparison is written to options.output_file.
        """

        check_dir_exists(options.genome_nt_dir1)
        check_dir_exists(options.genome_nt_dir2)

        # first genome set
        first_set = self._genome_files(options.genome_nt_dir1, options.genome_ext1)
        first_ok = self._check_nuclotide_seqs(first_set)
        if not first_ok:
            self.logger.warning('All files must contain nucleotide sequences.')
            sys.exit()

        # second genome set
        second_set = self._genome_files(options.genome_nt_dir2, options.genome_ext2)
        second_ok = self._check_nuclotide_seqs(second_set)
        if not second_ok:
            self.logger.warning('All files must contain nucleotide sequences.')
            sys.exit()

        comparer = BinComparer()
        comparer.run(first_set,
                     second_set,
                     options.scaffold_file,
                     options.output_file)

        self.logger.info('Detailed bin comparison written to: ' + options.output_file)
Exemplo n.º 23
0
    def rblast(self, options):
        """Run the rblast command: reciprocal blast between genomes.

        Protein files are the entries of options.protein_dir whose names end
        with options.protein_ext. Each file is rewritten into the 'genes'
        sub-directory of options.output_dir with the genome id appended to
        every gene id and a trailing '*' removed from each sequence. The
        rewritten files are then searched with blastp (options.blastp set)
        or diamond. The process exits (SystemExit) when no protein file is
        found.
        """
        self.logger.info('')
        self.logger.info('*******************************************************************************')
        self.logger.info(' [CompareM - rblast] Performing reciprocal blast between genomes.')
        self.logger.info('*******************************************************************************')

        check_dir_exists(options.protein_dir)
        make_sure_path_exists(options.output_dir)

        aa_gene_files = [os.path.join(options.protein_dir, f)
                         for f in os.listdir(options.protein_dir)
                         if f.endswith(options.protein_ext)]

        if not aa_gene_files:
            self.logger.warning('  [Warning] No gene files found. Check the --protein_ext flag used to identify gene files.')
            sys.exit()

        # modify gene ids to include genome ids in order to ensure
        # all gene identifiers are unique across the set of genomes,
        # also removes the trailing asterisk used to identify the stop
        # codon
        self.logger.info('')
        self.logger.info('  Appending genome identifiers to all gene identifiers.')
        gene_out_dir = os.path.join(options.output_dir, 'genes')
        make_sure_path_exists(gene_out_dir)
        modified_aa_gene_files = []
        for gf in aa_gene_files:
            # NOTE(review): gf is a full path; confirm that remove_extension
            # yields a bare genome id rather than keeping the directory part
            genome_id = remove_extension(gf)

            aa_file = os.path.join(gene_out_dir, genome_id + '.faa')
            # context manager so the file is closed even if reading fails
            with open(aa_file, 'w') as fout:
                for seq_id, seq, annotation in seq_io.read_fasta_seq(gf, keep_annotation=True):
                    fout.write('>' + seq_id + '~' + genome_id + ' ' + annotation + '\n')
                    # endswith() also copes with an empty sequence, where
                    # indexing seq[-1] would raise IndexError
                    if seq.endswith('*'):
                        seq = seq[:-1]
                    fout.write(seq + '\n')

            modified_aa_gene_files.append(aa_file)

        # perform the reciprocal blast with blastp or diamond
        self.logger.info('')
        if options.blastp:
            rblast = ReciprocalBlast(options.cpus)
            rblast.run(modified_aa_gene_files, options.evalue, options.output_dir)

            # concatenate all blast tables to mimic output of diamond, all hits
            # for a given genome MUST be in consecutive order to fully mimic
            # the expected results from diamond
            self.logger.info('')
            self.logger.info('  Creating single file with all blast hits (be patient!).')
            blast_files = sorted(f for f in os.listdir(options.output_dir) if f.endswith('.blastp.tsv'))
            hit_tables = [os.path.join(options.output_dir, f) for f in blast_files]
            concatenate_files(hit_tables, os.path.join(options.output_dir, 'all_hits.tsv'))
        else:
            rdiamond = ReciprocalDiamond(options.cpus)
            rdiamond.run(modified_aa_gene_files, options.evalue, options.per_identity, options.output_dir)

        self.logger.info('')
        self.logger.info('  Reciprocal blast hits written to: %s' % options.output_dir)

        self.time_keeper.print_time_stamp()
Exemplo n.º 24
0
    def annoted_features(self, options):
        """Build annotation abundance matrices from feature abundance matrices.

        For every file name in DefaultValues.FEATURES_ABUNDANCE_FILES (looked
        up in options.features_dir) the per-feature values are summed per
        annotation and written to the file at the same position in
        DefaultValues.ANNOTATE_ABUNDANCE_FILES, also in options.features_dir.

        Inputs read from `options`:
          - features_annotation: two-column, tab-separated file mapping a
            feature id to an annotation id.
          - annotation_description: two-column, tab-separated file whose
            first column lists the known annotation ids.
          - features_dir: directory holding the input and output matrices.
          - removed: when true, annotations whose values sum to zero are
            left out of the output.

        In each input matrix the first line is the header, the first column
        of a data line is the feature id, and sample values start at the
        fourth column. Annotation ids not listed in annotation_description
        are reported once and skipped.

        Raises
        ------
        KeyError
            If a feature id of an input matrix is missing from
            features_annotation.
        ValueError
            If a line of a mapping file does not split into exactly two
            tab-separated fields, or a sample value is not a number.
        """

        # annotation ids already reported as unknown (warn only once each)
        missing = set()

        # feature id -> annotation id
        features2annotation = {}
        with open(options.features_annotation) as f:
            for line in f:
                features_id, annotation = line.rstrip().split('\t')
                features2annotation[features_id] = annotation

        # known annotation ids in file order; the description column is
        # unpacked only to enforce the two-column layout
        annotation_id_list = []
        with open(options.annotation_description) as f:
            for line in f:
                annotation_id, _description = line.rstrip().split('\t')
                annotation_id_list.append(annotation_id)

        annotation_id_list.append('hypothetical protein')

        # counts[annotation id][sample] -> summed value
        counts = dict((annotation_id, {}) for annotation_id in annotation_id_list)

        check_dir_exists(options.features_dir)
        input_matrices = DefaultValues.FEATURES_ABUNDANCE_FILES
        output_matrices = DefaultValues.ANNOTATE_ABUNDANCE_FILES

        for index, matrix_name in enumerate(input_matrices):
            # parse the types from the file name itself so that underscores
            # in the directory path cannot shift the fields
            count_type, abundance_type = matrix_name.split('_')[1:3]
            input_matrix = os.path.join(options.features_dir, matrix_name)
            check_file_exists(input_matrix)

            header = []
            with open(input_matrix) as f:
                for line in f:
                    line_list = line.rstrip().split('\t')

                    if not header:
                        # first line: remember the header and reset the
                        # counts of every sample of this matrix
                        header = line_list
                        for sample in header[3:]:
                            for annotation_id in annotation_id_list:
                                counts[annotation_id][sample] = 0
                        continue

                    annotation_id = features2annotation[line_list[0]]
                    if annotation_id not in counts:
                        if annotation_id not in missing:
                            self.logger.warning(
                                "'%s' not present in %s" %
                                (annotation_id,
                                 options.annotation_description))
                            missing.add(annotation_id)
                        continue

                    sample_counts = counts[annotation_id]
                    for i in range(3, len(header)):
                        sample_counts[header[i]] += float(line_list[i])

            samples = header[3:]
            output_matrix = os.path.join(options.features_dir,
                                         output_matrices[index])
            self.logger.info('Print %s %s abundance matrix in "%s"' %
                             (count_type, abundance_type, output_matrix))

            # context manager so the matrix is flushed and closed
            with open(output_matrix, 'w') as output_handle:
                output_handle.write('\t'.join(['Features'] + samples) + '\n')
                for annotation in annotation_id_list:
                    # follow the header's sample order so every value lands
                    # under its own column and samples left over from a
                    # previous matrix are not written
                    row = [counts[annotation][sample] for sample in samples]
                    if options.removed and sum(row) == 0:
                        continue
                    output_handle.write(
                        '\t'.join([annotation] + [str(v) for v in row]) + '\n')

        self.logger.info('Printing matrices done')