Example #1
    def _producer_blast(self, genome_pair):
        """Apply reciprocal blast to a pair of genomes.

        Parameters
        ----------
        genome_pair : list
            Pair of amino acid gene files (FASTA) for the two genomes to process.
        """

        blast = Blast(cpus=self.producer_cpus)

        aa_gene_fileA, aa_gene_fileB = genome_pair

        genome_idA = remove_extension(aa_gene_fileA)
        genome_idB = remove_extension(aa_gene_fileB)

        dbA = os.path.join(self.output_dir, genome_idA + '.db')
        dbB = os.path.join(self.output_dir, genome_idB + '.db')

        output_fileAB = os.path.join(self.output_dir, genome_idA + '-' + genome_idB + '.blastp.tsv')
        blast.blastp(aa_gene_fileA, dbB, output_fileAB, self.evalue)

        output_fileBA = os.path.join(self.output_dir, genome_idB + '-' + genome_idA + '.blastp.tsv')
        blast.blastp(aa_gene_fileB, dbA, output_fileBA, self.evalue)

        return True
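Note: every example on this page derives a genome or sample id from a file path with biolib's remove_extension helper. A minimal sketch of the assumed behavior (illustrative only; the real helper lives in biolib.common and may differ in detail):

import ntpath

def remove_extension(filename, extension=None):
    """Illustrative stand-in: strip the directory part and the
    (given or last) extension from a path."""
    base = ntpath.basename(filename)
    if extension and base.endswith(extension):
        return base[:len(base) - len(extension)].rstrip('.')
    return base.rsplit('.', 1)[0] if '.' in base else base

print(remove_extension('bins/genomeA.fna'))           # genomeA
print(remove_extension('sample.sorted.bam', '.bam'))  # sample.sorted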
Example #2
    def run(self, bam_files, out_file, all_reads, min_align_per, max_edit_dist_per):
        """Calculate coverage of sequences for each BAM file."""

        # make sure all BAM files are indexed
        for bam_file in bam_files:
            if not os.path.exists(bam_file + '.bai'):
                self.logger.error('  [Error] BAM index file is missing: ' + bam_file + '.bai\n')
                sys.exit()

        # calculate coverage of each BAM file
        coverage_info = {}
        for i, bam_file in enumerate(bam_files):
            self.logger.info('')
            self.logger.info('  Calculating coverage profile for %s (%d of %d):' % (ntpath.basename(bam_file), i + 1, len(bam_files)))

            coverage_info[bam_file] = mp.Manager().dict()
            coverage_info[bam_file] = self._process_bam(bam_file, all_reads, min_align_per, max_edit_dist_per, coverage_info[bam_file])

        fout = open(out_file, 'w')
        header = 'Scaffold Id\tLength (bp)'
        for bam_file in bam_files:
            bam_id = remove_extension(bam_file)
            header += '\t' + bam_id
        fout.write(header + '\n')

        first_bam = bam_files[0]
        for seq_id in coverage_info[first_bam]:
            row_str = seq_id + '\t' + str(coverage_info[first_bam][seq_id].seq_len)
            for bam_file in bam_files:
                bam_id = remove_extension(bam_file)
                row_str += '\t' + str(coverage_info[bam_file][seq_id].coverage)
            fout.write(row_str + '\n')

        fout.close()
Example #3
File: coverage.py Project: wwood/RefineM
    def run(self, bam_files, out_file, all_reads, min_align_per, max_edit_dist_per):
        """Calculate coverage of sequences for each BAM file."""

        # make sure all BAM files are indexed
        for bam_file in bam_files:
            if not os.path.exists(bam_file + '.bai'):
                self.logger.error('BAM index file is missing: ' + bam_file + '.bai\n')
                sys.exit()

        # calculate coverage of each BAM file
        coverage_info = {}
        for i, bam_file in enumerate(bam_files):
            self.logger.info('Calculating coverage profile for %s (%d of %d):' % (ntpath.basename(bam_file), i + 1, len(bam_files)))

            coverage_info[bam_file] = mp.Manager().dict()
            coverage_info[bam_file] = self._process_bam(bam_file, all_reads, min_align_per, max_edit_dist_per, coverage_info[bam_file])

        fout = open(out_file, 'w')
        header = 'Scaffold Id\tLength (bp)'
        for bam_file in bam_files:
            bam_id = remove_extension(bam_file)
            header += '\t' + bam_id
        fout.write(header + '\n')

        first_bam = bam_files[0]
        for seq_id in coverage_info[first_bam]:
            row_str = seq_id + '\t' + str(coverage_info[first_bam][seq_id].seq_len)
            for bam_file in bam_files:
                bam_id = remove_extension(bam_file)
                if seq_id in coverage_info[bam_file]:
                    row_str += '\t' + str(coverage_info[bam_file][seq_id].coverage)
                else:
                    row_str += '\t' + '0.0'
            fout.write(row_str + '\n')

        fout.close()
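The coverage table written above is a plain TSV: one row per scaffold, a length column, then one coverage column per BAM file (scaffolds missing from a sample are written as 0.0 in this variant). A hedged sketch of reading it back into memory; read_coverage_table is a hypothetical helper, not part of RefineM:

import csv

def read_coverage_table(coverage_file):
    """Parse the scaffold coverage TSV into d[scaffold_id][bam_id] -> coverage."""
    coverage = {}
    with open(coverage_file) as f:
        reader = csv.reader(f, delimiter='\t')
        header = next(reader)  # Scaffold Id, Length (bp), <bam ids...>
        bam_ids = header[2:]
        for row in reader:
            coverage[row[0]] = dict(zip(bam_ids, map(float, row[2:])))
    return coverage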
Example #4
    def add_compatible(self, scaffold_file, genome_file, compatible_file, min_len, out_genome):
        """Add sequences specified as compatible.

        Parameters
        ----------
        scaffold_file : str
            Fasta file containing scaffolds to add.
        genome_file : str
            Fasta file of binned scaffolds.
        compatible_file : str
            File specifying compatible scaffolds.
        min_len : int
            Minimum length to add scaffold.
        out_genome : str
            Name of output genome.
        """

        cur_bin_id = remove_extension(genome_file)

        # determine statistics for each potentially compatible scaffold
        scaffold_ids = set()
        with open(compatible_file) as f:
            headers = [x.strip() for x in f.readline().split('\t')]
            scaffold_gc_index = headers.index('Scaffold GC')
            genome_gc_index = headers.index('Median genome GC')
            td_dist_index = headers.index('Scaffold TD')
            scaffold_cov_index = headers.index('Scaffold coverage')
            genome_cov_index = headers.index('Median genome coverage')

            for line in f:
                line_split = line.split('\t')
                scaffold_id = line_split[0]
                bin_id = line_split[1].strip()

                scaffold_gc = float(line_split[scaffold_gc_index])
                genome_gc = float(line_split[genome_gc_index])
                gc_dist = abs(scaffold_gc - genome_gc)

                td_dist = float(line_split[td_dist_index])

                scaffold_cov = float(line_split[scaffold_cov_index])
                genome_cov = float(line_split[genome_cov_index])
                cov_dist = abs(scaffold_cov - genome_cov)

                if bin_id == cur_bin_id:
                    scaffold_ids.add(scaffold_id)

        # add compatible sequences to genome
        added_seqs = 0
        genome_seqs = seq_io.read(genome_file)
        for seq_id, seq in seq_io.read_seq(scaffold_file):
            if seq_id in scaffold_ids:
                if len(seq) >= min_len:
                    genome_seqs[seq_id] = seq
                    added_seqs += 1
                
        self.logger.info('Added {:,} scaffolds meeting length criterion.'.format(added_seqs))

        # save modified bin
        seq_io.write_fasta(genome_seqs, out_genome)
Example #5
    def _prefix_gene_identifiers(self, gene_files, keep_headers, output_file):
        """Prefix all gene IDs with genome IDs: <genome_id>~<gene_id>.
        
        Parameters
        ----------
        gene_files : list of str
            Genes in fasta files to modify.
        keep_headers : boolean
            If True, indicates FASTA headers already have the format <genome_id>~<gene_id>.
        output_file : str
            Name of FASTA file to contain modified genes.
        """

        fout = open(output_file, 'w')
        for gf in gene_files:
            genome_id = remove_extension(gf)
            if genome_id.endswith('_genes'):
                genome_id = genome_id[0:genome_id.rfind('_genes')]

            for seq_id, seq, annotation in seq_io.read_fasta_seq(
                    gf, keep_annotation=True):
                if keep_headers:
                    fout.write('>' + seq_id + ' ' + annotation + '\n')
                else:
                    fout.write('>' + genome_id + '~' + seq_id + ' ' +
                               annotation + '\n')
                fout.write(seq + '\n')
        fout.close()
Example #6
    def _prefix_gene_identifiers(self, gene_files, keep_headers, output_file):
        """Prefix all gene IDs with genome IDs: <genome_id>~<gene_id>.

        Parameters
        ----------
        gene_files : list of str
            Genes in fasta files to modify.
        keep_headers : boolean
            If True, indicates FASTA headers already have the format <genome_id>~<gene_id>.
        output_file : str
            Name of FASTA file to contain modified genes.
        """

        fout = open(output_file, 'w')
        for gf in gene_files:
            genome_id = remove_extension(gf)
            if genome_id.endswith('_genes'):
                genome_id = genome_id[0:genome_id.rfind('_genes')]

            for seq_id, seq, annotation in seq_io.read_fasta_seq(gf, keep_annotation=True):
                if keep_headers:
                    fout.write('>' + seq_id + ' ' + annotation + '\n')
                else:
                    fout.write('>' + genome_id + '~' + seq_id + ' ' + annotation + '\n')
                fout.write(seq + '\n')
        fout.close()
Example #7
def concatenate_gene_files(gene_files, concatenated_gene_file):
    """Combine all gene files into a single file.

    Gene ids are modified to include genome ids in order to ensure
    all gene identifiers are unique across the set of genomes.

    Parameters
    ----------
    gene_files : list of str
        Fasta files of called genes to process.
    concatenated_gene_file : str
        Name of file to contain concatenated gene files.
    """

    fout = open(concatenated_gene_file, 'w')

    for gf in gene_files:
        genome_id = remove_extension(gf)

        for seq_id, seq in seq_io.read_seq(gf):
            fout.write('>' + seq_id + '~' + genome_id + '\n')
            if seq[-1] == '*':
                seq = seq[0:-1]
            fout.write(seq + '\n')

    fout.close()
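A minimal usage sketch for the helper above, assuming a hypothetical genes/ directory with one FASTA of called genes per genome:

from glob import glob

# Gene ids in the combined file stay unique across genomes because each id
# is suffixed with its genome id (the variant in Example #9 prefixes instead).
gene_files = sorted(glob('genes/*.faa'))
concatenate_gene_files(gene_files, 'all_genes.faa')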
Example #8
    def amend_gene_identifies(self, gene_dir, output_dir):
        """Modify gene ids to include source genome id.

        The following format is used:
          <gene_id>~<genome_id>

        Parameters
        ----------
        gene_dir : str
            Directory with fasta files containing protein sequences.
        output_dir : str
            Directory to contain modified fasta files.
        """

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        for f in os.listdir(gene_dir):
            gf = os.path.join(gene_dir, f)
            genome_id = remove_extension(gf)

            aa_file = os.path.join(output_dir, genome_id + '.faa')
            fout = open(aa_file, 'w')
            for seq_id, seq, annotation in seq_io.read_fasta_seq(gf, keep_annotation=True):
                fout.write('>' + seq_id + '~' + genome_id + ' ' + annotation + '\n')
                if seq[-1] == '*':
                    seq = seq[0:-1]
                fout.write(seq + '\n')
            fout.close()
Example #9
File: common.py Project: wwood/RefineM
def concatenate_gene_files(gene_files, concatenated_gene_file):
    """Combine all gene files into a single file.

    Gene ids are modified to include genome ids in order to ensure
    all gene identifiers are unique across the set of genomes.

    Parameters
    ----------
    gene_files : list of str
        Fasta files of called genes to process.
    concatenated_gene_file : str
        Name of file to contain concatenated gene files.
    """

    fout = open(concatenated_gene_file, 'w')

    for gf in gene_files:
        genome_id = remove_extension(gf)

        for seq_id, seq in seq_io.read_seq(gf):
            fout.write('>' + genome_id + '~' + seq_id + '\n')
            if seq[-1] == '*':
                seq = seq[0:-1]
            fout.write(seq + '\n')

    fout.close()
Example #10
    def _parse_fastani_results(self, fastout_file, list_leaf):
        """Parse the fastANI output file.

        Parameters
        ----------
        fastout_file : str
            fastANI output file.
        list_leaf : list
            Unused in this method.

        Returns
        -------
        dictionary
            dict_results[user_g] = {"ref_genome": ref_genome, "ani": ani}
        """
        dict_results = {}
        with open(fastout_file) as fastfile:
            for line in fastfile:
                info = line.strip().split(" ")
                ref_genome = os.path.basename(info[1]).replace(
                    Config.FASTANI_GENOMES_EXT, "")
                user_g = remove_extension(os.path.basename(info[0]))
                ani = float(info[2])
                if user_g in dict_results:
                    print("it should not happen! (if user_g in dict_results)")
                else:
                    dict_results[user_g] = {
                        "ref_genome": ref_genome,
                        "ani": ani
                    }

        return dict_results
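For context, a fastANI output line carries five columns: query path, reference path, ANI, fragments mapped, and total query fragments; the parser above keeps only the first three. A tiny illustration with a made-up line:

line = 'user/genomeA.fna ref/GCF_000005845.fna 97.66 1422 1480'
query, reference, ani = line.split()[:3]
print(query, reference, float(ani))  # the two fragment counts are ignored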
Example #11
    def amend_gene_identifies(self, gene_dir, output_dir):
        """Modify gene ids to include source genome id.

        The following format is used:
          <genome_id>~<gene_id>

        Parameters
        ----------
        gene_dir : str
            Directory with fasta files containing protein sequences.
        output_dir : str
            Directory to contain modified fasta files.
        """

        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        for f in os.listdir(gene_dir):
            gf = os.path.join(gene_dir, f)
            genome_id = remove_extension(gf)

            aa_file = os.path.join(output_dir, genome_id + '.faa')
            fout = open(aa_file, 'w')
            for seq_id, seq, annotation in seq_io.read_fasta_seq(gf, keep_annotation=True):
                fout.write('>%s~%s %s\n' % (genome_id, seq_id, annotation))
                if seq[-1] == '*':
                    seq = seq[0:-1]
                fout.write(seq + '\n')
            fout.close()
Example #12
File: outliers.py Project: wwood/RefineM
    def add_compatible_unique(self, scaffold_file, genome_file,
                              compatible_file, min_len, out_genome):
        """Add sequences specified as compatible.

        Only sequences specified exactly once in the
        compatibility file are added.

        Parameters
        ----------
        scaffold_file : str
            Fasta file containing scaffolds to add.
        genome_file : str
            Fasta file of binned scaffolds.
        compatible_file : str
            File specifying compatible scaffolds.
        min_len : int
            Minimum length to add scaffold.
        out_genome : str
            Name of output genome.
        """

        cur_bin_id = remove_extension(genome_file)

        # determine scaffolds compatible with genome
        scaffold_ids = []
        bin_ids = {}
        with open(compatible_file) as f:
            f.readline()

            for line in f:
                line_split = line.split('\t')
                scaffold_id = line_split[0]
                bin_id = line_split[1].strip()

                scaffold_ids.append(scaffold_id)
                bin_ids[scaffold_id] = bin_id

        compatible_scaffolds = set()
        for scaffold_id, bin_id in bin_ids.items():
            if scaffold_ids.count(scaffold_id) == 1 and bin_id == cur_bin_id:
                compatible_scaffolds.add(scaffold_id)

        self.logger.info('Identified %d compatible scaffolds.' %
                         len(compatible_scaffolds))

        # add compatible sequences to genome
        added_seqs = 0
        genome_seqs = seq_io.read(genome_file)
        for seq_id, seq in seq_io.read_seq(scaffold_file):
            if seq_id in compatible_scaffolds:
                if len(seq) >= min_len:
                    genome_seqs[seq_id] = seq
                    added_seqs += 1

        self.logger.info('Added %d scaffolds meeting length criterion.' %
                         added_seqs)

        # save modified bin
        seq_io.write_fasta(genome_seqs, out_genome)
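scaffold_ids.count(...) inside the loop above makes the uniqueness check quadratic in the number of scaffolds; a collections.Counter gives the same answer in one pass. A sketch of just that step (uniquely_compatible is a hypothetical helper using the same names as the method above):

from collections import Counter

def uniquely_compatible(scaffold_ids, bin_ids, cur_bin_id):
    """Scaffolds listed exactly once and assigned to the current bin."""
    occurrences = Counter(scaffold_ids)
    return set(sid for sid, bid in bin_ids.items()
               if occurrences[sid] == 1 and bid == cur_bin_id)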
Example #13
File: ssu.py Project: wwood/RefineM
    def identify(self, genome_files, evalue_threshold, concatenate_threshold,
                 output_dir):
        """Identify 16S rRNA genes.

        Parameters
        ----------
        genome_files : iterable
            Path to genome files to process.
        evalue_threshold : float
            E-value threshold for defining valid hits.
        concatenate_threshold : int
            Concatenate hits within the specified number of base pairs.
        output_dir : str
            Output directory.

        Returns
        -------
        dict : d[genome_id][seq_id] -> information about best hit
            Information about best hits for each genome.
        """

        self.logger.info('Identifying SSU rRNA genes.')
        best_hits = {}
        for genome_file in genome_files:
            genome_id = remove_extension(genome_file)
            genome_dir = os.path.join(output_dir, genome_id)
            make_sure_path_exists(genome_dir)

            # identify 16S reads from contigs/scaffolds
            self._hmm_search(genome_file, evalue_threshold, genome_dir)

            # read HMM hits
            hits_per_domain = {}
            for domain in ['archaea', 'bacteria', 'euk']:
                seq_info = self._read_hits(
                    os.path.join(genome_dir, 'ssu.hmm_' + domain + '.txt'),
                    domain, evalue_threshold)

                hits = {}
                if len(seq_info) > 0:
                    for seq_id, seq_hits in seq_info.items():
                        for hit in seq_hits:
                            self._add_hit(hits, seq_id, hit,
                                          concatenate_threshold)

                hits_per_domain[domain] = hits

            # find best domain hit for each sequence
            best_hits[genome_id] = {}
            for _, hits in hits_per_domain.items():
                for seq_id, info in hits.items():
                    if '-#' in seq_id:
                        seq_id = seq_id[0:seq_id.rfind('-#')]

                    self._add_domain_hit(best_hits[genome_id], seq_id, info)

        return best_hits
Example #14
    def run(self, input_tree, msa_file, outgroup_file, perc_taxa_to_keep,
            num_replicates, model, output_dir):
        """Jackknife taxa.

        Parameters
        ----------
        input_tree : str
          Tree inferred with all data.
        msa_file : str
          File containing multiple sequence alignment for all taxa.
        outgroup_file : str
          File indicating labels of outgroup taxa.
        perc_taxa_to_keep : float
          Percentage of taxa to keep in each replicate.
        num_replicates : int
          Number of replicates to perform.
        model : str
          Desired model of evolution.
        output_dir : str
          Output directory for jackknife replicate trees.
        """

        assert (model in ['wag', 'jtt'])

        self.perc_taxa_to_keep = perc_taxa_to_keep
        self.model = model
        self.replicate_dir = os.path.join(output_dir, 'replicates')
        make_sure_path_exists(self.replicate_dir)

        # read outgroup taxa
        self.outgroup_ids = set()
        if outgroup_file:
            for line in open(outgroup_file):
                self.outgroup_ids.add(line.strip())

        # read full multiple sequence alignment
        self.msa = seq_io.read(msa_file)

        # calculate replicates
        #***self.logger.info('Calculating jackknife taxa replicates:')
        #***parallel = Parallel(self.cpus)
        #***parallel.run(self._producer, None, range(num_replicates), self._progress)

        # calculate support
        rep_tree_files = []
        for rep_index in range(num_replicates):
            rep_tree_files.append(
                os.path.join(self.replicate_dir,
                             'jk_taxa.tree.' + str(rep_index) + '.tre'))

        tree_support = TreeSupport()
        output_tree = os.path.join(
            output_dir,
            remove_extension(input_tree) + '.jk_taxa.tree')
        tree_support.subset_taxa(input_tree, rep_tree_files, output_tree)

        return output_tree
Example #15
File: ssu.py Project: wwood/RefineM
    def extract(self, genome_files, best_hits, output_dir):
        """Extract 16S rRNA genes.

        Parameters
        ----------
        genome_files : iterable
            Path to genome files to process.
        best_hits : d[genome_id][seq_id] -> information about best hit
            Information about best hits for each genome.
        output_dir : str
            Output directory.

        Returns
        -------
        d[genome_id] -> str
            Fasta file containing SSU sequences for each genome.
        """

        self.logger.info('Extracting SSU rRNA genes.')
        ssu_seq_files = {}
        for genome_file in genome_files:
            genome_id = remove_extension(genome_file)
            genome_dir = os.path.join(output_dir, genome_id)

            if len(best_hits[genome_id]) == 0:
                continue

            # write summary file and putative SSU rRNAs to file
            summary_file = os.path.join(genome_dir, 'ssu.hmm_summary.tsv')
            summary_out = open(summary_file, 'w')
            summary_out.write(
                'Sequence Id\tHMM\ti-Evalue\tStart hit\tEnd hit\tSSU gene length\tReverse Complement\tSequence length\n'
            )

            ssu_seq_files[genome_id] = os.path.join(genome_dir, 'ssu.fna')
            seq_out = open(ssu_seq_files[genome_id], 'w')

            seqs = seq_io.read(genome_file)

            for seq_id in best_hits[genome_id]:
                orig_seq_id = seq_id
                if '-#' in seq_id:
                    seq_id = seq_id[0:seq_id.rfind('-#')]

                seq_info = [orig_seq_id] + best_hits[genome_id][orig_seq_id]
                seq = seqs[seq_id]
                summary_out.write('\t'.join(seq_info) + '\n')

                seq_out.write('>' + seq_info[0] + '\n')
                seq_out.write(seq[int(seq_info[3]) + 1:int(seq_info[4]) + 1] +
                              '\n')

            summary_out.close()
            seq_out.close()

        return ssu_seq_files
Example #16
    def run(self, input_tree, msa_file, num_replicates, model, base_type, frac,
            output_dir):
        """Bootstrap multiple sequence alignment.

        Parameters
        ----------
        input_tree : str
          Tree inferred with all data.
        msa_file : str
          File containing multiple sequence alignment for all taxa.
        num_replicates : int
          Number of replicates to perform.
        model : str
          Desired model of evolution.
        base_type : str
          Indicates if bases are nucleotides or amino acids.
        frac : float
          Fraction of alignment to subsample.
        output_dir : str
          Directory for bootstrap trees.
        """

        assert (model in ['wag', 'lg', 'jtt'])
        assert (base_type in ['nt', 'prot'])

        self.model = model
        self.base_type = base_type
        self.frac = frac

        self.replicate_dir = os.path.join(output_dir, 'replicates')
        make_sure_path_exists(self.replicate_dir)

        # read full multiple sequence alignment
        self.msa = seq_io.read(msa_file)

        # calculate replicates
        self.logger.info('Calculating bootstrap replicates:')
        parallel = Parallel(self.cpus)
        parallel.run(self._producer, None, range(num_replicates),
                     self._progress)

        # calculate support values
        rep_tree_files = []
        for rep_index in range(num_replicates):
            rep_tree_files.append(
                os.path.join(self.replicate_dir,
                             'bootstrap_tree.r_' + str(rep_index) + '.tree'))

        output_tree = os.path.join(
            output_dir,
            remove_extension(input_tree) + '.bootstrap.tree')
        bootstrap_support(input_tree, rep_tree_files, output_tree)

        return output_tree
Example #17
    def run(self, input_tree, msa_file, outgroup_file, perc_taxa_to_keep, num_replicates, model, output_dir):
        """Jackknife taxa.

        Parameters
        ----------
        input_tree : str
          Tree inferred with all data.
        msa_file : str
          File containing multiple sequence alignment for all taxa.
        outgroup_file : str
          File indicating labels of outgroup taxa.
        perc_taxa_to_keep : float
          Percentage of taxa to keep in each replicate.
        num_replicates : int
          Number of replicates to perform.
        model : str
          Desired model of evolution.
        output_dir : str
          Output directory for jackknife replicate trees.
        """

        assert(model in ['wag', 'jtt'])

        self.perc_taxa_to_keep = perc_taxa_to_keep
        self.model = model
        self.replicate_dir = os.path.join(output_dir, 'replicates')
        make_sure_path_exists(self.replicate_dir)

        # read outgroup taxa
        self.outgroup_ids = set()
        if outgroup_file:
            for line in open(outgroup_file):
                self.outgroup_ids.add(line.strip())

        # read full multiple sequence alignment
        self.msa = seq_io.read(msa_file)

        # calculate replicates
        #***self.logger.info('Calculating jackknife taxa replicates:')
        #***parallel = Parallel(self.cpus)
        #***parallel.run(self._producer, None, range(num_replicates), self._progress)

        # calculate support
        rep_tree_files = []
        for rep_index in range(num_replicates):
            rep_tree_files.append(os.path.join(self.replicate_dir, 'jk_taxa.tree.' + str(rep_index) + '.tre'))

        tree_support = TreeSupport()
        output_tree = os.path.join(output_dir, remove_extension(input_tree) + '.jk_taxa.tree')
        tree_support.subset_taxa(input_tree, rep_tree_files, output_tree)

        return output_tree
Example #18
    def add_compatible_unique(self, scaffold_file, genome_file, compatible_file, out_genome):
        """Add sequences specified as compatible.

        Only sequences specified exactly once in the
        compatibility file are added.

        Parameters
        ----------
        scaffold_file : str
            Fasta file containing scaffolds to add.
        genome_file : str
            Fasta file of binned scaffolds.
        compatible_file : str
            File specifying compatible scaffolds.
        out_genome : str
            Name of output genome.
        """

        cur_bin_id = remove_extension(genome_file)

        # determine scaffolds compatible with genome
        scaffold_ids = []
        bin_ids = {}
        with open(compatible_file) as f:
            f.readline()

            for line in f:
                line_split = line.split('\t')
                scaffold_id = line_split[0]
                bin_id = line_split[1].strip()

                scaffold_ids.append(scaffold_id)
                bin_ids[scaffold_id] = bin_id

        compatible_scaffolds = set()
        for scaffold_id, bin_id in bin_ids.items():
            if scaffold_ids.count(scaffold_id) == 1 and bin_id == cur_bin_id:
                compatible_scaffolds.add(scaffold_id)

        # add compatible sequences to genome
        genome_seqs = seq_io.read(genome_file)
        for seq_id, seq in seq_io.read_seq(scaffold_file):
            if seq_id in compatible_scaffolds:
                genome_seqs[seq_id] = seq

        # save modified bin
        seq_io.write_fasta(genome_seqs, out_genome)
Example #19
    def _producer_db(self, aa_gene_file):
        """Create blast database.

        Parameters
        ----------
        aa_gene_file : str
            Fasta file with genes in amino acid space.
        """

        genome_id = remove_extension(aa_gene_file)

        blast_DB = os.path.join(self.output_dir, genome_id + '.db')
        log_file = os.path.join(self.output_dir, genome_id + '.log')
        cmd = 'makeblastdb -dbtype prot -in %s -out %s -logfile %s' % (aa_gene_file, blast_DB, log_file)
        os.system(cmd)

        return True
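A hedged alternative to the os.system call above: the same makeblastdb flags run through subprocess, so a non-zero exit status raises instead of passing silently (make_blast_db is an illustrative helper, not project code):

import subprocess

def make_blast_db(aa_gene_file, blast_db, log_file):
    """Build a protein BLAST database, raising CalledProcessError on failure."""
    cmd = ['makeblastdb', '-dbtype', 'prot',
           '-in', aa_gene_file, '-out', blast_db, '-logfile', log_file]
    subprocess.check_call(cmd)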
Example #20
File: genome_tk.py Project: wwood/biolib
def unique(genome_files):
    """Check if sequences are assigned to multiple bins.

    Parameters
    ----------
    genome_files : iterable
        Path to genome fasta files.

    Returns
    -------
    dict : d[genome_id][genome_id] -> [shared sequences]
        Sequences duplicated within a genome or shared between pairs of genomes.
    """

    # read sequence IDs from all genomes,
    # while checking for duplicate sequences within a genome
    duplicates = defaultdict(lambda: defaultdict(list))

    genome_seqs = {}
    for f in genome_files:
        genome_id = remove_extension(f)

        seq_ids = set()
        for seq_id, _seq in seq_io.read_seq(f):
            if seq_id in seq_ids:
                duplicates[genome_id][genome_id].append(seq_id)

            seq_ids.add(seq_id)

        genome_seqs[genome_id] = seq_ids

    # check for sequences assigned to multiple bins
    genome_ids = list(genome_seqs)
    for i in range(0, len(genome_ids)):
        seq_idsI = genome_seqs[genome_ids[i]]

        for j in range(i + 1, len(genome_ids)):
            seq_idsJ = genome_seqs[genome_ids[j]]

            seq_intersection = seq_idsI.intersection(seq_idsJ)

            if len(seq_intersection) > 0:
                duplicates[genome_ids[i]][genome_ids[j]] = seq_intersection
                duplicates[genome_ids[j]][genome_ids[i]] = seq_intersection

    return duplicates
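A short usage sketch for unique, assuming a hypothetical bins/ directory of FASTA files:

from glob import glob

duplicates = unique(sorted(glob('bins/*.fna')))
for genome_a, shared in duplicates.items():
    for genome_b, seq_ids in shared.items():
        print('%s / %s: %d duplicated or shared sequence(s)'
              % (genome_a, genome_b, len(seq_ids)))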
Example #21
    def reformat_gene_id_to_scaffold_id(self, gene_file, gff_file, taxonomy, output_file):
        """Reformat gene ids to format which explicitly gives scaffold names.

        <genome_id>~<scaffold_id>_<gene_#> [gtdb_taxonomy] [NCBI organism name] [annotation]

        Parameters
        ----------
        gene_file : str
            Gene file for genome.
        gff_file : str
            General feature file (GFF) for genome.
        taxonomy : d[genome_id] -> [taxa]
            GTDB taxonomy of each genome.
        output_file : str
            File to contain modified gene fasta file.
        """

        # determine source scaffold for each gene
        gene_id_to_scaffold_id = {}
        gene_number = defaultdict(int)
        for line in open(gff_file):
            if line.startswith('##FASTA'):
                # start of FASTA section with individual sequences
                break

            if line[0] == '#':
                continue

            line_split = line.split('\t')
            scaffold_id = line_split[0]
            info = line_split[8]
            if info != '':  # this will be empty for non-protein coding genes
                gene_id = info.split(';')[0].replace('ID=', '')

                gene_number[scaffold_id] += 1
                gene_id_to_scaffold_id[gene_id] = scaffold_id + '_' + str(gene_number[scaffold_id])

        # write out gene file with modified identifiers
        genome_id = remove_extension(gene_file)
        fout = open(output_file, 'w')
        for gene_id, seq, annotation in seq_io.read_fasta_seq(gene_file, keep_annotation=True):
            fout.write('>%s [%s] [%s] [%s]\n' % (gene_id_to_scaffold_id[gene_id],
                                                 ';'.join(taxonomy.get(genome_id, ['none'])),
                                                 'none',
                                                 annotation))
            fout.write(seq + '\n')
        fout.close()
Example #22
File: main.py Project: gsc0107/RefineM
    def filter_bins(self, options):
        """Filter bins command"""

        make_sure_path_exists(options.output_dir)

        genome_files = self._genome_files(options.genome_nt_dir,
                                          options.genome_ext)
        if not self._check_nuclotide_seqs(genome_files):
            self.logger.warning('All files must contain nucleotide sequences.')
            sys.exit()

        outliers = Outliers()
        for genome_file in genome_files:
            gf = remove_extension(
                genome_file) + '.filtered.' + options.genome_ext
            out_genome = os.path.join(options.output_dir, gf)
            outliers.remove_outliers(genome_file, options.filter_file,
                                     out_genome, options.modified_only)

        self.logger.info('Modified genomes written to: ' + options.output_dir)
Example #23
    def _genome_seqs(self, genome_files):
        """Get unique id of sequences in each genome.

        Parameters
        ----------
        genome_files : iterable
            Genome files in fasta format.

        Returns
        -------
        dict: d[genome_id] -> set(seq_id1, ..., seq_idN)
            Ids of sequences in each genome.
        """

        genome_seqs = defaultdict(set)
        for genome_file in genome_files:
            genome_id = remove_extension(genome_file)
            for seq_id, _seq in seq_io.read_seq(genome_file):
                genome_seqs[genome_id].add(seq_id)

        return genome_seqs
Example #24
    def _genome_seqs(self, genome_files):
        """Get unique id of sequences in each genome.

        Parameters
        ----------
        genome_files : iterable
            Genome files in fasta format.

        Returns
        -------
        dict: d[genome_id] -> set(seq_id1, ..., seq_idN)
            Ids of sequences in each genome.
        """

        genome_seqs = defaultdict(set)
        for genome_file in genome_files:
            genome_id = remove_extension(genome_file)
            for seq_id, _seq in seq_io.read_seq(genome_file):
                genome_seqs[genome_id].add(seq_id)

        return genome_seqs
Example #25
File: main.py Project: gsc0107/RefineM
    def manual(self, options):
        """Manual command"""

        check_file_exists(options.cluster_file)
        check_file_exists(options.genome_file)
        make_sure_path_exists(options.output_dir)

        genome_id = remove_extension(options.genome_file)

        seqs = seq_io.read(options.genome_file)
        fout = {}
        with open(options.cluster_file) as f:
            f.readline()

            for line in f:
                line_split = line.rstrip().split('\t')
                scaffold_id = line_split[0]
                cluster_id = int(line_split[1])

                if cluster_id < 0:
                    # negative values indicate scaffolds that should
                    # not be placed in a cluster
                    continue

                if cluster_id not in fout:
                    fout[cluster_id] = open(
                        os.path.join(options.output_dir,
                                     genome_id + '_c%d.fna' % cluster_id), 'w')

                fout[cluster_id].write('>' + scaffold_id + '\n')
                fout[cluster_id].write(seqs[scaffold_id] + '\n')

        for f in fout.values():
            f.close()

        self.logger.info('Partitioned sequences written to: ' +
                         options.output_dir)
Example #26
    def aai(self, options):
        """AAI command"""
        self.logger.info('')
        self.logger.info('*******************************************************************************')
        self.logger.info(' [CompareM - aai] Calculating the AAI between homologs in genome pairs.')
        self.logger.info('*******************************************************************************')
        self.logger.info('')

        check_dir_exists(options.rblast_dir)
        make_sure_path_exists(options.output_dir)

        genome_ids = []
        protein_dir = os.path.join(options.rblast_dir, 'genes')
        for f in os.listdir(protein_dir):
            if f.endswith('.faa'):
                genome_id = remove_extension(f, '.faa')
                genome_ids.append(genome_id)

        if not genome_ids:
            self.logger.warning('  [Warning] No gene files found. Check the --protein_ext flag used to identify gene files.')
            sys.exit()

        aai_calculator = AAICalculator(options.cpus)
        aai_calculator.run(genome_ids,
                           protein_dir,
                           options.rblast_dir,
                           options.per_identity,
                           options.per_aln_len,
                           options.write_shared_genes,
                           options.output_dir)

        shared_genes_dir = os.path.join(options.output_dir, aai_calculator.shared_genes)
        self.logger.info('')
        self.logger.info('  Identified homologs between genome pairs written to: %s' % shared_genes_dir)

        self.time_keeper.print_time_stamp()
Example #27
    def run(self, input_tree, msa_file, num_replicates, model, gamma,
            base_type, frac, boot_dir, output_dir):
        """Bootstrap multiple sequence alignment.

        Parameters
        ----------
        input_tree : str
          Tree inferred with all data.
        msa_file : str
          File containing multiple sequence alignment for all taxa.
        num_replicates : int
          Number of replicates to perform.
        model : str
          Desired model of evolution.
        gamma : boolean
          Flag indicating if gamma-distributed rates should be used.
        base_type : str
          Indicates if bases are nucleotides or amino acids.
        frac : float
          Fraction of alignment to subsample.
        boot_dir : str
          Directory of precomputed bootstrap trees, or None to compute replicates.
        output_dir : str
          Directory for bootstrap trees.
        """

        assert(model in ['wag', 'lg', 'jtt'])
        assert(base_type in ['nt', 'prot'])

        self.model = model
        self.gamma = gamma
        self.base_type = base_type
        self.frac = frac

        rep_tree_files = []
        if not boot_dir:
            self.replicate_dir = os.path.join(output_dir, 'replicates')
            make_sure_path_exists(self.replicate_dir)

            # read full multiple sequence alignment
            self.msa = seq_io.read(msa_file)

            # calculate replicates
            self.logger.info('Calculating bootstrap replicates:')
            parallel = Parallel(self.cpus)
            parallel.run(self._producer, None, range(num_replicates), self._progress)

            for rep_index in range(num_replicates):
                rep_tree_files.append(os.path.join(self.replicate_dir, 'bootstrap_tree.r_' + str(rep_index) + '.tree'))
        else:
            for f in os.listdir(boot_dir):
                if f.endswith('.tree') or f.endswith('.tre'):
                    rep_tree_files.append(os.path.join(boot_dir, f))
            self.logger.info('Read %d bootstrap replicates.' % len(rep_tree_files))
          
        # calculate support values
        self.logger.info('Calculating bootstrap support values.')
        output_tree = os.path.join(output_dir, remove_extension(input_tree) + '.bootstrap.tree')
        bootstrap_support(input_tree, rep_tree_files, output_tree)

        return output_tree
Example #28
    def features(self, options):
        """Making bam features matrix"""

        make_sure_path_exists(options.output_dir)
        reads_abundance = os.path.join(
            options.output_dir, DefaultValues.FEATURES_ABUNDANCE_FILES[0])
        reads_normalised = os.path.join(
            options.output_dir, DefaultValues.FEATURES_ABUNDANCE_FILES[1])
        reads_relative = os.path.join(
            options.output_dir, DefaultValues.FEATURES_ABUNDANCE_FILES[2])
        base_abundance = os.path.join(
            options.output_dir, DefaultValues.FEATURES_ABUNDANCE_FILES[3])
        base_normalised = os.path.join(
            options.output_dir, DefaultValues.FEATURES_ABUNDANCE_FILES[4])
        base_relative = os.path.join(options.output_dir,
                                     DefaultValues.FEATURES_ABUNDANCE_FILES[5])
        reads_count = os.path.join(options.output_dir,
                                   'features_reads_raw_count.tsv')
        tpm_count = os.path.join(options.output_dir, 'TPM.tsv')

        features_size = {}
        raw_counts = {}
        rpk = {}
        counts = {}
        counts_base = {}

        reference = remove_extension(options.faidx, options.faidx_extension)

        self.logger.info('Get features and initialise matrix')
        with open(options.faidx) as f:
            for line in f:
                if not line.startswith('#'):
                    line_list = line.rstrip().split('\t')
                    features = line_list[0]
                    if options.merge:
                        features = options.separator.join(
                            features.split(options.separator)[:-1])
                    if options.genome:
                        features = reference
                    try:
                        features_size[features] = features_size[
                            features] + int(line_list[1])
                    except KeyError:
                        features_size[features] = int(line_list[1])
                    counts[features] = 0
                    counts_base[features] = 0
                    raw_counts[features] = 0
                    rpk[features] = 0

        counts_raw_all = []
        counts_tpm_all = []
        counts_all = []
        counts_all_normalised = []
        counts_all_relative = []
        counts_base_all = []
        counts_base_all_normalised = []
        counts_base_all_relative = []

        header = ["Features", "Features_size"]
        self.logger.info('Browse alignment file(s)')

        samtoolsexec = findEx('samtools')
        samtoolsthreads = '-@ ' + options.threads
        samtoolsminqual = '-q ' + options.mapQ

        with open(options.bam_list, 'r') as b:
            for bam in b:
                if bam.startswith('#'):
                    continue
                i = 0
                alignementfile, librarysize = bam.rstrip().split('\t')
                if librarysize in ('', '0') or options.discard_library_size_normalisation:
                    librarysize = 1
                samplename = remove_extension(os.path.basename(alignementfile),
                                              options.extension)
                header.append(samplename)
                self.logger.info('\t' + samplename)
                cmd = [
                    samtoolsexec, 'view', samtoolsthreads, samtoolsminqual,
                    alignementfile
                ]
                p = subprocess.Popen(cmd, stdout=subprocess.PIPE).stdout
                for line in p:
                    try:
                        line = line.decode(sys.getdefaultencoding()).rstrip()
                    except UnicodeDecodeError:
                        self.logger.error('Could not decode alignment record: %r' % line)
                        sys.exit()
                    if i > 0 and i % 1000000 == 0:
                        self.logger.info("Alignment record %s processed" % i)
                    i += 1
                    line_list = line.split('\t')
                    features = line_list[2]
                    if options.merge:
                        features = options.separator.join(
                            features.split(options.separator)[:-1])
                    if options.genome:
                        features = reference
                    cigar = line_list[5]
                    base_mapped = 0
                    match = re.findall(r'(\d+)M', cigar)
                    read_len = len(line_list[9])  # SEQ field of the SAM record
                    for base_match in match:
                        base_mapped += int(base_match)
                    if read_len == 0:
                        self.logger.info(line_list)
                        continue

                    if base_mapped / read_len < float(options.id_cutoff):
                        continue

                    raw_counts[features] += 1
                    rpk[features] += (1 / int(features_size[features])) * 1000
                    if options.discard_feature_length_normalisation:
                        counts_base[features] += base_mapped
                        counts[features] += 1
                    else:
                        counts_base[features] += (
                            base_mapped / int(features_size[features])
                        ) * options.feature_size_normalisation
                        counts[features] += (
                            1 / int(features_size[features])
                        ) * options.feature_size_normalisation

                if options.library_size_normalisation == 'aligned':
                    librarysize = sum(counts.values())
                    if librarysize == 0:
                        librarysize = 1

                # raw reads count wo gl
                counts_raw_all.append(raw_counts.copy())

                # rpk
                count_tmp = {}
                try:
                    count_tmp = {
                        k: v * 1000000 / total
                        for total in (sum(rpk.values()), )
                        for k, v in rpk.items()
                    }
                except ZeroDivisionError:
                    count_tmp = {k: v for k, v in counts.items()}
                counts_tpm_all.append(count_tmp.copy())

                # raw reads count
                counts_all.append(counts.copy())
                # normalised reads count
                count_tmp = {}
                count_tmp = {
                    k: (v / int(librarysize)) * options.feature_normalisation
                    for k, v in counts.items()
                }
                counts_all_normalised.append(count_tmp.copy())

                # relative reads count
                count_tmp = {}
                try:
                    count_tmp = {
                        k: v / total
                        for total in (sum(counts.values()), )
                        for k, v in counts.items()
                    }
                except ZeroDivisionError:
                    count_tmp = {k: v for k, v in counts.items()}
                counts_all_relative.append(count_tmp.copy())

                # raw bases count
                counts_base_all.append(counts_base.copy())

                # normalised bases count
                count_tmp = {}
                count_tmp = {
                    k: (v / int(librarysize)) * options.feature_normalisation
                    for k, v in counts_base.items()
                }
                counts_base_all_normalised.append(count_tmp.copy())

                # relative bases count
                count_tmp = {}
                try:
                    count_tmp = {
                        k: v / total
                        for total in (sum(counts_base.values()), )
                        for k, v in counts_base.items()
                    }
                except ZeroDivisionError:
                    count_tmp = {k: v for k, v in counts_base.items()}
                counts_base_all_relative.append(count_tmp.copy())

                for fn in counts:
                    raw_counts[fn] = 0
                    counts[fn] = 0
                    counts_base[fn] = 0

        self.logger.info('Print matrices')
        self.logger.info('Print raw reads count matrix in %s' % reads_count)
        output_handle = open(reads_count, "w")
        output_handle.write('\t'.join(header) + '\n')
        for fn in counts.keys():
            if sum([c[fn] for c in counts_raw_all]) == 0 and options.removed:
                continue
            else:
                output_handle.write(
                    '\t'.join([fn] + [str(features_size[fn])] +
                              [str(c[fn]) for c in counts_raw_all]) + '\n')
        output_handle.close()

        self.logger.info('Print TPM matrix in %s' % tpm_count)
        output_handle = open(tpm_count, "w")
        output_handle.write('\t'.join(header) + '\n')
        for fn in counts.keys():
            if sum([c[fn] for c in counts_tpm_all]) == 0 and options.removed:
                continue
            else:
                output_handle.write(
                    '\t'.join([fn] + [str(features_size[fn])] +
                              [str(c[fn]) for c in counts_tpm_all]) + '\n')
        output_handle.close()

        self.logger.info('Print raw reads abundance matrix in %s' %
                         reads_abundance)
        output_handle = open(reads_abundance, "w")
        output_handle.write('\t'.join(header) + '\n')
        for fn in counts.keys():
            if sum([c[fn] for c in counts_all]) == 0 and options.removed:
                continue
            else:
                output_handle.write('\t'.join([fn] + [str(features_size[fn])] +
                                              [str(c[fn])
                                               for c in counts_all]) + '\n')
        output_handle.close()

        self.logger.info('Print normalised reads abundance matrix in %s' %
                         reads_normalised)
        output_handle = open(reads_normalised, "w")
        output_handle.write('\t'.join(header) + '\n')
        for fn in counts.keys():
            if sum([c[fn]
                    for c in counts_all_normalised]) == 0 and options.removed:
                continue
            else:
                output_handle.write(
                    '\t'.join([fn] + [str(features_size[fn])] +
                              [str(c[fn])
                               for c in counts_all_normalised]) + '\n')
        output_handle.close()

        self.logger.info('Print relative reads abundance matrix in %s' %
                         reads_relative)
        output_handle = open(reads_relative, "w")
        output_handle.write('\t'.join(header) + '\n')
        for fn in counts.keys():
            if sum([c[fn]
                    for c in counts_all_relative]) == 0 and options.removed:
                continue
            else:
                output_handle.write(
                    '\t'.join([fn] + [str(features_size[fn])] +
                              [str(c[fn])
                               for c in counts_all_relative]) + '\n')
        output_handle.close()

        self.logger.info('Print raw base abundance matrix in %s' %
                         base_abundance)
        output_handle = open(base_abundance, "w")
        output_handle.write('\t'.join(header) + '\n')
        for fn in counts_base.keys():
            if sum([c[fn] for c in counts_base_all]) == 0 and options.removed:
                continue
            else:
                output_handle.write(
                    '\t'.join([fn] + [str(features_size[fn])] +
                              [str(c[fn]) for c in counts_base_all]) + '\n')
        output_handle.close()

        self.logger.info('Print normalised base abundance matrix in %s' %
                         base_normalised)
        output_handle = open(base_normalised, "w")
        output_handle.write('\t'.join(header) + '\n')
        for fn in counts_base.keys():
            if sum([c[fn]
                    for c in counts_base_all_normalised]) == 0 and options.removed:
                continue
            else:
                output_handle.write(
                    '\t'.join([fn] + [str(features_size[fn])] +
                              [str(c[fn])
                               for c in counts_base_all_normalised]) + '\n')
        output_handle.close()

        self.logger.info('Print relative base abundance matrix in %s' %
                         base_relative)
        output_handle = open(base_relative, "w")
        output_handle.write('\t'.join(header) + '\n')
        for fn in counts_base.keys():
            if sum([c[fn]
                    for c in counts_base_all_relative]) == 0 and options.removed:
                continue
            else:
                output_handle.write(
                    '\t'.join([fn] + [str(features_size[fn])] +
                              [str(c[fn])
                               for c in counts_base_all_relative]) + '\n')
        output_handle.close()

        self.logger.info('Matrices printed')
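The TPM matrix above follows the standard two-step recipe: divide each feature's raw read count by its length in kilobases (reads per kilobase, RPK), then scale each sample so its RPK values sum to one million. A self-contained sketch of that computation (tpm is illustrative and independent of the class above):

def tpm(read_counts, lengths_bp):
    """Transcripts-per-million from raw read counts and feature lengths in bp."""
    rpk = {f: n / (lengths_bp[f] / 1000.0) for f, n in read_counts.items()}
    scale = sum(rpk.values()) / 1e6 or 1.0  # guard against an all-zero sample
    return {f: v / scale for f, v in rpk.items()}

print(tpm({'geneA': 10, 'geneB': 20}, {'geneA': 1000, 'geneB': 4000}))
# geneA ≈ 666667, geneB ≈ 333333; TPM values always sum to 1e6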
Example #29
    def run(self, scaffold_stats, num_clusters, num_components, K, no_coverage, no_pca, iterations, genome_file, output_dir):
        """Calculate statistics for genomes.

        Parameters
        ----------
        scaffold_stats : ScaffoldStats
            Statistics for individual scaffolds.
        num_clusters : int
            Number of clusters to form.
        num_components : int
            Number of PCA components to consider.
        K : int
            K-mer size to use for calculating genomic signature.
        no_coverage : boolean
            Flag indicating if coverage information should be used during clustering.
        no_pca : boolean
            Flag indicating if PCA of genomic signature should be calculated.
        iterations : int
            Iterations of clustering to perform.
        genome_file : str
            Sequences being clustered.
        output_dir : str
            Directory to write results.
        """

        # get GC and mean coverage for each scaffold in genome
        self.logger.info('')
        self.logger.info('  Determining mean coverage and genomic signatures.')
        signatures = GenomicSignature(K)
        genome_stats = []
        signature_matrix = []
        seqs = seq_io.read(genome_file)
        for seq_id, seq in seqs.items():
            stats = scaffold_stats.stats[seq_id]

            if not no_coverage:
                genome_stats.append((np_mean(stats.coverage)))
            else:
                genome_stats.append(())

            if K == 0:
                pass
            elif K == 4:
                signature_matrix.append(stats.signature)
            else:
                sig = signatures.seq_signature(seq)
                total_kmers = sum(sig)
                for i in range(0, len(sig)):
                    sig[i] = float(sig[i]) / total_kmers
                signature_matrix.append(sig)

        # calculate PCA of tetranucleotide signatures
        if K != 0:
            if not no_pca:
                self.logger.info('  Calculating PCA of genomic signatures.')
                pc, variance = self.pca(signature_matrix)
                self.logger.info('    First %d PCs capture %.1f%% of the variance.' % (num_components, sum(variance[0:num_components]) * 100))
    
                for i, stats in enumerate(genome_stats):
                    genome_stats[i] = np_append(stats, pc[i][0:num_components])
            else:
                self.logger.info('  Using complete genomic signature.')
                for i, stats in enumerate(genome_stats):
                    genome_stats[i] = np_append(stats, signature_matrix[i])

        # whiten data if feature matrix contains coverage and genomic signature data
        if not no_coverage and K != 0:
            self.logger.info('  Whitening data.')
            genome_stats = whiten(genome_stats)
        else:
            genome_stats = np_array(genome_stats)

        # cluster
        self.logger.info('  Partitioning genome into %d clusters.' % num_clusters)

        bError = True
        while bError:
            try:
                bError = False
                _centroids, labels = kmeans2(genome_stats, num_clusters, iterations, minit='points', missing='raise')
            except ClusterError:
                bError = True

        for k in range(num_clusters):
            self.logger.info('    Placed %d sequences in cluster %d.' % (sum(labels == k), (k + 1)))

        # write out clusters
        genome_id = remove_extension(genome_file)
        seq_ids = list(seqs)
        for k in range(num_clusters):
            fout = open(os.path.join(output_dir, genome_id + '_c%d' % (k + 1) + '.fna'), 'w')
            for i in np_where(labels == k)[0]:
                seq_id = seq_ids[i]
                fout.write('>' + seq_id + '\n')
                fout.write(seqs[seq_id] + '\n')
            fout.close()
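The whitening step above matters because coverage values and signature frequencies live on very different scales: scipy's whiten rescales each feature column to unit variance so that no single column dominates the Euclidean distances used by k-means. A toy demonstration with invented values:

import numpy as np
from scipy.cluster.vq import whiten, kmeans2

# Four scaffolds described by (mean coverage, one signature frequency);
# without whitening, the coverage column would dominate the clustering.
features = np.array([[120.0, 0.02], [118.0, 0.03], [15.0, 0.25], [17.0, 0.24]])
centroids, labels = kmeans2(whiten(features), 2, minit='points')
print(labels)  # e.g. [0 0 1 1]; cluster numbering is arbitrary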
Example #30
    def run(self, genome_files, db_file, taxonomy_file, evalue, per_identity, window_size, step_size):
        """Create taxonomic profiles for a set of genomes.

        Parameters
        ----------
        genome_files : list of str
            Fasta files of genomes to process.
        db_file : str
            Database of reference genes.
        taxonomy_file : str
            File containing GreenGenes taxonomy strings for reference genomes.
        evalue : float
            E-value threshold used by blast.
        per_identity: float
            Percent identity threshold used by blast.
        window_size : int
            Size of each fragment.
        step_size : int
            Number of bases to move after each window.
        """

        # parse taxonomy file
        self.logger.info('  Reading taxonomic assignment of reference genomes.')
        taxonomy = Taxonomy().read(taxonomy_file)

        # fragment each genome into fixed sizes windows
        self.logger.info('')
        self.logger.info('  Fragmenting sequences in each bin:')
        diamond_output_dir = os.path.join(self.output_dir, 'diamond')
        make_sure_path_exists(diamond_output_dir)

        fragment_file = os.path.join(diamond_output_dir, 'fragments.fna')
        fragment_out = open(fragment_file, 'w')
        contig_id_to_genome_id = {}
        for genome_file in genome_files:
            genome_id = remove_extension(genome_file)
            self.profiles[genome_id] = Profile(genome_id, taxonomy)
            self._fragment_genomes(genome_file,
                                  window_size,
                                  step_size,
                                  self.profiles[genome_id],
                                  fragment_out)

            for seq_id, _seq in seq_io.read_seq(genome_file):
                contig_id_to_genome_id[seq_id] = genome_id

        # run diamond
        self.logger.info('')
        self.logger.info('  Running diamond blastx with %d processes (be patient!)' % self.cpus)

        diamond = Diamond(self.cpus)
        diamond_daa_out = os.path.join(diamond_output_dir, 'diamond_hits')
        diamond.blastx(fragment_file, db_file, evalue, per_identity, 1, diamond_daa_out)

        diamond_table_out = os.path.join(diamond_output_dir, 'diamond_hits.tsv')
        diamond.view(diamond_daa_out + '.daa', diamond_table_out)

        self.logger.info('')
        self.logger.info('  Creating taxonomic profile for each genome.')
        self._taxonomic_profiles(diamond_table_out, taxonomy, contig_id_to_genome_id)

        self.logger.info('')
        self.logger.info('  Writing taxonomic profile for each genome.')

        report_dir = os.path.join(self.output_dir, 'bin_reports')
        make_sure_path_exists(report_dir)

        for genome_id, profile in self.profiles.iteritems():
            seq_summary_out = os.path.join(report_dir, genome_id + '.sequences.tsv')
            profile.write_seq_summary(seq_summary_out)

            genome_profile_out = os.path.join(report_dir, genome_id + '.profile.tsv')
            profile.write_genome_profile(genome_profile_out)

        genome_summary_out = os.path.join(self.output_dir, 'genome_summary.tsv')
        self._write_genome_summary(genome_summary_out)

        # create Krona plot
        krona_profiles = defaultdict(lambda: defaultdict(int))
        for genome_id, profile in self.profiles.iteritems():
            seq_assignments = profile.classify_seqs(taxonomy)

            for seq_id, classification in seq_assignments.iteritems():
                taxa = []
                for r in xrange(0, len(profile.rank_labels)):
                    taxa.append(classification[r][0])

                krona_profiles[genome_id][';'.join(taxa)] += profile.seq_len[seq_id]

        krona = Krona()
        krona_output_file = os.path.join(self.output_dir, 'taxonomic_profiles.krona.html')
        krona.create(krona_profiles, krona_output_file)
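
The _fragment_genomes helper is not shown in this example. A hypothetical sketch of the windowing it presumably performs, emitting a fragment of window_size bases every step_size bases (the fragment naming scheme is illustrative):

def fragment_seq(seq_id, seq, window_size, step_size):
    """Yield (fragment_id, fragment) pairs covering seq in fixed-size windows."""
    for start in range(0, max(len(seq) - window_size, 0) + 1, step_size):
        yield '%s~%d' % (seq_id, start), seq[start:start + window_size]
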
Example #31
0
    def kmeans(self, scaffold_stats, num_clusters, num_components, K,
               no_coverage, no_pca, iterations, genome_file, output_dir):
        """Cluster genome with k-means.

        Parameters
        ----------
        scaffold_stats : ScaffoldStats
            Statistics for individual scaffolds.
        num_clusters : int
            Number of clusters to form.
        num_components : int
            Number of PCA components to consider.
        K : int
            K-mer size to use for calculating the genomic signature.
        no_coverage : boolean
            Flag indicating that coverage information should not be used during clustering.
        no_pca : boolean
            Flag indicating that PCA of the genomic signature should not be calculated.
        iterations : int
            Number of iterations to perform during clustering.
        genome_file : str
            Sequences being clustered.
        output_dir : str
            Directory to write results.
        """

        # get GC and mean coverage for each scaffold in genome
        self.logger.info('Determining mean coverage and genomic signatures.')
        signatures = GenomicSignature(K)
        genome_stats = []
        signature_matrix = []
        seqs = seq_io.read(genome_file)
        for seq_id, seq in seqs.items():
            stats = scaffold_stats.stats[seq_id]

            if not no_coverage:
                genome_stats.append((np_mean(stats.coverage)))
            else:
                genome_stats.append(())

            if K == 0:
                pass
            elif K == 4:
                signature_matrix.append(stats.signature)
            else:
                sig = signatures.seq_signature(seq)
                total_kmers = sum(sig)
                for i in range(0, len(sig)):
                    sig[i] = float(sig[i]) / total_kmers
                signature_matrix.append(sig)

        # calculate PCA of signatures
        if K != 0:
            if not no_pca:
                self.logger.info('Calculating PCA of genomic signatures.')
                pc, variance = self.pca(signature_matrix)
                self.logger.info(
                    'First {:,} PCs capture {:.1f}% of the variance.'.format(
                        num_components,
                        sum(variance[0:num_components]) * 100))

                for i, stats in enumerate(genome_stats):
                    genome_stats[i] = np_append(stats, pc[i][0:num_components])
            else:
                self.logger.info('Using complete genomic signature.')
                for i, stats in enumerate(genome_stats):
                    genome_stats[i] = np_append(stats, signature_matrix[i])

        # whiten data if feature matrix contains coverage and genomic signature data
        if not no_coverage and K != 0:
            self.logger.info('Whitening data.')
            genome_stats = whiten(genome_stats)
        else:
            genome_stats = np_array(genome_stats)

        # cluster
        self.logger.info(
            'Partitioning genome into {:,} clusters.'.format(num_clusters))

        bError = True
        while bError:
            try:
                bError = False
                _centroids, labels = kmeans2(genome_stats,
                                             num_clusters,
                                             iterations,
                                             minit='points',
                                             missing='raise')
            except ClusterError:
                bError = True

        for k in range(num_clusters):
            self.logger.info('Placed {:,} sequences in cluster {:,}.'.format(
                sum(labels == k), (k + 1)))

        # write out clusters
        genome_id = remove_extension(genome_file)
        for k in range(num_clusters):
            fout = open(
                os.path.join(output_dir,
                             genome_id + '_c%d' % (k + 1) + '.fna'), 'w')
            for i in np_where(labels == k)[0]:
                seq_id = list(seqs.keys())[i]
                fout.write('>' + seq_id + '\n')
                fout.write(seqs[seq_id] + '\n')
            fout.close()
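
The self.pca method is not shown here. One common implementation, sketched as a centred SVD under the assumption that rows are scaffolds and columns are k-mer frequencies (an assumption about the method's behaviour, not its actual source):

import numpy as np

def pca(signature_matrix):
    X = np.asarray(signature_matrix, dtype=float)
    X -= X.mean(axis=0)  # centre each k-mer frequency
    U, s, _Vt = np.linalg.svd(X, full_matrices=False)
    variance = (s ** 2) / np.sum(s ** 2)  # fraction of variance per component
    return U * s, variance  # per-scaffold principal components, variances
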
Example #32
0
    def run(self, input_tree,
                    msa_file, 
                    marker_info_file, 
                    mask_file, 
                    perc_markers_to_keep, 
                    num_replicates, 
                    model,
                    jk_dir,
                    output_dir):
        """Jackknife marker genes.

        Marker file should have the format:
          <marker id>\t<marker name>\t<marker desc>\t<length>\n

        Parameters
        ----------
        input_tree : str
          Tree inferred with all data.
        msa_file : str
          File containing multiple sequence alignment for all taxa.
        marker_info_file : str
          File indicating database id, HMM name, description and length of each marker in the alignment.
        mask_file : str
          File indicating masking of multiple sequence alignment.
        perc_markers_to_keep : float [0, 1]
          Percentage of marker genes to keep in each replicate.
        num_replicates : int
          Number of replicates to perform.
        model : str
          Desired model of evolution.
        jk_dir : str
          Directory containing pre-computed jackknife replicate trees, if any.
        output_dir : str
          Output directory for jackknife trees.
        """

        assert(model in ['wag', 'jtt'])

        self.model = model
        self.perc_markers_to_keep = perc_markers_to_keep
        
        
        # determine length of each marker gene in alignment
        rep_tree_files = []
        if not jk_dir:
            self.replicate_dir = os.path.join(output_dir, 'replicates')
            make_sure_path_exists(self.replicate_dir)
            
            marker_lengths = []
            total_len = 0
            with open(marker_info_file) as f:
                f.readline()
                for line in f:
                    line_split = line.split('\t')
                    ml = int(line_split[3])
                    marker_lengths.append(ml)
                    total_len += ml
                    
            self.logger.info('Concatenated length of markers: %d' % total_len)
                    
            # read mask
            mask = open(mask_file).readline().strip()
            start = 0
            self.marker_lengths = []
            total_mask_len = 0
            for ml in marker_lengths:
                end = start + ml
                zeros = mask[start:end].count('0')
                start = end
                
                self.marker_lengths.append(ml - zeros)
                total_mask_len += ml - zeros
                
            self.logger.info('Concatenated length of filtered MSA: %d' % total_mask_len)

            # read full multiple sequence alignment
            self.msa = seq_io.read(msa_file)
            
            if len(self.msa.values()[0]) != total_mask_len:
                self.logger.error('Length of MSA does not match length of mask.')
                sys.exit()

            # calculate replicates
            self.logger.info('Calculating jackknife marker replicates:')
            parallel = Parallel(self.cpus)
            parallel.run(self._producer, None, xrange(num_replicates), self._progress)

            # calculate support
            self.logger.info('Calculating support for %d replicates.' % num_replicates)
            for rep_index in xrange(num_replicates):
                rep_tree_files.append(os.path.join(self.replicate_dir, 'jk_markers.tree.' + str(rep_index) + '.tre'))
        else:
            for f in os.listdir(jk_dir):
                if f.endswith('.tree') or f.endswith('.tre'):
                    rep_tree_files.append(os.path.join(jk_dir, f))
            self.logger.info('Read %d jackknife replicates.' % len(rep_tree_files))

        output_tree = os.path.join(output_dir, remove_extension(input_tree) + '.jk_markers.tree')
        bootstrap_support(input_tree, rep_tree_files, output_tree)

        return output_tree
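
The _producer worker is not shown. A hypothetical sketch of what a single jackknife replicate involves, given the marker lengths and masking handled above: keep a random subset of markers and splice their columns out of the concatenated alignment.

import random

def jackknife_msa(msa, marker_lengths, perc_markers_to_keep):
    """Return an MSA reduced to a random subset of markers.

    msa: dict of taxon -> aligned sequence (concatenated markers).
    marker_lengths: per-marker column counts, in concatenation order.
    """
    keep = set(random.sample(range(len(marker_lengths)),
                             int(len(marker_lengths) * perc_markers_to_keep)))

    # column interval occupied by each marker in the concatenated MSA
    offsets, start = [], 0
    for ml in marker_lengths:
        offsets.append((start, start + ml))
        start += ml

    return {taxon: ''.join(seq[s:e]
                           for i, (s, e) in enumerate(offsets) if i in keep)
            for taxon, seq in msa.items()}
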
Example #33
0
    def add_compatible_closest(self, scaffold_file, genome_file, compatible_file, min_len, out_genome):
        """Add sequences specified as compatible.

        A sequence is added to a bin if and only if it is
        closest to that bin in GC, tetranucleotide, and
        coverage space.

        Parameters
        ----------
        scaffold_file : str
            Fasta file containing scaffolds to add.
        genome_file : str
            Fasta file of binned scaffolds.
        compatible_file : str
            File specifying compatible scaffolds.
        min_len : int
            Minimum length to add scaffold.
        out_genome : str
            Name of output genome.
        """

        cur_bin_id = remove_extension(genome_file)

        # determine statistics for each potentially compatible scaffold
        scaffold_ids = defaultdict(dict)
        with open(compatible_file) as f:
            headers = [x.strip() for x in f.readline().split('\t')]
            scaffold_gc_index = headers.index('Scaffold GC')
            genome_gc_index = headers.index('Median genome GC')
            td_dist_index = headers.index('Scaffold TD')
            scaffold_cov_index = headers.index('Scaffold coverage')
            genome_cov_index = headers.index('Median genome coverage')

            for line in f:
                line_split = line.split('\t')
                scaffold_id = line_split[0]
                bin_id = line_split[1].strip()

                scaffold_gc = float(line_split[scaffold_gc_index])
                genome_gc = float(line_split[genome_gc_index])
                gc_dist = abs(scaffold_gc - genome_gc)

                td_dist = float(line_split[td_dist_index])

                scaffold_cov = float(line_split[scaffold_cov_index])
                genome_cov = float(line_split[genome_cov_index])
                cov_dist = abs(scaffold_cov - genome_cov)

                scaffold_ids[scaffold_id][bin_id] = [gc_dist, td_dist, cov_dist]

        # determine scaffolds that are closest to a single bin
        # in terms of GC, tetranucleotide distance, and coverage
        compatible_scaffolds = set()
        for scaffold_id, bin_stats in scaffold_ids.items():
            best_gc = [1e9, None]
            best_td = [1e9, None]
            best_cov = [1e9, None]
            for bin_id, stats in bin_stats.items():
                gc, td, cov = stats
                if gc < best_gc[0]:
                    best_gc = [gc, bin_id]
                if td < best_td[0]:
                    best_td = [td, bin_id]
                if cov < best_cov[0]:
                    best_cov = [cov, bin_id]

            # check if scaffold is closest to a single bin
            if (best_gc[1] == best_td[1] == best_cov[1]) and best_gc[1] == cur_bin_id:
                compatible_scaffolds.add(scaffold_id)
                
        self.logger.info('Identified {:,} compatible scaffolds.'.format(len(compatible_scaffolds)))

        # add compatible sequences to genome
        added_seqs = 0
        genome_seqs = seq_io.read(genome_file)
        for seq_id, seq in seq_io.read_seq(scaffold_file):
            if seq_id in compatible_scaffolds:
                if len(seq) >= min_len:
                    genome_seqs[seq_id] = seq
                    added_seqs += 1
                
        self.logger.info('Added {:,} scaffolds meeting length criterion.'.format(added_seqs))

        # save modified bin
        seq_io.write_fasta(genome_seqs, out_genome)
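
The acceptance test above, distilled: a scaffold is kept only when a single bin minimises the GC, tetranucleotide, and coverage distances simultaneously (and that bin is the one being processed). A minimal sketch of the core check:

def closest_bin(bin_stats):
    """bin_stats: dict of bin_id -> (gc_dist, td_dist, cov_dist).

    Return the bin closest in all three spaces, or None if no single
    bin wins on every criterion.
    """
    winners = {min(bin_stats, key=lambda b: bin_stats[b][i]) for i in range(3)}
    return winners.pop() if len(winners) == 1 else None
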
Example #34
0
    def rblast(self, options):
        """Reciprocal blast command"""
        self.logger.info('')
        self.logger.info('*******************************************************************************')
        self.logger.info(' [CompareM - rblast] Performing reciprocal blast between genomes.')
        self.logger.info('*******************************************************************************')

        check_dir_exists(options.protein_dir)
        make_sure_path_exists(options.output_dir)

        aa_gene_files = []
        for f in os.listdir(options.protein_dir):
            if f.endswith(options.protein_ext):
                aa_gene_files.append(os.path.join(options.protein_dir, f))

        if not aa_gene_files:
            self.logger.warning('  [Warning] No gene files found. Check the --protein_ext flag used to identify gene files.')
            sys.exit()

        # modify gene ids to include genome ids in order to ensure
        # all gene identifiers are unique across the set of genomes,
        # also removes the trailing asterisk used to identify the stop
        # codon
        self.logger.info('')
        self.logger.info('  Appending genome identifiers to all gene identifiers.')
        gene_out_dir = os.path.join(options.output_dir, 'genes')
        make_sure_path_exists(gene_out_dir)
        modified_aa_gene_files = []
        for gf in aa_gene_files:
            genome_id = remove_extension(gf)

            aa_file = os.path.join(gene_out_dir, genome_id + '.faa')
            fout = open(aa_file, 'w')
            for seq_id, seq, annotation in seq_io.read_fasta_seq(gf, keep_annotation=True):
                fout.write('>' + seq_id + '~' + genome_id + ' ' + annotation + '\n')
                if seq[-1] == '*':
                    seq = seq[0:-1]
                fout.write(seq + '\n')
            fout.close()

            modified_aa_gene_files.append(aa_file)

        # perform the reciprocal blast with blastp or diamond
        self.logger.info('')
        if options.blastp:
            rblast = ReciprocalBlast(options.cpus)
            rblast.run(modified_aa_gene_files, options.evalue, options.output_dir)

            # concatenate all blast tables to mimic output of diamond, all hits
            # for a given genome MUST be in consecutive order to fully mimic
            # the expected results from diamond
            self.logger.info('')
            self.logger.info('  Creating single file with all blast hits (be patient!).')
            blast_files = sorted([f for f in os.listdir(options.output_dir) if f.endswith('.blastp.tsv')])
            hit_tables = [os.path.join(options.output_dir, f) for f in blast_files]
            concatenate_files(hit_tables, os.path.join(options.output_dir, 'all_hits.tsv'))
        else:
            rdiamond = ReciprocalDiamond(options.cpus)
            rdiamond.run(modified_aa_gene_files, options.evalue, options.per_identity, options.output_dir)

        self.logger.info('')
        self.logger.info('  Reciprocal blast hits written to: %s' % options.output_dir)

        self.time_keeper.print_time_stamp()
Example #35
0
    def _producer(self, genome_file):
        """Apply prodigal to genome with most suitable translation table.

        Parameters
        ----------
        genome_file : str
            Fasta file for genome.
        """

        genome_id = remove_extension(genome_file)

        aa_gene_file = os.path.join(self.output_dir, genome_id + '_genes.faa')
        nt_gene_file = os.path.join(self.output_dir, genome_id + '_genes.fna')
        gff_file = os.path.join(self.output_dir, genome_id + '.gff')

        best_translation_table = -1
        table_coding_density = {4:-1, 11:-1}
        if self.called_genes:
            os.system('cp %s %s' % (os.path.abspath(genome_file), aa_gene_file))
        else:
            tmp_dir = tempfile.mkdtemp()

            seqs = read_fasta(genome_file)

            # determine number of bases
            total_bases = 0
            for seq in seqs.values():
                total_bases += len(seq)

            # call genes under different translation tables
            if self.translation_table:
                translation_tables = [self.translation_table]
            else:
                translation_tables = [4, 11]

            for translation_table in translation_tables:
                os.makedirs(os.path.join(tmp_dir, str(translation_table)))
                aa_gene_file_tmp = os.path.join(tmp_dir, str(translation_table), genome_id + '_genes.faa')
                nt_gene_file_tmp = os.path.join(tmp_dir, str(translation_table), genome_id + '_genes.fna')
                gff_file_tmp = os.path.join(tmp_dir, str(translation_table), genome_id + '.gff')

                # check if there are sufficient bases to calculate prodigal parameters
                if total_bases < 100000 or self.meta:
                    proc_str = 'meta'  # use best precalculated parameters
                else:
                    proc_str = 'single'  # estimate parameters from data
                    
                args = '-m'
                if self.closed_ends:
                    args += ' -c'

                cmd = 'prodigal %s -p %s -q -f gff -g %d -a %s -d %s -i %s > %s 2> /dev/null' % (args,
                                                                                            proc_str,
                                                                                            translation_table,
                                                                                            aa_gene_file_tmp,
                                                                                            nt_gene_file_tmp,
                                                                                            genome_file,
                                                                                            gff_file_tmp)
                os.system(cmd)

                # determine coding density
                prodigalParser = ProdigalGeneFeatureParser(gff_file_tmp)

                codingBases = 0
                for seq_id, _seq in seqs.items():
                    codingBases += prodigalParser.coding_bases(seq_id)

                codingDensity = float(codingBases) / total_bases
                table_coding_density[translation_table] = codingDensity

            # determine best translation table
            if not self.translation_table:
                best_translation_table = 11
                if (table_coding_density[4] - table_coding_density[11] > 0.05) and table_coding_density[4] > 0.7:
                    best_translation_table = 4
            else:
                best_translation_table = self.translation_table

            shutil.copyfile(os.path.join(tmp_dir, str(best_translation_table), genome_id + '_genes.faa'), aa_gene_file)
            shutil.copyfile(os.path.join(tmp_dir, str(best_translation_table), genome_id + '_genes.fna'), nt_gene_file)
            shutil.copyfile(os.path.join(tmp_dir, str(best_translation_table), genome_id + '.gff'), gff_file)

            # clean up temporary files
            shutil.rmtree(tmp_dir)

        return (genome_id, aa_gene_file, nt_gene_file, gff_file, best_translation_table, table_coding_density[4], table_coding_density[11])
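
The table-selection rule at the end of the worker, in isolation: default to translation table 11 and prefer table 4 only when it yields a markedly higher coding density. A minimal sketch:

def pick_translation_table(coding_density):
    """coding_density: dict mapping translation table (4, 11) -> density in [0, 1]."""
    if coding_density[4] - coding_density[11] > 0.05 and coding_density[4] > 0.7:
        return 4
    return 11
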
Example #36
0
    def run(self,
                taxonomy_file, type_strains_file,
                genome_prot_dir, extension,
                max_taxa, rank,
                per_identity, per_aln_len,
                genomes_to_process, keep_all_genes,
                no_reformat_gene_ids,
                output_dir):
        """ Create dereplicate set of genes.

        Taxonomy file should have the following format:
            <genome_id>\t<taxonomy_str>

            where taxonomy_str is in GreenGenes format:
                d__Bacteria;p__Proteobacteria;...;s__Escherichia coli

        Type strain file should have the following format:
            <genome_id>\t<genome name>

        Parameters
        ----------
        taxonomy_file : str
            File indicating taxonomy string for all genomes of interest
        type_strains_file : str
            File indicating type strains.
        genome_prot_dir : str
            Directory containing amino acid genes for each genome.
        extension : str
            Extension of files with called genes.
        max_taxa : int
            Maximum taxa to retain in a named group.
        rank : int
            Taxonomic rank to perform dereplication (0 = domain, ..., 6 = species).
        per_identity : float
            Percent identity for subsampling similar genes.
        per_aln_len : float
            Percent alignment length for subsampling similar genes.
        genomes_to_process : str
            File with list of genomes to retain instead of performing taxon subsampling.
        keep_all_genes : boolean
            Flag indicating that no gene subsampling should be performed.
        no_reformat_gene_ids : boolean
            Flag indicating if gene ids should be reformatted to include scaffold names given by the GFF file.
        output_dir : str
            Desired output directory for storing results.
        """

        make_sure_path_exists(output_dir)
        self.logger.info('Dereplicating at the rank of %s.' % self.rank_labels[rank])

        # get taxonomy string for each genome
        taxonomy = {}
        if taxonomy_file:
            self.logger.info('Reading taxonomy file.')
            taxonomy = Taxonomy().read(taxonomy_file)
            self.logger.info('There are %d genomes with taxonomy strings.' % len(taxonomy))

        # get type strains; genomes which should never be dereplicated
        type_strains = set()
        if type_strains_file:
            self.logger.info('Reading type strain file.')
            type_strains = self.read_type_strain(type_strains_file)
            self.logger.info('There are %d type strains.' % len(type_strains))

        # get specific list of genomes to process
        genomes_to_retain = set()
        if genomes_to_process:
            self.logger.info('Reading genomes to retain.')
            for line in open(genomes_to_process):
                line_split = line.split()
                genomes_to_retain.add(line_split[0])
            self.logger.info('Retaining %d genomes.' % len(genomes_to_retain))
            
        # make sure extension filter starts with a '.'
        if not extension.startswith('.'):
            extension = '.' + extension

        # identify unique genes in each named group
        fout = open(os.path.join(output_dir, 'genomes_without_called_genes.tsv'), 'w')
        rank_genomes = defaultdict(list)
        genome_files = os.listdir(genome_prot_dir)
        underclassified_genomes = 0
        genomes_with_missing_data = 0
        for genome_file in genome_files:
            genome_id = remove_extension(genome_file, extension)

            if not genome_file.endswith(extension):
                continue

            if genomes_to_process and genome_id not in genomes_to_retain:
                continue

            genome_file = os.path.join(genome_prot_dir, genome_file)
            if not os.path.exists(genome_file):
                genomes_with_missing_data += 1
                fout.write(genome_id + '\t' + ';'.join(taxonomy[genome_id]) + '\n')
                continue

            t = taxonomy.get(genome_id, self.rank_prefixes)
            taxa = t[rank]
            if taxa[3:] == '':
                underclassified_genomes += 1
                rank_genomes[self.underclassified].append(genome_id)
            else:
                rank_genomes[taxa].append(genome_id)

            validate_seq_ids(genome_file)

        fout.close()

        total_genomes_to_process = sum([len(genome_list) for genome_list in rank_genomes.values()])
        if total_genomes_to_process == 0:
            self.logger.error('No genomes found in directory: %s. Check the --extension flag used to identify genomes.' % genome_prot_dir)
            sys.exit(-1)

        self.logger.info('Under-classified genomes automatically placed into the database: %d' % underclassified_genomes)
        self.logger.info('Genomes with missing sequence data: %d' % genomes_with_missing_data)
        self.logger.info('Total named groups: %d' % len(rank_genomes))
        self.logger.info('Total genomes to process: %d' % total_genomes_to_process)

        # process each named group
        gene_file = os.path.join(output_dir, 'custom_db.faa')
        gene_out = open(gene_file, 'w')

        taxonomy_out = open(os.path.join(output_dir, 'custom_taxonomy.tsv'), 'w')

        tmp_dir = tempfile.mkdtemp()
        total_genes_removed = 0
        total_genes_kept = 0
        total_genomes_kept = 0
        processed_genomes = 0
        for taxa, genome_list in rank_genomes.iteritems():
            processed_genomes += len(genome_list)

            self.logger.info('-------------------------------------------------------------------------------')
            self.logger.info('Processing %s | Finished %d of %d (%.2f%%) genomes.' % (taxa, processed_genomes, total_genomes_to_process, processed_genomes * 100.0 / total_genomes_to_process))

            # create directory with selected genomes
            taxon_dir = os.path.join(tmp_dir, 'taxon')
            os.mkdir(taxon_dir)

            reduced_genome_list = genome_list
            if not genomes_to_process and taxa != self.underclassified:  # perform taxon subsampling
                reduced_genome_list = self.select_taxa(genome_list, taxonomy, type_strains, max_taxa)
            total_genomes_kept += len(reduced_genome_list)

            gene_dir = os.path.join(taxon_dir, 'genes')
            os.mkdir(gene_dir)
            for genome_id in reduced_genome_list:
                taxonomy_out.write(genome_id + '\t' + ';'.join(taxonomy.get(genome_id, self.rank_prefixes)) + '\n')

                genome_gene_file = os.path.join(genome_prot_dir, genome_id + extension)
                gff_file = os.path.join(genome_prot_dir, genome_id + '.gff')
                output_gene_file = os.path.join(gene_dir, genome_id + '.faa')
                if not no_reformat_gene_ids:
                    self.reformat_gene_id_to_scaffold_id(genome_gene_file, gff_file, taxonomy, output_gene_file)
                else:
                    os.system('cp %s %s' % (genome_gene_file, output_gene_file))

            # filter genes based on amino acid identity
            genes_to_remove = []
            amended_gene_dir = os.path.join(taxon_dir, 'amended_genes')
            if keep_all_genes or taxa == self.underclassified:
                # modify gene identifiers to include genome ids
                self.amend_gene_identifies(gene_dir, amended_gene_dir)
            else:
                # filter genes on AAI
                genes_to_remove = self.filter_aai(taxon_dir, gene_dir, amended_gene_dir, per_identity, per_aln_len, self.cpus)

            self.logger.info('Writing unique genes from genomes in %s.' % taxa)
            genes_kept = self.write_gene_file(gene_out, amended_gene_dir, reduced_genome_list, taxonomy, genes_to_remove)

            self.logger.info('Retained %d of %d genomes.' % (len(reduced_genome_list), len(genome_list)))
            self.logger.info('Genes to keep: %d' % genes_kept)
            self.logger.info('Genes removed: %d' % len(genes_to_remove))

            total_genes_kept += genes_kept
            total_genes_removed += len(genes_to_remove)

            shutil.rmtree(taxon_dir)

        taxonomy_out.close()
        gene_out.close()

        self.logger.info('Retained %d of %d (%.1f%%) genomes.' % (total_genomes_kept, total_genomes_to_process, total_genomes_kept * 100.0 / (total_genomes_to_process)))
        self.logger.info('Total genes kept: %d' % total_genes_kept)
        self.logger.info('Total genes removed: %d (%.1f%%)' % (total_genes_removed, total_genes_removed * 100.0 / (total_genes_kept + total_genes_removed)))

        self.logger.info('Creating BLAST database.')
        os.system('makeblastdb -dbtype prot -in %s' % gene_file)

        shutil.rmtree(tmp_dir)
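
Taxonomy().read is defined elsewhere; a hypothetical reader for the two-column format described in the docstring, mapping each genome ID to its list of GreenGenes-style ranks:

def read_taxonomy(taxonomy_file):
    taxonomy = {}
    with open(taxonomy_file) as f:
        for line in f:
            genome_id, tax_str = line.rstrip('\n').split('\t')
            taxonomy[genome_id] = [t.strip() for t in tax_str.split(';')]
    return taxonomy
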
Example #37
0
    def run(self, input_dir, tmp_dir, threads):
        # get path to all unprocessed genome files
        print 'Reading genomes.'
        genome_files = []
        for genome_dir in os.listdir(input_dir):
            cur_genome_dir = os.path.join(input_dir, genome_dir)
            if not os.path.isdir(cur_genome_dir):
                continue

            for assembly_id in os.listdir(cur_genome_dir):
                assembly_dir = os.path.join(cur_genome_dir, assembly_id)
                genome_id = assembly_id[0:assembly_id.find('_', 4)]

                # check if prodigal has already been called
                aa_gene_file = os.path.join(assembly_dir, 'prodigal',
                                            genome_id + '_protein.faa')
                if os.path.exists(aa_gene_file):
                    # verify checksum
                    checksum_file = aa_gene_file + '.sha256'
                    if os.path.exists(checksum_file):
                        checksum = sha256(aa_gene_file)
                        cur_checksum = open(checksum_file).readline().strip()
                        if checksum == cur_checksum:
                            continue

                genome_file = os.path.join(assembly_dir,
                                           assembly_id + '_genomic.fna')
                if os.path.exists(genome_file):
                    if os.stat(genome_file).st_size == 0:
                        print '[Warning] Genome file appears to be empty: %s' % genome_file
                    else:
                        genome_files.append(genome_file)

        print '  Number of unprocessed genomes: %d' % len(genome_files)

        # run prodigal on each genome
        print 'Running prodigal.'
        prodigal = Prodigal(cpus=threads)
        summary_stats = prodigal.run(genome_files, output_dir=tmp_dir)

        # move results into individual genome directories
        print 'Moving files and calculating checksums.'
        for genome_file in genome_files:
            genome_path, genome_id = ntpath.split(genome_file)
            genome_id = remove_extension(genome_id)

            aa_gene_file = os.path.join(tmp_dir, genome_id + '_genes.faa')
            nt_gene_file = os.path.join(tmp_dir, genome_id + '_genes.fna')
            gff_file = os.path.join(tmp_dir, genome_id + '.gff')

            genome_root = genome_id[0:genome_id.find('_', 4)]
            prodigal_path = os.path.join(genome_path, 'prodigal')
            if not os.path.exists(prodigal_path):
                os.makedirs(prodigal_path)
            new_aa_gene_file = os.path.join(prodigal_path,
                                            genome_root + '_protein.faa')
            new_nt_gene_file = os.path.join(prodigal_path,
                                            genome_root + '_protein.fna')
            new_gff_file = os.path.join(prodigal_path,
                                        genome_root + '_protein.gff')

            os.system('mv %s %s' % (aa_gene_file, new_aa_gene_file))
            os.system('mv %s %s' % (nt_gene_file, new_nt_gene_file))
            os.system('mv %s %s' % (gff_file, new_gff_file))

            # save translation table information
            translation_table_file = os.path.join(
                prodigal_path, 'prodigal_translation_table.tsv')
            fout = open(translation_table_file, 'w')
            fout.write('%s\t%d\n' %
                       ('best_translation_table',
                        summary_stats[genome_id].best_translation_table))
            fout.write('%s\t%.2f\n' %
                       ('coding_density_4',
                        summary_stats[genome_id].coding_density_4 * 100))
            fout.write('%s\t%.2f\n' %
                       ('coding_density_11',
                        summary_stats[genome_id].coding_density_11 * 100))
            fout.close()

            checksum = sha256(new_aa_gene_file)
            fout = open(new_aa_gene_file + '.sha256', 'w')
            fout.write(checksum)
            fout.close()
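
The sha256 helper is imported from elsewhere; a minimal stand-in consistent with how it is used above (hex digest of a file's contents, written alongside the gene file):

import hashlib

def sha256(path):
    h = hashlib.sha256()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(65536), b''):
            h.update(chunk)
    return h.hexdigest()
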
Example #38
0
    def run(self, scaffold_file, genome_files, tetra_file, coverage_file,
            output_file):
        """Calculate statistics for scaffolds.

        Parameters
        ----------
        scaffold_file : str
            Fasta file containing scaffolds.
        genome_files : list of str
            Fasta files with binned scaffolds.
        tetra_file : str
            Tetranucleotide signatures for scaffolds.
        coverage_file : str
            Coverage profiles for scaffolds.
        output_file : str
            Output file for scaffolds statistics.
        """

        tetra = Tetranucleotide(self.cpus)
        signatures = tetra.read(tetra_file)

        cov_profiles = None
        if coverage_file:
            coverage = Coverage(self.cpus)
            cov_profiles, _ = coverage.read(coverage_file)

        # determine bin assignment for each scaffold
        self.logger.info('Determining scaffold statistics.')

        scaffold_id_genome_id = {}
        for gf in genome_files:
            genome_id = remove_extension(gf)
            for scaffold_id, _seq in seq_io.read_seq(gf):
                scaffold_id_genome_id[scaffold_id] = genome_id

        # write out scaffold statistics
        fout = open(output_file, 'w')
        fout.write('Scaffold id\tGenome Id\tGC\tLength (bp)')

        if cov_profiles:
            first_key = list(cov_profiles.keys())[0]
            bam_ids = sorted(cov_profiles[first_key].keys())
            for bam_id in bam_ids:
                fout.write('\t' + bam_id)

        for kmer in tetra.canonical_order():
            fout.write('\t' + kmer)
        fout.write('\n')

        for scaffold_id, seq in seq_io.read_seq(scaffold_file):
            fout.write(scaffold_id)
            fout.write('\t' +
                       scaffold_id_genome_id.get(scaffold_id, self.unbinned))
            fout.write('\t%.2f' % (seq_tk.gc(seq) * 100.0))
            fout.write('\t%d' % len(seq))

            if cov_profiles:
                for bam_id in bam_ids:
                    fout.write('\t%.2f' % cov_profiles[scaffold_id][bam_id])

            fout.write('\t' + '\t'.join(map(str, signatures[scaffold_id])))
            fout.write('\n')

        fout.close()
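
seq_tk.gc is external; a minimal stand-in for illustration, returning GC content as a fraction so the * 100.0 above yields a percentage (ambiguous bases such as N are simply counted in the denominator here):

def gc(seq):
    s = seq.upper()
    return (s.count('G') + s.count('C')) / max(len(s), 1)
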
Example #39
0
    def split(self, scaffold_stats, criteria1, criteria2, genome_file,
              output_dir):
        """Split genome into two based ongenomic feature.

        Parameters
        ----------
        scaffold_stats : ScaffoldStats
            Statistics for individual scaffolds.
        criteria1 : str
            First criteria used for splitting genome.
        criteria2 : str
           Second criteria used for splitting genome.
        genome_file : str
            Sequences being clustered.
        output_dir : str
            Directory to write results.
        """

        seqs = seq_io.read(genome_file)

        # calculate PCA if necessary
        if 'pc' in criteria1 or 'pc' in criteria2:
            self.logger.info('Performing PCA.')
            # the precomputed tetranucleotide signatures in scaffold_stats
            # are used directly; seqs was already read above
            signature_matrix = []
            for seq_id in seqs:
                signature_matrix.append(scaffold_stats.stats[seq_id].signature)

            pc, _variance = self.pca(signature_matrix)
            for i, seq_id in enumerate(seqs):
                scaffold_stats.stats[seq_id].pc1 = pc[i][0]
                scaffold_stats.stats[seq_id].pc2 = pc[i][1]
                scaffold_stats.stats[seq_id].pc3 = pc[i][2]

        # split bin
        genome_id = remove_extension(genome_file)
        fout1 = open(os.path.join(output_dir, genome_id + '_c1.fna'), 'w')
        fout2 = open(os.path.join(output_dir, genome_id + '_c2.fna'), 'w')

        for seq_id, seq in seqs.items():
            stats = scaffold_stats.stats[seq_id]

            meet_criteria = True
            for criteria in [criteria1, criteria2]:
                if 'gc' in criteria:
                    v = eval(criteria.replace('gc', str(stats.gc)),
                             {"__builtins__": {}})
                elif 'coverage' in criteria:
                    v = eval(criteria.replace('coverage', str(stats.coverage)),
                             {"__builtins__": {}})
                elif 'pc1' in criteria:
                    v = eval(criteria.replace('pc1', str(stats.pc1)),
                             {"__builtins__": {}})
                elif 'pc2' in criteria:
                    v = eval(criteria.replace('pc2', str(stats.pc2)),
                             {"__builtins__": {}})
                elif 'pc3' in criteria:
                    v = eval(criteria.replace('pc3', str(stats.pc3)),
                             {"__builtins__": {}})

                meet_criteria = meet_criteria and v

            if meet_criteria:
                fout1.write('>' + seq_id + '\n')
                fout1.write(seqs[seq_id] + '\n')
            else:
                fout2.write('>' + seq_id + '\n')
                fout2.write(seqs[seq_id] + '\n')

        fout1.close()
        fout2.close()
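
Hypothetical usage of the splitter above (the class name and file names are illustrative): scaffolds with GC below 40% and coverage above 10x go to the first bin, everything else to the second. The criteria strings are evaluated with eval() after substituting the per-scaffold statistic.

# splitter = ClusterSplitter()                # hypothetical class name
# splitter.split(scaffold_stats,
#                'gc < 40', 'coverage > 10',  # criteria strings
#                'bin_007.fna', 'split_output')
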
Example #40
0
    def run(self, scaffold_file, genome_files, tetra_file, coverage_file, output_file):
        """Calculate statistics for scaffolds.

        Parameters
        ----------
        scaffold_file : str
            Fasta file containing scaffolds.
        genome_files : list of str
            Fasta files with binned scaffolds.
        tetra_file : str
            Tetranucleotide signatures for scaffolds.
        coverage_file : str
            Coverage profiles for scaffolds.
        output_file : str
            Output file for scaffolds statistics.
        """

        tetra = Tetranucleotide(self.cpus)
        signatures = tetra.read(tetra_file)

        cov_profiles = None
        if coverage_file:
            coverage = Coverage(self.cpus)
            cov_profiles, _ = coverage.read(coverage_file)

        # determine bin assignment for each scaffold
        self.logger.info('')
        self.logger.info('  Determining scaffold statistics.')

        scaffold_id_genome_id = {}
        for gf in genome_files:
            genome_id = remove_extension(gf)
            for scaffold_id, _seq in seq_io.read_seq(gf):
                scaffold_id_genome_id[scaffold_id] = genome_id

        # write out scaffold statistics
        fout = open(output_file, 'w')
        fout.write('Scaffold id\tGenome Id\tGC\tLength (bp)')

        if cov_profiles:
            bam_ids = sorted(cov_profiles[cov_profiles.keys()[0]].keys())
            for bam_id in bam_ids:
                fout.write('\t' + bam_id)

        for kmer in tetra.canonical_order():
            fout.write('\t' + kmer)
        fout.write('\n')

        for scaffold_id, seq in seq_io.read_seq(scaffold_file):
            fout.write(scaffold_id)
            fout.write('\t' + scaffold_id_genome_id.get(scaffold_id, self.unbinned))
            fout.write('\t%.2f' % (seq_tk.gc(seq) * 100.0))
            fout.write('\t%d' % len(seq))

            if cov_profiles:
                for bam_id in bam_ids:
                    fout.write('\t%.2f' % cov_profiles[scaffold_id][bam_id])

            fout.write('\t' + '\t'.join(map(str, signatures[scaffold_id])))
            fout.write('\n')

        fout.close()
Example #41
0
    def run(self, input_tree, msa_file, marker_info_file, mask_file,
            perc_markers_to_keep, num_replicates, model, jk_dir, output_dir):
        """Jackknife marker genes.

        Marker file should have the format:
          <marker id>\t<marker name>\t<marker desc>\t<length>\n

        Parameters
        ----------
        input_tree : str
          Tree inferred with all data.
        msa_file : str
          File containing multiple sequence alignment for all taxa.
        marker_info_file : str
          File indicating database id, HMM name, description and length of each marker in the alignment.
        mask_file : str
          File indicating masking of multiple sequence alignment.
        perc_markers_to_keep : float [0, 1]
          Percentage of marker genes to keep in each replicate.
        num_replicates : int
          Number of replicates to perform.
        model : str
          Desired model of evolution.
        jk_dir : str
          Directory containing pre-computed jackknife replicate trees, if any.
        output_dir : str
          Output directory for jackknife trees.
        """

        assert (model in ['wag', 'jtt'])

        self.model = model
        self.perc_markers_to_keep = perc_markers_to_keep

        # determine length of each marker gene in alignment
        rep_tree_files = []
        if not jk_dir:
            self.replicate_dir = os.path.join(output_dir, 'replicates')
            make_sure_path_exists(self.replicate_dir)

            marker_lengths = []
            total_len = 0
            with open(marker_info_file) as f:
                f.readline()
                for line in f:
                    line_split = line.split('\t')
                    ml = int(line_split[3])
                    marker_lengths.append(ml)
                    total_len += ml

            self.logger.info('Concatenated length of markers: %d' % total_len)

            # read mask
            mask = open(mask_file).readline().strip()
            start = 0
            self.marker_lengths = []
            total_mask_len = 0
            for ml in marker_lengths:
                end = start + ml
                zeros = mask[start:end].count('0')
                start = end

                self.marker_lengths.append(ml - zeros)
                total_mask_len += ml - zeros

            self.logger.info('Concatenated length of filtered MSA: %d' %
                             total_mask_len)

            # read full multiple sequence alignment
            self.msa = seq_io.read(msa_file)

            if len(self.msa.values()[0]) != total_mask_len:
                self.logger.error(
                    'Length of MSA does not match length of mask.')
                sys.exit()

            # calculate replicates
            self.logger.info('Calculating jackknife marker replicates:')
            parallel = Parallel(self.cpus)
            parallel.run(self._producer, None, xrange(num_replicates),
                         self._progress)

            # calculate support
            self.logger.info('Calculating support for %d replicates.' %
                             num_replicates)
            for rep_index in xrange(num_replicates):
                rep_tree_files.append(
                    os.path.join(self.replicate_dir,
                                 'jk_markers.tree.' + str(rep_index) + '.tre'))
        else:
            for f in os.listdir(jk_dir):
                if f.endswith('.tree') or f.endswith('.tre'):
                    rep_tree_files.append(os.path.join(jk_dir, f))
            self.logger.info('Read %d jackknife replicates.' %
                             len(rep_tree_files))

        output_tree = os.path.join(
            output_dir,
            remove_extension(input_tree) + '.jk_markers.tree')
        bootstrap_support(input_tree, rep_tree_files, output_tree)

        return output_tree
Example #42
0
    def _genomes_to_process(self, genome_dir, batchfile, extension):
        """Get genomes to process.

        Parameters
        ----------
        genome_dir : str
          Directory containing genomes.
        batchfile : str
          File describing genomes.
        extension : str
          Extension of files to process.

        Returns
        -------
        genomic_files : d[genome_id] -> FASTA file
            Map of genomes to their genomic FASTA files.
        """

        genomic_files = {}
        if genome_dir:
            for f in os.listdir(genome_dir):
                if f.endswith(extension):
                    genome_id = remove_extension(f)
                    genomic_files[genome_id] = os.path.join(genome_dir, f)

        elif batchfile:
            for line_no, line in enumerate(open(batchfile, "rb")):
                line_split = line.strip().split("\t")
                if line_split[0] == '':
                    continue  # blank line

                if len(line_split) != 2:
                    self.logger.error(
                        'Batch file must contain exactly 2 columns.')
                    sys.exit(-1)

                genome_file, genome_id = line_split
                self._verify_genome_id(genome_id)

                if genome_file is None or genome_file == '':
                    self.logger.error('Missing genome file on line %d.' %
                                      (line_no + 1))
                    sys.exit(-1)
                elif genome_id is None or genome_id == '':
                    self.logger.error('Missing genome ID on line %d.' %
                                      (line_no + 1))
                    sys.exit(-1)
                elif genome_id in genomic_files:
                    self.logger.error('Genome ID %s appears multiple times.' %
                                      genome_id)
                    sys.exit(-1)

                genomic_files[genome_id] = genome_file

        for genome_key in genomic_files.iterkeys():
            if genome_key.startswith("RS_") or genome_key.startswith(
                    "GB_") or genome_key.startswith("UBA"):
                self.logger.error(
                    "Submitted genomes start with the same prefix (RS_,GB_,UBA) as reference genomes in GTDB-Tk. This will cause issues for downstream analysis."
                )
                sys.exit(-1)

        if len(genomic_files) == 0:
            if genome_dir:
                self.logger.warning(
                    'No genomes found in directory: %s. Check the --extension flag used to identify genomes.'
                    % genome_dir)
            else:
                self.logger.warning(
                    'No genomes found in batch file: %s. Please check the format of this file.'
                    % batchfile)
            sys.exit(-1)

        return genomic_files
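
A hypothetical batch file accepted by the reader above: two tab-separated columns giving the genome's FASTA file and its identifier.

/data/genomes/assembly_1.fna	genome_1
/data/genomes/assembly_2.fna	genome_2
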
Example #43
0
    def _run_prodigal(self, genome_paths):
        """Run Prodigal on genomes."""

        # get genome path and translation table for each file
        self.logger.info('Determining genomic file and translation table for each of the %d genomes.' % len(genome_paths))
        genome_files = []
        translation_table = {}
        for gid, gpath in genome_paths.items():
            assembly_id = os.path.basename(os.path.normpath(gpath))
            canonical_gid = assembly_id[0:assembly_id.find('_', 4)]
            
            genome_file = os.path.join(gpath, assembly_id + '_genomic.fna')
            if os.path.exists(genome_file):
                if os.stat(genome_file).st_size == 0:
                    self.logger.warning('Genomic file appears to be empty: %s' % genome_file)
                    continue
                
                genome_files.append(genome_file)
            else:
                self.logger.warning('Genomic file appears to be missing: %s' % genome_file)
                    
            gff_file = os.path.join(gpath, assembly_id + '_genomic.gff')
            if os.path.exists(gff_file):
                if os.stat(gff_file).st_size == 0:
                    self.logger.warning('GFF appears to be empty: %s' % gff_file)
                    continue

                tt = self._parse_translation_table(gff_file)
                if tt:
                    translation_table[canonical_gid] = tt
                else:
                    translation_table[canonical_gid] = None
                    self.logger.warning('Unable to determine translation table for: %s' % gff_file)
                    sys.exit(-1)
            else:
                self.logger.warning('GFF appears to be missing: %s' % gff_file)
                sys.exit(-1)
        
        # run Prodigal on each genome
        self.logger.info('Running Prodigal on %d genomes.' % len(genome_paths))
        prodigal = Prodigal(cpus=self.cpus)
        summary_stats = prodigal.run(genome_files, 
                                    translation_table=translation_table, 
                                    output_dir=self.tmp_dir)

        # move results into individual genome directories
        self.logger.info('Moving files and calculating checksums.')
        for genome_file in genome_files:
            genome_path, genome_id = ntpath.split(genome_file)
            genome_id = remove_extension(genome_id)
            canonical_gid = genome_id[0:genome_id.find('_', 4)]
            
            aa_gene_file = os.path.join(self.tmp_dir, genome_id + '_genes.faa')
            nt_gene_file = os.path.join(self.tmp_dir, genome_id + '_genes.fna')
            gff_file = os.path.join(self.tmp_dir, genome_id + '.gff')

            genome_root = genome_id[0:genome_id.find('_', 4)]
            prodigal_path = os.path.join(genome_path, 'prodigal')
            if not os.path.exists(prodigal_path):
                os.makedirs(prodigal_path)
            new_aa_gene_file = os.path.join(prodigal_path, genome_root + '_protein.faa')
            new_nt_gene_file = os.path.join(prodigal_path, genome_root + '_protein.fna')
            new_gff_file = os.path.join(prodigal_path, genome_root + '_protein.gff')

            os.system('mv %s %s' % (aa_gene_file, new_aa_gene_file))
            os.system('mv %s %s' % (nt_gene_file, new_nt_gene_file))
            os.system('mv %s %s' % (gff_file, new_gff_file))

            # save translation table information
            translation_table_file = os.path.join(prodigal_path, 'prodigal_translation_table.tsv')
            fout = open(translation_table_file, 'w')
            if translation_table[canonical_gid]:
                fout.write('%s\t%d\t%s\n' % ('best_translation_table', 
                                                summary_stats[genome_id].best_translation_table,
                                                'used table specified by NCBI'))
            else:
                fout.write('%s\t%d\n' % ('best_translation_table', summary_stats[genome_id].best_translation_table))
                fout.write('%s\t%.2f\n' % ('coding_density_4', summary_stats[genome_id].coding_density_4 * 100))
                fout.write('%s\t%.2f\n' % ('coding_density_11', summary_stats[genome_id].coding_density_11 * 100))
            fout.close()

            checksum = sha256(new_aa_gene_file)
            fout = open(new_aa_gene_file + '.sha256', 'w')
            fout.write(checksum)
            fout.close()
Example #44
0
def matrix_maker(faidx, bam_list, extension, threads, mapq, id_cutoff,
                 abundance_file, normalised_file, relative_file,
                 base_abundance_file, base_normalised_file, base_relative_file,
                 feature_normalisation, discard_gene_length_normalisation,
                 removed):

    import subprocess
    import re

    logger = logging.getLogger('timestamp')
    features_size = {}
    counts = {}
    counts_base = {}

    logger.info('Get features and initialise matrix')
    with open(faidx) as f:
        for line in f:
            if not line.startswith('#'):
                LINE = line.rstrip().split('\t')
                features = LINE[0]
                features_size[features] = LINE[1]
                counts[features] = 0
                counts_base[features] = 0

    counts_all = []
    counts_all_normalised = []
    counts_all_relative = []
    counts_base_all = []
    counts_base_all_normalised = []
    counts_base_all_relative = []

    file = ["Features", "Features_size"]
    logger.info('Browse alignment file(s)')

    samtoolsexec = find_ex('samtools')
    samtoolsthreads = '-@ ' + threads
    samtoolsminqual = '-q ' + mapq

    with open(bam_list, 'r') as b:
        for bam in b:
            if bam.startswith('#'):
                continue
            i = 0
            alignmentfile, librarysize = bam.rstrip().split(',')
            if librarysize == '' or librarysize == '0':
                librarysize = 1
            samplename = remove_extension(os.path.basename(alignmentfile),
                                          extension)
            file.append(samplename)
            logger.info('\t' + samplename)
            cmd = [
                samtoolsexec, 'view', samtoolsthreads, samtoolsminqual,
                alignmentfile
            ]
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE).stdout
            for line in p:
                line = line.decode(sys.getdefaultencoding()).rstrip()
                if i > 0 and i % 10000 == 0:
                    logger.info("Alignment record %s processed" % i)
                i += 1
                LINE = line.split('\t')
                features = LINE[2]
                cigar = LINE[5]
                base_mapped = 0
                match = re.findall(r'(\d+)M', cigar)
                read_len = len(LINE[9])  # SEQ field of the SAM record
                for base_match in match:
                    base_mapped += int(base_match)
                if read_len == 0:
                    logger.info(LINE)

                if base_mapped / read_len < float(id_cutoff):
                    continue

                counts[features] += 1

                if discard_gene_length_normalisation:
                    counts_base[features] += base_mapped
                else:
                    counts_base[features] += base_mapped / int(
                        features_size[features])

            if abundance_file:
                counts_all.append(counts.copy())

            if normalised_file:
                count_tmp = {
                    k: (v / int(librarysize)) * feature_normalisation
                    for k, v in counts.items()
                }
                counts_all_normalised.append(count_tmp)

            if relative_file:
                # guard against empty samples to avoid division by zero
                total = sum(counts.values()) or 1
                counts_all_relative.append(
                    {k: v / total for k, v in counts.items()})

            if base_abundance_file:
                counts_base_all.append(counts_base.copy())

            if base_normalised_file:
                count_tmp = {
                    k: (v / int(librarysize)) * feature_normalisation
                    for k, v in counts_base.items()
                }
                counts_base_all_normalised.append(count_tmp)

            if base_relative_file:
                total = sum(counts_base.values()) or 1
                counts_base_all_relative.append(
                    {k: v / total for k, v in counts_base.items()})

            for fn in counts:
                counts[fn] = 0
                counts_base[fn] = 0

    logger.info('Writing count matrices')

    if abundance_file:
        output_handle = open(abundance_file, "w")
        output_handle.write('\t'.join(file) + '\n')
        for fn in counts.keys():
            if sum([c[fn] for c in counts_all]) == 0 and removed:
                continue
            else:
                output_handle.write('\t'.join([fn] + [features_size[fn]] +
                                              [str(c[fn])
                                               for c in counts_all]) + '\n')
        output_handle.close()

    if normalised_file:
        output_handle = open(normalised_file, "w")
        output_handle.write('\t'.join(file) + '\n')
        for fn in counts.keys():
            if sum([c[fn] for c in counts_all_normalised]) == 0 and removed:
                continue
            else:
                output_handle.write(
                    '\t'.join([fn] + [features_size[fn]] +
                              [str(c[fn])
                               for c in counts_all_normalised]) + '\n')
        output_handle.close()

    if relative_file:
        output_handle = open(relative_file, "w")
        output_handle.write('\t'.join(file) + '\n')
        for fn in counts.keys():
            if sum([c[fn] for c in counts_all_relative]) == 0 and removed:
                continue
            else:
                output_handle.write(
                    '\t'.join([fn] + [features_size[fn]] +
                              [str(c[fn])
                               for c in counts_all_relative]) + '\n')
        output_handle.close()

    if base_abundance_file:
        output_handle = open(base_abundance_file, "w")
        output_handle.write('\t'.join(file) + '\n')
        for fn in counts_base.keys():
            if sum([c[fn] for c in counts_base_all]) == 0 and removed:
                continue
            else:
                output_handle.write(
                    '\t'.join([fn] + [features_size[fn]] +
                              [str(c[fn]) for c in counts_base_all]) + '\n')
        output_handle.close()

    if base_normalised_file:
        output_handle = open(base_normalised_file, "w")
        output_handle.write('\t'.join(file) + '\n')
        for fn in counts_base.keys():
            if sum([c[fn] for c in counts_base_all_normalised]) == 0 and removed:
                continue
            else:
                output_handle.write(
                    '\t'.join([fn] + [features_size[fn]] +
                              [str(c[fn])
                               for c in counts_base_all_normalised]) + '\n')
        output_handle.close()

    if base_relative_file:
        output_handle = open(base_relative_file, "w")
        output_handle.write('\t'.join(file) + '\n')
        for fn in counts_base.keys():
            if sum([c[fn] for c in counts_base_all_relative]) == 0 and removed:
                continue
            else:
                output_handle.write(
                    '\t'.join([fn] + [features_size[fn]] +
                              [str(c[fn])
                               for c in counts_base_all_relative]) + '\n')
        output_handle.close()
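
For reference, the CIGAR handling in this example reduces to summing the lengths of all M (alignment match) operations; a standalone sketch of that step, using the same regular expression as the code above (the matched_bases name is mine):

    import re

    def matched_bases(cigar):
        """Sum the lengths of all M operations in a CIGAR string."""
        return sum(int(n) for n in re.findall(r'(\d+)M', cigar))

    assert matched_bases('50M2I48M') == 98
    assert matched_bases('*') == 0  # unmapped records have no M operations
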
Example #45
0
    def run(self,
            rna_name,
            gtdb_metadata_file,
            rna_file,
            min_rna_length,
            min_scaffold_length,
            min_quality,
            max_contigs,
            min_N50,
            tax_filter,
            genome_list,
            output_dir,
            align_method='ssu_align'):
        """Infer rRNA gene tree spanning select GTDB genomes.

        Parameters
        ----------
        rna_name : str
            Name of rRNA gene.
        gtdb_metadata_file : str
            File specifying GTDB metadata for each genome.
        rna_file : str
            File with rRNA gene sequences in FASTA format.
        min_rna_length : int
            Minimum required length of rRNA gene sequences.
        min_scaffold_length : int
            Minimum required length of scaffold containing rRNA gene sequence.
        min_quality : float [0, 100]
            Minimum genome quality for a genome to be included in the tree.
        max_contigs : int
            Maximum number of contigs for a genome to be included.
        min_N50 : int
            Minimum N50 for a genome to be included.
        tax_filter : boolean
            Filter sequences based on incongruent taxonomy classification.
        genome_list : str
            Explicit list of genomes to use (ignores --ncbi_rep_only and --user_genomes).
        output_dir : str
            Directory to store results.
        """

        if rna_name not in ['ssu', 'lsu']:
            self.logger.error('Unrecognized rRNA gene type: %s' % rna_name)
            sys.exit(-1)

        genome_metadata = read_gtdb_metadata(gtdb_metadata_file, [
            'checkm_completeness', 'checkm_contamination', 'scaffold_count',
            'n50_scaffolds', 'organism_name', 'gtdb_representative'
        ])

        gtdb_taxonomy = read_gtdb_taxonomy(gtdb_metadata_file)

        user_genomes = set()
        uba_genomes = set()
        ncbi_genomes = set()
        rep_genomes = set()
        for genome_id in genome_metadata:
            org_name = str(genome_metadata[genome_id][4])
            if genome_id.startswith('U_'):
                if '(UBA' in org_name:
                    uba_genomes.add(genome_id)
                else:
                    user_genomes.add(genome_id)
            elif genome_id.startswith('RS_') or genome_id.startswith('GB_'):
                ncbi_genomes.add(genome_id)
            else:
                self.logger.warning('Unrecognized genome prefix: %s' %
                                    genome_id)

            rep = genome_metadata[genome_id][5] == 't'
            if rep:
                rep_genomes.add(genome_id)

        self.logger.info(
            'Initially considering %d genomes (%d NCBI, %d UBA, %d User).' %
            (len(genome_metadata), len(ncbi_genomes), len(uba_genomes),
             len(user_genomes)))
        self.logger.info('Identified %d representative genomes.' %
                         len(rep_genomes))

        # get genomes specified in genome list by user
        genomes_to_consider = set()
        if genome_list:
            for line in open(genome_list):
                gid = line.rstrip().split('\t')[0]
                if gid.startswith(('RS_', 'GB_', 'U_')):
                    genomes_to_consider.add(gid)
            self.logger.info(
                'Restricting genomes to the %d in the genome list.' %
                len(genomes_to_consider))
        else:
            # filter genomes based on quality and database source
            self.logger.info('Filtering genomes based on the specified criteria.')
            self.logger.info('Filtering genomes with quality <%d.' % min_quality)
            self.logger.info('Filtering genomes with >%d contigs.' %
                             max_contigs)
            self.logger.info('Filtering genomes with N50 <%d.' % min_N50)

            new_genomes_to_consider = []
            filtered_genomes = 0
            gt = 0
            gq = 0
            sc = 0
            n50 = 0
            for genome_id in genome_metadata:
                if genome_id not in rep_genomes:
                    gt += 1
                    filtered_genomes += 1
                    continue

                if genome_id not in ncbi_genomes and genome_id not in uba_genomes:
                    gt += 1
                    filtered_genomes += 1
                    continue

                comp, cont, scaffold_count, n50_contigs, _org_name, _rep = genome_metadata[
                    genome_id]
                q = float(comp) - 5 * float(cont)
                if q < min_quality or int(scaffold_count) > max_contigs or int(
                        n50_contigs) < min_N50:
                    if q < min_quality:
                        gq += 1

                    if int(scaffold_count) > max_contigs:
                        sc += 1

                    if int(n50_contigs) < min_N50:
                        n50 += 1

                    filtered_genomes += 1
                    continue

                new_genomes_to_consider.append(genome_id)

            genomes_to_consider = new_genomes_to_consider
            self.logger.info(
                'Filtered %d genomes (%d on genome type, %d on genome quality, %d on number of contigs, %d on N50).'
                % (filtered_genomes, gt, gq, sc, n50))
            self.logger.info('Considering %d genomes after filtering.' %
                             len(genomes_to_consider))

        # limit taxonomy to genomes being considered
        cur_gtdb_taxonomy = {}
        for gid in genomes_to_consider:
            cur_gtdb_taxonomy[gid] = gtdb_taxonomy[gid]

        # get rRNA gene sequences for each genome
        rna_output_file = self._get_rna_seqs(rna_name, rna_file,
                                             min_rna_length,
                                             min_scaffold_length,
                                             cur_gtdb_taxonomy,
                                             genomes_to_consider, output_dir)

        # identify erroneous rRNA gene sequences
        if tax_filter:
            self.logger.info(
                'Filtering sequences with incongruent taxonomy strings.')
            filtered_seqs = self._tax_filter(rna_output_file,
                                             cur_gtdb_taxonomy, output_dir)

            self.logger.info('Filtered %d sequences.' % len(filtered_seqs))
            if len(filtered_seqs) > 0:
                rna_filtered_output = os.path.join(
                    output_dir, 'gtdb_%s.tax_filter.fna' % rna_name)
                fout = open(rna_filtered_output, 'w')
                for seq_id, seq, annotation in seq_io.read_seq(
                        rna_output_file, keep_annotation=True):
                    if seq_id not in filtered_seqs:
                        fout.write('>' + seq_id + ' ' + annotation + '\n')
                        fout.write(seq + '\n')
                fout.close()

                rna_output_file = rna_filtered_output

        # align sequences with ssu-align or mothur
        if rna_name == 'ssu':
            if align_method == 'ssu_align':
                self.logger.info('Aligning sequences with ssu-align.')
                align_dir = os.path.join(output_dir, '%s_align' % rna_name)
                os.system('ssu-align --dna %s %s' %
                          (rna_output_file, align_dir))
                os.system('ssu-mask --afa %s' % align_dir)
            elif align_method == 'mothur':
                self.logger.info('Aligning sequences with mothur.')
                align_dir = os.path.join(output_dir, 'mothur')
                if not os.path.exists(align_dir):
                    os.makedirs(align_dir)

                mothur_cmd = 'mothur "#set.dir(output=%s, blastdir=/srv/sw/Mothur/1.39.5)' % align_dir
                mothur_cmd += '; align.seqs(candidate=%s, template=/srv/db/mothur/silva_128/silva.seed_v128.align, search=blast, flip=t, processors=%d)' % (
                    rna_output_file, self.cpus)
                input_prefix = remove_extension(rna_output_file)
                align_file = os.path.join(align_dir, input_prefix + '.align')
                mothur_cmd += '; filter.seqs(fasta=%s, hard=/srv/db/mothur/silva_128/Lane1349.silva.filter, processors=%d);"' % (
                    align_file, self.cpus)
                os.system(mothur_cmd)
                input_msa = os.path.join(align_dir,
                                         input_prefix + '.filter.fasta')
        elif rna_name == 'lsu':
            self.logger.info('Aligning sequences with ssu-align.')
            align_dir = os.path.join(output_dir, '%s_align' % rna_name)
            if not os.path.exists(align_dir):
                os.makedirs(align_dir)

            os.system('esl-sfetch --index %s' % rna_output_file)

            # search for sequences using domain-specific LSU HMMs
            for domain in ['archaea', 'bacteria', 'eukaryote']:
                self.logger.info(
                    'Matching LSU rRNA genes to %s-specific HMM.' % domain)
                table_out = os.path.join(
                    align_dir, 'cmsearch.%s.%s.tblout' % (rna_name, domain))
                cm_dir = os.path.join(
                    os.path.dirname(os.path.realpath(__file__)), 'cm_files')
                cm_file = os.path.join(cm_dir, 'lsu_%s.cm' % domain)
                log_file = os.path.join(
                    align_dir, 'cmsearch.%s.%s.out' % (rna_name, domain))
                os.system(
                    'cmsearch --hmmonly --cpu %d --noali --tblout %s %s %s > %s'
                    %
                    (self.cpus, table_out, cm_file, rna_output_file, log_file))

            # identify top hits for each domain
            self.logger.info(
                'Identifying best domain-specific HMM for each LSU rRNA gene.')
            top_hits = {}
            for domain in ['archaea', 'bacteria', 'eukaryote']:
                table_out = os.path.join(
                    align_dir, 'cmsearch.%s.%s.tblout' % (rna_name, domain))
                for line in open(table_out):
                    if line[0] == '#':
                        continue

                    line_split = line.split()
                    seq_id = line_split[0]
                    start_seq = int(line_split[7])
                    end_seq = int(line_split[8])
                    bitscore = float(line_split[14])

                    prev_bitscore = top_hits.get(seq_id, [None, 0, 0, 0, 0])[4]
                    if bitscore > prev_bitscore:
                        top_hits[seq_id] = [
                            domain, seq_id, start_seq, end_seq, bitscore
                        ]

            # create an MSA for bacteria and for archaea
            for domain in ['archaea', 'bacteria']:
                # create a file of top hits
                top_hits_out = os.path.join(
                    align_dir, 'top_hits.%s.%s.tsv' % (rna_name, domain))
                fout = open(top_hits_out, 'w')
                num_hits = 0
                for (top_domain, seq_id, start_seq, end_seq,
                     bitscore) in top_hits.values():
                    if top_domain == domain:
                        fout.write('%s\t%d\t%d\t%f\n' %
                                   (seq_id, start_seq, end_seq, bitscore))
                        num_hits += 1
                fout.close()

                # align top hits
                self.logger.info(
                    'Creating MSA for %s LSU rRNA genes (%d sequences).' %
                    (domain, num_hits))

                if num_hits > 0:
                    seq_file = os.path.join(
                        align_dir, 'cmsearch.%s.%s.fna' % (rna_name, domain))
                    os.system(
                        "grep -v '^#' %s | awk '{print $1, $2, $3, $1}' | esl-sfetch -Cf %s - > %s"
                        % (top_hits_out, rna_output_file, seq_file))

                    align_file = os.path.join(
                        align_dir, 'cmalign.%s.%s.stk' % (rna_name, domain))
                    # align against the CM for the current domain rather than
                    # whichever CM was left over from the search loop above
                    cm_file = os.path.join(cm_dir, 'lsu_%s.cm' % domain)
                    os.system('cmalign --dnaout --outformat Pfam %s %s > %s' %
                              (cm_file, seq_file, align_file))

                    masked_file = os.path.join(
                        align_dir,
                        'cmalign.%s.%s.mask.afa' % (rna_name, domain))
                    os.system('esl-alimask -p --outformat AFA %s > %s' %
                              (align_file, masked_file))

        # trim sequences and infer tree
        if align_method == 'ssu_align':
            for domain in ['archaea', 'bacteria']:
                if rna_name == 'ssu':
                    input_msa = os.path.join(
                        align_dir, 'ssu_align.' + domain + '.mask.afa')
                elif rna_name == 'lsu':
                    input_msa = os.path.join(
                        align_dir,
                        'cmalign.%s.%s.mask.afa' % (rna_name, domain))

                if not os.path.exists(input_msa):
                    continue

                trimmed_msa = os.path.join(output_dir, domain + '.trimmed.fna')
                self._trim_seqs(input_msa, trimmed_msa)

                # infer tree
                self.logger.info('Inferring tree for %s genes.' % domain)
                output_tree = os.path.join(output_dir, domain + '.tree')
                os.system('FastTreeMP -nosupport -nt -gtr -gamma %s > %s' %
                          (trimmed_msa, output_tree))
        elif align_method == 'mothur':
            trimmed_msa = os.path.join(output_dir,
                                       input_prefix + '.trimmed.fna')
            self._trim_seqs(input_msa, trimmed_msa)

            # infer tree
            self.logger.info('Inferring tree for %s genes.' % input_prefix)
            output_tree = os.path.join(output_dir, input_prefix + '.tree')
            os.system('FastTreeMP -nosupport -nt -gtr -gamma %s > %s' %
                      (trimmed_msa, output_tree))
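
A condensed sketch of the per-domain top-hit selection performed above, assuming the standard Infernal --tblout column layout used by the code (sequence id in column 1, hit coordinates in columns 8 and 9, bit score in column 15); the function name is mine:

    def top_domain_hits(tblout_files):
        """Keep the highest-scoring domain hit per sequence across cmsearch
        --tblout files; tblout_files maps a domain name to its table path."""
        top_hits = {}
        for domain, table in tblout_files.items():
            for line in open(table):
                if line.startswith('#'):
                    continue
                cols = line.split()
                seq_id = cols[0]
                start, end, bitscore = int(cols[7]), int(cols[8]), float(cols[14])
                if bitscore > top_hits.get(seq_id, ('', 0, 0, 0.0))[3]:
                    top_hits[seq_id] = (domain, start, end, bitscore)
        return top_hits
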
Example #46
0
    def add_compatible_closest(self, scaffold_file, genome_file, compatible_file, out_genome):
        """Add sequences specified as compatible.

        A sequence is added to a bin if and only if it is
        closest to that bin in GC, tetranucleotide, and
        coverage space.

        Parameters
        ----------
        scaffold_file : str
            Fasta file containing scaffolds to add.
        genome_file : str
            Fasta file of binned scaffolds.
        compatible_file : str
            File specifying compatible scaffolds.
        out_genome : str
            Name of output genome.
        """

        cur_bin_id = remove_extension(genome_file)

        # determine statistics for each potentially compatible scaffold
        scaffold_ids = defaultdict(dict)
        with open(compatible_file) as f:
            headers = [x.strip() for x in f.readline().split('\t')]
            scaffold_gc_index = headers.index('Scaffold GC')
            genome_gc_index = headers.index('Mean genome GC')
            td_dist_index = headers.index('Scaffold TD')
            scaffold_cov_index = headers.index('Mean scaffold coverage')
            genome_cov_index = headers.index('Mean genome coverage')

            for line in f:
                line_split = line.split('\t')
                scaffold_id = line_split[0]
                bin_id = line_split[1].strip()

                scaffold_gc = float(line_split[scaffold_gc_index])
                genome_gc = float(line_split[genome_gc_index])
                gc_dist = abs(scaffold_gc - genome_gc)

                td_dist = float(line_split[td_dist_index])

                scaffold_cov = float(line_split[scaffold_cov_index])
                genome_cov = float(line_split[genome_cov_index])
                cov_dist = abs(scaffold_cov - genome_cov)

                scaffold_ids[scaffold_id][bin_id] = [gc_dist, td_dist, cov_dist]

        # determine scaffolds that are closest to a single bin
        # in terms of GC, tetranucleotide distance, and coverage
        compatible_scaffolds = set()
        for scaffold_id, bin_stats in scaffold_ids.items():
            best_gc = [1e9, None]
            best_td = [1e9, None]
            best_cov = [1e9, None]
            for bin_id, stats in bin_stats.items():
                gc, td, cov = stats
                if gc < best_gc[0]:
                    best_gc = [gc, bin_id]
                if td < best_td[0]:
                    best_td = [td, bin_id]
                if cov < best_cov[0]:
                    best_cov = [cov, bin_id]

            # check if scaffold is closest to a single bin
            if (best_gc[1] == best_td[1] == best_cov[1]) and best_gc[1] == cur_bin_id:
                compatible_scaffolds.add(scaffold_id)

        # add compatible sequences to genome
        genome_seqs = seq_io.read(genome_file)
        for seq_id, seq in seq_io.read_seq(scaffold_file):
            if seq_id in compatible_scaffolds:
                genome_seqs[seq_id] = seq

        # save modified bin
        seq_io.write_fasta(genome_seqs, out_genome)
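
The "closest in all three spaces" rule above can be isolated into a small helper; a sketch under the same data layout (the closest_bin name is mine, not from the original code):

    def closest_bin(bin_stats):
        """Return the bin id that is simultaneously closest in GC,
        tetranucleotide, and coverage distance, or None if no single
        bin wins all three comparisons."""
        best = {'gc': (float('inf'), None),
                'td': (float('inf'), None),
                'cov': (float('inf'), None)}
        for bin_id, (gc, td, cov) in bin_stats.items():
            for metric, dist in (('gc', gc), ('td', td), ('cov', cov)):
                if dist < best[metric][0]:
                    best[metric] = (dist, bin_id)
        winners = set(bin_id for _, bin_id in best.values())
        return winners.pop() if len(winners) == 1 else None

    # the scaffold is closest to bin1 in every metric, so it is assigned
    assert closest_bin({'bin1': (0.01, 0.05, 0.3),
                        'bin2': (0.02, 0.10, 0.5)}) == 'bin1'
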
Example #47
0
    def run(self, genome_files, db_file, taxonomy_file, evalue, per_identity,
            window_size, step_size):
        """Create taxonomic profiles for a set of genomes.

        Parameters
        ----------
        genome_files : list of str
            Fasta files of genomes to process.
        db_file : str
            Database of reference genes.
        taxonomy_file : str
            File containing GreenGenes taxonomy strings for reference genomes.
        evalue : float
            E-value threshold used by blast.
        per_identity : float
            Percent identity threshold used by blast.
        window_size : int
            Size of each fragment.
        step_size : int
            Number of bases to move after each window.
        """

        # parse taxonomy file
        self.logger.info(
            '  Reading taxonomic assignment of reference genomes.')
        taxonomy = Taxonomy().read(taxonomy_file)

        # fragment each genome into fixed sizes windows
        self.logger.info('')
        self.logger.info('  Fragmenting sequences in each bin:')
        diamond_output_dir = os.path.join(self.output_dir, 'diamond')
        make_sure_path_exists(diamond_output_dir)

        fragment_file = os.path.join(diamond_output_dir, 'fragments.fna')
        fragment_out = open(fragment_file, 'w')
        contig_id_to_genome_id = {}
        for genome_file in genome_files:
            genome_id = remove_extension(genome_file)
            self.profiles[genome_id] = Profile(genome_id, taxonomy)
            self._fragment_genomes(genome_file, window_size, step_size,
                                   self.profiles[genome_id], fragment_out)

            for seq_id, _seq in seq_io.read_seq(genome_file):
                contig_id_to_genome_id[seq_id] = genome_id

        # run diamond
        self.logger.info('')
        self.logger.info(
            '  Running diamond blastx with %d processes (be patient!)' %
            self.cpus)

        diamond = Diamond(self.cpus)
        diamond_daa_out = os.path.join(diamond_output_dir, 'diamond_hits')
        diamond.blastx(fragment_file, db_file, evalue, per_identity, 1,
                       diamond_daa_out)

        diamond_table_out = os.path.join(diamond_output_dir,
                                         'diamond_hits.tsv')
        diamond.view(diamond_daa_out + '.daa', diamond_table_out)

        self.logger.info('')
        self.logger.info('  Creating taxonomic profile for each genome.')
        self._taxonomic_profiles(diamond_table_out, taxonomy,
                                 contig_id_to_genome_id)

        self.logger.info('')
        self.logger.info('  Writing taxonomic profile for each genome.')

        report_dir = os.path.join(self.output_dir, 'bin_reports')
        make_sure_path_exists(report_dir)

        for genome_id, profile in self.profiles.items():
            seq_summary_out = os.path.join(report_dir,
                                           genome_id + '.sequences.tsv')
            profile.write_seq_summary(seq_summary_out)

            genome_profile_out = os.path.join(report_dir,
                                              genome_id + '.profile.tsv')
            profile.write_genome_profile(genome_profile_out)

        genome_summary_out = os.path.join(self.output_dir,
                                          'genome_summary.tsv')
        self._write_genome_summary(genome_summary_out)

        # create Krona plot
        krona_profiles = defaultdict(lambda: defaultdict(int))
        for genome_id, profile in self.profiles.items():
            seq_assignments = profile.classify_seqs(taxonomy)

            for seq_id, classification in seq_assignments.items():
                taxa = []
                for r in range(0, len(profile.rank_labels)):
                    taxa.append(classification[r][0])

                krona_profiles[genome_id][';'.join(
                    taxa)] += profile.seq_len[seq_id]

        krona = Krona()
        krona_output_file = os.path.join(self.output_dir,
                                         'taxonomic_profiles.krona.html')
        krona.create(krona_profiles, krona_output_file)
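
The _fragment_genomes helper is not shown, but fragmenting a sequence into fixed-size windows with a fixed step is straightforward; a plausible sketch under that assumption:

    def fragment_seq(seq, window_size, step_size):
        """Yield windows of window_size across seq, advancing by step_size;
        sequences shorter than a window are yielded whole."""
        if len(seq) <= window_size:
            yield seq
            return
        for start in range(0, len(seq) - window_size + 1, step_size):
            yield seq[start:start + window_size]

    # a 10 kb contig with 5 kb windows and a 2.5 kb step -> 3 fragments
    assert len(list(fragment_seq('A' * 10000, 5000, 2500))) == 3
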
Example #48
0
    def _producer(self, genome_pair):
        """Identify reciprocal best blast hits between pairs of genomes.

        Parameters
        ----------
        genome_pair : list
            Identifier of genomes to process.
        """

        blast_stream = open(self.blast_table, 'r', 32 * (10 ** 6))

        genome_fileA, genome_fileB = genome_pair

        # count number of genes in each genome
        genes_in_genomeA = seq_io.read_fasta(genome_fileA)
        genes_in_genomeB = seq_io.read_fasta(genome_fileB)

        genome_idA = remove_extension(genome_fileA)
        genome_idB = remove_extension(genome_fileB)

        # find blast hits between genome A and B, and vice versa
        hitsAB = self._valid_hits(blast_stream, self.offset_table,
                                    self.per_identity_threshold, self.per_aln_len_threshold,
                                    genome_idA, genome_idB)
        hitsBA = self._valid_hits(blast_stream, self.offset_table,
                                    self.per_identity_threshold, self.per_aln_len_threshold,
                                    genome_idB, genome_idA)

        # report reciprocal best blast hits
        if self.write_shared_genes:
            fout_seqs = open(os.path.join(self.shared_genes_dir, genome_idA + '-' + genome_idB + '.shared_genes.faa'), 'w')

        fout_stats = open(os.path.join(self.shared_genes_dir, genome_idA + '-' + genome_idB + '.rbb_hits.tsv'), 'w')
        fout_stats.write(genome_idA + '\t' + genome_idB + '\tPercent Identity\tPercent Alignment Length\te-value\tbitscore\n')

        per_identity_hits = []
        for query_id, hit_stats in hitsAB.items():
            subject_id, per_identA, per_aln_lenA, evalueA, bitscoreA = hit_stats
            if subject_id in hitsBA and query_id == hitsBA[subject_id][0]:
                _subject_id, per_identB, per_aln_lenB, evalueB, bitscoreB = hitsBA[subject_id]

                # take average of statistics in both blast directions as
                # the results will be similar, but not identical
                per_ident = 0.5 * (per_identA + per_identB)
                per_identity_hits.append(per_ident)

                per_aln_len = 0.5 * (per_aln_lenA + per_aln_lenB)
                evalue = 0.5 * (evalueA + evalueB)
                bitscore = 0.5 * (bitscoreA + bitscoreB)

                fout_stats.write('%s\t%s\t%.2f\t%.2f\t%.2g\t%.2f\n' % (query_id, subject_id, per_ident, per_aln_len, evalue, bitscore))

                # write out shared genes
                if self.write_shared_genes:
                    fout_seqs.write('>' + query_id + '\n')
                    fout_seqs.write(genes_in_genomeA[query_id] + '\n')

                    fout_seqs.write('>' + subject_id + '\n')
                    fout_seqs.write(genes_in_genomeB[subject_id] + '\n')

        if self.write_shared_genes:
            fout_seqs.close()
        fout_stats.close()

        mean_per_identity_hits = 0
        if len(per_identity_hits) > 0:
            mean_per_identity_hits = mean(per_identity_hits)

        std_per_identity_hits = 0
        if len(per_identity_hits) >= 2:
            std_per_identity_hits = std(per_identity_hits)

        return (genome_idA,
                    len(genes_in_genomeA),
                    genome_idB,
                    len(genes_in_genomeB),
                    len(per_identity_hits),
                    mean_per_identity_hits,
                    std_per_identity_hits)
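
The reciprocal-best-hit test at the heart of this routine can be expressed compactly; a sketch assuming, as above, that each hit table maps a query id to a tuple whose first element is the best subject id:

    def reciprocal_best_hits(hits_ab, hits_ba):
        """Yield (gene_a, gene_b) pairs where each gene is the other's
        best hit."""
        for gene_a, stats in hits_ab.items():
            gene_b = stats[0]
            if gene_b in hits_ba and hits_ba[gene_b][0] == gene_a:
                yield gene_a, gene_b

    hits_ab = {'a1': ('b1', 99.0), 'a2': ('b3', 80.0)}
    hits_ba = {'b1': ('a1', 98.5), 'b3': ('a9', 75.0)}
    assert list(reciprocal_best_hits(hits_ab, hits_ba)) == [('a1', 'b1')]
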
Example #49
0
    def run(self, input_dir, tmp_dir, threads):
        # get path to all unprocessed genome files
        print('Reading genomes.')
        genome_files = []
        for genome_dir in os.listdir(input_dir):
            cur_genome_dir = os.path.join(input_dir, genome_dir)
            if not os.path.isdir(cur_genome_dir):
                continue
              
            for assembly_id in os.listdir(cur_genome_dir):
                assembly_dir = os.path.join(cur_genome_dir, assembly_id)
                genome_id = assembly_id[0:assembly_id.find('_', 4)]

                # check if prodigal has already been called
                if False:
                    # for safety, genes are currently being recalled for all
                    # genomes, even though skipping genomes with a valid
                    # checksum would be far more efficient
                    aa_gene_file = os.path.join(assembly_dir, 'prodigal', genome_id + '_protein.faa')
                    if os.path.exists(aa_gene_file):
                        # verify checksum
                        checksum_file = aa_gene_file + '.sha256'
                        if os.path.exists(checksum_file):
                            checksum = sha256(aa_gene_file)
                            cur_checksum = open(checksum_file).readline().strip()
                            if checksum == cur_checksum:
                                continue

                genome_file = os.path.join(assembly_dir, assembly_id + '_genomic.fna')
                if os.path.exists(genome_file):
                    if os.stat(genome_file).st_size == 0:
                        print('[Warning] Genome file appears to be empty: %s' % genome_file)
                    else:
                        genome_files.append(genome_file)

        print('  Number of unprocessed genomes: %d' % len(genome_files))

        # run prodigal on each genome
        print('Running prodigal.')
        prodigal = Prodigal(cpus=threads)
        summary_stats = prodigal.run(genome_files, output_dir=tmp_dir)

        # move results into individual genome directories
        print('Moving files and calculating checksums.')
        for genome_file in genome_files:
            genome_path, genome_id = ntpath.split(genome_file)
            genome_id = remove_extension(genome_id)
            
            aa_gene_file = os.path.join(tmp_dir, genome_id + '_genes.faa')
            nt_gene_file = os.path.join(tmp_dir, genome_id + '_genes.fna')
            gff_file = os.path.join(tmp_dir, genome_id + '.gff')

            genome_root = genome_id[0:genome_id.find('_', 4)]
            prodigal_path = os.path.join(genome_path, 'prodigal')
            if not os.path.exists(prodigal_path):
                os.makedirs(prodigal_path)
            new_aa_gene_file = os.path.join(prodigal_path, genome_root + '_protein.faa')
            new_nt_gene_file = os.path.join(prodigal_path, genome_root + '_protein.fna')
            new_gff_file = os.path.join(prodigal_path, genome_root + '_protein.gff')

            os.system('mv %s %s' % (aa_gene_file, new_aa_gene_file))
            os.system('mv %s %s' % (nt_gene_file, new_nt_gene_file))
            os.system('mv %s %s' % (gff_file, new_gff_file))

            # save translation table information
            translation_table_file = os.path.join(prodigal_path, 'prodigal_translation_table.tsv')
            fout = open(translation_table_file, 'w')
            fout.write('%s\t%d\n' % ('best_translation_table', summary_stats[genome_id].best_translation_table))
            fout.write('%s\t%.2f\n' % ('coding_density_4', summary_stats[genome_id].coding_density_4 * 100))
            fout.write('%s\t%.2f\n' % ('coding_density_11', summary_stats[genome_id].coding_density_11 * 100))
            fout.close()

            checksum = sha256(new_aa_gene_file)
            fout = open(new_aa_gene_file + '.sha256', 'w')
            fout.write(checksum)
            fout.close()
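
The sha256 helper used for the checksums above is imported from elsewhere; a minimal equivalent, assuming it returns the hex digest of the file contents:

    import hashlib

    def sha256(path, block_size=1 << 16):
        """Hex SHA-256 digest of a file, read in fixed-size blocks so
        large genome files are never loaded into memory at once."""
        h = hashlib.sha256()
        with open(path, 'rb') as f:
            for block in iter(lambda: f.read(block_size), b''):
                h.update(block)
        return h.hexdigest()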