Example #1
def list_genomes_dir(userdir):
    """List fasta files in a specified directory

    Parameters
    ----------
    userdir : str
        Directory path where all fasta files are

    Returns
    -------
    dict
        Dictionary indicating the genomic file for each genome.
    """
    if not os.path.exists(userdir):
        raise ValueError('{0} does not exist.'.format(userdir))
    else:
        onlygenomefiles = {
            f: os.path.join(userdir, f)
            for f in os.listdir(userdir)
            if os.path.isfile(os.path.join(userdir, f))
        }
        for potential_file in onlygenomefiles:
            try:
                read_fasta(os.path.join(userdir, potential_file))
            except Exception:
                raise IOError("{0} is not a fasta file.".format(
                    os.path.join(userdir, potential_file)))
        return onlygenomefiles
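
A minimal usage sketch for the function above (the directory name is a placeholder, and read_fasta is assumed to come from the project's seq_io-style helpers):

genome_files = list_genomes_dir('genomes/')  # hypothetical directory
for name, path in genome_files.items():
    print('%s -> %s' % (name, path))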
Example #2
    def _producer(self, gene_file):
        """Calculates codon usage of a genome.

        This function is intended to be used as a producer
        within a producer/consumer multiprocessing framework.
        It calculates the codon usage for a single genome
        and returns the results for consumption by the
        consumer function.

        Parameters
        ----------
        gene_file : str
            Fasta file containing nucleotide sequences.

        Returns
        -------
        str
           Unique identifier of genome.
        dict : d[codon] -> count
            Occurrence of each codon.
        dict : d[codon] -> length
            Average length of genes for a given stop codon.
        """

        genome_id = ntpath.basename(gene_file)
        genome_id = genome_id.replace('.genes.fna', '')
        genome_id = os.path.splitext(genome_id)[0]

        seqs = seq_io.read_fasta(gene_file)
        codon_usage, gene_length = self.codon_usage(seqs)

        return (genome_id, codon_usage, gene_length)
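
The docstring describes a producer/consumer pattern, but the driver is not shown. A minimal sketch of how such a producer might be dispatched, using multiprocessing.Pool as a stand-in for whatever parallel helper the surrounding project actually uses (hypothetical):

import multiprocessing as mp

def consume(results):
    # merge the per-genome tuples returned by the producer
    usage = {}
    for genome_id, codon_usage, gene_length in results:
        usage[genome_id] = (codon_usage, gene_length)
    return usage

# with mp.Pool() as pool:
#     results = pool.map(calculator._producer, gene_files)  # 'calculator' is hypothetical
#     usage = consume(results)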
Example #3
    def _msa_filter_by_taxa(self, concatenated_file, gtdb_taxonomy,
                            taxa_filter, outgroup_taxon):
        """Filter GTDB MSA filtered to specified taxa."""

        msa = read_fasta(concatenated_file)
        self.logger.info('Read concatenated alignment for %d GTDB genomes.' %
                         len(msa))

        if taxa_filter is not None:
            taxa_to_keep = set(taxa_filter.split(','))

            if outgroup_taxon is not None and outgroup_taxon not in taxa_to_keep:
                taxa_to_keep.add(outgroup_taxon)

            filtered_genomes = 0
            for genome_id, taxa in gtdb_taxonomy.items():
                common_taxa = taxa_to_keep.intersection(taxa)
                if len(common_taxa) == 0:
                    if genome_id in msa:
                        del msa[genome_id]
                        filtered_genomes += 1

            self.logger.info('Filtered %d taxa based on assigned taxonomy.' %
                             filtered_genomes)

        return msa
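
taxa_filter is a comma-separated string of GTDB taxa; any genome whose taxonomy shares no taxon with the list is dropped from the MSA. An illustrative call (the taxon names are examples only):

msa = self._msa_filter_by_taxa('gtdb_concatenated.faa',
                               gtdb_taxonomy,
                               taxa_filter='p__Firmicutes,p__Chloroflexota',
                               outgroup_taxon='p__Altiarchaeota')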
Example #4
    def _producer(self, gene_file):
        """Calculates codon usage of a genome.

        This function is intended to be used as a producer
        within a producer/consumer multiprocessing framework.
        It calculates the codon usage for a single genome
        and returns the results for consumption by the
        consumer function.

        Parameters
        ----------
        gene_file : str
            Fasta file containing nucleotide sequences.

        Returns
        -------
        str
           Unique identifier of genome.
        dict : d[codon] -> count
            Occurrence of each codon.
        dict : d[codon] -> length
            Average length of genes for a given stop codon.
        """

        genome_id = ntpath.basename(gene_file)
        genome_id = genome_id.replace('.genes.fna', '')
        genome_id = os.path.splitext(genome_id)[0]

        seqs = seq_io.read_fasta(gene_file)
        codon_usage, gene_length = self.codon_usage(seqs)

        return (genome_id, codon_usage, gene_length)
Example #5
def create_concatenated_alignment(genome_ids,
                                   marker_genes,
                                   alignment_dir,
                                   concatenated_alignment_file,
                                   marker_file):
    """Create concatenated multiple sequence alignment for all genomes.

    Parameters
    ----------
    genome_ids : iterable
        Genomes of interest.
    marker_genes : iterable
        Unique ids of marker genes.
    alignment_dir : str
        Directory containing multiple sequence alignments.
    concatenated_alignment_file : str
        File to contain the concatenated alignment.
    marker_file : str
        File indicating length of each marker in the alignment.
    """

    # Read alignment files. Some genomes may have multiple
    # copies of a marker gene in which case the last one
    # is arbitrarily taken. This is acceptable as all genes
    # have already been screened to be conspecific.
    alignments = defaultdict(dict)
    marker_length = {}
    for mg in marker_genes:
        f = mg + '.aln.masked.faa'
        seqs = seq_io.read_fasta(os.path.join(alignment_dir, f))

        for seq_id, seq in seqs.items():
            genome_id = seq_id[0:seq_id.find(DefaultValues.SEQ_CONCAT_CHAR)]

            alignments[mg][genome_id] = seq

            marker_length[mg] = len(seq)

    # create marker file
    fout = open(marker_file, 'w')
    for mg in marker_genes:
        fout.write('%s\t%s\t%s\t%d\n' % (mg, mg, mg, marker_length[mg]))
    fout.close()

    # create concatenated alignment
    concatenated_seqs = {}
    for mg in marker_genes:
        seqs = alignments[mg]

        for genome_id in genome_ids:
            if genome_id in seqs:
                # append alignment
                concatenated_seqs[genome_id] = concatenated_seqs.get(genome_id, '') + seqs[genome_id]
            else:
                # missing gene
                concatenated_seqs[genome_id] = concatenated_seqs.get(genome_id, '') + '-' * marker_length[mg]

    # save concatenated alignment
    seq_io.write_fasta(concatenated_seqs, concatenated_alignment_file)
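
The gap-padding step guarantees every concatenated sequence has the same length: a genome missing a marker contributes a run of '-' equal to that marker's alignment length. A self-contained toy check of that rule:

alignments = {'mgA': {'g1': 'MKL', 'g2': 'MRL'},
              'mgB': {'g1': 'VV'}}          # g2 lacks marker mgB
marker_length = {'mgA': 3, 'mgB': 2}

concat = {}
for mg in ['mgA', 'mgB']:
    for gid in ['g1', 'g2']:
        seq = alignments[mg].get(gid, '-' * marker_length[mg])
        concat[gid] = concat.get(gid, '') + seq

assert concat == {'g1': 'MKLVV', 'g2': 'MRL--'}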
Example #6
def create_concatenated_alignment(genome_ids, marker_genes, alignment_dir,
                                  concatenated_alignment_file, marker_file):
    """Create concatenated multiple sequence alignment for all genomes.

    Parameters
    ----------
    genome_ids : iterable
        Genomes of interest.
    marker_genes : iterable
        Unique ids of marker genes.
    alignment_dir : str
        Directory containing multiple sequence alignments.
    concatenated_alignment_file : str
        File to contain the concatenated alignment.
    marker_file : str
        File indicating length of each marker in the alignment.
    """

    # Read alignment files. Some genomes may have multiple
    # copies of a marker gene in which case the last one
    # is arbitrarily taken. This is acceptable as all genes
    # have already been screened to be conspecific.
    alignments = defaultdict(dict)
    marker_length = {}
    for mg in marker_genes:
        f = mg + '.aln.masked.faa'
        seqs = seq_io.read_fasta(os.path.join(alignment_dir, f))

        for seq_id, seq in seqs.items():
            genome_id = seq_id[0:seq_id.find(DefaultValues.SEQ_CONCAT_CHAR)]

            alignments[mg][genome_id] = seq

            marker_length[mg] = len(seq)

    # create marker file
    fout = open(marker_file, 'w')
    for mg in marker_genes:
        fout.write('%s\t%s\t%s\t%d\n' % (mg, mg, mg, marker_length[mg]))
    fout.close()

    # create concatenated alignment
    concatenated_seqs = {}
    for mg in marker_genes:
        seqs = alignments[mg]

        for genome_id in genome_ids:
            if genome_id in seqs:
                # append alignment
                concatenated_seqs[genome_id] = concatenated_seqs.get(
                    genome_id, '') + seqs[genome_id]
            else:
                # missing gene
                concatenated_seqs[genome_id] = concatenated_seqs.get(
                    genome_id, '') + '-' * marker_length[mg]

    # save concatenated alignment
    seq_io.write_fasta(concatenated_seqs, concatenated_alignment_file)
Example #7
    def _read_seq_ids(self, bin_files):
        """Read sequence IDs of all bin files."""
        
        bins = {}
        for bin_file in bin_files:
            bin_id = self.bin_id_from_filename(bin_file)
            bins[bin_id] = set(read_fasta(bin_file).keys())

        return bins
Example #8
 def _parse_sequence_file(self, fna_file, prefix, ssu_query_id):
     metadata = []
     all_genes_dict = read_fasta(fna_file, False)
     sequence = all_genes_dict[ssu_query_id]
     if prefix == 'lsu_silva_23s':
         metadata.append(('lsu_23s_sequence', sequence))
     elif prefix == 'ssu_silva':
         metadata.append(('ssu_sequence', sequence))
     else:
         metadata.append(('{0}_sequence'.format(prefix), sequence))
     return metadata
Example #9
    def run(self, msa, mask, outf):

        outfwriter = open(outf, 'w')
        dict_genomes = read_fasta(msa, False)
        with open(mask, 'r') as f:
            maskstr = f.readline()
        print(maskstr)
        print(len(maskstr))

        for k, v in dict_genomes.items():
            aligned_seq = ''.join([v[i] for i in range(
                0, len(maskstr)) if maskstr[i] == '1'])
            fasta_outstr = ">%s\n%s\n" % (k, aligned_seq)
            outfwriter.write(fasta_outstr)
        outfwriter.close()
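
The mask is a string of '0'/'1' characters, one per alignment column, and only columns flagged '1' are kept. The column-selection rule in isolation, on toy data:

maskstr = '10110'
seq = 'ABCDE'
trimmed = ''.join(seq[i] for i in range(len(maskstr)) if maskstr[i] == '1')
assert trimmed == 'ACD'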
Example #10
    def _producer(self, gene_file):
        """Calculates dinucleotide usage statistics of a genome.

        Parameters
        ----------
        gene_file : str
            Fasta file containing nucleotide sequences.
        """

        genome_id = ntpath.basename(gene_file)
        genome_id = genome_id.replace('.genes.fna', '')
        genome_id = os.path.splitext(genome_id)[0]

        seqs = seq_io.read_fasta(gene_file)
        self.dinucleotide_usage(seqs, genome_id)

        return True
Example #11
    def _producer(self, gene_file):
        """Calculates dinucleotide usage statistics of a genome.

        Parameters
        ----------
        gene_file : str
            Fasta file containing nucleotide sequences.
        """

        genome_id = ntpath.basename(gene_file)
        genome_id = genome_id.replace('.genes.fna', '')
        genome_id = os.path.splitext(genome_id)[0]

        seqs = seq_io.read_fasta(gene_file)
        self.dinucleotide_usage(seqs, genome_id)

        return True
Example #12
    def run(self, msa, mask, marker_list, taxonomy_file, metadata_file,
            output):
        dict_marker = {}
        print "readmsa"
        dict_genomes = read_fasta(msa, False)

        print(len(dict_genomes))

        sub_list_genomes = self.selectGenomes(dict_genomes, taxonomy_file,
                                              metadata_file)

        print(len(sub_list_genomes))

        with open(mask, 'r') as f:
            maskstr = f.readline()

        with open(marker_list, 'r') as f:
            f.readline()
            for line in f:
                list_info = line.split("\t")
                dict_marker[list_info[0]] = int(list_info[3])

        new_mask, output_seqs = self.trim_seqs(dict_genomes, sub_list_genomes,
                                               maskstr, dict_marker)

        if not os.path.exists(output):
            os.makedirs(output)

        # Write mask
        mask_file = open(os.path.join(output, "mask.txt"), 'w')
        mask_file.write(''.join([str(n) for n in new_mask]))
        mask_file.close()

        # Write MSA
        trimmed_file = open(os.path.join(output, "trimmed_sequences.fa"), 'w')
        nbr_aa_seqs = open(os.path.join(output, "number_AA_genomes.tsv"), 'w')
        for genome_id, aligned_seq in output_seqs.items():
            fasta_outstr = ">%s\n%s\n" % (genome_id, aligned_seq)
            trimmed_file.write(fasta_outstr)
            lenaa = len(aligned_seq.replace('-', ''))
            len_outstr = "%s\t%s\t%s\n" % (genome_id, lenaa, len(aligned_seq))
            nbr_aa_seqs.write(len_outstr)
        trimmed_file.close()
        nbr_aa_seqs.close()
Example #13
    def _parse_lsu_5S_files(self, accession, fout, fna_file, summary_file):
        """Parse information from 5S LSU files."""

        # check if a 5S sequence was identified
        if not os.path.exists(fna_file):
            return 0

        # write header
        if self.write_lsu_5S_header:
            fout.write(
                'genome_id\tlsu_5s_query_id\tlsu_5s_length\tlsu_5s_contig_len\tlsu_5s_sequence\n'
            )
            self.write_lsu_5S_header = False

        seqs = read_fasta(fna_file)

        identified_genes = 0
        longest_seq = 0
        longest_seq_id = None
        longest_contig_len = None
        if os.path.exists(summary_file):
            with open(summary_file) as fsum:
                header_line = fsum.readline()  # consume header line
                header_list = [x.strip() for x in header_line.split('\t')]
                idx_seq_len = header_list.index("Sequence length")
                for line in fsum:
                    identified_genes += 1

                    line_split = list(map(str.strip, line.strip().split('\t')))
                    seq_id = line_split[0]
                    contig_len = int(line_split[idx_seq_len])
                    seq_len = len(seqs[seq_id])

                    if seq_len > longest_seq:
                        longest_seq_id = seq_id
                        longest_seq = seq_len
                        longest_contig_len = contig_len

        if longest_seq_id:
            fout.write('%s\t%s\t%d\t%d\t%s\n' %
                       (accession, longest_seq_id, longest_seq,
                        longest_contig_len, seqs[longest_seq_id]))

        return identified_genes
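
Of all 5S hits listed in the summary file, only the single longest sequence is reported. The selection loop above is equivalent to a max() over sequence lengths, shown here on toy data:

seqs = {'5S_rRNA_1': 'ACGT' * 10, '5S_rRNA_2': 'ACGT' * 30}  # toy sequences
longest_seq_id = max(seqs, key=lambda sid: len(seqs[sid]))
assert longest_seq_id == '5S_rRNA_2'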
Example #14
    def trim_msa(self, untrimmed_msa, mask_type, maskid, output_file):
        if maskid == 'bac' and mask_type == 'reference':
            mask = os.path.join(Config.MASK_DIR, Config.MASK_BAC120)
        elif maskid == 'arc' and mask_type == 'reference':
            mask = os.path.join(Config.MASK_DIR, Config.MASK_AR122)
        elif mask_type == 'file':
            mask = maskid
        else:
            raise ValueError(
                'Unknown mask type/id combination: {}, {}'.format(mask_type, maskid))
        with open(mask, 'r') as f:
            maskstr = f.readline()

        outfwriter = open(output_file, 'w')
        dict_genomes = read_fasta(untrimmed_msa, False)

        for k, v in dict_genomes.items():
            aligned_seq = ''.join(
                [v[i] for i in range(0, len(maskstr)) if maskstr[i] == '1'])
            fasta_outstr = ">%s\n%s\n" % (k, aligned_seq)
            outfwriter.write(fasta_outstr)
        outfwriter.close()
        return True
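
The two mask_type modes resolve the mask path differently: 'reference' maps the 'bac'/'arc' maskid to the packaged BAC120/AR122 masks, while 'file' treats maskid as a path to a user-supplied mask. Illustrative calls (file paths are hypothetical):

# packaged bacterial reference mask
self.trim_msa('untrimmed_msa.faa', 'reference', 'bac', 'trimmed_msa.faa')
# user-supplied mask file
self.trim_msa('untrimmed_msa.faa', 'file', '/path/to/custom.mask', 'trimmed_msa.faa')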
Example #15
 def run(self, msa_file, constraint_dir, outfile):
     msa_dict = read_fasta(msa_file)
     outdict = {key: [] for key in msa_dict}
     onlyfiles = [
         os.path.join(constraint_dir, f) for f in os.listdir(constraint_dir)
         if os.path.isfile(os.path.join(constraint_dir, f))
     ]
     for constraintfile in onlyfiles:
         constraintlist = []
         with open(constraintfile) as f:
             for line in f:
                 constraintlist.append(line.strip())
             for k, v in outdict.items():
                 if k in constraintlist:
                     outdict.get(k).append('1')
                 else:
                     outdict.get(k).append('0')
     outf = open(outfile, 'w')
     for outk, outval in outdict.items():
         outf.write(">{0}\n{1}\n".format(outk, ''.join(outval)))
     outf.close()
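
Each constraint file contributes one presence/absence column per genome, so the output FASTA encodes a binary membership matrix. The per-column rule on toy data:

constraintlist = ['g1', 'g3']
outdict = {'g1': [], 'g2': []}
for k in outdict:
    outdict[k].append('1' if k in constraintlist else '0')
assert outdict == {'g1': ['1'], 'g2': ['0']}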
Example #16
    def _producer(self, genome_file):
        """Calculates kmer usage of a genome.

        Parameters
        ----------
        genome_file : str
            Fasta file containing genomic sequences.

        Returns
        -------
        str
           Unique identifier of genome.
        dict : d[kmer] -> count
            Occurrence of each kmer.
        """

        genome_id = ntpath.basename(genome_file)
        genome_id = os.path.splitext(genome_id)[0]

        seqs = seq_io.read_fasta(genome_file)
        kmer_usage = self.signatures.counts(seqs)

        return (genome_id, kmer_usage)
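
A usage sketch for the producer above; the calc object and its signatures helper come from the surrounding class and are hypothetical here:

# hypothetical: run the producer over a single genome file
genome_id, kmer_usage = calc._producer('genomes/GCF_000005845.2.fna')
top5 = sorted(kmer_usage.items(), key=lambda kv: kv[1], reverse=True)[:5]
print(genome_id, top5)  # five most frequent kmers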
Example #17
    def _producer(self, genome_file):
        """Calculates kmer usage of a genome.

        Parameters
        ----------
        genome_file : str
            Fasta file containing genomic sequences.

        Returns
        -------
        str
           Unique identifier of genome.
        dict : d[kmer] -> count
            Occurrence of each kmer.
        """

        genome_id = ntpath.basename(genome_file)
        genome_id = os.path.splitext(genome_id)[0]

        seqs = seq_io.read_fasta(genome_file)
        kmer_usage = self.signatures.calculate(seqs)

        return (genome_id, kmer_usage)
Example #18
    def run(self, msa_file, marker_list):
        """Randomly select a subset of columns from the MSA of each marker."""

        # read multiple sequence alignment
        self.logger.info('Reading multiple sequence alignment.')
        msa = read_fasta(msa_file, False)
        self.logger.info('Read MSA for %d genomes.' % len(msa))

        filtered_seqs, pruned_seqs = self.trim(msa, marker_list)

        self.logger.info(
            'Removed %d taxa with amino acids in <%.1f%% of columns in filtered MSA.'
            % (len(pruned_seqs), self.min_perc_aa))

        # write out trimmed sequences
        filter_file = open(os.path.join(self.output_dir, "filtered_msa.faa"),
                           'w')
        for gid, seq in filtered_seqs.items():
            fasta_outstr = ">%s\n%s\n" % (gid, seq)
            filter_file.write(fasta_outstr)
        filter_file.close()

        self.logger.info('Done.')
Example #19
    def _producer(self, gene_file):
        """Calculates amino acid usage of a genome.

        Parameters
        ----------
        gene_file : str
            Fasta file containing amino acid sequences.

        Returns
        -------
        str
           Unique identifier of genome.
        dict : dict[aa] -> count
            Occurrence of each amino acid.
        """

        genome_id = ntpath.basename(gene_file)
        genome_id = genome_id.replace('.genes.faa', '')
        genome_id = os.path.splitext(genome_id)[0]

        seqs = seq_io.read_fasta(gene_file)
        aa_usage = self.amino_acid_usage(seqs)

        return [genome_id, aa_usage]
Example #20
    def _producer(self, gene_file):
        """Calculates amino acid usage of a genome.

        Parameters
        ----------
        gene_file : str
            Fasta file containing amino acid sequences.

        Returns
        -------
        str
           Unique identifier of genome.
        dict : dict[aa] -> count
            Occurrence of each amino acid.
        """

        genome_id = ntpath.basename(gene_file)
        genome_id = genome_id.replace('.genes.faa', '')
        genome_id = os.path.splitext(genome_id)[0]

        seqs = seq_io.read_fasta(gene_file)
        aa_usage = self.amino_acid_usage(seqs)

        return [genome_id, aa_usage]
Example #21
    def _runHmmMultiAlign(self, db_genome_id, path, marker_ids):
        '''
        Selects markers that are not aligned for a specific genome.

        :param db_genome_id: Selected genome
        :param path: Path to the genomic fasta file for the genome
        :param marker_ids: list of marker ids for the selected sets
        '''

        temp_con = GenomeDatabaseConnection()
        temp_con.MakePostgresConnection()
        temp_cur = temp_con.cursor()

        # gather information for all marker genes
        final_genome = []
        final_markerid = []
        final_seq = []
        final_multihits = []
        final_evalue = []
        final_bitscore = []

        marker_dbs = {"PFAM": self.pfam_top_hit_suffix,
                      "TIGR": self.tigrfam_top_hit_suffix}
        for marker_db, marker_suffix in marker_dbs.items():
            query = ("SELECT m.id_in_database,m.marker_file_location,m.size,m.id " +
                     "FROM genomes as g, markers as m " +
                     "LEFT JOIN marker_databases as md " +
                     "ON md.id=m.marker_database_id " +
                     "WHERE NOT EXISTS (" +
                     "SELECT * FROM aligned_markers as am " +
                     "WHERE am.genome_id = g.id and am.marker_id = m.id) " +
                     "AND g.id = %s " +
                     "AND m.id in %s " +
                     "AND md.external_id_prefix like %s")
            temp_cur.execute(
                query, (db_genome_id, tuple(marker_ids), marker_db))
            raw_results = temp_cur.fetchall()
            marker_dict_original = {
                a: {"path": b, "size": c, "db_marker_id": d} for a, b, c, d in raw_results}

            # get all gene sequences
            genome_path = str(path)
            tophit_path = genome_path.replace(
                self.protein_file_suffix, marker_suffix)

            # we load the list of all the genes detected in the genome
            protein_file = tophit_path.replace(
                marker_suffix, self.protein_file_suffix)
            all_genes_dict = read_fasta(protein_file, False)

            # we store the tophit file line by line and store the
            # information in a dictionary
            with open(tophit_path) as tp:
                # first line is header line
                tp.readline()
                gene_dict = {}
                for line_tp in tp:
                    linelist = line_tp.split("\t")
                    genename = linelist[0]
                    sublist = linelist[1]
                    if ";" in sublist:
                        diff_markers = sublist.split(";")
                    else:
                        diff_markers = [sublist]

                    for each_gene in diff_markers:
                        sublist = each_gene.split(",")
                        markerid = sublist[0]
                        if markerid not in marker_dict_original:
                            continue

                        evalue = sublist[1]
                        # parse the bitscore as a number so hit comparisons
                        # are numeric rather than lexicographic
                        bitscore = float(sublist[2].strip())

                        if markerid in gene_dict:
                            oldbitscore = gene_dict.get(
                                markerid).get("bitscore")
                            if oldbitscore < bitscore:
                                gene_dict[markerid] = {"marker_path": marker_dict_original.get(markerid).get("path"),
                                                       "gene": genename,
                                                       "gene_seq": all_genes_dict.get(genename),
                                                       "evalue": evalue,
                                                       "bitscore": bitscore,
                                                       "multihit": True,
                                                       "db_marker_id": marker_dict_original.get(markerid).get("db_marker_id")}
                            else:
                                gene_dict.get(markerid)["multihit"] = True
                        else:
                            gene_dict[markerid] = {"marker_path": marker_dict_original.get(markerid).get("path"),
                                                   "gene": genename,
                                                   "gene_seq": all_genes_dict.get(genename),
                                                   "evalue": evalue,
                                                   "bitscore": bitscore,
                                                   "multihit": False,
                                                   "db_marker_id": marker_dict_original.get(markerid).get("db_marker_id")}

            for mid, info in marker_dict_original.items():
                if mid not in gene_dict:
                    final_genome.append(db_genome_id)
                    final_markerid.append(info.get("db_marker_id"))
                    final_seq.append("-" * info.get("size"))
                    final_multihits.append(False)
                    final_evalue.append(None)
                    final_bitscore.append(None)

            result_aligns = self._runHmmAlign(gene_dict, db_genome_id)
            for result_align in result_aligns:
                final_genome.append(result_align[0])
                final_markerid.append(result_align[1])
                final_seq.append(result_align[2])
                final_multihits.append(result_align[3])
                final_evalue.append(result_align[4])
                final_bitscore.append(result_align[5])

        if final_genome:
            query = "SELECT upsert_aligned_markers(%s,%s,%s,%s,%s,%s)"
            temp_cur.execute(query, (final_genome,
                                     final_markerid,
                                     final_seq,
                                     final_multihits,
                                     final_evalue,
                                     final_bitscore))
        temp_con.commit()
        temp_cur.close()
        temp_con.ClosePostgresConnection()

        return True
Example #22
    def _producer(self, genome_file):
        """Apply prodigal to genome with most suitable translation table.

        Parameters
        ----------
        genome_file : str
            Fasta file for genome.
        """

        genome_id = remove_extension(genome_file)

        aa_gene_file = os.path.join(self.output_dir, genome_id + '_genes.faa')
        nt_gene_file = os.path.join(self.output_dir, genome_id + '_genes.fna')
        gff_file = os.path.join(self.output_dir, genome_id + '.gff')

        best_translation_table = -1
        table_coding_density = {4: -1, 11: -1}
        if self.called_genes:
            os.system('cp %s %s' % (os.path.abspath(genome_file), aa_gene_file))
        else:
            tmp_dir = tempfile.mkdtemp()

            seqs = read_fasta(genome_file)

            # determine number of bases
            total_bases = 0
            for seq in seqs.values():
                total_bases += len(seq)

            # call genes under different translation tables
            if self.translation_table:
                translation_tables = [self.translation_table]
            else:
                translation_tables = [4, 11]

            for translation_table in translation_tables:
                os.makedirs(os.path.join(tmp_dir, str(translation_table)))
                aa_gene_file_tmp = os.path.join(tmp_dir, str(translation_table), genome_id + '_genes.faa')
                nt_gene_file_tmp = os.path.join(tmp_dir, str(translation_table), genome_id + '_genes.fna')
                gff_file_tmp = os.path.join(tmp_dir, str(translation_table), genome_id + '.gff')

                # check if there are sufficient bases to calculate prodigal parameters
                if total_bases < 100000 or self.meta:
                    proc_str = 'meta'  # use best precalculated parameters
                else:
                    proc_str = 'single'  # estimate parameters from data
                    
                args = '-m'
                if self.closed_ends:
                    args += ' -c'

                cmd = 'prodigal %s -p %s -q -f gff -g %d -a %s -d %s -i %s > %s 2> /dev/null' % (args,
                                                                                            proc_str,
                                                                                            translation_table,
                                                                                            aa_gene_file_tmp,
                                                                                            nt_gene_file_tmp,
                                                                                            genome_file,
                                                                                            gff_file_tmp)
                os.system(cmd)

                # determine coding density
                prodigalParser = ProdigalGeneFeatureParser(gff_file_tmp)

                codingBases = 0
                for seq_id, _seq in seqs.items():
                    codingBases += prodigalParser.coding_bases(seq_id)

                codingDensity = float(codingBases) / total_bases
                table_coding_density[translation_table] = codingDensity

            # determine best translation table
            if not self.translation_table:
                best_translation_table = 11
                if (table_coding_density[4] - table_coding_density[11] > 0.05) and table_coding_density[4] > 0.7:
                    best_translation_table = 4
            else:
                best_translation_table = self.translation_table

            shutil.copyfile(os.path.join(tmp_dir, str(best_translation_table), genome_id + '_genes.faa'), aa_gene_file)
            shutil.copyfile(os.path.join(tmp_dir, str(best_translation_table), genome_id + '_genes.fna'), nt_gene_file)
            shutil.copyfile(os.path.join(tmp_dir, str(best_translation_table), genome_id + '.gff'), gff_file)

            # clean up temporary files
            shutil.rmtree(tmp_dir)

        return (genome_id, aa_gene_file, nt_gene_file, gff_file, best_translation_table, table_coding_density[4], table_coding_density[11])
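
The table-selection heuristic prefers translation table 11 and switches to table 4 only when table 4's coding density beats table 11 by more than 0.05 and itself exceeds 0.7. The decision rule in isolation:

def pick_translation_table(density4, density11):
    # mirrors the heuristic used in _producer above
    if (density4 - density11 > 0.05) and density4 > 0.7:
        return 4
    return 11

assert pick_translation_table(0.90, 0.80) == 4
assert pick_translation_table(0.74, 0.73) == 11  # margin too small
assert pick_translation_table(0.65, 0.50) == 11  # absolute density too low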
Example #23
    def _run_hmm_align(self, genome_ids,
                                genome_dirs,
                                genes_in_genomes,
                                ignore_multi_copy,
                                output_msa_dir,
                                output_model_dir,
                                queue_in,
                                queue_out):
        """Run each marker gene in a separate thread.

        Only the gene with the highest bitscore is used for genomes with
        multiple hits to a given protein family.

        Parameters
        ----------
        genome_ids : iterable
            Genomes of interest.
        genome_dirs : d[assembly_accession] -> directory
            Path to files for individual genomes.
        genes_in_genomes : d[genome_id][family_id] -> [(gene_id_1, bitscore), ..., (gene_id_N, bitscore)]
            Genes within each genome.
        ignore_multi_copy : bool
            Flag indicating if genes with multiple hits should be ignored (True) or the gene with the highest bitscore taken (False).
        output_msa_dir : str
            Output directory for multiple sequence alignment.
        output_model_dir : str
            Output directory for HMMs.
        queue_in : Queue
            Input queue for parallel processing.
        queue_out : Queue
            Output queue for parallel processing.
        """

        while True:
            marker_id = queue_in.get(block=True, timeout=None)
            if marker_id is None:
                break

            marker_seq_file = os.path.join(output_msa_dir, marker_id + '.faa')
            fout = open(marker_seq_file, 'w')
            for genome_id in genome_ids:
                genome_dir = genome_dirs[genome_id]

                assembly = genome_dir[genome_dir.rfind('/') + 1:]
                genes_file = os.path.join(genome_dir, assembly + self.protein_file_ext)
                seqs = seq_io.read_fasta(genes_file)

                hits = genes_in_genomes[genome_id].get(marker_id, None)
                if not hits or (ignore_multi_copy and len(hits) > 1):
                    continue

                # get gene with highest bitscore
                hits.sort(key=lambda x: x[1], reverse=True)
                gene_id, _bitscore = hits[0]

                fout.write('>' + genome_id + DefaultValues.SEQ_CONCAT_CHAR + gene_id + '\n')
                fout.write(seqs[gene_id] + '\n')
            fout.close()

            hmmer = HMMER('align')
            hmmer.align(os.path.join(output_model_dir, marker_id + '.hmm'), marker_seq_file, os.path.join(output_msa_dir, marker_id + '.aln.faa'), trim=False, outputFormat='Pfam')
            self._mask_alignment(os.path.join(output_msa_dir, marker_id + '.aln.faa'), os.path.join(output_msa_dir, marker_id + '.aln.masked.faa'))

            queue_out.put(marker_id)
Example #24
    def run(self, dirin, dirout, gtr, release):
        """ renaming genome files for fastani"""

        # get list of genomes to retain (based on genome list 1014)
        genomes_to_retain = set()
        with open(gtr) as f:
            # f.readline()

            for line in f:
                line_split = line.strip().split('\t')
                genomes_to_retain.add(line_split[0])

        print('Genomes to retain: %d' % len(genomes_to_retain))
        # get mapping from published UBA genomes to NCBI accessions
        __location__ = os.path.realpath(
            os.path.join(os.getcwd(), os.path.dirname(__file__)))

        uba_acc = {}
        with open(os.path.join(__location__, 'uba_ncbi_accessions.tsv')) as ub:
            for line in ub:
                line_split = line.strip().split('\t')
                if line_split[2] != "None":
                    uba_acc[line_split[0]] = {
                        "uba": line_split[1],
                        "gca": 'GB_' + line_split[2]
                    }
                else:
                    uba_acc[line_split[0]] = {"uba": line_split[1]}

        # renaming taxonomy:
        taxout = open(os.path.join(dirout, 'gtdb_taxonomy.tsv'), 'w')
        with open(os.path.join(dirin, 'gtdb_taxonomy.tsv')) as gt:
            for line in gt:
                info = line.strip().split("\t")
                if info[0] in genomes_to_retain:
                    if info[0].startswith("U_"):
                        subdict = uba_acc.get(info[0])
                        if "gca" in subdict.keys():
                            taxout.write("{0}\t{1}\n".format(
                                subdict.get("gca"), info[1]))
                        else:
                            taxout.write("{0}\t{1}\n".format(
                                subdict.get("uba"), info[1]))
                    else:
                        taxout.write(line)
        taxout.close()

        # renaming genome files for fastani
        fastanis = glob.glob(os.path.join(dirin, 'fastani', "*"))
        fastani_dir = os.path.join(dirout, 'fastani')
        if not os.path.exists(fastani_dir):
            os.makedirs(fastani_dir)
        for genome in fastanis:
            filenamef = os.path.basename(genome)
            filenamef = filenamef.replace("_genomic.fna", "")
            if filenamef.startswith("U_"):
                subdict = uba_acc.get(filenamef)
                if filenamef == "U_74684":
                    print(subdict)
                    print(genome)
                    print(
                        os.path.join(fastani_dir,
                                     subdict.get("gca")[3:] + "_genomic.fna"))
                if "gca" in subdict.keys():
                    copyfile(
                        genome,
                        os.path.join(fastani_dir,
                                     subdict.get("gca")[3:] + "_genomic.fna"))
                else:
                    copyfile(
                        genome,
                        os.path.join(fastani_dir,
                                     subdict.get("uba") + "_genomic.fna"))
            else:
                copyfile(genome,
                         os.path.join(fastani_dir, filenamef + "_genomic.fna"))

        for dom in ['bac120', 'ar122']:
            # MSA renaming
            msadir = os.path.join(dirout, dom, 'msa')
            if not os.path.exists(msadir):
                os.makedirs(msadir)
            msa_dict = read_fasta(
                os.path.join(dirin, dom, 'gtdb_concatenated.faa'))
            seqout = open(
                os.path.join(msadir, 'gtdb_r' + release + '_' + dom + '.faa'),
                'w')
            for id, seq in msa_dict.items():
                if id in genomes_to_retain:
                    if id.startswith("U_"):
                        subdict = uba_acc.get(id)
                        if "gca" in subdict.keys():
                            seqout.write(">{0}\n{1}\n".format(
                                subdict.get("gca"), seq))
                        else:
                            seqout.write(">{0}\n{1}\n".format(
                                subdict.get("uba"), seq))
                    else:
                        seqout.write(">{0}\n{1}\n".format(id, seq))
            seqout.close()

            # PPLACER renaming
            pplacerdir = os.path.join(dirout, dom, 'pplacer')
            if not os.path.exists(pplacerdir):
                os.makedirs(pplacerdir)

            trees = glob.glob(os.path.join(dirin, dom, 'pplacer', "*.tree"))
            if len(trees) != 1:
                print("Error: expected exactly one .tree file in %s" %
                      os.path.join(dirin, dom, 'pplacer'))
                sys.exit(1)
            treef = trees[0]
            fastas = glob.glob(os.path.join(dirin, dom, 'pplacer', "*.fa"))
            if len(fastas) != 1:
                print("Error: expected exactly one .fa file in %s" %
                      os.path.join(dirin, dom, 'pplacer'))
                sys.exit(1)
            seqfile = fastas[0]
            logs = glob.glob(os.path.join(dirin, dom, 'pplacer', "*.log"))
            if len(logs) != 1:
                print("Error: expected exactly one .log file in %s" %
                      os.path.join(dirin, dom, 'pplacer'))
                sys.exit(1)
            logfile = logs[0]

            # produce corrected tree
            tree = dendropy.Tree.get_from_path(os.path.join(treef),
                                               schema='newick',
                                               rooting='force-rooted',
                                               preserve_underscores=True)
            for n in tree.leaf_node_iter():
                if n.taxon.label.startswith("U_"):
                    subdict = uba_acc.get(n.taxon.label)
                    if "gca" in subdict.keys():
                        n.taxon.label = subdict.get("gca")
                    else:
                        n.taxon.label = subdict.get("uba")
            tree.write_to_path(os.path.join(dirout, dom, 'pplacer',
                                            dom + "_r" + release + ".tree"),
                               schema='newick',
                               suppress_rooting=True,
                               unquoted_underscores=True)

            trimmed_seqout = open(
                os.path.join(dirout, dom, 'pplacer',
                             'trimmed_msa_' + dom + '.faa'), 'w')
            trimmed_fasta = read_fasta(seqfile)
            for id, seq in trimmed_fasta.items():
                if id in genomes_to_retain:
                    if id.startswith("U_"):
                        subdict = uba_acc.get(id)
                        if "gca" in subdict.keys():
                            trimmed_seqout.write(">{0}\n{1}\n".format(
                                subdict.get("gca"), seq))
                        else:
                            trimmed_seqout.write(">{0}\n{1}\n".format(
                                subdict.get("uba"), seq))
                    else:
                        trimmed_seqout.write(">{0}\n{1}\n".format(id, seq))
            trimmed_seqout.close()

            logoutf = open(
                os.path.join(dirout, dom, 'pplacer',
                             'fitting_' + dom + '.log'), 'w')
            with open(logfile) as logfin:
                for line in logfin:
                    for k, subdict in uba_acc.items():
                        if "gca" in subdict.keys():
                            line = line.replace(k + ":",
                                                subdict.get("gca") + ":")
                        else:
                            line = line.replace(k + ":",
                                                subdict.get("uba") + ":")
                    logoutf.write(line)
            logoutf.close()
Example #25
    def run(self, outf):

        # Check if all directories are here
        actual_dirs = os.listdir(self.pack_dir)
        if len(actual_dirs) != len(self.list_dirsinpackage):
            print('ERROR: unexpected number of directories in package')
        if len(set(actual_dirs) & set(self.list_dirsinpackage)) != len(
                self.list_dirsinpackage):
            print('ERROR: expected directories are missing from package')

        with open(os.path.join(self.pack_dir, 'metadata',
                               'metadata.txt')) as metafile:
            for line in metafile:
                if line.startswith('VERSION_DATA'):
                    version = line.strip().split('=')[1]

        # List genomes in fastani folder
        list_genomes = glob.glob(
            os.path.join(self.pack_dir, 'fastani', 'database/*.gz'))

        # Archaeal genome MSA is untrimmed
        ar_msa_file = glob.glob(os.path.join(self.pack_dir,
                                             'msa/*ar122.faa'))[0]
        ar_msa = read_fasta(ar_msa_file)
        first_seq = ar_msa[next(iter(ar_msa))]
        if len(first_seq) != 32675:
            print('ERROR: len(first_seq) != 32675')

        # Bacterial genome MSA is untrimmed
        bac_msa_file = glob.glob(os.path.join(self.pack_dir,
                                              'msa/*bac120.faa'))[0]
        bac_msa = read_fasta(bac_msa_file)
        first_seq = bac_msa[next(iter(bac_msa))]
        if len(first_seq) != 41155:
            print('ERROR: len(first_seq) != 41155')

        # Bacterial MASK is same length as the untrimmed bacterial genomes
        bac_mask_file = glob.glob(
            os.path.join(self.pack_dir, 'masks/*bac120.mask'))[0]
        bac_mask = ''
        with open(bac_mask_file) as bmf:
            bac_mask = bmf.readline()
        if len(bac_mask) != 41155:
            print('ERROR: len(bac_mask) != 41155')

        # Archaeal MASK is same length as the untrimmed archaeal genomes
        ar_mask_file = glob.glob(
            os.path.join(self.pack_dir, 'masks/*ar122.mask'))[0]
        ar_mask = ''
        with open(ar_mask_file) as amf:
            ar_mask = amf.readline()
        if len(ar_mask) != 32675:
            print('ERROR: len(ar_mask) != 32675')

        # Archaeal Pplacer MSA should have the same number of genomes as the
        # Archaeal untrimmed MSA
        ar_pplacer_msa_file = glob.glob(
            os.path.join(self.pack_dir, 'pplacer',
                         'gtdb_' + version + '_ar122.refpkg',
                         'trimmed_msa_ar122.faa'))[0]
        ar_pplacer_msa = read_fasta(ar_pplacer_msa_file)
        if len(ar_pplacer_msa) != len(ar_msa):
            print('ERROR: len(ar_pplacer_msa) != len(ar_msa)')
            print('len(ar_pplacer_msa): {}'.format(len(ar_pplacer_msa)))
            print('len(ar_msa): {}'.format(len(ar_msa)))
            print('difference genomes: {}'.format(
                list(set(ar_msa.keys()).difference(ar_pplacer_msa.keys()))))
        first_seq = ar_pplacer_msa[next(iter(ar_pplacer_msa))]
        # Archaeal Pplacer MSA should have the same length as the Archaeal mask
        if len(first_seq) != len([a for a in ar_mask if a == '1']):
            print('ERROR: len(first_seq) != len([a for a in ar_mask if a == 1])')
            print('len(first_seq): {}'.format(len(first_seq)))
            print('len([a for a in ar_mask if a == 1]): {}'.format(
                len([a for a in ar_mask if a == '1'])))

        # Bacterial Pplacer MSA should have the same number of genomes as the
        # Bacterial untrimmed MSA
        bac_pplacer_msa_file = os.path.join(
            self.pack_dir, 'pplacer', 'gtdb_' + version + '_bac120.refpkg',
            'trimmed_msa_bac120.faa')
        bac_pplacer_msa = read_fasta(bac_pplacer_msa_file)
        if len(bac_pplacer_msa) != len(bac_msa):
            print('ERROR: len(bac_pplacer_msa) != len(bac_msa)')
            print('len(bac_pplacer_msa): {}'.format(len(bac_pplacer_msa)))
            print('len(bac_msa): {}'.format(len(bac_msa)))
            print('difference genomes: {}'.format(
                list(set(bac_msa.keys()).difference(bac_pplacer_msa.keys()))))
        first_seq = bac_pplacer_msa[next(iter(bac_pplacer_msa))]
        # Bacterial Pplacer MSA should have the same length as the Bacterial
        # mask
        if len(first_seq) != len([a for a in bac_mask if a == '1']):
            print('ERROR: len(first_seq) != len([a for a in bac_mask if a == 1])')
            print('len(first_seq): {}'.format(len(first_seq)))
            print('len([a for a in bac_mask if a == 1]): {}'.format(
                len([a for a in bac_mask if a == '1'])))

        # Archaeal tree should have the same number of leaves as the number of
        # genomes in the MSA
        arc_tree = dendropy.Tree.get_from_path(os.path.join(
            self.pack_dir, 'pplacer', 'gtdb_' + version + '_ar122.refpkg',
            'ar122_' + version + '.tree'),
                                               schema='newick',
                                               rooting='force-rooted',
                                               preserve_underscores=True)
        list_leaves = arc_tree.leaf_nodes()
        if len(list_leaves) != len(ar_pplacer_msa):
            print('ERROR: len(list_leaves) != len(ar_pplacer_msa)')
            print('len(list_leaves): {}'.format(len(list_leaves)))
            print('len(ar_pplacer_msa): {}'.format(len(ar_pplacer_msa)))

        # Bacterial tree should have the same number of leaves as the number of
        # genomes in the MSA
        bac_tree = dendropy.Tree.get_from_path(os.path.join(
            self.pack_dir, 'pplacer', 'gtdb_' + version + '_bac120.refpkg',
            'bac120_' + version + '.tree'),
                                               schema='newick',
                                               rooting='force-rooted',
                                               preserve_underscores=True)
        list_leaves = bac_tree.leaf_nodes()
        if len(list_leaves) != len(bac_pplacer_msa):
            print('ERROR: len(list_leaves) != len(bac_pplacer_msa)')
            print('len(list_leaves): {}'.format(len(list_leaves)))
            print('len(bac_pplacer_msa): {}'.format(len(bac_pplacer_msa)))

        # Taxonomy file should have as many genomes as bac120 and ar122 MSA
        # combined
        tax_file = os.path.join(self.pack_dir, 'taxonomy', 'gtdb_taxonomy.tsv')
        tax_dict = {}
        with open(tax_file) as tf:
            for line in tf:
                infos = line.strip().split('\t')
                tax_dict[infos[0]] = infos[1]
        if len(tax_dict) != (len(ar_msa) + len(bac_msa)):
            print('ERROR: len(tax_dict) != (len(ar_msa) + len(bac_msa))')
            print('len(tax_dict): {}'.format(len(tax_dict)))
            print('len(ar_msa) + len(bac_msa): {}'.format(
                len(ar_msa) + len(bac_msa)))

        # Radii file should have as many genomes as bac120 and ar122 MSA
        # combined
        radii_file = os.path.join(self.pack_dir, 'radii', 'gtdb_radii.tsv')
        radii_dict = {}
        with open(radii_file) as rf:
            for line in rf:
                infos = line.strip().split('\t')
                radii_dict[infos[1]] = infos[2]
        if len(radii_dict) != (len(ar_msa) + len(bac_msa)):
            print('ERROR: len(radii_dict) != (len(ar_msa) + len(bac_msa))')
            print('len(radii_dict): {}'.format(len(radii_dict)))
            print('len(ar_msa) + len(bac_msa): {}'.format(
                len(ar_msa) + len(bac_msa)))
        if len(set(radii_dict.keys()).symmetric_difference(
                tax_dict.keys())) != 0:
            print('ERROR: radii and taxonomy files list different genomes')
            print('set(radii_dict.keys()).symmetric_difference(tax_dict.keys()): {}'.format(
                set(radii_dict.keys()).symmetric_difference(tax_dict.keys())))

        if len(list_genomes) != len(radii_dict):
            print('ERROR: len(list_genomes) != len(radii_dict)')
            print('len(list_genomes): {}'.format(len(list_genomes)))
            print('len(radii_dict): {}'.format(len(radii_dict)))

        print('\n\nVERSION: {}'.format(version))
        print('Length trimmed bac120 MSA: {}'.format(
            len(bac_pplacer_msa[next(iter(bac_pplacer_msa))])))
        print('Length trimmed ar122 MSA: {}'.format(
            len(ar_pplacer_msa[next(iter(ar_pplacer_msa))])))
        print('')
        print('Number of genomes in fastani/database: {}'.format(
            len(list_genomes)))
        print('Number of genomes in radii file: {}'.format(len(radii_dict)))
        print('Number of genomes in taxonomy file: {}'.format(len(tax_dict)))

        print('Would you like to archive the folder? ')
        # input() returns the empty string for "enter"

        yes = {'yes', 'y', 'yep', ''}
        no = {'no', 'n'}

        choice = input().lower()
        if choice in yes:
            with tarfile.open(outf, "w:gz") as tar:
                packdir = copy.copy(self.pack_dir)
                if packdir.endswith('/'):
                    packdir = packdir[:-1]
                tar.add(self.pack_dir, arcname=os.path.basename(packdir))
        elif choice in no:
            return False
        else:
            sys.stdout.write("Please respond with 'yes' or 'no'")
Example #26
    def compare(self, bin_files1, bin_files2, assembly_file, output_file):
        """Compare bins from two different binning methods."""
        
        # determine total number of sequences
        self.logger.info('Reading bins.')
        seqs = read_fasta(assembly_file)

        seq_lens = {}
        total_bases = 0
        num_seq1K = 0
        total_bases1K = 0
        num_seq5K = 0
        total_bases5K = 0
        for seq_id, seq in seqs.items():
            seq_len = len(seq)
            seq_lens[seq_id] = seq_len
            total_bases += seq_len
            if seq_len >= 1000:
                num_seq1K += 1
                total_bases1K += seq_len
            if seq_len >= 5000:
                num_seq5K += 1
                total_bases5K += seq_len

        # determine sequences in each bin
        bins1 = self._read_seq_ids(bin_files1)
        bins2 = self._read_seq_ids(bin_files2)

        # determine bin stats
        bin_stats1, total_uniq_binned_seqs1, total_uniq_binned_bases1, num_repeats1 = self._binning_stats(bins1, seq_lens)
        bin_stats2, total_uniq_binned_seqs2, total_uniq_binned_bases2, num_repeats2 = self._binning_stats(bins2, seq_lens)

        # sort bins by size
        bin_stats1 = sorted(bin_stats1.items(), key=lambda x: x[1][1], reverse=True)
        bin_stats2 = sorted(bin_stats2.items(), key=lambda x: x[1][1], reverse=True)

        # report summary results
        print()
        print('Assembled sequences = %d (%.2f Mbp)' % (len(seqs), total_bases / 1e6))
        print('  No. seqs > 1 kbp = %d (%.2f Mbp)' % (num_seq1K, total_bases1K / 1e6))
        print('  No. seqs > 5 kbp = %d (%.2f Mbp)' % (num_seq5K, total_bases5K / 1e6))
        print()
        print('Binning statistics:')
        print('  1) No. bins: %s, No. binned seqs: %d (%.2f%%), No. binned bases: %.2f Mbp (%.2f%%), No. seqs in multiple bins: %d'
                                % (len(bins1),
                                   total_uniq_binned_seqs1,
                                   total_uniq_binned_seqs1 * 100 / len(seqs),
                                   total_uniq_binned_bases1 / 1e6,
                                   total_uniq_binned_bases1 * 100 / total_bases,
                                   num_repeats1))
        print('  2) No. bins: %s, No. binned seqs: %d (%.2f%%), No. binned bases: %.2f Mbp (%.2f%%), No. seqs in multiple bins: %d'
                                % (len(bins2),
                                   total_uniq_binned_seqs2,
                                   total_uniq_binned_seqs2 * 100 / len(seqs),
                                   total_uniq_binned_bases2 / 1e6,
                                   total_uniq_binned_bases2 * 100 / total_bases,
                                   num_repeats2))
        print()

        # output report
        fout = open(output_file, 'w')
        for data in bin_stats2:
            fout.write('\t' + data[0])
        fout.write('\tUnbinned\tNo. Sequences\tNo. Bases (Mbp)\tBest Match\tBases in Common (%)\tSequences in Common (%)\n')

        total_seqs_in_common2 = defaultdict(int)
        max_bases_in_common2 = defaultdict(int)
        max_seqs_in_common2 = defaultdict(int)
        best_matching_bins2 = {}
        binned_seqs2 = defaultdict(set)
        for data1 in bin_stats1:
            bin_id1 = data1[0]
            fout.write(bin_id1)

            seqs1 = bins1[bin_id1]

            max_bases_in_common = 0
            max_seqs_in_common = 0
            best_matching_bin = 'n/a'
            binned_seqs = set()
            for data2 in bin_stats2:
                bin_id2 = data2[0]
                seqs2 = bins2[bin_id2]

                seqs_in_common = seqs1.intersection(seqs2)
                binned_seqs.update(seqs_in_common)
                num_seqs_in_common = len(seqs_in_common)
                fout.write('\t' + str(num_seqs_in_common))

                bases_in_common = 0
                for seq_id in seqs_in_common:
                    bases_in_common += seq_lens[seq_id]

                if bases_in_common > max_bases_in_common:
                    max_bases_in_common = bases_in_common
                    max_seqs_in_common = num_seqs_in_common
                    best_matching_bin = bin_id2

                if bases_in_common > max_bases_in_common2[bin_id2]:
                    max_bases_in_common2[bin_id2] = bases_in_common
                    max_seqs_in_common2[bin_id2] = num_seqs_in_common
                    best_matching_bins2[bin_id2] = bin_id1

                binned_seqs2[bin_id2].update(seqs_in_common)
            fout.write('\t%d\t%d\t%.2f\t%s\t%.2f\t%.2f\n' % (len(seqs1) - len(binned_seqs),
                                                             data1[1][0],
                                                             data1[1][1] / 1e6,
                                                             best_matching_bin,
                                                             max_bases_in_common * 100 / data1[1][1],
                                                             max_seqs_in_common * 100 / data1[1][0],
                                                             ))

        fout.write('Unbinned')
        for data in bin_stats2:
            binId = data[0]
            fout.write('\t%d' % (len(bins2[binId]) - len(binned_seqs2[binId])))
        fout.write('\n')

        fout.write('No. Sequences')
        for data in bin_stats2:
            fout.write('\t%d' % data[1][0])
        fout.write('\n')

        fout.write('No. Bases (Mbp)')
        for data in bin_stats2:
            fout.write('\t%.2f' % (data[1][1] / 1e6))
        fout.write('\n')

        fout.write('Best Match')
        for data in bin_stats2:
            binId = data[0]
            fout.write('\t%s' % best_matching_bins2.get(binId, 'n/a'))
        fout.write('\n')

        fout.write('Bases in Common (%)')
        for data in bin_stats2:
            binId = data[0]
            fout.write('\t%.2f' % (max_bases_in_common2[binId] * 100 / data[1][1]))
        fout.write('\n')

        fout.write('Sequences in Common (%)')
        for data in bin_stats2:
            binId = data[0]
            fout.write('\t%.2f' % (max_seqs_in_common2[binId] * 100 / data[1][0]))
        fout.write('\n')

        fout.close()
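
Best-matching bins are decided by shared bases rather than shared sequence counts, so one long contig can outweigh many short ones. The core of that rule on toy bins:

seq_lens = {'c1': 5000, 'c2': 1000, 'c3': 4000}
bin_a = {'c1', 'c2'}
bin_b = {'c2', 'c3'}
seqs_in_common = bin_a & bin_b
bases_in_common = sum(seq_lens[s] for s in seqs_in_common)
assert bases_in_common == 1000  # only c2 is shared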
Example #27
    def _trim_seqs(self,
                   input_msa,
                   output_msa,
                   remove_identical=False,
                   min_per_taxa=0.5,
                   min_bp=1000):
        """Trim ends of sequences.

        input_msa : str
            File with MSA to trim.
        output_msa : str
            New file with trimmed MSA.
        remove_identical : boolean
            Flag indicating if identical sequence should be removed.
        min_per_taxa : float [0, 1.0]
            Minimum required taxa to retain leading and trailing columns.
        min_bp : int
            Minimum required length to retain sequence.
        """

        # read seqs
        seqs = seq_io.read_fasta(input_msa)

        # filter identical seqs
        identical_seqs = set()
        if remove_identical:
            self.logger.info('Filtering identical sequences.')

            seq_ids = list(seqs.keys())
            for i in range(0, len(seq_ids)):
                seq_id_I = seq_ids[i]

                if seq_id_I in identical_seqs:
                    continue

                for j in range(i + 1, len(seq_ids)):
                    seq_id_J = seq_ids[j]
                    if seqs[seq_id_I] == seqs[seq_id_J]:
                        self.logger.info('Seq %s and %s are identical.' %
                                         (seq_id_I, seq_id_J))
                        identical_seqs.add(seq_id_J)

            self.logger.info('Identified %d of %d sequences as identical.' %
                             (len(identical_seqs), len(seqs)))

        # trim start and end columns to consensus alignment
        first_char = []
        last_char = []
        for seq_id, seq in seqs.items():
            if seq_id in identical_seqs:
                continue

            for i, ch in enumerate(seq):
                if ch != '.' and ch != '-':
                    first_char.append(i)
                    break

            for i in range(len(seq) - 1, -1, -1):
                if seq[i] != '.' and seq[i] != '-':
                    last_char.append(i)
                    break

        first_char.sort()
        last_char.sort(reverse=True)

        # quantile index over the retained (non-identical) sequences
        trim_index = int((len(seqs) - len(identical_seqs)) * min_per_taxa)

        start = first_char[trim_index]
        end = last_char[trim_index]

        self.logger.info(
            'Trimming alignment to columns %d to %d, leaving a %d bp alignment.' %
            (start, end, end - start + 1))

        short_seq_file = output_msa + '.short'
        fout = open(output_msa, 'w')
        fout_short = open(short_seq_file, 'w')
        num_filtered_seq = 0
        for seq_id, seq in seqs.items():
            if seq_id in identical_seqs:
                continue

            valid_bp = 0
            for i in range(start, min(len(seq), end + 1)):
                ch = seq[i]
                if ch != '.' and ch != '-':
                    valid_bp += 1

            if valid_bp >= min_bp:
                fout.write('>' + seq_id + '\n')
                fout.write(seq[start:end + 1] + '\n')
            else:
                self.logger.info(
                    'Filtering seq %s with %d of %d (%.1f%%) aligned bases.' %
                    (seq_id, valid_bp,
                     (end - start + 1), valid_bp * 100.0 / (end - start + 1)))
                num_filtered_seq += 1
                fout_short.write('>' + seq_id + '\n')
                fout_short.write(seq[start:end + 1] + '\n')

        fout.close()
        fout_short.close()

        self.logger.info('Filtered %d of %d sequences due to length.' %
                         (num_filtered_seq, len(seqs) - len(identical_seqs)))
        self.logger.info('Short sequences written to: %s' % short_seq_file)
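The trimming above keeps only alignment columns supported by at least min_per_taxa of the retained sequences. A self-contained sketch of the column-bound selection, assuming gap characters are '.' or '-', every sequence has at least one aligned position, and min_per_taxa is below 1.0:

# Sketch: find the consensus start and end columns of an alignment.
def consensus_bounds(seqs, min_per_taxa=0.5):
    firsts, lasts = [], []
    for seq in seqs.values():
        firsts.append(min(i for i, ch in enumerate(seq) if ch not in '.-'))
        lasts.append(max(i for i, ch in enumerate(seq) if ch not in '.-'))
    firsts.sort()
    lasts.sort(reverse=True)
    idx = int(len(seqs) * min_per_taxa)  # quantile of taxa that must be aligned
    return firsts[idx], lasts[idx]

aln = {'a': '--ACGT--', 'b': '-AACGTT-', 'c': 'AAACGTTT'}
print(consensus_bounds(aln, 0.5))  # (1, 6): columns where >=50% of taxa align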
Example #28
    def run(self, homolog_file, min_per_taxa, consensus, min_per_bp,
            use_trimAl, msa_program, output_dir):
        """Create multiple sequence alignment.

        Parameters
        ----------
        homolog_file : str
            File containing sequences to align
        min_per_taxa : float
            Minimum percentage of taxa required to retain a column.
        consensus : float
            Minimum percentage of the same amino acid required to retain column.
        min_per_bp : float
            Minimum percentage of base pairs required to keep trimmed sequence.
        use_trimAl : boolean
            Filter columns using trimAl.
        msa_program : str
            Program to use for multiple sequence alignment ['mafft', 'muscle'].
        output_dir : str
            Directory to store results.
        """

        # infer multiple sequence alignment
        self.logger.info('Inferring multiple sequence alignment with %s.' %
                         msa_program)

        output_file = ntpath.basename(homolog_file)
        prefix = output_file[0:output_file.rfind('.')]
        suffix = output_file[output_file.rfind('.') + 1:]

        msa_output = os.path.join(output_dir, prefix + '.aligned.' + suffix)
        if msa_program == 'mafft':
            mafft = Mafft(self.cpus)
            msa_log = os.path.join(output_dir, 'mafft.log')
            mafft.run(homolog_file, msa_output, msa_log)
        elif msa_program == 'muscle':
            muscle = Muscle()
            msa_log = os.path.join(output_dir, 'muscle.log')
            muscle.run(homolog_file, msa_output, msa_log)

        # trim multiple sequence alignment
        trimmed_msa_output = os.path.join(
            output_dir, prefix + '.trimmed.aligned.' + suffix)
        if use_trimAl:
            self.logger.info(
                'Using trimAl to filter poorly represented columns from alignment.'
            )

            # convert MSA to relaxed phylip format
            phylip_msa_output = msa_output.replace('.faa', '.phyx')
            cmd = 'seqmagick convert %s %s' % (msa_output, phylip_msa_output)
            os.system(cmd)

            tmp_output = os.path.join(output_dir, 'tmp.faa')
            cmd = 'trimal -in %s -out %s -automated1 -fasta' % (
                phylip_msa_output, tmp_output)
            os.system(cmd)

            cmd = 'trimal -in %s -out %s -resoverlap 0.75 -seqoverlap %f' % (
                tmp_output, trimmed_msa_output, min_per_bp)
            os.system(cmd)

            seqs = seq_io.read_fasta(msa_output)
            tmp_seqs = seq_io.read_fasta(tmp_output)
            trimmed_seqs = seq_io.read_fasta(trimmed_msa_output)
            self.logger.info(
                'Trimmed alignment from %d to %d AA.' %
                (len(seqs.values()[0]), len(trimmed_seqs.values()[0])))
            self.logger.info(
                '%d of %d taxa were deemed to be too short and removed.' %
                (len(tmp_seqs) - len(trimmed_seqs), len(seqs)))
            os.remove(tmp_output)
        else:
            self.logger.info(
                'Trimming poorly represented columns from alignment.')
            seqs = seq_io.read_fasta(msa_output, keep_annotation=True)
            trimmed_seqs, pruned_seqs, min_taxa_filtered, consensus_filtered = seq_tk.trim_seqs(
                seqs, min_per_taxa / 100.0, consensus / 100.0,
                min_per_bp / 100.0)

            self.logger.info(
                'Trimmed alignment from %d to %d AA (%d by minimum taxa percent, %d by consensus).'
                % (len(seqs.values()[0]), len(trimmed_seqs.values()[0]),
                   min_taxa_filtered, consensus_filtered))
            self.logger.info(
                '%d of %d taxa were deemed to be too short and removed.' %
                (len(pruned_seqs), len(seqs)))

            if len(pruned_seqs) > 0:
                prune_seqs_out = os.path.join(output_dir,
                                              'filtered_seqs.too_short.txt')
                self.logger.info('Pruned sequences written to %s.' %
                                 prune_seqs_out)
                seq_io.write_fasta(pruned_seqs, prune_seqs_out)

            if len(pruned_seqs) == len(seqs):
                self.logger.error(
                    'Too many sequences were pruned. Gene tree cannot be inferred.'
                )
                sys.exit()

            seq_io.write_fasta(trimmed_seqs, trimmed_msa_output)

        return trimmed_msa_output
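The output naming convention used above is easy to trace in isolation. A small sketch of how the aligned and trimmed file names are derived from the input homolog file (POSIX paths shown in the expected output):

# Sketch: output file naming used by the workflow above.
import ntpath
import os

def msa_output_names(homolog_file, output_dir):
    base = ntpath.basename(homolog_file)
    prefix = base[0:base.rfind('.')]
    suffix = base[base.rfind('.') + 1:]
    aligned = os.path.join(output_dir, prefix + '.aligned.' + suffix)
    trimmed = os.path.join(output_dir, prefix + '.trimmed.aligned.' + suffix)
    return aligned, trimmed

print(msa_output_names('homologs.faa', 'msa_out'))
# ('msa_out/homologs.aligned.faa', 'msa_out/homologs.trimmed.aligned.faa')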
Example #29
    def _parse_taxonomy_file(self, genome_id, metadata_taxonomy_file, fout, prefix, fna_file, summary_file=None):
        """Parse metadata file with taxonomic information for 16S rRNA genes.

        Parameters
        ----------
        genome_id : str
          Unique identifier of genome.
        metadata_taxonomy_file : str
          Full path to file containing 16S rRNA metadata.
        fout : file
          Output stream to populate with metadata.
        prefix : str
          Prefix to append to metadata fields.
        fna_file : str
          Fasta file with the gene sequences for the genome.
        summary_file : str, optional
          Summary file with the length of each identified gene.

        Returns
        -------
        int
          Number of 16S rRNA genes identified in genome.
        """

        if not os.path.exists(metadata_taxonomy_file):
            return 0

        with open(metadata_taxonomy_file) as f:
            header_line = f.readline()  # consume header line
            if prefix not in self.taxonomy_headers:
                self.taxonomy_headers.add(prefix)

                fout.write('genome_id')
                headers = [prefix + '_' + x.strip().replace('ssu_', '') for x in header_line.split('\t')]
                fout.write('\t' + '\t'.join(headers))
                fout.write('\t{0}_sequence\t{0}_contig_len\n'.format(prefix))

            # determine the position of the query_id column
            split_headers = header_line.rstrip().split("\t")
            for pos in range(0, len(split_headers)):
                header = split_headers[pos]
                if header == 'query_id':
                    query_id_pos = pos
                    break

            # Report hit to longest 16S rRNA gene. It is possible that
            # the HMMs identified a putative 16S rRNA gene, but that
            # there was no valid BLAST hit.
            longest_query_len = 0
            longest_ssu_hit_info = None
            identified_ssu_genes = 0
            for line in f:
                line_split = line.strip().split('\t')
                query_len = int(line_split[2])
                if query_len > longest_query_len:
                    longest_query_len = query_len
                    longest_ssu_hit_info = line_split
                    ssu_query_id = line_split[query_id_pos]

            if longest_ssu_hit_info:
                fout.write(genome_id)
                fout.write('\t' + '\t'.join(longest_ssu_hit_info))

                all_genes_dict = read_fasta(fna_file, False)
                sequence = all_genes_dict[ssu_query_id]
                fout.write('\t{0}'.format(sequence))
                if summary_file is not None and os.path.exists(summary_file):
                    with open(summary_file) as fsum:
                        header_line = fsum.readline()  # consume header line
                        header_list = [x.strip() for x in header_line.split('\t')]
                        idx_seq = header_list.index("Sequence length")
                        for line in fsum:
                            identified_ssu_genes += 1
                            sum_list = [x.strip() for x in line.split('\t')]
                            if sum_list[0] == ssu_query_id:
                                fout.write("\t{0}".format(sum_list[idx_seq]))

                fout.write('\n')

            return identified_ssu_genes
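The heart of this parser is selecting the longest hit from the metadata rows. A minimal sketch, assuming the third column holds the query length (as the int(line_split[2]) above implies); the row data is illustrative:

# Sketch: select the row with the longest query from parsed metadata rows.
def longest_hit(rows, length_pos=2):
    best, best_len = None, 0
    for row in rows:
        query_len = int(row[length_pos])
        if query_len > best_len:
            best, best_len = row, query_len
    return best  # None when there are no hits

rows = [['g1_ssu1', 'd__Bacteria', '950'],
        ['g1_ssu2', 'd__Bacteria', '1420']]
print(longest_hit(rows))  # the 1420 bp hit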
Example #30
    def run(self, genomes, align_dir, out_dir, prefix, debugopt=False):
        """Classify genomes based on position in reference tree."""
        try:
            for marker_set_id in ('bac120', 'ar122'):
            for marker_set_id in ('bac120', 'ar122'):
                user_msa_file = os.path.join(
                    align_dir, prefix + '.%s.user_msa.fasta' % marker_set_id)
                if not os.path.exists(user_msa_file):
                    # file will not exist if there are no User genomes from a given domain
                    continue

                classify_tree = self.place_genomes(user_msa_file,
                                                   marker_set_id, out_dir,
                                                   prefix)

                # get taxonomic classification of each user genome
                tree = dendropy.Tree.get_from_path(classify_tree,
                                                   schema='newick',
                                                   rooting='force-rooted',
                                                   preserve_underscores=True)

                gtdb_taxonomy = Taxonomy().read(self.taxonomy_file)

                fout = open(
                    os.path.join(
                        out_dir,
                        prefix + '.%s.classification.tsv' % marker_set_id),
                    'w')
                fastaniout = open(
                    os.path.join(
                        out_dir,
                        prefix + '.%s.fastani_results.tsv' % marker_set_id),
                    'w')
                redfout = open(
                    os.path.join(out_dir,
                                 prefix + '.%s.summary.tsv' % marker_set_id),
                    'w')
                if debugopt:
                    parchiinfo = open(
                        os.path.join(
                            out_dir,
                            prefix + '.%s.debug_file.tsv' % marker_set_id),
                        'w')

                reddictfile = open(
                    os.path.join(
                        out_dir,
                        prefix + '.%s.red_dictionary.tsv' % marker_set_id),
                    'w')

                marker_dict = {}
                if marker_set_id == 'bac120':
                    marker_dict = Config.RED_DIST_BAC_DICT
                elif marker_set_id == 'ar122':
                    marker_dict = Config.RED_DIST_ARC_DICT
                reddictfile.write('Phylum\t{0}\n'.format(
                    marker_dict.get('p__')))
                reddictfile.write('Class\t{0}\n'.format(
                    marker_dict.get('c__')))
                reddictfile.write('Order\t{0}\n'.format(
                    marker_dict.get('o__')))
                reddictfile.write('Family\t{0}\n'.format(
                    marker_dict.get('f__')))
                reddictfile.write('Genus\t{0}\n'.format(
                    marker_dict.get('g__')))
                reddictfile.close()

                fastaniout.write("User genome\tReference genome\tANI\n")
                redfout.write(
                    "user_genome\tclassification_method\tred_value\n")
                if debugopt:
                    parchiinfo.write(
                        "User genome\tHigher rank\tHigher value\tLower rank\tLower value\tcase\tclosest_rank\n"
                    )

                # Genomes can be classified using ANI or RED values.
                # We go through all leaves of the tree. If a leaf is a user
                # genome, we take its parent node and look at all leaves under
                # that node. If the parent node has only one reference genome
                # (GB or RS), we calculate the ANI between the user genome and
                # that reference genome.
                analysed_nodes = []
                fastani_dict = {}
                all_fastani_dict = {}

                fastani_list = []
                # some genomes of Case C are handled here, if the ANI is high enough
                self.logger.info(
                    'Calculating Average Nucleotide Identity using FastANI.')

                for nd in tree.preorder_node_iter():
                    # store the prefix of each leaf to check if it starts with GB_, RS_ or UBA
                    list_subnode_initials = [
                        subnd.taxon.label.replace("'", '')[0:3]
                        for subnd in nd.leaf_iter()
                    ]
                    list_subnode = [
                        subnd.taxon.label.replace("'", '')
                        for subnd in nd.leaf_iter()
                    ]
                    # if exactly one genome under this node is a reference genome
                    if (list_subnode_initials.count('RS_') +
                            list_subnode_initials.count('GB_') +
                            list_subnode_initials.count('UBA')) == 1 and len(
                                list_subnode_initials
                            ) > 1 and list_subnode[0] not in analysed_nodes:
                        fastani_list.append(list_subnode)
                        analysed_nodes.extend(list_subnode)

                manager = multiprocessing.Manager()
                out_q = manager.dict()
                procs = []
                nprocs = self.cpus
                if len(fastani_list) > 0:
                    for item in splitchunks_list(fastani_list, nprocs):
                        p = multiprocessing.Process(target=self._fastaniWorker,
                                                    args=(item, genomes,
                                                          out_q))
                        procs.append(p)
                        p.start()

                    # wait for all worker processes to finish
                    for p in procs:
                        p.join()
                        if p.exitcode == 1:
                            raise ValueError(
                                'A FastANI worker process failed.')

                    # collect all results into a single dictionary
                    all_fastani_dict = dict(out_q)

                for k, v in all_fastani_dict.iteritems():
                    fastaniout.write("{0}\t{1}\t{2}\n".format(
                        k, v.get("ref_genome"), v.get("ani")))
                    if Config.FASTANI_SPECIES_THRESHOLD <= v.get("ani"):
                        suffixed_name = add_ncbi_prefix(v.get("ref_genome"))
                        taxa_str = ";".join(gtdb_taxonomy.get(suffixed_name))
                        if taxa_str.endswith("s__"):
                            taxa_str = taxa_str + v.get("ref_genome")
                        fout.write('%s\t%s\n' % (k, taxa_str))
                        fastani_dict[k] = v
                        redfout.write("{0}\tani\tNone\n".format(k))
                fastaniout.close()

                self.logger.info(
                    '{0} genomes have been classified with FastANI.'.format(
                        len(fastani_dict)))

                scaled_tree = self._calculate_red_distances(
                    classify_tree, out_dir)

                user_genome_ids = set(read_fasta(user_msa_file).keys())
                user_genome_ids = user_genome_ids.difference(
                    set(fastani_dict.keys()))
                # For all other cases we measure the RED distance between a leaf and its parent node (RED = 1 - edge_length).
                # This RED value tells us the rank level that can be associated with a user genome.
                # For example, if the RED value is close to the order level, the user genome takes the order of the reference genomes under the same parent node.
                # If there are multiple orders under the parent node, the user genome is considered a new order.
                for leaf in scaled_tree.leaf_node_iter():
                    if leaf.taxon.label in user_genome_ids:
                        taxa = []
                        # In some cases, pplacer places two user genomes under the same parent node, so we walk up the tree until a node has a reference genome as a leaf.
                        cur_node = leaf.parent_node
                        list_subnode_initials = [
                            subnd.taxon.label.replace("'", '')[0:3]
                            for subnd in cur_node.leaf_iter()
                        ]
                        while 'RS_' not in list_subnode_initials and 'GB_' not in list_subnode_initials and 'UBA' not in list_subnode_initials:
                            cur_node = cur_node.parent_node
                            list_subnode_initials = [
                                subnd.taxon.label.replace("'", '')[0:3]
                                for subnd in cur_node.leaf_iter()
                            ]

                        current_rel_list = cur_node.rel_dist

                        parent_taxon_node = cur_node.parent_node
                        _support, parent_taxon, _aux_info = parse_label(
                            parent_taxon_node.label)

                        while parent_taxon_node is not None and not parent_taxon:
                            parent_taxon_node = parent_taxon_node.parent_node
                            _support, parent_taxon, _aux_info = parse_label(
                                parent_taxon_node.label)

                        parent_rank = parent_taxon.split(";")[-1][0:3]
                        parent_rel_dist = parent_taxon_node.rel_dist

                        genome_parent_child = [
                            leaf.taxon.label, parent_rank, parent_rel_dist, '',
                            '', '', ''
                        ]

                        child_taxons = []
                        closest_rank = None
                        detection = "RED"
                        # if the genome is placed between the genus and species ranks, it will be associated with the genus when _get_closest_red_rank is called
                        if parent_rank != 'g__':
                            child_rk = self.order_rank[
                                self.order_rank.index(parent_rank) + 1]
                            list_subnode = [
                                childnd.taxon.label.replace("'", '')
                                for childnd in cur_node.leaf_iter()
                                if (childnd.taxon.label.startswith('RS_')
                                    or childnd.taxon.label.startswith('GB_'))
                            ]
                            list_ranks = [
                                gtdb_taxonomy.get(name)[self.order_rank.index(
                                    child_rk)] for name in list_subnode
                            ]
                            if len(set(list_ranks)) == 1:
                                for subranknd in cur_node.preorder_iter():
                                    _support, subranknd_taxon, _aux_info = parse_label(
                                        subranknd.label)
                                    if subranknd.is_internal(
                                    ) and subranknd_taxon is not None and subranknd_taxon.startswith(
                                            child_rk):
                                        child_taxons = subranknd_taxon.split(
                                            ";")
                                        child_taxon_node = subranknd
                                        child_rel_dist = child_taxon_node.rel_dist
                                        break
                            else:
                                #case 2a and 2b
                                closest_rank = parent_rank
                                detection = "Topology"
                        else:
                            #case 1a
                            closest_rank = parent_rank
                            detection = "Topology"

                        #case 1b
                        if len(child_taxons) == 0 and closest_rank is None:
                            list_leaves = [
                                childnd.taxon.label.replace("'", '')
                                for childnd in cur_node.leaf_iter()
                                if (childnd.taxon.label.startswith('RS_')
                                    or childnd.taxon.label.startswith('GB_'))
                            ]
                            if len(list_leaves) != 1:
                                self.logger.error(
                                    'There should be only one leaf.')
                                sys.exit(-1)
                            list_leaf_ranks = gtdb_taxonomy.get(
                                list_leaves[0])[self.order_rank.index(child_rk
                                                                      ):-1]
                            for leaf_taxon in reversed(list_leaf_ranks):
                                if leaf_taxon == list_leaf_ranks[0]:
                                    if abs(current_rel_list - marker_dict.get(
                                            leaf_taxon[:3])) < abs(
                                                (current_rel_list) -
                                                marker_dict.get(parent_rank)):
                                        #and current_rel_list - marker_dict.get(leaf_taxon[:3]) > 0 ):
                                        closest_rank = leaf_taxon[:3]
                                        genome_parent_child[3] = leaf_taxon
                                        genome_parent_child[
                                            5] = 'case 1b - III'
                                        break
                                else:
                                    pchildrank = list_leaf_ranks[
                                        list_leaf_ranks.index(leaf_taxon) - 1]
                                    if abs(
                                            current_rel_list -
                                            marker_dict.get(leaf_taxon[:3])
                                    ) < abs(current_rel_list -
                                            marker_dict.get(pchildrank[:3])):
                                        #and current_rel_list - marker_dict.get(leaf_taxon[:3]) > 0 ) :
                                        closest_rank = leaf_taxon[:3]
                                        genome_parent_child[1] = pchildrank
                                        genome_parent_child[2] = 1.0
                                        genome_parent_child[3] = leaf_taxon
                                        genome_parent_child[5] = 'case 1b - II'
                                        break
                            if closest_rank is None:
                                closest_rank = parent_rank
                                genome_parent_child[3] = list_leaf_ranks[0]
                                genome_parent_child[5] = 'case 1b - IV'

                        # if there are multiple ranks on the child node (i.e. a genome between p__Nitrospirae and c__Nitrospiria;o__Nitrospirales;f__Nitrospiraceae)
                        # we loop through the list of ranks from the f__ to the c__ rank
                        for child_taxon in reversed(child_taxons):
                            # if the lower rank is c__Nitrospiria
                            if child_taxon == child_taxons[0]:
                                if (abs(current_rel_list -
                                        marker_dict.get(child_taxon[:3])) <
                                        abs(child_rel_dist -
                                            marker_dict.get(child_taxon[:3]))
                                        and
                                        abs(current_rel_list -
                                            marker_dict.get(child_taxon[:3])) <
                                        abs(current_rel_list -
                                            marker_dict.get(parent_rank))):
                                    genome_parent_child[3] = ';'.join(
                                        child_taxons)
                                    genome_parent_child[4] = child_rel_dist
                                    genome_parent_child[5] = 'case 3b - II'
                                    closest_rank = child_taxon[:3]
                                elif closest_rank is None:
                                    closest_rank = parent_rank
                                    genome_parent_child[3] = ';'.join(
                                        child_taxons)
                                    genome_parent_child[4] = child_rel_dist
                                    genome_parent_child[5] = 'case 3b - III'
                            else:
                                pchildrank = child_taxons[
                                    child_taxons.index(child_taxon) - 1]
                                if (abs(current_rel_list -
                                        marker_dict.get(child_taxon[:3])) <
                                        abs(current_rel_list -
                                            marker_dict.get(pchildrank[:3]))
                                        and
                                        abs(current_rel_list -
                                            marker_dict.get(child_taxon[:3])) <
                                        abs(child_rel_dist -
                                            marker_dict.get(child_taxon[:3]))):
                                    closest_rank = child_taxon
                                    genome_parent_child[3] = ';'.join(
                                        child_taxons)
                                    genome_parent_child[4] = child_rel_dist
                                    genome_parent_child[5] = 'case 3b - I'
                                    break

                        # safety check: closest_rank should always be set by this point
                        if closest_rank is None:
                            print "Error: closest rank could not be determined."

                        genome_parent_child[6] = closest_rank

                        list_subnode = [
                            subnd.taxon.label.replace("'", '')
                            for subnd in cur_node.leaf_iter()
                        ]
                        red_taxonomy = self._get_redtax(
                            list_subnode, closest_rank, gtdb_taxonomy)

                        fout.write('{0}\t{1}\n'.format(leaf.taxon.label,
                                                       red_taxonomy))
                        del genome_parent_child[0]
                        redfout.write("{0}\t{1}\t{2}\n".format(
                            leaf.taxon.label, detection, current_rel_list))
                        if debugopt:
                            parchiinfo.write('{0}\t{1}\t{2}\t{3}\n'.format(
                                leaf.taxon.label, current_rel_list,
                                '\t'.join(str(x) for x in genome_parent_child),
                                detection))

                redfout.close()
                fout.close()
                if debugopt:
                    parchiinfo.close()

                pplaceout = open(
                    os.path.join(
                        out_dir, prefix +
                        '.%s.classification_pplacer.tsv' % marker_set_id), 'w')

                # We get the pplacer taxonomy for comparison
                user_genome_ids = set(read_fasta(user_msa_file).keys())
                for leaf in tree.leaf_node_iter():
                    if leaf.taxon.label in user_genome_ids:
                        taxa = []
                        cur_node = leaf
                        while cur_node.parent_node:
                            _support, taxon, _aux_info = parse_label(
                                cur_node.label)
                            if taxon:
                                for t in taxon.split(';')[::-1]:
                                    taxa.append(t.strip())
                            cur_node = cur_node.parent_node
                        taxa_str = ';'.join(taxa[::-1])
                        pplaceout.write('%s\t%s\n' %
                                        (leaf.taxon.label, taxa_str))
                pplaceout.close()
        except ValueError as error:
            print "GTDB-Tk has stopped before finishing: %s" % error
            sys.exit(-1)
        except Exception as error:
            print "GTDB-Tk has stopped before finishing: %s" % error
            sys.exit(-1)
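Most of the RED logic above reduces to one comparison: which rank's expected RED value lies closest to the RED of the placement node. A standalone sketch of that decision; the threshold values are illustrative only (the real ones come from Config.RED_DIST_BAC_DICT and Config.RED_DIST_ARC_DICT):

# Sketch: pick the rank whose expected RED value is closest to the RED
# of the node a user genome was placed on. Threshold values below are
# illustrative, not the real GTDB-Tk configuration.
RED_DIST = {'p__': 0.32, 'c__': 0.46, 'o__': 0.61, 'f__': 0.75, 'g__': 0.90}

def closest_red_rank(node_red, candidate_ranks, red_dist=RED_DIST):
    return min(candidate_ranks, key=lambda rank: abs(node_red - red_dist[rank]))

# A node with RED 0.58 sits between the class and order medians; order wins.
print(closest_red_rank(0.58, ['c__', 'o__']))  # o__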
Example #31
    def _report_identified_marker_genes(self, gene_dict, outdir, prefix):
        """Report statistics for identified marker genes."""

        bac_outfile = open(
            os.path.join(outdir, prefix + "_bac120_markers_summary.tsv"), "w")
        arc_outfile = open(
            os.path.join(outdir, prefix + "_ar122_markers_summary.tsv"), "w")

        header = "Name\tnumber_unique_genes\tnumber_multiple_genes\tnumber_missing_genes\tlist_unique_genes\tlist_multiple_genes\tlist_missing_genes\n"

        bac_outfile.write(header)
        arc_outfile.write(header)

        # gather information for all marker genes
        marker_dbs = {
            "PFAM": ConfigMetadata.PFAM_TOP_HIT_SUFFIX,
            "TIGR": ConfigMetadata.TIGRFAM_TOP_HIT_SUFFIX
        }

        marker_bac_list_original = []
        for db_marker in Config.BAC120_MARKERS.keys():
            marker_bac_list_original.extend([
                marker.replace(".HMM", "").replace(".hmm", "")
                for marker in Config.BAC120_MARKERS[db_marker]
            ])

        marker_arc_list_original = []
        for db_marker in Config.AR122_MARKERS.keys():
            marker_arc_list_original.extend([
                marker.replace(".HMM", "").replace(".hmm", "")
                for marker in Config.AR122_MARKERS[db_marker]
            ])

        for db_genome_id, info in gene_dict.items():
            unique_genes_bac, multi_hits_bac, missing_genes_bac = [], [], []
            unique_genes_arc, multi_hits_arc, missing_genes_arc = [], [], []

            gene_bac_dict, gene_arc_dict = {}, {}

            path = info.get("aa_gene_path")
            for _marker_db, marker_suffix in marker_dbs.iteritems():
                # get all gene sequences
                protein_file = str(path)
                tophit_path = protein_file.replace(
                    ConfigMetadata.PROTEIN_FILE_SUFFIX, marker_suffix)

                # we load the list of all the genes detected in the genome
                all_genes_dict = read_fasta(protein_file, False)

                # Prodigal adds an asterisk at the end of each called gene.
                # These asterisks sometimes appear in the MSA, which can be
                # an issue for some downstream software.
                for seq_id, seq in all_genes_dict.iteritems():
                    if seq[-1] == '*':
                        all_genes_dict[seq_id] = seq[:-1]

                # we store the tophit file line by line and store the
                # information in a dictionary
                with open(tophit_path) as tp:
                    # first line is header line
                    tp.readline()

                    for line_tp in tp:
                        linelist = line_tp.split("\t")
                        genename = linelist[0]
                        sublist = linelist[1]
                        if ";" in sublist:
                            diff_markers = sublist.split(";")
                        else:
                            diff_markers = [sublist]

                        for each_mark in diff_markers:
                            sublist = each_mark.split(",")
                            markerid = sublist[0]

                            if (markerid not in marker_bac_list_original and
                                    markerid not in marker_arc_list_original):
                                continue

                            if markerid in marker_bac_list_original:
                                if markerid in gene_bac_dict:
                                    gene_bac_dict.get(
                                        markerid)["multihit"] = True
                                else:
                                    gene_bac_dict[markerid] = {
                                        "gene": genename,
                                        "multihit": False
                                    }

                            if markerid in marker_arc_list_original:
                                if markerid in gene_arc_dict:
                                    gene_arc_dict.get(
                                        markerid)["multihit"] = True
                                else:
                                    gene_arc_dict[markerid] = {
                                        "gene": genename,
                                        "multihit": False
                                    }

            for mid in marker_bac_list_original:
                if mid not in gene_bac_dict:
                    missing_genes_bac.append(mid)
                elif gene_bac_dict[mid]["multihit"]:
                    multi_hits_bac.append(mid)
                else:
                    unique_genes_bac.append(mid)

            for mid in marker_arc_list_original:
                if mid not in gene_arc_dict:
                    missing_genes_arc.append(mid)
                elif gene_arc_dict[mid]["multihit"]:
                    multi_hits_arc.append(mid)
                else:
                    unique_genes_arc.append(mid)

            bac_outfile.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format(
                db_genome_id, len(unique_genes_bac), len(multi_hits_bac),
                len(missing_genes_bac), ','.join(unique_genes_bac),
                ','.join(multi_hits_bac), ','.join(missing_genes_bac)))

            arc_outfile.write("{0}\t{1}\t{2}\t{3}\t{4}\t{5}\t{6}\n".format(
                db_genome_id, len(unique_genes_arc), len(multi_hits_arc),
                len(missing_genes_arc), ','.join(unique_genes_arc),
                ','.join(multi_hits_arc), ','.join(missing_genes_arc)))

        bac_outfile.close()
        arc_outfile.close()
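The tophit files parsed above pack one or more 'marker,evalue,bitscore' triplets per gene, separated by ';'. A sketch of that parsing with a made-up line (the marker identifiers are illustrative):

# Sketch: parse one line of a Pfam/TIGRFAM tophit file into marker hits.
def parse_tophit_line(line):
    genename, hits = line.rstrip('\n').split('\t')[:2]
    result = []
    for hit in hits.split(';'):
        markerid, evalue, bitscore = hit.split(',')
        result.append((genename, markerid, float(evalue), float(bitscore)))
    return result

line = 'gene_001\tPF00164,1.2e-30,110.5;TIGR00001,3.4e-12,55.0\n'
for hit in parse_tophit_line(line):
    print(hit)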
Example #32
    def _runHmmMultiAlign(self, db_genome_id, path, marker_ids):
        '''
        Selects markers that are not aligned for a specific genome.

        :param db_genome_id: Selected genome
        :param path: Path to the genomic fasta file for the genome
        :param marker_ids: list of marker ids for the selected sets
        '''

        temp_con = GenomeDatabaseConnection()
        temp_con.MakePostgresConnection(self.release)
        temp_cur = temp_con.cursor()

        # gather information for all marker genes
        final_genome = []
        final_markerid = []
        final_seq = []
        final_multihits = []
        final_evalue = []
        final_bitscore = []

        marker_dbs = {
            "PFAM": self.pfam_top_hit_suffix,
            "TIGR": self.tigrfam_top_hit_suffix
        }
        for marker_db, marker_suffix in marker_dbs.iteritems():
            query = (
                "SELECT m.id_in_database,m.marker_file_location,m.size,m.id " +
                "FROM genomes as g, markers as m " +
                "LEFT JOIN marker_databases as md " +
                "ON md.id=m.marker_database_id " + "WHERE NOT EXISTS (" +
                "SELECT * FROM aligned_markers as am " +
                "WHERE am.genome_id = g.id and am.marker_id = m.id) " +
                "AND g.id = %s " + "AND m.id in %s " +
                "AND md.external_id_prefix like %s")
            temp_cur.execute(query,
                             (db_genome_id, tuple(marker_ids, ), marker_db))
            raw_results = temp_cur.fetchall()
            marker_dict_original = {
                a: {
                    "path": b,
                    "size": c,
                    "db_marker_id": d
                }
                for a, b, c, d in raw_results
            }

            # get all gene sequences
            genome_path = str(path)
            tophit_path = genome_path.replace(self.protein_file_suffix,
                                              marker_suffix)

            # we load the list of all the genes detected in the genome
            protein_file = tophit_path.replace(marker_suffix,
                                               self.protein_file_suffix)
            all_genes_dict = read_fasta(protein_file, False)

            # Prodigal adds an asterisk at the end of each called gene.
            # These asterisks sometimes appear in the MSA, which can be an issue for some downstream software.
            for seq_id, seq in all_genes_dict.iteritems():
                if seq[-1] == '*':
                    all_genes_dict[seq_id] = seq[:-1]

            # we store the tophit file line by line and store the
            # information in a dictionary
            with open(tophit_path) as tp:
                # first line is header line
                tp.readline()
                gene_dict = {}
                for line_tp in tp:
                    linelist = line_tp.split("\t")
                    genename = linelist[0]
                    sublist = linelist[1]
                    if ";" in sublist:
                        diff_markers = sublist.split(";")
                    else:
                        diff_markers = [sublist]

                    for each_gene in diff_markers:
                        sublist = each_gene.split(",")
                        markerid = sublist[0]
                        if markerid not in marker_dict_original:
                            continue

                        evalue = sublist[1]
                        bitscore = sublist[2].strip()

                        if markerid in gene_dict:
                            oldbitscore = gene_dict.get(markerid).get(
                                "bitscore")
                            # compare bitscores numerically, not as strings
                            if float(oldbitscore) < float(bitscore):
                                gene_dict[markerid] = {
                                    "marker_path":
                                    marker_dict_original.get(markerid).get(
                                        "path"),
                                    "gene":
                                    genename,
                                    "gene_seq":
                                    all_genes_dict.get(genename),
                                    "evalue":
                                    evalue,
                                    "bitscore":
                                    bitscore,
                                    "multihit":
                                    True,
                                    "db_marker_id":
                                    marker_dict_original.get(markerid).get(
                                        "db_marker_id")
                                }
                            else:
                                gene_dict.get(markerid)["multihit"] = True
                        else:
                            gene_dict[markerid] = {
                                "marker_path":
                                marker_dict_original.get(markerid).get("path"),
                                "gene":
                                genename,
                                "gene_seq":
                                all_genes_dict.get(genename),
                                "evalue":
                                evalue,
                                "bitscore":
                                bitscore,
                                "multihit":
                                False,
                                "db_marker_id":
                                marker_dict_original.get(markerid).get(
                                    "db_marker_id")
                            }

            for mid, info in marker_dict_original.iteritems():
                if mid not in gene_dict:
                    final_genome.append(db_genome_id)
                    final_markerid.append(info.get("db_marker_id"))
                    final_seq.append("-" * info.get("size"))
                    final_multihits.append(False)
                    final_evalue.append(None)
                    final_bitscore.append(None)

            result_aligns = self._runHmmAlign(gene_dict, db_genome_id)
            for result_align in result_aligns:
                final_genome.append(result_align[0])
                final_markerid.append(result_align[1])
                final_seq.append(result_align[2])
                final_multihits.append(result_align[3])
                final_evalue.append(result_align[4])
                final_bitscore.append(result_align[5])

        if final_genome:
            query = "SELECT upsert_aligned_markers(%s,%s,%s,%s,%s,%s)"
            temp_cur.execute(query,
                             (final_genome, final_markerid, final_seq,
                              final_multihits, final_evalue, final_bitscore))
        temp_con.commit()
        temp_cur.close()
        temp_con.ClosePostgresConnection()

        return True
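Two details in this function are easy to get wrong: Prodigal's trailing '*' must be stripped before alignment, and when a marker hits multiple genes, the numerically higher bitscore should win. A hedged sketch of both (the function and variable names are illustrative):

# Sketch: strip Prodigal's trailing '*' and keep the best hit per marker.
def strip_asterisks(seqs):
    return {sid: (seq[:-1] if seq.endswith('*') else seq)
            for sid, seq in seqs.items()}

def keep_best(gene_dict, markerid, genename, bitscore):
    prev = gene_dict.get(markerid)
    if prev is None:
        gene_dict[markerid] = {'gene': genename, 'bitscore': bitscore,
                               'multihit': False}
    else:
        prev['multihit'] = True
        if float(bitscore) > float(prev['bitscore']):  # numeric comparison
            gene_dict[markerid] = {'gene': genename, 'bitscore': bitscore,
                                   'multihit': True}
    return gene_dict

d = keep_best({}, 'PF00164', 'gene_001', '88.0')
d = keep_best(d, 'PF00164', 'gene_002', '110.5')
print(d['PF00164']['gene'])  # gene_002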
Example #33
    def _parse_sequence_file(self, fna_file, prefix, ssu_query_id):
        """Extract the sequence of the identified SSU gene from a fasta file."""
        metadata = []
        all_genes_dict = read_fasta(fna_file, False)
        sequence = all_genes_dict[ssu_query_id]
        metadata.append(('{0}_sequence'.format(prefix), sequence))
        return metadata
Example #34
    def __workerThread(self, 
                        probe_size,
                        probe_step_size,
                        mismatch,
                        min_aln_len,
                        keep_fragments,
                        results_dir,
                        queueIn, 
                        queueOut):
        """Process each data item in parallel.
        
        The reference genome is the genome from which probes are being
        designed. The aim is to determine how many of these 
        reference probes will hybridize to the target genome. 

        To determine the number of reference probes which will hybridize
        to the target genome, the target genome is fragmented
        into probe sized windows to determine how many of these are 
        nearly identical to the reference genome.
        """

        while True:
            ref_genome, target_genome = queueIn.get(block=True, timeout=None)
            if ref_genome is None:
                break

            ref_name = ntpath.basename(ref_genome).replace('.fasta', '').replace('.fna', '')
            target_name = ntpath.basename(target_genome).replace('.fasta', '').replace('.fna', '')
            
            if keep_fragments:
                fragment_dir = os.path.join(results_dir, 'fragments')
            else:
                fragment_dir = tempfile.mkdtemp()

            # count total number of reference genome probes
            ref_seqs = read_fasta(ref_genome)
            ref_genome_size = 0
            num_ref_probes = 0
            for seq in ref_seqs.values():
                num_ref_probes += (len(seq) - probe_size) / probe_step_size + 1
                ref_genome_size += len(seq)

            # fragment target genome into probe sized windows
            window_file = os.path.join(fragment_dir, ref_name + '~' + target_name + '.fna')
            fout = open(window_file, 'w')
            target_seqs = read_fasta(target_genome)
            num_target_probes = 0
            target_windows = {}
            target_genome_size = 0
            for seq in target_seqs.values():
                target_genome_size += len(seq)
                
                for i in range(0, len(seq)-probe_size, probe_step_size):
                    fout.write('>probe_%d\n' % num_target_probes)
                    fout.write(seq[i:i+probe_size] + '\n')
                    target_windows[str(num_target_probes)] = seq[i:i+probe_size]
                    num_target_probes += 1
            fout.close()
                  
            # BLAST target probes against reference genome
            output_table = os.path.join(results_dir, ref_name + '~' + target_name + '.blast_hits.tsv')
            self._blastn(window_file, 
                            ref_genome, 
                            output_table, 
                            evalue=1e-2, 
                            max_matches=1, 
                            task='dc-megablast')

            window_hits = set()
            failed_similarity_test = set()
            output_file = os.path.join(results_dir, ref_name + '~' + target_name + '.probe_hits.tsv')
            fout = open(output_file, 'w')
            fout.write('Probe ID\tSubject ID\tProbe percent alignment\tPercent identity\tAdjusted percent identity\n')
            for hit in self._read_hit(output_table):
                adj_aln_len = hit.aln_len - hit.gaps
                query_aln_frac = adj_aln_len * 100.0 / hit.query_len
                adjusted_perc_identity = (adj_aln_len - hit.mismatch) * 100.0 / hit.query_len

                if (query_aln_frac >= (100*min_aln_len)
                    and adjusted_perc_identity >= (100*(1.0 - mismatch))):

                    if hit.query_id not in window_hits:
                        window_hits.add(hit.query_id)
                        fout.write('%s\t%s\t%.1f\t%.1f\t%.1f\n' % (hit.query_id, 
                                                                            hit.subject_id, 
                                                                            query_aln_frac,
                                                                            hit.perc_identity,
                                                                            adjusted_perc_identity))
                else:
                    failed_similarity_test.add(hit.query_id)
            fout.close()
            
            num_failed_similarity_test = len(target_windows) - len(window_hits)
            
            output_file = os.path.join(results_dir, ref_name + '~' + target_name + '.summary.tsv')
            fout = open(output_file, 'w')
            fout.write('Reference ID\tReference genome size (bp)')
            fout.write('\tTarget ID\tTarget genome size (bp)')
            fout.write('\tNo. reference probes\tNo. target probes\tNo. hybridized probes\tNo. probes failing genomic similarity test')
            fout.write('\tPredicted signal intensity\n')
            
            fout.write('%s\t%d' % (ref_name, ref_genome_size))
            fout.write('\t%s\t%d' % (target_name, target_genome_size))
            fout.write('\t%d\t%d\t%d\t%d' % (num_ref_probes, len(target_windows), len(window_hits), num_failed_similarity_test))
            fout.write('\t%.1f' % (len(window_hits)*100.0/len(target_windows)))
            fout.write('\n')
            fout.close()
            
            if not keep_fragments:
                shutil.rmtree(fragment_dir)

            # allow results to be processed or written to file
            queueOut.put(ref_name)
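The fragmentation step above slides a probe-sized window across each target sequence. A minimal sketch of that windowing, mirroring the range() arithmetic in the source (a trailing partial window is dropped, as above):

# Sketch: fragment a sequence into probe-sized, step-spaced windows.
def probe_windows(seq, probe_size, step):
    return [seq[i:i + probe_size]
            for i in range(0, len(seq) - probe_size, step)]

seq = 'ACGTACGTACGTACGTACGT'  # 20 bp
for window in probe_windows(seq, probe_size=8, step=4):
    print(window)  # three windows starting at positions 0, 4 and 8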
Example #35
    def run(self, homolog_file, gene_id_file, taxonomy_file, min_per_taxa,
            consensus, min_per_bp, use_trimAl, msa_program, tree_program,
            prot_model, output_dir):
        """Infer a tree over a reduced set of genes.

        Filter a set of homologs to a specified set of gene ids,
        and infer a tree over this reduced set of proteins.

        Parameters
        ----------
        homolog_file : str
            Fasta file containing homologs.
        gene_id_file : str
            File with gene ids to retain in tree.
        taxonomy_file : str
            Taxonomic assignment of each reference genome.
        min_per_taxa : float
            Minimum percentage of taxa required to retain a column.
        consensus : float
            Minimum percentage of the same amino acid required to retain column.
        min_per_bp : float
            Minimum percentage of base pairs required to keep trimmed sequence.
        use_trimAl : boolean
            Filter columns using trimAl.
        msa_program : str
            Program to use for multiple sequence alignment ['mafft', 'muscle'].
        tree_program : str
            Program to use for tree inference ['fasttree', 'raxml'].
        prot_model : str
            Protein substitution model for tree inference ['WAG', 'LG', 'AUTO'].
        output_dir: str
            Output directory.
        """

        # generate msa with reduced sequences
        self.logger.info('Extracting sequences to retain.')
        genes_to_retain = self.read_ids(gene_id_file)
        self.logger.info(' ...identified %d sequences to retain.' %
                         len(genes_to_retain))

        seqs = seq_io.read_fasta(homolog_file)
        reduced_seqs = {}
        for seq_id, seq in seqs.iteritems():
            if seq_id in genes_to_retain:
                reduced_seqs[seq_id] = seq

        reduced_homolog_file = homolog_file[0:homolog_file.rfind('.')]
        reduced_homolog_file += '.reduced.' + homolog_file[homolog_file.rfind('.') + 1:]
        seq_io.write_fasta(reduced_seqs, reduced_homolog_file)

        self.logger.info('Retained %d sequences.' % len(reduced_seqs))

        # infer multiple sequence alignment
        msa = MsaWorkflow(self.cpus)
        trimmed_msa_output = msa.run(reduced_homolog_file, min_per_taxa,
                                     consensus, min_per_bp, use_trimAl,
                                     msa_program, output_dir)

        # infer tree
        tw = TreeWorkflow(self.cpus)
        tree_output = tw.run(trimmed_msa_output, tree_program, prot_model,
                             False, output_dir)

        # create tax2tree consensus map and decorate tree
        self.logger.info('Decorating internal tree nodes with tax2tree.')
        t2t_tree = tree_output.replace('.tree', '.tax2tree.tree')
        os.system('t2t decorate -m %s -t %s -o %s' %
                  (taxonomy_file, tree_output, t2t_tree))
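The reduction step is a simple set-membership filter over the input fasta. A sketch with illustrative data, assuming read_ids returns a set of gene identifiers:

# Sketch: retain only the homolog sequences whose IDs were requested.
def subset_seqs(seqs, ids_to_keep):
    return {sid: seq for sid, seq in seqs.items() if sid in ids_to_keep}

seqs = {'geneA': 'MKVL', 'geneB': 'MTTA', 'geneC': 'MSLK'}
print(sorted(subset_seqs(seqs, {'geneA', 'geneC'})))  # ['geneA', 'geneC']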
Example #36
    def _parse_taxonomy_file(self,
                             genome_id,
                             metadata_taxonomy_file,
                             fout,
                             prefix,
                             fna_file,
                             summary_file=None):
        """Parse metadata file with taxonomic information for 16S rRNA genes.

        Parameters
        ----------
        genome_id : str
          Unique identifier of genome.
        metadata_taxonomy_file : str
          Full path to file containing rRNA metadata.
        fout : file
          Output stream to populate with metadata.
        prefix : str
          Prefix to append to metadata fields.
        fna_file : str
          Fasta file with the gene sequences for the genome.
        summary_file : str, optional
          Summary file with the length of each identified gene.

        Returns
        -------
        int
          Number of 16S rRNA genes identified in genome.
        """

        if not os.path.exists(metadata_taxonomy_file):
            return 0

        with open(metadata_taxonomy_file) as f:
            header_line = f.readline()  # consume header line
            if prefix not in self.taxonomy_headers:
                self.taxonomy_headers.add(prefix)

                fout.write('genome_id')
                headers = [
                    prefix + '_' + x.strip().replace('ssu_', '')
                    for x in header_line.split('\t')
                ]
                headers.append("{0}_sequence".format(prefix))
                headers.append("{0}_contig_len".format(prefix))

                if prefix == 'lsu_silva_23s':
                    for n, i in enumerate(headers):
                        if i == 'lsu_silva_23s_sequence':
                            headers[n] = 'lsu_23s_sequence'
                        elif i == 'lsu_silva_23s_query_id':
                            headers[n] = 'lsu_23s_query_id'
                        elif i == 'lsu_silva_23s_length':
                            headers[n] = 'lsu_23s_length'
                        elif i == 'lsu_silva_23s_contig_len':
                            headers[n] = 'lsu_23s_contig_len'
                elif prefix == 'ssu_silva':
                    for n, i in enumerate(headers):
                        if i == 'ssu_silva_sequence':
                            headers[n] = 'ssu_sequence'
                        elif i == 'ssu_silva_query_id':
                            headers[n] = 'ssu_query_id'
                        elif i == 'ssu_silva_length':
                            headers[n] = 'ssu_length'
                        elif i == 'ssu_silva_contig_len':
                            headers[n] = 'ssu_contig_len'

                fout.write('\t' + '\t'.join(headers) + "\n")

            # determine the position of the query_id column
            split_headers = header_line.rstrip().split("\t")
            for pos in range(0, len(split_headers)):
                header = split_headers[pos]
                if header == 'query_id':
                    query_id_pos = pos
                    break

            # Report hit to longest 16S rRNA gene. It is possible that
            # the HMMs identified a putative 16S rRNA gene, but that
            # there was no valid BLAST hit.
            longest_query_len = 0
            longest_ssu_hit_info = None
            identified_ssu_genes = 0
            for line in f:
                line_split = line.strip().split('\t')
                query_len = int(line_split[2])
                if query_len > longest_query_len:
                    longest_query_len = query_len
                    longest_ssu_hit_info = line_split
                    ssu_query_id = line_split[query_id_pos]

            if longest_ssu_hit_info:
                fout.write(genome_id)
                fout.write('\t' + '\t'.join(longest_ssu_hit_info))
                all_genes_dict = read_fasta(fna_file, False)
                sequence = all_genes_dict[ssu_query_id]
                fout.write('\t{0}'.format(sequence))
                if summary_file is not None and os.path.exists(summary_file):
                    with open(summary_file) as fsum:
                        header_line = fsum.readline()  # consume header line
                        header_list = [
                            x.strip() for x in header_line.split('\t')
                        ]
                        idx_seq = header_list.index("Sequence length")
                        for line in fsum:
                            # every row in the summary file is an identified
                            # SSU gene; report the contig length only for the
                            # selected (longest) hit
                            identified_ssu_genes += 1
                            sum_list = [x.strip() for x in line.split('\t')]
                            if sum_list[0] == ssu_query_id:
                                fout.write("\t{0}".format(sum_list[idx_seq]))

                fout.write('\n')

            return identified_ssu_genes
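
The loop above keeps only the hit with the greatest query length before writing it out. A minimal, self-contained sketch of that selection step (the three-column row layout and the `rows` data below are hypothetical; the real metadata file carries many more columns):

def longest_hit(rows, query_len_col=2):
    """Return the row with the largest query length, or None."""
    longest = None
    longest_len = 0
    for row in rows:
        query_len = int(row[query_len_col])
        if query_len > longest_len:
            longest_len = query_len
            longest = row
    return longest

rows = [['gene_1', 'hitA', '850'], ['gene_2', 'hitB', '1420']]
print(longest_hit(rows))  # ['gene_2', 'hitB', '1420']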
Example #37
0
    def _producer(self, genome_pair):
        """Identify reciprocal best blast hits between pairs of genomes.

        Parameters
        ----------
        genome_pair : list
            Identifier of genomes to process.
        """

        # read the BLAST table with a large (32 MB) buffer
        blast_stream = open(self.blast_table, 'rb', 32 * (10 ** 6))

        genome_fileA, genome_fileB = genome_pair

        # read genes in each genome; used for gene counts and,
        # optionally, for writing out the shared genes
        genes_in_genomeA = seq_io.read_fasta(genome_fileA)
        genes_in_genomeB = seq_io.read_fasta(genome_fileB)

        genome_idA = remove_extension(genome_fileA)
        genome_idB = remove_extension(genome_fileB)

        # find blast hits between genome A and B, and vice versa
        hitsAB = self._valid_hits(blast_stream, self.offset_table,
                                  self.per_identity_threshold, self.per_aln_len_threshold,
                                  genome_idA, genome_idB)
        hitsBA = self._valid_hits(blast_stream, self.offset_table,
                                  self.per_identity_threshold, self.per_aln_len_threshold,
                                  genome_idB, genome_idA)
        blast_stream.close()

        # report reciprocal best blast hits
        if self.write_shared_genes:
            fout_seqs = open(os.path.join(self.shared_genes_dir, genome_idA + '-' + genome_idB + '.shared_genes.faa'), 'w')

        fout_stats = open(os.path.join(self.shared_genes_dir, genome_idA + '-' + genome_idB + '.rbb_hits.tsv'), 'w')
        fout_stats.write(genome_idA + '\t' + genome_idB + '\tPercent Identity\tPercent Alignment Length\te-value\tbitscore\n')

        per_identity_hits = []
        for query_id, hit_stats in hitsAB.items():
            subject_id, per_identA, per_aln_lenA, evalueA, bitscoreA = hit_stats
            if subject_id in hitsBA and query_id == hitsBA[subject_id][0]:
                _subject_id, per_identB, per_aln_lenB, evalueB, bitscoreB = hitsBA[subject_id]

                # take average of statistics in both blast directions as
                # the results will be similar, but not identical
                per_ident = 0.5 * (per_identA + per_identB)
                per_identity_hits.append(per_ident)

                per_aln_len = 0.5 * (per_aln_lenA + per_aln_lenB)
                evalue = 0.5 * (evalueA + evalueB)
                bitscore = 0.5 * (bitscoreA + bitscoreB)

                fout_stats.write('%s\t%s\t%.2f\t%.2f\t%.2g\t%.2f\n' % (query_id, subject_id, per_ident, per_aln_len, evalue, bitscore))

                # write out shared genes
                if self.write_shared_genes:
                    fout_seqs.write('>' + query_id + '\n')
                    fout_seqs.write(genes_in_genomeA[query_id] + '\n')

                    fout_seqs.write('>' + subject_id + '\n')
                    fout_seqs.write(genes_in_genomeB[subject_id] + '\n')

        if self.write_shared_genes:
            fout_seqs.close()
        fout_stats.close()

        mean_per_identity_hits = 0
        if len(per_identity_hits) > 0:
            mean_per_identity_hits = mean(per_identity_hits)

        std_per_identity_hits = 0
        if len(per_identity_hits) >= 2:
            std_per_identity_hits = std(per_identity_hits)

        return (genome_idA,
                    len(genes_in_genomeA),
                    genome_idB,
                    len(genes_in_genomeB),
                    len(per_identity_hits),
                    mean_per_identity_hits,
                    std_per_identity_hits)
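
The reciprocal best hit test at the heart of this producer can be illustrated in isolation. A minimal sketch, assuming (as in hitsAB/hitsBA above) that each hit table maps a query gene id to a tuple whose first element is the id of its best subject gene; the gene ids in the example data are made up:

def reciprocal_best_hits(hits_ab, hits_ba):
    """Yield (query, subject) pairs that are best hits in both directions."""
    for query_id, hit in hits_ab.items():
        subject_id = hit[0]
        reverse = hits_ba.get(subject_id)
        if reverse is not None and reverse[0] == query_id:
            yield query_id, subject_id

hits_ab = {'a1': ('b7',), 'a2': ('b3',)}
hits_ba = {'b7': ('a1',), 'b3': ('a9',)}
print(list(reciprocal_best_hits(hits_ab, hits_ba)))  # [('a1', 'b7')]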
Example #38
0
    def _run_hmm_align(self, genome_ids,
                                genome_dirs,
                                genes_in_genomes,
                                ignore_multi_copy,
                                output_msa_dir,
                                output_model_dir,
                                queue_in,
                                queue_out):
        """Run each marker gene in a separate thread.

        Only the gene with the highest bitscore is used for genomes with
        multiple hits to a given protein family.

        Parameters
        ----------
        genome_ids : iterable
            Genomes of interest.
        genome_dirs : d[assembly_accession] -> directory
            Path to files for individual genomes.
        genes_in_genomes : d[genome_id][family_id] -> [(gene_id_1, bitscore), ..., (gene_id_N, bitscore)]
            Genes within each genome.
        ignore_multi_copy : bool
            Flag indicating if genes with multiple hits should be ignored (True) or the gene with the highest bitscore taken (False).
        output_msa_dir : str
            Output directory for multiple sequence alignment.
        output_model_dir : str
            Output directory for HMMs.
        queue_in : Queue
            Input queue for parallel processing.
        queue_out : Queue
            Output queue for parallel processing.
        """

        while True:
            marker_id = queue_in.get(block=True, timeout=None)
            if marker_id is None:
                break

            marker_seq_file = os.path.join(output_msa_dir, marker_id + '.faa')
            fout = open(marker_seq_file, 'w')
            for genome_id in genome_ids:
                genome_dir = genome_dirs[genome_id]

                assembly = os.path.basename(genome_dir)
                genes_file = os.path.join(genome_dir, assembly + self.protein_file_ext)
                seqs = seq_io.read_fasta(genes_file)

                hits = genes_in_genomes[genome_id].get(marker_id, None)
                if not hits or (ignore_multi_copy and len(hits) > 1):
                    continue

                # get gene with highest bitscore
                hits.sort(key=lambda x: x[1], reverse=True)
                gene_id, _bitscore = hits[0]

                fout.write('>' + genome_id + DefaultValues.SEQ_CONCAT_CHAR + gene_id + '\n')
                fout.write(seqs[gene_id] + '\n')
            fout.close()

            hmmer = HMMER('align')
            hmmer.align(os.path.join(output_model_dir, marker_id + '.hmm'),
                        marker_seq_file,
                        os.path.join(output_msa_dir, marker_id + '.aln.faa'),
                        trim=False,
                        outputFormat='Pfam')
            self._mask_alignment(os.path.join(output_msa_dir, marker_id + '.aln.faa'),
                                 os.path.join(output_msa_dir, marker_id + '.aln.masked.faa'))

            queue_out.put(marker_id)
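
This worker relies on the producer/consumer plumbing implied by queue_in, queue_out, and the None sentinel. A minimal sketch of a driver for such a worker; the driver itself, the worker count, and the marker ids are assumptions for illustration, not the library's actual code:

import multiprocessing as mp

def worker(queue_in, queue_out):
    # consume marker ids until the None sentinel arrives
    while True:
        marker_id = queue_in.get(block=True, timeout=None)
        if marker_id is None:
            break
        # ... align the marker here ...
        queue_out.put(marker_id)

if __name__ == '__main__':
    queue_in, queue_out = mp.Queue(), mp.Queue()
    procs = [mp.Process(target=worker, args=(queue_in, queue_out))
             for _ in range(4)]
    for p in procs:
        p.start()
    markers = ['PF00164', 'PF00177']  # hypothetical marker ids
    for marker_id in markers:
        queue_in.put(marker_id)
    for _ in procs:  # one sentinel per worker
        queue_in.put(None)
    for _ in markers:
        print(queue_out.get())  # marker ids, in completion order
    for p in procs:
        p.join()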