Exemplo n.º 1
0
    def run(self, gene_files):
        """Compute amino acid usage across a set of genomes.

        Parameters
        ----------
        gene_files : list
            Fasta files containing called genes.

        Returns
        -------
        dict of dict : dict[genome_id][aa] -> count
           Amino acid usage of each genome.
        set
           All amino acids observed across the genomes.
        """

        self.logger.info('Calculating amino acid usage for each genome:')

        # suppress per-item progress reporting when logging is silenced
        reporter = None if self.logger.is_silent else self._progress

        workers = Parallel(self.cpus)
        results = workers.run(self._producer, self._consumer, gene_files, reporter)

        return results.genome_aa_usage, results.aa_set
    def generate_metadata(self, gtdb_genome_path_file):
        """Generate metadata for each genome listed in a GTDB genome path file.

        Parameters
        ----------
        gtdb_genome_path_file : str
            Tab-separated file with a genome ID and genome directory path per line.
        """
        self.starttime = datetime.datetime.utcnow().replace(microsecond=0)
        input_files = []
        # context manager fixes the leaked file handle of the original;
        # enumerate replaces the hand-rolled line counter
        with open(gtdb_genome_path_file) as f:
            for countr, line in enumerate(f, 1):
                statusStr = '{} lines read.'.format(countr)
                sys.stdout.write('%s\r' % statusStr)
                sys.stdout.flush()

                line_split = line.strip().split('\t')

                gid = line_split[0]
                gpath = line_split[1]
                # last path component is the assembly identifier
                assembly_id = os.path.basename(os.path.normpath(gpath))

                genome_file = os.path.join(gpath, assembly_id + '_genomic.fna')
                gff_file = os.path.join(gpath, 'prodigal', gid + '_protein.gff')

                input_files.append([genome_file, gff_file])

        # process each genome
        print('Generating metadata for each genome:')
        parallel = Parallel(cpus=self.cpus)
        parallel.run(self._producer, None, input_files, self._progress)
Exemplo n.º 3
0
    def run(self, gene_files, critical_value, output_dir):
        """Compute dinucleotide usage for a set of genomes.

        Parameters
        ----------
        gene_files : list
            Fasta files containing called genes in nucleotide space.
        critical_value : float
            Critical value used to flag deviant genes (potential LGT events).
        output_dir : str
            Directory where results are written.
        """

        self.output_dir = output_dir
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)

        self.critical_value = critical_value

        self.logger.info('Calculating dinucleotide usage for each genome.')

        # skip progress callbacks when the logger is silenced
        reporter = None if self.logger.is_silent else self._progress

        workers = Parallel(self.cpus)
        workers.run(self._producer, None, gene_files, reporter)
Exemplo n.º 4
0
    def parallel_run(self, msa_files, seq_type, model_str, gamma, output_dir,
                     cpus):
        """Infer a FastTree tree for each MSA, processing files in parallel.

        Parameters
        ----------
        msa_files : str
            Fasta files containing multiple sequence alignments.
        seq_type : str
            Either 'nt' or 'prot', identifying the alignment type.
        model_str : str
            Evolution model, e.g. 'wag' or 'jtt'.
        gamma : bool
            Presumably indicates if the GAMMA model should be used — TODO confirm.
        output_dir : str
            Prefix for all output files.
        cpus : int
            Number of worker processes.
        """

        assert (seq_type.upper() in ['NT', 'PROT'])
        assert (model_str.upper() in ['WAG', 'LG', 'JTT', 'GTR'])

        # stash run parameters where the per-file worker can read them
        self.output_dir = output_dir
        self.seq_type = seq_type
        self.model = model_str
        self.gamma = gamma

        workers = Parallel(cpus)
        workers.run(self._parallel_infer_tree, None, msa_files, None)
Exemplo n.º 5
0
    def run(self, cluster_file, genome_dir_file, output_prefix):
        """Compute gANI/AF between cluster representatives and cluster members.

        Parameters
        ----------
        cluster_file : str
            Tab-separated cluster file; column 1 is the representative genome
            and column 4 a comma-separated list of clustered genome IDs.
        genome_dir_file : str
            Tab-separated file with a genome ID and genome directory path per line.
        output_prefix : str
            Prefix for the per-pair and per-cluster summary output files.
        """
        output_dir = os.path.dirname(output_prefix)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # get path to the nucleotide gene file of each genome;
        # 'with' fixes the leaked file handle of the original
        gene_files = {}
        with open(genome_dir_file) as f:
            for line in f:
                line_split = line.strip().split('\t')

                genome_id = line_split[0]
                genome_path = line_split[1]
                genome_dir_id = os.path.basename(os.path.normpath(genome_path))
                gene_files[genome_id] = os.path.join(
                    line_split[1], genome_dir_id + '_protein.fna')

        # Python 2 print statement converted to the print() function
        print('Read path for %d genomes.' % len(gene_files))

        # process all clusters; context managers guarantee the outputs are closed
        with open(output_prefix + '.ani.tsv', 'w') as fout, \
                open(output_prefix + '.ani_summary.tsv', 'w') as fout_summary:
            with open(cluster_file) as cf:
                for line in cf:
                    line_split = line.strip().split('\t')

                    rep_genome = line_split[0]
                    rep_gene_file = gene_files[rep_genome]

                    # only lines with a 4th column list clustered genomes
                    if len(line_split) == 4:
                        data_items = []
                        genome_ids = line_split[3].split(',')
                        for genome_id in genome_ids:
                            gene_file = gene_files[genome_id]
                            data_items.append((rep_gene_file, gene_file, genome_id))

                        # NOTE(review): CPU count is hard-coded; consider self.cpus
                        parallel = Parallel(cpus=38)
                        results = parallel.run(self._producer,
                                               self._consumer,
                                               data_items,
                                               self._progress)

                        gANIs = []
                        AFs = []
                        for r in results:
                            genome_id, gANI, AF = r
                            fout.write('%s\t%s\t%.3f\t%.3f\n' % (rep_genome, genome_id, gANI, AF))
                            gANIs.append(gANI)
                            AFs.append(AF)

                        fout_summary.write('%s\t%.3f\t%.4f\t%.4f\t%.3f\t%.4f\t%.4f\n' % (
                            rep_genome,
                            mean(gANIs), std(gANIs), min(gANIs),
                            mean(AFs), std(AFs), min(AFs)))
                        fout.flush()
                        fout_summary.flush()
Exemplo n.º 6
0
    def run(self, gene_files):
        """Compute codon usage across a set of genomes.

        Parameters
        ----------
        gene_files : list
            Fasta files containing called genes in nucleotide space.

        Returns
        -------
        dict of dict : d[genome_id][codon] -> count
           Codon usage of each genome.
        set
           All codons observed across the genomes.
        dict of dict : d[genome_id][codon] -> length
            Mean length of genes for each stop codon.
        """

        self.logger.info('Calculating codon usage for each genome.')

        # no progress callback when the logger is silenced
        reporter = self._progress if not self.logger.is_silent else None

        workers = Parallel(self.cpus)
        data = workers.run(self._producer, self._consumer, gene_files, reporter)

        return data.genome_codon_usage, data.codon_set, data.mean_gene_length
Exemplo n.º 7
0
    def run(self, gene_files):
        """Tally amino acid usage for every genome in a set.

        Parameters
        ----------
        gene_files : list
            Fasta files containing called genes.

        Returns
        -------
        dict of dict : dict[genome_id][aa] -> count
           Amino acid usage of each genome.
        set
           Set with all identified amino acids.
        """

        self.logger.info('Calculating amino acid usage for each genome:')

        # drop the progress callback if logging is silenced
        progress_cb = self._progress
        if self.logger.is_silent:
            progress_cb = None

        pool = Parallel(self.cpus)
        usage = pool.run(self._producer, self._consumer, gene_files, progress_cb)

        return usage.genome_aa_usage, usage.aa_set
Exemplo n.º 8
0
    def run(self, genome_files):
        """Compute kmer usage across a set of genomes.

        Parameters
        ----------
        genome_files : list
            Fasta files containing genomic sequences in nucleotide space.

        Returns
        -------
        dict of dict : d[genome_id][kmer] -> count
           Kmer usage of each genome.
        set
           All identified kmers, in canonical order.
        """

        self.logger.info('Calculating kmer usage for each genome.')

        # report progress only when the logger is active
        reporter = None if self.logger.is_silent else self._progress

        counts = Parallel(self.cpus).run(self._producer, self._consumer,
                                         genome_files, reporter)

        return counts, self.signatures.canonical_order()
Exemplo n.º 9
0
    def run(self, gene_files):
        """Tally codon usage for every genome in a set.

        Parameters
        ----------
        gene_files : list
            Fasta files containing called genes in nucleotide space.

        Returns
        -------
        dict of dict : d[genome_id][codon] -> count
           Codon usage of each genome.
        set
           Set with all identified codons.
        dict of dict : d[genome_id][codon] -> length
            Mean length of genes for each stop codon.
        """

        self.logger.info('Calculating codon usage for each genome.')

        # drop the progress callback if logging is silenced
        progress_cb = self._progress
        if self.logger.is_silent:
            progress_cb = None

        pool = Parallel(self.cpus)
        usage = pool.run(self._producer, self._consumer, gene_files, progress_cb)

        return usage.genome_codon_usage, usage.codon_set, usage.mean_gene_length
Exemplo n.º 10
0
    def run(self, genome_files):
        """Tally kmer usage for every genome in a set.

        Parameters
        ----------
        genome_files : list
            Fasta files containing genomic sequences in nucleotide space.

        Returns
        -------
        dict of dict : d[genome_id][kmer] -> count
           Kmer usage of each genome.
        set
           Set with all identified kmers.
        """

        self.logger.info('Calculating kmer usage for each genome.')

        # drop the progress callback if logging is silenced
        progress_cb = self._progress
        if self.logger.is_silent:
            progress_cb = None

        pool = Parallel(self.cpus)
        usage = pool.run(self._producer, self._consumer, genome_files, progress_cb)

        return usage.genome_kmer_usage, usage.kmer_set
Exemplo n.º 11
0
    def run(self, gene_files, critical_value, output_dir):
        """Tally dinucleotide usage for every genome in a set.

        Parameters
        ----------
        gene_files : list
            Fasta files containing called genes in nucleotide space.
        critical_value : float
            Critical value used to define a deviant gene (i.e., potential LGT event).
        output_dir : str
            Directory to store results.
        """

        # results are written beneath output_dir; create it on demand
        self.output_dir = output_dir
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)

        self.critical_value = critical_value

        self.logger.info('Calculating dinucleotide usage for each genome.')

        # drop the progress callback if logging is silenced
        progress_cb = self._progress
        if self.logger.is_silent:
            progress_cb = None

        pool = Parallel(self.cpus)
        pool.run(self._producer, None, gene_files, progress_cb)
Exemplo n.º 12
0
    def run(self, cluster_file, genome_dir_file, output_prefix):
        """Compute gANI/AF between cluster representatives and their members.

        Parameters
        ----------
        cluster_file : str
            Tab-separated cluster file; column 1 is the representative genome
            and column 4 a comma-separated list of clustered genome IDs.
        genome_dir_file : str
            Tab-separated file with a genome ID and genome directory path per line.
        output_prefix : str
            Prefix for the per-pair and per-cluster summary output files.
        """
        output_dir = os.path.dirname(output_prefix)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)

        # get path to the nucleotide gene file of each genome;
        # 'with' fixes the leaked file handle of the original
        gene_files = {}
        with open(genome_dir_file) as f:
            for line in f:
                line_split = line.strip().split('\t')

                genome_id = line_split[0]
                genome_path = line_split[1]
                genome_dir_id = os.path.basename(os.path.normpath(genome_path))
                gene_files[genome_id] = os.path.join(
                    line_split[1], genome_dir_id + '_protein.fna')

        # Python 2 print statement converted to the print() function
        print('Read path for %d genomes.' % len(gene_files))

        # process all clusters; context managers guarantee the outputs are closed
        with open(output_prefix + '.ani.tsv', 'w') as fout, \
                open(output_prefix + '.ani_summary.tsv', 'w') as fout_summary:
            with open(cluster_file) as cf:
                for line in cf:
                    line_split = line.strip().split('\t')

                    rep_genome = line_split[0]
                    rep_gene_file = gene_files[rep_genome]

                    # only lines with a 4th column list clustered genomes
                    if len(line_split) == 4:
                        data_items = []
                        genome_ids = line_split[3].split(',')
                        for genome_id in genome_ids:
                            gene_file = gene_files[genome_id]
                            data_items.append((rep_gene_file, gene_file, genome_id))

                        # NOTE(review): CPU count is hard-coded; consider self.cpus
                        parallel = Parallel(cpus=38)
                        results = parallel.run(self._producer, self._consumer,
                                               data_items, self._progress)

                        gANIs = []
                        AFs = []
                        for r in results:
                            genome_id, gANI, AF = r
                            fout.write('%s\t%s\t%.3f\t%.3f\n' %
                                       (rep_genome, genome_id, gANI, AF))
                            gANIs.append(gANI)
                            AFs.append(AF)

                        fout_summary.write('%s\t%.3f\t%.4f\t%.4f\t%.3f\t%.4f\t%.4f\n' %
                                           (rep_genome, mean(gANIs), std(gANIs),
                                            min(gANIs), mean(AFs), std(AFs), min(AFs)))
                        fout.flush()
                        fout_summary.flush()
Exemplo n.º 13
0
    def run(self, input_tree, msa_file, num_replicates, model, base_type, frac,
            output_dir):
        """Bootstrap a multiple sequence alignment and decorate the input tree.

        Parameters
        ----------
        input_tree : str
          Tree inferred with all data.
        msa_file : str
          File containing multiple sequence alignment for all taxa.
        num_replicates : int
          Number of replicates to perform.
        model : str
          Desired model of evolution.
        base_type : str
          Indicates if bases are nucleotides or amino acids.
        frac : float
          Fraction of alignment to subsample.
        output_dir : str
          Directory for bootstrap trees.
        """

        assert (model in ['wag', 'lg', 'jtt'])
        assert (base_type in ['nt', 'prot'])

        # stash parameters where the replicate workers can read them
        self.model = model
        self.base_type = base_type
        self.frac = frac

        self.replicate_dir = os.path.join(output_dir, 'replicates')
        make_sure_path_exists(self.replicate_dir)

        # full alignment that each replicate subsamples
        self.msa = seq_io.read(msa_file)

        # calculate replicates in parallel
        self.logger.info('Calculating bootstrap replicates:')
        workers = Parallel(self.cpus)
        workers.run(self._producer, None, range(num_replicates),
                    self._progress)

        # collect replicate trees and compute support values
        rep_tree_files = [
            os.path.join(self.replicate_dir,
                         'bootstrap_tree.r_' + str(i) + '.tree')
            for i in range(num_replicates)
        ]

        output_tree = os.path.join(
            output_dir,
            remove_extension(input_tree) + '.bootstrap.tree')
        bootstrap_support(input_tree, rep_tree_files, output_tree)

        return output_tree
Exemplo n.º 14
0
    def bootstrap(self, input_tree, msa_file, seq_type, model_str, gamma,
                  num_replicates, output_dir, cpus):
        """Perform non-parametric bootstrapping.

        Parameters
        ----------
        input_tree : str
            File containing newick tree to decorate with bootstraps.
        msa_file : str
            Fasta file containing multiple sequence alignment.
        seq_type : str
            Specifies multiple sequences alignment is of 'nt' or 'prot'.
        model_str : str
            Specified either the 'wag' or 'jtt' model.
        gamma : bool
            Indicates if GAMMA model should be used
        num_replicates : int
            Number of replicates to perform.
        output_dir: str
            Output directory to contain bootstrap trees.
        cpus : int
            Number of cpus to use.
        """

        assert (seq_type.upper() in ['NT', 'PROT'])
        assert (model_str.upper() in ['WAG', 'LG', 'JTT', 'GTR'])

        # stash parameters where the replicate workers can read them
        self.output_dir = output_dir
        self.seq_type = seq_type
        self.model = model_str
        self.gamma = gamma
        self.msa = seq_io.read(msa_file)

        # generate bootstrap replicates in parallel
        reps = list(range(num_replicates))
        Parallel(cpus).run(self._bootstrap, None, reps, None)

        # each replicate writes its tree under rep_<i>/bootstrap.tree
        rep_tree_files = [
            os.path.join(self.output_dir, 'rep_%d' % i, 'bootstrap.tree')
            for i in reps
        ]

        tree_name = os.path.splitext(os.path.basename(input_tree))[0]
        output_tree = os.path.join(output_dir, tree_name + '.bootstrap.tree')
        bootstrap_support(input_tree, rep_tree_files, output_tree)

        return output_tree
Exemplo n.º 15
0
    def run(self, ncbi_genome_dir, user_genome_dir, cpus):
        """Create metadata by parsing assembly stats files.

        Parameters
        ----------
        ncbi_genome_dir : str
            Root directory of the NCBI genome mirror (archaea/bacteria subdirs).
        user_genome_dir : str
            Root directory of user genomes, or 'NONE' to skip.
        cpus : int
            Number of worker processes.
        """

        input_files = []

        # generate metadata for NCBI assemblies
        # (Python 2 print statements converted to the print() function)
        print('Reading NCBI assembly directories.')
        processed_assemblies = defaultdict(list)
        for domain in ['archaea', 'bacteria']:
            domain_dir = os.path.join(ncbi_genome_dir, domain)
            for species_dir in os.listdir(domain_dir):
                full_species_dir = os.path.join(domain_dir, species_dir)
                for assembly_dir in os.listdir(full_species_dir):
                    # accession is everything before the second '_'
                    accession = assembly_dir[0:assembly_dir.find('_', 4)]

                    # skip accessions already seen in another species dir
                    processed_assemblies[accession].append(species_dir)
                    if len(processed_assemblies[accession]) >= 2:
                        continue

                    full_assembly_dir = os.path.join(full_species_dir,
                                                     assembly_dir)
                    genome_file = os.path.join(full_assembly_dir,
                                               assembly_dir + '_genomic.fna')
                    gff_file = os.path.join(full_assembly_dir, 'prodigal',
                                            accession + '_protein.gff')
                    input_files.append([genome_file, gff_file])

        # generate metadata for user genomes
        if user_genome_dir != 'NONE':
            print('Reading user genome directories.')
            for user_id in os.listdir(user_genome_dir):
                full_user_dir = os.path.join(user_genome_dir, user_id)
                if not os.path.isdir(full_user_dir):
                    continue

                for genome_id in os.listdir(full_user_dir):
                    full_genome_dir = os.path.join(full_user_dir, genome_id)
                    genome_file = os.path.join(full_genome_dir,
                                               genome_id + '_genomic.fna')
                    gff_file = os.path.join(full_genome_dir,
                                            genome_id + '_protein.gff')
                    input_files.append([genome_file, gff_file])

        # process each genome
        print('Generating metadata for each genome:')
        parallel = Parallel(cpus=cpus)
        parallel.run(self._producer, None, input_files, self._progress)
Exemplo n.º 16
0
  def run(self, ncbi_genome_dir, user_genome_dir, cpus):
    """Create metadata by parsing assembly stats files.

    Parameters
    ----------
    ncbi_genome_dir : str
      Root directory of the NCBI genome mirror (archaea/bacteria subdirs).
    user_genome_dir : str
      Root directory of user genomes, or 'NONE' to skip.
    cpus : int
      Number of worker processes.
    """

    input_files = []

    # generate metadata for NCBI assemblies
    # (Python 2 print statements converted to the print() function)
    print('Reading NCBI assembly directories.')
    processed_assemblies = defaultdict(list)
    for domain in ['archaea', 'bacteria']:
      domain_dir = os.path.join(ncbi_genome_dir, domain)
      for species_dir in os.listdir(domain_dir):
        full_species_dir = os.path.join(domain_dir, species_dir)
        for assembly_dir in os.listdir(full_species_dir):
          # accession is everything before the second '_'
          accession = assembly_dir[0:assembly_dir.find('_', 4)]

          # skip accessions already seen in another species dir
          processed_assemblies[accession].append(species_dir)
          if len(processed_assemblies[accession]) >= 2:
            continue

          full_assembly_dir = os.path.join(full_species_dir, assembly_dir)
          genome_file = os.path.join(full_assembly_dir, assembly_dir + '_genomic.fna')
          gff_file = os.path.join(full_assembly_dir, 'prodigal', accession + '_protein.gff')
          input_files.append([genome_file, gff_file])

    # generate metadata for user genomes
    if user_genome_dir != 'NONE':
        print('Reading user genome directories.')
        for user_id in os.listdir(user_genome_dir):
          full_user_dir = os.path.join(user_genome_dir, user_id)
          if not os.path.isdir(full_user_dir):
            continue

          for genome_id in os.listdir(full_user_dir):
            full_genome_dir = os.path.join(full_user_dir, genome_id)
            genome_file = os.path.join(full_genome_dir, genome_id + '_genomic.fna')
            gff_file = os.path.join(full_genome_dir, genome_id + '_protein.gff')
            input_files.append([genome_file, gff_file])

    # process each genome
    print('Generating metadata for each genome:')
    parallel = Parallel(cpus=cpus)
    parallel.run(self._producer,
                 None,
                 input_files,
                 self._progress)
Exemplo n.º 17
0
    def run(self, gene_files, output_dir):
        """Compute codon usage over the genes of each genome.

        Parameters
        ----------
        gene_files : list
            Fasta files containing called genes in nucleotide space.
        output_dir : str
            Directory to store results.
        """

        # results are written beneath output_dir; create it on demand
        self.output_dir = output_dir
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)

        self.logger.info('  Calculating codon usage for each genome.')

        workers = Parallel(self.cpus)
        workers.run(self._producer, None, gene_files, self._progress)
Exemplo n.º 18
0
    def bootstrap(self, input_tree, msa_file, model_str, num_replicates, output_dir, cpus):
        """Perform non-parametric bootstrapping.

        Parameters
        ----------
        input_tree : str
            File containing newick tree to decorate with bootstraps.
        msa_file : str
            Fasta file containing multiple sequence alignment.
        model_str : str
            Specified either the 'WAG' or 'LG' model.
        num_replicates : int
            Number of replicates to perform.
        output_dir: str
            Output directory to contain bootstrap trees.
        cpus : int
            Number of cpus to use.
        """

        # seqmagick is an external requirement for alignment subsampling
        check_on_path('seqmagick')

        assert (model_str.upper() in ['WAG', 'LG'])

        # stash parameters where the replicate workers can read them
        self.output_dir = output_dir
        self.model = model_str
        self.msa = seq_io.read(msa_file)

        # generate bootstrap replicates in parallel
        reps = list(range(num_replicates))
        Parallel(cpus).run(self._bootstrap, None, reps, None)

        # each replicate writes its tree under rep_<i>/RAxML_bestTree.support
        rep_tree_files = [
            os.path.join(output_dir, 'rep_%d' % i, 'RAxML_bestTree.support')
            for i in reps
        ]

        tree_name = os.path.splitext(os.path.basename(input_tree))[0]
        output_tree = os.path.join(output_dir, tree_name + '.bootstrap.tree')
        bootstrap_support(input_tree, rep_tree_files, output_tree)

        return output_tree
Exemplo n.º 19
0
    def run(self, seq_file):
        """Compute the tetranucleotide signature of every sequence in a file.

        Parameters
        ----------
        seq_file : str
            Name of fasta/q file to read.

        Returns
        -------
        dict : d[seq_id] -> tetranucleotide signature in canonical order
            Count of each kmer.
        """

        self.logger.info('Calculating tetranucleotide signature for each sequence:')

        workers = Parallel(self.cpus)
        signatures = workers.run_seqs_file(self._producer, self._consumer,
                                           seq_file, self._progress)

        return signatures
Exemplo n.º 20
0
    def run(self, seq_file):
        """Determine the tetranucleotide signature of each sequence.

        Parameters
        ----------
        seq_file : str
            Name of fasta/q file to read.

        Returns
        -------
        dict : d[seq_id] -> tetranucleotide signature in canonical order
            Count of each kmer.
        """

        self.logger.info('  Calculating tetranucleotide signature for each sequence:')

        pool = Parallel(self.cpus)
        sigs = pool.run_seqs_file(self._producer, self._consumer,
                                  seq_file, self._progress)

        return sigs
Exemplo n.º 21
0
    def bootstrap(self, input_tree, msa_file, seq_type, model_str, num_replicates, output_tree, cpus):
        """Perform non-parametric bootstrapping.

        Parameters
        ----------
        input_tree : str
            File containing newick tree to decorate with bootstraps.
        msa_file : str
            Fasta file containing multiple sequence alignment.
        seq_type : str
            Specifies multiple sequences alignment is of 'nt' or 'prot'.
        model_str : str
            Specified either the 'wag' or 'jtt' model.
        num_replicates : int
            Number of replicates to perform.
        output_tree: str
            Output file containing tree with bootstrap values.
        cpus : int
            Number of cpus to use.
        """

        assert (seq_type in ['nt', 'prot'])
        assert (model_str in ['wag', 'jtt'])

        # replicates are written to a throwaway directory removed at the end
        self.replicate_dir = tempfile.mkdtemp()
        self.seq_type = seq_type
        self.model = model_str
        self.msa = seq_io.read(msa_file)

        # calculate replicates in parallel
        # (Python 2 'xrange' replaced with 'range' for Python 3 compatibility)
        parallel = Parallel(cpus)
        parallel.run(self._bootstrap, None, range(num_replicates), None)

        # calculate support values
        rep_tree_files = []
        for rep_index in range(num_replicates):
            rep_tree_files.append(os.path.join(self.replicate_dir, 'bootstrap.tree.' + str(rep_index) + '.tre'))

        bootstrap_support(input_tree, rep_tree_files, output_tree)

        shutil.rmtree(self.replicate_dir)
Exemplo n.º 22
0
    def run(self, gene_files, output_dir):
        """Tally codon usage over the genes of each genome.

        Parameters
        ----------
        gene_files : list
            Fasta files containing called genes in nucleotide space.
        output_dir : str
            Directory to store results.
        """

        # results are written beneath output_dir; create it on demand
        self.output_dir = output_dir
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir)

        self.logger.info('Calculating codon usage for each genome.')

        # suppress per-item progress reporting when logging is silenced
        callback = None if self.logger.is_silent else self._progress

        Parallel(self.cpus).run(self._producer, None, gene_files, callback)
Exemplo n.º 23
0
    def run(self, aa_gene_files, evalue, output_dir):
        """Apply reciprocal blast to all pairs of genomes in parallel.

        Parameters
        ----------
        aa_gene_files : list of str
            Amino acid fasta files to process via reciprocal blast.
        evalue : float
            E-value threshold used by blast.
        output_dir : str
            Directory to store blast results.
        """

        self.evalue = evalue
        self.output_dir = output_dir

        # set CPUs per producer process; floor division keeps this an int
        # under Python 3 (the original '/' yields a float there)
        self.producer_cpus = 1
        if self.cpus > len(aa_gene_files):
            self.producer_cpus = self.cpus // len(aa_gene_files)

        # create the blast databases in serial
        self.logger.info('  Creating blast databases:')

        parallel = Parallel(self.cpus)
        parallel.run(self._producer_db, None, aa_gene_files, self._progress)

        # perform reciprocal blast between all genome pairs
        # (Python 2 'xrange' replaced with 'range' for Python 3 compatibility)
        self.logger.info('')
        self.logger.info('  Identifying hits between all pairs of genomes:')

        genome_pairs = []
        for i in range(0, len(aa_gene_files)):
            # j starts at i so self-pairs are included and each pair occurs once
            for j in range(i, len(aa_gene_files)):
                genome_pairs.append((aa_gene_files[i], aa_gene_files[j]))

        parallel.run(self._producer_blast, None, genome_pairs, self._progress)
Exemplo n.º 24
0
    def run(self, ncbi_genome_dir, user_genome_dir, cpus):
        """Create metadata by parsing assembly stats files.

        Parameters
        ----------
        ncbi_genome_dir : str
            Root of the NCBI genome mirror (refseq/GCF and genbank/GCA), or 'NONE'.
        user_genome_dir : str
            Root directory of user genomes, or 'NONE' to skip.
        cpus : int
            Number of worker processes.
        """

        input_files = []

        # generate metadata for NCBI assemblies
        if ncbi_genome_dir != 'NONE':
            print('Reading NCBI assembly directories.')
            processed_assemblies = defaultdict(list)
            rfq_dir = os.path.join(ncbi_genome_dir, 'refseq', 'GCF')
            gbk_dir = os.path.join(ncbi_genome_dir, 'genbank', 'GCA')

            # walk GCF/GCA trees split into three-character directory shards
            for input_dir in (gbk_dir, rfq_dir):
                for first_three in os.listdir(input_dir):
                    onethird_species_dir = os.path.join(input_dir, first_three)
                    # Python 2 print statement converted to the print() function
                    print(onethird_species_dir)
                    if os.path.isfile(onethird_species_dir):
                        continue
                    for second_three in os.listdir(onethird_species_dir):
                        twothird_species_dir = os.path.join(
                            onethird_species_dir, second_three)
                        if os.path.isfile(twothird_species_dir):
                            continue
                        for third_three in os.listdir(twothird_species_dir):
                            threethird_species_dir = os.path.join(
                                twothird_species_dir, third_three)
                            if os.path.isfile(threethird_species_dir):
                                continue
                            for complete_name in os.listdir(threethird_species_dir):
                                assembly_dir = os.path.join(
                                    threethird_species_dir, complete_name)
                                if os.path.isfile(assembly_dir):
                                    continue

                                # accession is everything before the second '_'
                                accession = complete_name[0:complete_name.find(
                                    '_', 4)]

                                # skip accessions already seen elsewhere
                                processed_assemblies[accession].append(
                                    assembly_dir)
                                if len(processed_assemblies[accession]) >= 2:
                                    continue

                                # only process assemblies with an SSU prediction
                                ssu_file = os.path.join(
                                    assembly_dir, self.silva_output_dir, 'ssu.fna')
                                if os.path.exists(ssu_file):
                                    genome_file = os.path.join(
                                        assembly_dir, complete_name + '_genomic.fna')
                                    input_files.append((genome_file, ssu_file))

        # generate metadata for user genomes
        if user_genome_dir != 'NONE':
            print('Reading user genome directories.')
            for user_id in os.listdir(user_genome_dir):
                full_user_dir = os.path.join(user_genome_dir, user_id)
                if not os.path.isdir(full_user_dir):
                    continue

                for genome_id in os.listdir(full_user_dir):
                    full_genome_dir = os.path.join(full_user_dir, genome_id)

                    ssu_file = os.path.join(
                        full_genome_dir, self.silva_output_dir, 'ssu.fna')
                    if os.path.exists(ssu_file):
                        genome_file = os.path.join(
                            full_genome_dir, genome_id + '_genomic.fna')
                        input_files.append((genome_file, ssu_file))

                    # NOTE(review): this progress message is inside the
                    # per-genome loop, so it prints once per genome directory;
                    # it probably belongs after the loop — confirm intent.
                    print('Identified %d genomes to process.' %
                          len(input_files))

        # process each genome
        print('Generating metadata for each genome:')
        parallel = Parallel(cpus=cpus)
        parallel.run(self._producer,
                     None,
                     input_files,
                     self._progress)
Exemplo n.º 25
0
    def run(self, genome_ids, gene_dir, blast_dir, per_iden_threshold, per_aln_len_threshold, write_shared_genes, output_dir):
        """Calculate amino acid identity (AAI) between pairs of genomes.

        Parameters
        ----------
        genome_ids : list of str
            Unique ids of genomes to process.
        gene_dir : str
            Directory with amino acid genes in fasta format.
        blast_dir : str
            Directory with reciprocal blast between genome pairs.
        per_iden_threshold : float
            Percent identity threshold used to define a homologous gene.
        per_aln_len_threshold : float
            Alignment length threshold used to define a homologous gene.
        write_shared_genes : boolean
            Flag indicating if shared genes should be written to file.
        output_dir : str
            Directory to store AAI results.
        """

        self.gene_dir = gene_dir
        self.blast_dir = blast_dir

        self.per_identity_threshold = per_iden_threshold
        self.per_aln_len_threshold = per_aln_len_threshold
        self.write_shared_genes = write_shared_genes
        self.output_dir = output_dir

        shared_genes_dir = os.path.join(output_dir, self.shared_genes)
        make_sure_path_exists(shared_genes_dir)
        self.shared_genes_dir = shared_genes_dir

        # calculate length of genes in each genome
        self.logger.info('  Calculating length of genes in each genome.')
        self.gene_lengths = {}
        gene_files = []
        for gene_file in os.listdir(gene_dir):
            gene_file = os.path.join(gene_dir, gene_file)
            gene_files.append(gene_file)
            self.gene_lengths.update(seq_io.seq_lengths(gene_file))

        # get byte offset of hits from each genome
        self.logger.info('')
        self.logger.info('  Indexing blast hits.')
        self.blast_table = os.path.join(self.blast_dir, self.blast_table_file)
        self.offset_table = self._genome_offsets(self.blast_table)

        # calculate AAI between each pair of genomes in parallel
        self.logger.info('')
        self.logger.info('  Calculating amino acid identity between all pairs of genomes:')

        # enumerate all unique, unordered pairs of gene files
        # (range, not the Python 2-only xrange, so this runs under Python 3)
        genome_pairs = []
        for i in range(0, len(gene_files)):
            for j in range(i + 1, len(gene_files)):
                genome_pairs.append((gene_files[i], gene_files[j]))

        if len(genome_pairs) == 0:
            self.logger.warning('  [Warning] No genome pairs identified.')
            return

        parallel = Parallel(self.cpus)
        consumer_data = parallel.run(self._producer, self._consumer, genome_pairs, self._progress)

        # write results for each genome pair
        aai_summary_file = os.path.join(output_dir, 'aai_summary.tsv')
        fout = open(aai_summary_file, 'w')
        fout.write('Genome Id A\tGenes in A\tGenome Id B\tGenes in B\t# orthologous genes\tMean AAI\tStd AAI\n')

        for data in consumer_data:
            fout.write('%s\t%d\t%s\t%d\t%d\t%.2f\t%.2f\n' % data)

        fout.close()

        self.logger.info('')
        self.logger.info('  Summary of AAI between genomes: %s' % aai_summary_file)
Exemplo n.º 26
0
    def run(self, query_gene_file,
                    target_gene_file,
                    sorted_hit_table,
                    evalue_threshold,
                    per_iden_threshold,
                    per_aln_len_threshold,
                    keep_rbhs,
                    output_dir):
        """Calculate amino acid identity (AAI) between pairs of genomes.

        Parameters
        ----------
        query_gene_file : str
            File with all query genes in FASTA format.
        target_gene_file : str or None
            File with all target genes in FASTA format, or None if performing a reciprocal AAI calculation.
        sorted_hit_table : str
            Sorted table indicating genes with sequence similarity.
        evalue_threshold : float
            Evalue threshold used to define a homologous gene.
        per_iden_threshold : float
            Percent identity threshold used to define a homologous gene.
        per_aln_len_threshold : float
            Alignment length threshold used to define a homologous gene.
        keep_rbhs : boolean
            Flag indicating if RBH should be written to file.
        output_dir : str
            Directory to store AAI results.

        Returns
        -------
        tuple of (str, str or None)
            Path to the AAI summary file and to the concatenated RBH file
            (None unless keep_rbhs is set), or None if no pairs to process.
        """

        self.sorted_hit_table = sorted_hit_table
        self.evalue_threshold = evalue_threshold
        self.per_identity_threshold = per_iden_threshold
        self.per_aln_len_threshold = per_aln_len_threshold
        self.keep_rbhs = keep_rbhs
        self.output_dir = output_dir

        # calculate length of genes and number of genes in each genome
        self.logger.info('Calculating length of genes.')
        self.gene_lengths = {}
        self.query_gene_count = defaultdict(int)
        query_genomes = set()
        for seq_id, seq in seq_io.read_fasta_seq(query_gene_file):
            # ignore a trailing stop-codon symbol when computing gene length;
            # endswith() also safely handles (unexpected) empty sequences
            # where seq[-1] would raise IndexError
            if seq.endswith('*'):
                self.gene_lengths[seq_id] = len(seq) - 1
            else:
                self.gene_lengths[seq_id] = len(seq)

            # genome id is encoded before the '~' separator in sequence ids
            genome_id = seq_id[0:seq_id.find('~')]
            self.query_gene_count[genome_id] += 1
            query_genomes.add(genome_id)

        self.target_gene_count = defaultdict(int)
        target_genomes = set()
        if target_gene_file:
            for seq_id, seq in seq_io.read_fasta_seq(target_gene_file):
                if seq.endswith('*'):
                    self.gene_lengths[seq_id] = len(seq) - 1
                else:
                    self.gene_lengths[seq_id] = len(seq)

                genome_id = seq_id[0:seq_id.find('~')]
                self.target_gene_count[genome_id] += 1
                target_genomes.add(genome_id)
        else:
            self.target_gene_count = self.query_gene_count

        # get byte offset of hits from each genome
        self.logger.info('Indexing sorted hit table.')
        self.offset_table = self._genome_offsets(self.sorted_hit_table)

        # calculate AAI between each pair of genomes in parallel
        if target_genomes:
            # compare query genomes to target genomes
            self.num_pairs = len(query_genomes) * len(target_genomes)
            self.logger.info('Calculating AAI between %d query and %d target genomes:' % (len(query_genomes), len(target_genomes)))
        else:
            # compute pairwise values between query genomes; use floor
            # division so num_pairs is an int under Python 3
            ng = len(query_genomes)
            self.num_pairs = (ng * ng - ng) // 2
            self.logger.info('Calculating AAI between all %d pairs of genomes:' % self.num_pairs)

        if self.num_pairs == 0:
            self.logger.warning('No genome pairs identified.')
            return

        # build the work list: each item is one query genome and the list of
        # genomes it must be compared against
        genome_id_lists = []
        query_genomes = list(query_genomes)
        target_genomes = list(target_genomes)
        for i in range(0, len(query_genomes)):
            genome_idI = query_genomes[i]

            if target_genomes:
                genome_id_list = target_genomes
            else:
                genome_id_list = []
                for j in range(i + 1, len(query_genomes)):
                    genome_idJ = query_genomes[j]
                    genome_id_list.append(genome_idJ)

            genome_id_lists.append((genome_idI, genome_id_list))

        self.processed_paired = 0
        parallel = Parallel(self.cpus)

        progress_func = self._progress
        if self.logger.is_silent:
            progress_func = None
        consumer_data = parallel.run(self._producer, self._consumer, genome_id_lists, progress_func)

        # write results for each genome pair
        self.logger.info('Summarizing AAI results.')
        aai_summary_file = os.path.join(output_dir, 'aai_summary.tsv')
        fout = open(aai_summary_file, 'w')
        fout.write('#Genome A\tGenes in A\tGenome B\tGenes in B\t# orthologous genes\tMean AAI\tStd AAI\tOrthologous fraction (OF)\n')

        for data in consumer_data:
            fout.write('%s\t%d\t%s\t%d\t%d\t%.2f\t%.2f\t%.2f\n' % data)

        fout.close()

        # concatenate RBH files produced per query genome, then remove them
        rbh_output_file = None
        if self.keep_rbhs:
            self.logger.info('Concatenating RBH files.')
            rbh_files = []
            for genome_id in query_genomes:
                rbh_files.append(os.path.join(self.output_dir, genome_id + '.rbh.tsv'))

            rbh_output_file = os.path.join(self.output_dir, 'rbh.tsv')
            concatenate_files(rbh_files, rbh_output_file, common_header=True)

            for f in rbh_files:
                os.remove(f)

        return aai_summary_file, rbh_output_file
Exemplo n.º 27
0
    def run(self, 
                input_tree, 
                msa_file, 
                num_replicates, 
                model, 
                gamma,
                base_type, 
                frac,
                boot_dir,
                output_dir):
        """Bootstrap multiple sequence alignment.

        Parameters
        ----------
        input_tree : str
          Tree inferred with all data.
        msa_file : str
          File containing multiple sequence alignment for all taxa.
        num_replicates : int
          Number of replicates to perform.
        model : str
          Desired model of evolution.
        gamma : bool
          Flag indicating if gamma-distributed rate heterogeneity should be used
          (passed through to the replicate workers; confirm against _producer).
        base_type : str
          Indicates if bases are nucleotides or amino acids.
        frac : float
          Fraction of alignment to subsample.
        boot_dir : str or None
          Directory with pre-computed bootstrap trees; if set, replicate
          calculation is skipped and trees are read from this directory.
        output_dir : str
          Directory for bootstrap trees.

        Returns
        -------
        str
          Path to tree decorated with bootstrap support values.
        """

        assert(model in ['wag', 'lg', 'jtt'])
        assert(base_type in ['nt', 'prot'])

        self.model = model
        self.gamma = gamma
        self.base_type = base_type
        self.frac = frac

        rep_tree_files = []
        if not boot_dir:
            self.replicate_dir = os.path.join(output_dir, 'replicates')
            make_sure_path_exists(self.replicate_dir)

            # read full multiple sequence alignment
            self.msa = seq_io.read(msa_file)

            # calculate replicates
            # (range, not the Python 2-only xrange, so this runs under Python 3)
            self.logger.info('Calculating bootstrap replicates:')
            parallel = Parallel(self.cpus)
            parallel.run(self._producer, None, range(num_replicates), self._progress)

            for rep_index in range(num_replicates):
                rep_tree_files.append(os.path.join(self.replicate_dir, 'bootstrap_tree.r_' + str(rep_index) + '.tree'))
        else:
            # reuse previously computed replicate trees
            for f in os.listdir(boot_dir):
                if f.endswith('.tree') or f.endswith('.tre'):
                    rep_tree_files.append(os.path.join(boot_dir, f))
            self.logger.info('Read %d bootstrap replicates.' % len(rep_tree_files))
          
        # calculate support values
        self.logger.info('Calculating bootstrap support values.')
        output_tree = os.path.join(output_dir, remove_extension(input_tree) + '.bootstrap.tree')
        bootstrap_support(input_tree, rep_tree_files, output_tree)

        return output_tree
Exemplo n.º 28
0
    def _run(self, rna_gene, ncbi_genome_dir, user_genome_dir, db, cpus):
        """Create metadata by parsing assembly stats files.

        Parameters
        ----------
        rna_gene : str
            rRNA gene to process ('ssu', 'lsu_23S', or 'lsu_5S').
        ncbi_genome_dir : str
            Root directory with NCBI assemblies, or 'NONE' to skip.
        user_genome_dir : str
            Root directory with user genomes, or 'NONE' to skip.
        db : str
            Reference database to curate against; only 'SILVA' is supported.
        cpus : int
            Number of CPUs to use for processing genomes.
        """

        self.rna_gene = rna_gene

        if db == 'SILVA':
            # Silva info
            if rna_gene == 'ssu':
                self.db = self.silva_ssu_ref_file
                self.taxonomy = self.silva_ssu_taxonomy_file
            elif rna_gene == 'lsu_23S':
                self.db = self.silva_lsu_ref_file
                self.taxonomy = self.silva_lsu_taxonomy_file
            elif rna_gene == 'lsu_5S':
                # no 5S reference database; sequences are still identified
                # (was a Python 2 print statement: a SyntaxError under Python 3)
                print('We currently do not curate against a 5S database, but do identify these sequences for quality assessment purposes.')
            self.output_dir = self.silva_output_dir
        else:
            print('Unrecognized database: %s' % db)
            sys.exit(-1)

        input_files = []

        # generate metadata for NCBI assemblies
        if ncbi_genome_dir != 'NONE':
            print('Reading NCBI assembly directories.')
            processed_assemblies = defaultdict(list)
            for domain in ['archaea', 'bacteria']:
                domain_dir = os.path.join(ncbi_genome_dir, domain)
                if not os.path.exists(domain_dir):
                    continue

                for species_dir in os.listdir(domain_dir):
                    full_species_dir = os.path.join(domain_dir, species_dir)
                    for assembly_dir in os.listdir(full_species_dir):
                        # accession is the prefix up to the first '_' at or
                        # after index 4 (e.g. the GCA_/GCF_ archive prefix)
                        accession = assembly_dir[0:assembly_dir.find('_', 4)]

                        # only process the first occurrence of an accession
                        processed_assemblies[accession].append(species_dir)
                        if len(processed_assemblies[accession]) >= 2:
                            continue

                        full_assembly_dir = os.path.join(
                            full_species_dir, assembly_dir)
                        hmm_results_file = os.path.join(
                            full_assembly_dir, self.output_dir,
                            rna_gene + '.hmm_summary.tsv')
                        if os.path.exists(hmm_results_file):
                            # results already computed for this assembly
                            continue

                        genome_file = os.path.join(
                            full_assembly_dir, assembly_dir + '_genomic.fna')
                        input_files.append(genome_file)

        # generate metadata for user genomes
        if user_genome_dir != 'NONE':
            print('Reading user genome directories.')
            for user_id in os.listdir(user_genome_dir):
                full_user_dir = os.path.join(user_genome_dir, user_id)
                if not os.path.isdir(full_user_dir):
                    continue

                for genome_id in os.listdir(full_user_dir):
                    full_genome_dir = os.path.join(full_user_dir, genome_id)

                    hmm_results_file = os.path.join(
                        full_genome_dir, self.output_dir,
                        rna_gene + '.hmm_summary.tsv')
                    if os.path.exists(hmm_results_file):
                        continue

                    genome_file = os.path.join(full_genome_dir,
                                               genome_id + '_genomic.fna')
                    input_files.append(genome_file)

            print('Identified %d genomes to process.' % len(input_files))

        # process each genome
        print('Generating metadata for each genome:')
        parallel = Parallel(cpus=cpus)
        parallel.run(self._producer, None, input_files, self._progress)
Exemplo n.º 29
0
    def _run(self, rna_gene, ncbi_genome_dir, user_genome_dir, ssu_db, cpus):
        """Create metadata by parsing assembly stats files.

        Parameters
        ----------
        rna_gene : str
            rRNA gene to process ('ssu', 'lsu_23S', or 'lsu_5S').
        ncbi_genome_dir : str
            Root directory with NCBI assemblies, or 'NONE' to skip.
        user_genome_dir : str
            Root directory with user genomes, or 'NONE' to skip.
        ssu_db : str
            Reference database to curate against ('GG' or 'SILVA').
        cpus : int
            Number of CPUs to use for processing genomes.
        """

        self.rna_gene = rna_gene

        # NOTE: all print statements below were Python 2 syntax
        # (SyntaxError under Python 3); converted to print() calls.
        if ssu_db == 'GG':
            # Greengenes data files and desired output
            if rna_gene == 'ssu':
                self.db = '/srv/db/gg/2013_08/gg_13_8_otus/rep_set/99_otus.fasta'
                self.taxonomy = '/srv/db/gg/2013_08/gg_13_8_otus/taxonomy/99_otu_taxonomy.txt'
                self.output_dir = 'ssu_gg'
            elif rna_gene == 'lsu_23S':
                print('There is no 23S LSU database for GG.')
                return
            elif rna_gene == 'lsu_5S':
                return
        elif ssu_db == 'SILVA':
            # Silva info
            if rna_gene == 'ssu':
                self.db = '/srv/whitlam/bio/db/silva/123.1/SILVA_123.1_SSURef_Nr99_tax_silva.fasta'
                self.taxonomy = '/srv/whitlam/bio/db/silva/123.1/silva_taxonomy.ssu.tsv'
                self.output_dir = 'rna_silva'
            elif rna_gene == 'lsu_23S':
                self.db = '/srv/db/silva/123.1/SILVA_123.1_LSURef_tax_silva.fasta'
                self.taxonomy = '/srv/whitlam/bio/db/silva/123.1/silva_taxonomy.lsu.tsv'
                self.output_dir = 'rna_silva'
            elif rna_gene == 'lsu_5S':
                print('We currently do not curate against a 5S database, but do identify these sequences for quality assessment purposes.')
                self.output_dir = 'lsu_5S'

        input_files = []

        # generate metadata for NCBI assemblies
        if ncbi_genome_dir != 'NONE':
            print('Reading NCBI assembly directories.')
            processed_assemblies = defaultdict(list)
            for domain in ['archaea', 'bacteria']:
                domain_dir = os.path.join(ncbi_genome_dir, domain)
                if not os.path.exists(domain_dir):
                    continue

                for species_dir in os.listdir(domain_dir):
                    full_species_dir = os.path.join(domain_dir, species_dir)
                    for assembly_dir in os.listdir(full_species_dir):
                        # accession is the prefix up to the first '_' at or
                        # after index 4 (e.g. the GCA_/GCF_ archive prefix)
                        accession = assembly_dir[0:assembly_dir.find('_', 4)]

                        # only process the first occurrence of an accession
                        processed_assemblies[accession].append(species_dir)
                        if len(processed_assemblies[accession]) >= 2:
                            continue

                        full_assembly_dir = os.path.join(
                            full_species_dir, assembly_dir)

                        #if os.path.exists(os.path.join(full_assembly_dir, self.output_dir)):
                        #  continue

                        genome_file = os.path.join(
                            full_assembly_dir, assembly_dir + '_genomic.fna')
                        input_files.append(genome_file)

        # generate metadata for user genomes
        if user_genome_dir != 'NONE':
            print('Reading user genome directories.')
            for user_id in os.listdir(user_genome_dir):
                full_user_dir = os.path.join(user_genome_dir, user_id)
                if not os.path.isdir(full_user_dir):
                    continue

                for genome_id in os.listdir(full_user_dir):
                    full_genome_dir = os.path.join(full_user_dir, genome_id)

                    # skip genomes with existing results
                    if os.path.exists(
                            os.path.join(full_genome_dir, self.output_dir)):
                        continue

                    genome_file = os.path.join(full_genome_dir,
                                               genome_id + '_genomic.fna')
                    input_files.append(genome_file)

        print('Identified %d genomes to process.' % len(input_files))

        # process each genome
        print('Generating metadata for each genome:')
        parallel = Parallel(cpus=cpus)
        if len(input_files) > 0:
            parallel.run(self._producer, None, input_files, self._progress)
Exemplo n.º 30
0
    def run(self, input_tree, msa_file, marker_info_file, mask_file,
            perc_markers_to_keep, num_replicates, model, jk_dir, output_dir):
        """Jackknife marker genes.

        Marker file should have the format:
          <marker id>\t<marker name>\t<marker desc>\t<length>\n

        Parameters
        ----------
        input_tree : str
          Tree inferred with all data.
        msa_file : str
          File containing multiple sequence alignment for all taxa.
        marker_info_file : str
          File indicating database id, HMM name, description and length of each marker in the alignment.
        mask_file : str
          File indicating masking of multiple sequence alignment.
        perc_markers_to_keep : float [0, 1]
          Percentage of marker genes to keep in each replicate.
        num_replicates : int
          Number of replicates to perform.
        model : str
          Desired model of evolution.
        jk_dir : str or None
          Directory with pre-computed jackknife trees; if set, replicate
          calculation is skipped and trees are read from this directory.
        output_dir : str
          Output directory for jackkife trees.

        Returns
        -------
        str
          Path to tree decorated with jackknife support values.
        """

        assert (model in ['wag', 'jtt'])

        self.model = model
        self.perc_markers_to_keep = perc_markers_to_keep

        # determine length of each marker gene in alignment
        rep_tree_files = []
        if not jk_dir:
            self.replicate_dir = os.path.join(output_dir, 'replicates')
            make_sure_path_exists(self.replicate_dir)

            marker_lengths = []
            total_len = 0
            with open(marker_info_file) as f:
                f.readline()  # skip header line
                for line in f:
                    line_split = line.split('\t')
                    ml = int(line_split[3])
                    marker_lengths.append(ml)
                    total_len += ml

            self.logger.info('Concatenated length of markers: %d' % total_len)

            # read mask and reduce each marker length by the number of
            # masked-out ('0') columns within its span
            mask = open(mask_file).readline().strip()
            start = 0
            self.marker_lengths = []
            total_mask_len = 0
            for ml in marker_lengths:
                end = start + ml
                zeros = mask[start:end].count('0')
                start = end

                self.marker_lengths.append(ml - zeros)
                total_mask_len += ml - zeros

            self.logger.info('Concatenated length of filtered MSA: %d' %
                             total_mask_len)

            # read full multiple sequence alignment
            self.msa = seq_io.read(msa_file)

            # dict views are not subscriptable under Python 3;
            # next(iter(...)) fetches an arbitrary alignment to length-check
            if len(next(iter(self.msa.values()))) != total_mask_len:
                self.logger.error(
                    'Length of MSA does not meet length of mask.')
                sys.exit()

            # calculate replicates
            # (range, not the Python 2-only xrange, so this runs under Python 3)
            self.logger.info('Calculating jackknife marker replicates:')
            parallel = Parallel(self.cpus)
            parallel.run(self._producer, None, range(num_replicates),
                         self._progress)

            # calculate support
            self.logger.info('Calculating support for %d replicates.' %
                             num_replicates)
            for rep_index in range(num_replicates):
                rep_tree_files.append(
                    os.path.join(self.replicate_dir,
                                 'jk_markers.tree.' + str(rep_index) + '.tre'))
        else:
            # reuse previously computed replicate trees
            for f in os.listdir(jk_dir):
                if f.endswith('.tree') or f.endswith('.tre'):
                    rep_tree_files.append(os.path.join(jk_dir, f))
            self.logger.info('Read %d jackknife replicates.' %
                             len(rep_tree_files))

        output_tree = os.path.join(
            output_dir,
            remove_extension(input_tree) + '.jk_markers.tree')
        bootstrap_support(input_tree, rep_tree_files, output_tree)

        return output_tree
Exemplo n.º 31
0
    def run(self, 
                genome_files, 
                output_dir, 
                called_genes=False, 
                translation_table=None, 
                meta=False,
                closed_ends=False):
        """Call genes with Prodigal.

        Runs Prodigal over the supplied genomes in parallel and writes the
        results to output_dir. If called_genes is set, gene calling is
        assumed to have been done already and the existing files are simply
        copied into output_dir.

        Parameters
        ----------
        genome_files : list of str
            Nucleotide fasta files to call genes on.
        called_genes : boolean
            Flag indicating if genes are already called.
        translation_table : int
            Specifies desired translation table, use None to automatically
            select between tables 4 and 11.
        meta : boolean
            Flag indicating if prodigal should call genes with the metagenomics procedure.
        closed_ends : boolean
            If True, do not allow genes to run off edges (throws -c flag).
        output_dir : str
            Directory to store called genes.

        Returns
        -------
        d[genome_id] -> namedtuple(best_translation_table
                                            coding_density_4
                                            coding_density_11)
            Summary statistics of called genes for each genome.
        """

        # record run configuration on the instance so worker
        # processes can read it
        self.called_genes = called_genes
        self.translation_table = translation_table
        self.meta = meta
        self.closed_ends = closed_ends
        self.output_dir = output_dir
        make_sure_path_exists(self.output_dir)

        # progress reporting is enabled only in verbose mode
        progress_func = None
        if self.verbose:
            if meta:
                self.progress_str = '  Finished processing %d of %d (%.2f%%) files.'
                if genome_files:
                    file_type = ntpath.basename(genome_files[0])
                else:
                    file_type = 'scaffolds'
            else:
                self.progress_str = '  Finished processing %d of %d (%.2f%%) genomes.'
                file_type = 'genomes'

            self.logger.info('Identifying genes within %s: ' % file_type)
            progress_func = self._progress

        # fan the input files out across the available CPUs
        worker_pool = Parallel(self.cpus)
        summary_stats = worker_pool.run(self._producer,
                                        self._consumer,
                                        genome_files,
                                        progress_func)

        return summary_stats
Exemplo n.º 32
0
    def run(self, query_gene_file, target_gene_file, sorted_hit_table,
            evalue_threshold, per_iden_threshold, per_aln_len_threshold,
            keep_rbhs, output_dir):
        """Calculate amino acid identity (AAI) between pairs of genomes.

        Parameters
        ----------
        query_gene_file : str
            File with all query genes in FASTA format.
        target_gene_file : str or None
            File with all target genes in FASTA format, or None if performing a reciprocal AAI calculation.
        sorted_hit_table : str
            Sorted table indicating genes with sequence similarity.
        evalue_threshold : float
            Evalue threshold used to define a homologous gene.
        per_iden_threshold : float
            Percent identity threshold used to define a homologous gene.
        per_aln_len_threshold : float
            Alignment length threshold used to define a homologous gene.
        keep_rbhs : boolean
            Flag indicating if RBH should be written to file.
        output_dir : str
            Directory to store AAI results.

        Returns
        -------
        tuple of (str, str or None)
            Path to the AAI summary file and to the concatenated RBH file
            (None unless keep_rbhs is set), or None if no pairs to process.
        """

        self.sorted_hit_table = sorted_hit_table
        self.evalue_threshold = evalue_threshold
        self.per_identity_threshold = per_iden_threshold
        self.per_aln_len_threshold = per_aln_len_threshold
        self.keep_rbhs = keep_rbhs
        self.output_dir = output_dir

        # calculate length of genes and number of genes in each genome
        self.logger.info('Calculating length of genes.')
        self.gene_lengths = {}
        self.query_gene_count = defaultdict(int)
        query_genomes = set()
        for seq_id, seq in seq_io.read_fasta_seq(query_gene_file):
            # ignore a trailing stop-codon symbol when computing gene length;
            # endswith() also safely handles (unexpected) empty sequences
            # where seq[-1] would raise IndexError
            if seq.endswith('*'):
                self.gene_lengths[seq_id] = len(seq) - 1
            else:
                self.gene_lengths[seq_id] = len(seq)

            # genome id is encoded before the '~' separator in sequence ids
            genome_id = seq_id[0:seq_id.find('~')]
            self.query_gene_count[genome_id] += 1
            query_genomes.add(genome_id)

        self.target_gene_count = defaultdict(int)
        target_genomes = set()
        if target_gene_file:
            for seq_id, seq in seq_io.read_fasta_seq(target_gene_file):
                if seq.endswith('*'):
                    self.gene_lengths[seq_id] = len(seq) - 1
                else:
                    self.gene_lengths[seq_id] = len(seq)

                genome_id = seq_id[0:seq_id.find('~')]
                self.target_gene_count[genome_id] += 1
                target_genomes.add(genome_id)
        else:
            self.target_gene_count = self.query_gene_count

        # get byte offset of hits from each genome
        self.logger.info('Indexing sorted hit table.')
        self.offset_table = self._genome_offsets(self.sorted_hit_table)

        # calculate AAI between each pair of genomes in parallel
        if target_genomes:
            # compare query genomes to target genomes
            self.num_pairs = len(query_genomes) * len(target_genomes)
            self.logger.info(
                'Calculating AAI between %d query and %d target genomes:' %
                (len(query_genomes), len(target_genomes)))
        else:
            # compute pairwise values between query genomes; use floor
            # division so num_pairs is an int under Python 3
            ng = len(query_genomes)
            self.num_pairs = (ng * ng - ng) // 2
            self.logger.info(
                'Calculating AAI between all %d pairs of genomes:' %
                self.num_pairs)

        if self.num_pairs == 0:
            self.logger.warning('No genome pairs identified.')
            return

        # build the work list: each item is one query genome and the list of
        # genomes it must be compared against
        # (range, not the Python 2-only xrange, so this runs under Python 3)
        genome_id_lists = []
        query_genomes = list(query_genomes)
        target_genomes = list(target_genomes)
        for i in range(0, len(query_genomes)):
            genome_idI = query_genomes[i]

            if target_genomes:
                genome_id_list = target_genomes
            else:
                genome_id_list = []
                for j in range(i + 1, len(query_genomes)):
                    genome_idJ = query_genomes[j]
                    genome_id_list.append(genome_idJ)

            genome_id_lists.append((genome_idI, genome_id_list))

        self.processed_paired = 0
        parallel = Parallel(self.cpus)

        progress_func = self._progress
        if self.logger.is_silent:
            progress_func = None
        consumer_data = parallel.run(self._producer, self._consumer,
                                     genome_id_lists, progress_func)

        # write results for each genome pair
        self.logger.info('Summarizing AAI results.')
        aai_summary_file = os.path.join(output_dir, 'aai_summary.tsv')
        fout = open(aai_summary_file, 'w')
        fout.write(
            'Genome A\tGenes in A\tGenome B\tGenes in B\t# orthologous genes\tMean AAI\tStd AAI\tOrthologous fraction (OF)\n'
        )

        for data in consumer_data:
            fout.write('%s\t%d\t%s\t%d\t%d\t%.2f\t%.2f\t%.2f\n' % data)

        fout.close()

        # concatenate RBH files produced per query genome, then remove them
        rbh_output_file = None
        if self.keep_rbhs:
            self.logger.info('Concatenating RBH files.')
            rbh_files = []
            for genome_id in query_genomes:
                rbh_files.append(
                    os.path.join(self.output_dir, genome_id + '.rbh.tsv'))

            rbh_output_file = os.path.join(self.output_dir, 'rbh.tsv')
            concatenate_files(rbh_files, rbh_output_file, common_header=True)

            for f in rbh_files:
                os.remove(f)

        return aai_summary_file, rbh_output_file
Exemplo n.º 33
0
  def _run(self, rna_gene, ncbi_genome_dir, user_genome_dir, ssu_db, cpus):
      """Identify rRNA genes in NCBI and user genomes against a reference DB.

      Parameters
      ----------
      rna_gene : str
          rRNA gene to process ('ssu' or 'lsu_23S').
      ncbi_genome_dir : str
          Root of NCBI assemblies with 'archaea' and 'bacteria' subdirectories.
      user_genome_dir : str
          Root of user genome directories, or 'NONE' to skip user genomes.
      ssu_db : str
          Reference database to search against ('GG' or 'SILVA').
      cpus : int
          Number of CPUs to use for parallel processing.
      """

      self.rna_gene = rna_gene

      if ssu_db == 'GG':
          # Greengenes only provides a SSU reference set
          if rna_gene == 'ssu':
              self.db = '/srv/db/gg/2013_08/gg_13_8_otus/rep_set/99_otus.fasta'
              self.taxonomy = '/srv/db/gg/2013_08/gg_13_8_otus/taxonomy/99_otu_taxonomy.txt'
              self.output_dir = 'ssu_gg'
          else:
              print('There is no LSU database for GG.')
              sys.exit(-1)  # non-zero status: this is an error, not a clean exit
      elif ssu_db == 'SILVA':
          if rna_gene == 'ssu':
              self.db = '/srv/whitlam/bio/db/silva/123.1/SILVA_123.1_SSURef_Nr99_tax_silva.fasta'
              self.taxonomy = '/srv/whitlam/bio/db/silva/123.1/silva_taxonomy.ssu.tsv'
          elif rna_gene == 'lsu_23S':
              self.db = '/srv/db/silva/123.1/SILVA_123.1_LSURef_tax_silva.fasta'
              self.taxonomy = '/srv/whitlam/bio/db/silva/123.1/silva_taxonomy.lsu.tsv'
          self.output_dir = 'rna_silva'
      else:
          # previously fell through with self.db/self.taxonomy unset, causing
          # a confusing failure later; fail fast with a clear message instead
          print('Unrecognized database: %s' % ssu_db)
          sys.exit(-1)

      input_files = []

      # generate metadata for NCBI assemblies
      print('Reading NCBI assembly directories.')
      processed_assemblies = defaultdict(list)
      for domain in ['archaea', 'bacteria']:
          domain_dir = os.path.join(ncbi_genome_dir, domain)
          if not os.path.exists(domain_dir):
              continue

          for species_dir in os.listdir(domain_dir):
              full_species_dir = os.path.join(domain_dir, species_dir)
              for assembly_dir in os.listdir(full_species_dir):
                  # accession is everything up to the second underscore
                  # (e.g. GCF_000123456 from GCF_000123456.1_ASM...)
                  accession = assembly_dir[0:assembly_dir.find('_', 4)]

                  processed_assemblies[accession].append(species_dir)
                  if len(processed_assemblies[accession]) >= 2:
                      # accession already seen; skip duplicate assemblies
                      continue

                  full_assembly_dir = os.path.join(full_species_dir, assembly_dir)
                  genome_file = os.path.join(full_assembly_dir, assembly_dir + '_genomic.fna')
                  input_files.append(genome_file)

      # generate metadata for user genomes
      if user_genome_dir != 'NONE':
          print('Reading user genome directories.')
          for user_id in os.listdir(user_genome_dir):
              full_user_dir = os.path.join(user_genome_dir, user_id)
              if not os.path.isdir(full_user_dir):
                  continue

              for genome_id in os.listdir(full_user_dir):
                  full_genome_dir = os.path.join(full_user_dir, genome_id)
                  genome_file = os.path.join(full_genome_dir, genome_id + '_genomic.fna')
                  input_files.append(genome_file)

          print('Identified %d genomes to process.' % len(input_files))

      # process each genome in parallel
      print('Generating metadata for each genome:')
      parallel = Parallel(cpus=cpus)
      parallel.run(self._producer,
                   None,
                   input_files,
                   self._progress)
Exemplo n.º 34
0
    def _run(self, rna_gene, ncbi_genome_dir, user_genome_dir, db, cpus):
        """Identify rRNA genes in genomes lacking existing HMM results.

        Parameters
        ----------
        rna_gene : str
            rRNA gene to process ('ssu', 'lsu_23S', or 'lsu_5S').
        ncbi_genome_dir : str
            Root of the NCBI assembly hierarchy, or 'NONE' to skip NCBI genomes.
        user_genome_dir : str
            Root of the user genome directories, or 'NONE' to skip user genomes.
        db : str
            Reference database to search against (only 'SILVA' is recognized).
        cpus : int
            Number of CPUs to use for parallel processing.
        """

        self.rna_gene = rna_gene

        if db == 'SILVA':
            if rna_gene == 'ssu':
                self.db = self.silva_ssu_ref_file
                self.taxonomy = self.silva_ssu_taxonomy_file
            elif rna_gene == 'lsu_23S':
                self.db = self.silva_lsu_ref_file
                self.taxonomy = self.silva_lsu_taxonomy_file
            elif rna_gene == 'lsu_5S':
                # NOTE(review): self.db and self.taxonomy appear to be
                # intentionally left unset for 5S — sequences are only
                # identified, not curated; confirm downstream code tolerates this
                print('We currently do not curate against a 5S database, but do identify these sequences for quality assessment purposes.')
            self.output_dir = self.silva_output_dir
        else:
            print('Unrecognized database: %s' % db)
            sys.exit(-1)

        input_files = []

        # generate metadata for NCBI assemblies; assemblies are nested as
        # <root>/{refseq/GCF,genbank/GCA}/<3 chars>/<3 chars>/<3 chars>/<assembly>
        if ncbi_genome_dir != 'NONE':
            print('Reading NCBI assembly directories.')
            processed_assemblies = defaultdict(list)
            rfq_dir = os.path.join(ncbi_genome_dir, 'refseq', 'GCF')
            gbk_dir = os.path.join(ncbi_genome_dir, 'genbank', 'GCA')

            for input_dir in (rfq_dir, gbk_dir):
                for first_three in os.listdir(input_dir):
                    onethird_species_dir = os.path.join(input_dir, first_three)
                    # print onethird_species_dir
                    if os.path.isfile(onethird_species_dir):
                        continue
                    for second_three in os.listdir(onethird_species_dir):
                        twothird_species_dir = os.path.join(
                            onethird_species_dir, second_three)
                        # print twothird_species_dir
                        if os.path.isfile(twothird_species_dir):
                            continue
                        for third_three in os.listdir(twothird_species_dir):
                            threethird_species_dir = os.path.join(
                                twothird_species_dir, third_three)
                            # print threethird_species_dir
                            if os.path.isfile(threethird_species_dir):
                                continue
                            for complete_name in os.listdir(
                                    threethird_species_dir):
                                assembly_dir = os.path.join(
                                    threethird_species_dir, complete_name)
                                if os.path.isfile(assembly_dir):
                                    continue

                                # accession is everything up to the second
                                # underscore (e.g. GCF_000123456)
                                accession = complete_name[0:complete_name.
                                                          find('_', 4)]

                                # only process the first assembly seen for
                                # each accession
                                processed_assemblies[accession].append(
                                    assembly_dir)
                                if len(processed_assemblies[accession]) >= 2:
                                    continue

                                # skip genomes that already have HMM results
                                hmm_results_file = os.path.join(
                                    assembly_dir, self.output_dir,
                                    rna_gene + '.hmm_summary.tsv')
                                if os.path.exists(hmm_results_file):
                                    continue

                                genome_file = os.path.join(
                                    assembly_dir,
                                    complete_name + '_genomic.fna')
                                input_files.append(genome_file)

        # generate metadata for user genomes
        if user_genome_dir != 'NONE':
            print('Reading user genome directories.')
            for user_id in os.listdir(user_genome_dir):
                full_user_dir = os.path.join(user_genome_dir, user_id)
                if not os.path.isdir(full_user_dir):
                    continue

                for genome_id in os.listdir(full_user_dir):
                    full_genome_dir = os.path.join(full_user_dir, genome_id)

                    # skip genomes that already have HMM results
                    hmm_results_file = os.path.join(
                        full_genome_dir, self.output_dir,
                        rna_gene + '.hmm_summary.tsv')
                    if os.path.exists(hmm_results_file):
                        continue

                    genome_file = os.path.join(full_genome_dir,
                                               genome_id + '_genomic.fna')
                    input_files.append(genome_file)

            print('Identified %d genomes to process.' % len(input_files))

        # process each genome in parallel
        print('Generating metadata for each genome:')
        parallel = Parallel(cpus=cpus)
        parallel.run(self._producer, None, input_files, self._progress)
Exemplo n.º 35
0
    def run(self, input_tree,
                    msa_file,
                    marker_info_file,
                    mask_file,
                    perc_markers_to_keep,
                    num_replicates,
                    model,
                    jk_dir,
                    output_dir):
        """Jackknife marker genes.

        Marker file should have the format:
          <marker id>\t<marker name>\t<marker desc>\t<length>\n

        Parameters
        ----------
        input_tree : str
          Tree inferred with all data.
        msa_file : str
          File containing multiple sequence alignment for all taxa.
        marker_info_file : str
          File indicating database id, HMM name, description and length of each marker in the alignment.
        mask_file : str
          File indicating masking of multiple sequence alignment.
        perc_markers_to_keep : float [0, 1]
          Percentage of marker genes to keep in each replicate.
        num_replicates : int
          Number of replicates to perform.
        model : str
          Desired model of evolution.
        jk_dir : str
          Directory with previously computed replicate trees; falsy (e.g. None)
          to compute replicates from scratch.
        output_dir : str
          Output directory for jackknife trees.

        Returns
        -------
        str
          Path to input tree decorated with jackknife support values.
        """

        assert(model in ['wag', 'jtt'])

        self.model = model
        self.perc_markers_to_keep = perc_markers_to_keep

        rep_tree_files = []
        if not jk_dir:
            # compute jackknife replicates from scratch
            self.replicate_dir = os.path.join(output_dir, 'replicates')
            make_sure_path_exists(self.replicate_dir)

            # determine length of each marker gene in the alignment
            marker_lengths = []
            total_len = 0
            with open(marker_info_file) as f:
                f.readline()  # skip header
                for line in f:
                    line_split = line.split('\t')
                    ml = int(line_split[3])
                    marker_lengths.append(ml)
                    total_len += ml

            self.logger.info('Concatenated length of markers: %d' % total_len)

            # read mask and determine post-masking length of each marker;
            # a '0' in the mask marks a filtered column
            with open(mask_file) as f:
                mask = f.readline().strip()

            start = 0
            self.marker_lengths = []
            total_mask_len = 0
            for ml in marker_lengths:
                end = start + ml
                zeros = mask[start:end].count('0')
                start = end

                self.marker_lengths.append(ml - zeros)
                total_mask_len += ml - zeros

            self.logger.info('Concatenated length of filtered MSA: %d' % total_mask_len)

            # read full multiple sequence alignment
            self.msa = seq_io.read(msa_file)

            # sanity check: aligned sequences must have the masked length
            # (next(iter(...)) works on both py2 lists and py3 dict views)
            if len(next(iter(self.msa.values()))) != total_mask_len:
                self.logger.error('Length of MSA does not match length of mask.')
                sys.exit(1)  # non-zero status: this is an error, not a clean exit

            # calculate replicates in parallel
            self.logger.info('Calculating jackknife marker replicates:')
            parallel = Parallel(self.cpus)
            parallel.run(self._producer, None, range(num_replicates), self._progress)

            # collect per-replicate trees written by the producer
            self.logger.info('Calculating support for %d replicates.' % num_replicates)
            for rep_index in range(num_replicates):
                rep_tree_files.append(os.path.join(self.replicate_dir, 'jk_markers.tree.' + str(rep_index) + '.tre'))
        else:
            # reuse previously computed replicate trees
            for f in os.listdir(jk_dir):
                if f.endswith('.tree') or f.endswith('.tre'):
                    rep_tree_files.append(os.path.join(jk_dir, f))
            self.logger.info('Read %d jackknife replicates.' % len(rep_tree_files))

        # decorate the input tree with support values from the replicates
        output_tree = os.path.join(output_dir, remove_extension(input_tree) + '.jk_markers.tree')
        bootstrap_support(input_tree, rep_tree_files, output_tree)

        return output_tree