Exemplo n.º 1
0
    def gen_combined_aa_sequences_FASTA(self,
                                        output_file_path,
                                        exclude_partial_gene_calls=False):
        """Write the gene sequences of every genome into one FASTA file.

        Genomes are visited in the order of `self.genome_names`; within a
        genome, gene caller ids are visited in ascending numeric order. Each
        entry is written under the defline `<genome_hash>_<gene_caller_id>`.

        Parameters
        ==========
        output_file_path : str
            Path of the FASTA file to write.
        exclude_partial_gene_calls : bool, False
            When True, gene calls for which `self.is_partial_gene_call`
            returns a truthy value are skipped and counted as excluded.

        Returns
        =======
        (total_num_aa_sequences, total_num_excluded_aa_sequences) : tuple
            Number of sequences written, and number of gene calls skipped.
        """
        self.run.info('Exclude partial gene calls',
                      exclude_partial_gene_calls,
                      nl_after=1)

        total_num_aa_sequences = 0
        total_num_excluded_aa_sequences = 0

        fasta_output = fastalib.FastaOutput(output_file_path)

        # the output handle must be released even if one of the per-gene
        # lookups below raises, hence the try/finally
        try:
            genome_info_dict = self.get_genomes_dict()

            for genome_name in self.genome_names:
                self.progress.new('Storing aa sequences')
                self.progress.update('%s ...' % genome_name)

                # ids are cast to int so that they sort numerically
                gene_caller_ids = sorted(
                    int(gene_caller_id)
                    for gene_caller_id in self.get_gene_caller_ids(genome_name))

                for gene_caller_id in gene_caller_ids:
                    if exclude_partial_gene_calls and \
                            self.is_partial_gene_call(genome_name, gene_caller_id):
                        total_num_excluded_aa_sequences += 1
                        continue

                    aa_sequence = self.get_gene_sequence(genome_name,
                                                         gene_caller_id)

                    fasta_output.write_id(
                        '%s_%d' % (genome_info_dict[genome_name]['genome_hash'],
                                   gene_caller_id))
                    fasta_output.write_seq(aa_sequence, split=False)

                    total_num_aa_sequences += 1

                self.progress.end()
        finally:
            fasta_output.close()

        self.run.info('AA sequences FASTA', output_file_path)
        self.run.info('Num AA sequences reported',
                      '%s' % pp(total_num_aa_sequences),
                      nl_before=1)
        self.run.info('Num excluded gene calls',
                      '%s' % pp(total_num_excluded_aa_sequences))

        return total_num_aa_sequences, total_num_excluded_aa_sequences
Exemplo n.º 2
0
    def export_sequences_table_in_db_into_FASTA_file(
            self, table=t.contig_sequences_table_name, output_file_path=None):
        """Export the `sequence` column of a database table into a FASTA file.

        Every key of the table becomes a defline and the matching `sequence`
        value becomes the sequence body.

        Parameters
        ==========
        table : str
            Name of the table to export. It must exist in the database and
            must have a `sequence` column.
        output_file_path : str, None
            Where to write the FASTA file. When not given, a file named
            `aa_sequences.fa` is created in a fresh temporary directory.

        Returns
        =======
        output_file_path : str or None
            Path of the FASTA file, or None when `self.db_type` is not
            'contigs' (in which case nothing is done).

        Raises
        ======
        ConfigError
            If the table does not exist, has no `sequence` column, or
            contains no entries.
        """
        if self.db_type != 'contigs':
            return None

        if output_file_path:
            filesnpaths.is_output_file_writable(output_file_path)
        else:
            output_file_path = os.path.join(
                filesnpaths.get_temp_directory_path(), 'aa_sequences.fa')

        database = db.DB(self.db_path, self.version)

        # disconnect even when one of the sanity checks below raises
        try:
            if table not in database.get_table_names():
                raise ConfigError(
                    'Trying to export sequences into a FASTA file, but the table\
                                "%s" does not seem to be in this database :/' %
                    (table))

            if 'sequence' not in database.get_table_structure(table):
                raise ConfigError(
                    "You requested to store sequences in table '%s' into a FASTA\
                                file, however this table does not seem to be a table that\
                                stores sequence information :(" % table)

            sequences_table = database.get_table_as_dict(table)
        finally:
            database.disconnect()

        # NOTE: this used to test `len([sequences_table])`, which is always 1,
        # so an empty table was never reported
        if not sequences_table:
            raise ConfigError(
                "There are no sequences to report in table '%s'." % (table))

        self.progress.new('Exporting %d sequences into a FASTA file' %
                          len(sequences_table))
        self.progress.update('...')

        sequences_fasta = u.FastaOutput(output_file_path)

        # close the output so everything is on disk before the path is
        # handed back to the caller
        try:
            for seq_id, entry in sequences_table.items():
                sequences_fasta.write_id(seq_id)
                sequences_fasta.write_seq(entry['sequence'], split=False)
        finally:
            sequences_fasta.close()

        self.progress.end()
        self.run.info('Sequences',
                      '%d sequences reported.' % (len(sequences_table)))
        self.run.info('FASTA', output_file_path)

        return output_file_path
Exemplo n.º 3
0
    def generate(self):
        """Simulate short reads from the input FASTA files into one FASTA file.

        For every file in `self.fasta_files`, each sequence of length L yields
        int(L / short_read_length * coverage) reads. Every read starts at a
        uniformly random position, is passed through `simulate_errors` with
        `self.error_rate`, and is written to `self.output_file` with a defline
        that records the source sequence and the start/stop coordinates.

        Sequences shorter than `self.short_read_length` are reported as a
        single shorter read covering the entire sequence (start 0, stop L)
        instead of failing.
        """
        output = fastalib.FastaOutput(self.output_file)
        x = self.short_read_length

        # make sure the output handle is released if anything below fails
        try:
            for index_fasta, fasta_file in enumerate(self.fasta_files):
                f = self.fasta_files_dict[fasta_file]
                c = f['coverage']

                self.progress.new(
                    'Working on file %d of %d (%s) with expected coverage of %d' %
                    (index_fasta + 1, len(self.fasta_files), f['alias'], c))

                fasta = fastalib.SequenceSource(f['path'])
                total_num_errors = 0
                total_num_reads = 0
                while next(fasta):
                    L = len(fasta.seq)

                    av_num_short_reads_needed = int(L / x * c)
                    total_num_reads += av_num_short_reads_needed

                    # last position a full-length read can start from; clamped
                    # at 0 so that `randint` does not get an empty range when
                    # the sequence is shorter than the read length
                    max_start_pos = max(0, L - x)

                    for index_short_read in range(av_num_short_reads_needed):
                        if (index_short_read + 1) % 100 == 0:
                            self.progress.update(
                                'Entry %s :: %s nts :: reads %s of %s :: num errors: %s ...'
                                % (pp(fasta.pos + 1), pp(L),
                                   pp(index_short_read + 1),
                                   pp(av_num_short_reads_needed),
                                   pp(total_num_errors)))

                        start_pos = random.randint(0, max_start_pos)
                        stop_pos = min(L, start_pos + x)

                        short_read, num_errors = simulate_errors(
                            self.error_rate, fasta.seq[start_pos:stop_pos])
                        total_num_errors += num_errors

                        output.write_id('%s_%d|source:%s|start:%d|stop:%d' %
                                        (f['alias'], index_short_read, fasta.id,
                                         start_pos, stop_pos))
                        output.write_seq(short_read)

                self.progress.end()

                # no reads at all (empty file, or zero coverage) would
                # otherwise be a division by zero
                total_num_nts = total_num_reads * x
                avg_error_rate = total_num_errors * 1.0 / total_num_nts if total_num_nts else 0.0

                self.run.info(
                    '%s w/ %d contigs' % (f['alias'], fasta.pos),
                    '%s reads with %s errors (avg %.4f) for %sX avg cov.'
                    % (pp(total_num_reads),
                       pp(total_num_errors),
                       avg_error_rate,
                       pp(c)))
        finally:
            output.close()

        self.run.info('Fasta output', self.output_file)
Exemplo n.º 4
0
    def export_contigs_in_db_into_FASTA_file(self):
        """Export all contig sequences of a contigs database into a FASTA file.

        The file is named `contigs.fa` and is created in a fresh temporary
        directory. Contig names are used as deflines.

        Returns
        =======
        contigs_fasta_path : str or None
            Path of the FASTA file, or None when `self.db_type` is not
            'contigs' (in which case nothing is done).
        """
        if self.db_type != 'contigs':
            return None

        database = db.DB(self.db_path, self.version)
        try:
            contig_sequences_table = database.get_table_as_dict(t.contig_sequences_table_name)
        finally:
            database.disconnect()

        self.progress.new('Exporting contigs into a FASTA file')
        self.progress.update('...')
        contigs_fasta_path = os.path.join(filesnpaths.get_temp_directory_path(), 'contigs.fa')
        contigs_fasta = u.FastaOutput(contigs_fasta_path)

        # close the output so everything is on disk before the path is
        # handed back to the caller
        try:
            for contig_name, entry in contig_sequences_table.items():
                contigs_fasta.write_id(contig_name)
                contigs_fasta.write_seq(entry['sequence'], split=False)
        finally:
            contigs_fasta.close()

        self.progress.end()
        self.run.info('FASTA for contigs', contigs_fasta_path)

        return contigs_fasta_path
Exemplo n.º 5
0
    def export_sequences_table_in_db_into_FASTA_file(
            self, table=t.contig_sequences_table_name, output_file_path=None):
        """Export the `sequence` column of a database table into a FASTA file.

        Every key of the table becomes a defline and the matching `sequence`
        value becomes the sequence body. Entries with a blank sequence are
        not written; they are counted and reported through a warning.

        Parameters
        ==========
        table : str
            Name of the table to export. It must exist in the database and
            must have a `sequence` column.
        output_file_path : str, None
            Where to write the FASTA file. When not given, a file named
            `aa_sequences.fa` is created in a fresh temporary directory.

        Returns
        =======
        output_file_path : str or None
            Path of the FASTA file, or None when `self.db_type` is not
            'contigs' (in which case nothing is done).

        Raises
        ======
        ConfigError
            If the table does not exist, has no `sequence` column, or
            contains no entries.
        """
        if self.db_type != 'contigs':
            return None

        if output_file_path:
            filesnpaths.is_output_file_writable(output_file_path)
        else:
            output_file_path = os.path.join(
                filesnpaths.get_temp_directory_path(), 'aa_sequences.fa')

        database = db.DB(self.db_path, self.version)

        # disconnect even when one of the sanity checks below raises
        try:
            if table not in database.get_table_names():
                raise ConfigError(
                    'Trying to export sequences into a FASTA file, but the table\
                                "%s" does not seem to be in this database :/' %
                    (table))

            if 'sequence' not in database.get_table_structure(table):
                raise ConfigError(
                    "You requested to store sequences in table '%s' into a FASTA\
                                file, however this table does not seem to be a table that\
                                stores sequence information :(" % table)

            sequences_table = database.get_table_as_dict(table)
        finally:
            database.disconnect()

        # NOTE: this used to test `len([sequences_table])`, which is always 1,
        # so an empty table was never reported
        if not sequences_table:
            raise ConfigError(
                "There are no sequences to report in table '%s'." % (table))

        self.progress.new('Exporting %d sequences into a FASTA file' %
                          len(sequences_table))
        self.progress.update('...')

        sequences_fasta = u.FastaOutput(output_file_path)

        seq_ids_not_reported = set([])

        # close the output so everything is on disk before the path is
        # handed back to the caller
        try:
            for seq_id, entry in sequences_table.items():
                if entry['sequence']:
                    sequences_fasta.write_id(seq_id)
                    sequences_fasta.write_seq(entry['sequence'], split=False)
                else:
                    seq_ids_not_reported.add(seq_id)
        finally:
            sequences_fasta.close()

        self.progress.end()

        if len(seq_ids_not_reported):
            self.run.warning(
                "%d entries in the sequences table had blank sequences :/ This is related to the issue\
                             at https://github.com/merenlab/anvio/issues/565. If this is like mid-2018 and you still\
                             get this warning, please find an anvi'o developer and make them feel embarrassed. If it\
                             is earlier than take this as a simple warning that some gene calls in your downstream\
                             analyses may have no sequences, and that's OK. This is a very minor issue due to on-the-fly\
                             addition of Ribosomal RNA gene calls to the contigs database, and will likely will not\
                             affect anything major. This warning will go away when anvi'o can seamlessly work with\
                             multiple gene callers (which we are looking forward to implement in the future)."
                % len(seq_ids_not_reported))

        self.run.info(
            'Sequences', '%d sequences reported.' %
            (len(sequences_table) - len(seq_ids_not_reported)))
        self.run.info('FASTA', output_file_path)

        return output_file_path
Exemplo n.º 6
0
    def export_sequences_table_in_db_into_FASTA_file(
        self,
        table=t.contig_sequences_table_name,
        output_file_path=None,
        item_names=set([])):
        '''Exports a sequence table from the contigs database.

            - t.contig_sequences_table_name: contig sequences (where item_names are contig names)
            - t.gene_amino_acid_sequences_table_name: amino acid sequences for gene calls (item_names are gene caller ids)


          If `item_names` are specified, only those sequences with matching ids to something in this set will be reported.

          Entries with a blank sequence are not written; they are counted and
          reported through a warning.

          Parameters
          ==========
          table : str
              Name of the table to export. It must exist in the database and
              must have a `sequence` column.
          output_file_path : str, None
              Where to write the FASTA file. When not given, a file named
              `aa_sequences.fa` is created in a fresh temporary directory.
          item_names : set
              Ids to report. An empty set (the default) means every entry.
              The set is only read, never modified.

          Returns
          =======
          output_file_path : str or None
              Path of the FASTA file, or None when `self.db_type` is not
              'contigs' (in which case nothing is done).

          Raises
          ======
          ConfigError
              If `item_names` is not a set, if the table does not exist or has
              no `sequence` column, or if there is nothing left to report.
          '''

        if self.db_type != 'contigs':
            return None

        if not isinstance(item_names, set):
            raise ConfigError("`item_names` must be of type `set`")

        if output_file_path:
            filesnpaths.is_output_file_writable(output_file_path)
        else:
            output_file_path = os.path.join(
                filesnpaths.get_temp_directory_path(), 'aa_sequences.fa')

        database = db.DB(self.db_path, self.version)

        # disconnect even when one of the sanity checks below raises
        try:
            if table not in database.get_table_names():
                raise ConfigError(
                    'Trying to export sequences into a FASTA file, but the table\
                                "%s" does not seem to be in this database :/' %
                    (table))

            if 'sequence' not in database.get_table_structure(table):
                raise ConfigError(
                    "You requested to store sequences in table '%s' into a FASTA\
                                file, however this table does not seem to be a table that\
                                stores sequence information :(" % table)

            sequences_table = database.get_table_as_dict(table)
        finally:
            database.disconnect()

        if len(item_names):
            total_num_items_in_db = len(sequences_table)

            # drop everything the user did not ask for
            for item_name in set(sequences_table).difference(item_names):
                sequences_table.pop(item_name)

            # who does this for their users:
            num_items_to_be_reported = len(sequences_table)
            if num_items_to_be_reported != len(item_names):
                optional_info = "It turned out %d of the item ids you requested was actually in the database." \
                                        % num_items_to_be_reported
            else:
                optional_info = ''

            if not self.quiet:
                self.run.info_single("You asked anvi'o to report only %d items from a database that contained %d. %s" \
                                            % (len(item_names), total_num_items_in_db, optional_info))

        # NOTE: this used to test `len([sequences_table])`, which is always 1,
        # so an empty table (or a filter that matched nothing) was never reported
        if not sequences_table:
            raise ConfigError(
                "There are no sequences to report in table '%s'." % (table))

        self.progress.new('Exporting %d sequences into a FASTA file' %
                          len(sequences_table))
        self.progress.update('...')

        sequences_fasta = u.FastaOutput(output_file_path)

        blank_seq_ids_not_reported = set([])

        # close the output so everything is on disk before the path is
        # handed back to the caller
        try:
            for seq_id, entry in sequences_table.items():
                if entry['sequence']:
                    sequences_fasta.write_id(seq_id)
                    sequences_fasta.write_seq(entry['sequence'], split=False)
                else:
                    blank_seq_ids_not_reported.add(seq_id)
        finally:
            sequences_fasta.close()

        self.progress.end()

        if len(blank_seq_ids_not_reported):
            self.run.warning(
                "%d entries in the sequences table had blank sequences :/ This is related to the issue\
                             at https://github.com/merenlab/anvio/issues/565. If this is like mid-2018 and you still\
                             get this warning, please find an anvi'o developer and make them feel embarrassed. If it\
                             is earlier than take this as a simple warning that some gene calls in your downstream\
                             analyses may have no sequences, and that's OK. This is a very minor issue due to on-the-fly\
                             addition of Ribosomal RNA gene calls to the contigs database, and will likely will not\
                             affect anything major. This warning will go away when anvi'o can seamlessly work with\
                             multiple gene callers (which we are looking forward to implement in the future)."
                % len(blank_seq_ids_not_reported))

        self.run.info(
            'Sequences', '%d sequences reported.' %
            (len(sequences_table) - len(blank_seq_ids_not_reported)))
        self.run.info('FASTA', output_file_path)

        return output_file_path
Exemplo n.º 7
0
    def check_database(self):
        """Setup the database files

        Downloads the .pir file if it is missing
        Binarizes the .pir file if either the .bin or the .pir was missing
        Creates the .dmnd file if it is missing

        All files live in `self.database_dir` and are named after
        `self.modeller_database`. The intermediate FASTA file generated for
        the diamond database is removed once the database is built.
        """

        bin_db_path = J(self.database_dir, self.modeller_database + ".bin")
        pir_db_path = J(self.database_dir, self.modeller_database + ".pir")
        bin_exists = utils.filesnpaths.is_file_exists(bin_db_path,
                                                      dont_raise=True)
        pir_exists = utils.filesnpaths.is_file_exists(pir_db_path,
                                                      dont_raise=True)

        if not (bin_exists and pir_exists):
            if not pir_exists:
                # Download .pir
                self.run.warning(
                    "Anvi'o looked in {} for a database with the name {} and with an extension "
                    "of either .bin or .pir, but didn't find anything matching that "
                    "criteria. Anvi'o will try and download the best database it knows of from "
                    "https://salilab.org/modeller/downloads/pdb_95.pir.gz and use that. "
                    "You can checkout https://salilab.org/modeller/ for more info about the pdb_95 "
                    "database".format(self.database_dir,
                                      self.modeller_database))

                # NOTE(review): the download is decompressed next to itself
                # (presumably as `pdb_95.pir`), while the binarization below
                # reads `<modeller_database>.pir` -- these only agree when
                # `self.modeller_database` is 'pdb_95'; confirm with callers
                db_download_path = os.path.join(self.database_dir,
                                                "pdb_95.pir.gz")
                utils.download_file(
                    "https://salilab.org/modeller/downloads/pdb_95.pir.gz",
                    db_download_path)
                utils.run_command(
                    ['gzip', '-d', db_download_path],
                    log_file_path=filesnpaths.get_temp_file_path())

            # Binarize .pir (make .bin)
            self.run.warning(
                "Your database is not in binary format. That means accessing its contents is slower "
                "than it could be. Anvi'o is going to make a binary format. Just FYI"
            )
            self.run_binarize_database(pir_db_path, bin_db_path)

        dmnd_db_path = J(self.database_dir, self.modeller_database + '.dmnd')

        if os.path.exists(dmnd_db_path):
            return

        self.run.warning(
            "Your diamond database does not exist. It will be created.")

        script_name = "pir_to_fasta.py"

        self.copy_script_to_directory(script_name)

        fasta_path = J(self.database_dir, self.modeller_database + '.fa')
        dmnd_path = J(self.database_dir, self.modeller_database)

        # convert the .pir into a FASTA file
        self.run_command([self.executable, script_name, pir_db_path, fasta_path],
                         script_name=script_name,
                         rename_log=False)

        # rewrite the FASTA into a temporary file, dropping '-' characters
        # and turning '.' characters into 'X'
        temp = u.FastaOutput(filesnpaths.get_temp_file_path())
        fasta = u.SequenceSource(fasta_path)

        try:
            while next(fasta):
                temp.write_id(fasta.id)
                temp.write_seq(fasta.seq.replace('-', '').replace('.', 'X'))
        finally:
            # both handles must be closed BEFORE the move: when the temporary
            # file lives on another filesystem the move is a copy, and data
            # still sitting in the write buffer would not make it
            fasta.close()
            temp.close()

        shutil.move(temp.output_file_path, fasta_path)

        driver = diamond.Diamond(
            query_fasta=fasta_path,
            run=terminal.Run(verbose=False),
            progress=terminal.Progress(verbose=False),
        )
        driver.makedb(output_file_path=dmnd_path)

        os.remove(fasta_path)