Example #1
    def sanity_check(self):
        self.output_directory = filesnpaths.check_output_directory(self.output_directory, ok_if_exists=False)

        if not self.contigs_db_path:
            raise ConfigError("You must provide a contigs database for this operation.")

        if not self.profile_db_path:
            raise ConfigError("No profile db no cookie. Bye.")

        utils.is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path)

        profile_db = dbops.ProfileDatabase(self.profile_db_path)
        if profile_db.meta['blank']:
            raise ConfigError("The anvi-split workflow is not prepared to deal with blank profiles :/ Sorry!")

        if profile_db.meta['db_type'] != 'profile':
            raise ConfigError("Anvi'o was trying to split this profile, but it just realized that it is not a profile\
                               database. There is something wrong here.")
        profile_db.disconnect()

        self.summary = summarizer.ProfileSummarizer(self.args)
        self.summary.init()

        self.bin_names_of_interest = sorted(self.summary.bin_ids)
        if self.bin_name:
            if self.bin_name not in self.bin_names_of_interest:
                raise ConfigError("The bin name you wish to split from this profile databse is not in the collection. Busted!")
            else:
                self.bin_names_of_interest = [self.bin_name]
Example #2
    def populate_profile_dbs_info_dict(self):
        improper = []

        for p in self.input_profile_db_paths:
            dbops.is_profile_db(p)

            profile_db = dbops.ProfileDatabase(p)

            if profile_db.meta['db_type'] != 'profile' or profile_db.meta['blank'] or profile_db.meta['merged']:
                improper.append(p)
            else:
                self.profile_dbs_info_dict[p] = profile_db.meta

        proper = [p for p in self.input_profile_db_paths if p not in improper]

        if not proper:
            raise ConfigError(
                "None of the databases you asked anvi'o to merge were single, non-blank anvi'o profiles. If you\
                               are not testing anvi'o and yet found yourself here, it is safe to assume that something somewhere\
                               in your workflow is quite wrong :/")

        if len(proper) < 2:
            raise ConfigError(
                "Anvi'o can only merge single, non-blank anvi'o profiles. You have only one database that fits into that\
                               criterion. So there is nothing really to merge here. Yes?"
            )

        if improper:
            self.run.warning("Pleae read carefuly. You sent %d profile databases to anvi'o merger to be merged. However, not\
                              all of them were single, non-blank anvi'o profiles. Anvi'o removed %d of them, and will merge\
                              only the remaining %d. At the end of this warning you will find a list of paths to those databases\
                              anvi'o excluded from merging. If you are not happy with that, please carefully examine what went wrong.\
                              Here are all the paths for excluded databases: %s." \
                                            % (len(self.input_profile_db_paths), len(improper), len(proper), ', '.join(["'%s'" % p for p in improper])))
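A side note on the filtering above: the loop classifies databases one by one, and `proper` is then rebuilt with a second pass over the inputs. A single-pass partition keeps the two lists in sync by construction. A minimal sketch in plain Python, assuming the same `profile_db.meta` flags (`load_meta` is a hypothetical stand-in for opening a database and reading its meta dict):

    def partition_profile_dbs(paths, load_meta):
        """Split paths into (proper, improper) single, non-blank, unmerged profiles."""
        proper, improper = [], []
        for p in paths:
            meta = load_meta(p)
            is_single_profile = (meta['db_type'] == 'profile'
                                 and not meta['blank']
                                 and not meta['merged'])
            (proper if is_single_profile else improper).append(p)
        return proper, improper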
Example #3
    def sanity_check(self):
        self.output_directory = filesnpaths.check_output_directory(
            self.output_directory, ok_if_exists=False)

        if not self.contigs_db_path:
            raise ConfigError(
                "You must provide a contigs database for this operation.")

        if not self.profile_db_path:
            raise ConfigError("No profile db no cookie. Bye.")

        dbops.is_profile_db_and_contigs_db_compatible(self.profile_db_path,
                                                      self.contigs_db_path)

        profile_db = dbops.ProfileDatabase(self.profile_db_path)
        if profile_db.meta['db_type'] != 'profile' or profile_db.meta['blank'] or not profile_db.meta['merged']:
            raise ConfigError("You can only split merged profiles :/ We hope this is not a moment of a terrible disappointment.\
                               If it is, you should consider writing to us.")

        self.summary = summarizer.ProfileSummarizer(self.args)
        self.summary.init()

        self.bin_names_of_interest = sorted(self.summary.bin_ids)
        if self.bin_name:
            if self.bin_name not in self.bin_names_of_interest:
                raise ConfigError(
                    "The bin name you wish to split from this profile database is not in the collection. Busted!"
                )
            else:
                self.bin_names_of_interest = [self.bin_name]
Example #4
    def init_mock_profile(self):
        self.progress.new('Init')
        self.progress.update('...')
        self.num_reads_mapped = 0
        self.progress.end()

        self.contig_names = list(self.contigs_basic_info.keys())
        self.contig_lengths = [self.contigs_basic_info[contig_name]['length'] for contig_name in self.contigs_basic_info]
        self.total_length = sum(self.contig_lengths)
        self.num_contigs = len(self.contig_names)

        utils.check_contig_names(self.contig_names)

        self.run.info('input_bam', None)
        self.run.info('output_dir', self.output_directory, display_only=True)
        self.run.info('total_reads_mapped', pp(int(self.num_reads_mapped)))
        self.run.info('num_contigs', pp(self.num_contigs))

        # check for the -M parameter.
        self.remove_contigs_that_are_shorter_than_min_contig_length()

        self.run.info('num_contigs_after_M', self.num_contigs, display_only=True)
        self.run.info('num_contigs', self.num_contigs, quiet=True)
        self.run.info('num_splits', self.num_splits)
        self.run.info('total_length', self.total_length)

        profile_db = dbops.ProfileDatabase(self.profile_db_path, quiet=True)
        profile_db.db.set_meta_value('num_splits', self.num_splits)
        profile_db.db.set_meta_value('num_contigs', self.num_contigs)
        profile_db.db.set_meta_value('total_length', self.total_length)
        profile_db.disconnect()

        self.layer_additional_data['total_reads_mapped'] = self.num_reads_mapped
        self.layer_additional_keys.append('total_reads_mapped')
Example #5
    def merge_gene_coverages_tables(self):
        self.is_all_samples_have_it('gene_coverages_table')

        # create an instance from genes
        gene_coverages_table = dbops.TableForGeneCoverages(
            self.profile_db_path,
            anvio.__profile__version__,
            progress=self.progress)

        # fill "genes" instance from all samples
        for runinfo in self.input_runinfo_dicts.values():
            sample_id = runinfo['sample_id']

            sample_profile_db = dbops.ProfileDatabase(runinfo['profile_db'],
                                                      quiet=True)
            sample_gene_profiles = sample_profile_db.db.get_table_as_dict(
                tables.gene_coverages_table_name,
                tables.gene_coverages_table_structure)
            for g in sample_gene_profiles.values():
                gene_coverages_table.add_gene_entry(
                    g['prot'], g['sample_id'], g['mean_coverage'] *
                    self.normalization_multiplier[sample_id])
            sample_profile_db.disconnect()

        gene_coverages_table.store()
Example #6
    def __init__(self, args):
        self.progress = terminal.Progress()
        self.run = terminal.Run()
        self.args = args
        self.args.mode = 'refine'

        self.bins = set([])

        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.bin_ids_file_path = A('bin_ids_file')
        self.bin_id = A('bin_id')
        self.collection_name = A('collection_name')
        self.contigs_db_path = A('contigs_db')
        self.profile_db_path = A('profile_db')
        self.debug = A('debug')

        dbops.is_contigs_db(self.contigs_db_path)
        dbops.is_profile_db(self.profile_db_path)

        self.database_paths = {'CONTIGS.db': self.contigs_db_path,
                               'PROFILE.db': self.profile_db_path}
        self.is_merged = None
        self.split_names_of_interest = set([])

        profile_db = dbops.ProfileDatabase(self.profile_db_path)
        self.is_merged = int(profile_db.meta['merged'])
        profile_db.disconnect()

        self.clustering_configs = constants.clustering_configs['merged' if self.is_merged else 'single']
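A note on the `A = lambda x: args.__dict__[x] if x in args.__dict__ else None` idiom above, which recurs throughout these examples: for an argparse namespace it is effectively `getattr` with a default, which reads more directly. A minimal equivalent sketch:

    # effectively equivalent accessor for optional argparse attributes
    A = lambda x: getattr(args, x, None)

    bin_id = A('bin_id')  # None when --bin-id was not supplied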
Example #7
    def init_dirs_and_dbs(self):
        if not self.contigs_db_path:
            raise ConfigError("You can not run profiling without a contigs database. You can create\
                                one using 'anvi-gen-contigs-database'. Not sure how? Please see the\
                                tutorial: http://merenlab.org/2015/05/02/anvio-tutorial/")

        if self.description_file_path:
            filesnpaths.is_file_plain_text(self.description_file_path)
            self.description = open(os.path.abspath(self.description_file_path), 'r').read()

        self.output_directory = filesnpaths.check_output_directory(self.output_directory or self.input_file_path + '-ANVIO_PROFILE',\
                                                                   ok_if_exists=self.overwrite_output_destinations)

        self.progress.new('Initializing')

        self.progress.update('Creating the output directory ...')
        filesnpaths.gen_output_directory(self.output_directory, self.progress, delete_if_exists=self.overwrite_output_destinations)

        self.progress.update('Creating a new single profile database with contigs hash "%s" ...' % self.a_meta['contigs_db_hash'])
        self.profile_db_path = self.generate_output_destination('PROFILE.db')
        profile_db = dbops.ProfileDatabase(self.profile_db_path)

        if self.skip_SNV_profiling:
            self.profile_SCVs = False

        meta_values = {'db_type': 'profile',
                       'anvio': __version__,
                       'sample_id': self.sample_id,
                       'samples': self.sample_id,
                       'merged': False,
                       'blank': self.blank,
                       'items_ordered': False,
                       'default_view': 'single',
                       'min_contig_length': self.min_contig_length,
                       'max_contig_length': self.max_contig_length,
                       'SNVs_profiled': not self.skip_SNV_profiling,
                       'SCVs_profiled': self.profile_SCVs,
                       'min_coverage_for_variability': self.min_coverage_for_variability,
                       'report_variability_full': self.report_variability_full,
                       'contigs_db_hash': self.a_meta['contigs_db_hash'],
                       'description': self.description if self.description else '_No description is provided_'}
        profile_db.create(meta_values)

        self.progress.update('Creating a new auxiliary database with contigs hash "%s" ...' % self.a_meta['contigs_db_hash'])
        self.auxiliary_db_path = self.generate_output_destination('AUXILIARY-DATA.db')
        self.auxiliary_db = auxiliarydataops.AuxiliaryDataForSplitCoverages(self.auxiliary_db_path,
                                                                            self.a_meta['contigs_db_hash'],
                                                                            create_new=True,
                                                                            run=null_run,
                                                                            progress=null_progress)

        self.progress.end()

        if self.skip_SNV_profiling:
            self.run.warning('Single-nucleotide variation will not be characterized for this profile.')

        if not self.profile_SCVs:
            self.run.warning('Amino acid linkmer frequencies will not be characterized for this profile.')
Example #8
    def set_sample_names(self):
        """Go through all profile databases involved, and learn all sample names"""

        self.sample_names = []

        for profile_db_path in set([g['profile_db_path'] for g in list(self.descriptions.internal_genomes_dict.values())]):
            self.sample_names.extend(sorted(list(dbops.ProfileDatabase(profile_db_path).samples)))

        self.run.info("Samples found", "%d (%s)" % (len(self.sample_names), ', '.join(self.sample_names)), nl_after=1)
Example #9
    def populate_profile_dbs_info_dict(self):
        self.progress.new('Reading self tables of each single profile db')
        self.progress.update('...')

        for p in self.input_profile_db_paths:
            profile_db = dbops.ProfileDatabase(p)
            self.profile_dbs_info_dict[p] = profile_db.meta

        self.progress.end()
Example #10
    def init_dirs_and_dbs(self):
        if not self.contigs_db_path:
            raise ConfigError, "You can not run profiling without a contigs database. You can create\
                                one using 'anvi-gen-contigs-database'. Not sure how? Please see the\
                                tutorial: http://merenlab.org/2015/05/02/anvio-tutorial/"

        self.output_directory = filesnpaths.check_output_directory(self.output_directory or self.input_file_path + '-ANVIO_PROFILE',\
                                                                   ok_if_exists=self.overwrite_output_destinations)

        self.progress.new('Initializing')

        self.progress.update('Creating the output directory ...')
        filesnpaths.gen_output_directory(
            self.output_directory,
            self.progress,
            delete_if_exists=self.overwrite_output_destinations)

        self.progress.update(
            'Creating a new single profile database with contigs hash "%s" ...'
            % self.a_meta['contigs_db_hash'])
        self.profile_db_path = self.generate_output_destination('PROFILE.db')
        profile_db = dbops.ProfileDatabase(self.profile_db_path)

        if self.skip_SNV_profiling:
            self.profile_AA_frequencies = False

        meta_values = {
            'db_type': 'profile',
            'anvio': __version__,
            'sample_id': self.sample_id,
            'samples': self.sample_id,
            'merged': False,
            'blank': self.blank,
            'contigs_clustered': self.contigs_shall_be_clustered,
            'default_view': 'single',
            'min_contig_length': self.min_contig_length,
            'SNVs_profiled': not self.skip_SNV_profiling,
            'AA_frequencies_profiled': self.profile_AA_frequencies,
            'min_coverage_for_variability': self.min_coverage_for_variability,
            'report_variability_full': self.report_variability_full,
            'contigs_db_hash': self.a_meta['contigs_db_hash'],
            'gene_coverages_computed': self.a_meta['genes_are_called']
        }
        profile_db.create(meta_values)

        self.progress.end()

        if self.skip_SNV_profiling:
            self.run.warning(
                'Single-nucleotide variation will not be characterized for this profile.'
            )

        if not self.profile_AA_frequencies:
            self.run.warning(
                'Amino acid linkmer frequencies will not be characterized for this profile.'
            )
Example #11
    def init_profile_from_BAM(self):
        self.progress.new('Init')
        self.progress.update('Reading BAM File')
        self.bam = bamops.BAMFileObject(self.input_file_path, run=self.run, progress=self.progress).get()
        self.num_reads_mapped = self.bam.mapped
        self.progress.end()

        self.contig_names = self.bam.references
        self.contig_lengths = self.bam.lengths

        utils.check_contig_names(self.contig_names)

        self.run.info('input_bam', self.input_file_path)
        self.run.info('output_dir', self.output_directory, display_only=True)
        self.run.info('total_reads_mapped', pp(int(self.num_reads_mapped)))
        self.run.info('num_contigs', pp(len(self.contig_names)))

        if self.contig_names_of_interest:
            indexes = [self.contig_names.index(r) for r in self.contig_names_of_interest if r in self.contig_names]
            self.contig_names = [self.contig_names[i] for i in indexes]
            self.contig_lengths = [self.contig_lengths[i] for i in indexes]
            self.run.info('num_contigs_selected_for_analysis', pp(len(self.contig_names)))

        # it brings good karma to let the user know what the hell is wrong with their data:
        self.check_contigs_without_any_gene_calls(self.contig_names)

        # check for the -M parameter.
        self.remove_contigs_based_on_min_max_contig_length()

        # let's see whether the user screwed up to follow the simple instructions
        # mentioned here: http://merenlab.org/2015/05/01/anvio-tutorial/#preparation
        for contig_name in self.contig_names:
            if contig_name not in self.contig_names_in_contigs_db:
                raise ConfigError("At least one contig name in your BAM file does not match contig names stored in the\
                                    contigs database. For instance, this is one contig name found in your BAM file: '%s',\
                                    and this is another one found in your contigs database: '%s'. You may be using a\
                                    contigs database for profiling that has nothing to do with the BAM file you are\
                                    trying to profile, or you may have failed to fix your contig names in your FASTA file\
                                    prior to mapping, which is described here: %s"\
                                        % (contig_name, self.contig_names_in_contigs_db.pop(), 'http://goo.gl/Q9ChpS'))

        self.run.info('num_contigs_after_M', self.num_contigs, display_only=True)
        self.run.info('num_contigs', self.num_contigs, quiet=True)
        self.run.info('num_splits', self.num_splits)
        self.run.info('total_length', self.total_length)
        self.run.info('max_coverage_depth', pp(self.max_coverage_depth))

        profile_db = dbops.ProfileDatabase(self.profile_db_path, quiet=True)
        profile_db.db.set_meta_value('num_splits', self.num_splits)
        profile_db.db.set_meta_value('num_contigs', self.num_contigs)
        profile_db.db.set_meta_value('total_length', self.total_length)
        profile_db.disconnect()

        self.layer_additional_data['total_reads_mapped'] = self.num_reads_mapped
        self.layer_additional_keys.append('total_reads_mapped')
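One performance note on the contig subsetting step above: `list.index()` is a linear scan, so selecting m names out of n references costs O(n*m). Building a name-to-position map first makes it a single pass. A minimal sketch under the same assumptions (names and lengths are parallel sequences, and the requested order is preserved):

    def subset_contigs(contig_names, contig_lengths, names_of_interest):
        """Keep only the requested contigs, in the order they were requested."""
        position = {name: i for i, name in enumerate(contig_names)}
        indexes = [position[name] for name in names_of_interest if name in position]
        return ([contig_names[i] for i in indexes],
                [contig_lengths[i] for i in indexes])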
Example #12
    def do_profile_db(self):
        self.progress.update('Subsetting the profile database')

        bin_profile_db_path = os.path.join(self.bin_output_directory,
                                           'PROFILE.db')

        bin_profile_db = dbops.ProfileDatabase(bin_profile_db_path)
        bin_profile_db.touch()

        # copy-paste tables that will largely stay the same from the parent
        bin_profile_db.db.copy_paste(table_name='self',
                                     source_db_path=self.profile_db_path)
        bin_profile_db.db.copy_paste(table_name='views',
                                     source_db_path=self.profile_db_path)
        bin_profile_db.db.copy_paste(table_name='states',
                                     source_db_path=self.profile_db_path)

        # update some values
        bin_profile_db.db.update_meta_value('contigs_db_hash',
                                            self.contigs_db_hash)
        bin_profile_db.db.update_meta_value('available_clusterings', None)

        # setup the filtering rules for migrating data:
        tables = {}

        # this is to deal with merge atomic data tables that are stored in merged profiles.
        # they are being created on the fly during merge, so bin_profile_db.touch() did not
        # create them, and we have to do it here ourselves. while creating them in the target
        # db, we will also populate the tables dictionary for data migration::
        sample_names = self.summary.p_meta['samples']
        for table_name in t.atomic_data_table_structure[1:-1]:
            for target in ['splits', 'contigs']:
                new_table_name = '_'.join([table_name, target])
                new_table_structure = ['contig'] + sample_names + ['__parent__']
                new_table_types = ['text'] + ['numeric'] * len(sample_names) + ['text']
                bin_profile_db.db.create_table(new_table_name,
                                               new_table_structure,
                                               new_table_types)

                tables[new_table_name] = ('contig', self.split_names)

        bin_profile_db.disconnect()

        self.migrate_data(tables, self.profile_db_path, bin_profile_db_path)

        self.progress.end()

        if not self.skip_hierarchical_clustering:
            dbops.do_hierarchical_clusterings(self.split_names, bin_profile_db_path, constants.clustering_configs['merged'], self.database_paths,\
                                              self.bin_output_directory, default_clustering_config=constants.merged_default, \
                                              distance=self.distance, linkage=self.linkage, run=terminal.Run(verbose=False), progress=self.progress)
Example #13
    def merge_variable_aas_tables(self):
        variable_aas_table = TableForAAFrequencies(self.merged_profile_db_path, progress=self.progress)

        for input_profile_db_path in self.profile_dbs_info_dict:
            sample_profile_db = dbops.ProfileDatabase(input_profile_db_path, quiet=True)
            sample_variable_aas_table = sample_profile_db.db.get_table_as_list_of_tuples(tables.variable_aas_table_name, tables.variable_aas_table_structure)
            sample_profile_db.disconnect()

            for tpl in sample_variable_aas_table:
                entry = tuple([variable_aas_table.next_id(tables.variable_aas_table_name)] + list(tpl[1:]))
                variable_aas_table.db_entries.append(entry)

        variable_aas_table.store()
Example #14
    def __init__(self, args, r=run, p=progress):
        self.run = r
        self.progress = p

        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.profile_db_path = A('profile_db')
        self.contigs_db_path = A('contigs_db')
        self.num_clusters_requested = A('num_clusters_requested') or 80

        dbops.is_profile_db_and_contigs_db_compatible(self.profile_db_path,
                                                      self.contigs_db_path)

        self.clusters = {}

        self.lengths = {}
        self.kmers = {}
        self.coverages = {}

        self.debug = args.debug

        self.progress.new('Init')

        self.progress.update('accessing the profile database ...')
        profile_db = dbops.ProfileDatabase(args.profile_db)

        if not int(profile_db.meta['merged']):
            self.progress.end()
            raise ConfigError(
                'CONCOCT can only be used to cluster merged runs...')

        self.coverages = profile_db.db.get_table_as_dict(
            'mean_coverage_contigs', columns_of_interest=profile_db.samples)
        profile_db.disconnect()

        self.progress.update('accessing the contigs database ...')
        contigs_db = dbops.ContigsDatabase(args.contigs_db, quiet=True)
        self.kmers = contigs_db.db.get_table_as_dict(
            'kmer_contigs', keys_of_interest=list(self.coverages.keys()))
        splits_basic_info = contigs_db.db.get_table_as_dict(
            'splits_basic_info', keys_of_interest=list(self.coverages.keys()))
        contigs_db.disconnect()

        self.progress.update('computing split lengths ...')
        for split_name in splits_basic_info:
            self.lengths[split_name] = splits_basic_info[split_name]['length']

        self.progress.end()
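For context, CONCOCT clusters items on a combined feature table: coverage across samples plus sequence composition. A minimal sketch of how the three dicts collected above could be joined into rows (an illustration only, not the actual anvi'o-CONCOCT driver; it assumes all three dicts are keyed by split name):

    def build_feature_rows(coverages, kmers, lengths):
        """Yield (split_name, length, coverage vector, k-mer vector) tuples."""
        for split_name, cov in coverages.items():
            kmer = kmers[split_name]
            yield (split_name,
                   lengths[split_name],
                   [cov[sample] for sample in sorted(cov)],  # per-sample mean coverage
                   [kmer[k] for k in sorted(kmer)])          # k-mer frequencies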
Example #15
    def merge_indels_tables(self):
        indels_table = TableForIndels(self.merged_profile_db_path,
                                      progress=self.progress)

        for input_profile_db_path in self.profile_dbs_info_dict:
            sample_profile_db = dbops.ProfileDatabase(input_profile_db_path,
                                                      quiet=True)
            sample_indels_table = sample_profile_db.db.get_table_as_list_of_tuples(
                tables.indels_table_name, tables.indels_table_structure)
            sample_profile_db.disconnect()

            for tpl in sample_indels_table:
                entry = tuple([indels_table.next_id(tables.indels_table_name)] + list(tpl[1:]))
                indels_table.append_entry(entry)

        indels_table.store()
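Examples #13, #15, and #17 share one merge pattern: read each sample's rows, drop the old leading entry id (`tpl[0]`), and prepend a fresh id issued for the target table. A minimal generic sketch of that re-keying step (`next_id` here is a hypothetical stand-in callable, not the anvi'o API):

    import itertools

    def rekey_rows(rows, next_id):
        """Replace each row's leading id with a fresh one from next_id()."""
        return [tuple([next_id()] + list(row[1:])) for row in rows]

    # usage sketch: a simple counter stands in for the table's id sequence
    counter = itertools.count()
    merged = rekey_rows([(7, 'a', 1.0), (9, 'b', 2.0)], lambda: next(counter))
    # merged == [(0, 'a', 1.0), (1, 'b', 2.0)]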
Example #16
    def sanity_check(self):
        self.output_directory = filesnpaths.check_output_directory(
            self.output_directory, ok_if_exists=True)

        if not self.contigs_db_path:
            raise ConfigError(
                "You must provide a contigs database for this operation.")

        if not self.profile_db_path:
            raise ConfigError("No profile db no cookie. Bye.")

        utils.is_profile_db_and_contigs_db_compatible(self.profile_db_path,
                                                      self.contigs_db_path)

        profile_db = dbops.ProfileDatabase(self.profile_db_path)
        if profile_db.meta['blank']:
            raise ConfigError(
                "The anvi-split workflow is not prepared to deal with blank profiles :/ Sorry!"
            )

        if profile_db.meta['db_type'] != 'profile':
            raise ConfigError(
                "Anvi'o was trying to split this profile, but it just realized that it is not a profile\
                               database. There is something wrong here.")
        profile_db.disconnect()

        # if this is not set false, the summarizer class attempts to remove the main output directory
        # upon initialization. not doing that is useful in this context since this allows multiple
        # anvi-split runs to work on bins in the same collection in parallel:
        self.args.delete_output_directory_if_exists = False

        self.summary = summarizer.ProfileSummarizer(self.args)
        self.summary.init()

        self.bin_names_of_interest = sorted(self.summary.bin_ids)
        if self.bin_name:
            if self.bin_name not in self.bin_names_of_interest:
                raise ConfigError(
                    "The bin name you wish to split from this profile database is not in the collection. Busted!"
                )
            else:
                self.bin_names_of_interest = [self.bin_name]
Example #17
    def merge_variable_aas_tables(self):
        self.is_all_samples_have_it('AA_frequencies_table')

        variable_aas_table = dbops.TableForAAFrequencies(
            self.profile_db_path, progress=self.progress)

        for runinfo in self.input_runinfo_dicts.values():
            sample_profile_db = dbops.ProfileDatabase(runinfo['profile_db'],
                                                      quiet=True)
            sample_variable_aas_table = sample_profile_db.db.get_table_as_list_of_tuples(
                tables.variable_aas_table_name,
                tables.variable_aas_table_structure)
            sample_profile_db.disconnect()

            for tpl in sample_variable_aas_table:
                entry = tuple([variable_aas_table.next_id(tables.variable_aas_table_name)] + list(tpl[1:]))
                variable_aas_table.db_entries.append(entry)

        variable_aas_table.store()
Example #18
    def merge(self):
        self.sanity_check()
        self.set_sample_id()

        filesnpaths.gen_output_directory(
            self.output_directory,
            delete_if_exists=self.overwrite_output_destinations)

        self.run.log_file_path = os.path.join(self.output_directory,
                                              'RUNLOG.txt')

        # set database paths
        self.merged_profile_db_path = os.path.join(self.output_directory,
                                                   'PROFILE.db')
        self.database_paths['PROFILE.db'] = os.path.abspath(
            self.merged_profile_db_path)

        profile_db = dbops.ProfileDatabase(self.merged_profile_db_path)

        C = lambda x: list(self.profile_dbs_info_dict.values())[0][x]
        self.contigs_db_hash = C('contigs_db_hash')
        self.min_contig_length = C('min_contig_length')
        self.max_contig_length = C('max_contig_length')
        self.num_contigs = C('num_contigs')
        self.num_splits = C('num_splits')
        self.min_coverage_for_variability = C('min_coverage_for_variability')
        self.report_variability_full = C('report_variability_full')
        self.SCVs_profiled = C('SCVs_profiled')
        self.SNVs_profiled = C('SNVs_profiled')
        self.total_length = C('total_length')

        if self.num_splits > self.max_num_splits_for_hierarchical_clustering and not self.enforce_hierarchical_clustering:
            self.run.warning("It seems you have more than %s splits in your samples to be merged. This is the\
                              soft limit for anvi'o to attempt to create a hierarchical clustering of your splits\
                              (which becomes the center tree in all anvi'o displays). If you want a hierarchical\
                              clustering to be done anyway, please see the flag `--enforce-hierarchical-clustering`.\
                              But more importantly, please take a look at the anvi'o tutorial to make sure you know\
                              your better options to analyze large metagenomic datasets with anvi'o." \
                                                                % pp(self.max_num_splits_for_hierarchical_clustering))
            self.skip_hierarchical_clustering = True

        if self.num_splits > self.max_num_splits_for_hierarchical_clustering and self.enforce_hierarchical_clustering:
            self.run.warning("Becasue you have used the flag `--enforce-hierarchical-clustering`, anvi'o will attempt\
                              to create a hierarchical clustering of your %s splits. It may take a bit of time..." \
                                                                % pp(self.num_splits))

        self.total_reads_mapped_per_sample = {s: self.layer_additional_data_dict['default'][s]['total_reads_mapped']
                                              for s in self.layer_additional_data_dict['default']}

        sample_ids_list = ', '.join(sorted(self.sample_ids_found_in_input_dbs))
        total_reads_mapped_list = ', '.join([
            str(self.total_reads_mapped_per_sample[sample_id])
            for sample_id in self.sample_ids_found_in_input_dbs
        ])

        # we run this now because we change default flags in this function
        # depending on the number of reads characterized within each single profile.
        self.set_normalization_multiplier()

        meta_values = {
            'db_type': 'profile',
            'anvio': __version__,
            'sample_id': self.sample_id,
            'samples': sample_ids_list,
            'total_reads_mapped': total_reads_mapped_list,
            'merged': True,
            'blank': False,
            'items_ordered': False,
            'default_view': 'mean_coverage',
            'min_contig_length': self.min_contig_length,
            'max_contig_length': self.max_contig_length,
            'SNVs_profiled': self.SNVs_profiled,
            'SCVs_profiled': self.SCVs_profiled,
            'num_contigs': self.num_contigs,
            'num_splits': self.num_splits,
            'total_length': self.total_length,
            'min_coverage_for_variability': self.min_coverage_for_variability,
            'report_variability_full': self.report_variability_full,
            'contigs_db_hash': self.contigs_db_hash,
            'description': self.description if self.description else '_No description is provided_'
        }
        profile_db.create(meta_values)

        # get view data information for both contigs and splits:
        self.atomic_data_fields, self.atomic_data_for_each_run = self.read_atomic_data_tables()

        self.split_parents = self.get_split_parents()

        self.run.info('profiler_version', anvio.__profile__version__)
        self.run.info('output_dir', self.output_directory)
        self.run.info('sample_id', self.sample_id)
        self.run.info('description',
                      'Found (%d characters)' % len(self.description) if self.description else None)
        self.run.info('profile_db', self.merged_profile_db_path)
        self.run.info('merged', True)
        self.run.info('contigs_db_hash', self.contigs_db_hash)
        self.run.info('num_runs_processed',
                      len(self.sample_ids_found_in_input_dbs))
        self.run.info('merged_sample_ids', sample_ids_list)
        self.run.info("Common layer additional data keys",
                      ', '.join(self.layer_additional_data_keys))
        self.run.info('total_reads_mapped', total_reads_mapped_list)
        self.run.info('cmd_line', utils.get_cmd_line())
        self.run.info('clustering_performed',
                      not self.skip_hierarchical_clustering)

        self.merge_split_coverage_data()

        if self.SNVs_profiled:
            self.progress.new('Merging variable positions tables')
            self.progress.update('...')
            self.merge_variable_nts_tables()
            self.progress.end()
        else:
            self.run.warning(
                "SNVs were not profiled, variable nt positions tables will be empty in the merged profile database."
            )

        if self.SCVs_profiled:
            self.progress.new('Merging variable codons tables')
            self.progress.update('...')
            self.merge_variable_codons_tables()
            self.progress.end()
        else:
            self.run.warning(
                "Codon frequencies were not profiled, hence, these tables will be empty in the merged profile database."
            )

        # critical part:
        self.gen_view_data_tables_from_atomic_data()

        # We cluster? Note: the check is being done in the function!
        self.cluster_contigs_anvio()

        self.progress.end()

        self.populate_misc_data_tables()

        self.run.info_single('Happy ☘ ', nl_before=1, nl_after=1)

        self.run.quit()
Example #19
parser.add_argument(*anvio.A('contigs-db'), **anvio.K('contigs-db', {'required': False}))
parser.add_argument(*anvio.A('output-file'), **anvio.K('output-file', {'default': "COLLECTIONS.txt"}))

args = anvio.get_args(parser)

filesnpaths.is_output_file_writable(args.output_file)

contigs = set([])
contig_lengths = {}

db = dbops.ContigsDatabase(args.contigs_db, quiet=False)
contigs_info_table = db.db.get_table_as_dict(t.contigs_info_table_name)
contig_lengths = dict([(c, contigs_info_table[c]['length']) for c in contigs_info_table])
db.disconnect()

db = dbops.ProfileDatabase(args.profile_db, quiet=False)
collections_splits_table = db.db.get_table_as_dict(t.collections_splits_table_name)
collections_info_table = db.db.get_table_as_dict(t.collections_info_table_name)
db.disconnect()

collection_names = list(collections_info_table.keys())

splits = {}
for entry in list(collections_splits_table.values()):
    split = entry['split']
    collection_name = entry['collection_name']
    bin_name = entry['bin_name']

    if split in splits:
        splits[split][collection_name] = bin_name
    else:
        splits[split] = {collection_name: bin_name}
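The if/else bookkeeping above is a classic dict-of-dicts accumulation; `collections.defaultdict` expresses it without the branch. A minimal equivalent sketch over the same table:

    from collections import defaultdict

    splits = defaultdict(dict)
    for entry in collections_splits_table.values():
        # nested key is the collection name; value is the bin the split landed in
        splits[entry['split']][entry['collection_name']] = entry['bin_name']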
Example #20
    def merge(self):
        self.sanity_check()
        self.set_sample_id()

        filesnpaths.gen_output_directory(
            self.output_directory,
            delete_if_exists=self.overwrite_output_destinations)

        # init profile database
        self.profile_db_path = os.path.join(self.output_directory,
                                            'PROFILE.db')

        profile_db = dbops.ProfileDatabase(self.profile_db_path)

        C = lambda x: list(self.input_runinfo_dicts.values())[0][x]
        self.contigs_db_hash = C('contigs_db_hash')
        self.min_contig_length = C('min_contig_length')
        self.num_contigs = C('num_contigs')
        self.num_splits = C('num_splits')
        self.total_reads_mapped = C('total_reads_mapped')
        self.min_coverage_for_variability = C('min_coverage_for_variability')
        self.report_variability_full = C('report_variability_full')
        self.gene_coverages_computed = C('gene_coverages_computed')
        self.AA_frequencies_profiled = C('profile_AA_frequencies')
        self.SNVs_profiled = not C('skip_SNV_profiling')
        self.total_length = C('total_length')

        if self.num_splits > self.max_num_splits_for_hierarchical_clustering and not self.enforce_hierarchical_clustering:
            self.run.warning("It seems you have more than %s splits in your samples to be merged. This is the\
                              soft limit for anvi'o to attempt to create a hierarchical clustering of your splits\
                              (which becomes the center tree in all anvi'o displays). If you want a hierarchical\
                              clustering to be done anyway, please see the flag `--enforce-hierarchical-clustering`.\
                              But more importantly, please take a look at the anvi'o tutorial to make sure you know\
                              your better options to analyze large metagenomic datasets with anvi'o." \
                                                                % pp(self.max_num_splits_for_hierarchical_clustering))
            self.skip_hierarchical_clustering = True

        if self.num_splits > self.max_num_splits_for_hierarchical_clustering and self.enforce_hierarchical_clustering:
            self.run.warning("Becasue you have used the flag `--enforce-hierarchical-clustering`, anvi'o will attempt\
                              to create a hierarchical clustering of your %s splits. It may take a bit of time..." \
                                                                % pp(self.max_num_splits_for_hierarchical_clustering))

        meta_values = {
            'db_type': 'profile',
            'anvio': __version__,
            'sample_id': self.sample_id,
            'samples': ','.join(self.merged_sample_ids),
            'merged': True,
            'blank': False,
            'contigs_clustered': not self.skip_hierarchical_clustering,
            'default_view': 'mean_coverage',
            'min_contig_length': self.min_contig_length,
            'SNVs_profiled': self.SNVs_profiled,
            'AA_frequencies_profiled': self.AA_frequencies_profiled,
            'num_contigs': self.num_contigs,
            'num_splits': self.num_splits,
            'total_length': self.total_length,
            'total_reads_mapped': self.total_reads_mapped,
            'min_coverage_for_variability': self.min_coverage_for_variability,
            'report_variability_full': self.report_variability_full,
            'contigs_db_hash': self.contigs_db_hash,
            'gene_coverages_computed': self.gene_coverages_computed
        }
        profile_db.create(meta_values)

        # get view data information for both contigs and splits:
        self.atomic_data_fields, self.atomic_data_for_each_run = self.read_atomic_data_tables()
        self.split_parents = self.get_split_parents()

        self.run.info('profiler_version', anvio.__profile__version__)
        self.run.info('output_dir', self.output_directory)
        self.run.info('sample_id', self.sample_id)
        self.run.info('profile_db', self.profile_db_path)
        self.run.info('merged', True)
        self.run.info('contigs_db_hash', self.contigs_db_hash)
        self.run.info('merged_sample_ids', self.merged_sample_ids)
        self.run.info('cmd_line', utils.get_cmd_line())
        self.run.info('num_runs_processed', len(self.merged_sample_ids))
        self.run.info('clustering_performed',
                      not self.skip_hierarchical_clustering)

        self.set_normalization_multiplier()

        self.progress.new('Merging gene coverages tables')
        self.merge_gene_coverages_tables()
        self.progress.end()

        self.progress.new('Merging split coverage values')
        self.merge_split_coverage_data()
        self.progress.end()

        if self.SNVs_profiled:
            self.progress.new('Merging variable positions tables')
            self.merge_variable_nts_tables()
            self.progress.end()
        else:
            self.run.warning(
                "SNVs were not profiled, variable nt positions tables will be empty in the merged profile database."
            )

        if self.AA_frequencies_profiled:
            self.progress.new('Merging variable AAs tables')
            self.merge_variable_aas_tables()
            self.progress.end()
        else:
            self.run.warning(
                "AA frequencies were not profiled, these tables will be empty in the merged profile database."
            )

        # critical part:
        self.gen_view_data_tables_from_atomic_data()

        # We cluster? Note: the check is being done in the function!
        self.cluster_contigs_anvio()

        self.progress.end()

        # store everything
        runinfo_serialized = os.path.join(self.output_directory, 'RUNINFO.mcp')
        self.run.info('runinfo', runinfo_serialized)
        self.run.store_info_dict(runinfo_serialized,
                                 strip_prefix=self.output_directory)

        # run CONCOCT, if otherwise is not requested:
        if not self.skip_concoct_binning and __CONCOCT_IS_AVAILABLE__:
            self.bin_contigs_concoct()

        self.run.quit()
Example #21
    def init_commons(self):
        self.progress.new('Init')

        self.progress.update('Checking the output file path ..')
        if self.output_file_path:
            filesnpaths.is_output_file_writable(self.output_file_path)

        self.progress.update('Checking the samples of interest ..')
        if self.samples_of_interest_path:
            filesnpaths.is_file_exists(self.samples_of_interest_path)
            self.samples_of_interest = set([
                s.strip()
                for s in open(self.samples_of_interest_path).readlines()
            ])
        else:
            self.samples_of_interest = set([])

        self.progress.update('Making sure our databases are here ..')
        if not self.profile_db_path:
            raise ConfigError('You need to provide a profile database.')

        if not self.contigs_db_path:
            raise ConfigError('You need to provide a contigs database.')

        self.progress.update('Making sure our databases are compatible ..')
        dbops.is_profile_db_and_contigs_db_compatible(self.profile_db_path,
                                                      self.contigs_db_path)

        if self.min_coverage_in_each_sample and not self.quince_mode:
            self.progress.end()
            raise ConfigError, "When you sepecify a coverage value through --min-coverage-in-each-sample, you must also\
                                use --quince-mode flag, since the former parameter needs to know the coverage values in all\
                                samples even if variation is reported for only one sample among otheres. This is the only way\
                                to figure out whether variation is not reported for other samples due to low or zero coverage,\
                                or there was no variation to report despite the high coverage. Anvi'o could turn --quince-mode\
                                flat automatically for you, but then it is much better if you have full control and understaning\
                                of what is going on."

        if self.quince_mode:
            self.progress.update('Accessing auxiliary data file ...')
            auxiliary_data_file_path = os.path.join(
                os.path.dirname(self.profile_db_path), 'AUXILIARY-DATA.h5')
            if not os.path.exists(auxiliary_data_file_path):
                raise ConfigError, "Anvi'o needs the auxiliary data file to run this program with '--quince-mode' flag.\
                                    However it wasn't found at '%s' :/" % auxiliary_data_file_path
            self.merged_split_coverage_values = auxiliarydataops.AuxiliaryDataForSplitCoverages(
                auxiliary_data_file_path, None, ignore_hash=True)

        self.progress.update(
            'Attempting to get our splits of interest sorted ..')
        if self.collection_name:
            # the user wants to go with the collection id path. fine. we will get our split names from
            # the profile database.
            if not self.bin_id:
                self.progress.end()
                raise ConfigError('When you declare a collection id, you must also declare a bin name\
                                    (from which the split names of interest will be acquired).')

            if self.splits_of_interest or self.splits_of_interest_path:
                self.progress.end()
                raise ConfigError, "You declared a collection id and one or more bin names so anvi'o can find out\
                                    splits of interest, but you also have specified informaiton for split names?\
                                    This is confusing. You should choose one way or another :/"

            self.splits_of_interest = ccollections.GetSplitNamesInBins(
                self.args).get_split_names_only()
        else:
            # OK. no collection id. we will go oldschool. we hope to find what we are looking for in
            # self.splits_of_interest_path at this point (which may have been filled through the command
            # line client), or in self.splits_of_interest (which may have been filled in by another program)
            if not self.splits_of_interest:
                if not self.splits_of_interest_path:
                    self.progress.end()
                    raise ConfigError('You did not declare a source for split names. You either should give me\
                                        a file with split names you are interested in, or a collection id and\
                                        bin name so I can learn split names from the profile database.')

                filesnpaths.is_file_exists(self.splits_of_interest_path)
                self.splits_of_interest = set([
                    c.strip().replace('\r', '')
                    for c in open(self.splits_of_interest_path).readlines()
                ])

        self.input_file_path = os.path.dirname(os.path.abspath(self.profile_db_path))

        self.progress.update('Reading the data ...')
        profile_db = dbops.ProfileDatabase(self.profile_db_path)
        self.sample_ids = profile_db.samples  # we set this now, but we will overwrite it with args.samples_of_interest if necessary

        if not profile_db.meta['SNVs_profiled']:
            self.progress.end()
            raise ConfigError, "Well well well. It seems SNVs were not characterized for this profile database.\
                                Sorry, there is nothing to report here!"

        if self.engine == 'NT':
            self.data = profile_db.db.get_table_as_dict(
                t.variable_nts_table_name)
        elif self.engine == 'AA':
            # AA specific stuff. first check whether things were profiled
            if not profile_db.meta['AA_frequencies_profiled']:
                raise ConfigError, "It seems AA frequencies were not characterized for this profile database.\
                                    There is nothing to report here for AAs!"

            # get the data.
            self.data = profile_db.db.get_table_as_dict(
                t.variable_aas_table_name)

            # append split_name information
            for e in self.data.values():
                e['split_name'] = self.gene_callers_id_to_split_name_dict[
                    e['corresponding_gene_call']]
        else:
            raise ConfigError, "VariabilitySuper :: Anvi'o doesn't know what to do with a engine on '%s' yet :/" % self.engine

        profile_db.disconnect()

        self.progress.end()
Example #22
    def do_profile_db(self):
        # are we working with a merged profile database?
        merged = self.summary.p_meta['merged']
        self.run.info('Merged database', 'True' if merged else 'False')

        self.progress.new('Splitting "%s"' % self.bin_id)
        self.progress.update('Subsetting the %s profile database' %
                             ('merged' if merged else 'single'))

        bin_profile_db = dbops.ProfileDatabase(self.bin_profile_db_path)
        bin_profile_db.touch()

        # copy-paste tables that will largely stay the same from the parent
        bin_profile_db.db.copy_paste(table_name='self',
                                     source_db_path=self.profile_db_path)
        bin_profile_db.db.copy_paste(table_name='views',
                                     source_db_path=self.profile_db_path)
        bin_profile_db.db.copy_paste(table_name='states',
                                     source_db_path=self.profile_db_path)

        # update some values
        bin_profile_db.db.update_meta_value('contigs_db_hash',
                                            self.contigs_db_hash)
        bin_profile_db.db.update_meta_value('available_clusterings', None)
        bin_profile_db.db.update_meta_value('sample_id', self.bin_id)

        # setup the filtering rules for migrating data:
        tables = {}

        # this is to deal with merge atomic data tables that are stored in merged profiles.
        # they are being created on the fly during merge, so bin_profile_db.touch() did not
        # create them, and we have to do it here ourselves. while creating them in the target
        # db, we will also populate the tables dictionary for data migration::
        sample_names = self.summary.p_meta['samples']
        if merged:
            for table_name in t.atomic_data_table_structure[1:-1]:
                for target in ['splits', 'contigs']:
                    new_table_name = '_'.join([table_name, target])
                    new_table_structure = ['contig'] + sample_names + ['__parent__']
                    new_table_types = ['text'] + ['numeric'] * len(sample_names) + ['text']
                    bin_profile_db.db.create_table(new_table_name,
                                                   new_table_structure,
                                                   new_table_types)

                    tables[new_table_name] = ('contig', self.split_names)
        else:
            profile_db = dbops.ProfileDatabase(self.profile_db_path)
            table_structure = profile_db.db.get_table_structure('atomic_data_contigs')
            table_types = profile_db.db.get_table_column_types('atomic_data_contigs')
            profile_db.disconnect()

            for table_name in ['atomic_data_splits', 'atomic_data_contigs']:
                bin_profile_db.db.create_table(table_name, table_structure, table_types)

                tables[table_name] = ('contig', self.split_names)

        # we need to migrate these guys, too. unless we don't need to... if we are migrating,
        # the values in the self table are already accurate. if we are skipping, regardless
        # of what the values were, we will set the absolute correct ones.
        if self.skip_variability_tables:
            bin_profile_db.db.update_meta_value('SNVs_profiled', False)
            bin_profile_db.db.update_meta_value('SCVs_profiled', False)
        else:
            tables[t.variable_nts_table_name] = ('split_name',
                                                 self.split_names)
            tables[t.variable_codons_table_name] = ('corresponding_gene_call',
                                                    self.gene_caller_ids)

        bin_profile_db.disconnect()

        self.migrate_data(tables, self.profile_db_path,
                          self.bin_profile_db_path)

        self.progress.end()

        if not self.skip_hierarchical_clustering:
            dbops.do_hierarchical_clustering_of_items(self.bin_profile_db_path, constants.clustering_configs['merged' if merged else 'single'], self.split_names, \
                                                      self.database_paths, input_directory=self.bin_output_directory, \
                                                      default_clustering_config=constants.merged_default, distance=self.distance, \
                                                      linkage=self.linkage, run=terminal.Run(verbose=False), progress=self.progress)

        # add a collection
        collection_dict = {'ALL_SPLITS': self.split_names}
        bins_info_dict = {'ALL_SPLITS': {'html_color': '#FF0000', 'source': 'anvi-split'}}
        collections = TablesForCollections(self.bin_profile_db_path)
        collections.append('DEFAULT', collection_dict, bins_info_dict=bins_info_dict)
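`migrate_data()` itself is not shown in these examples, but the `tables` dict built above maps each table name to a filter column and the set of values to keep. A minimal sketch of what such a filtered row copy could look like over plain sqlite3 (an illustration of the idea only, not the actual anvi'o implementation):

    import sqlite3

    def migrate_data(tables, source_db_path, target_db_path):
        """Copy only the rows whose filter-column value is in the allowed set."""
        src = sqlite3.connect(source_db_path)
        dst = sqlite3.connect(target_db_path)
        for table_name, (column, allowed) in tables.items():
            columns = [row[1] for row in src.execute('PRAGMA table_info(%s)' % table_name)]
            keep = columns.index(column)
            placeholders = ', '.join('?' * len(columns))
            rows = (r for r in src.execute('SELECT * FROM %s' % table_name) if r[keep] in allowed)
            dst.executemany('INSERT INTO %s VALUES (%s)' % (table_name, placeholders), rows)
        dst.commit()
        src.close()
        dst.close()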
Example #23
    def _run(self):
        self.check_args()

        self.set_sample_id()

        self.init_dirs_and_dbs()

        self.run.info('anvio', anvio.__version__)
        self.run.info('profiler_version', anvio.__profile__version__)
        self.run.info('sample_id', self.sample_id)
        self.run.info('profile_db', self.profile_db_path, display_only=True)
        self.run.info('contigs_db', True if self.contigs_db_path else False)
        self.run.info('contigs_db_hash', self.a_meta['contigs_db_hash'])
        self.run.info('cmd_line', utils.get_cmd_line())
        self.run.info('merged', False)
        self.run.info('split_length', self.a_meta['split_length'])
        self.run.info('min_contig_length', self.min_contig_length)
        self.run.info('min_mean_coverage', self.min_mean_coverage)
        self.run.info('clustering_performed', self.contigs_shall_be_clustered)
        self.run.info('min_coverage_for_variability',
                      self.min_coverage_for_variability)
        self.run.info('skip_SNV_profiling', self.skip_SNV_profiling)
        self.run.info('skip_AA_frequencies', self.skip_AA_frequencies)
        self.run.info('report_variability_full', self.report_variability_full)
        self.run.info('gene_coverages_computed',
                      self.a_meta['genes_are_called'])

        # this is kinda important. we do not run full-blown profile function if we are dealing with a summarized
        # profile...
        if self.input_file_path:
            self.init_profile_from_BAM()
            self.profile()
            if self.gen_serialized_profile:
                self.store_profile()
        else:
            self.init_serialized_profile()

        self.generate_variabile_positions_table()
        self.profile_AA_frequencies()
        self.generate_gene_coverages_table()
        self.store_split_coverages()

        # here we store atomic data for contigs and splits into the database:
        profile_db = dbops.ProfileDatabase(self.profile_db_path, quiet=True)
        self.atomic_contig_split_data.store_atomic_data_for_contigs_and_splits(
            self.sample_id, self.contigs, profile_db.db)
        profile_db.disconnect()

        # the only view for the single PROFILE database is ready, and already
        # set as the default view. store the info in the db:
        views_table = dbops.TableForViews(self.profile_db_path,
                                          anvio.__profile__version__)
        views_table.append('single', 'atomic_data_splits')
        views_table.store()

        if self.contigs_shall_be_clustered:
            self.cluster_contigs()

        runinfo_serialized = self.generate_output_destination('RUNINFO.cp')
        self.run.info('runinfo', runinfo_serialized)
        self.run.store_info_dict(runinfo_serialized,
                                 strip_prefix=self.output_directory)

        self.bam.close()
        self.run.quit()
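
The `_run()` driver above expects a fully initialized profiler object. Below is a hedged sketch of constructing one with an argparse namespace; the attribute names are assumptions inferred from how the other classes in these examples consume an `args` object, not the documented anvi-profile interface:

import argparse

# attribute names below are assumptions, not the official anvi-profile flags
args = argparse.Namespace(input_file='SAMPLE-01.bam',
                          contigs_db='CONTIGS.db',
                          output_dir='PROFILE-01',
                          sample_name='SAMPLE-01',
                          num_threads=4)
BAMProfiler(args)._run()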
Example #24
    def profile(self):
        manager = multiprocessing.Manager()
        # the shared info dict is a managed dict so every worker process
        # reads the same object:
        info_dict = manager.dict({
            'input_file_path': self.input_file_path,
            'contig_names': self.contig_names,
            'contig_lengths': self.contig_lengths,
            'splits_basic_info': self.splits_basic_info,
            'split_length': self.a_meta['split_length'],
            'min_coverage_for_variability': self.min_coverage_for_variability,
            'skip_SNV_profiling': self.skip_SNV_profiling,
            'report_variability_full': self.report_variability_full,
            'contig_name_to_splits': self.contig_name_to_splits,
            'contig_sequences': self.contig_sequences,
            'min_mean_coverage': self.min_mean_coverage
        })

        available_index_queue = manager.Queue()
        output_queue = manager.Queue(self.queue_size)

        # put contig indices into the queue to be read from within
        # the worker
        for i in range(0, self.num_contigs):
            available_index_queue.put(i)

        processes = []
        for i in range(0, self.num_threads):
            processes.append(
                multiprocessing.Process(
                    target=BAMProfiler.profile_contig_worker,
                    args=(available_index_queue, output_queue, info_dict)))

        for proc in processes:
            proc.start()

        received_contigs = 0
        discarded_contigs = 0
        memory_usage = None

        self.progress.new('Profiling using %d thread%s'
                          % (self.num_threads, 's' if self.num_threads > 1 else ''))
        self.progress.update('initializing threads ...')
        # FIXME: memory usage should be generalized.
        last_memory_update = int(time.time())

        self.progress.update('contigs are being processed ...')
        while received_contigs < self.num_contigs:
            try:
                contig = output_queue.get()

                # if we have a contig back, it means we are good to go with it,
                # otherwise it is garbage.
                if contig:
                    self.contigs.append(contig)
                else:
                    discarded_contigs += 1

                received_contigs += 1

                if (int(time.time()) - last_memory_update) > 5:
                    memory_usage = utils.get_total_memory_usage()
                    last_memory_update = int(time.time())

                self.progress.update('Processed %d of %d contigs. Current memory usage: %s' % \
                            (received_contigs, self.num_contigs, memory_usage or '...'))

                # here you're about to witness the poor side of Python (or our use of it).
                # the problem we ran into here was the lack of action from the garbage
                # collector on processed objects. although we couldn't find any refs to
                # these objects, the garbage collector kept them in memory, and a `del`
                # statement on the `split` object did not yield any improvement either.
                # so here we are explicitly deleting the atomic data structures inside
                # our split objects, to relieve memory by helping the garbage collector
                # realize what's up.
                if self.write_buffer_size > 0 and len(self.contigs) % self.write_buffer_size == 0:
                    self.store_contigs_buffer()
                    for c in self.contigs:
                        for split in c.splits:
                            del split.coverage
                            del split.auxiliary
                            del split
                        del c.splits[:]
                        del c.coverage
                        del c
                    del self.contigs[:]
            except KeyboardInterrupt:
                print("Anvi'o profiler received SIGINT, terminating all processes...")
                break

        for proc in processes:
            proc.terminate()

        self.store_contigs_buffer()
        self.auxiliary_db.close()
        self.progress.end()

        # FIXME: this needs to be checked:
        if discarded_contigs > 0:
            self.run.info('contigs_after_C',
                          pp(received_contigs - discarded_contigs))

        overall_mean_coverage = 1
        if self.total_length_of_all_contigs != 0:
            overall_mean_coverage = self.total_coverage_values_for_all_contigs / self.total_length_of_all_contigs

        # FIXME: We know this is ugly. You can keep your opinion to yourself.
        if overall_mean_coverage > 0.0:
            # avoid dividing by zero
            profile_db = dbops.ProfileDatabase(self.profile_db_path)
            profile_db.db._exec("UPDATE atomic_data_splits SET abundance = abundance / "
                                + str(overall_mean_coverage) + " * 1.0;")
            profile_db.db._exec("UPDATE atomic_data_contigs SET abundance = abundance / "
                                + str(overall_mean_coverage) + " * 1.0;")
            profile_db.disconnect()

        self.check_contigs(num_contigs=received_contigs - discarded_contigs)
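
The producer/consumer layout in `profile()` is independent of anvi'o: workers pull contig indices from one managed queue and push results to another. A self-contained sketch of the same pattern, with a trivial stand-in for the per-contig work (all names are illustrative):

import multiprocessing
import queue

def profile_worker(index_queue, output_queue, items):
    # keep pulling indices until the queue stays empty for a second
    while True:
        try:
            i = index_queue.get(timeout=1)
        except queue.Empty:
            break
        output_queue.put(items[i] * 2)  # stand-in for profiling one contig

if __name__ == '__main__':
    manager = multiprocessing.Manager()
    index_queue, output_queue = manager.Queue(), manager.Queue()
    items = list(range(10))
    for i in range(len(items)):
        index_queue.put(i)

    processes = [multiprocessing.Process(target=profile_worker,
                                         args=(index_queue, output_queue, items))
                 for _ in range(4)]
    for proc in processes:
        proc.start()

    results = [output_queue.get() for _ in items]

    for proc in processes:
        proc.join()

    print(sorted(results))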
Example #25
    def init_profile_from_BAM(self):
        self.progress.new('Init')
        self.progress.update('Reading BAM File')
        self.bam = bamops.BAMFileObject(self.input_file_path,
                                        run=self.run,
                                        progress=self.progress).get()
        self.num_reads_mapped = self.bam.mapped
        self.progress.end()

        self.contig_names = self.bam.references
        self.contig_lengths = self.bam.lengths

        utils.check_contig_names(self.contig_names)

        runinfo = self.generate_output_destination('RUNINFO')
        self.run.init_info_file_obj(runinfo)
        self.run.info('input_bam', self.input_file_path)
        self.run.info('output_dir', self.output_directory, display_only=True)
        self.run.info('total_reads_mapped', pp(int(self.num_reads_mapped)))
        self.run.info('num_contigs', pp(len(self.contig_names)))

        if self.contig_names_of_interest:
            indexes = [
                self.contig_names.index(r)
                for r in self.contig_names_of_interest
                if r in self.contig_names
            ]
            self.contig_names = [self.contig_names[i] for i in indexes]
            self.contig_lengths = [self.contig_lengths[i] for i in indexes]
            self.run.info('num_contigs_selected_for_analysis',
                          pp(len(self.contig_names)))

        # it brings good karma to let the user know what the hell is wrong with their data:
        self.check_contigs_without_any_gene_calls(self.contig_names)

        # check for the -M parameter.
        contigs_longer_than_M = set()
        for i in range(0, len(self.contig_names)):
            if self.contig_lengths[i] >= self.min_contig_length:
                contigs_longer_than_M.add(i)

        if not contigs_longer_than_M:
            raise ConfigError("0 contigs larger than %s nts." % pp(self.min_contig_length))
        else:
            self.contig_names = [
                self.contig_names[i] for i in contigs_longer_than_M
            ]
            self.contig_lengths = [
                self.contig_lengths[i] for i in contigs_longer_than_M
            ]
            self.num_contigs = len(
                self.contig_names)  # we will store these two
            self.total_length = sum(
                self.contig_lengths)  # into the db in a second.

        # let's see whether the user screwed up to follow the simple instructions
        # mentioned here: http://merenlab.org/2015/05/01/anvio-tutorial/#preparation
        for contig_name in self.contig_names:
            if contig_name not in self.contig_names_in_contigs_db:
                raise ConfigError("At least one contig name in your BAM file does not match contig names stored in the\
                                   contigs database. For instance, this is one contig name found in your BAM file: '%s',\
                                   and this is another one found in your contigs database: '%s'. You may be using a\
                                   contigs database for profiling that has nothing to do with the BAM file you are\
                                   trying to profile, or you may have failed to fix your contig names in your FASTA file\
                                   prior to mapping, which is described here: %s" \
                                       % (contig_name, self.contig_names_in_contigs_db.pop(), 'http://goo.gl/Q9ChpS'))

        contigs_longer_than_M = set(self.contig_names)  # for fast access
        self.split_names = set()
        self.contig_name_to_splits = {}
        for split_name in sorted(self.splits_basic_info.keys()):
            parent = self.splits_basic_info[split_name]['parent']

            if parent not in contigs_longer_than_M:
                continue

            self.split_names.add(split_name)

            if parent in self.contig_name_to_splits:
                self.contig_name_to_splits[parent].append(split_name)
            else:
                self.contig_name_to_splits[parent] = [split_name]

        # we just recovered number of splits that are coming from contigs
        # longer than M:
        self.num_splits = len(self.split_names)

        self.run.info('num_contigs_after_M',
                      self.num_contigs,
                      display_only=True)
        self.run.info('num_contigs', self.num_contigs, quiet=True)
        self.run.info('num_splits', self.num_splits)
        self.run.info('total_length', self.total_length)

        profile_db = dbops.ProfileDatabase(self.profile_db_path, quiet=True)
        profile_db.db.set_meta_value('num_splits', self.num_splits)
        profile_db.db.set_meta_value('num_contigs', self.num_contigs)
        profile_db.db.set_meta_value('total_length', self.total_length)
        profile_db.db.set_meta_value('total_reads_mapped',
                                     int(self.num_reads_mapped))
        profile_db.disconnect()
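
The attributes used above (`.references`, `.lengths`, `.mapped`) match pysam's AlignmentFile, which `bamops.BAMFileObject` appears to wrap; a minimal direct-pysam sketch of the same metadata reads, with an illustrative file name:

import pysam

bam = pysam.AlignmentFile('SAMPLE-01.bam', 'rb')
contig_names = bam.references    # tuple of contig names
contig_lengths = bam.lengths     # tuple of matching lengths
num_reads_mapped = bam.mapped    # requires a BAM index (.bai) to be present
bam.close()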
Example #26
    def gen_view_data_tables_from_atomic_data(self):
        essential_fields = [f for f in self.atomic_data_fields if constants.IS_ESSENTIAL_FIELD(f)]
        auxiliary_fields = [f for f in self.atomic_data_fields if constants.IS_AUXILIARY_FIELD(f)]

        views_table = dbops.TableForViews(self.profile_db_path,
                                          anvio.__profile__version__,
                                          progress=self.progress)

        # setting standard view table structure and types
        view_table_structure = ['contig'] + self.merged_sample_ids + auxiliary_fields
        view_table_types = ['text'] + ['numeric'] * len(self.merged_sample_ids) + ['text']

        # generate a dictionary for normalized coverage of each contig across samples per target
        self.normalized_coverages = {'contigs': {}, 'splits': {}}
        for target in ['contigs', 'splits']:
            for split_name in self.split_names:
                self.normalized_coverages[target][split_name] = {}
                for sample_id in self.merged_sample_ids:
                    self.normalized_coverages[target][split_name][sample_id] = \
                        self.get_normalized_coverage_of_split(target, sample_id, split_name)

        # generate a dictionary for max normalized ratio of each contig across samples per target
        self.max_normalized_ratios = {'contigs': {}, 'splits': {}}
        for target in ['contigs', 'splits']:
            for split_name in self.split_names:
                self.max_normalized_ratios[target][split_name] = \
                    self.get_max_normalized_ratio_of_split(target, split_name)

        self.progress.new('Generating view data tables')
        profile_db = dbops.ProfileDatabase(self.profile_db_path, quiet=True)
        for target in ['contigs', 'splits']:
            for essential_field in essential_fields:
                self.progress.update('Processing %s for %s ...' %
                                     (essential_field, target))

                target_table = '_'.join([essential_field, target])

                m = {}
                for split_name in self.split_names:
                    m[split_name] = {'__parent__': self.split_parents[split_name]}

                    for sample_id in self.merged_sample_ids:
                        if essential_field == 'normalized_coverage':
                            m[split_name][sample_id] = self.normalized_coverages[target][split_name][sample_id]
                        elif essential_field == 'max_normalized_ratio':
                            m[split_name][sample_id] = self.max_normalized_ratios[target][split_name][sample_id]
                        elif essential_field == 'relative_abundance':
                            m[split_name][sample_id] = self.get_relative_abundance_of_split(target, sample_id, split_name)
                        else:
                            m[split_name][sample_id] = self.atomic_data_for_each_run[target][sample_id][split_name][essential_field]

                # variable 'm' for the essential field is now ready to be its own table:
                profile_db.db.create_table(target_table, view_table_structure,
                                           view_table_types)
                db_entries = [tuple([split_name] + [m[split_name][h] for h in view_table_structure[1:]])
                              for split_name in self.split_names]
                profile_db.db._exec_many('''INSERT INTO %s VALUES (%s)'''
                                         % (target_table, ','.join(['?'] * len(view_table_structure))),
                                         db_entries)

                if target == 'splits':
                    views_table.append(essential_field, target_table)

        profile_db.disconnect()
        self.progress.end()

        # store views in the database
        views_table.store()
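
For each (essential_field, target) pair, the `create_table`/`_exec_many` calls above boil down to a plain SQLite table: one row per split, one numeric column per sample, plus the '__parent__' auxiliary column. A self-contained sqlite3 sketch with illustrative table, sample, and split names:

import sqlite3

db = sqlite3.connect(':memory:')
db.execute('CREATE TABLE mean_coverage_splits '
           '(contig text, SAMPLE_01 numeric, SAMPLE_02 numeric, __parent__ text)')
db_entries = [('contig_1_split_00001', 12.4, 3.1, 'contig_1'),
              ('contig_1_split_00002', 11.9, 2.8, 'contig_1')]
db.executemany('INSERT INTO mean_coverage_splits VALUES (?, ?, ?, ?)', db_entries)
db.commit()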
Example #27
    def merge(self):
        self.sanity_check()
        self.set_sample_id()

        filesnpaths.gen_output_directory(
            self.output_directory,
            delete_if_exists=self.overwrite_output_destinations)

        # init profile database
        self.profile_db_path = os.path.join(self.output_directory,
                                            'PROFILE.db')

        profile_db = dbops.ProfileDatabase(self.profile_db_path)

        # dict.values() is not subscriptable in Python 3; grab the first
        # run's info dict explicitly:
        first_runinfo = list(self.input_runinfo_dicts.values())[0]
        self.contigs_db_hash = first_runinfo['contigs_db_hash']
        self.min_contig_length = first_runinfo['min_contig_length']
        self.num_contigs = first_runinfo['num_contigs']
        self.num_splits = first_runinfo['num_splits']
        self.min_coverage_for_variability = first_runinfo['min_coverage_for_variability']
        self.total_length = first_runinfo['total_length']
        meta_values = {
            'db_type': 'profile',
            'anvio': __version__,
            'sample_id': self.sample_id,
            'samples': ','.join(self.merged_sample_ids),
            'merged': True,
            'contigs_clustered': not self.skip_hierarchical_clustering,
            'default_view': 'mean_coverage',
            'min_contig_length': self.min_contig_length,
            'min_coverage_for_variability': self.min_coverage_for_variability,
            'num_contigs': self.num_contigs,
            'num_splits': self.num_splits,
            'total_length': self.total_length,
            'contigs_db_hash': self.contigs_db_hash
        }
        profile_db.create(meta_values)

        # get view data information for both contigs and splits:
        self.atomic_data_fields, self.atomic_data_for_each_run = self.read_atomic_data_tables()
        self.split_parents = self.get_split_parents()

        self.run.info('profiler_version', anvio.__profile__version__)
        self.run.info('output_dir', self.output_directory)
        self.run.info('sample_id', self.sample_id)
        self.run.info('profile_db', self.profile_db_path)
        self.run.info('merged', True)
        self.run.info('contigs_db_hash', self.contigs_db_hash)
        self.run.info('merged_sample_ids', self.merged_sample_ids)
        self.run.info('cmd_line', utils.get_cmd_line())
        self.run.info('num_runs_processed', len(self.merged_sample_ids))
        self.run.info('clustering_performed',
                      not self.skip_hierarchical_clustering)

        self.set_normalization_multiplier()

        self.progress.new('Merging gene coverages tables')
        self.merge_gene_coverages_tables()
        self.progress.end()

        self.progress.new('Merging split coverage values')
        self.merge_split_coverage_data()
        self.progress.end()

        self.progress.new('Merging variable positions tables')
        self.merge_variable_positions_tables()
        self.progress.end()

        # critical part:
        self.gen_view_data_tables_from_atomic_data()

        # We cluster? Note: the check is being done in the function!
        self.cluster_contigs_anvio()

        self.progress.end()

        # store everything
        runinfo_serialized = os.path.join(self.output_directory, 'RUNINFO.mcp')
        self.run.info('runinfo', runinfo_serialized)
        self.run.store_info_dict(runinfo_serialized,
                                 strip_prefix=self.output_directory)

        # run CONCOCT, if otherwise is not requested:
        if not self.skip_concoct_binning and __CONCOCT_IS_AVAILABLE__:
            self.bin_contigs_concoct()

        self.run.quit()
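
The meta values written by `profile_db.create(meta_values)` above can be inspected with plain sqlite3. A hedged sketch: anvi'o profile databases of this vintage keep metadata in a key/value table, but the table name 'self' and its column names are assumptions here, not confirmed by the snippet:

import sqlite3

db = sqlite3.connect('PROFILE.db')
# table name 'self' and columns (key, value) are assumptions
meta = dict(db.execute('SELECT key, value FROM self'))
print(meta['sample_id'], meta['merged'], meta['num_splits'])
db.close()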