Example #1
    def populate_layer_additional_data_dict(self, missing_default_data_group_is_OK=False):
        self.progress.new('Layer additional data ops')
        self.progress.update('...')

        data_groups_common_to_all_profile_dbs = set()

        for i, p in enumerate(self.input_profile_db_paths):
            group_names = set(TableForLayerAdditionalData(argparse.Namespace(profile_db=p)).get_group_names())
            if i == 0:
                data_groups_common_to_all_profile_dbs = group_names
            else:
                data_groups_common_to_all_profile_dbs.intersection_update(group_names)

        if 'default' not in data_groups_common_to_all_profile_dbs and not missing_default_data_group_is_OK:
            raise ConfigError("There is something wrong with your input databases. The group name 'default'\
                               should be common to all of them, but it doesn't seem to be the case :/ How did\
                               you end up with an anvi'o single profile database that doesn't have the 'default'\
                               group in its additional layer data table? It is very likely that your profiling\
                               step failed for some reason for one or more of your databases :(")

        taxonomic_data_groups = set(constants.levels_of_taxonomy).intersection(data_groups_common_to_all_profile_dbs)
        regular_data_groups = data_groups_common_to_all_profile_dbs.difference(taxonomic_data_groups)

        self.progress.end()

        self.__populate_layer_additional_data_dict_for_regular_data_groups(regular_data_groups)
        self.__populate_layer_additional_data_dict_for_taxonomic_data_groups(taxonomic_data_groups)
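
A minimal, self-contained sketch of the set-intersection pattern used above to find the data groups shared by every input profile database. The group names below are hypothetical stand-ins for whatever `TableForLayerAdditionalData(...).get_group_names()` would return for each database.

    # hypothetical group names per profile database, standing in for
    # TableForLayerAdditionalData(...).get_group_names() results
    group_names_per_db = [
        ['default', 't_phylum', 't_genus'],
        ['default', 't_genus'],
        ['default', 't_genus', 'extra_group'],
    ]

    # start from the first database's groups, then intersect with the rest
    common_groups = set(group_names_per_db[0])
    for group_names in group_names_per_db[1:]:
        common_groups.intersection_update(group_names)

    print(sorted(common_groups))  # ['default', 't_genus']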
Example #2
    def add_genomes_across_metagenomes_dict_into_pan_database(self):
        genomes_across_metagenomes_dict = self.get_genomes_across_metagenomes_dict()

        # `just_do_it` presumably tells TableForLayerAdditionalData.add to proceed without
        # stopping for additional confirmation checks
        self.args.just_do_it = True
        TableForLayerAdditionalData(self.args).add(genomes_across_metagenomes_dict, self.sample_names)
Example #3
    def populate_layers_additional_data_and_layer_orders(self):
        self.run.info_single("Additional data and layer orders...", nl_before=1, nl_after=1, mc="blue")

        essential_fields = [f for f in self.atomic_data_fields if constants.IS_ESSENTIAL_FIELD(f)]

        # initialize views.
        args = argparse.Namespace(profile_db = self.merged_profile_db_path)
        profile_db_super = dbops.ProfileSuperclass(args)
        profile_db_super.load_views(omit_parent_column=True)

        # figure out sample orders dictionary
        layer_orders_data_dict = {}
        failed_attempts = []
        self.progress.new('Working on layer orders')
        for essential_field in essential_fields:
            self.progress.update('recovering order for "%s"' % (essential_field))
            try:
                data_value = clustering.get_newick_tree_data_for_dict(profile_db_super.views[essential_field]['dict'],
                                                                      distance=self.distance,
                                                                      linkage=self.linkage,
                                                                      transpose=True)

                layer_orders_data_dict[essential_field] = {'data_value': data_value, 'data_type': 'newick'}
            except Exception:
                failed_attempts.append(essential_field)
        self.progress.end()

        if not len(layer_orders_data_dict):
            self.run.warning("This may or may not be important: anvi'o attempted to generate orders for your\
                              samples based on the view data, but it failed :/")
            return

        if len(failed_attempts):
            self.run.warning("While anvi'o was trying to generate clusterings of samples based on view data\
                              available in the merged profile, clustering of some of the essential data\
                              failed. It is likely not a very big deal, but you shall be the judge of it.\
                              Anvi'o now proceeds to store layers order information for those view items\
                              for which the clustering in fact worked. Here is the list of stuff that failed: '%s'"\
                              % (', '.join(failed_attempts)))

        self.progress.new('Working on layer additional data')
        self.progress.update('...')

        layer_additional_data_dict = {}
        for sample_name in self.sample_ids_found_in_input_dbs:
            layer_additional_data_dict[sample_name] = {}

        # figure out num reads mapped per sample:
        for sample_name in self.sample_ids_found_in_input_dbs:
            layer_additional_data_dict[sample_name]['num_mapped_reads'] = self.total_reads_mapped_per_sample[sample_name]

        self.progress.end()

        TableForLayerOrders(args).add(layer_orders_data_dict)
        TableForLayerAdditionalData(args).add(layer_additional_data_dict, ['num_mapped_reads'])
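
For orientation, here is a toy illustration of the two dictionary shapes this method hands to `TableForLayerOrders(args).add(...)` and `TableForLayerAdditionalData(args).add(...)`; the sample names, newick string, and read counts are made up.

    # shape of the layer orders data (one entry per view item that clustered successfully):
    layer_orders_data_dict = {
        'mean_coverage': {'data_type': 'newick',
                          'data_value': '((S01:0.1,S02:0.1):0.2,S03:0.3);'},
    }

    # shape of the layer additional data (one inner dict per sample):
    layer_additional_data_dict = {
        'S01': {'num_mapped_reads': 1250000},
        'S02': {'num_mapped_reads': 980000},
        'S03': {'num_mapped_reads': 1431000},
    }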
Example #4
    def populate_misc_data_tables(self):
        self.run.info_single("Additional data and layer orders...", nl_before=1, nl_after=1, mc="blue")

        essential_fields = [f for f in self.atomic_data_fields if constants.IS_ESSENTIAL_FIELD(f)]

        # initialize views.
        args = argparse.Namespace(profile_db = self.merged_profile_db_path)
        profile_db_super = dbops.ProfileSuperclass(args)
        profile_db_super.load_views(omit_parent_column=True)

        # figure out layer orders dictionary
        layer_orders_data_dict = {}
        failed_attempts = []
        self.progress.new('Working on layer orders')
        for essential_field in essential_fields:
            self.progress.update('recovering order for "%s"' % (essential_field))
            try:
                data_value = clustering.get_newick_tree_data_for_dict(profile_db_super.views[essential_field]['dict'],
                                                                      distance=self.distance,
                                                                      linkage=self.linkage,
                                                                      transpose=True)

                layer_orders_data_dict[essential_field] = {'data_value': data_value, 'data_type': 'newick'}
            except Exception:
                failed_attempts.append(essential_field)
        self.progress.end()

        if not len(layer_orders_data_dict):
            self.run.warning("This may or may not be important: anvi'o attempted to generate orders for your\
                              samples based on the view data, but it failed :/")
            return

        if len(failed_attempts):
            self.run.warning("While anvi'o was trying to generate clusterings of samples based on view data\
                              available in the merged profile, clustering of some of the essential data\
                              failed. It is likely not a very big deal, but you shall be the judge of it.\
                              Anvi'o now proceeds to store layers order information for those view items\
                              for which the clustering in fact worked. Here is the list of stuff that failed: '%s'"\
                              % (', '.join(failed_attempts)))

        # add the layer orders quietly
        TableForLayerOrders(args, r=terminal.Run(verbose=False)).add(layer_orders_data_dict)
        self.run.warning(None, header="Layer orders added", lc='cyan')
        for layer_order in layer_orders_data_dict:
            self.run.info_single(layer_order, mc='cyan')

        # done with layer orders. let's add our layer additional data and call it a day.
        for data_group_name in self.layer_additional_data_dict:
            args.target_data_group = data_group_name
            TableForLayerAdditionalData(args, r=terminal.Run(verbose=False)).add(self.layer_additional_data_dict[data_group_name],
                                                                                 list(self.layer_additional_data_keys[data_group_name]))

        self.run.warning(None, header="Data groups added", lc='cyan')
        for data_group in self.layer_additional_data_dict:
            self.run.info_single('%s (w/%d items)' % (data_group, len(self.layer_additional_data_keys[data_group])), mc='cyan')
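
For reference, a toy example of the two attributes this loop consumes; the group names, sample names, and keys are invented, but the nesting (data group → sample → key → value, and data group → list of keys) matches how Examples #5 and #6 below populate them.

    # self.layer_additional_data_dict: data group -> sample -> key -> value
    layer_additional_data_dict = {
        'default': {'S01': {'total_reads_mapped': 100},
                    'S02': {'total_reads_mapped': 200}},
        't_genus': {'S01': {'t_genus!Prevotella': 3},
                    'S02': {'t_genus!Prevotella': 7}},
    }

    # self.layer_additional_data_keys: data group -> keys stored for that group
    layer_additional_data_keys = {
        'default': ['total_reads_mapped'],
        't_genus': ['t_genus!Prevotella'],
    }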
Example #5
    def __populate_layer_additional_data_dict_for_regular_data_groups(self, data_group_names):
        dicts_of_layer_additional_data_dicts = {}

        for data_group_name in data_group_names:
            self.layer_additional_data_dict[data_group_name] = {}
            self.layer_additional_data_keys[data_group_name] = []

            for p in self.input_profile_db_paths:
                keys, data = TableForLayerAdditionalData(argparse.Namespace(profile_db = p, target_data_group=data_group_name)).get()
                dicts_of_layer_additional_data_dicts[p] = {'keys': keys, 'data': data}

            # find common keys to all layer additional data tables:
            all_keys = [set(e['keys']) for e in dicts_of_layer_additional_data_dicts.values()]

            layer_additional_data_keys = all_keys.pop()

            for key_set in all_keys:
                layer_additional_data_keys.intersection_update(key_set)

            # 'total_reads_mapped' must be present in the 'default' data group of every single
            # profile; if it is missing from any of them, something is wrong with the inputs.
            if data_group_name == 'default' and 'total_reads_mapped' not in layer_additional_data_keys:
                self.progress.end()
                raise ConfigError("While trying to learn everything there is to learn about layer additional data in single\
                                   profiles to be merged, anvi'o realized that 'total_reads_mapped' is not common to all \
                                   profile databases :( This is bad, because this indicates there is something terribly\
                                   wrong with one or more of your single profile databases. If you are a programmer trying to\
                                   mimic anvi'o single profiles, you will have to look at the code of the profiler a bit more\
                                   carefully. If you are a user, well, you are *really* in trouble... Send us an e-mail or\
                                   something?")

            # otherwise, let's create a final data dictionary for this data group based on
            # their common keys.
            for data in [v['data'] for v in dicts_of_layer_additional_data_dicts.values()]:
                sample_id = list(data.keys())[0]

                keys_of_no_interest = [k for k in data[sample_id] if k not in layer_additional_data_keys]
                for key in keys_of_no_interest:
                    data[sample_id].pop(key)

                self.layer_additional_data_dict[data_group_name][sample_id] = data[sample_id]

            self.layer_additional_data_keys[data_group_name] = layer_additional_data_keys
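
A self-contained sketch of the merge strategy above: reduce the per-database key sets to their intersection, then keep only those common keys for each sample. The database paths, sample names, and keys are hypothetical stand-ins for `TableForLayerAdditionalData(...).get()` results.

    # hypothetical data per single profile database (sample -> key -> value)
    data_per_db = {
        'p1/PROFILE.db': {'S01': {'total_reads_mapped': 100, 'num_SNVs_reported': 5, 'only_in_p1': 1}},
        'p2/PROFILE.db': {'S02': {'total_reads_mapped': 200, 'num_SNVs_reported': 9}},
    }

    # keys common to every single profile
    common_keys = set.intersection(*(set(next(iter(d.values()))) for d in data_per_db.values()))

    # keep only the common keys for each sample
    merged = {}
    for data in data_per_db.values():
        sample_id = list(data.keys())[0]
        merged[sample_id] = {k: v for k, v in data[sample_id].items() if k in common_keys}

    print(sorted(common_keys))  # ['num_SNVs_reported', 'total_reads_mapped']
    print(merged['S02'])        # {'total_reads_mapped': 200, 'num_SNVs_reported': 9}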
Example #6
    def __populate_layer_additional_data_dict_for_taxonomic_data_groups(self, data_group_names):
        if not data_group_names:
            return

        self.run.warning("Anvi'o found %d data groups for taxonomy (%s), and will do its best to make sure they\
                          get worked into the merged profile database. A moment of zero promises but crossed\
                          fingers (which is the best way to avoid most computational poopsies)." \
                              % (len(data_group_names), ', '.join(data_group_names)),
                         header="GOOD NEWS",
                         lc="green")

        dicts_of_layer_additional_data_dicts = {}

        for data_group_name in data_group_names:
            self.layer_additional_data_dict[data_group_name] = {}
            self.layer_additional_data_keys[data_group_name] = []

            all_keys = set()
            for p in self.input_profile_db_paths:
                keys, data = TableForLayerAdditionalData(argparse.Namespace(profile_db=p, target_data_group=data_group_name)).get()
                dicts_of_layer_additional_data_dicts[p] = data
                all_keys.update(set(keys))

            # here we are building a data dict that will make sure every profile has an entry in the dict
            # for every key in `all_keys` in this data group.
            layer_additional_data_dict = {}
            for data in dicts_of_layer_additional_data_dicts.values():
                for layer_name in data:
                    for key in all_keys:
                        if key not in data[layer_name]:
                            data[layer_name][key] = 0

                    layer_additional_data_dict[layer_name] = data[layer_name]

            self.layer_additional_data_dict[data_group_name] = layer_additional_data_dict
            self.layer_additional_data_keys[data_group_name] = all_keys
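
A minimal sketch of the zero-filling step above, which guarantees every layer has a value for every key seen in any single profile; the layer and key names are made up.

    # hypothetical taxonomic data for two layers with different key sets
    data = {
        'S01': {'t_genus!Bacteroides': 12, 't_genus!Prevotella': 3},
        'S02': {'t_genus!Prevotella': 7},
    }

    # union of all keys seen in any layer
    all_keys = set()
    for layer in data.values():
        all_keys.update(layer)

    # make sure every layer has an entry for every key, defaulting to 0
    for layer in data.values():
        for key in all_keys:
            layer.setdefault(key, 0)

    print(data['S02'])  # {'t_genus!Prevotella': 7, 't_genus!Bacteroides': 0}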
Example #7
    def _run(self):
        self.check_args()

        self.set_sample_id()

        self.init_dirs_and_dbs()

        self.run.log_file_path = self.generate_output_destination('RUNLOG.txt')
        self.run.info('anvio', anvio.__version__)
        self.run.info('profiler_version', anvio.__profile__version__)
        self.run.info('sample_id', self.sample_id)
        self.run.info('description', 'Found (%d characters)' % len(self.description) if self.description else None)
        self.run.info('profile_db', self.profile_db_path, display_only=True)
        self.run.info('contigs_db', True if self.contigs_db_path else False)
        self.run.info('contigs_db_hash', self.a_meta['contigs_db_hash'])
        self.run.info('cmd_line', utils.get_cmd_line())
        self.run.info('merged', False)
        self.run.info('blank', self.blank)
        self.run.info('split_length', self.a_meta['split_length'])
        self.run.info('min_contig_length', self.min_contig_length)
        self.run.info('min_mean_coverage', self.min_mean_coverage)
        self.run.info('clustering_performed', self.contigs_shall_be_clustered)
        self.run.info('min_coverage_for_variability', self.min_coverage_for_variability)
        self.run.info('skip_SNV_profiling', self.skip_SNV_profiling)
        self.run.info('profile_SCVs', self.profile_SCVs)
        self.run.info('report_variability_full', self.report_variability_full)

        self.run.warning("Your minimum contig length is set to %s base pairs. So anvi'o will not take into\
                          consideration anything below that. If you need to kill this and restart your\
                          analysis with another minimum contig length value, feel free to press CTRL+C." \
                                                % (pp(self.min_contig_length)))

        # this is kinda important. we do not run the full-blown profile function if we are
        # dealing with a blank profile...
        if self.blank:
            self.init_mock_profile()

            # creating a null view_data_splits dict:
            view_data_splits = dict(
                list(
                    zip(self.split_names, [
                        dict(
                            list(
                                zip(t.atomic_data_table_structure[1:], [None] *
                                    len(t.atomic_data_table_structure[1:]))))
                    ] * len(self.split_names))))
            TablesForViews(self.profile_db_path).remove(
                'single', table_names_to_blank=['atomic_data_splits'])
            TablesForViews(self.profile_db_path).create_new_view(
                data_dict=view_data_splits,
                table_name='atomic_data_splits',
                table_structure=t.atomic_data_table_structure,
                table_types=t.atomic_data_table_types,
                view_name='single')
        elif self.input_file_path:
            self.init_profile_from_BAM()
            self.profile()
        else:
            raise ConfigError(
                "What are you doing? :( Whatever it is, anvi'o will have none of it."
            )

        # update layer additional data table content
        if self.layer_additional_data:
            layer_additional_data_table = TableForLayerAdditionalData(
                argparse.Namespace(profile_db=self.profile_db_path))
            layer_additional_data_table.add(
                {self.sample_id: self.layer_additional_data},
                self.layer_additional_keys)

        if self.contigs_shall_be_clustered:
            self.cluster_contigs()

        if self.bam:
            self.bam.close()

        self.run.quit()
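
The nested `dict(list(zip(...)))` expression that builds `view_data_splits` above produces one all-`None` row per split; a dict comprehension expresses the same idea more plainly. The table structure and split names below are made-up stand-ins for `t.atomic_data_table_structure` and `self.split_names`.

    # made-up stand-in for t.atomic_data_table_structure (the first column names the item)
    atomic_data_table_structure = ['contig', 'std_coverage', 'mean_coverage', 'detection']
    split_names = ['split_00001', 'split_00002']

    # one all-None row per split, covering every column except the first
    view_data_splits = {split_name: {column: None for column in atomic_data_table_structure[1:]}
                        for split_name in split_names}

    print(view_data_splits['split_00001'])
    # {'std_coverage': None, 'mean_coverage': None, 'detection': None}

Unlike the `[dict(...)] * len(...)` form, the comprehension gives each split its own inner dict, which avoids accidental sharing if the rows are ever mutated per split.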
Example #8
    def _run(self):
        self.check_args()

        self.set_sample_id()

        self.init_dirs_and_dbs()

        self.run.log_file_path = self.generate_output_destination('RUNLOG.txt')
        self.run.info('anvio', anvio.__version__)
        self.run.info('profiler_version', anvio.__profile__version__)
        self.run.info('sample_id', self.sample_id)
        self.run.info('description', 'Found (%d characters)' % len(self.description) if self.description else None)
        self.run.info('profile_db', self.profile_db_path, display_only=True)
        self.run.info('contigs_db', True if self.contigs_db_path else False)
        self.run.info('contigs_db_hash', self.a_meta['contigs_db_hash'])
        self.run.info('cmd_line', utils.get_cmd_line())
        self.run.info('merged', False)
        self.run.info('blank', self.blank)
        self.run.info('split_length', self.a_meta['split_length'])
        self.run.info('min_contig_length', self.min_contig_length)
        self.run.info('max_contig_length', self.max_contig_length)
        self.run.info('min_mean_coverage', self.min_mean_coverage)
        self.run.info('clustering_performed', self.contigs_shall_be_clustered)
        self.run.info('min_coverage_for_variability', self.min_coverage_for_variability)
        self.run.info('skip_SNV_profiling', self.skip_SNV_profiling)
        self.run.info('profile_SCVs', self.profile_SCVs)
        self.run.info('report_variability_full', self.report_variability_full)

        self.run.warning("Your minimum contig length is set to %s base pairs. So anvi'o will not take into\
                          consideration anything below that. If you need to kill this and restart your\
                          analysis with another minimum contig length value, feel free to press CTRL+C." \
                                                % (pp(self.min_contig_length)))

        if self.max_contig_length < sys.maxsize:
            self.run.warning("Your maximum contig length is set to %s base pairs, which means anvi'o will remove\
                              any contigs longer than this value." % pp(self.max_contig_length))

        # this is kinda important. we do not run the full-blown profile function if we are
        # dealing with a blank profile...
        if self.blank:
            self.init_mock_profile()

            # creating a null view_data_splits dict:
            view_data_splits = dict(list(zip(self.split_names, [dict(list(zip(t.atomic_data_table_structure[1:], [None] * len(t.atomic_data_table_structure[1:]))))] * len(self.split_names))))
            TablesForViews(self.profile_db_path).remove('single', table_names_to_blank=['atomic_data_splits'])
            TablesForViews(self.profile_db_path).create_new_view(
                                           data_dict=view_data_splits,
                                           table_name='atomic_data_splits',
                                           table_structure=t.atomic_data_table_structure,
                                           table_types=t.atomic_data_table_types,
                                           view_name='single')
        elif self.input_file_path:
            self.init_profile_from_BAM()
            self.profile()
        else:
            raise ConfigError("What are you doing? :( Whatever it is, anvi'o will have none of it.")

        # update layer additional data table content
        if self.layer_additional_data:
            layer_additional_data_table = TableForLayerAdditionalData(argparse.Namespace(profile_db=self.profile_db_path), r=self.run, p=self.progress)
            layer_additional_data_table.add({self.sample_id: self.layer_additional_data}, self.layer_additional_keys)

        if self.contigs_shall_be_clustered:
            self.cluster_contigs()

        if self.bam:
            self.bam.close()

        self.run.info_single('Happy 😇', nl_before=1, nl_after=1)

        self.run.quit()