Example #1
0
    def export_collection(self,
                          collection_name,
                          output_file_prefix=None,
                          include_unbinned=False):
        """Export a collection (and its bins info) into TSV files.

        Writes '<prefix>.txt' with item-name/bin-name pairs and, when bins
        info is available, '<prefix>-info.txt' with bin name, source, and
        HTML color per bin.

        Parameters
        ==========
        collection_name : str
            Name of the collection to export (must pass `sanity_check`).
        output_file_prefix : str, optional
            Prefix for output files; defaults to 'collection-<name>'.
        include_unbinned : bool
            If True, items absent from every bin are reported under the
            bin name 'UNBINNED_ITEMS_BIN'.
        """
        self.sanity_check(collection_name)

        if not output_file_prefix:
            output_file_prefix = 'collection-%s' % (
                collection_name.strip().replace(' ', '-'))

        info_file_path = output_file_prefix + '-info.txt'
        items_file_path = output_file_prefix + '.txt'

        self.run.info('Report unbinned items if there are any',
                      include_unbinned)
        self.run.info('Items file path', items_file_path)
        filesnpaths.is_output_file_writable(items_file_path)

        bins_info = self.get_bins_info_dict(collection_name)
        collection = self.get_collection_dict(collection_name)

        if len(bins_info):
            self.run.info('Bins info file path', info_file_path)

            if include_unbinned:
                bins_info['UNBINNED_ITEMS_BIN'] = {
                    'html_color': '#000000',
                    'source': 'anvi-export-collections'
                }

            # context manager guarantees the handle is closed even if a
            # write fails (the original leaked it on exceptions)
            with open(info_file_path, 'w') as info_file:
                for bin_name in bins_info:
                    info_file.write('%s\t%s\t%s\n' %
                                    (bin_name, bins_info[bin_name]['source'],
                                     bins_info[bin_name]['html_color']))

        binned_items = set()

        with open(items_file_path, 'w') as items_file:
            for bin_name in collection:
                for item_name in collection[bin_name]:
                    items_file.write('%s\t%s\n' % (item_name, bin_name))
                    binned_items.add(item_name)

            if include_unbinned:
                all_items = utils.get_all_item_names_from_the_database(
                    self.db_path)

                # items known to the db that did not appear in any bin
                unbinned_items = all_items.difference(binned_items)

                for item_name in unbinned_items:
                    items_file.write('%s\tUNBINNED_ITEMS_BIN\n' % (item_name))

                self.run.warning(
                    "As per your request, %d items that were not in any of the bins in the collection '%s' are stored\
                              in the output file under the bin name 'UNBINNED_ITEMS_BIN'."
                    % (len(unbinned_items), collection_name))
Example #2
0
    def export_collection(self, collection_name, output_file_prefix=None, include_unbinned=False):
        """Export a collection (and its bins info) into TSV files.

        Writes '<prefix>.txt' with item-name/bin-name pairs and, when bins info is
        available, '<prefix>-info.txt' with bin name, source, and HTML color per bin.
        When `include_unbinned` is True, items not in any bin are reported under the
        bin name 'UNBINNED_ITEMS_BIN'.
        """
        self.sanity_check(collection_name)

        if not output_file_prefix:
            output_file_prefix = 'collection-%s' % (collection_name.strip().replace(' ', '-'))

        info_file_path = output_file_prefix + '-info.txt'
        items_file_path = output_file_prefix + '.txt'

        self.run.info('Report unbinned items if there are any', include_unbinned)
        self.run.info('Items file path', items_file_path)
        filesnpaths.is_output_file_writable(items_file_path)

        bins_info = self.get_bins_info_dict(collection_name)
        collection = self.get_collection_dict(collection_name)

        if len(bins_info):
            self.run.info('Bins info file path', info_file_path)

            if include_unbinned:
                bins_info['UNBINNED_ITEMS_BIN'] = {'html_color': '#000000', 'source': 'anvi-export-collections'}

            # context manager guarantees the handle is closed even if a write fails
            # (the original leaked it on exceptions)
            with open(info_file_path, 'w') as info_file:
                for bin_name in bins_info:
                    info_file.write('%s\t%s\t%s\n' % (bin_name, bins_info[bin_name]['source'], bins_info[bin_name]['html_color']))

        binned_items = set()

        with open(items_file_path, 'w') as items_file:
            for bin_name in collection:
                for item_name in collection[bin_name]:
                    items_file.write('%s\t%s\n' % (item_name, bin_name))
                    binned_items.add(item_name)

            if include_unbinned:
                all_items = utils.get_all_item_names_from_the_database(self.db_path)

                # items known to the db that did not appear in any bin
                unbinned_items = all_items.difference(binned_items)

                for item_name in unbinned_items:
                    items_file.write('%s\tUNBINNED_ITEMS_BIN\n' % (item_name))

                self.run.warning("As per your request, %d items that were not in any of the bins in the collection '%s' are stored\
                              in the output file under the bin name 'UNBINNED_ITEMS_BIN'." % (len(unbinned_items), collection_name))
Example #3
0
    def check_names(self, data_dict):
        """Verify that every item name in `data_dict` exists in the database.

        Raises ConfigError when the data mentions items the database does not
        have; only warns (and proceeds) when the data covers just a subset of
        the database's items.
        """

        db_items = utils.get_all_item_names_from_the_database(self.db_path)
        data_items = set(data_dict.keys())

        only_in_data = data_items.difference(db_items)
        if only_in_data:
            raise ConfigError("Well. %d of %d item names in your additional data are only in your data (which\
                               that they are not in the %s database you are working with (which means bad news)).\
                               Since there is no reason to add additional data for items that do not exist in your\
                               database, anvi'o will stop you right there. Please fix your data and come again. In\
                               case you want to see a random item that is only in your data, here is one: %s. Stuff\
                               in your db looks like this: %s." \
                                    % (len(only_in_data), len(data_items), self.db_type, \
                                       only_in_data.pop(), db_items.pop()))

        only_in_db = db_items.difference(data_items)
        if only_in_db:
            self.run.warning("Your input contains additional data for only %d of %d total number of items in your %s\
                              database. Just wanted to make sure you know what's up, but we cool." \
                                % (len(db_items) - len(only_in_db), len(db_items), self.db_type))
Example #4
0
    def sanity_check(self):
        """Validate inputs and cross-profile consistency before merging.

        Confirms the output directory is usable, a contigs database exists,
        the hierarchical-clustering flags are not contradictory, sample ids
        are unique across input profile databases, profiling parameters agree
        across all profiles (unless --force), and every profile was generated
        from the same contigs database. Raises ConfigError on any violation.
        """
        self.output_directory = filesnpaths.check_output_directory(
            self.output_directory,
            ok_if_exists=self.overwrite_output_destinations)

        if not self.contigs_db_path:
            raise ConfigError(
                "You must provide a contigs database for this operation.")

        if not os.path.exists(self.contigs_db_path):
            raise ConfigError(
                "Anvi'o couldn't find the contigs database where you said it would be :/"
            )

        if self.enforce_hierarchical_clustering and self.skip_hierarchical_clustering:
            raise ConfigError(
                "You are confusing anvi'o :/ You can't tell anvi'o to skip hierarchical clustering\
                                while also asking it to enforce it.")

        self.check_dbs_to_be_merged()

        self.populate_profile_dbs_info_dict()

        self.populate_layer_additional_data_dict()

        self.sample_ids_found_in_input_dbs = sorted([
            v['sample_id'] for v in list(self.profile_dbs_info_dict.values())
        ])
        # duplicate sample ids would collide in the merged database
        if len(self.profile_dbs_info_dict) != len(
                set(self.sample_ids_found_in_input_dbs)):
            raise ConfigError(
                "Sample ids in each single profile database to be merged must be unique. But it is not the case\
                               with your input :/ Here are the sample names in case you would like to find out which ones occur\
                               more than once: '%s'" %
                (', '.join(self.sample_ids_found_in_input_dbs)))

        # test open the contigs database (and learn its hash while doing it) to make sure we don't have
        # a deal breaker just yet
        contigs_db = dbops.ContigsDatabase(self.contigs_db_path, quiet=True)
        contigs_db_hash = contigs_db.meta['contigs_db_hash']
        contigs_db.disconnect()

        # each (key, human-readable description) pair below names a profiling
        # parameter that must be identical across all input profiles
        for k, p in [
            ('total_length', 'The number of nucleotides described'),
            ('num_contigs', 'The number of contigs'),
            ('version', 'The version number'),
            ('num_splits', 'The number of splits'),
            ('min_contig_length', 'The minimum contig length (-M) values'),
            ('max_contig_length',
             'The maximum contig length (--max-contig-length) values'),
            ('min_coverage_for_variability',
             'The minimum coverage values to report variability (-V)'),
            ('report_variability_full',
             'Whether to report full variability (--report-variability-full) flags'
             ), ('SCVs_profiled', 'Profile SCVs flags (--profile-SCVs)'),
            ('SNVs_profiled', 'SNV profiling flags (--skip-SNV-profiling)')
        ]:
            v = set([r[k] for r in list(self.profile_dbs_info_dict.values())])
            if len(v) > 1:
                # --force downgrades the mismatch to a warning
                if anvio.FORCE:
                    self.run.warning(
                        "Anvio'o found out that %s is not identical across all your profiles, but since you\
                                      have used the `--force` flag, it will continue with the merge. This is very\
                                      dangerous, and even if merging finishes succesfully, it does not mean you can trust\
                                      your results to be error free. We believe you are prepared to deal with potential\
                                      implications of forcing things because you are awesome."
                        % p,
                        lc="cyan")
                else:
                    raise ConfigError(
                        "Ouch. %s are not identical for all profiles to be merged, which is a \
                                       deal breaker. All profiles that are going to be merged must be\
                                       run with identical flags and parameters :/ You really shouldn't but if you want to\
                                       try to force things because you believe this is due to a misunderstanding, you can\
                                       use the flag --force. While you are considering this as an option, please also\
                                       remember that this we advice against it.."
                        % p)

        # get split names from one of the profile databases. split names must be identical across all
        self.split_names = sorted(
            list(
                utils.get_all_item_names_from_the_database(
                    list(self.profile_dbs_info_dict.keys())[0])))

        # make sure all runs were profiled using the same contigs database (if one used):
        hashes_for_profile_dbs = set([
            r['contigs_db_hash'] for r in self.profile_dbs_info_dict.values()
        ])
        if len(hashes_for_profile_dbs) != 1:
            if None in hashes_for_profile_dbs:
                raise ConfigError(
                    "It seems there is at least one run in the mix that was profiled using an\
                                          contigs database, and at least one other that was profiled without using\
                                          one. This is not good. All runs must be profiled using the same contigs\
                                          database, or all runs must be profiled without a contigs database :/"
                )
            else:
                raise ConfigError(
                    "It seems these runs were profiled using different contigs databases (or\
                                          different versions of the same contigs database). All runs must be\
                                          profiled using the same contigs database, or all runs must be profiled\
                                          without a contigs database :/")

        # make sure the hash for contigs db is identical across all profile databases:
        if list(hashes_for_profile_dbs)[0] != contigs_db_hash:
            raise ConfigError(
                "The contigs database you provided, which is identified with hash '%s', does\
                                      not seem to match the run profiles you are trying to merge, which share the\
                                      hash identifier of '%s'. What's up with that?"
                % (contigs_db_hash, list(hashes_for_profile_dbs)[0]))

        # do we have a description file?
        if self.description_file_path:
            filesnpaths.is_file_plain_text(self.description_file_path)
            # 'rU' mode was removed in Python 3.11; default text mode with a
            # context manager also ensures the handle is closed (the original
            # leaked it).
            with open(os.path.abspath(self.description_file_path)) as description_file:
                self.description = description_file.read()
Example #5
0
    def __init__(self,
                 args,
                 skip_sanity_check=False,
                 run=terminal.Run(),
                 progress=terminal.Progress()):
        """Set up survey parameters from `args` and load split/contig data
        from the contigs database named by the bams-and-profiles file.

        Parameters
        ==========
        args : argparse.Namespace
            Must carry `bams_and_profiles`; other attributes are optional
            and fall back to the defaults assigned below.
        skip_sanity_check : bool
            If True, `self.sanity_check()` is not called during init.
        """
        self.args = args
        self.run = run
        self.progress = progress

        # missing argparse attributes silently become None
        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.bams_and_profiles_file_path = A('bams_and_profiles')

        if not self.bams_and_profiles_file_path:
            raise ConfigError(
                "Sorry, you can't get an instance of this class without a `--bams-and-profiles` argument."
            )

        # get these filled in immediately
        self.contigs_db_path, self.profile_db_bam_file_pairs = utils.get_bams_and_profiles_txt_as_data(
            self.bams_and_profiles_file_path)
        self.profile_db_paths = [
            e['profile_db_path']
            for e in self.profile_db_bam_file_pairs.values()
        ]

        # params to identify regions of interest. if you are studying the code, don't forget to read
        # the information stored in the help menu of the program about these parameters
        self.min_coverage_to_define_stretches = A(
            'min_coverage_to_define_stretches') or 10
        self.min_stretch_length = A('min_stretch_length') or 50
        self.min_distance_between_independent_stretches = A(
            'min_distance_between_independent_stretches') or 2000
        # BUG FIX: the lookup key used to be 'num_nts_to_pad-a_stretch' (note
        # the hyphen), which can never exist in args.__dict__ -- argparse
        # stores dests with underscores -- so user-supplied values were
        # silently ignored in favor of the default of 100.
        self.num_nts_to_pad_a_stretch = A('num_nts_to_pad_a_stretch') or 100

        # palindrome search parameters
        self.min_palindrome_length = A('min_palindrome_length') or 10
        self.max_num_mismatches = A('max_num_mismatches') or 0
        # BUG FIX: 'min-distance' (hyphenated) could never match an argparse
        # attribute either; `--min-distance` is stored as 'min_distance'.
        self.min_distance_palindrome = A('min_distance') or 50

        # parameters to survey inversions
        self.process_only_inverted_reads = A('process_only_inverted_reads')

        # be talkative or not
        self.verbose = A('verbose')

        # debugging mode:
        self.only_report_from = A('only_report_from')

        # focused debugging output implies verbosity
        if self.only_report_from:
            self.verbose = True

        if not skip_sanity_check:
            self.sanity_check()

        # we will generate our splits info and contigs to splits dicts here.
        split_names = utils.get_all_item_names_from_the_database(
            self.profile_db_paths[0])
        contigs_db = dbops.ContigsDatabase(self.contigs_db_path,
                                           run=run_quiet,
                                           progress=progress_quiet)
        self.splits_basic_info = contigs_db.db.smart_get(
            t.splits_info_table_name, column='split', data=split_names)
        self.contig_sequences = contigs_db.db.get_table_as_dict(
            t.contig_sequences_table_name)
        contigs_db.disconnect()

        # next, we will generate a dictionary to convert contig names to split names
        self.contig_name_to_split_names = {}
        for split_name in sorted(self.splits_basic_info.keys()):
            contig_name = self.splits_basic_info[split_name]['parent']

            if contig_name not in self.contig_name_to_split_names:
                self.contig_name_to_split_names[contig_name] = []

            self.contig_name_to_split_names[contig_name].append(split_name)

        # let's have a variable of convenience:
        self.contig_names = sorted(list(
            self.contig_name_to_split_names.keys()))
Example #6
0
    def sanity_check(self):
        """Validate inputs and cross-profile consistency before merging.

        Confirms the output directory is usable, a contigs database exists,
        the hierarchical-clustering flags are not contradictory, sample ids
        are unique across input profile databases, profiling parameters agree
        across all profiles, and every profile was generated from the same
        contigs database. Raises ConfigError on any violation.
        """
        self.output_directory = filesnpaths.check_output_directory(self.output_directory, ok_if_exists=self.overwrite_output_destinations)

        if not self.contigs_db_path:
            raise ConfigError("You must provide a contigs database for this operation.")

        if not os.path.exists(self.contigs_db_path):
            raise ConfigError("Anvi'o couldn't find the contigs database where you said it would be :/")

        if self.enforce_hierarchical_clustering and self.skip_hierarchical_clustering:
            raise ConfigError("You are confusing anvi'o :/ You can't tell anvi'o to skip hierarchical clustering\
                                while also asking it to enforce it.")

        self.populate_profile_dbs_info_dict()

        # duplicate sample ids would collide in the merged database
        self.sample_ids_found_in_input_dbs = sorted([v['sample_id'] for v in list(self.profile_dbs_info_dict.values())])
        if len(self.profile_dbs_info_dict) != len(set(self.sample_ids_found_in_input_dbs)):
            raise ConfigError("Sample ids in each single profile database to be merged must be unique. But it is not the case\
                               with your input :/ Here are the sample names in case you would like to find out which ones occur\
                               more than once: '%s'" % (', '.join(self.sample_ids_found_in_input_dbs)))

        # test open the contigs database (and learn its hash while doing it) to make sure we don't have
        # a deal breaker just yet
        contigs_db = dbops.ContigsDatabase(self.contigs_db_path, quiet=True)
        contigs_db_hash = contigs_db.meta['contigs_db_hash']
        contigs_db.disconnect()

        # each (key, human-readable description) pair names a profiling
        # parameter that must be identical across all input profiles
        for k, p in [('total_length', 'The number of nucleotides described'),
                     ('num_contigs', 'The number of contigs'),
                     ('version', 'The version number'),
                     ('num_splits', 'The number of splits'),
                     ('min_contig_length', 'The minimum contig length (-M) values'),
                     ('min_coverage_for_variability', 'The minimum coverage values to report variability (-V)'),
                     ('report_variability_full', 'Whether to report full variability (--report-variability-full) flags'),
                     ('AA_frequencies_profiled', 'Profile AA frequencies flags (--profile-AA-frequencies)'),
                     ('SNVs_profiled', 'SNV profiling flags (--skip-SNV-profiling)')]:
            v = set([r[k] for r in list(self.profile_dbs_info_dict.values())])
            if len(v) > 1:
                raise ConfigError("%s are not identical for all profiles to be merged, which is a \
                                    deal breaker. All profiles that are going to be merged must be\
                                    run with identical flags and parameters :/" % p)

        # get split names from one of the profile databases. split names must be identical across all
        self.split_names = sorted(list(utils.get_all_item_names_from_the_database(list(self.profile_dbs_info_dict.keys())[0])))

        # make sure all runs were profiled using the same contigs database (if one used):
        hashes_for_profile_dbs = set([r['contigs_db_hash'] for r in self.profile_dbs_info_dict.values()])
        if len(hashes_for_profile_dbs) != 1:
            if None in hashes_for_profile_dbs:
                raise ConfigError("It seems there is at least one run in the mix that was profiled using an\
                                          contigs database, and at least one other that was profiled without using\
                                          one. This is not good. All runs must be profiled using the same contigs\
                                          database, or all runs must be profiled without a contigs database :/")
            else:
                raise ConfigError("It seems these runs were profiled using different contigs databases (or\
                                          different versions of the same contigs database). All runs must be\
                                          profiled using the same contigs database, or all runs must be profiled\
                                          without a contigs database :/")


        # make sure the hash for contigs db is identical across all profile databases:
        if list(hashes_for_profile_dbs)[0] != contigs_db_hash:
            raise ConfigError("The contigs database you provided, which is identified with hash '%s', does\
                                      not seem to match the run profiles you are trying to merge, which share the\
                                      hash identifier of '%s'. What's up with that?" % (contigs_db_hash, list(hashes_for_profile_dbs)[0]))

        # do we have a description file?
        if self.description_file_path:
            filesnpaths.is_file_plain_text(self.description_file_path)
            # 'rU' mode was removed in Python 3.11; default text mode with a
            # context manager also ensures the handle is closed (the original
            # leaked it).
            with open(os.path.abspath(self.description_file_path)) as description_file:
                self.description = description_file.read()
Example #7
0
    def sanity_check(self):
        """Validate inputs and cross-profile consistency before merging.

        Confirms the output directory is usable, a contigs database exists,
        the hierarchical-clustering flags are not contradictory, sample ids
        are unique across input profile databases, profiling parameters agree
        across all profiles, and every profile was generated from the same
        contigs database. Raises ConfigError on any violation.
        """
        self.output_directory = filesnpaths.check_output_directory(self.output_directory, ok_if_exists=self.overwrite_output_destinations)

        if not self.contigs_db_path:
            raise ConfigError("You must provide a contigs database for this operation.")

        if not os.path.exists(self.contigs_db_path):
            raise ConfigError("Anvi'o couldn't find the contigs database where you said it would be :/")

        if self.enforce_hierarchical_clustering and self.skip_hierarchical_clustering:
            raise ConfigError("You are confusing anvi'o :/ You can't tell anvi'o to skip hierarchical clustering\
                                while also asking it to enforce it.")

        self.check_dbs_to_be_merged()

        self.populate_profile_dbs_info_dict()

        self.populate_layer_additional_data_dict()

        # duplicate sample ids would collide in the merged database
        self.sample_ids_found_in_input_dbs = sorted([v['sample_id'] for v in list(self.profile_dbs_info_dict.values())])
        if len(self.profile_dbs_info_dict) != len(set(self.sample_ids_found_in_input_dbs)):
            raise ConfigError("Sample ids in each single profile database to be merged must be unique. But it is not the case\
                               with your input :/ Here are the sample names in case you would like to find out which ones occur\
                               more than once: '%s'" % (', '.join(self.sample_ids_found_in_input_dbs)))

        # test open the contigs database (and learn its hash while doing it) to make sure we don't have
        # a deal breaker just yet
        contigs_db = dbops.ContigsDatabase(self.contigs_db_path, quiet=True)
        contigs_db_hash = contigs_db.meta['contigs_db_hash']
        contigs_db.disconnect()

        # each (key, human-readable description) pair names a profiling
        # parameter that must be identical across all input profiles
        for k, p in [('total_length', 'The number of nucleotides described'),
                     ('num_contigs', 'The number of contigs'),
                     ('version', 'The version number'),
                     ('num_splits', 'The number of splits'),
                     ('min_contig_length', 'The minimum contig length (-M) values'),
                     ('max_contig_length', 'The maximum contig length (--max-contig-length) values'),
                     ('min_coverage_for_variability', 'The minimum coverage values to report variability (-V)'),
                     ('report_variability_full', 'Whether to report full variability (--report-variability-full) flags'),
                     ('SCVs_profiled', 'Profile SCVs flags (--profile-SCVs)'),
                     ('SNVs_profiled', 'SNV profiling flags (--skip-SNV-profiling)')]:
            v = set([r[k] for r in list(self.profile_dbs_info_dict.values())])
            if len(v) > 1:
                raise ConfigError("%s are not identical for all profiles to be merged, which is a \
                                    deal breaker. All profiles that are going to be merged must be\
                                    run with identical flags and parameters :/" % p)

        # get split names from one of the profile databases. split names must be identical across all
        self.split_names = sorted(list(utils.get_all_item_names_from_the_database(list(self.profile_dbs_info_dict.keys())[0])))

        # make sure all runs were profiled using the same contigs database (if one used):
        hashes_for_profile_dbs = set([r['contigs_db_hash'] for r in self.profile_dbs_info_dict.values()])
        if len(hashes_for_profile_dbs) != 1:
            if None in hashes_for_profile_dbs:
                raise ConfigError("It seems there is at least one run in the mix that was profiled using an\
                                          contigs database, and at least one other that was profiled without using\
                                          one. This is not good. All runs must be profiled using the same contigs\
                                          database, or all runs must be profiled without a contigs database :/")
            else:
                raise ConfigError("It seems these runs were profiled using different contigs databases (or\
                                          different versions of the same contigs database). All runs must be\
                                          profiled using the same contigs database, or all runs must be profiled\
                                          without a contigs database :/")


        # make sure the hash for contigs db is identical across all profile databases:
        if list(hashes_for_profile_dbs)[0] != contigs_db_hash:
            raise ConfigError("The contigs database you provided, which is identified with hash '%s', does\
                                      not seem to match the run profiles you are trying to merge, which share the\
                                      hash identifier of '%s'. What's up with that?" % (contigs_db_hash, list(hashes_for_profile_dbs)[0]))

        # do we have a description file?
        if self.description_file_path:
            filesnpaths.is_file_plain_text(self.description_file_path)
            # 'rU' mode was removed in Python 3.11; default text mode with a
            # context manager also ensures the handle is closed (the original
            # leaked it).
            with open(os.path.abspath(self.description_file_path)) as description_file:
                self.description = description_file.read()