示例#1
0
 def dump_results_to_full_output(self):
     """
     Move MODELLER's working directory into a per-gene folder of the full output directory.

     The destination is `self.full_modeller_output` joined with
     `self.modeller.corresponding_gene_call`. The destination path is handed to
     `filesnpaths.check_output_directory` first, and then `self.modeller.directory`
     is moved there with `shutil.move`.
     """
     output_root = self.full_modeller_output
     gene_call = self.modeller.corresponding_gene_call
     destination = os.path.join(output_root, gene_call)

     filesnpaths.check_output_directory(destination)
     shutil.move(self.modeller.directory, destination)
示例#2
0
 def dump_results_to_full_output(self):
     """
     Relocate the MODELLER directory of the current gene under the full output directory.

     The target path is built from `self.full_modeller_output` and the gene call stored in
     `self.modeller.corresponding_gene_call`, passed through
     `filesnpaths.check_output_directory`, and finally used as the destination of
     `shutil.move` for `self.modeller.directory`.
     """
     path_parts = (self.full_modeller_output, self.modeller.corresponding_gene_call)
     target_dir = os.path.join(*path_parts)
     filesnpaths.check_output_directory(target_dir)

     source_dir = self.modeller.directory
     shutil.move(source_dir, target_dir)
示例#3
0
    def init_dirs_and_dbs(self):
        """Prepare the output directory and create the single profile database.

        Sets `self.output_directory` (falling back to the input file path plus
        '-ANVIO_PROFILE' when no output directory was given) and
        `self.profile_db_path`, then creates the profile database with the meta
        values assembled below.

        Raises:
            ConfigError: if `self.contigs_db_path` is not set.
        """
        if not self.contigs_db_path:
            # the call form of `raise` is valid on both Python 2 and Python 3, whereas the
            # `raise Class, "message"` statement form is a syntax error on Python 3.
            raise ConfigError("You can not run profiling without a contigs database. You can create\
                                      one using 'anvi-gen-contigs-database'. Not sure how? Please see the\
                                      tutorial: http://merenlab.org/2015/05/02/anvio-tutorial/")

        self.output_directory = filesnpaths.check_output_directory(self.output_directory or self.input_file_path + '-ANVIO_PROFILE',
                                                                   ok_if_exists=self.overwrite_output_destinations)

        self.progress.new('Initializing')

        self.progress.update('Creating the output directory ...')
        filesnpaths.gen_output_directory(self.output_directory, self.progress, delete_if_exists=self.overwrite_output_destinations)

        self.progress.update('Creating a new single profile database with contigs hash "%s" ...' % self.a_meta['contigs_db_hash'])
        self.profile_db_path = self.generate_output_destination('PROFILE.db')
        profile_db = dbops.ProfileDatabase(self.profile_db_path)

        # for a single profile both 'sample_id' and 'samples' carry the same sample id
        meta_values = {'db_type': 'profile',
                       'sample_id': self.sample_id,
                       'samples': self.sample_id,
                       'merged': False,
                       'contigs_clustered': self.contigs_shall_be_clustered,
                       'min_coverage_for_variability': self.min_coverage_for_variability,
                       'default_view': 'single',
                       'min_contig_length': self.min_contig_length,
                       'report_variability_full': self.report_variability_full,
                       'contigs_db_hash': self.a_meta['contigs_db_hash'],
                       'gene_coverages_computed': self.a_meta['genes_are_called']}
        profile_db.create(meta_values)

        self.progress.end()
示例#4
0
    def sanity_check(self):
        """Validate the inputs for splitting a merged profile and pick the bins to work on.

        Sets `self.output_directory`, `self.summary` and `self.bin_names_of_interest`.

        Raises:
            ConfigError: if the contigs or profile database path is missing, if the
                profile database is not a non-blank, merged database of type
                'profile', or if `self.bin_name` is set but is not among the bin
                ids reported by the summary.
        """
        self.output_directory = filesnpaths.check_output_directory(self.output_directory,
                                                                   ok_if_exists=False)

        # both databases are mandatory
        if not self.contigs_db_path:
            raise ConfigError("You must provide a contigs database for this operation.")
        if not self.profile_db_path:
            raise ConfigError("No profile db no cookie. Bye.")

        dbops.is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path)

        profile = dbops.ProfileDatabase(self.profile_db_path)
        meta = profile.meta
        is_splittable = meta['db_type'] == 'profile' and not meta['blank'] and meta['merged']
        if not is_splittable:
            raise ConfigError("You an only split merged profiles :/ We hope this is not a moment of a terrible disappointment.\
                               If it is, you should consider writing to us.")

        self.summary = summarizer.ProfileSummarizer(self.args)
        self.summary.init()

        # by default every bin in the summary is of interest
        self.bin_names_of_interest = sorted(self.summary.bin_ids)

        if not self.bin_name:
            return

        if self.bin_name not in self.bin_names_of_interest:
            raise ConfigError("The bin name you wish to split from this profile databse is not in the collection. Busted!")

        # a specific bin was requested: restrict the work to it
        self.bin_names_of_interest = [self.bin_name]
示例#5
0
    def add_genes(self):
        """Work out which genes to add, validate the request, and run the processing step.

        When `self.skip_genes_if_already_present` is set, genes found in
        `self.structure_db.genes_queried` are reported and dropped from
        `self.genes_of_interest` before anything else happens.

        Raises:
            ConfigError: if the skip flag is set and every requested gene was dropped.
        """
        # identify which genes user wants to model structures for
        self.genes_of_interest = self.get_genes_of_interest(self.genes_to_add_path, self.genes_to_add)

        if self.skip_genes_if_already_present:
            already_present = [gene for gene in self.genes_of_interest if gene in self.structure_db.genes_queried]

            if already_present:
                skipped_ids = ",".join(str(gene) for gene in already_present)
                self.run.info("Redundant gene caller ids that will be skipped", skipped_ids)

                self.genes_of_interest = [gene for gene in self.genes_of_interest if gene not in already_present]

                if not self.genes_of_interest:
                    raise ConfigError("Every gene you wanted to add is already in the database. Since you provided\
                                       the --skip-genes-if-already-present flag, there is nothing to do :)")

        ids_to_add = ",".join(str(gene) for gene in self.genes_of_interest)
        self.run.info("Gene caller ids to be added", ids_to_add)

        self.get_MODELLER_params_used_when_db_was_created()
        self.sanity_check_for_adding_genes()

        # residue annotation
        self.residue_annotation_sources_info = self.get_residue_annotation_sources_info()
        self.residue_annotation_df = pd.DataFrame({})

        # the full MODELLER output directory is only checked when one was requested
        if self.full_modeller_output:
            self.full_modeller_output = filesnpaths.check_output_directory(self.full_modeller_output,
                                                                           ok_if_exists=True)

        self.process()
        self.run.info_single("Anvi'o attempted to add the requested genes. The above log can inform you which were successful.",
                             nl_after=1,
                             nl_before=1)
示例#6
0
    def sanity_check(self):
        """Check the inputs of a split operation and settle which bins will be processed.

        Sets `self.output_directory`, `self.summary` and `self.bin_names_of_interest`,
        and forces `self.args.delete_output_directory_if_exists` to False.

        Raises:
            ConfigError: if the contigs or profile database path is missing, if the
                profile database's 'db_type' is not 'profile', or if `self.bin_name`
                is set but is not among the bin ids reported by the summary.
        """
        self.output_directory = filesnpaths.check_output_directory(self.output_directory,
                                                                   ok_if_exists=True)

        if not self.contigs_db_path:
            raise ConfigError("You must provide a contigs database for this operation.")
        if not self.profile_db_path:
            raise ConfigError("No profile db no cookie. Bye.")

        utils.is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path)

        profile = dbops.ProfileDatabase(self.profile_db_path)
        db_type = profile.meta['db_type']
        if db_type != 'profile':
            raise ConfigError("Anvi'o was trying to split this profile, but it just realized that it is not a profile\
                               database. There is something wrong here.")
        profile.disconnect()

        # unless this is False, the summarizer class attempts to remove the main output
        # directory when it is initialized. keeping the directory lets several anvi-split
        # runs work in parallel on bins of the same collection:
        self.args.delete_output_directory_if_exists = False

        self.summary = summarizer.ProfileSummarizer(self.args, r=self.run, p=self.progress)
        self.summary.init()

        available_bins = sorted(self.summary.bin_ids)
        self.bin_names_of_interest = available_bins

        if self.bin_name:
            if self.bin_name not in available_bins:
                raise ConfigError("The bin name you wish to split from this profile databse is not in the collection. Busted!")

            # a single bin was requested: narrow the list down to it
            self.bin_names_of_interest = [self.bin_name]
示例#7
0
    def sanity_check(self):
        """Verify that a split can be performed and decide which bins are of interest.

        Sets `self.output_directory`, `self.summary` and `self.bin_names_of_interest`.

        Raises:
            ConfigError: if the contigs or profile database path is missing, if the
                profile database is blank or its 'db_type' is not 'profile', or if
                `self.bin_name` is set but is not among the summary's bin ids.
        """
        self.output_directory = filesnpaths.check_output_directory(self.output_directory,
                                                                   ok_if_exists=False)

        if not self.contigs_db_path:
            raise ConfigError("You must provide a contigs database for this operation.")
        if not self.profile_db_path:
            raise ConfigError("No profile db no cookie. Bye.")

        utils.is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path)

        profile = dbops.ProfileDatabase(self.profile_db_path)

        is_blank = profile.meta['blank']
        if is_blank:
            raise ConfigError("The anvi-split workflow is not prepared to deal with blank profiles :/ Sorry!")

        db_type = profile.meta['db_type']
        if db_type != 'profile':
            raise ConfigError("Anvi'o was trying to split this profile, but it just realized that it is not a profile\
                               database. There is something wrong here.")

        profile.disconnect()

        self.summary = summarizer.ProfileSummarizer(self.args)
        self.summary.init()

        all_bins = sorted(self.summary.bin_ids)
        self.bin_names_of_interest = all_bins

        if not self.bin_name:
            return

        if self.bin_name not in all_bins:
            raise ConfigError("The bin name you wish to split from this profile databse is not in the collection. Busted!")

        # only the requested bin will be split
        self.bin_names_of_interest = [self.bin_name]
示例#8
0
    def sanity_check(self):
        """Ensure a known collection was selected and resolve the output directory.

        Sets `self.output_directory` from `filesnpaths.check_output_directory`.

        Raises:
            ConfigError: if `self.collection_name` is empty, or is not a key of
                `self.collections.collections_dict`.
        """
        # the call form of `raise` works on both Python 2 and Python 3; the original
        # `raise Class, "message"` statement form is a syntax error on Python 3.
        if not self.collection_name:
            raise ConfigError("You must specify a collection id :/")

        if self.collection_name not in self.collections.collections_dict:
            raise ConfigError("%s is not a valid collection ID. See a list of available ones with '--list-collections' flag" % self.collection_name)

        self.output_directory = filesnpaths.check_output_directory(self.output_directory, ok_if_exists=True)
示例#9
0
    def sanity_check(self):
        """Ensure a known collection id was selected and resolve the output directory.

        Sets `self.output_directory` from `filesnpaths.check_output_directory`.

        Raises:
            ConfigError: if `self.collection_id` is empty, or is not a key of
                `self.collections.sources_dict`.
        """
        # the call form of `raise` works on both Python 2 and Python 3; the original
        # `raise Class, "message"` statement form is a syntax error on Python 3.
        if not self.collection_id:
            raise ConfigError("You must specify a collection id :/")

        if self.collection_id not in self.collections.sources_dict:
            raise ConfigError("%s is not a valid collection ID. See a list of available ones with '--list-collections' flag" % self.collection_id)

        self.output_directory = filesnpaths.check_output_directory(self.output_directory, ok_if_exists=True)
示例#10
0
    def init_dirs_and_dbs(self):
        """Prepare the output directory and create the profile and auxiliary databases.

        Sets `self.output_directory`, `self.profile_db_path`, `self.auxiliary_db_path`
        and `self.auxiliary_db`. When a description file path is given, its content is
        read into `self.description`. When SNV profiling is skipped, `self.profile_SCVs`
        is forced to False before the meta values are written.

        Raises:
            ConfigError: if `self.contigs_db_path` is not set.
        """
        if not self.contigs_db_path:
            raise ConfigError("You can not run profiling without a contigs database. You can create\
                                one using 'anvi-gen-contigs-database'. Not sure how? Please see the\
                                tutorial: http://merenlab.org/2015/05/02/anvio-tutorial/")

        if self.description_file_path:
            filesnpaths.is_file_plain_text(self.description_file_path)
            # plain 'r' is used instead of 'rU': the 'U' flag was removed in Python 3.11, and
            # Python 3 text mode already applies universal newlines. the `with` block makes
            # sure the file handle is closed once the content is read.
            with open(os.path.abspath(self.description_file_path), 'r') as description_file:
                self.description = description_file.read()

        self.output_directory = filesnpaths.check_output_directory(self.output_directory or self.input_file_path + '-ANVIO_PROFILE',
                                                                   ok_if_exists=self.overwrite_output_destinations)

        self.progress.new('Initializing')

        self.progress.update('Creating the output directory ...')
        filesnpaths.gen_output_directory(self.output_directory, self.progress, delete_if_exists=self.overwrite_output_destinations)

        self.progress.update('Creating a new single profile database with contigs hash "%s" ...' % self.a_meta['contigs_db_hash'])
        self.profile_db_path = self.generate_output_destination('PROFILE.db')
        profile_db = dbops.ProfileDatabase(self.profile_db_path)

        # SCV profiling is switched off whenever SNV profiling is skipped
        if self.skip_SNV_profiling:
            self.profile_SCVs = False

        meta_values = {'db_type': 'profile',
                       'anvio': __version__,
                       'sample_id': self.sample_id,
                       'samples': self.sample_id,
                       'merged': False,
                       'blank': self.blank,
                       'items_ordered': False,
                       'default_view': 'single',
                       'min_contig_length': self.min_contig_length,
                       'max_contig_length': self.max_contig_length,
                       'SNVs_profiled': not self.skip_SNV_profiling,
                       'SCVs_profiled': self.profile_SCVs,
                       'min_coverage_for_variability': self.min_coverage_for_variability,
                       'report_variability_full': self.report_variability_full,
                       'contigs_db_hash': self.a_meta['contigs_db_hash'],
                       'description': self.description if self.description else '_No description is provided_'}
        profile_db.create(meta_values)

        self.progress.update('Creating a new auxiliary database with contigs hash "%s" ...' % self.a_meta['contigs_db_hash'])
        self.auxiliary_db_path = self.generate_output_destination('AUXILIARY-DATA.db')
        self.auxiliary_db = auxiliarydataops.AuxiliaryDataForSplitCoverages(self.auxiliary_db_path,
                                                                            self.a_meta['contigs_db_hash'],
                                                                            create_new=True,
                                                                            run=null_run,
                                                                            progress=null_progress)

        self.progress.end()

        # warnings are issued only after the progress display has ended
        if self.skip_SNV_profiling:
            self.run.warning('Single-nucleotide variation will not be characterized for this profile.')

        if not self.profile_SCVs:
            self.run.warning('Amino acid linkmer frequencies will not be characterized for this profile.')
示例#11
0
文件: profiler.py 项目: meren/anvio
    def init_dirs_and_dbs(self):
        """Set up the output directory, then create the profile and auxiliary databases.

        Sets `self.output_directory`, `self.profile_db_path`, `self.auxiliary_db_path`
        and `self.auxiliary_db`. If `self.description_file_path` is set, the file's
        content becomes `self.description`. If `self.skip_SNV_profiling` is set,
        `self.profile_SCVs` is forced to False before the meta values are stored.

        Raises:
            ConfigError: if `self.contigs_db_path` is not set.
        """
        if not self.contigs_db_path:
            raise ConfigError("You can not run profiling without a contigs database. You can create\
                                one using 'anvi-gen-contigs-database'. Not sure how? Please see the\
                                tutorial: http://merenlab.org/2015/05/02/anvio-tutorial/")

        if self.description_file_path:
            filesnpaths.is_file_plain_text(self.description_file_path)
            # mode 'rU' is rejected by Python 3.11+ and is redundant on Python 3, where text
            # mode already translates newlines. reading inside `with` closes the handle
            # deterministically instead of leaving it to garbage collection.
            with open(os.path.abspath(self.description_file_path), 'r') as description_file:
                self.description = description_file.read()

        self.output_directory = filesnpaths.check_output_directory(self.output_directory or self.input_file_path + '-ANVIO_PROFILE',
                                                                   ok_if_exists=self.overwrite_output_destinations)

        self.progress.new('Initializing')

        self.progress.update('Creating the output directory ...')
        filesnpaths.gen_output_directory(self.output_directory, self.progress, delete_if_exists=self.overwrite_output_destinations)

        self.progress.update('Creating a new single profile database with contigs hash "%s" ...' % self.a_meta['contigs_db_hash'])
        self.profile_db_path = self.generate_output_destination('PROFILE.db')
        profile_db = dbops.ProfileDatabase(self.profile_db_path)

        # without SNV profiling there is no SCV profiling either
        if self.skip_SNV_profiling:
            self.profile_SCVs = False

        meta_values = {'db_type': 'profile',
                       'anvio': __version__,
                       'sample_id': self.sample_id,
                       'samples': self.sample_id,
                       'merged': False,
                       'blank': self.blank,
                       'items_ordered': False,
                       'default_view': 'single',
                       'min_contig_length': self.min_contig_length,
                       'max_contig_length': self.max_contig_length,
                       'SNVs_profiled': not self.skip_SNV_profiling,
                       'SCVs_profiled': self.profile_SCVs,
                       'min_coverage_for_variability': self.min_coverage_for_variability,
                       'report_variability_full': self.report_variability_full,
                       'contigs_db_hash': self.a_meta['contigs_db_hash'],
                       'description': self.description if self.description else '_No description is provided_'}
        profile_db.create(meta_values)

        self.progress.update('Creating a new auxiliary database with contigs hash "%s" ...' % self.a_meta['contigs_db_hash'])
        self.auxiliary_db_path = self.generate_output_destination('AUXILIARY-DATA.db')
        self.auxiliary_db = auxiliarydataops.AuxiliaryDataForSplitCoverages(self.auxiliary_db_path,
                                                                            self.a_meta['contigs_db_hash'],
                                                                            create_new=True,
                                                                            run=null_run,
                                                                            progress=null_progress)

        self.progress.end()

        # the warnings below are emitted once the progress display is closed
        if self.skip_SNV_profiling:
            self.run.warning('Single-nucleotide variation will not be characterized for this profile.')

        if not self.profile_SCVs:
            self.run.warning('Amino acid linkmer frequencies will not be characterized for this profile.')
示例#12
0
    def init_dirs_and_dbs(self):
        """Prepare the output directory and create the single profile database.

        Sets `self.output_directory` and `self.profile_db_path`. When SNV profiling
        is skipped, `self.profile_AA_frequencies` is forced to False before the meta
        values are written to the new profile database.

        Raises:
            ConfigError: if `self.contigs_db_path` is not set.
        """
        if not self.contigs_db_path:
            # the call form of `raise` is valid on both Python 2 and Python 3, whereas the
            # `raise Class, "message"` statement form is a syntax error on Python 3.
            raise ConfigError("You can not run profiling without a contigs database. You can create\
                                one using 'anvi-gen-contigs-database'. Not sure how? Please see the\
                                tutorial: http://merenlab.org/2015/05/02/anvio-tutorial/")

        self.output_directory = filesnpaths.check_output_directory(self.output_directory or self.input_file_path + '-ANVIO_PROFILE',
                                                                   ok_if_exists=self.overwrite_output_destinations)

        self.progress.new('Initializing')

        self.progress.update('Creating the output directory ...')
        filesnpaths.gen_output_directory(
            self.output_directory,
            self.progress,
            delete_if_exists=self.overwrite_output_destinations)

        self.progress.update(
            'Creating a new single profile database with contigs hash "%s" ...'
            % self.a_meta['contigs_db_hash'])
        self.profile_db_path = self.generate_output_destination('PROFILE.db')
        profile_db = dbops.ProfileDatabase(self.profile_db_path)

        # amino acid frequency profiling is switched off whenever SNV profiling is skipped
        if self.skip_SNV_profiling:
            self.profile_AA_frequencies = False

        meta_values = {
            'db_type': 'profile',
            'anvio': __version__,
            'sample_id': self.sample_id,
            'samples': self.sample_id,
            'merged': False,
            'blank': self.blank,
            'contigs_clustered': self.contigs_shall_be_clustered,
            'default_view': 'single',
            'min_contig_length': self.min_contig_length,
            'SNVs_profiled': not self.skip_SNV_profiling,
            'AA_frequencies_profiled': self.profile_AA_frequencies,
            'min_coverage_for_variability': self.min_coverage_for_variability,
            'report_variability_full': self.report_variability_full,
            'contigs_db_hash': self.a_meta['contigs_db_hash'],
            'gene_coverages_computed': self.a_meta['genes_are_called']
        }
        profile_db.create(meta_values)

        self.progress.end()

        # warnings are issued only after the progress display has ended
        if self.skip_SNV_profiling:
            self.run.warning(
                'Single-nucleotide variation will not be characterized for this profile.'
            )

        if not self.profile_AA_frequencies:
            self.run.warning(
                'Amino acid linkmer frequencies will not be characterized for this profile.'
            )
示例#13
0
    def init_dirs_and_dbs(self):
        """Prepare the output directory, read the annotation database, and create the profile database.

        Sets `self.output_directory`, `self.split_length`, `self.annotation_hash`,
        `self.contig_names_in_annotation_db` and `self.profile_db_path`.

        Raises:
            ConfigError: if `self.annotation_db_path` is not set.
        """
        if not self.annotation_db_path:
            # the call form of `raise` is valid on both Python 2 and Python 3, whereas the
            # `raise Class, "message"` statement form is a syntax error on Python 3.
            raise ConfigError("You can not run profiling without an annotation database. You can create\
                                      one using 'anvi-gen-annotation-database'. Not sure how? Please see the\
                                      user manual.")

        self.output_directory = filesnpaths.check_output_directory(
            self.output_directory or self.input_file_path + "-ANVIO_PROFILE",
            ok_if_exists=self.overwrite_output_destinations,
        )

        self.progress.new("Initializing")

        self.progress.update("Creating the output directory ...")
        filesnpaths.gen_output_directory(
            self.output_directory, self.progress, delete_if_exists=self.overwrite_output_destinations
        )

        # everything needed from the annotation database is copied onto `self` so the
        # connection can be closed right away
        self.progress.update("Initializing the annotation database ...")
        annotation_db = dbops.AnnotationDatabase(self.annotation_db_path)
        self.split_length = int(annotation_db.meta["split_length"])
        self.annotation_hash = annotation_db.meta["annotation_hash"]
        self.contig_names_in_annotation_db = set(
            annotation_db.db.get_table_as_dict(t.contigs_info_table_name, string_the_key=True).keys()
        )
        annotation_db.disconnect()

        self.progress.update(
            'Creating a new single profile database with annotation hash "%s" ...' % self.annotation_hash
        )
        self.profile_db_path = self.generate_output_destination("PROFILE.db")
        profile_db = dbops.ProfileDatabase(self.profile_db_path)

        meta_values = {
            "db_type": "profile",
            "sample_id": self.sample_id,
            "samples": self.sample_id,
            "merged": False,
            "contigs_clustered": self.contigs_shall_be_clustered,
            "min_coverage_for_variability": self.min_coverage_for_variability,
            "default_view": "single",
            "min_contig_length": self.min_contig_length,
            "report_variability_full": self.report_variability_full,
            "annotation_hash": self.annotation_hash,
        }
        profile_db.create(meta_values)

        self.progress.end()
示例#14
0
    def sanity_check(self):
        """Verify that a split can be performed and decide which bins are of interest.

        Sets `self.output_directory`, `self.summary` and `self.bin_names_of_interest`,
        and forces `self.args.delete_output_directory_if_exists` to False.

        Raises:
            ConfigError: if the contigs or profile database path is missing, if the
                profile database is blank or its 'db_type' is not 'profile', or if
                `self.bin_name` is set but is not among the summary's bin ids.
        """
        self.output_directory = filesnpaths.check_output_directory(self.output_directory, ok_if_exists=True)

        if not self.contigs_db_path:
            raise ConfigError("You must provide a contigs database for this operation.")
        if not self.profile_db_path:
            raise ConfigError("No profile db no cookie. Bye.")

        utils.is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path)

        profile = dbops.ProfileDatabase(self.profile_db_path)

        is_blank = profile.meta['blank']
        if is_blank:
            raise ConfigError("The anvi-split workflow is not prepared to deal with blank profiles :/ Sorry!")

        db_type = profile.meta['db_type']
        if db_type != 'profile':
            raise ConfigError("Anvi'o was trying to split this profile, but it just realized that it is not a profile\
                               database. There is something wrong here.")

        profile.disconnect()

        # unless this is False, the summarizer class attempts to remove the main output
        # directory when it is initialized. keeping the directory lets several anvi-split
        # runs work in parallel on bins of the same collection:
        self.args.delete_output_directory_if_exists = False

        self.summary = summarizer.ProfileSummarizer(self.args)
        self.summary.init()

        all_bins = sorted(self.summary.bin_ids)
        self.bin_names_of_interest = all_bins

        if not self.bin_name:
            return

        if self.bin_name not in all_bins:
            raise ConfigError("The bin name you wish to split from this profile databse is not in the collection. Busted!")

        # only the requested bin will be split
        self.bin_names_of_interest = [self.bin_name]
示例#15
0
    def add_genes(self):
        """Resolve the genes to add, validate the request, and run the processing step.

        Sets `self.genes_of_interest`, `self.annotation_sources_info` and
        `self.res_annotation_df`; when `self.full_modeller_output` is set it is
        replaced by the value returned from `filesnpaths.check_output_directory`.
        """
        # identify which genes user wants to model structures for
        requested_genes = self.get_genes_of_interest(self.genes_to_add_path, self.genes_to_add)
        self.genes_of_interest = requested_genes

        gene_ids_text = ", ".join(str(gene) for gene in requested_genes)
        self.run.info("Gene caller ids to be added", gene_ids_text)

        self.get_MODELLER_params_used_when_db_was_created()
        self.sanity_check_for_adding_genes()

        # residue annotation
        self.annotation_sources_info = self.get_annotation_sources_info()
        self.res_annotation_df = pd.DataFrame({})

        # the full MODELLER output directory is only checked when one was requested
        if self.full_modeller_output:
            checked_dir = filesnpaths.check_output_directory(self.full_modeller_output, ok_if_exists=True)
            self.full_modeller_output = checked_dir

        self.process()

        closing_note = "Anvi'o attempted to add the requested genes. The above log can inform you which were successful."
        self.run.info_single(closing_note, nl_after=1, nl_before=1)
示例#16
0
    def sanity_check(self):
        """Validate everything that must hold before single profiles are merged.

        The checks run in this order: the output directory; the presence and existence
        of the contigs database; contradictory clustering flags; the profile databases
        themselves (via `check_dbs_to_be_merged` and the two `populate_*` helpers);
        uniqueness of sample ids; agreement of profiling parameters across profiles
        (downgraded to a warning when `anvio.FORCE` is set); agreement of the contigs
        database hash across profiles and with the contigs database given. Finally the
        optional description file is read into `self.description`.

        Sets `self.output_directory`, `self.sample_ids_found_in_input_dbs`,
        `self.split_names` and, when a description file is given, `self.description`.

        Raises:
            ConfigError: if any of the checks above fails.
        """
        self.output_directory = filesnpaths.check_output_directory(
            self.output_directory,
            ok_if_exists=self.overwrite_output_destinations)

        if not self.contigs_db_path:
            raise ConfigError(
                "You must provide a contigs database for this operation.")

        if not os.path.exists(self.contigs_db_path):
            raise ConfigError(
                "Anvi'o couldn't find the contigs database where you said it would be :/"
            )

        if self.enforce_hierarchical_clustering and self.skip_hierarchical_clustering:
            raise ConfigError(
                "You are confusing anvi'o :/ You can't tell anvi'o to skip hierarchical clustering\
                                while also asking it to enforce it.")

        self.check_dbs_to_be_merged()

        self.populate_profile_dbs_info_dict()

        self.populate_layer_additional_data_dict()

        self.sample_ids_found_in_input_dbs = sorted(
            v['sample_id'] for v in self.profile_dbs_info_dict.values())
        # fewer distinct sample ids than profile databases means at least one id repeats
        if len(self.profile_dbs_info_dict) != len(
                set(self.sample_ids_found_in_input_dbs)):
            raise ConfigError(
                "Sample ids in each single profile database to be merged must be unique. But it is not the case\
                               with your input :/ Here are the sample names in case you would like to find out which ones occur\
                               more than once: '%s'" %
                (', '.join(self.sample_ids_found_in_input_dbs)))

        # test open the contigs database (and learn its hash while doing it) to make sure we don't have
        # a deal breaker just yet
        contigs_db = dbops.ContigsDatabase(self.contigs_db_path, quiet=True)
        contigs_db_hash = contigs_db.meta['contigs_db_hash']
        contigs_db.disconnect()

        # each (key, label) pair names a value that must be identical across all profiles;
        # the label is interpolated into the warning / error message below
        for k, p in [
            ('total_length', 'The number of nucleotides described'),
            ('num_contigs', 'The number of contigs'),
            ('version', 'The version number'),
            ('num_splits', 'The number of splits'),
            ('min_contig_length', 'The minimum contig length (-M) values'),
            ('max_contig_length',
             'The maximum contig length (--max-contig-length) values'),
            ('min_coverage_for_variability',
             'The minimum coverage values to report variability (-V)'),
            ('report_variability_full',
             'Whether to report full variability (--report-variability-full) flags'),
            ('SCVs_profiled', 'Profile SCVs flags (--profile-SCVs)'),
            ('SNVs_profiled', 'SNV profiling flags (--skip-SNV-profiling)')
        ]:
            v = {r[k] for r in self.profile_dbs_info_dict.values()}
            if len(v) > 1:
                if anvio.FORCE:
                    self.run.warning(
                        "Anvio'o found out that %s is not identical across all your profiles, but since you\
                                      have used the `--force` flag, it will continue with the merge. This is very\
                                      dangerous, and even if merging finishes succesfully, it does not mean you can trust\
                                      your results to be error free. We believe you are prepared to deal with potential\
                                      implications of forcing things because you are awesome."
                        % p,
                        lc="cyan")
                else:
                    raise ConfigError(
                        "Ouch. %s are not identical for all profiles to be merged, which is a \
                                       deal breaker. All profiles that are going to be merged must be\
                                       run with identical flags and parameters :/ You really shouldn't but if you want to\
                                       try to force things because you believe this is due to a misunderstanding, you can\
                                       use the flag --force. While you are considering this as an option, please also\
                                       remember that this we advice against it.."
                        % p)

        # get split names from one of the profile databases. split names must be identical across all
        self.split_names = sorted(
            utils.get_all_item_names_from_the_database(
                list(self.profile_dbs_info_dict.keys())[0]))

        # make sure all runs were profiled using the same contigs database (if one used):
        hashes_for_profile_dbs = {
            r['contigs_db_hash'] for r in self.profile_dbs_info_dict.values()
        }
        if len(hashes_for_profile_dbs) != 1:
            if None in hashes_for_profile_dbs:
                raise ConfigError(
                    "It seems there is at least one run in the mix that was profiled using an\
                                          contigs database, and at least one other that was profiled without using\
                                          one. This is not good. All runs must be profiled using the same contigs\
                                          database, or all runs must be profiled without a contigs database :/"
                )
            else:
                raise ConfigError(
                    "It seems these runs were profiled using different contigs databases (or\
                                          different versions of the same contigs database). All runs must be\
                                          profiled using the same contigs database, or all runs must be profiled\
                                          without a contigs database :/")

        # make sure the hash for contigs db is identical across all profile databases:
        if list(hashes_for_profile_dbs)[0] != contigs_db_hash:
            raise ConfigError(
                "The contigs database you provided, which is identified with hash '%s', does\
                                      not seem to match the run profiles you are trying to merge, which share the\
                                      hash identifier of '%s'. What's up with that?"
                % (contigs_db_hash, list(hashes_for_profile_dbs)[0]))

        # do we have a description file?
        if self.description_file_path:
            filesnpaths.is_file_plain_text(self.description_file_path)
            # plain 'r' is used instead of 'rU': the 'U' flag was removed in Python 3.11, and
            # Python 3 text mode already applies universal newlines. the `with` block makes
            # sure the file handle is closed once the content is read.
            with open(os.path.abspath(self.description_file_path), 'r') as description_file:
                self.description = description_file.read()
示例#17
0
    def sanity_check(self):
        """Validate every input of the merge operation before any work is done.

        The checks run in this order: output directory, number of RUNINFO
        files, presence of the contigs database and of every RUNINFO file,
        compatibility of the clustering flags, readability of the contigs and
        profile databases, absence of merged or blank profiles, identical
        profiling parameters across runs, and finally agreement between the
        contigs database hash stored in the profiles and the hash of the
        contigs database given by the user.

        Sets `self.output_directory` and `self.split_names`, and reports each
        shared profiling parameter through `self.run.info`.

        Raises:
            ConfigError: as soon as one of the checks above fails.
        """
        self.output_directory = filesnpaths.check_output_directory(
            self.output_directory,
            ok_if_exists=self.overwrite_output_destinations)

        if not len(self.input_runinfo_paths) > 1:
            raise ConfigError("You need to provide at least 2 RUNINFO.cp files for this program\
                                           to be useful.")

        if not self.contigs_db_path:
            raise ConfigError("You must provide a contigs database for this operation.")
        if not os.path.exists(self.contigs_db_path):
            raise ConfigError("Anvi'o couldn't find the contigs database where you said it would be :/")

        missing = [
            p for p in self.input_runinfo_paths if not os.path.exists(p)
        ]
        if missing:
            raise ConfigError("%s not found: %s." % (
                'Some files are' if len(missing) > 1 else "File is",
                ', '.join(missing)))

        if self.enforce_hierarchical_clustering and self.skip_hierarchical_clustering:
            raise ConfigError("You are confusing anvi'o :/ You can't tell anvi'o to skip hierarchical clustering\
                                while also asking it to enforce it.")

        self.read_runinfo_dicts()

        # test open the contigs database (and learn its hash while doing it) to make sure we don't have
        # a deal breaker just yet
        contigs_db = dbops.ContigsDatabase(self.contigs_db_path, quiet=True)
        contigs_db_hash = contigs_db.meta['contigs_db_hash']
        contigs_db.disconnect()

        # materialize the run infos once: a list can be indexed on both Python 2 and
        # Python 3, whereas a Python 3 `dict.values()` view can not
        sample_runinfos = list(self.input_runinfo_dicts.values())

        # test open all profile databases to make sure you are golden with versions
        for runinfo in sample_runinfos:
            sample_profile_db = anvio.db.DB(runinfo['profile_db'],
                                            anvio.__profile__version__)
            sample_profile_db.disconnect()

        if any(v['merged'] for v in sample_runinfos):
            raise ConfigError("This is very cute, but you can't merge already merged runs. anvio can only merge\
                                      individual profiles (which are generated through anvi-profile program). Sorry.")

        if any(v['blank'] for v in sample_runinfos):
            raise ConfigError("Do you have a blank profile in there? Because it seems you do :/ Well, here is the problem:\
                                blank profiles are merely useful to play with a contigs database when no mapping data is\
                                available, and they are not supposed to be merged.")

        for k, p in [
            ('total_length', 'Number of nucleotides described'),
            ('num_contigs', 'Number of contigs'),
            ('num_splits', 'Number of splits'),
            ('split_length', 'Split length (-L)'),
            ('min_contig_length', 'Minimum contig length (-M)'),
            ('min_mean_coverage', 'Minimum mean coverage (-C)'),
            ('min_coverage_for_variability',
             'Minimum coverage to report variability (-V)'),
            ('report_variability_full',
             'Report full variability (--report-variability-full)'),
            ('profile_AA_frequencies',
             'Profile AA frequencies parameter (--profile-AA-frequencies)'),
            ('skip_SNV_profiling',
             'Skip SNV profiling parameter (--skip-SNV-profiling)')
        ]:
            v = set(r[k] for r in sample_runinfos)
            if len(v) > 1:
                raise ConfigError("%s is not identical for all profiles to be merged, which is a \
                                    deal breaker. All profiles that are going to be merged must be\
                                    run with identical flags and parameters :/" % p)

            # so we carry over this information into the runinfo dict for merged runs:
            self.run.info(k, v.pop())

        # get split names from one of the profile databases. split names must be identical across all
        self.split_names = sorted(
            dbops.get_split_names_in_profile_db(sample_runinfos[0]['profile_db']))

        # make sure all runs were profiled using the same contigs database (if one used):
        hashes_for_profile_dbs = set(r['contigs_db_hash'] for r in sample_runinfos)
        if len(hashes_for_profile_dbs) != 1:
            if None in hashes_for_profile_dbs:
                raise ConfigError("It seems there is at least one run in the mix that was profiled using an\
                                          contigs database, and at least one other that was profiled without using\
                                          one. This is not good. All runs must be profiled using the same contigs\
                                          database, or all runs must be profiled without a contigs database :/")

            else:
                raise ConfigError("It seems these runs were profiled using different contigs databases (or\
                                          different versions of the same contigs database). All runs must be\
                                          profiled using the same contigs database, or all runs must be profiled\
                                          without a contigs database :/")

        # make sure the hash for contigs db is identical across all profile databases.
        # a set can not be indexed, so its single member is extracted through a list
        # (indexing the set directly raised a TypeError in place of the ConfigError below)
        profile_dbs_hash = list(hashes_for_profile_dbs)[0]
        if profile_dbs_hash != contigs_db_hash:
            raise ConfigError("The contigs database you provided, which is identified with hash '%s', does\
                                      not seem to match the run profiles you are trying to merge, which share the\
                                      hash identifier of '%s'. What's up with that?" % (
                contigs_db_hash, profile_dbs_hash))
示例#18
0
    def sanity_check(self):
        """Validate every input of the merge operation before any work is done.

        The checks run in this order: output directory, presence of the
        contigs database, compatibility of the clustering flags, uniqueness of
        sample ids, identical profiling parameters across the profile
        databases, and agreement between the contigs database hash stored in
        the profiles and the hash of the contigs database given by the user.

        Sets `self.output_directory`, `self.sample_ids_found_in_input_dbs`,
        `self.split_names`, and, when a description file path was given,
        `self.description`.

        Raises:
            ConfigError: as soon as one of the checks above fails.
        """
        self.output_directory = filesnpaths.check_output_directory(
            self.output_directory,
            ok_if_exists=self.overwrite_output_destinations)

        if not self.contigs_db_path:
            raise ConfigError(
                "You must provide a contigs database for this operation.")

        if not os.path.exists(self.contigs_db_path):
            raise ConfigError(
                "Anvi'o couldn't find the contigs database where you said it would be :/"
            )

        if self.enforce_hierarchical_clustering and self.skip_hierarchical_clustering:
            raise ConfigError(
                "You are confusing anvi'o :/ You can't tell anvi'o to skip hierarchical clustering\
                                while also asking it to enforce it.")

        self.populate_profile_dbs_info_dict()

        self.sample_ids_found_in_input_dbs = sorted(
            v['sample_id'] for v in self.profile_dbs_info_dict.values())
        # a duplicated sample id makes the set smaller than the number of profiles
        if len(self.profile_dbs_info_dict) != len(
                set(self.sample_ids_found_in_input_dbs)):
            raise ConfigError(
                "Sample ids in each single profile database to be merged must be unique. But it is not the case\
                               with your input :/ Here are the sample names in case you would like to find out which ones occur\
                               more than once: '%s'" %
                (', '.join(self.sample_ids_found_in_input_dbs)))

        # test open the contigs database (and learn its hash while doing it) to make sure we don't have
        # a deal breaker just yet
        contigs_db = dbops.ContigsDatabase(self.contigs_db_path, quiet=True)
        contigs_db_hash = contigs_db.meta['contigs_db_hash']
        contigs_db.disconnect()

        for k, p in [
            ('total_length', 'The number of nucleotides described'),
            ('num_contigs', 'The number of contigs'),
            ('version', 'The version number'),
            ('num_splits', 'The number of splits'),
            ('min_contig_length', 'The minimum contig length (-M) values'),
            ('min_coverage_for_variability',
             'The minimum coverage values to report variability (-V)'),
            ('report_variability_full',
             'Whether to report full variability (--report-variability-full) flags'
             ),
            ('AA_frequencies_profiled',
             'Profile AA frequencies flags (--profile-AA-frequencies)'),
            ('SNVs_profiled', 'SNV profiling flags (--skip-SNV-profiling)')
        ]:
            v = set(r[k] for r in self.profile_dbs_info_dict.values())
            if len(v) > 1:
                raise ConfigError(
                    "%s are not identical for all profiles to be merged, which is a \
                                    deal breaker. All profiles that are going to be merged must be\
                                    run with identical flags and parameters :/"
                    % p)

        # get split names from one of the profile databases. split names must be identical across all
        self.split_names = sorted(
            dbops.get_split_names_in_profile_db(
                list(self.profile_dbs_info_dict.keys())[0]))

        # make sure all runs were profiled using the same contigs database (if one used):
        hashes_for_profile_dbs = set(
            r['contigs_db_hash'] for r in self.profile_dbs_info_dict.values())
        if len(hashes_for_profile_dbs) != 1:
            if None in hashes_for_profile_dbs:
                raise ConfigError(
                    "It seems there is at least one run in the mix that was profiled using an\
                                          contigs database, and at least one other that was profiled without using\
                                          one. This is not good. All runs must be profiled using the same contigs\
                                          database, or all runs must be profiled without a contigs database :/"
                )
            else:
                raise ConfigError(
                    "It seems these runs were profiled using different contigs databases (or\
                                          different versions of the same contigs database). All runs must be\
                                          profiled using the same contigs database, or all runs must be profiled\
                                          without a contigs database :/")

        # make sure the hash for contigs db is identical across all profile databases:
        profile_dbs_hash = list(hashes_for_profile_dbs)[0]
        if profile_dbs_hash != contigs_db_hash:
            raise ConfigError(
                "The contigs database you provided, which is identified with hash '%s', does\
                                      not seem to match the run profiles you are trying to merge, which share the\
                                      hash identifier of '%s'. What's up with that?"
                % (contigs_db_hash, profile_dbs_hash))

        # do we have a description file?
        if self.description_file_path:
            filesnpaths.is_file_plain_text(self.description_file_path)
            # plain 'r' is used because the 'U' flag was removed in Python 3.11 (text mode
            # already translates newlines); the context manager closes the file handle
            with open(os.path.abspath(self.description_file_path), 'r') as description_file:
                self.description = description_file.read()
示例#19
0
    def sanity_check(self):
        """Validate every input of the merge operation before any work is done.

        The checks run in this order: output directory, number of RUNINFO
        files, presence of the contigs database and of every RUNINFO file,
        absence of already merged runs, identical profiling parameters across
        runs, and agreement between the contigs database hash stored in the
        profiles and the hash of the contigs database given by the user.

        Sets `self.output_directory` and `self.split_names`, and reports each
        shared profiling parameter through `self.run.info`.

        Raises:
            ConfigError: as soon as one of the checks above fails.
        """
        self.output_directory = filesnpaths.check_output_directory(self.output_directory, ok_if_exists = self.overwrite_output_destinations)

        if not len(self.input_runinfo_paths) > 1:
            raise ConfigError("You need to provide at least 2 RUNINFO.cp files for this program\
                                           to be useful.")

        if not self.contigs_db_path:
            raise ConfigError("You must provide a contigs database for this operation.")
        if not os.path.exists(self.contigs_db_path):
            raise ConfigError("anvio couldn't find the contigs database where you said it would be :/")

        missing = [p for p in self.input_runinfo_paths if not os.path.exists(p)]
        if missing:
            raise ConfigError("%s not found: %s." % ('Some files are' if len(missing) > 1 else "File is",
                                                     ', '.join(missing)))

        self.read_runinfo_dicts()

        # materialize the run infos once: a list can be indexed on both Python 2 and
        # Python 3, whereas a Python 3 `dict.values()` view can not
        sample_runinfos = list(self.input_runinfo_dicts.values())

        if any(v['merged'] for v in sample_runinfos):
            raise ConfigError("This is very cute, but you can't merge already merged runs. anvio can only merge\
                                      individual profiles (which are generated through anvi-profile program). Sorry.")

        for k, p in [('total_length', 'Number of nucleotides described'),
                     ('num_contigs', 'Number of contigs'),
                     ('num_splits', 'Number of splits'),
                     ('split_length', 'Split length (-L)'),
                     ('min_contig_length', 'Minimum contig length (-M)'),
                     ('min_mean_coverage', 'Minimum mean coverage (-C)'),
                     ('min_coverage_for_variability', 'Minimum coverage to report variability (-V)')]:
            v = set(r[k] for r in sample_runinfos)
            if len(v) > 1:
                raise ConfigError("%s is not identical for all runs to be merged, which is a \
                                          deal breaker. You need to profile all runs to be merged with\
                                          identical parameters :/" % p)

            # so we carry over this information into the runinfo dict for merged runs:
            self.run.info(k, v.pop())

        # get split names from one of the profile databases. split names must be identical across all
        self.split_names = sorted(dbops.get_split_names_in_profile_db(sample_runinfos[0]['profile_db']))

        # make sure all runs were profiled using the same contigs database (if one used):
        hashes_for_profile_dbs = set(r['contigs_db_hash'] for r in sample_runinfos)
        if len(hashes_for_profile_dbs) != 1:
            if None in hashes_for_profile_dbs:
                raise ConfigError("It seems there is at least one run in the mix that was profiled using an\
                                          contigs database, and at least one other that was profiled without using\
                                          one. This is not good. All runs must be profiled using the same contigs\
                                          database, or all runs must be profiled without a contigs database :/")
            else:
                raise ConfigError("It seems these runs were profiled using different contigs databases (or\
                                          different versions of the same contigs database). All runs must be\
                                          profiled using the same contigs database, or all runs must be profiled\
                                          without a contigs database :/")

        # make sure contigs hash that is common across runs is also identical to the contigs database
        contigs_db = dbops.ContigsDatabase(self.contigs_db_path, quiet = True)
        contigs_db_hash = contigs_db.meta['contigs_db_hash']
        contigs_db.disconnect()

        # a set can not be indexed, so its single member is extracted through a list
        # (indexing the set directly raised a TypeError in place of the ConfigError below)
        profile_dbs_hash = list(hashes_for_profile_dbs)[0]
        if profile_dbs_hash != contigs_db_hash:
            raise ConfigError("The contigs database you provided, which is identified with hash '%s', does\
                                      not seem to match the run profiles you are trying to merge, which share the\
                                      hash identifier of '%s'. What's up with that?" % (contigs_db_hash, profile_dbs_hash))
示例#20
0
    def sanity_check(self):
        """Validate every input of the merge operation before any work is done.

        The checks run in this order: output directory, presence of the
        contigs database, compatibility of the clustering flags, the checks
        and bookkeeping delegated to `check_dbs_to_be_merged`,
        `populate_profile_dbs_info_dict` and
        `populate_layer_additional_data_dict`, uniqueness of sample ids,
        identical profiling parameters across the profile databases, and
        agreement between the contigs database hash stored in the profiles and
        the hash of the contigs database given by the user.

        Sets `self.output_directory`, `self.sample_ids_found_in_input_dbs`,
        `self.split_names`, and, when a description file path was given,
        `self.description`.

        Raises:
            ConfigError: as soon as one of the checks above fails.
        """
        self.output_directory = filesnpaths.check_output_directory(self.output_directory, ok_if_exists=self.overwrite_output_destinations)

        if not self.contigs_db_path:
            raise ConfigError("You must provide a contigs database for this operation.")

        if not os.path.exists(self.contigs_db_path):
            raise ConfigError("Anvi'o couldn't find the contigs database where you said it would be :/")

        if self.enforce_hierarchical_clustering and self.skip_hierarchical_clustering:
            raise ConfigError("You are confusing anvi'o :/ You can't tell anvi'o to skip hierarchical clustering\
                                while also asking it to enforce it.")

        self.check_dbs_to_be_merged()

        self.populate_profile_dbs_info_dict()

        self.populate_layer_additional_data_dict()

        self.sample_ids_found_in_input_dbs = sorted(v['sample_id'] for v in self.profile_dbs_info_dict.values())
        # a duplicated sample id makes the set smaller than the number of profiles
        if len(self.profile_dbs_info_dict) != len(set(self.sample_ids_found_in_input_dbs)):
            raise ConfigError("Sample ids in each single profile database to be merged must be unique. But it is not the case\
                               with your input :/ Here are the sample names in case you would like to find out which ones occur\
                               more than once: '%s'" % (', '.join(self.sample_ids_found_in_input_dbs)))

        # test open the contigs database (and learn its hash while doing it) to make sure we don't have
        # a deal breaker just yet
        contigs_db = dbops.ContigsDatabase(self.contigs_db_path, quiet=True)
        contigs_db_hash = contigs_db.meta['contigs_db_hash']
        contigs_db.disconnect()

        for k, p in [('total_length', 'The number of nucleotides described'),
                     ('num_contigs', 'The number of contigs'),
                     ('version', 'The version number'),
                     ('num_splits', 'The number of splits'),
                     ('min_contig_length', 'The minimum contig length (-M) values'),
                     ('max_contig_length', 'The maximum contig length (--max-contig-length) values'),
                     ('min_coverage_for_variability', 'The minimum coverage values to report variability (-V)'),
                     ('report_variability_full', 'Whether to report full variability (--report-variability-full) flags'),
                     ('SCVs_profiled', 'Profile SCVs flags (--profile-SCVs)'),
                     ('SNVs_profiled', 'SNV profiling flags (--skip-SNV-profiling)')]:
            v = set(r[k] for r in self.profile_dbs_info_dict.values())
            if len(v) > 1:
                raise ConfigError("%s are not identical for all profiles to be merged, which is a \
                                    deal breaker. All profiles that are going to be merged must be\
                                    run with identical flags and parameters :/" % p)

        # get split names from one of the profile databases. split names must be identical across all
        self.split_names = sorted(utils.get_all_item_names_from_the_database(list(self.profile_dbs_info_dict.keys())[0]))

        # make sure all runs were profiled using the same contigs database (if one used):
        hashes_for_profile_dbs = set(r['contigs_db_hash'] for r in self.profile_dbs_info_dict.values())
        if len(hashes_for_profile_dbs) != 1:
            if None in hashes_for_profile_dbs:
                raise ConfigError("It seems there is at least one run in the mix that was profiled using an\
                                          contigs database, and at least one other that was profiled without using\
                                          one. This is not good. All runs must be profiled using the same contigs\
                                          database, or all runs must be profiled without a contigs database :/")
            else:
                raise ConfigError("It seems these runs were profiled using different contigs databases (or\
                                          different versions of the same contigs database). All runs must be\
                                          profiled using the same contigs database, or all runs must be profiled\
                                          without a contigs database :/")


        # make sure the hash for contigs db is identical across all profile databases:
        profile_dbs_hash = list(hashes_for_profile_dbs)[0]
        if profile_dbs_hash != contigs_db_hash:
            raise ConfigError("The contigs database you provided, which is identified with hash '%s', does\
                                      not seem to match the run profiles you are trying to merge, which share the\
                                      hash identifier of '%s'. What's up with that?" % (contigs_db_hash, profile_dbs_hash))

        # do we have a description file?
        if self.description_file_path:
            filesnpaths.is_file_plain_text(self.description_file_path)
            # plain 'r' is used because the 'U' flag was removed in Python 3.11 (text mode
            # already translates newlines); the context manager closes the file handle
            with open(os.path.abspath(self.description_file_path), 'r') as description_file:
                self.description = description_file.read()
示例#21
0
    def __init__(self, args, run=terminal.Run(), progress=terminal.Progress()):
        """Read the user arguments, validate the output targets and create the structure database.

        Args:
            args: object whose `__dict__` holds the user arguments (presumably an
                argparse namespace -- confirm against the caller). An argument
                missing from it is stored as None.
            run: stored as `self.run`.
            progress: stored as `self.progress`.

        Raises:
            ConfigError: if the output database path does not end with '.db'.
        """
        self.args = args
        self.run = run
        self.progress = progress

        def fetch(name, cast=None):
            # value of the argument `name`, passed through `cast` when one is given;
            # None when the argument is absent from `args`
            known = self.args.__dict__
            if name not in known:
                return None
            value = known[name]
            return value if cast is None else cast(value)

        # initialize self.arg parameters
        self.contigs_db_path = fetch('contigs_db')
        self.genes_of_interest_path = fetch('genes_of_interest')
        self.splits_of_interest_path = fetch('splits_of_interest')
        self.bin_id = fetch('bin_id')
        self.collection_name = fetch('collection_name')
        self.gene_caller_ids = fetch('gene_caller_ids')
        self.output_db_path = fetch('output_db_path')
        self.full_modeller_output = fetch('dump_dir')
        self.skip_DSSP = fetch('skip_DSSP', bool)
        self.modeller_executable = fetch('modeller_executable')
        self.DSSP_executable = None

        # the contigs database is opened right away, and its hash is kept for the
        # structure database created below
        utils.is_contigs_db(self.contigs_db_path)
        self.contigs_db = dbops.ContigsDatabase(self.contigs_db_path)
        self.contigs_db_hash = self.contigs_db.meta['contigs_db_hash']

        # MODELLER params
        self.modeller_database = fetch('modeller_database')
        self.scoring_method = fetch('scoring_method')
        self.max_number_templates = fetch('max_number_templates')
        self.percent_identical_cutoff = fetch('percent_identical_cutoff')
        self.num_models = fetch('num_models')
        self.deviation = fetch('deviation')
        self.very_fast = fetch('very_fast', bool)

        # check database output: fall back to the default name when none was given
        self.output_db_path = self.output_db_path or "STRUCTURE.db"
        if not self.output_db_path.endswith('.db'):
            raise ConfigError("The structure database output file (`-o / --output`) must end with '.db'")
        filesnpaths.is_output_file_writable(self.output_db_path)

        # check modeller output
        if self.full_modeller_output:
            self.full_modeller_output = filesnpaths.check_output_directory(self.full_modeller_output,
                                                                           ok_if_exists=False)

        # identify which genes user wants to model structures for
        self.genes_of_interest = self.get_genes_of_interest(self.genes_of_interest_path,
                                                            self.gene_caller_ids)

        self.sanity_check()

        # residue annotation
        self.residue_annotation_sources_info = self.get_residue_annotation_sources_info()
        table_structure, table_types = self.get_residue_info_table_structure()
        self.residue_info_table_structure = table_structure
        self.residue_info_table_types = table_types
        self.residue_annotation_df = pd.DataFrame({})

        # initialize StructureDatabase
        self.structure_db = StructureDatabase(self.output_db_path,
                                              self.contigs_db_hash,
                                              residue_info_structure_extras=self.residue_info_table_structure,
                                              residue_info_types_extras=self.residue_info_table_types,
                                              create_new=True)

        # init ContigsSuperClass
        self.contigs_super = ContigsSuperclass(self.args)
示例#22
0
文件: merger.py 项目: paczian/anvio
    def sanity_check(self):
        """Validate every input of the merge operation before any work is done.

        The checks run in this order: output directory, number of RUNINFO
        files, presence of the contigs database and of every RUNINFO file,
        absence of already merged runs, identical profiling parameters across
        runs, and agreement between the contigs database hash stored in the
        profiles and the hash of the contigs database given by the user.

        Sets `self.output_directory` and `self.split_names`, and reports each
        shared profiling parameter through `self.run.info`.

        Raises:
            ConfigError: as soon as one of the checks above fails.
        """
        self.output_directory = filesnpaths.check_output_directory(
            self.output_directory,
            ok_if_exists=self.overwrite_output_destinations)

        if not len(self.input_runinfo_paths) > 1:
            raise ConfigError("You need to provide at least 2 RUNINFO.cp files for this program\
                                           to be useful.")

        if not self.contigs_db_path:
            raise ConfigError("You must provide a contigs database for this operation.")
        if not os.path.exists(self.contigs_db_path):
            raise ConfigError("anvio couldn't find the contigs database where you said it would be :/")

        missing = [
            p for p in self.input_runinfo_paths if not os.path.exists(p)
        ]
        if missing:
            raise ConfigError("%s not found: %s." % (
                'Some files are' if len(missing) > 1 else "File is",
                ', '.join(missing)))

        self.read_runinfo_dicts()

        # materialize the run infos once: a list can be indexed on both Python 2 and
        # Python 3, whereas a Python 3 `dict.values()` view can not
        sample_runinfos = list(self.input_runinfo_dicts.values())

        if any(v['merged'] for v in sample_runinfos):
            raise ConfigError("This is very cute, but you can't merge already merged runs. anvio can only merge\
                                      individual profiles (which are generated through anvi-profile program). Sorry.")

        for k, p in [('total_length', 'Number of nucleotides described'),
                     ('num_contigs', 'Number of contigs'),
                     ('num_splits', 'Number of splits'),
                     ('split_length', 'Split length (-L)'),
                     ('min_contig_length', 'Minimum contig length (-M)'),
                     ('min_mean_coverage', 'Minimum mean coverage (-C)'),
                     ('min_coverage_for_variability',
                      'Minimum coverage to report variability (-V)')]:
            v = set(r[k] for r in sample_runinfos)
            if len(v) > 1:
                raise ConfigError("%s is not identical for all runs to be merged, which is a \
                                          deal breaker. You need to profile all runs to be merged with\
                                          identical parameters :/" % p)

            # so we carry over this information into the runinfo dict for merged runs:
            self.run.info(k, v.pop())

        # get split names from one of the profile databases. split names must be identical across all
        self.split_names = sorted(
            dbops.get_split_names_in_profile_db(sample_runinfos[0]['profile_db']))

        # make sure all runs were profiled using the same contigs database (if one used):
        hashes_for_profile_dbs = set(r['contigs_db_hash'] for r in sample_runinfos)
        if len(hashes_for_profile_dbs) != 1:
            if None in hashes_for_profile_dbs:
                raise ConfigError("It seems there is at least one run in the mix that was profiled using an\
                                          contigs database, and at least one other that was profiled without using\
                                          one. This is not good. All runs must be profiled using the same contigs\
                                          database, or all runs must be profiled without a contigs database :/")

            else:
                raise ConfigError("It seems these runs were profiled using different contigs databases (or\
                                          different versions of the same contigs database). All runs must be\
                                          profiled using the same contigs database, or all runs must be profiled\
                                          without a contigs database :/")

        # make sure contigs hash that is common across runs is also identical to the contigs database
        contigs_db = dbops.ContigsDatabase(self.contigs_db_path, quiet=True)
        contigs_db_hash = contigs_db.meta['contigs_db_hash']
        contigs_db.disconnect()

        # a set can not be indexed, so its single member is extracted through a list
        # (indexing the set directly raised a TypeError in place of the ConfigError below)
        profile_dbs_hash = list(hashes_for_profile_dbs)[0]
        if profile_dbs_hash != contigs_db_hash:
            raise ConfigError("The contigs database you provided, which is identified with hash '%s', does\
                                      not seem to match the run profiles you are trying to merge, which share the\
                                      hash identifier of '%s'. What's up with that?" % (
                contigs_db_hash, profile_dbs_hash))
示例#23
0
    def __init__(self, args, run=terminal.Run(), progress=terminal.Progress()):
        """Read the user arguments, validate the output targets and create the structure database.

        Args:
            args: object whose `__dict__` holds the user arguments (presumably an
                argparse namespace -- confirm against the caller). An argument
                missing from it is stored as None.
            run: stored as `self.run`.
            progress: stored as `self.progress`.

        Raises:
            ConfigError: if the output database path does not end with '.db'.
        """
        self.args = args
        self.run = run
        self.progress = progress

        def pick(name, convert=None):
            # value of the argument `name`, passed through `convert` when one is given;
            # None when the argument is absent from `args`
            if name not in self.args.__dict__:
                return None
            raw = args.__dict__[name]
            return raw if convert is None else convert(raw)

        # initialize self.arg parameters: (attribute, argument name, converter)
        for attribute, name, convert in (
                ('contigs_db_path', 'contigs_db', None),
                ('genes_of_interest_path', 'genes_of_interest', None),
                ('splits_of_interest_path', 'splits_of_interest', None),
                ('bin_id', 'bin_id', None),
                ('collection_name', 'collection_name', None),
                ('gene_caller_ids', 'gene_caller_ids', None),
                ('output_db_path', 'output_db_path', None),
                ('full_modeller_output', 'dump_dir', None),
                ('skip_DSSP', 'skip_DSSP', bool),
                ('modeller_executable', 'modeller_executable', None)):
            setattr(self, attribute, pick(name, convert))
        self.DSSP_executable = None

        # the contigs database is opened right away, and its hash is kept for the
        # structure database created below
        utils.is_contigs_db(self.contigs_db_path)
        self.contigs_db = dbops.ContigsDatabase(self.contigs_db_path)
        self.contigs_db_hash = self.contigs_db.meta['contigs_db_hash']

        # MODELLER params: (attribute and argument share the name, converter)
        for name, convert in (
                ('modeller_database', None),
                ('scoring_method', None),
                ('max_number_templates', None),
                ('percent_identical_cutoff', None),
                ('num_models', None),
                ('deviation', None),
                ('very_fast', bool)):
            setattr(self, name, pick(name, convert))

        # check database output: fall back to the default name when none was given
        self.output_db_path = self.output_db_path or "STRUCTURE.db"
        if not self.output_db_path.endswith('.db'):
            raise ConfigError(
                "The structure database output file (`-o / --output`) must end with '.db'")
        filesnpaths.is_output_file_writable(self.output_db_path)

        # check modeller output
        if self.full_modeller_output:
            checked_dump_dir = filesnpaths.check_output_directory(
                self.full_modeller_output, ok_if_exists=False)
            self.full_modeller_output = checked_dump_dir

        # identify which genes user wants to model structures for
        self.genes_of_interest = self.get_genes_of_interest(
            self.genes_of_interest_path, self.gene_caller_ids)

        self.sanity_check()

        # residue annotation
        self.annotation_sources_info = self.get_annotation_sources_info()
        table_structure, table_types = self.get_residue_info_table_structure()
        self.residue_info_table_structure = table_structure
        self.residue_info_table_types = table_types
        self.res_annotation_df = pd.DataFrame({})

        # initialize StructureDatabase
        self.structure_db = StructureDatabase(
            self.output_db_path,
            self.contigs_db_hash,
            residue_info_structure_extras=self.residue_info_table_structure,
            residue_info_types_extras=self.residue_info_table_types,
            create_new=True)

        # init ContigsSuperClass
        self.contigs_super = ContigsSuperclass(self.args)