示例#1
0
    def sanity_check(self):
        """Validate the inputs and stage copies of them in the output directory.

        Distance and linkage fall back to the defaults in `constants` when they
        are not set, and the pair is checked for compatibility. The view data
        (and the tree file, when there is one) are validated, the output
        directory is checked, and every input file that was provided is copied
        into the output directory. Each path attribute is re-pointed at its
        copy, and `self.sanity_checked` is set to True at the end.
        """
        if not self.distance:
            self.distance = constants.distance_metric_default
        if not self.linkage:
            self.linkage = constants.linkage_method_default

        clustering.is_distance_and_linkage_compatible(self.distance, self.linkage)

        filesnpaths.is_file_tab_delimited(self.view_data_path)
        if self.tree_file_path:
            filesnpaths.is_proper_newick(self.tree_file_path)

        self.check_output_directory()

        # the view data is copied unconditionally
        staged_view_data_path = self.get_output_file_path('view_data.txt')
        shutil.copyfile(self.view_data_path, staged_view_data_path)
        self.view_data_path = staged_view_data_path

        # the remaining inputs are optional and are copied only when they are set
        optional_inputs = (('tree_file_path', 'tree.txt'),
                           ('additional_view_data_file_path', 'additional_view_data.txt'),
                           ('samples_info_file_path', 'anvio_samples_info.txt'))

        for attribute_name, output_file_name in optional_inputs:
            source_path = getattr(self, attribute_name)
            if not source_path:
                continue

            destination_path = self.get_output_file_path(output_file_name)
            shutil.copyfile(source_path, destination_path)
            setattr(self, attribute_name, destination_path)

        self.sanity_checked = True
示例#2
0
    def __init__(self, args, run=run, progress=progress):
        """Read the options found in `args` and initialize the instance state.

        Options are looked up in `args.__dict__`; an option that is not there
        is stored as None. Distance and linkage fall back to the defaults in
        `constants`, and the pair is checked for compatibility right away.
        """
        self.run = run
        self.progress = progress

        self.max_num_splits_for_hierarchical_clustering = constants.max_num_splits_for_hierarchical_clustering

        options = args.__dict__

        # (attribute name, option name) pairs; missing options become None
        attribute_to_option = (('sample_id', 'sample_name'),
                               ('contigs_db_path', 'contigs_db'),
                               ('input_runinfo_paths', 'input'),
                               ('output_directory', 'output_dir'),
                               ('skip_hierarchical_clustering', 'skip_hierarchical_clustering'),
                               ('enforce_hierarchical_clustering', 'enforce_hierarchical_clustering'),
                               ('skip_concoct_binning', 'skip_concoct_binning'),
                               ('overwrite_output_destinations', 'overwrite_output_destinations'),
                               ('debug', 'debug'))

        for attribute_name, option_name in attribute_to_option:
            setattr(self, attribute_name, options.get(option_name))

        self.distance = options.get('distance') or constants.distance_metric_default
        self.linkage = options.get('linkage') or constants.linkage_method_default

        # fail early if the distance metric and the linkage method do not go together
        clustering.is_distance_and_linkage_compatible(self.distance, self.linkage)

        # containers that start out empty
        self.split_names = None
        self.merged_sample_ids = []
        self.input_runinfo_dicts = {}
        self.normalization_multiplier = {}
        self.profiles = []

        self.profile_db_path = None

        self.clustering_configs = constants.clustering_configs['merged']

        self.database_paths = {'CONTIGS.db': self.contigs_db_path}
示例#3
0
    def sanity_check(self):
        """Check the inputs and move working copies of them into the output directory.

        Unset distance / linkage values are replaced with the defaults from
        `constants` before the pair is tested for compatibility. After the view
        data and the optional tree file are validated and the output directory
        is checked, each provided input file is copied into the output
        directory and the matching attribute is updated to the copy's path.
        Sets `self.sanity_checked` to True once everything went through.
        """
        def stage(source_path, output_file_name):
            # copy `source_path` into the output directory, return the new path
            staged_path = self.get_output_file_path(output_file_name)
            shutil.copyfile(source_path, staged_path)
            return staged_path

        self.distance = self.distance if self.distance else constants.distance_metric_default
        self.linkage = self.linkage if self.linkage else constants.linkage_method_default

        clustering.is_distance_and_linkage_compatible(self.distance, self.linkage)

        filesnpaths.is_file_tab_delimited(self.view_data_path)
        if self.tree_file_path:
            filesnpaths.is_proper_newick(self.tree_file_path)

        self.check_output_directory()

        # view data is always staged; everything else only when it was given
        self.view_data_path = stage(self.view_data_path, 'view_data.txt')

        if self.tree_file_path:
            self.tree_file_path = stage(self.tree_file_path, 'tree.txt')

        if self.additional_view_data_file_path:
            self.additional_view_data_file_path = stage(self.additional_view_data_file_path,
                                                        'additional_view_data.txt')

        if self.samples_info_file_path:
            self.samples_info_file_path = stage(self.samples_info_file_path,
                                                'anvio_samples_info.txt')

        self.sanity_checked = True
示例#4
0
    def __init__(self,
                 bin_name,
                 summary_object,
                 args,
                 run=run,
                 progress=progress):
        """Prepare the split of a single bin from its parent.

        The summary object is handed to `summarizer.Bin.__init__`, which this
        class builds on. Options are then read from `args.__dict__` (missing
        options become None), the clustering settings are validated, the
        output directory for the bin is generated, and the paths of the
        bin's own contigs and profile databases are set.

        Raises ConfigError when hierarchical clustering is both enforced and
        skipped.
        """
        summarizer.Bin.__init__(self, summary_object, bin_name, run, progress)

        options = args.__dict__

        # (attribute name, option name) pairs; missing options become None
        for attribute_name, option_name in (
                ('profile_db_path', 'profile_db'),
                ('contigs_db_path', 'contigs_db'),
                ('output_directory', 'output_dir'),
                ('skip_variability_tables', 'skip_variability_tables'),
                ('skip_hierarchical_clustering', 'skip_hierarchical_clustering'),
                ('enforce_hierarchical_clustering', 'enforce_hierarchical_clustering')):
            setattr(self, attribute_name, options.get(option_name))

        self.distance = options.get('distance') or constants.distance_metric_default
        self.linkage = options.get('linkage') or constants.linkage_method_default
        self.compress_auxiliary_data = options.get('compress_auxiliary_data')

        # fail early if the distance metric and the linkage method do not go together
        clustering.is_distance_and_linkage_compatible(self.distance,
                                                      self.linkage)
        self.clustering_configs = constants.clustering_configs['merged']

        contigs_db_abspath = os.path.abspath(self.contigs_db_path)
        self.database_paths = {'CONTIGS.db': contigs_db_abspath}

        if self.enforce_hierarchical_clustering and self.skip_hierarchical_clustering:
            raise ConfigError(
                "You are confusing anvi'o :/ You can't tell anvi'o to skip hierarchical clustering\
                               while also asking it to enforce it.")

        # every bin gets its own directory under the output directory
        bin_directory = os.path.join(self.output_directory, bin_name)
        self.bin_output_directory = bin_directory
        filesnpaths.gen_output_directory(self.bin_output_directory)

        # NOTE(review): the skip flag is overwritten with whatever
        # is_hierarchical_clustering_for_bin_OK() returns -- confirm that the
        # method returns a "skip" decision and not an "it is OK" flag.
        self.max_num_splits_for_hierarchical_clustering = constants.max_num_items_for_hierarchical_clustering
        self.skip_hierarchical_clustering = self.is_hierarchical_clustering_for_bin_OK()

        # databases of the bin live in the bin's own output directory
        self.bin_contigs_db_path = os.path.join(self.bin_output_directory, 'CONTIGS.db')
        self.bin_profile_db_path = os.path.join(self.bin_output_directory, 'PROFILE.db')
示例#5
0
    def __init__(self, args, run=run, progress=progress):
        """Read the options found in `args` and initialize the instance state.

        Options are looked up in `args.__dict__`; an option that is not there
        is stored as None. Distance and linkage fall back to the defaults in
        `constants`, and the pair is checked for compatibility right away.
        """
        self.run = run
        self.progress = progress

        self.max_num_splits_for_hierarchical_clustering = constants.max_num_items_for_hierarchical_clustering

        options = args.__dict__

        # (attribute name, option name) pairs; missing options become None
        attribute_to_option = (('sample_id', 'sample_name'),
                               ('contigs_db_path', 'contigs_db'),
                               ('input_profile_db_paths', 'input'),
                               ('output_directory', 'output_dir'),
                               ('skip_hierarchical_clustering', 'skip_hierarchical_clustering'),
                               ('enforce_hierarchical_clustering', 'enforce_hierarchical_clustering'),
                               ('overwrite_output_destinations', 'overwrite_output_destinations'),
                               ('debug', 'debug'))

        for attribute_name, option_name in attribute_to_option:
            setattr(self, attribute_name, options.get(option_name))

        self.distance = options.get('distance') or constants.distance_metric_default
        self.linkage = options.get('linkage') or constants.linkage_method_default
        self.description_file_path = options.get('description')

        # fail early if the distance metric and the linkage method do not go together
        clustering.is_distance_and_linkage_compatible(self.distance, self.linkage)

        # state that starts out empty
        self.profiles = []
        self.num_profile_dbs = None
        self.split_names = None
        self.sample_ids_found_in_input_dbs = []
        self.normalization_multiplier = {}

        self.profile_dbs_info_dict = {}

        # layer additional data common to all profile databases to be merged
        self.layer_additional_data_dict = {}
        self.layer_additional_data_keys = {}

        self.merged_profile_db_path = None

        self.clustering_configs = constants.clustering_configs['merged']

        # the absolute path is only computed when a contigs database was given
        contigs_db_abspath = os.path.abspath(self.contigs_db_path) if self.contigs_db_path else None
        self.database_paths = {'CONTIGS.db': contigs_db_abspath}

        # no description is known at this point
        self.description = None
示例#6
0
    def __init__(self, args, run=run, progress=progress):
        """Pull the options out of `args` and set every attribute to its initial value.

        An option that is absent from `args.__dict__` ends up as None. The
        distance metric and the linkage method default to the values in
        `constants` and are checked for compatibility before anything else
        is set up.
        """
        self.run = run
        self.progress = progress

        self.max_num_splits_for_hierarchical_clustering = constants.max_num_items_for_hierarchical_clustering

        def option(name):
            # value of `name` in args, or None when args does not have it
            return args.__dict__.get(name)

        self.sample_id = option('sample_name')
        self.contigs_db_path = option('contigs_db')
        self.input_profile_db_paths = option('input')
        self.output_directory = option('output_dir')
        self.skip_hierarchical_clustering = option('skip_hierarchical_clustering')
        self.enforce_hierarchical_clustering = option('enforce_hierarchical_clustering')
        self.skip_concoct_binning = option('skip_concoct_binning')
        self.overwrite_output_destinations = option('overwrite_output_destinations')
        self.debug = option('debug')

        requested_distance = option('distance')
        self.distance = requested_distance if requested_distance else constants.distance_metric_default

        requested_linkage = option('linkage')
        self.linkage = requested_linkage if requested_linkage else constants.linkage_method_default

        self.description_file_path = option('description')

        # an incompatible distance / linkage pair should surface immediately
        clustering.is_distance_and_linkage_compatible(self.distance, self.linkage)

        self.profiles = []
        self.split_names = None
        self.sample_ids_found_in_input_dbs = []
        self.normalization_multiplier = {}

        self.profile_dbs_info_dict = {}

        # layer additional data shared by all profile databases to be merged
        self.layer_additional_data_dict = {}
        self.layer_additional_data_keys = {}

        self.merged_profile_db_path = None

        self.clustering_configs = constants.clustering_configs['merged']

        # without a contigs database path there is nothing to make absolute
        if self.contigs_db_path:
            self.database_paths = {'CONTIGS.db': os.path.abspath(self.contigs_db_path)}
        else:
            self.database_paths = {'CONTIGS.db': None}

        # the description is not known yet
        self.description = None
示例#7
0
    def __init__(self, bin_name, summary_object, args, run=run, progress=progress):
        """Set up everything needed to split one bin from its parent.

        Initialization of the `summarizer.Bin` part happens first, using the
        summary object. After that the options in `args.__dict__` are read
        (absent ones become None), distance and linkage are validated, the
        bin's output directory is generated and the paths of the bin's own
        databases are stored.

        A ConfigError is raised when the caller asks to both enforce and skip
        hierarchical clustering.
        """
        summarizer.Bin.__init__(self, summary_object, bin_name, run, progress)

        def option(name):
            # value of `name` in args, or None when args does not have it
            return args.__dict__.get(name)

        self.profile_db_path = option('profile_db')
        self.contigs_db_path = option('contigs_db')
        self.output_directory = option('output_dir')
        self.skip_variability_tables = option('skip_variability_tables')
        self.skip_hierarchical_clustering = option('skip_hierarchical_clustering')
        self.enforce_hierarchical_clustering = option('enforce_hierarchical_clustering')

        requested_distance = option('distance')
        self.distance = requested_distance if requested_distance else constants.distance_metric_default

        requested_linkage = option('linkage')
        self.linkage = requested_linkage if requested_linkage else constants.linkage_method_default

        self.compress_auxiliary_data = option('compress_auxiliary_data')

        # an incompatible distance / linkage pair should surface immediately
        clustering.is_distance_and_linkage_compatible(self.distance, self.linkage)
        self.clustering_configs = constants.clustering_configs['merged']
        self.database_paths = dict([('CONTIGS.db', os.path.abspath(self.contigs_db_path))])

        if self.enforce_hierarchical_clustering and self.skip_hierarchical_clustering:
            raise ConfigError("You are confusing anvi'o :/ You can't tell anvi'o to skip hierarchical clustering\
                               while also asking it to enforce it.")

        # the bin writes into a directory named after it
        self.bin_output_directory = os.path.join(self.output_directory, bin_name)
        filesnpaths.gen_output_directory(self.bin_output_directory)

        # the final say on hierarchical clustering comes from the helper below
        self.max_num_splits_for_hierarchical_clustering = constants.max_num_items_for_hierarchical_clustering
        self.skip_hierarchical_clustering = self.is_hierarchical_clustering_for_bin_OK()

        # paths of the databases that belong to this bin
        output_path_for = lambda file_name: os.path.join(self.bin_output_directory, file_name)
        self.bin_contigs_db_path = output_path_for('CONTIGS.db')
        self.bin_profile_db_path = output_path_for('PROFILE.db')
示例#8
0
    def __init__(self, args):
        """Read profiling options from `args` and initialize the contigs database.

        Options are looked up in `args.__dict__`; an option that is not there
        is treated as None. Distance and linkage fall back to the defaults in
        `constants` and are checked for compatibility right away.

        If `list_contigs` is set, `self.list_contigs()` is called and the
        process exits through `sys.exit()`.

        Raises ConfigError when no contigs database path is provided.
        """
        self.args = args

        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.input_file_path = A('input_file')
        self.contigs_db_path = A('contigs_db')
        self.serialized_profile_path = A('serialized_profile')
        self.output_directory = A('output_dir')
        self.list_contigs_and_exit = A('list_contigs')
        self.min_contig_length = A('min_contig_length')
        self.min_mean_coverage = A('min_mean_coverage')
        self.min_coverage_for_variability = A('min_coverage_for_variability')
        self.contigs_shall_be_clustered = A('cluster_contigs')
        self.sample_id = A('sample_name')
        self.report_variability_full = A('report_variability_full')
        self.overwrite_output_destinations = A('overwrite_output_destinations')
        self.skip_SNV_profiling = A('skip_SNV_profiling')
        self.profile_AA_frequencies = A('profile_AA_frequencies')
        self.gen_serialized_profile = A('gen_serialized_profile')
        self.distance = A('distance') or constants.distance_metric_default
        self.linkage = A('linkage') or constants.linkage_method_default

        # make sure early on that both the distance and linkage is OK.
        clustering.is_distance_and_linkage_compatible(self.distance,
                                                      self.linkage)

        # whether the profile database is a blank (without any BAM files or reads):
        self.blank = A('blank_profile')

        # contigs of a blank profile are always clustered
        if self.blank:
            self.contigs_shall_be_clustered = True

        # go through A() so that an `args` object without this option does not
        # fail with an AttributeError
        contigs_of_interest_path = A('contigs_of_interest')
        if contigs_of_interest_path:
            filesnpaths.is_file_exists(contigs_of_interest_path)

            # the context manager makes sure the file handle is closed; empty
            # lines and lines starting with '#' are ignored
            with open(contigs_of_interest_path) as contigs_of_interest_file:
                self.contig_names_of_interest = set(c.strip() for c in contigs_of_interest_file
                                                    if c.strip() and not c.startswith('#'))
        else:
            self.contig_names_of_interest = None

        self.progress = terminal.Progress()
        self.run = terminal.Run(width=35)

        if self.list_contigs_and_exit:
            self.list_contigs()
            sys.exit()

        if not self.contigs_db_path:
            # call syntax: `raise ConfigError, "..."` is a SyntaxError on Python 3
            raise ConfigError("No contigs database, no profilin'. Bye.")

        # Initialize contigs db
        dbops.ContigsSuperclass.__init__(self,
                                         self.args,
                                         r=self.run,
                                         p=self.progress)
        self.init_contig_sequences()
        self.contig_names_in_contigs_db = set(self.contigs_basic_info.keys())

        self.bam = None
        self.contigs = {}

        self.database_paths = {'CONTIGS.db': self.contigs_db_path}

        self.profile_db_path = None

        self.clustering_configs = constants.clustering_configs[
            'blank' if self.blank else 'single']

        self.atomic_data = contigops.AtomicContigSplitData(self.progress)

        # following variable will be populated during the profiling, and its content will eventually
        # be stored in t.variable_nts_table_name
        self.variable_nts_table_entries = []

        # following variable will be populated while the variable positions table is computed
        self.codons_in_genes_to_profile_AA_frequencies = set([])
示例#9
0
文件: profiler.py 项目: pythseq/anvio
    def __init__(self, args):
        """Read profiling options from `args` and initialize the contigs database.

        Options are looked up in `args.__dict__`; an option that is not there
        is treated as None. Distance and linkage fall back to the defaults in
        `constants` and are checked for compatibility right away. The numeric
        options `num_threads`, `queue_size` and `write_buffer_size` fall back
        to 1, 0 and 500 when they are not provided.

        If `list_contigs` is set, `self.list_contigs()` is called and the
        process exits through `sys.exit()`.

        Raises ConfigError when the clustering related flags contradict each
        other, or when no contigs database path is provided.
        """
        self.args = args

        A = lambda x: args.__dict__[x] if x in args.__dict__ else None

        def int_or_default(option_name, default):
            # int() of a missing option would raise a TypeError, so a missing
            # (None) value is replaced with `default`; any value that is
            # provided, including 0, is converted as is
            value = A(option_name)
            return int(default if value is None else value)

        self.input_file_path = A('input_file')
        self.contigs_db_path = A('contigs_db')
        self.serialized_profile_path = A('serialized_profile')
        self.output_directory = A('output_dir')
        self.list_contigs_and_exit = A('list_contigs')
        self.min_contig_length = A('min_contig_length')
        self.min_mean_coverage = A('min_mean_coverage')
        self.min_coverage_for_variability = A('min_coverage_for_variability')
        self.contigs_shall_be_clustered = A('cluster_contigs')
        self.skip_hierarchical_clustering = A('skip_hierarchical_clustering')
        self.sample_id = A('sample_name')
        self.report_variability_full = A('report_variability_full')
        self.overwrite_output_destinations = A('overwrite_output_destinations')
        self.skip_SNV_profiling = A('skip_SNV_profiling')
        self.profile_SCVs = A('profile_SCVs')
        self.gen_serialized_profile = A('gen_serialized_profile')
        self.distance = A('distance') or constants.distance_metric_default
        self.linkage = A('linkage') or constants.linkage_method_default
        self.num_threads = int_or_default('num_threads', 1)
        self.queue_size = int_or_default('queue_size', 0)
        self.write_buffer_size = int_or_default('write_buffer_size', 500)
        self.total_length_of_all_contigs = 0
        self.total_coverage_values_for_all_contigs = 0
        self.description_file_path = A('description')

        # make sure early on that both the distance and linkage is OK.
        clustering.is_distance_and_linkage_compatible(self.distance,
                                                      self.linkage)

        # whether the profile database is a blank (without any BAM files or reads):
        self.blank = A('blank_profile')

        if not self.blank and self.contigs_shall_be_clustered and self.skip_hierarchical_clustering:
            raise ConfigError(
                "You are confused, and confusing anvi'o, too. You can't as hierarchical clustering\
                               to be performed with one flag, and try to skip it with another one :("
            )

        if self.blank and self.contigs_shall_be_clustered and self.skip_hierarchical_clustering:
            raise ConfigError(
                "So you want to generate a blank profile, and you both want hierarchical clustering\
                               of your contigs to be performed, and skipped. No."
            )

        if self.blank and self.contigs_shall_be_clustered:
            raise ConfigError(
                "When the blank profile is asked to be generated, there is no need to ask for the\
                               hierarchical clustering of contigs. It is going to be done by default. If it is\
                               not changing anything, why is anvi'o upset with you? Because. Let's don't use flags\
                               we don't need.")

        # contigs of a blank profile are clustered unless that is explicitly skipped
        if self.blank and not self.skip_hierarchical_clustering:
            self.contigs_shall_be_clustered = True

        # go through A() so that an `args` object without this option does not
        # fail with an AttributeError
        contigs_of_interest_path = A('contigs_of_interest')
        if contigs_of_interest_path:
            filesnpaths.is_file_exists(contigs_of_interest_path)

            # the context manager makes sure the file handle is closed; empty
            # lines and lines starting with '#' are ignored
            with open(contigs_of_interest_path) as contigs_of_interest_file:
                self.contig_names_of_interest = set(c.strip() for c in contigs_of_interest_file
                                                    if c.strip() and not c.startswith('#'))
        else:
            self.contig_names_of_interest = None

        self.progress = terminal.Progress()
        self.run = terminal.Run(width=35)

        if self.list_contigs_and_exit:
            self.list_contigs()
            sys.exit()

        if not self.contigs_db_path:
            raise ConfigError("No contigs database, no profilin'. Bye.")

        # Initialize contigs db
        dbops.ContigsSuperclass.__init__(self,
                                         self.args,
                                         r=self.run,
                                         p=self.progress)
        self.init_contig_sequences()
        self.contig_names_in_contigs_db = set(self.contigs_basic_info.keys())

        self.bam = None
        self.contigs = []

        self.database_paths = {
            'CONTIGS.db': os.path.abspath(self.contigs_db_path)
        }

        self.profile_db_path = None

        self.clustering_configs = constants.clustering_configs[
            'blank' if self.blank else 'single']

        # following variable will be populated during the profiling, and its content will eventually
        # be stored in t.variable_nts_table_name
        self.variable_nts_table_entries = []

        # if genes are not called, yet the user is asking for codon frequencies to be profiled, we give
        # a warning and force-turn that flag off.
        if (not self.a_meta['genes_are_called']) and self.profile_SCVs:
            self.run.warning(
                "You asked the codon frequencies to be profiled, but genes were not called\
                              for your contigs database. Anvi'o is assigning `False` to the profile-codon-frequncies\
                              flag, overruling your request like a boss.")
            self.profile_SCVs = False

        # following variable will be populated while the variable positions table is computed
        self.codons_in_genes_to_profile_SCVs = set([])

        # we don't know what we are about
        self.description = None

        # additional layer data will be filled later
        self.layer_additional_keys = []
        self.layer_additional_data = {}
示例#10
0
文件: profiler.py 项目: meren/anvio
    def __init__(self, args, r=terminal.Run(width=35), p=terminal.Progress()):
        """Read profiling options from `args` and initialize the contigs database.

        Options are looked up in `args.__dict__`; an option that is not there
        is treated as None and, where the code below provides one, replaced
        with a fallback value. Distance and linkage are checked for
        compatibility right away.

        `r` and `p` are stored as `self.run` and `self.progress`. Their default
        values are created once, when the function is defined, so instances
        built without explicit `r` / `p` share the same objects.

        If `list_contigs` is set, `self.list_contigs()` is called and the
        process exits through `sys.exit()`.

        Raises ConfigError when the clustering related flags contradict each
        other, when the maximum coverage depth is not below
        `auxiliarydataops.COVERAGE_MAX_VALUE`, or when no contigs database
        path is provided.
        """
        self.args = args
        self.progress = p
        self.run = r

        A = lambda x: args.__dict__[x] if x in args.__dict__ else None

        def int_or_default(option_name, default):
            # only a missing (None) value is replaced with `default`; a value
            # of 0 is kept as is
            value = A(option_name)
            return int(value if value is not None else default)

        self.input_file_path = A('input_file')
        self.contigs_db_path = A('contigs_db')
        self.serialized_profile_path = A('serialized_profile')
        self.output_directory = A('output_dir')
        self.list_contigs_and_exit = A('list_contigs')
        self.min_contig_length = A('min_contig_length') or 0
        self.max_contig_length = A('max_contig_length') or sys.maxsize
        self.min_mean_coverage = A('min_mean_coverage')
        self.min_coverage_for_variability = A('min_coverage_for_variability')
        self.contigs_shall_be_clustered = A('cluster_contigs')
        self.skip_hierarchical_clustering = A('skip_hierarchical_clustering')
        self.sample_id = A('sample_name')
        self.report_variability_full = A('report_variability_full')
        self.overwrite_output_destinations = A('overwrite_output_destinations')
        self.skip_SNV_profiling = A('skip_SNV_profiling')
        self.profile_SCVs = A('profile_SCVs')
        self.ignore_orphans = A('ignore_orphans')
        self.max_coverage_depth = A('max_coverage_depth') or 8000
        self.gen_serialized_profile = A('gen_serialized_profile')
        self.distance = A('distance') or constants.distance_metric_default
        self.linkage = A('linkage') or constants.linkage_method_default
        self.num_threads = int(A('num_threads') or 1)
        self.queue_size = int_or_default('queue_size', 0)
        self.write_buffer_size = int_or_default('write_buffer_size', 500)
        self.total_length_of_all_contigs = 0
        self.total_coverage_values_for_all_contigs = 0
        self.description_file_path = A('description')

        # make sure early on that both the distance and linkage is OK.
        clustering.is_distance_and_linkage_compatible(self.distance, self.linkage)

        # whether the profile database is a blank (without any BAM files or reads):
        self.blank = A('blank_profile')

        if not self.blank and self.contigs_shall_be_clustered and self.skip_hierarchical_clustering:
            raise ConfigError("You are confused, and confusing anvi'o, too. You can't as hierarchical clustering\
                               to be performed with one flag, and try to skip it with another one :(")

        if self.blank and self.contigs_shall_be_clustered and self.skip_hierarchical_clustering:
            raise ConfigError("So you want to generate a blank profile, and you both want hierarchical clustering\
                               of your contigs to be performed, and skipped. No.")

        if self.blank and self.contigs_shall_be_clustered:
            raise ConfigError("When the blank profile is asked to be generated, there is no need to ask for the\
                               hierarchical clustering of contigs. It is going to be done by default. If it is\
                               not changing anything, why is anvi'o upset with you? Because. Let's don't use flags\
                               we don't need.")

        if self.max_coverage_depth >= auxiliarydataops.COVERAGE_MAX_VALUE:
            raise ConfigError("The value %s for the maximum coverage depth is not going to work :/ While the maximum\
                               depth of coverage for anvi'o to care about is a soft cut-off (hence you have some level\
                               of freedom through the parameter `--max-coverage-depth`), there are database limitations\
                               anvi'o must consider and can not change. The maximum value allowed in the database for\
                               coverage information is 65536. Hence, you should set your depth of coverage to something \
                               that is less than this value. In addition, it is also recommended to leave a little gap\
                               and don't go beyond 90%% of this hard limit (that's why anvi'o will keep telling you,\
                               \"%s is nice, but %s is the best I can do\" when you try to exceed that)." \
                                        % (pp(self.max_coverage_depth), pp(self.max_coverage_depth), pp(auxiliarydataops.COVERAGE_MAX_VALUE)))

        # contigs of a blank profile are clustered unless that is explicitly skipped
        if self.blank and not self.skip_hierarchical_clustering:
            self.contigs_shall_be_clustered = True

        contigs_of_interest_path = A('contigs_of_interest')
        if contigs_of_interest_path:
            filesnpaths.is_file_exists(contigs_of_interest_path)

            # the context manager makes sure the file handle is closed; empty
            # lines and lines starting with '#' are ignored
            with open(contigs_of_interest_path) as contigs_of_interest_file:
                self.contig_names_of_interest = set(c.strip() for c in contigs_of_interest_file
                                                    if c.strip() and not c.startswith('#'))
        else:
            self.contig_names_of_interest = None

        if self.list_contigs_and_exit:
            self.list_contigs()
            sys.exit()

        if not self.contigs_db_path:
            raise ConfigError("No contigs database, no profilin'. Bye.")

        # Initialize contigs db
        dbops.ContigsSuperclass.__init__(self, self.args, r=self.run, p=self.progress)
        self.init_contig_sequences()
        self.contig_names_in_contigs_db = set(self.contigs_basic_info.keys())

        self.bam = None
        self.contigs = []

        self.database_paths = {'CONTIGS.db': os.path.abspath(self.contigs_db_path)}

        self.profile_db_path = None

        self.clustering_configs = constants.clustering_configs['blank' if self.blank else 'single']

        # following variable will be populated during the profiling, and its content will eventually
        # be stored in t.variable_nts_table_name
        self.variable_nts_table_entries = []

        # if genes are not called, yet the user is asking for codon frequencies to be profiled, we give
        # a warning and force-turn that flag off.
        if (not self.a_meta['genes_are_called']) and self.profile_SCVs:
            self.run.warning("You asked the codon frequencies to be profiled, but genes were not called\
                              for your contigs database. Anvi'o is assigning `False` to the profile-codon-frequncies\
                              flag, overruling your request like a boss.")
            self.profile_SCVs = False

        # following variable will be populated while the variable positions table is computed
        self.codons_in_genes_to_profile_SCVs = set([])

        # we don't know what we are about
        self.description = None

        # additional layer data will be filled later
        self.layer_additional_keys = []
        self.layer_additional_data = {}
示例#11
0
    def __init__(self, args, r=terminal.Run(width=35), p=terminal.Progress()):
        self.args = args
        self.progress = p
        self.run = r

        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.input_file_path = A('input_file')
        self.contigs_db_path = A('contigs_db')
        self.serialized_profile_path = A('serialized_profile')
        self.output_directory = A('output_dir')
        self.list_contigs_and_exit = A('list_contigs')
        self.min_contig_length = A('min_contig_length') or 0
        self.max_contig_length = A('max_contig_length') or sys.maxsize
        self.min_mean_coverage = A('min_mean_coverage')
        self.min_coverage_for_variability = A('min_coverage_for_variability')
        self.contigs_shall_be_clustered = A('cluster_contigs')
        self.skip_hierarchical_clustering = A('skip_hierarchical_clustering')
        self.sample_id = A('sample_name')
        self.report_variability_full = A('report_variability_full')
        self.overwrite_output_destinations = A('overwrite_output_destinations')
        self.skip_SNV_profiling = A('skip_SNV_profiling')
        self.profile_SCVs = A('profile_SCVs')
        self.include_orphans = A('include_orphans')
        self.max_coverage_depth = A('max_coverage_depth') or 8000
        self.gen_serialized_profile = A('gen_serialized_profile')
        self.distance = A('distance') or constants.distance_metric_default
        self.linkage = A('linkage') or constants.linkage_method_default
        self.num_threads = int(A('num_threads') or 1)
        self.queue_size = int(
            A('queue_size') if A('queue_size') is not None else 0)
        self.write_buffer_size = int(
            A('write_buffer_size') if A('write_buffer_size'
                                        ) is not None else 500)
        self.total_length_of_all_contigs = 0
        self.total_coverage_values_for_all_contigs = 0
        self.description_file_path = A('description')

        # make sure early on that both the distance and linkage is OK.
        clustering.is_distance_and_linkage_compatible(self.distance,
                                                      self.linkage)

        # whehther the profile database is a blank (without any BAM files or reads):
        self.blank = A('blank_profile')

        if not self.blank and self.contigs_shall_be_clustered and self.skip_hierarchical_clustering:
            raise ConfigError(
                "You are confused, and confusing anvi'o, too. You can't as hierarchical clustering\
                               to be performed with one flag, and try to skip it with another one :("
            )

        if self.blank and self.contigs_shall_be_clustered and self.skip_hierarchical_clustering:
            raise ConfigError(
                "So you want to generate a blank profile, and you both want hierarchical clustering\
                               of your contigs to be performed, and skipped. No."
            )

        if self.blank and self.contigs_shall_be_clustered:
            raise ConfigError(
                "When the blank profile is asked to be generated, there is no need to ask for the\
                               hierarchical clustering of contigs. It is going to be done by default. If it is\
                               not changing anything, why is anvi'o upset with you? Because. Let's don't use flags\
                               we don't need.")

        if self.max_coverage_depth >= auxiliarydataops.COVERAGE_MAX_VALUE:
            raise ConfigError("The value %s for the maximum coverage depth is not going to work :/ While the maximum\
                               depth of coverage for anvi'o to care about is a soft cut-off (hence you have some level\
                               of freedom through the parameter `--max-coverage-depth`), there are database limitations\
                               anvi'o must consider and can not change. The maximum value allowed in the database for\
                               coverage information is 65536. Hence, you should set your depth of coverage to something \
                               that is less than this value. In addition, it is also recommended to leave a little gap\
                               and don't go beyond 90%% of this hard limit (that's why anvi'o will keep telling you,\
                               \"%s is nice, but %s is the best I can do\" when you try to exceed that)." \
                                        % (pp(self.max_coverage_depth), pp(self.max_coverage_depth), pp(auxiliarydataops.COVERAGE_MAX_VALUE)))

        if self.blank and not self.skip_hierarchical_clustering:
            self.contigs_shall_be_clustered = True

        if A('contigs_of_interest'):
            filesnpaths.is_file_exists(args.contigs_of_interest)
            self.contig_names_of_interest = set([c.strip() for c in open(args.contigs_of_interest).readlines()\
                                                                           if c.strip() and not c.startswith('#')])
        else:
            self.contig_names_of_interest = None

        if self.list_contigs_and_exit:
            self.list_contigs()
            sys.exit()

        if not self.contigs_db_path:
            raise ConfigError("No contigs database, no profilin'. Bye.")

        # Initialize contigs db
        dbops.ContigsSuperclass.__init__(self,
                                         self.args,
                                         r=self.run,
                                         p=self.progress)
        self.init_contig_sequences()
        self.contig_names_in_contigs_db = set(self.contigs_basic_info.keys())

        self.bam = None
        self.contigs = []

        self.database_paths = {
            'CONTIGS.db': os.path.abspath(self.contigs_db_path)
        }

        self.profile_db_path = None

        self.clustering_configs = constants.clustering_configs[
            'blank' if self.blank else 'single']

        # following variable will be populated during the profiling, and its content will eventually
        # be stored in t.variable_nts_table_name
        self.variable_nts_table_entries = []

        # if genes are not called, yet the user is asking for codon frequencies to be profiled, we give
        # a warning and force-turn that flag off.
        if (not self.a_meta['genes_are_called']) and self.profile_SCVs:
            self.run.warning(
                "You asked the codon frequencies to be profiled, but genes were not called\
                              for your contigs database. Anvi'o is assigning `False` to the profile-codon-frequncies\
                              flag, overruling your request like a boss.")
            self.profile_SCVs = False

        # following variable will be populated while the variable positions table is computed
        self.codons_in_genes_to_profile_SCVs = set([])

        # we don't know what we are about
        self.description = None

        # additional layer data will be filled later
        self.layer_additional_keys = []
        self.layer_additional_data = {}
示例#12
0
文件: profiler.py 项目: ascendo/anvio
    def __init__(self, args):
        """Set up a profiler instance from the options carried by `args`.

        Args:
            args: object whose attributes hold the run options. They are read
                through ``args.__dict__``; an option that is missing there is
                treated as ``None``. The same object is handed to
                ``dbops.ContigsSuperclass.__init__``.

        Raises:
            ConfigError: if no contigs database path was given.
            SystemExit: right after ``self.list_contigs()`` when the
                `list_contigs` option is set.
        """
        self.args = args

        # missing options resolve to None instead of raising
        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.input_file_path = A('input_file')
        self.contigs_db_path = A('contigs_db')
        self.serialized_profile_path = A('serialized_profile')
        self.output_directory = A('output_dir')
        self.list_contigs_and_exit = A('list_contigs')
        self.min_contig_length = A('min_contig_length')
        self.min_mean_coverage = A('min_mean_coverage')
        self.min_coverage_for_variability = A('min_coverage_for_variability')
        self.contigs_shall_be_clustered = A('cluster_contigs')
        self.sample_id = A('sample_name')
        self.report_variability_full = A('report_variability_full')
        self.overwrite_output_destinations = A('overwrite_output_destinations')
        self.skip_SNV_profiling = A('skip_SNV_profiling')
        self.profile_AA_frequencies = A('profile_AA_frequencies')
        self.gen_serialized_profile = A('gen_serialized_profile')
        self.distance = A('distance') or constants.distance_metric_default
        self.linkage = A('linkage') or constants.linkage_method_default

        # make sure early on that both the distance and linkage are OK.
        clustering.is_distance_and_linkage_compatible(self.distance, self.linkage)

        # whether the profile database is a blank (without any BAM files or reads):
        self.blank = A('blank_profile')

        # a blank profile always gets its contigs clustered
        if self.blank:
            self.contigs_shall_be_clustered = True

        # read through `A` like every other option, so an args object that lacks
        # the attribute simply means "no contigs of interest"
        contigs_of_interest_path = A('contigs_of_interest')
        if contigs_of_interest_path:
            filesnpaths.is_file_exists(contigs_of_interest_path)
            # one contig name per line; blank lines and lines starting with '#'
            # are ignored. the file is closed as soon as it is consumed.
            with open(contigs_of_interest_path) as contigs_of_interest_file:
                self.contig_names_of_interest = set(line.strip() for line in contigs_of_interest_file
                                                    if line.strip() and not line.startswith('#'))
        else:
            self.contig_names_of_interest = None

        self.progress = terminal.Progress()
        self.run = terminal.Run(width=35)

        if self.list_contigs_and_exit:
            self.list_contigs()
            sys.exit()

        if not self.contigs_db_path:
            raise ConfigError("No contigs database, no profilin'. Bye.")

        # Initialize contigs db
        dbops.ContigsSuperclass.__init__(self, self.args, r=self.run, p=self.progress)
        self.init_contig_sequences()
        self.contig_names_in_contigs_db = set(self.contigs_basic_info.keys())

        self.bam = None
        self.contigs = {}

        self.database_paths = {'CONTIGS.db': self.contigs_db_path}

        self.profile_db_path = None

        self.clustering_configs = constants.clustering_configs['blank' if self.blank else 'single']

        self.atomic_contig_split_data = contigops.AtomicContigSplitData(self.progress)

        # following variable will be populated during the profiling, and its content will eventually
        # be stored in t.variable_nts_table_name
        self.variable_nts_table_entries = []

        # following variable will be populated while the variable positions table is computed
        self.codons_in_genes_to_profile_AA_frequencies = set()
示例#13
0
    def __init__(self, args, external_clustering=None):
        """Collect the options in `args` and load the data for the selected mode.

        The mode is resolved to one of 'manual', 'refine', 'collection' or
        'full', and the matching ``load_*_mode`` method is called. After that
        the split order is read from the default clustering, the view dicts
        are pruned to it, and the view data is converted to JSON.

        Args:
            args: object whose attributes hold the run options. They are read
                through ``args.__dict__``; an option that is missing there is
                treated as ``None``. The same object is handed to
                ``ContigsSuperclass.__init__`` and to the ``load_*_mode``
                methods.
            external_clustering: optional mapping with the keys
                'clusterings' and 'default_clustering'. When given, it
                replaces the clustering entries of ``self.p_meta``.

        Raises:
            ConfigError: if both a collection name and manual mode are
                requested, or if no clustering is available in
                ``self.p_meta`` once loading is done.
        """
        self.args = args
        self.views = {}
        self.states_table = None
        self.p_meta = {}
        self.title = 'Unknown Project'

        # missing options resolve to None instead of raising
        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.mode = A('mode')
        self.profile_db_path = A('profile_db')
        self.contigs_db_path = A('contigs_db')
        self.collection_name = A('collection_name')
        self.manual_mode = A('manual_mode')
        self.split_hmm_layers = A('split_hmm_layers')
        self.taxonomic_level = A('taxonomic_level')
        self.additional_layers_path = A('additional_layers')
        self.additional_view_path = A('additional_view')
        self.samples_information_db_path = A('samples_information_db')
        self.view = A('view')
        self.fasta_file = A('fasta_file')
        self.view_data_path = A('view_data')
        self.tree = A('tree')
        # NOTE(review): this replaces the 'Unknown Project' default assigned above
        # with None when no title option is supplied -- confirm that is intended.
        self.title = A('title')
        self.output_dir = A('output_dir')
        self.show_views = A('show_views')
        self.state_autoload = A('state_autoload')
        self.collection_autoload = A('collection_autoload')
        self.show_states = A('show_states')
        self.skip_check_names = A('skip_check_names')
        self.list_collections = A('list_collections')
        self.distance = A('distance') or constants.distance_metric_default
        self.linkage = A('linkage') or constants.linkage_method_default

        # make sure early on that both the distance and linkage are OK.
        clustering.is_distance_and_linkage_compatible(self.distance, self.linkage)

        self.split_names_ordered = None
        self.additional_layers = None
        self.auxiliary_profile_data_available = False

        self.samples_information_dict = {}
        self.samples_order_dict = {}
        self.samples_information_default_layer_order = {}

        # make sure the mode will be set properly
        if self.collection_name and self.manual_mode:
            raise ConfigError("You can't anvi-interactive in manual mode with a collection name.")

        self.external_clustering = external_clustering

        self.collections = ccollections.Collections()

        ContigsSuperclass.__init__(self, self.args)
        self.init_splits_taxonomy(self.taxonomic_level)

        if self.samples_information_db_path:
            samples_information_db = SamplesInformationDatabase(self.samples_information_db_path)
            self.samples_information_dict, self.samples_order_dict = \
                samples_information_db.get_samples_information_and_order_dicts()
            self.samples_information_default_layer_order = \
                samples_information_db.get_samples_information_default_layer_order()
            samples_information_db.disconnect()

        if self.contigs_db_path:
            self.completeness = Completeness(self.contigs_db_path)
            self.collections.populate_collections_dict(self.contigs_db_path, anvio.__contigs__version__)
        else:
            self.completeness = None

        # functions are initialized only when the option exists and is falsy
        if 'skip_init_functions' in args and not args.skip_init_functions:
            self.init_functions()

        # make sure we are not dealing with apples and oranges here.
        if self.contigs_db_path and self.profile_db_path:
            is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path)

        # resolves a file name against the output directory recorded in p_meta
        self.P = lambda x: os.path.join(self.p_meta['output_dir'], x)
        self.cwd = os.getcwd()

        # here is where the big deal stuff takes place:
        if not self.mode and self.manual_mode:
            self.mode = 'manual'
            self.run.info('Mode', self.mode, mc='red')
            self.load_manual_mode(args)
        elif self.mode == 'refine':
            self.load_full_mode(args)
        elif self.collection_name or self.list_collections:
            self.mode = 'collection'
            self.run.info('Mode', self.mode, mc='green')
            self.load_collection_mode(args)
        else:
            self.mode = 'full'
            self.load_full_mode(args)

        # make sure the samples information database, if there is one, is in fact compatible with the profile database
        # the reason we are doing this here is because when we are in 'self.manual_mode', the self.p_meta['samples'] is
        # being filled within the self.load_manual_mode function based on the headers of the view data.
        if self.profile_db_path and self.samples_information_db_path:
            is_profile_db_and_samples_db_compatible(self.profile_db_path,
                                                    self.samples_information_db_path,
                                                    manual_mode_exception=self.manual_mode)

        if self.external_clustering:
            self.p_meta['clusterings'] = self.clusterings = self.external_clustering['clusterings']
            # materialized as a list so the stored value has the same type on
            # Python 2 and Python 3 (where dict.keys() is a view)
            self.p_meta['available_clusterings'] = list(self.clusterings.keys())
            self.p_meta['default_clustering'] = self.external_clustering['default_clustering']

        # fall back to the entries named 'default' when nothing was requested
        if not self.state_autoload and 'default' in self.states_table.states:
            self.state_autoload = 'default'

        if not self.collection_autoload and 'default' in self.collections.collections_dict:
            self.collection_autoload = 'default'

        if not self.p_meta['clusterings']:
            if self.p_meta['merged']:
                raise ConfigError("This merged profile database does not seem to have any hierarchical clustering\
                                    of splits that is required by the interactive interface. It may have been generated\
                                    by anvi-merge with the `--skip-hierarchical-clustering` flag, or hierarchical\
                                    clustering step may have been skipped by anvi-merge because you had too many stplits\
                                    to get the clustering in a reasonable amount of time. Please read the help menu for\
                                    anvi-merge, and/or refer to the tutorial: \
                                    http://merenlab.org/2015/05/01/anvio-tutorial/#clustering-during-merging")

            else:
                raise ConfigError("This single profile database does not seem to have any hierarchical clustering\
                                    that is required by the interactive interface. You must use `--cluster-contigs`\
                                    flag for single profiles to access to this functionality. Please read the help\
                                    menu for anvi-profile, and/or refer to the tutorial.")

        # self.split_names_ordered is going to be the 'master' names list. everything else is going to
        # need to match these names:
        default_clustering = self.p_meta['clusterings'][self.p_meta['default_clustering']]
        self.split_names_ordered = utils.get_names_order_from_newick_tree(default_clustering['newick'])

        # now we know what splits we are interested in (self.split_names_ordered), we can get rid of all the
        # unnecessary splits stored in views dicts.
        self.prune_view_dicts()

        # if there are any HMM search results in the contigs database other than 'singlecopy' sources,
        # we would like to visualize them as additional layers. following function is inherited from
        # Contigs DB superclass and will fill self.hmm_searches_dict if appropriate data is found in
        # search tables:
        if self.mode == 'full':
            self.init_non_singlecopy_gene_hmm_sources(self.split_names_ordered,
                                                      return_each_gene_as_a_layer=self.split_hmm_layers)

        if self.additional_layers_path:
            filesnpaths.is_file_tab_delimited(self.additional_layers_path)
            self.additional_layers = self.additional_layers_path

        self.check_names_consistency()
        self.convert_view_data_into_json()
示例#14
0
    def __init__(self, args, external_clustering=None):
        """Collect the options in `args` and load the data for the selected mode.

        The mode is resolved to one of 'manual', 'refine', 'collection' or
        'full', and the matching ``load_*_mode`` method is called. After that
        the split order is read from the default clustering, the view dicts
        are pruned to it, and the view data is converted to JSON.

        Args:
            args: object whose attributes hold the run options. They are read
                through ``args.__dict__``; an option that is missing there is
                treated as ``None``. The same object is handed to
                ``ContigsSuperclass.__init__`` and to the ``load_*_mode``
                methods.
            external_clustering: optional mapping with the keys
                'clusterings' and 'default_clustering'. When given, it
                replaces the clustering entries of ``self.p_meta``.

        Raises:
            ConfigError: if both a collection name and manual mode are
                requested, or if no clustering is available in
                ``self.p_meta`` once loading is done.
        """
        self.args = args
        self.views = {}
        self.states_table = None
        self.p_meta = {}
        self.title = 'Unknown Project'

        # missing options resolve to None instead of raising
        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        self.mode = A('mode')
        self.profile_db_path = A('profile_db')
        self.contigs_db_path = A('contigs_db')
        self.collection_name = A('collection_name')
        self.manual_mode = A('manual_mode')
        self.split_hmm_layers = A('split_hmm_layers')
        self.additional_layers_path = A('additional_layers')
        self.additional_view_path = A('additional_view')
        self.samples_information_db_path = A('samples_information_db')
        self.view = A('view')
        self.fasta_file = A('fasta_file')
        self.view_data_path = A('view_data')
        self.tree = A('tree')
        # NOTE(review): this replaces the 'Unknown Project' default assigned above
        # with None when no title option is supplied -- confirm that is intended.
        self.title = A('title')
        self.output_dir = A('output_dir')
        self.show_views = A('show_views')
        self.state = A('state')
        self.show_states = A('show_states')
        self.skip_check_names = A('skip_check_names')
        self.list_collections = A('list_collections')
        self.distance = A('distance') or constants.distance_metric_default
        self.linkage = A('linkage') or constants.linkage_method_default

        # make sure early on that both the distance and linkage are OK.
        clustering.is_distance_and_linkage_compatible(self.distance, self.linkage)

        self.split_names_ordered = None
        self.additional_layers = None
        self.auxiliary_profile_data_available = False

        self.samples_information_dict = {}
        self.samples_order_dict = {}
        self.samples_information_default_layer_order = {}

        # make sure the mode will be set properly
        if self.collection_name and self.manual_mode:
            raise ConfigError("You can't anvi-interactive in manual mode with a collection name.")

        self.external_clustering = external_clustering

        self.collections = ccollections.Collections()

        ContigsSuperclass.__init__(self, self.args)
        self.init_splits_taxonomy()

        if self.samples_information_db_path:
            samples_information_db = SamplesInformationDatabase(self.samples_information_db_path)
            self.samples_information_dict, self.samples_order_dict = samples_information_db.get_samples_information_and_order_dicts()
            self.samples_information_default_layer_order = samples_information_db.get_samples_information_default_layer_order()
            samples_information_db.disconnect()

        if self.contigs_db_path:
            self.completeness = Completeness(self.contigs_db_path)
            self.collections.populate_collections_dict(self.contigs_db_path, anvio.__contigs__version__)
        else:
            self.completeness = None

        # functions are initialized only when the option exists and is falsy
        if 'skip_init_functions' in args and not args.skip_init_functions:
            self.init_functions()

        # make sure we are not dealing with apples and oranges here.
        if self.contigs_db_path and self.profile_db_path:
            is_profile_db_and_contigs_db_compatible(self.profile_db_path, self.contigs_db_path)

        # resolves a file name against the output directory recorded in p_meta
        self.P = lambda x: os.path.join(self.p_meta['output_dir'], x)
        self.cwd = os.getcwd()

        # here is where the big deal stuff takes place:
        if not self.mode and self.manual_mode:
            self.mode = 'manual'
            self.run.info('Mode', self.mode, mc='red')
            self.load_manual_mode(args)
        elif self.mode == 'refine':
            self.load_full_mode(args)
        elif self.collection_name or self.list_collections:
            self.mode = 'collection'
            self.run.info('Mode', self.mode, mc='green')
            self.load_collection_mode(args)
        else:
            self.mode = 'full'
            self.load_full_mode(args)

        # make sure the samples information database, if there is one, is in fact compatible with the profile database
        # the reason we are doing this here is because when we are in 'self.manual_mode', the self.p_meta['samples'] is
        # being filled within the self.load_manual_mode function based on the headers of the view data.
        if self.profile_db_path and self.samples_information_db_path:
            is_profile_db_and_samples_db_compatible(self.profile_db_path, self.samples_information_db_path, manual_mode_exception=self.manual_mode)

        if self.external_clustering:
            self.p_meta['clusterings'] = self.clusterings = self.external_clustering['clusterings']
            # materialized as a list so the stored value has the same type on
            # Python 2 and Python 3 (where dict.keys() is a view)
            self.p_meta['available_clusterings'] = list(self.clusterings.keys())
            self.p_meta['default_clustering'] = self.external_clustering['default_clustering']

        # fall back to the state named 'default' when none was requested
        if not self.state and 'default' in self.states_table.states:
            self.state = 'default'

        if not self.p_meta['clusterings']:
            if self.p_meta['merged']:
                raise ConfigError("This merged profile database does not seem to have any hierarchical clustering\
                                    of splits that is required by the interactive interface. It may have been generated\
                                    by anvi-merge with the `--skip-hierarchical-clustering` flag, or hierarchical\
                                    clustering step may have been skipped by anvi-merge because you had too many stplits\
                                    to get the clustering in a reasonable amount of time. Please read the help menu for\
                                    anvi-merge, and/or refer to the tutorial: \
                                    http://merenlab.org/2015/05/01/anvio-tutorial/#clustering-during-merging")
            else:
                raise ConfigError("This single profile database does not seem to have any hierarchical clustering\
                                    that is required by the interactive interface. You must use `--cluster-contigs`\
                                    flag for single profiles to access to this functionality. Please read the help\
                                    menu for anvi-profile, and/or refer to the tutorial.")

        # self.split_names_ordered is going to be the 'master' names list. everything else is going to
        # need to match these names:
        default_clustering = self.p_meta['clusterings'][self.p_meta['default_clustering']]
        self.split_names_ordered = utils.get_names_order_from_newick_tree(default_clustering['newick'])

        # now we know what splits we are interested in (self.split_names_ordered), we can get rid of all the
        # unnecessary splits stored in views dicts.
        self.prune_view_dicts()

        # if there are any HMM search results in the contigs database other than 'singlecopy' sources,
        # we would like to visualize them as additional layers. following function is inherited from
        # Contigs DB superclass and will fill self.hmm_searches_dict if appropriate data is found in
        # search tables:
        if self.mode == 'full':
            self.init_non_singlecopy_gene_hmm_sources(self.split_names_ordered, return_each_gene_as_a_layer=self.split_hmm_layers)

        if self.additional_layers_path:
            filesnpaths.is_file_tab_delimited(self.additional_layers_path)
            self.additional_layers = self.additional_layers_path

        self.check_names_consistency()
        self.convert_view_data_into_json()