Example #1
 def get_data_from_txt_file(self):
     """ Reads the coverage data from TAB delimited file """
     self.samples = set(
         utils.get_columns_of_TAB_delim_file(
             self.gene_coverages_data_file_path))
     self.gene_coverages = utils.get_TAB_delimited_file_as_dictionary(
         self.gene_coverages_data_file_path,
         column_mapping=[int] + [float] * len(self.samples))
     # checking if a gene_detection file was also supplied
     if self.gene_detection_data_file_path:
         self.gene_detection = utils.get_TAB_delimited_file_as_dictionary(
             self.gene_detection_data_file_path,
             column_mapping=[int] + [float] * len(self.samples))
         # making sure the tables are compatible; note we only check that gene_detection
         # contains everything in gene_coverages (and not vice versa)
         for gene_id in self.gene_coverages:
             if gene_id not in self.gene_detection:
                 raise ConfigError(
                     "Your tables are not compatible. For example gene_id %s is in %s, but not in %s"
                     % (gene_id, self.gene_coverages_data_file_path,
                        self.gene_detection_data_file_path))
         gene_detection_sample_list = next(
             iter(self.gene_detection.values())).keys()
         for sample_id in next(iter(self.gene_coverages.values())).keys():
             if sample_id not in gene_detection_sample_list:
                 raise ConfigError(
                     "Your tables are not compatible. For example sample_id %s is in %s, but not in %s"
                     % (sample_id, self.gene_coverages_data_file_path,
                        self.gene_detection_data_file_path))
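For orientation, here is a minimal sketch of what a loader like `utils.get_TAB_delimited_file_as_dictionary` presumably does with a coverage table of this shape: the first column (mapped with `int` above) becomes the outer dictionary key, and every remaining column (mapped with `float`) becomes a per-sample value. This stand-in is hypothetical and written for illustration only; it is not anvi'o's implementation, which supports many more options.

    import csv

    def tab_delimited_file_as_dict(path, key_type=int, value_type=float):
        # Hypothetical stand-in: returns {key: {column_name: value, ...}, ...},
        # keyed by the first column of a TAB-delimited file with a header row.
        with open(path) as f:
            reader = csv.DictReader(f, delimiter='\t')
            key_column = reader.fieldnames[0]
            data = {}
            for row in reader:
                key = key_type(row.pop(key_column))
                data[key] = {name: value_type(value) for name, value in row.items()}
        return data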
Example #2
    def init(self):
        self.progress.new('Initializing COGs Data')
        self.progress.update('Reading COG functions ...')

        if self.COG_version == 'COG14':
            self.cogs = utils.get_TAB_delimited_file_as_dictionary(self.essential_files['COG.txt'], no_header=True, column_names=['COG', 'categories', 'annotation'])
        elif self.COG_version == 'COG20':
            self.cogs = utils.get_TAB_delimited_file_as_dictionary(self.essential_files['COG.txt'], no_header=True, column_names=['COG', 'categories', 'annotation', 'pathway'])
        else:
            raise ConfigError("You need to edit all the if/else statements with COG version checks to ensure proper "
                              "parsing of a new generation of COG files.")


        self.progress.update('Reading COG categories ...')
        self.categories = utils.get_TAB_delimited_file_as_dictionary(self.essential_files['CATEGORIES.txt'], no_header=True, column_names=['category', 'description'])

        self.progress.update('Reading missing COG IDs ...')
        self.missing_cogs = dictio.read_serialized_object(self.essential_files['MISSING_COG_IDs.cPickle'])

        self.progress.end()

        for cog in self.cogs:
            self.cogs[cog]['categories'] = [c.strip() for c in self.cogs[cog]['categories'].split(',')]

        for cat in self.categories:
            self.categories[cat] = self.categories[cat]['description']

        self.initialized = True
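Given `no_header=True` and `column_names=['COG', 'categories', 'annotation']`, the COG.txt read above is presumably a headerless, TAB-delimited table with one COG per line, where the categories field may hold several comma-separated category letters (they are split on ',' in the loop above). A hypothetical couple of lines (columns are TAB-separated; accessions and annotations are illustrative, not taken from a real COG release):

    COG0001	H	Example annotation for a single-category COG
    COG0002	E,H	Example annotation for a multi-category COG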
Example #3
    def __init__(self, args = {}, p=progress, r=run):
        self.args = args

        self.run = r
        self.progress = p

        self.samples = None
        self.samples_information_dict = None
        self.data = None

        A = lambda x, t: t(args.__dict__[x]) if x in args.__dict__ else None
        null = lambda x: x
        self.input_file_path = A('input_file', null)
        self.samples_information_path = A('samples_information', null)
        self.max_num_unique_positions = A('max_num_unique_positions', int)
        self.output_file_path = A('output_file', null)

        filesnpaths.is_output_file_writable(self.output_file_path)

        if self.samples_information_path:
            filesnpaths.is_file_tab_delimited(self.samples_information_path)
            self.samples_information_dict = utils.get_TAB_delimited_file_as_dictionary(self.samples_information_path)
            num_attributes = len(next(iter(self.samples_information_dict.values())))

            self.run.info('samples_information', '%d attributes read for %d samples' % (num_attributes, len(self.samples_information_dict)))

        if self.input_file_path:
            filesnpaths.is_file_tab_delimited(self.input_file_path)
            self.progress.new('Reading the input file')
            self.progress.update('...')
            self.data = utils.get_TAB_delimited_file_as_dictionary(self.input_file_path)
            self.progress.end()

            self.run.info('input_file', '%d entries read' % len(self.data))
Example #4
    def init(self):
        self.progress.new('Initializing COGs Data')
        self.progress.update('Reading COG functions ...')
        self.cogs = utils.get_TAB_delimited_file_as_dictionary(
            self.essential_files['COG.txt'],
            no_header=True,
            column_names=['COG', 'categories', 'annotation'])

        self.progress.update('Reading COG categories ...')
        self.categories = utils.get_TAB_delimited_file_as_dictionary(
            self.essential_files['CATEGORIES.txt'],
            no_header=True,
            column_names=['category', 'description'])

        self.progress.update('Reading missing COG IDs ...')
        self.missing_cogs = dictio.read_serialized_object(
            self.essential_files['MISSING_COG_IDs.cPickle'])

        self.progress.end()

        for cog in self.cogs:
            self.cogs[cog]['categories'] = [
                c.strip() for c in self.cogs[cog]['categories'].split(',')
            ]

        for cat in self.categories:
            self.categories[cat] = self.categories[cat]['description']

        self.initialized = True
Example #5
    def read_genome_paths_from_input_files(self):
        """Reads internal and external genome files, populates self.genomes"""

        fields_for_internal_genomes_input = ['name', 'bin_id', 'collection_id', 'profile_db_path', 'contigs_db_path']
        fields_for_external_genomes_input = ['name', 'contigs_db_path']

        self.internal_genomes_dict = utils.get_TAB_delimited_file_as_dictionary(self.input_file_for_internal_genomes, expected_fields=fields_for_internal_genomes_input) if self.input_file_for_internal_genomes else {}
        self.external_genomes_dict = utils.get_TAB_delimited_file_as_dictionary(self.input_file_for_external_genomes, expected_fields=fields_for_external_genomes_input) if self.input_file_for_external_genomes else {}
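The `expected_fields` lists above pin down the required headers of these input files. A hypothetical external genomes file might therefore look like this (TAB-separated; names and paths are placeholders), with the internal genomes variant additionally carrying `bin_id`, `collection_id`, and `profile_db_path` columns:

    name	contigs_db_path
    genome_01	/path/to/genome_01-CONTIGS.db
    genome_02	/path/to/genome_02-CONTIGS.db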
Example #6
    def __init__(self, args=None, run=run, progress=progress):
        self.args = args
        self.run = run
        self.progress = progress

        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        input_file_for_internal_genomes = A('internal_genomes')
        input_file_for_external_genomes = A('external_genomes')
        self.num_threads = A('num_threads')
        self.output_dir = A('output_dir')
        self.overwrite_output_destinations = A('overwrite_output_destinations')
        self.debug = A('debug')
        self.min_percent_identity = A('min_percent_identity')
        self.PC_min_occurrence = A('min_occurrence')
        self.mcl_inflation = A('mcl_inflation')
        self.sensitive = A('sensitive')
        self.maxbit = A('maxbit')
        self.use_ncbi_blast = A('use_ncbi_blast')
        self.exclude_partial_gene_calls = A('exclude_partial_gene_calls')

        self.genomes = {}

        fields_for_internal_genomes_input = ['name', 'bin_id', 'collection_id', 'profile_db_path', 'contigs_db_path']
        fields_for_external_genomes_input = ['name', 'contigs_db_path']

        self.log_file_path = None

        internal_genomes_dict = utils.get_TAB_delimited_file_as_dictionary(input_file_for_internal_genomes, expected_fields=fields_for_internal_genomes_input) if input_file_for_internal_genomes else {}
        external_genomes_dict = utils.get_TAB_delimited_file_as_dictionary(input_file_for_external_genomes, expected_fields=fields_for_external_genomes_input) if input_file_for_external_genomes else {}

        self.internal_genome_names = list(internal_genomes_dict.keys())
        self.external_genome_names = list(external_genomes_dict.keys())

        if len(self.internal_genome_names) + len(self.external_genome_names) != len(set(self.internal_genome_names + self.external_genome_names)):
            raise ConfigError, "Each entry both in internal and external genome descriptions should have a unique 'name'. This does not\
                                seem to be the case with your input :/"

        # convert relative paths to absolute paths and MERGE internal and external genomes into self.genomes:
        for source, input_file in [(external_genomes_dict, input_file_for_external_genomes), (internal_genomes_dict, input_file_for_internal_genomes)]:
            for genome_name in source:
                self.genomes[genome_name] = source[genome_name]
                for db_path_var in ['contigs_db_path', 'profile_db_path']:
                    if db_path_var not in self.genomes[genome_name]:
                        continue
                    path = self.genomes[genome_name][db_path_var]
                    if not path.startswith('/'):
                        self.genomes[genome_name][db_path_var] = os.path.abspath(os.path.join(os.path.dirname(input_file), path))

        # to be filled during init:
        self.hash_to_genome_name = {}
        self.protein_sequences_dict = {}
        self.view_data = {}
        self.view_data_presence_absence = {}
        self.additional_view_data = {}
Example #7
    def save_samples_information(self, additional_description=''):
        if not self.samples_information_to_append:
            samples_information_column_titles = list(
                self.samples_information[next(iter(self.samples_information))])
            samples_information_dict = self.samples_information
        else:
            samples_information_column_titles = utils.get_columns_of_TAB_delim_file(
                self.samples_information_to_append)
            column_mapping = [str] * (len(samples_information_column_titles) + 2)
            self.run.warning(self.samples_information)
            samples_information_dict = utils.get_TAB_delimited_file_as_dictionary(
                self.samples_information_to_append,
                dict_to_append=self.samples_information,
                assign_none_for_missing=True,
                column_mapping=column_mapping)

        if additional_description:
            additional_description = '-' + additional_description

        samples_information_file_name = self.output_file_prefix + additional_description + '-samples-information.txt'
        utils.store_dict_as_TAB_delimited_file(
            samples_information_dict,
            samples_information_file_name,
            headers=['samples'] + samples_information_column_titles)
Example #8
    def process(self):
        self.sanity_check()

        self.run.info('Input metadata file', self.metadata_file_path)
        self.run.info('Output directory', self.output_directory_path)

        columns = utils.get_columns_of_TAB_delim_file(self.metadata_file_path)
        if 'organism_name' not in columns or 'local_filename' not in columns:
            raise ConfigError("The metadata file you provided does not look like a metadata\
                               file output from the program `ncbi-genome-download` :/ Why?\
                               Because anvi'o expects that file to have at least the following\
                               two columns in it: 'organism_name' and 'local_filename'.")

        metadata = utils.get_TAB_delimited_file_as_dictionary(self.metadata_file_path)

        for entry in metadata:
            if not os.path.exists(metadata[entry]['local_filename']):
                raise ConfigError("At least one of the files in your metadata input does not seem to be\
                                   where they think they are :/ Please make sure the entry %s and others\
                                   point to proper local file paths..." % entry)

        self.run.info('Num entries in metadata', len(metadata))

        output_fasta_dict = {}
        self.progress.new("GenBank to anvi'o", progress_total_items=len(metadata))
        for entry in metadata:
            self.progress.increment()
            self.progress.update('Processing %s ...' % entry)

            # set the organism name and accession id and clean them from weird
            # characters.
            organism_name = metadata[entry]['organism_name']
            for char in [c for c in organism_name if c not in OK_CHARS_FOR_ORGANISM_NAME]:
                organism_name = organism_name.replace(char, '_')

            accession_id = entry
            for char in [c for c in accession_id if c not in OK_CHARS_FOR_ACCESSION]:
                accession_id = accession_id.replace(char, '_')

            final_name = '_'.join([organism_name, accession_id])

            args = argparse.Namespace(input_genbank=metadata[entry]['local_filename'],
                                      output_file_prefix=os.path.join(self.output_directory_path, final_name))
            g = GenbankToAnvio(args, run=terminal.Run(verbose=False), progress=terminal.Progress(verbose=False))

            if final_name in output_fasta_dict:
                raise ConfigError("The final name '%s' for your genome has alrady been used by\
                                   another one :/ This should never happen unless your metadata\
                                   contains entries with identical accession numbers...")
            output_fasta_dict[final_name] = g.process()

        self.progress.end()

        headers = ['name', 'path']
        if not self.exclude_gene_calls_from_fasta_txt:
            headers.extend(['external_gene_calls', 'gene_functional_annotation'])

        utils.store_dict_as_TAB_delimited_file(output_fasta_dict, self.output_fasta_descriptor, headers=headers)

        self.run.info('Output FASTA descriptor', self.output_fasta_descriptor)
Example #9
 def load_collections(self):
     ''' Load the collections_txt file, run some sanity checks, and figure out params for anvi_import_collection'''
     collections = u.get_TAB_delimited_file_as_dictionary(self.collections_txt)
     bad_groups = [g for g in collections if g not in self.group_names]
     if bad_groups:
         raise ConfigError('Some of the names in your collections_txt \
                            file ("%s") don\'t match the names of the \
                            groups in your samples_txt/fasta_txt. \
                            Here are the names that don\'t match: %s. \
                            And here are the group names we expect to find: \
                            %s' % (self.collections_txt, ', '.join(bad_groups), ', '.join(self.group_names)))
     for group in collections:
         filesnpaths.is_file_exists(collections[group]['collection_file'])
         if not collections[group]['collection_name']:
             raise ConfigError('You must specify a name for each collection in your collections_txt')
         u.check_collection_name(collections[group]['collection_name'])
         if collections[group].get('bins_info'):
             filesnpaths.is_file_exists(collections[group]['bins_info'])
             collections[group]['bins_info'] = '--bins-info %s' % collections[group]['bins_info']
         else:
             collections[group]['bins_info'] = ''
         if collections[group].get('contigs_mode'):
             collections[group]['contigs_mode'] = '--contigs-mode'
         else:
             collections[group]['contigs_mode'] = ''
     self.collections = collections
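From the lookups in this function, each row of a collections_txt is keyed by a group name and must carry a `collection_file` and a `collection_name`, while `bins_info` and `contigs_mode` are optional. A hypothetical file (TAB-separated; the header of the first column is not checked by this code, so `name` is an assumption, and paths are placeholders):

    name	collection_file	collection_name	bins_info	contigs_mode
    group_01	/path/to/group_01-collection.txt	my_bins	/path/to/group_01-bins-info.txt	1
    group_02	/path/to/group_02-collection.txt	default_bins		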
Example #10
    def populate_from_file(self,
                           additional_data_file_path,
                           skip_check_names=None):

        if skip_check_names is None and utils.is_blank_profile(self.db_path):
            # FIXME: this BS is here because blank anvi'o profiles do not know what items they have,
            #        hence the utils.get_all_item_names_from_the_database function eventually explodes if we
            #        don't skip check names.
            skip_check_names = True

        filesnpaths.is_file_tab_delimited(additional_data_file_path)

        data_keys = utils.get_columns_of_TAB_delim_file(
            additional_data_file_path)
        data_dict = utils.get_TAB_delimited_file_as_dictionary(
            additional_data_file_path)

        if not len(data_keys):
            raise ConfigError("There is something wrong with the additional data file for %s at %s.\
                               It does not seem to have any additional keys for data :/" \
                                            % (self.target, additional_data_file_path))

        if self.target == 'layer_orders':
            OrderDataBaseClass.add(self, data_dict, skip_check_names)
        else:
            AdditionalDataBaseClass.add(self, data_dict, data_keys,
                                        skip_check_names)
Example #11
    def save_gene_class_information_in_additional_layers(
            self, additional_description=''):
        if not self.additional_layers_to_append:
            additional_column_titles = []
            additional_layers_dict = self.gene_class_information
        else:
            additional_column_titles = utils.get_columns_of_TAB_delim_file(
                self.additional_layers_to_append)
            additional_layers_dict = utils.get_TAB_delimited_file_as_dictionary(
                self.additional_layers_to_append,
                dict_to_append=self.gene_class_information,
                assign_none_for_missing=True,
                column_mapping=[int] + [str] * len(additional_column_titles))

        if additional_description:
            additional_description = '-' + additional_description

        additional_layers_file_name = self.output_file_prefix + additional_description + '-additional-layers.txt'
        headers = [
            'gene_callers_id', 'gene_class', 'number_of_detections',
            'portion_detected', 'gene_specificity',
            'gene_coverage_consistency', 'core_or_accessory', 'adjusted_mean',
            'adjusted_stds'
        ] + additional_column_titles

        utils.store_dict_as_TAB_delimited_file(additional_layers_dict,
                                               additional_layers_file_name,
                                               headers=headers)
Example #12
    def init_kraken(self):
        '''Making sure the sample names and file paths the provided kraken.txt file are valid'''
        kraken_txt = self.get_param_value_from_config('kraken_txt')
        self.run_krakenuniq = self.get_param_value_from_config(
            ['krakenuniq', 'run']) == True

        if kraken_txt:
            if self.get_param_value_from_config(['krakenuniq', 'run']) == True:
                raise ConfigError(
                    "You supplied a kraken_txt file (\"%s\") but you set krakenuniq \
                                   to run in the config file. anvi'o is confused and \
                                   is officially going on a strike. Ok, let's clarify, \
                                   having a kraken_txt file means you already ran krakenuniq \
                                   and want us to use those results, and yet you set krakenuniq \
                                   to run again? why? Ok, time to strike. Bye!"
                    % kraken_txt)

            # if a kraken_txt was supplied then let's run kraken by default
            self.run_krakenuniq = True

            kraken_annotation_dict = u.get_TAB_delimited_file_as_dictionary(
                kraken_txt)
            if next(iter(next(iter(
                    kraken_annotation_dict.values())).keys())) != "path":
                raise ConfigError(
                    "Your kraken annotation file, '%s', is not formatted properly \
                                   anvi'o expected it to have two columns only and the second column \
                                   should have a header 'path'." % kraken_txt)
            samples_in_kraken_txt = set(kraken_annotation_dict.keys())
            # get a list of the sample names
            sample_names = set(self.samples_information['sample'])

            wrong_samples_in_kraken_txt = samples_in_kraken_txt - sample_names
            if wrong_samples_in_kraken_txt:
                raise ConfigError(
                    "Your kraken annotation file, '%s', contains samples that \
                                   are not in your samples_txt file, '%s'. Here is an example \
                                   of such a sample: %s." %
                    (kraken_txt,
                     self.get_param_value_from_config('samples_txt'),
                     next(iter(wrong_samples_in_kraken_txt))))

            missing_samples_in_kraken_txt = sample_names - samples_in_kraken_txt
            if missing_samples_in_kraken_txt:
                raise ConfigError(
                    "Your kraken annotation file, '%s', is missing samples that \
                                   are in your samples_txt file, '%s'. This is not allowed. \
                                   Here is an example of such a sample: %s." %
                    (kraken_txt,
                     self.get_param_value_from_config('samples_txt'),
                     next(iter(missing_samples_in_kraken_txt))))
            self.kraken_annotation_dict = kraken_annotation_dict

        if self.get_param_value_from_config(['krakenuniq', 'run']):
            if not self.get_param_value_from_config(['krakenuniq', '--db']):
                raise ConfigError(
                    'In order to run krakenuniq, you must provide a path to \
                                   a database using the --db parameter in the config file.'
                )
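The checks above imply a kraken_txt with exactly two columns, the second headed `path`, and first-column values that match the sample names in samples_txt. The header of the first column is not verified by this code, so `sample` below is an assumption, and paths are placeholders:

    sample	path
    sample_01	/path/to/sample_01-kraken.txt
    sample_02	/path/to/sample_02-kraken.txt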
Example #13
 def load_catalog(self):
     catalog_path = os.path.join(self.pfam_data_dir, 'Pfam-A.clans.tsv')
     self.function_catalog = utils.get_TAB_delimited_file_as_dictionary(
         catalog_path,
         column_names=[
             'accession', 'clan', 'unknown_column1', 'unknown_column2',
             'function'
         ])
Example #14
def get(engine, run=run):
    data = {}

    if engine not in engines:
        raise ConfigError(
            "Anvi'o was about to populate the SSMs, but it does not know about the engine '%s'."
            % engine)

    dir_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), engine)
    substitution_matrix_paths = [
        s for s in glob.glob(os.path.join(dir_path, '*')) if s.endswith('.txt')
    ]

    for matrix_path in substitution_matrix_paths:
        matrix_id = os.path.basename(matrix_path).split('.txt')[0]

        matrix_rows = u.get_column_data_from_TAB_delim_file(
            matrix_path, column_indices=[0])[0][1:]
        matrix_columns = u.get_columns_of_TAB_delim_file(
            matrix_path, include_first_column=False)

        if sorted(matrix_columns) != sorted(matrix_rows):
            raise ConfigError("Anvi'o found a substitution scoring matrix named '%s'. However, it doesn't look like "
                              "a nicely done matrix. Substitution scoring matrices must contain the same row and column "
                              "names (i.e., a square matrix that is equal to its transpose). Well. This one does not :/" \
                                                    % (os.path.basename(matrix_path)))

        if engine == 'AA':
            expected_items = set(list(constants.amino_acids))
        elif engine == 'NT':
            expected_items = set(list(constants.nucleotides))
        elif engine == 'CDN':
            expected_items = set(list(constants.codons))

        unexpected_items_in_matrix = [
            item for item in matrix_columns if item not in expected_items
        ]
        if len(unexpected_items_in_matrix):
            raise ConfigError("It seems you have a poorly done substitution scoring matrix named '%s' in the data directory. "
                              "Anvi'o expects an %s substitution matrix to describe one or more of these %d guys: '%s'. But "
                              "the matrix %s had stuff anvi'o is not familiar with: '%s'." % \
                                            (matrix_id, engine, len(expected_items), ', '.join(expected_items),
                                             matrix_id, ', '.join(unexpected_items_in_matrix)))

        matrix_data = u.get_TAB_delimited_file_as_dictionary(
            matrix_path, column_mapping=[str] + [float] * len(expected_items))
        data[matrix_id] = matrix_data

    if len(data):
        run.warning('%d matri%s been loaded: "%s".' % \
                                    (len(data),
                                     'ces have' if len(data) > 1 else 'x has',
                                     ', '.join(list(data.keys()))),
                    header='%s substitution scoring matrices' % engine,
                    lc="green")

    return data
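The validation above implies that each substitution scoring matrix file is a square, TAB-delimited table whose first-column row labels match its column headers, all drawn from the engine's alphabet (amino acids, nucleotides, or codons). A hypothetical minimal NT matrix that would pass these checks (scores are illustrative; the header row starts with an empty first cell, and columns are TAB-separated):

    	A	C	G	T
    A	3.0	-2.0	-2.0	-2.0
    C	-2.0	3.0	-2.0	-2.0
    G	-2.0	-2.0	3.0	-2.0
    T	-2.0	-2.0	-2.0	3.0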
Example #15
    def load_from_files(self, args):
        if (not self.fasta_file) or (not self.metadata) or (not self.tree) or (not self.output_dir):
            raise ConfigError, "If you do not have a RUNINFO dict, you must declare each of\
                                           '-f', '-m', '-t' and '-o' parameters. Please see '--help' for\
                                           more detailed information on them."

        if self.view:
            raise ConfigError, "You can't use '-v' parameter when this program is not called with a RUNINFO.cp"

        if self.show_views:
            raise ConfigError, "Sorry, there are no views to show when there is no RUNINFO.cp :/"

        metadata_path = os.path.abspath(self.metadata)
        self.p_meta['splits_fasta'] = os.path.abspath(self.fasta_file)
        self.p_meta['output_dir'] = os.path.abspath(self.output_dir)
        self.p_meta['views'] = {}
        self.p_meta['default_view'] = 'single'
        self.p_meta['default_clustering'] = 'default'
        self.p_meta['available_clusterings'] = ['default']
        self.p_meta['clusterings'] = {'default': {'newick': open(os.path.abspath(self.tree)).read()}}

        self.default_view = self.p_meta['default_view']

        if self.summary_index:
            self.p_meta['profile_summary_index'] = os.path.abspath(self.summary_index)
            self.splits_summary_index = dictio.read_serialized_object(self.p_meta['profile_summary_index'])

        # sanity of the metadata
        filesnpaths.is_file_tab_delimited(metadata_path)
        metadata_columns = utils.get_columns_of_TAB_delim_file(metadata_path, include_first_column=True)
        if not metadata_columns[0] == "contig":
            raise ConfigError, "The first row of the first column of the metadata file must\
                                      say 'contig', which is not the case for your metadata file\
                                      ('%s'). Please make sure this is a properly formatted metadata\
                                      file." % (metadata_path)

        # store metadata as view:
        self.views[self.default_view] = {'header': metadata_columns[1:],
                                         'dict': utils.get_TAB_delimited_file_as_dictionary(metadata_path)}
        self.split_names_ordered = list(self.views[self.default_view]['dict'].keys())

        filesnpaths.is_file_fasta_formatted(self.p_meta['splits_fasta'])
        self.split_sequences = utils.get_FASTA_file_as_dictionary(self.p_meta['splits_fasta'])

        # setup a mock splits_basic_info dict
        self.splits_basic_info = {}
        for split_id in self.split_names_ordered:
            self.splits_basic_info[split_id] = {'length': len(self.split_sequences[split_id]),
                                                'gc_content': utils.get_GC_content_for_sequence(self.split_sequences[split_id])}

        # reminder: this is being stored in the output dir provided as a commandline parameter:
        self.p_meta['self_path'] = os.path.join(self.p_meta['output_dir'], 'RUNINFO.cp')

        if self.title:
            self.title = self.title

        filesnpaths.gen_output_directory(self.p_meta['output_dir'])
Example #16
    def init(self):
        super().init()

        fasta_txt_file = self.get_param_value_from_config('fasta_txt', repress_default=True)

        if fasta_txt_file:
            filesnpaths.is_file_exists(fasta_txt_file)
            self.contigs_information = u.get_TAB_delimited_file_as_dictionary(fasta_txt_file)
            self.fasta_information.update(self.contigs_information)
            self.group_names = list(self.contigs_information.keys())
            self.references_mode = True
Example #17
    def process_samples_order_file(self, samples_order_path):
        if not samples_order_path:
            return

        self.sample_names_in_samples_order_file = filesnpaths.is_proper_samples_order_file(samples_order_path)

        self.samples_order_dict = utils.get_TAB_delimited_file_as_dictionary(samples_order_path)

        self.available_orders = set(self.samples_order_dict.keys())

        self.run.info('Samples order', 'Loaded for %d attributes' % len(self.samples_order_dict))
Example #18
    def process(self, input_path, fasta_files):
        self.run.info('[sourmash] Kmer size', self.kmer_size, nl_before=1)
        self.run.info('[sourmash] Compression ratio', self.scale)

        report_name = 'kmer_%d_mash_similarity' % self.kmer_size

        # backup the old working directory before changing the directory
        old_wd = os.getcwd()
        os.chdir(input_path)
        if not os.path.exists('output'):
            os.mkdir('output')

        self.progress.new('Sourmash')
        self.progress.update('Computing fasta signatures for kmer=%d, scale=%d' % (self.kmer_size, self.scale))

        scale = '--scaled=%i' % self.scale
        compute_command = [self.program_name, 'compute',
                           '-k', self.kmer_size,
                           '-f', scale]
        compute_command.extend(fasta_files)

        exit_code = utils.run_command(compute_command, self.log_file_path, remove_log_file_if_exists=False)
        if int(exit_code):
            self.progress.end()
            raise ConfigError("sourmash returned with non-zero exit code, there may be some errors.\
                              Please check the log file `%s` for details. Offending command: \
                              `%s` ..." % (self.log_file_path, ' '.join([str(x) for x in compute_command[:7]])))

        self.progress.update('Computing similarity matrix for kmer=%d, scale=%d' % (self.kmer_size, self.scale))
        compare_command = [self.program_name, 'compare',
                           '-k', self.kmer_size,
                           '--csv', os.path.join('output', report_name + '.txt')]
        for f in fasta_files:
            compare_command.append(f + ".sig")

        exit_code = utils.run_command(compare_command, self.log_file_path, remove_log_file_if_exists=False)
        if int(exit_code):
            self.progress.end()
            raise ConfigError("sourmash returned with non-zero exit code, there may be some errors.\
                              Please check the log file `%s` for details. Offending command: \
                              `%s` ..." % (self.log_file_path, ' '.join([str(x) for x in compute_command[:7]])))

        self.results[report_name] = utils.get_TAB_delimited_file_as_dictionary(os.path.join('output', report_name + '.txt'),
                                                                               indexing_field=-1,
                                                                               separator=',')

        self.progress.end()

        # restore old working directory
        os.chdir(old_wd)

        return self.results
Example #19
    def process_samples_order_file(self, samples_order_path):
        if not samples_order_path:
            return

        self.sample_names_in_samples_order_file = filesnpaths.is_proper_samples_order_file(samples_order_path)

        self.samples_order_dict = utils.get_TAB_delimited_file_as_dictionary(samples_order_path)

        self.available_orders = set(self.samples_order_dict.keys())

        self.run.info('Samples order', 'Loaded for %d attributes' % len(self.samples_order_dict), quiet = self.quiet)
Example #20
    def init(self):
        self.progress.new('Initializing COGs Data')
        self.progress.update('Reading COG functions ...')
        self.cogs = utils.get_TAB_delimited_file_as_dictionary(self.essential_files['COG.txt'], no_header=True, column_names=['COG', 'categories', 'annotation'])

        self.progress.update('Reading COG categories ...')
        self.categories = utils.get_TAB_delimited_file_as_dictionary(self.essential_files['CATEGORIES.txt'], no_header=True, column_names=['category', 'description'])

        self.progress.update('Reading missing COG IDs ...')
        self.missing_cogs = dictio.read_serialized_object(self.essential_files['MISSING_COG_IDs.cPickle'])

        self.progress.end()

        for cog in self.cogs:
            self.cogs[cog]['categories'] = [c.strip() for c in self.cogs[cog]['categories'].split(',')]

        for cat in self.categories:
            self.categories[cat] = self.categories[cat]['description']

        self.initialized = True
Example #21
    def load_references_for_removal(self):
        """Load and perform some sanity checks on the references for removal"""
        self.references_for_removal = u.get_TAB_delimited_file_as_dictionary(
            self.references_for_removal_txt)
        # adding the references_for_removal to the fasta_information dict
        self.fasta_information.update(self.references_for_removal)

        for sample in self.references_for_removal.keys():
            try:
                u.check_sample_id(sample)
            except ConfigError as e:
                raise ConfigError(
                    "While processing the references for removal txt file ('%s'), anvi'o ran into the following error: "
                    "%s" % (self.samples_txt_file, e))

        files_that_end_with_gz = []
        for ref_dict in self.references_for_removal.values():
            if 'path' not in ref_dict:
                raise ConfigError(
                    'Your references for removal txt file is not formatted properly. It must have only two columns '
                    'with the headers "reference" and "path".')
            if ref_dict['path'].endswith('.gz'):
                filesnpaths.is_file_exists(ref_dict['path'])
                files_that_end_with_gz.append(ref_dict['path'])
            else:
                # if the file is not compressed then we can verify that it is a fasta file
                filesnpaths.is_file_fasta_formatted(ref_dict['path'])

        if files_that_end_with_gz:
            run.warning(
                'The following reference for removal files are compressed: %s. '
                'That\'s fine, but it means that we will skip the '
                'sanity check to verify that this is actually '
                'a properly formatted fasta file. Things are '
                'probably Ok, this is just one of these occasions '
                'in which anvi\'o is oversharing.' %
                ', '.join(files_that_end_with_gz))

        if self.references_mode:
            # Make sure that the user didn't give the same name to references and references_for_removal
            ref_name_in_both = [
                r for r in self.references_for_removal
                if r in self.contigs_information
            ]
            if ref_name_in_both:
                raise ConfigError(
                    'You must have unique names for your fasta files in your fasta txt file '
                    'and your references for removal txt file. These are the names that appear '
                    'in both: %s' % ', '.join(ref_name_in_both))
        dont_remove = self.get_param_value_from_config(
            ['remove_short_reads_based_on_references', 'dont_remove_just_map'])
        if not dont_remove:
            self.remove_short_reads_based_on_references = True
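Per the error message above, a references-for-removal file has exactly two columns headed `reference` and `path`, and `.gz` paths skip the FASTA format check. A hypothetical file (TAB-separated, placeholder paths):

    reference	path
    ref_genome_01	/path/to/ref_genome_01.fa
    ref_genome_02	/path/to/ref_genome_02.fa.gz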
Example #22
    def __init__(self, args={}, p=progress, r=run):
        self.args = args

        self.run = r
        self.progress = p

        self.samples = None
        self.samples_information_dict = None
        self.variable_nts_table = None

        A = lambda x, t: t(args.__dict__[x]) if x in args.__dict__ else None
        null = lambda x: x
        self.input_file_path = A('input_file', null)
        self.samples_information_path = A('samples_information', null)
        self.max_num_unique_positions = A('max_num_unique_positions', int)
        self.output_file_path = A('output_file', null)

        filesnpaths.is_output_file_writable(self.output_file_path)

        if self.samples_information_path:
            filesnpaths.is_file_tab_delimited(self.samples_information_path)
            self.samples_information_dict = utils.get_TAB_delimited_file_as_dictionary(
                self.samples_information_path)
            num_attributes = len(next(iter(self.samples_information_dict.values())))

            self.run.info(
                'samples_information', '%d attributes read for %d samples' %
                (num_attributes, len(self.samples_information_dict)))

        if self.input_file_path:
            filesnpaths.is_file_tab_delimited(self.input_file_path)
            self.progress.new('Reading the input file')
            self.progress.update('...')
            self.variable_nts_table = utils.get_TAB_delimited_file_as_dictionary(
                self.input_file_path)
            self.progress.end()

            self.run.info('input_file',
                          '%d entries read' % len(self.variable_nts_table))
Example #23
    def read_paths_from_input_file(self):
        """Reads metagenome files, populates self.metagenomes"""

        columns = utils.get_columns_of_TAB_delim_file(self.input_file_for_metagenomes)

        if 'profile_db_path' in columns:
            fields_for_metagenomes_input = ['name', 'contigs_db_path', 'profile_db_path']
            self.profile_dbs_available = True
        else:
            fields_for_metagenomes_input = ['name', 'contigs_db_path']
            self.profile_dbs_available = False

        self.metagenomes_dict = utils.get_TAB_delimited_file_as_dictionary(self.input_file_for_metagenomes, expected_fields=fields_for_metagenomes_input) if self.input_file_for_metagenomes else {}
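Following `fields_for_metagenomes_input`, a metagenomes input file needs at least `name` and `contigs_db_path` columns, with `profile_db_path` optional (its presence flips `self.profile_dbs_available`). A hypothetical file (TAB-separated, placeholder paths):

    name	contigs_db_path	profile_db_path
    metagenome_01	/path/to/m01-CONTIGS.db	/path/to/m01-PROFILE.db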
Example #24
    def populate_from_file(self, additional_data_file_path, skip_check_names=None):
        data_keys = utils.get_columns_of_TAB_delim_file(additional_data_file_path)
        data_dict = utils.get_TAB_delimited_file_as_dictionary(additional_data_file_path)

        if not len(data_keys):
            raise ConfigError("There is something wrong with the additional data file for %s at %s.\
                               It does not seem to have any additional keys for data :/" \
                                            % (self.target, additional_data_file_path))

        if self.target == 'layer_orders':
            OrderDataBaseClass.add(self, data_dict, skip_check_names)
        else:
            AdditionalDataBaseClass.add(self, data_dict, data_keys, skip_check_names)
Example #25
    def process_samples_information_file(self, samples_information_path):
        if not samples_information_path:
            return

        self.sample_names_in_samples_information_file = filesnpaths.is_proper_samples_information_file(
            samples_information_path)

        self.samples_information_dict, self.aliases_to_attributes_dict = self.convert_samples_information_dict(
            utils.get_TAB_delimited_file_as_dictionary(
                samples_information_path))

        self.run.info(
            'Samples information',
            'Loaded for %d samples' % len(self.samples_information_dict))
Example #26
    def run_command(self, input_path):
        # backup the old working directory before changing the directory
        old_wd = os.getcwd()
        os.chdir(input_path)

        full_command = [
            self.program_name, '--outdir', 'output', '--indir', input_path,
            '--method', self.method, '--workers', self.num_threads
        ]

        self.progress.new('PyANI')
        self.progress.update('Running ...')
        exit_code = utils.run_command(full_command, self.log_file_path)
        self.progress.end()

        if int(exit_code):
            raise ConfigError(
                "PyANI returned with non-zero exit code, there may be some errors. \
                              please check the log file for details.")

        output_matrix_names = ['alignment_coverage', 'alignment_lengths', 'hadamard', \
                               'percentage_identity', 'similarity_errors', 'correlations']

        full_matrix_path = lambda name: os.path.join(
            input_path, 'output', self.method + '_' + name + '.tab')

        matrices = {}
        for matrix_name in output_matrix_names:
            output_matrix_path = full_matrix_path(matrix_name)
            if os.path.exists(output_matrix_path):
                matrices[matrix_name] = utils.get_TAB_delimited_file_as_dictionary(
                    output_matrix_path, empty_header_columns_are_OK=True)

        if not len(matrices):
            raise ConfigError(
                "None of the output matrices pyANI was supposed to generate was found in the\
                               output directory :( You may find some clues in the log file?"
            )

        if not self.quiet:
            self.run.info_single("Output matrices for the following items are stored in the output\
                                  directory: %s <success kid meme.png>."                                                                         % \
                                            (', '.join(["'%s'" % m.replace('_', ' ') for m in matrices])), nl_before=1, mc='green')

        # restore old working directory
        os.chdir(old_wd)

        return matrices
Example #27
def get(engine, run=run):
    data = {}

    if engine not in engines:
        raise ConfigError("Anvi'o was about to populate the SSMs, but it does not know about the engine '%s'." % engine)

    dir_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), engine)
    substitution_matrix_paths = [s for s in glob.glob(os.path.join(dir_path, '*')) if s.endswith('.txt')]

    for matrix_path in substitution_matrix_paths:
        matrix_id = os.path.basename(matrix_path).split('.txt')[0]

        matrix_rows = u.get_column_data_from_TAB_delim_file(matrix_path, column_indices=[0])[0][1:]
        matrix_columns = u.get_columns_of_TAB_delim_file(matrix_path, include_first_column=False)

        if sorted(matrix_columns) != sorted(matrix_rows):
            raise ConfigError("Anvi'o found a substitution scoring matrix named '%s'. However, it doesn't look like\
                               a nicely done matrix. Substitution scoring matrices must contain the same row and column\
                               names (i.e., a square matrix that is equal to its transpose). Well. This one does not :/" \
                                                    % (os.path.basename(matrix_path)))

        if engine == 'AA':
            expected_items = set(list(constants.amino_acids))
        elif engine == 'NT':
            expected_items = set(list(constants.nucleotides))
        elif engine == 'CDN':
            expected_items = set(list(constants.codons))

        unexpected_items_in_matrix = [item for item in matrix_columns if item not in expected_items]
        if len(unexpected_items_in_matrix):
            raise ConfigError("It seems you have a poorly done substitution scoring matrix named '%s' in the data directory.\
                               Anvi'o expects an %s substitution matrix to describe one or more of these %d guys: '%s'. But\
                               the matrix %s had stuff anvi'o is not familiar with: '%s'." % \
                                            (matrix_id, engine, len(expected_items), ', '.join(expected_items),
                                             matrix_id, ', '.join(unexpected_items_in_matrix)))

        matrix_data = u.get_TAB_delimited_file_as_dictionary(matrix_path, column_mapping = [str] + [float] * len(expected_items))
        data[matrix_id] = matrix_data

    if len(data):
        run.warning('%d matri%s been loaded: "%s".' % \
                                    (len(data),
                                     'ces have' if len(data) > 1 else 'x has',
                                     ', '.join(list(data.keys()))),
                    header='%s substitution scoring matrices' % engine,
                    lc="green")

    return data
Example #28
    def process_samples_information_file(self, samples_information_path):
        if not samples_information_path:
            return

        self.sample_names_in_samples_information_file = filesnpaths.is_proper_samples_information_file(
            samples_information_path)

        self.samples_information_dict, self.aliases_to_attributes_dict = self.convert_samples_information_dict(
            utils.get_TAB_delimited_file_as_dictionary(
                samples_information_path))
        self.samples_information_default_layer_order = open(
            samples_information_path).readline().strip().split('\t')[1:]

        self.run.info(
            'Samples information',
            'Loaded for %d samples' % len(self.samples_information_dict))
Example #29
    def load_collections(self):
        ''' Load the collections_txt file, run some sanity checks, and figure out params for anvi_import_collection'''
        collections = u.get_TAB_delimited_file_as_dictionary(self.collections_txt)
        bad_groups = [g for g in collections if g not in self.group_names]
        if bad_groups:
                raise ConfigError('Some of the names in your collections_txt '
                                  'file ("%s") don\'t match the names of the '
                                  'groups in your samples_txt/fasta_txt. '
                                  'Here are the names that don\'t match: %s. '
                                  'And here are the group names we expect to find: '
                                  '%s' % (self.collections_txt, ', '.join(bad_groups), ', '.join(self.group_names)))
        for group in collections:
            default_collection = collections[group].get('default_collection')

            if default_collection:
                # User can specify either a default collection OR collection from file
                not_allowed_params = {'collection_name', 'collection_file', 'bins_info', 'contigs_mode'}
                if any([collections[group][key] for key in not_allowed_params if key in collections[group].keys()]):
                    raise ConfigError('We encountered the following problem with your '
                                      'collections_txt file ("%s"): you can choose '
                                      'either using a default collection OR importing '
                                      'a collection from a file. Yet, for "%s", you specificy '
                                      'a default collection AND also specify some of the following '
                                      'parameters: %s.' % (self.collections_txt, group, ", ".join(not_allowed_params)))

                collections[group]['collection_name'] = 'DEFAULT'
                collections[group]['contigs_mode'] = ''

            else:
                if not filesnpaths.is_file_exists(collections[group]['collection_file'], dont_raise=True):
                    raise ConfigError('We encountered the following problem with your '
                                      'collections_txt file ("%s"): you did not specify '
                                      'a valid collection file for "%s".' % (self.collections_txt, group))

                if not collections[group]['collection_name']:
                    raise ConfigError('You must specify a name for each collection in your collections_txt')
                u.check_collection_name(collections[group]['collection_name'])
                if collections[group].get('bins_info'):
                    filesnpaths.is_file_exists(collections[group]['bins_info'])
                    collections[group]['bins_info'] = '--bins-info %s' % collections[group]['bins_info']
                else:
                    collections[group]['bins_info'] = ''
                if collections[group].get('contigs_mode'):
                    collections[group]['contigs_mode'] = '--contigs-mode'
                else:
                    collections[group]['contigs_mode'] = ''
        self.collections = collections
Example #30
    def load_collections(self):
        ''' Load the collections_txt file, run some sanity checks, and figure out params for anvi_import_collection'''
        collections = u.get_TAB_delimited_file_as_dictionary(self.collections_txt)
        bad_groups = [g for g in collections if g not in self.group_names]
        if bad_groups:
                raise ConfigError('Some of the names in your collections_txt \
                                   file ("%s") don\'t match the names of the \
                                   groups in your samples_txt/fasta_txt. \
                                   Here are the names that don\'t match: %s. \
                                   And here are the group names we expect to find: \
                                   %s' % (self.collections_txt, ', '.join(bad_groups), ', '.join(self.group_names)))
        for group in collections:
            default_collection = collections[group].get('default_collection')

            if default_collection:
                # User can specify either a default collection OR collection from file
                not_allowed_params = {'collection_name', 'collection_file', 'bins_info', 'contigs_mode'}
                if any([collections[group][key] for key in not_allowed_params if key in collections[group].keys()]):
                    raise ConfigError('We encountered the following problem with your \
                                       collections_txt file ("%s"): you can choose \
                                       either using a default collection OR importing \
                                       a collection from a file. Yet, for "%s", you specificy \
                                       a default collection AND also specify some of the following \
                                       parameters: %s.' % (self.collections_txt, group, ", ".join(not_allowed_params)))

                collections[group]['collection_name'] = 'DEFAULT'
                collections[group]['contigs_mode'] = ''

            else:
                if not filesnpaths.is_file_exists(collections[group]['collection_file'], dont_raise=True):
                    raise ConfigError('We encountered the following problem with your \
                                       collections_txt file ("%s"): you did not specify \
                                       a valid collection file for "%s".' % (self.collections_txt, group))

                if not collections[group]['collection_name']:
                    raise ConfigError('You must specify a name for each collection in your collections_txt')
                u.check_collection_name(collections[group]['collection_name'])
                if collections[group].get('bins_info'):
                    filesnpaths.is_file_exists(collections[group]['bins_info'])
                    collections[group]['bins_info'] = '--bins-info %s' % collections[group]['bins_info']
                else:
                    collections[group]['bins_info'] = ''
                if collections[group].get('contigs_mode'):
                    collections[group]['contigs_mode'] = '--contigs-mode'
                else:
                    collections[group]['contigs_mode'] = ''
        self.collections = collections
Example #31
    def sanity_check_for_kraken(self):
        '''Making sure the sample names and file paths the provided kraken.txt file are valid'''
        kraken_txt = self.get_param_value_from_config('kraken_txt')

        if kraken_txt:
            if self.get_param_value_from_config(['krakenhll', 'run']) == False:
                raise ConfigError("You supplied a kraken_txt file, %s, but you set krakenhll \
                                   not to run in the config file. anvi'o is confused and \
                                   is officially going on a strike." % kraken_txt)

            if 'krakenhll' not in self.config:
                raise ConfigError('You provided a kraken_txt, but you didn\'t set any parameters \
                                   for krakenhll. As a minimum, you must provide the path to \
                                   the krakenhll database using the --db parameter in the config file.')

            # if a kraken_txt was supplied then let's run kraken by default
            self.config['krakenhll']['run'] = True

            kraken_annotation_dict = u.get_TAB_delimited_file_as_dictionary(kraken_txt)
            if next(iter(next(iter(kraken_annotation_dict.values())).keys())) != "path":
                raise ConfigError("Your kraken annotation file, '%s', is not formatted properly \
                                   anvi'o expected it to have two columns only and the second column \
                                   should have a header 'path'." % kraken_txt)
            samples_in_kraken_txt = set(kraken_annotation_dict.keys())
            # get a list of the sample names
            sample_names = set(self.samples_information['sample'])

            wrong_samples_in_kraken_txt = samples_in_kraken_txt - sample_names
            if wrong_samples_in_kraken_txt:
                raise ConfigError("Your kraken annotation file, '%s', contains samples that \
                                   are not in your samples_txt file, '%s'. Here is an example \
                                   of such a sample: %s." % (kraken_txt, self.get_param_value_from_config('samples_txt'), next(iter(wrong_samples_in_kraken_txt))))

            missing_samples_in_kraken_txt = sample_names - samples_in_kraken_txt
            if missing_samples_in_kraken_txt:
                raise ConfigError("Your kraken annotation file, '%s', is missing samples that \
                                   are in your samples_txt file, '%s'. This is not allowed. \
                                   Here is an example of such a sample: %s." % (kraken_txt, self.get_param_value_from_config('samples_txt'), wrong_samples_in_kraken_txt[0]))
            self.kraken_annotation_dict = kraken_annotation_dict

        if self.get_param_value_from_config(['krakenhll', 'run']):
            if not self.get_param_value_from_config(['krakenhll', '--db']):
                raise ConfigError('In order to run krakenhll, you must provide a path to \
                                   a database using the --db parameter in the config file.')
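For orientation: the kraken.txt file these checks expect is a two-column TAB-delimited file, with sample names in the first column and a second column headed 'path'. A minimal sketch of the dictionary the code above operates on, assuming made-up sample names and paths:

# A hypothetical kraken.txt:
#
#     sample       path
#     sample_01    /data/kraken/sample_01-kraken.txt
#     sample_02    /data/kraken/sample_02-kraken.txt
#
# get_TAB_delimited_file_as_dictionary() keys the result by the first column,
# so the sanity checks above effectively see:
kraken_annotation_dict = {
    'sample_01': {'path': '/data/kraken/sample_01-kraken.txt'},
    'sample_02': {'path': '/data/kraken/sample_02-kraken.txt'},
}

# which is why the header test reduces to inspecting the inner keys:
assert next(iter(next(iter(kraken_annotation_dict.values())).keys())) == 'path'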
Example No. 34
0
    def load_references_for_removal(self):
        """Load and perform some sanity checks on the references for removal"""
        self.references_for_removal = u.get_TAB_delimited_file_as_dictionary(self.references_for_removal_txt)
        # adding the references_for_removal to the fasta_information dict
        self.fasta_information.update(self.references_for_removal)

        for sample in self.references_for_removal.keys():
            try:
                u.check_sample_id(sample)
            except ConfigError as e:
                raise ConfigError("While processing the references for removal txt file ('%s'), anvi'o ran into the following error: \
                                   %s" % (self.samples_txt_file, e))

        files_that_end_with_gz = []
        for ref_dict in self.references_for_removal.values():
            if 'path' not in ref_dict:
                raise ConfigError('Your references for removal txt file is not formatted properly. It must have only two columns \
                                   with the headers "reference" and "path".')
            if ref_dict['path'].endswith('.gz'):
                filesnpaths.is_file_exists(ref_dict['path'])
                files_that_end_with_gz.append(ref_dict['path'])
            else:
                # if the file is not compressed then we can verify that it is a fasta file
                filesnpaths.is_file_fasta_formatted(ref_dict['path'])

        if files_that_end_with_gz:
            run.warning('The following reference for removal files are compressed: %s. \
                         That\'s fine, but it means that we will skip the \
                         sanity check to verify that this is actually \
                         a properly formatted fasta file. Things are \
                         probably Ok, this is just one of these occasions \
                         in which anvi\'o is oversharing.' % ', '.join(files_that_end_with_gz))

        if self.references_mode:
            # Make sure that the user didn't give the same name to references and references_for_removal
            ref_name_in_both = [r for r in self.references_for_removal if r in self.contigs_information]
            if ref_name_in_both:
                raise ConfigError('You must have unique names for your fasta files in your fasta txt file \
                                   and your references for removal txt file. These are the names that appear \
                                   in both: %s' % ', '.join(ref_name_in_both))
        dont_remove = self.get_param_value_from_config(['remove_short_reads_based_on_references', 'dont_remove_just_map'])
        if not dont_remove:
            self.remove_short_reads_based_on_references = True
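The references_for_removal_txt parsed above follows the same first-column-as-key convention; per the error message in the code, its two columns are headed 'reference' and 'path'. A hedged sketch of the resulting dictionary (reference names and paths invented), showing why the '.gz' branch only checks for existence while uncompressed files get the full FASTA check:

# A hypothetical references-for-removal.txt:
#
#     reference       path
#     human_genome    /refs/GRCh38.fa
#     phix            /refs/phix.fa.gz
#
references_for_removal = {
    'human_genome': {'path': '/refs/GRCh38.fa'},     # uncompressed: FASTA-checked
    'phix':         {'path': '/refs/phix.fa.gz'},    # compressed: existence-checked only
}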
Example No. 35
0
    def init_kraken(self):
        '''Make sure the sample names and file paths in the provided kraken.txt file are valid'''
        kraken_txt = self.get_param_value_from_config('kraken_txt')
        self.run_krakenuniq = self.get_param_value_from_config(['krakenuniq', 'run']) == True

        if kraken_txt:
            if self.get_param_value_from_config(['krakenuniq', 'run']) == True:
                raise ConfigError("You supplied a kraken_txt file (\"%s\") but you set krakenuniq \
                                   to run in the config file. anvi'o is confused and \
                                   is officially going on a strike. Ok, let's clarify, \
                                   having a kraken_txt file means you already ran krakenuniq \
                                   and want us to use those results, and yet you set krakenuniq \
                                   to run again? why? Ok, time to strike. Bye!" % kraken_txt)

            # if a kraken_txt was supplied then let's run kraken by default
            self.run_krakenuniq = True

            kraken_annotation_dict = u.get_TAB_delimited_file_as_dictionary(kraken_txt)
            if next(iter(next(iter(kraken_annotation_dict.values())).keys())) != "path":
                raise ConfigError("Your kraken annotation file, '%s', is not formatted properly \
                                   anvi'o expected it to have two columns only and the second column \
                                   should have a header 'path'." % kraken_txt)
            samples_in_kraken_txt = set(kraken_annotation_dict.keys())
            # get a list of the sample names
            sample_names = set(self.samples_information['sample'])

            wrong_samples_in_kraken_txt = samples_in_kraken_txt - sample_names
            if wrong_samples_in_kraken_txt:
                raise ConfigError("Your kraken annotation file, '%s', contains samples that \
                                   are not in your samples_txt file, '%s'. Here is an example \
                                   of such a sample: %s." % (kraken_txt, self.get_param_value_from_config('samples_txt'), next(iter(wrong_samples_in_kraken_txt))))

            missing_samples_in_kraken_txt = sample_names - samples_in_kraken_txt
            if missing_samples_in_kraken_txt:
                raise ConfigError("Your kraken annotation file, '%s', is missing samples that \
                                   are in your samples_txt file, '%s'. This is not allowed. \
                                   Here is an example of such a sample: %s." % (kraken_txt, self.get_param_value_from_config('samples_txt'), next(iter(missing_samples_in_kraken_txt))))
            self.kraken_annotation_dict = kraken_annotation_dict

        if self.get_param_value_from_config(['krakenuniq', 'run']):
            if not self.get_param_value_from_config(['krakenuniq', '--db']):
                raise ConfigError('In order to run krakenuniq, you must provide a path to \
                                   a database using the --db parameter in the config file.')
Example No. 36
0
    def run_command(self, input_path):
        # backup the old working directory before changing the directory
        old_wd = os.getcwd()
        os.chdir(input_path)

        full_command = [self.program_name,
                        '--outdir', 'output',
                        '--indir', input_path,
                        '--method', self.method,
                        '--workers', self.num_threads]

        self.progress.new('PyANI')
        self.progress.update('Running ...')
        exit_code = utils.run_command(full_command, self.log_file_path)
        self.progress.end()

        if int(exit_code):
            raise ConfigError("PyANI returned with non-zero exit code, there may be some errors. \
                              please check the log file for details.")

        output_matrix_names = ['alignment_coverage', 'alignment_lengths', 'hadamard', \
                              'percentage_identity', 'similarity_errors', 'correlations']

        full_matrix_path = lambda name: os.path.join(input_path, 'output', self.method + '_' + name + '.tab')

        matrices = {}
        for matrix_name in output_matrix_names:
            output_matrix_path = full_matrix_path(matrix_name)
            if os.path.exists(output_matrix_path):
                matrices[matrix_name] = utils.get_TAB_delimited_file_as_dictionary(output_matrix_path, empty_header_columns_are_OK=True)

        if not len(matrices):
            raise ConfigError("None of the output matrices pyANI was supposed to generate was found in the\
                               output directory :( You may find some clues in the log file?")

        self.run.info_single("Output matrices for the following items are stored in the output\
                              directory: %s <success kid meme.png>." % \
                                        (', '.join(["'%s'" % m.replace('_', ' ') for m in matrices])), nl_before=1, mc='green')

        # restore old working directory
        os.chdir(old_wd)

        return matrices
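A short usage note: the returned dictionary maps each matrix name to a dict-of-dicts keyed by the row and column headers of pyANI's .tab output, with string values since no column_mapping is passed. A hedged illustration of consuming the result (genome names and values invented):

# Hypothetical downstream use of the matrices returned above:
matrices = {
    'percentage_identity': {
        'genome_A': {'genome_A': '1.0', 'genome_B': '0.98'},
        'genome_B': {'genome_A': '0.98', 'genome_B': '1.0'},
    },
}

pid = float(matrices['percentage_identity']['genome_A']['genome_B'])
print('ANI between genome_A and genome_B: %.2f%%' % (pid * 100))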
Example No. 37
0
    def init(self):
        super().init()

        self.fasta_txt_file = self.get_param_value_from_config('fasta_txt')

        if self.fasta_txt_file:
            filesnpaths.is_file_exists(self.fasta_txt_file)
            self.contigs_information = u.get_TAB_delimited_file_as_dictionary(
                self.fasta_txt_file)
            self.fasta_information.update(self.contigs_information)
            self.group_names = list(self.contigs_information.keys())
            self.references_mode = True
            self.sanity_check_for_fasta_txt()

        self.sanity_check_contigs_project_name()

        # check and warn user regarding risky change of parameters with wildcards as default values
        self.warn_user_regarding_param_with_wildcard_default_value(
            'anvi_run_ncbi_cogs', '--temporary-dir-path', '{group}')
        self.warn_user_regarding_param_with_wildcard_default_value(
            'anvi_script_reformat_fasta', '--prefix', '{group}')
Example No. 38
0
    def init(self):
        super().init()

        self.fasta_txt_file = self.get_param_value_from_config(
            'fasta_txt', repress_default=True)

        if self.fasta_txt_file:
            filesnpaths.is_file_exists(self.fasta_txt_file)
            self.contigs_information = u.get_TAB_delimited_file_as_dictionary(
                self.fasta_txt_file)
            self.fasta_information.update(self.contigs_information)
            self.group_names = list(self.contigs_information.keys())
            self.references_mode = True
            self.sanity_check_for_fasta_txt()

        self.sanity_check_contigs_project_name()

        self.import_external_functions_flags = [os.path.join(self.dirs_dict["CONTIGS_DIR"],
                                                group + "-external-functions-imported.done")\
                                                for group in self.contigs_information \
                                                if self.contigs_information[group].get('gene_functional_annotation')]

        self.run_pfams = self.get_param_value_from_config(
            ['anvi_run_pfams', 'run'])
Example No. 39
0
    def parse_pubs_txt(self):
        if os.path.exists(self.pubs_info_file_path):
            self.info = u.get_TAB_delimited_file_as_dictionary(
                self.pubs_info_file_path)

        pubs_header = u.get_columns_of_TAB_delim_file(
            self.pubs_file_path, include_first_column=True)
        headers_expected = [
            'Authors', 'Title', 'Publication', 'Volume', 'Number', 'Pages',
            'Year', 'doi'
        ]
        missing_headers = [h for h in headers_expected if h not in pubs_header]
        if len(missing_headers):
            raise ConfigError(
                "Sorry, the pubs.txt seems to be missing some of the headers that are mandatory. Each of \
                               the columns in the following list must be present in this file: %s (hint: yours do not have\
                               the following: %s)." %
                (', '.join(headers_expected), ', '.join(missing_headers)))

        self.pubs_txt = u.get_TAB_delimited_file_as_dictionary(
            self.pubs_file_path, indexing_field=pubs_header.index('doi'))

        for doi in self.pubs_txt:
            authors = []
            co_first_authors = []
            co_senior_authors = []
            p = self.pubs_txt[doi]

            for author in [_.strip() for _ in p['Authors'].split(';')]:
                if not len(author):
                    continue

                author_last_name, author_first_name_raw = [
                    _.strip() for _ in author.split(',')
                ]
                author_first_name = ''.join(
                    [n[0] for n in author_first_name_raw.split()])
                author_final_name = '%s %s' % (author_last_name,
                                               author_first_name)

                if author_first_name_raw.endswith('*'):
                    co_first_authors.append(author_final_name)
                elif author_first_name_raw.endswith('+'):
                    co_senior_authors.append(author_final_name)

                authors.append(author_final_name)

            if p['Number']:
                issue = '%s(%s):%s' % (p['Volume'], p['Number'], p['Pages'])
            else:
                issue = '%s:%s' % (p['Volume'], p['Pages'])

            year = p['Year'].strip()
            pub_entry = {
                'authors': authors,
                'title': p['Title'],
                'journal': p['Publication'],
                'issue': issue,
                'doi': doi,
                'year': year,
                'co_first_authors': co_first_authors,
                'co_senior_authors': co_senior_authors
            }

            if year not in self.pubs_dict:
                self.pubs_dict[year] = [pub_entry]
            else:
                self.pubs_dict[year].append(pub_entry)
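The author handling above encodes a small convention: authors are separated by ';', each written as 'Last, First Middle', with a trailing '*' marking co-first authors and '+' marking co-senior authors; first and middle names are collapsed to initials. A standalone sketch of that logic (function name and sample input invented for illustration):

def parse_author_field(authors_field):
    """Collapse 'Last, First Middle' entries into 'Last FM' and flag co-authorship markers."""
    authors, co_first, co_senior = [], [], []
    for author in [a.strip() for a in authors_field.split(';') if a.strip()]:
        last, first_raw = [p.strip() for p in author.split(',')]
        initials = ''.join(n[0] for n in first_raw.split())
        final_name = '%s %s' % (last, initials)
        if first_raw.endswith('*'):
            co_first.append(final_name)
        elif first_raw.endswith('+'):
            co_senior.append(final_name)
        authors.append(final_name)
    return authors, co_first, co_senior

# parse_author_field('Doe, Jane A.*; Roe, Richard+') returns
# (['Doe JA', 'Roe R'], ['Doe JA'], ['Roe R'])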
Example No. 40
0
    def load_from_profile_database(self, args):
        if self.p_meta['version'] != anvio.__profile__version__:
            raise ConfigError("The profile database has a version number that differs from the version that is valid\
                               for this codebase (the profile database is at '%s', and the codebase is at '%s'). Very\
                               unfortunately, you need to re-profile and re-merge this project using the current anvi'o :("\
                                   % (self.p_meta['version'], anvio.__profile__version__))

        self.p_meta['self_path'] = self.profile_db_path
        self.p_meta['output_dir'] = os.path.join(os.getcwd(), os.path.dirname(self.profile_db_path))

        # create an instance of states table
        self.states_table = TablesForStates(self.profile_db_path, anvio.__profile__version__)

        # load views from the profile database
        self.load_views()
        self.default_view = self.p_meta['default_view']

        # if the user wants to see available views, show them and exit.
        if self.show_views:
            run.warning('', header = 'Available views (%d)' % len(self.views), lc = 'green')
            for view in self.views:
                run.info(view,
                         'Via "%s" table' % self.views[view]['table_name'],
                         lc='crimson',
                         mc='green' if view == self.default_view else 'crimson')
            print()
            sys.exit()

        # if the user has an additional view data, load it up into the self.views dict.
        if self.additional_view_path:
            filesnpaths.is_file_tab_delimited(self.additional_view_path)
            additional_view_columns = utils.get_columns_of_TAB_delim_file(self.additional_view_path)

            if not additional_view_columns[-1] == '__parent__':
                raise ConfigError("The last column of the additional view must be '__parent__' with the proper\
                                   parent information for each split.")

            column_mapping = [str] + [float] * (len(additional_view_columns) - 1) + [str]

            self.views['user_view'] = {'table_name': 'NA',
                                       'header': additional_view_columns,
                                       'dict': utils.get_TAB_delimited_file_as_dictionary(self.additional_view_path, column_mapping = column_mapping)}

        # if the user specifies a view, set it as default:
        if self.view:
            if self.view not in self.views:
                raise ConfigError("The requested view ('%s') is not available for this run. Please see\
                                   available views by running this program with the --show-views flag." % self.view)

            self.default_view = self.view

        self.p_meta['clusterings'] = self.clusterings 

        if self.tree:
            entry_id = os.path.basename(self.tree).split('.')[0]
            if not self.p_meta['clusterings']:
                self.p_meta['default_clustering'] = entry_id
                self.p_meta['available_clusterings'] = [entry_id]
                self.p_meta['clusterings'] = {entry_id: {'newick': open(os.path.abspath(self.tree)).read()}}
                run.info('Additional Tree', "Splits will be organized based on '%s'." % entry_id)
            else:
                self.p_meta['clusterings'][entry_id] = {'newick': open(os.path.abspath(self.tree)).read()}
                run.info('Additional Tree', "'%s' has been added to available trees." % entry_id)

        # is summary being overwritten?
        if self.summary_index:
            run.info('Warning', "The default summary index in RUNINFO is being overridden by '%s'." % self.summary_index)
            self.p_meta['profile_summary_index'] = os.path.abspath(self.summary_index)

        if os.path.exists(self.P('SUMMARY.cp')):
            self.splits_summary_index = dictio.read_serialized_object(self.P('SUMMARY.cp'))
        else:
            self.splits_summary_index = None
            run.warning("SUMMARY.cp is missing for your run. Anvi'o will continue working (well, at least\
                         it will attempt to do it), but things may behave badly with the absence of\
                         SUMMARY.cp (first and foremost, you will not be able to inspect individual\
                         contigs through any of the interactive interfaces). Please investigate it\
                         if you were not expecting this.")

        # set title
        if self.title:
            self.title = self.title + ' (%s)' % self.default_view
        else:
            self.title = self.p_meta['sample_id'] + ' (%s)' % self.default_view
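The column_mapping built for the additional view above is worth unpacking: get_columns_of_TAB_delim_file() returns the headers after the first column, so [str] + [float] * (n - 1) + [str] casts the split-name key as a string, every data column as a float, and the trailing '__parent__' column as a string. A toy illustration of how the mapping lines up with a row (values invented):

# Suppose the additional view file has columns: (split), sample_01, sample_02, __parent__
additional_view_columns = ['sample_01', 'sample_02', '__parent__']   # key column excluded
column_mapping = [str] + [float] * (len(additional_view_columns) - 1) + [str]

row = ['split_001', '4.20', '0.75', 'contig_001']
typed_row = [cast(value) for cast, value in zip(column_mapping, row)]
# -> ['split_001', 4.2, 0.75, 'contig_001']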
Example No. 41
0
    def load_catalog(self):
        catalog_path = os.path.join(self.pfam_data_dir, 'Pfam-A.clans.tsv')
        self.function_catalog = utils.get_TAB_delimited_file_as_dictionary(catalog_path,
            column_names=['accession', 'clan', 'unknown_column1', 'unknown_column2', 'function'])
Example No. 42
0
    def process_samples_information_file(self, samples_information_path):
        if not samples_information_path:
            return

        self.sample_names_in_samples_information_file = filesnpaths.is_proper_samples_information_file(samples_information_path)

        self.samples_information_dict, self.aliases_to_attributes_dict = self.convert_samples_information_dict(utils.get_TAB_delimited_file_as_dictionary(samples_information_path))
 
        self.run.info('Samples information', 'Loaded for %d samples' % len(self.samples_information_dict))
Example No. 43
0
    def parse_bookends_input(self):
        bad_entries = []

        if os.path.exists(self.pubs_info_file_path):
            self.info = u.get_TAB_delimited_file_as_dictionary(
                self.pubs_info_file_path)

        for line in [
                l.strip() for l in open(self.pubs_file_path).readlines()
        ]:
            if line.find('(ed.)') > 0 or line.find('(eds.)') > 0:
                bad_entries.append((line, 'ed/eds. found...'), )
                continue

            p_s = line.find(' (')
            p_e = p_s + 6
            if not p_s > 0:
                bad_entries.append((line, 'p_s <= 0...'), )
                continue
            if not line[p_e] == ')':
                bad_entries.append((line, 'p_e != )...'), )
                continue

            doi = None
            if line.split()[-1].strip().startswith('doi:'):
                doi = line.split('doi:')[1].strip()
                line = line.split('doi:')[0].strip()

            year = int(line[p_s + 2:p_e])

            if year < keep_pubs_after_year:
                bad_entries.append((line, 'year < keep_pubs_after_year...'), )
                continue

            authors = line[0:p_s]

            q_s = line.find(' "', p_e)
            if not q_s > 0:
                bad_entries.append((line, 'q_s <= 0...'), )
                continue
            q_e = line.find('."', q_s)

            if not q_e > 0:
                q_e = line.find('?"', q_s)
                if not q_e > 0:
                    bad_entries.append((line, 'q_e <= 0...'), )
                    continue

            title = line[q_s + 2:q_e + 1]

            c = line.find(', ', q_e + 2)
            if not c > 0:
                bad_entries.append((line, 'c <= 0...'), )
                continue

            journal = line[q_e + 3:c]

            issue = line[c + 2:-1]

            # ad hoc fixes for journal names
            for bad_form, good_form in journal_name_fixes:
                journal = journal.replace(bad_form, good_form)

            self.journals_list.append(journal)

            authors = authors.replace('Esen, Ö.,', 'Esen, Ö. C.,')
            authors = authors.replace('Murat Eren, A.,', 'Eren, A. M.,')

            if year not in self.pubs_dict:
                self.pubs_dict[year] = [{
                    'authors': authors,
                    'title': title,
                    'journal': journal,
                    'issue': issue,
                    'doi': doi,
                    'year': year
                }]
            else:
                self.pubs_dict[year].append({
                    'authors': authors,
                    'title': title,
                    'journal': journal,
                    'issue': issue,
                    'doi': doi,
                    'year': year
                })

            if authors.count(',') == 1:
                self.authors_list.append(authors)
                if year > 2004:
                    self.recent_authors_list.append(authors)
            else:
                for author in [
                        a + '.' if not a.endswith('.') else a
                        for a in authors.replace('and ', '').split('., ')
                ]:
                    self.authors_list.append(author)
                    if year > 2004:
                        self.recent_authors_list.append(author)

        # check for failed entries
        if len(bad_entries):
            print("Some entries failed. Quitting.")
            print()
            for tpl in bad_entries:
                print(' - Failed (reason: "%s"): %s' % (tpl[1], tpl[0]))

            sys.exit(-2)
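Taken together, the index arithmetic above implies input lines of roughly this shape (example invented; the trailing doi: token is optional):

# One line of the assumed Bookends export format:
line = 'Doe, J., Roe, R. (2017) "A fictional study of made-up microbes." Journal of Examples, 12(3):45-67. doi:10.0000/example'

# The parser anchors on fixed punctuation:
#   ' (' ... ')'   -> the four-digit year
#   ' "' ... '."'  -> the title (a '?"' ending is also accepted)
#   '." ' ... ', ' -> the journal name
#   ', ' ... '.'   -> the volume/issue/pages blob
#   'doi:'         -> an optional trailing doi
p_s = line.find(' (')               # start of the year parenthesis
year = int(line[p_s + 2:p_s + 6])   # -> 2017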
Example No. 44
0
    def load_manual_mode(self, args):
        if self.contigs_db_path:
            raise ConfigError("When you want to use the interactive interface in manual mode, you must\
                               not use a contigs database.")

        if not self.profile_db_path:
            raise ConfigError("Even when you want to use the interactive interface in manual mode, you need\
                               to declare a profile database. The profile database in this mode is only used to\
                               read or store the 'state' of the display for visualization purposes. You DO\
                               NOT need to point to an already existing database, as anvi'o will generate\
                               an empty one for you if there is no profile database.")

        if not self.tree:
            raise ConfigError("When you are running the interactive interface in manual mode, you must declare\
                               at least the tree file. Please see the documentation for help.")

        if self.view:
            raise ConfigError("You can't use the '--view' parameter when you are running the interactive interface\
                               in manual mode.")

        if self.show_views:
            raise ConfigError("Sorry, there are no views to show in manual mode :/")

        if self.show_states:
            raise ConfigError("Sorry, there are no states to show in manual mode :/")

        filesnpaths.is_file_exists(self.tree)
        tree = filesnpaths.is_proper_newick(self.tree)

        view_data_path = os.path.abspath(self.view_data_path) if self.view_data_path else None
        self.p_meta['splits_fasta'] = os.path.abspath(self.fasta_file) if self.fasta_file else None
        self.p_meta['output_dir'] = None
        self.p_meta['views'] = {}
        self.p_meta['merged'] = True
        self.p_meta['default_view'] = 'single'

        clustering_id = '%s:unknown:unknown' % filesnpaths.get_name_from_file_path(self.tree)
        self.p_meta['default_clustering'] = clustering_id
        self.p_meta['available_clusterings'] = [clustering_id]
        self.p_meta['clusterings'] = {clustering_id: {'newick': ''.join([l.strip() for l in open(os.path.abspath(self.tree)).readlines()])}}

        self.default_view = self.p_meta['default_view']

        if self.view_data_path:
            # sanity of the view data
            filesnpaths.is_file_tab_delimited(view_data_path)
            view_data_columns = utils.get_columns_of_TAB_delim_file(view_data_path, include_first_column=True)
            if not view_data_columns[0] == "contig":
                raise ConfigError("The first row of the first column of the view data file must\
                                   say 'contig', which is not the case for your view data file\
                                   ('%s'). Please make sure this is a properly formatted view data\
                                   file." % (view_data_path))

            # load view data as the default view:
            self.views[self.default_view] = {'header': view_data_columns[1:],
                                             'dict': utils.get_TAB_delimited_file_as_dictionary(view_data_path)}
        else:
            # no view data is provided... it is only the tree we have. We will create a mock 'view data dict'
            # here using what is in the tree.
            names_in_the_tree = [n.name for n in tree.get_leaves()]

            ad_hoc_dict = {}
            for item in names_in_the_tree:
                ad_hoc_dict[item] = {'names': item}

            self.views[self.default_view] = {'header': ['names'],
                                             'dict': ad_hoc_dict}

        self.split_names_ordered = list(self.views[self.default_view]['dict'].keys())

        # we assume that the sample names are the header of the view data, so we might as well set it up:
        self.p_meta['samples'] = self.views[self.default_view]['header']

        # if we have an input FASTA file, we will set up the split_sequences and splits_basic_info dicts,
        # otherwise we will leave them empty
        self.splits_basic_info = {}
        self.split_sequences = None
        if self.p_meta['splits_fasta']:
            filesnpaths.is_file_fasta_formatted(self.p_meta['splits_fasta'])
            self.split_sequences = utils.get_FASTA_file_as_dictionary(self.p_meta['splits_fasta'])

            names_missing_in_FASTA = set(self.split_names_ordered) - set(self.split_sequences.keys())
            num_names_missing_in_FASTA = len(names_missing_in_FASTA)
            if num_names_missing_in_FASTA:
                raise ConfigError('Some of the names in your view data do not have corresponding entries in the\
                                   FASTA file you provided. Here is an example of one of those %d names that occur\
                                   in your data file, but not in the FASTA file: "%s"' % (num_names_missing_in_FASTA, names_missing_in_FASTA.pop()))

            # setup a mock splits_basic_info dict
            for split_id in self.split_names_ordered:
                self.splits_basic_info[split_id] = {'length': len(self.split_sequences[split_id]),
                                                    'gc_content': utils.get_GC_content_for_sequence(self.split_sequences[split_id])}

        # create a new, empty profile database for manual operations
        if not os.path.exists(self.profile_db_path):
            profile_db = ProfileDatabase(self.profile_db_path)
            profile_db.create({'db_type': 'profile', 'merged': True, 'contigs_db_hash': None, 'samples': ','.join(self.p_meta['samples'])})

        # create an instance of states table
        self.states_table = TablesForStates(self.profile_db_path, anvio.__profile__version__)

        # also populate collections, if there are any
        self.collections.populate_collections_dict(self.profile_db_path, anvio.__profile__version__)

        if self.title:
            self.title = self.title
Example No. 45
0
    def convert_view_data_into_json(self):
        '''This function's name must change to something more meaningful.'''

        additional_layers_dict, additional_layer_headers = None, []
        if self.additional_layers_path:
            additional_layers_dict = utils.get_TAB_delimited_file_as_dictionary(self.additional_layers_path)
            additional_layer_headers = utils.get_columns_of_TAB_delim_file(self.additional_layers_path)

        for view in self.views:
            # here we will populate runinfo['views'] with json objects.
            view_dict = self.views[view]['dict']
            view_headers = self.views[view]['header']

            json_object = []

            # (1) set the header line with the first entry:
            json_header = ['contigs']

            # (2) add taxonomy, if exists:
            if len(self.splits_taxonomy_dict):
                json_header.extend(['taxonomy'])

            # (3) then add split summaries from contigs db, if exists
            if len(self.genes_in_splits_summary_dict):
                json_header.extend(self.genes_in_splits_summary_headers[1:])

            # (4) then add length and GC content
            basic_info_headers = ['length', 'gc_content']
            json_header.extend(basic_info_headers)

            # (5) then add the view!
            json_header.extend(view_headers)

            # (6) then add 'additional' headers as the outer ring:
            if additional_layer_headers:
                json_header.extend(additional_layer_headers)

            # (7) finally add hmm search results
            if self.hmm_searches_header:
                json_header.extend([tpl[0] for tpl in self.hmm_searches_header])

            # (8) and finalize it (yay):
            json_object.append(json_header)

            for split_name in view_dict:
                # (1)
                json_entry = [split_name]

                # (2)
                if self.splits_taxonomy_dict:
                    json_entry.extend([self.splits_taxonomy_dict[split_name]['t_species']])

                # (3)
                if self.genes_in_splits_summary_dict:
                    json_entry.extend([self.genes_in_splits_summary_dict[split_name][header] for header in self.genes_in_splits_summary_headers[1:]])

                # (4)
                json_entry.extend([self.splits_basic_info[split_name][header] for header in basic_info_headers])

                # (5) adding essential data for the view
                json_entry.extend([view_dict[split_name][header] for header in view_headers])

                # (6) adding additional layers
                json_entry.extend([additional_layers_dict[split_name][header] if split_name in additional_layers_dict else None for header in additional_layer_headers])

                # (7) adding hmm stuff
                if self.hmm_searches_dict:
                    if self.split_hmm_layers:
                        json_entry.extend([self.hmm_searches_dict[split_name][header] if split_name in self.hmm_searches_dict else None for header in [tpl[0] for tpl in self.hmm_searches_header]])
                    else:
                        json_entry.extend([len(self.hmm_searches_dict[split_name][header]) if split_name in self.hmm_searches_dict else 0 for header in [tpl[1] for tpl in self.hmm_searches_header]])

                # (8) send it along!
                json_object.append(json_entry)

            self.views[view] = json_object
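The net result per view is a list of lists in which row zero is the assembled header and every subsequent row lines up with it positionally. A sketch of the shape (column and value choices invented; which columns appear depends on the optional data loaded above):

# Shape of self.views[view] after conversion:
json_object = [
    ['contigs', 'taxonomy', 'length', 'gc_content', 'sample_01', 'sample_02'],
    ['split_001', 'Bacteroides', 2451, 0.43, 12.1, 8.7],
    ['split_002', None, 1800, 0.51, 3.3, 0.0],
]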
Example No. 46
0
    def load_from_anvio_files(self, args):
        if not self.contigs_db_path:
            raise ConfigError, "Anvi'o needs the contigs database to make sense of this run."

        ProfileSuperclass.__init__(self, args)

        # this is a weird place to do it, but we are going to ask ContigsSuperclass function to load
        # all the split sequences since only now we know the min_contig_length that was used to profile
        # this stuff
        self.init_split_sequences(self.p_meta['min_contig_length'])

        self.collections.populate_sources_dict(self.profile_db_path, anvio.__profile__version__)

        self.p_meta['self_path'] = self.profile_db_path
        self.p_meta['output_dir'] = os.path.join(os.getcwd(), os.path.dirname(self.profile_db_path))

        # create an instance of states table
        self.states_table = TablesForStates(self.profile_db_path, anvio.__profile__version__)

        # load views from the profile database
        self.load_views()
        self.default_view = self.p_meta['default_view']

        # if the user wants to see available views, show them and exit.
        if self.show_views:
            run.warning('', header = 'Available views (%d)' % len(self.views), lc = 'green')
            for view in self.views:
                run.info(view,
                         'Via "%s" table' % self.views[view]['table_name'],
                         lc='crimson',
                         mc='green' if view == self.default_view else 'crimson')
            print()
            sys.exit()

        if self.show_states:
            run.warning('', header = 'Available states (%d)' % len(self.states_table.states), lc = 'green')
            for state in self.states_table.states:
                run.info(state,
                         'Last modified %s' % self.states_table.states[state]['last_modified'],
                         lc='crimson',
                         mc='crimson')
            print()
            sys.exit()

        # if the user has an additional view data, load it up into the self.views dict.
        if self.additional_view_path:
            filesnpaths.is_file_tab_delimited(self.additional_view_path)
            additional_view_columns = utils.get_columns_of_TAB_delim_file(self.additional_view_path)

            if not additional_view_columns[-1] == '__parent__':
                raise ConfigError("The last column of the additional view must be '__parent__' with the proper\
                                   parent information for each split.")

            column_mapping = [str] + [float] * (len(additional_view_columns) - 1) + [str]

            self.views['user_view'] = {'table_name': 'NA',
                                       'header': additional_view_columns,
                                       'dict': utils.get_TAB_delimited_file_as_dictionary(self.additional_view_path, column_mapping = column_mapping)}

        # if the user specifies a view, set it as default:
        if self.view:
            if self.view not in self.views:
                raise ConfigError("The requested view ('%s') is not available for this run. Please see\
                                   available views by running this program with the --show-views flag." % self.view)

            self.default_view = self.view

        self.p_meta['clusterings'] = self.clusterings 

        if self.tree:
            entry_id = os.path.basename(self.tree).split('.')[0]
            if not self.p_meta['clusterings']:
                self.p_meta['default_clustering'] = entry_id
                self.p_meta['available_clusterings'] = [entry_id]
                self.p_meta['clusterings'] = {entry_id: {'newick': open(os.path.abspath(self.tree)).read()}}
                run.info('Additional Tree', "Splits will be organized based on '%s'." % entry_id)
            else:
                self.p_meta['clusterings'][entry_id] = {'newick': open(os.path.abspath(self.tree)).read()}
                run.info('Additional Tree', "'%s' has been added to available trees." % entry_id)

        # set title
        if self.title:
            self.title = self.title
        else:
            self.title = self.p_meta['sample_id'].replace('-', ' ').replace('_', ' ')

        # do we have auxiliary data available?
        if not self.auxiliary_data_available:
            summary_cp_available = os.path.exists(os.path.join(os.path.dirname(self.profile_db_path), 'SUMMARY.cp'))
            self.run.warning("Auxiliary data is not available; which means you will not be able to perform\
                              certain operations (i.e., the inspect menu in the interactive interface will\
                              not work, etc). %s" % ('' if not summary_cp_available else "Although, you have\
                              a SUMMARY.cp file in your work directory, which means you are working with an\
                              outdated anvi'o run. You can convert your SUMMARY.cp into an auxiliary data file\
                              by using `anvi-script-generate-auxiliary-data-from-summary-cp` script."))

        if self.state:
            if self.state not in self.states_table.states:
                raise ConfigError("The requested state ('%s') is not available for this run. Please see\
                                   available states by running this program with the --show-states flag." % self.state)
Example No. 47
0
    def load_from_user_files(self, args):
        if self.contigs_db_path:
            raise ConfigError("When you want to use the interactive interface in an ad hoc manner, you must\
                               not use a contigs database.")

        if not self.profile_db_path:
            raise ConfigError("Even when you want to use the interactive interface in an ad hoc manner by\
                               using the '--manual-mode' flag, you still need to declare a profile database.\
                               The profile database in this mode is only used to read or store the 'state' of\
                               the display for visualization purposes. You DO NOT need to point to an already\
                               existing database, as anvi'o will generate an empty one for you if there is no\
                               profile database.")

        if (not self.fasta_file) or (not self.view_data_path) or (not self.tree):
            raise ConfigError("When you are running the interactive interface in manual mode, you must declare\
                               each of the '-f', '-d', and '-t' parameters. Please see the help menu for more info.")

        if self.view:
            raise ConfigError("You can't use the '--view' parameter when you are running the interactive interface\
                               in manual mode.")

        if self.show_views:
            raise ConfigError("Sorry, there are no views to show in manual mode :/")

        if self.show_states:
            raise ConfigError("Sorry, there are no states to show in manual mode :/")


        view_data_path = os.path.abspath(self.view_data_path)
        self.p_meta['splits_fasta'] = os.path.abspath(self.fasta_file)
        self.p_meta['output_dir'] = None
        self.p_meta['views'] = {}
        self.p_meta['merged'] = True
        self.p_meta['default_view'] = 'single'
        self.p_meta['default_clustering'] = 'default'
        self.p_meta['available_clusterings'] = ['default']
        self.p_meta['clusterings'] = {'default': {'newick': open(os.path.abspath(self.tree)).read()}}

        self.default_view = self.p_meta['default_view']

        # sanity of the view data
        filesnpaths.is_file_tab_delimited(view_data_path)
        view_data_columns = utils.get_columns_of_TAB_delim_file(view_data_path, include_first_column=True)
        if not view_data_columns[0] == "contig":
            raise ConfigError("The first row of the first column of the view data file must\
                               say 'contig', which is not the case for your view data file\
                               ('%s'). Please make sure this is a properly formatted view data\
                               file." % (view_data_path))

        # load view data as the default view:
        self.views[self.default_view] = {'header': view_data_columns[1:],
                                         'dict': utils.get_TAB_delimited_file_as_dictionary(view_data_path)}
        self.split_names_ordered = list(self.views[self.default_view]['dict'].keys())

        # we assume that the sample names are the header of the view data, so we might as well set it up: 
        self.p_meta['samples'] = self.views[self.default_view]['header']

        filesnpaths.is_file_fasta_formatted(self.p_meta['splits_fasta'])
        self.split_sequences = utils.get_FASTA_file_as_dictionary(self.p_meta['splits_fasta'])

        # setup a mock splits_basic_info dict
        self.splits_basic_info = {}
        for split_id in self.split_names_ordered:
            self.splits_basic_info[split_id] = {'length': len(self.split_sequences[split_id]),
                                                'gc_content': utils.get_GC_content_for_sequence(self.split_sequences[split_id])}

        # create a new, empty profile database for ad hoc operations
        if not os.path.exists(self.profile_db_path):
            profile_db = ProfileDatabase(self.profile_db_path)
            profile_db.create({'db_type': 'profile', 'merged': True, 'contigs_db_hash': None, 'samples': ','.join(self.p_meta['samples'])})

        # create an instance of states table
        self.states_table = TablesForStates(self.profile_db_path, anvio.__profile__version__)

        # also populate collections, if there are any
        self.collections.populate_sources_dict(self.profile_db_path, anvio.__profile__version__)

        if self.title:
            self.title = self.title
Example No. 48
0
    def __init__(self, args=None, run=run, progress=progress):
        self.args = args
        self.run = run
        self.progress = progress

        A = lambda x: args.__dict__[x] if x in args.__dict__ else None
        input_file_for_internal_genomes = A('internal_genomes')
        input_file_for_external_genomes = A('external_genomes')
        self.num_threads = A('num_threads')
        self.output_dir = A('output_dir')
        self.overwrite_output_destinations = A('overwrite_output_destinations')
        self.debug = A('debug')
        self.min_percent_identity = A('min_percent_identity')
        self.PC_min_occurrence = A('min_occurrence')
        self.mcl_inflation = A('mcl_inflation')
        self.sensitive = A('sensitive')
        self.maxbit = A('maxbit')
        self.use_ncbi_blast = A('use_ncbi_blast')
        self.exclude_partial_gene_calls = A('exclude_partial_gene_calls')

        self.genomes = {}

        fields_for_internal_genomes_input = [
            'name', 'bin_id', 'collection_id', 'profile_db_path',
            'contigs_db_path'
        ]
        fields_for_external_genomes_input = ['name', 'contigs_db_path']

        self.log_file_path = None

        internal_genomes_dict = utils.get_TAB_delimited_file_as_dictionary(
            input_file_for_internal_genomes,
            expected_fields=fields_for_internal_genomes_input
        ) if input_file_for_internal_genomes else {}
        external_genomes_dict = utils.get_TAB_delimited_file_as_dictionary(
            input_file_for_external_genomes,
            expected_fields=fields_for_external_genomes_input
        ) if input_file_for_external_genomes else {}

        self.internal_genome_names = list(internal_genomes_dict.keys())
        self.external_genome_names = list(external_genomes_dict.keys())

        if len(self.internal_genome_names) + len(self.external_genome_names) != \
                len(set(self.internal_genome_names + self.external_genome_names)):
            raise ConfigError("Each entry both in internal and external genome descriptions should have a unique 'name'. This does not\
                               seem to be the case with your input :/")

        # convert relative paths to absolute paths and MERGE internal and external genomes into self.genomes:
        for source, input_file in [
            (external_genomes_dict, input_file_for_external_genomes),
            (internal_genomes_dict, input_file_for_internal_genomes)
        ]:
            for genome_name in source:
                self.genomes[genome_name] = source[genome_name]
                for db_path_var in ['contigs_db_path', 'profile_db_path']:
                    if db_path_var not in self.genomes[genome_name]:
                        continue
                    path = self.genomes[genome_name][db_path_var]
                    if not path.startswith('/'):
                        self.genomes[genome_name][
                            db_path_var] = os.path.abspath(
                                os.path.join(os.path.dirname(input_file),
                                             path))

        # to be filled during init:
        self.hash_to_genome_name = {}
        self.protein_sequences_dict = {}
        self.view_data = {}
        self.view_data_presence_absence = {}
        self.additional_view_data = {}
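One detail in the merge loop above deserves a note: relative database paths are resolved against the directory of the genomes file that declared them, not against the current working directory. A minimal sketch of that rule (paths invented):

import os

# Hypothetical: /project/genomes.txt lists a relative path 'dbs/G01-contigs.db'.
input_file = '/project/genomes.txt'
path = 'dbs/G01-contigs.db'

if not path.startswith('/'):
    path = os.path.abspath(os.path.join(os.path.dirname(input_file), path))

# -> '/project/dbs/G01-contigs.db', regardless of where the script is run from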
Example No. 49
0
    def use_external_gene_calls_to_populate_genes_in_contigs_table(self, input_file_path, gene_calls_dict=None, ignore_internal_stop_codons=False, skip_amino_acid_sequences=False):
        """Add genes to the contigs database.

           Either provide an `input_file_path` for external gene calls, or provide an
           external gene calls dictionary. The format should follow this:

                {
                  "1": {
                      "contig": "contig_name",
                      "start": 20,
                      "stop": 1544,
                      "direction": "f",
                      "partial": 0,
                      "source": "source_name",
                      "version": "unknown"
                  },

                  "2": {
                    (...)
                  },

                (...)
                }

            If you provide a `gene_calls_dict`, they will be APPENDED to the database. So you
            need to make sure the gene caller ids in your dict do not overlap with the ones in
            the database.

            By default this function will also attempt to add translated DNA sequences into the
            corresponding table per gene call, unless the `skip_amino_acid_sequences` flag is
            True. This may be useful if genes that are not translated are being added, such as
            ribosomal RNA genes, etc.
        """

        # by default we assume that this is a pristine run. but if the user sends a dictionary
        append_to_the_db = False

        gene_calls_found = False
        # let's do a rigorous check whether the user provided a gene_calls_dict.
        if (gene_calls_dict is not None and gene_calls_dict is not False):
            if not isinstance(gene_calls_dict, dict):
                raise ConfigError("'Use external gene calls' function received a non-empty gene_calls_dict object,\
                                    but it is of type '%s', and not '%s'" % (type(gene_calls_dict), type({})))

            # congrats, we have a dict.
            gene_calls_found = True

            if not len(gene_calls_dict):
                # but it is empty ... silly user.
                self.run.info_single("'Use external gene calls' function found an empty gene calls dict, returning\
                                      prematurely and assuming you know what's up. If you don't, stop here and try to\
                                      identify what decisions you've made that might have led you to this weird point in your\
                                      workflow (or 'life', totally up to you and your mood, but anvi'o thinks you've\
                                      done great so far).", nl_before=1, nl_after=1)
                return


        if (not input_file_path and not gene_calls_found) or (input_file_path and gene_calls_found):
            raise ConfigError("You must provide either an input file, or an gene calls dict to process external\
                               gene calls. You called `use_external_gene_calls_to_populate_genes_in_contigs_table`\
                               with wrong parameters.")

        Table.__init__(self, self.db_path, anvio.__contigs__version__, self.run, self.progress, simple=True)

        # take care of gene calls dict
        if not gene_calls_found:
            gene_calls_dict = utils.get_TAB_delimited_file_as_dictionary(input_file_path,
                                                                         expected_fields=t.genes_in_contigs_table_structure,
                                                                         only_expected_fields=True,
                                                                         column_mapping=[int, str, int, int, str, int, str, str])

            if not len(gene_calls_dict):
                raise ConfigError("You provided an external gene calls file, but it returned zero gene calls. Assuming that\
                                   this is an error, anvi'o will stop here and complain. If this is not an error and you\
                                   in fact expected this, the proper way of doing this is to use the `--skip-gene-calls` flag,\
                                   instead of providing an empty external gene calls file. You don't agree? You need this\
                                   for some weird step in your weird pipeline? Let us know, and we will consider changing\
                                   this.")

            self.run.info("External gene calls", "%d gene calls recovered and will be processed." % len(gene_calls_dict))
        else:
            # FIXME: we need to make sure the gene caller ids in the incoming directory is not going to
            #        overwrite an existing gene call. Something like this would have returned the
            #        current max, which could be cross-checked with what's in the dict:
            #
            #            contigs_db = ContigsDatabase(self.db_path)
            #            next_id = contigs_db.db.get_max_value_in_column('genes_in_contigs', 'gene_callers_id') + 1
            #            contigs_db.disconnect()
            append_to_the_db = True

        # recover amino acid sequences or create a blank dictionary
        if skip_amino_acid_sequences:
            amino_acid_sequences = dict([(g, '') for g in gene_calls_dict])
        else:
            amino_acid_sequences = self.get_amino_acid_sequences_for_genes_in_gene_calls_dict(gene_calls_dict, ignore_internal_stop_codons=ignore_internal_stop_codons)

        # populate genes_in_contigs, and gene_amino_acid_sequences table in contigs db.
        self.populate_genes_in_contigs_table(gene_calls_dict, amino_acid_sequences, append_to_the_db=append_to_the_db)
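For reference, the external gene calls file parsed above must carry exactly the fields of t.genes_in_contigs_table_structure, converted with column_mapping=[int, str, int, int, str, int, str, str]. A hedged example of such a file (headers inferred from the mapping and the later variant's docstring; contig and source names invented; note this older snippet predates the call_type column):

# Columns are TAB-separated in the real file; shown space-aligned here:
#
#     gene_callers_id  contig     start  stop  direction  partial  source    version
#     1                contig_01  1113   1677  f          0        prodigal  v2.6.3
#     2                contig_01  1698   2508  f          1        prodigal  v2.6.3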
Example No. 50
0
    def process_samples_information_file(self, samples_information_path):
        if not samples_information_path:
            return

        self.sample_names_in_samples_information_file = filesnpaths.is_proper_samples_information_file(samples_information_path)

        self.samples_information_dict, self.aliases_to_attributes_dict = self.convert_samples_information_dict(utils.get_TAB_delimited_file_as_dictionary(samples_information_path))
        self.samples_information_default_layer_order = open(samples_information_path).readline().strip().split('\t')[1:]
 
        self.run.info('Samples information', 'Loaded for %d samples' % len(self.samples_information_dict))
Example No. 51
0
    def predict_from_TAB_delimited_file(self, file_path):
        cols = utils.get_columns_of_TAB_delim_file(file_path)
        return self.predict(utils.get_TAB_delimited_file_as_dictionary(file_path, column_mapping=[str] + [float] * len(cols)))
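The mapping here leans on get_columns_of_TAB_delim_file() returning every header except the first: the implicit first column (the key) maps to str, and each remaining column to float, giving one caster per physical column. A toy example (header names invented):

cols = ['coverage', 'detection', 'gc_content']   # headers minus the key column
column_mapping = [str] + [float] * len(cols)
# -> [str, float, float, float]: the key stays a string, every data column becomes a float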
Example No. 52
0
    def predict_from_TAB_delimited_file(self, file_path):
        cols = utils.get_columns_of_TAB_delim_file(file_path)
        return self.predict(
            utils.get_TAB_delimited_file_as_dictionary(file_path,
                                                       column_mapping=[str] + [float] * len(cols)))
Example No. 53
0
    def use_external_gene_calls_to_populate_genes_in_contigs_table(
            self,
            input_file_path,
            gene_calls_dict=None,
            ignore_internal_stop_codons=False,
            skip_predict_frame=False,
            skip_amino_acid_sequences=False):
        """Add genes to the contigs database.

        Primary input is either an `input_file_path` for external gene calls, or an
        external `gene_calls_dict` dictionary object.

        Parameters
        ==========
        input_file_path : str
            Path to file with one of the following structures.

            Option 1:
                gene_callers_id contig          start  stop  direction  partial  call_type  source    version
                0               CACHJY01_00016  0      693   r          1        1          prodigal  v2.6.3
                1               CACHJY01_00016  711    1140  r          0        1          prodigal  v2.6.3

            Option 2:
                gene_callers_id contig          start  stop  direction  partial  call_type  source    version  aa_sequence
                0               CACHJY01_00016  0      693   r          1        1          prodigal  v2.6.3   MSKKIYFTEYSKVNRLQTISNFTGSA
                1               CACHJY01_00016  711    1140  r          0        1          prodigal  v2.6.3   MVNVDYHGLIAGAGSGKTKVLTSRIAHIIK

        gene_calls_dict : dict, None
            Alternative to `input_file_path`. If provided, entries will be APPENDED to the database.
            So you need to make sure the gene caller ids in your dict do not overlap with the ones in
            the database. Should look like:

                {
                    "1": {
                        "contig": "contig_name",
                        "start": 20,
                        "stop": 1544,
                        "direction": "f",
                        "partial": 0,
                        "call_type": 1,
                        "source": "source_name",
                        "version": "unknown",
                        "aa_sequence": "MSKKIYFTEYSKVNRLQTISNFTGSA"
                    },

                    "2": {
                      (...)
                    },

                    (...)
                }

            All entries are required except "aa_sequence", which is optional. If provided, it should
            be present for ALL entries, even if it is an empty string. Its presence will be used to
            populate `gene_amino_acid_sequences`.

        ignore_internal_stop_codons : bool, False
            If False, ConfigError will be raised if a stop codon is found inside any gene. If True,
            this is suppressed and the stop codon is replaced with the character `X`.

        skip_predict_frame : bool, False
            If True, ConfigError will be raised if a gene is not divisible by 3. If False, anvi'o predicts
            the most likely open reading frame and trims the start/stop of the gene call to reflect this
            change so that the gene *is* divisible by 3. This flag allows the retention of amino acid
            sequences even if genes are not divisible by 3, or when it is flagged as partial.

        skip_amino_acid_sequences : bool, False
            Should the gene_amino_acid_sequences table be populated? This may be useful if genes
            that are not translated are being added, such as ribosomal RNA genes, etc.
        """

        # by default we assume that this is a pristine run. but if the user sends a dictionary
        append_to_the_db = False

        gene_calls_found = False
        # let's do a rigorous check whether the user provided a gene_calls_dict.
        if (gene_calls_dict is not None and gene_calls_dict is not False):
            if not isinstance(gene_calls_dict, dict):
                raise ConfigError(
                    "'Use external gene calls' function received a non-empty gene_calls_dict object,\
                                    but it is of type '%s', and not '%s'" %
                    (type(gene_calls_dict), type({})))

            # congrats, we have a dict.
            gene_calls_found = True

            has_aa_seq = lambda x: 'aa_sequence' in x
            num_with_aa_seqs = sum([
                has_aa_seq(gene_call)
                for gene_call in gene_calls_dict.values()
            ])
            num_gene_calls = len(gene_calls_dict)
            if num_with_aa_seqs != 0 and num_with_aa_seqs != num_gene_calls:
                raise ConfigError(
                    "The gene_calls_dict passed to use_external_gene_calls_to_populate_genes_in_contigs_table "
                    "has %d entries with 'aa_sequence' and %d without. Either 0 or all (%d) should have "
                    "'aa_sequence'" % (num_with_aa_seqs, num_gene_calls -
                                       num_with_aa_seqs, num_gene_calls))

            if not len(gene_calls_dict):
                # but it is empty ... silly user.
                self.run.info_single(
                    "'Use external gene calls' function found an empty gene calls dict, returning "
                    "prematurely and assuming you know what's up. If you don't, stop here and try to "
                    "identify what decisions you've made that might have led you to this weird point in "
                    "your workflow (or 'life', totally up to you and your mood, but anvi'o thinks you've "
                    "done great so far).",
                    nl_before=1,
                    nl_after=1)
                return

        if (not input_file_path
                and not gene_calls_found) or (input_file_path
                                              and gene_calls_found):
            raise ConfigError(
                "You must provide either an input file or a gene calls dict to process external "
                "gene calls. You called `use_external_gene_calls_to_populate_genes_in_contigs_table` "
                "with the wrong parameters.")

        Table.__init__(self,
                       self.db_path,
                       anvio.__contigs__version__,
                       self.run,
                       self.progress,
                       simple=True)

        # take care of gene calls dict
        if not gene_calls_found:
            expected_fields = t.genes_in_contigs_table_structure
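            # one converter per column of the format shown in the docstring:
            # gene_callers_id, contig, start, stop, direction, partial, call_type, source, version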
            column_mapping = [int, str, int, int, str, int, int, str, str]

            if 'aa_sequence' in utils.get_columns_of_TAB_delim_file(
                    input_file_path):
                expected_fields = t.genes_in_contigs_table_structure + [
                    'aa_sequence'
                ]
                column_mapping.append(
                    lambda x: ''
                    if x is None else str(x))  # str(None) is 'None', amazingly

            gene_calls_dict = utils.get_TAB_delimited_file_as_dictionary(
                input_file_path,
                expected_fields=expected_fields,
                only_expected_fields=True,
                column_mapping=column_mapping)

            if not len(gene_calls_dict):
                raise ConfigError(
                    "You provided an external gene calls file, but it returned zero gene calls. Assuming "
                    "that this is an error, anvi'o will stop here and complain. If this is not an error "
                    "and you in fact expected this, the proper way of doing it is to use the "
                    "`--skip-gene-calls` flag instead of providing an empty external gene calls file. You "
                    "don't agree? You need this for some weird step in your weird pipeline? Let us know, "
                    "and we will consider changing this.")

            self.run.info(
                "External gene calls",
                "%d gene calls recovered and will be processed." %
                len(gene_calls_dict))
        else:
            # FIXME: we need to make sure the gene caller ids in the incoming dictionary are not going
            #        to overwrite an existing gene call. Something like this would have returned the
            #        current max, which could be cross-checked with what's in the dict:
            #
            #            contigs_db = ContigsDatabase(self.db_path)
            #            next_id = contigs_db.db.get_max_value_in_column('genes_in_contigs', 'gene_callers_id') + 1
            #            contigs_db.disconnect()
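            #
            #        Any id in gene_calls_dict smaller than `next_id` could then be flagged as a collision.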
            append_to_the_db = True

        # recover amino acid sequences or create a blank dictionary
        if skip_amino_acid_sequences:
            amino_acid_sequences = {g: '' for g in gene_calls_dict}
        else:
            gene_calls_dict, amino_acid_sequences = self.get_amino_acid_sequences_for_genes_in_gene_calls_dict(
                gene_calls_dict,
                ignore_internal_stop_codons=ignore_internal_stop_codons,
                skip_predict_frame=skip_predict_frame,
            )

        # populate genes_in_contigs, and gene_amino_acid_sequences table in contigs db.
        self.populate_genes_in_contigs_table(gene_calls_dict,
                                             amino_acid_sequences,
                                             append_to_the_db=append_to_the_db)
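
For orientation, here is a minimal sketch of how this method might be driven from a script. The `TablesForGeneCalls` class name, its constructor signature, and the file paths are assumptions made for illustration; only the dict values are taken from the docstring's "Option 2" example above:

from anvio.tables.genecalls import TablesForGeneCalls  # assumed import path

# hypothetical paths; replace with your own contigs database and FASTA
tables = TablesForGeneCalls('CONTIGS.db', contigs_fasta='contigs.fa')

gene_calls_dict = {
    "1": {"contig": "CACHJY01_00016", "start": 0, "stop": 693,
          "direction": "r", "partial": 1, "call_type": 1,
          "source": "prodigal", "version": "v2.6.3",
          "aa_sequence": "MSKKIYFTEYSKVNRLQTISNFTGSA"},
}

tables.use_external_gene_calls_to_populate_genes_in_contigs_table(
    input_file_path=None,
    gene_calls_dict=gene_calls_dict,
    ignore_internal_stop_codons=True)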
Exemplo n.º 54
0
    def use_external_gene_calls_to_populate_genes_in_contigs_table(self, input_file_path, gene_calls_dict=None, ignore_internal_stop_codons=False):
        """Add genes to the contigs database.

           Either provide an `input_file_path` for external gene calls, or provide an
           external gene calls dictionary. The format should follow this:

                {
                  "1": {
                      "contig": "contig_name",
                      "start": 20,
                      "stop": 1544,
                      "direction": "f",
                      "partial": 0,
                      "source": "source_name",
                      "version": "unknown"
                  },

                  "2": {
                    (...)
                  },

                (...)
                }

            If you provide a `gene_calls_dict`, its entries will be APPENDED to the database, so
            you need to make sure the gene caller ids in your dict do not overlap with those
            already in the database.

        """

        # by default we assume that this is a pristine run. but if the user sends a dictionary, we will append its entries to the existing database instead
        append_to_the_db = False

        gene_calls_found = False
        # let's do a rigorous check whether the user provided a gene_calls_dict.
        if (gene_calls_dict is not None and gene_calls_dict is not False):
            if not isinstance(gene_calls_dict, dict):
                raise ConfigError("'Use external gene calls' function received a non-empty gene_calls_dict object,\
                                    but it is of type '%s', and not '%s'" % (type(gene_calls_dict), type({})))

            # congrats, we have a dict.
            gene_calls_found = True

            if not len(gene_calls_dict):
                # but it is empty ... silly user.
                self.run.info_single("'Use external gene calls' function found an empty gene calls dict, returning\
                                      prematurely and assuming you know what's up. If you don't, stop here and try to\
                                      identify what decisions you've made that might have led you to this weird point in\
                                      your workflow (or 'life', totally up to you and your mood, but anvi'o thinks you've\
                                      done great so far).", nl_before=1, nl_after=1)
                return


        if (not input_file_path and not gene_calls_found) or (input_file_path and gene_calls_found):
            raise ConfigError("You must provide either an input file, or an gene calls dict to process external\
                               gene calls. You called `use_external_gene_calls_to_populate_genes_in_contigs_table`\
                               with wrong parameters.")

        Table.__init__(self, self.db_path, anvio.__contigs__version__, self.run, self.progress, simple=True)

        # take care of gene calls dict
        if not gene_calls_found:
            gene_calls_dict = utils.get_TAB_delimited_file_as_dictionary(input_file_path,
                                                                         expected_fields=t.genes_in_contigs_table_structure,
                                                                         only_expected_fields=True,
                                                                         column_mapping=[int, str, int, int, str, int, str, str])

            if not len(gene_calls_dict):
                raise ConfigError("You provided an external gene calls file, but it returned zero gene calls. Assuming that\
                                   this is an error, anvi'o will stop here and complain. If this is not an error and you\
                                   in fact expected this, the proper way of doing this is to use `--skip-gene-calls` flag,\
                                   instead of providing an emtpy external gene calls file. You don't agree? You need this\
                                   for some weird step for you weird pipeline? Let us know, and we will consider changing\
                                   this.")

            self.run.info("External gene calls", "%d gene calls recovered and will be processed." % len(gene_calls_dict))
        else:
            # FIXME: we need to make sure the gene caller ids in the incoming dictionary are not going
            #        to overwrite an existing gene call. Something like this would have returned the
            #        current max, which could be cross-checked with what's in the dict:
            #
            #            contigs_db = ContigsDatabase(self.db_path)
            #            next_id = contigs_db.db.get_max_value_in_column('genes_in_contigs', 'gene_callers_id') + 1
            #            contigs_db.disconnect()
            append_to_the_db = True

        # recover amino acid sequences. during this operation we are going to have to read all contig sequences
        # into the damn memory. anvi'o is doing a pretty bad job with memory management :(
        amino_acid_sequences = {}

        contig_sequences = {}
        if self.contigs_fasta:
            fasta = u.SequenceSource(self.contigs_fasta)
            while next(fasta):
                contig_sequences[fasta.id] = {'sequence': fasta.seq}
            fasta.close()
        else:
            database = db.DB(self.db_path, utils.get_required_version_for_db(self.db_path))
            contig_sequences = database.get_table_as_dict(t.contig_sequences_table_name)
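        # either way, contig_sequences now maps each contig name to {'sequence': <DNA string>}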

        num_genes_with_internal_stops = 0
        number_of_impartial_gene_calls = 0
        for gene_callers_id in gene_calls_dict:
            gene_call = gene_calls_dict[gene_callers_id]
            contig_name = gene_call['contig']

            if contig_name not in contig_sequences:
                # remove the partial contigs database so things don't get screwed later
                os.remove(self.db_path)
                raise ConfigError("You are in big trouble :( The contig name '%s' in your external gene callers file\
                                    does not appear to be in the contigs FASTA file. How did this happen?" % contig_name)

            if gene_call['partial']:
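                # partial calls have undefined start/stop boundaries; store an empty AA sequence and move on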
                amino_acid_sequences[gene_callers_id] = ''
                number_of_impartial_gene_calls += 1
                continue

            sequence = contig_sequences[contig_name]['sequence'][gene_call['start']:gene_call['stop']]
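            # genes on the reverse strand need the reverse complement before translation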
            if gene_call['direction'] == 'r':
                sequence = utils.rev_comp(sequence)

            amino_acid_sequence = utils.get_DNA_sequence_translated(sequence, gene_callers_id)

            # check if there are any internal stops:
            if amino_acid_sequence.find('*') > -1:
                if ignore_internal_stop_codons:
                    amino_acid_sequence = amino_acid_sequence.replace('*', 'X')
                    num_genes_with_internal_stops += 1
                else:
                    os.remove(self.db_path)
                    raise ConfigError("Oops. Anvi'o run into an amino acid seqeunce (that corresponds to the gene callers id '%s')\
                                       which had an internal stop codon :/ This usually indicates that your external gene calls\
                                       have problems. If you still want to continue, you can ask anvi'o to ignore internal stop\
                                       codons on your own risk. It will probably look very ugly on your screen, but here is the\
                                       DNA sequence for that gene in case you don't trust anvi'o (which only would be fair since\
                                       anvi'o does not trust you either): %s" % (str(gene_callers_id), sequence))

            amino_acid_sequences[gene_callers_id] = amino_acid_sequence

        # populate genes_in_contigs, and gene_amino_acid_sequences table in contigs db.
        self.populate_genes_in_contigs_table(gene_calls_dict, amino_acid_sequences, append_to_the_db=append_to_the_db)

        if num_genes_with_internal_stops:
            percent_genes_with_internal_stops = num_genes_with_internal_stops * 100.0 / len(gene_calls_dict)
            self.run.warning("Please read this carefully: Your external gene calls contained open reading frames with internal\
                              stop codons, and you asked anvi'o to ignore those. Anvi'o replaced internal stop codons with 'X'\
                              characters, and stored them in the contigs database that way. %d of your genes, which corresponded\
                              to %.2f%% of the total %d genes, had internal stop codons. We hope you are happy." % \
                                        (num_genes_with_internal_stops, percent_genes_with_internal_stops, len(gene_calls_dict)))

        if number_of_impartial_gene_calls:
            self.run.warning('%d of your %d gene calls were partial, hence the translated amino acid sequences for those\
                              were not stored in the database.' % (number_of_impartial_gene_calls, len(gene_calls_dict)))
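
The amino acid recovery loop above reduces to: excise the gene, reverse-complement reverse-strand calls, translate, then either reject or mask internal stop codons. Here is a self-contained sketch of that same idea, using Biopython's `Seq` in place of anvi'o's `utils` helpers, so none of the names below come from this listing:

from Bio.Seq import Seq  # stands in for utils.rev_comp / utils.get_DNA_sequence_translated

def translate_gene_call(dna, direction='f', ignore_internal_stop_codons=False):
    """Translate one gene call's DNA, mirroring the internal-stop handling above."""
    if direction == 'r':
        dna = str(Seq(dna).reverse_complement())
    aa = str(Seq(dna).translate())
    if aa.endswith('*'):
        aa = aa[:-1]  # a trailing stop codon is expected, not an error
    if '*' in aa:
        if not ignore_internal_stop_codons:
            raise ValueError("internal stop codon in gene call")
        aa = aa.replace('*', 'X')  # the same masking the code above applies
    return aa

# e.g. translate_gene_call('ATGTGATTTTAA', ignore_internal_stop_codons=True) == 'MXF'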