Example #1
    def store_gene_coverages_matrix(self):
        self.progress.update('Storing gene coverages ...')

        info_dict = {}
        genes_dict = {}

        gene_entry_ids_in_bin = set([])
        for split_name in self.split_ids:
            gene_entry_ids_in_bin.update(self.summary.split_to_genes_in_splits_ids[split_name])

        info_dict['num_genes_found'] = len(gene_entry_ids_in_bin)

        headers = ['function', 'contig', 'start', 'stop', 'direction']
        for gene_entry_id in gene_entry_ids_in_bin:
            prot_id = self.summary.genes_in_splits[gene_entry_id]['prot']
            genes_dict[prot_id] = {}

            # first fill in sample independent information;
            for header in headers:
                genes_dict[prot_id][header] = self.summary.genes_in_contigs_dict[prot_id][header]

            # then fill in distribution across samples:
            for sample_name in self.summary.p_meta['samples']:
                genes_dict[prot_id][sample_name] = self.summary.gene_coverages_dict[prot_id][sample_name]

            # finally add the sequence:
            contig = self.summary.genes_in_contigs_dict[prot_id]['contig']
            start = self.summary.genes_in_contigs_dict[prot_id]['start']
            stop = self.summary.genes_in_contigs_dict[prot_id]['stop']
            genes_dict[prot_id]['sequence'] = self.summary.contig_sequences[contig]['sequence'][start:stop]

        output_file_obj = self.get_output_file_handle('functions.txt')
        utils.store_dict_as_TAB_delimited_file(genes_dict, None, headers = ['prot'] + headers + self.summary.p_meta['samples'] + ['sequence'], file_obj = output_file_obj)

        self.bin_info_dict['genes'] = info_dict
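
All of these examples hand utils.store_dict_as_TAB_delimited_file a dictionary of dictionaries: the outer keys fill the first column, and the headers list selects and orders the inner keys, one TAB-delimited row per outer key. A minimal sketch of that contract (the dictionary contents and file name below are made up for illustration):

    import anvio.utils as utils

    # hypothetical input: outer keys ('gene_1', 'gene_2') become the first
    # column; the remaining headers pick values out of each inner dict
    genes_dict = {'gene_1': {'contig': 'c_01', 'start': 10, 'stop': 250, 'sample_A': 12.5},
                  'gene_2': {'contig': 'c_02', 'start': 0, 'stop': 90, 'sample_A': 3.1}}

    utils.store_dict_as_TAB_delimited_file(genes_dict, 'genes.txt',
                                           headers=['prot', 'contig', 'start', 'stop', 'sample_A'])

    # expected contents of genes.txt, roughly:
    #
    #   prot    contig  start   stop    sample_A
    #   gene_1  c_01    10      250     12.5
    #   gene_2  c_02    0       90      3.1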
Example #2
    def gen_samples_info_file(self):
        self.progress.new('Samples DB')
        self.progress.update('Generating the samples information file ..')

        samples_info_dict = {}
        samples_info_file_path = self.get_output_file_path(
            self.project_name + '-samples-information.txt')

        # set headers
        headers = ['total_length']

        for h in ['percent_completion', 'percent_redundancy']:
            if h in list(self.genomes.values())[0]:
                headers.append(h)

        headers.extend(
            ['gc_content', 'num_genes', 'avg_gene_length', 'num_genes_per_kb'])

        for c in list(self.genomes.values()):
            new_dict = {}
            for header in headers:
                new_dict[header] = c[header]

            samples_info_dict[c['name']] = new_dict

        utils.store_dict_as_TAB_delimited_file(samples_info_dict,
                                               samples_info_file_path,
                                               headers=['samples'] + headers)

        self.progress.end()
        self.run.info("Anvi'o samples information", samples_info_file_path)

        return samples_info_file_path
Example #3
    def report(self):
        self.progress.new('Reporting')

        new_structure = [t.variable_nts_table_structure[0]] + ['unique_pos_identifier'] + [x for x in t.variable_aas_table_structure[1:] if x != 'split_name'] + ['contig_name', 'split_name', 'unique_pos_identifier_str']

        self.progress.update(
            'exporting variable positions table as a TAB-delimited file ...')

        utils.store_dict_as_TAB_delimited_file(self.data,
                                               self.args.output_file,
                                               new_structure)
        self.progress.end()

        self.run.info('Num entries reported', pp(len(self.data)))
        self.run.info('Output File', self.args.output_file)
        self.run.info('Num AA positions reported', pp(len(set([e['unique_pos_identifier'] for e in self.data.values()]))))
Example #4
    def gen_samples_info_file(self):
        samples_info_dict = {}
        samples_info_file_path = self.get_output_file_path('anvio-samples-information.txt')

        # set headers
        headers = ['total_length']

        for h in ['percent_complete', 'percent_redundancy']:
            if h in list(self.genomes.values())[0]:
                headers.append(h)

        headers.extend(['gc_content', 'num_genes', 'avg_gene_length', 'num_genes_per_kb'])

        for c in self.genomes.values():
            new_dict = {}
            for header in headers:
                new_dict[header] = c[header]

            samples_info_dict[c['name']] = new_dict

        utils.store_dict_as_TAB_delimited_file(samples_info_dict, samples_info_file_path, headers = ['samples'] + headers)

        self.run.info("Anvi'o samples information", samples_info_file_path)

        return samples_info_file_path
Example #5
    def save_samples_information(self, additional_description=''):
        if not self.samples_information_to_append:
            samples_information_column_titles = list(
                self.samples_information[next(iter(self.samples_information))])
            samples_information_dict = self.samples_information
        else:
            samples_information_column_titles = utils.get_columns_of_TAB_delim_file(
                self.samples_information_to_append)
            column_mapping = [str] * (len(samples_information_column_titles) + 2)
            self.run.warning(self.samples_information)
            samples_information_dict = utils.get_TAB_delimited_file_as_dictionary(
                self.samples_information_to_append,
                dict_to_append=self.samples_information,
                assign_none_for_missing=True,
                column_mapping=column_mapping)

        if additional_description:
            additional_description = '-' + additional_description

        samples_information_file_name = self.output_file_prefix + additional_description + '-samples-information.txt'
        utils.store_dict_as_TAB_delimited_file(
            samples_information_dict,
            samples_information_file_name,
            headers=['samples'] + samples_information_column_titles)
Example #6
File: panops.py Project: psaxcode/anvio
    def gen_samples_info_file(self):
        samples_info_dict = {}
        samples_info_file_path = self.get_output_file_path(
            'anvio-samples-information.txt')

        # set headers
        headers = ['total_length']

        for h in ['percent_complete', 'percent_redundancy']:
            if h in list(self.genomes.values())[0]:
                headers.append(h)

        headers.extend(
            ['gc_content', 'num_genes', 'avg_gene_length', 'num_genes_per_kb'])

        for c in self.genomes.values():
            new_dict = {}
            for header in headers:
                new_dict[header] = c[header]

            samples_info_dict[c['name']] = new_dict

        utils.store_dict_as_TAB_delimited_file(samples_info_dict,
                                               samples_info_file_path,
                                               headers=['samples'] + headers)

        self.run.info("Anvi'o samples information", samples_info_file_path)

        return samples_info_file_path
Example #7
File: panops.py Project: psaxcode/anvio
        def store_file(data, path, headers=None):
            if not headers:
                headers = ['contig'] + sorted(list(data.values())[0].keys())

            utils.store_dict_as_TAB_delimited_file(data, path, headers=headers)

            return path
Example #8
    def report(self):
        self.progress.new('Reporting')

        new_structure = [t.variable_nts_table_structure[0]] + ['unique_pos_identifier'] + t.variable_nts_table_structure[1:] + ['parent']

        self.progress.update(
            'exporting variable positions table as a TAB-delimited file ...')

        utils.store_dict_as_TAB_delimited_file(self.variable_nts_table,
                                               self.args.output_file,
                                               new_structure)
        self.progress.end()

        self.run.info('Num entries reported', pp(len(self.variable_nts_table)))
        self.run.info('Output File', self.args.output_file)
        self.run.info('Num nt positions reported', pp(len(set([e['unique_pos_identifier'] for e in self.variable_nts_table.values()]))))
Example #9
File: panops.py Project: ascendo/anvio
    def store_protein_clusters(self, protein_clusters_dict):
        self.progress.new('Storing protein clusters')
        self.progress.update('...')

        protein_clusters_output_path = self.get_output_file_path('protein-clusters.txt')

        self.progress.end()
        d = {}

        PCs = protein_clusters_dict.keys()

        unique_entry_id = 0
        for PC in PCs:
            for entry_hash, gene_caller_id in [e.split('_') for e in protein_clusters_dict[PC]]:
                try:
                    genome_name = self.hash_to_genome_name[entry_hash]
                except KeyError:
                    raise ConfigError, "Something horrible happened. This can only happen if you started a new analysis with\
                                        additional genomes without cleaning the previous work directory. Sounds familiar?"

                d[unique_entry_id] = {'gene_caller_id': gene_caller_id, 'protein_cluster_id': PC, 'genome_name': genome_name, 'sequence': self.protein_sequences_dict[genome_name][int(gene_caller_id)]}
                unique_entry_id += 1

        utils.store_dict_as_TAB_delimited_file(d, protein_clusters_output_path, headers=['entry_id', 'gene_caller_id', 'protein_cluster_id', 'genome_name', 'sequence'])

        self.progress.end()

        self.run.info('protein clusters info', protein_clusters_output_path)

        return protein_clusters_output_path
Example #10
File: panops.py Project: ascendo/anvio
        def store_file(data, path, headers=None):
            if not headers:
                headers = ['contig'] + sorted(list(data.values())[0].keys())

            utils.store_dict_as_TAB_delimited_file(data, path, headers=headers)

            return path
Example #11
    def process(self):
        self.sanity_check()

        self.run.info('Input metadata file', self.metadata_file_path)
        self.run.info('Output directory', self.output_directory_path)

        columns = utils.get_columns_of_TAB_delim_file(self.metadata_file_path)
        if 'organism_name' not in columns or 'local_filename' not in columns:
            raise ConfigError("The metadata file you provided does not look like a metadata\
                               file output from the program `ncbi-genome-download` :/ Why?\
                               Because anvi'o expects that file to have at least the following\
                               two columns in it: 'organism_name' and 'local_filename'.")

        metadata = utils.get_TAB_delimited_file_as_dictionary(self.metadata_file_path)

        for entry in metadata:
            if not os.path.exists(metadata[entry]['local_filename']):
                raise ConfigError("At least one of the files in your metadata input does not seem to be\
                                   where they think they are :/ Please make sure the entry %s and others\
                                   point to proper local file paths..." % entry)

        self.run.info('Num entries in metadata', len(metadata))

        output_fasta_dict = {}
        self.progress.new("GenBank to anvi'o", progress_total_items=len(metadata))
        for entry in metadata:
            self.progress.increment()
            self.progress.update('Processing %s ...' % entry)

            # set the organism name and accession id and clean them from weird
            # characters.
            organism_name = metadata[entry]['organism_name']
            for char in [c for c in organism_name if c not in OK_CHARS_FOR_ORGANISM_NAME]:
                organism_name = organism_name.replace(char, '_')

            accession_id = entry
            for char in [c for c in accession_id if c not in OK_CHARS_FOR_ACCESSION]:
                accession_id = accession_id.replace(char, '_')

            final_name = '_'.join([organism_name, accession_id])

            args = argparse.Namespace(input_genbank=metadata[entry]['local_filename'],
                                      output_file_prefix=os.path.join(self.output_directory_path, final_name))
            g = GenbankToAnvio(args, run=terminal.Run(verbose=False), progress=terminal.Progress(verbose=False))

            if final_name in output_fasta_dict:
                raise ConfigError("The final name '%s' for your genome has alrady been used by\
                                   another one :/ This should never happen unless your metadata\
                                   contains entries with identical accession numbers...")
            output_fasta_dict[final_name] = g.process()

        self.progress.end()

        headers = ['name', 'path']
        if not self.exclude_gene_calls_from_fasta_txt:
            headers.extend(['external_gene_calls', 'gene_functional_annotation'])

        utils.store_dict_as_TAB_delimited_file(output_fasta_dict, self.output_fasta_descriptor, headers=headers)

        self.run.info('Output FASTA descriptor', self.output_fasta_descriptor)
Example #12
    def save_gene_class_information_in_additional_layers(
            self, additional_description=''):
        if not self.additional_layers_to_append:
            additional_column_titles = []
            additional_layers_dict = self.gene_class_information
        else:
            additional_column_titles = utils.get_columns_of_TAB_delim_file(
                self.additional_layers_to_append)
            additional_layers_dict = utils.get_TAB_delimited_file_as_dictionary(
                self.additional_layers_to_append,
                dict_to_append=self.gene_class_information,
                assign_none_for_missing=True,
                column_mapping=[int] + [str] * len(additional_column_titles))

        if additional_description:
            additional_description = '-' + additional_description

        additional_layers_file_name = self.output_file_prefix + additional_description + '-additional-layers.txt'
        headers = [
            'gene_callers_id', 'gene_class', 'number_of_detections',
            'portion_detected', 'gene_specificity',
            'gene_coverage_consistency', 'core_or_accessory', 'adjusted_mean',
            'adjusted_stds'
        ] + additional_column_titles

        utils.store_dict_as_TAB_delimited_file(additional_layers_dict,
                                               additional_layers_file_name,
                                               headers=headers)
Example #13
File: __init__.py Project: meren/anvio
 def gen_report_with_references_for_removal_info(self, filtered_id_files, output_file_name):
     '''If mapping was done to a reference for removal, this creates a report with the results.'''
     report_dict = {}
     for filename in filtered_id_files:
         sample = os.path.basename(filename).split("-ids-to-remove.txt")[0]
         ids = set(open(filename).read().splitlines())
         report_dict[sample] = {}
         report_dict[sample]['number_of_filtered_reads'] = len(ids)
     u.store_dict_as_TAB_delimited_file(report_dict, output_file_name, headers=["sample", 'number_of_filtered_reads'])
Example #14
    def store_profile_data(self):

        if self.summary.quick:
            return

        self.progress.update('Storing profile data ...')

        for table_name in self.bin_profile:
            output_file_obj = self.get_output_file_handle('%s.txt' % table_name)
            utils.store_dict_as_TAB_delimited_file({table_name: self.bin_profile[table_name]}, None, headers=['bin'] + self.summary.p_meta['samples'], file_obj=output_file_obj)
Example #15
def get_newick_tree_data_for_dict(d, linkage=constants.linkage_method_default, distance=constants.distance_metric_default):
    is_distance_and_linkage_compatible(distance, linkage)

    matrix_file = filesnpaths.get_temp_file_path()
    utils.store_dict_as_TAB_delimited_file(d, matrix_file, ['items'] + list(d[next(iter(d))].keys()))

    newick = get_newick_tree_data(matrix_file, distance=distance, linkage=linkage)

    os.remove(matrix_file)
    return newick
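
Here the same writer serves as a bridge into hierarchical clustering: the dictionary is dumped to a temporary TAB-delimited matrix, the matrix is clustered, and the tree comes back as a newick string. A hedged usage sketch (the input dictionary is illustrative; every inner dict is assumed to share the same numeric keys):

    # hypothetical items-by-samples matrix of numeric values
    d = {'item_1': {'s1': 0.1, 's2': 0.9},
         'item_2': {'s1': 0.8, 's2': 0.2},
         'item_3': {'s1': 0.2, 's2': 0.7}}

    newick = get_newick_tree_data_for_dict(d)  # a newick string describing the item clustering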
Example #16
 def save_gene_detection_and_coverage(self, additional_description=''):
     if additional_description:
         prefix = self.output_file_prefix + '-' + additional_description
     else:
         prefix = self.output_file_prefix
     gene_coverages_file_name = prefix + '-gene-coverages.txt'
     gene_detections_file_name = prefix + '-gene-detections.txt'
     utils.store_dict_as_TAB_delimited_file(self.gene_coverages,
                                            gene_coverages_file_name)
     utils.store_dict_as_TAB_delimited_file(self.gene_detection,
                                            gene_detections_file_name)
Example #17
    def export_samples_db_files(self):
        """Export whatever information is stored in a ginve anvi'o samples database"""

        order_output_path = get_temp_file_path()
        information_output_path = get_temp_file_path()

        samples_information_dict, samples_order_dict = self.get_samples_information_and_order_dicts()

        utils.store_dict_as_TAB_delimited_file(samples_order_dict, order_output_path, headers=['attributes', 'data_type', 'data_value'])
        utils.store_dict_as_TAB_delimited_file(samples_information_dict, information_output_path, headers=['samples'] + sorted(list(list(samples_information_dict.values())[0].keys())))

        return information_output_path, order_output_path
Example #18
    def report(self):
        self.progress.new('Reporting')

        new_structure = [t.variable_nts_table_structure[0]] + ['unique_pos_identifier'] + [x for x in t.variable_nts_table_structure[1:] if x != 'split_name'] + ['contig_name', 'split_name', 'unique_pos_identifier_str']

        self.progress.update('exporting variable positions table as a TAB-delimited file ...')

        utils.store_dict_as_TAB_delimited_file(self.data, self.args.output_file, new_structure)
        self.progress.end()

        self.run.info('Num entries reported', pp(len(self.data)))
        self.run.info('Output File', self.args.output_file) 
        self.run.info('Num nt positions reported', pp(len(set([e['unique_pos_identifier'] for e in self.data.values()]))))
Example #19
    def store_clusters_as_TAB_delimited_text(self, output_file_path):
        filesnpaths.is_output_file_writable(output_file_path)

        self.progress.new('Storing clusters as TAB-delimited file')
        self.progress.update('creating the clusters dictionary ...')
        clusters_dict = {}
        for contig_name in self.clusters:
            clusters_dict[contig_name] = {'concoct_bin': self.clusters[contig_name]}

        self.progress.update('writing the file ...')
        utils.store_dict_as_TAB_delimited_file(clusters_dict, output_file_path, ['contig', 'concoct_bin'])
        self.progress.end()

        self.run.info('CONCOCT results in txt', output_file_path, display_only = True)
Example #20
    def gen_data_from_protein_clusters(self, protein_clustering_dict):
        self.progress.new('Generating view data')
        self.progress.update('...')

        PCs = protein_clustering_dict.keys()

        for PC in PCs:
            self.view_data[PC] = dict([(genome_name, 0) for genome_name in self.genomes])
            self.view_data_presence_absence[PC] = dict([(genome_name, 0) for genome_name in self.genomes])
            self.additional_view_data[PC] = {'num_genes_in_pc': 0, 'num_genomes_pc_has_hits': 0}
            for entry_hash, gene_caller_id in [e.split('_') for e in protein_clustering_dict[PC]]:
                try:
                    genome_name = self.hash_to_genome_name[entry_hash]
                except KeyError:
                    raise ConfigError, "Something horrible happened. This can only happend if you started a new analysis with\
                                        additional genomes without cleaning the previous work directory. Sounds familiar?"
                self.view_data[PC][genome_name] += 1
                self.view_data_presence_absence[PC][genome_name] = 1
                self.additional_view_data[PC]['num_genes_in_pc'] += 1
            self.additional_view_data[PC]['num_genomes_pc_has_hits'] = len([True for genome in self.view_data[PC] if self.view_data[PC][genome] > 0])

        self.progress.end()

        #
        # FILTERING BASED ON OCCURRENCE
        #
        PCs_of_interest = set([])
        for PC in PCs:
            if self.additional_view_data[PC]['num_genomes_pc_has_hits'] >= self.PC_min_occurrence:
                PCs_of_interest.add(PC)

        for PC in PCs:
            if PC not in PCs_of_interest:
                self.view_data.pop(PC)
                self.view_data_presence_absence.pop(PC)
                self.additional_view_data.pop(PC)

        if self.PC_min_occurrence > 1:
            self.run.info('PCs min occurrence', '%d (the filter removed %s PCs)' % (self.PC_min_occurrence, (len(protein_clustering_dict) - len(PCs_of_interest))))

        view_data_file_path = self.get_output_file_path('anvio-view-data.txt')
        additional_view_data_file_path = self.get_output_file_path('anvio-additional-view-data.txt')
        view_data_presence_absence_file_path = self.get_output_file_path('anvio-view-data-presence-absence.txt')

        utils.store_dict_as_TAB_delimited_file(self.view_data, view_data_file_path, headers = ['contig'] + sorted(self.genomes.keys()))
        utils.store_dict_as_TAB_delimited_file(self.additional_view_data, additional_view_data_file_path, headers = ['contig'] + sorted(list(self.additional_view_data.values())[0].keys()))
        utils.store_dict_as_TAB_delimited_file(self.view_data_presence_absence, view_data_presence_absence_file_path, headers = ['contig'] + sorted(list(self.view_data_presence_absence.values())[0].keys()))

        # here's where we finalize experimental data for clustering
        experimental_data = copy.deepcopy(self.view_data_presence_absence)
        for PC in self.additional_view_data:
            for i in range(0, int(len(self.genomes) / 2)):
                experimental_data[PC]['num_genomes_pc_has_hits_%d' % i] = self.additional_view_data[PC]['num_genomes_pc_has_hits']
        experimental_data_file_path = self.get_output_file_path('anvio-experimental-data-for-clustering.txt')
        utils.store_dict_as_TAB_delimited_file(experimental_data, experimental_data_file_path, headers = ['contig'] + sorted(list(experimental_data.values())[0].keys()))

        self.run.info("Anvi'o view data for protein clusters", view_data_file_path)
        self.run.info("Anvi'o additional view data", additional_view_data_file_path)

        return view_data_file_path, view_data_presence_absence_file_path, additional_view_data_file_path, experimental_data_file_path
Example #21
File: panops.py Project: psaxcode/anvio
    def store_protein_clusters(self, protein_clusters_dict):
        self.progress.new('Storing protein clusters')
        self.progress.update('...')

        protein_clusters_output_path = self.get_output_file_path(
            'protein-clusters.txt')

        self.progress.end()
        d = {}

        PCs = protein_clusters_dict.keys()

        unique_entry_id = 0
        for PC in PCs:
            for entry_hash, gene_caller_id in [e.split('_') for e in protein_clusters_dict[PC]]:
                try:
                    genome_name = self.hash_to_genome_name[entry_hash]
                except KeyError:
                    raise ConfigError, "Something horrible happened. This can only happen if you started a new analysis with\
                                        additional genomes without cleaning the previous work directory. Sounds familiar?"

                d[unique_entry_id] = {'gene_caller_id': gene_caller_id,
                                      'protein_cluster_id': PC,
                                      'genome_name': genome_name,
                                      'sequence': self.protein_sequences_dict[genome_name][int(gene_caller_id)]}
                unique_entry_id += 1

        utils.store_dict_as_TAB_delimited_file(d,
                                               protein_clusters_output_path,
                                               headers=[
                                                   'entry_id',
                                                   'gene_caller_id',
                                                   'protein_cluster_id',
                                                   'genome_name', 'sequence'
                                               ])

        self.progress.end()

        self.run.info('protein clusters info', protein_clusters_output_path)

        return protein_clusters_output_path
Example #22
def get_newick_tree_data_for_dict(d,
                                  linkage=constants.linkage_method_default,
                                  distance=constants.distance_metric_default):
    is_distance_and_linkage_compatible(distance, linkage)

    matrix_file = filesnpaths.get_temp_file_path()
    utils.store_dict_as_TAB_delimited_file(d, matrix_file,
                                           ['items'] + list(d[next(iter(d))].keys()))

    newick = get_newick_tree_data(matrix_file,
                                  distance=distance,
                                  linkage=linkage)

    os.remove(matrix_file)
    return newick
Example #23
    def export(self, output_file_path):
        filesnpaths.is_output_file_writable(output_file_path)

        if self.target in ['layers', 'items']:
            keys, data = AdditionalDataBaseClass.get(self)
        elif self.target in ['layer_orders']:
            data = OrderDataBaseClass.get(self, native_form=True)
            keys = ['data_type', 'data_value']
        else:
            raise ConfigError("Your target table '%s' does not make any sense" % self.target)

        if not(len(data)):
            raise ConfigError("Additional data table for %s is empty. There is nothing to export :/" % self.target)

        utils.store_dict_as_TAB_delimited_file(data, output_file_path, headers=[self.target] + keys)

        self.run.info('Output file for %s' % self.target, output_file_path)
Example #24
    def store_gene_coverages_matrix(self):
        self.progress.update('Storing gene coverages ...')

        info_dict = {}
        genes_dict = {}

        gene_entry_ids_in_bin = set([])
        for split_name in self.split_ids:
            gene_entry_ids_in_bin.update(
                self.summary.split_to_genes_in_splits_ids[split_name])

        info_dict['num_genes_found'] = len(gene_entry_ids_in_bin)

        headers = ['function', 'contig', 'start', 'stop', 'direction']
        for gene_entry_id in gene_entry_ids_in_bin:
            prot_id = self.summary.genes_in_splits[gene_entry_id]['prot']
            genes_dict[prot_id] = {}

            # first fill in sample independent information;
            for header in headers:
                genes_dict[prot_id][header] = self.summary.genes_in_contigs_dict[prot_id][header]

            # then fill in distribution across samples:
            for sample_name in self.summary.p_meta['samples']:
                genes_dict[prot_id][sample_name] = self.summary.gene_coverages_dict[prot_id][sample_name]

            # finally add the sequence:
            contig = self.summary.genes_in_contigs_dict[prot_id]['contig']
            start = self.summary.genes_in_contigs_dict[prot_id]['start']
            stop = self.summary.genes_in_contigs_dict[prot_id]['stop']
            genes_dict[prot_id]['sequence'] = self.summary.contig_sequences[contig]['sequence'][start:stop]

        output_file_obj = self.get_output_file_handle('functions.txt')
        utils.store_dict_as_TAB_delimited_file(genes_dict,
                                               None,
                                               headers=['prot'] + headers +
                                               self.summary.p_meta['samples'] +
                                               ['sequence'],
                                               file_obj=output_file_obj)

        self.bin_info_dict['genes'] = info_dict
Example #25
    def report(self):
        self.progress.new('Reporting')

        new_structure = [t.variable_positions_table_structure[0]] + ['unique_pos_identifier'] + t.variable_positions_table_structure[1:] + ['parent']

        self.progress.update('exporting variable positions table as a TAB-delimited file ...')

        # FIXME: THIS HAS TO GO INTO THE TABLE THIS WAY
        for e in self.variable_positions_table:
            self.variable_positions_table[e]['competing_nts'] = ''.join(sorted(self.variable_positions_table[e]['competing_nts']))

        utils.store_dict_as_TAB_delimited_file(self.variable_positions_table, self.args.output_file, new_structure)
        self.progress.end()

        self.run.info('Num entries reported', pp(len(self.variable_positions_table)))
        self.run.info('Output File', self.args.output_file) 
        self.run.info('Num nt positions reported', pp(len(set([e['unique_pos_identifier'] for e in self.variable_positions_table.values()]))))
Example #26
    def report(self):
        self.progress.new('Reporting')

        if self.engine == 'NT':
            table_structure = t.variable_nts_table_structure
        elif self.engine == 'AA':
            table_structure = t.variable_aas_table_structure

        new_structure = [t.variable_nts_table_structure[0]] + ['unique_pos_identifier'] + [x for x in table_structure[1:] if x != 'split_name'] + ['consensus', 'departure_from_consensus', 'n2n1ratio']

        if self.include_contig_names_in_output:
            new_structure.append('contig_name')

        if self.include_split_names_in_output:
            new_structure.append('split_name')

        self.progress.update(
            'exporting variable positions table as a TAB-delimited file ...')

        utils.store_dict_as_TAB_delimited_file(self.data,
                                               self.args.output_file,
                                               new_structure)
        self.progress.end()

        self.run.info('Num entries reported', pp(len(self.data)))
        self.run.info('Output File', self.args.output_file)
        self.run.info('Num %s positions reported' % self.engine, pp(len(set([e['unique_pos_identifier'] for e in self.data.values()]))))
Example #27
    def report(self):
        self.progress.new('Reporting')

        if self.engine == 'NT':
            table_structure = t.variable_nts_table_structure
        elif self.engine == 'AA':
            table_structure = t.variable_aas_table_structure

        new_structure = [t.variable_nts_table_structure[0]] + ['unique_pos_identifier'] + [x for x in table_structure[1:] if x != 'split_name'] + ['consensus', 'departure_from_consensus', 'n2n1ratio']

        if self.include_contig_names_in_output:
            new_structure.append('contig_name')

        if self.include_split_names_in_output:
            new_structure.append('split_name')

        self.progress.update('exporting variable positions table as a TAB-delimited file ...')

        utils.store_dict_as_TAB_delimited_file(self.data, self.args.output_file, new_structure)
        self.progress.end()

        self.run.info('Num entries reported', pp(len(self.data)))
        self.run.info('Output File', self.args.output_file)
        self.run.info('Num %s positions reported' % self.engine, pp(len(set([e['unique_pos_identifier'] for e in self.data.values()]))))
Example #28
    def process(self):
        self.sanity_check()

        output_fasta = {}
        output_gene_calls = {}
        output_functions = {}
        num_genbank_records_processed = 0
        num_genes_found = 0
        num_genes_reported = 0
        num_genes_with_functions = 0

        try:
            if self.input_genbank_path.endswith('.gz'):
                genbank_file_object = SeqIO.parse(
                    io.TextIOWrapper(gzip.open(self.input_genbank_path, 'r')),
                    "genbank")
            else:
                genbank_file_object = SeqIO.parse(
                    open(self.input_genbank_path, "r"), "genbank")
        except Exception as e:
            raise ConfigError(
                "Someone didn't like your unput 'genbank' file :/ Here's what they said "
                "about it: '%s'." % e)

        for genbank_record in genbank_file_object:
            num_genbank_records_processed += 1
            output_fasta[genbank_record.name] = str(genbank_record.seq)

            genes = [
                gene for gene in genbank_record.features if gene.type == "CDS"
            ]  # focusing on features annotated as "CDS" by NCBI's PGAP

            for gene in genes:
                num_genes_found += 1
                location = str(gene.location)
                # dumping gene if "location" section contains any of these terms set above: "join" means the gene call spans multiple contigs; "<" or ">" means the gene call runs off a contig
                if any(exclusion_term in location
                       for exclusion_term in self.location_terms_to_exclude):
                    continue

                if "note" in gene.qualifiers:
                    note = str(gene.qualifiers["note"][0])

                    # dumping gene if noted as any of these in the "note" section set above
                    if any(exclusion_term in note
                           for exclusion_term in self.note_terms_to_exclude):
                        continue

                # dumping if overlapping translation frame
                if "transl_except" in gene.qualifiers:
                    continue

                # dumping if gene declared a pseudogene
                if "pseudo" in gene.qualifiers or "pseudogene" in gene.qualifiers:
                    continue

                # cleaning up gene coordinates to more easily parse:
                location = location.replace("[", "")
                location = re.sub('](.*)', '', location)
                location = location.split(":")

                start = location[0]  # start coordinate
                end = location[1]  # end coordinate

                # setting direction to "f" or "r":
                if gene.strand == 1:
                    direction = "f"
                else:
                    direction = "r"

                # for accession, storing protein id if it has one, else the locus tag, else "None"
                if "protein_id" in gene.qualifiers:
                    accession = gene.qualifiers["protein_id"][0]
                elif "locus_tag" in gene.qualifiers:
                    accession = gene.qualifiers["locus_tag"][0]
                else:
                    accession = "None"

                # storing gene product annotation if present
                if "product" in gene.qualifiers:
                    function = gene.qualifiers["product"][0]
                    # trying to capture all different ways proteins are listed as hypothetical and setting to same thing so can prevent from adding to output functions table below
                    if function in [
                            "hypothetical", "hypothetical protein",
                            "conserved hypothetical",
                            "conserved hypotheticals",
                            "Conserved hypothetical protein"
                    ]:
                        function = "hypothetical protein"
                else:
                    function = "hypothetical protein"

                # if present, adding gene name to product annotation (so long as not a hypothetical, sometimes these names are useful, sometimes they are not):
                if "gene" in gene.qualifiers:
                    if function not in "hypothetical protein":
                        gene_name = str(gene.qualifiers["gene"][0])
                        function = function + " (" + gene_name + ")"

                output_gene_calls[self.gene_callers_id] = {
                    'contig': genbank_record.name,
                    'start': start,
                    'stop': end,
                    'direction': direction,
                    'partial': 0,
                    'call_type': 1,
                    'source': self.source,
                    'version': self.version
                }
                num_genes_reported += 1

                # not writing gene out to functions table if no annotation
                if "hypothetical protein" not in function:
                    output_functions[self.gene_callers_id] = {
                        'source': self.source,
                        'accession': accession,
                        'function': function,
                        'e_value': 0
                    }
                    num_genes_with_functions += 1

                # increment the gene callers id for the next gene
                self.gene_callers_id += 1

        if num_genbank_records_processed == 0:
            raise ConfigError(
                "It seems there was no records in your input genbank file :/ Are you sure you "
                "gave the right file path that actually resolves to a genbank formatted "
                "text file?")

        self.run.info('Num GenBank entries processed',
                      num_genbank_records_processed)
        self.run.info('Num gene records found', num_genes_found)
        self.run.info('Num genes reported', num_genes_reported, mc='green')
        self.run.info('Num genes with functions',
                      num_genes_with_functions,
                      mc='green',
                      nl_after=1)

        # time to write these down:
        utils.store_dict_as_FASTA_file(output_fasta,
                                       self.output_fasta_path,
                                       wrap_from=None)
        self.run.info('FASTA file path', self.output_fasta_path)

        if len(output_gene_calls):
            utils.store_dict_as_TAB_delimited_file(output_gene_calls,
                                                   self.output_gene_calls_path,
                                                   headers=[
                                                       "gene_callers_id",
                                                       "contig", "start",
                                                       "stop", "direction",
                                                       "partial", "call_type",
                                                       "source", "version"
                                                   ])
            self.run.info('External gene calls file',
                          self.output_gene_calls_path)

            utils.store_dict_as_TAB_delimited_file(output_functions,
                                                   self.output_functions_path,
                                                   headers=[
                                                       'gene_callers_id',
                                                       'source', 'accession',
                                                       'function', 'e_value'
                                                   ])
            self.run.info('TAB-delimited functions',
                          self.output_functions_path)
        else:
            self.output_gene_calls_path = None
            self.output_functions_path = None
            self.run.warning(
                "Anvi'o couldn't find any gene calles in the GenBank file, hence you will get "
                "no output files for external gene calls or functions :/ We hope you can "
                "survive this terrible terrible news :(")

        self.run.info_single('Mmmmm ☘ ', nl_before=1, nl_after=1)

        return {
            'external_gene_calls': self.output_gene_calls_path,
            'gene_functional_annotation': self.output_functions_path,
            'path': self.output_fasta_path
        }
Example #29
    def process(self):
        # learn who you are:
        collection_dict = self.collections.get_collection_dict(
            self.collection_name)
        bins_info_dict = self.collections.get_bins_info_dict(
            self.collection_name)

        # init profile data for collection.
        self.init_collection_profile(collection_dict)

        # load completeness information if available
        self.completeness = completeness.Completeness(self.contigs_db_path)
        if len(self.completeness.sources):
            self.completeness_data_available = True

        # load HMM sources for non-single-copy genes if available
        if self.non_singlecopy_gene_hmm_sources and not self.quick:
            self.init_non_singlecopy_gene_hmm_sources()
            self.non_single_copy_gene_hmm_data_available = True

        # load gene functions from contigs db superclass
        self.init_functions()

        # set up the initial summary dictionary
        self.summary['meta'] = {
            'quick': self.quick,
            'output_directory': self.output_directory,
            'collection': collection_dict.keys(),
            'num_bins': len(collection_dict.keys()),
            'collection_name': self.collection_name,
            'total_nts_in_collection': 0,
            'num_contigs_in_collection': 0,
            'anvio_version': __version__,
            'profile': self.p_meta,
            'contigs': self.a_meta,
            'gene_coverages_data_available': self.gene_coverages_data_available,
            'completeness_data_available': self.completeness_data_available,
            'non_single_copy_gene_hmm_data_available': self.non_single_copy_gene_hmm_data_available,
            'percent_contigs_nts_described_by_collection': 0.0,
            'percent_profile_nts_described_by_collection': 0.0,
            'percent_contigs_nts_described_by_profile': P(self.p_meta['total_length'], self.a_meta['total_length']),
            'percent_contigs_contigs_described_by_profile': P(self.p_meta['num_contigs'], self.a_meta['num_contigs']),
            'percent_contigs_splits_described_by_profile': P(self.p_meta['num_splits'], self.a_meta['num_splits']),
        }

        # I am not sure whether this is the best place to do this,
        self.summary['basics_pretty'] = {
            'profile': [
                ('Created on', self.p_meta['creation_date']),
                ('Version', self.p_meta['version']),
                ('Minimum contig length',
                 pretty(self.p_meta['min_contig_length'])),
                ('Number of contigs', pretty(int(self.p_meta['num_contigs']))),
                ('Number of splits', pretty(int(self.p_meta['num_splits']))),
                ('Total nucleotides',
                 humanize_n(int(self.p_meta['total_length']))),
            ],
            'contigs': [
                ('Created on', self.p_meta['creation_date']),
                ('Version', self.a_meta['version']),
                ('Split length', pretty(int(self.a_meta['split_length']))),
                ('Number of contigs', pretty(int(self.a_meta['num_contigs']))),
                ('Number of splits', pretty(int(self.a_meta['num_splits']))),
                ('Total nucleotides',
                 humanize_n(int(self.a_meta['total_length']))),
                ('K-mer size', self.a_meta['kmer_size']),
            ],
        }

        self.summary['max_shown_header_items'] = 10
        self.summary['slice_header_items_tmpl'] = '0:%d' % self.summary['max_shown_header_items']
        self.summary['num_not_shown_samples'] = len(self.p_meta['samples']) - self.summary['max_shown_header_items']
        self.summary['num_not_shown_hmm_items'] = dict([(hmm_search_source, len(self.hmm_sources_info[hmm_search_source]['genes']) - self.summary['max_shown_header_items'])
                                                        for hmm_search_type, hmm_search_source in self.hmm_searches_header])

        self.summary['files'] = {}
        self.summary['collection'] = {}
        self.summary['collection_profile'] = self.collection_profile  # reminder; collection_profile comes from ProfileSuperclass!
        self.summary['collection_profile_items'] = list(self.collection_profile.values())[0].keys()

        # add hmm items for each search type:
        if self.non_single_copy_gene_hmm_data_available:
            self.summary['meta']['hmm_items'] = dict([
                (hmm_search_source,
                 self.hmm_sources_info[hmm_search_source]['genes']) for
                hmm_search_type, hmm_search_source in self.hmm_searches_header
            ])

        # summarize bins:
        for bin_id in collection_dict:
            bin = Bin(self, bin_id, collection_dict[bin_id], self.run,
                      self.progress)
            bin.output_directory = os.path.join(self.output_directory,
                                                'bin_by_bin', bin_id)
            bin.bin_profile = self.collection_profile[bin_id]

            self.summary['collection'][bin_id] = bin.create()
            self.summary['collection'][bin_id]['color'] = bins_info_dict[bin_id]['html_color'] or '#212121'
            self.summary['collection'][bin_id]['source'] = bins_info_dict[bin_id]['source'] or 'unknown_source'
            self.summary['meta']['total_nts_in_collection'] += self.summary['collection'][bin_id]['total_length']
            self.summary['meta']['num_contigs_in_collection'] += self.summary['collection'][bin_id]['num_contigs']

        # bins are computed, add some relevant meta info:
        self.summary['meta']['percent_contigs_nts_described_by_collection'] = '%.2f' % (self.summary['meta']['total_nts_in_collection'] * 100.0 / int(self.a_meta['total_length']))
        self.summary['meta']['percent_profile_nts_described_by_collection'] = '%.2f' % (self.summary['meta']['total_nts_in_collection'] * 100.0 / int(self.p_meta['total_length']))
        self.summary['meta']['bins'] = self.get_bins_ordered_by_completeness_and_size()

        if not self.quick:
            # generate a TAB-delimited text output file for bin summaries
            summary_of_bins_matrix_output = {}
            properties = [
                'taxon', 'total_length', 'num_contigs', 'N50', 'GC_content',
                'percent_complete', 'percent_redundancy'
            ]

            for bin_name in self.summary['collection']:
                summary_of_bins_matrix_output[bin_name] = dict([
                    (prop, self.summary['collection'][bin_name][prop])
                    for prop in properties
                ])

            output_file_obj = self.get_output_file_handle(
                prefix='general_bins_summary.txt')
            utils.store_dict_as_TAB_delimited_file(
                summary_of_bins_matrix_output,
                None,
                headers=['bins'] + properties,
                file_obj=output_file_obj)

            # save merged matrices for bins x samples
            for table_name in list(self.collection_profile.values())[0].keys():
                d = {}
                for bin_id in self.collection_profile:
                    d[bin_id] = self.collection_profile[bin_id][table_name]

                output_file_obj = self.get_output_file_handle(
                    sub_directory='bins_across_samples',
                    prefix='%s.txt' % table_name)
                utils.store_dict_as_TAB_delimited_file(
                    d,
                    None,
                    headers=['bins'] + sorted(self.p_meta['samples']),
                    file_obj=output_file_obj)

            # merge and store matrices for hmm hits
            if self.non_single_copy_gene_hmm_data_available:
                for hmm_search_source in self.summary['meta']['hmm_items']:
                    # this is to keep numbers per hmm item:
                    d = {}

                    for bin_id in self.summary['meta']['bins']:
                        d[bin_id] = self.summary['collection'][bin_id]['hmms'][hmm_search_source]

                    output_file_obj = self.get_output_file_handle(
                        sub_directory='bins_across_samples',
                        prefix='%s.txt' % hmm_search_source,
                        within='hmms')
                    utils.store_dict_as_TAB_delimited_file(
                        d,
                        None,
                        headers=['bins'] + sorted(self.summary['meta']['hmm_items'][hmm_search_source]),
                        file_obj=output_file_obj)

                # this is to keep number of hmm hits per bin:
                n = dict([(bin_id, {})
                          for bin_id in self.summary['meta']['bins']])
                for hmm_search_source in self.summary['meta']['hmm_items']:
                    for bin_id in self.summary['meta']['bins']:
                        n[bin_id][hmm_search_source] = sum(self.summary['collection'][bin_id]['hmms'][hmm_search_source].values())

                output_file_obj = self.get_output_file_handle(
                    sub_directory='bins_across_samples',
                    prefix='hmm_hit_totals.txt')
                utils.store_dict_as_TAB_delimited_file(
                    n,
                    None,
                    headers=['bins'] +
                    sorted(self.summary['meta']['hmm_items']),
                    file_obj=output_file_obj)

            # store percent abundance of each bin
            self.summary['bin_percent_recruitment'] = self.bin_percent_recruitment_per_sample
            self.summary['bin_percent_abundance_items'] = sorted(list(self.bin_percent_recruitment_per_sample.values())[0].keys())
            output_file_obj = self.get_output_file_handle(
                sub_directory='bins_across_samples',
                prefix='bins_percent_recruitment.txt')
            utils.store_dict_as_TAB_delimited_file(
                self.bin_percent_recruitment_per_sample,
                None,
                headers=['samples'] + sorted(self.collection_profile.keys()) +
                ['__splits_not_binned__'],
                file_obj=output_file_obj)

        if self.debug:
            import json
            print(json.dumps(self.summary, sort_keys=True, indent=4))

        self.index_html = SummaryHTMLOutput(
            self.summary, r=self.run,
            p=self.progress).generate(quick=self.quick)
Example #30
    def append_dict_to_file(self, dict_to_append, file_handle):
        """This function adds a TAB-delimited dictionary to the end of the file.

        If the file is empty, it writes the header as well as adding the dictionary contents.
        Otherwise, it checks that the dictionary contains the same keys as the header and appends the
        dictionary contents to the end of the file.

        Parameters
        ==========
        dict_to_append : dictionary
            Holds the data you want to add to the end of the file. Keys should be headers of the file.
        file_handle : a file object
            Pointer to the file, opened in append mode. The calling function should take care of the
            open() and pass the handle here
        """

        import anvio.utils as utils
        if is_file_empty(self.path):
            utils.store_dict_as_TAB_delimited_file(dict_to_append, None, headers=self.headers, file_obj=file_handle, \
                                                    key_header=self.key_header, keys_order=self.keys_order, \
                                                    header_item_conversion_dict=self.header_item_conversion_dict, \
                                                    do_not_close_file_obj=True, do_not_write_key_column=self.do_not_write_key_column)
        else:
            # if dictionary is empty, just return
            if not dict_to_append:
                return

            file_headers = utils.get_columns_of_TAB_delim_file(
                self.path, include_first_column=True)
            inner_dict_keys = list(dict_to_append.values())[0].keys()

            # figure out if the first column holds the keys of the outer dictionary or one of the inner dictionary keys
            if file_headers[0] in inner_dict_keys:
                self.key_header = None
                self.headers = file_headers
            else:
                self.key_header = file_headers[0]
                self.headers = file_headers[1:]

            # check that the inner dictionary has the file headers we need
            missing_headers = [
                h for h in self.headers if h not in inner_dict_keys
            ]
            if len(missing_headers):
                if anvio.DEBUG:
                    if len(missing_headers) > 10:
                        raise FilesNPathsError(
                            f"Some headers from the file (n={len(missing_headers)}) are not in your dictionary :/ "
                            f"Here are the first ten of them: {missing_headers[:10].__str__()}"
                        )
                    else:
                        raise FilesNPathsError(
                            f"Some headers from the file are not in your dictionary :/ Here they are: {missing_headers.__str__()}"
                        )
                else:
                    raise FilesNPathsError(
                        "Some headers from the file are not in your dictionary :/ Use `--debug` to see where this "
                        "error is coming from the codebase with a list of example keys that are missing."
                    )

            # check that any requested outer dictionary keys are present
            if not self.keys_order:
                self.keys_order = sorted(dict_to_append.keys())
            else:
                missing_keys = [
                    k for k in self.keys_order if k not in dict_to_append
                ]
                if len(missing_keys):
                    if anvio.DEBUG:
                        if len(missing_keys) > 10:
                            raise FilesNPathsError(
                                f"Some keys (n={len(missing_keys)}) are not in your dictionary :/ Here are "
                                f"the first ten of them: {missing_keys[:10].__str__()}"
                            )
                        else:
                            raise FilesNPathsError(
                                f"Some keys are not in your dictionary :/ Here they are: {missing_keys.__str__()}"
                            )
                    else:
                        raise FilesNPathsError(
                            "Some keys are not in your dictionary :/ Use `--debug` to see where this "
                            "error is coming from the codebase with a list of example keys that are "
                            "missing.")

            # dict looks okay, append it to file
            for k in self.keys_order:
                if self.key_header:  # first column is key of outer dict
                    line = [str(k)]
                else:  # do not put the key of outer dict in the first column
                    line = []

                for header in self.headers:
                    try:
                        val = dict_to_append[k][header]
                    except KeyError:
                        raise FilesNPathsError(
                            f"Header '{header}' is not found in the dict for key '{k}':/"
                        )
                    except TypeError:
                        raise FilesNPathsError(
                            "Your dictionary is not properly formatted to be exported "
                            f"as a TAB-delimited file :/ You ask for '{header}', but it is not "
                            "even a key in the dictionary")

                    line.append(str(val) if val is not None else '')

                if anvio.AS_MARKDOWN:
                    file_handle.write(f"|{'|'.join(map(str, line))}|\n")
                else:
                    file_handle.write('%s\n' % '\t'.join(line))
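A minimal usage sketch for append_dict_to_file, assuming it lives on a wrapper object (named `appendable` here purely for illustration) that carries `path`, `headers`, and the other attributes referenced above; per the docstring, the caller owns opening and closing the handle in append mode:

    # hypothetical wrapper instance and data; once the file is non-empty, the
    # inner dict keys must match the headers already present in the file
    entries = {'gene_1': {'contig': 'c_01', 'start': 10, 'stop': 400},
               'gene_2': {'contig': 'c_01', 'start': 500, 'stop': 950}}

    with open(appendable.path, 'a') as fh:
        appendable.append_dict_to_file(entries, fh)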
Example #36
    def store_locus_as_contigs_db(self, contig_name, sequence, gene_calls, output_path_prefix, reverse_complement=False):
        """Generates a contigs database and a blank profile for a given locus"""

        temporary_files = []

        # dealing with some output file business.
        E = lambda e: output_path_prefix + e
        locus_output_db_path = E(".db")
        locus_sequence_fasta = E("_sequence.fa")
        locus_external_gene_calls = E("_external_gene_calls.txt")
        temporary_files.extend([locus_external_gene_calls, locus_sequence_fasta])

        # we will generate a blank profile database at the end of this. let's get the directory
        # business sorted.
        profile_output_dir = output_path_prefix + '-PROFILE'
        if os.path.exists(profile_output_dir):
            if self.overwrite_output_destinations:
                filesnpaths.shutil.rmtree(profile_output_dir)
            else:
                raise ConfigError("The directory %s exists, which kinda messes things up here. Either remove\
                                   it manually, or use the flag  --overwrite-output-destinations so anvi'o can\
                                   do it for you." % profile_output_dir)

        # sort out the contigs database output path
        if filesnpaths.is_file_exists(locus_output_db_path, dont_raise=True):
            if self.overwrite_output_destinations:
                os.remove(locus_output_db_path)
            else:
                raise ConfigError("There is already a contigs database at the output file path :( Either remove it first,\
                                   or use the --overwrite-output-destinations flag to give anvi'o full authority to wipe\
                                   your disk.")

        # do we need to reverse complement this guy? if yes, we will take care of the contigs sequence and
        # gene calls here, and remember this for later.
        gene_calls_list = list(gene_calls.keys())
        if reverse_complement:
            sequence = utils.rev_comp(sequence)
            gene_calls, gene_caller_id_conversion_dict = utils.rev_comp_gene_calls_dict(gene_calls, sequence)
        else:
            gene_caller_id_conversion_dict = dict([(gene_calls_list[g], g) for g in range(0, len(gene_calls_list))])
            new_gene_calls = {}
            for g in range(0, len(gene_calls_list)):
                gene_call = copy.deepcopy(gene_calls[gene_calls_list[g]])
                new_gene_calls[g] = gene_call
            gene_calls = new_gene_calls


        # write the sequence as a temporary FASTA file since the design of ContigsDatabase::create
        # will work seamlessly with this approach:
        with open(locus_sequence_fasta, 'w') as f:
            f.write('>%s\n%s\n' % (contig_name, sequence))

        # similarly, here we will store external gene calls so there will be no gene calling during
        # the generation of the contigs database
        headers = ['gene_callers_id', 'contig', 'start', 'stop', 'direction', 'partial', 'source', 'version']
        utils.store_dict_as_TAB_delimited_file(gene_calls, locus_external_gene_calls, headers=headers)

        # this is where magic happens. we ask anvi'o to create a contigs database for us.
        args = argparse.Namespace(contigs_fasta=locus_sequence_fasta,
                                  project_name=os.path.basename(output_path_prefix),
                                  split_length=sys.maxsize,
                                  kmer_size=4,
                                  external_gene_calls=locus_external_gene_calls)
        dbops.ContigsDatabase(locus_output_db_path, run=self.run_object).create(args)

        # while we are at it, here we generate a blank profile, too. so visualization of the
        # new contigs database for debugging or other purposes through anvi'o.
        args = argparse.Namespace(blank_profile=True,
                                  contigs_db=locus_output_db_path,
                                  skip_hierarchical_clustering=False,
                                  output_dir=profile_output_dir,
                                  sample_name=os.path.basename(output_path_prefix))
        profiler.BAMProfiler(args, r=self.run_object)._run()

        # so we have a contigs database! but there isn't much in it. the following where clause will
        # help us read from the tables of the original contigs database, and store it into the
        # new one throughout the following sections of the code.
        where_clause = "gene_callers_id in (%s)" % ', '.join(['"%d"' % g for g in gene_caller_id_conversion_dict])

        # a lousy anonymous function to read data from tables given the gene calls of interest
        R = lambda table_name: db.DB(self.input_contigs_db_path, None, ignore_version=True) \
                                              .get_some_rows_from_table_as_dict(table_name,
                                                                                where_clause=where_clause,
                                                                                error_if_no_data=False)

        G = lambda g: gene_caller_id_conversion_dict[g]

        ############################################################################################
        # DO FUNCTIONS
        ###########################################################################################
        function_calls = R(t.gene_function_calls_table_name)

        for entry_id in function_calls:
            function_calls[entry_id]['gene_callers_id'] = G(function_calls[entry_id]['gene_callers_id'])

        gene_function_calls_table = TableForGeneFunctions(locus_output_db_path, run=self.run_object)
        gene_function_calls_table.create(function_calls)

        self.run.info("Output contigs DB path", locus_output_db_path)
        self.run.info("Output blank profile DB path", os.path.join(profile_output_dir, 'PROFILE.db'))

        ############################################################################################
        # DO AMINO ACID SEQUENCES -- we are using external gene calls to generate the new contigs
        #                            database, but amino acid sequences are kept in a different table
        #                            and anvi'o checks whether provided gene calls resolve to amino
        #                            acid sequences with proper starts and stops. if not, it skips
        #                            them. but amino acid sequences for each gene call were stored
        #                            in the original contigs database, and the best practice is to
        #                            carry them into the new one. so here we will remove all data
        #                            from the amino acid sequences table in the new database, and
        #                            copy the contents from the original one.
        ############################################################################################
        amino_acid_sequences = R(t.gene_amino_acid_sequences_table_name)

        entries = [(gene_caller_id_conversion_dict[g], amino_acid_sequences[g]['sequence']) for g in amino_acid_sequences]
        db.DB(locus_output_db_path, None, ignore_version=True).insert_many(t.gene_amino_acid_sequences_table_name, entries=entries)

        ############################################################################################
        # REMOVE TEMP FILES
        ###########################################################################################
        if anvio.DEBUG:
            self.run.info_single("Temp output files were kept for inspection due to --debug")
        else:
            for f in temporary_files:
                os.remove(f)
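The gene_calls dictionary written out above follows the external gene calls layout implied by the headers list. A sketch of its shape with hypothetical values ('gene_callers_id' serves as the key column, so it does not appear inside the inner dicts):

    gene_calls = {0: {'contig': 'locus_contig', 'start': 0, 'stop': 687,
                      'direction': 'f', 'partial': 0,
                      'source': 'prodigal', 'version': 'v2.6.3'},
                  1: {'contig': 'locus_contig', 'start': 711, 'stop': 1140,
                      'direction': 'r', 'partial': 0,
                      'source': 'prodigal', 'version': 'v2.6.3'}}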
Example #37
    def store_gene_coverages_matrix(self):

        if self.summary.quick:
            return

        self.progress.update('Storing gene coverages ...')

        # because splits are cut from arbitrary locations, we may have partial hits of genes in a bin.
        # we don't want genes to appear in a bin more than once due to this, or end up appearing in
        # two different bins just because one bin has a fraction of a gene. here we will build the
        # genes_dict, which will contain every gene hit in all splits that are found in this genome
        # bin.
        genes_dict = {}

        for split_name in self.split_ids:
            if split_name not in self.summary.split_to_genes_in_splits_ids:
                continue

            for gene_entry_id in self.summary.split_to_genes_in_splits_ids[split_name]:
                gene_call_in_split = self.summary.genes_in_splits[gene_entry_id]
                gene_callers_id = gene_call_in_split['gene_callers_id']

                if gene_callers_id in genes_dict:
                    genes_dict[gene_callers_id].append(gene_call_in_split)
                else:
                    genes_dict[gene_callers_id] = [gene_call_in_split]


        # here we have every gene hit in this bin stored in genes_dict. what we will do is to find gene
        # call ids for genes more than 90% of which appear to be in this bin (so nothing will be reported for
        # a gene where only like 20% of it ended up in this bin).
        gene_callers_ids_for_complete_genes = set([])
        for gene_caller_id in genes_dict:
            if sum([x['percentage_in_split'] for x in genes_dict[gene_caller_id]]) > 90:
                gene_callers_ids_for_complete_genes.add(gene_caller_id)

        del genes_dict

        d = {}

        headers = ['contig', 'start', 'stop', 'direction']
        for gene_callers_id in gene_callers_ids_for_complete_genes:
            d[gene_callers_id] = {}

            # first fill in sample independent information;
            for header in headers:
                d[gene_callers_id][header] = self.summary.genes_in_contigs_dict[gene_callers_id][header]

            # then fill in distribution across samples:
            for sample_name in self.summary.p_meta['samples']:
                d[gene_callers_id][sample_name] = self.summary.gene_coverages_dict[gene_callers_id][sample_name]

            # add functions if there are any:
            if len(self.summary.gene_function_call_sources):
                for source in self.summary.gene_function_call_sources:
                    if gene_callers_id not in self.summary.gene_function_calls_dict:
                        # this gene did not get any functional annotation
                        d[gene_callers_id][source] = ''
                        continue

                    if self.summary.gene_function_calls_dict[gene_callers_id][source]:
                        d[gene_callers_id][source] = self.summary.gene_function_calls_dict[gene_callers_id][source][0]
                    else:
                        d[gene_callers_id][source] = ''

            # finally add the sequence:
            contig = self.summary.genes_in_contigs_dict[gene_callers_id]['contig']
            start = self.summary.genes_in_contigs_dict[gene_callers_id]['start']
            stop = self.summary.genes_in_contigs_dict[gene_callers_id]['stop']
            d[gene_callers_id]['sequence'] = self.summary.contig_sequences[contig]['sequence'][start:stop]

        output_file_obj = self.get_output_file_handle('functions.txt')

        if self.summary.gene_function_call_sources:
            headers = ['prot'] + headers + self.summary.p_meta['samples'] + self.summary.gene_function_call_sources + ['sequence']
        else:
            headers = ['prot'] + headers + self.summary.p_meta['samples'] + ['sequence']

        utils.store_dict_as_TAB_delimited_file(d, None, headers = headers, file_obj = output_file_obj)

        self.bin_info_dict['genes'] = {'num_genes_found': len(gene_callers_ids_for_complete_genes)}
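To make the >90% filter above concrete: a gene whose pieces land in several splits of the same bin contributes one entry per split, and their percentages are summed. A tiny sketch with hypothetical numbers:

    # two fragments of the same gene in this bin: 70% + 25% = 95% > 90, so the
    # gene counts as complete; a lone 20% fragment would have been dropped
    hits = [{'percentage_in_split': 70.0}, {'percentage_in_split': 25.0}]
    is_complete = sum(x['percentage_in_split'] for x in hits) > 90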
Example #38
    def process(self):
        # learn who you are:
        collection_dict = self.collections.get_collection_dict(self.collection_id)
        collection_colors = self.collections.get_collection_colors(self.collection_id)

        # init profile data for collection.
        self.init_collection_profile(collection_dict)

        # load completeness information if available
        self.completeness = completeness.Completeness(self.contigs_db_path)
        if len(self.completeness.sources):
            self.completeness_data_available = True

        # load HMM sources for non-single-copy genes if available
        if self.non_singlecopy_gene_hmm_sources and not self.quick:
            self.init_non_singlecopy_gene_hmm_sources()
            self.non_single_copy_gene_hmm_data_available = True

        # load gene functions from contigs db superclass
        self.init_functions()

        # set up the initial summary dictionary
        self.summary['meta'] = {'quick': self.quick,
                                'output_directory': self.output_directory,
                                'collection': list(collection_dict.keys()),
                                'num_bins': len(collection_dict.keys()),
                                'collection_id': self.collection_id,
                                'total_nts_in_collection': 0,
                                'num_contigs_in_collection': 0,
                                'anvio_version': __version__, 
                                'profile': self.p_meta,
                                'contigs': self.a_meta,
                                'gene_coverages_data_available': self.gene_coverages_data_available,
                                'completeness_data_available': self.completeness_data_available,
                                'non_single_copy_gene_hmm_data_available': self.non_single_copy_gene_hmm_data_available, 
                                'percent_contigs_nts_described_by_collection': 0.0,
                                'percent_profile_nts_described_by_collection': 0.0,
                                'percent_contigs_nts_described_by_profile': P(self.p_meta['total_length'], self.a_meta['total_length']) ,
                                'percent_contigs_contigs_described_by_profile': P(self.p_meta['num_contigs'], self.a_meta['num_contigs']) ,
                                'percent_contigs_splits_described_by_profile': P(self.p_meta['num_splits'], self.a_meta['num_splits']) ,
                                    }

        # I am not sure whether this is the best place to do this, 
        self.summary['basics_pretty'] = {'profile': [
                                                     ('Created on', self.p_meta['creation_date']),
                                                     ('Version', self.p_meta['version']),
                                                     ('Minimum contig length', pretty(self.p_meta['min_contig_length'])),
                                                     ('Number of contigs', pretty(int(self.p_meta['num_contigs']))),
                                                     ('Number of splits', pretty(int(self.p_meta['num_splits']))),
                                                     ('Total nucleotides', humanize_n(int(self.p_meta['total_length']))),
                                                    ],
                                         'contigs': [
                                                        ('Created on', self.p_meta['creation_date']),
                                                        ('Version', self.a_meta['version']),
                                                        ('Split length', pretty(int(self.a_meta['split_length']))),
                                                        ('Number of contigs', pretty(int(self.a_meta['num_contigs']))),
                                                        ('Number of splits', pretty(int(self.a_meta['num_splits']))),
                                                        ('Total nucleotides', humanize_n(int(self.a_meta['total_length']))),
                                                        ('K-mer size', self.a_meta['kmer_size']),
                                                    ],
                                        }

        self.summary['max_shown_header_items'] = 10
        self.summary['slice_header_items_tmpl'] = '0:%d' % self.summary['max_shown_header_items']
        self.summary['num_not_shown_samples'] = len(self.p_meta['samples']) - self.summary['max_shown_header_items']
        self.summary['num_not_shown_hmm_items'] = dict([(hmm_search_source, len(self.hmm_sources_info[hmm_search_source]['genes']) - self.summary['max_shown_header_items']) for hmm_search_type, hmm_search_source in self.hmm_searches_header])

        self.summary['files'] = {}
        self.summary['collection'] = {}
        self.summary['collection_profile'] = self.collection_profile # reminder; collection_profile comes from ProfileSuperclass!
        self.summary['collection_profile_items'] = list(list(self.collection_profile.values())[0].keys())

        # add hmm items for each search type:
        if self.non_single_copy_gene_hmm_data_available:
            self.summary['meta']['hmm_items'] = dict([(hmm_search_source, self.hmm_sources_info[hmm_search_source]['genes']) for hmm_search_type, hmm_search_source in self.hmm_searches_header])

        # summarize bins:
        for bin_id in collection_dict: 
            bin = Bin(self, bin_id, collection_dict[bin_id], self.run, self.progress)
            bin.output_directory = os.path.join(self.output_directory, 'bin_by_bin', bin_id)
            bin.bin_profile = self.collection_profile[bin_id]

            self.summary['collection'][bin_id] = bin.create()
            self.summary['collection'][bin_id]['color'] = collection_colors[bin_id] or '#212121'
            self.summary['meta']['total_nts_in_collection'] += self.summary['collection'][bin_id]['total_length']
            self.summary['meta']['num_contigs_in_collection'] += self.summary['collection'][bin_id]['num_contigs'] 

        # bins are computed, add some relevant meta info:
        self.summary['meta']['percent_contigs_nts_described_by_collection'] = '%.2f' % (self.summary['meta']['total_nts_in_collection'] * 100.0 / int(self.a_meta['total_length']))
        self.summary['meta']['percent_profile_nts_described_by_collection'] = '%.2f' % (self.summary['meta']['total_nts_in_collection'] * 100.0 / int(self.p_meta['total_length']))
        self.summary['meta']['bins'] = self.get_bins_ordered_by_completeness_and_size()

        if not self.quick:
            # generate a TAB-delimited text output file for bin summaries
            summary_of_bins_matrix_output = {}
            properties = ['taxon', 'total_length', 'num_contigs', 'N50', 'GC_content', 'percent_complete', 'percent_redundancy']

            for bin_name in self.summary['collection']:
                summary_of_bins_matrix_output[bin_name] = dict([(prop, self.summary['collection'][bin_name][prop]) for prop in properties])

            output_file_obj = self.get_output_file_handle(prefix = 'general_bins_summary.txt')
            utils.store_dict_as_TAB_delimited_file(summary_of_bins_matrix_output, None, headers = ['bins'] + properties, file_obj = output_file_obj)

            # save merged matrices for bins x samples
            for table_name in list(self.collection_profile.values())[0].keys():
                d = {}
                for bin_id in self.collection_profile:
                    d[bin_id] = self.collection_profile[bin_id][table_name]

                output_file_obj = self.get_output_file_handle(sub_directory = 'bins_across_samples', prefix = '%s.txt' % table_name)
                utils.store_dict_as_TAB_delimited_file(d, None, headers = ['bins'] + sorted(self.p_meta['samples']), file_obj = output_file_obj)

            # merge and store matrices for hmm hits
            if self.non_single_copy_gene_hmm_data_available:
                for hmm_search_source in self.summary['meta']['hmm_items']:
                    # this is to keep numbers per hmm item:
                    d = {}

                    for bin_id in self.summary['meta']['bins']:
                        d[bin_id] = self.summary['collection'][bin_id]['hmms'][hmm_search_source]

                    output_file_obj = self.get_output_file_handle(sub_directory = 'bins_across_samples', prefix = '%s.txt' % hmm_search_source, within='hmms')
                    utils.store_dict_as_TAB_delimited_file(d, None, headers = ['bins'] + sorted(self.summary['meta']['hmm_items'][hmm_search_source]), file_obj = output_file_obj)

                # this is to keep number of hmm hits per bin:
                n = dict([(bin_id, {}) for bin_id in self.summary['meta']['bins']])
                for hmm_search_source in self.summary['meta']['hmm_items']:
                    for bin_id in self.summary['meta']['bins']:
                        n[bin_id][hmm_search_source] =  sum(self.summary['collection'][bin_id]['hmms'][hmm_search_source].values())

                output_file_obj = self.get_output_file_handle(sub_directory = 'bins_across_samples', prefix = 'hmm_hit_totals.txt')
                utils.store_dict_as_TAB_delimited_file(n, None, headers = ['bins'] + sorted(self.summary['meta']['hmm_items']), file_obj = output_file_obj)

            # store percent abundance of each bin
            self.summary['bin_percent_recruitment'] = self.bin_percent_recruitment_per_sample
            self.summary['bin_percent_abundance_items'] = sorted(list(self.bin_percent_recruitment_per_sample.values())[0].keys())
            output_file_obj = self.get_output_file_handle(sub_directory = 'bins_across_samples', prefix = 'bins_percent_recruitment.txt')
            utils.store_dict_as_TAB_delimited_file(self.bin_percent_recruitment_per_sample,
                                                   None,
                                                   headers = ['samples'] + sorted(self.collection_profile.keys()) + ['__splits_not_binned__'],
                                                   file_obj = output_file_obj)


        if self.debug:
            import json
            print(json.dumps(self.summary, sort_keys=True, indent=4))

        self.index_html = SummaryHTMLOutput(self.summary, r = self.run, p = self.progress).generate(quick = self.quick)
Example #39
    def upload_project(self):
        try:
            args = argparse.Namespace()
            args.user = request.forms.get('username')
            args.password = request.forms.get('password')
            args.api_url = anvio.D['api-url'][1]['default']
            args.project_name = request.forms.get('project_name')
            args.delete_if_exists = request.forms.get('delete_if_exists') == "true"

            view_name = request.forms.get('view')
            if view_name in self.interactive.views:
                view_path = filesnpaths.get_temp_file_path()
                utils.store_array_as_TAB_delimited_file(
                    self.interactive.views[view_name][1:], view_path,
                    self.interactive.views[view_name][0])
                args.view_data = view_path

            item_order_name = request.forms.get('ordering')
            if item_order_name in self.interactive.p_meta['item_orders']:
                ordering_path = filesnpaths.get_temp_file_path()
                items_order = self.interactive.p_meta['item_orders'][
                    item_order_name]

                f = open(ordering_path, 'w')
                if items_order['type'] == 'newick':
                    f.write(items_order['data'])
                    args.tree = ordering_path
                elif items_order['type'] == 'basic':
                    f.write("\n".join(items_order['data']))
                    args.items_order = ordering_path
                f.close()

            state_name = request.forms.get('state')
            if state_name in self.interactive.states_table.states:
                state_path = filesnpaths.get_temp_file_path()
                f = open(state_path, 'w')
                f.write(self.interactive.states_table.states[state_name]
                        ['content'])
                f.close()

                args.state = state_path

            if request.forms.get('include_description') == "true":
                description_path = filesnpaths.get_temp_file_path()
                f = open(description_path, 'w')
                f.write(self.interactive.p_meta['description'])
                f.close()

                args.description = description_path

            if request.forms.get('include_samples') == "true":
                if len(self.interactive.samples_order_dict):
                    samples_order_path = filesnpaths.get_temp_file_path()
                    utils.store_dict_as_TAB_delimited_file(
                        self.interactive.samples_order_dict,
                        samples_order_path,
                        headers=['attributes', 'basic', 'newick'])
                    args.samples_order_file = samples_order_path

                if len(self.interactive.samples_information_dict):
                    samples_info_path = filesnpaths.get_temp_file_path()
                    utils.store_dict_as_TAB_delimited_file(
                        self.interactive.samples_information_dict,
                        samples_info_path)
                    args.samples_information_file = samples_info_path

            collection_name = request.forms.get('collection')
            if collection_name in self.interactive.collections.collections_dict:
                collection_path_prefix = filesnpaths.get_temp_file_path()
                self.interactive.collections.export_collection(
                    collection_name, output_file_prefix=collection_path_prefix)

                args.bins = collection_path_prefix + '.txt'
                args.bins_info = collection_path_prefix + '-info.txt'

            server = AnviServerAPI(args)
            server.login()
            server.push()
            return json.dumps({'status': 0})
        except Exception as e:
            message = str(e.clear_text()) if hasattr(e, 'clear_text') else str(e)
            return json.dumps({'status': 1, 'message': message})
Example #40
    def gen_samples_db_for_the_merged_profile(self):
        """Geenrate a samples db for the merged profile.

           We use the ProfileSuperclass to load all the views we added into the meged profile,
           and generate clusterings of samples for each view to generate a default samples database."""

        self.run.info_single("SAMPLES.db stuff...",
                             nl_before=1,
                             nl_after=1,
                             mc="blue")

        essential_fields = [
            f for f in self.atomic_data_fields
            if constants.IS_ESSENTIAL_FIELD(f)
        ]

        class Args:
            pass

        args = Args()
        args.profile_db = self.merged_profile_db_path

        # initialize views.
        profile_db_super = dbops.ProfileSuperclass(args)
        profile_db_super.load_views(omit_parent_column=True)

        # figure out sample orders dictionary
        sample_orders = {}
        failed_attempts = []
        self.progress.new('Working on SAMPLES.db')
        for essential_field in essential_fields:
            self.progress.update('recovering samples order for "%s"' %
                                 (essential_field))
            try:
                sample_orders[essential_field] = \
                        clustering.get_newick_tree_data_for_dict(profile_db_super.views[essential_field]['dict'],
                                                                 distance=self.distance,
                                                                 linkage=self.linkage,
                                                                 transpose=True)
            except Exception:
                failed_attempts.append(essential_field)
        self.progress.end()

        if not len(sample_orders):
            self.run.warning(
                "This may or may not be important: anvi'o attempted to generate a samples\
                              database for this merged profile, however, all attempts to cluster samples\
                              based on view data available in the merged profile failed. No samples db\
                              for you :/")
            return

        if len(failed_attempts):
            self.run.warning("While anvi'o was trying to generate clusterings of samples based on view data\
                              available in the merged profile, clustering of some of the essential data\
                              failed. It is likely not a very big deal, but you shall be the judge of it.\
                              Anvi'o now proceeds to generate a samples db with clusterings it generated\
                              using the view data that worked. Here is the list of stuff that failed: '%s'"\
                              % (', '.join(failed_attempts)))

        # generate the samples order file
        samples_order_file_path = filesnpaths.get_temp_file_path()
        samples_order_file = open(samples_order_file_path, 'w')
        samples_order_file.write('attributes\tbasic\tnewick\n')
        for sample_order in sample_orders:
            samples_order_file.write(
                '%s\t%s\t%s\n' %
                (sample_order, '', sample_orders[sample_order]))
        samples_order_file.close()

        # figure out samples information stuff
        samples_information = {}
        headers = []
        for sample_name in self.sample_ids_found_in_input_dbs:
            samples_information[sample_name] = {}

        self.progress.new('Working on SAMPLES.db')
        self.progress.update('...')

        # figure out num reads mapped per sample:
        for sample_name in self.sample_ids_found_in_input_dbs:
            samples_information[sample_name][
                'num_mapped_reads'] = self.total_reads_mapped_per_sample[
                    sample_name]

        self.progress.end()
        # generate the samples information file
        samples_information_file_path = filesnpaths.get_temp_file_path()
        utils.store_dict_as_TAB_delimited_file(samples_information,
                                               samples_information_file_path,
                                               headers=headers)

        # generate the samples database
        samples_db = dbops.SamplesInformationDatabase(self.samples_db_path,
                                                      quiet=False)
        samples_db.create(
            samples_order_path=samples_order_file_path,
            samples_information_path=samples_information_file_path)

        os.remove(samples_order_file_path)
        os.remove(samples_information_file_path)

        self.run.info('Samples database', self.samples_db_path)
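The samples order file written above is TAB-delimited with three columns, where each row fills either the 'basic' or the 'newick' column (here always 'newick', since the orders come from clustering). Hypothetical contents, with columns aligned for readability:

    attributes       basic    newick
    mean_coverage             ((s1,s2),s3);
    std_coverage              ((s2,s3),s1);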
Example #41
    def store_gene_coverages_matrix(self):

        if self.summary.quick:
            return

        self.progress.update('Storing gene coverages ...')

        # because splits are cut from arbitrary locations, we may have partial hits of genes in a bin.
        # we don't want genes to appear in a bin more than once due to this, or end up appearing in
        # two different bins just because one bin has a fraction of a gene. here we will build the
        # genes_dict, which will contain every gene hit in all splits that are found in this genome
        # bin.
        genes_dict = {}

        for split_name in self.split_ids:
            if split_name not in self.summary.split_name_to_gene_caller_ids_dict:
                continue

            for gene_entry_id in self.summary.split_name_to_gene_caller_ids_dict[
                    split_name]:
                gene_call_in_split = self.summary.genes_in_splits[
                    gene_entry_id]
                gene_callers_id = gene_call_in_split['gene_callers_id']

                if gene_callers_id in genes_dict:
                    genes_dict[gene_callers_id].append(gene_call_in_split)
                else:
                    genes_dict[gene_callers_id] = [gene_call_in_split]

        # here we have every gene hit in this bin stored in genes_dict. what we will do is to find gene
        # call ids for genes more than 90% of which appear to be in this bin (so nothing will be reported for
        # a gene where only like 20% of it ended up in this bin).
        gene_callers_ids_for_complete_genes = set([])
        for gene_caller_id in genes_dict:
            if sum(
                [x['percentage_in_split']
                 for x in genes_dict[gene_caller_id]]) > 90:
                gene_callers_ids_for_complete_genes.add(gene_caller_id)

        del genes_dict

        d = {}

        headers = ['contig', 'start', 'stop', 'direction']
        for gene_callers_id in gene_callers_ids_for_complete_genes:
            d[gene_callers_id] = {}

            # first fill in sample independent information;
            for header in headers:
                d[gene_callers_id][
                    header] = self.summary.genes_in_contigs_dict[
                        gene_callers_id][header]

            # then fill in distribution across samples:
            for sample_name in self.summary.p_meta['samples']:
                d[gene_callers_id][
                    sample_name] = self.summary.gene_coverages_dict[
                        gene_callers_id][sample_name]

            # add functions if there are any:
            if len(self.summary.gene_function_call_sources):
                for source in self.summary.gene_function_call_sources:
                    if gene_callers_id not in self.summary.gene_function_calls_dict:
                        # this gene did not get any functional annotation
                        d[gene_callers_id][source] = ''
                        continue

                    if self.summary.gene_function_calls_dict[gene_callers_id][
                            source]:
                        d[gene_callers_id][
                            source] = self.summary.gene_function_calls_dict[
                                gene_callers_id][source][0]
                    else:
                        d[gene_callers_id][source] = ''

            # finally add the sequence:
            contig = self.summary.genes_in_contigs_dict[gene_callers_id][
                'contig']
            start = self.summary.genes_in_contigs_dict[gene_callers_id][
                'start']
            stop = self.summary.genes_in_contigs_dict[gene_callers_id]['stop']
            d[gene_callers_id]['sequence'] = self.summary.contig_sequences[
                contig]['sequence'][start:stop]

        output_file_obj = self.get_output_file_handle('functions.txt')

        if self.summary.gene_function_call_sources:
            headers = ['prot'] + headers + self.summary.p_meta[
                'samples'] + self.summary.gene_function_call_sources + [
                    'sequence'
                ]
        else:
            headers = ['prot'] + headers + self.summary.p_meta['samples'] + [
                'sequence'
            ]

        utils.store_dict_as_TAB_delimited_file(d,
                                               None,
                                               headers=headers,
                                               file_obj=output_file_obj)

        self.bin_info_dict['genes'] = {
            'num_genes_found': len(gene_callers_ids_for_complete_genes)
        }
Example #42
import sys
import argparse

from anvio.merger import MultipleRuns
from anvio.constants import levels_of_taxonomy
from anvio.utils import store_dict_as_TAB_delimited_file

m = MultipleRuns(argparse.Namespace())
m.input_profile_db_paths = sys.argv[1:-1]

m.populate_layer_additional_data_dict(missing_default_data_group_is_OK=True)

for level in levels_of_taxonomy:
    store_dict_as_TAB_delimited_file(m.layer_additional_data_dict[level],
                                     '%s_%s.txt' % (sys.argv[-1], level))
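Assuming the snippet above is saved as a standalone script, it would be invoked with the input profile database paths followed by an output prefix, producing one file per taxonomic level (the script name, database paths, and prefix below are hypothetical):

    # python export-layer-taxonomy.py PROFILE_1.db PROFILE_2.db MY_PREFIX
    # -> writes MY_PREFIX_<level>.txt for each level in levels_of_taxonomy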
Example #43
File: panops.py Project: psaxcode/anvio
    def gen_data_from_protein_clusters(self, protein_clusters_dict):
        self.progress.new('Generating view data')
        self.progress.update('...')

        def store_file(data, path, headers=None):
            if not headers:
                headers = ['contig'] + sorted(list(data.values())[0].keys())

            utils.store_dict_as_TAB_delimited_file(data, path, headers=headers)

            return path

        PCs = list(protein_clusters_dict.keys())

        for PC in PCs:
            self.view_data[PC] = dict([(genome_name, 0)
                                       for genome_name in self.genomes])
            self.view_data_presence_absence[PC] = dict([
                (genome_name, 0) for genome_name in self.genomes
            ])
            self.additional_view_data[PC] = {
                'num_genes_in_pc': 0,
                'num_genomes_pc_has_hits': 0
            }
            for entry_hash, gene_caller_id in [
                    e.split('_') for e in protein_clusters_dict[PC]
            ]:
                try:
                    genome_name = self.hash_to_genome_name[entry_hash]
                except KeyError:
                    raise ConfigError, "Something horrible happened. This can only happen if you started a new analysis with\
                                        additional genomes without cleaning the previous work directory. Sounds familiar?"

                self.view_data[PC][genome_name] += 1
                self.view_data_presence_absence[PC][genome_name] = 1
                self.additional_view_data[PC]['num_genes_in_pc'] += 1
            self.additional_view_data[PC]['num_genomes_pc_has_hits'] = len([
                True for genome in self.view_data[PC]
                if self.view_data[PC][genome] > 0
            ])

        self.progress.end()

        #
        # STORING A COPY OF RAW DATA
        #
        store_file(self.view_data,
                   self.get_output_file_path('anvio-view-data-RAW.txt'),
                   headers=['contig'] + sorted(self.genomes.keys()))
        store_file(
            self.additional_view_data,
            self.get_output_file_path('anvio-additional-view-data-RAW.txt'))
        store_file(
            self.view_data_presence_absence,
            self.get_output_file_path(
                'anvio-view-data-presence-absence-RAW.txt'))

        #
        # FILTERING BASED ON OCCURRENCE
        #
        PCs_of_interest = set([])
        for PC in PCs:
            if self.additional_view_data[PC][
                    'num_genomes_pc_has_hits'] >= self.PC_min_occurrence:
                PCs_of_interest.add(PC)

        for PC in PCs:
            if PC not in PCs_of_interest:
                self.view_data.pop(PC)
                self.view_data_presence_absence.pop(PC)
                self.additional_view_data.pop(PC)

        if self.PC_min_occurrence > 1:
            self.run.info(
                'PCs min occurrence', '%d (the filter removed %s PCs)' %
                (self.PC_min_occurrence,
                 (len(protein_clusters_dict) - len(PCs_of_interest))))

        #
        # STORING FILTERED DATA
        #
        view_data_file_path = store_file(
            self.view_data,
            self.get_output_file_path('anvio-view-data.txt'),
            headers=['contig'] + sorted(self.genomes.keys()))
        additional_view_data_file_path = store_file(
            self.additional_view_data,
            self.get_output_file_path('anvio-additional-view-data.txt'))
        view_data_presence_absence_file_path = store_file(
            self.view_data_presence_absence,
            self.get_output_file_path('anvio-view-data-presence-absence.txt'))

        # here's where we finalize experimental data for clustering
        experimental_data = copy.deepcopy(self.view_data_presence_absence)
        for PC in self.additional_view_data:
            for i in range(0, int(len(self.genomes) / 2)):
                experimental_data[PC]['num_genomes_pc_has_hits_%d' %
                                      i] = self.additional_view_data[PC][
                                          'num_genomes_pc_has_hits']
        experimental_data_file_path = utils.store_dict_as_TAB_delimited_file(
            experimental_data,
            self.get_output_file_path(
                'anvio-experimental-data-for-clustering.txt'))

        self.run.info("Anvi'o view data for protein clusters",
                      view_data_file_path)
        self.run.info("Anvi'o additional view data",
                      additional_view_data_file_path)

        return view_data_file_path, view_data_presence_absence_file_path, additional_view_data_file_path, experimental_data_file_path
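All three view data structures built above share one shape: protein cluster IDs mapping to per-genome values, which is exactly what the store_file() helper expects. A sketch with hypothetical names and counts:

    # number of genes each genome contributes to each protein cluster (PC);
    # the presence/absence variant is the same shape with values of 0 or 1
    view_data = {'PC_00000001': {'genome_A': 3, 'genome_B': 1},
                 'PC_00000002': {'genome_A': 0, 'genome_B': 2}}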
Example #44
    def store_locus_as_contigs_db(self,
                                  contig_name,
                                  sequence,
                                  gene_calls,
                                  output_path_prefix,
                                  reverse_complement=False):
        """Generates a contigs database and a blank profile for a given locus"""

        temporary_files = []

        # dealing with some output file business.
        E = lambda e: output_path_prefix + e
        locus_output_db_path = E(".db")
        locus_sequence_fasta = E("_sequence.fa")
        locus_external_gene_calls = E("_external_gene_calls.txt")
        temporary_files.extend(
            [locus_external_gene_calls, locus_sequence_fasta])

        # we will generate a blank profile database at the end of this. let's get the directory
        # business sorted.
        profile_output_dir = output_path_prefix + '-PROFILE'
        if os.path.exists(profile_output_dir):
            if self.overwrite_output_destinations:
                filesnpaths.shutil.rmtree(profile_output_dir)
            else:
                raise ConfigError(
                    "The directory %s exists, which kinda messes things up here. Either remove\
                                   it manually, or use the flag  --overwrite-output-destinations so anvi'o can\
                                   do it for you." % profile_output_dir)

        # sort out the contigs database output path
        if filesnpaths.is_file_exists(locus_output_db_path, dont_raise=True):
            if self.overwrite_output_destinations:
                os.remove(locus_output_db_path)
            else:
                raise ConfigError(
                    "There is already a contigs database at the output file path :( Either remove it first,\
                                   or use the --overwrite-output-destinations flag to give anvi'o full authority to wipe\
                                   your disk.")

        # do we need to reverse complement this guy? if yes, we will take care of the contigs sequence and
        # gene calls here, and remember this for later.
        gene_calls_list = list(gene_calls.keys())
        if reverse_complement:
            sequence = utils.rev_comp(sequence)
            gene_calls, gene_caller_id_conversion_dict = utils.rev_comp_gene_calls_dict(
                gene_calls, sequence)
        else:
            gene_caller_id_conversion_dict = dict([
                (gene_calls_list[g], g) for g in range(0, len(gene_calls_list))
            ])
            new_gene_calls = {}
            for g in range(0, len(gene_calls_list)):
                gene_call = copy.deepcopy(gene_calls[gene_calls_list[g]])
                new_gene_calls[g] = gene_call
            gene_calls = new_gene_calls

        # write the sequence as a temporary FASTA file since the design of ContigsDatabase::create
        # will work seamlessly with this approach:
        with open(locus_sequence_fasta, 'w') as f:
            f.write('>%s\n%s\n' % (contig_name, sequence))

        # similarly, here we will store external gene calls so there will be no gene calling during
        # the generation of the contigs database
        headers = [
            'gene_callers_id', 'contig', 'start', 'stop', 'direction',
            'partial', 'source', 'version'
        ]
        utils.store_dict_as_TAB_delimited_file(gene_calls,
                                               locus_external_gene_calls,
                                               headers=headers)

        # this is where magic happens. we ask anvi'o to create a contigs database for us.
        args = argparse.Namespace(
            contigs_fasta=locus_sequence_fasta,
            project_name=os.path.basename(output_path_prefix),
            split_length=sys.maxsize,
            kmer_size=4,
            external_gene_calls=locus_external_gene_calls)
        dbops.ContigsDatabase(locus_output_db_path,
                              run=self.run_object).create(args)

        # while we are at it, here we generate a blank profile, too. so visualization of the
        # new contigs database for debugging or other purposes through anvi'o.
        args = argparse.Namespace(
            blank_profile=True,
            contigs_db=locus_output_db_path,
            skip_hierarchical_clustering=False,
            output_dir=profile_output_dir,
            sample_name=os.path.basename(output_path_prefix))
        profiler.BAMProfiler(args, r=self.run_object)._run()

        # so we have a contigs database! but there isn't much in it. the following where clause will
        # help us read from the tables of the original contigs database, and store it into the
        # new one throughout the following sections of the code.
        where_clause = "gene_callers_id in (%s)" % ', '.join(
            ['"%d"' % g for g in gene_caller_id_conversion_dict])

        # a lousy anonymous function to read data from tables given the gene calls of interest
        R = lambda table_name: db.DB(self.input_contigs_db_path, None, ignore_version=True) \
                                              .get_some_rows_from_table_as_dict(table_name,
                                                                                where_clause=where_clause,
                                                                                error_if_no_data=False)

        G = lambda g: gene_caller_id_conversion_dict[g]

        ############################################################################################
        # DO FUNCTIONS
        ###########################################################################################
        function_calls = R(t.gene_function_calls_table_name)

        for entry_id in function_calls:
            function_calls[entry_id]['gene_callers_id'] = G(
                function_calls[entry_id]['gene_callers_id'])

        gene_function_calls_table = TableForGeneFunctions(locus_output_db_path,
                                                          run=self.run_object)
        gene_function_calls_table.create(function_calls)

        self.run.info("Output contigs DB path", locus_output_db_path)
        self.run.info("Output blank profile DB path",
                      os.path.join(profile_output_dir, 'PROFILE.db'))

        ############################################################################################
        # DO AMINO ACID SEQUENCES -- we are using external gene calls to generate the new contigs
        #                            database, but amino acid sequences are kept in a different table
        #                            and anvi'o checks whether provided gene calls resolve to amino
        #                            acid sequences with proper starts and stops. if not, it skips
        #                            them. but amino acid sequences for each gene call were stored
        #                            in the original contigs database, and the best practice is to
        #                            carry them into the new one. so here we will remove all data
        #                            from the amino acid sequences table in the new database, and
        #                            copy the contents from the original one.
        ############################################################################################
        amino_acid_sequences = R(t.gene_amino_acid_sequences_table_name)

        entries = [(gene_caller_id_conversion_dict[g],
                    amino_acid_sequences[g]['sequence'])
                   for g in amino_acid_sequences]
        db.DB(locus_output_db_path, None, ignore_version=True).insert_many(
            t.gene_amino_acid_sequences_table_name, entries=entries)

        ############################################################################################
        # REMOVE TEMP FILES
        ###########################################################################################
        if anvio.DEBUG:
            self.run.info_single(
                "Temp output files were kept for inspection due to --debug")
        else:
            for f in temporary_files:
                os.remove(f)
Example #45
    def check_for_db_requests(self, config):
        sections = self.get_other_sections(config)
        # look for requests from the database, create temporary tab delimited files:
        for section in sections:
            alias, matrix = section.split()
            if matrix.find('::') > -1:
                if matrix.startswith('!'):
                    database, table = matrix.split('::')
                    database = database[1:]

                    if database not in self.db_paths:
                        raise ConfigError('anvio could not recover the actual path of the database\
                                            (!%s) referenced in the config file, because the database\
                                            paths variable sent from the client does not have an entry\
                                            for it :( There are two options. One is to get a db_paths\
                                            dictionary sent to this class that contains a key for %s\
                                            with the full path to the database as a value. Or the table\
                                            "%s" can be exported to a TAB-delimited matrix and declared in\
                                            the config file. If you are experimenting and stuck here, please\
                                            see the documentation or send an e-mail to the developers.'\
                                                                                % (database, database, table))
                    database_path = self.db_paths[database]
                else:
                    database, table = matrix.split('::')
                    database_path = os.path.abspath(self.db_paths[database]) if database in self.db_paths else os.path.abspath(database)

                    # if it's not there, let's try one more thing
                    if not os.path.exists(database_path):
                        database_path = os.path.abspath(os.path.join(self.input_directory, database))

                if not os.path.exists(database_path):
                    raise ConfigError("The database you requested (%s) is not where it was supposed to be ('%s') :/" % (database, database_path))

                dbc = db.DB(database_path, None, ignore_version=True)

                if not table in dbc.get_table_names():
                    raise ConfigError('The table you requested (%s) does not seem to be in %s :/' % (table, database))

                # here we know we are working with a database table that we have access to. however, anvi'o database
                # tables come in two forms: dataframe form, and matrix form. in dataframe form, we have key/value pairs rather
                # than MxN matrices where each N is a column for an attribute. while the latter is easier to export as a
                # matrix the clustering module can work with, the former requires extra attention. so here we first need
                # to figure out which form the table is in. why did this even become necessary? taking a look at this issue
                # may help: https://github.com/merenlab/anvio/issues/662
                table_form = None
                if config.has_option(section, 'table_form'):
                    table_form = config.get(section, 'table_form')

                table_rows = dbc.get_all_rows_from_table(table)

                if self.row_ids_of_interest:
                    if table_form == 'dataframe':
                        raise ConfigError("Oops .. anvi'o does not know how to deal with specific row ids of interest when a table\
                                           referenced from a clustering recipe is in dataframe form :(")
                    table_rows = [r for r in table_rows if r[0] in self.row_ids_of_interest]

                if not len(table_rows):
                    raise ConfigError("It seems the table '%s' in the database it was requested from is empty. This\
                                        is not good. Here is the section that is not working for you: '%s' :/" \
                                                                % (table, section))

                tmp_file_path = filesnpaths.get_temp_file_path()

                # time to differentially store table contents.
                if table_form == 'dataframe':
                    args = argparse.Namespace(pan_or_profile_db=database_path, table_name=table)
                    table = TableForItemAdditionalData(args)
                    table_keys_list, table_data_dict = table.get()
                    store_dict_as_TAB_delimited_file(table_data_dict, tmp_file_path)
                else:
                    table_structure = dbc.get_table_structure(table)
                    columns_to_exclude = [c for c in ['entry_id', 'sample_id'] if c in table_structure]
                    store_array(table_rows, tmp_file_path, table_structure, exclude_columns=columns_to_exclude)

                self.matrix_paths[alias] = tmp_file_path
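To make the request syntax above concrete: a config section name splits on whitespace into an alias and a matrix reference, and a leading '!' marks a database request. A sketch mirroring the parsing logic with a hypothetical section name:

    section = 'tnf !CONTIGS.db::kmer_contigs'
    alias, matrix = section.split()        # 'tnf', '!CONTIGS.db::kmer_contigs'
    database, table = matrix.split('::')   # '!CONTIGS.db', 'kmer_contigs'
    database = database[1:]                # strip the '!' -> 'CONTIGS.db'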