示例#1
0
def test(column, quiet=False, test_class=variability_test_class):
    """Benchmark the C and Python `ColumnProfile` implementations on `column`.

    Profiles the same column with both implementations, reports each
    profile and its wall-clock time (unless `quiet`), and always reports
    how much faster the C implementation was.
    """
    coverage = len(column)

    timings = {}

    if not quiet:
        run.warning(
            '', 'Profiling results for %s nts [TestFactory = %s]' %
            (pretty(len(column)), 'True' if test_class else 'False'))

    for impl in ['C', 'Python']:
        # time a single profiling pass for this implementation
        t0 = time.time()
        profile = ColumnProfile[impl](
            column, coverage=coverage, pos=0, test_class=test_class).profile
        elapsed = time.time() - t0

        timings[impl] = {'profile': profile, 'delta_time': elapsed}

        if not quiet:
            run.info('%s profile' % impl, profile)
            run.info('%s response time' % impl, elapsed)

    # final speedup line is reported regardless of `quiet`
    run.info('Result',
             'C is ~%.2f times faster' %
             (timings['Python']['delta_time'] / timings['C']['delta_time']),
             mc='green')
def test(column, quiet=False, test_class=variability_test_class):
    """Benchmark the C and Python `ColumnProfile` implementations on `column`.

    Parameters
    ==========
    column : sequence
        Nucleotide column to profile; its length is used as the coverage.
    quiet : bool
        If True, suppress the banner and per-implementation output
        (the final speedup line is always printed).
    test_class : object
        Passed through to `ColumnProfile` as `test_class`.
    """
    coverage = len(column)

    results = {}

    if not quiet:
        run.warning('', 'Profiling results for %s nts [TestFactory = %s]' % (pretty(len(column)), 'True' if test_class else 'False'))

    for method in ['C', 'Python']:
        results[method] = {}

        # time one profiling pass for this implementation
        start = time.time()
        results[method]['profile'] = ColumnProfile[method](column, coverage=coverage, pos=0, test_class=test_class).profile
        end = time.time()

        results[method]['delta_time'] = end - start

        if not quiet:
            run.info('%s profile' % method, results[method]['profile'])
            run.info('%s response time' % method, results[method]['delta_time'])

    # always report the relative speedup of the C implementation
    run.info('Result', 'C is ~%.2f times faster' % (results['Python']['delta_time'] / results['C']['delta_time']), mc='green')
示例#3
0
    def process(self):
        """Summarize a collection into `self.summary` and render its output.

        Builds the `self.summary` dictionary (collection meta info, per-bin
        profiles, HMM hit matrices), writes several TAB-delimited matrix
        files unless `self.quick` is set, and finally renders the summary
        index HTML into `self.index_html`.

        NOTE(review): this is Python 2 code — it relies on the `print`
        statement and on subscripting `dict.values()`; it will not run
        under Python 3 as-is.
        """
        # learn who you are: fetch bin membership and colors for the
        # collection this summarizer was initialized with.
        collection_dict = self.collections.get_collection_dict(self.collection_id)
        collection_colors = self.collections.get_collection_colors(self.collection_id)

        # init profile data for the collection.
        self.init_collection_profile(collection_dict)

        # load completeness information if available
        self.completeness = completeness.Completeness(self.contigs_db_path)
        if len(self.completeness.sources):
            self.completeness_data_available = True

        # load HMM sources for non-single-copy genes if available
        if self.non_singlecopy_gene_hmm_sources and not self.quick:
            self.init_non_singlecopy_gene_hmm_sources()
            self.non_single_copy_gene_hmm_data_available = True

        # load gene functions from contigs db superclass
        self.init_functions()

        # set up the initial summary dictionary; the zero-valued counters
        # and 0.0 percentages are filled in after bins are summarized below.
        self.summary['meta'] = {'quick': self.quick,
                                'output_directory': self.output_directory,
                                'collection': collection_dict.keys(),
                                'num_bins': len(collection_dict.keys()),
                                'collection_id': self.collection_id,
                                'total_nts_in_collection': 0,
                                'num_contigs_in_collection': 0,
                                'anvio_version': __version__, 
                                'profile': self.p_meta,
                                'contigs': self.a_meta,
                                'gene_coverages_data_available': self.gene_coverages_data_available,
                                'completeness_data_available': self.completeness_data_available,
                                'non_single_copy_gene_hmm_data_available': self.non_single_copy_gene_hmm_data_available, 
                                'percent_contigs_nts_described_by_collection': 0.0,
                                'percent_profile_nts_described_by_collection': 0.0,
                                'percent_contigs_nts_described_by_profile': P(self.p_meta['total_length'], self.a_meta['total_length']) ,
                                'percent_contigs_contigs_described_by_profile': P(self.p_meta['num_contigs'], self.a_meta['num_contigs']) ,
                                'percent_contigs_splits_described_by_profile': P(self.p_meta['num_splits'], self.a_meta['num_splits']) ,
                                    }

        # human-friendly key facts for the report header (this may not be
        # the best place to compute these).
        self.summary['basics_pretty'] = {'profile': [
                                                     ('Created on', self.p_meta['creation_date']),
                                                     ('Version', self.p_meta['version']),
                                                     ('Minimum conting length', pretty(self.p_meta['min_contig_length'])),
                                                     ('Number of contigs', pretty(int(self.p_meta['num_contigs']))),
                                                     ('Number of splits', pretty(int(self.p_meta['num_splits']))),
                                                     ('Total nucleotides', humanize_n(int(self.p_meta['total_length']))),
                                                    ],
                                         'contigs': [
                                                        ('Created on', self.p_meta['creation_date']),  # NOTE(review): p_meta in the contigs section — a_meta intended? confirm
                                                        ('Version', self.a_meta['version']),
                                                        ('Split length', pretty(int(self.a_meta['split_length']))),
                                                        ('Number of contigs', pretty(int(self.a_meta['num_contigs']))),
                                                        ('Number of splits', pretty(int(self.a_meta['num_splits']))),
                                                        ('Total nucleotides', humanize_n(int(self.a_meta['total_length']))),
                                                        ('K-mer size', self.a_meta['kmer_size']),
                                                    ],
                                        }

        # header display limits: cap the number of items shown in wide
        # headers, and record how many items get hidden by that cap.
        self.summary['max_shown_header_items'] = 10
        self.summary['slice_header_items_tmpl'] = '0:%d' % self.summary['max_shown_header_items']
        self.summary['num_not_shown_samples'] = len(self.p_meta['samples']) - self.summary['max_shown_header_items']
        # per HMM search source: how many gene names fall beyond the cap
        self.summary['num_not_shown_hmm_items'] = dict([(hmm_search_source, len(self.hmm_sources_info[hmm_search_source]['genes']) - self.summary['max_shown_header_items']) for hmm_search_type, hmm_search_source in self.hmm_searches_header])

        self.summary['files'] = {}
        self.summary['collection'] = {}
        self.summary['collection_profile'] = self.collection_profile # reminder; collection_profile comes from ProfileSuperclass!
        # Python 2 only: dict.values() returns a subscriptable list here
        self.summary['collection_profile_items'] = self.collection_profile.values()[0].keys()

        # add hmm items for each search type:
        if self.non_single_copy_gene_hmm_data_available:
            self.summary['meta']['hmm_items'] = dict([(hmm_search_source, self.hmm_sources_info[hmm_search_source]['genes']) for hmm_search_type, hmm_search_source in self.hmm_searches_header])

        # summarize bins:
        for bin_id in collection_dict: 
            bin = Bin(self, bin_id, collection_dict[bin_id], self.run, self.progress)
            bin.output_directory = os.path.join(self.output_directory, 'bin_by_bin', bin_id)
            bin.bin_profile = self.collection_profile[bin_id]

            self.summary['collection'][bin_id] = bin.create()
            # fall back to a default dark gray when no color is defined
            self.summary['collection'][bin_id]['color'] = collection_colors[bin_id] or '#212121'
            self.summary['meta']['total_nts_in_collection'] += self.summary['collection'][bin_id]['total_length']
            self.summary['meta']['num_contigs_in_collection'] += self.summary['collection'][bin_id]['num_contigs'] 

        # bins are computed, add some relevant meta info:
        self.summary['meta']['percent_contigs_nts_described_by_collection'] = '%.2f' % (self.summary['meta']['total_nts_in_collection'] * 100.0 / int(self.a_meta['total_length']))
        self.summary['meta']['percent_profile_nts_described_by_collection'] = '%.2f' % (self.summary['meta']['total_nts_in_collection'] * 100.0 / int(self.p_meta['total_length']))
        self.summary['meta']['bins'] = self.get_bins_ordered_by_completeness_and_size()

        if not self.quick:
            # generate a TAB-delimited text output file for bin summaries
            summary_of_bins_matrix_output = {}
            properties = ['taxon', 'total_length', 'num_contigs', 'N50', 'GC_content', 'percent_complete', 'percent_redundancy']

            for bin_name in self.summary['collection']:
                summary_of_bins_matrix_output[bin_name] = dict([(prop, self.summary['collection'][bin_name][prop]) for prop in properties])

            output_file_obj = self.get_output_file_handle(prefix = 'general_bins_summary.txt')
            utils.store_dict_as_TAB_delimited_file(summary_of_bins_matrix_output, None, headers = ['bins'] + properties, file_obj = output_file_obj)

            # save merged matrices for bins x samples
            # (Python 2 only: dict.values() subscripted as a list)
            for table_name in self.collection_profile.values()[0].keys():
                d = {}
                for bin_id in self.collection_profile:
                    d[bin_id] = self.collection_profile[bin_id][table_name]

                output_file_obj = self.get_output_file_handle(sub_directory = 'bins_across_samples', prefix = '%s.txt' % table_name)
                utils.store_dict_as_TAB_delimited_file(d, None, headers = ['bins'] + sorted(self.p_meta['samples']), file_obj = output_file_obj)

            # merge and store matrices for hmm hits
            if self.non_single_copy_gene_hmm_data_available:
                for hmm_search_source in self.summary['meta']['hmm_items']:
                    # this is to keep numbers per hmm item:
                    d = {}

                    for bin_id in self.summary['meta']['bins']:
                        d[bin_id] = self.summary['collection'][bin_id]['hmms'][hmm_search_source]

                    output_file_obj = self.get_output_file_handle(sub_directory = 'bins_across_samples', prefix = '%s.txt' % hmm_search_source, within='hmms')
                    utils.store_dict_as_TAB_delimited_file(d, None, headers = ['bins'] + sorted(self.summary['meta']['hmm_items'][hmm_search_source]), file_obj = output_file_obj)

                # this is to keep number of hmm hits per bin:
                n = dict([(bin_id, {}) for bin_id in self.summary['meta']['bins']])
                for hmm_search_source in self.summary['meta']['hmm_items']:
                    for bin_id in self.summary['meta']['bins']:
                        n[bin_id][hmm_search_source] =  sum(self.summary['collection'][bin_id]['hmms'][hmm_search_source].values())

                output_file_obj = self.get_output_file_handle(sub_directory = 'bins_across_samples', prefix = 'hmm_hit_totals.txt')
                utils.store_dict_as_TAB_delimited_file(n, None, headers = ['bins'] + sorted(self.summary['meta']['hmm_items']), file_obj = output_file_obj)

            # store percent abundance of each bin
            self.summary['bin_percent_recruitment'] = self.bin_percent_recruitment_per_sample
            self.summary['bin_percent_abundance_items'] = sorted(self.bin_percent_recruitment_per_sample.values()[0].keys())
            output_file_obj = self.get_output_file_handle(sub_directory = 'bins_across_samples', prefix = 'bins_percent_recruitment.txt')
            utils.store_dict_as_TAB_delimited_file(self.bin_percent_recruitment_per_sample,
                                                   None,
                                                   headers = ['samples'] + sorted(self.collection_profile.keys()) + ['__splits_not_binned__'],
                                                   file_obj = output_file_obj)


        if self.debug:
            import json
            # Python 2 print statement
            print json.dumps(self.summary, sort_keys=True, indent=4)

        self.index_html = SummaryHTMLOutput(self.summary, r = self.run, p = self.progress).generate(quick = self.quick)
示例#4
0
    def process(self):
        """Summarize a collection into `self.summary` and render its output.

        Builds the `self.summary` dictionary (collection meta info, per-bin
        profiles, HMM hit matrices), writes several TAB-delimited matrix
        files unless `self.quick` is set, and finally renders the summary
        index HTML into `self.index_html`.

        NOTE(review): this is Python 2 code — it relies on the `print`
        statement and on subscripting `dict.values()`; it will not run
        under Python 3 as-is.
        """
        # learn who you are: fetch bin membership and per-bin display
        # metadata for the collection this summarizer was initialized with.
        collection_dict = self.collections.get_collection_dict(
            self.collection_name)
        bins_info_dict = self.collections.get_bins_info_dict(
            self.collection_name)

        # init profile data for the collection.
        self.init_collection_profile(collection_dict)

        # load completeness information if available
        self.completeness = completeness.Completeness(self.contigs_db_path)
        if len(self.completeness.sources):
            self.completeness_data_available = True

        # load HMM sources for non-single-copy genes if available
        if self.non_singlecopy_gene_hmm_sources and not self.quick:
            self.init_non_singlecopy_gene_hmm_sources()
            self.non_single_copy_gene_hmm_data_available = True

        # load gene functions from contigs db superclass
        self.init_functions()

        # set up the initial summary dictionary; the zero-valued counters
        # and 0.0 percentages are filled in after bins are summarized below.
        self.summary['meta'] = {
            'quick':
            self.quick,
            'output_directory':
            self.output_directory,
            'collection':
            collection_dict.keys(),
            'num_bins':
            len(collection_dict.keys()),
            'collection_name':
            self.collection_name,
            'total_nts_in_collection':
            0,
            'num_contigs_in_collection':
            0,
            'anvio_version':
            __version__,
            'profile':
            self.p_meta,
            'contigs':
            self.a_meta,
            'gene_coverages_data_available':
            self.gene_coverages_data_available,
            'completeness_data_available':
            self.completeness_data_available,
            'non_single_copy_gene_hmm_data_available':
            self.non_single_copy_gene_hmm_data_available,
            'percent_contigs_nts_described_by_collection':
            0.0,
            'percent_profile_nts_described_by_collection':
            0.0,
            'percent_contigs_nts_described_by_profile':
            P(self.p_meta['total_length'], self.a_meta['total_length']),
            'percent_contigs_contigs_described_by_profile':
            P(self.p_meta['num_contigs'], self.a_meta['num_contigs']),
            'percent_contigs_splits_described_by_profile':
            P(self.p_meta['num_splits'], self.a_meta['num_splits']),
        }

        # human-friendly key facts for the report header (this may not be
        # the best place to compute these).
        self.summary['basics_pretty'] = {
            'profile': [
                ('Created on', self.p_meta['creation_date']),
                ('Version', self.p_meta['version']),
                ('Minimum conting length',
                 pretty(self.p_meta['min_contig_length'])),
                ('Number of contigs', pretty(int(self.p_meta['num_contigs']))),
                ('Number of splits', pretty(int(self.p_meta['num_splits']))),
                ('Total nucleotides',
                 humanize_n(int(self.p_meta['total_length']))),
            ],
            'contigs': [
                # NOTE(review): p_meta in the contigs section — a_meta intended? confirm
                ('Created on', self.p_meta['creation_date']),
                ('Version', self.a_meta['version']),
                ('Split length', pretty(int(self.a_meta['split_length']))),
                ('Number of contigs', pretty(int(self.a_meta['num_contigs']))),
                ('Number of splits', pretty(int(self.a_meta['num_splits']))),
                ('Total nucleotides',
                 humanize_n(int(self.a_meta['total_length']))),
                ('K-mer size', self.a_meta['kmer_size']),
            ],
        }

        # header display limits: cap the number of items shown in wide
        # headers, and record how many items get hidden by that cap.
        self.summary['max_shown_header_items'] = 10
        self.summary['slice_header_items_tmpl'] = '0:%d' % self.summary[
            'max_shown_header_items']
        self.summary['num_not_shown_samples'] = len(
            self.p_meta['samples']) - self.summary['max_shown_header_items']
        # per HMM search source: how many gene names fall beyond the cap
        self.summary['num_not_shown_hmm_items'] = dict([
            (hmm_search_source,
             len(self.hmm_sources_info[hmm_search_source]['genes']) -
             self.summary['max_shown_header_items'])
            for hmm_search_type, hmm_search_source in self.hmm_searches_header
        ])

        self.summary['files'] = {}
        self.summary['collection'] = {}
        self.summary[
            'collection_profile'] = self.collection_profile  # reminder; collection_profile comes from ProfileSuperclass!
        # Python 2 only: dict.values() returns a subscriptable list here
        self.summary[
            'collection_profile_items'] = self.collection_profile.values(
            )[0].keys()

        # add hmm items for each search type:
        if self.non_single_copy_gene_hmm_data_available:
            self.summary['meta']['hmm_items'] = dict([
                (hmm_search_source,
                 self.hmm_sources_info[hmm_search_source]['genes']) for
                hmm_search_type, hmm_search_source in self.hmm_searches_header
            ])

        # summarize bins:
        for bin_id in collection_dict:
            bin = Bin(self, bin_id, collection_dict[bin_id], self.run,
                      self.progress)
            bin.output_directory = os.path.join(self.output_directory,
                                                'bin_by_bin', bin_id)
            bin.bin_profile = self.collection_profile[bin_id]

            self.summary['collection'][bin_id] = bin.create()
            # fall back to defaults when bin display metadata is missing
            self.summary['collection'][bin_id][
                'color'] = bins_info_dict[bin_id]['html_color'] or '#212121'
            self.summary['collection'][bin_id]['source'] = bins_info_dict[
                bin_id]['source'] or 'unknown_source'
            self.summary['meta']['total_nts_in_collection'] += self.summary[
                'collection'][bin_id]['total_length']
            self.summary['meta']['num_contigs_in_collection'] += self.summary[
                'collection'][bin_id]['num_contigs']

        # bins are computed, add some relevant meta info:
        self.summary['meta'][
            'percent_contigs_nts_described_by_collection'] = '%.2f' % (
                self.summary['meta']['total_nts_in_collection'] * 100.0 /
                int(self.a_meta['total_length']))
        self.summary['meta'][
            'percent_profile_nts_described_by_collection'] = '%.2f' % (
                self.summary['meta']['total_nts_in_collection'] * 100.0 /
                int(self.p_meta['total_length']))
        self.summary['meta'][
            'bins'] = self.get_bins_ordered_by_completeness_and_size()

        if not self.quick:
            # generate a TAB-delimited text output file for bin summaries
            summary_of_bins_matrix_output = {}
            properties = [
                'taxon', 'total_length', 'num_contigs', 'N50', 'GC_content',
                'percent_complete', 'percent_redundancy'
            ]

            for bin_name in self.summary['collection']:
                summary_of_bins_matrix_output[bin_name] = dict([
                    (prop, self.summary['collection'][bin_name][prop])
                    for prop in properties
                ])

            output_file_obj = self.get_output_file_handle(
                prefix='general_bins_summary.txt')
            utils.store_dict_as_TAB_delimited_file(
                summary_of_bins_matrix_output,
                None,
                headers=['bins'] + properties,
                file_obj=output_file_obj)

            # save merged matrices for bins x samples
            # (Python 2 only: dict.values() subscripted as a list)
            for table_name in self.collection_profile.values()[0].keys():
                d = {}
                for bin_id in self.collection_profile:
                    d[bin_id] = self.collection_profile[bin_id][table_name]

                output_file_obj = self.get_output_file_handle(
                    sub_directory='bins_across_samples',
                    prefix='%s.txt' % table_name)
                utils.store_dict_as_TAB_delimited_file(
                    d,
                    None,
                    headers=['bins'] + sorted(self.p_meta['samples']),
                    file_obj=output_file_obj)

            # merge and store matrices for hmm hits
            if self.non_single_copy_gene_hmm_data_available:
                for hmm_search_source in self.summary['meta']['hmm_items']:
                    # this is to keep numbers per hmm item:
                    d = {}

                    for bin_id in self.summary['meta']['bins']:
                        d[bin_id] = self.summary['collection'][bin_id]['hmms'][
                            hmm_search_source]

                    output_file_obj = self.get_output_file_handle(
                        sub_directory='bins_across_samples',
                        prefix='%s.txt' % hmm_search_source,
                        within='hmms')
                    utils.store_dict_as_TAB_delimited_file(
                        d,
                        None,
                        headers=['bins'] +
                        sorted(self.summary['meta']['hmm_items']
                               [hmm_search_source]),
                        file_obj=output_file_obj)

                # this is to keep number of hmm hits per bin:
                n = dict([(bin_id, {})
                          for bin_id in self.summary['meta']['bins']])
                for hmm_search_source in self.summary['meta']['hmm_items']:
                    for bin_id in self.summary['meta']['bins']:
                        n[bin_id][hmm_search_source] = sum(
                            self.summary['collection'][bin_id]['hmms']
                            [hmm_search_source].values())

                output_file_obj = self.get_output_file_handle(
                    sub_directory='bins_across_samples',
                    prefix='hmm_hit_totals.txt')
                utils.store_dict_as_TAB_delimited_file(
                    n,
                    None,
                    headers=['bins'] +
                    sorted(self.summary['meta']['hmm_items']),
                    file_obj=output_file_obj)

            # store percent abundance of each bin
            self.summary[
                'bin_percent_recruitment'] = self.bin_percent_recruitment_per_sample
            self.summary['bin_percent_abundance_items'] = sorted(
                self.bin_percent_recruitment_per_sample.values()[0].keys())
            output_file_obj = self.get_output_file_handle(
                sub_directory='bins_across_samples',
                prefix='bins_percent_recruitment.txt')
            utils.store_dict_as_TAB_delimited_file(
                self.bin_percent_recruitment_per_sample,
                None,
                headers=['samples'] + sorted(self.collection_profile.keys()) +
                ['__splits_not_binned__'],
                file_obj=output_file_obj)

        if self.debug:
            import json
            # Python 2 print statement
            print json.dumps(self.summary, sort_keys=True, indent=4)

        self.index_html = SummaryHTMLOutput(
            self.summary, r=self.run,
            p=self.progress).generate(quick=self.quick)