示例#1
0
    def profile(self):
        """Profile every contig in `self.contig_names` and populate `self.contigs`.

        For each contig, split objects are built from `self.splits_basic_info`
        and coverage is analyzed against `self.bam`. Contigs whose mean coverage
        is below `self.min_mean_coverage` are discarded; the rest get auxiliary
        analysis (unless `self.skip_SNV_profiling` is set) and are stored in
        `self.contigs` keyed by contig name. Once all contigs are processed,
        contig abundances are set and `self.check_contigs()` is called.
        """

        # contigs dropped because of low mean coverage. this set must be created once,
        # outside the loop, so that it accumulates over all contigs and is defined even
        # when there are no contigs to go through.
        discarded_contigs_due_to_C = set()

        num_contigs = len(self.contig_names)

        # So we start with essential stats. In the section below, we will simply go through each contig
        # in the BAM file and populate the contigs dictionary for the first time.
        for i, contig_name in enumerate(self.contig_names):
            contig = contigops.Contig(contig_name)
            contig.length = self.contig_lengths[i]
            contig.split_length = self.a_meta['split_length']
            contig.min_coverage_for_variability = self.min_coverage_for_variability
            contig.skip_SNV_profiling = self.skip_SNV_profiling
            contig.report_variability_full = self.report_variability_full

            self.progress.new('Profiling "%s" (%d of %d) (%s nts)' %
                              (contig.name, i + 1, num_contigs, pp(int(contig.length))))

            # populate contig with empty split objects
            contig_sequence = self.contig_sequences[contig_name]['sequence']
            for split_name in self.contig_name_to_splits[contig_name]:
                s = self.splits_basic_info[split_name]
                split = contigops.Split(split_name,
                                        contig_sequence[s['start']:s['end']],
                                        contig_name, s['order_in_parent'],
                                        s['start'], s['end'])
                contig.splits.append(split)

            # analyze coverage for each split
            contig.analyze_coverage(self.bam, self.progress)

            # test the mean coverage of the contig.
            if contig.coverage.mean < self.min_mean_coverage:
                # discard this contig and continue
                discarded_contigs_due_to_C.add(contig.name)
                self.progress.end()
                continue

            if not self.skip_SNV_profiling:
                contig.analyze_auxiliary(self.bam, self.progress)

            self.progress.end()

            # add contig to the dict.
            self.contigs[contig_name] = contig

        # only report the number of surviving contigs if the coverage filter removed any
        if discarded_contigs_due_to_C:
            self.run.info('contigs_after_C', pp(len(self.contigs)))

        # set contig abundance
        contigops.set_contigs_abundance(self.contigs)

        self.check_contigs()
示例#2
0
文件: profiler.py 项目: pythseq/anvio
    def profile_contig_worker(available_index_queue, output_queue, info_dict):
        """Worker loop that profiles contigs whose indices arrive on a queue.

        Opens the BAM file at `info_dict['input_file_path']` and then, forever,
        takes a contig index from `available_index_queue`, builds the matching
        contig with its splits, analyzes its coverage, and puts the outcome on
        `output_queue`: the contig object, or `None` when its mean coverage is
        below `info_dict['min_mean_coverage']`. Auxiliary analysis is run on
        kept contigs unless `info_dict['skip_SNV_profiling']` is set.

        The loop has no exit condition of its own; the worker is expected to be
        killed by its parent.
        """
        bam_file = pysam.Samfile(info_dict['input_file_path'], 'rb')

        # the loop below never finishes normally, so the `finally` clause only runs
        # when an exception escapes from it. it is there to make sure the BAM file
        # handle is released in that case instead of being leaked.
        try:
            while True:
                index = available_index_queue.get(True)
                contig_name = info_dict['contig_names'][index]
                contig = contigops.Contig(contig_name)
                contig.length = info_dict['contig_lengths'][index]
                contig.split_length = info_dict['split_length']
                contig.min_coverage_for_variability = info_dict[
                    'min_coverage_for_variability']
                contig.skip_SNV_profiling = info_dict['skip_SNV_profiling']
                contig.report_variability_full = info_dict[
                    'report_variability_full']

                # populate contig with empty split objects
                for split_name in info_dict['contig_name_to_splits'][contig_name]:
                    s = info_dict['splits_basic_info'][split_name]
                    split_sequence = info_dict['contig_sequences'][contig_name][
                        'sequence'][s['start']:s['end']]
                    split = contigops.Split(split_name, split_sequence,
                                            contig_name, s['order_in_parent'],
                                            s['start'], s['end'])
                    contig.splits.append(split)

                # analyze coverage for each split
                contig.analyze_coverage(bam_file)

                # test the mean coverage of the contig. `None` tells the consumer that
                # this contig was processed but did not pass the coverage filter.
                if contig.coverage.mean < info_dict['min_mean_coverage']:
                    output_queue.put(None)
                    continue

                if not info_dict['skip_SNV_profiling']:
                    contig.analyze_auxiliary(bam_file)

                output_queue.put(contig)

                # the contig has been handed over to the output queue; drop our own
                # references to its per-split data (presumably to release memory before
                # the next contig is processed).
                for split in contig.splits:
                    del split.coverage
                    del split.auxiliary
                    del split
                del contig.splits[:]
                del contig.coverage
                del contig
        finally:
            bam_file.close()
示例#3
0
    def profile_contig_worker(self, available_index_queue, output_queue):
        """Worker loop that profiles contigs whose indices arrive on a queue.

        Opens the BAM file at `self.input_file_path` and then, forever, takes a
        contig index from `available_index_queue`, builds the matching contig
        with its splits, analyzes its coverage, and puts the outcome on
        `output_queue`: the contig object, or `None` when its mean coverage is
        below `self.min_mean_coverage`.

        Unless `self.skip_SNV_profiling` is set, kept contigs also get auxiliary
        analysis: every column profile of every split is annotated with gene
        call information, and codon frequencies are computed for the codons that
        fall in exactly one complete gene call and stored in
        `contig.codon_frequencies_dict`, keyed by gene caller id.

        The loop has no exit condition of its own; the worker is expected to be
        killed by its parent.
        """
        bam_file = pysam.Samfile(self.input_file_path, 'rb')

        # the loop below never finishes normally, so the `finally` clause only runs
        # when an exception escapes from it. it is there to make sure the BAM file
        # handle is released in that case instead of being leaked.
        try:
            while True:
                index = available_index_queue.get(True)
                contig_name = self.contig_names[index]
                contig = contigops.Contig(contig_name)
                contig.length = self.contig_lengths[index]
                contig.split_length = self.a_meta['split_length']
                contig.min_coverage_for_variability = self.min_coverage_for_variability
                contig.skip_SNV_profiling = self.skip_SNV_profiling
                contig.report_variability_full = self.report_variability_full
                contig.ignore_orphans = not self.include_orphans
                contig.max_coverage_depth = self.max_coverage_depth

                # populate contig with empty split objects
                for split_name in self.contig_name_to_splits[contig_name]:
                    s = self.splits_basic_info[split_name]
                    split_sequence = self.contig_sequences[contig_name][
                        'sequence'][s['start']:s['end']]
                    split = contigops.Split(split_name, split_sequence,
                                            contig_name, s['order_in_parent'],
                                            s['start'], s['end'])
                    contig.splits.append(split)

                # analyze coverage for each split
                contig.analyze_coverage(bam_file)

                # test the mean coverage of the contig. `None` tells the consumer that
                # this contig was processed but did not pass the coverage filter.
                if contig.coverage.mean < self.min_mean_coverage:
                    output_queue.put(None)
                    continue

                if not self.skip_SNV_profiling:
                    contig.analyze_auxiliary(bam_file)

                    # unique (gene_callers_id, codon_order_in_gene) pairs seen in this contig
                    codons_in_genes_to_profile_SCVs = set()

                    for split in contig.splits:
                        for column_profile in list(split.column_profiles.values()):
                            pos_in_contig = column_profile['pos_in_contig']

                            (column_profile['in_partial_gene_call'],
                             column_profile['in_complete_gene_call'],
                             column_profile['base_pos_in_codon']) = self.get_nt_position_info(contig.name, pos_in_contig)

                            column_profile['sample_id'] = self.sample_id

                            # -1 means there is no gene call that corresponds to this nt position,
                            # which will be updated in the following lines. yeah, we use '-1',
                            # because genecaller ids start from 0 :/
                            column_profile['corresponding_gene_call'] = -1
                            column_profile['codon_order_in_gene'] = -1

                            # if this particular position (`pos_in_contig`) falls within a COMPLETE gene call,
                            # we would like to find out which unique gene caller id(s) match to this position.
                            if not column_profile['in_complete_gene_call']:
                                continue

                            corresponding_gene_caller_ids = self.get_corresponding_gene_caller_ids_for_base_position(
                                contig.name, pos_in_contig)

                            # if there are more than one corresponding gene call, this usually indicates an
                            # assembly error. just to be on the safe side, we will not report a corresponding
                            # unique gene callers id for this position
                            if len(corresponding_gene_caller_ids) != 1:
                                continue

                            # if we are here, it means this nucleotide position is in a complete gene call.
                            # we will do two things here. first, we will store the gene_callers_id that
                            # corresponds to this nt position, and then we will store the order of the
                            # corresponding codon in the gene for this nt position.
                            gene_callers_id = corresponding_gene_caller_ids[0]
                            column_profile['corresponding_gene_call'] = gene_callers_id
                            column_profile['codon_order_in_gene'] = self.get_corresponding_codon_order_in_gene(
                                gene_callers_id, contig.name, pos_in_contig)

                            # save this information for later use
                            codons_in_genes_to_profile_SCVs.add(
                                (gene_callers_id, column_profile['codon_order_in_gene']))

                    codon_frequencies = bamops.CodonFrequencies()

                    # group the codon orders to profile by the gene call they belong to
                    codons_in_genes_to_profile_SCVs_dict = {}
                    for gene_callers_id, codon_order in codons_in_genes_to_profile_SCVs:
                        codons_in_genes_to_profile_SCVs_dict.setdefault(
                            gene_callers_id, set()).add(codon_order)

                    for gene_callers_id, codons_to_profile in codons_in_genes_to_profile_SCVs_dict.items():
                        gene_call = self.genes_in_contigs_dict[gene_callers_id]

                        # the sequence is looked up through the gene call's own contig name,
                        # without rebinding the `contig_name` of the contig being profiled.
                        contig.codon_frequencies_dict[gene_callers_id] = codon_frequencies.process_gene_call(
                            bam_file, gene_call,
                            self.contig_sequences[gene_call['contig']]['sequence'],
                            codons_to_profile)

                output_queue.put(contig)

                # the contig has been handed over to the output queue; drop our own
                # references to its per-split data (presumably to release memory before
                # the next contig is processed).
                for split in contig.splits:
                    del split.coverage
                    del split.auxiliary
                    del split
                del contig.splits[:]
                del contig.coverage
                del contig
        finally:
            bam_file.close()