def profile(self):
    """Profile every contig and populate ``self.contigs``.

    For each name in ``self.contig_names`` this builds a ``contigops.Contig``,
    attaches one ``contigops.Split`` per entry of
    ``self.contig_name_to_splits[contig_name]``, and analyzes its coverage
    against ``self.bam``. Contigs whose mean coverage is below
    ``self.min_mean_coverage`` are discarded; the others are stored in
    ``self.contigs`` keyed by contig name (after an auxiliary analysis unless
    ``self.skip_SNV_profiling`` is set).

    Once all contigs are processed, contig abundances are set through
    ``contigops.set_contigs_abundance`` and ``self.check_contigs()`` is called.
    """
    # Names of contigs rejected for low mean coverage. This set has to be
    # created once, before the loop: re-creating it per contig would forget
    # every earlier rejection and would leave the name undefined when there
    # are no contigs at all.
    discarded_contigs_due_to_C = set()

    num_contigs = len(self.contig_names)

    # So we start with essential stats. In the section below, we will simply
    # go through each contig in the BAM file and populate the contigs
    # dictionary for the first time.
    for i, contig_name in enumerate(self.contig_names):
        contig = contigops.Contig(contig_name)
        contig.length = self.contig_lengths[i]
        contig.split_length = self.a_meta['split_length']
        contig.min_coverage_for_variability = self.min_coverage_for_variability
        contig.skip_SNV_profiling = self.skip_SNV_profiling
        contig.report_variability_full = self.report_variability_full

        self.progress.new('Profiling "%s" (%d of %d) (%s nts)'
                          % (contig.name, i + 1, num_contigs, pp(int(contig.length))))

        # populate contig with empty split objects
        for split_name in self.contig_name_to_splits[contig_name]:
            s = self.splits_basic_info[split_name]
            split_sequence = self.contig_sequences[contig_name]['sequence'][s['start']:s['end']]
            split = contigops.Split(split_name, split_sequence, contig_name,
                                    s['order_in_parent'], s['start'], s['end'])
            contig.splits.append(split)

        # analyze coverage for each split
        contig.analyze_coverage(self.bam, self.progress)

        # test the mean coverage of the contig.
        if contig.coverage.mean < self.min_mean_coverage:
            # discard this contig and continue
            discarded_contigs_due_to_C.add(contig.name)
            self.progress.end()
            continue

        if not self.skip_SNV_profiling:
            contig.analyze_auxiliary(self.bam, self.progress)

        self.progress.end()

        # add contig to the dict.
        self.contigs[contig_name] = contig

    # report how many contigs survived, but only if the coverage filter
    # actually removed something.
    if discarded_contigs_due_to_C:
        self.run.info('contigs_after_C', pp(len(self.contigs)))

    # set contig abundance
    contigops.set_contigs_abundance(self.contigs)

    self.check_contigs()
def profile_contig_worker(available_index_queue, output_queue, info_dict):
    """Worker loop that profiles one contig per index taken from a queue.

    Opens the BAM file at ``info_dict['input_file_path']`` and then loops
    forever: it takes an index from ``available_index_queue``, builds the
    matching ``contigops.Contig`` with its splits from the tables in
    ``info_dict``, analyzes coverage, and puts either the contig or ``None``
    (mean coverage below ``info_dict['min_mean_coverage']``) on
    ``output_queue``.

    NOTE(review): the queues are presumably multiprocessing queues and
    ``get(True)`` a blocking read -- confirm against the caller.
    """
    bam_file = pysam.Samfile(info_dict['input_file_path'], 'rb')

    while True:
        contig_index = available_index_queue.get(True)
        contig_name = info_dict['contig_names'][contig_index]

        contig = contigops.Contig(contig_name)
        contig.length = info_dict['contig_lengths'][contig_index]
        contig.split_length = info_dict['split_length']
        contig.min_coverage_for_variability = info_dict['min_coverage_for_variability']
        contig.skip_SNV_profiling = info_dict['skip_SNV_profiling']
        contig.report_variability_full = info_dict['report_variability_full']

        # fill the contig with split objects that carry no results yet
        for split_name in info_dict['contig_name_to_splits'][contig_name]:
            split_info = info_dict['splits_basic_info'][split_name]
            start, end = split_info['start'], split_info['end']
            sequence = info_dict['contig_sequences'][contig_name]['sequence'][start:end]
            contig.splits.append(
                contigops.Split(split_name, sequence, contig_name,
                                split_info['order_in_parent'], start, end))

        # coverage of every split
        contig.analyze_coverage(bam_file)

        # contigs with too little mean coverage are reported as None
        if contig.coverage.mean < info_dict['min_mean_coverage']:
            output_queue.put(None)
            continue

        if not info_dict['skip_SNV_profiling']:
            contig.analyze_auxiliary(bam_file)

        output_queue.put(contig)

        # drop the per-split and per-contig data held by this worker once
        # the contig has been handed to the output queue
        for finished_split in contig.splits:
            del finished_split.coverage
            del finished_split.auxiliary
            del finished_split

        del contig.splits[:]
        del contig.coverage
        del contig

    # The close below is kept for clarity only: the loop above never exits,
    # so this is not reached and the worker is killed by its parent.
    bam_file.close()
def profile_contig_worker(self, available_index_queue, output_queue):
    """Worker loop that profiles one contig per index taken from a queue.

    Opens the BAM file at ``self.input_file_path`` and then loops forever:
    it takes an index from ``available_index_queue``, builds the matching
    ``contigops.Contig`` with its splits, analyzes coverage, and puts either
    the contig or ``None`` (mean coverage below ``self.min_mean_coverage``)
    on ``output_queue``.

    Unless ``self.skip_SNV_profiling`` is set, every column profile of every
    split is also annotated with gene-call information, and codon
    frequencies are stored in ``contig.codon_frequencies_dict`` for each
    gene that has at least one annotated position.

    NOTE(review): the queues are presumably multiprocessing queues and
    ``get(True)`` a blocking read -- confirm against the caller.
    """
    bam_file = pysam.Samfile(self.input_file_path, 'rb')

    while True:
        contig_index = available_index_queue.get(True)
        contig_name = self.contig_names[contig_index]

        contig = contigops.Contig(contig_name)
        contig.length = self.contig_lengths[contig_index]
        contig.split_length = self.a_meta['split_length']
        contig.min_coverage_for_variability = self.min_coverage_for_variability
        contig.skip_SNV_profiling = self.skip_SNV_profiling
        contig.report_variability_full = self.report_variability_full
        contig.ignore_orphans = not self.include_orphans
        contig.max_coverage_depth = self.max_coverage_depth

        # fill the contig with split objects that carry no results yet
        for split_name in self.contig_name_to_splits[contig_name]:
            split_info = self.splits_basic_info[split_name]
            start, end = split_info['start'], split_info['end']
            sequence = self.contig_sequences[contig_name]['sequence'][start:end]
            contig.splits.append(
                contigops.Split(split_name, sequence, contig_name,
                                split_info['order_in_parent'], start, end))

        # coverage of every split
        contig.analyze_coverage(bam_file)

        # contigs with too little mean coverage are reported as None
        if contig.coverage.mean < self.min_mean_coverage:
            output_queue.put(None)
            continue

        if not self.skip_SNV_profiling:
            contig.analyze_auxiliary(bam_file)

            # (gene_callers_id, codon_order) pairs seen in column profiles
            gene_codon_pairs = set()

            for split in contig.splits:
                for column_profile in list(split.column_profiles.values()):
                    pos_in_contig = column_profile['pos_in_contig']

                    in_partial, in_complete, base_pos = self.get_nt_position_info(
                        contig.name, pos_in_contig)
                    column_profile['in_partial_gene_call'] = in_partial
                    column_profile['in_complete_gene_call'] = in_complete
                    column_profile['base_pos_in_codon'] = base_pos

                    column_profile['sample_id'] = self.sample_id

                    # -1 means "no gene call corresponds to this nt
                    # position"; it is overwritten below when one is found.
                    # -1 is used because gene caller ids start from 0.
                    column_profile['corresponding_gene_call'] = -1
                    column_profile['codon_order_in_gene'] = -1

                    # only positions inside a COMPLETE gene call are matched
                    # to gene caller ids
                    if not column_profile['in_complete_gene_call']:
                        continue

                    matching_ids = self.get_corresponding_gene_caller_ids_for_base_position(
                        contig.name, pos_in_contig)

                    # more than one matching gene call usually indicates an
                    # assembly error; to be on the safe side no gene callers
                    # id is reported for such a position
                    if len(matching_ids) != 1:
                        continue

                    # the position is in exactly one complete gene call:
                    # store its gene callers id, then the order of the
                    # corresponding codon within that gene
                    gene_callers_id = matching_ids[0]
                    column_profile['corresponding_gene_call'] = gene_callers_id
                    column_profile['codon_order_in_gene'] = \
                        self.get_corresponding_codon_order_in_gene(
                            gene_callers_id, contig.name, pos_in_contig)

                    # remember the pair for the codon frequency step below
                    gene_codon_pairs.add(
                        (gene_callers_id, column_profile['codon_order_in_gene']))

            codon_frequencies = bamops.CodonFrequencies()

            # group codon orders by gene: gene_callers_id -> set of orders
            codon_orders_by_gene = {}
            for gene_callers_id, codon_order in gene_codon_pairs:
                codon_orders_by_gene.setdefault(gene_callers_id, set()).add(codon_order)

            for gene_callers_id, codons_to_profile in codon_orders_by_gene.items():
                gene_call = self.genes_in_contigs_dict[gene_callers_id]
                gene_contig_name = gene_call['contig']
                contig.codon_frequencies_dict[gene_callers_id] = \
                    codon_frequencies.process_gene_call(
                        bam_file,
                        gene_call,
                        self.contig_sequences[gene_contig_name]['sequence'],
                        codons_to_profile)

        output_queue.put(contig)

        # drop the per-split and per-contig data held by this worker once
        # the contig has been handed to the output queue
        for finished_split in contig.splits:
            del finished_split.coverage
            del finished_split.auxiliary
            del finished_split

        del contig.splits[:]
        del contig.coverage
        del contig

    # The close below is kept for clarity only: the loop above never exits,
    # so this is not reached and the worker is killed by its parent.
    bam_file.close()