def process_single_samtools_run_with_read_group(bam_file, all_contigs_info, samtools_bin):
    """Stream one BAM file through ``samtools view`` and accumulate, per
    contig and per read-group sample, the number of reads seen and the
    number of those flagged as duplicates.

    The samtools filter ``-F 132`` excludes unmapped and second-in-pair
    records.  Accumulated per-sample counts are pushed into
    ``all_contigs_info.add_values`` every time the reference contig of the
    stream changes, and once more for the final contig (assumes the BAM is
    grouped by contig -- e.g. coordinate sorted).

    :param bam_file: path to the BAM file to read
    :param all_contigs_info: accumulator object exposing ``add_values``
    :param samtools_bin: path to the samtools executable
    """
    command = "%s view -h -F 132 %s" % (samtools_bin, bam_file)
    open_stream, process = get_output_stream_from_command(command)
    current_contig = None
    read_groups = {}
    try:
        # Header pass: map each @RG ID to the best available label,
        # preferring sample (SM), then library (LB), then the ID itself.
        line = None
        for line in open_stream:
            if not line.startswith("@"):
                break  # first alignment record reached; header fully parsed
            if line.startswith("@RG"):
                sp_line = line.strip().split()
                rg_id = rg_sample = rg_library = None
                for value in sp_line:
                    if value.startswith("ID"):
                        rg_id = value[3:]
                    elif value.startswith("SM"):
                        rg_sample = value[3:]
                    elif value.startswith("LB"):
                        rg_library = value[3:]
                if rg_id:
                    if rg_sample:
                        read_groups[rg_id] = rg_sample
                    elif rg_library:
                        read_groups[rg_id] = rg_library
                    else:
                        read_groups[rg_id] = rg_id

        all_sample_coverage = {}
        all_sample_duplicate = {}
        for sample in read_groups.values():
            all_sample_coverage[sample] = 0
            all_sample_duplicate[sample] = 0

        def _flush(contig):
            # Report the accumulated per-sample counts for *contig*, then
            # reset the counters for the next contig.
            for sample in read_groups.values():
                all_contigs_info.add_values(contig,
                                            all_sample_coverage.get(sample),
                                            all_sample_duplicate.get(sample),
                                            sample=sample)
                all_sample_coverage[sample] = 0
                all_sample_duplicate[sample] = 0

        def _count(sam_record):
            # Tally one mapped record against its read group's sample.
            if not sam_record.is_unmapped():
                rg_id = sam_record.get_tag("RG")
                if sam_record.is_duplicate_read():
                    all_sample_duplicate[read_groups.get(rg_id)] += 1
                all_sample_coverage[read_groups.get(rg_id)] += 1

        # Guard: an empty stream (line is None) or a header-only file (the
        # loop finished while still on an "@" line) has no read to process.
        if line is None or line.startswith("@"):
            return

        # Process the first read (already consumed by the header loop).
        sam_record = Sam_record(line.strip())
        current_contig = sam_record.get_reference_name()
        _count(sam_record)

        # Process the remaining reads, flushing on each contig change.
        i = 1
        for line in open_stream:
            i += 1
            if i % 1000000 == 0:
                print(i)  # progress marker every 1M records
            sam_record = Sam_record(line.strip())
            if current_contig != sam_record.get_reference_name() and current_contig is not None:
                _flush(current_contig)
            current_contig = sam_record.get_reference_name()
            _count(sam_record)

        # Flush the counts accumulated for the last contig seen.
        if current_contig is not None:
            _flush(current_contig)
    finally:
        open_stream.close()
def process_single_samtools_run_with_read_group(bam_file, all_contigs_info, samtools_bin):
    """Walk a BAM file via ``samtools view -h -F 132`` (unmapped and
    second-in-pair reads excluded) and, for every contig, hand the
    per-read-group-sample totals of reads and duplicates to
    ``all_contigs_info.add_values``, resetting the tallies between contigs.
    """
    open_stream, process = get_output_stream_from_command(
        "%s view -h -F 132 %s" % (samtools_bin, bam_file))
    current_contig = None
    base_name, extension = os.path.splitext(bam_file)
    read_groups = {}
    try:
        # Header pass: for each @RG line keep the label with the highest
        # preference available -- sample (SM), then library (LB), then ID.
        for line in open_stream:
            if not line.startswith("@"):
                break
            if not line.startswith("@RG"):
                continue
            tags = {}
            for token in line.strip().split():
                for key in ("ID", "SM", "LB"):
                    if token.startswith(key):
                        tags[key] = token[3:]
            if tags.get("ID"):
                read_groups[tags["ID"]] = (tags.get("SM") or tags.get("LB")
                                           or tags["ID"])

        coverage = dict((sample, 0) for sample in read_groups.values())
        duplicates = dict((sample, 0) for sample in read_groups.values())

        def report(contig):
            # Emit and zero the per-sample tallies for one finished contig.
            for sample in read_groups.values():
                all_contigs_info.add_values(contig, coverage.get(sample),
                                            duplicates.get(sample),
                                            sample=sample)
                coverage[sample] = 0
                duplicates[sample] = 0

        def tally(record):
            # Count one record (if mapped) for its read group's sample.
            if record.is_unmapped():
                return
            sample = read_groups.get(record.get_tag("RG"))
            if record.is_duplicate_read():
                duplicates[sample] += 1
            coverage[sample] += 1

        # The header loop left the first alignment line in `line`.
        record = Sam_record(line.strip())
        current_contig = record.get_reference_name()
        tally(record)

        # Remaining records: flush tallies whenever the contig changes.
        for n, line in enumerate(open_stream, 2):
            if n % 1000000 == 0:
                print(n)  # progress marker every 1M records
            record = Sam_record(line.strip())
            contig = record.get_reference_name()
            if current_contig is not None and contig != current_contig:
                report(current_contig)
            current_contig = contig
            tally(record)

        if current_contig is not None:
            report(current_contig)
    finally:
        open_stream.close()
def process_single_samtools_run_with_read_group(bam_file, all_contigs_info, samtools_bin):
    """Stream one BAM file through ``samtools view -h -F 132`` and gather,
    per contig and per read-group sample, per-locus read counts, duplicate
    counts and allele (read-sequence) tallies, reporting each contig's data
    to ``all_contigs_info.add_values`` whenever the contig changes and once
    more for the final contig.

    NOTE(review): this redefines a name already defined earlier in this
    file; at import time the later definition wins.

    :param bam_file: path to the BAM file to read
    :param all_contigs_info: accumulator object exposing ``add_values``
    :param samtools_bin: path to the samtools executable
    """
    command = "%s view -h -F 132 %s" % (samtools_bin, bam_file)
    open_stream, process = get_output_stream_from_command(command)
    current_contig = None
    read_groups = {}
    try:
        # Header pass: label each @RG ID by sample (SM), else library (LB),
        # else the ID itself.
        line = None
        for line in open_stream:
            if not line.startswith("@"):
                break  # reached the first alignment record
            if line.startswith("@RG"):
                sp_line = line.strip().split()
                rg_id = rg_sample = rg_library = None
                for value in sp_line:
                    if value.startswith("ID"):
                        rg_id = value[3:]
                    elif value.startswith("SM"):
                        rg_sample = value[3:]
                    elif value.startswith("LB"):
                        rg_library = value[3:]
                if rg_id:
                    if rg_sample:
                        read_groups[rg_id] = rg_sample
                    elif rg_library:
                        read_groups[rg_id] = rg_library
                    else:
                        read_groups[rg_id] = rg_id

        all_sample_coverage = {}
        all_sample_duplicate = {}
        all_sample_coverage_reads = {}
        for sample in read_groups.values():
            all_sample_coverage[sample] = Counter()
            all_sample_duplicate[sample] = Counter()
            all_sample_coverage_reads[sample] = defaultdict(Counter)

        def _flush(contig):
            # Report every locus accumulated for *contig* and reset all
            # per-sample accumulators for the next contig.
            for sample in read_groups.values():
                for locus in all_sample_coverage.get(sample):
                    alleles = all_sample_coverage_reads[sample].get(locus)
                    all_contigs_info.add_values(
                        contig, locus,
                        all_sample_coverage.get(sample).get(locus, 0),
                        all_sample_duplicate.get(sample).get(locus, 0),
                        alleles=alleles, sample=sample)
                all_sample_coverage[sample] = Counter()
                all_sample_duplicate[sample] = Counter()
                all_sample_coverage_reads[sample] = defaultdict(Counter)

        def _count(sam_record):
            # Tally one mapped record at its locus for its sample, keeping
            # the read sequence as the observed allele.
            if not sam_record.is_unmapped():
                rg_id = sam_record.get_tag("RG")
                loci = get_loci_from_read(sam_record)
                read_sequence = sam_record.get_query_sequence()
                if sam_record.is_duplicate_read():
                    all_sample_duplicate[read_groups.get(rg_id)][str(loci)] += 1
                all_sample_coverage[read_groups.get(rg_id)][str(loci)] += 1
                all_sample_coverage_reads[read_groups.get(rg_id)][str(loci)][read_sequence] += 1

        # Guard: an empty stream (line is None) or a header-only file (the
        # loop finished while still on an "@" line) has no read to process.
        if line is None or line.startswith("@"):
            return

        # Process the first read (already consumed by the header loop).
        sam_record = Sam_record(line.strip())
        current_contig = sam_record.get_reference_name()
        _count(sam_record)

        # Process the remaining reads, flushing on each contig change.
        i = 1
        for line in open_stream:
            i += 1
            if i % 1000000 == 0:
                print(i)  # progress marker every 1M records
            sam_record = Sam_record(line.strip())
            if current_contig != sam_record.get_reference_name() and current_contig is not None:
                _flush(current_contig)
            current_contig = sam_record.get_reference_name()
            _count(sam_record)

        # Flush the loci accumulated for the last contig seen.
        if current_contig is not None:
            _flush(current_contig)
    finally:
        open_stream.close()