def set_read1_consensus_to_read2(input_stream, output_stream):
    """Pair consecutive SAM records by query name and copy read 1's
    mapping onto its mate.

    Header lines (starting with "@") are passed through unchanged.
    For each name-matched pair, if read 1 is mapped, read 2 is marked
    mapped on the same reference at position 1 with a full-length
    match CIGAR.  A record whose mate is missing is written out
    unmodified.  Assumes the input is name-sorted so mates are
    adjacent.

    :param input_stream: readable file-like object yielding SAM lines
    :param output_stream: writable file-like object receiving SAM lines
    """
    # Copy the SAM header straight through to the output.
    line = input_stream.readline()
    while line.startswith("@"):
        output_stream.write(line)
        line = input_stream.readline()
    prev_read = Sam_record(line)
    for line in input_stream:
        read = Sam_record(line)
        if prev_read and read.get_query_name() == prev_read.get_query_name():
            # Same query name: this is a pair.  Orient so read1 is the
            # first-in-pair record regardless of input order.
            if read.is_second_read() and prev_read.is_first_read():
                read1 = prev_read
                read2 = read
            else:
                read2 = prev_read
                read1 = read
            if not read1.is_unmapped():
                # Propagate read1's mapping onto read2: same reference,
                # position 1, CIGAR covering read2's full length.
                read2.set_reference_name(read1.get_reference_name())
                read2.set_unmapped_flag(False)
                read2.set_position(1)
                read2.set_cigar_string("%sM" % len(read2.get_query_sequence()))
            output_stream.write(str(read1))
            output_stream.write(str(read2))
            prev_read = None
        elif prev_read:
            # Query name changed: prev_read has no mate; emit it as-is.
            output_stream.write(str(prev_read))
            prev_read = read
        else:
            prev_read = read
    # BUG FIX: a trailing unpaired record left in prev_read at end of
    # input used to be dropped silently; flush it so no read is lost.
    if prev_read:
        output_stream.write(str(prev_read))
# NOTE(review): this fragment has been collapsed onto one line by extraction
# and begins with a bare `continue` -- it is the interior of a per-record
# loop of a duplicate-detection pass (plus what looks like the post-loop
# flush), whose enclosing `for` and function header are not visible here.
# Original indentation is lost; do not re-indent without the enclosing
# function.  Logic as written: reads are grouped per reference into
# first_reads/second_reads keyed by query name; when the reference changes,
# find_duplicates() is run on the completed group, the dup/uniq counters and
# nb_fragment are updated, the group is emitted via output_reads(), and the
# dictionaries are reset.  Progress is printed every 1000 references
# (Python 2 print statement).
# NOTE(review): the trailing flush assigns `nb_dups = find_duplicates(...)`
# without unpacking, although the in-loop branch unpacks a
# (nb_dups, nb_uniq) tuple -- `total_nb_dups += nb_dups` would then add a
# tuple to an int.  It also never calls output_reads(), unlike the in-loop
# branch.  Presumably a stale copy of the code at the end of this file;
# confirm which version is current before changing.
continue sam_record = Sam_record(line) if sam_record.get_reference_name( ) != current_reference and not current_reference is None: #process this consensus if current_reference != '*': nb_dups, nb_uniq = find_duplicates(first_reads, second_reads, distance_threshold) total_nb_uniqs += nb_uniq total_nb_dups += nb_dups nb_fragment += len(second_reads) output_reads(output_stream, first_reads, second_reads) first_reads = {} second_reads = {} if sam_record.is_second_read(): second_reads[sam_record.get_query_name()] = sam_record else: first_reads[sam_record.get_query_name()] = sam_record nb_reference += 1 if nb_reference % 1000 == 0: print "process %s consensus" % nb_reference current_reference = sam_record.get_reference_name() if sam_record.get_reference_name( ) != current_reference and not current_reference is None: #process this consensus if current_reference != '*': nb_dups = find_duplicates(first_reads, second_reads, distance_threshold) total_nb_dups += nb_dups nb_fragment += len(second_reads)
# NOTE(review): collapsed fragment (newlines lost) -- the body of a function
# whose header is outside this view.  It pipes SAM text through
# "samtools view -bS - | samtools sort - <prefix>" to produce a sorted BAM,
# copies the SAM header through, then consumes records two lines at a time
# assuming the input is name-sorted: each pair is oriented so read1 is
# first-in-pair, read2 inherits read1's reference name, and both are written
# to the pipe.  A query-name mismatch between consecutive lines logs a
# critical "bam file is not sorted by read name" and aborts, closing both
# streams.
# NOTE(review): the pipe's (stream, process) pair is bound to
# (output_stream, process_output), but the final wait() is on
# `process_input`, a name not assigned in this fragment -- presumably
# defined in the enclosing scope, or a typo for process_output; verify
# against the full function before touching.
command ="%s view -bS - | %s sort - %s"%(samtools_bin, samtools_bin, output_bam_file) logging.info(command) output_stream,process_output= utils_commands.get_input_stream_from_command(command) #get the header line = input_stream.readline() while line.startswith("@"): output_stream.write(line) line = input_stream.readline() while line: read1=Sam_record(line) line = input_stream.readline() read2=Sam_record(line) if read1.get_query_name() == read2.get_query_name(): if read1.is_second_read() and read2.is_first_read(): tmp = read1 read1=read2 read2=tmp read2.set_reference_name(read1.get_reference_name()) output_stream.write(str(read1)) output_stream.write(str(read2)) else: logging.critical("bam file is not sorted by read name") input_stream.close() output_stream.close() #os.remove(output_bam_file+'.bam') return line = input_stream.readline() return_code=process_input.wait()
# NOTE(review): collapsed fragment (newlines lost) -- interior of the main
# per-line loop of a duplicate-detection pass plus its post-loop flush; the
# enclosing `for` and function header are not visible, and the original
# indentation (in particular where the post-loop code dedents) cannot be
# recovered from this one line.  Logic as written: header lines are copied
# through; records are grouped per reference into first_reads/second_reads
# keyed by query name; on a reference change, find_duplicates() scores the
# completed group, counters are updated, the group is written out with
# output_reads(), and the dictionaries are reset.  After the loop the last
# group is flushed the same way and estimate_library_size() is computed from
# nb_fragment and total_nb_uniqs.  Progress is printed every 1000 references
# (Python 2 print statement).
# NOTE(review): in the final flush, `nb_dups = find_duplicates(...)` does
# not unpack the (nb_dups, nb_uniq) tuple that the in-loop branch unpacks,
# so `total_nb_dups += nb_dups` would add a tuple to an int, and
# total_nb_uniqs is never updated for the last reference -- confirm against
# find_duplicates()'s actual return value and fix in the full function.
if line.startswith("@"): output_stream.write(line) continue sam_record = Sam_record(line) if sam_record.get_reference_name()!=current_reference and not current_reference is None: #process this consensus if current_reference!='*': nb_dups, nb_uniq = find_duplicates(first_reads,second_reads, distance_threshold) total_nb_uniqs+=nb_uniq total_nb_dups+=nb_dups nb_fragment+=len(second_reads) output_reads(output_stream, first_reads, second_reads) first_reads={} second_reads={} if sam_record.is_second_read(): second_reads[sam_record.get_query_name()]=sam_record else: first_reads[sam_record.get_query_name()]=sam_record nb_reference+=1 if nb_reference%1000==0: print "process %s consensus"%nb_reference current_reference = sam_record.get_reference_name() if sam_record.get_reference_name()!=current_reference and not current_reference is None: #process this consensus if current_reference!='*': nb_dups = find_duplicates(first_reads,second_reads, distance_threshold) total_nb_dups+=nb_dups nb_fragment+=len(second_reads) output_reads(output_stream, first_reads, second_reads) library_size = estimate_library_size(nb_fragment, total_nb_uniqs)