def set_read1_consensus_to_read2(input_stream, output_stream):
    """Copy read1's reference onto its mate in a name-sorted SAM text stream.

    Header ("@") lines are passed through untouched.  Alignment records are
    expected with mates adjacent (name sorted): for each complete pair whose
    read1 is mapped, read2 is rewritten to map to read1's reference at
    position 1 with a full-length match CIGAR.  Reads whose mate never shows
    up are written out unchanged.

    :param input_stream: file-like object yielding SAM text lines
    :param output_stream: file-like object receiving the rewritten SAM lines
    """
    # Pass the header through untouched.
    line = input_stream.readline()
    while line.startswith("@"):
        output_stream.write(line)
        line = input_stream.readline()
    if not line:
        # BUG FIX: a header-only (or empty) stream used to crash on
        # Sam_record('') -- there is nothing to do in that case.
        return
    prev_read = Sam_record(line)
    for line in input_stream:
        read = Sam_record(line)
        if prev_read and read.get_query_name() == prev_read.get_query_name():
            # Orient the pair so read1/read2 agree with the mate flags.
            if read.is_second_read() and prev_read.is_first_read():
                read1, read2 = prev_read, read
            else:
                read1, read2 = read, prev_read
            if not read1.is_unmapped():
                # Re-anchor read2 on read1's reference as a full-length match.
                read2.set_reference_name(read1.get_reference_name())
                read2.set_unmapped_flag(False)
                read2.set_position(1)
                read2.set_cigar_string("%sM" % len(read2.get_query_sequence()))
            output_stream.write(str(read1))
            output_stream.write(str(read2))
            prev_read = None
        elif prev_read:
            # Previous read never found its mate: emit it as a singleton.
            output_stream.write(str(prev_read))
            prev_read = read
        else:
            prev_read = read
    # BUG FIX: the last record was silently dropped when it had no mate;
    # flush it, consistent with the in-loop singleton handling above.
    if prev_read:
        output_stream.write(str(prev_read))
def set_read1_consensus_to_read1_and_read2(input_stream, output_stream):
    """Transfer an assigned read's mapping onto its two unassigned companions.

    Header ("@") lines are passed through untouched.  Records are expected
    name sorted, three per query name; `test_read_for_assignation` labels each
    of the trio as 'first_assigned', 'first_unassigned' or 'second'.  When the
    'first_assigned' read is mapped, its reference and position are copied
    onto the other two (with full-length match CIGARs) and those two are
    written out; the 'first_assigned' read itself is not emitted.  Records
    that never complete a trio are written out unchanged with a warning.

    :param input_stream: file-like object yielding SAM text lines
    :param output_stream: file-like object receiving the rewritten SAM lines
    """
    # Pass the header through untouched.
    line = input_stream.readline()
    while line.startswith("@"):
        output_stream.write(line)
        line = input_stream.readline()
    if not line:
        # BUG FIX: header-only (or empty) input used to crash on Sam_record('').
        return
    n_1_read = Sam_record(line)
    line = input_stream.readline()
    if not line:
        # BUG FIX: a single trailing record used to crash; emit it instead.
        output_stream.write(str(n_1_read))
        return
    n_2_read = Sam_record(line)
    # We need three reads in a row to assign one to the others.
    for line in input_stream:
        read = Sam_record(line)
        if n_1_read and n_2_read and read.get_query_name() == n_1_read.get_query_name() and \
           read.get_query_name() == n_2_read.get_query_name():
            # All three reads of the trio are in hand; label each role.
            three_reads = {}
            three_reads[test_read_for_assignation(read)] = read
            three_reads[test_read_for_assignation(n_1_read)] = n_1_read
            three_reads[test_read_for_assignation(n_2_read)] = n_2_read
            if not three_reads['first_assigned'].is_unmapped():
                # Copy the assigned read's mapping onto both companions,
                # each as a full-length match at the assigned position.
                three_reads['first_unassigned'].set_reference_name(three_reads['first_assigned'].get_reference_name())
                three_reads['first_unassigned'].set_unmapped_flag(False)
                three_reads['first_unassigned'].set_position(three_reads['first_assigned'].get_position())
                three_reads['first_unassigned'].set_cigar_string("%sM" % len(three_reads['first_unassigned'].get_query_sequence()))
                three_reads['second'].set_reference_name(three_reads['first_assigned'].get_reference_name())
                three_reads['second'].set_unmapped_flag(False)
                three_reads['second'].set_position(three_reads['first_assigned'].get_position())
                three_reads['second'].set_cigar_string("%sM" % len(three_reads['second'].get_query_sequence()))
            output_stream.write(str(three_reads['first_unassigned']))
            output_stream.write(str(three_reads['second']))
            n_1_read = None
            n_2_read = None
        elif n_1_read and n_2_read:
            # Oldest buffered read cannot complete a trio: emit it as-is.
            logging.warning('Missing pair for singleton %s: is this file sorted.' % (n_2_read.get_query_name()))
            output_stream.write(str(n_2_read))
            n_2_read = n_1_read
            n_1_read = read
        elif n_1_read:
            n_2_read = n_1_read
            n_1_read = read
        else:
            n_1_read = read
    # BUG FIX: reads still buffered at EOF were silently dropped; flush them
    # oldest first, consistent with the in-loop singleton handling.
    if n_2_read:
        logging.warning('Missing pair for singleton %s: is this file sorted.' % (n_2_read.get_query_name()))
        output_stream.write(str(n_2_read))
    if n_1_read:
        logging.warning('Missing pair for singleton %s: is this file sorted.' % (n_1_read.get_query_name()))
        output_stream.write(str(n_1_read))
def load_from_sites_generator(stream):
    """Pair up first/second reads by query name, yielding (read1, read2).

    Records arrive in arbitrary order; each is held in a per-mate pending
    dictionary until its partner shows up.  Every 10000 input lines a
    progress line (line count plus both pending-dictionary sizes) goes to
    stderr.  Records whose mate never arrives are simply retained and never
    yielded.
    """
    pending_first = {}
    pending_second = {}
    for line_number, line in enumerate(stream, 1):
        if line_number % 10000 == 0:
            sys.stderr.write('%s %s %s\n' % (line_number, len(pending_first), len(pending_second)))
        record = Sam_record(line)
        name = record.get_query_name()
        if record.is_first_read():
            mate = pending_second.pop(name, None)
            if mate:
                yield ((record, mate))
            else:
                pending_first[name] = record
        else:
            mate = pending_first.pop(name, None)
            if mate:
                yield ((mate, record))
            else:
                pending_second[name] = record
# NOTE(review): fragment of a larger per-record loop over a reference-sorted
# SAM stream; `line`, `current_reference`, `first_reads`, `second_reads`,
# `distance_threshold` and the counters are defined outside this excerpt.
sam_record = Sam_record(line)
# A reference change means every read of the previous consensus has been
# seen: run duplicate detection on the buffered reads, then reset buffers.
if sam_record.get_reference_name( ) != current_reference and not current_reference is None:
    #process this consensus
    if current_reference != '*':
        nb_dups, nb_uniq = find_duplicates(first_reads, second_reads, distance_threshold)
        total_nb_uniqs += nb_uniq
        total_nb_dups += nb_dups
        nb_fragment += len(second_reads)
        output_reads(output_stream, first_reads, second_reads)
    first_reads = {}
    second_reads = {}
# Buffer the record by mate, keyed on query name, until its reference is done.
if sam_record.is_second_read():
    second_reads[sam_record.get_query_name()] = sam_record
else:
    first_reads[sam_record.get_query_name()] = sam_record
nb_reference += 1
if nb_reference % 1000 == 0:
    print "process %s consensus" % nb_reference
current_reference = sam_record.get_reference_name()
# NOTE(review): the block below looks like the post-loop flush of the final
# consensus, but its guard compares against the reference name that was just
# assigned above, so it can never be true at this point -- the last consensus
# would be skipped.  Also `find_duplicates` returns two values above but is
# bound to a single name here, making `nb_dups` a tuple and the following
# `+=` a TypeError.  Confirm against the full function before relying on it.
if sam_record.get_reference_name( ) != current_reference and not current_reference is None:
    #process this consensus
    if current_reference != '*':
        nb_dups = find_duplicates(first_reads, second_reads, distance_threshold)
        total_nb_dups += nb_dups
        nb_fragment += len(second_reads)
        output_reads(output_stream, first_reads, second_reads)
# NOTE(review): fragment -- `command`, `samtools_bin`, `output_bam_file` and
# `utils_commands` are defined before this excerpt, and the `return` below
# implies this is the tail of a function body.
input_stream,process_input = utils_commands.get_output_stream_from_command(command)
# Re-compress the rewritten SAM and coordinate-sort it via a samtools pipe.
command ="%s view -bS - | %s sort - %s"%(samtools_bin, samtools_bin, output_bam_file)
logging.info(command)
output_stream,process_output= utils_commands.get_input_stream_from_command(command)
#get the header: copy "@"-prefixed lines straight through
line = input_stream.readline()
while line.startswith("@"):
    output_stream.write(line)
    line = input_stream.readline()
# Records are expected two-by-two (name sorted): a read then its mate.
while line:
    read1=Sam_record(line)
    # NOTE(review): no EOF check between the two readline() calls -- an odd
    # number of records would feed an empty string to Sam_record; confirm the
    # input is guaranteed to hold complete pairs.
    line = input_stream.readline()
    read2=Sam_record(line)
    if read1.get_query_name() == read2.get_query_name():
        # Swap so read1 really is the first mate before copying its reference.
        if read1.is_second_read() and read2.is_first_read():
            tmp = read1
            read1=read2
            read2=tmp
        read2.set_reference_name(read1.get_reference_name())
        output_stream.write(str(read1))
        output_stream.write(str(read2))
    else:
        # Adjacent records with different names: the name-sorted precondition
        # is violated; abort and leave the partial output behind.
        logging.critical("bam file is not sorted by read name")
        input_stream.close()
        output_stream.close()
        #os.remove(output_bam_file+'.bam')
        return
    line = input_stream.readline()
# NOTE(review): fragment of an enclosing per-line loop -- the first two
# statements belong to its header ("@" line) branch: pass the header line
# through and skip to the next input line.
output_stream.write(line)
continue
sam_record = Sam_record(line)
# A reference change means every read of the previous consensus has been
# seen: run duplicate detection on the buffered reads, then reset buffers.
if sam_record.get_reference_name()!=current_reference and not current_reference is None:
    #process this consensus
    if current_reference!='*':
        nb_dups, nb_uniq = find_duplicates(first_reads,second_reads, distance_threshold)
        total_nb_uniqs+=nb_uniq
        total_nb_dups+=nb_dups
        nb_fragment+=len(second_reads)
        output_reads(output_stream, first_reads, second_reads)
    first_reads={}
    second_reads={}
# Buffer the record by mate, keyed on query name, until its reference is done.
if sam_record.is_second_read():
    second_reads[sam_record.get_query_name()]=sam_record
else:
    first_reads[sam_record.get_query_name()]=sam_record
nb_reference+=1
if nb_reference%1000==0:
    print "process %s consensus"%nb_reference
current_reference = sam_record.get_reference_name()
# NOTE(review): the block below looks like the post-loop flush of the final
# consensus, but its guard compares against the reference name that was just
# assigned above, so it can never be true at this point -- the last consensus
# would be skipped.  Also `find_duplicates` returns two values above but is
# bound to a single name here, making `nb_dups` a tuple and the following
# `+=` a TypeError.  Confirm against the full function before relying on it.
if sam_record.get_reference_name()!=current_reference and not current_reference is None:
    #process this consensus
    if current_reference!='*':
        nb_dups = find_duplicates(first_reads,second_reads, distance_threshold)
        total_nb_dups+=nb_dups
        nb_fragment+=len(second_reads)
        output_reads(output_stream, first_reads, second_reads)
# Final reporting: estimate library size from fragment/unique counts.
library_size = estimate_library_size(nb_fragment, total_nb_uniqs)
print "%s fragments"%(nb_fragment)