def __init__(self, alignment, min_intron_size=68):
    """Index an alignment into per-base homopolymer-alignment (HPA) records.

    Splits the alignment strings into homopolymer groups via
    ``_misalign_split`` and builds one lookup entry per query base and per
    target base, each carrying its group plus the neighboring groups.

    :param alignment: alignment object providing get_alignment_strings(),
                      get_query_quality() and get_strand()
    :param min_intron_size: gaps at least this long are treated as introns
    """
    self._min_intron_size = min_intron_size
    self._aligned_query = None
    self._hpas = []
    self._has_quality = False  # set True below when the alignment has quality
    self._alignment = alignment
    self._quality_distro = None  # gets set by analyze_quality
    self._deletion_type = None
    self._query_errors = None
    self._target_errors = None
    self._context_query_errors = None
    self._context_target_errors = None
    astrings = self._alignment.get_alignment_strings(
        min_intron_size=self._min_intron_size)
    if self._alignment.get_query_quality():
        self._has_quality = True
    if len(astrings) == 0:
        return  # nothing aligned; leave every index empty
    alns = []
    for i in range(len(astrings[0])):
        if self._alignment.get_strand() == '+':
            alns.append({'query': astrings[0][i],
                         'target': astrings[1][i],
                         'query_quality': astrings[2][i]})
        else:
            # minus strand: reverse-complement and prepend to reverse order
            alns.insert(0, {'query': rc(astrings[0][i]),
                            'target': rc(astrings[1][i]),
                            'query_quality': astrings[2][i][::-1]})
    # split alignment into homopolymer groups
    self._hpas = self._misalign_split(alns)
    # Per-base indexes; the duplicated loops (and their unused qi/ti
    # counters) were replaced by a single helper.
    self._query_hpas = self._index_hpas('get_query')
    self._target_hpas = self._index_hpas('get_target')
    self._target_errors = self.get_target_errors()
    self._query_errors = self.get_query_errors()
    self._context_target_errors = self.get_context_target_errors()

def _index_hpas(self, accessor):
    """Build one {'hpa','pos','prev-hpa','next-hpa'} record per base.

    :param accessor: name of the HPA method giving the sequence to index
                     ('get_query' or 'get_target')
    """
    index = []
    for i, hpa in enumerate(self._hpas):
        prev = self._hpas[i - 1] if i > 0 else None
        foll = self._hpas[i + 1] if i + 1 < len(self._hpas) else None
        for j in range(len(getattr(hpa, accessor)())):
            index.append({'hpa': hpa, 'pos': j,
                          'prev-hpa': prev, 'next-hpa': foll})
    return index
def get_SAM(self, min_intron_size=68):
    """Render this alignment as a single SAM record.

    :param min_intron_size: passed through to construct_cigar
    :return: SAM object built from the tab-joined mandatory fields
    """
    from Bio.Format.Sam import SAM
    # alignment ranges are (target, query) pairs
    target0, query0 = self.get_alignment_ranges()[0]
    on_minus = self.get_strand() == '-'
    seq = self.get_query_sequence()
    if not seq:
        seq = '*'
    qual = self.get_query_quality()
    if not qual:
        qual = '*'
    if on_minus:
        # SAM stores the sequence in reference orientation
        seq = rc(seq)
        qual = qual[::-1]
    fields = [
        query0.chr,                              # QNAME
        str(16 if on_minus else 0),              # FLAG
        target0.chr,                             # RNAME
        str(target0.start),                      # POS
        str(255),                                # MAPQ (unavailable)
        self.construct_cigar(min_intron_size),   # CIGAR
        '*',                                     # RNEXT
        str(0),                                  # PNEXT
        str(self.get_target_range().length()),   # TLEN
        seq,                                     # SEQ
        qual,                                    # QUAL
    ]
    return SAM("\t".join(fields), reference=self._reference)
def get_SAM(self, min_intron_size=68):
    """Convert this alignment into a SAM entry.

    :param min_intron_size: minimum gap treated as an intron by the CIGAR
    :return: SAM object for this alignment
    """
    from Bio.Format.Sam import SAM
    # first alignment range holds (target, query)
    first_target = self.get_alignment_ranges()[0][0]
    first_query = self.get_alignment_ranges()[0][1]
    is_reverse = self.get_strand() == '-'
    flag = 16 if is_reverse else 0
    cigar = self.construct_cigar(min_intron_size)
    tlen = self.get_target_range().length()
    seq = self.get_query_sequence() or '*'
    qual = self.get_query_quality() or '*'
    if is_reverse:
        # flip back to reference orientation for SAM output
        seq = rc(seq)
        qual = qual[::-1]
    parts = (first_query.chr, str(flag), first_target.chr,
             str(first_target.start), str(255), cigar,
             '*', str(0), str(tlen), seq, qual)
    return SAM("\t".join(parts), reference=self._reference)
def random_flip(sequence, rnum=None):
    """Return *sequence* or its reverse complement with equal probability.

    :param sequence: nucleotide string
    :param rnum: optional RandomSource; a fresh one is made when absent
    """
    source = rnum if rnum else RandomSource()
    return rc(sequence) if source.random() < 0.5 else sequence
def get_alignment_strings(self, min_intron_size=68):
    """Build gapped per-exon alignment strings.

    Short gaps (< min_intron_size on the target) are padded inline with
    '-' characters; intron-sized gaps close the current segment and start
    a new one.

    :param min_intron_size: target gap length treated as an intron
    :return: [query_segments, target_segments, query_quality_segments]
    """
    qseq = self.get_query_sequence()
    if not qseq:
        # fixed: message typo, and removed the unreachable second sys.exit()
        sys.exit(
            "ERROR: Query sequence must be accessible to get alignment strings\n"
        )
    ref = self.get_reference()
    qual = self.get_query_quality()
    if not qual:
        qual = 'I' * len(qseq)  # placeholder quality
    if self.get_strand() == '-':
        qseq = rc(qseq)
        qual = qual[::-1]
    ranges = self.get_alignment_ranges()  # hoisted: was re-fetched per loop
    tarr = []
    qarr = []
    yarr = []
    tdone = ''
    qdone = ''
    ydone = ''  # query quality accumulator
    for i in range(len(ranges)):
        [t, q] = ranges[i]
        textra = ''
        qextra = ''
        yextra = ''
        if i >= 1:
            dift = t.start - ranges[i - 1][0].end - 1
            difq = q.start - ranges[i - 1][1].end - 1
            if dift < min_intron_size:
                if dift > 0:
                    # small target gap: deletion from the query
                    textra = ref[t.chr][t.start - dift - 1:t.start - 1].upper()
                    qextra = '-' * dift
                    yextra = '\0' * dift
                elif difq > 0:
                    # query gap: insertion relative to the target
                    textra = '-' * difq
                    qextra = qseq[q.start - difq - 1:q.start - 1].upper()
                    yextra = qual[q.start - difq - 1:q.start - 1]
            else:
                # intron: close out the current segment
                tarr.append(tdone)
                qarr.append(qdone)
                yarr.append(ydone)
                tdone = ''
                qdone = ''
                ydone = ''
        tdone += textra + ref[t.chr][t.start - 1:t.end].upper()
        qdone += qextra + qseq[q.start - 1:q.end].upper()
        ydone += yextra + qual[q.start - 1:q.end]
    if len(tdone) > 0:
        tarr.append(tdone)
        qarr.append(qdone)
        yarr.append(ydone)
    if self.get_query_quality() == '*':
        # '*' means no quality; blank out the placeholder characters
        yarr = [x.replace('I', ' ') for x in yarr]
    #query, target, query_quality
    return [qarr, tarr, yarr]
def set_sequence(self, ref_dict):
    """Assemble this transcript's sequence from a reference dict.

    Concatenates exon slices from ref_dict, reverse-complements on the
    minus strand, and stores the upper-cased result on self._sequence.

    :param ref_dict: mapping of chromosome name -> sequence string
    """
    if not self._direction:
        sys.stderr.write("WARNING: no strand information for the transcript\n")
    strand = self._direction if self._direction else '+'
    chrom = self.get_chrom()
    pieces = [ref_dict[chrom][rng.start - 1:rng.end]
              for rng in (exon.get_range() for exon in self.exons)]
    assembled = ''.join(pieces)
    if strand == '-':
        assembled = rc(assembled)
    self._sequence = assembled.upper()
def get_alignment_strings(self, min_intron_size=68):
    """Return gapped alignment strings as [query, target, query_quality].

    Gaps shorter than min_intron_size on the target are represented
    inline with '-' padding; larger gaps split the alignment into
    separate segments.

    :param min_intron_size: target gap length treated as an intron
    """
    qseq = self.get_query_sequence()
    if not qseq:
        # fixed: corrected message spelling and dropped the dead second
        # sys.exit() call that could never execute
        sys.exit(
            "ERROR: Query sequence must be accessible to get alignment strings\n"
        )
    ref = self.get_reference()
    qual = self.get_query_quality()
    if not qual:
        qual = 'I' * len(qseq)  # for a placeholder quality
    if self.get_strand() == '-':
        qseq = rc(qseq)
        qual = qual[::-1]
    ranges = self.get_alignment_ranges()  # hoisted loop-invariant call
    tarr = []
    qarr = []
    yarr = []
    tdone = ''
    qdone = ''
    ydone = ''  # query quality
    for i in range(len(ranges)):
        [t, q] = ranges[i]
        textra = ''
        qextra = ''
        yextra = ''
        if i >= 1:
            dift = t.start - ranges[i - 1][0].end - 1
            difq = q.start - ranges[i - 1][1].end - 1
            if dift < min_intron_size:
                if dift > 0:
                    # short target gap: pad the query with '-'
                    textra = ref[t.chr][t.start - dift - 1:t.start - 1].upper()
                    qextra = '-' * dift
                    yextra = '\0' * dift
                elif difq > 0:
                    # query-side gap: pad the target with '-'
                    textra = '-' * difq
                    qextra = qseq[q.start - difq - 1:q.start - 1].upper()
                    yextra = qual[q.start - difq - 1:q.start - 1]
            else:
                # intron-sized gap: finish this segment, start another
                tarr.append(tdone)
                qarr.append(qdone)
                yarr.append(ydone)
                tdone = ''
                qdone = ''
                ydone = ''
        tdone += textra + ref[t.chr][t.start - 1:t.end].upper()
        qdone += qextra + qseq[q.start - 1:q.end].upper()
        ydone += yextra + qual[q.start - 1:q.end]
    if len(tdone) > 0:
        tarr.append(tdone)
        qarr.append(qdone)
        yarr.append(ydone)
    if self.get_query_quality() == '*':
        # no real quality available; blank the placeholders
        yarr = [x.replace('I', ' ') for x in yarr]
    #query, target, query_quality
    return [qarr, tarr, yarr]
def set_sequence(self, ref_dict):
    """Set self._sequence from exon ranges and a reference dictionary.

    Calls self._initialize() first, warns when strand is unknown
    (defaulting to '+'), joins the exon slices, reverse-complements on
    the minus strand, and upper-cases the result.

    :param ref_dict: mapping of chromosome name -> sequence string
    """
    self._initialize()
    if not self._direction:
        sys.stderr.write(
            "WARNING: no strand information for the transcript\n")
    strand = self._direction if self._direction else '+'
    chrom = self.get_chrom()
    assembled = ''.join(
        ref_dict[chrom][rng.start - 1:rng.end]
        for rng in (exon.get_range() for exon in self.exons))
    if strand == '-':
        assembled = rc(assembled)
    self._sequence = assembled.upper()
def main(args):
    """Simulate sequencing reads from a serialized transcriptome emitter.

    Loads the emitter payload (stdin or args.emitter), repeatedly draws
    transcripts, optionally trims/flips/fragments them, assigns
    qualities, injects errors, and writes FASTQ records (two files for
    paired-end short reads).  Side effects: writes args.output and the
    optional origin / sequence-change logs, removes args.tempdir.

    Fixes vs. the original: sys.stderr.wrtie typo; args.ouptput typo;
    second output guarded by args.output[0] instead of args.output[1];
    paired-end deletion/substitution branches mistakenly calling
    random_insertion; stage4right logged before it was assigned.
    """
    # check outputs
    if len(args.output) > 1 and not args.sr:
        sys.stderr.write(
            "Error: Long reads don't support multiple output files\n")
        sys.exit()
    elif len(args.output) > 2:
        # was sys.stderr.wrtie -> AttributeError whenever this branch fired
        sys.stderr.write(
            "Error: Short reads support at most two output files (paired end)\n"
        )
        sys.exit()
    if args.sr_length < args.minimum_read_length:
        args.minimum_read_length = args.sr_length
    inf = sys.stdin
    if args.emitter != '-':
        inf = open(args.emitter)
    sys.stderr.write("reading in transcriptome emitter\n")
    # NOTE(review): pickle on a decoded payload -- only safe for trusted input
    indata = pickle.loads(
        zlib.decompress(base64.b64decode(inf.read().rstrip())))
    txome = Transcriptome()
    txome.load_serialized(indata['txome'])
    rnum = RandomSource()
    rnum_tx = RandomSource()  # for drawing transcripts
    if args.seed:
        rnum = RandomSource(args.seed)
        rnum_tx = RandomSource(args.seed)
    # Load in error profile data
    ep = None
    if args.error_profile:
        sys.stderr.write("read in error profile\n")
        ep = ErrorProfilePermuter(args.error_profile, rnum,
                                  args.skew_profile_error_rate)
    txemitter = TranscriptomeEmitter(txome, rand=rnum_tx)
    if indata['weight_type'] == 'expression_table':
        sys.stderr.write(
            "Using expression table defined transcript expression\n")
        txweight = indata['weights']
        txemitter.set_weights_by_dict(txweight)
    elif indata['weight_type'] == 'exponential_distribution':
        sys.stderr.write(
            "ERROR not yet implemented exponential distribution\n")
        sys.exit()
    elif indata['weight_type'] == 'uniform_distribution':
        sys.stderr.write(
            "Using uniform distribution of transcript expression\n")
    cutter = MakeCuts(rand=rnum_tx)
    if args.sr:
        cutter.set_custom(args.sr_gauss_min, args.sr_gauss_mu,
                          args.sr_gauss_sigma)
    elif args.lr:
        cutter.set_custom(args.lr_gauss_min, args.lr_gauss_mu,
                          args.lr_gauss_sigma)
    # Prepare outputs
    of1 = sys.stdout
    if args.output[0][-3:] == '.gz':
        of1 = gzip.open(args.output[0], 'w')
    elif args.output[0] != '-':
        of1 = open(args.output[0], 'w')
    of2 = None
    if len(args.output) > 1:
        if args.output[1][-3:] == '.gz':
            of2 = gzip.open(args.output[1], 'w')
        elif args.output[1] != '-':
            # was: tested args.output[0] and opened args.ouptput[1] (typo)
            of2 = open(args.output[1], 'w')
    of_origin = None
    if args.output_original_source:
        if args.output_original_source[-3:] == '.gz':
            of_origin = gzip.open(args.output_original_source, 'w')
        else:
            of_origin = open(args.output_original_source, 'w')
    of_sc = None
    if args.output_sequence_change:
        if args.output_sequence_change[-3:] == '.gz':
            of_sc = gzip.open(args.output_sequence_change, 'w')
        else:
            of_sc = open(args.output_sequence_change, 'w')
    absmax = args.count * 100  # safety cap on total draw attempts
    finished_count = 0
    z = 0
    while finished_count < args.count:
        z += 1
        if z > absmax:
            break
        tx = txemitter.emit_transcript()
        seq = tx.get_sequence()
        stage1seq = seq
        if args.trim_5prime or args.trim_3prime:
            fivestart = 0
            threeend = len(seq)
            if args.trim_5prime:
                lcut = int(args.trim_5prime[0] * len(seq))
                rcut = int(args.trim_5prime[1] * len(seq))
                fivestart = rnum_tx.randint(lcut, rcut)
            if args.trim_3prime:
                # NOTE(review): bounds are fractions of the full length,
                # presumably near 1 for a 3' trim -- confirm against caller
                lcut = int(args.trim_3prime[0] * len(seq))
                rcut = int(args.trim_3prime[1] * len(seq))
                threeend = rnum_tx.randint(lcut, rcut)
            # set sequence to its new trimmed bounds
            seq = seq[fivestart:threeend]
        # flip sequence if necessary
        if not args.no_flip:
            seq = random_flip(seq, rnum_tx)
        l_read = create_name(rnum)
        r_read = None
        if args.sr or args.lr:
            cutseq = cutter.get_cut(seq)
        else:
            cutseq = seq  # case for no_fragmentation
        # if we pass this we will really start with this one
        if len(cutseq) < args.minimum_read_length:
            continue
        # can now log our read name
        if of_origin:
            of_origin.write(l_read + "\t" + tx.get_gene_name() + "\t" +
                            tx.get_transcript_name() + "\n")
        stage2seq = cutseq
        r = None
        if args.sr:
            # paired end: left from the 5' end, right is the RC of the 3' end
            r_read = l_read
            l = cutseq[0:args.sr_length]
            r = rc(cutseq[-1 * args.sr_length:])
        else:
            l = cutseq
        stage3left = l
        stage3right = r if r else ''
        # l (or l and r) contains the sequence prior to errors being added
        l_qual = 'I' * len(l)
        r_qual = 'I' * len(r) if r else None
        if args.fixed_quality:
            if len(args.fixed_quality) != 1:
                sys.stderr.write("ERROR fixed quality should be 1 character\n")
                sys.exit()
            l_qual = args.fixed_quality * len(l)
            if r:
                r_qual = args.fixed_quality * len(r)
        elif args.quality_from_error_rate:
            # Phred-encode the requested error rate
            qchar = chr(
                int(-10 * math.log10(args.quality_from_error_rate)) + 33)
            l_qual = qchar * len(l)
            if r:
                r_qual = qchar * len(r)
        else:
            # default is generate quality from profile
            if not ep:
                sys.stderr.write(
                    "ERROR: cannot generate quality from a profile. Set error profile or choose quality from error rate or fixed quality\n"
                )
                sys.exit()
            l_qual = ep.emit_qual(len(l))
            if r:
                r_qual = ep.emit_qual(len(r))
        # Now prior to errors l_qual and r_qual contain our qualities
        l_fastq = Fastq([l_read, l, '+', l_qual])
        r_fastq = Fastq([r_read, r, '+', r_qual]) if r else None
        # Permute sequences by a specific error rate
        if args.specific_errors:
            rate = args.specific_errors
            me = MakeErrors(rand=rnum)
            if args.specific_before_context:
                me.set_before_context(args.specific_before_context)
            if args.specific_after_context:
                me.set_after_context(args.specific_after_context)
            if args.specific_reference_base:
                if args.specific_reference_base != '-':
                    me.set_observed_base(args.specific_reference_base)
            if args.specific_modified_base:
                if args.specific_modified_base != '-':
                    me.set_modified_base(args.specific_modified_base)
            if args.specific_reference_base == '-':
                # doing insertions
                l_fastq = me.random_insertion(l_fastq, rate)
                if r_fastq:
                    r_fastq = me.random_insertion(r_fastq, rate)
            elif args.specific_modified_base == '-':
                # doing deletions (was random_insertion for the right read)
                l_fastq = me.random_deletion(l_fastq, rate)
                if r_fastq:
                    r_fastq = me.random_deletion(r_fastq, rate)
            else:
                # substitutions (was random_insertion for the right read)
                l_fastq = me.random_substitution(l_fastq, rate)
                if r_fastq:
                    r_fastq = me.random_substitution(r_fastq, rate)
        elif args.uniform_any_error:
            l_fastq = do_uniform_any(l_fastq, rnum, args.uniform_any_error)
            if r_fastq:
                r_fastq = do_uniform_any(r_fastq, rnum,
                                         args.uniform_any_error)
        elif args.uniform_mismatch_error:
            l_fastq = do_uniform_mismatch(l_fastq, rnum,
                                          args.uniform_mismatch_error)
            if r_fastq:
                r_fastq = do_uniform_mismatch(r_fastq, rnum,
                                              args.uniform_mismatch_error)
        elif args.any_error_by_quality:
            l_fastq = do_quality_any(l_fastq, rnum)
            if r_fastq:
                r_fastq = do_quality_any(r_fastq, rnum)
        elif args.mismatch_error_by_quality:
            l_fastq = do_quality_mismatch(l_fastq, rnum)
            if r_fastq:
                r_fastq = do_quality_mismatch(r_fastq, rnum)
        elif args.profile_context_error:
            l_fastq = ep.permute_context(l_fastq)
            if r_fastq:
                r_fastq = ep.permute_context(r_fastq)
        elif args.profile_general_error:
            l_fastq = ep.permute_general(l_fastq)
            if r_fastq:
                r_fastq = ep.permute_general(r_fastq)
        # if SR grow/shrink to appropriate length
        if args.sr and len(l_fastq) != args.sr_length:
            l_fastq = fit_length(l_fastq, args.sr_length, rnum)
        if r:
            if args.sr and len(r_fastq) != args.sr_length:
                r_fastq = fit_length(r_fastq, args.sr_length, rnum)
        of1.write(l_fastq.fastq())
        if of2:
            of2.write(r_fastq.fastq())
        stage4left = l_fastq.seq
        # assign before logging (was assigned after, so the log was always '')
        stage4right = r_fastq.seq if r_fastq else ''
        if of_sc:
            of_sc.write(l_fastq.name+"\t"+tx.get_gene_name()+"\t"+tx.get_transcript_name()+"\t" \
                + stage1seq+"\t"+stage2seq+"\t"+stage3left+"\t"+stage3right+"\t"+stage4left+"\t"+stage4right+"\n")
        finished_count += 1
        if finished_count % 1000 == 0:
            sys.stderr.write(
                str(finished_count) + '/' + str(args.count) + " \r")
    sys.stderr.write("\n")
    of1.close()
    if of2:
        of2.close()
    if of_origin:
        of_origin.close()
    if of_sc:
        of_sc.close()
    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)
def get_query_sequence(self):
    """Return the query sequence in original-read orientation.

    Returns None when the stored sequence is '*' (absent); reverse
    complements when the 0x10 (reverse strand) flag is set.
    """
    seq = self.value('seq')
    if seq == '*':
        return None
    return rc(seq) if self.check_flag(0x10) else seq
def main(args):
    """Generate simulated reads from a serialized transcriptome emitter.

    Pipeline per read: draw transcript -> optional 5'/3' trim -> optional
    random flip -> optional fragmentation -> quality assignment -> error
    injection -> length fitting -> FASTQ output (paired for short reads).

    Bug fixes: sys.stderr.wrtie and args.ouptput typos; wrong index
    (args.output[0]) guarding the second output file; deletion and
    substitution branches calling random_insertion on the right read;
    stage4right written to the change log before being assigned.
    """
    # check outputs
    if len(args.output) > 1 and not args.sr:
        sys.stderr.write(
            "Error: Long reads don't support multiple output files\n")
        sys.exit()
    elif len(args.output) > 2:
        # fixed: was sys.stderr.wrtie (AttributeError)
        sys.stderr.write(
            "Error: Short reads support at most two output files (paired end)\n")
        sys.exit()
    if args.sr_length < args.minimum_read_length:
        args.minimum_read_length = args.sr_length
    inf = sys.stdin
    if args.emitter != '-':
        inf = open(args.emitter)
    sys.stderr.write("reading in transcriptome emitter\n")
    # NOTE(review): unpickling decoded input -- trusted data only
    indata = pickle.loads(
        zlib.decompress(base64.b64decode(inf.read().rstrip())))
    txome = Transcriptome()
    txome.load_serialized(indata['txome'])
    rnum = RandomSource()
    rnum_tx = RandomSource()  # for drawing transcripts
    if args.seed:
        rnum = RandomSource(args.seed)
        rnum_tx = RandomSource(args.seed)
    # Load in error profile data
    ep = None
    if args.error_profile:
        sys.stderr.write("read in error profile\n")
        ep = ErrorProfilePermuter(args.error_profile, rnum,
                                  args.skew_profile_error_rate)
    txemitter = TranscriptomeEmitter(txome, rand=rnum_tx)
    if indata['weight_type'] == 'expression_table':
        sys.stderr.write(
            "Using expression table defined transcript expression\n")
        txweight = indata['weights']
        txemitter.set_weights_by_dict(txweight)
    elif indata['weight_type'] == 'exponential_distribution':
        sys.stderr.write(
            "ERROR not yet implemented exponential distribution\n")
        sys.exit()
    elif indata['weight_type'] == 'uniform_distribution':
        sys.stderr.write(
            "Using uniform distribution of transcript expression\n")
    cutter = MakeCuts(rand=rnum_tx)
    if args.sr:
        cutter.set_custom(args.sr_gauss_min, args.sr_gauss_mu,
                          args.sr_gauss_sigma)
    elif args.lr:
        cutter.set_custom(args.lr_gauss_min, args.lr_gauss_mu,
                          args.lr_gauss_sigma)
    # Prepare outputs
    of1 = sys.stdout
    if args.output[0][-3:] == '.gz':
        of1 = gzip.open(args.output[0], 'w')
    elif args.output[0] != '-':
        of1 = open(args.output[0], 'w')
    of2 = None
    if len(args.output) > 1:
        if args.output[1][-3:] == '.gz':
            of2 = gzip.open(args.output[1], 'w')
        elif args.output[1] != '-':
            # fixed: guarded on output[0] and opened args.ouptput (typo)
            of2 = open(args.output[1], 'w')
    of_origin = None
    if args.output_original_source:
        if args.output_original_source[-3:] == '.gz':
            of_origin = gzip.open(args.output_original_source, 'w')
        else:
            of_origin = open(args.output_original_source, 'w')
    of_sc = None
    if args.output_sequence_change:
        if args.output_sequence_change[-3:] == '.gz':
            of_sc = gzip.open(args.output_sequence_change, 'w')
        else:
            of_sc = open(args.output_sequence_change, 'w')
    absmax = args.count * 100  # give up after 100x attempts
    finished_count = 0
    z = 0
    while finished_count < args.count:
        z += 1
        if z > absmax:
            break
        tx = txemitter.emit_transcript()
        seq = tx.get_sequence()
        stage1seq = seq
        if args.trim_5prime or args.trim_3prime:
            fivestart = 0
            threeend = len(seq)
            if args.trim_5prime:
                lcut = int(args.trim_5prime[0] * len(seq))
                rcut = int(args.trim_5prime[1] * len(seq))
                fivestart = rnum_tx.randint(lcut, rcut)
            if args.trim_3prime:
                # NOTE(review): both cut points are fractions of the whole
                # length; presumably the 3' range is near 1 -- confirm
                lcut = int(args.trim_3prime[0] * len(seq))
                rcut = int(args.trim_3prime[1] * len(seq))
                threeend = rnum_tx.randint(lcut, rcut)
            # set sequence to its new trimmed bounds
            seq = seq[fivestart:threeend]
        # flip sequence if necessary
        if not args.no_flip:
            seq = random_flip(seq, rnum_tx)
        l_read = create_name(rnum)
        r_read = None
        if args.sr or args.lr:
            cutseq = cutter.get_cut(seq)
        else:
            cutseq = seq  # case for no_fragmentation
        # if we pass this we will really start with this one
        if len(cutseq) < args.minimum_read_length:
            continue
        # can now log our read name
        if of_origin:
            of_origin.write(l_read + "\t" + tx.get_gene_name() + "\t" +
                            tx.get_transcript_name() + "\n")
        stage2seq = cutseq
        r = None
        if args.sr:
            r_read = l_read
            l = cutseq[0:args.sr_length]
            r = rc(cutseq[-1 * args.sr_length:])
        else:
            l = cutseq
        stage3left = l
        stage3right = r if r else ''
        # l (or l and r) contains the sequence prior to errors being added
        l_qual = 'I' * len(l)
        r_qual = 'I' * len(r) if r else None
        if args.fixed_quality:
            if len(args.fixed_quality) != 1:
                sys.stderr.write("ERROR fixed quality should be 1 character\n")
                sys.exit()
            l_qual = args.fixed_quality * len(l)
            if r:
                r_qual = args.fixed_quality * len(r)
        elif args.quality_from_error_rate:
            # convert the error rate to a single Phred+33 character
            qchar = chr(
                int(-10 * math.log10(args.quality_from_error_rate)) + 33)
            l_qual = qchar * len(l)
            if r:
                r_qual = qchar * len(r)
        else:
            # default is generate quality from profile
            if not ep:
                sys.stderr.write(
                    "ERROR: cannot generate quality from a profile. Set error profile or choose quality from error rate or fixed quality\n")
                sys.exit()
            l_qual = ep.emit_qual(len(l))
            if r:
                r_qual = ep.emit_qual(len(r))
        # Now prior to errors l_qual and r_qual contain our qualities
        l_fastq = Fastq([l_read, l, '+', l_qual])
        r_fastq = Fastq([r_read, r, '+', r_qual]) if r else None
        # Permute sequences by a specific error rate
        if args.specific_errors:
            rate = args.specific_errors
            me = MakeErrors(rand=rnum)
            if args.specific_before_context:
                me.set_before_context(args.specific_before_context)
            if args.specific_after_context:
                me.set_after_context(args.specific_after_context)
            if args.specific_reference_base:
                if args.specific_reference_base != '-':
                    me.set_observed_base(args.specific_reference_base)
            if args.specific_modified_base:
                if args.specific_modified_base != '-':
                    me.set_modified_base(args.specific_modified_base)
            if args.specific_reference_base == '-':
                # doing insertions
                l_fastq = me.random_insertion(l_fastq, rate)
                if r_fastq:
                    r_fastq = me.random_insertion(r_fastq, rate)
            elif args.specific_modified_base == '-':
                # doing deletions (right read previously got insertions)
                l_fastq = me.random_deletion(l_fastq, rate)
                if r_fastq:
                    r_fastq = me.random_deletion(r_fastq, rate)
            else:
                # substitutions (right read previously got insertions)
                l_fastq = me.random_substitution(l_fastq, rate)
                if r_fastq:
                    r_fastq = me.random_substitution(r_fastq, rate)
        elif args.uniform_any_error:
            l_fastq = do_uniform_any(l_fastq, rnum, args.uniform_any_error)
            if r_fastq:
                r_fastq = do_uniform_any(r_fastq, rnum,
                                         args.uniform_any_error)
        elif args.uniform_mismatch_error:
            l_fastq = do_uniform_mismatch(l_fastq, rnum,
                                          args.uniform_mismatch_error)
            if r_fastq:
                r_fastq = do_uniform_mismatch(r_fastq, rnum,
                                              args.uniform_mismatch_error)
        elif args.any_error_by_quality:
            l_fastq = do_quality_any(l_fastq, rnum)
            if r_fastq:
                r_fastq = do_quality_any(r_fastq, rnum)
        elif args.mismatch_error_by_quality:
            l_fastq = do_quality_mismatch(l_fastq, rnum)
            if r_fastq:
                r_fastq = do_quality_mismatch(r_fastq, rnum)
        elif args.profile_context_error:
            l_fastq = ep.permute_context(l_fastq)
            if r_fastq:
                r_fastq = ep.permute_context(r_fastq)
        elif args.profile_general_error:
            l_fastq = ep.permute_general(l_fastq)
            if r_fastq:
                r_fastq = ep.permute_general(r_fastq)
        # if SR grow/shrink to appropriate length
        if args.sr and len(l_fastq) != args.sr_length:
            l_fastq = fit_length(l_fastq, args.sr_length, rnum)
        if r:
            if args.sr and len(r_fastq) != args.sr_length:
                r_fastq = fit_length(r_fastq, args.sr_length, rnum)
        of1.write(l_fastq.fastq())
        if of2:
            of2.write(r_fastq.fastq())
        stage4left = l_fastq.seq
        # fixed ordering: assign before the of_sc write below
        stage4right = r_fastq.seq if r_fastq else ''
        if of_sc:
            of_sc.write(l_fastq.name+"\t"+tx.get_gene_name()+"\t"+tx.get_transcript_name()+"\t" \
                + stage1seq+"\t"+stage2seq+"\t"+stage3left+"\t"+stage3right+"\t"+stage4left+"\t"+stage4right+"\n")
        finished_count += 1
        if finished_count % 1000 == 0:
            sys.stderr.write(
                str(finished_count) + '/' + str(args.count) + " \r")
    sys.stderr.write("\n")
    of1.close()
    if of2:
        of2.close()
    if of_origin:
        of_origin.close()
    if of_sc:
        of_sc.close()
    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)
def random_flip(self, sequence):
    """Return *sequence* or its reverse complement, chosen with p=0.5.

    Uses this object's random source so results are reproducible under a
    fixed seed.
    """
    flip = self.random.random() < 0.5
    return rc(sequence) if flip else sequence
def __init__(self, alignment, min_intron_size=68):
    """Index an alignment into per-base homopolymer-alignment records.

    The alignment strings are split into homopolymer groups
    (``_misalign_split``); one lookup record is then built per query base
    and per target base, each holding its group and neighbor groups.

    :param alignment: alignment providing get_alignment_strings(),
                      get_query_quality() and get_strand()
    :param min_intron_size: minimum gap length treated as an intron
    """
    self._min_intron_size = min_intron_size
    self._aligned_query = None
    self._hpas = []
    self._has_quality = False  # flipped on below when quality is present
    self._alignment = alignment
    self._quality_distro = None  # gets set by analyze_quality
    self._deletion_type = None
    self._query_errors = None
    self._target_errors = None
    self._context_query_errors = None
    self._context_target_errors = None
    astrings = self._alignment.get_alignment_strings(
        min_intron_size=self._min_intron_size)
    if self._alignment.get_query_quality():
        self._has_quality = True
    if len(astrings) == 0:
        return  # empty alignment; keep all indexes empty
    alns = []
    for i in range(len(astrings[0])):
        if self._alignment.get_strand() == '+':
            alns.append({
                'query': astrings[0][i],
                'target': astrings[1][i],
                'query_quality': astrings[2][i]
            })
        else:
            # minus strand: reverse-complement and reverse segment order
            alns.insert(
                0, {
                    'query': rc(astrings[0][i]),
                    'target': rc(astrings[1][i]),
                    'query_quality': astrings[2][i][::-1]
                })
    # split alignment into homopolymer groups
    self._hpas = self._misalign_split(alns)
    # The two neighbor-indexing loops were identical apart from which
    # sequence they measured (and kept dead qi/ti counters); use a helper.
    self._query_hpas = self._index_hpas('get_query')
    self._target_hpas = self._index_hpas('get_target')
    self._target_errors = self.get_target_errors()
    self._query_errors = self.get_query_errors()
    self._context_target_errors = self.get_context_target_errors()

def _index_hpas(self, accessor):
    """Return one {'hpa','pos','prev-hpa','next-hpa'} record per base.

    :param accessor: HPA method name to measure ('get_query'/'get_target')
    """
    records = []
    for i, hpa in enumerate(self._hpas):
        prev = self._hpas[i - 1] if i > 0 else None
        foll = self._hpas[i + 1] if i + 1 < len(self._hpas) else None
        for j in range(len(getattr(hpa, accessor)())):
            records.append({
                'hpa': hpa,
                'pos': j,
                'prev-hpa': prev,
                'next-hpa': foll
            })
    return records