def random_flip(sequence, rnum=None):
    """Return ``sequence`` unchanged or passed through ``rc``, chosen at random.

    sequence -- the sequence to (possibly) flip
    rnum     -- random source exposing ``random()``; when it is falsy a fresh
                ``RandomSource()`` is created for this call

    A single draw from ``random()`` decides the outcome: a value below 0.5
    returns ``rc(sequence)`` (presumably the reverse complement -- ``rc`` is
    defined elsewhere), anything else returns ``sequence`` as given.
    """
    source = rnum or RandomSource()
    return rc(sequence) if source.random() < 0.5 else sequence
def __init__(self, transcriptome, seed=None, rand=None):
    """Store the transcriptome, pick a random source and set default weights.

    transcriptome -- object exposing ``get_transcripts()``
    seed          -- passed to ``RandomSource(seed)`` when ``rand`` is falsy
                     and ``seed`` is truthy
    rand          -- ready-made random source; takes precedence over ``seed``
    """
    # A supplied source wins; otherwise seed one if a truthy seed was given.
    if rand:
        source = rand
    else:
        source = RandomSource(seed) if seed else RandomSource()
    self.random = source
    self._transcriptome = transcriptome

    # Default weight of the transcript at 0-based position i is (i + 1) / total.
    # How these values are interpreted is up to get_weighted_random_index.
    total = len(self._transcriptome.get_transcripts())
    self._weights = [float(rank) / float(total) for rank in range(1, total + 1)]

    # _log stores what we are emitting
    self._log = []
def __init__(self, rand=None, seed=None):
    """Choose the random source and start with the long-read cut settings.

    rand -- ready-made random source; used when truthy
    seed -- when truthy, ``RandomSource(seed)`` replaces whatever source was
            chosen from ``rand`` (the seed has the final say)
    """
    self.random = rand if rand else RandomSource()
    if seed:
        self.random = RandomSource(seed)

    # Cleared here, then filled in by set_lr_cuts() below.
    self._gauss_min = self._gauss_mu = self._gauss_sigma = None
    self.set_lr_cuts()
def __init__(self, transcriptome, seed=None, rand=None):
    """Store the transcriptome, pick a random source and set default weights.

    transcriptome -- object exposing ``get_transcripts()``
    seed          -- passed to ``RandomSource(seed)`` when ``rand`` is falsy
                     and ``seed`` is truthy
    rand          -- ready-made random source; takes precedence over ``seed``
    """
    # A supplied source wins; otherwise seed one if a truthy seed was given.
    if rand:
        source = rand
    else:
        source = RandomSource(seed) if seed else RandomSource()
    self.random = source
    self._transcriptome = transcriptome

    # Default weight of the transcript at 0-based position i is (i + 1) / total.
    # How these values are interpreted is up to get_weighted_random_index.
    total = len(self._transcriptome.get_transcripts())
    self._weights = [float(rank) / float(total) for rank in range(1, total + 1)]

    # _log stores what we are emitting
    self._log = []
def __init__(self, rand=None, seed=None):
    """Choose the random source and clear every context/base filter.

    rand -- ready-made random source; used when truthy
    seed -- when truthy, ``RandomSource(seed)`` replaces whatever source was
            chosen from ``rand`` (the seed has the final say)
    """
    self.random = rand if rand else RandomSource()
    if seed:
        self.random = RandomSource(seed)

    # Context information: bases required before / after an edited position.
    self._before_base = None
    self._after_base = None
    # Reference base to change for deletions and mismatches.
    self._observed_base = None
    # What to change the base to for insertions or mismatches.
    self._modified_base = None
class TranscriptomeEmitter:
    """Pick transcripts from a transcriptome using per-transcript weights."""

    def __init__(self, transcriptome, seed=None, rand=None):
        """Store the transcriptome, pick a random source, set default weights.

        transcriptome -- object exposing ``get_transcripts()``
        seed          -- passed to ``RandomSource(seed)`` when ``rand`` is
                         falsy and ``seed`` is truthy
        rand          -- ready-made random source; takes precedence over seed
        """
        if rand:
            source = rand
        else:
            source = RandomSource(seed) if seed else RandomSource()
        self.random = source
        self._transcriptome = transcriptome

        # Default weight of the transcript at 0-based position i is
        # (i + 1) / total; interpretation is up to get_weighted_random_index.
        total = len(self._transcriptome.get_transcripts())
        self._weights = [
            float(rank) / float(total) for rank in range(1, total + 1)
        ]

        # _log stores what we are emitting
        self._log = []

    def emit_transcript(self):
        """Return the transcript at the index chosen by the random source."""
        chosen = self.random.get_weighted_random_index(self._weights)
        transcripts = self._transcriptome.get_transcripts()
        return transcripts[chosen]

    def set_weights_by_dict(self, weights):
        """Replace the weights using a lookup keyed by transcript name.

        weights -- container supporting ``name in weights`` and
                   ``weights[name]``; transcripts whose name is absent get a
                   weight of 0.0.  Weights end up in transcript order.
        """
        self._weights = []
        names = [
            tx.get_transcript_name()
            for tx in self._transcriptome.get_transcripts()
        ]
        self._weights.extend(
            float(weights[name]) if name in weights else float(0)
            for name in names
        )
class TranscriptomeEmitter:
    """Pick transcripts from a transcriptome using per-transcript weights."""

    def __init__(self, transcriptome, seed=None, rand=None):
        """Store the transcriptome, pick a random source, set default weights.

        transcriptome -- object exposing ``get_transcripts()``
        seed          -- passed to ``RandomSource(seed)`` when ``rand`` is
                         falsy and ``seed`` is truthy
        rand          -- ready-made random source; takes precedence over seed
        """
        if rand:
            source = rand
        else:
            source = RandomSource(seed) if seed else RandomSource()
        self.random = source
        self._transcriptome = transcriptome

        # Default weight of the transcript at 0-based position i is
        # (i + 1) / total; interpretation is up to get_weighted_random_index.
        total = len(self._transcriptome.get_transcripts())
        self._weights = [
            float(rank) / float(total) for rank in range(1, total + 1)
        ]

        # _log stores what we are emitting
        self._log = []

    def emit_transcript(self):
        """Return the transcript at the index chosen by the random source."""
        chosen = self.random.get_weighted_random_index(self._weights)
        transcripts = self._transcriptome.get_transcripts()
        return transcripts[chosen]

    def set_weights_by_dict(self, weights):
        """Replace the weights using a lookup keyed by transcript name.

        weights -- container supporting ``name in weights`` and
                   ``weights[name]``; transcripts whose name is absent get a
                   weight of 0.0.  Weights end up in transcript order.
        """
        self._weights = []
        names = [
            tx.get_transcript_name()
            for tx in self._transcriptome.get_transcripts()
        ]
        self._weights.extend(
            float(weights[name]) if name in weights else float(0)
            for name in names
        )
class MakeCuts:
    """Cut a random fragment out of a sequence, with Gaussian-drawn length.

    The fragment length is drawn from ``gauss(mu, sigma)`` of the random
    source, raised to at least ``gauss_min`` and capped at the sequence
    length.  New instances start with the long-read settings.
    """

    def __init__(self, rand=None, seed=None):
        """Choose the random source and apply the long-read cut settings.

        rand -- ready-made random source; used when truthy
        seed -- when truthy, ``RandomSource(seed)`` replaces whatever source
                was chosen from ``rand`` (the seed has the final say)
        """
        self.random = rand if rand else RandomSource()
        if seed:
            self.random = RandomSource(seed)
        self._gauss_min = None
        self._gauss_mu = None
        self._gauss_sigma = None
        self.set_lr_cuts()

    def get_cut(self, seq):
        """Return a randomly placed slice of ``seq``.

        Draws once from ``gauss`` for the length and once from ``randint``
        for the start offset, in that order.
        """
        rgauss = self.random.gauss(self._gauss_mu, self._gauss_sigma)
        length = min(len(seq), max(self._gauss_min, int(rgauss)))
        # No debug printing here: stdout may be the FASTQ output stream.
        leeway = len(seq) - length
        start = self.random.randint(0, leeway)
        return seq[start:start + length]

    def set_custom(self, gmin, gmu, gsigma):
        """Set the minimum length, mean and sigma used by ``get_cut``."""
        self._gauss_min = gmin
        self._gauss_mu = gmu
        self._gauss_sigma = gsigma

    def set_lr_cuts(self):
        """Apply the long-read preset: min 1000, mean 4000, sigma 500."""
        self.set_custom(1000, 4000, 500)

    def set_sr_cuts(self):
        """Apply the short-read preset: min 150, mean 290, sigma 290."""
        # NOTE(review): sigma equals the mean here -- confirm this is intended.
        self.set_custom(150, 290, 290)
def _open_output(path):
    """Open ``path`` for writing; names ending in ``.gz`` go through gzip."""
    if path[-3:] == '.gz':
        return gzip.open(path, 'w')
    return open(path, 'w')


def _apply_specific_errors(args, rnum, l_fastq, r_fastq):
    """Apply the user-specified error type to the left (and right) read.

    A reference base of '-' selects insertions, a modified base of '-'
    selects deletions, anything else selects substitutions.  The same
    operation is applied to both mates.  Returns ``(l_fastq, r_fastq)``;
    a falsy ``r_fastq`` is passed through untouched.
    """
    rate = args.specific_errors
    me = MakeErrors(rand=rnum)
    if args.specific_before_context:
        me.set_before_context(args.specific_before_context)
    if args.specific_after_context:
        me.set_after_context(args.specific_after_context)
    if args.specific_reference_base and args.specific_reference_base != '-':
        me.set_observed_base(args.specific_reference_base)
    if args.specific_modified_base and args.specific_modified_base != '-':
        me.set_modified_base(args.specific_modified_base)

    if args.specific_reference_base == '-':
        permute = me.random_insertion      # doing insertions
    elif args.specific_modified_base == '-':
        permute = me.random_deletion       # doing deletions
    else:
        permute = me.random_substitution
    l_fastq = permute(l_fastq, rate)
    if r_fastq:
        r_fastq = permute(r_fastq, rate)
    return l_fastq, r_fastq


def main(args):
    """Emit simulated reads in FASTQ form from a serialized transcriptome.

    ``args`` is the parsed command-line namespace.  The function loads the
    emitter description (``args.emitter``, '-' for stdin), draws transcripts,
    optionally trims, flips and cuts them, assigns qualities, applies the
    selected error model and writes up to ``args.count`` reads to the
    output(s).  Optional side files record each read's source transcript and
    the sequence at each processing stage.  Exits via ``sys.exit()`` on
    invalid option combinations.
    """
    # check outputs
    if len(args.output) > 1 and not args.sr:
        sys.stderr.write(
            "Error: Long reads don't support multiple output files\n")
        sys.exit()
    elif len(args.output) > 2:
        sys.stderr.write(
            "Error: Short reads support at most two output files (paired end)\n"
        )
        sys.exit()

    if args.sr_length < args.minimum_read_length:
        args.minimum_read_length = args.sr_length

    inf = sys.stdin
    if args.emitter != '-':
        inf = open(args.emitter)
    sys.stderr.write("reading in transcriptome emitter\n")
    try:
        raw = inf.read()
    finally:
        # Only close what was opened here; stdin is left alone.
        if inf is not sys.stdin:
            inf.close()
    # NOTE(security): pickle.loads executes arbitrary code from its input.
    # Only feed this emitter files from a trusted source.
    indata = pickle.loads(zlib.decompress(base64.b64decode(raw.rstrip())))
    txome = Transcriptome()
    txome.load_serialized(indata['txome'])

    rnum = RandomSource()
    rnum_tx = RandomSource()  # for drawing transcripts
    if args.seed:
        rnum = RandomSource(args.seed)
        rnum_tx = RandomSource(args.seed)

    # Load in error profile data
    ep = None
    if args.error_profile:
        sys.stderr.write("read in error profile\n")
        ep = ErrorProfilePermuter(args.error_profile, rnum,
                                  args.skew_profile_error_rate)

    txemitter = TranscriptomeEmitter(txome, rand=rnum_tx)
    if indata['weight_type'] == 'expression_table':
        sys.stderr.write(
            "Using expression table defined transcript expression\n")
        txweight = indata['weights']
        txemitter.set_weights_by_dict(txweight)
    elif indata['weight_type'] == 'exponential_distribution':
        sys.stderr.write(
            "ERROR not yet implemented exponential distribution\n")
        sys.exit()
    elif indata['weight_type'] == 'uniform_distribution':
        sys.stderr.write(
            "Using uniform distribution of transcript expression\n")

    cutter = MakeCuts(rand=rnum_tx)
    if args.sr:
        cutter.set_custom(args.sr_gauss_min, args.sr_gauss_mu,
                          args.sr_gauss_sigma)
    elif args.lr:
        cutter.set_custom(args.lr_gauss_min, args.lr_gauss_mu,
                          args.lr_gauss_sigma)

    # Prepare outputs ('-' means stdout for the first output)
    of1 = sys.stdout
    if args.output[0] != '-':
        of1 = _open_output(args.output[0])
    of2 = None
    # The second output is decided by its own name, not the first one's.
    if len(args.output) > 1 and args.output[1] != '-':
        of2 = _open_output(args.output[1])
    of_origin = None
    if args.output_original_source:
        of_origin = _open_output(args.output_original_source)
    of_sc = None
    if args.output_sequence_change:
        of_sc = _open_output(args.output_sequence_change)

    # Give up after 100 attempts per requested read so that a transcriptome
    # yielding only too-short fragments cannot loop forever.
    absmax = args.count * 100
    finished_count = 0
    z = 0
    while finished_count < args.count:
        z += 1
        if z > absmax:
            break
        tx = txemitter.emit_transcript()
        seq = tx.get_sequence()
        stage1seq = seq
        if args.trim_5prime or args.trim_3prime:
            fivestart = 0
            threeend = len(seq)
            if args.trim_5prime:
                lcut = int(args.trim_5prime[0] * len(seq))
                rcut = int(args.trim_5prime[1] * len(seq))
                fivestart = rnum_tx.randint(lcut, rcut)
            if args.trim_3prime:
                lcut = int(args.trim_3prime[0] * len(seq))
                rcut = int(args.trim_3prime[1] * len(seq))
                threeend = rnum_tx.randint(lcut, rcut)
            # set sequence to its new trimmed bounds
            seq = seq[fivestart:threeend]
        # flip sequence if necessary
        if not args.no_flip:
            seq = random_flip(seq, rnum_tx)

        l_read = create_name(rnum)
        r_read = None
        if args.sr or args.lr:
            cutseq = cutter.get_cut(seq)
        else:
            cutseq = seq  # case for no_fragmentation

        # if we pass this we will really start with this one
        if len(cutseq) < args.minimum_read_length:
            continue

        # can now log our read name
        if of_origin:
            of_origin.write(l_read + "\t" + tx.get_gene_name() + "\t" +
                            tx.get_transcript_name() + "\n")
        stage2seq = cutseq

        # Short reads yield a left read plus an rc()'d right read taken from
        # the far end of the fragment; otherwise the fragment is the read.
        r = None
        if args.sr:
            r_read = l_read
            l = cutseq[0:args.sr_length]
            r = rc(cutseq[-1 * args.sr_length:])
        else:
            l = cutseq
        stage3left = l
        stage3right = r
        if not stage3right:
            stage3right = ''

        # l (or l and r) contains the sequence prior to errors being added
        l_qual = 'I' * len(l)
        r_qual = None
        if r:
            r_qual = 'I' * len(r)
        if args.fixed_quality:
            if len(args.fixed_quality) != 1:
                sys.stderr.write(
                    "ERROR fixed quaility should be 1 character\n")
                sys.exit()
            l_qual = args.fixed_quality * len(l)
            if r:
                r_qual = args.fixed_quality * len(r)
        elif args.quality_from_error_rate:
            qchar = chr(
                int(-10 * math.log10(args.quality_from_error_rate)) + 33)
            l_qual = qchar * len(l)
            if r:
                r_qual = qchar * len(r)
        else:
            # default is generate quality from profile
            if not ep:
                sys.stderr.write(
                    "ERROR: cannot generate quality from a profile. Set error profile or chooce quaility from error rate or fixed quality\n"
                )
                sys.exit()
            l_qual = ep.emit_qual(len(l))
            if r:
                r_qual = ep.emit_qual(len(r))

        # Now prior to errors l_qual and r_qual contain our qualities
        l_fastq = Fastq([l_read, l, '+', l_qual])
        r_fastq = None
        if r:
            r_fastq = Fastq([r_read, r, '+', r_qual])

        # Permute sequences by the selected error model
        if args.specific_errors:
            l_fastq, r_fastq = _apply_specific_errors(args, rnum, l_fastq,
                                                      r_fastq)
        elif args.uniform_any_error:
            l_fastq = do_uniform_any(l_fastq, rnum, args.uniform_any_error)
            if r_fastq:
                r_fastq = do_uniform_any(r_fastq, rnum,
                                         args.uniform_any_error)
        elif args.uniform_mismatch_error:
            l_fastq = do_uniform_mismatch(l_fastq, rnum,
                                          args.uniform_mismatch_error)
            if r_fastq:
                r_fastq = do_uniform_mismatch(r_fastq, rnum,
                                              args.uniform_mismatch_error)
        elif args.any_error_by_quality:
            l_fastq = do_quality_any(l_fastq, rnum)
            if r_fastq:
                r_fastq = do_quality_any(r_fastq, rnum)
        elif args.mismatch_error_by_quality:
            l_fastq = do_quality_mismatch(l_fastq, rnum)
            if r_fastq:
                r_fastq = do_quality_mismatch(r_fastq, rnum)
        elif args.profile_context_error:
            l_fastq = ep.permute_context(l_fastq)
            if r_fastq:
                r_fastq = ep.permute_context(r_fastq)
        elif args.profile_general_error:
            l_fastq = ep.permute_general(l_fastq)
            if r_fastq:
                r_fastq = ep.permute_general(r_fastq)

        # if SR grown/shrink to appropriate length
        if args.sr and len(l_fastq) != args.sr_length:
            l_fastq = fit_length(l_fastq, args.sr_length, rnum)
        if r:
            if args.sr and len(r_fastq) != args.sr_length:
                r_fastq = fit_length(r_fastq, args.sr_length, rnum)

        of1.write(l_fastq.fastq())
        if of2:
            of2.write(r_fastq.fastq())

        # Both final sequences must be known before the change row is
        # written, otherwise the right-hand column is always empty.
        stage4left = l_fastq.seq
        stage4right = ''
        if r_fastq:
            stage4right = r_fastq.seq
        if of_sc:
            of_sc.write(l_fastq.name + "\t" + tx.get_gene_name() + "\t" +
                        tx.get_transcript_name() + "\t" + stage1seq + "\t" +
                        stage2seq + "\t" + stage3left + "\t" + stage3right +
                        "\t" + stage4left + "\t" + stage4right + "\n")

        finished_count += 1
        if finished_count % 1000 == 0:
            sys.stderr.write(
                str(finished_count) + '/' + str(args.count) + " \r")
    sys.stderr.write("\n")

    of1.close()
    if of2:
        of2.close()
    if of_origin:
        of_origin.close()
    if of_sc:
        of_sc.close()

    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)
def _open_output(path):
    """Open ``path`` for writing; names ending in ``.gz`` go through gzip."""
    if path[-3:] == '.gz':
        return gzip.open(path, 'w')
    return open(path, 'w')


def _apply_specific_errors(args, rnum, l_fastq, r_fastq):
    """Apply the user-specified error type to the left (and right) read.

    A reference base of '-' selects insertions, a modified base of '-'
    selects deletions, anything else selects substitutions.  The same
    operation is applied to both mates.  Returns ``(l_fastq, r_fastq)``;
    a falsy ``r_fastq`` is passed through untouched.
    """
    rate = args.specific_errors
    me = MakeErrors(rand=rnum)
    if args.specific_before_context:
        me.set_before_context(args.specific_before_context)
    if args.specific_after_context:
        me.set_after_context(args.specific_after_context)
    if args.specific_reference_base and args.specific_reference_base != '-':
        me.set_observed_base(args.specific_reference_base)
    if args.specific_modified_base and args.specific_modified_base != '-':
        me.set_modified_base(args.specific_modified_base)

    if args.specific_reference_base == '-':
        permute = me.random_insertion      # doing insertions
    elif args.specific_modified_base == '-':
        permute = me.random_deletion       # doing deletions
    else:
        permute = me.random_substitution
    l_fastq = permute(l_fastq, rate)
    if r_fastq:
        r_fastq = permute(r_fastq, rate)
    return l_fastq, r_fastq


def main(args):
    """Emit simulated reads in FASTQ form from a serialized transcriptome.

    ``args`` is the parsed command-line namespace.  The function loads the
    emitter description (``args.emitter``, '-' for stdin), draws transcripts,
    optionally trims, flips and cuts them, assigns qualities, applies the
    selected error model and writes up to ``args.count`` reads to the
    output(s).  Optional side files record each read's source transcript and
    the sequence at each processing stage.  Exits via ``sys.exit()`` on
    invalid option combinations.
    """
    # check outputs
    if len(args.output) > 1 and not args.sr:
        sys.stderr.write("Error: Long reads don't support multiple output files\n")
        sys.exit()
    elif len(args.output) > 2:
        sys.stderr.write("Error: Short reads support at most two output files (paired end)\n")
        sys.exit()

    if args.sr_length < args.minimum_read_length:
        args.minimum_read_length = args.sr_length

    inf = sys.stdin
    if args.emitter != '-':
        inf = open(args.emitter)
    sys.stderr.write("reading in transcriptome emitter\n")
    try:
        raw = inf.read()
    finally:
        # Only close what was opened here; stdin is left alone.
        if inf is not sys.stdin:
            inf.close()
    # NOTE(security): pickle.loads executes arbitrary code from its input.
    # Only feed this emitter files from a trusted source.
    indata = pickle.loads(zlib.decompress(base64.b64decode(raw.rstrip())))
    txome = Transcriptome()
    txome.load_serialized(indata['txome'])

    rnum = RandomSource()
    rnum_tx = RandomSource()  # for drawing transcripts
    if args.seed:
        rnum = RandomSource(args.seed)
        rnum_tx = RandomSource(args.seed)

    # Load in error profile data
    ep = None
    if args.error_profile:
        sys.stderr.write("read in error profile\n")
        ep = ErrorProfilePermuter(args.error_profile, rnum, args.skew_profile_error_rate)

    txemitter = TranscriptomeEmitter(txome, rand=rnum_tx)
    if indata['weight_type'] == 'expression_table':
        sys.stderr.write("Using expression table defined transcript expression\n")
        txweight = indata['weights']
        txemitter.set_weights_by_dict(txweight)
    elif indata['weight_type'] == 'exponential_distribution':
        sys.stderr.write("ERROR not yet implemented exponential distribution\n")
        sys.exit()
    elif indata['weight_type'] == 'uniform_distribution':
        sys.stderr.write("Using uniform distribution of transcript expression\n")

    cutter = MakeCuts(rand=rnum_tx)
    if args.sr:
        cutter.set_custom(args.sr_gauss_min, args.sr_gauss_mu, args.sr_gauss_sigma)
    elif args.lr:
        cutter.set_custom(args.lr_gauss_min, args.lr_gauss_mu, args.lr_gauss_sigma)

    # Prepare outputs ('-' means stdout for the first output)
    of1 = sys.stdout
    if args.output[0] != '-':
        of1 = _open_output(args.output[0])
    of2 = None
    # The second output is decided by its own name, not the first one's.
    if len(args.output) > 1 and args.output[1] != '-':
        of2 = _open_output(args.output[1])
    of_origin = None
    if args.output_original_source:
        of_origin = _open_output(args.output_original_source)
    of_sc = None
    if args.output_sequence_change:
        of_sc = _open_output(args.output_sequence_change)

    # Give up after 100 attempts per requested read so that a transcriptome
    # yielding only too-short fragments cannot loop forever.
    absmax = args.count * 100
    finished_count = 0
    z = 0
    while finished_count < args.count:
        z += 1
        if z > absmax:
            break
        tx = txemitter.emit_transcript()
        seq = tx.get_sequence()
        stage1seq = seq
        if args.trim_5prime or args.trim_3prime:
            fivestart = 0
            threeend = len(seq)
            if args.trim_5prime:
                lcut = int(args.trim_5prime[0] * len(seq))
                rcut = int(args.trim_5prime[1] * len(seq))
                fivestart = rnum_tx.randint(lcut, rcut)
            if args.trim_3prime:
                lcut = int(args.trim_3prime[0] * len(seq))
                rcut = int(args.trim_3prime[1] * len(seq))
                threeend = rnum_tx.randint(lcut, rcut)
            # set sequence to its new trimmed bounds
            seq = seq[fivestart:threeend]
        # flip sequence if necessary
        if not args.no_flip:
            seq = random_flip(seq, rnum_tx)

        l_read = create_name(rnum)
        r_read = None
        if args.sr or args.lr:
            cutseq = cutter.get_cut(seq)
        else:
            cutseq = seq  # case for no_fragmentation

        # if we pass this we will really start with this one
        if len(cutseq) < args.minimum_read_length:
            continue

        # can now log our read name
        if of_origin:
            of_origin.write(l_read + "\t" + tx.get_gene_name() + "\t" + tx.get_transcript_name() + "\n")
        stage2seq = cutseq

        # Short reads yield a left read plus an rc()'d right read taken from
        # the far end of the fragment; otherwise the fragment is the read.
        r = None
        if args.sr:
            r_read = l_read
            l = cutseq[0:args.sr_length]
            r = rc(cutseq[-1 * args.sr_length:])
        else:
            l = cutseq
        stage3left = l
        stage3right = r
        if not stage3right:
            stage3right = ''

        # l (or l and r) contains the sequence prior to errors being added
        l_qual = 'I' * len(l)
        r_qual = None
        if r:
            r_qual = 'I' * len(r)
        if args.fixed_quality:
            if len(args.fixed_quality) != 1:
                sys.stderr.write("ERROR fixed quaility should be 1 character\n")
                sys.exit()
            l_qual = args.fixed_quality * len(l)
            if r:
                r_qual = args.fixed_quality * len(r)
        elif args.quality_from_error_rate:
            qchar = chr(int(-10 * math.log10(args.quality_from_error_rate)) + 33)
            l_qual = qchar * len(l)
            if r:
                r_qual = qchar * len(r)
        else:
            # default is generate quality from profile
            if not ep:
                sys.stderr.write("ERROR: cannot generate quality from a profile. Set error profile or chooce quaility from error rate or fixed quality\n")
                sys.exit()
            l_qual = ep.emit_qual(len(l))
            if r:
                r_qual = ep.emit_qual(len(r))

        # Now prior to errors l_qual and r_qual contain our qualities
        l_fastq = Fastq([l_read, l, '+', l_qual])
        r_fastq = None
        if r:
            r_fastq = Fastq([r_read, r, '+', r_qual])

        # Permute sequences by the selected error model
        if args.specific_errors:
            l_fastq, r_fastq = _apply_specific_errors(args, rnum, l_fastq, r_fastq)
        elif args.uniform_any_error:
            l_fastq = do_uniform_any(l_fastq, rnum, args.uniform_any_error)
            if r_fastq:
                r_fastq = do_uniform_any(r_fastq, rnum, args.uniform_any_error)
        elif args.uniform_mismatch_error:
            l_fastq = do_uniform_mismatch(l_fastq, rnum, args.uniform_mismatch_error)
            if r_fastq:
                r_fastq = do_uniform_mismatch(r_fastq, rnum, args.uniform_mismatch_error)
        elif args.any_error_by_quality:
            l_fastq = do_quality_any(l_fastq, rnum)
            if r_fastq:
                r_fastq = do_quality_any(r_fastq, rnum)
        elif args.mismatch_error_by_quality:
            l_fastq = do_quality_mismatch(l_fastq, rnum)
            if r_fastq:
                r_fastq = do_quality_mismatch(r_fastq, rnum)
        elif args.profile_context_error:
            l_fastq = ep.permute_context(l_fastq)
            if r_fastq:
                r_fastq = ep.permute_context(r_fastq)
        elif args.profile_general_error:
            l_fastq = ep.permute_general(l_fastq)
            if r_fastq:
                r_fastq = ep.permute_general(r_fastq)

        # if SR grown/shrink to appropriate length
        if args.sr and len(l_fastq) != args.sr_length:
            l_fastq = fit_length(l_fastq, args.sr_length, rnum)
        if r:
            if args.sr and len(r_fastq) != args.sr_length:
                r_fastq = fit_length(r_fastq, args.sr_length, rnum)

        of1.write(l_fastq.fastq())
        if of2:
            of2.write(r_fastq.fastq())

        # Both final sequences must be known before the change row is
        # written, otherwise the right-hand column is always empty.
        stage4left = l_fastq.seq
        stage4right = ''
        if r_fastq:
            stage4right = r_fastq.seq
        if of_sc:
            of_sc.write(l_fastq.name+"\t"+tx.get_gene_name()+"\t"+tx.get_transcript_name()+"\t" \
              + stage1seq+"\t"+stage2seq+"\t"+stage3left+"\t"+stage3right+"\t"+stage4left+"\t"+stage4right+"\n")

        finished_count += 1
        if finished_count % 1000 == 0:
            sys.stderr.write(str(finished_count)+'/'+str(args.count)+" \r")
    sys.stderr.write("\n")

    of1.close()
    if of2:
        of2.close()
    if of_origin:
        of_origin.close()
    if of_sc:
        of_sc.close()

    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)
class MakeErrors:
    """Introduce random substitutions, deletions or insertions into a read.

    Optional filters restrict which positions may be altered: a required
    base before the position, a required base after it, and (for deletions
    and substitutions) a required base at the position itself.  A modified
    base, when set, is used instead of a randomly drawn one.
    """

    def __init__(self, rand=None, seed=None):
        """Choose the random source and clear every filter.

        rand -- ready-made random source; used when truthy
        seed -- when truthy, ``RandomSource(seed)`` replaces whatever source
                was chosen from ``rand`` (the seed has the final say)
        """
        self.random = rand if rand else RandomSource()
        if seed:
            self.random = RandomSource(seed)
        # Context information: bases required before / after a position.
        self._before_base = None
        self._after_base = None
        # Reference base to change for deletions and mismatches.
        self._observed_base = None
        # What to change the base to for insertions or mismatches.
        self._modified_base = None

    def set_before_context(self, base):
        """Require ``base`` immediately before an altered position."""
        self._before_base = base

    def set_after_context(self, base):
        """Require ``base`` immediately after an altered position."""
        self._after_base = base

    def set_observed_base(self, base):
        """Only alter positions whose current base is ``base``."""
        self._observed_base = base

    def set_modified_base(self, base):
        """Use ``base`` as the substituted or inserted base."""
        self._modified_base = base

    def _context_allows(self, before, after):
        """Return True when the before/after filters accept these neighbours.

        ``None`` stands for "no neighbour" and never satisfies a set filter.
        """
        if self._before_base and before != self._before_base:
            return False
        if self._after_base and after != self._after_base:
            return False
        return True

    def _observed_allows(self, base):
        """Return True when the observed-base filter accepts ``base``."""
        return not self._observed_base or base == self._observed_base

    def _insert_one(self, seq_parts, qual_parts, ibase):
        """Append one inserted base (and its quality, if tracked)."""
        if self._modified_base:
            seq_parts.append(self._modified_base)
        else:
            seq_parts.append(self.random.random_nt())
        if qual_parts is not None:
            qual_parts.append(ibase)

    def random_substitution(self, fastq, rate):
        """Return a new Fastq with eligible bases substituted at ``rate``.

        One random number is drawn per eligible position only, so filtered
        positions do not advance the random stream.  Qualities are kept.
        """
        sequence = fastq.seq
        last = len(sequence) - 1
        out = []
        for i, base in enumerate(sequence):
            before = sequence[i - 1] if i >= 1 else None
            after = sequence[i + 1] if i < last else None
            eligible = (self._context_allows(before, after)
                        and self._observed_allows(base))
            if eligible and self.random.random() < rate:
                if self._modified_base:
                    base = self._modified_base
                else:
                    base = self.random.different_random_nt(base)
            out.append(base)
        return Fastq([fastq.name, "".join(out), "+", fastq.qual])

    def random_deletion(self, fastq, rate):
        """Return a new Fastq with eligible bases deleted at ``rate``.

        The matching quality character is dropped with each deleted base.
        When the input has no (or empty) quality the result's quality is
        ``None``.
        """
        sequence = fastq.seq
        quality = fastq.qual
        last = len(sequence) - 1
        kept_seq = []
        kept_qual = [] if quality else None
        for i, base in enumerate(sequence):
            before = sequence[i - 1] if i >= 1 else None
            after = sequence[i + 1] if i < last else None
            eligible = (self._context_allows(before, after)
                        and self._observed_allows(base))
            if eligible and self.random.random() < rate:
                continue  # base deleted
            kept_seq.append(base)
            if kept_qual is not None:
                kept_qual.append(quality[i])
        qual = "".join(kept_qual) if kept_qual is not None else None
        return Fastq([fastq.name, "".join(kept_seq), "+", qual])

    def random_insertion(self, fastq, rate, max_inserts=1):
        """Return a new Fastq with bases inserted at ``rate``.

        fastq       -- read to permute (``name``, ``seq``, ``qual``)
        rate        -- per-site probability of an insertion
        max_inserts -- most bases inserted at any single site

        Insertions may occur before the first base and after every base.
        Inserted bases get the quality ``rate_to_phred33(rate)``.  The
        observed-base filter is not consulted for insertions.
        """
        sequence = fastq.seq
        quality = fastq.qual
        ibase = rate_to_phred33(rate)
        seq_parts = []
        qual_parts = [] if quality else None

        # Site before the first base.  The random draw comes first so the
        # random stream advances the same way whether or not filters apply.
        inserted = 0
        while self.random.random() < rate and inserted < max_inserts:
            if self._before_base:
                break  # nothing precedes this site, so it can never match
            # The base following this site is the first base of the read.
            if self._after_base and (not sequence
                                     or self._after_base != sequence[0]):
                break
            inserted += 1
            self._insert_one(seq_parts, qual_parts, ibase)

        # Site after each base: that base is the "before" context.
        last = len(sequence) - 1
        for i, base in enumerate(sequence):
            seq_parts.append(base)
            if qual_parts is not None:
                qual_parts.append(quality[i])
            after = sequence[i + 1] if i < last else None
            if not self._context_allows(base, after):
                continue
            inserted = 0
            while self.random.random() < rate and inserted < max_inserts:
                inserted += 1
                self._insert_one(seq_parts, qual_parts, ibase)

        qual = "".join(qual_parts) if qual_parts is not None else None
        return Fastq([fastq.name, "".join(seq_parts), "+", qual])

    def random_flip(self, sequence):
        """Return ``rc(sequence)`` when one random draw is below 0.5."""
        if self.random.random() < 0.5:
            return rc(sequence)
        return sequence