def main(args):
    """Build and serialize a Transcriptome 'emitter' payload.

    Reads a reference FASTA and a GPD transcript annotation, attaches genomic
    sequence to every transcript, and writes a base64(zlib(pickle)) blob
    containing the serialized transcriptome plus per-transcript weights.

    :param args: parsed CLI arguments; uses reference_fasta, reference_gpd,
                 expression_table, exponential_distribution, output,
                 specific_tempdir, tempdir
    Side effects: writes the payload to args.output (or stdout), progress to
    stderr, and removes args.tempdir unless a specific tempdir was requested.
    """
    sys.stderr.write("Reading reference fasta\n")
    # Load the whole FASTA into memory; close the handle when done.
    with open(args.reference_fasta, 'rb') as fasta_handle:
        ref_genome = FastaData(fasta_handle.read())
    sys.stderr.write("Reading in transcriptome\n")
    output = {}
    txome = Transcriptome()
    z = 0
    with open(args.reference_gpd) as inf:
        for line in inf:
            z += 1
            if z % 1000 == 0:
                sys.stderr.write(str(z)+"  \r")
            gpd = GPD(line)
            gpd.set_sequence(ref_genome)
            txome.add_transcript(gpd)
    sys.stderr.write("\n")
    sys.stderr.write("Serializing transcriptome\n")
    output['txome'] = txome.dump_serialized()
    txweights = {}
    weight_type = 'uniform_distribution'  # default
    if args.expression_table:
        weight_type = 'expression_table'
        # Expression table maps transcript name -> weight, one pair per line.
        if args.expression_table[-3:] == '.gz':
            opener = gzip.open
        else:
            opener = open
        with closing(opener(args.expression_table)) as table_handle:
            for line in table_handle:
                f = line.rstrip().split("\t")
                txweights[f[0]] = float(f[1])
    elif args.exponential_distribution:
        weight_type = 'exponential_distribution'
    output['weight_type'] = weight_type
    output['weights'] = txweights  # only matters for expression based
    of = sys.stdout
    if args.output:
        of = open(args.output, 'w')
    # NOTE(review): bytes + str concat — Python 2 only; under Python 3 this
    # would need base64.b64encode(...).decode('ascii'). Confirm target version.
    of.write(base64.b64encode(zlib.compress(pickle.dumps(output)))+"\n")
    # Only close handles we opened ourselves; never close sys.stdout here.
    if of is not sys.stdout:
        of.close()
    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)
def main(args):
    """Simulate sequencing reads from a serialized transcriptome emitter.

    Consumes the base64(zlib(pickle)) payload produced by the emitter-builder
    step, draws transcripts according to the encoded weight scheme, optionally
    trims/flips/fragments them, assigns qualities, injects errors, and writes
    FASTQ reads (single- or paired-end) plus optional provenance logs.

    :param args: parsed CLI arguments (output, sr/lr flags, emitter path,
                 seed, error/quality options, trim options, tempdir options)
    Side effects: writes FASTQ to args.output files or stdout, progress to
    stderr, and removes args.tempdir unless a specific tempdir was requested.
    """
    # check outputs
    if len(args.output) > 1 and not args.sr:
        sys.stderr.write(
            "Error: Long reads don't support multiple output files\n")
        sys.exit()
    elif len(args.output) > 2:
        # Fixed: was sys.stderr.wrtie (AttributeError on this path).
        sys.stderr.write(
            "Error: Short reads support at most two output files (paired end)\n"
        )
        sys.exit()
    if args.sr_length < args.minimum_read_length:
        args.minimum_read_length = args.sr_length
    inf = sys.stdin
    if args.emitter != '-':
        inf = open(args.emitter)
    sys.stderr.write("reading in transcriptome emitter\n")
    # NOTE(review): pickle.loads on external input — only feed trusted files.
    indata = pickle.loads(
        zlib.decompress(base64.b64decode(inf.read().rstrip())))
    txome = Transcriptome()
    txome.load_serialized(indata['txome'])
    rnum = RandomSource()
    rnum_tx = RandomSource()  # for drawing transcripts
    if args.seed:
        rnum = RandomSource(args.seed)
        rnum_tx = RandomSource(args.seed)
    # Load in error profile data
    ep = None
    if args.error_profile:
        sys.stderr.write("read in error profile\n")
        ep = ErrorProfilePermuter(args.error_profile, rnum,
                                  args.skew_profile_error_rate)
    txemitter = TranscriptomeEmitter(txome, rand=rnum_tx)
    if indata['weight_type'] == 'expression_table':
        sys.stderr.write(
            "Using expression table defined transcript expression\n")
        txweight = indata['weights']
        txemitter.set_weights_by_dict(txweight)
    elif indata['weight_type'] == 'exponential_distribution':
        sys.stderr.write(
            "ERROR not yet implemented exponential distribution\n")
        sys.exit()
    elif indata['weight_type'] == 'uniform_distribution':
        sys.stderr.write(
            "Using uniform distribution of transcript expression\n")
    cutter = MakeCuts(rand=rnum_tx)
    if args.sr:
        cutter.set_custom(args.sr_gauss_min, args.sr_gauss_mu,
                          args.sr_gauss_sigma)
    elif args.lr:
        cutter.set_custom(args.lr_gauss_min, args.lr_gauss_mu,
                          args.lr_gauss_sigma)
    # Prepare outputs
    of1 = sys.stdout
    if args.output[0][-3:] == '.gz':
        of1 = gzip.open(args.output[0], 'w')
    elif args.output[0] != '-':
        of1 = open(args.output[0], 'w')
    of2 = None
    if len(args.output) > 1:
        if args.output[1][-3:] == '.gz':
            of2 = gzip.open(args.output[1], 'w')
        # Fixed: guard tested args.output[0] and opened args.ouptput (typo).
        elif args.output[1] != '-':
            of2 = open(args.output[1], 'w')
    of_origin = None
    if args.output_original_source:
        if args.output_original_source[-3:] == '.gz':
            of_origin = gzip.open(args.output_original_source, 'w')
        else:
            of_origin = open(args.output_original_source, 'w')
    of_sc = None
    if args.output_sequence_change:
        if args.output_sequence_change[-3:] == '.gz':
            of_sc = gzip.open(args.output_sequence_change, 'w')
        else:
            of_sc = open(args.output_sequence_change, 'w')
    # Safety valve: give up after 100x the requested count of attempts so a
    # too-strict minimum_read_length cannot loop forever.
    absmax = args.count * 100
    finished_count = 0
    z = 0
    while finished_count < args.count:
        z += 1
        if z > absmax:
            break
        tx = txemitter.emit_transcript()
        seq = tx.get_sequence()
        stage1seq = seq
        if args.trim_5prime or args.trim_3prime:
            fivestart = 0
            threeend = len(seq)
            if args.trim_5prime:
                lcut = int(args.trim_5prime[0] * len(seq))
                rcut = int(args.trim_5prime[1] * len(seq))
                fivestart = rnum_tx.randint(lcut, rcut)
            if args.trim_3prime:
                lcut = int(args.trim_3prime[0] * len(seq))
                rcut = int(args.trim_3prime[1] * len(seq))
                threeend = rnum_tx.randint(lcut, rcut)
            # set sequence to its new trimmed bounds
            seq = seq[fivestart:threeend]
        # flip sequence if necessary
        if not args.no_flip:
            seq = random_flip(seq, rnum_tx)
        l_read = create_name(rnum)
        r_read = None
        if args.sr or args.lr:
            cutseq = cutter.get_cut(seq)
        else:
            cutseq = seq  #case for no_fragmentation
        ############# if we pass this we will really start with this one
        if len(cutseq) < args.minimum_read_length:
            continue
        # can now log our read name
        if of_origin:
            of_origin.write(l_read + "\t" + tx.get_gene_name() + "\t" +
                            tx.get_transcript_name() + "\n")
        stage2seq = cutseq
        r = None
        if args.sr:
            r_read = l_read
            l = cutseq[0:args.sr_length]
            r = rc(cutseq[-1 * args.sr_length:])
        elif args.lr:
            l = cutseq
        else:
            l = cutseq
        stage3left = l
        stage3right = r
        if not stage3right:
            stage3right = ''
        #################
        # l (or l and r) contains the sequence prior to errors being added
        l_qual = 'I' * len(l)
        r_qual = None
        if r:
            r_qual = 'I' * len(r)
        if args.fixed_quality:
            if len(args.fixed_quality) != 1:
                sys.stderr.write(
                    "ERROR fixed quality should be 1 character\n")
                sys.exit()
            l_qual = args.fixed_quality * len(l)
            if r:
                r_qual = args.fixed_quality * len(r)
        elif args.quality_from_error_rate:
            # Phred encoding: Q = -10*log10(p), offset 33.
            qchar = chr(
                int(-10 * math.log10(args.quality_from_error_rate)) + 33)
            l_qual = qchar * len(l)
            if r:
                r_qual = qchar * len(r)
        else:  #default is generate quality from profile
            if not ep:
                sys.stderr.write(
                    "ERROR: cannot generate quality from a profile. Set error profile or choose quality from error rate or fixed quality\n"
                )
                sys.exit()
            l_qual = ep.emit_qual(len(l))
            if r:
                r_qual = ep.emit_qual(len(r))
        # Now prior to errors l_qual and r_qual contain our qualities
        l_fastq = Fastq([l_read, l, '+', l_qual])
        r_fastq = None
        if r:
            r_fastq = Fastq([r_read, r, '+', r_qual])
        # Permute sequences by a specific error rate
        if args.specific_errors:
            rate = args.specific_errors
            me = MakeErrors(rand=rnum)
            if args.specific_before_context:
                me.set_before_context(args.specific_before_context)
            if args.specific_after_context:
                me.set_after_context(args.specific_after_context)
            if args.specific_reference_base:
                if args.specific_reference_base != '-':
                    me.set_observed_base(args.specific_reference_base)
            if args.specific_modified_base:
                if args.specific_modified_base != '-':
                    me.set_modified_base(args.specific_modified_base)
            if args.specific_reference_base == '-':  #doing insertions
                l_fastq = me.random_insertion(l_fastq, rate)
                if r_fastq:
                    r_fastq = me.random_insertion(r_fastq, rate)
            elif args.specific_modified_base == '-':  #doing deletions
                l_fastq = me.random_deletion(l_fastq, rate)
                # Fixed: right mate previously got random_insertion here.
                if r_fastq:
                    r_fastq = me.random_deletion(r_fastq, rate)
            else:
                l_fastq = me.random_substitution(l_fastq, rate)
                # Fixed: right mate previously got random_insertion here.
                if r_fastq:
                    r_fastq = me.random_substitution(r_fastq, rate)
        elif args.uniform_any_error:
            l_fastq = do_uniform_any(l_fastq, rnum, args.uniform_any_error)
            if r_fastq:
                r_fastq = do_uniform_any(r_fastq, rnum,
                                         args.uniform_any_error)
        elif args.uniform_mismatch_error:
            l_fastq = do_uniform_mismatch(l_fastq, rnum,
                                          args.uniform_mismatch_error)
            if r_fastq:
                r_fastq = do_uniform_mismatch(r_fastq, rnum,
                                              args.uniform_mismatch_error)
        elif args.any_error_by_quality:
            l_fastq = do_quality_any(l_fastq, rnum)
            if r_fastq:
                r_fastq = do_quality_any(r_fastq, rnum)
        elif args.mismatch_error_by_quality:
            l_fastq = do_quality_mismatch(l_fastq, rnum)
            if r_fastq:
                r_fastq = do_quality_mismatch(r_fastq, rnum)
        elif args.profile_context_error:
            l_fastq = ep.permute_context(l_fastq)
            if r_fastq:
                r_fastq = ep.permute_context(r_fastq)
        elif args.profile_general_error:
            l_fastq = ep.permute_general(l_fastq)
            if r_fastq:
                r_fastq = ep.permute_general(r_fastq)
        # if SR grown/shrink to appropriate length
        if args.sr and len(l_fastq) != args.sr_length:
            l_fastq = fit_length(l_fastq, args.sr_length, rnum)
        if r:
            if args.sr and len(r_fastq) != args.sr_length:
                r_fastq = fit_length(r_fastq, args.sr_length, rnum)
        of1.write(l_fastq.fastq())
        if of2:
            of2.write(r_fastq.fastq())
        stage4left = l_fastq.seq
        stage4right = ''
        # Fixed: stage4right must be set BEFORE the of_sc log line, otherwise
        # the right-read post-error sequence was always logged as empty.
        if r_fastq:
            stage4right = r_fastq.seq
        if of_sc:
            of_sc.write(l_fastq.name+"\t"+tx.get_gene_name()+"\t"+tx.get_transcript_name()+"\t" \
                + stage1seq+"\t"+stage2seq+"\t"+stage3left+"\t"+stage3right+"\t"+stage4left+"\t"+stage4right+"\n")
        finished_count += 1
        if finished_count % 1000 == 0:
            sys.stderr.write(
                str(finished_count) + '/' + str(args.count) + " \r")
    sys.stderr.write("\n")
    of1.close()
    if of2:
        of2.close()
    if of_origin:
        of_origin.close()
    if of_sc:
        of_sc.close()
    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)
def main(args):
    """Simulate sequencing reads from a serialized transcriptome emitter.

    Consumes the base64(zlib(pickle)) payload produced by the emitter-builder
    step, draws transcripts according to the encoded weight scheme, optionally
    trims/flips/fragments them, assigns qualities, injects errors, and writes
    FASTQ reads (single- or paired-end) plus optional provenance logs.

    :param args: parsed CLI arguments (output, sr/lr flags, emitter path,
                 seed, error/quality options, trim options, tempdir options)
    Side effects: writes FASTQ to args.output files or stdout, progress to
    stderr, and removes args.tempdir unless a specific tempdir was requested.
    """
    # check outputs
    if len(args.output) > 1 and not args.sr:
        sys.stderr.write("Error: Long reads don't support multiple output files\n")
        sys.exit()
    elif len(args.output) > 2:
        # Fixed: was sys.stderr.wrtie (AttributeError on this path).
        sys.stderr.write("Error: Short reads support at most two output files (paired end)\n")
        sys.exit()
    if args.sr_length < args.minimum_read_length:
        args.minimum_read_length = args.sr_length
    inf = sys.stdin
    if args.emitter != '-':
        inf = open(args.emitter)
    sys.stderr.write("reading in transcriptome emitter\n")
    # NOTE(review): pickle.loads on external input — only feed trusted files.
    indata = pickle.loads(zlib.decompress(base64.b64decode(inf.read().rstrip())))
    txome = Transcriptome()
    txome.load_serialized(indata['txome'])
    rnum = RandomSource()
    rnum_tx = RandomSource()  # for drawing transcripts
    if args.seed:
        rnum = RandomSource(args.seed)
        rnum_tx = RandomSource(args.seed)
    # Load in error profile data
    ep = None
    if args.error_profile:
        sys.stderr.write("read in error profile\n")
        ep = ErrorProfilePermuter(args.error_profile,rnum,args.skew_profile_error_rate)
    txemitter = TranscriptomeEmitter(txome,rand=rnum_tx)
    if indata['weight_type'] == 'expression_table':
        sys.stderr.write("Using expression table defined transcript expression\n")
        txweight = indata['weights']
        txemitter.set_weights_by_dict(txweight)
    elif indata['weight_type'] == 'exponential_distribution':
        sys.stderr.write("ERROR not yet implemented exponential distribution\n")
        sys.exit()
    elif indata['weight_type'] == 'uniform_distribution':
        sys.stderr.write("Using uniform distribution of transcript expression\n")
    cutter = MakeCuts(rand=rnum_tx)
    if args.sr:
        cutter.set_custom(args.sr_gauss_min,args.sr_gauss_mu,args.sr_gauss_sigma)
    elif args.lr:
        cutter.set_custom(args.lr_gauss_min,args.lr_gauss_mu,args.lr_gauss_sigma)
    # Prepare outputs
    of1 = sys.stdout
    if args.output[0][-3:] == '.gz':
        of1 = gzip.open(args.output[0],'w')
    elif args.output[0] != '-':
        of1 = open(args.output[0],'w')
    of2 = None
    if len(args.output) > 1:
        if args.output[1][-3:] == '.gz':
            of2 = gzip.open(args.output[1],'w')
        # Fixed: guard tested args.output[0] and opened args.ouptput (typo).
        elif args.output[1] != '-':
            of2 = open(args.output[1],'w')
    of_origin = None
    if args.output_original_source:
        if args.output_original_source[-3:]=='.gz':
            of_origin = gzip.open(args.output_original_source,'w')
        else:
            of_origin = open(args.output_original_source,'w')
    of_sc = None
    if args.output_sequence_change:
        if args.output_sequence_change[-3:]=='.gz':
            of_sc = gzip.open(args.output_sequence_change,'w')
        else:
            of_sc = open(args.output_sequence_change,'w')
    # Safety valve: give up after 100x the requested count of attempts so a
    # too-strict minimum_read_length cannot loop forever.
    absmax = args.count*100
    finished_count = 0
    z = 0
    while finished_count < args.count:
        z += 1
        if z > absmax:
            break
        tx = txemitter.emit_transcript()
        seq = tx.get_sequence()
        stage1seq = seq
        if args.trim_5prime or args.trim_3prime:
            fivestart = 0
            threeend = len(seq)
            if args.trim_5prime:
                lcut = int(args.trim_5prime[0]*len(seq))
                rcut = int(args.trim_5prime[1]*len(seq))
                fivestart = rnum_tx.randint(lcut,rcut)
            if args.trim_3prime:
                lcut = int(args.trim_3prime[0]*len(seq))
                rcut = int(args.trim_3prime[1]*len(seq))
                threeend = rnum_tx.randint(lcut,rcut)
            # set sequence to its new trimmed bounds
            seq = seq[fivestart:threeend]
        # flip sequence if necessary
        if not args.no_flip:
            seq = random_flip(seq,rnum_tx)
        l_read = create_name(rnum)
        r_read = None
        if args.sr or args.lr:
            cutseq = cutter.get_cut(seq)
        else:
            cutseq = seq #case for no_fragmentation
        ############# if we pass this we will really start with this one
        if len(cutseq) < args.minimum_read_length:
            continue
        # can now log our read name
        if of_origin:
            of_origin.write(l_read+"\t"+tx.get_gene_name()+"\t"+tx.get_transcript_name()+"\n")
        stage2seq = cutseq
        r = None
        if args.sr:
            r_read = l_read
            l = cutseq[0:args.sr_length]
            r = rc(cutseq[-1*args.sr_length:])
        elif args.lr:
            l = cutseq
        else:
            l = cutseq
        stage3left = l
        stage3right = r
        if not stage3right:
            stage3right = ''
        #################
        # l (or l and r) contains the sequence prior to errors being added
        l_qual = 'I'*len(l)
        r_qual = None
        if r:
            r_qual = 'I'*len(r)
        if args.fixed_quality:
            if len(args.fixed_quality) != 1:
                sys.stderr.write("ERROR fixed quality should be 1 character\n")
                sys.exit()
            l_qual = args.fixed_quality*len(l)
            if r:
                r_qual = args.fixed_quality*len(r)
        elif args.quality_from_error_rate:
            # Phred encoding: Q = -10*log10(p), offset 33.
            qchar = chr(int(-10*math.log10(args.quality_from_error_rate))+33)
            l_qual = qchar*len(l)
            if r:
                r_qual = qchar*len(r)
        else: #default is generate quality from profile
            if not ep:
                sys.stderr.write("ERROR: cannot generate quality from a profile. Set error profile or choose quality from error rate or fixed quality\n")
                sys.exit()
            l_qual = ep.emit_qual(len(l))
            if r:
                r_qual = ep.emit_qual(len(r))
        # Now prior to errors l_qual and r_qual contain our qualities
        l_fastq = Fastq([l_read,l,'+',l_qual])
        r_fastq = None
        if r:
            r_fastq = Fastq([r_read,r,'+',r_qual])
        # Permute sequences by a specific error rate
        if args.specific_errors:
            rate = args.specific_errors
            me = MakeErrors(rand=rnum)
            if args.specific_before_context:
                me.set_before_context(args.specific_before_context)
            if args.specific_after_context:
                me.set_after_context(args.specific_after_context)
            if args.specific_reference_base:
                if args.specific_reference_base != '-':
                    me.set_observed_base(args.specific_reference_base)
            if args.specific_modified_base:
                if args.specific_modified_base != '-':
                    me.set_modified_base(args.specific_modified_base)
            if args.specific_reference_base == '-': #doing insertions
                l_fastq = me.random_insertion(l_fastq,rate)
                if r_fastq:
                    r_fastq = me.random_insertion(r_fastq,rate)
            elif args.specific_modified_base == '-': #doing deletions
                l_fastq = me.random_deletion(l_fastq,rate)
                # Fixed: right mate previously got random_insertion here.
                if r_fastq:
                    r_fastq = me.random_deletion(r_fastq,rate)
            else:
                l_fastq = me.random_substitution(l_fastq,rate)
                # Fixed: right mate previously got random_insertion here.
                if r_fastq:
                    r_fastq = me.random_substitution(r_fastq,rate)
        elif args.uniform_any_error:
            l_fastq = do_uniform_any(l_fastq,rnum,args.uniform_any_error)
            if r_fastq:
                r_fastq = do_uniform_any(r_fastq,rnum,args.uniform_any_error)
        elif args.uniform_mismatch_error:
            l_fastq = do_uniform_mismatch(l_fastq,rnum,args.uniform_mismatch_error)
            if r_fastq:
                r_fastq = do_uniform_mismatch(r_fastq,rnum,args.uniform_mismatch_error)
        elif args.any_error_by_quality:
            l_fastq = do_quality_any(l_fastq,rnum)
            if r_fastq:
                r_fastq = do_quality_any(r_fastq,rnum)
        elif args.mismatch_error_by_quality:
            l_fastq = do_quality_mismatch(l_fastq,rnum)
            if r_fastq:
                r_fastq = do_quality_mismatch(r_fastq,rnum)
        elif args.profile_context_error:
            l_fastq = ep.permute_context(l_fastq)
            if r_fastq:
                r_fastq = ep.permute_context(r_fastq)
        elif args.profile_general_error:
            l_fastq = ep.permute_general(l_fastq)
            if r_fastq:
                r_fastq = ep.permute_general(r_fastq)
        # if SR grown/shrink to appropriate length
        if args.sr and len(l_fastq) != args.sr_length:
            l_fastq = fit_length(l_fastq,args.sr_length,rnum)
        if r:
            if args.sr and len(r_fastq) != args.sr_length:
                r_fastq = fit_length(r_fastq,args.sr_length,rnum)
        of1.write(l_fastq.fastq())
        if of2:
            of2.write(r_fastq.fastq())
        stage4left = l_fastq.seq
        stage4right = ''
        # Fixed: stage4right must be set BEFORE the of_sc log line, otherwise
        # the right-read post-error sequence was always logged as empty.
        if r_fastq:
            stage4right = r_fastq.seq
        if of_sc:
            of_sc.write(l_fastq.name+"\t"+tx.get_gene_name()+"\t"+tx.get_transcript_name()+"\t" \
                + stage1seq+"\t"+stage2seq+"\t"+stage3left+"\t"+stage3right+"\t"+stage4left+"\t"+stage4right+"\n")
        finished_count += 1
        if finished_count %1000==0:
            sys.stderr.write(str(finished_count)+'/'+str(args.count)+" \r")
    sys.stderr.write("\n")
    of1.close()
    if of2:
        of2.close()
    if of_origin:
        of_origin.close()
    if of_sc:
        of_sc.close()
    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)