def process_long_ccs_read_buffer(rbe, buffer, args):
  """Emit one PacBio-CCS-style FASTQ entry per value in buffer.

  rbe    - emitter; assumed to provide emit_long_read() -> [name, seq]
           and an emissions_report attribute (TODO confirm against
           RandomBiallelicTranscriptomeEmitter).
  buffer - list of integer read numbers used to build the read names.
  args   - parsed arguments; only args.no_errors is read here.

  Returns [fastq_text, number_of_reads, rbe.emissions_report].

  Fixes vs. original: builds the FASTQ with a list + join instead of
  quadratic string +=, drops the unused local `zend`, and constructs the
  error profile lazily so it is never built when --no_errors is set.
  """
  fq_prof = None  # error profile, built only if we actually permute reads
  parts = []
  for z in buffer:
    [name, seq] = rbe.emit_long_read()  # name is unused; only seq is emitted
    g = 'm150101_010101_11111_c111111111111111111_s1_p0/' + str(z) + '/ccs'
    parts.append("@" + g + "\n")
    if args.no_errors:
      # perfect read: quality is all 'I'
      parts.append(seq + "\n")
      parts.append("+\n")
      parts.append(len(seq) * 'I' + "\n")
    else:
      if fq_prof is None:
        fq_prof = default_pacbio_ccs95()
      seqperm = fq_prof.create_fastq_and_permute_sequence(seq)
      parts.append(seqperm['seq'] + "\n")
      parts.append("+\n")
      parts.append(seqperm['qual'] + "\n")
  return ["".join(parts), len(buffer), rbe.emissions_report]
def process_long_ccs_read_buffer(rbe, buffer, args):
    """Emit one PacBio-CCS-style FASTQ entry per value in buffer.

    rbe    - emitter; assumed to provide emit_long_read() -> [name, seq]
             and an emissions_report attribute (TODO confirm against
             RandomBiallelicTranscriptomeEmitter).
    buffer - list of integer read numbers used to build the read names.
    args   - parsed arguments; only args.no_errors is read here.

    Returns [fastq_text, number_of_reads, rbe.emissions_report].

    Fixes vs. original: builds the FASTQ with a list + join instead of
    quadratic string +=, drops the unused local `zend`, and constructs
    the error profile lazily so it is never built when --no_errors is
    set.
    """
    fq_prof = None  # error profile, built only when reads are permuted
    parts = []
    for z in buffer:
        # name is unused; only the sequence is written out
        [name, seq] = rbe.emit_long_read()
        g = 'm150101_010101_11111_c111111111111111111_s1_p0/' + str(
            z) + '/ccs'
        parts.append("@" + g + "\n")
        if args.no_errors:
            # perfect read: quality string is all 'I'
            parts.append(seq + "\n")
            parts.append("+\n")
            parts.append(len(seq) * 'I' + "\n")
        else:
            if fq_prof is None:
                fq_prof = default_pacbio_ccs95()
            seqperm = fq_prof.create_fastq_and_permute_sequence(seq)
            parts.append(seqperm['seq'] + "\n")
            parts.append("+\n")
            parts.append(seqperm['qual'] + "\n")
    return ["".join(parts), len(buffer), rbe.emissions_report]
def main():
    """Command-line entry point: simulate a biallelic RNA-seq dataset.

    Builds (or deserializes) a RandomBiallelicTranscriptomeEmitter, then
    writes into --output: gzipped paired short-read FASTQ (SR_1/SR_2),
    PacBio-style CCS long reads (LR_ccs95) and subreads (LR_subreads),
    plus a per-transcript report for each read set and a combined report.
    Reads are generated in buffers, dispatched to a multiprocessing Pool
    when --threads > 1.
    """
    parser = argparse.ArgumentParser(
        description="Create a simulated RNA-seq dataset")
    group0 = parser.add_mutually_exclusive_group(required=True)
    group0.add_argument(
        '--load_biallelic_transcriptome',
        help=
        "SERIALIZED BIALLELIC TRANSCRIOTOME EMITTER FILE to load up and use instead of all other file inputs"
    )
    group0.add_argument(
        '--inputs',
        nargs=3,
        help="<reference_genome> <phased_VCF> <transcripts_genepred>")
    #parser.add_argument('reference_genome',help="The reference genome.")
    #parser.add_argument('phased_VCF',help="A phased VCF file. If you are simulating the genomes that step can make on of these for you.")
    #parser.add_argument('transcripts_genepred',help="A genepred file describing the transcripts. Each transcript name must be unique.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--uniform_expression',
                       action='store_true',
                       help="Uniform distribution of transcript expression")
    group.add_argument(
        '--isoform_expression',
        help=
        "The transcript expression in TSV format <Transcript name> tab <Expression>"
    )
    group.add_argument(
        '--cufflinks_isoform_expression',
        help=
        "The expression of the isoforms or - for a uniform distribution of transcript expression"
    )
    group2 = parser.add_mutually_exclusive_group()
    group2.add_argument(
        '--ASE_identical',
        type=float,
        help=
        "The ASE for the transcriptome, every isoform will have the same allele preference."
    )
    group2.add_argument('--ASE_isoform_random',
                        action='store_true',
                        help="The ASE will be random for every isoform.")
    group2.add_argument(
        '--ASE_locus_random',
        action='store_true',
        help="The ASE will be randomly assigned for each locus")
    parser.add_argument('--short_read_count',
                        type=int,
                        default=10000,
                        help="INT number of short reads")
    parser.add_argument('--short_read_length',
                        type=int,
                        default=101,
                        help="INT length of the short reads")
    parser.add_argument('--long_read_ccs_count',
                        type=int,
                        default=4000,
                        help="INT default number of long reads")
    parser.add_argument('--long_read_subread_count',
                        type=int,
                        default=4000,
                        help="INT default number of long reads")
    parser.add_argument('--no_errors',
                        action='store_true',
                        help="Do not simulate errors in reads")
    parser.add_argument('--threads',
                        type=int,
                        default=cpu_count(),
                        help="Number of threads defaults to cpu_count()")
    parser.add_argument(
        '--locus_by_gene_name',
        action='store_true',
        help="Faster than the complete calculation for overlapping loci.")
    parser.add_argument(
        '--seed',
        type=int,
        help=
        "seed to make transcriptome and rho creation deterministic. Reads are still random, its just the transcriptome and rho that become determinisitic."
    )
    group3 = parser.add_mutually_exclusive_group(required=True)
    group3.add_argument('--output', help="Directory name for output")
    group3.add_argument(
        '--save_biallelic_transcriptome',
        help=
        "FILENAME output the biallelic transcriptome used to this file and then exit"
    )
    parser.add_argument(
        '--starting_read_multiplier',
        type=int,
        default=0,
        help=
        "Used if outputting different reads from object, and you want them number differently give each different set values 0, 1, 2, etc..."
    )
    args = parser.parse_args()
    # Error profiles are only needed when we are simulating sequencing errors.
    fq_prof_illumina = None
    fq_prof_pacbio_ccs95 = None
    fq_prof_pacbio_subreads = None
    if not args.no_errors:
        fq_prof_illumina = default_illumina()
        fq_prof_pacbio_ccs95 = default_pacbio_ccs95()
        fq_prof_pacbio_subreads = default_pacbio_subreads()
    rbe = None
    if not args.load_biallelic_transcriptome:
        # we need to establish the emitter based on some known data
        rbe = load_from_inputs(args)
    else:
        # Deserialize a previously saved emitter from the first line of the file.
        rbe = SimulationBasics.RandomBiallelicTranscriptomeEmitter()
        inf = open(args.load_biallelic_transcriptome)
        sline = inf.readline().rstrip()
        inf.close()
        rbe.read_serialized(sline)
    if args.save_biallelic_transcriptome:
        # Save-and-exit mode: write the serialized emitter and stop.
        ofser = open(args.save_biallelic_transcriptome, 'w')
        ofser.write(rbe.get_serialized())
        ofser.close()
        return  #exiting here
    # Lets prepare to output now
    args.output = args.output.rstrip('/')
    if not os.path.exists(args.output):
        os.makedirs(args.output)
    # Always snapshot the emitter used for this run alongside the reads.
    ofser = open(
        args.output + "/RandomBiallelicTranscriptomeEmitter.serialized", 'w')
    ofser.write(rbe.get_serialized())
    ofser.close()
    # ---- Short reads (paired) -------------------------------------------
    rbe.set_gaussian_fragmentation_default_hiseq()
    #rbe_ser = rbe.get_serialized()
    sys.stderr.write("Sequencing short reads\n")
    # shand1/shand2 are module-level handles; presumably written by the
    # write_short_reads callback defined elsewhere in this file -- confirm.
    global shand1
    shand1 = gzip.open(args.output + "/SR_1.fq.gz", 'wb')
    global shand2
    shand2 = gzip.open(args.output + "/SR_2.fq.gz", 'wb')
    z = 0
    buffer_full_size = 5000  # reads per dispatched work unit
    buffer = []
    if args.threads > 1:
        p = Pool(processes=args.threads)
    # Read numbers are offset by starting_read_multiplier so separate runs
    # can produce non-overlapping read names.
    for i in range(args.short_read_count * args.starting_read_multiplier,
                   args.short_read_count *
                   (args.starting_read_multiplier + 1)):
        z = i + 1
        buffer.append(z)
        if buffer_full_size <= len(buffer):
            vals = buffer[:]
            buffer = []
            if args.threads > 1:
                p.apply_async(process_short_read_buffer,
                              args=(rbe, vals, args),
                              callback=write_short_reads)
            else:
                oval = process_short_read_buffer(rbe, vals, args)
                write_short_reads(oval)
    # Flush any partial final buffer.
    if len(buffer) > 0:
        vals = buffer[:]
        buffer = []
        if args.threads > 1:
            p.apply_async(process_short_read_buffer,
                          args=(rbe, vals, args),
                          callback=write_short_reads)
        else:
            oval = process_short_read_buffer(rbe, vals, args)
            write_short_reads(oval)
    if args.threads > 1:
        p.close()
        p.join()
    sys.stderr.write("\nFinished sequencing short reads\n")
    shand1.close()
    shand2.close()
    # emissions_reports is a module-level list populated during the run.
    # NOTE(review): entries are resolved with .get() as if they were async
    # results even in the single-threaded path -- confirm write_short_reads
    # stores compatible objects in both modes.
    global emissions_reports
    for i in range(0, len(emissions_reports)):
        emissions_reports[i] = emissions_reports[i].get()
    sr_report = combine_reports(emissions_reports)
    rbe.emissions_report = {}  # initialize so we don't accidentally overwrite
    # Now lets print out some of the emission details
    of = open(args.output + "/SR_report.txt", 'w')
    for name in sorted(rbe.name2locus.keys()):
        express = 1
        if rbe.transcriptome1.expression:
            express = rbe.transcriptome1.expression.get_expression(name)
        if name in sr_report:
            of.write(name + "\t" + rbe.gene_names[name] + "\t" +
                     str(rbe.name2locus[name]) + "\t" + str(express) + "\t" +
                     str(rbe.transcriptome1_rho[name]) + "\t" +
                     str(sr_report[name][0]) + "\t" + str(sr_report[name][1]) +
                     "\n")
        else:
            # Transcript never emitted: report zero counts.
            of.write(name + "\t" + rbe.gene_names[name] + "\t" +
                     str(rbe.name2locus[name]) + "\t" + str(express) + "\t" +
                     str(rbe.transcriptome1_rho[name]) + "\t" + str(0) + "\t" +
                     str(0) + "\n")
    of.close()
    rbe.emissions_report = {}
    emissions_reports = []
    # ---- Long CCS reads --------------------------------------------------
    # Now lets create the long read set
    rbe.set_gaussian_fragmentation_default_pacbio()
    #rbe_ser = rbe.get_serialized()
    sys.stderr.write("Sequencing long ccs reads\n")
    shand1 = gzip.open(args.output + "/LR_ccs95.fq.gz", 'wb')
    buffer_full_size = 500  # long reads are heavier; smaller work units
    buffer = []
    if args.threads > 1:
        p = Pool(processes=args.threads)
    for i in range(args.starting_read_multiplier * args.long_read_ccs_count,
                   (args.starting_read_multiplier + 1) *
                   args.long_read_ccs_count):
        z = i + 1
        buffer.append(z)
        if buffer_full_size <= len(buffer):
            vals = buffer[:]
            buffer = []
            if args.threads > 1:
                p.apply_async(process_long_ccs_read_buffer,
                              args=(rbe, vals, args),
                              callback=write_long_reads)
            else:
                oval = process_long_ccs_read_buffer(rbe, vals, args)
                write_long_reads(oval)
    if len(buffer) > 0:
        vals = buffer[:]
        buffer = []
        if args.threads > 1:
            p.apply_async(process_long_ccs_read_buffer,
                          args=(rbe, vals, args),
                          callback=write_long_reads)
        else:
            oval = process_long_ccs_read_buffer(rbe, vals, args)
            write_long_reads(oval)
    if args.threads > 1:
        p.close()
        p.join()
    sys.stderr.write("\nFinished sequencing long reads\n")
    shand1.close()
    for i in range(0, len(emissions_reports)):
        emissions_reports[i] = emissions_reports[i].get()
    lr_ccs_report = combine_reports(emissions_reports)
    rbe.emissions_report = {}  # initialize so we don't accidentally overwrite
    # Now lets print out some of the emission details
    of = open(args.output + "/LR_ccs95_report.txt", 'w')
    for name in sorted(rbe.name2locus.keys()):
        express = 1
        if rbe.transcriptome1.expression:
            express = rbe.transcriptome1.expression.get_expression(name)
        if name in lr_ccs_report:
            of.write(name + "\t" + rbe.gene_names[name] + "\t" +
                     str(rbe.name2locus[name]) + "\t" + str(express) + "\t" +
                     str(rbe.transcriptome1_rho[name]) + "\t" +
                     str(lr_ccs_report[name][0]) + "\t" +
                     str(lr_ccs_report[name][1]) + "\n")
        else:
            of.write(name + "\t" + rbe.gene_names[name] + "\t" +
                     str(rbe.name2locus[name]) + "\t" + str(express) + "\t" +
                     str(rbe.transcriptome1_rho[name]) + "\t" + str(0) + "\t" +
                     str(0) + "\n")
    of.close()
    rbe.emissions_report = {}
    emissions_reports = []
    # ---- Long subreads ---------------------------------------------------
    # Now lets create the long subread read set
    rbe.set_gaussian_fragmentation_default_pacbio()
    #rbe_ser = rbe.get_serialized()
    sys.stderr.write("Sequencing long subreads\n")
    shand1 = gzip.open(args.output + "/LR_subreads.fq.gz", 'wb')
    buffer_full_size = 500
    buffer = []
    if args.threads > 1:
        p = Pool(processes=args.threads)
    for i in range(
            args.long_read_subread_count * args.starting_read_multiplier,
        (args.starting_read_multiplier + 1) * args.long_read_subread_count):
        z = i + 1
        buffer.append(z)
        if buffer_full_size <= len(buffer):
            vals = buffer[:]
            buffer = []
            if args.threads > 1:
                p.apply_async(process_long_sub_read_buffer,
                              args=(rbe, vals, args),
                              callback=write_long_reads)
            else:
                oval = process_long_sub_read_buffer(rbe, vals, args)
                write_long_reads(oval)
    if len(buffer) > 0:
        vals = buffer[:]
        buffer = []
        if args.threads > 1:
            p.apply_async(process_long_sub_read_buffer,
                          args=(rbe, vals, args),
                          callback=write_long_reads)
        else:
            oval = process_long_sub_read_buffer(rbe, vals, args)
            write_long_reads(oval)
    if args.threads > 1:
        p.close()
        p.join()
    sys.stderr.write("\nFinished sequencing long reads\n")
    shand1.close()
    for i in range(0, len(emissions_reports)):
        emissions_reports[i] = emissions_reports[i].get()
    lr_sub_report = combine_reports(emissions_reports)
    rbe.emissions_report = {}  # initialize so we don't accidentally overwrite
    # Now lets print out some of the emission details
    of = open(args.output + "/LR_subreads_report.txt", 'w')
    for name in sorted(rbe.name2locus.keys()):
        express = 1
        if rbe.transcriptome1.expression:
            express = rbe.transcriptome1.expression.get_expression(name)
        if name in lr_sub_report:
            of.write(name + "\t" + rbe.gene_names[name] + "\t" +
                     str(rbe.name2locus[name]) + "\t" + str(express) + "\t" +
                     str(rbe.transcriptome1_rho[name]) + "\t" +
                     str(lr_sub_report[name][0]) + "\t" +
                     str(lr_sub_report[name][1]) + "\n")
        else:
            of.write(name + "\t" + rbe.gene_names[name] + "\t" +
                     str(rbe.name2locus[name]) + "\t" + str(express) + "\t" +
                     str(rbe.transcriptome1_rho[name]) + "\t" + str(0) + "\t" +
                     str(0) + "\n")
    of.close()
    # ---- Combined report across all three read sets ----------------------
    combo_report = combine_reports([sr_report, lr_ccs_report, lr_sub_report])
    of = open(args.output + "/LR_SR_combo_report.txt", 'w')
    for name in sorted(rbe.name2locus.keys()):
        express = 1
        if rbe.transcriptome1.expression:
            express = rbe.transcriptome1.expression.get_expression(name)
        if name in combo_report:
            of.write(name + "\t" + rbe.gene_names[name] + "\t" +
                     str(rbe.name2locus[name]) + "\t" + str(express) + "\t" +
                     str(rbe.transcriptome1_rho[name]) + "\t" +
                     str(combo_report[name][0]) + "\t" +
                     str(combo_report[name][1]) + "\n")
        else:
            of.write(name + "\t" + rbe.gene_names[name] + "\t" +
                     str(rbe.name2locus[name]) + "\t" + str(express) + "\t" +
                     str(rbe.transcriptome1_rho[name]) + "\t" + str(0) + "\t" +
                     str(0) + "\n")
    of.close()
def main():
    """Command-line entry point: simulate an RNA-seq dataset (Python 2).

    Builds a RandomTranscriptomeEmitter from a reference genome and a
    genePred annotation.  In --short_reads_only / --long_reads_only modes
    it prints FASTQ to stdout and returns.  Otherwise it writes gzipped
    paired short reads, CCS long reads and subreads into --output along
    with per-set reports and a combined report, dispatching buffers to a
    multiprocessing Pool when --threads > 1.
    """
    parser = argparse.ArgumentParser(
        description="Create a simulated RNA-seq dataset")
    parser.add_argument('reference_genome', help="The reference genome.")
    parser.add_argument(
        'transcripts_genepred',
        help=
        "A genepred file describing the transcripts. Each transcript name must be unique."
    )
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--uniform_expression',
                       action='store_true',
                       help="Uniform distribution of transcript expression")
    group.add_argument(
        '--isoform_expression',
        help=
        "The transcript expression in TSV format <Transcript name> tab <Expression>"
    )
    group.add_argument(
        '--cufflinks_isoform_expression',
        help=
        "The expression of the isoforms or - for a uniform distribution of transcript expression"
    )
    group2 = parser.add_mutually_exclusive_group()
    group2.add_argument('--long_reads_only', action='store_true')
    group2.add_argument('--short_reads_only', action='store_true')
    group2.add_argument('--output', help="Directory name for output")
    parser.add_argument('--short_read_count',
                        type=int,
                        default=10000,
                        help="INT number of short reads")
    parser.add_argument('--short_read_length',
                        type=int,
                        default=101,
                        help="INT length of the short reads")
    parser.add_argument('--long_read_count',
                        type=int,
                        default=4000,
                        help="INT default number of long reads")
    parser.add_argument('--no_errors', action='store_true')
    parser.add_argument('--threads', type=int, default=1)
    args = parser.parse_args()
    if args.output:
        args.output = args.output.rstrip('/')
    # Error profiles are only constructed when errors are being simulated.
    fq_prof_pacbio_ccs95 = None
    fq_prof_pacbio_subreads = None
    fq_prof_illumina = None
    if not args.no_errors:
        fq_prof_pacbio_ccs95 = default_pacbio_ccs95()
        fq_prof_pacbio_subreads = default_pacbio_subreads()
        fq_prof_illumina = default_illumina()
    # Load the genome and annotation into a Transcriptome object.
    ref = read_fasta_into_hash(args.reference_genome)
    txn = Transcriptome()
    txn.set_reference_genome_dictionary(ref)
    with open(args.transcripts_genepred) as inf:
        for line in inf:
            if line[0] == '#':
                continue  # skip comment lines
            txn.add_genepred_line(line.rstrip())
    if args.isoform_expression:
        sys.stderr.write("Reading expression from a TSV\n")
        with open(args.isoform_expression) as inf:
            line1 = inf.readline()  # header line is discarded
            for line in inf:
                f = line.rstrip().split("\t")
                txn.add_expression(f[0], float(f[1]))
    elif args.uniform_expression:
        sys.stderr.write("Using uniform expression model\n")
    elif args.cufflinks_isoform_expression:
        sys.stderr.write("Using cufflinks expression\n")
        with open(args.cufflinks_isoform_expression) as inf:
            line1 = inf.readline()  # header line is discarded
            for line in inf:
                f = line.rstrip().split("\t")
                # column 10 of cufflinks isoform tracking is FPKM --
                # presumably; verify against the cufflinks output format.
                txn.add_expression(f[0], float(f[9]))
    sys.stderr.write("have transcriptome\n")
    # Free the reference sequences; the emitter no longer needs them.
    # (Safe in Python 2 where keys() returns a list copy.)
    for n in txn.ref_hash.keys():
        del txn.ref_hash[n]
    rbe = SimulationBasics.RandomTranscriptomeEmitter(txn)
    # Now we have the transcriptomes set
    #Now our dataset is set up
    if args.short_reads_only:
        # Stream short reads straight to stdout and exit.
        rbe.set_gaussian_fragmentation_default_hiseq()
        for zi in range(0, args.short_read_count):
            [name, seq] = rbe.emit_short_read(args.short_read_length)
            if args.no_errors:
                print "@SRSIM" + str(zi + 1)
                print seq
                print "+"
                print 'I' * len(seq)
            else:
                l1perm = fq_prof_illumina.create_fastq_and_permute_sequence(
                    seq)
                print "@SRSIM" + str(zi + 1)
                print l1perm['seq']
                print "+"
                print l1perm['qual']
        return
    if args.long_reads_only:
        # Stream PacBio-style long reads straight to stdout and exit.
        rbe.set_gaussian_fragmentation_default_pacbio()
        for zi in range(0, args.long_read_count):
            [name, seq] = rbe.emit_long_read()
            if args.no_errors:
                g = 'm150101_010101_11111_c111111111111111111_s1_p0/' + str(
                    zi + 1) + '/ccs'
                print "@" + g
                print seq
                print "+"
                print 'I' * len(seq)
            else:
                g = 'm150101_010101_11111_c111111111111111111_s1_p0/' + str(
                    zi + 1) + '/ccs'
                seqperm = fq_prof_pacbio_ccs95.create_fastq_and_permute_sequence(
                    seq)
                print "@" + g
                print seqperm['seq']
                print "+"
                print seqperm['qual']
        return
    if not os.path.exists(args.output):
        os.makedirs(args.output)
    # ---- Short reads (paired) -------------------------------------------
    rbe.set_gaussian_fragmentation_default_hiseq()
    # Lets prepare to output now
    sys.stderr.write("Sequencing short reads\n")
    # left_handle/right_handle are module-level; presumably written by the
    # do_short callback defined elsewhere in this file -- confirm.
    global left_handle
    global right_handle
    left_handle = gzip.open(args.output + "/SR_1.fq.gz", 'wb')
    right_handle = gzip.open(args.output + "/SR_2.fq.gz", 'wb')
    buffer_size = 10000  # reads per dispatched work unit
    buffer = []
    if args.threads > 1:
        p = Pool(processes=args.threads)
    z = 0
    for i in range(0, args.short_read_count):
        z = i + 1
        if z % 1000 == 0:
            sys.stderr.write(str(z) + "\r")  # progress indicator
        buffer.append(z)
        if len(buffer) >= buffer_size:
            if args.threads <= 1:
                v = process_short_read_buffer(buffer[:], rbe, args,
                                              fq_prof_illumina)
                do_short(v)
            else:
                p.apply_async(process_short_read_buffer,
                              args=(buffer[:], rbe, args, fq_prof_illumina),
                              callback=do_short)
            buffer = []
    # Flush any partial final buffer.
    if len(buffer) > 0:
        if args.threads <= 1:
            v = process_short_read_buffer(buffer[:], rbe, args,
                                          fq_prof_illumina)
            do_short(v)
        else:
            p.apply_async(process_short_read_buffer,
                          args=(buffer[:], rbe, args, fq_prof_illumina),
                          callback=do_short)
        buffer = []
    if args.threads > 1:
        p.close()
        p.join()
    # greport is a module-level dict accumulated by the callbacks -- confirm.
    global greport
    of = open(args.output + "/SR_report.txt", 'w')
    for name in greport:
        of.write("\t".join([str(x) for x in greport[name]]) + "\n")
    of.close()
    greport = {}
    sys.stderr.write("\nFinished sequencing short reads\n")
    left_handle.close()
    right_handle.close()
    # ---- Long CCS reads --------------------------------------------------
    # Now lets create the long read set
    rbe.set_gaussian_fragmentation_default_pacbio()
    sys.stderr.write("Sequencing ccs long reads\n")
    global long_handle
    long_handle = gzip.open(args.output + "/LR_ccs.fq.gz", 'wb')
    buffer_size = 1000  # long reads are heavier; smaller work units
    buffer = []
    if args.threads > 1:
        p = Pool(processes=args.threads)
    z = 0
    for i in range(0, args.long_read_count):
        z = i + 1
        if z % 100 == 0:
            sys.stderr.write(str(z) + "\r")
        buffer.append(z)
        if len(buffer) >= buffer_size:
            if args.threads <= 1:
                v = process_long_reads(buffer[:], rbe, args,
                                       fq_prof_pacbio_ccs95, 'ccs')
                do_long(v)
            else:
                p.apply_async(process_long_reads,
                              args=(buffer[:], rbe, args,
                                    fq_prof_pacbio_ccs95, 'ccs'),
                              callback=do_long)
            buffer = []
    if len(buffer) > 0:
        if args.threads <= 1:
            v = process_long_reads(buffer[:], rbe, args, fq_prof_pacbio_ccs95,
                                   'ccs')
            do_long(v)
        else:
            p.apply_async(process_long_reads,
                          args=(buffer[:], rbe, args, fq_prof_pacbio_ccs95,
                                'ccs'),
                          callback=do_long)
        buffer = []
    if args.threads > 1:
        p.close()
        p.join()
    long_handle.close()
    of = open(args.output + "/LR_ccs_report.txt", 'w')
    for name in greport:
        of.write("\t".join([str(x) for x in greport[name]]) + "\n")
    of.close()
    greport = {}
    sys.stderr.write("\nFinished sequencing ccs long reads\n")
    # ---- Long subreads ---------------------------------------------------
    sys.stderr.write("Sequencing long sub reads\n")
    long_handle = gzip.open(args.output + "/LR_sub.fq.gz", 'wb')
    buffer_size = 1000
    buffer = []
    if args.threads > 1:
        p = Pool(processes=args.threads)
    # Read numbering continues from z so subread names don't collide with
    # the ccs read names.
    for i in range(z, z + args.long_read_count):
        z = i + 1
        if z % 100 == 0:
            sys.stderr.write(str(z) + "\r")
        buffer.append(z)
        if len(buffer) >= buffer_size:
            if args.threads <= 1:
                v = process_long_reads(buffer[:], rbe, args,
                                       fq_prof_pacbio_subreads, 'sub')
                do_long(v)
            else:
                p.apply_async(process_long_reads,
                              args=(buffer[:], rbe, args,
                                    fq_prof_pacbio_subreads, 'sub'),
                              callback=do_long)
            buffer = []
    if len(buffer) > 0:
        if args.threads <= 1:
            v = process_long_reads(buffer[:], rbe, args,
                                   fq_prof_pacbio_subreads, 'sub')
            do_long(v)
        else:
            p.apply_async(process_long_reads,
                          args=(buffer[:], rbe, args, fq_prof_pacbio_subreads,
                                'sub'),
                          callback=do_long)
        buffer = []
    if args.threads > 1:
        p.close()
        p.join()
    long_handle.close()
    of = open(args.output + "/LR_sub_report.txt", 'w')
    for name in greport:
        of.write("\t".join([str(x) for x in greport[name]]) + "\n")
    of.close()
    greport = {}
    sys.stderr.write("\nFinished sequencing long sub reads\n")
    # ---- Combined report: re-read the three per-set reports and sum the
    # per-transcript counts ------------------------------------------------
    combo = {}
    with open(args.output + "/SR_report.txt") as inf:
        for line in inf:
            f = line.rstrip().split("\t")
            [name, express, left] = f
            if name not in combo:
                combo[name] = {}
                combo[name]['express'] = express
                combo[name]['left'] = 0
            combo[name]['left'] += int(left)
    with open(args.output + "/LR_ccs_report.txt") as inf:
        for line in inf:
            f = line.rstrip().split("\t")
            [name, express, left] = f
            if name not in combo:
                combo[name] = {}
                combo[name]['express'] = express
                combo[name]['left'] = 0
            combo[name]['left'] += int(left)
    with open(args.output + "/LR_sub_report.txt") as inf:
        for line in inf:
            f = line.rstrip().split("\t")
            [name, express, left] = f
            if name not in combo:
                combo[name] = {}
                combo[name]['express'] = express
                combo[name]['left'] = 0
            combo[name]['left'] += int(left)
    of = open(args.output + "/LR_SR_combo_report.txt", 'w')
    for name in sorted(combo):
        of.write(name + "\t" + combo[name]['express'] + "\t" +
                 str(combo[name]['left']) + "\n")
    of.close()
def main():
  """Command-line entry point: simulate an RNA-seq dataset (Python 2).

  Same pipeline as the formatted variant elsewhere in this file: build a
  RandomTranscriptomeEmitter from a reference genome and genePred file;
  --short_reads_only / --long_reads_only print FASTQ to stdout and return;
  otherwise gzipped short reads, ccs long reads and subreads plus reports
  are written into --output, optionally via a multiprocessing Pool.
  """
  parser = argparse.ArgumentParser(description="Create a simulated RNA-seq dataset")
  parser.add_argument('reference_genome',help="The reference genome.")
  parser.add_argument('transcripts_genepred',help="A genepred file describing the transcripts. Each transcript name must be unique.")
  group = parser.add_mutually_exclusive_group(required=True)
  group.add_argument('--uniform_expression',action='store_true',help="Uniform distribution of transcript expression")
  group.add_argument('--isoform_expression',help="The transcript expression in TSV format <Transcript name> tab <Expression>")
  group.add_argument('--cufflinks_isoform_expression',help="The expression of the isoforms or - for a uniform distribution of transcript expression")
  group2 = parser.add_mutually_exclusive_group()
  group2.add_argument('--long_reads_only',action='store_true')
  group2.add_argument('--short_reads_only',action='store_true')
  group2.add_argument('--output',help="Directory name for output")
  parser.add_argument('--short_read_count',type=int,default=10000,help="INT number of short reads")
  parser.add_argument('--short_read_length',type=int,default=101,help="INT length of the short reads")
  parser.add_argument('--long_read_count',type=int,default=4000,help="INT default number of long reads")
  parser.add_argument('--no_errors',action='store_true')
  parser.add_argument('--threads',type=int,default=1)
  args = parser.parse_args()
  if args.output:
    args.output = args.output.rstrip('/')
  # Error profiles are only constructed when errors are being simulated.
  fq_prof_pacbio_ccs95 = None
  fq_prof_pacbio_subreads = None
  fq_prof_illumina = None
  if not args.no_errors:
    fq_prof_pacbio_ccs95 = default_pacbio_ccs95()
    fq_prof_pacbio_subreads = default_pacbio_subreads()
    fq_prof_illumina = default_illumina()
  # Load genome and annotation into a Transcriptome object.
  ref = read_fasta_into_hash(args.reference_genome)
  txn = Transcriptome()
  txn.set_reference_genome_dictionary(ref)
  with open(args.transcripts_genepred) as inf:
    for line in inf:
      if line[0]=='#':
        continue  # skip comment lines
      txn.add_genepred_line(line.rstrip())
  if args.isoform_expression:
    sys.stderr.write("Reading expression from a TSV\n")
    with open(args.isoform_expression) as inf:
      line1 = inf.readline()  # header line is discarded
      for line in inf:
        f = line.rstrip().split("\t")
        txn.add_expression(f[0],float(f[1]))
  elif args.uniform_expression:
    sys.stderr.write("Using uniform expression model\n")
  elif args.cufflinks_isoform_expression:
    sys.stderr.write("Using cufflinks expression\n")
    with open(args.cufflinks_isoform_expression) as inf:
      line1 = inf.readline()  # header line is discarded
      for line in inf:
        f = line.rstrip().split("\t")
        # column 10 is presumably FPKM -- verify against cufflinks format
        txn.add_expression(f[0],float(f[9]))
  sys.stderr.write("have transcriptome\n")
  # Free the reference sequences; safe in Python 2 where keys() is a copy.
  for n in txn.ref_hash.keys():
    del txn.ref_hash[n]
  rbe = SimulationBasics.RandomTranscriptomeEmitter(txn)
  # Now we have the transcriptomes set
  #Now our dataset is set up
  if args.short_reads_only:
    # Stream short reads straight to stdout and exit.
    rbe.set_gaussian_fragmentation_default_hiseq()
    for zi in range(0,args.short_read_count):
      [name,seq] = rbe.emit_short_read(args.short_read_length)
      if args.no_errors:
        print "@SRSIM"+str(zi+1)
        print seq
        print "+"
        print 'I'*len(seq)
      else:
        l1perm = fq_prof_illumina.create_fastq_and_permute_sequence(seq)
        print "@SRSIM"+str(zi+1)
        print l1perm['seq']
        print "+"
        print l1perm['qual']
    return
  if args.long_reads_only:
    # Stream PacBio-style long reads straight to stdout and exit.
    rbe.set_gaussian_fragmentation_default_pacbio()
    for zi in range(0,args.long_read_count):
      [name,seq] = rbe.emit_long_read()
      if args.no_errors:
        g = 'm150101_010101_11111_c111111111111111111_s1_p0/'+str(zi+1)+'/ccs'
        print "@"+g
        print seq
        print "+"
        print 'I'*len(seq)
      else:
        g = 'm150101_010101_11111_c111111111111111111_s1_p0/'+str(zi+1)+'/ccs'
        seqperm = fq_prof_pacbio_ccs95.create_fastq_and_permute_sequence(seq)
        print "@"+g
        print seqperm['seq']
        print "+"
        print seqperm['qual']
    return
  if not os.path.exists(args.output):
    os.makedirs(args.output)
  # ---- Short reads (paired) ----
  rbe.set_gaussian_fragmentation_default_hiseq()
  # Lets prepare to output now
  sys.stderr.write("Sequencing short reads\n")
  # left_handle/right_handle are module-level; presumably written by the
  # do_short callback defined elsewhere in this file -- confirm.
  global left_handle
  global right_handle
  left_handle = gzip.open(args.output+"/SR_1.fq.gz",'wb')
  right_handle = gzip.open(args.output+"/SR_2.fq.gz",'wb')
  buffer_size = 10000  # reads per dispatched work unit
  buffer = []
  if args.threads > 1:
    p = Pool(processes=args.threads)
  z = 0
  for i in range(0,args.short_read_count):
    z = i+1
    if z %1000==0:
      sys.stderr.write(str(z)+"\r")  # progress indicator
    buffer.append(z)
    if len(buffer) >= buffer_size:
      if args.threads <= 1:
        v = process_short_read_buffer(buffer[:],rbe,args,fq_prof_illumina)
        do_short(v)
      else:
        p.apply_async(process_short_read_buffer,args=(buffer[:],rbe,args,fq_prof_illumina),callback=do_short)
      buffer = []
  # Flush any partial final buffer.
  if len(buffer) > 0:
    if args.threads <= 1:
      v = process_short_read_buffer(buffer[:],rbe,args,fq_prof_illumina)
      do_short(v)
    else:
      p.apply_async(process_short_read_buffer,args=(buffer[:],rbe,args,fq_prof_illumina),callback=do_short)
    buffer = []
  if args.threads > 1:
    p.close()
    p.join()
  # greport is a module-level dict accumulated by the callbacks -- confirm.
  global greport
  of = open(args.output+"/SR_report.txt",'w')
  for name in greport:
    of.write("\t".join([str(x) for x in greport[name]])+"\n")
  of.close()
  greport = {}
  sys.stderr.write("\nFinished sequencing short reads\n")
  left_handle.close()
  right_handle.close()
  # ---- Long CCS reads ----
  # Now lets create the long read set
  rbe.set_gaussian_fragmentation_default_pacbio()
  sys.stderr.write("Sequencing ccs long reads\n")
  global long_handle
  long_handle = gzip.open(args.output+"/LR_ccs.fq.gz",'wb')
  buffer_size = 1000  # long reads are heavier; smaller work units
  buffer = []
  if args.threads > 1:
    p = Pool(processes=args.threads)
  z = 0
  for i in range(0,args.long_read_count):
    z = i+1
    if z %100==0:
      sys.stderr.write(str(z)+"\r")
    buffer.append(z)
    if len(buffer) >= buffer_size:
      if args.threads <= 1:
        v = process_long_reads(buffer[:],rbe,args,fq_prof_pacbio_ccs95,'ccs')
        do_long(v)
      else:
        p.apply_async(process_long_reads,args=(buffer[:],rbe,args,fq_prof_pacbio_ccs95,'ccs'),callback=do_long)
      buffer = []
  if len(buffer) > 0:
    if args.threads <= 1:
      v = process_long_reads(buffer[:],rbe,args,fq_prof_pacbio_ccs95,'ccs')
      do_long(v)
    else:
      p.apply_async(process_long_reads,args=(buffer[:],rbe,args,fq_prof_pacbio_ccs95,'ccs'),callback=do_long)
    buffer = []
  if args.threads > 1:
    p.close()
    p.join()
  long_handle.close()
  of = open(args.output+"/LR_ccs_report.txt",'w')
  for name in greport:
    of.write("\t".join([str(x) for x in greport[name]])+"\n")
  of.close()
  greport = {}
  sys.stderr.write("\nFinished sequencing ccs long reads\n")
  # ---- Long subreads ----
  sys.stderr.write("Sequencing long sub reads\n")
  long_handle = gzip.open(args.output+"/LR_sub.fq.gz",'wb')
  buffer_size = 1000
  buffer = []
  if args.threads > 1:
    p = Pool(processes=args.threads)
  # Numbering continues from z so subread names don't collide with ccs names.
  for i in range(z,z+args.long_read_count):
    z = i+1
    if z %100==0:
      sys.stderr.write(str(z)+"\r")
    buffer.append(z)
    if len(buffer) >= buffer_size:
      if args.threads <= 1:
        v = process_long_reads(buffer[:],rbe,args,fq_prof_pacbio_subreads,'sub')
        do_long(v)
      else:
        p.apply_async(process_long_reads,args=(buffer[:],rbe,args,fq_prof_pacbio_subreads,'sub'),callback=do_long)
      buffer = []
  if len(buffer) > 0:
    if args.threads <= 1:
      v = process_long_reads(buffer[:],rbe,args,fq_prof_pacbio_subreads,'sub')
      do_long(v)
    else:
      p.apply_async(process_long_reads,args=(buffer[:],rbe,args,fq_prof_pacbio_subreads,'sub'),callback=do_long)
    buffer = []
  if args.threads > 1:
    p.close()
    p.join()
  long_handle.close()
  of = open(args.output+"/LR_sub_report.txt",'w')
  for name in greport:
    of.write("\t".join([str(x) for x in greport[name]])+"\n")
  of.close()
  greport = {}
  sys.stderr.write("\nFinished sequencing long sub reads\n")
  # ---- Combined report: re-read the three per-set reports and sum counts ----
  combo = {}
  with open(args.output+"/SR_report.txt") as inf:
    for line in inf:
      f = line.rstrip().split("\t")
      [name,express,left] = f
      if name not in combo:
        combo[name] = {}
        combo[name]['express'] = express
        combo[name]['left'] = 0
      combo[name]['left'] += int(left)
  with open(args.output+"/LR_ccs_report.txt") as inf:
    for line in inf:
      f = line.rstrip().split("\t")
      [name,express,left] = f
      if name not in combo:
        combo[name] = {}
        combo[name]['express'] = express
        combo[name]['left'] = 0
      combo[name]['left'] += int(left)
  with open(args.output+"/LR_sub_report.txt") as inf:
    for line in inf:
      f = line.rstrip().split("\t")
      [name,express,left] = f
      if name not in combo:
        combo[name] = {}
        combo[name]['express'] = express
        combo[name]['left'] = 0
      combo[name]['left'] += int(left)
  of = open(args.output+"/LR_SR_combo_report.txt",'w')
  for name in sorted(combo):
    of.write(name+"\t"+combo[name]['express']+"\t"+str(combo[name]['left'])+"\n")
  of.close()
def main():
    """Entry point: simulate an RNA-seq dataset from a biallelic transcriptome.

    Pipeline:
      1. Parse CLI options (inputs vs. a pre-serialized emitter; expression
         model; allele-specific-expression (ASE) model; read counts; threads).
      2. Build or deserialize a RandomBiallelicTranscriptomeEmitter (rbe).
      3. Optionally just save the serialized emitter and exit.
      4. Emit three read sets in buffered batches, optionally fanned out to a
         multiprocessing Pool: paired short reads (SR_1/SR_2), PacBio-style
         ccs long reads (LR_ccs95), and PacBio-style subreads (LR_subreads),
         each written as gzipped FASTQ plus a per-transcript report.
      5. Write a combined per-transcript report across all three read sets.

    Side effects: creates/overwrites files under --output; reads and writes
    the module-level globals shand1, shand2 and emissions_reports, which the
    write_short_reads / write_long_reads callbacks (defined elsewhere in this
    file) also touch.
    """
    parser = argparse.ArgumentParser(description="Create a simulated RNA-seq dataset")
    # Source of the transcriptome emitter: either a previously serialized
    # emitter, or the three raw inputs needed to build one.
    group0 = parser.add_mutually_exclusive_group(required=True)
    group0.add_argument('--load_biallelic_transcriptome',help="SERIALIZED BIALLELIC TRANSCRIOTOME EMITTER FILE to load up and use instead of all other file inputs")
    group0.add_argument('--inputs',nargs=3,help="<reference_genome> <phased_VCF> <transcripts_genepred>")
    #parser.add_argument('reference_genome',help="The reference genome.")
    #parser.add_argument('phased_VCF',help="A phased VCF file. If you are simulating the genomes that step can make on of these for you.")
    #parser.add_argument('transcripts_genepred',help="A genepred file describing the transcripts. Each transcript name must be unique.")
    # Expression model: exactly one of uniform / TSV / cufflinks (optional).
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--uniform_expression',action='store_true',help="Uniform distribution of transcript expression")
    group.add_argument('--isoform_expression',help="The transcript expression in TSV format <Transcript name> tab <Expression>")
    group.add_argument('--cufflinks_isoform_expression',help="The expression of the isoforms or - for a uniform distribution of transcript expression")
    # ASE (allele-specific expression) model: one fixed value, or random per
    # isoform, or random per locus.
    group2 = parser.add_mutually_exclusive_group()
    group2.add_argument('--ASE_identical',type=float,help="The ASE for the transcriptome, every isoform will have the same allele preference.")
    group2.add_argument('--ASE_isoform_random',action='store_true',help="The ASE will be random for every isoform.")
    group2.add_argument('--ASE_locus_random',action='store_true',help="The ASE will be randomly assigned for each locus")
    parser.add_argument('--short_read_count',type=int,default=10000,help="INT number of short reads")
    parser.add_argument('--short_read_length',type=int,default=101,help="INT length of the short reads")
    parser.add_argument('--long_read_ccs_count',type=int,default=4000,help="INT default number of long reads")
    parser.add_argument('--long_read_subread_count',type=int,default=4000,help="INT default number of long reads")
    parser.add_argument('--no_errors',action='store_true',help="Do not simulate errors in reads")
    parser.add_argument('--threads',type=int,default=cpu_count(),help="Number of threads defaults to cpu_count()")
    parser.add_argument('--locus_by_gene_name',action='store_true',help="Faster than the complete calculation for overlapping loci.")
    parser.add_argument('--seed',type=int,help="seed to make transcriptome and rho creation deterministic. Reads are still random, its just the transcriptome and rho that become determinisitic.")
    # Destination: either write simulated reads to a directory, or only dump
    # the serialized emitter and stop.
    group3 = parser.add_mutually_exclusive_group(required=True)
    group3.add_argument('--output',help="Directory name for output")
    group3.add_argument('--save_biallelic_transcriptome',help="FILENAME output the biallelic transcriptome used to this file and then exit")
    parser.add_argument('--starting_read_multiplier',type=int,default=0,help="Used if outputting different reads from object, and you want them number differently give each different set values 0, 1, 2, etc...")
    args = parser.parse_args()

    # Error-profile objects are only needed when errors are being simulated.
    fq_prof_illumina = None
    fq_prof_pacbio_ccs95 = None
    fq_prof_pacbio_subreads = None
    if not args.no_errors:
        fq_prof_illumina = default_illumina()
        fq_prof_pacbio_ccs95 = default_pacbio_ccs95()
        fq_prof_pacbio_subreads = default_pacbio_subreads()

    rbe = None
    if not args.load_biallelic_transcriptome:
        # we need to establish the emitter based on some known data
        rbe = load_from_inputs(args)
    else:
        # Rehydrate a previously saved emitter; the serialized form is a
        # single line in the file.
        rbe = SimulationBasics.RandomBiallelicTranscriptomeEmitter()
        inf = open(args.load_biallelic_transcriptome)
        sline = inf.readline().rstrip()
        inf.close()
        rbe.read_serialized(sline)

    if args.save_biallelic_transcriptome:
        # Save-and-exit mode: no reads are generated.
        ofser = open(args.save_biallelic_transcriptome,'w')
        ofser.write(rbe.get_serialized())
        ofser.close()
        return #exiting here

    # Lets prepare to output now
    args.output = args.output.rstrip('/')
    if not os.path.exists(args.output):
        os.makedirs(args.output)
    # Always persist the emitter used for this run alongside the reads so the
    # exact simulation can be reproduced/extended later.
    ofser = open(args.output+"/RandomBiallelicTranscriptomeEmitter.serialized",'w')
    ofser.write(rbe.get_serialized())
    ofser.close()

    # ------------------------------------------------------------------
    # Stage 1: paired-end short reads (HiSeq-style fragmentation).
    # ------------------------------------------------------------------
    rbe.set_gaussian_fragmentation_default_hiseq()
    #rbe_ser = rbe.get_serialized()
    sys.stderr.write("Sequencing short reads\n")
    # shand1/shand2 are globals because the write_short_reads callback
    # (defined elsewhere in this file) writes to them.
    global shand1
    shand1 = gzip.open(args.output+"/SR_1.fq.gz",'wb')
    global shand2
    shand2 = gzip.open(args.output+"/SR_2.fq.gz",'wb')
    z = 0
    buffer_full_size = 5000
    buffer = []
    if args.threads > 1:
        p = Pool(processes=args.threads)
    # Read numbering is offset by starting_read_multiplier so separate runs
    # against the same emitter can produce distinctly numbered reads.
    for i in range(args.short_read_count*args.starting_read_multiplier,args.short_read_count*(args.starting_read_multiplier+1)):
        z = i+1
        buffer.append(z)
        if buffer_full_size <= len(buffer):
            # Dispatch a full batch; copy the buffer since the worker runs
            # asynchronously while we keep appending.
            vals = buffer[:]
            buffer = []
            if args.threads > 1:
                p.apply_async(process_short_read_buffer,args=(rbe,vals,args),callback=write_short_reads)
            else:
                oval = process_short_read_buffer(rbe,vals,args)
                write_short_reads(oval)
    if len(buffer) > 0:
        # Flush the final partial batch.
        vals = buffer[:]
        buffer = []
        if args.threads > 1:
            p.apply_async(process_short_read_buffer,args=(rbe,vals,args),callback=write_short_reads)
        else:
            oval = process_short_read_buffer(rbe,vals,args)
            write_short_reads(oval)
    if args.threads > 1:
        p.close()
        p.join()
    sys.stderr.write("\nFinished sequencing short reads\n")
    shand1.close()
    shand2.close()
    # emissions_reports is a module-level list filled by the write_* callbacks.
    # NOTE(review): .get() below treats every entry as an AsyncResult; in the
    # single-threaded path write_short_reads received a plain value, so this
    # presumably only works because the callback wraps/normalizes entries --
    # verify against write_short_reads.
    global emissions_reports
    for i in range(0,len(emissions_reports)):
        emissions_reports[i]= emissions_reports[i].get()
    sr_report = combine_reports(emissions_reports)
    rbe.emissions_report = {} # initialize so we don't accidentally overwrite
    # Now lets print out some of the emission details
    # Per-transcript short-read report: name, gene, locus, expression, rho,
    # then the two per-transcript counts from the emissions report (zeros for
    # transcripts that emitted nothing).
    of = open(args.output+"/SR_report.txt",'w')
    for name in sorted(rbe.name2locus.keys()):
        express = 1
        if rbe.transcriptome1.expression:
            express = rbe.transcriptome1.expression.get_expression(name)
        if name in sr_report:
            of.write(name +"\t"+rbe.gene_names[name]+"\t"+str(rbe.name2locus[name])+"\t"+str(express)+"\t"+str(rbe.transcriptome1_rho[name])+"\t"+str(sr_report[name][0])+"\t"+str(sr_report[name][1])+"\n")
        else:
            of.write(name +"\t"+rbe.gene_names[name]+"\t"+str(rbe.name2locus[name])+"\t"+str(express)+"\t"+str(rbe.transcriptome1_rho[name])+"\t"+str(0)+"\t"+str(0)+"\n")
    of.close()
    rbe.emissions_report = {}
    emissions_reports = []

    # ------------------------------------------------------------------
    # Stage 2: ccs long reads (PacBio-style fragmentation).
    # ------------------------------------------------------------------
    # Now lets create the long read set
    rbe.set_gaussian_fragmentation_default_pacbio()
    #rbe_ser = rbe.get_serialized()
    sys.stderr.write("Sequencing long ccs reads\n")
    shand1 = gzip.open(args.output+"/LR_ccs95.fq.gz",'wb')
    buffer_full_size = 500
    buffer = []
    if args.threads > 1:
        p = Pool(processes=args.threads)
    for i in range(args.starting_read_multiplier*args.long_read_ccs_count,(args.starting_read_multiplier+1)*args.long_read_ccs_count):
        z = i+1
        buffer.append(z)
        if buffer_full_size <= len(buffer):
            vals = buffer[:]
            buffer = []
            if args.threads > 1:
                p.apply_async(process_long_ccs_read_buffer,args=(rbe,vals,args),callback=write_long_reads)
            else:
                oval = process_long_ccs_read_buffer(rbe,vals,args)
                write_long_reads(oval)
    if len(buffer) > 0:
        # Flush the final partial batch.
        vals = buffer[:]
        buffer = []
        if args.threads > 1:
            p.apply_async(process_long_ccs_read_buffer,args=(rbe,vals,args),callback=write_long_reads)
        else:
            oval = process_long_ccs_read_buffer(rbe,vals,args)
            write_long_reads(oval)
    if args.threads > 1:
        p.close()
        p.join()
    sys.stderr.write("\nFinished sequencing long reads\n")
    shand1.close()
    for i in range(0,len(emissions_reports)):
        emissions_reports[i]= emissions_reports[i].get()
    lr_ccs_report = combine_reports(emissions_reports)
    rbe.emissions_report = {} # initialize so we don't accidentally overwrite
    # Now lets print out some of the emission details
    of = open(args.output+"/LR_ccs95_report.txt",'w')
    for name in sorted(rbe.name2locus.keys()):
        express = 1
        if rbe.transcriptome1.expression:
            express = rbe.transcriptome1.expression.get_expression(name)
        if name in lr_ccs_report:
            of.write(name +"\t"+rbe.gene_names[name]+"\t"+str(rbe.name2locus[name])+"\t"+str(express)+"\t"+str(rbe.transcriptome1_rho[name])+"\t"+str(lr_ccs_report[name][0])+"\t"+str(lr_ccs_report[name][1])+"\n")
        else:
            of.write(name +"\t"+rbe.gene_names[name]+"\t"+str(rbe.name2locus[name])+"\t"+str(express)+"\t"+str(rbe.transcriptome1_rho[name])+"\t"+str(0)+"\t"+str(0)+"\n")
    of.close()
    rbe.emissions_report = {}
    emissions_reports = []

    # ------------------------------------------------------------------
    # Stage 3: long subreads (same PacBio-style fragmentation).
    # ------------------------------------------------------------------
    # Now lets create the long subread read set
    rbe.set_gaussian_fragmentation_default_pacbio()
    #rbe_ser = rbe.get_serialized()
    sys.stderr.write("Sequencing long subreads\n")
    shand1 = gzip.open(args.output+"/LR_subreads.fq.gz",'wb')
    buffer_full_size = 500
    buffer = []
    if args.threads > 1:
        p = Pool(processes=args.threads)
    for i in range(args.long_read_subread_count*args.starting_read_multiplier,(args.starting_read_multiplier+1)*args.long_read_subread_count):
        z = i+1
        buffer.append(z)
        if buffer_full_size <= len(buffer):
            vals = buffer[:]
            buffer = []
            if args.threads > 1:
                p.apply_async(process_long_sub_read_buffer,args=(rbe,vals,args),callback=write_long_reads)
            else:
                oval = process_long_sub_read_buffer(rbe,vals,args)
                write_long_reads(oval)
    if len(buffer) > 0:
        # Flush the final partial batch.
        vals = buffer[:]
        buffer = []
        if args.threads > 1:
            p.apply_async(process_long_sub_read_buffer,args=(rbe,vals,args),callback=write_long_reads)
        else:
            oval = process_long_sub_read_buffer(rbe,vals,args)
            write_long_reads(oval)
    if args.threads > 1:
        p.close()
        p.join()
    sys.stderr.write("\nFinished sequencing long reads\n")
    shand1.close()
    for i in range(0,len(emissions_reports)):
        emissions_reports[i]= emissions_reports[i].get()
    lr_sub_report = combine_reports(emissions_reports)
    rbe.emissions_report = {} # initialize so we don't accidentally overwrite
    # Now lets print out some of the emission details
    of = open(args.output+"/LR_subreads_report.txt",'w')
    for name in sorted(rbe.name2locus.keys()):
        express = 1
        if rbe.transcriptome1.expression:
            express = rbe.transcriptome1.expression.get_expression(name)
        if name in lr_sub_report:
            of.write(name +"\t"+rbe.gene_names[name]+"\t"+str(rbe.name2locus[name])+"\t"+str(express)+"\t"+str(rbe.transcriptome1_rho[name])+"\t"+str(lr_sub_report[name][0])+"\t"+str(lr_sub_report[name][1])+"\n")
        else:
            of.write(name +"\t"+rbe.gene_names[name]+"\t"+str(rbe.name2locus[name])+"\t"+str(express)+"\t"+str(rbe.transcriptome1_rho[name])+"\t"+str(0)+"\t"+str(0)+"\n")
    of.close()

    # ------------------------------------------------------------------
    # Stage 4: combined report across all three read sets.
    # ------------------------------------------------------------------
    combo_report = combine_reports([sr_report,lr_ccs_report,lr_sub_report])
    of = open(args.output+"/LR_SR_combo_report.txt",'w')
    for name in sorted(rbe.name2locus.keys()):
        express = 1
        if rbe.transcriptome1.expression:
            express = rbe.transcriptome1.expression.get_expression(name)
        if name in combo_report:
            of.write(name +"\t"+rbe.gene_names[name]+"\t"+str(rbe.name2locus[name])+"\t"+str(express)+"\t"+str(rbe.transcriptome1_rho[name])+"\t"+str(combo_report[name][0])+"\t"+str(combo_report[name][1])+"\n")
        else:
            of.write(name +"\t"+rbe.gene_names[name]+"\t"+str(rbe.name2locus[name])+"\t"+str(express)+"\t"+str(rbe.transcriptome1_rho[name])+"\t"+str(0)+"\t"+str(0)+"\n")
    of.close()