def process_long_sub_read_buffer(rbe, buffer, args):
    #rbe = SimulationBasics.RandomBiallelicTranscriptomeEmitter()
    #rbe.read_serialized(rbe_ser)
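    # Emit one simulated long read per index in the buffer and format each as a
    # FASTQ record with a PacBio-style subread name.  Returns the concatenated
    # records, the number of reads emitted, and the emitter's emissions report.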
    fq_prof_pacbio_subreads = default_pacbio_subreads()
    read1 = ''
    zend = 0  # tracks the last index emitted (not returned to the caller)
    for z in buffer:
        [name, seq] = rbe.emit_long_read()
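        # PacBio-style subread name: <movie_name>/<hole_number>/<start>_<end>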
        g = 'm150102_010102_11112_c111111111111111112_s1_p0/' + str(
            z) + '/0_' + str(len(seq) - 1)
        zend = z
        read1 += "@" + g + "\n"
        if args.no_errors:
            read1 += seq + "\n"
            read1 += "+\n"
            read1 += len(seq) * 'I' + "\n"
        else:
            seqperm = fq_prof_pacbio_subreads.create_fastq_and_permute_sequence(
                seq)
            read1 += seqperm['seq'] + "\n"
            read1 += "+\n"
            read1 += seqperm['qual'] + "\n"
    return [read1, len(buffer), rbe.emissions_report]

def main():
    parser = argparse.ArgumentParser(
        description="Create a simulated RNA-seq dataset")
    group0 = parser.add_mutually_exclusive_group(required=True)
    group0.add_argument(
        '--load_biallelic_transcriptome',
        help=
        "SERIALIZED BIALLELIC TRANSCRIOTOME EMITTER FILE to load up and use instead of all other file inputs"
    )
    group0.add_argument(
        '--inputs',
        nargs=3,
        help="<reference_genome> <phased_VCF> <transcripts_genepred>")
    #parser.add_argument('reference_genome',help="The reference genome.")
    #parser.add_argument('phased_VCF',help="A phased VCF file.  If you are simulating the genomes that step can make one of these for you.")
    #parser.add_argument('transcripts_genepred',help="A genepred file describing the transcripts.  Each transcript name must be unique.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--uniform_expression',
                       action='store_true',
                       help="Uniform distribution of transcript expression")
    group.add_argument(
        '--isoform_expression',
        help=
        "The transcript expression in TSV format <Transcript name> tab <Expression>"
    )
    group.add_argument(
        '--cufflinks_isoform_expression',
        help=
        "The expression of the isoforms or - for a uniform distribution of transcript expression"
    )
    group2 = parser.add_mutually_exclusive_group()
    group2.add_argument(
        '--ASE_identical',
        type=float,
        help=
        "The ASE for the transcriptome, every isoform will have the same allele preference."
    )
    group2.add_argument('--ASE_isoform_random',
                        action='store_true',
                        help="The ASE will be random for every isoform.")
    group2.add_argument(
        '--ASE_locus_random',
        action='store_true',
        help="The ASE will be randomly assigned for each locus")
    parser.add_argument('--short_read_count',
                        type=int,
                        default=10000,
                        help="INT number of short reads")
    parser.add_argument('--short_read_length',
                        type=int,
                        default=101,
                        help="INT length of the short reads")
    parser.add_argument('--long_read_ccs_count',
                        type=int,
                        default=4000,
                        help="INT default number of long reads")
    parser.add_argument('--long_read_subread_count',
                        type=int,
                        default=4000,
                        help="INT default number of long reads")
    parser.add_argument('--no_errors',
                        action='store_true',
                        help="Do not simulate errors in reads")
    parser.add_argument('--threads',
                        type=int,
                        default=cpu_count(),
                        help="Number of threads defaults to cpu_count()")
    parser.add_argument(
        '--locus_by_gene_name',
        action='store_true',
        help="Faster than the complete calculation for overlapping loci.")
    parser.add_argument(
        '--seed',
        type=int,
        help=
        "seed to make transcriptome and rho creation deterministic.  Reads are still random, its just the transcriptome and rho that become determinisitic."
    )
    group3 = parser.add_mutually_exclusive_group(required=True)
    group3.add_argument('--output', help="Directory name for output")
    group3.add_argument(
        '--save_biallelic_transcriptome',
        help=
        "FILENAME output the biallelic transcriptome used to this file and then exit"
    )
    parser.add_argument(
        '--starting_read_multiplier',
        type=int,
        default=0,
        help=
        "Used if outputting different reads from object, and you want them number differently give each different set values 0, 1, 2, etc..."
    )
    args = parser.parse_args()
    fq_prof_illumina = None
    fq_prof_pacbio_ccs95 = None
    fq_prof_pacbio_subreads = None
    if not args.no_errors:
        fq_prof_illumina = default_illumina()
        fq_prof_pacbio_ccs95 = default_pacbio_ccs95()
        fq_prof_pacbio_subreads = default_pacbio_subreads()

    rbe = None
    if not args.load_biallelic_transcriptome:
        # we need to establish the emitter based on some known data
        rbe = load_from_inputs(args)

    else:
        rbe = SimulationBasics.RandomBiallelicTranscriptomeEmitter()
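        # the serialized emitter is stored as a single line of text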
        inf = open(args.load_biallelic_transcriptome)
        sline = inf.readline().rstrip()
        inf.close()
        rbe.read_serialized(sline)

    if args.save_biallelic_transcriptome:
        ofser = open(args.save_biallelic_transcriptome, 'w')
        ofser.write(rbe.get_serialized())
        ofser.close()
        return  #exiting here
    # Let's prepare to output now
    args.output = args.output.rstrip('/')
    if not os.path.exists(args.output):
        os.makedirs(args.output)
    ofser = open(
        args.output + "/RandomBiallelicTranscriptomeEmitter.serialized", 'w')
    ofser.write(rbe.get_serialized())
    ofser.close()
    rbe.set_gaussian_fragmentation_default_hiseq()
    #rbe_ser = rbe.get_serialized()
    sys.stderr.write("Sequencing short reads\n")
    global shand1
    shand1 = gzip.open(args.output + "/SR_1.fq.gz", 'wb')
    global shand2
    shand2 = gzip.open(args.output + "/SR_2.fq.gz", 'wb')
    z = 0
    buffer_full_size = 5000
    buffer = []
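    # accumulate read indices and dispatch them in buffer_full_size chunks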
    if args.threads > 1:
        p = Pool(processes=args.threads)
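    # Partition the read-number space by multiplier so separate runs of this
    # script emit reads with non-overlapping names (see --starting_read_multiplier)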
    for i in range(args.short_read_count * args.starting_read_multiplier,
                   args.short_read_count *
                   (args.starting_read_multiplier + 1)):
        z = i + 1
        buffer.append(z)
        if buffer_full_size <= len(buffer):
            vals = buffer[:]
            buffer = []
            if args.threads > 1:
                p.apply_async(process_short_read_buffer,
                              args=(rbe, vals, args),
                              callback=write_short_reads)
            else:
                oval = process_short_read_buffer(rbe, vals, args)
                write_short_reads(oval)
    if len(buffer) > 0:
        vals = buffer[:]
        buffer = []
        if args.threads > 1:
            p.apply_async(process_short_read_buffer,
                          args=(rbe, vals, args),
                          callback=write_short_reads)
        else:
            oval = process_short_read_buffer(rbe, vals, args)
            write_short_reads(oval)
    if args.threads > 1:
        p.close()
        p.join()
    sys.stderr.write("\nFinished sequencing short reads\n")
    shand1.close()
    shand2.close()
    global emissions_reports
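    # resolve each collected result object to its value via .get() before combining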
    for i in range(0, len(emissions_reports)):
        emissions_reports[i] = emissions_reports[i].get()
    sr_report = combine_reports(emissions_reports)
    rbe.emissions_report = {}  # initialize so we don't accidentally overwrite
    # Now let's print out some of the emission details
    of = open(args.output + "/SR_report.txt", 'w')
    for name in sorted(rbe.name2locus.keys()):
        express = 1
        if rbe.transcriptome1.expression:
            express = rbe.transcriptome1.expression.get_expression(name)
        if name in sr_report:
            of.write(name + "\t" + rbe.gene_names[name] + "\t" +
                     str(rbe.name2locus[name]) + "\t" + str(express) + "\t" +
                     str(rbe.transcriptome1_rho[name]) + "\t" +
                     str(sr_report[name][0]) + "\t" + str(sr_report[name][1]) +
                     "\n")
        else:
            of.write(name + "\t" + rbe.gene_names[name] + "\t" +
                     str(rbe.name2locus[name]) + "\t" + str(express) + "\t" +
                     str(rbe.transcriptome1_rho[name]) + "\t" + str(0) + "\t" +
                     str(0) + "\n")
    of.close()

    rbe.emissions_report = {}
    emissions_reports = []
    # Now let's create the long ccs read set
    rbe.set_gaussian_fragmentation_default_pacbio()
    #rbe_ser = rbe.get_serialized()
    sys.stderr.write("Sequencing long ccs reads\n")
    shand1 = gzip.open(args.output + "/LR_ccs95.fq.gz", 'wb')
    buffer_full_size = 500
    buffer = []
    if args.threads > 1:
        p = Pool(processes=args.threads)
    for i in range(args.starting_read_multiplier * args.long_read_ccs_count,
                   (args.starting_read_multiplier + 1) *
                   args.long_read_ccs_count):
        z = i + 1
        buffer.append(z)
        if buffer_full_size <= len(buffer):
            vals = buffer[:]
            buffer = []
            if args.threads > 1:
                p.apply_async(process_long_ccs_read_buffer,
                              args=(rbe, vals, args),
                              callback=write_long_reads)
            else:
                oval = process_long_ccs_read_buffer(rbe, vals, args)
                write_long_reads(oval)
    if len(buffer) > 0:
        vals = buffer[:]
        buffer = []
        if args.threads > 1:
            p.apply_async(process_long_ccs_read_buffer,
                          args=(rbe, vals, args),
                          callback=write_long_reads)
        else:
            oval = process_long_ccs_read_buffer(rbe, vals, args)
            write_long_reads(oval)
    if args.threads > 1:
        p.close()
        p.join()
    sys.stderr.write("\nFinished sequencing long reads\n")
    shand1.close()
    for i in range(0, len(emissions_reports)):
        emissions_reports[i] = emissions_reports[i].get()
    lr_ccs_report = combine_reports(emissions_reports)
    rbe.emissions_report = {}  # initialize so we don't accidentally overwrite
    # Now let's print out some of the emission details
    of = open(args.output + "/LR_ccs95_report.txt", 'w')
    for name in sorted(rbe.name2locus.keys()):
        express = 1
        if rbe.transcriptome1.expression:
            express = rbe.transcriptome1.expression.get_expression(name)
        if name in lr_ccs_report:
            of.write(name + "\t" + rbe.gene_names[name] + "\t" +
                     str(rbe.name2locus[name]) + "\t" + str(express) + "\t" +
                     str(rbe.transcriptome1_rho[name]) + "\t" +
                     str(lr_ccs_report[name][0]) + "\t" +
                     str(lr_ccs_report[name][1]) + "\n")
        else:
            of.write(name + "\t" + rbe.gene_names[name] + "\t" +
                     str(rbe.name2locus[name]) + "\t" + str(express) + "\t" +
                     str(rbe.transcriptome1_rho[name]) + "\t" + str(0) + "\t" +
                     str(0) + "\n")
    of.close()

    rbe.emissions_report = {}
    emissions_reports = []
    # Now let's create the long subread set
    rbe.set_gaussian_fragmentation_default_pacbio()
    #rbe_ser = rbe.get_serialized()
    sys.stderr.write("Sequencing long subreads\n")
    shand1 = gzip.open(args.output + "/LR_subreads.fq.gz", 'wb')
    buffer_full_size = 500
    buffer = []
    if args.threads > 1:
        p = Pool(processes=args.threads)
    for i in range(
            args.long_read_subread_count * args.starting_read_multiplier,
        (args.starting_read_multiplier + 1) * args.long_read_subread_count):
        z = i + 1
        buffer.append(z)
        if buffer_full_size <= len(buffer):
            vals = buffer[:]
            buffer = []
            if args.threads > 1:
                p.apply_async(process_long_sub_read_buffer,
                              args=(rbe, vals, args),
                              callback=write_long_reads)
            else:
                oval = process_long_sub_read_buffer(rbe, vals, args)
                write_long_reads(oval)
    if len(buffer) > 0:
        vals = buffer[:]
        buffer = []
        if args.threads > 1:
            p.apply_async(process_long_sub_read_buffer,
                          args=(rbe, vals, args),
                          callback=write_long_reads)
        else:
            oval = process_long_sub_read_buffer(rbe, vals, args)
            write_long_reads(oval)
    if args.threads > 1:
        p.close()
        p.join()
    sys.stderr.write("\nFinished sequencing long reads\n")
    shand1.close()
    for i in range(0, len(emissions_reports)):
        emissions_reports[i] = emissions_reports[i].get()
    lr_sub_report = combine_reports(emissions_reports)
    rbe.emissions_report = {}  # initialize so we don't accidentally overwrite
    # Now let's print out some of the emission details
    of = open(args.output + "/LR_subreads_report.txt", 'w')
    for name in sorted(rbe.name2locus.keys()):
        express = 1
        if rbe.transcriptome1.expression:
            express = rbe.transcriptome1.expression.get_expression(name)
        if name in lr_sub_report:
            of.write(name + "\t" + rbe.gene_names[name] + "\t" +
                     str(rbe.name2locus[name]) + "\t" + str(express) + "\t" +
                     str(rbe.transcriptome1_rho[name]) + "\t" +
                     str(lr_sub_report[name][0]) + "\t" +
                     str(lr_sub_report[name][1]) + "\n")
        else:
            of.write(name + "\t" + rbe.gene_names[name] + "\t" +
                     str(rbe.name2locus[name]) + "\t" + str(express) + "\t" +
                     str(rbe.transcriptome1_rho[name]) + "\t" + str(0) + "\t" +
                     str(0) + "\n")
    of.close()

    combo_report = combine_reports([sr_report, lr_ccs_report, lr_sub_report])
    of = open(args.output + "/LR_SR_combo_report.txt", 'w')
    for name in sorted(rbe.name2locus.keys()):
        express = 1
        if rbe.transcriptome1.expression:
            express = rbe.transcriptome1.expression.get_expression(name)
        if name in combo_report:
            of.write(name + "\t" + rbe.gene_names[name] + "\t" +
                     str(rbe.name2locus[name]) + "\t" + str(express) + "\t" +
                     str(rbe.transcriptome1_rho[name]) + "\t" +
                     str(combo_report[name][0]) + "\t" +
                     str(combo_report[name][1]) + "\n")
        else:
            of.write(name + "\t" + rbe.gene_names[name] + "\t" +
                     str(rbe.name2locus[name]) + "\t" + str(express) + "\t" +
                     str(rbe.transcriptome1_rho[name]) + "\t" + str(0) + "\t" +
                     str(0) + "\n")
    of.close()

def main():
    parser = argparse.ArgumentParser(
        description="Create a simulated RNA-seq dataset")
    parser.add_argument('reference_genome', help="The reference genome.")
    parser.add_argument(
        'transcripts_genepred',
        help=
        "A genepred file describing the transcripts.  Each transcript name must be unique."
    )
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('--uniform_expression',
                       action='store_true',
                       help="Uniform distribution of transcript expression")
    group.add_argument(
        '--isoform_expression',
        help=
        "The transcript expression in TSV format <Transcript name> tab <Expression>"
    )
    group.add_argument(
        '--cufflinks_isoform_expression',
        help=
        "The expression of the isoforms or - for a uniform distribution of transcript expression"
    )
    group2 = parser.add_mutually_exclusive_group()
    group2.add_argument('--long_reads_only', action='store_true')
    group2.add_argument('--short_reads_only', action='store_true')
    group2.add_argument('--output', help="Directory name for output")
    parser.add_argument('--short_read_count',
                        type=int,
                        default=10000,
                        help="INT number of short reads")
    parser.add_argument('--short_read_length',
                        type=int,
                        default=101,
                        help="INT length of the short reads")
    parser.add_argument('--long_read_count',
                        type=int,
                        default=4000,
                        help="INT default number of long reads")
    parser.add_argument('--no_errors', action='store_true')
    parser.add_argument('--threads', type=int, default=1)
    args = parser.parse_args()
    if args.output:
        args.output = args.output.rstrip('/')

    fq_prof_pacbio_ccs95 = None
    fq_prof_pacbio_subreads = None
    fq_prof_illumina = None
    if not args.no_errors:
        fq_prof_pacbio_ccs95 = default_pacbio_ccs95()
        fq_prof_pacbio_subreads = default_pacbio_subreads()
        fq_prof_illumina = default_illumina()

    ref = read_fasta_into_hash(args.reference_genome)
    txn = Transcriptome()
    txn.set_reference_genome_dictionary(ref)
    with open(args.transcripts_genepred) as inf:
        for line in inf:
            if line[0] == '#': continue
            txn.add_genepred_line(line.rstrip())
    if args.isoform_expression:
        sys.stderr.write("Reading expression from a TSV\n")
        with open(args.isoform_expression) as inf:
            line1 = inf.readline()  # skip the header line
            for line in inf:
                f = line.rstrip().split("\t")
                txn.add_expression(f[0], float(f[1]))
    elif args.uniform_expression:
        sys.stderr.write("Using uniform expression model\n")
    elif args.cufflinks_isoform_expression:
        sys.stderr.write("Using cufflinks expression\n")
        with open(args.cufflinks_isoform_expression) as inf:
            line1 = inf.readline()  # skip the header line
            for line in inf:
                f = line.rstrip().split("\t")
                txn.add_expression(f[0], float(f[9]))  # cufflinks FPKM column
    sys.stderr.write("have transcriptome\n")
    for n in txn.ref_hash.keys():
        del txn.ref_hash[n]
    rbe = SimulationBasics.RandomTranscriptomeEmitter(txn)
    # Now the transcriptome emitter is set up and our dataset is ready
    if args.short_reads_only:
        rbe.set_gaussian_fragmentation_default_hiseq()
        for zi in range(0, args.short_read_count):
            [name, seq] = rbe.emit_short_read(args.short_read_length)
            if args.no_errors:
                print "@SRSIM" + str(zi + 1)
                print seq
                print "+"
                print 'I' * len(seq)
            else:
                l1perm = fq_prof_illumina.create_fastq_and_permute_sequence(
                    seq)
                print "@SRSIM" + str(zi + 1)
                print l1perm['seq']
                print "+"
                print l1perm['qual']
        return
    if args.long_reads_only:
        rbe.set_gaussian_fragmentation_default_pacbio()
        for zi in range(0, args.long_read_count):
            [name, seq] = rbe.emit_long_read()
            if args.no_errors:
                g = 'm150101_010101_11111_c111111111111111111_s1_p0/' + str(
                    zi + 1) + '/ccs'
                print "@" + g
                print seq
                print "+"
                print 'I' * len(seq)
            else:
                g = 'm150101_010101_11111_c111111111111111111_s1_p0/' + str(
                    zi + 1) + '/ccs'
                seqperm = fq_prof_pacbio_ccs95.create_fastq_and_permute_sequence(
                    seq)
                print "@" + g
                print seqperm['seq']
                print "+"
                print seqperm['qual']
        return
    if not os.path.exists(args.output):
        os.makedirs(args.output)

    rbe.set_gaussian_fragmentation_default_hiseq()
    # Let's prepare to output now
    sys.stderr.write("Sequencing short reads\n")
    global left_handle
    global right_handle
    left_handle = gzip.open(args.output + "/SR_1.fq.gz", 'wb')
    right_handle = gzip.open(args.output + "/SR_2.fq.gz", 'wb')
    buffer_size = 10000
    buffer = []
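    # accumulate read indices and flush them to workers in buffer_size chunks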
    if args.threads > 1:
        p = Pool(processes=args.threads)
    z = 0
    for i in range(0, args.short_read_count):
        z = i + 1
        if z % 1000 == 0: sys.stderr.write(str(z) + "\r")
        buffer.append(z)
        if len(buffer) >= buffer_size:
            if args.threads <= 1:
                v = process_short_read_buffer(buffer[:], rbe, args,
                                              fq_prof_illumina)
                do_short(v)
            else:
                p.apply_async(process_short_read_buffer,
                              args=(buffer[:], rbe, args, fq_prof_illumina),
                              callback=do_short)
            buffer = []
    if len(buffer) > 0:
        if args.threads <= 1:
            v = process_short_read_buffer(buffer[:], rbe, args,
                                          fq_prof_illumina)
            do_short(v)
        else:
            p.apply_async(process_short_read_buffer,
                          args=(buffer[:], rbe, args, fq_prof_illumina),
                          callback=do_short)
        buffer = []
    if args.threads > 1:
        p.close()
        p.join()

    global greport
    of = open(args.output + "/SR_report.txt", 'w')
    for name in greport:
        of.write("\t".join([str(x) for x in greport[name]]) + "\n")
    of.close()
    greport = {}

    sys.stderr.write("\nFinished sequencing short reads\n")
    left_handle.close()
    right_handle.close()

    # Now let's create the long read set
    rbe.set_gaussian_fragmentation_default_pacbio()
    sys.stderr.write("Sequencing ccs long reads\n")
    global long_handle
    long_handle = gzip.open(args.output + "/LR_ccs.fq.gz", 'wb')
    buffer_size = 1000
    buffer = []
    if args.threads > 1:
        p = Pool(processes=args.threads)
    z = 0
    for i in range(0, args.long_read_count):
        z = i + 1
        if z % 100 == 0: sys.stderr.write(str(z) + "\r")
        buffer.append(z)
        if len(buffer) >= buffer_size:
            if args.threads <= 1:
                v = process_long_reads(buffer[:], rbe, args,
                                       fq_prof_pacbio_ccs95, 'ccs')
                do_long(v)
            else:
                p.apply_async(process_long_reads,
                              args=(buffer[:], rbe, args, fq_prof_pacbio_ccs95,
                                    'ccs'),
                              callback=do_long)
            buffer = []
    if len(buffer) > 0:
        if args.threads <= 1:
            v = process_long_reads(buffer[:], rbe, args, fq_prof_pacbio_ccs95,
                                   'ccs')
            do_long(v)
        else:
            p.apply_async(process_long_reads,
                          args=(buffer[:], rbe, args, fq_prof_pacbio_ccs95,
                                'ccs'),
                          callback=do_long)
        buffer = []
    if args.threads > 1:
        p.close()
        p.join()

    long_handle.close()
    of = open(args.output + "/LR_ccs_report.txt", 'w')
    for name in greport:
        of.write("\t".join([str(x) for x in greport[name]]) + "\n")
    of.close()
    greport = {}
    sys.stderr.write("\nFinished sequencing ccs long reads\n")

    sys.stderr.write("Sequencing long sub reads\n")
    long_handle = gzip.open(args.output + "/LR_sub.fq.gz", 'wb')
    buffer_size = 1000
    buffer = []
    if args.threads > 1:
        p = Pool(processes=args.threads)
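    # continue read numbering from where the ccs reads left off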
    for i in range(z, z + args.long_read_count):
        z = i + 1
        if z % 100 == 0: sys.stderr.write(str(z) + "\r")
        buffer.append(z)
        if len(buffer) >= buffer_size:
            if args.threads <= 1:
                v = process_long_reads(buffer[:], rbe, args,
                                       fq_prof_pacbio_subreads, 'sub')
                do_long(v)
            else:
                p.apply_async(process_long_reads,
                              args=(buffer[:], rbe, args,
                                    fq_prof_pacbio_subreads, 'sub'),
                              callback=do_long)
            buffer = []
    if len(buffer) > 0:
        if args.threads <= 1:
            v = process_long_reads(buffer[:], rbe, args,
                                   fq_prof_pacbio_subreads, 'sub')
            do_long(v)
        else:
            p.apply_async(process_long_reads,
                          args=(buffer[:], rbe, args, fq_prof_pacbio_subreads,
                                'sub'),
                          callback=do_long)
        buffer = []
    if args.threads > 1:
        p.close()
        p.join()

    long_handle.close()
    of = open(args.output + "/LR_sub_report.txt", 'w')
    for name in greport:
        of.write("\t".join([str(x) for x in greport[name]]) + "\n")
    of.close()
    greport = {}
    sys.stderr.write("\nFinished sequencing long sub reads\n")

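    # merge the three per-read-type reports, summing the 'left' counts per name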
    combo = {}
    with open(args.output + "/SR_report.txt") as inf:
        for line in inf:
            f = line.rstrip().split("\t")
            [name, express, left] = f
            if name not in combo:
                combo[name] = {}
                combo[name]['express'] = express
                combo[name]['left'] = 0
            combo[name]['left'] += int(left)
    with open(args.output + "/LR_ccs_report.txt") as inf:
        for line in inf:
            f = line.rstrip().split("\t")
            [name, express, left] = f
            if name not in combo:
                combo[name] = {}
                combo[name]['express'] = express
                combo[name]['left'] = 0
            combo[name]['left'] += int(left)
    with open(args.output + "/LR_sub_report.txt") as inf:
        for line in inf:
            f = line.rstrip().split("\t")
            [name, express, left] = f
            if name not in combo:
                combo[name] = {}
                combo[name]['express'] = express
                combo[name]['left'] = 0
            combo[name]['left'] += int(left)
    of = open(args.output + "/LR_SR_combo_report.txt", 'w')
    for name in sorted(combo):
        of.write(name + "\t" + combo[name]['express'] + "\t" +
                 str(combo[name]['left']) + "\n")
    of.close()