def main(args):
  """Build a serialized transcriptome 'emitter' payload.

  Reads a reference fasta and a genePred (GPD) transcript annotation,
  attaches sequence to each transcript, optionally reads an expression
  table for per-transcript weights, then writes a base64(zlib(pickle))
  blob to stdout or ``args.output``.
  """
  sys.stderr.write("Reading reference fasta\n")
  # Close the reference handle once the data is in memory (was leaked).
  with open(args.reference_fasta,'rb') as ref_in:
    ref_genome = FastaData(ref_in.read())
  sys.stderr.write("Reading in transcriptome\n")
  output = {}
  txome = Transcriptome()
  z = 0
  with open(args.reference_gpd) as inf:
    for line in inf:
      z+=1
      if z%1000==0:  sys.stderr.write(str(z)+"       \r")  # progress tick
      gpd = GPD(line)
      gpd.set_sequence(ref_genome)
      txome.add_transcript(gpd)
  sys.stderr.write("\n")
  sys.stderr.write("Serializing transcriptome\n")
  output['txome'] = txome.dump_serialized()
  txweights = {}
  weight_type = 'uniform_distribution' #default
  if args.expression_table:
    weight_type = 'expression_table'
    # Transparently handle gzipped expression tables.
    if args.expression_table[-3:]=='.gz':
      inf = gzip.open(args.expression_table)
    else:
      inf = open(args.expression_table)
    # Table format: <transcript name>\t<weight>
    for line in inf:
      f = line.rstrip().split("\t")
      txweights[f[0]] = float(f[1])
    inf.close()  # fix: handle was previously never closed
  elif args.exponential_distribution: weight_type = 'exponential_distribution'
  output['weight_type'] = weight_type
  output['weights'] = txweights #only matters for expression based
  of = sys.stdout
  if args.output: of = open(args.output,'w')
  of.write(base64.b64encode(zlib.compress(pickle.dumps(output)))+"\n")
  of.close()


  # Temporary working directory step 3 of 3 - Cleanup
  if not args.specific_tempdir:
    rmtree(args.tempdir)
# ---- Example 2 ----
def main(args):
    """Simulate short/long reads from a serialized transcriptome emitter.

    Loads the base64(zlib(pickle)) payload produced by the emitter-building
    step, draws transcripts (weighted or uniform), optionally trims, flips
    and fragments each one, assigns qualities, injects sequencing errors and
    writes FASTQ to one (long-read) or up to two (paired short-read) files.
    """
    # check outputs
    if len(args.output) > 1 and not args.sr:
        sys.stderr.write(
            "Error: Long reads don't support multiple output files\n")
        sys.exit()
    elif len(args.output) > 2:
        # fix: was sys.stderr.wrtie (AttributeError whenever this fired)
        sys.stderr.write(
            "Error: Short reads support at most two output files (paired end)\n"
        )
        sys.exit()
    if args.sr_length < args.minimum_read_length:
        args.minimum_read_length = args.sr_length
    inf = sys.stdin
    if args.emitter != '-':
        inf = open(args.emitter)
    sys.stderr.write("reading in transcriptome emitter\n")
    indata = pickle.loads(
        zlib.decompress(base64.b64decode(inf.read().rstrip())))
    txome = Transcriptome()
    txome.load_serialized(indata['txome'])
    rnum = RandomSource()
    rnum_tx = RandomSource()  # for drawing transcripts
    if args.seed:
        rnum = RandomSource(args.seed)
        rnum_tx = RandomSource(args.seed)
    # Load in error profile data
    ep = None
    if args.error_profile:
        sys.stderr.write("read in error profile\n")
        ep = ErrorProfilePermuter(args.error_profile, rnum,
                                  args.skew_profile_error_rate)
    txemitter = TranscriptomeEmitter(txome, rand=rnum_tx)
    if indata['weight_type'] == 'expression_table':
        sys.stderr.write(
            "Using expression table defined transcript expression\n")
        txweight = indata['weights']
        txemitter.set_weights_by_dict(txweight)
    elif indata['weight_type'] == 'exponential_distribution':
        sys.stderr.write(
            "ERROR not yet implemented exponential distribution\n")
        sys.exit()
    elif indata['weight_type'] == 'uniform_distribution':
        sys.stderr.write(
            "Using uniform distribution of transcript expression\n")
    cutter = MakeCuts(rand=rnum_tx)
    if args.sr:
        cutter.set_custom(args.sr_gauss_min, args.sr_gauss_mu,
                          args.sr_gauss_sigma)
    elif args.lr:
        cutter.set_custom(args.lr_gauss_min, args.lr_gauss_mu,
                          args.lr_gauss_sigma)
    # Prepare outputs
    of1 = sys.stdout
    if args.output[0][-3:] == '.gz':
        of1 = gzip.open(args.output[0], 'w')
    elif args.output[0] != '-':
        of1 = open(args.output[0], 'w')
    of2 = None
    if len(args.output) > 1:
        if args.output[1][-3:] == '.gz':
            of2 = gzip.open(args.output[1], 'w')
        elif args.output[1] != '-':
            # fix: previously tested args.output[0] and opened args.ouptput
            of2 = open(args.output[1], 'w')
    of_origin = None
    if args.output_original_source:
        if args.output_original_source[-3:] == '.gz':
            of_origin = gzip.open(args.output_original_source, 'w')
        else:
            of_origin = open(args.output_original_source, 'w')
    of_sc = None
    if args.output_sequence_change:
        if args.output_sequence_change[-3:] == '.gz':
            of_sc = gzip.open(args.output_sequence_change, 'w')
        else:
            of_sc = open(args.output_sequence_change, 'w')

    # Safety valve: never attempt more than 100x the requested read count.
    absmax = args.count * 100
    finished_count = 0
    z = 0
    while finished_count < args.count:
        z += 1
        if z > absmax: break
        tx = txemitter.emit_transcript()
        seq = tx.get_sequence()
        stage1seq = seq
        if args.trim_5prime or args.trim_3prime:
            fivestart = 0
            threeend = len(seq)
            if args.trim_5prime:
                lcut = int(args.trim_5prime[0] * len(seq))
                rcut = int(args.trim_5prime[1] * len(seq))
                fivestart = rnum_tx.randint(lcut, rcut)
            if args.trim_3prime:
                lcut = int(args.trim_3prime[0] * len(seq))
                rcut = int(args.trim_3prime[1] * len(seq))
                threeend = rnum_tx.randint(lcut, rcut)
            # set sequence to its new trimmed bounds
            seq = seq[fivestart:threeend]

        # flip sequence if necessary
        if not args.no_flip:
            seq = random_flip(seq, rnum_tx)

        l_read = create_name(rnum)
        r_read = None
        if args.sr or args.lr:
            cutseq = cutter.get_cut(seq)
        else:
            cutseq = seq  #case for no_fragmentation
        ############# if we pass this we will really start with this one
        if len(cutseq) < args.minimum_read_length: continue
        # can now log our read name
        if of_origin:
            of_origin.write(l_read + "\t" + tx.get_gene_name() + "\t" +
                            tx.get_transcript_name() + "\n")
        stage2seq = cutseq
        r = None
        if args.sr:
            r_read = l_read
            l = cutseq[0:args.sr_length]
            r = rc(cutseq[-1 * args.sr_length:])
        elif args.lr:
            l = cutseq
        else:
            l = cutseq
        stage3left = l
        stage3right = r
        if not stage3right: stage3right = ''
        #################
        #  l (or l and r) contains the sequence prior to errors being added
        l_qual = 'I' * len(l)
        r_qual = None
        if r: r_qual = 'I' * len(r)
        if args.fixed_quality:
            #sys.stderr.write("Use fixed quality\n")
            if len(args.fixed_quality) != 1:
                sys.stderr.write(
                    "ERROR fixed quaility should be 1 character\n")
                sys.exit()
            l_qual = args.fixed_quality * len(l)
            if r: r_qual = args.fixed_quality * len(r)
        elif args.quality_from_error_rate:
            #sys.stderr.write("Set quality from error rate\n")
            # Phred encoding: Q = -10*log10(p), offset by 33 for ASCII.
            qchar = chr(
                int(-10 * math.log10(args.quality_from_error_rate)) + 33)
            l_qual = qchar * len(l)
            if r: r_qual = qchar * len(r)
        else:  #default is generate quality from profile
            if not ep:
                sys.stderr.write(
                    "ERROR: cannot generate quality from a profile.  Set error profile or chooce quaility from error rate or fixed quality\n"
                )
                sys.exit()
            l_qual = ep.emit_qual(len(l))
            if r: r_qual = ep.emit_qual(len(r))
        # Now prior to errors l_qual and r_qual contain our qualities

        l_fastq = Fastq([l_read, l, '+', l_qual])
        r_fastq = None
        if r:
            r_fastq = Fastq([r_read, r, '+', r_qual])
        # Permute sequences by a specific error rate
        if args.specific_errors:
            rate = args.specific_errors
            me = MakeErrors(rand=rnum)
            if args.specific_before_context:
                me.set_before_context(args.specific_before_context)
            if args.specific_after_context:
                me.set_after_context(args.specific_after_context)
            if args.specific_reference_base:
                if args.specific_reference_base != '-':
                    me.set_observed_base(args.specific_reference_base)
            if args.specific_modified_base:
                if args.specific_modified_base != '-':
                    me.set_modified_base(args.specific_modified_base)
            if args.specific_reference_base == '-':  #doing insertions
                l_fastq = me.random_insertion(l_fastq, rate)
                if r_fastq: r_fastq = me.random_insertion(r_fastq, rate)
            elif args.specific_modified_base == '-':  #doing deletions
                l_fastq = me.random_deletion(l_fastq, rate)
                # fix: right mate previously got random_insertion here
                if r_fastq: r_fastq = me.random_deletion(r_fastq, rate)
            else:
                l_fastq = me.random_substitution(l_fastq, rate)
                # fix: right mate previously got random_insertion here
                if r_fastq: r_fastq = me.random_substitution(r_fastq, rate)
        elif args.uniform_any_error:
            l_fastq = do_uniform_any(l_fastq, rnum, args.uniform_any_error)
            if r_fastq:
                r_fastq = do_uniform_any(r_fastq, rnum, args.uniform_any_error)
        elif args.uniform_mismatch_error:
            l_fastq = do_uniform_mismatch(l_fastq, rnum,
                                          args.uniform_mismatch_error)
            if r_fastq:
                r_fastq = do_uniform_mismatch(r_fastq, rnum,
                                              args.uniform_mismatch_error)
        elif args.any_error_by_quality:
            l_fastq = do_quality_any(l_fastq, rnum)
            if r_fastq: r_fastq = do_quality_any(r_fastq, rnum)
        elif args.mismatch_error_by_quality:
            l_fastq = do_quality_mismatch(l_fastq, rnum)
            if r_fastq: r_fastq = do_quality_mismatch(r_fastq, rnum)
        elif args.profile_context_error:
            l_fastq = ep.permute_context(l_fastq)
            if r_fastq: r_fastq = ep.permute_context(r_fastq)
        elif args.profile_general_error:
            l_fastq = ep.permute_general(l_fastq)
            if r_fastq: r_fastq = ep.permute_general(r_fastq)

        # if SR grown/shrink to appropriate length
        if args.sr and len(l_fastq) != args.sr_length:
            l_fastq = fit_length(l_fastq, args.sr_length, rnum)
        if r:
            if args.sr and len(r_fastq) != args.sr_length:
                r_fastq = fit_length(r_fastq, args.sr_length, rnum)

        of1.write(l_fastq.fastq())
        if of2:
            of2.write(r_fastq.fastq())

        stage4left = l_fastq.seq
        stage4right = ''
        # fix: assign before logging; previously set after the write so the
        # right-read column in the sequence-change log was always empty
        if r_fastq: stage4right = r_fastq.seq
        if of_sc:
            of_sc.write(l_fastq.name+"\t"+tx.get_gene_name()+"\t"+tx.get_transcript_name()+"\t" \
                      + stage1seq+"\t"+stage2seq+"\t"+stage3left+"\t"+stage3right+"\t"+stage4left+"\t"+stage4right+"\n")
        finished_count += 1
        if finished_count % 1000 == 0:
            sys.stderr.write(
                str(finished_count) + '/' + str(args.count) + "   \r")
    sys.stderr.write("\n")
    of1.close()
    if of2:
        of2.close()
    if of_origin:
        of_origin.close()
    if of_sc:
        of_sc.close()
    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)
# ---- Example 3 ----
def main(args):
  """Simulate short/long reads from a serialized transcriptome emitter.

  Loads the base64(zlib(pickle)) payload produced by the emitter-building
  step, draws transcripts (weighted or uniform), optionally trims, flips
  and fragments each one, assigns qualities, injects sequencing errors and
  writes FASTQ to one (long-read) or up to two (paired short-read) files.
  """
  # check outputs
  if len(args.output) > 1 and not args.sr:
    sys.stderr.write("Error: Long reads don't support multiple output files\n")
    sys.exit()
  elif len(args.output) > 2:
    # fix: was sys.stderr.wrtie (AttributeError whenever this fired)
    sys.stderr.write("Error: Short reads support at most two output files (paired end)\n")
    sys.exit()
  if args.sr_length < args.minimum_read_length:
    args.minimum_read_length = args.sr_length
  inf = sys.stdin
  if args.emitter != '-':
    inf = open(args.emitter)
  sys.stderr.write("reading in transcriptome emitter\n")
  indata = pickle.loads(zlib.decompress(base64.b64decode(inf.read().rstrip())))
  txome = Transcriptome()
  txome.load_serialized(indata['txome'])
  rnum = RandomSource()
  rnum_tx = RandomSource() # for drawing transcripts
  if args.seed: 
    rnum = RandomSource(args.seed)
    rnum_tx = RandomSource(args.seed)
  # Load in error profile data
  ep = None
  if args.error_profile:
    sys.stderr.write("read in error profile\n")
    ep = ErrorProfilePermuter(args.error_profile,rnum,args.skew_profile_error_rate)
  txemitter = TranscriptomeEmitter(txome,rand=rnum_tx)
  if indata['weight_type'] == 'expression_table':
    sys.stderr.write("Using expression table defined transcript expression\n")
    txweight = indata['weights']
    txemitter.set_weights_by_dict(txweight)
  elif indata['weight_type'] == 'exponential_distribution':
    sys.stderr.write("ERROR not yet implemented exponential distribution\n")
    sys.exit()
  elif indata['weight_type'] == 'uniform_distribution':
    sys.stderr.write("Using uniform distribution of transcript expression\n")
  cutter = MakeCuts(rand=rnum_tx)
  if args.sr:
    cutter.set_custom(args.sr_gauss_min,args.sr_gauss_mu,args.sr_gauss_sigma)
  elif args.lr:
    cutter.set_custom(args.lr_gauss_min,args.lr_gauss_mu,args.lr_gauss_sigma)
  # Prepare outputs
  of1 = sys.stdout
  if args.output[0][-3:] == '.gz':
    of1 = gzip.open(args.output[0],'w')
  elif args.output[0] != '-':
    of1 = open(args.output[0],'w')
  of2 = None
  if len(args.output) > 1:
    if args.output[1][-3:] == '.gz':
      of2 = gzip.open(args.output[1],'w')
    elif args.output[1] != '-':
      # fix: previously tested args.output[0] and opened args.ouptput (typo)
      of2 = open(args.output[1],'w')
  of_origin = None
  if args.output_original_source:
    if args.output_original_source[-3:]=='.gz':
      of_origin = gzip.open(args.output_original_source,'w')
    else:
      of_origin = open(args.output_original_source,'w')
  of_sc = None
  if args.output_sequence_change:
    if args.output_sequence_change[-3:]=='.gz':
      of_sc = gzip.open(args.output_sequence_change,'w')
    else:
      of_sc = open(args.output_sequence_change,'w')

  # Safety valve: never attempt more than 100x the requested read count.
  absmax = args.count*100
  finished_count = 0
  z = 0
  while finished_count < args.count:
    z += 1
    if z > absmax: break
    tx = txemitter.emit_transcript()
    seq = tx.get_sequence()
    stage1seq = seq
    if args.trim_5prime or args.trim_3prime:
      fivestart = 0
      threeend = len(seq)
      if args.trim_5prime:
        lcut = int(args.trim_5prime[0]*len(seq))
        rcut = int(args.trim_5prime[1]*len(seq))
        fivestart = rnum_tx.randint(lcut,rcut)
      if args.trim_3prime:
        lcut = int(args.trim_3prime[0]*len(seq))
        rcut = int(args.trim_3prime[1]*len(seq))
        threeend = rnum_tx.randint(lcut,rcut)
      # set sequence to its new trimmed bounds
      seq = seq[fivestart:threeend]

    # flip sequence if necessary
    if not args.no_flip:
      seq = random_flip(seq,rnum_tx)

    l_read = create_name(rnum)
    r_read = None
    if args.sr or args.lr:
      cutseq = cutter.get_cut(seq)
    else: cutseq = seq #case for no_fragmentation
    ############# if we pass this we will really start with this one
    if len(cutseq) < args.minimum_read_length: continue
    # can now log our read name
    if of_origin:
      of_origin.write(l_read+"\t"+tx.get_gene_name()+"\t"+tx.get_transcript_name()+"\n")
    stage2seq = cutseq
    r = None
    if args.sr:
      r_read = l_read
      l = cutseq[0:args.sr_length]
      r = rc(cutseq[-1*args.sr_length:])
    elif args.lr:
      l = cutseq
    else: l = cutseq
    stage3left = l
    stage3right = r
    if not stage3right: stage3right = ''
    #################
    #  l (or l and r) contains the sequence prior to errors being added
    l_qual = 'I'*len(l)
    r_qual = None
    if r: r_qual = 'I'*len(r)
    if args.fixed_quality:
      #sys.stderr.write("Use fixed quality\n")
      if len(args.fixed_quality) != 1:
        sys.stderr.write("ERROR fixed quaility should be 1 character\n")
        sys.exit()
      l_qual = args.fixed_quality*len(l)
      if r: r_qual = args.fixed_quality*len(r)
    elif args.quality_from_error_rate:
      #sys.stderr.write("Set quality from error rate\n")
      # Phred encoding: Q = -10*log10(p), offset by 33 for ASCII.
      qchar = chr(int(-10*math.log10(args.quality_from_error_rate))+33)
      l_qual = qchar*len(l)
      if r: r_qual = qchar*len(r)
    else: #default is generate quality from profile
      if not ep:
        sys.stderr.write("ERROR: cannot generate quality from a profile.  Set error profile or chooce quaility from error rate or fixed quality\n")
        sys.exit()
      l_qual = ep.emit_qual(len(l))
      if r: r_qual = ep.emit_qual(len(r))
    # Now prior to errors l_qual and r_qual contain our qualities

    l_fastq = Fastq([l_read,l,'+',l_qual])
    r_fastq = None
    if r:
      r_fastq = Fastq([r_read,r,'+',r_qual])
    # Permute sequences by a specific error rate
    if args.specific_errors:
      rate = args.specific_errors
      me = MakeErrors(rand=rnum)
      if args.specific_before_context: me.set_before_context(args.specific_before_context)
      if args.specific_after_context: me.set_after_context(args.specific_after_context)
      if args.specific_reference_base: 
        if args.specific_reference_base != '-':
          me.set_observed_base(args.specific_reference_base)
      if args.specific_modified_base: 
        if args.specific_modified_base != '-':
          me.set_modified_base(args.specific_modified_base)
      if args.specific_reference_base == '-': #doing insertions
        l_fastq = me.random_insertion(l_fastq,rate)
        if r_fastq: r_fastq = me.random_insertion(r_fastq,rate)
      elif args.specific_modified_base == '-': #doing deletions
        l_fastq = me.random_deletion(l_fastq,rate)
        # fix: right mate previously got random_insertion here
        if r_fastq: r_fastq = me.random_deletion(r_fastq,rate)
      else:
        l_fastq = me.random_substitution(l_fastq,rate)
        # fix: right mate previously got random_insertion here
        if r_fastq: r_fastq = me.random_substitution(r_fastq,rate)
    elif args.uniform_any_error:
      l_fastq = do_uniform_any(l_fastq,rnum,args.uniform_any_error)
      if r_fastq: r_fastq = do_uniform_any(r_fastq,rnum,args.uniform_any_error)
    elif args.uniform_mismatch_error:
      l_fastq = do_uniform_mismatch(l_fastq,rnum,args.uniform_mismatch_error)
      if r_fastq: r_fastq = do_uniform_mismatch(r_fastq,rnum,args.uniform_mismatch_error)
    elif args.any_error_by_quality:
      l_fastq = do_quality_any(l_fastq,rnum)
      if r_fastq: r_fastq = do_quality_any(r_fastq,rnum)
    elif args.mismatch_error_by_quality:
      l_fastq = do_quality_mismatch(l_fastq,rnum)
      if r_fastq: r_fastq = do_quality_mismatch(r_fastq,rnum)
    elif args.profile_context_error:
      l_fastq = ep.permute_context(l_fastq)
      if r_fastq: r_fastq = ep.permute_context(r_fastq)
    elif args.profile_general_error:
      l_fastq = ep.permute_general(l_fastq)
      if r_fastq: r_fastq = ep.permute_general(r_fastq)

    # if SR grown/shrink to appropriate length
    if args.sr and len(l_fastq) != args.sr_length:
      l_fastq = fit_length(l_fastq,args.sr_length,rnum)
    if r:
      if args.sr and len(r_fastq) != args.sr_length:
        r_fastq = fit_length(r_fastq,args.sr_length,rnum)

    of1.write(l_fastq.fastq())
    if of2: 
      of2.write(r_fastq.fastq())

    stage4left = l_fastq.seq
    stage4right = ''
    # fix: assign before logging; previously set after the write so the
    # right-read column in the sequence-change log was always empty
    if r_fastq: stage4right = r_fastq.seq
    if of_sc:
      of_sc.write(l_fastq.name+"\t"+tx.get_gene_name()+"\t"+tx.get_transcript_name()+"\t" \
                + stage1seq+"\t"+stage2seq+"\t"+stage3left+"\t"+stage3right+"\t"+stage4left+"\t"+stage4right+"\n")
    finished_count += 1
    if finished_count %1000==0: sys.stderr.write(str(finished_count)+'/'+str(args.count)+"   \r")
  sys.stderr.write("\n")
  of1.close()
  if of2:
    of2.close()
  if of_origin:
    of_origin.close()
  if of_sc:
    of_sc.close()
  # Temporary working directory step 3 of 3 - Cleanup
  if not args.specific_tempdir:
    rmtree(args.tempdir)