Example #1
 def __init__(self,alignment,min_intron_size=68):
   #self._alns = []
   self._min_intron_size=min_intron_size
   self._aligned_query = None
   self._hpas = []
   self._has_quality = False # can be changed when add_alignment uses one that has quality
   self._alignment = alignment
   self._quality_distro = None # gets set by analyze_quality
   self._deletion_type = None
   self._query_errors = None
   self._target_errors = None
   self._context_query_errors = None
   self._context_target_errors = None
   astrings = self._alignment.get_alignment_strings(min_intron_size=self._min_intron_size)
   if self._alignment.get_query_quality(): self._has_quality = True
   if len(astrings) == 0: return None
   alns = []
   for i in range(len(astrings[0])):
     if self._alignment.get_strand() == '+':
       alns.append({'query':astrings[0][i],'target':astrings[1][i],'query_quality':astrings[2][i]})
     else:
       alns.insert(0,{'query':rc(astrings[0][i]),'target':rc(astrings[1][i]),'query_quality':astrings[2][i][::-1]})
   #if self._alignment.get_strand() == '-':
   #  alns = alns[::-1]
   #get homopolymer alignments
   self._hpas = self._misalign_split(alns) # split alignment into homopolymer groups
   self._query_hpas = []
   self._target_hpas = []
   qi = 0
   for i in range(len(self._hpas)):
     prev = None
     if i > 0: prev = self._hpas[i-1]
     foll = None
     if i + 1 < len(self._hpas): foll = self._hpas[i+1]
     qlen = len(self._hpas[i].get_query())
     for j in range(0,qlen):
       self._query_hpas.append({'hpa':self._hpas[i],'pos':j,'prev-hpa':prev,'next-hpa':foll})
     qi+=qlen
   ti = 0
   for i in range(len(self._hpas)):
     prev = None
     if i > 0: prev = self._hpas[i-1]
     foll = None
     if i + 1 < len(self._hpas): foll = self._hpas[i+1]
     tlen = len(self._hpas[i].get_target())
     for j in range(0,tlen):
       self._target_hpas.append({'hpa':self._hpas[i],'pos':j,'prev-hpa':prev,'next-hpa':foll})
     ti+=tlen
   self._target_errors = self.get_target_errors()
   self._query_errors = self.get_query_errors()  
   self._context_target_errors = self.get_context_target_errors()
Example #2
 def get_SAM(self, min_intron_size=68):
     from Bio.Format.Sam import SAM
     #ar is target then query
     qname = self.get_alignment_ranges()[0][1].chr
     flag = 0
     if self.get_strand() == '-': flag = 16
     rname = self.get_alignment_ranges()[0][0].chr
     pos = self.get_alignment_ranges()[0][0].start
     mapq = 255
     cigar = self.construct_cigar(min_intron_size)
     rnext = '*'
     pnext = 0
     tlen = self.get_target_range().length()
     seq = self.get_query_sequence()
     if not seq: seq = '*'
     qual = self.get_query_quality()
     if not qual: qual = '*'
     #seq = '*'
     #qual = '*'
     if self.get_strand() == '-':
         seq = rc(seq)
         qual = qual[::-1]
     ln = qname + "\t" + str(flag) + "\t" + rname + "\t" + \
          str(pos) + "\t" + str(mapq) + "\t" + cigar + "\t" + \
          rnext + "\t" + str(pnext) + "\t" + str(tlen) + "\t" + \
          seq + "\t" + qual
     return SAM(ln, reference=self._reference)
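get_SAM above assembles the eleven mandatory SAM columns (QNAME, FLAG, RNAME, POS, MAPQ, CIGAR, RNEXT, PNEXT, TLEN, SEQ, QUAL) into one tab-delimited record. A minimal sketch of the same assembly using str.join; the field values here are hypothetical placeholders, not the project's accessors:

# Minimal sketch only: builds a SAM-style line from placeholder values.
fields = ['read1', 0, 'chr1', 100, 255, '50M', '*', 0, 50, 'A' * 50, 'I' * 50]
ln = "\t".join(str(f) for f in fields)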
Example #3
 def get_SAM(self,min_intron_size=68):
   from Bio.Format.Sam import SAM
   #ar is target then query
   qname = self.get_alignment_ranges()[0][1].chr
   flag = 0
   if self.get_strand() == '-': flag = 16
   rname = self.get_alignment_ranges()[0][0].chr
   pos = self.get_alignment_ranges()[0][0].start
   mapq = 255
   cigar = self.construct_cigar(min_intron_size)
   rnext = '*'
   pnext = 0
   tlen = self.get_target_range().length()
   seq = self.get_query_sequence()
   if not seq: seq = '*'
   qual = self.get_query_quality()
   if not qual: qual = '*'
   #seq = '*'
   #qual = '*'
   if self.get_strand() == '-':
     seq = rc(seq)
     qual = qual[::-1]
   ln = qname + "\t" + str(flag) + "\t" + rname + "\t" + \
        str(pos) + "\t" + str(mapq) + "\t" + cigar + "\t" + \
        rnext + "\t" + str(pnext) + "\t" + str(tlen) + "\t" + \
        seq + "\t" + qual
   return SAM(ln,reference=self._reference)
Example #4
def random_flip(sequence, rnum=None):
    randin = rnum
    if not randin:
        randin = RandomSource()
    if randin.random() < 0.5:
        return rc(sequence)
    return sequence
Example #5
 def get_alignment_strings(self, min_intron_size=68):
     qseq = self.get_query_sequence()
     if not qseq:
          sys.exit(
              "ERROR: Query sequence must be accessible to get alignment strings\n"
          )
     ref = self.get_reference()
     qual = self.get_query_quality()
     if not qual:
         qual = 'I' * len(qseq)  # for a placeholder quality
     if self.get_strand() == '-':
         qseq = rc(qseq)
         qual = qual[::-1]
     tarr = []
     qarr = []
     yarr = []
     tdone = ''
     qdone = ''
     ydone = ''  #query quality
     for i in range(len(self.get_alignment_ranges())):
         [t, q] = self.get_alignment_ranges()[i]
         textra = ''
         qextra = ''
         yextra = ''
         if i >= 1:
             dift = t.start - self.get_alignment_ranges()[i - 1][0].end - 1
             difq = q.start - self.get_alignment_ranges()[i - 1][1].end - 1
             if dift < min_intron_size:
                 if dift > 0:
                     textra = ref[t.chr][t.start - dift - 1:t.start -
                                         1].upper()
                     qextra = '-' * dift
                     yextra = '\0' * dift
                 elif difq > 0:
                     textra = '-' * difq
                     qextra = qseq[q.start - difq - 1:q.start - 1].upper()
                     yextra = qual[q.start - difq - 1:q.start - 1]
             else:
                 tarr.append(tdone)
                 qarr.append(qdone)
                 yarr.append(ydone)
                 tdone = ''
                 qdone = ''
                 ydone = ''
         tdone += textra + ref[t.chr][t.start - 1:t.end].upper()
         qdone += qextra + qseq[q.start - 1:q.end].upper()
         ydone += yextra + qual[q.start - 1:q.end]
     if len(tdone) > 0:
         tarr.append(tdone)
         qarr.append(qdone)
         yarr.append(ydone)
     if self.get_query_quality() == '*':
         yarr = [x.replace('I', ' ') for x in yarr]
     #query, target, query_quality
     return [qarr, tarr, yarr]
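get_alignment_strings returns three parallel lists of gapped strings (query, target, query quality), starting a new element whenever the target gap reaches min_intron_size. A hypothetical usage sketch, where aln stands in for any alignment object that defines the method above:

# Hypothetical usage; `aln` is assumed to expose get_alignment_strings() as defined above.
qarr, tarr, yarr = aln.get_alignment_strings(min_intron_size=68)
for qseg, tseg, yseg in zip(qarr, tarr, yarr):
    # the three segments are built to the same length, with '-' marking gaps
    mismatches = sum(1 for qb, tb in zip(qseg, tseg)
                     if qb != tb and qb != '-' and tb != '-')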
Example #6
 def set_sequence(self,ref_dict):
   strand = '+'
   if not self._direction:
     sys.stderr.write("WARNING: no strand information for the transcript\n")
   if self._direction: strand = self._direction
   chr = self.get_chrom()
   seq = ''
   for e in [x.get_range() for x in self.exons]:
     seq += ref_dict[chr][e.start-1:e.end]
   if strand == '-':  seq = rc(seq)
   self._sequence = seq.upper()
Example #7
 def get_alignment_strings(self,min_intron_size=68):
   qseq = self.get_query_sequence()
   if not qseq:
     sys.exit("ERROR: Query sequence must be accessable to get alignment strings\n")
     sys.exit()
   ref = self.get_reference()
   qual = self.get_query_quality()
   if not qual: 
     qual = 'I'*len(qseq) # for a placeholder quality
   if self.get_strand() == '-': 
     qseq = rc(qseq)
     qual = qual[::-1]
   tarr = []
   qarr = []
   yarr = []
   tdone = ''
   qdone = ''
   ydone = '' #query quality
   for i in range(len(self.get_alignment_ranges())):
     [t,q] = self.get_alignment_ranges()[i]
     textra = ''
     qextra = ''
     yextra = ''
     if i >= 1:
       dift = t.start-self.get_alignment_ranges()[i-1][0].end-1
       difq = q.start-self.get_alignment_ranges()[i-1][1].end-1
       if dift < min_intron_size:
         if dift > 0:
           textra = ref[t.chr][t.start-dift-1:t.start-1].upper()
           qextra = '-'*dift
           yextra = '\0'*dift
         elif difq > 0:
           textra = '-'*difq
           qextra = qseq[q.start-difq-1:q.start-1].upper()
           yextra = qual[q.start-difq-1:q.start-1]
       else:
         tarr.append(tdone)
         qarr.append(qdone)
         yarr.append(ydone)
         tdone = ''
         qdone = ''
         ydone = ''
     tdone += textra+ref[t.chr][t.start-1:t.end].upper()
     qdone += qextra+qseq[q.start-1:q.end].upper()
     ydone += yextra+qual[q.start-1:q.end]
   if len(tdone) > 0: 
     tarr.append(tdone)
     qarr.append(qdone)
     yarr.append(ydone)
   if self.get_query_quality() == '*': yarr = [x.replace('I',' ') for x in yarr]
   #query, target, query_quality
   return [qarr,tarr,yarr]
Example #8
 def set_sequence(self, ref_dict):
     self._initialize()
     strand = '+'
     if not self._direction:
         sys.stderr.write(
             "WARNING: no strand information for the transcript\n")
     if self._direction: strand = self._direction
     chr = self.get_chrom()
     seq = ''
     for e in [x.get_range() for x in self.exons]:
         seq += ref_dict[chr][e.start - 1:e.end]
     if strand == '-': seq = rc(seq)
     self._sequence = seq.upper()
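set_sequence concatenates the exon slices of ref_dict (1-based inclusive coordinates, hence the e.start - 1 in the slice) and reverse-complements the result for '-' strand transcripts. A hypothetical usage sketch; ref_dict simply maps chromosome names to full sequence strings, and tx stands in for a transcript object from this project:

# Hypothetical usage; `tx` is assumed to be a transcript with exons and strand already set.
ref_dict = {'chr1': 'ACGTACGTACGT' * 1000}   # chromosome name -> full sequence string
tx.set_sequence(ref_dict)
seq = tx.get_sequence()   # uppercase; reverse-complemented when the strand is '-'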
Example #9
def main(args):
    # check outputs
    if len(args.output) > 1 and not args.sr:
        sys.stderr.write(
            "Error: Long reads don't support multiple output files\n")
        sys.exit()
    elif len(args.output) > 2:
        sys.stderr.write(
            "Error: Short reads support at most two output files (paired end)\n"
        )
        sys.exit()
    if args.sr_length < args.minimum_read_length:
        args.minimum_read_length = args.sr_length
    inf = sys.stdin
    if args.emitter != '-':
        inf = open(args.emitter)
    sys.stderr.write("reading in transcriptome emitter\n")
    indata = pickle.loads(
        zlib.decompress(base64.b64decode(inf.read().rstrip())))
    txome = Transcriptome()
    txome.load_serialized(indata['txome'])
    rnum = RandomSource()
    rnum_tx = RandomSource()  # for drawing transcripts
    if args.seed:
        rnum = RandomSource(args.seed)
        rnum_tx = RandomSource(args.seed)
    # Load in error profile data
    ep = None
    if args.error_profile:
        sys.stderr.write("read in error profile\n")
        ep = ErrorProfilePermuter(args.error_profile, rnum,
                                  args.skew_profile_error_rate)
    txemitter = TranscriptomeEmitter(txome, rand=rnum_tx)
    if indata['weight_type'] == 'expression_table':
        sys.stderr.write(
            "Using expression table defined transcript expression\n")
        txweight = indata['weights']
        txemitter.set_weights_by_dict(txweight)
    elif indata['weight_type'] == 'exponential_distribution':
        sys.stderr.write(
            "ERROR not yet implemented exponential distribution\n")
        sys.exit()
    elif indata['weight_type'] == 'uniform_distribution':
        sys.stderr.write(
            "Using uniform distribution of transcript expression\n")
    cutter = MakeCuts(rand=rnum_tx)
    if args.sr:
        cutter.set_custom(args.sr_gauss_min, args.sr_gauss_mu,
                          args.sr_gauss_sigma)
    elif args.lr:
        cutter.set_custom(args.lr_gauss_min, args.lr_gauss_mu,
                          args.lr_gauss_sigma)
    # Prepare outputs
    of1 = sys.stdout
    if args.output[0][-3:] == '.gz':
        of1 = gzip.open(args.output[0], 'w')
    elif args.output[0] != '-':
        of1 = open(args.output[0], 'w')
    of2 = None
    if len(args.output) > 1:
        if args.output[1][-3:] == '.gz':
            of2 = gzip.open(args.output[1], 'w')
        elif args.output[1] != '-':
            of2 = open(args.output[1], 'w')
    of_origin = None
    if args.output_original_source:
        if args.output_original_source[-3:] == '.gz':
            of_origin = gzip.open(args.output_original_source, 'w')
        else:
            of_origin = open(args.output_original_source, 'w')
    of_sc = None
    if args.output_sequence_change:
        if args.output_sequence_change[-3:] == '.gz':
            of_sc = gzip.open(args.output_sequence_change, 'w')
        else:
            of_sc = open(args.output_sequence_change, 'w')

    absmax = args.count * 100
    finished_count = 0
    z = 0
    while finished_count < args.count:
        z += 1
        if z > absmax: break
        tx = txemitter.emit_transcript()
        seq = tx.get_sequence()
        stage1seq = seq
        if args.trim_5prime or args.trim_3prime:
            fivestart = 0
            threeend = len(seq)
            if args.trim_5prime:
                lcut = int(args.trim_5prime[0] * len(seq))
                rcut = int(args.trim_5prime[1] * len(seq))
                fivestart = rnum_tx.randint(lcut, rcut)
            if args.trim_3prime:
                lcut = int(args.trim_3prime[0] * len(seq))
                rcut = int(args.trim_3prime[1] * len(seq))
                threeend = rnum_tx.randint(lcut, rcut)
            # set sequence to its new trimmed bounds
            seq = seq[fivestart:threeend]

        # flip sequence if necessary
        if not args.no_flip:
            seq = random_flip(seq, rnum_tx)

        l_read = create_name(rnum)
        r_read = None
        if args.sr or args.lr:
            cutseq = cutter.get_cut(seq)
        else:
            cutseq = seq  #case for no_fragmentation
        ############# if we pass this we will really start with this one
        if len(cutseq) < args.minimum_read_length: continue
        # can now log our read name
        if of_origin:
            of_origin.write(l_read + "\t" + tx.get_gene_name() + "\t" +
                            tx.get_transcript_name() + "\n")
        stage2seq = cutseq
        r = None
        if args.sr:
            r_read = l_read
            l = cutseq[0:args.sr_length]
            r = rc(cutseq[-1 * args.sr_length:])
        elif args.lr:
            l = cutseq
        else:
            l = cutseq
        stage3left = l
        stage3right = r
        if not stage3right: stage3right = ''
        #################
        #  l (or l and r) contains the sequence prior to errors being added
        l_qual = 'I' * len(l)
        r_qual = None
        if r: r_qual = 'I' * len(r)
        if args.fixed_quality:
            #sys.stderr.write("Use fixed quality\n")
            if len(args.fixed_quality) != 1:
                sys.stderr.write(
                    "ERROR fixed quaility should be 1 character\n")
                sys.exit()
            l_qual = args.fixed_quality * len(l)
            if r: r_qual = args.fixed_quality * len(r)
        elif args.quality_from_error_rate:
            #sys.stderr.write("Set quality from error rate\n")
            qchar = chr(
                int(-10 * math.log10(args.quality_from_error_rate)) + 33)
            l_qual = qchar * len(l)
            if r: r_qual = qchar * len(r)
        else:  #default is generate quality from profile
            if not ep:
                sys.stderr.write(
                    "ERROR: cannot generate quality from a profile.  Set error profile or chooce quaility from error rate or fixed quality\n"
                )
                sys.exit()
            l_qual = ep.emit_qual(len(l))
            if r: r_qual = ep.emit_qual(len(r))
        # Now prior to errors l_qual and r_qual contain our qualities

        l_fastq = Fastq([l_read, l, '+', l_qual])
        r_fastq = None
        if r:
            r_fastq = Fastq([r_read, r, '+', r_qual])
        # Permute sequences by a specific error rate
        if args.specific_errors:
            rate = args.specific_errors
            me = MakeErrors(rand=rnum)
            if args.specific_before_context:
                me.set_before_context(args.specific_before_context)
            if args.specific_after_context:
                me.set_after_context(args.specific_after_context)
            if args.specific_reference_base:
                if args.specific_reference_base != '-':
                    me.set_observed_base(args.specific_reference_base)
            if args.specific_modified_base:
                if args.specific_modified_base != '-':
                    me.set_modified_base(args.specific_modified_base)
            if args.specific_reference_base == '-':  #doing insertions
                l_fastq = me.random_insertion(l_fastq, rate)
                if r_fastq: r_fastq = me.random_insertion(r_fastq, rate)
            elif args.specific_modified_base == '-':  #doing deletions
                l_fastq = me.random_deletion(l_fastq, rate)
                if r_fastq: r_fastq = me.random_deletion(r_fastq, rate)
            else:
                l_fastq = me.random_substitution(l_fastq, rate)
                if r_fastq: r_fastq = me.random_substitution(r_fastq, rate)
        elif args.uniform_any_error:
            l_fastq = do_uniform_any(l_fastq, rnum, args.uniform_any_error)
            if r_fastq:
                r_fastq = do_uniform_any(r_fastq, rnum, args.uniform_any_error)
        elif args.uniform_mismatch_error:
            l_fastq = do_uniform_mismatch(l_fastq, rnum,
                                          args.uniform_mismatch_error)
            if r_fastq:
                r_fastq = do_uniform_mismatch(r_fastq, rnum,
                                              args.uniform_mismatch_error)
        elif args.any_error_by_quality:
            l_fastq = do_quality_any(l_fastq, rnum)
            if r_fastq: r_fastq = do_quality_any(r_fastq, rnum)
        elif args.mismatch_error_by_quality:
            l_fastq = do_quality_mismatch(l_fastq, rnum)
            if r_fastq: r_fastq = do_quality_mismatch(r_fastq, rnum)
        elif args.profile_context_error:
            l_fastq = ep.permute_context(l_fastq)
            if r_fastq: r_fastq = ep.permute_context(r_fastq)
        elif args.profile_general_error:
            l_fastq = ep.permute_general(l_fastq)
            if r_fastq: r_fastq = ep.permute_general(r_fastq)

        # if SR grown/shrink to appropriate length
        if args.sr and len(l_fastq) != args.sr_length:
            l_fastq = fit_length(l_fastq, args.sr_length, rnum)
        if r:
            if args.sr and len(r_fastq) != args.sr_length:
                r_fastq = fit_length(r_fastq, args.sr_length, rnum)

        of1.write(l_fastq.fastq())
        if of2:
            of2.write(r_fastq.fastq())

        stage4left = l_fastq.seq
        stage4right = ''
        if r_fastq: stage4right = r_fastq.seq
        if of_sc:
            of_sc.write(l_fastq.name+"\t"+tx.get_gene_name()+"\t"+tx.get_transcript_name()+"\t" \
                      + stage1seq+"\t"+stage2seq+"\t"+stage3left+"\t"+stage3right+"\t"+stage4left+"\t"+stage4right+"\n")
        finished_count += 1
        if finished_count % 1000 == 0:
            sys.stderr.write(
                str(finished_count) + '/' + str(args.count) + "   \r")
    sys.stderr.write("\n")
    of1.close()
    if of2:
        of2.close()
    if of_origin:
        of_origin.close()
    if of_sc:
        of_sc.close()
    # Temporary working directory step 3 of 3 - Cleanup
    if not args.specific_tempdir:
        rmtree(args.tempdir)
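The quality_from_error_rate branch above encodes a single error probability as a Phred+33 character via chr(int(-10 * math.log10(p)) + 33). A standalone sketch of that conversion:

import math

# Phred+33 encoding as used above: Q = -10 * log10(p), then offset by 33 (ASCII '!').
def phred33_char(p):
    return chr(int(-10 * math.log10(p)) + 33)

phred33_char(0.01)   # Q20 -> chr(53) == '5'
phred33_char(0.001)  # Q30 -> chr(63) == '?'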
Example #10
File: Sam.py Project: ocxtal/AlignQC
 def get_query_sequence(self):
     if self.value('seq') == '*': return None
     if self.check_flag(0x10): return rc(self.value('seq'))
     return self.value('seq')
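get_query_sequence checks SAM flag bit 0x10, which marks a reverse-strand alignment whose SEQ field is stored reverse-complemented; rc() restores the original query orientation. A minimal illustration of the same bit test, with a hypothetical flag value:

# Hypothetical flag value; bit 0x10 (decimal 16) means the read aligned to the reverse strand.
flag = 16
is_reverse = bool(flag & 0x10)   # True -> the stored SEQ must be reverse-complemented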
Example #11
 def get_query_sequence(self):
   if self.value('seq') == '*': return None
   if self.check_flag(0x10): return rc(self.value('seq'))
   return self.value('seq')
Example #12
def main(args):
  # check outputs
  if len(args.output) > 1 and not args.sr:
    sys.stderr.write("Error: Long reads don't support multiple output files\n")
    sys.exit()
  elif len(args.output) > 2:
    sys.stderr.write("Error: Short reads support at most two output files (paired end)\n")
    sys.exit()
  if args.sr_length < args.minimum_read_length:
    args.minimum_read_length = args.sr_length
  inf = sys.stdin
  if args.emitter != '-':
    inf = open(args.emitter)
  sys.stderr.write("reading in transcriptome emitter\n")
  indata = pickle.loads(zlib.decompress(base64.b64decode(inf.read().rstrip())))
  txome = Transcriptome()
  txome.load_serialized(indata['txome'])
  rnum = RandomSource()
  rnum_tx = RandomSource() # for drawing transcripts
  if args.seed: 
    rnum = RandomSource(args.seed)
    rnum_tx = RandomSource(args.seed)
  # Load in error profile data
  ep = None
  if args.error_profile:
    sys.stderr.write("read in error profile\n")
    ep = ErrorProfilePermuter(args.error_profile,rnum,args.skew_profile_error_rate)
  txemitter = TranscriptomeEmitter(txome,rand=rnum_tx)
  if indata['weight_type'] == 'expression_table':
    sys.stderr.write("Using expression table defined transcript expression\n")
    txweight = indata['weights']
    txemitter.set_weights_by_dict(txweight)
  elif indata['weight_type'] == 'exponential_distribution':
    sys.stderr.write("ERROR not yet implemented exponential distribution\n")
    sys.exit()
  elif indata['weight_type'] == 'uniform_distribution':
    sys.stderr.write("Using uniform distribution of transcript expression\n")
  cutter = MakeCuts(rand=rnum_tx)
  if args.sr:
    cutter.set_custom(args.sr_gauss_min,args.sr_gauss_mu,args.sr_gauss_sigma)
  elif args.lr:
    cutter.set_custom(args.lr_gauss_min,args.lr_gauss_mu,args.lr_gauss_sigma)
  # Prepare outputs
  of1 = sys.stdout
  if args.output[0][-3:] == '.gz':
    of1 = gzip.open(args.output[0],'w')
  elif args.output[0] != '-':
    of1 = open(args.output[0],'w')
  of2 = None
  if len(args.output) > 1:
    if args.output[1][-3:] == '.gz':
      of2 = gzip.open(args.output[1],'w')
    elif args.output[1] != '-':
      of2 = open(args.output[1],'w')
  of_origin = None
  if args.output_original_source:
    if args.output_original_source[-3:]=='.gz':
      of_origin = gzip.open(args.output_original_source,'w')
    else:
      of_origin = open(args.output_original_source,'w')
  of_sc = None
  if args.output_sequence_change:
    if args.output_sequence_change[-3:]=='.gz':
      of_sc = gzip.open(args.output_sequence_change,'w')
    else:
      of_sc = open(args.output_sequence_change,'w')
  
  absmax = args.count*100
  finished_count = 0
  z = 0
  while finished_count < args.count:
    z += 1
    if z > absmax: break
    tx = txemitter.emit_transcript()
    seq = tx.get_sequence()
    stage1seq = seq
    if args.trim_5prime or args.trim_3prime:
      fivestart = 0
      threeend = len(seq)
      if args.trim_5prime:
        lcut = int(args.trim_5prime[0]*len(seq))
        rcut = int(args.trim_5prime[1]*len(seq))
        fivestart = rnum_tx.randint(lcut,rcut)
      if args.trim_3prime:
        lcut = int(args.trim_3prime[0]*len(seq))
        rcut = int(args.trim_3prime[1]*len(seq))
        threeend = rnum_tx.randint(lcut,rcut)
      # set sequence to its new trimmed bounds
      seq = seq[fivestart:threeend]

    # flip sequence if necessary
    if not args.no_flip:
      seq = random_flip(seq,rnum_tx)

    l_read = create_name(rnum)
    r_read = None
    if args.sr or args.lr:
      cutseq = cutter.get_cut(seq)
    else: cutseq = seq #case for no_fragmentation
    ############# if we pass this we will really start with this one
    if len(cutseq) < args.minimum_read_length: continue
    # can now log our read name
    if of_origin:
      of_origin.write(l_read+"\t"+tx.get_gene_name()+"\t"+tx.get_transcript_name()+"\n")
    stage2seq = cutseq
    r = None
    if args.sr:
      r_read = l_read
      l = cutseq[0:args.sr_length]
      r = rc(cutseq[-1*args.sr_length:])
    elif args.lr:
      l = cutseq
    else: l = cutseq
    stage3left = l
    stage3right = r
    if not stage3right: stage3right = ''
    #################
    #  l (or l and r) contains the sequence prior to errors being added
    l_qual = 'I'*len(l) 
    r_qual = None
    if r: r_qual = 'I'*len(r)
    if args.fixed_quality:
      #sys.stderr.write("Use fixed quality\n")
      if len(args.fixed_quality) != 1:
        sys.stderr.write("ERROR fixed quaility should be 1 character\n")
        sys.exit()
      l_qual = args.fixed_quality*len(l)
      if r: r_qual = args.fixed_quality*len(r)
    elif args.quality_from_error_rate:
      #sys.stderr.write("Set quality from error rate\n")
      qchar = chr(int(-10*math.log10(args.quality_from_error_rate))+33)
      l_qual = qchar*len(l)
      if r: r_qual = qchar*len(r)
    else: #default is generate quality from profile
      if not ep:
        sys.stderr.write("ERROR: cannot generate quality from a profile.  Set error profile or chooce quaility from error rate or fixed quality\n")
        sys.exit()
      l_qual = ep.emit_qual(len(l))
      if r: r_qual = ep.emit_qual(len(r))
    # Now prior to errors l_qual and r_qual contain our qualities

    l_fastq = Fastq([l_read,l,'+',l_qual])
    r_fastq = None
    if r:
      r_fastq = Fastq([r_read,r,'+',r_qual])
    # Permute sequences by a specific error rate
    if args.specific_errors:
      rate = args.specific_errors
      me = MakeErrors(rand=rnum)
      if args.specific_before_context: me.set_before_context(args.specific_before_context)
      if args.specific_after_context: me.set_after_context(args.specific_after_context)
      if args.specific_reference_base: 
        if args.specific_reference_base != '-':
          me.set_observed_base(args.specific_reference_base)
      if args.specific_modified_base: 
        if args.specific_modified_base != '-':
          me.set_modified_base(args.specific_modified_base)
      if args.specific_reference_base == '-': #doing insertions
        l_fastq = me.random_insertion(l_fastq,rate)
        if r_fastq: r_fastq = me.random_insertion(r_fastq,rate)
      elif args.specific_modified_base == '-': #doing deletions
        l_fastq = me.random_deletion(l_fastq,rate)
        if r_fastq: r_fastq = me.random_deletion(r_fastq,rate)
      else:
        l_fastq = me.random_substitution(l_fastq,rate)
        if r_fastq: r_fastq = me.random_substitution(r_fastq,rate)
    elif args.uniform_any_error:
      l_fastq = do_uniform_any(l_fastq,rnum,args.uniform_any_error)
      if r_fastq: r_fastq = do_uniform_any(r_fastq,rnum,args.uniform_any_error)  
    elif args.uniform_mismatch_error:
      l_fastq = do_uniform_mismatch(l_fastq,rnum,args.uniform_mismatch_error)
      if r_fastq: r_fastq = do_uniform_mismatch(r_fastq,rnum,args.uniform_mismatch_error)  
    elif args.any_error_by_quality:
      l_fastq = do_quality_any(l_fastq,rnum)
      if r_fastq: r_fastq = do_quality_any(r_fastq,rnum)      
    elif args.mismatch_error_by_quality:
      l_fastq = do_quality_mismatch(l_fastq,rnum)
      if r_fastq: r_fastq = do_quality_mismatch(r_fastq,rnum)
    elif args.profile_context_error:
      l_fastq = ep.permute_context(l_fastq)
      if r_fastq: r_fastq = ep.permute_context(r_fastq)
    elif args.profile_general_error:
      l_fastq = ep.permute_general(l_fastq)
      if r_fastq: r_fastq = ep.permute_general(r_fastq)
      
    # if SR grown/shrink to appropriate length
    if args.sr and len(l_fastq) != args.sr_length:
      l_fastq = fit_length(l_fastq,args.sr_length,rnum)
    if r:
      if args.sr and len(r_fastq) != args.sr_length:
        r_fastq = fit_length(r_fastq,args.sr_length,rnum)

    of1.write(l_fastq.fastq())
    if of2: 
      of2.write(r_fastq.fastq())

    stage4left = l_fastq.seq
    stage4right = ''
    if r_fastq: stage4right = r_fastq.seq
    if of_sc:
      of_sc.write(l_fastq.name+"\t"+tx.get_gene_name()+"\t"+tx.get_transcript_name()+"\t" \
                + stage1seq+"\t"+stage2seq+"\t"+stage3left+"\t"+stage3right+"\t"+stage4left+"\t"+stage4right+"\n")
    finished_count += 1
    if finished_count %1000==0: sys.stderr.write(str(finished_count)+'/'+str(args.count)+"   \r")
  sys.stderr.write("\n")
  of1.close()
  if of2:
    of2.close()
  if of_origin:
    of_origin.close()
  if of_sc:
    of_sc.close()
  # Temporary working directory step 3 of 3 - Cleanup
  if not args.specific_tempdir:
    rmtree(args.tempdir)
Example #13
 def random_flip(self, sequence):
     if self.random.random() < 0.5:
         return rc(sequence)
     return sequence
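Every example on this page leans on rc(), a reverse-complement helper imported from the project's sequence utilities rather than shown here. A minimal sketch of such a helper, given as an assumption about its behavior rather than the project's actual implementation:

# Assumed behavior only: reverse-complement a DNA string, mapping unknown bases to 'N'.
_COMP = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', 'N': 'N',
         'a': 't', 't': 'a', 'g': 'c', 'c': 'g', 'n': 'n'}

def rc(sequence):
    return ''.join(_COMP.get(b, 'N') for b in reversed(sequence))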
Example #14
 def __init__(self, alignment, min_intron_size=68):
     #self._alns = []
     self._min_intron_size = min_intron_size
     self._aligned_query = None
     self._hpas = []
     self._has_quality = False  # can be changed when add_alignment uses one that has quality
     self._alignment = alignment
     self._quality_distro = None  # gets set by analyze_quality
     self._deletion_type = None
     self._query_errors = None
     self._target_errors = None
     self._context_query_errors = None
     self._context_target_errors = None
     astrings = self._alignment.get_alignment_strings(
         min_intron_size=self._min_intron_size)
     if self._alignment.get_query_quality(): self._has_quality = True
     if len(astrings) == 0: return None
     alns = []
     for i in range(len(astrings[0])):
         if self._alignment.get_strand() == '+':
             alns.append({
                 'query': astrings[0][i],
                 'target': astrings[1][i],
                 'query_quality': astrings[2][i]
             })
         else:
             alns.insert(
                 0, {
                     'query': rc(astrings[0][i]),
                     'target': rc(astrings[1][i]),
                     'query_quality': astrings[2][i][::-1]
                 })
     #if self._alignment.get_strand() == '-':
     #  alns = alns[::-1]
     #get homopolymer alignments
     self._hpas = self._misalign_split(
         alns)  # split alignment into homopolymer groups
     self._query_hpas = []
     self._target_hpas = []
     qi = 0
     for i in range(len(self._hpas)):
         prev = None
         if i > 0: prev = self._hpas[i - 1]
         foll = None
         if i + 1 < len(self._hpas): foll = self._hpas[i + 1]
         qlen = len(self._hpas[i].get_query())
         for j in range(0, qlen):
             self._query_hpas.append({
                 'hpa': self._hpas[i],
                 'pos': j,
                 'prev-hpa': prev,
                 'next-hpa': foll
             })
         qi += qlen
     ti = 0
     for i in range(len(self._hpas)):
         prev = None
         if i > 0: prev = self._hpas[i - 1]
         foll = None
         if i + 1 < len(self._hpas): foll = self._hpas[i + 1]
         tlen = len(self._hpas[i].get_target())
         for j in range(0, tlen):
             self._target_hpas.append({
                 'hpa': self._hpas[i],
                 'pos': j,
                 'prev-hpa': prev,
                 'next-hpa': foll
             })
         ti += tlen
     self._target_errors = self.get_target_errors()
     self._query_errors = self.get_query_errors()
     self._context_target_errors = self.get_context_target_errors()