示例#1
0
def do_buffer(buffer, msr, spc, psc, args):
    """Join uniquely paired SAM mates of every buffered group into one line.

    ``buffer`` is an iterable of groups, each an iterable of SAM objects.
    Reads rejected by ``not_a_mate_sam``, groups that do not contain
    exactly one flag-64 read and one flag-128 read, and pairs that
    ``join_mated`` cannot join are passed through unchanged, unless
    ``args.mates_only`` is set, in which case they are dropped.  Joined
    pairs are converted back to a line with ``psc.convert_line``.

    ``spc`` converts a SAM line into the input of ``PSL``.  ``msr`` is
    accepted but not used by this function.

    Returns the list of output lines.  Terminates the program when a
    unique pair lies on different references or shares the flag-16 state.
    """

    def as_psl(mate):
        # Convert one mate to PSL.  The read and its qualities are attached
        # only when the CIGAR contains neither an H nor a P operation.
        converted = PSL(spc.convert_line(mate.get_line()))
        if re.search('[HP]', mate.value('cigar')) is None:
            converted.set_query(mate.value('seq'))
            converted.set_quality_seq(mate.value('qual'))
            if mate.check_flag(16):
                # flag 16: store the read as it actually is
                converted.set_query(rc(mate.value('seq')))
                converted.set_quality_seq(mate.value('qual')[::-1])
        return converted

    emitted = []
    for group in buffer:
        firsts = []
        seconds = []
        for record in group:
            if not_a_mate_sam(record):
                # not part of a pair: pass through (or drop) and move on
                if not args.mates_only:
                    emitted.append(record.get_line())
                continue
            if record.check_flag(64):
                firsts.append(record)
            if record.check_flag(128):
                seconds.append(record)

        if len(firsts) != 1 or len(seconds) != 1:
            # anything other than one unique pair is left as it was
            if not args.mates_only:
                emitted.extend(m.get_line() for m in firsts + seconds)
            continue

        first = firsts[0]
        second = seconds[0]
        # A usable pair shares its reference and differs in flag 16.
        if (first.value('rname') != second.value('rname')
                or first.check_flag(16) == second.check_flag(16)):
            sys.stderr.write(
                "ERROR, these are not actually properly paired as we were led to believe\n"
            )
            sys.exit()

        joined = join_mated(as_psl(first), as_psl(second))
        if not joined:
            if not args.mates_only:
                emitted.append(first.get_line())
                emitted.append(second.get_line())
            continue

        emitted.append(
            psc.convert_line(joined.get_line(),
                             query_sequence=joined.get_query(),
                             quality_sequence=joined.get_quality_seq()))
    return emitted
def overlapped(seq1, seq2, thresh):
    """Return the first truthy ``starts_with`` result for two sequences.

    The orientations are tried in this order, stopping at the first hit:
    seq1 against seq2, seq2 against seq1, seq1 against the reverse
    complement of seq2, and seq2 against the reverse complement of seq1.

    Returns whatever ``starts_with`` returned for the first orientation
    that produced a truthy value, or ``False`` when none did.
    """
    v1 = starts_with(seq1, seq2, thresh)
    if v1:
        return v1
    v2 = starts_with(seq2, seq1, thresh)
    if v2:
        return v2
    # The two reverse-complement checks used to read ``if v3: v3`` and
    # ``if v4: v4``: bare expressions that threw the hit away, so these
    # overlaps were never reported.
    v3 = starts_with(seq1, rc(seq2), thresh)
    if v3:
        return v3
    v4 = starts_with(seq2, rc(seq1), thresh)
    if v4:
        return v4
    return False
示例#3
0
def overlapped(seq1,seq2,thresh):
  """Return the first truthy ``starts_with`` result for two sequences.

  The orientations are tried in this order, stopping at the first hit:
  seq1 against seq2, seq2 against seq1, seq1 against the reverse
  complement of seq2, and seq2 against the reverse complement of seq1.

  Returns whatever ``starts_with`` returned for the first orientation
  that produced a truthy value, or ``False`` when none did.
  """
  v1 = starts_with(seq1,seq2,thresh)
  if v1: return v1
  v2 = starts_with(seq2,seq1,thresh)
  if v2: return v2
  # The two reverse-complement checks used to read ``if v3: v3`` and
  # ``if v4: v4``: bare expressions that threw the hit away, so these
  # overlaps were never reported.
  v3 = starts_with(seq1,rc(seq2),thresh)
  if v3: return v3
  v4 = starts_with(seq2,rc(seq1),thresh)
  if v4: return v4
  return False
示例#4
0
 def add_sequence(self,seq):
   """Count every k-mer of ``seq`` and its reverse complement in ``self.mers``.

   ``seq`` is upper-cased first.  Each window of length ``self.ksize`` is
   counted once under its own text and once under ``rc`` of that text, so
   a k-mer equal to its own reverse complement is incremented twice.
   Sequences shorter than ``self.ksize`` add nothing.  Returns ``None``.
   """
   seq = seq.upper()
   counts = self.mers
   for start in range(len(seq)+1-self.ksize):
     mer = seq[start:start+self.ksize]
     # rc(mer) used to be evaluated up to three times per window; compute
     # it once and run both keys through the same counting step.
     for key in (mer, rc(mer)):
       if key not in counts:
         counts[key] = 0
       counts[key] += 1
   return
示例#5
0
 def add_sequence(self, seq):
     """Count every k-mer of ``seq`` and its reverse complement in ``self.mers``.

     ``seq`` is upper-cased first.  Each window of length ``self.ksize`` is
     counted once under its own text and once under ``rc`` of that text, so
     a k-mer equal to its own reverse complement is incremented twice.
     Sequences shorter than ``self.ksize`` add nothing.  Returns ``None``.
     """
     seq = seq.upper()
     counts = self.mers
     for start in range(len(seq) + 1 - self.ksize):
         mer = seq[start:start + self.ksize]
         # rc(mer) used to be evaluated up to three times per window;
         # compute it once and run both keys through the same counting step.
         for key in (mer, rc(mer)):
             if key not in counts:
                 counts[key] = 0
             counts[key] += 1
     return
def do_buffer(buffer,msr,spc,psc,args):
  """Join uniquely paired SAM mates of every buffered group into one line.

  ``buffer`` is an iterable of groups, each an iterable of SAM objects.
  Reads rejected by ``not_a_mate_sam``, groups that do not contain exactly
  one flag-64 read and one flag-128 read, and pairs that ``join_mated``
  cannot join are passed through unchanged, unless ``args.mates_only`` is
  set, in which case they are dropped.  Joined pairs are converted back to
  a line with ``psc.convert_line``.

  ``spc`` converts a SAM line into the input of ``PSL``.  ``msr`` is
  accepted but not used by this function.

  Returns the list of output lines.  Terminates the program when a unique
  pair lies on different references or shares the flag-16 state.
  """

  def as_psl(mate):
    # Convert one mate to PSL.  The read and its qualities are attached
    # only when the CIGAR contains neither an H nor a P operation.
    converted = PSL(spc.convert_line(mate.get_line()))
    if re.search('[HP]',mate.value('cigar')) is None:
      converted.set_query(mate.value('seq'))
      converted.set_quality_seq(mate.value('qual'))
      if mate.check_flag(16):
        # flag 16: store the read as it actually is
        converted.set_query(rc(mate.value('seq')))
        converted.set_quality_seq(mate.value('qual')[::-1])
    return converted

  emitted = []
  for group in buffer:
    firsts = []
    seconds = []
    for record in group:
      if not_a_mate_sam(record):
        # not part of a pair: pass through (or drop) and move on
        if not args.mates_only:
          emitted.append(record.get_line())
        continue
      if record.check_flag(64):
        firsts.append(record)
      if record.check_flag(128):
        seconds.append(record)

    if len(firsts) != 1 or len(seconds) != 1:
      # anything other than one unique pair is left as it was
      if not args.mates_only:
        emitted.extend(m.get_line() for m in firsts + seconds)
      continue

    first = firsts[0]
    second = seconds[0]
    # A usable pair shares its reference and differs in flag 16.
    if (first.value('rname') != second.value('rname')
        or first.check_flag(16) == second.check_flag(16)):
      sys.stderr.write("ERROR, these are not actually properly paired as we were led to believe\n")
      sys.exit()

    joined = join_mated(as_psl(first),as_psl(second))
    if not joined:
      if not args.mates_only:
        emitted.append(first.get_line())
        emitted.append(second.get_line())
      continue

    emitted.append(psc.convert_line(joined.get_line(),
                                    query_sequence=joined.get_query(),
                                    quality_sequence=joined.get_quality_seq()))
  return emitted
 def construct_sequences(self,ref_hash):
   """Build ``self.sequence`` from the reference slice of every bound.

   For each entry of ``self.bounds`` the array from ``get_bed_array()`` is
   read as ``[name, start, end, direction]``.  ``ref_hash[name][start:end]``
   is appended upper-cased; a ``'-'`` direction passes the slice through
   ``rc`` first.

   ref_hash -- mapping of reference name to sequence.

   Exits the program when a name contains a comma, forward slash or
   vertical bar, or is missing from ``ref_hash``.  Returns ``None``.
   """
   self.sequence = ''
   pieces = []
   for b in self.bounds:
     arr = b.get_bed_array()
     name = arr[0]
     # One raw-string character class replaces three separate searches,
     # one of which relied on the invalid escape '\|' in a plain literal.
     if re.search(r'[,/|]',name):
       sys.stderr.write("ERROR: original reference chromosome cannot have a comma, forward slash or vertical bar in its name\n")
       sys.exit()
     if name not in ref_hash:
       sys.stderr.write("ERROR: sequence "+str(name)+" not found in reference\n")
       sys.exit()
     seq = ''
     if arr[3] == '+':
       seq = ref_hash[name][arr[1]:arr[2]]
     elif arr[3] == '-':
       seq = rc(ref_hash[name][arr[1]:arr[2]])
     else:
       # NOTE(review): this only reports the problem; processing continues
       # and the bound contributes an empty string, as before.
       sys.stderr.write("ERROR: no direction set in bed\n")
     pieces.append(seq.upper())
   # Join once instead of growing the attribute string inside the loop.
   self.sequence = ''.join(pieces)
   return
 def align(self):
   """Align ``input_s1`` to ``input_s2`` and return the better ``Alignment``.

   ``self.execute_sw_alignment()`` is run on the inputs as given.  When
   ``self.bidirectional`` is set it is run again with ``input_s2`` passed
   through ``rc``; that second result replaces the first only when its
   first element is strictly greater.  The chosen result, tagged with the
   strand labels ``'+'``/``'+'`` or ``'+'``/``'-'``, is stored in a new
   ``Alignment`` together with the scoring parameters and the original
   input sequences.

   ``self.s1``, ``self.s2`` and ``self.M`` are left as set for the last
   run performed.
   """
   def run(second, second_strand):
     # Reset the working state, align, then tag the result with strands.
     self.s1 = self.input_s1
     self.s2 = second
     self.M = None
     outcome = self.execute_sw_alignment()
     outcome.append('+')
     outcome.append(second_strand)
     return outcome

   best = run(self.input_s2, '+')
   if self.bidirectional:
     flipped = run(rc(self.input_s2), '-')
     if flipped[0] > best[0]:
       best = flipped

   alignment = Alignment()
   alignment.set_alignment(
     self.gapopen, self.gapextend, self.match, self.mismatch,
     self.bidirectional,
     best[0], best[1], best[2], best[3], best[4], best[5], best[6],
     self.input_s1, self.input_s2)
   return alignment
示例#9
0
 def align(self):
     """Align ``input_s1`` to ``input_s2`` and return the better ``Alignment``.

     ``self.execute_sw_alignment()`` is run on the inputs as given.  When
     ``self.bidirectional`` is set it is run again with ``input_s2`` passed
     through ``rc``; that second result replaces the first only when its
     first element is strictly greater.  The chosen result, tagged with the
     strand labels ``'+'``/``'+'`` or ``'+'``/``'-'``, is stored in a new
     ``Alignment`` together with the scoring parameters and the original
     input sequences.

     ``self.s1``, ``self.s2`` and ``self.M`` are left as set for the last
     run performed.
     """
     def run(second, second_strand):
         # Reset the working state, align, then tag the result with strands.
         self.s1 = self.input_s1
         self.s2 = second
         self.M = None
         outcome = self.execute_sw_alignment()
         outcome.append('+')
         outcome.append(second_strand)
         return outcome

     best = run(self.input_s2, '+')
     if self.bidirectional:
         flipped = run(rc(self.input_s2), '-')
         if flipped[0] > best[0]:
             best = flipped

     alignment = Alignment()
     alignment.set_alignment(
         self.gapopen, self.gapextend, self.match, self.mismatch,
         self.bidirectional,
         best[0], best[1], best[2], best[3], best[4], best[5], best[6],
         self.input_s1, self.input_s2)
     return alignment
示例#10
0
 def construct_sequences(self, ref_hash):
     """Build ``self.sequence`` from the reference slice of every bound.

     For each entry of ``self.bounds`` the array from ``get_bed_array()``
     is read as ``[name, start, end, direction]``.
     ``ref_hash[name][start:end]`` is appended upper-cased; a ``'-'``
     direction passes the slice through ``rc`` first.

     ref_hash -- mapping of reference name to sequence.

     Exits the program when a name contains a comma, forward slash or
     vertical bar, or is missing from ``ref_hash``.  Returns ``None``.
     """
     self.sequence = ''
     pieces = []
     for b in self.bounds:
         arr = b.get_bed_array()
         name = arr[0]
         # One raw-string character class replaces three separate searches,
         # one of which relied on the invalid escape '\|' in a plain literal.
         if re.search(r'[,/|]', name):
             sys.stderr.write(
                 "ERROR: original reference chromosome cannot have a comma, forward slash or vertical bar in its name\n"
             )
             sys.exit()
         if name not in ref_hash:
             sys.stderr.write("ERROR: sequence " + str(name) +
                              " not found in reference\n")
             sys.exit()
         seq = ''
         if arr[3] == '+':
             seq = ref_hash[name][arr[1]:arr[2]]
         elif arr[3] == '-':
             seq = rc(ref_hash[name][arr[1]:arr[2]])
         else:
             # NOTE(review): this only reports the problem; processing
             # continues and the bound contributes an empty string, as before.
             sys.stderr.write("ERROR: no direction set in bed\n")
         pieces.append(seq.upper())
     # Join once instead of growing the attribute string inside the loop.
     self.sequence = ''.join(pieces)
     return
def main():
  """Recompute the alignment statistics of each PSL entry and print it.

  Command line: PSL input (file or '-' for STDIN), a reference FASTA and a
  query FASTA.  The reference is loaded with ``read_fasta_into_hash``.  The
  query FASTA is only read forward: for every PSL line, entries are read
  until one whose name equals the line's ``qName`` is found, so the PSL and
  the FASTA must list queries in the same order.  Each entry is given its
  query sequence and the reference, ``correct_stats()`` is called, and the
  corrected line is printed to STDOUT.

  Invalid PSL lines are reported on STDERR and skipped.  The program exits
  when the query FASTA is empty or runs out before a PSL line is matched.

  ``--minimum_intron_size`` is still accepted so existing command lines
  keep working, but nothing in this function reads it.
  """
  parser = argparse.ArgumentParser(description="Correct the matches/mismatches and Ncount of a PSL file")
  parser.add_argument('input',help="PSLFILE or - for STDIN")
  parser.add_argument('reference',help="FASTAFILE reference genome")
  parser.add_argument('query',help="FASTAFILE query sequences")
  parser.add_argument('--minimum_intron_size',type=int,default=68,help="INT")
  args = parser.parse_args()
  # Read in the reference genome
  sys.stderr.write("Reading in reference genome\n")
  g = read_fasta_into_hash(args.reference)
  sys.stderr.write("Finished reading "+str(len(g.keys()))+" reference sequences\n")
  inf = sys.stdin
  if args.input != '-':
    inf = open(args.input)
  fhr = FastaHandleReader(open(args.query))
  last_fasta = fhr.read_entry()
  if not last_fasta:
    sys.stderr.write("ERROR: No query sequences\n")
    sys.exit()
  for line in inf:
    p = PSLBasics.PSL(line)
    if not p.validate():
      sys.stderr.write("WARNING skipping invalid PSL entry. This script fixes improper mismatch and match counts .. and gap counts... it doesn't perform miracles. Problem line:\n"+line.rstrip()+"\n")
      # The warning promises a skip; previously the entry was processed anyway.
      continue
    n = p.value('qName')
    # Advance through the query FASTA until the matching entry.  The
    # end-of-file test is part of the loop condition so that running out
    # reaches the error below instead of failing on last_fasta['name'].
    while last_fasta and last_fasta['name'] != n:
      last_fasta = fhr.read_entry()
    if not last_fasta:
      sys.stderr.write("ERROR: Ran out of query sequences too soon.  Are they sorted properly\n")
      sys.exit()
    p.set_query(last_fasta['seq'])
    p.set_reference_dictionary(g)
    p.correct_stats()
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print(p.get_line())
    # An older inline recomputation of the statistics used to follow an
    # unconditional ``continue`` here; it could never run and was removed.
  fhr.close()
  if inf is not sys.stdin:
    inf.close()
def combine(left,right,args,reference_splices,seq,read,orientation):
  """Try to join two PSL entries of one read across a junction.

  left, right       -- PSL entries accessed as mappings; the keys read here
                       are 'qStart', 'qEnd', 'strand', 'qStarts',
                       'tStarts' and 'blockSizes'.
  args              -- options object; ``max_gap_size`` and
                       ``max_search_expand`` are read here, and the object
                       is passed on to ``do_combine_operation``.
  reference_splices -- iterable of JSON strings; each decodes to a sequence
                       whose first two items are looked up as left/right
                       keys in the table returned by ``get_options``.
  seq               -- target sequence, sliced with target coordinates.
  read              -- query sequence; passed through ``rc`` for '-' strand.
  orientation       -- not used in this function.

  Returns ``None`` when the gap between the entries exceeds
  ``args.max_gap_size``, when no canonical candidate junction is found
  (even if reference junctions matched), or when ``get_best_option2``
  yields no option.  Otherwise returns the result of
  ``do_combine_operation`` for the best-scoring option.
  """
  #Kind of the business end where we combine two psl entries
  # Perform a check for an overlap (or near gap) sufficient for consideration
  if left['qEnd'] < right['qStart']-args.max_gap_size: # no overlap; may want to have a separate parameter for a max gap size
    return None
  # target_options is used below as a two-level mapping:
  # target_options[left_target][right_target] -> list of option lists.
  target_options = get_options(left, \
                   min(left['qEnd'],right['qStart']+1), \
                   left['qEnd'], \
                   right, \
                   right['qStart']+1, \
                   max(left['qEnd'],right['qStart']+1))


  # Try to find the junction site
  junction_choice = None  # assigned but never read in this function
  reference_options = []
  for j in [json.loads(x) for x in reference_splices]:
    if j[0] in target_options:
      if j[1] in target_options[j[0]]:
        for op in target_options[j[0]][j[1]]:
          dist = abs(op[2]-1)+op[3]+op[4]
          # this is where we keep ourselves from looking too far away
          if dist < args.max_search_expand:
            reference_options.append([j[0],j[1]] + op)
  # check for a canonical splice site
  strand = left['strand']
  candidate_options = []
  for l_t in target_options:
    for r_t in target_options[l_t]:
      # Two bases starting at l_t and two bases ending at r_t-1, joined by
      # '-', form the motif handed to is_canon / is_revcanon.
      candidate = seq[l_t:l_t+2].upper()+'-'+seq[r_t-3:r_t-1].upper()
      iscan = False
      if strand == '+' and is_canon(candidate):
        iscan = True
      if strand == '-' and is_revcanon(candidate):
        iscan = True
      if iscan:
        for entry in target_options[l_t][r_t]:
          dist = abs(entry[2]-1)+entry[3]+entry[4]
          # this is where we keep ourselves from looking too far away
          if dist < args.max_search_expand:
            candidate_options.append([l_t,r_t]+entry+[strand,candidate])

  #for c in candidate_options: print c
  # Gives up without canonical candidates, whatever reference_options holds.
  if len(candidate_options) == 0: return None

  # For choosing the best candidate we don't need to align all of the seq
  prefered_alignment_length = 50
  # Which left alignment segment has candidates in it?
  leftlen = len(left['qStarts'])
  # Index 2 of an option is treated as its left query position
  # (presumably 1-based, given the +1 below -- TODO confirm).
  minleft_query = min([x[2] for x in candidate_options])
  left_nearest = 0
  for i in range(0,leftlen):
    if minleft_query >= left['qStarts'][i] + 1:
      left_nearest = i
    else:
      break
  
  # Walk back from left_nearest towards block 0 until the query distance
  # covered exceeds prefered_alignment_length (or block 0 is reached).
  left_choice = left_nearest
  if left_nearest > 0:
    for i in range(left_nearest,0-1,-1):
      tot = left['qStarts'][left_nearest]-left['qStarts'][i]
      if tot > prefered_alignment_length:
        left_choice = i
        break
      left_choice = i

  # Which right alignment segment has candidates in it?
  rightlen = len(right['qStarts'])
  # Index 3 of an option is treated as its right query position.
  maxright_query = max([x[3] for x in candidate_options])
  right_nearest = 0
  for i in range(0,rightlen):
    if maxright_query <= right['qStarts'][i]+right['blockSizes'][i]:
      break
    else:
      right_nearest = i


  # Walk forward from right_nearest until the query distance covered
  # exceeds prefered_alignment_length (or the last block is reached).
  right_choice = right_nearest
  if right_nearest > 0:
    for i in range(right_nearest,rightlen):
      tot = (right['qStarts'][i]+right['blockSizes'][i])-(right['qStarts'][right_nearest]+right['blockSizes'][right_nearest])
      if tot > prefered_alignment_length:
        right_choice = i
        break
      right_choice = i

  # now left_choice and right_choice contain bounds to align

  # we can come up with options based on a needleman wunsch across the entire thing
  wread = read
  if strand == '-': wread = rc(read)
  newoptions = []
  for option in reference_options + candidate_options:
    #leftlen = len(left['qStarts'])
    #rightlen = len(right['qStarts'])
    # Align the read window against the target with the span between
    # option[0] and option[1]-1 cut out.
    [a1,a2,score] = needleman_wunsch( \
                    wread[left['qStarts'][left_choice]:right['qStarts'][right_choice]+right['blockSizes'][right_choice]].upper(), \
                    seq[left['tStarts'][left_choice]:option[0]].upper() + \
                    seq[option[1]-1:right['tStarts'][right_choice]+right['blockSizes'][right_choice]].upper())
    newoptions.append([score,a1,a2]+[option])

  #best_option = get_best_option(reference_options, candidate_options, args, strand)
  #if not best_option:
  #  return None
  [best_option,best_score,best_align] = get_best_option2(newoptions)
  if not best_option:
    return None
  #print best_option
  #print best_align[0]
  #print best_align[1]
  combo = do_combine_operation(best_option,left,right,read,seq,args)
  return combo
示例#13
0
  def sam_to_nav(self,line):
    """Convert one SAM line into tab-separated difference records.

    Every call increments ``self.line_count``; the line is processed only
    when ``line_count % thread_count`` equals ``thread_index - 1``.
    Header lines (starting with '@') and lines whose CIGAR is '*' or '.'
    are ignored.

    The read sequence comes from ``get_SR_sequence`` and is reloaded only
    when the query name changes.  Unless ``self.one_line_per_alignment``
    is set, alignments listed in an ``XA:Z:`` tag are processed as well.
    For each alignment the CIGAR is walked (M, I and D operations only)
    against ``self.LR_seq[reference]`` to collect mismatches
    (``<pos><ref>><read>``), insertions (``<pos>+<bases>``) and deletions
    (``<pos>-<base>``), with positions counted from 1 at the alignment
    start.  An alignment is kept when its CIGAR parsed cleanly, stayed
    within both sequences, and its error rate does not exceed
    ``self.error_rate_threshold``.

    Returns a list of output strings, or ``None`` when the line was not
    processed or no alignment was kept.
    """
    self.line_count += 1
    # Only the lines assigned to this worker are handled
    # (thread_index appears to be 1-based -- TODO confirm).
    if self.line_count % self.thread_count != self.thread_index-1:
      return None
    # NOTE(review): these two checks only report; processing continues.
    if not self.LR_seq:
      sys.stderr.write("ERROR initialize target first\n")
    if not self.SR_file:
      sys.stderr.write("ERROR initialize query first\n")

    if (line[0] == '@'):
        return None

    line_fields = line.strip().split('\t')
    cigar = line_fields[5]
    if ((cigar == '*') or (cigar == '.')):
        return None

    SR_name = line_fields[0]
    # Reload the read (and its reverse complement) only on a new query name.
    if (SR_name != self.current_query):
        [self.SR_seq, self.SR_idx_seq] = get_SR_sequence(self.SR_file, self.SR_idx_file, SR_name)
        self.SR_seq_rvs_cmplmnt = rc(self.SR_seq)
        self.current_query = SR_name
    # Prefix the position with the strand so it has the same
    # "<sign><pos>" shape that is parsed for every alignment below.
    if (int(line_fields[1]) & 0x10):     # Check if seq is reversed complement
        line_fields[3] = '-' + line_fields[3]
    else:
        line_fields[3] = '+' + line_fields[3]

    # Each alignment is "reference,<sign><pos>,cigar,0".
    align_list = [','.join([line_fields[2], line_fields[3], line_fields[5], str(0)])]

    if (not self.one_line_per_alignment):   # BWA reports all alignment per read in one line
        multi_align_str = ','.join([line_fields[2], line_fields[3], line_fields[5], str(0)]) + ';'
        # Optional fields start at column 11; only the first XA:Z: tag is used.
        for fields_idx in range(11, len(line_fields)):
            if (line_fields[fields_idx][0:5] == 'XA:Z:'):
                multi_align_str += line_fields[fields_idx][5:]
                break
        # Drop the last character (the ';' terminator) before splitting.
        align_list =  multi_align_str[:-1].split(';')


    read_seq_len = len(self.SR_seq)
    ostrings = []
    for align_str in align_list:
        err_state = False
        fields = align_str.split(',')

        ref_seq = self.LR_seq[fields[0]]
        ref_seq_len = len(ref_seq)
        if (fields[1][0] == '-'):     # Check if seq is reversed complement
            read_seq = self.SR_seq_rvs_cmplmnt
            pseudo_SR_name = "-" + SR_name
        else:
            read_seq = self.SR_seq
            pseudo_SR_name = SR_name
        fields[1] = fields[1][1:]  # strip the sign, keep the position
        read_idx = 0
        sub_ref_idx =  1  # 1-offset address
        ref_idx = int(fields[1]) - 1   # convert to 0-offset address
        diff_list = []
        # Splitting on a captured group yields [count, op, count, op, ..., ''];
        # any other operation letter stays inside a count token and fails
        # the isdigit() test below.
        cigar_list = re.split('(M|I|D)', fields[2])
        num_err = 0
        for idx in range(1, len(cigar_list), 2):
            if (cigar_list[idx - 1].isdigit()):
                if (cigar_list[idx] == 'M'):
                    subseq_len = int(cigar_list[idx - 1])
                    if ((read_idx + subseq_len > read_seq_len) or
                         (ref_idx + subseq_len > ref_seq_len)):
                        err_state = True
                        break
                    read_subseq = list(read_seq[read_idx:(read_idx + subseq_len)])
                    ref_subseq = list(ref_seq[ref_idx:(ref_idx + subseq_len)])
                    mut_indices = [x for x in range(len(read_subseq)) if read_subseq[x] != ref_subseq[x]]
                    # Mismatches where the read base is N are not reported,
                    # but they still count towards num_err below.
                    for mut_idx in mut_indices:
                        if (read_subseq[mut_idx] != "N"):
                            diff_list += [str(sub_ref_idx + mut_idx) + ref_subseq[mut_idx] + '>' + read_subseq[mut_idx]]
                    read_idx += subseq_len
                    ref_idx += subseq_len
                    sub_ref_idx += subseq_len
                    num_err += len(mut_indices)
                elif (cigar_list[idx] == 'I'):
                    subseq_len = int(cigar_list[idx - 1])
                    if (read_idx + subseq_len > read_seq_len):
                        err_state = True
                        break
                    # An insertion made only of N/n is not reported, but it
                    # still counts towards num_err.
                    insert_str = re.sub(r'N|n', '', read_seq[read_idx:(read_idx + int(cigar_list[idx - 1]))])
                    if (insert_str != ""):
                        diff_list += [str(sub_ref_idx) + '+' + read_seq[read_idx:(read_idx + subseq_len)]]
                    read_idx += subseq_len
                    num_err += subseq_len
                elif (cigar_list[idx] == 'D'):
                    subseq_len = int(cigar_list[idx - 1])
                    if (ref_idx + subseq_len > ref_seq_len):
                        err_state = True
                        break
                    # One entry per deleted reference base.
                    for del_idx in range(subseq_len):
                        diff_list += [str(sub_ref_idx + del_idx) + '-' + ref_seq[ref_idx + del_idx]]
                    ref_idx += subseq_len
                    sub_ref_idx += subseq_len
                    num_err += subseq_len
            else:
                #print 'Err in cigar: tag : ' + line
                err_state = True
                break
        # A trailing '' means the CIGAR ended right after an M/I/D operation.
        if ((cigar_list[-1] == '') and (not err_state)):
            if (len(diff_list) == 0):
                diff_list.append("*")
            # Percentage of error bases; with two ints this is floor
            # division on Python 2 and true division on Python 3.
            err_rate = (100 * num_err) / read_seq_len
            if (err_rate <= self.error_rate_threshold):
                ostrings.append('\t'.join([pseudo_SR_name, fields[0], fields[1], ' '.join(diff_list),self.SR_seq, self.SR_idx_seq]))
    if len(ostrings) == 0: return None
    return ostrings
def main():
    """Convert a SAM file into PSL, optionally exporting the reads.

    Header lines are fed to the conversion factory; every other line is
    converted with ``convert_line`` and, when that yields a result, written
    as a PSL line.  With ``--give_unique_names`` each written record is
    renamed ``Q<n>`` where ``n`` counts all records written so far.

    ``--output_fasta`` / ``--output_fastq`` (mutually exclusive) also write
    the read of each converted primary line, reverse-complemented (and with
    qualities reversed) when SAM flag 16 is set.  They cannot be combined
    with the secondary/alternative alignment options.

    The program exits on that invalid option combination and when a FASTQ
    record's sequence and quality lengths differ.
    """
    parser = argparse.ArgumentParser(
        description="Convert a sam file into a psl file")
    parser.add_argument('--genome',
                        help="FASTA input file of reference genome")
    parser.add_argument('--get_secondary_alignments',
                        action='store_true',
                        help="Report SA:Z secondary alignments as well")
    parser.add_argument('--get_alternative_alignments',
                        action='store_true',
                        help="Report XA:Z alternative alignments as well")
    parser.add_argument(
        '--get_all_alignments',
        action='store_true',
        help="Report SA:Z and XA:Z alternative alignments as well")
    parser.add_argument('--give_unique_names',
                        action='store_true',
                        help="Output query names will be unique.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        '--output_fasta',
        help=
        "FILENAME to save an outgoing fasta.  Only works for primary alignments."
    )
    group.add_argument(
        '--output_fastq',
        help=
        "FILENAME to save an outgoing fastq.  Only works for primary alignments."
    )
    parser.add_argument('infile', help="FILENAME input file or '-' for STDIN")
    parser.add_argument('-o',
                        '--output',
                        help="FILENAME for the output, STDOUT if not set.")
    args = parser.parse_args()

    wants_extra = (args.get_secondary_alignments
                   or args.get_alternative_alignments
                   or args.get_all_alignments)
    if (args.output_fasta or args.output_fastq) and wants_extra:
        sys.stderr.write(
            "ERROR, can only output the fastq/fasta if we are doing primary alignments only.\n"
        )
        sys.exit()

    inf = sys.stdin
    if args.infile != '-':
        inf = open(args.infile)
    of = sys.stdout
    if args.output:
        of = open(args.output, 'w')
    spcf = SamBasics.SAMtoPSLconversionFactory()
    if args.genome:
        spcf.set_genome(args.genome)
    # Optional read export; at most one of the two options can be given.
    off = None
    if args.output_fasta:
        off = open(args.output_fasta, 'w')
    if args.output_fastq:
        off = open(args.output_fastq, 'w')

    def write_psl(psl, index):
        # Write one PSL record, renaming it when unique names are requested.
        pobj = PSL(psl)
        if args.give_unique_names:
            pobj.entry['qName'] = 'Q' + str(index)
        of.write(pobj.get_line() + "\n")
        return pobj

    def write_extra(samlines, count):
        # Convert and write additional alignments; returns the new count.
        for samline in samlines:
            psl = spcf.convert_line(samline)
            if psl:
                count += 1
                write_psl(psl, count)
        return count

    z = 0  # number of PSL records written so far
    try:
        for line in inf:
            line = line.rstrip()
            if SamBasics.is_header(line):
                spcf.read_header_line(line)
                continue
            # We have a line to convert
            psl = spcf.convert_line(line)
            if psl:
                z += 1
                pobj = write_psl(psl, z)
                if args.output_fastq or args.output_fasta:
                    sam = SamBasics.SAM(line)
                    sequence = sam.value('seq').upper()
                    quality = sam.value('qual')
                    if sam.check_flag(16):
                        # flag 16: export the read in its original orientation
                        sequence = rc(sequence)
                        quality = quality[::-1]
                    if args.output_fasta:
                        off.write(">" + pobj.value('qName') + "\n" + sequence +
                                  "\n")
                    elif args.output_fastq:
                        if len(sequence) == len(quality):
                            off.write("@" + pobj.value('qName') + "\n" +
                                      sequence + "\n" + "+\n" + quality + "\n")
                        else:
                            sys.stderr.write("ERROR: sequence " + sequence +
                                             " length (" + str(len(sequence)) +
                                             ") doesnt match quality " +
                                             quality + " length (" +
                                             str(len(quality)) + ")\n")
                            sys.exit()
            # Lets look for secondary alignments to convert
            if args.get_secondary_alignments or args.get_all_alignments:
                z = write_extra(SamBasics.get_secondary_alignments(line), z)
            if args.get_alternative_alignments or args.get_all_alignments:
                z = write_extra(SamBasics.get_alternative_alignments(line), z)
    finally:
        # The FASTA/FASTQ handle was previously never closed; close every
        # handle even when the loop exits early.
        if off:
            off.close()
        inf.close()
        of.close()
示例#15
0
def combine(left, right, args, reference_splices, seq, read, orientation):
    """Try to join two PSL entries of one read across a junction.

    left, right       -- PSL entries accessed as mappings; the keys read
                         here are 'qStart', 'qEnd', 'strand', 'qStarts',
                         'tStarts' and 'blockSizes'.
    args              -- options object; ``max_gap_size`` and
                         ``max_search_expand`` are read here, and the
                         object is passed on to ``do_combine_operation``.
    reference_splices -- iterable of JSON strings; each decodes to a
                         sequence whose first two items are looked up as
                         left/right keys in the table returned by
                         ``get_options``.
    seq               -- target sequence, sliced with target coordinates.
    read              -- query sequence; passed through ``rc`` for '-'
                         strand.
    orientation       -- not used in this function.

    Returns ``None`` when the gap between the entries exceeds
    ``args.max_gap_size``, when no canonical candidate junction is found
    (even if reference junctions matched), or when ``get_best_option2``
    yields no option.  Otherwise returns the result of
    ``do_combine_operation`` for the best-scoring option.
    """
    #Kind of the business end where we combine two psl entries
    # Perform a check for an overlap (or near gap) sufficient for consideration
    if left['qEnd'] < right[
            'qStart'] - args.max_gap_size:  # no overlap; may want to have a separate parameter for a max gap size
        return None
    # target_options is used below as a two-level mapping:
    # target_options[left_target][right_target] -> list of option lists.
    target_options = get_options(left, \
                     min(left['qEnd'],right['qStart']+1), \
                     left['qEnd'], \
                     right, \
                     right['qStart']+1, \
                     max(left['qEnd'],right['qStart']+1))

    # Try to find the junction site
    junction_choice = None  # assigned but never read in this function
    reference_options = []
    for j in [json.loads(x) for x in reference_splices]:
        if j[0] in target_options:
            if j[1] in target_options[j[0]]:
                for op in target_options[j[0]][j[1]]:
                    dist = abs(op[2] - 1) + op[3] + op[4]
                    # this is where we keep ourselves from looking too far away
                    if dist < args.max_search_expand:
                        reference_options.append([j[0], j[1]] + op)
    # check for a canonical splice site
    strand = left['strand']
    candidate_options = []
    for l_t in target_options:
        for r_t in target_options[l_t]:
            # Two bases starting at l_t and two bases ending at r_t-1,
            # joined by '-', form the motif handed to is_canon/is_revcanon.
            candidate = seq[l_t:l_t + 2].upper() + '-' + seq[r_t - 3:r_t -
                                                             1].upper()
            iscan = False
            if strand == '+' and is_canon(candidate):
                iscan = True
            if strand == '-' and is_revcanon(candidate):
                iscan = True
            if iscan:
                for entry in target_options[l_t][r_t]:
                    dist = abs(entry[2] - 1) + entry[3] + entry[4]
                    # this is where we keep ourselves from looking too far away
                    if dist < args.max_search_expand:
                        candidate_options.append([l_t, r_t] + entry +
                                                 [strand, candidate])

    #for c in candidate_options: print c
    # Gives up without canonical candidates, whatever reference_options holds.
    if len(candidate_options) == 0: return None

    # For choosing the best candidate we don't need to align all of the seq
    prefered_alignment_length = 50
    # Which left alignment segment has candidates in it?
    leftlen = len(left['qStarts'])
    # Index 2 of an option is treated as its left query position
    # (presumably 1-based, given the +1 below -- TODO confirm).
    minleft_query = min([x[2] for x in candidate_options])
    left_nearest = 0
    for i in range(0, leftlen):
        if minleft_query >= left['qStarts'][i] + 1:
            left_nearest = i
        else:
            break

    # Walk back from left_nearest towards block 0 until the query distance
    # covered exceeds prefered_alignment_length (or block 0 is reached).
    left_choice = left_nearest
    if left_nearest > 0:
        for i in range(left_nearest, 0 - 1, -1):
            tot = left['qStarts'][left_nearest] - left['qStarts'][i]
            if tot > prefered_alignment_length:
                left_choice = i
                break
            left_choice = i

    # Which right alignment segment has candidates in it?
    rightlen = len(right['qStarts'])
    # Index 3 of an option is treated as its right query position.
    maxright_query = max([x[3] for x in candidate_options])
    right_nearest = 0
    for i in range(0, rightlen):
        if maxright_query <= right['qStarts'][i] + right['blockSizes'][i]:
            break
        else:
            right_nearest = i

    # Walk forward from right_nearest until the query distance covered
    # exceeds prefered_alignment_length (or the last block is reached).
    right_choice = right_nearest
    if right_nearest > 0:
        for i in range(right_nearest, rightlen):
            tot = (right['qStarts'][i] + right['blockSizes'][i]) - (
                right['qStarts'][right_nearest] +
                right['blockSizes'][right_nearest])
            if tot > prefered_alignment_length:
                right_choice = i
                break
            right_choice = i

    # now left_choice and right_choice contain bounds to align

    # we can come up with options based on a needleman wunsch across the entire thing
    wread = read
    if strand == '-': wread = rc(read)
    newoptions = []
    for option in reference_options + candidate_options:
        #leftlen = len(left['qStarts'])
        #rightlen = len(right['qStarts'])
        # Align the read window against the target with the span between
        # option[0] and option[1]-1 cut out.
        [a1,a2,score] = needleman_wunsch( \
                        wread[left['qStarts'][left_choice]:right['qStarts'][right_choice]+right['blockSizes'][right_choice]].upper(), \
                        seq[left['tStarts'][left_choice]:option[0]].upper() + \
                        seq[option[1]-1:right['tStarts'][right_choice]+right['blockSizes'][right_choice]].upper())
        newoptions.append([score, a1, a2] + [option])

    #best_option = get_best_option(reference_options, candidate_options, args, strand)
    #if not best_option:
    #  return None
    [best_option, best_score, best_align] = get_best_option2(newoptions)
    if not best_option:
        return None
    #print best_option
    #print best_align[0]
    #print best_align[1]
    combo = do_combine_operation(best_option, left, right, read, seq, args)
    return combo
示例#16
0
def do_combine_operation(best_option, left, right, read, seq, args):
    """Stitch a left and a right partial alignment into one PSL entry.

    Parameters, as used by the code below:
      best_option -- indexable; items 0..3 are read as left_target,
                     right_target, left_query and right_query (the junction
                     coordinates on the target and on the query).
      left, right -- alignment dicts. 'qStarts', 'tStarts' and 'blockSizes'
                     are read from both; 'strand', 'qName' and 'tName' are
                     taken from ``left`` only.
      read        -- query sequence; it is reverse-complemented with rc()
                     when left['strand'] is '-'.
      seq         -- target sequence that the tStarts index into.
      args        -- only ``args.min_intron_size`` is read.

    Returns whatever PSLBasics.line_to_entry() produces for the assembled
    21-column, tab-separated PSL line.

    NOTE(review): ``right_target - 1`` is used as a slice start, so
    right_target looks 1-based while the block starts are used as 0-based
    slice offsets -- confirm against the caller.
    """
    #print "choice is "+str(best_option)
    left_target = best_option[0]
    right_target = best_option[1]
    left_query = best_option[2]
    right_query = best_option[3]
    # store for output
    q_start_array = []
    t_start_array = []
    block_size_array = []

    # Walk the left blocks: blocks that end before left_query are copied to
    # the output unchanged; the walk stops at the first block that starts at
    # or after the junction, or that contains it. left_query_start /
    # left_target_start end up pointing at the start of the last block
    # visited, which is where realignment begins.
    left_query_start = left['qStarts'][0]
    left_target_start = left['tStarts'][0]
    for i in range(0, len(left['tStarts'])):
        tstart = left['tStarts'][i]
        tend = left['tStarts'][i] + left['blockSizes'][i]
        qstart = left['qStarts'][i]
        qend = left['qStarts'][i] + left['blockSizes'][i]
        if left_query <= qstart + 1: break
        left_query_start = qstart
        left_target_start = tstart
        if left_query <= qend: break
        q_start_array.append(qstart)
        t_start_array.append(tstart)
        block_size_array.append(left['blockSizes'][i])

    #print "left things"
    #print [left_query_start+1,left_query]
    #print [left_target_start+1,left_target]

    # Walk the right blocks to find where realignment ends.
    # right_outer_index is always one past the last block examined, so the
    # blocks from that index onward are copied through untouched further down.
    right_query_end = right['qStarts'][0] + right['blockSizes'][0]
    right_target_end = right['tStarts'][0] + right['blockSizes'][0]
    right_outer_index = 0
    for j in range(0, len(right['tStarts'])):
        tstart = right['tStarts'][j]
        tend = right['tStarts'][j] + right['blockSizes'][j]
        qstart = right['qStarts'][j]
        qend = right['qStarts'][j] + right['blockSizes'][j]
        right_outer_index = j + 1
        if right_query <= qstart + 1: break
        right_query_end = qend
        right_target_end = tend
        if right_query < qend: break
    #print "right things"
    #print [right_query+1,right_query_end]
    #print [right_target+1,right_target_end]
    # Work on the read in the orientation of the left alignment's strand.
    working_read = read.upper()
    if left['strand'] == '-': working_read = rc(read.upper())
    pread = working_read[left_query_start:right_query_end]
    # Target for realignment: the left flank up to left_target joined directly
    # to the right flank starting at right_target - 1 (the span in between is
    # skipped).
    tseq = seq[left_target_start:left_target].upper(
    ) + seq[right_target - 1:right_target_end].upper()
    # res[0] / res[1] are consumed below as two equal-length gapped strings
    # ('-' marks a gap); presumably the aligned read and aligned target.
    res = needleman_wunsch(pread, tseq)
    #print "short needleman wunsch"
    #print res[0]
    #print res[1]

    # Fun part of making the new portion of the alignment
    # Scan the alignment column by column, turning every gap-free run into
    # one (qStart, tStart, size) block. ``alignment`` buffers the run's
    # characters and ``bynumbers`` holds [qStart, tStart, qEnd, tEnd].
    qindex = left_query_start
    tindex = left_target_start
    in_alignment = 0
    alignment = None
    bynumbers = None
    for i in range(0, len(res[0])):
        if res[0][i] == '-':  #insertion in target (gap in query)
            tindex += 1
            in_alignment = 0
        elif res[1][i] == '-':  #insertion in query (gap in target)
            qindex += 1
            in_alignment = 0
        else:  # we are in an alignment
            if in_alignment == 0:
                # output buffered result
                if alignment:
                    if len(alignment[0]) > 0:
                        q_start_array.append(bynumbers[0])
                        t_start_array.append(bynumbers[1])
                        block_size_array.append(len(alignment[0]))
                alignment = ['', '']
                bynumbers = [qindex, tindex, qindex, tindex]
            in_alignment = 1
            alignment[0] += res[0][i]
            alignment[1] += res[1][i]
            bynumbers[2] += 1
            bynumbers[3] += 1
            qindex += 1
            tindex += 1
        # Once the query position reaches the junction, jump the target
        # coordinate across the skipped span. If the target was not already
        # there, the current run is ended so a new block starts afterwards.
        if qindex == right_query:  # switch forward
            #print "switch"
            #print str(tindex) + "\t" + str(right_target)
            #print str(qindex) + "\t" + str(right_query)
            if not tindex == right_target:
                in_alignment = 0
            tindex = right_target
    # Flush the final buffered run.
    if alignment:
        if len(alignment[0]) > 0:
            q_start_array.append(bynumbers[0])
            t_start_array.append(bynumbers[1])
            block_size_array.append(len(alignment[0]))
        #print bynumbers

    # Append the right-hand blocks that lie beyond the realigned region.
    for i in range(right_outer_index, len(right['blockSizes'])):
        q_start_array.append(right['qStarts'][i])
        t_start_array.append(right['tStarts'][i])
        block_size_array.append(right['blockSizes'][i])

    #now we can finally construct a psl line
    #we won't keep track of repeats for now
    matches = 0
    misMatches = 0
    repMatches = 0
    nCount = 0
    qNumInsert = 0
    qBaseInsert = 0
    tNumInsert = 0
    tBaseInsert = 0
    strand = left['strand']
    qName = left['qName']
    qSize = len(read)
    # NOTE: raises IndexError here if no block was produced at all.
    qStart = q_start_array[0]
    qEnd = q_start_array[len(q_start_array) -
                         1] + block_size_array[len(block_size_array) - 1]
    tName = left['tName']
    tSize = len(seq)
    tStart = t_start_array[0]
    tEnd = t_start_array[len(t_start_array) -
                         1] + block_size_array[len(block_size_array) - 1]
    blockCount = len(block_size_array)
    # The three list columns are comma-joined with a trailing comma.
    blockSizes = ','.join([str(x) for x in block_size_array]) + ','
    qStarts = ','.join([str(x) for x in q_start_array]) + ','
    tStarts = ','.join([str(x) for x in t_start_array]) + ','

    # Per-block statistics: compare query and target base by base, and
    # measure the gaps between consecutive blocks.
    prev_q_end = None
    prev_t_end = None
    for i in range(0, len(block_size_array)):
        qseg = working_read[q_start_array[i]:q_start_array[i] +
                            block_size_array[i]]
        tseg = seq[t_start_array[i]:t_start_array[i] +
                   block_size_array[i]].upper()
        for j in range(0, len(qseg)):
            # An 'N' in the query is counted in nCount and is additionally
            # scored as a match or mismatch by the comparison below.
            if qseg[j] == 'N': nCount += 1
            if qseg[j] == tseg[j]: matches += 1
            else:
                misMatches += 1
        # Truthiness test: skipped for the first block (None), and also if a
        # previous end coordinate were ever 0.
        if prev_t_end:
            t_dist = t_start_array[i] - prev_t_end
            if t_dist > 0 and t_dist < args.min_intron_size:  #we have an insert into the target and its not an intron
                tNumInsert += 1
                tBaseInsert += t_dist
        if prev_q_end:
            q_dist = q_start_array[i] - prev_q_end
            if q_dist > 0:
                qNumInsert += 1
                qBaseInsert += q_dist
        prev_q_end = q_start_array[i] + block_size_array[i]
        prev_t_end = t_start_array[i] + block_size_array[i]

    # now we have everything to make the line
    # Column order: matches, misMatches, repMatches, nCount, qNumInsert,
    # qBaseInsert, tNumInsert, tBaseInsert, strand, qName, qSize, qStart,
    # qEnd, tName, tSize, tStart, tEnd, blockCount, blockSizes, qStarts,
    # tStarts.
    combo_line = str(matches) + "\t" + str(misMatches) + "\t" + str(repMatches) + "\t" \
               + str(nCount) + "\t" + str(qNumInsert) + "\t" + str(qBaseInsert) + "\t" \
               + str(tNumInsert) + "\t" + str(tBaseInsert) + "\t" \
               + strand + "\t" + qName + "\t" + str(qSize) + "\t" \
               + str(qStart) + "\t" + str(qEnd) + "\t" \
               + tName + "\t" + str(tSize) + "\t" \
               + str(tStart) + "\t" + str(tEnd) + "\t" + str(blockCount) + "\t" \
               + blockSizes + "\t" + qStarts + "\t" + tStarts
    #print combo_line
    #print q_start_array
    #print t_start_array
    #print block_size_array
    #  print str(right['qStarts'][i])+"\t"+str(right['qStarts'][i]+right['blockSizes'][i])
    #  print i
    return PSLBasics.line_to_entry(combo_line)
示例#17
0
def main():
    """Convert a SAM file to PSL, optionally exporting the reads as FASTA/FASTQ.

    SAM records are read from the positional ``infile`` ('-' selects STDIN)
    and one PSL line is written to ``--output`` (STDOUT when not set) for
    every record that the conversion factory can convert. Header lines are
    fed to the factory instead of being converted.

    With ``--get_secondary_alignments``, ``--get_alternative_alignments`` or
    ``--get_all_alignments`` the extra alignments reported by SamBasics for
    each record are converted as well. ``--output_fasta`` / ``--output_fastq``
    additionally write the read of every converted primary record, and cannot
    be combined with the extra-alignment options.

    Exits with status 1 on an invalid option combination or when a read's
    sequence and quality lengths differ in FASTQ mode.
    """
    parser = argparse.ArgumentParser(description="Convert a sam file into a psl file")
    parser.add_argument('--genome', help="FASTA input file of reference genome")
    parser.add_argument('--get_secondary_alignments', action='store_true', help="Report SA:Z secondary alignments as well")
    parser.add_argument('--get_alternative_alignments', action='store_true', help="Report XA:Z alternative alignments as well")
    parser.add_argument('--get_all_alignments', action='store_true', help="Report SA:Z and XA:Z alternative alignments as well")
    parser.add_argument('--give_unique_names', action='store_true', help="Output query names will be unique.")
    group = parser.add_mutually_exclusive_group()
    group.add_argument('--output_fasta', help="FILENAME to save an outgoing fasta.  Only works for primary alignments.")
    group.add_argument('--output_fastq', help="FILENAME to save an outgoing fastq.  Only works for primary alignments.")
    parser.add_argument('infile', help="FILENAME input file or '-' for STDIN")
    parser.add_argument('-o', '--output', help="FILENAME for the output, STDOUT if not set.")
    args = parser.parse_args()

    want_secondary = args.get_secondary_alignments or args.get_all_alignments
    want_alternative = args.get_alternative_alignments or args.get_all_alignments
    if (args.output_fasta or args.output_fastq) and (want_secondary or want_alternative):
        sys.stderr.write("ERROR, can only output the fastq/fasta if we are doing primary alignments only.\n")
        # Non-zero status so that calling scripts can detect the failure.
        sys.exit(1)

    inf = sys.stdin
    if args.infile != '-':
        inf = open(args.infile)
    of = sys.stdout
    if args.output:
        of = open(args.output, 'w')
    spcf = SamBasics.SAMtoPSLconversionFactory()
    if args.genome:
        spcf.set_genome(args.genome)
    off = None
    if args.output_fasta:
        off = open(args.output_fasta, 'w')
    if args.output_fastq:
        off = open(args.output_fastq, 'w')

    # Extractors for the additional alignments, in the order they are reported.
    extra_sources = []
    if want_secondary:
        extra_sources.append(SamBasics.get_secondary_alignments)
    if want_alternative:
        extra_sources.append(SamBasics.get_alternative_alignments)

    z = 0  # running count of PSL lines written; used for unique query names
    try:
        for line in inf:
            line = line.rstrip()
            if SamBasics.is_header(line):
                spcf.read_header_line(line)
                continue
            # We have a line to convert
            psl = spcf.convert_line(line)
            if psl:
                pobj = PSL(psl)
                z += 1
                if args.give_unique_names:
                    pobj.entry['qName'] = 'Q' + str(z)
                of.write(pobj.get_line() + "\n")
                if off is not None:
                    sam = SamBasics.SAM(line)
                    sequence = sam.value('seq').upper()
                    quality = sam.value('qual')
                    if sam.check_flag(16):
                        # Reverse-strand record: restore the read's own
                        # orientation before writing it out.
                        sequence = rc(sequence)
                        quality = quality[::-1]
                    if args.output_fasta:
                        off.write(">" + pobj.value('qName') + "\n" + sequence + "\n")
                    elif len(sequence) == len(quality):
                        off.write("@" + pobj.value('qName') + "\n" + sequence + "\n" + "+\n" + quality + "\n")
                    else:
                        sys.stderr.write("ERROR: sequence "+sequence+" length ("+str(len(sequence))+") doesnt match quality "+quality+" length ("+str(len(quality))+")\n")
                        sys.exit(1)
            # Lets look for secondary / alternative alignments to convert
            for get_alignments in extra_sources:
                for samline in get_alignments(line):
                    psl = spcf.convert_line(samline)
                    if psl:
                        z += 1
                        pobj = PSL(psl)
                        if args.give_unique_names:
                            pobj.entry['qName'] = 'Q' + str(z)
                        of.write(pobj.get_line() + "\n")
    finally:
        # Close every handle, including the FASTA/FASTQ one, so buffered
        # output is flushed even when we leave through sys.exit().
        if off is not None:
            off.close()
        inf.close()
        of.close()
示例#18
0
    def sam_to_nav(self, line):
        """Turn one SAM line into tab-separated difference records.

        Every call bumps ``self.line_count``; the line is only processed when
        ``line_count % thread_count`` equals ``thread_index - 1``. Header
        lines (starting with '@') and records whose CIGAR is '*' or '.'
        yield None.

        When the read name differs from ``self.current_query`` the read is
        reloaded through get_SR_sequence() and its reverse complement is
        cached. Unless ``self.one_line_per_alignment`` is set, alignments
        listed in the first ``XA:Z:`` tag are processed after the primary one.

        Only CIGARs made purely of M/I/D operations are accepted. Each
        accepted alignment whose error percentage (mismatches plus inserted
        and deleted bases, relative to the read length) is at most
        ``self.error_rate_threshold`` produces one string with the fields:
        name ('-' prefixed for reverse strand), target name, position,
        space-separated differences ('*' when there are none), the read
        sequence and ``self.SR_idx_seq``.

        Returns the list of such strings, or None when it would be empty.
        """
        self.line_count += 1
        # Lines are shared out round-robin; skip those owned by other workers.
        if self.line_count % self.thread_count != self.thread_index - 1:
            return None
        # Missing initialisation is reported but processing still continues.
        if not self.LR_seq:
            sys.stderr.write("ERROR initialize target first\n")
        if not self.SR_file:
            sys.stderr.write("ERROR initialize query first\n")

        if line[0] == '@':
            return None

        sam_cols = line.strip().split('\t')
        if sam_cols[5] in ('*', '.'):
            return None

        query_name = sam_cols[0]
        if query_name != self.current_query:
            self.SR_seq, self.SR_idx_seq = get_SR_sequence(
                self.SR_file, self.SR_idx_file, query_name)
            self.SR_seq_rvs_cmplmnt = rc(self.SR_seq)
            self.current_query = query_name

        # Prefix the position with the strand sign, mirroring the XA layout.
        strand_sign = '-' if int(sam_cols[1]) & 0x10 else '+'
        sam_cols[3] = strand_sign + sam_cols[3]
        primary = ','.join([sam_cols[2], sam_cols[3], sam_cols[5], str(0)])

        if self.one_line_per_alignment:
            alignments = [primary]
        else:
            # BWA reports all alignment per read in one line
            packed = primary + ';'
            for tag in sam_cols[11:]:
                if tag[0:5] == 'XA:Z:':
                    packed += tag[5:]
                    break
            alignments = packed[:-1].split(';')

        query_len = len(self.SR_seq)
        records = []
        for alignment in alignments:
            parts = alignment.split(',')

            target_seq = self.LR_seq[parts[0]]
            target_len = len(target_seq)
            if parts[1][0] == '-':
                # Reverse-strand hit: compare the reverse-complemented read.
                query_seq = self.SR_seq_rvs_cmplmnt
                shown_name = "-" + query_name
            else:
                query_seq = self.SR_seq
                shown_name = query_name
            parts[1] = parts[1][1:]

            q_pos = 0                   # offset into the read
            rel_pos = 1                 # 1-based offset within the aligned span
            t_pos = int(parts[1]) - 1   # 0-based offset into the target
            diffs = []
            # Splitting on a capturing group puts counts at even indices and
            # the operation letters at odd indices.
            tokens = re.split('(M|I|D)', parts[2])
            errors = 0
            failed = False
            for k in range(1, len(tokens), 2):
                count_txt = tokens[k - 1]
                op = tokens[k]
                if not count_txt.isdigit():
                    failed = True
                    break
                span = int(count_txt)
                if op == 'M':
                    if q_pos + span > query_len or t_pos + span > target_len:
                        failed = True
                        break
                    q_chunk = list(query_seq[q_pos:q_pos + span])
                    t_chunk = list(target_seq[t_pos:t_pos + span])
                    mismatches = [
                        n for n in range(len(q_chunk))
                        if q_chunk[n] != t_chunk[n]
                    ]
                    for n in mismatches:
                        # 'N' in the read is not listed, but still counts
                        # towards the error total below.
                        if q_chunk[n] != "N":
                            diffs.append(
                                str(rel_pos + n) + t_chunk[n] + '>' + q_chunk[n])
                    q_pos += span
                    t_pos += span
                    rel_pos += span
                    errors += len(mismatches)
                elif op == 'I':
                    if q_pos + span > query_len:
                        failed = True
                        break
                    inserted = query_seq[q_pos:q_pos + span]
                    # Insertions consisting only of N/n are not listed.
                    if re.sub(r'N|n', '', inserted) != "":
                        diffs.append(str(rel_pos) + '+' + inserted)
                    q_pos += span
                    errors += span
                elif op == 'D':
                    if t_pos + span > target_len:
                        failed = True
                        break
                    for n in range(span):
                        diffs.append(
                            str(rel_pos + n) + '-' + target_seq[t_pos + n])
                    t_pos += span
                    rel_pos += span
                    errors += span

            # A non-empty last token means the CIGAR had trailing text that
            # is not an M/I/D operation.
            if failed or tokens[-1] != '':
                continue
            if not diffs:
                diffs.append("*")
            if (100 * errors) / query_len <= self.error_rate_threshold:
                records.append('\t'.join([
                    shown_name, parts[0], parts[1], ' '.join(diffs),
                    self.SR_seq, self.SR_idx_seq
                ]))
        return records if records else None
def _step_splice_coord(coord, direction, downstream, chrom_len):
    """Move a 1-based coordinate one base along the transcript.

    ``direction`` is the transcription direction ('+' or anything else for
    minus) and ``downstream`` selects which way along the transcript to move.
    Returns the new coordinate, or None when the step leaves the chromosome
    (below 1 or above ``chrom_len``) in the direction of travel.
    """
    moving_up_genome = (direction == '+') == downstream
    if moving_up_genome:
        coord += 1
        if coord > chrom_len:
            return None
    else:
        coord -= 1
        if coord < 1:
            return None
    return coord


def _stranded_base(ref, chrom, index, direction):
    """Return the base at 0-based ``index``, complemented for non-'+' strands."""
    base = str(ref[chrom][index])
    if direction == '+':
        return base
    return rc(base)


def main():
    """Print each splice site together with its sequence-equivalent shifts.

    Reads the genome FASTA and a whitespace-separated splice-site file
    (left chrom, left coord, left dir, right chrom, right coord, right dir;
    coordinates are 1-based). For every site the junction is slid one base
    at a time, first upstream and then downstream along the transcript, for
    as long as the base moved across the junction is identical on both
    sides; each such shifted junction is written to STDOUT as an alternate.

    ``--only_output_alternates`` suppresses the original coordinates and
    ``--long_form`` prefixes every line with "original" or "alternate".
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--only_output_alternates',action='store_true',help='When selected, the original coordiantes are not output, and only the alternates are output')
    parser.add_argument('--long_form', action='store_true',help="add an additional column to the beginning of the output indicating whether it is an original or alternate splice coordinate")
    parser.add_argument('GenomeFastaFile',nargs=1,help="FILENAME Fasta format file of the reference genome")
    parser.add_argument('SpliceSiteFile',nargs=1,help="FILENAME Splice Site file is in tsv format with <Left chrom> <Left coord (base-1)> <Left dir [+-]> <Right chrom> <Right coord (base-1)> <Right dir [+-]>\nWhere the coordinates indicate the base that is inside the exon proximal to the splice.  Direction indicates the transcription direction on the chromosome for that side of the splice.  For coordiantes 1-base means that the number 1 would be the first base of the sequence (makes sense to do it that way, right? :P)")
    of = sys.stdout
    args = parser.parse_args()
    golds = []
    with open(args.SpliceSiteFile[0]) as inf:
        for line in inf:
            f = line.rstrip().split()
            if not f:
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            golds.append({
                'l': {'chr': f[0], 'coord': int(f[1]), 'dir': f[2]},
                'r': {'chr': f[3], 'coord': int(f[4]), 'dir': f[5]},
            })

    ref = read_fasta_into_hash(args.GenomeFastaFile[0])
    lens = {}
    for name in ref:
        lens[name] = len(ref[name])

    original_prefix = "original\t" if args.long_form else ''
    alternate_prefix = "alternate\t" if args.long_form else ''

    for g in golds:
        l_chrom = g['l']['chr']
        r_chrom = g['r']['chr']
        l_start = g['l']['coord']
        r_start = g['r']['coord']
        l_dir = g['l']['dir']
        r_dir = g['r']['dir']
        # print the main case
        if not args.only_output_alternates:
            of.write(original_prefix+l_chrom + "\t" + str(l_start) + "\t" + l_dir + "\t" + r_chrom + "\t" + str(r_start) + "\t" + r_dir+"\n")

        # Slide the junction upstream first, then downstream.
        for downstream in (False, True):
            l_base = l_start
            r_base = r_start
            while True:
                l_base = _step_splice_coord(l_base, l_dir, downstream, lens[l_chrom])
                if l_base is None:
                    break
                # Left side: moving downstream adds the base at the new
                # coordinate; moving upstream gives up the base at the
                # previous coordinate.
                if downstream:
                    l_index = l_base - 1
                elif l_dir == '+':
                    l_index = l_base
                else:
                    l_index = l_base - 2
                left_bases = _stranded_base(ref, l_chrom, l_index, l_dir)

                r_base = _step_splice_coord(r_base, r_dir, downstream, lens[r_chrom])
                if r_base is None:
                    break
                # Right side is the mirror image: moving upstream adds the
                # base at the new coordinate; moving downstream gives up the
                # base at the previous coordinate.
                if not downstream:
                    r_index = r_base - 1
                elif r_dir == '+':
                    r_index = r_base - 2
                else:
                    r_index = r_base
                right_bases = _stranded_base(ref, r_chrom, r_index, r_dir)

                if left_bases != right_bases:
                    break
                of.write(alternate_prefix+l_chrom + "\t" + str(l_base) + "\t" + l_dir + "\t" + r_chrom + "\t" + str(r_base) + "\t" + r_dir+"\n")
def do_combine_operation(best_option,left,right,read,seq,args):
  """Stitch a left and a right partial alignment into one PSL entry.

  Parameters, as used by the code below:
    best_option -- indexable; items 0..3 are read as left_target,
                   right_target, left_query and right_query (the junction
                   coordinates on the target and on the query).
    left, right -- alignment dicts. 'qStarts', 'tStarts' and 'blockSizes'
                   are read from both; 'strand', 'qName' and 'tName' are
                   taken from ``left`` only.
    read        -- query sequence; it is reverse-complemented with rc()
                   when left['strand'] is '-'.
    seq         -- target sequence that the tStarts index into.
    args        -- only ``args.min_intron_size`` is read.

  Returns whatever PSLBasics.line_to_entry() produces for the assembled
  21-column, tab-separated PSL line.

  NOTE(review): ``right_target-1`` is used as a slice start, so right_target
  looks 1-based while the block starts are used as 0-based slice offsets --
  confirm against the caller.
  """
  #print "choice is "+str(best_option)
  left_target = best_option[0]
  right_target = best_option[1]
  left_query = best_option[2]
  right_query = best_option[3]
  # store for output
  q_start_array = []
  t_start_array = []
  block_size_array = []

  # Walk the left blocks: blocks that end before left_query are copied to the
  # output unchanged; the walk stops at the first block that starts at or
  # after the junction, or that contains it. left_query_start /
  # left_target_start end up pointing at the start of the last block visited,
  # which is where realignment begins.
  left_query_start = left['qStarts'][0]
  left_target_start = left['tStarts'][0]
  for i in range(0,len(left['tStarts'])):
    tstart = left['tStarts'][i]
    tend = left['tStarts'][i]+left['blockSizes'][i]
    qstart = left['qStarts'][i]
    qend = left['qStarts'][i]+left['blockSizes'][i]
    if left_query <= qstart+1: break
    left_query_start = qstart
    left_target_start = tstart
    if left_query <= qend: break
    q_start_array.append(qstart)
    t_start_array.append(tstart)
    block_size_array.append(left['blockSizes'][i])

  #print "left things"
  #print [left_query_start+1,left_query]
  #print [left_target_start+1,left_target]

  # Walk the right blocks to find where realignment ends. right_outer_index
  # is always one past the last block examined, so the blocks from that index
  # onward are copied through untouched further down.
  right_query_end = right['qStarts'][0]+right['blockSizes'][0]
  right_target_end = right['tStarts'][0]+right['blockSizes'][0]
  right_outer_index = 0
  for j in range(0,len(right['tStarts'])):
    tstart = right['tStarts'][j]
    tend = right['tStarts'][j]+right['blockSizes'][j]
    qstart = right['qStarts'][j]
    qend = right['qStarts'][j]+right['blockSizes'][j]
    right_outer_index = j+1
    if right_query <= qstart+1: break
    right_query_end = qend
    right_target_end = tend
    if right_query < qend: break
  #print "right things"
  #print [right_query+1,right_query_end]
  #print [right_target+1,right_target_end]
  # Work on the read in the orientation of the left alignment's strand.
  working_read = read.upper()
  if left['strand'] == '-': working_read = rc(read.upper())
  pread = working_read[left_query_start:right_query_end]
  # Target for realignment: the left flank up to left_target joined directly
  # to the right flank starting at right_target-1 (the span in between is
  # skipped).
  tseq = seq[left_target_start:left_target].upper()+seq[right_target-1:right_target_end].upper()
  # res[0] / res[1] are consumed below as two equal-length gapped strings
  # ('-' marks a gap); presumably the aligned read and aligned target.
  res = needleman_wunsch(pread,tseq)
  #print "short needleman wunsch"
  #print res[0]
  #print res[1]

  # Fun part of making the new portion of the alignment
  # Scan the alignment column by column, turning every gap-free run into one
  # (qStart, tStart, size) block. ``alignment`` buffers the run's characters
  # and ``bynumbers`` holds [qStart, tStart, qEnd, tEnd].
  qindex = left_query_start
  tindex = left_target_start
  in_alignment = 0
  alignment = None
  bynumbers = None
  for i in range(0,len(res[0])):
    if res[0][i] == '-':  #insertion in target (gap in query)
      tindex += 1
      in_alignment = 0
    elif res[1][i] == '-':  #insertion in query (gap in target)
      qindex += 1
      in_alignment = 0
    else: # we are in an alignment
      if in_alignment == 0:
        # output buffered result
        if alignment:
          if len(alignment[0]) > 0:
            q_start_array.append(bynumbers[0])
            t_start_array.append(bynumbers[1])
            block_size_array.append(len(alignment[0]))
        alignment = ['','']
        bynumbers = [qindex,tindex,qindex,tindex]
      in_alignment = 1
      alignment[0] += res[0][i]
      alignment[1] += res[1][i]
      bynumbers[2] += 1
      bynumbers[3] += 1
      qindex+=1
      tindex+=1
    # Once the query position reaches the junction, jump the target
    # coordinate across the skipped span. If the target was not already
    # there, the current run is ended so a new block starts afterwards.
    if qindex == right_query: # switch forward
      #print "switch"
      #print str(tindex) + "\t" + str(right_target)
      #print str(qindex) + "\t" + str(right_query)
      if not tindex == right_target: 
        in_alignment = 0
      tindex = right_target
  # Flush the final buffered run.
  if alignment:
    if len(alignment[0]) > 0:
      q_start_array.append(bynumbers[0])
      t_start_array.append(bynumbers[1])
      block_size_array.append(len(alignment[0]))
    #print bynumbers


  # Append the right-hand blocks that lie beyond the realigned region.
  for i in range(right_outer_index,len(right['blockSizes'])):
    q_start_array.append(right['qStarts'][i])
    t_start_array.append(right['tStarts'][i])
    block_size_array.append(right['blockSizes'][i])

  #now we can finally construct a psl line
  #we won't keep track of repeats for now
  matches = 0
  misMatches = 0
  repMatches = 0
  nCount = 0
  qNumInsert = 0
  qBaseInsert = 0
  tNumInsert = 0
  tBaseInsert = 0
  strand = left['strand']
  qName = left['qName']
  qSize = len(read)
  # NOTE: raises IndexError here if no block was produced at all.
  qStart = q_start_array[0]
  qEnd = q_start_array[len(q_start_array)-1]+block_size_array[len(block_size_array)-1]
  tName = left['tName']
  tSize = len(seq)
  tStart = t_start_array[0]
  tEnd = t_start_array[len(t_start_array)-1]+block_size_array[len(block_size_array)-1]
  blockCount = len(block_size_array)
  # The three list columns are comma-joined with a trailing comma.
  blockSizes = ','.join([str(x) for x in block_size_array])+','
  qStarts = ','.join([str(x) for x in q_start_array])+','
  tStarts = ','.join([str(x) for x in t_start_array])+','

  # Per-block statistics: compare query and target base by base, and measure
  # the gaps between consecutive blocks.
  prev_q_end = None
  prev_t_end = None
  for i in range(0,len(block_size_array)):
    qseg = working_read[q_start_array[i]:q_start_array[i]+block_size_array[i]]
    tseg = seq[t_start_array[i]:t_start_array[i]+block_size_array[i]].upper()
    for j in range(0,len(qseg)):
      # An 'N' in the query is counted in nCount and is additionally scored
      # as a match or mismatch by the comparison below.
      if qseg[j] == 'N': nCount += 1
      if qseg[j] == tseg[j]: matches += 1
      else:
        misMatches += 1
    # Truthiness test: skipped for the first block (None), and also if a
    # previous end coordinate were ever 0.
    if prev_t_end:
      t_dist = t_start_array[i]-prev_t_end
      if t_dist > 0 and t_dist < args.min_intron_size: #we have an insert into the target and its not an intron
        tNumInsert += 1
        tBaseInsert += t_dist
    if prev_q_end:
      q_dist = q_start_array[i]-prev_q_end
      if q_dist > 0:
        qNumInsert += 1
        qBaseInsert += q_dist
    prev_q_end = q_start_array[i]+block_size_array[i]
    prev_t_end = t_start_array[i]+block_size_array[i]

  # now we have everything to make the line
  # Column order: matches, misMatches, repMatches, nCount, qNumInsert,
  # qBaseInsert, tNumInsert, tBaseInsert, strand, qName, qSize, qStart, qEnd,
  # tName, tSize, tStart, tEnd, blockCount, blockSizes, qStarts, tStarts.
  combo_line = str(matches) + "\t" + str(misMatches) + "\t" + str(repMatches) + "\t" \
             + str(nCount) + "\t" + str(qNumInsert) + "\t" + str(qBaseInsert) + "\t" \
             + str(tNumInsert) + "\t" + str(tBaseInsert) + "\t" \
             + strand + "\t" + qName + "\t" + str(qSize) + "\t" \
             + str(qStart) + "\t" + str(qEnd) + "\t" \
             + tName + "\t" + str(tSize) + "\t" \
             + str(tStart) + "\t" + str(tEnd) + "\t" + str(blockCount) + "\t" \
             + blockSizes + "\t" + qStarts + "\t" + tStarts
  #print combo_line
  #print q_start_array
  #print t_start_array
  #print block_size_array
  #  print str(right['qStarts'][i])+"\t"+str(right['qStarts'][i]+right['blockSizes'][i])
  #  print i
  return PSLBasics.line_to_entry(combo_line)