Example #1
def parse_gpdfile(tdir,gpdfile,smoothing_factor):
  # Go through the long reads and make a genepred
  if gpdfile != '-':
    fr = FileBasics.GenericFileReader(gpdfile)
  else:
    fr = sys.stdin
  seennames = {}
  longreadnumber = 0
  of_gpd = open(tdir+'/longreads.gpd','w')
  while True:
    line = fr.readline()
    if not line: break
    if re.match('^#',line): #skip comments
      continue
    longreadnumber += 1
    entry = GenePredBasics.smooth_gaps( \
              GenePredBasics.line_to_entry(line.rstrip()) \
              ,smoothing_factor)
    readname = entry['name']
    if readname in seennames:
      sys.stderr.write("Warning: repeat name '"+readname+"'\n")
    #set our first name to our bin
    entry['name'] = str(longreadnumber)
    gline = GenePredBasics.entry_to_line(entry)
    of_gpd.write(gline+"\n")
  fr.close()
  of_gpd.close()
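These snippets come without their imports: they rely on the project's FileBasics, GenePredBasics, PSLBasics, SamBasics and SequenceBasics helper modules plus the standard sys and re modules. A minimal, hypothetical setup and call for the function above (directory and file names are placeholders, not from the original code):

import sys
import re
import FileBasics        # project helper modules, assumed to be importable
import GenePredBasics

tdir = '/tmp/workdir'    # hypothetical temporary directory that already exists
# close gaps smaller than 68 bp and rename each read to its bin number
parse_gpdfile(tdir, 'input_longreads.gpd', 68)
# output is written to /tmp/workdir/longreads.gpd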
def get_exons_from_seqs(seqs, d, spcf):
    sind = 0
    oline = ''
    for seq in seqs:
        sind += 1
        psec = 'P'  #primary or secondary
        if sind > 1: psec = 'S'
        d1 = d.copy()
        d1['rname'] = seq[1]
        if seq[2] == '+': d1['flag'] = 0
        else: d1['flag'] = 16
        d1['pos'] = seq[3]
        d1['cigar'] = seq[4]
        d1['cigar_array'] = SamBasics.parse_cigar(seq[4])
        skips = set(['H', 'D', 'N'])
        total_length = 0
        possible_matches = 0
        indels = 0
        qstart = 0
        if d1['cigar_array'][0]['op'] == 'S':
            qstart = d1['cigar_array'][0]['val']
        if d1['cigar_array'][0]['op'] == 'H':
            qstart = d1['cigar_array'][0]['val']
        for ce in d1['cigar_array']:
            if ce['op'] not in skips:
                total_length += ce['val']
            if ce['op'] == 'M': possible_matches += ce['val']
            elif ce['op'] == 'I':
                indels += ce['val']
            elif ce['op'] == 'D' and ce['val'] < 68:
                indels += ce['val']
        fakeseq = 'N' * total_length
        d1['seq'] = fakeseq
        nline = SamBasics.entry_to_line(d1)
        pline = spcf.convert_line(nline)
        pentry = PSLBasics.line_to_entry(pline)
        #mismatch_count = -1
        #if sind == 1 and args.reference_genome: #for primary alignments we can calculate the number of matches
        #  for i in range(0,len(pentry['blockSizes'])):
        #    tseq = spcf.genome[pentry['tName']][pentry['tStarts'][i]:pentry['tStarts'][i]+pentry['blockSizes'][i]]
        #    qseq = sequence[pentry['qStarts'][i]:pentry['qStarts'][i]+pentry['blockSizes'][i]]
        #    print pentry['blockSizes'][i]
        #    print tseq
        #    print qseq
        #    for j in range(0,len(tseq)):
        #      if tseq[j].upper() != qseq[j].upper(): mismatch_count += 1
        gline = PSLBasics.convert_entry_to_genepred_line(pentry)
        gentry = GenePredBasics.line_to_entry(gline)
        gsmooth = GenePredBasics.smooth_gaps(gentry, 68)
        for i in range(0, len(gsmooth['exonStarts'])):
            oline += gsmooth['chrom'] + "\t" + str(
                gsmooth['exonStarts'][i]) + "\t" + str(
                    gsmooth['exonEnds']
                    [i]) + "\t" + gsmooth['strand'] + "\t" + gsmooth[
                        'name'] + "\t" + str(possible_matches) + "\t" + str(
                            indels) + "\t" + psec + "\t" + str(qstart) + "\n"
    return oline
Example #3
def main():
  parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('genepred',help="FILENAME or use - for STDIN")
  parser.add_argument('--smoothing_size',type=int,default=68,help="INT no gaps less than this size")
  args = parser.parse_args()
  inf = sys.stdin
  if args.genepred != '-':
    inf = open(args.genepred)
  for line in inf:
    e = GenePredBasics.line_to_entry(line)
    e2 = GenePredBasics.smooth_gaps(e,args.smoothing_size)
    print GenePredBasics.entry_to_line(e2)
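The filter above is just a line_to_entry / smooth_gaps / entry_to_line round trip; a minimal sketch of the same operation on a single genePred line (68 bp mirrors the script's default smoothing size, the file name is hypothetical):

import sys
import GenePredBasics    # assumed importable from this project

with open('single_entry.gpd') as inf:                 # hypothetical one-line genePred file
    entry = GenePredBasics.line_to_entry(inf.readline())
smoothed = GenePredBasics.smooth_gaps(entry, 68)      # close gaps smaller than 68 bp
sys.stdout.write(GenePredBasics.entry_to_line(smoothed) + "\n")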
def main():
  parser = argparse.ArgumentParser(description='Use reference junctions when they are close',formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('--min_intron_size',type=int,default=68,help="INT min intron size")
  parser.add_argument('--min_local_support',type=int,default=0,help="INT min number of junctions within search_size of a junction in order to count it")
  parser.add_argument('--search_size',type=int,default=10,help="INT search space for reference")
  parser.add_argument('--output_fake_psl',help="FASTAFILE reference genome to make a fake PSL output")
  parser.add_argument('psl',help="PSLFILENAME or '-' for STDIN")
  parser.add_argument('reference_genepred',help="GENEPREDFILENAME for reference genepred")
  args = parser.parse_args()

  cpus = multiprocessing.cpu_count()

  genome = {}
  if args.output_fake_psl:
    genome = read_fasta_into_hash(args.output_fake_psl)

  #read in the reference genepred first
  gpf = GenePredBasics.GenePredFile(args.reference_genepred)
  #lets sort entries by chromosome
  ref = {}
  for e in [x.entry for x in gpf.entries]:
    if len(e['exonStarts']) <= 1: continue
    if e['chrom'] not in ref:
      ref[e['chrom']] = {}
    for i in range(1,len(e['exonStarts'])):
      if e['exonEnds'][i-1] not in ref[e['chrom']]:
        ref[e['chrom']][e['exonEnds'][i-1]] = {}
      if e['exonStarts'][i]+1 not in ref[e['chrom']][e['exonEnds'][i-1]]:
        ref[e['chrom']][e['exonEnds'][i-1]][e['exonStarts'][i]+1] = e['strand']
  #Stored all junctions as 1-base

  read_info = {}
  pf = GenericFileReader(args.psl)
  fcount_total = 0
  while True:
    line = pf.readline()
    if not line: break
    if re.match('^#',line): continue
    line = line.rstrip()
    pe = PSLBasics.line_to_entry(line)
    if len(pe['tStarts']) != len(pe['blockSizes']) or len(pe['qStarts']) != len(pe['blockSizes']):
      sys.stderr.write("WARNING invalid psl\n")
      continue
    genepred_line = PSLBasics.convert_entry_to_genepred_line(pe)
    ge = GenePredBasics.smooth_gaps(GenePredBasics.line_to_entry(genepred_line),args.min_intron_size)
    refjuns = {}
    if pe['tName'] in ref: refjuns = ref[pe['tName']]
    new_ge = nudge(pe,ge,refjuns,args)
    if args.output_fake_psl:
      new_psl_line = GenePredBasics.entry_to_fake_psl_line(new_ge,genome)
      print new_psl_line
    else:
      print GenePredBasics.entry_to_line(new_ge)
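The reference junctions built above are keyed as ref[chrom][exonEnd][exonStart + 1] = strand, which is the shape nudge() probes when it searches within --search_size of each observed junction. A small lookup sketch against that structure, with hypothetical coordinates:

# ref[chrom][donor (exonEnd)][acceptor (exonStart + 1)] = strand, stored 1-based as noted above
chrom, donor, acceptor = 'chr1', 14829, 14970        # hypothetical junction coordinates
if chrom in ref and donor in ref[chrom] and acceptor in ref[chrom][donor]:
    strand = ref[chrom][donor][acceptor]             # '+' or '-' from the reference annotation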
Example #5
def get_exons_from_seqs(seqs,d,spcf):
  sind = 0
  oline = ''
  for seq in seqs:
    sind+=1
    psec = 'P' #primary or secondary
    if sind > 1: psec = 'S'
    d1 = d.copy()
    d1['rname'] = seq[1]
    if seq[2] == '+':  d1['flag'] = 0
    else: d1['flag'] = 16
    d1['pos'] = seq[3]
    d1['cigar'] = seq[4]
    d1['cigar_array'] = SamBasics.parse_cigar(seq[4])
    skips = set(['H','D','N'])
    total_length = 0
    possible_matches = 0
    indels = 0
    qstart = 0
    if d1['cigar_array'][0]['op'] == 'S':
      qstart = d1['cigar_array'][0]['val']
    if d1['cigar_array'][0]['op'] == 'H':
      qstart = d1['cigar_array'][0]['val']
    for ce in d1['cigar_array']:
      if ce['op'] not in skips:
        total_length += ce['val']
      if ce['op'] == 'M': possible_matches += ce['val']
      elif ce['op'] == 'I':
        indels += ce['val']
      elif ce['op'] == 'D' and ce['val'] < 68:
        indels += ce['val']
    fakeseq = 'N'*total_length
    d1['seq'] = fakeseq
    nline = SamBasics.entry_to_line(d1)
    pline = spcf.convert_line(nline)
    pentry = PSLBasics.line_to_entry(pline)
    #mismatch_count = -1
    #if sind == 1 and args.reference_genome: #for primary alignments we can calculate the number of matches
    #  for i in range(0,len(pentry['blockSizes'])):
    #    tseq = spcf.genome[pentry['tName']][pentry['tStarts'][i]:pentry['tStarts'][i]+pentry['blockSizes'][i]]
    #    qseq = sequence[pentry['qStarts'][i]:pentry['qStarts'][i]+pentry['blockSizes'][i]]
    #    print pentry['blockSizes'][i]
    #    print tseq
    #    print qseq
    #    for j in range(0,len(tseq)):
    #      if tseq[j].upper() != qseq[j].upper(): mismatch_count += 1
    gline = PSLBasics.convert_entry_to_genepred_line(pentry)
    gentry = GenePredBasics.line_to_entry(gline)
    gsmooth = GenePredBasics.smooth_gaps(gentry,68)
    for i in range(0,len(gsmooth['exonStarts'])):
      oline += gsmooth['chrom'] + "\t" + str(gsmooth['exonStarts'][i])+"\t"+str(gsmooth['exonEnds'][i])+"\t"+gsmooth['strand']+"\t"+gsmooth['name']+"\t"+str(possible_matches)+"\t"+str(indels)+"\t"+psec+"\t"+str(qstart)+"\n"
  return oline
Example #6
def main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('genepred', help="FILENAME or use - for STDIN")
    parser.add_argument('--smoothing_size',
                        type=int,
                        default=68,
                        help="INT no gaps less than this size")
    args = parser.parse_args()
    inf = sys.stdin
    if args.genepred != '-':
        inf = open(args.genepred)
    for line in inf:
        e = GenePredBasics.line_to_entry(line)
        e2 = GenePredBasics.smooth_gaps(e, args.smoothing_size)
        print GenePredBasics.entry_to_line(e2)
Example #7
def convert_directionless_gpd_alignment_to_reference(sam_filename,genepred_filename,out_map):
  conv = GenePredBasics.get_directionless_gpd_conversion(genepred_filename)
  ofile = open(out_map,'w')
  with open(sam_filename) as samfile:
    for line in samfile:
      line = line.rstrip()
      if re.match('^@[A-Z][A-Z]\s',line): continue #skip header
      d = sam_line_to_dictionary(line)
      if d['rname'] == '*': continue #skip unmapped
      startposition = d['pos']-1
      readcoord = []
      z = 0
      for entry in d['cigar_array']:
        if re.match('[MISX=]',entry['op']):  # all the entries that map to the read
          for i in range(0,entry['val']):
            if re.match('[M=X]',entry['op']): #all the entries that match the reference alignment
              readcoord.append(conv[d['rname']]['coordinates'][startposition+z])
              z+=1
            # lets ignore insertions for now
            #else:
            #  readcoord.append('*')
        if re.match('[DNH]',entry['op']):
          z+= entry['val']      
      abbrev = conv[d['rname']]['chrom']+':'+SequenceBasics.collapse_coordinate_array(readcoord)
      ofile.write(d['qname'] + "\t" + d['rname'] + "\t" + abbrev + "\n")
  ofile.close()
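A hypothetical invocation of the mapper above; the SAM alignments, the genePred used to build the directionless conversion, and the output path are all placeholders:

# requires re, GenePredBasics, SequenceBasics and the sam_line_to_dictionary helper used above
convert_directionless_gpd_alignment_to_reference(
    'reads_vs_transcripts.sam',    # hypothetical SAM of reads aligned to transcript sequences
    'transcripts.gpd',             # genePred describing those transcript sequences
    'read_to_reference.map')       # output: read name, transcript name, genomic coordinate string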
Example #8
def parse_refgpd(tdir,geneprednames,simplenames):
  # get the reference genepreds ready to use in work
  column_number = 0
  entry_number = 0
  of_entries = open(tdir+"/entries.txt",'w')
  for file in geneprednames:
    column_number += 1
    of_ref = open(tdir+"/reference."+str(column_number)+".bed",'w')
    gfr = FileBasics.GenericFileReader(file)
    while True:
      line = gfr.readline()
      if not line: break
      if re.match('^#',line): continue
      entry_number += 1
      line = line.rstrip("\n")
      entry = GenePredBasics.line_to_entry(line)
      entry_length = 0
      for i in range(0,len(entry['exonStarts'])): entry_length += entry['exonEnds'][i]-entry['exonStarts'][i]
      of_entries.write(str(column_number)+ "\t" + simplenames[column_number-1] + "\t" + str(entry_number) + "\t" + entry['gene_name'] + "\t" + entry['name']+"\t"+str(entry_length)+"\n")
      exon_number = 0
      for i in range(0,len(entry['exonStarts'])):
        exon_number += 1
        of_ref.write(entry['chrom'] + "\t" + str(entry['exonStarts'][i]) + "\t" \
                   + str(entry['exonEnds'][i]) + "\t" + str(entry_number) + "\t" \
                   + entry['gene_name'] + "\t" \
                   + entry['name'] + "\t" + str(len(entry['exonStarts'])) + "\t" \
                   + entry['strand'] + "\t" + str(exon_number) \
                   + "\n")
    gfr.close()
    of_ref.close()
  of_entries.close()
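A hypothetical call to the reference loader above; for each input genePred it writes one reference.N.bed plus a shared entries.txt into the temporary directory:

tdir = '/tmp/workdir'                          # hypothetical temporary directory
geneprednames = ['refseq.gpd', 'gencode.gpd']  # hypothetical reference genePred files
simplenames = ['RefSeq', 'GENCODE']            # one display name per file
parse_refgpd(tdir, geneprednames, simplenames)
# produces /tmp/workdir/entries.txt, reference.1.bed and reference.2.bed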
def main():
  parser = argparse.ArgumentParser(description="Convert a psl file into a target formated genepred file.")
  parser.add_argument('--fill_gaps',type=int,default=0,help="Close gaps this size or smaller.")
  parser.add_argument('input_name',help="Input PSL file, use - to indicate STDIN.")
  args = parser.parse_args()
  
  pslfilehandle = sys.stdin
  if args.input_name != '-':
    pslfilehandle = open(args.input_name)
  with pslfilehandle as infile:
    for line in infile:
      psl_entry = PSLBasics.line_to_entry(line)
      genepred_line = PSLBasics.convert_entry_to_genepred_line(psl_entry)
      if args.fill_gaps > 0:
        genepred_entry = GenePredBasics.line_to_entry(genepred_line)
        genepred_entry2 = GenePredBasics.smooth_gaps(genepred_entry,args.fill_gaps)
        genepred_line = GenePredBasics.entry_to_line(genepred_entry2)
      print genepred_line
Example #10
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('input', help="GENEPRED file input use - for STDIN")
    args = parser.parse_args()
    inf = sys.stdin
    if args.input != '-':
        inf = open(args.input)
    for line in inf:
        e = GenePredBasics.GenePredEntry()
        e.line_to_entry(line.rstrip())
        print e.entry['gene_name'] + "\t" + e.entry['name'] + "\t" + str(
            e.length())
    inf.close()
def add_genepred_line(self, inline):
    if not self.ref_hash:
        sys.stderr.write(
            "ERROR: Must assign a reference genome dictionary first\n")
        sys.exit()
    gpd = GenePredBasics.GenePredEntry(inline)
    if gpd.value('name') in self.transcripts:
        sys.stderr.write("WARNING: " + inline +
                         " transcript was already set\n")
    seq = ''
    for i in range(0, gpd.value('exonCount')):
        seq += self.ref_hash[gpd.value('chrom')][
            gpd.value('exonStarts')[i]:gpd.value('exonEnds')[i]].upper()
    if gpd.value('strand') == '-': seq = SequenceBasics.rc(seq)
    self.transcripts[gpd.value('name')] = seq
    return
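add_genepred_line above refuses to run until a reference genome dictionary is attached. A hedged usage sketch, assuming the dictionary comes from read_fasta_into_hash (used elsewhere in these examples) and that the attribute is set directly, which is an assumption about the surrounding class:

ref_hash = read_fasta_into_hash('genome.fa')    # chromosome name -> sequence
converter.ref_hash = ref_hash                   # 'converter' stands for an instance of the class; direct assignment is assumed
converter.add_genepred_line(gpd_line.rstrip())  # gpd_line: one genePred line read from a file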
Example #12
  def read_from_fasta_and_genepred(self,genomefastafile,genepredfile):
    # read in our genome
    seen_names = {}
    seen_coords = {}
    genepred = {}
    with open(genepredfile) as inf:
      for line in inf:
        if re.match('^#',line): continue
        e = GenePredBasics.line_to_entry(line)
        hexcoord = hashlib.sha1(e['chrom']+"\t"+e['strand'] + "\t" + str(e['exonStarts'])+"\t" + str(e['exonEnds'])).hexdigest()
        #print hex
        #print e['gene_name']
        #print e['name']
        dupname = 0
        dupcoord = 0
        if hexcoord in seen_coords:
          sys.stderr.write("Warning "+ e['name'] + " " + e['gene_name'] + " exists at identical coordinates as another entry\n")
          dupcoord = 1
        seen_coords[hexcoord] = 1
        currname = e['name']
        if e['name'] in seen_names:
          if dupcoord == 1:
            sys.stderr.write("skipping perfect duplicate of "+e['name']+"\n")
            continue
          newname = e['name'] + "."+str(len(seen_names[e['name']])+1)
          currname = newname
          seen_names[e['name']].append(newname)
          sys.stderr.write("Warning "+ e['name'] + " " + e['gene_name'] + " is a duplicate name.. renaming to "+newname+ "\n")
          dupname = 1
        else:
          seen_names[e['name']] = []
          seen_names[e['name']].append(e['name'])
        genepred[currname] = e

    #print "reading names and locs"             
    ref = read_fasta_into_hash(genomefastafile)
    #print "converting sequences"
    for transcript in genepred:
      e = genepred[transcript]
      if e['chrom'] in ref:
        seq = ''
        self.transcript_names[transcript] = genepred[transcript]['name']
        for i in range(0,e['exonCount']):
          seq += ref[e['chrom']][e['exonStarts'][i]:e['exonEnds'][i]]
        if e['strand'] == '-': seq = rc(seq)
        self.transcripts[transcript] = seq
Example #13
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('input_file', help="use - for STDIN")
    args = parser.parse_args()
    inf = sys.stdin
    if args.input_file != '-':
        inf = open(args.input_file)
    for line in inf:
        e = GenePredBasics.line_to_entry(line.rstrip())
        matches = 0
        qstartslist = []
        for i in range(0, len(e['exonStarts'])):
            mylen = e['exonEnds'][i] - e['exonStarts'][i]
            matches += mylen
            qstartslist.append(matches - mylen)
        qstarts = ','.join([str(x) for x in qstartslist]) + ','
        oline = str(matches) + "\t"  # 1
        oline += "0\t"  # 2
        oline += "0\t"  # 3
        oline += "0\t"  # 4
        oline += "0\t"  # 5
        oline += "0\t"  # 6
        oline += "0\t"  # 7
        oline += "0\t"  # 8
        oline += e['strand'] + "\t"  # 9
        oline += e['name'] + "\t"  # 10
        oline += str(matches) + "\t"  # 11
        oline += "0\t"  # 12
        oline += str(matches) + "\t"  # 13
        oline += str(e['chrom']) + "\t"  # 14
        oline += str(e['exonEnds'][-1]) + "\t"  # 15
        oline += str(e['exonStarts'][0]) + "\t"  # 16
        oline += str(e['exonEnds'][-1]) + "\t"  # 17
        oline += str(len(e['exonStarts'])) + "\t"  # 18
        oline += ','.join([
            str(e['exonEnds'][x] - e['exonStarts'][x])
            for x in range(0, len(e['exonStarts']))
        ]) + ',' + "\t"  # 19
        oline += qstarts + "\t"  # 20
        oline += ','.join([str(x) for x in e['exonStarts']]) + ','  # 21
        print oline
    inf.close()
def main():
  parser = argparse.ArgumentParser(description='Create artifical reference sequences from a genepred')
  parser.add_argument('gpd_file')
  parser.add_argument('reference_fasta')
  parser.add_argument('-o','--output',help="output file to write to or STDOUT if not set")
  args = parser.parse_args()
  of  = sys.stdout
  if args.output: of = open(args.output,'w')
  f = read_fasta_into_hash(args.reference_fasta)
  with open(args.gpd_file) as inf:
    for line in inf:
      gpd = GenePredBasics.GenePredEntry()
      gpd.line_to_entry(line.rstrip())
      ars = ARS()
      beds = []
      for i in range(0,gpd.value('exonCount')):
        b = Bed(gpd.value('chrom'),gpd.value('exonStarts')[i],gpd.value('exonEnds')[i],gpd.value('strand'))
        beds.append(b)
      ars.set_bounds(beds)
      ars.set_name(gpd.value('name'))
      ars.set_sequence_from_original_reference_hash(f)
      of.write(ars.get_fasta())
def main():
  parser = argparse.ArgumentParser()
  parser.add_argument('input_file',help="use - for STDIN")
  args = parser.parse_args()
  inf = sys.stdin
  if args.input_file != '-':
    inf = open(args.input_file)
  for line in inf:
    e = GenePredBasics.line_to_entry(line.rstrip())
    matches = 0
    qstartslist = []
    for i in range(0,len(e['exonStarts'])):
      mylen = e['exonEnds'][i]-e['exonStarts'][i]
      matches += mylen
      qstartslist.append(matches-mylen)
    qstarts = ','.join([str(x) for x in qstartslist])+','
    oline =  str(matches)+"\t" # 1
    oline += "0\t" # 2
    oline += "0\t" # 3
    oline += "0\t" # 4
    oline += "0\t" # 5
    oline += "0\t" # 6
    oline += "0\t" # 7
    oline += "0\t" # 8
    oline += e['strand']+"\t" # 9
    oline += e['name']+"\t" # 10
    oline += str(matches)+"\t" # 11
    oline += "0\t" # 12
    oline += str(matches)+"\t" # 13
    oline += str(e['chrom'])+"\t" # 14
    oline += str(e['exonEnds'][-1])+"\t" # 15
    oline += str(e['exonStarts'][0])+"\t" # 16
    oline += str(e['exonEnds'][-1])+"\t" # 17
    oline += str(len(e['exonStarts']))+"\t" # 18
    oline += ','.join([str(e['exonEnds'][x]-e['exonStarts'][x]) for x in range(0,len(e['exonStarts']))])+','+"\t" # 19
    oline += qstarts + "\t" # 20
    oline += ','.join([str(x) for x in e['exonStarts']])+',' # 21
    print oline
  inf.close()
Example #16
def main():
    #do our inputs
    args = do_inputs()
    global gout
    gout = args.output
    gls = GenePredBasics.GenePredLocusStream(args.input)
    fgs = GenePredFuzzyBasics.FuzzyGenePredSeparator()
    if args.threads > 1:
        p = Pool(processes=args.threads)
    while True:
        buffer = gls.read_locus()
        if not buffer: break
        if args.threads > 1:
            p.apply_async(process_buffer,
                          args=(buffer, args),
                          callback=out_gpds)
        else:
            v = process_buffer(buffer, args)
            out_gpds(v)
    if args.threads > 1:
        p.close()
        p.join()
    sys.stderr.write("\n")
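Example #16 depends on do_inputs, process_buffer and out_gpds helpers that are not included in these examples (Pool here is multiprocessing.Pool). Hypothetical stubs sketching the shapes the loop above expects, not the project's actual implementations:

def process_buffer(buffer, args):
    # collapse one locus worth of genePred entries into fuzzy gene predictions (stub)
    return []

def out_gpds(results):
    # callback for both the threaded and single-threaded paths: write finished lines (stub)
    for line in results:
        gout.write(line + "\n")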
Example #17
def break_gpdfile(tdir,job_size):
  bfcr = BigFileBasics.BigFileChunkReader(tdir+'/longreads.gpd')
  bfcr.set_chunk_size_bytes(job_size)
  num_jobs = bfcr.chunk_count
  for i in range(0,bfcr.chunk_count):
    oc = bfcr.open_chunk(i)
    job = i+1
    of_bed = open(tdir+'/partreads.'+str(job)+'.bed','w')
    while True:
      line = oc.read_line()
      if not line: break
      line = line.rstrip("\n")
      entry = GenePredBasics.line_to_entry(line)
      exon_number = 0
      for i in range(0,len(entry['exonStarts'])):
        exon_number += 1
        of_bed.write(entry['chrom'] + "\t" + str(entry['exonStarts'][i]) + "\t" \
                     + str(entry['exonEnds'][i]) + "\t" + entry['name']+"\t" \
                     + entry['gene_name'] + "\t" + str(len(entry['exonStarts'])) + "\t" \
                     + entry['strand'] + "\t" + str(exon_number) + "\n")   
    oc.close()
    of_bed.close()
  return num_jobs
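In the driver these helpers belong to, break_gpdfile picks up the longreads.gpd file written by parse_gpdfile (Example #1) and splits it into per-job BED files. A hypothetical sequence of the two calls (the chunk size in bytes is a guess, not taken from the original code):

tdir = '/tmp/workdir'                              # hypothetical temporary directory
parse_gpdfile(tdir, 'input_longreads.gpd', 68)     # writes tdir + '/longreads.gpd'
num_jobs = break_gpdfile(tdir, 50 * 1000 * 1000)   # chunk into roughly 50 MB jobs
sys.stderr.write(str(num_jobs) + " jobs written as partreads.N.bed\n")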
Example #18
def convert_directionless_gpd_alignment_to_reference(sam_filename,
                                                     genepred_filename,
                                                     out_map):
    conv = GenePredBasics.get_directionless_gpd_conversion(genepred_filename)
    ofile = open(out_map, 'w')
    with open(sam_filename) as samfile:
        for line in samfile:
            line = line.rstrip()
            if re.match('^@[A-Z][A-Z]\s', line): continue  #skip header
            d = sam_line_to_dictionary(line)
            if d['rname'] == '*': continue  #skip unmapped
            startposition = d['pos'] - 1
            readcoord = []
            z = 0
            for entry in d['cigar_array']:
                if re.match(
                        '[MISX=]',
                        entry['op']):  # all the entries that map to the read
                    for i in range(0, entry['val']):
                        if re.match(
                                '[M=X]', entry['op']
                        ):  #all the entries that match the reference alignment
                            readcoord.append(
                                conv[d['rname']]['coordinates'][startposition +
                                                                z])
                            z += 1
                        # lets ignore insertions for now
                        #else:
                        #  readcoord.append('*')
                if re.match('[DNH]', entry['op']):
                    z += entry['val']
            abbrev = conv[d['rname']][
                'chrom'] + ':' + SequenceBasics.collapse_coordinate_array(
                    readcoord)
            ofile.write(d['qname'] + "\t" + d['rname'] + "\t" + abbrev + "\n")
    ofile.close()
def nudge(psl_entry,gpd_entry,refjun,args):
  junctions = []
  fcount = 0
  if len(gpd_entry['exonStarts']) == 1:
    #print "no intron 1"
    return gpd_entry
  bounds = []
  for i in range(1,len(gpd_entry['exonStarts'])):
    junc_start = gpd_entry['exonEnds'][i-1]
    junc_finish = gpd_entry['exonStarts'][i]+1
    bounds.append([junc_start, junc_finish,i-1])
  if len(bounds) < 1:
    #print "no intron 2"
    return gpd_entry
  bestbounds = []
  for bound in bounds:
    best_distance = [10000000,10000000]
    best_result = None
    for z1 in range(bound[0]-args.search_size,bound[0]+args.search_size+1):
      d1 = abs(z1-bound[0])
      if z1 in refjun:
        for z2 in range(bound[1]-args.search_size,bound[1]+args.search_size+args.search_size+1):
          d2 = abs(z2-bound[1])
          if z2 in refjun[z1]:
            refstrand = refjun[z1][z2]
            if d1+d2 < best_distance[0]+best_distance[1]:
              best_distance = [d1,d2]
              best_result = [z1,z2,refstrand,bound[2]]+best_distance
    if best_result:
      bestbounds.append(best_result)
  if len(bestbounds) < 1: 
    #nothing fixable
    #sys.stderr.write("nothing fixable\n")
    return gpd_entry
  #Now we have a list of nudgable bounds
  #Lets pick a strand
  plus_score = 0
  minus_score = 0
  #print '----'
  #print bestbounds
  for bound in bestbounds:
    if bound[2] == '+':
      plus_score += 1/(float(abs(bound[4]))+float(abs(bound[5]))+1)
    else:
      minus_score += 1/(float(abs(bound[4]))+float(abs(bound[5]))+1)
  use_strand = '+'
  #print [plus_score,minus_score]
  if plus_score < minus_score: use_strand = '-'
  #print use_strand
  choice_bounds = []
  for bound in bestbounds:
    if bound[2] == use_strand:  choice_bounds.append(bound)
  #print '---'
  #print GenePredBasics.entry_to_line(gpd_entry)
  #print bestbounds
  #print choice_bounds
  if len(choice_bounds) < 1: 
    print "ERROR  should have choices"
    sys.exit()
  replacements = {}
  for bound in choice_bounds:  replacements[bound[3]] = [bound[0],bound[1]]
  junctions = []
  #print "fixed "+str(len(replacements.keys()))
  for i in range(0,len(bounds)):
    val = bounds[i]
    if i in replacements:
      #sys.stderr.write("use replacement\n")
      val = replacements[i]
      fcount += 1
    junctions.append([val[0],val[1]])
  #print junctions
  #sys.stderr.write("replace\n")
  #print junctions
  new_gpd_line  = gpd_entry['gene_name'] + "\t"
  new_gpd_line += gpd_entry['name'] + "\t"
  new_gpd_line += gpd_entry['chrom'] + "\t"
  new_gpd_line += gpd_entry['strand'] + "\t"
  new_gpd_line += str(gpd_entry['txStart']) + "\t"
  new_gpd_line += str(gpd_entry['txEnd']) + "\t"
  new_gpd_line += str(gpd_entry['cdsStart']) + "\t"
  new_gpd_line += str(gpd_entry['cdsEnd']) + "\t"
  new_gpd_line += str(len(junctions)+1) + "\t"
  exon_starts = [gpd_entry['txStart']]
  exon_ends = [] #gpd_entry['txEnd']]
  for junc in junctions:
    exon_starts.append(junc[1]-1)
    exon_ends.append(junc[0])
  exon_ends.append(gpd_entry['txEnd'])
  new_gpd_line += ','.join([str(x) for x in exon_starts])+','+"\t"
  new_gpd_line += ','.join([str(x) for x in exon_ends])+','+"\t"
  #print new_gpd_line
  new_gpd_entry = GenePredBasics.line_to_entry(new_gpd_line)
  #print "got junctions"
  #print new_gpd_line
  #print '.........'
  return new_gpd_entry
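The strand vote inside nudge() weights each nudgable junction by 1/(|d1|+|d2|+1), so an exact match to a reference junction (d1 = d2 = 0) contributes a full vote of 1 and more distant matches contribute less; ties keep the default '+'. A tiny worked example with hypothetical distances:

# each bestbounds entry ends with [d1, d2]; suppose two '+' junctions and one '-' junction
plus_score  = 1.0 / (0 + 0 + 1) + 1.0 / (2 + 1 + 1)     # 1.0 + 0.25 = 1.25
minus_score = 1.0 / (3 + 4 + 1)                         # 0.125
use_strand = '+' if plus_score >= minus_score else '-'  # here '+' wins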
Example #20
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('a', nargs=1, help='FILENAME genepred file A')
    parser.add_argument('b', nargs=1, help='FILENAME genepred file B')
    #parser.add_argument('-p',nargs='?',help='INT the number of threads to run.')
    parser.add_argument('--minexoncount',
                        nargs='?',
                        help='INT the minimum number of exons required.')
    parser.add_argument(
        '--minoverlap_internal',
        nargs='?',
        help=
        'FLOAT the fraction (0-1) of the required reciprocal overlap of an internal exon to call an exon a match.'
    )
    parser.add_argument(
        '--minoverlap_first',
        nargs='?',
        help=
        'FLOAT the fraction (0-1) of the required reciprocal overlap of the first exon to call an exon a match.'
    )
    parser.add_argument(
        '--minoverlap_last',
        nargs='?',
        help=
        'FLOAT the fraction (0-1) of the required reciprocal overlap of the last exon to call an exon a match.'
    )
    parser.add_argument(
        '--minoverlap',
        nargs='?',
        help=
        'FLOAT the fraction (0-1) of the required reciprocal overlap of any exon to call an exon a match.'
    )
    parser.add_argument(
        '--leftouterjoin',
        action='store_true',
        help=
        'Output the entry A regardless of whether a matching entry in B is found'
    )
    parser.add_argument('--output_a_not_in_b',
                        action='store_true',
                        help='Output entries that occur in A but not B')
    parser.add_argument(
        '--best_b_only',
        action='store_true',
        help=
        'Output only one entry of B for each A and try to pick the best based on reciprocal overlap'
    )
    parser.add_argument(
        '--allow_a_subset_of_b_fragments',
        action='store_true',
        help=
        'If A is just a subset of B, then call it as a match.  This means all exons of A found a consecutive match, but B could have more exons on either end.'
    )
    parser.add_argument(
        '--allow_any_fragments',
        action='store_true',
        help='If set, allow any partial match, not just the best')
    args = parser.parse_args()

    #pcount = multiprocessing.cpu_count()
    #if args.p: pcount = int(args.p)
    # go through contingencies of overlap requirements and set them
    overlap = [0, 0, 0]
    if args.minoverlap:
        overlap = [
            float(args.minoverlap),
            float(args.minoverlap),
            float(args.minoverlap)
        ]
    if args.minoverlap_first:
        overlap[0] = float(args.minoverlap_first)
    if args.minoverlap_last:
        overlap[2] = float(args.minoverlap_last)
    if args.minoverlap_internal:
        overlap[1] = float(args.minoverlap_internal)

    # read the genepred files
    gpdA = GenePredBasics.GenePredFile(args.a[0])
    gpdB = GenePredBasics.GenePredFile(args.b[0])

    #if pcount > 1:
    #  p = multiprocessing.Pool(processes=pcount)
    for eA in gpdA.entries:
        #if pcount > 1:
        #  p.apply_async(check_B_entries,[eA,overlap,args])
        #else:
        check_B_entries(eA, gpdB, overlap, args)
Example #21
def check_B_entries(eA, gpdB, overlap, args):
    a_unique = True
    best_exon_count = 0
    best_overlap = 0
    best_line = ''
    best_frac = 0
    ostring = ''
    for eB in gpdB.entries:
        double_line = GenePredBasics.entry_to_line(
            eA.entry) + "\t" + GenePredBasics.entry_to_line(eB.entry) + "\n"
        gpd_comparison = GenePredBasics.GenePredComparison()
        gpd_comparison.set_overlap_requirement(overlap)
        if eA.entry['chrom'] != eB.entry['chrom']: continue
        # normal is to do full length matches
        if not (args.allow_a_subset_of_b_fragments
                or args.allow_any_fragments):
            # do some easy checks
            if eA.get_exon_count() != eB.get_exon_count(): continue
            gpd_comparison.set_require_all_exons_overlap(True)
            gpd_comparison.compare(eA, eB)
            if gpd_comparison.output['full_match']:
                a_unique = False
                if args.output_a_not_in_b:
                    break  # we can bust out of the inner loop if we are only printing stuff unique to a
                if not args.best_b_only:  # if we aren't waiting for the best, print it
                    ostring += double_line
                else:
                    # only do the best
                    if gpd_comparison.output['consecutive_exons'] > best_exon_count \
                    or (gpd_comparison.output['consecutive_exons'] == best_exon_count \
                    and gpd_comparison.output['overlap_length'] > best_overlap) \
                    or (gpd_comparison.output['consecutive_exons'] == best_exon_count \
                    and gpd_comparison.output['overlap_length'] == best_overlap \
                    and harmonic_mean(gpd_comparison.output['overlap_fractions']) > best_frac):
                        best_exon_count = gpd_comparison.output[
                            'consecutive_exons']
                        best_overlap = gpd_comparison.output['overlap_length']
                        best_line = double_line
                        best_frac = harmonic_mean(
                            gpd_comparison.output['overlap_fractions'])
        # Allow partial matches
        else:
            gpd_comparison.compare(eA, eB)
            if gpd_comparison.output['partial_match']:
                # if we require a to be subset of b
                if args.allow_a_subset_of_b_fragments \
                and not (eA.get_exon_count() < eB.get_exon_count() \
                and eA.get_exon_count() == gpd_comparison.output['consecutive_exons']):
                    break
                a_unique = False
                if args.output_a_not_in_b:
                    break
                    # only do the best
                if not args.best_b_only:
                    ostring += double_line
                else:
                    if gpd_comparison.output['consecutive_exons'] > best_exon_count \
                    or (gpd_comparison.output['consecutive_exons'] == best_exon_count \
                    and gpd_comparison.output['overlap_length'] > best_overlap) \
                    or (gpd_comparison.output['consecutive_exons'] == best_exon_count \
                    and gpd_comparison.output['overlap_length'] == best_overlap \
                    and harmonic_mean(gpd_comparison.output['overlap_fractions']) > best_frac):
                        best_exon_count = gpd_comparison.output[
                            'consecutive_exons']
                        best_overlap = gpd_comparison.output['overlap_length']
                        best_line = double_line
                        best_frac = harmonic_mean(
                            gpd_comparison.output['overlap_fractions'])
    if best_exon_count > 0 and args.best_b_only:
        ostring += best_line
    if a_unique and (args.output_a_not_in_b or args.leftouterjoin):
        ostring += GenePredBasics.entry_to_line(eA.entry) + "\n"
    sys.stdout.write(ostring)
    #oval.put(ostring)
    return
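check_B_entries breaks ties with a harmonic_mean over the per-exon overlap fractions, but that helper is not shown in these examples. A plausible sketch of such a helper, an assumption rather than the project's actual implementation:

def harmonic_mean(fractions):
    # harmonic mean of the per-exon reciprocal-overlap fractions; 0 if empty or any fraction is 0
    if len(fractions) == 0 or min(fractions) == 0:
        return 0
    return float(len(fractions)) / sum(1.0 / f for f in fractions)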
def main():
  parser = argparse.ArgumentParser(description='report regions that lack annotations')
  parser.add_argument('--read_annotations',help="FILENAME either rawoutput or bestoutput from annotate_psl_with_gpd")
  parser.add_argument('--bam',help="FILENAME of sorted bam file",required=True)
  parser.add_argument('--tempdir',default="/tmp",help="DIRECTORY of where temporary files can be stored")
  parser.add_argument('--depth',type=int,help="INT Instead of checking many depths only check this depth")
  parser.add_argument('--minintron',default=68,type=int,help="INT minimum size of intron default 68")
  parser.add_argument('--maxintron',default=100000,type=int,help="INT maximum size of intron default 100000")
  parser.add_argument('--gpdoutput',help="FILENAME store the genepred file created")
  parser.add_argument('--output','-o',help="FILENAME bed format output")
  group2 = parser.add_mutually_exclusive_group()
  group2.add_argument('--full',action='store_true',help="Exclude reads with full matches, retaining only partial and novel matches.")
  group2.add_argument('--partial',action='store_true',help="Exclude reads with partial matches, retaining only novel reads DEFAULT.")
  args = parser.parse_args()

  depth = {}
  if not os.path.exists(args.tempdir):
    sys.stderr.write("could not find temporary directory path\n")
    return
  if not os.path.exists(args.tempdir.rstrip("/")+"/weirathe"):
    os.makedirs(args.tempdir.rstrip("/")+"/weirathe")
  tdir = args.tempdir.rstrip("/") + "/weirathe/weirathe.orphan"+str(randint(1,10000000))
  sys.stderr.write("Using temporary directory: "+tdir+"\n")
  if not os.path.exists(tdir):
    os.makedirs(tdir)

  # iterate though read annotations
  annotated_reads = set()
  if args.read_annotations:
    with open(args.read_annotations) as inf:
      for line in inf:
        line = line.rstrip()
        if re.match('^psl_entry_id\s',line): continue
        if re.match('^$',line): continue
        f = line.split("\t")

        if args.full: # we only want the full matches
          if f[9] != 'Full': continue
        annotated_reads.add(f[1])

  if args.bam:
    # Later we will want to have chromosome lengths
    cmd0 = "samtools view -H "+args.bam
    ps0 = subprocess.Popen(cmd0.split(),stdout=subprocess.PIPE)
    of0 = open(tdir+"/lengths.txt",'w')
    for line in ps0.stdout:
      line = line.rstrip()
      if re.match('^@SQ',line):
        m1 = re.search('\sSN:(\S+)',line)
        m2 = re.search('\sLN:(\S+)',line)
        if m1 and m2: 
          of0.write(m1.group(1)+"\t"+m2.group(1)+"\n")
    of0.close()
    ps0.communicate()
    # first filter our bam
    cmd1 = "samtools view -h "+args.bam
    ps1 = subprocess.Popen(cmd1.split(),stdout=subprocess.PIPE)
    cmd2 = "samtools view -Sb -o "+tdir+"/temp.bam"+" -"
    ps2 = subprocess.Popen(cmd2.split(),stdin=subprocess.PIPE)
    for line in ps1.stdout:
      f = line.rstrip().split("\t")
      if len(f) < 9:
        ps2.stdin.write(line)
      elif f[0] not in annotated_reads:
        ps2.stdin.write(line)
    ps1.stdout.close()
    ps2.communicate()

    # Now sort the new bam file
    cmd3 = "samtools sort "+tdir+"/temp.bam"+" "+tdir+"/temp.sorted"
    subprocess.call(cmd3.split())
    # Now get the coverage information
    cmd4 = "bedtools genomecov -bg -split -ibam "+tdir+"/temp.sorted.bam"
    coverage_file = tdir+"/temp.bed"
    of4 = open(coverage_file,'w')
    subprocess.call(cmd4.split(),stdout=of4)
    of4.close()
    #find our maxdepth
    maxdepth = 0
    with open(coverage_file) as inf:
      for line in inf:
        f = line.rstrip().split("\t")
        cov = int(f[3])
        if cov > maxdepth: maxdepth = cov
    print maxdepth

    # for all our depths make a bed file to explore 
    fhs = {}
    depths = []
    d = 1 #starting depth
    while d < maxdepth:
      depths.append(d)
      d*=2
    depths.append(maxdepth)
    if args.depth: depths = [args.depth]
    sys.stderr.write(str(depths)+"\n")
    for i in depths:
      fhs[i] = open(tdir+"/depth."+str(i)+".bed",'w')
    with open(coverage_file) as inf:
      for line in inf:
        f = line.rstrip().split("\t")
        cov = int(f[3])
        for i in depths:
          if cov >= i:
            fhs[i].write(line)
          else:
            continue
    for i in fhs:
      fhs[i].close()

    #sort the bed files
    for i in depths:
      cmd5 = "bedtools sort -i "+tdir+"/depth."+str(i)+".bed"
      of5 = open(tdir+"/depth."+str(i)+".sorted.bed",'w')
      subprocess.call(cmd5.split(),stdout=of5)
      of5.close()

    # for each of our depths get the merged bed
    z = 0
    if args.gpdoutput:
      ofgpd = open(args.gpdoutput,'w')
    ofout = sys.stdout
    if args.output:
      ofout = open(args.output,'w')
    for i in depths:
      #compress_depth(tdir,i,args.minintron)
      bfile = tdir + "/depth."+str(i)+".sorted.bed"
      gpd_entries = GenePredBasics.bed_to_genepred(args.minintron,args.maxintron,bfile)
      for e in gpd_entries:
        z+=1
        iter = e.entry['name']
        name = "depth-"+str(i)+"_"+str(iter)
        e.entry['gene_name'] = str(i)
        e.entry['name'] = name
        line = e.get_line()
        length = e.length()
        exons = e.get_exon_count()
        if args.gpdoutput:
          ofgpd.write(line+"\n")
        ofout.write(e.entry['chrom'] + "\t" + str(e.entry['txStart']) + "\t" + str(e.entry['txEnd']) + "\t" + str(i) + "\t" + str(exons) + "\t" + str(length) + "\t" + name +"\n")

  rmtree(tdir)        
def nudge(psl_entry, gpd_entry, refjun, args):
    junctions = []
    fcount = 0
    if len(gpd_entry['exonStarts']) == 1:
        #print "no intron 1"
        return gpd_entry
    bounds = []
    for i in range(1, len(gpd_entry['exonStarts'])):
        junc_start = gpd_entry['exonEnds'][i - 1]
        junc_finish = gpd_entry['exonStarts'][i] + 1
        bounds.append([junc_start, junc_finish, i - 1])
    if len(bounds) < 1:
        #print "no intron 2"
        return gpd_entry
    bestbounds = []
    for bound in bounds:
        best_distance = [10000000, 10000000]
        best_result = None
        for z1 in range(bound[0] - args.search_size,
                        bound[0] + args.search_size + 1):
            d1 = abs(z1 - bound[0])
            if z1 in refjun:
                for z2 in range(
                        bound[1] - args.search_size,
                        bound[1] + args.search_size + args.search_size + 1):
                    d2 = abs(z2 - bound[1])
                    if z2 in refjun[z1]:
                        refstrand = refjun[z1][z2]
                        if d1 + d2 < best_distance[0] + best_distance[1]:
                            best_distance = [d1, d2]
                            best_result = [z1, z2, refstrand, bound[2]
                                           ] + best_distance
        if best_result:
            bestbounds.append(best_result)
    if len(bestbounds) < 1:
        #nothing fixable
        #sys.stderr.write("nothing fixable\n")
        return gpd_entry
    #Now we have a list of nudgable bounds
    #Lets pick a strand
    plus_score = 0
    minus_score = 0
    #print '----'
    #print bestbounds
    for bound in bestbounds:
        if bound[2] == '+':
            plus_score += 1 / (float(abs(bound[4])) + float(abs(bound[5])) + 1)
        else:
            minus_score += 1 / (float(abs(bound[4])) + float(abs(bound[5])) +
                                1)
    use_strand = '+'
    #print [plus_score,minus_score]
    if plus_score < minus_score: use_strand = '-'
    #print use_strand
    choice_bounds = []
    for bound in bestbounds:
        if bound[2] == use_strand: choice_bounds.append(bound)
    #print '---'
    #print GenePredBasics.entry_to_line(gpd_entry)
    #print bestbounds
    #print choice_bounds
    if len(choice_bounds) < 1:
        print "ERROR  should have choices"
        sys.exit()
    replacements = {}
    for bound in choice_bounds:
        replacements[bound[3]] = [bound[0], bound[1]]
    junctions = []
    #print "fixed "+str(len(replacements.keys()))
    for i in range(0, len(bounds)):
        val = bounds[i]
        if i in replacements:
            #sys.stderr.write("use replacement\n")
            val = replacements[i]
            fcount += 1
        junctions.append([val[0], val[1]])
    #print junctions
    #sys.stderr.write("replace\n")
    #print junctions
    new_gpd_line = gpd_entry['gene_name'] + "\t"
    new_gpd_line += gpd_entry['name'] + "\t"
    new_gpd_line += gpd_entry['chrom'] + "\t"
    new_gpd_line += gpd_entry['strand'] + "\t"
    new_gpd_line += str(gpd_entry['txStart']) + "\t"
    new_gpd_line += str(gpd_entry['txEnd']) + "\t"
    new_gpd_line += str(gpd_entry['cdsStart']) + "\t"
    new_gpd_line += str(gpd_entry['cdsEnd']) + "\t"
    new_gpd_line += str(len(junctions) + 1) + "\t"
    exon_starts = [gpd_entry['txStart']]
    exon_ends = []  #gpd_entry['txEnd']]
    for junc in junctions:
        exon_starts.append(junc[1] - 1)
        exon_ends.append(junc[0])
    exon_ends.append(gpd_entry['txEnd'])
    new_gpd_line += ','.join([str(x) for x in exon_starts]) + ',' + "\t"
    new_gpd_line += ','.join([str(x) for x in exon_ends]) + ',' + "\t"
    #print new_gpd_line
    new_gpd_entry = GenePredBasics.line_to_entry(new_gpd_line)
    #print "got junctions"
    #print new_gpd_line
    #print '.........'
    return new_gpd_entry
def check_B_entries(eA,gpdB,overlap,args):
    a_unique = True
    best_exon_count = 0
    best_overlap = 0
    best_line = ''
    best_frac = 0
    ostring = ''
    for eB in gpdB.entries:
      double_line = GenePredBasics.entry_to_line(eA.entry) + "\t" + GenePredBasics.entry_to_line(eB.entry) + "\n"
      gpd_comparison = GenePredBasics.GenePredComparison()
      gpd_comparison.set_overlap_requirement(overlap)
      if eA.entry['chrom'] != eB.entry['chrom']: continue
      # normal is to do full length matches
      if not (args.allow_a_subset_of_b_fragments or args.allow_any_fragments):
        # do some easy checks
        if eA.get_exon_count() != eB.get_exon_count(): continue
        gpd_comparison.set_require_all_exons_overlap(True)
        gpd_comparison.compare(eA,eB)
        if gpd_comparison.output['full_match']:
          a_unique = False
          if args.output_a_not_in_b:
            break # we can bust out of the inner loop if we are only printing stuff unique to a 
          if not args.best_b_only: # if we aren't waiting for the best, print it
            ostring += double_line
          else:
            # only do the best
            if gpd_comparison.output['consecutive_exons'] > best_exon_count \
            or (gpd_comparison.output['consecutive_exons'] == best_exon_count \
            and gpd_comparison.output['overlap_length'] > best_overlap) \
            or (gpd_comparison.output['consecutive_exons'] == best_exon_count \
            and gpd_comparison.output['overlap_length'] == best_overlap \
            and harmonic_mean(gpd_comparison.output['overlap_fractions']) > best_frac):
              best_exon_count = gpd_comparison.output['consecutive_exons']
              best_overlap = gpd_comparison.output['overlap_length']
              best_line = double_line
              best_frac = harmonic_mean(gpd_comparison.output['overlap_fractions'])
      # Allow partial matches
      else:          
        gpd_comparison.compare(eA,eB)
        if gpd_comparison.output['partial_match']:
          # if we require a to be subset of b
          if args.allow_a_subset_of_b_fragments \
          and not (eA.get_exon_count() < eB.get_exon_count() \
          and eA.get_exon_count() == gpd_comparison.output['consecutive_exons']):
            break
          a_unique = False
          if args.output_a_not_in_b:
            break
            # only do the best
          if not args.best_b_only:
            ostring += double_line
          else:
            if gpd_comparison.output['consecutive_exons'] > best_exon_count \
            or (gpd_comparison.output['consecutive_exons'] == best_exon_count \
            and gpd_comparison.output['overlap_length'] > best_overlap) \
            or (gpd_comparison.output['consecutive_exons'] == best_exon_count \
            and gpd_comparison.output['overlap_length'] == best_overlap \
            and harmonic_mean(gpd_comparison.output['overlap_fractions']) > best_frac):
              best_exon_count = gpd_comparison.output['consecutive_exons']
              best_overlap = gpd_comparison.output['overlap_length']
              best_line = double_line
              best_frac = harmonic_mean(gpd_comparison.output['overlap_fractions'])
    if best_exon_count > 0 and args.best_b_only:
      ostring += best_line
    if a_unique and (args.output_a_not_in_b or args.leftouterjoin):
      ostring += GenePredBasics.entry_to_line(eA.entry)+"\n"
    sys.stdout.write(ostring)
    #oval.put(ostring)
    return
def main():
    parser = argparse.ArgumentParser(
        description='Use reference junctions when they are close',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--min_intron_size',
                        type=int,
                        default=68,
                        help="INT min intron size")
    parser.add_argument(
        '--min_local_support',
        type=int,
        default=0,
        help=
        "INT min number of junctions within search_size of a junction in order to count it"
    )
    parser.add_argument('--search_size',
                        type=int,
                        default=10,
                        help="INT search space for reference")
    parser.add_argument(
        '--output_fake_psl',
        help="FASTAFILE reference genome to make a fake PSL output")
    parser.add_argument('psl', help="PSLFILENAME or '-' for STDIN")
    parser.add_argument('reference_genepred',
                        help="FASTAFILENAME for reference genepred")
    args = parser.parse_args()

    cpus = multiprocessing.cpu_count()

    genome = {}
    if args.output_fake_psl:
        genome = read_fasta_into_hash(args.output_fake_psl)

    #read in the reference genepred first
    gpf = GenePredBasics.GenePredFile(args.reference_genepred)
    #lets sort entries by chromosome
    ref = {}
    for e in [x.entry for x in gpf.entries]:
        if len(e['exonStarts']) <= 1: continue
        if e['chrom'] not in ref:
            ref[e['chrom']] = {}
        for i in range(1, len(e['exonStarts'])):
            if e['exonEnds'][i - 1] not in ref[e['chrom']]:
                ref[e['chrom']][e['exonEnds'][i - 1]] = {}
            if e['exonStarts'][i] + 1 not in ref[e['chrom']][e['exonEnds'][i -
                                                                           1]]:
                ref[e['chrom']][e['exonEnds'][i - 1]][e['exonStarts'][i] +
                                                      1] = e['strand']
    #Stored all junctions as 1-base

    read_info = {}
    pf = GenericFileReader(args.psl)
    fcount_total = 0
    while True:
        line = pf.readline()
        if not line: break
        if re.match('^#', line): continue
        line = line.rstrip()
        pe = PSLBasics.line_to_entry(line)
        if len(pe['tStarts']) != len(pe['blockSizes']) or len(
                pe['qStarts']) != len(pe['blockSizes']):
            sys.stderr.write("WARNING invalid psl\n")
            continue
        genepred_line = PSLBasics.convert_entry_to_genepred_line(pe)
        ge = GenePredBasics.smooth_gaps(
            GenePredBasics.line_to_entry(genepred_line), args.min_intron_size)
        refjuns = {}
        if pe['tName'] in ref: refjuns = ref[pe['tName']]
        new_ge = nudge(pe, ge, refjuns, args)
        if args.output_fake_psl:
            new_psl_line = GenePredBasics.entry_to_fake_psl_line(
                new_ge, genome)
            print new_psl_line
        else:
            print GenePredBasics.entry_to_line(new_ge)
def main():
    parser = argparse.ArgumentParser(
        description=
        "Make a universal genepred and key for comparing IDP results")
    parser.add_argument(
        '--output_directory',
        default='IDP_output_merge',
        help='DIRECTORY to write output to.  Will not overwrite existing')
    parser.add_argument(
        'genepred_exp_name_sets',
        nargs='+',
        help=
        "three files for each IDP entry 1) a genepred 2) an expression file 3) a sample name."
    )
    args = parser.parse_args()

    mydir = args.output_directory.rstrip('/')
    if os.path.isdir(mydir):
        sys.stderr.write("ERROR: output directory " + mydir +
                         " already exists\n")
        return
    os.makedirs(mydir)

    set_args = args.genepred_exp_name_sets
    if len(set_args) % 3 != 0:
        sys.stderr.write("Data must be in sets of three")
    setnum = 0
    resultnumber = 0
    numbers = {}
    byset = {}
    chromosomes = set()
    established_names = {}
    expression = {}
    sample_names = set()
    while len(set_args) > 0:
        setnum += 1
        gpd = set_args.pop(0)
        exp = set_args.pop(0)
        sample_name = set_args.pop(0)
        sample_names.add(sample_name)
        sys.stderr.write("Set: " + str(setnum) + "\n")
        sys.stderr.write("  GenePred: " + gpd + "\n")
        sys.stderr.write("  Expression: " + exp + "\n")
        sys.stderr.write("  Sample: " + sample_name + "\n")

        with open(gpd) as inf:
            for line in inf:
                if re.match('^#', line): continue
                e = GenePredBasics.GenePredEntry()
                e.line_to_entry(line)
                chromosomes.add(e.entry['chrom'])
                junctions = e.junctions
                resultnumber += 1
                junstring = ";".join(junctions)
                if junstring not in byset:
                    byset[junstring] = set()
                byset[junstring].add(resultnumber)
                numbers[resultnumber] = [sample_name, e.entry['name'], e]
        with open(exp) as inf:
            for line in inf:
                f = line.rstrip().split("\t")
                if sample_name not in expression:
                    expression[sample_name] = {}
                expression[sample_name][f[0]] = [float(
                    f[1]), float(f[2])]  #transcript and gene expression

    #bysample = {}
    gene_records = {}
    for junc in byset:
        lowest = False
        highest = False
        realnames = set()
        realgenenames = set()
        chromnames = set()
        chromgenenames = set()
        arbitrary_gpd = False
        sgpds = {}
        for i in byset[junc]:
            [sample, name, gpd] = numbers[i]
            gene_name = gpd.entry['gene_name']
            arbitrary_gpd = gpd
            sgpds[sample] = gpd
            # Figure out if it's a reference transcript name or an IDP-manufactured name
            m = re.match('^([^:]+):\d+-\d+', name)
            if not m:
                realnames.add(name)
            else:
                chromnames.add(m.group(1))

            # Figure out if it's a reference gene name or an IDP-manufactured gene name
            m = re.match('^([^:]+):\d+-\d+', gene_name)
            if not m:
                realgenenames.add(gene_name)
            else:
                chromgenenames.add(m.group(1))

            if not lowest or gpd.entry['txStart'] < lowest:
                lowest = gpd.entry['txStart']
            if not highest or gpd.entry['txEnd'] > highest:
                highest = gpd.entry['txEnd']
            #if sample not in bysample:
            #  bysample[sample] = {}
            #if name not in bysample[sample]:
            #  bysample[sample][name] = i
        usename = False
        basename = False
        if len(realnames) > 0:
            usename = next(iter(realnames))
            if len(realnames) > 1:
                sys.stderr.write(
                    "WARNING: multiple transcript names as with the same junctions.\n"
                    + str(realnames) + "\nUsing: " + str(usename) + "\n")
            if usename in established_names:
                sys.stderr.write(
                    "WARNING: reference transcript name " + usename +
                    " refers to different transcripts with different junction compositions.  Renaming the second instance to a unique name."
                )
                established_names[usename] += 1
                usename = usename + '.' + str(established_names[usename])
            else:
                established_names[usename] = 0
        else:
            usechrom = next(iter(chromnames))
            if len(chromnames) > 1:
                sys.stderr.write(
                    "ERROR: multiple chromosome names are not supported in a single transcript yet.\n"
                    + str(chromnames) + "\n")
                sys.exit()
            basename = usechrom + ":" + str(lowest) + '-' + str(highest)
            if basename not in established_names:
                established_names[basename] = 0
            established_names[basename] += 1
            usename = basename + '.' + str(established_names[basename])
        # See if we have a real gene name for base name
        if len(realgenenames) > 0:
            basename = next(iter(realgenenames))
        #print basename + "\t" + usename
        if basename not in gene_records:
            gene_records[basename] = {}
        gene_records[basename][usename] = {}
        gene_records[basename][usename]['sample_gpd'] = {}
        gene_records[basename][usename]['sample_exp'] = {}
        gene_records[basename][usename]['gpd'] = GenePredBasics.GenePredEntry()
        # copy the old record
        gene_records[basename][usename]['gpd'].line_to_entry(
            arbitrary_gpd.get_line())
        if lowest < gene_records[basename][usename]['gpd'].entry['txStart']:
            sys.stderr.write("ADJUSTING NEW GPD TXSTART FOR " + basename +
                             " " + usename + "\n")
            gene_records[basename][usename]['gpd'].entry['txStart'] = lowest
            gene_records[basename][usename]['gpd'].entry['cdsStart'] = lowest
            gene_records[basename][usename]['gpd'].entry['exonStarts'][
                0] = lowest

        if highest > gene_records[basename][usename]['gpd'].entry['txEnd']:
            sys.stderr.write("ADJUSTING NEW GPD TXEND FOR " + basename + " " +
                             usename + "\n")
            gene_records[basename][usename]['gpd'].entry['txEnd'] = highest
            gene_records[basename][usename]['gpd'].entry['cdsEnd'] = highest
            gene_records[basename][usename]['gpd'].entry['exonEnds'][
                len(gene_records[basename][usename]['gpd'].entry['exonEnds']) -
                1] = highest
        # Now add the original sample information
        for sample in sgpds:
            gene_records[basename][usename]['sample_gpd'][sample] = sgpds[
                sample]
            gene_records[basename][usename]['sample_exp'][sample] = expression[
                sample][sgpds[sample].entry['name']][0]

    #Now all necessary data should be in gene_records
    sample_list = sorted(list(sample_names))
    ofgene = open(mydir + '/gene.exp', 'w')
    ofgene.write("gene")
    for sample in sample_list:
        ofgene.write("\t" + sample)
    ofgene.write("\n")
    geneexp = {}
    for gene in gene_records:
        total = {}
        for sample in sample_list:
            total[sample] = 0
        geneexp[gene] = {}
        for transcript in gene_records[gene]:
            for sample in gene_records[gene][transcript]['sample_exp']:
                total[sample] += gene_records[gene][transcript]['sample_exp'][
                    sample]
        ofgene.write(gene)
        for sample in sample_list:
            geneexp[gene][sample] = total[sample]
            ofgene.write("\t" + str(total[sample]))
        ofgene.write("\n")
    ofgene.close()

    #Now we can do all the transcript writing
    ofgeneiso = open(mydir + '/gene_isoform.exp', 'w')
    ofgeneiso.write("gene\tisoform")
    for sample in sample_list:
        ofgeneiso.write("\t" + sample + ".gene" + "\t" + sample + ".isoform")
    ofgeneiso.write("\n")
    ofiso = open(mydir + '/isoform.exp', 'w')
    ofiso.write("isoform")
    for sample in sample_list:
        ofiso.write("\t" + sample)
    ofiso.write("\n")
    for gene in gene_records:
        for transcript in gene_records[gene]:
            ofiso.write(transcript)
            ofgeneiso.write(gene + "\t" + transcript)
            for sample in sample_list:
                if sample in gene_records[gene][transcript]['sample_exp']:
                    ofgeneiso.write("\t" + str(geneexp[gene][sample]) + "\t" +
                                    str(gene_records[gene][transcript]
                                        ['sample_exp'][sample]))
                    ofiso.write("\t" + str(gene_records[gene][transcript]
                                           ['sample_exp'][sample]))
                else:
                    ofiso.write("\t0")
                    if sample in geneexp[gene]:
                        ofgeneiso.write("\t" + str(geneexp[gene][sample]) +
                                        "\t0")
                    else:
                        ofgeneiso.write("0\t0")
            ofiso.write("\n")
            ofgeneiso.write("\n")
    ofiso.close()

    #Maybe we can finish it all off by writing the new genepred
    ofgpd = open(mydir + '/isoform.gpd', 'w')
    for gene in gene_records:
        for transcript in gene_records[gene]:
            ofgpd.write(gene_records[gene][transcript]['gpd'].get_line() +
                        "\n")
    ofgpd.close()