def evaluate_junctions(fz, sr, args):
    """Build genePred fragments from junctions that have enough evidence.

    A copy of ``fz`` is made and, for each of its fuzzy junctions, the
    ``'junc'`` payload lists on the left and right sides are reset and then
    refilled from every entry of ``sr`` whose ``'fzjun'`` overlaps the
    corresponding junction of ``fz`` (within ``args.junction_tolerance``).
    Each overlapping entry contributes its first left/right ``'junc'`` value
    ``min(entry['cnt'], args.downsample)`` times.

    Junctions whose left list holds at least ``args.required_evidence``
    values are "supported".  Every maximal run of consecutive supported
    junctions is turned into one tab-delimited genePred fragment (chromosome,
    strand, tx start/end, cds start/end, exon count, exon starts, exon ends).
    The strand column is always written as ``'+'``.

    Args:
        fz: object exposing ``gpds``, ``fuzzy_junctions``, ``start``, ``end``
            and ``copy()``.
        sr: mapping whose values are dicts with ``'fzjun'`` and ``'cnt'``.
        args: object exposing ``junction_tolerance``, ``downsample`` and
            ``required_evidence``.

    Returns:
        A list of ``[fragment, source_names]`` pairs, one per run that passes
        ``GenePredEntry.is_valid()``; ``source_names`` is the list of
        ``entry['name']`` values of ``fz.gpds``.  Empty when ``fz`` has no
        fuzzy junctions.  Invalid fragments are reported on stderr and
        skipped.
    """
    source_names = [x.entry['name'] for x in fz.gpds]
    working = fz.copy()
    if len(working.fuzzy_junctions) == 0:
        return []

    # Pass 1: collect the supporting coordinates for every junction.
    for index, newjun in enumerate(working.fuzzy_junctions):
        newjun.left.get_payload()['junc'] = []
        newjun.right.get_payload()['junc'] = []
        oldjun = fz.fuzzy_junctions[index]
        for srjun in sr:
            sjun = sr[srjun]['fzjun']
            if not oldjun.overlaps(sjun, args.junction_tolerance):
                continue
            # Replicate the observation once per (downsampled) count.  A
            # throwaway loop variable is used so the junction index above is
            # never shadowed.
            for _ in range(0, min(sr[srjun]['cnt'], args.downsample)):
                newjun.left.get_payload()['junc'].append(
                    sjun.left.get_payload()['junc'][0])
                newjun.right.get_payload()['junc'].append(
                    sjun.right.get_payload()['junc'][0])

    # Pass 2: per junction, record evidence and (when supported) the
    # flanking start/end and the consensus junction coordinates.  The
    # unsupported slots get [] placeholders so indexes stay aligned.
    junctions = working.fuzzy_junctions
    last_index = len(fz.fuzzy_junctions) - 1
    juncs = []
    starts = []
    ends = []
    evidences = []
    for i in range(0, last_index + 1):
        jun = junctions[i]
        evidence = len(jun.left.get_payload()['junc'])
        evidences.append(evidence)
        if evidence < args.required_evidence:
            starts.append([])
            ends.append([])
            juncs.append([])
            continue
        # Start: transcript start for the first junction, otherwise the
        # junction's own 'start' payload if set, else the previous junction.
        if i == 0:
            starts.append(working.start.start)
        elif jun.left.get_payload()['start']:
            starts.append(jun.left.get_payload()['start'].start)
        else:
            starts.append(junctions[i - 1].right.start)
        # End: mirrored logic on the right-hand side.
        if i == last_index:
            ends.append(working.end.end)
        elif jun.right.get_payload()['end']:
            ends.append(jun.right.get_payload()['end'].end)
        else:
            ends.append(junctions[i + 1].left.end)
        bestleft = GenePredFuzzyBasics.mode(jun.left.get_payload()['junc'])
        bestright = GenePredFuzzyBasics.mode(jun.right.get_payload()['junc'])
        juncs.append([bestleft, bestright])

    # Pass 3: group consecutive supported junction indexes into runs.
    runs = []
    current_run = []
    for i, evidence in enumerate(evidences):
        if evidence >= args.required_evidence:
            current_run.append(i)
            continue
        if current_run:
            runs.append(current_run)
        current_run = []
    if current_run:
        runs.append(current_run)

    # Pass 4: one genePred fragment per run.
    parts = []
    for run in runs:
        # Exon starts are shifted by one to go back to zero-based indexing.
        sarr = [starts[run[0]] - 1] + [juncs[j][1] - 1 for j in run]
        earr = [juncs[j][0] for j in run] + [ends[run[-1]]]
        part = "\t".join([
            str(working.start.chr),
            '+',
            str(sarr[0]),
            str(earr[-1]),
            str(sarr[0]),
            str(earr[-1]),
            str(len(sarr)),
            ','.join([str(x) for x in sarr]) + ',',
            ','.join([str(x) for x in earr]) + ',',
        ])
        # Final quality check: placeholder name columns are prepended only
        # for validation; the stored fragment does not include them.
        gpd = GenePredEntry("test1\ttest1\t" + part)
        if not gpd.is_valid():
            sys.stderr.write("\nWARNING skipping invalid GPD\n" +
                             gpd.get_line() + "\n")
            continue
        parts.append([part, source_names])
    return parts
示例#2
0
def evaluate_junctions(fz, sr, args):
    """Turn well-supported fuzzy junctions into genePred fragments.

    Works on a copy of ``fz``.  For every fuzzy junction the left and right
    ``"junc"`` payload lists are emptied and refilled: each value of ``sr``
    whose ``"fzjun"`` overlaps the matching junction of ``fz`` (tolerance
    ``args.junction_tolerance``) adds its first left/right ``"junc"`` value
    ``min(value["cnt"], args.downsample)`` times.

    A junction is supported when its left list has at least
    ``args.required_evidence`` values.  Each maximal stretch of consecutive
    supported junctions becomes one tab-delimited fragment: chromosome,
    strand (always ``"+"``), start, end, start, end, exon count, exon starts
    and exon ends.

    Args:
        fz: object providing ``gpds``, ``fuzzy_junctions``, ``start``,
            ``end`` and ``copy()``.
        sr: mapping whose values are dicts with ``"fzjun"`` and ``"cnt"``.
        args: object providing ``junction_tolerance``, ``downsample`` and
            ``required_evidence``.

    Returns:
        List of ``[fragment, source_names]`` pairs for the fragments that
        ``GenePredEntry.is_valid()`` accepts, where ``source_names`` holds
        the ``entry["name"]`` of each item of ``fz.gpds``.  ``[]`` when there
        are no fuzzy junctions.  Rejected fragments are written to stderr as
        warnings.
    """
    source_names = [x.entry["name"] for x in fz.gpds]
    working = fz.copy()
    if len(working.fuzzy_junctions) == 0:
        return []

    # Gather supporting coordinates per junction.
    for position, newjun in enumerate(working.fuzzy_junctions):
        newjun.left.get_payload()["junc"] = []
        newjun.right.get_payload()["junc"] = []
        oldjun = fz.fuzzy_junctions[position]
        for srjun in sr:
            sjun = sr[srjun]["fzjun"]
            if not oldjun.overlaps(sjun, args.junction_tolerance):
                continue
            # The repeat counter is deliberately anonymous so it cannot
            # clobber the junction index of the enclosing loop.
            for _ in range(0, min(sr[srjun]["cnt"], args.downsample)):
                newjun.left.get_payload()["junc"].append(sjun.left.get_payload()["junc"][0])
                newjun.right.get_payload()["junc"].append(sjun.right.get_payload()["junc"][0])

    # Summarise every junction; unsupported ones get [] placeholders so the
    # four lists stay index-aligned with the junction list.
    junctions = working.fuzzy_junctions
    final = len(fz.fuzzy_junctions) - 1
    juncs = []
    starts = []
    ends = []
    evidences = []
    for i in range(0, final + 1):
        jun = junctions[i]
        evidence = len(jun.left.get_payload()["junc"])
        evidences.append(evidence)
        if evidence < args.required_evidence:
            starts.append([])
            ends.append([])
            juncs.append([])
            continue
        # Left boundary: transcript start, else this junction's "start"
        # payload when set, else the previous junction's right side.
        if i == 0:
            starts.append(working.start.start)
        elif jun.left.get_payload()["start"]:
            starts.append(jun.left.get_payload()["start"].start)
        else:
            starts.append(junctions[i - 1].right.start)
        # Right boundary: same idea, mirrored.
        if i == final:
            ends.append(working.end.end)
        elif jun.right.get_payload()["end"]:
            ends.append(jun.right.get_payload()["end"].end)
        else:
            ends.append(junctions[i + 1].left.end)
        bestleft = GenePredFuzzyBasics.mode(jun.left.get_payload()["junc"])
        bestright = GenePredFuzzyBasics.mode(jun.right.get_payload()["junc"])
        juncs.append([bestleft, bestright])

    # Split the junction indexes into runs of consecutive supported ones.
    runs = []
    current_run = []
    for i, evidence in enumerate(evidences):
        if evidence >= args.required_evidence:
            current_run.append(i)
            continue
        if current_run:
            runs.append(current_run)
        current_run = []
    if current_run:
        runs.append(current_run)

    # Emit one fragment per run.
    parts = []
    for run in runs:
        # Starts are moved back to zero-based coordinates.
        sarr = [starts[run[0]] - 1] + [juncs[j][1] - 1 for j in run]
        earr = [juncs[j][0] for j in run] + [ends[run[-1]]]
        columns = [
            str(working.start.chr),
            "+",
            str(sarr[0]),
            str(earr[-1]),
            str(sarr[0]),
            str(earr[-1]),
            str(len(sarr)),
            ",".join([str(x) for x in sarr]) + ",",
            ",".join([str(x) for x in earr]) + ",",
        ]
        part = "\t".join(columns)
        # Validate with two placeholder name columns in front; only the
        # fragment without them is returned.
        gpd = GenePredEntry("test1\ttest1\t" + part)
        if not gpd.is_valid():
            sys.stderr.write("\nWARNING skipping invalid GPD\n" + gpd.get_line() + "\n")
            continue
        parts.append([part, source_names])
    return parts
示例#3
0
def main():
    """Remove positional duplicates and uniquely rename a GenePred file.

    Command-line entry point.  Reads GenePred lines from the input file (or
    STDIN when the input is ``-``), skipping lines that start with ``#``, and
    then:

    1. Groups entries by chromosome, exon starts, exon ends and strand.
       Unless ``--keep_positional_duplicates`` is given, only one entry per
       group is kept: the first one whose gene name has the most transcripts
       in the input.
    2. Unless ``--keep_transcript_names`` is given, transcript names shared
       by several kept entries get a ``[i/n]`` suffix.
    3. Unless ``--keep_gene_names`` is given, a gene name whose entries fall
       into more than one locus (as computed by ``Loci`` with
       ``--minimum_locus_distance``) gets a ``[i/n]`` suffix per locus.
    4. Writes the resulting entries ordered by gene name to the output file
       or STDOUT.

    Progress messages go to STDERR.  When an output file is given, a report
    file is written next to it for every filter that was active:
    ``.positional_duplicate_omissions``, ``.renamed_transcripts`` and
    ``.renamed_genes``.
    """
    parser = argparse.ArgumentParser(
        description=
        "Rename gene and transcript elements of GenePred file that are redundant.  Please specify an output if you would like report files generated for the filters."
    )
    parser.add_argument('input', help="GENEPREDFILE or '-' for STDIN")
    parser.add_argument(
        '-o',
        '--output',
        help=
        "OUTPUT FILE default is STDOUT, but you need to specify an output file to get report files generated"
    )
    parser.add_argument(
        '--minimum_locus_distance',
        type=int,
        default=500000,
        help="Genes with the same name will be renamed if this far apart")
    parser.add_argument(
        '--keep_positional_duplicates',
        action='store_true',
        help="By default we remove one of the duplicate entries")
    parser.add_argument(
        '--keep_transcript_names',
        action='store_true',
        help="By default we rename duplicated transcript names")
    parser.add_argument(
        '--keep_gene_names',
        action='store_true',
        help="By default we rename genes located at different loci.")
    args = parser.parse_args()

    inf = sys.stdin if args.input == '-' else open(args.input)
    of = open(args.output, 'w') if args.output else sys.stdout

    # Catalog every transcript by its exact location, and every gene name by
    # the transcript names that carry it.
    txdef = {}
    gfams = {}
    for line in inf:
        if line.startswith('#'):
            continue
        g = GenePredEntry(line)
        loc = (g.value('chrom') + ':' +
               ','.join([str(x) for x in g.value('exonStarts')]) + '-' +
               ','.join([str(x) for x in g.value('exonEnds')]) + '/' +
               g.value('strand'))
        txdef.setdefault(loc, []).append(g)
        gfams.setdefault(g.value('gene_name'), []).append(g.value('name'))

    # Pick one entry per location unless duplicates were asked to be kept.
    omissions = []
    keepers = []
    for loc in sorted(txdef.keys()):
        entries = txdef[loc]
        if args.keep_positional_duplicates or len(entries) == 1:
            keepers.extend(entries)
            continue
        sys.stderr.write("Found " + str(len(entries)) +
                         " entries at location\n")
        sys.stderr.write(loc + "\n")
        sys.stderr.write("They are:\n")
        largest = 0
        keepgene = None
        keepindex = -1
        for i, e in enumerate(entries):
            famsize = len(gfams[e.value('gene_name')])
            sys.stderr.write("     " + e.value('gene_name') + "\t" +
                             e.value('name') + "\t" + str(famsize) + "\n")
            # Strict comparison: on ties the earliest entry wins.
            if famsize > largest:
                keepgene = e
                largest = famsize
                keepindex = i
        for j, e in enumerate(entries):
            if j == keepindex:
                keepers.append(e)
            else:
                omissions.append(e)
        sys.stderr.write("     Biggest gene family is " +
                         keepgene.value('gene_name') + " with " +
                         str(largest) + " transcripts\n")
        sys.stderr.write("     so keep that one.\n")
    sys.stderr.write("Omitting " + str(len(omissions)) +
                     " entries for redundant positions\n")
    if args.output and not args.keep_positional_duplicates:
        with open(args.output + '.positional_duplicate_omissions',
                  'w') as report:
            for g in omissions:
                report.write(g.get_line() + "\n")

    # The keepers now have unique locations; give them unique names.
    tnames = {}
    renametx = {}
    for g in keepers:
        tnames.setdefault(g.value('name'), []).append(g)
    if not args.keep_transcript_names:
        for name in tnames:
            group = tnames[name]
            nsize = len(group)
            if nsize <= 1:
                continue
            sys.stderr.write("Name: " + name + " has a family of size " +
                             str(nsize) + "\n")
            for i, g in enumerate(group):
                newname = name + '[' + str(i + 1) + '/' + str(nsize) + ']'
                renametx[newname] = name
                g.entry['name'] = newname
    sys.stderr.write("Renamed: " + str(len(renametx)) + " transcripts\n")
    if args.output and not args.keep_transcript_names:
        with open(args.output + '.renamed_transcripts', 'w') as report:
            for name in sorted(renametx.keys()):
                report.write(name + "\t" + renametx[name] + "\n")

    # Arrange the entries into gene families.
    gnames = {}
    for name in tnames:
        for g in tnames[name]:
            gnames.setdefault(g.value('gene_name'), []).append(g)

    renamegene = {}
    finished = []
    for gene in gnames:
        members = gnames[gene]
        if args.keep_gene_names or len(members) == 1:
            finished.extend(members)
            continue
        # Check whether all entries of this gene really share one locus.
        loci = Loci()
        loci.set_minimum_distance(args.minimum_locus_distance)
        for g in members:
            r = g.locus_range.copy()
            r.set_payload(g)
            locus = Locus()
            locus.add_member(r)
            loci.add_locus(locus)
        loci.update_loci()
        lcount = len(loci.loci)
        if lcount == 1:
            finished.extend(members)
            continue
        # Several loci: give each locus its own numbered gene name.
        for i in range(0, lcount):
            newname = gene + '[' + str(i + 1) + '/' + str(lcount) + ']'
            rstr = loci.loci[i].range.get_range_string()
            renamegene[newname] = gene
            sys.stderr.write(newname + "\t" + rstr + "\n")
            for m in loci.loci[i].members:
                m.get_payload().entry['gene_name'] = newname
                finished.append(m.get_payload())
    sys.stderr.write("Renamed: " + str(len(renamegene)) + " genes\n")
    # The gene report is governed by --keep_gene_names (the transcript flag
    # was checked here before, which tied this report to the wrong filter).
    if args.output and not args.keep_gene_names:
        with open(args.output + '.renamed_genes', 'w') as report:
            for name in sorted(renamegene.keys()):
                report.write(name + "\t" + renamegene[name] + "\n")

    # Write everything out ordered by gene name.
    bygene = {}
    for g in finished:
        bygene.setdefault(g.value('gene_name'), []).append(g)
    for gene in sorted(bygene.keys()):
        for g in bygene[gene]:
            of.write(g.get_line() + "\n")

    # Only close handles this function opened; leave the process-wide
    # standard streams alone.
    if of is not sys.stdout:
        of.close()
    if inf is not sys.stdin:
        inf.close()
def main():
  """Deduplicate a GenePred file by position and make its names unique.

  Command-line entry point.  GenePred lines are read from the input file, or
  from STDIN when the input is ``-``; lines starting with ``#`` are skipped.
  The entries then pass through three filters, each of which can be turned
  off with its ``--keep_*`` flag:

  * positional duplicates (same chromosome, exon starts, exon ends and
    strand) are reduced to the first entry whose gene name has the most
    transcripts in the input;
  * transcript names used by several remaining entries get a ``[i/n]``
    suffix;
  * gene names whose entries form more than one locus (per ``Loci`` with
    ``--minimum_locus_distance``) get a ``[i/n]`` suffix per locus.

  The result is written ordered by gene name to the output file or STDOUT,
  with progress messages on STDERR.  When an output file is given, each
  active filter also writes a report next to it
  (``.positional_duplicate_omissions``, ``.renamed_transcripts``,
  ``.renamed_genes``).
  """
  parser = argparse.ArgumentParser(description="Rename gene and transcript elements of GenePred file that are redundant.  Please specify an output if you would like report files generated for the filters.")
  parser.add_argument('input',help="GENEPREDFILE or '-' for STDIN")
  parser.add_argument('-o','--output',help="OUTPUT FILE default is STDOUT, but you need to specify an output file to get report files generated")
  parser.add_argument('--minimum_locus_distance',type=int,default=500000,help="Genes with the same name will be renamed if this far apart")
  parser.add_argument('--keep_positional_duplicates',action='store_true',help="By default we remove one of the duplicate entries")
  parser.add_argument('--keep_transcript_names',action='store_true',help="By default we rename duplicated transcript names")
  parser.add_argument('--keep_gene_names',action='store_true',help="By default we rename genes located at different loci.")
  args = parser.parse_args()

  inf = sys.stdin if args.input == '-' else open(args.input)
  of = open(args.output,'w') if args.output else sys.stdout

  # Index transcripts by exact location and gene names by transcript names.
  txdef = {}
  gfams = {}
  for line in inf:
    if line.startswith('#'): continue
    g = GenePredEntry(line)
    exon_starts = ','.join([str(x) for x in g.value('exonStarts')])
    exon_ends = ','.join([str(x) for x in g.value('exonEnds')])
    loc = g.value('chrom') + ':' + exon_starts + '-' + exon_ends + '/' + g.value('strand')
    txdef.setdefault(loc, []).append(g)
    gfams.setdefault(g.value('gene_name'), []).append(g.value('name'))

  # Keep a single entry per location unless duplicates were requested.
  omissions = []
  keepers = []
  for loc in sorted(txdef.keys()):
    entries = txdef[loc]
    if args.keep_positional_duplicates or len(entries) == 1:
      keepers.extend(entries)
      continue
    sys.stderr.write("Found "+str(len(entries))+" entries at location\n")
    sys.stderr.write(loc +"\n")
    sys.stderr.write("They are:\n")
    largest = 0
    keepgene = None
    keepindex = -1
    for i, e in enumerate(entries):
      famsize = len(gfams[e.value('gene_name')])
      sys.stderr.write("     "+e.value('gene_name')+"\t"+e.value('name')+"\t"+str(famsize)+"\n")
      # Strictly greater, so the earliest entry wins a tie.
      if famsize > largest:
        keepgene = e
        largest = famsize
        keepindex = i
    for j, e in enumerate(entries):
      if j == keepindex:
        keepers.append(e)
      else:
        omissions.append(e)
    sys.stderr.write("     Biggest gene family is "+keepgene.value('gene_name')+" with "+str(largest)+" transcripts\n")
    sys.stderr.write("     so keep that one.\n")
  sys.stderr.write("Omitting "+str(len(omissions))+" entries for redundant positions\n")
  if args.output and not args.keep_positional_duplicates:
    with open(args.output+'.positional_duplicate_omissions','w') as report:
      for g in omissions:
        report.write(g.get_line()+"\n")

  # Locations are unique now; make the transcript names unique as well.
  tnames = {}
  renametx = {}
  for g in keepers:
    tnames.setdefault(g.value('name'), []).append(g)
  if not args.keep_transcript_names:
    for name in tnames:
      group = tnames[name]
      nsize = len(group)
      if nsize <= 1: continue
      sys.stderr.write("Name: "+name+" has a family of size "+str(nsize)+"\n")
      for i, g in enumerate(group):
        newname = name+'['+str(i+1)+'/'+str(nsize)+']'
        renametx[newname] = name
        g.entry['name'] = newname
  sys.stderr.write("Renamed: "+str(len(renametx))+" transcripts\n")
  if args.output and not args.keep_transcript_names:
    with open(args.output+'.renamed_transcripts','w') as report:
      for name in sorted(renametx.keys()):
        report.write(name+"\t"+renametx[name]+"\n")

  # Regroup the entries by gene name.
  gnames = {}
  for name in tnames:
    for g in tnames[name]:
      gnames.setdefault(g.value('gene_name'), []).append(g)

  renamegene = {}
  finished = []
  for gene in gnames:
    members = gnames[gene]
    if args.keep_gene_names or len(members) == 1:
      finished.extend(members)
      continue
    # Verify that the entries of this gene really sit on one locus.
    loci = Loci()
    loci.set_minimum_distance(args.minimum_locus_distance)
    for g in members:
      r = g.locus_range.copy()
      r.set_payload(g)
      locus = Locus()
      locus.add_member(r)
      loci.add_locus(locus)
    loci.update_loci()
    lcount = len(loci.loci)
    if lcount == 1:
      finished.extend(members)
      continue
    # More than one locus: number the gene name per locus.
    for i in range(0,lcount):
      newname = gene+'['+str(i+1)+'/'+str(lcount)+']'
      rstr = loci.loci[i].range.get_range_string()
      renamegene[newname] = gene
      sys.stderr.write(newname+"\t"+rstr+"\n")
      for m in loci.loci[i].members:
        m.get_payload().entry['gene_name'] = newname
        finished.append(m.get_payload())
  sys.stderr.write("Renamed: "+str(len(renamegene))+" genes\n")
  # This report belongs to the gene filter, so it is gated on
  # --keep_gene_names (the transcript flag used to be tested here).
  if args.output and not args.keep_gene_names:
    with open(args.output+'.renamed_genes','w') as report:
      for name in sorted(renamegene.keys()):
        report.write(name+"\t"+renamegene[name]+"\n")

  # Emit the entries ordered by gene name.
  bygene = {}
  for g in finished:
    bygene.setdefault(g.value('gene_name'), []).append(g)
  for gene in sorted(bygene.keys()):
    for g in bygene[gene]:
      of.write(g.get_line()+"\n")

  # Close only what was opened here; the standard streams stay open.
  if of is not sys.stdout:
    of.close()
  if inf is not sys.stdin:
    inf.close()