def main():
  """Command-line entry point: adjust PSL alignments using reference junctions.

  Steps, in order:
    1. Parse the command line (a PSL input and a reference genepred file).
    2. If --output_fake_psl is given, load that FASTA via read_fasta_into_hash.
    3. Index the reference junctions as
       ref[chrom][exon_end][next_exon_start + 1] = strand.
    4. For every non-comment PSL line: convert it to a genepred entry, smooth
       gaps using --min_intron_size, hand it to nudge(), and print the result
       either as a fake PSL line or as a genepred line.

  Returns None; output goes to stdout and warnings to stderr.
  """
  parser = argparse.ArgumentParser(description='Use reference junctions when they are close',formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('--min_intron_size',type=int,default=68,help="INT min intron size")
  parser.add_argument('--min_local_support',type=int,default=0,help="INT min number of junctions within search_size of a junction in order to count it")
  parser.add_argument('--search_size',type=int,default=10,help="INT search space for reference")
  parser.add_argument('--output_fake_psl',help="FASTAFILE reference genome to make a fake PSL output")
  parser.add_argument('psl',help="PSLFILENAME or '-' for STDIN")
  parser.add_argument('reference_genepred',help="FASTAFILENAME for reference genepred")
  args = parser.parse_args()

  # The genome is only needed when a fake PSL is requested.
  genome = {}
  if args.output_fake_psl:
    genome = read_fasta_into_hash(args.output_fake_psl)

  # Read in the reference genepred first and index its junctions by chromosome.
  gpf = GenePredBasics.GenePredFile(args.reference_genepred)
  ref = {}
  for gp in gpf.entries:
    e = gp.entry
    if len(e['exonStarts']) <= 1: continue  # single-exon entries have no junctions
    chrom_junctions = ref.setdefault(e['chrom'], {})
    for i in range(1,len(e['exonStarts'])):
      # setdefault keeps the strand of the first entry that defined a junction.
      chrom_junctions.setdefault(e['exonEnds'][i-1], {}).setdefault(e['exonStarts'][i]+1, e['strand'])
  # Stored all junctions as 1-base

  pf = GenericFileReader(args.psl)
  try:
    while True:
      line = pf.readline()
      if not line: break
      if line.startswith('#'): continue  # skip comment lines
      line = line.rstrip()
      pe = PSLBasics.line_to_entry(line)
      # Every block needs a target start, a query start and a size.
      if len(pe['tStarts']) != len(pe['blockSizes']) or len(pe['qStarts']) != len(pe['blockSizes']):
        sys.stderr.write("WARNING invalid psl\n")
        continue
      genepred_line = PSLBasics.convert_entry_to_genepred_line(pe)
      ge = GenePredBasics.smooth_gaps(GenePredBasics.line_to_entry(genepred_line),args.min_intron_size)
      # Chromosomes absent from the reference get an empty junction table.
      refjuns = ref.get(pe['tName'], {})
      new_ge = nudge(pe,ge,refjuns,args)
      # A parenthesized single-argument print behaves the same on Python 2 and 3.
      if args.output_fake_psl:
        print(GenePredBasics.entry_to_fake_psl_line(new_ge,genome))
      else:
        print(GenePredBasics.entry_to_line(new_ge))
  finally:
    # Release the reader even if a malformed line raises part-way through.
    pf.close()
# Example #2
# 0
def main():
  """Command-line entry point: smooth small gaps in a genepred file.

  Reads genepred lines from the named file (or STDIN when the argument is
  '-'), passes each parsed entry through GenePredBasics.smooth_gaps with
  --smoothing_size, and prints the resulting genepred line to stdout.

  Returns None.
  """
  parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
  parser.add_argument('genepred',help="FILENAME or use - for STDIN")
  parser.add_argument('--smoothing_size',type=int,default=68,help="INT no gaps less than this size")
  args = parser.parse_args()
  inf = sys.stdin
  if args.genepred != '-':
    inf = open(args.genepred)
  try:
    for line in inf:
      e = GenePredBasics.line_to_entry(line)
      e2 = GenePredBasics.smooth_gaps(e,args.smoothing_size)
      # A parenthesized single-argument print behaves the same on Python 2 and 3.
      print(GenePredBasics.entry_to_line(e2))
  finally:
    # Close only what this function opened; STDIN is left alone.
    if inf is not sys.stdin:
      inf.close()
# Example #3
# 0
def parse_gpdfile(tdir,gpdfile,smoothing_factor):
  """Smooth and renumber long-read genepred entries into tdir/longreads.gpd.

  tdir             -- directory in which 'longreads.gpd' is created
  gpdfile          -- input genepred path, or '-' to read from STDIN
  smoothing_factor -- passed through to GenePredBasics.smooth_gaps

  Lines starting with '#' are skipped. Each remaining entry is smoothed, its
  name is replaced by its 1-based position among the non-comment lines, and
  it is written to the output file. A warning goes to stderr when an original
  read name repeats.

  Returns None.
  """
  # Go through the long reads and make a genepred
  if gpdfile != '-':
    fr = FileBasics.GenericFileReader(gpdfile)
  else:
    fr = sys.stdin
  # Names seen so far. This costs memory proportional to the number of
  # distinct names, but without recording them the warning could never fire.
  seennames = set()
  longreadnumber = 0
  try:
    with open(tdir+'/longreads.gpd','w') as of_gpd:
      while True:
        line = fr.readline()
        if not line: break
        if line.startswith('#'): #skip comments
          continue
        longreadnumber += 1
        entry = GenePredBasics.smooth_gaps(
                  GenePredBasics.line_to_entry(line.rstrip()),
                  smoothing_factor)
        readname = entry['name']
        if readname in seennames:
          sys.stderr.write("Warning: repeat name '"+readname+"'\n")
        seennames.add(readname)
        #set our first name to our bin
        entry['name'] = str(longreadnumber)
        of_gpd.write(GenePredBasics.entry_to_line(entry)+"\n")
  finally:
    # Close the input even if parsing or writing fails part-way through.
    fr.close()
# Example #4
# 0
def main():
    """Command-line entry point: smooth small gaps in a genepred file.

    Reads genepred lines from the named file (or STDIN when the argument is
    '-'), passes each parsed entry through GenePredBasics.smooth_gaps with
    --smoothing_size, and prints the resulting genepred line to stdout.

    Returns None.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('genepred', help="FILENAME or use - for STDIN")
    parser.add_argument('--smoothing_size',
                        type=int,
                        default=68,
                        help="INT no gaps less than this size")
    args = parser.parse_args()
    inf = sys.stdin
    if args.genepred != '-':
        inf = open(args.genepred)
    try:
        for line in inf:
            e = GenePredBasics.line_to_entry(line)
            e2 = GenePredBasics.smooth_gaps(e, args.smoothing_size)
            # A parenthesized single-argument print behaves the same on
            # Python 2 and 3.
            print(GenePredBasics.entry_to_line(e2))
    finally:
        # Close only what this function opened; STDIN is left alone.
        if inf is not sys.stdin:
            inf.close()
def main():
  """Command-line entry point: convert PSL lines to genepred lines.

  Reads PSL lines from the named file (or STDIN when the argument is '-'),
  converts each to a genepred line and prints it. When --fill_gaps is
  greater than zero the genepred entry is first passed through
  GenePredBasics.smooth_gaps with that size.

  Returns None.
  """
  parser = argparse.ArgumentParser(description="Convert a psl file into a target formated genepred file.")
  parser.add_argument('--fill_gaps',type=int,default=0,help="Close gaps this size or smaller.")
  parser.add_argument('input_name',help="Input PSL file, use - to indicate STDIN.")
  args = parser.parse_args()

  if args.input_name == '-':
    source = sys.stdin
  else:
    source = open(args.input_name)

  with source as infile:
    for raw_line in infile:
      out_line = PSLBasics.convert_entry_to_genepred_line(
          PSLBasics.line_to_entry(raw_line))
      if args.fill_gaps > 0:
        # Round-trip through an entry so the gaps can be smoothed.
        smoothed = GenePredBasics.smooth_gaps(
            GenePredBasics.line_to_entry(out_line), args.fill_gaps)
        out_line = GenePredBasics.entry_to_line(smoothed)
      print(out_line)
def check_B_entries(eA,gpdB,overlap,args):
    """Compare entry eA against every entry in gpdB and write matches to stdout.

    eA      -- entry wrapper; its .entry dict and get_exon_count() are used
    gpdB    -- object whose .entries are iterated
    overlap -- passed to GenePredComparison.set_overlap_requirement
    args    -- options read here: allow_a_subset_of_b_fragments,
               allow_any_fragments, output_a_not_in_b, best_b_only,
               leftouterjoin

    A matched pair is written as the two genepred lines joined by a tab. With
    best_b_only only the best-ranked pair is written. If nothing matched and
    output_a_not_in_b or leftouterjoin is set, eA's own line is written.

    Returns None.
    """
    a_unique = True
    best_exon_count = 0
    best_overlap = 0
    best_frac = 0
    best_line = ''
    pieces = []
    for eB in gpdB.entries:
        pair_line = GenePredBasics.entry_to_line(eA.entry) + "\t" + GenePredBasics.entry_to_line(eB.entry) + "\n"
        comparison = GenePredBasics.GenePredComparison()
        comparison.set_overlap_requirement(overlap)
        if eA.entry['chrom'] != eB.entry['chrom']:
            continue

        if args.allow_a_subset_of_b_fragments or args.allow_any_fragments:
            # Allow partial matches
            comparison.compare(eA,eB)
            if not comparison.output['partial_match']:
                continue
            # When A must be a subset of B, A needs fewer exons than B and all
            # of A's exons must be consecutive matches.
            # NOTE(review): this break abandons the remaining B entries, not
            # only this one -- confirm `continue` was not intended.
            if args.allow_a_subset_of_b_fragments and (
                    eA.get_exon_count() >= eB.get_exon_count()
                    or eA.get_exon_count() != comparison.output['consecutive_exons']):
                break
        else:
            # Normal mode: full-length matches only, so exon counts must agree.
            if eA.get_exon_count() != eB.get_exon_count():
                continue
            comparison.set_require_all_exons_overlap(True)
            comparison.compare(eA,eB)
            if not comparison.output['full_match']:
                continue

        a_unique = False
        if args.output_a_not_in_b:
            break  # only entries unique to A are wanted; one match settles it
        if not args.best_b_only:
            pieces.append(pair_line)  # not waiting for the best: emit every match
            continue

        # Rank by consecutive exons, then overlap length, then the harmonic
        # mean of the overlap fractions, evaluating each tie-break only when
        # the previous criteria are equal.
        result = comparison.output
        if result['consecutive_exons'] > best_exon_count:
            improved = True
        elif result['consecutive_exons'] != best_exon_count:
            improved = False
        elif result['overlap_length'] > best_overlap:
            improved = True
        elif result['overlap_length'] != best_overlap:
            improved = False
        else:
            improved = harmonic_mean(result['overlap_fractions']) > best_frac
        if improved:
            best_exon_count = result['consecutive_exons']
            best_overlap = result['overlap_length']
            best_line = pair_line
            best_frac = harmonic_mean(result['overlap_fractions'])

    if best_exon_count > 0 and args.best_b_only:
        pieces.append(best_line)
    if a_unique and (args.output_a_not_in_b or args.leftouterjoin):
        pieces.append(GenePredBasics.entry_to_line(eA.entry)+"\n")
    sys.stdout.write(''.join(pieces))
    #oval.put(ostring)
    return
def main():
    """Command-line entry point: adjust PSL alignments using reference junctions.

    Steps, in order:
      1. Parse the command line (a PSL input and a reference genepred file).
      2. If --output_fake_psl is given, load that FASTA via
         read_fasta_into_hash.
      3. Index the reference junctions as
         ref[chrom][exon_end][next_exon_start + 1] = strand.
      4. For every non-comment PSL line: convert it to a genepred entry,
         smooth gaps using --min_intron_size, hand it to nudge(), and print
         the result either as a fake PSL line or as a genepred line.

    Returns None; output goes to stdout and warnings to stderr.
    """
    parser = argparse.ArgumentParser(
        description='Use reference junctions when they are close',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--min_intron_size',
                        type=int,
                        default=68,
                        help="INT min intron size")
    parser.add_argument(
        '--min_local_support',
        type=int,
        default=0,
        help=
        "INT min number of junctions within search_size of a junction in order to count it"
    )
    parser.add_argument('--search_size',
                        type=int,
                        default=10,
                        help="INT search space for reference")
    parser.add_argument(
        '--output_fake_psl',
        help="FASTAFILE reference genome to make a fake PSL output")
    parser.add_argument('psl', help="PSLFILENAME or '-' for STDIN")
    parser.add_argument('reference_genepred',
                        help="FASTAFILENAME for reference genepred")
    args = parser.parse_args()

    # The genome is only needed when a fake PSL is requested.
    genome = {}
    if args.output_fake_psl:
        genome = read_fasta_into_hash(args.output_fake_psl)

    # Read in the reference genepred first and index junctions by chromosome.
    gpf = GenePredBasics.GenePredFile(args.reference_genepred)
    ref = {}
    for gp in gpf.entries:
        e = gp.entry
        if len(e['exonStarts']) <= 1:
            continue  # single-exon entries have no junctions
        chrom_junctions = ref.setdefault(e['chrom'], {})
        for i in range(1, len(e['exonStarts'])):
            # setdefault keeps the strand of the first entry that defined a
            # junction.
            by_end = chrom_junctions.setdefault(e['exonEnds'][i - 1], {})
            by_end.setdefault(e['exonStarts'][i] + 1, e['strand'])
    # Stored all junctions as 1-base

    pf = GenericFileReader(args.psl)
    try:
        while True:
            line = pf.readline()
            if not line:
                break
            if line.startswith('#'):
                continue  # skip comment lines
            line = line.rstrip()
            pe = PSLBasics.line_to_entry(line)
            # Every block needs a target start, a query start and a size.
            if len(pe['tStarts']) != len(pe['blockSizes']) or len(
                    pe['qStarts']) != len(pe['blockSizes']):
                sys.stderr.write("WARNING invalid psl\n")
                continue
            genepred_line = PSLBasics.convert_entry_to_genepred_line(pe)
            ge = GenePredBasics.smooth_gaps(
                GenePredBasics.line_to_entry(genepred_line),
                args.min_intron_size)
            # Chromosomes absent from the reference get an empty table.
            refjuns = ref.get(pe['tName'], {})
            new_ge = nudge(pe, ge, refjuns, args)
            # A parenthesized single-argument print behaves the same on
            # Python 2 and 3.
            if args.output_fake_psl:
                print(GenePredBasics.entry_to_fake_psl_line(new_ge, genome))
            else:
                print(GenePredBasics.entry_to_line(new_ge))
    finally:
        # Release the reader even if a malformed line raises part-way through.
        pf.close()
# Example #8
# 0
def check_B_entries(eA, gpdB, overlap, args):
    """Compare entry eA against every entry in gpdB and write matches to stdout.

    eA      -- entry wrapper; its .entry dict and get_exon_count() are used
    gpdB    -- object whose .entries are iterated
    overlap -- passed to GenePredComparison.set_overlap_requirement
    args    -- options read here: allow_a_subset_of_b_fragments,
               allow_any_fragments, output_a_not_in_b, best_b_only,
               leftouterjoin

    A matched pair is written as the two genepred lines joined by a tab. With
    best_b_only only the best-ranked pair is written. If nothing matched and
    output_a_not_in_b or leftouterjoin is set, eA's own line is written.

    Returns None.
    """
    a_unique = True
    best_exon_count = 0
    best_overlap = 0
    best_frac = 0
    best_line = ''
    chunks = []
    for eB in gpdB.entries:
        joined = GenePredBasics.entry_to_line(
            eA.entry) + "\t" + GenePredBasics.entry_to_line(eB.entry) + "\n"
        cmp_ab = GenePredBasics.GenePredComparison()
        cmp_ab.set_overlap_requirement(overlap)
        if eA.entry['chrom'] != eB.entry['chrom']:
            continue

        if args.allow_a_subset_of_b_fragments or args.allow_any_fragments:
            # Allow partial matches
            cmp_ab.compare(eA, eB)
            if not cmp_ab.output['partial_match']:
                continue
            # When A must be a subset of B, A needs fewer exons than B and all
            # of A's exons must be consecutive matches.
            # NOTE(review): this break abandons the remaining B entries, not
            # only this one -- confirm `continue` was not intended.
            if args.allow_a_subset_of_b_fragments and (
                    eA.get_exon_count() >= eB.get_exon_count()
                    or eA.get_exon_count() !=
                    cmp_ab.output['consecutive_exons']):
                break
        else:
            # Normal mode: full-length matches only, so exon counts must agree.
            if eA.get_exon_count() != eB.get_exon_count():
                continue
            cmp_ab.set_require_all_exons_overlap(True)
            cmp_ab.compare(eA, eB)
            if not cmp_ab.output['full_match']:
                continue

        a_unique = False
        if args.output_a_not_in_b:
            break  # only entries unique to A are wanted; one match settles it
        if not args.best_b_only:
            chunks.append(joined)  # not waiting for the best: emit each match
            continue

        # Rank by consecutive exons, then overlap length, then the harmonic
        # mean of the overlap fractions, evaluating each tie-break only when
        # the previous criteria are equal.
        outcome = cmp_ab.output
        if outcome['consecutive_exons'] > best_exon_count:
            is_better = True
        elif outcome['consecutive_exons'] != best_exon_count:
            is_better = False
        elif outcome['overlap_length'] > best_overlap:
            is_better = True
        elif outcome['overlap_length'] != best_overlap:
            is_better = False
        else:
            is_better = harmonic_mean(
                outcome['overlap_fractions']) > best_frac
        if is_better:
            best_exon_count = outcome['consecutive_exons']
            best_overlap = outcome['overlap_length']
            best_line = joined
            best_frac = harmonic_mean(outcome['overlap_fractions'])

    if best_exon_count > 0 and args.best_b_only:
        chunks.append(best_line)
    if a_unique and (args.output_a_not_in_b or args.leftouterjoin):
        chunks.append(GenePredBasics.entry_to_line(eA.entry) + "\n")
    sys.stdout.write(''.join(chunks))
    #oval.put(ostring)
    return