def main():
    """Command-line entry point: adjust PSL junctions using a reference genepred.

    Parses the command-line options, indexes every junction of the reference
    genepred by chromosome, then reads the PSL input line by line. Each valid
    PSL entry is converted to a genepred entry, passed through
    ``GenePredBasics.smooth_gaps`` with ``--min_intron_size`` and handed to
    ``nudge`` together with the reference junctions of its target sequence.
    The result is printed to stdout as a genepred line, or as a fake PSL line
    when ``--output_fake_psl`` is given.

    PSL lines starting with ``#`` are skipped. Entries whose ``tStarts`` or
    ``qStarts`` count differs from the ``blockSizes`` count are reported on
    stderr and skipped.
    """
    parser = argparse.ArgumentParser(
        description='Use reference junctions when they are close',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--min_intron_size',
                        type=int,
                        default=68,
                        help="INT min intron size")
    parser.add_argument(
        '--min_local_support',
        type=int,
        default=0,
        help="INT min number of junctions within search_size of a junction in order to count it")
    parser.add_argument('--search_size',
                        type=int,
                        default=10,
                        help="INT search space for reference")
    parser.add_argument(
        '--output_fake_psl',
        help="FASTAFILE reference genome to make a fake PSL output")
    parser.add_argument('psl', help="PSLFILENAME or '-' for STDIN")
    # This positional is read with GenePredBasics.GenePredFile below, so it is
    # a genepred file (the previous help text called it a FASTA file).
    parser.add_argument('reference_genepred',
                        help="GENEPREDFILENAME for reference genepred")
    args = parser.parse_args()

    # The genome sequence is only loaded when fake PSL output was requested.
    genome = {}
    if args.output_fake_psl:
        genome = read_fasta_into_hash(args.output_fake_psl)

    # Read in the reference genepred first and index its junctions as
    # ref[chrom][exonEnds[i - 1]][exonStarts[i] + 1] = strand.
    # The "+ 1" on the exon start stores the junction as 1-base.
    gpf = GenePredBasics.GenePredFile(args.reference_genepred)
    ref = {}
    for gpd in gpf.entries:
        entry = gpd.entry
        exon_count = len(entry['exonStarts'])
        if exon_count <= 1:
            continue  # single-exon entries contribute no junctions
        chrom_juns = ref.setdefault(entry['chrom'], {})
        for i in range(1, exon_count):
            left = entry['exonEnds'][i - 1]
            right = entry['exonStarts'][i] + 1
            # setdefault keeps the strand of the first entry that defined
            # this junction, exactly like a "not in" guard would.
            chrom_juns.setdefault(left, {}).setdefault(right, entry['strand'])

    pf = GenericFileReader(args.psl)
    while True:
        line = pf.readline()
        if not line:
            break
        if line.startswith('#'):
            continue
        pe = PSLBasics.line_to_entry(line.rstrip())
        block_count = len(pe['blockSizes'])
        if len(pe['tStarts']) != block_count or len(
                pe['qStarts']) != block_count:
            sys.stderr.write("WARNING invalid psl\n")
            continue
        genepred_line = PSLBasics.convert_entry_to_genepred_line(pe)
        ge = GenePredBasics.smooth_gaps(
            GenePredBasics.line_to_entry(genepred_line), args.min_intron_size)
        # Targets absent from the reference get an empty junction table.
        refjuns = ref.get(pe['tName'], {})
        new_ge = nudge(pe, ge, refjuns, args)
        # Single-argument print(...) behaves the same as the print statement
        # under Python 2 and is also valid Python 3.
        if args.output_fake_psl:
            print(GenePredBasics.entry_to_fake_psl_line(new_ge, genome))
        else:
            print(GenePredBasics.entry_to_line(new_ge))
def main():
    """Command-line entry point: compare genepred file A against genepred file B.

    Parses the command-line options, resolves the reciprocal-overlap
    thresholds into a three-element list ``[first, internal, last]``, loads
    both genepred files and calls ``check_B_entries`` once for every entry
    of A.

    Threshold resolution: all three default to 0; ``--minoverlap`` sets all
    three at once; ``--minoverlap_first``, ``--minoverlap_internal`` and
    ``--minoverlap_last`` then override their individual slot.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('a', nargs=1, help='FILENAME genepred file A')
    parser.add_argument('b', nargs=1, help='FILENAME genepred file B')
    parser.add_argument('--minexoncount',
                        nargs='?',
                        help='INT the minimum number of exons required.')
    parser.add_argument(
        '--minoverlap_internal',
        nargs='?',
        help='FLOAT the fraction (0-1) of the required reciprocal overlap of an internal exon to call an exon a match.')
    parser.add_argument(
        '--minoverlap_first',
        nargs='?',
        help='FLOAT the fraction (0-1) of the required reciprocal overlap of the first exon to call an exon a match.')
    parser.add_argument(
        '--minoverlap_last',
        nargs='?',
        help='FLOAT the fraction (0-1) of the required reciprocal overlap of the last exon to call an exon a match.')
    parser.add_argument(
        '--minoverlap',
        nargs='?',
        help='FLOAT the fraction (0-1) of the required reciprocal overlap of any exon to call an exon a match.')
    parser.add_argument(
        '--leftouterjoin',
        action='store_true',
        help='Output the entry A regardless of whether a matching entry in B is found')
    parser.add_argument('--output_a_not_in_b',
                        action='store_true',
                        help='Output entries that occur in A but not B')
    parser.add_argument(
        '--best_b_only',
        action='store_true',
        help='Output only one entry of B for each A and try to pick the best based on reciprocal overlap')
    parser.add_argument(
        '--allow_a_subset_of_b_fragments',
        action='store_true',
        help='If A is just a subset of B, then call it as a match. This means all exons of A found a conecutive match, but B could have more exons on either end.')
    parser.add_argument(
        '--allow_any_fragments',
        action='store_true',
        help='If set, allow any partial match, not just the best')
    args = parser.parse_args()

    # Go through the contingencies of overlap requirements and set them.
    # Slots are [first exon, internal exons, last exon].
    overlap = [0, 0, 0]
    if args.minoverlap:
        overlap = [float(args.minoverlap)] * 3
    if args.minoverlap_first:
        # Fixed: this previously read args.minoverlap_last, which ignored the
        # requested value and raised TypeError when --minoverlap_last was
        # not supplied.
        overlap[0] = float(args.minoverlap_first)
    if args.minoverlap_last:
        overlap[2] = float(args.minoverlap_last)
    if args.minoverlap_internal:
        overlap[1] = float(args.minoverlap_internal)

    # Read the genepred files ('a' and 'b' use nargs=1, hence the [0]).
    gpdA = GenePredBasics.GenePredFile(args.a[0])
    gpdB = GenePredBasics.GenePredFile(args.b[0])

    # Entries of A are processed serially, one call per entry.
    for eA in gpdA.entries:
        check_B_entries(eA, gpdB, overlap, args)