def main():
    """Command-line entry point: run ``nudge`` on every PSL alignment.

    Steps, in order:
      1. Parse the command line.
      2. If ``--output_fake_psl`` is given, load that FASTA through
         ``read_fasta_into_hash``.
      3. Load the reference genepred and index its junctions as
         ``ref[chrom][exonEnd of exon i-1][exonStart of exon i + 1] = strand``
         (the first strand seen for a junction is the one kept).
      4. For each non-comment PSL line: convert it to a genepred entry,
         smooth gaps using ``--min_intron_size``, call ``nudge`` with the
         junctions of the alignment's target sequence, and print the
         result as a fake PSL line or as a genepred line.

    PSL lines whose tStarts/qStarts/blockSizes lengths disagree are
    reported on stderr and skipped.
    """
    parser = argparse.ArgumentParser(
        description='Use reference junctions when they are close',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--min_intron_size', type=int, default=68,
                        help="INT min intron size")
    parser.add_argument('--min_local_support', type=int, default=0,
                        help="INT min number of junctions within search_size of a junction in order to count it")
    parser.add_argument('--search_size', type=int, default=10,
                        help="INT search space for reference")
    parser.add_argument('--output_fake_psl',
                        help="FASTAFILE reference genome to make a fake PSL output")
    parser.add_argument('psl', help="PSLFILENAME or '-' for STDIN")
    # NOTE(review): the help text says FASTAFILENAME but the value is handed
    # to GenePredBasics.GenePredFile below -- confirm and reword if intended.
    parser.add_argument('reference_genepred',
                        help="FASTAFILENAME for reference genepred")
    args = parser.parse_args()

    genome = {}
    if args.output_fake_psl:
        genome = read_fasta_into_hash(args.output_fake_psl)

    # Read in the reference genepred first and group junctions by chromosome.
    gpf = GenePredBasics.GenePredFile(args.reference_genepred)
    ref = {}
    for e in (x.entry for x in gpf.entries):
        starts = e['exonStarts']
        ends = e['exonEnds']
        if len(starts) <= 1:
            continue  # single-exon entries have no junctions
        chrom_juns = ref.setdefault(e['chrom'], {})
        for i in range(1, len(starts)):
            # setdefault keeps the first strand recorded for a junction.
            chrom_juns.setdefault(ends[i - 1], {}).setdefault(
                starts[i] + 1, e['strand'])
    # Stored all junctions as 1-base

    pf = GenericFileReader(args.psl)
    try:
        while True:
            line = pf.readline()
            if not line:
                break
            if line.startswith('#'):
                continue
            pe = PSLBasics.line_to_entry(line.rstrip())
            block_count = len(pe['blockSizes'])
            if len(pe['tStarts']) != block_count \
                    or len(pe['qStarts']) != block_count:
                sys.stderr.write("WARNING invalid psl\n")
                continue
            genepred_line = PSLBasics.convert_entry_to_genepred_line(pe)
            ge = GenePredBasics.smooth_gaps(
                GenePredBasics.line_to_entry(genepred_line),
                args.min_intron_size)
            refjuns = ref.get(pe['tName'], {})
            new_ge = nudge(pe, ge, refjuns, args)
            # print(x) with one argument behaves the same as the Python 2
            # statement ``print x``.
            if args.output_fake_psl:
                print(GenePredBasics.entry_to_fake_psl_line(new_ge, genome))
            else:
                print(GenePredBasics.entry_to_line(new_ge))
    finally:
        # Release the reader even if a line fails to parse.
        pf.close()
def main():
    """Command-line entry point: smooth small gaps in genepred entries.

    Reads genepred lines from the file named by the ``genepred`` argument
    (or from STDIN when it is ``-``), passes each parsed entry through
    ``GenePredBasics.smooth_gaps`` with ``--smoothing_size``, and prints the
    resulting genepred line.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('genepred', help="FILENAME or use - for STDIN")
    parser.add_argument('--smoothing_size', type=int, default=68,
                        help="INT no gaps less than this size")
    args = parser.parse_args()

    inf = sys.stdin
    if args.genepred != '-':
        inf = open(args.genepred)
    try:
        for line in inf:
            entry = GenePredBasics.line_to_entry(line)
            smoothed = GenePredBasics.smooth_gaps(entry, args.smoothing_size)
            # print(x) with one argument behaves the same as the Python 2
            # statement ``print x``.
            print(GenePredBasics.entry_to_line(smoothed))
    finally:
        # Only close what this function opened; leave STDIN alone.
        if inf is not sys.stdin:
            inf.close()
def parse_gpdfile(tdir, gpdfile, smoothing_factor):
    """Write a gap-smoothed, renumbered copy of a long-read genepred file.

    Args:
        tdir: directory in which ``longreads.gpd`` is created.
        gpdfile: path of the input genepred file, or ``'-'`` for STDIN.
        smoothing_factor: value passed to ``GenePredBasics.smooth_gaps``
            for every entry.

    Lines starting with ``#`` are skipped.  Every remaining entry is
    smoothed and its name replaced by its 1-based position among the
    non-comment lines.  A warning is written to stderr whenever an
    original read name has been seen before.
    """
    # Go through the long reads and make a genepred
    if gpdfile != '-':
        fr = FileBasics.GenericFileReader(gpdfile)
    else:
        fr = sys.stdin
    seennames = set()
    longreadnumber = 0
    with open(tdir + '/longreads.gpd', 'w') as of_gpd:
        try:
            while True:
                line = fr.readline()
                if not line:
                    break
                if line.startswith('#'):  # skip comments
                    continue
                longreadnumber += 1
                entry = GenePredBasics.smooth_gaps(
                    GenePredBasics.line_to_entry(line.rstrip()),
                    smoothing_factor)
                readname = entry['name']
                if readname in seennames:
                    sys.stderr.write("Warning: repeat name '" + readname + "'\n")
                # Remember the name; without this the check above never fires.
                seennames.add(readname)
                # set our first name to our bin
                entry['name'] = str(longreadnumber)
                of_gpd.write(GenePredBasics.entry_to_line(entry) + "\n")
        finally:
            # Close the reader even when parsing a line fails.
            fr.close()
def main():
    """Command-line entry point: smooth small gaps in genepred entries.

    Reads genepred lines from the file named by the ``genepred`` argument
    (or from STDIN when it is ``-``), passes each parsed entry through
    ``GenePredBasics.smooth_gaps`` with ``--smoothing_size``, and prints the
    resulting genepred line.
    """
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('genepred', help="FILENAME or use - for STDIN")
    parser.add_argument('--smoothing_size', type=int, default=68,
                        help="INT no gaps less than this size")
    args = parser.parse_args()

    inf = sys.stdin
    if args.genepred != '-':
        inf = open(args.genepred)
    try:
        for line in inf:
            entry = GenePredBasics.line_to_entry(line)
            smoothed = GenePredBasics.smooth_gaps(entry, args.smoothing_size)
            # print(x) with one argument behaves the same as the Python 2
            # statement ``print x``.
            print(GenePredBasics.entry_to_line(smoothed))
    finally:
        # Only close what this function opened; leave STDIN alone.
        if inf is not sys.stdin:
            inf.close()
def main():
    """Command-line entry point: print one genepred line per PSL line.

    Each input line is parsed with ``PSLBasics.line_to_entry`` and converted
    with ``PSLBasics.convert_entry_to_genepred_line``.  When ``--fill_gaps``
    is greater than zero the genepred line is re-parsed, passed through
    ``GenePredBasics.smooth_gaps`` with that value, and re-serialised before
    printing.  Input comes from the named file, or STDIN for ``-``.
    """
    parser = argparse.ArgumentParser(
        description="Convert a psl file into a target formated genepred file.")
    parser.add_argument('--fill_gaps', type=int, default=0,
                        help="Close gaps this size or smaller.")
    parser.add_argument('input_name',
                        help="Input PSL file, use - to indicate STDIN.")
    args = parser.parse_args()

    if args.input_name == '-':
        source = sys.stdin
    else:
        source = open(args.input_name)

    with source as infile:
        for raw in infile:
            out_line = PSLBasics.convert_entry_to_genepred_line(
                PSLBasics.line_to_entry(raw))
            if args.fill_gaps > 0:
                closed = GenePredBasics.smooth_gaps(
                    GenePredBasics.line_to_entry(out_line), args.fill_gaps)
                out_line = GenePredBasics.entry_to_line(closed)
            # Single-argument print(x) is equivalent to ``print x`` here.
            print(out_line)
def check_B_entries(eA, gpdB, overlap, args):
    """Compare one A entry against every B entry and write the matches.

    Args:
        eA: the A-side entry; ``eA.entry`` and ``eA.get_exon_count()`` are
            used.
        gpdB: container whose ``entries`` are the B-side entries.
        overlap: value handed to ``set_overlap_requirement`` of each
            ``GenePredBasics.GenePredComparison``.
        args: parsed options; reads ``allow_a_subset_of_b_fragments``,
            ``allow_any_fragments``, ``output_a_not_in_b``, ``best_b_only``
            and ``leftouterjoin``.

    Output is written to stdout in a single call:
      * without ``best_b_only``: one ``A<TAB>B`` line per matching B;
      * with ``best_b_only``: only the best match, ranked by consecutive
        exons, then overlap length, then harmonic mean of the overlap
        fractions (first seen wins ties);
      * the A line alone when nothing matched and ``output_a_not_in_b`` or
        ``leftouterjoin`` is set.

    When neither fragment option is set a match means ``full_match`` with
    equal exon counts and all exons required to overlap; otherwise it means
    ``partial_match``.
    """
    fragments_allowed = (args.allow_a_subset_of_b_fragments
                         or args.allow_any_fragments)
    # The A line does not depend on B, so build it once.
    line_a = GenePredBasics.entry_to_line(eA.entry)

    a_unique = True
    best_exon_count = 0
    best_overlap = 0
    best_frac = 0
    best_line = ''
    out_parts = []

    for eB in gpdB.entries:
        # Cheap filters first, before any comparison object is built.
        if eA.entry['chrom'] != eB.entry['chrom']:
            continue
        if not fragments_allowed \
                and eA.get_exon_count() != eB.get_exon_count():
            continue

        gpd_comparison = GenePredBasics.GenePredComparison()
        gpd_comparison.set_overlap_requirement(overlap)

        if fragments_allowed:
            # Allow partial matches
            gpd_comparison.compare(eA, eB)
            if not gpd_comparison.output['partial_match']:
                continue
            # if we require a to be subset of b
            if args.allow_a_subset_of_b_fragments and not (
                    eA.get_exon_count() < eB.get_exon_count()
                    and eA.get_exon_count()
                    == gpd_comparison.output['consecutive_exons']):
                # NOTE(review): this stops scanning the remaining B entries
                # rather than skipping just this one; kept as in the
                # original -- confirm whether ``continue`` was intended.
                break
        else:
            # normal is to do full length matches
            gpd_comparison.set_require_all_exons_overlap(True)
            gpd_comparison.compare(eA, eB)
            if not gpd_comparison.output['full_match']:
                continue

        a_unique = False
        if args.output_a_not_in_b:
            # Only entries unique to A are printed, so one hit is enough.
            break

        double_line = line_a + "\t" \
            + GenePredBasics.entry_to_line(eB.entry) + "\n"
        if not args.best_b_only:
            # Not waiting for the best: emit every match.
            out_parts.append(double_line)
            continue

        # Only keep the best: lexicographic comparison on
        # (consecutive exons, overlap length, harmonic mean of fractions).
        exons = gpd_comparison.output['consecutive_exons']
        length = gpd_comparison.output['overlap_length']
        if exons != best_exon_count:
            is_better = exons > best_exon_count
        elif length != best_overlap:
            is_better = length > best_overlap
        else:
            is_better = harmonic_mean(
                gpd_comparison.output['overlap_fractions']) > best_frac
        if is_better:
            best_exon_count = exons
            best_overlap = length
            best_line = double_line
            best_frac = harmonic_mean(
                gpd_comparison.output['overlap_fractions'])

    if best_exon_count > 0 and args.best_b_only:
        out_parts.append(best_line)
    if a_unique and (args.output_a_not_in_b or args.leftouterjoin):
        out_parts.append(line_a + "\n")
    sys.stdout.write(''.join(out_parts))
def main():
    """Command-line entry point: run ``nudge`` on every PSL alignment.

    Steps, in order:
      1. Parse the command line.
      2. If ``--output_fake_psl`` is given, load that FASTA through
         ``read_fasta_into_hash``.
      3. Load the reference genepred and index its junctions as
         ``ref[chrom][exonEnd of exon i-1][exonStart of exon i + 1] = strand``
         (the first strand seen for a junction is the one kept).
      4. For each non-comment PSL line: convert it to a genepred entry,
         smooth gaps using ``--min_intron_size``, call ``nudge`` with the
         junctions of the alignment's target sequence, and print the
         result as a fake PSL line or as a genepred line.

    PSL lines whose tStarts/qStarts/blockSizes lengths disagree are
    reported on stderr and skipped.
    """
    parser = argparse.ArgumentParser(
        description='Use reference junctions when they are close',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--min_intron_size', type=int, default=68,
                        help="INT min intron size")
    parser.add_argument('--min_local_support', type=int, default=0,
                        help="INT min number of junctions within search_size of a junction in order to count it")
    parser.add_argument('--search_size', type=int, default=10,
                        help="INT search space for reference")
    parser.add_argument('--output_fake_psl',
                        help="FASTAFILE reference genome to make a fake PSL output")
    parser.add_argument('psl', help="PSLFILENAME or '-' for STDIN")
    # NOTE(review): the help text says FASTAFILENAME but the value is handed
    # to GenePredBasics.GenePredFile below -- confirm and reword if intended.
    parser.add_argument('reference_genepred',
                        help="FASTAFILENAME for reference genepred")
    args = parser.parse_args()

    genome = {}
    if args.output_fake_psl:
        genome = read_fasta_into_hash(args.output_fake_psl)

    # Read in the reference genepred first and group junctions by chromosome.
    gpf = GenePredBasics.GenePredFile(args.reference_genepred)
    ref = {}
    for e in (x.entry for x in gpf.entries):
        starts = e['exonStarts']
        ends = e['exonEnds']
        if len(starts) <= 1:
            continue  # single-exon entries have no junctions
        chrom_juns = ref.setdefault(e['chrom'], {})
        for i in range(1, len(starts)):
            # setdefault keeps the first strand recorded for a junction.
            chrom_juns.setdefault(ends[i - 1], {}).setdefault(
                starts[i] + 1, e['strand'])
    # Stored all junctions as 1-base

    pf = GenericFileReader(args.psl)
    try:
        while True:
            line = pf.readline()
            if not line:
                break
            if line.startswith('#'):
                continue
            pe = PSLBasics.line_to_entry(line.rstrip())
            block_count = len(pe['blockSizes'])
            if len(pe['tStarts']) != block_count \
                    or len(pe['qStarts']) != block_count:
                sys.stderr.write("WARNING invalid psl\n")
                continue
            genepred_line = PSLBasics.convert_entry_to_genepred_line(pe)
            ge = GenePredBasics.smooth_gaps(
                GenePredBasics.line_to_entry(genepred_line),
                args.min_intron_size)
            refjuns = ref.get(pe['tName'], {})
            new_ge = nudge(pe, ge, refjuns, args)
            # print(x) with one argument behaves the same as the Python 2
            # statement ``print x``.
            if args.output_fake_psl:
                print(GenePredBasics.entry_to_fake_psl_line(new_ge, genome))
            else:
                print(GenePredBasics.entry_to_line(new_ge))
    finally:
        # Release the reader even if a line fails to parse.
        pf.close()
def check_B_entries(eA, gpdB, overlap, args):
    """Compare one A entry against every B entry and write the matches.

    Args:
        eA: the A-side entry; ``eA.entry`` and ``eA.get_exon_count()`` are
            used.
        gpdB: container whose ``entries`` are the B-side entries.
        overlap: value handed to ``set_overlap_requirement`` of each
            ``GenePredBasics.GenePredComparison``.
        args: parsed options; reads ``allow_a_subset_of_b_fragments``,
            ``allow_any_fragments``, ``output_a_not_in_b``, ``best_b_only``
            and ``leftouterjoin``.

    Output is written to stdout in a single call:
      * without ``best_b_only``: one ``A<TAB>B`` line per matching B;
      * with ``best_b_only``: only the best match, ranked by consecutive
        exons, then overlap length, then harmonic mean of the overlap
        fractions (first seen wins ties);
      * the A line alone when nothing matched and ``output_a_not_in_b`` or
        ``leftouterjoin`` is set.

    When neither fragment option is set a match means ``full_match`` with
    equal exon counts and all exons required to overlap; otherwise it means
    ``partial_match``.
    """
    fragments_allowed = (args.allow_a_subset_of_b_fragments
                         or args.allow_any_fragments)
    # The A line does not depend on B, so build it once.
    line_a = GenePredBasics.entry_to_line(eA.entry)

    a_unique = True
    best_exon_count = 0
    best_overlap = 0
    best_frac = 0
    best_line = ''
    out_parts = []

    for eB in gpdB.entries:
        # Cheap filters first, before any comparison object is built.
        if eA.entry['chrom'] != eB.entry['chrom']:
            continue
        if not fragments_allowed \
                and eA.get_exon_count() != eB.get_exon_count():
            continue

        gpd_comparison = GenePredBasics.GenePredComparison()
        gpd_comparison.set_overlap_requirement(overlap)

        if fragments_allowed:
            # Allow partial matches
            gpd_comparison.compare(eA, eB)
            if not gpd_comparison.output['partial_match']:
                continue
            # if we require a to be subset of b
            if args.allow_a_subset_of_b_fragments and not (
                    eA.get_exon_count() < eB.get_exon_count()
                    and eA.get_exon_count()
                    == gpd_comparison.output['consecutive_exons']):
                # NOTE(review): this stops scanning the remaining B entries
                # rather than skipping just this one; kept as in the
                # original -- confirm whether ``continue`` was intended.
                break
        else:
            # normal is to do full length matches
            gpd_comparison.set_require_all_exons_overlap(True)
            gpd_comparison.compare(eA, eB)
            if not gpd_comparison.output['full_match']:
                continue

        a_unique = False
        if args.output_a_not_in_b:
            # Only entries unique to A are printed, so one hit is enough.
            break

        double_line = line_a + "\t" \
            + GenePredBasics.entry_to_line(eB.entry) + "\n"
        if not args.best_b_only:
            # Not waiting for the best: emit every match.
            out_parts.append(double_line)
            continue

        # Only keep the best: lexicographic comparison on
        # (consecutive exons, overlap length, harmonic mean of fractions).
        exons = gpd_comparison.output['consecutive_exons']
        length = gpd_comparison.output['overlap_length']
        if exons != best_exon_count:
            is_better = exons > best_exon_count
        elif length != best_overlap:
            is_better = length > best_overlap
        else:
            is_better = harmonic_mean(
                gpd_comparison.output['overlap_fractions']) > best_frac
        if is_better:
            best_exon_count = exons
            best_overlap = length
            best_line = double_line
            best_frac = harmonic_mean(
                gpd_comparison.output['overlap_fractions'])

    if best_exon_count > 0 and args.best_b_only:
        out_parts.append(best_line)
    if a_unique and (args.output_a_not_in_b or args.leftouterjoin):
        out_parts.append(line_a + "\n")
    sys.stdout.write(''.join(out_parts))