def main():
    """Write FASTA and/or BED records for lastz matches that pass QC.

    Options come from ``get_args()``.  Matches and per-locus probe counts are
    loaded with ``get_matches()``; every locus is then passed through
    ``quality_control_matches()``.  For loci that are not flagged as skipped:

    * if ``args.fasta`` is set, ``prep_and_write_fasta()`` is called with the
      2bit genome, the match coordinates and ``args.flank``;
    * if ``args.bed`` is set, one line is written with the interval widened
      by ``args.flank`` on both sides.

    Either output may be omitted; the FASTA handle and the genome file are
    closed only when they were actually opened.
    """
    args = get_args()
    # NOTE(review): the character class also matches a literal comma
    # ("[N,n]"); kept unchanged because it affects the written output.
    regex = re.compile("[N,n]{20,}")
    if args.dupefile:
        dupes = get_dupes(args.dupefile, longfile=False)
    else:
        dupes = None
    matches, probes = get_matches(
        args.lastz, args.splitchar, args.components, args.fish)
    genome = None
    tb = None
    if args.fasta:
        # 2bit is a binary format, so open the genome in binary mode and keep
        # the handle around so it can be closed when we are done.
        genome = open(args.genome, "rb")
        tb = bx.seq.twobit.TwoBitFile(genome)
    try:
        count = 0
        for k, v in matches.iteritems():
            chromo, strand, start, end, skip = quality_control_matches(
                matches, probes, dupes, k, v, args.verbose)
            if not skip and args.fasta:
                prep_and_write_fasta(tb, regex, args.fasta, chromo, strand,
                                     start, end, count, args.flank)
            if not skip and args.bed:
                args.bed.write("{0} {1} {2} {3} 1000 {4}\n".format(
                    chromo, start - args.flank, end + args.flank, k, strand))
            # NOTE(review): original indentation was lost; count is assumed
            # to advance once per locus (skipped or not) -- confirm.
            count += 1
    finally:
        # args.fasta is optional: closing it unconditionally raised an
        # AttributeError at the very end of a BED-only run.
        if args.fasta:
            args.fasta.close()
        if genome is not None:
            genome.close()
def main():
    """Record, per organism, which loci have acceptable lastz matches.

    Locus names are taken from the identifiers of the reads in
    ``args.query``.  One organism name is derived from the file name of every
    ``*.lastz`` file found in ``args.lastz``, and the match database is
    created from those two collections.  Each lastz file is then parsed with
    ``get_matches()``; loci that survive the filters below are stored via
    ``store_lastz_results_in_db()`` and a per-organism tally is printed.
    The database is committed once, after all files have been processed.
    """
    args = get_args()
    uces = set(
        get_name(read.identifier, "|", 1)
        for read in fasta.FastaReader(args.query)
    )
    files = glob.glob(os.path.join(args.lastz, '*.lastz'))
    # The organism is whatever follows the last '-' in the file name, minus
    # the extension.
    # NOTE(review): the replace('-', "_") below can never match, because the
    # name was just taken from the last '-'-separated piece.
    organisms = []
    for lastz_path in files:
        tail = os.path.basename(lastz_path).split('-')[-1]
        organisms.append(os.path.splitext(tail)[0].replace('-', "_"))
    conn, c = create_match_database(args.db, organisms, uces)
    dupes = None
    if args.dupefile:
        dupes = get_dupes(args.dupefile)
    for lastz_path in files:
        critter = os.path.splitext(
            os.path.basename(lastz_path).split('-')[-1])[0]
        matches, probes = get_matches(
            lastz_path, args.splitchar, args.components)
        count = 0
        for locus, hits in matches.iteritems():
            if len(hits) > 1:
                # Several hits for one locus: they must pass run_checks()...
                if not run_checks(locus, hits, probes, args.verbose):
                    continue
                # ...and, ordered by match position, span no more than
                # N(probes) * probe length.  Probes are ~2X tiled, so this
                # still leaves a little wiggle room.
                ordered = sorted(hits, key=itemgetter(2))
                start = ordered[0][2]
                end = ordered[-1][3]
                if end - start > (probes[locus] * 120):
                    if args.verbose:
                        print("range longer than expected")
                    continue
            elif args.dupefile and locus in dupes:
                # The dupefile is only consulted for loci with at most one hit.
                if args.verbose:
                    print("{0} is in dupefile".format(locus))
                continue
            store_lastz_results_in_db(c, critter, locus)
            count += 1
        print("Entered {} matches for {}".format(count, critter))
    conn.commit()
    c.close()
    conn.close()
def main():
    """Populate the match database from a directory of lastz results.

    The set of locus names is built from the read identifiers in
    ``args.query``; organism names come from the ``*.lastz`` file names in
    ``args.lastz``.  For each lastz file the matches returned by
    ``get_matches()`` are filtered, the surviving loci are handed to
    ``store_lastz_results_in_db()``, and the number stored is printed.
    A single commit is issued after the last file, then the cursor and
    connection are closed.
    """
    def lastz_stem(path):
        # Last '-'-separated piece of the file name, extension removed.
        return os.path.splitext(os.path.basename(path).split('-')[-1])[0]

    args = get_args()
    uces = set(get_name(read.identifier, "|", 1)
               for read in fasta.FastaReader(args.query))
    files = glob.glob(os.path.join(args.lastz, '*.lastz'))
    # NOTE(review): replace('-', "_") is a no-op here, since lastz_stem()
    # already discarded everything up to the last '-'.
    organisms = [lastz_stem(path).replace('-', "_") for path in files]
    conn, c = create_match_database(args.db, organisms, uces)
    dupes = get_dupes(args.dupefile) if args.dupefile else None
    for path in files:
        critter = lastz_stem(path)
        matches, probes = get_matches(path, args.splitchar, args.components)
        count = 0
        for name, positions in matches.iteritems():
            skip = False
            if len(positions) > 1:
                if not run_checks(name, positions, probes, args.verbose):
                    skip = True
                else:
                    # Order by match position and measure the overall span.
                    by_position = sorted(positions, key=itemgetter(2))
                    start, end = by_position[0][2], by_position[-1][3]
                    # The span must stay below N(probes) * probe length;
                    # probes are ~2X tiled, which leaves some slack.
                    if end - start > (probes[name] * 120):
                        skip = True
                        if args.verbose:
                            print("range longer than expected")
            elif args.dupefile and name in dupes:
                skip = True
                if args.verbose:
                    print("{0} is in dupefile".format(name))
            if skip:
                continue
            store_lastz_results_in_db(c, critter, name)
            count += 1
        print("Entered {} matches for {}".format(count, critter))
    conn.commit()
    c.close()
    conn.close()
def main():
    """Write FASTA and/or BED records for lastz matches that pass QC.

    Options come from ``get_args()``.  Matches and per-locus probe counts are
    loaded with ``get_matches()``; every locus is then passed through
    ``quality_control_matches()``.  For loci that are not flagged as skipped:

    * if ``args.fasta`` is set, ``prep_and_write_fasta()`` is called with the
      2bit genome, the match coordinates and ``args.flank``;
    * if ``args.bed`` is set, one line is written with the interval widened
      by ``args.flank`` on both sides.

    Either output may be omitted; the FASTA handle and the genome file are
    closed only when they were actually opened.
    """
    args = get_args()
    # NOTE(review): the character class also matches a literal comma
    # ("[N,n]"); kept unchanged because it affects the written output.
    regex = re.compile("[N,n]{20,}")
    if args.dupefile:
        dupes = get_dupes(args.dupefile, longfile=False)
    else:
        dupes = None
    matches, probes = get_matches(
        args.lastz, args.splitchar, args.components, args.fish)
    genome = None
    tb = None
    if args.fasta:
        # 2bit is a binary format, so open the genome in binary mode and keep
        # the handle around so it can be closed when we are done.
        genome = open(args.genome, "rb")
        tb = bx.seq.twobit.TwoBitFile(genome)
    try:
        count = 0
        for k, v in matches.iteritems():
            chromo, strand, start, end, skip = quality_control_matches(
                matches, probes, dupes, k, v, args.verbose)
            if not skip and args.fasta:
                prep_and_write_fasta(tb, regex, args.fasta, chromo, strand,
                                     start, end, count, args.flank)
            if not skip and args.bed:
                args.bed.write("{0} {1} {2} {3} 1000 {4}\n".format(
                    chromo, start - args.flank, end + args.flank, k, strand))
            # NOTE(review): original indentation was lost; count is assumed
            # to advance once per locus (skipped or not) -- confirm.
            count += 1
    finally:
        # args.fasta is optional: closing it unconditionally raised an
        # AttributeError at the very end of a BED-only run.
        if args.fasta:
            args.fasta.close()
        if genome is not None:
            genome.close()