def quality_control_matches(matches, probes, dupes, k, v, verbose=False):
    """Check that a locus has no more matches than expected and that the
    matches are reasonably close to each other on their chromosome.

    Args:
        matches: Not used by this function; kept so existing callers keep
            working.
        probes: Mapping indexed by ``k``; ``probes[k]`` is multiplied by 140
            to get the largest span the matches may cover.
        dupes: Container of locus names flagged as duplicates, or ``None``
            (or empty) when there is no dupefile.
        k: Name of the locus being checked.
        v: Non-empty sequence of match records.  Each record is indexed as
            ``(chromo, strand, start, end)``.
        verbose: When true, print why a locus is being skipped.

    Returns:
        A ``(chromo, strand, start, end, skip)`` tuple.  ``skip`` is True
        when the locus should be dropped.  Coordinates are all ``None`` for
        a locus rejected by ``run_checks`` or found in ``dupes``; for a locus
        rejected because its span is too long, ``start`` and ``end`` are
        still filled in while ``chromo`` and ``strand`` stay ``None``.
    """
    chromo, strand, start, end = None, None, None, None
    skip = False
    if len(v) > 1:
        if run_checks(k, v, probes):
            # Sort by match start position.  The span runs from the first
            # start to the end of the match with the last start.
            by_start = sorted(v, key=itemgetter(2))
            start, end = by_start[0][2], by_start[-1][3]
            # ensure our range is less than N(probes) * probe_length - this
            # still gives us a little wiggle room because probes are ~ 2X
            # tiled.
            # NOTE(review): main() applies the same check with 120 rather
            # than 140 -- confirm which multiplier is intended.
            if end - start > probes[k] * 140:
                skip = True
                if verbose:
                    print("range longer than expected")
            else:
                chromo, strand = v[0][0], v[0][1]
        else:
            skip = True
    elif dupes and k in dupes:
        # `dupes` is None when no dupefile was supplied, so it has to be
        # checked before the membership test.
        skip = True
        if verbose:
            print("{0} is in dupefile".format(k))
    else:
        chromo, strand, start, end = v[0]
    return chromo, strand, start, end, skip
def main():
    """Record, per organism, which loci have acceptable lastz matches.

    Collects the locus names from the query fasta, creates the match
    database, then walks every ``*.lastz`` file in the lastz directory.  A
    locus is stored for an organism unless:

    * it has several matches and ``run_checks`` rejects them,
    * it has several matches spanning more than ``probes[k] * 120``, or
    * it has at most one match and is listed in the dupefile.

    The number of loci stored is printed for each organism.  Results are
    committed once every file has been processed; the cursor and connection
    are closed even if processing fails part-way.
    """
    args = get_args()

    def critter_name(path):
        # The organism name is whatever follows the last '-' in the file
        # name, minus the extension, so it can never itself contain a '-'.
        # this prob. needs to be more robust
        return os.path.splitext(os.path.basename(path).split('-')[-1])[0]

    uces = set(
        get_name(read.identifier, "|", 1)
        for read in fasta.FastaReader(args.query)
    )
    files = glob.glob(os.path.join(args.lastz, '*.lastz'))
    organisms = [critter_name(f) for f in files]
    conn, c = create_match_database(args.db, organisms, uces)
    try:
        dupes = get_dupes(args.dupefile) if args.dupefile else None
        for f in files:
            critter = critter_name(f)
            matches, probes = get_matches(f, args.splitchar, args.components)
            count = 0
            for k, v in matches.items():
                skip = False
                if len(v) > 1:
                    if run_checks(k, v, probes, args.verbose):
                        # sort by match position
                        v_sort = sorted(v, key=itemgetter(2))
                        start, end = v_sort[0][2], v_sort[-1][3]
                        # ensure our range is less than N(probes) *
                        # probe_length - this still gives us a little wiggle
                        # room because probes are ~ 2X tiled
                        if end - start > probes[k] * 120:
                            skip = True
                            if args.verbose:
                                print("range longer than expected")
                    else:
                        skip = True
                elif args.dupefile and k in dupes:
                    skip = True
                    if args.verbose:
                        print("{0} is in dupefile".format(k))
                if not skip:
                    store_lastz_results_in_db(c, critter, k)
                    count += 1
            print("Entered {} matches for {}".format(count, critter))
        conn.commit()
    finally:
        # Release the database handles whether or not the run succeeded.
        c.close()
        conn.close()
def main():
    """Record, per organism, which loci have acceptable lastz matches.

    Collects the locus names from the query fasta, creates the match
    database, then walks every ``*.lastz`` file in the lastz directory.  A
    locus is stored for an organism unless:

    * it has several matches and ``run_checks`` rejects them,
    * it has several matches spanning more than ``probes[k] * 120``, or
    * it has at most one match and is listed in the dupefile.

    The number of loci stored is printed for each organism.  Results are
    committed once every file has been processed; the cursor and connection
    are closed even if processing fails part-way.
    """
    args = get_args()

    def critter_name(path):
        # The organism name is whatever follows the last '-' in the file
        # name, minus the extension, so it can never itself contain a '-'.
        # this prob. needs to be more robust
        return os.path.splitext(os.path.basename(path).split('-')[-1])[0]

    uces = set(
        get_name(read.identifier, "|", 1)
        for read in fasta.FastaReader(args.query)
    )
    files = glob.glob(os.path.join(args.lastz, '*.lastz'))
    organisms = [critter_name(f) for f in files]
    conn, c = create_match_database(args.db, organisms, uces)
    try:
        dupes = get_dupes(args.dupefile) if args.dupefile else None
        for f in files:
            critter = critter_name(f)
            matches, probes = get_matches(f, args.splitchar, args.components)
            count = 0
            for k, v in matches.items():
                skip = False
                if len(v) > 1:
                    if run_checks(k, v, probes, args.verbose):
                        # sort by match position
                        v_sort = sorted(v, key=itemgetter(2))
                        start, end = v_sort[0][2], v_sort[-1][3]
                        # ensure our range is less than N(probes) *
                        # probe_length - this still gives us a little wiggle
                        # room because probes are ~ 2X tiled
                        if end - start > probes[k] * 120:
                            skip = True
                            if args.verbose:
                                print("range longer than expected")
                    else:
                        skip = True
                elif args.dupefile and k in dupes:
                    skip = True
                    if args.verbose:
                        print("{0} is in dupefile".format(k))
                if not skip:
                    store_lastz_results_in_db(c, critter, k)
                    count += 1
            print("Entered {} matches for {}".format(count, critter))
        conn.commit()
    finally:
        # Release the database handles whether or not the run succeeded.
        c.close()
        conn.close()