def quality_control_matches(matches, probes, dupes, k, v, verbose=False):
    """Screen the matches found for one locus and summarise their position.

    A locus with more than one match is kept only when ``run_checks`` accepts
    it and the region covered by its matches is not longer than expected.
    A locus with a single match is kept unless its name is listed in
    ``dupes``.

    Args:
        matches: Unused here; kept so existing callers keep working.
        probes: Mapping from locus name to its probe count (``probes[k]`` is
            multiplied by 140 to get the longest acceptable range).
        dupes: Container of locus names to reject, or ``None`` when no dupe
            list is available.
        k: Name of the locus being checked.
        v: Non-empty sequence of matches, each indexable as
            ``(chromo, strand, start, end)``.
        verbose: When true, report loci rejected for covering too long a range.

    Returns:
        A ``(chromo, strand, start, end, skip)`` tuple.  ``skip`` is True when
        the locus should be ignored.  ``chromo`` and ``strand`` are ``None``
        whenever ``skip`` is True; ``start`` and ``end`` are still filled in
        when the locus was rejected only because its range was too long.
    """
    skip = False
    chromo, strand, start, end = None, None, None, None
    if len(v) > 1:
        if not run_checks(k, v, probes):
            skip = True
        else:
            # sort by match position
            v_sort = sorted(v, key=itemgetter(2))
            # range runs from the smallest start to the end of the match
            # that starts last
            start, end = v_sort[0][2], v_sort[-1][3]
            # ensure our range is less than N(probes) * probe_length - this
            # still gives us a little wiggle room because probes are ~ 2X tiled
            if end - start > probes[k] * 140:
                skip = True
                if verbose:
                    # single-argument print(...) behaves the same as the
                    # print statement on Python 2 and also parses on Python 3
                    print("range longer than expected")
            else:
                chromo, strand = v[0][0], v[0][1]
    elif dupes is not None and k in dupes:
        # `dupes` may be None when no dupe file was supplied; testing
        # membership on None would raise TypeError
        skip = True
        print("{0} is in dupefile".format(k))
    else:
        chromo, strand, start, end = v[0]
    return chromo, strand, start, end, skip
def quality_control_matches(matches, probes, dupes, k, v, verbose=False):
    """Screen the matches found for one locus and summarise their position.

    A locus with more than one match is kept only when ``run_checks`` accepts
    it and the region covered by its matches is not longer than expected.
    A locus with a single match is kept unless its name is listed in
    ``dupes``.

    Args:
        matches: Unused here; kept so existing callers keep working.
        probes: Mapping from locus name to its probe count (``probes[k]`` is
            multiplied by 140 to get the longest acceptable range).
        dupes: Container of locus names to reject, or ``None`` when no dupe
            list is available.
        k: Name of the locus being checked.
        v: Non-empty sequence of matches, each indexable as
            ``(chromo, strand, start, end)``.
        verbose: When true, report loci rejected for covering too long a range.

    Returns:
        A ``(chromo, strand, start, end, skip)`` tuple.  ``skip`` is True when
        the locus should be ignored.  ``chromo`` and ``strand`` are ``None``
        whenever ``skip`` is True; ``start`` and ``end`` are still filled in
        when the locus was rejected only because its range was too long.
    """
    skip = False
    chromo, strand, start, end = None, None, None, None
    if len(v) > 1:
        if not run_checks(k, v, probes):
            skip = True
        else:
            # sort by match position
            v_sort = sorted(v, key=itemgetter(2))
            # range runs from the smallest start to the end of the match
            # that starts last
            start, end = v_sort[0][2], v_sort[-1][3]
            # ensure our range is less than N(probes) * probe_length - this
            # still gives us a little wiggle room because probes are ~ 2X tiled
            if end - start > probes[k] * 140:
                skip = True
                if verbose:
                    # single-argument print(...) behaves the same as the
                    # print statement on Python 2 and also parses on Python 3
                    print("range longer than expected")
            else:
                chromo, strand = v[0][0], v[0][1]
    elif dupes is not None and k in dupes:
        # `dupes` may be None when no dupe file was supplied; testing
        # membership on None would raise TypeError
        skip = True
        print("{0} is in dupefile".format(k))
    else:
        chromo, strand, start, end = v[0]
    return chromo, strand, start, end, skip
示例#3
0
def main():
    """Screen lastz matches per organism and record the ones that pass.

    Every ``*.lastz`` file found under ``args.lastz`` is parsed with
    ``get_matches``.  A locus is recorded through
    ``store_lastz_results_in_db`` unless:

    * it has more than one match and either ``run_checks`` rejects it or its
      matches cover a range longer than ``probes[k] * 120``; or
    * it has at most one match and is listed in the dupe file (only when
      ``args.dupefile`` is given).

    The number of loci entered is printed per file.  Results are committed
    once, after all files were processed; the cursor and connection are
    closed even when processing fails part-way.
    """
    args = get_args()
    uces = set(
        get_name(read.identifier, "|", 1)
        for read in fasta.FastaReader(args.query)
    )
    files = glob.glob(os.path.join(args.lastz, '*.lastz'))
    # this prob. needs to be more robust
    # NOTE(review): the replace() below is currently a no-op - the text after
    # the last '-' of the file name cannot itself contain a '-'.
    organisms = [
        os.path.splitext(os.path.basename(f).split('-')[-1])[0].replace(
            '-', "_") for f in files
    ]
    conn, c = create_match_database(args.db, organisms, uces)
    try:
        dupes = get_dupes(args.dupefile) if args.dupefile else None
        for f in files:
            critter = os.path.splitext(os.path.basename(f).split('-')[-1])[0]
            matches, probes = get_matches(f, args.splitchar, args.components)
            count = 0
            for k, v in matches.iteritems():
                skip = False
                if len(v) > 1:
                    if run_checks(k, v, probes, args.verbose):
                        # sort by match position
                        v_sort = sorted(v, key=itemgetter(2))
                        span = v_sort[-1][3] - v_sort[0][2]
                        # ensure our range is less than N(probes) *
                        # probe_length - this still gives us a little wiggle
                        # room because probes are ~ 2X tiled
                        if span > probes[k] * 120:
                            skip = True
                            if args.verbose:
                                print("range longer than expected")
                    else:
                        skip = True
                elif dupes is not None and k in dupes:
                    skip = True
                    if args.verbose:
                        print("{0} is in dupefile".format(k))
                if not skip:
                    store_lastz_results_in_db(c, critter, k)
                    count += 1
            # single-argument print(...) prints the same text as the print
            # statement on Python 2
            print("Entered {} matches for {}".format(count, critter))
        conn.commit()
    finally:
        # release the database handles even if a file failed to process
        c.close()
        conn.close()
def main():
    """Screen lastz matches per organism and record the ones that pass.

    Every ``*.lastz`` file found under ``args.lastz`` is parsed with
    ``get_matches``.  A locus is recorded through
    ``store_lastz_results_in_db`` unless:

    * it has more than one match and either ``run_checks`` rejects it or its
      matches cover a range longer than ``probes[k] * 120``; or
    * it has at most one match and is listed in the dupe file (only when
      ``args.dupefile`` is given).

    The number of loci entered is printed per file.  Results are committed
    once, after all files were processed; the cursor and connection are
    closed even when processing fails part-way.
    """
    args = get_args()
    uces = set(
        get_name(read.identifier, "|", 1)
        for read in fasta.FastaReader(args.query)
    )
    files = glob.glob(os.path.join(args.lastz, '*.lastz'))
    # this prob. needs to be more robust
    # NOTE(review): the replace() below is currently a no-op - the text after
    # the last '-' of the file name cannot itself contain a '-'.
    organisms = [
        os.path.splitext(os.path.basename(f).split('-')[-1])[0].replace(
            '-', "_") for f in files
    ]
    conn, c = create_match_database(args.db, organisms, uces)
    try:
        dupes = get_dupes(args.dupefile) if args.dupefile else None
        for f in files:
            critter = os.path.splitext(os.path.basename(f).split('-')[-1])[0]
            matches, probes = get_matches(f, args.splitchar, args.components)
            count = 0
            for k, v in matches.iteritems():
                skip = False
                if len(v) > 1:
                    if run_checks(k, v, probes, args.verbose):
                        # sort by match position
                        v_sort = sorted(v, key=itemgetter(2))
                        span = v_sort[-1][3] - v_sort[0][2]
                        # ensure our range is less than N(probes) *
                        # probe_length - this still gives us a little wiggle
                        # room because probes are ~ 2X tiled
                        if span > probes[k] * 120:
                            skip = True
                            if args.verbose:
                                print("range longer than expected")
                    else:
                        skip = True
                elif dupes is not None and k in dupes:
                    skip = True
                    if args.verbose:
                        print("{0} is in dupefile".format(k))
                if not skip:
                    store_lastz_results_in_db(c, critter, k)
                    count += 1
            # single-argument print(...) prints the same text as the print
            # statement on Python 2
            print("Entered {} matches for {}".format(count, critter))
        conn.commit()
    finally:
        # release the database handles even if a file failed to process
        c.close()
        conn.close()