def get_matches(lastz_file, splitchar, components, fish = False):
    matches = defaultdict(list)
    probes = defaultdict(int)
    for lz in lastz.Reader(lastz_file, long_format = True):
        # skip silly hg19 mhc haplotypes
        if "hap" in lz.name1:
            print "Skipping: ", lz.name1
        else:
            if not fish:
                uce_name = get_name(lz.name2, "|", 1)
                probe_number = int(lz.name2.split(':')[-1])
            else:
                uce_name = get_name(lz.name2, "_", 1)
                # add 1 because fish probe indexing starts @ 0
                probe_number = int(lz.name2.split('|')[1].split('_')[1]) + 1
            #pdb.set_trace()
            if probe_number > probes[uce_name]:
                probes[uce_name] = probe_number
            matches[uce_name].append([get_name(lz.name1, splitchar = splitchar, items = components), lz.strand2, lz.zstart1, lz.end1])
    return matches, probes
def main():
    args = get_args()
    uces = set([
        get_name(read.identifier, "|", 1)
        for read in fasta.FastaReader(args.query)
    ])
    files = glob.glob(os.path.join(args.lastz, '*.lastz'))
    # this prob. needs to be more robust
    organisms = [
        os.path.splitext(os.path.basename(f).split('-')[-1])[0].replace(
            '-', "_") for f in files
    ]
    conn, c = create_match_database(args.db, organisms, uces)
    if args.dupefile:
        dupes = get_dupes(args.dupefile)
    else:
        dupes = None
    #pdb.set_trace()
    for f in files:
        critter = os.path.splitext(os.path.basename(f).split('-')[-1])[0]
        matches, probes = get_matches(f, args.splitchar, args.components)
        count = 0
        for k, v in matches.iteritems():
            skip = False
            if len(v) > 1:
                if run_checks(k, v, probes, args.verbose):
                    # sort by match position
                    v_sort = sorted(v, key=itemgetter(2))
                    start, end = v_sort[0][2], v_sort[-1][3]
                    diff = end - start
                    # ensure our range is less than N(probes) * probe_length - this
                    # still gives us a little wiggle room because probes are ~ 2X tiled
                    if diff > (probes[k] * 120):
                        skip = True
                        if args.verbose:
                            print "range longer than expected"
                else:
                    skip = True
            elif args.dupefile and k in dupes:
                skip = True
                if args.verbose: print "{0} is in dupefile".format(k)
            else:
                pass
            if not skip:
                store_lastz_results_in_db(c, critter, k)
                count += 1
        print "Entered {} matches for {}".format(count, critter)
    conn.commit()
    c.close()
    conn.close()
def get_matches(lastz_file, splitchar, components, fish=False):
    matches = defaultdict(list)
    probes = defaultdict(int)
    for lz in lastz.Reader(lastz_file, long_format=True):
        # skip silly hg19 mhc haplotypes
        if "hap" in lz.name1:
            print "Skipping: ", lz.name1
        else:
            if not fish:
                uce_name = get_name(lz.name2, "|", 1)
                probe_number = int(lz.name2.split(':')[-1])
            else:
                uce_name = get_name(lz.name2, "_", 1)
                # add 1 because fish probe indexing starts @ 0
                probe_number = int(lz.name2.split('|')[1].split('_')[1]) + 1
            #pdb.set_trace()
            if probe_number > probes[uce_name]:
                probes[uce_name] = probe_number
            matches[uce_name].append([
                get_name(lz.name1, splitchar=splitchar, items=components),
                lz.strand2, lz.zstart1, lz.end1
            ])
    return matches, probes
def main():
    args = get_args()
    uces = set([get_name(read.identifier, "|", 1) for read in fasta.FastaReader(args.query)])
    files = glob.glob(os.path.join(args.lastz, '*.lastz'))
    # this prob. needs to be more robust
    organisms = [os.path.splitext(os.path.basename(f).split('-')[-1])[0].replace('-',"_") for f in files]
    conn, c = create_match_database(args.db, organisms, uces)
    if args.dupefile:
        dupes = get_dupes(args.dupefile)
    else:
        dupes = None
    #pdb.set_trace()
    for f in files:
        critter = os.path.splitext(os.path.basename(f).split('-')[-1])[0]
        matches, probes = get_matches(f, args.splitchar, args.components)
        count = 0
        for k,v in matches.iteritems():
            skip = False
            if len(v) > 1:
                if run_checks(k, v, probes, args.verbose):
                    # sort by match position
                    v_sort = sorted(v, key = itemgetter(2))
                    start, end = v_sort[0][2], v_sort[-1][3]
                    diff = end - start
                    # ensure our range is less than N(probes) * probe_length - this
                    # still gives us a little wiggle room because probes are ~ 2X tiled
                    if diff > (probes[k] * 120):
                        skip = True
                        if args.verbose:
                            print "range longer than expected"
                else:
                    skip = True
            elif args.dupefile and k in dupes:
                skip = True
                if args.verbose:print "{0} is in dupefile".format(k)
            else:
                pass
            if not skip:
                store_lastz_results_in_db(c, critter, k)
                count += 1
        print "Entered {} matches for {}".format(count, critter)
    conn.commit()
    c.close()
    conn.close()