def main():
    """Quality-check lastz matches and write passing loci as FASTA and/or BED.

    Options come from ``get_args()``.  Each locus returned by ``get_matches``
    is passed to ``quality_control_matches``; loci that come back with a
    false ``skip`` flag are handed to ``prep_and_write_fasta`` (when
    ``args.fasta`` is set) and/or written as one line to ``args.bed`` (when
    ``args.bed`` is set).  The BED coordinates are widened by ``args.flank``
    on both sides.

    Both outputs are optional, so the FASTA handle is closed only when one
    was actually supplied, and the genome file opened here is always
    released, even if processing fails part-way through.
    """
    args = get_args()
    # NOTE(review): this character class also matches a literal comma; the
    # pattern is left untouched because it is consumed by
    # prep_and_write_fasta, which is not visible here.
    regex = re.compile("[N,n]{20,}")
    dupes = get_dupes(args.dupefile, longfile=False) if args.dupefile else None
    matches, probes = get_matches(args.lastz, args.splitchar, args.components,
                                  args.fish)
    #unique_matches = sum([1 for uce, map_pos in matches.iteritems() if len(map_pos) == probes[uce]])
    genome = None
    try:
        tb = None
        if args.fasta:
            # The genome is only needed for FASTA output.  2bit is a binary
            # format, so open it in binary mode with open() rather than the
            # deprecated file() builtin.
            genome = open(args.genome, "rb")
            tb = bx.seq.twobit.TwoBitFile(genome)
        # `count` numbers every locus, including skipped ones, exactly as the
        # manual counter did.
        for count, (k, v) in enumerate(matches.iteritems()):
            chromo, strand, start, end, skip = quality_control_matches(
                matches, probes, dupes, k, v, args.verbose)
            if skip:
                continue
            if args.fasta:
                prep_and_write_fasta(tb, regex, args.fasta, chromo, strand,
                                     start, end, count, args.flank)
            if args.bed:
                args.bed.write("{0} {1} {2} {3} 1000 {4}\n".format(
                    chromo, start - args.flank, end + args.flank, k, strand))
    finally:
        if genome is not None:
            genome.close()
        # args.fasta is optional; closing it unconditionally fails when only
        # BED output was requested.
        if args.fasta:
            args.fasta.close()
# Example no. 2
# 0
def main():
    args = get_args()
    uces = set([
        get_name(read.identifier, "|", 1)
        for read in fasta.FastaReader(args.query)
    ])
    files = glob.glob(os.path.join(args.lastz, '*.lastz'))
    # this prob. needs to be more robust
    organisms = [
        os.path.splitext(os.path.basename(f).split('-')[-1])[0].replace(
            '-', "_") for f in files
    ]
    conn, c = create_match_database(args.db, organisms, uces)
    if args.dupefile:
        dupes = get_dupes(args.dupefile)
    else:
        dupes = None
    #pdb.set_trace()
    for f in files:
        critter = os.path.splitext(os.path.basename(f).split('-')[-1])[0]
        matches, probes = get_matches(f, args.splitchar, args.components)
        count = 0
        for k, v in matches.iteritems():
            skip = False
            if len(v) > 1:
                if run_checks(k, v, probes, args.verbose):
                    # sort by match position
                    v_sort = sorted(v, key=itemgetter(2))
                    start, end = v_sort[0][2], v_sort[-1][3]
                    diff = end - start
                    # ensure our range is less than N(probes) * probe_length - this
                    # still gives us a little wiggle room because probes are ~ 2X tiled
                    if diff > (probes[k] * 120):
                        skip = True
                        if args.verbose:
                            print "range longer than expected"
                else:
                    skip = True
            elif args.dupefile and k in dupes:
                skip = True
                if args.verbose: print "{0} is in dupefile".format(k)
            else:
                pass
            if not skip:
                store_lastz_results_in_db(c, critter, k)
                count += 1
        print "Entered {} matches for {}".format(count, critter)
    conn.commit()
    c.close()
    conn.close()
def main():
    args = get_args()
    uces = set([get_name(read.identifier, "|", 1) for read in fasta.FastaReader(args.query)])
    files = glob.glob(os.path.join(args.lastz, '*.lastz'))
    # this prob. needs to be more robust
    organisms = [os.path.splitext(os.path.basename(f).split('-')[-1])[0].replace('-',"_") for f in files]
    conn, c = create_match_database(args.db, organisms, uces)
    if args.dupefile:
        dupes = get_dupes(args.dupefile)
    else:
        dupes = None
    #pdb.set_trace()
    for f in files:
        critter = os.path.splitext(os.path.basename(f).split('-')[-1])[0]
        matches, probes = get_matches(f, args.splitchar, args.components)
        count = 0
        for k,v in matches.iteritems():
            skip = False
            if len(v) > 1:
                if run_checks(k, v, probes, args.verbose):
                    # sort by match position
                    v_sort = sorted(v, key = itemgetter(2))
                    start, end = v_sort[0][2], v_sort[-1][3]
                    diff = end - start
                    # ensure our range is less than N(probes) * probe_length - this
                    # still gives us a little wiggle room because probes are ~ 2X tiled
                    if diff > (probes[k] * 120):
                        skip = True
                        if args.verbose:
                            print "range longer than expected"
                else:
                    skip = True
            elif args.dupefile and k in dupes:
                skip = True
                if args.verbose:print "{0} is in dupefile".format(k)
            else:
                pass
            if not skip:
                store_lastz_results_in_db(c, critter, k)
                count += 1
        print "Entered {} matches for {}".format(count, critter)
    conn.commit()
    c.close()
    conn.close()
def main():
    """Filter lastz matches and emit the surviving loci as FASTA and/or BED.

    Arguments are taken from ``get_args()``.  ``get_matches`` supplies the
    per-locus matches and probe counts; ``quality_control_matches`` returns
    the locus coordinates plus a ``skip`` flag.  Non-skipped loci go to
    ``prep_and_write_fasta`` when ``args.fasta`` is set and to ``args.bed``
    when ``args.bed`` is set, with ``args.flank`` subtracted from the start
    and added to the end in the BED line.

    ``args.fasta`` is optional, so it is closed only when present.  The
    genome file opened for FASTA output is closed on exit as well, including
    when an error interrupts the loop.
    """
    args = get_args()
    # NOTE(review): "[N,n]" also matches a comma.  The pattern is kept
    # byte-for-byte because its consumer (prep_and_write_fasta) is defined
    # elsewhere.
    regex = re.compile("[N,n]{20,}")
    if args.dupefile:
        dupes = get_dupes(args.dupefile, longfile=False)
    else:
        dupes = None
    matches, probes = get_matches(args.lastz, args.splitchar, args.components, args.fish)
    #unique_matches = sum([1 for uce, map_pos in matches.iteritems() if len(map_pos) == probes[uce]])
    genome_handle = None
    tb = None
    try:
        if args.fasta:
            # Only FASTA output needs the genome.  Open the 2bit file in
            # binary mode via open(); file() is deprecated.
            genome_handle = open(args.genome, "rb")
            tb = bx.seq.twobit.TwoBitFile(genome_handle)
        # enumerate keeps the running index advancing for skipped loci too,
        # matching the previous hand-maintained counter.
        for count, (k, v) in enumerate(matches.iteritems()):
            chromo, strand, start, end, skip = quality_control_matches(
                matches, probes, dupes, k, v, args.verbose)
            if skip:
                continue
            if args.fasta:
                prep_and_write_fasta(tb, regex, args.fasta, chromo, strand,
                                     start, end, count, args.flank)
            if args.bed:
                args.bed.write("{0} {1} {2} {3} 1000 {4}\n".format(
                    chromo, start - args.flank, end + args.flank, k, strand))
    finally:
        if genome_handle is not None:
            genome_handle.close()
        # Guarded: an unconditional close breaks BED-only runs where no
        # FASTA output was given.
        if args.fasta:
            args.fasta.close()