def get_matches(lastz_file):
    """Collect the lastz alignments in ``lastz_file`` per UCE locus.

    The file is read with ``lastz.Reader`` in long format.  The locus name
    and probe number of every record are derived from its ``name2`` field
    with ``get_uce_name`` and ``get_uce_num``.

    Returns a ``(matches, probes)`` pair of defaultdicts keyed by locus name:

    * ``matches[locus]`` holds one ``[contig, strand2, zstart1, end1]`` list
      per record, ``contig`` being the lower-cased ``get_name(name1)``.
    * ``probes[locus]`` is the highest probe number seen for the locus
      (never below the defaultdict's initial 0).
    """
    hits_by_locus = defaultdict(list)
    highest_probe = defaultdict(int)
    for record in lastz.Reader(lastz_file, long_format=True):
        locus = get_uce_name(record.name2)
        probe_number = get_uce_num(record.name2)
        # max() keeps the stored value on ties, just like a strict ">" test.
        highest_probe[locus] = max(highest_probe[locus], probe_number)
        contig = get_name(record.name1).lower()
        hits_by_locus[locus].append(
            [contig, record.strand2, record.zstart1, record.end1]
        )
    return hits_by_locus, highest_probe
def get_bgi_matches(lastz_file, stripnum):
    """Collect the lastz alignments in ``lastz_file`` per locus (BGI naming).

    For every record the locus name is ``name2`` with each match of the
    ``stripnum`` pattern replaced by ``'s'`` and then lower-cased; the probe
    number is the integer after the last underscore of ``name2``.

    Returns a ``(matches, probes)`` pair of defaultdicts keyed by locus name:
    ``matches[locus]`` lists ``[contig, strand2, zstart1, end1]`` per record
    (``contig`` is the lower-cased ``get_name(name1)``) and ``probes[locus]``
    is the highest probe number seen for that locus.
    """
    hits_by_locus = defaultdict(list)
    highest_probe = defaultdict(int)
    for record in lastz.Reader(lastz_file, long_format=True):
        probe_name = record.name2
        locus = re.sub(stripnum, 's', probe_name).lower()
        # Only the text after the final "_" is needed for the probe number.
        probe_number = int(probe_name.rsplit('_', 1)[-1])
        if highest_probe[locus] < probe_number:
            highest_probe[locus] = probe_number
        contig = get_name(record.name1).lower()
        hits_by_locus[locus].append(
            [contig, record.strand2, record.zstart1, record.end1]
        )
    return hits_by_locus, highest_probe
Exemplo n.º 3
0
def main():
    """Fill the match database from every ``*.lastz`` file in ``args.lastz``.

    The locus names are read from the FASTA file ``args.query`` and the
    organism names are taken from the lastz file names (the text after the
    last "-" with the extension removed).  For each file, every locus
    returned by ``get_matches`` is stored with ``store_lastz_results_in_db``
    unless one of the following holds:

    * it has more than one hit and ``run_checks`` rejects it;
    * it has more than one hit and the hits span more than
      ``probes[locus] * 120`` positions;
    * it has at most one hit, a dupe file was given and the locus is in it.

    NOTE(review): ``get_matches`` is called with three arguments here, while
    the definitions visible in this file accept only ``lastz_file`` --
    confirm which version of ``get_matches`` this script is meant to use.
    """
    args = get_args()
    uces = set(
        get_name(record.identifier, "|", 1)
        for record in fasta.FastaReader(args.query)
    )
    lastz_files = glob.glob(os.path.join(args.lastz, '*.lastz'))
    # this prob. needs to be more robust
    organisms = []
    for path in lastz_files:
        tail = os.path.basename(path).split('-')[-1]
        organisms.append(os.path.splitext(tail)[0].replace('-', "_"))
    conn, c = create_match_database(args.db, organisms, uces)
    dupes = get_dupes(args.dupefile) if args.dupefile else None
    for path in lastz_files:
        tail = os.path.basename(path).split('-')[-1]
        critter = os.path.splitext(tail)[0]
        matches, probes = get_matches(path, args.splitchar, args.components)
        count = 0
        for locus, hits in matches.iteritems():
            if len(hits) > 1:
                if not run_checks(locus, hits, probes, args.verbose):
                    continue
                # sort by match position
                ordered = sorted(hits, key=itemgetter(2))
                span = ordered[-1][3] - ordered[0][2]
                # ensure our range is less than N(probes) * probe_length -
                # this still gives us a little wiggle room because probes
                # are ~ 2X tiled
                if span > (probes[locus] * 120):
                    if args.verbose:
                        print("range longer than expected")
                    continue
            elif args.dupefile and locus in dupes:
                if args.verbose:
                    print("{0} is in dupefile".format(locus))
                continue
            store_lastz_results_in_db(c, critter, locus)
            count += 1
        print("Entered {} matches for {}".format(count, critter))
    conn.commit()
    c.close()
    conn.close()
def get_matches(lastz_file):
    """Group the lastz records of ``lastz_file`` by UCE locus.

    Every record read by ``lastz.Reader`` (long format) contributes one
    ``[contig, strand2, zstart1, end1]`` list to ``matches[locus]``, where
    ``locus`` is ``get_uce_name(name2)`` and ``contig`` is the lower-cased
    ``get_name(name1)``.  ``probes[locus]`` tracks the largest
    ``get_uce_num(name2)`` seen for the locus, starting from 0.

    Returns the tuple ``(matches, probes)``; both are defaultdicts.
    """
    matches, probes = defaultdict(list), defaultdict(int)
    for aln in lastz.Reader(lastz_file, long_format=True):
        probe = aln.name2
        locus, number = get_uce_name(probe), get_uce_num(probe)
        if probes[locus] < number:
            probes[locus] = number
        hit = [get_name(aln.name1).lower(), aln.strand2, aln.zstart1, aln.end1]
        matches[locus].append(hit)
    return matches, probes
def get_bgi_matches(lastz_file, stripnum):
    """Group the lastz records of ``lastz_file`` by locus (BGI naming).

    The locus name of a record is its ``name2`` with every match of the
    ``stripnum`` pattern replaced by ``'s'``, lower-cased.  The probe number
    is the integer following the last underscore in ``name2``.

    Returns the tuple ``(matches, probes)`` of defaultdicts:
    ``matches[locus]`` is a list of ``[contig, strand2, zstart1, end1]``
    lists (``contig`` is the lower-cased ``get_name(name1)``) and
    ``probes[locus]`` is the largest probe number seen for the locus.
    """
    matches, probes = defaultdict(list), defaultdict(int)
    for aln in lastz.Reader(lastz_file, long_format=True):
        locus = re.sub(stripnum, 's', aln.name2).lower()
        # rpartition yields the whole string when there is no "_", which is
        # also what split('_')[-1] would give.
        number = int(aln.name2.rpartition('_')[2])
        # max() keeps the stored value on ties, just like a strict ">" test.
        probes[locus] = max(probes[locus], number)
        hit = [get_name(aln.name1).lower(), aln.strand2, aln.zstart1, aln.end1]
        matches[locus].append(hit)
    return matches, probes
def main():
    """Build the match database from the ``*.lastz`` files in ``args.lastz``.

    Locus names come from the identifiers in ``args.query``; organism names
    come from the lastz file names (text after the last "-", extension
    dropped).  Per file, each locus reported by ``get_matches`` is passed to
    ``store_lastz_results_in_db`` when it is kept:

    * loci with several hits are kept only if ``run_checks`` accepts them
      and the hits span no more than ``probes[locus] * 120`` positions;
    * loci with at most one hit are kept unless a dupe file was supplied and
      lists them.

    NOTE(review): ``get_matches`` receives three arguments here, but the
    definitions visible in this file take a single ``lastz_file`` parameter
    -- confirm which version this script should be paired with.
    """
    args = get_args()
    uces = set()
    for record in fasta.FastaReader(args.query):
        uces.add(get_name(record.identifier, "|", 1))
    lastz_paths = glob.glob(os.path.join(args.lastz, '*.lastz'))
    # this prob. needs to be more robust
    organisms = []
    for lastz_path in lastz_paths:
        stem = os.path.splitext(os.path.basename(lastz_path).split('-')[-1])[0]
        organisms.append(stem.replace('-', "_"))
    conn, c = create_match_database(args.db, organisms, uces)
    dupes = None
    if args.dupefile:
        dupes = get_dupes(args.dupefile)
    for lastz_path in lastz_paths:
        critter = os.path.splitext(os.path.basename(lastz_path).split('-')[-1])[0]
        matches, probes = get_matches(lastz_path, args.splitchar, args.components)
        count = 0
        for locus, hits in matches.iteritems():
            keep = True
            if len(hits) > 1:
                if run_checks(locus, hits, probes, args.verbose):
                    # sort by match position
                    by_position = sorted(hits, key=itemgetter(2))
                    first_start = by_position[0][2]
                    last_end = by_position[-1][3]
                    # ensure our range is less than N(probes) * probe_length
                    # - this still gives us a little wiggle room because
                    # probes are ~ 2X tiled
                    if (last_end - first_start) > (probes[locus] * 120):
                        keep = False
                        if args.verbose:
                            print("range longer than expected")
                else:
                    keep = False
            elif args.dupefile and locus in dupes:
                keep = False
                if args.verbose:
                    print("{0} is in dupefile".format(locus))
            if keep:
                store_lastz_results_in_db(c, critter, locus)
                count += 1
        print("Entered {} matches for {}".format(count, critter))
    conn.commit()
    c.close()
    conn.close()
def main():
    """Write the UCE contigs of every configured organism to one FASTA file.

    Organisms and loci are read from the "Organisms" and "Loci" sections of
    the config file ``args.config``.  For each organism the contig file is
    located with ``find_file`` and the contigs to export are looked up with
    ``get_nodes_for_uces``; organisms whose name ends in "*" are read from
    ``args.extend_dir`` (and queried with ``extend=True``) instead of
    ``args.contigs``.  Each exported contig is reverse-complemented when its
    recorded strand is "-", has runs of N removed and has leading/trailing
    lowercase bases trimmed before it is written to ``args.output``.

    When ``args.notstrict`` is set it is used as an open, writable file:
    the loci reported missing for an organism are logged to it and counted
    as handled.

    Raises:
        ValueError: an organism name ends in "*" but ``args.extend_dir`` is
            not set.
        AssertionError: the loci handled for an organism differ from the
            configured loci.
    """
    args = get_args()
    config = ConfigParser.RawConfigParser(allow_no_value=True)
    config.read(args.config)
    conn = sqlite3.connect(args.db)
    c = conn.cursor()
    if args.extend_db:
        query = "ATTACH DATABASE '{0}' AS extended".format(args.extend_db)
        c.execute(query)
    organisms = get_names_from_config(config, "Organisms")
    uces = get_names_from_config(config, "Loci")
    uce_fasta_out = fasta.FastaWriter(args.output)
    # NOTE(review): the character class also matches a literal comma, and
    # because sub() removes every match, runs of N longer than 21 are removed
    # as well (21 characters at a time) -- confirm this is intended.
    regex = re.compile("[N,n]{1,21}")
    for organism in organisms:
        print("Getting {0} reads...".format(organism))
        written = []
        # going to need to do something more generic w/ suffixes
        name = organism.replace("_", "-")
        is_extended = organism.endswith("*")
        if is_extended and not args.extend_dir:
            # Without this guard no branch below would run, so `reads`,
            # `node_dict` and `missing` would be unbound for the first
            # organism or silently reused from the previous organism.
            raise ValueError(
                "Organism {0} ends with '*' but no extend_dir was "
                "given".format(organism)
            )
        if args.notstrict:
            if not is_extended:
                reads = find_file(args.contigs, name)
                node_dict, missing = get_nodes_for_uces(c, organism, uces, extend=False, notstrict=True)
            else:
                # remove the asterisk
                name = name.rstrip("*")
                reads = find_file(args.extend_dir, name)
                node_dict, missing = get_nodes_for_uces(c, organism.rstrip("*"), uces, extend=True, notstrict=True)
        else:
            if not is_extended:
                reads = find_file(args.contigs, name)
                node_dict, missing = get_nodes_for_uces(c, organism, uces)
            else:
                # remove the asterisk
                name = name.rstrip("*")
                reads = find_file(args.extend_dir, name)
                node_dict, missing = get_nodes_for_uces(c, organism.rstrip("*"), uces, extend=True)
        for read in fasta.FastaReader(reads):
            name = get_name(read.identifier).lower()
            coverage = get_coverage(read.identifier)
            # Test membership on the mapping itself; under Python 2,
            # `.keys()` would build a fresh list for every read.
            if name not in node_dict:
                continue
            uce_seq = fasta.FastaSequence()
            uce_seq.identifier = ">{0}_{1} |{0}|{2}".format(node_dict[name][0], organism.rstrip("*"), coverage)
            # deal with strandedness because aligners dont, which
            # is annoying
            if node_dict[name][1] == "-":
                uce_seq.sequence = transform.DNA_reverse_complement(read.sequence)
            else:
                uce_seq.sequence = read.sequence
            # replace any occurrences of <21 Ns in a given sequence with
            # blanks.  These should gap out during alignment.
            if regex.search(uce_seq.sequence):
                uce_seq.sequence = regex.sub("", uce_seq.sequence)
                print("\tReplaced < 20 ambiguous bases in {0}".format(uce_seq.identifier.split(" ")[0]))
            # Remove any leading/trailing lowercase bases from velvet
            # assemblies. Lowercase bases indicate low coverage, and these
            # have been problematic in downstream alignments.
            uce_seq.sequence = re.sub("^[acgtn]+", "", uce_seq.sequence)
            uce_seq.sequence = re.sub("[acgtn]+$", "", uce_seq.sequence)
            uce_fasta_out.write(uce_seq)
            written.append(str(node_dict[name][0]))
        if args.notstrict and missing:
            # Log the missing loci and count them as handled so the check
            # below only fails on genuinely unexpected differences.
            args.notstrict.write("[{0}]\n".format(organism))
            for name in missing:
                args.notstrict.write("{0}\n".format(name))
                written.append(name)
        assert set(written) == set(uces), "UCE names do not match"
    uce_fasta_out.close()
def main():
    """Write the UCE contigs of every configured organism to one FASTA file.

    Organisms and loci are read from the 'Organisms' and 'Loci' sections of
    the config file ``args.config``.  For each organism the contig file is
    located with ``find_file`` and the contigs to export are looked up with
    ``get_nodes_for_uces``; organisms whose name ends in '*' are read from
    ``args.extend_dir`` (and queried with ``extend=True``) instead of
    ``args.contigs``.  Each exported contig is reverse-complemented when its
    recorded strand is '-' and has runs of N removed before it is written to
    ``args.output``.

    When ``args.notstrict`` is set it is used as an open, writable file:
    the loci reported missing for an organism are logged to it and counted
    as handled.

    Raises:
        ValueError: an organism name ends in '*' but ``args.extend_dir`` is
            not set.
        AssertionError: the loci handled for an organism differ from the
            configured loci.
    """
    args = get_args()
    config = ConfigParser.RawConfigParser(allow_no_value=True)
    config.read(args.config)
    conn = sqlite3.connect(args.db)
    c = conn.cursor()
    if args.extend_db:
        query = "ATTACH DATABASE '{0}' AS extended".format(args.extend_db)
        c.execute(query)
    organisms = get_names_from_config(config, 'Organisms')
    uces = get_names_from_config(config, 'Loci')
    uce_fasta_out = fasta.FastaWriter(args.output)
    # NOTE(review): the character class also matches a literal comma, and
    # because sub() removes every match, runs of N longer than 21 are removed
    # as well (21 characters at a time) -- confirm this is intended.
    regex = re.compile("[N,n]{1,21}")
    for organism in organisms:
        print("Getting {0} reads...".format(organism))
        written = []
        # going to need to do something more generic w/ suffixes
        name = organism.replace('_', '-')
        has_asterisk = organism.endswith('*')
        if has_asterisk and not args.extend_dir:
            # Without this guard no branch below would run, so `reads`,
            # `node_dict` and `missing` would be unbound for the first
            # organism or silently reused from the previous organism.
            raise ValueError(
                "Organism {0} ends with '*' but no extend_dir was "
                "given".format(organism)
            )
        if args.notstrict:
            if not has_asterisk:
                reads = find_file(args.contigs, name)
                node_dict, missing = get_nodes_for_uces(c, organism, uces, extend=False, notstrict=True)
            else:
                # remove the asterisk
                name = name.rstrip('*')
                reads = find_file(args.extend_dir, name)
                node_dict, missing = get_nodes_for_uces(c, organism.rstrip('*'), uces, extend=True, notstrict=True)
        else:
            if not has_asterisk:
                reads = find_file(args.contigs, name)
                node_dict, missing = get_nodes_for_uces(c, organism, uces)
            else:
                # remove the asterisk
                name = name.rstrip('*')
                reads = find_file(args.extend_dir, name)
                node_dict, missing = get_nodes_for_uces(c, organism.rstrip('*'), uces, extend=True)
        for read in fasta.FastaReader(reads):
            name = get_name(read.identifier).lower()
            coverage = get_coverage(read.identifier)
            # Test membership on the mapping itself; under Python 2,
            # `.keys()` would build a fresh list for every read.
            if name not in node_dict:
                continue
            uce_seq = fasta.FastaSequence()
            uce_seq.identifier = ">{0}_{1} |{0}|{2}".format(node_dict[name][0], organism, coverage)
            # deal with strandedness because aligners dont, which
            # is annoying
            if node_dict[name][1] == '-':
                uce_seq.sequence = transform.DNA_reverse_complement(read.sequence)
            else:
                uce_seq.sequence = read.sequence
            # replace any occurrences of <21 Ns
            if regex.search(uce_seq.sequence):
                uce_seq.sequence = regex.sub("", uce_seq.sequence)
                print("\tReplaced < 20 ambiguous bases in {0}".format(uce_seq.identifier.split(' ')[0]))
            uce_fasta_out.write(uce_seq)
            written.append(str(node_dict[name][0]))
        if args.notstrict and missing:
            # Log the missing loci and count them as handled so the check
            # below only fails on genuinely unexpected differences.
            args.notstrict.write("[{0}]\n".format(organism))
            for name in missing:
                args.notstrict.write("{0}\n".format(name))
                written.append(name)
        assert set(written) == set(uces), "UCE names do not match"
    uce_fasta_out.close()