def align(params):
    """Align and trim the sequences of one locus, printing a progress mark.

    ``params`` is a ``(locus, opts)`` pair, where ``locus`` is
    ``(name, sequences)`` and ``opts`` is
    ``(window, threshold, notrim, proportion, divergence, min_len)``.

    Writes "." to stdout when ``aln.trimmed`` is truthy after trimming and
    "X" otherwise, then returns ``(name, aln)``.
    """
    (name, sequences), opts = params
    window, threshold, notrim, proportion, divergence, min_len = opts
    aln = Align(create_locus_specific_fasta(sequences))
    aln.run_alignment()
    # choose the trimming arguments first so there is a single call site
    if notrim:
        trim_kwargs = {"method": "notrim"}
    else:
        trim_kwargs = {
            "method": "running",
            "window_size": window,
            "proportion": proportion,
            "threshold": threshold,
            "max_divergence": divergence,
            "min_len": min_len,
        }
    aln.trim_alignment(**trim_kwargs)
    # one character of progress per locus
    marker = "." if aln.trimmed else "X"
    sys.stdout.write(marker)
    sys.stdout.flush()
    return name, aln
# Example no. 2
# 0
def align(params):
    """Align and trim the sequences of one locus, printing a progress dot.

    ``params`` is a ``(locus, opts)`` pair, where ``locus`` is
    ``(name, sequences)`` and ``opts`` is
    ``(window, threshold, notrim, proportion)``.

    Writes "." to stdout once the locus is processed and returns
    ``(name, aln)``.
    """
    (name, sequences), opts = params
    window, threshold, notrim, proportion = opts
    aln = Align(create_locus_specific_fasta(sequences))
    aln.run_alignment()
    # choose the trimming arguments first so there is a single call site
    if notrim:
        trim_kwargs = {'method': 'notrim'}
    else:
        trim_kwargs = {
            'method': 'running',
            'window_size': window,
            'threshold': threshold,
            'proportion': proportion,
        }
    aln.trim_alignment(**trim_kwargs)
    # one dot of progress per locus
    sys.stdout.write(".")
    sys.stdout.flush()
    return name, aln
# Example no. 3
# 0
def align(params):
    """Run the alignment for a single locus and trim the result.

    ``params`` unpacks to ``(locus, opts)``; ``locus`` unpacks to
    ``(name, sequences)`` and ``opts`` to
    ``(window, threshold, notrim, proportion)``.

    A "." is written to stdout for every processed locus. Returns the
    ``(name, aln)`` tuple.
    """
    (name, sequences), opts = params
    window, threshold, notrim, proportion = opts
    locus_fasta = create_locus_specific_fasta(sequences)
    aln = Align(locus_fasta)
    aln.run_alignment()
    # running-window trimming unless the caller asked for no trimming
    trim_kwargs = {'method': 'notrim'}
    if not notrim:
        trim_kwargs = {
            'method': 'running',
            'window_size': window,
            'threshold': threshold,
            'proportion': proportion,
        }
    aln.trim_alignment(**trim_kwargs)
    sys.stdout.write(".")
    sys.stdout.flush()
    return name, aln
def main():
    args = get_args()
    # compile some regular expressions we'll use later
    stripnum = re.compile("s_[0-9]+$")
    manyn = re.compile("[N,n]{20,}")
    # get names of loci and taxa
    uces = get_uce_names_from_probes(args.probes)
    taxa = get_taxa_names_from_fastas(args.fasta)
    print "\n"
    if not args.extend:
        if args.db is None:
            db = os.path.join(args.output, "probe.matches.sqlite")
        else:
            db = args.db
        # create db to hold results
        conn, c = create_probe_database(db, taxa, uces, True)
    else:
        conn, c = extend_probe_database(args.db, taxa)
    # get duplicate probe sequences for filtering
    if args.dupefile:
        print "Determining duplicate probes..."
        dupes = get_dupes(args.dupefile, longfile=False)
    else:
        dupes = None
    # iterate over LASTZ files for each taxon
    for lz in glob.glob(os.path.join(args.lastz, "*")):
        # get fasta name from lastz file
        ff = get_fasta_name_from_lastz_pth(lz, args.fasta, args.pattern)
        # get taxon name from lastz file
        taxon = get_taxon_from_filename(ff)
        print "\n{0}\n{1}\n{0}".format("=" * 30, taxon)
        # get lastz matches
        print "\tGetting LASTZ matches from GENOME alignments..."
        matches, probes = get_matches(lz)
        # remove bad loci (dupes)
        print "\tGetting bad (potentially duplicate) GENOME matches..."
        loci_to_skip = []
        for k, v in matches.iteritems():
            # check matches to makes sure all is well - keep names lc
            loci_to_skip.extend(quality_control_matches(matches, probes, dupes, k, v, False))
        # pdb.set_trace()
        # convert to set, to keep only uniques
        loci_to_skip = set(loci_to_skip)
        print "\tSkipping {} bad (duplicate hit) loci...".format(len(loci_to_skip))
        # get (and possibly assemble) non-skipped
        seqdict = defaultdict(list)
        # determine those contigs to skip and group those to assemble
        for contig in fasta.FastaReader(ff):
            # make sure all names are lowercase
            contig.identifier = contig.identifier.lower()
            name = contig.identifier.split("|")[-4].strip()
            locus = name.split("_")[0]
            # skip what we identified as bad loci
            if locus not in loci_to_skip:
                seqdict[locus].append(contig)
        output_name = "{}.fasta".format(taxon.replace("_", "-"))
        fout_name = os.path.join(args.output, output_name)
        print "\tOutput filename is {}".format(output_name)
        fout = fasta.FastaWriter(fout_name)
        # this tracks "fake" contig number
        count = 0
        # this tracks loci kept
        kept = 0
        # when > 1 contig, assemble contigs across matches
        sys.stdout.write("\tWriting and Aligning/Assembling UCE loci with multiple probes (dot/1000 loci)")
        for k, v in seqdict.iteritems():
            bad = False
            contig_names = []
            if count % 1000 == 0:
                sys.stdout.write(".")
                sys.stdout.flush()
            if len(v) == 1:
                # trim ambiguous bases on flanks
                record = v[0]
                orient = [matches[k][0][1]]
                if args.flank:
                    record = trim_uce_reads(record, args.flank)
                contig_names.append(record.identifier)
                record.sequence = record.sequence.strip("N")
                # trim many ambiguous bases within contig
                result = manyn.search(record.sequence)
                if result:
                    uce_start, uce_end = get_probe_positions(record)
                    uce = record.sequence[uce_start:uce_end]
                    record.sequence = snip_if_many_N_bases(manyn, k, record.sequence, uce, verbose=False)
                # change header
                record.identifier = ">Node_{0}_length_{1}_cov_1000".format(count, len(record.sequence))
                fout.write(v[0])
            else:
                orient = list(set([m[1] for m in matches[k]]))
                # skip any loci having matches of mixed orientation
                # ['+', '-']
                if len(orient) == 1:
                    # create tempfile for the reads
                    fd, temp = tempfile.mkstemp(suffix=".fasta")
                    os.close(fd)
                    temp_out = fasta.FastaWriter(temp)
                    # write all slices to outfile, trimming if we want
                    # pdb.set_trace()
                    for record in v:
                        if args.flank:
                            record = trim_uce_reads(record, args.flank)
                        # keep names of contigs we assembled to store in db assoc
                        # w/ resulting assembled contig name
                        contig_names.append(record.identifier)
                        record.sequence = record.sequence.strip("N")
                        # trim many ambiguous bases within contig
                        result = manyn.search(record.sequence)
                        if result:
                            uce_start, uce_end = get_probe_positions(record)
                            uce = record.sequence[uce_start:uce_end]
                            record.sequence = snip_if_many_N_bases(manyn, k, record.sequence, uce, verbose=False)
                        temp_out.write(record)
                    # make sure to close the file
                    temp_out.close()
                    # assemble
                    aln = Align(temp)
                    aln.run_alignment()
                    record = fasta.FastaSequence()
                    record.sequence = aln._alignment_consensus(aln.alignment)
                    record.identifier = ">Node_{0}_length_{1}_cov_1000".format(count, len(record.sequence))
                    # ensure that resulting consensus has no gaps or
                    # other odd characters (e.g. X)
                    if re.match("[ACGTNacgtn]", record.sequence):
                        fout.write(record)
                    else:
                        bad = True
                else:
                    bad = True
            if not bad:
                # track contig assembly and renaming data in db
                q = "UPDATE matches SET {0} = 1 WHERE uce = '{1}'".format(taxon, k)
                c.execute(q)
                # generate db match and match map tables for data
                orient_key = "node_{0}({1})".format(count, orient[0])
                q = "UPDATE match_map SET {0} = '{1}' WHERE uce = '{2}'".format(taxon, orient_key, k)
                c.execute(q)
                # keep track of new name :: old name mapping
                for old_name in contig_names:
                    q = "INSERT INTO contig_map VALUES ('{0}', '{1}', '{2}', '{3}')".format(
                        taxon, k, old_name, record.identifier
                    )
                    c.execute(q)
                kept += 1
            # tracking "fake" contig number
            count += 1
        conn.commit()
        print "\n\t{0} loci of {1} matched ({2:.0f}%), {3} dupes dropped ({4:.0f}%), {5} ({6:.0f}%) kept".format(
            count,
            len(uces),
            float(count) / len(uces) * 100,
            len(loci_to_skip),
            float(len(loci_to_skip)) / len(uces) * 100,
            kept,
            float(kept) / len(uces) * 100,
        )
    # conn.commit()
    c.close()
    conn.close()
def main():
    args = get_args()
    # compile some regular expressions we'll use later
    stripnum = re.compile("s_[0-9]+$")
    manyn = re.compile("[N,n]{20,}")
    # get names of loci and taxa
    uces = get_uce_names_from_probes(args.probes)
    taxa = get_taxa_names_from_fastas(args.fasta)
    print "\n"
    if not args.extend:
        if args.db is None:
            db = os.path.join(args.output, 'probe.matches.sqlite')
        else:
            db = args.db
        # create db to hold results
        conn, c = create_probe_database(
                db,
                taxa,
                uces,
                True
            )
    else:
        conn, c = extend_probe_database(
                args.db,
                taxa
            )
    # get duplicate probe sequences for filtering
    if args.dupefile:
        print "Determining duplicate probes..."
        dupes = get_dupes(args.dupefile, longfile=False)
    else:
        dupes = None
    # iterate over LASTZ files for each taxon
    for lz in glob.glob(os.path.join(args.lastz, '*')):
        # get fasta name from lastz file
        ff = get_fasta_name_from_lastz_pth(lz, args.fasta, args.pattern)
        # get taxon name from lastz file
        taxon = get_taxon_from_filename(ff)
        print "\n{0}\n{1}\n{0}".format('=' * 30, taxon)
        # get lastz matches
        print "\tGetting LASTZ matches from GENOME alignments..."
        matches, probes = get_matches(lz)
        # remove bad loci (dupes)
        print "\tGetting bad (potentially duplicate) GENOME matches..."
        loci_to_skip = []
        for k, v in matches.iteritems():
            # check matches to makes sure all is well - keep names lc
            loci_to_skip.extend(quality_control_matches(matches, probes, dupes, k, v, False))
        #pdb.set_trace()
        # convert to set, to keep only uniques
        loci_to_skip = set(loci_to_skip)
        print "\tSkipping {} bad (duplicate hit) loci...".format(len(loci_to_skip))
        # get (and possibly assemble) non-skipped
        seqdict = defaultdict(list)
        # determine those contigs to skip and group those to assemble
        for contig in fasta.FastaReader(ff):
            # make sure all names are lowercase
            contig.identifier = contig.identifier.lower()
            name = contig.identifier.split('|')[-4].strip()
            locus = name.split('_')[0]
            # skip what we identified as bad loci
            if locus not in loci_to_skip:
                seqdict[locus].append(contig)
        output_name = "{}.fasta".format(taxon.replace('_', '-'))
        fout_name = os.path.join(args.output, output_name)
        print "\tOutput filename is {}".format(output_name)
        fout = fasta.FastaWriter(fout_name)
        # this tracks "fake" contig number
        count = 0
        # this tracks loci kept
        kept = 0
        # when > 1 contig, assemble contigs across matches
        sys.stdout.write("\tWriting and Aligning/Assembling UCE loci with multiple probes (dot/1000 loci)")
        for k, v in seqdict.iteritems():
            bad = False
            contig_names = []
            if count % 1000 == 0:
                sys.stdout.write('.')
                sys.stdout.flush()
            if len(v) == 1:
                # trim ambiguous bases on flanks
                record = v[0]
                orient = [matches[k][0][1]]
                if args.flank:
                    record = trim_uce_reads(record, args.flank)
                contig_names.append(record.identifier)
                record.sequence = record.sequence.strip('N')
                # trim many ambiguous bases within contig
                result = manyn.search(record.sequence)
                if result:
                    uce_start, uce_end = get_probe_positions(record)
                    uce = record.sequence[uce_start:uce_end]
                    record.sequence = snip_if_many_N_bases(manyn, k, record.sequence, uce, verbose=False)
                # change header
                record.identifier = ">Node_{0}_length_{1}_cov_1000".format(
                        count,
                        len(record.sequence)
                    )
                fout.write(v[0])
            else:
                orient = list(set([m[1] for m in matches[k]]))
                # skip any loci having matches of mixed orientation
                # ['+', '-']
                if len(orient) == 1:
                    # create tempfile for the reads
                    fd, temp = tempfile.mkstemp(suffix='.fasta')
                    os.close(fd)
                    temp_out = fasta.FastaWriter(temp)
                    # write all slices to outfile, trimming if we want
                    #pdb.set_trace()
                    for record in v:
                        if args.flank:
                            record = trim_uce_reads(record, args.flank)
                        # keep names of contigs we assembled to store in db assoc
                        # w/ resulting assembled contig name
                        contig_names.append(record.identifier)
                        record.sequence = record.sequence.strip('N')
                        # trim many ambiguous bases within contig
                        result = manyn.search(record.sequence)
                        if result:
                            uce_start, uce_end = get_probe_positions(record)
                            uce = record.sequence[uce_start:uce_end]
                            record.sequence = snip_if_many_N_bases(manyn, k, record.sequence, uce, verbose=False)
                        temp_out.write(record)
                    # make sure to close the file
                    temp_out.close()
                    # assemble
                    aln = Align(temp)
                    aln.run_alignment()
                    record = fasta.FastaSequence()
                    record.sequence = aln.alignment_consensus.tostring()
                    record.identifier = ">Node_{0}_length_{1}_cov_1000".format(
                            count,
                            len(record.sequence)
                        )
                    fout.write(record)
                else:
                    bad = True
            if not bad:
                # track contig assembly and renaming data in db
                q = "UPDATE matches SET {0} = 1 WHERE uce = '{1}'".format(taxon, k)
                c.execute(q)
                # generate db match and match map tables for data
                orient_key = "node_{0}({1})".format(count, orient[0])
                q = "UPDATE match_map SET {0} = '{1}' WHERE uce = '{2}'".format(taxon, orient_key, k)
                c.execute(q)
                # keep track of new name :: old name mapping
                for old_name in contig_names:
                    q = "INSERT INTO contig_map VALUES ('{0}', '{1}', '{2}', '{3}')".format(taxon, k, old_name, record.identifier)
                    c.execute(q)
                kept += 1
            # tracking "fake" contig number
            count += 1
        conn.commit()
        print "\n\t{0} loci of {1} matched ({2:.0f}%), {3} dupes dropped ({4:.0f}%), {5} ({6:.0f}%) kept".format(
            count,
            len(uces),
            float(count) / len(uces) * 100,
            len(loci_to_skip),
            float(len(loci_to_skip)) / len(uces) * 100,
            kept,
            float(kept) / len(uces) * 100
            )
    #conn.commit()
    c.close()
    conn.close()