Пример #1
0
def makeclust(derepfile, userfile, notmatchedfile, clustfile, mindepth=3):
    """
    Build cluster records from a hit table and write them to `clustfile`.

    Every row of `userfile` must contain five tab-separated fields: query,
    target, identity, query coverage and target coverage. Queries are grouped
    under their target; a group is written out when the size of the target
    plus the sizes of all its members (as returned by getsize()) reaches
    `mindepth`. Records in `notmatchedfile` that never served as a target are
    appended as single-sequence clusters, subject to the same threshold.

    Each cluster is written as FASTA-style name/sequence lines followed by a
    line holding SEP.

    Raises KeyError when a name in `userfile` is missing from `derepfile`, and
    ValueError when a row does not have exactly five fields.
    """
    D = dict(parse_fasta(derepfile))
    U = defaultdict(list)  # target => [(query, size, score), ...]
    with open(userfile) as fp:
        for row in fp:
            query, target, ident, qcov, tcov = row.rstrip().split("\t")
            score = float(ident) * float(qcov) * float(tcov)
            U[target].append((query, getsize(query), score))

    with open(clustfile, "w") as fw:
        for key, members in U.items():
            # Biggest members first; ties are broken by the higher score
            members.sort(key=lambda x: (-x[1], -x[2]))
            totalsize = getsize(key) + sum(x[1] for x in members)
            if totalsize < mindepth:
                continue

            # Recruit cluster members, the target sequence always comes first
            seqs = [('>' + key, D[key])]
            seqs.extend(('>' + name, D[name]) for name, _, _ in members)

            seq = "\n".join("\n".join(x) for x in seqs)
            # write() instead of the print statement: valid on Python 2 and 3
            fw.write("\n".join((seq, SEP)) + "\n")

        I = dict(parse_fasta(notmatchedfile))
        singletons = set(I) - set(U)
        for key in singletons:
            if getsize(key) < mindepth:
                continue
            fw.write("\n".join(('>' + key, I[key], SEP)) + "\n")
Пример #2
0
def bed(args):
    """
    %prog bed fastafile kmer.dump.txt

    Map kmers on FASTA.
    """
    # The docstring above doubles as the command-line usage text.
    # Output: one tab-separated line per hit on stdout (name, start, end,
    # kmer), with start = 0-based window offset and end = start + K.
    from jcvi.formats.fasta import rc, parse_fasta

    p = OptionParser(bed.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, dumpfile = args
    # Store each kmer with its reverse complement so that a window matches
    # regardless of the strand it was dumped from.
    KMERS = set()
    K = 0
    with open(dumpfile) as fp:
        for row in fp:
            atoms = row.split()
            if not atoms:  # blank line, nothing to import
                continue
            kmer = atoms[0]
            KMERS.add(kmer)
            KMERS.add(rc(kmer))
            K = len(kmer)  # as before, the last kmer read defines K

    if not KMERS:
        # Without any kmer K is undefined; bail out instead of crashing
        logging.error("No kmers found in `{}`".format(dumpfile))
        return

    logging.debug("Imported {} {}-mers".format(len(KMERS), K))

    for name, seq in parse_fasta(fastafile):
        name = name.split()[0]
        # There are len(seq) - K + 1 windows of width K; without the +1 the
        # window ending on the last base would never be tested.
        for i in range(len(seq) - K + 1):
            if i % 5000000 == 0:
                print("{}:{}".format(name, i), file=sys.stderr)
            kmer = seq[i:i + K]
            if kmer in KMERS:
                print("\t".join(str(x) for x in (name, i, i + K, kmer)))
Пример #3
0
def bed(args):
    """
    %prog bed fastafile kmer.dump.txt

    Map kmers on FASTA.
    """
    # The docstring above doubles as the command-line usage text.
    # Hits go to stdout as tab-separated (name, start, end, kmer) lines where
    # start is the 0-based window offset and end is start + K; progress
    # markers go to stderr.
    from jcvi.formats.fasta import rc, parse_fasta

    p = OptionParser(bed.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, dumpfile = args
    KMERS = set()  # kmers plus their reverse complements
    K = 0
    with open(dumpfile) as fp:
        for row in fp:
            fields = row.split()
            if not fields:  # skip blank lines rather than raise IndexError
                continue
            kmer = fields[0]
            KMERS.add(kmer)
            KMERS.add(rc(kmer))
            K = len(kmer)  # width is taken from the last kmer read

    if not KMERS:
        # An empty dump used to fail with an unbound `kmer`; report it instead
        logging.error("No kmers found in `{}`".format(dumpfile))
        return

    logging.debug("Imported {} {}-mers".format(len(KMERS), K))

    for name, seq in parse_fasta(fastafile):
        name = name.split()[0]
        # +1: a sequence of length n holds n - K + 1 windows of width K
        for i in range(len(seq) - K + 1):
            if i % 5000000 == 0:
                # write() replaces the Python 2-only `print >>` statement
                sys.stderr.write("{}:{}\n".format(name, i))
            kmer = seq[i: i + K]
            if kmer in KMERS:
                print("\t".join(str(x) for x in (name, i, i + K, kmer)))
Пример #4
0
def dust(args):
    """
    %prog dust assembly.fasta

    Remove low-complexity contigs within assembly.
    """
    # The docstring above is handed to the option parser as usage text.
    parser = OptionParser(dust.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    fastafile = args[0]
    stem = fastafile.rsplit(".", 1)[0]
    dustfastafile = stem + ".dust.fasta"
    if need_update(fastafile, dustfastafile):
        # Regenerate the masked FASTA when need_update() asks for it
        sh(
            "dustmasker -in {0} -out {1} -outfmt fasta".format(
                fastafile, dustfastafile
            )
        )

    # Print the names of records that are at least 98% lowercase bases or N
    for name, seq in parse_fasta(dustfastafile):
        masked = sum(base in "acgtnN" for base in seq)
        if masked * 100.0 / len(seq) >= 98:
            print(name)
Пример #5
0
def dust(args):
    """
    %prog dust assembly.fasta

    Remove low-complexity contigs within assembly.
    """
    # The docstring above is handed to the option parser as usage text.
    p = OptionParser(dust.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile,) = args
    dustfastafile = fastafile.rsplit(".", 1)[0] + ".dust.fasta"
    if need_update(fastafile, dustfastafile):
        cmd = "dustmasker -in {0}".format(fastafile)
        # Only one argument is supplied to format(), so the field must be
        # {0}; the former {1} raised IndexError before dustmasker could run.
        cmd += " -out {0} -outfmt fasta".format(dustfastafile)
        sh(cmd)

    # Report records whose sequence is at least 98% lowercase bases or N
    for name, seq in parse_fasta(dustfastafile):
        # Lowercase `n` is counted too, so that an N written in lowercase is
        # not mistaken for an unmasked base.
        nlow = sum(1 for x in seq if x in "acgtnN")
        pctlow = nlow * 100.0 / len(seq)
        if pctlow < 98:
            continue
        # Parenthesised single-argument print behaves alike on Python 2 and 3
        print(name)
Пример #6
0
def fasta2bed(fastafile):
    """
    Alternative BED generation from FASTA file. Used for sanity check.

    Reads `<fastafile stem>.dust.fasta` and prints one tab-separated line per
    run of lowercase characters: record name, 0-based index of the first
    lowercase character, and 0-based index of the last one (inclusive).
    """
    dustfasta = fastafile.rsplit(".", 1)[0] + ".dust.fasta"
    for name, seq in parse_fasta(dustfasta):
        for islower, ss in groupby(enumerate(seq), key=lambda x: x[-1].islower()):
            if not islower:
                continue
            ss = list(ss)
            # enumerate() yields ascending indices, so the run's first and
            # last items already carry the minimum and maximum position.
            ms, xs = ss[0][0], ss[-1][0]
            # Parenthesised single-argument print works on Python 2 and 3
            print("\t".join(str(x) for x in (name, ms, xs)))
Пример #7
0
def mcluster(args):
    """
    %prog mcluster *.consensus

    Cluster across samples using consensus sequences.
    """
    # The docstring above is handed to the option parser as usage text.
    # Pipeline: pool consensus files -> cluster_smallmem -> makeclust ->
    # parallel_musclewrap. Each stage is skipped when need_update() reports
    # its output as current.
    p = OptionParser(mcluster.__doc__)
    add_consensus_options(p)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    consensusfiles = args
    minlength = opts.minlength
    cpus = opts.cpus
    pf = opts.prefix
    pctid = find_pctid(consensusfiles)

    pf += ".P{0}".format(pctid)
    consensusfile = pf + ".consensus.fasta"
    if need_update(consensusfiles, consensusfile):
        fw_cons = must_open(consensusfile, "w")
        totalseqs = 0
        try:
            for cf in consensusfiles:
                nseqs = 0
                # Prefix record names with the part of the file name before
                # its first dot, keeping records from different files apart
                s = op.basename(cf).split(".")[0]
                for name, seq in parse_fasta(cf):
                    name = '.'.join((s, name))
                    # write() replaces the Python 2-only `print >>` statement
                    fw_cons.write(">{0}\n{1}\n".format(name, seq))
                    nseqs += 1
                logging.debug("Read `{0}`: {1} seqs".format(cf, nseqs))
                totalseqs += nseqs
        finally:
            # Close the handle even when one of the inputs fails to parse
            fw_cons.close()
        logging.debug("Total: {0} seqs".format(totalseqs))

    userfile = pf + ".u"
    notmatchedfile = pf + ".notmatched"
    if need_update(consensusfile, userfile):
        cluster_smallmem(consensusfile, userfile, notmatchedfile,
                         minlength, pctid, cpus)

    clustfile = pf + ".clust"
    if need_update((consensusfile, userfile, notmatchedfile), clustfile):
        makeclust(consensusfile, userfile, notmatchedfile, clustfile)

    clustSfile = pf + ".clustS"
    if need_update(clustfile, clustSfile):
        parallel_musclewrap(clustfile, cpus, minsamp=opts.minsamp)
Пример #8
0
def main(arg):
    """
    Print the transition/transversion ratio of two sequences.

    `arg` is handed to parse_fasta() and must yield exactly two records. The
    sequences are compared position by position (zip() stops at the shorter
    one). A mismatch counts as a transition when it is A<->G or C<->T; every
    other mismatch counts as a transversion.

    Raises ValueError unless exactly two records are found, and
    ZeroDivisionError when no transversion is observed.
    """
    transition_pairs = (('A', 'G'), ('G', 'A'), ('C', 'T'), ('T', 'C'))
    A, B = [seq for _, seq in parse_fasta(arg)]
    transitions = transversions = 0
    for a, b in zip(A, B):
        if a == b:
            continue
        if (a, b) in transition_pairs:
            transitions += 1
        else:
            transversions += 1

    # Parenthesised single-argument print works on Python 2 and 3; the float
    # factor keeps the division from truncating on Python 2.
    print(transitions * 1.0 / transversions)
Пример #9
0
def circular(args):
    """
    %prog circular fastafile startpos

    Make circular genome, startpos is the place to start the sequence. This can
    be determined by mapping to a reference. Self overlaps are then resolved.
    Startpos is 1-based.
    """
    # The docstring above is handed to the option parser as usage text.
    from jcvi.assembly.goldenpath import overlap

    parser = OptionParser(circular.__doc__)
    parser.add_option("--flip", default=False, action="store_true",
                      help="Reverse complement the sequence")
    parser.set_outfile()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    fastafile, startpos = args
    startpos = int(startpos)
    # Only the first record of the FASTA file is used
    key, seq = next(parse_fasta(fastafile))

    # Rotate: everything from startpos onwards goes first, the prefix second
    aseqfile, bseqfile = "a.seq", "b.seq"
    aseq, bseq = seq[startpos:], seq[:startpos]
    for fname, piece in ((aseqfile, aseq), (bseqfile, bseq)):
        handle = must_open(fname, "w")
        print(">{0}\n{1}".format(fname, piece), file=handle)
        handle.close()

    # Join the two pieces at the coordinates reported by overlap()
    o = overlap([aseqfile, bseqfile])
    merged = Seq(aseq[:o.qstop] + bseq[o.sstop:])
    if opts.flip:
        merged = merged.reverse_complement()

    # The scratch files are no longer needed
    os.remove(aseqfile)
    os.remove(bseqfile)

    out = must_open(opts.outfile, "w")
    SeqIO.write([SeqRecord(merged, id=key, description="")], out, "fasta")
    out.close()
Пример #10
0
def circular(args):
    """
    %prog circular fastafile startpos

    Make circular genome, startpos is the place to start the sequence. This can
    be determined by mapping to a reference. Self overlaps are then resolved.
    Startpos is 1-based.
    """
    # The docstring above is handed to the option parser as usage text.
    from jcvi.assembly.goldenpath import overlap

    p = OptionParser(circular.__doc__)
    p.add_option("--flip", default=False, action="store_true",
                 help="Reverse complement the sequence")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, startpos = args
    startpos = int(startpos)
    # next() builtin instead of the Python 2-only .next() method; only the
    # first record of the file is used.
    key, seq = next(parse_fasta(fastafile))
    # NOTE(review): usage says startpos is 1-based, but it is used directly as
    # a 0-based slice index here - confirm which is intended.
    aseq = seq[startpos:]
    bseq = seq[:startpos]
    aseqfile, bseqfile = "a.seq", "b.seq"

    # Write both pieces to scratch files in the working directory
    for f, s in zip((aseqfile, bseqfile), (aseq, bseq)):
        fw = must_open(f, "w")
        # write() replaces the Python 2-only `print >>` statement
        fw.write(">{0}\n{1}\n".format(f, s))
        fw.close()

    # Join the pieces at the coordinates reported by overlap()
    o = overlap([aseqfile, bseqfile])
    seq = aseq[:o.qstop] + bseq[o.sstop:]
    seq = Seq(seq)

    if opts.flip:
        seq = seq.reverse_complement()

    for f in (aseqfile, bseqfile):
        os.remove(f)

    fw = must_open(opts.outfile, "w")
    rec = SeqRecord(seq, id=key, description="")
    SeqIO.write([rec], fw, "fasta")
    fw.close()
Пример #11
0
def fasta(args):
    """
    %prog fasta fastafile

    Convert reads formatted as FASTA file, and convert to CA frg file. If .qual
    file is found, then use it, otherwise just make a fake qual file. Mates are
    assumed as adjacent sequence records (i.e. /1, /2, /1, /2 ...) unless a
    matefile is given.
    """
    # The docstring above is handed to the option parser as usage text.
    from jcvi.formats.fasta import clean, make_qual

    p = OptionParser(fasta.__doc__)
    p.add_option("--clean", default=False, action="store_true", help="Clean up irregular chars in seq")
    p.add_option("--matefile", help="Matepairs file")
    p.add_option("--maxreadlen", default=262143, type="int", help="Maximum read length allowed")
    p.add_option("--minreadlen", default=1000, type="int", help="Minimum read length allowed")
    p.add_option("--sequential", default=False, action="store_true", help="Overwrite read name (e.g. long Pacbio name)")
    p.set_size()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile,) = args
    maxreadlen = opts.maxreadlen
    minreadlen = opts.minreadlen
    if maxreadlen > 0:
        # Look for the first record above the limit; if there is one, split
        # the input and convert each piece separately.
        split = False
        sizes = Fasta(fastafile, lazy=True)
        for seqid, size in sizes.itersizes_ordered():
            if size > maxreadlen:
                logging.debug("Sequence {0} (size={1}) longer than max read len {2}".format(seqid, size, maxreadlen))
                split = True
                break

        if split:
            # NOTE(review): the recursive call forwards only --maxreadlen=0,
            # so the other options of this invocation are not passed on.
            for piece in split_fastafile(fastafile, maxreadlen=maxreadlen):
                fasta([piece, "--maxreadlen=0"])
            return

    plate = op.basename(fastafile).split(".")[0]

    mated = opts.size != 0
    mean, sv = get_mean_sv(opts.size)

    if mated:
        # Floor division keeps the whole-number Kb label on Python 3 as well
        # (presumably opts.size is an int - confirm against set_size()).
        libname = "Sanger{0}Kb-".format(opts.size // 1000) + plate
    else:
        libname = plate

    frgfile = libname + ".frg"

    if opts.clean:
        cleanfasta = fastafile.rsplit(".", 1)[0] + ".clean.fasta"
        if need_update(fastafile, cleanfasta):
            clean([fastafile, "--canonical", "-o", cleanfasta])
        fastafile = cleanfasta

    if mated:
        # Mated libraries are delegated to the external converter script
        qualfile = make_qual(fastafile, score=21)
        if opts.matefile:
            matefile = opts.matefile
            assert op.exists(matefile)
        else:
            matefile = make_matepairs(fastafile)

        cmd = "convert-fasta-to-v2.pl"
        cmd += " -l {0} -s {1} -q {2} ".format(libname, fastafile, qualfile)
        cmd += "-mean {0} -stddev {1} -m {2} ".format(mean, sv, matefile)

        sh(cmd, outfile=frgfile)
        return

    # Unmated reads: emit the frg file directly
    fw = must_open(frgfile, "w")
    try:
        # write() replaces the Python 2-only `print >>` statement
        fw.write(headerTemplate.format(libID=libname) + "\n")

        sequential = opts.sequential
        i = j = 0  # written / discarded counters
        for fragID, seq in parse_fasta(fastafile):
            if len(seq) < minreadlen:
                j += 1
                continue
            i += 1
            if sequential:
                fragID = libname + str(100000000 + i)
            emitFragment(fw, fragID, libname, seq)
    finally:
        # Do not leak the handle if a record fails to convert
        fw.close()

    logging.debug("A total of {0} fragments written to `{1}` ({2} discarded).".format(i, frgfile, j))
Пример #12
0
def main(arg):
    """Print every item yielded by parse_fasta(arg), one per line."""
    for record in parse_fasta(arg):
        # Parenthesised single-argument print works on Python 2 and 3
        print(record)
Пример #13
0
def main(arg):
    """
    Print calc_edit() of the two sequences found in `arg`.

    `arg` is handed to parse_fasta(); a ValueError is raised unless it yields
    exactly two records.
    """
    A, B = [seq for _, seq in parse_fasta(arg)]
    # Parenthesised single-argument print works on Python 2 and 3
    print(calc_edit(A, B))
Пример #14
0
Файл: ca.py Проект: zjwang6/jcvi
def fasta(args):
    """
    %prog fasta fastafile

    Convert reads formatted as FASTA file, and convert to CA frg file. If .qual
    file is found, then use it, otherwise just make a fake qual file. Mates are
    assumed as adjacent sequence records (i.e. /1, /2, /1, /2 ...) unless a
    matefile is given.
    """
    # The docstring above is handed to the option parser as usage text.
    from jcvi.formats.fasta import clean, make_qual

    p = OptionParser(fasta.__doc__)
    p.add_option(
        "--clean",
        default=False,
        action="store_true",
        help="Clean up irregular chars in seq",
    )
    p.add_option("--matefile", help="Matepairs file")
    p.add_option(
        "--maxreadlen",
        default=262143,
        type="int",
        help="Maximum read length allowed",
    )
    p.add_option(
        "--minreadlen",
        default=1000,
        type="int",
        help="Minimum read length allowed",
    )
    p.add_option(
        "--sequential",
        default=False,
        action="store_true",
        help="Overwrite read name (e.g. long Pacbio name)",
    )
    p.set_size()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile,) = args
    maxreadlen = opts.maxreadlen
    minreadlen = opts.minreadlen
    if maxreadlen > 0:
        # Find the first over-long record, if any; when one exists the input
        # is split and every piece is converted by a recursive call.
        split = False
        sizes = Fasta(fastafile, lazy=True)
        for seqid, size in sizes.itersizes_ordered():
            if size > maxreadlen:
                logging.debug(
                    "Sequence {0} (size={1}) longer than max read len {2}".
                    format(seqid, size, maxreadlen))
                split = True
                break

        if split:
            # NOTE(review): only --maxreadlen=0 is forwarded, so the other
            # options of this invocation are not applied to the pieces.
            for piece in split_fastafile(fastafile, maxreadlen=maxreadlen):
                fasta([piece, "--maxreadlen=0"])
            return

    plate = op.basename(fastafile).split(".")[0]

    mated = opts.size != 0
    mean, sv = get_mean_sv(opts.size)

    if mated:
        # Floor division: true division on Python 3 would turn a size of
        # 2000 into "Sanger2.0Kb-" and change the library and frg file names
        # (presumably opts.size is an int - confirm against set_size()).
        libname = "Sanger{0}Kb-".format(opts.size // 1000) + plate
    else:
        libname = plate

    frgfile = libname + ".frg"

    if opts.clean:
        cleanfasta = fastafile.rsplit(".", 1)[0] + ".clean.fasta"
        if need_update(fastafile, cleanfasta):
            clean([fastafile, "--canonical", "-o", cleanfasta])
        fastafile = cleanfasta

    if mated:
        # Mated libraries are delegated to the external converter script
        qualfile = make_qual(fastafile, score=21)
        if opts.matefile:
            matefile = opts.matefile
            assert op.exists(matefile)
        else:
            matefile = make_matepairs(fastafile)

        cmd = "convert-fasta-to-v2.pl"
        cmd += " -l {0} -s {1} -q {2} ".format(libname, fastafile, qualfile)
        cmd += "-mean {0} -stddev {1} -m {2} ".format(mean, sv, matefile)

        sh(cmd, outfile=frgfile)
        return

    # Unmated reads: emit the frg file directly
    fw = must_open(frgfile, "w")
    try:
        print(headerTemplate.format(libID=libname), file=fw)

        sequential = opts.sequential
        i = j = 0  # written / discarded counters
        for fragID, seq in parse_fasta(fastafile):
            if len(seq) < minreadlen:
                j += 1
                continue
            i += 1
            if sequential:
                fragID = libname + str(100000000 + i)
            emitFragment(fw, fragID, libname, seq)
    finally:
        # Do not leak the handle if a record fails to convert
        fw.close()

    logging.debug(
        "A total of {0} fragments written to `{1}` ({2} discarded).".format(
            i, frgfile, j))