def makeclust(derepfile, userfile, notmatchedfile, clustfile, mindepth=3):
    """
    Write a cluster file from dereplicated reads and a tabular hits file.

    Args:
        derepfile: FASTA file holding every sequence referenced by the hits.
        userfile: tab-separated hits, five columns per row:
            query, target, identity, query coverage, target coverage.
        notmatchedfile: FASTA file of sequences that matched nothing.
        clustfile: output path; each cluster is written as FASTA records
            followed by a line containing SEP.
        mindepth: clusters (or singletons) whose total size, as reported by
            getsize(), is below this value are skipped.
    """
    D = dict(parse_fasta(derepfile))
    U = defaultdict(list)  # target name => [(query, size, score), ...]
    with open(userfile) as fp:
        for row in fp:
            query, target, pctid, qcov, tcov = row.rstrip().split("\t")
            score = float(pctid) * float(qcov) * float(tcov)
            U[target].append((query, getsize(query), score))

    with open(clustfile, "w") as fw:
        for key, members in U.items():
            keysize = getsize(key)
            # Largest members first, ties broken by the higher score
            members.sort(key=lambda x: (-x[1], -x[2]))
            totalsize = keysize + sum(x[1] for x in members)
            if totalsize < mindepth:
                continue

            # Recruit cluster members, the target itself goes first
            seqs = [(">" + key, D[key])]
            seqs.extend((">" + name, D[name]) for name, _, _ in members)
            seq = "\n".join("\n".join(x) for x in seqs)
            print("\n".join((seq, SEP)), file=fw)

        # Unmatched sequences that never served as a target become
        # single-member clusters.
        unmatched = dict(parse_fasta(notmatchedfile))
        singletons = set(unmatched.keys()) - set(U.keys())
        for key in singletons:
            if getsize(key) < mindepth:
                continue
            print("\n".join((">" + key, unmatched[key], SEP)), file=fw)
def bed(args):
    """
    %prog bed fastafile kmer.dump.txt

    Map kmers on FASTA.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so further documentation lives in comments.
    #
    # Reads the first whitespace-delimited column of the dump file as kmers,
    # adds each kmer and its reverse complement to a lookup set, then slides a
    # K-wide window over every FASTA record and prints one tab-separated line
    # (name, start, end, kmer) per window found in the set.
    from jcvi.formats.fasta import rc, parse_fasta

    p = OptionParser(bed.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, dumpfile = args
    KMERS = set()
    K = 0
    with open(dumpfile) as fp:
        for row in fp:
            fields = row.split()
            if not fields:  # tolerate blank lines
                continue
            kmer = fields[0]
            KMERS.add(kmer)
            KMERS.add(rc(kmer))
            K = len(kmer)  # as before, the last kmer read decides K

    if not KMERS:
        # Previously this case crashed with a NameError on `kmer`
        logging.debug("No kmers found in `{}`".format(dumpfile))
        return

    logging.debug("Imported {} {}-mers".format(len(KMERS), K))

    for name, seq in parse_fasta(fastafile):
        name = name.split()[0]
        # + 1 so the final window seq[len(seq) - K:] is also tested
        for i in range(len(seq) - K + 1):
            if i % 5000000 == 0:
                print("{}:{}".format(name, i), file=sys.stderr)
            kmer = seq[i:i + K]
            if kmer in KMERS:
                print("\t".join(str(x) for x in (name, i, i + K, kmer)))
def bed(args):
    """
    %prog bed fastafile kmer.dump.txt

    Map kmers on FASTA.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so further documentation lives in comments.
    #
    # Reads the first whitespace-delimited column of the dump file as kmers,
    # adds each kmer and its reverse complement to a lookup set, then slides a
    # K-wide window over every FASTA record and prints one tab-separated line
    # (name, start, end, kmer) per window found in the set.
    from jcvi.formats.fasta import rc, parse_fasta

    p = OptionParser(bed.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, dumpfile = args
    KMERS = set()
    K = 0
    with open(dumpfile) as fp:
        for row in fp:
            fields = row.split()
            if not fields:  # tolerate blank lines
                continue
            kmer = fields[0]
            KMERS.add(kmer)
            KMERS.add(rc(kmer))
            K = len(kmer)  # as before, the last kmer read decides K

    if not KMERS:
        # Previously this case crashed with a NameError on `kmer`
        logging.debug("No kmers found in `{}`".format(dumpfile))
        return

    logging.debug("Imported {} {}-mers".format(len(KMERS), K))

    for name, seq in parse_fasta(fastafile):
        name = name.split()[0]
        # + 1 so the final window seq[len(seq) - K:] is also tested
        for i in range(len(seq) - K + 1):
            if i % 5000000 == 0:
                # print() function, matching the other blocks in this file
                print("{}:{}".format(name, i), file=sys.stderr)
            kmer = seq[i:i + K]
            if kmer in KMERS:
                print("\t".join(str(x) for x in (name, i, i + K, kmer)))
def dust(args):
    """
    %prog dust assembly.fasta

    Remove low-complexity contigs within assembly.
    """
    # The docstring doubles as the usage text given to OptionParser.
    parser = OptionParser(dust.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    fastafile = args[0]
    stem = fastafile.rsplit(".", 1)[0]
    dustfastafile = stem + ".dust.fasta"

    # Only re-run dustmasker when need_update() says the output is stale
    if need_update(fastafile, dustfastafile):
        input_part = "dustmasker -in {0}".format(fastafile)
        output_part = " -out {0} -outfmt fasta".format(dustfastafile)
        sh(input_part + output_part)

    masked_chars = "acgtnN"
    for name, seq in parse_fasta(dustfastafile):
        nlow = 0
        for base in seq:
            if base in masked_chars:
                nlow += 1
        pctlow = nlow * 100.0 / len(seq)
        # Report contigs that are at least 98% lowercase/N characters
        if pctlow >= 98:
            print(name)
def dust(args):
    """
    %prog dust assembly.fasta

    Remove low-complexity contigs within assembly.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # Runs dustmasker on the assembly (when need_update() reports the output
    # as stale) and prints the name of every record whose sequence is at least
    # 98% lowercase a/c/g/t/n or N characters.
    p = OptionParser(dust.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile,) = args
    dustfastafile = fastafile.rsplit(".", 1)[0] + ".dust.fasta"
    if need_update(fastafile, dustfastafile):
        cmd = "dustmasker -in {0}".format(fastafile)
        # Was "{1}", which raises IndexError: format() gets one argument
        cmd += " -out {0} -outfmt fasta".format(dustfastafile)
        sh(cmd)

    for name, seq in parse_fasta(dustfastafile):
        # Lowercase "n" is counted too, matching the sibling dust()
        # implementation in this file.
        nlow = sum(1 for x in seq if x in "acgtnN")
        pctlow = nlow * 100.0 / len(seq)
        if pctlow < 98:
            continue
        print(name)
def fasta2bed(fastafile):
    """
    Alternative BED generation from FASTA file. Used for sanity check.

    Reads `<fastafile stem>.dust.fasta` and, for every run of consecutive
    lowercase characters in each record, prints a tab-separated line with the
    record name and the 0-based indices of the first and last character of
    the run.
    """
    dustfasta = fastafile.rsplit(".", 1)[0] + ".dust.fasta"
    for name, seq in parse_fasta(dustfasta):
        for islower, ss in groupby(enumerate(seq), key=lambda x: x[-1].islower()):
            if not islower:
                continue
            ss = list(ss)
            # enumerate() yields increasing indices, so the first and last
            # items are what min()/max() on the (index, char) pairs returned.
            ms = ss[0][0]
            xs = ss[-1][0]
            print("\t".join(str(x) for x in (name, ms, xs)))
def mcluster(args):
    """
    %prog mcluster *.consensus

    Cluster across samples using consensus sequences.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # Pipeline, each step skipped when need_update() reports it is current:
    #   1. pool all consensus files into <prefix>.P<pctid>.consensus.fasta,
    #      prefixing each record name with the sample name
    #   2. cluster_smallmem() -> .u and .notmatched
    #   3. makeclust()        -> .clust
    #   4. parallel_musclewrap() on the .clust file
    p = OptionParser(mcluster.__doc__)
    add_consensus_options(p)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    consensusfiles = args
    minlength = opts.minlength
    cpus = opts.cpus
    pf = opts.prefix
    pctid = find_pctid(consensusfiles)
    pf += ".P{0}".format(pctid)

    consensusfile = pf + ".consensus.fasta"
    if need_update(consensusfiles, consensusfile):
        fw_cons = must_open(consensusfile, "w")
        totalseqs = 0
        for cf in consensusfiles:
            nseqs = 0
            # Sample name = file basename up to the first "."
            s = op.basename(cf).split(".")[0]
            for name, seq in parse_fasta(cf):
                name = ".".join((s, name))
                print(">{0}\n{1}".format(name, seq), file=fw_cons)
                nseqs += 1
            logging.debug("Read `{0}`: {1} seqs".format(cf, nseqs))
            totalseqs += nseqs
        logging.debug("Total: {0} seqs".format(totalseqs))
        fw_cons.close()

    userfile = pf + ".u"
    notmatchedfile = pf + ".notmatched"
    if need_update(consensusfile, userfile):
        cluster_smallmem(consensusfile, userfile, notmatchedfile,
                         minlength, pctid, cpus)

    clustfile = pf + ".clust"
    if need_update((consensusfile, userfile, notmatchedfile), clustfile):
        makeclust(consensusfile, userfile, notmatchedfile, clustfile)

    clustSfile = pf + ".clustS"
    if need_update(clustfile, clustSfile):
        parallel_musclewrap(clustfile, cpus, minsamp=opts.minsamp)
def main(arg):
    """
    Print the transition/transversion ratio between two sequences.

    `arg` is passed to parse_fasta(), which must yield exactly two records.
    The sequences are compared position by position (up to the length of the
    shorter one). A mismatch is a transition when it is A<->G or C<->T, and a
    transversion otherwise.

    Raises ZeroDivisionError when no transversion is found.
    """
    f = parse_fasta(arg)
    seqs = [seq for k, seq in f]
    A, B = seqs

    transition_pairs = (("A", "G"), ("G", "A"), ("C", "T"), ("T", "C"))
    transitions = transversions = 0
    for a, b in zip(A, B):
        if a == b:
            continue
        if (a, b) in transition_pairs:
            transitions += 1
        else:
            transversions += 1

    # 1.0 * keeps this a true division regardless of Python version
    print(transitions * 1.0 / transversions)
def circular(args):
    """
    %prog circular fastafile startpos

    Make circular genome, startpos is the place to start the sequence. This can
    be determined by mapping to a reference. Self overlaps are then resolved.
    Startpos is 1-based.
    """
    # The docstring doubles as the usage text given to OptionParser.
    from jcvi.assembly.goldenpath import overlap

    parser = OptionParser(circular.__doc__)
    parser.add_option(
        "--flip",
        default=False,
        action="store_true",
        help="Reverse complement the sequence",
    )
    parser.set_outfile()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    fastafile = args[0]
    startpos = int(args[1])

    # Only the first record of the FASTA file is used
    records = parse_fasta(fastafile)
    key, seq = next(records)

    # NOTE(review): the slice uses startpos as-is (0-based index) although the
    # usage text says 1-based -- confirm which is intended.
    aseq, bseq = seq[startpos:], seq[:startpos]

    # Write both halves to scratch files so overlap() can align them
    aseqfile = "a.seq"
    bseqfile = "b.seq"
    scratch = ((aseqfile, aseq), (bseqfile, bseq))
    for path, piece in scratch:
        fw = must_open(path, "w")
        print(">{0}\n{1}".format(path, piece), file=fw)
        fw.close()

    o = overlap([aseqfile, bseqfile])
    # Join the halves, trimming at the overlap coordinates
    joined = aseq[:o.qstop] + bseq[o.sstop:]
    seq = Seq(joined)
    if opts.flip:
        seq = seq.reverse_complement()

    for path, _ in scratch:
        os.remove(path)

    fw = must_open(opts.outfile, "w")
    rec = SeqRecord(seq, id=key, description="")
    SeqIO.write([rec], fw, "fasta")
    fw.close()
def circular(args):
    """
    %prog circular fastafile startpos

    Make circular genome, startpos is the place to start the sequence. This can
    be determined by mapping to a reference. Self overlaps are then resolved.
    Startpos is 1-based.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # The first FASTA record is cut at startpos, both halves are written to
    # scratch files (a.seq, b.seq) so overlap() can align them, and the halves
    # are re-joined in swapped order, trimmed at the overlap coordinates.
    from jcvi.assembly.goldenpath import overlap

    p = OptionParser(circular.__doc__)
    p.add_option("--flip", default=False, action="store_true",
                 help="Reverse complement the sequence")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, startpos = args
    startpos = int(startpos)
    # next() builtin instead of the Python 2-only .next() method
    key, seq = next(parse_fasta(fastafile))
    aseq = seq[startpos:]
    bseq = seq[:startpos]
    aseqfile, bseqfile = "a.seq", "b.seq"

    for f, s in zip((aseqfile, bseqfile), (aseq, bseq)):
        fw = must_open(f, "w")
        print(">{0}\n{1}".format(f, s), file=fw)
        fw.close()

    o = overlap([aseqfile, bseqfile])
    seq = aseq[:o.qstop] + bseq[o.sstop:]
    seq = Seq(seq)

    if opts.flip:
        seq = seq.reverse_complement()

    for f in (aseqfile, bseqfile):
        os.remove(f)

    fw = must_open(opts.outfile, "w")
    rec = SeqRecord(seq, id=key, description="")
    SeqIO.write([rec], fw, "fasta")
    fw.close()
def fasta(args):
    """
    %prog fasta fastafile

    Convert reads formatted as FASTA file, and convert to CA frg file. If .qual
    file is found, then use it, otherwise just make a fake qual file. Mates are
    assumed as adjacent sequence records (i.e. /1, /2, /1, /2 ...) unless a
    matefile is given.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    from jcvi.formats.fasta import clean, make_qual

    p = OptionParser(fasta.__doc__)
    p.add_option("--clean", default=False, action="store_true",
                 help="Clean up irregular chars in seq")
    p.add_option("--matefile", help="Matepairs file")
    p.add_option("--maxreadlen", default=262143, type="int",
                 help="Maximum read length allowed")
    p.add_option("--minreadlen", default=1000, type="int",
                 help="Minimum read length allowed")
    p.add_option("--sequential", default=False, action="store_true",
                 help="Overwrite read name (e.g. long Pacbio name)")
    p.set_size()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile,) = args
    maxreadlen = opts.maxreadlen
    minreadlen = opts.minreadlen
    if maxreadlen > 0:
        # If any record exceeds maxreadlen, split the file and convert every
        # piece separately with the length check disabled.
        split = False
        sizes = Fasta(fastafile, lazy=True)
        for seqid, size in sizes.itersizes_ordered():
            if size > maxreadlen:
                logging.debug("Sequence {0} (size={1}) longer than max read len {2}".format(seqid, size, maxreadlen))
                split = True
                break

        if split:
            for piece in split_fastafile(fastafile, maxreadlen=maxreadlen):
                fasta([piece, "--maxreadlen=0"])
            return

    plate = op.basename(fastafile).split(".")[0]

    mated = opts.size != 0
    mean, sv = get_mean_sv(opts.size)

    if mated:
        # Floor division keeps the Kb figure an integer on Python 3 as well
        libname = "Sanger{0}Kb-".format(opts.size // 1000) + plate
    else:
        libname = plate

    frgfile = libname + ".frg"

    if opts.clean:
        cleanfasta = fastafile.rsplit(".", 1)[0] + ".clean.fasta"
        if need_update(fastafile, cleanfasta):
            clean([fastafile, "--canonical", "-o", cleanfasta])
        fastafile = cleanfasta

    if mated:
        # Mated libraries are delegated to the external converter script
        qualfile = make_qual(fastafile, score=21)
        if opts.matefile:
            matefile = opts.matefile
            assert op.exists(matefile)
        else:
            matefile = make_matepairs(fastafile)

        cmd = "convert-fasta-to-v2.pl"
        cmd += " -l {0} -s {1} -q {2} ".format(libname, fastafile, qualfile)
        cmd += "-mean {0} -stddev {1} -m {2} ".format(mean, sv, matefile)

        sh(cmd, outfile=frgfile)
        return

    # Unmated reads: write the frg file directly
    fw = must_open(frgfile, "w")
    print(headerTemplate.format(libID=libname), file=fw)

    sequential = opts.sequential
    i = j = 0  # i = fragments written, j = fragments discarded as too short
    for fragID, seq in parse_fasta(fastafile):
        if len(seq) < minreadlen:
            j += 1
            continue
        i += 1
        if sequential:
            fragID = libname + str(100000000 + i)
        emitFragment(fw, fragID, libname, seq)
    fw.close()

    logging.debug("A total of {0} fragments written to `{1}` ({2} discarded).".format(i, frgfile, j))
def main(arg):
    """Print every item yielded by parse_fasta(arg), one per line."""
    for seq in parse_fasta(arg):
        print(seq)
def main(arg):
    """
    Print calc_edit() of the two sequences found in `arg`.

    parse_fasta(arg) must yield exactly two records; anything else raises
    ValueError on unpacking.
    """
    f = parse_fasta(arg)
    seqs = [seq for k, seq in f]
    A, B = seqs
    print(calc_edit(A, B))
def fasta(args):
    """
    %prog fasta fastafile

    Convert reads formatted as FASTA file, and convert to CA frg file. If .qual
    file is found, then use it, otherwise just make a fake qual file. Mates are
    assumed as adjacent sequence records (i.e. /1, /2, /1, /2 ...) unless a
    matefile is given.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    from jcvi.formats.fasta import clean, make_qual

    p = OptionParser(fasta.__doc__)
    p.add_option(
        "--clean",
        default=False,
        action="store_true",
        help="Clean up irregular chars in seq",
    )
    p.add_option("--matefile", help="Matepairs file")
    p.add_option("--maxreadlen", default=262143, type="int",
                 help="Maximum read length allowed")
    p.add_option("--minreadlen", default=1000, type="int",
                 help="Minimum read length allowed")
    p.add_option(
        "--sequential",
        default=False,
        action="store_true",
        help="Overwrite read name (e.g. long Pacbio name)",
    )
    p.set_size()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile,) = args
    maxreadlen = opts.maxreadlen
    minreadlen = opts.minreadlen
    if maxreadlen > 0:
        # If any record exceeds maxreadlen, split the file and convert every
        # piece separately with the length check disabled.
        split = False
        sizes = Fasta(fastafile, lazy=True)
        for seqid, size in sizes.itersizes_ordered():
            if size > maxreadlen:
                logging.debug(
                    "Sequence {0} (size={1}) longer than max read len {2}".
                    format(seqid, size, maxreadlen))
                split = True
                break

        if split:
            for piece in split_fastafile(fastafile, maxreadlen=maxreadlen):
                fasta([piece, "--maxreadlen=0"])
            return

    plate = op.basename(fastafile).split(".")[0]

    mated = opts.size != 0
    mean, sv = get_mean_sv(opts.size)

    if mated:
        # Floor division: true division would render e.g. "Sanger2.0Kb-"
        libname = "Sanger{0}Kb-".format(opts.size // 1000) + plate
    else:
        libname = plate

    frgfile = libname + ".frg"

    if opts.clean:
        cleanfasta = fastafile.rsplit(".", 1)[0] + ".clean.fasta"
        if need_update(fastafile, cleanfasta):
            clean([fastafile, "--canonical", "-o", cleanfasta])
        fastafile = cleanfasta

    if mated:
        # Mated libraries are delegated to the external converter script
        qualfile = make_qual(fastafile, score=21)
        if opts.matefile:
            matefile = opts.matefile
            assert op.exists(matefile)
        else:
            matefile = make_matepairs(fastafile)

        cmd = "convert-fasta-to-v2.pl"
        cmd += " -l {0} -s {1} -q {2} ".format(libname, fastafile, qualfile)
        cmd += "-mean {0} -stddev {1} -m {2} ".format(mean, sv, matefile)

        sh(cmd, outfile=frgfile)
        return

    # Unmated reads: write the frg file directly
    fw = must_open(frgfile, "w")
    print(headerTemplate.format(libID=libname), file=fw)

    sequential = opts.sequential
    i = j = 0  # i = fragments written, j = fragments discarded as too short
    for fragID, seq in parse_fasta(fastafile):
        if len(seq) < minreadlen:
            j += 1
            continue
        i += 1
        if sequential:
            fragID = libname + str(100000000 + i)
        emitFragment(fw, fragID, libname, seq)
    fw.close()

    logging.debug(
        "A total of {0} fragments written to `{1}` ({2} discarded).".format(
            i, frgfile, j))