def FastqPairedIterator(read1, read2):
    """Open the two mate files and return their handles as ``(p1fp, p2fp)``.

    When both arguments compare equal the file is opened only once and the
    same handle is returned in both positions.
    """
    p1fp = must_open(read1)
    p2fp = p1fp if read1 == read2 else must_open(read2)
    return p1fp, p2fp
def splitread(args):
    """
    %prog splitread fastqfile

    Split fastqfile into two read fastqfiles, cut in the middle.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    # Writes `<base>.1.fastq` and `<base>.2.fastq` in the working directory,
    # where <base> is the input basename up to its first dot.
    p = OptionParser(splitread.__doc__)
    p.add_option("-n", dest="n", default=76, type="int",
                 help="Split at N-th base position [default: %default]")
    p.add_option("--rc", default=False, action="store_true",
                 help="Reverse complement second read [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    pairsfastq, = args

    base = op.basename(pairsfastq).split(".")[0]
    fq1 = base + ".1.fastq"
    fq2 = base + ".2.fastq"
    fw1 = must_open(fq1, "w")
    fw2 = must_open(fq2, "w")

    fp = must_open(pairsfastq)
    # Options live on `opts`; `args` is the positional list after parsing.
    n = opts.n
    # Reads shorter than 1.6 * n are skipped rather than split.
    minsize = n * 8 / 5

    for name, seq, qual in FastqGeneralIterator(fp):
        if len(seq) < minsize:
            logging.error("Skipping read {0}, length={1}".format(
                name, len(seq)))
            continue

        name = "@" + name
        rec1 = FastqLite(name, seq[:n], qual[:n])
        rec2 = FastqLite(name, seq[n:], qual[n:])
        if opts.rc:
            rec2.rc()

        print(rec1, file=fw1)
        print(rec2, file=fw2)

    logging.debug("Reads split into `{0},{1}`".format(fq1, fq2))
    fw1.close()
    fw2.close()
def vcf_filter(args):
    """Print variants from ``args.fi`` that pass fixed hard-filter thresholds.

    Loci listed in the optional ``args.exclude`` file (one
    ``seqid<TAB>pos`` per line) are skipped.  Only the first sample of each
    record is inspected: its ``gt_type`` must be ``None`` or ``2``.  An INFO
    annotation that is absent never fails a record.  Passing records are
    printed as tab-separated lines tagged ``single``, ``deletion`` or
    ``insertion``, with the position reported as ``POS - 1``.
    """
    sites = set()
    if args.exclude:
        for line in must_open(args.exclude):
            line = line.strip()
            if not line:
                # tolerate blank lines instead of failing the 2-way unpack
                continue
            seqid, pos = line.split("\t")
            sites.add("%s_%s" % (seqid, pos))

    vcfr = vcf.Reader(must_open(args.fi))
    for rcd in vcfr:
        locus = "%s_%s" % (rcd.CHROM, rcd.POS)
        if locus in sites:
            continue

        gt = rcd.samples[0].gt_type
        info = rcd.INFO
        qd = info.get('QD')
        fs = info.get('FS')
        mq = info.get('MQ')
        mqrs = info.get('MQRankSum')
        rprs = info.get('ReadPosRankSum')
        sor = info.get('SOR')

        genotype_ok = gt is None or gt == 2
        snp_ok = (
            rcd.is_snp
            and (qd is None or qd >= 2)
            and (fs is None or fs <= 60)
            and (mq is None or mq >= 40)
            and (mqrs is None or mqrs >= -12.5)
            and (rprs is None or rprs >= -8)
            and (sor is None or sor <= 4)
        )
        indel_ok = (
            rcd.is_indel
            and (qd is None or qd >= 2)
            and (fs is None or fs <= 200)
            and (rprs is None or rprs >= -20)
            and (sor is None or sor <= 10)
        )
        if not (genotype_ok and (snp_ok or indel_ok)):
            continue

        # Only the first ALT allele is reported.
        alt = str(rcd.ALT[0])
        if rcd.is_snp:
            print("%s\t%s\t%s\t%d\t%s" %
                  (locus, 'single', rcd.CHROM, rcd.POS-1, alt))
        elif rcd.is_deletion:
            print("%s\t%s\t%s\t%d\t%d" %
                  (locus, 'deletion', rcd.CHROM, rcd.POS-1, len(rcd.REF)-1))
        else:
            # alt[1:] drops the first base of the ALT string
            print("%s\t%s\t%s\t%d\t%s" %
                  (locus, 'insertion', rcd.CHROM, rcd.POS-1, alt[1:]))
def translate(args):
    """Translate every FASTA record in ``args.fi`` and write it to stdout.

    Translation stops at the first stop codon (``to_stop=True``); the
    output record keeps the input id and has an empty description.
    """
    handle = must_open(args.fi)
    for record in SeqIO.parse(handle, "fasta"):
        protein = SeqRecord(
            record.seq.translate(to_stop=True),
            id=record.id,
            description="",
        )
        SeqIO.write(protein, sys.stdout, "fasta")
def uniq(args):
    """
    %prog uniq vcffile

    Retain only the first entry in vcf file.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    # Header lines are echoed; among consecutive records sharing a position,
    # the one with the highest INFO `R2` value is printed.
    p = OptionParser(uniq.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    vcffile, = args
    fp = must_open(vcffile)

    def r2_value(v):
        # INFO is a ';'-separated list of KEY=VALUE items (flags carry no '=').
        # Parsed by hand: the Python 2 `urlparse` module is gone and
        # `urllib.parse.parse_qs` no longer treats ';' as a separator.
        for item in v.info.split(";"):
            key, sep, value = item.partition("=")
            if sep and key == "R2":
                return float(value)
        raise KeyError("R2")

    data = []
    for row in fp:
        if row.startswith('#'):
            print(row.strip())
            continue
        data.append(VcfLine(row))

    # groupby only merges adjacent records, so input order is relied upon.
    for pos, vv in groupby(data, lambda x: x.pos):
        vv = list(vv)
        if len(vv) == 1:
            print(vv[0])
            continue
        print(max(vv, key=r2_value))
def parseEff(args):
    """Print one tab-separated summary line per annotated VCF record.

    For each non-header line of ``args.fi`` the variant is classified as
    ``snp``/``ins``/``del``/``mix`` from the REF and ALT string lengths, and
    the first seven ``|``-separated fields of the INFO column (after removing
    ``ANN=``) supply the annotation, impact, gene id and transcript id.
    Records whose INFO is ``.`` are skipped.
    """
    fhi = must_open(args.fi)
    for line in fhi:
        if line.startswith("#"):
            continue
        fields = line.strip("\n").split("\t")
        chrom, pos, vid, ref, alt, qual, filt, info = fields[:8]
        pos = int(pos)
        refl = len(ref)
        altl = len(alt)
        if refl == 1 and altl == 1:
            vnttype = 'snp'
        elif refl == 1 and altl > 1:
            vnttype = 'ins'
        elif refl > 1 and altl == 1:
            vnttype = 'del'
        else:
            assert refl > 1 and altl > 1, "error: %s" % line
            vnttype = 'mix'
        if info == '.':
            continue
        ann = info.replace("ANN=", '').split("|")
        allele, anno, impact, gname, gid, ttyppe, tid = ann[:7]
        out = [chrom, str(pos), str(refl), str(altl), vnttype,
               anno, impact, gid, tid]
        print("\t".join(out))
def bam_stat(args):
    """Print read statistics for the alignment file ``args.fi``.

    Without ``args.bychr`` a single ``BamStat`` is accumulated over all
    alignments and printed; if ``args.isize`` is set, the insert-size
    histogram (``s.idic``) is also written there as a two-column table.
    With ``args.bychr`` one ``BamStat`` per indexed contig is filled from
    the mapped alignments and printed as ``chrom<TAB>stat<TAB>value`` lines.
    """
    bam = pysam.AlignmentFile(args.fi, 'r')
    if not args.bychr:
        s = BamStat()
        for aln in bam:
            count_read(aln, s)
        if len(s.rdic) > 0:
            logging.debug("%d 'paired' reads don't have a mate" % len(s.rdic))
        if args.isize:
            fho = must_open(args.isize, "w")
            print("\t".join(('insert_size', 'count')), file=fho)
            for ins, cnt in s.idic.items():
                # print() already terminates the line; an explicit "\n" here
                # used to leave a blank line after every row.
                print("%d\t%d" % (ins, cnt), file=fho)
            fho.close()
        print(s)
    else:
        ss = dict()
        for ist in bam.get_index_statistics():
            ss[ist.contig] = BamStat()
        for aln in bam:
            if aln.is_unmapped:
                continue
            chrom = aln.reference_name
            count_read(aln, ss[chrom])
        for chrom, s in ss.items():
            for k in s.stats:
                print("\t".join((chrom, k, str(getattr(s, k)))))
def merge(args):
    """
    %prog merge ref.fasta query.fasta *.delta

    Merge delta files into a single delta.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(merge.__doc__)
    p.set_outfile(outfile="merged_results.delta")
    opts, args = p.parse_args(args)

    if len(args) < 3:
        sys.exit(not p.print_help())

    ref, query = args[:2]
    deltafiles = args[2:]
    # Options live on `opts`; `args` is the positional list after parsing.
    outfile = opts.outfile

    ref = get_abs_path(ref)
    query = get_abs_path(query)
    # Write the two-line header once, then append each delta body to it.
    fw = must_open(outfile, "w")
    print(" ".join((ref, query)), file=fw)
    print("NUCMER", file=fw)
    fw.close()

    for d in deltafiles:
        # NR > 2 skips each input's own two header lines.
        cmd = "awk 'NR > 2 {{print $0}}' {0}".format(d)
        sh(cmd, outfile=outfile, append=True)
def gtb2tsv(args):
    """Expand each row of the table ``args.fi`` into one line per interval.

    Rows starting with ``#`` or ``id`` are skipped.  For every exon, CDS,
    UTR and intron location string of a transcript, the relative intervals
    returned by ``locStr2Ary`` are converted to chromosome coordinates
    (mirrored from the transcript end on the ``-`` strand) and printed as
    tab-separated lines under a fixed header.
    """
    fhi = must_open(args.fi)
    print("\t".join("gid tid ttype etype chrom start end srd fam note".split()))
    for line in fhi:
        line = line.strip("\n")
        if line.startswith("#") or line.startswith("id"):
            continue
        ary = line.split("\t")
        if len(ary) < 18:
            print("less than 18 columns:\n%s" % line)
            continue
        # Only the first 18 columns are used; wider rows no longer break
        # the unpack.
        tid, gid, seqid, tbeg, tend, srd, \
            locES, locIS, locCS, loc5S, loc3S, phase, \
            src, conf, cat1, cat2, cat3, note = ary[:18]
        tbeg, tend = int(tbeg), int(tend)
        # tid is a string: %s keeps the message itself from raising TypeError.
        if cat1 == 'mRNA':
            assert locCS, "no CDS for %s" % tid
        else:
            assert locES, "no exon for %s" % tid
        ldic = {'exon': locES, 'cds': locCS,
                'utr5': loc5S, 'utr3': loc3S, 'intron': locIS}
        for etype, locS in ldic.items():
            if not locS:
                continue
            for rbeg, rend in locStr2Ary(locS):
                if srd == "-":
                    beg, end = tend - rend + 1, tend - rbeg + 1
                else:
                    assert srd == '+', "unknown strand: %s for %s" % (srd, tid)
                    beg, end = tbeg + rbeg - 1, tbeg + rend - 1
                fields = [gid, tid, cat1, etype, seqid,
                          str(beg), str(end), srd, cat3, note]
                print("\t".join(fields))
    fhi.close()
def annotate(args):
    """
    %prog annotate blastfile query.fasta subject.fasta

    Annotate overlap types (dovetail, contained, etc) in BLAST tabular file.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    from jcvi.assembly.goldenpath import Cutoff, Overlap, Overlap_types

    p = OptionParser(annotate.__doc__)
    p.set_align(pctid=94, hitlen=500)
    p.add_option("--hang", default=500, type="int",
                 help="Maximum overhang length")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    blastfile, afasta, bfasta = args
    fp = must_open(blastfile)
    asizes = Sizes(afasta).mapping
    bsizes = Sizes(bfasta).mapping
    cutoff = Cutoff(opts.pctid, opts.hitlen, opts.hang)
    logging.debug(str(cutoff))
    for row in fp:
        b = BlastLine(row)
        asize = asizes[b.query]
        bsize = bsizes[b.subject]
        # self-hits carry no overlap information
        if b.query == b.subject:
            continue
        ov = Overlap(b, asize, bsize, cutoff)
        if ov.otype:
            ov.print_graphic()
            print("{0}\t{1}".format(b, Overlap_types[ov.otype]))
def uniq(args):
    """
    %prog uniq vcffile

    Retain only the first entry in vcf file.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    # Header lines are echoed; among consecutive records sharing a position,
    # the one with the highest INFO `R2` value is printed.
    p = OptionParser(uniq.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    vcffile, = args
    fp = must_open(vcffile)

    def r2_value(v):
        # INFO is a ';'-separated list of KEY=VALUE items (flags carry no '=').
        # Parsed by hand: the Python 2 `urlparse` module is gone and
        # `urllib.parse.parse_qs` no longer treats ';' as a separator.
        for item in v.info.split(";"):
            key, sep, value = item.partition("=")
            if sep and key == "R2":
                return float(value)
        raise KeyError("R2")

    data = []
    for row in fp:
        if row.startswith('#'):
            print(row.strip())
            continue
        data.append(VcfLine(row))

    # groupby only merges adjacent records, so input order is relied upon.
    for pos, vv in groupby(data, lambda x: x.pos):
        vv = list(vv)
        if len(vv) == 1:
            print(vv[0])
            continue
        print(max(vv, key=r2_value))
def catread(args):
    """
    %prog catread fastqfile1 fastqfile2

    Concatenate paired end reads into one. Useful for example to do single-end
    mapping and perform filtering on the whole read pair level.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(catread.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    r1, r2 = args
    p1fp, p2fp = FastqPairedIterator(r1, r2)
    outfile = pairspf((r1, r2)) + ".cat.fastq"
    fw = must_open(outfile, "w")
    while True:
        # one FASTQ record = 4 lines: title, sequence, separator, quality
        a = list(islice(p1fp, 4))
        if not a:
            break
        atitle, aseq, _, aqual = a
        btitle, bseq, _, bqual = list(islice(p2fp, 4))
        # The merged record keeps the first read's title.
        print("\n".join((atitle.strip(), aseq.strip() + bseq.strip(),
                         "+", aqual.strip() + bqual.strip())), file=fw)
    # Close explicitly so the output is flushed before returning.
    fw.close()
def tile(args):
    """Cut every FASTA record of ``args.fi`` into windows written to stdout.

    Windows come from ``maketile(1, size, args.size, args.step)``.  A record
    id of the form ``name-beg-end`` or ``name-beg`` supplies the offset of
    the record, so window ids (``name-beg-end``) are expressed in the
    coordinates of the original sequence.
    """
    from maize.utils.location import maketile
    fhi = must_open(args.fi)
    winstep, winsize = args.step, args.size
    for record in SeqIO.parse(fhi, "fasta"):
        size = len(record.seq)
        parts = record.id.split("-")
        if len(parts) >= 3:
            sid, beg, end = parts[0], int(parts[1]), int(parts[2])
            assert size == end - beg + 1, \
                "size error: %s not %d" % (record.id, size)
        elif len(parts) == 2:
            sid, beg = parts[0], int(parts[1])
        else:
            sid, beg = record.id, 1
        windows = maketile(1, size, winsize, winstep)
        bases = str(record.seq)
        pieces = [
            SeqRecord(
                Seq(bases[rbeg-1:rend]),
                id="%s-%d-%d" % (sid, beg + rbeg - 1, beg + rend - 1),
                description='',
            )
            for rbeg, rend in windows
        ]
        SeqIO.write(pieces, sys.stdout, "fasta")
    fhi.close()
def bed2chain(args):
    """Group aligned blocks from ``args.fi`` by chain id and print chains.

    Each non-empty line supplies ``tName tStart tEnd strand qName qStart
    qEnd chainId`` in its first eight whitespace-separated fields.
    Consecutive lines with the same chain id are collected and handed to
    ``print_chain`` together with the sequence sizes looked up in
    ``args.tsize`` / ``args.qsize``.  Empty input prints nothing.
    """
    from maize.formats.sizes import Sizes
    tdic = Sizes(args.tsize)
    qdic = Sizes(args.qsize)

    def flush(cid, tName, qName, srd, locs):
        print_chain(cid, tName, qName, srd,
                    tdic.get_size(tName), qdic.get_size(qName), locs)

    cid0, tName0, qName0, srd0, locs = '', '', '', '', []
    for line in must_open(args.fi):
        line = line.rstrip("\n")
        if not line:
            continue
        tName, tStart, tEnd, srd, qName, qStart, qEnd, cid = line.split()[:8]
        loc = [int(tStart), int(tEnd), int(qStart), int(qEnd)]
        if locs and cid != cid0:
            # chain id changed: emit the finished chain and start afresh
            flush(cid0, tName0, qName0, srd0, locs)
            locs = []
        if locs:
            assert tName == tName0 and qName == qName0 and srd == srd0, \
                "inconsistent info in chain"
        else:
            cid0, tName0, qName0, srd0 = cid, tName, qName, srd
        locs.append(loc)
    # Guard against empty input, which would otherwise look up the size of
    # a sequence named ''.
    if locs:
        flush(cid0, tName0, qName0, srd0, locs)
def suffix(args):
    """
    %prog suffix fastqfile CAG

    Filter reads based on suffix.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(suffix.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastqfile, sf = args
    # Options live on `opts`; `args` is the positional list after parsing.
    fw = must_open(opts.outfile, "w")
    nreads = nselected = 0
    for rec in iter_fastq(fastqfile):
        # iter_fastq ends with a None sentinel; test it before counting so
        # the sentinel is not tallied as a read.
        if rec is None:
            break
        nreads += 1
        if rec.seq.endswith(sf):
            print(rec, file=fw)
            nselected += 1

    if nreads:
        logging.debug("Selected reads with suffix {0}: {1}".
                      format(sf, percentage(nselected, nreads)))
    else:
        logging.debug("No reads found in `{0}`".format(fastqfile))
def uniq(args):
    """
    %prog uniq fastqfile

    Retain only first instance of duplicate reads. Duplicate is defined as
    having the same read name.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(uniq.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    # Options live on `opts`; `args` is the positional list after parsing.
    fw = must_open(opts.outfile, "w")
    nduplicates = nreads = 0
    seen = set()
    for rec in iter_fastq(fastqfile):
        # iter_fastq ends with a None sentinel; test it before counting so
        # the sentinel is not tallied as a read.
        if rec is None:
            break
        nreads += 1
        name = rec.name
        if name in seen:
            nduplicates += 1
            continue
        seen.add(name)
        print(rec, file=fw)

    if nreads:
        logging.debug("Removed duplicate reads: {}".
                      format(percentage(nduplicates, nreads)))
    else:
        logging.debug("No reads found in `{0}`".format(fastqfile))
def merge(args):
    """Concatenate several FASTA files to stdout, prefixing every id.

    ``args.cfg`` lists one ``prefix,fastafile`` pair per non-blank line.
    Each record is written with id ``prefix|original_id`` and an empty
    description.  The program exits with status 1 as soon as a listed file
    is not readable.
    """
    for raw in must_open(args.cfg):
        entry = raw.strip(" \t\n\r")
        if not entry:
            continue
        pre, fseq = entry.split(",")
        if not os.access(fseq, os.R_OK):
            eprint("no access to input file: %s" % fseq)
            sys.exit(1)

        fh = must_open(fseq)
        renamed = [
            SeqRecord(rcd.seq, id=pre + "|" + rcd.id, description='')
            for rcd in SeqIO.parse(fh, "fasta")
        ]
        SeqIO.write(renamed, sys.stdout, "fasta")
        fh.close()
def __init__(self, filename, sorted=False):
    """Load every row of `filename` as a ``BlastLine``.

    Unless the caller states the file is already `sorted`, the loaded
    records are sorted in place by query name.
    """
    super(BlastSlow, self).__init__(filename)
    handle = must_open(filename)
    for line in handle:
        self.append(BlastLine(line))
    self.sorted = sorted
    if sorted:
        return
    self.sort(key=lambda hit: hit.query)
def cleanid(args):
    """Echo ``args.fi``, trimming trailing ``:`` and ``.`` from header lines.

    Every line is stripped of surrounding whitespace; only lines starting
    with ``>`` additionally lose trailing colons and dots.
    """
    fh = must_open(args.fi)
    for raw in fh:
        text = raw.strip()
        print(text.rstrip(":.") if text.startswith(">") else text)
    fh.close()
def clean(args):
    """Rewrite a relaxed-PHYLIP alignment with unexpected characters masked.

    Each sequence is upper-cased and every character other than
    ``A``/``T``/``C``/``G``/``N`` is replaced by ``N``; the alignment is
    then written to stdout in the same format.
    """
    bad_base = re.compile("[^ATCGN]")
    fh = must_open(args.fi)
    alignment = AlignIO.read(fh, "phylip-relaxed")
    for record in alignment:
        record.seq = bad_base.sub("N", str(record.seq).upper())
    AlignIO.write(alignment, sys.stdout, "phylip-relaxed")
    fh.close()
def clean(args):
    """Rewrite a relaxed-PHYLIP alignment with unexpected characters masked.

    Each sequence is upper-cased and every character other than
    ``A``/``T``/``C``/``G``/``N`` is replaced by ``N``; the alignment is
    then written to stdout in the same format.
    """
    bad_base = re.compile("[^ATCGN]")
    fh = must_open(args.fi)
    alignment = AlignIO.read(fh, "phylip-relaxed")
    for record in alignment:
        record.seq = bad_base.sub("N", str(record.seq).upper())
    AlignIO.write(alignment, sys.stdout, "phylip-relaxed")
    fh.close()
def desc(args):
    """Print ``id<TAB>description`` for every FASTA record in ``args.fi``.

    A header line is printed first when ``args.header`` is set.  When a
    record's description equals its id, the description column is left empty.
    """
    fh = must_open(args.fi)
    if args.header:
        print("seqid\tdesc")
    for record in SeqIO.parse(fh, "fasta"):
        text = '' if record.id == record.description else record.description
        print("%s\t%s" % (record.id, text))
def shuffle(args):
    """
    %prog shuffle p1.fastq p2.fastq

    Shuffle pairs into interleaved format.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    # Returns the name of the interleaved file that was written.
    p = OptionParser(shuffle.__doc__)
    p.set_tag()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    p1, p2 = args
    pairsfastq = pairspf((p1, p2)) + ".fastq"
    # Options live on `opts`; `args` is the positional list after parsing.
    tag = opts.tag

    p1fp = must_open(p1)
    p2fp = must_open(p2)
    pairsfw = must_open(pairsfastq, "w")
    nreads = 0
    while True:
        # one FASTQ record = 4 lines
        a = list(islice(p1fp, 4))
        if not a:
            break

        b = list(islice(p2fp, 4))
        if tag:
            # both mates get the first read's name plus /1 or /2
            name = a[0].rstrip()
            a[0] = name + "/1\n"
            b[0] = name + "/2\n"

        pairsfw.writelines(a)
        pairsfw.writelines(b)
        nreads += 2

    pairsfw.close()
    # Tagging adds two characters ("/1" or "/2") per read to the output.
    extra = nreads * 2 if tag else 0
    checkShuffleSizes(p1, p2, pairsfastq, extra=extra)

    logging.debug("File `{0}` verified after writing {1} reads.".
                  format(pairsfastq, nreads))
    return pairsfastq
def gaps(args):
    """Report runs of ``N`` of at least ``args.gap`` bases in a FASTA file.

    Sequences are upper-cased first.  Each qualifying run is printed as
    ``seqid<TAB>start<TAB>end`` using the regex match offsets (0-based
    start, exclusive end).
    """
    import re
    n_run = re.compile("N+")
    fh = must_open(args.fi)
    for record in SeqIO.parse(fh, "fasta"):
        bases = str(record.seq).upper()
        for hit in n_run.finditer(bases):
            beg, end = hit.span()
            if end - beg < args.gap:
                continue
            print("%s\t%d\t%d" % (record.id, beg, end))
def coverage(args):
    """
    %prog coverage fastafile bamfile

    Calculate coverage for BAM file. BAM file will be sorted unless with
    --nosort.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    # Returns the bedgraph/bigwig file name for those formats; the
    # "coverage" format writes per-sequence values to the --outfile target.
    p = OptionParser(coverage.__doc__)
    p.add_option("--format", default="bigwig",
                 choices=("bedgraph", "bigwig", "coverage"),
                 help="Output format")
    p.add_option("--nosort", default=False, action="store_true",
                 help="Do not sort BAM")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, bamfile = args
    # Options live on `opts`; `args` is the positional list after parsing.
    fmt = opts.format
    if opts.nosort:
        logging.debug("BAM sorting skipped")
    else:
        bamfile = index([bamfile, "--fasta={0}".format(fastafile)])

    pf = bamfile.rsplit(".", 2)[0]
    sizesfile = Sizes(fastafile).filename
    cmd = "genomeCoverageBed -ibam {0} -g {1}".format(bamfile, sizesfile)
    if fmt in ("bedgraph", "bigwig"):
        cmd += " -bg"
        bedgraphfile = pf + ".bedgraph"
        sh(cmd, outfile=bedgraphfile)

        if fmt == "bedgraph":
            return bedgraphfile

        bigwigfile = pf + ".bigwig"
        cmd = "bedGraphToBigWig {0} {1} {2}".\
            format(bedgraphfile, sizesfile, bigwigfile)
        sh(cmd)
        return bigwigfile

    coveragefile = pf + ".coverage"
    if need_update(fastafile, coveragefile):
        sh(cmd, outfile=coveragefile)

    gcf = GenomeCoverageFile(coveragefile)
    fw = must_open(opts.outfile, "w")
    for seqid, cov in gcf.iter_coverage_seqid():
        print("\t".join((seqid, "{0:.1f}".format(cov))), file=fw)
    fw.close()
def filter(args):
    """
    %prog filter <deltafile|coordsfile>

    Produce a new delta/coords file and filter based on id% or cov%.
    Use `delta-filter` for .delta file.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    # Returns the name of the filtered file, or the input name unchanged
    # when both thresholds are 0.
    p = OptionParser(filter.__doc__)
    p.set_align(pctid=0, hitlen=0)
    p.add_option("--overlap", default=False, action="store_true",
                 help="Print overlap status (e.g. terminal, contained)")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    # Options live on `opts`; `args` is the positional list after parsing.
    pctid = opts.pctid
    hitlen = opts.hitlen

    filename, = args
    if pctid == 0 and hitlen == 0:
        return filename

    pf, suffix = filename.rsplit(".", 1)
    outfile = "".join((pf, ".P{0}L{1}.".format(int(pctid), int(hitlen)),
                       suffix))
    if not need_update(filename, outfile):
        return outfile

    if suffix == "delta":
        cmd = "delta-filter -i {0} -l {1} {2}".format(pctid, hitlen, filename)
        sh(cmd, outfile=outfile)
        return outfile

    fp = open(filename)
    fw = must_open(outfile, "w")
    for row in fp:
        try:
            c = CoordsLine(row)
        except AssertionError:
            # rows CoordsLine rejects are skipped
            continue

        if c.identity < pctid:
            continue
        if c.len2 < hitlen:
            continue
        if opts.overlap and not c.overlap:
            continue

        outrow = row.rstrip()
        if opts.overlap:
            ov = Overlap_types[c.overlap]
            outrow += "\t" + ov
        print(outrow, file=fw)

    # Close both handles so the output is flushed before the name is returned.
    fp.close()
    fw.close()
    return outfile
def subset(args):
    """
    %prog subset blastfile qbedfile sbedfile

    Extract blast hits between given query and subject chrs.

    If --qchrs or --schrs is not given, then all chrs from q/s genome will
    be included. However one of --qchrs and --schrs must be specified.
    Otherwise the script will do nothing.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    p = OptionParser(subset.__doc__)
    p.add_option("--qchrs", default=None,
                 help="query chrs to extract, comma sep [default: %default]")
    p.add_option("--schrs", default=None,
                 help="subject chrs to extract, comma sep [default: %default]")
    p.add_option("--convert", default=False, action="store_true",
                 help="convert accns to chr_rank [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    blastfile, qbedfile, sbedfile = args
    qchrs = opts.qchrs
    schrs = opts.schrs
    assert qchrs or schrs, p.print_help()
    convert = opts.convert

    # The output name records which chromosomes were requested.
    outfile = blastfile + "."
    if qchrs:
        outfile += qchrs + "."
        qchrs = set(qchrs.split(","))
    else:
        qchrs = set(Bed(qbedfile).seqids)
    if schrs:
        schrs = set(schrs.split(","))
        if qbedfile != sbedfile or qchrs != schrs:
            outfile += ",".join(schrs) + "."
    else:
        schrs = set(Bed(sbedfile).seqids)
    outfile += "blast"

    qo = Bed(qbedfile).order
    so = Bed(sbedfile).order

    fw = must_open(outfile, "w")
    for b in Blast(blastfile):
        q, s = b.query, b.subject
        if qo[q][1].seqid in qchrs and so[s][1].seqid in schrs:
            if convert:
                b.query = qo[q][1].seqid + "_" + "{0:05d}".format(qo[q][0])
                b.subject = so[s][1].seqid + "_" + "{0:05d}".format(so[s][0])
            print(b, file=fw)
    fw.close()
    logging.debug("Subset blastfile written to `{0}`".format(outfile))
def stat(args):
    """Print per-record summary statistics for the VCF ``args.fi``.

    Columns: chromosome, position, number of ALT alleles, REF length, length
    of the first ALT, number of called samples, first alternate-allele
    frequency, and nucleotide diversity computed as
    ``n / (n - 1) * heterozygosity`` with ``n = 2 * num_called``.
    """
    fhi = must_open(args.fi)
    columns = ["chr", "pos", "nalt", "rsize", "asize", "nsam", "aaf", "nucdiv"]
    print("\t".join(columns))
    for rcd in vcf.Reader(fhi):
        num_chroms = float(2.0 * rcd.num_called)
        correction = float(num_chroms / (num_chroms - 1.0))
        nucl_diversity = correction * rcd.heterozygosity
        values = [
            rcd.CHROM, rcd.POS,
            len(rcd.ALT), len(rcd.REF), len(rcd.ALT[0]),
            rcd.num_called, rcd.aaf[0], nucl_diversity,
        ]
        print("\t".join(map(str, values)))
def rmdot(args):
    """Echo the file ``args.fi`` with ``.`` replaced by ``-`` in sequences.

    Lines are stripped of surrounding whitespace.  Header lines (starting
    with ``>``) are printed unchanged; on every other line each dot becomes
    a dash.
    """
    fh = must_open(args.fi)
    for line in fh:
        line = line.strip()
        if line.startswith('>'):
            print(line)
        else:
            # str.replace avoids string.maketrans, which Python 3 removed.
            print(line.replace(".", "-"))
    fh.close()
def clean(args):
    """Write ``args.fi`` to stdout as FASTA with unexpected characters masked.

    Each sequence is upper-cased and every character other than
    ``A``/``T``/``C``/``G``/``N`` becomes ``N``.  Descriptions are dropped,
    and the total number of replaced characters is logged at debug level.
    """
    import re
    bad_base = re.compile("[^ATCGN]")
    fh = must_open(args.fi)
    total_bad = 0
    for record in SeqIO.parse(fh, "fasta"):
        masked, replaced = bad_base.subn("N", str(record.seq).upper())
        total_bad += replaced
        SeqIO.write(SeqRecord(Seq(masked), id=record.id, description=""),
                    sys.stdout, "fasta")
    logging.debug("Total bad char: %d" % total_bad)
def mcl2tsv(args):
    """Convert the cluster file ``args.mcl`` to a ``grp<TAB>gid`` table.

    Each input line is one tab-separated cluster.  Clusters with fewer than
    five members are dropped; the remaining ones are numbered consecutively
    from 1.
    """
    fhi = must_open(args.mcl)
    print("grp\tgid")
    grp = 1
    for raw in fhi:
        members = raw.strip("\n").split("\t")
        if len(members) >= 5:
            for gid in members:
                print("%d\t%s" % (grp, gid))
            grp += 1
def mcl2tsv(args):
    """Convert the cluster file ``args.mcl`` to a ``grp<TAB>gid`` table.

    Each input line is one tab-separated cluster.  Clusters with fewer than
    five members are dropped; the remaining ones are numbered consecutively
    from 1.
    """
    fhi = must_open(args.mcl)
    print("grp\tgid")
    grp = 1
    for raw in fhi:
        members = raw.strip("\n").split("\t")
        if len(members) >= 5:
            for gid in members:
                print("%d\t%s" % (grp, gid))
            grp += 1
def coverage(args):
    """
    %prog coverage fastafile bamfile

    Calculate coverage for BAM file. BAM file will be sorted unless with
    --nosort.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    # Returns the bedgraph/bigwig file name for those formats; the
    # "coverage" format writes per-sequence values to the --outfile target.
    p = OptionParser(coverage.__doc__)
    p.add_option("--format", default="bigwig",
                 choices=("bedgraph", "bigwig", "coverage"),
                 help="Output format")
    p.add_option("--nosort", default=False, action="store_true",
                 help="Do not sort BAM")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, bamfile = args
    # Options live on `opts`; `args` is the positional list after parsing.
    fmt = opts.format
    if opts.nosort:
        logging.debug("BAM sorting skipped")
    else:
        bamfile = index([bamfile, "--fasta={0}".format(fastafile)])

    pf = bamfile.rsplit(".", 2)[0]
    sizesfile = Sizes(fastafile).filename
    cmd = "genomeCoverageBed -ibam {0} -g {1}".format(bamfile, sizesfile)
    if fmt in ("bedgraph", "bigwig"):
        cmd += " -bg"
        bedgraphfile = pf + ".bedgraph"
        sh(cmd, outfile=bedgraphfile)

        if fmt == "bedgraph":
            return bedgraphfile

        bigwigfile = pf + ".bigwig"
        cmd = "bedGraphToBigWig {0} {1} {2}".\
            format(bedgraphfile, sizesfile, bigwigfile)
        sh(cmd)
        return bigwigfile

    coveragefile = pf + ".coverage"
    if need_update(fastafile, coveragefile):
        sh(cmd, outfile=coveragefile)

    gcf = GenomeCoverageFile(coveragefile)
    fw = must_open(opts.outfile, "w")
    for seqid, cov in gcf.iter_coverage_seqid():
        print("\t".join((seqid, "{0:.1f}".format(cov))), file=fw)
    fw.close()
def iter_fastq(filename, offset=0, key=None):
    """Yield ``FastqRecord`` objects read from `filename`, then ``None``.

    `filename` may be a path (opened with ``must_open``) or an already open
    handle.  Iteration stops at the first record with an empty name, after
    which a single ``None`` is yielded as an end-of-input sentinel.
    """
    handle = filename
    if isinstance(filename, str):
        logging.debug("Read file `{0}`".format(filename))
        handle = must_open(filename)

    record = FastqRecord(handle, offset=offset, key=key)
    while record.name:
        yield record
        record = FastqRecord(handle, offset=offset, key=key)
    yield None  # sentinel
def __init__(self, filename=None):
    """Load the PSL records of `filename`, if one is given.

    Only lines whose first character is a digit are parsed into
    ``PslLine`` objects; everything else is ignored.
    """
    super(Psl, self).__init__(filename)

    import re

    self.mCounts = {}   # dict to hold match counts
    if filename:
        leading_digit = re.compile(r'\d+').match
        for line in must_open(filename):
            if leading_digit(line[0]):
                self.append(PslLine(line))
def swap(args):
    """
    %prog swap blastfile

    Print out a new blast file with query and subject swapped.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    # Writes `<blastfile>.swapped` and then passes it to `sort`.
    p = OptionParser(swap.__doc__)

    opts, args = p.parse_args(args)

    # Exactly one argument is unpacked below, so require exactly one.
    if len(args) != 1:
        sys.exit(not p.print_help())

    blastfile, = args
    swappedblastfile = blastfile + ".swapped"
    fp = must_open(blastfile)
    fw = must_open(swappedblastfile, "w")
    for row in fp:
        b = BlastLine(row)
        print(b.swapped, file=fw)

    fw.close()
    sort([swappedblastfile])
def UMIcount(args):
    """
    %prog UMIcount fastqfile

    Report number of occurrences of each unique UMI
    """
    # The UMI is taken from the read header: the second space-separated
    # field, then the part after its first '+'.
    # Open the input once: `.gz` files go through gzip, others through
    # must_open (previously both were opened and the first handle leaked).
    if args.fi.endswith(".gz"):
        fhi = gzip.open(args.fi, "r")
    else:
        fhi = must_open(args.fi)

    ud = dict()
    for (seqid, seq, qual) in read_fastq(args.fi, fhi):
        umi = seqid.split(" ")[1].split("+")[1]
        ud[umi] = ud.get(umi, 0) + 1

    fho = must_open(args.fo, 'w')
    for umi, cnt in ud.items():
        fho.write("%s\t%s\n" % (umi, cnt))
    # Close so the table is flushed before returning.
    fho.close()

    logging.debug("{} UMIs detected".format(len(ud)))
def one2tsv(args):
    """Convert the comma-separated cluster table ``args.mcl`` to grp/gid rows.

    The row whose first field is ``Cluster`` is skipped.  A cluster is kept
    only when its p-value is below ``args.maxp`` and it has at least five
    space-separated members (double quotes removed); kept clusters are
    numbered consecutively from 1.
    """
    fhi = must_open(args.mcl)
    print("grp\tgid")
    grp = 1
    for raw in fhi:
        fields = raw.strip("\n").split(",")
        if fields[0] == 'Cluster':
            continue
        mid, size, density, iwt, ewt, quality, pval, gidstr = fields
        members = gidstr.replace("\"", "").split(" ")
        if float(pval) < args.maxp and len(members) >= 5:
            for gid in members:
                print("%d\t%s" % (grp, gid))
            grp += 1
def vcf2tsv(args):
    """Flatten the VCF ``args.fi`` into one tab-separated line per record.

    Site columns are chromosome, position, REF, comma-joined ALT, an
    is-SNP flag, a PASS flag (1 when FILTER is empty or missing), QUAL and
    a fixed list of INFO keys (blank when absent).  Each sample contributes
    ``GT`` (``gt_type``), ``AD``, ``DP`` and ``GQ``; with several samples the
    column names are prefixed with ``<sample>_``.  The header is printed
    when the first record is seen.
    """
    vcf_reader = vcf.Reader(fsock=must_open(args.fi))
    info_keys = ["DP", "QD", "FS", "MQ", "MQRankSum", "ReadPosRankSum", "SOR"]
    site_cols = ['chr', 'pos', 'ref', 'alt', 'IS_SNP', 'PASS', 'QUAL'] + info_keys
    fmt_keys = ["AD", "DP", "GQ"]
    sample_cols = ['GT'] + fmt_keys

    header_pending = True
    for rcd in vcf_reader:
        if header_pending:
            header_pending = False
            if len(rcd.samples) == 1:
                print("\t".join(site_cols + sample_cols))
            else:
                prefixed = [x.sample + '_' + col
                            for x in rcd.samples for col in sample_cols]
                print("\t".join(site_cols + prefixed))

        alts = ",".join(map(str, rcd.ALT))
        filt = rcd.FILTER
        flagpass = 1 if filt is None or len(filt) == 0 else 0
        site_vals = [rcd.CHROM, rcd.POS, rcd.REF, alts, int(rcd.is_snp),
                     flagpass, rcd.QUAL]
        site_vals += [rcd.INFO.get(k, '') for k in info_keys]

        sample_vals = []
        for sm in rcd.samples:
            # need to address multiple alleles
            vals = [getattr(sm.data, k, '') for k in fmt_keys]
            if vals[0] is not None and vals[0] != '':
                # keep only the second AD entry
                vals[0] = vals[0][1]
            sample_vals += ['' if x is None else str(x)
                            for x in [sm.gt_type] + vals]
        print("\t".join(map(str, site_vals + sample_vals)))
def breakread(args):
    """Split each read of ``args.fi`` into two halves of ``readlen`` bases.

    The first halves go to ``<fo>_1.fq.gz`` and the second halves to
    ``<fo>_2.fq.gz``.  Read ids are truncated to their first two
    space-separated fields.
    """
    # NOTE(review): `fo` and `readlen` are not defined in this function and
    # must come from the enclosing module; they look like they were meant to
    # be read from `args` -- confirm against the argument parser.
    fhi = must_open(args.fi)
    fo1 = "%s_1.fq.gz" % fo
    fo2 = "%s_2.fq.gz" % fo
    # Context managers guarantee both gzip streams are finalised even if an
    # assertion fails part-way through.
    with gzip.open(fo1, "wb") as fho1, gzip.open(fo2, "wb") as fho2:
        for (seqid, seq, qual) in read_fastq(args.fi, fhi):
            assert len(seq) == readlen * 2 and len(qual) == readlen * 2, \
                "%s: seq[%d] qual[%d] not %d" % \
                (seqid, len(seq), len(qual), readlen * 2)
            eles = seqid.split(" ")
            if len(eles) > 2:
                seqid = " ".join(eles[0:2])
            seq1, seq2 = seq[0:readlen], seq[readlen:readlen * 2]
            qual1, qual2 = qual[0:readlen], qual[readlen:readlen * 2]
            fho1.write(("@%s\n%s\n+\n%s\n" % (seqid, seq1, qual1)).encode('utf8'))
            fho2.write(("@%s\n%s\n+\n%s\n" % (seqid, seq2, qual2)).encode('utf8'))
def coordT(args):
    """Shift target coordinates of PSL lines from sub-sequence to full sequence.

    For every PSL line of ``args.fi`` whose target name has the form
    ``name-start-end``, the target name is reduced to ``name``, the target
    start/end and block starts are shifted by ``start - 1``, and the target
    size is replaced by the size of ``name`` in ``args.fs``.  Every parsed
    line is printed.
    """
    sizes = Sizes(args.fs)
    for line in must_open(args.fi):
        if not re.match(r'\d+', line[0]):
            continue
        p = PslLine(line)
        tnames = p.tName.split("-")
        if len(tnames) == 3:
            # The original assigned to `t.qName`, but no `t` exists; the
            # lookup below needs the bare target name on `p.tName`.
            p.tName, tosStart, tosEnd = \
                tnames[0], int(tnames[1]), int(tnames[2])
            assert tosEnd-tosStart+1 == p.tSize
            cSize = sizes.get_size(p.tName)
            p.tStart += tosStart - 1
            p.tEnd += tosStart - 1
            p.tStarts = [x + tosStart - 1 for x in p.tStarts]
            p.tSize = cSize
        # NOTE(review): lines without embedded coordinates are printed
        # unchanged here -- confirm that is the intended nesting.
        print(str(p))
def write(self):
    """Write this job's script to ``self.fname`` after writing all sub-jobs.

    The script text is ``self.__stanza__`` with the queue, node, ppn,
    walltime, memory, email and command placeholders substituted; commands
    are stripped and joined one per line.
    """
    for child in self.subjobs:
        child.write()

    commands = "\n".join(cmd.strip() for cmd in self.cmds)
    params = dict(
        queue=self.queue,
        node=self.node,
        ppn=self.ppn,
        walltime=self.walltime,
        memstr=self.mem,
        email=self.email,
        cmds=commands + "\n",
    )
    script = Template(self.__stanza__).substitute(params)

    fhj = must_open(self.fname, "w")
    fhj.write(script)
    fhj.close()
def psl2tsv(args):
    """Convert the PSL file ``args.fi`` to a tab-separated alignment table.

    For every PSL line (first character a digit) the block structure is
    validated, block coordinates are made relative to the aligned region
    (1-based starts), and identity plus a score with fixed match, mismatch
    and gap weights are computed.  Start positions are reported 1-based.
    """
    sMatch, sMisMatch, sGapOpen, sGapExtend = 2, -3, -5, -2
    print("\t".join('''qName qStart qEnd qSize strand
    tName tStart tEnd tSize
    alnLen match misMatch baseN qNumIns tNumIns qBaseIns tBaseIns ident score
    qLoc tLoc'''.split()))
    for line in must_open(args.fi):
        if not re.match(r'\d+', line[0]):
            continue
        p = PslLine(line)
        qName, qStart, qEnd, qSize, strand = \
            p.qName, p.qStart, p.qEnd, p.qSize, p.qstrand
        tName, tStart, tEnd, tSize = p.tName, p.tStart, p.tEnd, p.tSize
        match, misMatch, baseN, qNumIns, tNumIns, qBaseIns, tBaseIns = \
            p.matches, p.misMatches, p.nCount, p.qNumInsert, p.tNumInsert, \
            p.qBaseInsert, p.tBaseInsert
        assert p.blockCount == len(p.tStarts), "unequal pieces"
        assert p.blockCount == len(p.qStarts), "unequal pieces"
        assert p.blockCount == len(p.blockSizes), "unequal pieces"
        match += p.repMatches
        alnLen = match + misMatch + baseN
        # The messages used to reference undefined names (qId, blockSizes),
        # turning any failed check into a NameError.
        assert alnLen == sum(p.blockSizes), \
            "block size error: %s %d %d" % (qName, alnLen, sum(p.blockSizes))
        assert alnLen + qBaseIns == qEnd - qStart, "%s: qLen error" % qName
        assert alnLen + tBaseIns == tEnd - tStart, "%s: tLen error" % qName
        qLocs, tLocs = [], []
        for i in range(p.blockCount):
            rtb = p.tStarts[i] - tStart
            rte = p.tStarts[i] - tStart + p.blockSizes[i]
            if strand == '-':
                # query block starts are offset from qSize - qEnd here
                rqb = p.qStarts[i] - (qSize - qEnd)
                rqe = p.qStarts[i] - (qSize - qEnd) + p.blockSizes[i]
            else:
                rqb = p.qStarts[i] - qStart
                rqe = p.qStarts[i] + p.blockSizes[i] - qStart
            qLocs.append([rqb+1, rqe])
            tLocs.append([rtb+1, rte])
        score = match * sMatch + misMatch * sMisMatch
        numIns = qNumIns + tNumIns
        if numIns >= 1:
            score += sGapOpen + (numIns - 1) * sGapExtend
        ident = "%.03f" % (float(match)/(match+misMatch))
        print("\t".join(str(x) for x in [
            qName, qStart+1, qEnd, qSize, strand,
            tName, tStart+1, tEnd, tSize, alnLen, match, misMatch, baseN,
            qNumIns, tNumIns, qBaseIns, tBaseIns, ident, score,
            locAry2Str(qLocs), locAry2Str(tLocs)]))
def create_job_chain(fjs, fo):
    """Write a bash script to `fo` that submits `fjs` as a dependent chain.

    The first job file is submitted with plain ``qsub``; each later one is
    submitted with ``-W depend=afterok:`` on the previous job's id.  Every
    job id is echoed after submission.  Each path in `fjs` must be an
    existing file.
    """
    cmds = ['#!/bin/bash']
    prev_job = None
    for num, fj in enumerate(fjs, start=1):
        assert op.isfile(fj), "cannot read %s" % fj
        job = "job%d" % num
        if prev_job is None:
            cmds.append("%s=$(qsub %s)" % (job, fj))
        else:
            cmds.append("%s=$(qsub -W depend=afterok:$%s %s)" %
                        (job, prev_job, fj))
        cmds.append("echo $%s" % job)
        prev_job = job
    fho = must_open(fo, "w")
    fho.write("\n".join(cmds) + "\n")
    # Close so the script is fully on disk when this function returns.
    fho.close()
def fastp(args):
    """
    %prog fastp jsonfile

    Convert fastp json to tsv file.
    """
    # One output row per file in args.json: the sample id (basename without
    # its last extension) followed by selected `filtering_result` counters.
    jsons = args.json
    logging.info("reading %s files..." % len(jsons))
    keys = """passed_filter_reads low_quality_reads too_many_N_reads
            too_short_reads too_long_reads""".split()
    print('\t'.join(['sid'] + keys))
    for fi in jsons:
        sid = op.basename(op.splitext(fi)[0])
        fhi = must_open(fi)
        js = json.load(fhi)
        # Close each report as soon as it is parsed so handles do not pile
        # up over many inputs.
        fhi.close()
        print("\t".join([sid] + [str(js['filtering_result'][x]) for x in keys]))
def bbduk(args):
    """
    %prog bbduk jsonfile

    Convert bbduk json to tsv file.
    """
    # One output row per file in args.json: the sample id (basename without
    # its last extension) followed by the read counters.  The first
    # args.skip lines of each file are discarded before JSON parsing.
    jsons = args.json
    skip = args.skip
    logging.info("reading %s files..." % len(jsons))
    keys = "readsIn readsRemoved readsOut ".split()
    print('\t'.join(['sid'] + keys))
    for fi in jsons:
        sid = op.basename(op.splitext(fi)[0])
        fhi = must_open(fi)
        # range() of a value below 1 is already empty, so no guard is needed
        for _ in range(skip):
            next(fhi)
        js = json.load(fhi)
        # Close each report as soon as it is parsed so handles do not pile
        # up over many inputs.
        fhi.close()
        print("\t".join([sid] + [str(js[x]) for x in keys]))
def psl2bed(args):
    """Print one BED-like line per alignment block of the PSL ``args.fi``.

    With ``args.qry`` the interval is in query coordinates and the last
    column names the matching target interval; otherwise the interval is in
    target coordinates and the last column names the query interval.  On
    the ``-`` strand, query block coordinates are mirrored with ``qSize``.
    """
    for line in must_open(args.fi):
        if not re.match(r'\d+', line[0]):
            continue
        p = PslLine(line)
        for i in range(p.blockCount):
            tbeg, tend = p.tStarts[i], p.tStarts[i] + p.blockSizes[i]
            if p.qstrand == '-':
                qbeg = p.qSize - p.qStarts[i] - p.blockSizes[i]
                qend = p.qSize - p.qStarts[i]
            else:
                qbeg, qend = p.qStarts[i], p.qStarts[i] + p.blockSizes[i]
            tstr = "%s:%d-%d" % (p.tName, tbeg, tend)
            qstr = "%s:%d-%d" % (p.qName, qbeg, qend)
            if args.qry:
                print("%s\t%d\t%d\t%s\t%s" %
                      (p.qName, qbeg, qend, p.qstrand, tstr))
            else:
                # The format used to have eight placeholders for five
                # values and raised TypeError; it now mirrors the query
                # branch.
                print("%s\t%d\t%d\t%s\t%s" %
                      (p.tName, tbeg, tend, p.qstrand, qstr))
def rmgap(args):
    """Group the 12-column lines of ``args.fi`` and pass groups to ``rm1gap``.

    The first eight fields (tab-joined) identify a group; fields 10-12 are
    collected as integer triples.  Consecutive lines with the same
    identifier form one group, and ``rm1gap`` is called once per group and
    once more after the last line.
    """
    pid, locs = '', []
    for raw in must_open(args.fi):
        line = raw.rstrip("\n")
        if not line:
            continue
        ps = line.split()
        assert len(ps) == 12, "not 12 fields: %s" % line
        cid = "\t".join(ps[0:8])
        triple = [int(ps[9]), int(ps[10]), int(ps[11])]
        # `locs` is empty only before the first data line has been seen
        if locs and pid != cid:
            rm1gap(pid, locs)
            locs = []
        pid = cid
        locs.append(triple)
    rm1gap(pid, locs)
def gtb2tsv(args):
    """Expand each row of the table ``args.fi`` into one line per interval.

    Rows starting with ``#`` or ``id`` are skipped.  For every exon, CDS,
    UTR and intron location string of a transcript, the relative intervals
    returned by ``locStr2Ary`` are converted to chromosome coordinates
    (mirrored from the transcript end on the ``-`` strand) and printed as
    tab-separated lines under a fixed header.
    """
    fhi = must_open(args.fi)
    print("\t".join(
        "gid tid ttype etype chrom start end srd fam note".split()))
    for line in fhi:
        line = line.strip("\n")
        if line.startswith("#") or line.startswith("id"):
            continue
        ary = line.split("\t")
        if len(ary) < 18:
            print("less than 18 columns:\n%s" % line)
            continue
        # Only the first 18 columns are used; wider rows no longer break
        # the unpack.
        tid, gid, seqid, tbeg, tend, srd, \
            locES, locIS, locCS, loc5S, loc3S, phase, \
            src, conf, cat1, cat2, cat3, note = ary[:18]
        tbeg, tend = int(tbeg), int(tend)
        # tid is a string: %s keeps the message itself from raising TypeError.
        if cat1 == 'mRNA':
            assert locCS, "no CDS for %s" % tid
        else:
            assert locES, "no exon for %s" % tid
        ldic = {'exon': locES, 'cds': locCS,
                'utr5': loc5S, 'utr3': loc3S, 'intron': locIS}
        for etype, locS in ldic.items():
            if not locS:
                continue
            for rbeg, rend in locStr2Ary(locS):
                if srd == "-":
                    beg, end = tend - rend + 1, tend - rbeg + 1
                else:
                    assert srd == '+', "unknown strand: %s for %s" % (srd, tid)
                    beg, end = tbeg + rbeg - 1, tbeg + rend - 1
                fields = [
                    gid, tid, cat1, etype, seqid,
                    str(beg), str(end), srd, cat3, note
                ]
                print("\t".join(fields))
    fhi.close()
def filter(args):
    """Filter the 22-column alignment table ``args.fi`` and print survivors.

    The first line is echoed as the header.  A row is kept when its
    identity, its ``alnLen / qSize`` coverage and its match count reach
    ``args.ident``, ``args.cov`` and ``args.match``.  Rows are grouped by
    consecutive query name; with ``args.best`` only the top-scoring row(s)
    of each query are printed.
    """
    fhi = must_open(args.fi)
    line = fhi.readline()
    print(line.strip("\n"))
    pqid = ''
    pscore = 0
    lines = []
    for line in fhi:
        line = line.strip("\n")
        qName, qStart, qEnd, qSrd, qSize, \
            tName, tStart, tEnd, tSrd, tSize, \
            alnLen, match, misMatch, baseN, \
            qNumIns, tNumIns, qBaseIns, tBaseIns, \
            ident, score, qLoc, tLoc = line.split("\t")
        if float(ident) < args.ident:
            continue
        if int(alnLen) / int(qSize) < args.cov:
            continue
        if int(match) < args.match:
            continue
        # Compare scores numerically; as strings "99" sorted above "100".
        score = float(score)
        if pqid == '':
            pqid = qName
            pscore = score
            lines.append(line)
        elif qName != pqid:
            # new query: flush what was kept for the previous one
            print("\n".join(lines))
            pqid = qName
            pscore = score
            lines = [line]
        else:
            if args.best:
                if score > pscore:
                    lines = [line]
                    pscore = score
                elif score == pscore:
                    lines.append(line)
            else:
                lines.append(line)
    # Avoid emitting a lone blank line when no row passed the filters.
    if lines:
        print("\n".join(lines))
def write_csv(header, contents, sep=",", filename="stdout", thousands=False,
              tee=False, align=True, comment=False):
    """
    Write rows formatted by `load_csv` (aligned with the column headers
    when `align` is set) to `filename`.

    With `comment`, the first character of the header row is replaced by
    '#'.  With `tee`, the text is also printed when `filename` is not
    "stdout".

    Example (column padding shown is illustrative):

    >>> header = ["x_value", "y_value"]
    >>> contents = [(1, 100), (2, 200)]
    >>> write_csv(header, contents)
    x_value, y_value
          1,     100
          2,     200
    """
    from maize.formats.base import must_open

    rows = load_csv(header, contents,
                    sep=sep, thousands=thousands, align=align)
    if comment:
        rows[0] = '#' + rows[0][1:]
    text = "\n".join(rows)

    sink = must_open(filename, "w")
    sink.write(text + "\n")
    if tee and filename != "stdout":
        print(text)
def validate(args):
    """
    %prog validate input.vcf genome.fasta

    Fasta validation of vcf file.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    # Counts how often REF (or else ALT) equals the genome base at POS and
    # logs both proportions over all non-header rows.
    import pyfasta

    p = OptionParser(validate.__doc__)
    p.add_option("--prefix", help="Add prefix to seqid")
    opts, args = p.parse_args(args)

    # Two positionals are unpacked below; show usage instead of a traceback.
    if len(args) != 2:
        sys.exit(not p.print_help())

    vcffile, fastafile = args
    pf = opts.prefix
    genome = pyfasta.Fasta(fastafile, record_class=pyfasta.MemoryRecord)
    fp = must_open(vcffile)
    match_ref = match_alt = total = 0
    for row in fp:
        if row.startswith('#'):
            continue
        seqid, pos, vid, ref, alt = row.split()[:5]
        total += 1
        if pf:
            seqid = pf + seqid
        pos = int(pos)
        # rows on unknown sequences still count towards `total`
        if seqid not in genome:
            continue
        true_ref = genome[seqid][pos - 1]
        if total % 100000 == 0:
            print(total, "sites parsed", file=sys.stderr)
        if ref == true_ref:
            match_ref += 1
        elif alt == true_ref:
            match_alt += 1

    logging.debug("Match REF: {}".format(percentage(match_ref, total)))
    logging.debug("Match ALT: {}".format(percentage(match_alt, total)))
def bed(args):
    """
    %prog bed blastfile

    Print out bed file based on coordinates in BLAST report. By default, write
    out subject positions. Use --swap to write query positions.
    """
    # NOTE: the docstring above doubles as the command-line usage text.
    # Returns the name of the sorted bed file that was written.
    from jcvi.formats.bed import sort as bed_sort

    p = OptionParser(bed.__doc__)
    p.add_option("--swap", default=False, action="store_true",
                 help="Write query positions [default: %default]")

    opts, args = p.parse_args(args)

    if len(args) != 1:
        # `not` makes the exit status non-zero, matching sibling commands
        sys.exit(not p.print_help())

    blastfile, = args
    swap = opts.swap

    fp = must_open(blastfile)
    # foo.blast -> foo.bed; any other name just gains a .bed suffix
    bedfile = "{0}.bed".format(blastfile.rsplit(".", 1)[0]) \
        if blastfile.endswith(".blast") \
        else "{0}.bed".format(blastfile)
    fw = open(bedfile, "w")
    for row in fp:
        b = BlastLine(row)
        if swap:
            b = b.swapped
        print(b.bedline, file=fw)

    logging.debug("File written to `{0}`.".format(bedfile))
    fw.close()
    bed_sort([bedfile, "-i"])

    return bedfile