def filter(args):
    """
    %prog filter consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(filter.__doc__)
    p.add_option("--minsize", default=10, type="int",
                 help="Minimum cluster size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    minsize = opts.minsize
    f = Fasta(fastafile, lazy=True)
    fw = must_open(opts.outfile, "w")
    for desc, rec in f.iterdescriptions_ordered():
        if desc.startswith("singleton"):
            continue
        # consensus_for_cluster_0 with 63 sequences
        name, w, size, seqs = desc.split()
        assert w == "with"
        size = int(size)
        if size < minsize:
            continue
        SeqIO.write(rec, fw, "fasta")
def filterm4(args):
    """
    %prog filterm4 sample.m4 > filtered.m4

    Filter .m4 file after blasr is run. As blasr takes a long time to run,
    changing -bestn is undesirable. This screens the m4 file to retain top hits.
    """
    p = OptionParser(filterm4.__doc__)
    p.add_option("--best", default=1, type="int",
                 help="Only retain best N hits")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    m4file, = args
    best = opts.best
    fp = open(m4file)
    fw = must_open(opts.outfile, "w")
    seen = defaultdict(int)
    retained = total = 0
    for row in fp:
        r = M4Line(row)
        total += 1
        if total % 100000 == 0:
            logging.debug("Retained {0} lines".
                          format(percentage(retained, total)))
        if seen.get(r.query, 0) < best:
            fw.write(row)
            seen[r.query] += 1
            retained += 1
    fw.close()
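# A minimal, runnable sketch of the best-N screening idea used by filterm4()
# above, on toy (query, hit) tuples rather than real .m4 lines. The row format
# here is hypothetical; only the per-query counting logic is illustrated, and
# rows are assumed to arrive pre-sorted best-first, as blasr emits them.
def _demo_best_n(rows, best=1):
    from collections import defaultdict
    seen = defaultdict(int)
    retained = []
    for query, hit in rows:
        if seen[query] < best:  # keep at most `best` hits per query
            retained.append((query, hit))
            seen[query] += 1
    return retained

# _demo_best_n([("q1", "h1"), ("q1", "h2"), ("q2", "h1")]) == [("q1", "h1"), ("q2", "h1")]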
def compile(args):
    """
    %prog compile directory

    Extract telomere length and ccn.
    """
    p = OptionParser(compile.__doc__)
    p.set_outfile(outfile="age.tsv")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    dfs = []
    for folder in args:
        ofolder = os.listdir(folder)

        # telomeres
        subdir = [x for x in ofolder if x.startswith("telomeres")][0]
        subdir = op.join(folder, subdir)
        filename = op.join(subdir, "tel_lengths.txt")
        df = pd.read_csv(filename, sep="\t")
        d1 = df.iloc[0].to_dict()  # .ix is removed in modern pandas

        # ccn
        subdir = [x for x in ofolder if x.startswith("ccn")][0]
        subdir = op.join(folder, subdir)
        filename = iglob(subdir, "*.ccn.json")[0]
        js = json.load(open(filename))
        d1.update(js)
        df = pd.DataFrame(d1, index=[0])
        dfs.append(df)

    df = pd.concat(dfs, ignore_index=True)
    df.to_csv(opts.outfile, sep="\t", index=False)
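# A minimal sketch (requires pandas) of how compile() accumulates one
# single-row DataFrame per sample and concatenates them at the end. The dicts
# below are toy stand-ins for the telomere/ccn records.
def _demo_concat(dicts):
    import pandas as pd
    dfs = [pd.DataFrame(d, index=[0]) for d in dicts]
    return pd.concat(dfs, ignore_index=True)

# _demo_concat([{"tel": 1}, {"tel": 2}]).shape == (2, 1)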
def nmd(args):
    """
    %prog nmd gffile

    Identify transcript variants which might be candidates for nonsense
    mediated decay (NMD)

    A transcript is considered to be a candidate for NMD when the CDS stop
    codon is located more than 50nt upstream of the terminal splice site donor

    References:
    http://www.nature.com/horizon/rna/highlights/figures/s2_spec1_f3.html
    http://www.biomedcentral.com/1741-7007/7/23/figure/F1
    """
    import builtins
    from jcvi.utils.cbook import enumerate_reversed

    p = OptionParser(nmd.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args
    gff = make_index(gffile)

    fw = must_open(opts.outfile, "w")
    for gene in gff.features_of_type('gene', order_by=('seqid', 'start')):
        _enumerate = builtins.enumerate if gene.strand == "-" else enumerate_reversed
        for mrna in gff.children(gene, featuretype='mRNA', order_by=('start')):
            tracker = dict()
            tracker['exon'] = list(gff.children(mrna, featuretype='exon', order_by=('start')))
            tracker['cds'] = [None] * len(tracker['exon'])

            tcds_pos = None
            for i, exon in _enumerate(tracker['exon']):
                for cds in gff.region(region=exon, featuretype='CDS', completely_within=True):
                    if mrna.id in cds['Parent']:
                        tracker['cds'][i] = cds
                        tcds_pos = i
                        break
                if tcds_pos is not None:  # bare `if tcds_pos` would miss index 0
                    break

            NMD, distance = False, 0
            if (mrna.strand == "+" and tcds_pos + 1 < len(tracker['exon'])) \
                    or (mrna.strand == "-" and tcds_pos - 1 >= 0):
                tcds = tracker['cds'][tcds_pos]
                texon = tracker['exon'][tcds_pos]

                PTC = tcds.end if mrna.strand == '+' else tcds.start
                TDSS = texon.end if mrna.strand == '+' else texon.start

                distance = abs(TDSS - PTC)
                NMD = True if distance > 50 else False

            print("\t".join(str(x) for x in (gene.id, mrna.id,
                  gff.children_bp(mrna, child_featuretype='CDS'),
                  distance, NMD)), file=fw)

    fw.close()
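# A toy illustration of the 50-nt rule applied in nmd(): a transcript is
# flagged when the premature stop (PTC) lies more than 50 nt away from the
# terminal splice donor site (TDSS). The coordinates below are made up.
def _demo_nmd_rule(ptc, tdss):
    distance = abs(tdss - ptc)
    return distance > 50, distance

# _demo_nmd_rule(ptc=1000, tdss=1100) == (True, 100)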
def mstmap(args):
    """
    %prog mstmap LMD50.snps.genotype.txt

    Convert LMDs to MSTMAP input.
    """
    from jcvi.assembly.geneticmap import MSTMatrix

    p = OptionParser(mstmap.__doc__)
    p.add_option("--population_type", default="RIL6",
                 help="Type of population, possible values are DH and RILd")
    p.add_option("--missing_threshold", default=.5,
                 help="Missing threshold, .25 excludes any marker with >25% missing")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    lmd, = args
    fp = open(lmd)
    next(fp)  # Header
    table = {"0": "-", "1": "A", "2": "B", "3": "X"}
    mh = ["locus_name"] + next(fp).split()[4:]

    genotypes = []
    for row in fp:
        atoms = row.split()
        chr, pos, ref, alt = atoms[:4]
        locus_name = ".".join((chr, pos))
        codes = [table[x] for x in atoms[4:]]
        genotypes.append([locus_name] + codes)

    mm = MSTMatrix(genotypes, mh, opts.population_type, opts.missing_threshold)
    mm.write(opts.outfile, header=True)
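# A small sketch of the genotype recoding done in mstmap(): numeric LMD codes
# map to MSTMAP symbols via a lookup table. The marker row is a toy example,
# not real data.
def _demo_recode(codes):
    table = {"0": "-", "1": "A", "2": "B", "3": "X"}
    return [table[x] for x in codes]

# _demo_recode(["1", "2", "0", "3"]) == ["A", "B", "-", "X"]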
def mergemat(args):
    """
    %prog mergemat *.npy

    Combine counts from multiple .npy data files.
    """
    p = OptionParser(mergemat.__doc__)
    p.set_outfile(outfile="out")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    npyfiles = args
    A = np.load(npyfiles[0])
    logging.debug("Load `{}`: matrix of shape {}; sum={}"
                  .format(npyfiles[0], A.shape, A.sum()))
    for npyfile in npyfiles[1:]:
        B = np.load(npyfile)
        A += B
        logging.debug("Load `{}`: sum={}"
                      .format(npyfile, A.sum()))

    pf = opts.outfile
    np.save(pf, A)
    logging.debug("Combined {} files into `{}.npy`".format(len(npyfiles), pf))
def fix(args):
    """
    %prog fix ahrd.csv > ahrd.fixed.csv

    Fix ugly names from Uniprot.
    """
    p = OptionParser(fix.__doc__)
    p.add_option("--ignore_sym_pat", default=False, action="store_true",
                 help="Do not fix names matching symbol patterns i.e."
                 " names beginning or ending with gene symbols or a series of numbers."
                 " e.g. `ARM repeat superfamily protein`, `beta-hexosaminidase 3`,"
                 " `CYCLIN A3;4`, `WALL ASSOCIATED KINASE (WAK)-LIKE 10`")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    csvfile, = args
    fp = open(csvfile)
    fw = must_open(opts.outfile, "w")
    for row in fp:
        if row[0] == '#':
            continue
        if row.strip() == "":
            continue
        atoms = row.rstrip("\r\n").split("\t")
        name, hit, ahrd_code, desc = atoms[:4] \
            if len(atoms) > 2 else \
            (atoms[0], None, None, atoms[-1])
        newdesc = fix_text(desc, ignore_sym_pat=opts.ignore_sym_pat)
        if hit and hit.strip() != "" and newdesc == Hypothetical:
            newdesc = "conserved " + newdesc
        print("\t".join(atoms[:4] + [newdesc] + atoms[4:]), file=fw)
def first(args):
    """
    %prog first N fastqfile(s)

    Get first N reads from file.
    """
    from jcvi.apps.base import need_update

    p = OptionParser(first.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    N = int(args[0])
    nlines = N * 4
    fastqfiles = args[1:]
    fastqfile = fastqfiles[0]
    outfile = opts.outfile
    if not need_update(fastqfiles, outfile):
        logging.debug("File `{0}` exists. Will not overwrite.".format(outfile))
        return

    gz = fastqfile.endswith(".gz")
    for fastqfile in fastqfiles:
        if gz:
            cmd = "zcat {0} | head -n {1}".format(fastqfile, nlines)
        else:
            cmd = "head -n {0} {1}".format(nlines, fastqfile)

        sh(cmd, outfile=opts.outfile, append=True)
def freq(args):
    """
    %prog freq fastafile bamfile

    Call SNP frequencies and generate GFF file.
    """
    p = OptionParser(freq.__doc__)
    p.add_option("--mindepth", default=3, type="int",
                 help="Minimum depth [default: %default]")
    p.add_option("--minqual", default=20, type="int",
                 help="Minimum quality [default: %default]")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, bamfile = args
    cmd = "freebayes -f {0} --pooled-continuous {1}".format(fastafile, bamfile)
    cmd += " -F 0 -C {0}".format(opts.mindepth)
    cmd += ' | vcffilter -f "QUAL > {0}"'.format(opts.minqual)
    cmd += " | vcfkeepinfo - AO RO TYPE"
    sh(cmd, outfile=opts.outfile)
def insertion(args):
    """
    %prog insertion mic.mac.bed

    Find IES based on mapping MIC reads to MAC genome. Output a bedfile with
    'lesions' (stack of broken reads) in the MAC genome.
    """
    p = OptionParser(insertion.__doc__)
    p.add_option("--mindepth", default=6, type="int",
                 help="Minimum depth to call an insertion")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    mindepth = opts.mindepth
    bed = Bed(bedfile)
    fw = must_open(opts.outfile, "w")
    for seqid, feats in bed.sub_beds():
        left_ends = Counter([x.start for x in feats])
        right_ends = Counter([x.end for x in feats])
        selected = []
        for le, count in left_ends.items():
            if count >= mindepth:
                selected.append((seqid, le, "LE-{0}".format(le), count))
        for re, count in right_ends.items():
            if count >= mindepth:
                selected.append((seqid, re, "RE-{0}".format(re), count))
        selected.sort()
        for seqid, pos, label, count in selected:
            label = "{0}-r{1}".format(label, count)
            print("\t".join((seqid, str(pos - 1), str(pos), label)), file=fw)
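# A self-contained sketch of the lesion-calling step in insertion(): count
# read ends per position with collections.Counter and keep positions whose
# pile-up reaches mindepth. Toy coordinates only, not real BED features.
def _demo_lesions(starts, mindepth=6):
    from collections import Counter
    left_ends = Counter(starts)
    return sorted(pos for pos, count in left_ends.items() if count >= mindepth)

# _demo_lesions([100] * 6 + [250] * 2) == [100]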
def merge(args):
    """
    %prog merge ref.fasta query.fasta *.delta

    Merge delta files into a single delta.
    """
    p = OptionParser(merge.__doc__)
    p.set_outfile(outfile="merged_results.delta")
    opts, args = p.parse_args(args)

    if len(args) < 3:
        sys.exit(not p.print_help())

    ref, query = args[:2]
    deltafiles = args[2:]
    outfile = opts.outfile

    ref = get_abs_path(ref)
    query = get_abs_path(query)
    fw = must_open(outfile, "w")
    print(" ".join((ref, query)), file=fw)
    print("NUCMER", file=fw)
    fw.close()

    for d in deltafiles:
        cmd = "awk 'NR > 2 {{print $0}}' {0}".format(d)
        sh(cmd, outfile=outfile, append=True)
def mergecsv(args):
    """
    %prog mergecsv *.tsv

    Merge a set of tsv files.
    """
    p = OptionParser(mergecsv.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    tsvfiles = args
    outfile = opts.outfile

    if op.exists(outfile):
        os.remove(outfile)

    fw = must_open(opts.outfile, "w")
    for i, tsvfile in enumerate(tsvfiles):
        fp = open(tsvfile)
        if i > 0:
            next(fp)
        for row in fp:
            fw.write(row)
    fw.close()
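# A minimal sketch of the header handling in mergecsv(): the header line is
# kept from the first file and skipped in every later one. This version
# operates on lists of lines instead of open file handles.
def _demo_merge_rows(files):
    merged = []
    for i, lines in enumerate(files):
        merged.extend(lines[1:] if i > 0 else lines)
    return merged

# _demo_merge_rows([["h", "a"], ["h", "b"]]) == ["h", "a", "b"]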
def agp(args):
    """
    %prog agp main_results/ contigs.fasta

    Generate AGP file based on LACHESIS output.
    """
    p = OptionParser(agp.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    odir, contigsfasta = args
    fwagp = must_open(opts.outfile, 'w')
    orderingfiles = natsorted(iglob(odir, "*.ordering"))
    sizes = Sizes(contigsfasta).mapping
    contigs = set(sizes.keys())
    anchored = set()

    for ofile in orderingfiles:
        co = ContigOrdering(ofile)
        anchored |= set([x.contig_name for x in co])
        obj = op.basename(ofile).split('.')[0]
        co.write_agp(obj, sizes, fwagp)

    singletons = contigs - anchored
    logging.debug('Anchored: {}, Singletons: {}'.
                  format(len(anchored), len(singletons)))

    for s in natsorted(singletons):
        order_to_agp(s, [(s, "?")], sizes, fwagp)
def trimUTR(args):
    """
    %prog trimUTR gffile

    Remove UTRs in the annotation set.
    """
    p = OptionParser(trimUTR.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gffile, = args
    g = make_index(gffile)
    gff = Gff(gffile)
    mRNA_register = {}
    fw = must_open(opts.outfile, "w")
    for c in gff:
        cid, ctype = c.accn, c.type
        if ctype == "gene":
            start, end = get_cds_minmax(g, cid)
            trim(c, start, end)
        elif ctype == "mRNA":
            start, end = get_cds_minmax(g, cid, level=1)
            trim(c, start, end)
            mRNA_register[cid] = (start, end)
        elif ctype != "CDS":
            start, end = mRNA_register[c.parent]
            trim(c, start, end)
        if c.start > c.end:
            print(cid, "destroyed [{0} > {1}]".format(c.start, c.end),
                  file=sys.stderr)
        else:
            print(c, file=fw)
def group(args):
    """
    %prog group anchorfiles

    Group the anchors into ortho-groups. Can input multiple anchor files.
    """
    p = OptionParser(group.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    anchorfiles = args
    groups = Grouper()
    for anchorfile in anchorfiles:
        ac = AnchorFile(anchorfile)
        for a, b, idx in ac.iter_pairs():
            groups.join(a, b)

    logging.debug("Created {0} groups with {1} members.".
                  format(len(groups), groups.num_members))

    outfile = opts.outfile
    fw = must_open(outfile, "w")
    for g in groups:
        print(",".join(sorted(g)), file=fw)
    fw.close()

    return outfile
def silicosoma(args):
    """
    %prog silicosoma in.silico > out.soma

    Convert .silico to .soma file.

    Format of .silico
        A text file containing in-silico digested contigs. This file contains
        pairs of lines. The first line in each pair contains an identifier,
        the contig length in bp, and the number of restriction sites,
        separated by white space. The second line contains a white space
        delimited list of the restriction site positions.

    Format of .soma
        Each line of the text file contains two decimal numbers: The size of
        the fragment and the standard deviation (both in kb), separated by
        white space. The standard deviation is ignored.
    """
    p = OptionParser(silicosoma.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (silicofile,) = args
    fp = must_open(silicofile)
    fw = must_open(opts.outfile, "w")
    next(fp)
    positions = [int(x) for x in next(fp).split()]
    for a, b in pairwise(positions):
        assert a <= b
        fragsize = int(round((b - a) / 1000.0))  # kb
        if fragsize:
            print(fragsize, 0, file=fw)
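# A runnable sketch of the fragment-size math in silicosoma(): adjacent
# restriction-site positions (bp) become fragment lengths rounded to kb, and
# zero-kb fragments are dropped. zip() over shifted lists stands in for the
# pairwise() helper; positions are toy values.
def _demo_fragsizes(positions):
    sizes = [int(round((b - a) / 1000.0))
             for a, b in zip(positions, positions[1:])]
    return [x for x in sizes if x]

# _demo_fragsizes([0, 1500, 1600, 4200]) == [2, 3]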
def dump(args):
    """
    %prog dump fastafile

    Convert FASTA sequences to list of K-mers.
    """
    p = OptionParser(dump.__doc__)
    p.add_option("-K", default=23, type="int",
                 help="K-mer size [default: %default]")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    K = opts.K
    fw = must_open(opts.outfile, "w")
    f = Fasta(fastafile, lazy=True)
    for name, rec in f.iteritems_ordered():
        kmers = list(make_kmers(rec.seq, K))
        print("\n".join(kmers), file=fw)
    fw.close()
def gcn(args):
    """
    %prog gcn gencode.v26.exonunion.bed data/*.vcf.gz

    Compile gene copy number based on CANVAS results.
    """
    p = OptionParser(gcn.__doc__)
    p.set_cpus()
    p.set_tmpdir(tmpdir="tmp")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    exonbed = args[0]
    canvasvcfs = args[1:]
    tsvfile = opts.outfile

    tmpdir = opts.tmpdir
    mkdir(tmpdir)
    set_tempdir(tmpdir)

    df = vcf_to_df(canvasvcfs, exonbed, opts.cpus)
    for suffix in (".avgcn", ".medcn"):
        df_to_tsv(df, tsvfile, suffix)
def index(args):
    """
    %prog index bedfile

    Compress the sorted bedfile and index it using `tabix`.
    """
    p = OptionParser(index.__doc__)
    p.add_option("--query", help="Chromosome location [default: %default]")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    gzfile = bedfile + ".gz"
    if need_update(bedfile, gzfile):
        bedfile = sort([bedfile])
        cmd = "bgzip -c {0}".format(bedfile)
        sh(cmd, outfile=gzfile)

    tbifile = gzfile + ".tbi"
    if need_update(gzfile, tbifile):
        cmd = "tabix -p bed {0}".format(gzfile)
        sh(cmd)

    query = opts.query
    if not query:
        return

    cmd = "tabix {0} {1}".format(gzfile, query)
    sh(cmd, outfile=opts.outfile)
def bincount(args):
    """
    %prog bincount fastafile binfile

    Count K-mers in the bin.
    """
    from bitarray import bitarray
    from jcvi.formats.sizes import Sizes

    p = OptionParser(bincount.__doc__)
    p.add_option("-K", default=23, type="int", help="K-mer size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, binfile = args
    K = opts.K

    fp = open(binfile)
    a = bitarray()
    a.fromfile(fp)
    f = Sizes(fastafile)
    tsize = 0
    fw = must_open(opts.outfile, "w")
    for name, seqlen in f.iter_sizes():
        ksize = seqlen - K + 1
        b = a[tsize:tsize + ksize]
        bcount = b.count()
        print("\t".join(str(x) for x in (name, bcount)), file=fw)
        tsize += ksize
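# A sketch of the per-sequence windowing in bincount(), using a plain list of
# booleans in place of a bitarray so it runs with the standard library alone.
# Each sequence of length L contributes L - K + 1 K-mer slots; summing set
# bits per window yields the per-contig counts.
def _demo_bincount(bits, seqlens, K=23):
    counts, tsize = [], 0
    for seqlen in seqlens:
        ksize = seqlen - K + 1
        counts.append(sum(bits[tsize:tsize + ksize]))
        tsize += ksize
    return counts

# _demo_bincount([True, False, True, True, True], [25, 24], K=23) == [2, 2]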
def uniq(args):
    """
    %prog uniq fastqfile

    Retain only first instance of duplicate reads. Duplicate is defined as
    having the same read name.
    """
    p = OptionParser(uniq.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    fw = must_open(opts.outfile, "w")
    nduplicates = nreads = 0
    seen = set()
    for rec in iter_fastq(fastqfile):
        nreads += 1
        if rec is None:
            break
        name = rec.name
        if name in seen:
            nduplicates += 1
            continue
        seen.add(name)
        print(rec, file=fw)
    logging.debug("Removed duplicate reads: {}".
                  format(percentage(nduplicates, nreads)))
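# The dedup rule in uniq() in isolation: the first occurrence of each read
# name wins. A set of seen names suffices because the scan preserves input
# order. Names below are toy values.
def _demo_uniq(names):
    seen, kept = set(), []
    for name in names:
        if name in seen:
            continue
        seen.add(name)
        kept.append(name)
    return kept

# _demo_uniq(["r1", "r2", "r1"]) == ["r1", "r2"]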
def fromaligns(args):
    """
    %prog fromaligns out.aligns

    Convert aligns file (old MCscan output) to anchors file.
    """
    p = OptionParser(fromaligns.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    alignsfile, = args
    fp = must_open(alignsfile)
    fw = must_open(opts.outfile, "w")
    for row in fp:
        if row.startswith("## Alignment"):
            print("###", file=fw)
            continue
        if row[0] == '#' or not row.strip():
            continue
        atoms = row.split(':')[-1].split()
        print("\t".join(atoms[:2]), file=fw)
    fw.close()
def mini(args):
    """
    %prog mini bamfile minibamfile

    Prepare mini-BAMs that contain only the STR loci.
    """
    p = OptionParser(mini.__doc__)
    p.add_option("--pad", default=20000, type="int",
                 help="Add padding to the STR regions")
    p.add_option("--treds", default=None,
                 help="Extract specific treds, use comma to separate")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, minibam = args
    treds = opts.treds.split(",") if opts.treds else None
    pad = opts.pad
    bedfile = make_STR_bed(pad=pad, treds=treds)

    get_minibam_bed(bamfile, bedfile, minibam)
    logging.debug("Mini-BAM written to `{}`".format(minibam))
def cat(args):
    """
    %prog cat *.pdf -o output.pdf

    Concatenate pages from pdf files into a single pdf file.

    Page ranges refer to the previously-named file. A file not followed by a
    page range means all the pages of the file.

    PAGE RANGES are like Python slices. {page_range_help}

    EXAMPLES
        pdfcat -o output.pdf head.pdf content.pdf :6 7: tail.pdf -1
            Concatenate all of head.pdf, all but page seven of content.pdf,
            and the last page of tail.pdf, producing output.pdf.

        pdfcat chapter*.pdf >book.pdf
            You can specify the output file by redirection.

        pdfcat chapter?.pdf chapter10.pdf >book.pdf
            In case you don't want chapter 10 before chapter 2.
    """
    p = OptionParser(cat.__doc__.format(page_range_help=PAGE_RANGE_HELP))
    p.add_option("--nosort", default=False, action="store_true",
                 help="Do not sort file names")
    p.set_outfile()
    p.set_verbose(help="Show page ranges as they are being read")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    outfile = opts.outfile
    if outfile in args:
        args.remove(outfile)

    if not opts.nosort:
        args = natsorted(args)

    filename_page_ranges = parse_filename_page_ranges(args)
    verbose = opts.verbose
    fw = must_open(outfile, "wb")

    merger = PdfFileMerger()
    in_fs = {}
    try:
        for (filename, page_range) in filename_page_ranges:
            if verbose:
                print(filename, page_range, file=sys.stderr)
            if filename not in in_fs:
                in_fs[filename] = open(filename, "rb")
            merger.append(in_fs[filename], pages=page_range)
    except Exception:
        print(traceback.format_exc(), file=sys.stderr)
        print("Error while reading " + filename, file=sys.stderr)
        sys.exit(1)
    merger.write(fw)
    fw.close()
def augustus(args):
    """
    %prog augustus augustus.gff3 > reformatted.gff3

    AUGUSTUS does generate a gff3 (--gff3=on) but it needs some refinement.
    """
    from jcvi.formats.gff import Gff

    p = OptionParser(augustus.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    ingff3, = args
    gff = Gff(ingff3)
    fw = must_open(opts.outfile, "w")
    seen = defaultdict(int)
    for g in gff:
        if g.type not in ("gene", "transcript", "CDS"):
            continue

        if g.type == "transcript":
            g.type = "mRNA"

        prefix = g.seqid + "_"
        pid = prefix + g.id
        newid = "{0}-{1}".format(pid, seen[pid]) if pid in seen else pid
        seen[pid] += 1
        g.attributes["ID"] = [newid]
        g.attributes["Parent"] = [(prefix + x) for x in g.attributes["Parent"]]
        g.update_attributes()
        print(g, file=fw)
    fw.close()
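# A sketch of the ID de-duplication scheme in augustus(): IDs get a seqid
# prefix, and repeated IDs get a "-<n>" suffix from a running counter. The
# seqid and ID strings below are made up.
def _demo_rename(seqid, ids):
    from collections import defaultdict
    seen, out = defaultdict(int), []
    for id_ in ids:
        pid = seqid + "_" + id_
        out.append("{0}-{1}".format(pid, seen[pid]) if pid in seen else pid)
        seen[pid] += 1
    return out

# _demo_rename("chr1", ["g1", "g1"]) == ["chr1_g1", "chr1_g1-1"]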
def suffix(args):
    """
    %prog suffix fastqfile CAG

    Filter reads based on suffix.
    """
    from jcvi.utils.cbook import percentage

    p = OptionParser(suffix.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastqfile, sf = args
    fw = must_open(opts.outfile, "w")
    nreads = nselected = 0
    for rec in iter_fastq(fastqfile):
        nreads += 1
        if rec is None:
            break
        if rec.seq.endswith(sf):
            print(rec, file=fw)
            nselected += 1
    logging.debug("Selected reads with suffix {0}: {1}".
                  format(sf, percentage(nselected, nreads)))
def digest(args):
    """
    %prog digest fastafile NspI,BfuCI

    Digest fasta sequences to map restriction site positions.
    """
    p = OptionParser(digest.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, enzymes = args
    enzymes = enzymes.split(",")
    enzymes = [x for x in AllEnzymes if str(x) in enzymes]
    f = Fasta(fastafile, lazy=True)
    fw = must_open(opts.outfile, "w")

    header = ["Contig", "Length"] + [str(x) for x in enzymes]
    print("\t".join(header), file=fw)
    for name, rec in f.iteritems_ordered():
        row = [name, len(rec)]
        for e in enzymes:
            pos = e.search(rec.seq)
            pos = "na" if not pos else "|".join(str(x) for x in pos)
            row.append(pos)
        print("\t".join(str(x) for x in row), file=fw)
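# The position formatting used by digest(), shown on toy data: an empty hit
# list becomes "na", otherwise positions are joined with "|". No Biopython
# restriction enzymes are involved in this sketch.
def _demo_positions(pos):
    return "na" if not pos else "|".join(str(x) for x in pos)

# _demo_positions([]) == "na"; _demo_positions([3, 17]) == "3|17"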
def gff(args):
    """
    %prog gff pslfile

    Convert to gff format.
    """
    p = OptionParser(gff.__doc__)
    p.add_option("--source", default="GMAP",
                 help="specify GFF source [default: %default]")
    p.add_option("--type", default="EST_match",
                 help="specify GFF feature type [default: %default]")
    p.add_option("--suffix", default=".match",
                 help="match ID suffix [default: \"%default\"]")
    p.add_option("--swap", default=False, action="store_true",
                 help="swap query and target features [default: %default]")
    p.add_option("--simple_score", default=False, action="store_true",
                 help="calculate a simple percent score [default: %default]")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    pslfile, = args
    fw = must_open(opts.outfile, "w")

    print("##gff-version 3", file=fw)
    psl = Psl(pslfile)
    for p in psl:
        if opts.swap:
            p.swap  # swaps query and target in place

        psl.trackMatches(p.qName)
        # switch from 0-origin to 1-origin
        p.qStart += 1
        p.tStart += 1

        print(p.gffline(source=opts.source, type=opts.type, suffix=opts.suffix,
                        primary_tag="ID", alt_score=opts.simple_score,
                        count=psl.getMatchCount(p.qName)), file=fw)

        # create an empty PslLine() object and load only
        # the targetName, queryName and strand info
        part = PslLine("\t".join(str(x) for x in [0] * p.nargs))
        part.tName, part.qName, part.strand = p.tName, p.qName, p.strand

        nparts = len(p.qStarts)
        for n in range(nparts):
            part.qStart, part.tStart, aLen = \
                p.qStarts[n] + 1, p.tStarts[n] + 1, p.blockSizes[n]
            part.qEnd = part.qStart + aLen - 1
            part.tEnd = part.tStart + aLen - 1

            if part.strand == "-":
                part.qStart = p.qSize - (p.qStarts[n] + p.blockSizes[n]) + 1
                part.qEnd = p.qSize - p.qStarts[n]

            print(part.gffline(source=opts.source, suffix=opts.suffix,
                               count=psl.getMatchCount(part.qName)), file=fw)
def coverage(args):
    """
    %prog coverage fastafile bamfile

    Calculate coverage for BAM file. BAM file will be sorted unless with
    --nosort.
    """
    p = OptionParser(coverage.__doc__)
    p.add_option(
        "--format", default="bigwig",
        choices=("bedgraph", "bigwig", "coverage"),
        help="Output format",
    )
    p.add_option("--nosort", default=False, action="store_true",
                 help="Do not sort BAM")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, bamfile = args
    format = opts.format
    if opts.nosort:
        logging.debug("BAM sorting skipped")
    else:
        bamfile = index([bamfile, "--fasta={0}".format(fastafile)])

    pf = bamfile.rsplit(".", 2)[0]
    sizesfile = Sizes(fastafile).filename
    cmd = "genomeCoverageBed -ibam {0} -g {1}".format(bamfile, sizesfile)
    if format in ("bedgraph", "bigwig"):
        cmd += " -bg"
        bedgraphfile = pf + ".bedgraph"
        sh(cmd, outfile=bedgraphfile)

        if format == "bedgraph":
            return bedgraphfile

        bigwigfile = pf + ".bigwig"
        cmd = "bedGraphToBigWig {0} {1} {2}".format(
            bedgraphfile, sizesfile, bigwigfile)
        sh(cmd)
        return bigwigfile

    coveragefile = pf + ".coverage"
    if need_update(fastafile, coveragefile):
        sh(cmd, outfile=coveragefile)

    gcf = GenomeCoverageFile(coveragefile)
    fw = must_open(opts.outfile, "w")
    for seqid, cov in gcf.iter_coverage_seqid():
        print("\t".join((seqid, "{0:.1f}".format(cov))), file=fw)
    fw.close()
def bed(args):
    """
    %prog bed anchorsfile

    Convert ANCHORS file to BED format.
    """
    from collections import defaultdict
    from jcvi.compara.synteny import AnchorFile, check_beds
    from jcvi.formats.bed import Bed
    from jcvi.formats.base import get_number

    p = OptionParser(bed.__doc__)
    p.add_option("--switch", default=False, action="store_true",
                 help="Switch reference and aligned map elements")
    p.add_option("--scale", type="float",
                 help="Scale the aligned map distance by factor")
    p.set_beds()
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    anchorsfile, = args
    switch = opts.switch
    scale = opts.scale
    ac = AnchorFile(anchorsfile)
    pairs = defaultdict(list)
    for a, b, block_id in ac.iter_pairs():
        pairs[a].append(b)

    qbed, sbed, qorder, sorder, is_self = check_beds(anchorsfile, p, opts)
    bd = Bed()
    for q in qbed:
        qseqid, qstart, qend, qaccn = q.seqid, q.start, q.end, q.accn
        if qaccn not in pairs:
            continue
        for s in pairs[qaccn]:
            si, s = sorder[s]
            sseqid, sstart, send, saccn = s.seqid, s.start, s.end, s.accn
            if switch:
                qseqid, sseqid = sseqid, qseqid
                qstart, sstart = sstart, qstart
                qend, send = send, qend
                qaccn, saccn = saccn, qaccn
            if scale:
                sstart /= scale
            try:
                newsseqid = get_number(sseqid)
            except ValueError:
                raise ValueError("`{0}` is on `{1}` with no number to extract".
                                 format(saccn, sseqid))
            bedline = "\t".join(str(x) for x in
                                (qseqid, qstart - 1, qend,
                                 "{0}:{1}".format(newsseqid, sstart)))
            bd.add(bedline)

    bd.print_to_file(filename=opts.outfile, sorted=True)
def prepare(args):
    """
    %prog prepare pairsfile cdsfile [pepfile] -o paired.cds.fasta

    Pick sequences from cdsfile to form pairs, ready to be calculated. The
    pairsfile can be generated from formats.blast.cscore(). The first two
    columns contain the pair.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(prepare.__doc__)
    p.set_outfile()

    opts, args = p.parse_args(args)
    outfile = opts.outfile

    if len(args) == 2:
        pairsfile, cdsfile = args
        pepfile = None
    elif len(args) == 3:
        pairsfile, cdsfile, pepfile = args
    else:
        sys.exit(not p.print_help())

    f = Fasta(cdsfile)
    fp = open(pairsfile)
    fw = must_open(outfile, "w")
    if pepfile:
        assert outfile != "stdout", "Please specify outfile name."
        f2 = Fasta(pepfile)
        fw2 = must_open(outfile + ".pep", "w")
    for row in fp:
        if row[0] == '#':
            continue
        a, b = row.split()[:2]
        if a == b:
            logging.debug("Self pairs found: {0} - {1}. Ignored".format(a, b))
            continue

        if a not in f:
            a = find_first_isoform(a, f)
            assert a, a
        if b not in f:
            b = find_first_isoform(b, f)
            assert b, b

        acds = f[a]
        bcds = f[b]
        SeqIO.write((acds, bcds), fw, "fasta")
        if pepfile:
            apep = f2[a]
            bpep = f2[b]
            SeqIO.write((apep, bpep), fw2, "fasta")
    fw.close()
    if pepfile:
        fw2.close()
def flanking(args):
    """
    %prog flanking bedfile [options]

    Get up to n features (upstream or downstream or both) flanking a given
    position.
    """
    from numpy import array, argsort

    p = OptionParser(flanking.__doc__)
    p.add_option("--chrom", default=None, type="string",
                 help="chrom name of the position in query. Make sure it matches bedfile.")
    p.add_option("--coord", default=None, type="int",
                 help="coordinate of the position in query.")
    p.add_option("-n", default=10, type="int",
                 help="number of flanking features to get [default: %default]")
    p.add_option("--side", default="both", choices=("upstream", "downstream", "both"),
                 help="which side to get flanking features [default: %default]")
    p.add_option("--max_d", default=None, type="int",
                 help="features <= max_d away from position [default: %default]")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if any([len(args) != 1, opts.chrom is None, opts.coord is None]):
        sys.exit(not p.print_help())

    bedfile, = args
    position = (opts.chrom, opts.coord)
    n, side, maxd = opts.n, opts.side, opts.max_d

    chrombed = Bed(bedfile).sub_bed(position[0])

    if side == "upstream":
        data = [(abs(f.start - position[1]), f) for f in chrombed
                if f.start <= position[1]]
    elif side == "downstream":
        data = [(abs(f.start - position[1]), f) for f in chrombed
                if f.start >= position[1]]
    else:
        data = [(abs(f.start - position[1]), f) for f in chrombed]

    if maxd:
        data = [f for f in data if f[0] <= maxd]

    n += 1  # not counting self
    n = min(n, len(data))
    distances, subbed = zip(*data)
    distances = array(distances)
    idx = argsort(distances)[:n]
    flankingbed = [f for (i, f) in enumerate(subbed) if i in idx]

    fw = must_open(opts.outfile, "w")
    for atom in flankingbed:
        print(str(atom), file=fw)

    return (position, flankingbed)
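# A compact sketch (requires numpy) of the nearest-n selection in flanking():
# distances to the query coordinate are argsorted and the closest n indices
# kept, while the output preserves the original bed order. Starts and the
# coordinate are toy values.
def _demo_nearest(starts, coord, n=2):
    from numpy import array, argsort
    distances = array([abs(s - coord) for s in starts])
    idx = set(argsort(distances)[:n])
    return [s for i, s in enumerate(starts) if i in idx]

# _demo_nearest([10, 90, 105, 300], coord=100, n=2) == [90, 105]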
def gc3(args):
    """
    %prog gc3 ksfile cdsfile [cdsfile2] -o newksfile

    Filter the Ks results to remove high GC3 genes. High GC3 genes are
    problematic in Ks calculation - see Tang et al. 2010 PNAS. Specifically,
    the two calculation methods produce drastically different results for
    these pairs. Therefore we advise to remove these high GC3 genes. This is
    often the case for studying cereal genes.

    If 2 genomes are involved, the cdsfile of the 2nd genome can be provided
    concatenated or separated.
    """
    p = OptionParser(gc3.__doc__)
    p.add_option("--plot", default=False, action="store_true",
                 help="Also plot the GC3 histogram [default: %default]")
    p.set_outfile()

    opts, args = p.parse_args(args)
    outfile = opts.outfile
    plot = opts.plot

    if not 1 < len(args) < 4:
        sys.exit(not p.print_help())

    ks_file, cdsfile = args[:2]
    GC3 = get_GC3(cdsfile)
    if plot:
        plot_GC3(GC3, cdsfile, fill="green")

    if len(args) == 3:
        cdsfile2 = args[2]
        GC3_2 = get_GC3(cdsfile2)
        GC3.update(GC3_2)
        if plot:
            plot_GC3(GC3_2, cdsfile2, fill="lightgreen")

    data = KsFile(ks_file)
    noriginals = len(data)

    fw = must_open(outfile, "w")
    writer = csv.writer(fw)
    writer.writerow(fields.split(","))
    nlines = 0
    cutoff = .75
    for d in data:
        a, b = d.name.split(";")
        aratio, bratio = GC3[a], GC3[b]
        if (aratio + bratio) / 2 > cutoff:
            continue
        writer.writerow(d)
        nlines += 1
    logging.debug("{0} records written (from {1}).".format(nlines, noriginals))
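# The pair-filtering rule in gc3(), in isolation: a gene pair is dropped when
# the mean of its two GC3 ratios exceeds the cutoff. Ratios below are toy
# values.
def _demo_keep_pair(aratio, bratio, cutoff=0.75):
    return (aratio + bratio) / 2 <= cutoff

# _demo_keep_pair(0.9, 0.8) is False; _demo_keep_pair(0.5, 0.6) is True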
def summary(args):
    """
    %prog summary input.bed scaffolds.fasta

    Print out summary statistics per map, followed by consensus summary of
    scaffold anchoring based on multiple maps.
    """
    p = OptionParser(summary.__doc__)
    p.set_table(sep="|", align=True)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    inputbed, scaffolds = args
    pf = inputbed.rsplit(".", 1)[0]
    mapbed = pf + ".bed"
    chr_agp = pf + ".chr.agp"
    sep = opts.sep
    align = opts.align
    cc = Map(mapbed)
    mapnames = cc.mapnames
    s = Sizes(scaffolds)
    total, l50, n50 = s.summary
    r = {}
    maps = []

    fw = must_open(opts.outfile, "w")
    print("*** Summary for each individual map ***", file=fw)
    for mapname in mapnames:
        markers = [x for x in cc if x.mapname == mapname]
        ms = MapSummary(markers, l50, s)
        r["Linkage Groups", mapname] = ms.num_lgs
        ms.export_table(r, mapname, total)
        maps.append(ms)
    print(tabulate(r, sep=sep, align=align), file=fw)

    r = {}
    agp = AGP(chr_agp)
    print("*** Summary for consensus map ***", file=fw)
    consensus_scaffolds = set(x.component_id for x in agp if not x.is_gap)
    oriented_scaffolds = set(x.component_id for x in agp
                             if (not x.is_gap) and x.orientation != '?')
    unplaced_scaffolds = set(s.mapping.keys()) - consensus_scaffolds

    for mapname, sc in (("Anchored", consensus_scaffolds),
                        ("Oriented", oriented_scaffolds),
                        ("Unplaced", unplaced_scaffolds)):
        markers = [x for x in cc if x.seqid in sc]
        ms = MapSummary(markers, l50, s, scaffolds=sc)
        ms.export_table(r, mapname, total)
    print(tabulate(r, sep=sep, align=align), file=fw)