#!/usr/bin/env python
"""Flatten a sequence file into one tab-separated line per record.

Usage: sequencToLine.py in.{fa,fq}

Records exposing a ``desc`` attribute are printed as
name, seq, desc, qual; every other record as name, seq.
"""
import sys

from seqio import iteratorFromExtension
from nucio import fileIterator

if len(sys.argv) != 2:
    sys.exit("sequencToLine.py in.{fa,fq}\n")

# presumably picks the record parser from the file extension -- confirm in seqio
it = iteratorFromExtension(sys.argv[1])

for record in fileIterator(sys.argv[1], it):
    if hasattr(record, "desc"):
        # NOTE(review): assumes every record with `desc` also carries `qual`.
        fields = [record.name, record.seq, record.desc, record.qual]
    else:
        fields = [record.name, record.seq]
    # A single parenthesised argument prints identically under Python 2.
    print("\t".join(fields))
sys.exit("reference_segments.py in.fa") store_table = {"A": [], "C": [], "G": [], "T": [], "N": []} previous = 0 curr_count = 1 table = 2 accumulator = ["N", -1, store_table] def runs(acc, nxt): prev_letter = acc[previous] acc[curr_count] += 1 if not prev_letter == nxt: if acc[curr_count] > 0: acc[table][prev_letter].append(acc[curr_count]) acc[previous] = nxt acc[curr_count] = 0 return accumulator for entry in fileIterator(sys.argv[1], fastaIterator): reduce(runs, entry.seq, accumulator) runs(accumulator, "X") # get last sequence accumulator[previous] = "N" accumulator[curr_count] = -1 print max(store_table["N"])
arguments = map(CLArgument._make, argument_list) if not len(sys.argv) > 1: sys.exit(getHelpStr(description, arguments) + "\n") (p_arg_map, args_remaining) = parseArgs(sys.argv[1:], arguments) if not len(args_remaining) >= 1: sys.exit(getHelpStr(description, arguments) + "\n") title = args_remaining[0] infiles = args_remaining[1:] cellnames = map(lambda f: "_".join(f.split(".")[0].split("_")[:2]), infiles) fit_gen = lambda filename: fileIterator(filename, lineItemIterator) file_iterators = map(fit_gen, infiles) def getBasesFromLineArr(arr): if not bool(arr): return if arr[0].startswith("n="): return arr[6].split("=")[1] if arr[0].startswith("#>%d" % p_arg_map["lengreater"]): return arr[1].split("=")[1] def getCountsFromLineArr(arr): if not bool(arr): return
def getRawAlignments(fn):
    """Return an iterator over the raw M4Records found in file ``fn``.

    fn -- name of the file to read
    """
    def m4_line_parser(handle):
        # Hand the open handle to lineRecordIterator together with the
        # M4Record type and its M4RecordTypes description.
        return lineRecordIterator(handle, M4Record, M4RecordTypes)

    return fileIterator(fn, m4_line_parser)
# Downsample a library
import sys

from nucio import typeify, fileIterator
from seqio import iteratorFromExtension, recordToString, seqlen

if len(sys.argv) != 5:
    sys.exit("Usage: downsample.py genome_size desired_cov input.{fa,fq} output.{fa,fq}\n")

# One converter per positional argument, in command-line order.
types = [int, float, str, str]
sysins = sys.argv[1:1 + len(types)]
(genome_size, target_cov, infn, outfn) = typeify(sysins, types)

# Writing stops once the running total has gone past this many bases.
max_bases = genome_size * target_cov
total_bases = 0

with open(outfn, "w") as out_handle:
    for rec in fileIterator(infn, iteratorFromExtension(infn)):
        rec_bases = seqlen(rec)
        # Records containing an N are neither written nor counted.
        if "N" not in rec.seq:
            if total_bases > max_bases:
                break
            out_handle.write(recordToString(rec) + "\n")
            total_bases += rec_bases
#!/usr/bin/env python
"""Print the reverse complement of every record in a sequence file.

Usage: reverseComplement.py in.{fa,fq}

The sequence is reverse-complemented; when a record carries a quality
string it is reversed as well so each quality stays with its base.
"""
import sys

from seqio import iteratorFromExtension, recordToString
from nucio import fileIterator
from misc import reverse_complement

if len(sys.argv) != 2:
    sys.exit("reverseComplement.py in.{fa,fq}")

f = sys.argv[1]
for record in fileIterator(f, iteratorFromExtension(f)):
    changes = {"seq": reverse_complement(record.seq)}
    # Records without a (non-empty) quality string are left untouched here.
    qual = getattr(record, "qual", None)
    if qual:
        # Qualities are per-base: reversing the bases without reversing the
        # qualities would pair every quality with the wrong base.
        changes["qual"] = qual[::-1]
    print(recordToString(record._replace(**changes)))
ref = itemgetter(7) pos = lambda r : int(itemgetter(8)(r)) strand = itemgetter(9) if not len(sys.argv) == 3: sys.exit("filterpairs.py read1.novo read2.novo\n") names_eq = lambda name1,name2: name1.split("/")[0] == name2.split("/")[0] filenames = sys.argv[1:3] #filter out header lines headfilt = lambda x : not x.startswith("#") filt_lii = partial(lineItemIterator, filter_func=headfilt) filt_fits = map( lambda fn : fileIterator(fn, filt_lii), filenames) failrepeat = 0 failmapq = 0 failsameref = 0 total = 0 passed = 0 insertNotRF = [] insertRF = [] for read1,read2 in izip(*filt_fits): total += 1 if not names_eq(name(read1), name(read2)): sys.exit("Error: %s not equal to %s\n" % (name(read1),name(read2)))
["miny", "miny", int, 0, "minimum y axis value"], ["maxy", "maxy", int, -1, "maximmum y axis value"], [ "out", "out", str, "out.pdf", "output file name (default out.pdf)" ], ["title", "title", str, "", "Title for graph"]] arguments = map(CLArgument._make, argument_list) (p_args, args_remaining) = parseArgs(sys.argv[1:], arguments) if not len(args_remaining) == 1: sys.exit(getHelpStr(description, arguments) + "\n") conv = lambda (i, j): (int(i), int(j)) (x, y) = zip(*imap(conv, fileIterator(args_remaining[0], lineItemIterator))) pp = PdfPages(p_args["out"]) plt.plot(x, y) (minx, maxx) = plt.xlim() (miny, maxy) = plt.ylim() minx = p_args["minx"] if p_args["minx"] > minx else minx maxx = p_args["maxx"] if p_args["maxx"] > 0 else maxx miny = p_args["miny"] if p_args["miny"] > miny else miny maxy = p_args["maxy"] if p_args["maxy"] > 0 else maxy plt.xlim((minx, maxx)) plt.ylim((miny, maxy)) plt.xlabel("Kmer Coverage")
#!/usr/bin/env python
"""Print every k-mer of every record in a sequence file, one per line.

Usage: kmer.py k-size in.fa
"""
import sys
from itertools import imap

from seqio import iteratorFromExtension
from nucio import fileIterator

##Create Kmers
if len(sys.argv) != 3:
    sys.exit("Usage: kmer.py k-size in.fa\n")

fn = sys.argv[2]
try:
    ksize = int(sys.argv[1])
except ValueError:
    sys.exit("Usage: kmer.py k-size in.fa\n")
if ksize < 1:
    # Zero or negative k would slice empty or wrong substrings below.
    sys.exit("kmer.py: k-size must be a positive integer\n")

for record in fileIterator(fn, iteratorFromExtension(fn)):
    seq = record.seq
    # xrange keeps the start offsets lazy instead of building a list as long
    # as the sequence; sequences shorter than k yield no offsets at all.
    starts = xrange(len(seq) - ksize + 1)
    for kmer in imap(lambda start: seq[start:start + ksize], starts):
        print(kmer)
# Command-line options: a single flag restricting output to spanning alignments.
argument_list = [["span", "span", argflag, False,
                  "Only alignments that span the region"]]
arguments = map(CLArgument._make, argument_list)

(p_arg_map, args_remaining) = parseArgs(sys.argv[1:], arguments)

if len(args_remaining) != 2:
    sys.exit(getHelpStr(description, arguments) + "\n")

inm4 = args_remaining[0]
try:
    # Split on the LAST colon so target names containing ':' still parse.
    (chrom, rest) = args_remaining[1].rsplit(":", 1)
    (start, end) = map(int, rest.split("-"))
except ValueError:
    # Region is not of the form chr:start-end; show the help text instead
    # of a traceback.
    sys.exit(getHelpStr(description, arguments) + "\n")


def it(fh):
    """Iterate the lines of ``fh`` as M4Records."""
    return lineRecordIterator(fh, M4Record, M4RecordTypes)


if p_arg_map["span"]:
    def cond(r):
        """True when the alignment starts before and ends after the region."""
        return r.tname == chrom and r.tstart < start and r.tend > end
else:
    def cond(r):
        """True when the alignment overlaps the region on the same target."""
        return r.tname == chrom and not r.tend < start and not r.tstart > end

for r in fileIterator(inm4, it):
    if cond(r):
        print(recordToString(r))
["minx","minx", int, 0,"minimum x axis value"], ["maxx", "maxx", int, -1,"maximum x axis value"], ["miny","miny", int, 0,"minimum y axis value"], ["maxy","maxy", int, -1,"maximmum y axis value"], ["out","out", str, "out.pdf", "output file name (default out.pdf)"], ["title","title", str, "", "Title for graph"]] arguments = map(CLArgument._make, argument_list) (p_args, args_remaining) = parseArgs(sys.argv[1:], arguments) if not len(args_remaining) == 1: sys.exit(getHelpStr(description, arguments) + "\n") conv = lambda (i,j) : (int(i),int(j)) (x,y) = zip(*imap(conv,fileIterator(args_remaining[0], lineItemIterator))) pp = PdfPages(p_args["out"]) plt.plot(x,y) (minx,maxx)=plt.xlim() (miny,maxy)=plt.ylim() minx = p_args["minx"] if p_args["minx"] > minx else minx maxx = p_args["maxx"] if p_args["maxx"] > 0 else maxx miny = p_args["miny"] if p_args["miny"] > miny else miny maxy = p_args["maxy"] if p_args["maxy"] > 0 else maxy plt.xlim((minx,maxx)) plt.ylim((miny,maxy)) plt.xlabel("Kmer Coverage")
aqual = lambda r: int(itemgetter(6)(r)) ref = itemgetter(7) pos = lambda r: int(itemgetter(8)(r)) strand = itemgetter(9) if not len(sys.argv) == 3: sys.exit("filterpairs.py read1.novo read2.novo\n") names_eq = lambda name1, name2: name1.split("/")[0] == name2.split("/")[0] filenames = sys.argv[1:3] #filter out header lines headfilt = lambda x: not x.startswith("#") filt_lii = partial(lineItemIterator, filter_func=headfilt) filt_fits = map(lambda fn: fileIterator(fn, filt_lii), filenames) failrepeat = 0 failmapq = 0 failsameref = 0 total = 0 passed = 0 insertNotRF = [] insertRF = [] for read1, read2 in izip(*filt_fits): total += 1 if not names_eq(name(read1), name(read2)): sys.exit("Error: %s not equal to %s\n" % (name(read1), name(read2))) if not status(read1) == "U" or not status(read2) == "U":
# Run lengths collected so far, keyed by letter.
store_table = {'A': [], 'C': [], 'G': [], 'T': [], 'N': []}

# Indices into the three-slot accumulator list consumed by runs().
previous = 0
curr_count = 1
table = 2

# [letter of the current run, length counter, run-length table].  The counter
# starts at -1 so the first letter of a sequence brings it to 0.
accumulator = ["N", -1, store_table]


def runs(acc, nxt):
    """Feed one letter into the run-length accumulator.

    acc -- list of [previous letter, current count, table of run lengths]
    nxt -- the next letter of the sequence

    When ``nxt`` differs from the previous letter, the finished run's length
    is appended to the table under the previous letter and a new run starts.
    Letters missing from the table get a fresh list instead of raising
    KeyError.  Returns ``acc`` itself so the function can be used as a fold.
    """
    prev_letter = acc[previous]
    acc[curr_count] += 1
    if prev_letter != nxt:
        if acc[curr_count] > 0:
            acc[table].setdefault(prev_letter, []).append(acc[curr_count])
        acc[previous] = nxt
        acc[curr_count] = 0
    return acc


def main():
    """Print the longest run of N found in the FASTA file given on argv."""
    if len(sys.argv) != 2:
        sys.exit("reference_segments.py in.fa")

    for entry in fileIterator(sys.argv[1], fastaIterator):
        for letter in entry.seq:
            runs(accumulator, letter)
        # NOTE(review): flush and reset are assumed to happen once per entry.
        runs(accumulator, 'X')  # sentinel letter flushes the entry's last run
        accumulator[previous] = 'N'
        accumulator[curr_count] = -1

    n_runs = store_table['N']
    # max() of an empty list raises; report 0 when no N run was seen.
    print(max(n_runs) if n_runs else 0)


if __name__ == "__main__":
    main()
def getRawAlignments(fn):
    """Return an iterator over the raw M4Records found in file ``fn``.

    fn -- name of the file to read
    """
    def m4_line_parser(handle):
        # Hand the open handle to lineRecordIterator together with the
        # M4Record type and its M4RecordTypes description.
        return lineRecordIterator(handle, M4Record, M4RecordTypes)

    return fileIterator(fn, m4_line_parser)
from args import parseArgs, getHelpStr, argflag, CLArgument

description = ("Usage: m4region.py [options] input.m4 chr:start-end\n"
               "Returns alignments that touch a region\n")

# Command-line options: a single flag restricting output to spanning alignments.
argument_list = [["span", "span", argflag, False,
                  "Only alignments that span the region"]]
arguments = map(CLArgument._make, argument_list)

(p_arg_map, args_remaining) = parseArgs(sys.argv[1:], arguments)

if len(args_remaining) != 2:
    sys.exit(getHelpStr(description, arguments) + "\n")

inm4 = args_remaining[0]
try:
    # Split on the LAST colon so target names containing ':' still parse.
    (chrom, rest) = args_remaining[1].rsplit(":", 1)
    (start, end) = map(int, rest.split("-"))
except ValueError:
    # Region is not of the form chr:start-end; show the help text instead
    # of a traceback.
    sys.exit(getHelpStr(description, arguments) + "\n")


def it(fh):
    """Iterate the lines of ``fh`` as M4Records."""
    return lineRecordIterator(fh, M4Record, M4RecordTypes)


if p_arg_map["span"]:
    def cond(r):
        """True when the alignment starts before and ends after the region."""
        return r.tname == chrom and r.tstart < start and r.tend > end
else:
    def cond(r):
        """True when the alignment overlaps the region on the same target."""
        return r.tname == chrom and not r.tend < start and not r.tstart > end

for r in fileIterator(inm4, it):
    if cond(r):
        print(recordToString(r))
if not len(sys.argv) > 1: sys.exit(getHelpStr(description, arguments) + "\n") (p_arg_map, args_remaining) = parseArgs(sys.argv[1:], arguments) if not len(args_remaining) >= 1: sys.exit(getHelpStr(description, arguments) + "\n") title = args_remaining[0] infiles = args_remaining[1:] cellnames = map(lambda f : "_".join(f.split(".")[0].split("_")[:2]), infiles) fit_gen = lambda filename : fileIterator(filename, lineItemIterator) file_iterators = map(fit_gen, infiles) def getBasesFromLineArr(arr): if not bool(arr): return if arr[0].startswith("n="): return arr[6].split("=")[1] if arr[0].startswith("#>%d" % p_arg_map["lengreater"]): return arr[1].split("=")[1] def getCountsFromLineArr(arr): if not bool(arr): return if arr[0].startswith("n="): return arr[0].split("=")[1]