def main(args):
    # fields from issake
    fields = "contig_id length reads avg_coverage seed v_region j_region".split()
    # the only fields i believe make any sense to keep
    out = "id v_region j_region length reads avg_coverage percent_of_total sequence".split()
    # total reads used in assembly
    total = 0.
    with nopen(args.fasta_in) as fasta:
        for name, seq in read_fasta(fasta):
            name = name.replace("size", "").replace("cov", "").replace("read", "").replace("seed:", "")
            d = dict(zip(fields, name.split("|")))
            total += int(d['reads'])
    with nopen(args.fasta_in) as fasta, \
            open(args.fasta_out, 'wb') as fasta_out, \
            open(args.meta, 'wb') as meta:
        # print header
        meta.write("\t".join(out) + "\n")
        for i, (name, seq) in enumerate(read_fasta(fasta)):
            # remove some text from iSSAKE output
            name = name.replace("size", "").replace("cov", "").replace("read", "").replace("seed:", "")
            d = dict(zip(fields, name.split("|")))
            # shorten the read names
            d['id'] = "contig_%d" % i
            d['percent_of_total'] = "%.6g" % (100 * (int(d['reads']) / total))
            d['sequence'] = seq
            meta.write("\t".join(map(str, [d[o] for o in out])) + "\n")
            write_fasta(fasta_out, d['id'], seq.upper())
def main(prefix, name_re, min_samples, methylation_files):
    name_re = re.compile(r"%s" % name_re)
    if not prefix.endswith((".", "/")):
        prefix += "."
    fhm = nopen('{prefix}methylation.txt.gz'.format(prefix=prefix), 'w')
    fhme = nopen('{prefix}methylated.txt.gz'.format(prefix=prefix), 'w')
    fhc = nopen('{prefix}counts.txt.gz'.format(prefix=prefix), 'w')

    def source_from_fname(fname):
        try:
            return name_re.search(fname).groups(0)[0]
        except:
            return op.basename(fname)

    iterables = [gen_iterable(f, source_from_fname) for f in methylation_files]
    sources = [source_from_fname(f) for f in methylation_files]

    fmt = "{chrom}:{start}\t{vals}\n"
    fhm.write("probe\t%s" % "\t".join(sources) + "\n")
    fhc.write("probe\t%s" % "\t".join(sources) + "\n")
    fhme.write("probe\t%s" % "\t".join(sources) + "\n")
    for chrom, start, end, values, counts, meths in bed_merge(iterables, sources):
        if sum(tryfloat(v) > 0 for v in values) < min_samples:
            continue
        vals = "\t".join(values)
        fhm.write(fmt.format(chrom=chrom, start=start, vals=vals))
        counts = "\t".join(counts)
        fhc.write(fmt.format(chrom=chrom, start=start, vals=counts))
        meths = "\t".join(meths)
        fhme.write(fmt.format(chrom=chrom, start=start, vals=meths))
def main(args):
    tags = {}
    if args.verbose:
        sys.stderr.write(">> reading in tag sequences...\n")
    with nopen(args.tags) as fasta:
        for name, seq in read_fasta(fasta):
            tags[name] = seq
    i = 0
    for fx in args.reads:
        if args.verbose:
            sys.stderr.write(">> processing %s...\n" % op.basename(fx))
        # process either fasta or fastq.
        if ".fasta" in fx or ".fa" in fx:
            with nopen(fx) as fa:
                for f_id, f_seq in read_fasta(fa):
                    i += 1
                    if i % 1000000 == 0 and args.verbose:
                        sys.stderr.write(">> processed %d reads...\n" % i)
                    print_record(tags, f_id, f_seq)
        else:
            with nopen(fx) as fq:
                for f_id, f_seq, f_qual in read_fastq(fq):
                    i += 1
                    if i % 1000000 == 0 and args.verbose:
                        sys.stderr.write(">> processed %d reads...\n" % i)
                    print_record(tags, f_id, f_seq)
def _set_structure(self, structure):
    """
    here, we want to intersect the query and subject bed files with the
    structure.bed file and give each set of intervals in query and bed
    that fall within (or have any overlap with) a unique, fake chromosome
    so that all shuffling is within that chromosome.
    in order to do this, we also have to create a fake genome file that
    contains the lengths of those chromosomes.
    """
    if structure in (None, ""):
        return
    self.chrom = True  # has to be by chromosome.
    n_query_before = sum(1 for _ in nopen(self.query))
    n_subject_before = sum(1 for _ in nopen(self.subject))

    new_genome = open(mktemp(suffix='.fake_genome'), 'w')
    structure = "<(cut -f 1-3 %s)" % structure
    seen_segs = {}
    for bed in ('query', 'subject', 'exclude', 'include'):
        bed_path = getattr(self, "_" + bed, getattr(self, bed))
        if not bed_path:
            continue
        new_fh = open(mktemp(suffix='%s.fake' % bed), 'w')
        for toks in reader("|bedtools intersect -wo -a %s -b '%s' "
                           "| sort -k4,4 -k5,5g" % (structure, bed_path),
                           header=False):
            gtoks, btoks = toks[:3], toks[3:-1]  # drop the bp overlap
            new_chrom = "_".join(gtoks)
            gtoks[1:] = map(int, gtoks[1:])
            btoks[1:3] = map(int, btoks[1:3])
            glen = gtoks[2] - gtoks[1]  # fake chrom length.
            if new_chrom.startswith('chr'):
                new_chrom = new_chrom[3:]
            if not new_chrom in seen_segs:
                # save it in the genome file.
                print >> new_genome, "\t".join((new_chrom, str(glen)))
            seen_segs[new_chrom] = True
            # with partial overlap, we'll have a negative start or an
            # end outside the genome... for now, just truncate.
            # adjust the interval to its location in the new chrom.
            btoks[0] = new_chrom
            btoks[1] = max(0, btoks[1] - gtoks[1])  # don't let it go below 0
            # chop to end of fake chrom.
            btoks[2] = min(btoks[2] - gtoks[1], glen - 1)
            assert 0 <= btoks[1] <= btoks[2] < glen
            btoks[1:3] = map(str, btoks[1:3])
            print >> new_fh, "\t".join(btoks)
        new_fh.close()
        setattr(self, bed, new_fh.name)
    new_genome.close()
    self.genome_file = new_genome.name
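# Worked example of the remapping above, with made-up coordinates: a
# structure segment chr1:1000-5000 becomes fake chromosome "1_1000_5000"
# of length 4000, and an overlapping interval chr1:1500-2000 is shifted to
# 500-1000 on that fake chromosome, so shuffling stays inside the segment.
gstart, gend = 1000, 5000
bstart, bend = 1500, 2000
glen = gend - gstart
assert (max(0, bstart - gstart), min(bend - gstart, glen - 1)) == (500, 1000)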
def convert_reads(fq1s, fq2s, out=sys.stdout):
    for fq1, fq2 in zip(fq1s.split(","), fq2s.split(",")):
        sys.stderr.write("converting reads in %s,%s\n" % (fq1, fq2))
        fq1 = nopen(fq1)
        # examine the first five lines to detect if this is an interleaved
        # fastq file
        first_five = list(islice(fq1, 5))
        fq1.seek(0)  # rewind after peeking (requires a seekable handle)
        r1_header = first_five[0]
        r2_header = first_five[-1]
        if r1_header.split(' ')[0] == r2_header.split(' ')[0]:
            already_interleaved = True
        else:
            already_interleaved = False
        q1_iter = izip(*[fq1] * 4)
        if fq2 != "NA":
            fq2 = nopen(fq2)
            q2_iter = izip(*[fq2] * 4)
        else:
            if already_interleaved:
                sys.stderr.write("detected interleaved fastq\n")
            else:
                sys.stderr.write("WARNING: running bwameth in single-end mode\n")
            q2_iter = repeat((None, None, None, None))
        lt80 = 0
        if already_interleaved:
            selected_iter = q1_iter
        else:
            selected_iter = chain(*izip(q1_iter, q2_iter))
        for read_i, (name, seq, _, qual) in enumerate(selected_iter):
            if name is None:
                continue
            convert_and_write_read(name, seq, qual, read_i % 2, out)
            if len(seq) < 80:
                lt80 += 1
        out.flush()
    if lt80 > 50:
        sys.stderr.write("WARNING: %i reads with length < 80\n" % lt80)
        sys.stderr.write("       : this program is designed for long reads\n")
    return 0
def extend_bed(fin, fout, bases):
    # `bedtools slop`
    # we're extending both a.bed and b.bed by this distance
    # so divide by 2.
    bases /= 2
    with nopen(fout, 'w') as fh:
        for toks in (l.rstrip("\r\n").split("\t") for l in nopen(fin)):
            toks[1] = max(0, int(toks[1]) - bases)
            toks[2] = max(0, int(toks[2]) + bases)
            if toks[1] > toks[2]:  # negative distances
                toks[1] = toks[2] = (toks[1] + toks[2]) / 2
            assert toks[1] <= toks[2]
            print >> fh, "\t".join(map(str, toks))
    return fh.name
def intersect(ref, xref, peaks):
    if xref:
        xref = xref_to_dict(xref)
    # group the output by chr->gene->start
    cmd = ("|bedtools intersect -wb -a {peaks} -b {ref} "
           "| sort -k1,1 -k8,8 -k2,2n").format(**locals())
    cols = ['chrom', 'start', 'stop', 'peak', '_chrom',
            '_start', '_stop', 'gene', '_score', 'strand']
    tmp = open(tempfile.mkstemp(suffix=".bed")[1], 'wb')
    for g in grouper(nopen(cmd), cols):
        negs = []
        for i, l in enumerate(unique_everseen(
                g, lambda t: ret_item(t, cols, 'peak')), start=1):
            l = lparser(l, cols)
            # negative stranded sites
            if l['strand'] == "-":
                # need to count down through them, saving l each time
                negs.append(l)
                continue
            # positive stranded sites
            print >>tmp, "\t".join(get_out(l, i, xref))
        for i, l in izip(count(len(negs), -1), negs):
            print >>tmp, "\t".join(get_out(l, i, xref))
    tmp.close()
    return tmp.name
def readfx(fastx):
    with nopen(fastx) as fp:
        last = None
        while True:
            if not last:
                for l in fp:
                    if l[0] in '>@':
                        last = l[:-1]
                        break
            if not last:
                break
            name, seqs, last = last[1:].partition(" ")[0], [], None
            for l in fp:
                if l[0] in '@+>':
                    last = l[:-1]
                    break
                seqs.append(l[:-1])
            if not last or last[0] != '+':
                yield name, ''.join(seqs), None
                if not last:
                    break
            else:
                seq, leng, seqs = ''.join(seqs), 0, []
                for l in fp:
                    seqs.append(l[:-1])
                    leng += len(l) - 1
                    if leng >= len(seq):
                        last = None
                        yield name, seq, ''.join(seqs)
                        break
                if last:
                    yield name, seq, None
                    break
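# Usage sketch for readfx (the file name here is hypothetical): the parser
# autodetects FASTA vs FASTQ records; qual is None for FASTA input.
for name, seq, qual in readfx("reads.fq.gz"):
    print name, len(seq), "fastq" if qual is not None else "fasta"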
def run_metric(cmd, metric=None):
    """
    Metric can be a string, e.g. "wc -l" or a python callable that
    consumes lines of input and returns a single value.

    e.g.

    def mymetric(fh):
        val = 0
        for line in fh:
            val += float(line.split("\t")[4])
        return val

    The lines sent to the metric function will be the result of
    bedtools intersect -wo -- so that both the -a and -b intervals
    will be present in each line.
    """
    if metric is None:
        cmd, metric = cmd
    if isinstance(metric, basestring):
        return float(run("%s | %s" % (cmd, metric)))
    else:
        proc = nopen("|%s" % cmd, mode=None)
        res = metric(proc.stdout)
        check_proc(proc, cmd)
        assert isinstance(res, (int, float))
        return res
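# Hedged usage sketch for run_metric, following the docstring above; the
# bedtools command and file names are illustrative, not from the source.
n_overlaps = run_metric("bedtools intersect -wo -a query.bed -b subject.bed", "wc -l")

def total_overlap_bp(fh):
    # with -wo, the bp overlap is the last column of each line
    return sum(float(line.split("\t")[-1]) for line in fh)

total_bp = run_metric("bedtools intersect -wo -a query.bed -b subject.bed", total_overlap_bp)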
def GetTotalSeqRecords(input_file):
    '''
    Count the number of lines in a fastq file and return the number of
    sequence records (lines / 4).
    '''
    with nopen(input_file) as f:
        TotalSeqRecords = int(sum(1 for _ in f)) / 4
    return TotalSeqRecords
def readccrs(path, gerp, phast, cadd):
    for i, d in enumerate(ts.reader(path, header="ordered")):
        d['gerp'] = ",".join(map(str, gerp.values(
            "chr" + d['chrom'], int(d['start']), int(d['end']))))
        d['phast'] = ",".join(map(str, phast.values(
            "chr" + d['chrom'], int(d['start']), int(d['end']))))
        region = d['chrom'] + ":" + d['start'] + "-" + d['end']
        var = None
        vals = []
        caddvals = []
        # TODO: replace with cyvcf2
        for toks in (x.rstrip('\r\n').split("\t") for x in
                     ts.nopen("| tabix " + cadd + " {region}".format(region=region))
                     if x[1] != "#"):
            if var == None or var == toks[1]:
                vals.append(float(toks[5]))
            elif var != toks[1] and var != None:
                caddvals.append(np.mean(vals))
                vals = []
            var = toks[1]
        d['cadd'] = ",".join(map(str, caddvals))
        if i == 0:
            print "\t".join(d.keys())
        print "\t".join(map(str, d.values()))
def CollectBarcode(indexFile, barcodeLength, readsValue, barcodeError,
                   const_2, const_2Error, regExpBc, mergeBC, reverseBC):
    bcList = []
    records = supp.GetTotalSeqRecords(indexFile)
    bar = progressbar.ProgressBar(
        maxval=records,
        widgets=[progressbar.Bar(left='<', marker='.', right='>')]).start()
    t = 0.0
    expr = regex.compile(regExpBc)
    with nopen(indexFile) as handle:
        for seq_record in SeqIO.parse(handle, "fastq"):
            bar.update(t)
            t += 1
            match = expr.match(str(seq_record.seq))
            if match is not None:
                if int(barcodeLength * 0.9) <= len(match.group("barcode")) <= int(barcodeLength * 1.1):
                    if "N" not in match.group("barcode").upper():
                        if reverseBC:
                            bcList.append(reverseComplement(match.group("barcode")))
                        else:
                            bcList.append(match.group("barcode"))
    bar.finish()
    if mergeBC:
        return bcList
    bcCount = Counter(bcList)
    bcDict = SelectionReliableBarcode(bcCount, readsValue, barcodeError)
    if len(bcDict) <= 10**5:
        # print(" Checking barcodes ... Estimated time: ~ {}".format(
        #     supp.EstimateCalculationTime(bcDict)))
        mainCheckBarcodeInDict(bcDict, barcodeError)
    return bcDict
def read_exons(gtf):
    transcripts = defaultdict(pyinter.IntervalSet)
    totlen = 0
    names = []
    trs, ids = [], []
    for toks in (x.rstrip('\r\n').split("\t") for x in ts.nopen(gtf) if x[0] != "#"):
        # note: membership tests need tuples; a bare string would do a
        # substring check instead.
        if toks[2] not in ("CDS", "stop_codon") or toks[1] not in ("protein_coding",):
            continue
        # if toks[0] != "1": break
        start, end = map(int, toks[3:5])
        assert start <= end, toks
        transcript = toks[8].split('transcript_id "')[1].split('"', 1)[0]
        transcripts[transcript].add(pyinter.closedopen(start - 1, end))
        names.append(toks[8].split('transcript_name "')[1].split('"', 1)[0].rsplit("-", 1)[0])
        ids.append(toks[8].split('gene_id "')[1].split('"', 1)[0])
        trs.append(transcript)
    # sort by start so we can do binary search.
    # TODO: need to remove overlapping exons so we don't double-count
    transcripts = dict((k, sorted(v)) for k, v in transcripts.iteritems())
    ints = {}
    lens = pyinter.IntervalSet()
    for tr, ivset in transcripts.iteritems():
        sends = sorted(list(ivset))
        iset = pyinter.IntervalSet(pyinter.closedopen(x.lower_value, x.upper_value)
                                   for x in sends)
        lens = lens.union(iset)
        ss, es = [x.lower_value for x in sends], [x.upper_value for x in sends]
        ints[tr] = (ss, es)
    totlen = sum(x.upper_value - x.lower_value for x in lens)
    return ints, set(names), set(ids), set(trs), totlen
def subsample(infiles, outfiles, prob, seed=None):
    prob = 1 - prob
    if seed:
        random.seed(seed)

    def open_fq(f):
        # return the handle along with an iterator over 4-line fastq records;
        # the handle is kept so it can be closed (zip objects have no close()).
        fh = nopen(f, 'rb')
        return fh, zip(*[fh] * 4)

    in_fh = [open_fq(i) for i in infiles]
    out_fh = [nopen(o, 'wb') for o in outfiles]
    try:
        written = total = 0
        for total, reads in enumerate(zip(*[it for _, it in in_fh]), 1):
            if random.random() >= prob:
                written += 1
                for read, fh in zip(reads, out_fh):
                    fh.writelines(read)
        print("wrote {} of {} reads".format(written, total))
    finally:
        for fh, _ in in_fh:
            fh.close()
        for o in out_fh:
            o.close()
def main():
    p = argparse.ArgumentParser(description=__doc__,
                                formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument("-p", dest="pvals", help="BED containing all the p values"
                   " used to generate `regions`")
    p.add_argument("-r", dest="regions", help="BED containing all the regions")
    p.add_argument("-s", dest="step", type=int, default=50,
                   help="step size for acf calculation. should be the same"
                   " value as the step sent to -d arg for acf")
    p.add_argument("--mlog", dest="mlog", action="store_true", default=False,
                   help="do the correlation on the -log10 of the p-values."
                   " Default is to do it on the raw values")
    p.add_argument("-N", dest="N", help="number of simulations to perform",
                   type=int, default=0)
    p.add_argument("-c", dest="c", help="column number containing the p-value"
                   " of interest", type=int, default=-1)
    p.add_argument("-z", dest="z", help="use z-score correction",
                   action="store_true")
    args = p.parse_args()
    if not (args.regions and args.pvals):
        import sys
        sys.exit(not p.print_help())
    from toolshed import nopen
    header = nopen(args.regions).next()
    if header.startswith("#") or (not header.split("\t")[2].isdigit()):
        print "%s\tslk_p\tslk_sidak_p" % (header.rstrip("\r\n"),)
    return run(args)
def fastq_to_dict(fastq):
    """Read a fastq file into a dict of name -> {'seq': ..., 'qual': ...}."""
    d = {}
    with nopen(fastq) as fh:
        for name, seq, qual in read_fastx(fh):
            d[name] = {'seq': seq, 'qual': qual}
    return d
def FastqJoinPaired(r1, r2, output_dir, gap_size, separator, mode="paired",
                    reverse_complement=False):
    # set dictionary based on mode
    if mode == "R2":
        fastqdict = fastqtodict(r1, separator)
        fastq = r2
    else:
        fastqdict = fastqtodict(r2, separator)
        fastq = r1
    gap_bind, gap_qual = "N" * int(gap_size), "*" * int(gap_size)
    p_out = os.path.join(output_dir, "output_paired.fastq")
    unq_out = os.path.join(output_dir, "output_unique.fastq")
    with nopen(fastq) as fq, \
            open(p_out, "w") as handle_p, \
            open(unq_out, "w") as handle_unq:
        for name, seq, qual in read_fastq(fq):
            try:
                # explicitly split on space to facilitate future changes
                name = name.split(" ")[0]
                # index (not .get) so a missing mate raises KeyError below
                cseq = fastqdict[name][0]
                cqual = fastqdict[name][1]
                if reverse_complement:
                    cseq = reverseComplement(cseq)
                handle_p.write("@{}\n{}{}{}\n+\n{}{}{}\n".format(
                    name, seq, gap_bind, cseq, qual, gap_qual, cqual))
            except KeyError:
                # reads without pairs
                if not mode == "paired":
                    handle_unq.write("@{}\n{}\n+\n{}\n".format(name, seq, qual))
    return p_out
def readgenes(trans):
    genes = defaultdict(str)
    for fields in (x.rstrip('\r\n').split("\t") for x in ts.nopen(trans)):
        gene = fields[0]
        transcript = fields[1]
        genes[gene] = transcript
    return genes
def genome_control_adjust_bed(bedfiles, colnum, outfh):
    c = colnum
    adj = genome_control_adjust([d['p'] for d in bediter(bedfiles, colnum)])
    diff = 0
    if len(bedfiles) > 1:
        print("can't do genomic control adjustment with more than 1 bed file",
              file=sys.stderr)
        sys.exit(4)
    for j, bedfile in enumerate(bedfiles):
        for i, toks in enumerate(line.rstrip("\r\n").split("\t")
                                 for line in ts.nopen(bedfile)):
            try:
                float(toks[c])
            except ValueError:
                # header
                if i == 0 == j:
                    print("\t".join(toks), file=outfh)
                    diff = 1
                    continue
                elif i == 0:
                    continue
                else:
                    raise
            toks[c] = "%.5g" % adj[i - diff]
            print("\t".join(toks), file=outfh)
def local_shuffle(bed, loc='500000'):
    """
    Randomize the location of each interval in `bed` by moving its
    start location to within `loc` bp of its current location or to
    its containing interval in `loc`.

    Arguments:
        bed - input bed file
        loc - shuffle intervals to within this distance (+ or -).
              If not an integer, then this should be a BED file containing
              regions such that each interval in `bed` is shuffled within
              its containing interval in `loc`
    """
    from random import randint
    if str(loc).isdigit():
        dist = abs(int(loc))
        with nopen(bed) as fh:
            for toks in (l.rstrip('\r\n').split('\t') for l in fh):
                d = randint(-dist, dist)
                toks[1:3] = [str(max(0, int(bloc) + d)) for bloc in toks[1:3]]
                print "\t".join(toks)
    else:
        # we are using loc as the windows within which to shuffle
        assert os.path.exists(loc)
        bed4 = mktemp()
        with open(bed4, 'w') as fh:
            # this step is so we don't have to track the number of columns in A
            for toks in reader(bed, header=False):
                fh.write("%s\t%s\n" % ("\t".join(toks[:3]), SEP.join(toks)))
        missing = 0
        # we first find the b-interval that contains each a-interval by
        # using bedtools intersect
        for toks in reader("|bedtools intersect -wao -a {bed4} -b {loc}"
                           .format(**locals()), header=False):
            ajoin = toks[:4]
            a = ajoin[3].split(SEP)  # extract the full interval
            b = toks[4:]
            if int(b[-1]) == 0:
                missing += 1
                continue
            assert a[0] == b[0], ('chroms dont match', a, b)
            alen = int(a[2]) - int(a[1])
            # doesn't care if the new interval is completely contained in b
            astart = randint(int(b[1]), int(b[2]))
            # subtract half the time.
            aend = (astart - alen) if randint(0, 1) == 0 and astart > alen \
                else (astart + alen)
            a[1], a[2] = map(str, (astart, aend) if astart < aend
                             else (aend, astart))
            print "\t".join(a)
        if missing > 0:
            print >> sys.stderr, ("found {missing} intervals in {bed} that "
                                  "were not contained in {loc}"
                                  .format(**locals()))
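# Usage sketch for local_shuffle (file names are hypothetical): both modes
# print shuffled BED lines to stdout.
local_shuffle("peaks.bed", loc="500000")       # shift each interval within +/- 500kb
local_shuffle("peaks.bed", loc="domains.bed")  # shuffle within the containing domain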
def main():
    p = argparse.ArgumentParser(description=__doc__,
                                formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument("-p", dest="pvals", help="BED containing all the p values"
                   " used to generate `regions`")
    p.add_argument("-r", dest="regions", help="BED containing all the regions")
    p.add_argument("-s", "--step", dest="step", type=int, default=50,
                   help="step size for acf calculation. should be the same"
                   " value as the step sent to -d arg for acf")
    p.add_argument("-c", dest="c", help="column number containing the p-value"
                   " of interest", type=str, default=-1)
    p.add_argument("-z", dest="z", help="use z-score correction",
                   action="store_true")
    args = p.parse_args()
    if not (args.regions and args.pvals):
        import sys
        sys.exit(not p.print_help())
    header = ts.nopen(args.regions).next()
    if header.startswith("#") or (not header.split("\t")[2].isdigit()):
        print "%s\tslk_p\tslk_sidak_p" % (header.rstrip("\r\n"),)
    header = ts.header(args.pvals)
    if args.c in header:
        args.c = header.index(args.c) + 1
    else:
        args.c = int(args.c)
    return run(args)
def infos(path):
    infos = []
    for x in ts.nopen(path):
        if x[1] != "#":
            break
        if not "INFO" in x:
            continue
        infos.append(x.split("ID=")[1].split(",")[0])
    return infos
def read_regions(fregions):
    if not fregions:
        return None
    regions = {}
    for toks in (l.split("\t") for l in ts.nopen(fregions) if l[0] != "#"):
        if not toks[0] in regions:
            regions[toks[0]] = []
        regions[toks[0]].append((int(toks[1]), int(toks[2])))
    return regions
def protein(self):
    from toolshed import nopen
    url = ("http://genome.ucsc.edu/cgi-bin/hgGene"
           "?hgg_do_getProteinSeq=1&hgg_gene=") + self.name
    seq = [x.strip() for x in nopen(url) if x.strip() and not ">" in x]
    return "".join(seq)
def tofile(fiter, fname):
    fh = nopen(fname, "w")
    for line in fiter:
        print >> fh, line.rstrip("\r\n")
    fh.close()
    atexit.register(os.unlink, fname)
    return fname
def main():
    p = argparse.ArgumentParser(description=__doc__,
                                formatter_class=argparse.RawDescriptionHelpFormatter)
    p.add_argument('attrs',
                   help='Cytoscape attributes file of overlapping features')
    args = p.parse_args()
    attrsfile = nopen(args.attrs)
    # remove header
    attrsfile.readline()
    uniqid = os.path.basename(args.attrs).split(".", 1)[0]
    previous_feature = None
    for line in attrsfile:
        attr = line.rstrip("\r\n").split("=", 1)
        # everything before the "="
        cyto_info = attr[0].strip()
        feature = attr[1].strip().upper()
        fields = (cyto_info, "= True\n")
        if previous_feature and previous_feature == feature:
            fileout.write(" ".join(map(str, fields)))
        else:
            fileout = open("%s.%s.eda" % (uniqid, feature), 'w')
            fileout.write("feature%s\n" % feature)
            fileout.write(" ".join(map(str, fields)))
        previous_feature = feature
def main(precision, path):
    header = None
    tmpl = ("{Chrom}\t{Pos}\t.\t{Ref}\t{Alt}\t1\tPASS\t"
            "raw={RawScore:.%if};phred={PHRED:.%if}") % (precision, precision)
    hdr = """\
##fileformat=VCFv4.1
##INFO=<ID=raw,Number=1,Type=Float,Description="raw cadd score">
##INFO=<ID=phred,Number=1,Type=Float,Description="phred-scaled cadd score">
##CADDCOMMENT=<ID=comment,comment="{comment}">
#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO"""
    for i, line in enumerate(ts.nopen(path)):
        line = _to_str(line)
        if i == 0:
            print(hdr.format(comment=line.strip("# ").strip()))
            continue
        if header is None and line.startswith("#Chrom"):
            header = line[1:].rstrip().split("\t")
            continue
        d = dict(zip(header, line.rstrip().split("\t")))
        d['PHRED'] = float(d['PHRED'])
        d['RawScore'] = float(d['RawScore'])
        print(tmpl.format(**d))
def convert_reads(fq1s, fq2s, out=sys.stdout):
    for fq1, fq2 in zip(fq1s.split(","), fq2s.split(",")):
        sys.stderr.write("converting reads in %s,%s\n" % (fq1, fq2))
        fq1 = nopen(fq1)
        # examines first five lines to detect if this is an interleaved fastq
        # file, then chains the peeked lines back onto the stream.
        first_five = list(islice(fq1, 5))
        r1_header = first_five[0]
        r2_header = first_five[-1]
        if r1_header.split(' ')[0] == r2_header.split(' ')[0]:
            already_interleaved = True
        else:
            already_interleaved = False
        q1_iter = izip(*[chain.from_iterable([first_five, fq1])] * 4)
        if fq2 != "NA":
            fq2 = nopen(fq2)
            q2_iter = izip(*[fq2] * 4)
        else:
            if already_interleaved:
                sys.stderr.write("detected interleaved fastq\n")
            else:
                sys.stderr.write("WARNING: running bwameth in single-end mode\n")
            q2_iter = repeat((None, None, None, None))
        lt80 = 0
        if already_interleaved:
            selected_iter = q1_iter
        else:
            selected_iter = chain.from_iterable(izip(q1_iter, q2_iter))
        for read_i, (name, seq, _, qual) in enumerate(selected_iter):
            if name is None:
                continue
            convert_and_write_read(name, seq, qual, read_i % 2, out)
            if len(seq) < 80:
                lt80 += 1
        out.flush()
    if lt80 > 50:
        sys.stderr.write("WARNING: %i reads with length < 80\n" % lt80)
        sys.stderr.write("       : this program is designed for long reads\n")
    return 0
def main():
    args = get_args()
    if args.verbose:
        sys.stderr.write(">> building gene orthology cross-reference...\n")
    xref = get_xref(args.xref)
    if args.verbose:
        sys.stderr.write(">> building uniprot library...\n")
    uniprot = parse_uniprot_flat(args.uniprot)
    if args.verbose:
        sys.stderr.write(">> annotating matrisome...\n")
    header = nopen(args.matrisome).readline().rstrip("\r\n").split("\t")
    headerext = ['r_ENSRNOP', 'r_score', 'r_geneid', 'r_gene_description',
                 'r_uniprot', 'r_interpro', 'r_refseqn', 'r_refseqp',
                 'r_ensg', 'r_enst', 'r_ensp']
    header.extend(headerext)
    print "\t".join(h for h in header)
    for entry in reader(args.matrisome):
        # reset vars
        for h in headerext:
            entry[h] = ""
        # handle multiple entries delimited by ":"
        for entryname in entry[args.xref_col].split(":"):
            # looping over entire defaultdict each time
            for uid, ddict in xref.iteritems():
                # find a matching ortholog
                for orthoname in ddict['orthonames']:
                    if orthoname == entryname:
                        # use the uid to get the rat names and scores
                        for ratname, ratscore in izip(xref[uid]['ratnames'],
                                                      xref[uid]['ratscores']):
                            # print ratname
                            entry['r_ENSRNOP'] += "%s:" % ratname
                            entry['r_score'] += "%s:" % ratscore
                            # for each rat ENSP, add the corresponding annotation(s)
                            for uniqueid, uniprot_entry in uniprot.iteritems():
                                for ensemblname in uniprot_entry['ensemblp']:
                                    if ensemblname == ratname:
                                        # print all of the info for this uid
                                        entry['r_geneid'] += ':'.join(t for t in uniprot[uniqueid]['geneid']) + ":"
                                        entry['r_gene_description'] += ':'.join(t for t in uniprot[uniqueid]['description']) + ":"
                                        entry['r_uniprot'] += ':'.join(t for t in uniprot[uniqueid]['uniprotid']) + ":"
                                        entry['r_interpro'] += ':'.join(t for t in uniprot[uniqueid]['interpro']) + ":"
                                        entry['r_refseqn'] += ':'.join(t for t in uniprot[uniqueid]['refseqn']) + ":"
                                        entry['r_refseqp'] += ':'.join(t for t in uniprot[uniqueid]['refseqp']) + ":"
                                        entry['r_ensg'] += ':'.join(t for t in uniprot[uniqueid]['ensemblg']) + ":"
                                        entry['r_enst'] += ':'.join(t for t in uniprot[uniqueid]['ensemblt']) + ":"
                                        entry['r_ensp'] += ':'.join(t for t in uniprot[uniqueid]['ensemblp']) + ":"
        print "\t".join(entry[h].rstrip(":") for h in header)
def get_read_length(fq):
    lens = []
    for i, line in enumerate(ts.nopen(fq)):
        if i % 4 == 1:
            lens.append(len(line) - 1)
            if len(lens) > 100:
                break
    assert len(set(lens)) == 1, ("don't trim reads before sending to bwa-mips",
                                 set(lens))
    return lens[0]
def readstat(cifstat):
    with nopen(cifstat) as fh:
        clean = (x.strip("\r\n") for x in fh if x.strip())
        while True:
            rd = [x for x in islice(clean, 6)]
            if not rd:
                # end of stream; plain return instead of raising
                # StopIteration inside a generator (PEP 479)
                return
            assert all(rd) and len(rd) == 6
            yield CifStat(rd)
def readfq(fq):
    with nopen(fq) as fh:
        fqclean = (x.strip("\r\n") for x in fh if x.strip())
        while True:
            rd = [x for x in islice(fqclean, 4)]
            if not rd:
                # end of stream; plain return instead of raising
                # StopIteration inside a generator (PEP 479)
                return
            assert all(rd) and len(rd) == 4
            yield Fastq(rd)
def __call__(self, fh):
    out = tofile(fh, tempfile.mktemp())
    try:
        value = nopen("%s < %s" % (self.command_string, out)).next()
        return dict(value=float(value))
    except:
        print self.command_string
        raise
def fastqtodict(fastq, separator):
    """returns dict of read name -> [sequence, quality]"""
    fdict = {}
    with nopen(fastq) as fq:
        for name, seq, qual in read_fastq(fq):
            # split on the separator explicitly to facilitate future changes
            fdict[name.split(separator)[0]] = [seq, qual]
    return fdict
def read_acf(acf_file):
    acf_vals = {}
    for row in ts.nopen(acf_file):
        if row[0] == "#":
            continue
        row = row.split("\t")
        if row[0] == "lag_min":
            continue
        acf_vals[(int(row[0]), int(row[1]))] = float(row[2])
    return sorted(acf_vals.items())
def convert_reads(fq1s, fq2s, out=sys.stdout):
    for fq1, fq2 in zip(fq1s.split(","), fq2s.split(",")):
        sys.stderr.write("converting reads in %s,%s\n" % (fq1, fq2))
        fq1 = nopen(fq1)
        if fq2 != "NA":
            fq2 = nopen(fq2)
            q2_iter = izip(*[fq2] * 4)
        else:
            sys.stderr.write("WARNING: running bwameth in single-end mode\n")
            q2_iter = repeat((None, None, None, None))
        q1_iter = izip(*[fq1] * 4)
        lt80 = 0
        for pair in izip(q1_iter, q2_iter):
            for read_i, (name, seq, _, qual) in enumerate(pair):
                if name is None:
                    continue
                name = name.rstrip("\r\n").split(" ")[0]
                if name[0] != "@":
                    sys.stderr.write("""ERROR!!!!
ERROR!!! FASTQ conversion failed
ERROR!!! expecting FASTQ 4-tuples, but found a record %s that doesn't start with "@"
""" % name)
                    sys.exit(1)
                if name.endswith(("_R1", "_R2")):
                    name = name[:-3]
                elif name.endswith(("/1", "/2")):
                    name = name[:-2]
                seq = seq.upper().rstrip('\n')
                if len(seq) < 80:
                    lt80 += 1
                char_a, char_b = ['CT', 'GA'][read_i]
                # keep original sequence as name.
                name = " ".join((name,
                                 "YS:Z:" + seq +
                                 "\tYC:Z:" + char_a + char_b + '\n'))
                seq = seq.replace(char_a, char_b)
                out.write("".join((name, seq, "\n+\n", qual)))
        out.flush()
    if lt80 > 50:
        sys.stderr.write("WARNING: %i reads with length < 80\n" % lt80)
        sys.stderr.write("       : this program is designed for long reads\n")
    return 0
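# Minimal sketch of the in-silico bisulfite conversion rule applied above
# (illustrative sequence, not from the original source): read 1 is C->T
# converted, read 2 is G->A converted, and the original sequence travels in
# the YS:Z: tag so it can be restored after alignment.
seq = "ACGTCCGG"
assert seq.replace("C", "T") == "ATGTTTGG"  # read_i == 0 -> char_a, char_b = 'CT'
assert seq.replace("G", "A") == "ACATCCAA"  # read_i == 1 -> char_a, char_b = 'GA'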
def fqiter(fq, n=4):
    with ts.nopen(fq) as fh:
        fqclean = (x.strip("\r\n") for x in fh if x.strip())
        while True:
            rec = [x for x in islice(fqclean, n)]
            if not rec:
                # end of stream; plain return instead of raising
                # StopIteration inside a generator (PEP 479)
                return
            assert all(rec) and len(rec) == 4
            yield rec
def convert_reads(fq1s, fq2s, out=sys.stdout):
    for fq1, fq2 in zip(fq1s.split(","), fq2s.split(",")):
        sys.stderr.write("converting reads in %s,%s\n" % (fq1, fq2))
        fq1 = nopen(fq1)
        if fq2 != "NA":
            fq2 = nopen(fq2)
            q2_iter = izip(*[fq2] * 4)
        else:
            sys.stderr.write("WARNING: running bwameth in single-end mode\n")
            q2_iter = repeat((None, None, None, None))
        q1_iter = izip(*[fq1] * 4)
        lt80 = 0
        for pair in izip(q1_iter, q2_iter):
            for read_i, (name, seq, _, qual) in enumerate(pair):
                # catch the case where read 2 is absent or the read is GBS
                if name is None or 'ST:Z:gbs' in name:
                    continue
                original_name = name[:-1].replace(' ', '\t')
                if 'crick' in name.lower():
                    convert_list = ['CT', 'GA'][::-1]
                else:
                    convert_list = ['CT', 'GA']
                name = name.rstrip("\r\n").split(" ")[0]
                if name.endswith(("_R1", "_R2")):
                    name = name[:-3]
                elif name.endswith(("/1", "/2")):
                    name = name[:-2]
                seq = seq.upper().rstrip('\n')
                if len(seq) < 80:
                    lt80 += 1
                char_a, char_b = convert_list[read_i]
                # keep original sequence as name.
                name = "\t".join((original_name, "YS:Z:" + seq + '\n'))
                seq = seq.replace(char_a, char_b)
                out.write("".join((name, seq, "\n+\n", qual)))
        out.flush()
    out.close()
    if lt80 > 50:
        a = 1
def process_exact_fastq(fastq, n):
    """Group identical reads using a Counter. Returns Counter."""
    c = Counter()
    with nopen(fastq) as fh:
        for name, seq, qual in read_fastq(fh):
            seq = trim_seq(seq, 4)
            if len(seq) < n:
                continue
            c.update([seq])
    return c
def readfa(fa):
    with nopen(fa) as fh:
        for header, group in groupby(fh, lambda line: line[0] == '>'):
            if header:
                line = group.next()
                name = line[1:].strip()
            else:
                seq = ''.join(line.strip() for line in group)
                yield name, seq
def _qvality(fbed_file, col_num, col_null):
    from qvality import qvality
    ps = [b['p'] for b in bediter(fbed_file, col_num)]
    nulls = [b['p'] for b in bediter(fbed_file, col_null)]
    fh = ts.nopen(fbed_file)
    drop_header(fh)
    for (pval, pep, qval), l in izip(qvality(ps, nulls, r=None), fh):
        yield qval, pep, l
def bed_sample(bed, n=100):
    """
    Choose n random lines from a bed file. Uses reservoir sampling.

    Arguments:
        bed - a bed file
        n - number of lines to sample
    """
    n, lines = int(n), []
    from random import randint
    with nopen(bed) as fh:
        for i, line in enumerate(nopen(fh)):
            if i < n:
                lines.append(line)
            else:
                replace_idx = randint(0, i)
                if replace_idx < n:
                    lines[replace_idx] = line
    print "".join(lines),
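# Why the reservoir update above is uniform: line i (0-based) enters the
# reservoir with probability n/(i+1), which leaves every line an equal n/N
# chance of being kept. A quick empirical check with made-up parameters
# (n=3 lines kept out of N=10):
from random import randint

hits = [0] * 10
for _ in range(10000):
    kept = list(range(3))          # the first n lines fill the reservoir
    for i in range(3, 10):
        j = randint(0, i)
        if j < 3:
            kept[j] = i
    for v in kept:
        hits[v] += 1
# each of the 10 "lines" should be kept ~3000 times out of 10000 trials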
def counter(fname):
    fname = fname[0] if not isinstance(fname, basestring) else fname
    print >>sys.stderr, fname
    qual_count = [0] * 256
    for sam_line in (x.rstrip().split("\t") for x in nopen(fname)
                     if not x.startswith('@')):
        qual = int(sam_line[4])
        qual_count[qual] += 1
    # reversed cumulative sum: each entry is the number of reads with
    # mapping quality >= that value
    return np.cumsum(qual_count[::-1])[::-1]
def main(regions, bams, reads=None,
         flags="-F%i" % (0x100 | 0x4 | 0x200 | 0x400), pad=100):
    r2 = open(tempfile.mktemp(), 'w')
    for toks in reader(regions, header=False):
        if toks[0][0] == "@" or not (toks[1] + toks[2]).isdigit():
            continue
        toks[1] = str(max(0, int(toks[1]) - pad))
        toks[2] = str(int(toks[2]) + pad)
        print >> r2, "\t".join(toks)
    r2.flush()
    regions = r2.name

    print reads
    if reads.isdigit():
        reads = int(reads)
    elif reads != "bam":
        reads = int(nopen("|bioawk -c fastx 'END { print NR }' %s"
                          % reads).next()) * 2.0

    colors = cycle('rgbkmy')
    bam_reads = {}
    counts = dict(pmap(count_both, ((bam, regions, flags) for bam in bams)))
    for bam in bams:
        nreads = count_bam(bam, flags) if reads == "bam" else reads
        bam_reads[bam] = nreads
        symbol = 'o' if len(set(counts[bam][0])) < 3 else '.'
        pl.plot(counts[bam][0] / float(nreads), counts[bam][1] / float(nreads),
                '%s%s' % (colors.next(), symbol), label=name(bam))

    pl.xlabel('off target')
    pl.ylabel('on target')
    pl.legend(loc='lower right')
    pl.xlim(xmin=0)
    pl.ylim(ymin=0)
    pl.show()
    os.unlink(r2.name)

    out = sys.stdout
    print >> out, "qual\tmethod\toff\ton"
    for qual in range(0, 256):
        for b in bams:
            # index bam_reads by the loop variable b (not the stale `bam`
            # left over from the plotting loop above)
            print >> out, "{qual}\t{bam}\t{off}\t{on}".format(
                qual=qual, bam=name(b), off=counts[b][0][qual] / bam_reads[b],
                on=counts[b][1][qual] / bam_reads[b])
    print >> sys.stderr, "wrote", out.name