def getAlleleCount(bamfile, snpfile, outfile):
    """Count A/C/G/T read support at the positions listed in `snpfile`.

    Runs the `bamrc` executable (module-level name) on `bamfile`, passing the
    module-level `ref` as option `f`, `0` as option `w` and `snpfile` as
    option `l`, and redirects its output to `<outfile>.tmp`. That raw output
    is then reshaped into `outfile` with the columns
    Chrm, pos, A, C, G, T, Total, refCount, mutCount.

    Both the raw output and `snpfile` are walked forward in a single pass:
    for each raw row, SNP rows are consumed until one matches on chromosome
    (column 0) and end position (column 2 of the SNP row).

    Args:
        bamfile: Path of the BAM file handed to `bamrc`.
        snpfile: Path of the SNP list; column 6 is read as the reference
            allele and column 7 as the mutant allele.
        outfile: Path of the reformatted table. `<outfile>.tmp` is also
            written and is left in place.
    """
    brcparams = Box()
    brcparams.f = ref
    brcparams.w = 0
    brcparams.l = snpfile
    brcparams[''] = bamfile
    cmd = '{bamrc} {args} > {outfile!r}'.format(
        bamrc=bamrc,
        args=cmdargs(brcparams, equal=' '),
        outfile=outfile + '.tmp')
    runcmd(cmd)

    # reformat the raw output to the desired format
    reader = TsvReader(outfile + '.tmp', cnames=False)
    snper = TsvReader(snpfile, cnames=False)
    #chr1 564773 C 14 =:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 A:0:0.00:0.00:0.00:0:0:0.00:0.00:0.00:0:0.00:0.00:0.00 C:14:... G:0:... T:0:... N:0:...
    writer = TsvWriter(outfile)
    writer.cnames = ['Chrm', 'pos', 'A', 'C', 'G', 'T', 'Total', 'refCount', 'mutCount']

    try:
        for r in reader:
            # Consume SNP rows until one lines up with this raw row (or the
            # SNP list runs out).
            while True:
                try:
                    snp = next(snper)
                except StopIteration:
                    break
                # use the end position, in case it's 0-based
                if snp[0] != r[0] or snp[2] != r[1]:
                    continue

                # Each base field looks like "<base>:<count>:..."; keep the count.
                counts = dict(
                    A=r[5].split(':', 2)[1],
                    C=r[6].split(':', 2)[1],
                    G=r[7].split(':', 2)[1],
                    T=r[8].split(':', 2)[1]
                )
                rec = TsvRecord()
                rec.Chrm = r[0]
                rec.pos = r[1]
                rec.Total = r[3]
                rec.A = counts['A']
                rec.C = counts['C']
                rec.G = counts['G']
                rec.T = counts['T']
                # if reference allele is unknown, assuming all are ref alleles
                rec.refCount = counts.get(snp[6].upper(), r[3])
                # if mut allele is unknown, assuming no mutations happened
                rec.mutCount = counts.get(snp[7].upper(), 0)
                writer.write(rec)
                # matched: move on to the next raw row
                break
    finally:
        # Release all three handles even if a row fails to parse.
        writer.close()
        reader.close()
        snper.close()
def main(opts):
    """Write the union of two TF-gene sets to `opts.outfile`.

    Both `opts.origin` and `opts.addition` are loaded with `read_tfgenes`.
    Every (tf, gene) pair present in either one is written once, as a
    two-column row `[tf, gene]`.
    """
    original = read_tfgenes(opts.origin)
    extra = read_tfgenes(opts.addition)
    writer = TsvWriter(opts.outfile)
    logger.info('Writing the union set to %s ...', opts.outfile)

    def emit(gene, tfs):
        for tf in tfs:
            writer.write([tf, gene])

    # Genes of the original set, merged with whatever the addition has for
    # the same gene; popping leaves only addition-only genes behind.
    for gene, tfs in original.items():
        emit(gene, tfs | extra.pop(gene, set()))

    # Genes that only appear in the addition.
    for gene, tfs in extra.items():
        emit(gene, tfs)

    writer.close()
    logger.info('Done.')
# Sketch of the expression file layout (column/row labels as written by the
# original author; the line breaks of the sketch were lost).
""" S1 S2 .. Sn G1 ... G2 ... """
expreader = TsvReader(expfile)
# Keep only rows whose first field is one of the genes or TFs of interest.
expdata = []
for row in expreader:
    if row[0] in genes or row[0] in tfs:
        expdata.append(row)
expreader.close()

# Write the kept rows transposed: the header becomes the kept row names and
# each remaining input column becomes one output row.
datawriter = TsvWriter(outdata)
for colidx, colname in enumerate(expreader.cnames):
    if colidx > 0:
        datawriter.write([colname] + [row[colidx] for row in expdata])
        continue
    # genes + tfs
    datawriter.cnames = [row[0] for row in expdata]
    datawriter.writeHead()
datawriter.close()
del expdata

# Drop genes/TFs that did not make it into the written header, then prune
# the gene -> TFs mapping accordingly.
genes = [g for g in genes if g in datawriter.cnames]
tfs = [g for g in tfs if g in datawriter.cnames]
pruned = {}
for gene, gtfs in genetfs.items():
    if gene in genes:
        pruned[gene] = [tf for tf in gtfs if tf in tfs]
genetfs = pruned

# save the group file
# mutfile
""" S1 S2 .. Sn M1 ... (0/1/2/NA) M2 ... """
mutreader = TsvReader(mutfile)
# Collect the distinct SNPs (column 0) and genes (column 1) of the input.
reader = TsvReader(infile, cnames=False)
allsnps = set(reader.dump(0))
reader.rewind()
allgenes = set(reader.dump(1))
reader.close()

# assign a probability to each snp
# (a random integer slot in [0, ngenes * snppergene); slot // snppergene is
# the index of the gene the SNP is placed next to)
nsnps = len(allsnps)
ngenes = len(allgenes)
snp_probs = dict(zip(allsnps, random.choices(range(ngenes * snppergene), k=nsnps)))

# Bucket the SNPs by gene index once, instead of rescanning every SNP for
# every gene. Insertion order within a bucket follows snp_probs, so each
# gene sees its SNPs in the same order as a per-gene filter would give.
snps_by_gene = {}
for snp, slot in snp_probs.items():
    snps_by_gene.setdefault(slot // snppergene, []).append(snp)

genebed = TsvWriter(genefile)
snpbed = TsvWriter(snpfile)
geneperchr = math.ceil(float(ngenes) / float(nchr))
for i, gene in enumerate(allgenes):
    # Genes are dealt round-robin onto chr1..chr<nchr>, `dist` apart.
    chrname = 'chr' + str(int(i % nchr) + 1)
    start = (int(i / nchr) + 1) * dist
    end = start + 1
    # SNPs of this gene are laid out on consecutive positions starting here.
    first_snp_pos = int(start - dist/2.0 - snppergene)
    snps = snps_by_gene.get(i, [])
    genebed.write([chrname, start, end, gene, 0, '+'])
    for j, snp in enumerate(snps):
        snppos = first_snp_pos + j
        snpbed.write([chrname, snppos, snppos, snp, 0, '+'])
genebed.close()
snpbed.close()
        # NOTE(review): the line below is the tail of an expression that
        # starts outside this excerpt; indentation here is reconstructed.
        (r.IID, r))).dump())
else:
    # No metadata available: every sample gets default tfam fields below.
    metadata = None

logger.info('Reading genotype matrix ...')
# snp1 gt1s1 gt1s2 ...
inreader = TsvReader(infile, cnames=True)
# Sample names are the header fields after the first (SNP) column.
samples = inreader.meta[1:]

logger.info('Writing tfam file ...')
tfamWriter = TsvWriter(tfamfile)
tfamWriter.meta = ['FID', 'IID', 'PID', 'MID', 'Sex', 'Pheno']
#tfamWriter.writeHead(callback = lambda meta: '#' + '\t'.join(meta))
if not metadata:
    # Defaults: family id = sample id, no parents, sex 'other', phenotype '-9'.
    for s in samples:
        tfamWriter.write([s, s, '0', '0', 'other', '-9'])
else:
    # Take each field from the sample's metadata record when it exists;
    # a missing sample or field falls back to the same defaults as above.
    # For PID/MID/Sex/Pheno a falsy stored value also falls back.
    for s in samples:
        tfamWriter.write([
            metadata[s].FID if s in metadata and 'FID' in metadata[s] else s,
            s,
            (metadata[s].PID or '0') if s in metadata and 'PID' in metadata[s] else '0',
            (metadata[s].MID or '0') if s in metadata and 'MID' in metadata[s] else '0',
            (metadata[s].Sex or 'other') if s in metadata and 'Sex' in metadata[s] else 'other',
            (metadata[s].Pheno or '-9') if s in metadata and 'Pheno' in metadata[s] else '-9'
        ])
tfamWriter.close()
    # NOTE(review): this loop is the tail of a function whose `def` line is
    # outside this excerpt (called below as getNameFromAttrs). The original
    # indentation was lost; the nesting shown is a reconstruction - confirm.
    # Walk the attribute keys in sorted order, ignoring the ones that are
    # output column names, and return the value of the chosen key.
    for key in sorted(attrs.keys()):
        if key in writer.cnames:
            continue
        if 'id' in key.lower():
            return attrs[key]
        if 'name' in key.lower():
            return attrs[key]
        # NOTE(review): if this return really sits inside the loop, the two
        # checks above never change the result - verify against the file.
        return attrs[key]

# Convert every GFF record into one output record.
gff = Gff(infile)
for record in gff:
    r = TsvRecord()
    r.CHR = record['seqid']
    r.START = record['start']
    r.END = record['end']
    r.SCORE = record['score']
    r.STRAND = record['strand']
    attrs = record['attributes']
    # Mirror the fixed fields into attrs under the output column names, so
    # the `writer.cnames` filters (above and below) skip them.
    attrs.update(dict(
        CHR = r.CHR,
        START = r.START,
        END = r.END,
        SCORE = r.SCORE,
        STRAND = r.STRAND
    ))
    r.NAME = getNameFromAttrs(attrs)
    if keepinfo:
        # Remaining attributes, serialised as "key=value; key=value".
        r.ORIGINAL = '; '.join('{}={}'.format(k,v) for k, v in attrs.items() if k not in writer.cnames)
    writer.write(r)
# Reduce two tables to the columns they have in common and write each one
# out with those columns in the same order.
indata1 = TsvReader(infile1, **inopts1)
indata2 = TsvReader(infile2, **inopts2)

# When a file has row names, its first header field labels them and is not
# a data column.
cnames1 = indata1.meta if not rnames1 else indata1.meta[1:]
cnames2 = indata2.meta if not rnames2 else indata2.meta[1:]

# Shared column names, de-duplicated, in the order they appear in file 1.
# (A bare set intersection would make the output column order depend on set
# iteration order, which is not stable from run to run.)
in_second = set(cnames2)
seen = set()
paired = []
for cname in cnames1:
    if cname in in_second and cname not in seen:
        seen.add(cname)
        paired.append(cname)
cnames1 = cnames2 = paired

if rnames1:
    cnames1 = [indata1.meta[0]] + cnames1
if rnames2:
    cnames2 = [indata2.meta[0]] + cnames2

# Positions of the selected columns in each input file.
cindex1 = [indata1.meta.index(c) for c in cnames1]
cindex2 = [indata2.meta.index(c) for c in cnames2]

outdata1 = TsvWriter(outfile1)
outdata2 = TsvWriter(outfile2)
outdata1.meta = cnames1
outdata2.meta = cnames2
outdata1.writeHead()
outdata2.writeHead()

for r1 in indata1:
    outdata1.write([r1[i] for i in cindex1])
outdata1.close()

for r2 in indata2:
    outdata2.write([r2[i] for i in cindex2])
outdata2.close()
from bioprocs.utils.tsvio2 import TsvReader, TsvWriter

# Values below are filled in by the template engine before the script runs.
infile = {{i.infile | quote}}
outfile = {{o.outfile | quote}}
inopts = {{args.inopts | repr}}
infmt = {{args.infmt | quote}}
cutoff = {{args.cutoff | repr}}

# node -> number of edges it takes part in
degrees = defaultdict(lambda: 0)
if infmt.startswith('pair'):
    reader = TsvReader(infile, **inopts)
    for r in reader:
        if cutoff:
            # A cutoff needs a numeric score in the 3rd column. float()
            # raises ValueError for non-numeric text and TypeError for
            # non-string/non-number values; report both the same way.
            try:
                score = float(r[2])
            except (TypeError, ValueError):
                raise TypeError(
                    'The 3rd column should be a score for apply the cutoff.')
            if score < cutoff:
                continue
        degrees[r[0]] += 1
        degrees[r[1]] += 1

    # Nodes are written from highest to lowest degree.
    writer = TsvWriter(outfile)
    for node in sorted(degrees.keys(), key=lambda x: degrees[x], reverse=True):
        if infmt.endswith('complete'):
            # '...complete' formats get half the raw count (presumably each
            # pair is listed in both directions - confirm).
            writer.write([node, int(int(degrees[node]) / 2)])
        else:
            writer.write([node, degrees[node]])
    writer.close()
else:
    raise ValueError('Input format other than "pair" not supported yet.')
# Write one BED-like region per input gene, extended by region.up /
# region.down around the strand-aware start of the gene.
writer.cnames = ['CHROM', 'START', 'END', 'NAME', 'SCORE', 'STRAND']
writer.writeHead()
for r in reader:
    gene = r[genecol]
    if gene not in genes:
        msg = 'Gene does not exist: {}'.format(gene)
        if notfound == 'error':
            raise ValueError(msg)
        # Log the formatted message itself; the previous call passed a
        # template containing the literal text '{msg}'.
        log2pyppl(msg, 'warning')
        continue

    chrom, _, _, start, end, _, strand = genes[gene]
    start, end = int(start), int(end)
    if strand == '-':
        # Minus strand: the region is anchored at `end`; `up` extends past
        # it and `down` reaches back towards `start`.
        record = [
            chrom,
            min(start, end - region.down) if region.withbody else end - region.down,
            end + region.up,
            gene,
            0,
            strand
        ]
    else:
        # Any other strand: the region is anchored at `start`.
        record = [
            chrom,
            start - region.up,
            max(end, start + region.down) if region.withbody else start + region.down,
            gene,
            0,
            strand
        ]
    writer.write(record)
writer.close()
    # NOTE(review): this loop sits in a branch whose `if` header is outside
    # this excerpt; indentation is reconstructed from the trailing `else:`.
    # Query the snp<dbsnpver> table in chunks of 1000 names per statement.
    for i in range(0, len(snplist), 1000):
        chunk = snplist[i:i + 1000]
        # NOTE(review): SNP names are quoted and spliced into the SQL text
        # without escaping - safe only if the names are trusted.
        sql = 'SELECT chrom, chromStart, chromEnd, name, score, strand, refUCSC, alleles, alleleFreqs FROM snp{dbsnpver} WHERE name in ({snps})'.format(
            dbsnpver=dbsnpver,
            snps=', '.join("'{}'".format(s) for s in chunk))
        result = g.sql(sql)
        for r in result:
            # allele -> frequency, paired up from the two comma-separated fields
            allfreqs = dict(zip(r.alleles.split(','), r.alleleFreqs.split(',')))
            # The reference allele is reported separately, so pull its
            # frequency out ('0' when it is not listed) and drop it.
            reffreq = allfreqs.get(r.refUCSC, '0')
            if r.refUCSC in allfreqs:
                del allfreqs[r.refUCSC]
            # Drop the empty allele key (presumably from a trailing comma).
            if '' in allfreqs:
                del allfreqs['']
            # Last two columns: the non-reference alleles, then the
            # frequencies with the reference frequency first.
            writer.write([
                r.chrom,
                r.chromStart,
                r.chromEnd,
                r.name,
                r.score,
                r.strand,
                r.refUCSC,
                ','.join(allfreqs.keys()),
                ','.join([reffreq] + list(allfreqs.values()))
            ])
    writer.close()
else:
    # snps
    # Write column `snpcol` of snpfile as a one-column list file in jobindir.
    snplist = path.join(jobindir, path.basename(snpfile) + '.list')
    reader = TsvReader(snpfile, cnames=False)
    writer = TsvWriter(snplist)
    for r in reader:
        writer.write([r[snpcol]])
    reader.close()
    writer.close()
    # NOTE(review): assumed to belong to this branch - confirm nesting.
    shell.TOOLS.vcftools = vcftools
raise ValueError('Method %s not supported yet.' % method) def numpval(pval): try: return float(pval) except TypeError: return 1.0 reader = TsvReader(infile) writer = TsvWriter(outfile) prevsnp = None prevpvals = [] for r in reader: snp = r.Case.split('.')[0] if snp != prevsnp: if prevsnp: writer.write([ prevsnp, aggregate(prevpvals, method) ]) prevsnp = snp prevpvals = [numpval(r.Pval)] else: prevpvals.append(numpval(r.Pval)) writer.write([ prevsnp, aggregate(prevpvals, method) ]) writer.close()
# get number of lines of affysnps file total = wc_l(affysnps) dists = distribute(total, nthread) reader = TsvReader(affysnps, cnames = False) # dir to save the split file and result file thdir = path.join(outdir, 'bamrc.nthreads') if not path.exists(thdir): makedirs(thdir) asbname = path.basename(affysnps).split('.')[0] for i, dist in enumerate(dists): writer = TsvWriter(path.join(thdir, '{bname}.thread{i}.snp'.format( bname = asbname, i = i ))) for _ in range(dist): writer.write(next(reader)) writer.close() para = Parallel(nthread, raiseExc = True) para.run(getAlleleCount, [ (tumbam, path.join( thdir, '{bname}.thread{i}.snp'.format(bname = asbname, i = i) ), path.join( thdir, '{tumbn}.thread{i}.bamrc'.format(tumbn = path.basename(tumbam), i = i) )) for i in range(nthread) ]) # merge to tumsnp writer = TsvWriter(tumsnp) writer.cnames = ['Chrm', 'pos', 'A', 'C', 'G', 'T', 'Total', 'refCount', 'mutCount'] writer.writeHead(lambda cn: "#" + "\t".join(cn)) for i in range(nthread):