def __call__(self, **kw):
    assembly_id = kw.get('assembly') or None
    assembly = genrep.Assembly(assembly_id)
    tinput = track(kw.get('track'), chrmeta=assembly.chrmeta)
    try:
        thPromot = int(kw.get("promoter"))
    except (ValueError, TypeError):
        thPromot = prom_def
    try:
        thInter = int(kw.get("intergenic"))
    except (ValueError, TypeError):
        thInter = inter_def
    try:
        thUTR = int(kw.get("UTR"))
    except (ValueError, TypeError):
        thUTR = utr_def
    output = self.temporary_path(fname=tinput.name+'_annotated.txt')
    _fields = tinput.fields+['gene', 'location_type', 'distance']
    tout = track(output, format='txt', fields=_fields)
    tout.make_header("#"+"\t".join(tout.fields))
    for chrom in assembly.chrnames:
        tout.write(getNearestFeature(
                       tinput.read(selection=chrom),
                       assembly.gene_track(chrom),
                       thPromot, thInter, thUTR),
                   mode='append')
    tout.close()
    self.new_file(output, 'table')
    return self.display_time()
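# The three try/except blocks in __call__ above repeat the same "int() with a
# fallback" pattern. A minimal sketch of a helper that could factor it out
# (the name _int_or_default is illustrative, not part of the plugin API):
def _int_or_default(value, default):
    """Return int(value), or `default` if value is None or not a valid integer."""
    try:
        return int(value)
    except (ValueError, TypeError):
        return default

# e.g. thPromot = _int_or_default(kw.get("promoter"), prom_def)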
def test_getNearestFeature(self):
    features = fstream([('chrII', 14795327, 14798367)],
                       fields=['chr', 'start', 'end'])
    # getNearestFeature appends three fields to each feature: the nearest
    # gene(s) as 'id|symbol' pairs joined by '_', a location type, and the
    # corresponding distances joined the same way ('28_0').
    expected = [('chrII', 14795327, 14798367,
                 'Y54E2A.12|tbc-20_Y54E2A.11|eif-3.B',
                 'Promot_Included', '28_0')]
    annotations = self.assembly.gene_track(chromlist=['chrII'])
    res = list(getNearestFeature(features, annotations))
    self.assertItemsEqual(res, expected)
def all_snps(ex, chrom, vcfs, bams, outall, assembly, sample_names,
             mincov, minsnp, logfile=sys.stdout, debugfile=sys.stderr):
    """For a given chromosome, returns a summary file containing all SNPs identified
    in at least one of the samples. Each row contains:
    chromosome id, SNP position, reference base, SNP base (with proportions).

    :param chrom: (str) chromosome name.
    :param outall: (str) name of the file that will contain the list of all SNPs.
    :param sample_names: (list of str) list of sample names.
    :param mincov: (int) minimum number of reads supporting a SNP at a position
        for it to be considered. [5]
    :param minsnp: (int) minimum percentage of reads supporting the SNP for it to be
        returned. N.B.: effectively, half of it on each strand for diploids. [40]
    """
    ploidy = assembly.ploidy
    allsnps = []
    nsamples = len(sample_names)
    sorder = range(len(sample_names))
    current = [None]*nsamples
    bam_tracks = [track(v, format='bam') for k, v in sorted(bams.items())]
    vcf_handles = [open(v) for k, v in sorted(vcfs.items())]
    # Skip the VCF headers and load the first record of each sample
    for i, vh in enumerate(vcf_handles):
        line = '#'
        while line and line[0] == '#':
            line = vh.readline()
        current[i] = parse_vcf(line)
    lastpos = 0
    pos = -1
    # Merge the position-sorted per-sample VCF streams: at each step, consume
    # all records sitting at the smallest current position.
    while any(current):
        current_pos = [int(x[0][1]) if x else sys.maxint for x in current]
        pos = min(current_pos)
        if pos == sys.maxint: break
        current_snp_idx = set(i for i in range(nsamples) if current_pos[i] == pos)
        current_snps = ["0"]*nsamples
        for i in current_snp_idx:
            general, snp_info, sample_stats = current[i]
            chrbam = general[0]
            ref = general[2]
            current_snps[i] = filter_snp(general, snp_info, sample_stats,
                                         mincov, minsnp, ploidy)
        # If there is still a SNP called at this position after filtering
        if any(current_snps[i] not in ("-","0") for i in current_snp_idx):
            # Samples without a call but with coverage at this position get '-'
            for i in set(range(nsamples))-current_snp_idx:
                for coverage in bam_tracks[sorder[i]].coverage((chrom, pos-1, pos)):
                    if coverage[-1] > 0:
                        current_snps[i] = '-'  # '/'.join([ref]*ploidy)
            if pos != lastpos:  # an indel can be located at the same position as a SNP
                allsnps.append((chrom, pos-1, pos, ref)+tuple(current_snps))
                lastpos = pos
        for i in current_snp_idx:
            current[i] = parse_vcf(vcf_handles[i].readline())
    for f in vcf_handles: f.close()
    for b in bam_tracks: b.close()

    logfile.write("  Annotate all SNPs\n"); logfile.flush()
    snp_read = FeatureStream(allsnps, fields=['chr','start','end','name']+sample_names)
    try:
        annotation = assembly.gene_track(chrom)
        annotated_stream = gm_stream.getNearestFeature(
            snp_read, annotation,
            thresholdPromot=3000, thresholdInter=3000, thresholdUTR=10)
    except:
        annotated_stream = snp_read
    logfile.write("  Write all SNPs\n"); logfile.flush()
    with open(outall, "a") as fout:
        for snp in annotated_stream:
            # snp: ('chrV',154529,154530,'T','A (50% of 10)','A (80% of 10)',
            #       'YER002W|NOP16_YER001W|MNN1','Upstream_Included','2271_1011')
            fout.write('\t'.join(str(x) for x in (snp[0],)+snp[2:])+'\n')  # drop the 0-based start coord
    return allsnps
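# The while-loop in all_snps above is a k-way merge over position-sorted
# per-sample VCF streams: all streams sitting at the minimal current position
# are consumed together, so every emitted row has one column per sample. A
# self-contained sketch of that pattern, using plain (pos, value) tuples in
# place of parse_vcf() records (all names here are illustrative):
def _kway_merge_by_pos(streams, missing="0"):
    """Yield (pos, value_1, ..., value_k) rows from k position-sorted iterators."""
    iters = [iter(s) for s in streams]
    current = [next(it, None) for it in iters]
    while any(c is not None for c in current):
        pos = min(c[0] for c in current if c is not None)
        row = [missing]*len(iters)
        for i, c in enumerate(current):
            if c is not None and c[0] == pos:
                row[i] = c[1]
                current[i] = next(iters[i], None)  # advance only the consumed streams
        yield (pos,)+tuple(row)

# e.g. list(_kway_merge_by_pos([[(3,'A'),(7,'T')], [(3,'G'),(5,'C')]]))
# -> [(3, 'A', 'G'), (5, '0', 'C'), (7, 'T', '0')]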
def all_snps(ex, chrom, vcfs, bams, outall, assembly, headerfile, sample_names,
             mincov, minsnp, logfile=sys.stdout, debugfile=sys.stderr, via='local'):
    """For a given chromosome, returns a summary file containing all SNPs identified
    in at least one of the samples. Each row contains:
    chromosome id, SNP position, reference base, SNP base (with proportions).

    :param chrom: (str) chromosome name.
    :param vcfs: (dict) vcf files for each sample, dictionary keys are group ids.
    :param bams: (dict) bam files organized like the vcf files.
    :param outall: (str) name of the file that will contain the list of all SNPs.
    :param assembly: (genrep.Assembly) assembly for the fasta files and ploidy value.
    :param headerfile: (str) name of the file with a substitute bam header to match the fasta files.
    :param sample_names: (list of str) list of sample names.
    :param mincov: (int) minimum number of reads supporting a SNP at a position
        for it to be considered. [5]
    :param minsnp: (int) minimum percentage of reads supporting the SNP for it to be
        returned. N.B.: effectively, half of it on each strand for diploids. [40]
    """
    ploidy = assembly.ploidy
    reffasta = assembly.fasta_by_chrom[chrom]
    allsnps = []
    nsamples = len(sample_names)
    sorder = range(len(sample_names))
    current = [None]*nsamples
    #####
    if nsamples > 1:
        # Collect the union of SNP positions over all samples, then re-pileup
        # every bam at exactly those positions so each sample has a VCF record
        # at every candidate position.
        poslist = set()
        bamchrom = None
        for vf in vcfs.values():
            with open(vf) as vh:
                for line in vh:
                    if (not line) or line[0] == '#': continue
                    line = line.strip().split('\t')
                    if bamchrom is None: bamchrom = line[0]
                    poslist.add(int(line[1]))
        snplist = unique_filename_in()
        with open(snplist, "w") as snpfh:
            snpfh.write("\n".join("%s\t%i" % (bamchrom, pos) for pos in sorted(poslist)))
        pilejobs = []
        vcfs2 = {}
        for gid, bamfile in bams.iteritems():
            vcfs2[gid] = unique_filename_in()
            pilejobs.append(pileup.nonblocking(ex, bamfile, reffasta, step="list",
                                               bedfile=snplist, header=headerfile,
                                               via=via, stdout=vcfs2[gid]))
        for job in pilejobs:
            job.wait()
    else:
        vcfs2 = vcfs
    #####
    bam_tracks = [track(v, format='bam') for k, v in sorted(bams.items())]
    vcf_handles = [open(v) for k, v in sorted(vcfs2.items())]
    # Skip the VCF headers and load the first record of each sample
    for i, vh in enumerate(vcf_handles):
        line = '#'
        while line and line[0] == '#':
            line = vh.readline()
        current[i] = parse_vcf(line)
    lastpos = 0
    pos = -1
    # Merge the position-sorted per-sample VCF streams: at each step, consume
    # all records sitting at the smallest current position.
    while any(current):
        current_pos = [int(x[0][1]) if x else sys.maxint for x in current]
        pos = min(current_pos)
        if pos == sys.maxint: break
        current_snp_idx = set(i for i in range(nsamples) if current_pos[i] == pos)
        current_snps = ["0"]*nsamples
        for i in current_snp_idx:
            general, snp_info, sample_stats = current[i]
            chrbam = general[0]
            ref = general[2]
            current_snps[i] = filter_snp(general, snp_info, sample_stats,
                                         mincov, minsnp, ploidy)
        # If there is still a SNP called at this position after filtering
        if any(current_snps[i] not in ("-","0") for i in current_snp_idx):
            # Samples without a call but with coverage at this position get '-'
            for i in set(range(nsamples))-current_snp_idx:
                for coverage in bam_tracks[sorder[i]].coverage((chrom, pos-1, pos)):
                    if coverage[-1] > 0:
                        current_snps[i] = '-'  # '/'.join([ref]*ploidy)
            if pos != lastpos:  # an indel can be located at the same position as a SNP
                allsnps.append((chrom, pos-1, pos, ref)+tuple(current_snps))
                lastpos = pos
        for i in current_snp_idx:
            current[i] = parse_vcf(vcf_handles[i].readline())
    for f in vcf_handles: f.close()
    for b in bam_tracks: b.close()

    logfile.write("  Annotate all SNPs\n"); logfile.flush()
    snp_read = FeatureStream(allsnps, fields=['chr','start','end','name']+sample_names)
    try:
        annotation = assembly.gene_track(chrom)
        annotated_stream = gm_stream.getNearestFeature(
            snp_read, annotation,
            thresholdPromot=3000, thresholdInter=3000, thresholdUTR=10)
    except:
        annotated_stream = snp_read
    logfile.write("  Write all SNPs\n"); logfile.flush()
    with open(outall, "a") as fout:
        for snp in annotated_stream:
            # snp: ('chrV',154529,154530,'T','A (50% of 10)','A (80% of 10)',
            #       'YER002W|NOP16_YER001W|MNN1','Upstream_Included','2271_1011')
            fout.write('\t'.join(str(x) for x in (snp[0],)+snp[2:])+'\n')  # drop the 0-based start coord
    return allsnps
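# Sketch of a consumer for one line of `outall` as written above: the 0-based
# start was dropped, so the columns are chromosome, 1-based position, reference
# base, one call per sample, then (when annotation succeeded) the three fields
# added by getNearestFeature. This parser is illustrative, not part of the
# module, and assumes an annotated line:
def _read_outall_line(line):
    """Split an annotated-SNP line into (chr, pos, ref, per-sample calls, annotation)."""
    fields = line.rstrip("\n").split("\t")
    chrom, pos, ref = fields[0], int(fields[1]), fields[2]
    gene, location_type, distance = fields[-3:]
    sample_calls = fields[3:-3]  # '0' = no call, '-' = covered but no SNP retained
    return chrom, pos, ref, sample_calls, (gene, location_type, distance)

# e.g. _read_outall_line('chrV\t154530\tT\tA (50% of 10)\tA (80% of 10)\t'
#                        'YER002W|NOP16_YER001W|MNN1\tUpstream_Included\t2271_1011')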
def chipseq_workflow( ex, job_or_dict, assembly, script_path='', logfile=sys.stdout, via='lsf' ):
    """Runs a chipseq workflow over bam files obtained by mapseq. Will optionally run ``macs`` and ``run_deconv``.

    :param ex: a 'bein' execution environment to run jobs in,
    :param job_or_dict: a 'Frontend' 'job' object, or a dictionary with keys 'groups', 'files' and 'options' if applicable,
    :param assembly: a genrep.Assembly object,
    :param script_path: only needed if 'run_deconv' is in the job options, must point to the location of the R scripts.

    Default ``macs`` parameters (overridden by ``job_or_dict['options']['macs_args']``) are set as follows:

    * ``'--bw'``: 200 ('bandwidth')
    * ``'-m'``: 10,100 ('minimum and maximum enrichments relative to background or control')

    The enrichment bounds will be computed from a Poisson threshold *T*, if available, as *(min(30,5*(T+1)),50*(T+1))*.

    Returns a dictionary with key ``macs``, plus ``deconv`` and ``meme`` if applicable,
    whose values are the corresponding file description dictionaries.
    """
    options = {}
    if logfile is None: logfile = sys.stdout
    if isinstance(job_or_dict, frontend.Job):
        options = job_or_dict.options
        groups = job_or_dict.groups
        mapseq_files = job_or_dict.files
    elif isinstance(job_or_dict, dict) and 'groups' in job_or_dict:
        if 'options' in job_or_dict:
            options = job_or_dict['options']
        groups = job_or_dict['groups']
        for gid in groups.keys():
            if not('name' in groups[gid]):
                groups[gid]['name'] = gid
        mapseq_files = job_or_dict.get('files', {})
    else:
        raise TypeError("job_or_dict must be a frontend.Job object or a dictionary with key 'groups'.")
    merge_strands = int(options.get('merge_strands', -1))
    suffixes = ["fwd", "rev"]
    peak_deconvolution = options.get('peak_deconvolution', False)
    if isinstance(peak_deconvolution, basestring):
        peak_deconvolution = peak_deconvolution.lower() in ['1', 'true', 't']
    run_meme = options.get('run_meme', False)
    if isinstance(run_meme, basestring):
        run_meme = run_meme.lower() in ['1', 'true', 't']
    macs_args = options.get('macs_args', ["--bw", "200"])
    b2w_args = options.get('b2w_args', [])
    if not(isinstance(mapseq_files, dict)):
        raise TypeError("Mapseq_files must be a dictionary.")
    tests = []
    controls = []
    names = {'tests': [], 'controls': []}
    read_length = []
    p_thresh = {}
    for gid, mapped in mapseq_files.iteritems():
        group_name = groups[gid]['name']
        if not(isinstance(mapped, dict)):
            raise TypeError("Mapseq_files values must be dictionaries with keys *run_ids* or 'bam'.")
        if 'bam' in mapped:
            mapped = {'_': mapped}
        futures = {}
        ptruns = []
        for k in mapped.keys():
            if not 'libname' in mapped[k]:
                mapped[k]['libname'] = group_name+"_"+str(k)
            if not 'stats' in mapped[k]:
                futures[k] = mapseq.bamstats.nonblocking( ex, mapped[k]["bam"], via=via )
            if mapped[k].get('poisson_threshold', -1) > 0:
                ptruns.append(mapped[k]['poisson_threshold'])
        if len(ptruns) > 0:
            p_thresh[group_name] = sum(ptruns)/len(ptruns)
        for k in futures.keys():
            mapped[k]['stats'] = futures[k].wait()
        if len(mapped) > 1:
            bamfile = mapseq.merge_bam(ex, [m['bam'] for m in mapped.values()])
        else:
            bamfile = mapped.values()[0]['bam']
        if groups[gid]['control']:
            controls.append(bamfile)
            names['controls'].append((gid, group_name))
        else:
            tests.append(bamfile)
            names['tests'].append((gid, group_name))
            read_length.append(mapped.values()[0]['stats']['read_length'])
            genome_size = mapped.values()[0]['stats']['genome_size']
    if len(controls) < 1:
        controls = [None]
        names['controls'] = [(0, None)]

    logfile.write("Starting MACS.\n"); logfile.flush()
    processed = {'macs': add_macs_results( ex, read_length, genome_size,
                                           tests, ctrlbam=controls, name=names,
                                           poisson_threshold=p_thresh,
                                           macs_args=macs_args, via=via )}
    logfile.write("Done MACS.\n"); logfile.flush()
    peak_list = {}
    chrlist = assembly.chrmeta
    ## select only peaks with p-val <= 1e-0.6 = .25 => score = -10log10(p) >= 6
    _select = {'score': (6, sys.maxint)}
    _fields = ['chr','start','end','name','score']
    for i, name in enumerate(names['tests']):
        if len(names['controls']) < 2:
            ctrl = (name, names['controls'][0])
            macsbed = track(processed['macs'][ctrl]+"_summits.bed",
                            chrmeta=chrlist, fields=_fields).read(selection=_select)
        else:
            macsbed = concatenate(
                [apply(track(processed['macs'][(name,x)]+"_summits.bed",
                             chrmeta=chrlist, fields=_fields).read(selection=_select),
                       'name', lambda __n, _n=xn: "%s:%i" % (__n, _n))
                 for xn, x in enumerate(names['controls'])])
        ##############################
        macs_neighb = neighborhood( macsbed, before_start=150, after_end=150 )
        peak_list[name] = unique_filename_in()+".sql"
        macs_final = track( peak_list[name], chrmeta=chrlist,
                            info={'datatype': 'qualitative'},
                            fields=['start','end','name','score'] )
        macs_final.write(fusion(macs_neighb), clip=True)
        macs_final.close()
        ##############################

    merged_wig = {}
    options['read_extension'] = int(options.get('read_extension') or read_length[0])
    if options['read_extension'] < 1:
        options['read_extension'] = read_length[0]
    make_wigs = merge_strands >= 0 or options['read_extension'] > 100
    if options['read_extension'] > 100:
        options['read_extension'] = 50
    for gid, mapped in mapseq_files.iteritems():
#        if groups[gid]['control']: continue
        group_name = groups[gid]['name']
        wig = []
        for m in mapped.values():
            if make_wigs or not('wig' in m) or len(m['wig']) < 2:
                output = mapseq.parallel_density_sql( ex, m["bam"], assembly.chrmeta,
                                                      nreads=m["stats"]["total"],
                                                      merge=-1,
                                                      read_extension=options['read_extension'],
                                                      convert=False,
                                                      b2w_args=b2w_args, via=via )
                wig.append(dict((s, output+s+'.sql') for s in suffixes))
            else:
                wig.append(m['wig'])
        if len(wig) > 1:
            merged_wig[group_name] = dict((s, merge_sql(ex, [x[s] for x in wig], via=via))
                                          for s in suffixes)
        else:
            merged_wig[group_name] = wig[0]

    if peak_deconvolution:
        ##############################
        def _filter_deconv( stream, pval ):
            ferr = re.compile(r';FERR=([\d\.]+)$')
            return FeatureStream(
                ((x[0],)+((x[2]+x[1])/2-150, (x[2]+x[1])/2+150)+x[3:]
                 for x in stream
                 if "FERR=" in x[3] and float(ferr.search(x[3]).groups()[0]) <= pval),
                fields=stream.fields )
        ##############################
        processed['deconv'] = {}
        for name in names['tests']:
            logfile.write(name[1]+" deconvolution.\n"); logfile.flush()
            if len(names['controls']) < 2:
                ctrl = (name, names['controls'][0])
                macsbed = processed['macs'][ctrl]+"_peaks.bed"
            else:
                macsbed = intersect_many_bed( ex, [processed['macs'][(name,x)]+"_peaks.bed"
                                                   for x in names['controls']], via=via )
            deconv = run_deconv( ex, merged_wig[name[1]], macsbed, assembly.chrmeta,
                                 options['read_extension'], script_path, via=via )
            peak_list[name] = unique_filename_in()+".bed"
            trbed = track(deconv['peaks']).read()
            with track(peak_list[name], chrmeta=chrlist, fields=trbed.fields) as bedfile:
                bedfile.write(fusion(_filter_deconv(trbed, 0.65)))
            ex.add(deconv['peaks'],
                   description=set_file_descr(name[1]+'_peaks.sql', type='sql',
                                              step='deconvolution', groupId=name[0]))
            ex.add(deconv['profile'],
                   description=set_file_descr(name[1]+'_deconv.sql', type='sql',
                                              step='deconvolution', groupId=name[0]))
            bigwig = unique_filename_in()
            try:
                convert(deconv['profile'], (bigwig, "bigWig"))
                ex.add(bigwig,
                       description=set_file_descr(name[1]+'_deconv.bw', type='bigWig',
                                                  ucsc='1', step='deconvolution',
                                                  groupId=name[0]))
            except OSError as e:
                logfile.write(str(e)); logfile.flush()
            ex.add(deconv['pdf'],
                   description=set_file_descr(name[1]+'_deconv.pdf', type='pdf',
                                              step='deconvolution', groupId=name[0]))
            processed['deconv'][name] = deconv

    ##############################
    def _join_macs( stream, xlsl, _f ):
        def _macs_row(_s):
            for _p in _s:
                for _n in _p[3].split("|"):
                    if len(xlsl) == 1:
                        nb = int(_n.split(";")[0][13:]) if _n[:3] == "ID=" else int(_n[10:])
                        yield _p+xlsl[0][nb-1][1:]
                    else:
                        nb = _n.split(";")[0][13:] if _n[:3] == "ID=" else _n[10:]
                        nb = nb.split(":")
                        yield _p+xlsl[int(nb[1])][int(nb[0])-1][1:]
        return FeatureStream( _macs_row(stream), fields=_f )
    ##############################
    peakfile_list = []
    for name, plist in peak_list.iteritems():
        ptrack = track(plist, chrmeta=chrlist, fields=["chr","start","end","name","score"])
        peakfile = unique_filename_in()
        xlsh, xlsl = parse_MACS_xls([processed['macs'][(name,_c)]+"_peaks.xls"
                                     for _c in names['controls']])
        try:
            ###### if assembly doesn't have annotations, we skip the "getNearestFeature"
            ###### but still go through "_join_macs"
            assembly.gene_track()
            _fields = ['chr','start','end','name','score','gene','location_type','distance']\
                      +["MACS_%s"%h for h in xlsh[1:5]]+xlsh[5:]
            peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields)
            peakout.make_header("#"+"\t".join(['chromosome','start','end','info','peak_height',
                                               'gene(s)','location_type','distance']+_fields[8:]))
            for chrom in assembly.chrnames:
                _feat = assembly.gene_track(chrom)
                peakout.write(_join_macs(getNearestFeature(ptrack.read(selection=chrom), _feat),
                                         xlsl, _fields), mode='append')
        except ValueError:
            _fields = ['chr','start','end','name','score']\
                      +["MACS_%s"%h for h in xlsh[1:5]]+xlsh[5:]
            peakout = track(peakfile, format='txt', chrmeta=chrlist, fields=_fields)
            peakout.make_header("#"+"\t".join(['chromosome','start','end','info',
                                               'peak_height']+_fields[5:]))
            for chrom in assembly.chrnames:
                peakout.write(_join_macs(ptrack.read(selection=chrom), xlsl, _fields),
                              mode='append')
        peakout.close()
        gzipfile(ex, peakfile)
        peakfile_list.append(track(peakfile+".gz", format='txt', fields=_fields))
        ex.add(peakfile+".gz",
               description=set_file_descr(name[1]+'_annotated_peaks.txt.gz', type='text',
                                          step='annotation', groupId=name[0]))
    stracks = [track(wig, info={'name': name+"_"+st})
               for name, wigdict in merged_wig.iteritems()
               for st, wig in wigdict.iteritems()]
    tablefile = unique_filename_in()
    with open(tablefile, "w") as _tf:
        _pnames = ["MACS_%s_vs_%s" % (_s[1],_c[1]) if _c[1] else "MACS_%s" % _s[1]
                   for _s in names['tests'] for _c in names['controls']]
        _tf.write("\t".join(['#chromosome','start','end']+_pnames+[s.name for s in stracks])+"\n")
    #### need to do something about peak origin (split names, write to separate columns?)
    for chrom in assembly.chrnames:
        pk_lst = [apply(pt.read(chrom, fields=['chr','start','end','name']),
                        'name', lambda __n, _n=npt: "%s:%i" % (__n, _n))
                  for npt, pt in enumerate(peakfile_list)]
        features = fusion(concatenate(pk_lst, fields=['chr','start','end','name'],
                                      remove_duplicates=True, group_by=['chr','start','end']))
        sread = [sig.read(chrom) for sig in stracks]
        quantifs = score_by_feature(sread, features, method='sum')
        nidx = quantifs.fields.index('name')
        _ns = len(tests)
        _nc = len(controls)
        with open(tablefile, "a") as _tf:
            for row in quantifs:
                pcols = ['']*_ns*_nc
                _rnsplit = row[nidx].split(":")
                _n1 = _rnsplit[0]
                _k = 0
                while _k < len(_rnsplit)-1-int(_nc > 1):
                    if _nc > 1:
                        _k += 2
                        _n2 = _rnsplit[_k-1]
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0])*_nc+int(_n2)] = _n1
                    else:
                        _k += 1
                        _n = _rnsplit[_k].split("|")
                        pcols[int(_n[0])] = _n1
                    _n1 = "|".join(_n[1:])
                _tf.write("\t".join(str(tt) for tt in row[:nidx]+tuple(pcols)+row[nidx+1:])+"\n")
    gzipfile(ex, tablefile)
    ex.add(tablefile+".gz",
           description=set_file_descr('Combined_peak_quantifications.txt.gz', type='text',
                                      step='summary'))

    if run_meme:
        from bbcflib.motif import parallel_meme
        logfile.write("Starting MEME.\n"); logfile.flush()
        processed['meme'] = parallel_meme( ex, assembly,
                                           peak_list.values(), name=peak_list.keys(),
                                           chip=True,
                                           meme_args=['-meme-nmotifs','4','-meme-mod','zoops'],
                                           via=via )
    return processed
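# The quantification table above spreads each merged region's peak origins over
# one column per test/control pair, decoding 'name' fields of the form
# "peak:index|peak:index|..." produced by the apply()/fusion() steps. A
# self-contained sketch of the single-control decoding done by the while-loop
# (the helper name is illustrative, not part of the module):
def _split_peak_origins(name, nfiles):
    """Spread 'pkA:0|pkB:1' over nfiles columns -> ['pkA', 'pkB']."""
    cols = ['']*nfiles
    parts = name.split(":")
    label = parts[0]
    for part in parts[1:]:
        sub = part.split("|")
        cols[int(sub[0])] = label  # place the previous label in its file's column
        label = "|".join(sub[1:])  # the remainder is the next peak's label
    return cols

# e.g. _split_peak_origins('pkA:0|pkB:1', 2) -> ['pkA', 'pkB']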