def _read_peaks_sorted_by_score(in_peaks):
    """Return the tab-split rows of in_peaks, highest score first.

    The score is read from the fifth tab-separated column of each line.
    A ValueError from a non-numeric score is logged together with the
    offending line and then re-raised unchanged.
    """
    rows = []
    with open(in_peaks) as infile:
        for line in infile:
            fields = line.strip().split('\t')
            try:
                float(fields[4])
            except ValueError:
                with log_mtx:
                    log.error('non-numeric score in peak file %s: %r',
                              in_peaks, line)
                raise
            rows.append(fields)
    # the sort is stable, so equally-scored peaks keep their file order
    rows.sort(key=lambda fields: float(fields[4]), reverse=True)
    return rows


def _load_motifs(m_file):
    """Return a {name: pwm} dict read from m_file.

    The file is first tried as a pickle; if that fails for any reason it is
    re-read as text with one tab-separated "name<TAB>consensus" pair per
    line, each consensus being converted with
    sequence_motif.makePWMFromIUPAC.
    """
    motifs = {}
    with open(m_file) as infile:
        try:
            # NOTE(review): unpickling can execute arbitrary code -- only
            # feed this motif files produced by a trusted pipeline step.
            motifs.update(pickle.load(infile))
        except Exception:
            # not a pickle: fall back to the plain-text motif format
            infile.seek(0)
            for line in infile:
                name, consensus = line.strip('\n').split('\t')
                motifs[name] = sequence_motif.makePWMFromIUPAC(consensus)
    return motifs


# NOTE(review): a second definition of motif_presence_sorted_peaks appears
# later in this file; presumably that later one is what callers end up with.
def motif_presence_sorted_peaks(in_files, out_patterns, in_prefix, in_suffix):
    """Plot the running motif presence, starting at most significant peaks

    in_files    -- sequence whose first item is the peaks file and whose
                   remaining items are motif files (see _load_motifs)
    out_patterns -- not used by this function
    in_prefix, in_suffix -- concatenated to form the base of every output
                   file name

    For every motif file and every z-score listed in the config option
    [motifs] motif_zscores, the following are written, where <z> is
    'z' + the z-score string:
        <base>.<z>.peak_motif_presence       running fraction per motif
        <base>.<z>.peak_motif_locations      one line per peak with hits
        <base>.<z>.peak_motif_locations.bed  one line per motif hit
        <base>.<z>.peak_motif_presence.png   plot over all peaks
        <base>.<z>.top10percent.peak_motif_presence.png  first tenth only
    Output names do not depend on the motif file, so with several motif
    files the later ones overwrite the earlier ones.

    Raises ValueError if a peak score cannot be parsed as a float.
    """
    in_peaks, in_motifs = in_files[0], in_files[1:]
    out_base = in_prefix + in_suffix
    out_summary = out_base + '.%s.peak_motif_presence'
    out_png = out_base + '.%s.peak_motif_presence.png'
    out_locations = out_base + '.%s.peak_motif_locations'
    out_locations_bed = out_base + '.%s.peak_motif_locations.bed'
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))

    # read in the peaks file, sorting it by *score*
    peaks = _read_peaks_sorted_by_score(in_peaks)

    old_size = matplotlib.rcParams['font.size']
    matplotlib.rcParams['font.size'] = 6
    try:
        for m_file in in_motifs:
            m_file_short = re.sub(
                r'((treat|fastq|fastq_illumina|min_qual|bowtie|' +
                r'maq|peaks|with_mean_sd|discovered|' +
                r'motifs_meme_out|motifs|matched_size_[0-9]|sorted|[0-9]+_around|small_sample)\.)+(motifs\.*)*',
                '', m_file)
            cur_motifs = _load_motifs(m_file)
            motif_names = sorted(cur_motifs)

            for zscore in cfg.get('motifs',
                                  'motif_zscores').strip().split(','):
                z_tag = 'z' + zscore
                # Start from empty hit lists for every threshold so that one
                # z-score's location files never contain another's hits.
                motifs_in_peaks = dict((tuple(p), defaultdict(list))
                                       for p in peaks)
                all_motif_percent = {}

                for name, pwm in cur_motifs.items():
                    with_motif = 0
                    percent_with = []  # percent with motif at each peak
                    for total, p in enumerate(peaks):
                        chrom, start, stop = p[0], int(p[1]), int(p[2])
                        chrom_seq = wb_genome[chrom]
                        region = chrom_seq[start:stop]
                        # extend peaks to at least pwm length
                        while len(region) < len(pwm):
                            region = chrom_seq[region.start - 5:
                                               region.stop + 5]
                            # catch nasty infinite loops for very short
                            # scaffolds
                            if len(region) == len(chrom_seq):
                                break
                        # check if the motif occurs in the region
                        try:
                            hits = list(pwm.find_in_region(
                                region, zscore=float(zscore)))
                        except Exception as e:
                            # best effort: a failing region counts as
                            # "no hit" and the scan carries on
                            with log_mtx:
                                log.debug('issue with sequence %s %s %s',
                                          repr(region), name, repr(e))
                            hits = []
                        if hits:
                            with_motif += 1
                            # add all peak locations to the list
                            # NOTE(review): hits are shifted by the original
                            # peak start even when the region was widened
                            # above -- confirm find_in_region's coordinates.
                            motifs_in_peaks[tuple(p)][name].extend(
                                (h[0] + start, h[1] + start,
                                 '+' if h[2] == 1 else '-')
                                for h in hits)
                        percent_with.append(float(with_motif) / (total + 1))
                    all_motif_percent[name] = percent_with

                # having calculated for all motifs in this file,
                # plot a figure and give a summary
                with open(out_summary % z_tag, 'w') as outfile:
                    outfile.writelines(
                        '%s\t%s\n' % (name, percent)
                        for name, percent in all_motif_percent.items())

                # write the peak locations along with the motif instances
                # that occur in them
                with open(out_locations % z_tag, 'w') as outfile:
                    with open(out_locations_bed % z_tag, 'w') as out_bed:
                        # header is 6 columns of peak info, then motif info
                        outfile.write('\t'.join(['p_chrom', 'p_start',
                                                 'p_stop', 'p_name',
                                                 'p_score', 'p_strand']))
                        for motif_name in motif_names:
                            outfile.write('\t%s\t#instances_%s' % (
                                motif_name, motif_name))
                        outfile.write('\n')

                        # write one line per peak, then the motif counts and
                        # instances in the peak
                        # instances for each motif are all in one column
                        for p in peaks:
                            outfile.write('\t'.join(map(str, p)))
                            for motif_name in motif_names:
                                hits = motifs_in_peaks[tuple(p)][motif_name]
                                outfile.write('\t%s\t%s' % (len(hits), hits))
                                for h in hits:
                                    out_bed.write('\t'.join(map(str, [
                                        p[0], h[0], h[1], motif_name,
                                        1000, h[2]])) + '\n')
                            outfile.write('\n')

                # one column per motif (sorted by name), one row per peak
                by_name = sorted(all_motif_percent.items())
                names = [k for k, v in by_name]
                datapoints = numpy.array([v for k, v in by_name]).T

                # plot original data
                pyplot.plot(datapoints)
                pyplot.legend(names)
                pyplot.title('Motifs from\n%s\nPresence in\n%s' % (
                    m_file_short, in_peaks))
                pyplot.savefig(out_png % z_tag)
                pyplot.close()

                # plot top 10% of data; floor division keeps the slice
                # bound an integer
                plot_top = len(datapoints) // 10
                pyplot.plot(datapoints[:plot_top, :])
                pyplot.legend(names)
                pyplot.title('Top 10%% of Motifs from\n%s\nPresence in\n%s' % (
                    m_file_short, in_peaks))
                pyplot.savefig(out_png % (z_tag + '.top10percent'))
                pyplot.close()
    finally:
        # restore the global font size even if a step above failed
        matplotlib.rcParams['font.size'] = old_size
def uniquefy_downsample_reads(in_files, out_files):
    """Uniquefy sequence reads then downsample so the total unique tag count
    in treatment and control is the same.  This may generate many downsampled
    datasets.

    in_files  -- pair (treatment reads file, control reads file); both names
                 must end in 'mapped_reads'
    out_files -- not used; output names are derived from the input names by
                 replacing the trailing 'mapped_reads' with
                 'matched_size_<i>.mapped_reads'

    Nothing is written when the config option [peaks] downsample_reads is
    false or when both uniquefied files already have the same number of
    lines.  Otherwise, for each i below [peaks] num_down_samples, the smaller
    uniquefied file is copied as-is and the larger one is randomly subsampled
    to the same number of lines.

    Raises RuntimeError if an input name does not end in 'mapped_reads'.
    """
    # WARNING: this is a circular dependency.  It has to be included at runtime
    #    Top-level import will cause this module to load only 1/2 way
    #    we import here because we need to call this function directly,
    #    and not just when using ruffus
    from hts_waterworks.visualize import bed_uniquefy

    if not cfg.getboolean('peaks', 'downsample_reads'):
        with log_mtx:
            log.debug('NOT downsampling the sequence reads!')
        return

    in_treat, in_control = in_files
    out_treat_template = re.sub(r'mapped_reads$',
                                'matched_size_%s.mapped_reads', in_treat)
    out_control_template = re.sub(r'mapped_reads$',
                                  'matched_size_%s.mapped_reads', in_control)
    if out_treat_template == in_treat:
        raise RuntimeError('regex substitution failed from %s to %s' % (
            in_treat, out_treat_template))
    if out_control_template == in_control:
        raise RuntimeError('regex substitution failed from %s to %s' % (
            in_control, out_control_template))

    tmp_files = []
    try:
        # Reserve four scratch files.  The handles are closed right away;
        # only the names are needed, and delete=False keeps the files around.
        for _ in range(4):
            with tempfile.NamedTemporaryFile(delete=False) as handle:
                tmp_files.append(handle.name)
        tmp_t_sorted, tmp_c_sorted, tmp_t_unique, tmp_c_unique = tmp_files

        # sort the reads
        bed_clip_and_sort(in_treat, tmp_t_sorted)
        bed_clip_and_sort(in_control, tmp_c_sorted)

        # uniquefy the reads
        bed_uniquefy(tmp_t_sorted, tmp_t_unique,
                     cfg.getint('visualization', 'uniquefy_track_max_reads'))
        bed_uniquefy(tmp_c_sorted, tmp_c_unique,
                     cfg.getint('visualization', 'uniquefy_track_max_reads'))

        with open(tmp_t_unique) as infile:
            total_treat = sum(1 for _ in infile)
        with open(tmp_c_unique) as infile:
            total_control = sum(1 for _ in infile)

        if total_treat == total_control:
            with log_mtx:
                log.debug('No downsampling required-- tag counts identical')
            return

        # downsample num_down_sample times
        for i in xrange(cfg.getint('peaks', 'num_down_samples')):
            out_treat = out_treat_template % i
            out_control = out_control_template % i
            if total_treat > total_control:
                # reduce number of treatment reads
                inds_to_keep = set(random.sample(xrange(total_treat),
                                                 total_control))
                in_orig, out_orig = tmp_c_unique, out_control
                in_subset, out_subset = tmp_t_unique, out_treat
            else:
                # reduce number of control reads
                inds_to_keep = set(random.sample(xrange(total_control),
                                                 total_treat))
                in_orig, out_orig = tmp_t_unique, out_treat
                in_subset, out_subset = tmp_c_unique, out_control
            # the smaller dataset is kept whole
            sys_call('cp %s %s' % (in_orig, out_orig))
            # subset the tags
            with open(in_subset) as infile:
                with open(out_subset, 'w') as outfile:
                    outfile.writelines(
                        line for line_no, line in enumerate(infile)
                        if line_no in inds_to_keep)
    finally:
        # remove the scratch files whether or not the steps above succeeded
        for tmp_path in tmp_files:
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)
def _read_peaks_sorted_by_score(in_peaks):
    """Return the tab-split rows of in_peaks, highest score first.

    The score is read from the fifth tab-separated column of each line.
    A ValueError from a non-numeric score is logged together with the
    offending line and then re-raised unchanged.
    """
    rows = []
    with open(in_peaks) as infile:
        for line in infile:
            fields = line.strip().split('\t')
            try:
                float(fields[4])
            except ValueError:
                with log_mtx:
                    log.error('non-numeric score in peak file %s: %r',
                              in_peaks, line)
                raise
            rows.append(fields)
    # the sort is stable, so equally-scored peaks keep their file order
    rows.sort(key=lambda fields: float(fields[4]), reverse=True)
    return rows


def _load_motifs(m_file):
    """Return a {name: pwm} dict read from m_file.

    The file is first tried as a pickle; if that fails for any reason it is
    re-read as text with one tab-separated "name<TAB>consensus" pair per
    line, each consensus being converted with
    sequence_motif.makePWMFromIUPAC.
    """
    motifs = {}
    with open(m_file) as infile:
        try:
            # NOTE(review): unpickling can execute arbitrary code -- only
            # feed this motif files produced by a trusted pipeline step.
            motifs.update(pickle.load(infile))
        except Exception:
            # not a pickle: fall back to the plain-text motif format
            infile.seek(0)
            for line in infile:
                name, consensus = line.strip('\n').split('\t')
                motifs[name] = sequence_motif.makePWMFromIUPAC(consensus)
    return motifs


# NOTE(review): an earlier definition of motif_presence_sorted_peaks exists
# in this file; presumably this later one is what callers end up with.
def motif_presence_sorted_peaks(in_files, out_patterns, in_prefix, in_suffix):
    """Plot the running motif presence, starting at most significant peaks

    in_files    -- sequence whose first item is the peaks file and whose
                   remaining items are motif files (see _load_motifs)
    out_patterns -- not used by this function
    in_prefix, in_suffix -- concatenated to form the base of every output
                   file name

    For every motif file and every z-score listed in the config option
    [motifs] motif_zscores, the following are written, where <z> is
    'z' + the z-score string:
        <base>.<z>.peak_motif_presence       running fraction per motif
        <base>.<z>.peak_motif_locations      one line per peak with hits
        <base>.<z>.peak_motif_locations.bed  one line per motif hit
        <base>.<z>.peak_motif_presence.png   plot over all peaks
        <base>.<z>.top10percent.peak_motif_presence.png  first tenth only
    Output names do not depend on the motif file, so with several motif
    files the later ones overwrite the earlier ones.

    Raises ValueError if a peak score cannot be parsed as a float.
    """
    in_peaks, in_motifs = in_files[0], in_files[1:]
    out_base = in_prefix + in_suffix
    out_summary = out_base + '.%s.peak_motif_presence'
    out_png = out_base + '.%s.peak_motif_presence.png'
    out_locations = out_base + '.%s.peak_motif_locations'
    out_locations_bed = out_base + '.%s.peak_motif_locations.bed'
    wb_genome = worldbase(cfg.get('DEFAULT', 'worldbase_genome'))

    # read in the peaks file, sorting it by *score*
    peaks = _read_peaks_sorted_by_score(in_peaks)

    old_size = matplotlib.rcParams['font.size']
    matplotlib.rcParams['font.size'] = 6
    try:
        for m_file in in_motifs:
            m_file_short = re.sub(
                r'((treat|fastq|fastq_illumina|min_qual|bowtie|' +
                r'maq|peaks|with_mean_sd|discovered|' +
                r'motifs_meme_out|motifs|matched_size_[0-9]|sorted|[0-9]+_around|small_sample)\.)+(motifs\.*)*',
                '', m_file)
            cur_motifs = _load_motifs(m_file)
            motif_names = sorted(cur_motifs)

            for zscore in cfg.get('motifs',
                                  'motif_zscores').strip().split(','):
                z_tag = 'z' + zscore
                # Start from empty hit lists for every threshold so that one
                # z-score's location files never contain another's hits.
                motifs_in_peaks = dict((tuple(p), defaultdict(list))
                                       for p in peaks)
                all_motif_percent = {}

                for name, pwm in cur_motifs.items():
                    with_motif = 0
                    percent_with = []  # percent with motif at each peak
                    for total, p in enumerate(peaks):
                        chrom, start, stop = p[0], int(p[1]), int(p[2])
                        chrom_seq = wb_genome[chrom]
                        region = chrom_seq[start:stop]
                        # extend peaks to at least pwm length
                        while len(region) < len(pwm):
                            region = chrom_seq[region.start - 5:
                                               region.stop + 5]
                            # catch nasty infinite loops for very short
                            # scaffolds
                            if len(region) == len(chrom_seq):
                                break
                        # check if the motif occurs in the region
                        try:
                            hits = list(pwm.find_in_region(
                                region, zscore=float(zscore)))
                        except Exception as e:
                            # best effort: a failing region counts as
                            # "no hit" and the scan carries on
                            with log_mtx:
                                log.debug('issue with sequence %s %s %s',
                                          repr(region), name, repr(e))
                            hits = []
                        if hits:
                            with_motif += 1
                            # add all peak locations to the list
                            # NOTE(review): hits are shifted by the original
                            # peak start even when the region was widened
                            # above -- confirm find_in_region's coordinates.
                            motifs_in_peaks[tuple(p)][name].extend(
                                (h[0] + start, h[1] + start,
                                 '+' if h[2] == 1 else '-')
                                for h in hits)
                        percent_with.append(float(with_motif) / (total + 1))
                    all_motif_percent[name] = percent_with

                # having calculated for all motifs in this file,
                # plot a figure and give a summary
                with open(out_summary % z_tag, 'w') as outfile:
                    outfile.writelines(
                        '%s\t%s\n' % (name, percent)
                        for name, percent in all_motif_percent.items())

                # write the peak locations along with the motif instances
                # that occur in them
                with open(out_locations % z_tag, 'w') as outfile:
                    with open(out_locations_bed % z_tag, 'w') as out_bed:
                        # header is 6 columns of peak info, then motif info
                        outfile.write('\t'.join(['p_chrom', 'p_start',
                                                 'p_stop', 'p_name',
                                                 'p_score', 'p_strand']))
                        for motif_name in motif_names:
                            outfile.write('\t%s\t#instances_%s' % (
                                motif_name, motif_name))
                        outfile.write('\n')

                        # write one line per peak, then the motif counts and
                        # instances in the peak
                        # instances for each motif are all in one column
                        for p in peaks:
                            outfile.write('\t'.join(map(str, p)))
                            for motif_name in motif_names:
                                hits = motifs_in_peaks[tuple(p)][motif_name]
                                outfile.write('\t%s\t%s' % (len(hits), hits))
                                for h in hits:
                                    out_bed.write('\t'.join(map(str, [
                                        p[0], h[0], h[1], motif_name,
                                        1000, h[2]])) + '\n')
                            outfile.write('\n')

                # one column per motif (sorted by name), one row per peak
                by_name = sorted(all_motif_percent.items())
                names = [k for k, v in by_name]
                datapoints = numpy.array([v for k, v in by_name]).T

                # plot original data
                pyplot.plot(datapoints)
                pyplot.legend(names)
                pyplot.title('Motifs from\n%s\nPresence in\n%s' % (
                    m_file_short, in_peaks))
                pyplot.savefig(out_png % z_tag)
                pyplot.close()

                # plot top 10% of data; floor division keeps the slice
                # bound an integer
                plot_top = len(datapoints) // 10
                pyplot.plot(datapoints[:plot_top, :])
                pyplot.legend(names)
                pyplot.title('Top 10%% of Motifs from\n%s\nPresence in\n%s' % (
                    m_file_short, in_peaks))
                pyplot.savefig(out_png % (z_tag + '.top10percent'))
                pyplot.close()
    finally:
        # restore the global font size even if a step above failed
        matplotlib.rcParams['font.size'] = old_size