def get_region_size_metrics(peak_file, out_dir='.'):
    '''
    From the peak file, return a plot of the region size distribution and
    the quartile metrics (summary from R)

    Args:
        peak_file: path to a gzip-compressed, tab-separated peak file
            without a header row. The region size of each row is taken
            as column 3 minus column 2 (1-based).
        out_dir: directory that receives the output files.

    Returns:
        (log, plot): path of the tab-separated summary file
        ('<prefix>.peak_region_size.qc') and path of the PNG histogram
        ('<prefix>.peak_region_size.png').

    Note:
        Nothing is caught here: if pandas cannot read the peak file
        (e.g. it is empty) the exception propagates to the caller.
    '''
    import pandas as pd
    import numpy as np
    import matplotlib as mpl
    mpl.use('Agg')
    from matplotlib import pyplot as plt
    from collections import OrderedDict

    basename = os.path.basename(strip_ext_peak(peak_file))
    prefix = os.path.join(out_dir, basename)
    log = '{}.peak_region_size.qc'.format(prefix)
    plot = '{}.peak_region_size.png'.format(prefix)

    # Load peak file
    peak_df = pd.read_table(peak_file, compression='gzip', header=None)

    # Region size = third column (end) minus second column (start)
    region_sizes = peak_df.iloc[:, 2] - peak_df.iloc[:, 1]

    # Summarize and store in ordered dict
    peak_summary_stats = region_sizes.describe()
    peak_size_summ = OrderedDict([
        ('Min size', peak_summary_stats['min']),
        ('25 percentile', peak_summary_stats['25%']),
        ('50 percentile (median)', peak_summary_stats['50%']),
        ('75 percentile', peak_summary_stats['75%']),
        ('Max size', peak_summary_stats['max']),
        ('Mean', peak_summary_stats['mean']),
    ])

    fig = plt.figure()
    try:
        ax = fig.add_subplot(111)

        # Histogram drawn as a line through the centers of 100 bins
        y, bin_edges = np.histogram(region_sizes, bins=100)
        bin_centers = 0.5 * (bin_edges[1:] + bin_edges[:-1])

        # write to log file
        with open(log, 'w') as fp:
            for key, val in peak_size_summ.items():
                fp.write(key + '\t' + str(val) + '\n')

        # Draw on the axes we own rather than on pyplot's implicit
        # "current" axes.
        ax.plot(bin_centers, y, '-')
        filename = os.path.basename(peak_file)
        ax.set_title('Peak width distribution for {0}'.format(filename))

        # write to plot file
        fig.savefig(plot, format='png')
    finally:
        # pyplot keeps every figure alive until it is closed explicitly;
        # without this each call leaks one figure.
        plt.close(fig)

    return log, plot
def get_num_peaks(peak_file, out_dir='.'):
    '''
    Write the number of lines in the peak file to a QC log.

    The log is '<out_dir>/<peak basename without peak extension>.num_peak.qc'
    and holds the value returned by get_num_lines() followed by a newline.

    Returns:
        Path of the log file that was written.
    '''
    stem = os.path.basename(strip_ext_peak(peak_file))
    log = '{}.num_peak.qc'.format(os.path.join(out_dir, stem))

    with open(log, 'w') as qc_out:
        line_count = get_num_lines(peak_file)
        qc_out.write('{!s}\n'.format(line_count))

    return log
def _peak_row_to_hammock(fields, peak_type, record_id):
    '''
    Format one tab-split peak row as a hammock line (no trailing newline).

    Args:
        fields: list of column strings from one peak row.
        peak_type: 'narrowPeak', 'regionPeak', 'gappedPeak' or 'broadPeak'.
        record_id: integer written as the record's 'id:' attribute.

    Raises:
        Exception: if peak_type is not one of the supported types.
    '''
    if peak_type in ('narrowPeak', 'regionPeak'):
        parts = [
            '{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[6]},{0[7]},'
            '{0[8]}],id:{1},'.format(fields, record_id)]
        if len(fields[3]) > 1:
            parts.append('name:"' + fields[3] + '",')
        if fields[5] != '.':
            parts.append('strand:"' + fields[5] + '",')
        # Column 10 equal to '-1' is skipped (presumably "no summit
        # called" -- confirm against the narrowPeak spec).
        if fields[9] != '-1':
            parts.append('sbstroke:[' + fields[9] + ']')

    elif peak_type == 'gappedPeak':
        parts = [
            '{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[12]},{0[13]},'
            '{0[14]}],id:{1},struct:{{thin:[[{0[1]},{0[2]}]],'
            'thick:['.format(fields, record_id)]
        region_start = int(fields[1])
        sizes = fields[10].split(',')
        starts = fields[11].split(',')
        # Block starts are offsets from the region start; each block is
        # emitted as absolute [start,end] followed by a comma.
        for i, size in enumerate(sizes):
            block_start = region_start + int(starts[i])
            parts.append(
                '[{0},{1}],'.format(block_start, block_start + int(size)))
        parts.append(']},')
        if len(fields[3]) > 1:
            parts.append('name:"' + fields[3] + '",')
        if fields[5] != '.':
            parts.append('strand:"' + fields[5] + '",')

    elif peak_type == 'broadPeak':
        parts = [
            '{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[6]},{0[7]}],'
            'id:{1},'.format(fields, record_id)]
        if len(fields[3]) > 1:
            parts.append('name:"' + fields[3] + '",')
        if fields[5] != '.':
            parts.append('strand:"' + fields[5] + '",')

    else:
        raise Exception(
            "Unsupported peak_type {} for {}".format(peak_type, fields))

    return ''.join(parts)


def peak_to_hammock(peak, out_dir):
    '''
    Convert a peak file into a bgzipped, tabix-indexed hammock file.

    Args:
        peak: path to the peak file (read through 'zcat -f', so it may be
            gzipped or plain text).
        out_dir: directory that receives the output files.

    Returns:
        (hammock_gz, hammock_gz_tbi): paths of the compressed hammock
        file and of its index.

    For a peak file with zero lines, the input is simply re-gzipped to
    hammock_gz and an empty placeholder index file is created with
    'touch'. Otherwise rows are sorted, converted line by line, then
    compressed with bgzip and indexed with tabix.

    Raises:
        Exception: if a non-empty file has an unsupported peak type.
    '''
    peak_type = get_peak_type(peak)
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_peak(peak)))
    hammock = '{}.{}.hammock'.format(prefix, peak_type)
    hammock_tmp = '{}.tmp'.format(hammock)
    hammock_tmp2 = '{}.tmp2'.format(hammock)
    hammock_gz = '{}.gz'.format(hammock)
    hammock_gz_tbi = '{}.gz.tbi'.format(hammock)

    if get_num_lines(peak) == 0:
        cmd = 'zcat -f {} | gzip -nc > {}'.format(peak, hammock_gz)
        run_shell_cmd(cmd)
        # The touch command used to be built but never executed, so the
        # returned index path did not exist on disk.
        cmd2 = 'touch {}'.format(hammock_gz_tbi)
        run_shell_cmd(cmd2)
    else:
        # Sort before conversion so ids follow the sorted row order.
        cmd = "zcat -f {} | "
        cmd += "LC_COLLATE=C sort -k1,1V -k2,2n > {}"
        cmd = cmd.format(peak, hammock_tmp)
        run_shell_cmd(cmd)

        with open(hammock_tmp, 'r') as fin, open(hammock_tmp2, 'w') as fout:
            # Record ids are 1-based.
            for record_id, line in enumerate(fin, start=1):
                fields = line.rstrip().split('\t')
                try:
                    record = _peak_row_to_hammock(
                        fields, peak_type, record_id)
                except Exception as err:
                    # Re-raise unsupported-type errors naming the file
                    # rather than the row; let other errors propagate.
                    if str(err).startswith('Unsupported peak_type'):
                        raise Exception(
                            "Unsupported peak_type {} for {}".format(
                                peak_type, peak))
                    raise
                fout.write(record)
                fout.write('\n')

        cmd2 = 'zcat -f {} | sort -k1,1 -k2,2n | bgzip -cf > {}'
        cmd2 = cmd2.format(hammock_tmp2, hammock_gz)
        run_shell_cmd(cmd2)
        cmd3 = 'tabix -f -p bed {}'.format(hammock_gz)
        run_shell_cmd(cmd3)

        # 'hammock' itself is never written by this function; it is kept
        # in the cleanup list as in the original code.
        rm_f([hammock, hammock_tmp, hammock_tmp2])

    return (hammock_gz, hammock_gz_tbi)