Exemplo n.º 1
0
def get_region_size_metrics(peak_file, out_dir='.'):
    '''
    Summarize the region-size distribution of a gzipped, tab-delimited peak
    file.

    Region size is the third column minus the second column of each row
    (presumably end - start of a BED-like record). Two files are written,
    both prefixed with out_dir joined to the basename of
    strip_ext_peak(peak_file):
      <prefix>.peak_region_size.qc  - tab-separated min / quartiles / max /
                                      mean of the region sizes
      <prefix>.peak_region_size.png - line plot of a 100-bin histogram of
                                      the region sizes

    Returns the tuple (log, plot) holding the two output paths.
    '''
    import pandas as pd
    import numpy as np
    import matplotlib as mpl
    # Select the non-interactive backend so plotting works without a display.
    mpl.use('Agg')
    from matplotlib import pyplot as plt
    from collections import OrderedDict

    basename = os.path.basename(strip_ext_peak(peak_file))
    prefix = os.path.join(out_dir, basename)
    log = '{}.peak_region_size.qc'.format(prefix)
    plot = '{}.peak_region_size.png'.format(prefix)

    # Load peak file. There is no error handling here: any read failure
    # propagates to the caller.
    peak_df = pd.read_table(peak_file, compression='gzip', header=None)

    # Subtract second column from third to get the size of each region
    region_sizes = peak_df.iloc[:, 2] - peak_df.iloc[:, 1]

    # Summarize and store in ordered dict (order defines the log layout)
    peak_summary_stats = region_sizes.describe()

    peak_size_summ = OrderedDict([
        ('Min size', peak_summary_stats['min']),
        ('25 percentile', peak_summary_stats['25%']),
        ('50 percentile (median)', peak_summary_stats['50%']),
        ('75 percentile', peak_summary_stats['75%']),
        ('Max size', peak_summary_stats['max']),
        ('Mean', peak_summary_stats['mean']),
    ])

    # Histogram counts, plotted against the centre of each bin
    y, bin_edges = np.histogram(region_sizes, bins=100)
    bincenters = 0.5 * (bin_edges[1:] + bin_edges[:-1])

    # write to log file
    with open(log, 'w') as fp:
        for key, val in peak_size_summ.items():
            fp.write(key + '\t' + str(val) + '\n')

    fig = plt.figure()
    try:
        ax = fig.add_subplot(111)
        # Draw on this figure's own axes rather than pyplot's implicit
        # "current axes", so the result does not depend on global state.
        ax.plot(bincenters, y, '-')
        filename = os.path.basename(peak_file)
        ax.set_title('Peak width distribution for {0}'.format(filename))

        # write to plot file
        fig.savefig(plot, format='png')
    finally:
        # pyplot keeps every figure alive until it is explicitly closed;
        # release it so repeated calls do not accumulate open figures.
        plt.close(fig)

    return log, plot
Exemplo n.º 2
0
def get_num_peaks(peak_file, out_dir='.'):
    '''
    Write the line count of the peak file to a QC log.

    The log is placed at out_dir joined to the basename of
    strip_ext_peak(peak_file), with the suffix '.num_peak.qc', and contains
    the value of get_num_lines(peak_file) followed by a newline.

    Returns the path of the log file.
    '''
    stem = os.path.basename(strip_ext_peak(peak_file))
    log = os.path.join(out_dir, stem) + '.num_peak.qc'

    with open(log, 'w') as qc_file:
        # Count inside the block, after the log has been opened.
        qc_file.write('{}\n'.format(str(get_num_lines(peak_file))))

    return log
Exemplo n.º 3
0
def _hammock_record(fields, peak_type, peak_id):
    '''
    Build one hammock line (without trailing newline) from the tab-split
    columns of a single peak row.

    fields is the list of column strings, peak_type selects the layout
    (narrowPeak / regionPeak / gappedPeak / broadPeak) and peak_id is the
    running id written into the record. Raises Exception for any other
    peak_type.
    '''
    is_narrow = peak_type in ('narrowPeak', 'regionPeak')

    if is_narrow:
        parts = [
            '{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[6]},{0[7]},'
            '{0[8]}],id:{1},'.format(fields, peak_id)]
    elif peak_type == 'gappedPeak':
        parts = [
            '{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[12]},{0[13]},'
            '{0[14]}],id:{1},struct:{{thin:[[{0[1]},{0[2]}]],'
            'thick:['.format(fields, peak_id)]
        # Columns 11 and 12 hold comma-separated block sizes and block
        # start offsets; offsets are relative to the region start (col 2).
        region_start = int(fields[1])
        sizes = fields[10].split(',')
        starts = fields[11].split(',')
        for i, size in enumerate(sizes):
            block_start = region_start + int(starts[i])
            parts.append('[{0},{1}],'.format(
                block_start, block_start + int(size)))
        parts.append(']},')
    elif peak_type == 'broadPeak':
        parts = [
            '{0[0]}\t{0[1]}\t{0[2]}\tscorelst:[{0[6]},{0[7]}],'
            'id:{1},'.format(fields, peak_id)]
    else:
        raise Exception("Unsupported peak_type {}".format(peak_type))

    # Optional attributes shared by every layout.
    if len(fields[3]) > 1:
        parts.append('name:"' + fields[3] + '",')
    if fields[5] != '.':
        parts.append('strand:"' + fields[5] + '",')
    # Column 10 is only emitted for narrowPeak / regionPeak; '-1' means
    # there is no value to write.
    if is_narrow and fields[9] != '-1':
        parts.append('sbstroke:[' + fields[9] + ']')

    return ''.join(parts)


def peak_to_hammock(peak, out_dir):
    '''
    Convert a peak file into a bgzip-compressed, tabix-indexed hammock file.

    Outputs are written at out_dir joined to the basename of
    strip_ext_peak(peak), suffixed with '.<peak_type>.hammock.gz' and
    '.<peak_type>.hammock.gz.tbi', where peak_type is get_peak_type(peak).

    If the peak file has no lines, it is simply re-compressed with gzip and
    an empty index file is created with touch. Otherwise the peaks are
    sorted, each row is rewritten as a hammock record with a running id
    starting at 1, and the result is sorted again, compressed with bgzip and
    indexed with tabix. Intermediate files are removed afterwards, even if
    the conversion fails part-way.

    Returns the tuple (hammock_gz, hammock_gz_tbi).
    Raises Exception if a non-empty peak file has an unsupported peak type.
    '''
    peak_type = get_peak_type(peak)
    prefix = os.path.join(out_dir, os.path.basename(strip_ext_peak(peak)))
    hammock = '{}.{}.hammock'.format(prefix, peak_type)
    hammock_tmp = '{}.tmp'.format(hammock)
    hammock_tmp2 = '{}.tmp2'.format(hammock)
    hammock_gz = '{}.gz'.format(hammock)
    hammock_gz_tbi = '{}.gz.tbi'.format(hammock)

    if get_num_lines(peak) == 0:
        cmd = 'zcat -f {} | gzip -nc > {}'.format(peak, hammock_gz)
        run_shell_cmd(cmd)
        # Create the placeholder index so the returned path actually exists.
        cmd2 = 'touch {}'.format(hammock_gz_tbi)
        run_shell_cmd(cmd2)
        return (hammock_gz, hammock_gz_tbi)

    try:
        cmd = "zcat -f {} | "
        cmd += "LC_COLLATE=C sort -k1,1V -k2,2n > {}"
        cmd = cmd.format(peak, hammock_tmp)
        run_shell_cmd(cmd)

        with open(hammock_tmp, 'r') as fin, open(hammock_tmp2, 'w') as fout:
            for peak_id, line in enumerate(fin, 1):
                fields = line.rstrip().split('\t')
                fout.write(_hammock_record(fields, peak_type, peak_id))
                fout.write('\n')

        cmd2 = 'zcat -f {} | sort -k1,1 -k2,2n | bgzip -cf > {}'
        cmd2 = cmd2.format(hammock_tmp2, hammock_gz)
        run_shell_cmd(cmd2)
        cmd3 = 'tabix -f -p bed {}'.format(hammock_gz)
        run_shell_cmd(cmd3)
    finally:
        # Do not leave intermediates behind when a step above raises.
        rm_f([hammock, hammock_tmp, hammock_tmp2])

    return (hammock_gz, hammock_gz_tbi)