Example #1
def read_bedgraph(sample_name, chrom_file):
    bedgraph_file_path = get_file_path(sample_name, BEDGRAPH_DIR)

    if bedgraph_file_path is None:
        print(f"No bedgraph file for {sample_name}")
        return None

    print(f"Reading in {bedgraph_file_path}")
    # return BedGraph(chrom_file, bedgraph_file_path, ignore_missing_bp=False, chrom_wanted='chr1')
    return BedGraph(chrom_file, bedgraph_file_path, ignore_missing_bp=False)
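A minimal usage sketch for the helper above; get_file_path and BEDGRAPH_DIR come from the original module, while the sample names and chrom size file below are hypothetical placeholders.

# Hypothetical driver; sample names and the chrom size path are placeholders
chrom_file = 'hg38.chrom.sizes'
bedgraphs = {}
for name in ['sampleA', 'sampleB']:
    bg = read_bedgraph(name, chrom_file)
    if bg is not None:
        bedgraphs[name] = bg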
Example #2
    def find_loop_anchor_points(self, bedgraph: BedGraph):
        """
        Finds the exact loop anchor points.

        Finds the peak value within each anchor and uses it to weight the
        loop. Also finds loops whose start/end indexes overlap because their
        start/end anchors are close together or unusually long.

        Parameters
        ----------
        bedgraph : BedGraph
            Used to find the anchor points of each loop
        """

        log.info(f'Finding anchor points for {self.sample_name}\'s {self.name}'
                 f' from {bedgraph.name}')

        bedgraph.load_chrom_data(self.name)

        # Get index of peaks in every anchor interval
        self.start_list = bedgraph.stats(start_list=self.start_anchor_list[0],
                                         end_list=self.start_anchor_list[1],
                                         chrom_name=self.name,
                                         stat='max_index')
        self.end_list = bedgraph.stats(start_list=self.end_anchor_list[0],
                                       end_list=self.end_anchor_list[1],
                                       chrom_name=self.name,
                                       stat='max_index')

        # Get peak value for every anchor interval
        start_list_peaks = bedgraph.stats(start_list=self.start_anchor_list[0],
                                          end_list=self.start_anchor_list[1],
                                          chrom_name=self.name,
                                          stat='max')
        end_list_peaks = bedgraph.stats(start_list=self.end_anchor_list[0],
                                        end_list=self.end_anchor_list[1],
                                        chrom_name=self.name,
                                        stat='max')
        self.start_list_peaks = start_list_peaks
        self.end_list_peaks = end_list_peaks
        bedgraph.free_chrom_data(self.name)

        start_list_peaks = start_list_peaks / start_list_peaks.sum()
        end_list_peaks = end_list_peaks / end_list_peaks.sum()

        for i in range(self.numb_loops):
            # loop_start = self.start_list[i]
            # loop_end = self.end_list[i]

            # Remove anchors that have the same* peak
            # Keep indexes of loop length to avoid comparisons in interval
            # if not loop_start < loop_end:
            #     self.value_list[i] = 0
            #
            #     # Removed interval goes from
            #     # (start of start anchor, end of end anchor)
            #     self.removed_intervals[0].append(self.start_anchor_list[0][i])
            #     self.removed_intervals[1].append(self.end_anchor_list[1][i])
            #     continue

            # Weigh each loop based on its corresponding bedgraph peak
            # peak_value = max(start_list_peaks[i], end_list_peaks[i])
            peak_value = start_list_peaks[i] + end_list_peaks[i]
            self.value_list[i] *= peak_value

        self.max_loop_value = np.max(self.value_list)

        # Should be very small due to peaks being weighted earlier
        log.debug(f"Max loop weighted value: {self.max_loop_value}")
    def __init__(self,
                 chrom_size_file: str,
                 loop_file: str,
                 bedgraph: BedGraph,
                 peak_dict: Dict[str, list],
                 chroms_to_load: List[str] = None,
                 min_loop_value: int = 0):
        """
        Initializes all chromosomes and adds loops to them from the given file.

        Finds the max and mean value of each peak from the bedgraph.

        Parameters
        ----------
        chrom_size_file : str
            File containing the base pair size of each chromosome to use
        loop_file : str
            File containing loops in format:
            chrom1  start1   end1 chrom2  start2   end2 pet_count
        bedgraph : BedGraph
            The bedgraph file for this sample (from pyBedGraph)
        peak_dict : dict[str, list]
            Key: Name of chromosome (chr1, chr2, ...)
            Value: List of peaks in chromosome
            Peak format: [start, end, length]
        chroms_to_load : list, optional
            List of names of chromosomes to load (default is None)
        min_loop_value : int, optional
            Minimum loop value (PET count) to include (default is 0)
        """

        # Prints peak_dict which is too large to be meaningful
        # log.debug(locals())

        self.species_name = os.path.basename(chrom_size_file).split('.')[0]
        self.sample_name = os.path.basename(loop_file).split('.')[0]

        self.total_samples = 0

        self.peak_dict = {}

        # Find max/mean values for each peak since the peak caller is
        # sometimes inaccurate
        for chrom_name, peak_chrom in peak_dict.items():
            if not bedgraph.has_chrom(chrom_name):
                log.warning(f'{bedgraph.name} does not have {chrom_name}')
                continue

            bedgraph.load_chrom_data(chrom_name)
            start_list = [x[0] for x in peak_chrom]
            end_list = [x[1] for x in peak_chrom]
            max_list = \
                bedgraph.stats(start_list=start_list, end_list=end_list,
                               chrom_name=chrom_name, stat='max')
            mean_list = \
                bedgraph.stats(start_list=start_list, end_list=end_list,
                               chrom_name=chrom_name, stat='mean')
            for i in range(max_list.size):
                peak_chrom[i].append(max_list[i])
                peak_chrom[i].append(mean_list[i])
            bedgraph.free_chrom_data(chrom_name)

            self.peak_dict[chrom_name] = peak_dict[chrom_name]

        # Initialize all chromosomes to be loaded
        self.chrom_dict = {}
        with open(chrom_size_file) as in_file:
            for line in in_file:
                line = line.strip().split()
                chrom_name = line[0]
                if chroms_to_load and chrom_name not in chroms_to_load:
                    continue

                if chrom_name in CHROMS_TO_IGNORE:
                    continue

                if chrom_name not in peak_dict:
                    continue

                chrom_size = int(line[1])

                self.chrom_dict[chrom_name] = \
                    ChromLoopData(chrom_name, chrom_size, self.sample_name)

        with open(loop_file) as in_file:
            loop_anchor_list = []
            for line in in_file:
                line = line.strip().split()
                chrom_name = line[0]
                if chrom_name not in self.chrom_dict:
                    continue

                loop_value = int(line[6])
                if loop_value < min_loop_value:
                    continue

                # head interval
                loop_start1 = int(line[1])
                loop_end1 = int(line[2])

                # tail interval
                loop_start2 = int(line[4])
                loop_end2 = int(line[5])

                self.chrom_dict[chrom_name].add_loop(loop_start1, loop_end1,
                                                     loop_start2, loop_end2,
                                                     loop_value)

                head_interval = loop_end1 - loop_start1
                tail_interval = loop_end2 - loop_start2

                loop_anchor_list.append(head_interval)
                loop_anchor_list.append(tail_interval)

            log.debug(f'Anchor mean width: {np.mean(loop_anchor_list)}')

        # Get rid of chroms that had problems initializing
        to_remove = []
        for chrom_name in self.chrom_dict:
            if self.chrom_dict[chrom_name].finish_init(bedgraph):
                self.total_samples += \
                    np.sum(self.chrom_dict[chrom_name].value_list)
            else:
                to_remove.append(chrom_name)

        # Chromosomes with no loops or other random problems
        for chrom_name in to_remove:
            del self.chrom_dict[chrom_name]
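For orientation, a hedged sketch of how this initializer (apparently the GenomeLoopData constructor that Example #7 calls) might be driven with the formats described in its docstring; every path and coordinate below is a made-up placeholder.

# Illustrative placeholders only; peak entries follow [start, end, length]
peak_dict = {
    'chr1': [[10000, 10500, 500], [25000, 25800, 800]],
}
bedgraph = BedGraph('hg38.chrom.sizes', 'sample1.bedgraph',
                    ignore_missing_bp=False)
loop_data = GenomeLoopData('hg38.chrom.sizes', 'sample1.loops', bedgraph,
                           peak_dict, chroms_to_load=['chr1'],
                           min_loop_value=5)
print(list(loop_data.chrom_dict.keys()))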
Example #4
            output = key + "\n" + " ".join([str(x) for x in run_time_results[key]]) + '\n'
            out.write(output)

    # generate_images.create_runtime_num_test(data_name, num_test_list, run_time_results)


if len(sys.argv) != 3:
    print("Needs 2 arguments:\n"
          "arg 1 - chrom_sizes_file\n"
          "arg 2 - bigWig file")
    exit(-1)

chrom_name = 'chr1'

start_time = time.time()
bedGraph = BedGraph(sys.argv[1], sys.argv[2], chrom_name)
print("Time for loading bedGraph file: ", time.time() - start_time)

start_time = time.time()
bedGraph.load_chrom_data(chrom_name)
print(f"Time for loading {chrom_name}: ", time.time() - start_time, '\n')

bench = Benchmark(bedGraph, sys.argv[2])

data_name = Path(sys.argv[2]).stem
if not os.path.isdir('graphs'):
    os.mkdir('graphs')
if not os.path.isdir(f'graphs/{data_name}'):
    os.mkdir(f'graphs/{data_name}')

# runtime_benchmark()
interval_size_error_benchmark()
Example #5
from pyBedGraph import BedGraph
import os
import csv

FOLDER_LOC = '/media/hirow/extra/jax/data/pybedgraph'

stats = []

for folder in os.listdir(FOLDER_LOC):
    for filename in os.listdir(f'{FOLDER_LOC}/{folder}'):
        print(folder, filename)
        bedgraph = BedGraph(
            f'/media/hirow/extra/jax/data/chrom_sizes/{folder}.chrom.sizes',
            f'{FOLDER_LOC}/{folder}/{filename}', 'chr1')
        sample_name = filename.split('.')[0]
        sample = {}
        sample['name'] = sample_name
        chrom = bedgraph.chromosome_map['chr1']

        sample['total_coverage'] = chrom.total_coverage
        sample['num_samples'] = chrom.num_samples
        sample['avg_chrom_value'] = chrom.avg_chrom_value
        sample['avg_interval_value'] = chrom.avg_interval_value
        sample['avg_interval_size'] = chrom.avg_interval_size
        sample['num_intervals'] = chrom.num_intervals

        stats.append(sample)

csv_columns = list(stats[0].keys())

with open('bedgraph_stats.csv', 'w') as csv_file:
    # Write one CSV row per sample using the collected stats
    writer = csv.DictWriter(csv_file, fieldnames=csv_columns)
    writer.writeheader()
    writer.writerows(stats)
Example #6
import time
import pyBedGraph
from pyBedGraph import BedGraph

print(f'Using {pyBedGraph.__file__}')

DEBUG = False

try:
    bedGraph = BedGraph('test_files/hg38.chrom.sizes',
                        'test_files/ENCFF376VCU.bigWig', ['chr14'])
    assert False
except RuntimeError:
    print("Passed giving wrong chrom size test!")

start_time = time.time()
bedGraph = BedGraph('test_files/mm10.chrom.sizes',
                    'test_files/ENCFF376VCU.bedGraph',
                    debug=DEBUG)
print(f"Loading ENCFF376VCU.bedgraph took {time.time() - start_time}")
# Takes 170 seconds on i5-7300HQ

bedGraph.load_chrom_data('chr1')
bedGraph.load_chrom_bins('chr1', 100)

bedGraph.load_chrom_data('chr4')
bedGraph.load_chrom_bins('chr4', 100)

if DEBUG:
    total_num_intervals = 0
    avg_interval_sizes = {'chr1': 26.447609, 'chr10': 25.53135}
Example #7
def read_data(input_data_file: str,
              chrom_size_file: str,
              min_loop_value: int = 1,
              min_bedgraph_value: int = 1,
              chroms_to_load: List[str] = None,
              use_bigwig: bool = False,
              output_dir: str = 'output') -> Dict[str, GenomeLoopData]:
    """
    Reads all samples listed in input_data_file.

    The bedgraph, peak, and loop files do not have to be in separate
    directories.

    Parameters
    ----------
    input_data_file : str
        File with file paths to all necessary input files.
        Format:
        sample1_name bedgraph1_file   peak1_file   loop1_file
        sample2_name bedgraph2_file   peak2_file   loop2_file
        ...
    chrom_size_file : str
        Path to chromosome size file
    min_loop_value : int, optional
        Minimum loop value accepted by GenomeLoopData/ChromLoopData
    min_bedgraph_value : int, optional
        Minimum value accepted by BedGraph obj from pyBedGraph
    chroms_to_load : list, optional
        Specify specific chromosomes to load instead of the entire genome
    use_bigwig : bool, optional
        Specify if input_file is bigwig or not. Not implemented yet.
    output_dir : str
        Directory to output data

    Returns
    -------
    OrderedDict[str, GenomeLoopData]
    """
    total_start_time = time.time()
    os.makedirs(f'{output_dir}/timings', exist_ok=True)
    sample_data_dict = OrderedDict()

    if not os.path.isfile(chrom_size_file):
        log.error(f"Chrom size file: {chrom_size_file} is not a valid file")
        return sample_data_dict

    if not os.path.isfile(input_data_file):
        log.error(f"Data file: {input_data_file} is not a valid file")
        return sample_data_dict

    # Get input file names
    input_sample_files = []
    with open(input_data_file) as in_file:
        for line in in_file:
            sample_files = line.split()
            if len(sample_files) != 4:
                log.error(f"Invalid number of columns in {input_data_file}")
                return sample_data_dict
            input_sample_files.append(sample_files)

    sample_timings = OrderedDict()
    for sample_files in input_sample_files:
        sample_start_time = time.time()

        sample_name = sample_files[0]
        bedgraph_file = sample_files[1]
        peak_file = sample_files[2]
        loop_file = sample_files[3]

        # Check for file validity
        invalid_file = False
        for i in range(1, 4):
            if not os.path.isfile(sample_files[i]):
                log.error(f"Data file: {sample_files[i]} is not a valid file")
                invalid_file = True
                break
        if invalid_file:
            continue

        log.info(f'Loading {sample_name} ...')

        peak_dict = read_peak_file(peak_file)
        bedgraph = BedGraph(chrom_size_file,
                            bedgraph_file,
                            chroms_to_load=chroms_to_load,
                            ignore_missing_bp=False,
                            min_value=min_bedgraph_value)

        gld = GenomeLoopData(chrom_size_file,
                             loop_file,
                             bedgraph,
                             peak_dict,
                             min_loop_value=min_loop_value,
                             chroms_to_load=chroms_to_load)
        sample_data_dict[sample_name] = gld
        sample_timings[sample_name] = time.time() - sample_start_time

    with open(f'{output_dir}/timings/read_data.txt', 'w') as out_file:
        out_file.write('sample_name\ttime_taken\n')
        for sample_name, sample_timing in sample_timings.items():
            out_file.write(f'{sample_name}\t{sample_timing}\n')
        out_file.write(f'total\t{time.time() - total_start_time}\n')

    return sample_data_dict
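A hedged example of driving read_data with an input file laid out the way its docstring describes; the sample names and paths here are placeholders, not files from the original project.

# Hypothetical input_data.txt (whitespace-separated, one sample per line):
#   sampleA  sampleA.bedgraph  sampleA_peaks.txt  sampleA_loops.txt
#   sampleB  sampleB.bedgraph  sampleB_peaks.txt  sampleB_loops.txt
sample_dict = read_data('input_data.txt', 'hg38.chrom.sizes',
                        min_loop_value=5, chroms_to_load=['chr1'])
for sample_name, loop_data in sample_dict.items():
    print(sample_name, list(loop_data.chrom_dict.keys()))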
def mainfn(args):
    bedgraph = args.p2bedgraph
    expr_name = args.expr_name
    chrom_size = args.p2chrom
    p2annot_loop = args.p2loop_annot
    p2loop_tag = args.p2loop_tag
    nbins = args.nbins
    p2save_dir = args.p2save_dir
    pseudo = args.pseudo

    p2bedgraph = os.path.join(bedgraph, expr_name)
    bg = BedGraph(chrom_size, p2bedgraph)

    annot_columns = [
        'left_chr', 'left_start', 'left_end', 'right_chr', 'right_start',
        'right_end', 'PET count', 'left_max_intensity', 'right_max_intensity',
        'left_max_index', 'right_max_index', 'loop_ID', 'left_motif_chr',
        'left_motif_start', 'left_motif_end', 'left_motif_strand',
        'left_distance', 'right_motif_chr', 'right_motif_start',
        'right_motif_end', 'right_motif_strand', 'right_distance'
    ]
    df_loop = pd.read_csv(p2annot_loop, names=annot_columns, sep='\t')

    chromfile = pd.read_table(chrom_size, names=['chrom', 'size'])
    for row in chromfile.iterrows():
        chrom_name = row[1]['chrom']
        if chrom_name not in df_loop['left_chr'].values:
            continue
        bg.load_chrom_data(chrom_name)

    if pseudo == 0:
        loop_tag = pd.read_csv(p2loop_tag, sep='\t', index_col=0)
    elif pseudo == 1:
        # def drop_exceed_loop(df_loop_row, chromfile):
        #     chrom_name = df_loop_row['left_chr']
        #     chrom_len = chromfile[chromfile['chrom'] == chrom_name]['size'].values
        #     loop_end = df_loop_row['right_end']
        #     return loop_end > chrom_len
        # tmp_fn = lambda x: drop_exceed_loop(x, chromfile)
        # df_loop['exceed_chorm'] = df_loop.apply(tmp_fn, axis = 1)
        # df_loop = df_loop[df_loop['exceed_chorm'] == False]
        # df_loop = df_loop.reset_index()

        loop_tag = pd.DataFrame(index=df_loop.index,
                                columns=['bias', 'convergence', 'NULL motif'])
        loop_tag.bias = 'balance'
        loop_tag.convergence = 'convergent'
        loop_tag['NULL motif'] = 'na'

    df_binned_intensity_per_loop = pd.DataFrame(
        index=df_loop.index,
        columns=['bias', '{} binned intensity'.format(nbins)])
    df_binned_intensity_per_loop['bias'] = loop_tag['bias']

    tmp_df = df_loop.apply(lambda x: get_max_intensity_in_same_len_bins(
        bg,
        nbins,
        x.left_start,
        x.left_chr,
        x.right_end,
        x.right_chr,
        chrom_size=chromfile[chromfile['chrom'] == x.left_chr]['size']),
                           axis=1)
    df_binned_intensity_per_loop['{} binned intensity'.format(nbins)] = tmp_df

    df_binned_intensity_per_loop['convergence'] = loop_tag['convergence']
    df_binned_intensity_per_loop['NULL motif'] = loop_tag['NULL motif']
    df_binned_intensity_per_loop['chrom'] = df_loop['left_chr']

    binned_intensity_per_loop_name = 'binned_results_{}'.format(expr_name)
    if pseudo == 1:
        binned_intensity_per_loop_name = 'pseudo_' + binned_intensity_per_loop_name
    if not os.path.isdir(p2save_dir):
        os.makedirs(p2save_dir)
    p2binned_intensity_per_loop = os.path.join(p2save_dir,
                                               binned_intensity_per_loop_name)
    df_binned_intensity_per_loop.to_pickle(p2binned_intensity_per_loop)

    norm_df_binned_intensity_per_loop = df_binned_intensity_per_loop.copy()
    binned_name_list = []
    for name in df_binned_intensity_per_loop.columns:
        if 'binned intensity' in name:
            binned_name_list.append(name)
            norm_fn = lambda x: x / max(x)
            norm_df_binned_intensity_per_loop[
                name] = norm_df_binned_intensity_per_loop[name].apply(norm_fn)
    if args.norm == 0:
        df_agg_sum, df_agg_mean, df_agg_var = get_aggregated_inten_for_each_class(
            df_binned_intensity_per_loop, nbins=nbins, catag='bias')
    else:
        df_agg_sum, df_agg_mean, df_agg_var = get_aggregated_inten_for_each_class(
            norm_df_binned_intensity_per_loop, nbins=nbins, catag='bias')

    for label in df_agg_mean.columns:
        fig_name = 'norm_sum_agg_plot_{}_{}'.format(label.replace(' ', '_'),
                                                    expr_name)
        if pseudo == 1:
            fig_name = 'pseudo_' + fig_name
        p2avg_fig = os.path.join(p2save_dir, 'aggregated_plots', fig_name)
        # aggre_by_mean_var(df_agg_mean, df_agg_var, label=label,
        #                   chrom='whole genome', scilent=True,
        #                   p2f=p2avg_fig)
        aggre_by_sum(df_agg_sum,
                     label=label,
                     chrom='whole genome',
                     scilent=True,
                     p2f=p2avg_fig)
Example #9
import sys
sys.path.append("../..")
from pyBedGraph import BedGraph

# arg1 - chromosome sizes file
# arg2 - bedgraph file
# arg3 - (optional) chromosome_name
# Just load chromosome 'chr1' (uses less memory and takes less time)
bedGraph = BedGraph('myChrom.sizes', 'random_test.bedGraph', 'chr1')

# Load the whole bedGraph file
bedGraph = BedGraph('myChrom.sizes', 'random_test.bedGraph')

# Option to not ignore missing basePairs when calculating statistics
# Used the exact same way but produces slightly different results
inclusive_bedGraph = BedGraph('myChrom.sizes',
                              'random_test.bedGraph',
                              ignore_missing_bp=False)

bedGraph.load_chrom_data('chr1')
inclusive_bedGraph.load_chrom_data('chr1')
bedGraph.load_chrom_bins('chr1', 3)
inclusive_bedGraph.load_chrom_bins('chr1', 3)

import numpy as np

# Option 1
test_intervals = [['chr1', 24, 26], ['chr1', 12, 15], ['chr1', 8, 12],
                  ['chr1', 9, 10], ['chr1', 0, 5]]
values = bedGraph.stats(intervals=test_intervals)
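The same intervals can also be queried with separate coordinate arrays, mirroring the stats(start_list=..., end_list=..., chrom_name=..., stat=...) calls in Example #2; a small sketch reusing the intervals from Option 1.

# Option 2 (sketch): coordinate arrays plus an explicit chromosome name
start_list = np.array([24, 12, 8, 9, 0])
end_list = np.array([26, 15, 12, 10, 5])
max_values = bedGraph.stats(start_list=start_list, end_list=end_list,
                            chrom_name='chr1', stat='max')
print(max_values)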
def main_fn(args):
    p2loop_file = args.p2loop_file
    p2bedgraph = args.p2bedgraph
    p2save_loop_tag = args.p2save_loop_tag
    nbins = args.nbins
    p2chrom_size = args.p2chrom_size
    p2binned_intensity_per_loop = args.p2binned_intensity_per_loop
    p2agg_stats = args.p2agg_stats

    annot_col_names = [
        'left_chr', 'left_start', 'left_end', 'right_chr', 'right_start',
        'right_end', 'PET count', 'left_max_intensity', 'right_max_intensity',
        'left_max_index', 'right_max_index', 'loop_ID', 'left_motif_chr',
        'left_motif_start', 'left_motif_end', 'left_motif_strand',
        'left_distance', 'right_motif_chr', 'right_motif_start',
        'right_motif_end', 'right_motif_strand', 'right_distance'
    ]

    conv_dict = {
        '+-': 'convergence',
        '-+': 'divergence',
        '++': 'right tandem',
        '--': 'left tandem'
    }

    null_dict = {
        '.+': 'NULL-right',
        '.-': 'NULL-left',
        '-.': 'left-NULL',
        '+.': 'right-NULL',
        '..': 'NULL'
    }

    df_loop = pd.read_table(p2loop_file, names=annot_col_names)

    loop_tag = pd.DataFrame(columns=['bias', 'convergence', 'NULL motif'],
                            index=df_loop.index)
    loop_tag['bias'] = df_loop.apply(lambda x: binomial_test_fn(
        x.left_max_intensity, x.right_max_intensity),
                                     axis=1)
    loop_tag['convergence'] = df_loop.apply(lambda x: motif_convergence_fn(
        x.left_motif_strand, x.right_motif_strand, conv_dict),
                                            axis=1)
    loop_tag['NULL motif'] = df_loop.apply(lambda x: find_NULL_motif(
        x.left_motif_strand, x.right_motif_strand, null_dict),
                                           axis=1)

    # save loop tag and added label loop annotation file.
    df_loop_new = df_loop.copy()
    df_loop_new[['bias', 'convergence', 'NULL motif'
                 ]] = loop_tag[['bias', 'convergence', 'NULL motif']]

    loop_tag.to_csv(p2save_loop_tag, sep='\t')

    p2labeled_loop = p2loop_file + '_added_labels'
    df_loop_new.to_csv(p2labeled_loop, sep='\t')

    whole_genome_balance_count = (loop_tag['bias'] == 'balance').sum()
    whole_genome_left_biased_count = (loop_tag['bias'] == 'left biased').sum()
    whole_genome_right_biased_count = (
        loop_tag['bias'] == 'right biased').sum()

    # aggregate bias
    chrom_list = list(
        set(df_loop['left_chr']).union(set(df_loop['right_chr'])))
    chrom_list.sort(key=lambda x: int(x[3:]) if x != 'chrX' else 24)
    chrom_list.append('whole genome')
    df_bias_count = pd.DataFrame(columns=[
        'balance_loop_count',
        'balance_PET_count',
        'left_biased_loop_count',
        'left_biased_PET_count',
        'right_biased_loop_count',
        'right_biased_PET_count',
    ],
                                 index=chrom_list)

    for chrom in chrom_list[:-1]:
        chrom_loop_idx = (df_loop['left_chr'] == chrom)

        balance_tag_idx = (loop_tag['bias'] == 'balance')
        left_bias_tag_idx = (loop_tag['bias'] == 'left biased')
        right_bias_tag_idx = (loop_tag['bias'] == 'right biased')

        chrom_balance_idx = (balance_tag_idx & chrom_loop_idx)
        chrom_left_biased_idx = (left_bias_tag_idx & chrom_loop_idx)
        chrom_right_biased_idx = (right_bias_tag_idx & chrom_loop_idx)

        chrom_balance_count = chrom_balance_idx.sum()
        chrom_left_biased_count = chrom_left_biased_idx.sum()
        chrom_right_biased_count = chrom_right_biased_idx.sum()

        chrom_balance_PET = df_loop.loc[chrom_balance_idx]['PET count'].sum()
        chrom_left_biased_PET = df_loop.loc[chrom_left_biased_idx][
            'PET count'].sum()
        chrom_right_biased_PET = df_loop.loc[chrom_right_biased_idx][
            'PET count'].sum()

        df_bias_count.loc[chrom] = {
            'balance_loop_count': chrom_balance_count,
            'balance_PET_count': chrom_balance_PET,
            'left_biased_loop_count': chrom_left_biased_count,
            'left_biased_PET_count': chrom_left_biased_PET,
            'right_biased_loop_count': chrom_right_biased_count,
            'right_biased_PET_count': chrom_right_biased_PET
        }

    df_bias_count.loc['whole genome'] = df_bias_count.loc[chrom_list[:-1]].sum(
        axis=0)

    df_bias_count['loop_count_proportion_blr'] = df_bias_count.apply(
        lambda x: count_proportion_fn(x, 'balance_loop_count',
                                      'left_biased_loop_count',
                                      'right_biased_loop_count'),
        axis=1)
    df_bias_count['PET_count_proportion_blr'] = df_bias_count.apply(
        lambda x: count_proportion_fn(x, 'balance_PET_count',
                                      'left_biased_PET_count',
                                      'right_biased_PET_count'),
        axis=1)

    p2df_bias_count = p2agg_stats + '_bias_count.csv'
    df_bias_count.to_csv(p2df_bias_count)

    # aggregate convergence results.
    conv_column_list = [
        'convergence_loop_count', 'convergence_PET_count',
        'divergence_loop_count', 'divergence_PET_count',
        'left_tandem_loop_count', 'left_tandem_PET_count',
        'right_tandem_loop_count', 'right_tandem_PET_count'
    ]

    df_convergence_count = pd.DataFrame(columns=conv_column_list,
                                        index=chrom_list)

    for chrom in chrom_list[:-1]:
        chrom_loop_idx = (df_loop['left_chr'] == chrom)

        convergence_tag_idx = (loop_tag['convergence'] == 'convergence')
        divergence_tag_idx = (loop_tag['convergence'] == 'divergence')
        left_tendem_tag_idx = (loop_tag['convergence'] == 'left tandem')
        right_tendem_tag_idx = (loop_tag['convergence'] == 'right tandem')

        chrom_convergence_idx = (convergence_tag_idx & chrom_loop_idx)
        chrom_divergence_idx = (divergence_tag_idx & chrom_loop_idx)
        chrom_left_tendem_idx = (left_tendem_tag_idx & chrom_loop_idx)
        chrom_right_tendem_idx = (right_tendem_tag_idx & chrom_loop_idx)

        chrom_convergence_count = chrom_convergence_idx.sum()
        chrom_divergence_count = chrom_divergence_idx.sum()
        chrom_left_tendem_count = chrom_left_tendem_idx.sum()
        chrom_right_tendem_count = chrom_right_tendem_idx.sum()

        chrom_convergence_PET = df_loop.loc[chrom_convergence_idx][
            'PET count'].sum()
        chrom_divergence_PET = df_loop.loc[chrom_divergence_idx][
            'PET count'].sum()
        chrom_left_tendem_PET = df_loop.loc[chrom_left_tendem_idx][
            'PET count'].sum()
        chrom_right_tendem_PET = df_loop.loc[chrom_right_tendem_idx][
            'PET count'].sum()

        count_list = [
            chrom_convergence_count, chrom_convergence_PET,
            chrom_divergence_count, chrom_divergence_PET,
            chrom_left_tendem_count, chrom_left_tendem_PET,
            chrom_right_tendem_count, chrom_right_tendem_PET
        ]

        df_convergence_count.loc[chrom] = dict(
            zip(conv_column_list, count_list))

    df_convergence_count.loc['whole genome'] = df_convergence_count.loc[
        chrom_list[:-1]].sum(axis=0)

    df_convergence_count[
        'PET_count_proportion_cdlr'] = df_convergence_count.apply(
            lambda x: convergence_proportion_fn(
                x, 'convergence_PET_count', 'divergence_PET_count',
                'left_tandem_PET_count', 'right_tandem_PET_count'),
            axis=1)

    p2df_convergence_count = p2agg_stats + '_convergence_count.csv'
    df_convergence_count.to_csv(p2df_convergence_count)

    # aggregate NULL motif.
    NULL_name_list = list(set(loop_tag['NULL motif']))
    NULL_name_list.sort()

    NULL_column_list = []
    for n in NULL_name_list:
        if n == 'na':
            continue
        NULL_column_list.append('{}_loop_count'.format(n))
        NULL_column_list.append('{}_PET_count'.format(n))

    df_NULL_count = pd.DataFrame(columns=NULL_column_list, index=chrom_list)

    for chrom in chrom_list[:-1]:
        chrom_loop_idx = (df_loop['left_chr'] == chrom)

        NULL_val_list = []
        for n in NULL_column_list:
            cur_type = n.split('_')[0]
            cur_tag_idx = (loop_tag['NULL motif'] == cur_type)

            chrom_cur_tag_idx = (cur_tag_idx & chrom_loop_idx)

            if n.split('_')[1] == 'loop':
                chrom_cur_count = chrom_cur_tag_idx.sum()
            elif n.split('_')[1] == 'PET':
                chrom_cur_count = df_loop.loc[chrom_cur_tag_idx][
                    'PET count'].sum()

            NULL_val_list.append(chrom_cur_count)

        df_NULL_count.loc[chrom] = dict(zip(NULL_column_list, NULL_val_list))

    df_NULL_count.loc['whole genome'] = df_NULL_count.loc[
        chrom_list[:-1]].sum()

    loop_count_name_list = [x for x in NULL_column_list if 'loop' in x]
    df_NULL_count['loop_nn_nl_nr_ln_rn'] = df_NULL_count.apply(
        lambda x: NULL_proportion_fn(x, loop_count_name_list), axis=1)

    PET_count_name_list = [x for x in NULL_column_list if 'PET' in x]
    df_NULL_count['PET_nn_nl_nr_ln_rn'] = df_NULL_count.apply(
        lambda x: NULL_proportion_fn(x, PET_count_name_list), axis=1)

    p2df_NULL_count = p2agg_stats + '_NULL_motif_count.csv'
    df_NULL_count.to_csv(p2df_NULL_count)

    # READ bedgraph file and get intensity
    # ipdb.set_trace()
    bg = BedGraph(p2chrom_size, p2bedgraph)
    chromfile = pd.read_table(p2chrom_size, names=['chrom', 'size'])
    for row in chromfile.iterrows():
        bg.load_chrom_data(row[1]['chrom'])

    bin_name = '{} binned intensity'.format(nbins)
    df_binned_intensity_per_loop = pd.DataFrame(index=df_loop.index,
                                                columns=['bias', bin_name])
    df_binned_intensity_per_loop['bias'] = loop_tag['bias']

    my_bg = bg
    tmp_df = df_loop.apply(lambda x: get_max_intensity_in_same_len_bins(
        my_bg, nbins, x.left_start, x.left_chr, x.right_end, x.right_chr),
                           axis=1)
    df_binned_intensity_per_loop[bin_name] = tmp_df
    df_binned_intensity_per_loop['convergence'] = loop_tag['convergence']
    df_binned_intensity_per_loop['NULL motif'] = loop_tag['NULL motif']
    df_binned_intensity_per_loop['chrom'] = df_loop['left_chr']

    df_binned_intensity_per_loop.to_pickle(p2binned_intensity_per_loop)