Example #1
def get_chromosome_arms(genome, exclude=None):
    """Return a list of (chrom, start, end) arm intervals for `genome`,
    splitting each chromosome at its centromere (uses bioframe)."""

    if exclude is not None:
        if isinstance(exclude, str):
            exclude = [exclude]
        exclude = [str(item) for item in exclude]
    else:
        exclude = []

    try:
        chromlengths = bioframe.fetch_chromsizes(genome)
        centromeres = bioframe.fetch_centromeres(genome).set_index('chrom')
    except Exception:
        print(f'Information for genome {genome} could not be found.')
        return None

    arms = []
    for chrom, length in chromlengths.items():  # .iteritems() was removed in pandas 2.0
        if chrom in exclude:
            continue

        if chrom in centromeres.index:
            mid = centromeres.loc[chrom, 'mid']
            arms.append((chrom, 0, mid))
            arms.append((chrom, mid, length))
        else:
            arms.append((chrom, 0, length))

    return arms
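
# Minimal usage sketch for get_chromosome_arms (added for illustration, not part
# of the original example): assumes bioframe can reach UCSC for the 'hg38'
# assembly; the assembly name and excluded chromosomes are placeholders.
import bioframe
import pandas as pd

arms = get_chromosome_arms('hg38', exclude=['chrM', 'chrY'])
if arms is not None:
    # turn the list of (chrom, start, end) tuples into a dataframe for later use
    arms_df = pd.DataFrame(arms, columns=['chrom', 'start', 'end'])
    print(arms_df.head())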
Example #2
def get_chroms(genome, ignoreXYMT=True):
    "Get list of chroms to analyze"
    print("Using chroms from " + genome)
    chromsizes = bioframe.fetch_chromsizes(genome)
    chr_list = list(chromsizes.index)
    if ignoreXYMT:
        chr_list = [i for i in chr_list if i not in ("chrM", "chrX", "chrY")]
    return chr_list
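
# Illustrative call (added, not in the original snippet): 'hg19' is just an
# example assembly and bioframe.fetch_chromsizes needs network access to UCSC.
import bioframe

autosomes = get_chroms('hg19', ignoreXYMT=True)
print(autosomes[:5])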
Example #3
def cooler_global_scaling(cool,
                          genome,
                          trans=True,
                          mapper=map,
                          balance='weight',
                          thres=None,
                          ignore_diags=2):

    row_masker = col_masker = cooler_mask(cool, header=balance, thres=thres)
    matrix_fetcher = cooler_matrix_generator(cool, header=balance)
    resolution = cool.info['bin-size']

    chrom_arms = DNA_info.get_chromosome_arms(genome)
    cis_regions = [(arm, arm) for arm in chrom_arms]

    cis_results = cis_binning(cis_regions,
                              matrix_fetcher,
                              row_masker,
                              col_masker,
                              resolution,
                              ignore_diags,
                              mapper=mapper)
    cis_results = pd.concat(cis_results)
    cis_results = cis_results.reset_index().rename(
        columns={'region1': 'region'})
    del cis_results['region2']
    cis_results.set_index(['region', 'diag'], inplace=True, drop=True)

    if trans:
        print('Computing trans expected')
        chromsizes = bioframe.fetch_chromsizes(genome)
        trans_regions = [(bioframe.parse_region(cool.chromnames[i],
                                                chromsizes=chromsizes),
                          bioframe.parse_region(cool.chromnames[j],
                                                chromsizes=chromsizes))
                         for i in range(len(cool.chromnames))
                         for j in range(i + 1, len(cool.chromnames))]

        trans_results = trans_binning(trans_regions,
                                      matrix_fetcher,
                                      row_masker,
                                      col_masker,
                                      resolution,
                                      mapper=mapper)
        trans_results = [
            result for result in trans_results if result is not None
        ]
        trans_results = pd.concat(trans_results)
        trans_results['chrom1'] = trans_results.index.map(
            lambda x: x[0][0]).values
        trans_results['chrom2'] = trans_results.index.map(
            lambda x: x[1][0]).values
        trans_results.set_index(['chrom1', 'chrom2'], inplace=True)

        return cis_results, trans_results
    return cis_results
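
# Hedged usage sketch (added, not in the original example): the helpers used
# above (cooler_mask, cooler_matrix_generator, cis_binning, trans_binning,
# DNA_info) are assumed to be importable from the surrounding project, and
# 'sample.10kb.cool' is a placeholder path, so the call is guarded.
import os

import cooler

if os.path.exists('sample.10kb.cool'):
    clr = cooler.Cooler('sample.10kb.cool')
    cis_exp, trans_exp = cooler_global_scaling(clr, 'hg38', trans=True)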
Example #4
def gene_content(genome, binsize, gc=True):

    chrom_sizes = bioframe.fetch_chromsizes(genome)
    chrom_table = binnify(chrom_sizes, binsize)

    gene_count = frac_gene_coverage(chrom_table, genome)
    if gc:
        fasta_path = f'/net/levsha/share/lab/genomes/{genome}/{genome}.fa'
        fasta_records = load_fasta(fasta_path)
        gene_count['frac_gc'] = frac_gc(chrom_table, fasta_records)

    return gene_count
Example #5
def gene_content(genome, binsize, gc=True, fasta_path=None):
    
    chrom_sizes = bioframe.fetch_chromsizes(genome)
    chrom_table = binnify(chrom_sizes, binsize)

    gene_count = frac_gene_coverage(chrom_table, genome)
    if gc:
        assert fasta_path is not None, 'Please provide valid fasta file path if you want GC content'
        fasta_records = load_fasta(fasta_path) 
        gene_count['frac_gc'] = frac_gc(chrom_table, fasta_records)
    
    return gene_count
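
# Illustrative call (added, not part of the original code): binnify and
# frac_gene_coverage are assumed to be the bioframe helpers of the same name;
# gc=False avoids needing a local FASTA file (UCSC access is still required).
import bioframe
from bioframe import binnify, frac_gene_coverage

bins_hg38 = gene_content('hg38', 100000, gc=False)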
Example #6
def compute_trans_scaling(cooler_path, out_path, resolution, regions1,
                          regions2, labels, title):
    chromsizes = bioframe.fetch_chromsizes('sacCer3',
                                           filter_chroms=False,
                                           as_bed=True)
    avg_contacts = cooltools.expected.diagsum_asymm(
        clr=cooler.Cooler('::/resolutions/'.join(
            (cooler_path, str(resolution)))),
        supports1=list(regions1),
        supports2=list(regions2),
        transforms={
            'balanced': lambda p: p['count'] * p['weight1'] * p['weight2']
        })

    avg_contacts['balanced.avg'] = avg_contacts['balanced.sum'] / avg_contacts[
        'n_valid']

    print('...')
Example #7
def plot_insulation(clr, insulation, windows, resolution, out_path, exclude_chroms, title):
    dir_path = os.path.join(os.path.dirname(out_path), title)

    if not os.path.exists(dir_path):
        os.mkdir(dir_path)

    chromsizes = bioframe.fetch_chromsizes('sacCer3', filter_chroms=False)
    regions = [(k, 0, v) for k, v in chromsizes.drop('chrM').items()]

    for region in regions:
        norm = LogNorm(vmax=0.1, vmin=0.001)
        data = clr.matrix(balance=True).fetch(region)
        fig, ax = plt.subplots(figsize=(20, 4))

        img = plot_45_mat(ax, data, start=0, resolution=resolution, norm=norm, cmap='fall')

        ax.set_aspect(0.5)
        ax.set_ylim(0, 30000)
        format_ticks(ax, rotate=False)
        ax.xaxis.set_visible(False)

        divider = make_axes_locatable(ax)
        cax = divider.append_axes('right', size='1%', pad=0.1, aspect=6)
        plt.colorbar(img, cax=cax)

        insul_region = bioframe.select(insulation, region)
        
        ins_ax = divider.append_axes('bottom', size='50%', pad=0.0, sharex=ax)
        ins_ax.set_prop_cycle(plt.cycler('color', plt.cm.plasma(np.linspace(0, 1, 5))))

        for window in windows:
            ins_ax.plot(insul_region[['start', 'end']].mean(axis=1),
                insul_region[f'log2_insulation_score_{window}'],
                label=f'{window} bp window', lw=1)

        ins_ax.legend(bbox_to_anchor=(1.125, 1.05), loc='upper right')
        fig.suptitle(f'{title}: {region[0]}')

        path = os.path.join(dir_path, '_'.join((region[0], os.path.basename(out_path))))

        plt.savefig(path, dpi=300)
        plt.close(fig)  # free the figure before plotting the next region
Example #8
def get_arms_hg19() -> pd.DataFrame:
    """Downloads the coordinates for chromosomal arms of the
    genome assembly hg19 and returns it as a dataframe."""
    # download chromosomal sizes
    chromsizes = bioframe.fetch_chromsizes("hg19")
    # download centromeres
    centromeres = bioframe.fetch_centromeres("hg19")
    centromeres.set_index("chrom", inplace=True)
    centromeres = centromeres.mid
    # define chromosomes that are well defined (filter out unassigned contigs)
    good_chroms = list(chromsizes.index[:23])
    # construct arm regions (for each chromosome, from 0 to the centromere and from the centromere to the end)
    arms = [
        arm for chrom in good_chroms for arm in (
            (chrom, 0, centromeres.get(chrom, 0)),
            (chrom, centromeres.get(chrom, 0), chromsizes.get(chrom, 0)),
        )
    ]
    # construct dataframe out of arms
    arms = pd.DataFrame(arms, columns=["chrom", "start", "end"])
    return arms
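
# Brief usage sketch (added for illustration): converts the arms dataframe into
# the (chrom, start, end) tuples used as `supports` in the cooltools snippets
# elsewhere in these examples; requires network access to UCSC.
arms_hg19 = get_arms_hg19()
supports_hg19 = [tuple(row) for row in arms_hg19.itertuples(index=False)]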
Example #9
def exclude_regions(df, regions_to_keep=None, genome=None, print_final=False):
    if regions_to_keep:
        assert genome is not None, 'Please provide valid genome'
        chromsizes = bioframe.fetch_chromsizes(genome)
    else:
        if print_final:
            print(np.asarray(df.region.unique()))
        return df

    regions_to_keep = [
        bioframe.parse_region(reg, chromsizes) for reg in regions_to_keep
    ]

    assert 'region' in df.columns

    regions = df['region'].apply(
        lambda x: bioframe.parse_region(x, chromsizes)).values
    chrom, start, end = list(zip(*regions))
    df['chrom'] = chrom
    df['start'] = start
    df['end'] = end

    new_df = []
    for chrom, start, end in regions_to_keep:
        sub_df = bioframe.bedslice(df, (chrom, start, end))
        new_df.append(sub_df)
    new_df = pd.concat(new_df)

    if print_final:
        print(np.asarray(new_df.region.unique()))

    del new_df['chrom']
    del new_df['start']
    del new_df['end']

    return new_df
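
# Hypothetical illustration (added): with no regions_to_keep the dataframe is
# returned unchanged, so the function can act as a no-op default in a pipeline;
# the column names below are made up.
import pandas as pd

demo = pd.DataFrame({'region': ['chr1:0-1000000'], 'score': [1.0]})
assert exclude_regions(demo) is demo  # nothing filtered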
Example #10
def compute_scaling(pairs_paths, out_path, region, exclude_chroms, assembly,
                    centromeres_path, split_arms, normalized, plot_slope,
                    show_average_trans, labels, title, no_cache):
    """
    Compute and plot contact frequency vs genomic separation curves for one or more pairs files.
    """
    labels = list(labels)
    # parse left/right arm parameter of chromosomes to exclude
    exclude_chroms = [chrom.split(':') for chrom in exclude_chroms]

    chromsizes = bioframe.fetch_chromsizes(assembly,
                                           filter_chroms=False,
                                           as_bed=True)
    # drop whole-chromosome exclusions here (no arm suffix); single arms are removed below
    chromsizes = chromsizes[~chromsizes.chrom.isin(
        [chrom[0] for chrom in exclude_chroms if len(chrom) == 1])]

    if centromeres_path:
        centromeres = {}
        with open(centromeres_path) as file:
            for line in file:
                cols = line.split(' ')
                centromeres[cols[0]] = (int(cols[1]) + int(cols[2])) // 2
    else:
        centromeres = bioframe.fetch_centromeres(assembly)
        centromeres.set_index('chrom', inplace=True)
        centromeres = centromeres.mid.to_dict()

    if len(labels) != 0 and len(pairs_paths) != len(labels) and not split_arms:
        sys.exit('Please provide as many labels as pairs paths.')

    if region:
        regions = bioframe.select(chromsizes, region).reset_index()
    else:
        # use chromosomal arms as separate regions if no regions are specified
        arms = bioframe.split(chromsizes, centromeres)
        # remove user-excluded chromosomes/arms
        for chrom in exclude_chroms:
            if len(chrom) == 1:
                # no arm specified, remove entire chromosome
                arms = arms[arms.chrom != chrom[0]]
            elif chrom[1] == 'left':
                # remove specified chromosome with start == 0 (left arm)
                arms = arms[~((arms.chrom == chrom[0]) & (arms.start == 0))]
            elif chrom[1] == 'right':
                # remove specified chromosome with start != 0 (right arm)
                arms = arms[~((arms.chrom == chrom[0]) & (arms.start != 0))]

        # remove 40kb from each side (80kb total) of an arm to remove centromere and telomere regions
        arms = bioframe.ops.expand(arms, -ARM_PADDING)
        # drop arms whose length is non-positive after trimming the side regions
        regions = arms[arms.start < arms.end].reset_index()

    all_scalings = []
    all_avg_trans_levels = []

    for idx, path in enumerate(pairs_paths):
        cis_scalings, avg_trans = None, None

        if split_arms:
            # calculate scalings per arm per chromosome
            cis_scalings, trans_levels = pairlib.scalings.compute_scaling(
                path,
                regions,
                chromsizes,
                dist_range=(int(1e1), int(1e9)),
                n_dist_bins=128,
                chunksize=int(1e7))

            # remove unassigned pairs with start/end positions < 0
            cis_scalings = cis_scalings[(cis_scalings.start1 > 0)
                                        & (cis_scalings.end1 > 0) &
                                        (cis_scalings.start2 > 0) &
                                        (cis_scalings.end2 > 0)]

            sc_agg = (cis_scalings.groupby(
                ['chrom1', 'start1', 'min_dist', 'max_dist']).agg({
                    'n_pairs':
                    'sum',
                    'n_bp2':
                    'sum'
                }).reset_index())
            avail_chroms = set(sc_agg.chrom1)

            for chrom in avail_chroms:
                # calculate scalings for left/right arms (left arms start at position 0 + ARM_PADDING)
                sc_left, avg_trans_left = (calc_pair_freqs(
                    sc_agg[(sc_agg.chrom1 == chrom)
                           & (sc_agg.start1 == ARM_PADDING)], trans_levels,
                    show_average_trans, normalized))
                sc_right, avg_trans_right = (calc_pair_freqs(
                    sc_agg[(sc_agg.chrom1 == chrom)
                           & (sc_agg.start1 != ARM_PADDING)], trans_levels,
                    show_average_trans, normalized))

                dir_path = os.path.join(os.path.dirname(out_path),
                                        os.path.basename(path))
                if not os.path.exists(dir_path):
                    os.mkdir(dir_path)
                chrom_path = os.path.join(
                    dir_path, '_'.join((chrom, os.path.basename(out_path))))
                (plot_scalings(
                    scalings=[sc_left, sc_right],
                    avg_trans_levels=[avg_trans_left, avg_trans_right],
                    plot_slope=plot_slope,
                    labels=['left', 'right'],
                    title=chrom,
                    out_path=chrom_path))
        else:
            if not no_cache:
                # get cached values
                cached = cache.get(path)
                if cached is not None:
                    cis_scalings = cached['cis_scalings'] if cached[
                        'normalized'] == normalized else None
                    avg_trans = cached['avg_trans']

            if no_cache or cis_scalings is None or (avg_trans is None
                                                    and show_average_trans):
                print(
                    f'Computing scalings for file {idx + 1}/{len(pairs_paths)} ...',
                    end='\r')
                # caching disabled or no cached values found

                cis_scalings, trans_levels = pairlib.scalings.compute_scaling(
                    path,
                    regions,
                    chromsizes,
                    dist_range=(int(1e1), int(1e9)),
                    n_dist_bins=128,
                    chunksize=int(1e7))
                # remove unassigned pairs with start/end positions < 0
                cis_scalings = cis_scalings[(cis_scalings.start1 >= 0)
                                            & (cis_scalings.end1 >= 0) &
                                            (cis_scalings.start2 >= 0) &
                                            (cis_scalings.end2 >= 0)]

                sc_agg = (cis_scalings.groupby(['min_dist', 'max_dist']).agg({
                    'n_pairs':
                    'sum',
                    'n_bp2':
                    'sum'
                }).reset_index())

                cis_scalings, avg_trans = calc_pair_freqs(
                    sc_agg, trans_levels, show_average_trans, normalized)

                if not no_cache:
                    cache.set(
                        path, {
                            'cis_scalings': cis_scalings,
                            'avg_trans': avg_trans,
                            'normalized': normalized
                        })
            else:
                print(
                    f'Retrieved cached values for file {idx + 1}/{len(pairs_paths)}.',
                    end='\r')

            # use file names as labels if labels have not been provided
            if len(labels) < len(pairs_paths):
                labels.append(os.path.basename(path))

            all_scalings.append(cis_scalings)
            if avg_trans is not None:
                all_avg_trans_levels.append(avg_trans)

        if len(all_scalings) > 0 and not split_arms:
            plot_scalings(all_scalings, all_avg_trans_levels, plot_slope,
                          labels, title, out_path)
Example #11
import os.path as op
import pandas as pd

import bioframe
import cooler
import cooltools.expected

chromsizes = bioframe.fetch_chromsizes("mm9")
chromosomes = list(chromsizes.index)
supports = [(chrom, 0, chromsizes[chrom]) for chrom in chromosomes]


def test_diagsum(request):
    clr = cooler.Cooler(
        op.join(request.fspath.dirname, "data/CN.mm9.1000kb.cool"))
    tables = cooltools.expected.diagsum(
        clr,
        supports,
        transforms={
            "balanced": lambda p: p["count"] * p["weight1"] * p["weight2"]
        },
        chunksize=10000000,
    )
    pd.concat(
        [tables[support] for support in supports],
        keys=[support[0] for support in supports],
        names=["chrom"],
    )


def test_blocksum(request):
Example #12
def pileup_multiple_dot_lists(cool_file, dot_file_list, exp_cool, resolution,
                              flank, anchor_dist, anchor_flank, plot_name):
    i = 0
    filename1 = cool_file[0].split("/")[-2].split("_hg38")[0]
    filename2 = cool_file[1].split("/")[-2].split("_hg38")[0]
    filename3 = cool_file[2].split("/")[-2].split("_hg38")[0]

    cool = [filename1, filename2, filename3]
    exp_cool = [exp_cool[0], exp_cool[1], exp_cool[2]]
    conditions = ['HiC-FA-DpnII', 'HiC-DSG-DpnII', 'MicroC-DSG-MNase']

    print(filename1)
    print(filename2)
    print(filename3)

    resolution = resolution
    flank = flank
    #resolution=sys.argv[4]
    hg38 = bioframe.fetch_chromsizes('hg38')
    chromsizes = bioframe.fetch_chromsizes('hg38')
    chromosomes = list(chromsizes.index)
    binsize = resolution

    cooler_paths = {
        'HiC-FA-DpnII': cool_file[0],
        'HiC-DSG-DpnII': cool_file[1],
        'MicroC-DSG-MNase': cool_file[2],
    }

    exp_paths = {
        'HiC-FA-DpnII': exp_cool[0],
        'HiC-DSG-DpnII': exp_cool[1],
        'MicroC-DSG-MNase': exp_cool[2],
    }

    long_names = {
        'HiC-FA-DpnII': 'HiC-FA-DpnII',
        'HiC-DSG-DpnII': 'HiC-DSG-DpnII',
        'MicroC-DSG-MNase': 'MicroC-DSG-MNase',
    }

    pal = sns.color_palette('colorblind')
    colors = {
        filename1: pal[0],
        filename2: '#333333',
        filename3: pal[2],
    }

    clrs = {cond: cooler.Cooler(cooler_paths[cond]) for cond in conditions}

    anchor_dist = anchor_dist
    anchor_flank = flank
    # dot file list

    gs = plt.GridSpec(nrows=len(conditions), ncols=len(dot_file_list) + 1)
    plt.figure(figsize=(6 * len(conditions) + 1, 7))
    mean_list = {}
    for dot_file in dot_file_list:
        print(dot_file)
        sites = pd.read_table(dot_file)
        mid1 = (sites['start1'] + sites['end1']) / 2
        mid2 = (sites['start2'] + sites['end2']) / 2
        new_file = pd.DataFrame()
        new_file = pd.concat([sites['chrom1'], mid1, sites['chrom2'], mid2],
                             axis=1)

        # "convergent" orientation of paired CTCF motifs
        # sites = sites[(sites['strand1'] == '+') & (sites['strand2'] == '-')] ## not working
        new_file.columns = ['chrom1', 'mid1', 'chrom2', 'mid2']
        #print(len(new_file))
        new_file.head()
        supports = [(chrom, 0, chromsizes[chrom]) for chrom in chromosomes]

        snippet_flank = flank
        windows1 = snipping.make_bin_aligned_windows(binsize,
                                                     new_file['chrom1'],
                                                     new_file['mid1'],
                                                     flank_bp=snippet_flank)
        # windows1['strand'] = sites['strand1']
        windows2 = snipping.make_bin_aligned_windows(binsize,
                                                     new_file['chrom2'],
                                                     new_file['mid2'],
                                                     flank_bp=snippet_flank)
        windows = pd.merge(windows1,
                           windows2,
                           left_index=True,
                           right_index=True,
                           suffixes=('1', '2'))
        windows = snipping.assign_regions(windows, supports)
        windows = windows.dropna()
        windows.head()
        stacks = {}
        piles = {}
        # mid point distplot
        k = 0
        r_list = []
        mean_1 = []
        for cond in conditions:
            expected = pd.read_table(exp_paths[cond])
            snipper = snipping.ObsExpSnipper(clrs[cond], expected)
            #print(snipper)
            stack = snipping.pileup(windows, snipper.select, snipper.snip)
            stacks[cond] = stack
            piles[cond] = np.nanmean(stack, axis=2)

            mid_pixel_norm = []
            sq_size = piles[cond].shape[0]
            # np.int was removed in NumPy 1.24; use the builtin int instead
            midpoint = int(np.floor(sq_size / 2))
            background_size_start = int(np.ceil(sq_size * 40 / 100))
            background_size_end = int(np.floor(sq_size * 60 / 100))

            print(midpoint)
            print(background_size_start)
            print(background_size_end)

            slice_ = piles[cond]
            # mid point of each dot
            mid_pixel = slice_[midpoint, midpoint]
            #mid_list_9pixels=np.nanmean(slice_[midpoint-1:midpoint+2,midpoint-1:midpoint+2])
            # upper left
            up_left = np.nanmean(
                slice_[:background_size_start, :background_size_start])
            # upper right
            up_right = np.nanmean(slice_[:background_size_start,
                                         background_size_end:])
            # lower left
            lower_left = np.nanmean(
                slice_[background_size_end:, :background_size_start])
            # lower right
            lower_right = np.nanmean(slice_[background_size_end:,
                                            background_size_end:])

            # mid point of each dot
            mid_pixel = slice_[midpoint, midpoint]
            # Stripe up
            stripe_up = np.nanmean(
                slice_[:background_size_start,
                       background_size_start:background_size_end])
            # stripe down
            stripe_down = np.nanmean(
                slice_[background_size_end:,
                       background_size_start:background_size_end])
            # stripe left
            stripe_left = np.nanmean(
                slice_[background_size_start:background_size_end, :
                       background_size_start])
            # stripe right
            stripe_right = np.nanmean(
                slice_[background_size_start:background_size_end,
                       background_size_end:])

            stripes_mean = (stripe_up + stripe_right) / 2
            corners_mean = (up_left + up_right + lower_right) / 3

            mid_pixel_norm.append(mid_pixel /
                                  ((stripes_mean + corners_mean) / 2))
            #mid_pixel_norm.append(mid_list_9pixels/((stripes_mean+corners_mean)/2))
            ax = plt.subplot(gs[k, i])
            new_list = mid_pixel_norm
            m = np.mean(new_list)
            mean_1.append(np.mean(new_list))
            mean_list[dot_file] = mean_1
            ax = sns.kdeplot(new_list, shade=True)
            plt.axvline(m, color='k', linestyle='dashed', linewidth=2)
            min_ylim, max_ylim = plt.ylim()
            k = k + 1
            ax.yaxis.set_visible(True)
            ax.xaxis.set_visible(True)
            if k > 0:
                ax.yaxis.set_visible(True)
                ax.xaxis.set_visible(True)
                ax = plt.subplot(gs[len(conditions)])
        i = i + 1
        plt.title(plot_name)
        #plt.savefig(plot_name)
        # use a context manager so the CSV file is closed deterministically
        with open(plot_name + ".csv", "w", newline="") as csv_file:
            writer = csv.writer(csv_file)
            for key, val in mean_list.items():
                writer.writerow([key, val])
Example #13
def save_bigwig(vectors, savepath, genome, columns=['E1', 'E2', 'E3']):
    chroms = fetch_chromsizes(genome)
    for item in columns:
        save = savepath + '.{}.bw'.format(item)
        create_dir(save)
        to_bigwig(vectors, chroms, save, value_field=item)
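
# Hedged usage sketch (added, kept as a comment): `vectors` is expected to be a
# bins-like dataframe (chrom/start/end plus the listed columns); create_dir and
# to_bigwig are project helpers assumed to wrap bioframe.to_bigwig, and the
# eigenvector table and output path below are hypothetical.
# save_bigwig(eigenvector_bins, 'results/sample_eigs', 'mm10', columns=['E1'])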
Example #14
def pileup_multiple_dot_lists(cool_file, dot_file_list, exp_cool, resolution,
                              flank, anchor_dist, anchor_flank, pileup_name):
    i = 0
    filename1 = cool_file[0].split("/")[-2].split("_hg38")[0]
    filename2 = cool_file[1].split("/")[-2].split("_hg38")[0]
    filename3 = cool_file[2].split("/")[-2].split("_hg38")[0]

    cool = [filename1, filename2, filename3]
    exp_cool = [exp_cool[0], exp_cool[1], exp_cool[2]]
    conditions = ['HiC-FA-DpnII', 'HiC-DSG-DpnII', 'MicroC-DSG-MNase']

    print(filename1)
    print(filename2)
    print(filename3)
        
    resolution=resolution
    flank = flank
    #resolution=sys.argv[4]
    hg38 = bioframe.fetch_chromsizes('hg38')
    chromsizes = bioframe.fetch_chromsizes('hg38')
    chromosomes = list(chromsizes.index)
    binsize = resolution

    cooler_paths = {
        'HiC-FA-DpnII': cool_file[0],
        'HiC-DSG-DpnII': cool_file[1],
        'MicroC-DSG-MNase': cool_file[2],
    }

    exp_paths = {
        'HiC-FA-DpnII': exp_cool[0],
        'HiC-DSG-DpnII': exp_cool[1],
        'MicroC-DSG-MNase': exp_cool[2],
    }

    long_names = {
        'HiC-FA-DpnII': 'HiC-FA-DpnII',
        'HiC-DSG-DpnII': 'HiC-DSG-DpnII',
        'MicroC-DSG-MNase': 'MicroC-DSG-MNase',
    }

    pal = sns.color_palette('colorblind')
    colors = {
        filename1: pal[0],
        filename2: '#333333',
        filename3: pal[2],
    }

    clrs = {cond: cooler.Cooler(cooler_paths[cond]) for cond in conditions}

    anchor_dist = anchor_dist
    anchor_flank = flank
    # dot file list
    gs = plt.GridSpec(nrows=len(conditions), ncols=len(dot_file_list) + 1)
    plt.figure(figsize=(6 * len(conditions)+1, 7))
            
    for dot_file in dot_file_list:
        print(dot_file)
        sites = pd.read_table(dot_file)
        mid1 = (sites['start1'] + sites['end1']) / 2
        mid2 = (sites['start2'] + sites['end2']) / 2
        new_file = pd.DataFrame()
        new_file = pd.concat([sites['chrom1'], mid1, sites['chrom2'], mid2],
                             axis=1)

        # "convergent" orientation of paired CTCF motifs
        # sites = sites[(sites['strand1'] == '+') & (sites['strand2'] == '-')] ## not working
        new_file.columns = ['chrom1', 'mid1', 'chrom2', 'mid2']
        print(len(new_file))
        new_file.head()
        supports = [(chrom, 0, chromsizes[chrom]) for chrom in chromosomes]

        snippet_flank = flank
        windows1 = snipping.make_bin_aligned_windows(
            binsize, 
            new_file['chrom1'], 
            new_file['mid1'],
            flank_bp=snippet_flank)
        # windows1['strand'] = sites['strand1']
        windows2 = snipping.make_bin_aligned_windows(
            binsize, 
            new_file['chrom2'], 
            new_file['mid2'],
            flank_bp=snippet_flank)
        windows = pd.merge(windows1, windows2, left_index=True, right_index=True, suffixes=('1', '2'))
        windows = snipping.assign_regions(windows, supports)
        windows = windows.dropna()
        windows.head()
        #stacks = {}
        piles = {}
        k = 0
        for cond in conditions:
            expected = pd.read_table(exp_paths[cond])
            snipper = snipping.ObsExpSnipper(clrs[cond], expected)
            print(snipper)
            stack = snipping.pileup(windows, snipper.select, snipper.snip)
            #stacks[cond] = stack
            piles[cond] = np.nanmean(stack, axis=2)

            opts = dict(
                vmin=-0.25,
                vmax=0.25,
                extent=[-flank//1000, flank//1000, -flank//1000, flank//1000],
                cmap='coolwarm'
            )

            ax = plt.subplot(gs[k,i])
            img = ax.matshow(
                np.log10(piles[cond]), #piles[cond]),  
                **opts)
            #plt.title(dot_name_list[i],fontsize=7)
            #ax.xaxis.tick_bottom()
            k = k + 1
            ax.yaxis.set_visible(True)
            ax.xaxis.set_visible(False)
            if k > 0:
                ax.yaxis.set_visible(True)
                ax.xaxis.set_visible(False)
                ax = plt.subplot(gs[len(conditions)])
                #plt.suptitle(f'Dot calls ({anchor_dist//1000} +/- {anchor_flank//1000})kb apart\n'
                #                 f'Hi-C resolution = {binsize//1000}kb; # of pairs = {len(windows)}')
        #plt.title(dot_name_list[i])
        i = i + 1
    #plt.colorbar(img, cax=ax)
    plt.savefig(pileup_name)
Example #15
def plot_pileup(cooler_paths, out_path, resolution, region, size, assembly,
                exclude_chroms, centromeres_path, title):
    """
    Plots pileups of a specified size around centromeres for an input cooler file. Input two file paths as arguments to create a ratio of pileups instead.
    """
    if len(cooler_paths) > 2:
        sys.exit('Please provide up to 2 cooler files max.')

    clrs = []

    for path in cooler_paths:
        clr_ext = os.path.splitext(path)[1]
        if clr_ext == '.cool':
            clr = cooler.Cooler(path)
        elif clr_ext == '.mcool':
            clr = cooler.Cooler('::/resolutions/'.join(
                (path, str(resolution))))
        else:
            sys.exit('Please provide a .cool or .mcool file.')
        clrs.append(clr)

    chromsizes = bioframe.fetch_chromsizes(assembly,
                                           filter_chroms=False,
                                           as_bed=True)
    chromsizes = chromsizes[~chromsizes.chrom.isin(exclude_chroms)]

    if centromeres_path:
        features = pd.read_csv(centromeres_path,
                               delim_whitespace=True,
                               header=None,
                               names=['chrom', 'start', 'end', 'mid'])
        features['mid'] = features.apply(lambda row:
                                         (row['start'] + row['end']) // 2,
                                         axis=1)
    else:
        pass  # TODO: implement

    flank = size // 2

    stacks = [
        snip_pileup(clr, resolution, features, chromsizes, flank)
        for clr in clrs
    ]

    vmax = -3.75
    vmin = -1.75
    cmap = 'fall'

    if len(stacks) == 2:
        stacks[0] = stacks[0] / stacks[1]
        vmax = 1
        vmin = -1
        cmap = 'RdBu'

    plt.imshow(np.log10(stacks[0]), vmax=vmax, vmin=vmin, cmap=cmap)
    plt.colorbar(label='log10 mean')
    ticks_px = np.linspace(0, flank * 2 // resolution, 5)
    ticks_kbp = ((ticks_px - ticks_px[-1] / 2) * resolution //
                 1000).astype(int)
    plt.xticks(ticks_px, ticks_kbp)
    plt.yticks(ticks_px, ticks_kbp)
    plt.xlabel('relative position, kbp')
    plt.ylabel('relative position, kbp')
    plt.title(title)
    plt.savefig(out_path, dpi=300)
Example #16
import os.path as op
import numpy as np
import pandas as pd

import bioframe
import cooler
import cooltools.expected

chromsizes = bioframe.fetch_chromsizes('mm9')
chromosomes = list(chromsizes.index)
supports = [(chrom, 0, chromsizes[chrom]) for chrom in chromosomes]


def test_diagsum(request):
    clr = cooler.Cooler(op.join(request.fspath.dirname, 'data/CN.mm9.1000kb.cool'))
    tables = cooltools.expected.diagsum(
            clr, 
            supports, 
            transforms={
                'balanced': lambda p: p['count'] * p['weight1'] * p['weight2']
            },
            chunksize=10000000)
    exc = pd.concat(
        [tables[support] for support in supports], 
        keys=[support[0] for support in supports], 
        names=['chrom'])


def test_blocksum(request):
    clr = cooler.Cooler(op.join(request.fspath.dirname, 'data/CN.mm9.1000kb.cool'))
    records = cooltools.expected.blocksum_pairwise(
Example #17
def fetch_chromsizes(db):
    import bioframe
    chromsizes = bioframe.fetch_chromsizes(db)
    print(chromsizes.to_csv(sep='\t'))
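
# Illustrative call (added): prints the chromsizes table to stdout as TSV;
# 'sacCer3' is only an example assembly and fetching it requires network access.
fetch_chromsizes('sacCer3')
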
from matplotlib.gridspec import GridSpec
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
mpl.style.use('seaborn-white')

import multiprocess as mp
import numpy as np
import pandas as pd
import bioframe
import cooltools
import cooler
from cooltools.eigdecomp import cooler_cis_eig

mm10 = bioframe.fetch_chromsizes('mm10')
chromsizes = bioframe.fetch_chromsizes('mm10')
chromosomes = list(chromsizes.index)

binsize = 10000
bins = cooler.binnify(mm10, binsize)
fasta_records = bioframe.load_fasta('/data05/genomes/mm10_20chr.fa')
bins['GC'] = bioframe.tools.frac_gc(bins, fasta_records)
bins.head()

import fnmatch
import os

for file in os.listdir('.'):
    if fnmatch.fnmatch(file, '*_10kb.cool'):
        clr = cooler.Cooler(file)
        cond = file.split('.')[0]