Пример #1
0
def doSaddle(filename, eig, gen):
    """Compute one 5x5 quintile "saddle" matrix per chromosome.

    For every chromosome the unbalanced contact matrix is fetched from the
    cooler file, converted with ``observedOverExpected``, and restricted to
    bins whose column sum is positive.  The bins are then split into five
    bands by the 20%-wide percentile ranges of ``eig`` and the mean of each
    band-by-band sub-matrix is stored in the saddle.

    Parameters
    ----------
    filename : path of the cooler file; also passed to ``getResolution``.
    eig : array sliced with genome-wide bin offsets
        (``gen.chrmStartsBinCont`` / ``gen.chrmEndsBinCont``).
    gen : str
        Genome folder name, appended to the hard-coded data directory.

    Returns
    -------
    list of (5, 5) float arrays, one per chromosome in genome order.
    Chromosomes with five or fewer usable bins contribute an all-zero matrix.
    """
    clr = cooler.Cooler(filename)

    genome = Genome("/home/magus/HiC2011/data/" + gen, readChrms=["#", "X"])
    genome.setResolution(getResolution(filename))

    per_chromosome = []
    for chrom_idx in range(genome.chrmCount):
        first_bin = genome.chrmStartsBinCont[chrom_idx]
        last_bin = genome.chrmEndsBinCont[chrom_idx]

        contacts = clr.matrix(balance=False).fetch(genome.idx2label[chrom_idx])
        contacts = observedOverExpected(contacts)

        # Keep only bins that have any signal left after the O/E transform.
        covered = np.sum(contacts, axis=0) > 0
        contacts = contacts[covered][:, covered]
        track = eig[first_bin:last_bin][covered]

        saddle = np.zeros((5, 5), dtype=float)
        if len(track) > 5:
            # One boolean mask per quintile.  Bounds are strict on both
            # sides, so bins sitting exactly on a percentile edge belong to
            # no band.
            band_masks = []
            for band in range(5):
                low, high = np.percentile(track, [20 * band, 20 * band + 20])
                band_masks.append((track > low) * (track < high))

            for row, row_mask in enumerate(band_masks):
                for col, col_mask in enumerate(band_masks):
                    block = contacts[np.ix_(row_mask, col_mask)]
                    saddle[row, col] += block.mean()
        per_chromosome.append(saddle)

    return per_chromosome
Пример #2
0
def doSaddles(q, E1_values, genome_db):
    """Build per-chromosome saddle matrices plus their average and strength.

    Parameters
    ----------
    q : 2D array
        Genome-wide heatmap, sliced with the bin offsets of ``genome_db``.
    E1_values : 1D array
        Per-bin track used to rank bins, indexed like ``q``.
    genome_db : object exposing ``chrmCount``, ``chrmStartsBinCont``,
        ``chrmEndsBinCont`` and ``idx2label``.

    Returns
    -------
    (saddles, strength)
        ``saddles`` maps chromosome label -> (5, 5) array and additionally
        holds the key ``"all_average"`` with the cell-wise mean over all
        chromosomes (NaN cells ignored).  ``strength`` is
        ``log(AA * BB / AB**2)`` taken from the corners of that average.
        Chromosomes with five or fewer non-empty bins are left out.
    """
    saddles = {}
    for chrom_idx in range(genome_db.chrmCount):
        first_bin = genome_db.chrmStartsBinCont[chrom_idx]
        last_bin = genome_db.chrmEndsBinCont[chrom_idx]
        contacts = q[first_bin:last_bin, first_bin:last_bin]
        track = E1_values[first_bin:last_bin]

        covered = np.sum(contacts, axis=0) > 0
        if not np.count_nonzero(covered) > 5:
            # Too few informative bins: this chromosome is omitted.
            continue

        contacts = contacts[covered][:, covered]
        contacts = observedOverExpected(contacts)
        track = track[covered]
        assert contacts.shape[0] == contacts.shape[1] == len(track)

        # One mask per 20%-wide percentile band; edges are excluded on both
        # sides, so a band can be empty.
        bands = []
        for band in range(5):
            low, high = np.percentile(track, [20 * band, 20 * band + 20])
            bands.append((track > low) * (track < high))

        saddle = np.empty((5, 5), dtype=float)
        for row, row_mask in enumerate(bands):
            for col, col_mask in enumerate(bands):
                if row_mask.any() and col_mask.any():
                    saddle[row, col] = np.nanmean(
                        contacts[np.ix_(row_mask, col_mask)])
                else:
                    saddle[row, col] = np.nan
        saddles[genome_db.idx2label[chrom_idx]] = saddle

    # Cell-wise average over chromosomes, skipping NaN entries.
    all_average = np.zeros((5, 5), dtype=float)
    for row in range(5):
        for col in range(5):
            finite_cells = [
                per_chrom[row, col] for per_chrom in saddles.values()
                if not np.isnan(per_chrom[row, col])
            ]
            all_average[row, col] = np.average(finite_cells)
    saddles["all_average"] = all_average

    same_low = all_average[0, 0]
    same_high = all_average[-1, -1]
    cross = all_average[0, -1]
    strength = math.log(same_low * same_high / (cross * cross))
    return saddles, strength
Пример #3
0
def doSaddleError(filename, eig, gen, correct=False):
    """Accumulate a genome-wide saddle matrix with 100 bootstrap replicates.

    Parameters
    ----------
    filename : path of an h5dict file holding a ``"heatmap"`` entry; also
        passed to ``getResolution``.
    eig : array or the string ``"GC"``
        Per-bin track used to rank bins, indexed with genome-wide bin
        offsets.  The literal string ``"GC"`` selects the concatenated
        ``GCBin`` track of the genome instead.
    gen : str
        Genome folder name, appended to the hard-coded data directory.
    correct : bool
        When true the heatmap is passed through ``completeIC`` first.

    Returns
    -------
    (saddle, permutted)
        ``saddle`` is a (5, 5) array holding, per band pair, the sum over
        chromosomes of the mean observed/expected value.  ``permutted`` is a
        list of 100 (5, 5) arrays built the same way from values resampled
        with replacement (``np.random.choice``).  An empty band pair yields
        NaN and therefore makes that cell NaN in all accumulators.
    """
    genome = Genome("/home/magus/HiC2011/data/" + gen, readChrms=["#", "X"])
    data = h5dict(filename, 'r')["heatmap"]
    if correct:
        data = completeIC(data)
    genome.setResolution(getResolution(filename))

    # Only compare against the sentinel when a string was really passed:
    # `array == "GC"` is elementwise on current NumPy and cannot be used as
    # an `if` condition.
    if isinstance(eig, str) and eig == "GC":
        eig = np.concatenate(genome.GCBin)

    saddle = np.zeros((5, 5), dtype=float)
    permutted = [np.zeros((5, 5), dtype=float) for _ in range(100)]

    for chrom in range(genome.chrmCount):
        st = genome.chrmStartsBinCont[chrom]
        end = genome.chrmEndsBinCont[chrom]
        cur = observedOverExpected(data[st:end, st:end])

        # Drop bins without any signal from both the matrix and the track.
        mask = np.sum(cur, axis=0) > 0
        cur = cur[mask][:, mask]
        track = eig[st:end][mask]
        if len(track) <= 5:
            continue

        # One mask per 20%-wide percentile band (edges excluded).
        bands = []
        for band in range(5):
            low, high = np.percentile(track, [20 * band, 20 * band + 20])
            bands.append((track > low) * (track < high))

        for i, row_mask in enumerate(bands):
            for j, col_mask in enumerate(bands):
                values = cur[np.ix_(row_mask, col_mask)].reshape(-1)
                for k in range(100):
                    resampled = np.random.choice(values, len(values),
                                                 replace=True)
                    permutted[k][i, j] += resampled.mean()
                saddle[i, j] += values.mean()
    return saddle, permutted
Пример #4
0
def _oriented_first_eigenvector(matrix, gc_track, keep):
    """Return the first eigenvector of ``matrix`` in chromosome coordinates.

    ``matrix`` is the square array restricted to the bins where ``keep`` is
    True; ``gc_track`` and ``keep`` both cover every bin of the chromosome.
    The vector is negated when its Spearman correlation with the GC track is
    negative, and bins outside ``keep`` are NaN in the result.
    """
    eigenvectors, _eigenvalues = EIG(matrix, numPCs=1)
    first = eigenvectors[0]
    if spearmanr(first, gc_track[keep])[0] < 0:
        first = -first
    expanded = np.full(len(keep), np.nan)
    expanded[keep] = first
    return expanded


def get_by_chr_E1(genome_db, resolution):
    """Compute per-chromosome first eigenvectors from the raw heatmap.

    Reads the module-level ``heatmap_filepath`` (a trailing ``.IC`` is
    stripped to find the raw heatmap), loads and filters it into the global
    ``BD_raw``, and reads genome-wide eigenvector values from the companion
    ``<raw>.eig`` file (column ``f2``).

    Parameters
    ----------
    genome_db : genome object exposing ``chrmCount``, ``chrmStartsBinCont``
        and ``chrmEndsBinCont``; also handed to ``binnedData``.
    resolution : heatmap resolution handed to ``binnedData``.

    Returns
    -------
    dict with three sub-dicts keyed by chromosome index:
      ``"Classic"``  first eigenvector of the filtered heatmap block,
      ``"OE"``       first eigenvector of its observed/expected transform,
      ``"genome_wide_Classic"``  the slice of the ``.eig`` file values.
    ``"Classic"`` and ``"OE"`` vectors span every bin of the chromosome, with
    NaN for bins that were filtered out; chromosomes with five or fewer
    usable bins have no entry in them.
    """
    if heatmap_filepath.endswith(".IC"):
        raw = heatmap_filepath[:-3]
    else:
        raw = heatmap_filepath

    # Single-argument call: prints the same text on Python 2 and Python 3.
    print("Using raw heatmap  %s" % raw)
    global BD_raw
    BD_raw = binnedData.binnedData(resolution, genome_db)
    BD_raw.simpleLoad(raw, 'heatmap')
    BD_raw.removeDiagonal()

    # Remove bins with less than half of a bin sequenced
    BD_raw.removeBySequencedCount(0.5)
    # We'll do iterative correction and Eigenvector expansion on trans data only!
    # We want to remove cis, because later we want to remove poor regions in trans
    BD_raw.removeCis()
    # Truncate top 0.05% of interchromosomal counts (possibly, PCR blowouts)
    # Do this before removing poor regions, because single blowouts may give
    # lots of contacts to a region which does not have much contacts otherwise.
    BD_raw.truncTrans(high=0.0005)
    # Remove 1% of regions with low coverage
    BD_raw.removePoorRegions(cutoff=1)
    # Fake cis counts. Data gets iteratively corrected during this process...
    BD_raw.fakeCis()
    # Bins with zero counts are not removed globally (no BD.removeZeros());
    # they are masked per chromosome in the loop below.

    result = {"OE": {}, "Classic": {}, "genome_wide_Classic": {}}
    genom_wide_E1 = np.genfromtxt(raw + ".eig", dtype=None)['f2']
    for chrom in range(genome_db.chrmCount):
        st = genome_db.chrmStartsBinCont[chrom]
        end = genome_db.chrmEndsBinCont[chrom]
        cur = BD_raw.dataDict['heatmap'][st:end, st:end]
        keep = np.sum(cur, axis=0) > 0
        if keep.sum() > 5:
            gc_track = BD_raw.trackDict["GC"][st:end]
            cur = cur[keep][:, keep]
            result["Classic"][chrom] = _oriented_first_eigenvector(
                cur, gc_track, keep)

            cur = observedOverExpected(cur)
            # This mask is relative to the already-compressed matrix.
            keep_oe = np.sum(cur, axis=0) > 0
            if keep_oe.sum() > 5:
                cur = cur[keep_oe][:, keep_oe]
                # Map the compressed mask back to chromosome coordinates so
                # the GC track and the output vector line up with "Classic".
                keep_both = np.zeros(len(keep), dtype=bool)
                keep_both[np.flatnonzero(keep)[keep_oe]] = True
                result["OE"][chrom] = _oriented_first_eigenvector(
                    cur, gc_track, keep_both)

        result["genome_wide_Classic"][chrom] = genom_wide_E1[st:end]
    return result
Пример #5
0
def snip(segmentation, map, output_prefix, format, balance, niter, window,
         enrichment_only, diagonals_to_remove):
    """
    Create snips for TADs and calculate enrichment with shuffled control.
    OUTPUT_PREFIX: The prefix for writing output files (pickle with snips and tsv file with TAD info).

    Output files to be created:
      {OUTPUT_PREFIX}.TADmetadata.tsv
    if not --enrichment-only:
      {OUTPUT_PREFIX}.TADsnips.pickle
      {OUTPUT_PREFIX}.TADsnips_shuf0.pickle etc.

    Example run:
      avTAD snip data/OSC_TADS.bed data/OSC_dm3.cool tmp_results --format cool --diagonals-to-remove 2 --balance --niter 2

    Arguments:
      segmentation: whitespace-separated file without header; the first three
        columns are chromosome, start and end, extra columns are kept.
      map: path of the heatmap, read according to `format`.
      format: one of 'cool', 'hiclib_heatmap', 'hiclib_bychr'; anything else
        raises ValueError.
      balance: forwarded to the heatmap reader.
      niter: number of shuffled control segmentations to generate.
      window: forwarded to `snipper`.
      enrichment_only: when true, only the TSV is written (no pickles).
      diagonals_to_remove: number of diagonals, counted from the main one,
        that are set to NaN in the observed/expected matrices.
    """

    logger = get_logger(__name__)
    logger.info(
        f"Running snipping for: segmentation file {segmentation}, heatmap {map} in {format} format ..."
    )

    logger.info(
        f"Reading {map} file with balance={balance} in {format} format ...")
    if format == 'cool':
        dataset, chrms, resolution = read_cooler(map, balance=balance)
    elif format == 'hiclib_heatmap':
        dataset, chrms, resolution = read_hiclib_heatmap(map, balance=balance)
    elif format == 'hiclib_bychr':
        dataset, chrms, resolution = read_hiclib_bychr(map, balance=balance)
    else:
        raise ValueError(f'Map format {format} is not supported ...')

    logger.info(f"Reading segmentation file: {segmentation}")
    # Raw string: '\s' in a normal literal is an invalid escape sequence.
    # NOTE(review): the pattern matches ONE whitespace character, so runs of
    # spaces produce empty columns -- confirm whether r'\s+' is wanted.
    df_segmentation = pd.read_csv(segmentation,
                                  sep=r'\s',
                                  header=None,
                                  engine='python')
    add_columns = list(df_segmentation.columns[3:]) if len(
        df_segmentation.columns) > 3 else []
    df_segmentation.columns = ['ch', 'bgn', 'end'] + add_columns

    # Convert base-pair coordinates to bin indices.
    df_segmentation.loc[:, 'bgn_bin'] = df_segmentation.bgn // resolution
    df_segmentation.loc[:, 'end_bin'] = df_segmentation.end // resolution
    df_segmentation.loc[:,
                        'TAD_size'] = df_segmentation.end_bin - df_segmentation.bgn_bin

    # Sorting by chromosome here keeps the row order aligned with the
    # per-chromosome groupby output that is merged back by index below.
    df_segmentation = df_segmentation.drop_duplicates().sort_values(
        ['ch', 'bgn_bin']).reset_index(drop=True)

    chrms_used = np.unique(df_segmentation.loc[:, 'ch'].values)
    chrms = [ch for ch in chrms if ch in chrms_used]

    logger.info(f"Chromosomes in the dataset: {dataset.keys()}")
    logger.info(
        f"Lengths of chromosomes in bins of {resolution} bp: \n{[(ch, len(dataset[ch])) for ch in chrms]}"
    )
    logger.info(f"Selected chromosomes are: {chrms}")

    # Creating shuffled segmentations
    def shuffle_segmentation_dataframe(x):
        """Shuffle the segments of one chromosome, keeping each row's size."""
        segmentation = x[['bgn_bin', 'end_bin']].values
        shuf, order = shuffle_segmentation(segmentation)
        ret = pd.DataFrame(shuf, columns=['bgn_bin', 'end_bin']).astype(int)
        ret.loc[:, 'index'] = order
        ret.loc[:, 'TAD_size'] = ret.end_bin - ret.bgn_bin
        # Restore the original row order so sizes can be compared row by row.
        ret = ret.sort_values('index').reset_index(drop=True)
        assert np.all(x['TAD_size'].values == ret['TAD_size'].values)
        return ret

    for i in range(niter):
        df_segmentation_shuffled = df_segmentation.groupby('ch').apply(shuffle_segmentation_dataframe)\
            .reset_index().drop(['level_1', 'index', 'ch', 'TAD_size'], axis=1)

        df_segmentation_shuffled.columns = [
            f'bgn_bin_shuf{i}', f'end_bin_shuf{i}'
        ]

        df_segmentation = pd.merge(df_segmentation,
                                   df_segmentation_shuffled,
                                   left_index=True,
                                   right_index=True)

    # Computing observed over expected
    dataset_obsexp = {}
    for ch in chrms:
        mtx = numutils.observedOverExpected(dataset[ch])
        # Blank the off-diagonals 1 .. diagonals_to_remove-1 on both sides,
        # then the main diagonal.
        for i in range(1, diagonals_to_remove):
            np.fill_diagonal(mtx[i:, :-i], np.nan)
            np.fill_diagonal(mtx[:-i, i:], np.nan)
        if diagonals_to_remove:
            np.fill_diagonal(mtx, np.nan)
        dataset_obsexp[ch] = mtx

    # Enrichment statistics for the real segmentation ('') and every shuffle.
    for mod in [''] + [f'_shuf{i}' for i in range(niter)]:
        enrichments = []
        for i, r in df_segmentation.iterrows():
            mtx = np.log2(
                dataset_obsexp[r.ch][r[f'bgn_bin{mod}']:r[f'end_bin{mod}'],
                                     r[f'bgn_bin{mod}']:r[f'end_bin{mod}']])
            # log2(0) gives -inf; treat it as missing.
            mtx[np.isinf(mtx)] = np.nan
            enrichment = (np.nansum(mtx), np.nanmean(mtx), np.nanmedian(mtx),
                          np.sum(np.isfinite(mtx)))
            enrichments.append(np.array(enrichment))
        enrichments = np.array(enrichments)

        df_segmentation.loc[:, f"sum{mod}"] = enrichments[:, 0]
        df_segmentation.loc[:, f"mean{mod}"] = enrichments[:, 1]
        df_segmentation.loc[:, f"median{mod}"] = enrichments[:, 2]
        df_segmentation.loc[:, f"nelements{mod}"] = enrichments[:, 3]

    # Save enrichment dataframe to a file:

    cols = ['bgn_bin', 'end_bin', 'sum', 'mean', 'median', 'nelements']
    columns = ['ch', 'bgn', 'end', 'TAD_size'] + add_columns + cols + [
        f'{x}_shuf{i}' for i in range(niter) for x in cols
    ]
    df_segmentation[columns].to_csv(f"{output_prefix}.TADmetadata.tsv",
                                    sep='\t',
                                    index=True,
                                    header=True)

    if not enrichment_only:
        # Retrieval of snippets, log2 and filling inf with nans included:
        snips = snipper(segmentations=df_segmentation,
                        dataset=dataset_obsexp,
                        window=window)

        # Save snippets to file; the context manager closes the handle.
        with open(f"{output_prefix}.TADsnips.pickle", 'wb') as handle:
            pickle.dump(snips, handle)

        for i in range(niter):
            # Retrieval of snippets:
            snips = snipper(segmentations=df_segmentation,
                            dataset=dataset_obsexp,
                            window=window,
                            key_bgn=f'bgn_bin_shuf{i}',
                            key_end=f'end_bin_shuf{i}')

            # Save snippets to file:
            with open(f"{output_prefix}.TADsnips_shuf{i}.pickle",
                      'wb') as handle:
                pickle.dump(snips, handle)
Пример #6
0
def cis_eig(A, k=3, robust=True, gc=None, classic=False):
    """
    Compute compartment eigenvector on a cis matrix
    Parameters
    ----------
    A : 2D array
        square contact matrix; it is copied, and non-finite entries are
        treated as 0
    k : int
        number of eigenvectors to compute; default = 3
    robust : bool
        Clip values above the 99.9th percentile (before and after the
        observed/expected transform) and overwrite the main diagonal and its
        two neighbours with twice the mean of the second diagonal
    gc : 1D array, optional
        GC content per bin, forwarded to ``_orient_eigs`` for choosing and
        orienting the primary compartment eigenvector
    classic : bool
        Do it old-school: iteratively correct the observed/expected matrix
        and call ``numutils.EIG`` with its default settings instead of
        decomposing ``OE - 1`` without mean subtraction/division
    Returns
    -------
    eigenvalues, eigenvectors
        ``eigenvalues`` has shape (k,) and ``eigenvectors`` shape (k, N) with
        NaN at bins whose column sum is not positive.  Both are all-NaN when
        the matrix has 5 or fewer usable bins, or when the classic correction
        produces non-finite values.
    """
    A = np.array(A)
    A[~np.isfinite(A)] = 0

    mask = A.sum(axis=0) > 0

    def _nan_result():
        # Same (eigenvalues, eigenvectors) order as the regular return.
        return np.full(k, np.nan), np.full((k, A.shape[0]), np.nan)

    if A.shape[0] <= 5 or mask.sum() <= 5:
        return _nan_result()

    if robust:
        A = np.clip(A, 0, np.percentile(A, 99.9))
        fill_value = np.mean(np.diag(A, 2) * 2)
        for d in [-1, 0, 1]:
            numutils.fillDiagonal(A, fill_value, d)
            # Re-zero the masked-out bins that the diagonal fill touched.
            A[~mask, :] = 0
            A[:, ~mask] = 0

    OE = numutils.observedOverExpected(A[mask, :][:, mask])

    if robust:
        OE = np.clip(OE, 0, np.percentile(OE, 99.9))

    if classic:
        OE = numutils.iterativeCorrection(OE)[0]
        if (~np.isfinite(OE)).sum() > 0:
            # Previously returned (eigenvectors, eigenvalues), the reverse of
            # every other return path.
            return _nan_result()
        # presumably mean-centered by EIG's defaults -- TODO confirm
        eigvecs_compressed, eigvals = numutils.EIG(OE, k)
    else:
        eigvecs_compressed, eigvals = numutils.EIG((OE - 1.0),
                                                   k,
                                                   subtractMean=False,
                                                   divideByMean=False)

    # Restore full-length eigenvectors: NaN where the bin was masked out.
    eigvecs = np.full((k, mask.shape[0]), np.nan)
    for i in range(k):
        eigvecs[i, mask] = eigvecs_compressed[i]

    # Orient and reorder
    eigvals, eigvecs = _orient_eigs(eigvals, eigvecs, gc)

    return eigvals, eigvecs
Пример #7
0
sys.path.append("/mnt/storage/home/vsfishman/HiC/tutorial_Fishman/utils")
import figPath
import ntpath
figure_path=figPath.figure_path+ntpath.basename(heatmap_filepath)+"_"+'Compartment_strength'


genome_db = genome.Genome("/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/",
				readChrms=[],
				chrmFileTemplate="%s.fna")

resolution = int(heatmap_filepath.split("-")[-1].split("k")[0])*1000
print "Resolution determined: ",resolution
print "Loading file "+heatmap_filepath
BD = binnedData.binnedData(resolution, genome_db)
BD.simpleLoad(heatmap_filepath, 'heatmap')
q=BD.dataDict['heatmap']
obs_exp = observedOverExpected(q)

E1_values = np.loadtxt(E1_file)
assert E1_values.shape()[0] = genome_db.numBins
bins = dict([(i,np.zeros(len(genome_db.chrmLenBins[i]))) for i in range(genome_db.chmCount)])

for i in E1_values:
	chr = genome_db.label2idx[i["f1"]]
	nt_start = genome_db.label2idx[i["f2"]]
	bin_start = nt_start/resolution
	assert bins[chr][bin_start] == 0
	bins[chr][bin_start] = i

saddles = {}
for chrom in range(genome_db.chrmCount):
    saddle = np.ones((5, 5), dtype=float)
    st = genome_db.chrmStartsBinCont[chrom]
    end = genome_db.chrmEndsBinCont[chrom]
    cur = q[st:end, st:end]

    E1 = E1_values[st:end]

    mask = np.sum(cur, axis=0) > 0
    if sum(mask) > 5:
        cur = cur[mask]
        cur = cur[:, mask]

        cur = observedOverExpected(cur)
        E1 = E1[mask]
        assert cur.shape[0] == cur.shape[1] == len(E1)

        for i in range(5):
            for j in range(5):
                P1, P2 = np.percentile(E1, [20 * i, 20 * i + 20])
                mask1 = (E1 > P1) * (E1 < P2)
                P1, P2 = np.percentile(E1, [20 * j, 20 * j + 20])
                mask2 = (E1 > P1) * (E1 < P2)
                if sum(mask1) * sum(mask2) != 0:
                    saddle[i, j] = np.nanmean(cur[np.ix_(mask1, mask2)])
                else:
                    saddle[i, j] = None
        saddles[genome_db.idx2label[chrom]] = saddle
    else: