예제 #1
0
def make_slices(
    clr: Cooler,
    regions: Dict[str, np.ndarray],
    names: Optional[dict] = None,
    force_disjoint: Optional[bool] = False
) -> tuple:
    """
    Build per-chromosome lists of bin-id slices for the given regions.

    :param clr: Cooler object to fetch bins from.
    :param regions: Dictionary with chromosomes as keys and arrays of
        (start, end) regions as values.
    :param names: Optional per-chromosome region names, passed through to
        fetch_bins_from_cooler. Defaults to None (names are then generated
        from the region coordinates).
    :param force_disjoint: If True, overlapping regions are merged into
        disjoint slices and the names are regenerated from slice bounds.
    :return: Tuple ``(slices, n_ids)`` of dictionaries keyed by chromosome.
    """
    # The original default for ``names`` was a shared mutable dict ({});
    # None avoids cross-call state and behaves identically downstream.
    if names is None:
        names = {}
    # Fetch relevant bin_ids from the cooler file
    b_ids, n_ids = fetch_bins_from_cooler(cooler=clr,
                                          regions=regions,
                                          names=names)
    if force_disjoint:
        # Identify unique bin_ids and isolate disjoint regions
        slices = {chrom: get_unique_bins(b_ids=b_ids[chrom]) for chrom in b_ids}
        n_ids = {}
        for chrom in slices:
            n_ids[chrom] = []
            for sl in slices[chrom]:
                # Genomic start/end for the slice used to build its name.
                # NOTE(review): the end is read from bin ``sl[-1] + 1`` —
                # verify this one-past-the-end lookup is intended.
                stl = clr.bins()[sl[0]]["start"].values[0]
                el = clr.bins()[sl[-1] + 1]["end"].values[0]

                sl_id = f"{chrom}:{stl}-{el}"
                n_ids[chrom].append(sl_id)
    else:
        # Keep regions as supplied: one numpy array of bin ids per region
        slices = {chrom: [np.array(item) for item in b_ids[chrom]] for chrom in b_ids}

    return slices, n_ids
예제 #2
0
def coords_to_bins(clr: "cooler.Cooler", coords: pd.DataFrame) -> np.ndarray:
    """
    Converts genomic coordinates to a list of bin ids based on the whole genome
    contact map.

    Parameters
    ----------
    clr : cooler.Cooler
        Cooler object providing the genome-wide bin table and bin size.
    coords : pandas.DataFrame
        Table of genomic coordinates, with columns chrom, pos.

    Returns
    -------
    numpy.array of ints :
        Indices in the whole genome matrix contact map.

    """
    # Work on a copy: the original implementation rounded ``coords.pos``
    # in place, silently mutating the caller's DataFrame.
    coords = coords.copy()
    # Snap each position down to the start of its containing bin
    coords.pos = (coords.pos // clr.binsize) * clr.binsize
    # Coordinates are merged with bins, both indices are kept in memory so that
    # the indices of matching bins can be returned in the order of the input
    # coordinates
    idx = (clr.bins()[:].reset_index().rename(columns={
        "index": "bin_idx"
    }).merge(
        coords.reset_index().rename(columns={"index": "coord_idx"}),
        left_on=["chrom", "start"],
        right_on=["chrom", "pos"],
        how="right",
    ).set_index("bin_idx").sort_values("coord_idx").index.values)
    return idx
예제 #3
0
def train_hmm(clr: cooler.Cooler, mix_num: int = 3, discore_fn=di_score):
    """Fit the Gaussian-mixture HMM on directionality-index (DI) scores.

    :param clr: cooler object providing per-chromosome contact matrices.
    :param mix_num: number of mixture components used to initialise the
        emission means and variances.
    :param discore_fn: function computing a DI score array from a sparse
        contact matrix.
    :return: the fitted model.
    """
    hmm = ghmm_model(STATES,
                     INIT_TRANSITION,
                     INIT_PROB,
                     END_PROB,
                     init_mean_fn(mix_num),
                     init_var_fn(mix_num))

    # Per-chromosome DI score segments, split wherever the balancing
    # weights are NaN (small gaps are first smoothed away).
    score_segments = {}
    for name in clr.chromnames:
        contacts = clr.matrix(sparse=True).fetch(name).tocsr()
        di_scores = discore_fn(contacts)
        gap_mask = remove_small_gap(
            np.isnan(clr.bins().fetch(name)['weight'].values))
        score_segments[name] = split_diarray(di_scores, gap_mask)

    # Flatten all segments from all chromosomes into one training set
    observations = []
    for segments in score_segments.values():
        observations.extend(segments.values())

    hmm.fit(
        observations,
        algorithm='baum-welch',
        max_iterations=10000,
        stop_threshold=1e-5,
        n_jobs=CPU_CORE - 5,
        verbose=False
    )

    return hmm
예제 #4
0
def make_bins(
    clr: "Cooler",
    sites: Dict[str, np.ndarray],
    names: Optional[Dict[str, str]] = None
) -> tuple:
    """
    Convert per-chromosome genomic positions into chromosome-local bin offsets.

    :param clr: Cooler object providing the bin table and bin size.
    :param sites: Dictionary with chromosomes as keys and arrays of genomic
        positions as values.
    :param names: Optional dictionary of per-site names, indexed like sites.
    :return: ``(bins, outnames, bad_sites)`` when names is given, otherwise
        ``(bins, bad_sites)``. ``bad_sites`` holds, per chromosome, the
        positions (indices into the input arrays) of sites that fall outside
        the chromosome's bins.
    """
    bins = {}
    outnames = {}

    bad_sites = {}
    for chrom in sites:
        cbins = clr.bins().fetch(chrom)
        start = cbins['start'].values[0]
        # Integer bin offset of each site relative to the chromosome start
        site_locs = ((sites[chrom] - start) / clr.binsize).astype('int')
        # A valid offset must lie in [0, n_bins). The original code only
        # checked the upper bound, so a site before ``start`` produced a
        # negative offset that would later wrap around via negative indexing.
        good_sites = (site_locs >= 0) & (site_locs < cbins.shape[0])

        bad_sites[chrom] = np.where(~good_sites)[0]

        bins[chrom] = site_locs[good_sites]
        if names is not None:
            outnames[chrom] = np.array(names[chrom])[good_sites]

    if names is not None:
        return bins, outnames, bad_sites
    else:
        return bins, bad_sites
예제 #5
0
def fetch_bins_from_cooler(
    cooler: "Cooler",
    regions: Dict[str, np.ndarray],
    names: Optional[dict] = None
) -> tuple:
    """
    Fetch the bin ids covering each requested region from a cooler file.

    :param cooler: Cooler object to query.
    :param regions: Dictionary with chromosomes as keys and arrays of
        (start, end) coordinate pairs as values.
    :param names: Optional dictionary of per-chromosome region names. Any
        region without a usable name falls back to "chrom:start-end".
        Defaults to None (previously a shared mutable {} default).
    :return: Tuple ``(b_ids, n_ids)``: per-chromosome lists of bin-id lists
        and the matching region names.
    """
    b_ids = {}
    n_ids = {}
    for chrom in regions:
        b_ids[chrom] = []
        n_ids[chrom] = []
        for idx, row in enumerate(regions[chrom]):
            region_str = "{}:{}-{}".format(chrom, row[0], row[1])
            b_add = list(
                    cooler.bins()
                    .fetch(region_str)
                    .index.values
            )
            # Fall back to the coordinate string when no name is available.
            # KeyError: chromosome missing from names; IndexError: fewer
            # names than regions; TypeError: names is None. The original
            # bare ``except:`` also hid genuine bugs (and KeyboardInterrupt).
            try:
                n_ids[chrom].append(names[chrom][idx])
            except (KeyError, IndexError, TypeError):
                n_ids[chrom].append(region_str)

            b_ids[chrom].append(
                b_add
            )
    return b_ids, n_ids
예제 #6
0
def preprocess_hic(
    clr: cooler.Cooler,
    min_contacts: Optional[int] = None,
    region: Optional[str] = None,
) -> sp.csr_matrix:
    """
    Preprocess the Hi-C matrix of a cooler object and return it as a
    sparse CSR matrix.

    Steps, in order: subset to ``region`` (UCSC format) if given,
    subsample contacts down to ``min_contacts`` if given, balance using
    weights precomputed in the cool file, detrend by the expected P(s),
    and replace NaNs with zeros. The referenced cool file must already
    be balanced (i.e. contain a "weight" column in its bin table).
    """
    selector = clr.matrix(sparse=True, balance=False)
    bin_table = clr.bins()
    if region is not None:
        raw = selector.fetch(region)
        bin_table = bin_table.fetch(region)
    else:
        raw = selector[:]
        bin_table = bin_table[:]

    try:
        weights = bin_table["weight"].values
    except KeyError as err:
        sys.stderr.write("Error: Input cooler must be balanced.\n")
        raise err

    # Bring to a common coverage, if requested and the matrix is non-empty
    if raw.sum() and (min_contacts is not None):
        raw = cup.subsample_contacts(raw, min_contacts).tocoo()
    detectable = cup.get_detectable_bins(raw, n_mads=5)

    # Balance this region with weights precomputed on the whole matrix
    raw.data = raw.data * weights[raw.row] * weights[raw.col]
    # Detrend for P(s), then replace NaNs by 0s and drop explicit zeros
    detrended = cup.detrend(raw.tocsr(), smooth=False, detectable_bins=detectable[0])
    detrended.data = np.nan_to_num(detrended.data)
    detrended.eliminate_zeros()
    return detrended
예제 #7
0
def _single_clr_edge_and_node_info_from_sites(c: cooler.Cooler,
                                              sites: Dict[str, np.ndarray],
                                              balance: Optional[bool] = True,
                                              join: Optional[bool] = False):
    '''
    Given some cooler and a dictionary of sites (with chromosomes as keys), return the submatrices retrieved from these slices within the Hi-C map. Submatrices are returned in sparse COO format with an edge_idxs dictionary, an edge_attrs dictionary and a node_info dictionary. Optionally users can balance the Hi-C matrix before retrieval of matrix information. Since multiple chromosomes and slices per chromosome can be supplied, users may optionally join regions into one larger region consisting of the given slices concatenated together. This function does not actually do the joining procedure since the passed slices may not be disjoint.

    :param c: Cooler file object.
    :type c: cooler.Cooler
    :param sites: Dictionary with chromosomes as keys and arrays of bin positions as values. Multiple sites are allowed per chromosome.
    :type sites: Dict[str, np.ndarray]
    :param balance: Whether to perform matrix balancing on the Hi-C matrix before retrieving individual slices.
    :type balance: Optional[bool]
    :param join: Boolean determining whether to retrieve Hi-C matrix information corresponding to the interface between slices. This is only recommended if slices are disjoint since the interface isn't well defined if slices aren't disjoint.
    :type join: Optional[bool]
    '''
    # Iterate through slices, adding in edge indexes and edge attributes
    edge_idxs = {}
    edge_attrs = {}
    sub_graph_nodes = {}
    chroms = list(sites.keys())
    for idx, chrom1 in enumerate(chroms):
        edge_idxs[chrom1] = {}
        edge_attrs[chrom1] = {}
        sub_graph_nodes[chrom1] = {}
        # chroms[idx:] visits each unordered chromosome pair exactly once
        # (including the chrom1 == chrom2 intra-chromosomal case)
        for chrom2 in chroms[idx:]:
            # Inter-chromosomal interfaces are only extracted when join is set
            if chrom1 != chrom2 and not join:
                continue
            # Fetch the (optionally balanced) submatrix, then subset it to
            # the requested sites on both axes
            mat = c.matrix(balance=balance).fetch(chrom1, chrom2)
            mat = mat[sites[chrom1], :]
            mat = mat[:, sites[chrom2]]

            # COO format exposes aligned (row, col, data) triplets
            mat = coo(mat)

            # Genome-wide bin ids of the selected sites on each chromosome
            b1 = c.bins().fetch(chrom1).index.values[sites[chrom1]]
            b2 = c.bins().fetch(chrom2).index.values[sites[chrom2]]

            # 2 x num_edges array of (source bin, target bin) pairs
            edge_index = np.concatenate(
                [b1[mat.row][None, :], b2[mat.col][None, :]],
                axis=0,
            )

            # num_edges x 1 column of contact values
            edge_data = mat.data[:, None]

            if chrom1 != chrom2:
                # Mirror inter-chromosomal edges so both directions appear
                edge_index = np.append(edge_index, edge_index[::-1, :], axis=1)
                edge_data = np.append(edge_data, edge_data, axis=0)

                # NOTE(review): NaN contacts are zeroed only on this
                # inter-chromosomal path; intra-chromosomal NaNs pass through
                # unchanged — confirm this asymmetry is intended.
                edge_data[np.isnan(edge_data)] = 0

            # Sort edges by target bin, then by source bin
            ind = np.lexsort((edge_index[0, :], edge_index[1, :]))
            edge_index = edge_index[:, ind]
            edge_data = edge_data[ind, :]

            edge_idxs[chrom1][chrom2] = [edge_index]
            edge_attrs[chrom1][chrom2] = [edge_data]

            if chrom1 == chrom2:
                sub_graph_nodes[chrom1][chrom2] = [b1]
            else:
                # Interface sub-graph spans the nodes of both chromosomes
                sub_graph_nodes[chrom1][chrom2] = [np.append(b1, b2)]

    return edge_idxs, edge_attrs, sub_graph_nodes
예제 #8
0
def get_pairing_score_obs_exp(
        clr: cooler.Cooler,
        expected: pd.DataFrame,
        windowsize: int = 4 * 10**4,
        func: Callable = np.mean,
        regions: Optional[pd.DataFrame] = None,
        norm: bool = True,
        arms: Optional[pd.DataFrame] = None,
) -> pd.DataFrame:
    """Takes a cooler file (clr), an expected dataframe (expected; maybe generated by getExpected),
    a windowsize (windowsize), a summary
    function (func) and a set of genomic
    regions to calculate the pairing score
    as follows: A square with side-length windowsize
    is created for each of the entries in the supplied genomics
    regions and the summary function applied to the Hi-C pixels (obs/exp values)
    at the location in the supplied cooler file. The results are
    returned as a dataframe. If no regions are supplied, regions
    are constructed for each bin in the cooler file to
    construct a genome-wide pairing score.

    Note: ``regions`` and ``arms`` now default to None instead of the
    previous shared mutable ``pd.DataFrame()`` defaults; passing an empty
    frame explicitly is still treated as "no regions"/"no arms".
    """
    # Check whether genomic regions were supplied
    if regions is None or len(regions) == 0:
        # If no regions are supplied, pregenerate all bins; drop bins with nan weights
        regions = clr.bins()[:].dropna()
        # find midpoint of each bin to assign windows to each midpoint
        regions.loc[:, "mid"] = (regions["start"] + regions["end"]) // 2
    # check that norm is only set if genomewide pairingScore is calculated
    elif norm:
        raise ValueError(
            "Norm flag can only be set with genomeWide pairingScore!")
    # drop nan rows from regions (dropna returns a copy, so the caller's
    # frame is not touched by the index/column assignments below)
    regions = regions.dropna()
    # fix indices
    regions.index = range(len(regions))
    regions.loc[:, "binID"] = range(len(regions))
    # Chromosomal arms are needed so each process only extracts a subset from the file
    if arms is None or len(arms) == 0:
        arms = get_arms_hg19()
    # extract all windows
    windows = assign_regions(windowsize, clr.binsize, regions["chrom"],
                             regions["mid"], arms)
    # add binID to later merge piles
    windows.loc[:, "binID"] = regions["binID"]
    windows = windows.dropna()
    # generate pileup
    pile = do_pileup_obs_exp(clr, expected, windows, collapse=False)
    # convert to dataframe
    pile_frame = pile_to_frame(pile)
    # replace inf with nan so the summary function ignores them
    pile_frame = pile_frame.replace([np.inf, -np.inf], np.nan)
    # apply function to each row (row = individual window)
    summarized = pile_frame.apply(func, axis=1)
    # subset regions with regions that were assigned windows
    output = pd.merge(regions, windows, on="binID",
                      suffixes=("", "_w")).dropna()
    # add results
    output.loc[:, "PairingScore"] = summarized
    # normalize by median
    if norm:
        output.loc[:, "PairingScore"] = output["PairingScore"] - np.median(
            output.dropna()["PairingScore"])
    return output[["chrom", "start", "end", "PairingScore"]]