# Example no. 1
def nbinom_est_dist(size, prob, triu_ma, cut_intervals):
    """Re-map matrix counts to an 'estimated distance'.

    For each upper-triangle entry whose negative-binomial p-value (as
    computed by ``_nbinomPvalue``) is significant (>= 5, i.e. -log10
    scale), the count is replaced by the genomic distance whose fitted
    negative-binomial mean best matches the observed count.

    Parameters
    ----------
    size, prob : dict
        Per-distance negative-binomial fit parameters (as returned by
        ``fitNegBinom_Rserve``), keyed by distance.
    triu_ma : scipy.sparse matrix
        Upper triangle of the (normalized) Hi-C matrix.
    cut_intervals : list
        Bin intervals, passed through to ``hiCMatrix.getDistList``.

    Returns
    -------
    scipy.sparse.csr_matrix
        Symmetric matrix holding the estimated distances.
    """
    # Build a mean -> distance lookup table from the fitted parameters,
    # ordered by distance; NaN means (failed fits) are skipped.
    mean2dist = {'mean': [], 'dist': []}
    for dist in sorted(size.keys()):
        mean = scipy.stats.nbinom.mean(size[dist], prob[dist])
        if not np.isnan(mean):
            mean2dist['mean'].append(mean)
            mean2dist['dist'].append(dist)
    mean2dist['mean'] = np.array(mean2dist['mean'])
    mean2dist['dist'] = np.array(mean2dist['dist'])

    # The values have to be computed for the whole upper triangle,
    # excepting inter-chromosome entries (which get dist == -1).
    row, col = np.triu_indices(triu_ma.shape[0])
    dist_list, chrom_list = hiCMatrix.getDistList(row, col, cut_intervals)
    # lil format allows fast random single-element access below.
    triu_ma = triu_ma.tolil()
    transf_ma = np.zeros(len(dist_list))
    for idx, orig_dist in enumerate(dist_list):
        # -1 marks inter-chromosomal pairs; no fit exists for them.
        if orig_dist == -1:
            continue
        # Skip distances for which no negative-binomial fit is available.
        if orig_dist not in size:
            continue
        data = triu_ma[row[idx], col[idx]]
        # Only transform entries with a significant p-value
        # (presumably -log10(p) >= 5 — TODO confirm _nbinomPvalue scale).
        if _nbinomPvalue(data, size[orig_dist], prob[orig_dist]) < 5:
            continue

        # Find the distance whose fitted mean is the closest value just
        # above the observed count; outside the fitted range, keep the
        # original distance.
        if data > mean2dist['mean'].max() or \
                data < mean2dist['mean'].min():
            dist = orig_dist
        else:
            try:
                mean_idx = np.flatnonzero(data < mean2dist['mean'])[-1]
                dist = mean2dist['dist'][mean_idx]
            except IndexError:
                dist = orig_dist

        transf_ma[idx] = dist

    # Set the new values back into a sparse matrix of the original shape.
    triu_ma = scipy.sparse.coo_matrix((transf_ma, (row, col)),
                                      shape=triu_ma.shape)
    # Fill the lower triangle by mirroring the strict upper triangle.
    triu_ma = triu_ma + scipy.sparse.triu(triu_ma, 1).T
    triu_ma = triu_ma.tocsr()
    triu_ma.eliminate_zeros()

    return triu_ma
# Example no. 2
def transformMatrix(hicma,
                    method,
                    per_chr=False,
                    original_matrix=None,
                    depth_in_bins=None):
    """Transform each Hi-C matrix count into a statistic derived from the
    per-distance distribution of counts (p-value, z-score, obs/exp, etc.).

    Parameters
    ----------
    hicma : hiCMatrix
        Matrix to transform; only its upper triangle is processed and the
        result is mirrored back to a symmetric CSR matrix.
    method : str
        One of the keys of ``methods_avail`` below, or 'nbinom-est-dist'
        which is dispatched to ``nbinom_est_dist`` instead.
    per_chr : bool
        If True, distributions are fitted per chromosome; intra-chromosome
        entries get distance -1 and are skipped.
    original_matrix : hiCMatrix or None
        Un-normalized matrix used to derive a noise floor (median of its
        data); entries at or below the floor are left untouched.
    depth_in_bins : unused here — presumably kept for API compatibility.

    Returns
    -------
    scipy.sparse.csr_matrix
        Symmetric matrix of transformed values.
    """
    # Dispatch table from method name to the per-entry transform function.
    methods_avail = {
        'residuals': _residuals,
        'obs/exp': _obsExp,
        'z-score': _zscore,
        't-score': _tscore,
        'p-value': _pvalue,
        'nbinom-p-value': _nbinomPvalue,
        'nbinom-expected': _nbinomExpected,
        'log-norm': _lognormPvalue,
        'chi-squared': _chi2Pvalue
    }

    counts_by_distance, cut_intervals = hicma.getCountsByDistance(
        per_chr=per_chr)
    # Fit the model needed by the chosen method:
    # negative binomial (via Rserve), log-normal, or plain moments.
    if method in ['nbinom-p-value', 'nbinom-expected', 'nbinom-est-dist']:
        size, prob = fitNegBinom_Rserve(counts_by_distance,
                                        per_chr=per_chr,
                                        plot_distribution=True)
    elif method == 'log-norm':
        mu_, sigma = fitDistribution(counts_by_distance, 'lognorm')
    else:
        # Moment-based methods: per-distance mean, std and sample size,
        # optionally nested per chromosome.
        if per_chr:
            mu_ = {}
            sigma = {}
            n_value = {}
            for chrom in counts_by_distance.keys():
                mu_[chrom] = dict([(x, np.mean(counts_by_distance[chrom][x]))
                                   for x in counts_by_distance[chrom]])
                sigma[chrom] = dict([(x, np.std(counts_by_distance[chrom][x]))
                                     for x in counts_by_distance[chrom]])
                n_value[chrom] = dict([(x, len(counts_by_distance[chrom][x]))
                                       for x in counts_by_distance[chrom]])
        else:
            mu_ = dict([(x, np.mean(counts_by_distance[x]))
                        for x in counts_by_distance])
            sigma = dict([(x, np.std(counts_by_distance[x]))
                          for x in counts_by_distance])
            n_value = dict([(x, len(counts_by_distance[x]))
                            for x in counts_by_distance])

    # use only the upper half of the matrix
    triu_ma = scipy.sparse.triu(hicma.matrix, format='coo')
    # Derive a noise floor from the un-normalized matrix: the median of
    # its nonzero data, per chromosome when per_chr is set.
    if original_matrix:
        orig_ma = original_matrix.matrix
        if per_chr:
            noise_level = {}
            for chrom in counts_by_distance.keys():
                chr_range = original_matrix.getChrBinRange(chrom)
                chr_submatrix = orig_ma[chr_range[0]:chr_range[1],
                                        chr_range[0]:chr_range[1]]
                noise_level[chrom] = np.median(chr_submatrix.data)
        else:
            noise_level = np.median(orig_ma.data)

        sys.stderr.write('noise error set to {}\n'.format(noise_level))
    else:
        noise_level = None
    sys.stderr.write("finish computing fitting parameters\n")

    ########################
    # after the distributions are fitted
    # now the matrix values are evaluated
    if method == 'nbinom-est-dist':
        # Special case: not in methods_avail; handled by its own routine.
        triu_ma = nbinom_est_dist(size, prob, triu_ma, hicma.cut_intervals)

    else:
        under_noise = 0
        dist_list, chrom_list = hiCMatrix.getDistList(triu_ma.row, triu_ma.col,
                                                      cut_intervals)

        assert len(dist_list) == len(triu_ma.data), "lists not of equal size"
        # Bins flagged as suspicious (implausibly high p-values).
        susprow_list = []
        suspcol_list = []
        transf_ma = np.zeros(len(triu_ma.data))
        start_time = time.time()
        # transform each value  of the data matrix to p-value, obs/exp, correlation etc.
        sys.stderr.write("computing transform values\n")
        for idx, data in enumerate(triu_ma.data):
            # skip if original value is less than noise level
            # NOTE(review): `if noise_level` is falsy for a 0.0 median,
            # which silently disables the filter in that case.
            if noise_level:
                if per_chr:
                    if dist_list[idx] == -1:
                        continue
                    elif (orig_ma[triu_ma.row[idx], triu_ma.col[idx]] <=
                          noise_level[chrom_list[idx]]):
                        under_noise += 1
                        continue
                elif orig_ma[triu_ma.row[idx],
                             triu_ma.col[idx]] <= noise_level:
                    under_noise += 1
                    continue

            if method in ['nbinom-p-value', 'nbinom-expected']:
                # -1 marks inter-chromosomal pairs; no fit exists for them.
                if dist_list[idx] == -1:
                    continue
                if per_chr:
                    transf_ma[idx] = methods_avail[method](
                        data, size[chrom_list[idx]][dist_list[idx]],
                        prob[chrom_list[idx]][dist_list[idx]])
                else:
                    transf_ma[idx] = methods_avail[method](
                        data, size[dist_list[idx]], prob[dist_list[idx]])
                # Normalized value far above the raw count suggests a
                # correction artifact: skip and report.
                if data > 3 * orig_ma[triu_ma.row[idx], triu_ma.col[idx]]:
                    sys.stderr.write("skipping p-value {} for "
                                     "value {} at {}, norm-value {}\n".format(
                                         transf_ma[idx], chrom_list[idx],
                                         orig_ma[triu_ma.row[idx],
                                                 triu_ma.col[idx]], data))
                    continue
                # High transformed value plus inflated normalized count:
                # record the bin as suspicious for later inspection.
                if transf_ma[idx] > 4.5 and \
                        data > 2 * orig_ma[triu_ma.row[idx], triu_ma.col[idx]]:
                    susprow = triu_ma.row[idx]
                    suspcol = triu_ma.col[idx]
                    sys.stderr.write("suspicious p-value {} for "
                                     "value {} at {}, norm-value {}\n".format(
                                         transf_ma[idx], chrom_list[idx],
                                         orig_ma[susprow, suspcol], data))
                    susprow_list.append(susprow)
                    suspcol_list.append(suspcol)

            if method in ['obs/exp', 'residuals']:
                # These transforms only need the per-distance mean.
                if per_chr:
                    if dist_list[idx] == -1:
                        continue
                    fit_mu = mu_[chrom_list[idx]]
                else:
                    fit_mu = mu_

                transf_ma[idx] = methods_avail[method](data,
                                                       fit_mu[dist_list[idx]])

            else:
                # Remaining moment-based transforms need mean, std and n.
                if per_chr:
                    if dist_list[idx] == -1:
                        continue
                    fit_mu = mu_[chrom_list[idx]]
                    fit_sigma = sigma[chrom_list[idx]]
                    fit_n = n_value[chrom_list[idx]]
                else:
                    fit_mu = mu_
                    fit_sigma = sigma
                    fit_n = n_value

                transf_ma[idx] = methods_avail[method](
                    data, fit_mu[dist_list[idx]], fit_sigma[dist_list[idx]],
                    fit_n[dist_list[idx]])

            # Progress report: once at 10k iterations, then every 500k.
            if idx > 0 and (idx == 10000 or idx % 500000 == 0):
                endtime = time.time()
                estimated = (float(len(transf_ma) - idx) *
                             (endtime - start_time)) / idx
                mmin, sec = divmod(estimated, 60)
                hour, mmin = divmod(mmin, 60)
                print "iteration: {} Estimated remaining time "\
                    "{:.0f}:{:.0f}:{:.0f}".format(idx, hour, mmin, sec)
        """
        print "problematic bins:"
        for uniq in np.concatenate([susprow_list, suspcol_list]):
            print hicma.cut_intervals[uniq]
        """
        # set the new values back into the original matrix
        triu_ma.data = transf_ma
        # fill the lower triangle
        triu_ma = triu_ma + scipy.sparse.triu(triu_ma, 1).T
        triu_ma = triu_ma.tocsr()
        triu_ma.eliminate_zeros()

    return triu_ma