def call_bdtrc(hitCount, observedSum, prior_p, recursion=0):
    """Evaluate ``scsp.bdtrc`` and retry with halved counts while it is NaN.

    Each attempt truncates the current counts with ``int`` and calls
    ``scsp.bdtrc(k, n, prior_p)``. A NaN result triggers another attempt
    with both counts divided by two. ``recursion`` is the number of
    attempts already spent; once it reaches 10 the function gives up and
    returns 1.0.
    """
    hits, total = hitCount, observedSum
    attempt = recursion
    while attempt < 10:
        tail_prob = scsp.bdtrc(int(hits), int(total), prior_p)
        if not np.isnan(tail_prob):
            return tail_prob
        # NaN result: halve both counts (truncating) and try again.
        hits, total = int(hits / 2), int(total / 2)
        attempt += 1
    # Attempt budget exhausted.
    return 1.0
示例#2
0
 def _sf_single(self, x, n, a, b):
     k = floor(x)
     p = linspace(0, 1, num=10001)
     bta = btdtr(a, b, p)
     p_med = (p[:-1] + p[1:]) / 2
     bta_med = bta[1:] - bta[:-1]
     vals = (bdtrc(k, n, p_med) * bta_med).sum(axis=-1)
     return vals
def call_bdtrc(hitCount, observedSum, prior_p, recursion=0):
	"""Call ``scsp.bdtrc`` on integer counts, halving them while it is NaN.

	``recursion`` counts attempts already made. At most ``10 - recursion``
	evaluations are performed; if every one is NaN (or none is allowed),
	1.0 is returned. After each NaN both counts are divided by two and
	truncated with ``int`` before the next evaluation.
	"""
	attempts_left = 10 - recursion
	k, n = hitCount, observedSum
	while attempts_left > 0:
		result = scsp.bdtrc(int(k), int(n), prior_p)
		if not np.isnan(result):
			return result
		k = int(k / 2)
		n = int(n / 2)
		attempts_left -= 1
	return 1.0
示例#4
0
def grbbinomial_Pmin_raw(localProb, Ndraws):
    """Return the smallest cumulative binomial probability and its tail size.

    Entry ``k`` (0-based) of ``localProb`` is evaluated as
    ``special.bdtrc(k, Ndraws, localProb[k])``. The result is the minimum
    of those values together with the 1-based position where it occurs.
    """
    probs = np.asarray(localProb)
    tail_ranks = np.arange(len(probs))

    # NB: stats.binom.sf maps to the lower level special.bdtrc
    cum_binom = special.bdtrc(tail_ranks, Ndraws, probs)
    best = cum_binom.argmin()

    return cum_binom[best], best + 1
示例#5
0
def grbbinomial_Pmin_raw(localProb, Ndraws):
    """Find the lowest binomial tail probability over the given FAP values.

    For each position ``k`` (starting at 0) the value
    ``special.bdtrc(k, Ndraws, localProb[k])`` is computed.

    Returns a tuple ``(Pmin_raw, Nmin)``: the minimum of those values and
    the 1-based position at which it was found.
    """
    localProb = np.asarray(localProb)

    # NB: stats.binom.sf maps to the lower level special.bdtrc
    P = special.bdtrc(np.arange(len(localProb)), Ndraws, localProb)
    where_min = np.argmin(P)
    Nmin = where_min + 1  # positions are reported 1-based

    return P[where_min], Nmin
示例#6
0
def grbbinomialtest_threshold(Ndraws,
                              Ntail,
                              percentile,
                              Nmc,
                              discreteness=None,
                              blocksize=10000):
    """
    Monte-Carlo estimate of a percentile of the minimum binomial probability.

    Adapted from https://trac.ligo.caltech.edu/xpipeline/browser/trunk/utilities/grbbinomialtest_threshold.m

    Ndraws is a scalar saying how many GRBs were analyzed in total
    Ntail is the number of loudest GRB events kept (must not exceed Ndraws)
    percentile is the desired percentile of the binomial probability
        distribution (This should be between 0 and 100!)
    Nmc is the number of Monte-Carlo simulations to perform
    discreteness is optional; when given, FAP values are drawn uniformly
        from multiples of 1 / discreteness instead of from [0, 1)
    blocksize is the maximum number of simulations drawn at once

    Return the Pmin value at the requested percentile of the simulated
        distribution, as computed by stats.scoreatpercentile.
    """
    assert Ntail <= Ndraws

    def draw(nrows):
        # One row per simulated experiment, one column per draw.
        if discreteness is None:
            return stats.uniform.rvs(size=(nrows, Ndraws))
        return stats.randint.rvs(
            0, discreteness + 1, size=(nrows, Ndraws)) / discreteness

    tail_ranks = np.arange(Ntail)[None, :]
    minima = []
    remaining = Nmc
    while remaining > 0:  # simulate in blocks to bound memory use
        rows = min(remaining, blocksize)

        # keep the Ntail smallest values of every simulated experiment
        block = np.sort(draw(rows), axis=1)[:, :Ntail]

        # NB: stats.binom.sf maps to the lower level special.bdtrc
        block_P = special.bdtrc(tail_ranks, Ndraws, block)
        minima.extend(block_P.min(axis=1))
        remaining -= rows

    # percentile of the simulated Pmin distribution
    return stats.scoreatpercentile(np.asarray(minima), percentile)
示例#7
0
def grbbinomialtest_threshold(Ndraws, Ntail, percentile, Nmc, discreteness=None, blocksize=10000):
    """
    Simulate the null distribution of Pmin and return one of its percentiles.

    Adapted from https://trac.ligo.caltech.edu/xpipeline/browser/trunk/utilities/grbbinomialtest_threshold.m

    Ndraws       total number of GRBs analyzed (scalar)
    Ntail        number of loudest GRB events kept; asserted <= Ndraws
    percentile   requested percentile of the simulated Pmin distribution
                 (between 0 and 100)
    Nmc          number of Monte-Carlo simulations
    discreteness optional; draw FAP values uniformly from multiples of
                 1 / discreteness rather than from a continuous uniform
    blocksize    upper bound on the number of simulations drawn per block

    Returns the value of stats.scoreatpercentile over the simulated Pmin
    values (one scalar threshold).
    """
    assert Ntail <= Ndraws

    shape_for = lambda n: (n, Ndraws)
    if discreteness is not None:
        draw = lambda n: stats.randint.rvs(0, discreteness + 1, size=shape_for(n)) / discreteness
    else:
        draw = lambda n: stats.uniform.rvs(size=shape_for(n))

    ranks = np.arange(Ntail)[None, :]
    PminMC = []
    # Every block contributes exactly one minimum per simulation, so the
    # list length doubles as the count of simulations done so far.
    while len(PminMC) < Nmc:
        num_to_draw = min(Nmc - len(PminMC), blocksize)
        sims = draw(num_to_draw)

        # keep Ntail most significant values of this block
        sims.sort(axis=1)
        tails = sims[:, :Ntail]

        # NB: stats.binom.sf maps to the lower level special.bdtrc
        PminMC.extend(special.bdtrc(ranks, Ndraws, tails).min(axis=1))

    # determine threshold on Pmin
    Pmin_thresh = stats.scoreatpercentile(np.asarray(PminMC), percentile)
    return Pmin_thresh
示例#8
0
def grbbinomialtest(localProb, Ndraws, Nmc, discreteness=None):
    """
    Binomial test on the most significant FAP values, with a Monte-Carlo
    estimate of its significance.

    Adapted from https://trac.ligo.caltech.edu/xpipeline/browser/trunk/utilities/grbbinomialtest.m

    localProb is a *sorted* array of FAP values, one per GRB to be tested
    Ndraws is a scalar saying how many GRBs were analyzed in total
    Nmc is the number of Monte-Carlo simulations to perform in assessing
        significance.
    discreteness is optional, but allows you to draw FAP values uniformly
        from multiples of 1 / discreteness

    Returns the tuple (Pmin_raw, Pmin, Nmin):

    Pmin_raw     Lowest cumulative binomial probability of the input set
                 localProb, as returned by grbbinomial_Pmin_raw.  It does
                 not account for the trials factor when len(localProb) > 1.
    Pmin         Fraction of the Nmc simulations (each Ndraws random
                 numbers, reduced to its len(localProb) smallest values)
                 whose lowest cumulative binomial probability is less than
                 or equal to Pmin_raw.
    Nmin         Number of tail values to include at which the binomial
                 probability Pmin_raw occurs.
    """
    Ntail = len(localProb)
    Pmin_raw, Nmin = grbbinomial_Pmin_raw(localProb, Ndraws)

    # Simulate Nmc experiments of Ndraws values each.
    sim_shape = (Nmc, Ndraws)
    if discreteness is not None:
        simulated = stats.randint.rvs(
            0, discreteness + 1, size=sim_shape) / discreteness
    else:
        simulated = stats.uniform.rvs(size=sim_shape)

    # Reduce every experiment to its Ntail smallest values.
    tails = np.sort(simulated, axis=1)[:, :Ntail]

    tail_ranks = np.arange(Ntail)[None, :]
    sim_minima = special.bdtrc(tail_ranks, Ndraws, tails).min(axis=1)
    Pmin = (sim_minima <= Pmin_raw).mean()

    return Pmin_raw, Pmin, Nmin
示例#9
0
def grbbinomialtest(localProb, Ndraws, Nmc, discreteness=None):
    """
    Run the binomial test on localProb and estimate its significance by
    Monte-Carlo simulation.

    Adapted from https://trac.ligo.caltech.edu/xpipeline/browser/trunk/utilities/grbbinomialtest.m

    Arguments:
    localProb    *sorted* array of FAP values, one per GRB to be tested
    Ndraws       scalar saying how many GRBs were analyzed in total
    Nmc          number of Monte-Carlo simulations used to assess
                 significance
    discreteness optional; if given, simulated FAP values are drawn
                 uniformly from multiples of 1 / discreteness

    Returned values, in order:
    Pmin_raw     Lowest cumulative binomial probability of localProb
                 (from grbbinomial_Pmin_raw); no trials-factor correction
                 is applied when len(localProb) > 1.
    Pmin         Fraction of simulations whose lowest cumulative binomial
                 probability, over their len(localProb) smallest values,
                 is less than or equal to Pmin_raw.
    Nmin         Number of tail values at which Pmin_raw occurs.
    """
    Ntail = len(localProb)
    Pmin_raw, Nmin = grbbinomial_Pmin_raw(localProb, Ndraws)

    # Do a Monte-Carlo to determine significance
    if discreteness is None:
        trials = stats.uniform.rvs(size=(Nmc, Ndraws))
    else:
        levels = stats.randint.rvs(0, discreteness + 1, size=(Nmc, Ndraws))
        trials = levels / discreteness

    # keep the Ntail most significant values
    trials.sort(axis=1)
    trials = trials[:, :Ntail]

    k = np.arange(Ntail)[None, :]
    PminMC = np.min(special.bdtrc(k, Ndraws, trials), axis=1)
    Pmin = np.mean(PminMC <= Pmin_raw)

    return Pmin_raw, Pmin, Nmin
示例#10
0
 def test_domain(self):
     val = sc.bdtrc(-1.1, 1, 0.5)
     val2 = sc.bdtrc(2.1, 1, 0.5)
     assert np.isnan(val2)
     assert_allclose(val, 1.0)
示例#11
0
 def test_bdtr_bdtrc_sum_to_one(self):
     bdtr_vals = sc.bdtr([0, 1, 2], 2, 0.5)
     bdtrc_vals = sc.bdtrc([0, 1, 2], 2, 0.5)
     vals = bdtr_vals + bdtrc_vals
     assert_allclose(vals, [1.0, 1.0, 1.0])
示例#12
0
def calculateSignificant(outfilename,
                         infilename,
                         splineX,
                         splineY,
                         possibleIntraInRangeCount,
                         observedIntraInRangeSum,
                         possibleInterAllCount,
                         observedInterAllSum,
                         lowThres,
                         upThres,
                         passNo,
                         region='intraOnly'):
    """Compute p-values and q-values for all pairs and report outliers.

    The pairs are loaded with ``read_countsFile(infilename, lowThres,
    upThres)``. Every row gets a p-value from ``bdtrc(count - 1, N, prior)``
    or the constant 1.0, depending on its ``contactType`` and on the flags
    returned by ``region_parser(region)``. The p-values are corrected with
    ``bh_correction`` and both columns are written, tab-separated and
    gzip-compressed, to ``{outfilename}.significant.gz``. Finally the
    q-values are passed to ``plot_qvalues``.

    Parameters:
        outfilename: prefix of the output files.
        infilename: passed to ``read_countsFile``.
        splineX, splineY: parallel sequences; ``splineX`` is searched with
            ``bisect`` and the matching ``splineY`` entry is the prior.
        possibleIntraInRangeCount, possibleInterAllCount: pair counts used
            as the number of tests for the correction and the outlier
            threshold.
        observedIntraInRangeSum, observedInterAllSum: the ``n`` argument
            of ``bdtrc`` for intra-in-range and inter pairs respectively.
        lowThres, upThres: passed to ``read_countsFile``.
        passNo: accepted for interface compatibility; not used here.
        region: passed to ``region_parser``.

    Returns:
        ``(outliersline, outliersdist, FDRx, FDRy)``: sorted row indices
        and sorted distances of rows whose p-value is below
        ``1 / totalValidCount``, plus the two values from ``plot_qvalues``.
    """
    print("Calculating p-values and q-values for all pairs from input file...")
    print("--------------------------------------------------")
    newAllReads = read_countsFile(infilename, lowThres, upThres, silence=True)
    CCNT = newAllReads.contactCount.values
    DIST = newAllReads.distance.values
    ITYPE = newAllReads.contactType.values

    allReg, interOnly = region_parser(region, silence=True)

    # bisect_left returns len(splineX) for a distance beyond the last knot;
    # clamp so the lookup uses the last spline value instead of raising
    # IndexError.
    lastSplineIdx = len(splineX) - 1

    p_vals = []
    for cc, dist, itype in zip(CCNT, DIST, ITYPE):
        bias1 = 1.0
        bias2 = 1.0  # not use bias file so far
        if (bias1 < 0 or bias2 < 0) and itype != 'inter':
            p_val = 1.0
        elif itype == 'intraInRange' and not interOnly:
            i = min(bisect.bisect_left(splineX, dist), lastSplineIdx)
            prior_p = splineY[i] * (bias1 * bias2)
            # count - 1 because bdtrc sums the terms strictly above k.
            p_val = bdtrc(cc - 1, observedIntraInRangeSum, prior_p)
        elif itype == 'intraShort' and not interOnly:
            p_val = 1.0
        elif itype == 'intraLong' and not interOnly:
            p_val = 1.0
        else:
            if allReg or interOnly:
                # NOTE(review): interChrProb is not defined in this
                # function; it has to exist at module scope - confirm.
                prior_p = interChrProb * (bias1 * bias2)
                p_val = bdtrc(cc - 1, observedInterAllSum, prior_p)
            else:
                p_val = 1.0
        p_vals.append(p_val)

    # Do the BH FDR correction
    if allReg:
        totalValidCount = possibleIntraInRangeCount + possibleInterAllCount
    elif interOnly and not allReg:
        totalValidCount = possibleInterAllCount
    else:
        totalValidCount = possibleIntraInRangeCount
    outlierThres = 1.0 / totalValidCount
    q_vals = bh_correction(p_vals, totalValidCount)
    print('The calculation of p-values and q-values finished!')
    print(f'>>>> Writing to {outfilename}.significant.gz')
    newAllReads['p_vals'] = p_vals
    newAllReads['q_vals'] = q_vals
    newAllReads.to_csv(f'{outfilename}.significant.gz',
                       sep='\t',
                       index=False,
                       compression='gzip')
    print(f'>>>> p-vals and q-vals written to {outfilename}.significant.gz')

    # Find all outliers
    outlierReads = newAllReads[newAllReads.p_vals < outlierThres]
    outliersline = sorted(outlierReads.index.tolist())
    outliersdist = sorted(outlierReads.distance.tolist())
    print(f'Outlier threshold is: {outlierThres:.6e}')
    print(f'Found outlier pairs: {len(outliersline)}')

    print(f'>>>> Plotting q-values to file {outfilename}.qplot.svg')
    FDRx, FDRy = plot_qvalues(outfilename,
                              q_vals,
                              minFDR=0,
                              maxFDR=0.05,
                              increment=1e-3)

    return outliersline, outliersdist, FDRx, FDRy
示例#13
0
 def test_inf(self, k, n, p):
     with suppress_warnings() as sup:
         sup.filter(DeprecationWarning)
         val = sc.bdtrc(k, n, p)
     assert np.isnan(val)
def test_legacy_cast():
    """bdtrc with k = NaN returns NaN; the truncation warning is silenced."""
    with suppress_warnings() as silencer:
        silencer.filter(RuntimeWarning,
                        "floating point number truncated to an integer")
        outcome = sc.bdtrc(np.nan, 1, 0.5)
        assert_(np.isnan(outcome))
示例#15
0
def fit_Spline(x, y, yerr, infilename, sortedInteractions, biasDic, figname,
               passNo):
    """Fit a non-increasing probability spline and score every interaction.

    Steps, in order:

    1. Fit ``UnivariateSpline(x, y, s=min(y) ** 2)``.
    2. Evaluate it at the distinct integer distances taken from
       ``sortedInteractions`` that lie inside ``[min(x), max(x)]`` and pass
       those values through R's ``monoreg(..., type="antitonic")``.
    3. If the module-level ``visual`` flag is set, plot the fit.
    4. Read ``infilename`` (gzip), compute one ``scsp.bdtrc`` p-value per
       line according to the interaction type, and correct them with
       ``myStats.benjamini_hochberg_correction``.
    5. Write the p-values and q-values to
       ``outdir/figname.significances.txt.gz``.
    6. Mark entries of ``sortedInteractions`` whose p-value is below
       ``1 / possibleIntraInRangeCount`` as outliers and optionally plot
       them.
    7. Call ``plot_qvalues`` on the q-values.

    Parameters:
        x, y: data the spline is fitted to.
        yerr: error bars for ``y``; used only for plotting.
        infilename: gzip file with whitespace-separated columns
            ``chr1 mid1 chr2 mid2 count``.
        sortedInteractions: iterable of
            ``(distance, count, bias)`` triples.
        biasDic: ``{chr: {midpoint: bias}}``; may be empty, in which case
            every bias is 1.0.
        figname: base name of the output files.
        passNo: used only in plot labels.

    Returns:
        list: ``[splineX, newSplineY, residual, isOutlier, FDRx, FDRy]``.

    The function reads many module-level settings (``visual``,
    ``useInters``, ``distLowThres``, ``distUpThres``, ``outdir``,
    ``overSample``, ``toKb``, ``toProb``, the ``observed*`` and
    ``possible*`` totals, the baseline probabilities, ...).
    """
    sys.stderr.write("\nFit a univariate spline to the probability means\n")
    sys.stderr.write(
        "------------------------------------------------------------------------------------\n"
    )
    sys.stderr.write("baseline intra-chr probability: " +
                     repr(baselineIntraChrProb) +
                     "\tbaseline inter-chr probability: " +
                     repr(baselineInterChrProb) + "\n")
    # xi and yi will be used only for visualization purposes
    # actual fit and residual is all done on vectors x and y
    xi = np.linspace(min(x), max(x), overSample * len(x))

    # assume residualFactor==-1:
    splineError = min(y) * min(y)

    # use fitpack2 method -fit on the real x and y from equal occupancy binning
    ius = UnivariateSpline(x, y, s=splineError)
    yi = ius(xi)

    #### POST-PROCESS THE SPLINE TO MAKE SURE IT'S NON-INCREASING
    ### This is done by calling an R function called monoreg with
    ### type="antitonic", so the probabilities do not increase with
    ### increasing genomic distance.

    tempMaxX = max(x)
    tempMinX = min(x)
    tempList = sorted(list(set([int(i[0]) for i in sortedInteractions])))
    splineX = []
    ### Keep only distances inside [min(x), max(x)], i.e. inside the range
    ### where the spline is defined.
    for i in tempList:
        if tempMinX <= i and i <= tempMaxX:
            splineX.append(i)
    # END for
    splineY = ius(splineX)

    # R vector format
    rSplineX = ro.FloatVector(splineX)
    rSplineY = ro.FloatVector(splineY)
    rMonoReg = ro.r['monoreg']
    # do the antitonic regression
    allRres = rMonoReg(rSplineX, rSplineY, type="antitonic")
    # NOTE(review): element 3 is taken as the fitted values - confirm
    # against the monoreg return structure.
    rNewSplineY = allRres[3]
    # convert data back to Python format
    newSplineY = []
    diff = []
    diffX = []
    for i in range(len(rNewSplineY)):
        newSplineY.append(rNewSplineY[i])
        if (splineY[i] - newSplineY[i]) > 0:
            diff.append(splineY[i] - newSplineY[i])
            diffX.append(splineX[i])
    # END for

    # residual sum of squares of the (pre-regression) spline on x, y
    residual = sum([i * i for i in (y - ius(x))])

    if visual == True:
        ### Now plot the results
        sys.stderr.write("Plotting %s" % figname + ".png\n")
        plt.clf()
        fig = plt.figure()
        ax = fig.add_subplot(2, 1, 1)
        plt.plot(myUtils.scale_a_list(splineX, toKb),
                 myUtils.scale_a_list(newSplineY, toProb),
                 'g-',
                 label="spline-" + str(passNo),
                 linewidth=2)
        plt.errorbar(myUtils.scale_a_list(x, toKb),
                     myUtils.scale_a_list(y, toProb),
                     myUtils.scale_a_list(yerr, toProb),
                     fmt='r.',
                     label="Mean with std. error",
                     linewidth=2)

        if useInters:
            plt.plot(myUtils.scale_a_list(x, toKb),
                     myUtils.scale_a_list([baselineIntraChrProb for i in x],
                                          toProb),
                     'k-',
                     label="Baseline intra-chromosomal")
            # The inter-chromosomal baseline must plot the inter value; the
            # intra value was drawn twice before.
            plt.plot(myUtils.scale_a_list(x, toKb),
                     myUtils.scale_a_list([baselineInterChrProb for i in x],
                                          toProb),
                     'b-',
                     label="Baseline inter-chromosomal")
        plt.ylabel('Contact probability (x10$^{-5}$)', fontsize='large')
        plt.xlabel('Genomic distance (kb)', fontsize='large')
        if distLowThres > -1 and distUpThres > -1:
            plt.xlim(myUtils.scale_a_list([distLowThres, distUpThres], toKb))
        plt.gca().yaxis.set_major_locator(MaxNLocator(nbins=3, prune=None))
        ax.legend(loc="upper right")

        ax = fig.add_subplot(2, 1, 2)

        plt.loglog(splineX, newSplineY, 'g-')
        plt.errorbar(x, y, yerr=yerr, fmt='r.')  # Data
        if useInters:
            plt.loglog(x, [baselineIntraChrProb for i in x], 'k-')
            plt.loglog(x, [baselineInterChrProb for i in x], 'b-')
        if distLowThres > -1 and distUpThres > -1:
            plt.xlim([distLowThres, distUpThres])
        plt.ylabel('Contact probability (log-scale)', fontsize='large')
        plt.xlabel('Genomic distance (log-scale)', fontsize='large')

        plt.savefig(outdir + '/' + figname + '.png')

    # NOW compute the p-values for every line of the input file
    infile = gzip.open(infilename, 'r')
    intraInRangeCount = 0
    intraOutOfRangeCount = 0
    intraVeryProximalCount = 0
    interCount = 0
    sys.stderr.write("distLowThres " + repr(distLowThres) + "\tdistUpThres " +
                     repr(distUpThres) + "\n")
    p_vals = []
    q_vals = []
    for line in infile:
        words = line.rstrip().split()
        interxn = myUtils.Interaction(
            [words[0], int(words[1]), words[2],
             int(words[3])])
        interxn.setCount(int(words[4]))
        chr1 = words[0]
        chr2 = words[2]
        midPoint1 = int(words[1])
        midPoint2 = int(words[3])

        bias1 = 1.0
        bias2 = 1.0
        # assumes there is no bias to begin with
        # if the biasDic is not null sets the real bias values
        if len(biasDic) > 0:
            if chr1 in biasDic and midPoint1 in biasDic[chr1]:
                bias1 = biasDic[chr1][midPoint1]
            if chr2 in biasDic and midPoint2 in biasDic[chr2]:
                bias2 = biasDic[chr2][midPoint2]

        # NOTE(review): this branch reads interxn.type while the others
        # call interxn.getType(...); confirm .type is set at this point.
        if (bias1 < 0 or bias2 < 0) and interxn.type != 'inter':
            prior_p = 1.0
            p_val = 1.0
            p_vals.append(p_val)
        elif interxn.getType(distLowThres, distUpThres) == 'intraInRange':
            # make sure the interaction distance is covered by the probability bins
            distToLookUp = max(interxn.distance, min(x))
            distToLookUp = min(distToLookUp, max(x))
            i = min(bisect.bisect_left(splineX, distToLookUp),
                    len(splineX) - 1)
            prior_p = newSplineY[i] * (bias1 * bias2)  # biases included
            intraInRangeCount += 1
            ############# THIS HAS TO BE interactionCount-1 ##################
            p_val = scsp.bdtrc(interxn.hitCount - 1, observedIntraInRangeSum,
                               prior_p)
            p_vals.append(p_val)

        elif interxn.getType(distLowThres, distUpThres) == 'intraShort':
            prior_p = 1.0
            p_val = 1.0
            intraVeryProximalCount += 1
            p_vals.append(p_val)

        elif interxn.getType(distLowThres, distUpThres) == 'intraLong':
            # out of range bigger than distUpThres; the prior is fixed at 1.0
            prior_p = 1.0
            p_val = scsp.bdtrc(interxn.hitCount - 1, observedIntraAllSum,
                               prior_p)
            intraOutOfRangeCount += 1
            p_vals.append(p_val)

        else:
            # NOTE(review): when useInters is false nothing is appended for
            # this line, yet the writer loop below advances its index on
            # every line - confirm such lines cannot occur in that mode.
            if useInters:
                prior_p = baselineInterChrProb * (
                    bias1 * bias2)  # biases included
                ############# THIS HAS TO BE interactionCount-1 ##################
                p_val = scsp.bdtrc(interxn.hitCount - 1, observedInterAllSum,
                                   prior_p)
                interCount += 1
                p_vals.append(p_val)
    # END for
    infile.close()

    # Do the BH FDR correction
    if useInters:
        q_vals = myStats.benjamini_hochberg_correction(
            p_vals, possibleInterAllCount + possibleIntraAllCount)
        sys.stderr.write("possibleInterAllCount+possibleIntraAllCount " +
                         repr(possibleInterAllCount + possibleIntraAllCount) +
                         "\n")
    else:
        q_vals = myStats.benjamini_hochberg_correction(
            p_vals, possibleIntraInRangeCount)
        sys.stderr.write("possibleIntraInRangeCount " +
                         repr(possibleIntraInRangeCount) + "\n")

    # Second pass over the input: write p-values and q-values per line
    infile = gzip.open(infilename, 'r')
    outfile = gzip.open(outdir + '/' + figname + '.significances.txt.gz', 'w')
    sys.stderr.write("Writing p-values to file %s" % figname +
                     ".significances.txt.gz\n")
    count = 0
    outfile.write(
        "chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\n"
    )

    for line in infile:
        words = line.rstrip().split()
        chrNo1 = words[0]
        midPoint1 = int(words[1])
        chrNo2 = words[2]
        midPoint2 = int(words[3])
        interactionCount = int(words[4])
        p_val = p_vals[count]
        q_val = q_vals[count]

        if useInters == False and chrNo1 == chrNo2:  # intra
            interactionDistance = abs(midPoint1 - midPoint2)  # dist
            if myUtils.in_range_check(interactionDistance, distLowThres,
                                      distUpThres):
                outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\n" %
                              (str(chrNo1), midPoint1, str(chrNo2), midPoint2,
                               interactionCount, p_val, q_val))
        elif useInters == True and chrNo1 != chrNo2:
            outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\n" %
                          (str(chrNo1), midPoint1, str(chrNo2), midPoint2,
                           interactionCount, p_val, q_val))

        count += 1
    # END for - printing pvals and qvals for all the interactions
    outfile.close()

    isOutlier = []
    distsBelow = []
    distsAbove = []
    intcountsBelow = []
    intcountsAbove = []
    belowThresCount = 0
    aboveThresCount = 0
    outlierThres = 1.0 / possibleIntraInRangeCount
    for interactionDistance, interactionCount, bias12 in sortedInteractions:
        # make sure the interaction distance is covered by the probability bins
        distToLookUp = max(interactionDistance, min(x))
        distToLookUp = min(distToLookUp, max(x))
        i = min(bisect.bisect_left(splineX, distToLookUp), len(splineX) - 1)
        prior_p = newSplineY[i] * float(bias12)  # biases included
        ############# THIS HAS TO BE interactionCount-1 ##################
        p_val = scsp.bdtrc(interactionCount - 1, observedIntraInRangeSum,
                           prior_p)
        if p_val < outlierThres:
            distsBelow.append(interactionDistance)
            intcountsBelow.append(interactionCount)
            isOutlier.append(1)
            belowThresCount += 1
        else:
            distsAbove.append(interactionDistance)
            intcountsAbove.append(interactionCount)
            isOutlier.append(0)
            aboveThresCount += 1
    # END for - doing the outlier check for all interactions in sortedInteractions

    if visual == True:
        sys.stderr.write("Plotting results of extracting outliers to file %s" %
                         figname + ".extractOutliers.png\n")
        plt.clf()
        fig = plt.figure()
        ax = fig.add_subplot(111)
        # The sample size must be an integer: use floor division so this
        # also works where "/" is true division.
        downsample = 30  # for the non-outliers
        randIndcsAbove = sample([i for i in range(len(intcountsAbove))],
                                len(intcountsAbove) // downsample)
        randIndcsAbove = sorted(randIndcsAbove)
        downsample = 20  # for the outliers
        randIndcsBelow = sample([i for i in range(len(intcountsBelow))],
                                len(intcountsBelow) // downsample)
        randIndcsBelow = sorted(randIndcsBelow)

        plt.plot(myUtils.scale_a_list([distsBelow[i] for i in randIndcsBelow],
                                      toKb),
                 [intcountsBelow[i] for i in randIndcsBelow],
                 'r.',
                 label="Outliers (p-value < 1/M)")
        plt.plot(myUtils.scale_a_list(splineX + [maxObservedGenomicDist],
                                      toKb),
                 [
                     newSplineY[i] * observedIntraInRangeSum
                     for i in range(len(newSplineY))
                 ] + [newSplineY[-1] * observedIntraInRangeSum],
                 'g-',
                 label="spline-" + str(passNo) + " (x N)",
                 linewidth=2.5)

        plt.xlabel('Genomic distance (kb)')
        plt.ylabel('Contact counts')
        print(repr(len(intcountsBelow)) + "\t"),
        ## this limits y-axis of the hit count plots
        if len(intcountsBelow) > 0:
            plt.ylim([0, min(max(intcountsBelow), 1500)])
        if distLowThres > -1 and distUpThres > -1:
            plt.xlim([0, distUpThres * toKb])
        ax.legend(loc="upper right", fancybox=True)
        plt.savefig(outdir + '/' + figname + '.extractOutliers.png')

    sys.stderr.write("intraInRangeCount " + repr(intraInRangeCount) +
                     "\tintraOutOfRangeCount " + repr(intraOutOfRangeCount) +
                     "\tintraVeryProximalCount " +
                     repr(intraVeryProximalCount) + "\tinterCount " +
                     repr(interCount) + "\n")

    if visual == True:
        sys.stderr.write("Plotting q-values to file %s" % figname +
                         ".qplot.png\n")
    minFDR = 0.0
    maxFDR = 0.05
    increment = 0.001
    FDRx, FDRy = plot_qvalues(q_vals, minFDR, maxFDR, increment,
                              figname + ".qplot")

    infile.close()

    return [splineX, newSplineY, residual, isOutlier, FDRx,
            FDRy]  # from fit_Spline
示例#16
0
def fit_Spline(x,y,yerr,infilename,sortedInteractions,biasDic,figname,passNo):
	"""Fit a non-increasing spline to binned contact probabilities and score interactions.

	A UnivariateSpline is fit to (x, y) with smoothing s=min(y)**2, evaluated at the
	observed integer distances that lie inside [min(x), max(x)], and then passed
	through R's monoreg(type="antitonic") so the result is non-increasing.  Every
	line of infilename receives a binomial tail p-value (scsp.bdtrc) and a
	Benjamini-Hochberg q-value; these are written to
	<outdir>/<figname>.significances.txt.gz.  Finally each entry of
	sortedInteractions is flagged as an outlier when its p-value is below
	1/possibleIntraInRangeCount.

	Besides its arguments the function reads module-level settings (visual,
	useInters, distLowThres, distUpThres, outdir, toKb, toProb, the baseline
	probabilities and the observed*/possible* totals).

	Args:
		x: bin distances the spline is fit on.
		y: per-bin probability means; must support ``y - ius(x)`` elementwise
			(presumably a numpy array -- TODO confirm with caller).
		yerr: per-bin error bars, used only for plotting.
		infilename: gzip-compressed text file with whitespace-separated columns
			chr1, midpoint1, chr2, midpoint2, contact count.
		sortedInteractions: re-iterable sequence of (distance, count, bias) triples.
		biasDic: mapping chr -> {midpoint: bias}; empty means "no biases".
		figname: base name of every file written under outdir.
		passNo: pass number, used only in plot legends.

	Returns:
		[splineX, newSplineY, residual, isOutlier, FDRx, FDRy]; isOutlier holds one
		0/1 flag per entry of sortedInteractions, FDRx/FDRy come from plot_qvalues.
	"""
	sys.stderr.write("\nFit a univariate spline to the probability means\n")
	sys.stderr.write("------------------------------------------------------------------------------------\n")
	sys.stderr.write("baseline intra-chr probability: " + repr(baselineIntraChrProb)+ "\tbaseline inter-chr probability: " + repr(baselineInterChrProb)+"\n")

	# assume residualFactor==-1: the smoothing budget of the spline is min(y)^2
	splineError=min(y)*min(y)

	# use fitpack2 method -fit on the real x and y from equal occupancy binning
	ius = UnivariateSpline(x, y, s=splineError)

	#### POST-PROCESS THE SPLINE TO MAKE SURE IT'S NON-INCREASING
	### This is done by calling the R function monoreg with type="antitonic",
	### so probabilities do not increase with increasing genomic distance.

	# x is not modified below, so its bounds are computed once and reused in the loops.
	tempMaxX=max(x)
	tempMinX=min(x)
	tempList=sorted(set(int(i[0]) for i in sortedInteractions))
	### Keep only distances within [min(x), max(x)], where the spline is defined
	splineX=[i for i in tempList if tempMinX<=i<=tempMaxX]
	splineY=ius(splineX)

	# R vector format
	rSplineX=ro.FloatVector(splineX)
	rSplineY=ro.FloatVector(splineY)
	rMonoReg=ro.r['monoreg']
	# do the antitonic regression
	allRres=rMonoReg(rSplineX,rSplineY,type="antitonic")
	# element 3 of the monoreg result is used as the fitted values
	# NOTE(review): index taken from the original code -- confirm against monoreg's return layout
	rNewSplineY=allRres[3]
	# convert data back to Python format
	newSplineY=[rNewSplineY[i] for i in range(len(rNewSplineY))]

	# sum of squared errors of the (un-monotonised) spline on the binned data
	residual =sum([i*i for i in (y - ius(x))])

	if visual==True:
		### Now plot the results
		sys.stderr.write("Plotting %s" % figname + ".png\n")
		plt.clf()
		fig = plt.figure()
		ax = fig.add_subplot(2,1,1)
		plt.plot(myUtils.scale_a_list(splineX,toKb), myUtils.scale_a_list(newSplineY,toProb),'g-',label="spline-"+str(passNo),linewidth=2)
		plt.errorbar(myUtils.scale_a_list(x,toKb),myUtils.scale_a_list(y,toProb),myUtils.scale_a_list(yerr,toProb),fmt='r.',label="Mean with std. error",linewidth=2)

		if useInters:
			plt.plot(myUtils.scale_a_list(x,toKb),myUtils.scale_a_list([baselineIntraChrProb for i in x],toProb),'k-',label="Baseline intra-chromosomal")
			# the inter-chromosomal baseline must use the inter-chr probability
			plt.plot(myUtils.scale_a_list(x,toKb),myUtils.scale_a_list([baselineInterChrProb for i in x],toProb),'b-',label="Baseline inter-chromosomal")
		plt.ylabel('Contact probability (x10$^{-5}$)',fontsize='large')
		plt.xlabel('Genomic distance (kb)',fontsize='large')
		if distLowThres>-1 and distUpThres>-1:
			plt.xlim(myUtils.scale_a_list([distLowThres, distUpThres],toKb))
		plt.gca().yaxis.set_major_locator( MaxNLocator(nbins = 3, prune=None))
		ax.legend(loc="upper right")

		ax = fig.add_subplot(2,1,2)

		plt.loglog(splineX,newSplineY,'g-')
		plt.errorbar(x, y, yerr=yerr, fmt='r.') # Data
		if useInters:
			plt.loglog(x,[baselineIntraChrProb for i in x],'k-')
			plt.loglog(x,[baselineInterChrProb for i in x],'b-')
		if distLowThres>-1 and distUpThres>-1:
			plt.xlim([distLowThres, distUpThres])
		plt.ylabel('Contact probability (log-scale)',fontsize='large')
		plt.xlabel('Genomic distance (log-scale)',fontsize='large')

		plt.savefig(outdir+'/'+figname+'.png')

	# NOW compute one p-value per input line
	intraInRangeCount=0
	intraOutOfRangeCount=0
	intraVeryProximalCount=0
	interCount=0
	sys.stderr.write("distLowThres " + repr(distLowThres) + "\tdistUpThres " + repr(distUpThres) +"\n")
	p_vals=[]
	# text mode so the fields are str (dictionary keys, "%s" formatting) on Python 3
	with gzip.open(infilename, 'rt') as infile:
		for line in infile:
			words=line.rstrip().split()
			interxn=myUtils.Interaction([words[0], int(words[1]), words[2], int(words[3])])
			interxn.setCount(int(words[4]))
			chr1=words[0]
			chr2=words[2]
			midPoint1=int(words[1])
			midPoint2=int(words[3])

			bias1=1.0; bias2=1.0  # assumes there is no bias to begin with
			# if the biasDic is not null sets the real bias values
			if len(biasDic)>0:
				if chr1 in biasDic and midPoint1 in biasDic[chr1]:
					bias1=biasDic[chr1][midPoint1]
				if chr2 in biasDic and midPoint2 in biasDic[chr2]:
					bias2=biasDic[chr2][midPoint2]

			if (bias1<0 or bias2<0) and interxn.type!='inter':
				# negative bias marks a locus that cannot be scored
				p_vals.append(1.0)
			elif interxn.getType(distLowThres,distUpThres)=='intraInRange':
				# make sure the interaction distance is covered by the probability bins
				distToLookUp=min(max(interxn.distance,tempMinX),tempMaxX)
				i=min(bisect.bisect_left(splineX, distToLookUp),len(splineX)-1)
				prior_p=newSplineY[i]*(bias1*bias2) # biases added in the picture
				intraInRangeCount +=1
				# bdtrc(k, n, p) sums terms k+1..n, so count-1 yields P(X >= count)
				p_vals.append(scsp.bdtrc(interxn.hitCount-1,observedIntraInRangeSum,prior_p))
			elif interxn.getType(distLowThres,distUpThres)=='intraShort':
				intraVeryProximalCount +=1
				p_vals.append(1.0)
			elif interxn.getType(distLowThres,distUpThres)=='intraLong':
				# out of range bigger than distUpThres; the baseline intra-chr prior
				# (baselineIntraChrProb*bias1*bias2) is disabled and pinned to 1.0
				prior_p=1.0
				intraOutOfRangeCount +=1
				p_vals.append(scsp.bdtrc(interxn.hitCount-1,observedIntraAllSum,prior_p))
			elif useInters:
				prior_p=baselineInterChrProb*(bias1*bias2) # biases added in the picture
				interCount +=1
				p_vals.append(scsp.bdtrc(interxn.hitCount-1,observedInterAllSum,prior_p))
			else:
				# inter-chr line while inters are disabled: still record a neutral
				# p-value so p_vals stays aligned one-to-one with the input lines,
				# which the writer pass below indexes by line number
				p_vals.append(1.0)

	# Do the BH FDR correction
	if useInters:
		q_vals=myStats.benjamini_hochberg_correction(p_vals, possibleInterAllCount+possibleIntraAllCount)
		sys.stderr.write("possibleInterAllCount+possibleIntraAllCount " + repr(possibleInterAllCount+possibleIntraAllCount)+"\n")
	else:
		q_vals=myStats.benjamini_hochberg_correction(p_vals, possibleIntraInRangeCount)
		sys.stderr.write("possibleIntraInRangeCount " + repr(possibleIntraInRangeCount)+"\n")

	# Second pass over the input: write the selected interactions with their p/q-values
	with gzip.open(infilename, 'rt') as infile, gzip.open(outdir+'/'+figname+'.significances.txt.gz', 'wt') as outfile:
		sys.stderr.write("Writing p-values to file %s" % figname + ".significances.txt.gz\n")
		outfile.write("chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\n")

		for count,line in enumerate(infile):
			words=line.rstrip().split()
			chrNo1=words[0]
			midPoint1=int(words[1])
			chrNo2=words[2]
			midPoint2=int(words[3])
			interactionCount=int(words[4])
			p_val=p_vals[count]
			q_val=q_vals[count]

			if useInters==False and chrNo1==chrNo2: # intra
				interactionDistance=abs(midPoint1-midPoint2) # dist
				if myUtils.in_range_check(interactionDistance,distLowThres,distUpThres):
					outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\n" % (str(chrNo1),midPoint1,str(chrNo2),midPoint2,interactionCount,p_val,q_val))
			elif useInters==True and chrNo1!=chrNo2:
				outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\n" % (str(chrNo1),midPoint1,str(chrNo2),midPoint2,interactionCount,p_val,q_val))

	# Outlier check for all interactions in sortedInteractions
	isOutlier=[]
	distsBelow=[]
	distsAbove=[]
	intcountsBelow=[]
	intcountsAbove=[]
	belowThresCount=0
	aboveThresCount=0
	outlierThres=1.0/possibleIntraInRangeCount
	for interactionDistance,interactionCount,bias12 in sortedInteractions:
		# make sure the interaction distance is covered by the probability bins
		distToLookUp=min(max(interactionDistance,tempMinX),tempMaxX)
		i=min(bisect.bisect_left(splineX, distToLookUp),len(splineX)-1)
		prior_p=newSplineY[i]*float(bias12) # biases added in the picture
		# count-1 again, so the tail includes the observed count itself
		p_val=scsp.bdtrc(interactionCount-1,observedIntraInRangeSum,prior_p)
		if p_val < outlierThres:
			distsBelow.append(interactionDistance)
			intcountsBelow.append(interactionCount)
			isOutlier.append(1)
			belowThresCount +=1
		else:
			distsAbove.append(interactionDistance)
			intcountsAbove.append(interactionCount)
			isOutlier.append(0)
			aboveThresCount +=1

	if visual==True:
		sys.stderr.write("Plotting results of extracting outliers to file %s" % figname + ".extractOutliers.png\n")
		plt.clf()
		fig = plt.figure()
		ax = fig.add_subplot(111)
		# random.sample needs an integer sample size, hence floor division
		downsample=30 # for the non-outliers
		randIndcsAbove=sorted(sample(list(range(len(intcountsAbove))),len(intcountsAbove)//downsample))
		downsample=20 # for the outliers
		randIndcsBelow=sorted(sample(list(range(len(intcountsBelow))),len(intcountsBelow)//downsample))

		plt.plot(myUtils.scale_a_list([distsBelow[i] for i in randIndcsBelow],toKb),[intcountsBelow[i] for i in randIndcsBelow], 'r.',label="Outliers (p-value < 1/M)")
		# expected counts: spline probability times N, extended flat to the largest observed distance
		plt.plot(myUtils.scale_a_list(splineX+[maxObservedGenomicDist],toKb),[newSplineY[i]*observedIntraInRangeSum	for i in range(len(newSplineY))]+[newSplineY[-1]*observedIntraInRangeSum], 'g-', label="spline-"+str(passNo)+" (x N)", linewidth=2.5)

		plt.xlabel('Genomic distance (kb)')
		plt.ylabel('Contact counts')
		print(repr(len(intcountsBelow))+"\t"),
		## this limits y-axis of the hit count plots
		if len(intcountsBelow)>0:
			plt.ylim([0,min(max(intcountsBelow),1500)])
		if distLowThres>-1 and distUpThres>-1:
			plt.xlim([0, distUpThres*toKb])
		ax.legend(loc="upper right",fancybox=True)
		plt.savefig(outdir+'/'+figname+'.extractOutliers.png')

	sys.stderr.write("intraInRangeCount " + repr(intraInRangeCount)+"\tintraOutOfRangeCount " +\
		repr(intraOutOfRangeCount)+"\tintraVeryProximalCount " + repr(intraVeryProximalCount) +"\tinterCount " + repr(interCount)+"\n")

	if visual==True:
		sys.stderr.write("Plotting q-values to file %s" % figname + ".qplot.png\n")
	minFDR=0.0
	maxFDR=0.05
	increment=0.001
	# plot_qvalues is called regardless of `visual`; its return values are part of the result
	FDRx,FDRy=plot_qvalues(q_vals,minFDR,maxFDR,increment,figname+".qplot")

	return [splineX, newSplineY, residual, isOutlier, FDRx, FDRy] # from fit_Spline
示例#17
0
文件: fithic.py 项目: ay-lab/fithic
def _lookup_bias(biasDic, chrom, mid):
    """Return the bias stored for locus (chrom, mid), or -1 with a printed warning if it is missing."""
    if chrom not in biasDic:
        print("Warning. Bias file does not contain chromosome %s. "
              "Please ensure you're using correct file. Fit-Hi-C will continue with "
              "bias = -1 for this locus" % chrom)
        return -1
    if mid not in biasDic[chrom]:
        print("Error. Bias file does not contain midpoint %s within "
              "%s. Please ensure you're using the correct file and/or resolution "
              "argument. Fit-Hi-C will continue with bias = -1 for this locus"
              % (mid, chrom))
        return -1
    return biasDic[chrom][mid]


def fit_Spline(mainDic,x,y,yerr,infilename,outfilename,biasDic,outliersline,outliersdist,observedIntraInRangeSum, possibleIntraInRangeCount, possibleInterAllCount, observedIntraAllSum, observedInterAllSum, resolution, passNo):
    """Fit a non-increasing spline to binned contact probabilities and write p/q-values.

    Unless the module-level flag interOnly is set, a UnivariateSpline is fit to
    (x, y) with smoothing s=min(y)**2, evaluated at the keys of mainDic that lie
    within [min(x), max(x)], and made non-increasing with
    IsotonicRegression(increasing=False).  Every line of infilename then gets a
    binomial tail p-value (scsp.bdtrc) and a Benjamini-Hochberg q-value, which are
    written to a gzip-compressed ``.significances.txt.gz`` file.  Lines whose
    p-value is below the outlier threshold are recorded in outliersline (line
    index) and outliersdist (|mid1 - mid2|).

    The function also reads the module-level settings logfile, interOnly, allReg,
    visual, distLowThres, distUpThres, toKb, toProb and interChrProb.

    Args:
        mainDic: mapping whose keys are the observed distances.
        x: bin distances; sorted in place when outliersdist is not None and
            required to be strictly increasing afterwards.
        y: per-bin probability means, reordered to follow x when x is sorted.
        yerr: per-bin error bars, used only for plotting.
        infilename: gzip text file with columns chr1, mid1, chr2, mid2, contactCount.
        outfilename: prefix of every output file.
        biasDic: mapping chr -> {midpoint: bias}; falsy means "no biases".
        outliersline, outliersdist: sets updated in place with the outliers found.
        observedIntraInRangeSum, observedInterAllSum: trial counts passed to bdtrc.
        possibleIntraInRangeCount, possibleInterAllCount: totals that define the
            outlier threshold and the BH correction.
        observedIntraAllSum: accepted for interface compatibility; not used.
        resolution: if truthy, inserted into the significance file name.
        passNo: pass number, used only in the plot legend.

    Returns:
        [splineX, newSplineY, residual, outliersline, outliersdist, FDRx, FDRy];
        the first three stay None when interOnly is set.
    """
    with open(logfile, 'a') as log:
        log.write("\nFitting a univariate spline to the probability means\n")
        log.write("------------------------------------------------------------------------------------\n")

    splineX = None
    newSplineY = None
    residual = None
    FDRx = None
    FDRy = None
    # bounds of x, filled in (and only needed) when the spline is actually fit
    tempMinX = None
    tempMaxX = None

    if not interOnly:
        if outliersdist is not None:
            # reorder y to follow x, then sort x itself (in place)
            y = [f for _, f in sorted(zip(x,y), key=lambda pair: pair[0])]
            x.sort()
        # the spline fit needs strictly increasing x
        for i in range(1,len(x)):
            if x[i]<=x[i-1]:
                print("ERROR in spline fitting. Distances do not increase across bins. Ensure interaction file is correct.")
                print("Avg. distance of bin(i-1)... %s" % x[i-1])
                print("Avg. distance of bin(i)... %s" % x[i])
                sys.exit(2)

        # maximum residual allowed for spline is set to min(y)^2
        splineError=min(y)*min(y)

        # use fitpack2 method -fit on the real x and y from equal occupancy binning
        ius = UnivariateSpline(x, y, s=splineError)
        tempMaxX=max(x)
        tempMinX=min(x)
        ### Keep only distances within [min(x), max(x)], where the spline is defined
        splineX=[dis for dis in sorted(mainDic) if tempMinX<=dis<=tempMaxX]
        splineY=ius(splineX)

        # force the fitted probabilities to be non-increasing with distance
        ir = IsotonicRegression(increasing=False)
        newSplineY = ir.fit_transform(splineX,splineY)
        # sum of squared errors of the (un-monotonised) spline on the binned data
        residual =sum([i*i for i in (y - ius(x))])

        if visual==True:
            print("Plotting %s" % (outfilename + ".png"))
            plt.clf()
            fig = plt.figure()
            ax = fig.add_subplot(2,1,1)
            plt.plot(myUtils.scale_a_list(splineX,toKb), myUtils.scale_a_list(newSplineY,toProb),'g-',label="spline-"+str(passNo),linewidth=2)
            plt.errorbar(myUtils.scale_a_list(x,toKb),myUtils.scale_a_list(y,toProb),myUtils.scale_a_list(yerr,toProb),fmt='r.',label="Mean with std. error",linewidth=2)

            plt.ylabel('Contact probability (x10$^{-5}$)')
            plt.xlabel('Genomic distance (kb)')
            if distLowThres>0 and distUpThres<float("inf"):
                plt.xlim(myUtils.scale_a_list([distLowThres, distUpThres],toKb))
            plt.gca().yaxis.set_major_locator( MaxNLocator(nbins = 3, prune=None))
            ax.legend(loc="upper right")

            ax = fig.add_subplot(2,1,2)

            plt.loglog(splineX,newSplineY,'g-')
            plt.errorbar(x, y, yerr=yerr, fmt='r.') # Data
            if distLowThres>0 and distUpThres<float("inf"):
                plt.xlim([distLowThres, distUpThres])
            plt.ylabel('Contact probability (log-scale)')
            plt.xlabel('Genomic distance (log-scale)')

            plt.savefig(outfilename+'.png')

    # First pass over the input: one p-value (and bias pair) per line
    intraInRangeCount=0
    intraOutOfRangeCount=0
    intraVeryProximalCount=0
    interCount=0
    discardCount=0
    p_vals=[]
    biasl=[]
    biasr=[]
    with gzip.open(infilename, 'rt') as infile:
        for line in infile:
            ch1,mid1,ch2,mid2,contactCount=line.rstrip().split()
            contactCount = float(contactCount)
            mid1 = int(mid1); mid2 = int(mid2)
            interxn=myUtils.Interaction([ch1, mid1, ch2, mid2])
            interxn.setCount(contactCount)
            interactionType = interxn.getType(distLowThres,distUpThres)
            bias1=1.0; bias2=1.0  # assumes there is no bias to begin with
            # if the biasDic is not null sets the real bias values
            if biasDic:
                bias1=_lookup_bias(biasDic, ch1, mid1)
                bias2=_lookup_bias(biasDic, ch2, mid2)
            biasl.append(bias1)
            biasr.append(bias2)
            if (bias1<0 or bias2<0) and interactionType !='inter':
                # negative bias marks a locus that cannot be scored
                p_val=1.0
                discardCount+=1
            elif interactionType=='intraInRange' and not interOnly:
                # clamp the distance into the range covered by the spline
                distToLookUp=min(max(interxn.getDistance(),tempMinX),tempMaxX)
                i=min(bisect.bisect_left(splineX, distToLookUp),len(splineX)-1)
                prior_p=newSplineY[i]*(bias1*bias2)
                # bdtrc(k, n, p) sums terms k+1..n, so count-1 yields P(X >= count)
                p_val=scsp.bdtrc(interxn.getCount()-1,observedIntraInRangeSum,prior_p)
                intraInRangeCount +=1
            elif interactionType =='intraShort' and not interOnly:
                p_val=1.0
                intraVeryProximalCount += 1
            elif interactionType =='intraLong' and not interOnly:
                p_val=1.0
                intraOutOfRangeCount += 1
            elif allReg or interOnly:
                prior_p=interChrProb*(bias1*bias2)
                p_val=scsp.bdtrc(interxn.getCount()-1,observedInterAllSum,prior_p)
                interCount += 1
            else:
                # not scored in this mode; keeps p_vals aligned with the input lines
                p_val=1.0
            p_vals.append(p_val)

    # Do the BH FDR correction; the outlier threshold is 1 / (number of possible tests)
    if allReg:
        outlierThres=1.0/(possibleIntraInRangeCount+possibleInterAllCount)
        q_vals=myStats.benjamini_hochberg_correction(p_vals, possibleInterAllCount+possibleIntraInRangeCount)
    elif interOnly:
        outlierThres = 1.0/possibleInterAllCount
        q_vals=myStats.benjamini_hochberg_correction(p_vals, possibleInterAllCount)
    else:
        outlierThres = 1.0/possibleIntraInRangeCount
        q_vals=myStats.benjamini_hochberg_correction(p_vals, possibleIntraInRangeCount)
    print("Outlier threshold is... %s" % (outlierThres))

    # Second pass over the input: write the values back to the file
    if resolution:
        significancePath = outfilename+'.res'+str(resolution)+'.significances.txt.gz'
    else:
        significancePath = outfilename+'.significances.txt.gz'
    with gzip.open(infilename, 'rt') as infile, gzip.open(significancePath, 'wt') as outfile:
        print("Writing p-values and q-values to file %s" % (outfilename + ".significances.txt"))
        outfile.write("chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\tbias1\tbias2\n")
        for count,line in enumerate(infile):
            words=line.rstrip().split()
            chr1=words[0]
            midPoint1=int(words[1])
            chr2=words[2]
            midPoint2=int(words[3])
            interactionCount=float(words[4])
            p_val=p_vals[count]
            q_val=q_vals[count]
            bias1=biasl[count]
            bias2=biasr[count]

            if (allReg or interOnly) and chr1!=chr2:
                outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\t%e\t%e\n" % (str(chr1), midPoint1, str(chr2), midPoint2, interactionCount, p_val, q_val, bias1, bias2))
            if (allReg or not interOnly) and chr1==chr2:
                interactionDistance = abs(midPoint1-midPoint2)
                if myUtils.in_range_check(interactionDistance,distLowThres, distUpThres):
                    outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\t%e\t%e\n" % (str(chr1), midPoint1, str(chr2), midPoint2, interactionCount, p_val, q_val, bias1, bias2))

            # outliers are recorded for every line, whether or not it was written
            if p_val<outlierThres:
                outliersline.add(count)
                outliersdist.add(abs(midPoint1-midPoint2))

    if visual == True:
        print("Plotting q-values to file %s" % outfilename + ".qplot.png")
    minFDR=0.0
    maxFDR=0.05
    increment=0.001
    # plot_qvalues is called regardless of `visual`; its return values are part of the result
    FDRx,FDRy=plot_qvalues(q_vals,minFDR,maxFDR,increment,outfilename+".qplot")

    with open(logfile, 'a') as log:
        log.write("Spline successfully fit\n")
        log.write("\n")
        log.write("\n")

    return [splineX, newSplineY, residual, outliersline, outliersdist, FDRx, FDRy] # from fit_Spline
示例#18
0
def test_legacy_cast():
    """Check that bdtrc returns NaN for a NaN ``k``, with the truncation warning filtered."""
    truncation_message = "floating point number truncated to an integer"
    with suppress_warnings() as warning_filter:
        # silence the RuntimeWarning matching the legacy float-to-int cast message
        warning_filter.filter(RuntimeWarning, truncation_message)
        result = sc.bdtrc(np.nan, 1, 0.5)
        assert_(np.isnan(result))
def fit_Spline(mainDic,x,y,yerr,infilename,outfilename,biasDic):
	"""Fit a non-increasing spline to binned contact probabilities and write p/q-values.

	A UnivariateSpline is fit to (x, y) with smoothing s=min(y)**2, evaluated at the
	keys of mainDic that lie within [min(x), max(x)], and passed through R's
	monoreg(type="antitonic") so the result is non-increasing.  The fit is plotted
	to <outfilename>.res<resolution>.png.  Every line of infilename then gets a
	binomial tail p-value (scsp.bdtrc) and a Benjamini-Hochberg q-value, all of
	which are written to <outfilename>.res<resolution>.significances.txt.gz.

	Besides its arguments the function reads module-level settings (resolution,
	distLowThres, distUpThres, baselineIntraChrProb, interChrProb and the
	observed*/possible* totals).

	Args:
		mainDic: mapping whose keys are the observed distances.
		x: bin distances the spline is fit on.
		y: per-bin probability means; must support ``y - ius(x)`` elementwise
			(presumably a numpy array -- TODO confirm with caller).
		yerr: accepted for interface compatibility; not used here.
		infilename: gzip-compressed text file with whitespace-separated columns
			chr1, midpoint1, chr2, midpoint2, contact count.
		outfilename: prefix of every output file.
		biasDic: mapping chr -> {midpoint: bias}; empty means "no biases".

	Returns:
		[splineX, newSplineY, residual].
	"""
	print("\nFit a univariate spline to the probability means\n"),
	print("------------------------------------------------------------------------------------\n"),

	# maximum residual allowed for spline is set to min(y)^2
	splineError=min(y)*min(y)

	# use fitpack2 method -fit on the real x and y from equal occupancy binning
	ius = UnivariateSpline(x, y, s=splineError)

	#### POST-PROCESS THE SPLINE TO MAKE SURE IT'S NON-INCREASING
	### This is done by calling the R function monoreg with type="antitonic",
	### so probabilities do not increase with increasing genomic distance.

	# x is not modified below, so its bounds are computed once and reused in the loop.
	tempMaxX=max(x)
	tempMinX=min(x)
	### Keep only distances within [min(x), max(x)], where the spline is defined
	splineX=[dis for dis in sorted(mainDic) if tempMinX<=dis<=tempMaxX]
	splineY=ius(splineX)

	# R vector format
	rSplineX=ro.FloatVector(splineX)
	rSplineY=ro.FloatVector(splineY)
	rMonoReg=ro.r['monoreg']
	# do the antitonic regression
	allRres=rMonoReg(rSplineX,rSplineY,type="antitonic")
	# element 3 of the monoreg result is used as the fitted values
	# NOTE(review): index taken from the original code -- confirm against monoreg's return layout
	rNewSplineY=allRres[3]
	# convert data back to Python format
	newSplineY=[rNewSplineY[i] for i in range(len(rNewSplineY))]

	### Now newSplineY holds the monotonic contact probabilities
	# sum of squared errors of the (un-monotonised) spline on the binned data
	residual =sum([i*i for i in (y - ius(x))])

	### Now plot the results
	plt.clf()
	fig = plt.figure()
	ax = fig.add_subplot(2,1,1)
	plt.title('Univariate spline fit to the output of equal occupancy binning. \n Residual= %e' % (residual),size='small')
	plt.plot([i/1000.0 for i in x], [i*100000 for i in y], 'ro', label="Means")
	plt.plot([i/1000.0 for i in splineX], [i*100000 for i in newSplineY],'g-',label="Spline fit")
	plt.ylabel('Probability (1e-5)')
	plt.xlabel('Genomic distance (kb)')
	plt.xlim([tempMinX/1000.0,tempMaxX/1000.0])
	ax.legend(loc="upper right")

	ax = fig.add_subplot(2,1,2)
	plt.loglog(splineX,newSplineY,'g-')
	plt.loglog(x, y, 'r.')  # Data
	plt.ylabel('Probability (log scale)')
	plt.xlabel('Genomic distance (log scale)')
	plt.xlim([tempMinX,tempMaxX])
	plt.savefig(outfilename+'.res'+str(resolution)+'.png')
	sys.stderr.write("Plotting %s" % outfilename + ".png\n")

	# First pass over the input: one p-value per line
	intraInRangeCount=0
	intraOutOfRangeCount=0
	intraVeryProximalCount=0
	interCount=0
	discardCount=0
	print("lower bound on mid-range distances  "+ repr(distLowThres) + ", upper bound on mid-range distances  " + repr(distUpThres) +"\n"),
	p_vals=[]
	# text mode so the fields are str (dictionary keys, "%s" formatting) on Python 3
	with gzip.open(infilename, 'rt') as infile:
		for line in infile:
			words=line.rstrip().split()
			interxn=myUtils.Interaction([words[0], int(words[1]), words[2], int(words[3])])
			interxn.setCount(float(words[4]))
			chr1=words[0]
			chr2=words[2]
			midPoint1=int(words[1])
			midPoint2=int(words[3])

			bias1=1.0; bias2=1.0  # assumes there is no bias to begin with
			# if the biasDic is not null sets the real bias values
			if len(biasDic)>0:
				if chr1 in biasDic and midPoint1 in biasDic[chr1]:
					bias1=biasDic[chr1][midPoint1]
				if chr2 in biasDic and midPoint2 in biasDic[chr2]:
					bias2=biasDic[chr2][midPoint2]

			if bias1==-1 or bias2==-1:
				# a bias of -1 marks a discarded locus
				p_val=1.0
				discardCount+=1
			elif interxn.type=='intra':
				if interxn.getType(distLowThres,distUpThres)=='intraInRange':
					# make sure the interaction distance is covered by the probability bins
					distToLookUp=min(max(interxn.distance,tempMinX),tempMaxX)
					i=min(bisect.bisect_left(splineX, distToLookUp),len(splineX)-1)
					prior_p=newSplineY[i]*(bias1*bias2) # biases added in the picture
					# bdtrc(k, n, p) sums terms k+1..n, so count-1 yields P(X >= count)
					p_val=scsp.bdtrc(interxn.hitCount-1,observedIntraInRangeSum,prior_p)
					intraInRangeCount +=1
				elif interxn.getType(distLowThres,distUpThres)=='intraShort':
					p_val=1.0
					intraVeryProximalCount +=1
				elif interxn.getType(distLowThres,distUpThres)=='intraLong':
					## out of range distance
					## use the prior of the baseline intra-chr interaction probability
					prior_p=baselineIntraChrProb*(bias1*bias2)  # biases added in the picture
					p_val=scsp.bdtrc(interxn.hitCount-1,observedIntraAllSum,prior_p)
					intraOutOfRangeCount +=1
			else: # inter
				prior_p=interChrProb*(bias1*bias2) # biases added in the picture
				p_val=scsp.bdtrc(interxn.hitCount-1,observedInterAllSum,prior_p)
				interCount +=1
			p_vals.append(p_val)

	# Do the BH FDR correction
	q_vals=myStats.benjamini_hochberg_correction(p_vals, possibleInterAllCount+possibleIntraAllCount)

	# Second pass over the input: write every interaction with its p/q-value
	with gzip.open(infilename, 'rt') as infile, gzip.open(outfilename+'.res'+str(resolution)+'.significances.txt.gz', 'wt') as outfile:
		print("Writing p-values and q-values to file %s" % outfilename + ".significances.txt\n"),
		# report the tally that the first pass collected
		print("Number of pairs discarded due to bias not in range [0.5 2]: " + repr(discardCount) + "\n"),
		outfile.write("chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\n")
		for count,line in enumerate(infile):
			words=line.rstrip().split()
			chrNo1=words[0]
			midPoint1=int(words[1])
			chrNo2=words[2]
			midPoint2=int(words[3])
			# parsed as float like in the first pass; "%d" below writes the integer part
			interactionCount=float(words[4])
			p_val=p_vals[count]
			q_val=q_vals[count]

			outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\n" % (str(chrNo1),midPoint1,str(chrNo2),midPoint2,interactionCount,p_val,q_val))
	return [splineX, newSplineY, residual] # from fit_Spline
def fit_Spline(mainDic, x, y, yerr, infilename, outfilename, biasDic):
    print("\nFit a univariate spline to the probability means\n"),
    print(
        "------------------------------------------------------------------------------------\n"
    ),
    #print("baseline intra-chr probability: " + repr(baselineIntraChrProb)+ "\n"),

    # maximum residual allowed for spline is set to min(y)^2
    splineError = min(y) * min(y)

    # use fitpack2 method -fit on the real x and y from equal occupancy binning
    ius = UnivariateSpline(x, y, s=splineError)

    #### POST-PROCESS THE SPLINE TO MAKE SURE IT'S NON-INCREASING
    ### NOW I DO THIS BY CALLING AN R function CALLED MONOREG
    ### This does the isotonic regression using option antitonic to make sure
    ### I get monotonically decreasing probabilites with increasion genomic distance

    tempMaxX = max(x)
    tempMinX = min(x)
    tempList = sorted([dis for dis in mainDic])
    splineX = []
    ### The below for loop will make sure nothing is out of range of [min(x) max(x)]
    ### Therefore everything will be within the range where the spline is defined
    for i in tempList:
        if tempMinX <= i and i <= tempMaxX:
            splineX.append(i)
    # END for
    splineY = ius(splineX)

    # R vector format
    rSplineX = ro.FloatVector(splineX)
    rSplineY = ro.FloatVector(splineY)
    rMonoReg = ro.r['monoreg']
    # do the antitonic regression
    allRres = rMonoReg(rSplineX, rSplineY, type="antitonic")
    rNewSplineY = allRres[3]
    # convert data back to Python format
    newSplineY = []
    diff = []
    diffX = []
    for i in range(len(rNewSplineY)):
        newSplineY.append(rNewSplineY[i])
        if (splineY[i] - newSplineY[i]) > 0:
            diff.append(splineY[i] - newSplineY[i])
            diffX.append(splineX[i])
        # END if
    # END for

    ### Now newSplineY holds the monotonic contact probabilities
    residual = sum([i * i for i in (y - ius(x))])

    ### Now plot the results
    plt.clf()
    fig = plt.figure()
    ax = fig.add_subplot(2, 1, 1)
    plt.title(
        'Univariate spline fit to the output of equal occupancy binning. \n Residual= %e'
        % (residual),
        size='small')
    plt.plot([i / 1000.0 for i in x], [i * 100000 for i in y],
             'ro',
             label="Means")
    #plt.plot([i/1000.0 for i in xi], [i*100000 for i in yi],'g-',label="Spline fit")
    plt.plot([i / 1000.0 for i in splineX], [i * 100000 for i in newSplineY],
             'g-',
             label="Spline fit")
    #plt.plot([i/1000.0 for i in x], [normalizedInterChrProb*100000 for i in x],'k-',label="Random intra-chromosomal")
    #plt.plot([i/1000.0 for i in x], [interChrProb*100000 for i in x],'b-',label="Inter-chromosomal")
    plt.ylabel('Probability (1e-5)')
    plt.xlabel('Genomic distance (kb)')
    plt.xlim([min(x) / 1000.0, max(x) / 1000.0])
    ax.legend(loc="upper right")

    ax = fig.add_subplot(2, 1, 2)
    plt.loglog(splineX, newSplineY, 'g-')
    #plt.loglog(xi, yi, 'g-')
    plt.loglog(x, y, 'r.')  # Data
    #plt.loglog(x, [normalizedInterChrProb for i in x],'k-')
    #plt.loglog(x, [interChrProb for i in x],'b-')
    plt.ylabel('Probability (log scale)')
    plt.xlabel('Genomic distance (log scale)')
    #plt.xlim([20000,100000])
    plt.xlim([min(x), max(x)])
    plt.savefig(outfilename + '.res' + str(resolution) + '.png')
    sys.stderr.write("Plotting %s" % outfilename + ".png\n")

    # NOW write the calculated pvalues and corrected pvalues in a file
    infile = gzip.open(infilename, 'r')
    intraInRangeCount = 0
    intraOutOfRangeCount = 0
    intraVeryProximalCount = 0
    interCount = 0
    discardCount = 0
    print("lower bound on mid-range distances  " + repr(distLowThres) +
          ", upper bound on mid-range distances  " + repr(distUpThres) + "\n"),
    p_vals = []
    q_vals = []
    for line in infile:
        words = line.rstrip().split()
        interxn = myUtils.Interaction(
            [words[0], int(words[1]), words[2],
             int(words[3])])
        interxn.setCount(float(words[4]))
        chr1 = words[0]
        chr2 = words[2]
        midPoint1 = int(words[1])
        midPoint2 = int(words[3])

        bias1 = 1.0
        bias2 = 1.0
        # assumes there is no bias to begin with
        # if the biasDic is not null sets the real bias values
        if len(biasDic) > 0:
            if chr1 in biasDic and midPoint1 in biasDic[chr1]:
                bias1 = biasDic[chr1][midPoint1]
            if chr2 in biasDic and midPoint2 in biasDic[chr2]:
                bias2 = biasDic[chr2][midPoint2]

        if bias1 == -1 or bias2 == -1:
            p_val = 1.0
            discardCount += 1
        elif interxn.type == 'intra':
            if interxn.getType(distLowThres, distUpThres) == 'intraInRange':
                # make sure the interaction distance is covered by the probability bins
                distToLookUp = max(interxn.distance, min(x))
                distToLookUp = min(distToLookUp, max(x))
                i = min(bisect.bisect_left(splineX, distToLookUp),
                        len(splineX) - 1)
                prior_p = newSplineY[i] * (bias1 * bias2
                                           )  # biases added in the picture
                p_val = scsp.bdtrc(interxn.hitCount - 1,
                                   observedIntraInRangeSum, prior_p)
                intraInRangeCount += 1
            elif interxn.getType(distLowThres, distUpThres) == 'intraShort':
                prior_p = 1.0
                p_val = 1.0
                intraVeryProximalCount += 1
            elif interxn.getType(distLowThres, distUpThres) == 'intraLong':
                ## out of range distance
                ## use the prior of the baseline intra-chr interaction probability
                prior_p = baselineIntraChrProb * (
                    bias1 * bias2)  # biases added in the picture
                p_val = scsp.bdtrc(interxn.hitCount - 1, observedIntraAllSum,
                                   prior_p)
                intraOutOfRangeCount += 1
            # END if
        else:  # inter
            #prior_p=normalizedInterChrProb
            prior_p = interChrProb * (bias1 * bias2
                                      )  # biases added in the picture
            ############# THIS HAS TO BE interactionCount-1 ##################
            p_val = scsp.bdtrc(interxn.hitCount - 1, observedInterAllSum,
                               prior_p)
            interCount += 1
        #
        p_vals.append(p_val)

    # END for
    infile.close()

    # Do the BH FDR correction
    q_vals = myStats.benjamini_hochberg_correction(
        p_vals, possibleInterAllCount + possibleIntraAllCount)
    #q_vals=myStats.benjamini_hochberg_correction(p_vals, possibleIntraInRangeCount)
    #print("possibleIntraInRangeCount " + repr(possibleIntraInRangeCount)+"\n"),

    infile = gzip.open(infilename, 'r')
    outfile = gzip.open(
        outfilename + '.res' + str(resolution) + '.significances.txt.gz', 'w')
    print("Writing p-values and q-values to file %s" % outfilename +
          ".significances.txt\n"),
    print("Number of pairs discarded due to bias not in range [0.5 2]\n"),
    outfile.write(
        "chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\n"
    )
    count = 0
    for line in infile:
        words = line.rstrip().split()
        chrNo1 = words[0]
        midPoint1 = int(words[1])
        chrNo2 = words[2]
        midPoint2 = int(words[3])
        interactionCount = int(words[4])
        p_val = p_vals[count]
        q_val = q_vals[count]
        #if chrNo1==chrNo2: # intra
        #	interactionDistance=abs(midPoint1-midPoint2) # dist
        #	if myUtils.in_range_check(interactionDistance,distLowThres,distUpThres):
        #		outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\n" % (str(chrNo1),midPoint1,str(chrNo2),midPoint2,interactionCount,p_val,q_val))
        #else:
        #	outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\n" % (str(chrNo1),midPoint1,str(chrNo2),midPoint2,interactionCount,p_val,q_val))

        outfile.write("%s\t%d\t%s\t%d\t%d\t%e\t%e\n" %
                      (str(chrNo1), midPoint1, str(chrNo2), midPoint2,
                       interactionCount, p_val, q_val))
        count += 1
    # END for - printing pvals and qvals for all the interactions
    outfile.close()
    infile.close()
    return [splineX, newSplineY, residual]  # from fit_Spline
示例#21
0
def fit_spline(mainDic, x, y, yerr, infilename, outfilename, biasDic,
               resolution, min_dist, max_dist, verbose):
    """Fit a non-increasing spline to (x, y) and write in-range p-values.

    A ``UnivariateSpline`` is fitted to ``(x, y)``, evaluated at every key of
    ``mainDic`` that lies inside ``[min(x), max(x)]`` and then made
    non-increasing with an isotonic regression.  The fit is plotted to
    ``<outfilename>.res<resolution>.png``.  Finally every line of the gzipped
    input whose ``mid2 - mid1`` lies in ``[min_dist, max_dist]`` is scored
    with ``scsp.bdtrc`` and written to
    ``<outfilename>.res<resolution>.significances.txt.gz``.

    NOTE: ``observedIntraInRangeSum`` is not a parameter; it is read from the
    enclosing (module) scope.

    Args:
        mainDic: Iterable (e.g. dict) whose keys are the distances at which
            the fitted spline is evaluated.
        x: Distances the spline is fitted on.
        y: Values the spline is fitted on; must support ``y - ndarray``.
        yerr: Unused; kept for interface compatibility.
        infilename: Path to a gzip file whose lines hold five
            whitespace-separated fields: chr1, mid1, chr2, mid2, contactCount.
        outfilename: Prefix for the plot and the significance file.
        biasDic: Optional ``{chromosome: {midpoint: bias}}`` mapping; missing
            entries default to a bias of 1.0.
        resolution: Value embedded in the output file names.
        min_dist: Inclusive lower bound on ``mid2 - mid1`` for scoring.
        max_dist: Inclusive upper bound on ``mid2 - mid1`` for scoring.
        verbose: When true, progress messages are printed.

    Returns:
        Tuple ``(splineX, newSplineY, residual)``: the evaluation distances,
        the non-increasing fitted values (list) and the sum of squared
        residuals of the raw spline on ``(x, y)``.
    """
    if verbose:
        print("\nFit a univariate spline to the probability means\n")
        print(
            "------------------------------------------------------------------------------------\n"
        )

    # maximum residual allowed for spline is set to min(y)^2
    splineError = min(y)**2

    # use fitpack2 method -fit on the real x and y from equal occupancy binning
    ius = UnivariateSpline(x, y, s=splineError)

    # Only evaluate the spline where it is defined, i.e. within [min(x), max(x)].
    min_x, max_x = min(x), max(x)
    splineX = [dis for dis in sorted(mainDic) if min_x <= dis <= max_x]
    splineY = ius(splineX)

    # Isotonic (antitonic) regression makes the fitted values non-increasing
    # with increasing distance.
    ir = IsotonicRegression(increasing=False)
    newSplineY = list(ir.fit_transform(splineX, splineY))

    residual = sum([i * i for i in (y - ius(x))])

    ### Now plot the results
    plt.clf()
    fig = plt.figure()
    ax = fig.add_subplot(2, 1, 1)
    plt.title(
        'Univariate spline fit to the output of equal occupancy binning. \n Residual= %e'
        % (residual),
        size='small')
    plt.plot([i / 1000.0 for i in x], [i * 100000 for i in y],
             'ro',
             label="Means")
    plt.plot([i / 1000.0 for i in splineX], [i * 100000 for i in newSplineY],
             'g-',
             label="Spline fit")

    plt.ylabel('Probability (1e-5)')
    plt.xlabel('Genomic distance (kb)')
    plt.xlim([min_x / 1000.0, max_x / 1000.0])
    ax.legend(loc="upper right")

    ax = fig.add_subplot(2, 1, 2)
    plt.loglog(splineX, newSplineY, 'g-')
    plt.loglog(x, y, 'r.')  # Data

    plt.ylabel('Probability (log scale)')
    plt.xlabel('Genomic distance (log scale)')
    plt.xlim([min_x, max_x])
    plt.savefig(outfilename + '.res' + str(resolution) + '.png')
    sys.stderr.write("Plotting %s" % outfilename + ".png\n")

    if verbose:
        print("lower bound on mid-range distances  " + repr(min_dist) +
              ", upper bound on mid-range distances  " + repr(max_dist) +
              "\n")

    significances_path = '{}.res{}.significances.txt.gz'.format(
        outfilename, resolution)
    last_index = len(splineX) - 1

    # Text modes are required: in binary mode Python 3 yields bytes lines and
    # rejects the str rows written below.
    with gzip.open(infilename, 'rt') as infile, \
            gzip.open(significances_path, 'wt') as outfile:
        outfile.write(
            "chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\n"
        )

        for line in infile:
            chr1, mid1, chr2, mid2, contactCount = line.rstrip().split()
            mid1, mid2, contactCount = int(mid1), int(mid2), int(contactCount)
            distance = mid2 - mid1

            if not min_dist <= distance <= max_dist:
                continue

            # Biases default to 1.0 (no correction) unless biasDic has a value.
            bias1 = 1.0
            bias2 = 1.0
            if biasDic:
                if chr1 in biasDic and mid1 in biasDic[chr1]:
                    bias1 = biasDic[chr1][mid1]
                if chr2 in biasDic and mid2 in biasDic[chr2]:
                    bias2 = biasDic[chr2][mid2]

            # make sure the interaction distance is covered by the probability bins
            distToLookUp = min(max(distance, min_x), max_x)
            i = min(bisect.bisect_left(splineX, distToLookUp), last_index)
            prior_p = newSplineY[i] * (bias1 * bias2)
            p_val = scsp.bdtrc(contactCount - 1, observedIntraInRangeSum,
                               prior_p)

            # NaN fails this comparison, so undefined p-values are skipped.
            # No multiple-testing correction is done here; q-value is -1.
            if p_val <= 1:
                outfile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(
                    chr1, mid1, chr2, mid2, contactCount, p_val, -1))

    return splineX, newSplineY, residual
示例#22
0
 def test_value(self):
     val = sc.bdtrc(0, 1, 0.5)
     assert_allclose(val, 0.5)
示例#23
0
 def _sf(self, x, n, p):
     k = floor(x)
     return special.bdtrc(k, n, p)
示例#24
0
 def test_sum_is_one(self):
     val = sc.bdtrc([0, 1, 2], 2, 0.5)
     assert_array_equal(val, [0.75, 0.25, 0.0])
示例#25
0
def fit_Spline(mainDic, x, y, yerr, infilename, outfilename, biasDic,
               outliersline, outliersdist, observedIntraInRangeSum,
               possibleIntraInRangeCount, possibleInterAllCount,
               observedIntraAllSum, observedInterAllSum, resolution, passNo):
    """Fit a non-increasing spline, score every interaction and write results.

    Unless the module-level ``interOnly`` flag is set, a ``UnivariateSpline``
    is fitted to ``(x, y)``, evaluated at the keys of ``mainDic`` that fall in
    ``[min(x), max(x)]`` and made non-increasing with an isotonic regression.
    Every line of the gzipped input is then given a p-value via
    ``scsp.bdtrc``, the p-values are Benjamini-Hochberg corrected, and the
    rows selected by ``allReg`` / ``interOnly`` are written to a gzipped
    significance file.  Lines whose p-value is below ``1 / <number of tests>``
    are recorded in ``outliersline`` / ``outliersdist``.

    Besides its parameters the function reads these names from module scope:
    ``logfile``, ``interOnly``, ``allReg``, ``visual``, ``distLowThres``,
    ``distUpThres``, ``interChrProb``, ``toKb``, ``toProb``, ``myUtils``,
    ``myStats`` and ``plot_qvalues``.

    Args:
        mainDic: Iterable (e.g. dict) whose keys are the distances at which
            the spline is evaluated.
        x: List of bin distances; sorted in place when ``outliersdist`` is
            not None.  Must be strictly increasing afterwards.
        y: Values fitted against ``x``.
        yerr: Error bars used only for plotting.
        infilename: Path to a gzip text file with five whitespace-separated
            fields per line: chr1, mid1, chr2, mid2, contactCount.
        outfilename: Prefix for the plot and significance files.
        biasDic: Optional ``{chromosome: {midpoint: bias}}`` mapping; missing
            entries default to 1.0.
        outliersline: Set that receives the zero-based line indices of
            outlier rows (mutated in place).
        outliersdist: Set that receives ``abs(mid1 - mid2)`` of outlier rows
            (mutated in place).
        observedIntraInRangeSum: Number of trials passed to ``bdtrc`` for
            in-range intra-chromosomal rows.
        possibleIntraInRangeCount: Test count used for the FDR correction.
        possibleInterAllCount: Test count used for the FDR correction.
        observedIntraAllSum: Unused; kept for interface compatibility.
        observedInterAllSum: Number of trials passed to ``bdtrc`` for
            inter-chromosomal rows.
        resolution: When truthy, embedded in the significance file name.
        passNo: Pass number shown in the plot legend.

    Returns:
        List ``[splineX, newSplineY, residual, outliersline, outliersdist,
        FDRx, FDRy]``; the first three are None when ``interOnly`` is set.

    Exits the process with status 2 when ``x`` is not strictly increasing.
    """
    with open(logfile, 'a') as log:
        log.write("\nFitting a univariate spline to the probability means\n")
        log.write(
            "------------------------------------------------------------------------------------\n"
        )

    splineX = None
    newSplineY = None
    residual = None
    # Bounds of the fitted range; only defined (and only used) when a spline
    # is actually fitted, i.e. when interOnly is false.
    min_x = None
    max_x = None

    if not interOnly:
        if outliersdist is not None:
            y = [f for _, f in sorted(zip(x, y), key=lambda pair: pair[0])]
            x.sort()
        for i in range(1, len(x)):
            if x[i] <= x[i - 1]:
                print(
                    "ERROR in spline fitting. Distances do not strictly increase across bins. Ensure interaction file is correct."
                )
                print("Avg. distance of bin(i-1)... %s" % x[i - 1])
                print("Avg. distance of bin(i)... %s" % x[i])
                sys.exit(2)

        # maximum residual allowed for spline is set to min(y)^2
        splineError = min(y) * min(y)

        # use fitpack2 method -fit on the real x and y from equal occupancy binning
        ius = UnivariateSpline(x, y, s=splineError)
        min_x = min(x)
        max_x = max(x)
        # Keep only distances inside [min(x), max(x)], where the spline is defined.
        splineX = [dis for dis in sorted(mainDic) if min_x <= dis <= max_x]
        splineY = ius(splineX)

        # Force the fitted probabilities to be non-increasing with distance.
        ir = IsotonicRegression(increasing=False)
        newSplineY = ir.fit_transform(splineX, splineY)
        residual = sum([i * i for i in (y - ius(x))])

        if visual == True:
            print("Plotting %s" % (outfilename + ".png"))
            plt.clf()
            fig = plt.figure()
            ax = fig.add_subplot(2, 1, 1)
            plt.plot(myUtils.scale_a_list(splineX, toKb),
                     myUtils.scale_a_list(newSplineY, toProb),
                     'g-',
                     label="spline-" + str(passNo),
                     linewidth=2)
            plt.errorbar(myUtils.scale_a_list(x, toKb),
                         myUtils.scale_a_list(y, toProb),
                         myUtils.scale_a_list(yerr, toProb),
                         fmt='r.',
                         label="Mean with std. error",
                         linewidth=2)

            plt.ylabel('Contact probability (x10$^{-5}$)')
            plt.xlabel('Genomic distance (kb)')
            if distLowThres > 0 and distUpThres < float("inf"):
                plt.xlim(
                    myUtils.scale_a_list([distLowThres, distUpThres], toKb))
            plt.gca().yaxis.set_major_locator(MaxNLocator(nbins=3, prune=None))
            ax.legend(loc="upper right")

            ax = fig.add_subplot(2, 1, 2)

            plt.loglog(splineX, newSplineY, 'g-')
            plt.errorbar(x, y, yerr=yerr, fmt='r.')  # Data
            if distLowThres > 0 and distUpThres < float("inf"):
                plt.xlim([distLowThres, distUpThres])
            plt.ylabel('Contact probability (log-scale)')
            plt.xlabel('Genomic distance (log-scale)')

            plt.savefig(outfilename + '.png')

    # First pass over the input: compute one p-value (and remember both
    # biases) per line, in file order.
    p_vals = []
    biasl = []
    biasr = []
    with gzip.open(infilename, 'rt') as infile:
        for line in infile:
            ch1, mid1, ch2, mid2, contactCount = line.rstrip().split()
            contactCount = float(contactCount)
            mid1 = int(mid1)
            mid2 = int(mid2)
            interxn = myUtils.Interaction([ch1, mid1, ch2, mid2])
            interxn.setCount(contactCount)
            interactionType = interxn.getType(distLowThres, distUpThres)

            # Biases default to 1.0 (no correction) unless biasDic has a value.
            bias1 = 1.0
            bias2 = 1.0
            if biasDic:
                if ch1 in biasDic and mid1 in biasDic[ch1]:
                    bias1 = biasDic[ch1][mid1]
                if ch2 in biasDic and mid2 in biasDic[ch2]:
                    bias2 = biasDic[ch2][mid2]
            biasl.append(bias1)
            biasr.append(bias2)

            if (bias1 < 0 or bias2 < 0) and interactionType != 'inter':
                # Negative bias marks a discarded intra-chromosomal locus.
                p_val = 1.0
            elif interactionType == 'intraInRange' and not interOnly:
                # Clamp the distance into the fitted range before the lookup.
                distToLookUp = min(max(interxn.getDistance(), min_x), max_x)
                i = min(bisect.bisect_left(splineX, distToLookUp),
                        len(splineX) - 1)
                prior_p = newSplineY[i] * (bias1 * bias2)
                p_val = scsp.bdtrc(interxn.getCount() - 1,
                                   observedIntraInRangeSum, prior_p)
            elif interactionType in ('intraShort',
                                     'intraLong') and not interOnly:
                # Out-of-range intra-chromosomal pairs are never significant.
                p_val = 1.0
            elif allReg or interOnly:
                prior_p = interChrProb * (bias1 * bias2)
                p_val = scsp.bdtrc(interxn.getCount() - 1, observedInterAllSum,
                                   prior_p)
            else:
                p_val = 1.0
            p_vals.append(p_val)

    # Do the BH FDR correction; the number of tests depends on which
    # regions are being analysed.
    if allReg:
        testCount = possibleIntraInRangeCount + possibleInterAllCount
        outlierThres = 1.0 / testCount
        q_vals = myStats.benjamini_hochberg_correction(
            p_vals, possibleInterAllCount + possibleIntraInRangeCount)
    elif interOnly:
        outlierThres = 1.0 / possibleInterAllCount
        q_vals = myStats.benjamini_hochberg_correction(p_vals,
                                                       possibleInterAllCount)
    else:
        outlierThres = 1.0 / possibleIntraInRangeCount
        q_vals = myStats.benjamini_hochberg_correction(
            p_vals, possibleIntraInRangeCount)
    print("Outlier threshold is... %s" % (outlierThres))

    # Second pass: re-read the input and write each selected row together
    # with its p-value, q-value and biases.
    if resolution:
        significances_path = (outfilename + '.res' + str(resolution) +
                              '.significances.txt.gz')
    else:
        significances_path = outfilename + '.significances.txt.gz'
    print("Writing p-values and q-values to file %s" %
          (outfilename + ".significances.txt"))

    row_format = "%s\t%d\t%s\t%d\t%d\t%e\t%e\t%e\t%e\n"
    with gzip.open(infilename, 'rt') as infile, \
            gzip.open(significances_path, 'wt') as outfile:
        outfile.write(
            "chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\tbias1\tbias2\n"
        )
        for count, line in enumerate(infile):
            words = line.rstrip().split()
            chr1 = words[0]
            midPoint1 = int(words[1])
            chr2 = words[2]
            midPoint2 = int(words[3])
            interactionCount = float(words[4])
            p_val = p_vals[count]
            q_val = q_vals[count]
            interactionDistance = abs(midPoint1 - midPoint2)

            if chr1 != chr2:
                write_row = allReg or interOnly
            else:
                write_row = (allReg or not interOnly) and \
                    myUtils.in_range_check(interactionDistance, distLowThres,
                                           distUpThres)
            if write_row:
                outfile.write(row_format %
                              (str(chr1), midPoint1, str(chr2), midPoint2,
                               interactionCount, p_val, q_val, biasl[count],
                               biasr[count]))

            if p_val < outlierThres:
                outliersline.add(count)
                outliersdist.add(interactionDistance)

    if visual == True:
        print("Plotting q-values to file %s" % outfilename + ".qplot.png")
    minFDR = 0.0
    maxFDR = 0.05
    increment = 0.001
    FDRx, FDRy = plot_qvalues(q_vals, minFDR, maxFDR, increment,
                              outfilename + ".qplot")

    with open(logfile, 'a') as log:
        log.write("Spline successfully fit\n")
        log.write("\n")
        log.write("\n")

    return [
        splineX, newSplineY, residual, outliersline, outliersdist, FDRx, FDRy
    ]  # from fit_Spline
示例#26
0
 def test_rounding(self):
     double_val = sc.bdtrc([0.1, 1.1, 2.1], 2, 0.5)
     int_val = sc.bdtrc([0, 1, 2], 2, 0.5)
     assert_array_equal(double_val, int_val)
 def _sf(self, x, n, p):
     k = floor(x)
     return special.bdtrc(k, n, p)
示例#28
0
def fit_spline(mainDic, x, y, yerr, infilename, outfilename, biasDic, resolution, min_dist, max_dist):
    """Fit a non-increasing spline to (x, y) and write p- and q-values.

    A ``UnivariateSpline`` is fitted to ``(x, y)``, evaluated at every key of
    ``mainDic`` inside ``[min(x), max(x)]`` and made non-increasing with an
    isotonic regression.  The fit is plotted to
    ``<outfilename>.res<resolution>.png``.  Every line of the gzipped input
    then gets a p-value from ``scsp.bdtrc``; the p-values are
    Benjamini-Hochberg corrected and all rows are written to
    ``<outfilename>.res<resolution>.significances.txt.gz``.

    These names are not parameters and are read from the enclosing (module)
    scope: ``observedIntraInRangeSum``, ``observedIntraAllSum``,
    ``observedInterAllSum``, ``baselineIntraChrProb``, ``interChrProb``,
    ``possibleInterAllCount``, ``possibleIntraAllCount`` and
    ``benjamini_hochberg_correction``.

    Args:
        mainDic: Iterable (e.g. dict) whose keys are the distances at which
            the fitted spline is evaluated.
        x: Distances the spline is fitted on.
        y: Values the spline is fitted on; must support ``y - ndarray``.
        yerr: Unused; kept for interface compatibility.
        infilename: Path to a gzip file whose lines hold five
            whitespace-separated fields: chr1, mid1, chr2, mid2, contactCount.
        outfilename: Prefix for the plot and the significance file.
        biasDic: ``{chromosome: {midpoint: bias}}`` mapping; missing entries
            default to 1.0 and a bias of -1 forces a p-value of 1.0.
        resolution: Value embedded in the output file names.
        min_dist: Exclusive lower bound on ``mid2 - mid1`` for the spline
            prior; -1 disables the bound.
        max_dist: Inclusive upper bound on ``mid2 - mid1`` for the spline
            prior; -1 disables the bound.

    Returns:
        Tuple ``(splineX, newSplineY, residual)``: the evaluation distances,
        the non-increasing fitted values (list) and the sum of squared
        residuals of the raw spline on ``(x, y)``.
    """
    print("\nFit a univariate spline to the probability means\n")
    print("------------------------------------------------------------------------------------\n")

    # maximum residual allowed for spline is set to min(y)^2
    splineError = min(y)**2

    # use fitpack2 method -fit on the real x and y from equal occupancy binning
    ius = UnivariateSpline(x, y, s=splineError)

    # Only evaluate the spline where it is defined, i.e. within [min(x), max(x)].
    min_x, max_x = min(x), max(x)
    splineX = [dis for dis in sorted(mainDic) if min_x <= dis <= max_x]
    splineY = ius(splineX)

    # Isotonic (antitonic) regression makes the fitted values non-increasing
    # with increasing distance.
    ir = IsotonicRegression(increasing=False)
    newSplineY = list(ir.fit_transform(splineX, splineY))

    residual = sum([i*i for i in (y - ius(x))])

    ### Now plot the results
    plt.clf()
    fig = plt.figure()
    ax = fig.add_subplot(2,1,1)
    plt.title('Univariate spline fit to the output of equal occupancy binning. \n Residual= %e' % (residual),size='small')
    plt.plot([i/1000.0 for i in x], [i*100000 for i in y], 'ro', label="Means")
    plt.plot([i/1000.0 for i in splineX], [i*100000 for i in newSplineY],'g-',label="Spline fit")

    plt.ylabel('Probability (1e-5)')
    plt.xlabel('Genomic distance (kb)')
    plt.xlim([min_x/1000.0, max_x/1000.0])
    ax.legend(loc="upper right")

    ax = fig.add_subplot(2,1,2)
    plt.loglog(splineX,newSplineY,'g-')
    plt.loglog(x, y, 'r.')  # Data

    plt.ylabel('Probability (log scale)')
    plt.xlabel('Genomic distance (log scale)')
    plt.xlim([min_x, max_x])
    plt.savefig(outfilename+'.res'+str(resolution)+'.png')
    sys.stderr.write("Plotting %s" % outfilename + ".png\n")

    print("lower bound on mid-range distances  "+ repr(min_dist) + ", upper bound on mid-range distances  " + repr(max_dist) +"\n")

    # First pass: one p-value per input line, in file order.
    # Text mode is required: binary mode yields bytes lines on Python 3.
    p_vals = []
    last_index = len(splineX) - 1
    with gzip.open(infilename, 'rt') as infile:
        for line in infile:
            chr1, mid1, chr2, mid2, contactCount = line.rstrip().split()
            mid1 = int(mid1)
            mid2 = int(mid2)
            contactCount = int(contactCount)
            distance = mid2 - mid1

            # Biases default to 1.0 (no correction) unless biasDic has a value.
            bias1 = 1.0
            bias2 = 1.0
            if len(biasDic) > 0:
                if chr1 in biasDic and mid1 in biasDic[chr1]:
                    bias1 = biasDic[chr1][mid1]
                if chr2 in biasDic and mid2 in biasDic[chr2]:
                    bias2 = biasDic[chr2][mid2]

            if bias1 == -1 or bias2 == -1:
                # A bias of -1 marks a discarded locus.
                p_val = 1.0
            elif chr1 == chr2:
                above_min = min_dist == -1 or (min_dist > -1 and distance > min_dist)
                below_max = max_dist == -1 or (max_dist > -1 and distance <= max_dist)
                if above_min and below_max:
                    # make sure the interaction distance is covered by the probability bins
                    distToLookUp = min(max(distance, min_x), max_x)
                    i = min(bisect.bisect_left(splineX, distToLookUp), last_index)
                    prior_p = newSplineY[i] * (bias1 * bias2)
                    p_val = scsp.bdtrc(contactCount-1, observedIntraInRangeSum, prior_p)
                elif min_dist > -1 and distance <= min_dist:
                    # Very proximal pairs are never significant.
                    p_val = 1.0
                elif max_dist > -1 and distance > max_dist:
                    # Out-of-range distance: use the baseline intra-chr prior.
                    prior_p = baselineIntraChrProb * (bias1 * bias2)
                    p_val = scsp.bdtrc(contactCount-1, observedIntraAllSum, prior_p)
                else:
                    # Only reachable with a threshold below -1; never reuse
                    # the p-value of the previous line.
                    p_val = 1.0
            else:
                # Inter-chromosomal pair; the count passed must be contactCount-1.
                prior_p = interChrProb*(bias1*bias2)
                p_val = scsp.bdtrc(contactCount-1, observedInterAllSum, prior_p)

            p_vals.append(p_val)

    # Do the BH FDR correction
    q_vals = benjamini_hochberg_correction(p_vals, possibleInterAllCount+possibleIntraAllCount)

    print("Writing p-values and q-values to file %s" % outfilename + ".significances.txt\n")
    print("Number of pairs discarded due to bias not in range [0.5 2]\n")

    # Second pass: echo every input row followed by its p- and q-value.
    significances_path = outfilename+'.res'+str(resolution)+'.significances.txt.gz'
    with gzip.open(infilename, 'rt') as infile, \
            gzip.open(significances_path, 'wt') as outfile:
        outfile.write("chr1\tfragmentMid1\tchr2\tfragmentMid2\tcontactCount\tp-value\tq-value\n")
        for i, line in enumerate(infile):
            p_val, q_val = p_vals[i], q_vals[i]
            chr1, mid1, chr2, mid2, contactCount = line.rstrip().split()
            outfile.write("{}\t{}\t{}\t{}\t{}\t{}\t{}\n".format(chr1, mid1, chr2, mid2, contactCount, p_val, q_val))

    return splineX, newSplineY, residual