Exemplo n.º 1
0
    def summarize(self,
                  interval,
                  bins=None,
                  method='summarize',
                  function='mean'):
        """
        Return the bigWig signal over `interval`.

        Parameters
        ----------
        interval : object exposing `chrom`, `start` and `stop`
        bins : int or None
            Number of bins.  When None (or when `method` is
            'get_as_array') the per-base array from
            `BigWigFile.get_as_array` is returned instead of a binned one.
        method : str
            'get_as_array', 'ucsc_summarize' (delegates to
            `self.ucsc_summarize`) or anything else for
            `BigWigFile.summarize`.
        function : str
            One of 'sum', 'mean', 'min', 'max', 'std' for the default
            method.  Note that 'std' yields `sum_squares / valid_count`,
            exactly as computed below.  An unrecognised value leaves the
            raw summary object returned by `BigWigFile.summarize` as the
            result.

        Raises
        ------
        ValueError
            If `method` is 'ucsc_summarize' and `function` is not one of
            'mean', 'min', 'max', 'std', 'coverage'.
        """
        # Bins without coverage produce 0/0, which NumPy reports as an
        # "invalid" floating point operation.  That is expected here, so the
        # warning is silenced only for the duration of this call; errstate
        # restores the previous setting on every exit path (return or raise).
        with np.errstate(invalid='ignore'):
            if (bins is None) or (method == 'get_as_array'):
                with open(self.fn, 'rb') as handle:
                    s = BigWigFile(handle).get_as_array(
                        interval.chrom,
                        interval.start,
                        interval.stop,
                    )
                if s is None:
                    s = np.zeros((interval.stop - interval.start, ))
                else:
                    s[np.isnan(s)] = 0

            elif method == 'ucsc_summarize':
                if function not in ('mean', 'min', 'max', 'std', 'coverage'):
                    raise ValueError(
                        'function "%s" not supported by UCSC\'s '
                        'bigWigSummary' % function)
                return self.ucsc_summarize(interval, bins, function=function)

            else:
                with open(self.fn, 'rb') as handle:
                    summary = BigWigFile(handle).summarize(
                        interval.chrom, interval.start, interval.stop, bins)
                if summary is None:
                    s = np.zeros((bins, ))
                elif function == 'sum':
                    s = summary.sum_data
                elif function == 'mean':
                    s = summary.sum_data / summary.valid_count
                    s[np.isnan(s)] = 0
                elif function == 'min':
                    s = summary.min_val
                    s[np.isinf(s)] = 0
                elif function == 'max':
                    s = summary.max_val
                    s[np.isinf(s)] = 0
                elif function == 'std':
                    s = summary.sum_squares / summary.valid_count
                    s[np.isnan(s)] = 0
                else:
                    # Unknown function: hand back the raw summary object.
                    s = summary

        return s
Exemplo n.º 2
0
def scan_fp(plusdnase, minusdnase, bed, out, upstream, downstream):
    """
    Append per-base plus and minus strand signal to each BED record.

    For every line of `bed` the window
    [start - upstream, end + downstream) is summarized with one bin per
    base from both bigWig files.  The first six columns of the record
    followed by the plus values and then the minus values are written,
    tab-separated, to `out`.

    Records are skipped when their chromosome is not listed in
    `plusdnase` or when `start` is smaller than `upstream`.
    """
    chrom_len = {}
    for node in BwIO(plusdnase).chromosomeTree['nodes']:
        chrom_len[node['key']] = node['chromSize']

    with open(plusdnase, 'rb') as plus_fh, \
            open(minusdnase, 'rb') as minus_fh, \
            open(bed) as inf, \
            open(out, 'w') as outf:
        bwHandle1 = BigWigFile(plus_fh)
        bwHandle2 = BigWigFile(minus_fh)
        for line in inf:
            ll = line.split()
            if ll[0] not in chrom_len:
                continue
            start = int(ll[1])
            if start < upstream:
                continue
            region_start = start - upstream
            region_end = int(ll[2]) + downstream
            # One bin per base pair of the extended window.
            nbins = region_end - region_start
            signal1 = bwHandle1.summarize(ll[0], region_start, region_end,
                                          nbins)
            signal2 = bwHandle2.summarize(ll[0], region_start, region_end,
                                          nbins)
            newll = (ll[:6] + [str(v) for v in signal1.sum_data] +
                     [str(v) for v in signal2.sum_data])
            outf.write("\t".join(newll) + "\n")
Exemplo n.º 3
0
def get_signal(inputfile, output, vp, vm, dp, dm):
    """
    Append total signal from four bigWig files to each input record.

    A fixed header line is written first.  For every record of
    `inputfile` (lines starting with "chrom" are treated as headers and
    skipped, as are chromosomes absent from `vp`) the window
    [start - 50, end + 50) is summarized into a single bin for `vp`,
    `vm`, `dp` and `dm`, in that order, and the four sums are appended to
    the record before it is written to `output`.
    """
    chrom_len = {}
    for node in BwIO(vp).chromosomeTree['nodes']:
        chrom_len[node['key']] = node['chromSize']

    colnames = [
        "chrom", "start", "end", "seq", "motifscore", "strand",
        "LncapARsignal", "LncapDNaseCutsite", "LncapDNaseFrag",
        "K562DNaseFrag", "LncapFP", "K562FP", "overARpeak", "VehPlus",
        "VehMinus", "DHTPlus", "DHTMinus"
    ]

    with open(vp, 'rb') as vp_fh, open(vm, 'rb') as vm_fh, \
            open(dp, 'rb') as dp_fh, open(dm, 'rb') as dm_fh, \
            open(inputfile) as inf, open(output, 'w') as outf:
        # Order matters: it defines the order of the appended columns.
        handles = [BigWigFile(fh) for fh in (vp_fh, vm_fh, dp_fh, dm_fh)]
        outf.write("\t".join(colnames) + "\n")
        for line in inf:
            if line.startswith("chrom"):
                continue
            ll = line.split()
            chrom = ll[0]
            if chrom not in chrom_len:
                continue
            region_start = int(ll[1]) - 50
            region_end = int(ll[2]) + 50
            for bw in handles:
                signal = bw.summarize(chrom, region_start, region_end, 1)
                # sum_data holds exactly one bin here; index it explicitly
                # instead of converting a whole array to a scalar.
                ll.append(str(float(signal.sum_data[0])))
            outf.write("\t".join(ll) + "\n")
Exemplo n.º 4
0
def getsignal(inputfile,outputfile,pcut,DHT,Veh,pspan):
    """
    Append TC, FOS and two tag counts to every record of `inputfile`.

    The motif length `ml` is taken from the first record and reused for
    all records.  For each record a window of `2 * pspan` bases centred
    on `start + ml // 2` is read per base from `pcut`:

    * TC  - sum over the whole window
    * C   - sum over the `ml` central bases
    * L/R - sums over the `ml` bases left / right of the centre block
    * FOS - -((C + 1) / (R + 1) + (C + 1) / (L + 1))

    The same window is summarized into two bins from `DHT` and `Veh`;
    their totals plus one are appended as well.
    """
    with open(pcut, 'rb') as pcut_fh, open(DHT, 'rb') as dht_fh, \
            open(Veh, 'rb') as veh_fh, open(inputfile) as inf:
        pcutbw = BigWigFile(pcut_fh)
        dht = BigWigFile(dht_fh)
        veh = BigWigFile(veh_fh)

        # Peek at the first record to learn the motif length, then rewind.
        testll = inf.readline().split()
        ml = int(testll[2]) - int(testll[1])
        # Floor division keeps the slice bounds below integral on every
        # Python version.
        half = ml // 2
        inf.seek(0)

        with open(outputfile, 'w') as outf:
            for line in inf:
                ll = line.split()
                region_start = int(ll[1]) + half - pspan
                region_end = int(ll[1]) + half + pspan
                cut = list(
                    pcutbw.summarize(ll[0], region_start, region_end,
                                     2 * pspan).sum_data)
                TC = sum(cut)
                # Offset of the motif inside the fetched window.
                motif_from = pspan - half
                C = sum(cut[motif_from:(motif_from + ml)])
                L = sum(cut[(motif_from - ml):motif_from])
                R = sum(cut[(motif_from + ml):(motif_from + 2 * ml)])
                FOS = -1 * ((C + 1) / (R + 1) + (C + 1) / (L + 1))
                dhtnum = sum(
                    list(
                        dht.summarize(ll[0], region_start, region_end,
                                      2).sum_data)) + 1
                vehnum = sum(
                    list(
                        veh.summarize(ll[0], region_start, region_end,
                                      2).sum_data)) + 1
                newll = ll + [TC, FOS, dhtnum, vehnum]
                outf.write("\t".join(map(str, newll)) + "\n")
def get_regionLevel_simplex_parameters(inputbed, outputbed, plusbw, minusbw,
                                       biasmat, ext, genome2bit):
    """
    Append cut-weighted bias parameter sums to every region of `inputbed`.

    Each region is re-centred and extended by `ext` on both sides.  For
    every position with a positive cut count the surrounding
    `2 * flank`-mer is looked up in "permuteSeq8mer.txt", converted with
    `seq2biasParm`, multiplied by the cut count and accumulated - once for
    the plus strand and once for the minus strand (reverse-complemented
    sequence shifted by one base).  The two accumulated vectors are
    appended to the record and written to `outputbed`.

    Regions for which either bigWig file returns no summary are skipped.
    """
    simplex_code = encoding()
    biasdict, flank = readBG(biasmat)
    # Only B is used below; the remaining values are unpacked to keep the
    # expected 4-tuple shape of paramest() checked.
    B, B0, B1, B2 = paramest(biasdict)

    permuteSeq = {}
    with open("permuteSeq8mer.txt") as permute_fh:
        for line in permute_fh:
            ll = line.split()
            permuteSeq[ll[0]] = ll[1]

    genome = twobitreader.TwoBitFile(genome2bit)
    kmer_len = 2 * flank

    with open(plusbw, 'rb') as plus_fh, open(minusbw, 'rb') as minus_fh, \
            open(inputbed) as inf, open(outputbed, 'w') as outf:
        plusBWH = BigWigFile(plus_fh)
        minusBWH = BigWigFile(minus_fh)

        for line in inf:
            ll = line.split()
            chrm = ll[0]
            # Floor division: the centre must stay an integer coordinate.
            center = (int(ll[1]) + int(ll[2])) // 2
            start = max(0, center - ext)
            end = center + ext

            plus_summary = plusBWH.summarize(chrm, start, end, end - start)
            minus_summary = minusBWH.summarize(chrm, start, end, end - start)
            # The summary object itself has to be tested, before
            # dereferencing .sum_data.
            if plus_summary is None or minus_summary is None:
                continue
            plusSig = plus_summary.sum_data
            minusSig = minus_summary.sum_data

            plusSequence = genome[chrm][(start - flank):(end + flank)].upper()
            minusSequence = genome[chrm][(start - flank + 1):(end + flank +
                                                              1)].upper()
            plus_data = numpy.zeros(len(B))
            minus_data = numpy.zeros(len(B))

            for i, pcuts in enumerate(plusSig):
                if pcuts > 0:
                    pseq = plusSequence[i:(i + kmer_len)]
                    if "N" not in pseq:
                        p_out = seq2biasParm(permuteSeq[pseq], B,
                                             simplex_code)
                        plus_data += pcuts * p_out

            for i, mcuts in enumerate(minusSig):
                if mcuts > 0:
                    tmpseq = minusSequence[i:(i + kmer_len)]
                    if "N" not in tmpseq:
                        mseq = revcomp(tmpseq).upper()
                        m_out = seq2biasParm(permuteSeq[mseq], B,
                                             simplex_code)
                        minus_data += mcuts * m_out

            newll = ll + list(plus_data) + list(minus_data)
            outf.write("\t".join(map(str, newll)) + "\n")
Exemplo n.º 6
0
def getsignal(inputfile,outputfile,BGmatrix,pcut,ncut,Ipcut,Incut,pspan,tspan,gen,left,right,fetch_length=100):
    """
    Write cut profiles, TC/FOS and background bias values per motif site.

    The motif length is taken from the first record of `inputfile` and
    `pspan` is reduced by half of it.  For every record the four bigWig
    files are queried through `make_cut`, the genomic sequence around the
    site is converted into per-position background values via the two
    tables returned by `readBG(BGmatrix)`, and `makeTCFOS` supplies TC and
    FOS.  Plus/minus quantities are swapped (and the background lists
    reversed) for records on the "-" strand.

    Records are skipped when the (strand-adjusted) plus cut profile is
    'NA' or when the fetched sequence contains an N.
    """
    genome = twobitreader.TwoBitFile(gen)

    with open(pcut, 'rb') as p_fh, open(ncut, 'rb') as n_fh, \
            open(Ipcut, 'rb') as ip_fh, open(Incut, 'rb') as in_fh, \
            open(inputfile) as inf:
        pcutbw = BigWigFile(p_fh)
        ncutbw = BigWigFile(n_fh)
        Ipcutbw = BigWigFile(ip_fh)
        Incutbw = BigWigFile(in_fh)

        # Peek at the first record to learn the motif length, then rewind.
        testll = inf.readline().split()
        ml = int(testll[2]) - int(testll[1])
        # Floor division keeps pspan usable as a coordinate offset.
        pspan = pspan - ml // 2
        inf.seek(0)

        pBG, nBG = readBG(BGmatrix)
        window = left + right

        with open(outputfile, 'w') as outf:
            for line in inf:
                ll = line.split()

                chrom = ll[0]
                start = int(ll[1])
                end = int(ll[2])
                strand = ll[5]
                seq = genome[chrom][(start - pspan - left):(end + pspan +
                                                            right)]
                pout = make_cut(pcutbw, ll, pspan, fetch_length)
                nout = make_cut(ncutbw, ll, pspan, fetch_length)
                Ipout = make_cut(Ipcutbw, ll, pspan, fetch_length)
                Inout = make_cut(Incutbw, ll, pspan, fetch_length)

                if strand == "-":
                    pout, nout = nout, pout
                    Ipout, Inout = Inout, Ipout
                if pout == 'NA':
                    continue

                if 'N' in seq.upper():
                    continue

                # The minus-strand sequence is shifted by one base.
                pseq = seq[:-1]
                nseq = seq[1:]
                positions = range(len(pseq) + 1 - window)
                p = [pBG[pseq[k:k + window].upper()] for k in positions]
                n = [nBG[nseq[k:k + window].upper()] for k in positions]

                if strand != '-':
                    pbglist = p
                    nbglist = n
                else:
                    pbglist = n[::-1]
                    nbglist = p[::-1]

                TC, FOS = makeTCFOS(pcutbw, ncutbw, ll, tspan, ml)
                newll = (ll + [TC, FOS] + pout + nout + Ipout + Inout +
                         pbglist + nbglist)
                outf.write("\t".join(map(str, newll)) + "\n")
def get_signal(inputfile, output, bwfiles, bwfolder, extend):
    """
    Append a 60-bin signal profile per bigWig file to each BED record.

    `bwfiles` is a comma-separated list of bigWig names; names that do
    not start with '/' are looked up inside `bwfolder` (default "./").
    For every record three stretches are summarized into 20 bins each:
    `extend` bases upstream of the start, the region itself, and `extend`
    bases downstream of the end.  Bin sums are divided by the nominal bin
    length.  The 60 values are reversed for records whose sixth column is
    "-".  When a summary is unavailable or fails, 60 zeros are used.

    Records whose chromosome name contains "_" are skipped.
    """
    signalbw = bwfiles.strip().strip(',').split(',')

    if not bwfolder:
        bwfolder = "./"
    if not bwfolder.endswith('/'):
        bwfolder += '/'

    nbins = 20
    bw_files = []
    try:
        for sb in signalbw:
            path = sb if sb.startswith('/') else bwfolder + sb
            bw_files.append(open(path, 'rb'))
        bwHs = [BigWigFile(fh) for fh in bw_files]

        with open(inputfile) as inf, open(output, 'w') as outf:
            for line in inf:
                ll = line.split()
                if "_" in ll[0]:
                    continue
                S = int(ll[1])
                E = int(ll[2])
                # Decide the orientation once, before ll grows.
                minus_strand = len(ll) >= 6 and ll[5] == "-"
                binlen_flank = extend * 1.0 / nbins
                binlen_body = (E - S) * 1.0 / nbins

                for bwHandle in bwHs:
                    try:
                        signal1 = bwHandle.summarize(ll[0],
                                                     max(0, S - extend), S,
                                                     nbins)
                        signal2 = bwHandle.summarize(ll[0], S, E, nbins)
                        signal3 = bwHandle.summarize(ll[0], E, E + extend,
                                                     nbins)
                        if (signal1 is None or signal2 is None
                                or signal3 is None):
                            addsig = [0] * (3 * nbins)
                        else:
                            addsig = (list(signal1.sum_data / binlen_flank) +
                                      list(signal2.sum_data / binlen_body) +
                                      list(signal3.sum_data / binlen_flank))
                    except Exception:
                        # Best effort: a failing query yields an all-zero
                        # profile instead of aborting the whole run.
                        addsig = [0] * (3 * nbins)

                    if minus_strand:
                        ll.extend(addsig[::-1])
                    else:
                        ll.extend(addsig)
                outf.write("\t".join(map(str, ll)) + "\n")
    finally:
        for fh in bw_files:
            fh.close()
Exemplo n.º 8
0
def count_cut_nmers(fp, w_plus, w_minus, lflank, rflank, single_nmer_cutoff,
                    sequence):
    """
    Count n-mer occurrences and the cuts observed at them.

    For every region read from `fp` (chrom, start, end in the first three
    columns) the per-base cut counts are fetched from `w_plus` and
    `w_minus`, and the n-mer of length `lflank + rflank` around each
    position is read from the 2bit file `sequence`.  Minus-strand n-mers
    are taken one base further right and passed through `rev` before
    counting.

    Positions whose n-mer contains an N, or whose cut count exceeds
    `single_nmer_cutoff`, are ignored.

    Returns
    -------
    (seq_nmer_dict, cut_nmer_dict)
        n-mer -> number of counted positions, and n-mer -> summed cuts.
    """
    genome = twobitreader.TwoBitFile(sequence)
    nmer = lflank + rflank

    seq_nmer_dict = {}
    cut_nmer_dict = {}

    with open(w_plus, 'rb') as plus_fh, open(w_minus, 'rb') as minus_fh:
        w_plus_H = BigWigFile(plus_fh)
        w_minus_H = BigWigFile(minus_fh)

        for line in fp:
            ll = line.split()
            chrm = ll[0]
            start = int(ll[1])
            end = int(ll[2])
            seq = genome[chrm][(start - lflank):(end + rflank)].upper()
            cp = list(
                w_plus_H.summarize(chrm, start, end, end - start).sum_data)
            cn = list(
                w_minus_H.summarize(chrm, start, end, end - start).sum_data)

            for k, (p_cut, n_cut) in enumerate(zip(cp, cn)):
                p_seq = seq[k:(k + nmer)]
                n_seq = seq[(k + 1):(k + nmer + 1)]

                if 'N' not in p_seq and p_cut <= single_nmer_cutoff:
                    cut_nmer_dict[p_seq] = cut_nmer_dict.get(p_seq, 0) + p_cut
                    seq_nmer_dict[p_seq] = seq_nmer_dict.get(p_seq, 0) + 1

                if 'N' not in n_seq and n_cut <= single_nmer_cutoff:
                    rev_n_seq = rev(n_seq)
                    cut_nmer_dict[rev_n_seq] = (
                        cut_nmer_dict.get(rev_n_seq, 0) + n_cut)
                    seq_nmer_dict[rev_n_seq] = (
                        seq_nmer_dict.get(rev_n_seq, 0) + 1)

    return seq_nmer_dict, cut_nmer_dict
Exemplo n.º 9
0
def get_signal(inputfile, output, bwfiles, extend, N, bwfolder):
    """
    Append an N-bin signal profile per bigWig file to each BED record.

    `bwfiles` is a comma-separated list of bigWig names.  Names starting
    with '/', './' or '../' are used as given; all others are looked up
    inside `bwfolder` (default "./").  The profile covers `extend` bases
    on either side of the record's anchor: column 3 for records whose
    sixth column is "-", column 2 otherwise.  Bin sums are divided by the
    bin length and the profile is reversed for "-" records.

    When a summary is unavailable or a query fails, the offending line is
    printed (prefixed 'c1' / 'c2') and N "na" values are appended instead.
    Records whose chromosome name contains "_" are skipped.
    """
    signalbw = bwfiles.strip().strip(',').split(',')

    if not bwfolder:
        bwfolder = "./"
    if not bwfolder.endswith('/'):
        bwfolder += '/'

    bw_files = []
    try:
        for sb in signalbw:
            if sb.startswith(('/', './', '../')):
                path = sb
            else:
                path = bwfolder + sb
            bw_files.append(open(path, 'rb'))
        bwHs = [BigWigFile(fh) for fh in bw_files]

        with open(inputfile) as inf, open(output, 'w') as outf:
            for line in inf:
                ll = line.split()
                if "_" in ll[0]:
                    continue
                if len(ll) >= 6 and ll[5] == "-":
                    start = int(ll[2])
                    strand_flap = 1
                else:
                    start = int(ll[1])
                    strand_flap = 0
                S = max(0, start - extend)
                E = start + extend

                for bwHandle in bwHs:
                    try:
                        signal = bwHandle.summarize(ll[0], S, E, N)
                        binlen = (E - S) * 1.0 / N
                        if signal is None:
                            print('c1 ' + line)
                            addsig = ["na"] * N
                        else:
                            addsig = list(signal.sum_data * 1.0 / binlen)
                    except Exception:
                        # Best effort: report the line and keep going.
                        print('c2 ' + line)
                        addsig = ["na"] * N

                    if strand_flap == 1:
                        ll.extend(addsig[::-1])
                    else:
                        ll.extend(addsig)

                outf.write("\t".join(map(str, ll)) + "\n")
    finally:
        for fh in bw_files:
            fh.close()
Exemplo n.º 10
0
def sitepro_scan(peak, outname, w_plus, w_minus, Cspan):
    """
    Write per-base cut proportions for every region of `peak`.

    For each position inside a region, the proportion is the (cut count
    + 1) at that position divided by the sum of (cut count + 1) over the
    `2 * Cspan` bases starting `Cspan` bases upstream, rounded to four
    decimals.  Plus-strand values go to `<outname>_propcutPlus.bdg`,
    minus-strand values to `<outname>_propcutMinus.bdg`, one
    "chrom start end value" line per base.

    Regions closer than `Cspan` to the chromosome start are printed and
    skipped.
    """
    roundN = 4

    with open(peak) as inf, \
            open(w_plus, 'rb') as plus_fh, \
            open(w_minus, 'rb') as minus_fh, \
            open(outname + "_propcutPlus.bdg", 'w') as outf_propPlus, \
            open(outname + "_propcutMinus.bdg", 'w') as outf_propMinus:
        w_plus_H = BigWigFile(plus_fh)
        w_minus_H = BigWigFile(minus_fh)

        for line in inf:
            ll = line.split()
            chrm = ll[0]
            start = int(ll[1])
            end = int(ll[2])
            if start - Cspan < 0:
                print(ll)
                continue

            width = end - start + 2 * Cspan
            plus_obj = w_plus_H.summarize(chrm, start - Cspan, end + Cspan,
                                          width)
            minus_obj = w_minus_H.summarize(chrm, start - Cspan, end + Cspan,
                                            width)
            # Fallback is a float vector of pseudo-counts so the ratios
            # below use true division (an integer vector would floor every
            # proportion to 0 under Python 2).
            if plus_obj is None:
                plus_vector = numpy.zeros(width) + 1
            else:
                plus_vector = plus_obj.sum_data + 1
            if minus_obj is None:
                minus_vector = numpy.zeros(width) + 1
            else:
                minus_vector = minus_obj.sum_data + 1

            for outpos in range(Cspan, (end - start + Cspan)):
                lo = outpos - Cspan
                hi = outpos + Cspan
                this_plus_cuts_prop = round(
                    plus_vector[outpos] / sum(plus_vector[lo:hi]), roundN)
                this_minus_cuts_prop = round(
                    minus_vector[outpos] / sum(minus_vector[lo:hi]), roundN)

                out_start = start + outpos - Cspan
                out_end = out_start + 1

                outf_propPlus.write("\t".join(
                    map(str,
                        [chrm, out_start, out_end, this_plus_cuts_prop])) +
                                    "\n")
                outf_propMinus.write("\t".join(
                    map(str,
                        [chrm, out_start, out_end, this_minus_cuts_prop])) +
                                     "\n")
Exemplo n.º 11
0
def sitepro_scan(peak, outp, outn, w_plus, w_minus, bgmatrix, span, gen,
                 lflank, rflank):
    """
    Write observed cuts and bias-based expected cuts for each position.

    For every region of `peak` the per-base cut counts (extended by
    `span` on both sides) and the sequence-derived background values from
    `readBG(bgmatrix)` are collected for both strands.  For each position
    a line "cut  window_total  bias  expected" is written, where
    `expected = bias / window_bias_total * window_total` over a window of
    `2 * span` bases.  Plus-strand lines go to `outp`, minus-strand lines
    to `outn`.

    Regions too close to the chromosome start, or whose sequence contains
    an N, are skipped.
    """
    nmer = lflank + rflank
    genome = twobitreader.TwoBitFile(gen)
    pBG, nBG = readBG(bgmatrix)

    with open(peak) as inf, \
            open(w_plus, 'rb') as plus_fh, \
            open(w_minus, 'rb') as minus_fh, \
            open(outp, 'w') as outfp, \
            open(outn, 'w') as outfn:
        w_plus_H = BigWigFile(plus_fh)
        w_minus_H = BigWigFile(minus_fh)

        for line in inf:
            ll = line.split()
            chrm = ll[0]
            start = int(ll[1])
            end = int(ll[2])
            # Skip regions whose extended window would leave the chromosome.
            if start - span - lflank <= 0:
                continue

            # Per-base cleavage counts.
            width = end - start + 2 * span
            p_sum = list(
                w_plus_H.summarize(chrm, start - span, end + span,
                                   width).sum_data)
            n_sum = list(
                w_minus_H.summarize(chrm, start - span, end + span,
                                    width).sum_data)

            # Per-base sequence bias; the minus strand is shifted by one.
            seq = genome[chrm][(start - span - lflank):(end + span + rflank)]
            if 'N' in seq.upper():
                continue
            pseq = seq[:-1]
            nseq = seq[1:]
            positions = range(len(pseq) + 1 - nmer)
            p = [pBG[pseq[k:(k + nmer)].upper()] for k in positions]
            n = [nBG[nseq[k:(k + nmer)].upper()] for k in positions]

            for bp in range(len(p_sum) - 2 * span):
                window = slice(bp, bp + 2 * span)
                center = bp + span
                ptotal = sum(p_sum[window])
                ntotal = sum(n_sum[window])
                pc = int(p_sum[center])
                nc = int(n_sum[center])
                pbias = p[center]
                nbias = n[center]
                pbgtotal = sum(p[window])
                nbgtotal = sum(n[window])
                paraw = (pbias / pbgtotal) * ptotal
                naraw = (nbias / nbgtotal) * ntotal

                outfp.write("\t".join(map(str, [pc, ptotal, pbias, paraw])) +
                            "\n")
                # The minus-strand file must carry the minus-strand
                # expectation (naraw), not the plus-strand one.
                outfn.write("\t".join(map(str, [nc, ntotal, nbias, naraw])) +
                            "\n")
Exemplo n.º 12
0
def summary(bwfile1, bwfile2, bwfile_add, bedfile, topnumber, out):
    """
    Rank BED regions by combined mean signal and dump per-base profiles.

    Pass 1: for every region of `bedfile` whose chromosome exists in both
    `bwfile1` and `bwfile2`, column 4 is overwritten with "-" and the sum
    of the two mean signals is appended.  Regions are then sorted by that
    value, highest first.

    Pass 2: for the top `topnumber` regions the per-base signal of
    `bwfile1`, `bwfile2` and every file in `bwfile_add` is written to
    `out` as comma-joined values, one tab-separated column per file.

    Elapsed times of both passes are printed.
    """

    def mean_signal(bw, rec):
        # Mean over the region from a single-bin summary; 0 when no base
        # of the region is covered.
        region = bw.summarize(rec[0], int(rec[1]), int(rec[2]), 1)
        if region.valid_count[0] == 0:
            return 0
        return (region.sum_data / region.valid_count)[0]

    def per_base(bw, rec):
        # Comma-joined per-base sums for the region.
        begin, stop = int(rec[1]), int(rec[2])
        region = bw.summarize(rec[0], begin, stop, stop - begin)
        return ",".join(str(v) for v in region.sum_data)

    chrom_len1 = {}
    chrom_len2 = {}
    for node in BwIO(bwfile1).chromosomeTree['nodes']:
        chrom_len1[node['key']] = node['chromSize']
    for node in BwIO(bwfile2).chromosomeTree['nodes']:
        chrom_len2[node['key']] = node['chromSize']

    total_result = []
    extra_files = []
    try:
        with open(bwfile1, 'rb') as fh1, open(bwfile2, 'rb') as fh2:
            bwHandle1 = BigWigFile(fh1)
            bwHandle2 = BigWigFile(fh2)

            with open(bedfile) as inf:
                t = time.time()
                for line in inf:
                    ll = line.split()
                    ll[3] = "-"
                    if ll[0] in chrom_len1 and ll[0] in chrom_len2:
                        mean_value1 = mean_signal(bwHandle1, ll)
                        mean_value2 = mean_signal(bwHandle2, ll)
                        total_result.append(ll + [mean_value1 + mean_value2])
            total_result.sort(reverse=True, key=lambda x: x[-1])

            for path in bwfile_add:
                extra_files.append(open(path, 'rb'))
            bwHs = [BigWigFile(fh) for fh in extra_files]

            with open(out, 'w') as outf:
                print("scaning 1st  %s" % (time.time() - t))
                t = time.time()
                for ll in total_result[:max(0, topnumber)]:
                    additional_value1 = per_base(bwHandle1, ll)
                    additional_value2 = per_base(bwHandle2, ll)
                    result = [
                        str(v)
                        for v in ll + [additional_value1, additional_value2]
                    ]
                    for bwH in bwHs:
                        result.append(per_base(bwH, ll))
                    outf.write("\t".join(result) + "\n")
            print("scaning 2nd  %s" % (time.time() - t))
    finally:
        for fh in extra_files:
            fh.close()
def Main():
    """
    Compare two bigWig tracks over a gene list and draw a scatter plot.

    For every gene of `args.geneList` (columns: gene, chrom, start, end)
    the mean of each track over the gene, shifted by +50, is written to
    `<output>.list` and collected.  The collected pairs are plotted
    against each other on fixed 0-100 axes, with two dashed red guide
    lines, and saved as `<output>_dot.eps`.

    Genes for which either track returns no data are skipped.
    """
    global args
    args = ParseArg()

    perc_array1 = []
    perc_array2 = []
    with open(args.percentile1, 'rb') as fh1, \
            open(args.percentile2, 'rb') as fh2:
        bw1 = BigWigFile(fh1)
        bw2 = BigWigFile(fh2)
        # NOTE(review): gout is never closed explicitly here; confirm what
        # WriteToFile returns before adding a close().
        gout = WriteToFile(args.output + ".list")
        for line in ReadFromFile(args.geneList):
            row = line.strip().split()
            gene = row[0]
            chrom = row[1]
            start = int(row[2])
            end = int(row[3])
            array1 = bw1.get_as_array(chrom, start, end)
            array2 = bw2.get_as_array(chrom, start, end)
            if array1 is None or array2 is None:
                continue
            perc1 = np.mean(array1) + 50
            perc2 = np.mean(array2) + 50
            gout.write('%s\t%s\t%d\t%d\t%f\t%f\n' %
                       (gene, chrom, start, end, perc1, perc2))
            perc_array1.append(perc1)
            perc_array2.append(perc2)

    # Scatter plot.
    sns.set()
    plt.scatter(perc_array1,
                perc_array2,
                marker=',',
                color='black',
                s=1,
                alpha=0.1)
    # gca() reuses the axes the scatter was drawn on; a bare plt.axes()
    # adds a new Axes on current matplotlib.
    ax = plt.gca()
    ax.set_aspect('equal')
    plt.xlabel(args.x, fontsize=20)
    plt.ylabel(args.y, fontsize=20)
    plt.ylim(0, 100)
    plt.xlim(0, 100)
    plt.tick_params(axis='both', which='major', labelsize=20, width=2)
    ticks = [0, 20, 40, 60, 80, 100]
    plt.gca().set_yticks(ticks)
    plt.gca().set_xticks(ticks)
    # Guide lines showing the threshold to call changed domains.
    x1, y1 = [0, 89.5], [10.5, 100]
    x2, y2 = [10.5, 100], [0, 89.5]
    for side in ('left', 'bottom', 'right', 'top'):
        plt.gca().spines[side].set_linewidth(2)
    plt.subplots_adjust(bottom=.2, left=.2)
    plt.plot(x1, y1, linewidth=0.5, linestyle='--', color='red')
    plt.plot(x2, y2, linewidth=0.5, linestyle='--', color='red')
    plt.savefig(args.output + '_dot.eps', format='eps')
    plt.close()

    logging("DONE!!!")
Exemplo n.º 14
0
def get_signal(inputfile, output, bwfiles, bwfolder, extend, N):
    """
    Append an N-bin signal profile around each region centre.

    `bwfiles` is a comma-separated list of bigWig names; names that do
    not start with '/' are looked up inside `bwfolder` (default "./").
    For every record the window of `extend` bases on either side of the
    region midpoint (clipped at 0) is summarized into `N` bins per file.
    Bin sums are divided by `2 * extend / N` and the profile is reversed
    for records whose sixth column is "-".  When a summary is unavailable
    or a query fails, N zeros are appended instead.

    Records whose chromosome name contains "_" are skipped.
    """
    signalbw = bwfiles.strip().strip(',').split(',')

    if not bwfolder:
        bwfolder = "./"
    if not bwfolder.endswith('/'):
        bwfolder += '/'

    bw_files = []
    try:
        for sb in signalbw:
            path = sb if sb.startswith('/') else bwfolder + sb
            bw_files.append(open(path, 'rb'))
        bwHs = [BigWigFile(fh) for fh in bw_files]

        with open(inputfile) as inf, open(output, 'w') as outf:
            for line in inf:
                ll = line.split()
                if "_" in ll[0]:
                    continue
                # Floor division: the midpoint must stay an integer
                # coordinate.
                C = (int(ll[1]) + int(ll[2])) // 2
                S = max(0, C - extend)
                E = C + extend
                # Decide the orientation once, before ll grows.
                minus_strand = len(ll) >= 6 and ll[5] == "-"

                for bwHandle in bwHs:
                    try:
                        signal = bwHandle.summarize(ll[0], S, E, N)
                        binlen = extend * 2.0 / N
                        if signal is None:
                            addsig = [0] * N
                        else:
                            addsig = list(signal.sum_data / binlen)
                    except Exception:
                        # Best effort: a failing query yields an all-zero
                        # profile instead of aborting the whole run.
                        addsig = [0] * N

                    if minus_strand:
                        ll.extend(addsig[::-1])
                    else:
                        ll.extend(addsig)
                outf.write("\t".join(map(str, ll)) + "\n")
    finally:
        for fh in bw_files:
            fh.close()
Exemplo n.º 15
0
def get_signal(inputfile, output, Pbw, Nbw, score_range):
    """
    Append total signal and two strand-aware scores to each motif record.

    For every record the 200 bases starting 100 bases upstream of the
    motif start (clipped at 0) are read per base from `Pbw` and `Nbw`.
    Signal is summed over `score_range` bases directly upstream of the
    motif and directly downstream of it, separately for the strand of the
    motif ("same") and the opposite strand ("diff"); up/down and
    same/diff are assigned according to column 6.

    Appended columns:
      DNase    - total of both tracks over the 200 bases
      FPscore1 - log2((up_same + p) * (down_diff + p) /
                      ((up_diff + p) * (down_same + p))), p = 0.2
      FPscore2 - sqrt(up_same) + sqrt(down_diff)
                 - sqrt(up_diff) - sqrt(down_same)

    Records on chromosomes absent from `Pbw` are skipped.  A strand other
    than '+' or '-' prints the line and exits with status 1.
    """
    persudo = 0.2
    chrom_len = {}
    for node in BwIO(Pbw).chromosomeTree['nodes']:
        chrom_len[node['key']] = node['chromSize']

    with open(Pbw, 'rb') as p_fh, open(Nbw, 'rb') as n_fh, \
            open(inputfile) as inf, open(output, 'w') as outf:
        PH = BigWigFile(p_fh)
        NH = BigWigFile(n_fh)

        for line in inf:
            ll = line.split()
            if ll[0] not in chrom_len:
                continue
            motif_start = int(ll[1])
            motif_len = int(ll[2]) - motif_start
            region_start = max(motif_start - 100, 0)
            region_end = motif_start + 100
            Psignal = list(
                PH.summarize(ll[0], region_start, region_end, 200).sum_data)
            Nsignal = list(
                NH.summarize(ll[0], region_start, region_end, 200).sum_data)
            DNase = sum(Psignal) + sum(Nsignal)

            # Windows left of the motif start and right of the motif end,
            # expressed as offsets into the 200-base vectors.
            left = slice(100 - score_range, 100)
            right = slice(100 + motif_len, 100 + motif_len + score_range)

            if ll[5] == '+':
                S_up_same = sum(Psignal[left])
                S_up_diff = sum(Nsignal[left])
                S_down_same = sum(Psignal[right])
                S_down_diff = sum(Nsignal[right])
            elif ll[5] == '-':
                S_up_same = sum(Nsignal[right])
                S_up_diff = sum(Psignal[right])
                S_down_same = sum(Nsignal[left])
                S_down_diff = sum(Psignal[left])
            else:
                print(line)
                sys.exit(1)

            FPscore1 = math.log(
                (S_up_same + persudo) * (S_down_diff + persudo) /
                ((S_up_diff + persudo) * (S_down_same + persudo)), 2)
            FPscore2 = (math.sqrt(S_up_same) + math.sqrt(S_down_diff) -
                        math.sqrt(S_up_diff) - math.sqrt(S_down_same))

            ll.extend([DNase, FPscore1, FPscore2])
            outf.write("\t".join(map(str, ll)) + "\n")
def get_signal(inputfile, output, bwfiles, bwfolder, extend):
    """
    Write one line per base with the signal of every bigWig file.

    `bwfiles` is a comma-separated list of bigWig names; names that do
    not start with '/' are looked up inside `bwfolder` (default "./").
    For every record of `inputfile` and every base of its region a line
    "chrom pos pos+1 value1 value2 ..." is written to `output`, with one
    value (rounded to four decimals) per bigWig file.  When a summary is
    unavailable or a query fails, "NA" is written for that file.

    Records whose chromosome name contains "_" are skipped.  `extend` is
    accepted for signature compatibility but not used.
    """
    signalbw = bwfiles.strip().strip(',').split(',')

    # An empty folder means the current directory; leaving it empty would
    # turn into "/" below and resolve relative names at the filesystem
    # root.
    if not bwfolder:
        bwfolder = "./"
    if not bwfolder.endswith('/'):
        bwfolder += '/'

    bw_files = []
    try:
        for sb in signalbw:
            path = sb if sb.startswith('/') else bwfolder + sb
            bw_files.append(open(path, 'rb'))
        bwHs = [BigWigFile(fh) for fh in bw_files]

        with open(inputfile) as inf, open(output, 'w') as outf:
            for line in inf:
                ll = line.split()
                if "_" in ll[0]:
                    continue
                S = int(ll[1])
                E = int(ll[2])

                outdata = []
                for bwHandle in bwHs:
                    try:
                        signal = bwHandle.summarize(ll[0], S, E, (E - S))
                        if signal:
                            thisdata = [round(v, 4) for v in signal.sum_data]
                        else:
                            thisdata = ["NA"] * (E - S)
                    except Exception:
                        # Best effort: a failing query yields NA values.
                        thisdata = ["NA"] * (E - S)
                    outdata.append(thisdata)

                # Transpose: one output row per base, one column per file.
                for pos, values in enumerate(zip(*outdata)):
                    newll = [ll[0], S + pos, S + pos + 1] + list(values)
                    outf.write("\t".join(map(str, newll)) + "\n")
    finally:
        for fh in bw_files:
            fh.close()
Exemplo n.º 17
0
def sitepro_scan(peak, outname, w_plus, w_minus, Cspan):
    """Write per-base plus/minus strand cut counts for each peak.

    For every interval in ``peak`` the plus and minus bigWig files are
    summarised at one bin per base over ``[start - Cspan, end + Cspan)``.
    For each base of the original interval one line is written to
    ``<outname>_Cuts.txt``::

        chrom:pos-(pos+1)  plus  plus_window_sum  minus  minus_window_sum

    where the window sums cover the ``2 * Cspan`` bins
    ``[pos - Cspan, pos + Cspan)``. Intervals whose extended start would be
    negative are printed to stdout and skipped. When a bigWig file yields
    no summary for an interval, zeros are used.

    Parameters:
        peak: path of the whitespace-separated interval file.
        outname: prefix of the output file.
        w_plus: path of the plus-strand bigWig file.
        w_minus: path of the minus-strand bigWig file.
        Cspan: number of flanking bases on each side.
    """
    with open(peak) as inf, \
            open(w_plus, 'rb') as plus_stream, \
            open(w_minus, 'rb') as minus_stream, \
            open(outname + "_Cuts.txt", 'w') as outf:
        w_plus_H = BigWigFile(plus_stream)
        w_minus_H = BigWigFile(minus_stream)

        for line in inf:
            ll = line.split()
            if len(ll) < 3:
                # Blank or truncated line: nothing to scan.
                continue
            chrm = ll[0]
            start = int(ll[1])
            end = int(ll[2])
            if start - Cspan < 0:
                print(ll)
                continue

            width = end - start + 2 * Cspan
            plus_obj = w_plus_H.summarize(chrm, start - Cspan, end + Cspan,
                                          width)
            minus_obj = w_minus_H.summarize(chrm, start - Cspan, end + Cspan,
                                            width)
            if not plus_obj:
                plus_vector = numpy.array([0] * width)
            else:
                plus_vector = plus_obj.sum_data
            if not minus_obj:
                minus_vector = numpy.array([0] * width)
            else:
                minus_vector = minus_obj.sum_data

            # outpos indexes the extended vector; subtracting Cspan maps it
            # back onto the original interval.
            for outpos in range(Cspan, (end - start + Cspan)):
                window = slice(outpos - Cspan, outpos + Cspan)
                out_start = start + outpos - Cspan
                out_end = out_start + 1
                fields = [
                    chrm + ":" + str(out_start) + "-" + str(out_end),
                    plus_vector[outpos],
                    sum(plus_vector[window]),
                    minus_vector[outpos],
                    sum(minus_vector[window]),
                ]
                outf.write("\t".join(map(str, fields)) + "\n")
Exemplo n.º 18
0
def make_template(data, flank, pflank, topmotif, out, pbw, mbw):
    """Build and plot the average plus/minus strand profile around motifs.

    Every record of ``data`` (columns: chrom, start, end, ?, score, strand)
    is summarised at one bin per base over ``[start - flank, start + flank)``
    in both bigWig files. A slice of ``2 * pflank`` bins is taken from each
    vector; for ``-`` records the vectors are reversed and swapped between
    the two collections. The two collections are averaged with
    ``apply_mean`` and passed to ``plot_template``.

    Parameters:
        data: path of the whitespace-separated motif file; the motif length
            is taken from its first line (``end - start``).
        flank: bases summarised on each side of ``start``.
        pflank: half-width of the slice kept for the template.
        topmotif: unused (the limit that used it is disabled); kept for
            interface compatibility.
        out: forwarded to ``plot_template``.
        pbw: path of the plus-strand bigWig file.
        mbw: path of the minus-strand bigWig file.
    """
    pp = []
    pm = []
    with open(pbw, 'rb') as plus_stream, open(mbw, 'rb') as minus_stream:
        w_plus_H = BigWigFile(plus_stream)
        w_minus_H = BigWigFile(minus_stream)

        with open(data) as inf:
            templatelist = [line.split() for line in inf]

        # Motif length comes from the first record, before sorting.
        ml = int(templatelist[0][2]) - int(templatelist[0][1])
        templatelist.sort(key=lambda x: float(x[4]), reverse=True)

        # Floor division keeps the slice bounds integral on Python 3 too.
        half = ml // 2
        fwd = slice(flank + 1 + half - pflank, flank + 1 + half + pflank)
        rev = slice(flank + 1 + half - ml - pflank,
                    flank + 1 + half - ml + pflank)

        for ll in templatelist:
            anchor = int(ll[1])
            # NOTE(review): summarize() is None-checked elsewhere in this
            # file; here a missing summary still raises AttributeError.
            p_sum = list(
                w_plus_H.summarize(ll[0], anchor - flank, anchor + flank,
                                   2 * flank).sum_data)
            m_sum = list(
                w_minus_H.summarize(ll[0], anchor - flank, anchor + flank,
                                    2 * flank).sum_data)
            if ll[5] == "+":
                pp.append(p_sum[fwd])
                pm.append(m_sum[fwd])
            elif ll[5] == '-':
                # Reverse-strand motif: flip the vectors and swap strands.
                pm.append(p_sum[::-1][rev])
                pp.append(m_sum[::-1][rev])

    meanp = apply_mean(pp)
    meanm = apply_mean(pm)
    P = [meanp[idx] for idx in range(len(meanp))]
    M = [meanm[idx] for idx in range(len(meanp))]

    plot_template(P, M, out)
Exemplo n.º 19
0
def get_signal(inputfile, output, plusBW, minusBW, bwfolder, extend):
    """Append per-base plus/minus strand signal to each input interval.

    Every interval is widened by ``extend`` on both sides (start clamped at
    0) and summarised at one bin per base in both bigWig files. The output
    row is the input columns followed by the plus values then the minus
    values; for ``-`` strand intervals (6th column) it is the reversed minus
    values followed by the reversed plus values. When either file yields no
    summary or the query raises, the signal columns are filled with
    ``"NA"``.

    Parameters:
        inputfile: path of the whitespace-separated interval file.
        output: path of the tab-separated file to write.
        plusBW: plus-strand bigWig file name, relative to ``bwfolder``.
        minusBW: minus-strand bigWig file name, relative to ``bwfolder``.
        bwfolder: directory holding the bigWig files; empty means ``./``.
        extend: number of bases added on each side of the interval.
    """
    if not bwfolder:
        bwfolder = "./"
    if not bwfolder.endswith('/'):
        bwfolder += '/'

    with open(bwfolder + plusBW, 'rb') as plus_stream, \
            open(bwfolder + minusBW, 'rb') as minus_stream, \
            open(inputfile) as inf, \
            open(output, 'w') as outf:
        plus = BigWigFile(plus_stream)
        minus = BigWigFile(minus_stream)

        for line in inf:
            ll = line.split()
            # Skip blank/short lines and chromosomes with "_" in the name.
            if len(ll) < 3 or "_" in ll[0]:
                continue
            reverse_strand = len(ll) >= 6 and ll[5] == "-"
            S = max(0, int(ll[1]) - extend)
            E = int(ll[2]) + extend

            # Reset per row: the signal used to be left unbound (first row)
            # or carried over from the previous row when a query failed.
            thisdata = ["NA"] * (2 * (E - S))
            try:
                plus_signal = plus.summarize(ll[0], S, E, (E - S))
                minus_signal = minus.summarize(ll[0], S, E, (E - S))
                if plus_signal and minus_signal:
                    plus_tmp = list(plus_signal.sum_data)
                    minus_tmp = list(minus_signal.sum_data)
                    if reverse_strand:
                        thisdata = minus_tmp[::-1] + plus_tmp[::-1]
                    else:
                        thisdata = plus_tmp + minus_tmp
            except Exception:
                # Best effort: keep the NA placeholder for this row.
                pass

            outf.write("\t".join(map(str, ll + thisdata)) + "\n")
Exemplo n.º 20
0
def main():
    """Append the mean bigWig signal to every line of a BED file.

    Positional arguments: input BED file, bigWig file, output BED file.
    For each tab-separated input line the bigWig file is queried over
    ``[start, end)`` with a single bin and the bin's ``mean`` (rounded to 5
    decimals) is appended as a new column. Each input line is also echoed to
    stdout. With ``--debug`` processing stops after 10 lines.
    """
    p = optparse.OptionParser(__doc__)
    p.add_option('-A', '--absolute', action='store_true', dest='A',
                 default=False, help='absolute threshold')
    p.add_option('-s', '--standard_background', action='store_true',
                 dest='stdbg')
    p.add_option('-D', '--debug', action='store_true', dest='debug')
    options, args = p.parse_args()
    if len(args) < 3:
        p.error('expected three arguments: BED file, bigWig file, output file')
    debug_c = 0

    # 'r' instead of 'rU': the 'U' flag was removed in Python 3.11.
    # bigWig is a binary format, so it is opened with 'rb'.
    with open(args[0], 'r') as BEDFILE, \
            open(args[1], 'rb') as bw_stream, \
            open(args[2], 'w') as BEDout:
        BW = BigWigFile(file=bw_stream)

        for line in BEDFILE:
            print(line)
            line = line.strip().split('\t')
            x = BW.query(line[0], int(line[1]), int(line[2]), 1)
            line.append(str(round(x[0]['mean'], 5)))
            BEDout.write("\t".join(line) + "\n")

            if options.debug:
                debug_c += 1
                if debug_c >= 10:
                    break


if __name__ == '__main__':
    main()
Exemplo n.º 21
0
def test_summaries_from_file():
    """Yield one check per line of the bigWig summary expectation file.

    Each expectation line holds ``chrom start end n type v1 v2 ...``; the
    yielded callable summarises the test bigWig over that region with ``n``
    bins and compares the requested statistic against the listed values.
    """
    bigwig = BigWigFile(file=open("test_data/bbi_tests/test.bw", 'rb'))

    def check_summary(line):
        columns = line.split()
        chrom = columns[0]
        start, end, n_bins = int(columns[1]), int(columns[2]), int(columns[3])
        stat = columns[4]
        expected = [float(tok.replace('n/a', 'NaN')) for tok in columns[5:]]
        result = bigwig.summarize(chrom, start, end, n_bins)
        if stat == 'mean':
            print(result.sum_data / result.valid_count)
            print(expected)
            assert allclose(result.sum_data / result.valid_count, expected)
        elif stat in ('min', 'max'):
            # min -> min_val, max -> max_val
            assert allclose(getattr(result, stat + '_val'), expected)
        # 'std' expectations are not checked.

    expectation_lines = open("test_data/bbi_tests/test.expectation")
    for index, line in enumerate(expectation_lines):
        case = partial(check_summary, line)
        case.description = "Test summaries line %d: %s" % (index, line[:40])
        yield (case, )
Exemplo n.º 22
0
def summary(bwfile, bedfile, topnumber, out):
    """Write the per-base signal of the regions with the highest mean signal.

    First pass: every region of ``bedfile`` whose chromosome is present in
    the bigWig file gets its 4th column replaced by ``"-"`` and its mean
    signal (``sum_data / valid_count`` of a single bin, 0 when no base is
    covered) appended. Regions are then sorted by that mean, highest first.
    Second pass: for the first ``topnumber`` regions the per-base
    ``sum_data`` values are appended as one comma-separated column and the
    row is written to ``out``. Timing of both passes is printed to stdout.

    Parameters:
        bwfile: path of the bigWig file.
        bedfile: path of the whitespace-separated region file (>= 4 columns).
        topnumber: maximum number of regions to write.
        out: path of the tab-separated output file.
    """
    p = BwIO(bwfile)
    chrom_len = {}
    for node in p.chromosomeTree['nodes']:
        chrom_len[node['key']] = node['chromSize']

    total_result = []
    with open(bwfile, 'rb') as bw_stream:
        bwHandle = BigWigFile(bw_stream)

        t = time.time()
        with open(bedfile) as inf:
            for line in inf:
                ll = line.split()
                if len(ll) < 4:
                    # Blank or truncated line: no 4th column to overwrite.
                    continue
                ll[3] = "-"
                if ll[0] in chrom_len:
                    region = bwHandle.summarize(ll[0], int(ll[1]),
                                                int(ll[2]), 1)
                    if region.valid_count[0] == 0:
                        mean_value = 0
                    else:
                        mean_value = (region.sum_data / region.valid_count)[0]
                    total_result.append(ll + [mean_value])
        total_result.sort(reverse=True, key=lambda x: x[-1])

        with open(out, 'w') as outf:
            print("scaning 1st  %s" % (time.time() - t))
            t = time.time()
            # Slicing (instead of range(topnumber)) tolerates having fewer
            # scored regions than requested.
            for ll in total_result[:max(topnumber, 0)]:
                region = bwHandle.summarize(ll[0], int(ll[1]), int(ll[2]),
                                            (int(ll[2]) - int(ll[1])))
                additional_value = ",".join(map(str, list(region.sum_data)))
                result = [str(field) for field in ll + [additional_value]]
                outf.write("\t".join(result) + "\n")
    print("scaning 2nd  %s" % (time.time() - t))
Exemplo n.º 23
0
def Readbw(bwfile, chrm, start, end, n):
    """Return binned mean signal of a bigWig file over ``[start, end)``.

    The region is summarised into ``(end - start) // n`` bins and each bin's
    ``sum_data`` is divided by ``sudocount(valid_count)``.

    Parameters:
        bwfile: path of the bigWig file.
        chrm: chromosome name.
        start: region start (converted with ``int``).
        end: region end (converted with ``int``).
        n: divisor applied to the region length to get the bin count.

    Returns:
        list of per-bin scores.
    """
    start = int(start)
    end = int(end)
    with open(bwfile, 'rb') as bw_stream:
        # Floor division keeps the bin count an integer on Python 3.
        region = BigWigFile(bw_stream).summarize(chrm, start, end,
                                                 (end - start) // n)
    # A real list (not a lazy map) so the array division works on Python 3.
    counts = [sudocount(c) for c in region.valid_count]
    return list(region.sum_data / counts)
def test_summaries_from_file():
    """Yield one check per line of the bigWig summary expectation file.

    Each expectation line holds ``chrom start end n type v1 v2 ...``; the
    yielded ``(check_summary, line)`` pair summarises the test bigWig over
    that region with ``n`` bins and compares the requested statistic against
    the listed values.
    """
    # bigWig is a binary format, so it is opened with 'rb'.
    bw = BigWigFile(file=open("test_data/bbi_tests/test.bw", 'rb'))

    def check_summary(line):
        fields = line.split()
        chrom = fields[0]
        start = int(fields[1])
        end = int(fields[2])
        n = int(fields[3])
        t = fields[4]
        values = [float(v.replace('n/a', 'NaN')) for v in fields[5:]]
        sd = bw.summarize(chrom, start, end, n)
        if t == 'mean':
            print(sd.sum_data / sd.valid_count)
            print(values)
            assert allclose(sd.sum_data / sd.valid_count, values)
        elif t == 'min':
            assert allclose(sd.min_val, values)
        elif t == 'max':
            assert allclose(sd.max_val, values)
        # 'std' expectations are not checked.

    for line in open("test_data/bbi_tests/test.expectation"):
        yield check_summary, line
Exemplo n.º 25
0
def refine_with_summit(_soft, _mark, _tissue):
    """Re-centre peaks on their bigWig summit and run the enrichment scripts.

    For each peak of ``top_bed/<soft>.<mark>.<tissue>.bed`` the bigWig
    interval with the highest value above 0 is located, and a 2 kb window
    around that interval's midpoint is written to ``enrich_bed`` (``1-2000``
    when the window would start at or before 0). Peaks without any value
    above 0 are skipped. The result is then sorted and merged with bedtools
    and passed to ``../get_enrich.sh`` through ``sh``.

    Parameters:
        _soft: peak-caller name used in the file names.
        _mark: mark name used in the file names.
        _tissue: tissue name used in the file names.
    """
    with open("/Data/adam/dnase/top_bed/{0}.{1}.{2}.bed"
              .format(_soft, _mark, _tissue)) as peak_file:
        _temp_peak = [i.rstrip().split('\t') for i in peak_file]

    with open("/Data/adam/dnase/bigwig/{0}.{1}.rep0.bw"
              .format(_mark, _tissue), 'rb') as _temp_bw, \
            open("/Data/adam/dnase/enrich_bed/{0}.{1}.{2}.bed"
                 .format(_soft, _mark, _tissue), 'w') as _temp_enrich:
        _bw = BigWigFile(file=_temp_bw)

        for line in _temp_peak:
            vals = tuple(_bw.get(line[0], int(line[1]), int(line[2])))
            # Reset per peak: the summit used to be unbound (first peak) or
            # carried over from the previous peak when no value exceeded 0.
            maxs = 0
            summit = None
            for _key in vals:
                if float(_key[2]) > maxs:
                    maxs = float(_key[2])
                    summit = _key[:2]
            if summit is None:
                continue

            summit_p = int((float(summit[0]) + float(summit[1])) / 2)
            if summit_p - 1000 > 0:
                _temp_enrich.write("{0}\t{1}\t{2}\n".format(
                    line[0], summit_p - 1000, summit_p + 999))
            else:
                _temp_enrich.write("{0}\t{1}\t{2}\n".format(line[0], 1, 2000))

    sh('sort -k 1,1 -k 2g,2g /Data/adam/dnase/enrich_bed/{0}.{1}.{2}.bed| bedtools merge -i stdin\
     >/Data/adam/dnase/enrich_merge_bed/{0}.{1}.{2}.bed'.format(_soft,_mark,_tissue))

    sh('bash ../get_enrich.sh /Data/adam/dnase/enrich_merge_bed/{0}.{1}.{2}.bed {1} {2} {3}'\
       .format(_soft,_mark,_tissue,_soft))
Exemplo n.º 26
0
def getChromatinDataSeries(bigwigFile, libraryTable, sgInfoTable, tssTable, colname = '', naValue = 0):
	"""Return the mean bigWig signal under each sgRNA as a pandas Series.

	For every row of ``sgInfoTable`` the chromosome is looked up in
	``tssTable['chromosome']`` by ``(gene_name, ','.join(transcript_list))``.
	The signal is averaged (NaN-aware) between the PAM coordinate and the
	PAM coordinate plus (``+`` strand) or minus (otherwise) the sgRNA
	length. Rows without a chromosome entry or without signal get NaN, which
	is finally replaced by ``naValue``.

	Parameters:
		bigwigFile: path of the bigWig file.
		libraryTable: table whose index becomes the index of the result.
		sgInfoTable: table with ``gene_name``, ``transcript_list``,
			``strand``, ``pam coordinate`` and ``length`` columns.
		tssTable: table with a ``chromosome`` column.
		colname: name given to the returned Series.
		naValue: replacement for missing scores.
	"""
	chromDict = tssTable['chromosome'].to_dict()

	chromatinScores = []
	# bigWig is a binary format; the handle is closed once all rows are read.
	with open(bigwigFile, 'rb') as bwStream:
		bwindex = BigWigFile(bwStream)

		for name, sgInfo in sgInfoTable.iterrows():
			geneTup = (sgInfo['gene_name'],','.join(sgInfo['transcript_list']))

			if geneTup not in chromDict: #negative controls
				chromatinScores.append(np.nan)
				continue

			pamCoord = sgInfo['pam coordinate']
			if sgInfo['strand'] == '+':
				sgRange = pamCoord + sgInfo['length']
			else:
				sgRange = pamCoord - sgInfo['length']

			chrom = chromDict[geneTup]

			chromatinArray = bwindex.get_as_array(chrom, min(pamCoord, sgRange), max(pamCoord, sgRange))
			if chromatinArray is not None and len(chromatinArray) > 0:
				chromatinScores.append(np.nanmean(chromatinArray))
			else: #often chrY when using K562 data..
				chromatinScores.append(np.nan)

	chromatinSeries = pd.Series(chromatinScores, index=libraryTable.index, name = colname)

	return chromatinSeries.fillna(naValue)
Exemplo n.º 27
0
def get_signal(inputfile, output, signalbw, extend):
    """Append the mean bigWig signal around each interval centre.

    For every interval whose chromosome is present in the first bigWig file,
    the window ``[centre - extend, centre + 1 + extend)`` (start clamped at
    0) is summarised as a single bin in each bigWig file and the mean
    (``sum_data / valid_count``, or ``0`` when no base is covered) is
    appended as a new column. A row is written only if every bigWig file
    produced a value.

    Parameters:
        inputfile: path of the whitespace-separated interval file.
        output: path of the tab-separated file to write.
        signalbw: comma-separated bigWig file paths.
        extend: number of bases added on each side of the centre base.
    """
    signalbw = signalbw.strip().strip(',').split(',')

    p = BwIO(signalbw[0])
    chrom_len = {}
    for node in p.chromosomeTree['nodes']:
        chrom_len[node['key']] = node['chromSize']

    bw_streams = []
    try:
        for k in signalbw:
            bw_streams.append(open(k, 'rb'))
        bwHandle = [BigWigFile(stream) for stream in bw_streams]

        with open(inputfile) as inf, open(output, 'w') as outf:
            for line in inf:
                ll = line.split()
                if len(ll) < 3 or ll[0] not in chrom_len:
                    continue
                # Floor division keeps the centre an integer on Python 3.
                S = (int(ll[1]) + int(ll[2])) // 2
                E = S + 1

                values = []
                for bwH in bwHandle:
                    try:
                        signal = bwH.summarize(ll[0], max(0, S - extend),
                                               E + extend, 1)
                    except Exception:
                        signal = None
                    if signal is None:
                        # Failed or empty query: drop the whole row.
                        break
                    if signal.valid_count[0] == 0:
                        values.append('0')
                    else:
                        values.append(str(float(
                            signal.sum_data[0] / signal.valid_count[0])))

                if len(values) == len(bwHandle):
                    outf.write("\t".join(ll + values) + "\n")
    finally:
        for stream in bw_streams:
            stream.close()
Exemplo n.º 28
0
def createMappabilityList(fragmentsMap, bwfile, fragmentCount, options):
    """Return the mean bigWig mappability of every fragment.

    Parameters:
        fragmentsMap: mapping ``fragmentId -> (chrom, start, end)``; the ids
            are used as indices into the result.
        bwfile: path of the mappability bigWig file.
        fragmentCount: length of the returned array.
        options: object with a ``vverbose`` flag enabling progress and
            error output.

    Returns:
        float array of length ``fragmentCount``; entries are the single-bin
        ``mean`` of the fragment, or 0 when the mean is NaN, the query
        fails, or the fragment is absent from ``fragmentsMap``.
    """
    # keep record which fragment has decent mappability
    # (builtin float: the np.float alias was removed in NumPy 1.24)
    mappable = np.zeros((fragmentCount, ), dtype=float)

    # lazy load
    from bx.bbi.bigwig_file import BigWigFile

    with open(bwfile, 'rb') as bw_stream:
        bw = BigWigFile(bw_stream)

        for fragmentId in fragmentsMap:
            (chrom, start, end) = fragmentsMap[fragmentId]

            if options.vverbose:
                sys.stdout.write("- process %s %d-%d \n" % (chrom, start, end))

            try:
                mappable[fragmentId] = bw.query(chrom, start, end, 1)[0]["mean"]
                if np.isnan(mappable[fragmentId]):
                    mappable[fragmentId] = 0
            except Exception:
                # Best effort: problems with invalid values count as 0.
                mappable[fragmentId] = 0.
                if options.vverbose:
                    sys.stderr.write("Problem with bw file at %s %d-%d\n" % (
                        chrom, start, end))
                    sys.stdout.write(traceback.format_exc() + "\n")

    return mappable
Exemplo n.º 29
0
def load_annos(args):
    """
    Populate a dictionary of Tabixfile handles for
    each annotation file.  Other modules can then
    access a given handle and fetch data from it
    as follows:

    dbsnp_handle = annotations.annos['dbsnp']
    hits = dbsnp_handle.fetch(chrom, start, end)

    Files ending in ".gz" are opened as Tabix files, files ending in ".bw"
    as BigWig files; other files are ignored. The handles are stored in the
    module-level ``annos`` dictionary and intentionally left open. If a
    file cannot be opened the process exits with an explanatory message.
    """
    anno_files = get_anno_files(args)
    for anno in anno_files:
        path = anno_files[anno]
        try:
            # .gz denotes Tabix files.
            if path.endswith(".gz"):
                annos[anno] = pysam.Tabixfile(path)
            # .bw denotes BigWig files (binary, hence 'rb').
            elif path.endswith(".bw"):
                annos[anno] = BigWigFile(open(path, 'rb'))

        except IOError:
            # NOTE(review): the URL below looks malformed ("#" before
            # "installation.html" and a literal backslash); the text is kept
            # byte-for-byte, only the invalid "\#" escape is spelled "\\#".
            sys.exit("Gemini cannot open this annotation file: %s. \n"
                     "Have you installed the annotation files?  If so, "
                     "have they been moved or deleted? Exiting...\n\n"
                     "For more details:\n\t"
                     "http://gemini.readthedocs.org/en/latest/content/"
                     "#installation.html\\#installing-annotation-files\n" %
                     path)
Exemplo n.º 30
0
def get_mean_phastcons(bedtool, phastcons_location, sample_size=1000):
    """
    Get mean phastcons scores for a random sample of intervals in a bedtool.

    bedtool - bedtool to extract data from
    phastcons_location - location of the phastcons bigWig file
    sample_size - maximum number of intervals to sample

    Returns a list with one mean per sampled interval (0 for an empty
    value array); intervals for which the bigWig file returns no array
    are skipped.
    """
    data = []

    # bigWig is a binary format, so it is opened with 'rb'.
    with open(phastcons_location, 'rb') as bw_file:
        bw = BigWigFile(bw_file)

        for bedline in bedtool.random_subset(min(len(bedtool), sample_size)):
            conservation_values = bw.get_as_array(bedline.chrom, bedline.start,
                                                  bedline.stop)
            if conservation_values is None:
                # No data for this interval.
                continue
            if len(conservation_values) > 0:
                mean_phastcons = np.mean(conservation_values)
            else:
                mean_phastcons = 0
            data.append(mean_phastcons)
    return data