Пример #1
0
def summary(bwfile1, bwfile2, bwfile_add, bedfile, topnumber, out):
    """Rank BED regions by combined mean signal and write per-bin profiles.

    Every line of ``bedfile`` whose chromosome is listed in both ``bwfile1``
    and ``bwfile2`` is scored with the sum of the two mean signals over the
    region (0 for a file without valid data there).  The ``topnumber``
    best-scoring regions are written to ``out`` as tab-separated lines: the
    BED fields (4th column overwritten with "-"), the score, then one
    comma-joined ``sum_data`` profile per bigWig file (``bwfile1``,
    ``bwfile2``, then every file of ``bwfile_add``) using ``end - start``
    bins.  Elapsed times of both passes are printed.

    Args:
        bwfile1: path of the first bigWig file.
        bwfile2: path of the second bigWig file.
        bwfile_add: iterable of additional bigWig paths, only profiled for
            the selected regions.
        bedfile: path of the region file; every line needs >= 4 columns.
        topnumber: maximum number of regions written.
        out: path of the output file.

    Raises:
        IndexError: if a line of ``bedfile`` has fewer than 4 columns.
    """
    index1 = BwIO(bwfile1)
    index2 = BwIO(bwfile2)
    # Only membership is tested below; the sizes are kept for parity with
    # the other helpers of this file.
    chrom_len1 = dict((node['key'], node['chromSize'])
                      for node in index1.chromosomeTree['nodes'])
    chrom_len2 = dict((node['key'], node['chromSize'])
                      for node in index2.chromosomeTree['nodes'])
    # NOTE: the binary bigWig handles are not closed explicitly (unchanged).
    bw1 = BigWigFile(open(bwfile1, 'rb'))
    bw2 = BigWigFile(open(bwfile2, 'rb'))

    def mean_signal(handle, chrom, start, end):
        # Single-bin summary: mean over the region, 0 without valid data.
        stats = handle.summarize(chrom, start, end, 1)
        if stats.valid_count == 0:
            return 0
        return (stats.sum_data / stats.valid_count)[0]

    def profile(handle, chrom, start, end):
        # One bin per position of the region, comma-joined.
        stats = handle.summarize(chrom, start, end, end - start)
        return ",".join(str(value) for value in stats.sum_data)

    ranked = []
    with open(bedfile) as bed:
        t = time.time()
        for line in bed:
            fields = line.split()
            fields[3] = "-"
            chrom = fields[0]
            if chrom in chrom_len1 and chrom in chrom_len2:
                start, end = int(fields[1]), int(fields[2])
                score = (mean_signal(bw1, chrom, start, end) +
                         mean_signal(bw2, chrom, start, end))
                ranked.append(fields + [score])
    ranked.sort(reverse=True, key=lambda row: row[-1])

    extra_handles = [BigWigFile(open(path, 'rb')) for path in bwfile_add]
    with open(out, 'w') as outf:
        print("scaning 1st  %s" % (time.time() - t))
        t = time.time()
        # Slicing never runs past the end of the list.
        for row in ranked[:max(topnumber, 0)]:
            chrom, start, end = row[0], int(row[1]), int(row[2])
            columns = [str(value) for value in row]
            columns.append(profile(bw1, chrom, start, end))
            columns.append(profile(bw2, chrom, start, end))
            for handle in extra_handles:
                columns.append(profile(handle, chrom, start, end))
            outf.write("\t".join(columns) + "\n")
    print("scaning 2nd  %s" % (time.time() - t))
Пример #2
0
def summary(bwfile, bedfile, topnumber, out):
    """Rank BED regions by mean bigWig signal and write per-bin profiles.

    Every line of ``bedfile`` whose chromosome is listed in ``bwfile`` is
    scored with the mean signal over the region (0 without valid data).  The
    ``topnumber`` best-scoring regions are written to ``out`` as
    tab-separated lines: the BED fields (4th column overwritten with "-"),
    the score and a comma-joined ``sum_data`` profile using ``end - start``
    bins.  Elapsed times of both passes are printed.

    Args:
        bwfile: path of the bigWig file.
        bedfile: path of the region file; every line needs >= 4 columns.
        topnumber: maximum number of regions written; fewer are written when
            fewer regions were scored.
        out: path of the output file.

    Raises:
        IndexError: if a line of ``bedfile`` has fewer than 4 columns.
    """
    index = BwIO(bwfile)
    chrom_len = dict((node['key'], node['chromSize'])
                     for node in index.chromosomeTree['nodes'])
    # NOTE: the binary bigWig handle is not closed explicitly (unchanged).
    bwHandle = BigWigFile(open(bwfile, 'rb'))

    ranked = []
    with open(bedfile) as bed:
        t = time.time()
        for line in bed:
            fields = line.split()
            fields[3] = "-"
            if fields[0] in chrom_len:
                stats = bwHandle.summarize(fields[0], int(fields[1]),
                                           int(fields[2]), 1)
                if stats.valid_count == 0:
                    mean_value = 0
                else:
                    mean_value = (stats.sum_data / stats.valid_count)[0]
                ranked.append(fields + [mean_value])
    ranked.sort(reverse=True, key=lambda row: row[-1])

    with open(out, 'w') as outf:
        print("scaning 1st  %s" % (time.time() - t))
        t = time.time()
        # Slice instead of indexing range(topnumber): the old loop raised
        # IndexError when fewer than ``topnumber`` regions were available.
        for row in ranked[:max(topnumber, 0)]:
            start, end = int(row[1]), int(row[2])
            stats = bwHandle.summarize(row[0], start, end, end - start)
            additional_value = ",".join(str(v) for v in stats.sum_data)
            columns = [str(value) for value in row] + [additional_value]
            outf.write("\t".join(columns) + "\n")
    print("scaning 2nd  %s" % (time.time() - t))
Пример #3
0
def get_signal(inputfile, output, signalbw, extend):
    """Append the mean signal around each region midpoint, per bigWig file.

    For every line of ``inputfile`` whose chromosome is listed in the first
    bigWig file, one column per bigWig file is appended: the mean signal
    (``sum_data / valid_count`` of a single-bin summary) over
    ``[max(0, mid - extend), mid + 1 + extend)`` where ``mid`` is the integer
    midpoint of columns 2 and 3, or ``'0'`` when the window holds no valid
    data.  A line is dropped entirely when summarising any file raises.

    Args:
        inputfile: path of the whitespace-separated region file.
        output: path of the tab-separated output file.
        signalbw: comma-separated bigWig paths (stray commas/blanks at the
            ends are ignored).
        extend: number of bases added on each side of the midpoint.
    """
    bw_paths = signalbw.strip().strip(',').split(',')

    index = BwIO(bw_paths[0])
    chrom_len = dict((node['key'], node['chromSize'])
                     for node in index.chromosomeTree['nodes'])
    # NOTE: the binary bigWig handles are not closed explicitly (unchanged).
    handles = [BigWigFile(open(path, 'rb')) for path in bw_paths]

    with open(inputfile) as inf, open(output, 'w') as outf:
        for line in inf:
            fields = line.split()
            if fields[0] not in chrom_len:
                continue
            # Floor division keeps the midpoint an int on Python 2 and 3.
            mid = (int(fields[1]) + int(fields[2])) // 2
            values = []
            for handle in handles:
                try:
                    signal = handle.summarize(fields[0], max(0, mid - extend),
                                              mid + 1 + extend, 1)
                except Exception:
                    # Best effort: a region that cannot be summarised in one
                    # of the files is skipped as a whole.
                    values = None
                    break
                if float(signal.valid_count) == 0:
                    values.append('0')
                else:
                    values.append(
                        str(float(signal.sum_data / signal.valid_count)))
            if values is not None:
                outf.write("\t".join(fields + values) + "\n")
Пример #4
0
def scan_fp(plusdnase, minusdnase, bed, out, upstream, downstream):
    """Write per-bin plus/minus strand signal around each BED region.

    For every line of ``bed`` whose chromosome is listed in ``plusdnase`` and
    whose start is at least ``upstream``, the window
    ``[start - upstream, end + downstream)`` is summarised with one bin per
    position in both bigWig files.  The output line holds the first six
    input columns, then the ``sum_data`` values of ``plusdnase``, then those
    of ``minusdnase`` (tab separated).

    Args:
        plusdnase: path of the plus-strand bigWig file.
        minusdnase: path of the minus-strand bigWig file.
        bed: path of the region file.
        out: path of the output file.
        upstream: bases added before the region start; regions starting
            closer than this to position 0 are skipped.
        downstream: bases added after the region end.
    """
    # NOTE(review): only ``plusdnase`` is checked for the chromosome; confirm
    # that ``minusdnase`` always carries the same chromosomes.
    index = BwIO(plusdnase)
    chrom_len = dict((node['key'], node['chromSize'])
                     for node in index.chromosomeTree['nodes'])
    # NOTE: the binary bigWig handles are not closed explicitly (unchanged).
    plus_handle = BigWigFile(open(plusdnase, 'rb'))
    minus_handle = BigWigFile(open(minusdnase, 'rb'))

    with open(bed) as inf, open(out, 'w') as outf:
        for line in inf:
            fields = line.split()
            if fields[0] not in chrom_len:
                continue
            start = int(fields[1])
            if start < upstream:
                # The window would begin before position 0.
                continue
            window_start = start - upstream
            window_end = int(fields[2]) + downstream
            nbins = window_end - window_start
            signal1 = plus_handle.summarize(fields[0], window_start,
                                            window_end, nbins)
            signal2 = minus_handle.summarize(fields[0], window_start,
                                             window_end, nbins)
            columns = (fields[:6] +
                       [str(value) for value in signal1.sum_data] +
                       [str(value) for value in signal2.sum_data])
            outf.write("\t".join(columns) + "\n")
Пример #5
0
def get_signal(inputfile, output, vp, vm, dp, dm):
    """Append four bigWig signal columns to a motif table.

    A fixed header line is written first.  Each input line that does not
    start with "chrom" and whose chromosome is listed in ``vp`` is copied to
    ``output`` with four extra columns: the single-bin ``sum_data`` of
    ``vp``, ``vm``, ``dp`` and ``dm`` (in that order) over the region
    extended by 50 bases on both sides.

    Args:
        inputfile: path of the whitespace-separated input table.
        output: path of the tab-separated output table.
        vp, vm, dp, dm: bigWig paths; judging by the header names these are
            vehicle plus/minus and DHT plus/minus tracks.
    """
    index = BwIO(vp)
    chrom_len = dict((node['key'], node['chromSize'])
                     for node in index.chromosomeTree['nodes'])
    # NOTE: the binary bigWig handles are not closed explicitly (unchanged).
    handles = [BigWigFile(open(path, 'rb')) for path in (vp, vm, dp, dm)]
    colnames = [
        "chrom", "start", "end", "seq", "motifscore", "strand",
        "LncapARsignal", "LncapDNaseCutsite", "LncapDNaseFrag",
        "K562DNaseFrag", "LncapFP", "K562FP", "overARpeak", "VehPlus",
        "VehMinus", "DHTPlus", "DHTMinus"
    ]
    with open(inputfile) as inf, open(output, 'w') as outf:
        outf.write("\t".join(colnames) + "\n")
        for line in inf:
            if line.startswith("chrom"):
                # Header line of the input table.
                continue
            fields = line.split()
            if fields[0] not in chrom_len:
                continue
            # Clamp at 0 like the other get_signal variants so a region
            # within 50 bases of the chromosome start does not produce a
            # negative query coordinate.
            window_start = max(int(fields[1]) - 50, 0)
            window_end = int(fields[2]) + 50
            for handle in handles:
                signal = handle.summarize(fields[0], window_start,
                                          window_end, 1)
                fields.append(str(float(signal.sum_data)))
            outf.write("\t".join(fields) + "\n")
Пример #6
0
def get_signal(inputfile, output, Pbw, Nbw, score_range):
    """Append total signal and two strand-imbalance scores to each motif.

    For every input line whose chromosome is listed in ``Pbw``, the window
    ``[max(start - 100, 0), start + 100)`` is summarised in 200 bins in both
    bigWig files.  Three columns are appended:

    * ``DNase``: sum of all bins of both files;
    * ``FPscore1``: log2 of
      ``(up_same + c) * (down_diff + c) / ((up_diff + c) * (down_same + c))``
      with pseudo count ``c = 0.2``;
    * ``FPscore2``: ``sqrt(up_same) + sqrt(down_diff) - sqrt(up_diff)
      - sqrt(down_same)``.

    "up"/"down" are the ``score_range`` bins before index 100 and after
    index ``100 + motif_len``; for strand "-" the two flanks and the two
    files swap roles.  Any other strand value prints the line and exits with
    status 1.

    Args:
        inputfile: path of the motif table (strand in column 6).
        output: path of the tab-separated output table.
        Pbw: bigWig path of the plus-strand signal.
        Nbw: bigWig path of the minus-strand signal.
        score_range: number of flanking bins summed on each side.
    """
    # NOTE(review): when start < 100 the window is clamped at 0 but is still
    # split into 200 bins, so index 100 no longer corresponds to the motif
    # start -- confirm whether such motifs should be skipped.
    pseudo = 0.2
    index = BwIO(Pbw)
    chrom_len = dict((node['key'], node['chromSize'])
                     for node in index.chromosomeTree['nodes'])
    # NOTE: the binary bigWig handles are not closed explicitly (unchanged).
    PH = BigWigFile(open(Pbw, 'rb'))
    NH = BigWigFile(open(Nbw, 'rb'))
    with open(inputfile) as inf, open(output, 'w') as outf:
        for line in inf:
            fields = line.split()
            if fields[0] not in chrom_len:
                continue
            start = int(fields[1])
            motif_len = int(fields[2]) - start
            window = (fields[0], max(start - 100, 0), start + 100, 200)
            Psignal = list(PH.summarize(*window).sum_data)
            Nsignal = list(NH.summarize(*window).sum_data)
            DNase = sum(Psignal) + sum(Nsignal)

            left = slice(100 - score_range, 100)
            right = slice(100 + motif_len, 100 + motif_len + score_range)
            if fields[5] == '+':
                same, diff, up, down = Psignal, Nsignal, left, right
            elif fields[5] == '-':
                # Reverse strand: flanks and strands are mirrored.
                same, diff, up, down = Nsignal, Psignal, right, left
            else:
                print(line)
                sys.exit(1)
            S_up_same = sum(same[up])
            S_up_diff = sum(diff[up])
            S_down_same = sum(same[down])
            S_down_diff = sum(diff[down])

            FPscore1 = math.log(
                (S_up_same + pseudo) * (S_down_diff + pseudo) /
                ((S_up_diff + pseudo) * (S_down_same + pseudo)), 2)
            FPscore2 = (math.sqrt(S_up_same) + math.sqrt(S_down_diff) -
                        math.sqrt(S_up_diff) - math.sqrt(S_down_same))

            fields.extend([DNase, FPscore1, FPscore2])
            outf.write("\t".join(map(str, fields)) + "\n")
Пример #7
0
def get_signal(inputfile, output, signalbw):
    """Append the summed bigWig signal around each region as a new column.

    Each input line whose chromosome is listed in ``signalbw`` is copied to
    ``output`` (tab separated) with one extra column: the single-bin
    ``sum_data`` over ``[max(start - 50, 0), end + 50)``.  Lines on other
    chromosomes are dropped.

    Args:
        inputfile: path of the whitespace-separated region file.
        output: path of the output file.
        signalbw: path of the bigWig file.
    """
    index = BwIO(signalbw)
    chrom_len = dict((node['key'], node['chromSize'])
                     for node in index.chromosomeTree['nodes'])
    # NOTE: the binary bigWig handle is not closed explicitly (unchanged).
    bwHandle = BigWigFile(open(signalbw, 'rb'))
    with open(inputfile) as inf, open(output, 'w') as outf:
        for line in inf:
            fields = line.split()
            if fields[0] not in chrom_len:
                continue
            signal = bwHandle.summarize(fields[0],
                                        max(int(fields[1]) - 50, 0),
                                        int(fields[2]) + 50, 1)
            fields.append(str(float(signal.sum_data)))
            outf.write("\t".join(fields) + "\n")
Пример #8
0
def summary(bwfile, bedfile, out, central_max, central_min, flanking_max,
            flanking_min, cutoff):
    """Scan BED regions for footprints and write them as BED lines.

    For every line of ``bedfile`` whose chromosome is listed in ``bwfile``,
    the per-position ``sum_data`` profile of the region is passed to
    ``caculate_footprint`` together with the size/cutoff parameters.  Each
    returned item ``ft`` is written to ``out`` as
    ``chrom, start + ft[0], start + ft[1], <4th input column>, ft[2]``.
    The elapsed time is printed at the end.

    Args:
        bwfile: path of the bigWig file.
        bedfile: path of the region file; every line needs >= 4 columns.
        out: path of the output file.
        central_max, central_min, flanking_max, flanking_min, cutoff:
            forwarded unchanged to ``caculate_footprint``.
    """
    index = BwIO(bwfile)
    chrom_len = dict((node['key'], node['chromSize'])
                     for node in index.chromosomeTree['nodes'])
    # NOTE: the binary bigWig handle is not closed explicitly (unchanged).
    bwHandle = BigWigFile(open(bwfile, 'rb'))
    with open(bedfile) as inf, open(out, 'w') as outf:
        t = time.time()
        for line in inf:
            fields = line.split()
            if fields[0] not in chrom_len:
                continue
            start, end = int(fields[1]), int(fields[2])
            stats = bwHandle.summarize(fields[0], start, end, end - start)
            digital = list(stats.sum_data)
            footprints = caculate_footprint(digital, central_max,
                                            central_min, flanking_max,
                                            flanking_min, cutoff)
            for ft in footprints:
                # ft[0]/ft[1] are offsets relative to the region start.
                bed = "\t".join(
                    map(str, [
                        fields[0], start + ft[0], start + ft[1], fields[3],
                        ft[2]
                    ])) + "\n"
                outf.write(bed)
    print("scaning 1st  %s" % (time.time() - t))
Пример #9
0
def sitepro_scan(pattern, peak, out, w_plus, w_minus, trunk):
    """Slide a strand-specific cut pattern over peaks and report best hits.

    ``pattern`` holds two comma-separated lines (plus and minus strand
    profile); both are normalised by their joint sum.  For every peak whose
    chromosome is listed in both bigWig files, each window of pattern length
    is scored with ``match_pattern`` after capping the per-position values at
    ``trunk``.  Windows scored "NA" are ignored.  Among overlapping windows
    the best-scoring one is kept; a kept window is emitted once a later
    window starts at or beyond its end, and the last one at the end of the
    peak.  Emitted lines are ``chrom, start + 3, end - 3, score``.

    The time spent on each batch of 100 input lines is printed.

    Args:
        pattern: path of the two-line pattern file.
        peak: path of the peak file (chrom, start, end in columns 1-3).
        out: path of the tab-separated output file.
        w_plus: bigWig path of the plus-strand signal.
        w_minus: bigWig path of the minus-strand signal.
        trunk: upper cap applied to every per-position signal value.
    """
    with open(pattern) as pattern_file:
        pattern_plus = [float(v)
                        for v in pattern_file.readline().strip().split(",")]
        pattern_minus = [float(v)
                         for v in pattern_file.readline().strip().split(",")]
    total = sum(pattern_plus) + sum(pattern_minus)
    p_plus = [value / total for value in pattern_plus]
    p_minus = [value / total for value in pattern_minus]
    l = len(pattern_plus)
    p0 = [1.0 / (2 * l)] * l  # uniform background over both strands

    index_plus = BwIO(w_plus)
    index_minus = BwIO(w_minus)
    chrom_len1 = dict((node['key'], node['chromSize'])
                      for node in index_plus.chromosomeTree['nodes'])
    chrom_len2 = dict((node['key'], node['chromSize'])
                      for node in index_minus.chromosomeTree['nodes'])
    # NOTE: the binary bigWig handles are not closed explicitly (unchanged).
    w_plus_H = BigWigFile(open(w_plus, 'rb'))
    w_minus_H = BigWigFile(open(w_minus, 'rb'))

    def capped(values):
        # Convert to float and cap at ``trunk``.
        floats = [float(v) for v in values]
        return [trunk if v > trunk else v for v in floats]

    footprint = []
    count = 0
    with open(peak) as inf:
        t = time.time()
        for line in inf:
            fields = line.split()
            if fields[0] in chrom_len1 and fields[0] in chrom_len2:
                chrom, start, end = fields[0], int(fields[1]), int(fields[2])
                p_sum = list(
                    w_plus_H.summarize(chrom, start, end,
                                       end - start).sum_data)
                m_sum = list(
                    w_minus_H.summarize(chrom, start, end,
                                        end - start).sum_data)
                last_start = "NA"
                last_end = "NA"
                last_value = "NA"
                for i in range(len(p_sum) - l):
                    o_plus = capped(p_sum[i:i + l])
                    o_minus = capped(m_sum[i:i + l])
                    score = match_pattern(p_plus, p_minus, p0, p0, o_plus,
                                          o_minus, l)
                    if score == "NA":
                        continue
                    if last_start == "NA":
                        last_start, last_end, last_value = i, i + l, score
                    elif i >= last_end:
                        # The kept window no longer overlaps: emit it.
                        footprint.append([chrom, start + last_start + 3,
                                          start + last_end - 3, last_value])
                        last_start, last_end, last_value = i, i + l, score
                    elif score > last_value:
                        last_start, last_end, last_value = i, i + l, score
                # Emit the pending window only if one was scored at all; the
                # unconditional append used to fail with TypeError
                # (int + "NA") for peaks shorter than the pattern or with
                # only "NA" scores.
                if last_start != "NA":
                    footprint.append([chrom, start + last_start + 3,
                                      start + last_end - 3, last_value])
            if count % 100 == 0:
                print(time.time() - t)
                t = time.time()
            count += 1

    with open(out, 'w') as outf:
        for fp in footprint:
            outf.write("\t".join(map(str, fp)) + "\n")
Пример #10
0
def sitepro_scan(pattern, peak, out, w_plus, w_minus, trunk, text, w_chip):
    """Score motif sites against strand-aware cut patterns.

    ``pattern`` holds four comma-separated lines: plus/minus strand profile
    for "+" motifs, then plus/minus strand profile for "-" motifs; each pair
    is normalised by its joint sum.  For every site whose chromosome is
    listed in both strand bigWig files:

    * ``DNase``: plus + minus single-bin sum over midpoint +/- 50;
    * ``Chip``: ``w_chip`` single-bin sum over midpoint +/- 10;
    * every window of pattern length ``l`` inside the site extended by
      ``3 + l`` on both sides is scored with ``match_pattern``; the score of
      the window at offset ``l`` is written to ``out`` after the input
      columns, followed by ``DNase`` and ``Chip``;
    * the offset of the best-scoring window is counted in a histogram of
      length ``2 * l`` written to ``text`` (unless best offset and best score
      are both 0).

    Args:
        pattern: path of the four-line pattern file.
        peak: path of the site file (strand in column 6).
        out: path of the tab-separated per-site output.
        w_plus: bigWig path of the plus-strand signal.
        w_minus: bigWig path of the minus-strand signal.
        trunk: unused; kept for interface compatibility (capping disabled).
        text: path of the one-line histogram output.
        w_chip: bigWig path of the ChIP signal.

    Raises:
        ValueError: if the strand column is neither "+" nor "-".
    """
    def read_row(handle):
        return [float(v) for v in handle.readline().strip().split(",")]

    with open(pattern) as pattern_file:
        pattern_plus_pmotif = read_row(pattern_file)
        pattern_minus_pmotif = read_row(pattern_file)
        pattern_plus_mmotif = read_row(pattern_file)
        pattern_minus_mmotif = read_row(pattern_file)

    all_sum_p = sum(pattern_plus_pmotif) + sum(pattern_minus_pmotif)
    all_sum_m = sum(pattern_plus_mmotif) + sum(pattern_minus_mmotif)

    l = len(pattern_plus_pmotif)
    p_plus = []
    p_minus = []
    m_plus = []
    m_minus = []
    for i in range(l):
        p_plus.append(pattern_plus_pmotif[i] / all_sum_p)
        p_minus.append(pattern_minus_pmotif[i] / all_sum_p)
        m_plus.append(pattern_plus_mmotif[i] / all_sum_m)
        m_minus.append(pattern_minus_mmotif[i] / all_sum_m)

    index_plus = BwIO(w_plus)
    index_minus = BwIO(w_minus)
    chrom_len1 = dict((node['key'], node['chromSize'])
                      for node in index_plus.chromosomeTree['nodes'])
    chrom_len2 = dict((node['key'], node['chromSize'])
                      for node in index_minus.chromosomeTree['nodes'])
    # NOTE: the binary bigWig handles are not closed explicitly (unchanged).
    w_plus_H = BigWigFile(open(w_plus, 'rb'))
    w_minus_H = BigWigFile(open(w_minus, 'rb'))
    w_chip_H = BigWigFile(open(w_chip, 'rb'))

    footprint = []
    # NOTE(review): offsets can reach end - start + 5 + l, which exceeds this
    # histogram when a site is longer than l - 6 -- confirm the site length.
    ls = [0] * 2 * l
    flank = 3 + l  # 3 is the flanking length used around the pattern
    with open(peak) as inf:
        # Columns: chr start end name motifscore strand FP DNase chip
        for line in inf:
            fields = line.split()
            chrom = fields[0]
            if not (chrom in chrom_len1 and chrom in chrom_len2):
                continue
            strand = fields[5]
            if strand == "+":
                ref_plus, ref_minus = p_plus, p_minus
            elif strand == "-":
                ref_plus, ref_minus = m_plus, m_minus
            else:
                # Previously ``score`` stayed unbound (or kept the value of
                # the previous site) for such lines.
                raise ValueError("unknown strand %r in line: %s"
                                 % (strand, line))
            start, end = int(fields[1]), int(fields[2])
            mid = (start + end) // 2
            DNase = (float(w_plus_H.summarize(chrom, mid - 50, mid + 50,
                                              1).sum_data) +
                     float(w_minus_H.summarize(chrom, mid - 50, mid + 50,
                                               1).sum_data))
            Chip = float(w_chip_H.summarize(chrom, mid - 10, mid + 10,
                                            1).sum_data)
            nbins = end - start + 2 * flank
            p_sum = list(w_plus_H.summarize(chrom, start - flank,
                                            end + flank, nbins).sum_data)
            m_sum = list(w_minus_H.summarize(chrom, start - flank,
                                             end + flank, nbins).sum_data)
            last_start = "NA"
            last_value = "NA"
            for i in range(len(p_sum) - l):
                o_plus = [int(v) for v in p_sum[i:i + l]]
                o_minus = [int(v) for v in m_sum[i:i + l]]
                score = match_pattern(ref_plus, ref_minus, o_plus, o_minus,
                                      l, 1)
                if i == l:
                    footprint.append(fields + [score, DNase, Chip])
                if last_start == "NA" or score > last_value:
                    last_start = i
                    last_value = score

            if not (last_start == 0 and last_value == 0):
                ls[last_start] += 1

    with open(out, 'w') as outf:
        for fp in footprint:
            outf.write("\t".join(map(str, fp)) + "\n")
    with open(text, 'w') as outf:
        outf.write("\t".join(map(str, ls)) + "\n")
def sitepro_scan(peak, out, w_plus, w_minus):
    """Collect window totals and a 400-bin profile around each site.

    For every line of ``peak`` whose chromosome is listed in both bigWig
    files the following columns are appended to the input columns:

    * plus and minus totals over midpoint +/- 50, 100, 150 and 200 (each the
      sum of a 2-bin summary), in the order 100p, 100m, 200p, 200m, 300p,
      300m, 400p, 400m;
    * the 400 ``sum_data`` bins of ``[start - 200, start + 200)`` for the
      plus file, then for the minus file.

    All results are kept in memory and written to ``out`` (tab separated)
    once the whole peak file has been read.

    Args:
        peak: path of the site file (chrom, start, end in columns 1-3).
        out: path of the output file.
        w_plus: bigWig path of the plus-strand signal.
        w_minus: bigWig path of the minus-strand signal.
    """
    index_plus = BwIO(w_plus)
    index_minus = BwIO(w_minus)
    chrom_len1 = dict((node['key'], node['chromSize'])
                      for node in index_plus.chromosomeTree['nodes'])
    chrom_len2 = dict((node['key'], node['chromSize'])
                      for node in index_minus.chromosomeTree['nodes'])
    # NOTE: the binary bigWig handles are not closed explicitly (unchanged).
    w_plus_H = BigWigFile(open(w_plus, 'rb'))
    w_minus_H = BigWigFile(open(w_minus, 'rb'))

    footprint = []
    with open(peak) as inf:
        # Columns: chr start end name motifscore strand FP DNase chip
        for line in inf:
            fields = line.split()
            chrom = fields[0]
            if chrom in chrom_len1 and chrom in chrom_len2:
                start = int(fields[1])
                # Floor division keeps the midpoint an int on Python 2 and 3.
                mid = (start + int(fields[2])) // 2
                totals = []
                for half_width in (50, 100, 150, 200):
                    for handle in (w_plus_H, w_minus_H):
                        stats = handle.summarize(chrom, mid - half_width,
                                                 mid + half_width, 2)
                        totals.append(sum(stats.sum_data))
                p_sum = list(
                    w_plus_H.summarize(chrom, start - 200, start + 200,
                                       400).sum_data)
                m_sum = list(
                    w_minus_H.summarize(chrom, start - 200, start + 200,
                                        400).sum_data)
                footprint.append(fields + totals + p_sum + m_sum)

    with open(out, 'w') as outf:
        for fp in footprint:
            outf.write("\t".join(map(str, fp)) + "\n")
Пример #12
0
def main():
    """Command-line entry point: correlation plot for several bigWig files.

    Parses the options, samples every bigWig file on the chromosomes common
    to all of them (one bin per ``step`` kbp), writes an R script to the
    ``-r`` file that draws either a pairwise scatter-plot matrix or a
    correlation heatmap, and finally tries to run it with ``Rscript``.

    Exits with status 1 on invalid arguments, unreadable input files or when
    the files share no chromosome.
    """
    usage = "usage: %prog <-r rfile> [options] <bigwig files> ..."
    description = "Draw correlation plot for many bigwig files. Based on qc_chIP_whole.py"

    optparser = OptionParser(version="%prog 0.1",description=description,usage=usage,add_help_option=False)
    optparser.add_option("-h","--help",action="help",help="Show this help message and exit.")
    optparser.add_option("-r","--rfile",dest="rfile",
                         help="R output file. If not set, do not save R file.")
    optparser.add_option("-s","--step",dest="step",type="int",
                         help="sampling step in kbps. default: 100, minimal: 1",default=100)
    optparser.add_option("-z","--imgsize",dest="imgsize",type="int",
                         help="image size in inches, note the PNG dpi is 72. default: 10, minimal: 10",default=10)
    optparser.add_option("-f","--format",dest="imgformat",type="string",
                         help="image format. PDF or PNG",default='PDF')
    optparser.add_option("-l","--wig-label",dest="wiglabel",type="string",action="append",
                         help="the wiggle file labels in the figure. No space is allowed. This option should be used same times as wiggle files, and please input them in the same order as -w option. default: will use the wiggle file filename as labels.")
    optparser.add_option("--min-score",dest="minscore",type="float",default=-10000,
                         help="minimum score included in calculation. Points w/ score lower than this will be discarded.")
    optparser.add_option("--max-score",dest="maxscore",type="float",default=10000,
                         help="maximum score included in calculation. Points w/ score larger than this will be discarded.")
    optparser.add_option("-H","--heatmap",dest="heatmap",action="store_true",default=False,
                         help="If True, a heatmap image will be generated instead of paired scatterplot image.")

    (options,wigfiles) = optparser.parse_args()

    imgfmt = options.imgformat.upper()
    if imgfmt != 'PDF' and imgfmt != 'PNG':
        print("unrecognized format: %s" % imgfmt)
        sys.exit(1)

    wigfilenum = len(wigfiles)
    if wigfilenum < 2:
        error("must provide >=2 wiggle files")
        optparser.print_help()
        sys.exit(1)
    if not options.rfile:
        # Used to be reported with the ">=2 wiggle files" message.
        error("must provide the R output file with -r/--rfile")
        optparser.print_help()
        sys.exit(1)

    # wig labels: fall back to the file names unless exactly one label per
    # file was given
    if options.wiglabel and len(options.wiglabel) == wigfilenum:
        wiglabel = options.wiglabel
    else:
        wiglabel = [os.path.basename(path) for path in wigfiles]

    if options.step < 1:
        error("Step can not be lower than 1!")
        sys.exit(1)
    if options.imgsize < 10:
        error("Image size can not be lower than 10!")
        sys.exit(1)

    # check the files
    for f in wigfiles:
        if not os.path.isfile(f):
            error("%s is not valid!" % f)
            sys.exit(1)

    info("number of bigwig files: %d" % wigfilenum)

    # get chromosome lengths from the first bigwig file
    p = BwIO(wigfiles[0])
    chrom_len = {}
    for node in p.chromosomeTree['nodes']:
        chrom_len[node['key']] = node['chromSize']

    # get the common chromosome list
    chrset = set(chrom_len)
    for bw in wigfiles[1:]:
        p = BwIO(bw)
        chrset = chrset.intersection(
            set([t['key'] for t in p.chromosomeTree['nodes']]))
    chroms = list(chrset)

    if not chroms:
        error('No common chrom found')
        # This is a failure: report it through the exit status as well.
        sys.exit(1)
    info("common chromosomes are %s." % ",".join(chroms))

    # Write the R file; ``with`` closes it even if sampling fails half-way.
    with open(options.rfile, "w") as rfhd:
        rfhd.write('''require("RColorBrewer") ## from CRAN\n''')

        # for each wig file, sample...
        for i, wigfile in enumerate(wigfiles):
            bw = BigWigFile(open(wigfile, 'rb'))

            info("read wiggle track from bigwig file #%d" % (i+1))
            profile = []
            for chrom in chroms:
                # Too-short chromosomes yield zero bins, which makes
                # bw.summarize fail, so filter them out.  Floor division
                # keeps the bin count an int on Python 2 and 3.
                nbins = chrom_len[chrom] // options.step // 1000
                if nbins == 0:
                    warn("A very-short chromosome (%s) found and skipped"%chrom)
                    continue

                summary = bw.summarize(chrom, 0, chrom_len[chrom], nbins)
                if not summary:
                    continue
                profile_chr = summary.sum_data / summary.valid_count
                # bins without valid data come out as nan; R expects NA
                profile_chr = [str(t).replace('nan', 'NA') for t in profile_chr]
                profile.extend(profile_chr)

            info("write values to r file")
            rfhd.write("p%d <- c(%s)\n" %(i, ','.join(profile)))

        rfhd.write("c <- cbind(p0")
        for i in range(wigfilenum-1):
            rfhd.write(",p%d" % (i+1))
        rfhd.write(")\n")

        # keep only the rows whose values all lie within [minscore, maxscore]
        rfhd.write("c <- c[ c[,1]<=%f & c[,1]>=%f " % (options.maxscore,options.minscore))
        for i in range(wigfilenum-1):
            rfhd.write("& c[,%d]<=%f & c[,%d]>=%f " % (i+2,options.maxscore,i+2,options.minscore))
        rfhd.write(",]\n")
        if imgfmt == 'PDF':
            rfhd.write("pdf(\"%s.pdf\",width=%d,height=%d)\n" % (options.rfile,options.imgsize,options.imgsize))
        elif imgfmt == 'PNG':
            rfhd.write("png(\"%s.png\",units=\"in\",res=150,width=%d,height=%d)\n" % (options.rfile,options.imgsize,options.imgsize))

        labels = ",".join(map(lambda x:"\""+x+"\"",wiglabel))
        if options.heatmap:                 # heatmap
            rfhd.write('library(gplots)\n')
            rfhd.write('''
m <- cor(c, method="pearson", use="pairwise.complete.obs")
''')
            rfhd.write("rownames(m) <- c(%s)\n" % labels)
            rfhd.write("colnames(m) <- c(%s)\n" % labels)
            rfhd.write('# draw the heatmap using gplots heatmap.2\n')
            rfhd.write('mn <- -1\n')
            rfhd.write('mx <- 1\n')
            rfhd.write('n <- 98\n')
            rfhd.write('bias <- 1\n')
            rfhd.write('mc <- matrix(as.character(round(m, 2)), ncol=dim(m)[2])\n')
            rfhd.write('breaks <- seq(mn, mx, (mx-mn)/(n))\n')
            rfhd.write('cr <- colorRampPalette(colors = c("#2927FF","#FFFFFF","#DF5C5C"), bias=bias)\n')
            rfhd.write('heatmap.2(m, col = cr(n), breaks=breaks, trace="none", cellnote=mc, notecol="black", notecex=1.8, keysize=0.5, density.info="histogram", margins=c(27.0,27.0), cexRow=2.20, cexCol=2.20, revC=T, symm=T)\n')
        else:                               # scatterplot
            rfhd.write('''
panel.plot <- function( x,y, ... )
{
  par(new=TRUE)
  m <- cbind(x,y)
  plot(m,col=densCols(m),pch=20)
  lines(lowess(m[!is.na(m[,1])&!is.na(m[,2]),]),col="red")  
}
    
panel.cor <- function(x, y, digits=2, prefix="", cex.cor, ...)
{
  usr <- par("usr"); on.exit(par(usr))
  par(usr = c(0, 1, 0, 1))
  r <- cor(x, y,use="complete.obs")
  txt <- format(round(r,2),width=5,nsmall=2)
  #format(c(r, 0.123456789), digits=digits)[1]
  txt <- paste(prefix, txt, sep="")
  if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
  #text(0.5, 0.5, txt, cex = cex.cor * abs(r))
  text(0.5, 0.5, txt, cex = cex.cor)
}
''')
            rfhd.write('''
pairs(c, lower.panel=panel.plot, upper.panel=panel.cor, labels=c(%s))
''' % (labels))

        rfhd.write("dev.off()\n")

    # try to call R
    try:
        status = subprocess.call(['Rscript',options.rfile])
    except (OSError, ValueError):
        # Rscript could not be started: leave the script for a manual run.
        info("Please check %s" % options.rfile)
    else:
        if status != 0:
            # R ran but failed, so no image is announced.
            info("Please check %s" % options.rfile)
        else:
            # The script writes "<rfile>.pdf" / "<rfile>.png" (lower case).
            info("Please check %s" % (options.rfile+'.'+imgfmt.lower()))
Пример #13
0
def sitepro_scan(peak, out, w_plus, w_minus):
    """Stream window totals and a 400-bin profile around each site.

    Lines whose start or end column is not an integer are reported on stdout
    and skipped.  For every remaining line whose chromosome is listed in both
    bigWig files the following columns are appended and the line is written
    to ``out`` immediately (tab separated):

    * plus and minus totals over midpoint +/- 50, 100, 150 and 200 (each the
      sum of a 2-bin summary), in the order 100p, 100m, 200p, 200m, 300p,
      300m, 400p, 400m; for the +/- 50 window a failing 2-bin summary is
      retried with a single bin;
    * the 400 ``sum_data`` bins of ``[start - 200, start + 200)`` for the
      plus file, then for the minus file.

    Args:
        peak: path of the site file (chrom, start, end in columns 1-3).
        out: path of the output file.
        w_plus: bigWig path of the plus-strand signal.
        w_minus: bigWig path of the minus-strand signal.
    """
    index_plus = BwIO(w_plus)
    index_minus = BwIO(w_minus)
    chrom_len1 = dict((node['key'], node['chromSize'])
                      for node in index_plus.chromosomeTree['nodes'])
    chrom_len2 = dict((node['key'], node['chromSize'])
                      for node in index_minus.chromosomeTree['nodes'])
    # NOTE: the binary bigWig handles are not closed explicitly (unchanged).
    w_plus_H = BigWigFile(open(w_plus, 'rb'))
    w_minus_H = BigWigFile(open(w_minus, 'rb'))

    def window_total(handle, chrom, lo, hi, bins):
        return sum(handle.summarize(chrom, lo, hi, bins).sum_data)

    with open(peak) as inf, open(out, 'w') as outf:
        # Columns: chr start end name motifscore strand FP DNase chip
        for line in inf:
            fields = line.split()
            try:
                start = int(fields[1])
            except ValueError:
                print('start: %s %s' % (fields[1], line))
                continue
            try:
                end = int(fields[2])
            except ValueError:
                print('end: %s %s' % (fields[2], line))
                continue
            chrom = fields[0]
            if not (chrom in chrom_len1 and chrom in chrom_len2):
                continue
            # Floor division keeps the midpoint an int on Python 2 and 3.
            mid = (start + end) // 2
            totals = []
            for half_width in (50, 100, 150, 200):
                for handle in (w_plus_H, w_minus_H):
                    try:
                        total = window_total(handle, chrom, mid - half_width,
                                             mid + half_width, 2)
                    except Exception:
                        if half_width != 50:
                            # Only the narrowest window has a fallback.
                            raise
                        total = window_total(handle, chrom, mid - half_width,
                                             mid + half_width, 1)
                    totals.append(total)
            p_sum = list(
                w_plus_H.summarize(chrom, start - 200, start + 200,
                                   400).sum_data)
            m_sum = list(
                w_minus_H.summarize(chrom, start - 200, start + 200,
                                    400).sum_data)
            fpline = fields + totals + p_sum + m_sum
            outf.write("\t".join(map(str, fpline)) + "\n")
Пример #14
0
def sitepro_scan(pattern, peak, out, w_plus, w_minus, trunk, text):
    """Score peak regions against a two-row pattern and write the results.

    The pattern file supplies two comma-separated rows of numbers (read as
    the "plus" row, then the "minus" row); both rows are normalised by their
    combined sum.  For every region in ``peak`` whose chromosome is present
    in both bigWig files, the signal of the region padded by 25 positions on
    each side is fetched and a window as long as the pattern is slid across
    it.  Each window is scored with ``match_pattern`` against a uniform
    background of ``1 / (2 * pattern_length)`` per position.

    Args:
        pattern: Path of the pattern file (two comma-separated lines).
        peak: Path of a whitespace-delimited region file whose first three
            columns are chromosome, start and end.  Blank lines are skipped.
        out: Output path; receives, tab-separated, the columns of each
            region followed by the score of window offset 22.
        w_plus: Path of the bigWig file scored against the first row.
        w_minus: Path of the bigWig file scored against the second row.
        trunk: Upper cap applied to every integer-truncated signal value.
        text: Output path; receives one line holding 44 tab-separated
            counts of the offset at which the best-scoring window started.

    Raises:
        ZeroDivisionError: If the pattern values sum to zero or the pattern
            row is empty.
        IndexError: If the best window of a region starts at offset 44 or
            later (the histogram has a fixed 44 bins).
    """
    with open(pattern) as pattern_file:
        # Materialise as lists: each row is summed, iterated and measured.
        pattern_plus = [
            float(v) for v in pattern_file.readline().strip().split(",")
        ]
        pattern_minus = [
            float(v) for v in pattern_file.readline().strip().split(",")
        ]
    all_sum = sum(pattern_plus) + sum(pattern_minus)
    p_plus = [v / all_sum for v in pattern_plus]
    p_minus = [v / all_sum for v in pattern_minus]
    l = len(pattern_plus)
    p0 = [1.0 / (2 * l)] * l

    # Only membership is ever tested, so chromosome names are enough.
    chroms_plus = set(
        node['key'] for node in BwIO(w_plus).chromosomeTree['nodes'])
    chroms_minus = set(
        node['key'] for node in BwIO(w_minus).chromosomeTree['nodes'])

    footprint = []
    ls = [0] * 44  # histogram of best-window start offsets
    with open(peak) as peak_file, \
            open(w_plus, 'rb') as plus_fh, \
            open(w_minus, 'rb') as minus_fh:
        w_plus_H = BigWigFile(plus_fh)
        w_minus_H = BigWigFile(minus_fh)
        for line in peak_file:
            ll = line.split()
            if not ll:
                continue
            chrom = ll[0]
            if chrom not in chroms_plus or chrom not in chroms_minus:
                continue
            # Pad by 3 + 22 on each side; the bin count equals the padded
            # span, so presumably one summary value per position.
            start = int(ll[1]) - 3 - 22
            end = int(ll[2]) + 3 + 22
            width = int(ll[2]) - int(ll[1]) + 6 + 44
            p_sum = list(
                w_plus_H.summarize(chrom, start, end, width).sum_data)
            m_sum = list(
                w_minus_H.summarize(chrom, start, end, width).sum_data)

            best_start = None
            best_score = None
            for i in range(len(p_sum) - l):
                # Truncate to int and cap every value at ``trunk``.
                o_plus = [min(int(v), trunk) for v in p_sum[i:i + l]]
                o_minus = [min(int(v), trunk) for v in m_sum[i:i + l]]
                score = match_pattern(p_plus, p_minus, p0, p0, o_plus,
                                      o_minus, l)
                # Offset 22 starts 3 positions before the region start
                # (-25 padding + 22); its score is the one reported.
                if i == 22:
                    footprint.append(ll + [score])
                # Strict ">" keeps the earliest window among equal scores.
                if best_score is None or score > best_score:
                    best_start = i
                    best_score = score

            if best_start is None:
                # Padded region shorter than the pattern: nothing scored.
                continue
            if best_start == 0 and best_score == 0:
                # Best window is the first one with a zero score; not
                # counted in the histogram.
                continue
            ls[best_start] += 1

    with open(out, 'w') as outf:
        for fp in footprint:
            outf.write("\t".join(map(str, fp)) + "\n")
    with open(text, 'w') as outf:
        outf.write("\t".join(map(str, ls)) + "\n")
Пример #15
0
def getsignal(inputfile,outputfile,BGmatrix,pcut,ncut,pspan,fetch_length=100,gen='hg19'):
    """Average the cut signal and sequence background around input regions.

    For every line of ``inputfile`` whose chromosome is listed in the
    ``pcut`` bigWig, ``make_cut`` is called on both bigWig files; the two
    results are swapped when column 6 of the line is "-".  The per-region
    results are averaged with ``apply_mean``.  The genomic sequence of each
    region (extended by ``pspan + 3`` on each side) is then fetched, every
    6-mer is looked up in the background tables returned by ``readBG``, and
    the per-region background vectors are averaged as well.  All four mean
    vectors are handed to ``plot_template``.

    Args:
        inputfile: Path of a whitespace-delimited region file with at least
            six columns (chromosome, start, end, ..., strand in column 6).
            Blank lines are skipped.
        outputfile: Passed through to ``plot_template``.
        BGmatrix: Passed to ``readBG`` to obtain the two 6-mer lookup tables.
        pcut: Path of the first bigWig file; also defines which chromosomes
            are accepted.
        ncut: Path of the second bigWig file.
        pspan: Flank size, forwarded to ``make_cut`` and used to extend the
            interval whose sequence is fetched.
        fetch_length: Forwarded to ``make_cut``.
        gen: Genome name given to ``c.interval``.

    Raises:
        KeyError: If a 6-mer of a fetched sequence is missing from the
            background tables.
    """
    chrom_len = {}
    for node in BwIO(pcut).chromosomeTree['nodes']:
        chrom_len[node['key']] = node['chromSize']

    pp = []
    pm = []
    with open(pcut, 'rb') as pcut_fh, \
            open(ncut, 'rb') as ncut_fh, \
            open(inputfile) as inf:
        pcutbw = BigWigFile(pcut_fh)
        ncutbw = BigWigFile(ncut_fh)
        X = c.interval(genome=gen)
        X.chrom, X.start, X.end, X.val = [], [], [], []
        pBG, nBG = readBG(BGmatrix)
        for line in inf:
            ll = line.split()
            if not ll or ll[0] not in chrom_len:
                continue
            pout = make_cut(pcutbw, ll, pspan, fetch_length)
            nout = make_cut(ncutbw, ll, pspan, fetch_length)
            if ll[5] == "-":
                pout, nout = nout, pout
            # NOTE(review): only ``pout`` (after the strand swap) is tested
            # for 'NA'; ``nout`` is stored unchecked -- confirm that
            # make_cut yields 'NA' for both files together.
            if pout == 'NA':
                continue
            pp.append(pout)
            pm.append(nout)
            X.chrom.append(ll[0])
            X.start.append(int(ll[1]) - pspan - 3 + 1)
            X.end.append(int(ll[2]) + pspan + 3 + 1)
            X.val.append(ll[5])

    meanp = apply_mean(pp)
    meanm = apply_mean(pm)

    X.getSequence()

    pbglist = []
    nbglist = []
    for i, seq in enumerate(X.seq):
        strand = X.val[i]
        # Regions with an ambiguous base get no background vector; their
        # cut profiles stay in ``pp`` / ``pm``.
        if 'N' in seq.upper():
            continue
        # The two lookups use sequences shifted by one base from each other.
        pseq = seq[:-1]
        nseq = seq[1:]
        pbg = []
        nbg = []
        for k in range(len(pseq) + 1 - 6):
            pbg.append(pBG[pseq[k:k + 6].upper()])
            nbg.append(nBG[nseq[k:k + 6].upper()])
        if strand != '-':
            pbglist.append(pbg)
            nbglist.append(nbg)
        else:
            # Strand "-": exchange the two vectors and reverse them.
            pbglist.append(nbg[::-1])
            nbglist.append(pbg[::-1])
    meanpbglist = apply_mean(pbglist)
    meanmbglist = apply_mean(nbglist)

    plot_template(meanp, meanm, meanpbglist, meanmbglist, outputfile)