Exemplo n.º 1
0
def refine_with_summit(_soft,_mark,_tissue):
    """Re-centre top peaks on their signal summit, then merge and score them.

    Reads the peaks in ``top_bed/<_soft>.<_mark>.<_tissue>.bed``, looks up the
    signal of ``bigwig/<_mark>.<_tissue>.rep0.bw`` under each peak and writes a
    2 kb window around the midpoint of the highest-scoring interval to
    ``enrich_bed/<_soft>.<_mark>.<_tissue>.bed``.  The windows are then sorted
    and merged with bedtools into ``enrich_merge_bed`` and passed to
    ``../get_enrich.sh``.

    Peaks without any interval scoring above zero are skipped.  The previous
    implementation left ``summit`` unset for such a peak (NameError on the
    first peak) or silently reused the summit of the preceding peak.

    Args:
        _soft: peak-caller label used in the file names.
        _mark: mark label used in the file names.
        _tissue: tissue label used in the file names.
    """
    peak_path = "/Data/adam/dnase/top_bed/{0}.{1}.{2}.bed".format(_soft,_mark,_tissue)
    with open(peak_path) as peak_file:
        _temp_peak = [i.rstrip().split('\t') for i in peak_file]

    bw_path = "/Data/adam/dnase/bigwig/{0}.{1}.rep0.bw".format(_mark,_tissue)
    enrich_path = "/Data/adam/dnase/enrich_bed/{0}.{1}.{2}.bed".format(_soft,_mark,_tissue)

    # bigWig is a binary format, so open it in binary mode; both handles are
    # closed (and the BED output flushed) before the shell pipeline runs.
    with open(bw_path, 'rb') as _temp_bw, open(enrich_path, 'w') as _temp_enrich:
        _bw = BigWigFile(file=_temp_bw)

        for line in _temp_peak:
            # NOTE(review): bx-python's get() appears to return None when it
            # has nothing for the region (e.g. unknown chromosome) - treat that
            # like an empty result instead of crashing in tuple().
            vals = tuple(_bw.get(line[0],int(line[1]),int(line[2])) or ())

            # Locate the interval with the highest strictly positive value.
            summit = None
            maxs = 0
            for _key in vals:
                if float(_key[2])>maxs:
                    maxs = float(_key[2])
                    summit = _key[:2]
            if summit is None:
                continue

            summit_p = int((float(summit[0])+float(summit[1]))/2)
            if summit_p-1000>0:
                _temp_enrich.write("{0}\t{1}\t{2}\n".format(line[0],summit_p-1000,summit_p+999))
            else:
                # Window would start before the chromosome start: use a fixed one.
                _temp_enrich.write("{0}\t{1}\t{2}\n".format(line[0],1,2000))

    sh('sort -k 1,1 -k 2g,2g /Data/adam/dnase/enrich_bed/{0}.{1}.{2}.bed| bedtools merge -i stdin\
     >/Data/adam/dnase/enrich_merge_bed/{0}.{1}.{2}.bed'.format(_soft,_mark,_tissue))

    sh('bash ../get_enrich.sh /Data/adam/dnase/enrich_merge_bed/{0}.{1}.{2}.bed {1} {2} {3}'\
       .format(_soft,_mark,_tissue,_soft))
Exemplo n.º 2
0
def get_phastcons(
    bedtool,
    phastcons_location,
    species=None,
    index=None,
):
    """
    Get mean phastcons scores for a single interval or for every interval
    in a bed tool.

    bedtool            -- either one interval (anything with ``chrom``,
                          ``start`` and ``stop`` attributes) or an iterable,
                          sized collection of such intervals
    phastcons_location -- path of the phastCons bigWig file
    species, index     -- only checked for presence; a message is printed
                          when both are None (kept for backward compatibility)

    Returns the mean score (0 when the bigWig has no data) for a single
    interval, otherwise a float numpy array with one mean per interval.

    The array is now allocated once before the loop; it used to be
    re-allocated on every iteration, so only the last entry was ever set.
    """

    if species is None and index is None:
        print("Error, must select species or index")

    # bigWig is binary; the handle is closed when the block exits.
    with open(phastcons_location, 'rb') as f:
        bw = BigWigFile(file=f)

        def mean_score(interval):
            # NOTE(review): get() appears to return None for regions it
            # cannot resolve - treated here as "no data".
            vals = bw.get(interval.chrom, interval.start, interval.stop)
            consvals = [v[-1] for v in vals or ()]
            if len(consvals) > 0:
                return np.mean(consvals)
            return 0

        #if its a line (single interval)
        if hasattr(bedtool, 'chrom'):
            return mean_score(bedtool)

        #if bedtool: one mean per line
        data = np.zeros(len(bedtool))
        for i, bedline in enumerate(bedtool):
            data[i] = mean_score(bedline)

    #returns inconistant data types (scalar vs array), kept for callers
    return data
Exemplo n.º 3
0
def refine_with_summit(_soft,_mark,_tissue,_reps):
    """Re-centre peaks on their signal summit and derive precision values.

    Steps:
      1. For each peak in ``sort_bed`` find the bigWig interval with the
         highest positive value and write a 2 kb window around its midpoint to
         ``enrich_all_bed``.
      2. Sort/merge the windows with bedtools, append the rank via awk, and
         run ``roc_pr.sh``.
      3. Read ``roc_pr_value`` and write ``roc_pr_final`` with the running
         positive/negative pair filled in plus a precision column.

    Fixes relative to the previous version:
      * the bigWig path had three placeholders but two arguments (IndexError);
        ``_reps`` is now used as the third field - presumably a replicate
        label such as "rep0", TODO confirm against the callers;
      * text columns were compared with the integer 0 (never equal), so every
        line was reported as an error;
      * peaks with no positive signal are skipped instead of reusing the
        previous peak's summit.

    Args:
        _soft, _mark, _tissue: labels used to build the file names.
        _reps: third field of the bigWig file name.
    """
    def is_zero(field):
        # Columns arrive as text from split('\t'); compare numerically.
        try:
            return float(field) == 0
        except ValueError:
            return False

    with open("/Data/adam/dnase/sort_bed/{0}.{1}.{2}.bed"\
              .format(_soft,_mark,_tissue)) as peak_file:
        _temp_peak = [i.rstrip().split('\t') for i in peak_file]

    bw_path = "/Data/adam/dnase/bigwig/{0}.{1}.{2}.bw".format(_mark,_tissue,_reps)
    enrich_path = "/Data/adam/dnase/enrich_all_bed/{0}.{1}.{2}.bed".format(_soft,_mark,_tissue)
    with open(bw_path, 'rb') as _temp_bw, open(enrich_path, 'w') as _temp_enrich:
        _bw = BigWigFile(file=_temp_bw)

        for line in _temp_peak:
            # NOTE(review): get() appears to return None when it has no data.
            vals = tuple(_bw.get(line[0],int(line[1]),int(line[2])) or ())
            summit = None
            maxs = 0
            for _key in vals:
                if float(_key[2])>maxs:
                    maxs = float(_key[2])
                    summit = _key[:2]
            if summit is None:
                continue
            summit_p=int((float(summit[0])+float(summit[1]))/2)
            if summit_p-1000>0:
                _temp_enrich.write("{0}\t{1}\t{2}\n".format(line[0],summit_p-1000,summit_p+999))
            else:
                _temp_enrich.write("{0}\t{1}\t{2}\n".format(line[0],1,2000))

    awk_args='{printf "%s\\t%s\\n", $0,NR}'
    sh("sort -k 1,1 -k 2g,2g /Data/adam/dnase/enrich_all_bed/{0}.{1}.{2}.bed| bedtools merge -i stdin\
     | awk '{3}'>/Data/adam/dnase/enrich_all_merge_bed/{0}.{1}.{2}.bed".format(_soft,_mark,_tissue,awk_args))
    enhancer_dir ="/Data/adam/dnase/enhancer/tissue_enhancer/"
    sh("bash /Data/adam/dnase/enhancer/roc_pr.sh {0}{1}_enhancer.txt {0}negative_enhancer.txt \
    {2} {3} {1}".format(enhancer_dir, _tissue,_soft,_mark))

    with open("/Data/adam/dnase/roc_pr_value/{0}.{1}.{2}.bed"\
              .format(_soft,_mark,_tissue)) as pr_file:
        raw_pr = [i.rstrip().split('\t') for i in pr_file]

    pr_refine = []
    # Kept as strings so the rows can be joined with '\t' below.
    temp_positive = ['0','0']
    temp_negative = ['0','0']
    for _line in raw_pr:
        if is_zero(_line[3]) and is_zero(_line[4]):
            # Positive-only row: remember it, fill in the last negatives.
            temp_positive = _line[1:3]
            out_line = _line[:]
            out_line[3:5] = temp_negative
        elif is_zero(_line[1]) and is_zero(_line[2]):
            # Negative-only row: remember it, fill in the last positives.
            temp_negative = _line[3:5]
            out_line = _line[:]
            out_line[1:3] = temp_positive
        else:
            print("error in {0}.{1}.{2}, {3}".format(_soft,_mark,_tissue,_line))
            continue
        called = float(out_line[1])+float(out_line[3])
        # No calls at all yet: report 0.0 rather than dividing by zero.
        precision = float(out_line[1])/called if called else 0.0
        out_line.append(str(precision))
        pr_refine.append(out_line)

    with open("/Data/adam/dnase/roc_pr_final/{0}.{1}.{2}.bed".format(_soft,_mark,_tissue),'w') as f:
        for out_line in pr_refine:
            f.write('\t'.join(out_line) + '\n')
Exemplo n.º 4
0
def evaluateTC(args):
  """Count the signal in a fixed window around the middle of a region.

  Takes ONE argument, a 4-tuple ``(signalFileName, chrom, start, end)``,
  exactly as before; the tuple is now unpacked in the body because
  tuple parameters in the signature were removed in Python 3 (PEP 3113).

  The window is ``halfWindow`` bases on each side of the midpoint
  (``halfWindow`` and ``correctBW`` are defined elsewhere in the module).

  Returns the summed signal as an int, or 0 if the query/summation fails.
  """
  signalFileName, chrom, start, end = args
  # Floor division keeps the midpoint an int on Python 2 and 3 alike.
  mid = (int(start)+int(end))//2
  p1 = max(mid - halfWindow,0)
  p2 = mid + halfWindow
  # bigWig is binary; "with" closes the handle even if BigWigFile() raises.
  with open(signalFileName,"rb") as signalFile:
    bw = BigWigFile(signalFile)
    # Deliberate best-effort: any failure counts as "no signal".
    try: nCount = int(sum(correctBW(bw.get(chrom,p1,p2),p1,p2)))
    except Exception: nCount = 0
  return nCount
Exemplo n.º 5
0
def get_phastcons(bedtool, phastcons_location, species=None, index=None, ):
    
    """
    
    Get mean phastcons scores for a single interval or for every interval
    in a bed tool.

    bedtool            -- either one interval (anything with ``chrom``,
                          ``start`` and ``stop`` attributes) or an iterable,
                          sized collection of such intervals
    phastcons_location -- path of the phastCons bigWig file
    species, index     -- only checked for presence; a message is printed
                          when both are None (kept for backward compatibility)

    Returns the mean score (0 when the bigWig has no data) for a single
    interval, otherwise a float numpy array with one mean per interval.

    The array is now allocated once before the loop; it used to be
    re-allocated on every iteration, so only the last entry was ever set.
    
    """
    
    if species is None and index is None:
        print("Error, must select species or index")
    
    # bigWig is binary; the handle is closed when the block exits.
    with open(phastcons_location, 'rb') as f:
        bw = BigWigFile(file=f)

        def mean_score(interval):
            # NOTE(review): get() appears to return None for regions it
            # cannot resolve - treated here as "no data".
            vals = bw.get(interval.chrom, interval.start, interval.stop)
            consvals = [v[-1] for v in vals or ()]
            if len(consvals) > 0:
                return np.mean(consvals)
            return 0

        #if its a line (single interval)
        if hasattr(bedtool, 'chrom'):
            return mean_score(bedtool)

        #if bedtool: one mean per line
        data = np.zeros(len(bedtool))
        for i, bedline in enumerate(bedtool):
            data[i] = mean_score(bedline)
            
    #returns inconistant data types (scalar vs array), kept for callers
    return data
Exemplo n.º 6
0
 def get_GA_from_bw(self, plus, minus, GTF, filterfxn):
     """Build a stranded HTSeq GenomicArray from two bigWig files.

     plus, minus -- paths of the plus-strand and minus-strand bigWig files
     GTF         -- iterable of features exposing ``.iv`` (chrom/start/end);
                    it is iterated once per strand, so it must be
                    re-iterable (a one-shot generator would leave the minus
                    strand empty)
     filterfxn   -- features for which it returns False are skipped

     For every interval returned by the bigWig query, the value is stored at
     the interval's START position only, as in the original code.
     NOTE(review): if the bigWig holds multi-base intervals this leaves the
     remaining bases unset - confirm whether that is intended.

     Returns the populated GenomicArray.
     """
     ##bx-python 'get' method is 0 based, fully closed
     ga = HTSeq.GenomicArray( "auto", typecode='d' , stranded = True)
     # Same procedure for both strands; only the file and strand symbol differ.
     for path, strand in ((plus, '+'), (minus, '-')):
         with open(path, 'rb') as f:
             bw_file = BigWigFile(file=f)
             for GF in GTF:
                 if filterfxn( GF ) == False: continue
                 window = GF.iv
                 chrom = window.chrom
                 vals = bw_file.get(chrom, window.start, window.end)
                 # NOTE(review): get() appears to return None when it has
                 # nothing for the region; skip instead of raising TypeError.
                 if vals is None: continue
                 for start, stop, value in vals:
                     ga[ HTSeq.GenomicPosition(chrom, start, strand) ] = value
     return ga
Exemplo n.º 7
0
def main():
    """Append a bigWig score column to a tab-separated input file.

    Command line (all positional):
        input_filename output_filename loc_filename loc_key chrom_col start_col

    ``chrom_col`` and ``start_col`` are 1-based column numbers.  Each input
    line that has both columns gets one extra column: the score at the
    0-based ``start`` position formatted with three decimals, or ``NA`` when
    the bigWig has no value there.  Shorter lines are copied unchanged.

    Changes: a ``None`` result from the bigWig query (previously a
    ``len(None)`` crash) is reported as ``NA``, and all three files are now
    closed even when processing stops early.
    """
    input_filename, output_filename, loc_filename, loc_key, chrom_col, start_col = sys.argv[
        1:]

    # open input, output, and bigwig files
    location_file = LocationFile(loc_filename)
    bigwig_filename = location_file.get_values(loc_key)
    bwfh = open_or_die(bigwig_filename,
                       message='Error opening BigWig file %s' %
                       bigwig_filename)
    bw = BigWigFile(file=bwfh)
    ifh = open_or_die(input_filename,
                      message='Error opening input file %s' % input_filename)
    ofh = open_or_die(output_filename,
                      mode='w',
                      message='Error opening output file %s' % output_filename)

    # make column numbers 0-based
    chrom_col = int(chrom_col) - 1
    start_col = int(start_col) - 1
    min_cols = max(chrom_col, start_col)

    try:
        # add score column to input file
        for line_number, line in enumerate(ifh, 1):
            line = line.rstrip('\r\n')
            elems = line.split('\t')
            if len(elems) > min_cols:
                chrom = elems[chrom_col].strip()
                # base-0 position in chrom
                start = int(elems[start_col])
                # NOTE(review): get() appears to return None (not []) when it
                # has no data for the chromosome; treat both as "no score".
                score_list = bw.get(chrom, start, start + 1) or []
                score_list_len = len(score_list)
                if score_list_len == 1:
                    beg, end, score = score_list[0]
                    score_val = '%1.3f' % score
                elif score_list_len == 0:
                    score_val = 'NA'
                else:
                    die('%s line %d: chrom=%s, start=%d, score_list_len = %d' %
                        (input_filename, line_number, chrom, start,
                         score_list_len))
                ofh.write('\t'.join([line, score_val]) + '\n')
            else:
                ofh.write(line + '\n')
    finally:
        bwfh.close()
        ifh.close()
        ofh.close()
Exemplo n.º 8
0
def get_phastcons(bedtool, species=None, index=None):
    """
    Get mean phastcons scores for a single interval or for every interval
    in a bed tool.

    bedtool -- either one interval (anything with ``chrom``, ``start`` and
               ``stop`` attributes) or an iterable, sized collection of them
    species -- "mm9" or "hg19"; selects a bigWig below ``basedir`` when
               ``index`` is not given
    index   -- explicit path of the phastCons bigWig file

    Returns the mean score (0 when the bigWig has no data) for a single
    interval, otherwise a float numpy array with one mean per interval.

    The array is now allocated once before the loop; it used to be
    re-allocated on every iteration, so only the last entry was ever set.
    NOTE(review): when neither ``index`` nor a known ``species`` is given,
    only a message is printed and ``open(None)`` then fails, as before.
    """
    if species is None and index is None:
        print("Error, must select species or index")
    if species is not None and index is None:
        if species == "mm9":
            index= basedir + "/yeolab/Conservation/phastCons/mm9_30way/placental/mm9_phastcons.bw"
        elif species == "hg19":
            index = basedir + "/yeolab/Conservation/phastCons/hg19_46way/placentalMammals/reformat/hg19_phastcons.bw"

    # bigWig is binary; the handle is closed when the block exits.
    with open(index, 'rb') as f:
        bw = BigWigFile(file=f)

        def mean_score(interval):
            # NOTE(review): get() appears to return None for regions it
            # cannot resolve - treated here as "no data".
            vals = bw.get(interval.chrom, interval.start, interval.stop)
            consvals = [v[-1] for v in vals or ()]
            if len(consvals) > 0:
                return np.mean(consvals)
            return 0

        if hasattr(bedtool, 'chrom'): #is a single interval
            return mean_score(bedtool)

        data = np.zeros(len(bedtool))
        for i, bedline in enumerate(bedtool):
            data[i] = mean_score(bedline)
    return data
Exemplo n.º 9
0
def main():
    """Append a bigWig score column to a tab-separated input file.

    Command line (all positional):
        input_filename output_filename loc_filename loc_key chrom_col start_col

    ``chrom_col`` and ``start_col`` are 1-based column numbers.  Each input
    line that has both columns gets one extra column: the score at the
    0-based ``start`` position formatted with three decimals, or ``NA`` when
    the bigWig has no value there.  Shorter lines are copied unchanged.

    Changes: a ``None`` result from the bigWig query (previously a
    ``len(None)`` crash) is reported as ``NA``, and all three files are now
    closed even when processing stops early.
    """
    input_filename, output_filename, loc_filename, loc_key, chrom_col, start_col = sys.argv[1:]

    # open input, output, and bigwig files
    location_file = LocationFile( loc_filename )
    bigwig_filename = location_file.get_values( loc_key )
    bwfh = open_or_die( bigwig_filename, message='Error opening BigWig file %s' % bigwig_filename )
    bw = BigWigFile( file=bwfh )
    ifh = open_or_die( input_filename, message='Error opening input file %s' % input_filename )
    ofh = open_or_die( output_filename, mode='w', message='Error opening output file %s' % output_filename )

    # make column numbers 0-based
    chrom_col = int( chrom_col ) - 1
    start_col = int( start_col ) - 1
    min_cols = max( chrom_col, start_col )

    try:
        # add score column to input file
        for line_number, line in enumerate( ifh, 1 ):
            line = line.rstrip( '\r\n' )
            elems = line.split( '\t' )
            if len( elems ) > min_cols:
                chrom = elems[chrom_col].strip()
                # base-0 position in chrom
                start = int( elems[start_col] )
                # NOTE(review): get() appears to return None (not []) when it
                # has no data for the chromosome; treat both as "no score".
                score_list = bw.get( chrom, start, start + 1 ) or []
                score_list_len = len( score_list )
                if score_list_len == 1:
                    beg, end, score = score_list[0]
                    score_val = '%1.3f' % score
                elif score_list_len == 0:
                    score_val = 'NA'
                else:
                    die( '%s line %d: chrom=%s, start=%d, score_list_len = %d' % ( input_filename, line_number, chrom, start, score_list_len ) )
                print('\t'.join( [line, score_val] ), file=ofh)
            else:
                print(line, file=ofh)
    finally:
        bwfh.close()
        ifh.close()
        ofh.close()
Exemplo n.º 10
0
def getNumberOfFragmentsPerRegionFromBigWig(bw, chromSizes):
    """
    Get the number of all mapped fragments per region in all chromosomes
    from a bigWig. Utilizing bx-python.

    bw         -- path of the bigWig file
    chromSizes -- iterable of (chromosome name, chromosome size) pairs

    Returns the sum of the third field (the value) of every interval that
    the bigWig reports between 0 and the chromosome size.  Chromosomes for
    which the query yields nothing contribute 0, and the file is closed
    before returning.

    Test dataset with two samples covering 200 bp.
    >>> test = Tester()

    Get number of fragments in sample.
    >>> getNumberOfFragmentsPerRegionFromBigWig(test.bwFile1, [('3R', 200)])
    3.0
    >>> getNumberOfFragmentsPerRegionFromBigWig(test.bwFile2, [('3R', 200)])
    4.0
    """
    mapped = 0
    with open(bw, "rb") as fh:
        bwh = BigWigFile(fh)
        for cname, csize in chromSizes:
            regions = bwh.get(cname, 0, csize) # region = bwh.get(chrom_name, start, end)
            # NOTE(review): get() appears to return None when it has nothing
            # for the chromosome; iterating None used to raise TypeError.
            if not regions:
                continue
            for region in regions:
                mapped += region[2]
    return mapped
Exemplo n.º 11
0
def getNumberOfFragmentsPerRegionFromBigWig(bw, chromSizes):
    """
    Get the number of all mapped fragments per region in all chromosomes
    from a bigWig. Utilizing bx-python.

    bw         -- path of the bigWig file
    chromSizes -- iterable of (chromosome name, chromosome size) pairs

    Returns the sum of the third field (the value) of every interval that
    the bigWig reports between 0 and the chromosome size.  Chromosomes for
    which the query yields nothing contribute 0, and the file is closed
    before returning.

    Test dataset with two samples covering 200 bp.
    >>> test = Tester()

    Get number of fragments in sample.
    >>> getNumberOfFragmentsPerRegionFromBigWig(test.bwFile1, [('3R', 200)])
    3.0
    >>> getNumberOfFragmentsPerRegionFromBigWig(test.bwFile2, [('3R', 200)])
    4.0
    """
    mapped = 0
    with open(bw, "rb") as fh:
        bwh = BigWigFile(fh)
        for cname, csize in chromSizes:
            regions = bwh.get(cname, 0, csize) # region = bwh.get(chrom_name, start, end)
            # NOTE(review): get() appears to return None when it has nothing
            # for the chromosome; iterating None used to raise TypeError.
            if not regions:
                continue
            for region in regions:
                mapped += region[2]
    return mapped
Exemplo n.º 12
0
def _get_resized(peakfile, bigwigs, width, outprefix):
    """Write fixed-width windows centred on each peak's signal summit.

    peakfile  -- tab-separated peak file, sorted by rank (chrom, start, end, ...)
    bigwigs   -- path of the bigWig signal file
    width     -- half-width of the output window
    outprefix -- output is written to ``<outprefix>.bed``

    Each output line is ``chrom, summit-width, summit+width-1, rank`` where
    rank is the 1-based position of the peak in the checked peak list.
    Peaks with no interval scoring above zero are skipped; previously they
    raised NameError (first peak) or reused the previous peak's summit.
    NOTE(review): the window start is not clamped and can become negative
    for summits closer than ``width`` to the chromosome start.
    """
    # Input is peak file name which is sort by rank
    with open(peakfile) as peak_handle:
        peaklis = [i.rstrip().split('\t') for i in peak_handle]
    peaklist = _check_file(peaklis)

    with open(bigwigs, 'rb') as _temp_bw, open("{0}.bed".format(outprefix), "w") as f:
        _bw = BigWigFile(file=_temp_bw)
        for _ids, line in enumerate(peaklist):
            # NOTE(review): get() appears to return None when it has no data.
            vals = tuple(_bw.get(line[0], int(line[1]), int(line[2])) or ())
            summit = None
            maxs = 0
            for _key in vals:
                if float(_key[2]) > maxs:
                    maxs = float(_key[2])
                    summit = _key[:2]
            if summit is None:
                continue
            summit_p = int((float(summit[0]) + float(summit[1])) / 2)
            f.write("{0}\t{1}\t{2}\t{3}\n"
                    .format(line[0], summit_p-width, summit_p+width-1, _ids+1))
Exemplo n.º 13
0
    # Fetching bed coordinates
    # (`b` indexes the current bed file; it is set by code outside this excerpt)
    bedDict = aux.createBedDictFromSingleFile(bedFileNameList[b], separator="\t")

    # Iterating on chromosomes
    for chrName in constants.getChromList(reference=[bedDict]):

        # Iterating on coordinates
        for coord in bedDict[chrName]:

            # Positions: a window of `windowSize` around the region midpoint,
            # clamped at 0 on the left.
            # NOTE(review): `/` is integer division only on Python 2 (and only
            # if the coordinates are ints) - confirm before porting.
            mid = (coord[0] + coord[1])/2
            p1 = max(mid-int(math.floor(windowSize/2.0)),0)
            p2 = mid+int(math.ceil(windowSize/2.0))

            # Fetching sequence
            # presumably aux.correctBW expands the query into one value per
            # base between p1 and p2 - verify in aux
            sequence = aux.correctBW(bw.get(chrName,p1,p2),p1,p2)
            if(invNeg and len(coord) >= 5 and coord[4] == "-"): sequence = sequence[::-1] # Inverting negative strand
            sequence = [e*normFactor for e in sequence] # Normalizing the values
            if(useLog): sequence = [math.log(e+1.0,2) for e in sequence] # Log the values

            # Updating counters: per-position sum and sum of squares for this
            # bed file, plus the mean of the window for the boxplot.
            counter = 0; currMean = 0.0
            for v in sequence:
                currMean += v
                sumVec[b][counter] += v
                sqSumVec[b][counter] += v**2
                counter += 1
            totVec[b] += 1.0
            # NOTE(review): raises ZeroDivisionError if `sequence` is empty.
            boxplotVec[b].append(currMean/len(sequence))

# Evaluating mean and std
Exemplo n.º 14
0
#############################################################################################

# Initializing prior matrix with MPBS scores (column 5 of each MPBS entry)
priorMatrix = []
for coord in mpbsList:
    priorMatrix.append([float(coord[4])])

# Creating conservation prior: mean bigWig value over each MPBS.
# The file is opened in binary mode and closed even if a query raises.
with open(consFileName, "rb") as consFile:
    bw = BigWigFile(consFile)
    for i, coord in enumerate(mpbsList):
        chrName = coord[0]
        pos1 = int(coord[1])
        pos2 = int(coord[2])
        bwQuery = aux.correctBW(bw.get(chrName, pos1, pos2), pos1, pos2)
        # NOTE(review): raises ZeroDivisionError if bwQuery is empty.
        priorMatrix[i].append(sum(bwQuery) / float(len(bwQuery)))

# Creating TSS distance prior: bigWig value at the MPBS midpoint.
with open(tssFileName, "rb") as tssFile:
    bw = BigWigFile(tssFile)
    for i, coord in enumerate(mpbsList):
        chrName = coord[0]
        pos1 = int(coord[1])
        pos2 = int(coord[2])
        # Floor division keeps the midpoint an int on Python 2 and 3 alike.
        mid = (pos1 + pos2) // 2
        bwQuery = aux.correctBW(bw.get(chrName, mid, mid + 1), mid, mid + 1)
        priorMatrix[i].append(bwQuery[0])
Exemplo n.º 15
0
outputFileName = sys.argv[3]

# Opening signal file (binary); "with" closes every handle even when a
# malformed line raises part-way through the loop.
with open(signalFileName, "rb") as signalFile:
    bw = BigWigFile(signalFile)

    # Iterating on the mpbsfile to update the score
    with open(mpbsFileName, "r") as mpbsFile, \
            open(outputFileName, "w") as outputFile:
        for line in mpbsFile:
            ll = line.strip().split()
            chrName = ll[0]
            p1 = int(ll[1])
            p2 = int(ll[2])
            mLen = p2 - p1
            # Extend the site by its own length on both sides.
            p1_ext = p1 - mLen
            p2_ext = p2 + mLen
            # Sites whose left flank would start before 0 are dropped.
            if (p1_ext < 0): continue
            signalVec = aux.correctBW(bw.get(chrName, p1_ext, p2_ext), p1_ext, p2_ext)
            # Signal in the left flank, centre and right flank (+1 pseudocount).
            nl = sum(signalVec[:mLen]) + 1.0
            nc = sum(signalVec[mLen:2 * mLen]) + 1.0
            nr = sum(signalVec[2 * mLen:]) + 1.0
            # Score written to column 5; -999 marks a failed computation.
            try:
                ll[4] = str(round(-((nc / nr) + (nc / nl)), 4))
            except Exception:
                ll[4] = "-999"
            outputFile.write("\t".join(ll) + "\n")
Exemplo n.º 16
0
# Storing coordinates into dictionary (already in order of score)
coordList = aux.createBedListFromSingleFile(coordFileName)

# Iterating on coordDict to crop desired regions
for coord in coordList:

    # Obtaining real coordinates (rc): `windowLen` on each side of the
    # midpoint (bed input) or of start + offset stored in column 10.
    # NOTE(review): `/` is integer division only on Python 2 with int
    # coordinates - confirm the coordinate types before porting.
    if(isBed): mPoint =  (coord[1] + coord[2]) / 2
    else: mPoint = coord[1] + coord[9]
    rc = [mPoint - windowLen , mPoint + windowLen]

    # Obtaining bw object: first file matching "<chrom>_*.bw".
    # "with" closes the file on every path; the previous explicit close()
    # was skipped by the `continue` below, leaking one handle per empty query.
    with open(glob.glob(wigLocation+coord[0]+"_*.bw")[0],"rb") as wigFile:
        bw = BigWigFile(wigFile)

        # Fetching and writing sequence
        bwQuery = bw.get(coord[0],rc[0],rc[1])
        if(bwQuery == None or bwQuery == []): continue
        outputFile.write("fixedStep chrom="+coord[0]+" start="+str(rc[0]+1)+" step=1\n")
        for (c1,c2,value) in bwQuery: outputFile.write(str(round(value,6))+"\n")

# End for chrName in chrNameList

# Termination
outputFile.close()


Exemplo n.º 17
0
# Evaluating overlap
# For each MPBS interval, advance a cursor `c` through the TFBS list of the
# same chromosome and label the interval in place:
#   ":Y" (colour "0,150,0")  when checkTuplesOverlap returns 0,
#   ":." (colour "0,0,0")    when it returns -1 and treatment signal > control,
#   ":N" (colour "150,0,0")  when it returns -1 otherwise.
# presumably 0 means "overlaps" and -1 means "interval lies before the TFBS";
# verify against aux.checkTuplesOverlap.
for chrom in constants.getChromList(reference=[mpbsDict]):
    # The cursor is reset per chromosome but shared by all of its intervals.
    c = 0
    for interval in mpbsDict[chrom]:
        didBreak = False
        while (c < len(tfbsDict[chrom])):
            check = aux.checkTuplesOverlap(interval, tfbsDict[chrom][c])
            if (check == 0):
                interval[2] = mpbsName + ":Y"
                interval += [interval[0], interval[1], "0,150,0"]
                didBreak = True
                break
            elif (check == -1):
                # Both queries use the same window: the interval shrunk by
                # peakExt/2 and grown by negExt/2 on each side, clamped at 0.
                # NOTE(review): `/` is integer division only on Python 2.
                treatSignalQ = bwTreat.get(
                    chrom, max(interval[0] + (peakExt / 2) - (negExt / 2), 0),
                    interval[1] - (peakExt / 2) + (negExt / 2))
                controlSignalQ = bwControl.get(
                    chrom, max(interval[0] + (peakExt / 2) - (negExt / 2), 0),
                    interval[1] - (peakExt / 2) + (negExt / 2))
                # Total signal = sum of the third field of every interval.
                treatSum = sum([e[2] for e in treatSignalQ])
                controlSum = sum([e[2] for e in controlSignalQ])
                if (treatSum > controlSum):
                    interval[2] = mpbsName + ":."
                    interval += [interval[0], interval[1], "0,0,0"]
                else:
                    interval[2] = mpbsName + ":N"
                    interval += [interval[0], interval[1], "150,0,0"]
                didBreak = True
                break
            c += 1
Exemplo n.º 18
0
class BigWig(object):
    """Thin wrapper around bx-python's BigWigFile that also exposes the
    chromosome sizes stored in the file.

    Attributes set by determine_sizes():
        sizes -- dict mapping chromosome name (str) to its size
        version, zoom_levels, chromosome_tree_offset, full_data_offset,
        full_index_offset, field_count, defined_field_count, auto_SQL_offset,
        total_summary_offset, uncompress_buf_size -- fields of the file header
        block_size, key_size, val_size, item_count -- chromosome B+ tree header
        is_leaf, count -- header of the root node of that tree
    """

    def __init__(self, filename):
        self.filename = filename
        self.determine_sizes()
        # bigWig is a binary format; the handle stays open for later queries.
        self.bwf = BigWigFile(open(filename, "rb"))

    def determine_sizes(self):
        """Parse the file header and chromosome tree into ``self.sizes``.

        Raises IOError when the file does not start with the bigWig magic
        number and ValueError when the chromosome tree magic is wrong.

        Magic numbers are compared as bytes literals so the check also works
        on Python 3 (where reading in "rb" mode yields bytes), the file is
        closed when parsing ends, and non-leaf tree nodes are followed so
        files whose chromosome tree has more than one level are read fully.
        """
        self.sizes = {}
        with open(self.filename, "rb") as fh:
            # read magic number to guess endianness
            magic = fh.read(4)
            if magic == b'&\xfc\x8f\x88':
                endianness = '<'
            elif magic == b'\x88\x8f\xfc&':
                endianness = '>'
            else:
                raise IOError("The file is not in bigwig format")

            # read the header
            info = struct.unpack(endianness + 'HHQQQHHQQIQ', fh.read(60))
            self.version = info[0]
            self.zoom_levels = info[1]
            self.chromosome_tree_offset = info[2]
            self.full_data_offset = info[3]
            self.full_index_offset = info[4]
            self.field_count = info[5]
            self.defined_field_count = info[6]
            self.auto_SQL_offset = info[7]
            self.total_summary_offset = info[8]
            self.uncompress_buf_size = info[9]

            # go to the data
            fh.seek(self.chromosome_tree_offset)
            # read magic again
            magic = fh.read(4)
            if magic == b'\x91\x8c\xcax':
                endianness = '<'
            elif magic == b'x\xca\x8c\x91':
                endianness = '>'
            else:
                raise ValueError("Wrong magic for this bigwig data file")

            info2 = struct.unpack(endianness + 'IIIQQ', fh.read(28))
            self.block_size = info2[0]
            self.key_size = info2[1]
            self.val_size = info2[2]
            self.item_count = info2[3]

            # Leaf items: key, chromosome id, chromosome size.
            leaf_format = endianness + str(self.key_size) + 'sII'
            # Non-leaf items: key, file offset of the child node.
            inner_format = endianness + str(self.key_size) + 'sQ'
            leaf_item_size = struct.calcsize(leaf_format)
            inner_item_size = struct.calcsize(inner_format)

            # Walk the tree iteratively, starting at the root node that
            # directly follows the tree header.
            pending = [fh.tell()]
            at_root = True
            while pending:
                fh.seek(pending.pop())
                is_leaf, _reserved, count = struct.unpack(endianness + 'BBH', fh.read(4))
                if at_root:
                    self.is_leaf = is_leaf
                    self.count = count
                    at_root = False

                if is_leaf:
                    for n in range(count):
                        key, chrom_id, chrom_size = struct.unpack(
                            leaf_format, fh.read(leaf_item_size))
                        # Keys are NUL-padded to key_size bytes.
                        key = key.replace(b'\x00', b'')
                        if not isinstance(key, str):
                            # Python 3: expose native str keys, as on Python 2.
                            key = key.decode('latin-1')
                        self.sizes[key] = chrom_size
                else:
                    for n in range(count):
                        key, child_offset = struct.unpack(
                            inner_format, fh.read(inner_item_size))
                        pending.append(child_offset)

    def get_as_array(self, chrom, start, end):
        """Delegate to BigWigFile.get_as_array for chrom:start-end."""
        return self.bwf.get_as_array(chrom, start, end)

    def get(self, chrom, start, end):
        """Delegate to BigWigFile.get for chrom:start-end."""
        return self.bwf.get(chrom, start, end)

    def query(self, chrom, start, end, number):
        """Delegate to BigWigFile.query for chrom:start-end with `number`."""
        return self.bwf.query(chrom, start, end, number)
Exemplo n.º 19
0
# For every region of the bed file, fetch both signals over the same window
# and compute the mean difference (signal 2 minus signal 1).
outputFile = open(outputFileName,"w")
counter = 1
for line in bedFile:

    # Positions
    ll = line.strip().split("\t")
    # NOTE(review): `/` is integer division only on Python 2.
    mid = (int(ll[1]) + int(ll[2]))/2
    if(windowSize == 0):
        # windowSize 0 means "use the region exactly as given"
        p1 = int(ll[1])
        p2 = int(ll[2])
    else:
        # otherwise a window of windowSize around the midpoint, clamped at 0
        p1 = max(0,mid-int(math.floor(windowSize/2.0)))
        p2 = mid+int(math.ceil(windowSize/2.0))

    # Fetching sequence
    # presumably aux.correctBW expands each query into one value per base
    # between p1 and p2 - verify in aux
    sequence1 = aux.correctBW(bw1.get(ll[0],p1,p2),p1,p2)
    sequence2 = aux.correctBW(bw2.get(ll[0],p1,p2),p1,p2)

    # Normalize
    sequence1 = [e*normFactor1 for e in sequence1]
    sequence2 = [e*normFactor2 for e in sequence2]
 
    # Log
    if(useLog):
        sequence1 = [math.log(e+1.0,2) for e in sequence1]
        sequence2 = [math.log(e+1.0,2) for e in sequence2]
    
    # Calculate fold change
    # (a difference; it is a log2 ratio only when useLog is set)
    # NOTE(review): indexes sequence2 by the length of sequence1, so it raises
    # IndexError if sequence2 is shorter.
    foldSequence = [sequence2[i]-sequence1[i] for i in range(0,len(sequence1))]
    v = np.array(foldSequence).mean()
Exemplo n.º 20
0
            newCoord = [r,min(r+coordLen,spCoord[1])]
            randDict[chrName].append(newCoord)
else:
    randDict = spacingDict

# Collect, for the evidence regions and then the random regions, the mean
# per-position difference between the two normalized (optionally log2) signals.
boxplotVec = []
for bedDict in [evDict,randDict]:

    # One result list per bed dictionary
    boxplotVec.append([])
    for chrName in constants.getChromList(reference=[bedDict]):
        for coord in bedDict[chrName]:

            # Raw signal of both tracks over the same region
            sequence1 = aux.correctBW(bw1.get(chrName,coord[0],coord[1]),coord[0],coord[1])
            sequence2 = aux.correctBW(bw2.get(chrName,coord[0],coord[1]),coord[0],coord[1])

            # Normalization of each track by its own factor
            sequence1 = [e*normFactor1 for e in sequence1]
            sequence2 = [e*normFactor2 for e in sequence2]

            # Optional log2(x + 1) transform
            if(useLog):
                sequence1 = [math.log(e+1.0,2) for e in sequence1]
                sequence2 = [math.log(e+1.0,2) for e in sequence2]

            # Difference at every position of the first track, then its mean
            finalSeq = []
            for i in range(0,len(sequence1)):
                finalSeq.append(sequence2[i]-sequence1[i])
            boxplotVec[-1].append(np.array(finalSeq).mean())

# Both signal files are no longer needed
wigFile1.close()
wigFile2.close()

###############################################################################################################
### WRITING RESULTS
Exemplo n.º 21
0
            newCoord = [r, min(r + coordLen, spCoord[1])]
            randDict[chrName].append(newCoord)
else:
    randDict = spacingDict

# Collect, for the evidence regions and then the random regions, the mean
# normalized (optionally log2) signal of every region.
boxplotVec = []
for bedDict in [evDict, randDict]:

    # One result list per bed dictionary
    boxplotVec.append([])
    for chrName in constants.getChromList(reference=[bedDict]):
        for coord in bedDict[chrName]:

            # Raw signal over the region
            sequence = aux.correctBW(bw.get(chrName, coord[0], coord[1]),
                                     coord[0], coord[1])

            # Normalization
            sequence = [e * normFactor for e in sequence]

            # Optional log2(x + 1) transform
            if (useLog):
                sequence = [math.log(e + 1.0, 2) for e in sequence]

            boxplotVec[-1].append(np.array(sequence).mean())

# The signal file is no longer needed
wigFile.close()

###############################################################################################################
### WRITING RESULTS
###############################################################################################################

# Creating output file name
outputFileName = outputLocation + bedLabel + "_" + wigLabel
Exemplo n.º 22
0
import os

import numpy as np
fl = sys.argv[1]
dist = int(sys.argv[2])
from bx.bbi.bigwig_file import BigWigFile

genes = read.dat("/home/ssaberi/resources/list.genes.txt", '\t')
table = read.dat("/projects/epigenomics/MarcoJuliaPon/peaks.txt", '\t')
mygenes = read.dat("/projects/epigenomics/MarcoJuliaPon/mygenes.txt", '\t')

# Map each gene of interest to the second column of the first entry of
# `genes` whose first column contains it.
ens = []
for i in mygenes:
    for gn in genes:
        if i in gn[0]:
            ens.append(gn[1])
            break

genespos = read.read_gene_pos(
    '/home/ssaberi/resources/hg19v69_genes.TSS_2000.pc.A03480.H3K27me3.GE02.coverage'
)
genesbed = bedtools.makebed_genpos(ens, genespos, 100000)

# One matrix row per region; the bigWig is opened in binary mode and closed
# as soon as all regions have been read.
mat = []
with open(fl, 'rb') as f:
    bw = BigWigFile(file=f)
    for bed_i in genesbed:
        vals = bw.get(bed_i[0], bed_i[1], bed_i[2])
        mat.append(np.array(vals))
mat = np.array(mat)
plt.matshow(mat, aspect='auto', cmap='YlOrBr')

# Base name of the input without directory and extension.  The previous
# slicing kept the trailing dot ("name." -> "name..pdf") and raised
# ValueError for paths without a "/" or ".".
fl = os.path.splitext(os.path.basename(fl))[0]
# NOTE(review): if `plt` is matplotlib.pyplot this should probably be
# plt.savefig - left unchanged because the import is not visible here.
plt.save(fl + ".pdf")
Exemplo n.º 23
0
# Iterating on coordinate file
coordFile = open(coordFileName, "r")
outputFile = open(outputFileName, "w")
counter = 1
for line in coordFile:

    # Initialization
    ll = line.strip().split("\t")
    chrName = ll[0]
    p1 = int(ll[1])
    p2 = int(ll[2])

    # Create Neph signal input
    nephSignalFile = open(nephSignalFileName, "w")
    nephSignalFile.write("\n".join(
        [str(e) for e in aux.correctBW(bw.get(chrName, p1, p2), p1, p2)]))
    nephSignalFile.close()

    # Apply neph
    nephCommand = "detect-cache "
    nephCommand += "--flankmin 3 --flankmax 10 --centermin 6 --centermax 40 --maxthold 10 "
    nephCommand += nephSignalFileName + " > " + nephResultFileName
    os.system(nephCommand)
    os.system("rm " + nephSignalFileName)

    # Read/Write neph results
    nephResultFile = open(nephResultFileName, "r")
    for line in nephResultFile:
        ll = line.strip().split("\t")
        if (float(ll[4]) > fosThresh): continue
        outputFile.write("\t".join([
Exemplo n.º 24
0
# Signal files (bigWig is binary, so open in binary mode)
posSigFile = open(posSignalFileName,"rb")
posBw = BigWigFile(posSigFile)
negSigFile = open(negSignalFileName,"rb")
negBw = BigWigFile(negSigFile)

# Chrom sizes dictionary: chromosome name -> size.
# The sizes file used to be left open; "with" closes it after reading.
chromSizesFileName = constants.getChromSizesLocation()
chromSizesDict = dict()
with open(chromSizesFileName,"r") as chromSizesFile:
    for line in chromSizesFile:
        ll = line.strip().split("\t")
        chromSizesDict[ll[0]] = int(ll[1])
chrList = constants.getChromList(x=False, y=False)

# Summarizing: for each window of `windowLen` bases write two lines, the mean
# positive-strand signal and the mean negated negative-strand signal.
# The first three characters of the chromosome name are replaced by "hs".
try:
    outputFile = open(outputLocation+posSigName+"_"+negSigName+"_"+str(windowLen)+".bed","w")
    try:
        for chrName in chrList:
            for k in range(0,chromSizesDict[chrName],windowLen):
                p1 = k; p2 = min(k+windowLen,chromSizesDict[chrName])
                posMean = np.array(aux.correctBW(posBw.get(chrName,p1,p2),p1,p2)).mean()
                negMean = np.array([-e for e in aux.correctBW(negBw.get(chrName,p1,p2),p1,p2)]).mean()
                outputFile.write("\t".join(["hs"+chrName[3:],str(p1),str(p2),str(posMean)])+"\n"+"\t".join(["hs"+chrName[3:],str(p1),str(p2),str(negMean)])+"\n")
    finally:
        outputFile.close()
finally:
    # Termination: release the signal files even if a query failed
    posSigFile.close()
    negSigFile.close()


Exemplo n.º 25
0
# Reading input
mpbsFileName = sys.argv[1]
signalFileName = sys.argv[2]
outputFileName = sys.argv[3]

# Parameters
halfWindow = 100

# Opening signal file (binary); "with" closes every handle even when a
# malformed line raises part-way through the loop.
with open(signalFileName, "rb") as signalFile:
    bw = BigWigFile(signalFile)

    # Iterating on the mpbsfile to update the score
    with open(mpbsFileName, "r") as mpbsFile, \
            open(outputFileName, "w") as outputFile:
        for line in mpbsFile:
            ll = line.strip().split()
            # Floor division keeps the midpoint an int on Python 2 and 3 alike.
            mid = (int(ll[1]) + int(ll[2])) // 2
            p1 = max(mid - halfWindow, 0)
            p2 = mid + halfWindow
            # Deliberate best-effort: a failed query counts as no signal.
            try:
                nCount = int(sum(aux.correctBW(bw.get(ll[0], p1, p2), p1, p2)))
            except Exception:
                nCount = 0
            # The count replaces column 5 of the MPBS line.
            ll[4] = str(nCount)
            outputFile.write("\t".join(ll) + "\n")
Exemplo n.º 26
0
    if (re.match('^-BigWigFile', argument)):
        bw_file = argv.pop(0)
    elif (re.match('^-enhancers', argument)):
        enhancer_list = argv.pop(0)

# Open the BigWig signal and read every row of the tab-separated peak table.
f = open(bw_file)
bw = BigWigFile(file=f)

enhancer_f = open(enhancer_list, 'rb')
data = csv.reader(enhancer_f, delimiter='\t')
enhancer_tab = list(data)

# One [chrom, summit start, summit end] row per peak: the summit is the
# interval with the largest value (last field) inside the peak.
summits = []
for peak in enhancer_tab:
    vals = bw.get(peak[0], int(peak[1]), int(peak[2]))
    max_start, max_end, max_val = max(vals, key=lambda x: x[-1])
    summits.append([peak[0], max_start, max_end])

# Emit the summits as a bed file on standard output.
bed_lines = []
for row in summits:
    bed_lines.append("\t".join(str(col) for col in row))
print("\n".join(bed_lines))

enhancer_f.close()
f.close()
Exemplo n.º 27
0
import os

import numpy as np
fl=sys.argv[1]
dist=int(sys.argv[2])
from bx.bbi.bigwig_file import BigWigFile

genes=read.dat("/home/ssaberi/resources/list.genes.txt",'\t')
table=read.dat("/projects/epigenomics/MarcoJuliaPon/peaks.txt",'\t')
mygenes=read.dat("/projects/epigenomics/MarcoJuliaPon/mygenes.txt",'\t')

# Map each gene of interest to the second column of the first entry of
# `genes` whose first column contains it.
ens=[]
for i in mygenes:
    for gn in genes:
        if i in gn[0]:
            ens.append(gn[1])
            break

genespos=read.read_gene_pos('/home/ssaberi/resources/hg19v69_genes.TSS_2000.pc.A03480.H3K27me3.GE02.coverage')
genesbed=bedtools.makebed_genpos(ens,genespos,100000)

# One matrix row per region; the bigWig is opened in binary mode and closed
# as soon as all regions have been read.
mat=[]
with open(fl, 'rb') as f:
    bw = BigWigFile(file=f)
    for bed_i in genesbed:
        vals = bw.get( bed_i[0], bed_i[1], bed_i[2])
        mat.append(np.array(vals))
mat=np.array(mat)
plt.matshow(mat,aspect='auto',cmap='YlOrBr')

# Base name of the input without directory and extension.  The previous
# slicing kept the trailing dot ("name." -> "name..pdf") and raised
# ValueError for paths without a "/" or ".".
fl=os.path.splitext(os.path.basename(fl))[0]
# NOTE(review): if `plt` is matplotlib.pyplot this should probably be
# plt.savefig - left unchanged because the import is not visible here.
plt.save(fl+".pdf")
Exemplo n.º 28
0
    bedFile = aux.createBedDictFromSingleFile(outputLocation+"_".join(bedLabelList)+"temp.bed", separator="\t")
    os.system("rm "+outputLocation+"_".join(bedLabelList)+"temp.bed")

# Accumulate, per signal file, the per-position sum and sum of squares of the
# normalized signal over all regions, plus one window mean per region.
nbSig = len(wigFileNameList)
sumVec = [[0.0 for e in range(0,windowSize)] for k in range(0,nbSig)]
sqSumVec = [[0.0 for e in range(0,windowSize)] for k in range(0,nbSig)]
totVec = [0.0 for k in range(0,nbSig)]
boxplotVec = [[] for k in range(0,nbSig)]
for s in range(0,nbSig):
    bwFile = open(wigFileNameList[s],"r")
    bw = BigWigFile(bwFile)
    for chrName in constants.getChromList(reference=[bedFile]):
        for coord in bedFile[chrName]:

            # Window of windowSize bases around the region midpoint
            mid = (coord[0] + coord[1])/2
            bwQuery = aux.correctBW(
                bw.get(chrName,
                       mid-int(math.floor(windowSize/2.0)),
                       mid+int(math.ceil(windowSize/2.0))),
                mid-int(math.floor(windowSize/2.0)),
                mid+int(math.ceil(windowSize/2.0)))

            # Regions that yield fewer values than the window are ignored
            if(len(bwQuery) < windowSize): continue

            counter = 0
            currMean = 0.0
            for value in bwQuery:
                currMean += (value*normFactorList[s])
                sumVec[s][counter] += (value*normFactorList[s])
                sqSumVec[s][counter] += ((value*normFactorList[s])**2)
                counter += 1
            totVec[s] += 1.0
            boxplotVec[s].append(currMean/len(bwQuery))
    bwFile.close()

# Evaluating mean and std
meanVec = [[0.0 for e in range(0,windowSize)] for k in range(0,nbSig)]
stdVec = [[0.0 for e in range(0,windowSize)] for k in range(0,nbSig)]
for s in range(0,nbSig):