def structureFromBed(path, chrom=None, start=None, end=None, offset=0):
    """Initializes structure from intrachromosomal BED file.

    Every non-blank line is split on whitespace; fields 1 and 4 (0-based)
    are read as the two interacting positions. When both positions lie in
    [start, end] and map to different absolute indices, a Point at (0,0,0)
    is stored for each position in the slot of its bin relative to start.

    Args:
        path: path of the BED file to read.
        chrom: chromosome object; built with chromFromBed(path) when None.
        start: lowest position to keep; defaults to chrom.minPos.
        end: highest position to keep; defaults to chrom.maxPos.
        offset: forwarded unchanged to the Structure constructor.

    Returns:
        The Structure, after set_rel_indices() has been called on it.

    Raises:
        IndexError: a non-blank line has fewer than five fields.
        ValueError: field 1 or field 4 is not an integer.
    """
    if chrom is None:
        chrom = chromFromBed(path)
    if start is None:
        start = chrom.minPos
    if end is None:
        end = chrom.maxPos

    structure = Structure([], [], chrom, offset)

    # One slot per bin between start and end. Slots that never receive a
    # Point keep the initial value 0.
    structure.points = np.zeros(int((end - start) / chrom.res) + 1, dtype=object)
    tracker = Tracker("Identifying loci", structure.chrom.size)

    # add loci
    with open(path) as listFile:
        for line in listFile:
            fields = line.split()
            if fields:  # blank lines (e.g. a trailing newline) carry no contact
                pos1 = int(fields[1])
                pos2 = int(fields[4])
                if start <= pos1 <= end and start <= pos2 <= end:
                    abs_index1 = structure.chrom.getAbsoluteIndex(pos1)
                    abs_index2 = structure.chrom.getAbsoluteIndex(pos2)
                    if abs_index1 != abs_index2:  # non-self-interacting
                        structure.points[int((pos1 - start) / chrom.res)] = Point(
                            (0, 0, 0), structure.chrom, abs_index1, 0)
                        structure.points[int((pos2 - start) / chrom.res)] = Point(
                            (0, 0, 0), structure.chrom, abs_index2, 0)
            # advance once per line read, whether or not it was used
            tracker.increment()
    # no explicit close(): the with-block already closes the file

    structure.set_rel_indices()

    return structure
def structureFromBed(path, chrom=None, start=None, end=None, offset=0, tads=None):
    """Initializes structure from intrachromosomal BED file.

    Every non-blank line is split on whitespace; fields 1 and 4 (0-based)
    are read as the two interacting positions. When both positions lie in
    [start, end] and map to different point numbers, a Point at (0, 0, 0)
    is stored for each position in the slot of its bin relative to start.

    Args:
        path: path of the BED file to read.
        chrom: chromosome object; built with chromFromBed(path) when None.
        start: lowest position to keep; defaults to chrom.minPos.
        end: highest position to keep; defaults to chrom.maxPos.
        offset: forwarded unchanged to the Structure constructor.
        tads: kept for interface compatibility only. The same-TAD filter
            that used it is disabled, so it has no effect.

    Returns:
        The Structure, after indexPoints() has been called on it.

    Raises:
        IndexError: a non-blank line has fewer than five fields.
        ValueError: field 1 or field 4 is not an integer.
    """
    if chrom is None:
        chrom = chromFromBed(path)
    if start is None:
        start = chrom.minPos
    if end is None:
        end = chrom.maxPos

    structure = Structure([], [], chrom, offset)

    # Floor division keeps the array length and the slot indices integral.
    # With "/" they become floats under Python 3, which NumPy rejects both
    # as a shape and as an index. int() also covers a non-integer chrom.res.
    numSlots = int((end - start) // chrom.res) + 1
    # Slots that never receive a Point keep the initial value 0.
    structure.points = np.zeros(numSlots, dtype=object)
    tracker = Tracker("Identifying loci", structure.chrom.size)

    # add loci
    with open(path) as listFile:
        for line in listFile:
            fields = line.split()
            if fields:  # blank lines (e.g. a trailing newline) carry no contact
                pos1 = int(fields[1])
                pos2 = int(fields[4])
                if start <= pos1 <= end and start <= pos2 <= end:
                    pointNum1 = structure.chrom.getPointNum(pos1)
                    pointNum2 = structure.chrom.getPointNum(pos2)
                    if pointNum1 != pointNum2:  # non-self-interacting
                        structure.points[int((pos1 - start) // chrom.res)] = Point(
                            (0, 0, 0), pointNum1, structure.chrom, 0)
                        structure.points[int((pos2 - start) // chrom.res)] = Point(
                            (0, 0, 0), pointNum2, structure.chrom, 0)
            # advance once per line read, whether or not it was used
            tracker.increment()
    # no explicit close(): the with-block already closes the file

    structure.indexPoints()

    return structure
def clusterFromBed(path, chrom, tads):
    """Initializes cluster from intrachromosomal BED file.

    Every non-blank line is split on whitespace; fields 1 and 4 (0-based)
    are read as the two interacting positions and converted to point
    numbers with chrom.getPointNum(). A locus is kept when it takes part in
    at least one contact whose two ends have different point numbers but
    the same TAD number.

    Args:
        path: path of the BED file to read.
        chrom: chromosome object; built with intraChromFromBed(path, None)
            when None.
        tads: iterable of (first, last) index pairs, each covering
            range(first, last), or None to place every locus in one TAD.

    Returns:
        The Cluster, with points set and indexPoints() called. Entries of
        points for loci that were not kept hold 0.

    Raises:
        IndexError: a non-blank line has fewer than five fields.
        ValueError: field 1 or field 4 is not an integer.
    """
    if chrom is None:
        chrom = intraChromFromBed(path, None)

    cluster = Cluster([], [], chrom, 0)
    numLoci = cluster.chrom.getLength()

    # get TAD for every locus
    if tads is None:
        tadNums = np.zeros(numLoci)
    else:
        tadNums = []
        for tadNum, tad in enumerate(tads, 1):
            tadNums.extend(tadNum for _ in range(tad[0], tad[1]))
    # point numbers beyond the last TAD entry are clamped to this index
    maxIndex = len(tadNums) - 1

    # True where the locus should be added. The builtin bool is used because
    # the np.bool alias was removed in NumPy 1.24.
    points_to_add = np.zeros(numLoci, dtype=bool)
    tracker = Tracker("Identifying loci", cluster.chrom.size)

    # find which loci should be added
    with open(path) as listFile:
        for line in listFile:
            fields = line.split()
            if fields:  # blank lines (e.g. a trailing newline) carry no contact
                pointNum1 = cluster.chrom.getPointNum(int(fields[1]))
                pointNum2 = cluster.chrom.getPointNum(int(fields[4]))
                if pointNum1 is not None and pointNum2 is not None:
                    tadNum1 = tadNums[min(pointNum1, maxIndex)]
                    tadNum2 = tadNums[min(pointNum2, maxIndex)]
                    # must be distinct loci in the same TAD
                    if pointNum1 != pointNum2 and tadNum1 == tadNum2:
                        points_to_add[pointNum1] = True
                        points_to_add[pointNum2] = True
            # advance once per line read, whether or not it was used
            tracker.increment()
    # no explicit close(): the with-block already closes the file

    # create points (builtin object: the np.object alias was removed as well)
    points = np.zeros(numLoci, dtype=object)
    for pointNum in np.where(points_to_add)[0]:
        points[pointNum] = Point((0, 0, 0), pointNum, cluster.chrom, None)
    cluster.points = points
    cluster.indexPoints()

    return cluster
def matFromDixon(path, chrom):
    """Creates contact matrix from Dixon tsv data

    Every non-blank line is split on whitespace; fields 0 and 1 are read as
    the two contact positions. A contact is counted when the positions
    differ and both lie in [chrom.minPos, chrom.maxPos]. Each counted line
    adds 1 to the matrix cell for its pair of bins.

    Counts are stored in the lower triangle only: the larger bin index is
    the row and the smaller one the column, whichever order the two
    positions appear in on the line.

    Args:
        path: path of the whitespace-separated contact file.
        chrom: chromosome object providing getLength(), size, minPos,
            maxPos and getAbsoluteIndex().

    Returns:
        A (numBins, numBins) float array of contact counts.

    Raises:
        IndexError: a non-blank line has fewer than two fields.
        ValueError: field 0 or field 1 is not an integer.
    """
    numBins = chrom.getLength()
    mat = np.zeros((numBins, numBins))
    tracker = Tracker("Reading " + path, chrom.size)
    with open(path) as infile:
        for line in infile:
            fields = line.split()
            if fields:  # blank lines (e.g. a trailing newline) carry no contact
                pos1 = int(fields[0])
                pos2 = int(fields[1])
                in_range = (chrom.minPos <= pos1 <= chrom.maxPos
                            and chrom.minPos <= pos2 <= chrom.maxPos)
                if pos1 != pos2 and in_range:
                    bin1 = chrom.getAbsoluteIndex(pos1)
                    bin2 = chrom.getAbsoluteIndex(pos2)
                    # Order the pair so (a, b) and (b, a) hit the same cell.
                    # Both branches used to assign row = bin1, col = bin2,
                    # which put bin1 < bin2 contacts in the upper triangle.
                    if bin1 > bin2:
                        row, col = bin1, bin2
                    else:
                        row, col = bin2, bin1
                    mat[row, col] += 1
            # advance once per line read, whether or not it was used
            tracker.increment()
    # no explicit close(): the with-block already closes the file
    return mat
import numpy as np

# Convert an HDF5 contact file to a seven-column, tab-separated BED file.
#
# Usage: <script> IN_PATH OUT_PATH
#
# Reads pixels/{count,bin1_id,bin2_id} and bins/{chrom,start,end} from
# IN_PATH. For every pixel with a non-zero count it writes one line:
#   chr<c1>  start1  end1  chr<c2>  start2  end2  count
# NOTE(review): the dataset layout looks like the cooler schema - confirm.

in_path = sys.argv[1]
out_path = sys.argv[2]

# Open read-only and let the with-block close the file. The explicit mode
# avoids depending on h5py's default, which has changed between releases.
with h5py.File(in_path, "r") as f:
    counts = np.array(f["pixels"]["count"])
    bin_ids1 = np.array(f["pixels"]["bin1_id"])
    bin_ids2 = np.array(f["pixels"]["bin2_id"])
    chroms = np.array(f["bins"]["chrom"])
    starts = np.array(f["bins"]["start"])
    ends = np.array(f["bins"]["end"])

tracker = Tracker("Converting to BED", len(counts))

# Parenthesised form is valid on Python 3 and prints the same text on
# Python 2.
print("Begin converting to BED")

with open(out_path, "w") as out_file:
    for count, bin_id1, bin_id2 in zip(counts, bin_ids1, bin_ids2):
        if count != 0:
            chrom1 = str(chroms[bin_id1] + 1)  # switch to 1-indexed
            chrom2 = str(chroms[bin_id2] + 1)
            start1 = str(starts[bin_id1])
            end1 = str(ends[bin_id1])
            start2 = str(starts[bin_id2])
            end2 = str(ends[bin_id2])
            out_file.write("\t".join(
                ("chr" + chrom1, start1, end1,
                 "chr" + chrom2, start2, end2, str(count))))
            out_file.write("\n")
        # The tracker is sized to len(counts), so advance it once per pixel;
        # it was previously created but never incremented.
        tracker.increment()
""""Convert fixedStep wig to binned bed""" import sys sys.path.append("..") from tools import Tracker wig = sys.argv[1] bin_size = int(sys.argv[2]) file_size = int(sys.argv[3]) prefix = wig.split(".")[0] tracker = Tracker("Converting {}".format(wig), file_size) tot = 0 count = 0 with open(wig) as in_file: with open("{}_{}kb.bed".format(prefix, bin_size / 1000), "w") as out_file: for line in in_file: line = line.strip().split() if line[0] == "fixedStep": #header chrom = line[1].split("=")[1] curr_pos = int(line[2].split("=")[1]) step = int(line[3].split("=")[1]) span = int(line[4].split("=")[1]) else: tot += float(line[0]) count += span if curr_pos % bin_size == 0: