def infer_structure(contactMat, structure, alpha, num_threads, classical=False):
    """Infer 3D coordinates for a single structure and store them on it.

    The contact matrix is handed to ``at.makeSymmetric``, converted to a
    distance matrix with ``at.contactToDist`` and embedded in three
    dimensions, either with classical MDS (``la.cmds``) or with metric MDS
    (``manifold.MDS``).

    Args:
        contactMat: contact matrix with one row per entry of
            ``structure.nonzero_abs_indices()``. It is passed to
            ``at.makeSymmetric``, so the caller's matrix may be modified.
        structure: object that receives the result through ``setCoords``.
        alpha: forwarded to ``at.contactToDist``.
        num_threads: ``n_jobs`` for metric MDS; unused when ``classical``.
        classical: if true, use ``la.cmds`` instead of metric MDS.

    Raises:
        AssertionError: if the matrix size does not match the structure or
            any row of the matrix sums to zero.
    """
    n_loci = len(structure.nonzero_abs_indices())
    assert n_loci == len(contactMat)

    at.makeSymmetric(contactMat)

    # Every locus needs at least one contact before distances are computed.
    row_totals = np.array([sum(row) for row in contactMat])
    empty_rows = np.flatnonzero(row_totals == 0)
    assert empty_rows.size == 0

    distMat = at.contactToDist(contactMat, alpha)
    at.makeSymmetric(distMat)

    if classical:
        coords = la.cmds(distMat)
    else:
        embedder = manifold.MDS(
            n_components=3,
            metric=True,
            random_state=np.random.RandomState(),
            verbose=0,
            dissimilarity="precomputed",
            n_jobs=num_threads,
        )
        coords = embedder.fit_transform(distMat)

    structure.setCoords(coords)
def infer_structure(contactMat, structure, alpha, num_threads, weight, classical=False):
    """Infer 3D coordinates for a single structure and store them on it.

    Before embedding, every lower-triangle entry of the contact matrix is
    replaced by a blend of its observed value and the value returned by
    ``get_expected`` for that genomic separation:
    ``(1 - weight) * observed + weight * expected[i - j - 1]``.
    The blended matrix is symmetrized, converted to distances, normalized
    by its mean and embedded in three dimensions.

    Args:
        contactMat: contact matrix with one row per entry of
            ``structure.nonzero_abs_indices()``; modified in place.
        structure: object that receives the result through ``setCoords``.
        alpha: forwarded to ``at.contactToDist``.
        num_threads: ``n_jobs`` for metric MDS; unused when ``classical``.
        weight: blending factor applied to the expected contacts.
        classical: if true, use ``la.cmds`` instead of metric MDS.

    Raises:
        AssertionError: if the matrix size does not match the structure or
            any row of the blended matrix sums to zero.
    """
    n_loci = len(structure.nonzero_abs_indices())
    assert n_loci == len(contactMat)

    expected = get_expected(contactMat)
    size = len(contactMat)
    for row in range(1, size):  # row 0 has no entries below the diagonal
        for col in range(row):
            observed = contactMat[row, col]
            contactMat[row, col] = (
                1 - weight) * observed + weight * expected[row - col - 1]

    at.makeSymmetric(contactMat)

    row_totals = np.array([sum(row) for row in contactMat])
    empty_rows = np.flatnonzero(row_totals == 0)
    assert empty_rows.size == 0

    distMat = at.contactToDist(contactMat, alpha)
    at.makeSymmetric(distMat)
    mean_dist = np.mean(distMat)
    distMat = distMat / mean_dist  # normalize to unit mean distance

    if classical:
        coords = la.cmds(distMat)
    else:
        embedder = manifold.MDS(
            n_components=3,
            metric=True,
            random_state=np.random.RandomState(),
            verbose=0,
            dissimilarity="precomputed",
            n_jobs=num_threads,
        )
        coords = embedder.fit_transform(distMat)

    structure.setCoords(coords)
def matFromBed(path, structure=None):
    """Convert a BED file to a contact matrix restricted to ``structure``.

    Each non-blank line is split on whitespace; columns 1 and 4 are read as
    the two locus positions and column 6 as the contact count. Lines whose
    loci are not both present in ``structure`` (``get_rel_index`` returns
    ``None``) are ignored. Counts are accumulated in the lower triangle and
    the matrix is then passed to ``at.makeSymmetric``.

    Args:
        path: path of the BED file.
        structure: structure defining which loci are included. When
            ``None``, one is built with ``structureFromBed(path, None, None)``.

    Returns:
        A ``numpoints x numpoints`` NumPy array, where ``numpoints`` is the
        length of ``structure.nonzero_abs_indices()``.

    Raises:
        AssertionError: if the largest index falls outside
            ``structure.points`` or any row of the matrix sums to zero.
    """
    if structure is None:
        structure = structureFromBed(path, None, None)

    abs_indices = structure.nonzero_abs_indices()
    numpoints = len(abs_indices)
    mat = np.zeros((numpoints, numpoints))

    assert max(abs_indices) - structure.offset < len(structure.points)

    # The context manager closes the file; no explicit close() is needed.
    with open(path) as infile:
        for line in infile:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines (e.g. trailing newline)
            loc1 = int(fields[1])
            loc2 = int(fields[4])
            index1 = structure.get_rel_index(loc1)
            index2 = structure.get_rel_index(loc2)
            if index1 is None or index2 is None:
                continue
            # Accumulate in the lower triangle only (row >= col).
            if index1 > index2:
                row, col = index1, index2
            else:
                row, col = index2, index1
            mat[row, col] += float(fields[6])

    at.makeSymmetric(mat)

    rowsums = np.array([sum(row) for row in mat])
    assert len(np.where(rowsums == 0)[0]) == 0

    return mat
def infer_clusters(contactMat, clusters, offsets, classical=False):
    """Infer 3D coordinates for several clusters sharing one contact matrix.

    The matrix is symmetrized, converted to distances with
    ``at.contactToDist`` and embedded in three dimensions. Row
    ``i + offset`` of the embedding is assigned to the ``pos`` attribute of
    the ``i``-th point of the cluster paired with that offset.

    Args:
        contactMat: contact matrix whose length equals the total number of
            point numbers over all clusters. It is passed to
            ``at.makeSymmetric``, so the caller's matrix may be modified.
        clusters: clusters to position.
        offsets: starting row in the embedding for each cluster, paired
            with ``clusters`` by position.
        classical: if true, use ``st.cmds`` instead of metric MDS.

    Raises:
        AssertionError: if sizes disagree or any row of the matrix sums to
            zero.
    """
    total_points = sum(len(cluster.getPointNums()) for cluster in clusters)
    assert total_points == len(contactMat)

    at.makeSymmetric(contactMat)

    row_totals = np.array([sum(row) for row in contactMat])
    empty_rows = np.flatnonzero(row_totals == 0)
    assert empty_rows.size == 0

    distMat = at.contactToDist(contactMat)
    at.makeSymmetric(distMat)

    if classical:
        coords = st.cmds(distMat)
    else:
        # Fixed seed: repeated runs give the same embedding.
        mds = manifold.MDS(
            n_components=3,
            metric=True,
            random_state=np.random.RandomState(seed=3),
            verbose=0,
            dissimilarity="precomputed",
            n_jobs=-1,
        )
        coords = mds.fit_transform(distMat)

    for offset, cluster in zip(offsets, clusters):
        for i, point in enumerate(cluster.getPoints()):
            point.pos = coords[i + offset]
def get_compartments(mat, enrichments=None, active=True):
    """Compute normalized compartment scores (Lieberman-Aiden et al, 2009).

    ``mat`` is transformed with ``oe`` and then ``cor`` (presumably
    observed/expected normalization and a correlation matrix -- confirm
    against their definitions), each result being symmetrized. The scores
    are the first principal component of the resulting matrix.

    Args:
        mat: input contact matrix.
        enrichments: optional per-locus values used to orient the sign of
            the scores via their Pearson correlation with the scores.
        active: when ``enrichments`` is given, the scores are negated if
            needed so that their correlation with ``enrichments`` is
            non-negative (``active=True``) or non-positive
            (``active=False``).

    Returns:
        1-D array of scores. Positive scores are divided by the maximum
        score and the remaining scores by the absolute value of the
        minimum score, so values lie in [-1, 1] when both signs occur.
    """
    oe_mat = oe(mat)
    at.makeSymmetric(oe_mat)
    cor_mat = cor(oe_mat)
    at.makeSymmetric(cor_mat)

    # fit_transform fits the model itself; a separate fit() call beforehand
    # would only repeat the decomposition.
    scores = PCA(n_components=1).fit_transform(cor_mat)[:, 0]

    # enforce the requested sign convention relative to the enrichments
    if enrichments is not None:
        r, _ = st.pearsonr(scores, enrichments)
        if (active and r < 0) or (not active and r > 0):
            scores = -scores

    # normalize each sign separately so both extremes map to +/-1
    max_val = scores.max()
    min_val = -scores.min()
    positive = scores > 0
    scores[positive] /= max_val
    scores[~positive] /= min_val

    return scores
def removeInfinite(mat):
    """Return a copy of ``mat`` with non-finite lower-triangle values zeroed.

    Entries on or below the diagonal for which ``np.isfinite`` is false
    (infinities and NaN) are set to zero in a copy of the input; the copy
    is then passed to ``at.makeSymmetric``. The input itself is not
    modified.

    Args:
        mat: square matrix.

    Returns:
        The cleaned, symmetrized copy.
    """
    cleaned = np.copy(mat)
    for row in range(len(mat)):
        # View of the entries on or left of the diagonal in this row;
        # assigning through it writes into ``cleaned``.
        lower = cleaned[row, :row + 1]
        lower[~np.isfinite(lower)] = 0
    at.makeSymmetric(cleaned)
    return cleaned
def infer_structures(contactMat, structures, offsets, alpha, num_threads, classical=False):
    """Infer 3D coordinates for several structures sharing one contact matrix.

    The matrix is symmetrized, converted to distances with
    ``at.contactToDist`` and embedded in three dimensions. Each structure
    receives, through ``setCoords``, the slice of the embedding that starts
    at its offset and spans as many rows as it has points.

    Args:
        contactMat: contact matrix whose length equals the total number of
            point numbers over all structures. It is passed to
            ``at.makeSymmetric``, so the caller's matrix may be modified.
        structures: structures to position.
        offsets: starting row in the embedding for each structure, paired
            with ``structures`` by position.
        alpha: forwarded to ``at.contactToDist``.
        num_threads: ``n_jobs`` for metric MDS; unused when ``classical``.
        classical: if true, use ``la.cmds`` instead of metric MDS.

    Raises:
        AssertionError: if sizes disagree or any row of the matrix sums to
            zero.
    """
    total_points = sum(len(structure.getPointNums()) for structure in structures)
    assert total_points == len(contactMat)

    at.makeSymmetric(contactMat)

    row_totals = np.array([sum(row) for row in contactMat])
    empty_rows = np.flatnonzero(row_totals == 0)
    assert empty_rows.size == 0

    distMat = at.contactToDist(contactMat, alpha)
    at.makeSymmetric(distMat)

    if classical:
        coords = la.cmds(distMat)
    else:
        embedder = manifold.MDS(
            n_components=3,
            metric=True,
            random_state=np.random.RandomState(),
            verbose=0,
            dissimilarity="precomputed",
            n_jobs=num_threads,
        )
        coords = embedder.fit_transform(distMat)

    for start, structure in zip(offsets, structures):
        stop = start + len(structure.getPoints())
        structure.setCoords(coords[start:stop])
def distmat(contactMat, structure, alpha, weight, num_threads):
    """Build a mean-normalized distance matrix from a contact matrix.

    Every lower-triangle entry of the contact matrix is first replaced by
    ``(1 - weight) * observed + weight * expected[i - j - 1]``, where
    ``expected`` comes from ``get_expected``. The blended matrix is
    symmetrized, converted with ``at.contactToDist`` and divided by its
    mean.

    Args:
        contactMat: contact matrix with one row per entry of
            ``structure.nonzero_abs_indices()``; modified in place.
        structure: used only to check the matrix size.
        alpha: forwarded to ``at.contactToDist``.
        weight: blending factor applied to the expected contacts.
        num_threads: not used by this function.

    Returns:
        The normalized distance matrix.

    Raises:
        AssertionError: if the matrix size does not match the structure or
            any row of the blended matrix sums to zero.
    """
    n_loci = len(structure.nonzero_abs_indices())
    assert n_loci == len(contactMat)

    expected = get_expected(contactMat)
    size = len(contactMat)
    for row in range(1, size):  # row 0 has no entries below the diagonal
        for col in range(row):
            observed = contactMat[row, col]
            contactMat[row, col] = (
                1 - weight) * observed + weight * expected[row - col - 1]

    at.makeSymmetric(contactMat)

    row_totals = np.array([sum(row) for row in contactMat])
    empty_rows = np.flatnonzero(row_totals == 0)
    assert empty_rows.size == 0

    dists = at.contactToDist(contactMat, alpha)
    at.makeSymmetric(dists)
    mean_dist = np.mean(dists)
    return dists / mean_dist
def matFromBed(path, cluster, interpolate=True):
    """Convert a BED file to a contact matrix restricted to ``cluster``.

    Each non-blank line is split on whitespace; columns 1 and 4 are read as
    the two locus positions and column 6 as the contact count. Lines whose
    loci are not both present in ``cluster`` (``getIndex`` returns
    ``None``) are ignored. Counts are accumulated in the lower triangle and
    the matrix is then passed to ``at.makeSymmetric``.

    Args:
        path: path of the BED file.
        cluster: cluster defining which loci are included;
            ``cluster.indexPoints()`` is called on it first.
        interpolate: accepted for backward compatibility; currently has no
            effect because the interpolation step is disabled.

    Returns:
        A ``numpoints x numpoints`` NumPy array, where ``numpoints`` is the
        length of ``cluster.getPointNums()``.

    Raises:
        AssertionError: if the largest point number falls outside
            ``cluster.points`` or any row of the matrix sums to zero.
    """
    cluster.indexPoints()
    pointNums = cluster.getPointNums()
    numpoints = len(pointNums)
    mat = np.zeros((numpoints, numpoints))

    assert max(pointNums) - cluster.offset < len(cluster.points)

    # The context manager closes the file; no explicit close() is needed.
    with open(path) as infile:
        for line in infile:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines (e.g. trailing newline)
            loc1 = int(fields[1])
            loc2 = int(fields[4])
            index1 = cluster.getIndex(loc1)
            index2 = cluster.getIndex(loc2)
            if index1 is None or index2 is None:
                continue
            # Accumulate in the lower triangle only (row >= col).
            if index1 > index2:
                row, col = index1, index2
            else:
                row, col = index2, index1
            mat[row, col] += float(fields[6])

    at.makeSymmetric(mat)

    rowsums = np.array([sum(row) for row in mat])
    assert len(np.where(rowsums == 0)[0]) == 0

    return mat
def fullMatFromBed(path, chrom):
    """Convert a BED file to a contact matrix covering the whole chromosome.

    The matrix has one row per bin between ``chrom.minPos`` and
    ``chrom.maxPos`` at resolution ``chrom.res``. Each non-blank line is
    split on whitespace; columns 1 and 4 are read as the two locus
    positions and column 6 as the contact count. Counts are accumulated in
    the lower triangle and the matrix is then passed to
    ``at.makeSymmetric``.

    Args:
        path: path of the BED file.
        chrom: object providing ``minPos``, ``maxPos``, ``res`` and
            ``getAbsoluteIndex``.

    Returns:
        A ``numpoints x numpoints`` NumPy array.
    """
    # Floor division keeps the bin count an integer on Python 3 as well;
    # true division would produce a float that np.zeros rejects as a shape.
    numpoints = int((chrom.maxPos - chrom.minPos) // chrom.res) + 1
    mat = np.zeros((numpoints, numpoints))

    # The context manager closes the file; no explicit close() is needed.
    with open(path) as infile:
        for line in infile:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines (e.g. trailing newline)
            loc1 = int(fields[1])
            loc2 = int(fields[4])
            index1 = chrom.getAbsoluteIndex(loc1)
            index2 = chrom.getAbsoluteIndex(loc2)
            # Accumulate in the lower triangle only (row >= col).
            if index1 > index2:
                row, col = index1, index2
            else:
                row, col = index2, index1
            mat[row, col] += float(fields[6])

    at.makeSymmetric(mat)
    return mat
def matFromBed(path, cluster):
    """Convert a BED file to a contact matrix restricted to ``cluster``.

    Each non-blank line is split on whitespace; columns 1 and 4 are read as
    the two locus positions and column 6 as the contact count. Lines whose
    loci are not both present in ``cluster`` (``getIndex`` returns
    ``None``) are ignored. Counts are accumulated in the lower triangle and
    the matrix is then passed to ``at.makeSymmetric``.

    Args:
        path: path of the BED file.
        cluster: cluster defining which loci are included. When ``None``,
            one is built with ``clusterFromBed(path, None, None)``.
            ``cluster.indexPoints()`` is called before use.

    Returns:
        A ``numpoints x numpoints`` NumPy array, where ``numpoints`` is the
        length of ``cluster.getPointNums()``.

    Raises:
        AssertionError: if the largest point number falls outside
            ``cluster.points`` or any row of the matrix sums to zero.
    """
    if cluster is None:
        cluster = clusterFromBed(path, None, None)

    cluster.indexPoints()
    pointNums = cluster.getPointNums()
    numpoints = len(pointNums)
    mat = np.zeros((numpoints, numpoints))

    maxPointNum = max(pointNums)
    assert maxPointNum - cluster.offset < len(cluster.points)

    # The context manager closes the file; no explicit close() is needed.
    with open(path) as infile:
        for line in infile:
            fields = line.split()
            if not fields:
                continue  # tolerate blank lines (e.g. trailing newline)
            loc1 = int(fields[1])
            loc2 = int(fields[4])
            index1 = cluster.getIndex(loc1)
            index2 = cluster.getIndex(loc2)
            if index1 is None or index2 is None:
                continue
            # Accumulate in the lower triangle only (row >= col).
            if index1 > index2:
                row, col = index1, index2
            else:
                row, col = index2, index1
            mat[row, col] += float(fields[6])

    at.makeSymmetric(mat)

    rowsums = np.array([sum(row) for row in mat])
    assert len(np.where(rowsums == 0)[0]) == 0

    return mat
def infer_cluster(contactMat, cluster, classical=False):
    """Infer 3D coordinates for a single cluster.

    The matrix is symmetrized, converted to distances with
    ``at.contactToDist`` and embedded in three dimensions. Row ``i`` of the
    embedding is assigned to the ``pos`` attribute of the ``i``-th point
    returned by ``cluster.getPoints()``.

    Args:
        contactMat: contact matrix with one row per entry of
            ``cluster.getPointNums()``. It is passed to
            ``at.makeSymmetric``, so the caller's matrix may be modified.
        cluster: cluster whose points are positioned.
        classical: if true, use ``st.cmds`` instead of metric MDS.

    Raises:
        AssertionError: if the matrix size does not match the cluster or
            any row of the matrix sums to zero.
    """
    n_points = len(cluster.getPointNums())
    assert n_points == len(contactMat)

    at.makeSymmetric(contactMat)

    row_totals = np.array([sum(row) for row in contactMat])
    empty_rows = np.flatnonzero(row_totals == 0)
    assert empty_rows.size == 0

    distMat = at.contactToDist(contactMat)
    at.makeSymmetric(distMat)

    if classical:
        coords = st.cmds(distMat)
    else:
        # Fixed seed: repeated runs give the same embedding.
        mds = manifold.MDS(
            n_components=3,
            metric=True,
            random_state=np.random.RandomState(seed=3),
            verbose=0,
            dissimilarity="precomputed",
            n_jobs=-1,
        )
        fitted = mds.fit(distMat)
        coords = fitted.embedding_

    for i, point in enumerate(cluster.getPoints()):
        point.pos = coords[i]
import array_tools as at import os import numpy as np res = int(sys.argv[1]) res_kb = res / 1000 if os.path.isfile("A_compartment_{}kb.bed".format(res_kb)): os.system("rm A_compartment_{}kb.bed".format(res_kb)) for chrom in (1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22): path = "hic_data/GM12878_combined_{}_100kb.bed".format(chrom) structure = dt.structureFromBed(path) contacts = dt.matFromBed(path, structure) at.makeSymmetric(contacts) enrichments = np.array(np.loadtxt( "binding_data/Gm12878_{}_100kb_active_coverage.bed".format(chrom), dtype=object)[:, 6], dtype=float) bin_nums = structure.nonzero_abs_indices( ) + structure.chrom.minPos / structure.chrom.res enrichments = enrichments[bin_nums] compartments = np.array(ca.get_compartments(contacts, enrichments)) gen_coords = np.array(structure.getGenCoords()) a_gen_coords = gen_coords[np.where(compartments > 0)] with open("A_compartment_{}kb.bed".format(res_kb), "a") as out: for a_gen_coord in a_gen_coords: for i in range(100 / res_kb): out.write("\t".join( (structure.chrom.name,
# Script fragment: correlates a distance matrix derived from Hi-C contacts
# with the distance matrices of structures produced by several methods.
from matplotlib import pyplot as plt
import numpy as np
import sys
sys.path.append("..")  # project modules below are imported from the parent directory
import data_tools as dt
import array_tools as at
import misc

# "True" distance matrix, derived directly from the contact data.
# NOTE(review): ``bedpath`` is not defined in the visible part of this
# script -- it must be set before this point.
cluster = dt.clusterFromBed(bedpath, None, None)
contactMat = dt.matFromBed(bedpath, cluster)
distMat = at.contactToDist(contactMat)
at.makeSymmetric(distMat)
for j in range(len(distMat)):  # remove diagonal
    distMat[j, j] = 0

# Chromosome3D output.
# NOTE(review): this path is at 100kb while the files below are 10kb --
# confirm the matrices being correlated have matching dimensions.
chromthreed_distMat = misc.distsFromCoords(
    "Chromosome3D/output/chr22_100kb/chr22_100kb_coords.tsv")
chromthreed_r = misc.pearson(distMat, chromthreed_distMat)

# Structure loaded from the "mmds" coordinates file.
mmds_distMat = dt.clusterFromFile(
    "hic_data/GM12878_combined_22_10kb_mmds_coords.tsv").distMat()
mmds_r = misc.pearson(distMat, mmds_distMat)

# Structure loaded from the "cmds" coordinates file.
cmds_distMat = dt.clusterFromFile(
    "hic_data/GM12878_combined_22_10kb_cmds_coords.tsv").distMat()
cmds_r = misc.pearson(distMat, cmds_distMat)

# Structure loaded from the "minimds" coordinates file.
minimds_distMat = dt.clusterFromFile(
    "hic_data/GM12878_combined_22_10kb_minimds_coords.tsv").distMat()
minimds_r = misc.pearson(distMat, minimds_distMat)
def heatMapFromMat(mat, maxvalue, tads, outpath, colors=None):
    """Symmetrize a matrix, optionally threshold it, and draw its heatmap.

    Args:
        mat: matrix to plot. It is passed to ``at.makeSymmetric`` and, when
            ``maxvalue`` is given, to ``threshold``, so the caller's matrix
            may be modified.
        maxvalue: value forwarded to ``threshold``; ``None`` skips
            thresholding.
        tads: forwarded to ``createHeatmap``.
        outpath: forwarded to ``createHeatmap``.
        colors: forwarded to ``createHeatmap``.
    """
    at.makeSymmetric(mat)

    apply_threshold = maxvalue is not None
    if apply_threshold:
        threshold(mat, maxvalue)

    createHeatmap(mat, tads, outpath, colors)
def normalized_dist_mat(path, structure):
    """Build a mean-normalized distance matrix from a BED file.

    The contact matrix read by ``matFromBed`` is converted with
    ``at.contactToDist`` (second argument fixed at 4), symmetrized and
    divided by its mean.

    Args:
        path: path of the BED file.
        structure: forwarded to ``matFromBed``.

    Returns:
        The distance matrix divided by its mean value.
    """
    contact_mat = matFromBed(path, structure)
    dist_mat = at.contactToDist(contact_mat, 4)
    at.makeSymmetric(dist_mat)
    mean_dist = np.mean(dist_mat)
    return dist_mat / mean_dist
# Run multimds on both inputs through the shell. The exit status is not
# checked.
# NOTE(review): presumably this writes the *_structure.tsv files loaded
# below -- confirm.
os.system("python ~/git/multimds/multimds.py --full {} {}".format(
    path1, path2))

# Load the structure for each cell type.
structure1 = dt.structure_from_file(
    "hic_data/{}_{}_{}kb_structure.tsv".format(cell_type1, chrom, res_kb))
structure2 = dt.structure_from_file(
    "hic_data/{}_{}_{}kb_structure.tsv".format(cell_type2, chrom, res_kb))

#plot.plot_structures_interactive((structure1, structure2))

# Compartment scores for each contact matrix.
contacts1 = dt.matFromBed(path1, structure1)
contacts2 = dt.matFromBed(path2, structure2)

at.makeSymmetric(contacts1)
at.makeSymmetric(contacts2)

compartments1 = np.array(ca.get_compartments(contacts1))
compartments2 = np.array(ca.get_compartments(contacts2))

# Flip the second set of scores when the two are anti-correlated, so that
# both use the same sign convention.
r, p = st.pearsonr(compartments1, compartments2)
if r < 0:
    compartments2 = -compartments2

# SVR inputs: coordinates of both structures stacked in the same order as
# their compartment scores.
coords1 = structure1.getCoords()
coords2 = structure2.getCoords()
coords = np.concatenate((coords1, coords2))
compartments = np.concatenate((compartments1, compartments2))
clf = svm.LinearSVR()
20, 21, 22, "X") n = len(chroms) mmds_rs = np.zeros(n) cmds_rs = np.zeros(n) minimds_rs = np.zeros(n) mogen_rs = np.zeros(n) for i, chrom in enumerate(chroms): bedpath = "hic_data/GM12878_combined_{}_10kb.bed".format(chrom) mmds_structure = dt.structure_from_file( "hic_data/GM12878_combined_{}_10kb_mmds_coords.tsv".format(chrom)) contactMat = dt.matFromBed(bedpath, mmds_structure) mmds_true_mat = at.contactToDist(contactMat) at.makeSymmetric(mmds_true_mat) for j in range(len(mmds_true_mat)): #remove diagonal mmds_true_mat[j, j] = 0 mmds_distMat = misc.distMat(mmds_structure) mmds_rs[i] = misc.pearson(mmds_true_mat, mmds_distMat) cmds_structure = dt.structure_from_file( "hic_data/GM12878_combined_{}_10kb_cmds_coords.tsv".format(chrom)) contactMat = dt.matFromBed(bedpath, cmds_structure) cmds_true_mat = at.contactToDist(contactMat) at.makeSymmetric(cmds_true_mat) for j in range(len(cmds_true_mat)): #remove diagonal cmds_true_mat[j, j] = 0 cmds_distMat = misc.distMat(cmds_structure) cmds_rs[i] = misc.pearson(cmds_true_mat, cmds_distMat)