def full_mds(path1, path2, alpha=4, penalty=0.05, num_threads=3, weight=0.05, prefix=""):
    """Jointly infer two structures by MDS without partitioning and save the results.

    Three files are written, each named from ``prefix`` followed by the input
    basenames with their extension stripped:

    * ``<prefix><name1>_structure.tsv`` and ``<prefix><name2>_structure.tsv``,
      produced by each structure's own ``write`` method;
    * ``<prefix><name1>_<name2>_relocalization.bed``, holding one
      tab-separated row per genomic coordinate of the first structure:
      chromosome name, coordinate, coordinate + resolution, and the matching
      value returned by ``la.calculate_distances``.

    The result of ``calculate_compartment_fraction`` is printed afterwards.

    Args:
        path1: path of the first BED input.
        path2: path of the second BED input.
        alpha, penalty, num_threads, weight: forwarded unchanged to
            ``infer_structures``.
        prefix: string prepended to every output file name.

    Returns:
        Tuple ``(structure1, structure2)`` of the inferred structures.
    """
    structure1 = dt.structureFromBed(path1)
    structure2 = dt.structureFromBed(path2)
    dt.make_compatible((structure1, structure2))
    contactMat1 = dt.matFromBed(path1, structure1)
    contactMat2 = dt.matFromBed(path2, structure2)
    infer_structures(contactMat1, structure1, contactMat2, structure2, alpha,
                     penalty, num_threads, weight)

    prefix1 = os.path.splitext(os.path.basename(path1))[0]
    structure1.write("{}{}_structure.tsv".format(prefix, prefix1))
    prefix2 = os.path.splitext(os.path.basename(path2))[0]
    structure2.write("{}{}_structure.tsv".format(prefix, prefix2))

    dists = la.calculate_distances(structure1, structure2)
    # The context manager closes the file on exit, so no explicit close() is needed.
    with open("{}{}_{}_relocalization.bed".format(prefix, prefix1, prefix2), "w") as out:
        for gen_coord, dist in zip(structure1.getGenCoords(), dists):
            row = (structure1.chrom.name, str(gen_coord),
                   str(gen_coord + structure1.chrom.res), str(dist))
            out.write("\t".join(row) + "\n")

    print("Fractional compartment change: ")
    print(calculate_compartment_fraction(structure1, structure2, path1, path2))
    return structure1, structure2
def fullMDS(path1, path2, alpha, penalty, num_threads, weight):
    """Jointly infer structures for two BED inputs in one unpartitioned MDS run.

    Both structures are loaded, made compatible with each other, and handed
    to ``infer_structures`` together with their contact matrices.

    Returns:
        Tuple of the two structures, in input order.
    """
    paths = (path1, path2)
    structures = tuple(dt.structureFromBed(p) for p in paths)
    dt.make_compatible(structures)
    first_mat, second_mat = [dt.matFromBed(p, s) for p, s in zip(paths, structures)]
    infer_structures(first_mat, structures[0], second_mat, structures[1],
                     alpha, penalty, num_threads, weight)
    return structures
def fullMDS(path, classical, alpha, num_threads, weight):
    """Infer a single structure from a BED file with one unpartitioned MDS run.

    The structure is built from ``path`` and then passed, along with its
    contact matrix, to ``infer_structure``.

    Returns:
        The structure object created from ``path``.
    """
    inferred = dt.structureFromBed(path)
    infer_structure(dt.matFromBed(path, inferred), inferred, alpha,
                    num_threads, weight, classical)
    return inferred
def fullMDS(path, classical=False):
    """Infer a single cluster from a BED file with one unpartitioned MDS run.

    Returns:
        The cluster object created from ``path``.
    """
    result = dt.clusterFromBed(path, None, None)
    contacts = dt.matFromBed(path, result)
    # NOTE(review): the returned distance matrix is never used; the call is
    # kept only so that behavior stays identical -- confirm it can be dropped.
    at.contactToDist(contacts)
    infer_cluster(contacts, result, classical)
    return result
def calculate_compartment_fraction(structure1, structure2, path1, path2):
    """Return the third-axis share of the normalized change between two structures.

    Compartment scores are computed for both inputs and the second set is
    sign-flipped when the two are negatively correlated. A linear SVR is fit
    from the pooled 3-D coordinates to the pooled scores, and both coordinate
    sets are passed through ``la.change_coordinate_system`` with the fitted
    coefficients (presumably aligning one axis with the compartment
    direction -- confirm against ``linear_algebra``). For each axis, the mean
    absolute per-point difference is divided by the mean absolute deviation
    from the centroid (averaged over the two structures); the result is the
    third axis's value divided by the sum over all three axes.
    """
    def spread(points, center, axis):
        # mean absolute offset of one coordinate from its centroid
        return np.mean(np.abs(points[:, axis] - center[axis]))

    # compartment scores, with consistent sign between the two inputs
    mat_a = dt.matFromBed(path1, structure1)
    mat_b = dt.matFromBed(path2, structure2)
    scores_a = np.array(ca.get_compartments(mat_a))
    scores_b = np.array(ca.get_compartments(mat_b))
    r, _ = st.pearsonr(scores_a, scores_b)
    if r < 0:
        scores_b = -scores_b

    # support vector regression of score on position
    xyz_a = structure1.getCoords()
    xyz_b = structure2.getCoords()
    regressor = svm.LinearSVR()
    regressor.fit(np.concatenate((xyz_a, xyz_b)),
                  np.concatenate((scores_a, scores_b)))
    direction = regressor.coef_

    rotated_a = np.array(la.change_coordinate_system(direction, xyz_a))
    rotated_b = np.array(la.change_coordinate_system(direction, xyz_b))
    center_a = np.mean(rotated_a, axis=0)
    center_b = np.mean(rotated_b, axis=0)

    # per-axis change, normalized by the typical extent along that axis
    normalized = []
    for axis in range(3):
        extent = np.mean((spread(rotated_a, center_a, axis),
                          spread(rotated_b, center_b, axis)))
        change = np.mean(np.abs(rotated_a[:, axis] - rotated_b[:, axis]))
        normalized.append(change / extent)

    x_share, y_share, z_share = normalized
    return z_share / (x_share + y_share + z_share)
from matplotlib import pyplot as plt
import sys
sys.path.append("..")
import compartment_analysis as ca
import data_tools as dt
import os

# Plot one compartment track per BED file named on the command line,
# labelled with the file's basename.
paths = sys.argv[1:]
labels = list(map(os.path.basename, paths))
structs = list(map(dt.structureFromBed, paths))
mats = list(map(dt.matFromBed, paths, structs))
all_comps = list(map(ca.get_compartments, mats))
all_gen_coords = [struct.getGenCoords() for struct in structs]

#all_comps[len(all_comps)-1] = -all_comps[len(all_comps)-1]

for label, positions, track in zip(labels, all_gen_coords, all_comps):
    plt.plot(positions, track, label=label)
plt.legend()
plt.show()
path2 = "hic_data/{}_{}_{}kb.bed".format(cell_type2, chrom, res_kb) if os.path.isfile(path1) and os.path.isfile(path2): os.system("python ~/git/multimds/multimds.py --full {} {}".format( path1, path2)) structure1 = dt.structure_from_file( "hic_data/{}_{}_{}kb_structure.tsv".format(cell_type1, chrom, res_kb)) structure2 = dt.structure_from_file( "hic_data/{}_{}_{}kb_structure.tsv".format(cell_type2, chrom, res_kb)) #plot.plot_structures_interactive((structure1, structure2)) #compartments contacts1 = dt.matFromBed(path1, structure1) contacts2 = dt.matFromBed(path2, structure2) at.makeSymmetric(contacts1) at.makeSymmetric(contacts2) compartments1 = np.array(ca.get_compartments(contacts1)) compartments2 = np.array(ca.get_compartments(contacts2)) r, p = st.pearsonr(compartments1, compartments2) if r < 0: compartments2 = -compartments2 #SVR coords1 = structure1.getCoords() coords2 = structure2.getCoords()
cell_type1 = sys.argv[1] cell_type2 = sys.argv[2] res_kb = int(sys.argv[3]) struct1 = dt.structure_from_file("{}_21_{}kb_structure.tsv".format( cell_type1, res_kb)) struct2 = dt.structure_from_file("{}_21_{}kb_structure.tsv".format( cell_type2, res_kb)) gen_coords = np.array(struct1.getGenCoords()) dists = np.array([ la.calcDistance(coord1, coord2) for coord1, coord2 in zip(struct1.getCoords(), struct2.getCoords()) ]) mat1 = dt.matFromBed("hic_data/{}_21_{}kb.bed".format(cell_type1, res_kb), struct1) comps1 = ca.get_compartments(mat1, struct1) mat2 = dt.matFromBed("hic_data/{}_21_{}kb.bed".format(cell_type2, res_kb), struct2) comps2 = ca.get_compartments(mat2, struct2) r, p = st.pearsonr(comps1, comps2) if r < 0: comps1 = -comps1 comp_diffs = np.abs(comps1 - comps2) plt.subplot2grid((10, 10), (0, 0), 9, 10, frameon=False) plt.plot(gen_coords, dists / max(dists), lw=2,
def partitioned_mds(path1, path2, prefix="", centromere=0, num_partitions=4,
                    maxmemory=32000000, num_threads=3, alpha=4, res_ratio=10,
                    penalty=0.05, weight=0.05):
    """Split two structures into matching partitions and run joint MDS on each.

    A low-resolution pair of structures is inferred first. The index range is
    split at ``centromere`` (or at the halfway point when ``centromere`` is
    0) and each side is cut into ``num_partitions / 2`` pieces. Every piece
    is then inferred at full resolution and passed to ``transform`` together
    with its low-resolution counterpart and ``res_ratio``.

    ``num_partitions`` must be even (checked with ``assert``). ``prefix`` and
    ``maxmemory`` are accepted but not used in this function.

    Returns:
        Tuple ``(highstructure1, highstructure2)``.
    """
    # low-resolution structures
    lowstructure1 = create_low_res_structure(path1, res_ratio)
    lowstructure2 = create_low_res_structure(path2, res_ratio)
    dt.make_compatible((lowstructure1, lowstructure2))

    # partition boundaries, expressed in absolute (not relative) indices
    n = len(lowstructure1.getPoints())
    midpoint = (int(n / 2) if centromere == 0
                else lowstructure1.chrom.getAbsoluteIndex(centromere))

    assert num_partitions % 2 == 0

    per_side = num_partitions / 2
    size_left = int(np.ceil(float(midpoint) / per_side))
    size_right = int(np.ceil(float(n - midpoint) / per_side))

    bounds = [(i * size_left, min((i + 1) * size_left, midpoint))
              for i in range(int(per_side))]
    bounds += [(midpoint + i * size_right,
                min(midpoint + (i + 1) * size_right, n - 1))
               for i in range(int(per_side))]
    lowpartitions = np.array(bounds)

    low_contactMat1 = dt.matFromBed(path1, lowstructure1)
    low_contactMat2 = dt.matFromBed(path2, lowstructure2)

    tad.substructuresFromAbsoluteTads(lowstructure1, lowpartitions)
    tad.substructuresFromAbsoluteTads(lowstructure2, lowpartitions)

    # high-resolution chromosome parameters
    size1, res1 = dt.basicParamsFromBed(path1)
    low_chrom1 = lowstructure1.chrom
    highChrom1 = dt.ChromParameters(low_chrom1.minPos, low_chrom1.maxPos, res1,
                                    low_chrom1.name, size1)
    size2, res2 = dt.basicParamsFromBed(path2)
    low_chrom2 = lowstructure2.chrom
    highChrom2 = dt.ChromParameters(low_chrom2.minPos, low_chrom2.maxPos, res2,
                                    low_chrom2.name, size2)

    # one high-resolution substructure per partition and input
    pieces1 = []
    pieces2 = []
    low_gen_coords = lowstructure1.getGenCoords()
    offset1 = offset2 = 0
    for first_index, last_index in lowpartitions:
        start_gen_coord = low_gen_coords[first_index]
        end_gen_coord = low_gen_coords[last_index]
        piece1 = dt.structureFromBed(path1, highChrom1, start_gen_coord,
                                     end_gen_coord, offset1)
        piece2 = dt.structureFromBed(path2, highChrom2, start_gen_coord,
                                     end_gen_coord, offset2)
        pieces1.append(piece1)
        pieces2.append(piece2)
        offset1 += len(piece1.points) - 1
        offset2 += len(piece2.points) - 1

    for pair in zip(pieces1, pieces2):
        dt.make_points_compatible(pair)

    highstructure1 = dt.Structure([], pieces1, highChrom1, 0)
    highstructure2 = dt.Structure([], pieces2, highChrom2, 0)

    infer_structures(low_contactMat1, lowstructure1, low_contactMat2,
                     lowstructure2, alpha, penalty, num_threads, weight)
    print("Low-resolution MDS complete")

    shared_high1 = pymp.shared.list(highstructure1.structures)
    shared_high2 = pymp.shared.list(highstructure2.structures)
    shared_low1 = pymp.shared.list(lowstructure1.structures)
    shared_low2 = pymp.shared.list(lowstructure2.structures)

    numSubstructures = len(highstructure1.structures)
    # never exceed the requested threads, the available CPUs, or the work items
    num_threads = min((num_threads, mp.cpu_count(), numSubstructures))
    with pymp.Parallel(num_threads) as p:
        for idx in p.range(numSubstructures):
            high1 = shared_high1[idx]
            high2 = shared_high2[idx]
            low1 = shared_low1[idx]
            low2 = shared_low2[idx]

            # joint MDS on the contact matrices of this partition only
            piece_mat1 = dt.matFromBed(path1, high1)
            piece_mat2 = dt.matFromBed(path2, high2)
            infer_structures(piece_mat1, high1, piece_mat2, high2, 2.5,
                             penalty, num_threads, weight)

            transform(low1, high1, res_ratio)
            transform(low2, high2, res_ratio)

            shared_high1[idx] = high1
            shared_high2[idx] = high2

            print("MDS performed on structure {} of {}".format(
                idx + 1, numSubstructures))

    highstructure1.setstructures(shared_high1)
    highstructure2.setstructures(shared_high2)

    highstructure1.set_rel_indices()
    highstructure2.set_rel_indices()

    return highstructure1, highstructure2
import sys
sys.path.append("..")  # presumably so the project modules imported below resolve from the parent directory
import data_tools as dt
import numpy as np
import compartment_analysis as ca
from sklearn import svm
import linear_algebra as la
from mayavi import mlab

# Hard-coded inputs: contact BED files and previously written structure files.
path1 = "hic_data/GM12878_combined_21_100kb.bed"
path2 = "hic_data/K562_21_100kb.bed"
struct1 = dt.structure_from_file("GM12878_combined_21_100kb_structure.tsv")
struct2 = dt.structure_from_file("K562_21_100kb_structure.tsv")

# First input: contact matrix, then compartment scores.
contacts1 = dt.matFromBed(path1, struct1)
# Only column index 6 of the coverage file is read.
enrichments1 = np.loadtxt("binding_data/GM12878_21_100kb_active_coverage.bed", usecols=6)
# Shift the structure's nonzero absolute indices by minPos/res before using
# them as row numbers into the coverage values
# (assumes the coverage file has one row per bin from position 0 -- TODO confirm).
bin_nums1 = struct1.nonzero_abs_indices() + int(struct1.chrom.minPos / struct1.chrom.res)
enrichments1 = enrichments1[bin_nums1]
comps1 = np.array(ca.get_compartments(contacts1, struct1, enrichments1))

# Second input: same steps as above.
contacts2 = dt.matFromBed(path2, struct2)
enrichments2 = np.loadtxt("binding_data/K562_21_100kb_active_coverage.bed", usecols=6)
bin_nums2 = struct2.nonzero_abs_indices() + int(struct2.chrom.minPos / struct2.chrom.res)
enrichments2 = enrichments2[bin_nums2]
comps2 = np.array(ca.get_compartments(contacts2, struct2, enrichments2))
import sys
sys.path.append("..")
import data_tools as dt
from matplotlib import pyplot as plt
import numpy as np

# Plot the average nonzero contact frequency as a function of bin separation
# for the BED file given as the first command-line argument.
mat = dt.matFromBed(sys.argv[1])
n = len(mat)

totals = np.zeros(n - 1)
nonzero_counts = np.zeros_like(totals)
for row in range(n):
    for col in range(row):
        value = mat[row, col]
        if value == 0:
            continue
        slot = row - col - 1  # separation s is stored at index s - 1
        totals[slot] += value
        nonzero_counts[slot] += 1

# separations with no nonzero contacts keep an average of 0
avgs = np.zeros_like(totals)
observed = nonzero_counts != 0
avgs[observed] = totals[observed] / nonzero_counts[observed]

plt.plot(list(range(n - 1)), avgs)
plt.xlabel("Separation (number of bins)")
plt.ylabel("Average contact frequency")
plt.show()
import numpy as np
import sys
sys.path.append("..")
import data_tools as dt

# Usage: <script> IN_BED OUT_TSV
# Writes the contact matrix of IN_BED to OUT_TSV as tab-separated text.
source_path, target_path = sys.argv[1], sys.argv[2]
np.savetxt(target_path, dt.matFromBed(source_path), delimiter="\t")
import numpy as np all_species = ("Mouse", "Human", "Yeast") all_res_kb = (100, 100, 32) boxes = [[] for species in all_species] for i, (species, res_kb) in enumerate(zip(all_species, all_res_kb)): with open("{}_list.txt".format(species)) as infile: for line in infile: prefix = line.strip() for chrom in range(1, 23): path = "hic_data/{}_{}_{}kb.bed".format(prefix, chrom, res_kb) if os.path.isfile(path): mat = dt.matFromBed(path) oe_mat = ca.oe(mat) cor_mat = ca.cor(oe_mat) pca = PCA(n_components=1) pca.fit(cor_mat) boxes[i].append(pca.explained_variance_ratio_[0]) infile.close() #start with a frameless plot (extra room on the left) plt.subplot2grid((10,10), (0,0), 9, 10, frameon=False) #label axes plt.ylabel("PC1 explained variance ratio", fontsize=10) #define offsets
import numpy as np
import sys
sys.path.append("..")
import data_tools as dt

# Usage: <script> IN_BED OUT_TSV
# Writes the contact matrix preceded by two columns holding the start and end
# coordinate of each row's locus.
inpath, outpath = sys.argv[1], sys.argv[2]

structure = dt.structureFromBed(inpath, None, None)
contactMat = dt.matFromBed(inpath, structure)
n = len(contactMat)

fullMat = np.zeros((n, n + 2))

# locus IDs
for row, pointNum in enumerate(structure.getPointNums()):
    chrom = structure.chrom
    fullMat[row, 0] = chrom.minPos + chrom.res * pointNum
    fullMat[row, 1] = chrom.minPos + chrom.res * (pointNum + 1)

fullMat[:, 2:n + 2] = contactMat

# integer format whose width comes from the largest value in the table
width = int(np.ceil(np.log10(np.amax(fullMat))))
np.savetxt(outpath, fullMat, "%" + str(width) + "d", delimiter="\t")
import numpy as np
import sys
sys.path.append("..")
import data_tools as dt

# Usage: <script> IN_BED OUT_TSV
# Writes the contact matrix for the cluster built from IN_BED to OUT_TSV as
# tab-separated text.
source_path, target_path = sys.argv[1], sys.argv[2]
loaded = dt.clusterFromBed(source_path, None, None)
np.savetxt(target_path, dt.matFromBed(source_path, loaded), delimiter="\t")
with open("{}_design.txt".format(comparison)) as infile: for line in infile: prefix1, prefix2 = line.strip().split() for chrom in range(1, 23): path1 = "hic_data/{}_{}_100kb.bed".format(prefix1, chrom) path2 = "hic_data/{}_{}_100kb.bed".format(prefix2, chrom) if os.path.isfile(path1) and os.path.isfile(path2): #load structures structure1 = dt.structureFromBed(path1) structure2 = dt.structureFromBed(path2) dt.make_compatible((structure1, structure2)) mat1 = dt.matFromBed(path1, structure1) mat2 = dt.matFromBed(path2, structure2) comps1 = ca.get_compartments(mat1, structure1) comps2 = ca.get_compartments(mat2, structure2) r, p = st.pearsonr(comps1, comps2) boxes[i].append(np.abs(r)) infile.close() plt.subplot2grid((10, 10), (0, 0), 9, 10, frameon=False) #label axes plt.ylabel("Compartment correlation", fontsize=10)
from matplotlib import pyplot as plt import numpy as np import sys sys.path.append("..") import data_tools as dt import array_tools as at import misc bedpath = "hic_data/GM12878_combined_22_100kb.bed" mmds_cluster = dt.clusterFromFile( "hic_data/GM12878_combined_22_100kb_mmds_coords.tsv") contactMat = dt.matFromBed(bedpath, mmds_cluster) mmds_true_mat = at.contactToDist(contactMat) at.makeSymmetric(mmds_true_mat) for j in range(len(mmds_true_mat)): #remove diagonal mmds_true_mat[j, j] = 0 mmds_distMat = misc.distMat(mmds_cluster) mmds_r = misc.pearson(mmds_true_mat, mmds_distMat) cmds_cluster = dt.clusterFromFile( "hic_data/GM12878_combined_22_100kb_cmds_coords.tsv") contactMat = dt.matFromBed(bedpath, cmds_cluster) cmds_true_mat = at.contactToDist(contactMat) at.makeSymmetric(cmds_true_mat) for j in range(len(cmds_true_mat)): #remove diagonal cmds_true_mat[j, j] = 0 cmds_distMat = misc.distMat(cmds_cluster) cmds_r = misc.pearson(cmds_true_mat, cmds_distMat) minimds_cluster = dt.clusterFromFile(
def partitionedMDS(path, lowpath, args):
    """Partition a cluster into TAD-based subclusters and run MDS on each.

    A low-resolution cluster is read from ``lowpath`` and split into domains
    with ``tad.getDomains``. The domain boundaries are scaled by the
    resolution ratio to build the high-resolution cluster from ``path``.
    After the low-resolution cluster is inferred, each high-resolution
    subcluster is inferred on its own in a ``pymp`` parallel region and
    transformed using ``la.getTransformation`` against the matching
    low-resolution subcluster.

    Args:
        path: BED file used for the high-resolution cluster.
        lowpath: BED file used for the low-resolution cluster.
        args: indexable sequence read as ``args[0]`` = domain smoothing
            parameter, ``args[1]`` = minimum size fraction, ``args[3]`` =
            requested number of threads. ``args[2]`` (maxmemory) is not used
            here.

    Returns:
        The high-resolution cluster with its subclusters set.
    """
    domainSmoothingParameter = args[0]
    minSizeFraction = args[1]
    num_threads = args[3]

    #create low-res cluster
    lowCluster = dt.clusterFromBed(lowpath, None, None)

    #get TADs
    low_contactMat = dt.matFromBed(lowpath, lowCluster)
    lowTads = tad.getDomains(low_contactMat, lowCluster,
                             domainSmoothingParameter,
                             minSizeFraction)  #low subclusters

    #create high-res chrom
    size, res = dt.basicParamsFromBed(path)
    highChrom = dt.ChromParameters(lowCluster.chrom.minPos,
                                   lowCluster.chrom.maxPos, res,
                                   lowCluster.chrom.name, size)

    #create high-res cluster
    # NOTE(review): "/" is floor division for ints on Python 2 but true
    # division on Python 3 -- confirm which result the callees expect.
    resRatio = lowCluster.chrom.res / highChrom.res
    highTads = lowTads * resRatio
    highCluster = dt.clusterFromBed(path, highChrom, highTads)

    #create compatible subclusters
    tad.subclustersFromTads(highCluster, lowCluster, lowTads)

    infer_cluster(low_contactMat, lowCluster, False)
    # print() with a single argument behaves the same on Python 2 and 3
    print("Low-resolution MDS complete")

    highSubclusters = pymp.shared.list(highCluster.clusters)
    lowSubclusters = pymp.shared.list(lowCluster.clusters)

    numSubclusters = len(highCluster.clusters)
    #don't exceed number of requested threads, available threads, or clusters
    num_threads = min((num_threads, mp.cpu_count(), numSubclusters))
    with pymp.Parallel(num_threads) as p:
        for subclusternum in p.range(numSubclusters):
            highSubcluster = highSubclusters[subclusternum]
            trueLow = lowSubclusters[subclusternum]

            #perform MDS individually, on the contact matrix for this cluster only
            cluster_contactMat = dt.matFromBed(path, highSubcluster)
            infer_cluster(cluster_contactMat, highSubcluster, False)

            #approximate as low resolution
            inferredLow = dt.highToLow(highSubcluster, resRatio)

            #recover the transformation for inferred from true low cluster
            r, t, reflect = la.getTransformation(inferredLow, trueLow)
            t *= resRatio**(2. / 3)  #rescale

            #transform high cluster
            highSubcluster.transform(r, t, reflect)
            # write back so the shared list holds the transformed subcluster
            highSubclusters[subclusternum] = highSubcluster

            print("MDS performed on cluster {} of {}".format(
                subclusternum + 1, numSubclusters))

    highCluster.setClusters(highSubclusters)

    return highCluster
sys.path.append("/home/lur159/git/miniMDS") import data_tools as dt import numpy as np import tools path = sys.argv[1] res = int(sys.argv[2]) outpath = sys.argv[3] chrom = dt.chromFromBed(path) chrom.res = res chrom.minPos = int(np.floor(float(chrom.minPos) / res)) * res #round chrom.maxPos = int(np.ceil(float(chrom.maxPos) / res)) * res struct = dt.structureFromBed(path, chrom) mat = dt.matFromBed(path, struct) points = struct.getPoints() with open(outpath, "w") as out: for i in range(len(mat)): abs_index1 = points[i].absolute_index for j in range(i): if mat[i, j] != 0: abs_index2 = points[j].absolute_index out.write("\t".join( (chrom.name, str(chrom.getGenCoord(abs_index1)), str(chrom.getGenCoord(abs_index1) + res), chrom.name, str(chrom.getGenCoord(abs_index2)), str(chrom.getGenCoord(abs_index2) + res), str(mat[i, j]))))
import data_tools as dt import array_tools as at import os import numpy as np res = int(sys.argv[1]) res_kb = res / 1000 if os.path.isfile("A_compartment_{}kb.bed".format(res_kb)): os.system("rm A_compartment_{}kb.bed".format(res_kb)) for chrom in (1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22): path = "hic_data/GM12878_combined_{}_100kb.bed".format(chrom) structure = dt.structureFromBed(path) contacts = dt.matFromBed(path, structure) at.makeSymmetric(contacts) enrichments = np.array(np.loadtxt( "binding_data/Gm12878_{}_100kb_active_coverage.bed".format(chrom), dtype=object)[:, 6], dtype=float) bin_nums = structure.nonzero_abs_indices( ) + structure.chrom.minPos / structure.chrom.res enrichments = enrichments[bin_nums] compartments = np.array(ca.get_compartments(contacts, enrichments)) gen_coords = np.array(structure.getGenCoords()) a_gen_coords = gen_coords[np.where(compartments > 0)] with open("A_compartment_{}kb.bed".format(res_kb), "a") as out: for a_gen_coord in a_gen_coords: for i in range(100 / res_kb): out.write("\t".join(
from matplotlib import pyplot as plt import numpy as np import sys sys.path.append("..") import data_tools as dt import array_tools as at import misc #"true" distance matrix cluster = dt.clusterFromBed(bedpath, None, None) contactMat = dt.matFromBed(bedpath, cluster) distMat = at.contactToDist(contactMat) at.makeSymmetric(distMat) for j in range(len(distMat)): #remove diagonal distMat[j, j] = 0 chromthreed_distMat = misc.distsFromCoords( "Chromosome3D/output/chr22_100kb/chr22_100kb_coords.tsv") chromthreed_r = misc.pearson(distMat, chromthreed_distMat) mmds_distMat = dt.clusterFromFile( "hic_data/GM12878_combined_22_10kb_mmds_coords.tsv").distMat() mmds_r = misc.pearson(distMat, mmds_distMat) cmds_distMat = dt.clusterFromFile( "hic_data/GM12878_combined_22_10kb_cmds_coords.tsv").distMat() cmds_r = misc.pearson(distMat, cmds_distMat) minimds_distMat = dt.clusterFromFile( "hic_data/GM12878_combined_22_10kb_minimds_coords.tsv").distMat() minimds_r = misc.pearson(distMat, minimds_distMat)
chroms = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, "X") n = len(chroms) mmds_rs = np.zeros(n) cmds_rs = np.zeros(n) minimds_rs = np.zeros(n) mogen_rs = np.zeros(n) for i, chrom in enumerate(chroms): bedpath = "hic_data/GM12878_combined_{}_10kb.bed".format(chrom) mmds_structure = dt.structure_from_file( "hic_data/GM12878_combined_{}_10kb_mmds_coords.tsv".format(chrom)) contactMat = dt.matFromBed(bedpath, mmds_structure) mmds_true_mat = at.contactToDist(contactMat) at.makeSymmetric(mmds_true_mat) for j in range(len(mmds_true_mat)): #remove diagonal mmds_true_mat[j, j] = 0 mmds_distMat = misc.distMat(mmds_structure) mmds_rs[i] = misc.pearson(mmds_true_mat, mmds_distMat) cmds_structure = dt.structure_from_file( "hic_data/GM12878_combined_{}_10kb_cmds_coords.tsv".format(chrom)) contactMat = dt.matFromBed(bedpath, cmds_structure) cmds_true_mat = at.contactToDist(contactMat) at.makeSymmetric(cmds_true_mat) for j in range(len(cmds_true_mat)): #remove diagonal cmds_true_mat[j, j] = 0 cmds_distMat = misc.distMat(cmds_structure)
def partitionedMDS(path, args):
    """Partition a structure into TAD-based substructures and run MDS on each.

    A low-resolution structure is built from ``path`` by multiplying the
    chromosome resolution by ``res_ratio``, split into domains with
    ``tad.getDomains``, and inferred. For every domain a high-resolution
    substructure is read from the same file, inferred on its own in a
    ``pymp`` parallel region, and transformed using ``la.getTransformation``
    against the matching low-resolution substructure. Empty substructures
    are left untouched.

    Args:
        path: BED file used for both resolutions.
        args: indexable sequence read as ``args[0]`` = domain smoothing
            parameter, ``args[1]`` = minimum size fraction, ``args[3]`` =
            requested number of threads, ``args[4]`` = alpha for the
            low-resolution inference, ``args[5]`` = resolution ratio,
            ``args[6]`` = alpha for the per-substructure inference.
            ``args[2]`` (maxmemory) is not used here.

    Returns:
        The high-resolution structure with its substructures set.
    """
    domainSmoothingParameter = args[0]
    minSizeFraction = args[1]
    num_threads = args[3]
    alpha = args[4]
    res_ratio = args[5]
    alpha2 = args[6]

    #create low-res structure
    low_chrom = dt.chromFromBed(path)
    low_chrom.res *= res_ratio
    lowstructure = dt.structureFromBed(path, low_chrom)  #low global structure

    #get TADs
    low_contactMat = dt.matFromBed(path, lowstructure)
    #low substructures, defined on relative indices not absolute indices
    low_tad_indices = tad.getDomains(low_contactMat, lowstructure,
                                     domainSmoothingParameter, minSizeFraction)
    tad.substructuresFromTads(lowstructure, low_tad_indices)

    #create high-res chrom
    size, res = dt.basicParamsFromBed(path)
    highChrom = dt.ChromParameters(lowstructure.chrom.minPos,
                                   lowstructure.chrom.maxPos, res,
                                   lowstructure.chrom.name, size)

    highstructure = dt.Structure([], [], highChrom, 0)
    high_substructures = []

    low_gen_coords = lowstructure.getGenCoords()
    offset = 0  #initialize
    for td in low_tad_indices:
        start_gen_coord = low_gen_coords[td[0]]
        end_gen_coord = low_gen_coords[td[1]]
        high_substructure = dt.structureFromBed(path, highChrom,
                                                start_gen_coord, end_gen_coord,
                                                offset)
        high_substructures.append(high_substructure)
        # advance by one less than the number of points just read
        offset += len(high_substructure.points) - 1

    highstructure.setstructures(high_substructures)

    infer_structure(low_contactMat, lowstructure, alpha, num_threads)
    # print() with a single argument behaves the same on Python 2 and 3
    print("Low-resolution MDS complete")

    highSubstructures = pymp.shared.list(highstructure.structures)
    lowSubstructures = pymp.shared.list(lowstructure.structures)

    numSubstructures = len(highstructure.structures)
    #don't exceed number of requested threads, available threads, or structures
    num_threads = min((num_threads, mp.cpu_count(), numSubstructures))
    with pymp.Parallel(num_threads) as p:
        for substructurenum in p.range(numSubstructures):
            highSubstructure = highSubstructures[substructurenum]
            if len(highSubstructure.getPoints()) > 0:  #skip empty
                trueLow = lowSubstructures[substructurenum]

                #perform MDS individually, on the contact matrix for this structure only
                structure_contactMat = dt.matFromBed(path, highSubstructure)
                infer_structure(structure_contactMat, highSubstructure, alpha2,
                                num_threads)

                #approximate as low resolution
                inferredLow = dt.highToLow(highSubstructure, res_ratio)

                #rescale so both low-resolution versions share a radius of gyration
                scaling_factor = la.radius_of_gyration(
                    trueLow) / la.radius_of_gyration(inferredLow)
                for i, point in enumerate(inferredLow.points):
                    if point != 0:  # entries equal to 0 are skipped
                        x, y, z = point.pos
                        inferredLow.points[i].pos = (x * scaling_factor,
                                                     y * scaling_factor,
                                                     z * scaling_factor)

                #recover the transformation for inferred from true low structure
                r, t = la.getTransformation(inferredLow, trueLow)
                t /= scaling_factor

                #transform high structure
                highSubstructure.transform(r, t)
                # write back so the shared list holds the transformed substructure
                highSubstructures[substructurenum] = highSubstructure

                print("MDS performed on structure {} of {}".format(
                    substructurenum + 1, numSubstructures))

    highstructure.setstructures(highSubstructures)

    return highstructure
def fullMDS(path, classical, alpha):
    """Infer a single cluster from a BED file with one unpartitioned MDS run.

    Returns:
        The cluster object created from ``path``.
    """
    result = dt.clusterFromBed(path, None, None)
    infer_cluster(dt.matFromBed(path, result), result, alpha, classical)
    return result