def fullMDS(path, classical, alpha, num_threads, weight):
    """MDS without partitioning.

    Builds a structure and a contact matrix from the BED file at ``path``,
    hands both to ``infer_structure`` and returns the structure.

    ``classical``, ``alpha``, ``num_threads`` and ``weight`` are forwarded
    unchanged to ``infer_structure``.
    """
    chrom_size = dt.size_from_bed(path)
    whole_structure = dt.structureFromBed(path, chrom_size)
    contacts = dt.matFromBed(path, chrom_size, whole_structure)
    # The return value of infer_structure is not used; presumably it updates
    # the structure in place -- confirm against infer_structure.
    infer_structure(contacts, whole_structure, alpha, num_threads, weight,
                    classical)
    return whole_structure
from multimds import compartment_analysis as ca
from multimds import data_tools as dt
from scipy import stats as st
from matplotlib import pyplot as plt
import numpy as np
from multimds import linear_algebra as la
from scipy import signal as sg
from multimds import multimds as mm

path1 = "hic_data/GM12878_combined_19_100kb.bed"
path2 = "hic_data/K562_19_100kb.bed"

# Build one structure per input file.
struct1, struct2 = mm.full_mds(path1, path2, prefix="test_")

# Read each contact matrix from the same file its structure was built from.
# The previous code rebuilt these paths from `chrom` and `res_kb`, neither of
# which is defined in this script, so it raised NameError.
mat1 = dt.matFromBed(path1, struct1)
# np.array makes the negation and subtraction below valid even if
# get_compartments returns a plain sequence.
comps1 = np.array(ca.get_compartments(mat1, struct1))
mat2 = dt.matFromBed(path2, struct2)
comps2 = np.array(ca.get_compartments(mat2, struct2))

# Flip the sign of one set of scores when the two are anti-correlated.
r, p = st.pearsonr(comps1, comps2)
if r < 0:
    comps1 = -comps1

# Per-point absolute difference in compartment score.
comp_diffs = np.abs(comps1 - comps2)

# Per-point distance between corresponding coordinates of the two structures.
dists = np.array([
    la.calcDistance(coord1, coord2)
    for coord1, coord2 in zip(struct1.getCoords(), struct2.getCoords())
])
from multimds import data_tools as dt
from multimds import compartment_analysis as ca
import numpy as np
from sklearn import svm
from multimds import linear_algebra as la
from mayavi import mlab

# Load a previously saved structure from its TSV file.
struct = dt.structure_from_file(
    "hic_data/GM12878_combined_21_100kb_structure.tsv")

# Index corresponding to genomic position 15,000,000, then subsample the
# structure between that index and len(points) - 3.
# NOTE(review): the exact inclusive/exclusive semantics of subsamplePoints
# are not visible here -- confirm.
new_start = struct.chrom.getAbsoluteIndex(15000000)
struct.subsamplePoints(new_start, len(struct.points) - 3)

#compartments
contacts = dt.matFromBed("hic_data/GM12878_combined_21_100kb.bed", struct)
compartments = np.array(ca.get_compartments(contacts, struct))

#SVR: fit a linear support vector regression of compartment score on the
#structure coordinates
coords = struct.getCoords()
clf = svm.LinearSVR()
clf.fit(coords, compartments)
coef = clf.coef_

# Re-express the coordinates using the fitted coefficients.
# NOTE(review): presumably aligns an axis with the coefficient vector --
# confirm against la.change_coordinate_system.
transformed_coords = np.array(la.change_coordinate_system(coef, coords))

# Extents of the first column of the transformed coordinates.
xs = transformed_coords[:, 0]
min_x = min(xs)
max_x = max(xs)
x_range = max_x - min_x

# Second column of the transformed coordinates.
ys = transformed_coords[:, 1]
min_y = min(ys)
# For each chromosome, build structures for both cell types, align them and
# compute their compartment scores.
for chrom in chroms:
    path1 = "hic_data/{}_{}_{}kb.bed".format(cell_type1, chrom, res_kb)
    path2 = "hic_data/{}_{}_{}kb.bed".format(cell_type2, chrom, res_kb)

    # Only process chromosomes for which both input files exist.
    if os.path.isfile(path1) and os.path.isfile(path2):
        structure1, structure2 = multimds.full_mds(path1, path2,
                                                   penalty=penalty)

        structure1.rescale()
        structure2.rescale()

        # Compute a transformation between the two structures and apply it to
        # the first one.
        r, t = la.getTransformation(structure1, structure2)
        structure1.transform(r, t)

        #compartments
        contacts1 = dt.matFromBed(path1, structure=structure1)
        contacts2 = dt.matFromBed(path2, structure=structure2)

        compartments1 = np.array(
            ca.get_compartments(contacts1, structure1))
        compartments2 = np.array(
            ca.get_compartments(contacts2, structure2))

        # Note: `r` is reused here and no longer holds the transformation.
        # Flip the second set of scores when the two are anti-correlated.
        r, p = st.pearsonr(compartments1, compartments2)
        if r < 0:
            compartments2 = -compartments2

        #SVR: stack the coordinates of both structures into one array
        coords1 = structure1.getCoords()
        coords2 = structure2.getCoords()
        coords = np.concatenate((coords1, coords2))
#compartments chrom1 = dt.chromFromBed(path1) chrom2 = dt.chromFromBed(path2) chrom1.res = 100000 #reduce res to reduce RAM usage in compartment calculation chrom2.res = 100000 chrom1.minPos = int(np.floor( float(chrom1.minPos) / chrom1.res)) * chrom1.res #round chrom1.maxPos = int(np.ceil(float(chrom1.maxPos) / chrom1.res)) * chrom1.res chrom2.minPos = int(np.floor( float(chrom2.minPos) / chrom2.res)) * chrom2.res #round chrom2.maxPos = int(np.ceil(float(chrom2.maxPos) / chrom2.res)) * chrom2.res low_struct1 = dt.structureFromBed(path1, chrom1) low_struct2 = dt.structureFromBed(path2, chrom2) dt.make_compatible((low_struct1, low_struct2)) contacts1 = dt.matFromBed(path1, low_struct1) contacts2 = dt.matFromBed(path2, low_struct2) enrichments = np.loadtxt("binding_data/{}_{}_100kb_active_coverage.bed".format( format_celltype(cell_type1), chrom), usecols=6) bin_nums = low_struct1.nonzero_bins_whole_chrom() enrichments = enrichments[bin_nums] compartments1 = np.array( ca.get_compartments(contacts1, low_struct1, enrichments)) enrichments = np.loadtxt("binding_data/{}_{}_100kb_active_coverage.bed".format( format_celltype(cell_type2), chrom), usecols=6) bin_nums = low_struct2.nonzero_bins_whole_chrom() enrichments = enrichments[bin_nums]
from multimds import data_tools as dt
import numpy as np
from multimds import tools

# Command line: <input bed> <resolution> <output path>
# NOTE(review): `sys` is used below but its import is not visible in this
# snippet -- confirm it is imported above.
path = sys.argv[1]
res = int(sys.argv[2])
outpath = sys.argv[3]

size = dt.size_from_bed(path)
chrom = dt.chromFromBed(path)
chrom.res = res
# Snap the bounds outward to multiples of the requested resolution.
chrom.minPos = int(np.floor(float(chrom.minPos) / res)) * res  #round
chrom.maxPos = int(np.ceil(float(chrom.maxPos) / res)) * res

struct = dt.structureFromBed(path, size, chrom)
mat = dt.matFromBed(path, size, struct)
points = struct.getPoints()

with open(outpath, "w") as out:
    for i in range(len(mat)):
        abs_index1 = points[i].absolute_index
        # Strictly lower triangle only: each pair of bins is visited once.
        for j in range(i):
            if mat[i, j] == 0:
                continue
            abs_index2 = points[j].absolute_index
            fields = (
                chrom.name,
                str(chrom.getGenCoord(abs_index1)),
                str(chrom.getGenCoord(abs_index1) + res),
                chrom.name,
                str(chrom.getGenCoord(abs_index2)),
                str(chrom.getGenCoord(abs_index2) + res),
                str(mat[i, j]),
            )
            # NOTE(review): no line terminator is written here; confirm that
            # code following this snippet emits the newline.
            out.write("\t".join(fields))
import numpy as np

all_species = ("Mouse", "Human", "Yeast")
all_res_kb = (100, 100, 32)

# One list of PC1 explained-variance ratios per species.
boxes = [[] for _ in all_species]

for i, (species, res_kb) in enumerate(zip(all_species, all_res_kb)):
    # Each line of "<species>_list.txt" is a file-name prefix.
    # The with-statement closes the file, so no explicit close() is needed.
    with open("{}_list.txt".format(species)) as infile:
        for line in infile:
            prefix = line.strip()
            for chrom in range(1, 23):
                path = "hic_data/{}_{}_{}kb.bed".format(prefix, chrom, res_kb)
                # Chromosomes without an input file are silently skipped.
                if os.path.isfile(path):
                    mat = dt.matFromBed(path)
                    oe_mat = ca.oe(mat)
                    cor_mat = ca.cor(oe_mat)
                    # Fraction of variance captured by the first component.
                    pca = PCA(n_components=1)
                    pca.fit(cor_mat)
                    boxes[i].append(pca.explained_variance_ratio_[0])

#start with a frameless plot (extra room on the left)
plt.subplot2grid((10, 10), (0, 0), 9, 10, frameon=False)

#label axes
plt.ylabel("PC1 explained variance ratio", fontsize=10)

#define offsets
all_res_kb = (100, 100, 32)

# One list of SVR scores per species.
boxes = [[] for _ in all_species]

for i, (species, res_kb) in enumerate(zip(all_species, all_res_kb)):
    # Each line of "<species>_list.txt" is a file-name prefix.
    # The with-statement closes the file, so no explicit close() is needed.
    with open("{}_list.txt".format(species)) as infile:
        for line in infile:
            prefix = line.strip()
            for chrom in range(1, 23):
                path = "hic_data/{}_{}_{}kb.bed".format(prefix, chrom, res_kb)
                # Chromosomes without an input file are silently skipped.
                if os.path.isfile(path):
                    # Run minimds on the file, then load the structure file
                    # read on the next line.
                    # NOTE(review): the exit status of the command is not
                    # checked and the path is interpolated into a shell
                    # string unquoted.
                    os.system("python ../minimds.py {}".format(path))
                    structure = dt.structure_from_file(
                        "hic_data/{}_{}_{}kb_structure.tsv".format(
                            prefix, chrom, res_kb))

                    mat = dt.matFromBed(path, structure)
                    comps = ca.get_compartments(mat, structure)

                    # Fit compartment score against coordinates and record
                    # the score on the same data.
                    coords = structure.getCoords()
                    clf = svm.LinearSVR()
                    clf.fit(coords, comps)
                    boxes[i].append(clf.score(coords, comps))

plt.subplot2grid((10, 10), (0, 0), 9, 10, frameon=False)

#label axes
plt.ylabel("SVR R^2", fontsize=10)

#define offsets
ys = boxes
def partitionedMDS(path, args):
    """Partition the structure into substructures and perform MDS on each.

    A low-resolution structure is built from the BED file at ``path`` and
    split into domains. A high-resolution substructure is created for each
    domain, MDS is run on the low-resolution structure and then on every
    non-empty high-resolution substructure, and each substructure is
    transformed using the transformation recovered against its
    low-resolution counterpart.

    ``args`` is indexed positionally: domain smoothing parameter, minimum
    size fraction, max memory (read but unused here), number of threads,
    alpha for the low-resolution MDS, resolution ratio, alpha for the
    high-resolution MDS, and weight.

    Returns the high-resolution structure.
    """
    domainSmoothingParameter, minSizeFraction, maxmemory = (
        args[0], args[1], args[2])
    num_threads, alpha, res_ratio = args[3], args[4], args[5]
    alpha2, weight = args[6], args[7]

    # Low-resolution global structure: same file, resolution scaled up.
    coarse_chrom = dt.chromFromBed(path)
    coarse_chrom.res *= res_ratio
    low_structure = dt.structureFromBed(path, coarse_chrom)

    # TADs of the low-resolution structure, defined on relative indices rather
    # than absolute indices, become its substructures.
    low_contacts = dt.matFromBed(path, low_structure)
    low_tads = tad.getDomains(low_contacts, low_structure,
                              domainSmoothingParameter, minSizeFraction)
    tad.substructuresFromTads(low_structure, low_tads)

    # High-resolution chromosome: low-res bounds and name, file resolution.
    size, res = dt.basicParamsFromBed(path)
    high_chrom = dt.ChromParameters(low_structure.chrom.minPos,
                                    low_structure.chrom.maxPos, res,
                                    low_structure.chrom.name, size)
    high_structure = dt.Structure([], [], high_chrom, 0)

    # One high-resolution substructure per low-resolution TAD.
    pending_substructures = []
    low_gen_coords = low_structure.getGenCoords()
    last_tad_index = len(low_tads) - 1
    offset = 0
    for tad_index, low_tad in enumerate(low_tads):
        start_gen_coord = low_gen_coords[low_tad[0]]
        # For the last TAD use the chromosome end to avoid rounding error.
        end_gen_coord = (high_structure.chrom.maxPos
                         if tad_index == last_tad_index
                         else low_gen_coords[low_tad[1]])
        substructure = dt.structureFromBed(path, high_chrom, start_gen_coord,
                                           end_gen_coord, offset)
        pending_substructures.append(substructure)
        # Advance by the number of points, minus one.
        offset += len(substructure.points) - 1

    high_structure.setstructures(pending_substructures)

    infer_structure(low_contacts, low_structure, alpha, num_threads, weight)
    print("Low-resolution MDS complete")

    shared_high = pymp.shared.list(high_structure.structures)
    shared_low = pymp.shared.list(low_structure.structures)

    total = len(high_structure.structures)
    # Don't exceed number of requested threads, available threads, or
    # structures.
    num_threads = min(num_threads, mp.cpu_count(), total)
    with pymp.Parallel(num_threads) as p:
        for index in p.range(total):
            high_sub = shared_high[index]
            if len(high_sub.getPoints()) > 0:  # skip empty
                true_low = shared_low[index]

                # MDS on this substructure alone, using only its own contacts.
                sub_contacts = dt.matFromBed(path, high_sub)
                infer_structure(sub_contacts, high_sub, alpha2, num_threads,
                                weight)

                # Approximate the result at low resolution.
                inferred_low = dt.highToLow(high_sub, res_ratio)

                # Rescale the approximation to the true radius of gyration.
                scaling_factor = (la.radius_of_gyration(true_low)
                                  / la.radius_of_gyration(inferred_low))
                for k, point in enumerate(inferred_low.points):
                    if point != 0:  # entries equal to 0 are skipped
                        x, y, z = point.pos
                        inferred_low.points[k].pos = (x * scaling_factor,
                                                      y * scaling_factor,
                                                      z * scaling_factor)

                # Recover the transformation from the inferred to the true
                # low-resolution structure; t is divided by the scaling factor
                # before being applied to the high-resolution structure.
                r, t = la.getTransformation(inferred_low, true_low)
                t /= scaling_factor

                high_sub.transform(r, t)
                # Write the transformed substructure back to the shared list.
                shared_high[index] = high_sub

                print("MDS performed on structure {} of {}".format(
                    index + 1, total))

    high_structure.setstructures(shared_high)
    high_structure.set_rel_indices()

    return high_structure
import numpy as np

res = int(sys.argv[1])
res_kb = int(res / 1000)

# Chromosomes 9 and 22 are absent from this list in the original script.
for chrom in (1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
              20, 21):
    path1 = "hic_data/GM12878_combined_{}_100kb.bed".format(chrom)
    structure1 = dt.structureFromBed(path1)

    path2 = "hic_data/K562_{}_100kb.bed".format(chrom)
    structure2 = dt.structureFromBed(path2)

    dt.make_compatible((structure1, structure2))

    contacts = dt.matFromBed(path1, structure1)

    # Column 6 of the coverage file, read as text and converted to float.
    enrichments = np.array(np.loadtxt(
        "binding_data/GM12878_{}_100kb_active_coverage.bed".format(chrom),
        dtype=object)[:, 6], dtype=float)
    # Shift the structure's indices by the bin number of minPos. Floor
    # division keeps the bin numbers integral so they can be used as array
    # indices; `/` yields floats on Python 3, which numpy rejects as indices.
    bin_nums = structure1.nonzero_abs_indices(
    ) + structure1.chrom.minPos // structure1.chrom.res
    enrichments = enrichments[bin_nums]
    compartments1 = np.array(
        ca.get_compartments(contacts, structure1, enrichments))

    # Same inputs for the second cell type (`contacts` and `enrichments` are
    # reused).
    contacts = dt.matFromBed(path2, structure2)
    enrichments = np.array(np.loadtxt(
        "binding_data/K562_{}_100kb_active_coverage.bed".format(chrom),
        dtype=object)[:, 6], dtype=float)