示例#1
0
def full_mds(path1,
             path2,
             alpha=4,
             penalty=0.05,
             num_threads=3,
             weight=0.05,
             prefix=""):
    """Jointly infer two structures by MDS without partitioning.

    Loads a structure and a contact matrix from each BED file, makes the two
    structures compatible, and runs ``infer_structures`` on the pair. Three
    files are then written, named from ``prefix`` and the extension-less
    basenames of the inputs:

    * ``<prefix><name1>_structure.tsv``
    * ``<prefix><name2>_structure.tsv``
    * ``<prefix><name1>_<name2>_relocalization.bed`` -- one tab-separated
      line per locus: chromosome name, bin start, bin end, and the value
      returned for that locus by ``la.calculate_distances``.

    The result of ``calculate_compartment_fraction`` is printed as well.

    Args:
        path1: BED file of the first dataset.
        path2: BED file of the second dataset.
        alpha, penalty, num_threads, weight: forwarded unchanged to
            ``infer_structures``.
        prefix: string prepended to every output file name.

    Returns:
        Tuple ``(structure1, structure2)`` of the inferred structures.
    """
    structure1 = dt.structureFromBed(path1)
    structure2 = dt.structureFromBed(path2)
    dt.make_compatible((structure1, structure2))
    contactMat1 = dt.matFromBed(path1, structure1)
    contactMat2 = dt.matFromBed(path2, structure2)
    infer_structures(contactMat1, structure1, contactMat2, structure2, alpha,
                     penalty, num_threads, weight)

    # output names are derived from the input file names without extension
    name1 = os.path.splitext(os.path.basename(path1))[0]
    name2 = os.path.splitext(os.path.basename(path2))[0]
    structure1.write("{}{}_structure.tsv".format(prefix, name1))
    structure2.write("{}{}_structure.tsv".format(prefix, name2))

    dists = la.calculate_distances(structure1, structure2)
    chrom = structure1.chrom
    # the context manager closes the file; no explicit close() is needed
    with open("{}{}_{}_relocalization.bed".format(prefix, name1, name2),
              "w") as out:
        for gen_coord, dist in zip(structure1.getGenCoords(), dists):
            fields = (chrom.name, str(gen_coord), str(gen_coord + chrom.res),
                      str(dist))
            out.write("\t".join(fields))
            out.write("\n")

    print("Fractional compartment change: ")
    print(calculate_compartment_fraction(structure1, structure2, path1, path2))

    return structure1, structure2
示例#2
0
def fullMDS(path1, path2, alpha, penalty, num_threads, weight):
    """Jointly infer two unpartitioned structures by MDS.

    Each BED file is loaded as a structure, the pair is made compatible,
    a contact matrix is built for each, and ``infer_structures`` is run on
    both at once. ``alpha``, ``penalty``, ``num_threads`` and ``weight`` are
    forwarded to ``infer_structures`` unchanged.

    Returns the two structures as a tuple, in input order.
    """
    first = dt.structureFromBed(path1)
    second = dt.structureFromBed(path2)
    dt.make_compatible((first, second))

    # contact matrices are built only after the structures are compatible
    first_contacts = dt.matFromBed(path1, first)
    second_contacts = dt.matFromBed(path2, second)

    infer_structures(first_contacts, first, second_contacts, second,
                     alpha, penalty, num_threads, weight)
    return first, second
示例#3
0
def fullMDS(path, classical, alpha, num_threads, weight):
    """Infer a single unpartitioned structure by MDS.

    Loads the structure and its contact matrix from the BED file at ``path``
    and hands both to ``infer_structure`` together with ``alpha``,
    ``num_threads``, ``weight`` and ``classical`` (in that order).

    Returns the structure that was passed to ``infer_structure``.
    """
    inferred = dt.structureFromBed(path)
    contacts = dt.matFromBed(path, inferred)
    infer_structure(contacts, inferred, alpha, num_threads, weight, classical)
    return inferred
示例#4
0
def create_low_res_structure(path, res_ratio):
    """Load the BED file at ``path`` as a structure at a coarser resolution.

    The chromosome's resolution is multiplied by ``res_ratio``; its lower
    bound is then rounded down, and its upper bound rounded up, to a multiple
    of the new resolution before the structure is built.
    """
    chrom = dt.chromFromBed(path)
    chrom.res *= res_ratio

    # snap the bounds outward onto the coarse bin grid
    lower_bin = int(np.floor(float(chrom.minPos) / chrom.res))
    chrom.minPos = lower_bin * chrom.res
    upper_bin = int(np.ceil(float(chrom.maxPos) / chrom.res))
    chrom.maxPos = upper_bin * chrom.res

    return dt.structureFromBed(path, chrom)
示例#5
0
def interMDS(names, prefix, inter_res, intra_res, full, args):
    """Infer high-resolution structures of several chromosomes in one frame.

    A low-resolution structure is built for every chromosome and all of them
    are inferred together from the interchromosomal matrix. Each chromosome
    is then inferred on its own at high resolution, rotated onto its
    low-resolution counterpart, and finally translated into place.

    Args:
        names: chromosome names; input files are
            ``<prefix>_<name>_<intra resolution string>.bed``.
        prefix: common prefix of the input files.
        inter_res: resolution used for the low-resolution structures.
        intra_res: resolution of the intrachromosomal input files.
        full: if true use ``mm.fullMDS`` for the high-resolution step,
            otherwise ``mm.partitionedMDS``.
        args: indexable parameters. ``args[3]`` and ``args[4]`` are read here;
            the whole sequence is passed to ``mm.partitionedMDS``.
            NOTE(review): in ``partitionedMDS`` these are num_threads and
            alpha -- confirm the same layout is intended here.

    Returns:
        List of high-resolution structures, one per entry of ``names``.
    """
    inter_res_string = tools.get_res_string(inter_res)
    intra_res_string = tools.get_res_string(intra_res)

    # get low-res structures from intra files
    low_structures = []
    for name in names:
        path = "{}_{}_{}.bed".format(prefix, name, intra_res_string)
        chrom = dt.chromFromBed(path)
        # reduce resolution and snap the bounds outward onto the coarse grid
        chrom.res = inter_res
        chrom.minPos = int(np.floor(float(chrom.minPos)/chrom.res)) * chrom.res
        chrom.maxPos = int(np.ceil(float(chrom.maxPos)/chrom.res)) * chrom.res
        low_structures.append(dt.structureFromBed(path, chrom))

    # offsets[i] = number of points in all structures before structure i,
    # used for indexing into the combined matrix
    n = len(names)
    offsets = np.zeros(n, dtype=int)
    for i in range(1, n):
        offsets[i] = offsets[i-1] + len(low_structures[i-1].getPoints())

    inter_mat = get_inter_mat(prefix, inter_res_string, intra_res_string,
                              low_structures, offsets)

    # perform MDS at low resolution on all chroms
    infer_structures(inter_mat, low_structures, offsets, args[3], args[4])

    # perform MDS at high resolution on each chrom
    high_structures = []
    ts = []
    for true_low, name in zip(low_structures, names):
        path = "{}_{}_{}.bed".format(prefix, name, intra_res_string)
        if full:
            high_structure = mm.fullMDS(path, False, args[4], args[3])
        else:
            high_structure = mm.partitionedMDS(path, args)
        high_structures.append(high_structure)
        inferred_low = dt.highToLow(
            high_structure, true_low.chrom.res/high_structure.chrom.res)

        # rescale the down-sampled copy to the size of the true low structure
        rescaling_factor = (la.radius_of_gyration(true_low) /
                            la.radius_of_gyration(inferred_low))
        rescaled_coords = [rescaling_factor * coord
                           for coord in inferred_low.getCoords()]
        for i, point in enumerate(inferred_low.getPoints()):
            point.pos = rescaled_coords[i]

        r, t = la.getTransformation(inferred_low, true_low)
        # rotate only; the translation is applied later, once it is rescaled
        high_structure.transform(r, None)
        ts.append(t)

    # translate (with rescaling)
    low_rgs = np.array([la.radius_of_gyration(structure)
                        for structure in low_structures])
    high_rgs = np.array([la.radius_of_gyration(structure)
                         for structure in high_structures])
    scaling_factor = np.mean(high_rgs/low_rgs)
    for high_structure, t in zip(high_structures, ts):
        high_structure.transform(None, scaling_factor*t)  # rescale translation

    return high_structures
示例#6
0
def partitionedMDS(path, args):
    """Partition a chromosome into domains and infer its structure by MDS.

    A low-resolution structure of the whole chromosome is inferred first.
    Every domain returned by ``tad.getDomains`` is then inferred separately at
    the file's own resolution, compared with its low-resolution counterpart
    through a rescaled down-sampled copy, and moved into place with the
    transformation returned by ``la.getTransformation``.

    Args:
        path: intrachromosomal BED file.
        args: indexable sequence laid out as
            [0] domain smoothing parameter (passed to ``tad.getDomains``),
            [1] minimum domain size fraction (passed to ``tad.getDomains``),
            [2] max memory (not read by this function),
            [3] number of threads requested,
            [4] alpha for the low-resolution MDS,
            [5] ratio of low resolution to high resolution,
            [6] alpha for the per-domain high-resolution MDS.

    Returns:
        The high-resolution structure whose substructures are the inferred
        domains.
    """
    domainSmoothingParameter = args[0]
    minSizeFraction = args[1]
    num_threads = args[3]
    alpha = args[4]
    res_ratio = args[5]
    alpha2 = args[6]

    #create low-res structure
    low_chrom = dt.chromFromBed(path)
    low_chrom.res *= res_ratio
    lowstructure = dt.structureFromBed(path, low_chrom)  #low global structure

    #get TADs
    low_contactMat = dt.matFromBed(path, lowstructure)
    low_tad_indices = tad.getDomains(
        low_contactMat, lowstructure, domainSmoothingParameter, minSizeFraction
    )  #low substructures, defined on relative indices not absolute indices
    tad.substructuresFromTads(lowstructure, low_tad_indices)

    #create high-res chrom
    size, res = dt.basicParamsFromBed(path)
    highChrom = dt.ChromParameters(lowstructure.chrom.minPos,
                                   lowstructure.chrom.maxPos, res,
                                   lowstructure.chrom.name, size)

    highstructure = dt.Structure([], [], highChrom, 0)
    high_substructures = []

    low_gen_coords = lowstructure.getGenCoords()
    offset = 0  #initialize
    for td in low_tad_indices:
        start_gen_coord = low_gen_coords[td[0]]
        end_gen_coord = low_gen_coords[td[1]]
        high_substructure = dt.structureFromBed(path, highChrom,
                                                start_gen_coord, end_gen_coord,
                                                offset)
        high_substructures.append(high_substructure)
        #advance by one less than the point count
        #NOTE(review): presumably adjacent domains share a boundary point
        offset += len(high_substructure.points) - 1

    highstructure.setstructures(high_substructures)

    infer_structure(low_contactMat, lowstructure, alpha, num_threads)
    #parenthesised single-argument print is valid on Python 2 and Python 3
    print("Low-resolution MDS complete")

    highSubstructures = pymp.shared.list(highstructure.structures)
    lowSubstructures = pymp.shared.list(lowstructure.structures)

    numSubstructures = len(highstructure.structures)
    num_threads = min(
        (num_threads, mp.cpu_count(), numSubstructures)
    )  #don't exceed number of requested threads, available threads, or structures
    with pymp.Parallel(num_threads) as p:
        for substructurenum in p.range(numSubstructures):
            highSubstructure = highSubstructures[substructurenum]
            if len(highSubstructure.getPoints()) > 0:  #skip empty
                trueLow = lowSubstructures[substructurenum]

                #perform MDS individually
                structure_contactMat = dt.matFromBed(
                    path,
                    highSubstructure)  #contact matrix for this structure only
                infer_structure(structure_contactMat, highSubstructure, alpha2,
                                num_threads)

                #approximate as low resolution
                inferredLow = dt.highToLow(highSubstructure, res_ratio)

                #rescale
                scaling_factor = la.radius_of_gyration(
                    trueLow) / la.radius_of_gyration(inferredLow)
                for i, point in enumerate(inferredLow.points):
                    #entries equal to 0 are skipped (presumably placeholders
                    #for loci without data -- confirm)
                    if point != 0:
                        x, y, z = point.pos
                        inferredLow.points[i].pos = (x * scaling_factor,
                                                     y * scaling_factor,
                                                     z * scaling_factor)

                #recover the transformation for inferred from true low structure
                r, t = la.getTransformation(inferredLow, trueLow)
                #the translation was found at the rescaled size; undo the scale
                t /= scaling_factor

                #transform high structure
                highSubstructure.transform(r, t)
                #write back so the result is stored in the shared list
                highSubstructures[substructurenum] = highSubstructure

                print("MDS performed on structure {} of {}".format(
                    substructurenum + 1, numSubstructures))

    highstructure.setstructures(highSubstructures)

    return highstructure
示例#7
0
import data_tools as dt
import numpy as np
import sys
import linear_algebra as la
from sklearn.manifold import MDS

# Script: embed two Hi-C datasets of one chromosome with scikit-learn MDS.
# Usage: <script> <chrom>   (the chromosome label is spliced into the paths)
chrom = sys.argv[1]
res_kb = 100  # resolution in kb, used only to build the input file names
prefix1 = "GM12878_combined"
prefix2 = "K562"

# input BED files: hic_data/<prefix>_<chrom>_<res_kb>kb.bed
path1 = "hic_data/{}_{}_{}kb.bed".format(prefix1, chrom, res_kb)
path2 = "hic_data/{}_{}_{}kb.bed".format(prefix2, chrom, res_kb)

structure1 = dt.structureFromBed(path1, None, None)
structure2 = dt.structureFromBed(path2, None, None)

# make the two structures compatible before building distance matrices
dt.make_compatible((structure1, structure2))

# distance matrix for each dataset, built against its (compatible) structure
dists1 = dt.normalized_dist_mat(path1, structure1)
dists2 = dt.normalized_dist_mat(path2, structure2)

# 3-D MDS on the precomputed distances. RandomState() is unseeded, so the
# embedding is not reproducible between runs; n_jobs=-1 uses all processors.
coords1 = MDS(n_components=3,
              random_state=np.random.RandomState(),
              dissimilarity="precomputed",
              n_jobs=-1).fit_transform(dists1)
coords2 = MDS(n_components=3,
              random_state=np.random.RandomState(),
示例#8
0
import sys
sys.path.append("..")
import data_tools as dt

# Script: print the number of points left in a structure after it has been
# made compatible with the structure of a second cell type.
# Usage: <script> <chrom>
res_kb = 100  # resolution in kb, used only to build the input file names
chrom = sys.argv[1]
cell_type1 = "GM12878_combined"
cell_type2 = "K562"

# input BED files: hic_data/<cell type>_<chrom>_<res_kb>kb.bed
path1 = "hic_data/{}_{}_{}kb.bed".format(cell_type1, chrom, res_kb)
path2 = "hic_data/{}_{}_{}kb.bed".format(cell_type2, chrom, res_kb)

structure1 = dt.structureFromBed(path1)
structure2 = dt.structureFromBed(path2)

dt.make_compatible((structure1, structure2))

# parenthesised single-argument print is valid on Python 2 and Python 3
print("size\t" + str(len(structure1.getPoints())))
示例#9
0
import numpy as np
import sys
sys.path.append("..")
import data_tools as dt

# Script: convert a BED contact file into a dense tab-separated matrix whose
# first two columns are the start and end coordinate of each row's bin.
# Usage: <script> <input BED> <output path>
inpath, outpath = sys.argv[1], sys.argv[2]

structure = dt.structureFromBed(inpath, None, None)
contactMat = dt.matFromBed(inpath, structure)
n = len(contactMat)

# two coordinate columns followed by the n x n contact matrix
fullMat = np.zeros((n, n + 2))

# locus IDs: genomic start and end of every bin
chrom = structure.chrom
for row, pointNum in enumerate(structure.getPointNums()):
    fullMat[row, 0] = chrom.minPos + chrom.res * pointNum
    fullMat[row, 1] = chrom.minPos + chrom.res * (pointNum + 1)

fullMat[:, 2:] = contactMat

# integer format whose field width is derived from the largest entry
largest = np.amax(fullMat)
maxNumDigits = int(np.ceil(np.log10(largest)))
formatstring = "%{}d".format(maxNumDigits)
np.savetxt(outpath, fullMat, fmt=formatstring, delimiter="\t")
示例#10
0
def partitioned_mds(path1,
                    path2,
                    prefix="",
                    centromere=0,
                    num_partitions=4,
                    maxmemory=32000000,
                    num_threads=3,
                    alpha=4,
                    res_ratio=10,
                    penalty=0.05,
                    weight=0.05):
    """Jointly infer two structures by MDS on a fixed number of partitions.

    Low-resolution structures of both datasets are inferred jointly first.
    The chromosome is cut into ``num_partitions`` pieces, half on each side
    of a midpoint; each pair of corresponding high-resolution pieces is
    inferred jointly and then passed, with its low-resolution counterpart,
    to ``transform``.

    Args:
        path1: BED file of the first dataset.
        path2: BED file of the second dataset.
        prefix: not used in this function body.
        centromere: if 0, the midpoint is the middle point index; otherwise
            the value is converted with ``chrom.getAbsoluteIndex``.
        num_partitions: total number of partitions; must be even (asserted).
        maxmemory: not used in this function body.
        num_threads: requested number of threads; capped by the CPU count
            and by the number of partitions for the parallel section.
        alpha: forwarded to the low-resolution ``infer_structures`` call.
            The per-partition calls use a fixed 2.5 instead.
        res_ratio: factor passed to ``create_low_res_structure`` and to
            ``transform``.
        penalty, weight: forwarded to every ``infer_structures`` call.

    Returns:
        Tuple ``(highstructure1, highstructure2)``.
    """
    #create low-res structures
    lowstructure1 = create_low_res_structure(path1, res_ratio)
    lowstructure2 = create_low_res_structure(path2, res_ratio)
    dt.make_compatible((lowstructure1, lowstructure2))

    #get partitions
    n = len(lowstructure1.getPoints())
    if centromere == 0:
        #no centromere given: split at the middle point
        midpoint = int(n / 2)
    else:
        midpoint = lowstructure1.chrom.getAbsoluteIndex(centromere)

    #half of the partitions go on each side of the midpoint
    assert num_partitions % 2 == 0

    #partition length on each side, rounded up so the whole side is covered
    partition_size1 = int(np.ceil(float(midpoint) / (num_partitions / 2)))
    partition_size2 = int(np.ceil(float(n - midpoint) / (num_partitions / 2)))

    lowpartitions = [
    ]  #(start, end) pairs of low substructures, on absolute indices not relative indices

    #partitions before the midpoint; the end is clamped to the midpoint
    for i in range(int(num_partitions / 2)):
        lowpartitions.append(
            (i * partition_size1, min(((i + 1) * partition_size1), midpoint)))

    #partitions from the midpoint on; the end is clamped to the last index
    for i in range(int(num_partitions / 2)):
        lowpartitions.append((midpoint + i * partition_size2,
                              min((midpoint + (i + 1) * partition_size2),
                                  n - 1)))

    lowpartitions = np.array(lowpartitions)

    low_contactMat1 = dt.matFromBed(path1, lowstructure1)
    low_contactMat2 = dt.matFromBed(path2, lowstructure2)

    #the same partitions are applied to both low-res structures
    tad.substructuresFromAbsoluteTads(lowstructure1, lowpartitions)
    tad.substructuresFromAbsoluteTads(lowstructure2, lowpartitions)

    #create high-res chroms: bounds and name from the low-res chrom,
    #size and resolution from the input file
    size1, res1 = dt.basicParamsFromBed(path1)
    highChrom1 = dt.ChromParameters(lowstructure1.chrom.minPos,
                                    lowstructure1.chrom.maxPos, res1,
                                    lowstructure1.chrom.name, size1)
    size2, res2 = dt.basicParamsFromBed(path2)
    highChrom2 = dt.ChromParameters(lowstructure2.chrom.minPos,
                                    lowstructure2.chrom.maxPos, res2,
                                    lowstructure2.chrom.name, size2)

    #initialize high-res substructures
    high_substructures1 = []
    high_substructures2 = []
    #genomic coordinates of structure 1 delimit the partitions of both datasets
    low_gen_coords = lowstructure1.getGenCoords()
    offset1 = 0  #initialize
    offset2 = 0
    for partition in lowpartitions:
        start_gen_coord = low_gen_coords[partition[0]]
        end_gen_coord = low_gen_coords[partition[1]]
        high_substructure1 = dt.structureFromBed(path1, highChrom1,
                                                 start_gen_coord,
                                                 end_gen_coord, offset1)
        high_substructure2 = dt.structureFromBed(path2, highChrom2,
                                                 start_gen_coord,
                                                 end_gen_coord, offset2)
        high_substructures1.append(high_substructure1)
        high_substructures2.append(high_substructure2)
        #advance by one less than the point count
        #NOTE(review): presumably adjacent partitions share a boundary point
        offset1 += (len(high_substructure1.points) - 1)  #update
        offset2 += (len(high_substructure2.points) - 1)  #update

    for high_substructure1, high_substructure2 in zip(high_substructures1,
                                                      high_substructures2):
        dt.make_points_compatible((high_substructure1, high_substructure2))

    highstructure1 = dt.Structure([], high_substructures1, highChrom1, 0)
    highstructure2 = dt.Structure([], high_substructures2, highChrom2, 0)

    infer_structures(low_contactMat1, lowstructure1, low_contactMat2,
                     lowstructure2, alpha, penalty, num_threads, weight)
    print("Low-resolution MDS complete")

    #pymp shared lists hold the substructures used in the parallel section
    highSubstructures1 = pymp.shared.list(highstructure1.structures)
    highSubstructures2 = pymp.shared.list(highstructure2.structures)
    lowSubstructures1 = pymp.shared.list(lowstructure1.structures)
    lowSubstructures2 = pymp.shared.list(lowstructure2.structures)

    numSubstructures = len(highstructure1.structures)
    num_threads = min(
        (num_threads, mp.cpu_count(), numSubstructures)
    )  #don't exceed number of requested threads, available threads, or structures
    with pymp.Parallel(num_threads) as p:
        for substructurenum in p.range(numSubstructures):
            highSubstructure1 = highSubstructures1[substructurenum]
            highSubstructure2 = highSubstructures2[substructurenum]
            trueLow1 = lowSubstructures1[substructurenum]
            trueLow2 = lowSubstructures2[substructurenum]

            #joint MDS
            structure_contactMat1 = dt.matFromBed(
                path1,
                highSubstructure1)  #contact matrix for this structure only
            structure_contactMat2 = dt.matFromBed(
                path2,
                highSubstructure2)  #contact matrix for this structure only

            #alpha is fixed at 2.5 here rather than taken from the parameter
            infer_structures(structure_contactMat1, highSubstructure1,
                             structure_contactMat2, highSubstructure2, 2.5,
                             penalty, num_threads, weight)

            #NOTE(review): transform is defined elsewhere; presumably it fits
            #each high-res piece onto its low-res counterpart -- confirm
            transform(trueLow1, highSubstructure1, res_ratio)
            transform(trueLow2, highSubstructure2, res_ratio)

            #write back so the results are stored in the shared lists
            highSubstructures1[substructurenum] = highSubstructure1
            highSubstructures2[substructurenum] = highSubstructure2

            print("MDS performed on structure {} of {}".format(
                substructurenum + 1, numSubstructures))

    highstructure1.setstructures(highSubstructures1)
    highstructure2.setstructures(highSubstructures2)

    highstructure1.set_rel_indices()
    highstructure2.set_rel_indices()

    return highstructure1, highstructure2
示例#11
0
import compartment_analysis as ca
import data_tools as dt
import array_tools as at
import os
import numpy as np

import sys  # sys.argv is read below but this script's import list omits sys

# Usage: <script> <resolution in bp>
res = int(sys.argv[1])
# floor division keeps res_kb an integer on Python 3 as well, so the file
# name below reads e.g. "..._100kb.bed" rather than "..._100.0kb.bed"
res_kb = res // 1000

# remove any output left by a previous run (the file is opened in append mode
# further down); os.remove avoids spawning a shell just to delete one file
compartment_path = "A_compartment_{}kb.bed".format(res_kb)
if os.path.isfile(compartment_path):
    os.remove(compartment_path)

for chrom in (1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
              20, 21, 22):
    path = "hic_data/GM12878_combined_{}_100kb.bed".format(chrom)
    structure = dt.structureFromBed(path)
    contacts = dt.matFromBed(path, structure)
    at.makeSymmetric(contacts)
    enrichments = np.array(np.loadtxt(
        "binding_data/Gm12878_{}_100kb_active_coverage.bed".format(chrom),
        dtype=object)[:, 6],
                           dtype=float)
    bin_nums = structure.nonzero_abs_indices(
    ) + structure.chrom.minPos / structure.chrom.res
    enrichments = enrichments[bin_nums]
    compartments = np.array(ca.get_compartments(contacts, enrichments))
    gen_coords = np.array(structure.getGenCoords())
    a_gen_coords = gen_coords[np.where(compartments > 0)]
    with open("A_compartment_{}kb.bed".format(res_kb), "a") as out:
        for a_gen_coord in a_gen_coords:
            for i in range(100 / res_kb):
示例#12
0
import sys
sys.path.append("/home/lur159/git/miniMDS")
import data_tools as dt
import numpy as np
import tools

# Script: load a BED contact file at a different resolution.
# Usage: <script> <input BED> <resolution> <output path>
path = sys.argv[1]
res = int(sys.argv[2])
outpath = sys.argv[3]

# override the chromosome's resolution and snap its bounds outward so that
# both are multiples of the new resolution
chrom = dt.chromFromBed(path)
chrom.res = res
chrom.minPos = int(np.floor(float(chrom.minPos) / res)) * res  # round down
chrom.maxPos = int(np.ceil(float(chrom.maxPos) / res)) * res  # round up

# structure and contact matrix built with the modified chromosome parameters
struct = dt.structureFromBed(path, chrom)
mat = dt.matFromBed(path, struct)

points = struct.getPoints()

with open(outpath, "w") as out:
    for i in range(len(mat)):
        abs_index1 = points[i].absolute_index
        for j in range(i):
            if mat[i, j] != 0:
                abs_index2 = points[j].absolute_index
                out.write("\t".join(
                    (chrom.name, str(chrom.getGenCoord(abs_index1)),
                     str(chrom.getGenCoord(abs_index1) + res), chrom.name,
                     str(chrom.getGenCoord(abs_index2)),
                     str(chrom.getGenCoord(abs_index2) + res), str(mat[i,
from matplotlib import pyplot as plt
import sys
sys.path.append("..")
import compartment_analysis as ca
import data_tools as dt
import os

# Script: plot the compartment track of every BED file given on the command
# line against genomic coordinate, one labelled line per file.
paths = sys.argv[1:]

# legend labels are the file names without their directories
prefixes = []
for bed_path in paths:
    prefixes.append(os.path.basename(bed_path))

structs = []
for bed_path in paths:
    structs.append(dt.structureFromBed(bed_path))

mats = []
for bed_path, struct in zip(paths, structs):
    mats.append(dt.matFromBed(bed_path, struct))

all_comps = []
for mat in mats:
    all_comps.append(ca.get_compartments(mat))

all_gen_coords = []
for struct in structs:
    all_gen_coords.append(struct.getGenCoords())

#all_comps[len(all_comps)-1] = -all_comps[len(all_comps)-1]

for label, xs, ys in zip(prefixes, all_gen_coords, all_comps):
    plt.plot(xs, ys, label=label)

plt.legend()
plt.show()