Пример #1
0
import numpy as np
from matplotlib import pyplot as plt
from scipy import signal as sg
from scipy import stats as st

from multimds import compartment_analysis as ca
from multimds import data_tools as dt
from multimds import linear_algebra as la
from multimds import multimds as mm

# Hi-C contact files for the two cell types being compared (chromosome 19,
# 100-kb bins according to the file names).
path1 = "hic_data/GM12878_combined_19_100kb.bed"
path2 = "hic_data/K562_19_100kb.bed"

# full_mds is given both files and returns one structure per input.
struct1, struct2 = mm.full_mds(path1, path2, prefix="test_")

# Load the contact matrices from the same files the structures were built
# from.  These paths used to be rebuilt from `chrom` and `res_kb`, names this
# script never defines, so the calls failed with NameError; path1/path2 are
# the intended files.
# np.array guarantees that the negation and subtraction below are
# element-wise whatever sequence type get_compartments returns.
mat1 = dt.matFromBed(path1, struct1)
comps1 = np.array(ca.get_compartments(mat1, struct1))
mat2 = dt.matFromBed(path2, struct2)
comps2 = np.array(ca.get_compartments(mat2, struct2))

# If the two score vectors are anti-correlated, flip the sign of the first so
# that both use the same orientation before they are compared.
r, p = st.pearsonr(comps1, comps2)
if r < 0:
    comps1 = -comps1

# Per-bin absolute difference in compartment score between the datasets.
comp_diffs = np.abs(comps1 - comps2)

# Per-bin distance between corresponding points of the two structures.
dists = np.array([
    la.calcDistance(coord1, coord2)
    for coord1, coord2 in zip(struct1.getCoords(), struct2.getCoords())
])
# Indices of peaks in the distance profile, detected with wavelet widths 1-9.
dist_peaks = sg.find_peaks_cwt(dists, np.arange(1, 10))
Пример #2
0
            # Only process this pair when both input files exist on disk.
            if os.path.isfile(path1) and os.path.isfile(path2):
                # full_mds receives both files plus the current penalty and
                # returns one structure per input.
                structure1, structure2 = multimds.full_mds(path1,
                                                           path2,
                                                           penalty=penalty)

                # Rescale both structures, then apply to structure1 the
                # (r, t) pair computed from the two structures (presumably a
                # rotation and translation aligning structure1 onto
                # structure2 -- confirm in linear_algebra).
                structure1.rescale()
                structure2.rescale()
                r, t = la.getTransformation(structure1, structure2)
                structure1.transform(r, t)

                # Compartments: contact matrix -> compartment scores, once
                # per dataset.
                contacts1 = dt.matFromBed(path1, structure=structure1)
                contacts2 = dt.matFromBed(path2, structure=structure2)

                compartments1 = np.array(
                    ca.get_compartments(contacts1, structure1))
                compartments2 = np.array(
                    ca.get_compartments(contacts2, structure2))

                # NOTE: `r` is rebound here; the transformation value above
                # has already been applied and is no longer needed.
                # When the two score vectors are anti-correlated, flip the
                # second so both share the same orientation.
                r, p = st.pearsonr(compartments1, compartments2)
                if r < 0:
                    compartments2 = -compartments2

                # SVR: fit a single linear support-vector regression on the
                # stacked coordinates of both structures against the stacked
                # compartment scores; coef holds the fitted weight vector.
                coords1 = structure1.getCoords()
                coords2 = structure2.getCoords()
                coords = np.concatenate((coords1, coords2))
                compartments = np.concatenate((compartments1, compartments2))
                clf = svm.LinearSVR()
                clf.fit(coords, compartments)
                coef = clf.coef_
Пример #3
0
from multimds import compartment_analysis as ca
import numpy as np
from sklearn import svm
from multimds import linear_algebra as la
from mayavi import mlab

# Load a previously saved structure (chromosome 21, 100-kb bins according to
# the file name).
struct = dt.structure_from_file(
    "hic_data/GM12878_combined_21_100kb_structure.tsv")

# Subsample the structure using the index of genomic position 15,000,000 and
# len(points) - 3 as bounds (inclusivity of the bounds is decided by
# subsamplePoints -- confirm there).
new_start = struct.chrom.getAbsoluteIndex(15000000)
struct.subsamplePoints(new_start, len(struct.points) - 3)

# Compartments: contact matrix for the (subsampled) structure, then one
# compartment score per point.
contacts = dt.matFromBed("hic_data/GM12878_combined_21_100kb.bed", struct)

compartments = np.array(ca.get_compartments(contacts, struct))

# SVR: fit a linear support-vector regression of compartment score on the
# structure coordinates; coef is the fitted weight vector.
coords = struct.getCoords()
clf = svm.LinearSVR()
clf.fit(coords, compartments)
coef = clf.coef_

# Re-express the coordinates in a coordinate system derived from coef
# (presumably with coef as one axis -- confirm in linear_algebra), then
# collect the extent of the first two transformed columns.
transformed_coords = np.array(la.change_coordinate_system(coef, coords))
xs = transformed_coords[:, 0]
min_x = min(xs)
max_x = max(xs)
x_range = max_x - min_x
ys = transformed_coords[:, 1]
min_y = min(ys)
max_y = max(ys)
Пример #4
0
          1)
# One float slot per entry of chroms for each of the two result arrays.
# Only multimds_coeffs is filled in the visible part of the loop.
multimds_coeffs = np.zeros_like(chroms, dtype=float)
unaligned_coeffs = np.zeros_like(multimds_coeffs)

for i, chrom in enumerate(chroms):

    # Input files for the two cell types on this chromosome.
    path1 = "hic_data/{}_{}_{}kb.bed".format(cell_type1, chrom, res_kb)
    path2 = "hic_data/{}_{}_{}kb.bed".format(cell_type2, chrom, res_kb)

    # mm.multimds receives both files plus the penalty and returns one
    # structure per input.
    structure1, structure2 = mm.multimds(path1, path2, penalty=penalty)

    # Compartments: contact matrix -> compartment scores, per dataset.
    contacts1 = dt.matFromBed(path1, structure1)
    contacts2 = dt.matFromBed(path2, structure2)

    compartments1 = np.array(ca.get_compartments(contacts1, structure1))
    compartments2 = np.array(ca.get_compartments(contacts2, structure2))

    # When the two score vectors are anti-correlated, flip the second so both
    # share the same orientation.
    r, p = st.pearsonr(compartments1, compartments2)
    if r < 0:
        compartments2 = -compartments2

    # SVR: fit one linear support-vector regression on the stacked
    # coordinates of both structures and record its score on the same data
    # (for scikit-learn regressors, score() is the R^2 of the prediction).
    coords1 = structure1.getCoords()
    coords2 = structure2.getCoords()
    coords = np.concatenate((coords1, coords2))
    compartments = np.concatenate((compartments1, compartments2))
    clf = svm.LinearSVR()
    clf.fit(coords, compartments)
    multimds_coeffs[i] = clf.score(coords, compartments)
Пример #5
0
    float(chrom2.minPos) / chrom2.res)) * chrom2.res  #round
# Round chrom2.maxPos up to the nearest multiple of chrom2.res.
chrom2.maxPos = int(np.ceil(float(chrom2.maxPos) / chrom2.res)) * chrom2.res

# Build one structure per dataset from its BED file and chrom object, then
# hand both to make_compatible (presumably so they cover the same bins --
# confirm in data_tools) before loading the contact matrices.
low_struct1 = dt.structureFromBed(path1, chrom1)
low_struct2 = dt.structureFromBed(path2, chrom2)
dt.make_compatible((low_struct1, low_struct2))
contacts1 = dt.matFromBed(path1, low_struct1)
contacts2 = dt.matFromBed(path2, low_struct2)

# Dataset 1: read the column at index 6 of the coverage file, keep only the
# entries selected by nonzero_bins_whole_chrom(), and pass them to
# get_compartments as its third argument.
enrichments = np.loadtxt("binding_data/{}_{}_100kb_active_coverage.bed".format(
    format_celltype(cell_type1), chrom),
                         usecols=6)
bin_nums = low_struct1.nonzero_bins_whole_chrom()
enrichments = enrichments[bin_nums]
compartments1 = np.array(
    ca.get_compartments(contacts1, low_struct1, enrichments))

# Dataset 2: same steps; `enrichments` and `bin_nums` are reused as
# temporaries.
enrichments = np.loadtxt("binding_data/{}_{}_100kb_active_coverage.bed".format(
    format_celltype(cell_type2), chrom),
                         usecols=6)
bin_nums = low_struct2.nonzero_bins_whole_chrom()
enrichments = enrichments[bin_nums]
compartments2 = np.array(
    ca.get_compartments(contacts2, low_struct2, enrichments))

# NOTE(review): structure1 and dists are not assigned in the visible part of
# this script; they must come from code above this excerpt.
gen_coords = structure1.getGenCoords()

# Per-bin absolute difference in compartment score between the datasets.
compartment_diffs = np.abs(compartments1 - compartments2)

# Indices of peaks in dists, detected with wavelet widths 1-9.
dist_peaks = sg.find_peaks_cwt(dists, np.arange(1, 10))
Пример #6
0
# Use 100,000 as chrom2's resolution, then snap both chroms' bounds outward
# to multiples of their resolution: minPos is rounded down, maxPos up.
chrom2.res = 100000
chrom1.minPos = int(np.floor(float(chrom1.minPos)/chrom1.res)) * chrom1.res	# round down
chrom1.maxPos = int(np.ceil(float(chrom1.maxPos)/chrom1.res)) * chrom1.res
chrom2.minPos = int(np.floor(float(chrom2.minPos)/chrom2.res)) * chrom2.res	# round down
chrom2.maxPos = int(np.ceil(float(chrom2.maxPos)/chrom2.res)) * chrom2.res

# Build one structure per dataset from its BED file and chrom object, then
# hand both to make_compatible (presumably so they cover the same bins --
# confirm in data_tools) before loading the contact matrices.
low_struct1 = dt.structureFromBed(path1, chrom1)
low_struct2 = dt.structureFromBed(path2, chrom2)
dt.make_compatible((low_struct1, low_struct2))
contacts1 = dt.matFromBed(path1, low_struct1)
contacts2 = dt.matFromBed(path2, low_struct2)

# Dataset 1: read the column at index 6 of the coverage file, keep only the
# entries selected by nonzero_bins_whole_chrom(), and pass them to
# get_compartments as its third argument.
enrichments = np.loadtxt("binding_data/{}_{}_100kb_active_coverage.bed".format(format_celltype(cell_type1), chrom), usecols=6)
bin_nums = low_struct1.nonzero_bins_whole_chrom()
enrichments = enrichments[bin_nums]
compartments1 = np.array(ca.get_compartments(contacts1, low_struct1, enrichments))

# Dataset 2: same steps; `enrichments` and `bin_nums` are reused as
# temporaries.
enrichments = np.loadtxt("binding_data/{}_{}_100kb_active_coverage.bed".format(format_celltype(cell_type2), chrom), usecols=6)
bin_nums = low_struct2.nonzero_bins_whole_chrom()
enrichments = enrichments[bin_nums]
compartments2 = np.array(ca.get_compartments(contacts2, low_struct2, enrichments))

# NOTE(review): structure1 and dists are not assigned in the visible part of
# this script; they must come from code above this excerpt.
gen_coords = structure1.getGenCoords()

# Per-bin absolute difference in compartment score between the datasets.
compartment_diffs = np.abs(compartments1 - compartments2)

# Indices of peaks in dists, detected with wavelet widths 1-9.
dist_peaks = sg.find_peaks_cwt(dists, np.arange(1,10))

# Genomic coordinates of structure1 and of the 100-kb structure built above.
# NOTE(review): high_coords repeats the getGenCoords() call already stored in
# gen_coords.
high_coords = structure1.getGenCoords()
low_coords = low_struct1.getGenCoords()
Пример #7
0
# Local import: the minimds command below is launched without a shell.
import subprocess

# One list of SVR scores per species, in the same order as all_species.
boxes = [[] for _ in all_species]

for i, (species, res_kb) in enumerate(zip(all_species, all_res_kb)):
    # "<species>_list.txt" supplies one dataset prefix per line.  The `with`
    # block closes the file on exit, so no explicit close() is needed.
    with open("{}_list.txt".format(species)) as infile:
        for line in infile:
            prefix = line.strip()
            for chrom in range(1, 23):
                path = "hic_data/{}_{}_{}kb.bed".format(prefix, chrom, res_kb)

                # Chromosomes without an input file are skipped.
                if not os.path.isfile(path):
                    continue

                # Run minimds on the file.  The command is passed as an
                # argument list, so a path containing spaces or shell
                # metacharacters (the prefix comes from a text file) reaches
                # the script as one literal argument instead of being
                # interpreted by a shell.  The exit status is ignored, as
                # before.
                subprocess.call(["python", "../minimds.py", path])

                # Read the structure file for this dataset and chromosome
                # (presumably written by the minimds run above -- confirm).
                structure = dt.structure_from_file(
                    "hic_data/{}_{}_{}kb_structure.tsv".format(
                        prefix, chrom, res_kb))
                mat = dt.matFromBed(path, structure)
                comps = ca.get_compartments(mat, structure)
                coords = structure.getCoords()

                # Fit a linear SVR of compartment score on the coordinates
                # and record its score on the same data (R^2 for
                # scikit-learn regressors).
                clf = svm.LinearSVR()
                clf.fit(coords, comps)
                boxes[i].append(clf.score(coords, comps))

# Create an axes on a 10x10 grid, anchored at cell (0, 0) and spanning
# 9 rows by 10 columns, with the frame turned off.
plt.subplot2grid((10, 10), (0, 0), 9, 10, frameon=False)

# Label the y axis.
plt.ylabel("SVR R^2", fontsize=10)

# Define offsets: ys is the list of per-species score lists, n their count.
ys = boxes
n = len(ys)
Пример #8
0
# Output resolution in kb; `res` is assigned outside the visible part of this
# script and is also used as the step between written intervals.
res_kb = int(res/1000)

for chrom in (1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21):
    # Build a structure for each cell type and pass both to make_compatible
    # (presumably so they cover the same bins -- confirm in data_tools).
    path1 = "hic_data/GM12878_combined_{}_100kb.bed".format(chrom)
    structure1 = dt.structureFromBed(path1)

    path2 = "hic_data/K562_{}_100kb.bed".format(chrom)
    structure2 = dt.structureFromBed(path2)

    dt.make_compatible((structure1, structure2))

    # Dataset 1: contact matrix plus the coverage column (index 6) for the
    # bins selected by bin_nums.
    contacts = dt.matFromBed(path1, structure1)
    enrichments = np.loadtxt("binding_data/GM12878_{}_100kb_active_coverage.bed".format(chrom), usecols=6)
    # The offset has to be an int: `/` is true division on Python 3, and the
    # resulting float array is rejected by NumPy when used as an index.
    bin_nums = structure1.nonzero_abs_indices() + int(structure1.chrom.minPos / structure1.chrom.res)
    enrichments = enrichments[bin_nums]
    compartments1 = np.array(ca.get_compartments(contacts, structure1, enrichments))

    # Dataset 2: same steps; contacts/enrichments/bin_nums are reused as
    # temporaries.
    contacts = dt.matFromBed(path2, structure2)
    enrichments = np.loadtxt("binding_data/K562_{}_100kb_active_coverage.bed".format(chrom), usecols=6)
    bin_nums = structure2.nonzero_abs_indices() + int(structure2.chrom.minPos / structure2.chrom.res)
    enrichments = enrichments[bin_nums]
    compartments2 = np.array(ca.get_compartments(contacts, structure2, enrichments))

    gen_coords = structure1.getGenCoords()

    # Append (the file is opened in "a" mode on every chromosome) one or more
    # BED lines for each bin whose score is positive in both datasets and
    # differs by less than 0.2 between them.
    with open("A_compartment_{}kb.bed".format(res_kb), "a") as out:
        for gen_coord, compartment1, compartment2 in zip(gen_coords, compartments1, compartments2):
            if compartment1 > 0 and compartment2 > 0 and np.abs(compartment1 - compartment2) < 0.2:
                # Split each 100-kb bin into int(100/res_kb) consecutive
                # intervals of width res.
                for i in range(int(100/res_kb)):
                    fields = (
                        structure1.chrom.name,
                        str(gen_coord + i*res),
                        str(gen_coord + (i+1)*res),
                        str(compartment1),
                        str(compartment2),
                    )
                    out.write("\t".join(fields) + "\n")
Пример #9
0
    # Structure for the second cell type; both structures are then passed to
    # make_compatible (presumably so they cover the same bins -- confirm in
    # data_tools).
    path2 = "hic_data/K562_{}_100kb.bed".format(chrom)
    structure2 = dt.structureFromBed(path2)

    dt.make_compatible((structure1, structure2))

    # Dataset 1: contact matrix plus the coverage column (index 6) for the
    # bins selected by bin_nums.
    contacts = dt.matFromBed(path1, structure1)
    enrichments = np.loadtxt(
        "binding_data/GM12878_{}_100kb_active_coverage.bed".format(chrom),
        usecols=6)
    # The offset has to be an int: `/` is true division on Python 3, and the
    # resulting float array is rejected by NumPy when used as an index.
    bin_nums = structure1.nonzero_abs_indices() + int(
        structure1.chrom.minPos / structure1.chrom.res)
    enrichments = enrichments[bin_nums]
    compartments1 = np.array(
        ca.get_compartments(contacts, structure1, enrichments))

    # Dataset 2: same steps; contacts/enrichments/bin_nums are reused as
    # temporaries.
    contacts = dt.matFromBed(path2, structure2)
    enrichments = np.loadtxt(
        "binding_data/K562_{}_100kb_active_coverage.bed".format(chrom),
        usecols=6)
    bin_nums = structure2.nonzero_abs_indices() + int(
        structure2.chrom.minPos / structure2.chrom.res)
    enrichments = enrichments[bin_nums]
    compartments2 = np.array(
        ca.get_compartments(contacts, structure2, enrichments))

    gen_coords = structure1.getGenCoords()

    with open("B_compartment_{}kb.bed".format(res_kb), "a") as out:
Пример #10
0
from multimds import linear_algebra as la
from mayavi import mlab
from multimds import multimds as mm

# Hi-C contact files for the two cell types (chromosome 21, 100-kb bins
# according to the file names).
path1 = "hic_data/GM12878_combined_21_100kb.bed"
path2 = "hic_data/K562_21_100kb.bed"

# full_mds is given both files and returns one structure per input.
struct1, struct2 = mm.full_mds(path1, path2)

# Dataset 1: contact matrix, then the coverage column (index 6) restricted to
# the bins in bin_nums1.  The minPos/res offset is converted to int so the
# index array stays integer-typed.
contacts1 = dt.matFromBed(path1, struct1)
enrichments1 = np.loadtxt("binding_data/GM12878_21_100kb_active_coverage.bed",
                          usecols=6)
bin_nums1 = struct1.nonzero_abs_indices() + int(
    struct1.chrom.minPos / struct1.chrom.res)
enrichments1 = enrichments1[bin_nums1]
comps1 = np.array(ca.get_compartments(contacts1, struct1, enrichments1))

# Dataset 2: same steps.
contacts2 = dt.matFromBed(path2, struct2)
enrichments2 = np.loadtxt("binding_data/K562_21_100kb_active_coverage.bed",
                          usecols=6)
bin_nums2 = struct2.nonzero_abs_indices() + int(
    struct2.chrom.minPos / struct2.chrom.res)
enrichments2 = enrichments2[bin_nums2]
comps2 = np.array(ca.get_compartments(contacts2, struct2, enrichments2))

# Fit one linear support-vector regression on the stacked coordinates of both
# structures against the stacked compartment scores.
coords1 = struct1.getCoords()
coords2 = struct2.getCoords()
coords = np.concatenate((coords1, coords2))
compartments = np.concatenate((comps1, comps2))
clf = svm.LinearSVR()
clf.fit(coords, compartments)