예제 #1
0
# Inputs for the first heatmap ("ChEF"): the heatmap file and two domain
# annotation files ("_Arm" and "_Dix" variants).
# NOTE(review): the "_Arm" annotation paths below are relative while every
# other path here is absolute, so they depend on the current working
# directory -- confirm this is intended.
hm_file = "/mnt/storage/home/vsfishman/HiC/tutorial_Fishman/chick/mapped-GalGal5filtered/GalGal5filteredChrmLevel/ChEF-all-HindIII-40k.hm.IC"
domains_files_Arm = "mapped-GalGal5filtered/GalGal5filteredChrmLevel/ChEF-all-HindIII-40k.hm.gzipped_matrix/ChEF-all-HindIII-40k.hm.gzipped_matrix.jucebox_domains.annotation"
domains_files_Dix = "/mnt/storage/home/vsfishman/HiC/data/chick/DixonDomainsChEF_all_HindIII_40k.hm.IC_domains_40KB/DixonDomainsChEF_all_HindIII_40k.hm.IC_domains_40KB.jucebox_domains.annotation"

# The same three inputs for the second heatmap ("Blood").
second_hm_file = "/mnt/storage/home/vsfishman/HiC/tutorial_Fishman/chick/mapped-GalGal5filtered/GalGal5filteredChrmLevel/Blood-all-HindIII-40k.hm.IC"
second_domains_files_Arm = "mapped-GalGal5filtered/GalGal5filteredChrmLevel/Blood-all-HindIII-40k.hm.gzipped_matrix/Blood-all-HindIII-40k.hm.gzipped_matrix.jucebox_domains.annotation"
second_domains_files_Dix = "/mnt/storage/home/vsfishman/HiC/data/chick/DixonDomainsBlood_all_HindIII_40k.hm.IC_domains_40KB/DixonDomainsBlood_all_HindIII_40k.hm.IC_domains_40KB.jucebox_domains.annotation"


# The resolution is parsed from the heatmap file name; both heatmaps must
# report the same value.
resolution = extractResolutionFromFileName(hm_file)
assert extractResolutionFromFileName(second_hm_file) == resolution

# For each heatmap: position of the chromosome in the sequence returned by
# get_chromosomes -> the chromosome array found at that position.
data_dict = dict(enumerate(get_chromosomes(hm_file,genome_db_contig,resolution)))
data_dict_second_hmap = dict(enumerate(get_chromosomes(second_hm_file,genome_db_contig,resolution)))

# Each region to plot is a list: [chrm, nt_start, nt_end, title]

####################################### ADD REGIONS TO PLOT HERE
### EXAMPLE:
#points = [
#	["chr4",40100000,44000000,"Chromomer_Anja"],
#	["chr5",4000,440000,"Chromomer_Anja"]
#]

points = [["chr15",7200000,7280000,"Gamatu_Gene"]]
# Alternative (wider) window around the same locus:
#points = [["chr15",6200000,8280000,"Gamatu_Gene"]]
예제 #2
0
np.savetxt(results_file_name + ".criteias.txt",
           criteias,
           header="distances(columns): " +
           "\t".join(map(str, sorted(distances.keys()))) +
           " percentiles(rows): " + "\t".join(map(str, percentiles)))

results_file = open(results_file_name, "w")
results_file.write("Contig\tNt\tChr\tNt\tMax_percentile\t" +
                   "\t".join(map(str, sorted(distances.keys()))) + "\n")

print "Calculating..."

result_counts = dict([(p, 0) for p in percentiles])

for chr_numb, data in enumerate(
        get_chromosomes(hm_file, genome_db_contig, resolution)):

    for bin in xrange(
            max_distance + min_distance_from_chrm_end_in_bins,
            len(data) - max_distance - min_distance_from_chrm_end_in_bins):
        max_bin_percentile = 101  #percentile can not be > 100, so 101 indicates we are in the begining of the loop
        sign = None  #sign means left contacts > right contacts (True),  left contacts < right contacts (False), or not defined (None)
        ratios = []
        for jnd, dist in enumerate(sorted(distances.keys())):
            max_curren_percentile = 101
            left = data[bin][bin - dist]  #contacts on the left side of the bin
            right = data[bin][bin +
                              dist]  #contacts on the right side of the bin
            if left * right == 0:  #bin has 0 contacts on left or right side
                max_bin_percentile = 101
                break  #than we do not consider it at all (max_percentile will be -1)
예제 #3
0
	if len(points)==0:
		print "No points relevant for threshold found in a file",results_file
		continue
	all_result_files_points[results_file] = points
	figs_dir = results_file+".tr_"+str(threshold)+".figs/"
	if os.path.exists(figs_dir):
		import shutil
		shutil.rmtree(figs_dir)

	os.mkdir(figs_dir)

assert len(all_result_files_points) > 0
	
data_dict = {} # a structure to keep chrms arrays
# Chromosomes referenced by the points of *every* results file.
# The outer loop must run over the per-file point lists and the inner loop
# over the points of one file; with the clauses the other way round only a
# stale `points` list from earlier code would be inspected.
chrms_with_points = set(p[0]
						for file_points in all_result_files_points.values()
						for p in file_points)
if len(chrms_with_points)==1:
	# All points sit on one chromosome: load only that chromosome.
	single_chrm = next(iter(chrms_with_points))
	data_dict[single_chrm] = get_chromosomes(hm_file,genome_db,resolution,chrNumb=single_chrm)
else:
	# Several chromosomes are needed: load all of them, keyed by their
	# position in the sequence returned by get_chromosomes.
	for chrm,array in enumerate(get_chromosomes(hm_file,genome_db,resolution)):
		data_dict[chrm] = array
	
for results_file in all_result_files_points:
	points = all_result_files_points[results_file]
	figs_dir = results_file+".tr_"+str(threshold)+".figs/"
	for i in points:
		chrm,nt = i
		bin = int(nt/resolution)
		data = data_dict[chrm]
		#data is a matrix (2D numpu array)
		left_border = max(0,bin-distance)
		right_border = min(len(data),bin+distance)
		to_plot = np.log(data[left_border:right_border,left_border:right_border])
    import shutil
    shutil.rmtree(final_figs_dir)

os.mkdir(final_figs_dir)

assert len(all_result_files_points) > 0

data_dict = {}  # a structure to keep chrms arrays
data_dict_second_hmap = {}  # a structure to keep chrms arrays

# Chromosomes referenced by the points of *every* results file.
# The outer loop must run over the per-file point lists and the inner loop
# over the points of one file; with the clauses the other way round only a
# stale `points` list from earlier code would be inspected.
chrms_with_points = set(
    p[0]
    for file_points in all_result_files_points.values()
    for p in file_points)

if len(chrms_with_points) == 1:
    # All points sit on one chromosome: load only that chromosome from
    # each of the two heatmaps.
    single_chrm = next(iter(chrms_with_points))
    data_dict[single_chrm] = get_chromosomes(hm_file,
                                             genome_db_contig,
                                             resolution,
                                             chrNumb=single_chrm)
    data_dict_second_hmap[single_chrm] = get_chromosomes(second_hm_file,
                                                         genome_db_contig,
                                                         resolution,
                                                         chrNumb=single_chrm)
else:
    # Several chromosomes are needed: load all of them from both heatmaps,
    # keyed by their position in the sequence returned by get_chromosomes.
    for chrm, array in enumerate(
            get_chromosomes(hm_file, genome_db_contig, resolution)):
        data_dict[chrm] = array
    for chrm, array in enumerate(
            get_chromosomes(second_hm_file, genome_db_contig, resolution)):
        data_dict_second_hmap[chrm] = array

final_results = open(hm_file + ".FISH_regions.txt", "w")
for results_file in all_result_files_points:
예제 #5
0
np.savetxt(results_file_name + ".criteias.txt",
           criteias,
           header="distances(columns): " +
           "\t".join(map(str, sorted(distances.keys()))) +
           " percentiles(rows): " + "\t".join(map(str, percentiles)))

results_file = open(results_file_name, "w")
results_file.write("chr\tNucleotide\tMax_percentile\t" +
                   "\t".join(map(str, sorted(distances.keys()))) + "\n")

print "Calculating..."

result_counts = dict([(p, 0) for p in percentiles])

for chr_numb, data in enumerate(get_chromosomes(hm_file, genome_db,
                                                resolution)):

    for bin in xrange(
            max_distance + min_distance_from_chrm_end_in_bins,
            len(data) - max_distance - min_distance_from_chrm_end_in_bins):
        max_bin_percentile = 101  #percentile can not be > 100, so 101 indicates we are in the begining of the loop
        sign = None  #sign means left contacts > right contacts (True),  left contacts < right contacts (False), or not defined (None)
        ratios = []
        for jnd, dist in enumerate(sorted(distances.keys())):
            max_curren_percentile = 101
            left = data[bin][bin - dist]  #contacts on the left side of the bin
            right = data[bin][bin +
                              dist]  #contacts on the right side of the bin
            if left * right == 0:  #bin has 0 contacts on left or right side
                max_bin_percentile = 101
                break  #than we do not consider it at all (max_percentile will be -1)
예제 #6
0
# Genome object built from the FASTA directory of the selected genome.
fasta_dir = "/mnt/storage/home/vsfishman/HiC/fasta/" + genomeName
genome_db = Genome(fasta_dir, readChrms=["#", "X", "Y"])

# Heatmap to analyse (25k file). Alternative 10k input:
#hm_file = "/mnt/storage/home/vsfishman/HiC/data/mESC/mapped-mm10/mm10/mESC-all-HindIII_refined.frag_res10k_hiRes.hm.IC.cis.hdf5"
hm_file = "/mnt/storage/home/vsfishman/HiC/data/mESC/mapped-mm10/mm10/mESC-all-HindIII_refined.frag_res25k_hiRes.hm.IC.cis.hdf5"
resolution = extractResolutionFromFileName(hm_file)

# Bin offsets to probe: every value from 2 to 10, plus 30 and 50.
distances = [2, 3, 4, 5, 6, 7, 8, 9, 10, 30, 50]
# 100000 expressed in bins of the current resolution.
min_distance_from_chrm_end_in_bins = 100000 / resolution

# One list of left/right contact ratios per probed distance.
result = dict((d, []) for d in distances)

for data in get_chromosomes(hm_file, genome_db, resolution):
    n_bins = len(data)
    for dist in distances:
        # Stay `dist` plus the end margin away from both chromosome ends.
        margin = dist + min_distance_from_chrm_end_in_bins
        for bin in xrange(margin, n_bins - margin):
            row = data[bin]
            left, right = row[bin - dist], row[bin + dist]
            # Bins with no contacts on either side are ignored.
            if left * right != 0:
                # Larger side over smaller side.
                result[dist].append(max(left, right) / min(left, right))
# Column header for the statistics. The literal already ends with "\n" and
# the print statement appends another newline, so a blank line follows.
print "min\tmax\tmean\tmedian\n"

# Delete a pre-existing "<hm_file>.stat" directory tree, if there is one.
if os.path.exists(hm_file + ".stat"):
    import shutil
    shutil.rmtree(hm_file + ".stat")
예제 #7
0
from mirnylib import h5dict
from mirnylib import plotting
from hiclib import binnedData
from hiclib import fragmentHiC
import math

# Genome object built from the GalGal5 assembled-chromosome FASTA directory;
# chromosome files are matched with the "%s.fna" template.
genome_db = genome.Genome(
    "/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/",
    readChrms=[],
    chrmFileTemplate="%s.fna")

# The heatmap file is taken from the first command-line argument.
hm_file = sys.argv[1]
figure_path = "/mnt/storage/home/vsfishman/HiC/pics/"

# The resolution is parsed from the heatmap file name and used to load the
# chromosomes.
domain_res = extractResolutionFromFileName(hm_file)
all_chrms = get_chromosomes(hm_file, genome_db, domain_res)

chrms = range(genome_db.chrmCount)  # chromosome indices: 0 .. chrmCount-1
st = genome_db.chrmStartsBinCont  # array of chromosome start positions (in bins)
end = genome_db.chrmEndsBinCont  # array of chromosome end positions (in bins). end[-1] is not part of the chromosome (it is the 1st bin of the next chrm)


def plot_one_chr_fragment(matrix, figure_path, chr_name, ch_start, ch_end):
    print "Plotting picture"
    ch_start = ch_start / domain_res
    ch_end = ch_end / domain_res
    # domain_st = domain_st / domain_res
    # domain_end = domain_end / domain_res
    i = genome_db.label2idx[chr_name]
    q2_0 = matrix[i]
    q2_0 = q2_0[ch_start:ch_end, ch_start:ch_end]
예제 #8
0
	domains=args.domains.split(",")
	colors=args.colors.split(",")
else:
	domains=[]
	colors=[]

assert len(domains)==len(colors)
domains=[np.genfromtxt(domain_file,dtype=np.dtype([('chrm','S10'),('start',np.uint32),('end',np.uint32)]),usecols = (0,1,2))
					for domain_file in domains]
resolution = extractResolutionFromFileName(hm_file)
print "Using resolution ",resolution

data_dict = {} # a structure to keep chrms arrays
data_dict_second_hmap = {} # a structure to keep chrms arrays
				
for chrm,array in enumerate(get_chromosomes(hm_file,genome_db,resolution)):
	data_dict[chrm] = array

if args.reg == "all":
	# Expand "all" into one "<label>:0-<length>" region per chromosome,
	# keeping only chromosomes longer than ten bins.
	min_chrm_len = resolution*10
	whole_chrm_regions = []
	for chrm_label in genome_db.chrmLabels:
		chrm_len = genome_db.chrmLens[genome_db.label2idx[chrm_label]]
		if chrm_len > min_chrm_len:
			whole_chrm_regions.append(chrm_label+":0-"+str(chrm_len))
	args.reg = ",".join(whole_chrm_regions)
	
for i in args.reg.split(","):
		chrm = i.split(":")[0]
		nt_start,nt_end = map(int,i.split(":")[1].split("-"))
		title = "_".join(map(str,[chrm,nt_start,nt_end]))
		chrmLabel = chrm
		chrm = genome_db.label2idx[chrm]
		data = data_dict[chrm]
		left_border = nt_start/resolution
		right_border = nt_end/resolution