# First heatmap and the two domain-annotation files that accompany it.
hm_file = "/mnt/storage/home/vsfishman/HiC/tutorial_Fishman/chick/mapped-GalGal5filtered/GalGal5filteredChrmLevel/ChEF-all-HindIII-40k.hm.IC"
domains_files_Arm = "mapped-GalGal5filtered/GalGal5filteredChrmLevel/ChEF-all-HindIII-40k.hm.gzipped_matrix/ChEF-all-HindIII-40k.hm.gzipped_matrix.jucebox_domains.annotation"
domains_files_Dix = "/mnt/storage/home/vsfishman/HiC/data/chick/DixonDomainsChEF_all_HindIII_40k.hm.IC_domains_40KB/DixonDomainsChEF_all_HindIII_40k.hm.IC_domains_40KB.jucebox_domains.annotation"

# Second heatmap with its own pair of domain-annotation files.
# NOTE(review): the *_Arm paths are relative while every other path is
# absolute -- confirm the expected working directory.
second_hm_file = "/mnt/storage/home/vsfishman/HiC/tutorial_Fishman/chick/mapped-GalGal5filtered/GalGal5filteredChrmLevel/Blood-all-HindIII-40k.hm.IC"
second_domains_files_Arm = "mapped-GalGal5filtered/GalGal5filteredChrmLevel/Blood-all-HindIII-40k.hm.gzipped_matrix/Blood-all-HindIII-40k.hm.gzipped_matrix.jucebox_domains.annotation"
second_domains_files_Dix = "/mnt/storage/home/vsfishman/HiC/data/chick/DixonDomainsBlood_all_HindIII_40k.hm.IC_domains_40KB/DixonDomainsBlood_all_HindIII_40k.hm.IC_domains_40KB.jucebox_domains.annotation"

# The resolution is parsed from the file name; both heatmaps must agree on it.
resolution = extractResolutionFromFileName(hm_file)
assert resolution == extractResolutionFromFileName(second_hm_file)

# One entry per item yielded by get_chromosomes, keyed by its 0-based position.
data_dict = dict(
    enumerate(get_chromosomes(hm_file, genome_db_contig, resolution)))
data_dict_second_hmap = dict(
    enumerate(get_chromosomes(second_hm_file, genome_db_contig, resolution)))

####################################### ADD REGIONS TO PLOT HERE
# Each region is [chrm, nt_start, nt_end, title].
# Example:
# points = [
#     ["chr4", 40100000, 44000000, "Chromomer_Anja"],
#     ["chr5", 4000, 440000, "Chromomer_Anja"],
# ]
points = [["chr15", 7200000, 7280000, "Gamatu_Gene"]]
#points = [["chr15", 6200000, 8280000, "Gamatu_Gene"]]
# The distance ordering is needed for both file headers and for every bin
# visited below, so it is sorted once here instead of once per bin.
distance_keys_sorted = sorted(distances.keys())
distances_header = "\t".join(map(str, distance_keys_sorted))

# Dump the criteria table; the header records which distance each column
# and which percentile each row refers to.
np.savetxt(results_file_name + ".criteias.txt",
           criteias,
           header="distances(columns): " + distances_header +
           " percentiles(rows): " + "\t".join(map(str, percentiles)))

# Left open on purpose: only the header row is written in this part of the
# script. NOTE(review): confirm the file is closed further down.
results_file = open(results_file_name, "w")
results_file.write("Contig\tNt\tChr\tNt\tMax_percentile\t" +
                   distances_header + "\n")

print("Calculating...")
result_counts = dict.fromkeys(percentiles, 0)  # one zeroed counter per percentile
for chr_numb, data in enumerate(
        get_chromosomes(hm_file, genome_db_contig, resolution)):
    # Stay max_distance plus the chromosome-end margin away from both ends,
    # so that bin - dist and bin + dist are always valid indices.
    for bin in xrange(
            max_distance + min_distance_from_chrm_end_in_bins,
            len(data) - max_distance - min_distance_from_chrm_end_in_bins):
        # A percentile cannot exceed 100, so 101 marks "nothing recorded yet".
        max_bin_percentile = 101
        # True: left contacts > right contacts; False: left < right;
        # None: not defined yet.
        sign = None
        ratios = []
        for jnd, dist in enumerate(distance_keys_sorted):
            max_curren_percentile = 101
            left = data[bin][bin - dist]  # contacts on the left side of the bin
            right = data[bin][bin + dist]  # contacts on the right side of the bin
            if left * right == 0:
                # No contacts on at least one side: drop this bin entirely.
                # NOTE(review): the earlier comment said max_percentile
                # "will be -1", but the value set here is 101 -- confirm how
                # skipped bins are reported downstream.
                max_bin_percentile = 101
                break
if len(points)==0: print "No points relevant for threshold found in a file",results_file continue all_result_files_points[results_file] = points figs_dir = results_file+".tr_"+str(threshold)+".figs/" if os.path.exists(figs_dir): import shutil shutil.rmtree(figs_dir) os.mkdir(figs_dir) assert len(all_result_files_points) > 0 data_dict = {} # a structure to keep chrms arrays if len(np.unique([p[0] for p in points for points in all_result_files_points.values()]))==1: data_dict[points[0][0]] = get_chromosomes(hm_file,genome_db,resolution,chrNumb=points[0][0]) else: for chrm,array in enumerate(get_chromosomes(hm_file,genome_db,resolution)): data_dict[chrm] = array for results_file in all_result_files_points: points = all_result_files_points[results_file] figs_dir = results_file+".tr_"+str(threshold)+".figs/" for i in points: chrm,nt = i bin = int(nt/resolution) data = data_dict[chrm] #data is a matrix (2D numpu array) left_border = max(0,bin-distance) right_border = min(len(data),bin+distance) to_plot = np.log(data[left_border:right_border,left_border:right_border])
import shutil shutil.rmtree(final_figs_dir) os.mkdir(final_figs_dir) assert len(all_result_files_points) > 0 data_dict = {} # a structure to keep chrms arrays data_dict_second_hmap = {} # a structure to keep chrms arrays if len( np.unique([ p[0] for p in points for points in all_result_files_points.values() ])) == 1: data_dict[points[0][0]] = get_chromosomes(hm_file, genome_db_contig, resolution, chrNumb=points[0][0]) data_dict_second_hmap[points[0][0]] = get_chromosomes(second_hm_file, genome_db_contig, resolution, chrNumb=points[0][0]) else: for chrm, array in enumerate( get_chromosomes(hm_file, genome_db_contig, resolution)): data_dict[chrm] = array for chrm, array in enumerate( get_chromosomes(second_hm_file, genome_db_contig, resolution)): data_dict_second_hmap[chrm] = array final_results = open(hm_file + ".FISH_regions.txt", "w") for results_file in all_result_files_points:
# The distance ordering is needed for both file headers and for every bin
# visited below, so it is sorted once here instead of once per bin.
distance_keys_sorted = sorted(distances.keys())
distances_header = "\t".join(map(str, distance_keys_sorted))

# Dump the criteria table; the header records which distance each column
# and which percentile each row refers to.
np.savetxt(results_file_name + ".criteias.txt",
           criteias,
           header="distances(columns): " + distances_header +
           " percentiles(rows): " + "\t".join(map(str, percentiles)))

# Left open on purpose: only the header row is written in this part of the
# script. NOTE(review): confirm the file is closed further down.
results_file = open(results_file_name, "w")
results_file.write("chr\tNucleotide\tMax_percentile\t" +
                   distances_header + "\n")

print("Calculating...")
result_counts = dict.fromkeys(percentiles, 0)  # one zeroed counter per percentile
for chr_numb, data in enumerate(get_chromosomes(hm_file, genome_db, resolution)):
    # Stay max_distance plus the chromosome-end margin away from both ends,
    # so that bin - dist and bin + dist are always valid indices.
    for bin in xrange(
            max_distance + min_distance_from_chrm_end_in_bins,
            len(data) - max_distance - min_distance_from_chrm_end_in_bins):
        # A percentile cannot exceed 100, so 101 marks "nothing recorded yet".
        max_bin_percentile = 101
        # True: left contacts > right contacts; False: left < right;
        # None: not defined yet.
        sign = None
        ratios = []
        for jnd, dist in enumerate(distance_keys_sorted):
            max_curren_percentile = 101
            left = data[bin][bin - dist]  # contacts on the left side of the bin
            right = data[bin][bin + dist]  # contacts on the right side of the bin
            if left * right == 0:
                # No contacts on at least one side: drop this bin entirely.
                # NOTE(review): the earlier comment said max_percentile
                # "will be -1", but the value set here is 101 -- confirm how
                # skipped bins are reported downstream.
                max_bin_percentile = 101
                break
genome_db = Genome(
    "/mnt/storage/home/vsfishman/HiC/fasta/" + genomeName,
    readChrms=["#", "X", "Y"])

# Input heatmap; the commented-out lines are alternative inputs.
#hm_file = "/mnt/storage/home/vsfishman/HiC/data/mESC/mapped-mm10/mm10/mESC-all-HindIII_refined.frag_res25k_hiRes.hm.IC.cis.hdf5"
hm_file = "/mnt/storage/home/vsfishman/HiC/data/mESC/mapped-mm10/mm10/mESC-all-HindIII_refined.frag_res25k_hiRes.hm.IC.cis.hdf5"
#hm_file = "/mnt/storage/home/vsfishman/HiC/data/mESC/mapped-mm10/mm10/mESC-all-HindIII_refined.frag_res10k_hiRes.hm.IC.cis.hdf5"
resolution = extractResolutionFromFileName(hm_file)

# Bin offsets at which the left and right contacts of a bin are compared.
distances = range(2, 11) + [30, 50]
# Bins within 100000 nt of either chromosome end are never used as centres.
min_distance_from_chrm_end_in_bins = 100000 / resolution

# result[dist] collects every larger/smaller contact ratio seen at that offset.
result = dict((d, []) for d in distances)

for data in get_chromosomes(hm_file, genome_db, resolution):
    n_bins = len(data)
    for dist in distances:
        # Keeps both bin - dist and bin + dist inside the usable range.
        margin = dist + min_distance_from_chrm_end_in_bins
        for bin in xrange(margin, n_bins - margin):
            left = data[bin][bin - dist]
            right = data[bin][bin + dist]
            # A bin with no contacts on either side has no defined ratio.
            if left * right != 0:
                ratio = max(left, right) / min(left, right)
                result[dist].append(ratio)

print("min\tmax\tmean\tmedian\n")

# Remove any statistics directory left over from a previous run.
stat_dir = hm_file + ".stat"
if os.path.exists(stat_dir):
    import shutil
    shutil.rmtree(stat_dir)
from mirnylib import h5dict from mirnylib import plotting from hiclib import binnedData from hiclib import fragmentHiC import math genome_db = genome.Genome( "/mnt/storage/home/vsfishman/HiC/fasta/GalGal5/GCF_000002315.4_Gallus_gallus-5.0_assembly_structure/Primary_Assembly/assembled_chromosomes/FASTA/", readChrms=[], chrmFileTemplate="%s.fna") hm_file = sys.argv[1] figure_path = "/mnt/storage/home/vsfishman/HiC/pics/" domain_res = extractResolutionFromFileName(hm_file) all_chrms = get_chromosomes(hm_file, genome_db, domain_res) chrms = range(genome_db.chrmCount) #numner of chrms in genome st = genome_db.chrmStartsBinCont #array of numbers of chrms start (in bins) end = genome_db.chrmEndsBinCont #array of numbers of chrms ends (in bins). Numer end[-1] is not in chromosome (this number is 1st bin of next chrm) def plot_one_chr_fragment(matrix, figure_path, chr_name, ch_start, ch_end): print "Plotting picture" ch_start = ch_start / domain_res ch_end = ch_end / domain_res # domain_st = domain_st / domain_res # domain_end = domain_end / domain_res i = genome_db.label2idx[chr_name] q2_0 = matrix[i] q2_0 = q2_0[ch_start:ch_end, ch_start:ch_end]
domains=args.domains.split(",") colors=args.colors.split(",") else: domains=[] colors=[] assert len(domains)==len(colors) domains=[np.genfromtxt(domain_file,dtype=np.dtype([('chrm','S10'),('start',np.uint32),('end',np.uint32)]),usecols = (0,1,2)) for domain_file in domains] resolution = extractResolutionFromFileName(hm_file) print "Using resolution ",resolution data_dict = {} # a structure to keep chrms arrays data_dict_second_hmap = {} # a structure to keep chrms arrays for chrm,array in enumerate(get_chromosomes(hm_file,genome_db,resolution)): data_dict[chrm] = array if args.reg == "all": args.reg = ",".join([i+":0-"+str(genome_db.chrmLens[genome_db.label2idx[i]]) for i in genome_db.chrmLabels if genome_db.chrmLens[genome_db.label2idx[i]] > resolution*10]) for i in args.reg.split(","): chrm = i.split(":")[0] nt_start,nt_end = map(int,i.split(":")[1].split("-")) title = "_".join(map(str,[chrm,nt_start,nt_end])) chrmLabel = chrm chrm = genome_db.label2idx[chrm] data = data_dict[chrm] left_border = nt_start/resolution right_border = nt_end/resolution