def test_04_chromosome_batch(self):
    """Check the TAD borders found when three experiments run in batch mode.

    The test is skipped when ``ONLY`` is set to anything other than ``"04"``.
    When ``CHKTIME`` is truthy the elapsed time is printed after the label
    ``4``.
    """
    if ONLY and ONLY != "04":
        return
    if CHKTIME:
        t0 = time()

    test_chr = Chromosome(
        name="Test Chromosome",
        experiment_resolutions=[20000] * 3,
        experiment_hic_data=[
            PATH + "/20Kb/chrT/chrT_A.tsv",
            PATH + "/20Kb/chrT/chrT_D.tsv",
            PATH + "/20Kb/chrT/chrT_C.tsv",
        ],
        experiment_names=["exp1", "exp2", "exp3"],
        silent=True,
    )
    test_chr.find_tad(
        ["exp1", "exp2", "exp3"], batch_mode=True, verbose=False, silent=True
    )

    # The batch run is stored under the concatenated experiment names.
    tads = test_chr.get_experiment("batch_exp1_exp2_exp3").tads
    # Keep only the end position of borders with a strictly positive score.
    entries = (tads[key] for key in tads)
    found = [tad["end"] for tad in entries if tad["score"] > 0]

    # With square root normalization the expected borders were instead:
    #   [3.0, 8.0, 16.0, 21.0, 28.0, 35.0, 43.0,
    #    49.0, 61.0, 66.0, 75.0, 89.0, 94.0, 99.0]
    expected = [3.0, 14.0, 19.0, 33.0, 43.0, 49.0,
                61.0, 66.0, 71.0, 89.0, 94.0, 99.0]
    self.assertEqual(expected, found)

    if CHKTIME:
        # Single-argument form prints the same text on Python 2 and 3.
        print("4 %s" % (time() - t0))
def test_04_chromosome_batch(self):
    """Check the TAD borders found when three experiments run in batch mode.

    The test is skipped when ``ONLY`` is set to anything other than ``'04'``.
    When ``CHKTIME`` is truthy the elapsed time is printed after the label
    ``4``.
    """
    if ONLY and ONLY != '04':
        return
    if CHKTIME:
        t0 = time()

    test_chr = Chromosome(name='Test Chromosome',
                          experiment_resolutions=[20000] * 3,
                          experiment_hic_data=[
                              PATH + '/20Kb/chrT/chrT_A.tsv',
                              PATH + '/20Kb/chrT/chrT_D.tsv',
                              PATH + '/20Kb/chrT/chrT_C.tsv'],
                          experiment_names=['exp1', 'exp2', 'exp3'],
                          silent=True)
    test_chr.find_tad(['exp1', 'exp2', 'exp3'], batch_mode=True,
                      verbose=False, silent=True)

    # The batch run is stored under the concatenated experiment names.
    tads = test_chr.get_experiment('batch_exp1_exp2_exp3').tads
    # Keep only the end position of borders with a strictly positive score.
    entries = (tads[key] for key in tads)
    found = [tad['end'] for tad in entries if tad['score'] > 0]

    # With square root normalization the expected borders were instead:
    #   [3.0, 8.0, 16.0, 21.0, 28.0, 35.0, 43.0,
    #    49.0, 61.0, 66.0, 75.0, 89.0, 94.0, 99.0]
    expected = [3.0, 14.0, 19.0, 33.0, 43.0, 49.0,
                61.0, 66.0, 71.0, 89.0, 94.0, 99.0]
    self.assertEqual(expected, found)

    if CHKTIME:
        # Single-argument form prints the same text on Python 2 and 3.
        print('4 %s' % (time() - t0))
def test_04_chromosome_batch(self):
    """Check the TAD borders found when three experiments run in batch mode.

    The test is skipped when ``ONLY`` is non-empty and does not contain
    ``"04"``.  When ``CHKTIME`` is truthy the elapsed time is printed after
    the label ``4``.
    """
    if ONLY and "04" not in ONLY:
        return
    if CHKTIME:
        t0 = time()

    test_chr = Chromosome(name="Test Chromosome",
                          experiment_resolutions=[20000] * 3,
                          experiment_hic_data=[
                              PATH + "/20Kb/chrT/chrT_A.tsv",
                              PATH + "/20Kb/chrT/chrT_D.tsv",
                              PATH + "/20Kb/chrT/chrT_C.tsv"
                          ],
                          experiment_names=["exp1", "exp2", "exp3"],
                          silent=True)
    test_chr.find_tad(["exp1", "exp2", "exp3"], batch_mode=True,
                      verbose=False, silent=True)

    # The batch run is stored under the concatenated experiment names.
    tads = test_chr.get_experiment("batch_exp1_exp2_exp3").tads
    # Keep only the end position of borders with a strictly positive score.
    entries = (tads[key] for key in tads)
    found = [tad["end"] for tad in entries if tad["score"] > 0]

    # With square root normalization the expected borders were instead:
    #   [3.0, 8.0, 16.0, 21.0, 28.0, 35.0, 43.0,
    #    49.0, 61.0, 66.0, 75.0, 89.0, 94.0, 99.0]
    expected = [3.0, 14.0, 19.0, 33.0, 43.0, 49.0,
                61.0, 66.0, 71.0, 89.0, 94.0, 99.0]
    self.assertEqual(expected, found)

    if CHKTIME:
        # Single-argument form prints the same text on Python 2 and 3.
        print("4 %s" % (time() - t0))
def main():
    """
    Cluster the genes of chromosome 19 and plot where the clusters fall
    relative to the TADs found in experiment 'exp1'.
    """
    # retrieve HOX genes
    distmatrix, geneids = get_genes()

    # compute TADs for human chromosome 19
    test_chr = Chromosome(name='Test Chromosome')
    test_chr.add_experiment(
        'exp1', 100000,
        xp_handler=PATH + 'HIC_gm06690_chr19_chr19_100000_obs.txt')
    test_chr.find_tad(['exp1'])
    exp = test_chr.experiments['exp1']

    # flat clusters from the hierarchical clustering of the distance matrix
    clust = linkage(distmatrix['19'])
    cl_idx = list(fcluster(clust, t=1, criterion='inconsistent'))
    n_clusters = max(cl_idx)
    # single-argument form prints the same text on Python 2 and 3
    print('%s clusters' % (n_clusters,))

    # cluster labels are 1-based: label j goes into slot j - 1
    cluster = [[] for _ in range(n_clusters)]
    for i, label in enumerate(cl_idx):
        cluster[label - 1].append(geneids['19'][i][1])

    # reduce each cluster to its (lowest, highest) member value
    cluster = [(min(members), max(members)) for members in cluster]

    tad_breaker(exp.tads, cluster, exp.resolution, show_plot=True, bins=5,
                title='Proportion of HOX genes according to position in a TAD')
def tb_generate_tads(self, expt_name, adj_list, chrom, resolution, normalized, tad_file):
    """
    Predict the TAD borders of one chromosome from a Hi-C adjacency list

    Parameters
    ----------
    expt_name : str
        Name under which the experiment is added to the Chromosome object
    adj_list : str
        Location of the adjacency list; also used as the prefix of the
        intermediate matrix file ``<adj_list>_<chrom>_tmp.txt``
    chrom
        Chromosome to extract from the Hi-C data; also used as the name of
        the Chromosome object
    resolution : int
        Resolution to read the Hi-C adjacency list at
    normalized : bool
        When this is exactly ``False`` the Hi-C data is normalized before
        the matrix is written
    tad_file : str
        Location of the output TAD file

    Returns
    -------
    bool
        Always True
    """
    print("TB TAD GENERATOR:", expt_name, adj_list, chrom, resolution, normalized, tad_file)

    bin_size = int(resolution)
    hic_data = load_hic_data_from_reads(adj_list, resolution=bin_size)

    if normalized is False:
        hic_data.normalize_hic(iterations=9, max_dev=0.1)

    # NOTE(review): the matrix is always written with normalized=True, even
    # when normalize_hic() was skipped above -- confirm this is intended.
    save_matrix_file = adj_list + "_" + str(chrom) + "_tmp.txt"
    hic_data.write_matrix(save_matrix_file, (chrom, chrom), normalized=True)

    # Only used for the debug print below.
    chr_hic_data = hic_data.get_matrix((chrom, chrom))
    print("TB - chr_hic_data:", chr_hic_data)

    my_chrom = Chromosome(name=chrom, centromere_search=True)
    my_chrom.add_experiment(expt_name, hic_data=save_matrix_file, resolution=bin_size)

    # Run core TADbit function to find TADs on each expt.
    my_chrom.find_tad(expt_name, n_cpus=15)

    # Borders go to a side file first and are then copied byte for byte into
    # tad_file; the side file is left in place.
    tmp_tad_file = tad_file + ".tmp"
    my_chrom.experiments[expt_name].write_tad_borders(savedata=tmp_tad_file)

    with open(tad_file, "wb") as f_out, open(tmp_tad_file, "rb") as f_in:
        f_out.write(f_in.read())

    return True
def test_04_chromosome_batch(self):
    """Check the TAD borders found when three experiments run in batch mode."""
    test_chr = Chromosome(
        name='Test Chromosome',
        experiment_resolutions=[20000] * 3,
        experiment_hic_data=['20Kb/chrT/chrT_A.tsv',
                             '20Kb/chrT/chrT_D.tsv',
                             '20Kb/chrT/chrT_C.tsv'],
        experiment_names=['exp1', 'exp2', 'exp3'])
    test_chr.find_tad(['exp1', 'exp2', 'exp3'], batch_mode=True,
                      verbose=False)

    # The batch run is stored under the concatenated experiment names.
    tads = test_chr.get_experiment('batch_exp1_exp2_exp3').tads
    # Keep only the end position of borders with a strictly positive score.
    entries = (tads[key] for key in tads)
    found = [tad['end'] for tad in entries if tad['score'] > 0]

    expected = [3.0, 8.0, 16.0, 21.0, 28.0, 35.0, 43.0,
                49.0, 61.0, 66.0, 75.0, 89.0, 99.0]
    self.assertEqual(expected, found)
def main():
    """
    main function

    Picks a few TADs at random from experiment 'exp1', generates randomly
    altered copies of each one and clusters the resulting matrices.
    """
    global DISTRA
    global DISTRD

    n_pick = 4
    n_tot = 10

    test_chr = Chromosome(name='Test Chromosome')
    test_chr.add_experiment(
        'exp1', 100000,
        xp_handler=PATH + 'HIC_gm06690_chr19_chr19_100000_obs.txt')
    test_chr.find_tad(['exp1'])

    # attach to each TAD of the experiment its own (non-normalized) matrix;
    # the experiment's TAD records themselves are modified
    exp_tads = test_chr.experiments['exp1'].tads
    real_tads = {}
    for idx, item in enumerate(test_chr.iter_tads('exp1', normed=False)):
        tad = exp_tads[idx]
        tad['hic'] = item[1]
        real_tads[idx] = tad

    DISTRA, DISTRD = get_hic_distr(real_tads)

    # pick some tads: distinct ones spanning at least 15 bins
    picked_tads = []
    picked_keys = []
    while len(picked_tads) < n_pick:
        key, new_tad = get_random_tad(real_tads)
        if key in picked_keys or new_tad['end'] - new_tad['start'] < 15:
            continue
        picked_tads.append(new_tad)
        picked_keys.append(key)

    # mutate these tads
    # NOTE(review): tad_names receives the '<letter>_0' originals while
    # tad_matrices only receives the mutated copies, so the two lists differ
    # in length -- confirm this is what paint_clustering expects.
    tads = {}
    tad_matrices = []
    tad_names = []
    for i, original in enumerate(picked_tads):
        print(i)
        letter = uppercase[i]
        base_name = letter + '_' + str(0)
        tads[base_name] = original
        tad_names.append(base_name)
        for j in range(1, n_tot):
            # drawn in this order: extension first, then indel
            ext = int(random() * 4) + 1
            indel = int(random() * 4) + 1
            hic, _indels = generate_random_contacts(
                tad1=original['hic'], prob=0.05, ext=ext, indel=indel)[1:]
            mutant_name = letter + '_' + str(j)
            tads[mutant_name] = {'hic': hic,
                                 'start': original['start'],
                                 'end': original['end']}
            tad_matrices.append(hic)
            tad_names.append(mutant_name)

    n_matrices = len(tad_matrices)
    distances, cci = get_distances(tad_matrices, max_num_v=4,
                                   n_cpus=mu.cpu_count())
    results, clusters = pre_cluster(distances, cci, n_matrices)
    paint_clustering(results, clusters, n_matrices, test_chr,
                     tad_names, tad_matrices)
def main():
    """
    Cluster the TADs of experiment 'exp1' and paint the result.

    TADs with a negative score, or spanning fewer than 10 bins, are left out.
    """
    test_chr = Chromosome(name='Test Chromosome')
    test_chr.add_experiment(
        'exp1', 100000,
        xp_handler=PATH + 'HIC_k562_chr19_chr19_100000_obs.txt')
    test_chr.find_tad(['exp1'])

    tad_names = []
    tad_matrices = []
    for name, matrix in test_chr.iter_tads('exp1'):
        tad = test_chr.experiments['exp1'].tads[name]
        if tad['score'] < 0 or tad['end'] - tad['start'] < 10:
            continue
        tad_names.append(name)
        tad_matrices.append(matrix)

    num = len(tad_names)
    # NOTE(review): the CPU count is passed as max_num_v here, not as a
    # number of CPUs -- confirm this is intended.
    distances, cci = get_distances(tad_matrices, max_num_v=mu.cpu_count())
    results, clusters = pre_cluster(distances, cci, num)
    paint_clustering(results, clusters, num, test_chr, tad_names)
def main():
    """
    Cluster the genes of chromosome 19 and plot where the clusters fall
    relative to the TADs found in experiment 'exp1'.
    """
    # retrieve HOX genes
    distmatrix, geneids = get_genes()

    # compute TADs for human chromosome 19
    test_chr = Chromosome(name='Test Chromosome')
    test_chr.add_experiment(
        'exp1', 100000,
        xp_handler=PATH + 'HIC_gm06690_chr19_chr19_100000_obs.txt')
    test_chr.find_tad(['exp1'])
    exp = test_chr.experiments['exp1']

    # flat clusters from the hierarchical clustering of the distance matrix
    clust = linkage(distmatrix['19'])
    cl_idx = list(fcluster(clust, t=1, criterion='inconsistent'))
    n_clusters = max(cl_idx)
    # single-argument form prints the same text on Python 2 and 3
    print('%s clusters' % (n_clusters,))

    # cluster labels are 1-based: label j goes into slot j - 1
    cluster = [[] for _ in range(n_clusters)]
    for i, label in enumerate(cl_idx):
        cluster[label - 1].append(geneids['19'][i][1])

    # reduce each cluster to its (lowest, highest) member value
    cluster = [(min(members), max(members)) for members in cluster]

    tad_breaker(exp.tads, cluster, exp.resolution, show_plot=True, bins=5,
                title='Proportion of HOX genes according to position in a TAD')
def process():
    """
    Find TADs in every matrix file listed in ``args``.

    For each file a border table (``.border``), a plot (``.pdf``) and the
    saved chromosome (``.tdb``) are written under a common output prefix.
    """
    # Output prefix: the explicit file name when given, else the base name
    # of the first input file.
    if options.outputFilename != "":
        out_name = options.outputFilename
    else:
        out_name = os.path.basename(args[0])
    out_prefix = options.outputDir + out_name

    for matrix_path in args:
        # Drop the extension and anything from ".matrix" on; the last
        # dot-separated field is the chromosome, the rest the sample name.
        stem = os.path.splitext(os.path.basename(matrix_path))[0]
        stem = stem.split(".matrix")[0]
        chrom_name = stem.rsplit(".", 1)[-1]
        sample = stem.rsplit(".", 1)[0]

        chrom = Chromosome(name=chrom_name,
                           centromere_search=True,
                           species=options.species,
                           assembly=options.assembly)
        chrom.set_max_tad_size(5000000)
        chrom.add_experiment(sample,
                             exp_type='Hi-C',
                             identifier=sample,
                             hic_data=matrix_path,
                             resolution=options.resolution)

        exp = chrom.experiments[sample]
        exp.normalize_hic(silent=True)
        chrom.find_tad(sample, n_cpus=options.threads,
                       normalized=True, verbose=False)

        exp.write_tad_borders(out_prefix + "." + chrom_name + ".border")
        # chrom.tad_density_plot(sample, savefig=<prefix>.density.<chr>.pdf)
        chrom_prefix = out_prefix + "chr." + chrom_name
        chrom.visualize(exp.name, paint_tads=True,
                        savefig=chrom_prefix + ".pdf")
        chrom.save_chromosome(chrom_prefix + ".tdb", force=True)
def generate_tads(self, chrom):
    """
    Uses TADbit to generate the TAD borders of one chromosome from its
    adjacency-list file and writes them to ``<experiment name>_tads.tsv``
    in the library directory
    """
    from pytadbit import Chromosome

    resolution = int(self.resolution)
    chrom_pair = str(chrom) + "-" + str(chrom)

    exptName = self.library + "_" + str(self.resolution) + "_" + chrom_pair
    fname = (self.parsed_reads_dir + '/adjlist_map_' + chrom_pair
             + '_' + str(self.resolution) + '.tsv')

    chr_hic_data = read_matrix(fname, resolution=resolution)

    my_chrom = Chromosome(name=exptName, centromere_search=True)
    my_chrom.add_experiment(exptName, hic_data=chr_hic_data,
                            resolution=resolution)

    # Run core TADbit function to find TADs on each expt.
    # For the current dataset required 61GB of RAM
    my_chrom.find_tad(exptName, n_cpus=15)

    tad_file = self.library_dir + exptName + '_tads.tsv'
    my_chrom.experiments[exptName].write_tad_borders(savedata=tad_file)
def process():
    """
    Find TADs in every matrix file listed in ``args``.

    For each file a border table (``.border``), a plot (``.pdf``) and the
    saved chromosome (``.tdb``) are written under a common output prefix.
    """
    # Output prefix: the explicit file name when given, else the base name
    # of the first input file.
    if options.outputFilename != "":
        out_name = options.outputFilename
    else:
        out_name = os.path.basename(args[0])
    out_prefix = options.outputDir + out_name

    for matrix_path in args:
        # Drop the extension and anything from ".matrix" on; the last
        # dot-separated field is the chromosome, the rest the sample name.
        stem = os.path.splitext(os.path.basename(matrix_path))[0]
        stem = stem.split(".matrix")[0]
        chrom_name = stem.rsplit(".", 1)[-1]
        sample = stem.rsplit(".", 1)[0]

        chrom = Chromosome(name=chrom_name,
                           centromere_search=True,
                           species=options.species,
                           assembly=options.assembly)
        chrom.set_max_tad_size(5000000)
        chrom.add_experiment(sample,
                             exp_type='Hi-C',
                             identifier=sample,
                             hic_data=matrix_path,
                             resolution=options.resolution)

        exp = chrom.experiments[sample]
        exp.normalize_hic(silent=True)
        chrom.find_tad(sample, n_cpus=options.threads,
                       normalized=True, verbose=False)

        exp.write_tad_borders(out_prefix + "." + chrom_name + ".border")
        # chrom.tad_density_plot(sample, savefig=<prefix>.density.<chr>.pdf)
        chrom_prefix = out_prefix + "chr." + chrom_name
        chrom.visualize(exp.name, paint_tads=True,
                        savefig=chrom_prefix + ".pdf")
        chrom.save_chromosome(chrom_prefix + ".tdb", force=True)
# Parsing input arguments
# NOTE(review): all five arguments are required positionals, so argparse
# never applies the `default` values below; they only show typical inputs.
parser = argparse.ArgumentParser()
parser.add_argument('path_input_raw', type=str, help='Input maps raw',
                    default='RESOLUTION/Ecoli_20M_3kb_rep2_raw.tab')
parser.add_argument('path_input_norm', type=str,
                    help='Input maps normalized',
                    default='RESOLUTION/Ecoli_20M_3kb_rep2.tab')
parser.add_argument('path_output', type=str,
                    help='Output folder for bed files',
                    default='RESOLUTION/Tadbit/Ecoli_20M_3kb_rep1_final')
parser.add_argument('restictase', type=str, help='Restrictase',
                    default='HpaII')
parser.add_argument('resolution', type=int, help='Resolution of map')
args = parser.parse_args()

path_input_raw = args.path_input_raw
path_input_norm = args.path_input_norm
path_output = args.path_output
restrictase = args.restictase
resolution = args.resolution

# Find TADs on a single experiment holding the raw and normalized maps.
exp_name = restrictase + '1_stat'
my_chrom = Chromosome(name='1', centromere_search=False)
my_chrom.add_experiment(exp_name, resolution=resolution,
                        hic_data=path_input_raw,
                        norm_data=path_input_norm, enzyme=restrictase)
exp = my_chrom.experiments[exp_name]
my_chrom.find_tad([exp_name], verbose=True, batch_mode=False)

# The borders go through a scratch file in the working directory.
exp.write_tad_borders(density=True, savedata='tmp.txt', normalized=False)

# Convert the border table into a three-column, header-less table.
data = pd.read_csv('tmp.txt', sep='\t')
data['start'] -= 1
data *= resolution  # scales every column; only start/end are kept below
# keep TADs spanning at least four bins
data = data[data['end'] - data['start'] >= resolution * 4]
data['ix'] = 'chr1'
data = data[['ix', 'start', 'end']]
data.to_csv(path_output, sep='\t', header=False, index=False)
def call_tads(matrix_filenames, chrom_name):
    """
    Call TADs for one chromosome and write them in three formats.

    With several matrices the experiments are processed in batch mode and
    the combined 'batch_<exp>_<exp>...' experiment is used; with a single
    matrix that experiment is used directly.  The borders are written as a
    TADbit text file, converted to a BED file, and the chromosome is saved
    as a .tdb file.

    Relies on names defined outside this function: txt_directory,
    bed_directory, tdb_directory, matrix_resolution, thread_number and
    filename_list (the BED file name is appended to it).

    :param matrix_filenames: list of contact matrix files of the chromosome
    :param chrom_name: name of the chromosome, used in the output files
    """
    # Single-argument print calls give the same output on Python 2 and 3.
    print('')
    print("Call TADs for chromosome " + chrom_name + '...')
    print("Contact matrices: ")
    for matrix_filename in matrix_filenames:
        print(matrix_filename)

    # Single-digit chromosomes get a zero-padded id ('chr0N'); any other
    # name, including one with no digit/X/Y in it, is used unchanged.
    match = search(r'\d+|X|Y', chrom_name)
    chrom_number = match.group(0) if match else ''
    if len(chrom_number) == 1 and chrom_number not in ('X', 'Y'):
        chrom_id = 'chr' + '0' + chrom_number
    else:
        chrom_id = chrom_name

    output_txt_filename = join(txt_directory, chrom_id + '_TADs.txt')
    print('Output TXT file: ' + output_txt_filename)
    output_bed_filename = join(bed_directory, chrom_id + '_TADs.bed')
    print('Output BED file: ' + output_bed_filename)
    filename_list.append(output_bed_filename)
    # Disabled plot outputs:
    #   2D TAD plot -> join(png_directory, chrom_id + '_TADs_2D.png')
    #   1D TAD plot -> join(png_directory, chrom_id + '_TADs_1D.png')

    # Call TADs and write their borders in TADbit text format and in BED format
    chrom = Chromosome(name=chrom_name)
    if len(matrix_filenames) > 1:
        # several matrices for one chromosome
        experiment_names = []
        for matrix_index, matrix_filename in enumerate(matrix_filenames):
            experiment_name = (splitext(basename(matrix_filename))[0]
                               + '_' + str(matrix_index))
            experiment_names.append(experiment_name)
            chrom.add_experiment(experiment_name, hic_data=matrix_filename,
                                 resolution=matrix_resolution)
        result_name = '_'.join(['batch'] + experiment_names)
        chrom.find_tad(experiment_names, batch_mode=True,
                       n_cpus=thread_number)
    else:
        # only one matrix for one chromosome
        matrix_filename = matrix_filenames[0]
        result_name = splitext(basename(matrix_filename))[0]
        chrom.add_experiment(result_name, hic_data=matrix_filename,
                             resolution=matrix_resolution)
        chrom.find_tad(result_name, n_cpus=thread_number)
    chrom.experiments[result_name].write_tad_borders(
        savedata=output_txt_filename)
    # chrom.visualize(result_name, paint_tads=True, savefig=tads_2D_filename)
    # chrom.tad_density_plot(result_name, savefig=tads_1D_filename)

    with open(output_txt_filename, 'r') as src, \
            open(output_bed_filename, 'w') as dst:
        track_line = ('track name="' + chrom_name
                      + '_TADs" visibility=1 itemRgb="On"')
        dst.write(track_line + '\n')
        for i, line in enumerate(src):
            fields = line.split()
            # skip blank lines and the '#' header line
            if not fields or fields[0] == '#':
                continue
            tad_name = chrom_name + '.' + 'TAD' + '.' + str(i)
            # Coordinates in BED format are 0-based,
            # and a region is presented by [x,y) interval.
            start_pos = str((int(fields[1]) - 1) * matrix_resolution)
            end_pos = str(int(fields[2]) * matrix_resolution)
            # alternate colours by line number: odd blue, even red
            color = '0,0,255' if i % 2 else '255,0,0'
            # score ('0') and strand ('.') are placeholders
            bed_fields = [chrom_name, start_pos, end_pos, tad_name,
                          '0', '.', start_pos, end_pos, color]
            dst.write('\t'.join(bed_fields) + '\n')

    chrom_filename = join(tdb_directory, chrom_id + '.tdb')
    chrom.save_chromosome(chrom_filename, force=True)
    print('Finish.')
def call_tads(matrix_filenames, chrom_name):
    """
    Call TADs for one chromosome and write them in three formats.

    With several matrices the experiments are processed in batch mode and
    the combined 'batch_<exp>_<exp>...' experiment is used; with a single
    matrix that experiment is used directly.  The borders are written as a
    TADbit text file, converted to a BED file, and the chromosome is saved
    as a .tdb file.

    Relies on names defined outside this function: txt_directory,
    bed_directory, tdb_directory, matrix_resolution, thread_number and
    filename_list (the BED file name is appended to it).

    :param matrix_filenames: list of contact matrix files of the chromosome
    :param chrom_name: name of the chromosome, used in the output files
    """
    # Single-argument print calls give the same output on Python 2 and 3.
    print('')
    print("Call TADs for chromosome " + chrom_name + '...')
    print("Contact matrices: ")
    for matrix_filename in matrix_filenames:
        print(matrix_filename)

    # Single-digit chromosomes get a zero-padded id ('chr0N'); any other
    # name, including one with no digit/X/Y in it, is used unchanged.
    match = search(r'\d+|X|Y', chrom_name)
    chrom_number = match.group(0) if match else ''
    if len(chrom_number) == 1 and chrom_number not in ('X', 'Y'):
        chrom_id = 'chr' + '0' + chrom_number
    else:
        chrom_id = chrom_name

    output_txt_filename = join(txt_directory, chrom_id + '_TADs.txt')
    print('Output TXT file: ' + output_txt_filename)
    output_bed_filename = join(bed_directory, chrom_id + '_TADs.bed')
    print('Output BED file: ' + output_bed_filename)
    filename_list.append(output_bed_filename)
    # Disabled plot outputs:
    #   2D TAD plot -> join(png_directory, chrom_id + '_TADs_2D.png')
    #   1D TAD plot -> join(png_directory, chrom_id + '_TADs_1D.png')

    # Call TADs and write their borders in TADbit text format and in BED format
    chrom = Chromosome(name=chrom_name)
    if len(matrix_filenames) > 1:
        # several matrices for one chromosome
        experiment_names = []
        for matrix_index, matrix_filename in enumerate(matrix_filenames):
            experiment_name = (splitext(basename(matrix_filename))[0]
                               + '_' + str(matrix_index))
            experiment_names.append(experiment_name)
            chrom.add_experiment(experiment_name, hic_data=matrix_filename,
                                 resolution=matrix_resolution)
        result_name = '_'.join(['batch'] + experiment_names)
        chrom.find_tad(experiment_names, batch_mode=True,
                       n_cpus=thread_number)
    else:
        # only one matrix for one chromosome
        matrix_filename = matrix_filenames[0]
        result_name = splitext(basename(matrix_filename))[0]
        chrom.add_experiment(result_name, hic_data=matrix_filename,
                             resolution=matrix_resolution)
        chrom.find_tad(result_name, n_cpus=thread_number)
    chrom.experiments[result_name].write_tad_borders(
        savedata=output_txt_filename)
    # chrom.visualize(result_name, paint_tads=True, savefig=tads_2D_filename)
    # chrom.tad_density_plot(result_name, savefig=tads_1D_filename)

    with open(output_txt_filename, 'r') as src, \
            open(output_bed_filename, 'w') as dst:
        track_line = ('track name="' + chrom_name
                      + '_TADs" visibility=1 itemRgb="On"')
        dst.write(track_line + '\n')
        for i, line in enumerate(src):
            fields = line.split()
            # skip blank lines and the '#' header line
            if not fields or fields[0] == '#':
                continue
            tad_name = chrom_name + '.' + 'TAD' + '.' + str(i)
            # Coordinates in BED format are 0-based,
            # and a region is presented by [x,y) interval.
            start_pos = str((int(fields[1]) - 1) * matrix_resolution)
            end_pos = str(int(fields[2]) * matrix_resolution)
            # alternate colours by line number: odd blue, even red
            color = '0,0,255' if i % 2 else '255,0,0'
            # score ('0') and strand ('.') are placeholders
            bed_fields = [chrom_name, start_pos, end_pos, tad_name,
                          '0', '.', start_pos, end_pos, color]
            dst.write('\t'.join(bed_fields) + '\n')

    chrom_filename = join(tdb_directory, chrom_id + '.tdb')
    chrom.save_chromosome(chrom_filename, force=True)
    print('Finish.')
02 Jul 2013 script that follows Tadbit tutorial presented in the documentation """ from pytadbit import Chromosome # initiate a chromosome object that will store all Hi-C data and analysis my_chrom = Chromosome(name='My fisrt chromsome') # load Hi-C data my_chrom.add_experiment('First Hi-C experiment', xp_handler="sample_data/HIC_k562_chr19_chr19_100000_obs.txt", resolution=100000) my_chrom.add_experiment('Second Hi-C experiment', xp_handler="sample_data/HIC_gm06690_chr19_chr19_100000_obs.txt", resolution=100000) # run core tadbit function to find TADs, on each experiment my_chrom.find_tad('First Hi-C experiment' , n_cpus=8, verbose=False) my_chrom.find_tad('Second Hi-C experiment', n_cpus=8, verbose=False) print my_chrom.experiments my_chrom.align_experiments(names=["First Hi-C experiment", "Second Hi-C experiment"]) print my_chrom.alignment ali = my_chrom.alignment[('First Hi-C experiment', 'Second Hi-C experiment')] print ali.write_alignment(ftype='html') score, pval = my_chrom.align_experiments(randomize=True, rnd_num=1000)
example run: python2 script_TADbit.py infile.txt outfile.txt S2 chr2L 20000 8 From examples folder: ~/anaconda3/envs/tadbit/bin/python ../TADselect/script_TADbit.py ../data/test_S2.20000.chr2L.txt tmp/tadbit_output.txt S2 chr2L 20000 8 """ from sys import argv infile = argv[1] # txt matrix output = argv[2] exp = argv[3] ch = argv[4] resolution = int(argv[5]) # in bp nth = int(argv[6]) # 8 from pytadbit import Chromosome my_chrom = Chromosome(name=ch, centromere_search=False) my_chrom.add_experiment(exp, exp_type='Hi-C', identifier=exp, hic_data=infile, resolution=resolution) my_chrom.find_tad(exp, n_cpus=nth) experiment = my_chrom.experiments[exp] experiment.write_tad_borders(savedata=output, density=True)