def test_19_matrix_manip(self):
    """
    Check matrix export/import, insert sizes and matrix correlations.

    The test is skipped when ONLY is set and does not contain "19". It
    reads "lala-map~" (presumably left by an earlier test -- not visible
    here), removes every "lala*" file at the end, and prints the elapsed
    time when CHKTIME is set.
    """
    if ONLY and "19" not in ONLY:
        return
    if CHKTIME:
        t0 = time()

    # genome-wide map first, then one output per chromosome / chromosome pair
    from_reads = load_hic_data_from_reads("lala-map~", resolution=10000)
    hic_map(from_reads, savedata="lala-map.tsv~", savefig="lala.pdf")
    for partition, fig_dir in (("intra", "lalalo~"), ("inter", "lalala~")):
        hic_map(from_reads, by_chrom=partition, savedata="lala-maps~",
                savefig=fig_dir)

    # slowest part of the whole test: the saved matrix has to load back
    # into an object equal to the one it was written from
    from_matrix = read_matrix("lala-map.tsv~", resolution=10000)
    self.assertEqual(from_reads, from_matrix)

    # NOTE: the plot_distance_vs_interactions check is disabled in this
    # version of the test.

    size_a, size_b = insert_sizes("lala-map~")
    self.assertEqual([int(size_a), int(size_b)], [43, 1033])

    matrix_a = read_matrix(PATH + "/20Kb/chrT/chrT_A.tsv", resolution=20000)
    matrix_b = read_matrix(PATH + "/20Kb/chrT/chrT_B.tsv", resolution=20000)

    # only the first element returned by correlate_matrices is compared
    expected_corr = [0.755, 0.729, 0.804, 0.761, 0.789,
                     0.776, 0.828, 0.757, 0.797, 0.832]
    corr = correlate_matrices(matrix_a, matrix_b)
    self.assertEqual([round(val, 3) for val in corr[0]], expected_corr)

    # the rows returned by eig_correlate_matrices are chained with "+"
    expected_ecorr = [
        0.997, 0.322, 0.442, 0.017, 0.243, 0.014,
        0.321, 0.999, 0.01, 0.006, 0.0, 0.007,
        0.451, 0.012, 0.996, 0.031, 0.013, 0.004,
        0.002, 0.006, 0.029, 0.974, 0.076, 0.03,
        0.219, 0.013, 0.031, 0.08, 0.974, 0.018,
        0.028, 0.004, 0.0, 0.028, 0.034, 0.89,
    ]
    ecorr = eig_correlate_matrices(matrix_a, matrix_b, savefig='lala3.pdf')
    chained = reduce(lambda left, right: left + right, ecorr)
    self.assertEqual([round(val, 3) for val in chained], expected_ecorr)

    system("rm -rf lala*")
    if CHKTIME:
        self.assertEqual(True, True)
        print(" ".join(("19", str(time() - t0))))
def test_19_matrix_manip(self):
    """
    Check matrix export/import, the distance-vs-interactions values,
    insert sizes and matrix correlations.

    The test is skipped when ONLY is set to something other than '19'. It
    reads 'lala-map~' (presumably left by an earlier test -- not visible
    here), removes every 'lala*' file at the end, and prints the elapsed
    time when CHKTIME is set.
    """
    if ONLY and ONLY != '19':
        return
    if CHKTIME:
        t0 = time()

    # genome-wide map first, then one output per chromosome / chromosome pair
    from_reads = load_hic_data_from_reads('lala-map~', resolution=10000)
    hic_map(from_reads, savedata='lala-map.tsv~', savefig='lala.pdf~')
    for partition, fig_dir in (('intra', 'lalalo~'), ('inter', 'lalala~')):
        hic_map(from_reads, by_chrom=partition, savedata='lala-maps~',
                savefig=fig_dir)

    # slowest part of the whole test: the saved matrix has to load back
    # into an object equal to the one it was written from
    from_matrix = read_matrix('lala-map.tsv~', resolution=10000)
    self.assertEqual(from_reads, from_matrix)

    # chain the returned groups of values; NaN is compared as 0.0
    expected_vals = [-1.68, -2.08, 0.02, 2.76, -8.99, 0.0, 0.82, -6.8, 0.0]
    vals = plot_distance_vs_interactions(from_reads)
    flat_vals = reduce(lambda left, right: left + right, vals)
    self.assertEqual([0.0 if str(val) == 'nan' else round(val, 2)
                      for val in flat_vals],
                     expected_vals)

    size_a, size_b = insert_sizes('lala-map~')
    self.assertEqual([int(size_a), int(size_b)], [43, 1033])

    matrix_a = read_matrix('20Kb/chrT/chrT_A.tsv', resolution=20000)
    matrix_b = read_matrix('20Kb/chrT/chrT_B.tsv', resolution=20000)

    # only the first element returned by correlate_matrices is compared
    expected_corr = [0.755, 0.729, 0.804, 0.761, 0.789,
                     0.776, 0.828, 0.757, 0.797, 0.832]
    corr = correlate_matrices(matrix_a, matrix_b)
    self.assertEqual([round(val, 3) for val in corr[0]], expected_corr)

    # the rows returned by eig_correlate_matrices are chained with "+"
    expected_ecorr = [
        0.997, 0.322, 0.442, 0.017, 0.243, 0.014,
        0.321, 0.999, 0.01, 0.006, 0.0, 0.007,
        0.451, 0.012, 0.996, 0.031, 0.013, 0.004,
        0.002, 0.006, 0.029, 0.974, 0.076, 0.03,
        0.219, 0.013, 0.031, 0.08, 0.974, 0.018,
        0.028, 0.004, 0.0, 0.028, 0.034, 0.89,
    ]
    ecorr = eig_correlate_matrices(matrix_a, matrix_b)
    chained = reduce(lambda left, right: left + right, ecorr)
    self.assertEqual([round(val, 3) for val in chained], expected_ecorr)

    system('rm -rf lala*')
    if CHKTIME:
        self.assertEqual(True, True)
        print(' '.join(('19', str(time() - t0))))
def load_hic_read_data(self):
    """
    Load the filtered interactions into ``self.hic_data``.

    The reads are taken from 'filtered_map.tsv' inside
    ``self.parsed_reads_dir`` and binned at ``int(self.resolution)``. The
    path of the file is printed before loading.

    This is meant to be the primary way of loading the Hi-C data, so that
    later steps get it in the form they expect; steps such as TAD calling
    also need the non-normalised data.
    """
    filter_reads = self.parsed_reads_dir + '/filtered_map.tsv'
    print("\nfilter_reads: " + filter_reads)

    bin_size = int(self.resolution)
    self.hic_data = load_hic_data_from_reads(filter_reads,
                                             resolution=bin_size)
def test_19_matrix_manip(self):
    """
    Check matrix export/import, the distance-vs-interactions values,
    insert sizes and matrix correlations.

    The test is skipped when ONLY is set to something other than '19'. It
    reads 'lala-map~' (presumably left by an earlier test -- not visible
    here), removes every 'lala*' file at the end, and prints the elapsed
    time when CHKTIME is set.
    """
    if ONLY and ONLY != '19':
        return
    if CHKTIME:
        t0 = time()

    # genome-wide map first, then one output per chromosome / chromosome pair
    from_reads = load_hic_data_from_reads('lala-map~', resolution=10000)
    hic_map(from_reads, savedata='lala-map.tsv~', savefig='lala.pdf~')
    for partition, fig_dir in (('intra', 'lalalo~'), ('inter', 'lalala~')):
        hic_map(from_reads, by_chrom=partition, savedata='lala-maps~',
                savefig=fig_dir)

    # slowest part of the whole test: the saved matrix has to load back
    # into an object equal to the one it was written from
    from_matrix = read_matrix('lala-map.tsv~', resolution=10000)
    self.assertEqual(from_reads, from_matrix)

    # chain the returned groups of values; NaN is compared as 0.0
    expected_vals = [-1.74, 4.2, 0.52, 1.82, -0.44, 0.0, -0.5, 2.95, 0.0]
    vals = plot_distance_vs_interactions(from_reads)
    flat_vals = reduce(lambda left, right: left + right, vals)
    self.assertEqual([0.0 if str(val) == 'nan' else round(val, 2)
                      for val in flat_vals],
                     expected_vals)

    size_a, size_b = insert_sizes('lala-map~')
    self.assertEqual([int(size_a), int(size_b)], [43, 1033])

    matrix_a = read_matrix('20Kb/chrT/chrT_A.tsv', resolution=20000)
    matrix_b = read_matrix('20Kb/chrT/chrT_B.tsv', resolution=20000)

    # only the first element returned by correlate_matrices is compared
    expected_corr = [0.755, 0.729, 0.804, 0.761, 0.789,
                     0.776, 0.828, 0.757, 0.797, 0.832]
    corr = correlate_matrices(matrix_a, matrix_b)
    self.assertEqual([round(val, 3) for val in corr[0]], expected_corr)

    # the rows returned by eig_correlate_matrices are chained with "+"
    expected_ecorr = [
        0.997, 0.322, 0.442, 0.017, 0.243, 0.014,
        0.321, 0.999, 0.01, 0.006, 0.0, 0.007,
        0.451, 0.012, 0.996, 0.031, 0.013, 0.004,
        0.002, 0.006, 0.029, 0.974, 0.076, 0.03,
        0.219, 0.013, 0.031, 0.08, 0.974, 0.018,
        0.028, 0.004, 0.0, 0.028, 0.034, 0.89,
    ]
    ecorr = eig_correlate_matrices(matrix_a, matrix_b)
    chained = reduce(lambda left, right: left + right, ecorr)
    self.assertEqual([round(val, 3) for val in chained], expected_ecorr)

    system('rm -rf lala*')
    if CHKTIME:
        self.assertEqual(True, True)
        print(' '.join(('19', str(time() - t0))))
def hic_map(data, resolution=None, normalized=False, masked=None,
            by_chrom=False, savefig=None, show=False, savedata=None,
            focus=None, clim=None, cmap='jet', pdf=False, decay=True,
            perc=10, name=None, decay_resolution=None, **kwargs):
    """
    Save and/or draw, chromosome by chromosome, the interaction matrices of
    a Hi-C data object.

    NOTE(review): only the by_chrom branch is present in this copy of the
    function; when by_chrom is false nothing is written or drawn here.

    :param data: either a path to a file with pre-processed reads (filtered
       or not), loaded with load_hic_data_from_reads, or a Hi-C-data object
    :param None resolution: at which to bin the data when 'data' is a path
       (try having a dense matrix with < 10% of cells with zero interaction
       counts). Otherwise the resolution of the Hi-C-data object is used.
    :param False normalized: use normalized data, based on precalculated
       biases
    :param None masked: columns to be removed, usually because of too few
       interactions. Defaults to the 'bads' of the Hi-C-data object.
    :param False by_chrom: how to partition the output:

        * 'intra': one output per chromosome
        * 'inter': one output per pair of different chromosomes
        * any other true value (e.g. 'all'): both of the above

    :param None savefig: directory where to store the output images (one
       file per chromosome or pair of chromosomes). Created with mkdir.
    :param None savedata: directory where to store the output matrices (one
       '.mat' file per chromosome or pair of chromosomes). Created with
       mkdir.
    :param None focus: incompatible with by_chrom, an Exception is raised if
       both are given
    :param True decay: turned off with a warning when 'data' is a path and
       get_sections=False is passed. Not used by the by_chrom branch.
    :param False force_image: (keyword) draw the image even if the matrix
       has more than 10000 rows
    :param None clim: cutoff for the upper and lower bound in the coloring
       scale of the heatmap
    :param False pdf: store images as 'pdf' instead of 'png'
    :param 'jet' cmap: color map to be used for the heatmap
    :param 10 perc: passed as is to draw_map
    :param None name: passed as is to draw_map
    :param None decay_resolution: chromatin fragment size to consider when
       calculating decay of the number of interactions with genomic
       distance. Default is equal to resolution of the matrix.

    :raises Exception: if both focus and by_chrom are used
    """
    if isinstance(data, str):
        data = load_hic_data_from_reads(data, resolution=resolution, **kwargs)
        if not kwargs.get('get_sections', True) and decay:
            warn('WARNING: not decay not available when get_sections is off.')
            decay = False
    hic_data = data
    resolution = data.resolution
    if not decay_resolution:
        decay_resolution = resolution
    if hic_data.bads and not masked:
        masked = hic_data.bads
    # save and draw the data
    if by_chrom:
        if focus:
            raise Exception('Incompatible options focus and by_chrom\n')
        if savedata:
            mkdir(savedata)
        if savefig:
            mkdir(savefig)
        chrom_names = list(hic_data.chromosomes)
        for pos, crm1 in enumerate(chrom_names):
            # only walk the upper triangle of chromosome pairs
            for crm2 in chrom_names[pos:]:
                if by_chrom == 'intra' and crm1 != crm2:
                    continue
                if by_chrom == 'inter' and crm1 == crm2:
                    continue
                try:
                    subdata = hic_data.get_matrix(focus=(crm1, crm2),
                                                  normalized=normalized)
                    start1, _ = hic_data.section_pos[crm1]
                    start2, _ = hic_data.section_pos[crm2]
                    masked1 = {}
                    masked2 = {}
                    # NOTE(review): 'focus' is always false at this point
                    # (it raises above), so no masking is ever applied here.
                    # Kept as is: enabling it would change the output.
                    if focus and hic_data.bads:
                        # rescale masked
                        masked1 = dict((m - start1, hic_data.bads[m])
                                       for m in hic_data.bads)
                        masked2 = dict((m - start2, hic_data.bads[m])
                                       for m in hic_data.bads)
                    if masked1 or masked2:
                        size = len(subdata)
                        for row in xrange(size):
                            if row in masked1:
                                subdata[row] = [float('nan')
                                                for _ in xrange(size)]
                            for col in xrange(size):
                                if col in masked2:
                                    subdata[row][col] = float('nan')
                    # a single name for intra-chromosomal outputs
                    pair_tag = '_'.join(set((crm1, crm2)))
                    if savedata:
                        hic_data.write_matrix(
                            '%s/%s.mat' % (savedata, pair_tag),
                            focus=(crm1, crm2), normalized=normalized)
                    if show or savefig:
                        if (len(subdata) > 10000
                                and not kwargs.get('force_image', False)):
                            warn('WARNING: Matrix image not created, more than '
                                 '10000 rows, use a lower resolution to create images')
                            continue
                        # without savefig there is no file to write: hand
                        # None over instead of a bogus 'None/...' path
                        fig_path = None
                        if savefig:
                            fig_path = '%s/%s.%s' % (savefig, pair_tag,
                                                     'pdf' if pdf else 'png')
                        draw_map(subdata,
                                 OrderedDict((k, hic_data.chromosomes[k])
                                             for k in chrom_names
                                             if k in (crm1, crm2)),
                                 hic_data.section_pos, fig_path, show,
                                 one=True, clim=clim, cmap=cmap,
                                 decay_resolution=decay_resolution, perc=perc,
                                 name=name, cistrans=float('NaN'))
                except ValueError as e:
                    print('Value ERROR: problem with chromosome %s' % crm1)
                    print(str(e))
                except IndexError as e:
                    print('Index ERROR: problem with chromosome %s' % crm1)
                    print(str(e))
def hic_map(data, resolution=None, normalized=False, masked=None,
            by_chrom=False, savefig=None, show=False, savedata=None,
            focus=None, clim=None, cmap='jet', pdf=False, decay=True,
            perc=10, name=None, decay_resolution=None, **kwargs):
    """
    Save and/or draw the interaction matrix of a Hi-C data object, either
    as a whole or partitioned by chromosome.

    :param data: either a path to a file with pre-processed reads (filtered
       or not), loaded with load_hic_data_from_reads, or a Hi-C-data object
    :param None resolution: at which to bin the data when 'data' is a path
       (try having a dense matrix with < 10% of cells with zero interaction
       counts). Otherwise the resolution of the Hi-C-data object is used.
    :param False normalized: use normalized data, based on precalculated
       biases
    :param None masked: columns to be blanked (set to NaN) in the drawn
       matrix, usually because of too few interactions. Defaults to the
       'bads' of the Hi-C-data object. Only used when by_chrom is off.
    :param False by_chrom: how to partition the output:

        * 'intra': one output per chromosome
        * 'inter': one output per pair of different chromosomes
        * any other true value (e.g. 'all'): both of the above

    :param None savefig: path where to store the output image. If the
       by_chrom option is used, savefig is the directory that will contain
       the images (created if missing).
    :param None savedata: path where to store the output matrix. If the
       by_chrom option is used, savedata is the directory that will contain
       the '.mat' files (created if missing).
    :param None focus: passed to get_matrix/write_matrix to select a
       sub-matrix: two numbers (i.e.: (1, 100)) for start and end along the
       diagonal, a chromosome name, or two chromosome names. Incompatible
       with by_chrom.
    :param True decay: plot the correlation between genomic distance and
       interactions (usually a decay). Turned off with a warning when
       'data' is a path and get_sections=False is passed. Only forwarded to
       draw_map when by_chrom is off.
    :param None clim: cutoff for the upper and lower bound in the coloring
       scale of the heatmap
    :param False pdf: with by_chrom, store images as 'pdf' instead of 'png'
    :param 'jet' cmap: color map to be used for the heatmap
    :param 10 perc: passed as is to draw_map
    :param None name: passed as is to draw_map
    :param None decay_resolution: chromatin fragment size to consider when
       calculating decay of the number of interactions with genomic
       distance. Default is equal to resolution of the matrix.
    :param kwargs: forwarded to load_hic_data_from_reads when 'data' is a
       path; 'max_diff', 'exclude', 'diagonal', 'equals' and 'verbose' are
       also read when drawing the whole matrix.

    :raises Exception: if both focus and by_chrom are used
    """
    if isinstance(data, str):
        data = load_hic_data_from_reads(data, resolution=resolution, **kwargs)
        if not kwargs.get('get_sections', True) and decay:
            warn('WARNING: not decay not available when get_sections is off.')
            decay = False
    hic_data = data
    resolution = data.resolution
    if not decay_resolution:
        decay_resolution = resolution
    if hic_data.bads and not masked:
        masked = hic_data.bads
    # save and draw the data
    if by_chrom:
        if focus:
            raise Exception('Incompatible options focus and by_chrom\n')
        # both output directories are needed (not only the first one given),
        # and none at all when the maps are only shown
        for out_dir in (savedata, savefig):
            if out_dir and not os.path.isdir(out_dir):
                os.makedirs(out_dir)
        chrom_names = list(hic_data.chromosomes)
        for pos, crm1 in enumerate(chrom_names):
            # only walk the upper triangle of chromosome pairs
            for crm2 in chrom_names[pos:]:
                if by_chrom == 'intra' and crm1 != crm2:
                    continue
                if by_chrom == 'inter' and crm1 == crm2:
                    continue
                subdata = hic_data.get_matrix(focus=(crm1, crm2),
                                              normalized=normalized)
                # a single name for intra-chromosomal outputs
                pair_tag = '_'.join(set((crm1, crm2)))
                if savedata:
                    hic_data.write_matrix('%s/%s.mat' % (savedata, pair_tag),
                                          focus=(crm1, crm2),
                                          normalized=normalized)
                if show or savefig:
                    # without savefig there is no file to write: hand None
                    # over instead of a bogus 'None/...' path
                    fig_path = None
                    if savefig:
                        fig_path = '%s/%s.%s' % (savefig, pair_tag,
                                                 'pdf' if pdf else 'png')
                    draw_map(subdata,
                             OrderedDict((k, hic_data.chromosomes[k])
                                         for k in chrom_names
                                         if k in (crm1, crm2)),
                             hic_data.section_pos, fig_path, show,
                             one=True, clim=clim, cmap=cmap,
                             decay_resolution=decay_resolution, perc=perc,
                             name=name, cistrans=float('NaN'))
    else:
        if savedata:
            hic_data.write_matrix(savedata, focus=focus,
                                  normalized=normalized)
        if show or savefig:
            subdata = hic_data.get_matrix(focus=focus, normalized=normalized)
            if focus and masked:
                # rescale masked
                # NOTE(review): assumes a numeric focus (start, end); a
                # chromosome name would fail here -- confirm with callers
                masked = dict((m - focus[0], masked[m]) for m in masked)
            if masked:
                indices = range(len(subdata))
                for row in indices:
                    if row in masked:
                        subdata[row] = [float('nan') for _ in indices]
                    for col in indices:
                        if col in masked:
                            subdata[row][col] = float('nan')
            if focus:
                cistrans = float('NaN')
            else:
                # 'normalized' is a named parameter and can never be found
                # in kwargs: pass it directly so that the ratio is computed
                # on the same kind of data as the one drawn
                cistrans = hic_data.cis_trans_ratio(
                    normalized,
                    kwargs.get('exclude', None),
                    kwargs.get('diagonal', True),
                    kwargs.get('equals', None),
                    kwargs.get('verbose', False))
            draw_map(subdata,
                     {} if focus else hic_data.chromosomes,
                     hic_data.section_pos, savefig, show,
                     one=bool(focus), decay=decay, clim=clim, cmap=cmap,
                     decay_resolution=decay_resolution,
                     perc=perc, normalized=normalized,
                     max_diff=kwargs.get('max_diff', None),
                     name=name, cistrans=cistrans)
def hic_map(
    data,
    resolution=None,
    normalized=False,
    masked=None,
    by_chrom=False,
    savefig=None,
    show=False,
    savedata=None,
    focus=None,
    clim=None,
    cmap="jet",
    pdf=False,
    decay=True,
    perc=10,
    name=None,
    decay_resolution=None,
    **kwargs
):
    """
    Save and/or draw the interaction matrix of a Hi-C data object, either
    as a whole or partitioned by chromosome.

    :param data: either a path to a file with pre-processed reads (filtered
       or not), loaded with load_hic_data_from_reads, or a Hi-C-data object
    :param None resolution: at which to bin the data when 'data' is a path
       (try having a dense matrix with < 10% of cells with zero interaction
       counts). Otherwise the resolution of the Hi-C-data object is used.
    :param False normalized: use normalized data, based on precalculated
       biases
    :param None masked: columns to be blanked (set to NaN) in the drawn
       matrix, usually because of too few interactions. Defaults to the
       'bads' of the Hi-C-data object. Only used when by_chrom is off.
    :param False by_chrom: how to partition the output:

        * 'intra': one output per chromosome
        * 'inter': one output per pair of different chromosomes
        * any other true value (e.g. 'all'): both of the above

    :param None savefig: path where to store the output image. If the
       by_chrom option is used, savefig is the directory that will contain
       the images (created if missing).
    :param None savedata: path where to store the output matrix. If the
       by_chrom option is used, savedata is the directory that will contain
       the '.mat' files (created if missing).
    :param None focus: passed to get_matrix/write_matrix to select a
       sub-matrix: two numbers (i.e.: (1, 100)) for start and end along the
       diagonal, a chromosome name, or two chromosome names. Incompatible
       with by_chrom.
    :param True decay: plot the correlation between genomic distance and
       interactions (usually a decay). Turned off with a warning when
       'data' is a path and get_sections=False is passed. Only forwarded to
       draw_map when by_chrom is off.
    :param None clim: cutoff for the upper and lower bound in the coloring
       scale of the heatmap
    :param False pdf: with by_chrom, store images as 'pdf' instead of 'png'
    :param 'jet' cmap: color map to be used for the heatmap
    :param 10 perc: passed as is to draw_map
    :param None name: passed as is to draw_map
    :param None decay_resolution: chromatin fragment size to consider when
       calculating decay of the number of interactions with genomic
       distance. Default is equal to resolution of the matrix.
    :param kwargs: forwarded to load_hic_data_from_reads when 'data' is a
       path; 'max_diff', 'exclude', 'diagonal', 'equals' and 'verbose' are
       also read when drawing the whole matrix.

    :raises Exception: if both focus and by_chrom are used
    """
    if isinstance(data, str):
        data = load_hic_data_from_reads(data, resolution=resolution, **kwargs)
        if not kwargs.get("get_sections", True) and decay:
            warn("WARNING: not decay not available when get_sections is off.")
            decay = False
    hic_data = data
    resolution = data.resolution
    if not decay_resolution:
        decay_resolution = resolution
    if hic_data.bads and not masked:
        masked = hic_data.bads

    # save and draw the data
    if by_chrom:
        if focus:
            raise Exception("Incompatible options focus and by_chrom\n")
        # both output directories are needed (not only the first one given),
        # and none at all when the maps are only shown
        for out_dir in (savedata, savefig):
            if out_dir and not os.path.isdir(out_dir):
                os.makedirs(out_dir)
        extension = "pdf" if pdf else "png"
        chrom_names = list(hic_data.chromosomes)
        for pos, crm1 in enumerate(chrom_names):
            # only walk the upper triangle of chromosome pairs
            for crm2 in chrom_names[pos:]:
                if by_chrom == "intra" and crm1 != crm2:
                    continue
                if by_chrom == "inter" and crm1 == crm2:
                    continue
                subdata = hic_data.get_matrix(
                    focus=(crm1, crm2), normalized=normalized
                )
                # a single name for intra-chromosomal outputs
                pair_tag = "_".join(set((crm1, crm2)))
                if savedata:
                    hic_data.write_matrix(
                        "%s/%s.mat" % (savedata, pair_tag),
                        focus=(crm1, crm2),
                        normalized=normalized
                    )
                if not (show or savefig):
                    continue
                # without savefig there is no file to write: hand None over
                # instead of a bogus "None/..." path
                fig_path = None
                if savefig:
                    fig_path = "%s/%s.%s" % (savefig, pair_tag, extension)
                draw_map(
                    subdata,
                    OrderedDict(
                        (k, hic_data.chromosomes[k])
                        for k in chrom_names
                        if k in (crm1, crm2)
                    ),
                    hic_data.section_pos,
                    fig_path,
                    show,
                    one=True,
                    clim=clim,
                    cmap=cmap,
                    decay_resolution=decay_resolution,
                    perc=perc,
                    name=name,
                    cistrans=float("NaN")
                )
        return

    if savedata:
        hic_data.write_matrix(savedata, focus=focus, normalized=normalized)
    if not (show or savefig):
        return

    subdata = hic_data.get_matrix(focus=focus, normalized=normalized)
    if focus and masked:
        # rescale masked
        # NOTE(review): assumes a numeric focus (start, end); a chromosome
        # name would fail here -- confirm with callers
        masked = dict((m - focus[0], masked[m]) for m in masked)
    if masked:
        indices = range(len(subdata))
        for row in indices:
            if row in masked:
                subdata[row] = [float("nan") for _ in indices]
            for col in indices:
                if col in masked:
                    subdata[row][col] = float("nan")
    if focus:
        cistrans = float("NaN")
    else:
        # "normalized" is a named parameter and can never be found in
        # kwargs: pass it directly so that the ratio is computed on the same
        # kind of data as the one drawn
        cistrans = hic_data.cis_trans_ratio(
            normalized,
            kwargs.get("exclude", None),
            kwargs.get("diagonal", True),
            kwargs.get("equals", None),
            kwargs.get("verbose", False)
        )
    draw_map(
        subdata,
        {} if focus else hic_data.chromosomes,
        hic_data.section_pos,
        savefig,
        show,
        one=bool(focus),
        decay=decay,
        clim=clim,
        cmap=cmap,
        decay_resolution=decay_resolution,
        perc=perc,
        normalized=normalized,
        max_diff=kwargs.get("max_diff", None),
        name=name,
        cistrans=cistrans
    )
def _write_plain_matrix(path, matrix):
    """Write 'matrix' to 'path', one tab-separated row per line."""
    with open(path, 'w') as out:
        out.write('\n'.join('\t'.join(str(cell) for cell in row)
                            for row in matrix) + '\n')


def hic_map(data, resolution=None, normalized=False, masked=None,
            by_chrom=False, savefig=None, show=False, savedata=None,
            focus=None, clim=None, cmap='tadbit', pdf=False, decay=True,
            perc=10, name=None, **kwargs):
    """
    Save and/or draw the interaction matrix of a Hi-C data object, either
    as a whole or partitioned by chromosome.

    :param data: either a path to a file with pre-processed reads (filtered
       or not), loaded with load_hic_data_from_reads, or a Hi-C-data object
    :param None resolution: at which to bin the data when 'data' is a path
       (try having a dense matrix with < 10% of cells with zero interaction
       counts). Also passed as is to draw_map.
    :param False normalized: use normalized data, based on precalculated
       biases
    :param None masked: columns to be blanked (set to NaN) in the drawn
       matrix, usually because of too few interactions. Defaults to the
       'bads' of the Hi-C-data object. Only used when by_chrom is off.
    :param False by_chrom: how to partition the output:

        * 'intra': one output per chromosome
        * 'inter': one output per pair of different chromosomes
        * any other true value (e.g. 'all'): both of the above

    :param None savefig: path where to store the output image. If the
       by_chrom option is used, savefig is the directory that will contain
       the images (created if missing).
    :param None savedata: path where to store the output matrix, written as
       tab-separated text. If the by_chrom option is used, savedata is the
       directory that will contain the '.mat' files (created if missing).
    :param None focus: passed to get_matrix to select a sub-matrix: two
       numbers (i.e.: (1, 100)) for start and end along the diagonal, a
       chromosome name, or two chromosome names. Incompatible with
       by_chrom.
    :param True decay: plot the correlation between genomic distance and
       interactions (usually a decay). Turned off with a warning when
       'data' is a path and get_sections=False is passed. Only forwarded to
       draw_map when by_chrom is off.
    :param None clim: cutoff for the upper and lower bound in the coloring
       scale of the heatmap
    :param False pdf: with by_chrom, store images as 'pdf' instead of 'png'
    :param 'tadbit' cmap: color map to be used for the heatmap
    :param 10 perc: passed as is to draw_map
    :param None name: passed as is to draw_map
    :param False get_sections: (keyword, forwarded to
       load_hic_data_from_reads) for very very high resolution, when the
       column index does not fit in memory
    :param kwargs: forwarded to load_hic_data_from_reads when 'data' is a
       path; 'exclude', 'diagonal', 'equals' and 'verbose' are also read
       when drawing the whole matrix.

    :raises Exception: if both focus and by_chrom are used
    """
    if isinstance(data, str):
        data = load_hic_data_from_reads(data, resolution=resolution, **kwargs)
        if not kwargs.get('get_sections', True) and decay:
            warn('WARNING: not decay not available when get_sections is off.')
            decay = False
    hic_data = data
    if hic_data.bads and not masked:
        masked = hic_data.bads
    # save and draw the data
    if by_chrom:
        if focus:
            raise Exception('Incompatible options focus and by_chrom\n')
        # both output directories are needed (not only the first one given),
        # and none at all when the maps are only shown
        for out_dir in (savedata, savefig):
            if out_dir and not os.path.isdir(out_dir):
                os.makedirs(out_dir)
        chrom_names = list(hic_data.chromosomes)
        for pos, crm1 in enumerate(chrom_names):
            # only walk the upper triangle of chromosome pairs
            for crm2 in chrom_names[pos:]:
                if by_chrom == 'intra' and crm1 != crm2:
                    continue
                if by_chrom == 'inter' and crm1 == crm2:
                    continue
                subdata = hic_data.get_matrix(focus=(crm1, crm2),
                                              normalized=normalized)
                # a single name for intra-chromosomal outputs
                pair_tag = '_'.join(set((crm1, crm2)))
                if savedata:
                    _write_plain_matrix('%s/%s.mat' % (savedata, pair_tag),
                                        subdata)
                if show or savefig:
                    # without savefig there is no file to write: hand None
                    # over instead of a bogus 'None/...' path
                    fig_path = None
                    if savefig:
                        fig_path = '%s/%s.%s' % (savefig, pair_tag,
                                                 'pdf' if pdf else 'png')
                    draw_map(subdata,
                             OrderedDict((k, hic_data.chromosomes[k])
                                         for k in chrom_names
                                         if k in (crm1, crm2)),
                             hic_data.section_pos, fig_path, show,
                             one=True, clim=clim, cmap=cmap,
                             resolution=resolution, perc=perc,
                             name=name, cistrans=float('NaN'))
    else:
        if savedata:
            # fetch the matrix before opening the file so that a failure
            # does not leave an empty output behind
            _write_plain_matrix(
                savedata,
                hic_data.get_matrix(focus=focus, normalized=normalized))
        if show or savefig:
            subdata = hic_data.get_matrix(focus=focus, normalized=normalized)
            if focus and masked:
                # rescale masked
                # NOTE(review): assumes a numeric focus (start, end); a
                # chromosome name would fail here -- confirm with callers
                masked = dict((m - focus[0], masked[m]) for m in masked)
            if masked:
                indices = range(len(subdata))
                for row in indices:
                    if row in masked:
                        subdata[row] = [float('nan') for _ in indices]
                    for col in indices:
                        if col in masked:
                            subdata[row][col] = float('nan')
            if focus:
                cistrans = float('NaN')
            else:
                # 'normalized' is a named parameter and can never be found
                # in kwargs: pass it directly so that the ratio is computed
                # on the same kind of data as the one drawn
                cistrans = hic_data.cis_trans_ratio(
                    normalized,
                    kwargs.get('exclude', None),
                    kwargs.get('diagonal', False),
                    kwargs.get('equals', None),
                    kwargs.get('verbose', False))
            draw_map(subdata,
                     {} if focus else hic_data.chromosomes,
                     hic_data.section_pos, savefig, show,
                     one=bool(focus), decay=decay, clim=clim, cmap=cmap,
                     resolution=resolution, perc=perc, name=name,
                     cistrans=cistrans)