def from_upper_triu(vector_repr, matrix_len, num_diags):
    z = np.zeros((matrix_len, matrix_len))
    triu_tup = np.triu_indices(matrix_len, num_diags)
    z[triu_tup] = vector_repr
    for i in range(-num_diags + 1, num_diags):
        set_diag(z, np.nan, i)
    return z + z.T
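# A minimal usage sketch (illustration only, not part of the original pipeline):
# round-trip a toy symmetric matrix through its flattened upper-triangular vector
# and back with from_upper_triu. The helper name _demo_from_upper_triu and the toy
# values are assumptions for illustration; np and set_diag are assumed to be the
# same imports used by the functions in this module.
def _demo_from_upper_triu():
    matrix_len, num_diags = 5, 2
    rng = np.random.default_rng(0)
    full = rng.random((matrix_len, matrix_len))
    full = full + full.T  # symmetrize the toy matrix
    # flatten the upper triangle, skipping the first num_diags diagonals
    vector_repr = full[np.triu_indices(matrix_len, num_diags)]
    recovered = from_upper_triu(vector_repr, matrix_len, num_diags)
    # off-diagonal entries are recovered exactly; the central diagonals are NaN by design
    assert np.allclose(recovered[np.triu_indices(matrix_len, num_diags)], vector_repr)
    return recovered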
def plot_juicebox_from_predicted_array(mat, binsize, interval, out_dir,
                                       diagonal_offset, chr_dict,
                                       use_control=False, **kwargs):
    predicted_data = mat_to_pandas_df(mat=mat, binsize=binsize, interval=interval)
    print(predicted_data.isna().sum())
    print(predicted_data)

    mp = MatrixPlotter()
    mp.set_data(predicted_data)

    if not use_control:
        # no external control provided: plot the prediction against itself
        mp.set_control(predicted_data)
    else:
        if 'ghc' not in kwargs:
            raise ValueError("please pass the control cool file (opened with cooler) "
                             "via the 'ghc' keyword argument")

        # process hic data
        print("open and process control cool file")
        genome_hic_cool = kwargs['ghc']
        mseq_str = '%s:%d-%d' % (interval.chr, interval.start, interval.end)
        seq_hic_raw = genome_hic_cool.matrix(balance=True).fetch(mseq_str)
        print("seq_hic from cool file shape:", seq_hic_raw.shape,
              "predicted matrix shape:", mat.shape)
        assert seq_hic_raw.shape == mat.shape

        # clip first diagonals and high values
        clipval = np.nanmedian(np.diag(seq_hic_raw, diagonal_offset))
        for i in range(-diagonal_offset + 1, diagonal_offset):
            set_diag(seq_hic_raw, clipval, i)
        seq_hic_raw = np.clip(seq_hic_raw, 0, clipval)

        # adaptively coarsegrain based on raw counts
        seq_hic_smoothed = adaptive_coarsegrain(
            seq_hic_raw,
            genome_hic_cool.matrix(balance=False).fetch(mseq_str),
            cutoff=2, max_levels=8)

        control_data = mat_to_pandas_df(mat=seq_hic_smoothed, binsize=binsize, interval=interval)
        print(len(control_data))

        # choose only contacts <= seqlen
        control_data_merge = pd.merge(predicted_data, control_data,
                                      on=["chr", "contact_st", "contact_en"])
        control_data = control_data_merge[
            ['chr', 'contact_st', 'contact_en', 'contact_count_y']
        ].rename(columns={'contact_count_y': 'contact_count'})
        print(control_data)
        print(len(control_data))
        mp.set_control(control_data)

    # mp.set_apply_log(self.apply_log)
    MatPlot2HiC(mp,
                interval.chr + "_" + str(interval.start) + '_' + str(interval.end),
                out_dir, chr_dict)
def from_oe_to_contacts(seq_hic_obsexp, genome_hic_expected_file, interval,
                        seq_len_pool, no_log=False):
    # undo the log transform unless the obs/exp map was stored without it
    if not no_log:
        seq_hic_obsexp = np.exp(seq_hic_obsexp)

    # read per-diagonal expected values for the interval's chromosome
    genome_hic_expected = pd.read_csv(genome_hic_expected_file, sep='\t')
    exp_chr = genome_hic_expected.iloc[
        genome_hic_expected['chrom'].values == interval.chr][0:seq_len_pool]
    if len(exp_chr) == 0:
        raise ValueError('no expected values found for chr:' + interval.chr)

    # build a dense expected map, one value per diagonal
    exp_map = np.zeros((seq_len_pool, seq_len_pool))
    print(exp_map.shape)
    for i in range(seq_len_pool):
        set_diag(exp_map, exp_chr['balanced.avg'].values[i], i)
        set_diag(exp_map, exp_chr['balanced.avg'].values[i], -i)

    # multiply obs/exp by expected to recover balanced contact frequencies
    seq_hic_smoothed = exp_map * seq_hic_obsexp
    return seq_hic_smoothed
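# A minimal usage sketch (illustration only): rebuild contact frequencies from a
# log(obs/exp) map with from_oe_to_contacts, using a synthetic per-diagonal
# expected table with a 'balanced.avg' column. The helper name, the temp-file
# handling, the _Interval namedtuple, and all numeric values are assumptions for
# illustration; np and pd are the module's imports.
def _demo_from_oe_to_contacts():
    import collections, os, tempfile
    _Interval = collections.namedtuple('_Interval', ['chr', 'start', 'end'])

    seq_len_pool = 4
    # synthetic expected table: one balanced.avg value per diagonal of chr1
    expected = pd.DataFrame({
        'chrom': ['chr1'] * seq_len_pool,
        'diag': np.arange(seq_len_pool),
        'balanced.avg': [1.0, 0.5, 0.25, 0.125],
    })
    with tempfile.NamedTemporaryFile('w', suffix='.expected', delete=False) as tmp:
        expected.to_csv(tmp, sep='\t', index=False)
        expected_file = tmp.name

    # a log(obs/exp) map of zeros becomes all ones after exp, so the output
    # should simply reproduce the per-diagonal expected values
    seq_hic_obsexp = np.zeros((seq_len_pool, seq_len_pool))
    interval = _Interval('chr1', 0, seq_len_pool)
    contacts = from_oe_to_contacts(seq_hic_obsexp, expected_file, interval, seq_len_pool)
    os.remove(expected_file)  # clean up the throwaway expected table
    return contacts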
def _insul_diamond_dense(mat, window=10, ignore_diags=2, norm_by_median=True):
    """
    Calculates the insulation score of a Hi-C interaction matrix.

    Parameters
    ----------
    mat : numpy.array
        A dense square matrix of Hi-C interaction frequencies.
        May contain nans, e.g. in rows/columns excluded from the analysis.
    window : int
        The width of the window to calculate the insulation score.
    ignore_diags : int
        If > 0, the interactions at separations < `ignore_diags` are ignored
        when calculating the insulation score. Typically, a few first diagonals
        of the Hi-C map should be ignored due to contamination with Hi-C
        artifacts.
    norm_by_median : bool
        If True, normalize the insulation score by its NaN-median.
    """
    if ignore_diags:
        mat = mat.copy()
        for i in range(-ignore_diags + 1, ignore_diags):
            numutils.set_diag(mat, np.nan, i)

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        N = mat.shape[0]
        score = np.nan * np.ones(N)
        for i in range(N):
            lo = max(0, i + 1 - window)
            hi = min(i + window, N)
            # nanmean of interactions to reduce the effect of bad bins
            score[i] = np.nanmean(mat[lo:i + 1, i:hi])
        if norm_by_median:
            score /= np.nanmedian(score)
    return score
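# A minimal usage sketch (illustration only) of _insul_diamond_dense: build a toy
# block-diagonal contact map with two "domains" and confirm that the insulation
# score dips at the boundary between them. The helper name and matrix values are
# assumptions for illustration; np is the module's numpy import.
def _demo_insul_diamond_dense():
    n = 20
    mat = np.ones((n, n)) * 0.1
    mat[:10, :10] = 1.0   # first domain: dense contacts
    mat[10:, 10:] = 1.0   # second domain: dense contacts
    score = _insul_diamond_dense(mat, window=5, ignore_diags=2)
    # the score around bin 10 (the domain boundary) is lower than inside the domains
    return score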
def main():
    usage = 'usage: %prog [options] <genome_hic_file> <seqs_bed_file> <seqs_hic_file>'
    parser = OptionParser(usage)
    parser.add_option('-b', dest='blacklist_bed',
                      help='Set blacklist nucleotides to a baseline value.')
    parser.add_option('--clip', dest='clip',
                      default=None, type='float',
                      help='Clip values post-summary to a maximum [Default: %default]')
    parser.add_option('--crop', dest='crop_bp',
                      default=0, type='int',
                      help='Crop bp off each end [Default: %default]')
    parser.add_option('-d', dest='diagonal_offset',
                      default=2, type='int',
                      help='Positions on the diagonal to ignore [Default: %default]')
    parser.add_option('-k', dest='kernel_stddev',
                      default=0, type='int',
                      help='Gaussian kernel stddev to smooth values [Default: %default]')
    # parser.add_option('-s', dest='scale',
    #                   default=1., type='float',
    #                   help='Scale values by [Default: %default]')
    parser.add_option('-w', dest='pool_width',
                      default=1, type='int',
                      help='Average pooling width [Default: %default]')
    parser.add_option('--as_obsexp', dest='as_obsexp',
                      default=False, action="store_true",
                      help='save targets as obsexp profiles')
    parser.add_option('--global_obsexp', dest='global_obsexp',
                      default=False, action="store_true",
                      help='use global obs/exp')
    parser.add_option('--no_log', dest='no_log',
                      default=False, action="store_true",
                      help='do not take log for obs/exp')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide genome Hi-C file, sequences BED file, and output HDF5 file')
    else:
        genome_hic_file = args[0]
        seqs_bed_file = args[1]
        seqs_hic_file = args[2]

    # read model sequences
    model_seqs = []
    for line in open(seqs_bed_file):
        a = line.split()
        model_seqs.append(ModelSeq(a[0], int(a[1]), int(a[2]), None))

    # read blacklist regions
    black_chr_trees = read_blacklist(options.blacklist_bed)

    # compute dimensions
    num_seqs = len(model_seqs)
    seq_len_nt = model_seqs[0].end - model_seqs[0].start
    seq_len_pool = seq_len_nt // options.pool_width

    if options.crop_bp == 0:
        seq_len_crop = seq_len_pool
    else:
        crop_start = options.crop_bp // options.pool_width
        crop_end = seq_len_pool - crop_start
        seq_len_crop = seq_len_pool - 2*crop_start

    # compute upper triangular indexes
    triu_tup = np.triu_indices(seq_len_crop, options.diagonal_offset)
    seq_len_nodiag = seq_len_crop - options.diagonal_offset
    seq_len_hic = seq_len_nodiag*(seq_len_nodiag + 1) // 2

    # initialize sequences coverage file
    seqs_hic_open = h5py.File(seqs_hic_file, 'w')
    seqs_hic_open.create_dataset('targets', shape=(num_seqs, seq_len_hic), dtype='float16')

    if options.kernel_stddev > 0:
        # initialize Gaussian kernel
        kernel = Gaussian2DKernel(x_stddev=options.kernel_stddev)
    else:
        kernel = None

    # open genome coverage file
    genome_hic_cool = cooler.Cooler(genome_hic_file)

    if options.global_obsexp:
        try:
            print('loading by-chromosome expected')
            genome_hic_expected = pd.read_csv(genome_hic_file.replace('.cool', '.expected'), sep='\t')
        except Exception:
            print('not found: ' + genome_hic_file.replace('.cool', '.expected'))
            raise ValueError('invalid expected file')

    # check for "chr" prefix
    chr_pre = 'chr1' in genome_hic_cool.chromnames

    # assert that resolution matches
    assert options.pool_width == genome_hic_cool.info['bin-size']

    # for each model sequence
    for si in range(num_seqs):
        mseq = model_seqs[si]

        try:
            # pull hic values
            if chr_pre:
                mseq_str = '%s:%d-%d' % (mseq.chr, mseq.start, mseq.end)
            else:
                mseq_str = '%s:%d-%d' % (mseq.chr[3:], mseq.start, mseq.end)
            #print('mseq_str:', mseq_str)

            seq_hic_raw = genome_hic_cool.matrix(balance=True).fetch(mseq_str)
            seq_hic_nan = np.isnan(seq_hic_raw)
            num_filtered_bins = np.sum(np.sum(seq_hic_nan, axis=0) == len(seq_hic_nan))
            if num_filtered_bins > (.5*len(seq_hic_nan)):
                print("WARNING: %s >50%% bins filtered, check: %s. " % (genome_hic_file, mseq_str))

            # set blacklist to NaNs
            if mseq.chr in black_chr_trees:
                for black_interval in black_chr_trees[mseq.chr][mseq.start:mseq.end]:
                    # adjust for sequence indexes
                    black_seq_start = (black_interval.begin - mseq.start) // options.pool_width
                    black_seq_end = int(np.ceil((black_interval.end - mseq.start) / options.pool_width))
                    seq_hic_raw[:, black_seq_start:black_seq_end] = np.nan
                    seq_hic_raw[black_seq_start:black_seq_end, :] = np.nan
                seq_hic_nan = np.isnan(seq_hic_raw)

            # clip first diagonals and high values
            clipval = np.nanmedian(np.diag(seq_hic_raw, options.diagonal_offset))
            for i in range(-options.diagonal_offset + 1, options.diagonal_offset):
                set_diag(seq_hic_raw, clipval, i)
            seq_hic_raw = np.clip(seq_hic_raw, 0, clipval)
            seq_hic_raw[seq_hic_nan] = np.nan

            # adaptively coarsegrain based on raw counts
            seq_hic_smoothed = adaptive_coarsegrain(
                seq_hic_raw,
                genome_hic_cool.matrix(balance=False).fetch(mseq_str),
                cutoff=2, max_levels=8)
            seq_hic_nan = np.isnan(seq_hic_smoothed)
            #todo: pass an option to add a certain pseudocount value, or the minimum nonzero value

            if options.as_obsexp:
                # compute obs/exp
                if options.global_obsexp:
                    # compute global obs/exp from the by-chromosome expected table
                    exp_chr = genome_hic_expected.iloc[
                        genome_hic_expected['chrom'].values == mseq.chr][0:seq_len_pool]
                    if len(exp_chr) == 0:
                        raise ValueError('no expected values found for chr:' + mseq.chr)
                    exp_map = np.zeros((seq_len_pool, seq_len_pool))
                    for i in range(seq_len_pool):
                        set_diag(exp_map, exp_chr['balanced.avg'].values[i], i)
                        set_diag(exp_map, exp_chr['balanced.avg'].values[i], -i)
                    seq_hic_obsexp = seq_hic_smoothed / exp_map
                    for i in range(-options.diagonal_offset + 1, options.diagonal_offset):
                        set_diag(seq_hic_obsexp, 1.0, i)
                    seq_hic_obsexp[seq_hic_nan] = np.nan
                else:
                    # compute local obs/exp
                    seq_hic_obsexp = observed_over_expected(seq_hic_smoothed, ~seq_hic_nan)[0]

                # log
                if not options.no_log:
                    seq_hic_obsexp = np.log(seq_hic_obsexp)
                    if options.clip is not None:
                        seq_hic_obsexp = np.clip(seq_hic_obsexp, -options.clip, options.clip)
                    seq_hic_obsexp = interp_nan(seq_hic_obsexp)
                    for i in range(-options.diagonal_offset + 1, options.diagonal_offset):
                        set_diag(seq_hic_obsexp, 0, i)
                else:
                    if options.clip is not None:
                        seq_hic_obsexp = np.clip(seq_hic_obsexp, 0, options.clip)
                    seq_hic_obsexp = interp_nan(seq_hic_obsexp)
                    for i in range(-options.diagonal_offset + 1, options.diagonal_offset):
                        set_diag(seq_hic_obsexp, 1, i)

                # apply kernel
                if kernel is not None:
                    seq_hic = convolve(seq_hic_obsexp, kernel)
                else:
                    seq_hic = seq_hic_obsexp

            else:
                # interpolate all missing bins
                seq_hic_interpolated = interp_nan(seq_hic_smoothed)

                # rescale, reclip
                seq_hic = 100000*seq_hic_interpolated
                clipval = np.nanmedian(np.diag(seq_hic, options.diagonal_offset))
                for i in range(-options.diagonal_offset + 1, options.diagonal_offset):
                    set_diag(seq_hic, clipval, i)
                seq_hic = np.clip(seq_hic, 0, clipval)

                #extra smoothing. todo pass kernel specs
                if kernel is not None:
                    seq_hic = convolve(seq_hic, kernel)

        except ValueError:
            print("WARNING: %s doesn't see %s. Setting to all zeros." % (genome_hic_file, mseq_str))
            seq_hic = np.zeros((seq_len_pool, seq_len_pool), dtype='float16')

        # crop
        if options.crop_bp > 0:
            seq_hic = seq_hic[crop_start:crop_end, :]
            seq_hic = seq_hic[:, crop_start:crop_end]

        # unroll upper triangular
        seq_hic = seq_hic[triu_tup]

        # write
        seqs_hic_open['targets'][si, :] = seq_hic.astype('float16')

    # close sequences coverage file
    seqs_hic_open.close()
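# A worked check (illustration only) of the target-length arithmetic used above:
# for a hypothetical 2^20 bp sequence at 2048 bp pooling, with 65536 bp cropped
# off each end and diagonal_offset=2, the flattened upper-triangular target has
# seq_len_nodiag*(seq_len_nodiag + 1)//2 entries. The helper name and the numbers
# are example inputs, not required settings.
def _demo_target_length(seq_len_nt=2**20, pool_width=2048, crop_bp=65536, diagonal_offset=2):
    seq_len_pool = seq_len_nt // pool_width           # 512 bins
    crop_start = crop_bp // pool_width                # 32 bins per side
    seq_len_crop = seq_len_pool - 2*crop_start        # 448 bins retained
    seq_len_nodiag = seq_len_crop - diagonal_offset   # 446 bins beyond the ignored diagonals
    seq_len_hic = seq_len_nodiag*(seq_len_nodiag + 1) // 2
    return seq_len_hic                                # 99681 upper-triangular entries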
def main():
    usage = 'usage: %prog [options] <genome_hic_file> <seqs_bed_file> <seqs_hic_file>'
    parser = OptionParser(usage)
    parser.add_option('-b', dest='blacklist_bed',
                      help='Set blacklist nucleotides to a baseline value.')
    parser.add_option('-c', dest='clip',
                      default=None, type='float',
                      help='Clip values post-summary to a maximum [Default: %default]')
    parser.add_option('-s', dest='scale',
                      default=1., type='float',
                      help='Scale values by [Default: %default]')
    parser.add_option('--soft', dest='soft_clip',
                      default=False, action='store_true',
                      help='Soft clip values, applying sqrt to the excess above the threshold [Default: %default]')
    parser.add_option('-u', dest='sum_stat',
                      default='sum',
                      help='Summary statistic to compute in windows [Default: %default]')
    parser.add_option('-w', dest='pool_width',
                      default=1, type='int',
                      help='Average pooling width [Default: %default]')
    parser.add_option('--as_obsexp', dest='as_obsexp',
                      default=False, action="store_true",
                      help='save targets as obsexp profiles')
    (options, args) = parser.parse_args()

    if len(args) != 3:
        parser.error('Must provide genome Hi-C file, sequences BED file, and output HDF5 file')
    else:
        genome_hic_file = args[0]
        seqs_bed_file = args[1]
        seqs_hic_file = args[2]

    print('saving TFRs as obsexp:', options.as_obsexp)

    # read model sequences
    model_seqs = []
    for line in open(seqs_bed_file):
        a = line.split()
        model_seqs.append(ModelSeq(a[0], int(a[1]), int(a[2]), None))

    # read blacklist regions
    black_chr_trees = read_blacklist(options.blacklist_bed)

    # compute dimensions
    num_seqs = len(model_seqs)
    seq_len_nt = model_seqs[0].end - model_seqs[0].start
    seq_len_pool = seq_len_nt // options.pool_width

    # initialize sequences coverage file
    seqs_hic_open = h5py.File(seqs_hic_file, 'w')
    seqs_hic_open.create_dataset('seqs_hic',
                                 shape=(num_seqs, seq_len_pool, seq_len_pool),
                                 dtype='float16')

    # open genome coverage file
    genome_hic_cool = cooler.Cooler(genome_hic_file)

    # check for "chr" prefix
    chr_pre = 'chr1' in genome_hic_cool.chromnames

    # assert that resolution matches
    assert options.pool_width == genome_hic_cool.info['bin-size']

    # for each model sequence
    for si in range(num_seqs):
        mseq = model_seqs[si]

        try:
            # pull hic values
            if chr_pre:
                mseq_str = '%s:%d-%d' % (mseq.chr, mseq.start, mseq.end)
            else:
                mseq_str = '%s:%d-%d' % (mseq.chr[3:], mseq.start, mseq.end)
            #print('mseq_str:', mseq_str)

            seq_hic_raw = genome_hic_cool.matrix(balance=True).fetch(mseq_str)
            seq_hic_nan = np.isnan(seq_hic_raw)
            # warn if the bins around the sequence midpoint are mostly filtered
            mid = len(seq_hic_nan) // 2
            if np.sum(seq_hic_nan[mid - 1:mid + 1, mid - 2:mid + 2]) > 4:
                print("WARNING: %s lots of zeros, check that umap_midpoint is correct %s. "
                      % (genome_hic_file, mseq_str))

            # set blacklist to NaNs
            if mseq.chr in black_chr_trees:
                for black_interval in black_chr_trees[mseq.chr][mseq.start:mseq.end]:
                    # adjust for sequence indexes
                    black_seq_start = (black_interval.begin - mseq.start) // options.pool_width
                    black_seq_end = int(np.ceil((black_interval.end - mseq.start) / options.pool_width))
                    seq_hic_raw[:, black_seq_start:black_seq_end] = np.nan
                    seq_hic_raw[black_seq_start:black_seq_end, :] = np.nan
                seq_hic_nan = np.isnan(seq_hic_raw)

            # clip first diagonals and high values
            clipval = np.nanmedian(np.diag(seq_hic_raw, 2))
            for i in [-1, 0, 1]:
                set_diag(seq_hic_raw, clipval, i)
            seq_hic_raw = np.clip(seq_hic_raw, 0, clipval)
            seq_hic_raw[seq_hic_nan] = np.nan

            # adaptively coarsegrain based on raw counts
            seq_hic_smoothed = adaptive_coarsegrain(
                seq_hic_raw,
                genome_hic_cool.matrix(balance=False).fetch(mseq_str),
                cutoff=2, max_levels=8)
            #todo: pass an option to add a certain pseudocount value, or the minimum nonzero value

            if options.as_obsexp:
                # interpolate single missing bins
                seq_hic_interpolated = interpolate_bad_singletons(
                    seq_hic_smoothed, mask=(~seq_hic_nan),
                    fillDiagonal=True, returnMask=False,
                    secondPass=True, verbose=False)
                seq_hic_nan = np.isnan(seq_hic_interpolated)

                # compute observed/expected
                seq_hic_obsexp = observed_over_expected(seq_hic_interpolated, ~seq_hic_nan)[0]
                # todo: allow passing a global expected rather than computing locally

                # log
                seq_hic_obsexp = np.log(seq_hic_obsexp)

                # set nan to 0
                seq_hic_obsexp = np.nan_to_num(seq_hic_obsexp)

                # todo: make obsexp_clip an option for obs/exp
                seq_hic = np.clip(seq_hic_obsexp, -2, 2)

            else:
                # interpolate all missing bins
                seq_hic_interpolated = interp_nan(seq_hic_smoothed)

                # rescale
                seq_hic = 100000 * seq_hic_interpolated
                # todo add extra smoothing

        except ValueError:
            print("WARNING: %s doesn't see %s. Setting to all zeros." % (genome_hic_file, mseq_str))
            seq_hic = np.zeros((seq_len_pool, seq_len_pool), dtype='float16')

        # write
        seqs_hic_open['seqs_hic'][si, :, :] = seq_hic.astype('float16')

    # close sequences coverage file
    seqs_hic_open.close()