def find_compartments(self, crms=None, savefig=None, savedata=None,
                      show=False, **kwargs):
    """
    Search for A/B compartments in each chromosome of the Hi-C matrix.

    The Hi-C matrix is normalized by the number of interactions expected
    at a given distance, and by visibility (one iteration of ICE). A
    correlation matrix is then calculated from this normalized matrix, and
    its first eigenvector is used to identify compartments, changes in sign
    marking the boundaries between compartments.

    The result is stored in ``self.compartments`` as a dictionary of lists
    of compartments (dictionaries with 'start', 'end' and 'dens' keys), keys
    being chromosome names. Nothing is returned.

    :param None crms: only runs these given list of chromosomes
    :param None savefig: path to a directory to store matrices with
       compartment predictions, two images per chromosome, stored under
       'chr<chromosome-name>.pdf' and 'chr<chromosome-name>_summ.pdf'.
    :param None savedata: path to a new file to store compartment
       predictions, one file only.
    :param False show: show the plot
    :param 99 perc_zero: to filter bad columns (only used when no bad
       column has been defined yet)
    :param 0.05 signal_to_noise: to calculate expected interaction counts,
       if not enough reads are observed at a given distance the observations
       of the distance+1 are summed. A signal to noise ratio of < 0.05
       corresponds to > 400 reads.
    :param -1 vmin: for the color scale of the plotted map
    :param 1 vmax: for the color scale of the plotted map. If both vmin and
       vmax are 'auto', the scale is made symmetric around zero using the
       extreme values of the correlation matrix.
    :param verbose: print progress of the normalization steps (default
       True) and of the processing of each chromosome (default False)

    TODO: this is really slow...

    Notes: building the distance matrix using the amount of interactions
           instead of the mean correlation, gives generally worse results.
    """
    # NOTE(review): a second definition of find_compartments appears further
    # down in this file; if both live in the same scope the later one wins.
    verbose = kwargs.get('verbose', True)
    if not self.bads:
        perc_zero = kwargs.get('perc_zero', 99)
        if verbose:
            # report the threshold actually used, not a hard-coded 99
            print('Filtering bad columns %d' % perc_zero)
        self.filter_columns(perc_zero=perc_zero, by_mean=False, silent=True)
    if not self.expected:
        if verbose:
            print('Normalizing by expected values')
        self.expected = expected(self, bads=self.bads, **kwargs)
    if not self.bias:
        if verbose:
            print('Normalizing by ICE (1 round)')
        self.normalize_hic(iterations=0)
    if savefig:
        mkdir(savefig)

    cmprts = {}
    for sec in self.section_pos:
        if crms and sec not in crms:
            continue
        if kwargs.get('verbose', False):
            print('Processing chromosome %s' % sec)
            warn('Processing chromosome %s' % (sec))
        # section_pos holds (begin, end) with end exclusive: it is used
        # directly as the bounds of xrange
        beg, end = self.section_pos[sec]
        good = [i for i in xrange(beg, end) if i not in self.bads]
        # normalized matrix restricted to the good columns of the chromosome
        matrix = [[(float(self[i, j]) / self.expected[abs(j - i)]
                    / self.bias[i] / self.bias[j])
                   for i in good]
                  for j in good]
        if not matrix:  # MT chromosome will fall there
            warn('Chromosome %s is probably MT :)' % (sec))
            cmprts[sec] = []
            continue
        # force symmetry: copy one triangle of the matrix over the other
        for i in xrange(len(matrix)):
            for j in xrange(i + 1, len(matrix)):
                matrix[i][j] = matrix[j][i]
        matrix = [list(m) for m in corrcoef(matrix)]
        try:
            # This eigsh is very very fast, only ask for one eigenvector
            _, evect = eigsh(array(matrix), k=1)
        except LinAlgError:
            warn('Chromosome %s too small to compute PC1' % (sec))
            cmprts[sec] = []  # Y chromosome, or so...
            continue
        first = list(evect[:, -1])

        # Put the bad columns back so that indexes match chromosome bins.
        # Offsets have to be sorted (successive inserts only restore the
        # original positions when applied in ascending order) and `end` is
        # exclusive (a bad first bin of the next chromosome is not ours).
        bads = sorted(k - beg for k in self.bads if beg <= k < end)
        nan_row_len = len(matrix[0])
        for b in bads:
            first.insert(b, 0)
        for b in bads:
            matrix.insert(b, [float('nan')] * nan_row_len)
        for b in bads:
            for row in matrix:
                row.insert(b, float('nan'))

        # a boundary is set wherever two consecutive values of the
        # eigenvector have opposite signs
        breaks = [0] + [i for i, (a, b) in enumerate(zip(first[1:],
                                                         first[:-1]))
                        if a * b < 0] + [len(first)]
        breaks = [{'start': b, 'end': breaks[i + 1]}
                  for i, b in enumerate(breaks[:-1])]
        cmprts[sec] = breaks

        # calculate compartment internal density
        for cmprt in cmprts[sec]:
            beg1, end1 = cmprt['start'] + beg, cmprt['end'] + beg
            sec_matrix = [(self[i, j] / self.expected[abs(j - i)]
                           / self.bias[i] / self.bias[j])
                          for i in xrange(beg1, end1)
                          if i not in self.bads
                          for j in xrange(i, end1)
                          if j not in self.bads]
            try:
                cmprt['dens'] = sum(sec_matrix) / len(sec_matrix)
            except ZeroDivisionError:
                # compartment made only of bad columns
                cmprt['dens'] = 0.
        # express densities relative to the mean density of the chromosome
        try:
            meanh = (sum([cmprt['dens'] for cmprt in cmprts[sec]])
                     / len(cmprts[sec]))
        except ZeroDivisionError:
            meanh = 1.
        for cmprt in cmprts[sec]:
            try:
                cmprt['dens'] /= meanh
            except ZeroDivisionError:
                cmprt['dens'] = 1.

        # scan gamma from 0 to 1 by steps of 0.01 and keep the value for
        # which the first element returned by _find_ab_compartments is the
        # lowest, then run once more with save=True for that value
        gammas = {}
        for gamma in range(101):
            gammas[gamma] = _find_ab_compartments(float(gamma) / 100, matrix,
                                                  breaks, cmprts[sec],
                                                  save=False)
        gamma = min(range(101), key=lambda k: gammas[k][0])
        _ = _find_ab_compartments(float(gamma) / 100, matrix, breaks,
                                  cmprts[sec], save=True)

        if savefig or show:
            vmin = kwargs.get('vmin', -1)
            vmax = kwargs.get('vmax', 1)
            if vmin == 'auto' == vmax:
                vmax = max([abs(npmin(matrix)), abs(npmax(matrix))])
                vmin = -vmax
            # Only build output paths when a directory was given: with
            # show=True and savefig=None the concatenation used to raise a
            # TypeError.
            # NOTE(review): assumes the plotting helpers accept None as
            # "do not save" -- confirm against their definitions.
            if savefig:
                fig_path = savefig + '/chr' + sec + '.pdf'
                summ_path = savefig + '/chr' + sec + '_summ.pdf'
            else:
                fig_path = summ_path = None
            plot_compartments(sec, first, cmprts, matrix, show, fig_path,
                              vmin=vmin, vmax=vmax)
            plot_compartments_summary(sec, cmprts, show, summ_path)

    self.compartments = cmprts
    if savedata:
        self.write_compartments(savedata)
def find_compartments(self, crms=None, savefig=None, savedata=None,
                      show=False, **kwargs):
    """
    Search for A/B compartments in each chromosome of the Hi-C matrix.

    The Hi-C matrix is normalized by the number of interactions expected
    at a given distance, and by visibility (one iteration of ICE). A
    correlation matrix is then calculated from this normalized matrix, and
    its first eigenvector is used to identify compartments, changes in sign
    marking the boundaries between compartments.

    The result is stored in ``self.compartments`` as a dictionary of lists
    of compartments (dictionaries with 'start', 'end' and 'dens' keys), keys
    being chromosome names. Nothing is returned.

    :param None crms: only runs these given list of chromosomes
    :param None savefig: path to a directory to store matrices with
       compartment predictions, two images per chromosome, stored under
       'chr<chromosome-name>.pdf' and 'chr<chromosome-name>_summ.pdf'.
    :param None savedata: path to a new file to store compartment
       predictions, one file only.
    :param False show: show the plot
    :param 99 perc_zero: to filter bad columns (only used when no bad
       column has been defined yet)
    :param 0.05 signal_to_noise: to calculate expected interaction counts,
       if not enough reads are observed at a given distance the observations
       of the distance+1 are summed. A signal to noise ratio of < 0.05
       corresponds to > 400 reads.
    :param -1 vmin: for the color scale of the plotted map
    :param 1 vmax: for the color scale of the plotted map. If both vmin and
       vmax are 'auto', the scale is made symmetric around zero using the
       extreme values of the correlation matrix.
    :param verbose: print progress of the normalization steps (default
       True) and of the processing of each chromosome (default False)

    TODO: this is really slow...

    Notes: building the distance matrix using the amount of interactions
           instead of the mean correlation, gives generally worse results.
    """
    # NOTE(review): an earlier definition of find_compartments precedes this
    # one in this file; if both live in the same scope this one shadows it.
    verbose = kwargs.get('verbose', True)
    if not self.bads:
        perc_zero = kwargs.get('perc_zero', 99)
        if verbose:
            # report the threshold actually used, not a hard-coded 99
            print('Filtering bad columns %d' % perc_zero)
        self.filter_columns(perc_zero=perc_zero, by_mean=False, silent=True)
    if not self.expected:
        if verbose:
            print('Normalizing by expected values')
        self.expected = expected(self, bads=self.bads, **kwargs)
    if not self.bias:
        if verbose:
            print('Normalizing by ICE (1 round)')
        self.normalize_hic(iterations=0)
    if savefig:
        mkdir(savefig)

    cmprts = {}
    for sec in self.section_pos:
        if crms and sec not in crms:
            continue
        if kwargs.get('verbose', False):
            print('Processing chromosome %s' % sec)
            warn('Processing chromosome %s' % (sec))
        # section_pos holds (begin, end) with end exclusive: it is used
        # directly as the bounds of xrange
        beg, end = self.section_pos[sec]
        good = [i for i in xrange(beg, end) if i not in self.bads]
        # normalized matrix restricted to the good columns of the chromosome
        matrix = [[(float(self[i, j]) / self.expected[abs(j - i)]
                    / self.bias[i] / self.bias[j])
                   for i in good]
                  for j in good]
        if not matrix:  # MT chromosome will fall there
            warn('Chromosome %s is probably MT :)' % (sec))
            cmprts[sec] = []
            continue
        # force symmetry: copy one triangle of the matrix over the other
        for i in xrange(len(matrix)):
            for j in xrange(i + 1, len(matrix)):
                matrix[i][j] = matrix[j][i]
        matrix = [list(m) for m in corrcoef(matrix)]
        try:
            # This eigsh is very very fast, only ask for one eigenvector
            _, evect = eigsh(array(matrix), k=1)
        except LinAlgError:
            warn('Chromosome %s too small to compute PC1' % (sec))
            cmprts[sec] = []  # Y chromosome, or so...
            continue
        first = list(evect[:, -1])

        # Put the bad columns back so that indexes match chromosome bins.
        # Offsets have to be sorted (successive inserts only restore the
        # original positions when applied in ascending order) and `end` is
        # exclusive (a bad first bin of the next chromosome is not ours).
        bads = sorted(k - beg for k in self.bads if beg <= k < end)
        nan_row_len = len(matrix[0])
        for b in bads:
            first.insert(b, 0)
        for b in bads:
            matrix.insert(b, [float('nan')] * nan_row_len)
        for b in bads:
            for row in matrix:
                row.insert(b, float('nan'))

        # a boundary is set wherever two consecutive values of the
        # eigenvector have opposite signs
        breaks = [0] + [i for i, (a, b) in enumerate(zip(first[1:],
                                                         first[:-1]))
                        if a * b < 0] + [len(first)]
        breaks = [{'start': b, 'end': breaks[i + 1]}
                  for i, b in enumerate(breaks[:-1])]
        cmprts[sec] = breaks

        # calculate compartment internal density
        for cmprt in cmprts[sec]:
            beg1, end1 = cmprt['start'] + beg, cmprt['end'] + beg
            sec_matrix = [(self[i, j] / self.expected[abs(j - i)]
                           / self.bias[i] / self.bias[j])
                          for i in xrange(beg1, end1)
                          if i not in self.bads
                          for j in xrange(i, end1)
                          if j not in self.bads]
            try:
                cmprt['dens'] = sum(sec_matrix) / len(sec_matrix)
            except ZeroDivisionError:
                # compartment made only of bad columns
                cmprt['dens'] = 0.
        # express densities relative to the mean density of the chromosome
        try:
            meanh = (sum([cmprt['dens'] for cmprt in cmprts[sec]])
                     / len(cmprts[sec]))
        except ZeroDivisionError:
            meanh = 1.
        for cmprt in cmprts[sec]:
            try:
                cmprt['dens'] /= meanh
            except ZeroDivisionError:
                cmprt['dens'] = 1.

        # scan gamma from 0 to 1 by steps of 0.01 and keep the value for
        # which the first element returned by _find_ab_compartments is the
        # lowest, then run once more with save=True for that value
        gammas = {}
        for gamma in range(101):
            gammas[gamma] = _find_ab_compartments(float(gamma) / 100, matrix,
                                                  breaks, cmprts[sec],
                                                  save=False)
        gamma = min(range(101), key=lambda k: gammas[k][0])
        _ = _find_ab_compartments(float(gamma) / 100, matrix, breaks,
                                  cmprts[sec], save=True)

        if savefig or show:
            vmin = kwargs.get('vmin', -1)
            vmax = kwargs.get('vmax', 1)
            if vmin == 'auto' == vmax:
                vmax = max([abs(npmin(matrix)), abs(npmax(matrix))])
                vmin = -vmax
            # Only build output paths when a directory was given: with
            # show=True and savefig=None the concatenation used to raise a
            # TypeError.
            # NOTE(review): assumes the plotting helpers accept None as
            # "do not save" -- confirm against their definitions.
            if savefig:
                fig_path = savefig + '/chr' + sec + '.pdf'
                summ_path = savefig + '/chr' + sec + '_summ.pdf'
            else:
                fig_path = summ_path = None
            plot_compartments(sec, first, cmprts, matrix, show, fig_path,
                              vmin=vmin, vmax=vmax)
            plot_compartments_summary(sec, cmprts, show, summ_path)

    self.compartments = cmprts
    if savedata:
        self.write_compartments(savedata)