def process_TAD(hic_data, perc_zero, reso, cpus, outdir, bins):
    """
    Filter, normalize and export a Hi-C dataset, then call TADs on each
    chromosome.

    Steps, in order: flag poor columns, compute biases, write the four
    cis/trans ratios, compute expected counts, store the matrices and finally
    run ``tadbit`` chromosome by chromosome, writing one table per chromosome.

    Files written (all paths are ``outdir`` + file name):
       - bad_rows_<reso>_<perc_zero>.tsv and bias_<reso>.tsv (through
         ``compress``)
       - cis_trans_ratio_<reso>.tsv
       - whatever ``write_matrices`` produces
       - tads_<chromosome>_<reso>.tsv

    :param hic_data: Hi-C data object; it must provide the filter_columns,
       normalize_hic, cis_trans_ratio and get_matrix methods and the bads,
       bias, chromosomes and section_pos attributes used below. Its
       ``expected`` attribute is overwritten.
    :param perc_zero: threshold given to filter_columns; replaced by 100 when
       filter_columns raises ValueError.
    :param reso: resolution; a bin position times ``reso`` is written as the
       coordinate of the bin, and ``nice(reso)`` is used in file names.
    :param cpus: number of CPUs handed to ``tadbit``.
    :param outdir: prefix of every output path. It is concatenated as is, so
       it has to end with a path separator already.
    :param bins: dictionary whose values are bin indices as used by
       ``hic_data``; it is inverted here. Keys are indexable: element 0 is a
       string (presumably the chromosome name) and element 1 a number
       (presumably the bin position inside the chromosome) -- TODO confirm
       against the caller.
    """
    # Get poor bins

    print('Get poor bins...')

    try:
        hic_data.filter_columns(perc_zero=perc_zero, by_mean=True)
    except ValueError:
        # retry once with perc_zero=100; the value actually used also ends
        # up in the name of the bad-rows file
        perc_zero = 100
        hic_data.filter_columns(perc_zero=perc_zero, by_mean=True)

    # bin index -> key of ``bins``
    binsrev = {index: key for key, index in bins.iteritems()}

    bad_file = outdir + 'bad_rows_%s_%d.tsv' % (nice(reso), perc_zero)
    bads = ['\t'.join((binsrev[i][0], str(binsrev[i][1] * reso), str(i)))
            for i in hic_data.bads]

    compress(bads, bad_file)

    # Identify biases

    print('Get biases using ICE...')

    hic_data.normalize_hic(silent=False, max_dev=0.1, iterations=0,
                           factor=1) # cells of the matrix have a mean of 1

    bias_file = outdir + 'bias_%s.tsv' % nice(reso)
    bias = ['\t'.join((binsrev[i][0], str(binsrev[i][1] * reso),
                       '%d\t%f' % (i, hic_data.bias[i])))
            for i in hic_data.bias]

    compress(bias, bias_file)

    # percentage of cis interactions

    print('Getting percentage of cis interactions...')

    # keys are (normalized, diagonal); every ratio is computed before the
    # output file is created
    ratios = {}
    for normalized, diagonal in ((True, True), (False, True),
                                 (True, False), (False, False)):
        ratios[normalized, diagonal] = hic_data.cis_trans_ratio(
            normalized=normalized, diagonal=diagonal)

    cistrans_file = outdir + 'cis_trans_ratio_%s.tsv' % nice(reso)

    # ``with`` closes the file even if one of the writes fails
    with open(cistrans_file, "w") as out_cistrans:
        for normalized, diagonal in ((True, True), (True, False),
                                     (False, True), (False, False)):
            out_cistrans.write("Cis/trans_ratio\t%s\t%s\t%s\n" % (
                'normalized' if normalized else 'raw',
                'with_diagonal' if diagonal else 'without_diagonal',
                ratios[normalized, diagonal]))

    # Compute expected

    print('Get expected counts ...')

    hic_data.expected = expected(hic_data, bads = hic_data.bads)

    # store matrices

    print('Store matrices')

    write_matrices(hic_data, outdir, reso)

    # getting TAD borders

    print('Searching TADs')

    for crm in hic_data.chromosomes:

        print('  - %s' % crm)

        matrix = hic_data.get_matrix(focus=crm)
        beg, end = hic_data.section_pos[crm]
        size = len(matrix)

        if size < 10:
            print("     Chromosome too short (%d bins), skipping..." % size)
            continue

        # transform bad column in chromosome referential: one flag per bin
        # of the chromosome, 1 when the bin is listed in hic_data.bads

        remove = tuple(1 if i in hic_data.bads else 0
                       for i in xrange(beg, end))

        # the maximum size of a TAD is the whole chromosome

        result = tadbit([matrix], remove=remove,
                        n_cpus=cpus, verbose=False,
                        max_tad_size=size,
                        no_heuristic=0)

        tads = load_tad_height(result, size, beg, end, hic_data)

        # header and rows both have five tab-separated columns
        rows = ['#\tstart\tend\tscore\tdensity\n']

        for tad in tads:
            info = tads[tad]
            # start and end are written shifted by one
            rows.append('%s\t%s\t%s\t%s\t%s\n' % (
                tad, int(info['start'] + 1), int(info['end'] + 1),
                abs(info['score']), round(float(info['height']), 3)))

        out_tad = outdir + 'tads_%s_%s.tsv' % (crm, nice(reso))

        with open(out_tad, 'w') as out:
            out.write(''.join(rows))
# Example #2
# 0
    def find_compartments(self, crms=None, savefig=None, savedata=None,
                          show=False, **kwargs):
        """
        Search for A/B compartments in each chromosome of the Hi-C matrix.

        The Hi-C matrix is normalized by the number of interactions expected
        at a given distance, and by visibility (one iteration of ICE). A
        correlation matrix is then calculated from this normalized matrix, and
        its first eigenvector is used to identify compartments, changes in
        sign marking the boundaries between compartments.

        The result is stored in ``self.compartments`` as a dictionary whose
        keys are chromosome names and whose values are lists of dictionaries
        with the 'start' and 'end' bins of each compartment and its density
        relative to the chromosome mean ('dens'). Nothing is returned.

        All keyword arguments are also forwarded to ``expected`` when the
        expected counts have to be computed.

        :param 99 perc_zero: to filter bad columns
        :param 0.05 signal_to_noise: to calculate expected interaction counts,
           if not enough reads are observed at a given distance the observations
           of the distance+1 are summed. a signal to noise ratio of < 0.05
           corresponds to > 400 reads.
        :param None crms: only runs these given list of chromosomes
        :param None savefig: path to a directory to store matrices with
           compartment predictions, two files per chromosome, stored under
           'chr<chromosome-name>.pdf' and 'chr<chromosome-name>_summ.pdf'.
        :param False show: show the plot
        :param None savedata: path to a new file to store compartment
           predictions, one file only.
        :param -1 vmin: for the color scale of the plotted map
        :param 1 vmax: for the color scale of the plotted map; when both vmin
           and vmax are 'auto' the scale is made symmetric around zero using
           the largest absolute value of the correlation matrix.
        :param verbose: print progress messages; defaults to True for the
           filtering/normalization messages and to False for the
           per-chromosome message.

        TODO: this is really slow...

        Notes: building the distance matrix using the amount of interactions
               instead of the mean correlation, gives generally worse results.

        """
        perc_zero = kwargs.get('perc_zero', 99)
        if not self.bads:
            if kwargs.get('verbose', True):
                # report the threshold that is actually applied
                print('Filtering bad columns %s' % perc_zero)
            self.filter_columns(perc_zero=perc_zero, by_mean=False,
                                silent=True)
        if not self.expected:
            if kwargs.get('verbose', True):
                print('Normalizing by expected values')
            self.expected = expected(self, bads=self.bads, **kwargs)
        if not self.bias:
            if kwargs.get('verbose', True):
                print('Normalizing by ICE (1 round)')
            self.normalize_hic(iterations=0)
        if savefig:
            mkdir(savefig)

        cmprts = {}
        for sec in self.section_pos:
            if crms and sec not in crms:
                continue
            if kwargs.get('verbose', False):
                print('Processing chromosome %s' % sec)
                warn('Processing chromosome %s' % (sec))
            # bins of this chromosome are beg .. end - 1 (end is exclusive)
            beg, end = self.section_pos[sec]
            good = [i for i in xrange(beg, end) if i not in self.bads]
            # observed / expected-at-distance / bias_i / bias_j, bad bins
            # left out
            matrix = [[(float(self[i, j]) / self.expected[abs(j - i)]
                        / self.bias[i] / self.bias[j])
                       for i in good]
                      for j in good]
            if not matrix: # MT chromosome will fall there
                warn('Chromosome %s is probably MT :)' % (sec))
                cmprts[sec] = []
                continue
            # force symmetry: copy the lower triangle over the upper one
            for i in xrange(len(matrix)):
                for j in xrange(i + 1, len(matrix)):
                    matrix[i][j] = matrix[j][i]
            matrix = [list(m) for m in corrcoef(matrix)]
            try:
                # This eighs is very very fast, only ask for one eigvector
                _, evect = eigsh(array(matrix), k=1)
            except LinAlgError:
                warn('Chromosome %s too small to compute PC1' % (sec))
                cmprts[sec] = [] # Y chromosome, or so...
                continue
            first = list(evect[:, -1])
            # Put the bad bins back, as 0 in the eigenvector and as NaN rows
            # and columns in the matrix. Offsets have to be processed in
            # ascending order for the positional inserts to land on the
            # original bin positions, and a bad bin sitting at ``end``
            # belongs to the next chromosome.
            bads = [k - beg for k in sorted(self.bads) if beg <= k < end]
            for b in bads:
                first.insert(b, 0)
            for b in bads:
                matrix.insert(b, [float('nan')] * len(matrix[0]))
            for b in bads:
                for i in xrange(len(first)):
                    matrix[i].insert(b, float('nan'))
            # a boundary is recorded wherever two consecutive values of the
            # eigenvector have opposite signs
            limits = [0] + [i for i, (a, b) in
                            enumerate(zip(first[1:], first[:-1]))
                            if a * b < 0] + [len(first)]
            breaks = [{'start': start, 'end': stop}
                      for start, stop in zip(limits[:-1], limits[1:])]
            cmprts[sec] = breaks

            # calculate compartment internal density: mean normalized
            # interaction over the upper triangle (diagonal included) of the
            # compartment, bad bins excluded
            for cmprt in cmprts[sec]:
                beg1, end1 = cmprt['start'] + beg, cmprt['end'] + beg
                sec_matrix = [(self[i, j] / self.expected[abs(j - i)]
                               / self.bias[i] / self.bias[j])
                              for i in xrange(beg1, end1)
                              if i not in self.bads
                              for j in xrange(i, end1)
                              if j not in self.bads]
                try:
                    cmprt['dens'] = sum(sec_matrix) / len(sec_matrix)
                except ZeroDivisionError:
                    cmprt['dens'] = 0.
            # express densities relative to the mean over the chromosome
            try:
                meanh = (sum([cmprt['dens'] for cmprt in cmprts[sec]])
                         / len(cmprts[sec]))
            except ZeroDivisionError:
                meanh = 1.
            for cmprt in cmprts[sec]:
                try:
                    cmprt['dens'] /= meanh
                except ZeroDivisionError:
                    cmprt['dens'] = 1.
            # try gamma = 0.00, 0.01 ... 1.00 and keep the one for which the
            # first element returned by _find_ab_compartments is smallest,
            # then run it once more with save=True
            gammas = {}
            for gamma in range(101):
                gammas[gamma] = _find_ab_compartments(float(gamma) / 100,
                                                      matrix, breaks,
                                                      cmprts[sec],
                                                      save=False)
            gamma = min(sorted(gammas), key=lambda k: gammas[k][0])
            _find_ab_compartments(float(gamma) / 100, matrix, breaks,
                                  cmprts[sec], save=True)
            if savefig or show:
                vmin = kwargs.get('vmin', -1)
                vmax = kwargs.get('vmax',  1)
                if vmin == 'auto' == vmax:
                    vmax = max([abs(npmin(matrix)), abs(npmax(matrix))])
                    vmin = -vmax
                # Only build output paths when a directory was given, so that
                # show=True without savefig no longer fails on None + str.
                # NOTE(review): assumes the plotting helpers treat a None
                # path as "do not save" -- confirm.
                if savefig:
                    fig_path = savefig + '/chr' + sec + '.pdf'
                    summ_path = savefig + '/chr' + sec + '_summ.pdf'
                else:
                    fig_path = summ_path = None
                plot_compartments(sec, first, cmprts, matrix, show,
                                  fig_path, vmin=vmin, vmax=vmax)
                plot_compartments_summary(sec, cmprts, show, summ_path)

        self.compartments = cmprts
        if savedata:
            self.write_compartments(savedata)
def process_AB(hic_data, perc_zero, reso, outdir, bins):
    """
    Filter, normalize and export a Hi-C dataset, then search for A/B
    compartments.

    Steps, in order: flag poor columns, compute biases, write the four
    cis/trans ratios, compute expected counts, store the matrices, run
    ``hic_data.find_compartments`` and export its result.

    Files written (all paths are ``outdir`` + file name):
       - bad_rows_<reso>_<perc_zero>.tsv, bias_<reso>.tsv and ev_<reso>.tsv
         (through ``compress``)
       - cis_trans_ratio_<reso>.tsv
       - whatever ``write_matrices`` produces
       - compartments_<reso>.tsv (through hic_data.write_compartments)

    :param hic_data: Hi-C data object; it must provide the filter_columns,
       normalize_hic, cis_trans_ratio, find_compartments and
       write_compartments methods and the bads and bias attributes used
       below. Its ``expected`` attribute is overwritten.
    :param perc_zero: threshold given to filter_columns; replaced by 100 when
       filter_columns raises ValueError.
    :param reso: resolution; a bin position times ``reso`` is written as the
       coordinate of the bin, and ``nice(reso)`` is used in file names.
    :param outdir: prefix of every output path. It is concatenated as is, so
       it has to end with a path separator already.
    :param bins: dictionary whose values are bin indices as used by
       ``hic_data``; it is inverted here. Keys are indexable: element 0 is a
       string (presumably the chromosome name) and element 1 a number
       (presumably the bin position inside the chromosome) -- TODO confirm
       against the caller.
    """
    # Get poor bins

    print('Get poor bins...')
    try:
        hic_data.filter_columns(perc_zero=perc_zero, by_mean=True)
    except ValueError:
        # retry once with perc_zero=100; the value actually used also ends
        # up in the name of the bad-rows file
        perc_zero = 100
        hic_data.filter_columns(perc_zero=perc_zero, by_mean=True)

    # bin index -> key of ``bins``
    binsrev = {index: key for key, index in bins.iteritems()}

    bad_file = outdir + 'bad_rows_%s_%d.tsv' % (nice(reso), perc_zero)
    bads = ['\t'.join((binsrev[i][0], str(binsrev[i][1] * reso), str(i)))
            for i in hic_data.bads]

    compress(bads, bad_file)

    # Identify biases

    print('Get biases using ICE...')

    hic_data.normalize_hic(silent=False, max_dev=0.1, iterations=0,
                           factor=1) # cells of the matrix have a mean of 1

    bias_file = outdir + 'bias_%s.tsv' % nice(reso)
    bias = ['\t'.join((binsrev[i][0], str(binsrev[i][1] * reso),
                       '%d\t%f' % (i, hic_data.bias[i])))
            for i in hic_data.bias]

    compress(bias, bias_file)

    # percentage of cis interactions

    print('Getting percentage of cis interactions...')

    # keys are (normalized, diagonal); every ratio is computed before the
    # output file is created
    ratios = {}
    for normalized, diagonal in ((True, True), (False, True),
                                 (True, False), (False, False)):
        ratios[normalized, diagonal] = hic_data.cis_trans_ratio(
            normalized=normalized, diagonal=diagonal)

    cistrans_file = outdir + 'cis_trans_ratio_%s.tsv' % nice(reso)

    # ``with`` closes the file even if one of the writes fails
    with open(cistrans_file, "w") as out_cistrans:
        for normalized, diagonal in ((True, True), (True, False),
                                     (False, True), (False, False)):
            out_cistrans.write("Cis/trans_ratio\t%s\t%s\t%s\n" % (
                'normalized' if normalized else 'raw',
                'with_diagonal' if diagonal else 'without_diagonal',
                ratios[normalized, diagonal]))

    # Compute expected

    print('Get expected counts ...')

    hic_data.expected = expected(hic_data, bads = hic_data.bads)

    # store matrices

    print('Store matrices')
    write_matrices(hic_data, outdir, reso)

    # getting compartments

    print('Searching compartments')

    # NOTE(review): this relies on find_compartments() returning a dictionary
    # keyed by chromosome whose values hold two parallel sequences (indices 0
    # and 1). The find_compartments definitions visible in this file return
    # nothing -- confirm which implementation hic_data uses.
    ev = hic_data.find_compartments()
    ev_file = outdir + 'ev_%s.tsv' % nice(reso)

    out = []
    for ch in sorted(ev):
        vectors = ev[ch]
        # NOTE(review): the last position of each chromosome is not written
        # (range stops at len - 1) -- confirm this is intended.
        for i in xrange(len(vectors[0]) - 1):
            out.append("\t".join((ch, str(i * reso),
                                  str(vectors[0][i]), str(vectors[1][i]))))

    compress(out, ev_file)

    cmprt_file = outdir + 'compartments_%s.tsv' % nice(reso)
    hic_data.write_compartments(cmprt_file)
# Example #4
# 0
    def find_compartments(self,
                          crms=None,
                          savefig=None,
                          savedata=None,
                          show=False,
                          **kwargs):
        """
        Search for A/B compartments in each chromosome of the Hi-C matrix.

        The Hi-C matrix is normalized by the number of interactions expected
        at a given distance, and by visibility (one iteration of ICE). A
        correlation matrix is then calculated from this normalized matrix, and
        its first eigenvector is used to identify compartments, changes in
        sign marking the boundaries between compartments.

        The result is stored in ``self.compartments`` as a dictionary whose
        keys are chromosome names and whose values are lists of dictionaries
        with the 'start' and 'end' bins of each compartment and its density
        relative to the chromosome mean ('dens'). Nothing is returned.

        All keyword arguments are also forwarded to ``expected`` when the
        expected counts have to be computed.

        :param 99 perc_zero: to filter bad columns
        :param 0.05 signal_to_noise: to calculate expected interaction counts,
           if not enough reads are observed at a given distance the observations
           of the distance+1 are summed. a signal to noise ratio of < 0.05
           corresponds to > 400 reads.
        :param None crms: only runs these given list of chromosomes
        :param None savefig: path to a directory to store matrices with
           compartment predictions, two files per chromosome, stored under
           'chr<chromosome-name>.pdf' and 'chr<chromosome-name>_summ.pdf'.
        :param False show: show the plot
        :param None savedata: path to a new file to store compartment
           predictions, one file only.
        :param -1 vmin: for the color scale of the plotted map
        :param 1 vmax: for the color scale of the plotted map; when both vmin
           and vmax are 'auto' the scale is made symmetric around zero using
           the largest absolute value of the correlation matrix.
        :param verbose: print progress messages; defaults to True for the
           filtering/normalization messages and to False for the
           per-chromosome message.

        TODO: this is really slow...

        Notes: building the distance matrix using the amount of interactions
               instead of the mean correlation, gives generally worse results.

        """
        perc_zero = kwargs.get('perc_zero', 99)
        if not self.bads:
            if kwargs.get('verbose', True):
                # report the threshold that is actually applied
                print('Filtering bad columns %s' % perc_zero)
            self.filter_columns(perc_zero=perc_zero,
                                by_mean=False,
                                silent=True)
        if not self.expected:
            if kwargs.get('verbose', True):
                print('Normalizing by expected values')
            self.expected = expected(self, bads=self.bads, **kwargs)
        if not self.bias:
            if kwargs.get('verbose', True):
                print('Normalizing by ICE (1 round)')
            self.normalize_hic(iterations=0)
        if savefig:
            mkdir(savefig)

        cmprts = {}
        for sec in self.section_pos:
            if crms and sec not in crms:
                continue
            if kwargs.get('verbose', False):
                print('Processing chromosome %s' % sec)
                warn('Processing chromosome %s' % (sec))
            # bins of this chromosome are beg .. end - 1 (end is exclusive)
            beg, end = self.section_pos[sec]
            good = [i for i in xrange(beg, end) if i not in self.bads]
            # observed / expected-at-distance / bias_i / bias_j, bad bins
            # left out
            matrix = [[(float(self[i, j]) / self.expected[abs(j - i)] /
                        self.bias[i] / self.bias[j])
                       for i in good]
                      for j in good]
            if not matrix:  # MT chromosome will fall there
                warn('Chromosome %s is probably MT :)' % (sec))
                cmprts[sec] = []
                continue
            # force symmetry: copy the lower triangle over the upper one
            for i in xrange(len(matrix)):
                for j in xrange(i + 1, len(matrix)):
                    matrix[i][j] = matrix[j][i]
            matrix = [list(m) for m in corrcoef(matrix)]
            try:
                # This eighs is very very fast, only ask for one eigvector
                _, evect = eigsh(array(matrix), k=1)
            except LinAlgError:
                warn('Chromosome %s too small to compute PC1' % (sec))
                cmprts[sec] = []  # Y chromosome, or so...
                continue
            first = list(evect[:, -1])
            # Put the bad bins back, as 0 in the eigenvector and as NaN rows
            # and columns in the matrix. Offsets have to be processed in
            # ascending order for the positional inserts to land on the
            # original bin positions, and a bad bin sitting at ``end``
            # belongs to the next chromosome.
            bads = [k - beg for k in sorted(self.bads) if beg <= k < end]
            for b in bads:
                first.insert(b, 0)
            for b in bads:
                matrix.insert(b, [float('nan')] * len(matrix[0]))
            for b in bads:
                for i in xrange(len(first)):
                    matrix[i].insert(b, float('nan'))
            # a boundary is recorded wherever two consecutive values of the
            # eigenvector have opposite signs
            limits = [0] + [
                i for i, (a, b) in enumerate(zip(first[1:], first[:-1]))
                if a * b < 0
            ] + [len(first)]
            breaks = [{
                'start': start,
                'end': stop
            } for start, stop in zip(limits[:-1], limits[1:])]
            cmprts[sec] = breaks

            # calculate compartment internal density: mean normalized
            # interaction over the upper triangle (diagonal included) of the
            # compartment, bad bins excluded
            for cmprt in cmprts[sec]:
                beg1, end1 = cmprt['start'] + beg, cmprt['end'] + beg
                sec_matrix = [(self[i, j] / self.expected[abs(j - i)] /
                               self.bias[i] / self.bias[j])
                              for i in xrange(beg1, end1)
                              if i not in self.bads
                              for j in xrange(i, end1)
                              if j not in self.bads]
                try:
                    cmprt['dens'] = sum(sec_matrix) / len(sec_matrix)
                except ZeroDivisionError:
                    cmprt['dens'] = 0.
            # express densities relative to the mean over the chromosome
            try:
                meanh = sum([cmprt['dens']
                             for cmprt in cmprts[sec]]) / len(cmprts[sec])
            except ZeroDivisionError:
                meanh = 1.
            for cmprt in cmprts[sec]:
                try:
                    cmprt['dens'] /= meanh
                except ZeroDivisionError:
                    cmprt['dens'] = 1.
            # try gamma = 0.00, 0.01 ... 1.00 and keep the one for which the
            # first element returned by _find_ab_compartments is smallest,
            # then run it once more with save=True
            gammas = {}
            for gamma in range(101):
                gammas[gamma] = _find_ab_compartments(float(gamma) / 100,
                                                      matrix,
                                                      breaks,
                                                      cmprts[sec],
                                                      save=False)
            gamma = min(sorted(gammas), key=lambda k: gammas[k][0])
            _find_ab_compartments(float(gamma) / 100,
                                  matrix,
                                  breaks,
                                  cmprts[sec],
                                  save=True)
            if savefig or show:
                vmin = kwargs.get('vmin', -1)
                vmax = kwargs.get('vmax', 1)
                if vmin == 'auto' == vmax:
                    vmax = max([abs(npmin(matrix)), abs(npmax(matrix))])
                    vmin = -vmax
                # Only build output paths when a directory was given, so that
                # show=True without savefig no longer fails on None + str.
                # NOTE(review): assumes the plotting helpers treat a None
                # path as "do not save" -- confirm.
                if savefig:
                    fig_path = savefig + '/chr' + sec + '.pdf'
                    summ_path = savefig + '/chr' + sec + '_summ.pdf'
                else:
                    fig_path = summ_path = None
                plot_compartments(sec,
                                  first,
                                  cmprts,
                                  matrix,
                                  show,
                                  fig_path,
                                  vmin=vmin,
                                  vmax=vmax)
                plot_compartments_summary(sec, cmprts, show, summ_path)

        self.compartments = cmprts
        if savedata:
            self.write_compartments(savedata)