Exemplo n.º 1
0
    def test_01_tadbit(self):
        """
        Run tadbit on the four chrT test matrices and check experiment 1.

        The four results are bound to the module-level names ``exp1`` to
        ``exp4`` (``exp4`` is computed with ``get_weights=True``).  Only the
        'start' and 'score' entries of ``exp1`` are asserted here.  When
        ``CHKTIME`` is true, the elapsed time is printed after the checks.
        """
        global exp1, exp2, exp3, exp4

        # Parenthesised single-argument prints give the same output under the
        # Python 2 print statement and the Python 3 print function.
        print('PYTHON SIDE')
        print('-----------')

        if CHKTIME:
            t0 = time()

        # keyword arguments shared by the four tadbit calls
        common = dict(max_tad_size="max", verbose=False, no_heuristic=False,
                      n_cpus='max')
        exp1 = tadbit(PATH + '/40Kb/chrT/chrT_A.tsv', **common)
        exp2 = tadbit(PATH + '/20Kb/chrT/chrT_B.tsv', **common)
        exp3 = tadbit(PATH + '/20Kb/chrT/chrT_C.tsv', **common)
        exp4 = tadbit(PATH + '/20Kb/chrT/chrT_D.tsv', get_weights=True,
                      **common)

        # Breaks and scores with square root normalization.
        #breaks = [0, 4, 10, 15, 23, 29, 38, 45]
        #scores = [7.0, 7.0, 5.0, 7.0, 4.0, 6.0, 8.0, None]
        breaks = [0, 4, 10, 15, 20, 25, 31, 36, 45]
        scores = [7.0, 7.0, 4.0, 4.0, 4.0, 4.0, 4.0, 7.0, None]
        self.assertEqual(exp1['start'], breaks)
        self.assertEqual(exp1['score'], scores)

        if CHKTIME:
            # a single formatted string keeps the "1 <seconds>" output whether
            # print is a statement or a function
            print('1 %s' % (time() - t0))
Exemplo n.º 2
0
    def test_01_tadbit(self):
        """
        Run tadbit on the four chrT test matrices and check experiment 1.

        The results are stored in the module-level names ``exp1`` to ``exp4``
        (``exp4`` is computed with ``get_weights=True``).  Only the 'start'
        and 'score' entries of ``exp1`` are compared with expected values.
        The elapsed time is printed at the end when ``CHKTIME`` is true.
        """
        global exp1, exp2, exp3, exp4

        # Single-argument parenthesised prints behave identically as Python 2
        # print statements and as Python 3 print() calls.
        print('PYTHON SIDE')
        print('-----------')

        # if ONLY and ONLY != '01':
        #     return

        if CHKTIME:
            t0 = time()

        # options common to every tadbit call of this test
        shared = dict(max_tad_size="max", verbose=False, no_heuristic=False,
                      n_cpus='max')
        exp1 = tadbit(PATH + '/40Kb/chrT/chrT_A.tsv', **shared)
        exp2 = tadbit(PATH + '/20Kb/chrT/chrT_B.tsv', **shared)
        exp3 = tadbit(PATH + '/20Kb/chrT/chrT_C.tsv', **shared)
        exp4 = tadbit(PATH + '/20Kb/chrT/chrT_D.tsv', get_weights=True,
                      **shared)

        # Breaks and scores with square root normalization.
        #breaks = [0, 4, 10, 15, 23, 29, 38, 45]
        #scores = [7.0, 7.0, 5.0, 7.0, 4.0, 6.0, 8.0, None]
        breaks = [0, 4, 10, 15, 20, 25, 31, 36, 45]
        scores = [7.0, 7.0, 4.0, 4.0, 4.0, 4.0, 4.0, 7.0, None]
        self.assertEqual(exp1['start'], breaks)
        self.assertEqual(exp1['score'], scores)

        if CHKTIME:
            # formatted once so the "1 <seconds>" line is the same whether
            # print is a statement or a function
            print('1 %s' % (time() - t0))
Exemplo n.º 3
0
 def test_tad_multi_aligner(self):
     """
     Align the TADs of four chrT experiments and check score and p-value.

     tadbit is run on the A, B, C and D matrices, the results are loaded into
     a Chromosome at resolution 20000, and the values returned by
     ``align_experiments`` are compared after rounding (score to 3 decimals,
     p-value to 1 decimal).
     """
     matrix_paths = (
         "chrT/chrT_A.tsv",
         "chrT/chrT_B.tsv",
         "chrT/chrT_C.tsv",
         "chrT/chrT_D.tsv",
     )
     # one tadbit run per matrix, kept in A, B, C, D order
     tad_results = [
         tadbit(matrix_path, max_tad_size="auto", verbose=False, no_heuristic=False)
         for matrix_path in matrix_paths
     ]
     test_chr = Chromosome(
         name="Test Chromosome",
         resolution=20000,
         experiments=tad_results,
         experiment_names=["exp1", "exp2", "exp3", "exp4"],
     )
     score, pval = test_chr.align_experiments(verbose=False, randomize=True)
     expected_score = round(19.555803, 3)
     expected_pval = round(0.4, 1)
     self.assertEqual(expected_score, round(score, 3))
     self.assertEqual(expected_pval, round(pval, 1))
Exemplo n.º 4
0
def main():
    """
    Run tadbit on one test matrix, then batch_tadbit on its directory, and
    plot both sets of results over the matrix.

    The first two elements of each result are printed as right-aligned
    columns ('{:>6}' for element 0, '{:>6.1f}' for element 1).  Element 0 is
    also drawn as vertical and horizontal lines over the log2 image of the
    matrix; the batch result is overlaid in red on a second figure.
    """
    # The input matrix is hard-coded.
    chrom = get_matrix('chrT/chrT_A.tsv')

    out = tadbit(chrom, verbose=True, heuristic=True)
    # Format first, then print: "print(...).format(...)" only works as a
    # Python 2 print statement and fails on None under Python 3.
    print(('{:>6} ' * len(out[0])).format(*out[0]))
    print(('{:>6.1f} ' * len(out[1])).format(*out[1]))

    plt.imshow(log2(chrom.T), origin='lower')
    plt.vlines(out[0], 0, chrom.shape[0])
    plt.hlines(out[0], 0, chrom.shape[0])
    plt.show()

    chrom_path = 'chrT/'
    out_batch = batch_tadbit(chrom_path, n_cpus=1, heuristic=True)
    print(('{:>6} ' * len(out_batch[0])).format(*out_batch[0]))
    print(('{:>6.1f} ' * len(out_batch[1])).format(*out_batch[1]))

    # second figure: single-matrix result in the default colour, batch in red
    plt.imshow(log2(chrom.T), origin='lower')
    plt.vlines(out[0], 0, chrom.shape[0])
    plt.hlines(out[0], 0, chrom.shape[0])
    plt.vlines(out_batch[0], 0, chrom.shape[0], color='red')
    plt.hlines(out_batch[0], 0, chrom.shape[0], color='red')
    plt.show()
Exemplo n.º 5
0
def main():
    """
    Run tadbit on one test matrix, then batch_tadbit on its directory, and
    plot both sets of results over the matrix.

    Elements 0 and 1 of each result are printed as right-aligned columns
    ("{:>6}" and "{:>6.1f}" respectively).  Element 0 is drawn as vertical
    and horizontal lines over the log2 image of the matrix; on the second
    figure the batch result is added in red.
    """
    # The input matrix is hard-coded.
    chrom = get_matrix("chrT/chrT_A.tsv")

    out = tadbit(chrom, verbose=True, heuristic=True)
    # The string must be formatted inside the call: under Python 3
    # "print(...).format(...)" calls .format on None and raises.
    print(("{:>6} " * len(out[0])).format(*out[0]))
    print(("{:>6.1f} " * len(out[1])).format(*out[1]))

    plt.imshow(log2(chrom.T), origin="lower")
    plt.vlines(out[0], 0, chrom.shape[0])
    plt.hlines(out[0], 0, chrom.shape[0])
    plt.show()

    chrom_path = "chrT/"
    out_batch = batch_tadbit(chrom_path, n_cpus=1, heuristic=True)
    print(("{:>6} " * len(out_batch[0])).format(*out_batch[0]))
    print(("{:>6.1f} " * len(out_batch[1])).format(*out_batch[1]))

    # second figure: single-matrix result plus the batch result in red
    plt.imshow(log2(chrom.T), origin="lower")
    plt.vlines(out[0], 0, chrom.shape[0])
    plt.hlines(out[0], 0, chrom.shape[0])
    plt.vlines(out_batch[0], 0, chrom.shape[0], color="red")
    plt.hlines(out_batch[0], 0, chrom.shape[0], color="red")
    plt.show()
Exemplo n.º 6
0
 def find_TAD(self, experiments, n_cpus=None, verbose=True, max_tad_size="auto", no_heuristic=False):
     """
     Run tadbit on each requested experiment and register the TADs found.

     For every key in 'experiments', the "hi-c" entry of
     self.experiments[key] is passed to tadbit (always with
     get_weights=True) and the returned result and weights are handed to
     self.add_TAD_def under that key.

     :argument experiments: iterable of keys of self.experiments
     :argument None n_cpus: forwarded unchanged to tadbit
     :argument True verbose: forwarded unchanged to tadbit
     :argument auto max_tad_size: forwarded unchanged to tadbit
     :argument False no_heuristic: forwarded unchanged to tadbit

     """
     # options shared by every tadbit call of this method
     tadbit_options = dict(
         n_cpus=n_cpus,
         verbose=verbose,
         max_tad_size=max_tad_size,
         no_heuristic=no_heuristic,
         get_weights=True,
     )
     for xp_name in experiments:
         hic_matrix = self.experiments[xp_name]["hi-c"]
         tads, weights = tadbit(hic_matrix, **tadbit_options)
         self.add_TAD_def(tads, name=xp_name, weights=weights)
Exemplo n.º 7
0
    def test_01_tadbit(self):
        """
        Run tadbit on the four chrT matrices and check the first result.

        The results are bound to the module-level names ``exp1`` to ``exp4``
        (``exp4`` is computed with ``get_weights=True``); only the 'start'
        and 'score' entries of ``exp1`` are asserted.
        """
        global exp1, exp2, exp3, exp4

        # keyword arguments common to the four runs
        settings = {'max_tad_size': "auto", 'verbose': False,
                    'no_heuristic': False}
        exp1 = tadbit('40Kb/chrT/chrT_A.tsv', **settings)
        exp2 = tadbit('20Kb/chrT/chrT_B.tsv', **settings)
        exp3 = tadbit('20Kb/chrT/chrT_C.tsv', **settings)
        exp4 = tadbit('20Kb/chrT/chrT_D.tsv', get_weights=True, **settings)

        expected_starts = [0, 4, 10, 15, 23, 29, 38, 45]
        expected_scores = [8.0, 7.0, 5.0, 7.0, 4.0, 7.0, 7.0, None]
        self.assertEqual(exp1['start'], expected_starts)
        self.assertEqual(exp1['score'], expected_scores)
Exemplo n.º 8
0
    def test_01_tadbit(self):
        """
        Run tadbit (n_cpus='max') on the four chrT matrices and check the
        first result.

        The results are bound to the module-level names ``exp1`` to ``exp4``
        (``exp4`` is computed with ``get_weights=True``); only the 'start'
        and 'score' entries of ``exp1`` are asserted.
        """
        global exp1, exp2, exp3, exp4

        # keyword arguments common to the four runs
        settings = {'max_tad_size': "auto", 'verbose': False,
                    'no_heuristic': False, 'n_cpus': 'max'}
        exp1 = tadbit('40Kb/chrT/chrT_A.tsv', **settings)
        exp2 = tadbit('20Kb/chrT/chrT_B.tsv', **settings)
        exp3 = tadbit('20Kb/chrT/chrT_C.tsv', **settings)
        exp4 = tadbit('20Kb/chrT/chrT_D.tsv', get_weights=True, **settings)

        expected_starts = [0, 4, 10, 15, 23, 29, 38, 45]
        expected_scores = [8.0, 7.0, 5.0, 7.0, 4.0, 7.0, 7.0, None]
        self.assertEqual(exp1['start'], expected_starts)
        self.assertEqual(exp1['score'], expected_scores)
Exemplo n.º 9
0
    def test_01_tadbit(self):
        """
        Run tadbit on the four chrT test matrices and check experiment 1.

        The results are stored in the module-level names ``exp1`` to ``exp4``
        (``exp4`` is computed with ``get_weights=True``).  Only the "start"
        and "score" entries of ``exp1`` are compared with expected values.
        The elapsed time is printed at the end when ``CHKTIME`` is true.
        """
        global exp1, exp2, exp3, exp4

        # Single-argument parenthesised prints work both as Python 2 print
        # statements and as Python 3 print() calls.
        print("PYTHON SIDE")
        print("-----------")

        # if ONLY and ONLY != '01':
        #     return

        if CHKTIME:
            t0 = time()

        # options common to every tadbit call of this test
        shared = dict(max_tad_size="max", verbose=False, no_heuristic=False, n_cpus="max")
        exp1 = tadbit(PATH + "/40Kb/chrT/chrT_A.tsv", **shared)
        exp2 = tadbit(PATH + "/20Kb/chrT/chrT_B.tsv", **shared)
        exp3 = tadbit(PATH + "/20Kb/chrT/chrT_C.tsv", **shared)
        exp4 = tadbit(PATH + "/20Kb/chrT/chrT_D.tsv", get_weights=True, **shared)

        # Breaks and scores with square root normalization.
        # breaks = [0, 4, 10, 15, 23, 29, 38, 45]
        # scores = [7.0, 7.0, 5.0, 7.0, 4.0, 6.0, 8.0, None]
        breaks = [0, 4, 10, 15, 20, 25, 31, 36, 45]
        scores = [7.0, 7.0, 4.0, 4.0, 4.0, 4.0, 4.0, 7.0, None]
        self.assertEqual(exp1["start"], breaks)
        self.assertEqual(exp1["score"], scores)

        if CHKTIME:
            # formatted once so the "1 <seconds>" line is the same whether
            # print is a statement or a function
            print("1 %s" % (time() - t0))
Exemplo n.º 10
0
    def find_tad(self, experiments, name=None, n_cpus=1,
                 verbose=True, max_tad_size="max", heuristic=True,
                 batch_mode=False, **kwargs):
        """
        Call the :func:`pytadbit.tadbit.tadbit` function on the given
        experiments and store the TAD definitions it returns.

        :param experiments: an Experiment, an experiment identifier accepted
           by :func:`get_experiment`, or a list of those. A false value
           (None, empty list) means all the experiments of this object
        :param None name: name of the experiment created in batch mode. If
           not given it is 'batch'; any name starting with 'batch' gets the
           names of the experiments appended (e.g.: 'batch_exp1_exp2')
        :param 1 n_cpus: passed to tadbit
        :param True verbose: passed to tadbit
        :param max max_tad_size: passed to tadbit
        :param True heuristic: tadbit is called with no_heuristic set to the
           negation of this value
        :param False batch_mode: if True, the first Hi-C matrix of each
           experiment is passed to a single tadbit call and the result is
           stored in a new experiment added to this object. Needs more than
           one experiment, all at the same resolution
        :param kwargs: passed to tadbit and, in batch mode, also to the
           Experiment constructor; 'silent' is additionally read for the
           column filtering done in batch mode

        """
        # default to all experiments, and accept a single item
        experiments = experiments or self.experiments
        if not isinstance(experiments, list):
            experiments = [experiments]
        # anything that is not an Experiment is looked up in this object
        # if normalized and (not xpr._zeros or not xpr._normalization):
        #     raise Exception('ERROR: Experiments should be normalized, and' +
        #                     ' filtered first')
        xprs = [item if isinstance(item, Experiment)
                else self.get_experiment(item)
                for item in experiments]
        if batch_mode and len(xprs) <= 1:
            raise Exception('ERROR: batch_mode implies that more than one '
                            'experiment is passed')

        if not batch_mode:
            # one tadbit run per experiment, results stored in place
            for xpr in xprs:
                result = tadbit(
                    xpr.hic_data,
                    remove=tuple(1 if i in xpr._zeros else 0
                                 for i in range(xpr.size)),
                    n_cpus=n_cpus, verbose=verbose,
                    max_tad_size=max_tad_size,
                    no_heuristic=not heuristic, **kwargs)
                xpr.load_tad_def(result)
                self._get_forbidden_region(xpr)
            return

        name = name or 'batch'
        resolution = xprs[0].resolution
        matrix = []
        for xpr in sorted(xprs, key=lambda x: x.name):
            if xpr.resolution != resolution:
                raise Exception('All Experiments must have the same '
                                'resolution\n')
            matrix.append(xpr.hic_data[0])
            if name.startswith('batch'):
                name += '_' + xpr.name
        siz = xprs[0].size
        # sum of all experiments, only used to find the columns to remove
        merged = xprs[0]
        for other in xprs[1:]:
            merged = merged.__add__(other, silent=True)
        merged.filter_columns(silent=kwargs.get('silent', False))
        remove = tuple(1 if i in merged._zeros else 0 for i in range(siz))
        result = tadbit(matrix,
                        remove=remove,
                        n_cpus=n_cpus, verbose=verbose,
                        max_tad_size=max_tad_size,
                        no_heuristic=not heuristic, **kwargs)
        batch_xpr = Experiment(name, resolution, hic_data=matrix,
                               tad_def=result, **kwargs)
        # keep only the columns flagged in every single experiment
        common_zeros = xprs[0]._zeros
        for other in xprs[1:]:
            common_zeros = dict.fromkeys(
                set(common_zeros.keys()).intersection(
                    list(other._zeros.keys())))
        batch_xpr._zeros = common_zeros
        self.add_experiment(batch_xpr)
Exemplo n.º 11
0
def run(opts):
    """
    Search compartments and/or TADs and write one table per chromosome.

    Input paths and resolution come from the database unless ``opts.nosql``
    is set, in which case they are read from ``opts``. Results are written
    under '<workdir>/05_segmentation': compartments unless
    ``opts.only_tads``, TADs unless ``opts.only_compartments``. Chromosomes
    with fewer than 10 bins are skipped in the TAD search. Unless
    ``opts.nosql`` is set, the results are finally saved to the database.

    :param opts: parsed options; attributes read here are nosql, bad_co,
       biases, mreads, reso, workdir, only_tads, only_compartments, crms,
       max_tad_size and cpus
    """
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)

    if not opts.nosql:
        (bad_co, bad_co_id, biases, biases_id,
         mreads, mreads_id, reso) = load_parameters_fromdb(opts)
        # store path ids to be saved in database
        inputs = bad_co_id, biases_id, mreads_id
    else:
        bad_co = opts.bad_co
        biases = opts.biases
        mreads = opts.mreads
        reso   = opts.reso

    mreads = path.join(opts.workdir, mreads)
    bad_co = path.join(opts.workdir, bad_co)
    biases = path.join(opts.workdir, biases)

    mkdir(path.join(opts.workdir, '05_segmentation'))

    # single-argument parenthesised prints work in Python 2 and Python 3
    print('loading %s at resolution %s' % (mreads, nice(reso)))
    hic_data = load_hic_data_from_reads(mreads, reso)
    # both auxiliary files are read inside "with" so the handles get closed
    with open(bad_co) as handler:
        hic_data.bads = dict((int(line.strip()), True) for line in handler)
    with open(biases) as handler:
        bias = {}
        for line in handler:
            fields = line.split()
            bias[int(fields[0])] = float(fields[1])
    hic_data.bias = bias

    # compartments
    cmp_result = {}
    if not opts.only_tads:
        print('Searching compartments')
        hic_data.find_compartments(crms=opts.crms)

        cmprt_dir = path.join(opts.workdir, '05_segmentation',
                              'compartments_%s' % (nice(reso)))
        mkdir(cmprt_dir)
        for crm in opts.crms or hic_data.chromosomes:
            cmprt_file = path.join(cmprt_dir, '%s_%s.tsv' % (crm, param_hash))
            hic_data.write_compartments(cmprt_file,
                                        chroms=[crm])
            cmp_result[crm] = {'path': cmprt_file,
                               'num' : len(hic_data.compartments[crm])}

    # TADs
    tad_result = {}
    if not opts.only_compartments:
        print('Searching TADs')
        tad_dir = path.join(opts.workdir, '05_segmentation',
                             'tads_%s' % (nice(reso)))
        mkdir(tad_dir)
        for crm in hic_data.chromosomes:
            if opts.crms and crm not in opts.crms:
                continue
            print('  - %s' % crm)
            matrix = hic_data.get_matrix(focus=crm)
            beg, end = hic_data.section_pos[crm]
            size = len(matrix)
            if size < 10:
                print("     Chromosome too short (%d bins), skipping..." % size)
                continue
            # transform bad column in chromosome referential
            to_rm = tuple([1 if i in hic_data.bads else 0 for i in xrange(beg, end)])
            # maximum size of a TAD
            max_tad_size = size if opts.max_tad_size is None else opts.max_tad_size
            result = tadbit([matrix], remove=to_rm,
                            n_cpus=opts.cpus, verbose=True,
                            max_tad_size=max_tad_size,
                            no_heuristic=True)
            tads = load_tad_height(result, size, beg, end, hic_data)
            # header has one tab-separated name per column of the data rows
            # (the tab between 'score' and 'density' used to be missing)
            rows = ['#\tstart\tend\tscore\tdensity\n']
            for tad in tads:
                rows.append('%s\t%s\t%s\t%s\t%s\n' % (
                    tad, int(tads[tad]['start'] + 1), int(tads[tad]['end'] + 1),
                    abs(tads[tad]['score']),
                    round(float(tads[tad]['height']), 3)))
            out_tad = path.join(tad_dir, '%s_%s.tsv' % (crm, param_hash))
            with open(out_tad, 'w') as out:
                out.write(''.join(rows))
            tad_result[crm] = {'path' : out_tad,
                               'num': len(tads)}

    finish_time = time.localtime()

    if not opts.nosql:
        save_to_db(opts, cmp_result, tad_result, reso, inputs,
                   launch_time, finish_time)
Exemplo n.º 12
0
    def find_tad(
        self,
        experiments,
        name=None,
        n_cpus=None,
        verbose=True,
        max_tad_size="auto",
        no_heuristic=False,
        batch_mode=False,
    ):
        """
        Call :func:`pytadbit.tadbit.tadbit` function to calculate the position
        of Topologically associated domains

        :param experiments: an Experiment, an experiment identifier accepted
            by :func:`get_experiment`, or a list of those. In batch mode a
            false value (None, empty list) means all the experiments of this
            object
        :param None name: name of the experiment created in batch mode. If
            not given it is 'batch'; any name starting with 'batch' gets the
            names of the experiments appended (i.e.: if
            experiments=['exp1', 'exp2'], the name would be:
            'batch_exp1_exp2').
        :param None n_cpus: passed to tadbit
        :param True verbose: passed to tadbit
        :param auto max_tad_size: passed to tadbit
        :param False no_heuristic: passed to tadbit
        :param False batch_mode: if True, the first Hi-C matrix of every
            experiment is passed to a single tadbit call and the result is
            stored in a new experiment added to this object. All experiments
            must have the same resolution.

        TODO: check option -> name for batch mode... some dirty changes....

        """
        if batch_mode:
            matrix = []
            if not name:
                name = "batch"
            experiments = experiments or self.experiments
            # anything that is not an Experiment is looked up in this object
            xprs = [
                xpr if isinstance(xpr, Experiment) else self.get_experiment(xpr)
                for xpr in experiments
            ]
            resolution = xprs[0].resolution
            for xpr in sorted(xprs, key=lambda x: x.name):
                if xpr.resolution != resolution:
                    raise Exception("All Experiments must have the same resolution\n")
                matrix.append(xpr.hic_data[0])
                if name.startswith("batch"):
                    name += "_" + xpr.name
            result, weights = tadbit(
                matrix,
                n_cpus=n_cpus,
                verbose=verbose,
                max_tad_size=max_tad_size,
                no_heuristic=no_heuristic,
                get_weights=True,
            )
            experiment = Experiment(name, resolution, xp_handler=matrix, tad_handler=result, weights=weights)
            self.add_experiment(experiment)
            return
        if not isinstance(experiments, list):
            experiments = [experiments]
        for experiment in experiments:
            # xpr must be bound on both branches: it used to be left unset
            # (or stale from the previous iteration) for Experiment objects
            if isinstance(experiment, Experiment):
                xpr = experiment
            else:
                xpr = self.get_experiment(experiment)
            result, weights = tadbit(
                xpr.hic_data,
                n_cpus=n_cpus,
                verbose=verbose,
                max_tad_size=max_tad_size,
                no_heuristic=no_heuristic,
                get_weights=True,
            )
            xpr.load_tad_def(result, weights=weights)
            self._get_forbidden_region(xpr)
Exemplo n.º 13
0
    def find_tad(self, experiments, name=None, n_cpus=1, verbose=True,
                 max_tad_size="auto", no_heuristic=False, batch_mode=False,
                 use_visibility=False):
        """
        Call the :func:`pytadbit.tadbit.tadbit` function to calculate the
        position of Topologically Associated Domains

        :param experiments: an Experiment, an experiment identifier accepted
           by :func:`get_experiment`, or a list of those. In batch mode a
           false value (None, empty list) means all the experiments of this
           object
        :param None name: name of the experiment created in batch mode. If
           not given it is 'batch'; any name starting with 'batch' gets the
           names of the experiments appended (e.g.: if
           experiments=['exp1', 'exp2'], the name would be:
           'batch_exp1_exp2').
        :param 1 n_cpus: passed to tadbit
        :param True verbose: passed to tadbit
        :param auto max_tad_size: passed to tadbit
        :param False no_heuristic: passed to tadbit
        :param False batch_mode: if True, the first Hi-C matrix of every
           experiment is passed to a single tadbit call and the result is
           stored in a new experiment added to this object. All experiments
           must have the same resolution.
        :param False use_visibility: passed to tadbit

        TODO: check option -> name for batch mode... some dirty changes....

        """
        if batch_mode:
            matrix = []
            if not name:
                name = 'batch'
            experiments = experiments or self.experiments
            # anything that is not an Experiment is looked up in this object
            xprs = [xpr if isinstance(xpr, Experiment)
                    else self.get_experiment(xpr)
                    for xpr in experiments]
            resolution = xprs[0].resolution
            for xpr in sorted(xprs, key=lambda x: x.name):
                if xpr.resolution != resolution:
                    raise Exception('All Experiments must have the same ' +
                                    'resolution\n')
                matrix.append(xpr.hic_data[0])
                if name.startswith('batch'):
                    name += '_' + xpr.name
            result, weights = tadbit(matrix,
                                     n_cpus=n_cpus, verbose=verbose,
                                     max_tad_size=max_tad_size,
                                     no_heuristic=no_heuristic,
                                     get_weights=True,
                                     use_visibility=use_visibility)
            experiment = Experiment(name, resolution, hic_data=matrix,
                                    tad_def=result, weights=weights)
            self.add_experiment(experiment)
            return
        if not isinstance(experiments, list):
            experiments = [experiments]
        for experiment in experiments:
            # xpr must be bound on both branches: it used to be left unset
            # (or stale from the previous iteration) for Experiment objects
            if isinstance(experiment, Experiment):
                xpr = experiment
            else:
                xpr = self.get_experiment(experiment)
            result, weights = tadbit(xpr.hic_data,
                                     n_cpus=n_cpus, verbose=verbose,
                                     max_tad_size=max_tad_size,
                                     no_heuristic=no_heuristic,
                                     get_weights=True,
                                     use_visibility=use_visibility)
            xpr.load_tad_def(result, weights=weights)
            self._get_forbidden_region(xpr)
Exemplo n.º 14
0
 def test_tadbit(self):
     """
     Run tadbit on the chrT A matrix and compare the "start" and "score"
     entries of the result with the expected lists.
     """
     expected_starts = [0, 3, 9, 14, 20, 30, 38, 44, 50, 67, 72, 77, 82, 89, 94]
     expected_scores = [10.0, 10.0, 8.0, 10.0, 10.0, 6.0, 8.0, 5.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, None]
     found = tadbit("chrT/chrT_A.tsv", max_tad_size="auto", verbose=False, no_heuristic=False)
     self.assertEqual(found["start"], expected_starts)
     self.assertEqual(found["score"], expected_scores)
Exemplo n.º 15
0
    def find_tad(self, experiments, name=None, n_cpus=1, verbose=True,
                 max_tad_size="auto", no_heuristic=False, batch_mode=False,
                 **kwargs):
        """
        Call the :func:`pytadbit.tadbit.tadbit` function on the given
        experiments and store the TAD definitions and weights it returns.

        :param experiments: an Experiment, an experiment identifier accepted
           by :func:`get_experiment`, or a list of those. In batch mode a
           false value (None, empty list) means all the experiments of this
           object
        :param None name: name of the experiment created in batch mode. If
           not given it is 'batch'; any name starting with 'batch' gets the
           names of the experiments appended (e.g.: 'batch_exp1_exp2')
        :param 1 n_cpus: passed to tadbit
        :param True verbose: passed to tadbit
        :param auto max_tad_size: passed to tadbit
        :param False no_heuristic: passed to tadbit
        :param False batch_mode: if True, the first Hi-C matrix of every
           experiment is passed to a single tadbit call and the result is
           stored in a new experiment added to this object. All experiments
           must have the same resolution
        :param kwargs: passed to tadbit and, in batch mode, also to the
           Experiment constructor

        """
        if not batch_mode:
            if type(experiments) is not list:
                experiments = [experiments]
            for item in experiments:
                # identifiers are resolved to the Experiment they designate
                xpr = (item if type(item) == Experiment
                       else self.get_experiment(item))
                result, weights = tadbit(xpr.hic_data,
                                         n_cpus=n_cpus, verbose=verbose,
                                         max_tad_size=max_tad_size,
                                         no_heuristic=no_heuristic,
                                         get_weights=True, **kwargs)
                xpr.load_tad_def(result, weights=weights)
                if self._search_centromere:
                    self._get_forbidden_region(xpr)
            return

        name = name or 'batch'
        experiments = experiments or self.experiments
        xprs = [item if type(item) == Experiment
                else self.get_experiment(item)
                for item in experiments]
        resolution = xprs[0].resolution
        matrix = []
        for xpr in sorted(xprs, key=lambda x: x.name):
            if xpr.resolution != resolution:
                raise Exception('All Experiments must have the same '
                                'resolution\n')
            matrix.append(xpr.hic_data[0])
            if name.startswith('batch'):
                name += '_' + xpr.name
        result, weights = tadbit(matrix,
                                 n_cpus=n_cpus, verbose=verbose,
                                 max_tad_size=max_tad_size,
                                 no_heuristic=no_heuristic,
                                 get_weights=True, **kwargs)
        batch_xpr = Experiment(name, resolution, hic_data=matrix,
                               tad_def=result, weights=weights, **kwargs)
        # keep only the columns flagged in every single experiment
        common_zeros = xprs[0]._zeros
        for other in xprs[1:]:
            common_zeros = dict.fromkeys(
                set(common_zeros.keys()).intersection(other._zeros.keys()))
        batch_xpr._zeros = common_zeros
        self.add_experiment(batch_xpr)
Exemplo n.º 16
0
    def find_tad(self, experiments, weights=None, name=None, n_cpus=1,
                 verbose=True, max_tad_size="max", heuristic=True,
                 batch_mode=False, **kwargs):
        """
        Call the :func:`pytadbit.tadbit.tadbit` function on the given
        experiments and store the TAD definitions it returns.

        :param experiments: an Experiment, an experiment identifier accepted
           by :func:`get_experiment`, or a list of those. A false value
           (None, empty list) means all the experiments of this object
        :param None weights: not used in the body of this method
        :param None name: name of the experiment created in batch mode. If
           not given it is 'batch'; any name starting with 'batch' gets the
           names of the experiments appended (e.g.: 'batch_exp1_exp2')
        :param 1 n_cpus: passed to tadbit
        :param True verbose: passed to tadbit
        :param max max_tad_size: passed to tadbit
        :param True heuristic: tadbit is called with no_heuristic set to the
           negation of this value
        :param False batch_mode: if True, the first Hi-C matrix of each
           experiment is passed to a single tadbit call and the result is
           stored in a new experiment added to this object. Needs more than
           one experiment, all at the same resolution
        :param kwargs: passed to tadbit and, in batch mode, also to the
           Experiment constructor; 'silent' is additionally read for the
           column filtering done in batch mode

        """
        # default to all experiments, and accept a single item
        experiments = experiments or self.experiments
        if not isinstance(experiments, list):
            experiments = [experiments]
        # anything that is not an Experiment is looked up in this object
        # if normalized and (not xpr._zeros or not xpr._normalization):
        #     raise Exception('ERROR: Experiments should be normalized, and' +
        #                     ' filtered first')
        xprs = [item if isinstance(item, Experiment)
                else self.get_experiment(item)
                for item in experiments]
        if batch_mode and len(xprs) <= 1:
            raise Exception('ERROR: batch_mode implies that more than one '
                            'experiment is passed')

        if not batch_mode:
            # one tadbit run per experiment, results stored in place
            for xpr in xprs:
                result = tadbit(
                    xpr.hic_data,
                    remove=tuple(1 if i in xpr._zeros else 0
                                 for i in xrange(xpr.size)),
                    n_cpus=n_cpus, verbose=verbose,
                    max_tad_size=max_tad_size,
                    no_heuristic=not heuristic, **kwargs)
                xpr.load_tad_def(result)
                self._get_forbidden_region(xpr)
            return

        name = name or 'batch'
        resolution = xprs[0].resolution
        matrix = []
        for xpr in sorted(xprs, key=lambda x: x.name):
            if xpr.resolution != resolution:
                raise Exception('All Experiments must have the same '
                                'resolution\n')
            matrix.append(xpr.hic_data[0])
            if name.startswith('batch'):
                name += '_' + xpr.name
        siz = xprs[0].size
        # sum of all experiments, only used to find the columns to remove
        merged = xprs[0]
        for other in xprs[1:]:
            merged = merged.__add__(other, silent=True)
        merged.filter_columns(silent=kwargs.get('silent', False))
        remove = tuple(1 if i in merged._zeros else 0 for i in xrange(siz))
        result = tadbit(matrix,
                        remove=remove,
                        n_cpus=n_cpus, verbose=verbose,
                        max_tad_size=max_tad_size,
                        no_heuristic=not heuristic, **kwargs)
        batch_xpr = Experiment(name, resolution, hic_data=matrix,
                               tad_def=result, **kwargs)
        # keep only the columns flagged in every single experiment
        common_zeros = xprs[0]._zeros
        for other in xprs[1:]:
            common_zeros = dict.fromkeys(
                set(common_zeros.keys()).intersection(other._zeros.keys()))
        batch_xpr._zeros = common_zeros
        self.add_experiment(batch_xpr)
Exemplo n.º 17
0
def run(opts):
    """
    Search for compartments and/or TADs in a Hi-C experiment and write the
    results under ``<opts.workdir>/06_segmentation``.

    The biases file and the BAM file are taken directly from ``opts`` when
    ``opts.nosql`` is set or when either ``opts.biases`` or ``opts.mreads``
    is given; otherwise they are looked up with ``load_parameters_fromdb``
    and joined to ``opts.workdir``.

    Unless ``opts.only_tads`` is set, compartments are searched and, per
    chromosome, an eigenvector table and a compartment table are written
    into ``compartments_<resolution>``. Unless ``opts.only_compartments``
    is set, TADs are called chromosome by chromosome and written into
    ``tads_<resolution>`` as tab-separated tables.

    Unless ``opts.nosql`` is set, the outcome is finally recorded with
    ``save_to_db``; if that fails the traceback is printed, the
    ``__lock_db`` file of the working directory is removed and the process
    exits with status 1.

    :param opts: parsed options. Attributes read here: ``nosql``,
       ``biases``, ``mreads``, ``workdir``, ``reso``, ``crms``, ``cpus``,
       ``all_bins``, ``filter``, ``only_tads``, ``only_compartments``,
       ``fasta``, ``rich_in_A``, ``n_evs``, ``ev_index``, ``savecorr``,
       ``fix_corr_scale``, ``format``, ``max_tad_size`` and ``verbose``.

    :raises Exception: if only one of ``opts.biases`` and ``opts.mreads``
       is provided while the database is in use.
    """
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, get_md5=True)

    if opts.nosql:
        biases = opts.biases
        mreads = opts.mreads
        inputs = []
    elif opts.biases or opts.mreads:
        if not opts.mreads:
            raise Exception('ERROR: also need to provide BAM file')
        if not opts.biases:
            raise Exception('ERROR: also need to provide biases file')
        biases = opts.biases
        mreads = opts.mreads
        inputs = ['NA', 'NA']
        mkdir(path.join(opts.workdir))
    else:
        biases, mreads, biases_id, mreads_id = load_parameters_fromdb(opts)
        # store path ids to be saved in database
        inputs = [biases_id, mreads_id]
        mreads = path.join(opts.workdir, mreads)
        biases = path.join(opts.workdir, biases)

    reso = opts.reso

    mkdir(path.join(opts.workdir, '06_segmentation'))

    print('loading %s \n    at resolution %s' % (mreads, nice(reso)))
    # restrict the loading to one region only when a single chromosome is
    # requested
    region = None
    if opts.crms and len(opts.crms) == 1:
        region = opts.crms[0]
    hic_data = load_hic_data_from_bam(mreads, reso, ncpus=opts.cpus,
                                      region=region,
                                      biases=None if opts.all_bins else biases,
                                      filter_exclude=opts.filter)

    # compartments
    cmp_result = {}
    richA_stats = {}
    firsts = {}
    if not opts.only_tads:
        print('Searching compartments')
        cmprt_dir = path.join(opts.workdir, '06_segmentation',
                              'compartments_%s' % (nice(reso)))
        mkdir(cmprt_dir)
        if opts.fasta:
            print('  - Computing GC content to label compartments')
            rich_in_A = get_gc_content(
                parse_fasta(opts.fasta, chr_filter=opts.crms), reso,
                chromosomes=opts.crms,
                by_chrom=True, n_cpus=opts.cpus)
        elif opts.rich_in_A:
            rich_in_A = opts.rich_in_A
        else:
            rich_in_A = None
        # a non-positive number of eigenvectors falls back to 3
        n_evs = opts.n_evs if opts.n_evs > 0 else 3
        firsts, richA_stats = hic_data.find_compartments(
            crms=opts.crms, savefig=cmprt_dir, verbose=True, suffix=param_hash,
            rich_in_A=rich_in_A, show_compartment_labels=rich_in_A is not None,
            savecorr=cmprt_dir if opts.savecorr else None,
            max_ev=n_evs,
            ev_index=opts.ev_index,
            vmin=None if opts.fix_corr_scale else 'auto',
            vmax=None if opts.fix_corr_scale else 'auto')

        # one eigenvector table per chromosome that got a result
        for ncrm, crm in enumerate(opts.crms or hic_data.chromosomes):
            if crm not in firsts:
                continue
            ev_path = path.join(
                cmprt_dir, '%s_EigVect%d_%s.tsv' % (
                    crm, opts.ev_index[ncrm] if opts.ev_index else 1,
                    param_hash))
            with open(ev_path, 'w') as ev_file:
                # header: rank of each entry of firsts[crm][0] with its value
                ev_file.write('# %s\n' % ('\t'.join(
                    'EV_%d (%.4f)' % (i, v)
                    for i, v in enumerate(firsts[crm][0], 1))))
                # firsts[crm][1] holds one sequence per eigenvector:
                # transpose it to get one row per bin
                ev_file.write('\n'.join(['\t'.join([str(v) for v in vs])
                                         for vs in zip(*firsts[crm][1])]))

        # compartment tables, and bookkeeping of every generated path
        for ncrm, crm in enumerate(opts.crms or hic_data.chromosomes):
            ev_num = opts.ev_index[ncrm] if opts.ev_index else 1
            cmprt_file1 = path.join(cmprt_dir, '%s_%s.tsv' % (crm, param_hash))
            cmprt_file2 = path.join(cmprt_dir, '%s_EigVect%d_%s.tsv' % (
                crm, ev_num, param_hash))
            cmprt_image = path.join(cmprt_dir, '%s_EV%d_%s.%s' % (
                crm, ev_num, param_hash, opts.format))
            if opts.savecorr:
                cormat_file = path.join(cmprt_dir, '%s_corr-matrix%s.tsv' %
                                        (crm, param_hash))
            else:
                cormat_file = None
            hic_data.write_compartments(cmprt_file1, chroms=[crm])
            cmp_result[crm] = {'path_cmprt1': cmprt_file1,
                               'path_cmprt2': cmprt_file2,
                               'path_cormat': cormat_file,
                               'image_cmprt': cmprt_image,
                               'num' : len(hic_data.compartments[crm])}

    # TADs
    tad_result = {}
    if not opts.only_compartments:
        print('Searching TADs')
        tad_dir = path.join(opts.workdir, '06_segmentation',
                            'tads_%s' % (nice(reso)))
        mkdir(tad_dir)
        # Columns masked during TAD calling are the ones known once the data
        # is loaded. They are kept apart because hic_data.bads is replaced
        # below when opts.all_bins is set, and that replacement must not
        # change the mask applied to the following chromosomes.
        masked = hic_data.bads
        # Content of the biases file, read at most once. It used to be
        # loaded into `biases` itself inside the loop, so the second
        # chromosome tried to open() the already-loaded object and failed.
        bias_data = None
        for crm in hic_data.chromosomes:
            if opts.crms and crm not in opts.crms:
                continue
            print('  - %s' % crm)
            matrix = hic_data.get_matrix(focus=crm)
            beg, end = hic_data.section_pos[crm]
            size = len(matrix)
            if size < 10:
                print("     Chromosome too short (%d bins), skipping..." % size)
                continue
            # transform bad column in chromosome referential
            if masked:
                to_rm = tuple([1 if i in masked else 0
                               for i in xrange(beg, end)])
            else:
                to_rm = None
            # maximum size of a TAD
            if opts.max_tad_size is None:
                max_tad_size = size - 1
            else:
                max_tad_size = opts.max_tad_size
            result = tadbit([matrix], remove=to_rm,
                            n_cpus=opts.cpus, verbose=opts.verbose,
                            max_tad_size=max_tad_size,
                            no_heuristic=False)

            # use normalization to compute height on TADs called
            if opts.all_bins:
                if bias_data is None:
                    # NOTE(review): outside nosql mode `biases` may already
                    # contain opts.workdir; the extra join is kept as it was
                    # -- confirm it is intended.
                    if opts.nosql:
                        bias_path = biases
                    else:
                        bias_path = path.join(opts.workdir, biases)
                    # NOTE(review): `load` is presumably pickle's loader;
                    # only point it at trusted files.
                    with open(bias_path) as bias_handle:
                        bias_data = load(bias_handle)
                hic_data.bads = bias_data['badcol']
                hic_data.bias = bias_data['biases']
            tads = load_tad_height(result, size, beg, end, hic_data)
            rows = ['%s\t%s\t%s\t%s\t%s\n' % (
                '#', 'start', 'end', 'score', 'density')]
            for tad in tads:
                # positions are written 1-based
                rows.append('%s\t%s\t%s\t%s\t%s\n' % (
                    tad, int(tads[tad]['start'] + 1),
                    int(tads[tad]['end'] + 1),
                    abs(tads[tad]['score']),
                    round(float(tads[tad]['height']), 3)))
            out_tad = path.join(tad_dir, '%s_%s.tsv' % (crm, param_hash))
            with open(out_tad, 'w') as out:
                out.write(''.join(rows))
            tad_result[crm] = {'path' : out_tad,
                               'num': len(tads)}

    finish_time = time.localtime()

    if not opts.nosql:
        try:
            save_to_db(opts, cmp_result, tad_result, reso, inputs,
                       richA_stats, firsts, param_hash,
                       launch_time, finish_time)
        except:
            # Deliberately catches everything: whatever stopped the save, the
            # database lock has to be released before leaving.
            print_exc()
            try:
                remove(path.join(opts.workdir, '__lock_db'))
            except OSError:
                pass
            exit(1)
def process_TAD(hic_data, perc_zero, reso, cpus, outdir, bins):
    """
    Filter, normalize and describe a Hi-C matrix, then call TADs on each of
    its chromosomes. Every result is written to a file whose name starts
    with ``outdir``.

    Steps, in order: filter columns, write the filtered bins, normalize,
    write the biases, write four cis/trans ratios, compute the expected
    counts (stored in ``hic_data.expected``), store the matrices, and write
    one TAD table per chromosome of at least 10 bins.

    :param hic_data: Hi-C data object; it is modified in place by the
       filtering and normalization calls.
    :param perc_zero: value passed to ``hic_data.filter_columns``; if that
       call raises ``ValueError`` it is retried with 100, and 100 is then
       used in the name of the bad-rows file.
    :param reso: resolution, used in file names and to convert bin
       positions into genomic coordinates.
    :param cpus: number of CPUs handed to ``tadbit``.
    :param outdir: prefix of every output file. It is concatenated as a
       plain string, so it should end with a path separator.
    :param bins: dictionary whose values are bin indexes and whose keys
       are pairs; presumably (chromosome name, bin position within the
       chromosome) -- confirm against the caller.
    """
    # Get poor bins
    print('Get poor bins...')
    try:
        hic_data.filter_columns(perc_zero=perc_zero, by_mean=True)
    except ValueError:
        # retry with the threshold forced to 100
        perc_zero = 100
        hic_data.filter_columns(perc_zero=perc_zero, by_mean=True)

    # bin index -> key of `bins`
    binsrev = {y: x for x, y in bins.iteritems()}

    bad_file = outdir + 'bad_rows_%s_%d.tsv' % (nice(reso), perc_zero)
    bads = [binsrev[i][0] + "\t" + str(binsrev[i][1] * reso) + "\t" + str(i)
            for i in hic_data.bads.keys()]
    compress(bads, bad_file)

    # Identify biases
    print('Get biases using ICE...')
    hic_data.normalize_hic(silent=False, max_dev=0.1, iterations=0,
                           factor=1) # cells of the matrix have a mean of 1

    bias_file = outdir + 'bias_%s.tsv' % nice(reso)
    bias = [binsrev[i][0] + "\t" + str(binsrev[i][1] * reso) + "\t" +
            '%d\t%f' % (i, hic_data.bias[i])
            for i in hic_data.bias]
    compress(bias, bias_file)

    # percentage of cis interactions
    print('Getting percentage of cis interactions...')
    cis_trans_N_D = hic_data.cis_trans_ratio(normalized=True , diagonal=True )
    cis_trans_n_D = hic_data.cis_trans_ratio(normalized=False, diagonal=True )
    cis_trans_N_d = hic_data.cis_trans_ratio(normalized=True , diagonal=False)
    cis_trans_n_d = hic_data.cis_trans_ratio(normalized=False, diagonal=False)

    cistrans_file = outdir + 'cis_trans_ratio_%s.tsv' % nice(reso)
    with open(cistrans_file, "w") as out_cistrans:
        out_cistrans.write("Cis/trans_ratio\tnormalized\twith_diagonal\t" +
                           str(cis_trans_N_D) + "\n")
        out_cistrans.write("Cis/trans_ratio\tnormalized\twithout_diagonal\t" +
                           str(cis_trans_N_d) + "\n")
        out_cistrans.write("Cis/trans_ratio\traw\twith_diagonal\t" +
                           str(cis_trans_n_D) + "\n")
        out_cistrans.write("Cis/trans_ratio\traw\twithout_diagonal\t" +
                           str(cis_trans_n_d) + "\n")

    # Compute expected
    print('Get expected counts ...')
    hic_data.expected = expected(hic_data, bads = hic_data.bads)

    # store matrices
    print('Store matrices')
    write_matrices(hic_data, outdir, reso)

    # getting TAD borders
    print('Searching TADs')
    for crm in hic_data.chromosomes:
        print('  - %s' % crm)

        matrix = hic_data.get_matrix(focus=crm)
        beg, end = hic_data.section_pos[crm]
        size = len(matrix)

        if size < 10:
            print("     Chromosome too short (%d bins), skipping..." % size)
            continue

        # transform bad column in chromosome referential
        remove = tuple([1 if i in hic_data.bads else 0
                        for i in xrange(beg, end)])

        # maximum size of a TAD
        max_tad_size = size

        result = tadbit([matrix], remove=remove,
                        n_cpus=cpus, verbose=False,
                        max_tad_size=max_tad_size,
                        no_heuristic=0)

        tads = load_tad_height(result, size, beg, end, hic_data)

        # The header used to be built with '%s\t%s\t%s\t%s%s\n', which fused
        # the last two names into "scoredensity" while the rows below have
        # five columns.
        rows = ['%s\t%s\t%s\t%s\t%s\n' % (
            '#', 'start', 'end', 'score', 'density')]
        for tad in tads:
            # positions are written 1-based
            rows.append('%s\t%s\t%s\t%s\t%s\n' % (
                tad, int(tads[tad]['start'] + 1), int(tads[tad]['end'] + 1),
                abs(tads[tad]['score']),
                round(float(tads[tad]['height']), 3)))

        out_tad = outdir + 'tads_%s_%s.tsv' % (
            crm, nice(reso))
        with open(out_tad, 'w') as out:
            out.write(''.join(rows))
Exemplo n.º 19
0
def run(opts):
    """
    Search for compartments and/or TADs in a Hi-C experiment and write the
    results under ``<opts.workdir>/05_segmentation``.

    The filtered-columns file, the biases file, the reads file and the
    resolution come from ``opts`` when ``opts.nosql`` is set, otherwise from
    ``load_parameters_fromdb``. In both cases the three paths are joined to
    ``opts.workdir``.

    Unless ``opts.only_tads`` is set, compartments are searched and, per
    chromosome, an eigenvector table and a compartment table are written
    into ``compartments_<resolution>``. Unless ``opts.only_compartments``
    is set, TADs are called chromosome by chromosome and written into
    ``tads_<resolution>`` as tab-separated tables. Unless ``opts.nosql`` is
    set, the outcome is finally recorded with ``save_to_db``.

    :param opts: parsed options. Attributes read here: ``nosql``,
       ``bad_co``, ``biases``, ``mreads``, ``reso``, ``workdir``,
       ``only_tads``, ``only_compartments``, ``crms``, ``rich_in_A``,
       ``max_tad_size`` and ``cpus``.

    :raises Exception: if the biases file cannot be read while
       compartments are requested.
    """
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)

    if opts.nosql:
        bad_co = opts.bad_co
        biases = opts.biases
        mreads = opts.mreads
        reso   = opts.reso
        inputs = []
    else:
        (bad_co, bad_co_id, biases, biases_id,
         mreads, mreads_id, reso) = load_parameters_fromdb(opts)
        # store path ids to be saved in database
        inputs = bad_co_id, biases_id, mreads_id

    mreads = path.join(opts.workdir, mreads)
    bad_co = path.join(opts.workdir, bad_co)
    biases = path.join(opts.workdir, biases)

    mkdir(path.join(opts.workdir, '05_segmentation'))

    print('loading %s \n    at resolution %s' % (mreads, nice(reso)))
    hic_data = load_hic_data_from_reads(mreads, reso)
    # one column index per line
    with open(bad_co) as bad_handle:
        hic_data.bads = dict((int(l.strip()), True) for l in bad_handle)
    print('loading filtered columns %s' % (bad_co))
    print('    with %d of %d filtered out columns' % (len(hic_data.bads),
                                                      len(hic_data)))
    try:
        # two whitespace-separated fields per line: column index, bias
        with open(biases) as bias_handle:
            hic_data.bias = dict((int(l.split()[0]), float(l.split()[1]))
                                 for l in bias_handle)
    except IOError:
        # an unreadable biases file is tolerated only when TADs alone are
        # requested
        if not opts.only_tads:
            raise Exception('ERROR: data should be normalized to get compartments')

    # compartments
    cmp_result = {}
    if not opts.only_tads:
        print('Searching compartments')
        cmprt_dir = path.join(opts.workdir, '05_segmentation',
                              'compartments_%s' % (nice(reso)))
        mkdir(cmprt_dir)
        firsts = hic_data.find_compartments(crms=opts.crms,
                                            label_compartments='cluster',
                                            savefig=cmprt_dir,
                                            suffix=param_hash, log=cmprt_dir,
                                            rich_in_A=opts.rich_in_A)

        # one eigenvector table per chromosome that got a result
        for crm in opts.crms or hic_data.chromosomes:
            if crm not in firsts:
                continue
            ev_path = path.join(cmprt_dir,
                                '%s_EigVect_%s.tsv' % (crm, param_hash))
            with open(ev_path, 'w') as ev_file:
                ev_file.write('# first EV\tsecond EV\n')
                # firsts[crm] holds one sequence per eigenvector: transpose
                # it to get one row per bin
                ev_file.write('\n'.join(['\t'.join([str(v) for v in vs])
                                         for vs in zip(*firsts[crm])]))

        for crm in opts.crms or hic_data.chromosomes:
            cmprt_file = path.join(cmprt_dir, '%s_%s.tsv' % (crm, param_hash))
            hic_data.write_compartments(cmprt_file,
                                        chroms=[crm])
            cmp_result[crm] = {'path': cmprt_file,
                               'num' : len(hic_data.compartments[crm])}

    # TADs
    tad_result = {}
    if not opts.only_compartments:
        print('Searching TADs')
        tad_dir = path.join(opts.workdir, '05_segmentation',
                            'tads_%s' % (nice(reso)))
        mkdir(tad_dir)
        for crm in hic_data.chromosomes:
            if opts.crms and crm not in opts.crms:
                continue
            print('  - %s' % crm)
            matrix = hic_data.get_matrix(focus=crm)
            beg, end = hic_data.section_pos[crm]
            size = len(matrix)
            if size < 10:
                print("     Chromosome too short (%d bins), skipping..." % size)
                continue
            # transform bad column in chromosome referential
            to_rm = tuple([1 if i in hic_data.bads else 0
                           for i in xrange(beg, end)])
            # maximum size of a TAD
            if opts.max_tad_size is None:
                max_tad_size = size
            else:
                max_tad_size = opts.max_tad_size
            result = tadbit([matrix], remove=to_rm,
                            n_cpus=opts.cpus, verbose=False,
                            max_tad_size=max_tad_size,
                            no_heuristic=False)
            tads = load_tad_height(result, size, beg, end, hic_data)
            # The header used to be built with '%s\t%s\t%s\t%s%s\n', which
            # fused the last two names into "scoredensity" while the rows
            # below have five columns.
            rows = ['%s\t%s\t%s\t%s\t%s\n' % (
                '#', 'start', 'end', 'score', 'density')]
            for tad in tads:
                # positions are written 1-based
                rows.append('%s\t%s\t%s\t%s\t%s\n' % (
                    tad, int(tads[tad]['start'] + 1),
                    int(tads[tad]['end'] + 1),
                    abs(tads[tad]['score']),
                    round(float(tads[tad]['height']), 3)))
            out_tad = path.join(tad_dir, '%s_%s.tsv' % (crm, param_hash))
            with open(out_tad, 'w') as out:
                out.write(''.join(rows))
            tad_result[crm] = {'path' : out_tad,
                               'num': len(tads)}

    finish_time = time.localtime()

    if not opts.nosql:
        save_to_db(opts, cmp_result, tad_result, reso, inputs,
                   launch_time, finish_time)