Exemplo n.º 1
0
    def tb_generate_tads(self, expt_name, adj_list, chrom, resolution,
                         normalized, tad_file):
        """
        Predict TAD boundaries for a single chromosome at a given
        resolution from a Hi-C adjacency list.

        Parameters
        ----------
        expt_name : str
            Name under which the experiment is registered on the
            Chromosome object
        adj_list : str
            Location of the adjacency list
        chrom : str
            Chromosome to analyse
        resolution : int
            Resolution to read the Hi-C adjacency list at
        normalized : bool
            Whether the adjacency list is already normalized
        tad_file : str
            Location of the output TAD file

        Returns
        -------
        bool
            True on completion; the TAD borders are written to tad_file
        """
        print("TB TAD GENERATOR:", expt_name, adj_list, chrom, resolution,
              normalized, tad_file)

        bin_size = int(resolution)
        hic_data = load_hic_data_from_reads(adj_list, resolution=bin_size)

        # ICE-normalize unless the caller says the data already is
        if normalized is False:
            hic_data.normalize_hic(iterations=9, max_dev=0.1)

        tmp_matrix_file = adj_list + "_" + str(chrom) + "_tmp.txt"
        hic_data.write_matrix(tmp_matrix_file, (chrom, chrom),
                              normalized=True)

        print("TB - chr_hic_data:", hic_data.get_matrix((chrom, chrom)))

        chrom_obj = Chromosome(name=chrom, centromere_search=True)
        chrom_obj.add_experiment(expt_name,
                                 hic_data=tmp_matrix_file,
                                 resolution=bin_size)

        # Core TADbit call: detect the TADs for this experiment
        chrom_obj.find_tad(expt_name, n_cpus=15)

        expt = chrom_obj.experiments[expt_name]
        expt.write_tad_borders(savedata=tad_file + ".tmp")

        # Copy the temporary border file to the final output location
        with open(tad_file, "wb") as f_out, \
                open(tad_file + ".tmp", "rb") as f_in:
            f_out.write(f_in.read())

        return True
    def tb_matrix_hdf5(self, adjlist_file, adj_hdf5, normalized, resolution,
                       chromosomes):
        """
        Save the Hi-C matrix into an HDF5 file.

        This has to be run sequentially as it is not possible for multiple
        streams to write to the same HDF5 file. This is a run once and leave
        operation. There also needs to be a check that no other process is
        writing to the HDF5 file at the same time. This should be done at the
        staging and unstaging level to prevent the file getting written to by
        multiple processes and generating conflicts.

        This needs to include attributes for the chromosomes for each
        resolution - see the mg-rest-adjacency hdf5_reader for further details
        about the requirement. This prevents the need for secondary storage
        details outside of the HDF5 file.

        Parameters
        ----------
        adjlist_file : str
            Location of the Hi-C adjacency list
        adj_hdf5 : str
            Location of the HDF5 output matrix file
        normalized : bool
            Whether the adjacency list is already normalized
        resolution : int
            Resolution to read the Hi-C adjacency list at
        chromosomes : list
            List of lists of the chromosome names and their size in the
            order that they are presented for indexing

        Returns
        -------
        bool
            True on completion; the matrix is stored as a dataset named
            after the resolution inside adj_hdf5
        """
        hic_data = load_hic_data_from_reads(adjlist_file,
                                            resolution=int(resolution))

        # ICE-normalize unless the data is already normalized
        if normalized is False:
            hic_data.normalize_hic(iterations=9, max_dev=0.1)

        d_size = len(hic_data)
        d_tmp = np.zeros([d_size, d_size], dtype='int32')
        d_tmp += hic_data.get_matrix()

        # Context manager guarantees the HDF5 handle is closed even if
        # create_dataset or the write raises; the previous explicit
        # open/close leaked the handle on error, which can leave the file
        # locked or corrupted for subsequent readers.
        with h5py.File(adj_hdf5, "a") as hdf5_handle:
            dset = hdf5_handle.create_dataset(str(resolution),
                                              (d_size, d_size),
                                              dtype='int32',
                                              chunks=True,
                                              compression="gzip")
            dset.attrs['chromosomes'] = chromosomes
            dset[0:d_size, 0:d_size] += d_tmp

        return True
Exemplo n.º 3
0
    def tb_hic_chr(self, adj_list, resolution):  # pylint: disable=no-self-use
        """
        Return the chromosome names present in the adjacency list.
        """
        print("TB LOADED HIC MATRIX")
        matrix = load_hic_data_from_reads(adj_list,
                                          resolution=int(resolution))
        print("TB LOADED HIC MATRIX")
        return matrix.chromosomes.keys()
Exemplo n.º 4
0
def run(opts):
    """
    06_model pipeline step: create the output folder for the requested
    region, load the Hi-C data, then either run the parameter
    optimization stage or correlate previous optimizations and launch
    the final model run.

    Parameters
    ----------
    opts : Namespace
        Parsed command-line options (workdir, crm, beg, end, matrix,
        optimize, job_list, ...).
    """
    check_options(opts)

    launch_time = time.localtime()

    # prepare output folders
    mkdir(path.join(opts.workdir, '06_model'))
    outdir = path.join(opts.workdir, '06_model',
                       'chr%s_%s-%s' % (opts.crm, opts.beg, opts.end))
    mkdir(outdir)

    # load data
    if opts.matrix:
        crm = load_hic_data(opts)
    else:
        # NOTE(review): 'crm' is only assigned in the opts.matrix branch,
        # so 'exp = crm.experiments[0]' below raises NameError when no
        # matrix is supplied; the 'hic_data' built here is never used.
        # Confirm intended behavior against the original pipeline.
        (bad_co, bad_co_id, biases, biases_id,
         mreads, mreads_id, reso) = load_parameters_fromdb(opts)
        hic_data = load_hic_data_from_reads(mreads, reso)
        hic_data.bads = dict((int(l.strip()), True) for l in open(bad_co))
        hic_data.bias = dict((int(l.split()[0]), float(l.split()[1]))
                             for l in open(biases))

    exp = crm.experiments[0]
    opts.beg, opts.end = opts.beg or 1, opts.end or exp.size

    # in case we are not going to run
    if opts.job_list:
        job_file_handler = open(path.join(outdir, 'job_list.q'), 'w')
    else:
        job_file_handler = None

    # optimization
    if opts.optimize:
        optimization(exp, opts, job_file_handler, outdir)
        finish_time = time.localtime()
        return

    # correlate all optimizations and get best set of parqameters
    optpar, dcutoff = correlate_models(opts, outdir, exp)

    # run good mmodels
    big_run(exp, opts, job_file_handler, outdir, optpar)

    finish_time = time.localtime()
Exemplo n.º 5
0
def run(opts):
    """
    04_normalization pipeline step (Python 2): load filtered reads,
    detect poor (bad) columns, ICE-normalize the Hi-C matrix, compute
    cis/trans ratios, plot interaction decay, optionally dump
    intra/inter/genome matrices, and record everything in the DB.

    Parameters
    ----------
    opts : Namespace
        Parsed command-line options (workdir, reso, perc_zeros, keep,
        only_txt, fast_filter, factor, ...).
    """
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bed:
        mreads = path.realpath(opts.bed)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    print 'loading', mreads
    hic_data = load_hic_data_from_reads(mreads, opts.reso)

    mkdir(path.join(opts.workdir, '04_normalization'))

    print 'Get poor bins...'
    # NOTE(review): on ValueError the filter is retried with perc_zero=100,
    # i.e. effectively disabling the zero-percentage threshold -- confirm
    # this silent fallback is intended rather than surfacing the error.
    try:
        hic_data.filter_columns(
            perc_zero=opts.perc_zeros,
            draw_hist=True,
            by_mean=not opts.fast_filter,
            savefig=path.join(
                opts.workdir, '04_normalization', 'bad_columns_%s_%d_%s.pdf' %
                (opts.reso, opts.perc_zeros, param_hash))
            if not opts.fast_filter else None)
    except ValueError:
        hic_data.filter_columns(
            perc_zero=100,
            draw_hist=True,
            by_mean=not opts.fast_filter,
            savefig=path.join(
                opts.workdir, '04_normalization', 'bad_columns_%s_%d_%s.pdf' %
                (opts.reso, opts.perc_zeros, param_hash))
            if not opts.fast_filter else None)

    # bad columns
    bad_columns_file = path.join(
        opts.workdir, '04_normalization',
        'bad_columns_%s_%d_%s.tsv' % (opts.reso, opts.perc_zeros, param_hash))
    out_bad = open(bad_columns_file, 'w')
    out_bad.write('\n'.join([str(i) for i in hic_data.bads.keys()]))
    out_bad.close()

    # Identify biases
    print 'Get biases using ICE...'
    # NOTE(review): iterations=0 -- presumably lets normalize_hic iterate
    # until max_dev is reached; confirm against the TADbit API.
    hic_data.normalize_hic(silent=False,
                           max_dev=0.1,
                           iterations=0,
                           factor=opts.factor)

    print 'Getting cis/trans...'
    # Four combinations: N/n = normalized/raw, D/d = with/without diagonal
    cis_trans_N_D = hic_data.cis_trans_ratio(normalized=True, diagonal=True)
    cis_trans_n_D = hic_data.cis_trans_ratio(normalized=False, diagonal=True)
    cis_trans_N_d = hic_data.cis_trans_ratio(normalized=True, diagonal=False)
    cis_trans_n_d = hic_data.cis_trans_ratio(normalized=False, diagonal=False)

    print 'Cis/Trans ratio of normalized matrix including the diagonal', cis_trans_N_D
    print 'Cis/Trans ratio of normalized matrix excluding the diagonal', cis_trans_N_d
    print 'Cis/Trans ratio of raw matrix including the diagonal', cis_trans_n_D
    print 'Cis/Trans ratio of raw matrix excluding the diagonal', cis_trans_n_d

    # Plot genomic distance vs interactions
    print 'Plot genomic distance vs interactions...'
    inter_vs_gcoord = path.join(
        opts.workdir, '04_normalization',
        'interactions_vs_genomic-coords.pdf_%s_%s.pdf' %
        (opts.reso, param_hash))
    # a2 is the decay slope over the 0.7-10 Mb range (second fitted segment)
    (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
        hic_data,
        max_diff=10000,
        resolution=opts.reso,
        normalized=True,
        savefig=inter_vs_gcoord)

    print 'Decay slope 0.7-10 Mb\t%s' % a2

    # write biases
    bias_file = path.join(opts.workdir, '04_normalization',
                          'bias_%s_%s.tsv' % (opts.reso, param_hash))
    out_bias = open(bias_file, 'w')
    out_bias.write(
        '\n'.join(['%d\t%f' % (i, hic_data.bias[i])
                   for i in hic_data.bias]) + '\n')
    out_bias.close()

    # to feed the save_to_db funciton
    intra_dir_nrm_fig = intra_dir_nrm_txt = None
    inter_dir_nrm_fig = inter_dir_nrm_txt = None
    genom_map_nrm_fig = genom_map_nrm_txt = None
    intra_dir_raw_fig = intra_dir_raw_txt = None
    inter_dir_raw_fig = inter_dir_raw_txt = None
    genom_map_raw_fig = genom_map_raw_txt = None

    if "intra" in opts.keep:
        print "  Saving intra chromosomal raw and normalized matrices..."
        if opts.only_txt:
            intra_dir_nrm_fig = None
            intra_dir_raw_fig = None
        else:
            intra_dir_nrm_fig = path.join(
                opts.workdir, '04_normalization',
                'intra_chromosome_nrm_images_%s_%s' % (opts.reso, param_hash))
            intra_dir_raw_fig = path.join(
                opts.workdir, '04_normalization',
                'intra_chromosome_raw_images_%s_%s' % (opts.reso, param_hash))
        intra_dir_nrm_txt = path.join(
            opts.workdir, '04_normalization',
            'intra_chromosome_nrm_matrices_%s_%s' % (opts.reso, param_hash))
        intra_dir_raw_txt = path.join(
            opts.workdir, '04_normalization',
            'intra_chromosome_raw_matrices_%s_%s' % (opts.reso, param_hash))
        hic_map(hic_data,
                normalized=True,
                by_chrom='intra',
                cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=intra_dir_nrm_fig,
                savedata=intra_dir_nrm_txt)
        hic_map(hic_data,
                normalized=False,
                by_chrom='intra',
                cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=intra_dir_raw_fig,
                savedata=intra_dir_raw_txt)

    if "inter" in opts.keep:
        print "  Saving inter chromosomal raw and normalized matrices..."
        if opts.only_txt:
            inter_dir_nrm_fig = None
            inter_dir_raw_fig = None
        else:
            inter_dir_nrm_fig = path.join(
                opts.workdir, '04_normalization',
                'inter_chromosome_nrm_images_%s_%s' % (opts.reso, param_hash))
            inter_dir_raw_fig = path.join(
                opts.workdir, '04_normalization',
                'inter_chromosome_raw_images_%s_%s' % (opts.reso, param_hash))
        inter_dir_nrm_txt = path.join(
            opts.workdir, '04_normalization',
            'inter_chromosome_nrm_matrices_%s_%s' % (opts.reso, param_hash))
        inter_dir_raw_txt = path.join(
            opts.workdir, '04_normalization',
            'inter_chromosome_raw_matrices_%s_%s' % (opts.reso, param_hash))
        hic_map(hic_data,
                normalized=True,
                by_chrom='inter',
                cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=inter_dir_nrm_fig,
                savedata=inter_dir_nrm_txt)
        hic_map(hic_data,
                normalized=False,
                by_chrom='inter',
                cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=inter_dir_raw_fig,
                savedata=inter_dir_raw_txt)

    if "genome" in opts.keep:
        print "  Saving normalized genomic matrix..."
        # NOTE(review): figures are produced when only_txt is TRUE and
        # suppressed otherwise -- this looks inverted relative to the
        # intra/inter branches above; confirm against the original.
        if opts.only_txt:
            genom_map_nrm_fig = path.join(
                opts.workdir, '04_normalization',
                'genomic_maps_nrm_%s_%s.pdf' % (opts.reso, param_hash))
            genom_map_raw_fig = path.join(
                opts.workdir, '04_normalization',
                'genomic_maps_raw_%s_%s.pdf' % (opts.reso, param_hash))
        else:
            genom_map_nrm_fig = None
            genom_map_raw_fig = None
        genom_map_nrm_txt = path.join(
            opts.workdir, '04_normalization',
            'genomic_nrm_%s_%s.tsv' % (opts.reso, param_hash))
        genom_map_raw_txt = path.join(
            opts.workdir, '04_normalization',
            'genomic_raw_%s_%s.tsv' % (opts.reso, param_hash))
        hic_map(hic_data,
                normalized=True,
                cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=genom_map_nrm_fig,
                savedata=genom_map_nrm_txt)
        hic_map(hic_data,
                normalized=False,
                cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=genom_map_raw_fig,
                savedata=genom_map_raw_txt)

    finish_time = time.localtime()

    save_to_db(opts, cis_trans_N_D, cis_trans_N_d, cis_trans_n_D,
               cis_trans_n_d, a2, bad_columns_file, bias_file, inter_vs_gcoord,
               mreads, intra_dir_nrm_fig, intra_dir_nrm_txt, inter_dir_nrm_fig,
               inter_dir_nrm_txt, genom_map_nrm_fig, genom_map_nrm_txt,
               intra_dir_raw_fig, intra_dir_raw_txt, inter_dir_raw_fig,
               inter_dir_raw_txt, genom_map_raw_fig, genom_map_raw_txt,
               launch_time, finish_time)
Exemplo n.º 6
0
def run(opts):
    """
    00_merge pipeline step (Python 2): load two filtered-read samples,
    optionally compare them (correlation between equidistant loci and
    between eigenvectors), merge the two 2D BED files into one, and
    record the result in the DB.

    Parameters
    ----------
    opts : Namespace
        Parsed command-line options (workdir, workdir1/2, bed1/2,
        reso, norm, skip_comparison, ...).
    """
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)

    reso1 = reso2 = None
    if opts.bed1:
        mreads1 = path.realpath(opts.bed1)
        bad_co1 = opts.bad_co1
        biases1 = opts.biases1
    else:
        bad_co1, biases1, mreads1, reso1 = load_parameters_fromdb(
            opts.workdir1, opts.jobid1, opts, opts.tmpdb1)
        mreads1 = path.join(opts.workdir1, mreads1)

    if opts.bed2:
        mreads2 = path.realpath(opts.bed2)
        bad_co2 = opts.bad_co2
        biases2 = opts.biases2
    else:
        bad_co2, biases2, mreads2, reso2 = load_parameters_fromdb(
            opts.workdir2, opts.jobid2, opts, opts.tmpdb2)
        mreads2 = path.join(opts.workdir2, mreads2)

    # Merging only makes sense at matching resolutions
    if reso1 != reso2:
        raise Exception('ERROR: differing resolutions between experiments to '
                        'be merged')

    mkdir(path.join(opts.workdir, '00_merge'))

    if not opts.skip_comparison:
        print 'Comparison'
        print ' - loading first sample', mreads1
        hic_data1 = load_hic_data_from_reads(mreads1, opts.reso)

        print ' - loading second sample', mreads2
        hic_data2 = load_hic_data_from_reads(mreads2, opts.reso)

        # Normalized comparison requires both biases and filtered columns
        if opts.norm and biases1:
            bad_co1 = path.join(opts.workdir1, bad_co1)
            print ' - loading bad columns from first sample', bad_co1
            hic_data1.bads = dict(
                (int(l.strip()), True) for l in open(bad_co1))
            biases1 = path.join(opts.workdir1, biases1)
            print ' - loading biases from first sample', biases1
            hic_data1.bias = dict((int(l.split()[0]), float(l.split()[1]))
                                  for l in open(biases1))
        elif opts.norm:
            raise Exception('ERROR: biases or filtered-columns not found')
        if opts.norm and biases2:
            bad_co2 = path.join(opts.workdir2, bad_co2)
            print ' - loading bad columns from second sample', bad_co2
            hic_data2.bads = dict(
                (int(l.strip()), True) for l in open(bad_co2))
            biases2 = path.join(opts.workdir2, biases2)
            print ' - loading biases from second sample', biases2
            hic_data2.bias = dict((int(l.split()[0]), float(l.split()[1]))
                                  for l in open(biases2))
        elif opts.norm:
            raise Exception('ERROR: biases or filtered-columns not found')
        decay_corr_dat = path.join(
            opts.workdir, '00_merge',
            'decay_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        decay_corr_fig = path.join(
            opts.workdir, '00_merge',
            'decay_corr_dat_%s_%s.png' % (opts.reso, param_hash))
        eigen_corr_dat = path.join(
            opts.workdir, '00_merge',
            'eigen_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        eigen_corr_fig = path.join(
            opts.workdir, '00_merge',
            'eigen_corr_dat_%s_%s.png' % (opts.reso, param_hash))
    else:
        # Placeholders so save_to_db receives consistent values
        hic_data1 = {}
        hic_data2 = {}
        decay_corr_dat = 'None'
        decay_corr_fig = 'None'
        eigen_corr_dat = 'None'
        eigen_corr_fig = 'None'

    # if opts.norm:
    # has bias file

    if not opts.skip_comparison:
        print '  => correlation between equidistant loci'
        corr, _, bads = correlate_matrices(hic_data1,
                                           hic_data2,
                                           normalized=opts.norm,
                                           remove_bad_columns=True,
                                           savefig=decay_corr_fig,
                                           savedata=decay_corr_dat,
                                           get_bads=True)
        print '  => correlation between eigenvectors'
        eig_corr = eig_correlate_matrices(hic_data1,
                                          hic_data2,
                                          normalized=opts.norm,
                                          remove_bad_columns=True,
                                          nvect=6,
                                          savefig=eigen_corr_fig,
                                          savedata=eigen_corr_dat)
    else:
        corr = eig_corr = 0
        bads = {}

    # merge inputs
    mkdir(path.join(opts.workdir, '03_filtered_reads'))
    outbed = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % (param_hash))

    print '\nMergeing...'
    nreads = merge_2d_beds(mreads1, mreads2, outbed)

    finish_time = time.localtime()
    save_to_db(opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig,
               len(bads.keys()), len(hic_data1), nreads, eigen_corr_dat,
               eigen_corr_fig, outbed, corr, eig_corr, biases1, bad_co1,
               biases2, bad_co2, launch_time, finish_time)
    print '\n\nDone.'
Exemplo n.º 7
0
def run(opts):
    """
    05_segmentation pipeline step (Python 2): load normalized Hi-C data
    and segment it into compartments (via find_compartments) and/or TADs
    (via tadbit), writing one TSV per chromosome and recording the
    results in the DB unless opts.nosql is set.

    Parameters
    ----------
    opts : Namespace
        Parsed command-line options (workdir, crms, only_tads,
        only_compartments, max_tad_size, cpus, nosql, ...).
    """
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)

    if not opts.nosql:
        (bad_co, bad_co_id, biases, biases_id,
         mreads, mreads_id, reso) = load_parameters_fromdb(opts)
        # store path ids to be saved in database
        inputs = bad_co_id, biases_id, mreads_id
    else:
        bad_co = opts.bad_co
        biases = opts.biases
        mreads = opts.mreads
        reso   = opts.reso

    mreads = path.join(opts.workdir, mreads)
    bad_co = path.join(opts.workdir, bad_co)
    biases = path.join(opts.workdir, biases)

    mkdir(path.join(opts.workdir, '05_segmentation'))

    print 'loading %s at resolution %s' % (mreads, nice(reso))
    hic_data = load_hic_data_from_reads(mreads, reso)
    # Attach previously computed filtered columns and ICE biases
    hic_data.bads = dict((int(l.strip()), True) for l in open(bad_co))
    hic_data.bias = dict((int(l.split()[0]), float(l.split()[1]))
                         for l in open(biases))

    # compartments
    cmp_result = {}
    if not opts.only_tads:
        print 'Searching compartments'
        hic_data.find_compartments(crms=opts.crms)

        cmprt_dir = path.join(opts.workdir, '05_segmentation',
                              'compartments_%s' % (nice(reso)))
        mkdir(cmprt_dir)
        for crm in opts.crms or hic_data.chromosomes:
            cmprt_file = path.join(cmprt_dir, '%s_%s.tsv' % (crm, param_hash))
            hic_data.write_compartments(cmprt_file,
                                        chroms=[crm])
            cmp_result[crm] = {'path': cmprt_file,
                               'num' : len(hic_data.compartments[crm])}

    # TADs
    tad_result = {}
    if not opts.only_compartments:
        print 'Searching TADs'
        tad_dir = path.join(opts.workdir, '05_segmentation',
                             'tads_%s' % (nice(reso)))
        mkdir(tad_dir)
        for crm in hic_data.chromosomes:
            if opts.crms and not crm in opts.crms:
                continue
            print '  - %s' % crm
            matrix = hic_data.get_matrix(focus=crm)
            beg, end = hic_data.section_pos[crm]
            size = len(matrix)
            if size < 10:
                print "     Chromosome too short (%d bins), skipping..." % size
                continue
            # transform bad column in chromosome referential
            to_rm = tuple([1 if i in hic_data.bads else 0 for i in xrange(beg, end)])
            # maximum size of a TAD
            max_tad_size = size if opts.max_tad_size is None else opts.max_tad_size
            result = tadbit([matrix], remove=to_rm,
                            n_cpus=opts.cpus, verbose=True,
                            max_tad_size=max_tad_size,
                            no_heuristic=True)
            tads = load_tad_height(result, size, beg, end, hic_data)
            # NOTE(review): the header format '%s\t%s\t%s\t%s%s' joins
            # 'score' and 'density' with no separator ("scoredensity");
            # data rows supply '\t%s' as the last field. Looks like a
            # missing tab in the header -- confirm before changing.
            table = ''
            table += '%s\t%s\t%s\t%s%s\n' % ('#', 'start', 'end', 'score', 'density')
            for tad in tads:
                table += '%s\t%s\t%s\t%s%s\n' % (
                    tad, int(tads[tad]['start'] + 1), int(tads[tad]['end'] + 1),
                    abs(tads[tad]['score']), '\t%s' % (round(
                        float(tads[tad]['height']), 3)))
            out_tad = path.join(tad_dir, '%s_%s.tsv' % (crm, param_hash))
            out = open(out_tad, 'w')
            out.write(table)
            out.close()
            tad_result[crm] = {'path' : out_tad,
                               'num': len(tads)}

    finish_time = time.localtime()

    if not opts.nosql:
        save_to_db(opts, cmp_result, tad_result, reso, inputs, 
                   launch_time, finish_time)
Exemplo n.º 8
0
def run(opts):
    """
    00_merge pipeline step, earlier variant (Python 2): always loads
    both samples (unlike the variant that defers loading until the
    comparison), optionally correlates them, then merges the two 2D BED
    files and records the result in the DB.

    Parameters
    ----------
    opts : Namespace
        Parsed command-line options (workdir, workdir1/2, bed1/2,
        reso, norm, skip_comparison, ...).
    """
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)

    reso1 = reso2 = None
    if opts.bed1:
        mreads1 = path.realpath(opts.bed1)
        bad_co1 = opts.bad_co1
        biases1 = opts.biases1
    else:
        bad_co1, biases1, mreads1, reso1 = load_parameters_fromdb(
                opts.workdir1, opts.jobid1, opts, opts.tmpdb1)
        mreads1 = path.join(opts.workdir1, mreads1)

    if opts.bed2:
        mreads2 = path.realpath(opts.bed2)
        bad_co2 = opts.bad_co2
        biases2 = opts.biases2
    else:
        bad_co2, biases2, mreads2, reso2 = load_parameters_fromdb(
                opts.workdir2, opts.jobid2, opts, opts.tmpdb2)
        mreads2 = path.join(opts.workdir2, mreads2)

    # Merging only makes sense at matching resolutions
    if reso1 != reso2:
        raise Exception('ERROR: differing resolutions between experiments to '
                        'be merged')

    print 'loading first sample', mreads1
    hic_data1 = load_hic_data_from_reads(mreads1, opts.reso)

    print 'loading second sample', mreads2
    hic_data2 = load_hic_data_from_reads(mreads2, opts.reso)

    # Normalized comparison requires both biases and filtered columns
    if opts.norm and biases1:
        bad_co1 = path.join(opts.workdir1, bad_co1)
        print 'loading bad columns from first sample', bad_co1
        hic_data1.bads = dict((int(l.strip()), True) for l in open(bad_co1))
        biases1 = path.join(opts.workdir1, biases1)
        print 'loading biases from first sample', biases1
        hic_data1.bias = dict((int(l.split()[0]), float(l.split()[1]))
                              for l in open(biases1))
    elif opts.norm:
        raise Exception('ERROR: biases or filtered-columns not found')
    if opts.norm and biases2:
        bad_co2 = path.join(opts.workdir2, bad_co2)
        print 'loading bad columns from second sample', bad_co2
        hic_data2.bads = dict((int(l.strip()), True) for l in open(bad_co2))
        biases2 = path.join(opts.workdir2, biases2)
        print 'loading biases from second sample', biases2
        hic_data2.bias = dict((int(l.split()[0]), float(l.split()[1]))
                              for l in open(biases2))
    elif opts.norm:
        raise Exception('ERROR: biases or filtered-columns not found')

    mkdir(path.join(opts.workdir, '00_merge'))

    if not opts.skip_comparison:
        decay_corr_dat = path.join(opts.workdir, '00_merge', 'decay_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        decay_corr_fig = path.join(opts.workdir, '00_merge', 'decay_corr_dat_%s_%s.png' % (opts.reso, param_hash))
        eigen_corr_dat = path.join(opts.workdir, '00_merge', 'eigen_corr_dat_%s_%s.txt' % (opts.reso, param_hash))
        eigen_corr_fig = path.join(opts.workdir, '00_merge', 'eigen_corr_dat_%s_%s.png' % (opts.reso, param_hash))
    else:
        decay_corr_dat = 'None'
        decay_corr_fig = 'None'
        eigen_corr_dat = 'None'
        eigen_corr_fig = 'None'

    # if opts.norm:
        # has bias file

    if not opts.skip_comparison:
        print 'correlation between equidistant loci'
        corr, _, bads = correlate_matrices(hic_data1, hic_data2, normalized=opts.norm,
                                           remove_bad_columns=True,
                                           savefig=decay_corr_fig,
                                           savedata=decay_corr_dat, get_bads=True)
        print 'correlation between eigenvectors'
        eig_corr = eig_correlate_matrices(hic_data1, hic_data2, normalized=opts.norm,
                                          remove_bad_columns=True, nvect=6,
                                          savefig=eigen_corr_fig,
                                          savedata=eigen_corr_dat)
    else:
        corr = eig_corr = None
        bads = {}

    # merge inputs
    mkdir(path.join(opts.workdir, '03_filtered_reads'))
    outbed = path.join(opts.workdir, '03_filtered_reads', 'valid_r1-r2_intersection_%s.tsv' % (
        param_hash))

    nreads = merge_2d_beds(mreads1, mreads2, outbed)

    finish_time = time.localtime()
    save_to_db (opts, mreads1, mreads2, decay_corr_dat, decay_corr_fig,
                len(bads.keys()), len(hic_data1), nreads,
                eigen_corr_dat, eigen_corr_fig, outbed, corr, eig_corr,
                biases1, bad_co1, biases2, bad_co2, launch_time, finish_time)
Exemplo n.º 9
0
def run(opts):
    """
    06_model pipeline step, batch-hash variant: announce the run, load
    the Hi-C data, create a parameter-hashed output folder, then either
    prepare a job list, optimize modelling parameters, or correlate
    prior optimizations and launch the final model run; results are
    saved to the DB.

    Parameters
    ----------
    opts : Namespace
        Parsed command-line options (workdir, crm, beg, end, reso,
        matrix, optimize, job_list, ...).
    """
    check_options(opts)

    launch_time = time.localtime()

    print(
        '''
%s%s

  - Region: Chromosome %s from %d to %d at resolution %s (%d particles)
    ''' % ('Preparing ' if opts.job_list else '',
           ('Optimization\n' + '*' *
            (21 if opts.job_list else 11)) if opts.optimize else
           ('Modeling\n' + '*' * (18 if opts.job_list else 8)), opts.crm,
           opts.ori_beg, opts.ori_end, nicer(opts.reso), opts.end - opts.beg))

    # load data
    if opts.matrix:
        crm = load_hic_data(opts)
    else:
        # FIXME: copied from somewhere else
        # NOTE(review): as the FIXME suggests, 'crm' is only set in the
        # opts.matrix branch; 'exp = crm.experiments[0]' below raises
        # NameError when no matrix is supplied, and 'hic_data' is unused.
        (bad_co, bad_co_id, biases, biases_id, mreads, mreads_id,
         reso) = load_parameters_fromdb(opts)
        hic_data = load_hic_data_from_reads(mreads, reso)
        hic_data.bads = dict((int(l.strip()), True) for l in open(bad_co))
        hic_data.bias = dict(
            (int(l.split()[0]), float(l.split()[1])) for l in open(biases))

    exp = crm.experiments[0]
    opts.beg, opts.end = opts.beg or 1, opts.end or exp.size

    # prepare output folders
    # hash over all parameters that define this batch, so re-runs with
    # identical settings land in the same directory
    batch_job_hash = digest_parameters(
        opts,
        get_md5=True,
        extra=[
            'maxdist', 'upfreq', 'lowfreq', 'scale', 'dcutoff', 'nmodels_run',
            'job_list', 'rand', 'nmodels', 'nkeep', 'optimize',
            'optimization_id', 'cpus', 'workdir', 'matrix', 'ori_beg',
            'ori_end'
        ])

    mkdir(path.join(opts.workdir, '06_model'))
    outdir = path.join(
        opts.workdir, '06_model',
        '%s_chr%s_%s-%s' % (batch_job_hash, opts.crm, opts.beg, opts.end))
    mkdir(outdir)

    # in case we are not going to run
    if opts.job_list:
        job_file_handler = open(
            path.join(
                outdir, 'job_list_%s.q' %
                ('optimization' if opts.optimize else 'modeling')), 'w')
    else:
        job_file_handler = None

    ###############
    # Optimization
    # NOTE(review): this message prints even when opts.optimize is False;
    # it probably belongs inside the 'if' below.
    print '     o Optimizing parameters'
    if opts.optimize:
        optimization(exp, opts, job_file_handler, outdir)
        finish_time = time.localtime()
        print('\n optimization done')
        # correlate all optimization and get best set of parameters

    if not (opts.optimize and opts.job_list):
        optpar, results = correlate_models(opts, outdir, exp)
    else:
        results = []

    ###########
    # Modeling
    if not opts.optimize:
        big_run(exp, opts, job_file_handler, outdir, optpar)

    finish_time = time.localtime()

    # save all job information to sqlite DB
    save_to_db(opts, outdir, results, batch_job_hash, launch_time, finish_time)
Exemplo n.º 10
0
def run(opts):
    """Normalization step (``04_normalization``) of a TADbit pipeline run.

    Loads read-level Hi-C interactions into a HiCData object, filters out
    poor (zero-rich / low-count) columns, optionally computes ICE biases,
    reports cis/trans ratios and the distance-decay slope, writes raw and
    (if computed) normalized matrices and figures according to
    ``opts.keep``, pickles the HiCData object, and records everything in
    the job database.

    Parameters
    ----------
    opts : Namespace
        Parsed command-line options.  The code below reads at least:
        workdir, reso, bed, perc_zeros, min_count, fast_filter,
        filter_only, factor, keep, only_txt.

    Side effects
    ------------
    Creates files under ``<workdir>/04_normalization`` and writes a row to
    the job database via ``save_to_db``.
    """
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    # Input reads: either an explicit BED-like file (--bed) or the path
    # previously stored in the job database.
    if opts.bed:
        mreads = path.realpath(opts.bed)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    print 'loading', mreads
    hic_data = load_hic_data_from_reads(mreads, opts.reso)

    mkdir(path.join(opts.workdir, '04_normalization'))

    # Flag bad (poor) columns; the histogram figure is only produced with
    # the slower by-mean filter (skipped under --fast_filter).
    print 'Get poor bins...'
    try:
        hic_data.filter_columns(perc_zero=opts.perc_zeros, min_count=opts.min_count,
                                draw_hist=True,
                                by_mean=not opts.fast_filter, savefig=path.join(
                                    opts.workdir, '04_normalization',
                                    'bad_columns_%s_%d_%d_%s.pdf' % (
                                        opts.reso, opts.perc_zeros, opts.min_count,
                                        param_hash)) if
                                not opts.fast_filter else None)
    except ValueError:
        raise ValueError('ERROR: probably all columns filtered out...')
    # bad columns
    # Persist the list of filtered-out column indices (one per line).
    bad_columns_file = path.join(opts.workdir, '04_normalization',
                                 'bad_columns_%s_%d_%d_%s.tsv' % (
                                     opts.reso, opts.perc_zeros, opts.min_count, param_hash))
    out_bad = open(bad_columns_file, 'w')
    out_bad.write('\n'.join([str(i) for i in hic_data.bads.keys()]))
    out_bad.close()

    # Identify biases
    if not opts.filter_only:
        print 'Get biases using ICE...'
        # NOTE(review): iterations=0 presumably lets normalize_hic iterate
        # until max_dev is reached -- confirm against TADbit's API.
        hic_data.normalize_hic(silent=False, max_dev=0.1, iterations=0,
                               factor=opts.factor)

    print 'Getting cis/trans...'
    # Normalized ratios default to NaN when normalization is skipped.
    cis_trans_N_D = cis_trans_N_d = float('nan')
    if not opts.filter_only:
        cis_trans_N_D = hic_data.cis_trans_ratio(normalized=True , diagonal=True )
        cis_trans_N_d = hic_data.cis_trans_ratio(normalized=True , diagonal=False)
    cis_trans_n_D = hic_data.cis_trans_ratio(normalized=False, diagonal=True )
    cis_trans_n_d = hic_data.cis_trans_ratio(normalized=False, diagonal=False)

    if not opts.filter_only:
        print 'Cis/Trans ratio of normalized matrix including the diagonal', cis_trans_N_D
        print 'Cis/Trans ratio of normalized matrix excluding the diagonal', cis_trans_N_d
    print 'Cis/Trans ratio of raw matrix including the diagonal', cis_trans_n_D
    print 'Cis/Trans ratio of raw matrix excluding the diagonal', cis_trans_n_d

    # Plot genomic distance vs interactions
    # NOTE(review): the filename contains a doubled '.pdf' extension
    # ('...pdf_%s_%s.pdf'); left as-is since downstream DB lookups may
    # depend on the exact name.
    print 'Plot genomic distance vs interactions...'
    inter_vs_gcoord = path.join(opts.workdir, '04_normalization',
                                'interactions_vs_genomic-coords.pdf_%s_%s.pdf' % (
                                    opts.reso, param_hash))
    # Only the slope of the middle distance range (a2) is kept.
    (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
        hic_data, max_diff=10000, resolution=opts.reso, normalized=not opts.filter_only,
        savefig=inter_vs_gcoord)

    print 'Decay slope 0.7-10 Mb\t%s' % a2

    # write biases
    # 'NA' is a placeholder; it is replaced by a real file handle only
    # when biases were actually computed.
    bias_file = path.join(opts.workdir, '04_normalization',
                          'bias_%s_%s.tsv' % (opts.reso, param_hash))
    out_bias = 'NA'
    if not opts.filter_only:
        out_bias = open(bias_file, 'w')
        out_bias.write('\n'.join(['%d\t%f' % (i, hic_data.bias[i])
                                  for i in hic_data.bias])
                       + '\n')
        out_bias.close()


    # pickle the HiC-data object
    # NOTE(review): file opened in text mode ('w'); fine on Unix/Python 2
    # but would corrupt the pickle on Windows -- 'wb' would be safer.
    print 'Saving genomic matrix'
    pickle_path = path.join(opts.workdir, '04_normalization',
                            'hic-data_%s_%s.pickle' % (nice(opts.reso), param_hash))
    out = open(pickle_path, 'w')
    dump(hic_data, out)
    out.close()

    # to feed the save_to_db funciton
    intra_dir_nrm_fig = intra_dir_nrm_txt = None
    inter_dir_nrm_fig = inter_dir_nrm_txt = None
    genom_map_nrm_fig = genom_map_nrm_txt = None
    intra_dir_raw_fig = intra_dir_raw_txt = None
    inter_dir_raw_fig = inter_dir_raw_txt = None
    genom_map_raw_fig = genom_map_raw_txt = None

    # Per-chromosome (intra) matrices; figures skipped with --only_txt,
    # normalized outputs skipped with --filter_only.
    if "intra" in opts.keep:
        print "  Saving intra chromosomal raw and normalized matrices..."
        if opts.only_txt:
            intra_dir_nrm_fig = None
            intra_dir_raw_fig = None
        else:
            intra_dir_nrm_fig = path.join(opts.workdir, '04_normalization',
                                          'intra_chromosome_nrm_images_%s_%s' % (opts.reso, param_hash))
            intra_dir_raw_fig = path.join(opts.workdir, '04_normalization',
                                          'intra_chromosome_raw_images_%s_%s' % (opts.reso, param_hash))
        intra_dir_nrm_txt = path.join(opts.workdir, '04_normalization',
                                      'intra_chromosome_nrm_matrices_%s_%s' % (opts.reso, param_hash))
        intra_dir_raw_txt = path.join(opts.workdir, '04_normalization',
                                      'intra_chromosome_raw_matrices_%s_%s' % (opts.reso, param_hash))
        if not opts.filter_only:
            hic_map(hic_data, normalized=True, by_chrom='intra', cmap='jet',
                    name=path.split(opts.workdir)[-1],
                    savefig=intra_dir_nrm_fig, savedata=intra_dir_nrm_txt)
        hic_map(hic_data, normalized=False, by_chrom='intra', cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=intra_dir_raw_fig, savedata=intra_dir_raw_txt)

    # Chromosome-pair (inter) matrices, same flag logic as above.
    if "inter" in opts.keep:
        print "  Saving inter chromosomal raw and normalized matrices..."
        if opts.only_txt:
            inter_dir_nrm_fig = None
            inter_dir_raw_fig = None
        else:
            if not opts.filter_only:
                inter_dir_nrm_fig = path.join(opts.workdir, '04_normalization',
                                              'inter_chromosome_nrm_images_%s_%s' % (opts.reso, param_hash))
            inter_dir_raw_fig = path.join(opts.workdir, '04_normalization',
                                      'inter_chromosome_raw_images_%s_%s' % (opts.reso, param_hash))
        if not opts.filter_only:
            inter_dir_nrm_txt = path.join(opts.workdir, '04_normalization',
                                          'inter_chromosome_nrm_matrices_%s_%s' % (opts.reso, param_hash))
        inter_dir_raw_txt = path.join(opts.workdir, '04_normalization',
                                  'inter_chromosome_raw_matrices_%s_%s' % (opts.reso, param_hash))
        if not opts.filter_only:
            hic_map(hic_data, normalized=True, by_chrom='inter', cmap='jet',
                    name=path.split(opts.workdir)[-1],
                    savefig=inter_dir_nrm_fig, savedata=inter_dir_nrm_txt)
        hic_map(hic_data, normalized=False, by_chrom='inter', cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=inter_dir_raw_fig, savedata=inter_dir_raw_txt)

    # Whole-genome matrices, same flag logic as above.
    if "genome" in opts.keep:
        print "  Saving normalized genomic matrix..."
        if opts.only_txt:
            genom_map_nrm_fig = None
            genom_map_raw_fig = None
        else:
            if not opts.filter_only:
                genom_map_nrm_fig = path.join(opts.workdir, '04_normalization',
                                              'genomic_maps_nrm_%s_%s.pdf' % (opts.reso, param_hash))
            genom_map_raw_fig = path.join(opts.workdir, '04_normalization',
                                          'genomic_maps_raw_%s_%s.pdf' % (opts.reso, param_hash))
        if not opts.filter_only:
            genom_map_nrm_txt = path.join(opts.workdir, '04_normalization',
                                          'genomic_nrm_%s_%s.tsv' % (opts.reso, param_hash))
        genom_map_raw_txt = path.join(opts.workdir, '04_normalization',
                                      'genomic_raw_%s_%s.tsv' % (opts.reso, param_hash))
        if not opts.filter_only:
            hic_map(hic_data, normalized=True, cmap='jet',
                    name=path.split(opts.workdir)[-1],
                savefig=genom_map_nrm_fig, savedata=genom_map_nrm_txt)
        hic_map(hic_data, normalized=False, cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=genom_map_raw_fig, savedata=genom_map_raw_txt)

    finish_time = time.localtime()

    # Record all products and statistics of this job in the sqlite DB.
    save_to_db (opts, cis_trans_N_D, cis_trans_N_d, cis_trans_n_D, cis_trans_n_d,
                a2, bad_columns_file, bias_file, inter_vs_gcoord, mreads,
                len(hic_data.bads.keys()), len(hic_data),
                intra_dir_nrm_fig, intra_dir_nrm_txt,
                inter_dir_nrm_fig, inter_dir_nrm_txt,
                genom_map_nrm_fig, genom_map_nrm_txt,
                intra_dir_raw_fig, intra_dir_raw_txt,
                inter_dir_raw_fig, inter_dir_raw_txt,
                genom_map_raw_fig, genom_map_raw_txt,
                pickle_path, launch_time, finish_time)
Exemplo n.º 11
0
def run(opts):
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts)

    if opts.nosql:
        bad_co = opts.bad_co
        biases = opts.biases
        mreads = opts.mreads
        reso   = opts.reso
        inputs = []
    else:
        (bad_co, bad_co_id, biases, biases_id,
         mreads, mreads_id, reso) = load_parameters_fromdb(opts)
        # store path ids to be saved in database
        inputs = bad_co_id, biases_id, mreads_id

    mreads = path.join(opts.workdir, mreads)
    bad_co = path.join(opts.workdir, bad_co)
    biases = path.join(opts.workdir, biases)

    mkdir(path.join(opts.workdir, '05_segmentation'))

    print 'loading %s \n    at resolution %s' % (mreads, nice(reso))
    hic_data = load_hic_data_from_reads(mreads, reso)
    hic_data.bads = dict((int(l.strip()), True) for l in open(bad_co))
    print 'loading filtered columns %s' % (bad_co)
    print '    with %d of %d filtered out columns' % (len(hic_data.bads),
                                                      len(hic_data))
    try:
        hic_data.bias = dict((int(l.split()[0]), float(l.split()[1]))
                             for l in open(biases))
    except IOError:
        if not opts.only_tads:
            raise Exception('ERROR: data should be normalized to get compartments')

    # compartments
    cmp_result = {}
    if not opts.only_tads:
        print 'Searching compartments'
        cmprt_dir = path.join(opts.workdir, '05_segmentation',
                              'compartments_%s' % (nice(reso)))
        mkdir(cmprt_dir)
        firsts = hic_data.find_compartments(crms=opts.crms,
                                            label_compartments='cluster',
                                            savefig=cmprt_dir,
                                            suffix=param_hash, log=cmprt_dir,
                                            rich_in_A=opts.rich_in_A)

        for crm in opts.crms or hic_data.chromosomes:
            if not crm in firsts:
                continue
            ev_file = open(path.join(cmprt_dir,
                                     '%s_EigVect_%s.tsv' % (crm, param_hash)), 'w')
            ev_file.write('# first EV\tsecond EV\n')
            ev_file.write('\n'.join(['\t'.join([str(v) for v in vs])
                                     for vs in zip(*firsts[crm])]))
            ev_file.close()

        for crm in opts.crms or hic_data.chromosomes:
            cmprt_file = path.join(cmprt_dir, '%s_%s.tsv' % (crm, param_hash))
            hic_data.write_compartments(cmprt_file,
                                        chroms=[crm])
            cmp_result[crm] = {'path': cmprt_file,
                               'num' : len(hic_data.compartments[crm])}

    # TADs
    tad_result = {}
    if not opts.only_compartments:
        print 'Searching TADs'
        tad_dir = path.join(opts.workdir, '05_segmentation',
                             'tads_%s' % (nice(reso)))
        mkdir(tad_dir)
        for crm in hic_data.chromosomes:
            if opts.crms and not crm in opts.crms:
                continue
            print '  - %s' % crm
            matrix = hic_data.get_matrix(focus=crm)
            beg, end = hic_data.section_pos[crm]
            size = len(matrix)
            if size < 10:
                print "     Chromosome too short (%d bins), skipping..." % size
                continue
            # transform bad column in chromosome referential
            to_rm = tuple([1 if i in hic_data.bads else 0 for i in xrange(beg, end)])
            # maximum size of a TAD
            max_tad_size = size if opts.max_tad_size is None else opts.max_tad_size
            result = tadbit([matrix], remove=to_rm,
                            n_cpus=opts.cpus, verbose=False,
                            max_tad_size=max_tad_size,
                            no_heuristic=False)
            tads = load_tad_height(result, size, beg, end, hic_data)
            table = ''
            table += '%s\t%s\t%s\t%s%s\n' % ('#', 'start', 'end', 'score', 'density')
            for tad in tads:
                table += '%s\t%s\t%s\t%s%s\n' % (
                    tad, int(tads[tad]['start'] + 1), int(tads[tad]['end'] + 1),
                    abs(tads[tad]['score']), '\t%s' % (round(
                        float(tads[tad]['height']), 3)))
            out_tad = path.join(tad_dir, '%s_%s.tsv' % (crm, param_hash))
            out = open(out_tad, 'w')
            out.write(table)
            out.close()
            tad_result[crm] = {'path' : out_tad,
                               'num': len(tads)}

    finish_time = time.localtime()

    if not opts.nosql:
        save_to_db(opts, cmp_result, tad_result, reso, inputs, 
                   launch_time, finish_time)