示例#1
0
    def test_19_matrix_manip(self):
        if ONLY and not "19" in ONLY:
            return
        if CHKTIME:
            t0 = time()
        hic_data1 = load_hic_data_from_reads("lala-map~", resolution=10000)
        hic_map(hic_data1, savedata="lala-map.tsv~", savefig="lala.pdf")
        hic_map(hic_data1,
                by_chrom="intra",
                savedata="lala-maps~",
                savefig="lalalo~")
        hic_map(hic_data1,
                by_chrom="inter",
                savedata="lala-maps~",
                savefig="lalala~")
        # slowest part of the all test:
        hic_data2 = read_matrix("lala-map.tsv~", resolution=10000)
        self.assertEqual(hic_data1, hic_data2)
        # vals = plot_distance_vs_interactions(hic_data1)

        # self.assertEqual([round(i, 2) if str(i)!="nan" else 0.0 for i in
        #                   reduce(lambda x, y: x + y, vals)],
        #                  [-1.68, -2.08, 0.02, 2.76, -8.99, 0.0, 0.82, -6.8, 0.0])

        a, b = insert_sizes("lala-map~")
        self.assertEqual([int(a), int(b)], [43, 1033])

        hic_data1 = read_matrix(PATH + "/20Kb/chrT/chrT_A.tsv",
                                resolution=20000)
        hic_data2 = read_matrix(PATH + "/20Kb/chrT/chrT_B.tsv",
                                resolution=20000)

        corr = correlate_matrices(hic_data1, hic_data2)
        corr = [round(i, 3) for i in corr[0]]
        self.assertEqual(corr, [
            0.755, 0.729, 0.804, 0.761, 0.789, 0.776, 0.828, 0.757, 0.797,
            0.832
        ])

        ecorr = eig_correlate_matrices(hic_data1,
                                       hic_data2,
                                       savefig='lala3.pdf')

        ecorr = [round(i, 3) for i in reduce(lambda x, y: x + y, ecorr)]
        self.assertEqual(ecorr, [
            0.997, 0.322, 0.442, 0.017, 0.243, 0.014, 0.321, 0.999, 0.01,
            0.006, 0.0, 0.007, 0.451, 0.012, 0.996, 0.031, 0.013, 0.004, 0.002,
            0.006, 0.029, 0.974, 0.076, 0.03, 0.219, 0.013, 0.031, 0.08, 0.974,
            0.018, 0.028, 0.004, 0.0, 0.028, 0.034, 0.89
        ])
        system("rm -rf lala*")
        if CHKTIME:
            self.assertEqual(True, True)
            print "19", time() - t0
示例#2
0
    def test_19_matrix_manip(self):
        if ONLY and ONLY != '19':
            return
        if CHKTIME:
            t0 = time()
        hic_data1 = load_hic_data_from_reads('lala-map~', resolution=10000)
        hic_map(hic_data1, savedata='lala-map.tsv~', savefig='lala.pdf~')
        hic_map(hic_data1,
                by_chrom='intra',
                savedata='lala-maps~',
                savefig='lalalo~')
        hic_map(hic_data1,
                by_chrom='inter',
                savedata='lala-maps~',
                savefig='lalala~')
        # slowest part of the all test:
        hic_data2 = read_matrix('lala-map.tsv~', resolution=10000)
        self.assertEqual(hic_data1, hic_data2)
        vals = plot_distance_vs_interactions(hic_data1)

        self.assertEqual([
            round(i, 2) if str(i) != 'nan' else 0.0
            for i in reduce(lambda x, y: x + y, vals)
        ], [-1.68, -2.08, 0.02, 2.76, -8.99, 0.0, 0.82, -6.8, 0.0])

        a, b = insert_sizes('lala-map~')
        self.assertEqual([int(a), int(b)], [43, 1033])

        hic_data1 = read_matrix('20Kb/chrT/chrT_A.tsv', resolution=20000)
        hic_data2 = read_matrix('20Kb/chrT/chrT_B.tsv', resolution=20000)

        corr = correlate_matrices(hic_data1, hic_data2)
        corr = [round(i, 3) for i in corr[0]]
        self.assertEqual(corr, [
            0.755, 0.729, 0.804, 0.761, 0.789, 0.776, 0.828, 0.757, 0.797,
            0.832
        ])

        ecorr = eig_correlate_matrices(hic_data1, hic_data2)
        ecorr = [round(i, 3) for i in reduce(lambda x, y: x + y, ecorr)]
        self.assertEqual(ecorr, [
            0.997, 0.322, 0.442, 0.017, 0.243, 0.014, 0.321, 0.999, 0.01,
            0.006, 0.0, 0.007, 0.451, 0.012, 0.996, 0.031, 0.013, 0.004, 0.002,
            0.006, 0.029, 0.974, 0.076, 0.03, 0.219, 0.013, 0.031, 0.08, 0.974,
            0.018, 0.028, 0.004, 0.0, 0.028, 0.034, 0.89
        ])
        system('rm -rf lala*')
        if CHKTIME:
            self.assertEqual(True, True)
            print '19', time() - t0
示例#3
0
    def test_19_matrix_manip(self):
        if ONLY and ONLY != '19':
            return
        if CHKTIME:
            t0 = time()
        hic_data1 = load_hic_data_from_reads('lala-map~', resolution=10000)
        hic_map(hic_data1, savedata='lala-map.tsv~', savefig='lala.pdf~')
        hic_map(hic_data1, by_chrom='intra', savedata='lala-maps~', savefig='lalalo~')
        hic_map(hic_data1, by_chrom='inter', savedata='lala-maps~', savefig='lalala~')
        # slowest part of the all test:
        hic_data2 = read_matrix('lala-map.tsv~', resolution=10000)
        self.assertEqual(hic_data1, hic_data2)
        vals = plot_distance_vs_interactions(hic_data1)
        
        self.assertEqual([round(i, 2) if str(i)!='nan' else 0.0 for i in
                          reduce(lambda x, y: x + y, vals)],
                         [-1.74, 4.2, 0.52, 1.82, -0.44, 0.0, -0.5, 2.95, 0.0])
        
        a, b = insert_sizes('lala-map~')
        self.assertEqual([int(a),int(b)], [43, 1033])

        hic_data1 = read_matrix('20Kb/chrT/chrT_A.tsv', resolution=20000)
        hic_data2 = read_matrix('20Kb/chrT/chrT_B.tsv', resolution=20000)
        
        corr = correlate_matrices(hic_data1, hic_data2)
        corr =  [round(i,3) for i in corr[0]]
        self.assertEqual(corr, [0.755, 0.729, 0.804, 0.761, 0.789, 0.776, 0.828,
                                0.757, 0.797, 0.832])
        
        ecorr = eig_correlate_matrices(hic_data1, hic_data2)
        ecorr = [round(i,3) for i in reduce(lambda x, y:x+y, ecorr)]
        self.assertEqual(ecorr, [0.997, 0.322, 0.442, 0.017, 0.243, 0.014,
                                 0.321, 0.999, 0.01, 0.006, 0.0, 0.007, 0.451,
                                 0.012, 0.996, 0.031, 0.013, 0.004, 0.002,
                                 0.006, 0.029, 0.974, 0.076, 0.03, 0.219, 0.013,
                                 0.031, 0.08, 0.974, 0.018, 0.028, 0.004, 0.0,
                                 0.028, 0.034, 0.89])
        system('rm -rf lala*')
        if CHKTIME:
            self.assertEqual(True, True)
            print '19', time() - t0
示例#4
0
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bed:
        mreads = path.realpath(opts.bed)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    print 'loading', mreads
    hic_data = load_hic_data_from_reads(mreads, opts.reso)

    mkdir(path.join(opts.workdir, '04_normalization'))

    print 'Get poor bins...'
    try:
        hic_data.filter_columns(
            perc_zero=opts.perc_zeros,
            draw_hist=True,
            by_mean=not opts.fast_filter,
            savefig=path.join(
                opts.workdir, '04_normalization', 'bad_columns_%s_%d_%s.pdf' %
                (opts.reso, opts.perc_zeros, param_hash))
            if not opts.fast_filter else None)
    except ValueError:
        hic_data.filter_columns(
            perc_zero=100,
            draw_hist=True,
            by_mean=not opts.fast_filter,
            savefig=path.join(
                opts.workdir, '04_normalization', 'bad_columns_%s_%d_%s.pdf' %
                (opts.reso, opts.perc_zeros, param_hash))
            if not opts.fast_filter else None)

    # bad columns
    bad_columns_file = path.join(
        opts.workdir, '04_normalization',
        'bad_columns_%s_%d_%s.tsv' % (opts.reso, opts.perc_zeros, param_hash))
    out_bad = open(bad_columns_file, 'w')
    out_bad.write('\n'.join([str(i) for i in hic_data.bads.keys()]))
    out_bad.close()

    # Identify biases
    print 'Get biases using ICE...'
    hic_data.normalize_hic(silent=False,
                           max_dev=0.1,
                           iterations=0,
                           factor=opts.factor)

    print 'Getting cis/trans...'
    cis_trans_N_D = hic_data.cis_trans_ratio(normalized=True, diagonal=True)
    cis_trans_n_D = hic_data.cis_trans_ratio(normalized=False, diagonal=True)
    cis_trans_N_d = hic_data.cis_trans_ratio(normalized=True, diagonal=False)
    cis_trans_n_d = hic_data.cis_trans_ratio(normalized=False, diagonal=False)

    print 'Cis/Trans ratio of normalized matrix including the diagonal', cis_trans_N_D
    print 'Cis/Trans ratio of normalized matrix excluding the diagonal', cis_trans_N_d
    print 'Cis/Trans ratio of raw matrix including the diagonal', cis_trans_n_D
    print 'Cis/Trans ratio of raw matrix excluding the diagonal', cis_trans_n_d

    # Plot genomic distance vs interactions
    print 'Plot genomic distance vs interactions...'
    inter_vs_gcoord = path.join(
        opts.workdir, '04_normalization',
        'interactions_vs_genomic-coords.pdf_%s_%s.pdf' %
        (opts.reso, param_hash))
    (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
        hic_data,
        max_diff=10000,
        resolution=opts.reso,
        normalized=True,
        savefig=inter_vs_gcoord)

    print 'Decay slope 0.7-10 Mb\t%s' % a2

    # write biases
    bias_file = path.join(opts.workdir, '04_normalization',
                          'bias_%s_%s.tsv' % (opts.reso, param_hash))
    out_bias = open(bias_file, 'w')
    out_bias.write(
        '\n'.join(['%d\t%f' % (i, hic_data.bias[i])
                   for i in hic_data.bias]) + '\n')
    out_bias.close()

    # to feed the save_to_db funciton
    intra_dir_nrm_fig = intra_dir_nrm_txt = None
    inter_dir_nrm_fig = inter_dir_nrm_txt = None
    genom_map_nrm_fig = genom_map_nrm_txt = None
    intra_dir_raw_fig = intra_dir_raw_txt = None
    inter_dir_raw_fig = inter_dir_raw_txt = None
    genom_map_raw_fig = genom_map_raw_txt = None

    if "intra" in opts.keep:
        print "  Saving intra chromosomal raw and normalized matrices..."
        if opts.only_txt:
            intra_dir_nrm_fig = None
            intra_dir_raw_fig = None
        else:
            intra_dir_nrm_fig = path.join(
                opts.workdir, '04_normalization',
                'intra_chromosome_nrm_images_%s_%s' % (opts.reso, param_hash))
            intra_dir_raw_fig = path.join(
                opts.workdir, '04_normalization',
                'intra_chromosome_raw_images_%s_%s' % (opts.reso, param_hash))
        intra_dir_nrm_txt = path.join(
            opts.workdir, '04_normalization',
            'intra_chromosome_nrm_matrices_%s_%s' % (opts.reso, param_hash))
        intra_dir_raw_txt = path.join(
            opts.workdir, '04_normalization',
            'intra_chromosome_raw_matrices_%s_%s' % (opts.reso, param_hash))
        hic_map(hic_data,
                normalized=True,
                by_chrom='intra',
                cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=intra_dir_nrm_fig,
                savedata=intra_dir_nrm_txt)
        hic_map(hic_data,
                normalized=False,
                by_chrom='intra',
                cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=intra_dir_raw_fig,
                savedata=intra_dir_raw_txt)

    if "inter" in opts.keep:
        print "  Saving inter chromosomal raw and normalized matrices..."
        if opts.only_txt:
            inter_dir_nrm_fig = None
            inter_dir_raw_fig = None
        else:
            inter_dir_nrm_fig = path.join(
                opts.workdir, '04_normalization',
                'inter_chromosome_nrm_images_%s_%s' % (opts.reso, param_hash))
            inter_dir_raw_fig = path.join(
                opts.workdir, '04_normalization',
                'inter_chromosome_raw_images_%s_%s' % (opts.reso, param_hash))
        inter_dir_nrm_txt = path.join(
            opts.workdir, '04_normalization',
            'inter_chromosome_nrm_matrices_%s_%s' % (opts.reso, param_hash))
        inter_dir_raw_txt = path.join(
            opts.workdir, '04_normalization',
            'inter_chromosome_raw_matrices_%s_%s' % (opts.reso, param_hash))
        hic_map(hic_data,
                normalized=True,
                by_chrom='inter',
                cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=inter_dir_nrm_fig,
                savedata=inter_dir_nrm_txt)
        hic_map(hic_data,
                normalized=False,
                by_chrom='inter',
                cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=inter_dir_raw_fig,
                savedata=inter_dir_raw_txt)

    if "genome" in opts.keep:
        print "  Saving normalized genomic matrix..."
        if opts.only_txt:
            genom_map_nrm_fig = path.join(
                opts.workdir, '04_normalization',
                'genomic_maps_nrm_%s_%s.pdf' % (opts.reso, param_hash))
            genom_map_raw_fig = path.join(
                opts.workdir, '04_normalization',
                'genomic_maps_raw_%s_%s.pdf' % (opts.reso, param_hash))
        else:
            genom_map_nrm_fig = None
            genom_map_raw_fig = None
        genom_map_nrm_txt = path.join(
            opts.workdir, '04_normalization',
            'genomic_nrm_%s_%s.tsv' % (opts.reso, param_hash))
        genom_map_raw_txt = path.join(
            opts.workdir, '04_normalization',
            'genomic_raw_%s_%s.tsv' % (opts.reso, param_hash))
        hic_map(hic_data,
                normalized=True,
                cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=genom_map_nrm_fig,
                savedata=genom_map_nrm_txt)
        hic_map(hic_data,
                normalized=False,
                cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=genom_map_raw_fig,
                savedata=genom_map_raw_txt)

    finish_time = time.localtime()

    save_to_db(opts, cis_trans_N_D, cis_trans_N_d, cis_trans_n_D,
               cis_trans_n_d, a2, bad_columns_file, bias_file, inter_vs_gcoord,
               mreads, intra_dir_nrm_fig, intra_dir_nrm_txt, inter_dir_nrm_fig,
               inter_dir_nrm_txt, genom_map_nrm_fig, genom_map_nrm_txt,
               intra_dir_raw_fig, intra_dir_raw_txt, inter_dir_raw_fig,
               inter_dir_raw_txt, genom_map_raw_fig, genom_map_raw_txt,
               launch_time, finish_time)
plt.rcParams['savefig.bbox'] = 'tight'

# numeric bin width for each supported resolution label
bin_size = {'Mb': 1e6}

# sample identifier: file name of the filtered reads without its suffix
pair_id = filtered_reads.rsplit("/", 1)[-1].replace("_filtered_map.tsv", "")

# Filtered reads: sequencing coverage along chromosomes
outfile = '%s/%s_plot_genomic_coverage_filtered_%s.png' % (
    POSTFILTERING_PLOTS, pair_id, genomic_coverage_resolution)
plt.rcParams['font.size'] = 20
coverages = plot_genomic_distribution(
    filtered_reads,
    name='filtered',
    savefig=outfile,
    resolution=genomic_coverage_resolution,
    pair_id=pair_id)
outfile = '%s/%s_genomic_coverage_filtered_%s.bed' % (
    COVERAGES, pair_id, genomic_coverage_resolution)
coverages.to_csv(outfile, sep='\t', index=False)

# Filtered reads: interaction matrix
outfile = '%s/%s_plot_interaction_matrix_filtered_%s.png' % (
    POSTFILTERING_PLOTS, pair_id, genomic_coverage_resolution)
plt.rcParams['font.size'] = 12
hic_map(
    filtered_reads,
    resolution=int(bin_size[genomic_coverage_resolution]),
    savefig=outfile,
    decay=False,
    cmap='jet')

# Dangling ends: sequencing coverage along chromosomes
outfile = '%s/%s_plot_genomic_coverage_dangling_ends_%s.png' % (
    POSTFILTERING_PLOTS, pair_id, genomic_coverage_resolution)
plt.rcParams['font.size'] = 20
coverages = plot_genomic_distribution(
    dangling_ends,
    name='dangling_ends',
    savefig=outfile,
    resolution=genomic_coverage_resolution,
    pair_id=pair_id)
outfile = '%s/%s_genomic_coverage_dangling_ends_%s.bed' % (
    COVERAGES, pair_id, genomic_coverage_resolution)
coverages.to_csv(outfile, sep='\t', index=False)

# Self-circle ends: sequencing coverage along chromosomes
outfile = '%s/%s_plot_genomic_coverage_self_circle_%s.png' % (
    POSTFILTERING_PLOTS, pair_id, genomic_coverage_resolution)
plt.rcParams['font.size'] = 20
coverages = plot_genomic_distribution(
    self_circle,
    name='self_circle',
    savefig=outfile,
    resolution=genomic_coverage_resolution,
    pair_id=pair_id)
outfile = '%s/%s_genomic_coverage_self_circle_%s.bed' % (
    COVERAGES, pair_id, genomic_coverage_resolution)
coverages.to_csv(outfile, sep='\t', index=False)
示例#6
0
def make_matrices(left_reads_fastq, right_reads_fastq, reads_fastq, genome_fasta, genome_index, \
                  output_directory, output_prefix, enzyme, res, chromosomes, threads_number, \
                  clean_tmp, tmp_dir):

    print 'Begin to process reads.'

    left_reads = ''
    right_reads = ''
    if reads_fastq != '': # left and right reads are stored in one file
        range_start_left, range_stop_left, \
        range_start_right, range_stop_right = calc_left_right_ranges(reads_fastq)
        print 'Reads:                     ', reads_fastq
        left_reads = reads_fastq
        right_reads = reads_fastq
    else: # left and right reads are stored separately
        range_start_left, range_stop_left, \
        range_start_right, range_stop_right = calc_range(left_reads_fastq)
        print 'Left reads:                ', left_reads_fastq
        print 'Right reads:               ', right_reads_fastq
        print 'Output prefix:             ', output_prefix
        left_reads = left_reads_fastq
        right_reads = right_reads_fastq

    print 'Reference genome FASTA:    ', genome_fasta
    print 'Reference genome GEM index:', genome_index
    print 'Output directory:          ', output_directory
    print 'Temp directory:            ', tmp_dir
    print 'Enzyme:                    ', enzyme
    print 'Resolution:                ', res, 'bp'
    print 'Number of threads:         ', threads_number
    print 'Start pos for left reads:  ', range_start_left
    print 'Stop pos for left reads:   ', range_stop_left
    print 'Start pos for right reads: ', range_start_right
    print 'Stop pos for right reads:  ', range_stop_right
    stdout.flush()

    # map left reads to reference genome
    out_sam_left_name = splitext(basename(left_reads))[0] + '_left.sam'
    out_sam_left_path = join(output_directory, out_sam_left_name)
    print 'Iterative mapping of left reads (using ' + str(threads_number) + ' threads)...'
    stdout.flush()
    sams_left = iterative_mapping(genome_index, left_reads, out_sam_left_path, \
                                  range_start_left, range_stop_left, nthreads=threads_number,
                                  temp_dir=tmp_dir)
    print 'Done.'
    stdout.flush()

    # map right reads to reference genome
    out_sam_right_name = splitext(basename(right_reads))[0] + '_right.sam'
    out_sam_right_path = join(output_directory, out_sam_right_name)
    print 'Iterative mapping of right reads (using ' + str(threads_number) + ' threads)...'
    stdout.flush()
    sams_right = iterative_mapping(genome_index, right_reads, out_sam_right_path, \
                                   range_start_right, range_stop_right, nthreads=threads_number,
                                   temp_dir=tmp_dir)
    print 'Done.'
    stdout.flush()

    # load reference genome sequence
    print 'Load reference genome sequence...'
    stdout.flush()
    chroms = chromosomes[:]
    genome_seq = parse_fasta(genome_fasta, chr_names=chroms)
    print 'Done.'
    stdout.flush()

    # create files with information about every left and right read 
    # and about their placement with respect to restriction sites
    tsv_left_name = splitext(basename(left_reads))[0] + '_left.tsv'
    tsv_left = join(output_directory, tsv_left_name)
    tsv_right_name = splitext(basename(right_reads))[0] + '_right.tsv'
    tsv_right = join(output_directory, tsv_right_name)
    print 'Get information about restriction sites and reads placement...'
    stdout.flush()
    parse_sam(sams_left, sams_right, tsv_left, tsv_right, genome_seq, enzyme, \
              verbose=True, ncpus=8)
    print 'Done.'
    stdout.flush()

    # create file with both left and right reads that uniquelly mapped to reference genome
    if reads_fastq != '': # left and right reads are stored in one file
        common_reads_prefix = splitext(basename(reads_fastq))[0]
    else: # left and right reads are stored separately
        common_reads_prefix = output_prefix
    uniq_reads_name = common_reads_prefix + '_both_map_uniq.tsv'
    uniq_reads = join(output_directory, uniq_reads_name)
    print 'Merge info about left and right reads in one file...'
    stdout.flush()
    get_intersection(tsv_left, tsv_right, uniq_reads, verbose=True)
    print 'Done.'
    stdout.flush()

    # find read IDs that are filtered by default TADbit filters
    print 'Mask reads...'
    stdout.flush()
    # debug
    print "uniq_reads =", uniq_reads
    masked = filter_reads(uniq_reads)
    print 'Done.'
    stdout.flush()

    # apply all filters (exclude reads that were filtered)
    print 'Filter masked reads...'
    stdout.flush()
    filtered_reads_name = common_reads_prefix + '_filtered.tsv'
    filtered_reads = join(output_directory, filtered_reads_name)
    apply_filter(uniq_reads, filtered_reads, masked)
    print 'Done.'
    stdout.flush()

    # create matrices (one matrix per chromosome)
    print 'Create Hi-C maps (one per chromosome)...'
    stdout.flush()
    hic_map(filtered_reads, resolution=res, by_chrom='intra', savedata=output_directory)
    print 'Done.'
    stdout.flush()
    print 'Add resolution (' + str(resolution) + ') to matrix filenames...'
    stdout.flush()
    add_resolution(chromosomes, resolution, output_directory)
    print 'Done.'
    stdout.flush()
    print 'Add headers to matrix files...'
    stdout.flush()
    add_headers(chromosomes, resolution, output_directory)
    print 'Done.'
    stdout.flush()
    if clean_tmp: # Remove all SAM and TSV files from the output directory
        print 'Remove SAM and TSV files from the output directory.'
        stdout.flush()
        map(os.remove, glob.glob(out_sam_left_path + '*'))
        map(os.remove, glob.glob(out_sam_right_path + '*'))
        map(os.remove, glob.glob(join(output_directory, '*.tsv')))
        print 'Done.'
        stdout.flush()
示例#7
0
                                       # 8, but leave some room for you other applications
            max_reads_per_chunk  = chunk,
            single_end           = True)

sams1 = [OUTPATH + fnam for fnam in os.listdir(OUTPATH) if fnam.rsplit('.', 2)[0].endswith('_r1.txt')]
sams2 = [OUTPATH + fnam for fnam in os.listdir(OUTPATH) if fnam.rsplit('.', 2)[0].endswith('_r2.txt')]

print 'created thes SAM files:', sams2

parse_sam(sams1, sams2, frags, 
          OUTPATH + 'reads1_%s.tsv' % rep, OUTPATH + 'reads2_%s.tsv' % rep,
          genome_seq, 'HindIII', verbose=True)

reads1 = OUTPATH + 'reads1_%s.tsv' % rep
reads2 = OUTPATH + 'reads2_%s.tsv' % rep
reads  = OUTPATH + 'reads12_%s.tsv' % rep

get_intersection(reads1, reads2, reads, verbose=True)

from pytadbit.mapping.analyze import hic_map
hic_map(reads, genome_seq, resolution=100000, savedata='lala')

from pytadbit.mapping.analyze import plot_genomic_distribution

plot_genomic_distribution(reads, resolution=50000, genome_seq=genome_seq) # because I know it

#  690691 SRR_test_10000000/reads1_SRR_test.tsv
#  690691 SRR_test_100000/reads1_SRR_test.tsv
# 1242927 SRR_test_200000/reads1_SRR_test.tsv
# 1035866 SRR_test_500000/reads1_SRR_test.tsv
示例#8
0
def make_matrices(left_reads_fastq, right_reads_fastq, reads_fastq, genome_fasta, genome_index, \
                  output_directory, output_prefix, enzyme, res, chromosomes, threads_number, \
                  clean_tmp, tmp_dir):

    print 'Begin to process reads.'

    left_reads = ''
    right_reads = ''
    if reads_fastq != '':  # left and right reads are stored in one file
        range_start_left, range_stop_left, \
        range_start_right, range_stop_right = calc_left_right_ranges(reads_fastq)
        print 'Reads:                     ', reads_fastq
        left_reads = reads_fastq
        right_reads = reads_fastq
    else:  # left and right reads are stored separately
        range_start_left, range_stop_left, \
        range_start_right, range_stop_right = calc_range(left_reads_fastq)
        print 'Left reads:                ', left_reads_fastq
        print 'Right reads:               ', right_reads_fastq
        print 'Output prefix:             ', output_prefix
        left_reads = left_reads_fastq
        right_reads = right_reads_fastq

    print 'Reference genome FASTA:    ', genome_fasta
    print 'Reference genome GEM index:', genome_index
    print 'Output directory:          ', output_directory
    print 'Temp directory:            ', tmp_dir
    print 'Enzyme:                    ', enzyme
    print 'Resolution:                ', res, 'bp'
    print 'Number of threads:         ', threads_number
    print 'Start pos for left reads:  ', range_start_left
    print 'Stop pos for left reads:   ', range_stop_left
    print 'Start pos for right reads: ', range_start_right
    print 'Stop pos for right reads:  ', range_stop_right
    stdout.flush()

    # map left reads to reference genome
    out_sam_left_name = splitext(basename(left_reads))[0] + '_left.sam'
    out_sam_left_path = join(output_directory, out_sam_left_name)
    print 'Iterative mapping of left reads (using ' + str(
        threads_number) + ' threads)...'
    stdout.flush()
    sams_left = iterative_mapping(genome_index, left_reads, out_sam_left_path, \
                                  range_start_left, range_stop_left, nthreads=threads_number,
                                  temp_dir=tmp_dir)
    print 'Done.'
    stdout.flush()

    # map right reads to reference genome
    out_sam_right_name = splitext(basename(right_reads))[0] + '_right.sam'
    out_sam_right_path = join(output_directory, out_sam_right_name)
    print 'Iterative mapping of right reads (using ' + str(
        threads_number) + ' threads)...'
    stdout.flush()
    sams_right = iterative_mapping(genome_index, right_reads, out_sam_right_path, \
                                   range_start_right, range_stop_right, nthreads=threads_number,
                                   temp_dir=tmp_dir)
    print 'Done.'
    stdout.flush()

    # load reference genome sequence
    print 'Load reference genome sequence...'
    stdout.flush()
    chroms = chromosomes[:]
    genome_seq = parse_fasta(genome_fasta, chr_names=chroms)
    print 'Done.'
    stdout.flush()

    # create files with information about every left and right read
    # and about their placement with respect to restriction sites
    tsv_left_name = splitext(basename(left_reads))[0] + '_left.tsv'
    tsv_left = join(output_directory, tsv_left_name)
    tsv_right_name = splitext(basename(right_reads))[0] + '_right.tsv'
    tsv_right = join(output_directory, tsv_right_name)
    print 'Get information about restriction sites and reads placement...'
    stdout.flush()
    parse_sam(sams_left, sams_right, tsv_left, tsv_right, genome_seq, enzyme, \
              verbose=True, ncpus=8)
    print 'Done.'
    stdout.flush()

    # create file with both left and right reads that uniquelly mapped to reference genome
    if reads_fastq != '':  # left and right reads are stored in one file
        common_reads_prefix = splitext(basename(reads_fastq))[0]
    else:  # left and right reads are stored separately
        common_reads_prefix = output_prefix
    uniq_reads_name = common_reads_prefix + '_both_map_uniq.tsv'
    uniq_reads = join(output_directory, uniq_reads_name)
    print 'Merge info about left and right reads in one file...'
    stdout.flush()
    get_intersection(tsv_left, tsv_right, uniq_reads, verbose=True)
    print 'Done.'
    stdout.flush()

    # find read IDs that are filtered by default TADbit filters
    print 'Mask reads...'
    stdout.flush()
    masked = filter_reads(uniq_reads)
    print 'Done.'
    stdout.flush()

    # apply all filters (exclude reads that were filtered)
    print 'Filter masked reads...'
    stdout.flush()
    filtered_reads_name = common_reads_prefix + '_filtered.tsv'
    filtered_reads = join(output_directory, filtered_reads_name)
    apply_filter(uniq_reads, filtered_reads, masked)
    print 'Done.'
    stdout.flush()

    # create matrices (one matrix per chromosome)
    print 'Create Hi-C maps (one per chromosome)...'
    stdout.flush()
    hic_map(filtered_reads,
            resolution=res,
            by_chrom='intra',
            savedata=output_directory)
    print 'Done.'
    stdout.flush()
    print 'Add resolution (' + str(resolution) + ') to matrix filenames...'
    stdout.flush()
    add_resolution(chromosomes, resolution, output_directory)
    print 'Done.'
    stdout.flush()
    print 'Add headers to matrix files...'
    stdout.flush()
    add_headers(chromosomes, resolution, output_directory)
    print 'Done.'
    stdout.flush()
    if clean_tmp:  # Remove all SAM and TSV files from the output directory
        print 'Remove SAM and TSV files from the output directory.'
        stdout.flush()
        remove(out_sam_left_path + '*')
        remove(out_sam_right_path + '*')
        remove(join(output_directory, '*.tsv'))
        print 'Done.'
        stdout.flush()
示例#9
0
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bed:
        mreads = path.realpath(opts.bed)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    print 'loading', mreads
    hic_data = load_hic_data_from_reads(mreads, opts.reso)

    mkdir(path.join(opts.workdir, '04_normalization'))

    print 'Get poor bins...'
    try:
        hic_data.filter_columns(perc_zero=opts.perc_zeros, min_count=opts.min_count,
                                draw_hist=True,
                                by_mean=not opts.fast_filter, savefig=path.join(
                                    opts.workdir, '04_normalization',
                                    'bad_columns_%s_%d_%d_%s.pdf' % (
                                        opts.reso, opts.perc_zeros, opts.min_count,
                                        param_hash)) if
                                not opts.fast_filter else None)
    except ValueError:
        raise ValueError('ERROR: probably all columns filtered out...')
    # bad columns
    bad_columns_file = path.join(opts.workdir, '04_normalization',
                                 'bad_columns_%s_%d_%d_%s.tsv' % (
                                     opts.reso, opts.perc_zeros, opts.min_count, param_hash))
    out_bad = open(bad_columns_file, 'w')
    out_bad.write('\n'.join([str(i) for i in hic_data.bads.keys()]))
    out_bad.close()

    # Identify biases
    if not opts.filter_only:
        print 'Get biases using ICE...'
        hic_data.normalize_hic(silent=False, max_dev=0.1, iterations=0,
                               factor=opts.factor)

    print 'Getting cis/trans...'
    cis_trans_N_D = cis_trans_N_d = float('nan')
    if not opts.filter_only:
        cis_trans_N_D = hic_data.cis_trans_ratio(normalized=True , diagonal=True )
        cis_trans_N_d = hic_data.cis_trans_ratio(normalized=True , diagonal=False)
    cis_trans_n_D = hic_data.cis_trans_ratio(normalized=False, diagonal=True )
    cis_trans_n_d = hic_data.cis_trans_ratio(normalized=False, diagonal=False)
        
    if not opts.filter_only:
        print 'Cis/Trans ratio of normalized matrix including the diagonal', cis_trans_N_D
        print 'Cis/Trans ratio of normalized matrix excluding the diagonal', cis_trans_N_d
    print 'Cis/Trans ratio of raw matrix including the diagonal', cis_trans_n_D
    print 'Cis/Trans ratio of raw matrix excluding the diagonal', cis_trans_n_d

    # Plot genomic distance vs interactions
    print 'Plot genomic distance vs interactions...'
    inter_vs_gcoord = path.join(opts.workdir, '04_normalization',
                                'interactions_vs_genomic-coords.pdf_%s_%s.pdf' % (
                                    opts.reso, param_hash))
    (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
        hic_data, max_diff=10000, resolution=opts.reso, normalized=not opts.filter_only,
        savefig=inter_vs_gcoord)
    
    print 'Decay slope 0.7-10 Mb\t%s' % a2

    # write biases
    bias_file = path.join(opts.workdir, '04_normalization',
                          'bias_%s_%s.tsv' % (opts.reso, param_hash))
    out_bias = 'NA'
    if not opts.filter_only:
        out_bias = open(bias_file, 'w')
        out_bias.write('\n'.join(['%d\t%f' % (i, hic_data.bias[i])
                                  for i in hic_data.bias])
                       + '\n')
        out_bias.close()


    # pickle the HiC-data object
    print 'Saving genomic matrix'
    pickle_path = path.join(opts.workdir, '04_normalization',
                            'hic-data_%s_%s.pickle' % (nice(opts.reso), param_hash))
    out = open(pickle_path, 'w')
    dump(hic_data, out)
    out.close()

    # to feed the save_to_db funciton
    intra_dir_nrm_fig = intra_dir_nrm_txt = None
    inter_dir_nrm_fig = inter_dir_nrm_txt = None
    genom_map_nrm_fig = genom_map_nrm_txt = None
    intra_dir_raw_fig = intra_dir_raw_txt = None
    inter_dir_raw_fig = inter_dir_raw_txt = None
    genom_map_raw_fig = genom_map_raw_txt = None

    if "intra" in opts.keep:
        print "  Saving intra chromosomal raw and normalized matrices..."
        if opts.only_txt:
            intra_dir_nrm_fig = None
            intra_dir_raw_fig = None
        else:
            intra_dir_nrm_fig = path.join(opts.workdir, '04_normalization',
                                          'intra_chromosome_nrm_images_%s_%s' % (opts.reso, param_hash))
            intra_dir_raw_fig = path.join(opts.workdir, '04_normalization',
                                          'intra_chromosome_raw_images_%s_%s' % (opts.reso, param_hash))
        intra_dir_nrm_txt = path.join(opts.workdir, '04_normalization',
                                      'intra_chromosome_nrm_matrices_%s_%s' % (opts.reso, param_hash))
        intra_dir_raw_txt = path.join(opts.workdir, '04_normalization',
                                      'intra_chromosome_raw_matrices_%s_%s' % (opts.reso, param_hash))
        if not opts.filter_only:
            hic_map(hic_data, normalized=True, by_chrom='intra', cmap='jet',
                    name=path.split(opts.workdir)[-1],
                    savefig=intra_dir_nrm_fig, savedata=intra_dir_nrm_txt)
        hic_map(hic_data, normalized=False, by_chrom='intra', cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=intra_dir_raw_fig, savedata=intra_dir_raw_txt)

    if "inter" in opts.keep:
        print "  Saving inter chromosomal raw and normalized matrices..."
        if opts.only_txt:
            inter_dir_nrm_fig = None
            inter_dir_raw_fig = None
        else:
            if not opts.filter_only:
                inter_dir_nrm_fig = path.join(opts.workdir, '04_normalization',
                                              'inter_chromosome_nrm_images_%s_%s' % (opts.reso, param_hash))
            inter_dir_raw_fig = path.join(opts.workdir, '04_normalization',
                                      'inter_chromosome_raw_images_%s_%s' % (opts.reso, param_hash))
        if not opts.filter_only:
            inter_dir_nrm_txt = path.join(opts.workdir, '04_normalization',
                                          'inter_chromosome_nrm_matrices_%s_%s' % (opts.reso, param_hash))
        inter_dir_raw_txt = path.join(opts.workdir, '04_normalization',
                                  'inter_chromosome_raw_matrices_%s_%s' % (opts.reso, param_hash))
        if not opts.filter_only:
            hic_map(hic_data, normalized=True, by_chrom='inter', cmap='jet',
                    name=path.split(opts.workdir)[-1],
                    savefig=inter_dir_nrm_fig, savedata=inter_dir_nrm_txt)
        hic_map(hic_data, normalized=False, by_chrom='inter', cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=inter_dir_raw_fig, savedata=inter_dir_raw_txt)

    if "genome" in opts.keep:
        print "  Saving normalized genomic matrix..."
        if opts.only_txt:
            genom_map_nrm_fig = None
            genom_map_raw_fig = None
        else:
            if not opts.filter_only:
                genom_map_nrm_fig = path.join(opts.workdir, '04_normalization',
                                              'genomic_maps_nrm_%s_%s.pdf' % (opts.reso, param_hash))
            genom_map_raw_fig = path.join(opts.workdir, '04_normalization',
                                          'genomic_maps_raw_%s_%s.pdf' % (opts.reso, param_hash))
        if not opts.filter_only:
            genom_map_nrm_txt = path.join(opts.workdir, '04_normalization',
                                          'genomic_nrm_%s_%s.tsv' % (opts.reso, param_hash))
        genom_map_raw_txt = path.join(opts.workdir, '04_normalization',
                                      'genomic_raw_%s_%s.tsv' % (opts.reso, param_hash))
        if not opts.filter_only:
            hic_map(hic_data, normalized=True, cmap='jet',
                    name=path.split(opts.workdir)[-1],
                savefig=genom_map_nrm_fig, savedata=genom_map_nrm_txt)
        hic_map(hic_data, normalized=False, cmap='jet',
                name=path.split(opts.workdir)[-1],
                savefig=genom_map_raw_fig, savedata=genom_map_raw_txt)

    finish_time = time.localtime()

    save_to_db (opts, cis_trans_N_D, cis_trans_N_d, cis_trans_n_D, cis_trans_n_d,
                a2, bad_columns_file, bias_file, inter_vs_gcoord, mreads,
                len(hic_data.bads.keys()), len(hic_data),
                intra_dir_nrm_fig, intra_dir_nrm_txt,
                inter_dir_nrm_fig, inter_dir_nrm_txt,
                genom_map_nrm_fig, genom_map_nrm_txt,
                intra_dir_raw_fig, intra_dir_raw_txt,
                inter_dir_raw_fig, inter_dir_raw_txt,
                genom_map_raw_fig, genom_map_raw_txt,
                pickle_path, launch_time, finish_time)