예제 #1
0
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    fname1, fname2 = load_parameters_fromdb(opts)

    param_hash = digest_parameters(opts)

    reads = path.join(opts.workdir, '03_filtered_reads',
                      'all_r1-r2_intersection_%s.tsv' % param_hash)
    mreads = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % param_hash)

    if not opts.resume:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))

        # compute the intersection of the two read ends
        print 'Getting intersection between read 1 and read 2'
        count, multiples = get_intersection(fname1, fname2, reads)

        # compute insert size
        print 'Get insert size...'
        hist_path = path.join(opts.workdir,
                              'histogram_fragment_sizes_%s.pdf' % param_hash)
        median, max_f, mad = insert_sizes(
            reads, nreads=1000000, stats=('median', 'first_decay', 'MAD'),
            savefig=hist_path)
        
        print '  - median insert size =', median
        print '  - double median absolution of insert size =', mad
        print '  - max insert size (when a gap in continuity of > 10 bp is found in fragment lengths) =', max_f
    
        max_mole = max_f # pseudo DEs
        min_dist = max_f + mad # random breaks
        print ('   Using the maximum continuous fragment size'
               '(%d bp) to check '
               'for pseudo-dangling ends') % max_mole
        print ('   Using maximum continuous fragment size plus the MAD '
               '(%d bp) to check for random breaks') % min_dist
    
        print "identify pairs to filter..."
        masked = filter_reads(reads, max_molecule_length=max_mole,
                              over_represented=opts.over_represented,
                              max_frag_size=opts.max_frag_size,
                              min_frag_size=opts.min_frag_size,
                              re_proximity=opts.re_proximity,
                              min_dist_to_re=min_dist, fast=True)

    n_valid_pairs = apply_filter(reads, mreads, masked,
                                 filters=opts.apply)

    finish_time = time.localtime()
    print median, max_f, mad
    # save all job information to sqlite DB
    save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               hist_path, median, max_f, mad, launch_time, finish_time)
예제 #2
0
    def test_19_matrix_manip(self):
        if ONLY and not "19" in ONLY:
            return
        if CHKTIME:
            t0 = time()
        hic_data1 = load_hic_data_from_reads("lala-map~", resolution=10000)
        hic_map(hic_data1, savedata="lala-map.tsv~", savefig="lala.pdf")
        hic_map(hic_data1,
                by_chrom="intra",
                savedata="lala-maps~",
                savefig="lalalo~")
        hic_map(hic_data1,
                by_chrom="inter",
                savedata="lala-maps~",
                savefig="lalala~")
        # slowest part of the all test:
        hic_data2 = read_matrix("lala-map.tsv~", resolution=10000)
        self.assertEqual(hic_data1, hic_data2)
        # vals = plot_distance_vs_interactions(hic_data1)

        # self.assertEqual([round(i, 2) if str(i)!="nan" else 0.0 for i in
        #                   reduce(lambda x, y: x + y, vals)],
        #                  [-1.68, -2.08, 0.02, 2.76, -8.99, 0.0, 0.82, -6.8, 0.0])

        a, b = insert_sizes("lala-map~")
        self.assertEqual([int(a), int(b)], [43, 1033])

        hic_data1 = read_matrix(PATH + "/20Kb/chrT/chrT_A.tsv",
                                resolution=20000)
        hic_data2 = read_matrix(PATH + "/20Kb/chrT/chrT_B.tsv",
                                resolution=20000)

        corr = correlate_matrices(hic_data1, hic_data2)
        corr = [round(i, 3) for i in corr[0]]
        self.assertEqual(corr, [
            0.755, 0.729, 0.804, 0.761, 0.789, 0.776, 0.828, 0.757, 0.797,
            0.832
        ])

        ecorr = eig_correlate_matrices(hic_data1,
                                       hic_data2,
                                       savefig='lala3.pdf')

        ecorr = [round(i, 3) for i in reduce(lambda x, y: x + y, ecorr)]
        self.assertEqual(ecorr, [
            0.997, 0.322, 0.442, 0.017, 0.243, 0.014, 0.321, 0.999, 0.01,
            0.006, 0.0, 0.007, 0.451, 0.012, 0.996, 0.031, 0.013, 0.004, 0.002,
            0.006, 0.029, 0.974, 0.076, 0.03, 0.219, 0.013, 0.031, 0.08, 0.974,
            0.018, 0.028, 0.004, 0.0, 0.028, 0.034, 0.89
        ])
        system("rm -rf lala*")
        if CHKTIME:
            self.assertEqual(True, True)
            print "19", time() - t0
예제 #3
0
    def test_19_matrix_manip(self):
        if ONLY and ONLY != '19':
            return
        if CHKTIME:
            t0 = time()
        hic_data1 = load_hic_data_from_reads('lala-map~', resolution=10000)
        hic_map(hic_data1, savedata='lala-map.tsv~', savefig='lala.pdf~')
        hic_map(hic_data1,
                by_chrom='intra',
                savedata='lala-maps~',
                savefig='lalalo~')
        hic_map(hic_data1,
                by_chrom='inter',
                savedata='lala-maps~',
                savefig='lalala~')
        # slowest part of the all test:
        hic_data2 = read_matrix('lala-map.tsv~', resolution=10000)
        self.assertEqual(hic_data1, hic_data2)
        vals = plot_distance_vs_interactions(hic_data1)

        self.assertEqual([
            round(i, 2) if str(i) != 'nan' else 0.0
            for i in reduce(lambda x, y: x + y, vals)
        ], [-1.68, -2.08, 0.02, 2.76, -8.99, 0.0, 0.82, -6.8, 0.0])

        a, b = insert_sizes('lala-map~')
        self.assertEqual([int(a), int(b)], [43, 1033])

        hic_data1 = read_matrix('20Kb/chrT/chrT_A.tsv', resolution=20000)
        hic_data2 = read_matrix('20Kb/chrT/chrT_B.tsv', resolution=20000)

        corr = correlate_matrices(hic_data1, hic_data2)
        corr = [round(i, 3) for i in corr[0]]
        self.assertEqual(corr, [
            0.755, 0.729, 0.804, 0.761, 0.789, 0.776, 0.828, 0.757, 0.797,
            0.832
        ])

        ecorr = eig_correlate_matrices(hic_data1, hic_data2)
        ecorr = [round(i, 3) for i in reduce(lambda x, y: x + y, ecorr)]
        self.assertEqual(ecorr, [
            0.997, 0.322, 0.442, 0.017, 0.243, 0.014, 0.321, 0.999, 0.01,
            0.006, 0.0, 0.007, 0.451, 0.012, 0.996, 0.031, 0.013, 0.004, 0.002,
            0.006, 0.029, 0.974, 0.076, 0.03, 0.219, 0.013, 0.031, 0.08, 0.974,
            0.018, 0.028, 0.004, 0.0, 0.028, 0.034, 0.89
        ])
        system('rm -rf lala*')
        if CHKTIME:
            self.assertEqual(True, True)
            print '19', time() - t0
예제 #4
0
    def test_19_matrix_manip(self):
        if ONLY and ONLY != '19':
            return
        if CHKTIME:
            t0 = time()
        hic_data1 = load_hic_data_from_reads('lala-map~', resolution=10000)
        hic_map(hic_data1, savedata='lala-map.tsv~', savefig='lala.pdf~')
        hic_map(hic_data1, by_chrom='intra', savedata='lala-maps~', savefig='lalalo~')
        hic_map(hic_data1, by_chrom='inter', savedata='lala-maps~', savefig='lalala~')
        # slowest part of the all test:
        hic_data2 = read_matrix('lala-map.tsv~', resolution=10000)
        self.assertEqual(hic_data1, hic_data2)
        vals = plot_distance_vs_interactions(hic_data1)
        
        self.assertEqual([round(i, 2) if str(i)!='nan' else 0.0 for i in
                          reduce(lambda x, y: x + y, vals)],
                         [-1.74, 4.2, 0.52, 1.82, -0.44, 0.0, -0.5, 2.95, 0.0])
        
        a, b = insert_sizes('lala-map~')
        self.assertEqual([int(a),int(b)], [43, 1033])

        hic_data1 = read_matrix('20Kb/chrT/chrT_A.tsv', resolution=20000)
        hic_data2 = read_matrix('20Kb/chrT/chrT_B.tsv', resolution=20000)
        
        corr = correlate_matrices(hic_data1, hic_data2)
        corr =  [round(i,3) for i in corr[0]]
        self.assertEqual(corr, [0.755, 0.729, 0.804, 0.761, 0.789, 0.776, 0.828,
                                0.757, 0.797, 0.832])
        
        ecorr = eig_correlate_matrices(hic_data1, hic_data2)
        ecorr = [round(i,3) for i in reduce(lambda x, y:x+y, ecorr)]
        self.assertEqual(ecorr, [0.997, 0.322, 0.442, 0.017, 0.243, 0.014,
                                 0.321, 0.999, 0.01, 0.006, 0.0, 0.007, 0.451,
                                 0.012, 0.996, 0.031, 0.013, 0.004, 0.002,
                                 0.006, 0.029, 0.974, 0.076, 0.03, 0.219, 0.013,
                                 0.031, 0.08, 0.974, 0.018, 0.028, 0.004, 0.0,
                                 0.028, 0.034, 0.89])
        system('rm -rf lala*')
        if CHKTIME:
            self.assertEqual(True, True)
            print '19', time() - t0
예제 #5
0
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    fname1, fname2 = load_parameters_fromdb(opts)

    param_hash = digest_parameters(opts)

    reads = path.join(opts.workdir, '03_filtered_reads',
                      'all_r1-r2_intersection_%s.tsv' % param_hash)
    mreads = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % param_hash)

    if not opts.resume:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))

        # compute the intersection of the two read ends
        print 'Getting intersection between read 1 and read 2'
        count, multiples = get_intersection(fname1, fname2, reads)

        # compute insert size
        print 'Get insert size...'
        hist_path = path.join(opts.workdir,
                              'histogram_fragment_sizes_%s.pdf' % param_hash)
        median, max_f, mad = insert_sizes(reads,
                                          nreads=1000000,
                                          stats=('median', 'first_decay',
                                                 'MAD'),
                                          savefig=hist_path)

        print '  - median insert size =', median
        print '  - double median absolution of insert size =', mad
        print '  - max insert size (when a gap in continuity of > 10 bp is found in fragment lengths) =', max_f

        max_mole = max_f  # pseudo DEs
        min_dist = max_f + mad  # random breaks
        print(
            '   Using the maximum continuous fragment size'
            '(%d bp) to check '
            'for pseudo-dangling ends') % max_mole
        print(
            '   Using maximum continuous fragment size plus the MAD '
            '(%d bp) to check for random breaks') % min_dist

        print "identify pairs to filter..."
        masked = filter_reads(reads,
                              max_molecule_length=max_mole,
                              over_represented=0.001,
                              max_frag_size=100000,
                              min_frag_size=50,
                              re_proximity=5,
                              min_dist_to_re=min_dist,
                              fast=True)

    n_valid_pairs = apply_filter(reads, mreads, masked, filters=opts.apply)

    finish_time = time.localtime()
    print median, max_f, mad
    # save all job information to sqlite DB
    save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               hist_path, median, max_f, mad, launch_time, finish_time)
예제 #6
0
    savefig=outfile)
# Turn the per-iteration mapping counts into a table with one column per
# read end.
reads_mapped_per_iteration = pd.DataFrame.from_dict(reads_mapped_per_iteration)
reads_mapped_per_iteration.columns = ['read1', 'read2']
# Fraction mapped = last entry of each column divided by n_reads_trimmed.
# NOTE(review): presumably the per-iteration counts are cumulative, so the
# last entry is the total -- confirm against the code that fills the dict.
fraction_mapped_read1 = list(
    reads_mapped_per_iteration['read1'])[-1] / float(n_reads_trimmed)
fraction_mapped_read2 = list(
    reads_mapped_per_iteration['read2'])[-1] / float(n_reads_trimmed)
# Both fractions as a single comma-separated string (read1 first).
fraction_mapped_str = ",".join(
    [str(i) for i in [fraction_mapped_read1, fraction_mapped_read2]])

# Plot: distribution of dangling-end lengths
# (rcParams is global matplotlib state, so it is set before each plot call)
plt.rcParams['font.size'] = 12
infile = '%s/%s_both_map.tsv' % (PROCESSED, pair_id)
outfile = '%s/%s_plot_distribution_dangling_ends_lengths.png' % (
    POSTMAPPING_PLOTS, pair_id)
insert_sizes(infile, xlog=False, max_size=99.9, savefig=outfile)

# Plot: Decay of interaction counts with genomic distance
plt.rcParams['font.size'] = 12
outfile = '%s/%s_plot_decay_interaction_counts_genomic_distance.png' % (
    POSTMAPPING_PLOTS, pair_id)
myvalues = plot_distance_vs_interactions(infile,
                                         max_diff=50000000,
                                         resolution=10000,
                                         savefig=outfile)
# NOTE(review): element [1][0] of the returned value is kept, as a string,
# under the name "slope" -- confirm its meaning against
# plot_distance_vs_interactions.
slope = str(myvalues[1][0])

# Plot: sequencing coverage along chromosomes
outfile = '%s/%s_plot_genomic_coverage_mapped_%s.png' % (
    POSTMAPPING_PLOTS, pair_id, genomic_coverage_resolution)
plt.rcParams['font.size'] = 20