예제 #1
0
def run(opts):
    """Intersect the two read-end mapping files, estimate insert/fragment
    sizes, filter the read pairs, and export valid pairs plus a BAM file,
    finally recording the job in the project's SQLite DB.

    :param opts: parsed command-line options; this function reads workdir,
       resume, over_represented, max_frag_size, min_frag_size, re_proximity,
       apply, valid, cpus, format and samtools from it
    """
    check_options(opts)
    launch_time = time.localtime()

    fname1, fname2 = load_parameters_fromdb(opts)

    # hash of the run parameters, used to tag every output file of this job
    param_hash = digest_parameters(opts)

    reads = path.join(opts.workdir, '03_filtered_reads',
                      'all_r1-r2_intersection_%s.tsv' % param_hash)
    mreads = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % param_hash)

    # NOTE(review): when opts.resume is True this whole branch is skipped,
    # leaving `masked`, `count`, `multiples`, `median`, `max_f`, `mad` and
    # `hist_path` unbound for the code below -- confirm resume is handled
    # upstream, otherwise a NameError follows.
    if not opts.resume:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))

        # compute the intersection of the two read ends
        print('Getting intersection between read 1 and read 2')
        count, multiples = get_intersection(fname1, fname2, reads)

        # compute insert size
        print('Get insert size...')
        hist_path = path.join(opts.workdir,
                              'histogram_fragment_sizes_%s.pdf' % param_hash)
        median, max_f, mad = fragment_size(
            reads, nreads=1000000, stats=('median', 'first_decay', 'MAD'),
            savefig=hist_path)

        print('  - median insert size =', median)
        print('  - double median absolution of insert size =', mad)
        print('  - max insert size (when a gap in continuity of > 10 bp is found in fragment lengths) =', max_f)

        # thresholds derived from the estimated fragment-size distribution
        max_mole = max_f  # pseudo DEs
        min_dist = max_f + mad  # random breaks
        print(('   Using the maximum continuous fragment size'
               '(%d bp) to check '
               'for pseudo-dangling ends') % max_mole)
        print(('   Using maximum continuous fragment size plus the MAD '
               '(%d bp) to check for random breaks') % min_dist)

        print("identify pairs to filter...")
        masked = filter_reads(reads, max_molecule_length=max_mole,
                              over_represented=opts.over_represented,
                              max_frag_size=opts.max_frag_size,
                              min_frag_size=opts.min_frag_size,
                              re_proximity=opts.re_proximity,
                              min_dist_to_re=min_dist, fast=True)

    # keep only the pairs passing the filters selected in opts.apply
    n_valid_pairs = apply_filter(reads, mreads, masked, filters=opts.apply)

    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s' % param_hash)

    # export either the filtered (valid) pairs or the full intersection
    if opts.valid:
        infile = mreads
    else:
        infile = reads
    bed2D_to_BAMhic(infile, opts.valid, opts.cpus, outbam, opts.format, masked,
                    samtools=opts.samtools)

    finish_time = time.localtime()
    print(median, max_f, mad)
    # save all job information to sqlite DB
    save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               outbam + '.bam', hist_path, median, max_f, mad, launch_time, finish_time)
예제 #2
0
def run(opts):
    """Intersect the two read-end mapping files (or reuse a pre-intersected
    file with --fast_fragment), estimate insert/fragment sizes, filter the
    read pairs, and export valid pairs plus a BAM file, finally recording
    the job in the project's SQLite DB.

    :param opts: parsed command-line options; this function reads workdir,
       resume, fast_fragment, compress_input, median, max_f, mad,
       over_represented, max_frag_size, min_frag_size, re_proximity,
       strict_duplicates, apply, valid, cpus, format and samtools from it
    """
    check_options(opts)
    launch_time = time.localtime()

    fname1, fname2 = load_parameters_fromdb(opts)

    # hash of the run parameters, used to tag every output file of this job
    param_hash = digest_parameters(opts)

    reads = path.join(opts.workdir, '03_filtered_reads',
                      'all_r1-r2_intersection_%s.tsv' % param_hash)
    mreads = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % param_hash)

    # NOTE(review): when opts.resume is True this whole branch is skipped,
    # leaving `masked`, `count`, `multiples`, `median`, `max_f`, `mad` and
    # `hist_path` unbound for the code below -- confirm resume is handled
    # upstream, otherwise a NameError follows.
    if not opts.resume:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))

        if opts.fast_fragment:
            # fname1 already holds the intersected pairs; count total lines
            # and how many are flagged as multiple contacts ('#' in first
            # column).
            reads = fname1
            counts_multis = [
                '#' in line.split('\t')[0] for line in open(reads)
            ]
            count = len(counts_multis)
            multiples = {}
            multiples[1] = sum(
                [count_mult for count_mult in counts_multis if count_mult])
            del counts_multis
        else:
            # compute the intersection of the two read ends
            print('Getting intersection between read 1 and read 2')
            count, multiples = get_intersection(fname1,
                                                fname2,
                                                reads,
                                                compress=opts.compress_input)

        # compute insert size
        print('Get insert size...')
        hist_path = path.join(opts.workdir,
                              'histogram_fragment_sizes_%s.pdf' % param_hash)
        try:
            median, max_f, mad = fragment_size(reads,
                                               nreads=1000000,
                                               stats=('median', 'first_decay',
                                                      'MAD'),
                                               savefig=hist_path)
        except ZeroDivisionError:
            # zeros here are caught by the `median < 50` fallback just below,
            # which applies the 400 nt default announced in the warning
            warn('WARNING: cannot compute fragment length, too few '
                 'dangling-ends. Setting median length to 400 nt.')
            median = max_f = mad = 0
        if median < 50:
            # BUGFIX: the warning previously reported `mad` instead of the
            # tested `median` value
            warn('WARNING: fragment length too short ({}). '
                 'Setting median length to 400 nt.'.format(median))
            median, max_f, mad = 400, 100, 40
        # user-supplied values override the estimated statistics
        if opts.median:
            median = opts.median
        if opts.max_f:
            max_f = opts.max_f
        if opts.mad:
            mad = opts.mad

        print('  - median insert size =', median)
        print('  - median absolution of insert size =', mad)
        print(
            '  - max insert size (when a gap in continuity of > 10 bp is found in fragment lengths) =',
            max_f)

        # thresholds derived from the estimated fragment-size distribution
        max_mole = max_f  # pseudo DEs
        min_dist = max_f + mad  # random breaks
        print('   Using the maximum continuous fragment size'
              '(%d bp) to check '
              'for pseudo-dangling ends' % max_mole)
        print('   Using maximum continuous fragment size plus the MAD '
              '(%d bp) to check for random breaks' % min_dist)

        print("identify pairs to filter...")
        masked = filter_reads(reads,
                              max_molecule_length=max_mole,
                              over_represented=opts.over_represented,
                              max_frag_size=opts.max_frag_size,
                              min_frag_size=opts.min_frag_size,
                              re_proximity=opts.re_proximity,
                              strict_duplicates=opts.strict_duplicates,
                              min_dist_to_re=min_dist,
                              fast=True)

    # keep only the pairs passing the filters selected in opts.apply
    n_valid_pairs = apply_filter(reads, mreads, masked, filters=opts.apply)

    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s' % param_hash)

    # export either the filtered (valid) pairs or the full intersection
    if opts.valid:
        infile = mreads
    else:
        infile = reads
    bed2D_to_BAMhic(infile,
                    opts.valid,
                    opts.cpus,
                    outbam,
                    opts.format,
                    masked,
                    samtools=opts.samtools)

    finish_time = time.localtime()
    print(median, max_f, mad)
    # save all job information to sqlite DB
    save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               outbam + '.bam', hist_path, median, max_f, mad, launch_time,
               finish_time)
예제 #3
0
def run(opts):
    """Intersect the two read-end mapping files, estimate insert/fragment
    sizes, filter the read pairs, and export valid pairs plus a BAM file,
    finally recording the job in the project's SQLite DB.

    :param opts: parsed command-line options; this function reads workdir,
       resume, over_represented, max_frag_size, min_frag_size, re_proximity,
       apply, valid, cpus, format and samtools from it
    """
    check_options(opts)
    launch_time = time.localtime()

    fname1, fname2 = load_parameters_fromdb(opts)

    # hash of the run parameters, used to tag every output file of this job
    param_hash = digest_parameters(opts)

    reads = path.join(opts.workdir, '03_filtered_reads',
                      'all_r1-r2_intersection_%s.tsv' % param_hash)
    mreads = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % param_hash)

    # NOTE(review): when opts.resume is True this whole branch is skipped,
    # leaving `masked`, `count`, `multiples`, `median`, `max_f`, `mad` and
    # `hist_path` unbound for the code below -- confirm resume is handled
    # upstream, otherwise a NameError follows.
    if not opts.resume:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))

        # compute the intersection of the two read ends
        print('Getting intersection between read 1 and read 2')
        count, multiples = get_intersection(fname1, fname2, reads)

        # compute insert size
        print('Get insert size...')
        hist_path = path.join(opts.workdir,
                              'histogram_fragment_sizes_%s.pdf' % param_hash)
        median, max_f, mad = fragment_size(reads,
                                           nreads=1000000,
                                           stats=('median', 'first_decay',
                                                  'MAD'),
                                           savefig=hist_path)

        print('  - median insert size =', median)
        print('  - double median absolution of insert size =', mad)
        print('  - max insert size (when a gap in continuity of > 10 bp is found in fragment lengths) =', max_f)

        # thresholds derived from the estimated fragment-size distribution
        max_mole = max_f  # pseudo DEs
        min_dist = max_f + mad  # random breaks
        print(
            '   Using the maximum continuous fragment size'
            '(%d bp) to check '
            'for pseudo-dangling ends' % max_mole)
        print(
            '   Using maximum continuous fragment size plus the MAD '
            '(%d bp) to check for random breaks' % min_dist)

        print("identify pairs to filter...")
        masked = filter_reads(reads,
                              max_molecule_length=max_mole,
                              over_represented=opts.over_represented,
                              max_frag_size=opts.max_frag_size,
                              min_frag_size=opts.min_frag_size,
                              re_proximity=opts.re_proximity,
                              min_dist_to_re=min_dist,
                              fast=True)

    # keep only the pairs passing the filters selected in opts.apply
    n_valid_pairs = apply_filter(reads, mreads, masked, filters=opts.apply)

    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s' % param_hash)

    # export either the filtered (valid) pairs or the full intersection
    if opts.valid:
        infile = mreads
    else:
        infile = reads
    bed2D_to_BAMhic(infile,
                    opts.valid,
                    opts.cpus,
                    outbam,
                    opts.format,
                    masked,
                    samtools=opts.samtools)

    finish_time = time.localtime()
    print(median, max_f, mad)
    # save all job information to sqlite DB
    save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               outbam + '.bam', hist_path, median, max_f, mad, launch_time,
               finish_time)