Example #1
    def test_18_filter_reads(self):
        if ONLY and ONLY != '18':
            return
        if CHKTIME:
            t0 = time()
        for ali in ['map', 'sam']:
            seed(1)
            if 13436 == int(random()*100000):
                same_seed = True
                genome = generate_random_ali(ali)
                genome_bis = parse_fasta('test.fa~', verbose=False)
                self.assertEqual(genome, genome_bis)
            else:
                same_seed = False
                genome = parse_fasta('test.fa~')
            # PARSE SAM
            if ali == 'map':
                from pytadbit.parsers.map_parser import parse_map as parser
            else:
                try:
                    from pytadbit.parsers.sam_parser import parse_sam as parser
                except ImportError:
                    print('ERROR: PYSAM not found, skipping test\n')
                    continue

            parser(['test_read1.%s~' % (ali)], ['test_read2.%s~' % (ali)],
                   './lala1-%s~' % (ali), './lala2-%s~' % (ali), genome,
                   re_name='DPNII', mapper='GEM')

            # GET INTERSECTION
            from pytadbit.mapping import get_intersection
            get_intersection('lala1-%s~' % (ali), 'lala2-%s~' % (ali),
                             'lala-%s~' % (ali))
            # FILTER
            masked = filter_reads('lala-%s~' % (ali), verbose=False,
                                  fast=(ali=='map'))
            self.assertEqual(masked[1]['reads'], 1000)
            self.assertEqual(masked[2]['reads'], 1000)
            self.assertEqual(masked[3]['reads'], 1000)
            self.assertEqual(masked[4]['reads'], 1000)
            if same_seed:
                self.assertEqual(masked[5]['reads'], 1110)
                self.assertEqual(masked[6]['reads'], 2332)
                self.assertEqual(masked[7]['reads'], 0)
                self.assertEqual(masked[8]['reads'], 141)
                self.assertEqual(masked[10]['reads'], 1)
            else:
                self.assertTrue (masked[5]['reads'] > 1000)
            self.assertEqual(masked[9]['reads'], 1000)
        apply_filter('lala-map~', 'lala-map-filt~', masked, filters=[1],
                     reverse=True, verbose=False)
        self.assertEqual(len([True for l in open('lala-map-filt~')
                              if not l.startswith('#')]), 1000)
        d = plot_iterative_mapping('lala1-map~', 'lala2-map~')
        self.assertEqual(d[0][1], 6000)

        if CHKTIME:
            self.assertEqual(True, True)
            print('18', time() - t0)
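
The assertions above index the dict returned by filter_reads by filter number and read its 'reads' counter. A minimal, hedged sketch of walking that structure (only the 'reads' key is exercised by the test; the 'name' key is an assumption about TADbit's bookkeeping and may differ between versions):

# Hedged sketch: inspect the dict returned by filter_reads.
# 'reads' is the counter asserted in the test above; 'name' is an
# assumed bookkeeping field, hence the .get() fallback.
for filt in sorted(masked):
    print(filt, masked[filt].get('name', '?'), masked[filt]['reads'])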
Example #2
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    fname1, fname2 = load_parameters_fromdb(opts)

    param_hash = digest_parameters(opts)

    reads = path.join(opts.workdir, '03_filtered_reads',
                      'all_r1-r2_intersection_%s.tsv' % param_hash)
    mreads = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % param_hash)

    if not opts.resume:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))

        # compute the intersection of the two read ends
        print('Getting intersection between read 1 and read 2')
        count, multiples = get_intersection(fname1, fname2, reads)

        # compute insert size
        print('Get insert size...')
        hist_path = path.join(opts.workdir,
                              'histogram_fragment_sizes_%s.pdf' % param_hash)
        median, max_f, mad = insert_sizes(
            reads, nreads=1000000, stats=('median', 'first_decay', 'MAD'),
            savefig=hist_path)
        
        print('  - median insert size =', median)
        print('  - median absolute deviation (MAD) of insert size =', mad)
        print('  - max insert size (when a gap in continuity of > 10 bp is '
              'found in fragment lengths) =', max_f)
    
        max_mole = max_f # pseudo DEs
        min_dist = max_f + mad # random breaks
        print('   Using the maximum continuous fragment size '
              '(%d bp) to check for pseudo-dangling ends' % max_mole)
        print('   Using maximum continuous fragment size plus the MAD '
              '(%d bp) to check for random breaks' % min_dist)
    
        print("identify pairs to filter...")
        masked = filter_reads(reads, max_molecule_length=max_mole,
                              over_represented=opts.over_represented,
                              max_frag_size=opts.max_frag_size,
                              min_frag_size=opts.min_frag_size,
                              re_proximity=opts.re_proximity,
                              min_dist_to_re=min_dist, fast=True)

    n_valid_pairs = apply_filter(reads, mreads, masked,
                                 filters=opts.apply)

    finish_time = time.localtime()
    print(median, max_f, mad)
    # save all job information to sqlite DB
    save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               hist_path, median, max_f, mad, launch_time, finish_time)
Example #3
    def test_18_filter_reads(self):
        if ONLY and ONLY != "18":
            return
        if CHKTIME:
            t0 = time()
        for ali in ["map", "sam"]:
            seed(1)
            if 13436 == int(random() * 100000):
                same_seed = True
                genome = generate_random_ali(ali)
                genome_bis = parse_fasta("test.fa~", verbose=False)
                self.assertEqual(genome, genome_bis)
            else:
                same_seed = False
                genome = parse_fasta("test.fa~")
            # PARSE SAM
            if ali == "map":
                from pytadbit.parsers.map_parser import parse_map as parser
            else:
                try:
                    from pytadbit.parsers.sam_parser import parse_sam as parser
                except ImportError:
                    print("ERROR: PYSAM not found, skipping test\n")
                    continue

            parser(
                ["test_read1.%s~" % (ali)],
                ["test_read2.%s~" % (ali)],
                "./lala1-%s~" % (ali),
                "./lala2-%s~" % (ali),
                genome,
                re_name="DPNII",
                mapper="GEM",
            )

            # GET INTERSECTION
            from pytadbit.mapping import get_intersection

            get_intersection("lala1-%s~" % (ali), "lala2-%s~" % (ali), "lala-%s~" % (ali))
            # FILTER
            masked = filter_reads("lala-%s~" % (ali), verbose=False, fast=(ali == "map"))
            self.assertEqual(masked[1]["reads"], 1000)
            self.assertEqual(masked[2]["reads"], 1000)
            self.assertEqual(masked[3]["reads"], 1000)
            self.assertEqual(masked[4]["reads"], 1000)
            if same_seed:
                self.assertEqual(masked[5]["reads"], 1110)
                self.assertEqual(masked[6]["reads"], 2332)
                self.assertEqual(masked[7]["reads"], 0)
                self.assertEqual(masked[8]["reads"], 141)
                self.assertEqual(masked[10]["reads"], 1)
            else:
                self.assertTrue(masked[5]["reads"] > 1000)
            self.assertEqual(masked[9]["reads"], 1000)
        apply_filter("lala-map~", "lala-map-filt~", masked, filters=[1], reverse=True, verbose=False)
        self.assertEqual(len([True for l in open("lala-map-filt~") if not l.startswith("#")]), 1000)
        d = plot_iterative_mapping("lala1-map~", "lala2-map~")
        self.assertEqual(d[0][1], 6000)

        if CHKTIME:
            self.assertEqual(True, True)
            print("18", time() - t0)
Example #4
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    fname1, fname2 = load_parameters_fromdb(opts)

    param_hash = digest_parameters(opts)

    reads = path.join(opts.workdir, '03_filtered_reads',
                      'all_r1-r2_intersection_%s.tsv' % param_hash)
    mreads = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % param_hash)

    if not opts.resume:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))

        # compute the intersection of the two read ends
        print('Getting intersection between read 1 and read 2')
        count, multiples = get_intersection(fname1, fname2, reads)

        # compute insert size
        print('Get insert size...')
        hist_path = path.join(opts.workdir,
                              'histogram_fragment_sizes_%s.pdf' % param_hash)
        median, max_f, mad = fragment_size(reads,
                                           nreads=1000000,
                                           stats=('median', 'first_decay',
                                                  'MAD'),
                                           savefig=hist_path)

        print('  - median insert size =', median)
        print('  - median absolute deviation (MAD) of insert size =', mad)
        print('  - max insert size (when a gap in continuity of > 10 bp is '
              'found in fragment lengths) =', max_f)

        max_mole = max_f  # pseudo DEs
        min_dist = max_f + mad  # random breaks
        print('   Using the maximum continuous fragment size '
              '(%d bp) to check for pseudo-dangling ends' % max_mole)
        print('   Using maximum continuous fragment size plus the MAD '
              '(%d bp) to check for random breaks' % min_dist)

        print("identify pairs to filter...")
        masked = filter_reads(reads,
                              max_molecule_length=max_mole,
                              over_represented=opts.over_represented,
                              max_frag_size=opts.max_frag_size,
                              min_frag_size=opts.min_frag_size,
                              re_proximity=opts.re_proximity,
                              min_dist_to_re=min_dist,
                              fast=True)

    n_valid_pairs = apply_filter(reads, mreads, masked, filters=opts.apply)

    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s' % param_hash)

    if opts.valid:
        infile = mreads
    else:
        infile = reads
    bed2D_to_BAMhic(infile,
                    opts.valid,
                    opts.cpus,
                    outbam,
                    opts.format,
                    masked,
                    samtools=opts.samtools)

    finish_time = time.localtime()
    print(median, max_f, mad)
    # save all job information to sqlite DB
    save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               outbam + '.bam', hist_path, median, max_f, mad, launch_time,
               finish_time)
Example #5
    def test_18_filter_reads(self):
        if ONLY and "18" not in ONLY:
            return
        if CHKTIME:
            t0 = time()
        for ali in ["map", "sam"]:
            seed(1)
            if 13436 == int(random() * 100000):
                same_seed = True
                genome = generate_random_ali(ali)
                genome_bis = parse_fasta("test.fa~",
                                         verbose=False,
                                         save_cache=False)
                self.assertEqual(genome, genome_bis)
            else:
                same_seed = False
                genome = parse_fasta("test.fa~", save_cache=False)
            # PARSE SAM
            if ali == "map":
                from pytadbit.parsers.map_parser import parse_map as parser
            else:
                try:
                    from pytadbit.parsers.sam_parser import parse_sam as parser
                except ImportError:
                    print("ERROR: PYSAM not found, skipping test\n")
                    continue

            parser(["test_read1.%s~" % (ali)], ["test_read2.%s~" % (ali)],
                   "./lala1-%s~" % (ali),
                   "./lala2-%s~" % (ali),
                   genome,
                   re_name="DPNII",
                   mapper="GEM")

            # GET INTERSECTION
            from pytadbit.mapping import get_intersection
            get_intersection("lala1-%s~" % (ali), "lala2-%s~" % (ali),
                             "lala-%s~" % (ali))
            # FILTER
            masked = filter_reads("lala-%s~" % (ali),
                                  verbose=False,
                                  fast=(ali == "map"))
            self.assertEqual(masked[1]["reads"], 1000)
            self.assertEqual(masked[2]["reads"], 1000)
            self.assertEqual(masked[3]["reads"], 1000)
            self.assertEqual(masked[4]["reads"], 1000)
            if same_seed:
                self.assertEqual(masked[5]["reads"], 1091)
                self.assertEqual(masked[6]["reads"], 2230)
                self.assertEqual(masked[7]["reads"], 0)
                self.assertEqual(masked[8]["reads"], 100)
                self.assertEqual(masked[10]["reads"], 5)
            else:
                self.assertTrue(masked[5]["reads"] > 1000)
            self.assertEqual(masked[9]["reads"], 1001)
        apply_filter("lala-map~",
                     "lala-map-filt~",
                     masked,
                     filters=[1],
                     reverse=True,
                     verbose=False)
        with open("lala-map-filt~") as f_lala_filt:
            self.assertEqual(
                len([True for l in f_lala_filt if not l.startswith("#")]),
                1000)
        d = plot_iterative_mapping("lala1-map~", "lala2-map~")
        self.assertEqual(d[0][1], 6000)

        if CHKTIME:
            self.assertEqual(True, True)
            print("18", time() - t0)
Example #6
import os
import glob

from pytadbit.parsers.map_parser import parse_map
from pytadbit.parsers.genome_parser import parse_fasta
from pytadbit.mapping import get_intersection

# Load the genome
genome_seq = parse_fasta(fasta)

# Output directory
RESULTS = '%s/results/%s/processed_reads' % (SAMPLE, version)
if not os.path.exists(RESULTS):
    os.makedirs(RESULTS)

infiles = []
outfiles = []
for infile in [paired1, paired2]:
    bname = infile.split("/")[-1].replace(".fastq.gz", "")
    maps = glob.glob('%s/%s/*' % (MAP_DIR, bname))
    infiles.append(maps)
    outfiles.append('%s/%s_map.tsv' % (RESULTS, bname))

parse_map(infiles[0],
          infiles[1],
          outfiles[0],
          outfiles[1],
          genome_seq,
          restriction_enzyme,
          verbose=True,
          ncpus=slots)
final_output = outfiles[0].replace('read1', 'both')
get_intersection(outfiles[0], outfiles[1], final_output, verbose=True)
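
Example #6 stops after get_intersection. For context, a minimal sketch of the filtering step that typically follows, reusing the filter_reads and apply_filter calls shown in the run() examples above; the import path, thresholds, and filter list here are illustrative assumptions, not fixed TADbit defaults:

# Sketch of the filtering step downstream of get_intersection.
# Thresholds and the filter list below are illustrative assumptions.
from pytadbit.mapping.filter import filter_reads, apply_filter

# filter_reads scans the intersected TSV and returns a dict keyed by
# filter number, each entry counting the read pairs caught by that filter
masked = filter_reads(final_output,
                      max_molecule_length=500,  # assumed threshold
                      min_dist_to_re=750,       # assumed threshold
                      fast=True)

# keep only the pairs that pass the selected filters
valid_output = final_output.replace('.tsv', '_valid.tsv')
n_valid_pairs = apply_filter(final_output, valid_output, masked,
                             filters=[1, 2, 3, 4, 9, 10])  # assumed list
print('valid pairs:', n_valid_pairs)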
Example #7
    def tb_parse_mapping_iter(self, genome_seq, enzyme_name, window1_1,
                              window1_2, window1_3, window1_4, window2_1,
                              window2_2, window2_3, window2_4, reads):
        """
        Function to map the aligned reads and return the matching pairs

        Parameters
        ----------
        genome_seq : dict
            Object containing the sequence of each of the chromosomes
        enzyme_name : str
            Name of the enzyme used to digest the genome
        window1_1 : str
            Location of the first window index file of read 1
        window1_2 : str
            Location of the second window index file of read 1
        window1_3 : str
            Location of the third window index file of read 1
        window1_4 : str
            Location of the fourth window index file of read 1
        window2_1 : str
            Location of the first window index file of read 2
        window2_2 : str
            Location of the second window index file of read 2
        window2_3 : str
            Location of the third window index file of read 2
        window2_4 : str
            Location of the fourth window index file of read 2
        reads : str
            Location of the output file for reads that have a matching
            location at both ends of the paired reads


        Returns
        -------
        bool
            True on success; the intersection of mapped reads that have
            matching locations in both paired-end files is written to
            ``reads``

        """

        reads1 = reads + '_reads_1.tsv'
        reads2 = reads + '_reads_2.tsv'
        reads_both = reads + '_reads_both.tsv'

        parse_map(
            [window1_1, window1_2, window1_3, window1_4],
            [window2_1, window2_2, window2_3, window2_4],
            out_file1=reads1,
            out_file2=reads2,
            genome_seq=genome_seq,
            re_name=enzyme_name,
            verbose=True,
            # ncpus=32
        )

        get_intersection(reads1, reads2, reads_both, verbose=True)

        with open(reads, "wb") as f_out:
            with open(reads_both, "rb") as f_in:
                f_out.write(f_in.read())

        return True
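
A hedged usage sketch of the method above; the enclosing class is not part of the excerpt, so `tool` and every file path below are hypothetical placeholders:

# Hypothetical invocation of tb_parse_mapping_iter; `tool` stands in for
# an instance of the class (not shown) that defines the method.
from pytadbit.parsers.genome_parser import parse_fasta

genome_seq = parse_fasta('genome.fa')  # chromosome name -> sequence

ok = tool.tb_parse_mapping_iter(
    genome_seq, 'DPNII',
    'r1_w1.map', 'r1_w2.map', 'r1_w3.map', 'r1_w4.map',  # read-1 windows
    'r2_w1.map', 'r2_w2.map', 'r2_w3.map', 'r2_w4.map',  # read-2 windows
    'parsed_reads')  # the intersected pairs end up in this file
assert ok is True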
Example #8
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    fname1, fname2 = load_parameters_fromdb(opts)

    param_hash = digest_parameters(opts)

    reads = path.join(opts.workdir, '03_filtered_reads',
                      'all_r1-r2_intersection_%s.tsv' % param_hash)
    mreads = path.join(opts.workdir, '03_filtered_reads',
                       'valid_r1-r2_intersection_%s.tsv' % param_hash)

    if not opts.resume:
        mkdir(path.join(opts.workdir, '03_filtered_reads'))

        if opts.fast_fragment:
            reads = fname1
            counts_multis = [
                '#' in line.split('\t')[0] for line in open(reads)
            ]
            count = len(counts_multis)
            multiples = {}
            multiples[1] = sum(
                [count_mult for count_mult in counts_multis if count_mult])
            del counts_multis
        else:
            # compute the intersection of the two read ends
            print('Getting intersection between read 1 and read 2')
            count, multiples = get_intersection(fname1,
                                                fname2,
                                                reads,
                                                compress=opts.compress_input)

        # compute insert size
        print('Get insert size...')
        hist_path = path.join(opts.workdir,
                              'histogram_fragment_sizes_%s.pdf' % param_hash)
        try:
            median, max_f, mad = fragment_size(reads,
                                               nreads=1000000,
                                               stats=('median', 'first_decay',
                                                      'MAD'),
                                               savefig=hist_path)
        except ZeroDivisionError:
            warn('WARNING: cannot compute fragment length, too few '
                 'dangling-ends. Setting median length to 400 nt.')
            median, max_f, mad = 400, 100, 1000

        print('  - median insert size =', median)
        print('  - median absolute deviation (MAD) of insert size =', mad)
        print(
            '  - max insert size (when a gap in continuity of > 10 bp is found in fragment lengths) =',
            max_f)

        max_mole = max_f  # pseudo DEs
        min_dist = max_f + mad  # random breaks
        print('   Using the maximum continuous fragment size '
              '(%d bp) to check for pseudo-dangling ends' % max_mole)
        print('   Using maximum continuous fragment size plus the MAD '
              '(%d bp) to check for random breaks' % min_dist)

        print("identify pairs to filter...")
        masked = filter_reads(reads,
                              max_molecule_length=max_mole,
                              over_represented=opts.over_represented,
                              max_frag_size=opts.max_frag_size,
                              min_frag_size=opts.min_frag_size,
                              re_proximity=opts.re_proximity,
                              strict_duplicates=opts.strict_duplicates,
                              min_dist_to_re=min_dist,
                              fast=True)

    n_valid_pairs = apply_filter(reads, mreads, masked, filters=opts.apply)

    outbam = path.join(opts.workdir, '03_filtered_reads',
                       'intersection_%s' % param_hash)

    if opts.valid:
        infile = mreads
    else:
        infile = reads
    bed2D_to_BAMhic(infile,
                    opts.valid,
                    opts.cpus,
                    outbam,
                    opts.format,
                    masked,
                    samtools=opts.samtools)

    finish_time = time.localtime()
    print(median, max_f, mad)
    # save all job information to sqlite DB
    save_to_db(opts, count, multiples, reads, mreads, n_valid_pairs, masked,
               outbam + '.bam', hist_path, median, max_f, mad, launch_time,
               finish_time)
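
run() pulls everything from the opts object; a hedged sketch of the namespace it expects, with attribute names inferred from the accesses in the code above (values are illustrative placeholders, and a real run would normally go through TADbit's command-line parser rather than a hand-built namespace):

# Sketch of the options namespace run() dereferences above; every value
# here is an illustrative placeholder, not a TADbit default.
from argparse import Namespace

opts = Namespace(
    workdir='results/',         # working directory holding mapped reads
    resume=False,               # skip intersection + filtering when True
    fast_fragment=False,        # True if read ends are already intersected
    compress_input=False,
    over_represented=0.001,     # filter thresholds (illustrative)
    max_frag_size=100000,
    min_frag_size=50,
    re_proximity=5,
    strict_duplicates=False,
    apply=[1, 2, 3, 4, 9, 10],  # filter numbers handed to apply_filter
    valid=True,                 # write only valid pairs into the BAM
    cpus=8,
    format='mid',
    samtools='samtools',
)
# run(opts)  # also needs the state set up by check_options and the DB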