Пример #1
0
    def test_18_filter_reads(self):
        """
        Check read filtering on the 'map' and 'sam' test alignments.

        For each alignment format the two read files are parsed, intersected
        and passed to ``filter_reads``; the 'reads' value stored under each key
        of the returned dictionary is compared with fixed numbers. Afterwards
        filter 1 is applied (``reverse=True``) to the 'map' intersection, the
        lines of the resulting file are counted and the value returned by
        ``plot_iterative_mapping`` is checked.
        """
        if ONLY and ONLY != '18':
            return
        if CHKTIME:
            t0 = time()
        for ali in ['map', 'sam']:
            seed(1)
            # the exact counts further down are only asserted when the random
            # generator gives this known first value after seed(1)
            if 13436 == int(random()*100000):
                same_seed = True
                genome = generate_random_ali(ali)
                genome_bis = parse_fasta('test.fa~', verbose=False)
                self.assertEqual(genome, genome_bis)
            else:
                same_seed = False
                genome = parse_fasta('test.fa~')
            # PARSE SAM
            if ali == 'map':
                from pytadbit.parsers.map_parser import parse_map as parser
            else:
                try:
                    from pytadbit.parsers.sam_parser import parse_sam as parser
                except ImportError:
                    print 'ERROR: PYSAM not found, skipping test\n'
                    continue

            parser(['test_read1.%s~' % (ali)], ['test_read2.%s~' % (ali)],
                   './lala1-%s~' % (ali), './lala2-%s~' % (ali), genome,
                   re_name='DPNII', mapper='GEM')

            # GET INTERSECTION
            from pytadbit.mapping import get_intersection
            get_intersection('lala1-%s~' % (ali), 'lala2-%s~' % (ali),
                             'lala-%s~' % (ali))
            # FILTER
            masked = filter_reads('lala-%s~' % (ali), verbose=False,
                                  fast=(ali=='map'))
            self.assertEqual(masked[1]['reads'], 1000)
            self.assertEqual(masked[2]['reads'], 1000)
            self.assertEqual(masked[3]['reads'], 1000)
            self.assertEqual(masked[4]['reads'], 1000)
            if same_seed:
                self.assertEqual(masked[5]['reads'], 1110)
                self.assertEqual(masked[6]['reads'], 2332)
                self.assertEqual(masked[7]['reads'], 0)
                self.assertEqual(masked[8]['reads'], 141)
                self.assertEqual(masked[10]['reads'], 1)
            else:
                self.assertTrue (masked[5]['reads'] > 1000)
            self.assertEqual(masked[9]['reads'], 1000)
        # NOTE(review): 'masked' is the result of the last alignment processed
        # above ('sam' unless its import failed) but is applied to the 'map'
        # intersection file -- confirm this is intended
        apply_filter('lala-map~', 'lala-map-filt~', masked, filters=[1],
                     reverse=True, verbose=False)
        # count the lines not starting with '#'; the context manager makes
        # sure the file handle is closed
        with open('lala-map-filt~') as filtered:
            kept = sum(1 for line in filtered if not line.startswith('#'))
        self.assertEqual(kept, 1000)
        d = plot_iterative_mapping('lala1-map~', 'lala2-map~')
        self.assertEqual(d[0][1], 6000)

        if CHKTIME:
            self.assertEqual(True, True)
            print '18', time() - t0
Пример #2
0
    def add_sections_from_fasta(self, fasta):
        """
        Add genomic coordinates to the HiC_data object by getting them from a
        fasta file containing chromosome sequences

        Updates ``self.chromosomes`` (number of bins per chromosome, in the
        iteration order of the parsed genome), ``self.sections`` (mapping
        ``(chromosome, bin) -> global bin index``) and ``self.section_pos``
        (``chromosome -> (first global bin, last global bin + 1)``). A warning
        is issued when the new total number of bins differs from the size
        currently stored in the object; the stored size is then replaced.

        :param fasta: path to a fasta file
        """
        genome = parse_fasta(fasta, verbose=False)
        # number of bins needed per chromosome; '//' keeps the division an
        # integer floor division whatever the division mode of the module
        genome_seq = OrderedDict()
        for crm in genome:
            genome_seq[crm] = len(genome[crm]) // self.resolution + 1
        size = sum(genome_seq.values())
        # every (chromosome, bin) pair, numbered consecutively genome-wide
        sections = [(crm, i) for crm in genome_seq
                    for i in xrange(genome_seq[crm])]
        self.chromosomes = genome_seq
        self.sections = dict((sec, pos) for pos, sec in enumerate(sections))
        # half-open range of global bins covered by each chromosome
        total = 0
        for crm in self.chromosomes:
            self.section_pos[crm] = (total, total + self.chromosomes[crm])
            total += self.chromosomes[crm]
        if size != self.__size:
            warn('WARNING: different sizes (%d, now:%d), ' % (self.__size, size)
                 + 'should adjust the resolution')
        self.__size = size
        self._size2 = size**2
Пример #3
0
def main():
    """
    Get the options, call ``iterative_mapping`` and parse the FASTA genome.
    """
    opts = get_options()
    iterative_mapping()

    ## PARSE FASTA
    # when more than one FASTA entry is given only the first one is parsed
    fasta = opts.fasta
    if len(fasta) > 1:
        fasta = fasta[0]
    genome = parse_fasta(fasta, verbose=True, chr_names=opts.chr_name)
Пример #4
0
 def test_17_map_re_sites(self):
     """
     Parse the reference FASTA and check the output of ``map_re_sites`` for
     two enzyme names.
     """
     genome_file = PATH + '/ref_genome/chr2L_chr4_dm3.bz2'
     ref_genome = parse_fasta(genome_file, verbose=False)
     self.assertEqual(len(ref_genome['chr4']), 1351857)
     # per enzyme: expected length of frags['chr2L'][230], then the index
     # looked up in frags['chr4'][10] and the value expected at that index
     expected = (('dpnIi', 16, 50, 1018069),
                 ('hindiii', 3, 5, 1017223))
     for enzyme, last_len, idx, site in expected:
         frags = map_re_sites(enzyme, ref_genome)
         self.assertEqual(len(frags['chr2L']), 231)
         self.assertEqual(len(frags['chr2L'][230]), last_len)
         self.assertEqual(frags['chr4'][10][idx], site)
Пример #5
0
def main():

    fastq          = '/scratch/db/FASTQs/hsap/dixon_2012/dixon-2012_200bp.fastq'
    fastq          = 'short_dixon-2012_200bp.fastq'
    # fastq        = '/scratch/test/sample_dataset/FASTQs/sample_hsap_HindIII.fastq'
    gem_index_path = '/scratch/db/index_files/Homo_sapiens-79/Homo_sapiens.gem'
    out_map_dir1   = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/read1/'
    out_map_dir2   = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/read2/'
    temp_dir1      = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/tmp1/'
    temp_dir2      = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/tmp2/'

    print 'read 1'
    outfiles1 = full_mapping(gem_index_path, fastq, out_map_dir1, 'HindIII',
                             temp_dir=temp_dir1, windows=((1,100),), add_site=True)
    print 'read 2'
    outfiles2 = full_mapping(gem_index_path, fastq, out_map_dir2, 'HindIII',
                             temp_dir=temp_dir2, windows=((101, 200),), add_site=True)
    # print 'read 1'
    # outfiles1 = mapping(gem_index_path, fastq, out_map_dir1, 'HindIII',
    #                     temp_dir=temp_dir1,
    #                     windows=(zip(*([0] * len(range(25, 105, 5)),
    #                                    range(25,105,5)))))
    # print 'read 2'
    # outfiles2 = mapping(gem_index_path, fastq, out_map_dir2, 'HindIII',
    #                     temp_dir=temp_dir2,
    #                     windows=(zip(*([100] * len(range(125, 205, 5)),
    #                                            range(125,205,5)))))
    
    print outfiles1
    print 'xcmvnkljnv'
    print outfiles2
    
    from pytadbit.parsers.map_parser import parse_map
    from pytadbit.parsers.genome_parser import parse_fasta
    from pytadbit.mapping.mapper import get_intersection
    from pytadbit.mapping.filter import filter_reads, apply_filter
    
    read1, read2 = 'read1.tsv', 'read2.tsv',
    parse_map(outfiles1, outfiles2, out_file1=read1, out_file2=read2,
              genome_seq=parse_fasta('/scratch/db/index_files/Homo_sapiens-79/Homo_sapiens.fa'),
              re_name='HindIII', verbose=True)

    reads = 'both_reads.tsv'
    get_intersection(read1, read2, reads)

    masked = filter_reads(reads)
    freads = 'filtered_reads.tsv'
    apply_filter(reads, freads, masked)
Пример #6
0
 def test_17_map_re_sites(self):
     """
     test fasta parsing and mapping re sites
     """
     if ONLY and ONLY != "17":
         return
     if CHKTIME:
         t0 = time()
     ref_genome = parse_fasta(PATH + "/ref_genome/chr2L_chr4_dm3.bz2", verbose=False)
     self.assertEqual(len(ref_genome["chr4"]), 1351857)
     frags = map_re_sites("dpnIi", ref_genome)
     self.assertEqual(len(frags["chr2L"]), 231)
     self.assertEqual(len(frags["chr2L"][230]), 16)
     self.assertEqual(frags["chr4"][10][50], 1018069)
     frags = map_re_sites("hindiii", ref_genome)
     self.assertEqual(len(frags["chr2L"]), 231)
     self.assertEqual(len(frags["chr2L"][230]), 3)
     self.assertEqual(frags["chr4"][10][5], 1017223)
     if CHKTIME:
         self.assertEqual(True, True)
         print "17", time() - t0
Пример #7
0
 def test_17_map_re_sites(self):
     """
     test fasta parsing and mapping re sites
     """
     if ONLY and ONLY != '17':
         return
     if CHKTIME:
         t0 = time()
     ref_genome = parse_fasta(PATH + '/ref_genome/chr2L_chr4_dm3.bz2',
                              verbose=False)
     self.assertEqual(len(ref_genome['chr4']), 1351857)
     frags = map_re_sites('dpnIi', ref_genome)
     self.assertEqual(len(frags['chr2L']), 231)
     self.assertEqual(len(frags['chr2L'][230]), 16)
     self.assertEqual(frags['chr4'][10][50], 1018069)
     frags = map_re_sites('hindiii', ref_genome)
     self.assertEqual(len(frags['chr2L']), 231)
     self.assertEqual(len(frags['chr2L'][230]), 3)
     self.assertEqual(frags['chr4'][10][5], 1017223)
     if CHKTIME:
         self.assertEqual(True, True)
         print '17', time() - t0
Пример #8
0
def _read_parse_stats(tsv_path):
    """
    Recover parsing statistics from an already written parsed-reads TSV file.

    :param tsv_path: path to the TSV file

    :returns: a tuple with a dictionary of the values found in the leading
       '# MAPPED ' header lines, and the total number of '|||' separators
       found in all the lines from the first non-header line onwards
    """
    counts = {}
    multi = 0
    with open(tsv_path) as fhandler:
        in_header = True
        for line in fhandler:
            if in_header:
                if line.startswith('# MAPPED '):
                    _, _, item, value = line.split()
                    counts[item] = int(value)
                    continue
                if line.startswith('#'):
                    continue
                # first non-header line: it has to be counted as well
                in_header = False
            multi += line.count('|||')
    return counts, multi


def run(opts):
    """
    Parse the mapped reads of a project and record the job.

    Output TSV files are placed in ``<workdir>/02_parsed_reads``. With
    ``opts.skip`` the reads are not parsed again: the statistics are read back
    from the existing output files. One line per statistic is appended to
    ``<workdir>/trace.log`` and the job is stored through ``save_to_db``.

    :param opts: options object; the attributes read here are workdir, read,
       jobids, genome, skip and compress_input
    """
    check_options(opts)

    launch_time = time.localtime()

    reads = [1] if opts.read == 1 else [2] if opts.read == 2 else [1, 2]
    f_names1, f_names2, renz = load_parameters_fromdb(opts.workdir, reads, opts.jobids)

    name = path.split(opts.workdir)[-1]

    param_hash = digest_parameters(opts)

    outdir = '02_parsed_reads'

    mkdir(path.join(opts.workdir, outdir))

    # when a single read is requested it always goes through the first slot
    if not opts.read:
        out_file1 = path.join(opts.workdir, outdir, '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = path.join(opts.workdir, outdir, '%s_r2_%s.tsv' % (name, param_hash))
    elif opts.read == 1:
        out_file1 = path.join(opts.workdir, outdir, '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = None
        f_names2  = None
    elif opts.read == 2:
        out_file2 = None
        f_names1  = f_names2
        f_names2  = None
        out_file1 = path.join(opts.workdir, outdir, '%s_r2_%s.tsv' % (name, param_hash))

    logging.info('parsing genomic sequence')
    try:
        # allows the use of cPickle genome to make it faster
        # NOTE(review): unpickling can execute code stored in the file, only
        # trusted pickles should be given as genome
        with open(opts.genome[0]) as genome_handler:
            genome = load(genome_handler)
    except UnpicklingError:
        genome = parse_fasta(opts.genome)

    if not opts.skip:
        logging.info('parsing reads in %s project', name)
        counts, multis = parse_map(f_names1, f_names2, out_file1=out_file1,
                                   out_file2=out_file2, re_name=renz, verbose=True,
                                   genome_seq=genome, compress=opts.compress_input)
    else:
        # statistics are keyed by read index: 0 for out_file1, 1 for out_file2
        counts = {}
        multis = {}
        counts[0], multis[0] = _read_parse_stats(out_file1)
        if out_file2:
            counts[1], multis[1] = _read_parse_stats(out_file2)

    # write machine log
    with open(path.join(opts.workdir, 'trace.log'), "a") as mlog:
        fcntl.flock(mlog, fcntl.LOCK_EX)
        try:
            for read in counts:
                for item in counts[read]:
                    # read index 0 corresponds to out_file1
                    mlog.write('# PARSED READ%s PATH\t%d\t%s\n' % (
                        read, counts[read][item],
                        out_file1 if read == 0 else out_file2))
            # buffered lines must reach the file while the lock is still held
            mlog.flush()
        finally:
            fcntl.flock(mlog, fcntl.LOCK_UN)

    finish_time = time.localtime()

    # save all job information to sqlite DB
    save_to_db(opts, counts, multis, f_names1, f_names2, out_file1, out_file2,
               launch_time, finish_time)
Пример #9
0
    def test_18_filter_reads(self):
        """
        Check read filtering on the "map" and "sam" test alignments.

        For each alignment format the two read files are parsed, intersected
        and passed to ``filter_reads``; the "reads" value stored under each key
        of the returned dictionary is compared with fixed numbers. Afterwards
        filter 1 is applied (``reverse=True``) to the "map" intersection, the
        lines of the resulting file are counted and the value returned by
        ``plot_iterative_mapping`` is checked.
        """
        if ONLY and ONLY != "18":
            return
        if CHKTIME:
            t0 = time()
        for ali in ["map", "sam"]:
            seed(1)
            # the exact counts further down are only asserted when the random
            # generator gives this known first value after seed(1)
            if 13436 == int(random() * 100000):
                same_seed = True
                genome = generate_random_ali(ali)
                genome_bis = parse_fasta("test.fa~", verbose=False)
                self.assertEqual(genome, genome_bis)
            else:
                same_seed = False
                genome = parse_fasta("test.fa~")
            # PARSE SAM
            if ali == "map":
                from pytadbit.parsers.map_parser import parse_map as parser
            else:
                try:
                    from pytadbit.parsers.sam_parser import parse_sam as parser
                except ImportError:
                    print "ERROR: PYSAM not found, skipping test\n"
                    continue

            parser(
                ["test_read1.%s~" % (ali)],
                ["test_read2.%s~" % (ali)],
                "./lala1-%s~" % (ali),
                "./lala2-%s~" % (ali),
                genome,
                re_name="DPNII",
                mapper="GEM",
            )

            # GET INTERSECTION
            from pytadbit.mapping import get_intersection

            get_intersection("lala1-%s~" % (ali), "lala2-%s~" % (ali), "lala-%s~" % (ali))
            # FILTER
            masked = filter_reads("lala-%s~" % (ali), verbose=False, fast=(ali == "map"))
            self.assertEqual(masked[1]["reads"], 1000)
            self.assertEqual(masked[2]["reads"], 1000)
            self.assertEqual(masked[3]["reads"], 1000)
            self.assertEqual(masked[4]["reads"], 1000)
            if same_seed:
                self.assertEqual(masked[5]["reads"], 1110)
                self.assertEqual(masked[6]["reads"], 2332)
                self.assertEqual(masked[7]["reads"], 0)
                self.assertEqual(masked[8]["reads"], 141)
                self.assertEqual(masked[10]["reads"], 1)
            else:
                self.assertTrue(masked[5]["reads"] > 1000)
            self.assertEqual(masked[9]["reads"], 1000)
        # NOTE(review): "masked" is the result of the last alignment processed
        # above ("sam" unless its import failed) but is applied to the "map"
        # intersection file -- confirm this is intended
        apply_filter("lala-map~", "lala-map-filt~", masked, filters=[1], reverse=True, verbose=False)
        # count the lines not starting with "#"; the context manager makes
        # sure the file handle is closed
        with open("lala-map-filt~") as filtered:
            kept = sum(1 for line in filtered if not line.startswith("#"))
        self.assertEqual(kept, 1000)
        d = plot_iterative_mapping("lala1-map~", "lala2-map~")
        self.assertEqual(d[0][1], 6000)

        if CHKTIME:
            self.assertEqual(True, True)
            print "18", time() - t0
Пример #10
0
def run(opts):
    check_options(opts)
    launch_time = time.localtime()
    param_hash = digest_parameters(opts, get_md5=True)

    if opts.nosql:
        biases = opts.biases
        mreads = opts.mreads
        inputs = []
    elif opts.biases or opts.mreads:
        if not opts.mreads:
            raise Exception('ERROR: also need to provide BAM file')
        if not opts.biases:
            raise Exception('ERROR: also need to provide biases file')
        biases = opts.biases
        mreads = opts.mreads
        inputs = ['NA', 'NA']
        mkdir(path.join(opts.workdir))
    else:
        biases, mreads, biases_id, mreads_id = load_parameters_fromdb(opts)
        inputs = [biases_id, mreads_id]
        # store path ids to be saved in database
        mreads = path.join(opts.workdir, mreads)
        biases = path.join(opts.workdir, biases)

    reso   = opts.reso

    mkdir(path.join(opts.workdir, '06_segmentation'))

    print 'loading %s \n    at resolution %s' % (mreads, nice(reso))
    region = None
    if opts.crms and len(opts.crms) == 1:
        region = opts.crms[0]
    hic_data = load_hic_data_from_bam(mreads, reso, ncpus=opts.cpus,
                                      region=region,
                                      biases=None if opts.all_bins else biases,
                                      filter_exclude=opts.filter)

    # compartments
    cmp_result = {}
    richA_stats = {}
    firsts = {}
    if not opts.only_tads:
        print 'Searching compartments'
        cmprt_dir = path.join(opts.workdir, '06_segmentation',
                              'compartments_%s' % (nice(reso)))
        mkdir(cmprt_dir)
        if opts.fasta:
            print '  - Computing GC content to label compartments'
            rich_in_A = get_gc_content(parse_fasta(opts.fasta, chr_filter=opts.crms), reso,
                                       chromosomes=opts.crms,
                                       by_chrom=True, n_cpus=opts.cpus)
        elif opts.rich_in_A:
            rich_in_A = opts.rich_in_A
        else:
            rich_in_A = None
        n_evs = opts.n_evs if opts.n_evs > 0 else 3
        firsts, richA_stats = hic_data.find_compartments(
            crms=opts.crms, savefig=cmprt_dir, verbose=True, suffix=param_hash,
            rich_in_A=rich_in_A, show_compartment_labels=rich_in_A is not None,
            savecorr=cmprt_dir if opts.savecorr else None,
            max_ev=n_evs,
            ev_index=opts.ev_index,
            vmin=None if opts.fix_corr_scale else 'auto',
            vmax=None if opts.fix_corr_scale else 'auto')

        for ncrm, crm in enumerate(opts.crms or hic_data.chromosomes):
            if not crm in firsts:
                continue
            ev_file = open(path.join(
                cmprt_dir, '%s_EigVect%d_%s.tsv' % (
                    crm, opts.ev_index[ncrm] if opts.ev_index else 1,
                    param_hash)), 'w')
            ev_file.write('# %s\n' % ('\t'.join(
                'EV_%d (%.4f)' % (i, v)
                for i, v in enumerate(firsts[crm][0], 1))))
            ev_file.write('\n'.join(['\t'.join([str(v) for v in vs])
                                     for vs in zip(*firsts[crm][1])]))
            ev_file.close()

        for ncrm, crm in enumerate(opts.crms or hic_data.chromosomes):
            cmprt_file1 = path.join(cmprt_dir, '%s_%s.tsv' % (crm, param_hash))
            cmprt_file2 = path.join(cmprt_dir, '%s_EigVect%d_%s.tsv' % (
                crm, opts.ev_index[ncrm] if opts.ev_index else 1, param_hash))
            cmprt_image = path.join(cmprt_dir, '%s_EV%d_%s.%s' % (
                crm, opts.ev_index[ncrm] if opts.ev_index else 1,
                param_hash, opts.format))
            if opts.savecorr:
                cormat_file = path.join(cmprt_dir, '%s_corr-matrix%s.tsv' %
                                       (crm, param_hash))
            else:
                cormat_file = None
            hic_data.write_compartments(cmprt_file1, chroms=[crm])
            cmp_result[crm] = {'path_cmprt1': cmprt_file1,
                               'path_cmprt2': cmprt_file2,
                               'path_cormat': cormat_file,
                               'image_cmprt': cmprt_image,
                               'num' : len(hic_data.compartments[crm])}

    # TADs
    tad_result = {}
    if not opts.only_compartments:
        print 'Searching TADs'
        tad_dir = path.join(opts.workdir, '06_segmentation',
                             'tads_%s' % (nice(reso)))
        mkdir(tad_dir)
        for crm in hic_data.chromosomes:
            if opts.crms and not crm in opts.crms:
                continue
            print '  - %s' % crm
            matrix = hic_data.get_matrix(focus=crm)
            beg, end = hic_data.section_pos[crm]
            size = len(matrix)
            if size < 10:
                print "     Chromosome too short (%d bins), skipping..." % size
                continue
            # transform bad column in chromosome referential
            if hic_data.bads:
                to_rm = tuple([1 if i in hic_data.bads else 0 for i in xrange(beg, end)])
            else:
                to_rm = None
            # maximum size of a TAD
            max_tad_size = (size - 1) if opts.max_tad_size is None else opts.max_tad_size
            result = tadbit([matrix], remove=to_rm,
                            n_cpus=opts.cpus, verbose=opts.verbose,
                            max_tad_size=max_tad_size,
                            no_heuristic=False)

            # use normalization to compute height on TADs called
            if opts.all_bins:
                if opts.nosql:
                    biases = load(open(biases))
                else:
                    biases = load(open(path.join(opts.workdir, biases)))
                hic_data.bads = biases['badcol']
                hic_data.bias = biases['biases']
            tads = load_tad_height(result, size, beg, end, hic_data)
            table = ''
            table += '%s\t%s\t%s\t%s\t%s\n' % ('#', 'start', 'end', 'score', 'density')
            for tad in tads:
                table += '%s\t%s\t%s\t%s%s\n' % (
                    tad, int(tads[tad]['start'] + 1), int(tads[tad]['end'] + 1),
                    abs(tads[tad]['score']), '\t%s' % (round(
                        float(tads[tad]['height']), 3)))
            out_tad = path.join(tad_dir, '%s_%s.tsv' % (crm, param_hash))
            out = open(out_tad, 'w')
            out.write(table)
            out.close()
            tad_result[crm] = {'path' : out_tad,
                               'num': len(tads)}

    finish_time = time.localtime()

    if not opts.nosql:
        try:
            save_to_db(opts, cmp_result, tad_result, reso, inputs,
                       richA_stats, firsts, param_hash,
                       launch_time, finish_time)
        except:
            # release lock anyway
            print_exc()
            try:
                remove(path.join(opts.workdir, '__lock_db'))
            except OSError:
                pass
            exit(1)
Пример #11
0
def make_matrices(left_reads_fastq, right_reads_fastq, reads_fastq, genome_fasta, genome_index, \
                  output_directory, output_prefix, enzyme, res, chromosomes, threads_number, \
                  clean_tmp, tmp_dir):

    print 'Begin to process reads.'

    left_reads = ''
    right_reads = ''
    if reads_fastq != '': # left and right reads are stored in one file
        range_start_left, range_stop_left, \
        range_start_right, range_stop_right = calc_left_right_ranges(reads_fastq)
        print 'Reads:                     ', reads_fastq
        left_reads = reads_fastq
        right_reads = reads_fastq
    else: # left and right reads are stored separately
        range_start_left, range_stop_left, \
        range_start_right, range_stop_right = calc_range(left_reads_fastq)
        print 'Left reads:                ', left_reads_fastq
        print 'Right reads:               ', right_reads_fastq
        print 'Output prefix:             ', output_prefix
        left_reads = left_reads_fastq
        right_reads = right_reads_fastq

    print 'Reference genome FASTA:    ', genome_fasta
    print 'Reference genome GEM index:', genome_index
    print 'Output directory:          ', output_directory
    print 'Temp directory:            ', tmp_dir
    print 'Enzyme:                    ', enzyme
    print 'Resolution:                ', res, 'bp'
    print 'Number of threads:         ', threads_number
    print 'Start pos for left reads:  ', range_start_left
    print 'Stop pos for left reads:   ', range_stop_left
    print 'Start pos for right reads: ', range_start_right
    print 'Stop pos for right reads:  ', range_stop_right
    stdout.flush()

    # map left reads to reference genome
    out_sam_left_name = splitext(basename(left_reads))[0] + '_left.sam'
    out_sam_left_path = join(output_directory, out_sam_left_name)
    print 'Iterative mapping of left reads (using ' + str(threads_number) + ' threads)...'
    stdout.flush()
    sams_left = iterative_mapping(genome_index, left_reads, out_sam_left_path, \
                                  range_start_left, range_stop_left, nthreads=threads_number,
                                  temp_dir=tmp_dir)
    print 'Done.'
    stdout.flush()

    # map right reads to reference genome
    out_sam_right_name = splitext(basename(right_reads))[0] + '_right.sam'
    out_sam_right_path = join(output_directory, out_sam_right_name)
    print 'Iterative mapping of right reads (using ' + str(threads_number) + ' threads)...'
    stdout.flush()
    sams_right = iterative_mapping(genome_index, right_reads, out_sam_right_path, \
                                   range_start_right, range_stop_right, nthreads=threads_number,
                                   temp_dir=tmp_dir)
    print 'Done.'
    stdout.flush()

    # load reference genome sequence
    print 'Load reference genome sequence...'
    stdout.flush()
    chroms = chromosomes[:]
    genome_seq = parse_fasta(genome_fasta, chr_names=chroms)
    print 'Done.'
    stdout.flush()

    # create files with information about every left and right read 
    # and about their placement with respect to restriction sites
    tsv_left_name = splitext(basename(left_reads))[0] + '_left.tsv'
    tsv_left = join(output_directory, tsv_left_name)
    tsv_right_name = splitext(basename(right_reads))[0] + '_right.tsv'
    tsv_right = join(output_directory, tsv_right_name)
    print 'Get information about restriction sites and reads placement...'
    stdout.flush()
    parse_sam(sams_left, sams_right, tsv_left, tsv_right, genome_seq, enzyme, \
              verbose=True, ncpus=8)
    print 'Done.'
    stdout.flush()

    # create file with both left and right reads that uniquelly mapped to reference genome
    if reads_fastq != '': # left and right reads are stored in one file
        common_reads_prefix = splitext(basename(reads_fastq))[0]
    else: # left and right reads are stored separately
        common_reads_prefix = output_prefix
    uniq_reads_name = common_reads_prefix + '_both_map_uniq.tsv'
    uniq_reads = join(output_directory, uniq_reads_name)
    print 'Merge info about left and right reads in one file...'
    stdout.flush()
    get_intersection(tsv_left, tsv_right, uniq_reads, verbose=True)
    print 'Done.'
    stdout.flush()

    # find read IDs that are filtered by default TADbit filters
    print 'Mask reads...'
    stdout.flush()
    # debug
    print "uniq_reads =", uniq_reads
    masked = filter_reads(uniq_reads)
    print 'Done.'
    stdout.flush()

    # apply all filters (exclude reads that were filtered)
    print 'Filter masked reads...'
    stdout.flush()
    filtered_reads_name = common_reads_prefix + '_filtered.tsv'
    filtered_reads = join(output_directory, filtered_reads_name)
    apply_filter(uniq_reads, filtered_reads, masked)
    print 'Done.'
    stdout.flush()

    # create matrices (one matrix per chromosome)
    print 'Create Hi-C maps (one per chromosome)...'
    stdout.flush()
    hic_map(filtered_reads, resolution=res, by_chrom='intra', savedata=output_directory)
    print 'Done.'
    stdout.flush()
    print 'Add resolution (' + str(resolution) + ') to matrix filenames...'
    stdout.flush()
    add_resolution(chromosomes, resolution, output_directory)
    print 'Done.'
    stdout.flush()
    print 'Add headers to matrix files...'
    stdout.flush()
    add_headers(chromosomes, resolution, output_directory)
    print 'Done.'
    stdout.flush()
    if clean_tmp: # Remove all SAM and TSV files from the output directory
        print 'Remove SAM and TSV files from the output directory.'
        stdout.flush()
        map(os.remove, glob.glob(out_sam_left_path + '*'))
        map(os.remove, glob.glob(out_sam_right_path + '*'))
        map(os.remove, glob.glob(join(output_directory, '*.tsv')))
        print 'Done.'
        stdout.flush()
Пример #12
0
def _read_parse_histogram(tsv_path):
    """
    Recover parsing statistics from an already written parsed-reads TSV file.

    :param tsv_path: path to the TSV file

    :returns: a tuple with a dictionary of the values found in the leading
       '# MAPPED ' header lines, and a dictionary counting, for all the lines
       from the first non-header line onwards, how many lines contain a given
       number of '|||' separators (lines without separator are not counted)
    """
    counts = {}
    multi = {}
    with open(tsv_path) as fhandler:
        in_header = True
        for line in fhandler:
            if in_header:
                if line.startswith('# MAPPED '):
                    _, _, item, value = line.split()
                    counts[item] = int(value)
                    continue
                if line.startswith('#'):
                    continue
                # first non-header line: it has to be counted as well
                in_header = False
            nseps = line.count('|||')
            if nseps:
                multi[nseps] = multi.get(nseps, 0) + 1
    return counts, multi


def run(opts):
    """
    Parse the mapped reads of a project and record the job.

    Output TSV files are placed in ``<workdir>/02_parsed_reads``. With
    ``opts.skip`` the reads are not parsed again: the statistics are read back
    from the existing output files. One line per statistic is appended to
    ``<workdir>/trace.log`` (access is serialized through the ``__lock_log``
    file) and the job is stored through ``save_to_db``.

    :param opts: options object; the attributes read here are workdir, read,
       jobids, genome, filter_chrom, skip and compress_input (workdir is
       replaced by its absolute path)
    """
    check_options(opts)

    launch_time = time.localtime()

    reads = [1] if opts.read == 1 else [2] if opts.read == 2 else [1, 2]
    f_names1, f_names2, renz = load_parameters_fromdb(opts, reads, opts.jobids)

    renz = renz.split('-')

    opts.workdir = path.abspath(opts.workdir)

    name = path.split(opts.workdir)[-1]

    param_hash = digest_parameters(opts)

    outdir = '02_parsed_reads'

    mkdir(path.join(opts.workdir, outdir))

    # when a single read is requested it always goes through the first slot
    if not opts.read:
        out_file1 = path.join(opts.workdir, outdir, '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = path.join(opts.workdir, outdir, '%s_r2_%s.tsv' % (name, param_hash))
    elif opts.read == 1:
        out_file1 = path.join(opts.workdir, outdir, '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = None
        f_names2  = None
    elif opts.read == 2:
        out_file2 = None
        f_names1  = f_names2
        f_names2  = None
        out_file1 = path.join(opts.workdir, outdir, '%s_r2_%s.tsv' % (name, param_hash))

    logging.info('parsing genomic sequence')
    try:
        # allows the use of cPickle genome to make it faster
        # NOTE(review): unpickling can execute code stored in the file, only
        # trusted pickles should be given as genome
        with open(opts.genome[0]) as genome_handler:
            genome = load(genome_handler)
    except UnpicklingError:
        genome = parse_fasta(opts.genome, chr_regexp=opts.filter_chrom)

    if not opts.skip:
        logging.info('parsing reads in %s project', name)
        counts, multis = parse_map(f_names1, f_names2, out_file1=out_file1,
                                   out_file2=out_file2, re_name=renz, verbose=True,
                                   genome_seq=genome, compress=opts.compress_input)
    else:
        # statistics are keyed by read index: 0 for out_file1, 1 for
        # out_file2; both reads get the same kind of histogram
        counts = {}
        multis = {}
        counts[0], multis[0] = _read_parse_histogram(out_file1)
        if out_file2:
            counts[1], multis[1] = _read_parse_histogram(out_file2)

    # write machine log
    lock_path = path.join(opts.workdir, '__lock_log')
    while path.exists(lock_path):
        time.sleep(0.5)
    open(lock_path, 'a').close()
    try:
        with open(path.join(opts.workdir, 'trace.log'), "a") as mlog:
            for read in counts:
                for item in counts[read]:
                    # read index 0 corresponds to out_file1
                    mlog.write('# PARSED READ%s PATH\t%d\t%s\n' % (
                        read, counts[read][item],
                        out_file1 if read == 0 else out_file2))
    finally:
        # release lock, also when writing the log failed
        try:
            remove(lock_path)
        except OSError:
            pass

    finish_time = time.localtime()

    # save all job information to sqlite DB
    save_to_db(opts, counts, multis, f_names1, f_names2, out_file1, out_file2,
               launch_time, finish_time)
Пример #13
0
def run(opts):
    """
    Normalize a Hi-C BAM file and store the resulting biases.

    The heavy lifting is delegated to ``read_bam``, whose results (biases,
    decay, bad columns and cis-percentages) are pickled under
    ``<workdir>/04_normalization`` and registered through ``save_to_db``.
    When ``opts.normalization`` is ``'oneD'`` the per-bin mappability, GC
    content and restriction-site counts are computed first and passed along.

    :param opts: parsed options. Attributes read here: ``bam``, ``workdir``,
       ``filter``, ``normalization``, ``fasta``, ``renz``, ``mappability``,
       ``reso``, ``cpus``, ``min_count``, ``p_fit``, ``min_perc``,
       ``max_perc``, ``seed``, ``normalize_only``, ``max_njobs``,
       ``badcols``, ``biases_path`` and ``filter_only``.

    :raises Exception: in oneD mode, if the FASTA path, the enzyme name or
       the mappability path is missing, if the BAM references chromosomes
       absent from the FASTA, or if no chromosome is left to process.
    """
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bam:
        mreads = path.realpath(opts.bam)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    filter_exclude = opts.filter

    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)

    mappability = gc_content = n_rsites = None
    if opts.normalization == 'oneD':
        if not opts.fasta:
            raise Exception('ERROR: missing path to FASTA for oneD normalization')
        if not opts.renz:
            raise Exception('ERROR: missing restriction enzyme name for oneD normalization')
        if not opts.mappability:
            raise Exception('ERROR: missing path to mappability for oneD normalization')
        bamfile = AlignmentFile(mreads, 'rb')
        try:
            refs = bamfile.references
        finally:
            bamfile.close()

        # get genome sequence ~1 min
        printime('  - parsing FASTA')
        genome = parse_fasta(opts.fasta, verbose=False)

        only_fasta = set(genome.keys()) - set(refs)
        only_bam = set(refs) - set(genome.keys())
        if only_fasta:
            print('WARNING: %d extra chromosomes in FASTA (removing them)' % (
                len(only_fasta)))
            if len(only_fasta) <= 50:
                print('\n'.join([('  - ' + c) for c in only_fasta]))
        if only_bam:
            txt = ('\n'.join([('  - ' + c) for c in only_bam])
                   if len(only_bam) <= 50 else '')
            raise Exception('ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' % (
                len(only_bam), txt))
        refs = [crm for crm in refs if crm in genome]
        if len(refs) == 0:
            raise Exception("ERROR: chromosomes in FASTA different the ones"
                            " in BAM")

        # get mappability ~2 min
        printime('  - Parsing mappability')
        mappability = parse_mappability_bedGraph(
            opts.mappability, opts.reso,
            wanted_chrom=refs[0] if len(refs) == 1 else None)
        # resize chromosomes: pad each one with NaN up to its number of bins.
        # The bin count depends on the chromosome *length*; the previous
        # code used len(refs), i.e. the number of chromosomes.
        for c in refs:
            nbins = len(genome[c]) // opts.reso + 1
            if c not in mappability:
                mappability[c] = []
            missing = nbins - len(mappability[c])
            if missing > 0:
                mappability[c] += [float('nan')] * missing
        # concatenate all chromosomes, in BAM order, into a single flat list
        mappability = [val for c in refs for val in mappability[c]]

        printime('  - Computing GC content per bin (removing Ns)')
        gc_content = get_gc_content(genome, opts.reso, chromosomes=refs,
                                    n_cpus=opts.cpus)
        # compute r_sites ~30 sec
        # TODO: read from DB
        printime('  - Computing number of RE sites per bin (+/- 200 bp)')
        n_rsites = []
        re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '')
        # NOTE(review): 'pos' starts at 200, so each window spans from the
        # bin start to the bin end + 400 bp -- confirm this is the intended
        # "+/- 200 bp" window.
        for crm in refs:
            for pos in xrange(200, len(genome[crm]) + 200, opts.reso):
                seq = genome[crm][pos - 200:pos + opts.reso + 200]
                n_rsites.append(seq.count(re_site))

    biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam(
        mreads, filter_exclude, opts.reso, min_count=opts.min_count, sigma=2,
        factor=1, outdir=outdir, extra_out=param_hash, ncpus=opts.cpus,
        normalization=opts.normalization, mappability=mappability,
        p_fit=opts.p_fit, cg_content=gc_content, n_rsites=n_rsites,
        min_perc=opts.min_perc, max_perc=opts.max_perc, seed=opts.seed,
        normalize_only=opts.normalize_only, max_njobs=opts.max_njobs,
        extra_bads=opts.badcols, biases_path=opts.biases_path)

    bad_col_image = path.join(outdir, 'filtered_bins_%s_%s.png' % (
        nicer(opts.reso).replace(' ', ''), param_hash))

    inter_vs_gcoord = path.join(opts.workdir, '04_normalization',
                                'interactions_vs_genomic-coords.png_%s_%s.png' % (
                                    opts.reso, param_hash))

    # get and plot decay
    if not opts.normalize_only:
        printime('  - Computing interaction decay vs genomic distance')
        (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
            decay, max_diff=10000, resolution=opts.reso, normalized=not opts.filter_only,
            savefig=inter_vs_gcoord)

        print('    -> Decay slope 0.7-10 Mb\t%s' % a2)
    else:
        a2 = 0.

    printime('  - Saving biases and badcol columns')
    # biases: binary pickle protocols need a file opened in binary mode
    bias_file = path.join(outdir, 'biases_%s_%s.pickle' % (
        nicer(opts.reso).replace(' ', ''), param_hash))
    with open(bias_file, 'wb') as out:
        dump({'biases'    : biases,
              'decay'     : decay,
              'badcol'    : badcol,
              'resolution': opts.reso}, out, HIGHEST_PROTOCOL)

    finish_time = time.localtime()

    try:
        save_to_db(opts, bias_file, mreads, bad_col_image,
                   len(badcol), len(biases), raw_cisprc, norm_cisprc,
                   inter_vs_gcoord, a2, opts.filter,
                   launch_time, finish_time)
    except:
        # intentionally bare: whatever interrupted the DB update, the lock
        # must be released before exiting
        print_exc()
        try:
            remove(path.join(opts.workdir, '__lock_db'))
        except OSError:
            pass
        exit(1)
Пример #14
0
def run(opts):
    """
    Normalize a Hi-C BAM file and store the resulting biases.

    ``read_bam`` does the actual computation; its results (biases, decay,
    bad columns and cis-percentages) are pickled under
    ``<workdir>/04_normalization`` and registered through ``save_to_db``.
    When ``opts.normalization`` is ``'oneD'`` the per-bin mappability, GC
    content and restriction-site counts are computed first and passed along.

    :param opts: parsed options. Attributes read here: ``bam``, ``workdir``,
       ``filter``, ``normalization``, ``fasta``, ``renz``, ``mappability``,
       ``reso``, ``cpus``, ``min_count``, ``p_fit``, ``min_perc``,
       ``max_perc``, ``seed``, ``normalize_only``, ``max_njobs``,
       ``badcols``, ``biases_path`` and ``filter_only``.

    :raises Exception: in oneD mode, if the FASTA path, the enzyme name or
       the mappability path is missing, if the BAM references chromosomes
       absent from the FASTA, or if no chromosome is left to process.
    """
    check_options(opts)
    launch_time = time.localtime()

    param_hash = digest_parameters(opts)
    if opts.bam:
        mreads = path.realpath(opts.bam)
    else:
        mreads = path.join(opts.workdir, load_parameters_fromdb(opts))

    filter_exclude = opts.filter

    outdir = path.join(opts.workdir, '04_normalization')
    mkdir(outdir)

    mappability = gc_content = n_rsites = None
    if opts.normalization == 'oneD':
        if not opts.fasta:
            raise Exception(
                'ERROR: missing path to FASTA for oneD normalization')
        if not opts.renz:
            raise Exception(
                'ERROR: missing restriction enzyme name for oneD normalization'
            )
        if not opts.mappability:
            raise Exception(
                'ERROR: missing path to mappability for oneD normalization')
        bamfile = AlignmentFile(mreads, 'rb')
        try:
            refs = bamfile.references
        finally:
            bamfile.close()

        # get genome sequence ~1 min
        printime('  - parsing FASTA')
        genome = parse_fasta(opts.fasta, verbose=False)

        only_fasta = set(genome.keys()) - set(refs)
        only_bam = set(refs) - set(genome.keys())
        if only_fasta:
            print('WARNING: %d extra chromosomes in FASTA (removing them)' %
                  (len(only_fasta)))
            if len(only_fasta) <= 50:
                print('\n'.join([('  - ' + c) for c in only_fasta]))
        if only_bam:
            txt = ('\n'.join([('  - ' + c) for c in only_bam])
                   if len(only_bam) <= 50 else '')
            raise Exception(
                'ERROR: %d extra chromosomes in BAM (remove them):\n%s\n' %
                (len(only_bam), txt))
        refs = [crm for crm in refs if crm in genome]
        if len(refs) == 0:
            raise Exception("ERROR: chromosomes in FASTA different the ones"
                            " in BAM")

        # get mappability ~2 min
        printime('  - Parsing mappability')
        mappability = parse_mappability_bedGraph(
            opts.mappability,
            opts.reso,
            wanted_chrom=refs[0] if len(refs) == 1 else None)
        # resize chromosomes: pad each one with NaN up to its number of bins.
        # The bin count depends on the chromosome *length*; the previous
        # code used len(refs), i.e. the number of chromosomes.
        for c in refs:
            nbins = len(genome[c]) // opts.reso + 1
            if c not in mappability:
                mappability[c] = []
            missing = nbins - len(mappability[c])
            if missing > 0:
                mappability[c] += [float('nan')] * missing
        # concatenate all chromosomes, in BAM order, into a single flat list
        mappability = [val for c in refs for val in mappability[c]]

        printime('  - Computing GC content per bin (removing Ns)')
        gc_content = get_gc_content(genome,
                                    opts.reso,
                                    chromosomes=refs,
                                    n_cpus=opts.cpus)
        # compute r_sites ~30 sec
        # TODO: read from DB
        printime('  - Computing number of RE sites per bin (+/- 200 bp)')
        n_rsites = []
        re_site = RESTRICTION_ENZYMES[opts.renz].replace('|', '')
        # NOTE(review): 'pos' starts at 200, so each window spans from the
        # bin start to the bin end + 400 bp -- confirm this is the intended
        # "+/- 200 bp" window.
        for crm in refs:
            for pos in range(200, len(genome[crm]) + 200, opts.reso):
                seq = genome[crm][pos - 200:pos + opts.reso + 200]
                n_rsites.append(seq.count(re_site))

    biases, decay, badcol, raw_cisprc, norm_cisprc = read_bam(
        mreads,
        filter_exclude,
        opts.reso,
        min_count=opts.min_count,
        sigma=2,
        factor=1,
        outdir=outdir,
        extra_out=param_hash,
        ncpus=opts.cpus,
        normalization=opts.normalization,
        mappability=mappability,
        p_fit=opts.p_fit,
        cg_content=gc_content,
        n_rsites=n_rsites,
        min_perc=opts.min_perc,
        max_perc=opts.max_perc,
        seed=opts.seed,
        normalize_only=opts.normalize_only,
        max_njobs=opts.max_njobs,
        extra_bads=opts.badcols,
        biases_path=opts.biases_path)

    inter_vs_gcoord = path.join(
        opts.workdir, '04_normalization',
        'interactions_vs_genomic-coords.png_%s_%s.png' %
        (opts.reso, param_hash))

    # get and plot decay
    if not opts.normalize_only:
        printime('  - Computing interaction decay vs genomic distance')
        (_, _, _), (a2, _, _), (_, _, _) = plot_distance_vs_interactions(
            decay,
            max_diff=10000,
            resolution=opts.reso,
            normalized=not opts.filter_only,
            savefig=inter_vs_gcoord)

        print('    -> Decay slope 0.7-10 Mb\t%s' % a2)
    else:
        a2 = 0.

    printime('  - Saving biases and badcol columns')
    # biases
    bias_file = path.join(
        outdir, 'biases_%s_%s.pickle' %
        (nicer(opts.reso).replace(' ', ''), param_hash))
    # the context manager closes the file even if pickling fails
    with open(bias_file, 'wb') as out:
        dump(
            {
                'biases': biases,
                'decay': decay,
                'badcol': badcol,
                'resolution': opts.reso
            }, out, HIGHEST_PROTOCOL)

    finish_time = time.localtime()

    try:
        save_to_db(opts, bias_file, mreads, len(badcol), len(biases),
                   raw_cisprc, norm_cisprc, inter_vs_gcoord, a2, opts.filter,
                   launch_time, finish_time)
    except:
        # intentionally bare: whatever interrupted the DB update, the lock
        # must be released before exiting
        print_exc()
        try:
            remove(path.join(opts.workdir, '__lock_db'))
        except OSError:
            pass
        exit(1)
Пример #15
0
# RANDOM GENOME
# One random sequence per chromosome ('chr1' .. 'chr<num_crms>'). The order
# of the random() calls is kept unchanged so that seeded runs still produce
# the same genome.
# NOTE(review): nts is indexed with int(401 * random()), so it presumably
# holds at least 401 entries -- confirm where it is defined.
genome = {}
for crm in xrange(1, num_crms + 1):
    crm_len = int(mean_crm_size * random())
    genome['chr' + str(crm)] = ''.join([nts[int(401 * random())]
                                        for _ in xrange(crm_len)])

# write the genome as FASTA, 60 nucleotides per line; the context manager
# closes the file even if a write fails
with open('test.fa~', 'w') as out:
    for crm in xrange(1, num_crms + 1):
        crm_name = 'chr' + str(crm)
        out.write('>%s\n' % crm_name)
        for p in xrange(0, len(genome[crm_name]), 60):
            out.write(genome[crm_name][p:p+60] + '\n')


# sanity check: parsing the file back must return the very same genome
genome_bis = parse_fasta('test.fa~')

if genome_bis == genome:
    genome = genome_bis
else:
    raise Exception('problem with genome parser')

# RE FRAGMENTS
# for each match of re_seq, the cut position is the match start shifted by
# 1 + enz_cut; matches yielding an empty fragment are skipped
frags = {}
for crm in genome:
    frags[crm] = {}
    beg = 0
    for pos in re.finditer(re_seq, genome[crm]):
        end = pos.start() + 1 + enz_cut
        if beg == end:
            continue
Пример #16
0
from pytadbit.parsers.genome_parser import parse_fasta
from pytadbit.mapping.mapper import get_intersection

# Maximum number of reads per mapping chunk: first command-line argument,
# falling back to 10 million when none is given (the default used to be
# overwritten unconditionally, so the script crashed without an argument).
chunk = int(sys.argv[1]) if len(sys.argv) > 1 else 10000000


PATH   = '/home/fransua/Documents/Courses/given/2014_CSDM/notebooks/'
rep = 'SRR_test'
INFILE = PATH + 'fastq/%s.fastq' % rep

# one output directory per (replicate, chunk size) combination
OUTPATH = PATH + rep + '_' + str(chunk) + '/'

chr_names = ['2L', '2R', '3L', '3R', '4', 'X']
genome_seq = parse_fasta([PATH + 'dmel_reference/chr%s.fa' % crm for crm in chr_names], chr_names)

frags = map_re_sites('HindIII', genome_seq, verbose=True)

sams1 = iterative_mapping(
            gem_index_path       = PATH + 'dmel_reference/dm3.genome.gem',
            fastq_path           = INFILE,
            out_sam_path         = OUTPATH + '%s_r1.txt' % rep,
            temp_dir             = PATH + 'tmp_dir/',
            range_start          = [10] * 5, # starts with a flag sequence
            range_stop           = range(30, 55, 5),
            nthreads             = 8,  # on intel corei7 CPUs 4 threads are as fast as
                                       # 8, but leave some room for you other applications
            max_reads_per_chunk  = chunk,
            single_end           = True)
# single-argument form prints the same text under Python 2 and Python 3
print('created thes SAM files: %s' % (sams1,))
Пример #17
0
def main():
    """
    Run a small end-to-end mapping test with hard-coded paths.

    Positions 1-100 and 101-200 of the reads in the same FASTQ file are
    mapped separately with ``full_mapping`` (HindIII). The two sets of
    mappings are then parsed, intersected and filtered, writing
    'read1.tsv', 'read2.tsv', 'both_reads.tsv' and 'filtered_reads.tsv'.
    """
    # other inputs used during development:
    #   '/scratch/db/FASTQs/hsap/dixon_2012/dixon-2012_200bp.fastq'
    #   '/scratch/test/sample_dataset/FASTQs/sample_hsap_HindIII.fastq'
    fastq = 'short_dixon-2012_200bp.fastq'
    gem_index_path = '/scratch/db/index_files/Homo_sapiens-79/Homo_sapiens.gem'
    out_map_dir1 = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/read1/'
    out_map_dir2 = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/read2/'
    temp_dir1 = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/tmp1/'
    temp_dir2 = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/tmp2/'

    # (label, output directory, temporary directory, read window)
    jobs = (('read 1', out_map_dir1, temp_dir1, (1, 100)),
            ('read 2', out_map_dir2, temp_dir2, (101, 200)))
    mapped = []
    for label, out_dir, tmp_dir, window in jobs:
        print(label)
        mapped.append(full_mapping(gem_index_path,
                                   fastq,
                                   out_dir,
                                   'HindIII',
                                   temp_dir=tmp_dir,
                                   windows=(window, ),
                                   add_site=True))
    outfiles1, outfiles2 = mapped
    # previous, iterative version of the two calls above:
    # print 'read 1'
    # outfiles1 = mapping(gem_index_path, fastq, out_map_dir1, 'HindIII',
    #                     temp_dir=temp_dir1,
    #                     windows=(zip(*([0] * len(range(25, 105, 5)),
    #                                    range(25,105,5)))))
    # print 'read 2'
    # outfiles2 = mapping(gem_index_path, fastq, out_map_dir2, 'HindIII',
    #                     temp_dir=temp_dir2,
    #                     windows=(zip(*([100] * len(range(125, 205, 5)),
    #                                            range(125,205,5)))))

    print(outfiles1)
    print('xcmvnkljnv')
    print(outfiles2)

    from pytadbit.parsers.map_parser import parse_map
    from pytadbit.parsers.genome_parser import parse_fasta
    from pytadbit.mapping.mapper import get_intersection
    from pytadbit.mapping.filter import filter_reads, apply_filter

    # parse each read's mappings against the reference genome
    read1 = 'read1.tsv'
    read2 = 'read2.tsv'
    genome_seq = parse_fasta(
        '/scratch/db/index_files/Homo_sapiens-79/Homo_sapiens.fa')
    parse_map(outfiles1, outfiles2, out_file1=read1, out_file2=read2,
              genome_seq=genome_seq, re_name='HindIII', verbose=True)

    # merge both parsed files into one
    reads = 'both_reads.tsv'
    get_intersection(read1, read2, reads)

    # compute the filters and apply them
    freads = 'filtered_reads.tsv'
    masked = filter_reads(reads)
    apply_filter(reads, freads, masked)
Пример #18
0
# RANDOM GENOME
# One random sequence per chromosome ('chr1' .. 'chr<num_crms>'), stored in
# the 'genome' dict created earlier in the script. The order of the random()
# calls is kept unchanged so that seeded runs still produce the same genome.
for crm in xrange(1, num_crms + 1):
    crm_len = int(mean_crm_size * random())
    genome['chr' + str(crm)] = ''.join([nts[int(401 * random())]
                                        for _ in xrange(crm_len)])

# write the genome as FASTA, 60 nucleotides per line; the context manager
# closes the file even if a write fails
with open('test.fa~', 'w') as out:
    for crm in xrange(1, num_crms + 1):
        crm_name = 'chr' + str(crm)
        out.write('>%s\n' % crm_name)
        for p in xrange(0, len(genome[crm_name]), 60):
            out.write(genome[crm_name][p:p+60] + '\n')

from pytadbit.parsers.genome_parser import parse_fasta

# sanity check: parsing the file back must return the very same genome
genome_bis = parse_fasta('test.fa~')

if genome_bis == genome:
    genome = genome_bis
else:
    raise Exception('problem with genome parser')

# RE FRAGMENTS
# for each match of re_seq, the cut position is the match start shifted by
# 1 + enz_cut; matches yielding an empty fragment are skipped
frags = {}
for crm in genome:
    frags[crm] = {}
    beg = 0
    for pos in re.finditer(re_seq, genome[crm]):
        end = pos.start() + 1 + enz_cut
        if beg == end:
            continue
Пример #19
0
def _read_parsed_counts(fname):
    """
    Recover the mapping statistics stored in an already parsed reads file.

    :param fname: path to a parsed reads file whose header holds lines of
       the form ``# MAPPED <item> <value>``.

    :returns: a tuple ``(counts, multis)``: ``counts`` maps each header item
       to its integer value; ``multis`` maps a number of '|||' separators to
       the number of data lines containing that many of them.
    """
    counts = {}
    multis = {}
    in_header = True
    with open(fname) as fhandler:
        for line in fhandler:
            if in_header:
                if line.startswith('# MAPPED '):
                    _, _, item, value = line.split()
                    counts[item] = int(value)
                    continue
                if line.startswith('#'):
                    continue
                # first data line: the header is over; fall through so that
                # this line is counted too
                in_header = False
            nmulti = line.count('|||')
            if nmulti:
                multis[nmulti] = multis.get(nmulti, 0) + 1
    return counts, multis


def run(opts):
    """
    Parse mapped reads into TSV files and register the job.

    Input file names and the restriction enzyme(s) come from
    ``load_parameters_fromdb``; the output goes to
    ``<workdir>/02_parsed_reads``. With ``opts.skip`` the reads are not
    parsed again: the statistics are read back from the existing output
    files. The counts are appended to ``<workdir>/trace.log`` and the job is
    stored through ``save_to_db``.

    :param opts: parsed options. Attributes read here: ``read``, ``jobids``,
       ``workdir`` (replaced by its absolute path), ``genome``,
       ``filter_chrom``, ``skip`` and ``compress_input``.
    """
    check_options(opts)

    launch_time = time.localtime()

    reads = [1] if opts.read == 1 else [2] if opts.read == 2 else [1, 2]
    f_names1, f_names2, renz = load_parameters_fromdb(opts, reads, opts.jobids)

    renz = renz.split('-')

    opts.workdir = path.abspath(opts.workdir)

    name = path.split(opts.workdir)[-1]

    param_hash = digest_parameters(opts)

    outdir = '02_parsed_reads'

    mkdir(path.join(opts.workdir, outdir))

    out_r1 = path.join(opts.workdir, outdir, '%s_r1_%s.tsv' % (name, param_hash))
    out_r2 = path.join(opts.workdir, outdir, '%s_r2_%s.tsv' % (name, param_hash))
    if not opts.read:
        out_file1, out_file2 = out_r1, out_r2
    elif opts.read == 1:
        out_file1, out_file2 = out_r1, None
        f_names2 = None
    elif opts.read == 2:
        # a single read is always handled through the first slot
        out_file1, out_file2 = out_r2, None
        f_names1, f_names2 = f_names2, None

    logging.info('parsing genomic sequence')
    try:
        # allows the use of pickle genome to make it faster
        # NOTE: unpickling runs arbitrary code; only use trusted genome files
        with open(opts.genome[0], 'rb') as handler:
            genome = load(handler)
    except (UnpicklingError, KeyError):
        genome = parse_fasta(opts.genome, chr_regexp=opts.filter_chrom)

    if not opts.skip:
        logging.info('parsing reads in %s project', name)
        counts, multis = parse_map(f_names1, f_names2, out_file1=out_file1,
                                   out_file2=out_file2, re_name=renz, verbose=True,
                                   genome_seq=genome, compress=opts.compress_input)
    else:
        # reads are keyed 0 (out_file1) and 1 (out_file2); both use the same
        # histogram layout for the multiple contacts
        counts = {}
        multis = {}
        counts[0], multis[0] = _read_parsed_counts(out_file1)
        if out_file2:
            counts[1], multis[1] = _read_parsed_counts(out_file2)

    # write machine log
    # NOTE(review): the check-then-create lock below is not atomic
    lock_path = path.join(opts.workdir, '__lock_log')
    while path.exists(lock_path):
        time.sleep(0.5)
    open(lock_path, 'a').close()
    try:
        with open(path.join(opts.workdir, 'trace.log'), "a") as mlog:
            for read in counts:
                for item in counts[read]:
                    # key 0 belongs to out_file1, key 1 to out_file2
                    mlog.write('# PARSED READ%s PATH\t%d\t%s\n' % (
                        read, counts[read][item],
                        out_file1 if read == 0 else out_file2))
    finally:
        # release lock, even if writing the log failed
        try:
            remove(lock_path)
        except OSError:
            pass

    finish_time = time.localtime()

    # save all job information to sqlite DB
    save_to_db(opts, counts, multis, f_names1, f_names2, out_file1, out_file2,
               launch_time, finish_time)
Пример #20
0
    # (this snippet starts inside a branch of an if/elif chain on 'mapper'
    # whose beginning is not shown)
    # full mapping with frag_map disabled; windows are the (begin, end)
    # pairs built from r_beg1/r_end1 (read 1) and r_beg2/r_end2 (read 2)
    outfiles1 = full_mapping(gem_index_path, fastq, out_map_dir1, 'HindIII',
                             temp_dir=temp_dir1, frag_map=False,
                             windows=(zip(*(r_beg1, r_end1))))
    print 'read 2'
    outfiles2 = full_mapping(gem_index_path, fastq, out_map_dir2, 'HindIII',
                             temp_dir=temp_dir2, frag_map=False,
                             windows=(zip(*(r_beg2, r_end2))))
    parse_thing = parse_map
elif mapper == 3:
    # same calls, but leaving frag_map at the default of full_mapping
    print 'read 1'
    outfiles1 = full_mapping(gem_index_path, fastq, out_map_dir1, 'HindIII',
                             temp_dir=temp_dir1,
                             windows=(zip(*(r_beg1, r_end1))))
    print 'read 2'
    outfiles2 = full_mapping(gem_index_path, fastq, out_map_dir2, 'HindIII',
                             temp_dir=temp_dir2,
                             windows=(zip(*(r_beg2, r_end2))))
    parse_thing = parse_map

# output names carry the mapper id and 'win' so that runs do not overwrite
# each other
read1, read2 = 'read1.tsv_%s-%s' % (mapper, win), 'read2.tsv_%s-%s' % (mapper, win)
# parse the mappings of both reads with the parser selected above
parse_thing(outfiles1, outfiles2, out_file1=read1, out_file2=read2,
            genome_seq=parse_fasta('/scratch/db/index_files/Homo_sapiens-79/Homo_sapiens.fa'),
            re_name='HindIII', verbose=True)

# merge both parsed files into one
reads = 'both_reads.tsv_%s-%s' % (mapper, win)
get_intersection(read1, read2, reads)

# compute the filters, then write the filtered reads
masked = filter_reads(reads)
freads = 'filtered_reads.tsv_%s-%s' % (mapper, win)
apply_filter(reads, freads, masked)
Пример #21
0
    def run(self, input_files, output_files, metadata=None):  # pylint: disable=too-many-locals,arguments-differ,inconsistent-return-statements
        """
        Parse the window mapping files of both read ends into one TSV file.

        The parsing is either iterative ('iter') or fragment based ('frag'),
        as selected by ``metadata['mapping']``. The iterative mode reads 4
        window files per read end, the fragment-based mode reads 2 (full and
        fragment mappings). The result is written next to the first window
        file, as ``<expt_name>_iter.tsv`` or ``<expt_name>_frag.tsv``.

        Parameters
        ----------
        input_files : list
            genome_file : str
                Location of the genome FASTA file
            window1_1 : str
                Location of the first window file of read end 1
            window1_2 : str
                Location of the second window file of read end 1
            window1_3 : str
                Location of the third window file of read end 1
                (iterative mode only)
            window1_4 : str
                Location of the fourth window file of read end 1
                (iterative mode only)
            window2_1 : str
                Location of the first window file of read end 2
            window2_2 : str
                Location of the second window file of read end 2
            window2_3 : str
                Location of the third window file of read end 2
                (iterative mode only)
            window2_4 : str
                Location of the fourth window file of read end 2
                (iterative mode only)

            NOTE: the iterative branch indexes input_files[1] to
            input_files[8] unconditionally, so all 8 window files are needed
            in that mode.
        output_files : list
            Not used by the code below.
        metadata : dict
            enzyme_name : str
                Restriction enzyme name
            mapping : list
                The mapping function used for each read end. The options
                are 'iter' or 'frag'.
            expt_name : str
                Prefix of the output file name

            These three keys are read unconditionally, so despite the
            ``None`` default a dict has to be passed.


        Returns
        -------
        output_files : list
            Single-element list with the location of the output file;
            ``[None]`` when both mapping entries are equal but neither
            'iter' nor 'frag'.
        output_metadata : dict
            ``{'chromosomes': [[name, length], ...]}`` built from the genome
            FASTA file.

        NOTE(review): when the two ``mapping`` entries differ, no return
        statement is reached in the code below -- confirm callers handle
        that case.

        Example
        -------

        Iterative:

        .. code-block:: python

            from tool import tb_parse_mapping

            genome_file = 'genome.fasta'

            root_name_1 = "/tmp/data/expt_source_1".split("/")
            root_name_2 = "/tmp/data/expt_source_2".split("/")
            windows = [[1,25], [1,50], [1,75], [1,100]]

            windows1 = []
            windows2 = []

            for w in windows:
                tail = "_full_" + str(w[0]) + "-" + str(w[1]) + ".map"
                windows1.append('/'.join(root_name_1) + tail)
                windows2.append('/'.join(root_name_2) + tail)

            files = [genome_file] + windows1 + windows2

            tpm = tb_parse_mapping.tb_parse_mapping()
            metadata = {'enzyme_name' : 'MboI', 'mapping' : ['iter', 'iter'], 'expt_name' : 'test'}
            tpm_files, tpm_meta = tpm.run(files, [], metadata)


        Fragment based mapping:

        .. code-block:: python

            from tool import tb_parse_mapping

            genome_file = 'genome.fasta'

            root_name_1 = "/tmp/data/expt_source_1".split("/")
            root_name_2 = "/tmp/data/expt_source_2".split("/")
            windows = [[1,100]]

            start = str(windows[0][0])
            end   = str(windows[0][1])

            window1_1 = '/'.join(root_name_1) + "_full_" + start + "-" + end + ".map"
            window1_2 = '/'.join(root_name_1) + "_frag_" + start + "-" + end + ".map"

            window2_1 = '/'.join(root_name_2) + "_full_" + start + "-" + end + ".map"
            window2_2 = '/'.join(root_name_2) + "_frag_" + start + "-" + end + ".map"

            files = [
                genome_file,
                window1_1, window1_2,
                window2_1, window2_2,
            ]

            tpm = tb_parse_mapping.tb_parse_mapping()
            metadata = {'enzyme_name' : 'MboI', 'mapping' : ['frag', 'frag'], 'expt_name' : 'test'}
            tpm_files, tpm_meta = tpm.run(files, [], metadata)

        """

        genome_file = input_files[0]

        enzyme_name = metadata['enzyme_name']
        mapping_list = metadata['mapping']
        expt_name = metadata['expt_name']

        # the output file goes in the directory of the first window file
        root_name = input_files[1].split("/")

        reads = "/".join(root_name[0:-1]) + '/'

        genome_seq = parse_fasta(genome_file)

        # list of [chromosome name, sequence length] pairs
        chromosome_meta = []
        for k in genome_seq:
            chromosome_meta.append([k, len(genome_seq[k])])

        # input and output share most metadata
        output_metadata = {'chromosomes': chromosome_meta}

        # only handled when both read ends use the same mapping strategy
        if mapping_list[0] == mapping_list[1]:
            if mapping_list[0] == 'iter':
                # 4 window files per read end
                window1_1 = input_files[1]
                window1_2 = input_files[2]
                window1_3 = input_files[3]
                window1_4 = input_files[4]

                window2_1 = input_files[5]
                window2_2 = input_files[6]
                window2_3 = input_files[7]
                window2_4 = input_files[8]

                read_iter = reads + expt_name + '_iter.tsv'

                self.tb_parse_mapping_iter(genome_seq, enzyme_name, window1_1,
                                           window1_2, window1_3, window1_4,
                                           window2_1, window2_2, window2_3,
                                           window2_4, read_iter)
                # results = compss_wait_on(results)
                return ([read_iter], output_metadata)

            elif mapping_list[0] == 'frag':
                # 2 window files per read end: full and fragment mappings
                window1_full = input_files[1]
                window1_frag = input_files[2]

                window2_full = input_files[3]
                window2_frag = input_files[4]

                read_frag = reads + expt_name + '_frag.tsv'

                self.tb_parse_mapping_frag(genome_seq, enzyme_name,
                                           window1_full, window1_frag,
                                           window2_full, window2_frag,
                                           read_frag)

                # results = compss_wait_on(results)
                return ([read_frag], output_metadata)

            # unknown mapping name: no output file is produced
            reads = None
            return ([reads], output_metadata)