예제 #1
0
    def tb_parse_mapping_frag(self, genome_seq, enzyme_name, window1_full,
                              window1_frag, window2_full, window2_frag, reads):
        """
        Parse the mapped windows of both read ends and keep the reads that
        are present in both ends.

        The "frag" and "full" map files of each read end are handed to
        ``parse_map``; the two resulting tables are intersected with
        ``get_intersection`` and the intersection is copied to ``reads``.
        Three intermediate files are written next to ``reads``:
        ``<reads>_reads_1.tsv``, ``<reads>_reads_2.tsv`` and
        ``<reads>_reads_both.tsv``.

        Parameters
        ----------
        genome_seq : dict
            Object containing the sequence of each of the chromosomes
        enzyme_name : str
            Name of the enzyme used to digest the genome
        window1_full : str
            Location of the "full" window map file of the first read end
        window1_frag : str
            Location of the "frag" window map file of the first read end
        window2_full : str
            Location of the "full" window map file of the second read end
        window2_frag : str
            Location of the "frag" window map file of the second read end
        reads : str
            Output location for the reads that have a matching location at
            both ends of the pair; also used as the prefix of the
            intermediate files listed above

        Returns
        -------
        bool
            Always True; the result itself is written to ``reads``

        """
        # Local import: only this method needs it.
        import shutil

        print("TB WINDOWS - full 1", window1_full)
        print("TB WINDOWS - frag 1", window1_frag)
        print("TB WINDOWS - full 2", window2_full)
        print("TB WINDOWS - frag 2", window2_frag)

        reads1 = reads + '_reads_1.tsv'
        reads2 = reads + '_reads_2.tsv'
        reads_both = reads + '_reads_both.tsv'

        # The frag file is listed before the full file for each read end.
        parse_map([window1_frag, window1_full], [window2_frag, window2_full],
                  out_file1=reads1,
                  out_file2=reads2,
                  genome_seq=genome_seq,
                  re_name=enzyme_name,
                  verbose=True)

        get_intersection(reads1, reads2, reads_both, verbose=True)

        # Stream the copy in chunks: the intersection file can be far larger
        # than what is reasonable to hold in memory at once.
        with open(reads_both, "rb") as f_in, open(reads, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)

        return True
예제 #2
0
def main():
    """
    Developer driver for the mapping -> parsing -> filtering chain.

    Runs ``full_mapping`` twice on the same FASTQ file, once with the window
    (1, 100) and once with (101, 200), parses both sets of map files with
    ``parse_map``, intersects the two parsed files and finally applies the
    read filters.  Every path is hard-coded; the parsed, intersected and
    filtered file names are relative, so they presumably end up in the
    current working directory.
    """
    # Input data.  Other datasets used during development:
    #   '/scratch/db/FASTQs/hsap/dixon_2012/dixon-2012_200bp.fastq'
    #   '/scratch/test/sample_dataset/FASTQs/sample_hsap_HindIII.fastq'
    fastq = 'short_dixon-2012_200bp.fastq'
    gem_index_path = '/scratch/db/index_files/Homo_sapiens-79/Homo_sapiens.gem'

    # One output and one temporary directory per read end.
    map_dir_r1 = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/read1/'
    map_dir_r2 = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/read2/'
    tmp_dir_r1 = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/tmp1/'
    tmp_dir_r2 = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/tmp2/'

    # A variant calling mapping() with the windows (0, 25), (0, 30) ...
    # (0, 100) for read 1 and (100, 125) ... (100, 200) for read 2 was
    # used here previously instead of full_mapping().
    print('read 1')
    outfiles1 = full_mapping(
        gem_index_path, fastq, map_dir_r1, 'HindIII',
        temp_dir=tmp_dir_r1, windows=((1, 100),), add_site=True)
    print('read 2')
    outfiles2 = full_mapping(
        gem_index_path, fastq, map_dir_r2, 'HindIII',
        temp_dir=tmp_dir_r2, windows=((101, 200),), add_site=True)

    # Debug output: the map files produced for each read end.
    print(outfiles1)
    print('xcmvnkljnv')
    print(outfiles2)

    # Imported only once mapping is done, as in the original flow.
    from pytadbit.parsers.map_parser import parse_map
    from pytadbit.parsers.genome_parser import parse_fasta
    from pytadbit.mapping.mapper import get_intersection
    from pytadbit.mapping.filter import filter_reads, apply_filter

    read1 = 'read1.tsv'
    read2 = 'read2.tsv'
    genome = parse_fasta(
        '/scratch/db/index_files/Homo_sapiens-79/Homo_sapiens.fa')
    parse_map(outfiles1, outfiles2,
              out_file1=read1, out_file2=read2,
              genome_seq=genome, re_name='HindIII', verbose=True)

    # Intersect the two parsed files.
    reads = 'both_reads.tsv'
    get_intersection(read1, read2, reads)

    # Compute the filter masks and write the filtered reads.
    freads = 'filtered_reads.tsv'
    masked = filter_reads(reads)
    apply_filter(reads, freads, masked)
예제 #3
0
def main():
    """
    Developer driver for the mapping -> parsing -> filtering chain.

    Runs ``full_mapping`` twice on the same FASTQ file, once with the window
    (1, 100) and once with (101, 200), parses both sets of map files with
    ``parse_map``, intersects the two parsed files and finally applies the
    read filters.  Every path is hard-coded; the parsed, intersected and
    filtered file names are relative, so they presumably end up in the
    current working directory.
    """
    # Input data.  Other datasets used during development:
    #   '/scratch/db/FASTQs/hsap/dixon_2012/dixon-2012_200bp.fastq'
    #   '/scratch/test/sample_dataset/FASTQs/sample_hsap_HindIII.fastq'
    fastq = 'short_dixon-2012_200bp.fastq'
    gem_index_path = '/scratch/db/index_files/Homo_sapiens-79/Homo_sapiens.gem'

    # One output and one temporary directory per read end.
    map_dir_r1 = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/read1/'
    map_dir_r2 = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/read2/'
    tmp_dir_r1 = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/tmp1/'
    tmp_dir_r2 = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/tmp2/'

    # A variant calling mapping() with the windows (0, 25), (0, 30) ...
    # (0, 100) for read 1 and (100, 125) ... (100, 200) for read 2 was
    # used here previously instead of full_mapping().
    print('read 1')
    outfiles1 = full_mapping(
        gem_index_path, fastq, map_dir_r1, 'HindIII',
        temp_dir=tmp_dir_r1, windows=((1, 100),), add_site=True)
    print('read 2')
    outfiles2 = full_mapping(
        gem_index_path, fastq, map_dir_r2, 'HindIII',
        temp_dir=tmp_dir_r2, windows=((101, 200),), add_site=True)

    # Debug output: the map files produced for each read end.
    print(outfiles1)
    print('xcmvnkljnv')
    print(outfiles2)

    # Imported only once mapping is done, as in the original flow.
    from pytadbit.parsers.map_parser import parse_map
    from pytadbit.parsers.genome_parser import parse_fasta
    from pytadbit.mapping.mapper import get_intersection
    from pytadbit.mapping.filter import filter_reads, apply_filter

    read1 = 'read1.tsv'
    read2 = 'read2.tsv'
    genome = parse_fasta(
        '/scratch/db/index_files/Homo_sapiens-79/Homo_sapiens.fa')
    parse_map(outfiles1, outfiles2,
              out_file1=read1, out_file2=read2,
              genome_seq=genome, re_name='HindIII', verbose=True)

    # Intersect the two parsed files.
    reads = 'both_reads.tsv'
    get_intersection(read1, read2, reads)

    # Compute the filter masks and write the filtered reads.
    freads = 'filtered_reads.tsv'
    masked = filter_reads(reads)
    apply_filter(reads, freads, masked)
예제 #4
0
    def parseMaps(self, num_cpus=8):
        """
        Run ``parse_map`` on the mapped windows of both read ends.

        The map files come from ``self.getMappedWindows()`` (keys
        ``"mapped_r1"`` and ``"mapped_r2"``); the parsed output is written to
        ``read1.tsv`` and ``read2.tsv`` inside ``self.parsed_reads_dir``.

        :param 8 num_cpus: forwarded to ``parse_map`` as ``ncpus``
        """
        # One output table per read end.
        out_dir = self.parsed_reads_dir
        reads1 = out_dir + '/read1.tsv'
        reads2 = out_dir + '/read2.tsv'

        windows = self.getMappedWindows()

        print('Parse MAP files...')
        parse_options = dict(
            out_file1=reads1,
            out_file2=reads2,
            genome_seq=self.genome_seq,
            re_name=self.enzyme_name,
            verbose=True,
            ncpus=num_cpus,
        )
        parse_map(windows["mapped_r1"], windows["mapped_r2"], **parse_options)
예제 #5
0
def run(opts):
    """
    Parse the mapped reads of a project into TSV files and record the job.

    Steps, in order:

    1. Resolve the input map files and the restriction enzyme, either from
       the options (``opts.mapped1`` / ``opts.mapped2`` / ``opts.renz``) or
       from the project database (``load_parameters_fromdb``).
    2. Build the output paths under ``<workdir>/02_parsed_reads``.
    3. Load the genome (pickle first, FASTA as fallback).
    4. Parse the reads (``parse_sam`` when map files were given through the
       options, ``parse_map`` otherwise) or, with ``opts.skip``, re-read the
       statistics from previously written output files.
    5. Append the per-read counts to ``<workdir>/trace.log`` while holding
       the ``__lock_log`` lock file, then store the job with ``save_to_db``.

    :param opts: parsed options; the attributes read here are ``read``,
       ``mapped1``, ``mapped2``, ``renz``, ``jobids``, ``workdir``,
       ``genome``, ``filter_chrom``, ``skip`` and ``compress_input``.
       ``opts.workdir`` is replaced by its absolute path.
    """
    check_options(opts)

    launch_time = time.localtime()

    reads = [1] if opts.read == 1 else [2] if opts.read == 2 else [1, 2]
    if not opts.mapped1 and not opts.mapped2:
        f_names1, f_names2, renz = load_parameters_fromdb(
            opts, reads, opts.jobids)
    else:
        # Bind both names even when only one of mapped1/mapped2 was given,
        # otherwise the later uses fail with UnboundLocalError.
        f_names1 = opts.mapped1 or None
        f_names2 = opts.mapped2 or None
        renz = opts.renz

    # Several enzymes are given as a single '-'-separated string.
    renz = renz.split('-')

    opts.workdir = path.abspath(opts.workdir)

    name = path.split(opts.workdir)[-1]

    param_hash = digest_parameters(opts)

    outdir = '02_parsed_reads'

    mkdir(path.join(opts.workdir, outdir))

    if not opts.read:
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = path.join(opts.workdir, outdir,
                              '%s_r2_%s.tsv' % (name, param_hash))
    elif opts.read == 1:
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = None
        f_names2 = None
    elif opts.read == 2:
        # Single-read mode on read 2: it takes the first slot, but the output
        # file keeps the "_r2_" tag.
        out_file2 = None
        f_names1 = f_names2
        f_names2 = None
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r2_%s.tsv' % (name, param_hash))

    logging.info('parsing genomic sequence')
    try:
        # allows the use of pickle genome to make it faster
        # NOTE: unpickling can execute arbitrary code; only use trusted files.
        with open(opts.genome[0], 'rb') as genome_handle:
            genome = load(genome_handle)
    except (UnpicklingError, KeyError):
        genome = parse_fasta(opts.genome, chr_regexp=opts.filter_chrom)

    if not opts.skip:
        logging.info('parsing reads in %s project', name)
        if opts.mapped1 or opts.mapped2:
            counts, multis = parse_sam(f_names1,
                                       f_names2,
                                       out_file1=out_file1,
                                       out_file2=out_file2,
                                       re_name=renz,
                                       verbose=True,
                                       genome_seq=genome,
                                       compress=opts.compress_input)
        else:
            counts, multis = parse_map(f_names1,
                                       f_names2,
                                       out_file1=out_file1,
                                       out_file2=out_file2,
                                       re_name=renz,
                                       verbose=True,
                                       genome_seq=genome,
                                       compress=opts.compress_input)
    else:
        # Re-read the statistics from previously written output files.
        # NOTE(review): the first non-header line ends the header loop and is
        # not inspected for '|||'; kept as in the original.
        counts = {0: {}}
        multis = {0: {}}
        with open(out_file1) as fhandler:
            for line in fhandler:
                if line.startswith('# MAPPED '):
                    _, _, item, value = line.split()
                    counts[0][item] = int(value)
                elif not line.startswith('#'):
                    break
            # Histogram: number of '|||' separators -> number of lines.
            for line in fhandler:
                if '|||' in line:
                    n_sep = line.count('|||')
                    multis[0][n_sep] = multis[0].get(n_sep, 0) + 1
        if out_file2:
            counts[1] = {}
            # NOTE(review): a plain total here versus a histogram for the
            # first file; kept as is, confirm what save_to_db expects.
            multis[1] = 0
            with open(out_file2) as fhandler:
                for line in fhandler:
                    if line.startswith('# MAPPED '):
                        _, _, item, value = line.split()
                        counts[1][item] = int(value)
                    elif not line.startswith('#'):
                        break
                for line in fhandler:
                    if '|||' in line:
                        multis[1] += line.count('|||')

    # write machine log
    # NOTE(review): the exists-then-create sequence is not atomic, so two
    # jobs can still acquire the lock at the same time.
    lock_path = path.join(opts.workdir, '__lock_log')
    while path.exists(lock_path):
        time.sleep(0.5)
    open(lock_path, 'a').close()
    try:
        with open(path.join(opts.workdir, 'trace.log'), "a") as mlog:
            for read in counts:
                for item in counts[read]:
                    # counts[0] belongs to out_file1 (see the skip branch
                    # above); parse_map/parse_sam are presumed to use the
                    # same 0/1 keys.
                    mlog.write('# PARSED READ%s PATH\t%d\t%s\n' %
                               (read, counts[read][item],
                                out_file1 if read == 0 else out_file2))
    finally:
        # release lock, even when writing the log failed
        try:
            remove(lock_path)
        except OSError:
            pass

    finish_time = time.localtime()

    # save all job information to sqlite DB
    save_to_db(opts, counts, multis, f_names1, f_names2, out_file1, out_file2,
               launch_time, finish_time)
예제 #6
0
def run(opts):
    """
    Parse the mapped reads of a project into TSV files and record the job.

    Steps, in order:

    1. Get the input map files and the restriction enzyme from the project
       database (``load_parameters_fromdb``).
    2. Build the output paths under ``<workdir>/02_parsed_reads``.
    3. Load the genome (pickle first, FASTA as fallback).
    4. Parse the reads with ``parse_map`` or, with ``opts.skip``, re-read
       the statistics from previously written output files.
    5. Append the per-read counts to ``<workdir>/trace.log`` while holding
       the ``__lock_log`` lock file, then store the job with ``save_to_db``.

    :param opts: parsed options; the attributes read here are ``read``,
       ``jobids``, ``workdir``, ``genome``, ``filter_chrom``, ``skip`` and
       ``compress_input``.  ``opts.workdir`` is replaced by its absolute
       path.
    """
    check_options(opts)

    launch_time = time.localtime()

    reads = [1] if opts.read == 1 else [2] if opts.read == 2 else [1, 2]
    f_names1, f_names2, renz = load_parameters_fromdb(opts, reads, opts.jobids)

    # Several enzymes are given as a single '-'-separated string.
    renz = renz.split('-')

    opts.workdir = path.abspath(opts.workdir)

    name = path.split(opts.workdir)[-1]

    param_hash = digest_parameters(opts)

    outdir = '02_parsed_reads'

    mkdir(path.join(opts.workdir, outdir))

    if not opts.read:
        out_file1 = path.join(opts.workdir, outdir, '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = path.join(opts.workdir, outdir, '%s_r2_%s.tsv' % (name, param_hash))
    elif opts.read == 1:
        out_file1 = path.join(opts.workdir, outdir, '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = None
        f_names2  = None
    elif opts.read == 2:
        # Single-read mode on read 2: it takes the first slot, but the output
        # file keeps the "_r2_" tag.
        out_file2 = None
        f_names1  = f_names2
        f_names2  = None
        out_file1 = path.join(opts.workdir, outdir, '%s_r2_%s.tsv' % (name, param_hash))

    logging.info('parsing genomic sequence')
    try:
        # allows the use of cPickle genome to make it faster
        # Pickles must be read in binary mode, and the handle is closed here.
        # NOTE: unpickling can execute arbitrary code; only use trusted files.
        with open(opts.genome[0], 'rb') as genome_handle:
            genome = load(genome_handle)
    except (UnpicklingError, KeyError):
        # KeyError is what the pure-Python Python 2 unpickler raises on a
        # non-pickle file.
        genome = parse_fasta(opts.genome, chr_regexp=opts.filter_chrom)

    if not opts.skip:
        logging.info('parsing reads in %s project', name)
        counts, multis = parse_map(f_names1, f_names2, out_file1=out_file1,
                                   out_file2=out_file2, re_name=renz, verbose=True,
                                   genome_seq=genome, compress=opts.compress_input)
    else:
        # Re-read the statistics from previously written output files.
        # NOTE(review): the first non-header line ends the header loop and is
        # not inspected for '|||'; kept as in the original.
        counts = {0: {}}
        multis = {0: {}}
        with open(out_file1) as fhandler:
            for line in fhandler:
                if line.startswith('# MAPPED '):
                    _, _, item, value = line.split()
                    counts[0][item] = int(value)
                elif not line.startswith('#'):
                    break
            # Histogram: number of '|||' separators -> number of lines.
            for line in fhandler:
                if '|||' in line:
                    n_sep = line.count('|||')
                    multis[0][n_sep] = multis[0].get(n_sep, 0) + 1
        if out_file2:
            counts[1] = {}
            # NOTE(review): a plain total here versus a histogram for the
            # first file; kept as is, confirm what save_to_db expects.
            multis[1] = 0
            with open(out_file2) as fhandler:
                for line in fhandler:
                    if line.startswith('# MAPPED '):
                        _, _, item, value = line.split()
                        counts[1][item] = int(value)
                    elif not line.startswith('#'):
                        break
                for line in fhandler:
                    if '|||' in line:
                        multis[1] += line.count('|||')

    # write machine log
    # NOTE(review): the exists-then-create sequence is not atomic, so two
    # jobs can still acquire the lock at the same time.
    lock_path = path.join(opts.workdir, '__lock_log')
    while path.exists(lock_path):
        time.sleep(0.5)
    open(lock_path, 'a').close()
    try:
        with open(path.join(opts.workdir, 'trace.log'), "a") as mlog:
            for read in counts:
                for item in counts[read]:
                    # counts[0] belongs to out_file1 (see the skip branch
                    # above); parse_map is presumed to use the same 0/1 keys.
                    mlog.write('# PARSED READ%s PATH\t%d\t%s\n' % (
                        read, counts[read][item],
                        out_file1 if read == 0 else out_file2))
    finally:
        # release lock, even when writing the log failed
        try:
            remove(lock_path)
        except OSError:
            pass

    finish_time = time.localtime()

    # save all job information to sqlite DB
    save_to_db(opts, counts, multis, f_names1, f_names2, out_file1, out_file2,
               launch_time, finish_time)
from pytadbit.parsers.map_parser import parse_map
from pytadbit.parsers.genome_parser import parse_fasta
from pytadbit.mapping import get_intersection

# Parse the reference genome once; it is passed to parse_map below.
genome_seq = parse_fasta(fasta)

# Directory that receives the parsed reads of this sample/version.
RESULTS = '%s/results/%s/processed_reads' % (SAMPLE, version)
if not os.path.exists(RESULTS):
    os.makedirs(RESULTS)

# For each FASTQ of the pair: gather everything found in its directory
# under MAP_DIR and derive the name of its parsed output table.
infiles, outfiles = [], []
for infile in (paired1, paired2):
    bname = infile.rsplit("/", 1)[-1].replace(".fastq.gz", "")
    maps = glob.glob('%s/%s/*' % (MAP_DIR, bname))
    infiles.append(maps)
    outfiles.append('%s/%s_map.tsv' % (RESULTS, bname))

parse_map(infiles[0], infiles[1],
          outfiles[0], outfiles[1],
          genome_seq, restriction_enzyme,
          verbose=True, ncpus=slots)

# NOTE(review): if the first output name does not contain 'read1', the
# replace() is a no-op and final_output equals outfiles[0] -- confirm the
# FASTQ naming convention.
final_output = outfiles[0].replace('read1', 'both')
get_intersection(outfiles[0], outfiles[1], final_output, verbose=True)
예제 #8
0
    def tb_parse_mapping_iter(self, genome_seq, enzyme_name, window1_1,
                              window1_2, window1_3, window1_4, window2_1,
                              window2_2, window2_3, window2_4, reads):
        """
        Parse the mapped windows of both read ends and keep the reads that
        are present in both ends.

        The four window map files of each read end are handed to
        ``parse_map``; the two resulting tables are intersected with
        ``get_intersection`` and the intersection is copied to ``reads``.
        Three intermediate files are written next to ``reads``:
        ``<reads>_reads_1.tsv``, ``<reads>_reads_2.tsv`` and
        ``<reads>_reads_both.tsv``.

        Parameters
        ----------
        genome_seq : dict
            Object containing the sequence of each of the chromosomes
        enzyme_name : str
            Name of the enzyme used to digest the genome
        window1_1 : str
            Location of the first window map file of the first read end
        window1_2 : str
            Location of the second window map file of the first read end
        window1_3 : str
            Location of the third window map file of the first read end
        window1_4 : str
            Location of the fourth window map file of the first read end
        window2_1 : str
            Location of the first window map file of the second read end
        window2_2 : str
            Location of the second window map file of the second read end
        window2_3 : str
            Location of the third window map file of the second read end
        window2_4 : str
            Location of the fourth window map file of the second read end
        reads : str
            Output location for the reads that have a matching location at
            both ends of the pair; also used as the prefix of the
            intermediate files listed above

        Returns
        -------
        bool
            Always True; the result itself is written to ``reads``

        """
        # Local import: only this method needs it.
        import shutil

        reads1 = reads + '_reads_1.tsv'
        reads2 = reads + '_reads_2.tsv'
        reads_both = reads + '_reads_both.tsv'

        parse_map(
            [window1_1, window1_2, window1_3, window1_4],
            [window2_1, window2_2, window2_3, window2_4],
            out_file1=reads1,
            out_file2=reads2,
            genome_seq=genome_seq,
            re_name=enzyme_name,
            verbose=True,
            # ncpus=32
        )

        get_intersection(reads1, reads2, reads_both, verbose=True)

        # Stream the copy in chunks: the intersection file can be far larger
        # than what is reasonable to hold in memory at once.
        with open(reads_both, "rb") as f_in, open(reads, "wb") as f_out:
            shutil.copyfileobj(f_in, f_out)

        return True
예제 #9
0
def run(opts):
    """
    Parse the mapped reads of a project into TSV files and record the job.

    Steps, in order:

    1. Get the input map files and the restriction enzyme from the project
       database (``load_parameters_fromdb``).
    2. Build the output paths under ``<workdir>/02_parsed_reads``.
    3. Load the genome (pickle first, FASTA as fallback).
    4. Parse the reads with ``parse_map`` or, with ``opts.skip``, re-read
       the statistics from previously written output files.
    5. Append the per-read counts to ``<workdir>/trace.log`` under an
       exclusive ``flock``, then store the job with ``save_to_db``.

    :param opts: parsed options; the attributes read here are ``read``,
       ``jobids``, ``workdir``, ``genome``, ``skip`` and ``compress_input``.
    """
    check_options(opts)

    launch_time = time.localtime()

    reads = [1] if opts.read == 1 else [2] if opts.read == 2 else [1, 2]
    f_names1, f_names2, renz = load_parameters_fromdb(opts.workdir, reads, opts.jobids)

    name = path.split(opts.workdir)[-1]

    param_hash = digest_parameters(opts)

    outdir = '02_parsed_reads'

    mkdir(path.join(opts.workdir, outdir))

    if not opts.read:
        out_file1 = path.join(opts.workdir, outdir, '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = path.join(opts.workdir, outdir, '%s_r2_%s.tsv' % (name, param_hash))
    elif opts.read == 1:
        out_file1 = path.join(opts.workdir, outdir, '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = None
        f_names2  = None
    elif opts.read == 2:
        # Single-read mode on read 2: it takes the first slot, but the output
        # file keeps the "_r2_" tag.
        out_file2 = None
        f_names1  = f_names2
        f_names2  = None
        out_file1 = path.join(opts.workdir, outdir, '%s_r2_%s.tsv' % (name, param_hash))

    logging.info('parsing genomic sequence')
    try:
        # allows the use of cPickle genome to make it faster
        # Pickles must be read in binary mode, and the handle is closed here.
        # NOTE: unpickling can execute arbitrary code; only use trusted files.
        with open(opts.genome[0], 'rb') as genome_handle:
            genome = load(genome_handle)
    except UnpicklingError:
        genome = parse_fasta(opts.genome)

    if not opts.skip:
        logging.info('parsing reads in %s project', name)
        counts, multis = parse_map(f_names1, f_names2, out_file1=out_file1,
                                   out_file2=out_file2, re_name=renz, verbose=True,
                                   genome_seq=genome, compress=opts.compress_input)
    else:
        # Re-read the statistics from previously written output files.
        # NOTE(review): the first non-header line ends the header loop and is
        # not inspected for '|||'; kept as in the original.
        counts = {0: {}}
        multis = {0: 0}
        with open(out_file1) as fhandler:
            for line in fhandler:
                if line.startswith('# MAPPED '):
                    _, _, item, value = line.split()
                    counts[0][item] = int(value)
                elif not line.startswith('#'):
                    break
            # Total number of '|||' separators in the remaining lines.
            for line in fhandler:
                if '|||' in line:
                    multis[0] += line.count('|||')
        if out_file2:
            counts[1] = {}
            multis[1] = 0
            with open(out_file2) as fhandler:
                for line in fhandler:
                    if line.startswith('# MAPPED '):
                        _, _, item, value = line.split()
                        counts[1][item] = int(value)
                    elif not line.startswith('#'):
                        break
                for line in fhandler:
                    if '|||' in line:
                        multis[1] += line.count('|||')

    # write machine log
    with open(path.join(opts.workdir, 'trace.log'), "a") as mlog:
        fcntl.flock(mlog, fcntl.LOCK_EX)
        try:
            for read in counts:
                for item in counts[read]:
                    # counts[0] belongs to out_file1 (see the skip branch
                    # above); parse_map is presumed to use the same 0/1 keys.
                    mlog.write('# PARSED READ%s PATH\t%d\t%s\n' % (
                        read, counts[read][item],
                        out_file1 if read == 0 else out_file2))
            # Push buffered text to the file while the lock is still held;
            # otherwise it is only written at close, after the unlock.
            mlog.flush()
        finally:
            fcntl.flock(mlog, fcntl.LOCK_UN)

    finish_time = time.localtime()

    # save all job information to sqlite DB
    save_to_db(opts, counts, multis, f_names1, f_names2, out_file1, out_file2,
               launch_time, finish_time)