from pytadbit.parsers.map_parser import parse_map
from pytadbit.mapping import get_intersection


def tb_parse_mapping_frag(self, genome_seq, enzyme_name,
                          window1_full, window1_frag,
                          window2_full, window2_frag, reads):
    """
    Function to parse the aligned reads and return the matching pairs

    Parameters
    ----------
    genome_seq : dict
        Object containing the sequence of each of the chromosomes
    enzyme_name : str
        Name of the enzyme used to digest the genome
    window1_full : str
        Location of the full-read mapping file for the first end
    window1_frag : str
        Location of the fragment-based mapping file for the first end
    window2_full : str
        Location of the full-read mapping file for the second end
    window2_frag : str
        Location of the fragment-based mapping file for the second end
    reads : str
        Location of the output file for reads that have a matching
        location at both ends of the pair

    Returns
    -------
    bool
        True once the intersection of mapped reads with matching
        locations in both paired-end files has been written to ``reads``
    """
    print("TB WINDOWS - full 1", window1_full)
    print("TB WINDOWS - frag 1", window1_frag)
    print("TB WINDOWS - full 2", window2_full)
    print("TB WINDOWS - frag 2", window2_frag)

    reads1 = reads + '_reads_1.tsv'
    reads2 = reads + '_reads_2.tsv'
    reads_both = reads + '_reads_both.tsv'

    # Parse the fragment-based and full-read maps of each end into one
    # TSV file per end
    parse_map([window1_frag, window1_full], [window2_frag, window2_full],
              out_file1=reads1, out_file2=reads2, genome_seq=genome_seq,
              re_name=enzyme_name, verbose=True)

    # Keep only the read pairs mapped at both ends
    get_intersection(reads1, reads2, reads_both, verbose=True)

    # Copy the intersection to the requested output location
    with open(reads, "wb") as f_out:
        with open(reads_both, "rb") as f_in:
            f_out.write(f_in.read())

    return True
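# A minimal, standalone sketch of the parse/intersect flow that
# tb_parse_mapping_frag wraps above. All file names here are hypothetical
# placeholders; parse_map, parse_fasta and get_intersection are the same
# pytadbit calls used elsewhere in this file.
from pytadbit.parsers.map_parser import parse_map
from pytadbit.parsers.genome_parser import parse_fasta
from pytadbit.mapping import get_intersection

genome_seq = parse_fasta('genome.fa')               # hypothetical FASTA path
parse_map(['r1_frag.map', 'r1_full.map'],           # hypothetical map files
          ['r2_frag.map', 'r2_full.map'],
          out_file1='reads_1.tsv', out_file2='reads_2.tsv',
          genome_seq=genome_seq, re_name='HindIII', verbose=True)
get_intersection('reads_1.tsv', 'reads_2.tsv', 'reads_both.tsv', verbose=True)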
from pytadbit.mapping.full_mapper import full_mapping
from pytadbit.parsers.map_parser import parse_map
from pytadbit.parsers.genome_parser import parse_fasta
from pytadbit.mapping.mapper import get_intersection
from pytadbit.mapping.filter import filter_reads, apply_filter


def main():
    # fastq = '/scratch/db/FASTQs/hsap/dixon_2012/dixon-2012_200bp.fastq'
    # fastq = '/scratch/test/sample_dataset/FASTQs/sample_hsap_HindIII.fastq'
    fastq = 'short_dixon-2012_200bp.fastq'
    gem_index_path = '/scratch/db/index_files/Homo_sapiens-79/Homo_sapiens.gem'
    out_map_dir1 = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/read1/'
    out_map_dir2 = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/read2/'
    temp_dir1 = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/tmp1/'
    temp_dir2 = '/home/fransua/Box/tadbits/tadbit/_pytadbit/mapping/tmp2/'

    # Map each end of the pair over a single window, adding the
    # restriction-enzyme site to the mapped fragments
    print('read 1')
    outfiles1 = full_mapping(gem_index_path, fastq, out_map_dir1, 'HindIII',
                             temp_dir=temp_dir1, windows=((1, 100),),
                             add_site=True)
    print('read 2')
    outfiles2 = full_mapping(gem_index_path, fastq, out_map_dir2, 'HindIII',
                             temp_dir=temp_dir2, windows=((101, 200),),
                             add_site=True)

    # Iterative-mapping alternative, kept for reference:
    # print('read 1')
    # outfiles1 = mapping(gem_index_path, fastq, out_map_dir1, 'HindIII',
    #                     temp_dir=temp_dir1,
    #                     windows=(zip(*([0] * len(range(25, 105, 5)),
    #                                    range(25, 105, 5)))))
    # print('read 2')
    # outfiles2 = mapping(gem_index_path, fastq, out_map_dir2, 'HindIII',
    #                     temp_dir=temp_dir2,
    #                     windows=(zip(*([100] * len(range(125, 205, 5)),
    #                                    range(125, 205, 5)))))

    print(outfiles1)
    print(outfiles2)

    # Parse the mapped reads against the reference genome
    read1, read2 = 'read1.tsv', 'read2.tsv'
    parse_map(outfiles1, outfiles2, out_file1=read1, out_file2=read2,
              genome_seq=parse_fasta('/scratch/db/index_files/Homo_sapiens-79/Homo_sapiens.fa'),
              re_name='HindIII', verbose=True)

    # Keep pairs mapped at both ends, then filter them
    reads = 'both_reads.tsv'
    get_intersection(read1, read2, reads)
    masked = filter_reads(reads)
    freads = 'filtered_reads.tsv'
    apply_filter(reads, freads, masked)
from pytadbit.parsers.map_parser import parse_map


def parseMaps(self, num_cpus=8):
    """
    Merge the two read-end map files together

    Requires 8 CPUs by default
    """
    # New file with info on each "read1" and its placement with respect
    # to RE sites
    reads1 = self.parsed_reads_dir + '/read1.tsv'
    # New file with info on each "read2" and its placement with respect
    # to RE sites
    reads2 = self.parsed_reads_dir + '/read2.tsv'

    mapped_rN = self.getMappedWindows()

    print('Parse MAP files...')
    parse_map(mapped_rN["mapped_r1"], mapped_rN["mapped_r2"],
              out_file1=reads1, out_file2=reads2,
              genome_seq=self.genome_seq, re_name=self.enzyme_name,
              verbose=True, ncpus=num_cpus)
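# parseMaps above assumes getMappedWindows returns a dict with one list of
# map files per read end. Only the "mapped_r1"/"mapped_r2" keys are implied
# by the call above; the file names here are assumptions for illustration:
#
# mapped_rN = {
#     "mapped_r1": ['read1_window1.map', 'read1_window2.map'],
#     "mapped_r2": ['read2_window1.map', 'read2_window2.map'],
# }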
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    reads = [1] if opts.read == 1 else [2] if opts.read == 2 else [1, 2]
    if not opts.mapped1 and not opts.mapped2:
        f_names1, f_names2, renz = load_parameters_fromdb(
            opts, reads, opts.jobids)
    else:
        if opts.mapped1:
            f_names1 = opts.mapped1
        if opts.mapped2:
            f_names2 = opts.mapped2
        renz = opts.renz
    renz = renz.split('-')

    opts.workdir = path.abspath(opts.workdir)
    name = path.split(opts.workdir)[-1]
    param_hash = digest_parameters(opts)

    outdir = '02_parsed_reads'
    mkdir(path.join(opts.workdir, outdir))

    if not opts.read:
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = path.join(opts.workdir, outdir,
                              '%s_r2_%s.tsv' % (name, param_hash))
    elif opts.read == 1:
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = None
        f_names2 = None
    elif opts.read == 2:
        out_file2 = None
        f_names1 = f_names2
        f_names2 = None
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r2_%s.tsv' % (name, param_hash))

    logging.info('parsing genomic sequence')
    try:
        # allows the use of a pickled genome to make it faster
        genome = load(open(opts.genome[0], 'rb'))
    except (UnpicklingError, KeyError):
        genome = parse_fasta(opts.genome, chr_regexp=opts.filter_chrom)

    if not opts.skip:
        logging.info('parsing reads in %s project', name)
        if opts.mapped1 or opts.mapped2:
            counts, multis = parse_sam(f_names1, f_names2,
                                       out_file1=out_file1,
                                       out_file2=out_file2,
                                       re_name=renz, verbose=True,
                                       genome_seq=genome,
                                       compress=opts.compress_input)
        else:
            counts, multis = parse_map(f_names1, f_names2,
                                       out_file1=out_file1,
                                       out_file2=out_file2,
                                       re_name=renz, verbose=True,
                                       genome_seq=genome,
                                       compress=opts.compress_input)
    else:
        # recover the counts from the header that parse_map/parse_sam
        # wrote in a previous run
        counts = {0: {}}
        fhandler = open(out_file1)
        for line in fhandler:
            if line.startswith('# MAPPED '):
                _, _, item, value = line.split()
                counts[0][item] = int(value)
            elif not line.startswith('#'):
                break
        multis = {0: {}}
        for line in fhandler:
            if '|||' in line:
                try:
                    multis[0][line.count('|||')] += 1
                except KeyError:
                    multis[0][line.count('|||')] = 1
        fhandler.close()
        if out_file2:
            counts[1] = {}
            fhandler = open(out_file2)
            for line in fhandler:
                if line.startswith('# MAPPED '):
                    _, _, item, value = line.split()
                    counts[1][item] = int(value)
                elif not line.startswith('#'):
                    break
            # count multiply-mapped reads per multiplicity, matching the
            # structure used for the first read end above
            multis[1] = {}
            for line in fhandler:
                if '|||' in line:
                    try:
                        multis[1][line.count('|||')] += 1
                    except KeyError:
                        multis[1][line.count('|||')] = 1
            fhandler.close()

    # write machine log
    while path.exists(path.join(opts.workdir, '__lock_log')):
        time.sleep(0.5)
    open(path.join(opts.workdir, '__lock_log'), 'a').close()
    with open(path.join(opts.workdir, 'trace.log'), "a") as mlog:
        for read in counts:
            for item in counts[read]:
                # counts[0] holds read-1 statistics and counts[1] read-2,
                # so index 0 maps to out_file1
                mlog.write('# PARSED READ%s PATH\t%d\t%s\n'
                           % (read, counts[read][item],
                              out_file1 if read == 0 else out_file2))
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_log'))
    except OSError:
        pass

    finish_time = time.localtime()

    # save all job information to sqlite DB
    save_to_db(opts, counts, multis, f_names1, f_names2, out_file1, out_file2,
               launch_time, finish_time)
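# The skip branch above re-reads the header that a previous parse_map (or
# parse_sam) run wrote at the top of each TSV. A sketch of the assumed
# format, inferred from the four-field `line.split()` (the window names are
# illustrative):
#
#   # MAPPED full_1-100 23451234
#   # MAPPED frag_1-100 1234567
#
# i.e. '# MAPPED <window> <count>' header lines, followed by data lines in
# which '|||' separates the alternative positions of multiply-mapped reads.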
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    reads = [1] if opts.read == 1 else [2] if opts.read == 2 else [1, 2]
    f_names1, f_names2, renz = load_parameters_fromdb(opts, reads,
                                                      opts.jobids)
    renz = renz.split('-')

    opts.workdir = path.abspath(opts.workdir)
    name = path.split(opts.workdir)[-1]
    param_hash = digest_parameters(opts)

    outdir = '02_parsed_reads'
    mkdir(path.join(opts.workdir, outdir))

    if not opts.read:
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = path.join(opts.workdir, outdir,
                              '%s_r2_%s.tsv' % (name, param_hash))
    elif opts.read == 1:
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = None
        f_names2 = None
    elif opts.read == 2:
        out_file2 = None
        f_names1 = f_names2
        f_names2 = None
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r2_%s.tsv' % (name, param_hash))

    logging.info('parsing genomic sequence')
    try:
        # allows the use of a pickled genome to make it faster
        genome = load(open(opts.genome[0], 'rb'))
    except UnpicklingError:
        genome = parse_fasta(opts.genome, chr_regexp=opts.filter_chrom)

    if not opts.skip:
        logging.info('parsing reads in %s project', name)
        counts, multis = parse_map(f_names1, f_names2, out_file1=out_file1,
                                   out_file2=out_file2, re_name=renz,
                                   verbose=True, genome_seq=genome,
                                   compress=opts.compress_input)
    else:
        # recover the counts from the header that parse_map wrote in a
        # previous run
        counts = {0: {}}
        fhandler = open(out_file1)
        for line in fhandler:
            if line.startswith('# MAPPED '):
                _, _, item, value = line.split()
                counts[0][item] = int(value)
            elif not line.startswith('#'):
                break
        multis = {0: {}}
        for line in fhandler:
            if '|||' in line:
                try:
                    multis[0][line.count('|||')] += 1
                except KeyError:
                    multis[0][line.count('|||')] = 1
        fhandler.close()
        if out_file2:
            counts[1] = {}
            fhandler = open(out_file2)
            for line in fhandler:
                if line.startswith('# MAPPED '):
                    _, _, item, value = line.split()
                    counts[1][item] = int(value)
                elif not line.startswith('#'):
                    break
            # count multiply-mapped reads per multiplicity, matching the
            # structure used for the first read end above
            multis[1] = {}
            for line in fhandler:
                if '|||' in line:
                    try:
                        multis[1][line.count('|||')] += 1
                    except KeyError:
                        multis[1][line.count('|||')] = 1
            fhandler.close()

    # write machine log
    while path.exists(path.join(opts.workdir, '__lock_log')):
        time.sleep(0.5)
    open(path.join(opts.workdir, '__lock_log'), 'a').close()
    with open(path.join(opts.workdir, 'trace.log'), "a") as mlog:
        for read in counts:
            for item in counts[read]:
                # counts[0] holds read-1 statistics and counts[1] read-2,
                # so index 0 maps to out_file1
                mlog.write('# PARSED READ%s PATH\t%d\t%s\n'
                           % (read, counts[read][item],
                              out_file1 if read == 0 else out_file2))
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_log'))
    except OSError:
        pass

    finish_time = time.localtime()

    # save all job information to sqlite DB
    save_to_db(opts, counts, multis, f_names1, f_names2, out_file1, out_file2,
               launch_time, finish_time)
import os
import glob

from pytadbit.parsers.map_parser import parse_map
from pytadbit.parsers.genome_parser import parse_fasta
from pytadbit.mapping import get_intersection

# fasta, SAMPLE, version, MAP_DIR, paired1, paired2, restriction_enzyme and
# slots are expected to be defined by the surrounding pipeline

# Load the genome
genome_seq = parse_fasta(fasta)

# Output directory
RESULTS = '%s/results/%s/processed_reads' % (SAMPLE, version)
if not os.path.exists(RESULTS):
    os.makedirs(RESULTS)

# Collect the map files produced for each end of the pair
infiles = []
outfiles = []
for infile in [paired1, paired2]:
    bname = infile.split("/")[-1].replace(".fastq.gz", "")
    maps = glob.glob('%s/%s/*' % (MAP_DIR, bname))
    infiles.append(maps)
    outfiles.append('%s/%s_map.tsv' % (RESULTS, bname))

# Parse both ends, then keep only the pairs mapped on both sides
parse_map(infiles[0], infiles[1], outfiles[0], outfiles[1], genome_seq,
          restriction_enzyme, verbose=True, ncpus=slots)

final_output = outfiles[0].replace('read1', 'both')
get_intersection(outfiles[0], outfiles[1], final_output, verbose=True)
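# A possible continuation of this pipeline: apply TADbit's quality filters to
# the intersected reads, mirroring the filter_reads/apply_filter usage shown
# in main() above. The output file name is a hypothetical choice.
from pytadbit.mapping.filter import filter_reads, apply_filter

masked = filter_reads(final_output)
filtered_output = final_output.replace('both', 'filtered')  # hypothetical
apply_filter(final_output, filtered_output, masked)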
from pytadbit.parsers.map_parser import parse_map
from pytadbit.mapping import get_intersection


def tb_parse_mapping_iter(self, genome_seq, enzyme_name,
                          window1_1, window1_2, window1_3, window1_4,
                          window2_1, window2_2, window2_3, window2_4,
                          reads):
    """
    Function to parse the aligned reads and return the matching pairs

    Parameters
    ----------
    genome_seq : dict
        Object containing the sequence of each of the chromosomes
    enzyme_name : str
        Name of the enzyme used to digest the genome
    window1_1 : str
        Location of the first window index file for the first end
    window1_2 : str
        Location of the second window index file for the first end
    window1_3 : str
        Location of the third window index file for the first end
    window1_4 : str
        Location of the fourth window index file for the first end
    window2_1 : str
        Location of the first window index file for the second end
    window2_2 : str
        Location of the second window index file for the second end
    window2_3 : str
        Location of the third window index file for the second end
    window2_4 : str
        Location of the fourth window index file for the second end
    reads : str
        Location of the output file for reads that have a matching
        location at both ends of the pair

    Returns
    -------
    bool
        True once the intersection of mapped reads with matching
        locations in both paired-end files has been written to ``reads``
    """
    reads1 = reads + '_reads_1.tsv'
    reads2 = reads + '_reads_2.tsv'
    reads_both = reads + '_reads_both.tsv'

    # Parse the four iterative-mapping windows of each end into one TSV
    # file per end
    parse_map(
        [window1_1, window1_2, window1_3, window1_4],
        [window2_1, window2_2, window2_3, window2_4],
        out_file1=reads1, out_file2=reads2,
        genome_seq=genome_seq, re_name=enzyme_name, verbose=True,
        # ncpus=32
    )

    # Keep only the read pairs mapped at both ends
    get_intersection(reads1, reads2, reads_both, verbose=True)

    # Copy the intersection to the requested output location
    with open(reads, "wb") as f_out:
        with open(reads_both, "rb") as f_in:
            f_out.write(f_in.read())

    return True
def run(opts):
    check_options(opts)
    launch_time = time.localtime()

    reads = [1] if opts.read == 1 else [2] if opts.read == 2 else [1, 2]
    f_names1, f_names2, renz = load_parameters_fromdb(opts.workdir, reads,
                                                      opts.jobids)

    name = path.split(opts.workdir)[-1]
    param_hash = digest_parameters(opts)

    outdir = '02_parsed_reads'
    mkdir(path.join(opts.workdir, outdir))

    if not opts.read:
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = path.join(opts.workdir, outdir,
                              '%s_r2_%s.tsv' % (name, param_hash))
    elif opts.read == 1:
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = None
        f_names2 = None
    elif opts.read == 2:
        out_file2 = None
        f_names1 = f_names2
        f_names2 = None
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r2_%s.tsv' % (name, param_hash))

    logging.info('parsing genomic sequence')
    try:
        # allows the use of a pickled genome to make it faster
        genome = load(open(opts.genome[0], 'rb'))
    except UnpicklingError:
        genome = parse_fasta(opts.genome)

    if not opts.skip:
        logging.info('parsing reads in %s project', name)
        counts, multis = parse_map(f_names1, f_names2, out_file1=out_file1,
                                   out_file2=out_file2, re_name=renz,
                                   verbose=True, genome_seq=genome,
                                   compress=opts.compress_input)
    else:
        # recover the counts from the header that parse_map wrote in a
        # previous run
        counts = {0: {}}
        fhandler = open(out_file1)
        for line in fhandler:
            if line.startswith('# MAPPED '):
                _, _, item, value = line.split()
                counts[0][item] = int(value)
            elif not line.startswith('#'):
                break
        multis = {0: 0}
        for line in fhandler:
            if '|||' in line:
                multis[0] += line.count('|||')
        fhandler.close()
        if out_file2:
            counts[1] = {}
            fhandler = open(out_file2)
            for line in fhandler:
                if line.startswith('# MAPPED '):
                    _, _, item, value = line.split()
                    counts[1][item] = int(value)
                elif not line.startswith('#'):
                    break
            multis[1] = 0
            for line in fhandler:
                if '|||' in line:
                    multis[1] += line.count('|||')
            fhandler.close()

    # write machine log, holding an exclusive lock on the file
    with open(path.join(opts.workdir, 'trace.log'), "a") as mlog:
        fcntl.flock(mlog, fcntl.LOCK_EX)
        for read in counts:
            for item in counts[read]:
                # counts[0] holds read-1 statistics and counts[1] read-2,
                # so index 0 maps to out_file1
                mlog.write('# PARSED READ%s PATH\t%d\t%s\n'
                           % (read, counts[read][item],
                              out_file1 if read == 0 else out_file2))
        fcntl.flock(mlog, fcntl.LOCK_UN)

    finish_time = time.localtime()

    # save all job information to sqlite DB
    save_to_db(opts, counts, multis, f_names1, f_names2, out_file1, out_file2,
               launch_time, finish_time)