def run(opts):
    """Parse mapped Hi-C read files into TSV files and register the job.

    Resolves the input mapped files (from the command line or from the
    project database), parses them with ``parse_map`` (MAP format) or
    ``parse_sam`` (SAM/BAM), writes the result to
    ``<workdir>/02_parsed_reads``, appends per-read statistics to the
    machine-readable ``trace.log`` (guarded by a ``__lock_log`` file) and
    finally records everything via ``save_to_db``.

    :param opts: parsed command-line options; fields used here are read,
       mapped1, mapped2, jobids, renz, workdir, genome, filter_chrom,
       skip and compress_input
    """
    check_options(opts)
    launch_time = time.localtime()

    # which read-ends to process: 1, 2 or both
    reads = [1] if opts.read == 1 else [2] if opts.read == 2 else [1, 2]

    if not opts.mapped1 and not opts.mapped2:
        # input files and enzyme name come from a previous mapping job
        # stored in the project database
        f_names1, f_names2, renz = load_parameters_fromdb(
            opts, reads, opts.jobids)
    else:
        # NOTE(review): if only one of mapped1/mapped2 is given, the other
        # f_names variable stays unbound and parse_sam below raises
        # NameError -- confirm whether both are required upstream.
        if opts.mapped1:
            f_names1 = opts.mapped1
        if opts.mapped2:
            f_names2 = opts.mapped2
        renz = opts.renz
    # '-' separates enzyme names, presumably for multi-enzyme digestions
    renz = renz.split('-')

    opts.workdir = path.abspath(opts.workdir)
    name = path.split(opts.workdir)[-1]
    param_hash = digest_parameters(opts)

    outdir = '02_parsed_reads'
    mkdir(path.join(opts.workdir, outdir))

    if not opts.read:
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = path.join(opts.workdir, outdir,
                              '%s_r2_%s.tsv' % (name, param_hash))
    elif opts.read == 1:
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r1_%s.tsv' % (name, param_hash))
        out_file2 = None
        f_names2 = None
    elif opts.read == 2:
        # only read 2 requested: its files are processed as the single
        # input, written to the r2-named output file
        out_file2 = None
        f_names1 = f_names2
        f_names2 = None
        out_file1 = path.join(opts.workdir, outdir,
                              '%s_r2_%s.tsv' % (name, param_hash))

    logging.info('parsing genomic sequence')
    try:
        # allows the use of pickle genome to make it faster
        genome = load(open(opts.genome[0], 'rb'))
    except (UnpicklingError, KeyError):
        genome = parse_fasta(opts.genome, chr_regexp=opts.filter_chrom)

    if not opts.skip:
        logging.info('parsing reads in %s project', name)
        if opts.mapped1 or opts.mapped2:
            counts, multis = parse_sam(f_names1, f_names2,
                                       out_file1=out_file1,
                                       out_file2=out_file2,
                                       re_name=renz, verbose=True,
                                       genome_seq=genome,
                                       compress=opts.compress_input)
        else:
            counts, multis = parse_map(f_names1, f_names2,
                                       out_file1=out_file1,
                                       out_file2=out_file2,
                                       re_name=renz, verbose=True,
                                       genome_seq=genome,
                                       compress=opts.compress_input)
    else:
        # parsing already done: recover counts/multis from the TSV headers
        counts = {}
        counts[0] = {}
        multis = {}
        multis[0] = {}
        with open(out_file1) as fhandler:
            for line in fhandler:
                if line.startswith('# MAPPED '):
                    _, _, item, value = line.split()
                    counts[0][item] = int(value)
                elif not line.startswith('#'):
                    break
            # histogram of multiple-contact reads ('|||'-separated)
            for line in fhandler:
                if '|||' in line:
                    try:
                        multis[0][line.count('|||')] += 1
                    except KeyError:
                        multis[0][line.count('|||')] = 1
        if out_file2:
            counts[1] = {}
            multis[1] = 0
            with open(out_file2) as fhandler:
                for line in fhandler:
                    if line.startswith('# MAPPED '):
                        _, _, item, value = line.split()
                        counts[1][item] = int(value)
                    elif not line.startswith('#'):
                        break
                for line in fhandler:
                    if '|||' in line:
                        multis[1] += line.count('|||')

    # write machine log, waiting for then taking the lock file
    # NOTE(review): exists()+open() is not atomic -- two concurrent jobs
    # can still race here.
    while path.exists(path.join(opts.workdir, '__lock_log')):
        time.sleep(0.5)
    open(path.join(opts.workdir, '__lock_log'), 'a').close()
    with open(path.join(opts.workdir, 'trace.log'), "a") as mlog:
        for read in counts:
            for item in counts[read]:
                # counts[0] is built from out_file1 and counts[1] from
                # out_file2 (see the skip branch above); the original
                # 'read == 1' test logged the wrong file for each read.
                mlog.write('# PARSED READ%s PATH\t%d\t%s\n'
                           % (read, counts[read][item],
                              out_file1 if read == 0 else out_file2))
    # release lock
    try:
        remove(path.join(opts.workdir, '__lock_log'))
    except OSError:
        pass

    finish_time = time.localtime()
    # save all job information to sqlite DB
    save_to_db(opts, counts, multis, f_names1, f_names2,
               out_file1, out_file2, launch_time, finish_time)
def make_matrices(left_reads_fastq, right_reads_fastq, reads_fastq, genome_fasta, genome_index, \ output_directory, output_prefix, enzyme, res, chromosomes, threads_number, \ clean_tmp, tmp_dir): print 'Begin to process reads.' left_reads = '' right_reads = '' if reads_fastq != '': # left and right reads are stored in one file range_start_left, range_stop_left, \ range_start_right, range_stop_right = calc_left_right_ranges(reads_fastq) print 'Reads: ', reads_fastq left_reads = reads_fastq right_reads = reads_fastq else: # left and right reads are stored separately range_start_left, range_stop_left, \ range_start_right, range_stop_right = calc_range(left_reads_fastq) print 'Left reads: ', left_reads_fastq print 'Right reads: ', right_reads_fastq print 'Output prefix: ', output_prefix left_reads = left_reads_fastq right_reads = right_reads_fastq print 'Reference genome FASTA: ', genome_fasta print 'Reference genome GEM index:', genome_index print 'Output directory: ', output_directory print 'Temp directory: ', tmp_dir print 'Enzyme: ', enzyme print 'Resolution: ', res, 'bp' print 'Number of threads: ', threads_number print 'Start pos for left reads: ', range_start_left print 'Stop pos for left reads: ', range_stop_left print 'Start pos for right reads: ', range_start_right print 'Stop pos for right reads: ', range_stop_right stdout.flush() # map left reads to reference genome out_sam_left_name = splitext(basename(left_reads))[0] + '_left.sam' out_sam_left_path = join(output_directory, out_sam_left_name) print 'Iterative mapping of left reads (using ' + str(threads_number) + ' threads)...' stdout.flush() sams_left = iterative_mapping(genome_index, left_reads, out_sam_left_path, \ range_start_left, range_stop_left, nthreads=threads_number, temp_dir=tmp_dir) print 'Done.' 
stdout.flush() # map right reads to reference genome out_sam_right_name = splitext(basename(right_reads))[0] + '_right.sam' out_sam_right_path = join(output_directory, out_sam_right_name) print 'Iterative mapping of right reads (using ' + str(threads_number) + ' threads)...' stdout.flush() sams_right = iterative_mapping(genome_index, right_reads, out_sam_right_path, \ range_start_right, range_stop_right, nthreads=threads_number, temp_dir=tmp_dir) print 'Done.' stdout.flush() # load reference genome sequence print 'Load reference genome sequence...' stdout.flush() chroms = chromosomes[:] genome_seq = parse_fasta(genome_fasta, chr_names=chroms) print 'Done.' stdout.flush() # create files with information about every left and right read # and about their placement with respect to restriction sites tsv_left_name = splitext(basename(left_reads))[0] + '_left.tsv' tsv_left = join(output_directory, tsv_left_name) tsv_right_name = splitext(basename(right_reads))[0] + '_right.tsv' tsv_right = join(output_directory, tsv_right_name) print 'Get information about restriction sites and reads placement...' stdout.flush() parse_sam(sams_left, sams_right, tsv_left, tsv_right, genome_seq, enzyme, \ verbose=True, ncpus=8) print 'Done.' stdout.flush() # create file with both left and right reads that uniquelly mapped to reference genome if reads_fastq != '': # left and right reads are stored in one file common_reads_prefix = splitext(basename(reads_fastq))[0] else: # left and right reads are stored separately common_reads_prefix = output_prefix uniq_reads_name = common_reads_prefix + '_both_map_uniq.tsv' uniq_reads = join(output_directory, uniq_reads_name) print 'Merge info about left and right reads in one file...' stdout.flush() get_intersection(tsv_left, tsv_right, uniq_reads, verbose=True) print 'Done.' stdout.flush() # find read IDs that are filtered by default TADbit filters print 'Mask reads...' 
stdout.flush() # debug print "uniq_reads =", uniq_reads masked = filter_reads(uniq_reads) print 'Done.' stdout.flush() # apply all filters (exclude reads that were filtered) print 'Filter masked reads...' stdout.flush() filtered_reads_name = common_reads_prefix + '_filtered.tsv' filtered_reads = join(output_directory, filtered_reads_name) apply_filter(uniq_reads, filtered_reads, masked) print 'Done.' stdout.flush() # create matrices (one matrix per chromosome) print 'Create Hi-C maps (one per chromosome)...' stdout.flush() hic_map(filtered_reads, resolution=res, by_chrom='intra', savedata=output_directory) print 'Done.' stdout.flush() print 'Add resolution (' + str(resolution) + ') to matrix filenames...' stdout.flush() add_resolution(chromosomes, resolution, output_directory) print 'Done.' stdout.flush() print 'Add headers to matrix files...' stdout.flush() add_headers(chromosomes, resolution, output_directory) print 'Done.' stdout.flush() if clean_tmp: # Remove all SAM and TSV files from the output directory print 'Remove SAM and TSV files from the output directory.' stdout.flush() map(os.remove, glob.glob(out_sam_left_path + '*')) map(os.remove, glob.glob(out_sam_right_path + '*')) map(os.remove, glob.glob(join(output_directory, '*.tsv'))) print 'Done.' stdout.flush()
def make_matrices(left_reads_fastq, right_reads_fastq, reads_fastq, genome_fasta, genome_index, \ output_directory, output_prefix, enzyme, res, chromosomes, threads_number, \ clean_tmp, tmp_dir): print 'Begin to process reads.' left_reads = '' right_reads = '' if reads_fastq != '': # left and right reads are stored in one file range_start_left, range_stop_left, \ range_start_right, range_stop_right = calc_left_right_ranges(reads_fastq) print 'Reads: ', reads_fastq left_reads = reads_fastq right_reads = reads_fastq else: # left and right reads are stored separately range_start_left, range_stop_left, \ range_start_right, range_stop_right = calc_range(left_reads_fastq) print 'Left reads: ', left_reads_fastq print 'Right reads: ', right_reads_fastq print 'Output prefix: ', output_prefix left_reads = left_reads_fastq right_reads = right_reads_fastq print 'Reference genome FASTA: ', genome_fasta print 'Reference genome GEM index:', genome_index print 'Output directory: ', output_directory print 'Temp directory: ', tmp_dir print 'Enzyme: ', enzyme print 'Resolution: ', res, 'bp' print 'Number of threads: ', threads_number print 'Start pos for left reads: ', range_start_left print 'Stop pos for left reads: ', range_stop_left print 'Start pos for right reads: ', range_start_right print 'Stop pos for right reads: ', range_stop_right stdout.flush() # map left reads to reference genome out_sam_left_name = splitext(basename(left_reads))[0] + '_left.sam' out_sam_left_path = join(output_directory, out_sam_left_name) print 'Iterative mapping of left reads (using ' + str( threads_number) + ' threads)...' stdout.flush() sams_left = iterative_mapping(genome_index, left_reads, out_sam_left_path, \ range_start_left, range_stop_left, nthreads=threads_number, temp_dir=tmp_dir) print 'Done.' 
stdout.flush() # map right reads to reference genome out_sam_right_name = splitext(basename(right_reads))[0] + '_right.sam' out_sam_right_path = join(output_directory, out_sam_right_name) print 'Iterative mapping of right reads (using ' + str( threads_number) + ' threads)...' stdout.flush() sams_right = iterative_mapping(genome_index, right_reads, out_sam_right_path, \ range_start_right, range_stop_right, nthreads=threads_number, temp_dir=tmp_dir) print 'Done.' stdout.flush() # load reference genome sequence print 'Load reference genome sequence...' stdout.flush() chroms = chromosomes[:] genome_seq = parse_fasta(genome_fasta, chr_names=chroms) print 'Done.' stdout.flush() # create files with information about every left and right read # and about their placement with respect to restriction sites tsv_left_name = splitext(basename(left_reads))[0] + '_left.tsv' tsv_left = join(output_directory, tsv_left_name) tsv_right_name = splitext(basename(right_reads))[0] + '_right.tsv' tsv_right = join(output_directory, tsv_right_name) print 'Get information about restriction sites and reads placement...' stdout.flush() parse_sam(sams_left, sams_right, tsv_left, tsv_right, genome_seq, enzyme, \ verbose=True, ncpus=8) print 'Done.' stdout.flush() # create file with both left and right reads that uniquelly mapped to reference genome if reads_fastq != '': # left and right reads are stored in one file common_reads_prefix = splitext(basename(reads_fastq))[0] else: # left and right reads are stored separately common_reads_prefix = output_prefix uniq_reads_name = common_reads_prefix + '_both_map_uniq.tsv' uniq_reads = join(output_directory, uniq_reads_name) print 'Merge info about left and right reads in one file...' stdout.flush() get_intersection(tsv_left, tsv_right, uniq_reads, verbose=True) print 'Done.' stdout.flush() # find read IDs that are filtered by default TADbit filters print 'Mask reads...' stdout.flush() masked = filter_reads(uniq_reads) print 'Done.' 
stdout.flush() # apply all filters (exclude reads that were filtered) print 'Filter masked reads...' stdout.flush() filtered_reads_name = common_reads_prefix + '_filtered.tsv' filtered_reads = join(output_directory, filtered_reads_name) apply_filter(uniq_reads, filtered_reads, masked) print 'Done.' stdout.flush() # create matrices (one matrix per chromosome) print 'Create Hi-C maps (one per chromosome)...' stdout.flush() hic_map(filtered_reads, resolution=res, by_chrom='intra', savedata=output_directory) print 'Done.' stdout.flush() print 'Add resolution (' + str(resolution) + ') to matrix filenames...' stdout.flush() add_resolution(chromosomes, resolution, output_directory) print 'Done.' stdout.flush() print 'Add headers to matrix files...' stdout.flush() add_headers(chromosomes, resolution, output_directory) print 'Done.' stdout.flush() if clean_tmp: # Remove all SAM and TSV files from the output directory print 'Remove SAM and TSV files from the output directory.' stdout.flush() remove(out_sam_left_path + '*') remove(out_sam_right_path + '*') remove(join(output_directory, '*.tsv')) print 'Done.' stdout.flush()
out_sam_path = OUTPATH + '%s_r2.txt' % rep, temp_dir = PATH + 'tmp_dir/', range_start = range(80, 55, -5), # starts with a flag sequence range_stop = [100] * 5, nthreads = 8, # on intel corei7 CPUs 4 threads are as fast as # 8, but leave some room for you other applications max_reads_per_chunk = chunk, single_end = True) sams1 = [OUTPATH + fnam for fnam in os.listdir(OUTPATH) if fnam.rsplit('.', 2)[0].endswith('_r1.txt')] sams2 = [OUTPATH + fnam for fnam in os.listdir(OUTPATH) if fnam.rsplit('.', 2)[0].endswith('_r2.txt')] print 'created thes SAM files:', sams2 parse_sam(sams1, sams2, frags, OUTPATH + 'reads1_%s.tsv' % rep, OUTPATH + 'reads2_%s.tsv' % rep, genome_seq, 'HindIII', verbose=True) reads1 = OUTPATH + 'reads1_%s.tsv' % rep reads2 = OUTPATH + 'reads2_%s.tsv' % rep reads = OUTPATH + 'reads12_%s.tsv' % rep get_intersection(reads1, reads2, reads, verbose=True) from pytadbit.mapping.analyze import hic_map hic_map(reads, genome_seq, resolution=100000, savedata='lala') from pytadbit.mapping.analyze import plot_genomic_distribution plot_genomic_distribution(reads, resolution=50000, genome_seq=genome_seq) # because I know it
pos1 -= 3 break read1 = {'crm': crm, 'pos': pos1, 'flag': sd1, 'id': 'lala04.%012d' % (i)} read2 = {'crm': crm, 'pos': pos2, 'flag': sd2, 'id': 'lala04.%012d' % (i)} out1.write(read_str.format(**read1)) out2.write(read_str.format(**read2)) # TOO CLOSE FROM RE out1.close() out2.close() # PARSE SAM from pytadbit.parsers.sam_parser import parse_sam parse_sam(['test_read1.sam~'], ['test_read2.sam~'], 'lala1~', 'lala2~', genome, re_name='DPNII', mapper='GEM') # GET INTERSECTION from pytadbit.mapping.mapper import get_intersection get_intersection('lala1~', 'lala2~', 'lala~') # FILTER from pytadbit.mapping.filter import filter_reads masked = filter_reads('lala~')