def main():
    """Read the options, run the mapping step and load the genome.

    ``get_options``, ``iterative_mapping`` and ``parse_fasta`` are defined
    outside this block; only the calls made here are described.
    """
    opts = get_options()
    iterative_mapping()

    ## PARSE FASTA
    # With zero or one entry the value of ``opts.fasta`` is handed over
    # unchanged; with several entries only the first one is used.
    fasta = opts.fasta
    if len(fasta) > 1:
        fasta = fasta[0]
    genome = parse_fasta(fasta, chr_names=opts.chr_name, verbose=True)
def make_matrices(left_reads_fastq, right_reads_fastq, reads_fastq, genome_fasta, genome_index, \ output_directory, output_prefix, enzyme, res, chromosomes, threads_number, \ clean_tmp, tmp_dir): print 'Begin to process reads.' left_reads = '' right_reads = '' if reads_fastq != '': # left and right reads are stored in one file range_start_left, range_stop_left, \ range_start_right, range_stop_right = calc_left_right_ranges(reads_fastq) print 'Reads: ', reads_fastq left_reads = reads_fastq right_reads = reads_fastq else: # left and right reads are stored separately range_start_left, range_stop_left, \ range_start_right, range_stop_right = calc_range(left_reads_fastq) print 'Left reads: ', left_reads_fastq print 'Right reads: ', right_reads_fastq print 'Output prefix: ', output_prefix left_reads = left_reads_fastq right_reads = right_reads_fastq print 'Reference genome FASTA: ', genome_fasta print 'Reference genome GEM index:', genome_index print 'Output directory: ', output_directory print 'Temp directory: ', tmp_dir print 'Enzyme: ', enzyme print 'Resolution: ', res, 'bp' print 'Number of threads: ', threads_number print 'Start pos for left reads: ', range_start_left print 'Stop pos for left reads: ', range_stop_left print 'Start pos for right reads: ', range_start_right print 'Stop pos for right reads: ', range_stop_right stdout.flush() # map left reads to reference genome out_sam_left_name = splitext(basename(left_reads))[0] + '_left.sam' out_sam_left_path = join(output_directory, out_sam_left_name) print 'Iterative mapping of left reads (using ' + str(threads_number) + ' threads)...' stdout.flush() sams_left = iterative_mapping(genome_index, left_reads, out_sam_left_path, \ range_start_left, range_stop_left, nthreads=threads_number, temp_dir=tmp_dir) print 'Done.' 
stdout.flush() # map right reads to reference genome out_sam_right_name = splitext(basename(right_reads))[0] + '_right.sam' out_sam_right_path = join(output_directory, out_sam_right_name) print 'Iterative mapping of right reads (using ' + str(threads_number) + ' threads)...' stdout.flush() sams_right = iterative_mapping(genome_index, right_reads, out_sam_right_path, \ range_start_right, range_stop_right, nthreads=threads_number, temp_dir=tmp_dir) print 'Done.' stdout.flush() # load reference genome sequence print 'Load reference genome sequence...' stdout.flush() chroms = chromosomes[:] genome_seq = parse_fasta(genome_fasta, chr_names=chroms) print 'Done.' stdout.flush() # create files with information about every left and right read # and about their placement with respect to restriction sites tsv_left_name = splitext(basename(left_reads))[0] + '_left.tsv' tsv_left = join(output_directory, tsv_left_name) tsv_right_name = splitext(basename(right_reads))[0] + '_right.tsv' tsv_right = join(output_directory, tsv_right_name) print 'Get information about restriction sites and reads placement...' stdout.flush() parse_sam(sams_left, sams_right, tsv_left, tsv_right, genome_seq, enzyme, \ verbose=True, ncpus=8) print 'Done.' stdout.flush() # create file with both left and right reads that uniquelly mapped to reference genome if reads_fastq != '': # left and right reads are stored in one file common_reads_prefix = splitext(basename(reads_fastq))[0] else: # left and right reads are stored separately common_reads_prefix = output_prefix uniq_reads_name = common_reads_prefix + '_both_map_uniq.tsv' uniq_reads = join(output_directory, uniq_reads_name) print 'Merge info about left and right reads in one file...' stdout.flush() get_intersection(tsv_left, tsv_right, uniq_reads, verbose=True) print 'Done.' stdout.flush() # find read IDs that are filtered by default TADbit filters print 'Mask reads...' 
stdout.flush() # debug print "uniq_reads =", uniq_reads masked = filter_reads(uniq_reads) print 'Done.' stdout.flush() # apply all filters (exclude reads that were filtered) print 'Filter masked reads...' stdout.flush() filtered_reads_name = common_reads_prefix + '_filtered.tsv' filtered_reads = join(output_directory, filtered_reads_name) apply_filter(uniq_reads, filtered_reads, masked) print 'Done.' stdout.flush() # create matrices (one matrix per chromosome) print 'Create Hi-C maps (one per chromosome)...' stdout.flush() hic_map(filtered_reads, resolution=res, by_chrom='intra', savedata=output_directory) print 'Done.' stdout.flush() print 'Add resolution (' + str(resolution) + ') to matrix filenames...' stdout.flush() add_resolution(chromosomes, resolution, output_directory) print 'Done.' stdout.flush() print 'Add headers to matrix files...' stdout.flush() add_headers(chromosomes, resolution, output_directory) print 'Done.' stdout.flush() if clean_tmp: # Remove all SAM and TSV files from the output directory print 'Remove SAM and TSV files from the output directory.' stdout.flush() map(os.remove, glob.glob(out_sam_left_path + '*')) map(os.remove, glob.glob(out_sam_right_path + '*')) map(os.remove, glob.glob(join(output_directory, '*.tsv'))) print 'Done.' stdout.flush()
# print 'read 2' # mapping(gem_index_path, fastq, out_map_dir2, 'HindIII', # temp_dir=temp_dir2, windows=((100, 200),)) from pytadbit.mapping.mapper import iterative_mapping from pytadbit.mapping.full_mapper import full_mapping from pytadbit.parsers.map_parser import parse_map from pytadbit.parsers.sam_parser import parse_sam from pytadbit.parsers.genome_parser import parse_fasta from pytadbit.mapping.mapper import get_intersection from pytadbit.mapping.filter import filter_reads, apply_filter if mapper == 1: print 'read 1' outfiles1 = iterative_mapping(gem_index_path, fastq, out_map_dir1, r_beg1, [e + 2 for e in r_end1], temp_dir=temp_dir1) print 'read 2' outfiles2 = iterative_mapping(gem_index_path, fastq, out_map_dir2, r_beg2, [e + 2 for e in r_end2], temp_dir=temp_dir2) parse_thing = parse_sam elif mapper == 2: print 'read 1' outfiles1 = full_mapping(gem_index_path, fastq, out_map_dir1, 'HindIII', temp_dir=temp_dir1, frag_map=False, windows=(zip(*(r_beg1, r_end1)))) print 'read 2' outfiles2 = full_mapping(gem_index_path, fastq, out_map_dir2, 'HindIII', temp_dir=temp_dir2, frag_map=False, windows=(zip(*(r_beg2, r_end2))))
rep = 'SRR_test' INFILE = INFILE % rep OUTPATH = PATH + rep + '_' + str(chunk) + '/' chr_names = ['2L', '2R', '3L', '3R', '4', 'X'] genome_seq = parse_fasta([PATH + 'dmel_reference/chr%s.fa' % crm for crm in chr_names], chr_names) frags = map_re_sites('HindIII', genome_seq, verbose=True) sams1 = iterative_mapping( gem_index_path = PATH + 'dmel_reference/dm3.genome.gem', fastq_path = INFILE, out_sam_path = OUTPATH + '%s_r1.txt' % rep, temp_dir = PATH + 'tmp_dir/', range_start = [10] * 5, # starts with a flag sequence range_stop = range(30, 55, 5), nthreads = 8, # on intel corei7 CPUs 4 threads are as fast as # 8, but leave some room for you other applications max_reads_per_chunk = chunk, single_end = True) print 'created thes SAM files:', sams1 sams2 = iterative_mapping( gem_index_path = PATH + 'dmel_reference/dm3.genome.gem', fastq_path = INFILE, out_sam_path = OUTPATH + '%s_r2.txt' % rep, temp_dir = PATH + 'tmp_dir/', range_start = range(80, 55, -5), # starts with a flag sequence range_stop = [100] * 5, nthreads = 8, # on intel corei7 CPUs 4 threads are as fast as
def make_matrices(left_reads_fastq, right_reads_fastq, reads_fastq, genome_fasta, genome_index, \ output_directory, output_prefix, enzyme, res, chromosomes, threads_number, \ clean_tmp, tmp_dir): print 'Begin to process reads.' left_reads = '' right_reads = '' if reads_fastq != '': # left and right reads are stored in one file range_start_left, range_stop_left, \ range_start_right, range_stop_right = calc_left_right_ranges(reads_fastq) print 'Reads: ', reads_fastq left_reads = reads_fastq right_reads = reads_fastq else: # left and right reads are stored separately range_start_left, range_stop_left, \ range_start_right, range_stop_right = calc_range(left_reads_fastq) print 'Left reads: ', left_reads_fastq print 'Right reads: ', right_reads_fastq print 'Output prefix: ', output_prefix left_reads = left_reads_fastq right_reads = right_reads_fastq print 'Reference genome FASTA: ', genome_fasta print 'Reference genome GEM index:', genome_index print 'Output directory: ', output_directory print 'Temp directory: ', tmp_dir print 'Enzyme: ', enzyme print 'Resolution: ', res, 'bp' print 'Number of threads: ', threads_number print 'Start pos for left reads: ', range_start_left print 'Stop pos for left reads: ', range_stop_left print 'Start pos for right reads: ', range_start_right print 'Stop pos for right reads: ', range_stop_right stdout.flush() # map left reads to reference genome out_sam_left_name = splitext(basename(left_reads))[0] + '_left.sam' out_sam_left_path = join(output_directory, out_sam_left_name) print 'Iterative mapping of left reads (using ' + str( threads_number) + ' threads)...' stdout.flush() sams_left = iterative_mapping(genome_index, left_reads, out_sam_left_path, \ range_start_left, range_stop_left, nthreads=threads_number, temp_dir=tmp_dir) print 'Done.' 
stdout.flush() # map right reads to reference genome out_sam_right_name = splitext(basename(right_reads))[0] + '_right.sam' out_sam_right_path = join(output_directory, out_sam_right_name) print 'Iterative mapping of right reads (using ' + str( threads_number) + ' threads)...' stdout.flush() sams_right = iterative_mapping(genome_index, right_reads, out_sam_right_path, \ range_start_right, range_stop_right, nthreads=threads_number, temp_dir=tmp_dir) print 'Done.' stdout.flush() # load reference genome sequence print 'Load reference genome sequence...' stdout.flush() chroms = chromosomes[:] genome_seq = parse_fasta(genome_fasta, chr_names=chroms) print 'Done.' stdout.flush() # create files with information about every left and right read # and about their placement with respect to restriction sites tsv_left_name = splitext(basename(left_reads))[0] + '_left.tsv' tsv_left = join(output_directory, tsv_left_name) tsv_right_name = splitext(basename(right_reads))[0] + '_right.tsv' tsv_right = join(output_directory, tsv_right_name) print 'Get information about restriction sites and reads placement...' stdout.flush() parse_sam(sams_left, sams_right, tsv_left, tsv_right, genome_seq, enzyme, \ verbose=True, ncpus=8) print 'Done.' stdout.flush() # create file with both left and right reads that uniquelly mapped to reference genome if reads_fastq != '': # left and right reads are stored in one file common_reads_prefix = splitext(basename(reads_fastq))[0] else: # left and right reads are stored separately common_reads_prefix = output_prefix uniq_reads_name = common_reads_prefix + '_both_map_uniq.tsv' uniq_reads = join(output_directory, uniq_reads_name) print 'Merge info about left and right reads in one file...' stdout.flush() get_intersection(tsv_left, tsv_right, uniq_reads, verbose=True) print 'Done.' stdout.flush() # find read IDs that are filtered by default TADbit filters print 'Mask reads...' stdout.flush() masked = filter_reads(uniq_reads) print 'Done.' 
stdout.flush() # apply all filters (exclude reads that were filtered) print 'Filter masked reads...' stdout.flush() filtered_reads_name = common_reads_prefix + '_filtered.tsv' filtered_reads = join(output_directory, filtered_reads_name) apply_filter(uniq_reads, filtered_reads, masked) print 'Done.' stdout.flush() # create matrices (one matrix per chromosome) print 'Create Hi-C maps (one per chromosome)...' stdout.flush() hic_map(filtered_reads, resolution=res, by_chrom='intra', savedata=output_directory) print 'Done.' stdout.flush() print 'Add resolution (' + str(resolution) + ') to matrix filenames...' stdout.flush() add_resolution(chromosomes, resolution, output_directory) print 'Done.' stdout.flush() print 'Add headers to matrix files...' stdout.flush() add_headers(chromosomes, resolution, output_directory) print 'Done.' stdout.flush() if clean_tmp: # Remove all SAM and TSV files from the output directory print 'Remove SAM and TSV files from the output directory.' stdout.flush() remove(out_sam_left_path + '*') remove(out_sam_right_path + '*') remove(join(output_directory, '*.tsv')) print 'Done.' stdout.flush()