def check_options(opts):
    """
    Checks the options of the mapping tool, completes them with default
    values, and prepares the working directory (log files, versions log and
    trace database).

    The namespace is modified in place: ``mapper_binary`` is replaced by the
    path returned by ``which``, ``gem_version`` is set, 'NONE' restriction
    enzymes are replaced by None, ``skip`` is disabled if the working
    directory does not exist, ``cpus`` is capped to the number of CPUs,
    ``windows`` are converted to lists of integers, ``mapper_param`` is
    converted to a dictionary (or a list of raw options) and ``tmpdb``, if
    used, is replaced by the path of a temporary copy of the trace database.

    The program exits if ``renz`` is ['CHECK'] (only the most probable
    restriction enzyme is searched) or if the exact same job was already
    computed.

    :param opts: namespace with the options of the tool. Attributes read
       here: mapper, mapper_binary, fast_fragment, fastq, fastq2, read,
       genome, renz, workdir, skip, cpus, index, windows, mapper_param and,
       optionally, tmpdb.

    :raises Exception: if the mapper binary is not found, or if the
       requirements of fast fragment mapping are not fulfilled.
    :raises KeyError: if a restriction enzyme name is not known.
    :raises IOError: if the index file (GEM only) or the FASTQ file is not
       found, or if the FASTQ file has a wrong format.
    :raises NotImplementedError: if an extra mapper option is not accepted
       (GEM with version lower than 3 only).
    """
    # default name of the binary depends on the mapper
    if not opts.mapper_binary:
        if opts.mapper == 'gem':
            opts.mapper_binary = 'gem-mapper'
        else:
            opts.mapper_binary = opts.mapper
    opts.mapper_binary = which(opts.mapper_binary)
    if not opts.mapper_binary:
        raise Exception(
            '\n\nERROR: Mapper binary not found, for GEM install it from:'
            '\nhttps://sourceforge.net/projects/gemlibrary/files/gem-library/Binary%20pre-release%202/'
            '\n - Download the GEM-binaries-Linux-x86_64-core_i3 if '
            'you have a recent computer, the '
            'GEM-binaries-Linux-x86_64-core_2 otherwise\n - '
            'Uncompress with "tar xjvf GEM-binaries-xxx.tbz2"\n - '
            'Copy the binary gem-mapper to /usr/local/bin/ for '
            'example (somewhere in your PATH).\n\nNOTE: GEM does '
            'not provide any binary for MAC-OS.')

    # GEM version: 0 means "not GEM"
    opts.gem_version = 0
    if opts.mapper == 'gem':
        opts.gem_version = None
        try:
            out, _ = Popen([opts.mapper_binary, '--version'], stdout=PIPE,
                           stderr=STDOUT,
                           universal_newlines=True).communicate()
            # the second character of the output is taken as major version
            # (presumably the output looks like "v3..." -- TODO confirm)
            opts.gem_version = int(out[1])
        except (ValueError, IndexError):
            # unexpected (or empty) output: assume the old GEM version
            opts.gem_version = 2
            print('Falling to gem v2')

    if opts.fast_fragment:
        if opts.gem_version < 3:
            raise Exception('ERROR: Fast fragment mapping needs GEM v3')
        if not opts.fastq2 or not path.exists(opts.fastq2):
            raise Exception(
                'ERROR: Fast fragment mapping needs both fastq files. '
                'Please specify --fastq2')
        if opts.read != 0:
            raise Exception(
                'ERROR: Fast fragment mapping needs to be specified with --read 0'
            )
        if not opts.genome:
            raise Exception('ERROR: Fast fragment mapping needs '
                            'the genome parameter.')

    # check RE name
    if opts.renz == ['CHECK']:
        print('\nSearching for most probable restriction enzyme in file: %s' %
              (opts.fastq))
        try:
            pat, enz, pv = identify_re(opts.fastq, nreads=100000)
            print(' -> Most probable digested site: %s (pv: %f)' % (pat, pv))
            print(' -> Enzymes matching: %s' % (', '.join(enz)))
        except ValueError:
            print(' -> Nothing found...')
        exit()
    for n, renz in enumerate(opts.renz):
        if renz == 'NONE':
            opts.renz[n] = None
            continue
        try:
            _ = RESTRICTION_ENZYMES[renz]
        except KeyError:
            print('\n\nERROR: restriction enzyme %s not found.' % (renz) +
                  ' Use one of:\n\n' +
                  ' '.join(sorted(RESTRICTION_ENZYMES)) + '\n\n')
            raise KeyError(renz)
        except AttributeError:
            pass

    # check skip: nothing can be re-used if the working directory is missing
    if not path.exists(opts.workdir) and opts.skip:
        print('WARNING: cannot use output files, workdir not found, '
              'not skipping...')
        opts.skip = False

    # number of cpus (0 means all of them)
    if opts.cpus == 0:
        opts.cpus = cpu_count()
    else:
        opts.cpus = min(opts.cpus, cpu_count())

    # check paths
    if opts.mapper == 'gem' and not path.exists(opts.index):
        raise IOError('ERROR: index file not found at ' + opts.index)

    if not path.exists(opts.fastq):
        raise IOError('ERROR: FASTQ file not found at ' + opts.fastq)

    if not is_fastq(opts.fastq):
        raise IOError(
            ('ERROR: FASTQ file %s wrong format, check') % (opts.fastq))

    # windows are given as "begin:end" strings
    try:
        opts.windows = [[int(i) for i in win.split(':')]
                        for win in opts.windows]
    except TypeError:
        # no window defined
        pass

    mkdir(opts.workdir)

    # write log
    log_format = '[MAPPING {} READ{}] %(message)s'.format(
        opts.fastq, opts.read)

    # reset logging
    logging.getLogger().handlers = []

    try:
        print('Writing log to ' + path.join(opts.workdir, 'process.log'))
        logging.basicConfig(level=logging.INFO,
                            format=log_format,
                            filename=path.join(opts.workdir, 'process.log'),
                            filemode='a+')
    except IOError:
        # fall back on a second log file if the first one cannot be opened
        logging.basicConfig(level=logging.DEBUG,
                            format=log_format,
                            filename=path.join(opts.workdir, 'process.log2'),
                            filemode='a+')

    # to display log on stdout also
    logging.getLogger().addHandler(logging.StreamHandler())

    # write version log, only if its content changed.
    # The whole file is compared with the string of versions (comparing the
    # list returned by readlines() with a string would always differ).
    vlog_path = path.join(opts.workdir,
                          'TADbit_and_dependencies_versions.log')
    dependencies = get_dependencies_version()
    previous_versions = None
    if path.exists(vlog_path):
        with open(vlog_path) as vlog:
            previous_versions = vlog.read()
    if previous_versions != dependencies:
        logging.info('Writing versions of TADbit and dependencies')
        with open(vlog_path, 'w') as vlog:
            vlog.write(dependencies)

    # check mapper extra options
    if opts.mapper_param:
        # a string containing '--' also contains '-': one test is enough
        if len(opts.mapper_param) == 1 and '-' in opts.mapper_param[0]:
            # Single string surrounded by quotes
            opts.mapper_param = opts.mapper_param[0].split()
        else:
            # list of "option:value" strings
            opts.mapper_param = dict(
                o.split(':') for o in opts.mapper_param)
    else:
        opts.mapper_param = {}
    if opts.mapper == 'gem' and opts.gem_version < 3:
        gem_valid_option = {
            "granularity", "q", "quality-format", "gem-quality-threshold",
            "mismatch-alphabet", "m", "e", "min-matched-bases",
            "max-big-indel-length", "s", "strata-after-best", "fast-mapping",
            "unique-mapping", "d", "D", "allow-incomplete-strata",
            "max-decoded-matches", "min-decoded-strata", "p",
            "paired-end-alignment", "b", "map-both-ends", "min-insert-size",
            "max-insert-size", "E", "max-extendable-matches",
            "max-extensions-per-match", "unique-pairing"
        }
        for k in opts.mapper_param:
            if k not in gem_valid_option:
                raise NotImplementedError(
                    ('ERROR: option "%s" not a valid GEM option '
                     'or not supported by this tool.') % k)

    # create empty DB if don't exists
    dbpath = path.join(opts.workdir, 'trace.db')
    open(dbpath, 'a').close()

    # for lustre file system....
    tmpdb_path = None
    if 'tmpdb' in opts and opts.tmpdb:
        # tmp file with a random name inside the given directory
        dbfile = 'trace_%s' % (''.join(
            [ascii_letters[int(random() * 52)] for _ in range(10)]))
        tmpdb_path = path.join(opts.tmpdb, dbfile)
        opts.tmpdb = tmpdb_path
        try:
            copyfile(dbpath, tmpdb_path)
        except IOError:
            # best effort: continue without the temporary copy
            pass

    # check if job already run using md5 digestion of parameters
    if already_run(opts):
        if tmpdb_path:
            try:
                remove(tmpdb_path)
            except OSError:
                # the temporary copy may not exist if the copy failed
                pass
        exit('WARNING: exact same job already computed, see JOBs table above')
def _remove_file(file_path):
    """
    Removes a file without going through a shell. As with ``rm -f``, a file
    that does not exist (or cannot be removed) is silently ignored.
    """
    try:
        os.remove(file_path)
    except OSError:
        pass


def full_mapping(gem_index_path, fastq_path, out_map_dir, r_enz=None,
                 frag_map=True, min_seq_len=15, windows=None, add_site=True,
                 clean=False, get_nread=False, **kwargs):
    """
    Maps FASTQ reads to an indexed reference genome. Mapping can be done either
    without knowledge of the restriction enzyme used, or for experiments
    performed without one, like Micro-C (iterative mapping), or using the
    ligation sites created from the digested ends (fragment-based mapping).

    :param gem_index_path: path to index file created from a reference genome
       using gem-index tool
    :param fastq_path: PATH to FASTQ file, either compressed or not.
    :param out_map_dir: path to a directory where to store mapped reads in MAP
       format.
    :param None r_enz: name of the restriction enzyme used in the experiment
       e.g. HindIII. This is optional if frag_map option is False
    :param True frag_map: two step mapper, first full length is mapped, then
       remaining, unmapped reads, are divided into restriction-enzyme
       fragments and each is mapped.
    :param True add_site: when splitting the sequence by ligated sites found,
       removes the ligation site, and put back the original RE site.
    :param 15 min_seq_len: minimum size of a fragment to map
    :param None windows: tuple of ranges for beginning and end of the
       mapping. This parameter allows to do classical iterative mapping, e.g.
       windows=((1,25),(1,30),(1,35),(1,40),(1,45),(1,50))
       A unique window can also be passed, for trimming, like this:
       windows=((1,101),)
    :param False clean: remove intermediate files created in temp_dir
    :param 8 nthreads: number of threads to use for mapping (number of CPUs)
    :param 0.04 max_edit_distance: The maximum number of edit operations
       allowed while verifying candidate matches by dynamic programming.
    :param 0.04 mismatches: The maximum number of nucleotide substitutions
       allowed while mapping each k-mer. It is always guaranteed that, however
       other options are chosen, all the matches up to the specified number of
       substitutions will be found by the program.
    :param /tmp temp_dir: important to change. Intermediate FASTQ files will be
       written there.
    :param False skip: passed to the FASTQ transformation, and used to skip
       the mapping and filtering steps.
    :param '' suffix: if not empty, appended (after an underscore) to the name
       of the generated files.
    :param False get_nread: returns a list of lists where each element contains
       a path and the number of reads processed

    :raises Exception: if frag_map is True and no restriction enzyme is given.

    :returns: a list of paths to generated outfiles. To be passed to
       :func:`pytadbit.parsers.map_parser.parse_map`
    """
    skip = kwargs.get('skip', False)
    suffix = kwargs.get('suffix', '')
    suffix = ('_' * (suffix != '')) + suffix
    nthreads = kwargs.get('nthreads', 8)
    outfiles = []
    temp_dir = os.path.abspath(
        os.path.expanduser(kwargs.get('temp_dir', gettempdir())))
    # create directories
    for rep in [temp_dir, out_map_dir]:
        mkdir(rep)
    # check space
    fspace = int(get_free_space_mb(temp_dir, div=3))
    if fspace < 200:
        warn('WARNING: only %d Gb left on tmp_dir: %s\n' % (fspace, temp_dir))

    # iterative mapping
    base_name = os.path.split(fastq_path)[-1].replace('.gz', '')
    base_name = '.'.join(base_name.split('.')[:-1])
    input_reads = fastq_path
    if windows is None:
        light_storage = True
        windows = (None, )
    elif isinstance(windows[0], int):
        # if windows starts at zero we do not need to store all the sequence
        # otherwise we need it because sequence can be trimmed two times
        # in fragment based mapping
        light_storage = not windows[0]
        windows = [tuple(windows)]
    else:
        # ensure that each element is a tuple, not a list
        windows = [tuple(win) for win in windows]
        # in this case we will need to keep the information about original
        # sequence at any point, light storage is thus not possible.
        light_storage = False
    for win in windows:
        # Prepare the FASTQ file and iterate over them
        curr_map, counter = transform_fastq(
            input_reads, mkstemp(prefix=base_name + '_', dir=temp_dir)[1],
            fastq=is_fastq(input_reads), min_seq_len=min_seq_len, trim=win,
            skip=skip, nthreads=nthreads, light_storage=light_storage)
        # clean
        if input_reads != fastq_path and clean:
            print(' x removing original input %s' % input_reads)
            _remove_file(input_reads)
        # First mapping, full length
        if not win:
            beg, end = 1, 'end'
        else:
            beg, end = win
        window_tag = '%s-%s%s' % (beg, end, suffix)
        out_map_path = curr_map + '_full_%s.map' % window_tag
        filt_path = curr_map + '_filt_%s.map' % window_tag
        full_out = os.path.join(out_map_dir,
                                base_name + '_full_%s.map' % window_tag)
        # without window "end" is always set to 'end': the window itself has
        # to be tested to tell apart full-read mapping
        if win and end:
            print('Mapping reads in window %s-%s%s...' % (beg, end, suffix))
        else:
            print('Mapping full reads... %s' % curr_map)

        if not skip:
            _gem_mapping(gem_index_path, curr_map, out_map_path, **kwargs)
            # parse map file to extract not uniquely mapped reads
            print('Parsing result...')
            _gem_filter(out_map_path, filt_path, full_out)
            # clean
            if clean:
                print(' x removing GEM input %s' % curr_map)
                _remove_file(curr_map)
                print(' x removing map %s' % out_map_path)
                _remove_file(out_map_path)
            # for next round, we will use remaining unmapped reads
            input_reads = filt_path
        outfiles.append((full_out, counter))

    # map again splitting unmapped reads into RE fragments
    # (no need to trim this time)
    if frag_map:
        if not r_enz:
            raise Exception('ERROR: need enzyme name to fragment.')
        # path to the fragmented reads (kept apart from the frag_map option)
        frag_reads, counter = transform_fastq(
            input_reads, mkstemp(prefix=base_name + '_', dir=temp_dir)[1],
            min_seq_len=min_seq_len, trim=win, fastq=False, r_enz=r_enz,
            add_site=add_site, skip=skip, nthreads=nthreads,
            light_storage=light_storage)
        # clean
        if clean:
            print(' x removing pre-GEM input %s' % input_reads)
            _remove_file(input_reads)
        if not win:
            beg, end = 1, 'end'
        else:
            beg, end = win
        window_tag = '%s-%s%s' % (beg, end, suffix)
        out_map_path = frag_reads + '_frag_%s.map' % window_tag
        fail_path = curr_map + '_fail%s.map' % (suffix)
        frag_out = os.path.join(out_map_dir,
                                base_name + '_frag_%s.map' % window_tag)
        if not skip:
            print('Mapping fragments of remaining reads...')
            _gem_mapping(gem_index_path, frag_reads, out_map_path, **kwargs)
            print('Parsing result...')
            _gem_filter(out_map_path, fail_path, frag_out)
            # clean
            if clean:
                print(' x removing GEM input %s' % frag_reads)
                _remove_file(frag_reads)
                print(' x removing failed to map ' + fail_path)
                _remove_file(fail_path)
                print(' x removing tmp mapped %s' % out_map_path)
                _remove_file(out_map_path)
        outfiles.append((frag_out, counter))
    if get_nread:
        return outfiles
    return [out for out, _ in outfiles]
def _remove_file(file_path):
    """
    Removes a file without going through a shell. As with ``rm -f``, a file
    that does not exist (or cannot be removed) is silently ignored.
    """
    try:
        os.remove(file_path)
    except OSError:
        pass


def fast_fragment_mapping(mapper_index_path, fastq_path1, fastq_path2, r_enz,
                          genome_seq, out_map, clean=True, get_nread=False,
                          mapper_binary=None, mapper_params=None,
                          samtools='samtools', **kwargs):
    """
    Maps FASTQ reads to an indexed reference genome with the knowledge of
    the restriction enzyme used (fragment-based mapping).

    :param mapper_index_path: path to index file created from a reference
       genome using gem-index tool, bowtie2-build or hisat2-build
    :param fastq_path1: PATH to FASTQ file of read 1, either compressed or not.
    :param fastq_path2: PATH to FASTQ file of read 2, either compressed or not.
    :param r_enz: name of the restriction enzyme used in the experiment e.g.
       HindIII.
    :param genome_seq: a dictionary generated by
       :func:`pytadbit.parser.genome_parser.parse_fasta`, containing the
       genomic sequence
    :param out_map: path to outfile tab separated format containing mapped
       read information. Its directory must exist.
    :param True clean: remove intermediate files created in temp_dir
    :param False get_nread: returns a list of lists where each element contains
       a path and the number of reads processed
    :param 8 nthreads: number of threads to use for mapping (number of CPUs)
    :param /tmp temp_dir: important to change. Intermediate FASTQ files will be
       written there.
    :param gem-mapper mapper_binary: path to the binary mapper
    :param None mapper_params: extra parameters for the mapper
    :param samtools samtools: path to samtools binary.
    :param 100000 frag_chunk: passed to the function listing the restriction
       enzyme sites.

    :raises Exception: if the directory of the output file does not exist, or
       if the GEM (version 3 at least) or samtools binaries are not found.

    :returns: outfile with the intersected read pairs (inside a list, together
       with the number of reads, if get_nread is True)
    """
    suffix = kwargs.get('suffix', '')
    suffix = ('_' * (suffix != '')) + suffix
    nthreads = kwargs.get('nthreads', 8)
    samtools = which(samtools)
    # samtools is always needed to sort the mapped reads: fail early with a
    # clear message instead of a TypeError when building the command
    if not samtools:
        raise Exception('\n\nERROR: samtools binary not found, it is needed '
                        'to sort the mapped reads.\n')
    # check out folder
    if not os.path.isdir(os.path.dirname(os.path.abspath(out_map))):
        raise Exception(
            '\n\nERROR: Path to store the output does not exist.\n')
    temp_dir = os.path.abspath(
        os.path.expanduser(kwargs.get('temp_dir', gettempdir())))
    gem_version = None
    # check that we have the GEM binary:
    gem_binary = mapper_binary or 'gem-mapper'
    gem_binary = which(gem_binary)
    if not gem_binary:
        raise Exception('\n\nERROR: GEM v3 binary not found, install it from:'
                        '\nhttps://github.com/smarco/gem3-mapper'
                        '\nCopy the binary gem-mapper to /usr/local/bin/ for '
                        'example (somewhere in your PATH).\n')
    try:
        out, _ = Popen([gem_binary, '--version'], stdout=PIPE, stderr=STDOUT,
                       universal_newlines=True).communicate()
        # the second character of the output is taken as major version
        gem_version = int(out[1])
    except (ValueError, IndexError):
        # unexpected (or empty) output: assume the old GEM version
        gem_version = 2
        print('Falling to gem v2')
    if gem_version < 3:
        raise Exception('\n\nERROR: GEM v3 binary not found, install it from:'
                        '\nhttps://github.com/smarco/gem3-mapper'
                        '\nCopy the binary gem-mapper to /usr/local/bin/ for '
                        'example (somewhere in your PATH).\n')
    if mapper_params:
        kwargs.update(mapper_params)
    # create directories
    mkdir(temp_dir)
    # check space
    fspace = int(get_free_space_mb(temp_dir, div=3))
    if fspace < 200:
        warn('WARNING: only %d Gb left on tmp_dir: %s\n' % (fspace, temp_dir))

    # prepare the input of the mapper for both reads
    base_name1 = os.path.split(fastq_path1)[-1].replace('.gz', '')
    base_name1 = '.'.join(base_name1.split('.')[:-1])
    curr_map1, _ = transform_fastq(
        fastq_path1, mkstemp(prefix=base_name1 + '_', dir=temp_dir)[1],
        fastq=is_fastq(fastq_path1), nthreads=nthreads, light_storage=True)

    base_name2 = os.path.split(fastq_path2)[-1].replace('.gz', '')
    base_name2 = '.'.join(base_name2.split('.')[:-1])
    # the format of the second file is checked on the second file itself
    curr_map2, count_fastq = transform_fastq(
        fastq_path2, mkstemp(prefix=base_name2 + '_', dir=temp_dir)[1],
        fastq=is_fastq(fastq_path2), nthreads=nthreads, light_storage=True)

    out_map_path = curr_map1 + '_frag%s.map' % (suffix)

    print('Mapping fragments of remaining reads...')
    _gem_mapping(mapper_index_path, curr_map1, out_map_path,
                 fastq_path2=curr_map2, r_enz=r_enz, gem_binary=gem_binary,
                 gem_version=gem_version, **kwargs)
    # clean
    if clean:
        print(' x removing GEM 3 input %s' % (curr_map1))
        _remove_file(curr_map1)
        print(' x removing GEM 3 input %s' % (curr_map2))
        _remove_file(curr_map2)

    # sort sam file by read name (in place)
    os.system(samtools + ' sort -n -O SAM -@ %d -T %s -o %s %s' %
              (nthreads, out_map_path, out_map_path, out_map_path))
    genome_lengths = dict((crm, len(genome_seq[crm])) for crm in genome_seq)
    frag_chunk = kwargs.get('frag_chunk', 100000)
    frags = map_re_sites(r_enz, genome_seq, frag_chunk=frag_chunk)
    if nthreads > 1:
        print('Splitting sam file')
        # headers: one chunk file per thread, each starting with the header
        for i in range(nthreads):
            os.system(samtools + ' view -H -O SAM %s > "%s_%d"' %
                      (out_map_path, out_map_path, (i + 1)))
        # estimate lines in sam with reads and frags
        chunk_lines = int((count_fastq * 2.3) / nthreads)
        # a new chunk is started only when the read name changes, in order to
        # keep all the lines of a read in the same chunk.
        # NOTE(review): if the estimate above is too low, awk writes more
        # chunks than threads and the extra ones are never parsed -- confirm
        os.system(samtools + ''' view -O SAM %s | awk -v n=%d -v FS="\\t" '
  BEGIN { part=0; line=n }
  { if( line>=n && $1!=last_read ) {part++; line=1; print $0 >> "%s_"part }
    else { print $0 >> "%s_"part; line++; }
    last_read = $1;
  }'
''' % (out_map_path, chunk_lines, out_map_path, out_map_path))
        if clean:
            print(' x removing tmp mapped %s' % out_map_path)
            _remove_file(out_map_path)
        print('Parsing results...')
        # each chunk is parsed by a single process
        kwargs['nthreads'] = 1
        procs = []
        pool = mu.Pool(nthreads)
        for i in range(nthreads):
            frags_shared = copy.deepcopy(frags)
            procs.append(
                pool.apply_async(parse_gem_3c,
                                 args=('%s_%d' % (out_map_path, (i + 1)),
                                       '%s_parsed_%d' % (out_map_path,
                                                         (i + 1)),
                                       copy.deepcopy(genome_lengths),
                                       frags_shared, False, True),
                                 kwds=kwargs))
        pool.close()
        pool.join()
        # chunks for which the parser returned nothing are discarded
        parsed = [proc.get() for proc in procs]
        results = [res for res in parsed if res]
        if clean:
            for i in range(nthreads):
                print(' x removing tmp mapped %s_%d' % (out_map_path,
                                                        (i + 1)))
                _remove_file('%s_%d' % (out_map_path, (i + 1)))

        # Final sort and merge: files are merged by pairs until one is left
        nround = 0
        while len(results) > 1:
            nround += 1
            num_procs = min(nthreads, len(results) // 2)
            pool = mu.Pool(num_procs)
            procs = [
                pool.apply_async(merge_sort,
                                 (results.pop(0), results.pop(0),
                                  out_map_path + '_%d' % nround, i, True))
                for i in range(num_procs)
            ]
            pool.close()
            pool.join()
            merged = [proc.get() for proc in procs]
            # files left unpaired in this round (odd number of files) are
            # kept for the next round instead of being lost
            results = [res for res in merged if res] + results

        with open(out_map, 'w') as map_out:
            for crm in genome_seq:
                map_out.write('# CRM %s\t%d\n' % (crm, len(genome_seq[crm])))
            # nothing to copy if no chunk produced any result
            if results:
                with open(results[0], 'r') as tmp_reads_fh:
                    for read_line in tmp_reads_fh:
                        read = read_line.split('\t')
                        # columns 1 and 8 of the temporary format are dropped
                        map_out.write('\t'.join([read[0]] + read[2:8] +
                                                read[9:]))
        if clean and results:
            print(' x removing tmp mapped %s' % results[0])
            _remove_file(results[0])

    else:
        print('Parsing result...')
        parse_gem_3c(out_map_path, out_map, genome_lengths, frags,
                     verbose=False, tmp_format=False, **kwargs)

        # clean
        if clean:
            print(' x removing tmp mapped %s' % out_map_path)
            _remove_file(out_map_path)

    if get_nread:
        return [(out_map, count_fastq)]
    return out_map
def _remove_file(file_path):
    """
    Removes a file without going through a shell. As with ``rm -f``, a file
    that does not exist (or cannot be removed) is silently ignored.
    """
    try:
        os.remove(file_path)
    except OSError:
        pass


def full_mapping(mapper_index_path, fastq_path, out_map_dir, mapper='gem',
                 r_enz=None, frag_map=True, min_seq_len=15, windows=None,
                 add_site=True, clean=False, get_nread=False,
                 mapper_binary=None, mapper_params=None, **kwargs):
    """
    Maps FASTQ reads to an indexed reference genome. Mapping can be done either
    without knowledge of the restriction enzyme used, or for experiments
    performed without one, like Micro-C (iterative mapping), or using the
    ligation sites created from the digested ends (fragment-based mapping).

    :param mapper_index_path: path to index file created from a reference
       genome using gem-index tool or bowtie2-build
    :param fastq_path: PATH to FASTQ file, either compressed or not.
    :param out_map_dir: path to a directory where to store mapped reads in MAP
       format.
    :param gem mapper: mapper to use, either 'gem' or 'bowtie2'
    :param None r_enz: name of the restriction enzyme used in the experiment
       e.g. HindIII. This is optional if frag_map option is False
    :param True frag_map: two step mapper, first full length is mapped, then
       remaining, unmapped reads, are divided into restriction-enzyme
       fragments and each is mapped.
    :param True add_site: when splitting the sequence by ligated sites found,
       removes the ligation site, and put back the original RE site.
    :param 15 min_seq_len: minimum size of a fragment to map
    :param None windows: tuple of ranges for beginning and end of the
       mapping. This parameter allows to do classical iterative mapping, e.g.
       windows=((1,25),(1,30),(1,35),(1,40),(1,45),(1,50))
       A unique window can also be passed, for trimming, like this:
       windows=((1,101),)
    :param False clean: remove intermediate files created in temp_dir
    :param 8 nthreads: number of threads to use for mapping (number of CPUs)
    :param 0.04 max_edit_distance: The maximum number of edit operations
       allowed while verifying candidate matches by dynamic programming.
    :param 0.04 mismatches: The maximum number of nucleotide substitutions
       allowed while mapping each k-mer. It is always guaranteed that, however
       other options are chosen, all the matches up to the specified number of
       substitutions will be found by the program.
    :param /tmp temp_dir: important to change. Intermediate FASTQ files will be
       written there.
    :param False skip: passed to the FASTQ transformation, and used to skip
       the mapping and filtering steps.
    :param '' suffix: if not empty, appended (after an underscore) to the name
       of the generated files.
    :param False get_nread: returns a list of lists where each element contains
       a path and the number of reads processed
    :param None mapper_binary: path to the binary mapper (by default
       'gem-mapper' or 'bowtie2' depending on the mapper)
    :param None mapper_params: extra parameters for the mapper

    :raises Exception: if the mapper is unknown, or if frag_map is True and
       no restriction enzyme is given.

    :returns: a list of paths to generated outfiles. To be passed to
       :func:`pytadbit.parsers.map_parser.parse_map`
    """
    skip = kwargs.get('skip', False)
    suffix = kwargs.get('suffix', '')
    suffix = ('_' * (suffix != '')) + suffix
    nthreads = kwargs.get('nthreads', 8)
    outfiles = []
    temp_dir = os.path.abspath(os.path.expanduser(
        kwargs.get('temp_dir', gettempdir())))
    # only a dictionary of parameters can be merged with the keyword arguments
    if mapper_params and isinstance(mapper_params, dict):
        kwargs.update(mapper_params)
    # create directories
    for rep in [temp_dir, out_map_dir]:
        mkdir(rep)
    # check space
    fspace = int(get_free_space_mb(temp_dir, div=3))
    if fspace < 200:
        warn('WARNING: only %d Gb left on tmp_dir: %s\n' % (fspace, temp_dir))

    # iterative mapping
    base_name = os.path.split(fastq_path)[-1].replace('.gz', '')
    base_name = '.'.join(base_name.split('.')[:-1])
    input_reads = fastq_path
    if windows is None:
        light_storage = True
        windows = (None, )
    elif isinstance(windows[0], int):
        # if windows starts at zero we do not need to store all the sequence
        # otherwise we need it because sequence can be trimmed two times
        # in fragment based mapping
        light_storage = not windows[0]
        windows = [tuple(windows)]
    else:
        # ensure that each element is a tuple, not a list
        windows = [tuple(win) for win in windows]
        # in this case we will need to keep the information about original
        # sequence at any point, light storage is thus not possible.
        light_storage = False
    for win in windows:
        # Prepare the FASTQ file and iterate over them
        curr_map, counter = transform_fastq(
            input_reads, mkstemp(prefix=base_name + '_', dir=temp_dir)[1],
            fastq=is_fastq(input_reads),
            min_seq_len=min_seq_len, trim=win, skip=skip, nthreads=nthreads,
            light_storage=light_storage)
        # clean
        if input_reads != fastq_path and clean:
            print(' x removing original input %s' % input_reads)
            _remove_file(input_reads)
        # First mapping, full length
        if not win:
            beg, end = 1, 'end'
        else:
            beg, end = win
        window_tag = '%s-%s%s' % (beg, end, suffix)
        out_map_path = curr_map + '_full_%s.map' % window_tag
        filt_path = curr_map + '_filt_%s.map' % window_tag
        full_out = os.path.join(out_map_dir,
                                base_name + '_full_%s.map' % window_tag)
        # without window "end" is always set to 'end': the window itself has
        # to be tested to tell apart full-read mapping
        if win and end:
            print('Mapping reads in window %s-%s%s...' % (beg, end, suffix))
        else:
            print('Mapping full reads... %s' % curr_map)

        if not skip:
            if mapper == 'gem':
                _gem_mapping(mapper_index_path, curr_map, out_map_path,
                             gem_binary=(mapper_binary or 'gem-mapper'),
                             **kwargs)
                # parse map file to extract not uniquely mapped reads
                print('Parsing result...')
                _gem_filter(out_map_path, filt_path, full_out)
            elif mapper == 'bowtie2':
                _bowtie2_mapping(mapper_index_path, curr_map, out_map_path,
                                 bowtie2_binary=(mapper_binary or 'bowtie2'),
                                 bowtie2_params=mapper_params, **kwargs)
                # parse map file to extract not uniquely mapped reads
                print('Parsing result...')
                _bowtie2_filter(out_map_path, curr_map, filt_path, full_out)
            else:
                raise Exception('ERROR: unknown mapper.')
            # clean
            if clean:
                print(' x removing %s input %s' % (mapper.upper(), curr_map))
                _remove_file(curr_map)
                print(' x removing map %s' % out_map_path)
                _remove_file(out_map_path)
            # for next round, we will use remaining unmapped reads
            input_reads = filt_path
        outfiles.append((full_out, counter))

    # map again splitting unmapped reads into RE fragments
    # (no need to trim this time)
    if frag_map:
        if not r_enz:
            raise Exception('ERROR: need enzyme name to fragment.')
        # path to the fragmented reads (kept apart from the frag_map option)
        frag_reads, counter = transform_fastq(
            input_reads, mkstemp(prefix=base_name + '_', dir=temp_dir)[1],
            min_seq_len=min_seq_len, trim=win, fastq=False, r_enz=r_enz,
            add_site=add_site, skip=skip, nthreads=nthreads,
            light_storage=light_storage)
        # clean
        if clean:
            print(' x removing pre-%s input %s' % (mapper.upper(),
                                                   input_reads))
            _remove_file(input_reads)
        if not win:
            beg, end = 1, 'end'
        else:
            beg, end = win
        window_tag = '%s-%s%s' % (beg, end, suffix)
        out_map_path = frag_reads + '_frag_%s.map' % window_tag
        fail_path = curr_map + '_fail%s.map' % (suffix)
        frag_out = os.path.join(out_map_dir,
                                base_name + '_frag_%s.map' % window_tag)
        if not skip:
            if mapper == 'gem':
                print('Mapping fragments of remaining reads...')
                _gem_mapping(mapper_index_path, frag_reads, out_map_path,
                             gem_binary=(mapper_binary or 'gem-mapper'),
                             **kwargs)
                print('Parsing result...')
                _gem_filter(out_map_path, fail_path, frag_out)
            elif mapper == 'bowtie2':
                print('Mapping fragments of remaining reads...')
                _bowtie2_mapping(mapper_index_path, frag_reads, out_map_path,
                                 bowtie2_binary=(mapper_binary or 'bowtie2'),
                                 bowtie2_params=mapper_params, **kwargs)
                print('Parsing result...')
                _bowtie2_filter(out_map_path, frag_reads, fail_path, frag_out)
            else:
                raise Exception('ERROR: unknown mapper.')
            # clean
            if clean:
                print(' x removing %s input %s' % (mapper.upper(),
                                                   frag_reads))
                _remove_file(frag_reads)
                print(' x removing failed to map ' + fail_path)
                _remove_file(fail_path)
                print(' x removing tmp mapped %s' % out_map_path)
                _remove_file(out_map_path)
        outfiles.append((frag_out, counter))
    if get_nread:
        return outfiles
    return [out for out, _ in outfiles]
def _remove_file(file_path):
    """
    Removes a file without going through a shell. As with ``rm -f``, a file
    that does not exist (or cannot be removed) is silently ignored.
    """
    try:
        os.remove(file_path)
    except OSError:
        pass


def full_mapping(mapper_index_path, fastq_path, out_map_dir, mapper='gem',
                 r_enz=None, frag_map=True, min_seq_len=15, windows=None,
                 add_site=True, clean=False, get_nread=False,
                 mapper_binary=None, mapper_params=None, **kwargs):
    """
    Maps FASTQ reads to an indexed reference genome. Mapping can be done either
    without knowledge of the restriction enzyme used, or for experiments
    performed without one, like Micro-C (iterative mapping), or using the
    ligation sites created from the digested ends (fragment-based mapping).

    :param mapper_index_path: path to index file created from a reference
       genome using gem-index tool or bowtie2-build
    :param fastq_path: PATH to FASTQ file, either compressed or not.
    :param out_map_dir: path to a directory where to store mapped reads in MAP
       format.
    :param gem mapper: mapper to use: 'gem', 'bowtie2' or 'hisat2'
    :param None r_enz: name of the restriction enzyme used in the experiment
       e.g. HindIII. This is optional if frag_map option is False
    :param True frag_map: two step mapper, first full length is mapped, then
       remaining, unmapped reads, are divided into restriction-enzyme
       fragments and each is mapped.
    :param True add_site: when splitting the sequence by ligated sites found,
       removes the ligation site, and put back the original RE site.
    :param 15 min_seq_len: minimum size of a fragment to map
    :param None windows: tuple of ranges for beginning and end of the
       mapping. This parameter allows to do classical iterative mapping, e.g.
       windows=((1,25),(1,30),(1,35),(1,40),(1,45),(1,50))
       A unique window can also be passed, for trimming, like this:
       windows=((1,101),)
    :param False clean: remove intermediate files created in temp_dir, and
       temp_dir itself at the end (unless it is the system temporary
       directory)
    :param 8 nthreads: number of threads to use for mapping (number of CPUs)
    :param 0.04 max_edit_distance: The maximum number of edit operations
       allowed while verifying candidate matches by dynamic programming.
    :param 0.04 mismatches: The maximum number of nucleotide substitutions
       allowed while mapping each k-mer. It is always guaranteed that, however
       other options are chosen, all the matches up to the specified number of
       substitutions will be found by the program.
    :param /tmp temp_dir: important to change. Intermediate FASTQ files will be
       written there.
    :param False skip: passed to the FASTQ transformation, and used to skip
       the mapping and filtering steps.
    :param '' suffix: if not empty, appended (after an underscore) to the name
       of the generated files.
    :param False get_nread: returns a list of lists where each element contains
       a path and the number of reads processed
    :param None mapper_binary: path to the binary mapper (by default
       'gem-mapper' for GEM, the name of the mapper otherwise)
    :param None mapper_params: extra parameters for the mapper

    :raises Exception: if the GEM binary is not found, if the mapper is
       unknown, or if frag_map is True and no restriction enzyme is given.

    :returns: a list of paths to generated outfiles. To be passed to
       :func:`pytadbit.parsers.map_parser.parse_map`
    """
    import shutil

    skip = kwargs.get('skip', False)
    suffix = kwargs.get('suffix', '')
    suffix = ('_' * (suffix != '')) + suffix
    nthreads = kwargs.get('nthreads', 8)
    outfiles = []
    temp_dir = os.path.abspath(
        os.path.expanduser(kwargs.get('temp_dir', gettempdir())))
    if mapper == 'gem':
        gem_version = None
        # check that we have the GEM binary:
        gem_binary = mapper_binary or 'gem-mapper'
        gem_binary = which(gem_binary)
        if not gem_binary:
            raise Exception(
                '\n\nERROR: GEM binary not found, install it from:'
                '\nhttps://sourceforge.net/projects/gemlibrary/files/gem-library/Binary%20pre-release%202/'
                '\n - Download the GEM-binaries-Linux-x86_64-core_i3 if '
                'you have a recent computer, the '
                'GEM-binaries-Linux-x86_64-core_2 otherwise\n - '
                'Uncompress with "tar xjvf GEM-binaries-xxx.tbz2"\n - '
                'Copy the binary gem-mapper to /usr/local/bin/ for '
                'example (somewhere in your PATH).\n\nNOTE: GEM does '
                'not provide any binary for MAC-OS.')
        try:
            out, _ = Popen([gem_binary, '--version'], stdout=PIPE,
                           stderr=STDOUT,
                           universal_newlines=True).communicate()
            # the second character of the output is taken as major version
            gem_version = int(out[1])
        except (ValueError, IndexError):
            # unexpected (or empty) output: assume the old GEM version
            gem_version = 2
            print('Falling to gem v2')
    # only a dictionary of parameters can be merged with the keyword arguments
    if mapper_params and isinstance(mapper_params, dict):
        kwargs.update(mapper_params)
    # create directories
    for rep in [temp_dir, out_map_dir]:
        mkdir(rep)
    # check space
    fspace = int(get_free_space_mb(temp_dir, div=3))
    if fspace < 200:
        warn('WARNING: only %d Gb left on tmp_dir: %s\n' % (fspace, temp_dir))

    # iterative mapping
    base_name = os.path.split(fastq_path)[-1].replace('.gz', '')
    base_name = '.'.join(base_name.split('.')[:-1])
    input_reads = fastq_path
    if windows is None:
        light_storage = True
        windows = (None, )
    elif isinstance(windows[0], int):
        # if windows starts at zero we do not need to store all the sequence
        # otherwise we need it because sequence can be trimmed two times
        # in fragment based mapping
        light_storage = not windows[0]
        windows = [tuple(windows)]
    else:
        # ensure that each element is a tuple, not a list
        windows = [tuple(win) for win in windows]
        # in this case we will need to keep the information about original
        # sequence at any point, light storage is thus not possible.
        light_storage = False
    for win in windows:
        # Prepare the FASTQ file and iterate over them
        curr_map, counter = transform_fastq(
            input_reads, mkstemp(prefix=base_name + '_', dir=temp_dir)[1],
            fastq=is_fastq(input_reads), min_seq_len=min_seq_len, trim=win,
            skip=skip, nthreads=nthreads, light_storage=light_storage)
        # clean
        if input_reads != fastq_path and clean:
            print(' x removing original input %s' % input_reads)
            _remove_file(input_reads)
        # First mapping, full length
        if not win:
            beg, end = 1, 'end'
        else:
            beg, end = win
        window_tag = '%s-%s%s' % (beg, end, suffix)
        out_map_path = curr_map + '_full_%s.map' % window_tag
        filt_path = curr_map + '_filt_%s.map' % window_tag
        full_out = os.path.join(out_map_dir,
                                base_name + '_full_%s.map' % window_tag)
        # without window "end" is always set to 'end': the window itself has
        # to be tested to tell apart full-read mapping
        if win and end:
            print('Mapping reads in window %s-%s%s...' % (beg, end, suffix))
        else:
            print('Mapping full reads...', curr_map)

        if not skip:
            if mapper == 'gem':
                _gem_mapping(mapper_index_path, curr_map, out_map_path,
                             gem_binary=gem_binary, gem_version=gem_version,
                             gem_params=mapper_params, **kwargs)
                # parse map file to extract not uniquely mapped reads
                print('Parsing result...')
                # the SAM filter is used for GEM version 3 and above
                if gem_version >= 3:
                    _sam_filter(out_map_path, curr_map, filt_path, full_out)
                else:
                    _gem_filter(out_map_path, filt_path, full_out)
            elif mapper == 'bowtie2' or mapper == 'hisat2':
                _bowtie2_mapping(mapper_index_path, curr_map, out_map_path,
                                 bowtie2_binary=(mapper_binary or mapper),
                                 bowtie2_params=mapper_params, **kwargs)
                # parse map file to extract not uniquely mapped reads
                print('Parsing result...')
                _sam_filter(out_map_path, curr_map, filt_path, full_out)
            else:
                raise Exception('ERROR: unknown mapper.')
            # clean
            if clean:
                print(' x removing %s input %s' % (mapper.upper(), curr_map))
                _remove_file(curr_map)
                print(' x removing map %s' % out_map_path)
                _remove_file(out_map_path)
            # for next round, we will use remaining unmapped reads
            input_reads = filt_path
        outfiles.append((full_out, counter))

    # map again splitting unmapped reads into RE fragments
    # (no need to trim this time)
    if frag_map:
        if not r_enz:
            raise Exception('ERROR: need enzyme name to fragment.')
        # path to the fragmented reads (kept apart from the frag_map option)
        frag_reads, counter = transform_fastq(
            input_reads, mkstemp(prefix=base_name + '_', dir=temp_dir)[1],
            min_seq_len=min_seq_len, trim=win, fastq=False, r_enz=r_enz,
            add_site=add_site, skip=skip, nthreads=nthreads,
            light_storage=light_storage)
        # clean
        if clean:
            print(' x removing pre-%s input %s' % (mapper.upper(),
                                                   input_reads))
            _remove_file(input_reads)
        if not win:
            beg, end = 1, 'end'
        else:
            beg, end = win
        window_tag = '%s-%s%s' % (beg, end, suffix)
        out_map_path = frag_reads + '_frag_%s.map' % window_tag
        fail_path = curr_map + '_fail%s.map' % (suffix)
        frag_out = os.path.join(out_map_dir,
                                base_name + '_frag_%s.map' % window_tag)
        if not skip:
            if mapper == 'gem':
                print('Mapping fragments of remaining reads...')
                # NOTE(review): unlike the full-length step, gem_params is
                # not passed here -- confirm this is intended
                _gem_mapping(mapper_index_path, frag_reads, out_map_path,
                             gem_binary=gem_binary, gem_version=gem_version,
                             **kwargs)
                print('Parsing result...')
                # check if output is sam format for gem3
                if gem_version >= 3:
                    _sam_filter(out_map_path, frag_reads, fail_path, frag_out)
                else:
                    _gem_filter(out_map_path, fail_path, frag_out)
            elif mapper == 'bowtie2' or mapper == 'hisat2':
                print('Mapping fragments of remaining reads...')
                _bowtie2_mapping(mapper_index_path, frag_reads, out_map_path,
                                 bowtie2_binary=(mapper_binary or mapper),
                                 bowtie2_params=mapper_params, **kwargs)
                print('Parsing result...')
                _sam_filter(out_map_path, frag_reads, fail_path, frag_out)
            else:
                raise Exception('ERROR: unknown mapper.')
            # clean
            if clean:
                print(' x removing %s input %s' % (mapper.upper(),
                                                   frag_reads))
                _remove_file(frag_reads)
                print(' x removing failed to map ' + fail_path)
                _remove_file(fail_path)
                print(' x removing tmp mapped %s' % out_map_path)
                _remove_file(out_map_path)
        outfiles.append((frag_out, counter))
    if clean:
        # temp_dir defaults to the system temporary directory, which must
        # never be wiped: only a dedicated directory is removed
        if os.path.realpath(temp_dir) != os.path.realpath(gettempdir()):
            shutil.rmtree(temp_dir, ignore_errors=True)
    if get_nread:
        return outfiles
    return [out for out, _ in outfiles]