def test_async_mapper_execution():
    """Run the mapper without an output file and count the mappings."""
    reads = files.open(testfiles["reads_1.fastq"])
    mappings = gem.mapper(reads, index)
    assert mappings is not None
    assert mappings.process is not None
    # No output path was given, so no filename is expected on the result.
    assert mappings.filename is None
    n_mappings = 0
    for _ in mappings:
        n_mappings += 1
    assert n_mappings == 10000
def gem_mapping(gem_index_path, fastq_path, out_map_path, **kwargs):
    """
    Map the reads of a FASTQ file against a GEM index with ``gem.mapper``.

    The three paths are expanded (``~``) and made absolute before use.

    :param gem_index_path: path to the GEM index file
    :param fastq_path: path to the input FASTQ file
    :param out_map_path: path passed to the mapper as its ``output``
    :param 8 nthreads: number of threads passed to the mapper
    :param 0.04 max_edit_distance: forwarded unchanged to ``gem.mapper``
    :param 0.04 mismatches: forwarded unchanged to ``gem.mapper``
    :param 33 quality: set it to 'ignore' in order to speed-up the mapping

    :returns: the object returned by ``gem.mapper``

    NOTE: earlier documentation advertised a ``focus`` option (trimming of
    the input sequences); this function does not read it.
    """
    gem_index_path = os.path.abspath(os.path.expanduser(gem_index_path))
    fastq_path = os.path.abspath(os.path.expanduser(fastq_path))
    out_map_path = os.path.abspath(os.path.expanduser(out_map_path))
    nthreads = kwargs.get('nthreads', 8)
    max_edit_distance = kwargs.get('max_edit_distance', 0.04)
    mismatches = kwargs.get('mismatches', 0.04)
    quality = kwargs.get('quality', 33)

    # check kwargs
    # 'quality' is consumed above, so it must not be reported as a probable
    # typo.  The last three names are not used here but were already
    # accepted silently; they stay so existing callers see no new warning.
    known_keywords = ('nthreads', 'max_edit_distance', 'mismatches',
                      'quality', 'max_reads_per_chunk', 'out_files',
                      'temp_dir')
    for kw in kwargs:
        if kw not in known_keywords:
            warn('WARNING: %s not is usual keywords, misspelled?' % kw)

    # input
    inputf = gem.files.open(fastq_path)

    # mapping
    # Single-argument form: prints the same text on Python 2 and Python 3.
    print('TO GEM %s' % fastq_path)
    return gem.mapper(inputf, gem_index_path, min_decoded_strata=0,
                      max_decoded_matches=1, unique_mapping=False,
                      max_edit_distance=max_edit_distance,
                      mismatches=mismatches, quality=quality,
                      output=out_map_path, threads=nthreads)
def test_quality_pass_on_execution():
    """Check the quality value exposed by the mapper result."""
    reads = files.open(testfiles["reads_1.fastq"])
    map_path = results_dir + "/quality_passon_mapping.map"
    mappings = gem.mapper(reads, index, output=map_path)
    found = mappings.quality
    failure = "Quality should be 'offset-33' but is %s" % (str(found))
    assert found == "offset-33", failure
def test_interleaved_pair_aligner_run():
    """Map two interleaved read files, pair-align them and count records."""
    first = files.open(testfiles["reads_1.fastq"])
    second = files.open(testfiles["reads_2.fastq"])
    interleaved = filter.interleave([first, second])
    mappings = gem.mapper(interleaved, index)
    paired = gem.pairalign(mappings, index)
    assert paired is not None
    # test dataset does not pair at all
    n_records = 0
    for _ in paired:
        n_records += 1
    assert n_records == 20000
def test_sync_mapper_execution():
    """Run the mapper with an output path and check filename and count."""
    reads = files.open(testfiles["reads_1.fastq"])
    mappings = gem.mapper(reads, index, results_dir + "/result.mapping")
    assert mappings is not None
    assert mappings.process is not None
    # The result must report exactly the output path that was requested.
    assert mappings.filename is not None
    assert mappings.filename == results_dir + "/result.mapping"
    n_mappings = 0
    for _ in mappings:
        n_mappings += 1
    assert n_mappings == 10000
def test_gem2sam_execution_to_file():
    """Convert mappings to SAM with an output path; the file must exist."""
    reads = gem.files.open(testfiles["reads_1.fastq"])
    sam_path = results_dir + "/test_sam.sam"
    mappings = gem.mapper(reads, index)
    sam = gem.gem2sam(mappings, index, output=sam_path, compact=True)
    assert sam is not None
    assert sam.process is not None
    assert sam.filename == sam_path
    assert os.path.exists(sam_path)
def test_interleaved_mapper_run():
    """Map two interleaved read files and count the resulting mappings."""
    first = files.open(testfiles["reads_1.fastq"])
    second = files.open(testfiles["reads_2.fastq"])
    interleaved = filter.interleave([first, second])
    mappings = gem.mapper(interleaved, index)
    assert mappings is not None
    assert mappings.process is not None
    assert mappings.filename is None
    n_mappings = 0
    for _ in mappings:
        n_mappings += 1
    assert n_mappings == 20000
def test_gem2sam_sam2bam():
    """Chain mapper -> gem2sam -> sam2bam and count records in the BAM."""
    reads = gem.files.open(testfiles["reads_1.fastq"])
    mappings = gem.mapper(reads, index)
    sam = gem.gem2sam(mappings, index, compact=True)
    bam_path = results_dir + "/test_sam.bam"
    # The returned object is not inspected; only the written file is checked.
    converted = gem.sam2bam(sam, output=bam_path)
    assert os.path.exists(bam_path)
    count = sum(1 for _ in gem.files.open(bam_path))
    assert count == 10000, "Count 10000!=%d" % count
def test_gem2sam_execution():
    """Convert mappings to SAM without an output file and count records."""
    reads = files.open(testfiles["reads_1.fastq"])
    mappings = gem.mapper(reads, index)
    sam = gem.gem2sam(mappings, index, compact=True)
    assert sam is not None
    assert sam.process is not None
    # No output path was given, so no filename is expected on the result.
    assert sam.filename is None
    n_records = sum(1 for _ in sam)
    assert n_records == 10000
def test_gem2sam_sam2bam():
    """Chain mapper -> gem2sam -> sam2bam and count records in the BAM."""
    reads = gem.files.open(testfiles["reads_1.fastq"])
    mappings = gem.mapper(reads, index)
    sam = gem.gem2sam(mappings, index, compact=True)
    bam_path = results_dir + "/test_sam.bam"
    # The returned object is not inspected; only the written file is checked.
    converted = gem.sam2bam(sam, output=bam_path)
    assert os.path.exists(bam_path)
    count = sum(1 for _ in gem.files.open(bam_path))
    assert count == 10000, "Count 10000!=%d" % count
def test_quality_pass_on_execution():
    """Check the quality value exposed by the mapper result."""
    reads = files.open(testfiles["reads_1.fastq"])
    map_path = results_dir + "/quality_passon_mapping.map"
    mappings = gem.mapper(reads, index, output=map_path)
    found = mappings.quality
    failure = "Quality should be 'offset-33' but is %s" % (str(found))
    assert found == "offset-33", failure
# output local_out_sam = out_sam_path + '.%d:%d-%d' % ( N_WINDOWS - len(range_stop), seq_beg, seq_end) out_files.append(local_out_sam) # input inputf = gem.files.open(fastq_path) # trimming trimmed = gem.filter.run_filter( inputf, ['--hard-trim', '%d,%d' % (trim_5, trim_3)], threads=nthreads, paired=not single_end) # mapping mapped = gem.mapper(trimmed, gem_index_path, min_decoded_strata=0, max_decoded_matches=2, unique_mapping=False, max_edit_distance=max_edit_distance, mismatches=mismatches, output=temp_dir + '/test.map', threads=nthreads) # convert to sam/bam if output_is_bam: sam = gem.gem2sam(mapped, index=gem_index_path, threads=nthreads, single_end=single_end) _ = gem.sam2bam(sam, output=local_out_sam, threads=nthreads) else: sam = gem.gem2sam(mapped, index=gem_index_path, output=local_out_sam, threads=nthreads, single_end=single_end) # Recursively go to the next iteration. unmapped_fastq_path = os.path.split(fastq_path)[1] if unmapped_fastq_path[-1].isdigit():
# output local_out_sam = out_sam_path + '.%d' % (seq_len) out_files.append(local_out_sam) # input inputf = gem.files.open(fastq_path) # trimming trimmed = gem.filter.run_filter( inputf, ['--hard-trim', '%d,%d' % (trim_5, trim_3)], threads=nthreads, paired=not single_end) # mapping mapped = gem.mapper(trimmed, gem_index_path, min_decoded_strata=0, max_decoded_matches=2, unique_mapping=False, max_edit_distance=max_edit_distance, mismatches=mismatches, output=temp_dir + '/test.map', threads=nthreads) # convert to sam sam = gem.gem2sam(mapped, index=gem_index_path, output=local_out_sam, threads=nthreads, single_end=single_end) if output_is_bam: sam = gem.gem2sam(mapped, index=gem_index_path, threads=nthreads, single_end=single_end) _ = gem.sam2bam(sam, output=local_out_sam, threads=nthreads) else: sam = gem.gem2sam(mapped, index=gem_index_path, output=local_out_sam, threads=nthreads, single_end=single_end) # Recursively go to the next iteration.
def iterative_mapping(gem_index_path, fastq_path, out_sam_path,
                      range_start, range_stop, **kwargs):
    """
    Map iteratively a given FASTQ file to a reference genome.

    Each (start, stop) window is mapped in turn; reads filtered out by
    ``_filter_unmapped_fastq`` after one window are written to a temporary
    FASTQ file which is then mapped with the next window.

    :param gem_index_path: path to index file created from a reference genome
       using gem-index tool
    :param fastq_path: PATH to fastq file, either compressed or not.
    :param out_sam_path: path to a directory where to store mapped reads in
       SAM/BAM format (see option output_is_bam).
    :param range_start: list of integers representing the start position of
       each read fragment to be mapped (starting at 1 includes the first
       nucleotide of the read). The list is not modified.
    :param range_stop: list of integers representing the end position of each
       read fragment to be mapped. The list is not modified.
    :param True single_end: when FASTQ contains paired-ends flags
    :param 4 nthreads: number of threads to use for mapping (number of CPUs)
    :param 0.04 max_edit_distance: The maximum number of edit operations
       allowed while verifying candidate matches by dynamic programming.
    :param 0.04 mismatches: The maximum number of nucleotide substitutions
       allowed while mapping each k-mer. It is always guaranteed that, however
       other options are chosen, all the matches up to the specified number of
       substitutions will be found by the program.
    :param -1 max_reads_per_chunk: maximum number of reads to process at a
       time. If -1, all reads will be processed in one run (more RAM memory
       needed).
    :param False output_is_bam: Use binary (compressed) form of generated
       out-files with mapped reads (recommended to save disk space).
    :param temp_dir: important to change. Intermediate FASTQ files will be
       written there. Defaults to ``tempfile.gettempdir()``.
    :param out_files: optional list to which the generated paths are appended
       (a new list is created when not given).

    :raises Exception: if the windows are not lists (or both tuples), cannot
       be converted to integers, have different sizes, have a start that is
       not lower than its stop, or have a non-positive start.
    :raises IOError: if the FASTQ file has fewer than two lines.

    :returns: a list of paths to generated outfiles. To be passed to
       :func:`pytadbit.parsers.sam_parser.parse_sam`
    """
    # Must come before any use of the name: a late ``global`` statement is a
    # SyntaxWarning on Python 2 and a SyntaxError on Python 3.
    # NOTE(review): N_WINDOWS is only reset in the chunked path below; a
    # second un-chunked top-level call appears to reuse the previous value.
    global N_WINDOWS

    gem_index_path = os.path.abspath(os.path.expanduser(gem_index_path))
    fastq_path = os.path.abspath(os.path.expanduser(fastq_path))
    out_sam_path = os.path.abspath(os.path.expanduser(out_sam_path))
    single_end = kwargs.get('single_end', True)
    max_edit_distance = kwargs.get('max_edit_distance', 0.04)
    mismatches = kwargs.get('mismatches', 0.04)
    nthreads = kwargs.get('nthreads', 4)
    max_reads_per_chunk = kwargs.get('max_reads_per_chunk', -1)
    out_files = kwargs.get('out_files', [])
    output_is_bam = kwargs.get('output_is_bam', False)
    temp_dir = os.path.abspath(os.path.expanduser(
        kwargs.get('temp_dir', tempfile.gettempdir())))

    # check kwargs
    known_keywords = ('single_end', 'nthreads', 'max_edit_distance',
                      'mismatches', 'max_reads_per_chunk', 'out_files',
                      'output_is_bam', 'temp_dir')
    for kw in kwargs:
        if kw not in known_keywords:
            warn('WARNING: %s not is usual keywords, misspelled?' % kw)

    # check windows:
    if not isinstance(range_start, list) or not isinstance(range_stop, list):
        if (not isinstance(range_start, tuple) or
                not isinstance(range_stop, tuple)):
            raise Exception('ERROR: range_start and range_stop should be lists')
    # Always work on private copies: the windows are consumed with pop(0)
    # below, which would otherwise empty the lists owned by the caller.
    range_start = list(range_start)
    range_stop = list(range_stop)
    if (not all(isinstance(i, int) for i in range_start) or
            not all(isinstance(i, int) for i in range_stop)):
        try:
            # list comprehensions (not map) so the result supports pop(0)
            # on Python 3 as well
            range_start = [int(i) for i in range_start]
            range_stop = [int(i) for i in range_stop]
        except ValueError:
            raise Exception('ERROR: range_start and range_stop should contain' +
                            ' integers only')
        else:
            warn('WARNING: range_start and range_stop converted to integers')
    # NOTE(review): the message mentions uniqueness, but only the sizes were
    # ever compared (the former len(zip(...)) test could not detect
    # duplicated windows); behaviour is kept as it was.
    if len(range_start) != len(range_stop):
        raise Exception('ERROR: range_start and range_stop should have the ' +
                        'same sizes and windows should be uniques.')
    if any(i >= j for i, j in zip(range_start, range_stop)):
        raise Exception('ERROR: start positions should always be lower than ' +
                        'stop positions.')
    if any(i <= 0 for i in range_start):
        raise Exception('ERROR: start positions should be strictly positive.')

    # create directories
    for rep in [temp_dir, os.path.split(out_sam_path)[0]]:
        mkdir(rep)

    # get the length of a read
    # get the length from the length of the second line, which is the sequence
    # can not use the "length" keyword, as it is not always present
    opener = gzip.open if fastq_path.endswith('.gz') else open
    fastqh = opener(fastq_path)
    try:
        next(fastqh)
        raw_seq_len = len(next(fastqh).strip())
    except StopIteration:
        raise IOError('ERROR: problem reading %s\n' % fastq_path)
    finally:
        # close the handle on the error path too
        fastqh.close()

    if not N_WINDOWS:
        N_WINDOWS = len(range_start)

    # Split input files if required and apply iterative mapping to each
    # segment separately.
    if max_reads_per_chunk > 0:
        # the recursive calls below must not split their chunk again
        kwargs['max_reads_per_chunk'] = -1
        print('Split input file %s into chunks' % fastq_path)
        chunked_files = _chunk_file(
            fastq_path,
            os.path.join(temp_dir, os.path.split(fastq_path)[1]),
            max_reads_per_chunk * 4)
        print('%d chunks obtained' % len(chunked_files))
        for i, fastq_chunk_path in enumerate(chunked_files):
            # reset so that the recursive call recomputes the window count
            N_WINDOWS = 0
            print('Run iterative_mapping recursively on %s' % fastq_chunk_path)
            chunk_out = iterative_mapping(
                gem_index_path, fastq_chunk_path,
                out_sam_path + '.%d' % (i + 1),
                range_start[:], range_stop[:], **kwargs)
            # When the caller supplied ``out_files`` the recursion appended
            # to this very list; extending it with itself would duplicate
            # every entry.
            if chunk_out is not out_files:
                out_files.extend(chunk_out)
        # Delete chunks only if the file was really chunked.
        if len(chunked_files) > 1:
            # printed once: the message already lists every chunk
            print('Remove the chunks: %s' % ' '.join(chunked_files))
            for fastq_chunk_path in chunked_files:
                os.remove(fastq_chunk_path)
        return out_files

    # end position according to sequence in the file
    # removes 1 in order to start at 1 instead of 0
    try:
        seq_end = range_stop.pop(0)
        seq_beg = range_start.pop(0)
    except IndexError:
        # no window left: end of the recursion
        return out_files

    # define what we trim
    seq_len = seq_end - seq_beg
    trim_5, trim_3 = trimming(raw_seq_len, seq_beg - 1, seq_len - 1)

    # suffix shared by the SAM/BAM output and the temporary FASTQ file
    window_tag = '.%d:%d-%d' % (N_WINDOWS - len(range_stop), seq_beg, seq_end)

    # output
    local_out_sam = out_sam_path + window_tag
    out_files.append(local_out_sam)

    # input
    inputf = gem.files.open(fastq_path)

    # trimming
    trimmed = gem.filter.run_filter(
        inputf, ['--hard-trim', '%d,%d' % (trim_5, trim_3)],
        threads=nthreads, paired=not single_end)

    # mapping
    mapped = gem.mapper(trimmed, gem_index_path, min_decoded_strata=0,
                        max_decoded_matches=2, unique_mapping=False,
                        max_edit_distance=max_edit_distance,
                        mismatches=mismatches,
                        output=temp_dir + '/test.map',
                        threads=nthreads)

    # convert to sam/bam
    if output_is_bam:
        sam = gem.gem2sam(mapped, index=gem_index_path, threads=nthreads,
                          single_end=single_end)
        _ = gem.sam2bam(sam, output=local_out_sam, threads=nthreads)
    else:
        sam = gem.gem2sam(mapped, index=gem_index_path, output=local_out_sam,
                          threads=nthreads, single_end=single_end)

    # Recursively go to the next iteration.
    unmapped_fastq_path = os.path.split(fastq_path)[1]
    if unmapped_fastq_path[-1].isdigit():
        # drop the last dot-separated suffix when the name ends in a digit
        unmapped_fastq_path = unmapped_fastq_path.rsplit('.', 1)[0]
    unmapped_fastq_path = os.path.join(temp_dir,
                                       unmapped_fastq_path + window_tag)
    _filter_unmapped_fastq(fastq_path, local_out_sam, unmapped_fastq_path)

    next_out = iterative_mapping(gem_index_path, unmapped_fastq_path,
                                 out_sam_path, range_start, range_stop,
                                 **kwargs)
    # same-object guard as in the chunked path above
    if next_out is not out_files:
        out_files.extend(next_out)
    os.remove(unmapped_fastq_path)
    return out_files
def iterative_mapping(gem_index_path, fastq_path, out_sam_path,
                      range_start, range_stop, **kwargs):
    """
    Map iteratively a given FASTQ file to a reference genome.

    Each (start, stop) window is mapped in turn; reads filtered out by
    ``_filter_unmapped_fastq`` after one window are written to a temporary
    FASTQ file which is then mapped with the next window.

    :param gem_index_path: path to index file created from a reference genome
       using gem-index tool
    :param fastq_path: PATH to fastq file, either compressed or not.
    :param out_sam_path: path to a directory where to store mapped reads in
       SAM/BAM format (see option output_is_bam).
    :param range_start: list of integers representing the start position of
       each read fragment to be mapped (starting at 1 includes the first
       nucleotide of the read). The list is not modified.
    :param range_stop: list of integers representing the end position of each
       read fragment to be mapped. The list is not modified.
    :param True single_end: when FASTQ contains paired-ends flags
    :param 4 nthreads: number of threads to use for mapping (number of CPUs)
    :param 0.04 max_edit_distance: The maximum number of edit operations
       allowed while verifying candidate matches by dynamic programming.
    :param 0.04 mismatches: The maximum number of nucleotide substitutions
       allowed while mapping each k-mer. It is always guaranteed that, however
       other options are chosen, all the matches up to the specified number of
       substitutions will be found by the program.
    :param -1 max_reads_per_chunk: maximum number of reads to process at a
       time. If -1, all reads will be processed in one run (more RAM memory
       needed).
    :param False output_is_bam: Use binary (compressed) form of generated
       out-files with mapped reads (recommended to save disk space).
    :param temp_dir: important to change. Intermediate FASTQ files will be
       written there. Defaults to ``tempfile.gettempdir()``.
    :param out_files: optional list to which the generated paths are appended
       (a new list is created when not given).

    :raises Exception: if the windows are not lists (or both tuples), cannot
       be converted to integers, have different sizes, have a start that is
       not lower than its stop, or have a non-positive start.
    :raises IOError: if the FASTQ file has fewer than two lines.

    :returns: a list of paths to generated outfiles. To be passed to
       :func:`pytadbit.parsers.sam_parser.parse_sam`
    """
    # Must come before any use of the name: a late ``global`` statement is a
    # SyntaxWarning on Python 2 and a SyntaxError on Python 3.
    # NOTE(review): N_WINDOWS is only reset in the chunked path below; a
    # second un-chunked top-level call appears to reuse the previous value.
    global N_WINDOWS

    gem_index_path = os.path.abspath(os.path.expanduser(gem_index_path))
    fastq_path = os.path.abspath(os.path.expanduser(fastq_path))
    out_sam_path = os.path.abspath(os.path.expanduser(out_sam_path))
    single_end = kwargs.get('single_end', True)
    max_edit_distance = kwargs.get('max_edit_distance', 0.04)
    mismatches = kwargs.get('mismatches', 0.04)
    nthreads = kwargs.get('nthreads', 4)
    max_reads_per_chunk = kwargs.get('max_reads_per_chunk', -1)
    out_files = kwargs.get('out_files', [])
    output_is_bam = kwargs.get('output_is_bam', False)
    temp_dir = os.path.abspath(os.path.expanduser(
        kwargs.get('temp_dir', tempfile.gettempdir())))

    # check kwargs
    known_keywords = ('single_end', 'nthreads', 'max_edit_distance',
                      'mismatches', 'max_reads_per_chunk', 'out_files',
                      'output_is_bam', 'temp_dir')
    for kw in kwargs:
        if kw not in known_keywords:
            warn('WARNING: %s not is usual keywords, misspelled?' % kw)

    # check windows:
    if not isinstance(range_start, list) or not isinstance(range_stop, list):
        if (not isinstance(range_start, tuple) or
                not isinstance(range_stop, tuple)):
            raise Exception('ERROR: range_start and range_stop should be lists')
    # Always work on private copies: the windows are consumed with pop(0)
    # below, which would otherwise empty the lists owned by the caller.
    range_start = list(range_start)
    range_stop = list(range_stop)
    if (not all(isinstance(i, int) for i in range_start) or
            not all(isinstance(i, int) for i in range_stop)):
        try:
            # list comprehensions (not map) so the result supports pop(0)
            # on Python 3 as well
            range_start = [int(i) for i in range_start]
            range_stop = [int(i) for i in range_stop]
        except ValueError:
            raise Exception('ERROR: range_start and range_stop should contain' +
                            ' integers only')
        else:
            warn('WARNING: range_start and range_stop converted to integers')
    # NOTE(review): the message mentions uniqueness, but only the sizes were
    # ever compared (the former len(zip(...)) test could not detect
    # duplicated windows); behaviour is kept as it was.
    if len(range_start) != len(range_stop):
        raise Exception('ERROR: range_start and range_stop should have the ' +
                        'same sizes and windows should be uniques.')
    if any(i >= j for i, j in zip(range_start, range_stop)):
        raise Exception('ERROR: start positions should always be lower than ' +
                        'stop positions.')
    if any(i <= 0 for i in range_start):
        raise Exception('ERROR: start positions should be strictly positive.')

    # create directories
    for rep in [temp_dir, os.path.split(out_sam_path)[0]]:
        mkdir(rep)

    # get the length of a read
    # get the length from the length of the second line, which is the sequence
    # can not use the "length" keyword, as it is not always present
    opener = gzip.open if fastq_path.endswith('.gz') else open
    fastqh = opener(fastq_path)
    try:
        next(fastqh)
        raw_seq_len = len(next(fastqh).strip())
    except StopIteration:
        raise IOError('ERROR: problem reading %s\n' % fastq_path)
    finally:
        # close the handle on the error path too
        fastqh.close()

    if not N_WINDOWS:
        N_WINDOWS = len(range_start)

    # Split input files if required and apply iterative mapping to each
    # segment separately.
    if max_reads_per_chunk > 0:
        # the recursive calls below must not split their chunk again
        kwargs['max_reads_per_chunk'] = -1
        print('Split input file %s into chunks' % fastq_path)
        chunked_files = _chunk_file(
            fastq_path,
            os.path.join(temp_dir, os.path.split(fastq_path)[1]),
            max_reads_per_chunk * 4)
        print('%d chunks obtained' % len(chunked_files))
        for i, fastq_chunk_path in enumerate(chunked_files):
            # reset so that the recursive call recomputes the window count
            N_WINDOWS = 0
            print('Run iterative_mapping recursively on %s' % fastq_chunk_path)
            chunk_out = iterative_mapping(
                gem_index_path, fastq_chunk_path,
                out_sam_path + '.%d' % (i + 1),
                range_start[:], range_stop[:], **kwargs)
            # When the caller supplied ``out_files`` the recursion appended
            # to this very list; extending it with itself would duplicate
            # every entry.
            if chunk_out is not out_files:
                out_files.extend(chunk_out)
        # Delete chunks only if the file was really chunked.
        if len(chunked_files) > 1:
            # printed once: the message already lists every chunk
            print('Remove the chunks: %s' % ' '.join(chunked_files))
            for fastq_chunk_path in chunked_files:
                os.remove(fastq_chunk_path)
        return out_files

    # end position according to sequence in the file
    # removes 1 in order to start at 1 instead of 0
    try:
        seq_end = range_stop.pop(0)
        seq_beg = range_start.pop(0)
    except IndexError:
        # no window left: end of the recursion
        return out_files

    # define what we trim
    seq_len = seq_end - seq_beg
    trim_5, trim_3 = trimming(raw_seq_len, seq_beg - 1, seq_len - 1)

    # suffix shared by the SAM/BAM output and the temporary FASTQ file
    window_tag = '.%d:%d-%d' % (N_WINDOWS - len(range_stop), seq_beg, seq_end)

    # output
    local_out_sam = out_sam_path + window_tag
    out_files.append(local_out_sam)

    # input
    inputf = gem.files.open(fastq_path)

    # trimming
    trimmed = gem.filter.run_filter(
        inputf, ['--hard-trim', '%d,%d' % (trim_5, trim_3)],
        threads=nthreads, paired=not single_end)

    # mapping
    mapped = gem.mapper(trimmed, gem_index_path, min_decoded_strata=0,
                        max_decoded_matches=2, unique_mapping=False,
                        max_edit_distance=max_edit_distance,
                        mismatches=mismatches,
                        output=temp_dir + '/test.map',
                        threads=nthreads)

    # convert to sam/bam
    if output_is_bam:
        sam = gem.gem2sam(mapped, index=gem_index_path, threads=nthreads,
                          single_end=single_end)
        _ = gem.sam2bam(sam, output=local_out_sam, threads=nthreads)
    else:
        sam = gem.gem2sam(mapped, index=gem_index_path, output=local_out_sam,
                          threads=nthreads, single_end=single_end)

    # Recursively go to the next iteration.
    unmapped_fastq_path = os.path.split(fastq_path)[1]
    if unmapped_fastq_path[-1].isdigit():
        # drop the last dot-separated suffix when the name ends in a digit
        unmapped_fastq_path = unmapped_fastq_path.rsplit('.', 1)[0]
    unmapped_fastq_path = os.path.join(temp_dir,
                                       unmapped_fastq_path + window_tag)
    _filter_unmapped_fastq(fastq_path, local_out_sam, unmapped_fastq_path)

    next_out = iterative_mapping(gem_index_path, unmapped_fastq_path,
                                 out_sam_path, range_start, range_stop,
                                 **kwargs)
    # same-object guard as in the chunked path above
    if next_out is not out_files:
        out_files.extend(next_out)
    os.remove(unmapped_fastq_path)
    return out_files
paired_out = "%s_paired.map" % name # scored mappings final_out = "%s.map" % name # sam/bam output sam_out = "%s.bam" % name ## Create initial mapping # we deal with a single file with interleaved paired reads here, but # creating input from two files for the read pairs is straight forward using the interleave filter # # input_1 = gem.files.open(input_file) # input_2 = gem.files.open(input_file2) # input = gem.filter.interleave([input_1, input_2]) print "Running initial mapping" input = gem.files.open(reads) initial_mapping = gem.mapper(input, index, initial_out, mismatches=0.07, delta=1, threads=THREADS) ## junction sites # before we can do the split mapping, we have to load # the junction sites from a gtf annotation and # run the denovo-junction detection. This will also give # us a mapping that preserves short indels detected during the # extraction run. print "Loading GTF junctions from %s" % annotation junctions = gem.junctions.from_gtf(annotation) # now the denovo run. This returns a tuple : (mapping, junctions) # and here we use the merge_with parameter to merge the denovo junctions with # the previously loaded gtf junctions. # # Also note that we pass only unmapped reads from the initial mapping to the junction