class TestHiRapiProperties(unittest.TestCase):
    """Checks defaults, plugin information and option handling of HiRapiAligner."""

    def setUp(self):
        self.hi = HiRapiAligner('rapi_bwa')

    def tearDown(self):
        self.hi.release_resources()

    def test_defaults(self):
        # A freshly built aligner is expected to be paired and Sanger-encoded.
        self.assertTrue(self.hi.paired)
        self.assertEqual(pyrapi.rapi.QENC_SANGER, self.hi.q_offset)

    def test_get_plugin_info(self):
        self.assertEqual('bwa-mem', self.hi.aligner_name)
        # Versions are only checked for being non-empty, not for a value.
        self.assertTrue(self.hi.aligner_version)
        self.assertTrue(self.hi.plugin_version)

    def test_set_some_options(self):
        # Each option must read back the value that was just written.
        self.hi.opts.n_threads = 11
        self.assertEqual(11, self.hi.opts.n_threads)

        self.hi.opts.mapq_min = 5
        self.assertEqual(5, self.hi.opts.mapq_min)

        self.hi.opts.isize_min = 250
        self.assertEqual(250, self.hi.opts.isize_min)

        self.hi.opts.isize_max = 500
        self.assertEqual(500, self.hi.opts.isize_max)
def setUp(self):
    """Create the aligner and load every mini_ref read record into it.

    Raises RuntimeError as soon as a record without exactly 5 fields is met.
    """
    aligner = HiRapiAligner('rapi_bwa')
    self.hi = aligner
    self.reads = test_utils.get_mini_ref_seqs()
    for record in self.reads:
        if len(record) == 5:
            # the record is unpacked into the positional arguments of load_pair
            aligner.load_pair(*record)
            continue
        raise RuntimeError(
            "Unexpected number of fields in mini_ref read record")
def setUp(self):
    """Build the test aligner and feed it the mini_ref read records.

    A record whose length differs from 5 aborts the set-up with RuntimeError.
    """
    self.hi = HiRapiAligner('rapi_bwa')
    self.reads = test_utils.get_mini_ref_seqs()
    expected_fields = 5
    for fields in self.reads:
        field_count = len(fields)
        if field_count != expected_fields:
            raise RuntimeError("Unexpected number of fields in mini_ref read record")
        self.hi.load_pair(*fields)
def test_base_quality(self):
    """Load one read with Sanger and with Illumina qualities and compare them.

    The same read is loaded twice into a single-end aligner: once as-is with
    the Sanger encoding selected, once with its quality string shifted to the
    Illumina offset and the Illumina encoding selected.  The qualities stored
    by the aligner must be identical in both cases.
    """
    hi = HiRapiAligner('rapi_bwa', paired=False)
    try:
        one_read = self.reads[0][0:3]
        hi.q_offset = self.hi.Qenc_Sanger
        hi.load_read('sanger_read', one_read[1], one_read[2])
        # 64: Illumina base quality offset
        # 33: Sanger base quality offset
        ill_quality = ''.join(chr(ord(c) + (64 - 33)) for c in one_read[2])
        hi.q_offset = self.hi.Qenc_Illumina
        hi.load_read('illumina_read', one_read[1], ill_quality)

        loaded_qualities = [frag[0].qual for frag in hi.ifragments()]
        self.assertEqual(2, len(loaded_qualities))
        self.assertEqual(loaded_qualities[0], loaded_qualities[1])
    finally:
        # This aligner is local to the test, so tearDown() never sees it.
        hi.release_resources()
def __init__(self, ctx):
    """Configure the mapper, set up the aligner and load the reference.

    Reads the job configuration from ``ctx``, creates and configures the
    'rapi_bwa' aligner, reserves space for one batch of reads, loads the
    reference found in the distributed reference archive and assembles the
    chain of objects that processes the aligned fragments.

    Raises NotImplementedError when the job is not map-only.
    """
    super(mapper, self).__init__(ctx)
    self.logger = logging.getLogger("seqal")
    self.__get_configuration(ctx)
    logging.basicConfig(level=self.log_level)
    self.event_monitor = HadoopEventMonitor(self.COUNTER_CLASS, logging.getLogger("mapper"), ctx)

    pe = True  # single-end sequence alignment not yet supported by Seqal
    self.hi_rapi = HiRapiAligner('rapi_bwa', paired=pe)

    # opts
    self.hi_rapi.opts.n_threads = self.nthreads
    self.hi_rapi.opts.isize_max = self.max_isize
    if self.min_isize is not None:
        self.hi_rapi.opts.isize_min = self.min_isize
    # The aligner's quality-encoding attribute is `q_offset`; assigning to
    # `qoffset` would only create an unrelated attribute and leave the
    # configured encoding unused.
    if self.format == "fastq-illumina":
        self.hi_rapi.q_offset = self.hi_rapi.Qenc_Illumina
    else:
        self.hi_rapi.q_offset = self.hi_rapi.Qenc_Sanger
    # end opts

    self.logger.info("Using the %s aligner plugin, aligner version %s, plugin version %s",
                     self.hi_rapi.aligner_name, self.hi_rapi.aligner_version, self.hi_rapi.plugin_version)
    self.logger.info("Working in %s mode", 'paired-end' if pe else 'single-end')

    # allocate space for reads
    self.logger.debug("Reserving batch space for %s reads", self.batch_size)
    self.hi_rapi.reserve_space(self.batch_size)

    # load reference
    reference_root = self.get_reference_root_from_archive(utils.get_ref_archive(ctx.getJobConf()))
    self.logger.info("Full reference path (prefix): %s", reference_root)
    with self.event_monitor.time_block("Loading reference %s" % reference_root):
        self.hi_rapi.load_ref(reference_root)

    ######## assemble hit processor chain
    chain = RapiFilterLink(self.event_monitor)
    chain.remove_unmapped = self.remove_unmapped
    chain.min_hit_quality = self.min_hit_quality
    if self.__map_only:
        chain.set_next(RapiEmitSamLink(ctx, self.event_monitor, self.hi_rapi))
    else:
        raise NotImplementedError("Only mapping mode is supported at the moment")
    self.hit_visitor_chain = chain
def test_base_quality(self):
    """Check that Sanger and Illumina encoded qualities are stored identically.

    One read is loaded into a single-end aligner under both encodings (the
    Illumina copy has its quality characters shifted by 64 - 33); the two
    loaded quality values must be equal.
    """
    hi = HiRapiAligner('rapi_bwa', paired=False)
    try:
        one_read = self.reads[0][0:3]
        hi.q_offset = self.hi.Qenc_Sanger
        hi.load_read('sanger_read', one_read[1], one_read[2])
        # 64: Illumina base quality offset
        # 33: Sanger base quality offset
        ill_quality = ''.join(chr(ord(c) + (64 - 33)) for c in one_read[2])
        hi.q_offset = self.hi.Qenc_Illumina
        hi.load_read('illumina_read', one_read[1], ill_quality)

        loaded_qualities = [frag[0].qual for frag in hi.ifragments()]
        self.assertEqual(2, len(loaded_qualities))
        self.assertEqual(loaded_qualities[0], loaded_qualities[1])
    finally:
        # Release the test-local aligner; tearDown() only handles self.hi.
        hi.release_resources()
class TestHiRapiBatch(unittest.TestCase):
    """Tests the read batch of HiRapiAligner: loading, iterating and clearing."""

    def setUp(self):
        self.hi = HiRapiAligner('rapi_bwa')
        self.reads = test_utils.get_mini_ref_seqs()
        for row in self.reads:
            if len(row) != 5:
                raise RuntimeError("Unexpected number of fields in mini_ref read record")
            self.hi.load_pair(*row)

    def tearDown(self):
        self.hi.release_resources()

    def test_fragment_iteration(self):
        # Count how many times each read id is seen while iterating.
        read_id_counts = dict()
        for frag in self.hi.ifragments():
            for read in frag:
                read_id = read.id
                read_id_counts[read_id] = 1 + read_id_counts.get(read_id, 0)

        # 5 pairs
        self.assertEqual(5, len(read_id_counts))
        unique_counts = set(read_id_counts.values())
        # all ids appearing twice
        self.assertEqual(1, len(unique_counts))
        self.assertEqual(2, unique_counts.pop())

    def test_batch_management(self):
        self.assertEqual(10, self.hi.batch_size)
        self.hi.clear_batch()
        self.assertEqual(0, self.hi.batch_size)
        self.hi.load_ref(test_utils.MiniRefMemPath)
        self.hi.align_batch()  # should not raise just because it's empty
        for _ in self.hi.ifragments():
            self.fail("iterating over an empty batch!")

    def test_base_quality(self):
        hi = HiRapiAligner('rapi_bwa', paired=False)
        try:
            one_read = self.reads[0][0:3]
            hi.q_offset = self.hi.Qenc_Sanger
            hi.load_read('sanger_read', one_read[1], one_read[2])
            # 64: Illumina base quality offset
            # 33: Sanger base quality offset
            ill_quality = ''.join(chr(ord(c) + (64 - 33)) for c in one_read[2])
            hi.q_offset = self.hi.Qenc_Illumina
            hi.load_read('illumina_read', one_read[1], ill_quality)

            loaded_qualities = [frag[0].qual for frag in hi.ifragments()]
            self.assertEqual(2, len(loaded_qualities))
            self.assertEqual(loaded_qualities[0], loaded_qualities[1])
        finally:
            # This aligner is local to the test, so tearDown() never sees it.
            hi.release_resources()
def setUp(self):
    """Create the 'rapi_bwa' aligner, then run the _align_mini_ref_seqs helper."""
    aligner = HiRapiAligner('rapi_bwa')
    self.hi = aligner
    self._align_mini_ref_seqs()
class TestHiRapiAlignments(unittest.TestCase):
    """Tests run on an aligner that has already aligned the mini_ref reads."""

    def setUp(self):
        self.hi = HiRapiAligner('rapi_bwa')
        self._align_mini_ref_seqs()

    def tearDown(self):
        self.hi.release_resources()

    def test_load_reference_again(self):
        # should "just work"
        self.hi.load_ref(test_utils.MiniRefMemPath)

    def test_sam(self):
        # `sam_buffer` rather than `io`, to avoid shadowing the stdlib module name.
        sam_buffer = StringIO()
        self.hi.write_sam(sam_buffer, include_header=False)
        sam = sam_buffer.getvalue()
        expected_sam = test_utils.rapi_mini_ref_seqs_sam_no_header()
        self.assertEqual(expected_sam, sam)

    def _align_mini_ref_seqs(self):
        """Load the mini reference and its read pairs, then align the batch."""
        self.hi.load_ref(test_utils.MiniRefMemPath)
        reads = test_utils.get_mini_ref_seqs()
        for row in reads:
            if len(row) != 5:
                raise RuntimeError("Unexpected number of fields in mini_ref read record")
            self.hi.load_pair(*row)
        self.hi.align_batch()
def setUp(self):
    """Instantiate the 'rapi_bwa' aligner and keep it as ``self.hi``."""
    aligner = HiRapiAligner('rapi_bwa')
    self.hi = aligner
class TestHiRapiBatch(unittest.TestCase):
    """Exercises batch loading, fragment iteration and batch clearing."""

    def setUp(self):
        self.hi = HiRapiAligner('rapi_bwa')
        self.reads = test_utils.get_mini_ref_seqs()
        for row in self.reads:
            if len(row) != 5:
                raise RuntimeError(
                    "Unexpected number of fields in mini_ref read record")
            self.hi.load_pair(*row)

    def tearDown(self):
        self.hi.release_resources()

    def test_fragment_iteration(self):
        # Tally the occurrences of every read id found in the batch.
        read_id_counts = dict()
        for frag in self.hi.ifragments():
            for read in frag:
                read_id = read.id
                read_id_counts[read_id] = 1 + read_id_counts.get(read_id, 0)

        # 5 pairs
        self.assertEqual(5, len(read_id_counts))
        unique_counts = set(read_id_counts.values())
        # all ids appearing twice
        self.assertEqual(1, len(unique_counts))
        self.assertEqual(2, unique_counts.pop())

    def test_batch_management(self):
        self.assertEqual(10, self.hi.batch_size)
        self.hi.clear_batch()
        self.assertEqual(0, self.hi.batch_size)
        self.hi.load_ref(test_utils.MiniRefMemPath)
        self.hi.align_batch()  # should not raise just because it's empty
        for _ in self.hi.ifragments():
            self.fail("iterating over an empty batch!")

    def test_base_quality(self):
        hi = HiRapiAligner('rapi_bwa', paired=False)
        try:
            one_read = self.reads[0][0:3]
            hi.q_offset = self.hi.Qenc_Sanger
            hi.load_read('sanger_read', one_read[1], one_read[2])
            # 64: Illumina base quality offset
            # 33: Sanger base quality offset
            ill_quality = ''.join(chr(ord(c) + (64 - 33)) for c in one_read[2])
            hi.q_offset = self.hi.Qenc_Illumina
            hi.load_read('illumina_read', one_read[1], ill_quality)

            loaded_qualities = [frag[0].qual for frag in hi.ifragments()]
            self.assertEqual(2, len(loaded_qualities))
            self.assertEqual(loaded_qualities[0], loaded_qualities[1])
        finally:
            # Release the test-local aligner; tearDown() only handles self.hi.
            hi.release_resources()
class TestHiRapiAlignments(unittest.TestCase):
    """Checks behaviour of the aligner once the mini_ref reads are aligned."""

    def setUp(self):
        self.hi = HiRapiAligner('rapi_bwa')
        self._align_mini_ref_seqs()

    def tearDown(self):
        self.hi.release_resources()

    def test_load_reference_again(self):
        # should "just work"
        self.hi.load_ref(test_utils.MiniRefMemPath)

    def test_sam(self):
        # Named `sam_buffer` so the stdlib `io` module name is not shadowed.
        sam_buffer = StringIO()
        self.hi.write_sam(sam_buffer, include_header=False)
        sam = sam_buffer.getvalue()
        expected_sam = test_utils.rapi_mini_ref_seqs_sam_no_header()
        self.assertEqual(expected_sam, sam)

    def _align_mini_ref_seqs(self):
        """Load the mini reference, load each read pair and align the batch."""
        self.hi.load_ref(test_utils.MiniRefMemPath)
        reads = test_utils.get_mini_ref_seqs()
        for row in reads:
            if len(row) != 5:
                raise RuntimeError(
                    "Unexpected number of fields in mini_ref read record")
            self.hi.load_pair(*row)
        self.hi.align_batch()
class mapper(Mapper):
    """
    Aligns sequences to a reference genome.

    @input-record: C{key} does not matter (standard LineRecordReader);
    C{value} is a tab-separated text line with 5 fields: ID, read_seq,
    read_qual, mate_seq, mate_qual.

    @output-record: protobuf-serialized mapped pairs (map-reduce job) or
    alignment records in SAM format (map-only job).  NOTE: this
    implementation currently raises NotImplementedError unless the job is
    map-only.

    @jobconf-param: C{mapred.reduce.tasks} number of Hadoop reduce tasks to
    launch.  If the value of this property is set to 0, then the mapper will
    directly output the mappings in SAM format, like BWA.  If set to a value
    > 0 the mapper will output mappings in the protobuf serialized format for
    the rmdup reducer.

    @jobconf-param: C{seal.seqal.log.level} logging level, specified as a
    logging module literal.

    @jobconf-param: C{mapred.cache.archives} distributed cache entry for the
    bwa index archive.  The entry is of the form HDFS_PATH#LINK_NAME.  The
    archive for a given chromosome must contain (at the top level, i.e., no
    directories) all files generated by 'bwa index' for that chromosome.

    @jobconf-param: C{seal.seqal.alignment.max.isize}: if the inferred isize
    is greater than this value, Smith-Waterman alignment for unmapped reads
    will be skipped.

    @jobconf-param: C{seal.seqal.alignment.min.isize}: optional; when set it
    is passed to the aligner as its minimum insert size option.

    @jobconf-param: C{seal.seqal.pairing.batch.size}: how many sequences
    should be processed at a time by the pairing function.  Status will be
    updated at each new batch: therefore, lowering this value can help avoid
    timeouts.

    @jobconf-param: C{seal.seqal.fastq-subformat} Specifies base quality
    score encoding.  Supported types are: 'fastq-sanger' and 'fastq-illumina'.

    @jobconf-param: C{mapred.create.symlink} must be set to 'yes'.

    @jobconf-param: C{seal.seqal.min_hit_quality} mapping quality threshold
    below which the mapping will be discarded.

    @jobconf-param: C{seal.seqal.nthreads} number of concurrent threads for
    the main alignment operation.
    """
    SUPPORTED_FORMATS = "fastq-illumina", "fastq-sanger"
    DEFAULT_FORMAT = "fastq-sanger"
    COUNTER_CLASS = "SEQAL"
    # Passed to deprecation_utils.convert_job_conf() together with the job
    # configuration; pairs each "seal.seqal.*" property with its "bl.seqal.*"
    # counterpart.
    DeprecationMap = {
        "seal.seqal.log.level": "bl.seqal.log.level",
        "seal.seqal.alignment.max.isize": "bl.seqal.alignment.max.isize",
        "seal.seqal.pairing.batch.size": "bl.seqal.pairing.batch.size",
        "seal.seqal.fastq-subformat": "bl.seqal.fastq-subformat",
        "seal.seqal.min_hit_quality": "bl.seqal.min_hit_quality",
        "seal.seqal.remove_unmapped": "bl.seqal.remove_unmapped",
        "seal.seqal.discard_duplicates": "bl.seqal.discard_duplicates",
        "seal.seqal.nthreads": "bl.seqal.nthreads",
        "seal.seqal.trim.qual": "bl.seqal.trim.qual",
    }

    def __get_configuration(self, ctx):
        """
        Read the job configuration into attributes of this object and
        validate the values.

        Raises ValueError for values outside their valid range and
        NotImplementedError for options that are recognised but not yet
        supported.
        """
        # TODO: refactor settings common to mapper and reducer
        jc = ctx.getJobConf()

        jobconf = deprecation_utils.convert_job_conf(jc, self.DeprecationMap, self.logger)

        jc_configure(self, jobconf, 'seal.seqal.log.level', 'log_level', 'INFO')
        jc_configure(self, jobconf, "seal.seqal.fastq-subformat", "format", self.DEFAULT_FORMAT)
        jc_configure_int(self, jobconf, 'seal.seqal.alignment.max.isize', 'max_isize', 1000)
        jc_configure_int(self, jobconf, 'seal.seqal.alignment.min.isize', 'min_isize', None)
        jc_configure_int(self, jobconf, 'seal.seqal.pairing.batch.size', 'batch_size', 10000)
        jc_configure_int(self, jobconf, 'seal.seqal.min_hit_quality', 'min_hit_quality', 0)
        jc_configure_bool(self, jobconf, 'seal.seqal.remove_unmapped', 'remove_unmapped', False)
        jc_configure_int(self, jobconf, 'seal.seqal.nthreads', 'nthreads', 1)
        jc_configure_int(self, jobconf, 'seal.seqal.trim.qual', 'trim_qual', 0)

        # Turn the level name into the corresponding logging module attribute.
        try:
            self.log_level = getattr(logging, self.log_level)
        except AttributeError:
            raise ValueError("Unsupported log level: %r" % self.log_level)

        if self.format not in self.SUPPORTED_FORMATS:
            raise_pydoop_exception(
                "seal.seqal.fastq-subformat must be one of %r" %
                (self.SUPPORTED_FORMATS,)
            )

        # Options that are parsed but not supported yet.
        if self.remove_unmapped:
            raise NotImplementedError("seal.seqal.remove_unmapped is currently unsupported")
        if self.min_hit_quality > 0:
            raise NotImplementedError("seal.seqal.min_hit_quality is currently unsupported")
        if self.trim_qual > 0:
            raise NotImplementedError("seal.seqal.trim_qual is currently unsupported")

        if self.max_isize <= 0:
            raise ValueError("'seal.seqal.alignment.max.isize' must be > 0, if specified [1000]")

        if self.batch_size <= 0:
            raise ValueError("'seal.seqal.pairing.batch.size' must be > 0, if specified [10000]")

        # minimum qual value required for a hit to be kept.  By default outputs all the
        # hits BWA returns.
        if self.min_hit_quality < 0:
            raise ValueError("'seal.seqal.min_hit_quality' must be >= 0, if specified [0]")

        # number of concurrent threads for main alignment operation
        if self.nthreads <= 0:
            raise ValueError("'seal.seqal.nthreads' must be > 0, if specified [1]")

        # trim quality parameter used by BWA from read trimming.  Equivalent to
        # the -q parameter for bwa align
        if self.trim_qual < 0:
            raise ValueError("'seal.seqal.trim.qual' must be >= 0, if specified [0]")

        # The job is map-only unless a positive number of reduce tasks is set.
        if jc.hasKey('mapred.reduce.tasks') and jc.getInt('mapred.reduce.tasks') > 0:
            self.__map_only = False
        else:
            self.__map_only = True

    def get_reference_root_from_archive(self, ref_dir):
        """
        Given a directory containing an indexed reference,
        such that all its files have a common name (except the extension),
        this method finds the path to the reference including the common name.
        e.g.
            my_reference/hg_18.bwt
            my_reference/hg_18.rsax
            my_reference/hg_18.sax         => "my_reference/hg_18"
            my_reference/hg_18.pac
            my_reference/.irrelevant_file

        Hidden files and files whose extension is not in _BWA_INDEX_EXT are
        ignored.  Raises RuntimeError when no common prefix can be found.
        """
        file_list = os.listdir(ref_dir)
        if self.logger.isEnabledFor(logging.DEBUG):
            self.logger.debug("file_list extracted from reference archive: %s", file_list)
        filtered_file_list = [
            p for p in file_list
            if not p.startswith('.') and os.path.splitext(p)[1].lstrip('.') in _BWA_INDEX_EXT
        ]
        prefix = os.path.commonprefix(filtered_file_list).rstrip('.')
        if not prefix:
            # Choose what to show first, then format: `%` binds tighter than
            # a conditional expression, so doing both in one expression drops
            # the message text for long lists.
            if len(filtered_file_list) < 15:
                shown = filtered_file_list
            else:
                shown = "{}, ...".format(', '.join(filtered_file_list[0:15]))
            raise RuntimeError(
                "Could not determine common prefix from list of files (%s)" % (shown,))
        full_prefix = os.path.join(ref_dir, prefix)
        return full_prefix

    def __init__(self, ctx):
        """
        Configure the mapper, set up the aligner, load the reference and
        assemble the chain of objects that processes aligned fragments.

        Raises NotImplementedError when the job is not map-only.
        """
        super(mapper, self).__init__(ctx)
        self.logger = logging.getLogger("seqal")
        self.__get_configuration(ctx)
        logging.basicConfig(level=self.log_level)
        self.event_monitor = HadoopEventMonitor(self.COUNTER_CLASS, logging.getLogger("mapper"), ctx)

        pe = True  # single-end sequence alignment not yet supported by Seqal
        self.hi_rapi = HiRapiAligner('rapi_bwa', paired=pe)

        # opts
        self.hi_rapi.opts.n_threads = self.nthreads
        self.hi_rapi.opts.isize_max = self.max_isize
        if self.min_isize is not None:
            self.hi_rapi.opts.isize_min = self.min_isize
        # The aligner's quality-encoding attribute is `q_offset`; assigning to
        # `qoffset` would only create an unrelated attribute and leave the
        # configured encoding unused.
        if self.format == "fastq-illumina":
            self.hi_rapi.q_offset = self.hi_rapi.Qenc_Illumina
        else:
            self.hi_rapi.q_offset = self.hi_rapi.Qenc_Sanger
        # end opts

        self.logger.info("Using the %s aligner plugin, aligner version %s, plugin version %s",
                         self.hi_rapi.aligner_name, self.hi_rapi.aligner_version, self.hi_rapi.plugin_version)
        self.logger.info("Working in %s mode", 'paired-end' if pe else 'single-end')

        # allocate space for reads
        self.logger.debug("Reserving batch space for %s reads", self.batch_size)
        self.hi_rapi.reserve_space(self.batch_size)

        # load reference
        reference_root = self.get_reference_root_from_archive(utils.get_ref_archive(ctx.getJobConf()))
        self.logger.info("Full reference path (prefix): %s", reference_root)
        with self.event_monitor.time_block("Loading reference %s" % reference_root):
            self.hi_rapi.load_ref(reference_root)

        ######## assemble hit processor chain
        chain = RapiFilterLink(self.event_monitor)
        chain.remove_unmapped = self.remove_unmapped
        chain.min_hit_quality = self.min_hit_quality
        if self.__map_only:
            chain.set_next(RapiEmitSamLink(ctx, self.event_monitor, self.hi_rapi))
        else:
            raise NotImplementedError("Only mapping mode is supported at the moment")
        self.hit_visitor_chain = chain

    def _visit_hits(self):
        """Pass every fragment of the current batch to the hit processor chain."""
        for read_tpl in self.hi_rapi.ifragments():
            self.hit_visitor_chain.process(read_tpl)

    def _flush_batch(self):
        """Align the reads held by the aligner, process them and empty the batch."""
        self.hi_rapi.align_batch()
        self._visit_hits()
        self.hi_rapi.clear_batch()

    def map(self, ctx):
        """
        Load the read pair found in C{ctx.value} into the aligner batch.

        The value must be a tab-separated line with exactly 5 fields.  Once
        the batch holds at least C{batch_size} reads it is aligned, processed
        and cleared.
        """
        v = ctx.value
        f_id, r1, q1, r2, q2 = v.split("\t")
        self.hi_rapi.load_pair(f_id, r1, q1, r2, q2)
        if self.hi_rapi.batch_size >= self.batch_size:
            self._flush_batch()

    def close(self):
        """Align any reads left in the batch, then release the aligner's resources."""
        try:
            # If there are any reads left in the aligner batch,
            # align them too
            if self.hi_rapi.batch_size > 0:
                self._flush_batch()
        finally:
            # Release even when the last alignment fails.
            self.hi_rapi.release_resources()