def __init__(self, ctx):
    """Configure the reducer from the Hadoop job configuration.

    Reads the logging level and the duplicate-discard flag (honoring
    deprecated property names), then wires up the event monitor and the
    SAM output sink.
    """
    super(reducer, self).__init__(ctx)
    job_conf = ctx.getJobConf()
    deprecation_logger = logging.getLogger("seqal")
    # Translate any deprecated bl.seqal.* property names to seal.seqal.*
    conf = deprecation_utils.convert_job_conf(job_conf, self.DeprecationMap, deprecation_logger)
    jc_configure(self, conf, 'seal.seqal.log.level', 'log_level', 'INFO')
    jc_configure_bool(self, conf, 'seal.seqal.discard_duplicates', 'discard_duplicates', False)
    logging.basicConfig(level=self.log_level)
    self.event_monitor = HadoopEventMonitor(self.COUNTER_CLASS, logging.getLogger("reducer"), ctx)
    self.__output_sink = EmitSamLink(ctx, self.event_monitor)
def __init__(self, ctx):
    """Set up the BWA aligner and its hit-processing chain.

    NOTE(review): ``super(type(self), self)`` recurses infinitely if this
    class is ever subclassed; naming the class explicitly would be safer.
    """
    super(type(self), self).__init__(ctx)
    self.__get_configuration(ctx)
    logging.basicConfig(level=self.log_level)
    self.event_monitor = HadoopEventMonitor(self.COUNTER_CLASS, logging.getLogger("mapper"), ctx)

    # Configure the aligner from the job settings gathered above.
    aligner = BwaAligner()
    aligner.event_monitor = self.event_monitor
    aligner.qformat = self.format
    aligner.max_isize = self.max_isize
    aligner.nthreads = self.nthreads
    aligner.trim_qual = self.trim_qual
    aligner.mmap_enabled = True
    self.aligner = aligner

    ######## assemble hit processor chain: filter -> emit / mark duplicates
    chain = FilterLink(self.event_monitor)
    chain.remove_unmapped = self.remove_unmapped
    chain.min_hit_quality = self.min_hit_quality
    if self.__map_only:
        sink = EmitSamLink(ctx, self.event_monitor)
    else:
        sink = MarkDuplicatesEmitter(ctx, self.event_monitor)
    chain.set_next(sink)
    self.aligner.hit_visitor = chain

    ######## set the path to the reference index
    self.ref_archive = utils.get_ref_archive(ctx.getJobConf())
    self.aligner.reference = self.get_reference_root(self.ref_archive)

    # Workaround for accumulating records; see issue #331.
    isplit = InputSplit(ctx.getInputSplit())
    self.split_end = isplit.offset + isplit.length
def setUp(self):
    """Build a fake map context, a monitored MarkDuplicatesEmitter, and canned pairs."""
    self.ctx = map_context(None, None)
    self.count_group = "Test"
    self.logger = SavingLogger()
    self.monitor = HadoopEventMonitor(self.count_group, self.logger, self.ctx)
    # Link under test, wired to the fake context and monitor above.
    self.link = MarkDuplicatesEmitter(self.ctx, self.monitor)
    # Canned read pairs shared by the test cases.
    self.pair1 = test_utils.pair1()
    self.pair2 = test_utils.pair2()
def __init__(self, ctx):
    """Initialize the RAPI-based mapper: aligner options, reference, hit chain."""
    super(mapper, self).__init__(ctx)
    self.logger = logging.getLogger("seqal")
    self.__get_configuration(ctx)
    logging.basicConfig(level=self.log_level)
    self.event_monitor = HadoopEventMonitor(self.COUNTER_CLASS, logging.getLogger("mapper"), ctx)

    paired = True  # single-end sequencen alignment not yet supported by Seqal
    self.hi_rapi = HiRapiAligner('rapi_bwa', paired=paired)

    # Aligner options taken from the job configuration.
    opts = self.hi_rapi.opts
    opts.n_threads = self.nthreads
    opts.isize_max = self.max_isize
    if self.min_isize is not None:
        opts.isize_min = self.min_isize
    if self.format == "fastq-illumina":
        self.hi_rapi.qoffset = self.hi_rapi.Qenc_Illumina
    else:
        self.hi_rapi.qoffset = self.hi_rapi.Qenc_Sanger
    # end opts

    self.logger.info("Using the %s aligner plugin, aligner version %s, plugin version %s",
                     self.hi_rapi.aligner_name, self.hi_rapi.aligner_version,
                     self.hi_rapi.plugin_version)
    self.logger.info("Working in %s mode", 'paired-end' if paired else 'single-end')

    # allocate space for reads
    self.logger.debug("Reserving batch space for %s reads", self.batch_size)
    self.hi_rapi.reserve_space(self.batch_size)

    # load reference from the distributed cache archive
    reference_root = self.get_reference_root_from_archive(utils.get_ref_archive(ctx.getJobConf()))
    self.logger.info("Full reference path (prefix): %s", reference_root)
    with self.event_monitor.time_block("Loading reference %s" % reference_root):
        self.hi_rapi.load_ref(reference_root)

    ######## assemble hit processor chain (map-only is the sole supported mode)
    chain = RapiFilterLink(self.event_monitor)
    chain.remove_unmapped = self.remove_unmapped
    chain.min_hit_quality = self.min_hit_quality
    if self.__map_only:
        chain.set_next(RapiEmitSamLink(ctx, self.event_monitor, self.hi_rapi))
    else:
        raise NotImplementedError("Only mapping mode is supported at the moment")
    self.hit_visitor_chain = chain
def setUp(self):
    """Create a monitored EmitSamLink plus a pair of simple mappings.

    self.pair holds two mappings: m1 (name "first", tid "tid1") and
    m2 (name "second", tid "tid2").
    """
    self.map_ctx = map_context(None, None)
    self.count_group = "Test"
    self.logger = SavingLogger()
    self.monitor = HadoopEventMonitor(self.count_group, self.logger, self.map_ctx)
    self.emitter = EmitSamLink(self.map_ctx, self.monitor)
    self.pair = [SimpleMapping(), SimpleMapping()]
    self.m1, self.m2 = self.pair
    for mapping, name, tid in ((self.m1, "first", "tid1"),
                               (self.m2, "second", "tid2")):
        mapping.set_name(name)
        mapping.tid = tid
class reducer(Reducer):
    """Removes duplicate read pairs/fragments and emits SAM records.

    Input keys are either the "unmapped" marker or coordinate keys of the
    form ``ref_id:pos:orient``; values are protobuf-serialized read pairs
    or the PAIR_STRING placeholder.
    """

    COUNTER_CLASS = "SEQAL"
    # TODO: refactor so that mapper and reducer have a common place for things like this constant
    DeprecationMap = {
        'seal.seqal.log.level': 'bl.seqal.log.level',
        'seal.seqal.discard_duplicates': 'bl.seqal.discard_duplicates',
    }

    def __init__(self, ctx):
        """Configure logging/duplicate handling and build the output sink."""
        super(reducer, self).__init__(ctx)
        jc = ctx.getJobConf()
        logger = logging.getLogger("seqal")
        jobconf = deprecation_utils.convert_job_conf(jc, self.DeprecationMap, logger)
        jc_configure(self, jobconf, 'seal.seqal.log.level', 'log_level', 'INFO')
        jc_configure_bool(self, jobconf, 'seal.seqal.discard_duplicates', 'discard_duplicates', False)
        logging.basicConfig(level=self.log_level)
        self.event_monitor = HadoopEventMonitor(self.COUNTER_CLASS, logging.getLogger("reducer"), ctx)
        self.__output_sink = EmitSamLink(ctx, self.event_monitor)

    def __process_unmapped_pairs(self, ctx):
        # Unmapped pairs pass straight through to the output sink.
        while ctx.nextValue():
            value = ctx.getInputValue()
            pair = protobuf_mapping.unserialize_pair(value)
            self.__output_sink.process(pair)

    def reduce(self, ctx):
        """Process all values grouped under one coordinate (or unmapped) key."""
        # create the "workspace"
        self.__pairs = []
        self.__unpaired = []
        # gather input
        key_values = ctx.getInputKey().split(':')
        if key_values[0] == seqal_app.UNMAPPED_STRING:
            # pair of unmapped sequences
            self.__process_unmapped_pairs(ctx)
        else:
            if len(key_values) != 3:
                # BUG FIX: this message previously used len(key), but `key` is
                # only defined below -- it would have raised a NameError.
                raise RuntimeError("Unexpected key length %d. Expected key format is ref_id:pos:orient" % len(key_values))
            # convert key values and make it a tuple;
            # last value is True if reverse strand
            key = (int(key_values[0]), int(key_values[1]), key_values[2] == 'R')
            have_pairs = False  # keep track of whether we have at least one real pair
            # load mappings
            while ctx.nextValue():
                value = ctx.getInputValue()
                if value == seqal_app.PAIR_STRING:
                    have_pairs = True
                else:
                    pair = protobuf_mapping.unserialize_pair(value)
                    if pair[0] is None or pair[0].is_unmapped():
                        # Sanity check.  pair[0] should never be None or unmapped here.
                        raise ValueError("Error! Got None or unmapped in first read for key %s. pair: %s" % (key, pair))
                    if pair[1] and pair[1].is_unmapped():
                        # Mate is unmapped: emit it as-is, keep the mapped read
                        # as a lone fragment for duplicate detection.
                        self.__output_sink.process((pair[1], None))
                        self.__unpaired.append((pair[0], None))
                    elif pair[1] is None:
                        self.__unpaired.append(pair)
                    else:
                        # Two mapped reads.  pair[0] should never be unmapped;
                        # that case is handled by __process_unmapped_pairs.
                        self.__pairs.append(pair)
                        have_pairs = True
            self.__process_pairs()
            self.__process_fragments(have_pairs)
        # clean-up the workspace
        self.__pairs = None
        self.__unpaired = None

    def __process_pairs(self):
        # All pairs whose 5'-most coordinate matches the key,
        # and are not duplicate pairs, will be emitted.
        keep_pairs = dict()
        for p in self.__pairs:
            p_key = get_pair_key(p)  # key on which we base the comparison between pairs
            # If we already have a pair with this key, keep the one with the
            # highest score; otherwise record the pair.
            if p_key in keep_pairs:
                if get_map_pair_score(keep_pairs[p_key]) < get_map_pair_score(p):
                    dup_pair = keep_pairs[p_key]
                    keep_pairs[p_key] = p
                else:
                    dup_pair = p
                self.event_monitor.count("duplicate pairs")
                if not self.discard_duplicates:
                    # emit the duplicates if we need to
                    for r in dup_pair:
                        r.set_duplicate(True)
                    self.__output_sink.process(dup_pair)
            else:
                keep_pairs[p_key] = p
        # finally, emit the pairs that we've kept
        self.event_monitor.count("rmdup unique pairs", len(keep_pairs))
        for pair in keep_pairs.itervalues():
            self.__output_sink.process(pair)

    def __process_fragments(self, with_pairs):
        # All fragments that are not the duplicate of another fragment,
        # be it in a pair or alone, will be emitted.
        #
        # All fragments analyzed here were emitted for the same coordinate
        # (the one referenced by the key), so they automatically have a
        # duplicate in any pairs we have received.  Consequently:
        #
        #   with_pairs     => all lone fragments are duplicates to discard.
        #   not with_pairs => duplicates are selected by mapping quality.
        if with_pairs:
            # all fragments are duplicates
            self.event_monitor.count("duplicate fragments", len(self.__unpaired))
            if not self.discard_duplicates:
                for dup in self.__unpaired:  # for each unpaired fragment
                    dup[0].set_duplicate(True)
                    self.__output_sink.process(dup)
        else:
            fragments = dict()
            for m, _ in self.__unpaired:  # for each unpaired fragment
                k = get_mapping_key(m)
                if k in fragments:
                    # Keep the higher-scoring fragment; the other is a duplicate.
                    if get_map_score(fragments[k]) < get_map_score(m):
                        dup = fragments[k]
                        fragments[k] = m
                    else:
                        dup = m
                    self.event_monitor.count("duplicate fragments")
                    if not self.discard_duplicates:
                        dup.set_duplicate(True)
                        self.__output_sink.process((dup, None))
                else:
                    fragments[k] = m
            # now emit the remaining fragments
            self.event_monitor.count("rmdup unique fragments", len(fragments))
            for m in fragments.itervalues():
                self.__output_sink.process((m, None))
class reducer(Reducer):
    """Removes duplicate read pairs/fragments and emits SAM records.

    Input keys are either the "unmapped" marker or coordinate keys of the
    form ``ref_id:pos:orient``; values are protobuf-serialized read pairs
    or the PAIR_STRING placeholder.
    """

    COUNTER_CLASS = "SEQAL"
    # TODO: refactor so that mapper and reducer have a common place for things like this constant
    DeprecationMap = {
        'seal.seqal.log.level': 'bl.seqal.log.level',
        'seal.seqal.discard_duplicates': 'bl.seqal.discard_duplicates',
    }

    def __init__(self, ctx):
        """Configure logging/duplicate handling and build the output sink."""
        super(reducer, self).__init__(ctx)
        jc = ctx.getJobConf()
        logger = logging.getLogger("seqal")
        jobconf = deprecation_utils.convert_job_conf(jc, self.DeprecationMap, logger)
        jc_configure(self, jobconf, 'seal.seqal.log.level', 'log_level', 'INFO')
        jc_configure_bool(self, jobconf, 'seal.seqal.discard_duplicates', 'discard_duplicates', False)
        logging.basicConfig(level=self.log_level)
        self.event_monitor = HadoopEventMonitor(self.COUNTER_CLASS, logging.getLogger("reducer"), ctx)
        self.__output_sink = EmitSamLink(ctx, self.event_monitor)

    def __process_unmapped_pairs(self, ctx):
        # Unmapped pairs pass straight through to the output sink.
        while ctx.nextValue():
            value = ctx.getInputValue()
            pair = protobuf_mapping.unserialize_pair(value)
            self.__output_sink.process(pair)

    def reduce(self, ctx):
        """Process all values grouped under one coordinate (or unmapped) key."""
        # create the "workspace"
        self.__pairs = []
        self.__unpaired = []
        # gather input
        key_values = ctx.getInputKey().split(':')
        if key_values[0] == seqal_app.UNMAPPED_STRING:
            # pair of unmapped sequences
            self.__process_unmapped_pairs(ctx)
        else:
            if len(key_values) != 3:
                # BUG FIX: this message previously used len(key), but `key` is
                # only defined below -- it would have raised a NameError.
                raise RuntimeError(
                    "Unexpected key length %d. Expected key format is ref_id:pos:orient" % len(key_values))
            # convert key values and make it a tuple;
            # last value is True if reverse strand
            key = (int(key_values[0]), int(key_values[1]), key_values[2] == 'R')
            have_pairs = False  # keep track of whether we have at least one real pair
            # load mappings
            while ctx.nextValue():
                value = ctx.getInputValue()
                if value == seqal_app.PAIR_STRING:
                    have_pairs = True
                else:
                    pair = protobuf_mapping.unserialize_pair(value)
                    if pair[0] is None or pair[0].is_unmapped():
                        # Sanity check.  pair[0] should never be None or unmapped here.
                        raise ValueError(
                            "Error! Got None or unmapped in first read for key %s. pair: %s" % (key, pair))
                    if pair[1] and pair[1].is_unmapped():
                        # Mate is unmapped: emit it as-is, keep the mapped read
                        # as a lone fragment for duplicate detection.
                        self.__output_sink.process((pair[1], None))
                        self.__unpaired.append((pair[0], None))
                    elif pair[1] is None:
                        self.__unpaired.append(pair)
                    else:
                        # Two mapped reads.  pair[0] should never be unmapped;
                        # that case is handled by __process_unmapped_pairs.
                        self.__pairs.append(pair)
                        have_pairs = True
            self.__process_pairs()
            self.__process_fragments(have_pairs)
        # clean-up the workspace
        self.__pairs = None
        self.__unpaired = None

    def __process_pairs(self):
        # All pairs whose 5'-most coordinate matches the key,
        # and are not duplicate pairs, will be emitted.
        keep_pairs = dict()
        for p in self.__pairs:
            p_key = get_pair_key(p)  # key on which we base the comparison between pairs
            # If we already have a pair with this key, keep the one with the
            # highest score; otherwise record the pair.
            if p_key in keep_pairs:
                if get_map_pair_score(keep_pairs[p_key]) < get_map_pair_score(p):
                    dup_pair = keep_pairs[p_key]
                    keep_pairs[p_key] = p
                else:
                    dup_pair = p
                self.event_monitor.count("duplicate pairs")
                if not self.discard_duplicates:
                    # emit the duplicates if we need to
                    for r in dup_pair:
                        r.set_duplicate(True)
                    self.__output_sink.process(dup_pair)
            else:
                keep_pairs[p_key] = p
        # finally, emit the pairs that we've kept
        self.event_monitor.count("rmdup unique pairs", len(keep_pairs))
        for pair in keep_pairs.itervalues():
            self.__output_sink.process(pair)

    def __process_fragments(self, with_pairs):
        # All fragments that are not the duplicate of another fragment,
        # be it in a pair or alone, will be emitted.
        #
        # All fragments analyzed here were emitted for the same coordinate
        # (the one referenced by the key), so they automatically have a
        # duplicate in any pairs we have received.  Consequently:
        #
        #   with_pairs     => all lone fragments are duplicates to discard.
        #   not with_pairs => duplicates are selected by mapping quality.
        if with_pairs:
            # all fragments are duplicates
            self.event_monitor.count("duplicate fragments", len(self.__unpaired))
            if not self.discard_duplicates:
                for dup in self.__unpaired:  # for each unpaired fragment
                    dup[0].set_duplicate(True)
                    self.__output_sink.process(dup)
        else:
            fragments = dict()
            for m, _ in self.__unpaired:  # for each unpaired fragment
                k = get_mapping_key(m)
                if k in fragments:
                    # Keep the higher-scoring fragment; the other is a duplicate.
                    if get_map_score(fragments[k]) < get_map_score(m):
                        dup = fragments[k]
                        fragments[k] = m
                    else:
                        dup = m
                    self.event_monitor.count("duplicate fragments")
                    if not self.discard_duplicates:
                        dup.set_duplicate(True)
                        self.__output_sink.process((dup, None))
                else:
                    fragments[k] = m
            # now emit the remaining fragments
            self.event_monitor.count("rmdup unique fragments", len(fragments))
            for m in fragments.itervalues():
                self.__output_sink.process((m, None))
class mapper(Mapper):
    """
    Aligns sequences to a reference genome.

    @input-record: C{key} does not matter (standard LineRecordReader);
    C{value} is a tab-separated text line with 5 fields: ID, read_seq,
    read_qual, mate_seq, mate_qual.

    @output-record: protobuf-serialized mapped pairs (map-reduce job) or
    alignment records in SAM format (map-only job).

    @jobconf-param: C{mapred.reduce.tasks} number of Hadoop reduce tasks to launch.
    If the value of this property is set to 0, then the mapper will directly output
    the mappings in SAM format, like BWA.  If set to a value > 0 the mapper will output
    mappings in the protobuf serialized format for the rmdup reducer.

    @jobconf-param: C{seal.seqal.log.level} logging level, specified as a logging
    module literal.

    @jobconf-param: C{mapred.cache.archives} distributed cache entry for the bwa
    index archive.  The entry is of the form HDFS_PATH#LINK_NAME.  The archive
    for a given chromosome must contain (at the top level, i.e., no directories)
    all files generated by 'bwa index' for that chromosome.

    @jobconf-param: C{seal.seqal.alignment.max.isize}: if the inferred isize is
    greater than this value, Smith-Waterman alignment for unmapped reads will be
    skipped.

    @jobconf-param: C{seal.seqal.pairing.batch.size}: how many sequences should be
    processed at a time by the pairing function.  Status will be updated at each
    new batch: therefore, lowering this value can help avoid timeouts.

    @jobconf-param: C{seal.seqal.fastq-subformat} Specifies base quality score
    encoding.  Supported types are: 'fastq-sanger' and 'fastq-illumina'.

    @jobconf-param: C{mapred.create.symlink} must be set to 'yes'.

    @jobconf-param: C{seal.seqal.min_hit_quality} mapping quality threshold below
    which the mapping will be discarded.
    """

    SUPPORTED_FORMATS = "fastq-illumina", "fastq-sanger"
    DEFAULT_FORMAT = "fastq-sanger"
    COUNTER_CLASS = "SEQAL"
    DeprecationMap = {
        "seal.seqal.log.level": "bl.seqal.log.level",
        "seal.seqal.alignment.max.isize": "bl.seqal.alignment.max.isize",
        "seal.seqal.pairing.batch.size": "bl.seqal.pairing.batch.size",
        "seal.seqal.fastq-subformat": "bl.seqal.fastq-subformat",
        "seal.seqal.min_hit_quality": "bl.seqal.min_hit_quality",
        "seal.seqal.remove_unmapped": "bl.seqal.remove_unmapped",
        "seal.seqal.discard_duplicates": "bl.seqal.discard_duplicates",
        "seal.seqal.nthreads": "bl.seqal.nthreads",
        "seal.seqal.trim.qual": "bl.seqal.trim.qual",
    }

    def __get_configuration(self, ctx):
        # Read and validate all job-configuration properties.
        # TODO: refactor settings common to mapper and reducer
        jc = ctx.getJobConf()
        jobconf = deprecation_utils.convert_job_conf(jc, self.DeprecationMap, self.logger)
        jc_configure(self, jobconf, 'seal.seqal.log.level', 'log_level', 'INFO')
        jc_configure(self, jobconf, "seal.seqal.fastq-subformat", "format", self.DEFAULT_FORMAT)
        jc_configure_int(self, jobconf, 'seal.seqal.alignment.max.isize', 'max_isize', 1000)
        jc_configure_int(self, jobconf, 'seal.seqal.alignment.min.isize', 'min_isize', None)
        jc_configure_int(self, jobconf, 'seal.seqal.pairing.batch.size', 'batch_size', 10000)
        jc_configure_int(self, jobconf, 'seal.seqal.min_hit_quality', 'min_hit_quality', 0)
        jc_configure_bool(self, jobconf, 'seal.seqal.remove_unmapped', 'remove_unmapped', False)
        jc_configure_int(self, jobconf, 'seal.seqal.nthreads', 'nthreads', 1)
        jc_configure_int(self, jobconf, 'seal.seqal.trim.qual', 'trim_qual', 0)

        # Translate the textual log level (e.g. "INFO") to its numeric value.
        try:
            self.log_level = getattr(logging, self.log_level)
        except AttributeError:
            raise ValueError("Unsupported log level: %r" % self.log_level)

        if self.format not in self.SUPPORTED_FORMATS:
            raise_pydoop_exception(
                "seal.seqal.fastq-subformat must be one of %r" % (self.SUPPORTED_FORMATS,))
        if self.remove_unmapped:
            raise NotImplementedError("seal.seqal.remove_unmapped is currently unsupported")
        if self.min_hit_quality > 0:
            raise NotImplementedError("seal.seqal.min_hit_quality is currently unsupported")
        if self.trim_qual > 0:
            raise NotImplementedError("seal.seqal.trim_qual is currently unsupported")
        if self.max_isize <= 0:
            raise ValueError("'seal.seqal.alignment.max.isize' must be > 0, if specified [1000]")
        if self.batch_size <= 0:
            raise ValueError("'seal.seqal.pairing.batch.size' must be > 0, if specified [10000]")
        # minimum qual value required for a hit to be kept.  By default outputs
        # all the hits BWA returns.
        if self.min_hit_quality < 0:
            raise ValueError("'seal.seqal.min_hit_quality' must be >= 0, if specified [0]")
        # number of concurrent threads for main alignment operation
        if self.nthreads <= 0:
            raise ValueError("'seal.seqal.nthreads' must be > 0, if specified [1]")
        # trim quality parameter used by BWA from read trimming.  Equivalent to
        # the -q parameter for bwa align
        if self.trim_qual < 0:
            raise ValueError("'seal.seqal.trim.qual' must be >= 0, if specified [0]")

        # Map-only job unless a positive number of reduce tasks is configured.
        if jc.hasKey('mapred.reduce.tasks') and jc.getInt('mapred.reduce.tasks') > 0:
            self.__map_only = False
        else:
            self.__map_only = True

    def get_reference_root_from_archive(self, ref_dir):
        """
        Given a directory containing an indexed reference, such that all its
        files have a common name (except the extension), this method finds the
        path to the reference including the common name.

        e.g.
          my_reference/hg_18.bwt
          my_reference/hg_18.rsax
          my_reference/hg_18.sax      => "my_reference/hg_18"
          my_reference/hg_18.pac
          my_reference/.irrelevant_file
        """
        file_list = os.listdir(ref_dir)
        if self.logger.isEnabledFor(logging.DEBUG):
            self.logger.debug("file_list extracted from reference archive: %s", file_list)
        # Keep only non-hidden files whose extension is a known BWA index extension.
        filtered_file_list = [
            p for p in file_list
            if not p.startswith('.') and os.path.splitext(p)[1].lstrip('.') in _BWA_INDEX_EXT
        ]
        prefix = os.path.commonprefix(filtered_file_list).rstrip('.')
        if not prefix:
            # BUG FIX: the conditional expression must be parenthesized.
            # Previously `"..." % lst if cond else "...".format(...)` dropped
            # the explanatory prefix whenever there were >= 15 files.
            shown = (filtered_file_list if len(filtered_file_list) < 15
                     else "{}, ...".format(', '.join(filtered_file_list[0:15])))
            raise RuntimeError("Could not determine common prefix from list of files (%s)" % (shown,))
        full_prefix = os.path.join(ref_dir, prefix)
        return full_prefix

    def __init__(self, ctx):
        """Initialize the RAPI-based mapper: aligner options, reference, hit chain."""
        super(mapper, self).__init__(ctx)
        self.logger = logging.getLogger("seqal")
        self.__get_configuration(ctx)
        logging.basicConfig(level=self.log_level)
        self.event_monitor = HadoopEventMonitor(self.COUNTER_CLASS, logging.getLogger("mapper"), ctx)

        pe = True  # single-end sequencen alignment not yet supported by Seqal
        self.hi_rapi = HiRapiAligner('rapi_bwa', paired=pe)
        # opts
        self.hi_rapi.opts.n_threads = self.nthreads
        self.hi_rapi.opts.isize_max = self.max_isize
        if self.min_isize is not None:
            self.hi_rapi.opts.isize_min = self.min_isize
        self.hi_rapi.qoffset = (self.hi_rapi.Qenc_Illumina
                                if self.format == "fastq-illumina"
                                else self.hi_rapi.Qenc_Sanger)
        # end opts
        self.logger.info("Using the %s aligner plugin, aligner version %s, plugin version %s",
                         self.hi_rapi.aligner_name, self.hi_rapi.aligner_version,
                         self.hi_rapi.plugin_version)
        self.logger.info("Working in %s mode", 'paired-end' if pe else 'single-end')

        # allocate space for reads
        self.logger.debug("Reserving batch space for %s reads", self.batch_size)
        self.hi_rapi.reserve_space(self.batch_size)

        # load reference
        reference_root = self.get_reference_root_from_archive(utils.get_ref_archive(ctx.getJobConf()))
        self.logger.info("Full reference path (prefix): %s", reference_root)
        with self.event_monitor.time_block("Loading reference %s" % reference_root):
            self.hi_rapi.load_ref(reference_root)

        ######## assemble hit processor chain
        chain = RapiFilterLink(self.event_monitor)
        chain.remove_unmapped = self.remove_unmapped
        chain.min_hit_quality = self.min_hit_quality
        if self.__map_only:
            chain.set_next(RapiEmitSamLink(ctx, self.event_monitor, self.hi_rapi))
        else:
            raise NotImplementedError("Only mapping mode is supported at the moment")
        self.hit_visitor_chain = chain

    def _visit_hits(self):
        # Feed each aligned fragment through the hit-processing chain.
        for read_tpl in self.hi_rapi.ifragments():
            self.hit_visitor_chain.process(read_tpl)

    def map(self, ctx):
        """Accumulate reads in the aligner batch; align and emit when full."""
        v = ctx.value
        f_id, r1, q1, r2, q2 = v.split("\t")
        self.hi_rapi.load_pair(f_id, r1, q1, r2, q2)
        if self.hi_rapi.batch_size >= self.batch_size:
            self.hi_rapi.align_batch()
            self._visit_hits()
            self.hi_rapi.clear_batch()

    def close(self):
        """Align any reads left in the batch, then free aligner resources."""
        if self.hi_rapi.batch_size > 0:
            self.hi_rapi.align_batch()
            self._visit_hits()
            self.hi_rapi.clear_batch()
        self.hi_rapi.release_resources()