def __init__(self, ctx):
    """Configure the reducer from the Hadoop job configuration.

    Reads the logging level and the duplicate-discard flag (honoring
    deprecated property names), then wires up the event monitor and the
    SAM output sink.
    """
    super(reducer, self).__init__(ctx)
    job_conf = ctx.getJobConf()
    deprecation_logger = logging.getLogger("seqal")
    # Translate any deprecated bl.seqal.* property names to seal.seqal.*
    conf = deprecation_utils.convert_job_conf(job_conf, self.DeprecationMap, deprecation_logger)
    jc_configure(self, conf, 'seal.seqal.log.level', 'log_level', 'INFO')
    jc_configure_bool(self, conf, 'seal.seqal.discard_duplicates', 'discard_duplicates', False)
    logging.basicConfig(level=self.log_level)
    self.event_monitor = HadoopEventMonitor(self.COUNTER_CLASS, logging.getLogger("reducer"), ctx)
    self.__output_sink = EmitSamLink(ctx, self.event_monitor)
def __init__(self, ctx):
    """Set up the BWA aligner and its hit-processing chain.

    NOTE(review): ``super(type(self), self)`` recurses infinitely if this
    class is ever subclassed; naming the class explicitly would be safer.
    """
    super(type(self), self).__init__(ctx)
    self.__get_configuration(ctx)
    logging.basicConfig(level=self.log_level)
    self.event_monitor = HadoopEventMonitor(self.COUNTER_CLASS, logging.getLogger("mapper"), ctx)

    # Configure the aligner from the job settings gathered above.
    aligner = BwaAligner()
    aligner.event_monitor = self.event_monitor
    aligner.qformat = self.format
    aligner.max_isize = self.max_isize
    aligner.nthreads = self.nthreads
    aligner.trim_qual = self.trim_qual
    aligner.mmap_enabled = True
    self.aligner = aligner

    ######## assemble hit processor chain: filter -> emit / mark duplicates
    chain = FilterLink(self.event_monitor)
    chain.remove_unmapped = self.remove_unmapped
    chain.min_hit_quality = self.min_hit_quality
    if self.__map_only:
        sink = EmitSamLink(ctx, self.event_monitor)
    else:
        sink = MarkDuplicatesEmitter(ctx, self.event_monitor)
    chain.set_next(sink)
    self.aligner.hit_visitor = chain

    ######## set the path to the reference index
    self.ref_archive = utils.get_ref_archive(ctx.getJobConf())
    self.aligner.reference = self.get_reference_root(self.ref_archive)

    # Workaround for accumulating records; see issue #331.
    isplit = InputSplit(ctx.getInputSplit())
    self.split_end = isplit.offset + isplit.length
def setUp(self):
    """Build a fake map context, a monitored MarkDuplicatesEmitter, and canned pairs."""
    self.ctx = map_context(None, None)
    self.count_group = "Test"
    self.logger = SavingLogger()
    self.monitor = HadoopEventMonitor(self.count_group, self.logger, self.ctx)
    # Link under test, wired to the fake context and monitor above.
    self.link = MarkDuplicatesEmitter(self.ctx, self.monitor)
    # Canned read pairs shared by the test cases.
    self.pair1 = test_utils.pair1()
    self.pair2 = test_utils.pair2()
def __init__(self, ctx):
    """Initialize the RAPI-based mapper: aligner options, reference, hit chain."""
    super(mapper, self).__init__(ctx)
    self.logger = logging.getLogger("seqal")
    self.__get_configuration(ctx)
    logging.basicConfig(level=self.log_level)
    self.event_monitor = HadoopEventMonitor(self.COUNTER_CLASS, logging.getLogger("mapper"), ctx)

    paired = True  # single-end sequencen alignment not yet supported by Seqal
    self.hi_rapi = HiRapiAligner('rapi_bwa', paired=paired)

    # Aligner options taken from the job configuration.
    opts = self.hi_rapi.opts
    opts.n_threads = self.nthreads
    opts.isize_max = self.max_isize
    if self.min_isize is not None:
        opts.isize_min = self.min_isize
    if self.format == "fastq-illumina":
        self.hi_rapi.qoffset = self.hi_rapi.Qenc_Illumina
    else:
        self.hi_rapi.qoffset = self.hi_rapi.Qenc_Sanger
    # end opts

    self.logger.info("Using the %s aligner plugin, aligner version %s, plugin version %s",
                     self.hi_rapi.aligner_name, self.hi_rapi.aligner_version,
                     self.hi_rapi.plugin_version)
    self.logger.info("Working in %s mode", 'paired-end' if paired else 'single-end')

    # allocate space for reads
    self.logger.debug("Reserving batch space for %s reads", self.batch_size)
    self.hi_rapi.reserve_space(self.batch_size)

    # load reference from the distributed cache archive
    reference_root = self.get_reference_root_from_archive(utils.get_ref_archive(ctx.getJobConf()))
    self.logger.info("Full reference path (prefix): %s", reference_root)
    with self.event_monitor.time_block("Loading reference %s" % reference_root):
        self.hi_rapi.load_ref(reference_root)

    ######## assemble hit processor chain (map-only is the sole supported mode)
    chain = RapiFilterLink(self.event_monitor)
    chain.remove_unmapped = self.remove_unmapped
    chain.min_hit_quality = self.min_hit_quality
    if self.__map_only:
        chain.set_next(RapiEmitSamLink(ctx, self.event_monitor, self.hi_rapi))
    else:
        raise NotImplementedError("Only mapping mode is supported at the moment")
    self.hit_visitor_chain = chain
def setUp(self):
    """Create a monitored EmitSamLink plus a pair of simple mappings.

    self.pair holds two mappings: m1 (name "first", tid "tid1") and
    m2 (name "second", tid "tid2").
    """
    self.map_ctx = map_context(None, None)
    self.count_group = "Test"
    self.logger = SavingLogger()
    self.monitor = HadoopEventMonitor(self.count_group, self.logger, self.map_ctx)
    self.emitter = EmitSamLink(self.map_ctx, self.monitor)
    self.pair = [SimpleMapping(), SimpleMapping()]
    self.m1, self.m2 = self.pair
    for mapping, name, tid in ((self.m1, "first", "tid1"),
                               (self.m2, "second", "tid2")):
        mapping.set_name(name)
        mapping.tid = tid
class reducer(Reducer):
    """Removes duplicate read pairs/fragments and emits SAM records.

    Input keys are either the "unmapped" marker or coordinate keys of the
    form ``ref_id:pos:orient``; values are protobuf-serialized read pairs
    or the PAIR_STRING placeholder.
    """

    COUNTER_CLASS = "SEQAL"
    # TODO: refactor so that mapper and reducer have a common place for things like this constant
    DeprecationMap = {
        'seal.seqal.log.level': 'bl.seqal.log.level',
        'seal.seqal.discard_duplicates': 'bl.seqal.discard_duplicates',
    }

    def __init__(self, ctx):
        """Configure logging/duplicate handling and build the output sink."""
        super(reducer, self).__init__(ctx)
        jc = ctx.getJobConf()
        logger = logging.getLogger("seqal")
        jobconf = deprecation_utils.convert_job_conf(jc, self.DeprecationMap, logger)
        jc_configure(self, jobconf, 'seal.seqal.log.level', 'log_level', 'INFO')
        jc_configure_bool(self, jobconf, 'seal.seqal.discard_duplicates', 'discard_duplicates', False)
        logging.basicConfig(level=self.log_level)
        self.event_monitor = HadoopEventMonitor(self.COUNTER_CLASS, logging.getLogger("reducer"), ctx)
        self.__output_sink = EmitSamLink(ctx, self.event_monitor)

    def __process_unmapped_pairs(self, ctx):
        # Unmapped pairs pass straight through to the output sink.
        while ctx.nextValue():
            value = ctx.getInputValue()
            pair = protobuf_mapping.unserialize_pair(value)
            self.__output_sink.process(pair)

    def reduce(self, ctx):
        """Process all values grouped under one coordinate (or unmapped) key."""
        # create the "workspace"
        self.__pairs = []
        self.__unpaired = []
        # gather input
        key_values = ctx.getInputKey().split(':')
        if key_values[0] == seqal_app.UNMAPPED_STRING:
            # pair of unmapped sequences
            self.__process_unmapped_pairs(ctx)
        else:
            if len(key_values) != 3:
                # BUG FIX: this message previously used len(key), but `key` is
                # only defined below -- it would have raised a NameError.
                raise RuntimeError("Unexpected key length %d. Expected key format is ref_id:pos:orient" % len(key_values))
            # convert key values and make it a tuple;
            # last value is True if reverse strand
            key = (int(key_values[0]), int(key_values[1]), key_values[2] == 'R')
            have_pairs = False  # keep track of whether we have at least one real pair
            # load mappings
            while ctx.nextValue():
                value = ctx.getInputValue()
                if value == seqal_app.PAIR_STRING:
                    have_pairs = True
                else:
                    pair = protobuf_mapping.unserialize_pair(value)
                    if pair[0] is None or pair[0].is_unmapped():
                        # Sanity check.  pair[0] should never be None or unmapped here.
                        raise ValueError("Error! Got None or unmapped in first read for key %s. pair: %s" % (key, pair))
                    if pair[1] and pair[1].is_unmapped():
                        # Mate is unmapped: emit it as-is, keep the mapped read
                        # as a lone fragment for duplicate detection.
                        self.__output_sink.process((pair[1], None))
                        self.__unpaired.append((pair[0], None))
                    elif pair[1] is None:
                        self.__unpaired.append(pair)
                    else:
                        # Two mapped reads.  pair[0] should never be unmapped;
                        # that case is handled by __process_unmapped_pairs.
                        self.__pairs.append(pair)
                        have_pairs = True
            self.__process_pairs()
            self.__process_fragments(have_pairs)
        # clean-up the workspace
        self.__pairs = None
        self.__unpaired = None

    def __process_pairs(self):
        # All pairs whose 5'-most coordinate matches the key,
        # and are not duplicate pairs, will be emitted.
        keep_pairs = dict()
        for p in self.__pairs:
            p_key = get_pair_key(p)  # key on which we base the comparison between pairs
            # If we already have a pair with this key, keep the one with the
            # highest score; otherwise record the pair.
            if p_key in keep_pairs:
                if get_map_pair_score(keep_pairs[p_key]) < get_map_pair_score(p):
                    dup_pair = keep_pairs[p_key]
                    keep_pairs[p_key] = p
                else:
                    dup_pair = p
                self.event_monitor.count("duplicate pairs")
                if not self.discard_duplicates:
                    # emit the duplicates if we need to
                    for r in dup_pair:
                        r.set_duplicate(True)
                    self.__output_sink.process(dup_pair)
            else:
                keep_pairs[p_key] = p
        # finally, emit the pairs that we've kept
        self.event_monitor.count("rmdup unique pairs", len(keep_pairs))
        for pair in keep_pairs.itervalues():
            self.__output_sink.process(pair)

    def __process_fragments(self, with_pairs):
        # All fragments that are not the duplicate of another fragment,
        # be it in a pair or alone, will be emitted.
        #
        # All fragments analyzed here were emitted for the same coordinate
        # (the one referenced by the key), so they automatically have a
        # duplicate in any pairs we have received.  Consequently:
        #
        #   with_pairs     => all lone fragments are duplicates to discard.
        #   not with_pairs => duplicates are selected by mapping quality.
        if with_pairs:
            # all fragments are duplicates
            self.event_monitor.count("duplicate fragments", len(self.__unpaired))
            if not self.discard_duplicates:
                for dup in self.__unpaired:  # for each unpaired fragment
                    dup[0].set_duplicate(True)
                    self.__output_sink.process(dup)
        else:
            fragments = dict()
            for m, _ in self.__unpaired:  # for each unpaired fragment
                k = get_mapping_key(m)
                if k in fragments:
                    # Keep the higher-scoring fragment; the other is a duplicate.
                    if get_map_score(fragments[k]) < get_map_score(m):
                        dup = fragments[k]
                        fragments[k] = m
                    else:
                        dup = m
                    self.event_monitor.count("duplicate fragments")
                    if not self.discard_duplicates:
                        dup.set_duplicate(True)
                        self.__output_sink.process((dup, None))
                else:
                    fragments[k] = m
            # now emit the remaining fragments
            self.event_monitor.count("rmdup unique fragments", len(fragments))
            for m in fragments.itervalues():
                self.__output_sink.process((m, None))
class reducer(Reducer):
    """Removes duplicate read pairs/fragments and emits SAM records.

    Input keys are either the "unmapped" marker or coordinate keys of the
    form ``ref_id:pos:orient``; values are protobuf-serialized read pairs
    or the PAIR_STRING placeholder.
    """

    COUNTER_CLASS = "SEQAL"
    # TODO: refactor so that mapper and reducer have a common place for things like this constant
    DeprecationMap = {
        'seal.seqal.log.level': 'bl.seqal.log.level',
        'seal.seqal.discard_duplicates': 'bl.seqal.discard_duplicates',
    }

    def __init__(self, ctx):
        """Configure logging/duplicate handling and build the output sink."""
        super(reducer, self).__init__(ctx)
        jc = ctx.getJobConf()
        logger = logging.getLogger("seqal")
        jobconf = deprecation_utils.convert_job_conf(jc, self.DeprecationMap, logger)
        jc_configure(self, jobconf, 'seal.seqal.log.level', 'log_level', 'INFO')
        jc_configure_bool(self, jobconf, 'seal.seqal.discard_duplicates', 'discard_duplicates', False)
        logging.basicConfig(level=self.log_level)
        self.event_monitor = HadoopEventMonitor(self.COUNTER_CLASS, logging.getLogger("reducer"), ctx)
        self.__output_sink = EmitSamLink(ctx, self.event_monitor)

    def __process_unmapped_pairs(self, ctx):
        # Unmapped pairs pass straight through to the output sink.
        while ctx.nextValue():
            value = ctx.getInputValue()
            pair = protobuf_mapping.unserialize_pair(value)
            self.__output_sink.process(pair)

    def reduce(self, ctx):
        """Process all values grouped under one coordinate (or unmapped) key."""
        # create the "workspace"
        self.__pairs = []
        self.__unpaired = []
        # gather input
        key_values = ctx.getInputKey().split(':')
        if key_values[0] == seqal_app.UNMAPPED_STRING:
            # pair of unmapped sequences
            self.__process_unmapped_pairs(ctx)
        else:
            if len(key_values) != 3:
                # BUG FIX: this message previously used len(key), but `key` is
                # only defined below -- it would have raised a NameError.
                raise RuntimeError(
                    "Unexpected key length %d. Expected key format is ref_id:pos:orient" % len(key_values))
            # convert key values and make it a tuple;
            # last value is True if reverse strand
            key = (int(key_values[0]), int(key_values[1]), key_values[2] == 'R')
            have_pairs = False  # keep track of whether we have at least one real pair
            # load mappings
            while ctx.nextValue():
                value = ctx.getInputValue()
                if value == seqal_app.PAIR_STRING:
                    have_pairs = True
                else:
                    pair = protobuf_mapping.unserialize_pair(value)
                    if pair[0] is None or pair[0].is_unmapped():
                        # Sanity check.  pair[0] should never be None or unmapped here.
                        raise ValueError(
                            "Error! Got None or unmapped in first read for key %s. pair: %s" % (key, pair))
                    if pair[1] and pair[1].is_unmapped():
                        # Mate is unmapped: emit it as-is, keep the mapped read
                        # as a lone fragment for duplicate detection.
                        self.__output_sink.process((pair[1], None))
                        self.__unpaired.append((pair[0], None))
                    elif pair[1] is None:
                        self.__unpaired.append(pair)
                    else:
                        # Two mapped reads.  pair[0] should never be unmapped;
                        # that case is handled by __process_unmapped_pairs.
                        self.__pairs.append(pair)
                        have_pairs = True
            self.__process_pairs()
            self.__process_fragments(have_pairs)
        # clean-up the workspace
        self.__pairs = None
        self.__unpaired = None

    def __process_pairs(self):
        # All pairs whose 5'-most coordinate matches the key,
        # and are not duplicate pairs, will be emitted.
        keep_pairs = dict()
        for p in self.__pairs:
            p_key = get_pair_key(p)  # key on which we base the comparison between pairs
            # If we already have a pair with this key, keep the one with the
            # highest score; otherwise record the pair.
            if p_key in keep_pairs:
                if get_map_pair_score(keep_pairs[p_key]) < get_map_pair_score(p):
                    dup_pair = keep_pairs[p_key]
                    keep_pairs[p_key] = p
                else:
                    dup_pair = p
                self.event_monitor.count("duplicate pairs")
                if not self.discard_duplicates:
                    # emit the duplicates if we need to
                    for r in dup_pair:
                        r.set_duplicate(True)
                    self.__output_sink.process(dup_pair)
            else:
                keep_pairs[p_key] = p
        # finally, emit the pairs that we've kept
        self.event_monitor.count("rmdup unique pairs", len(keep_pairs))
        for pair in keep_pairs.itervalues():
            self.__output_sink.process(pair)

    def __process_fragments(self, with_pairs):
        # All fragments that are not the duplicate of another fragment,
        # be it in a pair or alone, will be emitted.
        #
        # All fragments analyzed here were emitted for the same coordinate
        # (the one referenced by the key), so they automatically have a
        # duplicate in any pairs we have received.  Consequently:
        #
        #   with_pairs     => all lone fragments are duplicates to discard.
        #   not with_pairs => duplicates are selected by mapping quality.
        if with_pairs:
            # all fragments are duplicates
            self.event_monitor.count("duplicate fragments", len(self.__unpaired))
            if not self.discard_duplicates:
                for dup in self.__unpaired:  # for each unpaired fragment
                    dup[0].set_duplicate(True)
                    self.__output_sink.process(dup)
        else:
            fragments = dict()
            for m, _ in self.__unpaired:  # for each unpaired fragment
                k = get_mapping_key(m)
                if k in fragments:
                    # Keep the higher-scoring fragment; the other is a duplicate.
                    if get_map_score(fragments[k]) < get_map_score(m):
                        dup = fragments[k]
                        fragments[k] = m
                    else:
                        dup = m
                    self.event_monitor.count("duplicate fragments")
                    if not self.discard_duplicates:
                        dup.set_duplicate(True)
                        self.__output_sink.process((dup, None))
                else:
                    fragments[k] = m
            # now emit the remaining fragments
            self.event_monitor.count("rmdup unique fragments", len(fragments))
            for m in fragments.itervalues():
                self.__output_sink.process((m, None))
class mapper(Mapper):
    """
    Aligns sequences to a reference genome.

    @input-record: C{key} does not matter (standard LineRecordReader);
    C{value} is a tab-separated text line with 5 fields: ID, read_seq,
    read_qual, mate_seq, mate_qual.

    @output-record: protobuf-serialized mapped pairs (map-reduce job) or
    alignment records in SAM format (map-only job).

    @jobconf-param: C{mapred.reduce.tasks} number of Hadoop reduce tasks to launch.
    If the value of this property is set to 0, then the mapper will directly output
    the mappings in SAM format, like BWA.  If set to a value > 0 the mapper will output
    mappings in the protobuf serialized format for the rmdup reducer.

    @jobconf-param: C{seal.seqal.log.level} logging level, specified as a logging
    module literal.

    @jobconf-param: C{mapred.cache.archives} distributed cache entry for the bwa
    index archive.  The entry is of the form HDFS_PATH#LINK_NAME.  The archive
    for a given chromosome must contain (at the top level, i.e., no directories)
    all files generated by 'bwa index' for that chromosome.

    @jobconf-param: C{seal.seqal.alignment.max.isize}: if the inferred isize is
    greater than this value, Smith-Waterman alignment for unmapped reads will be
    skipped.

    @jobconf-param: C{seal.seqal.pairing.batch.size}: how many sequences should be
    processed at a time by the pairing function.  Status will be updated at each
    new batch: therefore, lowering this value can help avoid timeouts.

    @jobconf-param: C{seal.seqal.fastq-subformat} Specifies base quality score
    encoding.  Supported types are: 'fastq-sanger' and 'fastq-illumina'.

    @jobconf-param: C{mapred.create.symlink} must be set to 'yes'.

    @jobconf-param: C{seal.seqal.min_hit_quality} mapping quality threshold below
    which the mapping will be discarded.
    """

    SUPPORTED_FORMATS = "fastq-illumina", "fastq-sanger"
    DEFAULT_FORMAT = "fastq-sanger"
    COUNTER_CLASS = "SEQAL"
    DeprecationMap = {
        "seal.seqal.log.level": "bl.seqal.log.level",
        "seal.seqal.alignment.max.isize": "bl.seqal.alignment.max.isize",
        "seal.seqal.pairing.batch.size": "bl.seqal.pairing.batch.size",
        "seal.seqal.fastq-subformat": "bl.seqal.fastq-subformat",
        "seal.seqal.min_hit_quality": "bl.seqal.min_hit_quality",
        "seal.seqal.remove_unmapped": "bl.seqal.remove_unmapped",
        "seal.seqal.discard_duplicates": "bl.seqal.discard_duplicates",
        "seal.seqal.nthreads": "bl.seqal.nthreads",
        "seal.seqal.trim.qual": "bl.seqal.trim.qual",
    }

    def __get_configuration(self, ctx):
        # Read and validate all job-configuration properties.
        # TODO: refactor settings common to mapper and reducer
        jc = ctx.getJobConf()
        jobconf = deprecation_utils.convert_job_conf(jc, self.DeprecationMap, self.logger)
        jc_configure(self, jobconf, 'seal.seqal.log.level', 'log_level', 'INFO')
        jc_configure(self, jobconf, "seal.seqal.fastq-subformat", "format", self.DEFAULT_FORMAT)
        jc_configure_int(self, jobconf, 'seal.seqal.alignment.max.isize', 'max_isize', 1000)
        jc_configure_int(self, jobconf, 'seal.seqal.alignment.min.isize', 'min_isize', None)
        jc_configure_int(self, jobconf, 'seal.seqal.pairing.batch.size', 'batch_size', 10000)
        jc_configure_int(self, jobconf, 'seal.seqal.min_hit_quality', 'min_hit_quality', 0)
        jc_configure_bool(self, jobconf, 'seal.seqal.remove_unmapped', 'remove_unmapped', False)
        jc_configure_int(self, jobconf, 'seal.seqal.nthreads', 'nthreads', 1)
        jc_configure_int(self, jobconf, 'seal.seqal.trim.qual', 'trim_qual', 0)

        # Translate the textual log level (e.g. "INFO") to its numeric value.
        try:
            self.log_level = getattr(logging, self.log_level)
        except AttributeError:
            raise ValueError("Unsupported log level: %r" % self.log_level)

        if self.format not in self.SUPPORTED_FORMATS:
            raise_pydoop_exception(
                "seal.seqal.fastq-subformat must be one of %r" % (self.SUPPORTED_FORMATS,))
        if self.remove_unmapped:
            raise NotImplementedError("seal.seqal.remove_unmapped is currently unsupported")
        if self.min_hit_quality > 0:
            raise NotImplementedError("seal.seqal.min_hit_quality is currently unsupported")
        if self.trim_qual > 0:
            raise NotImplementedError("seal.seqal.trim_qual is currently unsupported")
        if self.max_isize <= 0:
            raise ValueError("'seal.seqal.alignment.max.isize' must be > 0, if specified [1000]")
        if self.batch_size <= 0:
            raise ValueError("'seal.seqal.pairing.batch.size' must be > 0, if specified [10000]")
        # minimum qual value required for a hit to be kept.  By default outputs
        # all the hits BWA returns.
        if self.min_hit_quality < 0:
            raise ValueError("'seal.seqal.min_hit_quality' must be >= 0, if specified [0]")
        # number of concurrent threads for main alignment operation
        if self.nthreads <= 0:
            raise ValueError("'seal.seqal.nthreads' must be > 0, if specified [1]")
        # trim quality parameter used by BWA from read trimming.  Equivalent to
        # the -q parameter for bwa align
        if self.trim_qual < 0:
            raise ValueError("'seal.seqal.trim.qual' must be >= 0, if specified [0]")

        # Map-only job unless a positive number of reduce tasks is configured.
        if jc.hasKey('mapred.reduce.tasks') and jc.getInt('mapred.reduce.tasks') > 0:
            self.__map_only = False
        else:
            self.__map_only = True

    def get_reference_root_from_archive(self, ref_dir):
        """
        Given a directory containing an indexed reference, such that all its
        files have a common name (except the extension), this method finds the
        path to the reference including the common name.

        e.g.
          my_reference/hg_18.bwt
          my_reference/hg_18.rsax
          my_reference/hg_18.sax      => "my_reference/hg_18"
          my_reference/hg_18.pac
          my_reference/.irrelevant_file
        """
        file_list = os.listdir(ref_dir)
        if self.logger.isEnabledFor(logging.DEBUG):
            self.logger.debug("file_list extracted from reference archive: %s", file_list)
        # Keep only non-hidden files whose extension is a known BWA index extension.
        filtered_file_list = [
            p for p in file_list
            if not p.startswith('.') and os.path.splitext(p)[1].lstrip('.') in _BWA_INDEX_EXT
        ]
        prefix = os.path.commonprefix(filtered_file_list).rstrip('.')
        if not prefix:
            # BUG FIX: the conditional expression must be parenthesized.
            # Previously `"..." % lst if cond else "...".format(...)` dropped
            # the explanatory prefix whenever there were >= 15 files.
            shown = (filtered_file_list if len(filtered_file_list) < 15
                     else "{}, ...".format(', '.join(filtered_file_list[0:15])))
            raise RuntimeError("Could not determine common prefix from list of files (%s)" % (shown,))
        full_prefix = os.path.join(ref_dir, prefix)
        return full_prefix

    def __init__(self, ctx):
        """Initialize the RAPI-based mapper: aligner options, reference, hit chain."""
        super(mapper, self).__init__(ctx)
        self.logger = logging.getLogger("seqal")
        self.__get_configuration(ctx)
        logging.basicConfig(level=self.log_level)
        self.event_monitor = HadoopEventMonitor(self.COUNTER_CLASS, logging.getLogger("mapper"), ctx)

        pe = True  # single-end sequencen alignment not yet supported by Seqal
        self.hi_rapi = HiRapiAligner('rapi_bwa', paired=pe)
        # opts
        self.hi_rapi.opts.n_threads = self.nthreads
        self.hi_rapi.opts.isize_max = self.max_isize
        if self.min_isize is not None:
            self.hi_rapi.opts.isize_min = self.min_isize
        self.hi_rapi.qoffset = (self.hi_rapi.Qenc_Illumina
                                if self.format == "fastq-illumina"
                                else self.hi_rapi.Qenc_Sanger)
        # end opts
        self.logger.info("Using the %s aligner plugin, aligner version %s, plugin version %s",
                         self.hi_rapi.aligner_name, self.hi_rapi.aligner_version,
                         self.hi_rapi.plugin_version)
        self.logger.info("Working in %s mode", 'paired-end' if pe else 'single-end')

        # allocate space for reads
        self.logger.debug("Reserving batch space for %s reads", self.batch_size)
        self.hi_rapi.reserve_space(self.batch_size)

        # load reference
        reference_root = self.get_reference_root_from_archive(utils.get_ref_archive(ctx.getJobConf()))
        self.logger.info("Full reference path (prefix): %s", reference_root)
        with self.event_monitor.time_block("Loading reference %s" % reference_root):
            self.hi_rapi.load_ref(reference_root)

        ######## assemble hit processor chain
        chain = RapiFilterLink(self.event_monitor)
        chain.remove_unmapped = self.remove_unmapped
        chain.min_hit_quality = self.min_hit_quality
        if self.__map_only:
            chain.set_next(RapiEmitSamLink(ctx, self.event_monitor, self.hi_rapi))
        else:
            raise NotImplementedError("Only mapping mode is supported at the moment")
        self.hit_visitor_chain = chain

    def _visit_hits(self):
        # Feed each aligned fragment through the hit-processing chain.
        for read_tpl in self.hi_rapi.ifragments():
            self.hit_visitor_chain.process(read_tpl)

    def map(self, ctx):
        """Accumulate reads in the aligner batch; align and emit when full."""
        v = ctx.value
        f_id, r1, q1, r2, q2 = v.split("\t")
        self.hi_rapi.load_pair(f_id, r1, q1, r2, q2)
        if self.hi_rapi.batch_size >= self.batch_size:
            self.hi_rapi.align_batch()
            self._visit_hits()
            self.hi_rapi.clear_batch()

    def close(self):
        """Align any reads left in the batch, then free aligner resources."""
        if self.hi_rapi.batch_size > 0:
            self.hi_rapi.align_batch()
            self._visit_hits()
            self.hi_rapi.clear_batch()
        self.hi_rapi.release_resources()