Пример #1
0
 def __get_conf(self, jc):
   """
   Populate this object's settings from the job configuration ``jc``.

   Runs the log, blastall and "tiget" configuration helpers in that order,
   then reads the formatdb executable path (default ``/usr/bin/formatdb``)
   and the guardian flag (default ``True``) into attributes of ``self``.
   """
   # Logging is configured ahead of every other section.
   self.__get_log_conf(jc)
   self.__get_blastall_conf(jc)
   self.__get_tiget_conf(jc)
   pu.jc_configure(
     self, jc, 'bl.mr.seq.formatdb.exe', 'formatdb_exe', '/usr/bin/formatdb'
   )
   pu.jc_configure_bool(
     self, jc, 'bl.spawner.guardian', 'guardian', True
   )
Пример #2
0
 def _configure(self):
     """
     Read the mapper settings from the job configuration of ``self.ctx``.

     Sets ``log_level`` (default "INFO"), ``timeout`` and ``user`` (default
     "") on this object, and stores the "mapper" logger, set to
     ``log_level``, in ``self.logger``.
     """
     job_conf = self.ctx.getJobConf()
     pu.jc_configure_log_level(
         self, job_conf, "bl.mr.loglevel", "log_level", "INFO"
     )
     self.logger = logging.getLogger("mapper")
     self.logger.setLevel(self.log_level)
     # The task timeout is requested without a default value.
     pu.jc_configure_int(self, job_conf, "mapred.task.timeout", "timeout")
     pu.jc_configure(self, job_conf, "bl.hdfs.user", "user", "")
Пример #3
0
    def __get_configuration(self, ctx):
        """
        Load the seqal settings from the job configuration and validate them.

        The configuration obtained from ``ctx`` is first run through
        ``deprecation_utils.convert_job_conf`` with this class's
        ``DeprecationMap``; the ``seal.seqal.*`` properties are then read
        from the converted configuration into attributes of ``self``.
        ``self.log_level`` ends up holding the integer ``logging`` level.
        Finally ``self.__map_only`` is set to True unless
        ``mapred.reduce.tasks`` is present with a value greater than zero.

        An unsupported fastq sub-format is reported through
        ``raise_pydoop_exception``.

        Raises:
          ValueError: if the log level does not name an integer level
            constant of the ``logging`` module, or if a numeric setting is
            out of its allowed range.
          NotImplementedError: if ``seal.seqal.remove_unmapped``,
            ``seal.seqal.min_hit_quality`` or ``seal.seqal.trim.qual`` is
            enabled; these options are currently unsupported.
        """
        # TODO:  refactor settings common to mapper and reducer
        jc = ctx.getJobConf()

        jobconf = deprecation_utils.convert_job_conf(jc, self.DeprecationMap, self.logger)

        jc_configure(self, jobconf, 'seal.seqal.log.level', 'log_level', 'INFO')
        jc_configure(self, jobconf, "seal.seqal.fastq-subformat", "format", self.DEFAULT_FORMAT)
        jc_configure_int(self, jobconf, 'seal.seqal.alignment.max.isize', 'max_isize', 1000)
        jc_configure_int(self, jobconf, 'seal.seqal.alignment.min.isize', 'min_isize', None)
        jc_configure_int(self, jobconf, 'seal.seqal.pairing.batch.size', 'batch_size', 10000)
        jc_configure_int(self, jobconf, 'seal.seqal.min_hit_quality', 'min_hit_quality', 0)
        jc_configure_bool(self, jobconf, 'seal.seqal.remove_unmapped', 'remove_unmapped', False)
        jc_configure_int(self, jobconf, 'seal.seqal.nthreads', 'nthreads', 1)
        jc_configure_int(self, jobconf, 'seal.seqal.trim.qual', 'trim_qual', 0)

        # A bare getattr() would also accept names such as "info" or
        # "getLogger", which resolve to functions rather than levels; only
        # integer attributes of the logging module are usable as a level.
        level = getattr(logging, self.log_level, None)
        if not isinstance(level, int):
            raise ValueError("Unsupported log level: %r" % self.log_level)
        self.log_level = level

        if self.format not in self.SUPPORTED_FORMATS:
            raise_pydoop_exception(
              "seal.seqal.fastq-subformat must be one of %r" %
              (self.SUPPORTED_FORMATS,)
              )

        # Options that are read but not implemented yet.
        if self.remove_unmapped:
            raise NotImplementedError("seal.seqal.remove_unmapped is currently unsupported")
        if self.min_hit_quality > 0:
            raise NotImplementedError("seal.seqal.min_hit_quality is currently unsupported")
        if self.trim_qual > 0:
            # The message names the property exactly as it is read above.
            raise NotImplementedError("seal.seqal.trim.qual is currently unsupported")

        if self.max_isize <= 0:
            raise ValueError("'seal.seqal.alignment.max.isize' must be > 0, if specified [1000]")

        if self.batch_size <= 0:
            raise ValueError("'seal.seqal.pairing.batch.size' must be > 0, if specified [10000]")

        # minimum qual value required for a hit to be kept.  By default outputs all the
        # hits BWA returns.
        if self.min_hit_quality < 0:
            raise ValueError("'seal.seqal.min_hit_quality' must be >= 0, if specified [0]")

        # number of concurrent threads for main alignment operation
        if self.nthreads <= 0:
            raise ValueError("'seal.seqal.nthreads' must be > 0, if specified [1]")

        # trim quality parameter used by BWA from read trimming.  Equivalent to
        # the -q parameter for bwa align
        if self.trim_qual < 0:
            raise ValueError("'seal.seqal.trim.qual' must be >= 0, if specified [0]")

        # The reducer count is looked up in the original (unconverted) job
        # configuration; without a positive count the job is map-only.
        self.__map_only = not (
            jc.hasKey('mapred.reduce.tasks') and jc.getInt('mapred.reduce.tasks') > 0
        )
Пример #4
0
 def __init__(self, context):
   """
   Initialize the mapper and load the set of excluded entries.

   Fetches the IPCOUNT/EXCLUDED_LINES counter and reads the
   ``ipcount.excludes`` property (default "") into ``self.excludes_fn``.
   When that file name is non-empty, every line of the file that is not
   pure whitespace is stripped and added to ``self.excludes``; otherwise
   ``self.excludes`` is an empty set.  The task status is updated before
   and after.
   """
   super(Mapper, self).__init__(context)
   context.setStatus("Initialization started")
   self.excluded_counter = context.getCounter("IPCOUNT", "EXCLUDED_LINES")
   job_conf = context.getJobConf()
   pu.jc_configure(self, job_conf, "ipcount.excludes", "excludes_fn", "")
   if not self.excludes_fn:
     # No exclusion file configured.
     self.excludes = set()
   else:
     with open(self.excludes_fn) as excludes_file:
       self.excludes = set(
         line.strip() for line in excludes_file if not line.isspace()
       )
   context.setStatus("Initialization done")
Пример #5
0
 def __init__(self, context):
     """
     Initialize the mapper and load the set of excluded entries.

     Fetches the IPCOUNT/EXCLUDED_LINES counter and reads the
     ``ipcount.excludes`` property (default "") into ``self.excludes_fn``.
     When that file name is non-empty, every line of the file that is not
     pure whitespace is stripped and added to ``self.excludes``; otherwise
     ``self.excludes`` is an empty set.  The task status is updated before
     and after.
     """
     super(Mapper, self).__init__(context)
     context.setStatus("Initialization started")
     self.excluded_counter = context.getCounter("IPCOUNT", "EXCLUDED_LINES")
     job_conf = context.getJobConf()
     pu.jc_configure(self, job_conf, "ipcount.excludes", "excludes_fn", "")
     if not self.excludes_fn:
         # No exclusion file configured.
         self.excludes = set()
     else:
         with open(self.excludes_fn) as excludes_file:
             self.excludes = set(
                 line.strip()
                 for line in excludes_file
                 if not line.isspace()
             )
     context.setStatus("Initialization done")
Пример #6
0
	def __init__(self, ctx):
		"""
		Initialize the reducer from the task context ``ctx``.

		The job configuration is converted with
		``deprecation_utils.convert_job_conf`` (using the "seqal" logger), the
		log level (default 'INFO') and the discard-duplicates flag (default
		False) are read from it, logging is configured at that level, and the
		event monitor and output sink are created.
		"""
		super(reducer, self).__init__(ctx)

		raw_conf = ctx.getJobConf()
		deprecation_log = logging.getLogger("seqal")
		jobconf = deprecation_utils.convert_job_conf(
			raw_conf, self.DeprecationMap, deprecation_log
		)

		jc_configure(
			self, jobconf, 'seal.seqal.log.level', 'log_level', 'INFO'
		)
		jc_configure_bool(
			self, jobconf, 'seal.seqal.discard_duplicates', 'discard_duplicates', False
		)

		logging.basicConfig(level=self.log_level)

		# Events are reported through the "reducer" logger.
		self.event_monitor = HadoopEventMonitor(
			self.COUNTER_CLASS, logging.getLogger("reducer"), ctx
		)
		self.__output_sink = EmitSamLink(ctx, self.event_monitor)
Пример #7
0
    def __init__(self, ctx):
        """
        Initialize the reducer from the task context ``ctx``.

        The job configuration is converted with
        ``deprecation_utils.convert_job_conf`` (using the "seqal" logger),
        the log level (default 'INFO') and the discard-duplicates flag
        (default False) are read from it, logging is configured at that
        level, and the event monitor and output sink are created.
        """
        super(reducer, self).__init__(ctx)

        raw_conf = ctx.getJobConf()
        deprecation_log = logging.getLogger("seqal")
        jobconf = deprecation_utils.convert_job_conf(
            raw_conf, self.DeprecationMap, deprecation_log
        )

        jc_configure(
            self, jobconf, 'seal.seqal.log.level', 'log_level', 'INFO'
        )
        jc_configure_bool(
            self, jobconf, 'seal.seqal.discard_duplicates', 'discard_duplicates', False
        )

        logging.basicConfig(level=self.log_level)

        # Events are reported through the "reducer" logger.
        self.event_monitor = HadoopEventMonitor(
            self.COUNTER_CLASS, logging.getLogger("reducer"), ctx
        )
        self.__output_sink = EmitSamLink(ctx, self.event_monitor)
Пример #8
0
 def __get_configuration(self, jc):
     """
     Read the fasta-reader settings from the job configuration ``jc``.

     Sets ``log_level`` (converted to the integer ``logging`` level),
     ``libhdfs_opts``, ``compress_header``, ``compress_seq`` and
     ``compression_level`` on this object.  A non-empty ``bl.libhdfs.opts``
     value is also exported as the ``LIBHDFS_OPTS`` environment variable.

     A log level that does not name an integer level constant of the
     ``logging`` module is reported through ``raise_pydoop_exception``.
     """
     jc_configure(self, jc, 'bl.mr.fasta-reader.log.level', 'log_level',
                  self.DEFAULT_LOG_LEVEL)
     # A bare getattr() would also accept names such as "info" or
     # "getLogger", which resolve to functions rather than levels; only
     # integer attributes of the logging module are usable as a level.
     level = getattr(logging, self.log_level, None)
     if isinstance(level, int):
         self.log_level = level
     else:
         raise_pydoop_exception("Unsupported log level: %r" %
                                self.log_level)
     jc_configure(self, jc, "bl.libhdfs.opts", "libhdfs_opts", "")
     if self.libhdfs_opts:
         os.environ["LIBHDFS_OPTS"] = self.libhdfs_opts
     jc_configure_bool(self, jc, 'bl.mr.fasta-reader.compress.header',
                       'compress_header', False)
     jc_configure_bool(self, jc, 'bl.mr.fasta-reader.compress.seq',
                       'compress_seq', True)
     jc_configure_int(self, jc, 'bl.mr.fasta-reader.compression.level',
                      'compression_level', 6)
Пример #9
0
 def test_jc_configure_default(self):
     """
     Check that the jc_configure* helpers fall back to the given default.

     Each example is requested under a key that is absent from the job
     configuration, passing the example's value as the default; the
     attribute set on the target object must equal that default.
     """
     examples = CONFIGURE_EXAMPLES
     conf_values = dict((name, examples[name][1]) for name in examples.keys())
     jc = pp.get_JobConf_object(conf_values)
     target = Obj()
     for name in examples.keys():
         missing_key = 'not-here-%s' % name
         self.assertFalse(jc.hasKey(missing_key))
         kind = examples[name][0]
         raw = examples[name][1]
         if kind == 'str':
             pu.jc_configure(target, jc, missing_key, name, raw)
             self.assertEqual(getattr(target, name), raw)
         elif kind == 'int':
             pu.jc_configure_int(target, jc, missing_key, name, int(raw))
             self.assertEqual(getattr(target, name), int(raw))
         elif kind == 'bool':
             pu.jc_configure_bool(target, jc, missing_key, name,
                                  raw == 'true')
             self.assertEqual(getattr(target, name), raw == 'true')
         elif kind == 'log_level':
             pu.jc_configure_log_level(target, jc, missing_key, name, raw)
             self.assertEqual(getattr(target, name), getattr(logging, raw))
Пример #10
0
 def test_jc_configure_default(self):
   """
   Check that the jc_configure* helpers fall back to the given default.

   Each example is requested under a key that is absent from the job
   configuration, passing the example's value as the default; the
   attribute set on the target object must equal that default.
   """
   examples = CONFIGURE_EXAMPLES
   conf_values = dict((name, examples[name][1]) for name in examples.keys())
   jc = pp.get_JobConf_object(conf_values)
   target = Obj()
   for name in examples.keys():
     missing_key = 'not-here-%s' % name
     self.assertFalse(jc.hasKey(missing_key))
     kind = examples[name][0]
     raw = examples[name][1]
     if kind == 'str':
       pu.jc_configure(target, jc, missing_key, name, raw)
       self.assertEqual(getattr(target, name), raw)
     elif kind == 'int':
       pu.jc_configure_int(target, jc, missing_key, name, int(raw))
       self.assertEqual(getattr(target, name), int(raw))
     elif kind == 'bool':
       pu.jc_configure_bool(target, jc, missing_key, name, raw == 'true')
       self.assertEqual(getattr(target, name), raw == 'true')
     elif kind == 'log_level':
       pu.jc_configure_log_level(target, jc, missing_key, name, raw)
       self.assertEqual(getattr(target, name), getattr(logging, raw))
Пример #11
0
 def test_jc_configure_plain(self):
   """
   Check that the jc_configure* helpers read values present in the conf.

   Every example is stored in the job configuration under its own name and
   read back into an attribute of the same name; the attribute must equal
   the example's value converted to the example's declared type.
   """
   examples = CONFIGURE_EXAMPLES
   conf_values = dict((name, examples[name][1]) for name in examples.keys())
   jc = pp.get_JobConf_object(conf_values)
   target = Obj()
   for name in examples.keys():
     self.assertTrue(jc.hasKey(name))
     kind = examples[name][0]
     raw = examples[name][1]
     if kind == 'str':
       pu.jc_configure(target, jc, name, name)
       self.assertEqual(getattr(target, name), raw)
     elif kind == 'int':
       pu.jc_configure_int(target, jc, name, name)
       self.assertEqual(getattr(target, name), int(raw))
     elif kind == 'bool':
       pu.jc_configure_bool(target, jc, name, name)
       self.assertEqual(getattr(target, name), raw == 'true')
     elif kind == 'float':
       pu.jc_configure_float(target, jc, name, name)
       self.assertAlmostEqual(getattr(target, name), float(raw))
     elif kind == 'log_level':
       pu.jc_configure_log_level(target, jc, name, name)
       self.assertEqual(getattr(target, name), getattr(logging, raw))
Пример #12
0
 def test_jc_configure_plain(self):
     """
     Check that the jc_configure* helpers read values present in the conf.

     Every example is stored in the job configuration under its own name
     and read back into an attribute of the same name; the attribute must
     equal the example's value converted to the example's declared type.
     """
     examples = CONFIGURE_EXAMPLES
     conf_values = dict((name, examples[name][1]) for name in examples.keys())
     jc = pp.get_JobConf_object(conf_values)
     target = Obj()
     for name in examples.keys():
         self.assertTrue(jc.hasKey(name))
         kind = examples[name][0]
         raw = examples[name][1]
         if kind == 'str':
             pu.jc_configure(target, jc, name, name)
             self.assertEqual(getattr(target, name), raw)
         elif kind == 'int':
             pu.jc_configure_int(target, jc, name, name)
             self.assertEqual(getattr(target, name), int(raw))
         elif kind == 'bool':
             pu.jc_configure_bool(target, jc, name, name)
             self.assertEqual(getattr(target, name), raw == 'true')
         elif kind == 'float':
             pu.jc_configure_float(target, jc, name, name)
             self.assertAlmostEqual(getattr(target, name), float(raw))
         elif kind == 'log_level':
             pu.jc_configure_log_level(target, jc, name, name)
             self.assertEqual(getattr(target, name), getattr(logging, raw))
Пример #13
0
 def __init__(self, context):
   """
   Initialize the writer and open this task's output file for writing.

   Reads the task partition number, the work output directory, the
   key/value separator (default tab) and the HDFS user (default None) from
   the job configuration, then opens ``<outdir>/part-<NNNNN>`` through
   ``hdfs.open`` as that user.
   """
   super(Writer, self).__init__(context)
   self.logger = logging.getLogger("Writer")
   job_conf = context.getJobConf()
   jc_configure_int(self, job_conf, "mapred.task.partition", "part")
   # (property, attribute[, default]) triples for the string settings;
   # the output directory is requested without a default.
   string_settings = (
     ("mapred.work.output.dir", "outdir"),
     ("mapred.textoutputformat.separator", "sep", "\t"),
     ("pydoop.hdfs.user", "hdfs_user", None),
   )
   for setting in string_settings:
     jc_configure(self, job_conf, *setting)
   self.outfn = "%s/part-%05d" % (self.outdir, self.part)
   self.file = hdfs.open(self.outfn, "w", user=self.hdfs_user)
Пример #14
0
 def __init__(self, context):
     """
     Initialize the writer and open this task's output file for writing.

     Reads the task partition number, the work output directory, the
     key/value separator (default tab) and the HDFS user (default None)
     from the job configuration, then opens ``<outdir>/part-<NNNNN>``
     through ``hdfs.open`` as that user.
     """
     super(Writer, self).__init__(context)
     self.logger = logging.getLogger("Writer")
     job_conf = context.getJobConf()
     jc_configure_int(self, job_conf, "mapred.task.partition", "part")
     # (property, attribute[, default]) triples for the string settings;
     # the output directory is requested without a default.
     string_settings = (
         ("mapred.work.output.dir", "outdir"),
         ("mapred.textoutputformat.separator", "sep", "\t"),
         ("pydoop.hdfs.user", "hdfs_user", None),
     )
     for setting in string_settings:
         jc_configure(self, job_conf, *setting)
     self.outfn = "%s/part-%05d" % (self.outdir, self.part)
     self.file = hdfs.open(self.outfn, "w", user=self.hdfs_user)
Пример #15
0
 def __get_blastall_conf(self, jc):
   """
   Read the blastall settings from the job configuration ``jc``.

   Each ``bl.mr.seq.blastall.*`` property is stored as an attribute of this
   object: ``blastall_exe``, ``program``, ``db_name``, ``evalue``,
   ``gap_cost``, ``word_size`` and ``filter``.  ``db_name`` is the only one
   requested without a default value.
   """
   # (property, attribute[, default]) for the plain string settings.
   text_options = (
     ('bl.mr.seq.blastall.exe', 'blastall_exe', '/usr/bin/blastall'),
     ('bl.mr.seq.blastall.program', 'program', 'blastn'),
     ('bl.mr.seq.blastall.db.name', 'db_name'),
   )
   for option in text_options:
     pu.jc_configure(self, jc, *option)
   pu.jc_configure_float(
     self, jc, 'bl.mr.seq.blastall.evalue', 'evalue', 1.0
   )
   pu.jc_configure_int(
     self, jc, 'bl.mr.seq.blastall.gap.cost', 'gap_cost', 1
   )
   pu.jc_configure_int(
     self, jc, 'bl.mr.seq.blastall.word.size', 'word_size', 20
   )
   pu.jc_configure_bool(
     self, jc, 'bl.mr.seq.blastall.filter', 'filter', False
   )
Пример #16
0
 def __get_configuration(self, jc):
     """
     Read the blastall mapper settings from the job configuration ``jc``.

     Sets ``log_level`` (default 'WARNING', converted to the integer
     ``logging`` level), ``blastall_exe``, ``program``, ``db_name``,
     ``evalue``, ``gap_cost``, ``word_size`` and ``filter`` on this object.
     ``db_name`` is the only setting requested without a default value.

     Raises:
       ValueError: if the configured log level does not name an integer
         level constant of the ``logging`` module.
     """
     pu.jc_configure(self, jc, 'bl.mr.seq.blastall.log.level', 'log_level',
                     'WARNING')
     # A bare getattr() would also accept names such as "info" or
     # "getLogger", which resolve to functions rather than levels; only
     # integer attributes of the logging module are usable as a level.
     level = getattr(logging, self.log_level, None)
     if not isinstance(level, int):
         raise ValueError("Unsupported log level: %r" % self.log_level)
     self.log_level = level
     pu.jc_configure(self, jc, 'bl.mr.seq.blastall.exe', 'blastall_exe',
                     '/usr/bin/blastall')
     pu.jc_configure(self, jc, 'bl.mr.seq.blastall.program', 'program',
                     'blastn')
     pu.jc_configure(self, jc, 'bl.mr.seq.blastall.db.name', 'db_name')
     pu.jc_configure_float(self, jc, 'bl.mr.seq.blastall.evalue', 'evalue',
                           1.0)
     pu.jc_configure_int(self, jc, 'bl.mr.seq.blastall.gap.cost',
                         'gap_cost', 1)
     pu.jc_configure_int(self, jc, 'bl.mr.seq.blastall.word.size',
                         'word_size', 20)
     pu.jc_configure_bool(self, jc, 'bl.mr.seq.blastall.filter', 'filter',
                          False)
Пример #17
0
 def __get_log_conf(self, jc):
   """
   Read the log level from the job configuration ``jc``.

   The ``bl.mr.log.level`` property (default 'WARNING') is read into
   ``self.log_level`` and then replaced by the matching integer level of
   the ``logging`` module.

   Raises:
     ValueError: if the configured value does not name an integer level
       constant of the ``logging`` module.
   """
   pu.jc_configure(self, jc, 'bl.mr.log.level', 'log_level', 'WARNING')
   # A bare getattr() would also accept names such as "info" or
   # "getLogger", which resolve to functions rather than levels; only
   # integer attributes of the logging module are usable as a level.
   level = getattr(logging, self.log_level, None)
   if not isinstance(level, int):
     raise ValueError("Unsupported log level: %r" % self.log_level)
   self.log_level = level