def __init__(self, ctx): super(Mapper, self).__init__(ctx) self.ctx = ctx jc = self.ctx.getJobConf() self.__get_configuration(jc) self.hit_counter = self.ctx.getCounter(self.COUNTER_CLASS, "BLAST_HITS") self.logger = logging.getLogger("mapper") self.logger.setLevel(self.log_level) self.input_file = "temp.in" self.output_file = "temp.out" engine_logger = logging.getLogger("blastall") engine_logger.setLevel(self.log_level) self.engine = Engine(exe_file=self.blastall_exe, logger=engine_logger) try: self.db_dir = jc.get("mapred.cache.archives").split(",")[0].split( "#")[1] except IndexError: raise ValueError('bad format for "mapred.cache.archives"') self.opts = { "blastall.program": self.program, "blastall.database": os.path.join(self.db_dir, self.db_name), "blastall.out.tabular": True, "blastall.input.file": self.input_file, "blastall.output.file": self.output_file, "blastall.evalue": self.evalue, "blastall.gap.cost": self.gap_cost, "blastall.word.size": self.word_size, "blastall.filter": self.filter, }
def main(argv): logger = logging.getLogger("main") logger.setLevel(logging.DEBUG) parser = make_parser() opt, args = parser.parse_args(argv) try: input_fn = args[1] output_fn = args[2] except IndexError: parser.print_help() sys.exit(2) if opt.get_db: get_db() if opt.format_db: retcode = format_db() if retcode: logging.warn("formatdb returned %d" % retcode) OPTS["blastall.input.file"] = input_fn OPTS["blastall.output.file"] = output_fn engine = Engine(logger=logger) engine.blastall(opts=OPTS)
def __init__(self, ctx): super(Mapper, self).__init__(ctx) self.ctx = ctx jc = self.ctx.getJobConf() self.__get_conf(jc) self.__get_counters() self.logger = logging.getLogger("mapper") self.logger.setLevel(self.log_level) self.input_file = "temp.in" self.output_file = "temp.out" engine_logger = logging.getLogger("blastall") engine_logger.setLevel(self.log_level) self.engine = Engine(exe_file=self.blastall_exe, logger=engine_logger, create_guardian=self.guardian) try: self.db_dir = jc.get("mapred.cache.archives").split(",")[0].split("#")[1] except IndexError: raise ValueError('bad format for "mapred.cache.archives"') self.opts = { "blastall.program": self.program, "blastall.database": os.path.join(self.db_dir, self.db_name), "blastall.out.tabular": True, "blastall.input.file": self.input_file, "blastall.output.file": self.output_file, "blastall.evalue": self.evalue, "blastall.gap.cost": self.gap_cost, "blastall.word.size": self.word_size, "blastall.filter": self.filter, } c = BlastallLKCalculator(self.formatdb_exe, self.blastall_exe, log_level=self.log_level, engine_opts=self.opts) self.lambda_, kappa = c.calculate() self.lnK = math.log(kappa)
class Mapper(pp.Mapper): """ Maps query sequences to blastall hits. @input-record: C{key} does not matter (LineRecordReader), C{value} = whole sequence as <HEADER>\t<SEQUENCE> @output-record: tabular blastall hit against the specified db. @jobconf-param: C{bl.mr.seq.blastall.log.level} logging level, specified as a logging module literal; defaults to 'WARNING'. @jobconf-param: C{bl.mr.seq.blastall.db.name} The BLAST database name (REQUIRED). A BLAST db is typically obtained by running the C{formatdb} command on one or more fasta files. The archive provided through C{mapred.cache.archives} MUST contain BLAST db files beginning with the db name (DB_NAME.nin, etc.). @jobconf-param: C{bl.mr.seq.blastall.program} The BLAST program to use ('blastn', 'blastp', etc.); defaults to 'blastn'. @jobconf-param: C{mapred.cache.archives} distributed cache entry (HDFS_PATH#LINK_NAME) for an archive containing the pre-formatted db files at the top level, i.e., no directories. @jobconf-param: C{mapred.create.symlink} must be set to 'yes'. """ COUNTER_CLASS = "BLASTALL" def __get_configuration(self, jc): pu.jc_configure(self, jc, 'bl.mr.seq.blastall.log.level', 'log_level', 'WARNING') try: self.log_level = getattr(logging, self.log_level) except AttributeError: raise ValueError("Unsupported log level: %r" % self.log_level) pu.jc_configure(self, jc, 'bl.mr.seq.blastall.exe', 'blastall_exe', '/usr/bin/blastall') pu.jc_configure(self, jc, 'bl.mr.seq.blastall.program', 'program', 'blastn') pu.jc_configure(self, jc, 'bl.mr.seq.blastall.db.name', 'db_name') pu.jc_configure_float(self, jc, 'bl.mr.seq.blastall.evalue', 'evalue', 1.0) pu.jc_configure_int(self, jc, 'bl.mr.seq.blastall.gap.cost', 'gap_cost', 1) pu.jc_configure_int(self, jc, 'bl.mr.seq.blastall.word.size', 'word_size', 20) pu.jc_configure_bool(self, jc, 'bl.mr.seq.blastall.filter', 'filter', False) def __init__(self, ctx): super(Mapper, self).__init__(ctx) self.ctx = ctx jc = self.ctx.getJobConf() self.__get_configuration(jc) self.hit_counter = self.ctx.getCounter(self.COUNTER_CLASS, "BLAST_HITS") self.logger = logging.getLogger("mapper") self.logger.setLevel(self.log_level) self.input_file = "temp.in" self.output_file = "temp.out" engine_logger = logging.getLogger("blastall") engine_logger.setLevel(self.log_level) self.engine = Engine(exe_file=self.blastall_exe, logger=engine_logger) try: self.db_dir = jc.get("mapred.cache.archives").split(",")[0].split( "#")[1] except IndexError: raise ValueError('bad format for "mapred.cache.archives"') self.opts = { "blastall.program": self.program, "blastall.database": os.path.join(self.db_dir, self.db_name), "blastall.out.tabular": True, "blastall.input.file": self.input_file, "blastall.output.file": self.output_file, "blastall.evalue": self.evalue, "blastall.gap.cost": self.gap_cost, "blastall.word.size": self.word_size, "blastall.filter": self.filter, } def map(self, ctx): header, seq = ctx.getInputValue().rstrip().split("\t", 1) # TODO: use stdin/stdout instead self.__write_input(header, seq) self.engine.blastall(opts=self.opts) for result in self.__read_output(): ctx.incrementCounter(self.hit_counter, 1) k, v = result.split("\t", 1) ctx.emit(k, v) def __write_input(self, header, seq): f = open(self.input_file, "w") f.write(">%s\n%s\n" % (header, seq)) f.close() def __read_output(self): f = open(self.output_file) for line in f: yield line.rstrip() f.close()
class Mapper(pp.Mapper): """ Maps query sequences to blastall hits, discarding those that do not meet the following criteria: query start <= n % identity >= m The output key is al_type.UNAMBIGUOUS, al_type.REPEAT or al_type.NO_HIT. If there are no hits after filtering, the value is the sequence tag; otherwise, one k/v pair is emitted for each hit, where values are the tabular blast hits (the sequence tag is the first field). @input-record: C{key} does not matter (LineRecordReader), C{value} = whole sequence as output by fasta2tab (<HEADER>\t<SEQUENCE>) @output-record: tabular blastall hit against the specified db (first field is removed from the hit and emitted as key, so that the tab-separated k/v pair is the original tab-separated output). @jobconf-param: C{bl.mr.log.level} logging level, specified as a logging module literal; defaults to 'WARNING'. @jobconf-param: C{bl.mr.seq.blastall.db.name} The BLAST database name (REQUIRED). A BLAST db is typically obtained by running the C{formatdb} command on one or more fasta files. The archive provided through C{mapred.cache.archives} MUST contain BLAST db files beginning with the db name (DB_NAME.nin, etc.). @jobconf-param: C{bl.mr.seq.formatdb.exe} Full path to the formatdb executable. @jobconf-param: C{bl.mr.seq.blastall.exe} Full path to the blastall executable. @jobconf-param: C{bl.mr.seq.blastall.program} The BLAST program to use ('blastn', 'blastp', etc.); defaults to 'blastn'. @jobconf-param: C{bl.mr.seq.blastall.evalue} upper threshold for the expectation value. @jobconf-param: C{bl.mr.seq.blastall.gap.cost} gap opening cost. @jobconf-param: C{bl.mr.seq.blastall.word.size} length of best perfect match. @jobconf-param: C{bl.mr.seq.blastall.filter} filter options for DUST or SEG. @jobconf-param: C{mapred.cache.archives} distributed cache entry (HDFS_PATH#LINK_NAME) for an archive containing the pre-formatted db files at the top level, i.e., no directories. @jobconf_param: C{bl.mr.seq.tiget.max.hits} for each query seq, use only the first N blast hits. @jobconf_param: C{bl.mr.seq.tiget.max.start}: discard blast hits with sequence start higher than this value. @jobconf-param: C{bl.mr.seq.tiget.min.identity.percent} discard blast hits with identity lower than this value @jobconf-param: C{mapred.create.symlink} must be set to 'yes'. """ COUNTER_CLASS = "BLASTALL" def __get_log_conf(self, jc): pu.jc_configure(self, jc, 'bl.mr.log.level', 'log_level', 'WARNING') try: self.log_level = getattr(logging, self.log_level) except AttributeError: raise ValueError("Unsupported log level: %r" % self.log_level) def __get_blastall_conf(self, jc): pu.jc_configure(self, jc, 'bl.mr.seq.blastall.exe', 'blastall_exe', '/usr/bin/blastall') pu.jc_configure(self, jc, 'bl.mr.seq.blastall.program', 'program', 'blastn') pu.jc_configure(self, jc, 'bl.mr.seq.blastall.db.name', 'db_name') pu.jc_configure_float(self, jc, 'bl.mr.seq.blastall.evalue', 'evalue', 1.0) pu.jc_configure_int(self, jc, 'bl.mr.seq.blastall.gap.cost', 'gap_cost', 1) pu.jc_configure_int(self, jc, 'bl.mr.seq.blastall.word.size', 'word_size', 20) pu.jc_configure_bool(self, jc, 'bl.mr.seq.blastall.filter', 'filter', False) def __get_tiget_conf(self, jc): pu.jc_configure_int(self, jc, 'bl.mr.seq.tiget.max.hits', 'max_hits', 10) pu.jc_configure_int(self, jc, 'bl.mr.seq.tiget.max.start', 'max_start', 4) pu.jc_configure_float(self, jc, 'bl.mr.seq.tiget.min.identity.percent', 'min_identity', 95.0) pu.jc_configure_float(self, jc, 'bl.mr.seq.tiget.min.al2seq.percent', 'min_al2seq', 15.0) self.min_al2seq /= 100 pu.jc_configure_float(self, jc, 'bl.mr.seq.tiget.min.score.diff', 'min_score_diff', 20.0) def __get_conf(self, jc): self.__get_log_conf(jc) # log always comes first self.__get_blastall_conf(jc) self.__get_tiget_conf(jc) pu.jc_configure(self, jc, 'bl.mr.seq.formatdb.exe', 'formatdb_exe', '/usr/bin/formatdb') pu.jc_configure_bool(self, jc, 'bl.spawner.guardian', 'guardian', True) def __get_counters(self): self.hit_counter = self.ctx.getCounter(self.COUNTER_CLASS, "TOTAL_HITS") self.hom_rej_hit_counter = self.ctx.getCounter(self.COUNTER_CLASS, "IDENTITY_REJECTED_HITS") self.start_rej_hit_counter = self.ctx.getCounter(self.COUNTER_CLASS, "START_REJECTED_HITS") def __init__(self, ctx): super(Mapper, self).__init__(ctx) self.ctx = ctx jc = self.ctx.getJobConf() self.__get_conf(jc) self.__get_counters() self.logger = logging.getLogger("mapper") self.logger.setLevel(self.log_level) self.input_file = "temp.in" self.output_file = "temp.out" engine_logger = logging.getLogger("blastall") engine_logger.setLevel(self.log_level) self.engine = Engine(exe_file=self.blastall_exe, logger=engine_logger, create_guardian=self.guardian) try: self.db_dir = jc.get("mapred.cache.archives").split(",")[0].split("#")[1] except IndexError: raise ValueError('bad format for "mapred.cache.archives"') self.opts = { "blastall.program": self.program, "blastall.database": os.path.join(self.db_dir, self.db_name), "blastall.out.tabular": True, "blastall.input.file": self.input_file, "blastall.output.file": self.output_file, "blastall.evalue": self.evalue, "blastall.gap.cost": self.gap_cost, "blastall.word.size": self.word_size, "blastall.filter": self.filter, } c = BlastallLKCalculator(self.formatdb_exe, self.blastall_exe, log_level=self.log_level, engine_opts=self.opts) self.lambda_, kappa = c.calculate() self.lnK = math.log(kappa) def map(self, ctx): header, query_seq = ctx.getInputValue().rstrip().split("\t", 1) # TODO: use stdin/stdout instead self.__write_input(header, query_seq) self.engine.blastall(opts=self.opts) results = list(self.__filter_results(self.__read_output())) if not results: ctx.emit(str(al_type.NO_HIT), header) else: repeat = is_repeat(len(query_seq), results, self.min_al2seq, self.min_score_diff) key = str(al_type.REPEAT) if repeat else str(al_type.UNAMBIGUOUS) for r in results: ctx.emit(key, "\t".join(r)) def __filter_results(self, results_stream): for i, r in enumerate(results_stream): if i > self.max_hits: break self.ctx.incrementCounter(self.hit_counter, 1) identity = float(r[2]) # percentage query_start, query_end = map(int, r[6:8]) if query_start > self.max_start: self.ctx.incrementCounter(self.start_rej_hit_counter, 1) continue if identity < self.min_identity: self.ctx.incrementCounter(self.hom_rej_hit_counter, 1) continue r[-1] = str(self.__bit2raw(float(r[-1]))) yield r def __write_input(self, header, query_seq): f = open(self.input_file, "w") f.write(">%s\n%s\n" % (header, query_seq)) f.close() def __read_output(self): f = open(self.output_file) for line in f: yield line.rstrip().split() f.close() def __bit2raw(self, bit_score): return (LN2*bit_score+self.lnK)/self.lambda_