def __init__(self, ctx):
    """Initialize the mapper task from the Hadoop task context.

    Reads the job configuration, configures logging, builds and wires a
    BwaAligner with its hit-processor chain, resolves the reference index
    path from the distributed-cache archive, and records where this input
    split ends.

    :param ctx: Hadoop (Pydoop) task context, providing job conf and split.
    """
    # NOTE(review): super(type(self), self) recurses infinitely if this
    # class is ever subclassed — prefer naming the class explicitly; confirm.
    super(type(self), self).__init__(ctx)
    self.__get_configuration(ctx)
    logging.basicConfig(level=self.log_level)
    # counters/events are reported back to Hadoop through this monitor
    self.event_monitor = HadoopEventMonitor(self.COUNTER_CLASS, logging.getLogger("mapper"), ctx)
    self.aligner = BwaAligner()
    self.aligner.event_monitor = self.event_monitor
    self.aligner.qformat = self.format
    self.aligner.max_isize = self.max_isize
    self.aligner.nthreads = self.nthreads
    self.aligner.trim_qual = self.trim_qual
    self.aligner.mmap_enabled = True
    ######## assemble hit processor chain
    # FilterLink drops hits per the configured filters, then hands the
    # survivors to the next link: plain SAM emission in map-only mode,
    # otherwise duplicate marking before emission.
    chain = FilterLink(self.event_monitor)
    chain.remove_unmapped = self.remove_unmapped
    chain.min_hit_quality = self.min_hit_quality
    if self.__map_only:
        chain.set_next(EmitSamLink(ctx, self.event_monitor))
    else:
        chain.set_next(MarkDuplicatesEmitter(ctx, self.event_monitor))
    self.aligner.hit_visitor = chain
    ######## set the path to the reference index
    self.ref_archive = utils.get_ref_archive(ctx.getJobConf())
    self.aligner.reference = self.get_reference_root(self.ref_archive)
    # part of the code is a workaround for accumulating records, see #331
    isplit = InputSplit(ctx.getInputSplit())
    # absolute offset of the end of this split; used to decide when to stop
    self.split_end = isplit.offset + isplit.length
def run_bwa_py_sampe(refseq_fname, read_fname, mate_fname,
                     log_level=logging.INFO, pairing_batch_size=None,
                     seq_list_len=None, fastq_subfmt="fastq-illumina"):
    """Run a paired-end BWA alignment through the Python BwaAligner.

    Reads mated sequences from *read_fname* and *mate_fname* (parsed by
    Biopython in the *fastq_subfmt* sub-format), aligns them against the
    index at *refseq_fname* in batches of up to *seq_list_len* pairs
    (``None`` means one batch with everything), and returns a list of
    dicts with keys ``name``, ``aux`` (tags) and ``seq`` (5' sequence),
    one per aligned read.

    NOTE(review): *pairing_batch_size* is accepted but never used —
    confirm whether it was meant to feed the aligner.
    """
    logger = logging.getLogger("PY")
    logger.setLevel(log_level)
    logger.info("RUNNING PYTHON VERSION")

    class ResultCollector(object):
        # accumulates both mates of every aligned pair, in order
        def __init__(self):
            self.result = []

        def process(self, pair):
            self.result.append(pair[0])
            self.result.append(pair[1])

    result = ResultCollector()

    # BUG FIX: the input files were opened and never closed; use `with`
    # so the handles are released even if alignment raises.
    with open(read_fname) as read_f, open(mate_fname) as mate_f:
        read_flow = Bio.SeqIO.parse(read_f, fastq_subfmt)
        mate_flow = Bio.SeqIO.parse(mate_f, fastq_subfmt)
        pairs_flow = it.izip(read_flow, mate_flow)

        while 1:
            start = time.time()
            pairs = list(it.islice(pairs_flow, 0, seq_list_len))
            if len(pairs) == 0:
                break
            # Turn the biopython SeqRecords into the simple tuples the
            # aligner expects: (name, seq1, qual1, seq2, qual2).
            # BUG FIX: debug leftovers (`pairs[0:5]`, `tuples[0:5]` and a
            # `print t` dump) limited every batch to its first 5 pairs;
            # convert and load the whole batch.
            tuples = [(r.name, r.seq.tostring(), None, m.seq.tostring(), None)
                      for r, m in pairs]
            logger.info('reading seqs %f sec' % (time.time() - start))

            start = time.time()
            aligner = BwaAligner()
            aligner.reference = refseq_fname
            aligner.hit_visitor = result
            for t in tuples:
                aligner.load_pair_record(t)
            aligner.run_alignment()
            aligner.clear_batch()
            logger.info('alignment %f sec' % (time.time() - start))

    # map bwa mappings to dictionaries
    def bwam_to_hash(bwa_m):
        return dict(
            name=bwa_m.name,
            aux=bwa_m.tags,
            seq=bwa_m.get_seq_5()
        )

    return map(bwam_to_hash, result.result)
def setUp(self):
    """Build the reference index and load the tab-separated read pairs fixture.

    Lines beginning with '#' in pairs.txt are treated as comments and skipped.
    """
    utils.build_ref_index()
    self.aligner = BwaAligner()
    self.aligner.reference = utils.reference
    self.aligner.hit_visitor = type(self).SimpleVisitor()
    with open(utils.get_fixture_path("pairs.txt")) as fixture:
        self.pairs = [
            record.rstrip("\r\n").split("\t")
            for record in fixture
            if not record.startswith("#")  # leave #-lines for comments
        ]
def setUp(self):
    """Point a fresh aligner at the bundled mini reference fixture.

    Also stores one hard-coded read pair as
    (name, read seq, read quals, mate seq, mate quals).
    """
    self.aligner = BwaAligner()
    # fixture lives three directory levels above this test module
    repo_root = os.path.dirname(__file__)
    for _ in range(3):
        repo_root = os.path.join(repo_root, '..')
    fixture_path = os.path.join(
        os.path.abspath(repo_root), 'seal', 'mini_ref_fixture', 'mini_ref.fasta')
    self.aligner.reference = fixture_path
    self.aligner.hit_visitor = MappingsCollector()
    self.aligner.qformat = "fastq-sanger"
    self.pair = (
        "HWI-ST301L:236:C0EJ5ACXX:1:1101:18292:2904",
        "GGGAGGTGTTAGGGACAAGCCTGGAGGCAGCATGCGTCACTCCCATGCAGAGTCCATTGGCCAATGCTGGCTCCGATGGCCACATCTCACTCCAGGGGCAG",
        "?@@B?<=AADFCFH@FB?EFEGAAFGEEGEGHCGEGIGH?B?CGEFHGIIGAEEEEHEAEEEH937;;@3=;>@8;?8;9A:<A#################",
        "AATAGAATGTAATATAATATATGTAAAACACCAGGTGCCTAACCTGGCACAGAGCAGGAGGGCTAAGCATGACATCCAGCACGTGGTCAGTGGAATCCAGT",
        "@@@DFDDDBHDD<EHEHIFEEB<IHIEGHDFEH?B:CBEHICEGCGGIIGFGCFCE@FAFEGAAGHIIHF;A?DBDFB);@@35;?,;@35(:5:ACCC<>"
    )