def __init__(self, args):
    """Populate the instance from ``args`` and prepare it for a run.

    ``args`` must expose the attributes ``infile``, ``var_type``, ``ref``,
    ``min_AF`` and ``skip_on``; it is also handed on unchanged to
    ``self._regist_caller``.
    """
    # Reference FASTA and the path of its index (the same path + '.fai').
    reference = args.ref
    self.ref = reference
    self.refidx = reference + '.fai'

    # Remaining options are copied across one-to-one.
    self.sample_file = args.infile
    self.var_type = args.var_type
    self.min_af = args.min_AF
    self.skip_on = args.skip_on

    # Job queue handle and an (initially empty) job-id mapping.
    self.q = GridEngineQueue()
    self.hold_jid = {}

    # Both steps rely on the attributes set above, so they must come last.
    self._regist_caller(args)
    self._prepare_dir()
class Somatic:
    """Run the registered variant callers and a consensus call per sample pair.

    The sample list file names one (clone, tissue) pair of BAM files per line.
    For every pair each registered caller is run, then a consensus-call job is
    submitted to the queue and held on the job ids the callers returned.
    """

    # Region size (bp) used when the genome is split into chunks.
    _CHUNK_SIZE = 25000000

    def __init__(self, args):
        """Store the run options found on ``args`` and set up the callers.

        ``args`` must expose ``infile``, ``var_type``, ``ref``, ``min_AF`` and
        ``skip_on``; ``_regist_caller`` additionally reads ``chunk_on``.
        """
        self.sample_file = args.infile
        self.var_type = args.var_type
        self.ref = args.ref
        self.refidx = args.ref + '.fai'
        self.min_af = args.min_AF
        self.skip_on = args.skip_on
        self.q = GridEngineQueue()
        self.hold_jid = {}
        self._regist_caller(args)
        self._prepare_dir()

    @property
    def worker_name(self):
        """First word of the class name, lower-cased (``Somatic`` -> ``somatic``)."""
        # Put a space before every run of capitals that is not at the start,
        # then keep only the first resulting word.
        spaced = re.sub(r"(?!^)([A-Z]+)", r" \1", type(self).__name__)
        return spaced.lower().split()[0]

    @property
    def sample_name(self):
        """``<clone>_-_<tissue>`` built from the two file names sans extension."""
        clname = os.path.splitext(os.path.basename(self.clone))[0]
        tiname = os.path.splitext(os.path.basename(self.tissue))[0]
        return '{}_-_{}'.format(clname, tiname)

    @property
    def bin_path(self):
        """Directory that contains this source file (symlinks resolved)."""
        return os.path.dirname(os.path.realpath(__file__))

    @property
    def script_dir(self):
        """Directory holding the job scripts for this worker."""
        return '{}/job_scripts/{}'.format(self.bin_path, self.worker_name)

    @property
    def qerr_dir(self):
        """Path passed to the queue as the stderr location for this pair."""
        return 'q.err/{}.all'.format(self.sample_name)

    @property
    def qout_dir(self):
        """Path passed to the queue as the stdout location for this pair."""
        return 'q.out/{}.all'.format(self.sample_name)

    @property
    def af_dir(self):
        """Name of the ``<worker>.AF`` directory."""
        return self.worker_name + '.AF'

    @property
    def out_dir(self):
        """Name of the ``<worker>.out`` directory."""
        return self.worker_name + '.out'

    @property
    def sample_list(self):
        """Set of ``(clone, tissue)`` pairs read from the sample list file.

        Only the first two whitespace-separated columns of each line are
        used. Blank lines are ignored.

        Raises:
            ValueError: a non-blank line has fewer than two columns.
        """
        pairs = set()
        with open(self.sample_file) as sample_fh:
            for line in sample_fh:
                fields = line.split()
                if not fields:
                    continue
                if len(fields) < 2:
                    msg = 'Sample list file should have at least 2 columns.'
                    raise ValueError(msg)
                pairs.add((fields[0], fields[1]))
        return pairs

    @property
    def concall_file(self):
        """Path of the consensus-call output file for the current pair."""
        return '{}/{}.{}_call_n{}_{}AFcutoff.txt'.format(
            self.out_dir, self.sample_name, self.var_type,
            len(self.caller), str(self.min_af).replace('0.', ''))

    @property
    def concall_file_ok(self):
        """Truthy when skipping is enabled and ``checksum_match`` accepts the file."""
        return self.skip_on and checksum_match(self.concall_file)

    @staticmethod
    def _check_ref(fasta):
        """Raise ``FileNotFoundError`` unless ``fasta``, its ``.fai`` and ``.dict`` exist."""
        if not os.path.isfile(fasta):
            msg = "Can't find the reference file '{}'".format(fasta)
            raise FileNotFoundError(msg)
        if not os.path.isfile(fasta + '.fai'):
            msg = "Can't find index for the fasta file '{}'".format(fasta)
            raise FileNotFoundError(msg)
        # The dictionary is looked up next to the FASTA with its extension
        # replaced by '.dict'.
        refname = os.path.splitext(fasta)[0]
        if not os.path.isfile(refname + '.dict'):
            msg = "Can't find dictionary for the fasta file '{}'".format(fasta)
            raise FileNotFoundError(msg)

    @staticmethod
    def _check_bam(bam):
        """Raise ``FileNotFoundError`` unless ``bam`` and one of its indexes exist.

        The index may be ``<stem>.bai`` or ``<stem>.bam.bai``, where ``<stem>``
        is ``bam`` without its extension.
        """
        if not os.path.isfile(bam):
            msg = "Can't find the bam file '{}'".format(bam)
            raise FileNotFoundError(msg)
        bamname = os.path.splitext(bam)[0]
        has_index = (os.path.isfile(bamname + '.bai')
                     or os.path.isfile(bamname + '.bam.bai'))
        if not has_index:
            msg = "Can't find index for the bam file '{}'".format(bam)
            raise FileNotFoundError(msg)

    def _regist_caller(self, args):
        """Build ``self.caller`` for the configured variant type.

        With ``args.chunk_on`` set, region files are generated and handed to
        the callers that accept one. For a ``var_type`` other than ``"snv"``
        or ``"indel"`` nothing is registered and ``self.caller`` is left unset.
        """
        name = self.worker_name
        if self.var_type == "snv":
            self.caller = [SomaticSniper(args, name), Strelka(args, name)]
            if args.chunk_on:
                chunk_file = self._chunkfile(self._CHUNK_SIZE)
                self.caller.extend([
                    MuTect(args, name, chunk_file),
                    VarScan(args, name, chunk_file)])
            else:
                self.caller.extend([MuTect(args, name), VarScan(args, name)])
        elif self.var_type == "indel":
            self.caller = [Strelka(args, name)]
            if args.chunk_on:
                chunk_file = self._chunkfile(self._CHUNK_SIZE)
                chrom_file = self._chromfile()
                self.caller.extend([
                    Scalpel(args, name, chrom_file),
                    VarScan(args, name, chunk_file)])
            else:
                self.caller.extend([Scalpel(args, name), VarScan(args, name)])

    def _prepare_dir(self):
        """Create the output directory."""
        make_dir(self.out_dir)

    def _check_data(self):
        """Validate the reference and every BAM named in the sample list."""
        self._check_ref(self.ref)
        for clone, tissue in self.sample_list:
            self._check_bam(clone)
            self._check_bam(tissue)

    def _qopt(self, jprefix, hold_jid=''):
        """Return the queue option string for a job of the current pair.

        ``-hold_jid`` is appended only when ``hold_jid`` is not the empty string.
        """
        qopt = '-N {}.{} -e {} -o {} -v BIN_PATH={}'.format(
            jprefix, self.sample_name, self.qerr_dir, self.qout_dir,
            self.bin_path)
        if hold_jid == '':
            return qopt
        return qopt + ' -hold_jid {}'.format(hold_jid)

    @staticmethod
    def _human_size(n):
        """Format ``n`` with a k/M/G suffix, e.g. ``25000000`` -> ``'25M'``.

        Values below 1000 get no suffix. Values of 1e12 and above raise
        ``IndexError`` because no suffix is defined for them.
        """
        suffixes = " kMG"
        idx = 0
        while n >= 1e3:
            n /= 1e3
            idx += 1
        return "{:.0f}{}".format(n, suffixes[idx]).strip()

    @staticmethod
    def _chunk_regions(chrom, chrom_size, chunk_size):
        """Yield 1-based inclusive ``chrom:start-end`` regions covering ``chrom``.

        Every base from 1 to ``chrom_size`` falls in exactly one region; the
        last region is shortened to end at ``chrom_size``.
        """
        # The stop value is chrom_size + 1 so that a chunk starting exactly on
        # the last base is still emitted.
        for start in range(1, chrom_size + 1, chunk_size):
            end = min(start + chunk_size - 1, chrom_size)
            yield '{}:{}-{}'.format(chrom, start, end)

    def _contigs(self):
        """Return ``[(name, length), ...]`` from the first two columns of the index.

        Blank lines in ``self.refidx`` are ignored.
        """
        contigs = []
        with open(self.refidx) as fai:
            for line in fai:
                if not line.strip():
                    continue
                chrom, chrom_size = line.split()[:2]
                contigs.append((chrom, int(chrom_size)))
        return contigs

    def _chunkfile(self, chunk_size):
        """Return the path of the chunked region file, creating it if missing.

        An existing file is reused as is.
        """
        chunk_file = "{}.call/genomic_regions_{}_chunk.txt".format(
            self.worker_name, self._human_size(chunk_size))
        if not os.path.isfile(chunk_file):
            # Read the index before touching the output so that an unreadable
            # index cannot leave an empty region file behind for later runs.
            contigs = self._contigs()
            make_dir(os.path.dirname(chunk_file))
            with open(chunk_file, 'w') as out:
                for chrom, chrom_size in contigs:
                    for region in self._chunk_regions(
                            chrom, chrom_size, chunk_size):
                        out.write(region + '\n')
        return chunk_file

    def _chromfile(self):
        """Return the path of the per-contig region file, creating it if missing.

        Contigs are written as ``name:1-length``, longest first. An existing
        file is reused as is.
        """
        chrom_file = "{}.call/genomic_regions_chrom.txt".format(
            self.worker_name)
        if not os.path.isfile(chrom_file):
            # Same ordering rationale as in _chunkfile: index first, then output.
            chroms = self._contigs()
            chroms.sort(key=lambda contig: contig[1], reverse=True)
            make_dir(os.path.dirname(chrom_file))
            with open(chrom_file, 'w') as out:
                for chrom, end in chroms:
                    out.write('{}:1-{}\n'.format(chrom, end))
        return chrom_file

    def _concall(self, hold_jid):
        """Submit the consensus-call script and return what ``q.submit`` returns."""
        qopt = self._qopt('con_call', hold_jid)
        cmd = '{}/{}_con_call.sh {} {} {}'.format(
            self.script_dir, self.var_type, self.min_af,
            self.af_dir, self.concall_file)
        return self.q.submit(qopt, cmd)

    def _skip_msg(self, jname):
        """Print a skip message for ``jname`` of the current pair."""
        queue_cls = type(self.q)
        if queue_cls.is_1st_print:
            queue_cls.is_1st_print = False
        else:
            # ANSI "cursor up 2 lines", then return to column 0, so the next
            # message is printed over earlier output.
            print('\x1b[2A', end='\r')
        skip_msg(jname, '{}.all'.format(self.sample_name))

    def _run_msg(self, jname):
        """Print a run message for ``jname`` of the current pair."""
        run_msg(jname, '{}.all'.format(self.sample_name))

    def _run(self):
        """Process the current pair.

        Returns the value from ``_concall``, or ``''`` when the pair was skipped.
        """
        hold_jid = ''
        if self.concall_file_ok:
            self._skip_msg('calling')
            self._skip_msg('af_calc')
            self._skip_msg('con_call')
        else:
            make_dir(self.qerr_dir)
            make_dir(self.qout_dir)
            jids = (caller.run(self.clone, self.tissue)
                    for caller in self.caller)
            # Callers that submitted nothing return '' and are left out.
            hold_jid = ','.join(jid for jid in jids if jid != '')
            hold_jid = self._concall(hold_jid)
            self._run_msg('con_call')
        return hold_jid

    def run(self):
        """Validate the inputs, process every sample pair, then print the end message."""
        self._check_data()
        for clone, tissue in self.sample_list:
            self.clone = clone
            self.tissue = tissue
            self._run()
        end_msg(self.q.j_total)