예제 #1
0
 def __init__(self, args):
     """Copy the run options from ``args``, create the job queue, then
     register the variant callers and prepare the output directory.

     ``args`` must provide ``infile``, ``var_type``, ``ref``, ``min_AF``
     and ``skip_on``; it is also handed on to ``_regist_caller``.
     """
     # Reference FASTA and its index (the same path with '.fai' appended).
     reference = args.ref
     self.ref = reference
     self.refidx = reference + '.fai'

     # Sample list file and calling options.
     self.sample_file = args.infile
     self.var_type = args.var_type
     self.min_af = args.min_AF
     self.skip_on = args.skip_on

     # Job bookkeeping.
     self.hold_jid = {}
     self.q = GridEngineQueue()

     # Run last, once every option attribute above has been assigned.
     self._regist_caller(args)
     self._prepare_dir()
예제 #2
0
class Somatic:
    """Drive a set of somatic variant callers over clone/tissue BAM pairs.

    For every (clone, tissue) pair listed in the sample file, each
    registered caller is run and a consensus-call ("con_call") job is
    submitted through ``self.q``, held on the job ids the callers returned.

    ``args`` is expected to provide ``infile``, ``var_type`` ("snv" or
    "indel"), ``ref``, ``min_AF``, ``skip_on`` and ``chunk_on``; it is also
    passed on unchanged to the caller constructors.
    """

    # Region size (in bases) used when callers are run per genomic chunk.
    CHUNK_SIZE = 25000000

    def __init__(self, args):
        """Store the run options, create the queue and register callers."""
        self.sample_file = args.infile
        self.var_type = args.var_type
        self.ref = args.ref
        self.refidx = args.ref + '.fai'
        self.min_af = args.min_AF
        self.skip_on = args.skip_on

        self.q = GridEngineQueue()
        self.hold_jid = {}

        self._regist_caller(args)
        self._prepare_dir()

    @property
    def worker_name(self):
        """Lower-cased first CamelCase word of the class name."""
        # Insert a space before every run of capitals that is not at the
        # start of the name, then keep only the first word.
        spaced = re.sub("(?!^)([A-Z]+)", r" \1", type(self).__name__)
        return spaced.lower().split()[0]

    @property
    def sample_name(self):
        """``<clone>_-_<tissue>`` built from the current pair's base names.

        Reads ``self.clone`` and ``self.tissue``, which ``run()`` assigns
        for each pair before calling ``_run()``.
        """
        clname = os.path.splitext(os.path.basename(self.clone))[0]
        tiname = os.path.splitext(os.path.basename(self.tissue))[0]
        return '{}_-_{}'.format(clname, tiname)

    @property
    def bin_path(self):
        """Directory that contains this source file (symlinks resolved)."""
        return os.path.dirname(os.path.realpath(__file__))

    @property
    def script_dir(self):
        """Directory of the job scripts for this worker."""
        return '{}/job_scripts/{}'.format(self.bin_path, self.worker_name)

    @property
    def qerr_dir(self):
        """Relative directory passed to the queue as the ``-e`` target."""
        return 'q.err/{}.all'.format(self.sample_name)

    @property
    def qout_dir(self):
        """Relative directory passed to the queue as the ``-o`` target."""
        return 'q.out/{}.all'.format(self.sample_name)

    @property
    def af_dir(self):
        """Directory name ``<worker>.AF`` handed to the con_call script."""
        return self.worker_name + '.AF'

    @property
    def out_dir(self):
        """Directory name ``<worker>.out`` holding the consensus calls."""
        return self.worker_name + '.out'

    @property
    def sample_list(self):
        """Set of ``(clone, tissue)`` pairs read from the sample file.

        Only the first two whitespace-separated columns of each line are
        used. Blank lines are skipped. The file is re-read on every access.

        Raises:
            ValueError: if a non-blank line has fewer than two columns.
        """
        pairs = set()
        with open(self.sample_file) as f:
            for line in f:
                fields = line.split()
                if not fields:
                    # Tolerate empty lines (e.g. a trailing blank line).
                    continue
                if len(fields) < 2:
                    msg = 'Sample list file should have at least 2 columns.'
                    raise ValueError(msg)
                pairs.add((fields[0], fields[1]))
        return pairs

    @property
    def concall_file(self):
        """Path of the consensus-call output for the current pair.

        The name encodes the variant type, the number of registered
        callers and ``min_af`` with every '0.' substring removed.
        """
        return '{}/{}.{}_call_n{}_{}AFcutoff.txt'.format(
            self.out_dir, self.sample_name, self.var_type,
            len(self.caller), str(self.min_af).replace('0.', ''))

    @property
    def concall_file_ok(self):
        """Truthy when skipping is enabled and ``checksum_match`` accepts
        the consensus-call file."""
        return self.skip_on and checksum_match(self.concall_file)

    @staticmethod
    def _check_ref(fasta):
        """Ensure the FASTA, its ``.fai`` index and its ``.dict`` exist.

        The dictionary is looked up next to the FASTA with the FASTA's
        last extension replaced by ``.dict``.

        Raises:
            FileNotFoundError: if any of the three files is missing.
        """
        if not os.path.isfile(fasta):
            msg = "Can't find the reference file '{}'".format(fasta)
            raise FileNotFoundError(msg)
        if not os.path.isfile(fasta + '.fai'):
            msg = "Can't find index for the fasta file '{}'".format(fasta)
            raise FileNotFoundError(msg)
        refname = os.path.splitext(fasta)[0]
        if not os.path.isfile(refname + '.dict'):
            msg = "Can't find dictionary for the fasta file '{}'".format(fasta)
            raise FileNotFoundError(msg)

    @staticmethod
    def _check_bam(bam):
        """Ensure the BAM file and one of its index files exist.

        With ``<stem>`` being ``bam`` minus its last extension, either
        ``<stem>.bai`` or ``<stem>.bam.bai`` is accepted as the index.

        Raises:
            FileNotFoundError: if the BAM or both index candidates are
                missing.
        """
        if not os.path.isfile(bam):
            msg = "Can't find the bam file '{}'".format(bam)
            raise FileNotFoundError(msg)

        bamname = os.path.splitext(bam)[0]
        candidates = (bamname + '.bai', bamname + '.bam.bai')
        if not any(os.path.isfile(index) for index in candidates):
            msg = "Can't find index for the bam file '{}'".format(bam)
            raise FileNotFoundError(msg)

    def _regist_caller(self, args):
        """Populate ``self.caller`` according to ``self.var_type``.

        "snv" registers SomaticSniper, Strelka, MuTect and VarScan;
        "indel" registers Strelka, Scalpel and VarScan. With
        ``args.chunk_on`` set, MuTect and VarScan receive the chunk region
        file and Scalpel receives the per-chromosome region file.
        """
        name = self.worker_name
        if self.var_type == "snv":
            self.caller = [SomaticSniper(args, name), Strelka(args, name)]
            if args.chunk_on:
                chunk_file = self._chunkfile(self.CHUNK_SIZE)
                self.caller.extend([
                    MuTect(args, name, chunk_file),
                    VarScan(args, name, chunk_file)])
            else:
                self.caller.extend([
                    MuTect(args, name),
                    VarScan(args, name)])

        elif self.var_type == "indel":
            self.caller = [Strelka(args, name)]
            if args.chunk_on:
                chunk_file = self._chunkfile(self.CHUNK_SIZE)
                chrom_file = self._chromfile()
                self.caller.extend([
                    Scalpel(args, name, chrom_file),
                    VarScan(args, name, chunk_file)])
            else:
                self.caller.extend([
                    Scalpel(args, name),
                    VarScan(args, name)])
        # NOTE(review): any other var_type leaves self.caller unset, so a
        # later access (e.g. concall_file) fails with AttributeError.

    def _prepare_dir(self):
        """Create the output directory via ``make_dir``."""
        make_dir(self.out_dir)

    def _check_data(self):
        """Validate the reference and every BAM named in the sample list.

        Raises:
            FileNotFoundError: propagated from ``_check_ref``/``_check_bam``.
        """
        self._check_ref(self.ref)
        for clone, tissue in self.sample_list:
            self._check_bam(clone)
            self._check_bam(tissue)

    def _qopt(self, jprefix, hold_jid=''):
        """Build the queue option string for a job of the current pair.

        Args:
            jprefix: job-name prefix; the job is named
                ``<jprefix>.<sample_name>``.
            hold_jid: job id string to hold on; when it is the empty
                string no ``-hold_jid`` option is added.
        """
        qopt = '-N {}.{} -e {} -o {} -v BIN_PATH={}'.format(
            jprefix, self.sample_name, self.qerr_dir, self.qout_dir,
            self.bin_path)
        if hold_jid == '':
            return qopt
        return qopt + ' -hold_jid {}'.format(hold_jid)

    @staticmethod
    def _size_label(size):
        """Format ``size`` with a k/M/G suffix, e.g. 25000000 -> '25M'.

        Values below 1000 get no suffix. Anything from 1e12 upwards stays
        in G units (e.g. '1000G').
        """
        suffixes = ('', 'k', 'M', 'G')
        value = size
        for suffix in suffixes[:-1]:
            if value < 1e3:
                return '{:.0f}{}'.format(value, suffix)
            value /= 1e3
        return '{:.0f}{}'.format(value, suffixes[-1])

    @staticmethod
    def _parse_fai(lines):
        """Return ``[(chrom, size), ...]`` from index lines.

        Only the first two whitespace-separated columns of each line are
        read; the second is converted with ``int``.
        """
        chrom_sizes = []
        for line in lines:
            chrom, chrom_size = line.split()[:2]
            chrom_sizes.append((chrom, int(chrom_size)))
        return chrom_sizes

    @staticmethod
    def _chunk_regions(chrom_sizes, chunk_size):
        """Yield 1-based inclusive ``chrom:start-end`` regions that tile
        every chromosome in pieces of at most ``chunk_size`` bases."""
        for chrom, chrom_size in chrom_sizes:
            # The stop is chrom_size + 1 so that a start equal to
            # chrom_size is still produced; otherwise the last base is
            # lost whenever chrom_size % chunk_size == 1.
            for start in range(1, chrom_size + 1, chunk_size):
                end = min(start + chunk_size - 1, chrom_size)
                yield '{}:{}-{}'.format(chrom, start, end)

    def _read_chrom_sizes(self):
        """Parse ``self.refidx`` into a list of ``(chrom, size)`` tuples."""
        with open(self.refidx) as index:
            return self._parse_fai(index)

    def _chunkfile(self, chunk_size):
        """Return the path of the chunked-region file, creating it if absent.

        The file lives at
        ``<worker>.call/genomic_regions_<label>_chunk.txt`` and holds one
        ``chrom:start-end`` line per chunk. An existing file is reused
        as is.
        """
        chunk_file = "{}.call/genomic_regions_{}_chunk.txt".format(
            self.worker_name, self._size_label(chunk_size))
        if not os.path.isfile(chunk_file):
            # Read the index before creating the output so that a missing
            # or malformed index cannot leave an empty or partial region
            # file behind, which the isfile() guard would reuse next time.
            regions = list(
                self._chunk_regions(self._read_chrom_sizes(), chunk_size))
            make_dir(os.path.dirname(chunk_file))
            with open(chunk_file, 'w') as out:
                for region in regions:
                    out.write(region + '\n')
        return chunk_file

    def _chromfile(self):
        """Return the path of the per-chromosome region file, creating it
        if absent.

        The file lives at ``<worker>.call/genomic_regions_chrom.txt`` and
        holds one ``chrom:1-size`` line per chromosome, largest first.
        An existing file is reused as is.
        """
        chrom_file = "{}.call/genomic_regions_chrom.txt".format(
            self.worker_name)
        if not os.path.isfile(chrom_file):
            # Index is parsed first for the same reason as in _chunkfile.
            chroms = self._read_chrom_sizes()
            chroms.sort(key=lambda chrom: chrom[1], reverse=True)
            make_dir(os.path.dirname(chrom_file))
            with open(chrom_file, 'w') as out:
                for chrom, end in chroms:
                    out.write('{}:1-{}\n'.format(chrom, end))
        return chrom_file

    def _concall(self, hold_jid):
        """Submit the consensus-call script for the current pair.

        Returns whatever ``self.q.submit`` returns.
        """
        qopt = self._qopt('con_call', hold_jid)
        cmd = '{}/{}_con_call.sh {} {} {}'.format(
            self.script_dir, self.var_type,
            self.min_af, self.af_dir, self.concall_file)
        return self.q.submit(qopt, cmd)

    def _skip_msg(self, jname):
        """Report a skipped job for the current pair."""
        queue_cls = self.q.__class__
        if queue_cls.is_1st_print:
            queue_cls.is_1st_print = False
        else:
            # ANSI "cursor up 2 lines", then return to column 0, before
            # the next message is printed.
            print('\x1b[2A', end='\r')
        skip_msg(jname, '{}.all'.format(self.sample_name))

    def _run_msg(self, jname):
        """Report a submitted job for the current pair."""
        run_msg(jname, '{}.all'.format(self.sample_name))

    def _run(self):
        """Process the current (clone, tissue) pair.

        When ``concall_file_ok`` is truthy nothing is submitted and ''
        is returned. Otherwise every caller is run, the con_call job is
        submitted holding on the non-empty job ids, and the con_call
        submission result is returned.
        """
        hold_jid = ''
        if self.concall_file_ok:
            self._skip_msg('calling')
            self._skip_msg('af_calc')
            self._skip_msg('con_call')
        else:
            make_dir(self.qerr_dir)
            make_dir(self.qout_dir)
            jids = (caller.run(self.clone, self.tissue)
                    for caller in self.caller)
            hold_jid = ','.join(jid for jid in jids if jid != '')
            hold_jid = self._concall(hold_jid)
            self._run_msg('con_call')
        return hold_jid

    def run(self):
        """Validate the inputs, process every sample pair, then report
        the queue's job total through ``end_msg``."""
        self._check_data()
        for clone, tissue in self.sample_list:
            self.clone = clone
            self.tissue = tissue
            self._run()
        end_msg(self.q.j_total)