示例#1
0
    def _log_likelihood(self,
                        phi,
                        alleleConfig,
                        baseline,
                        maxCopyNumber,
                        update_tree=True):
        if update_tree:
            ##################################################
            # some useful info about the tree,
            # used by CNV related computations,
            set_node_height(self.tssb)
            set_path_from_root_to_node(self.tssb)
            map_datum_to_node(self.tssb)
            ##################################################

        # 注意:此处应该不受CN\genotype的限制,但不记录目标的CN和genotype
        # 因为此处的parameter不是准确的phi,所以无法达到最优,但为了能够抽样得到
        # 最佳结构。此处要使CN和genotype自由发挥

        # 在时间上的先后顺序能够明确影响测序数据的位置,只有重叠位置的时候才会
        # 发生
        # 注意此处可以设置默认参数

        if self.fixedC < 0:
            return self.__log_likelihood_RD_BAF(phi, alleleConfig, baseline,
                                                maxCopyNumber)
        elif self.fixedC >= 0:
            return self.__log_likelihood_RD_BAF(phi, alleleConfig, baseline,
                                                maxCopyNumber, self.fixedC)
        else:
            raise Exception("fixedC is abnormal")
示例#2
0
    def _log_likelihood_GivenC(self,
                               phi,
                               C,
                               alleleConfig,
                               baseline,
                               maxCopyNumber,
                               update_tree=False):
        if update_tree:
            ##################################################
            # some useful info about the tree,
            # used by CNV related computations,
            set_node_height(self.tssb)
            set_path_from_root_to_node(self.tssb)
            map_datum_to_node(self.tssb)
            ##################################################

        copyNumbers = None
        if self.tag == "BASELINE":
            copyNumbers = [2]
        elif get_loga(self) > baseline:
            copyNumbers = range(3, maxCopyNumber + 1)
        else:
            copyNumbers = range(0, 2)
            pass

        if C not in copyNumbers:
            return -float("Inf")

        ll, pi = self._getLLStripe(C, phi, baseline, alleleConfig)

        self.copyNumber = C
        self.genotype = pi
        self.phi = phi

        return ll
        pass
示例#3
0
def do_mcmc(stateManager, backupManager, safeToExit, runSucceeded, config,
            state, treeWriter, stripes, stripeNum, tmpDir):
    """Run the MCMC sampling loop and persist trees, state and summaries.

    Iterations run from ``state['last_iteration'] + 1`` up to (but not
    including) ``state['sample_number']``.  Each iteration resamples the
    tree, runs ``metropolis``, adapts ``state['mh_std']`` from the
    acceptance rate, resamples sticks and hyperparameters, records the
    complete-data log-likelihood, and queues a pickled copy of the tree.
    Queued trees, ``mcmc_samples.txt`` lines and the program state are
    flushed on backup/state-write iterations and on the last iteration.

    Args:
        stateManager: object providing ``write_state(state)``.
        backupManager: object providing ``save_backup()`` and
            ``remove_backup()``.
        safeToExit: event-like object; set while sampling and cleared
            around file I/O so an interrupt cannot corrupt output files.
        runSucceeded: event-like object; set once everything is written.
        config: dict; ``config['tmp_dir']`` is overwritten with a freshly
            created temporary directory.
        state: dict holding the sampler state (tree, MH settings, traces,
            output file names); updated in place.
        treeWriter: object providing ``write_trees(list)``.
        stripes: not used by this function.
        stripeNum: forwarded to ``metropolis``.
        tmpDir: parent directory for the temporary directory, or ``None``
            to use the system default.
    """
    startIter = state['last_iteration'] + 1
    unwrittenTreeL = []
    mcmcSampleTimesL = []
    lastMcmcSampleTime = time.time()

    # If --tmp-dir is not specified on the command line, it will by default be
    # None, which will cause mkdtemp() to place this directory under the system's
    # temporary directory. This is the desired behaviour.
    config['tmp_dir'] = tempfile.mkdtemp(prefix='pwgsdataexchange.',
                                         dir=tmpDir)

    for iteration in range(startIter, state['sample_number']):
        safeToExit.set()
        if iteration < 0:
            logmsg(iteration)

        # Referring to tssb as local variable instead of dictionary element is much
        # faster.
        tssb = state['tssb']
        tssb.resample_assignments()
        tssb.cull_tree()

        # assign node ids
        _, nodes = tssb.get_mixture()
        for i, node in enumerate(nodes):
            node.id = i

        ##################################################
        # some useful info about the tree,
        # used by CNV related computations,
        # to be called only after resampling assignments
        set_node_height(tssb)
        set_path_from_root_to_node(tssb)
        map_datum_to_node(tssb)
        ##################################################

        state['mh_acc'] = metropolis(tssb, state['mh_itr'], state['mh_std'],
                                     state['mh_burnin'], stripeNum,
                                     state['stripes_file'], state['rand_seed'],
                                     config['tmp_dir'])

        # Adapt mh_std from the acceptance rate: double it when acceptance
        # is very low (capped below 10000), halve it when acceptance is high.
        mhAcc = float(state['mh_acc'])
        if mhAcc < 0.08 and state['mh_std'] < 10000:
            state['mh_std'] = state['mh_std'] * 2.0
            logmsg("Shrinking MH proposals. Now %f" % state['mh_std'])
        if 0.5 < mhAcc < 0.99:
            state['mh_std'] = state['mh_std'] / 2.0
            logmsg("Growing MH proposals. Now %f" % state['mh_std'])

        tssb.resample_sticks()
        tssb.resample_stick_orders()
        tssb.resample_hypers(dpAlpha=True, alphaDecay=True, dpGamma=True)

        lastLlh = tssb.complete_data_log_likelihood()
        if iteration >= 0:
            state['cd_llh_traces'][iteration] = lastLlh
            # Progress is logged on every iteration (the original guard was
            # `True or mod(iteration, 10) == 0`, i.e. always true).
            _, nodes = tssb.get_mixture()
            logmsg(' '.join([
                str(v) for v in (iteration, len(nodes),
                                 state['cd_llh_traces'][iteration],
                                 state['mh_acc'], tssb.dpAlpha,
                                 tssb.dpGamma, tssb.alphaDecay)
            ]))
            if argmax(state['cd_llh_traces'][:iteration + 1]) == iteration:
                logmsg("%f is best per-data complete data likelihood so far." %
                       (state['cd_llh_traces'][iteration]))
        else:
            # Negative iterations go to the burn-in trace, offset so that
            # the index is non-negative.
            state['burnin_cd_llh_traces'][
                iteration + state['burnin_sample_number']] = lastLlh

        # Can't just put tssb in unwrittenTreeL, as this object will be modified
        # on subsequent iterations, meaning any stored references in
        # unwrittenTreeL will all point to the same sample.
        serialized = pickle.dumps(tssb, protocol=pickle.HIGHEST_PROTOCOL)
        unwrittenTreeL.append((serialized, iteration, lastLlh))
        state['tssb'] = tssb
        state['rand_state'] = get_state()
        state['last_iteration'] = iteration

        # NOTE(review): detection counts only root children that hold data,
        # but the message reports the number of all root children -- confirm
        # which count is intended.
        rootChildren = state['tssb'].root['children']
        if len([C for C in rootChildren if C['node'].has_data()]) > 1:
            logmsg('Polyclonal tree detected with %s clones.' %
                   len(rootChildren))

        newMcmcSampleTime = time.time()
        mcmcSampleTimesL.append(newMcmcSampleTime - lastMcmcSampleTime)
        lastMcmcSampleTime = newMcmcSampleTime

        # It's not safe to exit while performing file IO, as we don't want
        # trees.zip or the computation state file to become corrupted from an
        # interrupted write.
        safeToExit.clear()
        shouldWriteBackup = iteration % state[
            'write_backups_every'] == 0 and iteration != startIter
        shouldWriteState = iteration % state['write_state_every'] == 0
        isLastIteration = (iteration == state['sample_number'] - 1)

        # If backup is scheduled to be written, write both it and full program
        # state regardless of whether we're scheduled to write state this
        # iteration.
        if shouldWriteBackup or shouldWriteState or isLastIteration:
            # One "<iteration>\t<llh>\t<seconds>" line per queued sample.
            # The pickled tree is deliberately bound to a throwaway name:
            # reusing `tssb` here would clobber the live tree on Python 2,
            # where list-comprehension variables leak into the function scope.
            sampleLines = [
                '%s\t%s\t%s' % (itr, llh, itrTime)
                for (_serialized, itr, llh), itrTime in zip(
                    unwrittenTreeL, mcmcSampleTimesL)
            ]
            with open('mcmc_samples.txt', 'a') as mcmcf:
                mcmcf.write('\n'.join(sampleLines) + '\n')
            treeWriter.write_trees(unwrittenTreeL)
            stateManager.write_state(state)
            unwrittenTreeL = []
            mcmcSampleTimesL = []
            if shouldWriteBackup:
                backupManager.save_backup()

    backupManager.remove_backup()
    safeToExit.clear()
    # save the best tree
    print_top_trees(TreeWriter.defaultArchiveFn, state['top_k_trees_file'],
                    state['top_k'])

    # save clonal frequencies
    freq = dict([(g, []) for g in state['stripe_list']])
    # Materialise the keys once: on Python 3 `dict.keys()` is a view, and
    # `array(view, str)` yields a 0-d array whose `len()` raises.
    stripeNames = list(freq.keys())
    stripeL = array(stripeNames, str)
    stripeL.shape = (1, len(stripeL))
    savetxt(state['clonal_freqs_file'],
            vstack((stripeL, array([freq[g] for g in stripeNames]).T)),
            fmt='%s',
            delimiter=', ')

    safeToExit.set()
    runSucceeded.set()