예제 #1
0
def get_initial_probs(args, late):
    """Build the starting HMM parameter matrices and the number of states.

    Two sources are supported:

    * ``args.kmeans`` is not None: the values stored under every key of
      ``late.count`` are pooled into one flat list and handed to
      ``help_get_state_emissions_from_kmeans`` together with ``args.kmeans``,
      which is also used as the state count.
    * otherwise: the matrices come from
      ``help_get_prob_matrices_from_params`` and the state count is the
      number of fields in ``args.mu`` (split on ``;`` for the "discrete"
      emission model, on ``,`` for any other model).

    Returns:
        The 7-tuple ``(nstates, r_eprobs, r_tprobs, r_iprobs, np_eprobs,
        np_tprobs, np_iprobs)``; the last six items are exactly what the
        selected helper returned.
    """
    if not args.quiet:
        newmsg("Constructing probability matrices...")

    if args.kmeans is not None:
        nstates = args.kmeans
        # Flatten the per-key count sequences into a single list.
        pooled = [value for chrom in late.count for value in late.count[chrom]]
        matrices = help_get_state_emissions_from_kmeans(pooled, args.kmeans)
    else:
        # The field separator of the mu string depends on the emission model.
        separator = ';' if args.emodel == "discrete" else ','
        mu_fields = args.mu.strip().strip('\\').split(separator)
        nstates = len(mu_fields)
        matrices = help_get_prob_matrices_from_params(
            args.mu,
            args.sigma,
            args.mu_scale,
            args.leave_special_state,
            args.leave_other,
            args.special_idx,
            args.init_special,
            args.initialprobs,
            args.transprobs,
            args.discrete,
        )

    # Unpack explicitly so a helper returning the wrong arity still fails here.
    r_eprobs, r_tprobs, r_iprobs, np_eprobs, np_tprobs, np_iprobs = matrices
    return nstates, r_eprobs, r_tprobs, r_iprobs, np_eprobs, np_tprobs, np_iprobs
예제 #2
0
def run(parser, args):
    """Entry point: normalize the data, initialise the HMM, iterate, report.

    Steps, in order:
      1. Reject the combination of the "discrete" emission model with k-means
         initialisation, and record ``args.discrete``.
      2. Warn when more than one iteration is requested without an output
         prefix.
      3. Obtain the transformed late-stage data from ``NormalizeProtocol``.
      4. Optionally write the transformed counts (``args.counts``).
      5. Build the initial probability matrices and run the iterations.
      6. Emit one "<iteration>\\t<log10 prob>" message per requested iteration.

    ``parser`` is accepted for interface compatibility and is not used here.
    """
    # K-means initialisation is not available for the discrete option.
    is_discrete = args.emodel == "discrete"
    assert not (is_discrete and args.kmeans is not None)

    # Downstream helpers read the emission-model choice from args.discrete.
    args.discrete = is_discrete

    # Without a prefix every iteration's report goes to the same place.
    if args.iters > 1 and args.outpfx is None:
        newmsg(
            "\n\tWARNING WARNING WARNING!!!!!!!!!!!!"
            "\n\tIters > 1, but no output prefix specified!"
            "\n\tIgnore if that was intentional."
            "\n\tElse restart with --outpfx!"
            "\n\tWARNING WARNING WARNING!!!!!!!!!!!!"
        )

    # Transform data with the specified normalization protocol
    # (which may be "no normalization").
    late = NormalizeProtocol(args).late

    # Optional reporting of the transformed data.
    if args.counts:
        report_counts(args, late)

    # Initial parameters.
    initial = get_initial_probs(args, late)
    nstates, r_eprobs, r_tprobs, r_iprobs, np_eprobs, np_tprobs, np_iprobs = initial

    # Iterate.
    log10probs = do_hmm_iter_steps(
        args, late, nstates,
        r_eprobs, r_tprobs, r_iprobs,
        np_eprobs, np_tprobs, np_iprobs,
        args.converge,
    )

    # Report the log probability recorded for every iteration slot.
    for iteration in range(args.iters):
        newmsg("\t".join((str(iteration), str(log10probs[iteration]))))
예제 #3
0
def report_counts(args, late):
    """Write the late-stage counts to ``<args.counts>.bedGraph``.

    Args:
        args: namespace providing ``counts`` (output path prefix), ``quiet``
            and ``collapsed``.
        late: object exposing ``count`` and ``get_bdg(count, collapsed)``;
            the string returned by ``get_bdg`` is written verbatim.

    The file is opened in a ``with`` block so the handle is closed even if
    ``get_bdg`` or the write raises.
    """
    fn = args.counts + ".bedGraph"
    if not args.quiet:
        bdgmsg("REPORTING:: Final normalized late stage counts", args.collapsed)
        newmsg("Written to: " + fn)
    with open(fn, 'w') as out:
        out.write(late.get_bdg(late.count, args.collapsed))
예제 #4
0
def run(parser, args):
    """Filter late-stage bedGraph data against a threshold and print it.

    Steps:
      1. Pick the normalization protocol from the first set flag among
         ``args.protocol1`` .. ``args.protocol6``.
      2. Load the late-stage data, through ``normalize`` unless
         ``args.skipnorm`` is set, in which case it is loaded with ``CovBed``.
      3. Optionally write the counts to ``<args.counts>.bedGraph``.
      4. Derive the threshold: mean +/- ``args.value`` standard deviations,
         ``args.value`` times the mean, or ``args.value`` itself.
      5. Write ``late.filtered_bdg(...)`` to stdout.

    ``parser`` is accepted for interface compatibility and is not used here.

    Raises:
        ValueError: if normalization is requested but no protocol flag is
            set. Previously this surfaced as an UnboundLocalError on
            ``protocol``.
    """
    # First protocol flag that is set wins; None when no flag is set.
    protocol = next(
        (number for number in range(1, 7)
         if getattr(args, "protocol" + str(number))),
        None,
    )

    if not args.skipnorm:
        if protocol is None:
            raise ValueError(
                "no normalization protocol selected: set one of "
                "protocol1..protocol6 or use skipnorm")
        late = normalize(latestage=args.latestage,
                         protocol=protocol,
                         earlystage=args.earlystage,
                         pseudo=args.pseudo,
                         bandwidth=args.bandwidth,
                         quiet=args.quiet)
    else:
        late = CovBed(args.latestage)

    if args.counts:
        if not args.quiet:
            bdgmsg("final normalized late stage counts", args.collapsed)
        # Context manager: the handle is released even if get_bdg/write raises.
        with open(args.counts + ".bedGraph", 'w') as out:
            out.write(late.get_bdg(late.count, args.collapsed))

    if args.stdev_above:  ## want relation in terms of SD units
        sd = late.get_sd()
        value = late.get_mean() + args.value * sd
    elif args.stdev_below:  ## want relation in terms of SD units
        sd = late.get_sd()
        value = late.get_mean() - args.value * sd
    elif args.mean:  ## want relation in terms of MU units
        mu = late.get_mean()
        value = args.value * mu
    else:  ## want relation wrt given value
        value = args.value

    if not args.quiet:
        bdgmsg("filtered bdg values s.t. col4 is " + args.relation + " " +
               str(value) + "...",
               collapsed=False)
    # Mean/SD are reported whenever they were used, regardless of args.quiet.
    if args.stdev_above or args.stdev_below or args.mean:
        newmsg("Mean = " + str(late.get_mean()))
        newmsg("SD = " + str(late.get_sd()))

    sys.stdout.write(
        late.filtered_bdg(relation=args.relation, value=value, bdg=None))
예제 #5
0
def run(parser, args):
    """Filter late-stage bedGraph data against a threshold and print it.

    The late-stage data is loaded (through ``normalize`` unless
    ``args.skipnorm`` is set), optionally dumped to
    ``<args.counts>.bedGraph``, and then ``late.filtered_bdg`` is called with
    ``args.relation`` and a threshold that is one of:

    * mean + ``args.value`` * SD   (``args.stdev_above``)
    * mean - ``args.value`` * SD   (``args.stdev_below``)
    * ``args.value`` * mean        (``args.mean``)
    * ``args.value``               (otherwise)

    The filtered result is written to stdout. ``parser`` is unused.

    Raises:
        ValueError: if normalization is requested but none of
            ``args.protocol1`` .. ``args.protocol6`` is set. Previously this
            surfaced as an UnboundLocalError on ``protocol``.
    """
    if args.protocol1:
        protocol = 1
    elif args.protocol2:
        protocol = 2
    elif args.protocol3:
        protocol = 3
    elif args.protocol4:
        protocol = 4
    elif args.protocol5:
        protocol = 5
    elif args.protocol6:
        protocol = 6
    else:
        # No flag set: only acceptable when normalization is skipped.
        protocol = None

    if args.skipnorm:
        late = CovBed(args.latestage)
    else:
        if protocol is None:
            raise ValueError(
                "no normalization protocol selected: set one of "
                "protocol1..protocol6 or use skipnorm")
        late = normalize(latestage=args.latestage, protocol=protocol,
                         earlystage=args.earlystage, pseudo=args.pseudo,
                         bandwidth=args.bandwidth, quiet=args.quiet)

    if args.counts:
        if not args.quiet:
            bdgmsg("final normalized late stage counts", args.collapsed)
        # 'with' guarantees the file is closed even if get_bdg/write raises.
        with open(args.counts + ".bedGraph", 'w') as out:
            out.write(late.get_bdg(late.count, args.collapsed))

    if args.stdev_above:  ## want relation in terms of SD units
        sd = late.get_sd()
        value = late.get_mean() + args.value * sd
    elif args.stdev_below:  ## want relation in terms of SD units
        sd = late.get_sd()
        value = late.get_mean() - args.value * sd
    elif args.mean:  ## want relation in terms of MU units
        mu = late.get_mean()
        value = args.value * mu
    else:  ## want relation wrt given value
        value = args.value

    if not args.quiet:
        bdgmsg("filtered bdg values s.t. col4 is " + args.relation + " " + str(value) + "...", collapsed=False)
    # Mean/SD are reported whenever they were used, regardless of args.quiet.
    if args.stdev_above or args.stdev_below or args.mean:
        newmsg("Mean = " + str(late.get_mean()))
        newmsg("SD = " + str(late.get_sd()))

    sys.stdout.write(late.filtered_bdg(relation=args.relation, value=value, bdg=None))
예제 #6
0
    def _normalize(self):
        """Load the input file(s), optionally impute, then run the protocol.

        Sets ``self.late`` (always) and ``self.early`` (a ``CovBed`` when
        ``self.args.earlystage`` is truthy, otherwise ``False``), then calls
        ``self._set_protocol()`` followed by ``self._run_protocol()``.

        Progress messages are emitted only when ``self.args.quiet`` is false.
        The protocol message is now guarded the same way: the original built
        its suffix only in non-quiet mode but used it unconditionally, so
        quiet runs crashed with an UnboundLocalError.
        """
        verbose = not self.args.quiet

        if verbose:
            newmsg("loading late stage file")
        self.late = CovBed(self.args.latestage,
                           replace=self.args.replace,
                           replace_with=self.args.replace_with,
                           replace_this=self.args.replace_this,
                           stringcols=self.args.stringcols)
        if self.args.impute:
            if verbose:
                newmsg("imputing late stage bins with missing data")
            self.late.impute_zeros(bw=self.args.impute)

        if self.args.earlystage:
            if verbose:
                newmsg("loading early stage file")
            self.early = CovBed(self.args.earlystage,
                                replace=self.args.replace,
                                replace_with=self.args.replace_with,
                                replace_this=self.args.replace_this,
                                stringcols=self.args.stringcols)
            if self.args.impute:
                if verbose:
                    newmsg("imputing early stage bins with missing data")
                self.early.impute_zeros(bw=self.args.impute)
        else:
            # False marks "no early stage data" for later steps.
            self.early = False
        ## todo -- add filtering out 0 contigs option...

        # Always defined, so the message below can never hit an unbound name.
        if self.args.earlystage:
            emsg = ' with additional early stage normalization'
        else:
            emsg = ' without additional early stage normalization'

        self._set_protocol()
        if verbose:
            newmsg("following normalization protocol "+str(self.protocol)+emsg)
        self._run_protocol()
예제 #7
0
def report(args, late, statepath, i):
    """Write the state path (or its levels) for iteration ``i`` as bedGraph.

    Args:
        args: namespace providing ``outpfx``, ``levels``, ``quiet`` and
            ``collapsed``.
        late: object exposing ``get_bdg(values, collapsed)``; its return
            value is written verbatim.
        statepath: the state path to report.
        i: iteration number, used in the output file name and the message.

    Output goes to ``<args.outpfx>_iter<i>.bedGraph`` or, when
    ``args.outpfx`` is None, to stdout. When ``args.levels`` is truthy the
    result of ``get_levels(statepath)`` is reported instead of the raw path.

    A file opened here is closed in a ``finally`` clause, so it is released
    even if computing or writing the output raises. stdout is never closed.
    """
    ## FIGURE OUT OUTPUT NAME OR STDOUT
    to_stdout = args.outpfx is None
    if to_stdout:
        out = sys.stdout
    else:
        out = open(args.outpfx + '_iter' + str(i) + '.bedGraph', 'w')

    try:
        ## OPTION: If opted for, report the state means/levels instead
        if args.levels:
            newmsg("Getting levels for alternative reporting...")
            values = get_levels(statepath)
        else:
            values = statepath

        ## TALK
        if not args.quiet:
            bdgmsg("REPORTING STATE PATH ITER " + str(i), args.collapsed)

        ## REPORT
        out.write(late.get_bdg(values, args.collapsed))
    finally:
        ## CLOSE FILE (only one this function opened)
        if not to_stdout:
            out.close()
예제 #8
0
def run(parser, args):
    """Generate a state path and emitted data from an HMM and print them.

    The bedGraph named by ``args.bedgraph`` is loaded with ``CovBed``;
    emission, transition and initial probability matrices are built from the
    command-line parameters; ``generate_hmmR`` then produces a state path and
    the emitted data, which are written to stdout through
    ``expanded_bdg_two_cols``.

    ``parser`` is accepted for interface compatibility and is not used here.
    """
    track = CovBed(args.bedgraph)
    verbose = not args.quiet

    if verbose:
        newmsg("finding state path")

    # Emission probabilities; this helper also yields the number of states.
    emissions, nstates = help_get_emission_probs(
        args.mu, args.sigma, args.mu_scale)

    # Transition probabilities.
    transitions = help_get_transition_probs(
        args.leave_special_state, args.leave_other, args.special_idx, nstates)

    # Initial-state probabilities.
    initials = help_get_initial_probs(
        nstates, args.special_idx, args.init_special, args.initialprobs)

    # Run the HMM with these parameters.
    generated = generate_hmmR(
        track, args.emodel,
        eprobs=emissions, tprobs=transitions, iprobs=initials)
    statepath, emitted_data = generated

    if verbose:
        bdgmsg("state path", False)

    output = track.expanded_bdg_two_cols(emitted_data, statepath)
    sys.stdout.write(output)
예제 #9
0
def do_hmm_iter_steps(args, late, nstates, r_eprobs, r_tprobs, r_iprobs, np_eprobs, np_tprobs, np_iprobs, converged=1e-9):
    """Alternate state-path decoding and parameter re-estimation.

    For each of up to ``args.iters`` iterations:
      1. decode a state path with the current parameters (``hmmR``) and
         report it;
      2. record the log10 probability of that path;
      3. from the second iteration on, stop early when the absolute change
         in log10 probability is <= ``converged``; the remaining slots are
         filled with the value of the stopping iteration;
      4. unless this is the last iteration, re-estimate the parameters from
         the path (``updateparameters``).

    This is "Viterbi Training" when the Viterbi-decoded path is used, and a
    modified Baum-Welch when posterior decoding is used (the posterior state
    path is treated as the full solution instead of propagating
    probabilities).

    Note: the ``nstates`` argument is ignored; the state count is taken from
    ``np_tprobs.shape[0]``.

    Returns:
        Array of length ``args.iters`` holding the log10 probability per
        iteration.
    """
    nstates = np_tprobs.shape[0]
    log10probs = np.zeros(args.iters)
    verbose = not args.quiet
    last_iter = args.iters - 1

    for i in range(args.iters):
        tag = str(i)

        if verbose:
            newmsg("PARAMETERS: iter " + tag)
            newmsg("\nTransition Probs:\n" + str(r_tprobs))
            newmsg("\nEmission Probs:\n" + str(r_eprobs))
            newmsg("\nInitial Probs:\n" + str(r_iprobs))
            newmsg("FINDING STATE PATH: iter " + tag)

        ## STEP 1: decode the state path under the current parameters.
        statepath = hmmR(late, args.path, args.emodel,
                         eprobs=r_eprobs, tprobs=r_tprobs, iprobs=r_iprobs)
        report(args, late, statepath, i)

        ## STEP 1.5: log10 probability of that path.
        if verbose:
            newmsg("Computing LOG10 PROB STATEPATH: iter " + tag + ".........")
        log10probs[i] = log10_prob_state_path(late, statepath, np_eprobs, np_tprobs, np_iprobs, args.emodel)
        if verbose:
            newmsg("LOG10 PROB STATEPATH: iter " + tag + " = " + str(log10probs[i]))

        ## CONVERGED?  (i > 0 already implies more than one iteration.)
        if i > 0:
            abslogdiff = abs(log10probs[i] - log10probs[i - 1])
            if verbose:
                newmsg("DIFFERENCE FROM LAST ITER: " + " = " + str(abslogdiff))
            if abslogdiff <= converged:
                # Pad the unused slots with the value of the stopping iter.
                log10probs[i + 1:] = log10probs[i]
                if verbose:
                    newmsg("Model has converged at iter: " + tag + "\nLog10 State path given model = " + str(log10probs[i]) + "\n... STOPPING. ")
                break

        ## STEP 2: re-estimate parameters, except after the final iteration.
        if i < last_iter:
            if verbose:
                newmsg("UPDATING PARAMETERS: iter " + tag)
            updated = updateparameters(late,
                                       statepath,
                                       nstates,
                                       old_np_eprobs=np_eprobs,
                                       learnpseudo=args.learnpseudo,
                                       emodel=args.emodel,
                                       emitpseudo=args.emitpseudo,
                                       constrainEmit=args.constrainEmit)
            r_eprobs, r_tprobs, r_iprobs, np_eprobs, np_tprobs, np_iprobs = updated

    return log10probs