def resample_changer(self,data,numiter):
    '''
    Metropolis-Hastings / simulated annealing version.
    '''
    # TODO make another version that exploits the gamma/poisson construction
    # of the negative binomial distribution
    if len(data) == 0:
        self.r = sample_discrete(self.discrete) + 1
        self.p = stats.beta.rvs(self.alpha,self.beta)
    else:
        assert np.min(data) >= 1
        # got this general idea from
        # web.mit.edu/~wingated/www/introductions/mcmc-gibbs-intro.pdf

        # get posterior value of current (r,p)
        current_log_prior_value = stats.beta.logpdf(self.p,self.alpha,self.beta) \
                + np.log(self.discrete[self.r-1])
        current_log_likelihood_value = np.sum(self.log_pmf(data))

        for itr in range(numiter):
            # generate proposals, using the prior on r and the conditional
            # posterior on p as the proposal distribution, so the proposal
            # for p uses posterior information
            proposal_r = sample_discrete(self.discrete) + 1
            proposal_p = stats.beta.rvs(self.alpha + proposal_r*float(len(data)),
                    self.beta + np.sum(data-1.))

            # get posterior value for the proposal; the prior on r is
            # evaluated at proposal_r
            proposal_log_prior_value = stats.beta.logpdf(proposal_p,self.alpha,self.beta) \
                    + np.log(self.discrete[proposal_r-1])
            proposal_log_likelihood_value = np.sum(self.log_pmf(x=data,r=proposal_r,p=proposal_p))

            # accept the proposal with the Metropolis-Hastings probability
            accept_probability = np.exp(min(0.,
                proposal_log_prior_value - current_log_prior_value
                + proposal_log_likelihood_value - current_log_likelihood_value))

            if sample_discrete(np.array((1.-accept_probability, accept_probability))):
                self.r, self.p = proposal_r, proposal_p
                current_log_prior_value = proposal_log_prior_value
                current_log_likelihood_value = proposal_log_likelihood_value
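# Every snippet here leans on a shared sample_discrete helper (pyhsmm, for
# example, provides one in pyhsmm.util.stats). Below is a minimal stand-in
# sketch of the assumed interface: draw index/indices in proportion to a
# vector of nonnegative, possibly unnormalized weights. The real helper may
# differ in details such as dtype and all-zero-weight handling.
import numpy as np

def sample_discrete(weights, size=None):
    # inverse-CDF sampling against the cumulative weights
    weights = np.atleast_1d(weights).astype(np.float64)
    cumvals = np.cumsum(weights)
    u = np.random.random(size) * cumvals[-1]
    return np.searchsorted(cumvals, u, side='right')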
def resample(self,data=np.array([]),numiter=10):
    if data.size == 0:
        # sample from the prior
        self.wait = sample_discrete(self.discrete) + self.MIN
        self.distn.resample()
    else:
        assert data.ndim == 1
        # this is a pretty simplistic method
        for itr in range(numiter*10):
            # resample the posterior wait given the fixed distn
            log_probs = np.sum(self.distn.log_pmf(
                np.vstack([data - (wait+self.MIN) for wait in range(len(self.discrete))])),
                axis=1)
            log_probs -= np.amax(log_probs)
            # add self.MIN so the wait uses the same offset convention as the
            # prior branch above
            self.wait = sample_discrete(self.discrete * np.exp(log_probs)) + self.MIN
            # resample the fixed distn given the wait
            self.distn.resample(data - self.wait,numiter=numiter)
def generate_states(self):
    if self.left_censoring:
        raise NotImplementedError
    idx = 0
    nextstate_distr = self.pi_0
    A = self.trans_matrix

    stateseq = np.empty(self.T, dtype=np.int32)
    # durations = []

    while idx < self.T:
        # sample a state
        state = sample_discrete(nextstate_distr)
        # sample a duration for that state
        duration = self.dur_distns[state].rvs()
        # save everything
        # durations.append(duration)
        stateseq[idx:idx + duration] = state # this can run off the end, that's okay
        # set up the next state distribution
        nextstate_distr = A[state]
        # update the index
        idx += duration

    self.stateseq = stateseq
def test(cls):
    from matplotlib import pyplot as plt

    truth = cls(1.,1.)
    print(truth.concentration)

    infer = cls(1.,1.)

    foo = []
    for itr in range(50):
        num_die = 1
        num_sides = 6
        dice = stats.gamma.rvs(truth.concentration * np.ones((num_die,num_sides))/num_sides)
        dice /= dice.sum(1)[:,na] # na: np.newaxis alias

        # get some samples
        num_samples = 50*np.ones(num_die,dtype=int)
        counts = np.zeros((num_die,num_sides),dtype=np.int32)
        for idx, (num, die) in enumerate(zip(num_samples,dice)):
            counts[idx] = np.bincount(sample_discrete(die,size=num),minlength=num_sides)

        infer.resample(counts)
        foo.append(infer.concentration)

    print(np.median(foo))
    plt.hist(foo,bins=25,density=True)
def _resample_a_word(self, hsmm_states):
    # hsmm_states = [letter_state for letter_state in self.letter_hsmm.states_list
    #                if letter_state.word_idx == word_idx]
    candidates = [tuple(letter_state.stateseq_norep)
                  for letter_state in hsmm_states]
    unique_candidates = list(set(candidates))
    ref_array = np.array(
        [unique_candidates.index(candi) for candi in candidates])
    if len(candidates) == 0:
        return self.generate_word()
    elif len(unique_candidates) == 1:
        return unique_candidates[0]
    cache_score = np.empty((len(unique_candidates), len(candidates)))
    likelihoods = np.array(
        [letter_state.log_likelihood() for letter_state in hsmm_states])
    range_tmp = list(range(len(candidates)))
    # score every unique candidate word against every other state sequence
    for candi_idx, candi in enumerate(unique_candidates):
        tmp = range_tmp[:]
        if (ref_array == candi_idx).sum() == 1:
            tmp.remove(np.where(ref_array == candi_idx)[0][0])
        for tmp_idx in tmp:
            cache_score[candi_idx, tmp_idx] = \
                hsmm_states[tmp_idx].likelihood_block_word(candi)[-1]
    cache_scores_matrix = cache_score[ref_array]
    for i in range_tmp:
        cache_scores_matrix[i, i] = 0.0
    scores = cache_scores_matrix.sum(axis=1) + likelihoods
    assert not np.isnan(scores).any(), cache_scores_matrix
    # shift by the max before exponentiating to avoid underflow/overflow
    sampled_candi_idx = sample_discrete(np.exp(scores - np.amax(scores)))
    return candidates[sampled_candi_idx]
def generate_states(self):
    idx = 0
    nextstate_distr = self.initial_distn.pi_0
    A = self.transition_distn.A

    stateseq = -1*np.ones(self.T,dtype=np.int32)
    stateseq_norep = []
    durations = []

    while idx < self.T:
        # sample a state
        state = sample_discrete(nextstate_distr)
        # sample a duration for that state
        duration = self.dur_distns[state].rvs()
        # save everything
        stateseq_norep.append(state)
        durations.append(duration)
        stateseq[idx:idx+duration] = state # this can run off the end, that's okay
        # set up the next state distribution
        nextstate_distr = A[state]
        # update the index
        idx += duration

    self.stateseq_norep = np.array(stateseq_norep,dtype=np.int32)
    self.durations = np.array(durations,dtype=np.int32)
    self.stateseq = stateseq

    # NOTE: self.durations.sum() >= self.T since self.T is the censored length
    assert len(self.stateseq_norep) == len(self.durations)
    assert (self.stateseq >= 0).all()
def generate_word(self, word_size):
    nextstate_distn = self.init_state_distn.pi_0
    A = self.trans_distn.trans_matrix
    word = [-1] * word_size
    for idx in range(word_size):
        word[idx] = sample_discrete(nextstate_distn)
        nextstate_distn = A[word[idx]]
    return tuple(word)
def resample_letter_params(self):
    states_index = [0]
    hsmm = self.letter_hsmm
    hsmm.states_list = []

    # hand each word-state's data segments to the letter-level HSMM
    for s in self.states_list:
        s.letterseq = np.ones(len(s.data), dtype=np.int64) * -1
    for state in range(self.state_dim):
        for s in self.states_list:
            for state2, (start, stop) in s.state_ranges:
                if state == state2:
                    hsmm.add_data_parallel(s.data[start:stop])
                    hsmm.states_list[-1].letterseq = s.letterseq[start:stop]
        states_index.append(len(hsmm.states_list))
    hsmm.resample_states_parallel()
    likelihoods = hsmm.likelihoods()

    for state, bound in enumerate(zip(states_index[:-1], states_index[1:])):
        staff = range(*bound)
        if len(staff) == 0:
            self.word_list[state] = self.generate_word()
            continue
        candidates = []
        scores = []
        for idx in staff:
            rest = set(staff) - set([idx])
            word = hsmm.states_list[idx].stateseq_norep
            score = np.sum([
                hsmm.states_list[s].likelihood_block_word(
                    0, len(hsmm.states_list[s].data), word)
                for s in rest
            ]) + likelihoods[idx]
            scores.append(score)
            candidates.append(tuple(word))
        if len(set(candidates)) > 1:
            # shift by the max before exponentiating to avoid underflow
            scores = np.array(scores)
            word_idx = sample_discrete(np.exp(scores - np.amax(scores)))
            sampleseq = candidates[word_idx]
        else:
            sampleseq = candidates[0]
        self.word_list[state] = tuple(sampleseq)
        for idx in staff:
            s = hsmm.states_list[idx]
            s.letterseq[:] = s.stateseq

    hsmm.resample_trans_distn()
    hsmm.resample_init_state_distn()
    hsmm.resample_dur_distns()
    hsmm.resample_obs_distns()
    self.resample_length_dist()
def _sample_forwards_log(betal,trans_matrix,init_state_distn,log_likelihoods):
    A = trans_matrix
    aBl = log_likelihoods
    T = aBl.shape[0]
    stateseq = np.empty(T,dtype=np.int32)

    nextstate_unsmoothed = init_state_distn
    for idx in range(T):
        logdomain = betal[idx] + aBl[idx]
        logdomain[nextstate_unsmoothed == 0] = -np.inf
        if np.any(np.isfinite(logdomain)):
            stateseq[idx] = sample_discrete(nextstate_unsmoothed
                    * np.exp(logdomain - np.amax(logdomain)))
        else:
            stateseq[idx] = sample_discrete(nextstate_unsmoothed)
        nextstate_unsmoothed = A[stateseq[idx]]

    return stateseq
def generate_word(self, size):
    next_dist = self.init_state_distn.pi_0
    word = []
    for _ in range(size):
        letter = sample_discrete(next_dist)
        word.append(letter)
        next_dist = self.trans_distn.A[letter]
    return tuple(word)
def _sample_forwards_log(betal, trans_matrix, init_state_distn, log_likelihoods):
    A = trans_matrix
    aBl = log_likelihoods
    T = aBl.shape[0]
    stateseq = np.empty(T, dtype=np.int32)

    nextstate_unsmoothed = init_state_distn
    for idx in range(T):
        logdomain = betal[idx] + aBl[idx]
        logdomain[nextstate_unsmoothed == 0] = -np.inf
        if np.any(np.isfinite(logdomain)):
            stateseq[idx] = sample_discrete(nextstate_unsmoothed
                    * np.exp(logdomain - np.amax(logdomain)))
        else:
            stateseq[idx] = sample_discrete(nextstate_unsmoothed)
        nextstate_unsmoothed = A[stateseq[idx]]

    return stateseq
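# Both _sample_forwards_log variants consume betal, the backward log-messages
# betal[t, i] = log p(y_{t+1:T} | x_t = i). A sketch of the pass that would
# produce them for a finite HMM, assuming log_likelihoods has shape (T, N)
# like the aBl arrays used above.
import numpy as np
from scipy.special import logsumexp

def messages_backwards_log(trans_matrix, log_likelihoods):
    log_A = np.log(trans_matrix)
    T, N = log_likelihoods.shape
    betal = np.zeros((T, N))  # betal[T-1] = 0 by convention
    for t in range(T - 2, -1, -1):
        betal[t] = logsumexp(log_A + betal[t + 1] + log_likelihoods[t + 1], axis=1)
    return betal

# usage: stateseq = _sample_forwards_log(
#     messages_backwards_log(A, aBl), A, pi_0, aBl)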
def _generate(self,T):
    alpha = self.alpha_0
    betavec = self.beta.betavec
    model = self.model
    self.stateseq = np.array([])

    ks = list(model._occupied()) + [None]
    firststateidx = sample_discrete(np.arange(len(ks)))
    if firststateidx == len(ks)-1:
        firststate = self._new_label(ks)
    else:
        firststate = ks[firststateidx]

    self.dur.resample(combinedata((model._durs_withlabel(firststate),
        self._durs_withlabel(firststate))))
    firststate_dur = self.dur.rvs()

    self.stateseq = np.ones(firststate_dur,dtype=int)*firststate
    t = firststate_dur

    # run a family-CRF (CRF with durations) forwards
    while t < T:
        ks = list(model._occupied() | self._occupied())
        betarest = 1-sum(betavec[k] for k in ks)

        fromto_counts = np.array([model._counts_fromto(self.stateseq[t-1],k)
            + self._counts_fromto(self.stateseq[t-1],k) for k in ks])

        scores = np.array([(alpha*betavec[k] + ft if k != self.stateseq[t-1] else 0)
            for k,ft in zip(ks,fromto_counts)]
            + [alpha*(1-betavec[self.stateseq[t-1]])*betarest])

        nextstateidx = sample_discrete(scores)
        if nextstateidx == scores.shape[0]-1:
            nextstate = self._new_label(ks)
        else:
            nextstate = ks[nextstateidx]

        # now get the duration of nextstate
        self.dur.resample(combinedata((model._durs_withlabel(nextstate),
            self._durs_withlabel(nextstate))))
        nextstate_dur = self.dur.rvs()

        self.stateseq = np.concatenate((self.stateseq,
            np.ones(nextstate_dur,dtype=int)*nextstate))
        t += nextstate_dur

    self.T = len(self.stateseq)
def generate(self):
    word_size = self.letter_dur.rvs() or 1
    next_state_dist = self.init_dist.pi_0
    ret = []
    for i in range(word_size):
        next_state = sample_discrete(next_state_dist)
        ret.append(next_state)
        next_state_dist = self.letter_trans.A[next_state]
    return tuple(ret)
def generate_states(self):
    T = self.T
    nextstate_distn = self.pi_0
    A = self.trans_matrix

    stateseq = np.zeros(T, dtype=np.int32)
    for idx in range(T):
        stateseq[idx] = sample_discrete(nextstate_distn)
        nextstate_distn = A[stateseq[idx]]

    self.stateseq = stateseq
    return stateseq
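# For a concrete sense of the generation loop above, a self-contained toy run
# with a hypothetical two-state chain (pi_0 and A below are made-up values,
# not from the source); np.random.choice stands in for sample_discrete.
import numpy as np

pi_0 = np.array([0.5, 0.5])          # initial state distribution
A = np.array([[0.9, 0.1],            # row-stochastic transition matrix
              [0.2, 0.8]])

T = 20
stateseq = np.zeros(T, dtype=np.int32)
nextstate_distn = pi_0
for idx in range(T):
    stateseq[idx] = np.random.choice(2, p=nextstate_distn)
    nextstate_distn = A[stateseq[idx]]
print(stateseq)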
def _sample_backwards_normalized(alphan, trans_matrix_transpose):
    AT = trans_matrix_transpose
    T = alphan.shape[0]
    stateseq = np.empty(T, dtype=np.int32)

    next_potential = np.ones(AT.shape[0])
    for t in range(T - 1, -1, -1):
        stateseq[t] = sample_discrete(next_potential * alphan[t])
        next_potential = AT[stateseq[t]]

    return stateseq
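# _sample_backwards_normalized assumes alphan holds the filtered forward
# messages alphan[t, i] = p(x_t = i | y_{1:t}). A sketch of the filtering pass
# that would produce them, assuming likelihoods[t, i] = p(y_t | x_t = i)
# (i.e. np.exp of the aBl arrays used elsewhere); a real implementation might
# also track the log normalizer.
import numpy as np

def messages_forwards_normalized(trans_matrix, init_state_distn, likelihoods):
    T, N = likelihoods.shape
    alphan = np.empty((T, N))
    prediction = init_state_distn
    for t in range(T):
        alphan[t] = prediction * likelihoods[t]
        alphan[t] /= alphan[t].sum()                # renormalize each step
        prediction = alphan[t].dot(trans_matrix)    # predict one step ahead
    return alphan

# usage: stateseq = _sample_backwards_normalized(
#     messages_forwards_normalized(A, pi_0, np.exp(aBl)), A.T.copy())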
def hsmm_sample_forwards_log(
        trans_potentials, initial_state_potential,
        cumulative_obs_potentials, dur_potentials, dur_survival_potentials,
        betal, betastarl,
        left_censoring=False, right_censoring=True):
    T, _ = betal.shape

    stateseq = np.empty(T, dtype=np.int32)
    durations = []

    t = 0

    if left_censoring:
        raise NotImplementedError
    else:
        nextstate_unsmoothed = initial_state_potential

    while t < T:
        ## sample the state
        nextstate_distn_log = nextstate_unsmoothed + betastarl[t]
        nextstate_distn = np.exp(nextstate_distn_log - logsumexp(nextstate_distn_log))
        assert nextstate_distn.sum() > 0
        state = sample_discrete(nextstate_distn)

        ## sample the duration
        dur_logpmf = dur_potentials(t)[:, state]
        obs, offset = cumulative_obs_potentials(t)
        obs, offset = obs[:, state], offset  # [state]

        durprob = np.random.random()
        dur = 0  # NOTE: always incremented at least once
        while durprob > 0 and dur < dur_logpmf.shape[0] and t + dur < T:
            p_d = np.exp(dur_logpmf[dur] + obs[dur] - offset
                         + betal[t + dur, state] - betastarl[t, state])
            assert not np.isnan(p_d)

            durprob -= p_d
            dur += 1

        stateseq[t:t + dur] = state
        durations.append(dur)
        t += dur

        # set up the distribution over the next segment's state
        nextstate_unsmoothed = trans_potentials(t)[state]

    return stateseq, durations
def generate_states(self):
    T = self.T
    stateseq = np.zeros(T,dtype=np.int32)
    nextstate_distn = self.initial_distn.pi_0
    A = self.transition_distn.A
    for idx in range(T):
        stateseq[idx] = sample_discrete(nextstate_distn)
        nextstate_distn = A[stateseq[idx]]
    self.stateseq = stateseq
    return stateseq
def generate_states(self):
    if self.left_censoring:
        raise NotImplementedError
    Tblock = len(self.changepoints)
    blockstateseq = self.blockstateseq = np.zeros(Tblock,dtype=np.int32)

    tblock = 0
    nextstate_distr = self.pi_0
    A = self.trans_matrix

    while tblock < Tblock:
        # sample the state
        state = sample_discrete(nextstate_distr)

        # compute possible duration info (indep. of state)
        possible_durations = self.segmentlens[tblock:].cumsum()

        # compute the pmf over those durations
        durprobs = self.dur_distns[state].pmf(possible_durations)
        # TODO censoring: the last possible duration isn't quite right
        durprobssum = durprobs.sum()
        # if no duration has positive mass, fall back to the shortest one
        if durprobssum == 0:
            durprobs[0] = 1.0
            durprobs[1:] = 0.0
        else:
            durprobs /= durprobssum

        # sample it
        blockdur = sample_discrete(durprobs) + 1

        # set the block sequence
        blockstateseq[tblock:tblock+blockdur] = state

        # set up the next iteration
        tblock += blockdur
        nextstate_distr = A[state]

    self._stateseq_norep = None
    self._durations_censored = None
def _generate(self,T):
    self.T = T
    alpha, kappa = self.alpha_0, self.kappa
    betavec = self.beta.betavec
    stateseq = np.zeros(T,dtype=int)
    model = self.model
    self.stateseq = stateseq[:0]

    # NOTE: we have a choice of what state to start in; it's just a
    # definitional choice that isn't specified by the HDP-HMM.
    # Here we choose just to sample from beta. Note that if this is the
    # first chain being sampled in this model, this will always sample
    # zero, since no states will be occupied.
    ks = list(model._occupied()) + [None]
    firststateidx = sample_discrete(np.arange(len(ks)))
    if firststateidx == len(ks)-1:
        stateseq[0] = self._new_label(ks)
    else:
        stateseq[0] = ks[firststateidx]

    # run a CRF with fixed weights beta forwards
    for t in range(1,T):
        self.stateseq = stateseq[:t]
        ks = list(model._occupied() | self._occupied())
        betarest = 1-sum(betavec[k] for k in ks)

        # get the counts of transitions out of the current state
        # to all other states
        fromto_counts = np.array([model._counts_fromto(stateseq[t-1],k)
            + self._counts_fromto(stateseq[t-1],k) for k in ks])

        # for those states plus a new one, sample proportionally to the
        # scores; the sticky kappa bonus applies to self-transitions out
        # of the previous state
        scores = np.array([(alpha*betavec[k]
            + (kappa if k == stateseq[t-1] else 0) + ft)
            for k,ft in zip(ks,fromto_counts)]
            + [alpha*betarest])

        nextstateidx = sample_discrete(scores)
        if nextstateidx == scores.shape[0]-1:
            stateseq[t] = self._new_label(ks)
        else:
            stateseq[t] = ks[nextstateidx]

    self.stateseq = stateseq
def sample_forwards(self,aBl,betal):
    T = aBl.shape[0]
    stateseq = np.zeros(T,dtype=np.int32)

    nextstate_unsmoothed = self.initial_distn.pi_0
    A = self.transition_distn.A

    for idx in range(T):
        logdomain = betal[idx] + aBl[idx]
        # -inf out impossible transitions to enforce constraints in the
        # transition matrix
        logdomain[nextstate_unsmoothed == 0] = -np.inf
        stateseq[idx] = sample_discrete(nextstate_unsmoothed
                * np.exp(logdomain - np.amax(logdomain)))
        nextstate_unsmoothed = A[stateseq[idx]]

    self.stateseq = stateseq
def hsmm_sample_forwards_log(
        trans_potentials, initial_state_potential,
        cumulative_obs_potentials, dur_potentials, dur_survival_potentials,
        betal, betastarl,
        left_censoring=False, right_censoring=True):
    T, _ = betal.shape

    stateseq = np.empty(T, dtype=np.int32)
    durations = []

    t = 0

    if left_censoring:
        raise NotImplementedError
    else:
        nextstate_unsmoothed = initial_state_potential

    while t < T:
        ## sample the state
        nextstate_distn_log = nextstate_unsmoothed + betastarl[t]
        nextstate_distn = np.exp(nextstate_distn_log - logsumexp(nextstate_distn_log))
        assert nextstate_distn.sum() > 0
        state = sample_discrete(nextstate_distn)

        ## sample the duration
        dur_logpmf = dur_potentials(t)[:, state]
        obs, offset = cumulative_obs_potentials(t)
        obs, offset = obs[:, state], offset[state]

        durprob = np.random.random()
        dur = 0  # NOTE: always incremented at least once
        while durprob > 0 and dur < dur_logpmf.shape[0] and t + dur < T:
            p_d = np.exp(dur_logpmf[dur] + obs[dur] - offset
                         + betal[t + dur, state] - betastarl[t, state])
            assert not np.isnan(p_d)

            durprob -= p_d
            dur += 1

        stateseq[t:t + dur] = state
        durations.append(dur)
        t += dur

        # set up the distribution over the next segment's state
        nextstate_unsmoothed = trans_potentials(t)[state]

    return stateseq, durations
def _sample_forwards_normalized(betan,trans_matrix,init_state_distn,log_likelihoods):
    A = trans_matrix
    aBl = log_likelihoods
    T = aBl.shape[0]
    stateseq = np.empty(T,dtype=np.int32)

    nextstate_unsmoothed = init_state_distn
    for idx in range(T):
        logdomain = aBl[idx]
        logdomain[nextstate_unsmoothed == 0] = -np.inf
        # use the backward message for this timestep
        stateseq[idx] = sample_discrete(nextstate_unsmoothed * betan[idx]
                * np.exp(logdomain - np.amax(logdomain)))
        nextstate_unsmoothed = A[stateseq[idx]]

    return stateseq
def _sample_forwards_normalized(betan, trans_matrix, init_state_distn, log_likelihoods):
    A = trans_matrix
    aBl = log_likelihoods
    T = aBl.shape[0]
    stateseq = np.empty(T, dtype=np.int32)

    nextstate_unsmoothed = init_state_distn
    for idx in range(T):
        logdomain = aBl[idx]
        logdomain[nextstate_unsmoothed == 0] = -np.inf
        # use the backward message for this timestep
        stateseq[idx] = sample_discrete(nextstate_unsmoothed * betan[idx]
                * np.exp(logdomain - np.amax(logdomain)))
        nextstate_unsmoothed = A[stateseq[idx]]

    return stateseq
def sample_forwards(self, betal, betastarl):
    T = self.T
    A = self.A
    aD = self.aD
    stateseq = self.stateseq = np.zeros(T, dtype=np.int32)
    state_ranges = self.state_ranges = []
    idx = 0
    nextstate_unsmoothed = self.model.init_dist.pi_0

    while idx < T:
        logdomain = betastarl[idx] - np.amax(betastarl[idx])
        nextstate_dist = np.exp(logdomain) * nextstate_unsmoothed
        if (nextstate_dist == 0.).all():
            # numerical issue; fall back to following the messages only
            nextstate_dist = np.exp(logdomain)
        state = sample_discrete(nextstate_dist)

        durprob = np.random.random()
        word = self.model.word_list[state]
        dur = len(word) - 1  # a word's duration is at least its length
        while durprob > 0:
            p_d_prior = aD[dur, state] if dur < T else 1.
            assert not np.isnan(p_d_prior)
            assert p_d_prior >= 0

            if p_d_prior == 0:
                dur += 1
                continue

            if idx + dur < T:
                loglikelihood = self.likelihood_block_word(idx, idx + dur + 1, word)
                mess_term = np.exp(loglikelihood + betal[idx + dur, state]
                                   - betastarl[idx, state])
                p_d = mess_term * p_d_prior
                assert not np.isnan(p_d)
                durprob -= p_d
                dur += 1
            else:
                dur += 1
                break

        assert dur > 0
        assert dur >= len(word)
        stateseq[idx:idx + dur] = state
        state_ranges.append((state, (idx, idx + dur)))
        nextstate_unsmoothed = A[state]
        idx += dur
def sample_forwards(self, betal, betastarl):
    T = self.T
    aD = np.exp(self.aDl)
    log_trans_matrix = self.log_trans_matrix
    stateseq = self._stateseq[:]
    stateseq[:] = -1
    letter_stateseq = self._letter_stateseq[:]
    letter_stateseq[:] = -1
    stateseq_norep = []
    durations_censored = []
    t = 0
    nextstate_unsmoothed = self.pi_0
    while t < T:
        logdomain = betastarl[t] - betastarl[t].max()
        nextstate_dist = np.exp(logdomain) * nextstate_unsmoothed
        if (nextstate_dist == 0.).all():
            # numerical issue; fall back to following the messages only
            nextstate_dist = np.exp(logdomain)
        state = sample_discrete(nextstate_dist)
        durprob = np.random.random()

        cache_mess_term = np.exp(
            self.likelihood_block_word(t, T, self.model.word_list[state])
            + betal[t:T, state] - betastarl[t, state])

        dur = 0
        while durprob > 0 and t + dur < T:
            p_d_prior = aD[dur, state]
            assert not np.isnan(p_d_prior)
            assert p_d_prior >= 0

            p_d = cache_mess_term[dur] * p_d_prior
            assert not np.isnan(p_d)
            durprob -= p_d
            dur += 1
        assert dur > 0
        assert dur >= len(self.model.word_list[state])

        stateseq[t:t + dur] = state
        # transition out of the sampled state
        nextstate_unsmoothed = np.exp(log_trans_matrix[state])
        t += dur
        stateseq_norep.append(state)
        durations_censored.append(dur)
    self._stateseq_norep = np.array(stateseq_norep, dtype=np.int32)
    self._durations_censored = np.array(durations_censored, dtype=np.int32)
def test(cls):
    from matplotlib import pyplot as plt

    truth = cls(1.,1.)
    infer = cls(1.,1.)
    print(truth.concentration)

    blah = []
    for itr in range(200):
        alldata = []
        sizes = [20]
        for size in sizes:
            # 50 components is effectively infinite when #draws = 20
            weights = stats.gamma.rvs(truth.concentration/50,size=50)
            weights /= weights.sum()
            alldata.append(sample_discrete(weights,size=size))
        infer.resample(sample_numbers=np.array(sizes),
                total_num_distinct=len(set(np.concatenate(alldata))))
        blah.append(infer.concentration)

    print(np.median(blah))
    plt.hist(blah,bins=25,density=True)
def hlm_sample_forwards_log(likelihood_block_word_func, trans_matrix, pi_0, aDl,
                            word_list, betal, betastarl, stateseq,
                            stateseq_norep, durations_censored):
    stateseq[:] = -1
    T = betal.shape[0]
    t = 0
    aD = np.exp(aDl)
    nextstate_unsmoothed = pi_0
    while t < T:
        logdomain = betastarl[t] - betastarl[t].max()
        nextstate_dist = np.exp(logdomain) * nextstate_unsmoothed
        state = sample_discrete(nextstate_dist)
        durprob = np.random.random()

        cache_mess_term = np.exp(
            likelihood_block_word_func(t, T, word_list[state])
            + betal[t:T, state] - betastarl[t, state])

        dur = 0
        while durprob > 0 and t + dur < T:
            p_d_prior = aD[dur, state]
            assert not np.isnan(p_d_prior)
            assert p_d_prior >= 0

            p_d = cache_mess_term[dur] * p_d_prior
            assert not np.isnan(p_d)
            durprob -= p_d
            dur += 1
        assert dur > 0
        assert dur >= len(word_list[state])

        stateseq[t:t + dur] = state
        nextstate_unsmoothed = trans_matrix[state]
        t += dur
        stateseq_norep.append(state)
        durations_censored.append(dur)
    stateseq_norep = np.array(stateseq_norep, dtype=np.int32)
    durations_censored = np.array(durations_censored, dtype=np.int32)
    return stateseq, stateseq_norep, durations_censored
def resample_words(self):
    for word_idx in range(self.num_states):
        hsmm_states = [letter_state for letter_state in self.letter_hsmm.states_list
                       if letter_state.word_idx == word_idx]
        candidates = [tuple(letter_state.stateseq_norep) for letter_state in hsmm_states]
        unique_candidates = list(set(candidates))
        ref_array = np.array([unique_candidates.index(candi) for candi in candidates])
        if len(candidates) == 0:
            self._generate_word_and_set_at(word_idx)
            continue
        elif len(unique_candidates) == 1:
            self.word_list[word_idx] = unique_candidates[0]
            continue
        cache_score = np.empty((len(unique_candidates), len(candidates)))
        likelihoods = np.array([letter_state.log_likelihood() for letter_state in hsmm_states])
        range_tmp = list(range(len(candidates)))
        # score every unique candidate word against every other state sequence
        for candi_idx, candi in enumerate(unique_candidates):
            tmp = range_tmp[:]
            if (ref_array == candi_idx).sum() == 1:
                tmp.remove(np.where(ref_array == candi_idx)[0][0])
            for tmp_idx in tmp:
                cache_score[candi_idx, tmp_idx] = \
                    hsmm_states[tmp_idx].likelihood_block_word(candi)[-1]
        cache_scores_matrix = cache_score[ref_array]
        for i in range_tmp:
            cache_scores_matrix[i, i] = 0.0
        scores = cache_scores_matrix.sum(axis=1) + likelihoods
        assert not np.isnan(scores).any(), cache_scores_matrix
        # shift by the max before exponentiating to avoid underflow/overflow
        sampled_candi_idx = sample_discrete(np.exp(scores - np.amax(scores)))
        self.word_list[word_idx] = candidates[sampled_candi_idx]

    # merge letter sequences that are identical but carry different ids
    for i, word in enumerate(self.word_list):
        if word in self.word_list[:i]:
            existed_id = self.word_list[:i].index(word)
            for word_state in self.states_list:
                stateseq, stateseq_norep = word_state.stateseq, word_state.stateseq_norep
                word_state.stateseq[stateseq == i] = existed_id
                word_state.stateseq_norep[stateseq_norep == i] = existed_id
            self._generate_word_and_set_at(i)
def generate(self, limit_len=3):
    nextstate_dist = self.init_dist.pi_0
    A = self.trans_dists.A
    state_list = []
    for _ in range(limit_len):
        state = sample_discrete(nextstate_dist)
        state_list.append(state)
        nextstate_dist = A[state]

    stateseq = []
    letseq = []
    obsseq = []
    for state in state_list:
        for letter in self.word_list[state]:
            d = self.dur_distns[letter].rvs() or 1
            o = self.obs_distns[letter].rvs(size=d)
            obsseq.append(o)
            letseq.append([letter] * d)
            stateseq.append([state] * d)
    return list(map(np.concatenate, (stateseq, letseq, obsseq)))
def generate_states(self):
    if self.left_censoring:
        raise NotImplementedError
    idx = 0
    nextstate_distr = self.pi_0
    A = self.trans_matrix

    stateseq = np.empty(self.T,dtype=np.int32)
    # durations = []

    while idx < self.T:
        # sample a state
        state = sample_discrete(nextstate_distr)
        # sample a duration for that state
        duration = self.dur_distns[state].rvs()
        # save everything
        # durations.append(duration)
        stateseq[idx:idx+duration] = state # this can run off the end, that's okay
        # set up the next state distribution
        nextstate_distr = A[state]
        # update the index
        idx += duration

    self.stateseq = stateseq
def rvs_given_less_than(self,x,num):
    # draw num samples from the pmf truncated to {1,...,x-1}
    pmf = self.pmf(np.arange(1,x))
    return sample_discrete(pmf,num)+1
def rvs_given_less_than(self, x, num):
    # draw num samples from the pmf truncated to {1,...,x-1}
    pmf = self.pmf(np.arange(1, x))
    return sample_discrete(pmf, num) + 1
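# A quick usage sketch for the truncated sampler above, with a hypothetical
# shifted-Poisson duration pmf standing in for whatever self.pmf is; it reuses
# the sample_discrete sketch given earlier.
import numpy as np
from scipy import stats

lmbda = 4.
pmf = stats.poisson.pmf(np.arange(1, 10) - 1, lmbda)  # pmf over {1, ..., 9}
draws = sample_discrete(pmf, 5) + 1                   # five truncated draws
print(draws)  # each value lies in {1, ..., 9}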
def rvs(self,size=[]):
    return sample_discrete(self.distn,size=size)
def sample_forwards(self,betal,betastarl):
    stateseq = self.stateseq = np.zeros(self.T,dtype=np.int32)
    durations = []
    stateseq_norep = []

    idx = 0
    A = self.transition_distn.A
    nextstate_unsmoothed = self.initial_distn.pi_0

    # precompute the duration pmf for every state
    apmf = np.zeros((self.state_dim,self.T))
    arg = np.arange(1,self.T+1)
    for state_idx, dur_distn in enumerate(self.dur_distns):
        apmf[state_idx] = dur_distn.pmf(arg)

    while idx < self.T:
        logdomain = betastarl[idx] - np.amax(betastarl[idx])
        nextstate_distr = np.exp(logdomain) * nextstate_unsmoothed
        if (nextstate_distr == 0.).all():
            # this is a numerical issue; no good answer, so we'll just
            # follow the messages
            nextstate_distr = np.exp(logdomain)
        state = sample_discrete(nextstate_distr)
        assert len(stateseq_norep) == 0 or state != stateseq_norep[-1]

        durprob = random()
        dur = 0 # always incremented at least once
        prob_so_far = 0.0
        while durprob > 0:
            assert dur < 2*self.T # hacky infinite loop check

            # note funny indexing: the dur variable is 1 less than the
            # actual duration we're considering
            p_d_marg = apmf[state,dur] if dur < self.T else 1.
            assert not np.isnan(p_d_marg)
            assert p_d_marg >= 0
            if p_d_marg == 0:
                dur += 1
                continue

            if idx+dur < self.T:
                mess_term = np.exp(self.likelihood_block_state(idx,idx+dur+1,state)
                        + betal[idx+dur,state] - betastarl[idx,state]) # TODO unnecessarily slow for subhmms
                p_d = mess_term * p_d_marg
                prob_so_far += p_d

                assert not np.isnan(p_d)
                durprob -= p_d
                dur += 1
            else:
                # we're out of data, so we need to sample a duration
                # conditioned on having lasted at least this long. the
                # likelihood contributes the same to all possibilities, so
                # we can just sample from the prior (conditioned on it being
                # at least this long). 2*T is just a guessed upper bound;
                # +1 because dur is one less than the duration we're
                # actually considering
                arg = np.arange(dur+1,2*self.T)
                remaining = self.dur_distns[state].pmf(arg)
                dur += sample_discrete(remaining) + 1
                durprob = -1 # exit the loop

        assert dur > 0

        stateseq[idx:idx+dur] = state
        stateseq_norep.append(state)
        assert len(stateseq_norep) < 2 or stateseq_norep[-1] != stateseq_norep[-2]
        durations.append(dur)

        nextstate_unsmoothed = A[state,:]

        idx += dur

    self.durations = np.array(durations,dtype=np.int32)
    self.stateseq_norep = np.array(stateseq_norep,dtype=np.int32)
def rvs(self,size=[]):
    return sample_discrete(self.pi_0,size=size)