Example #1
    def fit(self, X, run_lengths, lengths=None):
        X = check_array(X)
        self._init(X, lengths=lengths)
        self._check()

        self.monitor_._reset()
        for iter in range(self.n_iter):
            stats = self._initialize_sufficient_statistics()
            curr_logprob = 0
            for i, j in iter_from_X_lengths(X, lengths):
                rls = run_lengths[i:j]
                framelogprob = self._compute_log_likelihood(X[i:j])
                logprob, fwdlattice = self._do_forward_pass(framelogprob, rls)
                curr_logprob += logprob
                bwdlattice = self._do_backward_pass(framelogprob, rls)
                posteriors = self._compute_posteriors(fwdlattice, bwdlattice)
                self._accumulate_sufficient_statistics(stats, X[i:j],
                                                       framelogprob,
                                                       posteriors, fwdlattice,
                                                       bwdlattice, rls)
            self._do_mstep(stats)
            self.monitor_.report(curr_logprob)
            if self.monitor_.converged:
                break

        if (self.transmat_.sum(axis=1) == 0).any():
            _log.warning("Some rows of transmat_ have zero sum because no "
                         "transition from the state was ever observed.")

        return self
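All of the snippets on this page drive their per-sequence loops with ``iter_from_X_lengths`` from ``hmmlearn.utils`` (no longer present in recent hmmlearn releases). As a reference point, here is a minimal sketch of the slicing contract the examples rely on:

import numpy as np

def iter_from_X_lengths_sketch(X, lengths):
    # With lengths=None, the whole matrix counts as a single sequence.
    if lengths is None:
        yield 0, len(X)
    else:
        end = np.cumsum(lengths)
        if end[-1] > X.shape[0]:
            raise ValueError("lengths sum to more samples than X contains")
        start = end - lengths
        # Yields one (start, end) pair per sequence, so that X[start:end]
        # is the corresponding sub-sequence.
        for i, j in zip(start, end):
            yield int(i), int(j)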
Example #2
    def score(self, X, y, lengths=None):
        if isinstance(X, list):
            lengths = [x.shape[0] for x in X]
            X = np.concatenate(X)
            y = np.array(y)
        elif lengths is None:
            lengths = [X.shape[0]]

        Nseqs = y.shape[0]
        score = 0
        for k in range(self.n_nodes):
            lengthsk = [lengths[i] for i in range(len(lengths)) if y[i] == k]
            if not lengthsk:
                continue
            Nseqsk = len(lengthsk)
            Xk = np.concatenate([
                X[i:j, :]
                for seq_idx, (i, j) in enumerate(iter_from_X_lengths(X, lengths))
                if y[seq_idx] == k
            ], axis=0)
            yk = np.zeros(Nseqsk, dtype=int)

            scorek = self.hmm[k].score(Xk, yk, lengths=lengthsk)
            score += scorek * Nseqsk
        score /= Nseqs

        return score
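The returned value is a sequence-weighted average: each per-node score (assumed here to be a mean over that node's own sequences) is multiplied by the number of sequences assigned to the node, summed, and divided by the total sequence count. A toy check of the arithmetic with hypothetical per-node scores:

import numpy as np

node_scores = np.array([-10.0, -20.0])  # hypothetical mean score per node
node_counts = np.array([3, 1])          # sequences assigned to each node

# score = sum_k score_k * Nseqs_k / Nseqs
score = (node_scores * node_counts).sum() / node_counts.sum()
print(score)  # (-30.0 + -20.0) / 4 = -12.5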
Example #3
    def _estimate_initial_prob(self, Y, lengths):
        """Returns an frequentist estimate of the initial probabilities over
        the observed hidden states.  Assumes that the hidden states are
        specified by sequential numbers 0, 1, ...  numbers).

        Parameters
        ----------
        Y : numpy array (n_samples,)
            Individual observations of hidden states.
        lengths : list of integer
            Lengths of sequences in X and Y. If None, X and Y are assumed to
            be a single sequence. The sum of lengths should be n_samples.
        """
        if not len(Y):
            return {}

        ip = np.zeros(self.n_states)
        for i, j in iter_from_X_lengths(Y, lengths):
            ip[Y[i]] += 1

        ip /= ip.sum()
        # To dictionary
        ip = dict(enumerate(ip))

        return ip
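Only the first observation of each sequence contributes to this estimate. A quick standalone check of the counting logic (using hmmlearn's helper, available in the versions these snippets target):

import numpy as np
from hmmlearn.utils import iter_from_X_lengths

Y = np.array([0, 1, 1, 2, 0])  # two sequences: [0, 1, 1] and [2, 0]
lengths = [3, 2]

ip = np.zeros(3)
for i, j in iter_from_X_lengths(Y, lengths):
    ip[Y[i]] += 1              # count the first state of each sequence
ip /= ip.sum()
print(dict(enumerate(ip)))     # {0: 0.5, 1: 0.0, 2: 0.5}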
Example #4
def eval_group_hmms(membership, models):
    trajectorydata = pd.read_csv("./testTrajectory_smaller.csv")
    t = Trajectory(trajectorydata)
    data, length, prob_list = t.getDataWithAllGroups(membership)
    test_set = []
    for i, j in iter_from_X_lengths(data, length):
        # Serial equivalent, kept for reference:
        # prob_sum = 0
        # for g in range(0, GROUP_NUM):
        #     prob_sum += np.exp(models[g].score(data[i:j])) * prob_list[index][g]
        # avg_prob += prob_sum / GROUP_NUM
        test_set.append(data[i:j])
    p = mp.Pool(processes=mp.cpu_count() - 1)
    get_score = partial(get_score_for_all_groups,
                        data=test_set,
                        prob_list=prob_list,
                        models=models)
    # One mixture probability per test sequence, computed in parallel.
    all_probs = p.map(get_score, range(len(length)))
    probs_sum = sum(all_probs)
    p.close()
    p.join()
    return np.log(probs_sum / len(length))
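``get_score_for_all_groups`` is defined elsewhere in that project. Judging from the ``partial`` binding and the commented-out serial loop above, a plausible shape for the worker might be the following; this is a hypothetical reconstruction, not the project's actual code:

import numpy as np

def get_score_for_all_groups(seq_idx, data, prob_list, models):
    # Mixture likelihood of one test sequence under all group HMMs,
    # weighted by that sequence's group-membership probabilities.
    return sum(np.exp(models[g].score(data[seq_idx])) * prob_list[seq_idx][g]
               for g in range(len(models)))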
Example #5
    def score(self, X, lengths=None):
        """Compute the log probability under the model.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Feature matrix of individual samples.

        lengths : array-like of integers, shape (n_sequences, ), optional
            Lengths of the individual sequences in ``X``. The sum of
            these should be ``n_samples``.

        Returns
        -------
        logprob : float
            Log likelihood of ``X``.

        See Also
        --------
        score_samples : Compute the log probability under the model and
            posteriors.
        decode : Find most likely state sequence corresponding to ``X``.
        """
        _utils.check_is_fitted(self, "startprob_")
        self._check()

        X = check_array(X)
        # XXX we can unroll forward pass for speed and memory efficiency.
        logprob = 0
        for i, j in iter_from_X_lengths(X, lengths):
            framelogprob = self._compute_log_likelihood(X[i:j])
            logprobij, _fwdlattice = self._do_forward_pass(framelogprob)
            logprob += logprobij
        return logprob
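This is hmmlearn's stock ``score``: ``lengths`` is how several sequences travel through one concatenated matrix, and the result is the sum of the per-sequence forward-pass log-likelihoods. Typical usage on a toy model:

import numpy as np
from hmmlearn.hmm import GaussianHMM

rng = np.random.RandomState(0)
X1, X2 = rng.randn(100, 2), rng.randn(50, 2)   # two sequences
X = np.concatenate([X1, X2])
lengths = [len(X1), len(X2)]

model = GaussianHMM(n_components=3, random_state=0).fit(X, lengths=lengths)
total_logprob = model.score(X, lengths=lengths)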
Example #6
    def _compute_mixture_posteriors(self, X, y, lengths):
        '''
        Computes the posterior log-probability of each mixture component given
        the observations X, y.
        Inputs:
            X - np.array of size (n_samples, n_features).
            y - np.array of ints of size n_sequences, whose entries are in
                the range [0, n_nodes-1].
            lengths - list containing the lengths of each individual sequence
                      in X, with size n_sequences.
        Outputs:
            logmixpost - np.array of size (n_sequences, mix_dim).
            transitions - np.array with the transition matrices of all
                          mixture components appended together (flattened).
        '''
        N = len(lengths)

        transitions = []
        #means = []
        logmixpost = np.zeros((N, self.mix_dim))
        for m in range(self.mix_dim):
            ll_m = np.zeros(N)
            for seq_idx, (i, j) in enumerate(iter_from_X_lengths(X, lengths)):
                ll_m[seq_idx] = self.mixModels[m].score(X[i:j, :])
            transitions = np.append(transitions, self.mixModels[m].transmat_)
            #means = np.append(means, self.mixModels[m].means_)

            logmixpost[:, m] = ll_m + np.log(self.mixCoef[y, m] + 1e-9)

        log_normalize(logmixpost, axis=1)

        return logmixpost, transitions  #, means
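The final ``log_normalize`` call (from ``hmmlearn.utils``) normalizes each row of ``logmixpost`` in place in log space, i.e. it subtracts the row-wise ``logsumexp`` so the exponentiated rows sum to one. A minimal sketch of that step:

import numpy as np
from scipy.special import logsumexp

logmixpost = np.log([[0.2, 0.6], [0.1, 0.1]])
logmixpost -= logsumexp(logmixpost, axis=1, keepdims=True)
print(np.exp(logmixpost).sum(axis=1))  # [1. 1.]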
Example #7
    def _compute_sufficient_statistics_in_mix_comp(self, X, y, lengths,
                                                   logmixpost, stats):
        '''
        Accumulates sufficient statistics for the parameters of each HMM in the
        mixture.
        Inputs:
            X - np.array of size (n_samples, n_features).
            y - np.array of ints of size n_sequences, whose entries are in
                the range [0, n_nodes-1].
            lengths - list containing the lengths of each individual sequence
                      in X, with size n_sequences.
            logmixpost - np.array of size (n_sequences, mix_dim).
            stats - dictionary containing sufficient statistics (changed
                    inplace).
        '''
        for m in range(self.mix_dim):
            for seq_idx, (i, j) in enumerate(iter_from_X_lengths(X, lengths)):
                if self.mixCoef[y[seq_idx], m] == 0.:
                    continue

                framelogprob = self.mixModels[m]._compute_log_likelihood(
                    X[i:j, :])
                _, fwdlattice = (
                    self.mixModels[m]._do_forward_pass(framelogprob))
                bwdlattice = self.mixModels[m]._do_backward_pass(framelogprob)
                posteriors = self.mixModels[m]._compute_posteriors(
                    fwdlattice, bwdlattice)
                fwdlattice += logmixpost[seq_idx, m]
                bwdlattice += logmixpost[seq_idx, m]
                posteriors *= np.exp(logmixpost[seq_idx, m])

                self.mixModels[m]._accumulate_sufficient_statistics(
                    stats['mix_idx' + str(m)], X[i:j, :], framelogprob,
                    posteriors, fwdlattice, bwdlattice)
Example #8
    def scores_per_seq(self, X, y, lengths=None):
        '''
        Computes the log-likelihood for each sequence in X coming from nodes y.
        Inputs:
            X - np.array of size (n_samples, n_features).
            y - np.array of ints of size n_sequences, whose entries are in
                the range [0, n_nodes-1].
            lengths - list containing the lengths of each individual sequence
                      in X, with size n_sequences.
        Outputs:
            log_likelihood - np.array of size n_sequences.
        '''
        if isinstance(X, list):
            lengths = [x.shape[0] for x in X]
            X = np.concatenate(X)
            y = np.array(y)

        N = y.shape[0]

        log_likelihood = np.zeros(N)
        for seq_idx, (i, j) in enumerate(iter_from_X_lengths(X, lengths)):
            ll_per_comp = np.zeros(self.mix_dim)
            for m in range(self.mix_dim):
                if self.mixCoef[y[seq_idx], m] == 0.:
                    continue

                ll_per_comp[m] = self.mixModels[m].score(X[i:j, :])

            nonzero_idx = (self.mixCoef[y[seq_idx], :] != 0.)

            log_likelihood[seq_idx] = logsumexp(
                np.log(self.mixCoef[y[seq_idx], nonzero_idx]) +
                ll_per_comp[nonzero_idx])

        return log_likelihood
Example #9
    def fit(self, X, lengths=None):
        """Estimate model parameters.

		An initialization step is performed before entering the
		EM algorithm. If you want to avoid this step for a subset of
		the parameters, pass proper ``init_params`` keyword argument
		to estimator's constructor.

		Parameters
		----------
		X : array-like, shape (n_samples, n_features)
			Feature matrix of individual samples.

		lengths : array-like of integers, shape (n_sequences, )
			Lengths of the individual sequences in ``X``. The sum of
			these should be ``n_samples``.

		Returns
		-------
		self : object
			Returns self.
		"""
        X = check_array(X)
        self._init(X, lengths=lengths)
        self._check()

        self.monitor_._reset()
        for iter in range(self.n_iter):
            stats = self._initialize_sufficient_statistics()
            curr_logprob = 0
            for i, j in iter_from_X_lengths(X, lengths):
                framelogprob = self._compute_log_likelihood(X[i:j])
                logprob, fwdlattice = self._do_forward_pass(framelogprob)
                curr_logprob += logprob
                bwdlattice = self._do_backward_pass(framelogprob)
                posteriors = self._compute_posteriors(fwdlattice, bwdlattice)
                try:
                    with np.errstate(invalid="raise"):
                        self._accumulate_sufficient_statistics(
                            stats, X[i:j], framelogprob, posteriors,
                            fwdlattice, bwdlattice)
                except FloatingPointError as e:
                    print(f"{type(e).__name__}: {e}")
                    print("Divergence detected, stopping training")
                    return self

            # XXX must be before convergence check, because otherwise
            #     there won't be any updates for the case ``n_iter=1``.
            self._do_mstep(stats)

            self.monitor_.report(curr_logprob)
            if self.monitor_.converged:
                break

        if (self.transmat_.sum(axis=1) == 0).any():
            _log.warning("Some rows of transmat_ have zero sum because no "
                         "transition from the state was ever observed.")

        return self
Example #10
def np2lst(Xnp, ynp, lengths):
    X = []
    y = np.array(ynp)

    for (i, j) in iter_from_X_lengths(Xnp, lengths):
        X.append(Xnp[i:j, :])

    return X, y
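A round-trip check of this helper, which splits the concatenated matrix back into the list-of-sequences form accepted by the list-based ``fit``/``score`` overloads above:

import numpy as np

Xnp = np.arange(10).reshape(5, 2)   # 5 samples, 2 features
ynp, lengths = [0, 1], [3, 2]

X, y = np2lst(Xnp, ynp, lengths)
print([x.shape for x in X])         # [(3, 2), (2, 2)]
assert np.array_equal(np.concatenate(X), Xnp)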
Example #11
    def fit(self, X, lengths=None):
        """Estimate model parameters.
        An initialization step is performed before entering the
        EM algorithm. If you want to avoid this step for a subset of
        the parameters, pass proper ``init_params`` keyword argument
        to estimator's constructor.
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Feature matrix of individual samples.
        lengths : array-like of integers, shape (n_sequences, )
            Lengths of the individual sequences in ``X``. The sum of
            these should be ``n_samples``.
        Returns
        -------
        self : object
            Returns self.
        """
        X = check_array(X)
        self._init(X, lengths=lengths)
        self._check()

        self.monitor_ = ConvergenceMonitor(self.tol, self.n_iter, self.verbose)
        for iter in range(self.n_iter):
            stats = self._initialize_sufficient_statistics()
            curr_logprob = 0
            for i, j in iter_from_X_lengths(X, lengths):
                framelogprob = self._compute_log_likelihood(X[i:j])
                logprob, fwdlattice = self._do_forward_pass(framelogprob)
                curr_logprob += logprob
                bwdlattice = self._do_backward_pass(framelogprob)
                posteriors = self._compute_posteriors(fwdlattice, bwdlattice)

                # fix posteriors
                if self.states_prior is not None and self.fp_state is not None:
                    for k in range(len(self.states_prior)):
                        if self.states_prior[k] == 0:
                            # non-footprint positions: forbid the footprint
                            # state and renormalize the row
                            posteriors[k][self.fp_state] = 0.0
                            posteriors[k] = posteriors[k] / sum(posteriors[k])

                        elif self.states_prior[k] == 1:
                            # footprint positions: put all probability mass
                            # on the footprint state
                            posteriors[k] = 0.0
                            posteriors[k][self.fp_state] = 1.0

                self._accumulate_sufficient_statistics(stats, X[i:j],
                                                       framelogprob,
                                                       posteriors, fwdlattice,
                                                       bwdlattice)

            self._do_mstep(stats)

            self.monitor_.report(curr_logprob)
            if self.monitor_.converged:
                break

        return self
Example #12
File: hmm.py  Project: CostaLab/reg-gen
    def fit(self, X, lengths=None):
        """Estimate model parameters.
        An initialization step is performed before entering the
        EM algorithm. If you want to avoid this step for a subset of
        the parameters, pass proper ``init_params`` keyword argument
        to estimator's constructor.
        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Feature matrix of individual samples.
        lengths : array-like of integers, shape (n_sequences, )
            Lengths of the individual sequences in ``X``. The sum of
            these should be ``n_samples``.
        Returns
        -------
        self : object
            Returns self.
        """
        X = check_array(X)
        self._init(X, lengths=lengths)
        self._check()

        self.monitor_ = ConvergenceMonitor(self.tol, self.n_iter, self.verbose)
        for iter in range(self.n_iter):
            stats = self._initialize_sufficient_statistics()
            curr_logprob = 0
            for i, j in iter_from_X_lengths(X, lengths):
                framelogprob = self._compute_log_likelihood(X[i:j])
                logprob, fwdlattice = self._do_forward_pass(framelogprob)
                curr_logprob += logprob
                bwdlattice = self._do_backward_pass(framelogprob)
                posteriors = self._compute_posteriors(fwdlattice, bwdlattice)

                # fix posteriors
                if self.states_prior is not None and self.fp_state is not None:
                    for k in range(len(self.states_prior)):
                        if self.states_prior[k] == 0:
                            # non-footprint positions: forbid the footprint
                            # state and renormalize the row
                            posteriors[k][self.fp_state] = 0.0
                            posteriors[k] = posteriors[k] / sum(posteriors[k])

                        elif self.states_prior[k] == 1:
                            # footprint positions: put all probability mass
                            # on the footprint state
                            posteriors[k] = 0.0
                            posteriors[k][self.fp_state] = 1.0

                self._accumulate_sufficient_statistics(stats, X[i:j], framelogprob, posteriors, fwdlattice, bwdlattice)

            self._do_mstep(stats)

            self.monitor_.report(curr_logprob)
            if self.monitor_.converged:
                break

        return self
Example #13
    def fit(self, X, lengths=None):
        """Estimate model parameters.

        An initialization step is performed before entering the
        EM algorithm. If you want to avoid this step for a subset of
        the parameters, pass proper ``init_params`` keyword argument
        to estimator's constructor.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Feature matrix of individual samples.

        lengths : array-like of integers, shape (n_sequences, )
            Lengths of the individual sequences in ``X``. The sum of
            these should be ``n_samples``.

        Returns
        -------
        self : object
            Returns self.
        """
        X = check_array(X)
        if int(self.iepoch) == 1:
            self._init(X, lengths=lengths)
            # self.means_ += 0.1 * np.random.randn(*self.means_.shape)
            # self.covars_ += 0.1 * np.abs(np.random.randn(*self.covars_.shape))
            self.monitor_._reset()

        self._check()
        for iter in range(self.n_iter):
            stats = self._initialize_sufficient_statistics()
            curr_logprob = 0
            for i, j in iter_from_X_lengths(X, lengths):
                framelogprob = self._compute_log_likelihood(X[i:j])
                logprob, fwdlattice = self._do_forward_pass(framelogprob)
                curr_logprob += logprob
                # if curr_logprob < 0:
                #     print("negative log likelihood")

                bwdlattice = self._do_backward_pass(framelogprob)
                posteriors = self._compute_posteriors(fwdlattice, bwdlattice)
                stats = self._accumulate_sufficient_statistics(
                    stats, X[i:j], framelogprob, posteriors, fwdlattice,
                    bwdlattice)

            # XXX must be before convergence check, because otherwise
            #     there won't be any updates for the case ``n_iter=1``.
            self._do_mstep(stats, X)

            self.monitor_.report(curr_logprob / stats["nobs"])
            # if self.monitor_.converged:
            #     break

        return self
Example #14
    def fit(self, X, lengths=None):
        """Estimate model parameters.

        An initialization step is performed before entering the
        EM algorithm. If you want to avoid this step for a subset of
        the parameters, pass proper ``init_params`` keyword argument
        to estimator's constructor.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Feature matrix of individual samples.

        lengths : array-like of integers, shape (n_sequences, )
            Lengths of the individual sequences in ``X``. The sum of
            these should be ``n_samples``.

        Returns
        -------
        self : object
            Returns self.
        """
        X = check_array(X)
        self._init(X, lengths=lengths)
        self._check()
        self.init_values_FS(X)
        self.select_hyperparams(X)

        self.monitor_ = ConvergenceMonitor(self.tol, self.n_iter, self.verbose)
        for iter in range(self.n_iter):
            stats = self._initialize_sufficient_statistics()
            curr_logprob = 0
            for i, j in iter_from_X_lengths(X, lengths):
                framelogprob = self._compute_log_likelihood(X[i:j])
                logprob, fwdlattice = self._do_forward_pass(framelogprob)
                curr_logprob += logprob
                bwdlattice = self._do_backward_pass(framelogprob)
                posteriors = self._compute_posteriors(
                    fwdlattice, bwdlattice)  # posteriors <- gamma
                self._accumulate_sufficient_statistics(stats, X[i:j],
                                                       framelogprob,
                                                       posteriors, fwdlattice,
                                                       bwdlattice)
                self.compute_FS_ESTEP(X, posteriors)

            # XXX must be before convergence check, because otherwise
            #     there won't be any updates for the case ``n_iter=1``.
            self._do_mstep(X, stats)

            self.monitor_.report(curr_logprob)
            if self.monitor_.converged:
                break

        return self
Example #15
    def decode(self, X, lengths=None, algorithm=None):
        """Find most likely state sequence corresponding to ``X``.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Feature matrix of individual samples.

        lengths : array-like of integers, shape (n_sequences, ), optional
            Lengths of the individual sequences in ``X``. The sum of
            these should be ``n_samples``.

        algorithm : string
            Decoder algorithm. Must be one of "viterbi" or "map".
            If not given, :attr:`decoder` is used.

        Returns
        -------
        logprob : float
            Log probability of the produced state sequence.

        state_sequence : array, shape (n_samples, )
            Labels for each sample from ``X`` obtained via a given
            decoder ``algorithm``.

        See Also
        --------
        score_samples : Compute the log probability under the model and
            posteriors.
        score : Compute the log probability under the model.
        """
        _utils.check_is_fitted(self, "startprob_")
        self._check()

        algorithm = algorithm or self.algorithm
        if algorithm not in DECODER_ALGORITHMS:
            raise ValueError("Unknown decoder {!r}".format(algorithm))

        decoder = {
            "viterbi": self._decode_viterbi,
            "map": self._decode_map
        }[algorithm]

        X = check_array(X)
        n_samples = X.shape[0]
        logprob = 0
        state_sequence = np.empty(n_samples, dtype=int)
        for i, j in iter_from_X_lengths(X, lengths):
            # XXX decoder works on a single sample at a time!
            logprobij, state_sequenceij = decoder(X[i:j])
            logprob += logprobij
            state_sequence[i:j] = state_sequenceij

        return logprob, state_sequence
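As the inline comment says, the decoders work on one sequence at a time, so ``decode`` stitches the per-sequence paths back into a single ``n_samples``-long vector. A typical call on a fitted hmmlearn model:

import numpy as np
from hmmlearn.hmm import GaussianHMM

rng = np.random.RandomState(0)
X = rng.randn(60, 2)
model = GaussianHMM(n_components=2, random_state=0).fit(X, lengths=[40, 20])

logprob, states = model.decode(X, lengths=[40, 20], algorithm="viterbi")
# states[:40] is the Viterbi path of the first sequence, states[40:] the second.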
Example #16
    def score(self, X, run_lengths, lengths=None):
        check_is_fitted(self, "startprob_")
        self._check()

        X = check_array(X)
        # XXX we can unroll forward pass for speed and memory efficiency.
        logprob = 0
        for i, j in iter_from_X_lengths(X, lengths):
            framelogprob = self._compute_log_likelihood(X[i:j])
            logprobij, _fwdlattice = self._do_forward_pass(
                framelogprob, run_lengths[i:j])
            logprob += logprobij
        return logprob
Example #17
def eval_group_hmms_old(membership, models):
    trajectorydata = pd.read_csv("./testTrajectory_final.csv")
    t = Trajectory(trajectorydata)
    data, length, prob_list = t.getDataWithAllGroups(membership)
    index = 0
    avg_prob = 0
    for i, j in iter_from_X_lengths(data, length):
        prob_sum = 0
        for g in range(0, GROUP_NUM):
            prob_sum += np.exp(models[g].score(
                data[i:j])) * prob_list[index][g]
        avg_prob += prob_sum / GROUP_NUM
        index += 1
    return np.log(avg_prob / len(length))
Example #18
    def scores_per_seq(self, X, y, lengths=None):
        if isinstance(X, list):
            lengths = [x.shape[0] for x in X]
            X = np.concatenate(X)
            y = np.array(y)
        elif lengths is None:
            lengths = [X.shape[0]]

        N = y.shape[0]
        log_likelihood = np.zeros(N)
        for seq_idx, (i, j) in enumerate(iter_from_X_lengths(X, lengths)):
            log_likelihood[seq_idx] = (self.hmm[y[seq_idx]].scores_per_seq(
                X[i:j, :], np.array([0])))

        return log_likelihood
Example #19
    def _init(self, X, lengths=None):
        if not self._check_input_symbols(X):
            raise ValueError("expected a sample from "
                             "a Multinomial distribution.")

        super(MultinomialHMM, self)._init(X, lengths=lengths)
        self.random_state = check_random_state(self.random_state)

        if 'e' in self.init_params:
            if not hasattr(self, "n_features"):
                symbols = set()
                for i, j in iter_from_X_lengths(X, lengths):
                    symbols |= set(X[i:j].flatten())
                self.n_features = len(symbols)
            self.emissionprob_ = self.random_state \
                .rand(self.n_components, self.n_features)
            normalize(self.emissionprob_, axis=1)
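When ``n_features`` is not already set, it is inferred as the number of distinct symbols observed across all sequences. That inference step in isolation:

import numpy as np
from hmmlearn.utils import iter_from_X_lengths

X = np.array([[0], [2], [2], [1]])  # integer symbols, one column
lengths = [2, 2]

symbols = set()
for i, j in iter_from_X_lengths(X, lengths):
    symbols |= set(X[i:j].flatten())
print(len(symbols))                 # 3 distinct symbols -> n_features = 3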
Example #20
    def _do_fit(self, data, lengths):
        self._init_params(data, lengths=lengths, params=self.init_params)

        X = data['obs']
        self.monitor_ = ConvergenceMonitor(self.tol, self.n_iter,
                                           self.n_iter_min, self.verbose)
        for iter in range(self.n_iter):
            stats = self._initialize_sufficient_statistics()
            gn = np.zeros((X.shape[0], self.n_unique))
            curr_logprob = 0
            for i, j in iter_from_X_lengths(X, lengths):
                flp = self._compute_log_likelihood(
                    data, from_=i, to_=j)  # n_samples, n_unique
                flp_rep = np.zeros((flp.shape[0], self.n_components))
                for u in range(self.n_unique):
                    for c in range(self.n_chain):
                        flp_rep[:, u * self.n_chain + c] = flp[:, u]

                # n_samples, n_components below
                logprob, fwdlattice = self._do_forward_pass(flp_rep)
                curr_logprob += logprob
                bwdlattice = self._do_backward_pass(flp_rep)
                posteriors = self._compute_posteriors(fwdlattice, bwdlattice)
                self._accumulate_sufficient_statistics(stats, X[i:j], flp_rep,
                                                       posteriors, fwdlattice,
                                                       bwdlattice)
                # sum responsibilities across chain if tied states exist
                if self.n_tied == 0:
                    gn[i:j, :] = posteriors
                elif self.n_tied > 0:
                    for u in range(self.n_unique):
                        cols = range(u * (self.n_chain),
                                     u * (self.n_chain) + (self.n_chain))
                        # the sum already has shape (j - i,); reshaping to
                        # X.shape[0] would break with multiple sequences
                        gn[i:j, u] = np.sum(posteriors[:, cols], axis=1)

            self.monitor_.report(curr_logprob)
            if self.monitor_.converged:
                break

            self._do_mstep(stats, self.params)
            self._do_mstep_grad(gn, data)

        return self
Example #21
File: tm.py  Project: sarah-strauss/autohmm
    def _do_fit(self, data, lengths):
        self._init_params(data, lengths=lengths, params=self.init_params)

        X = data['obs']
        self.monitor_ = ConvergenceMonitor(self.tol, self.n_iter,
                                           self.n_iter_min, self.verbose)
        for iter in range(self.n_iter):
            stats = self._initialize_sufficient_statistics()
            gn = np.zeros((X.shape[0], self.n_unique))
            curr_logprob = 0
            for i, j in iter_from_X_lengths(X, lengths):
                flp = self._compute_log_likelihood(data, from_=i, to_=j) # n_samples, n_unique
                flp_rep = np.zeros((flp.shape[0], self.n_components))
                for u in range(self.n_unique):
                    for c in range(self.n_chain):
                        flp_rep[:, u*self.n_chain+c] = flp[:, u]

                # n_samples, n_components below
                logprob, fwdlattice = self._do_forward_pass(flp_rep)
                curr_logprob += logprob
                bwdlattice = self._do_backward_pass(flp_rep)
                posteriors = self._compute_posteriors(fwdlattice, bwdlattice)
                self._accumulate_sufficient_statistics(
                    stats, X[i:j], flp_rep, posteriors, fwdlattice, bwdlattice)
                # sum responsibilities across chain if tied states exist
                if self.n_tied == 0:
                    gn[i:j, :] = posteriors
                elif self.n_tied > 0:
                    for u in range(self.n_unique):
                        cols = range(u*(self.n_chain),
                                     u*(self.n_chain)+(self.n_chain))
                        # the sum already has shape (j - i,); reshaping to
                        # X.shape[0] would break with multiple sequences
                        gn[i:j, u] = np.sum(posteriors[:, cols], axis=1)

            self.monitor_.report(curr_logprob)
            if self.monitor_.converged:
                break

            self._do_mstep(stats, self.params)
            self._do_mstep_grad(gn, data)

        return self
Example #22
    def score_samples(self, X, lengths=None):
        """Compute the log probability under the model and compute posteriors.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Feature matrix of individual samples.

        lengths : array-like of integers, shape (n_sequences, ), optional
            Lengths of the individual sequences in ``X``. The sum of
            these should be ``n_samples``.

        Returns
        -------
        logprob : float
            Log likelihood of ``X``.

        posteriors : array, shape (n_samples, n_components)
            State-membership probabilities for each sample in ``X``.

        See Also
        --------
        score : Compute the log probability under the model.
        decode : Find most likely state sequence corresponding to ``X``.
        """
        _utils.check_is_fitted(self, "startprob_")
        self._check()

        X = check_array(X)
        n_samples = X.shape[0]
        logprob = 0
        posteriors = np.zeros((n_samples, self.n_components))
        for i, j in iter_from_X_lengths(X, lengths):
            framelogprob = self._compute_log_likelihood(X[i:j])
            logprobij, fwdlattice = self._do_forward_pass(framelogprob)
            logprob += logprobij

            bwdlattice = self._do_backward_pass(framelogprob)
            posteriors[i:j] = self._compute_posteriors(fwdlattice, bwdlattice)
        return logprob, posteriors
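Unlike ``score``, this variant also runs the backward pass so it can return per-sample state posteriors; each row of ``posteriors`` sums to one. Typical usage:

import numpy as np
from hmmlearn.hmm import GaussianHMM

rng = np.random.RandomState(0)
X = rng.randn(50, 2)
model = GaussianHMM(n_components=2, random_state=0).fit(X)

logprob, posteriors = model.score_samples(X)
print(posteriors.shape)                          # (50, 2)
print(np.allclose(posteriors.sum(axis=1), 1.0))  # True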
Example #23
    def _do_fit(self, data, lengths):
        self._init_params(data, lengths=lengths, params=self.init_params)

        X = data['obs']
        self.monitor_ = ConvergenceMonitor(self.tol, self.n_iter,
                                           self.n_iter_min, self.verbose)
        for iter in range(self.n_iter):
            stats = self._initialize_sufficient_statistics()
            puc = np.zeros((X.shape[0], self.n_unique))  # posteriors unique
            # concatenated
            curr_logprob = 0
            for i, j in iter_from_X_lengths(X, lengths):
                framelogprob = self._compute_log_likelihood(data,
                                                            from_=i,
                                                            to_=j)
                logprob, fwdlattice = self._do_forward_pass(framelogprob)
                curr_logprob += logprob
                bwdlattice = self._do_backward_pass(framelogprob)
                posteriors = self._compute_posteriors(fwdlattice, bwdlattice)
                self._accumulate_sufficient_statistics(stats, X[i:j],
                                                       framelogprob,
                                                       posteriors, fwdlattice,
                                                       bwdlattice)
                if self.n_tied > 0:
                    for u in range(self.n_unique):
                        cols = range(u * (self.n_chain),
                                     u * (self.n_chain) + (self.n_chain))
                        puc[i:j, u] = np.sum(posteriors[:, cols], axis=1)
                else:
                    puc[i:j, :] = posteriors

            self.monitor_.report(curr_logprob)
            if self.monitor_.converged:
                break

            self._do_mstep(stats, self.params)
            self._do_mstep_grad(puc, data)  # not working with sufficient
            # statistics, to have a more
            # general framework
        return self
Example #24
File: tm.py  Project: simonkamronn/autohmm
    def _do_fit(self, data, lengths):
        self._init_params(data, lengths=lengths, params=self.init_params)

        X = data['obs']
        self.monitor_ = ConvergenceMonitor(self.tol, self.n_iter,
                                           self.n_iter_min, self.verbose)
        for iter in range(self.n_iter):
            stats = self._initialize_sufficient_statistics()
            puc = np.zeros((X.shape[0], self.n_unique))  # posteriors unique
                                                         # concatenated
            curr_logprob = 0
            for i, j in iter_from_X_lengths(X, lengths):
                framelogprob = self._compute_log_likelihood(data, from_=i,
                                                            to_=j)
                logprob, fwdlattice = self._do_forward_pass(framelogprob)
                curr_logprob += logprob
                bwdlattice = self._do_backward_pass(framelogprob)
                posteriors = self._compute_posteriors(fwdlattice, bwdlattice)
                self._accumulate_sufficient_statistics(
                    stats, X[i:j], framelogprob, posteriors, fwdlattice,
                    bwdlattice)
                if self.n_tied > 0:
                    for u in range(self.n_unique):
                        cols = range(u*(self.n_chain),
                                     u*(self.n_chain)+(self.n_chain))
                        puc[i:j, u] = np.sum(posteriors[:, cols], axis=1)
                else:
                    puc[i:j, :] = posteriors

            self.monitor_.report(curr_logprob)
            if self.monitor_.converged:
                break

            self._do_mstep(stats, self.params)
            self._do_mstep_grad(puc, data)  # not working with sufficient
                                            # statistics, to have a more
                                            # general framework
        return self
Example #25
    def _estimate_transition_prob(self, Y, lengths):
        """Returns an frequentist estimate of the transition probabilities over
        the observed hidden states.  Assumes that the hidden states are
        specified by sequential numbers 0, 1, ... .

        Parameters
        ----------
        Y : numpy array (n_samples,)
            Individual observations of hidden states.
        lengths : list of integer
            Lengths of sequences in X and Y. If None, X and Y are assumed to
            be a single sequence. The sum of lengths should be n_samples.
        """
        if not len(Y):
            return {}

        tp = np.zeros((self.n_states, self.n_states))
        for i, j in iter_from_X_lengths(Y, lengths):
            for k in range(i, j - 1):
                tp[Y[k], Y[k + 1]] += 1

        # Missing values, normalise
        # NOTE: here is made the assumption that states are integers
        # 0, 1, ..., self.n_states-1.
        for y in range(self.n_states):
            if sum(tp[y, :]) == 0:
                tp[y, :] = 1.0
            tp[y, :] /= sum(tp[y, :])

        # To dictionary
        tran_prob = {(i, j): tp[i, j]
                     for i in range(self.n_states)
                     for j in range(self.n_states)}

        return tran_prob
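Note that the inner loop stops at ``j - 1``, so transitions are counted only within a sequence and never across the boundary between two sequences. A toy check of the counting step:

import numpy as np

Y = np.array([0, 1, 1, 0, 0])   # sequences [0, 1, 1] and [0, 0]
lengths = [3, 2]

tp = np.zeros((2, 2))
for start, length in zip(np.cumsum([0] + lengths[:-1]), lengths):
    for k in range(start, start + length - 1):
        tp[Y[k], Y[k + 1]] += 1

print(tp)  # [[1. 1.] [0. 1.]] -- the cross-boundary (1 -> 0) pair is not counted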
Example #26
    def decode(self, X, run_lengths, lengths=None, algorithm=None):
        _utils.check_is_fitted(self, "startprob_")
        self._check()

        algorithm = algorithm or self.algorithm
        if algorithm not in DECODER_ALGORITHMS:
            raise ValueError("Unknown decoder {!r}".format(algorithm))

        decoder = {
            "viterbi": self._decode_viterbi,
            "map": self._decode_map
        }[algorithm]

        X = check_array(X)
        n_samples = X.shape[0]
        logprob = 0
        state_sequence = np.empty(n_samples, dtype=int)
        for i, j in iter_from_X_lengths(X, lengths):
            # XXX decoder works on a single sample at a time!
            logprobij, state_sequenceij = decoder(X[i:j], run_lengths[i:j])
            logprob += logprobij
            state_sequence[i:j] = state_sequenceij

        return logprob, state_sequence
Example #27
    def _init(self, X, lengths=None):
        """
		Initializes model parameters prior to fitting.
		
		X : array-like, shape (n_samples, n_features)
			Feature matrix of individual samples.
			n_features should be equal to 4 (lat, long, time, category).
			@TODO Implement for general cases

		lengths : array-like of integers, shape (n_sequences, )
			Lengths of the individual sequences in ``X``. The sum of
			these should be ``n_samples``.

		weights : array-like, the probability that user for 
			individual sequences in ``X`` belongs to the group
			this HMM is representing. The sum of
			these should be ``n_samples``.
		"""

        super(GroupLevelHMM, self)._init(X, lengths=lengths)

        # Check the number of features
        _, n_features = X.shape
        if hasattr(self, 'n_features') and self.n_features != n_features:
            raise ValueError('Unexpected number of dimensions, got %s but '
                             'expected %s' % (n_features, self.n_features))
        if n_features != 4:
            raise ValueError('Unexpected number of features, got %s but '
                             'expected 4' % (n_features))

        self.n_features = n_features

        if len(lengths) != len(self._weights):
            raise ValueError('Unexpected number of lengths and weights')

        # split X to 3 matrices
        self.X_loc, self.X_time, self.X_category = self._split_X_by_features(X)

        # if ``means`` is initialized
        if 'm' in self.init_params or not hasattr(self, "loc_means_"):
            # set means_ for location
            loc_kmeans = cluster.KMeans(n_clusters=self.n_components,
                                        random_state=self.random_state)
            loc_kmeans.fit(self.X_loc)  # fit for lat, long
            self.loc_means_ = loc_kmeans.cluster_centers_  # loc_means_ : Mean for each states

        if 'm' in self.init_params or not hasattr(self, "time_means_"):
            # set means_ for time
            time_kmeans = cluster.KMeans(n_clusters=self.n_components,
                                         random_state=self.random_state)
            time_kmeans.fit(self.X_time)
            self.time_means_ = time_kmeans.cluster_centers_  # time_means_ : Mean for each states

        if 'c' in self.init_params or not hasattr(self, "loc_covars_"):
            cv_loc = np.cov(self.X_loc.T) + self.loc_min_covar * np.eye(
                self.X_loc.shape[1])
            if not cv_loc.shape:
                cv_loc.shape = (1, 1)
            self._loc_covars_ = \
                _utils.distribute_covar_matrix_to_match_covariance_type(
                    cv_loc, self.loc_covariance_type, self.n_components).copy()

        if 'c' in self.init_params or not hasattr(self, "time_covars_"):
            cv_time = np.cov(self.X_time.T) + self.time_min_covar * np.eye(
                self.X_time.shape[1])
            if not cv_time.shape:
                cv_time.shape = (1, 1)
            self._time_covars_ = \
                _utils.distribute_covar_matrix_to_match_covariance_type(
                    cv_time, self.time_covariance_type, self.n_components).copy()

        self.random_state = check_random_state(self.random_state)

        if 'e' in self.init_params:
            if not hasattr(self, "n_categories"):
                symbols = set()
                for i, j in iter_from_X_lengths(self.X_category, lengths):
                    symbols |= set(self.X_category[i:j].flatten())
                self.n_categories = len(symbols)
            self.category_emissionprob_ = self.random_state \
                .rand(self.n_components, self.n_categories)
            normalize(self.category_emissionprob_, axis=1)

        # check weights
        if (len(self._weights) != len(lengths)):
            raise ValueError("``weights`` and ``lengths`` size mismatch")
Example #28
    def fit(self, X, y, lengths=None, valid_data=None):
        trainloss_hist = []

        if isinstance(X, list):
            lengths = [x.shape[0] for x in X]
            X = np.concatenate(X)
            y = np.array(y)
        elif lengths is None:
            lengths = [X.shape[0]]

        if valid_data is not None:
            X_valid, y_valid, lengths_valid = valid_data

            if isinstance(X_valid, list):
                lengths_valid = [x.shape[0] for x in X_valid]
                X_valid = np.concatenate(X_valid)
                y_valid = np.array(y_valid)

            validloss_hist = []

        self.init_params()

        for k in range(self.n_nodes):
            lengthsk = [lengths[i] for i in range(len(lengths)) if y[i] == k]
            if not lengthsk:
                continue
            Xk = np.concatenate([
                X[i:j, :]
                for seq_idx, (i, j) in enumerate(iter_from_X_lengths(X, lengths))
                if y[seq_idx] == k
            ], axis=0)
            yk = np.zeros(len(lengthsk), dtype=int)

            if valid_data is not None:
                Xk_valid = np.concatenate([
                    X_valid[i:j, :]
                    for seq_idx, (i, j) in enumerate(
                        iter_from_X_lengths(X_valid, lengths_valid))
                    if y_valid[seq_idx] == k
                ], axis=0)
                lengthsk_valid = [
                    lengths_valid[i] for i in range(len(lengths_valid))
                    if y_valid[i] == k
                ]
                yk_valid = np.zeros(len(lengthsk_valid), dtype=int)

                trainlossk, validlossk = (self.hmm[k].fit(
                    Xk,
                    yk,
                    lengths=lengthsk,
                    valid_data=(Xk_valid, yk_valid, lengthsk_valid)))
                validloss_hist.append(validlossk)
            else:
                trainlossk = self.hmm[k].fit(Xk, yk, lengths=lengthsk)

            trainloss_hist.append(trainlossk)

        if valid_data is not None:
            return trainloss_hist, validloss_hist
        else:
            return trainloss_hist
Example #29
    def fit(self, X, lengths=None):

        X = check_array(X)
        self._init(X, lengths=lengths)
        self._check()

        self.monitor_ = ConvergenceMonitor(self.tol, self.n_iter, self.verbose)
        for iter in range(self.n_iter):
            print('iteration: {}'.format(iter))
            stats = self._initialize_sufficient_statistics()
            curr_logprob = 0
            tt = 0
            path_list = list()

            for i, j in iter_from_X_lengths(X, lengths):
                logprob, state_sequence = self.decode(X[i:j],
                                                      algorithm="viterbi")

                curr_logprob += logprob

                epsilon = np.zeros((state_sequence.shape[0] - 1,
                                    self.n_components, self.n_components))
                gamma = np.zeros((state_sequence.shape[0], self.n_components))

                for t in range(state_sequence.shape[0] - 1):
                    epsilon[t, state_sequence[t], state_sequence[t + 1]] = 1

                for t in range(state_sequence.shape[0]):
                    # use a fresh loop variable here: ``i`` from the outer
                    # sequence loop is still needed for X[i:j] below
                    for s in range(self.n_components):
                        if t != (state_sequence.shape[0] - 1):
                            gamma[t, s] = np.sum(epsilon[t, s])
                        else:
                            gamma[t, s] = gamma[t - 1, s]

                path_list.append(state_sequence)
                self._accumulate_sufficient_statistics(stats, X[i:j], epsilon,
                                                       gamma, state_sequence,
                                                       None)
                tt += 1

            print('average loss: {}'.format(curr_logprob / tt))

            # ``fast_update`` and ``update_dnn`` are assumed to be flags
            # defined elsewhere (they are not set in this snippet)
            if not fast_update:
                stats['start'] /= tt
                stats['trans'] /= tt

                self._do_mstep(stats)
                if update_dnn:
                    temp_path = np.zeros((0, 1))
                    for k, (i, j) in enumerate(iter_from_X_lengths(X,
                                                                   lengths)):
                        temp_path = np.vstack(
                            [temp_path,
                             np.array(path_list[k]).reshape(-1, 1)])
                    self.mlp.train(X, temp_path, 20)

                acoustic_model = np.zeros(self.n_components)
                for i, j in iter_from_X_lengths(X, lengths):
                    logprob, state_sequence = self.decode(X[i:j],
                                                          algorithm="viterbi")
                    for state in state_sequence:
                        acoustic_model[state] += 1
                self.acoustic_model = acoustic_model / np.sum(acoustic_model)

            self.monitor_.report(curr_logprob)
            if self.monitor_.iter == self.monitor_.n_iter or \
                    (len(self.monitor_.history) == 2 and
                     abs(self.monitor_.history[1] - self.monitor_.history[0]) < self.monitor_.tol * abs(
                                self.monitor_.history[1])):
                break

        print('----------------------------------------------')
        return self