Example #1

import numpy as np

# The repo helpers used below (logistic_logh, gpr1step5, studentlogpdf,
# rmult, logsumexp, isKosher) are assumed to be defined/imported elsewhere.
def dbocpdGP(
    theta,
    X,
    model,
    num_hazard_params,
    dt,
):

    beta0 = 1
    num_scale_params = 1

    # Maximum number of points considered for predicting the next one, regardless
    # of the run length and cov function. Set to Inf if we don't care about speed.

    maxPossibleLen = 500

    theta_h = theta[:num_hazard_params]  # num_hazard x 1
    theta_m = theta[num_hazard_params:-1]  # num_model x 1
    alpha0 = np.exp(theta[-1])  # Use exp to ensure it is positive. 1 x 1
    num_model_params = len(theta_m)  # 1 x 1

    assert dt > 0

    (T, D) = X.shape  # Number of time points observed

    assert D == 1

    # Never need to consider more than T points in the past.

    maxPossibleLen = min(T, maxPossibleLen)

    # Evaluate the hazard function for this interval.
    # H(r) = P(runlength_t = 0|runlength_t-1 = r-1)
    # Pre-compute the hazard in preparation for steps 4 & 5, alg 1, of [RPA]
    # logH = log(H), logmH = log(1-H)

    (logH, logmH, dlogH, dlogmH) = logistic_logh(np.arange(1, T + 1), theta_h)
    assert isKosher(dlogH)
    assert isKosher(dlogmH)

    # R(r, t) = P(runlength_t-1 = r-1|X_1:t-1).
    # P(runlength_0 = 0|nothing) = 1 => logR(1, 1) = 0

    logR = np.zeros((T + 1, 1))

    # pre-allocate the run length distribution. [P]
    dlogR_h = np.zeros((T + 1, num_hazard_params))
    dlogR_m = np.zeros((T + 1, num_model_params))
    dlogR_s = np.zeros((T + 1, num_scale_params))

    SSE = np.zeros((T + 1, D))

    # This will change with higher D
    dSSE = np.zeros((T + 1, num_model_params))

    SSE[0, 0] = 2 * beta0  # 1 x 1

    # Pre-compute GP stuff:

    (alpha, sigma2, dalpha, dsigma2) = gpr1step5(theta_m, model,
                                                 maxPossibleLen, dt)
    maxLen = alpha.shape[0]

    # Extend sigma2 to account for the fact that we might need its value past maxLen.
    # t - maxLen x 1

    sigma2 = np.concatenate((sigma2, sigma2[-1, 0] * np.ones(
        (T - sigma2.shape[0], 1))))

    dsigma2 = np.concatenate((dsigma2, np.tile(dsigma2[-1, :],
                                               (T - maxLen, 1))))

    ddf = 2  # d(df)/d(alpha0), since df = 2*alpha0 + r; log-scale correction applied at the end

    for t in range(1, T + 1):
        MRC = min(maxLen, t)  # How many points back to look when predicting

        mu = np.dot(alpha[:MRC, :MRC - 1], X[t - MRC:t - 1,
                                             0][::-1])  # MRC x 1. [x]

        # Extend the mu (mean) prediction for the older (> MRC) run length
        # hypothesis

        if MRC < t:
            mu = np.append(mu, mu[-1] *
                           np.ones(t - mu.shape[0]))  # t - MRC x 1. [x]

        df = 2 * alpha0 + np.arange(t)  # Degrees of freedom for each run length. t x 1.
        pred_var = sigma2[:t, 0] * SSE[:t, 0] / df
        dpredvar_s = np.atleast_2d(ddf * -sigma2[:t, 0] * SSE[:t, 0] / df**2).T

        (logpredprobs, dlogpredprobs) = studentlogpdf(X[t - 1, 0], mu,
                                                      pred_var, df, 2)

        # Now do the derivatives. [t x 1, t x 1]

        dmu = np.zeros((t, num_model_params))
        dpredvar = np.zeros((t, num_model_params))

        for ii in range(num_model_params):

            # MRC x 1. [x/theta_m]

            dmu[:MRC, ii] = np.dot(dalpha[:MRC, :MRC - 1, ii], X[t - MRC:t - 1,
                                                                 0][::-1])
            if MRC < t:

                # Extend the mu (mean) prediction for the older (>MRC) run length
                # hypothesis

                dmu = np.concatenate((dmu, [dmu[MRC - 1]] * np.ones(
                    (t - dmu.shape[0], 1))))

            # Use the product rule. t x 1. [x^2/theta_m]

            dpredvar[:, ii] = (dsigma2[:t, ii] * SSE[:t, 0] +
                               sigma2[:t, 0] * dSSE[:t, ii]) / df

            # Use the quotient rule. t x 1. [1/theta_m]

            dSSE[1:t + 1, ii] = dSSE[:t, ii] + 2 * (mu - X[t - 1, 0]) \
                / sigma2[:t, 0] * dmu[:, ii] + -(mu - X[t - 1, 0]) ** 2 \
                / sigma2[:t, 0] ** 2 * dsigma2[:t, ii]
            dSSE[0, ii] = 0

        dlogpredprobs_m = rmult(dmu, dlogpredprobs[:, 0]) \
            + rmult(dpredvar[:t, :], dlogpredprobs[:, 1])

        # mu has zero dependence on alpha (scale). t x 1. [log(P/x)]

        dlogpredprobs_s = np.atleast_2d(dpredvar_s[:t, 0] *
                                        dlogpredprobs[:, 1] +
                                        ddf * dlogpredprobs[:, 2]).T

        # Update with the Mahalanobis error of predicting the next point. t x 1. []

        SSE[1:t + 1, 0] = SSE[:t, 0] + (mu - X[t - 1, 0]) ** 2 \
            / sigma2[:t, 0]
        SSE[0, 0] = 2 * beta0  # 1 x 1. []

        # Update the run length distributions and their derivatives.

        logMsg = logR[:t, 0] + logpredprobs + logH[:t, 0]  # t x 1
        dlogMsg_h = dlogR_h[:t, :] + dlogH[:t, :]  # t x num_hazard

        logR[1:t + 1, 0] = logR[:t, 0] + \
            logpredprobs + logmH[:t, 0]  # t x 1. [P]

        dlogR_h[1:t + 1, :] = dlogR_h[:t, :] + dlogmH[:t, :]  # t x num_hazard
        dlogR_m[1:t + 1, :] = dlogR_m[:t, :] + dlogpredprobs_m  # t x num_model

        dlogR_s[1:t + 1, :] = dlogR_s[:t, :] + dlogpredprobs_s  # t x num_scale

        (logR[0, 0], normMsg, Z) = logsumexp(logMsg)  # 1 x 1. [P]

        # 1 x num_hazard

        dlogR_h[0, :] = rmult(dlogMsg_h, normMsg).sum(axis=0) / Z

        # 1 x num_mod

        dlogR_m[0, :] = rmult(dlogR_m[1:t + 1, :], normMsg).sum(axis=0) \
            / Z

        # 1 x num_sca

        dlogR_s[0, :] = rmult(dlogR_s[1:t + 1, :], normMsg).sum(axis=0) \
            / Z

    # end t loop

    # Get the negative log marginal likelihood of the data, X(1:end), under the
    # model, -log P(X_1:T), integrating out all the runlengths. 1 x 1. [log P]

    nlml = -1.0 * logsumexp(logR)[0]

    # Do the derivatives of nlml

    normR = np.exp(logR - np.max(logR))  # T+1 x 1
    dnlml_h = -rmult(dlogR_h, normR).sum(axis=0) / np.sum(normR)  # 1 x num_hazard
    dnlml_m = -rmult(dlogR_m, normR).sum(axis=0) / np.sum(normR)  # 1 x num_model
    dnlml_s = -rmult(dlogR_s, normR).sum(axis=0) / np.sum(normR)  # 1 x num_scale

    # Correct for the fact that the input is log(alpha0). 1 x num_scale.

    dnlml_s = alpha0 * dnlml_s

    # (num_hazard + num_model + num_scale) x 1
    dnlml = np.append(np.append(dnlml_h, dnlml_m), dnlml_s)

    assert isKosher(nlml)
    assert isKosher(dnlml)
    return (nlml, dnlml)
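
The examples rely on several small utilities that are not shown. The sketches
below are plausible reconstructions inferred only from how they are called
here (isKosher as a finiteness check, rmult as a row-wise rescaling, logsumexp
returning the log normalizer together with the shifted weights and their sum);
the repo's actual implementations may differ.

import numpy as np


def isKosher(x):
    # Sanity check: True iff x contains no NaNs or infinities.
    return np.all(np.isfinite(x))


def rmult(M, v):
    # Scale row i of the matrix M by v[i] (v may be a flat or column vector).
    return M * np.reshape(v, (-1, 1))


def logsumexp(x):
    # Numerically stable log(sum(exp(x))). Also returns the shifted weights
    # exp(x - max(x)) and their sum, which dbocpdGP uses to form the
    # probability-weighted averages of the derivative terms.
    x = np.ravel(x)
    m = np.max(x)
    w = np.exp(x - m)
    Z = np.sum(w)
    return (m + np.log(Z), w, Z)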
Example #2

import numpy as np

# Repo helpers used below (isKosher, gpr1step5, logistic_h2, studentpdf,
# MoTrnd) are assumed to be defined/imported elsewhere.
def bocpdGPT_trunc(
    X,
    model,
    theta_m,
    theta_h,
    scalePrior,
    dt,
):

    # Maximum number of points considered for predicting the next one, regardless
    # of the run length and cov function. Set to Inf if we don't care about speed.

    maxPossibleLen = 500

    num_hazard_params = len(theta_h)
    num_model_params = len(theta_m)

    assert isKosher(X)
    assert dt > 0

    (T, D) = X.shape

    # Number of time points observed. 1 x 1. [s]
    # TODO extend to higher D

    assert D == 1

    # Never need to consider more than T points in the past. 1 x 1. [points]

    maxPossibleLen = min(T, maxPossibleLen)

    # Ensure the gamma prior parameters are positive (as required). 2 x 1. []

    scalePrior = np.exp(scalePrior)
    alpha0 = scalePrior[0]
    beta0 = scalePrior[1]

    # Precompute all the gpr aspects of algorithm. [maxLen x maxLen, maxLen x
    # 1]

    (alpha, sigma2, dalpha, dsigma2) = gpr1step5(theta_m, model,
                                                 maxPossibleLen, dt)

    maxLen = alpha.shape[0]
    assert maxLen >= 1

    # Evaluate the hazard function:

    # H(r) = P(runlength_t = 0 | runlength_t-1 = r-1)
    # Pre-compute the hazard in preparation for steps 4 & 5, alg 1, of [RPA]

    (H, dH) = logistic_h2(np.arange(1, maxLen + 1), theta_h)

    R = np.zeros((maxLen + 1, T + 1))

    # The standardized square error for each runlength.
    SSE = np.zeros((maxLen, D))

    # The evidence at each time step => Z(t) = P(X_t | X_1:t-1).
    Z = np.zeros((T, 1))
    predMeans = np.zeros((T, 1))
    predMed = np.zeros((T, 1))

    # At time t = 1, we have complete knowledge about the run length. This assumes
    # there was surely a change point right before the first data point, not at the
    # first data point. Implements step 1, alg 1, of [RPA].
    # => P(runlength_0 = 0 | nothing) = 1

    R[0, 0] = 1

    # Initialize first SSE to contribution from gamma prior.

    SSE[0] = 2 * beta0

    # How many degrees of freedom in the prediction for each run length.

    df = 2 * alpha0 + np.arange(maxLen)

    for t in range(1, T + 1):
        # Implicitly implements step 2, alg 1, of [RPA]: observe new datum,
        # simply by incrementing the loop index.

        # Evaluate the predictive distribution for the new datum under each of
        # the parameters. Implements step 3, alg 1, of [RPA]. predprobs(r)
        # = p(X(t) | X(1:t-1), runlength_t-1 = r-1). t x 1. [P]

        predprobs = np.zeros(maxLen)
        if t < maxLen:
            mu = np.dot(alpha[:t, :t], X[:t, 0][::-1])

            # The predictive variance for each prediction
            pred_var = sigma2[:t, 0] * SSE[:t, 0] / df[:t]

            # get the posterior predictive probability for each run length
            predprobs[:t] = studentpdf(X[t - 1, 0], mu, pred_var, df[:t], 1)

            # Update the SSE for each run length
            SSE[1:t + 1, 0] = SSE[:t, 0] + \
                (mu - X[t - 1, 0]) ** 2 / sigma2[:t, 0]
            SSE[0, 0] = 2 * beta0  # 1 x 1. []
        else:
            mu = np.dot(alpha, X[t - maxLen + 1:t, 0][::-1])

            # The predictive variance for each prediction
            pred_var = sigma2[:, 0] * SSE[:, 0] / df

            # get the posterior predictive probability for each run length
            predprobs = studentpdf(X[t - 1, 0], mu, pred_var, df, 1)

            # Update the SSE for each run length
            # Note: the prediction error is against the current datum X[t - 1, 0].
            SSE[1:maxLen, 0] = SSE[:maxLen - 1, 0] + \
                (mu[:maxLen - 1] - X[t - 1, 0]) ** 2 / \
                sigma2[:maxLen - 1, 0]
            SSE[0, 0] = 2 * beta0

        # endif
        predMeans[t - 1] = np.dot(R[:mu.shape[0], t - 1].T, mu)

        predMed[t - 1] = np.median(
            MoTrnd(R[:mu.shape[0], t - 1], mu, pred_var[:mu.shape[0]], df[:mu.shape[0]], 1000))

        # Evaluate the growth probabilities - shift the probabilities up and to the
        # right, scaled by the hazard function and the predictive
        # probabilities.
        R[1:, t] = R[: maxLen, t - 1] * predprobs * (1 - H[: maxLen])

        # Evaluate the probability that there *was* a changepoint and we're
        # accumulating the mass back down at r = 0.

        R[0, t] = (R[: maxLen, t - 1] * predprobs * H[: maxLen]).sum()

        # Renormalize the run length probabilities for improved numerical stability.
        # Note that unlike in [RPA], which keeps track of P(r_t, X_1:t), we keep
        # track of P(r_t | X_1:t) => unnormalized R(i, t + 1) = P(runlength_t = i - 1 | X_1:t)
        # * P(X_t | X_1:t-1) => normalization const Z(t) = P(X_t | X_1:t-1). Sort of
        # implements step 6, alg 1, of [RPA].

        Z[t - 1] = R[:, t].sum()

        R[:, t] /= Z[t - 1]

        # Truncation: collapse any probability mass beyond maxLen back onto the
        # last tracked run length.
        R[maxLen - 1, t] = R[maxLen - 1, t] + R[maxLen, t]
        R[maxLen, t] = 0

    # endTloop

    # Get the negative log marginal likelihood of the data, X(1:end), under
    # the model, -log P(X_1:T), integrating out all the runlengths. 1 x 1. [log P]

    nlml = -np.sum(np.log(Z))

    return (R, nlml, Z, predMeans, predMed)
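
bocpdGPT_trunc scores each new datum under a Student-t predictive with
location mu, squared scale pred_var, and df degrees of freedom. A minimal
numpy sketch of that density is below, assuming studentpdf(x, mu, var, nu, 1)
returns the pdf values elementwise; the repo's version presumably also
supports derivative outputs via its final argument (cf. studentlogpdf above).

import numpy as np
from scipy.special import gammaln


def studentpdf(x, mu, var, nu, nargout=1):
    # Non-standardized Student-t density: location mu, squared scale var,
    # nu degrees of freedom, evaluated elementwise at the scalar datum x.
    # (nargout kept only for signature parity; derivatives not implemented
    # in this sketch.)
    c = np.exp(gammaln(0.5 * nu + 0.5) - gammaln(0.5 * nu)) \
        / np.sqrt(nu * np.pi * var)
    return c * (1.0 + (x - mu) ** 2 / (nu * var)) ** (-0.5 * (nu + 1.0))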
Example #3

import numpy as np

# Repo helpers used below (isKosher, logistic_h2, gpr1step5, studentpdf)
# are assumed to be defined/imported elsewhere.
def bocpdGPT(
    X,
    model,
    theta_m,
    theta_h,
    scalePrior,
    dt,
):

    # Maximum number of points considered for predicting the next one, regardless
    # of the run length and cov function. Set to Inf if we don't care about speed.

    maxPossibleLen = 500

    num_hazard_params = len(theta_h)
    num_model_params = len(theta_m)

    assert isKosher(X)
    assert dt > 0

    (T, D) = X.shape

    # Number of time points observed. 1 x 1. [s]
    # TODO extend to higher D

    assert D == 1

    # Never need to consider more than T points in the past. 1 x 1. [points]

    maxPossibleLen = min(T, maxPossibleLen)

    # Ensure the gamma prior parameters are positive (as required). 2 x 1. []

    scalePrior = np.exp(scalePrior)
    alpha0 = scalePrior[0]
    beta0 = scalePrior[1]

    # Evaluate the hazard function:

    # H(r) = P(runlength_t = 0 | runlength_t-1 = r-1)
    # Pre-compute the hazard in preparation for steps 4 & 5, alg 1, of [RPA]

    (H, dH) = logistic_h2(np.arange(1, T + 1), theta_h)

    R = np.zeros((T + 1, T + 1))
    S = np.zeros((T, T))

    # The standardized square error (SSE) for each runlength.
    SSE = np.zeros((T + 1, D))

    # The evidence at each time step => Z(t) = P(X_t | X_1:t-1).
    Z = np.zeros((T, 1))
    predMeans = np.zeros((T, 1))
    predMed = np.zeros((T, 1))

    # At time t = 1, we have complete knowledge about the run length. This assumes
    # there was surely a change point right before the first data point, not at the
    # first data point. Implements step 1, alg 1, of [RPA].
    # => P(runlength_0 = 0 | nothing) = 1
    R[0, 0] = 1

    # Initialize first SSE to contribution from gamma prior.
    SSE[0] = 2 * beta0

    # Precompute all the gpr aspects of algorithm.
    (alpha, sigma2, dalpha, dsigma2) = gpr1step5(theta_m, model,
                                                 maxPossibleLen, dt)

    maxLen = alpha.shape[0]

    sigma2 = np.concatenate((sigma2, sigma2[-1, 0] * np.ones(
        (T - sigma2.shape[0], 1))))

    for t in range(1, T + 1):
        # Implicitly implements step 2, alg 1, of [RPA]: observe new datum,
        # simply by incrementing the loop index.

        # Evaluate the predictive distribution for the new datum under each of
        # the parameters. Implements step 3, alg 1, of [RPA]. predprobs(r)
        # = p(X(t) | X(1:t-1), runlength_t-1 = r-1). t x 1. [P]
        MRC = min(maxLen, t)  # How many points back to look when predicting

        mu = np.dot(alpha[:MRC, :MRC - 1], X[t - MRC:t - 1,
                                             0][::-1])  # MRC x 1. [x]

        # Extend the mu (mean) prediction for the older (> MRC) run length
        # hypothesis
        if MRC < t:
            mu = np.append(mu, mu[-1] *
                           np.ones(t - mu.shape[0]))  # t - MRC x 1. [x]

        df = 2 * alpha0 + np.arange(t)
        pred_var = sigma2[:t, 0] * SSE[:t, 0] / df

        predprobs = studentpdf(X[t - 1, 0], mu, pred_var, df, 1)

        # Update the SSE for each run length
        SSE[1:t + 1, 0] = SSE[:t, 0] + (mu - X[t - 1, 0])**2 / sigma2[:t, 0]
        SSE[0, 0] = 2 * beta0  # 1 x 1. []

        predMeans[t - 1] = np.dot(R[:mu.shape[0], t - 1].T, mu)

        # The following is pretty slow, so the median is skipped here:
        # np.median(MoTrnd(R[:mu.shape[0], t - 1], mu, pred_var[:mu.shape[0]], df[:mu.shape[0]], 1000))
        predMed[t - 1] = 0

        # Evaluate the growth probabilities - shift the probabilities up and to the
        # right, scaled by the hazard function and the predictive
        # probabilities.
        R[1:t + 1, t] = R[:t, t - 1] * predprobs * (1 - H[:t])

        # Evaluate the probability that there *was* a changepoint and we're
        # accumulating the mass back down at r = 0.

        R[0, t] = (R[:t, t - 1] * predprobs * H[:t]).sum()

        # Renormalize the run length probabilities for improved numerical stability.
        # Note that unlike in [RPA], which keeps track of P(r_t, X_1:t), we keep
        # track of P(r_t | X_1:t) => unnormalized R(i, t + 1) = P(runlength_t = i - 1 | X_1:t)
        # * P(X_t | X_1:t-1) => normalization const Z(t) = P(X_t | X_1:t-1). Sort of
        # implements step 6, alg 1, of [RPA].

        Z[t - 1] = R[:t + 1, t].sum()

        R[:t + 1, t] /= Z[t - 1]

        # Get the S matrix
        S[:t, t - 1] = R[:t, t - 1] * predprobs
        S[:, t - 1] = S[:, t - 1] / S[:, t - 1].sum()

    # endTloop

    # Get the negative log marginal likelihood of the data, X(1:end), under
    # the model, -log P(X_1:T), integrating out all the runlengths. 1 x 1. [log P]

    nlml = -np.sum(np.log(Z))

    return (R, S, nlml, Z, predMeans, predMed)
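
A minimal usage sketch for bocpdGPT on synthetic data with a mean shift.
Everything below the data generation is a placeholder: model stands in for
the repo-specific GP covariance specification consumed by gpr1step5, and the
hazard/model hyperparameters would normally come from minimizing the nlml
returned by dbocpdGP rather than being set by hand.

import numpy as np

rng = np.random.RandomState(0)
# T x 1 series with a change in mean halfway through.
X = np.concatenate((rng.randn(100), rng.randn(100) + 3.0)).reshape(-1, 1)

theta_h = np.zeros(3)                      # hypothetical hazard parameters
theta_m = np.log(np.ones(2))               # hypothetical GP hyperparameters
scalePrior = np.log(np.array([1.0, 1.0]))  # log(alpha0), log(beta0)

model = None  # placeholder: GP covariance spec expected by gpr1step5 (repo-specific)
(R, S, nlml, Z, predMeans, predMed) = bocpdGPT(X, model, theta_m, theta_h,
                                               scalePrior, 1.0)

# Column t of R is the posterior run length distribution after t observations;
# its argmax traces the most probable run length over time.
mapRunLength = R.argmax(axis=0)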