Example #1
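# These excerpts assume NumPy and, for the progress bar in Example #2,
# tqdm's `trange`. Helpers such as `get_random_state`, `speckled_mask`,
# and `rand_orth` are not shown; plausible sketches of them appear
# later in this section.
import numpy as np
from tqdm import trange
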
def bicv_scores(model,
                X,
                fit_params=None,
                strategy="speckled",
                heldout_frac=0.1,
                n_repeats=10,
                seed=None):
    """
    Estimate train and test error for a model by bi-cross-validation.
    """

    # Initialize dictionary for fit keyword args.
    if fit_params is None:
        fit_params = dict()
    m, n = X.shape

    # Initialize random number generator.
    rs = get_random_state(seed)

    # Allocate space to store train/test scores.
    train_scores = np.empty(n_repeats)
    test_scores = np.empty(n_repeats)

    # Run cross-validation.
    for itr in range(n_repeats):

        # Create a shuffled copy of the data.
        ii = rs.permutation(m)
        jj = rs.permutation(n)
        Xs = np.copy(X[ii][:, jj])

        # Partition columns and rows.
        si = int(m - m * heldout_frac)
        sj = int(n - n * heldout_frac)

        # Fit model to training set.
        model.fit(Xs[:si, :sj], mask=None, **fit_params)

        # Extend model factors.
        model.bicv_extend(Xs[:si, sj:], Xs[si:, :sj])

        # Construct mask for training set.
        train_mask = np.zeros((m, n), dtype=bool)
        train_mask[:si, :sj] = True

        # Construct mask for test set.
        test_mask = np.zeros((m, n), dtype=bool)
        test_mask[si:, sj:] = True

        # Compute performance on train and test partitions.
        train_scores[itr] = model.score(Xs, mask=train_mask)
        test_scores[itr] = model.score(Xs, mask=test_mask)

    return train_scores, test_scores
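
# A minimal usage sketch. `MyFactorModel` is a hypothetical placeholder
# for any model exposing the `fit`, `bicv_extend`, and `score` methods
# (with the signatures used above) that `bicv_scores` relies on.
X = np.random.randn(100, 80)
model = MyFactorModel(n_components=5)
train_scores, test_scores = bicv_scores(
    model, X, heldout_frac=0.1, n_repeats=20, seed=1234)
print(train_scores.mean(), test_scores.mean())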
Example #2
def speckled_cv_scores(model,
                       X,
                       fit_params=None,
                       heldout_frac=0.1,
                       n_repeats=10,
                       resampler=None,
                       return_params=False,
                       seed=None,
                       progress_bar=False):
    """
    Estimate train and test error for a model by cross-validation.
    """

    # Initialize dictionary for fit keyword args.
    if fit_params is None:
        fit_params = dict()

    # Initialize random number generator.
    rs = get_random_state(seed)

    # Allocate space to store train/test scores.
    train_scores = np.empty(n_repeats)
    test_scores = np.empty(n_repeats)

    params = []

    # Run cross-validation.
    pbar = trange(n_repeats) if progress_bar else range(n_repeats)
    for itr in pbar:

        # If desired, resample X (e.g. apply random shuffle).
        if resampler is not None:
            Xsamp = resampler(X)
        else:
            Xsamp = X

        # Generate a new holdout pattern.
        mask = speckled_mask(Xsamp.shape, heldout_frac, rs)

        # Fit model.
        model.fit(Xsamp, mask=mask, **fit_params)

        # Save parameters.
        if return_params:
            params.append(tuple(p.copy() for p in model.factors))

        # Compute performance on train and test partitions.
        train_scores[itr] = model.score(Xsamp, mask=mask)
        test_scores[itr] = model.score(Xsamp, mask=~mask)

    # Return data.
    return ((train_scores, test_scores, params) if return_params else
            (train_scores, test_scores))
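
# `speckled_mask` is referenced above but not shown. Judging from how
# the mask is used (True entries scored as training data, `~mask` as
# test data), a minimal sketch might look like this:
def speckled_mask(shape, heldout_frac, rs):
    # Hold out roughly `heldout_frac` of entries uniformly at random.
    # True marks training entries; False marks held-out entries. (A
    # real implementation might also guard against fully held-out
    # rows or columns.)
    return rs.rand(*shape) > heldout_frac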
Example #3
def mixed_poiss_cd(X, Y, rank, mask, tol, maxiter, seed):
    """
    Parameters
    ----------
    X : ndarray
        Matrix holding inputs data. Has shape
        (n_inputs, n_obs).
    Y : ndarray
        Matrix holding data. Has shape
        (n_features, n_obs).
    mask : ndarray
        Binary array specifying observed data points
        (where mask == 1) and unobserved data points
        (where mask == 0). Has shape
        (n_features, n_obs).
    """

    assert X.shape[1] == Y.shape[1]
    n_in, n_obs = X.shape
    n_features, n_obs = Y.shape

    # Initialize parameters.
    rs = get_random_state(seed)
    U = rs.uniform(-1, 1, size=(n_features, n_in + rank))
    Vt = rs.uniform(-1, 1, size=(n_in + rank, n_obs))
    Vt[:n_in] = X

    if mask is None:
        update_rule = _poiss_cd_update
        mask_T = None
    else:
        update_rule = _poiss_cd_update_with_mask
        mask_T = mask.T

    # Track the loss per iteration. `inner_iters` sets the number of
    # coordinate descent passes per factor update (assumed default;
    # the original does not define it).
    loss_hist = []
    inner_iters = 1

    for itr in range(maxiter):

        # Update U.
        update_rule(Y, U, Vt, mask, inner_iters)

        # Update rows of V without over-writing inputs (X).
        ut = U[:, n_in:].T
        v = Vt[n_in:]
        ls = update_rule(Y.T, v, ut, mask_T, inner_iters)

        # Check convergence.
        loss_hist.append(ls)
        if itr > 0 and ((loss_hist[-2] - loss_hist[-1]) < tol):
            break

    return U, Vt, np.array(loss_hist)
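
# Hypothetical usage sketch. The first `n_in` rows of Vt are clamped
# to the inputs X, so U[:, :n_in] acts as an input-weight matrix and
# the remaining `rank` columns capture latent structure.
X = np.random.randn(4, 500)             # inputs, (n_inputs, n_obs)
Y = np.random.poisson(1.0, (30, 500))   # counts, (n_features, n_obs)
U, Vt, loss_hist = mixed_poiss_cd(
    X, Y, rank=3, mask=None, tol=1e-5, maxiter=100, seed=0)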
Example #4
def _init_kmeans(X, rank, mask, init, seed):
    """
    Dispatches the desired initialization method.

    Parameters
    ----------
    X : ndarray
        Data matrix. Has shape (m, n)
    rank : int
        Number of cluster centroids.
    mask : ndarray
        Mask for missing data. Has shape (m, n).
    init : str
        Specifies initialization method.
    seed : int or numpy.random.RandomState
        Seeds random number generator.

    Returns
    -------
    centroids : ndarray
        Initial cluster centroids. Has shape (rank, n).
    """

    # Seed random number generator.
    rs = get_random_state(seed)

    # Random initialization.
    if init == "rand":
        idx = rs.choice(X.shape[0], size=rank, replace=False)
        centroids = X[idx]

    # Soft k-means initialization.
    elif init == "soft":
        _, centroids = soft_kmeans_em(X, rank, mask, "rand", 100, 1e-5, seed)

    else:
        raise NotImplementedError("Did not recognize init method.")

    return centroids
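
# Hypothetical usage: with init="rand" the initial centroids are
# `rank` randomly chosen rows of X, giving shape (rank, n).
X = np.random.randn(100, 20)
centroids = _init_kmeans(X, rank=5, mask=None, init="rand", seed=0)
assert centroids.shape == (5, 20)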
Example #5
def __init__(self, n_components, seed=None):
    self.nc = n_components
    self._rs = get_random_state(seed)
Example #6
def __init__(self, seed=None):
    self._rs = get_random_state(seed)
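
# `get_random_state` is used throughout these examples but never shown.
# A minimal sketch consistent with the documented seed types (None,
# int, or np.random.RandomState), analogous to sklearn's
# check_random_state:
def get_random_state(seed):
    if seed is None:
        return np.random.RandomState()
    if isinstance(seed, (int, np.integer)):
        return np.random.RandomState(seed)
    if isinstance(seed, np.random.RandomState):
        return seed
    raise ValueError("Expected None, an int, or a RandomState instance.")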
Example #7
def poisson_lorenz(n_out,
                   n_steps,
                   x0=None,
                   dt=0.01,
                   latent_noise_scale=10.0,
                   max_rate=10.0,
                   min_rate=0.01,
                   seed=None):
    """
    Simulate high-dimensional count data series following
    low-dimensional Lorenz attractor dynamics.

    Parameters
    ----------
    n_out : int
        Dimensionality of observations.
    n_steps : int
        Number of observed timesteps.
    x0 : ndarray, optional
        Initial latent state. Has shape (3,). Defaults to
        a vector of ones.
    dt : float
        Euler integration step of the continuous time
        ODE.
    latent_noise_scale : float
        Scale of Wiener process noise on latent states.
        Note that the square root of dt also scales
        this noise source (Euler–Maruyama integration).
    max_rate : float
        Maximum rate parameter in the simulated
        dataset.
    min_rate : float
        Minimum rate parameter in the simulated
        dataset.
    seed : None, int, or np.random.RandomState
        Seed for random number generator.

    Returns
    -------
    data : ndarray
        Data array holding simulated count data. Has shape
        (n_steps, n_out).
    rates : ndarray
        True time-varying rate parameters, associated with
        'data'. Has shape (n_steps, n_out).
    W : ndarray
        Weight matrix. Has shape (3, n_out).
    X : ndarray
        Simulated latent states. Has shape (n_steps, 3).
    """

    # Initialize random number generator.
    rs = get_random_state(seed)

    # Parameters of Lorenz equations (chaotic regime).
    sigma = 10.0
    beta = 8 / 3
    rho = 28.0

    # Allocate space for simulation.
    x = x0 if x0 is not None else np.ones(3)
    dxdt = np.empty(3)
    x_hist = np.empty((n_steps, 3))

    # Draw random readout matrix.
    W = rand_orth(3, n_out, seed=rs)

    # Simulate latent states.
    for t in range(n_steps):

        # Lorenz equations
        dxdt[0] = sigma * (x[1] - x[0])
        dxdt[1] = x[0] * (rho - x[2]) - x[1]
        dxdt[2] = x[0] * x[1] - beta * x[2]

        # Euler–Maruyama integration
        eta = latent_noise_scale * rs.randn(3)
        x = x + (dt * dxdt) + (np.sqrt(dt) * eta)

        # Store latent variable traces
        x_hist[t] = x

    # Center the x's so they exert comparable effects
    # in the observed data.
    x_hist = x_hist - np.mean(x_hist, axis=0)

    # Rescale rates to desired range.
    log_rates = np.dot(x_hist, W)
    log_rates = \
        (log_rates - np.min(log_rates)) / np.ptp(log_rates)
    log_rates = \
        log_rates * np.log(max_rate / min_rate) + np.log(min_rate)
    rates = np.exp(log_rates)

    # Draw Poisson distributed observations.
    data = rs.poisson(rates)

    # Return quantities of interest.
    return data, rates, W, x_hist
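
# Quick usage sketch checking the documented output shapes.
data, rates, W, latents = poisson_lorenz(n_out=50, n_steps=1000, seed=0)
assert data.shape == (1000, 50)
assert rates.shape == (1000, 50)
assert W.shape == (3, 50)
assert latents.shape == (1000, 3)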
Example #8
def _init_tsvd(X, rank, mask, init, seed):
    """
    Dispatches the desired initialization method.

    Parameters
    ----------
    X : ndarray
        Data matrix. Has shape (m, n)
    rank : int
        Number of components.
    mask : ndarray
        Mask for missing data. Has shape (m, n).
    init : str
        Specifies initialization method.
    seed : int or numpy.random.RandomState
        Seeds random number generator.

    Returns
    -------
    U : ndarray
        First factor matrix. Has shape (m, rank).
    Vt : ndarray
        Second factor matrix. Has shape (rank, n).
    xtx : float
        Squared Frobenius norm of X. This is later
        used to scale the model loss.
    """

    # Data dimensions.
    m, n = X.shape

    # Mask data.
    if mask is not None:
        Xm = mask * X
    else:
        Xm = X

    # Compute norm of masked data.
    xtx = np.dot(Xm.ravel(), Xm.ravel())

    # Seed random number generator.
    rs = get_random_state(seed)

    # Random initialization.
    if init == "rand_orth":

        # Randomized initialization; pass the shared RandomState so
        # the draws are reproducible.
        U = rand_orth(m, rank, seed=rs)
        Vt = rand_orth(rank, n, seed=rs)

        # Determine appropriate scaling (guard against mask=None).
        e = np.dot(U, Vt)
        if mask is not None:
            e = e * mask
        alpha = np.sqrt(xtx / np.dot(e.ravel(), e.ravel()))

        # Scale randomized initialization.
        U *= alpha
        Vt *= alpha

    else:
        raise NotImplementedError("Did not recognize init method.")

    return U, Vt, xtx
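
# `rand_orth` (used here and in Example #7) is not shown. A plausible
# sketch: draw a Gaussian matrix and orthonormalize it by QR, giving
# orthonormal columns when m >= n and orthonormal rows otherwise.
def rand_orth(m, n, seed=None):
    rs = get_random_state(seed)
    q, _ = np.linalg.qr(rs.randn(max(m, n), min(m, n)))
    return q if m >= n else q.T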
Example #9
def poisson_mf_cd(X, rank, mask, Vbasis, tol, maxiter, seed):
    """
    Parameters
    ----------
    X : ndarray
        Matrix holding data. Has shape (m, n).
    rank : int
        Number of components.
    mask : ndarray
        Mask for missing data. Has shape (m, n).
    Vbasis : ndarray or None
        Optional basis for the second factor. If given, Vt
        holds coefficients and the second factor is
        (Vt @ Vbasis). Has shape (n_basis, n).
    tol : float
        Convergence tolerance.
    maxiter : int
        Maximum number of iterations.
    seed : int or np.random.RandomState
        Seed for random number generator for initialization.

    Returns
    -------
    U : ndarray
        First factor matrix. Has shape (m, rank).
    Vt : ndarray
        Second factor matrix. Has shape (rank, n).
    loss_hist : ndarray
        Vector holding loss values. Has shape
        (n_iterations,).
    """

    X = np.asarray(X, dtype='float')

    # Initialize parameters.
    loss_hist = []
    m, n = X.shape
    rs = get_random_state(seed)

    # Account for masked entries.
    if mask is not None:
        X = np.copy(X)
        X[~mask] = np.mean(X[mask])
        Xpred = np.empty((m, n))
        mask_T = mask.T
    else:
        mask_T = None

    # Initialize parameters.
    U = rs.uniform(-1, 1, size=(m, rank))
    if Vbasis is None:
        Vt = rs.uniform(-1, 1, size=(rank, n))
    else:
        Vt = rs.uniform(-1, 1, size=(rank, Vbasis.shape[0]))

    # Convergence check on parameters.
    Ulast = np.empty_like(U)
    Vlast = np.empty_like(Vt)

    # Main optimization loop.
    for itr in range(maxiter):

        # Update U.
        if Vbasis is None:
            _poiss_cd_update(X, U, Vt, mask)
        else:
            _poiss_cd_update(X, U, Vt @ Vbasis, mask)

        # Update V.
        if Vbasis is None:
            ls = _poiss_cd_update(X.T, Vt.T, U.T, mask_T)
        else:
            ls = _poiss_cd_update_with_basis(X.T, Vt.T, U.T, mask_T, Vbasis)

        # Update masked elements.
        if mask is not None:
            np.dot(U, Vt, out=Xpred)
            X[~mask] = Xpred[~mask]

        # Store loss.
        loss_hist.append(ls / X.size)

        # Check convergence (skip on the first iteration, before
        # Ulast and Vlast hold valid copies).
        if itr > 0:
            U_upd = np.linalg.norm(Ulast - U) / np.linalg.norm(U)
            V_upd = np.linalg.norm(Vlast - Vt) / np.linalg.norm(Vt)
            if (U_upd < tol) and (V_upd < tol):
                break

        # Make copies of previous parameters.
        np.copyto(Ulast, U)
        np.copyto(Vlast, Vt)

    return U, Vt, np.array(loss_hist)
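
# Hypothetical usage sketch: rank-3 Poisson matrix factorization with
# a speckled missing-data mask (True marks observed entries).
X = np.random.poisson(2.0, size=(100, 80)).astype(float)
mask = np.random.rand(100, 80) > 0.1
U, Vt, loss_hist = poisson_mf_cd(
    X, rank=3, mask=mask, Vbasis=None, tol=1e-5, maxiter=200, seed=0)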