Exemplo n.º 1
0
def rgibbs_beta(concentrations, gamma_bar, tausq, n_peptides,
                prior_mean=np.array([0., 1.]), prior_prec=np.array([0., 0.]),
                prior_trunc_b1=(-np.inf, np.inf)):
    '''
    Gibbs update for slope and intercept of concentration/intensity model.

    Relatively standard conjugate Normal regression update, univariate case.
    Slight subtlety with the prior on beta_1; having support near 0 is
    problematic for the measurement error draw.

    Parameters
    ----------
        - concentrations : array_like
            Regressor values; placed in the second column of the design
            matrix.
        - gamma_bar : array_like
            Responses, aligned with concentrations.
        - tausq : array_like or float
            Variances; each observation is weighted by n_peptides / tausq.
        - n_peptides : array_like or float
            Counts used as the numerator of the observation weights.
        - prior_mean : array_like, length 2
            Prior means for (beta_0, beta_1), entered as the responses of two
            pseudo-observations. Not modified by this function.
        - prior_prec : array_like, length 2
            Prior precisions for (beta_0, beta_1), entered as the weights of
            the two pseudo-observations. Not modified by this function.
        - prior_trunc_b1 : sequence of length 2
            (lower, upper) bounds on beta_1. Draws outside the closed interval
            are rejected and redrawn.

    Returns
    -------
        - beta : ndarray of shape (2,)
            Drawn (beta_0, beta_1).

    Notes
    -----
    beta_1 is drawn by simple rejection from its Normal marginal, so the loop
    can run for a long time if the truncation interval carries little
    probability mass.
    '''
    # Setup quantities for WLS estimation. Using pseudo-observations for prior:
    # row -2 is [1, 0] (intercept prior) and row -1 is [0, 1] (slope prior).
    w = np.r_[n_peptides / tausq, prior_prec]
    X = np.ones((np.size(concentrations) + 2, 2))
    X[:-2, 1] = concentrations
    X[-2, 1] = 0
    X[-1, 0] = 0
    y = np.r_[gamma_bar, prior_mean]

    # Run WLS
    estimate = glm.wls(X=X, y=y, w=w)
    b_hat = estimate['b']

    # Get posterior covariance by solving against the identity.
    # NOTE(review): assumes estimate['L'] is the lower Cholesky factor of the
    # weighted normal-equations matrix -- confirm against glm.wls.
    Sigma = linalg.cho_solve((estimate['L'], True), np.eye(2))

    # Draw beta_1 from truncated distribution (rejection sampling). Scalar
    # draws are used so nothing of shape (1,) is assigned into a scalar slot.
    sd_b1 = np.sqrt(Sigma[1, 1])
    lower = prior_trunc_b1[0]
    upper = prior_trunc_b1[1]
    beta = np.empty(2)
    beta[1] = np.random.randn() * sd_b1 + b_hat[1]
    while beta[1] < lower or beta[1] > upper:
        beta[1] = np.random.randn() * sd_b1 + b_hat[1]

    # Draw beta_0 from conditional posterior given beta_1
    cond_sd_b0 = np.sqrt(Sigma[0, 0] - Sigma[0, 1]**2 / Sigma[1, 1])
    beta[0] = np.random.randn() * cond_sd_b0 + \
            b_hat[0] + Sigma[0, 1] / Sigma[1, 1] * \
            (beta[1] - b_hat[1])

    return beta
Exemplo n.º 2
0
def mcmc_serial(intensities_obs, mapping_states_obs, mapping_peptides, cfg,
                known_concentrations=None, mapping_known_concentrations=None,
                peptide_features=None, **kwargs):
    '''
    Serial MCMC sampler for posterior of state-level censoring model.

    Parameters
    ----------
        - intensities_obs : array_like
            A 1d array of length n_obs_states for which each entry contains the
            observed (summed) log state intensity.
            This must be aligned to mapping_states_obs and all entries must be
            > -inf; no missing peptides.
        - mapping_states_obs : array_like, 1 dimension, nonnegative ints
            A 1d integer array of length n_obs_states for which each entry
            contains the index of the peptide that corresponds to the given
            observed state. Peptide indices can range over 0 <= i < n_peptides.
            Not every peptide index is required to appear in this mapping; only
            observed peptides should be included. Also note that peptides are
            indexed overall, not within protein.
        - mapping_peptides : array_like, 1 dimension, nonnegative ints
            A 1d integer array of length n_peptides for which each entry
            contains the index of the protein that corresponds to the given
            peptide. Protein indices can range over 0 <= i < n_proteins.
            Every peptide and protein to be included in the model should be
            included here. That is, both observed and unobserved peptides should
            appear in this mapping.
        - cfg : dictionary
            A dictionary (typically generated from a YAML file) containing
            priors and settings for the MCMC algorithm. Its exact form will be
            documented elsewhere. It will have at least three sections: priors,
            containing one entry per parameter, settings, containing settings
            for the MCMC algorithm, and init, containing initial values for
            certain parameters.
            Optional entries with defaults applied here:
            priors.supervised (False), priors.concentration_dist (False),
            priors.glm_link ("Logit"), priors.eta.prior_scale and
            priors.eta.prior_center (no prior on eta),
            settings.prop_df_y_mis (5.0), settings.prop_df_eta (10.).
        - known_concentrations : array_like, optional
            Known concentrations for a subset of proteins. Required when
            cfg['priors']['supervised'] is true; unused otherwise.
        - mapping_known_concentrations : array_like of ints, optional
            Protein index for each entry of known_concentrations. Required
            when cfg['priors']['supervised'] is true; unused otherwise.
        - peptide_features : ndarray, 2 dimensions, optional
            Feature matrix with one column per feature. Rows are indexed by
            peptide index (presumably shape (n_peptides, n_features) --
            confirm with callers). If None, no features enter the censoring
            model.
        - **kwargs
            Accepted for call compatibility and ignored.

    Returns
    -------
        - draws : dictionary
            1- and 2-dimensional ndarrays containing the posterior samples for
            each parameter. Row 0 of each array holds the initial values.
        - accept_stats : dictionary
            Dictionary containing number of acceptances for each MH step.

    Raises
    ------
        - ValueError
            If either mapping vector fails validation, or if the supervised
            algorithm is requested without known_concentrations and
            mapping_known_concentrations.

    '''
    # Determine whether algorithm is running with supervision
    try:
        supervised = cfg['priors']['supervised']
    except (KeyError, TypeError):
        sys.stderr.write('Defaulting to unsupervised algorithm\n')
        supervised = False

    # If supervised, determine whether to model distribution of concentrations
    # If this is False, prior on $\beta_1$ is scaled by $|\beta_1|^{n_{mis}}$.
    concentration_dist = False
    if supervised:
        try:
            concentration_dist = cfg['priors']['concentration_dist']
        except (KeyError, TypeError):
            sys.stderr.write('Defaulting to flat prior on concentrations\n')
            concentration_dist = False

        # Fail early with a clear message instead of an AttributeError deep
        # inside the initialization.
        if (known_concentrations is None or
                mapping_known_concentrations is None):
            raise ValueError('Supervised algorithm requires'
                             ' known_concentrations and'
                             ' mapping_known_concentrations')
        known_concentrations = np.asanyarray(known_concentrations)

    # Determine whether peptide features are present and, if so, their size
    if peptide_features is None:
        n_peptide_features = 0
    else:
        n_peptide_features = peptide_features.shape[1]

    # Convert inputs to np.ndarrays as needed
    if type(intensities_obs) is not np.ndarray:
        intensities_obs = np.asanyarray(intensities_obs)
    if type(mapping_states_obs) is not np.ndarray:
        mapping_states_obs = np.asanyarray(mapping_states_obs, dtype=int)
    if type(mapping_peptides) is not np.ndarray:
        mapping_peptides = np.asanyarray(mapping_peptides, dtype=int)

    # Extract proposal DFs
    try:
        prop_df_y_mis = cfg['settings']['prop_df_y_mis']
    except (KeyError, TypeError):
        prop_df_y_mis = 5.0

    try:
        prop_df_eta = cfg['settings']['prop_df_eta']
    except (KeyError, TypeError):
        prop_df_eta = 10.

    # Extract dimensions from input

    # Number of iterations from cfg
    n_iterations = cfg['settings']['n_iterations']

    # Number of peptides and proteins from mapping_peptides
    n_peptides = np.size(mapping_peptides)
    n_proteins = 1 + np.max(mapping_peptides)

    # Check for validity of mapping vectors
    if (not issubclass(mapping_states_obs.dtype.type, np.integer) or
            np.min(mapping_states_obs) < 0 or
            np.max(mapping_states_obs) > n_peptides - 1):
        raise ValueError('State to peptide mapping (mapping_states_obs)'
                         ' is not valid')

    # Protein indices are bounded by n_peptides - 1 here, i.e. there cannot be
    # more proteins than peptides.
    if (not issubclass(mapping_peptides.dtype.type, np.integer) or
            np.min(mapping_peptides) < 0 or
            np.max(mapping_peptides) > n_peptides - 1):
        raise ValueError('Peptide to protein mapping (mapping_peptides)'
                         ' is not valid')

    # Compute tabulations that are invariant across iterations

    # Total number of observed states
    n_obs_states = np.size(intensities_obs)

    # Tabulate peptides per protein
    n_peptides_per_protein = np.bincount(mapping_peptides)
    peptides_obs = np.unique(mapping_states_obs)
    n_obs_peptides_per_protein = np.bincount(mapping_peptides[peptides_obs],
                                             minlength=n_proteins)

    # Tabulate number of observed states per peptide
    n_obs_states_per_peptide = np.bincount(mapping_states_obs,
                                           minlength=n_peptides)

    # Sum observed intensities per peptide
    total_intensity_obs_per_peptide = np.bincount(mapping_states_obs,
                                                  weights=intensities_obs,
                                                  minlength=n_peptides)

    # Allocate data structures for draws

    # Data structures for supervised algorithm
    if supervised:
        beta_draws = np.empty((n_iterations, 2))
        concentration_draws = np.empty((n_iterations, n_proteins))
        mean_concentration_draws = np.zeros((n_iterations))
        prec_concentration_draws = np.zeros((n_iterations))

    # Peptide- and protein-level means
    gamma_draws = np.empty((n_iterations, n_peptides))
    mu_draws = np.empty((n_iterations, n_proteins))

    # Number of censored states per peptide (row 0 stays at zero)
    n_cen_states_per_peptide_draws = np.zeros((n_iterations, n_peptides),
                                              dtype=int)

    # State- and peptide-level variances
    sigmasq_draws = np.empty((n_iterations, n_proteins))
    tausq_draws = np.empty((n_iterations, n_proteins))

    # Hyperparameters for state-level variance model
    shape_sigmasq = np.empty(n_iterations)
    rate_sigmasq = np.empty(n_iterations)

    # Hyperparameters for peptide-level variance model
    shape_tausq = np.empty(n_iterations)
    rate_tausq = np.empty(n_iterations)

    # Censoring probability model parameters. Column layout of eta:
    # [intercept, slope, feature main effects..., feature interactions...]
    eta_draws = np.zeros((n_iterations, 2 + n_peptide_features * 2))
    p_rnd_cen = np.empty(n_iterations)

    # Number of states model parameters
    r = np.empty(n_iterations)
    lmbda = np.empty(n_iterations)

    # Compute initial values for MCMC iterations

    # p_rnd_cen from cfg
    p_rnd_cen[0] = cfg['init']['p_rnd_cen']

    # eta from cfg; bivariate normal draw. Scalar draws are used so nothing of
    # shape (1,) is assigned into a scalar slot.
    eta0 = cfg['init']['eta']
    eta_draws[0, 0] = eta0['mean'][0] + eta0['sd'][0] * np.random.randn()
    eta_draws[0, 1] = eta0['mean'][1]
    if eta0['sd'][1] > 0:
        eta_draws[0, 1] += (eta0['cor'] * eta0['sd'][1] / eta0['sd'][0] *
                            (eta_draws[0, 0] - eta0['mean'][0]))
        eta_draws[0, 1] += (np.sqrt(1. - eta0['cor'] ** 2) * eta0['sd'][1] *
                            np.random.randn())

    # Number of states parameters from MAP estimator based on number of observed
    # states per observed peptide; very crude, but not altogether terrible.
    # Counts are shifted down by 1 before being handed to the estimator.
    nbinom_kwargs = {
        'x': n_obs_states_per_peptide[n_obs_states_per_peptide > 0] - 1,
        'transform': True}
    nbinom_kwargs.update(cfg['priors']['n_states_dist'])
    r[0], lmbda[0] = lib.map_estimator_nbinom(**nbinom_kwargs)
    lmbda[0] = 1. - lmbda[0]

    # Hyperparameters for state- and peptide-level variance distributions
    # directly from cfg
    shape_sigmasq[0], rate_sigmasq[0] = (
        cfg['init']['sigmasq_dist']['shape'],
        cfg['init']['sigmasq_dist']['rate'])
    shape_tausq[0], rate_tausq[0] = (cfg['init']['tausq_dist']['shape'],
                                     cfg['init']['tausq_dist']['rate'])

    # State- and peptide-level variances via inverse-gamma draws
    sigmasq_draws[0] = 1. / np.random.gamma(shape=shape_sigmasq[0],
                                            scale=1. / rate_sigmasq[0],
                                            size=n_proteins)
    tausq_draws[0] = 1. / np.random.gamma(shape=shape_tausq[0],
                                          scale=1. / rate_tausq[0],
                                          size=n_proteins)

    # Mapping from protein to peptide conditional variances for convenience
    var_peptide_conditional = sigmasq_draws[0, mapping_peptides]

    # Protein-level means using mean observed intensity; excluding missing
    # peptides. Proteins without any observed peptide divide by zero here and
    # are then set to the smallest finite-or-inf protein mean (nanmin).
    mu_draws[0] = (np.bincount(mapping_peptides,
                               total_intensity_obs_per_peptide /
                               np.maximum(1, n_obs_states_per_peptide)) /
                   n_obs_peptides_per_protein)
    mu_draws[0, n_obs_peptides_per_protein < 1] = np.nanmin(mu_draws[0])

    if supervised:
        # Simple initialization for supervised algorithm
        # Initialize beta from regression of mu against known concentrations
        X = np.ones((known_concentrations.size, 2))
        X[:, 1] = known_concentrations
        beta_draws[0] = glm.wls(X=X,
                                y=mu_draws[0, mapping_known_concentrations],
                                w=1.)['b']

        # Adjust known concentrations in mu accordingly
        mu_draws[0, mapping_known_concentrations] = beta_draws[0, 0] + \
                beta_draws[0, 1] * known_concentrations

        # And, initialize the concentration draws using the updated mu's
        concentration_draws[0] = (mu_draws[0] - beta_draws[0, 0]) / \
                beta_draws[0, 1]

        if concentration_dist:
            # Initialize hyperparameters on concentration distribution
            mean_concentration_draws[0] = np.mean(concentration_draws[0])
            prec_concentration_draws[0] = 1. / np.var(concentration_draws[0])

    # Peptide-level means using mean observed intensity; imputing missing
    # peptides as protein observed means
    gamma_draws[0] = mu_draws[0, mapping_peptides]
    gamma_draws[0, peptides_obs] = (
        total_intensity_obs_per_peptide[peptides_obs] /
        n_obs_states_per_peptide[peptides_obs])

    # Instantiate GLM family for eta step. A missing or non-string link entry
    # falls back to the logit link.
    try:
        glm_link_name = cfg["priors"]["glm_link"].title()
    except (KeyError, TypeError, AttributeError):
        sys.stderr.write("GLM link not specified; defaulting to logit\n")
        glm_link_name = "Logit"
    glm_link = getattr(glm.links, glm_link_name)
    glm_family = glm.families.Binomial(link=glm_link)

    # Setup function for prior log density on eta, if requested. Both the
    # scale and the center must be present; otherwise no prior is used.
    try:
        prior_scale = cfg["priors"]["eta"]["prior_scale"]
        prior_center = cfg["priors"]["eta"]["prior_center"]
    except (KeyError, TypeError):
        prior_scale = None
        prior_center = None

    if prior_scale is not None:
        # Gelman's weakly-informative prior (2008); applied to eta[1] only.
        def dprior_eta(eta, prior_scale=5., prior_center=0.):
            return -np.log(1. + ((eta[1] - prior_center) / prior_scale)**2)

        prior_eta_kwargs = {'prior_scale': prior_scale,
                            'prior_center': prior_center}
    else:
        dprior_eta = None
        prior_eta_kwargs = {}

    # Initialize dictionary for acceptance statistics
    accept_stats = {'sigmasq_dist': 0,
                    'tausq_dist': 0,
                    'n_states_dist': 0,
                    'eta': 0}

    # Master loop for MCMC iterations
    for t in range(1, n_iterations):
        # (1) Draw missing data (n_cen and censored state intensities) given all
        #   other parameters. Exact draw via rejection samplers.

        # (1a) Obtain p_int_cen per peptide and approximations of censored
        #   intensity posteriors.
        eta_0_effective = eta_draws[t - 1, 0]
        eta_1_effective = eta_draws[t - 1, 1]
        if n_peptide_features > 0:
            # These rebind the local names to new per-peptide arrays; the
            # stored draws are not modified.
            eta_0_effective += np.dot(
                peptide_features, eta_draws[t - 1, 2:(2 + n_peptide_features)]
            )
            eta_1_effective += np.dot(
                peptide_features, eta_draws[t - 1, (2 + n_peptide_features):]
            )

        cen_kwargs = {'eta_0': eta_0_effective,
                      'eta_1': eta_1_effective,
                      'mu': gamma_draws[t - 1],
                      'sigmasq': var_peptide_conditional,
                      'glm_link_name': glm_link_name}
        cen_dist = lib.characterize_censored_intensity_dist(**cen_kwargs)

        # (1b) Draw number of censored states per peptide
        n_cen_states_per_peptide = lib.rncen(n_obs=n_obs_states_per_peptide,
                                             p_rnd_cen=p_rnd_cen[t - 1],
                                             p_int_cen=cen_dist['p_int_cen'],
                                             lmbda=lmbda[t - 1], r=r[t - 1])
        n_cen_states_per_peptide_draws[t] = n_cen_states_per_peptide
        # Update state-level counts
        n_states_per_peptide = (n_obs_states_per_peptide +
                                n_cen_states_per_peptide)
        n_states_per_protein = np.bincount(mapping_peptides,
                                           weights=n_states_per_peptide)
        n_states = np.sum(n_states_per_peptide)

        # (1c) Draw censored intensities; reuses the arguments from (1a) plus
        # everything returned by the characterization step.
        cen_kwargs['n_cen'] = n_cen_states_per_peptide
        cen_kwargs['p_rnd_cen'] = p_rnd_cen[t - 1]
        cen_kwargs['propDf'] = prop_df_y_mis
        cen_kwargs.update(cen_dist)
        intensities_cen, mapping_states_cen, W = lib.rintensities_cen(
            **cen_kwargs)

        # (2) Update random censoring probability. Gibbs step.
        p_rnd_cen[t] = updates.rgibbs_p_rnd_cen(n_rnd_cen=np.sum(W),
                                                n_states=n_states,
                                                **cfg['priors']['p_rnd_cen'])

        # Sum censored intensities per peptide
        total_intensity_cen_per_peptide = np.bincount(mapping_states_cen,
                                                      weights=intensities_cen,
                                                      minlength=n_peptides)

        # Compute mean intensities per peptide
        mean_intensity_per_peptide = ((total_intensity_obs_per_peptide +
                                       total_intensity_cen_per_peptide) /
                                      n_states_per_peptide)

        # (3) Update peptide-level mean parameters (gamma). Gibbs step.
        gamma_draws[t] = updates.rgibbs_gamma(
            mu=mu_draws[t - 1, mapping_peptides],
            tausq=tausq_draws[t - 1, mapping_peptides],
            sigmasq=var_peptide_conditional,
            y_bar=mean_intensity_per_peptide,
            n_states=n_states_per_peptide)
        mean_gamma_by_protein = np.bincount(mapping_peptides,
                                            weights=gamma_draws[t])
        mean_gamma_by_protein /= n_peptides_per_protein

        # (4) Update protein-level concentrations
        if supervised:
            if concentration_dist:
                # (4a) Update coefficients given concentrations. Gibbs step.
                # Only yields sane answers if modeling distribution of
                # concentrations.
                beta_draws[t] = updates.rgibbs_beta(
                    concentrations=concentration_draws[t - 1],
                    gamma_bar=mean_gamma_by_protein, tausq=tausq_draws[t - 1],
                    n_peptides=n_peptides_per_protein,
                    **cfg['priors']['beta_concentration'])
            else:
                # (4a) Update coefficients given concentrations. Gibbs step.
                # Rao-Blackwellized version, implicitly scaling prior on
                # $\beta_1$ by $|\beta_1|^{n_{mis}}$
                beta_draws[t] = updates.rgibbs_beta(
                    concentrations=known_concentrations,
                    gamma_bar=mean_gamma_by_protein[
                        mapping_known_concentrations],
                    tausq=tausq_draws[t - 1, mapping_known_concentrations],
                    n_peptides=n_peptides_per_protein[
                        mapping_known_concentrations],
                    **cfg['priors']['beta_concentration'])

            # (4b) Update concentrations given coefficients. Gibbs step.
            # Known concentrations are then written back over their draws.
            concentration_draws[t] = updates.rgibbs_concentration(
                gamma_bar=mean_gamma_by_protein, tausq=tausq_draws[t - 1],
                n_peptides=n_peptides_per_protein, beta=beta_draws[t],
                mean_concentration=mean_concentration_draws[t - 1],
                prec_concentration=prec_concentration_draws[t - 1])
            concentration_draws[t, mapping_known_concentrations] = \
                    known_concentrations

            if concentration_dist:
                # (4c) Update concentration distribution hyperparameters
                mean_concentration_draws[t] = np.random.normal(
                    loc=np.mean(concentration_draws[t]),
                    scale=np.sqrt(1. / prec_concentration_draws[t - 1] /
                                  n_proteins))
                prec_concentration_draws[t] = 1. / updates.rgibbs_variances(
                    rss=np.sum((concentration_draws[t] -
                                mean_concentration_draws[t])**2),
                    n=n_proteins,
                    **cfg['priors']['prec_concentration'])

            # Set mu based on concentrations and betas
            mu_draws[t] = \
                    beta_draws[t, 0] + beta_draws[t, 1] * concentration_draws[t]
        else:
            # (4) Update protein-level mean parameters (mu). Gibbs step.
            mu_draws[t] = updates.rgibbs_mu(gamma_bar=mean_gamma_by_protein,
                                            tausq=tausq_draws[t - 1],
                                            n_peptides=n_peptides_per_protein,
                                            **cfg['priors']['mu'])

        # (5) Update state-level variance parameters (sigmasq). Gibbs step.
        # Residual sums of squares pool observed and censored states.
        rss_by_state = (
            intensities_obs - gamma_draws[t, mapping_states_obs]) ** 2
        rss_by_protein = np.bincount(mapping_peptides[mapping_states_obs],
                                     weights=rss_by_state,
                                     minlength=n_proteins)
        rss_by_state = (
            intensities_cen - gamma_draws[t, mapping_states_cen]) ** 2
        rss_by_protein += np.bincount(mapping_peptides[mapping_states_cen],
                                      weights=rss_by_state,
                                      minlength=n_proteins)
        sigmasq_draws[t] = updates.rgibbs_variances(
            rss=rss_by_protein, n=n_states_per_protein,
            prior_shape=shape_sigmasq[t - 1], prior_rate=rate_sigmasq[t - 1])

        # Mapping from protein to peptide conditional variances for convenience
        var_peptide_conditional = sigmasq_draws[t, mapping_peptides]

        # (6) Update peptide-level variance parameters (tausq). Gibbs step.
        rss_by_peptide = (gamma_draws[t] - mu_draws[t, mapping_peptides]) ** 2
        rss_by_protein = np.bincount(mapping_peptides, weights=rss_by_peptide)
        tausq_draws[t] = updates.rgibbs_variances(
            rss=rss_by_protein, n=n_peptides_per_protein,
            prior_shape=shape_tausq[t - 1], prior_rate=rate_tausq[t - 1])

        # (7) Update state-level variance hyperparameters (sigmasq
        #   distribution). Conditional independence-chain MH step.
        result = updates.rmh_variance_hyperparams(
            variances=sigmasq_draws[t], shape_prev=shape_sigmasq[t - 1],
            rate_prev=rate_sigmasq[t - 1], **cfg['priors']['sigmasq_dist'])
        (shape_sigmasq[t], rate_sigmasq[t]), accept = result
        accept_stats['sigmasq_dist'] += accept

        # (8) Update peptide-level variance hyperparameters (tausq
        #   distribution). Conditional independence-chain MH step.
        result = updates.rmh_variance_hyperparams(
            variances=tausq_draws[t], shape_prev=shape_tausq[t - 1],
            rate_prev=rate_tausq[t - 1], **cfg['priors']['tausq_dist'])
        (shape_tausq[t], rate_tausq[t]), accept = result
        accept_stats['tausq_dist'] += accept

        # (9) Update parameter for negative-binomial n_states distribution (r
        #   and lmbda). Conditional independence-chain MH step. The update
        #   works with p = 1 - lmbda, so convert on the way in and out.
        result = updates.rmh_nbinom_hyperparams(
            x=n_states_per_peptide - 1,
            r_prev=r[t - 1], p_prev=1. - lmbda[t - 1],
            **cfg['priors']['n_states_dist'])
        (r[t], lmbda[t]), accept = result
        lmbda[t] = 1. - lmbda[t]
        accept_stats['n_states_dist'] += accept

        # (10) Update coefficients of intensity-based probabilistic censoring
        #   model (eta). Conditional independence-chain MH step.
        # (10a) Build design matrix and response. Only using observed and
        # intensity-censored states (W < 1). When features are present, one
        # extra pseudo-observation row is appended per feature coefficient.
        n_at_risk = n_obs_states + np.sum(W < 1)
        X = np.zeros((n_at_risk + n_peptide_features * 2,
                      2 + n_peptide_features * 2))
        X[:n_at_risk, 0] = 1.
        X[:n_at_risk, 1] = np.r_[intensities_obs, intensities_cen[W < 1]]
        if n_peptide_features > 0:
            peptide_features_by_state = peptide_features[
                np.r_[mapping_states_obs, mapping_states_cen[W < 1]]
            ]
            X[:n_at_risk, 2:(2 + n_peptide_features)] = \
                peptide_features_by_state
            X[:n_at_risk, (2 + n_peptide_features):] = \
                (peptide_features_by_state.T * X[:n_at_risk, 1]).T
            X[n_at_risk:, 2:] = np.eye(n_peptide_features * 2)

        # Response: 1 for observed states, 0 for intensity-censored states,
        # 0.5 for the pseudo-observation rows.
        y = np.zeros(n_at_risk + n_peptide_features * 2)
        y[:n_obs_states] = 1.
        if n_peptide_features > 0:
            y[n_at_risk:] = 0.5

        w = np.ones_like(y)
        if n_peptide_features > 0:
            w[n_at_risk:(n_at_risk + n_peptide_features)] = \
                cfg['priors']['eta_features']['primary_pseudoobs']
            w[(n_at_risk + n_peptide_features):] = \
                cfg['priors']['eta_features']['interaction_pseudoobs']

        # (10b) Estimate GLM parameters.
        fit_eta = glm.glm(y=y, X=X, w=w, family=glm_family, info=True)

        if np.all(np.isfinite(fit_eta['b_hat'])):
            # (10c) Execute MH step.
            eta_draws[t], accept = glm.mh_update_glm_coef(
                b_prev=eta_draws[t - 1], y=y, X=X, family=glm_family,
                propDf=prop_df_eta, prior_log_density=dprior_eta,
                prior_kwargs=prior_eta_kwargs, **fit_eta)
            accept_stats['eta'] += accept
        else:
            # Non-finite GLM estimate: carry the previous draw forward.
            eta_draws[t] = eta_draws[t - 1]

        if (cfg['settings']['verbose'] > 0 and
                t % cfg['settings']['verbose_interval'] == 0):
            sys.stderr.write('Iteration %d complete\n' % t)

    # Build dictionary of draws to return
    draws = {'mu': mu_draws,
             'gamma': gamma_draws,
             'eta': eta_draws,
             'p_rnd_cen': p_rnd_cen,
             'lmbda': lmbda,
             'r': r,
             'sigmasq': sigmasq_draws,
             'tausq': tausq_draws,
             'n_cen_states_per_peptide': n_cen_states_per_peptide_draws,
             'shape_tausq': shape_tausq,
             'rate_tausq': rate_tausq,
             'shape_sigmasq': shape_sigmasq,
             'rate_sigmasq': rate_sigmasq}

    # Add additional information for supervised algorithm
    if supervised:
        draws.update({
            'beta': beta_draws,
            'concentration': concentration_draws})
        if concentration_dist:
            draws.update({
                'mean_concentration': mean_concentration_draws,
                'var_concentration': 1. / prec_concentration_draws})

    return (draws, accept_stats)