Exemplo n.º 1
0
def run(cfg, comm=None):
    '''
    Dispatch to the master or worker routine according to MPI rank, then
    write that process's results to its HDF5 output file.

    Parameters
    ----------
        - cfg : dictionary
            Configuration dictionary containing priors, settings, and paths for
            analysis. Its format is specified in detail in separate
            documentation. This function itself reads cfg['output'] for the
            output paths and the compression setting.
        - comm : mpi4py.MPI.COMM
            Initialized MPI communicator. If None, MPI.COMM_WORLD is used.

    Returns
    -------
        None. Results are written via lib.write_to_hdf5.
    '''
    # Fall back to the world communicator when the caller supplies none
    comm = MPI.COMM_WORLD if comm is None else comm

    # Identify this process; every rank except one is counted as a worker
    rank = comm.Get_rank()
    n_workers = comm.Get_size() - 1

    # Each process loads its own share of the data
    data = load_data(cfg=cfg, n_workers=n_workers, rank=rank)

    if rank != MPIROOT:
        # Worker process: run local updates until the master signals a stop
        (draws, mapping_peptides,
         proteins_worker, peptides_worker) = worker(comm=comm, rank=rank,
                                                    data=data, cfg=cfg)

        # Worker output path is a %-pattern filled in with the rank
        fname_out = cfg['output']['pattern_results_worker'] % rank
        contents = {'draws': draws,
                    'mapping_peptides': mapping_peptides,
                    'proteins_worker': proteins_worker,
                    'peptides_worker': peptides_worker}
        lib.write_to_hdf5(fname=fname_out,
                          compress=cfg['output']['compress'], **contents)
        return

    # Master process: coordinate the estimation across workers
    draws, accept_stats, mapping_peptides = master(comm=comm, data=data,
                                                   cfg=cfg)

    # Master output goes to a single fixed path
    fname_out = cfg['output']['path_results_master']
    contents = {'draws': draws,
                'accept_stats': accept_stats,
                'mapping_peptides': mapping_peptides}
    lib.write_to_hdf5(fname=fname_out, compress=cfg['output']['compress'],
                      **contents)
Exemplo n.º 2
0
def worker(comm, rank, data, cfg):
    '''
    Worker-node process for parallel MCMC sampler.
    Receives parameters and commands from master node.
    Runs local updates and distributed components of shared draws.

    The worker sits in a blocking receive loop. Each message from the master
    carries the iteration index as payload and the task as MPI tag (looked up
    in TAGS); the loop ends when the STOP tag arrives. State is carried
    between tasks in local variables: for example the hyperparameters are
    first bound by SYNC and read by INIT and LOCAL, and the censored-state
    draws bound by LOCAL are read by NSTATES, ETA, PRNDCEN and BETA. The
    master is therefore expected to issue tasks in a compatible order.

    Parameters
    ----------
        - comm : mpi4py.MPI.COMM
            Initialized MPI communicator.
        - rank : int
            Rank (>= MPIROOT) of worker. Used here only to format the
            worker-specific output path for the SAVE task.
        - data : dictionary
            Data as output from load_data with rank > 0. Keys read here:
            'mapping_peptides', 'intensities_obs', 'mapping_states_obs',
            'proteins_worker', 'peptides_worker'; additionally
            'peptide_features_worker' when peptide features are configured,
            and 'known_concentrations' / 'mapping_known_concentrations' when
            running supervised.
        - cfg : dictionary
            Configuration dictionary containing priors, settings, and paths for
            analysis. Its format is specified in detail in separate
            documentation.

    Returns
    -------
        - draws : dictionary
            1- and 2-dimensional ndarrays containing the posterior samples for
            each protein- and peptide-specific parameter. Shared parameters are
            handled by the master process.
        - mapping_peptides : integer ndarray
            Worker-specific peptide to protein mapping provided in data.
        - proteins_worker : array_like, 1 dimension, nonnegative ints
            A 1d integer array of length n_proteins containing the indices (in
            the original dataset) of the proteins assigned to the given worker.
        - peptides_worker : array_like, 1 dimension, nonnegative ints
            A 1d integer array of length n_peptides containing the indices (in
            the original dataset) of the peptides assigned to the given worker.
    '''
    # Determine whether algorithm is running with supervision
    try:
        supervised = cfg['priors']['supervised']
    except KeyError:
        sys.stderr.write('Defaulting to unsupervised algorithm\n')
        supervised = False

    # If supervised, determine whether to model distribution of concentrations
    # If this is False, prior on $\beta_1$ is scaled by $|\beta_1|^{n_{mis}}$.
    # Bound unconditionally so the BETA task cannot hit an unbound name.
    concentration_dist = False
    if supervised:
        try:
            concentration_dist = cfg['priors']['concentration_dist']
        except KeyError:
            sys.stderr.write('Defaulting to flat prior on concentrations\n')
            concentration_dist = False

    # Get information on peptide features if they're available
    have_peptide_features = 'path_peptide_features' in cfg['priors']
    if have_peptide_features:
        n_peptide_features = data['peptide_features_worker'].shape[1]
    else:
        n_peptide_features = 0

    # Extract proposal DFs
    try:
        prop_df_y_mis = cfg['settings']['prop_df_y_mis']
    except KeyError:
        prop_df_y_mis = 5.0

    # Create references to relevant data entries in local namespace
    mapping_peptides = data['mapping_peptides']
    intensities_obs = data['intensities_obs']
    mapping_states_obs = data['mapping_states_obs']
    # Data specific to the semi-supervised algorithm
    if supervised:
        known_concentrations = data['known_concentrations']
        mapping_known_concentrations = data['mapping_known_concentrations']

    # Extract dimensions from input

    # Number of iterations from cfg
    n_iterations = cfg['settings']['n_iterations']

    # Number of peptides and proteins from mapping_peptides
    n_peptides = np.size(mapping_peptides)
    n_proteins = 1 + np.max(mapping_peptides)

    # Compute tabulations that are invariant across iterations

    # Total number of observed states
    n_obs_states = np.size(intensities_obs)

    # Tabulate peptides per protein
    n_peptides_per_protein = np.bincount(mapping_peptides)
    peptides_obs = np.unique(mapping_states_obs)
    n_obs_peptides_per_protein = np.bincount(mapping_peptides[peptides_obs],
                                             minlength=n_proteins)

    # Tabulate number of observed states per peptide
    n_obs_states_per_peptide = np.bincount(mapping_states_obs,
                                           minlength=n_peptides)

    # Sum observed intensities per peptide
    total_intensity_obs_per_peptide = np.bincount(mapping_states_obs,
                                                  weights=intensities_obs,
                                                  minlength=n_peptides)

    # Allocate data structures for draws

    # Peptide- and protein-level means
    gamma_draws = np.empty((n_iterations, n_peptides))
    mu_draws = np.empty((n_iterations, n_proteins))

    # Concentrations, if supervised
    if supervised:
        concentration_draws = np.empty((n_iterations, n_proteins))

    # Number of censored states per peptide.
    # np.int_ is the concrete dtype that the abstract np.integer resolved to.
    n_cen_states_per_peptide_draws = np.zeros((n_iterations, n_peptides),
                                              dtype=np.int_)

    # State- and peptide-level variances
    sigmasq_draws = np.empty((n_iterations, n_proteins))
    tausq_draws = np.empty((n_iterations, n_proteins))

    # Collect the draw arrays once. The arrays are only ever filled in place
    # below (never rebound), so this dictionary always reflects their current
    # contents, both for SAVE checkpoints and for the final return value.
    draws = {'mu': mu_draws,
             'gamma': gamma_draws,
             'sigmasq': sigmasq_draws,
             'tausq': tausq_draws,
             'n_cen_states_per_peptide': n_cen_states_per_peptide_draws,
            }
    if supervised:
        draws['concentration'] = concentration_draws

    # Instantiate GLM family for eta step
    try:
        glm_link_name = cfg["priors"]["glm_link"].title()
    except KeyError:
        sys.stderr.write("GLM link not specified; defaulting to logit\n")
        glm_link_name = "Logit"
    glm_link = getattr(glm.links, glm_link_name)
    glm_family = glm.families.Binomial(link=glm_link)

    # Setup data structure for shared parameters/hyperparameters sync
    # Layout:
    #   - 0:2 : shape_sigmasq, rate_sigmasq
    #   - 2:4 : shape_tausq, rate_tausq
    #   - 4:6 : r, lmbda
    #   - 6:8 : eta
    #   - 8   : p_rnd_cen
    # If supervised, 4 additional entries are used:
    #   - 9:11: beta
    #   - 11  : mean_concentration
    #   - 12  : prec_concentration
    # NOTE(review): this layout reserves only two entries for eta, while the
    # LOCAL task slices eta[2:] when peptide features are present. That looks
    # inconsistent; confirm against the master's layout before relying on
    # peptide features.
    n_params_shared = 13 if supervised else 9
    params_shared = np.empty(n_params_shared, dtype=np.double)

    # Prepare to receive tasks
    working = True
    status = MPI.Status()
    # NOTE(review): np.array(0) takes NumPy's default integer dtype, which may
    # be wider than MPI.INT. Left unchanged because the buffer must match
    # whatever the master sends; confirm both sides together before changing.
    t = np.array(0)

    # Primary send-receive loop for MCMC iterations
    while working:
        # Receive iteration and task information
        comm.Recv([t, MPI.INT], source=MPIROOT, tag=MPI.ANY_TAG, status=status)
        task = status.Get_tag()

        if task == TAGS['STOP']:
            working = False
        elif task == TAGS['SYNC']:
            # Synchronize shared parameters/hyperparameters
            comm.Bcast(params_shared, root=MPIROOT)

            shape_sigmasq, rate_sigmasq = params_shared[0:2]
            shape_tausq, rate_tausq = params_shared[2:4]
            r, lmbda = params_shared[4:6]
            eta = params_shared[6:8]
            p_rnd_cen = params_shared[8]

            if supervised:
                beta = params_shared[9:11]
                mean_concentration = params_shared[11]
                prec_concentration = params_shared[12]
        elif task == TAGS['INIT']:
            # Compute initial values for MCMC iterations

            # Protein-level means using mean observed intensity; excluding
            # missing peptides. Proteins with no observed peptide divide by
            # zero here and are overwritten on the next line.
            mu_draws[0] = (
                np.bincount(mapping_peptides, total_intensity_obs_per_peptide /
                            np.maximum(1, n_obs_states_per_peptide)) /
                n_obs_peptides_per_protein)
            mu_draws[0, n_obs_peptides_per_protein < 1] = np.nanmin(mu_draws[0])

            # Peptide-level means using mean observed intensity; imputing
            # missing peptides as protein observed means
            gamma_draws[0] = mu_draws[0, mapping_peptides]
            gamma_draws[0, peptides_obs] = (
                total_intensity_obs_per_peptide[peptides_obs] /
                n_obs_states_per_peptide[peptides_obs]
            )

            # State- and peptide-level variances via inverse-gamma draws
            sigmasq_draws[0] = 1. / np.random.gamma(shape=shape_sigmasq,
                                                    scale=1. / rate_sigmasq,
                                                    size=n_proteins)
            tausq_draws[0] = 1. / np.random.gamma(shape=shape_tausq,
                                                  scale=1. / rate_tausq,
                                                  size=n_proteins)

            # Mapping from protein to peptide conditional variances for
            # convenience
            var_peptide_conditional = sigmasq_draws[0, mapping_peptides]

            # Number of states parameters from local MAP estimator based on
            # number of observed peptides; very crude, but not altogether
            # terrible. Note that this ignores the +1 location shift in the
            # actual n_states distribution.
            kwargs = {
                'x': n_obs_states_per_peptide[n_obs_states_per_peptide > 0] - 1,
                'transform': True}
            kwargs.update(cfg['priors']['n_states_dist'])
            r, lmbda = lib.map_estimator_nbinom(**kwargs)
            lmbda = 1. - lmbda

            # Combine local estimates at master for initialization.
            # Values synchronize at first iteration during SYNC task.
            comm.Reduce([np.array([r, lmbda]), MPI.DOUBLE], None,
                        op=MPI.SUM, root=MPIROOT)

            if supervised:
                # Run Gibbs update on concentration-intensity coefficients using
                # noninformative prior.
                updates_parallel.rgibbs_worker_beta(
                    comm=comm, concentrations=known_concentrations,
                    gamma_bar=mu_draws[0, mapping_known_concentrations],
                    tausq=tausq_draws[0, mapping_known_concentrations],
                    n_peptides=n_peptides_per_protein[
                        mapping_known_concentrations], MPIROOT=MPIROOT)
        elif task == TAGS['LOCAL']:
            # (1) Draw missing data (n_cen and censored state intensities) given
            #   all other parameters. Exact draw via rejection samplers.

            # (1a) Obtain p_int_cen per peptide and approximations of censored
            #   intensity posteriors.
            eta_0_effective = eta[0]
            eta_1_effective = eta[1]
            if n_peptide_features > 0:
                eta_0_effective += np.dot(data['peptide_features_worker'],
                                          eta[2:(2 + n_peptide_features)])
                eta_1_effective += np.dot(data['peptide_features_worker'],
                                          eta[(2 + n_peptide_features):])

            kwargs = {'eta_0': eta_0_effective,
                      'eta_1': eta_1_effective,
                      'mu': gamma_draws[t - 1],
                      'sigmasq': var_peptide_conditional,
                      'glm_link_name': glm_link_name}
            cen_dist = lib.characterize_censored_intensity_dist(**kwargs)

            # (1b) Draw number of censored states per peptide
            n_cen_states_per_peptide = lib.rncen(
                n_obs=n_obs_states_per_peptide,
                p_rnd_cen=p_rnd_cen,
                p_int_cen=cen_dist[
                    'p_int_cen'],
                lmbda=lmbda, r=r)
            n_cen_states_per_peptide_draws[t] = n_cen_states_per_peptide
            # Update state-level counts
            n_states_per_peptide = (n_obs_states_per_peptide +
                                    n_cen_states_per_peptide)
            n_states_per_protein = np.bincount(mapping_peptides,
                                               weights=n_states_per_peptide)
            n_states = np.sum(n_states_per_peptide)

            # (1c) Draw censored intensities
            kwargs['n_cen'] = n_cen_states_per_peptide
            kwargs['p_rnd_cen'] = p_rnd_cen
            kwargs['propDf'] = prop_df_y_mis
            kwargs.update(cen_dist)
            intensities_cen, mapping_states_cen, W = lib.rintensities_cen(
                **kwargs)

            # Sum censored intensities per peptide
            total_intensity_cen_per_peptide = np.bincount(
                mapping_states_cen, weights=intensities_cen,
                minlength=n_peptides)

            # Compute mean intensities per peptide
            mean_intensity_per_peptide = ((total_intensity_obs_per_peptide +
                                           total_intensity_cen_per_peptide) /
                                          n_states_per_peptide)

            # (2) Update peptide-level mean parameters (gamma). Gibbs step.
            gamma_draws[t] = updates_serial.rgibbs_gamma(
                mu=mu_draws[t - 1, mapping_peptides],
                tausq=tausq_draws[t - 1, mapping_peptides],
                sigmasq=var_peptide_conditional,
                y_bar=mean_intensity_per_peptide, n_states=n_states_per_peptide)
            mean_gamma_by_protein = np.bincount(mapping_peptides,
                                                weights=gamma_draws[t])
            mean_gamma_by_protein /= n_peptides_per_protein

            if supervised:
                # (3) Update concentrations given coefficients. Gibbs step.
                concentration_draws[t] = updates_serial.rgibbs_concentration(
                    gamma_bar=mean_gamma_by_protein, tausq=tausq_draws[t - 1],
                    n_peptides=n_peptides_per_protein, beta=beta,
                    mean_concentration=mean_concentration,
                    prec_concentration=prec_concentration)
                # Known concentrations are data, not parameters: overwrite
                # the sampled values for those proteins.
                concentration_draws[t, mapping_known_concentrations] = \
                        known_concentrations

                mu_draws[t] = beta[0] + beta[1] * concentration_draws[t]
            else:
                # (3) Update protein-level mean parameters (mu). Gibbs step.
                mu_draws[t] = updates_serial.rgibbs_mu(
                    gamma_bar=mean_gamma_by_protein, tausq=tausq_draws[t - 1],
                    n_peptides=n_peptides_per_protein, **cfg['priors']['mu'])

            # (4) Update state-level variance parameters (sigmasq). Gibbs step.
            # Residual sums of squares accumulate over observed states first,
            # then over the censored states drawn above.
            rss_by_state = ((intensities_obs -
                             gamma_draws[t, mapping_states_obs]) ** 2)
            rss_by_protein = np.bincount(mapping_peptides[mapping_states_obs],
                                         weights=rss_by_state,
                                         minlength=n_proteins)
            rss_by_state = ((intensities_cen -
                             gamma_draws[t, mapping_states_cen]) ** 2)
            rss_by_protein += np.bincount(mapping_peptides[mapping_states_cen],
                                          weights=rss_by_state,
                                          minlength=n_proteins)
            sigmasq_draws[t] = updates_serial.rgibbs_variances(
                rss=rss_by_protein, n=n_states_per_protein,
                prior_shape=shape_sigmasq, prior_rate=rate_sigmasq)

            # Mapping from protein to peptide conditional variances for
            # convenience
            var_peptide_conditional = sigmasq_draws[t, mapping_peptides]

            # (5) Update peptide-level variance parameters (tausq). Gibbs step.
            rss_by_peptide = (
                gamma_draws[t] - mu_draws[t, mapping_peptides]) ** 2
            rss_by_protein = np.bincount(mapping_peptides,
                                         weights=rss_by_peptide)
            tausq_draws[t] = updates_serial.rgibbs_variances(
                rss=rss_by_protein, n=n_peptides_per_protein,
                prior_shape=shape_tausq, prior_rate=rate_tausq)
        elif task == TAGS['SIGMA']:
            # Run distributed MH step for sigmasq hyperparameters
            updates_parallel.rmh_worker_variance_hyperparams(
                comm=comm, variances=sigmasq_draws[t], MPIROOT=MPIROOT)
        elif task == TAGS['TAU']:
            # Run distributed MH step for tausq hyperparameters
            updates_parallel.rmh_worker_variance_hyperparams(
                comm=comm, variances=tausq_draws[t], MPIROOT=MPIROOT)
        elif task == TAGS['NSTATES']:
            # Run distributed MH step for n_states hyperparameters
            updates_parallel.rmh_worker_nbinom_hyperparams(
                comm=comm, x=n_states_per_peptide - 1, r_prev=r,
                p_prev=1. - lmbda, MPIROOT=MPIROOT,
                **cfg['priors']['n_states_dist'])
        elif task == TAGS['ETA']:
            # Run distributed MH step for eta (coefficients in censoring model)

            # Build design matrix and response. Only using observed and
            # intensity-censored states (censored states with W < 1).
            n_at_risk = n_obs_states + np.sum(W < 1)
            # With peptide features, 2 * n_peptide_features extra rows are
            # appended; they hold the pseudo-observations set up below.
            X = np.zeros((n_at_risk + n_peptide_features * 2,
                          2 + n_peptide_features * 2))
            X[:n_at_risk, 0] = 1.
            X[:n_at_risk, 1] = np.r_[intensities_obs, intensities_cen[W < 1]]
            if n_peptide_features > 0:
                peptide_features_by_state = data['peptide_features_worker'][
                    np.r_[mapping_states_obs, mapping_states_cen[W < 1]]
                ]
                # Main-effect columns, then feature-by-intensity interactions
                X[:n_at_risk, 2:(2 + n_peptide_features)] = \
                    peptide_features_by_state
                X[:n_at_risk, (2 + n_peptide_features):] = \
                    (peptide_features_by_state.T * X[:n_at_risk, 1]).T
                X[n_at_risk:, 2:] = np.eye(n_peptide_features * 2)

            # Response: 1 for observed states, 0 for intensity-censored
            # states, 0.5 for the pseudo-observation rows.
            y = np.zeros(n_at_risk + n_peptide_features * 2)
            y[:n_obs_states] = 1.
            if n_peptide_features > 0:
                y[n_at_risk:] = 0.5

            # Weights: 1 for real states; pseudo-observation weights from cfg
            # are divided by (communicator size - 1).
            w = np.ones_like(y)
            if n_peptide_features > 0:
                w[n_at_risk:(n_at_risk + n_peptide_features)] = (
                    cfg['priors']['eta_features']['primary_pseudoobs'] /
                    (comm.Get_size() - 1.))
                w[(n_at_risk + n_peptide_features):] = (
                    cfg['priors']['eta_features']['interaction_pseudoobs'] /
                    (comm.Get_size() - 1.))

            # Estimate GLM parameters.
            fit_eta = glm.glm(y=y, X=X, w=w, family=glm_family, info=True,
                              cov=True)

            # Handle distributed computation draw
            updates_parallel.rmh_worker_glm_coef(
                comm=comm, b_prev=eta, family=glm_family, y=y, X=X, w=w,
                MPIROOT=MPIROOT, **fit_eta)
        elif task == TAGS['PRNDCEN']:
            # Run distributed Gibbs step for p_rnd_cen
            updates_parallel.rgibbs_worker_p_rnd_cen(
                comm=comm, n_rnd_cen=np.sum(W, dtype=int), n_states=n_states,
                MPIROOT=MPIROOT)
        elif task == TAGS['BETA']:
            # Run distributed Gibbs step for coefficients of
            # concentration-intensity relationship
            if concentration_dist:
                # Concentration distribution is modelled: use all proteins
                updates_parallel.rgibbs_worker_beta(
                    comm=comm, concentrations=concentration_draws[t],
                    gamma_bar=mean_gamma_by_protein,
                    tausq=tausq_draws[t],
                    n_peptides=n_peptides_per_protein, MPIROOT=MPIROOT)
            else:
                # Otherwise use only proteins with known concentrations
                updates_parallel.rgibbs_worker_beta(
                    comm=comm, concentrations=known_concentrations,
                    gamma_bar=mean_gamma_by_protein[
                        mapping_known_concentrations],
                    tausq=tausq_draws[t, mapping_known_concentrations],
                    n_peptides=n_peptides_per_protein[
                        mapping_known_concentrations], MPIROOT=MPIROOT)
        elif task == TAGS['CONCENTRATION_DIST']:
            # Run distributed Gibbs step for hyperparameters of concentration
            # distribution
            updates_parallel.rgibbs_worker_concentration_dist(
                comm=comm, concentrations=concentration_draws[t],
                MPIROOT=MPIROOT)
        elif task == TAGS['SAVE']:
            # Construct path for worker-specific results
            path_worker = cfg['output']['pattern_results_worker'] % rank

            # Write the draws so far. The keyword (fname) and the stored
            # datasets match the final write that run() makes to this path.
            lib.write_to_hdf5(
                fname=path_worker, compress=cfg['output']['compress'],
                draws=draws, mapping_peptides=data['mapping_peptides'],
                proteins_worker=data['proteins_worker'],
                peptides_worker=data['peptides_worker'])

    return (draws, data['mapping_peptides'],
            data['proteins_worker'], data['peptides_worker'])