예제 #1
0
 def get_dist(data_vec):
     """Build an independent-components distribution with one normal per row.

     For every vector in ``data_vec`` the NaN entries are dropped and a
     normal distribution is created from the remaining values (mean via
     ``np.nanmean``, standard deviation via ``np.std`` but never below
     1E-6). A vector with no non-NaN values gets a normal centred at 0 with
     a standard deviation of 999999 instead.

     :param data_vec: iterable of numpy arrays (each must support boolean
         mask indexing).
     :return: ``pg.IndependentComponentsDistribution`` over the per-vector
         normals, in the same order as ``data_vec``.
     """
     def _normal_for(row):
         # Only the non-NaN observations contribute to the estimate.
         observed = row[~np.isnan(row)]
         if len(observed) == 0:
             # Nothing observed: fall back to a very wide normal.
             return pg.NormalDistribution(0, 999999)
         # Floor the spread so a constant row does not give a zero std.
         spread = max(np.std(observed), 1E-6)
         return pg.NormalDistribution(np.nanmean(observed), spread)

     return pg.IndependentComponentsDistribution(
         [_normal_for(row) for row in data_vec])
예제 #2
0
    def generate_fret_states(kind, state_means, trans_mat, trans_prob):
        """Creates artificial FRET states

        Builds a pomegranate hidden Markov model whose states emit fixed
        FRET means (normal distributions with a standard deviation of 0)
        and returns ``model.sample(trace_length)`` as a numpy array.

        Relies on names resolved from the enclosing scope:
        ``random_k_states_max``, ``min_state_diff``, ``trace_length``,
        ``generate_state_means`` and ``lib.utils.random_seed_mp``.

        :param kind: ``"random"`` (draw a random number of states and
            generate their means), ``"aggregate"`` (a single state with a
            uniformly drawn mean), or anything else to use ``state_means``
            as given. Forced to ``"defined"`` when every entry of
            ``state_means`` is a float.
        :param state_means: iterable of state means; replaced internally
            for the "random" and "aggregate" kinds.
        :param trans_mat: transition matrix to use, or ``None`` to build a
            uniform one from ``trans_prob``.
        :param trans_prob: off-diagonal transition probability used when
            ``trans_mat`` is ``None``.
        :return: numpy array built from the sampled trace.
        """
        if all(isinstance(s, float) for s in state_means):
            kind = "defined"

        # Always drawn, even if unused, so the random stream is consumed
        # identically regardless of ``kind``.
        rand_k_states = np.random.randint(1, random_k_states_max + 1)

        if kind == "random":
            k_states = rand_k_states
            state_means = generate_state_means(min_state_diff, k_states)
        elif kind == "aggregate":
            state_means = np.random.uniform(0, 1)
            k_states = 1
        else:
            if np.size(state_means) <= random_k_states_max:
                # Pick the same amount of k states as state means given
                k_states = np.size(state_means)
            else:
                # Pick no more than k_states_max from the state means (e.g.
                # given [0.1, 0.2, 0.3, 0.4, 0.5] use only
                # random_k_states_max of these)
                k_states = rand_k_states
                state_means = np.random.choice(
                    state_means, size=k_states, replace=False
                )

        # isinstance (rather than an exact type comparison) also accepts
        # float subclasses such as numpy.float64 as a single scalar mean.
        if isinstance(state_means, float):
            dists = [pg.NormalDistribution(state_means, 0)]
        else:
            dists = [pg.NormalDistribution(m, 0) for m in state_means]

        # Every state is equally likely to be the first one.
        starts = np.array([1 / k_states] * k_states)

        lib.utils.random_seed_mp()
        np.random.shuffle(dists)

        # Generate arbitrary transition matrix
        if trans_mat is None:
            trans_mat = np.empty([k_states, k_states])
            trans_mat.fill(trans_prob)
            np.fill_diagonal(trans_mat, 1 - trans_prob)

            # Make sure that each row sums to exactly 1 by absorbing the
            # excess/deficit into the self-transition. The diagonal is
            # addressed by index instead of by comparing cell values to
            # ``1 - trans_prob``: a value mask also matches off-diagonal
            # cells when trans_prob == 0.5, which made the in-place add
            # fail with a shape mismatch.
            if trans_prob != 0:
                diagonal = np.diag_indices(k_states)
                trans_mat[diagonal] += 1 - trans_mat.sum(axis=1)

        # Generate HMM model
        model = pg.HiddenMarkovModel.from_matrix(
            trans_mat, distributions=dists, starts=starts
        )
        model.bake()

        E_true = np.array(model.sample(trace_length))
        return E_true
예제 #3
0
 def get_blank_distribution(self):
     """Return an untrained three-component mixture over the feature space.

     Each mixture component is an independent-components distribution with
     one standard normal (mean 0, standard deviation 1) per entry of
     ``self.feature_list``; all components carry the same weight.

     :return: ``pg.GeneralMixtureModel`` with three equally weighted
         components.
     """
     nd = 3
     n_features = len(self.feature_list)
     components = [
         pg.IndependentComponentsDistribution(
             [pg.NormalDistribution(0, 1) for _ in range(n_features)])
         for _ in range(nd)
     ]
     equal_weights = [1 / nd] * nd
     return pg.GeneralMixtureModel(components, weights=equal_weights)
예제 #4
0
def build_model(n_bins, n_cmps, n_features, means, stds, state_names=None):
    """Create a randomly initialised HMM with Gaussian (mixture) emissions.

    Every one of the ``n_bins`` states emits an independent-components
    distribution of ``n_features`` normals; when ``n_cmps > 1`` each state
    instead emits a mixture of ``n_cmps`` such distributions. Initial
    parameters, the transition matrix and nothing else are drawn from
    ``np.random``; start weights are all ones.

    :param n_bins: number of HMM states.
    :param n_cmps: number of mixture components per state.
    :param n_features: number of features per observation.
    :param means: per-feature value the initial means are centred on.
    :param stds: per-feature scale for the initial means and stds.
    :param state_names: forwarded to ``HiddenMarkovModel.from_matrix``.
    :return: the model returned by ``pgn.HiddenMarkovModel.from_matrix``.
    """
    # Last axis holds (mean, std) for each state/component/feature.
    init_params = np.random.random((n_bins, n_cmps, n_features, 2))
    init_params[..., 0] -= 0.5  # shift the mean draws to [-0.5, 0.5)
    for k in range(n_features):
        # Mean: spread over [-2*std, 2*std) around the feature mean.
        init_params[..., k, 0] *= 4 * stds[k]
        init_params[..., k, 0] += means[k]
        # Std: somewhere in [0, std).
        init_params[..., k, 1] *= stds[k]

    def _independent_normals(bin_i, cmp_i):
        # One normal per feature for the given state/component pair.
        return pgn.IndependentComponentsDistribution(
            tuple(
                pgn.NormalDistribution(*init_params[bin_i, cmp_i, k, :])
                for k in range(n_features)))

    if n_cmps > 1:
        dists = tuple(
            pgn.GeneralMixtureModel(
                [_independent_normals(b, c) for c in range(n_cmps)])
            for b in range(n_bins))
    else:
        dists = tuple(_independent_normals(b, 0) for b in range(n_bins))

    trans_mat = np.random.random((n_bins, n_bins))
    starts = np.ones(n_bins)

    return pgn.HiddenMarkovModel.from_matrix(trans_mat,
                                             dists,
                                             starts,
                                             state_names=state_names)
예제 #5
0
    def generate_fret_states(kind, state_means, trans_mat, trans_prob):
        """Creates artificial FRET states

        Builds a pomegranate hidden Markov model whose states emit fixed
        FRET means (normal distributions with standard deviation ``eps``),
        samples one trace of length ``trace_length`` from it and also
        reports the transition matrix of the baked model.

        Relies on names resolved from the enclosing scope:
        ``random_k_states_max``, ``min_state_diff``, ``trace_length``,
        ``eps`` and ``generate_state_means``.

        :param kind: ``"aggregate"`` (a single state with a uniformly drawn
            mean), ``"random"`` (generated means; the number of states
            follows ``trans_mat`` when one is given), or anything else to
            use ``state_means`` as given. Forced to ``"defined"`` when
            every entry of ``state_means`` is a float.
        :param state_means: iterable of state means; replaced internally
            for the "random" and "aggregate" kinds.
        :param trans_mat: transition matrix to use, or ``None`` to build a
            uniform one from ``trans_prob``.
        :param trans_prob: off-diagonal transition probability used when
            ``trans_mat`` is ``None``.
        :return: tuple ``(E_true, final_matrix)``: the squeezed sample
            rounded to 4 decimals, and the top-left ``k_states`` x
            ``k_states`` part of ``model.dense_transition_matrix()``.
        :raises ValueError: if ``trans_mat`` is given and its length does
            not match the number of state means.
        """
        if all(isinstance(s, float) for s in state_means):
            kind = "defined"

        # Always drawn, even if unused, so the random stream is consumed
        # identically regardless of ``kind``.
        rand_k_states = np.random.randint(1, random_k_states_max + 1)

        if kind == "aggregate":
            state_means = np.random.uniform(0, 1)
            k_states = 1
        elif kind == "random":
            k_states = (len(trans_mat)
                        if trans_mat is not None else rand_k_states)
            state_means = generate_state_means(min_state_diff, k_states)
        else:
            if np.size(state_means) <= random_k_states_max:
                # Pick the same amount of k states as state means given
                k_states = np.size(state_means)
            else:
                # Pick no more than k_states_max from the state means (e.g.
                # given [0.1, 0.2, 0.3, 0.4, 0.5] use only
                # random_k_states_max of these)
                k_states = rand_k_states
                state_means = np.random.choice(state_means,
                                               size=k_states,
                                               replace=False)

        # isinstance (rather than an exact type comparison) also accepts
        # float subclasses such as numpy.float64 as a single scalar mean.
        if isinstance(state_means, float):
            dists = [pg.NormalDistribution(state_means, eps)]
        else:
            dists = [pg.NormalDistribution(m, eps) for m in state_means]

        # Random start distribution, normalised to sum to 1.
        starts = np.random.uniform(0, 1, size=k_states)
        starts /= starts.sum()

        # Generate arbitrary transition matrix
        if trans_mat is None:
            matrix = np.empty([k_states, k_states])
            matrix.fill(trans_prob)
            np.fill_diagonal(matrix, 1 - trans_prob)

            # Make sure that each row sums to exactly 1 by absorbing the
            # excess/deficit into the self-transition. The diagonal is
            # addressed by index instead of by comparing cell values to
            # ``1 - trans_prob``: a value mask also matches off-diagonal
            # cells when trans_prob == 0.5, which made the in-place add
            # fail with a shape mismatch.
            if trans_prob != 0:
                diagonal = np.diag_indices(k_states)
                matrix[diagonal] += 1 - matrix.sum(axis=1)
        else:
            # np.size also handles the scalar mean of the "aggregate" kind,
            # for which len() would raise TypeError.
            n_means = np.size(state_means)
            if n_means != len(trans_mat):
                raise ValueError(
                    "Number of FRET states ({0}) doesn't match transition matrix {1}x{1}"
                    .format(n_means, len(trans_mat)))
            matrix = trans_mat

        model = pg.HiddenMarkovModel.from_matrix(matrix,
                                                 distributions=dists,
                                                 starts=starts)
        model.bake()

        # Keep only the part of the dense matrix covering the k_states
        # emitting states.
        final_matrix = model.dense_transition_matrix()[:k_states, :k_states]

        E_true = np.array(model.sample(n=1, length=trace_length))
        E_true = np.squeeze(E_true).round(4)
        return E_true, final_matrix
예제 #6
0
def hmm_get_model(cnarr, method, processes):
    """Build and fit a Gaussian-emission HMM for the chosen method.

    The state set and which emission distributions are frozen depend on
    `method`; every emission starts with the same standard deviation,
    estimated from all autosomal observations together.

    Parameters
    ----------
    cnarr : CopyNumArray
        The normalized bin-level values to be segmented.
    method : string
        One of 'hmm', 'hmm-tumor', 'hmm-germline'.
    processes : int
        Number of parallel jobs to run.

    Returns
    -------
    model :
        A pomegranate HiddenMarkovModel trained on the given dataset.
    """
    assert method in ('hmm-tumor', 'hmm-germline', 'hmm')
    observations = as_observation_matrix(cnarr.autosomes())

    # One robust spread estimate over the pooled observations
    stdev = biweight_midvariance(np.concatenate(observations), initial=0)

    # Per method: state labels plus (initial mean, frozen?) per state
    if method == 'hmm-germline':
        state_names = ["loss", "neutral", "gain"]
        emissions = [(-1.0, True), (0.0, True), (0.585, True)]
    elif method == 'hmm-tumor':
        state_names = ["del", "loss", "neutral", "gain", "amp"]
        emissions = [(-2.0, False), (-0.5, False), (0.0, True),
                     (0.3, False), (1.0, False)]
    else:
        state_names = ["loss", "neutral", "gain"]
        emissions = [(-1.0, False), (0.0, False), (0.585, False)]
    distributions = [
        pom.NormalDistribution(center, stdev, frozen=is_frozen)
        for center, is_frozen in emissions
    ]

    n_states = len(distributions)

    # Start weights follow normalized binomial coefficients, so the middle
    # (neutral) state is the most likely starting point
    coefs = scipy.special.binom(n_states - 1, range(n_states))
    start_probabilities = coefs / coefs.sum()

    # Heavy weight on staying in the current state; every transition,
    # including the self-transition, also gets the same small base weight
    stay_weight = np.identity(n_states) * 100
    base_weight = np.ones((n_states, n_states)) / n_states
    transition_matrix = stay_weight + base_weight

    model = pom.HiddenMarkovModel.from_matrix(transition_matrix,
                                              distributions,
                                              start_probabilities,
                                              state_names=state_names,
                                              name=method)

    # Each sequence is weighted by its own length
    seq_weights = [len(obs) for obs in observations]
    model.fit(
        sequences=observations,
        weights=seq_weights,
        distribution_inertia=.8,  # Allow updating dists, but slowly
        edge_inertia=0.1,
        pseudocount=5,
        use_pseudocount=True,
        max_iterations=100000,
        n_jobs=processes,
        verbose=False)
    return model
예제 #7
0
def variants_in_segment(varr, segment, min_variants=50):
    """Split a segment at allele-frequency state changes found by an HMM.

    When `varr` holds more than `min_variants` entries, a two-state
    ("neutral"/"alt") HMM with frozen normal emissions is fitted to
    ``varr.mirrored_baf(above_half=True)`` and adjacent variants sharing a
    predicted state are squashed into runs. If more than one run results,
    the segment is cut midway between neighbouring runs; otherwise the
    segment is returned unchanged as a single row.

    Parameters
    ----------
    varr :
        Variant array for the segment; must provide ``mirrored_baf``,
        ``add_columns`` and ``as_series``.
    segment :
        Object with ``chromosome``, ``start``, ``end``, ``gene``, ``log2``
        and ``probes`` attributes.
    min_variants : int
        The HMM is only run when ``len(varr)`` exceeds this value.

    Returns
    -------
    pandas.DataFrame
        Columns chromosome, start, end, gene, log2, probes; one row per
        resulting sub-segment.

    Raises
    ------
    RuntimeError
        If any resulting sub-segment has start >= end.
    """
    results = None
    if len(varr) > min_variants:
        observations = varr.mirrored_baf(above_half=True)
        state_names = ["neutral", "alt"]
        distributions = [
            pom.NormalDistribution(center, .1, frozen=True)
            for center in (0.5, 0.67)
        ]
        n_states = len(distributions)
        # Strongly favour starting in the neutral state
        start_probabilities = [.95, .05]
        # Heavy weight on staying put, plus a uniform base weight
        stay_weight = np.identity(n_states) * 100
        base_weight = np.ones((n_states, n_states)) / n_states
        transition_matrix = stay_weight + base_weight
        model = pom.HiddenMarkovModel.from_matrix(transition_matrix,
                                                  distributions,
                                                  start_probabilities,
                                                  state_names=state_names,
                                                  name="loh")

        model.fit(
            sequences=[observations],
            edge_inertia=0.1,
            lr_decay=.75,
            pseudocount=5,
            use_pseudocount=True,
            max_iterations=100000,
            verbose=False)
        states = np.array(model.predict(observations, algorithm='map'))

        logging.info("Done, now finalizing")
        logging.debug("Model states: %s", model.states)
        logging.debug("Predicted states: %s", states[:100])
        logging.debug(str(collections.Counter(states)))
        logging.debug("Edges: %s", model.edges)

        # Collapse runs of identical predicted state into candidate segments
        fake_cnarr = CNA(varr.add_columns(weight=1, log2=0, gene='.').data)
        results = squash_by_groups(fake_cnarr,
                                   varr.as_series(states),
                                   by_arm=False)
        assert (results.start < results.end).all()

    if results is None or len(results) <= 1:
        # No HMM run, or no internal breakpoint: echo the input segment
        return pd.DataFrame(
            {
                'chromosome': segment.chromosome,
                'start': segment.start,
                'end': segment.end,
                'gene': segment.gene,
                'log2': segment.log2,
                'probes': segment.probes,
            },
            index=[0])

    logging.info(
        "Segment %s:%d-%d on allele freqs for %d additional breakpoints",
        segment.chromosome, segment.start, segment.end,
        len(results) - 1)
    # Breakpoints sit halfway between the end of one run and the start of
    # the next; the outer edges stay at the original segment boundaries
    # XXX TODO use original cnarr bin boundaries to select/adjust breakpoint
    next_starts = results.start.values[1:]
    prev_ends = results.end.values[:-1]
    mid_breakpoints = (next_starts + prev_ends) // 2
    dframe = pd.DataFrame({
        'chromosome': segment.chromosome,
        'start': np.concatenate([[segment.start], mid_breakpoints]),
        'end': np.concatenate([mid_breakpoints, [segment.end]]),
        'gene': segment.gene,
        'log2': segment.log2,
        'probes': results['probes'],
    })
    bad_segs_idx = (dframe.start >= dframe.end)
    if bad_segs_idx.any():
        raise RuntimeError("Improper post-processing of segment {} -- "
                           "{} bins start >= end:\n{}\n".format(
                               segment, bad_segs_idx.sum(),
                               dframe[bad_segs_idx]))
    return dframe
예제 #8
0
def get_model(r, params, window_size, num_skipped, seq_len, p, \
    g, resample_prob, x_chr=False, haploid=False, debug=False, h_t=1, skip_score=float("-Inf")):
    """
    Builds the hidden Markov model for a given chromosome or scaffold, using the
    Pomegranate module.

    The model has one emitting state per ancestry genotype (AA, AB, BB; AB is
    left out when haploid) plus one "skip" state paired with each of them.
    Reads the module-level global ``prob_lim`` as an upper cap for the skip and
    end probabilities.

    Arguments:
        r -- (float) the per site, per generation recombination probability
        params -- a dict where keys are names of states (AA, AB, and BB) and values
            are dicts where values are mu and sd, which are floats representing
            means and standard deviations of emission probability distributions
        window_size -- (int) the window size for this run, in bp
        num_skipped -- (int) the number of windows that were skipped due to not passing
            criteria
        seq_len -- (int) the number of windows in the current chromosome/scaffold
        p -- (float) the percent ancestry the admixed population derives from ancestral
            population A (estimated beforehand)
        g -- (int) the number of generations since admixture (estimated beforehand)
        resample_prob -- (float) probability of resampling the same ancestral recombination
            event twice in an individual after the set number of generations since admixture
            (referred to as z in the paper)
        x_chr -- (boolean) does this chromosome/scaffold belong to a hemizygous sex
            chromosome?
        haploid -- (boolean) is this individual haploid along this chromosome/scaffold?
        debug -- (boolean) should debugging messages be printed to the screen?
        h_t -- (float) if the user has specified that expected reduction in heterozygosity
            given the number of generations since admixture should be incorporated into
            the model, this is the expected fraction of the initial heterozygosity that
            remains after g generations.
        skip_score -- (float) the number emitted by adlibs_score when "skipped" windows
            are encountered

    Returns:
        a Pomegranate HMM object for the current chromosome/scaffold
    """
    # prob_lim is only read in this function, never assigned.
    global prob_lim

    model = pomegranate.HiddenMarkovModel(name='ancestry')

    # Compute probabilities of transitioning to a skip state or the end. Cap these
    # both at the specified probability limit.
    skip_prob = num_skipped / seq_len
    if skip_prob > prob_lim:
        skip_prob = prob_lim
    state_end = 1 / seq_len
    if state_end > prob_lim:
        state_end = prob_lim

    # Hemizygous sex chromosome: scale the recombination probability by 2/3.
    if x_chr:
        r *= (2 / 3)

    # Determine probabilities of transitions
    if haploid:
        # Should 2 be 1.5? I don't think so -- we already multiplied r by (2/3)
        # so that's in here already.
        aa_bb = g * r * (1 - p)
        bb_aa = g * r * p
        # Eliminate the heterozygous state.
        aa_ab = 0
        ab_aa = 0
        bb_ab = 0
        ab_bb = 0
    else:
        # get_trans_probs must return a mapping with the six keys read below.
        probs = get_trans_probs(r, g, p, resample_prob)
        aa_ab = probs['aa_ab']
        ab_aa = probs['ab_aa']
        aa_bb = probs['aa_bb']
        bb_ab = probs['bb_ab']
        ab_bb = probs['ab_bb']
        bb_aa = probs['bb_aa']

    # Scale every between-state probability by the window size.
    aa_ab *= window_size
    ab_aa *= window_size
    aa_bb *= window_size
    bb_ab *= window_size
    ab_bb *= window_size
    bb_aa *= window_size

    # Self-transitions take whatever is left after the other outgoing
    # transitions, the (capped) end probability and the skip probability.
    aa_aa = 1 - (aa_ab + aa_bb + state_end + skip_prob)
    ab_ab = 1 - (ab_aa + ab_bb + state_end + skip_prob)
    bb_bb = 1 - (bb_aa + bb_ab + state_end + skip_prob)

    # Account for reduction in heterozygosity due to genetic drift

    if haploid:
        pass
        #aa_aa += (aa_bb - aa_bb*h_t)
        #aa_bb *= h_t
        #bb_bb += (bb_aa - bb_aa*h_t)
        #bb_aa *= h_t
    else:
        # Transitions into / staying in AB are scaled by h_t; the removed
        # amount is added to the two homozygous-target transitions of the
        # same source state. With the default h_t=1 nothing changes.
        # NOTE(review): each pair is updated sequentially, so the second
        # line of a pair sees the already-increased first value in its
        # denominator (e.g. aa_bb's share uses the updated aa_aa) -- confirm
        # this is intended rather than a split based on the original values.
        # NOTE(review): a zero denominator (e.g. ab_aa + ab_bb == 0) would
        # raise ZeroDivisionError here -- verify callers cannot trigger it.
        aa_aa += (aa_aa / (aa_aa + aa_bb)) * (aa_ab - aa_ab * h_t)
        aa_bb += (aa_bb / (aa_aa + aa_bb)) * (aa_ab - aa_ab * h_t)
        bb_aa += (bb_aa / (bb_aa + bb_bb)) * (bb_ab - bb_ab * h_t)
        bb_bb += (bb_bb / (bb_aa + bb_bb)) * (bb_ab - bb_ab * h_t)
        aa_ab *= h_t
        bb_ab *= h_t
        ab_aa += (ab_aa / (ab_aa + ab_bb)) * (ab_ab - ab_ab * h_t)
        ab_bb += (ab_bb / (ab_aa + ab_bb)) * (ab_ab - ab_ab * h_t)
        ab_ab *= h_t

    # Dump the final transition probabilities to stderr when debugging.
    if debug:
        print("# AA -> AA {}".format(aa_aa), file=sys.stderr)
        print("# AA -> AB {}".format(aa_ab), file=sys.stderr)
        print("# AA -> BB {}".format(aa_bb), file=sys.stderr)
        print("# AB -> AA {}".format(ab_aa), file=sys.stderr)
        print("# AB -> AB {}".format(ab_ab), file=sys.stderr)
        print("# AB -> BB {}".format(ab_bb), file=sys.stderr)
        print("# BB -> AA {}".format(bb_aa), file=sys.stderr)
        print("# BB -> AB {}".format(bb_ab), file=sys.stderr)
        print("# BB -> BB {}".format(bb_bb), file=sys.stderr)
        print("# SKIP {}".format(skip_prob), file=sys.stderr)

    # Normal emission distribution for each genotype state, from params.
    aaDist = pomegranate.NormalDistribution(params['AA']['mu'],
                                            params['AA']['sd'])
    abDist = pomegranate.NormalDistribution(params['AB']['mu'],
                                            params['AB']['sd'])
    bbDist = pomegranate.NormalDistribution(params['BB']['mu'],
                                            params['BB']['sd'])

    aaState = pomegranate.State(aaDist, name="AA")
    abState = pomegranate.State(abDist, name="AB")
    bbState = pomegranate.State(bbDist, name="BB")

    # The AB state object is always created but only added when diploid.
    model.add_state(aaState)
    if not haploid:
        model.add_state(abState)
    model.add_state(bbState)

    #### ADD skip states

    # Narrow uniform emission ending at skip_score, shared by all skip states.
    # NOTE(review): with the default skip_score of -inf both bounds are -inf;
    # confirm pomegranate handles that as expected.
    skip_dist = pomegranate.UniformDistribution(skip_score - 0.01, skip_score)

    aa_skip_state = pomegranate.State(skip_dist, name="skip-AA")
    ab_skip_state = pomegranate.State(skip_dist, name="skip-AB")
    bb_skip_state = pomegranate.State(skip_dist, name="skip-BB")

    model.add_state(aa_skip_state)
    if not haploid:
        model.add_state(ab_skip_state)
    model.add_state(bb_skip_state)

    # Start transitions: genotype weights p / (1 - p) when haploid, or
    # p**2, 2*p*(1 - p), (1 - p)**2 when diploid, each split between the
    # genotype state and its skip state according to skip_prob.
    if haploid:
        model.add_transition(model.start, aaState, p * (1 - skip_prob))
        model.add_transition(model.start, aa_skip_state, p * skip_prob)
        model.add_transition(model.start, bbState, (1 - p) * (1 - skip_prob))
        model.add_transition(model.start, bb_skip_state, (1 - p) * skip_prob)
    else:
        model.add_transition(model.start, aaState, p**2 * (1 - skip_prob))
        model.add_transition(model.start, aa_skip_state, p**2 * skip_prob)
        model.add_transition(model.start, abState,
                             2 * p * (1 - p) * (1 - skip_prob))
        model.add_transition(model.start, ab_skip_state,
                             2 * p * (1 - p) * skip_prob)
        model.add_transition(model.start, bbState,
                             (1 - p)**2 * (1 - skip_prob))
        model.add_transition(model.start, bb_skip_state,
                             (1 - p)**2 * skip_prob)

    # End transitions from the genotype states.
    # NOTE(review): these use 1 / seq_len directly, while the self-transition
    # probabilities above were computed with the capped state_end -- confirm
    # whether state_end was meant here.
    model.add_transition(aaState, model.end, 1 / seq_len)
    if not haploid:
        model.add_transition(abState, model.end, 1 / seq_len)
    model.add_transition(bbState, model.end, 1 / seq_len)

    # Transitions among the homozygous states (present in both ploidies).
    model.add_transition(aaState, bbState, aa_bb)
    model.add_transition(aaState, aaState, aa_aa)
    model.add_transition(bbState, aaState, bb_aa)
    model.add_transition(bbState, bbState, bb_bb)

    # Transitions involving the heterozygous state (diploid only).
    if not haploid:
        model.add_transition(aaState, abState, aa_ab)
        model.add_transition(abState, aaState, ab_aa)
        model.add_transition(abState, bbState, ab_bb)
        model.add_transition(abState, abState, ab_ab)
        model.add_transition(bbState, abState, bb_ab)

    ### Add skip state transitions
    # Each genotype state enters its own skip state with skip_prob.
    model.add_transition(aaState, aa_skip_state, skip_prob)
    if not haploid:
        model.add_transition(abState, ab_skip_state, skip_prob)
    model.add_transition(bbState, bb_skip_state, skip_prob)

    # A skip state stays in itself with skip_prob.
    model.add_transition(aa_skip_state, aa_skip_state, skip_prob)
    if not haploid:
        model.add_transition(ab_skip_state, ab_skip_state, skip_prob)
    model.add_transition(bb_skip_state, bb_skip_state, skip_prob)

    # Leaving a skip state for a different genotype uses the same
    # probability as the paired genotype state's own transition.
    model.add_transition(aa_skip_state, bbState, aa_bb)
    model.add_transition(bb_skip_state, aaState, bb_aa)

    if not haploid:
        model.add_transition(aa_skip_state, abState, aa_ab)
        model.add_transition(ab_skip_state, aaState, ab_aa)
        model.add_transition(ab_skip_state, bbState, ab_bb)
        model.add_transition(bb_skip_state, abState, bb_ab)

    # End transitions from the skip states (uncapped 1 / seq_len as well).
    model.add_transition(aa_skip_state, model.end, 1 / seq_len)
    if not haploid:
        model.add_transition(ab_skip_state, model.end, 1 / seq_len)
    model.add_transition(bb_skip_state, model.end, 1 / seq_len)

    # Returning from a skip state to its paired genotype state takes the
    # remainder after skip, cross-genotype and end transitions.
    model.add_transition(aa_skip_state, aaState,
                         1 - skip_prob - aa_ab - aa_bb - 1 / seq_len)
    if not haploid:
        model.add_transition(ab_skip_state, abState,
                             1 - skip_prob - ab_aa - ab_bb - 1 / seq_len)
    model.add_transition(bb_skip_state, bbState,
                         1 - skip_prob - bb_aa - bb_ab - 1 / seq_len)
    ###

    # Finalize the model structure before handing it back.
    model.bake()

    return model