Exemplo n.º 1
0
def choose_ancestral_states_joint(tree, feature, states, frequencies):
    """
    Chooses the joint ancestral state of every node, going from the root down to the tips.

    The root gets the state that maximises its BU_LH likelihood array multiplied by the state frequencies.
    Every other node gets the state index that is stored in its BU_LH_JOINT_STATES array
    at the position of the state index chosen for its parent.

    :param tree: ete3.Tree, the tree of interest
    :param feature: str, character for which the ancestral states are to be chosen
    :param states: numpy.array of possible character states in order corresponding to the likelihood arrays
    :param frequencies: numpy array of state frequencies
    :return: void, sets the get_personalized_feature_name(feature, JOINT_STATE) feature of each node
        to the chosen state index, and its get_personalized_feature_name(feature, ALLOWED_STATES) feature
        to the allowed-state array of the chosen state.
    """
    lh_feature = get_personalized_feature_name(feature, BU_LH)
    lh_state_feature = get_personalized_feature_name(feature,
                                                     BU_LH_JOINT_STATES)
    allowed_state_feature = get_personalized_feature_name(
        feature, ALLOWED_STATES)
    joint_state_feature = get_personalized_feature_name(feature, JOINT_STATE)
    _, state2array = get_state2allowed_states(states, False)

    # An explicit stack is used instead of recursion, so that very deep
    # (e.g. ladder-like) trees cannot hit the interpreter recursion limit.
    # The visiting order is irrelevant: a node only depends on its parent's choice.
    root_state_index = (getattr(tree, lh_feature) * frequencies).argmax()
    pending = [(tree, root_state_index)]
    while pending:
        node, state_index = pending.pop()
        node.add_feature(joint_state_feature, state_index)
        node.add_feature(allowed_state_feature, state2array[state_index])

        for child in node.children:
            pending.append(
                (child, getattr(child, lh_state_feature)[state_index]))
Exemplo n.º 2
0
def downpass(tree, feature, states):
    """
    DOWNPASS walks the tree from the root towards the tips and, for every node,
    merges the state information coming from above (its supertree)
    with the one coming from below (its subtree, computed during UPPASS).

    if N is root:
        UP_S(N) <- union of all states
    else:
        P <- parent of N
        B <- brother(s) of N
        UP_S(N) <- most_common_states(UP_S(P), S(B))
    if N is not a tip:
        L, R <- left and right children of N
        S(N) <- most_common_states(UP_S(N), S(L), S(R))
        DOWNPASS(L)
        DOWNPASS(R)

    The combined states are intersected with the states already stored for the node
    (the stored states are kept if the intersection is empty).
    The temporary bottom-up and top-down features are removed from all the nodes at the end.

    :param tree: the tree of interest
    :type tree: ete3.Tree
    :param feature: character for which the parsimonious states are reconstructed
    :type feature: str
    :param states: possible character states
    :type states: np.array(str)
    :return: updates the get_personalized_feature_name(feature, PARS_STATES) feature of the tree nodes
    """

    bu_states_feature = get_personalized_feature_name(feature, BU_PARS_STATES)
    td_states_feature = get_personalized_feature_name(feature, TD_PARS_STATES)
    final_states_feature = get_personalized_feature_name(feature, PARS_STATES)

    for node in tree.traverse('preorder'):
        # Step 1: states suggested by the part of the tree above the node.
        if node.is_root():
            states_from_above = set(states)
        else:
            parent = node.up
            candidates = [getattr(parent, td_states_feature)]
            candidates += [getattr(sibling, bu_states_feature)
                           for sibling in parent.children if sibling != node]
            states_from_above = get_most_common_states(candidates)
        node.add_feature(td_states_feature, states_from_above)

        # Step 2: combine them with the children's bottom-up states (tips have none).
        if node.is_leaf():
            combined_states = getattr(node, td_states_feature)
        else:
            candidates = [getattr(node, td_states_feature)]
            candidates += [getattr(child, bu_states_feature)
                           for child in node.children]
            combined_states = get_most_common_states(candidates)

        # Step 3: restrict the previously stored states, unless that would leave nothing.
        stored_states = getattr(node, final_states_feature)
        shared_states = combined_states & stored_states
        node.add_feature(final_states_feature,
                         shared_states if shared_states else stored_states)

    # The intermediate features are not needed any more.
    for node in tree.traverse():
        for temporary_feature in (bu_states_feature, td_states_feature):
            node.del_feature(temporary_feature)
Exemplo n.º 3
0
    def test_likelihood_same_for_all_nodes(self):
        """
        Verifies the marginal likelihood calculation: the total log-likelihood
        of every node must match the one of its parent (up to 2 decimal places).
        The root and zero-distance tips are not compared.
        """
        lh_feature = get_personalized_feature_name(feature, LH)
        lh_sf_feature = get_personalized_feature_name(feature, LH_SF)

        def total_loglh(n):
            # log10 of the summed likelihood array, minus the stored scaling factor
            return np.log10(getattr(n, lh_feature).sum()) - getattr(n, lh_sf_feature)

        for node in tree.traverse():
            if node.is_root() or (node.is_leaf() and node.dist == 0):
                continue
            node_loglh = total_loglh(node)
            parent_loglh = total_loglh(node.up)
            failure_message = 'Likelihoods of {} and {} were supposed to be the same.' \
                .format(node.name, node.up.name)
            self.assertAlmostEqual(node_loglh, parent_loglh, places=2,
                                   msg=failure_message)
Exemplo n.º 4
0
def uppass(tree, feature):
    """
    UPPASS walks the tree from the tips up to the root
    and derives the states of each internal node from the states of its children.

    if N is not a tip:
      L, R <- left and right children of N
      UPPASS(L)
      UPPASS(R)
      C <- most_common_states(S(L), S(R))
      if S(N) intersects with C:
         S(N) <- intersection(S(N), C)

    Tips keep the states they already have.

    :param tree: ete3.Tree, the tree of interest
    :param feature: str, character for which the parsimonious states are reconstructed
    :return: void, updates the get_personalized_feature_name(feature, BU_PARS_STATES) feature
        of the internal tree nodes
    """

    bu_states_feature = get_personalized_feature_name(feature, BU_PARS_STATES)

    for node in tree.traverse('postorder'):
        if node.is_leaf():
            continue
        states_from_children = get_most_common_states(
            getattr(child, bu_states_feature) for child in node.children)
        current_states = getattr(node, bu_states_feature)
        shared_states = current_states & states_from_children
        # Keep the current states when they have nothing in common with the children's ones.
        node.add_feature(bu_states_feature, shared_states or current_states)
Exemplo n.º 5
0
def acctran(tree, character, feature=PARS_STATES):
    """
    ACCTRAN (accelerated transformation) (Farris, 1970) reduces the number of ambiguities
    in the parsimonious result by placing the state changes as close to the root as possible,
    hence prioritising the reverse mutations.

    S(root) <- bottom-up states of the root
    for each node N, and each child C of N:
        if intersection(S(N), BU_S(C)) is not empty:
            S(C) <- intersection(S(N), BU_S(C))
        else:
            S(C) <- BU_S(C)

    :param tree: ete3.Tree, the tree of interest
    :param character: str, character for which the parsimonious states are reconstructed
    :param feature: str, name of the node feature in which the resulting states are stored
    :return: void, sets the given feature on the tree nodes, using their
        get_personalized_feature_name(character, BU_PARS_STATES) feature as input
    """

    bu_states_feature = get_personalized_feature_name(character, BU_PARS_STATES)

    for node in tree.traverse('preorder'):
        if node.is_root():
            # The root has no parent to agree with: it takes its bottom-up states as-is.
            node.add_feature(feature, getattr(node, bu_states_feature))
        parent_states = getattr(node, feature)
        for child in node.children:
            bu_child_states = getattr(child, bu_states_feature)
            shared_states = parent_states & bu_child_states
            child.add_feature(feature, shared_states or bu_child_states)
Exemplo n.º 6
0
def unalter_zero_tip_joint_states(tree, feature, state2index):
    """
    Unalters the joint tip states for zero-distance tips
    to contain only their states.

    For a tip with several states, every entry of its joint state array that points to a state
    the tip does not have is replaced (in place) with the index of one of the tip's states.
    For a tip with exactly one state, the joint state array is replaced
    with an array filled with the index of that state.
    Tips with a positive distance, or without states, are left untouched.

    :param state2index: dict, mapping between states and their indices in the joint state array
    :param tree: ete3.Tree, the tree of interest
    :param feature: str, character for which the likelihood was altered
    :return: void, modifies the get_personalised_feature_name(feature, BU_LH_JOINT_STATES) feature
        of zero-distance tips.
    """
    lh_joint_state_feature = get_personalized_feature_name(
        feature, BU_LH_JOINT_STATES)
    n_states = len(state2index)
    for tip in tree:
        if tip.dist > 0:
            continue
        state = getattr(tip, feature, set())
        if len(state) > 1:
            allowed_indices = {state2index[s] for s in state}
            # Arbitrary (but allowed) replacement for the entries pointing to forbidden states.
            allowed_index = next(iter(allowed_indices))
            joint_states = getattr(tip, lh_joint_state_feature)
            for i in range(n_states):
                if joint_states[i] not in allowed_indices:
                    joint_states[i] = allowed_index
        elif len(state) == 1:
            # The builtin int is used as dtype: the np.int alias was removed in NumPy 1.24.
            tip.add_feature(
                lh_joint_state_feature,
                np.ones(n_states, int) * state2index[next(iter(state))])
Exemplo n.º 7
0
def initialize_allowed_states(tree, feature, states):
    """
    Initializes the allowed state arrays of the tree nodes based on their states given by the feature.

    A node whose feature is missing or empty gets an array of ones (any state is allowed),
    otherwise it gets an array with ones at the positions of its states and zeros elsewhere.

    :param tree: tree for which the allowed states are to be initialized
    :type tree: ete3.Tree
    :param feature: feature in which the node states are stored
        (the value could be missing/empty for an unknown state, or contain several states if multiple are possible)
    :type feature: str
    :param states: ordered array of states.
    :type states: numpy.array
    :return: void, adds the get_personalised_feature_name(feature, ALLOWED_STATES) feature to the tree nodes.
    """
    allowed_states_feature = get_personalized_feature_name(
        feature, ALLOWED_STATES)
    state2index = dict(zip(states, range(len(states))))
    n_states = len(state2index)

    for node in tree.traverse():
        node_states = getattr(node, feature, set())
        # The builtin int is used as dtype: the np.int alias was removed in NumPy 1.24.
        if not node_states:
            allowed_states = np.ones(n_states, dtype=int)
        else:
            allowed_states = np.zeros(n_states, dtype=int)
            for state in node_states:
                allowed_states[state2index[state]] = 1
        node.add_feature(allowed_states_feature, allowed_states)
Exemplo n.º 8
0
def deltran(tree, feature):
    """
    DELTRAN (delayed transformation) (Swofford & Maddison, 1987) reduces the number of ambiguities
    in the parsimonious result by placing the state changes as close to the tips as possible,
    hence prioritising parallel mutations. DELTRAN is performed after DOWNPASS.

    for each node N (parents are processed before their children):
        if N is not a root:
            P <- parent(N)
            if intersection(S(N), S(P)) is not empty:
                S(N) <- intersection(S(N), S(P))

    :param tree: ete3.Tree, the tree of interest
    :param feature: str, character for which the parsimonious states are reconstructed
    :return: void, modifies get_personalized_feature_name(feature, PARS_STATES) feature of the tree nodes
    """
    states_feature = get_personalized_feature_name(feature, PARS_STATES)

    for node in tree.traverse('preorder'):
        if node.is_root():
            continue
        own_states = getattr(node, states_feature)
        states_of_parent = getattr(node.up, states_feature)
        shared_states = own_states & states_of_parent
        # Nodes that share nothing with their parent keep their states unchanged.
        if shared_states:
            node.add_feature(states_feature, shared_states)
Exemplo n.º 9
0
def calculate_marginal_likelihoods(tree, feature, frequencies):
    """
    Computes the marginal likelihoods of each tree node as the element-wise product
    of its bottom-up likelihoods, its top-down likelihoods, the state frequencies
    and its allowed state array. The node scaling factor is the sum
    of its top-down and bottom-up scaling factors.

    The bottom-up and top-down likelihood and scaling factor features are removed from the nodes afterwards.

    :param tree: ete3.Tree, the tree of interest
    :param feature: str, character for which the likelihood is calculated
    :param frequencies: numpy array of state frequencies
    :return: void, stores the node marginal likelihoods in the get_personalised_feature_name(feature, LH) feature,
        and their scaling factors in the get_personalised_feature_name(feature, LH_SF) feature.
    """
    bu_lh_feature = get_personalized_feature_name(feature, BU_LH)
    bu_lh_sf_feature = get_personalized_feature_name(feature, BU_LH_SF)
    td_lh_feature = get_personalized_feature_name(feature, TD_LH)
    td_lh_sf_feature = get_personalized_feature_name(feature, TD_LH_SF)
    lh_feature = get_personalized_feature_name(feature, LH)
    lh_sf_feature = get_personalized_feature_name(feature, LH_SF)
    allowed_state_feature = get_personalized_feature_name(
        feature, ALLOWED_STATES)

    intermediate_features = (bu_lh_feature, bu_lh_sf_feature,
                             td_lh_feature, td_lh_sf_feature)

    for node in tree.traverse('preorder'):
        bottom_up = getattr(node, bu_lh_feature)
        top_down = getattr(node, td_lh_feature)
        allowed = getattr(node, allowed_state_feature)
        node.add_feature(lh_feature,
                         bottom_up * top_down * frequencies * allowed)

        scaling_factor = getattr(node, td_lh_sf_feature) \
            + getattr(node, bu_lh_sf_feature)
        node.add_feature(lh_sf_feature, scaling_factor)

        for intermediate_feature in intermediate_features:
            node.del_feature(intermediate_feature)
Exemplo n.º 10
0
def convert_allowed_states2feature(tree, feature, states, out_feature=None):
    """
    Converts the allowed state array of each node into a set of states
    and stores it in the node's output feature.

    :param tree: ete3.Tree, the tree of interest
    :param feature: str, character whose get_personalized_feature_name(feature, ALLOWED_STATES)
        arrays are to be converted
    :param states: numpy.array of states, in the order corresponding to the allowed state arrays
    :param out_feature: str, name of the feature to store the resulting state sets in
        (by default, the character itself)
    :return: void, sets the out_feature feature of each tree node.
    """
    target_feature = feature if out_feature is None else out_feature
    allowed_states_feature = get_personalized_feature_name(
        feature, ALLOWED_STATES)
    for node in tree.traverse():
        # Non-zero entries of the array mark the states that are kept.
        state_mask = getattr(node, allowed_states_feature).astype(bool)
        node.add_feature(target_feature, set(states[state_mask]))
Exemplo n.º 11
0
def get_column_method(column, method):
    """
    Resolves the column name and the method to be used for it.

    The column name is first converted with col_name2cat. If the method is an ML or an MP meta-method,
    it is replaced with the corresponding default method and the column name is personalised with it;
    otherwise the converted column name and the method are returned as they are.

    :param column: str, column name
    :param method: str, prediction method (possibly a meta-method)
    :return: tuple (column name to be used, method to be used)
    """
    category = col_name2cat(column)
    if is_meta_ml(method):
        default_method = get_default_ml_method()
        return get_personalized_feature_name(category, default_method), default_method
    if is_meta_mp(method):
        default_method = get_default_mp_method()
        return get_personalized_feature_name(category, default_method), default_method
    return category, method
Exemplo n.º 12
0
def check_marginal_likelihoods(tree, feature):
    """
    Sanity check: the total marginal log-likelihood (log10 of the summed likelihood array
    minus the scaling factor) of each node must be the same as the one of its parent,
    when both are rounded to 2 decimal places.
    The root and zero-distance tips are not checked.

    :param tree: ete3.Tree, the tree of interest
    :param feature: str, character for which the likelihood is calculated
    :return: void, the tree is not modified
    :raises AssertionError: if the log-likelihood of some node differs from the one of its parent
        (as any assert statement, the check is skipped when Python is run with -O)
    """
    lh_feature = get_personalized_feature_name(feature, LH)
    lh_sf_feature = get_personalized_feature_name(feature, LH_SF)

    for node in tree.traverse():
        if not node.is_root() and not (node.is_leaf() and node.dist == 0):
            node_loglh = np.log10(getattr(node, lh_feature).sum()) - getattr(
                node, lh_sf_feature)
            parent_loglh = np.log10(getattr(
                node.up, lh_feature).sum()) - getattr(node.up, lh_sf_feature)
            # The message makes a failure traceable to the offending pair of nodes.
            assert round(node_loglh, 2) == round(parent_loglh, 2), \
                'Marginal log-likelihoods of {} ({}) and of its parent {} ({}) ' \
                'were supposed to be the same.'.format(
                    node.name, node_loglh, node.up.name, parent_loglh)
Exemplo n.º 13
0
def _parsimonious_states2allowed_states(tree, ps_feature, feature,
                                        state2index):
    """
    Converts the state collection stored in the ps_feature of each node into an allowed state array
    (ones at the indices of the node states, zeros elsewhere).

    :param tree: ete3.Tree, the tree of interest
    :param ps_feature: str, name of the node feature containing the node states
    :param feature: str, character for which the allowed states are set
    :param state2index: dict, mapping between states and their indices in the allowed state array
    :return: void, sets the get_personalized_feature_name(feature, ALLOWED_STATES) feature of each tree node.
    """
    allowed_state_feature = get_personalized_feature_name(
        feature, ALLOWED_STATES)
    n_states = len(state2index)
    for node in tree.traverse():
        node_states = getattr(node, ps_feature)
        state_mask = np.zeros(n_states, dtype=int)
        for node_state in node_states:
            state_mask[state2index[node_state]] = 1
        node.add_feature(allowed_state_feature, state_mask)
Exemplo n.º 14
0
 def process_reconstructed_states(method):
     """
     Stores the states reconstructed with the given method as a node feature
     and appends the corresponding result record to the results.

     Nothing is done unless the method is the requested prediction method
     or the requested prediction method is an ML meta-method.

     NOTE(review): prediction_method, character, tree, states, result and results
     are taken from the enclosing scope, which is not shown here.

     :param method: str, the method whose reconstructed states are processed
     :return: void
     """
     if not (method == prediction_method or is_meta_ml(prediction_method)):
         return
     # The plain character name is only used for the requested method itself.
     if prediction_method != method:
         method_character = get_personalized_feature_name(character, method)
     else:
         method_character = character
     convert_allowed_states2feature(tree, character, states,
                                    method_character)
     record = result.copy()
     record[CHARACTER] = method_character
     record[METHOD] = method
     results.append(record)
Exemplo n.º 15
0
def choose_ancestral_states_map(tree, feature, states):
    """
    Chooses the ancestral state of each node with the MAP method:
    the state with the largest value in the node marginal likelihood array is selected.

    :param tree: ete3.Tree, the tree of interest
    :param feature: str, character for which the ancestral states are to be chosen
    :param states: numpy.array of possible character states in order corresponding to the likelihood arrays
    :return: void, sets the get_personalized_feature_name(feature, ALLOWED_STATES) feature of each node
        to the allowed-state array of the selected state.
    """
    lh_feature = get_personalized_feature_name(feature, LH)
    allowed_state_feature = get_personalized_feature_name(
        feature, ALLOWED_STATES)
    _, state2array = get_state2allowed_states(states, False)

    for node in tree.traverse():
        best_state_index = getattr(node, lh_feature).argmax()
        node.add_feature(allowed_state_feature, state2array[best_state_index])
Exemplo n.º 16
0
def initialise_parsimonious_states(tree, feature, states):
    """
    Initializes the parsimonious states of the tree nodes based on their states given by the feature.

    A node whose feature is missing or empty gets the set of all the states;
    otherwise it gets the states stored in its feature.
    The same value is stored both as the bottom-up states and as the parsimonious states of the node.

    :param tree: ete3.Tree, tree for which the states are to be initialized
    :param feature: str, feature in which the node states are stored (could be missing or empty for an unknown state)
    :param states: numpy array, possible states.
    :return: void, adds the get_personalised_feature_name(feature, BU_PARS_STATES)
        and get_personalised_feature_name(feature, PARS_STATES) features to the tree nodes.
    """
    bu_states_feature = get_personalized_feature_name(feature, BU_PARS_STATES)
    pars_states_feature = get_personalized_feature_name(feature, PARS_STATES)
    every_state = set(states)

    for node in tree.traverse():
        annotated_states = getattr(node, feature, set())
        node.add_feature(bu_states_feature,
                         annotated_states if annotated_states else every_state)
        node.add_feature(pars_states_feature,
                         getattr(node, bu_states_feature))
Exemplo n.º 17
0
 def process_result(method, feature):
     """
     Chooses the parsimonious states for the given method, logs how many nodes remained unresolved
     and appends the corresponding result record to the results.

     NOTE(review): prediction_method, character, tree, result, results, num_nodes and logger
     are taken from the enclosing scope, which is not shown here.

     :param method: str, the parsimonious method being processed
     :param feature: str, node feature passed on to choose_parsimonious_states
     :return: void
     """
     # The plain character name is only used for the requested method itself.
     if prediction_method != method:
         out_feature = get_personalized_feature_name(character, method)
     else:
         out_feature = character

     record = result.copy()
     n_scenarios, n_unresolved, n_states \
         = choose_parsimonious_states(tree, feature, out_feature)
     record[NUM_SCENARIOS] = n_scenarios
     record[NUM_UNRESOLVED_NODES] = n_unresolved
     record[NUM_STATES_PER_NODE] = n_states
     record[NUM_STATES_PER_NODE] /= num_nodes
     record[PERC_UNRESOLVED] = record[NUM_UNRESOLVED_NODES] * 100 / num_nodes

     unresolved = record[NUM_UNRESOLVED_NODES]
     states_per_node = record[NUM_STATES_PER_NODE]
     message_template = '{} node{} unresolved ({:.2f}%) for {} by {}, ' \
                        'i.e. {:.4f} state{} per node in average.'
     logger.debug(message_template.format(
         unresolved,
         's are' if unresolved != 1 else ' is',
         record[PERC_UNRESOLVED], character, method,
         states_per_node,
         's' if states_per_node > 1 else ''))

     record[CHARACTER] = out_feature
     record[METHOD] = method
     results.append(record)
Exemplo n.º 18
0
def convert_likelihoods_to_probabilities(tree, feature, states):
    """
    Converts the marginal likelihoods of each node to marginal probabilities
    by dividing them by their sum.

    :param states: numpy array of states in the order corresponding to the marginal likelihood arrays
    :param tree: ete3.Tree, the tree of interest
    :param feature: str, character for which the probabilities are calculated
    :return: pandas DataFrame, with one row per node name and one column per state,
        containing the node marginal probabilities.
    """
    lh_feature = get_personalized_feature_name(feature, LH)

    def normalised(likelihoods):
        return likelihoods / likelihoods.sum()

    name2probs = {
        node.name: normalised(getattr(node, lh_feature))
        for node in tree.traverse()
    }

    return pd.DataFrame.from_dict(name2probs, orient='index', columns=states)
Exemplo n.º 19
0
def alter_zero_tip_allowed_states(tree, feature):
    """
    Alters the allowed state arrays of zero-distance tips
    to make sure they do not contradict with their zero-distance tip siblings.

    Zero-distance tips whose feature value is set (neither None nor an empty string)
    are grouped by parent. If the tips of a group have no state that is allowed for all of them,
    each tip of the group gets the union of the group's allowed states.

    :param tree: ete3.Tree, the tree of interest
    :param feature: str, character for which the allowed states are altered
    :return: void, modifies the get_personalised_feature_name(feature, ALLOWED_STATES) feature
        of zero-distance tips.
    """
    zero_parent2tips = defaultdict(list)

    allowed_state_feature = get_personalized_feature_name(
        feature, ALLOWED_STATES)

    for tip in tree:
        if tip.dist == 0:
            state = getattr(tip, feature, None)
            if state is not None and state != '':
                zero_parent2tips[tip.up].append(tip)

    # adjust zero tips to contain all the zero tip options as states
    for zero_tips in zero_parent2tips.values():
        tip_arrays = [getattr(tip, allowed_state_feature) for tip in zero_tips]

        # If there is a common state do nothing:
        # a state allowed for every tip is counted len(zero_tips) times.
        counts = tip_arrays[0].copy()
        for tip_array in tip_arrays[1:]:
            counts += tip_array
        if counts.max() == len(zero_tips):
            continue

        # Otherwise set all tip states to state union
        union = tip_arrays[0].copy()
        for tip_array in tip_arrays[1:]:
            union[np.nonzero(tip_array)] = 1
        # Each tip gets its own copy, so that a later in-place change
        # of one tip's array cannot silently alter its siblings.
        for tip in zero_tips:
            tip.add_feature(allowed_state_feature, union.copy())
Exemplo n.º 20
0
def unalter_zero_tip_allowed_states(tree, feature, state2index):
    """
    Unalters the allowed state arrays of zero-distance tips
    to contain ones only in their own states.

    The current allowed state array of a tip is intersected with the array built from the tip's own states.
    If the intersection is empty, the array built from the tip's own states is used instead.
    Tips with a positive distance, or without states, are left untouched.

    :param state2index: dict, mapping between states and their indices in the allowed state array
    :param tree: ete3.Tree, the tree of interest
    :param feature: str, character for which the allowed states were altered
    :return: void, modifies the get_personalised_feature_name(feature, ALLOWED_STATES) feature
        of zero-distance tips.
    """
    allowed_state_feature = get_personalized_feature_name(
        feature, ALLOWED_STATES)
    for tip in tree:
        if tip.dist > 0:
            continue
        state = getattr(tip, feature, set())
        if state:
            # The builtin int is used as dtype: the np.int alias was removed in NumPy 1.24.
            initial_allowed_states = np.zeros(len(state2index), int)
            for tip_state in state:
                initial_allowed_states[state2index[tip_state]] = 1
            allowed_states = getattr(
                tip, allowed_state_feature) & initial_allowed_states
            tip.add_feature(allowed_state_feature, (allowed_states if np.any(
                allowed_states > 0) else initial_allowed_states))
Exemplo n.º 21
0
def get_num_parsimonious_steps(tree, feature):
    """
    Counts the minimal number of state changes needed to explain the states
    stored in the given feature of the tree nodes.

    The tree is processed from the tips to the root. Each state of a tip costs 0.
    For an internal node, the cost of a state is the sum, over the node's children,
    of the cheapest child option, where a child state costs its own cost
    plus 1 if it differs from the node state.

    :param tree: ete3.Tree, the tree of interest
    :param feature: str, name of the node feature containing the node states
    :return: the minimal cost among the root states. The temporary
        get_personalized_feature_name(feature, PARS_STATE2NUM) features are removed from the tree.
    """
    num_feature = get_personalized_feature_name(feature, PARS_STATE2NUM)

    for node in tree.traverse('postorder'):
        node_states = getattr(node, feature)
        if node.is_leaf():
            node.add_feature(num_feature, dict.fromkeys(node_states, 0))
            continue

        state2cost = {}
        for node_state in node_states:
            state2cost[node_state] = sum(
                min((0 if node_state == child_state else 1) + child_cost
                    for child_state, child_cost
                    in getattr(child, num_feature).items())
                for child in node.children)
        node.add_feature(num_feature, state2cost)

        # The children's costs are not needed once their parent is processed.
        for child in node.children:
            child.del_feature(num_feature)

    root_state2cost = getattr(tree, num_feature)
    tree.del_feature(num_feature)
    return min(root_state2cost.values())
Exemplo n.º 22
0
def calculate_top_down_likelihood(tree,
                                  character,
                                  frequencies,
                                  sf,
                                  kappa=None,
                                  model=F81):
    r"""
    Calculates the top-down likelihood for the given tree.
    The likelihood for each node is stored in the corresponding feature,
    given by get_personalised_feature_name(feature, TD_LH).

    To calculate the top-down likelihood of a node, we assume that the tree is rooted in this node
    and combine the likelihoods of the “up-subtrees”,
    e.g. to calculate the top-down likelihood of a node N1 being in a state i,
    given that its parent node is P and its brother node is N2, we imagine that the tree is re-rooted in N1,
    therefore P becoming the child of N1, and N2 its grandchild.
    We then calculate the bottom-up likelihood from the P subtree:
    L_top_down(N1, i) = \sum_j P(i -> j, dist(N1, P)) * L_top_down(P) * \sum_k P(j -> k, dist(N2, P)) * L_bottom_up (N2).

    For the root node we assume its top-down likelihood to be 1 for all the states.

    :param model: model of character evolution
    :type model: str
    :param sf: scaling factor
    :type sf: float
    :param kappa: (optional) passed on to get_pij_method together with the model and the frequencies
    :param character: character whose ancestral state likelihood is being calculated
    :type character: str
    :param tree: tree of interest (with bottom-up likelihood pre-calculated)
    :type tree: ete3.Tree
    :param frequencies: state frequencies
    :type frequencies: numpy.array
    :return: void, stores the node top-down likelihoods in the get_personalised_feature_name(feature, TD_LH) feature,
        and their scaling factors in the get_personalised_feature_name(feature, TD_LH_SF) feature.
    """
    # NB: the docstring is a raw string because of the backslashes in its formula
    # (otherwise "\s" is an invalid escape sequence).

    lh_feature = get_personalized_feature_name(character, TD_LH)
    lh_sf_feature = get_personalized_feature_name(character, TD_LH_SF)
    bu_lh_feature = get_personalized_feature_name(character, BU_LH)
    bu_lh_sf_feature = get_personalized_feature_name(character, BU_LH_SF)

    get_pij = get_pij_method(model, frequencies, kappa)
    for node in tree.traverse('preorder'):
        if node.is_root():
            node.add_feature(lh_feature, np.ones(len(frequencies), np.float64))
            node.add_feature(lh_sf_feature, 0)
            continue

        parent = node.up
        parent_bu_likelihood = getattr(parent, bu_lh_feature)

        node_pjis = np.transpose(get_pij(node.dist * sf))
        node_contribution = getattr(node, bu_lh_feature).dot(node_pjis)

        # Combined likelihood of the parent, from which the contribution of the node itself
        # is divided out (only where the likelihood is non-zero, to avoid dividing zeros).
        parent_likelihood = getattr(parent, lh_feature) * parent_bu_likelihood
        nonzero_indices = np.nonzero(parent_likelihood)
        parent_likelihood[nonzero_indices] /= node_contribution[nonzero_indices]
        factors = getattr(parent, lh_sf_feature) + getattr(
            parent, bu_lh_sf_feature) - getattr(node, bu_lh_sf_feature)

        td_likelihood = parent_likelihood.dot(node_pjis)
        # NOTE(review): rescale presumably rescales td_likelihood in place
        # and returns the applied scaling factor; confirm against its definition.
        factors += rescale(
            td_likelihood,
            fraction_of_limit=len(node.children) if not node.is_leaf() else 1)

        node.add_feature(lh_feature, td_likelihood)
        node.add_feature(lh_sf_feature, factors)
Exemplo n.º 23
0
def pastml_pipeline(tree, data, data_sep='\t', id_index=0,
                    columns=None, prediction_method=MPPA, model=F81, parameters=None,
                    name_column=None, date_column=None, tip_size_threshold=REASONABLE_NUMBER_OF_TIPS,
                    out_data=None, html_compressed=None, html=None, work_dir=None,
                    verbose=False, forced_joint=False, upload_to_itol=False, itol_id=None, itol_project=None,
                    itol_tree_name=None):
    """
    Applies PASTML to the given tree with the specified states and visualizes the result (as html maps).

    :param tree: path to the input tree in newick format (must be rooted).
    :type tree: str

    :param data: path to the annotation file in tab/csv format with the first row containing the column names.
    :type data: str
    :param data_sep: (optional, by default '\\t') column separator for the annotation table.
        By default is set to tab, i.e. for tab-delimited file. Set it to ',' if your file is csv.
    :type data_sep: char
    :param id_index: (optional, by default is 0) index of the column in the annotation table
        that contains the tree tip names, indices start from zero.
    :type id_index: int

    :param columns: (optional) name(s) of the annotation table column(s) that contain character(s)
        to be analysed. If not specified all annotation table columns will be considered.
    :type columns: str or list(str)
    :param prediction_method: (optional, default is pastml.ml.MPPA) ancestral character reconstruction method(s),
        can be one of the max likelihood (ML) methods: pastml.ml.MPPA, pastml.ml.MAP, pastml.ml.JOINT,
        one of the max parsimony (MP) methods: pastml.parsimony.ACCTRAN, pastml.parsimony.DELTRAN,
        pastml.parsimony.DOWNPASS; or pastml.acr.COPY to keep the annotated character states as-is without inference.
        One can also specify one of the meta-methods: pastml.ml.ALL, pastml.ml.ML, pastml.parsimony.MP,
        that would perform ACR with multiple methods (all of them for pastml.ml.ALL,
        all the ML methods for pastml.ml.ML, or all the MP methods for pastml.parsimony.MP)
        and save/visualise the results as multiple characters suffixed with the corresponding method.
        When multiple ancestral characters are specified (with ``columns`` argument),
        the same method can be used for all of them (if only one method is specified),
        or different methods can be used (specified in the same order as ``columns``).
        If multiple methods are given, but not for all the characters,
        for the rest of them the default method (pastml.ml.MPPA) is chosen.
    :type prediction_method: str or list(str)
    :param forced_joint: (optional, default is False) add JOINT state to the MPPA state selection
        even if it is not selected by Brier score.
    :type forced_joint: bool
    :param model: (optional, default is pastml.models.f81_like.F81) evolutionary model(s) for ML methods
        (ignored by MP methods).
        When multiple ancestral characters are specified (with ``columns`` argument),
        the same model can be used for all of them (if only one model is specified),
        or different models can be used (specified in the same order as ``columns``).
        If multiple models are given, but not for all the characters,
        for the rest of them the default model (pastml.models.f81_like.F81) is chosen.
    :type model: str or list(str)
    :param parameters: optional way to fix some of the ML-method parameters.
        Could be specified as
        (1a) a dict {column: {param: value}},
        where column corresponds to the character for which these parameters should be used,
        or (1b) in a form {column: path_to_param_file};
        or (2) as a list of paths to parameter files
        (in the same order as ``columns`` argument that specifies characters)
        possibly given only for the first few characters;
        or (3) as a path to parameter file (only for the first character).
        Each file should be tab-delimited, with two columns: the first one containing parameter names,
        and the second, named "value", containing parameter values.
        Parameters can include character state frequencies (parameter name should be the corresponding state,
        and parameter value - the float frequency value, between 0 and 1),
        and tree branch scaling factor (parameter name pastml.ml.SCALING_FACTOR).
    :type parameters: str or list(str) or dict

    :param name_column: (optional) name of the annotation table column to be used for node names
        in the compressed map visualisation
        (must be one of those specified in ``columns``, if ``columns`` are specified).
        If the annotation table contains only one column, it will be used by default.
    :type name_column: str
    :param date_column: (optional) name of the annotation table column that contains tip dates,
        if specified it is used to add a time slider to the visualisation.
    :type date_column: str
    :param tip_size_threshold: (optional, by default is 15) recursively remove the tips
        of size less than threshold-th largest tip from the compressed map (set to 1e10 to keep all).
        The larger it is the less tips will be trimmed.
    :type tip_size_threshold: int

    :param out_data: path to the output annotation file with the reconstructed ancestral character states.
    :type out_data: str
    :param html_compressed: path to the output compressed visualisation file (html).
    :type html_compressed: str
    :param html: (optional) path to the output tree visualisation file (html).
    :type html: str
    :param work_dir: (optional) path to the folder where pastml parameter, named tree
        and marginal probability (for marginal ML methods (pastml.ml.MPPA, pastml.ml.MAP) only) files are to be stored.
        Default is <path_to_input_file>/<input_file_name>_pastml. If the folder does not exist, it will be created.
    :type work_dir: str

    :param verbose: (optional, default is False) print information on the progress of the analysis.
    :type verbose: bool

    :param upload_to_itol: (optional, default is False) whether the annotated tree should be uploaded to iTOL
        (https://itol.embl.de/)
    :type upload_to_itol: bool
    :param itol_id: (optional) iTOL user batch upload ID that enables uploading to your iTOL account
        (see https://itol.embl.de/help.cgi#batch). If not specified, the tree will not be associated to any account.
    :type itol_id: str
    :param itol_project: (optional) iTOL project the annotated tree should be uploaded to
        (must exist, and itol_id must be specified). If not specified, the tree will not be associated to any project.
    :type itol_project: str
    :param itol_tree_name: (optional) name for the tree uploaded to iTOL.
    :type itol_tree_name: str

    :raises ValueError: if ``parameters`` is neither a str, a list nor a dict.
    :return: the root of the tree (as produced by the input validation) on which ACR was performed.
    """
    logger = _set_up_pastml_logger(verbose)

    # Inference can be skipped altogether only if every requested method is COPY.
    copy_only = COPY == prediction_method \
        or (isinstance(prediction_method, list) and all(COPY == _ for _ in prediction_method))
    root, df, years, tip2date, name_column = \
        _validate_input(columns, data, data_sep, date_column, html, html_compressed, id_index, name_column, tree,
                        copy_only=copy_only)

    if not date_column:
        date_column = 'Dist. to root'

    # Normalise all the accepted forms of `parameters` to a dict keyed by column.
    if parameters:
        if isinstance(parameters, str):
            parameters = [parameters]
        if isinstance(parameters, list):
            # zip stops at the shorter sequence, so a short list covers the first columns only
            parameters = dict(zip(df.columns, parameters))
        elif isinstance(parameters, dict):
            parameters = {col_name2cat(col): params for (col, params) in parameters.items()}
        else:
            raise ValueError('Parameters should be either a list or a dict, got {}.'.format(type(parameters)))
    else:
        parameters = {}

    if not work_dir:
        work_dir = get_pastml_work_dir(tree)
    os.makedirs(work_dir, exist_ok=True)

    acr_results = acr(root, df, prediction_method=prediction_method, model=model, column2parameters=parameters,
                      force_joint=forced_joint)
    column2states = {acr_result[CHARACTER]: acr_result[STATES] for acr_result in acr_results}

    if not out_data:
        out_data = os.path.join(work_dir, get_combined_ancestral_state_file())
    state_df = _serialize_predicted_states(sorted(column2states.keys()), out_data, root)

    # a meta-method would have added a suffix to the name feature
    if html_compressed and name_column and name_column not in column2states:
        ml_name_column = get_personalized_feature_name(name_column, get_default_ml_method())
        name_column = ml_name_column if ml_name_column in column2states \
            else get_personalized_feature_name(name_column, get_default_mp_method())

    # Serialisation (and the optional iTOL upload) run in background threads while the
    # visualisation is built in this thread. The try/finally guarantees that the worker
    # threads are released even if writing the tree or the visualisation fails.
    pool = ThreadPool()
    try:
        new_tree = os.path.join(work_dir, get_named_tree_file(tree))
        root.write(outfile=new_tree, format_root_node=True, format=3)
        async_result = pool.map_async(func=_serialize_acr,
                                      iterable=((acr_res, work_dir) for acr_res in acr_results))
        itol_result = None
        if upload_to_itol:
            itol_result = pool.apply_async(func=generate_itol_annotations,
                                           args=(column2states, work_dir, acr_results, state_df, date_column,
                                                 tip2date, new_tree, itol_id, itol_project,
                                                 itol_tree_name))

        if html or html_compressed:
            logger.debug('\n=============VISUALISATION=====================')
            visualize(root, column2states=column2states,
                      html=html, html_compressed=html_compressed, years=years, tip2date=tip2date,
                      name_column=name_column, tip_size_threshold=tip_size_threshold, date_column=date_column)

        # NOTE: wait() blocks until the tasks finish but does not re-raise exceptions
        # that happened in the worker threads (get() would).
        async_result.wait()
        if itol_result is not None:
            itol_result.wait()
    finally:
        pool.close()
        pool.join()

    return root
Exemplo n.º 24
0
def parsimonious_acr(tree, character, prediction_method, states, num_nodes,
                     num_tips):
    """
    Runs the requested parsimonious reconstruction(s) on the tree and stores
    the chosen states in the corresponding node feature(s).

    :param tree: ete3.Tree, the tree of interest
    :param character: str, character for which the parsimonious states are reconstructed
    :param prediction_method: str, ACCTRAN (accelerated transformation), DELTRAN (delayed transformation),
        DOWNPASS, or the meta-method MP (which applies all three)
    :param states: numpy array of possible states
    :param num_nodes: number of nodes, used as the denominator of the per-node statistics
        and stored in each result under NUM_NODES
    :param num_tips: number of tips, stored in each result under NUM_TIPS
    :return: list of dicts (one per method applied),
        each mapping reconstruction parameters to their values
    """
    initialise_parsimonious_states(tree, character, states)
    uppass(tree, character)

    logger = logging.getLogger('pastml')
    result = {STATES: states, NUM_NODES: num_nodes, NUM_TIPS: num_tips}
    results = []

    def process_result(method, feature):
        # The plain character name is used when exactly this method was requested;
        # under a meta-method the output feature is suffixed with the method.
        if prediction_method == method:
            out_feature = character
        else:
            out_feature = get_personalized_feature_name(character, method)

        num_scenarios, num_unresolved, total_states = \
            choose_parsimonious_states(tree, feature, out_feature)
        states_per_node = total_states / num_nodes
        perc_unresolved = num_unresolved * 100 / num_nodes

        res = result.copy()
        res[NUM_SCENARIOS] = num_scenarios
        res[NUM_UNRESOLVED_NODES] = num_unresolved
        res[NUM_STATES_PER_NODE] = states_per_node
        res[PERC_UNRESOLVED] = perc_unresolved

        node_suffix = ' is' if num_unresolved == 1 else 's are'
        state_suffix = 's' if states_per_node > 1 else ''
        logger.debug('{} node{} unresolved ({:.2f}%) for {} by {}, '
                     'i.e. {:.4f} state{} per node in average.'.format(
                         num_unresolved, node_suffix, perc_unresolved,
                         character, method, states_per_node, state_suffix))

        res[CHARACTER] = out_feature
        res[METHOD] = method
        results.append(res)

    if prediction_method in (ACCTRAN, MP):
        acctran_feature = get_personalized_feature_name(character, PARS_STATES)
        if prediction_method == MP:
            # keep ACCTRAN states apart from those of the other MP methods
            acctran_feature = get_personalized_feature_name(
                acctran_feature, ACCTRAN)
        acctran(tree, character, acctran_feature)
        result[STEPS] = get_num_parsimonious_steps(tree, acctran_feature)
        process_result(ACCTRAN, acctran_feature)

        bu_feature = get_personalized_feature_name(character, BU_PARS_STATES)
        # The bottom-up states are dropped only when no other method follows.
        if prediction_method == ACCTRAN:
            obsolete_features = (bu_feature, acctran_feature)
        else:
            obsolete_features = (acctran_feature,)
        for node in tree.traverse():
            for obsolete in obsolete_features:
                node.del_feature(obsolete)

    if prediction_method != ACCTRAN:
        downpass(tree, character, states)
        pars_feature = get_personalized_feature_name(character, PARS_STATES)
        if prediction_method == DOWNPASS:
            result[STEPS] = get_num_parsimonious_steps(tree, pars_feature)
        if prediction_method in (DOWNPASS, MP):
            process_result(DOWNPASS, pars_feature)
        if prediction_method in (DELTRAN, MP):
            deltran(tree, character)
            if prediction_method == DELTRAN:
                result[STEPS] = get_num_parsimonious_steps(tree, pars_feature)
            process_result(DELTRAN, pars_feature)
        for node in tree.traverse():
            node.del_feature(pars_feature)

    message = "Parsimonious reconstruction for {} requires {} state changes.".format(
        character, result[STEPS])
    logger.debug(message)
    return results
Exemplo n.º 25
0
def choose_ancestral_states_mppa(tree, feature, states, force_joint=True):
    """
    Chooses node ancestral states based on their marginal probabilities using MPPA method.

    For each node, the number k of states to keep is the one for which the vector
    (0, ..., 0, 1/k, ..., 1/k) is closest (in squared Euclidean distance)
    to the sorted marginal probabilities of the node; the k most likely states are then kept.

    :param force_joint: make sure that Joint state is chosen even if it has a low probability.
    :type force_joint: bool
    :param tree: tree of interest
    :type tree: ete3.Tree
    :param feature: character for which the ancestral states are to be chosen
    :type feature: str
    :param states: possible character states in order corresponding to the probabilities array
    :type states: numpy.array
    :return: a tuple (num_scenarios, unresolved_nodes, num_states), where
        num_scenarios is the number of ancestral scenarios selected
        (calculated by multiplying the number of selected states for all nodes),
        unresolved_nodes is the number of nodes for which more than one state was selected,
        and num_states is the total number of selected states summed over all nodes.
        Also modifies the get_personalized_feature_name(feature, ALLOWED_STATES) feature of each node
        to only contain the selected states.
    :rtype: tuple
    """
    lh_feature = get_personalized_feature_name(feature, LH)
    allowed_state_feature = get_personalized_feature_name(
        feature, ALLOWED_STATES)
    joint_state_feature = get_personalized_feature_name(feature, JOINT_STATE)

    n = len(states)
    _, state2array = get_state2allowed_states(states, False)

    # Candidate distributions do not depend on the node: the k-th one puts zero
    # on the (n - k) least probable states and 1/k on each of the k most probable ones.
    candidates = [np.hstack((np.zeros(n - k), np.ones(k) / k))
                  for k in range(1, n + 1)]

    num_scenarios = 1
    unresolved_nodes = 0
    num_states = 0

    # If force_joint == True,
    # we make sure that the joint state is always chosen,
    # for this we sort the marginal probabilities array as
    # [lowest_non_joint_mp, ..., highest_non_joint_mp, joint_mp],
    # select k in 1:n such that the correction between choosing 0, 0, ..., 1/k, ..., 1/k
    # and our sorted array is minimal, and return the corresponding states.
    for node in tree.traverse():
        marginal_likelihoods = getattr(node, lh_feature)
        marginal_probs = marginal_likelihoods / marginal_likelihoods.sum()
        if force_joint:
            joint_index = getattr(node, joint_state_feature)
            joint_prob = marginal_probs[joint_index]
            marginal_probs = np.hstack(
                (np.sort(np.delete(marginal_probs, joint_index)),
                 [joint_prob]))
        else:
            joint_index = None
            marginal_probs = np.sort(marginal_probs)

        best_k = n
        best_correction = np.inf
        for k, candidate in enumerate(candidates, start=1):
            difference = candidate - marginal_probs
            correction = difference.dot(difference)
            # strict comparison: on ties the smaller k wins
            if correction < best_correction:
                best_correction = correction
                best_k = k

        num_scenarios *= best_k
        num_states += best_k

        if force_joint:
            # The joint state goes first (so it is always within the first best_k),
            # the rest follow by decreasing marginal likelihood.
            ranked = sorted(range(n),
                            key=lambda i: (0 if i == joint_index else 1,
                                           -marginal_likelihoods[i]))
        else:
            ranked = sorted(range(n), key=lambda i: -marginal_likelihoods[i])
        indices_selected = ranked[:best_k]

        if best_k == 1:
            allowed_states = state2array[indices_selected[0]]
        else:
            # np.int was removed from NumPy; the builtin int is its documented replacement
            allowed_states = np.zeros(len(states), dtype=int)
            allowed_states[indices_selected] = 1
            unresolved_nodes += 1
        node.add_feature(allowed_state_feature, allowed_states)

    return num_scenarios, unresolved_nodes, num_states
Exemplo n.º 26
0
def get_bottom_up_likelihood(tree,
                             character,
                             frequencies,
                             sf,
                             kappa=None,
                             is_marginal=True,
                             model=F81):
    """
    Calculates the bottom-up likelihood for the given tree.
    The likelihood for each node is stored in the corresponding feature,
    given by get_personalized_feature_name(character, BU_LH),
    and the accumulated scaling factor of its subtree in the feature
    given by get_personalized_feature_name(character, BU_LH_SF).
    For joint reconstruction, each non-root node additionally gets
    the get_personalized_feature_name(character, BU_LH_JOINT_STATES) feature,
    containing for each parent state the index of the best state of this node.

    :param model: model of character evolution
    :type model: str
    :param is_marginal: whether the likelihood reconstruction is marginal (true) or joint (false)
    :type is_marginal: bool
    :param tree: tree of interest
    :type tree: ete3.Tree
    :param character: character for which the likelihood is calculated
    :type character: str
    :param frequencies: array of state frequencies pi_i
    :type frequencies: numpy.array
    :param sf: scaling factor
    :type sf: float
    :param kappa: (optional) passed as-is to get_pij_method together with the model and frequencies
    :return: log likelihood (-inf if the likelihood array of some node contains only zeros)
    :rtype: float
    """
    lh_sf_feature = get_personalized_feature_name(character, BU_LH_SF)
    lh_feature = get_personalized_feature_name(character, BU_LH)
    lh_joint_state_feature = get_personalized_feature_name(
        character, BU_LH_JOINT_STATES)
    allowed_state_feature = get_personalized_feature_name(
        character, ALLOWED_STATES)

    get_pij = get_pij_method(model, frequencies, kappa)
    # Postorder: all the children are processed before their parent.
    for node in tree.traverse('postorder'):
        # Start from the node's allowed-state mask (states that are not allowed get zero).
        likelihood_array = np.ones(len(frequencies),
                                   dtype=np.float64) * getattr(
                                       node, allowed_state_feature)
        factors = 0
        for child in node.children:
            # presumably get_pij returns a (parent state x child state) matrix,
            # so that each row is weighted by the child's likelihoods -- TODO confirm
            child_likelihoods = get_pij(child.dist * sf) * getattr(
                child, lh_feature)
            if is_marginal:
                # marginal: sum over the child states
                child_likelihoods = child_likelihoods.sum(axis=1)
            else:
                # joint: remember the best child state for each parent state and keep its value
                child_states = child_likelihoods.argmax(axis=1)
                child.add_feature(lh_joint_state_feature, child_states)
                child_likelihoods = child_likelihoods.max(axis=1)

            # NOTE(review): rescale is defined elsewhere; it appears to modify the array in place
            # and to return the scaling exponent that was applied -- confirm.
            factors += rescale(child_likelihoods,
                               fraction_of_limit=len(node.children))
            likelihood_array *= child_likelihoods

        # No state of this node is compatible with the data below it.
        if np.all(likelihood_array == 0):
            return -np.inf

        factors += rescale(likelihood_array,
                           fraction_of_limit=len(node.up.children)
                           if not node.is_root() else 1)
        node.add_feature(lh_feature, likelihood_array)
        # The node's factor accumulates those of its whole subtree.
        node.add_feature(
            lh_sf_feature,
            factors + sum(getattr(_, lh_sf_feature) for _ in node.children))
    root_likelihoods = getattr(tree, lh_feature) * frequencies
    root_likelihoods = root_likelihoods.sum(
    ) if is_marginal else root_likelihoods.max()
    # The accumulated factor is subtracted as a power of ten to undo the rescaling.
    return np.log(root_likelihoods) - getattr(tree, lh_sf_feature) * np.log(10)