Пример #1
0
 def test_num_nodes(self):
     """Check the number of tree nodes falling into each reconstructed state.

     The check runs on a copy of the module-level tree, which is passed
     (wrapped in a list) to collapse_zero_branches first. A node whose state
     collection holds more than one value is tallied under 'unresolved';
     otherwise it is tallied under its single state.
     """
     root = tree.copy()
     collapse_zero_branches([root])

     state2num = Counter()
     for node in root.traverse():
         candidates = getattr(node, feature)
         # Several candidate states mean the node could not be resolved to one.
         key = 'unresolved' if len(candidates) > 1 else next(iter(candidates))
         state2num[key] += 1

     expected_state2num = {
         'Africa': 114,
         'Albania': 50,
         'Greece': 69,
         'WestEurope': 28,
         'EastEurope': 16
     }
     failure_message = 'Was supposed to have {} as states counts, got {}.'.format(
         expected_state2num, state2num)
     self.assertDictEqual(expected_state2num, state2num, msg=failure_message)
Пример #2
0
from collections import Counter

import pandas as pd

from pastml.tree import read_tree, collapse_zero_branches
from pastml.acr import acr
from pastml.parsimony import ACCTRAN, STEPS

# Directory with the test fixtures: the 'data' folder located next to this module.
DATA_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data')
# Input tree file and annotation table used by the tests in this module.
TREE_NWK = os.path.join(DATA_DIR, 'Albanian.tree.152tax.tre')
STATES_INPUT = os.path.join(DATA_DIR, 'data.txt')

# Name of the annotation column (character) whose states are reconstructed.
feature = 'Country'
# Read the annotation table (first column as index, first row as header)
# and keep only the feature column, as a one-column dataframe.
df = pd.read_csv(STATES_INPUT, index_col=0, header=0)[[feature]]
tree = read_tree(TREE_NWK)
# The tree is handed over wrapped in a list; presumably the helper merges
# zero-length branches in place (judging by its name) - TODO confirm.
collapse_zero_branches([tree])
# Run the reconstruction once at import time with the ACCTRAN prediction method
# and keep the first element of what acr returns; the tests below inspect it.
acr_result = acr(tree, df, prediction_method=ACCTRAN)[0]


class ACRStateAcctranTest(unittest.TestCase):
    def test_num_steps(self):
        """Check that the ACR result reports the expected number of parsimonious steps."""
        expected_steps = 32
        actual_steps = acr_result[STEPS]
        failure_message = 'Was supposed to have {} parsimonious steps, got {}.'.format(
            expected_steps, actual_steps)
        self.assertEqual(expected_steps, actual_steps, msg=failure_message)

    def test_num_nodes(self):
        state2num = Counter()
        for node in tree.traverse():
            state = getattr(node, feature)
Пример #3
0
import pandas as pd
import numpy as np

from pastml.tree import read_tree, collapse_zero_branches
from pastml.acr import acr
from pastml.ml import JOINT, EFT

# Directory with the test fixtures: the 'data' folder located next to this module.
DATA_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data')
# Input tree file and annotation table used by the tests in this module.
TREE_NWK = os.path.join(DATA_DIR, 'Albanian.tree.152tax.tre')
STATES_INPUT = os.path.join(DATA_DIR, 'data.txt')

# Name of the annotation column (character) whose states are reconstructed.
feature = 'Country'
# Read the annotation table (first column as index, first row as header)
# and keep only the feature column, as a one-column dataframe.
df = pd.read_csv(STATES_INPUT, index_col=0, header=0)[[feature]]
tree = read_tree(TREE_NWK)
# NOTE(review): the tree is passed directly (not wrapped in a list) - confirm this
# matches the collapse_zero_branches signature of the pastml version under test.
collapse_zero_branches(tree)
# Run the reconstruction once at import time with the JOINT method and the EFT model.
# The return value is discarded; presumably acr annotates the tree nodes in place,
# since the test below reads the feature attribute off the nodes - TODO confirm.
acr(tree, df, prediction_method=JOINT, model=EFT)


class ACRStateJointEFTTest(unittest.TestCase):
    def test_collapsed_vs_full(self):
        tree_uncollapsed = read_tree(TREE_NWK)
        acr(tree_uncollapsed, df, prediction_method=JOINT, model=EFT)

        def get_state(node):
            state = getattr(node, feature)
            return state if not isinstance(state, list) else ', '.join(
                sorted(state))

        df_full = pd.DataFrame.from_dict(
            {
Пример #4
0
def acr(tree, df, prediction_method=MPPA, model=F81, column2parameters=None, force_joint=True):
    """
    Performs ancestral character reconstruction (ACR) on the given tree
    for each character, i.e. each column of the given annotation dataframe.

    Note that the annotation dataframe is modified in place: missing values are replaced with empty strings,
    the other values are re-encoded as ASCII (characters that cannot be encoded get replaced),
    and, for ML methods combined with the HKY or JTT model, values that are not among the model states are blanked.
    The tree is passed through preannotate_tree, name_tree and collapse_zero_branches before the reconstruction.

    :param tree: tree whose ancestral states are to be reconstructed.
    :type tree: ete3.Tree
    :param df: dataframe indexed with tree node names,
        whose columns are the characters for which ACR should be performed.
    :type df: pandas.DataFrame
    :param prediction_method: (optional, default is MPPA) ancestral state prediction method(s) to be used by PASTML:
        either a single method applied to all the characters,
        or a list of methods (in the same order as the annotation dataframe columns).
    :type prediction_method: str or list(str)
    :param model: (optional, default is F81) model(s) to be used by PASTML:
        either a single model applied to all the characters,
        or a list of models (in the same order as the annotation dataframe columns).
    :type model: str or list(str)
    :param column2parameters: an optional way to fix some parameters, in a form {column: {param: value}},
        where param is either a character state (the value is then its frequency, between 0 and 1),
        or pastml.ml.SCALING_FACTOR (the value is then the scaling factor for the tree branches,
        e.g. 1 to keep the original branches). A form {column: path_to_param_file} is accepted as well.
    :type column2parameters: dict
    :param force_joint: (optional, default is True) whether the JOINT state should be added to the MPPA prediction
        even when it is not selected by the Brier score.
    :type force_joint: bool

    :raises ValueError: if an ML method is used with the HKY or JTT model on a column
        that contains none of the states allowed by this model.
    :return: list of ACR result dictionaries, one per character
        (if a reconstruction returns a list, each of its elements is added to the result).
    :rtype: list(dict)
    """

    def _to_ascii(value):
        # Missing annotations become empty strings, everything else is squeezed into ASCII.
        return '' if pd.isna(value) else value.encode('ASCII', 'replace').decode()

    for character in df.columns:
        df[character] = df[character].apply(_to_ascii)

    columns = preannotate_tree(df, tree)
    name_tree(tree)
    collapse_zero_branches(tree, features_to_be_merged=df.columns)

    avg_br_len, num_nodes, num_tips = get_tree_stats(tree)

    logging.getLogger('pastml').debug('\n=============ACR===============================')

    if not column2parameters:
        column2parameters = {}

    prediction_methods = value2list(len(columns), prediction_method, MPPA)
    models = value2list(len(columns), model, F81)

    def _states_for(method, column_model, column):
        # States actually observed in the column (missing and empty values are skipped).
        observed = [value for value in df[column].unique() if pd.notnull(value) and value != '']
        if is_ml(method) and column_model in {HKY, JTT}:
            # These models come with a fixed set of states: use it instead of the observed one.
            allowed = HKY_STATES if HKY == column_model else JTT_STATES
            allowed_set = set(allowed)
            if not set(observed) & allowed_set:
                raise ValueError('The allowed states for model {} are {}, '
                                 'but your annotation file specifies {} as states in column {}.'
                                 .format(column_model, ', '.join(allowed), ', '.join(observed), column))
            # Annotations outside of the model states are treated as missing.
            df[column] = df[column].apply(lambda value: value if value in allowed_set else '')
            return allowed
        return np.sort(observed)

    def _reconstruct(task):
        return reconstruct_ancestral_states(*task, avg_br_len=avg_br_len, num_nodes=num_nodes, num_tips=num_tips,
                                            force_joint=force_joint)

    with ThreadPool() as pool:
        # One task per character: (tree, column, states, method, model, fixed parameters or None).
        tasks = [(tree, column, _states_for(method, column_model, column), method, column_model,
                  column2parameters.get(column))
                 for column, method, column_model in zip(columns, prediction_methods, models)]
        acr_results = pool.map(func=_reconstruct, iterable=tasks)

    # A reconstruction may return several results at once: flatten them into one list.
    flattened = []
    for outcome in acr_results:
        flattened += outcome if isinstance(outcome, list) else [outcome]

    return flattened