def test_num_nodes(self):
    """Check how many nodes of the zero-branch-collapsed tree carry each state.

    Works on a copy of the module-level tree, so the shared tree itself
    is left untouched. Nodes whose state collection holds more than one
    value are tallied under 'unresolved'.
    """
    collapsed = tree.copy()
    collapse_zero_branches([collapsed])

    def label(node):
        # A node is resolved only when exactly one state is left for it.
        states = getattr(node, feature)
        if len(states) > 1:
            return 'unresolved'
        return next(iter(states))

    observed = Counter([label(node) for node in collapsed.traverse()])
    expected = {
        'Africa': 114,
        'Albania': 50,
        'Greece': 69,
        'WestEurope': 28,
        'EastEurope': 16,
    }
    failure_msg = 'Was supposed to have {} as states counts, got {}.'.format(
        expected, observed)
    self.assertDictEqual(expected, observed, msg=failure_msg)
from collections import Counter import pandas as pd from pastml.tree import read_tree, collapse_zero_branches from pastml.acr import acr from pastml.parsimony import ACCTRAN, STEPS DATA_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data') TREE_NWK = os.path.join(DATA_DIR, 'Albanian.tree.152tax.tre') STATES_INPUT = os.path.join(DATA_DIR, 'data.txt') feature = 'Country' df = pd.read_csv(STATES_INPUT, index_col=0, header=0)[[feature]] tree = read_tree(TREE_NWK) collapse_zero_branches([tree]) acr_result = acr(tree, df, prediction_method=ACCTRAN)[0] class ACRStateAcctranTest(unittest.TestCase): def test_num_steps(self): self.assertEqual( 32, acr_result[STEPS], msg='Was supposed to have {} parsimonious steps, got {}.'.format( 32, acr_result[STEPS])) def test_num_nodes(self): state2num = Counter() for node in tree.traverse(): state = getattr(node, feature)
import pandas as pd import numpy as np from pastml.tree import read_tree, collapse_zero_branches from pastml.acr import acr from pastml.ml import JOINT, EFT DATA_DIR = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'data') TREE_NWK = os.path.join(DATA_DIR, 'Albanian.tree.152tax.tre') STATES_INPUT = os.path.join(DATA_DIR, 'data.txt') feature = 'Country' df = pd.read_csv(STATES_INPUT, index_col=0, header=0)[[feature]] tree = read_tree(TREE_NWK) collapse_zero_branches(tree) acr(tree, df, prediction_method=JOINT, model=EFT) class ACRStateJointEFTTest(unittest.TestCase): def test_collapsed_vs_full(self): tree_uncollapsed = read_tree(TREE_NWK) acr(tree_uncollapsed, df, prediction_method=JOINT, model=EFT) def get_state(node): state = getattr(node, feature) return state if not isinstance(state, list) else ', '.join( sorted(state)) df_full = pd.DataFrame.from_dict( {
def acr(tree, df, prediction_method=MPPA, model=F81, column2parameters=None, force_joint=True):
    """
    Reconstructs ancestral states for the given tree and
    all the characters specified as columns of the given annotation dataframe.

    Note that the annotation dataframe is modified in place: its values are converted
    to ASCII strings (missing values become empty strings), and for the HKY and JTT models
    values that are not valid model states are replaced with empty strings.
    The tree is passed on to the preannotation, naming and zero-branch collapsing helpers,
    which presumably modify it in place as well (to be confirmed against their implementations).

    :param df: dataframe indexed with tree node names
        and containing characters for which ACR should be performed as columns.
    :type df: pandas.DataFrame
    :param tree: tree whose ancestral states are to be reconstructed.
    :type tree: ete3.Tree
    :param model: (optional, default is F81) model(s) to be used by PASTML,
        can be either one model to be used for all the characters,
        or a list of different models (in the same order as the annotation dataframe columns)
    :type model: str or list(str)
    :param prediction_method: (optional, default is MPPA) ancestral state prediction method(s) to be used by PASTML,
        can be either one method to be used for all the characters,
        or a list of different methods (in the same order as the annotation dataframe columns)
    :type prediction_method: str or list(str)
    :param column2parameters: an optional way to fix some parameters,
        must be in a form {column: {param: value}},
        where param can be a character state (then the value should specify its frequency between 0 and 1),
        or pastml.ml.SCALING_FACTOR (then the value should be the scaling factor for tree branches,
        e.g. set to 1 to keep the original branches). Could also be in a form {column: path_to_param_file}.
    :type column2parameters: dict
    :param force_joint: (optional, default is True) whether the JOINT state should be added to the MPPA prediction
        even when not selected by the Brier score
    :type force_joint: bool

    :raises ValueError: if an ML method is used with the HKY or JTT model
        and none of the states found in the corresponding column is allowed by that model.

    :return: flat list of ACR result dictionaries; a character contributes several entries
        when its reconstruction returns a list of results.
    :rtype: list(dict)
    """

    def _to_ascii(value):
        # Missing annotations are represented by empty strings.
        if pd.isna(value):
            return ''
        # Coerce to str first: numeric or boolean annotation values have no .encode(),
        # so calling it on them directly would raise an AttributeError.
        return str(value).encode('ASCII', 'replace').decode()

    for c in df.columns:
        df[c] = df[c].apply(_to_ascii)

    columns = preannotate_tree(df, tree)
    name_tree(tree)
    collapse_zero_branches(tree, features_to_be_merged=df.columns)

    avg_br_len, num_nodes, num_tips = get_tree_stats(tree)

    logging.getLogger('pastml').debug('\n=============ACR===============================')

    column2parameters = column2parameters if column2parameters else {}

    def _work(args):
        return reconstruct_ancestral_states(*args, avg_br_len=avg_br_len, num_nodes=num_nodes, num_tips=num_tips,
                                            force_joint=force_joint)

    prediction_methods = value2list(len(columns), prediction_method, MPPA)
    models = value2list(len(columns), model, F81)

    def get_states(col_method, col_model, column):
        # States actually present in the annotations (missing and empty values excluded).
        df_states = [state for state in df[column].unique() if pd.notnull(state) and state != '']
        if not is_ml(col_method) or col_model not in {HKY, JTT}:
            return np.sort(df_states)
        # HKY and JTT come with a fixed state set.
        states = HKY_STATES if HKY == col_model else JTT_STATES
        state_set = set(states)
        if not set(df_states) & state_set:
            raise ValueError('The allowed states for model {} are {}, '
                             'but your annotation file specifies {} as states in column {}.'
                             .format(col_model, ', '.join(states), ', '.join(df_states), column))
        # Annotations outside of the model's state set are treated as missing.
        df[column] = df[column].apply(lambda state: state if state in state_set else '')
        return states

    # Build all the jobs up front, so that state validation errors are raised
    # before any worker thread is created.
    jobs = [(tree, column, get_states(col_method, col_model, column), col_method, col_model,
             column2parameters.get(column))
            for (column, col_method, col_model) in zip(columns, prediction_methods, models)]

    with ThreadPool() as pool:
        acr_results = pool.map(func=_work, iterable=jobs)

    # A reconstruction may return either one result or a list of results: flatten them.
    result = []
    for acr_res in acr_results:
        if isinstance(acr_res, list):
            result.extend(acr_res)
        else:
            result.append(acr_res)

    return result