示例#1
0
    def switch_hosts(self, t0, seed=None):
        """
        Select an extant pathogen lineage at random and reassign its host.

        A new single-child node is created above the chosen lineage to record
        the host switch in the tree. The chosen lineage is moved from
        ``self.extant_p`` to ``self.not_extant_p`` and the new node takes its
        place in ``self.extant_p``.

        :param t0: time of the switch event; stored as the new node's
            ``height`` and used to set the chosen lineage's branch length
        :param seed: optional seed for the module-level ``random`` generator;
            any value other than ``None`` (including ``0``) reseeds it
        :return: None
        """
        assert len(
            self.extant_h) > 1, "Error: attempted to switch between one host"
        # `is not None` rather than truthiness, so that seed=0 is honoured
        if seed is not None:
            random.seed(seed)

        # select an extant pathogen lineage at random
        pick_p = random.choice(self.extant_p)

        # redraw until the destination differs from the current host
        pick_h = pick_p.host
        while pick_h == pick_p.host:
            pick_h = random.choice(self.extant_h)

        # add a node of degree size 2 to annotate host switch event in tree
        pick_p.dist = t0 - pick_p.height
        next_p = Tree(name=pick_p.name + '_m%s-%sm' %
                      (pick_p.host.name, pick_h.name),
                      dist=0)
        next_p.add_features(host=pick_h, height=t0)
        # the parent/child links are wired by hand in both directions
        pick_p.up = next_p
        next_p.children = [pick_p]

        self.extant_p.remove(pick_p)
        self.extant_p.append(next_p)
        self.not_extant_p.append(pick_p)
示例#2
0
def build_conv_topo(annotated_tree, vnodes):
    """Build a topology in which the clade spanning all ``T=True`` nodes is duplicated.

    Works on a deep copy of ``annotated_tree``. The common ancestor of every
    node tagged ``T=True`` is detached and deep-copied; the original subtree
    ("A") is pruned to its leaves with ``C=False`` and the copy ("B") to its
    leaves with ``C=True``. Both are re-attached under a new ``dup_point``
    node, and every node of the resulting tree gets a fresh ``ND`` number.

    Nodes of ``annotated_tree`` are read for the features ``T``, ``C`` and
    ``ND``, so they must already carry them.

    :param annotated_tree: tree whose nodes carry ``T``, ``C`` and ``ND``
    :param vnodes: container of ``ND`` values; copied nodes whose ``ND`` is
        not in it (and that are not the copy's root) get a near-zero branch
    :return: root of the rebuilt tree
    """
    tconv = annotated_tree.copy(method="deepcopy")
    for n in tconv.iter_leaves():
        n.add_features(L=1)
    for n in tconv.traverse():
        n.add_features(COPY=0)

    # get the most recent ancestral node of all the nodes tagged T=True
    l_convergent_clades = tconv.search_nodes(T=True)
    common_anc_conv = tconv.get_common_ancestor(l_convergent_clades)

    # Create the duplication point at the ancestor's location with a
    # near-zero branch length: as a sister when the ancestor has a parent,
    # otherwise as a brand-new root.
    if not common_anc_conv.is_root():
        dup_point = common_anc_conv.add_sister(name="dup_point",
                                               dist=0.000001)
    else:
        dup_point = Tree()
        dup_point.dist = 0.000001

    dup_point.add_features(ND=0, T=False, C=False, Cz=False)

    # Detach before copying so that the copy holds only this subtree.
    common_anc_conv.detach()
    common_anc_conv_copy = common_anc_conv.copy(method="deepcopy")

    # tag duplicated nodes and collapse branches whose ND is not in vnodes
    for n in common_anc_conv_copy.traverse():
        n.COPY = 1
        if n.ND not in vnodes and not n.is_root():
            n.dist = 0.000001

    # subtree A (original): keep only the leaves with C=False
    l_leaves_to_keep_A = common_anc_conv.search_nodes(COPY=0, C=False, L=1)
    common_anc_conv.prune(l_leaves_to_keep_A, preserve_branch_length=True)

    # subtree B (copy): keep only the leaves with C=True
    l_leaves_to_keep_B = common_anc_conv_copy.search_nodes(COPY=1, C=True,
                                                           L=1)
    common_anc_conv_copy.prune(l_leaves_to_keep_B,
                               preserve_branch_length=True)

    dup_point.add_child(common_anc_conv_copy)
    dup_point.add_child(common_anc_conv)

    tconv = dup_point.get_tree_root()

    # renumber every node in postorder
    for node_id, node in enumerate(tconv.traverse("postorder")):
        node.ND = node_id

    return tconv
示例#3
0
def birth(tree, node):
    """Give ``node`` two new children with zero branch length.

    Each child is a fresh ``Tree`` flagged ``extinct=False``. ``tree`` itself
    is not touched here and is returned as received.
    """
    offspring = []
    for _ in range(2):
        newborn = Tree()
        newborn.dist = 0
        newborn.add_features(extinct=False)
        offspring.append(newborn)

    # attach both children to the node, in creation order
    for newborn in offspring:
        node.add_child(newborn)
    return tree
示例#4
0
def initialise(rate):
    """Create a starting tree: one root split into two children by ``birth``.

    After the split, a single waiting time drawn from
    ``random.expovariate(rate)`` is added to the branch length of every leaf
    that is not flagged extinct.
    """
    root = Tree()
    root.add_features(extinct=False)
    root.dist = 0.0

    # the tree has a single node at this point, but the draw is still made
    chosen = random.choice(root.get_leaves())
    root = birth(root, chosen)

    tips = root.get_leaves()
    waiting_time = random.expovariate(rate)
    for tip in (candidate for candidate in tips if not candidate.extinct):
        tip.dist += waiting_time
    return root
示例#5
0
文件: tp3.py 项目: Whippsie/BioTP4
def makeNewDistanceMatrix(n, seqStringList, distanceMatrix, i, j, dictPos,
                          dictTree):
    """Build the (n+1) x (n+1) labelled matrix for the names in ``seqStringList``.

    Row 0 and column 0 hold the labels, with "~" in the corner. A data cell
    is copied from ``distanceMatrix`` using each label's previous index (the
    first element of its ``dictPos`` entry), unless the cell is on the
    diagonal or in row/column ``i``, in which case it is 0.

    While the header row is written, ``dictPos`` is updated in place: a known
    label keeps its old index and records its new column; an unknown label
    is registered and a new ``Tree`` joining ``dictTree[distanceMatrix[i][0]]``
    and ``dictTree[distanceMatrix[0][j]]`` is stored for it in ``dictTree``.
    At the end each touched ``dictPos`` entry is collapsed to (new, new).

    Returns the tuple ``(newMatrix, dictPos, dictTree)``.
    """
    size = n

    # Header row: labels plus position bookkeeping as (old index, new index)
    header = ["~"]
    for col in range(1, size + 1):
        label = seqStringList[col - 1]
        header.append(label)
        if label in dictPos:
            dictPos[label] = (dictPos[label][0], col)
        else:
            # A new entry has to be created for the merged node
            dictPos[label] = (col, col)
            merged = Tree()
            merged.add_child(dictTree[distanceMatrix[i][0]])
            merged.add_child(dictTree[distanceMatrix[0][j]])
            merged.add_features(name=label, dist=0)
            dictTree[label] = merged
    newMatrix = [header]

    # Data rows: label first, then one value per column
    for row in range(1, size + 1):
        row_label = seqStringList[row - 1]
        scores = [row_label]
        for col in range(1, size + 1):
            if row == i or col == i or row == col:
                scores.append(0)
            else:
                old_row = dictPos[row_label][0]
                old_col = dictPos[seqStringList[col - 1]][0]
                scores.append(distanceMatrix[old_row][old_col])
        newMatrix.append(scores)

    # Refresh the old indices; the range starts at -1 because the first
    # pass addresses the last label (index -1)
    for idx in range(-1, size):
        label = seqStringList[idx]
        current = dictPos[label][1]
        dictPos[label] = (current, current)

    return newMatrix, dictPos, dictTree
def parse_tree(json_obj):
    """Turn the entries of ``json_obj["tree"]`` into a ``Tree``.

    The root is tagged ``custom_name='0'``. A "stem" entry adds an unnamed
    child tagged with ``custom_name=str(child)`` under its parent; a "leaf"
    entry adds a child named ``str(label)`` under its parent. Parents are
    looked up by ``custom_name``, except that a stem parent equal to 0 maps
    straight to the root. Entries with neither key are ignored.
    """
    root = Tree()
    root.add_features(custom_name='0')

    def locate(label):
        # first node whose custom_name matches; IndexError if there is none
        return root.search_nodes(custom_name=label)[0]

    for entry in json_obj["tree"]:
        if entry.get("stem"):
            stem = entry["stem"]
            if stem["parent"] == 0:
                anchor = root
            else:
                anchor = locate(str(stem["parent"]))
            inner = anchor.add_child()
            inner.add_features(custom_name=str(stem["child"]))
        elif entry.get("leaf"):
            leaf = entry["leaf"]
            anchor = locate(str(leaf["parent"]))
            anchor.add_child(name=str(leaf["label"]))
    return root
示例#7
0
    def subtree(clone):
        '''Build the tree node for ``clone`` with its subclones attached.

        Calls itself for every subclone that passes the detection limit, so
        the returned node carries the whole lineage below ``clone``.'''
        # branch length is the time between the parent's birth and the clone's
        branch_length = clone.birthday - clone.parent.birthday
        node = Tree(name=clone.ID, dist=branch_length)  # clone is the subtree root
        weight = (10*np.log10(clone.get_family_size()) if log == True
                  else clone.get_family_size())
        node.add_features(weight=weight, rgb_color=clone.rgb_color)

        # work on a shallow copy so clone.subclones itself is never altered
        candidates = clone.subclones[:]
        if det_lim > 0:
            candidates = [sc for sc in candidates
                          if sc.get_family_size() >= det_lim]

        for child_clone in candidates:
            node.add_child(subtree(child_clone))
        return node
示例#8
0
文件: tp3.py 项目: Whippsie/BioTP4
def makeDistanceMatrix(seqList, dictPos, dictTree, createDic):
    """Return a labelled, zero-filled square matrix for ``seqList``.

    Row 0 and column 0 hold each sequence's ``getName()``, with "~" in the
    corner; every other cell is 0. When ``createDic`` is truthy, each name is
    also registered in ``dictPos`` as ``(column, column)`` and a new ``Tree``
    carrying ``name`` and ``active=True`` is stored for it in ``dictTree``.

    Returns the tuple ``(distanceMatrix, dictPos, dictTree)``.
    """
    count = len(seqList)

    # Header row; this is also where the INITIAL dictionaries are created
    header = ["~"]
    for position, seq in enumerate(seqList, start=1):
        label = seq.getName()
        header.append(label)
        if createDic:
            dictPos[label] = (position, position)
            leaf = Tree()
            leaf.add_features(name=label, active=True)
            dictTree[label] = leaf

    grid = [header]
    for seq in seqList:
        grid.append([seq.getName()] + [0] * count)
    return grid, dictPos, dictTree
示例#9
0
def partitionTreeSet(N):
    """Return a tuple of trees whose root carries ``value=N`` and ``name=str(N)``.

    For ``N == 1`` the result is a single one-node tree. Otherwise, for each
    ``k`` in ``range(lam(N))``, every tree built for ``N-(k+1)`` is paired
    with every tree built for ``k+1``; copies of the pair are hung under a
    new root with branch length 1. Each root gets a face from ``styleFace``
    placed at "branch-top".
    """
    if N == 1:
        single = Tree(";", format=100)
        single.add_features(value=N, name=str(N))
        single.add_face(styleFace(single.name), column=0,
                        position="branch-top")
        return (single,)

    template = Tree(";", format=100)
    template.dist = 1

    combined = []
    for k in range(lam(N)):
        left_trees = partitionTreeSet(N - (k + 1))
        right_trees = partitionTreeSet(k + 1)

        for left in left_trees:
            for right in right_trees:
                # branch lengths are set before copying so the copies carry them
                left.dist = 1
                right.dist = 1

                parent = template.copy()
                parent.dist = 1
                parent.add_features(value=N, name=str(N))
                parent.add_child(left.copy())
                parent.add_child(right.copy())
                parent.add_face(styleFace(parent.name), column=0,
                                position="branch-top")

                combined.append(parent)

    return tuple(combined)
示例#10
0
# Parse the node information for the root.
# Each value is extracted by splitting on its "key=" marker and cutting at
# the delimiter that follows it, then attached to the tree object `t`.

# NOTE(review): .split is called on this slice, so `lines` is presumably a
# single string here rather than a list of lines; confirm.
root_info = lines[last_comments:]
median = float(root_info.split("age_median=")[1].split(":")[0])
# NOTE(review): age_mean is cut at "]" whereas the neighbouring fields are
# cut at ":"; presumably it is the last field of the annotation. Confirm.
mean = float(root_info.split("age_mean=")[1].split("]")[0])
sd = float(root_info.split("age_sd=")[1].split(":")[0])
# The range fields have the form {low_high}.
# NOTE: `min`, `max` and (below) `id` shadow Python builtins for the rest of
# this module.
min = float(root_info.split("age_range={")[1].split("_")[0])
max = float(root_info.split("age_range={")[1].split("_")[1].split("}")[0])
ciMin = float(root_info.split("age_quant_5_95={")[1].split("_")[0])
ciMax = float(
    root_info.split("age_quant_5_95={")[1].split("_")[1].split("}")[0])
id = float(root_info.split("id=")[1].split(":")[0])
# The two range features are rebuilt as "{low_high}" strings from the parsed
# floats; support is fixed at 1.0.
t.add_features(support=1.0,
               age_median=median,
               age_mean=mean,
               age_sd=sd,
               age_range="{" + str(min) + "_" + str(max) + "}",
               age_quant_5_95="{" + str(ciMin) + "_" + str(ciMax) + "}",
               id=id)

# Rendering style for the tree: no scale bar, no leaf names.
ts = TreeStyle()
ts.min_leaf_separation = 0
ts.show_scale = False
ts.show_leaf_name = False
ts.scale = scale  #0.3 #0.1# 0.3  # 10 pixels per branch length unit
# One shared NodeStyle with a tiny size is applied to every node.
nstyle = NodeStyle()
nstyle["size"] = 0.0001
for n in t.traverse():
    n.set_style(nstyle)

for leaf in t:
示例#11
0
def build_tree(population, det_lim=1, log=False):
    '''Build an ete3 Tree from the clone phylogeny of ``population``.

    population : object exposing ``start_clone``; every clone is read for
        ``ID``, ``birthday``, ``parent``, ``subclones``, ``rgb_color`` and
        ``get_family_size()``.
    det_lim : detection limit. When greater than 0, subclones whose family
        size is below it are left out together with their descendants. It is
        1 by default, so that only alive clones are taken into account.
    log : when truthy, node weights are ``10*log10(family size)`` instead of
        the raw family size.

    Returns ``(t, ts)``: the tree and a TreeStyle whose layout function
    colours and sizes each node from its ``rgb_color`` and ``weight``.'''

    def tree_layout(node):
        '''Tree layout function to define the layout of each node within the tree'''
        # tuple() lets rgb_color be any 3-item sequence, not only a tuple
        hex_color = '#%02X%02X%02X' % tuple(node.rgb_color)
        node.img_style["fgcolor"] = hex_color  # set color of node
        node.img_style["size"] = node.weight   # set size of node

    def node_weight(clone):
        '''Node size for a clone: family size, optionally on a log scale'''
        family_size = clone.get_family_size()
        if log:
            return 10*np.log10(family_size)
        return family_size

    def detectable_subclones(clone):
        '''Subclones passing the detection limit, as a new list
            (the clone's own subclones list is never modified)'''
        if det_lim > 0:
            return [sub for sub in clone.subclones
                    if sub.get_family_size() >= det_lim]
        return clone.subclones[:]

    def subtree(clone):
        '''Helper function to generate the subtree for each subclone
            Recursively called to include all subclones situated under given clone'''
        # calculate branch distance as difference between clone and parent birthdays
        distance = clone.birthday - clone.parent.birthday
        s = Tree(name=clone.ID, dist=distance)  # set clone as root of subtree
        s.add_features(weight=node_weight(clone), rgb_color=clone.rgb_color)
        for sub in detectable_subclones(clone):
            s.add_child(subtree(sub))
        return s

    start_clone = population.start_clone
    t = Tree(name=start_clone.ID, dist=0)   # set start clone as root of tree
    t.add_features(weight=node_weight(start_clone),
                   rgb_color=start_clone.rgb_color)

    for subclone in detectable_subclones(start_clone):
        t.add_child(subtree(subclone))

    # Define TreeStyle
    ts = TreeStyle()
    ts.show_leaf_name = False
    ts.show_branch_length = False
    ts.show_branch_support = False
    ts.rotation = 90  # rotate the tree to get a horizontal one
    ts.layout_fn = tree_layout

    return t, ts
示例#12
0
文件: tp3.py 项目: Whippsie/BioTP4
def main():
    """Run the neighbour-joining pipeline and print every intermediate result.

    Reads the sequences from "proteines.fa", strips gaps, computes the
    distance matrix, then repeatedly computes the NJ matrix and merges the
    selected pair until two labels remain. Those are joined under a final
    node, which is printed and compared (through ``robinsonFould``) with
    every tree read from "arbres.nw".
    """
    rule = " ========================================="

    # List of Sequence objects (name and content), then gaps removed
    seqList = removeGaps(readSequences("proteines.fa"))
    print("New sequences")
    printSequences(seqList)
    print(rule)
    # Parse the BLOSUM62 matrix
    blosumMatrix = makeBlosumMatrix()

    # First distance matrix, computed from the blosum62 scores
    distanceMatrix, dictPos, dictTree = calculateDistanceMatrix(
        blosumMatrix, seqList, {}, {})
    print("Matrice initiale des distances")
    printMatrix(distanceMatrix)

    print(rule)

    print("Matrice pondérée")
    negativeMatrix, posSmallest, dictPos, dictTree = calculateNJMatrix(
        seqList, distanceMatrix, dictPos, dictTree)
    printMatrix(negativeMatrix)
    print("Smallest is: ", posSmallest[0], posSmallest[1])

    # Merge two sequences into a new one: this updates the sequence list and
    # adds the node to the tree
    seqListString, distanceMatrix, _unused, dictPos, dictTree = updateNJTree(
        posSmallest[0], posSmallest[1], len(seqList), distanceMatrix, seqList,
        dictPos, dictTree)
    print(rule)
    print(rule)
    print("Matrice des distances après 1 itération")
    printMatrix(distanceMatrix)

    # Keep looping and updating the matrix until only 2 nodes remain in the
    # sequence list; those are then merged under an empty root (NJ returns
    # an unrooted tree)
    while len(seqListString) > 2:
        # seqListString holds plain strings, so build the matching objects
        seqList = [Sequence(label, "", "") for label in seqListString]

        # Recompute the NJ matrix from the new distance matrix
        negativeMatrix, posSmallest, dictPos, dictTree = calculateNJMatrix(
            seqList, distanceMatrix, dictPos, dictTree)
        print(rule)

        print("Matrice pondérée")
        printMatrix(negativeMatrix)
        print("Smallest is: ", posSmallest[0], posSmallest[1])
        print("")
        # Merge the selected pair, as above
        seqListString, distanceMatrix, _unused, dictPos, dictTree = updateNJTree(
            posSmallest[0], posSmallest[1], len(seqList), distanceMatrix,
            seqList, dictPos, dictTree)
        print(rule)
        print(rule)
        print("Matrice des distances après itérations")
        printMatrix(distanceMatrix)

    # One last join remains; the matrix always has 3 columns at this point
    t = Tree()
    t.add_child(dictTree[distanceMatrix[0][2]])
    t.add_child(dictTree[distanceMatrix[0][1]])
    t.add_features(name='notaroot', dist=0)
    dictTree['notaroot'] = t

    print(rule)
    print("Arbre NJ obtenu")
    print(t)
    print(rule)

    treesFromFile = readTrees("arbres.nw")
    # Uncomment to see every RF comparison between the trees of the file
    #rfMatrix = calculateRFMatrix(treesFromFile)
    #print ("==============RF MATRIX==============")
    #printMatrix(rfMatrix)
    for cpt, fileTree in enumerate(treesFromFile):
        print("Comparaison RF entre Arbre NJ et Arbre ", cpt, " du fichier.")
        print(robinsonFould(fileTree, t))
    distances(t)
示例#13
0
文件: test.py 项目: Whippsie/BioTP4
    dictPos[nameNode] = i
    t = Tree()
    a = t.add_child(name=nameNode)
    a.add_features(active=True)
    dictTree[nameNode] = a
    print(a)

# Merge example: build a new node 'd' that joins the trees stored for
# nameList[1] and nameList[2], and register it in both dictionaries.
nameNode = 'd'
dictPos[nameNode] = 1
#nameList = ['a','d']
noeud = Tree()
#a = t.add_child(name=nameNode)
noeud.add_child(dictTree[nameList[1]])
noeud.add_child(dictTree[nameList[2]])
noeud.add_features(name=nameNode)
dictTree[nameNode] = noeud
# `test` and `noeud` refer to the same object, so both prints show the
# same tree.
test = dictTree[nameNode]
print(test.get_ascii(show_internal=True))
print(noeud.get_ascii(show_internal=True))

# Dump both dictionaries for inspection
print(dictPos)
print(dictTree)
# NOTE(review): if t1 is an ete3 Tree, iterating it presumably visits only
# leaves, so is_root() would be true only for a single-node tree; confirm.
for node in t1:
    if node.is_root():
        print("hello")
    #if not node.is_leaf():
    #innerbranch.append(node)
    #print (node)

#for leaf in t1:
示例#14
0
def build_tree(data,
               feature_info,
               sens,
               expl,
               output,
               metric,
               conf,
               max_depth,
               min_leaf_size=100,
               agg_type='avg',
               max_bins=10,
               subsample_frac=1.0):
    """
    Builds a decision tree guided towards nodes with high bias

    The root is scored on the full dataset; the tree is then grown by
    repeatedly asking ``select_best_feature`` for the best split and adding
    one child per side (continuous feature) or per category (categorical
    feature) until ``max_depth`` is reached or no split is found.

    Parameters
    ----------
    data :
        the dataset

    feature_info :
        information about user features

    sens :
        name of the sensitive feature

    expl :
        name of the explanatory feature

    output :
        the target feature

    metric :
        the fairness metric to use

    conf :
        the confidence level

    max_depth :
        maximum depth of the decision-tree

    min_leaf_size :
        minimum size of a leaf

    agg_type :
        aggregation method for children scores

    max_bins :
        maximum number of bins to use when binning continuous features

    subsample_frac :
        passed through to ``SplitParams``; presumably the fraction of the
        data sampled when scoring splits (TODO confirm)

    Returns
    -------
    tree :
        the tree built by the algorithm
    """
    from ete3 import Tree
    logging.info('Building a Guided Decision Tree')
    tree = Tree()

    # Check if there are multiple labeled outputs
    # targets = data.columns[-output.num_labels:].tolist()
    targets = output.names.tolist()
    logging.debug('Targets: %s', targets)

    # contextual features: every column that is neither the sensitive
    # feature, the explanatory feature, nor a target
    features = set(data.columns.tolist()) - set([sens, expl]) - set(targets)
    logging.debug('Contextual Features: %s', features)

    # check the data dimensions
    if metric.dataType == Metric.DATATYPE_CORR:
        if expl:
            dim = (feature_info[expl].arity, 6)
        else:
            dim = 6
    else:
        # get the dimensions of the OUTPUT x SENSITIVE contingency table
        if expl:
            dim = (feature_info[expl].arity, output.arity,
                   feature_info[sens].arity)
        else:
            dim = (output.arity, feature_info[sens].arity)

    logging.debug('Data Dimension for Metric: %s', dim)

    # bin the continuous features
    cont_thresholds = find_thresholds(data, features, feature_info, max_bins)

    score_params = ScoreParams(metric, agg_type, conf)
    split_params = SplitParams(targets, sens, expl, dim, feature_info,
                               cont_thresholds, min_leaf_size, subsample_frac)

    # get a measure for the root; only the first target is used for the
    # contingency-table and correlation data types
    if metric.dataType == Metric.DATATYPE_CT:
        stats = [count_values(data, sens, targets[0], expl, dim)[0]]
    elif metric.dataType == Metric.DATATYPE_CORR:
        stats = [corr_values(data, sens, targets[0], expl, dim)[0]]
    else:
        stats = [data[targets + [sens]]]

    _, root_metric = score(stats, score_params)
    tree.add_features(metric=root_metric[0])

    #
    # Builds up the tree recursively. Selects the best feature to split on,
    # in order to maximize the average bias (mutual information) in all
    # sub-trees.
    def rec_build_tree(node_data, node, pred, split_features, depth,
                       parent_score, pool):
        """
        Recursive tree building.

        Parameters
        ----------
        node_data :
            the data for the current node

        node :
            the tree node for the current context; its ``size`` feature is
            set here and child nodes are added to it in place

        pred :
            the predicate defining the current context

        split_features :
            the features on which a split can occur

        depth :
            the current depth

        parent_score :
            the metric score at the parent

        pool :
            the thread pool

        Returns
        -------
        None
            the tree is grown in place under ``node``
        """

        node.add_features(size=len(node_data))

        # make a new leaf if recursion is stopped
        if (depth == max_depth) or (len(split_features) == 0):
            return

        logging.debug('looking for splits at pred %s', pred)

        # select the best feature to split on
        split_score, best_feature, threshold, to_drop, child_metrics = \
            select_best_feature(node_data, split_features, split_params,
                                score_params, parent_score, pool)

        # no split found, make a leaf
        if best_feature is None:
            return

        logging.info('splitting on %s (score=%s) with threshold %s at pred %s',
                     best_feature, split_score, threshold, pred)

        # NOTE(review): this is a truthiness test, so a threshold equal to 0
        # would take the categorical branch below; confirm whether
        # select_best_feature can return 0 for a continuous feature.
        if threshold:
            # binary split
            data_left = node_data[node_data[best_feature] <= threshold]
            data_right = node_data[node_data[best_feature] > threshold]

            # predicates for sub-trees
            pred_left = "{} <= {}".format(best_feature, threshold)
            pred_right = "{} > {}".format(best_feature, threshold)

            # add new nodes to the underlying tree structure
            left_child = node.add_child(name=str(pred_left))
            left_child.add_features(feature_type='continuous',
                                    feature=best_feature,
                                    threshold=threshold,
                                    is_left=True,
                                    metric=child_metrics['left'])

            right_child = node.add_child(name=str(pred_right))
            right_child.add_features(feature_type='continuous',
                                     feature=best_feature,
                                     threshold=threshold,
                                     is_left=False,
                                     metric=child_metrics['right'])

            # recursively build the tree; the split feature itself stays
            # available below a continuous split (only to_drop is removed)
            rec_build_tree(data_left, left_child, pred + [pred_left],
                           split_features - set(to_drop), depth + 1,
                           split_score, pool)
            rec_build_tree(data_right, right_child, pred + [pred_right],
                           split_features - set(to_drop), depth + 1,
                           split_score, pool)

        else:
            # categorical split
            for val in node_data[best_feature].unique():

                # check if this child was pruned or not
                if val in child_metrics:
                    # predicate for the current sub-tree
                    new_pred = "{} = {}".format(best_feature, val)

                    # add a node to the underlying tree structure
                    child = node.add_child(name=str(new_pred))
                    child.add_features(feature_type='categorical',
                                       feature=best_feature,
                                       category=val,
                                       metric=child_metrics[val])

                    child_data = node_data[node_data[best_feature] == val]

                    # recursively build the tree; the split feature is
                    # removed from the candidates below a categorical split
                    rec_build_tree(
                        child_data, child, pred + [new_pred],
                        split_features - set(to_drop + [best_feature]),
                        depth + 1, split_score, pool)

    #
    # When contextual features are just a few there is
    # no actual benefit out of parallelization. In fact,
    # contention introduces a slight overhead. Hence,
    # use only one thread to score less than 10 features.
    #
    # NOTE(review): both branches currently set pool_size to 1, so the
    # multiprocessing path below is never taken.
    if len(features) < 10:
        pool_size = 1
    else:
        pool_size = 1  # max(1, multiprocessing.cpu_count() - 2)

    # the root is built with an empty predicate, depth 0 and parent score 0
    if pool_size == 1:
        rec_build_tree(data, tree, [], features, 0, 0, None)
    else:
        pool = multiprocessing.Pool(pool_size)
        rec_build_tree(data, tree, [], features, 0, 0, pool)
        pool.close()
        pool.join()

    return tree