def mapDendro(self, tree, G):
        """Tally MRCA nodes of ``tree`` and count graph edges per MRCA node.

        Prints the markers "MAP" on entry and "DENDRO" on exit.

        Parameters:
            tree: tree object handed to treemeasure.PatristicDistanceMatrix;
                its ``taxon_namespace`` is used to resolve labels.
            G: sequence of edge lists. Each edge is indexable, with the
                labels of its two end points at positions 0 and 1.

        Side effects:
            self.mrca_count: Counter keyed by an integer node id. The count
                is how often that node occurs among the values of
                ``pdm._mrca[taxon]`` over the ``self.N`` taxa named by
                ``self.namesLUTr[str(i)]``.
            self.edge_count: dict mapping the same integer ids to a numpy
                array of length ``len(G)``; entry ``gi`` is the number of
                edges in ``G[gi]`` whose end points have that node as MRCA.

        Raises:
            KeyError: if the MRCA of an edge never appeared in the tally.
        """
        print("MAP")
        pdm = treemeasure.PatristicDistanceMatrix(tree)
        tns = tree.taxon_namespace

        self.edge_count = {}
        self.mrca_count = Counter()

        for i in range(self.N):
            label = self.namesLUTr[str(i)]
            # NOTE(review): relies on the private ``_mrca`` table of the
            # distance matrix; presumably it maps taxon -> {taxon: mrca node}.
            mrca_by_taxon = pdm._mrca[tns.get_taxon(label)]
            # update() adds in place; ``+=`` built a new Counter every pass.
            self.mrca_count.update(mrca_by_taxon.values())

        # Give every distinct MRCA node a small integer id.
        node_map = dict(
            (node, node_id)
            for node_id, node in enumerate(self.mrca_count))

        # Re-key the tally by id. Iterating node_map (not the Counter itself)
        # keeps the loop safe while the Counter is being modified.
        for node, node_id in node_map.items():
            self.mrca_count[node_id] = self.mrca_count.pop(node)
            self.edge_count[node_id] = np.zeros(len(G))

        for gi, edgeList in enumerate(G):
            for edge in edgeList:
                emrca = pdm.mrca(tns.get_taxon(edge[0]),
                                 tns.get_taxon(edge[1]))
                self.edge_count[node_map[emrca]][gi] += 1.
        print("DENDRO")
예제 #2
0
def pairwise_sequence(X, Y):
    '''
    Build one feature vector and one distance for every ordered pair of taxa.

    input:
        X: recordings of all cells in one cluster in pandas DataFrame format;
           contains two columns: 'cell' and 'state'.
           NOTE: X is not read here; the states are parsed from the taxon
           labels of Y instead.
        Y: ground truth (tree) of the cell cluster. Every taxon label must
           look like "<name> <state>", where <state> is a string of
           single-character numbers (e.g. "cell3 0102").
    output:
        All ordered pairs (tx1, tx2) of Y.taxon_namespace, including the
        pairs of a taxon with itself and both orders of each pair.
        X_out: a list of 1-d arrays, the state of tx1 followed by the state
               of tx2 (flat, because random forest only takes 1-d features)
        Y_out: a list of patristic distances between the two taxa
    '''
    pdm = treemeasure.PatristicDistanceMatrix(Y)
    taxa = list(Y.taxon_namespace)
    # Decode every label once instead of once per pair.
    states = [
        np.array([float(ch) for ch in taxon.label.split(' ')[1]])
        for taxon in taxa
    ]

    X_out = []
    Y_out = []
    for tx1, state1 in zip(taxa, states):
        for tx2, state2 in zip(taxa, states):
            X_out.append(np.hstack((state1, state2)))
            Y_out.append(pdm(tx1, tx2))

    return X_out, Y_out
예제 #3
0
def bdict_from_tree(tree, pairs):
    """
    get pairwise distance dictionary from tree
    input:
        tree: tree object
        pairs: list, of all pairs of sequences; every pair is a hashable
               2-item sequence of taxon labels
    output:
        bDict: dict, pair -> patristic distance between the two taxa
    raises:
        AttributeError when a label has no node in the tree (the lookup
        result has no ``taxon`` attribute in that case).
    """
    pdm = treemeasure.PatristicDistanceMatrix(tree)
    taxonByName = {}

    def taxon_for(seqName):
        # Each lookup searches the tree, so resolve every label only once
        # even though it appears in many pairs.
        if seqName not in taxonByName:
            taxonByName[seqName] = tree.find_node_with_taxon_label(seqName).taxon
        return taxonByName[seqName]

    bDict = {}
    for pair in pairs:
        seq1Name, seq2Name = pair
        bDict[pair] = pdm(taxon_for(seq1Name), taxon_for(seq2Name))
    return bDict
    def calculatePvals(self, tree, edgeList=None, ng=1, setPriors=False):
        """Annotate the internal nodes of ``tree`` with edge tallies and priors.

        Parameters:
            tree: tree object; annotated in place.
            edgeList: optional iterable of edges. Each edge is indexable with
                the labels of its end points at positions 0 and 1.
            ng: multiplier applied to ``node.ni`` when priors are set.
            setPriors: when true, fold the tallies into alpha / beta.

        Attributes written:
            node.ei: number of edges whose end points have this node as MRCA.
            tree.edge_mrca: dict mapping ``str(edge)`` to that MRCA node.
            node.ni: sum over the children of
                (leaves under child) * (leaves under the other children).
            node.alpha, node.beta: 1 and 1, or ``1 + ei`` and
                ``1 + (ni * ng - ei)`` when ``setPriors`` is true.
        """
        st = time.time()  # start timestamp; not read again in this method

        pdm = treemeasure.PatristicDistanceMatrix(tree)
        tns = tree.taxon_namespace

        # Start every internal node with an empty tally.
        for nd in tree.nodes():
            if not nd.is_internal():
                continue
            nd.ei = 0.0

        tree.edge_mrca = {}

        for edge in ([] if edgeList is None else edgeList):
            ancestor = pdm.mrca(tns.get_taxon(edge[0]),
                                tns.get_taxon(edge[1]))
            ancestor.ei += 1
            tree.edge_mrca[str(edge)] = ancestor

        for nd in tree.nodes():
            if not nd.is_internal():
                continue
            sizes = np.array(
                [len(child.leaf_nodes()) for child in nd.child_nodes()])
            # each child's leaf count times the leaf count of its siblings
            nd.ni = np.sum(sizes * (np.sum(sizes) - sizes))
            # flat priors (original note: overwritten for the null model)
            nd.alpha = 1
            nd.beta = 1
            if setPriors:
                nd.alpha += nd.ei
                nd.beta += (nd.ni * ng - nd.ei)
예제 #5
0
# Open the cluster file named by args.c (args is defined outside this excerpt).
# NOTE(review): sys.exit() is called without an argument, so the process
# reports exit status 0 even on this failure path - confirm that is intended.
try:
    clusterF = open(args.c, 'r')
except IOError:
    print "\n cluster file not found in directory."
    sys.exit()

#create save file
# NOTE(review): neither file handle is closed in the visible part of the script.
try:
    save = open("ClustersExtendedOnTree.txt", 'w')
except IOError:
    print 'no room for save file'
    sys.exit()

#read in the tree, keeping underscores in the names
# `tree` holds the path on the way in and is rebound to the parsed Tree object.
tree = dendropy.Tree.get(path=tree, schema='newick', preserve_underscores=True)
# NOTE(review): pdm and taxa are not used in the visible part of the script.
pdm = treemeasure.PatristicDistanceMatrix(tree)
taxa = tree.taxon_namespace

#go cluster by cluster, get the mrca of them all, then output all the tip child nodes of that MRCA as the new cluster
clusters = []
while 1:
    # one cluster per line; stop at end of file
    s = clusterF.readline()
    if not s:
        break
    s = s.rstrip()
    # the tab-separated fields are used as the taxon labels of the cluster
    sections = s.split("\t")
    mrca = tree.mrca(taxon_labels=sections)
    # the extended cluster is every leaf below the MRCA of the listed labels
    newClusterNodes = mrca.leaf_nodes()
    newCluster = []
    for leaf in newClusterNodes:
        # drop the single quotes from the string form of the taxon
        newCluster.append(re.sub("\'", "", str(leaf.taxon)))
예제 #6
0
    def summarize_trees(self,
            trees,
            trees_outf=None,
            params=None,
            summaries=None):
        """Compute per-tree summary statistics and optionally write the trees.

        Parameters:
            trees: collection of trees. It is first replaced by the result of
                ``self.tree_postprocessor.process_trees(trees)``; trees that
                fail the occupancy checks below are removed from it.
            trees_outf: optional output stream. When given, the remaining
                trees are written to it in "nexus" format.
            params: optional mapping; a copy is stored on every kept tree as
                ``tree.params``.
            summaries: optional list; for every kept tree a copy of
                ``tree.stats`` updated with ``tree.params`` is appended.

        Returns:
            ``(trees, stats_fields)`` where ``stats_fields`` is the set of all
            keys found in the ``tree.stats`` of the kept trees.

        A tree is dropped when it occupies fewer islands (habitats) than the
        length of the representative taxon's island (habitat) code and
        ``self.drop_trees_not_occupying_all_islands``
        (``self.drop_trees_not_occupying_all_habitats``) is set.
        """
        trees = self.tree_postprocessor.process_trees(trees)
        stats_fields = set()

        # crucial assumption here is all trees from same landscape wrt to
        # number of islands and habitats
        representative_taxon = trees[0].taxon_namespace[0]
        # NOTE(review): this dict is not used anywhere in this method.
        community_by_disturbed_vs_interior_habitat = {}
        num_islands = len(representative_taxon.island_code)
        num_habitats = len(representative_taxon.habitat_code)
        # community_by_island = {}
        # community_by_habitat = {}
        # for i in num_islands:
        #     community_by_island[i] = {}
        # for i in num_habitats:
        #     community_by_habitat[i] = {}
        # community_by_disturbed_vs_interior_habitat[0] = {}
        # community_by_disturbed_vs_interior_habitat[1] = {}

        # iterate over a copy because trees may be removed inside the loop
        for tree in list(trees):
            num_tips = 0
            total_length = 0.0
            total_edges = 0
            nodes_by_island = collections.defaultdict(list)
            nodes_by_habitat = collections.defaultdict(list)
            disturbed_habitat_nodes = []
            interior_habitat_nodes = []

            all_tips = []
            for nd in tree:
                # colorize
                # nodes with neither a taxon nor a label are not counted
                if nd.taxon is None and nd.label is None:
                    continue
                if nd.label is not None:
                    self.tree_postprocessor.decode_labeled_item_biogeography(nd)
                # stats
                # NOTE(review): num_tips is incremented for every node that
                # reaches this point, not only for leaves, so it always equals
                # total_edges - confirm this is intended.
                total_edges += 1
                num_tips += 1
                total_length += nd.edge.length
                if nd.is_leaf():
                    all_tips.append(nd)
                    # a "1" at position idx of the code marks island idx
                    island_code = nd.taxon.island_code
                    for idx, i in enumerate(island_code):
                        # island_idx = len(island_code) - idx
                        island_idx = idx
                        if i == "1":
                            nodes_by_island[island_idx].append(nd)
                    habitat_code = nd.taxon.habitat_code
                    for idx, i in enumerate(habitat_code):
                        # habitat_idx = len(habitat_code) - idx
                        habitat_idx = idx
                        if i == "1":
                            nodes_by_habitat[habitat_idx].append(nd)
                            # habitat 0 is treated as the disturbed habitat,
                            # every other habitat as interior
                            if habitat_idx == 0:
                                disturbed_habitat_nodes.append(nd)
                            else:
                                # a leaf can occupy several interior habitats;
                                # list it only once
                                if nd not in interior_habitat_nodes:
                                    interior_habitat_nodes.append(nd)
            if len(nodes_by_island) < num_islands and self.drop_trees_not_occupying_all_islands:
                trees.remove(tree)
                continue
            if len(nodes_by_habitat) < num_habitats and self.drop_trees_not_occupying_all_habitats:
                trees.remove(tree)
                continue
            pdm = treemeasure.PatristicDistanceMatrix(tree=tree)
            # statistics that are never set read back as "NA"
            tree.stats = collections.defaultdict(lambda:"NA")
            if params is not None:
                tree.params = params.copy()
            tree.stats["size"] = num_tips
            tree.stats["length"] = total_length
            tree.stats["edges"] = total_edges
            # node_ages = tree.internal_node_ages()
            # node_ages = [n/total_length for n in node_ages]
            # tree.stats["est.birth.rate"] = birthdeath.fit_pure_birth_model(internal_node_ages=node_ages)["birth_rate"]
            tree.stats["est.birth.rate"] = birthdeath.fit_pure_birth_model(tree=tree)["birth_rate"]

            weighted_disturbed, unweighted_disturbed = self.get_mean_patristic_distance(pdm, disturbed_habitat_nodes)
            weighted_interior, unweighted_interior = self.get_mean_patristic_distance(pdm, interior_habitat_nodes)
            tree.stats["weighted.disturbed.habitat.pd"] = weighted_disturbed
            tree.stats["unweighted.disturbed.habitat.pd"] = unweighted_disturbed
            tree.stats["weighted.interior.habitat.pd"] = weighted_interior
            tree.stats["unweighted.interior.habitat.pd"] = unweighted_interior
            # the ratios fall back to "NA" when the division is not possible
            try:
                tree.stats["weighted.disturbed.to.interior.habitat.pd"] = weighted_disturbed/weighted_interior
                tree.stats["unweighted.disturbed.to.interior.habitat.pd"] = unweighted_disturbed/unweighted_interior
            except (ZeroDivisionError, TypeError):
                tree.stats["weighted.disturbed.to.interior.habitat.pd"] = "NA"
                tree.stats["unweighted.disturbed.to.interior.habitat.pd"] = "NA"

            # NOTE(review): the return value is not used below; presumably the
            # call stores its results on the tree - verify in rcalc.
            rstats = self.rcalc.calc_ecological_stats(
                    tree=tree,
                    patristic_distance_matrix=pdm,
                    total_tree_length=total_length,
                    total_tree_edges=total_edges,
                    nodes_by_island=nodes_by_island,
                    nodes_by_habitat=nodes_by_habitat,
                    disturbed_habitat_nodes=disturbed_habitat_nodes,
                    interior_habitat_nodes=interior_habitat_nodes,
                    )

            stats_fields.update(tree.stats.keys())

            if summaries is not None:
                sss = tree.stats.copy()
                # NOTE(review): tree.params is only assigned above when params
                # is not None; with params=None this relies on the tree
                # already having a params attribute - confirm.
                sss.update(tree.params)
                summaries.append(sss)

        if trees_outf is not None:
            # fall back to self.write_nexus when the collection has no
            # write_to_stream method
            try:
                trees.write_to_stream(trees_outf, "nexus")
            except AttributeError:
                self.write_nexus(trees, trees_outf)
        return trees, stats_fields
예제 #7
0
def leaf_data_from_sim_tree(tree):
    """
    summarizing results from simulated tree

    The tree is deep-copied first, so the tree passed in is left untouched.
    Segment ids are renumbered to 0..n-1 following their sorted order.

    input:
        tree: simulated tree object; every leaf node carries a ``value``
              dict keyed by segment id
    output:
        seqs: dict, leaf label -> the leaf's ``value`` with renumbered ids
        alignInSeg: dict, (label1, label2) -> result of get_dna_align
        alignSeg: dict, (label1, label2) -> result of get_seg_align
        segRateDict: result of get_seg_rate_dict with renumbered ids
        ts: dict, (label1, label2) -> patristic distance of the two leaves
        multiAlignSeqs: result of get_dna_multi_align (built with the
                        original segment ids)
        lenSegs: list, lengths of the non-empty '*'-delimited pieces of the
                 first multiple-alignment sequence
    """
    treeTem = deepcopy(tree)
    pdm = treemeasure.PatristicDistanceMatrix(treeTem)
    lanNames = sorted(
        leafNode.taxon.label for leafNode in treeTem.leaf_nodes())

    segRateDict = get_seg_rate_dict(lanNames, treeTem)
    segIds = sorted(segRateDict)
    segIdsTrans = dict(
        (segId, segIdNew) for segIdNew, segId in enumerate(segIds))
    # Re-key in place from a snapshot, so that an old id equal to another
    # segment's new id cannot clobber an entry that has not moved yet.
    renumbered = [(segIdsTrans[segId], segRateDict[segId]) for segId in segIds]
    segRateDict.clear()
    segRateDict.update(renumbered)

    # get simple sequences without alignment
    seqs = {}
    for lanName in lanNames:
        seqs[lanName] = treeTem.find_node_with_taxon_label(lanName).value

    # get multiple alignment
    segIdLocIdDict = get_all_locid_within_all_segs(seqs, segIds)
    multiAlignSeqs = get_dna_multi_align(segIds, segIdLocIdDict, seqs,
                                         lanNames)

    # Renumber the segment ids of every leaf. The dicts are the node values
    # themselves, so they are updated in place: the pairwise step below reads
    # node.value again. Assigning d[new] = d[old] and then deleting d[old]
    # would drop the entry whenever new == old, hence the snapshot.
    for values in seqs.values():
        renumbered = [(segIdsTrans[segId], values[segId])
                      for segId in list(values)]
        values.clear()
        values.update(renumbered)

    # get pairwise alignment
    alignInSeg = {}
    alignSeg = {}
    ts = {}
    for lanPair in itertools.combinations(lanNames, 2):
        lanName1, lanName2 = lanPair
        node1 = treeTem.find_node_with_taxon_label(lanName1)
        node2 = treeTem.find_node_with_taxon_label(lanName2)
        segs1 = node1.value
        segs2 = node2.value
        alignInSeg[lanPair] = get_dna_align(segs1, segs2)
        alignSeg[lanPair] = get_seg_align(segs1, segs2)
        # very small computational error in pdm method
        ts[lanPair] = pdm(node1.taxon, node2.taxon)

    # get segment lengths: pieces between '*' marks, empty pieces skipped.
    # A last piece without a closing '*' is counted as well.
    seq0 = multiAlignSeqs[list(multiAlignSeqs)[0]]
    lenSegs = [len(piece) for piece in seq0.split('*') if piece]
    return seqs, alignInSeg, alignSeg, segRateDict, ts, multiAlignSeqs, lenSegs