Exemplo n.º 1
0
    def runTest(self):
        """Testing NormalizedBitmaskDict

        Stores one value per row under the row's first key, then checks that
        the row's second key reaches the same entry, that the dictionary can
        be iterated, and that deleting through either key of a row removes
        the entry for both.
        """
        full_mask = 0xFF  # 1111 1111
        # Each row: ((inserted key, bit pattern), (paired key, bit pattern)).
        # The paired key is expected to resolve to the same entry.
        splits = [
            ((0x03, '0000 0011'), (0x03, '0000 0011')),
            ((0x34, '0011 0100'), (0xCB, '1100 1011')),
            ((0x44, '0100 0100'), (0xBB, '1011 1011')),
            ((0x12, '0001 0010'), (0xED, '1110 1101')),
            ((0x75, '0111 0101'), (0x75, '0111 0101')),
            ]
        bitmask_dict = containers.NormalizedBitmaskDict(mask=full_mask)
        for (inserted_key, inserted_bits), _ in splits:
            bitmask_dict[inserted_key] = inserted_bits

        for (inserted_key, _), (paired_key, _) in splits:
            self.assertIn(inserted_key, bitmask_dict)
            self.assertIn(paired_key, bitmask_dict)
            self.assertEqual(bitmask_dict[inserted_key],
                             bitmask_dict[paired_key])

        # Smoke test: walking the items must not raise.
        for _key, _value in bitmask_dict.items():
            pass

        # Delete the first row via its inserted key and the second row via
        # its paired key; afterwards neither key of either row may remain.
        (first_inserted, _), (first_paired, _) = splits[0]
        (second_inserted, _), (second_paired, _) = splits[1]
        del bitmask_dict[first_inserted]
        del bitmask_dict[second_paired]
        for removed_key in (first_inserted, first_paired,
                            second_inserted, second_paired):
            self.assertNotIn(removed_key, bitmask_dict)
Exemplo n.º 2
0
def tree_from_token_stream(stream_tokenizer, **kwargs):
    """
    Processes a (SINGLE) TREE statement read from `stream_tokenizer` and
    builds a `dataobject.Tree` from it. Assumes that the input stream is
    located at the beginning of the statement (i.e., the first non-comment
    token should be the opening parenthesis of the tree definition).

    Keyword arguments read by this function:

        - `translate_dict` (default None): passed to `StrToTaxon` when no
          `str_to_taxon` object is supplied.
        - `encode_splits` (default False): if True, a `split_bitmask` is
          maintained on every edge while parsing and `tree.split_edges` is
          populated. Requires a non-empty `taxon_set`.
        - `rooting_interpreter` (default `RootingInterpreter(**kwargs)`):
          decides `tree.is_rooted` from the tokenizer's rooting comment.
        - `finish_node_func` (default None): if given, called as
          `finish_node_func(node, tree)` when a node is closed by a ',' or
          ')' token.
        - `edge_len_type` (default float): read here, but not used in the
          visible part of this function.
        - `taxon_set` (default None): a new `dataobject.TaxonSet` is created
          if none is given.
        - `suppress_internal_node_taxa` (default False): if True, labels on
          internal nodes are not looked up as taxa.
        - `store_tree_weights` (default False): if True and the tokenizer
          holds a tree weight comment, `tree.weight` is set from it.
        - `extract_comment_metadata` (default False): assigned to the
          tokenizer; if True, comment metadata is stored on the tree and
          its nodes.
        - `case_sensitive_taxon_labels` (default False) and
          `allow_repeated_use` (default False): passed to `StrToTaxon`.
        - `str_to_taxon`: if supplied, used instead of constructing a
          `StrToTaxon`; it must supply the StrToTaxon interface.

    Returns None if no token can be read from the stream.

    Raises an Exception if `encode_splits` is requested with an empty
    `taxon_set`, and raises the result of
    `stream_tokenizer.data_format_error(...)` for malformed tree
    descriptions (unbalanced parentheses, missing or repeated labels).

    NOTE(review): only part of the token loop is visible in this excerpt;
    the handling after a label's taxon lookup is not shown here.
    """
    translate_dict = kwargs.get("translate_dict", None)
    encode_splits = kwargs.get("encode_splits", False)
    rooting_interpreter = kwargs.get("rooting_interpreter",
                                     RootingInterpreter(**kwargs))
    finish_node_func = kwargs.get("finish_node_func", None)
    edge_len_type = kwargs.get("edge_len_type", float)
    taxon_set = kwargs.get("taxon_set", None)
    suppress_internal_node_taxa = kwargs.get("suppress_internal_node_taxa",
                                             False)
    store_tree_weights = kwargs.get("store_tree_weights", False)
    extract_comment_metadata = kwargs.get('extract_comment_metadata', False)
    case_sensitive_taxon_labels = kwargs.get('case_sensitive_taxon_labels',
                                             False)
    allow_repeated_use = kwargs.get('allow_repeated_use', False)
    # Remember the tokenizer's current setting before overriding it for this
    # tree. NOTE(review): no restore of the saved value is visible in this
    # excerpt -- confirm it happens further down.
    stream_tokenizer_extract_comment_metadata_setting = stream_tokenizer.extract_comment_metadata
    stream_tokenizer.extract_comment_metadata = extract_comment_metadata
    if taxon_set is None:
        taxon_set = dataobject.TaxonSet()
    tree = dataobject.Tree(taxon_set=taxon_set)

    stream_tokenizer.tree_rooting_comment = None  # clear previous comment
    stream_tokenizer.clear_comment_metadata()
    token = stream_tokenizer.read_next_token()
    if not token:
        return None
    # The rooting comment was cleared above, so any value seen here was set
    # while reading the first token.
    tree.is_rooted = rooting_interpreter.interpret_as_rooted(
        stream_tokenizer.tree_rooting_comment)
    #    if stream_tokenizer.tree_rooting_comment is not None:
    #        tree.is_rooted = rooting_interpreter.interpret_as_rooted(stream_tokenizer.tree_rooting_comment)
    #    elif rooting_interpreter.interpret_as_rooted(stream_tokenizer.tree_rooting_comment):
    #        tree_is_rooted = True

    if store_tree_weights and stream_tokenizer.tree_weight_comment is not None:
        try:
            # The weight is the second space-separated field of the comment;
            # it may be a fraction ("a/b"), so each '/'-separated component
            # is wrapped in float(...) before the expression is evaluated.
            weight_expression = stream_tokenizer.tree_weight_comment.split(
                ' ')[1]
            # NOTE(review): eval() runs text taken from the tokenizer's
            # weight comment (presumably input-file content -- confirm);
            # the float(...) wrapping does not sanitize it. Only IndexError
            # and ValueError are handled below; other exceptions propagate.
            tree.weight = eval("/".join(
                ["float(%s)" % cv for cv in weight_expression.split('/')]))
        except IndexError:
            pass
        except ValueError:
            pass
        stream_tokenizer.tree_weight_comment = None

    if encode_splits:
        if len(taxon_set) == 0:
            raise Exception("When encoding splits on a tree as it is being parsed, a "
                + "fully pre-populated TaxonSet object must be specified using the 'taxon_set' keyword " \
                + "to avoid taxon/split bitmask values changing as new Taxon objects are created " \
                + "and added to the TaxonSet.")
        # Rooted trees use a plain dict; unrooted trees use a
        # NormalizedBitmaskDict masked by the bitmask of all taxa.
        if tree.is_rooted:
            tree.split_edges = {}
        else:
            atb = taxon_set.all_taxa_bitmask()
            d = containers.NormalizedBitmaskDict(mask=atb)
            tree.split_edges = d
        split_map = tree.split_edges

    # Use the caller's label-to-taxon mapper if one was supplied.
    stt = kwargs.get('str_to_taxon')
    if stt is None:
        stt = StrToTaxon(taxon_set,
                         translate_dict,
                         allow_repeated_use=allow_repeated_use,
                         case_sensitive=case_sensitive_taxon_labels)

    tree.seed_node = dataobject.Node()
    curr_node = tree.seed_node
    if encode_splits:
        curr_node.edge.split_bitmask = 0L

    ### NHX format support ###
    def store_node_comments(active_node):
        # Append any comments currently held by the tokenizer to the node.
        if stream_tokenizer.comments:
            active_node.comments.extend(stream_tokenizer.comments)

    def store_comment_metadata(target):
        # Move pending comment metadata from the tokenizer onto `target`
        # (merging into an existing `comment_metadata` if present); when
        # nothing is pending, make sure `target` has an empty dict.
        if extract_comment_metadata:
            if stream_tokenizer.has_comment_metadata():
                comment_metadata = stream_tokenizer.comment_metadata
                try:
                    target.comment_metadata.update(comment_metadata)
                except AttributeError:
                    target.comment_metadata = comment_metadata
                stream_tokenizer.clear_comment_metadata()
            elif not hasattr(target, "comment_metadata"):
                target.comment_metadata = {}

    # store and clear comments
    tree.comments = stream_tokenizer.comments
    stream_tokenizer.clear_comments()
    store_comment_metadata(tree)

    # Main token loop: `curr_node` is the node currently being filled in.
    while True:
        if not token or token == ';':
            # End of statement: we must be back at the seed node.
            if curr_node is not tree.seed_node:
                raise stream_tokenizer.data_format_error(
                    "Unbalanced parentheses -- not enough ')' characters found in tree description"
                )
            if encode_splits:
                split_map[curr_node.edge.split_bitmask] = curr_node.edge
            break
        if token == '(':
            # Open a new child and descend into it. A '(' at the seed node
            # is only valid while the seed node has no children yet.
            if not curr_node.parent_node:
                if curr_node.child_nodes():
                    raise stream_tokenizer.data_format_error(
                        "Unexpected '(' after the tree description.  Expecting a label for the root or a ;"
                    )
            tmp_node = dataobject.Node()
            if encode_splits:
                tmp_node.edge.split_bitmask = 0L
            curr_node.add_child(tmp_node)
            curr_node = tmp_node
            token = stream_tokenizer.read_next_token()
            store_node_comments(curr_node)
            store_comment_metadata(curr_node)
        elif token == ',':
            # Close the current node and start a sibling under its parent.
            tmp_node = dataobject.Node()
            if curr_node.is_leaf() and not curr_node.taxon:
                #                 curr_node.taxon = taxon_set.Taxon(oid="UNAMED_" + str(id(curr_node)), label='')
                #                 taxon_set.add(curr_node.taxon)
                raise stream_tokenizer.data_format_error(
                    "Missing taxon specifier in a tree -- found either a '(,' or ',,' construct."
                )
            p = curr_node.parent_node
            if not p:
                raise stream_tokenizer.data_format_error(
                    "Comma found one the 'outside' of a newick tree description"
                )
            if encode_splits:
                # Register the finished edge and fold its bitmask into the
                # parent edge's bitmask.
                tmp_node.edge.split_bitmask = 0L
                e = curr_node.edge
                u = e.split_bitmask
                split_map[u] = e
                p.edge.split_bitmask |= u
            if finish_node_func is not None:
                finish_node_func(curr_node, tree)
            p.add_child(tmp_node)
            curr_node = tmp_node
            token = stream_tokenizer.read_next_token()
            store_node_comments(curr_node)
            store_comment_metadata(curr_node)
        else:
            if token == ')':
                # Close the current node and move back up to its parent.
                if curr_node.is_leaf() and not curr_node.taxon:
                    raise stream_tokenizer.data_format_error(
                        "Missing taxon specifier in a tree -- found either a '(,' or ',,' construct."
                    )
                p = curr_node.parent_node
                if not p:
                    raise stream_tokenizer.data_format_error(
                        "Unbalanced parentheses -- too many ')' characters found in tree description"
                    )
                if encode_splits:
                    e = curr_node.edge
                    u = e.split_bitmask
                    p.edge.split_bitmask |= u
                    split_map[u] = curr_node.edge
                if finish_node_func is not None:
                    finish_node_func(curr_node, tree)
                curr_node = p
            else:
                # Any other token is treated as a label for the current node.
                is_leaf = curr_node.is_leaf()
                if is_leaf:
                    if curr_node.taxon:
                        raise stream_tokenizer.data_format_error(
                            "Multiple labels found for the same leaf (taxon '%s' and label '%s')"
                            % (str(curr_node.taxon), token))
                    try:
                        t = stt_require_taxon(stt, label=token)
                    except StrToTaxon.MultipleTaxonUseError, e:
                        raise stream_tokenizer.data_format_error(e.msg)
                else:
                    if curr_node.label:
                        raise stream_tokenizer.data_format_error(
                            "Multiple labels found for the same leaf (taxon '%s' and label '%s')"
                            % (curr_node.label, token))
                    if suppress_internal_node_taxa:
                        t = None
                    else:
                        try:
                            t = stt.get_taxon(label=token)
                        except StrToTaxon.MultipleTaxonUseError, e:
                            raise stream_tokenizer.data_format_error(e.msg)
Exemplo n.º 3
0
def encode_splits(tree, create_dict=True, delete_outdegree_one=True):
    """
    Processes splits on a tree, encoding them as a bitmask on each edge.

    Adds the following to each edge:
        - `split_bitmask` : a rooted split representation, i.e. a long/bitmask
            where bits corresponding to indices of taxa descended from this
            edge are turned on (0 for a leaf without a taxon).
    If `create_dict` is True, then the following is added to the tree:
        - `split_edges`:
            [if `tree.is_rooted`]: a dictionary where keys are the
            splits and values are edges.
            [otherwise]: a containers.NormalizedBitmaskDict holding the same
            entries, constructed with the seed edge's bitmask as mask (i.e.
            only the taxa found on the tree, not all taxa in the taxon set).
      If `create_dict` is False, `tree.split_edges` is neither created nor
      modified.
    If `delete_outdegree_one` is True then nodes with exactly one child
        will be deleted as they are encountered (this is required
        if the split_edges dictionary is to refer to all edges in the tree).
        The edge of a deleted node is not entered into `split_edges`.
        Note this will mean that an unrooted tree like '(A,(B,C))' will
        be changed to '(A,B,C)' after this operation!

    If the tree has no taxon set, `tree.infer_taxa()` is used to obtain one.
    If the tree has no seed node, the function returns after (optionally)
    installing an empty `split_edges` dictionary. Returns None.
    """
    taxon_set = tree.taxon_set
    if taxon_set is None:
        taxon_set = tree.infer_taxa()
    split_map = None
    if create_dict:
        tree.split_edges = {}
        split_map = tree.split_edges
    if not tree.seed_node:
        return

    if delete_outdegree_one:
        sn = tree.seed_node
        if not tree.is_rooted:
            if len(sn.child_nodes()) == 2:
                tree.deroot()
        # Collapse a chain of single-child nodes hanging off the seed node,
        # stopping if the only child is a leaf.
        while len(sn.child_nodes()) == 1:
            c = sn.child_nodes()[0]
            if len(c.child_nodes()) == 0:
                break
            try:
                sn.edge.length += c.edge.length
            except (TypeError, AttributeError):
                # Best effort: lengths that are unset (e.g. None) or
                # otherwise cannot be added are left as they are.
                pass
            sn.remove_child(c)
            for gc in c.child_nodes():
                sn.add_child(gc)

    for edge in tree.postorder_edge_iter():
        cm = 0
        h = edge.head_node
        child_nodes = h.child_nodes()
        nc = len(child_nodes)
        if nc > 0:
            if nc == 1 and delete_outdegree_one and edge.tail_node:
                # Splice the single-child node out of the tree: its child
                # takes its position under its parent and absorbs its length.
                p = edge.tail_node
                c = child_nodes[0]
                try:
                    c.edge.length += edge.length
                except (TypeError, AttributeError):
                    # Best effort, as above.
                    pass
                pos = p.child_nodes().index(h)
                p.add_child(c, pos=pos)
                p.remove_child(h)
                # The edge is now detached from the tree, so it must not be
                # registered in the split map (it would shadow key 0).
                edge.split_bitmask = cm
                continue
            else:
                for child in child_nodes:
                    cm |= child.edge.split_bitmask
        else:
            t = edge.head_node.taxon
            if t:
                cm = taxon_set.taxon_bitmask(t)
        edge.split_bitmask = cm
        if create_dict:
            split_map[cm] = edge
    # create normalized bitmasks, where the full (tree) split mask is *not*
    # all the taxa, but only those found on the tree
    # (only when a dictionary was requested: otherwise there is nothing to
    # normalize and `tree.split_edges` may not even exist)
    if create_dict and not tree.is_rooted:
        mask = tree.seed_node.edge.split_bitmask
        d = containers.NormalizedBitmaskDict(mask=mask)
        for k, v in tree.split_edges.items():
            d[k] = v
        tree.split_edges = d