示例#1
0
文件: graph.py 项目: kadster/lingpy
def nwk2gml(
    treefile,
    filename='',
):
    """
    Convert a tree in Newick format to a directed graph.

    Parameters
    ----------
    treefile : str or tree object
        Either the path to a file containing the tree in Newick format, or
        the Newick string itself. Anything that is not a str is used as an
        already loaded tree object without conversion.
    filename : str (default='')
        Handed over, together with the graph, to ``_graph_or_file``, which
        decides whether the graph is returned or written to a GML file.

    Returns
    -------
    The return value of ``_graph_or_file(graph, filename)``.

    Notes
    -----
    Each node of the graph carries a boolean ``tip`` attribute that is True
    for the tips (leaves) of the tree. Edges point from the parent to the
    child.
    """
    graph = nx.DiGraph()

    # A string is first tried as a file path; if loading fails it is parsed
    # as a Newick string instead. ``Exception`` (rather than a bare except)
    # keeps KeyboardInterrupt and SystemExit from being swallowed.
    if isinstance(treefile, str):
        try:
            tree = cg.LoadTree(treefile)
        except Exception:
            tree = cg.LoadTree(treestring=treefile)
    else:
        tree = treefile

    nodes = tree.getNodeNames()

    # a set makes the tip test below O(1) per node
    taxa = set(tree.getTipNames())

    for node in nodes:
        # add the node explicitly so that the "tip" attribute is always set
        graph.add_node(node, tip=node in taxa)

        # nodes without a (truthy) parent get no incoming edge
        parent = tree.getNodeMatchingName(node).Parent
        if parent:
            graph.add_edge(parent.Name, node)

    return _graph_or_file(graph, filename)
示例#2
0
def matrix2tree(matrix,
                taxa,
                tree_calc="neighbor",
                distances=True,
                filename=""):
    """
    Compute a tree from a distance matrix.

    Parameters
    ----------
    matrix : list
        The distance matrix to be used.
    taxa : list
        A list of the taxa in the distance matrix.
    tree_calc : str (default="neighbor")
        The clustering method used to build the tree:

        * "neighbor": Neighbor-joining method (:evobib:`Saitou1987`)
        * "upgma" : UPGMA method (:evobib:`Sokal1958`)

    distances : bool (default=True)
        Passed on to the clustering function; if True, distances are
        included in the tree representation.
    filename : str (default='')
        If given, the tree is written to ``filename + '.nwk'`` and nothing
        is returned.

    Returns
    -------
    tree : ~lingpy.thirdparty.cogent.tree.PhyloNode or None
        The tree object when no filename was given, otherwise None.

    Raises
    ------
    ValueError
        If `tree_calc` is neither "neighbor" nor "upgma".
    """
    # pick the clustering routine
    if tree_calc == 'neighbor':
        build = cluster.neighbor
    elif tree_calc == 'upgma':
        build = cluster.upgma
    else:
        raise ValueError(tree_calc)

    # the clustering routine yields the tree string that is loaded here
    newick = build(matrix, taxa, distances)
    tree = cg.LoadTree(treestring=newick)

    if filename:
        util.write_text_file(filename + '.nwk', text_type(tree))
        return None

    return tree
示例#3
0
文件: tree.py 项目: kadster/lingpy
def nwk2tree_matrix(newick):
    """
    Convert a Newick tree to a tree matrix.

    Parameters
    ----------
    newick : str or tree object
        Either a Newick string, or an object that has a ``root`` attribute,
        which is then used as the tree without conversion.

    Returns
    -------
    matrix : list
        One row ``[idxA, idxB, dst, obs]`` per internal node, where `idxA`
        and `idxB` are the ids of the first two children of the node, `obs`
        is the number of tips below the node and `dst` is that same number
        as a float.
    taxa : list
        The taxa of the tree, ordered by decreasing number of edges between
        them and the node named 'root'. The position of a taxon in this list
        is its id in `matrix`.

    Raises
    ------
    TypeError
        If `newick` is neither a str nor an object with a ``root`` attribute.

    Notes
    -----
    This is an additional function that can be used for plots with help of
    matplotlib's functions. The tree matrix is meant to be compatible with
    the matrices that scipy's linkage functions create.

    Only the first two children of every internal node are used.
    """
    if isinstance(newick, str):
        tree = cg.LoadTree(treestring=newick)
    elif hasattr(newick, 'root'):
        tree = newick
    else:
        # previously this case fell through and failed later with an
        # UnboundLocalError on ``tree``
        raise TypeError(
            "newick must be a Newick string or a tree object, "
            "got {0!r}".format(type(newick)))

    # taxa farthest away from the root come first
    taxa = sorted(
        tree.taxa,
        key=lambda x: len(tree.getConnectingEdges('root', x)),
        reverse=True)

    tax2id = dict(zip(taxa, range(len(taxa))))

    # internal nodes, smallest subtrees first; ``sorted`` consumes the
    # generator completely before ``tax2id`` is extended below
    nodes = sorted(
        (t for t in tree.getNodeNames() if t not in tax2id),
        key=lambda x: len(tree.getNodeMatchingName(x).tips()))

    # ids of internal nodes continue after the ids of the taxa; a running
    # counter avoids recomputing the maximum id for every node
    next_id = len(taxa)
    matrix = []

    for node in nodes:
        n = tree.getNodeMatchingName(node)
        names = [c.Name for c in n.Children]
        idxA = tax2id[names[0]]
        idxB = tax2id[names[1]]

        tax2id[node] = next_id
        next_id += 1

        obs = len(n.tips())
        matrix.append([idxA, idxB, obs * 1.0, obs])

    return matrix, taxa
示例#4
0
文件: qlc.py 项目: kadster/lingpy
def read_qlc(infile, comment='#'):
    """
    Load a file in QLC format into a dictionary.

    Parameters
    ----------
    infile : str
        The name of the input file.
    comment : str (default="#")
        The comment character. If a line starts with this character, it will be
        ignored.

    Returns
    -------
    d : dict
        A dictionary with integer keys for the data rows. If the first cell
        of the header is "id", "local_id" or "localid", the first column is
        converted to int and used as key; otherwise the rows are numbered
        from 1 in file order. The lower-cased header is stored under key 0.
        Everything collected from ``@key: value`` lines and from
        ``<tag ...> ... </tag>`` blocks is added under its own key.

    Raises
    ------
    IndexError
        If the file contains no tab-separated data line at all, or if a
        ``<tag>`` block is never closed.
    Exception
        If a value of the ID column cannot be converted to an integer.

    Notes
    -----
    SECURITY: the ``dtype`` attribute of a ``<csv ...>`` block is passed to
    ``eval``. Only read files from trusted sources.
    """
    from collections import deque

    # a deque gives O(1) removal at the front, list.pop(0) is O(n) per call
    lines = deque(read_text_file(infile, lines=True, normalize="NFC"))
    data, meta, dtype = [], {}, False

    while lines:
        line = lines.popleft()
        if line.startswith(comment) or not line:
            continue

        # simple metadata of the form "@key: value"
        if line.startswith('@'):
            key, value = [s.strip() for s in line[1:].split(':', 1)]
            if key == 'tree':
                meta["tree"] = cg.LoadTree(treestring=value)
            elif key == 'json':
                for j1, j2 in json.loads(value).items():
                    meta[j1] = j2
            else:
                if key not in meta:
                    meta[key] = value
                else:
                    # repeated keys are collected in a list
                    if isinstance(meta[key], list):
                        meta[key].append(value)
                    else:
                        log.warning(
                            "Key '{0}' in input file is not unique! Use JSON-format for "
                            "these datatypes!".format(key))
                        meta[key] = [meta[key], value]

        # block of the form "<dtype key="value" ...>" ... "</dtype>"
        elif line.startswith('<'):
            tmp = line[1:line.index('>')]
            # attributes are given as space-separated key="value" pairs; the
            # surrounding quote characters of the value are stripped
            if ' ' in tmp:
                dtype = tmp.split(' ')[0]
                keys = {
                    k: v[1:-1]
                    for k, v in [attr.split('=') for attr in tmp.split(' ')[1:]]
                }
            else:
                dtype = tmp.strip()
                keys = {}

            # collect all lines up to the closing tag
            block = []
            while True:
                line = lines.popleft()
                if line.startswith('</' + dtype + '>'):
                    break
                block.append(line)

            tmp = '\n'.join(block)

            # dispatch on the type of the block
            if dtype == "json":
                tmp = json.loads(tmp)
                if not keys:
                    for key in tmp:
                        meta[key] = tmp[key]
                else:
                    meta[keys["id"]] = {}
                    for k in tmp:
                        meta[keys["id"]][k] = tmp[k]
            elif dtype in ['tre', 'nwk']:
                if "trees" not in meta:
                    meta["trees"] = {}

                if not keys:
                    keys["id"] = "1"

                # XXX consider switching to Tree here XXX
                meta['trees'][keys["id"]] = cg.LoadTree(treestring=tmp)
            elif dtype in ['csv']:
                meta[keys["id"]] = {}
                ncol = int(keys.get('ncol', 2))

                if "dtype" in keys:
                    # SECURITY: evaluates text taken from the input file;
                    # unsafe for untrusted files
                    transf = eval(keys["dtype"])
                else:
                    transf = str

                for l in tmp.split('\n'):
                    if ncol == 2:
                        a, b = l.split('\t')
                        b = transf(b)
                    else:
                        cells = l.split('\t')
                        a = cells[0]
                        b = [transf(cell) for cell in cells[1:]]
                    meta[keys["id"]][a] = b
            elif dtype == 'msa':
                tmp = tmp.split('\n')
                if 'msa' not in meta:
                    meta['msa'] = {}

                ref = keys.get('ref', 'cogid')
                if ref not in meta['msa']:
                    meta['msa'][ref] = {}

                tmp_msa = {}
                # fall back to the file name if no dataset was declared
                if 'dataset' in meta:
                    tmp_msa['dataset'] = meta['dataset']
                else:
                    tmp_msa['dataset'] = infile.replace('.csv', '')

                tmp_msa['seq_id'] = keys['id']

                # add consensus string to msa, if it appears in the keys
                if "consensus" in keys:
                    tmp_msa['consensus'] = keys['consensus']

                msad = [
                    [x.strip().rstrip('.') for x in l.split('\t')]
                    for l in tmp if not l.startswith(comment)
                ]
                tmp_msa = _list2msa(msad, header=False, ids=True, **tmp_msa)

                # use an integer key where the id is numeric
                try:
                    meta['msa'][ref][int(keys['id'])] = tmp_msa
                except ValueError:
                    meta['msa'][ref][keys['id']] = tmp_msa

            elif dtype == 'dst':
                _, matrix = read_dst(tmp)
                # mirror the cells above the diagonal, the diagonal stays 0.0
                distances = [[0.0 for _ in matrix] for _ in matrix]
                for i, row in enumerate(matrix):
                    for j, cell in enumerate(row):
                        if i < j:
                            distances[i][j] = cell
                            distances[j][i] = cell
                meta['distances'] = distances
            elif dtype == 'scorer':
                scorer = read_scorer(tmp)
                if 'scorer' not in meta:
                    meta['scorer'] = {}
                keys.setdefault('id', 'basic')
                meta['scorer'][keys['id']] = scorer

            elif dtype == 'taxa':
                meta['taxa'] = [t.strip() for t in tmp.split('\n')]

        # everything else is a tab-separated data line
        else:
            data += [[l.strip() for l in line.split('\t')]]

    # create the dictionary in which the data will be stored
    d = {}

    # check for first line, if a local ID is given in the header (or simply
    # "ID"), take this line as the ID, otherwise create it
    local_id = data[0][0].lower() in ['id', 'local_id', 'localid']

    # iterate over data and fill the dictionary (a bit inefficient, but enough
    # for the moment)
    rows = data[1:]
    try:
        i = 1
        for j, line in enumerate(rows):
            if local_id:
                d[int(line[0])] = line[1:]
            else:
                d[i] = line
                i += 1
    except ValueError as e:  # pragma: no cover
        raise Exception("Error processing line {0}:\n".format(j) +
                        str(rows[j]) + '\nOriginal error message: ' +
                        str(e))

    # assign the header to d[0]
    if local_id:
        d[0] = [x.lower() for x in data[0][1:]]
    else:
        d[0] = [x.lower() for x in data[0]]

    # metadata is merged last
    for m in meta:
        d[m] = meta[m]

    # without an explicit tree, use the tree with the smallest id
    if 'trees' in d and 'tree' not in d:
        d['tree'] = sorted(d['trees'].items(), key=lambda x: x[0])[0][1]

    return d
示例#5
0
文件: graph.py 项目: kadster/lingpy
def gls2gml(
    gls,
    graph,
    tree,
    filename='',
):
    """
    Create GML-representation of a given gain-loss-scenario (GLS).

    Parameters
    ----------
    gls : list
        A list of ``(node_name, event)`` tuples, indicating the origins of
        characters along a tree. An event of 1 is treated as a gain, every
        other value as a loss.
    graph : networkx.graph
        A graph that serves as a template for the plotting of the GLS. Each
        node needs a ``label`` attribute holding the name of the matching
        tree node and a ``graphics`` dictionary.
    tree : cogent.tree.PhyloNode or str
        A tree object, or a Newick string from which the tree is loaded.
    filename : str (default='')
        Handed over, together with the new graph, to ``_graph_or_file``.

    Returns
    -------
    The return value of ``_graph_or_file(g, filename)``.

    Notes
    -----
    The attribute dictionaries of `graph` are modified in place.

    Node states are 'O' (node at which a gain is placed), 'o' (node below
    a gain), 'L' (node at which a loss is placed) and 'l' (node below a
    loss, and the initial state of every node when the scenario does not
    start with a gain at the root).
    """
    # check for tree-formatting
    if isinstance(tree, str):
        tree = cg.LoadTree(treestring=tree)

    # map the names of the tree nodes to the node ids of the graph
    mapper = {data['label']: node for node, data in graph.nodes(data=True)}

    # create a graph
    g = nx.Graph()

    # sort the gls according to the number of tips, so that events on large
    # subtrees are applied first and may be overwritten further down
    gls_srt = sorted(gls,
                     key=lambda x: len(tree.getNodeMatchingName(x[0]).tips()),
                     reverse=True)

    # set the basic event frame, depending on the state of the root
    if gls_srt[0][1] == 1 and gls_srt[0][0] == 'root':
        this_color = "#ffffff"
        state = 'O'
    else:
        this_color = "#000000"
        state = 'l'

    # let all nodes inherit these parameters
    for node, data in graph.nodes(data=True):
        data['graphics']['fill'] = this_color
        data['graphics']['type'] = 'ellipse'
        data['graphics']['w'] = 20.0
        data['graphics']['h'] = 20.0
        data['origin'] = 0
        data['state'] = state

        g.add_node(node, **data)

    # assign the root as starting point (drawn larger than the other nodes)
    data = graph.nodes[mapper['root']]
    data['graphics']['type'] = 'ellipse'
    data['graphics']['w'] = 50.0
    data['graphics']['h'] = 50.0
    data['state'] = state
    g.add_node(mapper['root'], **data)

    # iterate over the nodes involved in change and assign the values to their
    # children
    for name, event in gls_srt:
        if event == 1:
            this_fill = '#ffffff'
            state = 'O'
        else:
            this_fill = '#000000'
            state = 'L'

        # get the names of the nodes in the subtree
        sub_tree_nodes = tree.getNodeMatchingName(name).getNodeNames()

        # the nodes of the subtree get the lower-case state
        for node in sub_tree_nodes:
            data = g.nodes[mapper[node]]
            data['graphics']['fill'] = this_fill
            data['state'] = state.lower()
            g.add_node(mapper[node], **data)

        # the node of the event itself is enlarged and gets the upper-case
        # state
        g.nodes[mapper[name]]['graphics']['h'] = 50.0
        g.nodes[mapper[name]]['graphics']['w'] = 50.0
        g.nodes[mapper[name]]['graphics']['fill'] = this_fill
        g.nodes[mapper[name]]['origin'] = 1
        g.nodes[mapper[name]]['state'] = state

    # add the edges to the tree
    for edgeA, edgeB, data in graph.edges(data=True):
        # for computers with new networkx version: drop the "Line" entry if
        # there is one; edges without it are simply left as they are
        try:
            del data['graphics']['Line']
        except (KeyError, TypeError):
            pass
        g.add_edge(edgeA, edgeB, **data)

    return _graph_or_file(g, filename)
示例#6
0
文件: graph.py 项目: kadster/lingpy
def radial_layout(treestring,
                  change=lambda x: x**1.75,
                  degree=100,
                  filename='',
                  start=0,
                  root='root'):
    """
    Function calculates a simple radial tree layout.

    Parameters
    ----------
    treestring : str or tree object
        Either a str defining the path to a file containing the tree in
        Newick-format, or the tree-string itself. Anything that is not a str
        is used as an already loaded tree object.
    change : function (default = lambda x: x**1.75)
        The function used to modify the radius in the polar projection of the
        tree.
    degree : int (default=100)
        The angle, in degrees, over which the leaves are spread. Values up
        to 180 are additionally offset by ``(180 - degree) / 2`` degrees.
    filename : str (default='')
        Handed over, together with the graph, to ``_graph_or_file``.
    start : int (default=0)
        The angle, in degrees, that is added to the start of the projection.
    root : str (default='root')
        The name of the root node of the tree.

    Returns
    -------
    The return value of ``_graph_or_file(graph, filename)``. The graph is a
    representation of the tree with the coordinates specified in the
    ``graphics`` attribute of the nodes.

    Notes
    -----

    This function creates a radial tree-layout from a given tree specified in
    Newick format.

    Each node gets a ``label`` attribute (its name) and a ``graphics``
    dictionary with the keys ``x``, ``y``, ``z``, ``angle``, ``s`` and
    ``zorder``.
    """
    from collections import deque

    # calculate the factor for projection from the degree (explicit floats
    # keep the division a true division on every Python version)
    pfactor = degree / 360.0

    # get starting factor
    startf = start * np.pi / 180

    # calculate the projection (should be centered)
    if degree <= 180:
        pstart = startf + (180 - degree) / 360.0 * np.pi
        pend = pstart + 2 * np.pi * pfactor
    else:
        pstart = startf
        pend = startf + 2 * np.pi * pfactor

    # define private function for centering of nodes
    def get_center(nodes):
        # the theta-value lies halfway between the smallest and the largest
        # theta of the given nodes
        thetas = [n[0] for n in nodes]
        xA, xB = min(thetas), max(thetas)
        x = (xA + abs(xA - xB) / 2)

        # the radius is simply the smallest radius decreased by 1
        y = min([n[1] for n in nodes]) - 1

        return x, y

    # get the tree: a string is first tried as a file path and then parsed
    # as a Newick string
    if isinstance(treestring, str):
        try:
            tree = cg.LoadTree(treestring)
        except Exception:
            tree = cg.LoadTree(treestring=treestring)
    else:
        tree = treestring

    # get the leaves
    leaves = tree.getTipNames()

    # the longest path between the root and a leaf is the radius of the tree
    maxL = max(len(tree.getConnectingEdges(root, leaf)) for leaf in leaves)

    # get the initial coordinates: all leaves are placed on the outer circle
    coords = {}
    for node, x in zip(leaves, np.linspace(pstart, pend, len(leaves))):
        coords[node] = (x, maxL, 0)

    # work upwards from the leaves; a node is put back into the queue as
    # long as one of its siblings has no coordinates yet
    queue = deque((leaf, 0) for leaf in leaves)
    visited = set()

    while queue:
        node, dim = queue.popleft()

        # increase the dimension by 1
        dim += 1

        if node in visited:
            continue

        # get the parent and all of its children
        parent_node = tree.getNodeMatchingName(node).Parent
        children = [child.Name for child in parent_node.Children]

        if not all(child in coords for child in children):
            queue.append((node, dim))
            continue

        x, y = get_center([coords[child] for child in children])
        parent = parent_node.Name
        if parent == root:
            coords[parent] = (x, y, dim + 1)
        else:
            coords[parent] = (x, y, dim)

        visited.update(children)

        if parent != root:
            queue.append((parent, dim))

    # convert tree to graph
    graph = nwk2gml(treestring, filename=None)

    # iterate over the graph and assign the data
    for n, d in graph.nodes(data=True):
        x, y, z = coords[n]

        # change coordinates from polar to cartesian
        xN = change(y) * np.cos(x)
        yN = change(y) * np.sin(x)

        # get angle for text-rotation in degrees
        angle = x * 180 / np.pi

        # derive zorder from angle
        if angle <= 90:
            zorder = 90 - angle
        elif angle <= 270:
            zorder = angle - 90
        else:
            zorder = 90 + (360 - angle)

        # check for specific parts where the angle has to be adapted
        if 270 >= angle > 180:
            angle -= 180
            s = 'right'
        elif 180 >= angle >= 90:
            angle += 180
            s = 'right'
        else:
            s = 'left'

        # assign the data to the graph
        d['graphics'] = {
            'x': xN,
            'y': yN,
            'z': z,
            'angle': angle,
            's': s,
            'zorder': int(zorder)
        }

        # don't forget the label
        d['label'] = n

    return _graph_or_file(graph, filename)
示例#7
0
# load long language names (one name per line); the index of a name is its
# position in the file
with open('data/asjp/world_longnames.txt', 'r') as f:
    rl = f.readlines()
longnames = array([x.strip() for x in rl])
# if a name occurs more than once, its last position is kept
longNameToID = {name: i for i, name in enumerate(longnames)}

# load language names (one name per line)
with open('data/asjp/world_names.txt', 'r') as f:
    rl = f.readlines()
names = array([x.strip() for x in rl])
nameToID = {name: i for i, name in enumerate(names)}

guideTree = cg.LoadTree("data/asjp/world-NWPV.nwk")
# convert guideTree node names to integers as expected by Lingpy MSA
for leaf in guideTree.tips():
    leaf.Name = str(longNameToID[leaf.Name])

iteration = 1
numIterations = 100

while iteration <= numIterations:
    if iteration == 1:
        #mfile = open("replacement-weights.txt","r")
        #sounds = array(mfile.readline().strip().split("\t"))
        #repWeightsRaw = mfile.readlines()
        #mfile.close()
        #repWeights = array([x.strip().split('\t') for x in repWeightsRaw])
示例#8
0
def plot_gls(gls, treestring, degree=90, fileformat='pdf', **keywords):
    """
    Plot a gain-loss scenario for a given reference tree.

    Parameters
    ----------
    gls : list
        The gain-loss scenario, handed over to ``gls2gml``.
    treestring : str or tree object
        The reference tree: a Newick string, the path to a Newick file, or
        an already loaded tree object.
    degree : int (default=90)
        Handed over to ``radial_layout``.
    fileformat : str (default='pdf')
        The suffix of the output file, also used by matplotlib to select the
        output format.
    keywords : dict
        Plotting options. Missing options are filled from the defaults
        defined at the top of the function body (figure size and margins,
        node radius, line widths, colors and line styles for gains and
        losses, ``text`` to switch node labels on or off, and ``filename``,
        which defaults to ``rcParams['filename']``).

    Notes
    -----
    The plot is written to ``filename + '.' + fileformat``. Nothing is
    returned.
    """

    # get keywords
    defaults = dict(figsize=(15, 15),
                    left=0.05,
                    top=0.95,
                    bottom=0.05,
                    right=0.95,
                    radius=0.5,
                    textsize=8,
                    edgewidth=5,
                    linewidth=2,
                    scale_radius=1.2,
                    ylim=1,
                    xlim=1,
                    text=True,
                    gain_color='white',
                    loss_color='black',
                    gain_linestyle='dotted',
                    loss_linestyle='solid',
                    ax_linewidth=0,
                    filename=rcParams['filename'])

    for k, v in defaults.items():
        keywords.setdefault(k, v)

    # set filename as variable for convenience
    filename = keywords['filename']

    # try the input as Newick string first, then as a file path; if both
    # fail, it is taken to be a tree object already
    try:
        tree = cg.LoadTree(treestring=treestring)
    except Exception:
        try:
            tree = cg.LoadTree(treestring)
        except Exception:
            tree = treestring

    tgraph = radial_layout(treestring, degree=degree)

    graph = gls2gml(gls, tgraph, tree)

    # collect position and state of all nodes
    nodes = [
        (d['graphics']['x'], d['graphics']['y'], d['state'])
        for _, d in graph.nodes(data=True)
    ]

    # now plot the stuff
    fig = plt.figure(figsize=keywords['figsize'])
    figsp = fig.add_subplot(111)
    figsp.axes.get_xaxis().set_visible(False)
    figsp.axes.get_yaxis().set_visible(False)

    # set the axes linewidth
    for spine in figsp.spines.values():
        spine.set_linewidth(keywords['ax_linewidth'])

    plt.axis('equal')

    # draw the edges; ``graph.nodes`` is used because ``Graph.node`` is no
    # longer available in recent networkx versions
    for nA, nB in graph.edges():
        xA = graph.nodes[nA]['graphics']['x']
        xB = graph.nodes[nB]['graphics']['x']
        yA = graph.nodes[nA]['graphics']['y']
        yB = graph.nodes[nB]['graphics']['y']

        plt.plot([xA, xB], [yA, yB],
                 '-',
                 color='black',
                 linewidth=keywords['edgewidth'],
                 zorder=1)

    # now, iterate over nodes: upper-case states are drawn with the full
    # radius and their own line style, all other states with a reduced
    # radius
    for x, y, s in nodes:
        if s == 'O':
            w = mpl.patches.Wedge((x, y),
                                  keywords['radius'],
                                  0,
                                  360,
                                  facecolor=keywords['gain_color'],
                                  linewidth=keywords['linewidth'],
                                  linestyle=keywords['gain_linestyle'])
        elif s == 'o':
            w = mpl.patches.Wedge(
                (x, y),
                keywords['radius'] / keywords['scale_radius'],
                0,
                360,
                facecolor=keywords['gain_color'],
                linewidth=keywords['linewidth'])
        elif s == 'L':
            w = mpl.patches.Wedge((x, y),
                                  keywords['radius'],
                                  0,
                                  360,
                                  facecolor=keywords['loss_color'],
                                  linewidth=keywords['linewidth'],
                                  linestyle=keywords['loss_linestyle'])
        else:
            w = mpl.patches.Wedge(
                (x, y),
                keywords['radius'] / keywords['scale_radius'],
                0,
                360,
                facecolor=keywords['loss_color'],
                linewidth=keywords['linewidth'])
        figsp.add_artist(w)

        # if text is chosen as argument, label gains with "1" and
        # everything else with "0"
        if keywords['text']:
            if s in 'Oo':
                t = '1'
                c = 'black'
            else:
                t = '0'
                c = 'white'

            plt.text(x,
                     y,
                     t,
                     size=keywords['textsize'],
                     color=c,
                     va="center",
                     ha="center",
                     fontweight='bold')

    # set x and y-values
    xvals = [node[0] for node in nodes]
    yvals = [node[1] for node in nodes]

    plt.xlim(min(xvals) - keywords['xlim'], max(xvals) + keywords['xlim'])
    plt.ylim(min(yvals) - keywords['ylim'], max(yvals) + keywords['ylim'])

    plt.subplots_adjust(left=keywords['left'],
                        right=keywords['right'],
                        top=keywords['top'],
                        bottom=keywords['bottom'])
    plt.savefig(filename + '.' + fileformat)
    plt.clf()
    log.file_written(filename + '.' + fileformat)