예제 #1
0
def test_read_write(tmpdir):
    """Round-trip: trees read from the fixture survive write() + read()."""
    fixture = os.path.join(
        os.path.dirname(__file__), 'fixtures', 'tree-glottolog-newick.txt')
    trees = read(fixture)
    expected = [len(t.descendants) for t in trees]
    # The bookkeeping family has 391 languages
    assert expected[0] == 391
    out_path = str(tmpdir.join('test.txt'))
    write(trees, out_path)
    assert os.path.exists(out_path)
    assert [len(t.descendants) for t in read(out_path)] == expected
예제 #2
0
def test_read_write(tmpdir):
    """write() followed by read() must preserve every tree's shape."""
    src = os.path.join(os.path.dirname(__file__), 'fixtures',
                       'tree-glottolog-newick.txt')
    trees = read(src)
    sizes = [len(node.descendants) for node in trees]
    # The bookkeeping family has 391 languages
    assert sizes[0] == 391
    target = str(tmpdir.join('test.txt'))
    write(trees, target)
    assert os.path.exists(target)
    round_tripped = read(target)
    assert [len(node.descendants) for node in round_tripped] == sizes
예제 #3
0
 def test_read_write(self):
     """Round-trip: trees read from the fixture must survive write()+read()."""
     trees = read(os.path.join(
         os.path.dirname(__file__), 'fixtures', 'tree-glottolog-newick.txt'))
     descs = [len(tree.descendants) for tree in trees]
     # The bookkeeping family has 391 languages
     self.assertEqual(descs[0], 391)
     # BUG FIX: mktemp() is deprecated and race-prone (another process can
     # claim the returned name before we open it); mkstemp() creates the
     # file atomically. try/finally guarantees cleanup even when an
     # assertion fails, which the original did not.
     import tempfile
     fd, tmp = tempfile.mkstemp()
     os.close(fd)
     try:
         write(trees, tmp)
         assert os.path.exists(tmp)
         self.assertEqual([len(tree.descendants) for tree in read(tmp)],
                          descs)
     finally:
         os.remove(tmp)
예제 #4
0
def test_read_write(tmpdir):
    """Trees written with write() can be read back with identical shapes."""
    fixture = (pathlib.Path(__file__).parent / 'fixtures' /
               'tree-glottolog-newick.txt')
    trees = read(fixture)
    counts = [len(t.descendants) for t in trees]
    # The bookkeeping family has 391 languages
    assert counts[0] == 391
    out = str(tmpdir.join('test.txt'))
    write(trees, out)
    assert pathlib.Path(out).exists()
    assert [len(t.descendants) for t in read(out)] == counts
예제 #5
0
 def test_read_write(self):
     """Writing and re-reading the glottolog fixture preserves tree sizes."""
     trees = read(os.path.join(
         os.path.dirname(__file__), 'fixtures', 'tree-glottolog-newick.txt'))
     descs = [len(tree.descendants) for tree in trees]
     # The bookkeeping family has 391 languages
     self.assertEqual(descs[0], 391)
     # BUG FIX: replace deprecated, race-prone mktemp() with mkstemp(),
     # which creates the file atomically; remove the file in a finally
     # block so a failing assertion no longer leaks the temp file.
     import tempfile
     fd, tmp = tempfile.mkstemp()
     os.close(fd)
     try:
         write(trees, tmp)
         assert os.path.exists(tmp)
         self.assertEqual([len(tree.descendants) for tree in read(tmp)],
                          descs)
     finally:
         os.remove(tmp)
예제 #6
0
def test_read_write(tmp_path):
    """Comments survive parsing, and trees round-trip through write()/read()."""
    fixture = (pathlib.Path(__file__).parent / 'fixtures' /
               'tree-glottolog-newick.txt')
    trees = read(fixture)
    # Square-bracket comments must be kept inside node names.
    assert '[' in trees[0].descendants[0].name
    shapes = [len(t.descendants) for t in trees]
    # The bookkeeping family has 391 languages
    assert shapes[0] == 391
    out_file = tmp_path / 'test.txt'
    write(trees, out_file)
    assert out_file.exists()
    assert [len(t.descendants) for t in read(out_file)] == shapes
예제 #7
0
def parse_newick_file(filename: str, digraph=True):
    """
    Parse a newick file and return the corresponding networkx graph.

    :param filename: str; full path of the file to be parsed
    :param digraph: bool; build a directed graph when True, undirected otherwise
    :return: nx.DiGraph or nx.Graph
    """
    tree = newick.read(filename)

    if digraph:
        graph_newick = nx.DiGraph()
    else:
        # BUG FIX: the original assigned the class itself (nx.Graph) instead
        # of an instance, so every later add_node/add_edge call would fail
        # for the undirected case.
        graph_newick = nx.Graph()

    none_counter = 1

    # Adding root node
    graph_newick.add_node(tree[0], child_position=0)

    # Worklist traversal: take the head node, add it and its edges, queue its
    # descendants, and repeat until the whole tree has been consumed.
    while tree:
        tree_node = tree[0]
        tree_node, none_counter = rename_none_node(tree_node, none_counter)
        graph_newick, descendants, none_counter = add_newick_node_and_edge(
            graph_newick, tree_node, none_counter)
        tree += descendants
        tree.remove(tree_node)

    return graph_newick
예제 #8
0
def test_get_glottolog_newick(tmppath, mocker):
    """get_glottolog_data() must locate and return a parseable newick file."""
    seeded = tmppath.joinpath('glottolog-2.5.newick')
    seeded.write_text(
        '(B [abcd1234],C [abcd1234])A [abcd1234];', encoding='utf8')
    # Redirect the data directory lookup at our temp directory.
    mocker.patch('beastling.configuration.user_data_dir',
                 new=mocker.Mock(return_value=str(tmppath)))
    parsed = newick.read(str(get_glottolog_data('newick', '2.5')))
    assert parsed[0].name == 'A [abcd1234]'
예제 #9
0
    def test_get_glottolog_newick(self):
        """A cached glottolog newick file is found and parsed correctly."""
        cache_file = self.tmp.joinpath('glottolog-2.5.newick')
        with cache_file.open('w', encoding='utf8') as fp:
            fp.write('(B [abcd1234],C [abcd1234])A [abcd1234];')

        patched = patch(
            'beastling.configuration.user_data_dir',
            new=Mock(return_value=self.tmp.as_posix()))
        with patched:
            trees = newick.read(get_glottolog_data('newick', '2.5'))
            self.assertEqual(trees[0].name, 'A [abcd1234]')
예제 #10
0
    def test_get_glottolog_newick(self):
        """The seeded newick file is read back via get_glottolog_data()."""
        target = self.tmp.joinpath('glottolog-2.5.newick')
        with target.open('w', encoding='utf8') as fp:
            fp.write('(B [abcd1234],C [abcd1234])A [abcd1234];')

        with patch('beastling.configuration.user_data_dir',
                   new=Mock(return_value=self.tmp.as_posix())):
            result = newick.read(get_glottolog_data('newick', '2.5'))
            self.assertEqual(result[0].name, 'A [abcd1234]')
예제 #11
0
def trees_from_file(fname, encoding='utf8', strip_comments=False, **kw):
    """
    Load a list of trees from a Newick formatted file.

    :param fname: file path.
    :param encoding: text encoding of the file.
    :param strip_comments: Flag signaling whether to strip comments enclosed \
    in square brackets.
    :param kw: Keyword arguments are passed through to `Node.read`.
    :return: list of `PhyloTree` instances, one per tree in the file.
    """
    nodes = newick.read(fname, encoding, strip_comments, **kw)
    return [newick_node_to_tree(node) for node in nodes]
예제 #12
0
def tree_probs_from_file(fname, encoding='utf8', strip_comments=False, **kw):
    """
    Load `PhyloTree` instances from a Newick formatted file and return the
    probability of each one.

    :param fname: file path.
    :param encoding: text encoding of the file.
    :param strip_comments: Flag signaling whether to strip comments enclosed \
    in square brackets.
    :param kw: Keyword arguments are passed through to `Node.read`.
    :return: A list of symbolic expressions depending on `a`.
    """
    probabilities = []
    for node in newick.read(fname, encoding, strip_comments, **kw):
        probabilities.append(prob_tree(PhyloTree.newick_node_to_tree(node)))
    return probabilities
예제 #13
0
def main():
    """Prune each listed gene tree down to the leaves under node 'n441',
    keep only the longest-branch representative per species-pattern name,
    drop a C. elegans reference leaf and empty internal nodes, and dump
    the result to '<file>_required_tree.txt'."""
    filename = ['OG0000002_tree.txt', ]
    for file in filename:
        # Read dataset
        trees = read('./dataset/{}'.format(file))
        node_needed = trees[0].get_node('n441').get_leaves()
        all_node = trees[0].get_leaves()

        # Keep only the subtree under 'n441' (inverse prune drops the rest).
        trees[0].prune(node_needed, inverse=True)

        # Rename nodes
        trees[0].visit(clean_node)

        # Get node names
        leaves = trees[0].get_leaves()

        # Record, per pattern-matched name, the longest branch length seen.
        # NOTE(review): when a leaf name does NOT match the regex,
        # node_new_name keeps its value from the previous iteration (or is
        # unbound on the first one, raising NameError) -- confirm every leaf
        # name is expected to match.
        unique = {}
        for node in leaves:
            regex = r"[a-z]_[a-zA-Z]+_[a-zA-Z1-9]"
            if re.search(regex, node.name):
                node_new_name = re.search(regex, str(node.name)).group(0)
            if node_new_name in unique:
                unique[node_new_name] = max(unique[node_new_name], node.length)
            else:
                unique[node_new_name] = node.length 

        # Prune to just keep the longest unique nodes.
        # NOTE(review): prune_by_names is given a bare string here, while the
        # call below passes a list -- if the API tests `name in node_names`,
        # a string argument turns that into a substring check; [node.name]
        # is probably intended. Verify against the newick API.
        for node in leaves:
            regex = r"[a-z]_[a-zA-Z]+_[a-zA-Z1-9]"
            if re.search(regex, node.name):
                node_new_name = re.search(regex, str(node.name)).group(0)
            if node.length != unique[node_new_name]:
                trees[0].prune_by_names(node.name)

        # Remove the C. elegans reference leaf.
        c_elegans_remove = ['c_elegans_ref_protein_PAR-1']
        trees[0].prune_by_names(c_elegans_remove)

        # Remove internal nodes (names like 'n123') that lost all children:
        # repeat sweeps until a full pass deletes nothing.
        while trees[0].walk(mode='postorder'):
            atleast_once = True
            for n in trees[0].walk(mode='postorder'):
                regex = r"^n[1-9]+"
                if n.ancestor and len(n.descendants) == 0 and re.search(regex, n.name):
                    trees[0].prune_by_names(n.name)
                    atleast_once = False
            if atleast_once == True:
                break

        # Dump the pruned tree.
        with open('{}_required_tree.txt'.format(file), 'w') as fobj:
            dump(trees, fobj)
예제 #14
0
def parse_tree(fname):
    """Parse a newick file into a new Tree rooted at node id 1."""
    trees = read(fname)
    out_tree = Tree()
    out_tree.create_node("root", 1)
    # Pre-allocate ids for every possible node: 2 * #leaves + 1 of them,
    # numbered consecutively starting from 2.
    size = 2 * len(trees[0].get_leaves()) + 1
    st_ids = list(range(2, size + 2))

    parse_newick(trees[0].descendants, out_tree, 1, st_ids)

    return out_tree
예제 #15
0
    def run(self, dispatcher: CollectingDispatcher, tracker: Tracker,
            domain: Dict[Text, Any]) -> List[Dict[Text, Any]]:
        """Find languages related to the one the user mentioned and utter
        them in Hindi.

        :param dispatcher: used to send messages back to the user.
        :param tracker: conversation state; queried for the "language" entity.
        :param domain: the assistant's domain (unused here).
        :return: an empty event list -- this action only utters messages.
        """
        trees = read("data/tree.txt", strip_comments=True)
        entities = list(tracker.get_latest_entity_values("language"))
        data_path = os.path.join("data", "cldf-datasets-wals-014143f", "cldf",
                                 "languages.csv")
        wals_data = pd.read_csv(data_path)

        # BUG FIX: query_lang_en was only assigned inside this branch but read
        # unconditionally below, so a message with no detected "language"
        # entity crashed with NameError. Bail out politely instead.
        if not entities:
            dispatcher.utter_message(text='क्षमा करें, मुझे समझ नहीं आया')
            return []

        query_lang = entities.pop()
        query_lang_en = translator.translate(text=query_lang,
                                             lang_tgt='en')
        query_lang_en = query_lang_en.strip()
        query_lang_en = query_lang_en.lower()
        if len(query_lang_en.split(' ')) > 1:
            # Multi-word names: capitalize each word and match against the
            # WALS "Name" column to recover the canonical spelling.
            f = [x.capitalize() for x in query_lang_en.split(' ')]
            query_lang_en = list(
                set(f).intersection(set(wals_data["Name"])))[0]
        print(query_lang_en)

        # Collect every leaf whose name contains the query language.
        matched_leaves = []
        for node in trees:
            matched_leaves.extend(
                leaf for leaf in node.get_leaves()
                if query_lang_en in leaf.name)

        if matched_leaves:
            for leaf in matched_leaves:
                anc = get_immediate_cousins(leaf)
                out_text = ','.join(anc)
                out_text = translator.translate(text=out_text, lang_tgt='hi')
                dispatcher.utter_message(text="मिलती जुलती भाषा  " + out_text)
        else:
            dispatcher.utter_message(text='क्षमा करें, मुझे समझ नहीं आया')
        return []
예제 #16
0
def sequence(trees, mp="", ml=""):
    """
    Add a sequence for each node in a tree.

    @param trees: dictionary, in the form of {id : [parent, name, offspring, support, length, level, sequence]}
    @param mp: string, filename of an MSA file (MP method only); contains aligned protein sequences for terminal nodes only
    @param ml: string, filename of an MSA file (ML method only); contains aligned protein sequences for both terminal and internal nodes
    @return: dictionary, in the form of {id : [parent, name, offspring, support, length, level, sequence]}
    """

    # NOTE(review): this `newick.read` is passed the tree dict/filename and is
    # expected to yield the {id: [...]} mapping described above -- so it must
    # be a project-local wrapper, not the PyPI `newick` package (which returns
    # Node lists). Same for `fasta.read`. Confirm against the local modules.
    tree = newick.read(trees)
    mp_sequence = fasta.read(mp)
    ml_sequence = fasta.read(ml)

    if mp_sequence:
        # MP path: a node with an empty offspring list (v[2]) is a tip; attach
        # the MSA sequence only when every tip name is covered by the MSA.
        tips = [v[1] for k, v in tree.items() if not v[2]]
        taxa = mp_sequence.keys()
        if not set(tips).difference(set(taxa)):
            for k, v in tree.items():
                if v[1] in tips:
                    v[-1] = mp_sequence[v[1]]
                    tree[k] = v
    elif ml_sequence:
        # ML path: relabel the root's support field (v[3]) to "N1" so it can
        # be matched against the MSA's internal-node labels.
        # NOTE(review): this assumes the ML MSA keys internal nodes by their
        # support labels ("N1", ...) and tips by name (v[1]) -- confirm.
        s = ml_sequence
        support = [v for v in tree.values() if v[1] == "root"][0]
        ids = [k for k, v in tree.items() if v[1] == "root"][0]
        support[3] = "N1"
        tree[ids] = support
        for k, v in tree.items():
            if v[3] in s:
                v[-1] = s[v[3]]
                tree[k] = v
            elif v[1] in s:
                v[-1] = s[v[1]]
                tree[k] = v
    return tree
예제 #17
0
def classifications_from_newick(string, label_pattern=GLOTTOLOG_NODE_LABEL):
    """Parse a glottolog newick string and map glottocodes (and isocodes,
    when present) to their ancestor classification chains."""
    label2name = {}

    def parse_label(label):
        groups = label_pattern.match(label).groupdict()
        match = {key: (value.strip() if value else '')
                 for key, value in groups.items()}
        assert match['glottocode']
        display_name = match.get('name', '').strip().replace("\\'", "'")
        label2name[label] = (display_name, match['glottocode'])
        return match

    def get_classification(node):
        ancestor = node.ancestor
        if not ancestor:
            # Node is the root of a family: it classifies as itself.
            return [label2name[node.name]]
        lineage = []
        while ancestor:
            lineage.append(label2name[ancestor.name])
            ancestor = ancestor.ancestor
        lineage.reverse()
        return lineage

    classifications, nodemap = {}, {}
    # Walk every tree, recording each labelled node's classification chain.
    for tree in newick.read(string):
        for node in tree.walk():
            label = parse_label(node.name)
            classification = get_classification(node)
            classifications[label['glottocode']] = classification
            iso = label.get('isocode')
            if iso:
                classifications[iso] = classification
            nodemap[label['glottocode']] = node
    return classifications, nodemap, label2name
예제 #18
0
    def load_glottolog_data(self):
        """
        Loads the Glottolog classification information from the appropriate
        newick file, parses it and stores the required datastructures in
        self.classifications, self.glotto_macroareas and self.locations.
        """
        # Don't load if the analysis doesn't use it
        if not self.check_glottolog_required():
            return
        # Don't load if we already have - can this really happen?
        if self.glottolog_loaded:
            return
        self.glottolog_loaded = True

        # label -> (display name, glottocode); shared by the helpers below.
        label2name = {}
        # glottocode -> newick node, used by the dialect pass at the end.
        glottocode2node = {}

        def parse_label(label):
            # Split a glottolog node label into (name, glottocode, isocode)
            # and cache the (name, glottocode) pair for ancestor lookups.
            match = GLOTTOLOG_NODE_LABEL.match(label)
            label2name[label] = (match.group('name').strip().replace("\\'","'"), match.group('glottocode'))
            return (
                match.group('name').strip(),
                match.group('glottocode'),
                match.group('isocode'))

        def get_classification(node):
            # Ancestor chain of a node, ordered root-first.
            res = []
            ancestor = node.ancestor
            while ancestor:
                res.append(label2name[ancestor.name])
                ancestor = ancestor.ancestor
            return list(reversed(res))

        # Walk the tree and build the classifications dictionary
        glottolog_trees = newick.read(get_glottolog_data('newick', self.glottolog_release))
        for tree in glottolog_trees:
            for node in tree.walk():
                name, glottocode, isocode = parse_label(node.name)
                classification = get_classification(node)
                self.classifications[glottocode] = classification
                if isocode:
                    self.classifications[isocode] = classification
                glottocode2node[glottocode] = node

        # Load geographic metadata
        for t in reader(
                get_glottolog_data('geo', self.glottolog_release), namedtuples=True):
            if t.macroarea:
                self.glotto_macroareas[t.glottocode] = t.macroarea
                for isocode in t.isocodes.split():
                    self.glotto_macroareas[isocode] = t.macroarea
            if self.location_data:
                continue # Use user-supplied data instead

            if t.latitude and t.longitude:
                latlon = (float(t.latitude), float(t.longitude))
                self.locations[t.glottocode] = latlon
                for isocode in t.isocodes.split():
                    self.locations[isocode] = latlon

        if self.location_data:
            return

        # Second pass of geographic data to handle dialects, which inherit
        # their parent language's location
        for t in reader(
                get_glottolog_data('geo', self.glottolog_release), namedtuples=True):
            if t.level == "dialect":
                failed = False
                node = glottocode2node[t.glottocode]
                ancestor = node.ancestor
                # Climb toward the root until an ancestor with a location
                # is found (or the root is reached without one).
                while label2name[ancestor.name][1] not in self.locations:
                    if not ancestor.ancestor:
                        # We've hit the root without finding an ancestral node
                        # with location data!
                        failed = True
                        break
                    else:
                        ancestor = ancestor.ancestor
                if failed:
                    continue
                latlon = self.locations[label2name[ancestor.name][1]]
                self.locations[t.glottocode] = latlon
                for isocode in t.isocodes.split():
                    self.locations[isocode] = latlon
import newick
from Bio import SeqIO
from StringIO import StringIO

def find_rev(t, dnas):
    """Collect (first, last, position, base) tuples for every reversal path
    that t.find_rev reports at each position of t's sequence."""
    results = []
    seq_len = len(dnas[t.u])
    for pos in range(seq_len):
        for path in t.find_rev(dnas, pos):
            results.append((path[0], path[-1], pos, dnas[path[0].u][pos]))
    return results

if __name__ == '__main__':
    # Read the newick tree (first line) and the FASTA records (rest of file).
    with open('data/data.dat') as f:
        nw = f.readline()
        # NOTE(review): the result of split() is discarded, so this line is a
        # no-op -- was `nw = nw.strip()` intended?
        nw.split()

        # NOTE(review): the PyPI `newick` package's read() expects a file
        # *name*; parsing from a file-like object is newick.load(). Confirm
        # which API this local `newick` module exposes.
        tree = newick.read(StringIO(nw))
        fst = f.read()
        fst = StringIO(fst)
        # NOTE(review): Bio.SeqIO.parse returns an iterator of SeqRecord
        # objects; unpacking into exactly two names only works if the FASTA
        # holds exactly two records -- verify the intended usage.
        dnas,_ = SeqIO.parse(fst,'fasta')

    nodes = tree.nodes()

    # For every node, report each reversal and check that the sequence
    # character agrees at both endpoints.
    for node in nodes:
        revs = find_rev(node,dnas)

        for fc, dest, pos, mid in revs:
            print("%s %s %d %s->%s->%s" % (fc.u, dest.u, pos + 1, dnas[node.u][pos], mid, dnas[dest.u][pos]))
            assert(dnas[node.u][pos] == dnas[dest.u][pos])
예제 #20
0
                # sort nodes according to branch length
                lengths=[node.length for node in obj]
                pair_ind=np.triu_indices(len(touch_ind), 1)
                
                for k in range(len(pair_ind[0])):
                    i,j=pair_ind[0][k],pair_ind[1][k]
                    ll=[lengths[touch_ind[i]],lengths[touch_ind[j]]]
                    tot=sum(ll)
                    if tot<threshold:
                        ind=ll.index(min(ll))
                        removed=True
                        if ind==0: del obj[touch_ind[i]]
                        else: del obj[touch_ind[j]]
                        break

def get_names(obj):
    """Print the node's name when *obj* is a newick Node; otherwise do nothing."""
    # BUG FIX: the original used a Python-2 print statement, which is a
    # syntax error under Python 3. isinstance() also replaces the brittle
    # exact-type comparison (type(obj) == newick.Node).
    if isinstance(obj, newick.Node):
        print(obj.name)

def get_lengths(obj):
    """Print the node's branch length when *obj* is a newick Node."""
    # BUG FIX: Python-2 print statement converted to the print() function so
    # the code is valid Python 3; isinstance() replaces type(...) == ....
    if isinstance(obj, newick.Node):
        print(obj.length)

if __name__ == "__main__":
	# Prune pairs of nodes whose combined branch length falls below this
	# threshold (see prune(), defined elsewhere in this file).
	threshold = 0.001
	tree = newick.read(sys.argv[1])

	# NOTE(review): prune is applied twice -- presumably so a second pass
	# catches pairs whose situation changed after the first round of
	# deletions; confirm this is intentional and sufficient.
	map_tree(tree,prune,threshold)
	map_tree(tree,prune,threshold)
	# Print surviving node names as a quick sanity check.
	map_tree(tree,get_names)

	newick.write(tree,sys.argv[1]+'_pruned')
예제 #21
0
 def read(newick_filename):
     """Parse *newick_filename* and wrap the first tree's root in a Node.

     NOTE(review): no ``self``/``cls`` parameter -- presumably declared as a
     @staticmethod on the enclosing class (decorator not visible in this
     chunk); confirm.
     """
     return Node.build_from_newick_object(newick.read(newick_filename)[0])
예제 #22
0
    def load_glottolog_data(self):
        """
        Loads the Glottolog classification information from the appropriate
        newick file, parses it and stores the required datastructures in
        self.classifications, self.glotto_macroareas and self.locations.
        """
        # Don't load if the analysis doesn't use it
        if not self.check_glottolog_required():
            return
        # Don't load if we already have - can this really happen?
        if self.glottolog_loaded:
            return
        self.glottolog_loaded = True

        # label -> (display name, glottocode); shared by the helpers below.
        label2name = {}
        # glottocode -> newick node, used by the dialect pass at the end.
        glottocode2node = {}

        def parse_label(label):
            # Split a glottolog node label into (name, glottocode, isocode)
            # and cache the (name, glottocode) pair for ancestor lookups.
            match = GLOTTOLOG_NODE_LABEL.match(label)
            label2name[label] = (match.group('name').strip().replace(
                "\\'", "'"), match.group('glottocode'))
            return (match.group('name').strip(), match.group('glottocode'),
                    match.group('isocode'))

        def get_classification(node):
            # Ancestor chain of a node, ordered root-first.
            res = []
            ancestor = node.ancestor
            while ancestor:
                res.append(label2name[ancestor.name])
                ancestor = ancestor.ancestor
            return list(reversed(res))

        # Walk the tree and build the classifications dictionary
        glottolog_trees = newick.read(
            get_glottolog_data('newick', self.glottolog_release))
        for tree in glottolog_trees:
            for node in tree.walk():
                name, glottocode, isocode = parse_label(node.name)
                classification = get_classification(node)
                self.classifications[glottocode] = classification
                if isocode:
                    self.classifications[isocode] = classification
                glottocode2node[glottocode] = node

        # Load geographic metadata
        for t in reader(get_glottolog_data('geo', self.glottolog_release),
                        namedtuples=True):
            if t.macroarea:
                self.glotto_macroareas[t.glottocode] = t.macroarea
                for isocode in t.isocodes.split():
                    self.glotto_macroareas[isocode] = t.macroarea
            if self.location_data:
                continue  # Use user-supplied data instead

            if t.latitude and t.longitude:
                latlon = (float(t.latitude), float(t.longitude))
                self.locations[t.glottocode] = latlon
                for isocode in t.isocodes.split():
                    self.locations[isocode] = latlon

        if self.location_data:
            return

        # Second pass of geographic data to handle dialects, which inherit
        # their parent language's location
        for t in reader(get_glottolog_data('geo', self.glottolog_release),
                        namedtuples=True):
            if t.level == "dialect":
                failed = False
                node = glottocode2node[t.glottocode]
                ancestor = node.ancestor
                # Climb toward the root until an ancestor with a location
                # is found (or the root is reached without one).
                while label2name[ancestor.name][1] not in self.locations:
                    if not ancestor.ancestor:
                        # We've hit the root without finding an ancestral node
                        # with location data!
                        failed = True
                        break
                    else:
                        ancestor = ancestor.ancestor
                if failed:
                    continue
                latlon = self.locations[label2name[ancestor.name][1]]
                self.locations[t.glottocode] = latlon
                for isocode in t.isocodes.split():
                    self.locations[isocode] = latlon
예제 #23
0
def load_from_file(file_name: str):
    """Load and return the newick trees stored in ``file_name``."""
    trees = read(file_name)
    return trees