예제 #1
0
def do_alnntree(pref, ndf, fasta, refs, congen, targetids, gaps=0.9, cpus=-1):
    """Align reference and target sequences, infer a tree and annotate it.

    The FASTA text sent to the aligner starts as ``congen`` and is extended
    with:

    * one record per ``saccver`` group of ``ndf``, labelled
      ``<staxid>.<name>``, whose sequence is looked up in ``refs`` under the
      key ``'>' + name`` (falling back to the part of the name before the
      first ``'|'`` when that key is missing);
    * every record of the shelve at ``fasta`` whose header, stripped and
      without its first character, is found in ``targetids``.

    The text is piped to ``mafft``, the alignment is trimmed with
    ``trimaln`` and the trimmed alignment is piped to ``fasttreeMP``.

    Side effects: writes ``<pref>.aln`` (trimmed alignment),
    ``<pref>.tree`` and ``<pref>.treepickle`` (dill dump of the tree), and
    prints progress and the final tree to stdout.

    Returns:
        A ``(tree, taxonomy)`` tuple, where ``taxonomy`` is whatever
        ``PhyloTree.annotate_ncbi_taxa`` returned.
    """
    # TODO: add checkpoint to avoid repeating
    fasta_text = congen

    # Reference hits: one record per subject accession.
    for accession, hits in ndf.groupby('saccver'):
        taxid = hits.staxid.iloc[0]
        label = accession
        try:
            raw = refs['>%s' % label]
        except KeyError:
            # Retry with only the part of the accession before the first '|'.
            label = accession.split('|')[0]
            raw = refs['>%s' % label]
        sequence = raw.replace('\n', '').strip()
        fasta_text += '>%d.%s\n%s\n' % (taxid, label, sequence)

    # Target sequences: only the shelve entries listed in targetids.
    with shelve.open(fasta) as store:
        for header, sequence in store.items():
            if header.strip()[1:] not in targetids:
                print(header, 'not in')
                continue
            print(header)
            fasta_text += '%s\n%s\n' % (
                header, sequence.strip().replace('\n', ''))

    mafft_cmd = ['mafft', '--thread', str(cpus), '--auto', '-']
    alignment, _ = stdin_run(mafft_cmd, fasta_text)
    trimmed = trimaln(alignment.decode('utf-8'), targetids, gaps=gaps)

    newick_bytes, _ = stdin_run(['fasttreeMP', '-nt', '-gtr', '-gamma'],
                                trimmed)
    # Drop the last byte (presumably the terminating ';'), turn any
    # remaining ';' into '-', then put a single terminator back.
    newick = newick_bytes.strip()[:-1].replace(b';', b'-').decode('utf-8')
    newick += ';'

    # Leaf names look like "<taxid>.<name>"; the species is the first field.
    tree = PhyloTree(newick,
                     sp_naming_function=lambda name: name.split('.')[0])

    with open('%s.aln' % pref, 'w') as aln_file, \
            open('%s.treepickle' % pref, 'wb') as pickle_file:
        aln_file.write(trimmed)
        tree.write(outfile='%s.tree' % pref)
        dill.dump(tree, pickle_file)

    taxonomy = tree.annotate_ncbi_taxa()
    fix_species(tree)
    print(tree)
    return tree, taxonomy
예제 #2
0
    def test_species(self):
        """
        Check TreePattern constraints on node species and NCBI annotations.

        The first part matches a pattern using ``species(@)`` against a tree
        whose species names come from a custom naming function. The second
        part annotates a tree of taxids with ``annotate_ncbi_taxa`` and
        matches patterns on ``sci_name``, ``lineage``, ``rank``, ``taxid``
        and ``named_lineage``.
        """

        # test node.species

        species_tree = PhyloTree(
            """(Felis_catus_1:1,
                (Homo_sapiens_1:1, Pan_troglodytes_1:1),
                Saccharomyces_cerevisiae_1:1);""",
            format=1)
        # Species is the second "_"-separated field of the leaf name
        # (e.g. "sapiens"); nodes without "_" in their name get ''.
        species_tree.set_species_naming_function(lambda n: n.name.split("_")[1] if "_" in n.name else '')

        pattern0 = """('',
                       (' len(set(["sapiens","pygmaeus"]) & species(@))>0',
                       Pan_troglodytes_1)
                       );"""

        pattern0 = TreePattern(pattern0)


        root = species_tree.get_tree_root()
        self.assertEqual(list(pattern0.find_match(species_tree)), [root])

        # test ncbi taxonomy

        # NOTE(review): this instance is not used below; presumably it is
        # created to make sure the local taxonomy database is set up -
        # confirm before removing.
        ncbi = NCBITaxa()
        # Leaf names are taxids, so the name itself is the species id.
        taxonomy_tree = PhyloTree("((9598, 9606), 10090);", sp_naming_function=lambda name: name)
        taxonomy_tree.annotate_ncbi_taxa()
        root = taxonomy_tree.get_tree_root()

        pattern1 = """ '  @.sci_name == "Euarchontoglires" ';"""
        # The species name must be spelled exactly "Homo sapiens": the
        # constraint is an equality test against the annotated sci_name.
        pattern2 = """
          (( '@.sci_name=="Homo sapiens"' , '9526 in @.lineage ' )' @.rank=="subfamily" and @.taxid == 207598 ')
          '  @.sci_name == "Euarchontoglires" and "cellular organisms" in @.named_lineage';
          """

        pattern1 = TreePattern(pattern1)
        pattern2 = TreePattern(pattern2)

        match1 = pattern1.find_match(taxonomy_tree)
        match2 = pattern2.find_match(taxonomy_tree)

        self.assertEqual(list(match1), [root])
        self.assertEqual(list(match2), [root])
예제 #3
0
import sys

from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from ete3 import NCBITaxa
from ete3 import PhyloTree

# Command-line arguments: input newick tree, input FASTA, and two output
# FASTA paths (selected species / all sequences).
input_tree = sys.argv[1]
input_fasta = sys.argv[2]
output_fasta_ordered_select = sys.argv[3]
output_fasta_ordered_all = sys.argv[4]

# There's a way to save these extra attributes, but it's a bit awkward (not supported by newick format)
# So we fetch them anew each time.
ncbi = NCBITaxa()
# Leaf names look like "<taxid>.<protein id>"; the species id is the part
# before the first dot.
tree = PhyloTree(input_tree,
                 sp_naming_function=lambda name: name.split('.', 1)[0])
tax2names, tax2lineages, tax2rank = tree.annotate_ncbi_taxa()
# Parenthesised single-argument print works on both Python 2 and Python 3.
print(tree.get_ascii(attributes=["name", "sci_name", "taxid"]))

# Index every FASTA record by its id for direct lookup below.
record_dict = SeqIO.to_dict(SeqIO.parse(input_fasta, "fasta"))

# Homo sapiens
# Macaca mulatta
# Canis lupus familiaris
# Mus musculus
# Gallus gallus
# Anolis carolinensis
# Danio rerio
sorted_fasta_select = []
# Re-label the selected record with the species name as its id; name and
# description are left empty.
sorted_fasta_select.append(
    SeqRecord(record_dict["9606.ENSP00000261448"].seq, "Homo sapiens", '', ''))
sorted_fasta_select.append(
예제 #4
0
def run(args):
    """Annotate every source tree with user-supplied features and dump it.

    For each newick in ``args.src_tree_iterator`` a tree is loaded: a
    ``PhyloTree`` annotated through ``annotate_ncbi_taxa(args.taxid_attr)``
    when ``args.ncbi`` is set, a plain ``Tree`` otherwise. Each entry of
    ``args.feature`` is a sequence of ``key:value`` options:

    * ``name``     - attribute name to add to the nodes (required);
    * ``source``   - path of a tab-separated file with lines of the form
      ``node1[,node2,...]<TAB>value`` (required); blank lines and lines
      starting with ``#`` are skipped;
    * ``type``     - one of ``str``, ``int``, ``float``, ``set``, ``list``,
      used to cast the value (default ``str``);
    * ``multiple`` - accepted, but not used by this function.

    A line naming several nodes targets their common ancestor; a single
    name targets that node. Nodes that already have the attribute are left
    unchanged and a warning is logged. The annotated tree is passed to
    ``dump`` together with the accumulated feature names.

    Raises:
        ValueError: on a malformed or unknown option, an unknown ``type``,
            or when ``name`` or ``source`` is missing.
    """
    from ete3 import Tree, PhyloTree

    # Loop-invariant lookup table for the "type" option.
    type2cast = {
        "str": str,
        "int": int,
        "float": float,
        "set": set,
        "list": list
    }

    # Feature names accumulate across all trees processed in this call.
    features = set()
    for nw in args.src_tree_iterator:
        if args.ncbi:
            tree = PhyloTree(nw)
            features.update([
                "taxid", "name", "rank", "bgcolor", "sci_name",
                "collapse_subspecies", "named_lineage", "lineage"
            ])
            tree.annotate_ncbi_taxa(args.taxid_attr)
        else:
            tree = Tree(nw)

        for annotation in args.feature:
            aname, asource, amultiple, acast = None, None, False, str
            for field in annotation:
                try:
                    key, value = list(map(str.strip, field.split(":")))
                except Exception:
                    raise ValueError("Invalid feature option [%s]" % field)

                if key == "name":
                    aname = value
                elif key == "source":
                    asource = value
                elif key == "multiple":
                    # Parsed for compatibility; not used further down.
                    amultiple = value
                elif key == "type":
                    try:
                        acast = type2cast[value]
                    except KeyError:
                        raise ValueError("Invalid feature type [%s]" % field)
                else:
                    raise ValueError("Unknown feature option [%s]" % field)

            # Both options are mandatory: without a name there is nothing to
            # set, and without a source there is nothing to read.
            if not aname or not asource:
                raise ValueError(
                    'name and source are required when annotating a new feature [%s]'
                    % (annotation,))

            features.add(aname)
            # Default text mode already applies universal newlines on
            # Python 3; the 'U' flag was removed in Python 3.11. The context
            # manager closes the file even if a line fails to parse.
            with open(asource) as source_file:
                for line in source_file:
                    line = line.strip()
                    if not line or line.startswith('#'):
                        continue
                    nodenames, attr_value = list(
                        map(str.strip, line.split('\t')))
                    nodenames = list(map(str.strip, nodenames.split(',')))
                    relaxed_grouping = True
                    # A leading '!' on the first node name switches off
                    # relaxed grouping (the flag has no effect yet, see
                    # below).
                    if nodenames[0].startswith('!'):
                        relaxed_grouping = False
                        nodenames[0] = nodenames[0][1:]

                    if len(nodenames) > 1:
                        target_node = tree.get_common_ancestor(nodenames)
                        if not relaxed_grouping:
                            pass
                            # do something
                    else:
                        target_node = tree & nodenames[0]

                    # NOTE(review): the message says "Overwriting", but the
                    # existing value is kept and the new one is discarded.
                    # Confirm the intended behaviour before changing it.
                    if hasattr(target_node, aname):
                        log.warning('Overwriting annotation for node" [%s]"' %
                                    nodenames)
                    else:
                        target_node.add_feature(aname, acast(attr_value))

        dump(tree, features=features)