Пример #1
0
 def __init__(self, tree, otu_dict, alignment, ingroup_mrca, workdir, config_obj,
              schema='newick', taxon_namespace=None):
     """Build the object from a tree, an alignment and the OTU metadata dict.

     :param tree: tree data, handed to ``Tree.get`` as ``data``
     :param otu_dict: OTU metadata; must be a ``dict``
     :param alignment: must be a ``DnaCharacterMatrix``; its taxon namespace
         is the one the tree is read into
     :param ingroup_mrca: id of the ingroup MRCA; must convert to a non-zero int
     :param workdir: working directory, made absolute and created if missing
     :param config_obj: configuration object, stored as ``self.config``
     :param schema: schema of ``tree``, passed on to ``Tree.get``
     :param taxon_namespace: not used by this constructor; the tree is always
         read into the alignment's taxon namespace
     """
     debug("build ATT class")
     self.aln = alignment
     assert isinstance(self.aln, datamodel.charmatrixmodel.DnaCharacterMatrix), \
             ("your aln '%s' is not a DnaCharacterMatrix" % alignment)
     # Read the tree into the alignment's namespace so both share one set of taxa.
     self.tre = Tree.get(data=tree,
                         schema=schema,
                         preserve_underscores=True,
                         taxon_namespace=self.aln.taxon_namespace)
     assert (self.tre.taxon_namespace is self.aln.taxon_namespace), "tre and aln taxon_namespace are not identical"
     assert isinstance(otu_dict, dict), ("otu_dict '%s' is not of type dict" % otu_dict)
     self.otu_dict = otu_dict
     self.config = config_obj
     self.ps_otu = 1  # iterator for new otu IDs
     # NOTE: both reconcile calls run before self.workdir and self._reconciled
     # are assigned below, so they must not rely on those attributes.
     self._reconcile()
     self._reconcile_names()
     self.workdir = os.path.abspath(workdir)
     if not os.path.exists(self.workdir):
         os.makedirs(self.workdir)
     # int() raises ValueError/TypeError for non-numeric input before the
     # assert message is reached; a value of 0 fails the assert itself.
     assert int(ingroup_mrca), ("your ingroup_mrca '%s' is not an integer." % ingroup_mrca)
     self.mrca_ott = ingroup_mrca  # ott_ingroup mrca can be pulled directly from phylesystem
     self.orig_seqlen = []  # will get filled in later...
     self.gb_dict = {}  # has all info about new blast seq
     # Set to False here even though _reconcile() has already been called above.
     self._reconciled = False
     self.unpubl_otu_json = None
Пример #2
0
  def match_id_to_mrca(self, tax_id, mrca_id):
      """Check whether tax_id lies within the clade identified by mrca_id.

      Starting at ``tax_id`` the method follows the ``parent_tax_id`` column
      of the module-level ``nodes`` table upwards (iteratively, not
      recursively) until it reaches ``mrca_id``, reaches id 1, or cannot
      find a parent.

      :param tax_id: taxon id to test; anything ``int()`` accepts
      :param mrca_id: id of the candidate ancestor; anything ``int()`` accepts
      :return: True if ``mrca_id`` is met on the way up (including
          ``tax_id == mrca_id``), otherwise False
      """
      if nodes is None:
          self.initialize()
      current_id = int(tax_id)
      mrca_id = int(mrca_id)
      while current_id:
          if current_id == mrca_id:
              return True
          if current_id == 1:
              # id 1 is treated as the top of the hierarchy (presumably the
              # NCBI root); nothing above it can match.
              return False
          try:
              current_id = int(nodes[nodes["tax_id"] == current_id]["parent_tax_id"].values[0])
          except (IndexError, KeyError, TypeError, ValueError):
              # IndexError: no row for this id; KeyError: column missing;
              # TypeError/ValueError: table unavailable or parent not numeric.
              sys.stderr.write("no parent found for ncbi:id {}\n".format(current_id))
              return False
      # Only reached when the current id is 0, which ends the loop.
      debug("current id is: {}, in search for {} in {}".format(current_id, tax_id, mrca_id))
      return False
Пример #3
0
 def get_tax_seq_acc(self, acc):
     """Look up taxon id, taxon name and sequence for an accession number.

     Results are served from ``self.acc_tax_seq_dict`` when available;
     otherwise the record is fetched with ``self.entrez_efetch`` and stored
     in the lookup dictionaries before being returned.

     :param acc: accession number; must contain a "." to be recognized
     :return: tuple ``(ncbi_id, tax_name, seq)``, or ``(None, None, None)``
         when the accession has no "."
     """
     if "." not in acc:
         debug("accession number {} not recognized".format(acc))
         return None, None, None
     if acc not in self.acc_tax_seq_dict:
         record = self.entrez_efetch(acc)
         fetched_name = ncbi_data_parser.get_ncbi_tax_name(record)
         fetched_id = ncbi_data_parser.get_ncbi_tax_id(record)
         fetched_seq = record[0][u'GBSeq_sequence']
         # TODO check that searches are using names without spaces
         fetched_name = fetched_name.replace(" ", "_")
         self.ncbiid_to_spn[fetched_id] = fetched_name
         self.acc_ncbi_dict[acc] = fetched_id
         # This cache keeps every sequence in memory.
         self.acc_tax_seq_dict[acc] = {
             'taxname': fetched_name,
             "^ncbi:taxon": fetched_id,
             'seq': fetched_seq,
         }
     cached = self.acc_tax_seq_dict[acc]
     tax_name = cached["taxname"]
     ncbi_id = cached["^ncbi:taxon"]
     seq = cached["seq"]
     assert ncbi_id is not None
     return ncbi_id, tax_name, seq
Пример #4
0
 def get_otu_for_acc(self, gb_id):
     """Return the key of the OTU that already carries accession gb_id.

     The OTU dictionary is scanned once; OTUs without an
     ``^ncbi:accession`` entry never match.

     :param gb_id: accession to look for
     :return: the matching key of ``self.otu_dict``, or None if no OTU has
         this accession
     """
     for otu, info in self.otu_dict.items():
         if "^ncbi:accession" in info and info["^ncbi:accession"] == gb_id:
             debug("tried to create OTU for {} but already had otu {}".format(gb_id, otu))
             return otu
     return None
Пример #5
0
    def _reconcile(self):
        """Remove every taxon that is not shared by the tree and the alignment.

        Taxa present only in the alignment have their sequences removed, taxa
        present only in the tree are pruned from it. Matching entries of
        ``self.otu_dict`` (by "^ot:originalLabel" if present, otherwise by
        key) get the status "deleted in reconciliation", and each dropped
        taxon is removed from the alignment's taxon namespace.
        """
        debug("reconcile")
        tree_taxa = {leaf.taxon for leaf in self.tre.leaf_nodes()}
        aln_taxa = {taxon for taxon, _seq in self.aln.items()}
        # Symmetric difference: taxa found on exactly one side.
        unshared = tree_taxa.symmetric_difference(aln_taxa)
        missing_labels = [taxon.label for taxon in unshared]
        if missing_labels:
            template = 'NAME RECONCILIATION Some of the taxa in the tree are not in the alignment or vice versa' \
                       ' and will be pruned. Missing "{}"\n'
            sys.stderr.write(template.format('", "'.join(missing_labels)))
        only_in_aln = []
        only_in_tree = []
        for taxon in unshared:
            in_aln = taxon in aln_taxa
            in_tree = taxon in tree_taxa
            assert in_aln or in_tree
            if in_aln:
                only_in_aln.append(taxon)
            if in_tree:
                only_in_tree.append(taxon)
        self.aln.remove_sequences(only_in_aln)
        self.tre.prune_taxa(only_in_tree)
        for taxon in unshared:
            # Potentially slow with many taxa and many taxa to prune.
            matched = False
            for otu_id, otu_info in self.otu_dict.items():
                if "^ot:originalLabel" in otu_info:
                    is_hit = otu_info[u'^ot:originalLabel'] == taxon.label
                else:
                    is_hit = otu_id == taxon.label
                if is_hit:
                    otu_info['^physcraper:status'] = "deleted in reconciliation"
                    matched = True
            if not matched:
                sys.stderr.write("lost taxon {} in reconcilliation \n".format(taxon.label))
            self.aln.taxon_namespace.remove_taxon(taxon)
        assert self.aln.taxon_namespace == self.tre.taxon_namespace
Пример #6
0
def get_user_input():
    """Prompt on stdin until the user answers exactly 'yes' or 'no'.

    Works on Python 2 (``raw_input``) and Python 3 (``input``).

    :return: the string 'yes' or 'no' as typed by the user
    """
    debug("get user input")
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3: raw_input was renamed to input
    while True:
        answer = read_line("Please write either 'yes' or 'no': ")
        if answer in ('yes', 'no'):
            return answer
        print("'%s' is not a valid answer." % answer)
Пример #7
0
 def __init__(self, config_obj, workdir):
     """Generate a series of name disambiguation dicts.

     Reads the comma-separated file ``config_obj.ott_ncbi`` (columns: ott id,
     ncbi id, name) and the file ``ncbi_ott`` in the same folder (columns:
     ncbi id, ott id) into lookup dictionaries.

     :param config_obj: configuration object; must provide ``email``,
         ``ott_ncbi`` and ``blast_loc``, plus ``ncbi_parser_names_fn`` and
         ``ncbi_parser_nodes_fn`` when ``blast_loc`` is not 'remote'
     :param workdir: not used by this constructor
     """
     self.config = config_obj
     assert self.config.email
     self.ott_to_ncbi = {}
     self.ncbi_to_ott = {}  # used to get ott_id for new Genbank query taxa
     self.ott_to_name = {}  # used in add_otu to get name from otuId
     # filled by ncbi_parser (by subprocess in earlier versions of the code).
     self.acc_ncbi_dict = {}
     # spn to ncbi_id, it's only fed by the ncbi_data_parser, but makes it faster
     self.spn_to_ncbiid = {}
     # Filled with information from GenBank to speed up translation between
     # ncbi taxon ids and names, similar to acc_ncbi_dict and spn_to_ncbiid.
     self.ncbiid_to_spn = {}
     tax_folder = os.path.dirname(config_obj.ott_ncbi)
     # This is in the taxonomy folder of the repo, needs to be updated by devs
     # when OpenTree taxonomy changes.
     with open(config_obj.ott_ncbi) as fi:
         for lin in fi:
             lii = lin.split(",")
             ott_id = int(lii[0])
             self.ott_to_ncbi[ott_id] = int(lii[1])
             self.ott_to_name[ott_id] = lii[2].strip()  # todo merge into ott_to_ncbi?
     # os.path.join keeps the path relative when ott_ncbi has no directory part
     # (plain "{}/ncbi_ott" formatting would produce "/ncbi_ott").
     with open(os.path.join(tax_folder, "ncbi_ott")) as fi:
         for lin in fi:
             lii = lin.split(",")
             self.ncbi_to_ott[int(lii[0])] = int(lii[1])
     assert len(self.ott_to_ncbi) > 0
     assert len(self.ott_to_name) > 0
     assert len(self.ncbi_to_ott) > 1000
     if config_obj.blast_loc == 'remote':
         debug("Config remote {}".format(config_obj.blast_loc))
         # used only for web queries - contains taxonomic hierarchy information
         self.otu_rank = {}
     else:  # ncbi parser contains information about spn, tax_id, and ranks
         debug("Config not remote {}".format(config_obj.blast_loc))
         self.ncbi_parser = ncbi_data_parser.Parser(
             names_file=self.config.ncbi_parser_names_fn,
             nodes_file=self.config.ncbi_parser_nodes_fn)
     self.acc_tax_seq_dict = {}
Пример #8
0
    def write_labelled(self, label, filename="labelled", direc='workdir', norepeats=True, add_gb_id=False):
        """Write the tree and the alignment with human readable labels.

        Works on copies of the tree and alignment (round-tripped through
        newick/fasta strings), so it is NOT MEMORY EFFICIENT AT ALL.

        :param label: which information shall be displayed in the labelled files; one of
                    '^ot:ottTaxonName', '^user:TaxonName', '^physcraper:TaxonName',
                    "^ot:originalLabel", "^ot:ottId", "^ncbi:taxon"
        :param filename: base name of the output files; ".tre" and ".fas" are appended
        :param direc: output directory; the default 'workdir' means ``self.workdir``
        :param norepeats: optional: if there shall be no duplicate names in the labelled output files
        :param add_gb_id: optional, to supplement tiplabel with corresponding GenBank sequence identifier
        :return: writes out labelled phylogeny and alignment to file
        """
        if direc == 'workdir':
            direc = self.workdir
        treepath = "{}/{}".format(direc, "{}.tre".format(filename))
        alnpath = "{}/{}".format(direc, '{}.fas'.format(filename))
        debug(treepath)
        assert label in ['^ot:ottTaxonName', '^user:TaxonName', '^physcraper:TaxonName',
                         "^ot:originalLabel", "^ot:ottId", "^ncbi:taxon"]
        # Relabel copies so that self.tre / self.aln keep their otu ids.
        tmp_newick = self.tre.as_string(schema="newick")
        tmp_tre = Tree.get(data=tmp_newick,
                           schema="newick",
                           preserve_underscores=True)
        tmp_fasta = self.aln.as_string(schema="fasta")
        tmp_aln = DnaCharacterMatrix.get(data=tmp_fasta,
                                         schema="fasta",
                                         taxon_namespace=tmp_tre.taxon_namespace)
        new_names = set()
        for taxon in tmp_tre.taxon_namespace:
            otu_info = self.otu_dict[taxon.label]
            new_label = otu_info.get(label, None)
            if new_label is None:
                # Fall back to the original label, then to an ncbi id / name combination.
                if otu_info.get("^ot:originalLabel"):
                    new_label = "orig_{}".format(otu_info["^ot:originalLabel"])
                else:
                    new_label = "ncbi_{}_ottname_{}".format(otu_info.get("^ncbi:taxon", "unk"),
                                                            otu_info.get('^physcraper:TaxonName', "unk"))
            new_label = str(new_label).replace(' ', '_')
            if add_gb_id:
                gb_id = otu_info.get('^ncbi:accession')
                if gb_id is None:
                    gb_id = otu_info.get("^ot:originalLabel")
                new_label = "_".join([new_label, str(gb_id)])
                if norepeats and new_label in new_names:
                    # Append the first free counter; restarting at 2 for every
                    # taxon without re-checking would repeat "_2" on a third duplicate.
                    base_label = new_label
                    sp_counter = 2
                    new_label = "_".join([base_label, str(sp_counter)])
                    while new_label in new_names:
                        sp_counter += 1
                        new_label = "_".join([base_label, str(sp_counter)])
            else:
                if new_label in new_names and norepeats:
                    # The current taxon label is appended to tell duplicates apart.
                    new_label = "_".join([new_label, taxon.label])
            taxon.label = new_label
            new_names.add(new_label)
        tmp_tre.write(path=treepath,
                      schema="newick",
                      unquoted_underscores=True,
                      suppress_edge_lengths=False)
        tmp_aln.write(path=alnpath,
                      schema="fasta")
Пример #9
0
    def add_otu(self, gb_id, ids_obj):
        """Generate an otu_id for a new sequence and add it to self.otu_dict.

        If an OTU with this accession already exists its id is returned
        unchanged. Otherwise the new entry is assembled completely and only
        then stored, so a failing lookup does not leave a half-built OTU in
        ``self.otu_dict``.

        :param gb_id: the Genbank identifier, or a local unpublished id
            (starting with "unpubl"); must be a key of ``self.gb_dict``
        :param ids_obj: IDs object providing ``ncbi_to_ott`` and ``ott_to_name``
        :return: the otu_id (key of ``self.otu_dict``) for the sequence, or
            None if no ncbi taxon id could be determined
        """
        otu_id = self.get_otu_for_acc(gb_id)
        if otu_id:
            return otu_id
        otu_id = "otuPS{}".format(self.ps_otu)
        self.ps_otu += 1
        ncbi_id, tax_name = ncbi_data_parser.get_tax_info_from_acc(gb_id, self, ids_obj)
        if ncbi_id is None:
            debug("DID NOT ADD accession {} ncbi_id {}".format(gb_id, ncbi_id))
            return None
        ncbi_id = int(ncbi_id)
        if ncbi_id in ids_obj.ncbi_to_ott:
            ott_id = int(ids_obj.ncbi_to_ott[ncbi_id])
        else:
            debug("{} Ncbi id not found in ott_ncbi dictionaries\n".format(ncbi_id))
            ott_id = None
        ott_name = ids_obj.ott_to_name[ott_id] if ott_id in ids_obj.ott_to_name else None
        gb_info = self.gb_dict[gb_id]
        otu_info = {
            "^ncbi:title": gb_info["title"],
            "^ncbi:taxon": ncbi_id,
            "^ncbi:TaxonName": tax_name,
            "^ot:ottId": ott_id,
            "^physcraper:status": "query",
            "^ot:ottTaxonName": ott_name,
            "^physcraper:last_blasted": None,
        }
        if gb_id[:6] == "unpubl":
            otu_info["^physcraper:status"] = "local seq"
            otu_info["^ot:originalLabel"] = gb_info["localID"]
            otu_info['^user:TaxonName'] = gb_info[u'^user:TaxonName']
        else:
            otu_info["^ncbi:gi"] = gb_info["^ncbi:gi"]
            otu_info["^ncbi:accession"] = gb_id
        # get a name for the OTU, no matter from which source
        if tax_name is not None:
            otu_info["^physcraper:TaxonName"] = tax_name
        elif ott_name is not None:
            otu_info["^physcraper:TaxonName"] = ott_name
        elif otu_info.get('^user:TaxonName'):
            otu_info["^physcraper:TaxonName"] = otu_info['^user:TaxonName']
        else:
            otu_info["^physcraper:TaxonName"] = "ACC_{}".format(gb_id)
        assert otu_info["^physcraper:TaxonName"]  # must be non-empty
        self.otu_dict[otu_id] = otu_info
        if _DEBUG >= 2:
            sys.stderr.write("acc:{} assigned new otu: {}\n".format(gb_id, otu_id))
        return otu_id