def return_profile_OTF(self, fam):
    """
    Return a HOG's phylogenetic profile as a binary vector for use with
    optimisation pipelines.

    The row vector has ``3 * len(self.taxaIndex)`` columns: one block of
    taxa columns each for presence, loss and duplication events, with a 1
    at every taxon where the corresponding event was inferred by the HAM
    pipeline.

    :param fam: family id, or a HOG id string convertible with
        hashutils.hogid2fam
    :return: {fam: {'mat': 1 x (3 * n_taxa) binary numpy array,
                    'tree': the HAM tree profile the events were read from}}
    """
    if isinstance(fam, str):
        fam = hashutils.hogid2fam(fam)
    ortho_fam = self.READ_ORTHO(fam)
    tp = self.HAM_PIPELINE([fam, ortho_fam])
    # Collect the taxon index of every node carrying each event type;
    # nodes whose names are not in taxaIndex are ignored.
    losses = [
        self.taxaIndex[n.name]
        for n in tp.traverse()
        if n.lost and n.name in self.taxaIndex
    ]
    dupl = [
        self.taxaIndex[n.name]
        for n in tp.traverse()
        if n.dupl and n.name in self.taxaIndex
    ]
    presence = [
        self.taxaIndex[n.name]
        for n in tp.traverse()
        if n.nbr_genes > 0 and n.name in self.taxaIndex
    ]
    indices = {'presence': presence, 'loss': losses, 'dup': dupl}
    hog_matrix_raw = np.zeros((1, 3 * len(self.taxaIndex)))
    # Each event type occupies its own len(taxaIndex)-wide slice of the row;
    # dict order is insertion order, so the slices are presence/loss/dup.
    for i, event in enumerate(indices):
        if indices[event]:
            hogindex = np.asarray(indices[event]) + i * len(self.taxaIndex)
            hog_matrix_raw[:, hogindex] = 1
    return {fam: {'mat': hog_matrix_raw, 'tree': tp}}
def return_profile_complements(self, fam):
    """
    Return the profile of a hypothetical HOG complementary to this family:
    present exactly in the clades where this family was lost, with no
    further losses or duplications assumed below those points.

    NOTE(review): the original implementation referenced three undefined
    names (``profiler``, a stray ``n``, and ``presence``) and always raised
    NameError. This reconstruction follows the original in-code comment
    ("these are the roots of the fams we are looking for / we just assume
    no duplications or losses from this point"): presence is set for every
    taxon in the subtree below each loss node. Confirm against the caller
    before relying on the exact semantics.

    :param fam: family id, or a HOG id string convertible with
        hashutils.hogid2fam
    :return: {fam: {'mat': 1 x (3 * n_taxa) binary numpy array,
                    'hash': the HAM tree profile}}
    """
    if isinstance(fam, str):
        fam = hashutils.hogid2fam(fam)
    ortho_fam = self.READ_ORTHO(fam)
    tp = self.HAM_PIPELINE([fam, ortho_fam])
    loss_names = set(
        n.name for n in tp.traverse()
        if n.lost and n.name in self.taxaIndex
    )
    # Roots of the complementary fams we are looking for: the nodes where
    # this family was lost. The complement is assumed present everywhere
    # below them (no duplications or losses from this point).
    ancestral_nodes = [n for n in tp.traverse() if n.name in loss_names]
    presence = [
        self.taxaIndex[sub.name]
        for node in ancestral_nodes
        for sub in node.traverse()
        if sub.name in self.taxaIndex
    ]
    losses = []
    dupl = []
    indices = {'presence': presence, 'loss': losses, 'dup': dupl}
    hog_matrix_raw = np.zeros((1, 3 * len(self.taxaIndex)))
    # Same layout as return_profile_OTF: presence/loss/dup column blocks.
    for i, event in enumerate(indices):
        if indices[event]:
            hogindex = np.asarray(indices[event]) + i * len(self.taxaIndex)
            hog_matrix_raw[:, hogindex] = 1
    return {fam: {'mat': hog_matrix_raw, 'hash': tp}}
def pull_hashes(self, hoglist):
    """
    Fetch the minhash of every HOG in *hoglist* from the hdf5 store.

    :param hoglist: iterable of HOG ids
    :return: dict mapping each HOG id to the hash object read via
        hashutils.fam2hash_hdf5
    """
    hashes = {}
    for hog in hoglist:
        fam = hashutils.hogid2fam(str(hog))
        hashes[hog] = hashutils.fam2hash_hdf5(
            fam, self.hashes_h5, nsamples=self.nsamples)
    return hashes
def hog_query(self, hog_id=None, fam_id=None, k=100):
    """
    Query the LSH index with a HOG id or a family id.

    :param hog_id: query HOG id; when given it takes precedence and is
        converted to a family id
    :param fam_id: query family id, used when hog_id is None
    :param k: number of neighbours to request from the LSH
    :return: list of LSH query results for the family's hash
    """
    if hog_id is not None:
        fam_id = hashutils.hogid2fam(hog_id)
    query_hash = hashutils.fam2hash_hdf5(
        fam_id, self.hashes_h5, nsamples=self.nsamples)
    return self.lshobj.query(query_hash, k)
def return_profile_OTF_DCA(self, fam, lock=None):
    """
    Return a HOG's profile as a string for use with DCA pipelines:
    the tree events are serialised with hashutils.tree2str_DCA to act as
    one row of an input "alignment".

    :param fam: family id, or a HOG id string convertible with
        hashutils.hogid2fam
    :param lock: optional lock guarding the hdf5 read (READ_ORTHO); held
        only for the duration of the read
    :return: {fam: {'dcastr': DCA string, 'tree': HAM tree profile}}
    """
    if isinstance(fam, str):
        fam = hashutils.hogid2fam(fam)
    if lock is not None:
        lock.acquire()
    try:
        ortho_fam = self.READ_ORTHO(fam)
    finally:
        # Release even if READ_ORTHO raises; the original leaked the lock
        # on error, deadlocking the other workers sharing it.
        if lock is not None:
            lock.release()
    tp = self.HAM_PIPELINE([fam, ortho_fam])
    dcastr = hashutils.tree2str_DCA(tp, self.taxaIndex)
    return {fam: {'dcastr': dcastr, 'tree': tp}}
def hog_query_sorted(self, hog_id=None, fam_id=None, k=100):
    """
    Query the LSH with a HOG id or family id and rank the hits by their
    jaccard similarity to the query hash, best first.

    NOTE(review): the original body referenced the undefined name
    ``student_tuples`` (copied from the Python Sorting HOWTO), iterated the
    ``None`` returned by ``list.reverse()``, and then returned the unsorted
    dict — it could never run past the sort. Fixed to return the sorted
    list its docstring promised.

    :param hog_id: query HOG id; when given it takes precedence and is
        converted to a family id
    :param fam_id: query family id, used when hog_id is None
    :param k: number of neighbours to request from the LSH
    :return: list of HOG ids sorted by descending jaccard similarity
    """
    if hog_id is not None:
        fam_id = hashutils.hogid2fam(hog_id)
    query_hash = hashutils.fam2hash_hdf5(
        fam_id, self.hashes_h5, nsamples=self.nsamples)
    results = self.lshobj.query(query_hash, k)
    hogdict = self.pull_hashes(results)
    hogdict = {hog: hogdict[hog].jaccard(query_hash) for hog in hogdict}
    # Highest similarity first.
    sortedhogs = sorted(hogdict.items(), key=lambda item: item[1],
                        reverse=True)
    return [hog for hog, _score in sortedhogs]