@classmethod  # setUpClass must be a classmethod for unittest to invoke it correctly
def setUpClass(cls):
    try:
        path = os.environ['PYOMA_DB2CHECK']
    except KeyError:
        raise unittest.SkipTest("No database specified in PYOMA_DB2CHECK")
    cls.db = pyomadb.Database(path)
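# Usage sketch (assumption, not part of the source): with this class-level
# fixture the suite is driven entirely by the environment variable, e.g.
#   $ PYOMA_DB2CHECK=/path/to/OmaServer.h5 python -m unittest discover
# and every test in the class is skipped when the variable is unset.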
def _load_ogs(self):
    """
    Using the orthoxml file, select only the OGs of interest that have
    more species than the min_species threshold.
    :return: Dictionary with OG name as key and list of SeqRecords as value
    """
    # endswith() instead of a substring test: '.fa' in <path> would also match
    # unrelated names such as 'my.fasta.h5'
    if self.args.dna_reference.endswith(('.fa', '.fasta')):
        print('--- Load ogs and find their corresponding DNA seq from {} ---'
              .format(self.args.dna_reference))
        print('Loading {} into memory. This might take a while . . . '
              .format(self.args.dna_reference.split("/")[-1]))
        self._db = SeqIO.index(self.args.dna_reference, "fasta")
        self._db_source = 'fa'
    elif self.args.dna_reference.endswith('.h5'):
        print('--- Load ogs and find their corresponding DNA seq from {} ---'
              .format(self.args.dna_reference))
        self._db = db.Database(self.args.dna_reference)
        self._db_id_map = db.OmaIdMapper(self._db)
        self._db_source = 'h5'
    else:
        print('--- Load ogs and find their corresponding DNA seq using the REST api ---')
        self._db_source = 'REST_api'

    if self.oma.mode == 'standalone':  # '==' for value comparison; 'is' tests identity
        self._og_orthoxml = os.path.join(self.oma_output_path, 'OrthologousGroups.orthoxml')
        self._tree_str = os.path.join(self.oma_output_path, 'EstimatedSpeciesTree.nwk')
        self._ham_analysis = pyham.Ham(self._tree_str, self._og_orthoxml,
                                       use_internal_name=False)

    ogs = {}

    orthologous_groups_aa = os.path.join(self.args.output_path, "01_ref_ogs_aa")
    if not os.path.exists(orthologous_groups_aa):
        os.makedirs(orthologous_groups_aa)

    orthologous_groups_dna = os.path.join(self.args.output_path, "01_ref_ogs_dna")
    if not os.path.exists(orthologous_groups_dna):
        os.makedirs(orthologous_groups_dna)

    names_og = self.ogs

    for name, records in tqdm(names_og.items(), desc='Loading OGs', unit=' OGs'):
        ogs[name] = OG()
        ogs[name].aa = self._get_aa_records(name, records)
        output_file_aa = os.path.join(orthologous_groups_aa, name + ".fa")
        output_file_dna = os.path.join(orthologous_groups_dna, name + ".fa")
        if self._db_source:
            ogs[name].dna = self._get_dna_records(ogs[name].aa, name)
            # write DNA only when it was actually gathered
            self._write(output_file_dna, ogs[name].dna)
        else:
            print("DNA reference was not provided. Only amino acid sequences gathered!")
        self._write(output_file_aa, ogs[name].aa)
    return ogs
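# For reference (derived from the code above, not stated in the source): the
# method leaves one fasta file per OG in each of
#   <output_path>/01_ref_ogs_aa/<og_name>.fa    amino-acid records
#   <output_path>/01_ref_ogs_dna/<og_name>.fa   DNA records, when a DNA source exists
# and returns the same data in memory as {og_name: OG} for the next pipeline step.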
def __init__(self, go_file, go_terms, gaf, omadb=None, tarfile_ortho=None,
             TermCountsFile=None):
    self.go_file = go_file
    if omadb:
        print('open oma db obj')
        from pyoma.browser import db
        # open_file is assumed to be tables.open_file (pytables)
        h5_oma = open_file(omadb, mode="r")
        self.db_obj = db.Database(h5_oma)
        print('done')
    elif tarfile_ortho:
        # retrieve hog members from tarfile_ortho
        self.tar = tarfile.open(tarfile_ortho, "r:gz")
    else:
        raise Exception('please provide an input dataset')

    with open(go_terms, 'rb') as godf_in:
        self.godf = pickle.loads(godf_in.read())
    # note: go_file is re-bound here, from the obo path to the parsed GODag
    self.go_file = obo_parser.GODag(go_file)
    print('building gaf')
    self.gaf = goatools_utils.buildGAF(gaf)
    print('done')
    if TermCountsFile is None:
        self.termcounts = TermCounts(self.go_file, self.gaf)
    else:
        with open(TermCountsFile, 'rb') as tc_in:
            self.termcounts = pickle.loads(tc_in.read())
    # preconfigure the Resnik semantic-similarity function with this dataset
    self.resniksimpreconf = partial(goatools_utils.resnik_sim_pandas,
                                    df=self.godf, termcounts=self.termcounts)
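# Instantiation sketch (the class name is hypothetical; this excerpt only
# shows __init__):
#   profiler = GOSimAnalyzer(go_file='go-basic.obo',
#                            go_terms='godf.pkl',
#                            gaf='goa_uniprot.gaf',
#                            omadb='/path/to/OmaServer.h5')
# after which profiler.resniksimpreconf is a callable with the GO dataframe
# and the term counts already bound via functools.partial.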
import glob
import pickle

import networkx as nx
import pandas as pd
from matplotlib import pyplot as plt
# alias the module so the Database instance below can keep the name 'db'
# without shadowing the import
from pyoma.browser import db as pyoma_db

from utils import config_utils

omadir = config_utils.omadir
db = pyoma_db.Database(omadir + '/OmaServer.h5')

print('loading mapping')
experiments = ' fusion coexpression experiments textmining'
unidf = pd.read_csv('full_uniprot_2_string.04_2015.tsv',
                    delim_whitespace=True, header=0)
unidf.columns = [col.split('|')[0].replace('#', '') for col in unidf.columns]
# derive the code first, then overwrite the accession column
unidf['uniprot_code'] = unidf.uniprot_ac.map(lambda x: x.split('|')[0])
unidf['uniprot_ac'] = unidf.uniprot_ac.map(lambda x: x.split('|')[1])
omadf = pd.read_csv('oma-uniprot.txt', delim_whitespace=True,
                    comment='#', names=['oma', 'uniprot'])
print('done')

print('loading network files')
networks = glob.glob('./*protein.links.full*txt')
print(networks)
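# A plausible next step (assumption; not shown in this excerpt): join the two
# mapping tables on the shared UniProt accession, so STRING entries can be
# resolved to OMA identifiers before the network files are parsed.
string2oma = unidf.merge(omadf, left_on='uniprot_ac', right_on='uniprot',
                         how='inner')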
def __init__(self, tarfile_ortho=None, h5_oma=None, taxa=None, masterTree=None,
             saving_name=None, numperm=256, treeweights=None, taxfilter=None,
             taxmask=None, verbose=False):
    if h5_oma is not None:
        from pyoma.browser import db
        self.h5OMA = h5_oma
        self.db_obj = db.Database(h5_oma)
        self.oma_id_obj = db.OmaIdMapper(self.db_obj)
    elif tarfile_ortho:
        self.tar = tarfile_ortho
        self.h5OMA = None
        self.db_obj = None
        self.oma_id_obj = None

    self.tax_filter = taxfilter
    self.tax_mask = taxmask
    self.verbose = verbose
    self.datetime = datetime
    self.date_string = "{:%B_%d_%Y_%H_%M}".format(datetime.now())
    self.saving_name = saving_name

    if saving_name:
        self.saving_path = config_utils.datadir + saving_name + '/'
    else:
        self.saving_path = config_utils.datadir + self.date_string + '/'
    if not os.path.isdir(self.saving_path):
        os.mkdir(path=self.saving_path)

    if masterTree is None:
        if h5_oma:
            genomes = pd.DataFrame(
                h5_oma.root.Genome.read())["NCBITaxonId"].tolist()
            genomes = [str(g) for g in genomes]
            taxa = genomes + [131567, 2759, 2157, 45596] \
                + [taxrel[0] for taxrel in list(h5_oma.root.Taxonomy[:])] \
                + [taxrel[1] for taxrel in list(h5_oma.root.Taxonomy[:])]
            self.tree_string, self.tree_ete3 = files_utils.get_tree(
                taxa=taxa, genomes=genomes, savename=saving_name)
        elif taxa:
            with open(taxa, 'r') as taxin:
                taxlist = [int(line) for line in taxin]
            self.tree_string, self.tree_ete3 = files_utils.get_tree(
                taxa=taxlist, savename=saving_name)
        else:
            raise Exception('please specify either a list of taxa or a tree')
    else:
        # a masterTree was supplied: load the pickled ete3 tree
        # ('rb', not 'wb' -- we are reading; the original 'elif mastertree'
        # also raised a NameError on the lowercase name)
        with open(masterTree, 'rb') as pklin:
            self.tree_ete3 = pickle.loads(pklin.read())
        self.tree_string = self.tree_ete3.write(format=1)

    self.taxaIndex, self.reverse = files_utils.generate_taxa_index(
        self.tree_ete3, self.tax_filter, self.tax_mask)
    with open(config_utils.datadir + 'taxaIndex.pkl', 'wb') as taxout:
        taxout.write(pickle.dumps(self.taxaIndex))

    self.numperm = numperm
    if treeweights is None:
        # generate all-ones weights over the tree
        self.treeweights = hashutils.generate_treeweights(
            self.tree_ete3, self.taxaIndex, taxfilter, taxmask)
    else:
        # load machine-learning derived weights
        self.treeweights = treeweights
    print(self.treeweights)

    wmg = WeightedMinHashGenerator(3 * len(self.taxaIndex),
                                   sample_size=numperm, seed=1)
    with open(self.saving_path + saving_name + 'wmg.pkl', 'wb') as wmgout:
        # the original pickled self.taxaIndex here; the filename suggests the
        # generator itself was intended
        wmgout.write(pickle.dumps(wmg))
    self.wmg = wmg

    self.HAM_PIPELINE = functools.partial(
        pyhamutils.get_ham_treemap_from_row, tree=self.tree_string)
    self.HASH_PIPELINE = functools.partial(
        hashutils.row2hash, taxaIndex=self.taxaIndex,
        treeweights=self.treeweights, wmg=wmg)
    if self.h5OMA:
        self.READ_ORTHO = functools.partial(
            pyhamutils.get_orthoxml_oma, db_obj=self.db_obj)
    elif self.tar:
        self.READ_ORTHO = pyhamutils.get_orthoxml

    self.hashes_path = self.saving_path + 'hashes.h5'
    self.lshpath = self.saving_path + 'newlsh.pkl'
    self.lshforestpath = self.saving_path + 'newlshforest.pkl'
    self.mat_path = self.saving_path + 'hogmat.h5'
    self.columns = len(self.taxaIndex)
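# Instantiation sketch (class name hypothetical; only __init__ is shown here):
#   from tables import open_file
#   h5 = open_file('/path/to/OmaServer.h5', mode='r')
#   builder = LSHBuilder(h5_oma=h5, saving_name='all_oma', numperm=256)
# The constructor then exposes READ_ORTHO / HAM_PIPELINE / HASH_PIPELINE as
# the three stages: fetch an orthoxml, build a pyham tree profile, minhash it.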
from Bio.SeqRecord import SeqRecord
from pyoma.browser import db
from tables import open_file  # pytables, needed for open_file below
import familyanalyzer as fa

# parameters
MIN_SPECIES = 20
DUP_RATIO = 0
DIR = '/Users/daviddylus/Research/read2tree/reference_datasets/Dataset1/Output/'

# read in files
hog_XML = DIR + 'HierarchicalGroups.orthoxml'
og_XML = DIR + 'OrthologousGroups.orthoxml'

h5file = open_file("/Volumes/Untitled/OmaServer.h5", mode="r")
genomeTab = h5file.root.Genome
dbObj = db.Database(h5file)
omaIdObj = db.OmaIdMapper(dbObj)

if DUP_RATIO != 0:
    hog_op = fa.OrthoXMLParser(hog_XML)
    gene_family_xml_nodes_hog = hog_op.getToplevelGroups()
    # select all the families with more than X species and a duplication
    # ratio smaller than Y
    hog_families_X = {}
    for i, family in enumerate(gene_family_xml_nodes_hog):
        family_id = family.get('id')
        genes_per_hog = [val
                         for sublist in hog_op.getGenesPerSpeciesInFam(family).values()
                         for val in sublist]
        species_per_hog = hog_op.getGenesPerSpeciesInFam(family).keys()
        # genes per species; the excerpt was truncated after 'float(' and this
        # is the natural completion given the variable name
        duplication_ratio = float(len(genes_per_hog)) / float(len(species_per_hog))
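# Likely continuation (assumption; the excerpt ends at this point):
#   if len(species_per_hog) >= MIN_SPECIES and duplication_ratio <= DUP_RATIO:
#       hog_families_X[family_id] = family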