def __init__(self, genes, resource_manager=None):
    """Store the genes, create an empty multigraph and load the GO resources.

    Args:
        genes: Stored unchanged on ``self.genes``.
        resource_manager: Object providing ``get_go_obo()``. Any falsy value
            (including the default ``None``) is replaced by a newly
            constructed ``ResourceManager``.
    """
    self.genes = genes
    self.graph = nx.MultiGraph()
    # Falsy arguments (not only None) fall back to a fresh ResourceManager.
    self.resource_manager = resource_manager or ResourceManager()
    obo_source = self.resource_manager.get_go_obo()
    self.go_dag = GODag(obo_source)
    self.goa = self._load_goa_gaf()
def main(path_to_pkl, path_to_godag, path_to_go_list_pkl):
    """Evaluate pickled predictions against their labels and print metrics.

    Args:
        path_to_pkl: Pickled DataFrame with 'predictions', 'labels' and
            'gos' columns.
        path_to_godag: OBO file handed to ``GODag``.
        path_to_go_list_pkl: Pickled DataFrame with a 'functions' column.

    Side effects:
        Rebinds the module-level names ``GODag`` and ``func_set`` and prints
        the f/p/r values, AUROC, AUPRC and MCC to stdout.
    """
    global GODag, func_set

    # SECURITY: pd.read_pickle unpickles arbitrary objects; only pass
    # trusted files.
    frame = pd.read_pickle(path_to_pkl)
    preds = reshape(frame['predictions'].values)
    labels = reshape(frame['labels'].values)

    # NOTE: this replaces the module-level name ``GODag`` (the class) with
    # the loaded instance. A second call to main() would therefore try to
    # call that instance instead of the class.
    GODag = GODag(path_to_godag, optional_attrs=['relationship'])

    functions = pd.read_pickle(path_to_go_list_pkl)['functions'].values
    func_index = {go_id: position for position, go_id in enumerate(functions)}
    func_set = set(func_index)

    gos = frame['gos'].values
    f, p, r, _threshold, preds_max = compute_performance(preds, labels, gos)
    print('f: \t{}\np: \t{}\nr: \t{}'.format(f, p, r))

    print('AUROC: \t{}'.format(compute_roc(preds, labels)))
    print('AUPRC: \t{}'.format(compute_prc(preds, labels)))
    print('MCC: \t{}'.format(compute_mcc(preds_max, labels)))
def init_goea(**kws):
    """Initialize GODag and GOEnrichmentStudy.

    Keyword Args:
        methods: Passed to ``GOEnrichmentStudy`` as ``methods``; defaults to
            ``['not_bonferroni']`` when absent.

    Returns:
        A ``(GOEnrichmentStudy, study_ids)`` tuple, where ``study_ids`` holds
        the right-stripped lines of the ``study`` file under ``ROOT``.
    """
    obo_dag = GODag(ROOT + "go-basic.obo")
    assoc = read_associations(ROOT + "association", no_top=True)
    # Context managers close the ID files even if reading fails.
    with open(ROOT + "population") as population_file:
        popul_ids = [line.rstrip() for line in population_file]
    methods = kws.get('methods', ['not_bonferroni'])
    with open(ROOT + "study") as study_file:
        study_ids = [line.rstrip() for line in study_file]
    return GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=methods), study_ids
def load_dag(self, opt_fields=None):
    """Load ``self.obo`` into a GODag and write the load time to stdout.

    Args:
        opt_fields: Passed positionally as the second argument of ``GODag``
            and echoed in the timing message.

    Returns:
        The loaded GODag.
    """
    started = timeit.default_timer()
    dag = GODag(self.obo, opt_fields)
    elapsed = datetime.timedelta(seconds=(timeit.default_timer() - started))
    sys.stdout.write(
        "Elapsed HMS for OBO DAG load: {HMS} OPTIONAL_ATTR({O})\n".format(
            HMS=str(elapsed), O=opt_fields))
    return dag
def test_tcntobj_relationships(prt=sys.stdout):
    """Build TermCounts with and without relationships and compare them.

    Downloads the ontology and the GPAD annotation file into ``REPO`` as
    needed, loads the ontology twice (plain, and with the optional
    'relationship' attribute), creates one TermCounts per namespace in
    ``NSS`` for each DAG and passes both mappings to ``_chk_pass_fail``.
    """
    fin_obo = os.path.join(REPO, "go-basic.obo")
    fin_anno = os.path.join(REPO, 'goa_human.gpad')
    download_go_basic_obo(fin_obo, prt, loading_bar=None)
    dnld_annotation(fin_anno)

    # Ontology without and with relationships
    godag_plain = GODag(fin_obo)
    godag_rels = GODag(fin_obo, optional_attrs=['relationship'])

    # Annotations are read against the plain DAG
    annoobj = GpadReader(fin_anno, godag=godag_plain)

    # One TermCounts per namespace for each DAG flavor
    counts_plain = {}
    for ns in NSS:
        counts_plain[ns] = TermCounts(godag_plain, annoobj.get_id2gos(ns))
    counts_rels = {}
    for ns in NSS:
        counts_rels[ns] = TermCounts(godag_rels, annoobj.get_id2gos(ns), RELS)

    _chk_pass_fail(counts_plain, counts_rels)
def _chk_godag(go2obj_act, obo):
    """Assert that ``go2obj_act`` matches a GODag freshly loaded from ``obo``.

    The reference DAG is read from ``obo`` resolved two directories above
    this file. Both mappings must have the same size and keys, and each
    term must have the same set of parent IDs.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    go2obj_exp = GODag(os.path.join(here, "../..", obo))
    assert len(go2obj_act) == len(go2obj_exp)
    assert set(go2obj_act) == set(go2obj_exp)
    for goid, term_act in go2obj_act.items():
        term_exp = go2obj_exp[goid]
        act_gos = {parent.id for parent in term_act.parents}
        exp_gos = {parent.id for parent in term_exp.parents}
        assert act_gos == exp_gos, "\nACT: {A}\nEXP: {E}".format(A=act_gos, E=exp_gos)
def get_highest_ic():
    """Return the JSON content of ``HIGHEST_IC_FILE_PATH``.

    If that file does not exist, the GO DAG at ``GO_DAG_FILE_PATH`` is loaded
    and all of its keys are passed to ``compute_highest_inc_parallel`` first
    (presumably that call writes the file -- confirm against its
    implementation).

    Returns:
        The object decoded by ``json.load`` from ``HIGHEST_IC_FILE_PATH``.
    """
    if not os.path.isfile(HIGHEST_IC_FILE_PATH):
        # Silence GODag's loading output without leaking the devnull handle.
        # NOTE(review): assumes GODag writes to ``prt`` only while loading.
        with open(os.devnull, 'w') as devnull:
            go_dag = GODag(GO_DAG_FILE_PATH, prt=devnull)
        compute_highest_inc_parallel(list(go_dag.keys()))
    # The context manager closes the file even if json.load raises.
    with open(HIGHEST_IC_FILE_PATH, 'r') as ic_file:
        return json.load(ic_file)
def __init__(self, fin_obo):
    """Load the ontology ``fin_obo`` and index its non-obsolete terms.

    Args:
        fin_obo: OBO file path, joined onto the directory one level above
            this file.
    """
    this_dir = os.path.dirname(os.path.abspath(__file__))
    self.repo = os.path.join(this_dir, "../")
    self.fin_obo = os.path.join(self.repo, fin_obo)
    self.dag = GODag(self.fin_obo)
    # Keep only terms that are not flagged obsolete.
    current_terms = {}
    for goid, term in self.dag.items():
        if not term.is_obsolete:
            current_terms[goid] = term
    self.go2obj = current_terms
    self.goids_all = self.go2obj.keys()
def load_GODag():
    """Download ``so.obo`` and return it loaded as a GODag.

    OBO file retrieved from
    http://obo.cvs.sourceforge.net/viewvc/obo/obo/ontology/genomic-proteomic/so.obo
    """
    from jcvi.apps.base import download

    so_file = download(
        "http://obo.cvs.sourceforge.net/viewvc/obo/obo/ontology/genomic-proteomic/so.obo",
        debug=False,
    )
    return GODag(so_file)
def test_semantic_similarity():
    """Check term counts for the yangRWC fig1a ontology and annotations.

    The terms named 'I', 'L', 'M' and 'N' must each have a count of 50.
    """
    fin_obo = os.path.join(REPO, 'tests/data/yangRWC/fig1a.obo')
    fin_anno = os.path.join(REPO, 'tests/data/yangRWC/fig1a.anno')
    godag = GODag(fin_obo)
    name2go = {term.name: term.item_id for term in godag.values()}
    assoc = _get_id2gos(fin_anno, godag, name2go)
    tcntobj = TermCounts(godag, assoc)
    for term_name in ('I', 'L', 'M', 'N'):
        assert tcntobj.gocnts[name2go[term_name]] == 50
def __init__(self, fin_go_basic_obo, go2items, log=None):
    """Load the ontology with the searchable optional fields.

    Args:
        fin_go_basic_obo: OBO file passed to ``GODag``.
        go2items: Stored unchanged on ``self.go2items``.
        log: Output stream; ``sys.stdout`` is used when ``None``.
    """
    if log is None:
        self.log = sys.stdout
        self.bstdout = True
    else:
        self.log = log
        # NOTE(review): bstdout receives the log object itself here rather
        # than a boolean -- confirm this is what readers of bstdout expect.
        self.bstdout = log
    # Some obo fields often used in searching. Many are optional to load
    # when reading obo.
    self.goa_srch_hdrs = [
        'defn', 'comment', 'name', 'is_a', 'relationship', 'synonym', 'xref',
    ]
    self.obo_dag = GODag(fin_go_basic_obo, optional_attrs=self.goa_srch_hdrs)
    self.go2items = go2items
def test_paths_to_top():
    """Check ``paths_to_top`` for GO:0000010 in the mini ontology.

    The paths returned by the DAG are handed to ``chk_results`` together
    with the three expected paths, then printed with ``print_paths``.
    """
    expected_paths = [
        ['GO:0000001', 'GO:0000002', 'GO:0000005', 'GO:0000010'],
        ['GO:0000001', 'GO:0000003', 'GO:0000005', 'GO:0000010'],
        ['GO:0000001', 'GO:0000003', 'GO:0000006', 'GO:0000008', 'GO:0000010'],
    ]
    mini_dag = GODag(ROOT + "mini_obo.obo")
    actual_paths = mini_dag.paths_to_top("GO:0000010")
    chk_results(actual_paths, expected_paths)
    print_paths(actual_paths)
def _load_dag(dag_fin, opt_fields=None, out=None):
    """Load ``dag_fin`` (relative to ``REPO``) as a GODag and report timing.

    Args:
        dag_fin: OBO file name, joined onto ``REPO``.
        opt_fields: Passed positionally as the second argument of ``GODag``.
        out: Stream for the timing message; ``sys.stdout`` when ``None``.

    Returns:
        The loaded GODag.
    """
    started = timeit.default_timer()
    dag = GODag(os.path.join(REPO, dag_fin), opt_fields)
    elapsed = datetime.timedelta(seconds=(timeit.default_timer() - started))
    stream = sys.stdout if out is None else out
    stream.write("Elapsed HMS for OBO DAG load: {}\n\n".format(str(elapsed)))
    return dag
def get_goea_results(method="fdr_bh"):
    """Get GOEA results.

    Loads the slim ontology, the association file and the population and
    study ID files from the ``data`` directory next to this file, then runs
    the study.

    Args:
        method: The single method name passed (in a list) both when creating
            the ``GOEnrichmentStudy`` and when calling ``run_study``.

    Returns:
        Whatever ``GOEnrichmentStudy.run_study`` returns for the study IDs.
    """
    root_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
    obo_dag = GODag(os.path.join(root_dir, "goslim_generic.obo"))
    assoc = read_associations(os.path.join(root_dir, "slim_association"), no_top=True)
    # Context managers close the ID files even if reading fails.
    with open(os.path.join(root_dir, "small_population")) as population_file:
        popul_ids = [line.rstrip() for line in population_file]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=[method])
    with open(os.path.join(root_dir, "small_study")) as study_file:
        study_ids = [line.rstrip() for line in study_file]
    return goeaobj.run_study(study_ids, methods=[method])
def __init__(self):
    """Download the ontology and build plain and relationship GoSubDags.

    Loads ``self.obo`` twice (without and with the optional 'relationship'
    attribute), collects the unique term IDs of the plain DAG, and creates
    one GoSubDag per DAG, reporting elapsed time through ``prt_hms``.
    """
    download_go_basic_obo(self.obo, sys.stdout, loading_bar=None)
    self.godag_r0 = GODag(self.obo)
    self.godag_r1 = GODag(self.obo, optional_attrs={'relationship'})
    self.goids = list({term.id for term in self.godag_r0.values()})

    # GoSubDag (plain)
    started = timeit.default_timer()
    self.gosubdag_r0 = GoSubDag(self.goids, self.godag_r0, prt=None)
    summary_r0 = "GoSubDag r0 {N:4} GOs {S:3} srcs".format(
        N=len(self.gosubdag_r0.go2obj), S=len(self.gosubdag_r0.go_sources))
    prt_hms(started, summary_r0)

    # GoSubDag with relationships. Both reports use the same start time, so
    # this one also covers the plain build above.
    self.gosubdag_r1 = GoSubDag(self.goids, self.godag_r1, prt=None, relationships=True)
    summary_r1 = "GoSubDag r1 {N:4} GOs {S:3} srcs".format(
        N=len(self.gosubdag_r1.go2obj), S=len(self.gosubdag_r1.go_sources))
    prt_hms(started, summary_r1)
def intialize_term_counts():
    """Write a JSON mapping of term ID -> count to ``JSON_INDEXED_FILE_PATH``.

    The ontology is read from ``go-basic.obo`` under ``DATA_DIR`` and the
    associations from ``UNIPROT_ASSOCIATIONS_FILE_PATH``; the count of each
    term comes from ``TermCounts.get_count``.
    """
    go_dag = GODag(os.path.join(DATA_DIR, "go-basic.obo"))
    reader = IdToGosReader(UNIPROT_ASSOCIATIONS_FILE_PATH, godag=go_dag)
    term_counts = TermCounts(go_dag, reader.get_id2gos('all'))
    go_freq_dict = {
        term.id: term_counts.get_count(term.id) for term in go_dag.values()
    }
    # write frequency dict to JSON file
    with open(JSON_INDEXED_FILE_PATH, 'w') as json_file:
        json.dump(go_freq_dict, json_file)
def get_objgoea(pop, assoc, args):
    """Build the GOEnrichmentStudy described by the parsed arguments.

    Args:
        pop: Population passed through to ``GOEnrichmentStudy``.
        assoc: Associations passed through to ``GOEnrichmentStudy``.
        args: Object providing ``obo``, ``method`` (comma-separated),
            ``no_propagate_counts``, ``alpha`` and ``pvalcalc``.

    Returns:
        The configured ``GOEnrichmentStudy``.
    """
    godag = GODag(obo_file=args.obo)
    study_kwargs = {
        'methods': args.method.split(","),
        'propagate_counts': not args.no_propagate_counts,
        'alpha': args.alpha,
        'pvalcalc': args.pvalcalc,
    }
    return GOEnrichmentStudy(pop, assoc, godag, **study_kwargs)
def load_go_dag(self):
    """Load the GO DAG at ``self.go_dag_path`` into ``self.go_dag``.

    # Raises
        GeneOntologyError: if ``self.go_dag_path`` does not exist.
    """
    filepath = self.go_dag_path
    if os.path.exists(filepath):
        self.go_dag = GODag(filepath)
        return
    raise GeneOntologyError(f"{os.path.basename(filepath)} does not exist at {os.path.dirname(filepath)}")
def run_comparison(go_list_1, go_list_2, go_dag=None, progress=None):
    """
    Compute one similarity score per GO domain between two GO-term lists.

    Parameters
    ----------
    go_list_1 : a list with GO-identifiers as strings
        All GO-terms present in the first sample.
    go_list_2 : a list with GO-identifiers as strings
        All GO-terms present in the second sample.
    go_dag : GODag object
        GODag object from the goatools package. When None, the DAG is loaded
        from GO_DAG_FILE_PATH with its loading output discarded.
    progress : function (number) => void
        is called with the current progress value (a floating point value
        between 0 and 1), and finally with 1 once all domains are done.

    Returns
    -------
    tuple
        One score per entry of GO_DOMAINS, in that order (described by the
        original documentation as biological process, cellular component and
        molecular function).
    """
    freq_dict = get_frequency_counts()
    highest_ic_anc = get_highest_ic()

    if go_dag is None:
        # Close the devnull sink once loading is finished instead of leaking
        # the handle. NOTE(review): assumes GODag writes to ``prt`` only
        # while loading.
        with open(os.devnull, 'w') as devnull:
            go_dag = GODag(GO_DAG_FILE_PATH, prt=devnull)

    split_per_domain_1 = split_per_domain(go_list_1, go_dag)
    split_per_domain_2 = split_per_domain(go_list_2, go_dag)

    total_comparisons = len(set(go_list_1)) * len(set(go_list_2))
    done = 0

    def progress_reporter(batch_size):
        nonlocal done
        # With an empty input list there is nothing to compare; skip the
        # update rather than dividing by zero.
        if progress and total_comparisons:
            done += batch_size
            progress(done / total_comparisons)

    output = [
        compute_bma_metric(split_per_domain_1[i], split_per_domain_2[i],
                           freq_dict, highest_ic_anc, progress_reporter,
                           similarity_method="lin")
        for i in range(len(GO_DOMAINS))
    ]

    if progress:
        progress(1)

    return tuple(output)
def test_paths_to_top():
    """Check ``paths_to_top`` for GO:0000010 in ``./data/mini_obo.obo``.

    The paths returned by the DAG are handed to ``chk_results`` together
    with the three expected paths, then printed with ``prt_paths``.
    """
    expected_paths = [
        ['GO:0000001', 'GO:0000002', 'GO:0000005', 'GO:0000010'],
        ['GO:0000001', 'GO:0000003', 'GO:0000005', 'GO:0000010'],
        ['GO:0000001', 'GO:0000003', 'GO:0000006', 'GO:0000008', 'GO:0000010'],
    ]
    # The path is relative to the current working directory.
    # Alternative location kept for reference: "./tests/data/mini_obo.obo"
    mini_dag = GODag("./data/mini_obo.obo")
    actual_paths = mini_dag.paths_to_top("GO:0000010")
    chk_results(actual_paths, expected_paths)
    prt_paths(actual_paths)
def compute_similarity_method(params):
    """Compute pairwise similarities between two lists of GO IDs.

    Args:
        params: A 6-tuple ``(go_list1, go_list2, term_counts, go_dag_path,
            highest_ic_anc, similarity_method)``. ``similarity_method`` is
            called as ``similarity_method(id1, id2, go_dag, term_counts,
            highest_ic_anc)``.

    Returns:
        dict mapping each ID pair, stored with the smaller ID first, to the
        value returned by ``similarity_method`` for that pair.
    """
    (go_list1, go_list2, term_counts, _go_dag_path, highest_ic_anc,
     similarity_method) = params
    # NOTE(review): the go_dag_path element of ``params`` is not used; the
    # DAG is always loaded from GO_DAG_FILE_PATH. Confirm this is intended.
    # The devnull sink is closed after loading instead of leaking one handle
    # per call. NOTE(review): assumes GODag writes to ``prt`` only while
    # loading.
    with open(os.devnull, 'w') as devnull:
        go_dag = GODag(GO_DAG_FILE_PATH, prt=devnull)
    result = {}
    for id1 in go_list1:
        for id2 in go_list2:
            # Key is independent of argument order.
            key = (id1, id2) if id1 < id2 else (id2, id1)
            result[key] = similarity_method(id1, id2, go_dag, term_counts, highest_ic_anc)
    return result
def get_goeaobj(methods=None):
    """Create a GOEnrichmentStudy from the small test association/population.

    If ``go-basic.obo`` is not present under ``REPO``, ``get_godag`` is
    called for it before the ontology is loaded.

    Args:
        methods: Passed to ``GOEnrichmentStudy`` as ``methods``.

    Returns:
        The ``GOEnrichmentStudy`` object.
    """
    obo_fin = "{REPO}/go-basic.obo".format(REPO=REPO)
    if not os.path.isfile(obo_fin):
        get_godag("go-basic.obo")
    obo_dag = GODag(obo_fin)
    assoc = read_associations(
        "{REPO}/tests/data/small_association".format(REPO=REPO),
        no_top=True)
    popul_fin = "{REPO}/tests/data/small_population".format(REPO=REPO)
    # The context manager closes the population file after reading.
    with open(popul_fin) as population_file:
        popul_ids = [line.rstrip() for line in population_file]
    return GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=methods)
def load_ontologies_and_associations(self):
    """Download (if needed) and load the ontology and human associations.

    Returns:
        A tuple ``(obodag, geneid2gos_human)``: the loaded GODag and the
        result of ``read_ncbi_gene2go`` restricted to taxid 9606.
    """
    # Parenthesized single-argument print works as a statement on Python 2
    # and as a function call on Python 3.
    print("---LOADING ONTOLOGIES AND ASSOCIATIONS---")
    # Check if files exist and download if not
    obo_fname = download_go_basic_obo()
    # Called for its download side effect; the associations are read below
    # from the literal file name "gene2go".
    # NOTE(review): presumably that is the default download target -- confirm.
    download_ncbi_associations()
    # Load ontologies and associations
    obodag = GODag(obo_fname)
    geneid2gos_human = read_ncbi_gene2go("gene2go", taxids=[9606])
    print("{N:,} annotated human genes".format(N=len(geneid2gos_human)))
    return obodag, geneid2gos_human
def init(self):
    """Load the ontology file and obtain the term graph.

    The graph is read from ``self.graph_file`` when that file exists;
    otherwise it is built with ``self._build_graph()`` and written there.
    """
    # Log message: "Loading ontology file:"
    _log.debug("Cargando archivo de ontologias:" + self.obo_file)
    self.go_dag = GODag(self.obo_file)
    # Log message: "The file was loaded:"
    _log.debug("Se cargo el archivo:" + self.obo_file)
    graph_is_cached = os.path.exists(self.graph_file)
    if graph_is_cached:
        self.graph = nx.read_gpickle(self.graph_file)
    else:
        self._build_graph()
        nx.write_gpickle(self.graph, self.graph_file)
    # NOTE(review): the collapsed source does not show whether this final
    # message belongs to the build branch only; it is emitted
    # unconditionally here. Log message: "The term graph was generated".
    _log.debug("Se genero el grafo de terminos")
def __init__(
        self,
        work_dir: str = '.',
        clean_work_dir: bool = False,
        organism: str = 'human',
        study_parameters: Union[Dict[str, Union[int, float, str, List, Dict]], None] = None,
) -> None:
    """A GOEngine that can be used for performing analysis using GOATOOLS

    Args:
        work_dir (str, optional): The path to a temp directory where
            intermediate results and raw data will be downloaded/written to.
            Defaults to the current working directory.
        clean_work_dir (bool, optional): Stored on ``self._clean_work_dir``;
            defaults to False.
        organism (str, optional): Either 'human' or 'mouse'. Defaults to
            'human'.
        study_parameters (Dict[str,Union[int,float,str,List,Dict]], optional):
            Keyword arguments forwarded to ``GOEnrichmentStudyNS``. When
            None, defaults to {'propagate_counts':False,'alpha':0.05,
            'methods':['fdr_bh']}.

    Raises:
        ValueError: if ``work_dir`` does not exist or ``organism`` is neither
            'human' nor 'mouse'.
    """
    print("Creating a GO Engine ...")
    if not os.path.exists(work_dir):
        raise ValueError(
            f"The provided work path: {work_dir} does not exist!!!")
    self.work_dir = work_dir
    if organism != 'human' and organism != 'mouse':
        raise ValueError(
            f"The provided organism: {organism} is not support, current engine mainly work with human and moues only"
        )
    # A None sentinel avoids sharing one mutable default dict across calls.
    if study_parameters is None:
        study_parameters = {
            'propagate_counts': False,
            'alpha': 0.05,
            'methods': ['fdr_bh'],
        }
    print(f"\t --> Downloading data ...")
    obo_fname = download_go_basic_obo(
        os.path.join(work_dir, 'go-basic.obo'))
    gene2go_fname = download_ncbi_associations(
        os.path.join(work_dir, 'gene2go'))
    ## parse the GO term
    print(
        f"\t --> parsing the data and intializing the base GOEA object...")
    obo_dag = GODag(obo_fname)
    # The two organisms differ only in the taxid used to read associations.
    taxid = 9606 if organism == 'human' else 10090
    # NOTE(review): gene2iden_human supplies the population for both
    # organisms, exactly as before -- confirm this is intended for 'mouse'.
    self._goea_obj = GOEnrichmentStudyNS(
        gene2iden_human.keys(),
        Gene2GoReader(gene2go_fname, taxids=[taxid]).get_ns2assc(),
        obo_dag, **study_parameters)
    self._clean_work_dir = clean_work_dir
    self._gene_ids = None
def init_goea(log):
    """Read Ontologies and Annotations once.

    Args:
        log: Passed as the third argument to ``GOEA``.

    Returns:
        The ``GOEA`` object with its population set from
        ``../data/population``.
    """
    # 1. Load the ontology, the associations and the population IDs
    obo_dag = GODag("go-basic.obo")
    assoc = read_associations("../data/association", no_top=True)
    # The context manager closes the population file after reading.
    with open("../data/population") as population_file:
        popul_ids = [line.rstrip() for line in population_file]
    # 2. Create the enrichment-analysis object and register the population
    goeaobj = GOEA(obo_dag, assoc, log)
    goeaobj.set_population(popul_ids)
    return goeaobj
def test_all():
    """Write the four hierarchy reports for the mini ontology to stdout.

    The elapsed time printed at the end covers only the GODag load, not the
    report writing.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    dag_fin = os.path.join(here, "data/mini_obo.obo")
    started = timeit.default_timer()
    dag = GODag(dag_fin)
    load_seconds = timeit.default_timer() - started
    out = sys.stdout
    write_hier_all(dag, out)
    write_hier_norep(dag, out)
    write_hier_lim(dag, out)
    write_hier_mrk(dag, out)
    elapsed = datetime.timedelta(seconds=load_seconds)
    sys.stdout.write("Elapsed HMS: {}\n\n".format(str(elapsed)))
def get_pathway_mapping(organism=9606, ontology='basic', exclude=None, force=False):
    """Return a mapping of GO term ID -> term name.

    Args:
        organism: Not used by this function.
        ontology: When it contains 'slim', the file name
            'goslim_generic.obo' is used; otherwise 'go-basic.obo'.
        exclude: Passed to ``get_namespace_filter``; terms whose namespace
            the resulting filter rejects are omitted.
        force: When true and the OBO file already exists, the file is removed
            before the download call.

    Returns:
        dict of term ID to term name for every term passing the filter.
    """
    obo = 'goslim_generic.obo' if 'slim' in ontology else 'go-basic.obo'
    namespace_filter = get_namespace_filter(exclude)
    # Logical ``and``: the previous bitwise ``&`` gave wrong results for
    # truthy non-bool values of ``force`` (e.g. 2 & True == 0).
    if force and os.path.isfile(obo):
        os.remove(obo)
    # NOTE(review): the slim file name is also passed to
    # download_go_basic_obo -- confirm that call fetches the slim ontology
    # rather than go-basic under that name.
    obo_fname = goatools.base.download_go_basic_obo(obo)
    obodag = GODag(obo_fname)
    return {
        term_id: term.name
        for term_id, term in obodag.items()
        if namespace_filter(term.namespace)
    }
def get_goeaobj(method, geneids_pop, taxid):
    """Load: ontologies, associations, and population geneids.

    Args:
        method: Single method name, passed (in a list) as ``methods``.
        geneids_pop: Population gene IDs passed to ``GOEnrichmentStudy``.
        taxid: Taxonomy ID passed (in a list) to ``get_assoc_ncbi_taxids``.

    Returns:
        A ``GOEnrichmentStudy`` created with ``propagate_counts=False`` and
        ``alpha=0.05``.
    """
    fin_obo = "go-basic.obo"
    if not os.path.isfile(fin_obo):
        # wget.download() takes a bare URL; the former "wget " prefix inside
        # the string made it an invalid URL.
        wget.download("http://geneontology.org/ontology/go-basic.obo")
    obo_dag = GODag(fin_obo)
    assoc_geneid2gos = get_assoc_ncbi_taxids([taxid])
    return GOEnrichmentStudy(
        geneids_pop, assoc_geneid2gos, obo_dag,
        propagate_counts=False,
        alpha=0.05,
        methods=[method])
def test_semantic_i150():
    """Test that comparing a GO ID with itself gives a similarity of 1.0.

    Only the fig1a ontology is loaded; no annotations or term counts are
    used by this check.
    """
    fin_dag = os.path.join(REPO, 'tests/data/yangRWC/fig1a.obo')
    # Read files
    godag = GODag(fin_dag)
    # Compare every distinct GO term with itself
    unique_terms = set(godag.values())
    for term in unique_terms:
        goid = term.item_id
        assert semantic_similarity(goid, goid, godag) == 1.0