def __init__(self) -> None:
    if os.path.exists("gene_ontology.1_2.obo"):
        self.obo = obo_parser.GODag("gene_ontology.1_2.obo")
    else:
        logger.info("Downloading Gene Ontology OBO...")
        # urlretrieve without a destination saves to a temp file,
        # so pass the target path explicitly
        request.urlretrieve(
            "http://www.geneontology.org/ontology/obo_format_1_2/gene_ontology.1_2.obo",
            "gene_ontology.1_2.obo")
        self.obo = obo_parser.GODag(
            "gene_ontology.1_2.obo")  # This will be used in query_obo_term
        logger.info("Done downloading OBO.")
    self.substruct = UniprotGoTerms()
def GO_Level_Function(Final_df, obo_file_path, save_path):
    # Load the OBO data into the parser:
    go_dag = obo_parser.GODag(obo_file_path)
    # Get the levels of the GO terms:
    Final_df_levels = Final_df
    # We don't know the level of these terms (they are not in the parser):
    missing_GO_terms = set(Final_df_levels["GO_term"]) - set(go_dag)
    print("You have a total of " + str(len(missing_GO_terms)) + " terms without level info")
    # Split off the missing terms first, then drop them to avoid errors
    # (splitting after filtering would always yield an empty frame):
    Final_df_levels_deleted = Final_df_levels[Final_df_levels.GO_term.isin(missing_GO_terms)]
    Final_df_levels = Final_df_levels[~Final_df_levels.GO_term.isin(missing_GO_terms)]
    # Get the level of the terms that we can:
    Final_df_levels['Level'] = [go_dag[term].level for term in Final_df_levels['GO_term']]
    # Build the final DataFrame with NaN for those whose level we don't have;
    # for future studies the users can decide whether to include them in the analyses.
    Final_df_Levels = pd.concat([Final_df_levels, Final_df_levels_deleted])
    Final_df_Levels = Final_df_Levels.sort_values(by=['Total_K'])
    Final_df_Levels = Final_df_Levels.reset_index(drop=True)
    # Save the results:
    Final_df_Levels.to_csv(save_path, sep="\t", header=True, index=False)
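# Usage sketch for GO_Level_Function above. The "GO_term" and "Total_K" column
# names match what the function expects; the file paths and toy rows here are
# illustrative, not from the original source:
import pandas as pd

demo = pd.DataFrame({"GO_term": ["GO:0008150", "GO:0003674"], "Total_K": [10, 5]})
GO_Level_Function(demo, "go-basic.obo", "go_levels.tsv")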
def goatools(self):
    """The network loaded into goatools' format.

    * https://github.com/tanghaibao/goatools

    To install:

        $ pip install goatools
    """
    from goatools import obo_parser
    return obo_parser.GODag(self.path)
def display_topics(model, feature_names, no_top_words, go_file):
    """
    Params:
    - model (LDA)
    - feature names/targets
    - number of top "words" (GO terms) to display for each topic
    - path to the GO OBO file
    Returns:
    - sorted, unique list of GO terms
    """
    go = obo_parser.GODag(go_file)
    all_terms = set()
    for topic_idx, topic in enumerate(model.components_):
        terms = [
            feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]
        ]
        print("Topic {:d}:".format(topic_idx + 1))
        for i, term in enumerate(terms):
            if term in go:
                go_term = go[term]
                all_terms.add(term)
                print('{:2} {} ({}) = {} [{}]'.format(
                    i + 1, term, go_term.namespace, go_term.name,
                    go_term.depth))
        print()
    return list(sorted(all_terms))
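# Usage sketch for display_topics above, on toy data. The GO IDs and the
# go-basic.obo path are illustrative; any matrix whose columns are GO IDs works:
import numpy as np
from sklearn.decomposition import LatentDirichletAllocation

go_ids = np.array(["GO:0008150", "GO:0003674", "GO:0005575"])  # toy features
counts = np.random.randint(0, 5, size=(20, 3))  # 20 samples x 3 GO-term features
lda = LatentDirichletAllocation(n_components=2, random_state=0).fit(counts)
top_terms = display_topics(lda, go_ids, no_top_words=3, go_file="go-basic.obo")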
def get_GO(optional_attrs=None):
    '''
    Fetches GO Basic to local file if not present
    Returns GO Basic file
    '''
    go_obo_url = 'http://purl.obolibrary.org/obo/go/go-basic.obo'
    data_folder = os.getcwd() + '/data'

    # Check if we have the ./data directory already
    if (not os.path.isfile(data_folder)):
        # Emulate mkdir -p (no error if folder exists)
        try:
            os.mkdir(data_folder)
        except OSError as e:
            if (e.errno != 17):  # 17 == EEXIST
                raise e
    else:
        raise Exception(
            'Data path (' + data_folder + ') exists as a file. '
            'Please rename, remove or change the desired location of the data path.'
        )

    # Check if the file exists already
    if (not os.path.isfile(data_folder + '/go-basic.obo')):
        go_obo = wget.download(go_obo_url, data_folder + '/go-basic.obo')
    else:
        go_obo = data_folder + '/go-basic.obo'
    go = obo_parser.GODag(go_obo, optional_attrs=optional_attrs)
    return go
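# Usage sketch for get_GO above: fetch/parse the DAG, then look up a term.
# The term ID is illustrative (it also appears in a later example):
go = get_GO(optional_attrs=['relationship'])
term = go['GO:0048527']  # lateral root development
print(term.name, term.namespace, term.depth)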
def perform_gene_enrichment_analysis(self, metagene_matrix, method='fdr'):
    # Load the Gene Ontology
    n_comps = metagene_matrix.shape[1]
    self.download_and_cache_resources()  # Download ontology and annotations, if necessary
    gene_ontology = obo_parser.GODag('../DownloadedResources/go-basic.obo')

    # Load the human annotations
    c = 0
    with gzip.open('../DownloadedResources/goa_human.gaf.gz', 'rt') as gaf:
        funcs = {}
        for entry in GOA.gafiterator(gaf):
            c += 1
            uniprot_id = entry.pop('DB_Object_Symbol')
            funcs[uniprot_id] = entry

    # Our population is the set of genes we are analysing
    population = self.gene_symbols()
    print("We have %d genes in our population" % len(population))

    # Build associations from functional annotations we got from the gaf file
    associations = {}
    for x in funcs:
        if x not in associations:
            associations[x] = set()
        associations[x].add(str(funcs[x]['GO_ID']))

    gea = GOEnrichmentStudy(population,
                            associations,
                            gene_ontology,
                            propagate_counts=True,
                            alpha=0.05,
                            methods=[method])
    gea_results_by_component = {}
    rankings = self.ranked_genes_by_component(metagene_matrix)
    for ci in range(n_comps):
        study_genes = rankings[ci]
        print('\nComp. %d: %s...' % (ci, str(study_genes[:10])))
        gea_results_by_component[ci] = gea.run_study(study_genes)

    # Get results into a dataframe per component. Easiest way is to use routine to
    # write a .tsv file, then read back and filter
    gea_results_df_by_component = []
    for ci in range(n_comps):
        ge_df = self._perform_gene_enrichment_analysis_one_component(
            ci, gea_results_by_component, gea)
        if ge_df is not None:
            gea_results_df_by_component += [ge_df]

    # Merge the per-component dataframes into a single one
    # (pd.concat replaces the DataFrame.append API removed in pandas 2.0)
    gea_all_sig_results_df = (pd.concat(gea_results_df_by_component)
                              if gea_results_df_by_component else pd.DataFrame())

    gea_all_sig_results_df.to_csv(self.cache_dir + '%s_gea_all.tsv' % self.prefix,
                                  sep='\t')
def __init__(self, tax_id=9606, logger=None, force_update=False,
             go_dir=DEFAULT_GO_DIR, bg_genes=None):
    # gene_converter can be used to enable automatic gene conversion
    self.gene_converter = None
    self.logger = logger or log.get_console_logger(self.__class__.__name__)
    self.tax_id = tax_id
    if not os.path.isdir(go_dir):
        self.logger.warning("Creating master GO directory at %s.", go_dir)
        os.makedirs(go_dir)
    else:
        self.logger.info("Using existing GO directory at %s.", go_dir)
    self.base_dir = go_dir

    # get filenames and parse both GAF and OBO
    self.obo_fn = self.check_and_get_obo(force_update=force_update)
    self.gaf_fn = self.check_and_get_gaf(force_update=force_update)
    self.obo = obo_parser.GODag(self.obo_fn)
    self.gaf = associations.read_ncbi_gene2go(self.gaf_fn, taxids=[self.tax_id])
    self.logger.info("{N:,} annotated human genes".format(N=len(self.gaf)))

    self.bg_genes = bg_genes
    if self.bg_genes is not None:
        self.set_bg_genes(bg_genes)
def get_GO_data(self):
    """
    Get GO tree data.

    Credits: https://nbviewer.jupyter.org/urls/dessimozlab.github.io/go-handbook/GO%20Tutorial%20in%20Python%20-%20Solutions.ipynb

    Returns
    -------
    None
    """
    print("Getting GO data folder.")
    go_obo_url = 'http://purl.obolibrary.org/obo/go/go-basic.obo'
    go_data_folder = join(self.data_path, "data")
    create_dir(go_data_folder)

    # Check if the file exists already
    if not isfile(join(go_data_folder, "go-basic.obo")):
        self.go_obo = wget.download(go_obo_url, join(go_data_folder, "go-basic.obo"))
    else:
        self.go_obo = join(go_data_folder, "go-basic.obo")
    self.go_db = obo_parser.GODag(self.go_obo)
def prepare_GO_data(adata, gene2go, GO_file, GO_min_genes=500, GO_max_genes=None,
                    GO_min_level=3, GO_max_level=3):
    """
    Preprocesses data. GO terms are propagated to all parent categories so all
    GO terms satisfying the min/max gene conditions are included.

    adata: anndata object containing raw count data
    gene2go: mapping of gene IDs to GO terms
    GO_file: GO ontology obo file
    GO_min_genes: minimum number of genes assigned to a GO term required to keep it (default: 500)
    GO_max_genes: maximum number of genes assigned to a GO term required to keep it (default: None)
    GO_min_level: minimum level required to keep a GO term (default: 3)
    GO_max_level: maximum level required to keep a GO term (default: 3)

    return: dictionary mapping each retained GO term to its genes
    """
    GOdag = obo_parser.GODag(obo_file=GO_file)
    genes = set(adata.var_names)
    gene2go = {g: gene2go[g] for g in gene2go.keys() if g in genes}
    GOdag.update_association(gene2go)  # propagate through hierarchy
    go2gene = reverse_association(gene2go)

    filtered_go2gene = {}
    for GO in go2gene:
        ngenes = len(go2gene[GO])
        if check_conditions(GOdag.get(GO), ngenes, GO_min_genes, GO_max_genes,
                            GO_min_level, GO_max_level):
            filtered_go2gene[GO] = go2gene[GO]
    print("Num filtered GOs:", len(filtered_go2gene))
    return filtered_go2gene
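# The helper reverse_association is referenced above but not shown; a minimal
# sketch of what it is assumed to do (invert a gene -> {GO terms} mapping into
# GO term -> {genes}):
def reverse_association(gene2go):
    go2gene = {}
    for gene, gos in gene2go.items():
        for go in gos:
            go2gene.setdefault(go, set()).add(gene)
    return go2gene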
def parseOBO(**kwargs):
    """Parse a GO OBO file containing the GO itself.
    See `OBO`_ for more information on the file format.

    .. _OBO: http://owlcollab.github.io/oboformat/doc/obo-syntax.html
    """
    try:
        from goatools import obo_parser
    except ImportError:
        raise ImportError('GOATools needs to be installed to use parseOBO')

    go_obo_url = kwargs.get('go_obo_url', None)
    if go_obo_url is None:
        go_obo_url = 'http://purl.obolibrary.org/obo/go/go-basic.obo'
    data_folder = kwargs.get('data_folder', None)
    if data_folder is None:
        data_folder = os.getcwd() + '/Data'

    # Check if we have the ./data directory already
    if (not os.path.isfile(data_folder)):
        # Emulate mkdir -p (no error if folder exists)
        try:
            os.mkdir(data_folder)
        except OSError as e:
            if (e.errno != 17):
                raise e
    else:
        raise Exception(
            'Data path (' + data_folder + ') exists as a file. '
            'Please rename, remove or change the desired location of the data path.'
        )

    # Check if the file exists already; assign go_obo up front so the final
    # return works on the first-download path as well
    go_obo = data_folder + '/go-basic.obo'
    if (not os.path.isfile(go_obo)):
        try:
            handle = openURL(go_obo_url)
        except Exception as err:
            LOGGER.warn('{0} download failed ({1}).'.format(go_obo_url, str(err)))
        else:
            data = handle.read()
            if len(data):
                with open(go_obo, 'w+b') as obofile:
                    obofile.write(data)
                LOGGER.debug('{0} downloaded ({1})'.format(go_obo_url, sympath(go_obo)))
            else:
                LOGGER.warn('{0} download failed, reason unknown.'.format(go_obo_url))
    return obo_parser.GODag(go_obo)
def fetch_go_hierarcy():
    obo_file_location = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    if not os.path.exists(os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)):
        wget.download(constants.GO_OBO_URL,
                      os.path.join(constants.GO_DIR, constants.GO_FILE_NAME))

    go = obo_parser.GODag(obo_file_location, optional_attrs=['relationship'])

    print("Downloading gene-GO associations")
    association_file_location = os.path.join(constants.GO_DIR,
                                             constants.GO_ASSOCIATION_FILE_NAME)
    if not os.path.exists(association_file_location):
        wget.download(constants.GO_ASSOCIATION_GENE2GEO_URL,
                      os.path.join(constants.GO_DIR,
                                   constants.GO_ASSOCIATION_FILE_NAME))

    print("Loading gene-GO associations")
    # gene2go = download_ncbi_associations(obo_file_location) - why is this line needed?
    go2geneids_human = read_ncbi_gene2go(association_file_location, taxids=[9606],
                                         go2geneids=True)

    print("Writing out GO child-parent links")
    if not os.path.exists(constants.OUTPUT_GLOBAL_DIR):
        os.makedirs(constants.OUTPUT_GLOBAL_DIR)
    out_fname = "go_output_{}_{}.txt".format(constants.CANCER_TYPE, time.time())
    genes = []
    isa = []
    relship = []
    with open(os.path.join(constants.OUTPUT_GLOBAL_DIR, out_fname), 'w') as o:
        for goid in go2geneids_human.keys():
            if goid not in go:  # dict.has_key was removed in Python 3
                print("GO obo file does not contain {}".format(goid))
                continue
            entry = go[goid]
            for gene in go2geneids_human[entry.id]:
                genes.append((str(gene), entry.id))
                o.write("{}\t{}\t{}\n".format("genes", *genes[-1]))
            children = entry.children
            for c in children:
                isa.append((c.id, entry.id))
                o.write("{}\t{}\t{}\n".format("is a", *isa[-1]))
            rels = entry.relationship_rev
            for rtype in rels.keys():
                rs = rels[rtype]
                for r in rs:
                    relship.append((rtype, r.id, entry.id))
                    o.write("{}\t{}\t{}\n".format(rtype, *relship[-1]))

    return (genes, isa, relship)
def test_top_parent(prt=sys.stdout):
    """Semantic Similarity test for Issue #86."""
    fin_obo = "data/i86.obo"
    branch_dist = 5
    repo = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..")
    godag = obo_parser.GODag(os.path.join(repo, fin_obo))

    # Calculate the semantic distance and semantic similarity:
    _test_path_same(godag, prt)
    _test_path_parallel(godag, prt)
    _test_path_bp_mf(branch_dist, godag, prt)
    sys.stdout.write("TESTS PASSed: similarity_top_parent\n")
def _load_go_db(data_dir, slim_down):
    """
    Load GO databases using goatools.obo_parser. Always loads the full GO, and
    loads the metagenomics slim GO if slim_down = True.

    :param data_dir: Data directory
    :param slim_down: Whether slim database is going to be used or not
    :return: A tuple, with full and slim GO. If slim_down = False, then the tuple is (full GO, None)
    """
    obo_path, slim_path = GeneOntologyDb._define_data_paths(data_dir)
    if not (os.path.exists(obo_path) and os.path.exists(slim_path)):
        logging.error(
            'GO files not found in specified directory.\n' +
            'Please use the command >metaquantome db ... to download the files.')
    # read gos
    go_dag = obo_parser.GODag(obo_path)
    if slim_down:
        go_dag_slim = obo_parser.GODag(slim_path)
    else:
        go_dag_slim = None
    return go_dag, go_dag_slim
def __init__(self, basename, method):
    self.basename = basename
    self.method = method
    self._gene_symbols = None
    self.cache_dir = '../Cache/%s/GeneEnrichment/' % self.basename
    self.plots_dir = '../Plots/%s/GeneEnrichment/' % self.basename
    self.gene_column_name = 'Gene_ID' if 'Canon' in self.basename else 'GeneENSG'
    self._go_enrichment_study = None  # will be lazily evaluated
    os.makedirs(self.cache_dir, exist_ok=True)
    os.makedirs(self.plots_dir, exist_ok=True)
    self.download_and_cache_resources()  # Download ontology and annotations, if necessary
    self._gene_ontology = obo_parser.GODag('../DownloadedResources/go-basic.obo')
def main(args):
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=[
            logging.FileHandler("../logs/report.log"),
            logging.StreamHandler()
        ])
    logging.info(args)

    paths = utils.read_paths(args.paths_file)
    go = obo_parser.GODag(args.obo_file)
    gene2go = read_ncbi_gene2go(args.gene2go_file, taxids=[9606])
    termcounts = TermCounts(go, gene2go)

    if args.namespace is not None:
        # restrict the DAG to one namespace; behavior matches the original
        # cc/mf/bp if-elif chain
        namespaces = {
            'cc': 'cellular_component',
            'mf': 'molecular_function',
            'bp': 'biological_process',
        }
        if args.namespace not in namespaces:
            raise ValueError('namespace can be only cc, mf or bp')
        go = {
            go_term: values
            for go_term, values in go.items()
            if values.namespace == namespaces[args.namespace]
        }

    wrapped = [[path, go, gene2go, termcounts] for path in paths]
    if args.n_cores > 1:
        sims = list(p_map(wrap, wrapped))
    else:
        sims = list(map(wrap, tqdm(wrapped)))

    utils.create_dir_if_not_exist(dirname(args.out_sims_file))
    np.savetxt(args.out_sims_file, sims)
def __init__(self, gaf_file_path: os.path, obo_file_path: os.path = None):
    # GAF 2.x column names; pd.read_csv with sep="\t" replaces the deprecated
    # pd.read_table
    self.go_annotations = pd.read_csv(
        gaf_file_path,
        sep="\t",
        names=[
            "DB", "DB_Object_ID", "DB_Object_Symbol", "Qualifier", "GO_ID",
            "DB:Reference", "Evidence Code", "With (or) From", "Aspect",
            "DB_Object_Name", "DB_Object_Synonym", "DB_Object_Type",
            "Taxon and Interacting taxon", "Date", "Assigned_By",
            "Annotation_Extension", "Gene_Product_Form_ID"
        ],
        header=None,
        dtype="string",
        comment="!",
        compression="gzip")
    self.go = obo_parser.GODag(
        obo_file_path if obo_file_path else self.retrieveOBOFile(),
        optional_attrs="relationship")
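# Usage sketch for the constructor above. The owning class is not shown, so
# "GafReader" is a hypothetical name for it; assumes a gzipped GAF file on disk:
reader = GafReader("goa_human.gaf.gz")
tp53 = reader.go_annotations[reader.go_annotations["DB_Object_Symbol"] == "TP53"]
print(tp53["GO_ID"].unique()[:5])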
def test_semantic_i88():
    """Computing basic semantic similarities between GO terms."""
    godag = obo_parser.GODag("go-basic.obo")
    goids = set(godag.keys())
    # Get all the annotations from arabidopsis.
    fin_gaf = os.path.join(REPO, "tair.gaf")
    # dnld_assc includes read_gaf
    associations = dnld_assc(fin_gaf, godag, prt=None)

    # First get the counts and information content for each GO term.
    termcounts = TermCounts(godag, associations)
    gosubdag = GoSubDag(goids, godag, tcntobj=termcounts)

    # Now we can calculate the semantic distance and semantic similarity:
    # "The semantic similarity between terms GO:0048364 and GO:0044707 is 0.25."
    go_id3 = 'GO:0048364'  # BP level-03 depth-04 root development
    go_id4 = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process
    go_root = deepest_common_ancestor([go_id3, go_id4], godag)
    sim = semantic_similarity(go_id3, go_id4, godag)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim))
    gosubdag.prt_goids([go_root, go_id3, go_id4])

    # Calculate the information content
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=go_id, INFO=infocontent))

    # Resnik's similarity measure is defined as the information content of the most
    # informative common ancestor. That is, the most specific common parent-term in
    # the GO. Then we can calculate this as follows:
    # "Resnik similarity score (GO:0048364, GO:0044707) = 4.0540784252"
    sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim_r))

    # Lin similarity score (GO:0048364, GO:0044707) = -0.607721957763
    sim_l = lin_sim(go_id3, go_id4, godag, termcounts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim_l))
def download_and_process_go(species='hsa'):
    print("Creating GO files")
    from goatools import obo_parser
    obo_file = os.path.join(id_mapping_dir, 'go.obo')
    if not os.path.exists(obo_file):
        download_current_go()
    go = obo_parser.GODag(obo_file)
    gene_to_go, go_to_gene, goid_to_name = download_ncbi_gene_file()

    go_aspect = dict()
    go_depth = dict()
    dirname = network_data_dir
    go_to_gene_name = os.path.join(dirname, '{}_goids_to_genes.p'.format(species))
    go_to_go_name = os.path.join(dirname, '{}_goids_to_goname.p'.format(species))
    gene_to_go_name = os.path.join(dirname, '{}_gene_to_go.p'.format(species))
    go_depth_name = os.path.join(dirname, '{}_godepth.p'.format(species))
    go_aspect_name = os.path.join(dirname, '{}_go_aspect.p'.format(species))

    for i in go_to_gene.keys():
        go_depth[i] = go[i].depth
        go_aspect[i] = go[i].namespace

    pickle.dump(go_to_gene, open(go_to_gene_name, 'wb'))
    pickle.dump(goid_to_name, open(go_to_go_name, 'wb'))
    pickle.dump(go_depth, open(go_depth_name, 'wb'))
    pickle.dump(go_aspect, open(go_aspect_name, 'wb'))

    # Fold the GO -> genes mapping back into gene -> GO terms
    for i in go_to_gene:
        term = i
        genes = go_to_gene[i]
        for g in genes:
            if g in gene_to_go:
                gene_to_go[g].add(term)
            else:
                gene_to_go[g] = set()
                gene_to_go[g].add(term)
    pickle.dump(gene_to_go, open(gene_to_go_name, 'wb'))
    print("Done creating GO files")
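# Usage sketch for reading one cached mapping back. The path mirrors the names
# built above ('hsa' is the default species tag) and assumes network_data_dir
# from the surrounding module:
import os
import pickle

with open(os.path.join(network_data_dir, 'hsa_gene_to_go.p'), 'rb') as fh:
    gene_to_go = pickle.load(fh)
print(len(gene_to_go), "genes with GO annotations")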
def createGOjsFile():
    # recreates GO.js file so that ontology is up to date
    import json
    from goatools import obo_parser
    file = obo_parser.GODag('data/go-basic.obo')
    GOjsDict = {}
    for goID in file:
        parents = []
        for parent in file[goID].parents:
            parents.append(parent.id)
        name = file[goID].name
        namespace = file[goID].namespace
        if not file[goID].is_obsolete:
            GOjsDict[str(file[goID].id)] = {  # unicode() in the Python 2 original
                'p': parents,
                'c': namespace,
                'n': name
            }
    with open('js/GO.js', 'w') as GOjs:
        json.dump(GOjsDict, GOjs)
def createGOjsFile():
    # recreates GO.js file so that ontology is up to date
    import json
    from goatools import obo_parser
    file = obo_parser.GODag('data/go-basic.obo', optional_attrs='relationship')
    GOjsDict = {}
    for goID in file:
        parents = []
        uppers = []
        for parent in file[goID].parents:
            parents.append(parent.id)
            uppers.append((parent.id, 'is_a'))
        # record part_of/regulates/etc. edges alongside the is_a parents
        if len(list(file[goID].relationship)) == 1:
            for relationships in list(file[goID].relationship.values()):
                relation = list(relationships)[0].id
                parents.append(relation)
                uppers.append([relation, list(file[goID].relationship)[0]])
        if len(list(file[goID].relationship)) > 1:
            i = 0
            for relationships in list(file[goID].relationship.values()):
                relation = list(relationships)[0].id
                parents.append(relation)
                uppers.append([relation, list(file[goID].relationship)[i]])
                i += 1
        name = file[goID].name
        namespace = file[goID].namespace
        if not file[goID].is_obsolete:
            GOjsDict[str(file[goID].id)] = {  # unicode() in the Python 2 original
                'p': parents,
                'c': namespace,
                'n': name,
                'u': uppers
            }
    with open('js/GO.js', 'w') as GOjs:
        json.dump(GOjsDict, GOjs)
def test_semantic_similarity():
    """Computing basic semantic similarities between GO terms."""
    godag = obo_parser.GODag("go-basic.obo")
    # Get all the annotations from arabidopsis.
    associations = read_gaf("http://geneontology.org/gene-associations/gene_association.tair.gz")

    # Now we can calculate the semantic distance and semantic similarity:
    # "The semantic similarity between terms GO:0048364 and GO:0044707 is 0.25."
    go_id3 = 'GO:0048364'  # BP level-03 depth-04 root development
    go_id4 = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process
    sim = semantic_similarity(go_id3, go_id4, godag)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim))
    print(godag[go_id3])
    print(godag[go_id4])

    # Then we can calculate the information content of the single term GO:0048364.
    # "Information content (GO:0048364) = 7.75481392334"
    # First get the counts of each GO term.
    termcounts = TermCounts(godag, associations)

    # Calculate the information content
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=go_id, INFO=infocontent))

    # Resnik's similarity measure is defined as the information content of the most
    # informative common ancestor. That is, the most specific common parent-term in
    # the GO. Then we can calculate this as follows:
    # "Resnik similarity score (GO:0048364, GO:0044707) = 4.0540784252"
    sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim_r))

    # Lin similarity score (GO:0048364, GO:0044707) = -0.607721957763
    sim_l = lin_sim(go_id3, go_id4, godag, termcounts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim_l))
def __init__(self, go_file, go_terms, gaf, omadb=None, tarfile_ortho=None,
             TermCountsFile=None):
    self.go_file = go_file

    if omadb:
        print('open oma db obj')
        from pyoma.browser import db
        from tables import open_file  # PyTables; assumed source of open_file
        h5_oma = open_file(omadb, mode="r")
        self.db_obj = db.Database(h5_oma)
        print('done')
    elif tarfile_ortho:
        # retrieve hog members from tarfile_ortho
        self.tar = tarfile.open(tarfile_ortho, "r:gz")
    else:
        raise Exception('please provide input dataset')

    # go_terms_hdf5 = h5py.File(go_terms, mode='r')
    # self.goterms2parents = go_terms_hdf5['goterms2parents']
    self.godf = pickle.loads(open(go_terms, 'rb').read())
    self.go_file = obo_parser.GODag(go_file)
    print('building gaf')
    self.gaf = goatools_utils.buildGAF(gaf)
    print('done')
    if TermCountsFile is None:
        self.termcounts = TermCounts(self.go_file, self.gaf)
    else:
        self.termcounts = pickle.loads(open(TermCountsFile, 'rb').read())
    # make a partial for the precomputed Resnik similarity
    self.resniksimpreconf = partial(goatools_utils.resnik_sim_pandas,
                                    df=self.godf, termcounts=self.termcounts)
"""
Created on Wed Jul 3 09:58:37 2019

@author: benlitterer
"""
# TODO think about how to present GOs that overlap with the BLOSUM output's GOs. Keep them in algo or not?
# TODO also maybe consider not letting B62 match with itself and use that as a reference?
from goatools import obo_parser
import argparse
import os
from collections import Counter
from bokeh.plotting import figure, show, output_file
from bokeh.palettes import RdYlBu, Category20

go_obo = "/home/benlitterer/Academic/Research/ProjectMatrices"
go = obo_parser.GODag(go_obo + "/go-basic.obo")

parser = argparse.ArgumentParser(
    description="path to directory containing your Gene Annotation Output")
parser.add_argument("blast_directory")
parser.add_argument("go_directory")
args = parser.parse_args()

# cwd = "/home/benlitterer/Academic/Research/Summer2019/referance"
goannots = {}
BLAST_OUT_DIR = args.blast_directory
GO_dict = {}
db_path = args.go_directory
# "/home/benlitterer/Academic/Research/Summer2019/testAllMatricesMediumInput/MappedGOs01Cutoff/"
def _init_gene2symbol_dict():  # def line inferred from the call below
    genes = np.array(list(_gene2go.keys()))  # keys() has no .shape; convert for array_split
    query_result_list = []
    for genes_chunk in np.array_split(genes, max(genes.shape[0] // 1000, 1)):
        query_res = mg.querymany(genes_chunk,
                                 scopes='entrezgene',
                                 fields='entrezgene,symbol',
                                 species='human',
                                 entrezonly=True,
                                 as_dataframe=True,
                                 df_index=False,
                                 verbose=False)
        if 'notfound' in query_res.columns:
            query_res = query_res[query_res.notfound != True]  # ignore PEP8 warnings.
        query_result_list.append(query_res)
    df_res = pd.concat(query_result_list)
    res = dict(zip(df_res.entrezgene, df_res.symbol))
    return res


with HidePrints():
    _go_dag = obo_parser.GODag(go_obo_path)
    _gaf = read_gaf(gaf_path, prt=None)
    _termcounts = TermCounts(_go_dag, _gaf)
    _gene2go = read_ncbi_gene2go(gene2go_path)

_gene2symbol = _init_gene2symbol_dict()
_symbol2gene = {symbol: gene for gene, symbol in _gene2symbol.items()}


def get_genes():
    return list(_gene2go.keys())


def get_symbols():
    return list(_gene2symbol.values())
from goatools.semantic import TermCounts, ic, resnik_sim, semantic_similarity

from magine.enrichment.ontology_analysis import MagineGO
from magine.data.storage import id_mapping_dir

obo_file = os.path.join(id_mapping_dir, 'go.obo')

if not os.path.exists(obo_file):
    print("Using ontology for first time")
    print("Downloading files")
    from magine.enrichment.databases.gene_ontology import \
        download_and_process_go
    download_and_process_go()
    assert os.path.exists(obo_file)

go = obo_parser.GODag(obo_file)
mg = MagineGO()

print("Loading termcounts")
associations = mg.gene_to_go
termcounts = TermCounts(go, associations)
print("Loaded termcounts")


def path_to_root(go_term):
    """
    Creates networkx graph from provided term to root term

    Parameters
    ----------
    go_term : str
lib.log.info("Compiling all annotations for each genome")
# get orthology into dictionary
orthoDict = {}
if len(args.input) > 1:
    with open(orthologs, 'r') as input:  # 'rU' mode is deprecated in Python 3
        for line in input:
            line = line.replace('\n', '')
            col = line.split('\t')
            genes = col[-1].split(', ')
            for i in genes:
                orthoDict[i] = col[0]

# get GO associations into dictionary as well
with lib.suppress_stdout_stderr():
    goLookup = obo_parser.GODag(os.path.join(parentdir, 'DB', 'go.obo'))
goDict = {}
with open(os.path.join(go_folder, 'associations.txt'), 'r') as input:
    for line in input:
        line = line.replace('\n', '')
        col = line.split('\t')
        gos = col[1].split(';')
        goList = []
        for i in gos:
            try:
                description = i + ' ' + goLookup[i].name
            except KeyError:
                print('%s not found in go.obo, try to download updated go file' % i)
                description = i
            goList.append(description)
        goDict[col[0]] = goList
                if go_id not in go_anno_dict[gene_id][0]:
                    go_anno_dict[gene_id][0].append(go_id)
                    go_anno_dict[gene_id][1].append(go_domain)
                    go_anno_dict[gene_id][2].append(go_des)
                if gene_inter_id not in go_anno_dict[gene_id][2]:
                    go_anno_dict[gene_id][3].append(gene_inter_id)
                    go_anno_dict[gene_id][4].append(gene_inter_des)
    json_file = '%s.json' % args.biomart
    with open(json_file, 'w') as json_file_info:
        json.dump(go_anno_dict, json_file_info)
elif args.go:
    if args.go.endswith('json'):
        with open(args.go) as go_info:
            go_anno_dict = json.load(go_info)
    else:
        go_db_info = obo_parser.GODag(go_db)
        # the original used the Python 2 'file' builtin; open() replaces it
        reader = csv.reader(open(args.go))
        for n, each_record in enumerate(reader):
            if n != 0:
                gene_id = each_record[0]
                go_id = each_record[1]
                go_domain = go_db_info[go_id].namespace
                go_des = go_db_info[go_id].name
                if gene_id not in go_anno_dict:
                    go_anno_dict[gene_id] = [[go_id], [go_domain], [go_des]]
                else:
                    if go_id not in go_anno_dict[gene_id][0]:
                        go_anno_dict[gene_id][0].append(go_id)
                        go_anno_dict[gene_id][1].append(go_domain)
                        go_anno_dict[gene_id][2].append(go_des)
        go_json_file = '%s.json' % args.go
# convert IDs to UniProtKB (https://www.uniprot.org/uploadlists/); saved as
# "UniProtIDs.csv"; proteins not able to convert: 'not_in_proteins.txt'
UP_ID = np.genfromtxt(fname='UniProtIDs.csv', names=True, delimiter=',',
                      dtype=['U15', 'U6', 'U25', 'U25', 'U25', 'U25'])
ensembl_to_up = {j: UP_ID['UniProtID'][i] for i, j in enumerate(UP_ID['EnsemblID'])}
up_to_ensembl = {UP_ID['UniProtID'][i]: j for i, j in enumerate(UP_ID['EnsemblID'])}

newEnsemblIDs = []
for i in EnsemblIDs:
    if i in UP_ID['EnsemblID']:
        newEnsemblIDs.append(i)

C_int_UP = list(map(lambda x: ensembl_to_up[x], newEnsemblIDs))

# Enrichment Analysis
go = obo.GODag('/disks/strw13/DBDM/A4_2/go-basic.obo')

with gzip.open('goa_human.gaf.gz', 'rt') as fp:
    funcs = {}
    for entry in gafiterator(fp):
        uniprot_id = entry.pop('DB_Object_ID')
        funcs[uniprot_id] = entry

pop = funcs.keys()
assoc = {}
for x in funcs:
    if x not in assoc:
        assoc[x] = set()
    assoc[x].add(str(funcs[x]['GO_ID']))
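# A minimal sketch of the enrichment step that typically follows from the
# population/associations built above. The study set C_int_UP comes from the
# snippet; the alpha and method choices here are illustrative:
from goatools.go_enrichment import GOEnrichmentStudy

g = GOEnrichmentStudy(pop, assoc, go, propagate_counts=True,
                      alpha=0.05, methods=['fdr_bh'])
results = g.run_study(set(C_int_UP))
significant = [r for r in results if r.p_fdr_bh < 0.05]
print(len(significant), "significantly enriched GO terms")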
    utils.parallel_process(get_sps, destnodes_sample, n_jobs=args.N_cores), [])
logging.info('Num of all paths: {}'.format(len(all_paths)))

fc_paths = []
for i in trange(len(all_paths)):
    fullpath = all_paths[i]
    if len(fullpath) > 2:
        path = all_paths[i][1:-1]
        if np.all([node in fcnodes for node in path]):
            fc_paths.append(fullpath)
logging.info('Num of FC paths: {}'.format(len(fc_paths)))

go = obo_parser.GODag(args.obo_file)
gene2go = read_ncbi_gene2go(args.gene2go_file, taxids=[9606])
termcounts = TermCounts(go, gene2go)


def get_sim(genes_pair):
    # sim_measure = lin_sim
    i, j = genes_pair[0], genes_pair[1]
    i_go = [goterm for goterm in gene2go[i] if goterm in go]
    j_go = [goterm for goterm in gene2go[j] if goterm in go]
    sims = []
    for i_go_term in i_go:

        def wrap(j_go_term):
            return resnik_sim(i_go_term, j_go_term, go, termcounts)

        simlist = [sim for sim in map(wrap, j_go) if sim is not None]
except OSError as e:
    if (e.errno != 17):
        raise e
else:
    raise Exception(
        'Data path (' + data_folder + ') exists as a file. '
        'Please rename, remove or change the desired location of the data path.'
    )

# Check if the file exists already
if (not os.path.isfile(data_folder + '/go-basic.obo')):
    go_obo = wget.download(go_obo_url, data_folder + '/go-basic.obo')
else:
    go_obo = data_folder + '/go-basic.obo'
print(go_obo)

go = obo_parser.GODag(go_obo)

go_id = 'GO:0048527'
go_term = go[go_id]
print(go_term)
print('GO term name: {}'.format(go_term.name))
print('GO term namespace: {}'.format(go_term.namespace))

for term in go_term.parents:
    print(term)
for term in go_term.children:
    print(term)


def transitive_closure(go_term, go):
    go_term_set = set()
    find_parents(go_term, go, go_term_set)
    find_children(go_term, go, go_term_set)
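# find_parents and find_children are called by transitive_closure above but not
# shown in this fragment; a minimal sketch of what they are assumed to do
# (recursively collect ancestors and descendants into go_term_set):
def find_parents(term, go, go_term_set):
    for parent in term.parents:
        go_term_set.add(parent)
        find_parents(parent, go, go_term_set)


def find_children(term, go, go_term_set):
    for child in term.children:
        go_term_set.add(child)
        find_children(child, go, go_term_set)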