예제 #1
0
 def __init__(self) -> None:
     """Load the Gene Ontology graph, downloading the OBO file if it is absent.

     The OBO file is looked up as ``gene_ontology.1_2.obo`` in the current
     working directory.  When it is missing it is downloaded to that exact
     name, so the parse that follows (and later runs) can find it.
     """
     obo_filename = "gene_ontology.1_2.obo"
     if not os.path.exists(obo_filename):
         logger.info("Downloading Gene Ontology OBO...")
         # The target filename must be passed explicitly: without it
         # urlretrieve stores the download in a temporary file and the
         # parse below fails because the expected file never appears.
         request.urlretrieve(
             "http://www.geneontology.org/ontology/obo_format_1_2/gene_ontology.1_2.obo",
             obo_filename,
         )
         logger.info("Done downloading OBO.")
     # This will be used in query_obo_term
     self.obo = obo_parser.GODag(obo_filename)
     self.substruct = UniprotGoTerms()
예제 #2
0
def GO_Level_Function(Final_df, obo_file_path, save_path):
    """Add the GO DAG level of every term to a table and save it as TSV.

    Parameters:
    - Final_df: DataFrame with at least the columns ``GO_term`` and
      ``Total_K`` (the latter is used for sorting).
    - obo_file_path: path of the OBO file parsed with ``obo_parser.GODag``.
    - save_path: destination of the tab-separated output file.

    Rows whose GO term is not present in the parsed ontology are kept in the
    output with a missing (NaN) ``Level`` so that users can decide later
    whether to include them in their analyses.  The caller's frame is not
    modified.
    """
    # Load the obo data to the parser:
    go_dag = obo_parser.GODag(obo_file_path)

    # We don't know the level of these terms (they are not in the parser):
    missing_GO_terms = set(Final_df["GO_term"]) - set(go_dag)

    print("You have a total of " + str(len(missing_GO_terms)) + " without level info")

    # Split on the ORIGINAL frame.  Selecting the missing rows from the
    # already-filtered frame would always yield an empty result and silently
    # drop those rows from the output.  Copies avoid writing into a slice.
    is_missing = Final_df["GO_term"].isin(missing_GO_terms)
    Final_df_levels = Final_df[~is_missing].copy()
    Final_df_levels_deleted = Final_df[is_missing].copy()

    # Get the level of the terms that we can:
    Final_df_levels['Level'] = [go_dag[term].level for term in Final_df_levels['GO_term']]

    # Build the final DataFrame; rows without a known level get NaN there.
    Final_df_Levels = pd.concat([Final_df_levels, Final_df_levels_deleted])
    Final_df_Levels = Final_df_Levels.sort_values(by=['Total_K'])
    Final_df_Levels = Final_df_Levels.reset_index(drop=True)

    # Save the results:
    Final_df_Levels.to_csv(save_path, sep="\t", header=True, index=False)
예제 #3
0
 def goatools(self):
     """Return the ontology at ``self.path`` parsed into goatools' format.

     Project page: https://github.com/tanghaibao/goatools
     To install: $ pip install goatools
     """
     # Imported inside the method, so goatools is only required when this
     # accessor is actually called.
     from goatools.obo_parser import GODag
     parsed_dag = GODag(self.path)
     return parsed_dag
예제 #4
0
def display_topics(model, feature_names, no_top_words, go_file):
    """
    Print the top GO terms of every topic and collect the ones that are known.

    Params:
    - model: object exposing ``components_`` (one weight row per topic)
    - feature_names: sequence mapping a feature index to a GO term ID
    - no_top_words: number of top "words" (GO terms) to display for each topic
    - go_file: OBO file handed to ``obo_parser.GODag``
    Returns:
    - sorted, unique list of the displayed GO terms (terms missing from the
      ontology are neither printed nor returned)
    """
    ontology = obo_parser.GODag(go_file)
    displayed = set()

    for topic_number, weights in enumerate(model.components_, start=1):
        # Indices of the largest weights, in descending order of weight.
        top_indices = weights.argsort()[:-no_top_words - 1:-1]
        top_terms = [feature_names[index] for index in top_indices]

        print("Topic {:d}:".format(topic_number))

        for rank, term in enumerate(top_terms, start=1):
            if term not in ontology:
                continue
            record = ontology[term]
            displayed.add(term)
            line = '{:2} {} ({}) = {} [{}]'.format(
                rank, term, record.namespace, record.name, record.depth)
            print(line)
        print()

    return sorted(displayed)
def get_GO(optional_attrs=None):
    '''
    Fetch GO Basic to a local file if not present and return it parsed.

    The OBO file is kept at ``<current working directory>/data/go-basic.obo``
    and is downloaded with ``wget`` only when it is not already there.

    Parameters:
    - optional_attrs: forwarded unchanged to ``obo_parser.GODag``.

    Returns the ``GODag`` built from the local OBO file.

    Raises ``Exception`` when the data path exists but is a regular file.
    '''
    go_obo_url = 'http://purl.obolibrary.org/obo/go/go-basic.obo'
    data_folder = os.getcwd() + '/data'
    local_obo = data_folder + '/go-basic.obo'

    if os.path.isfile(data_folder):
        raise Exception(
            'Data path (' + data_folder + ') exists as a file. '
            'Please rename, remove or change the desired location of the data path.'
        )
    # mkdir -p: no error if the folder already exists (replaces the manual
    # try/except on the magic errno value 17).
    os.makedirs(data_folder, exist_ok=True)

    # Download only when the file does not exist yet
    if os.path.isfile(local_obo):
        go_obo = local_obo
    else:
        go_obo = wget.download(go_obo_url, local_obo)
    return obo_parser.GODag(go_obo, optional_attrs=optional_attrs)
예제 #6
0
    def perform_gene_enrichment_analysis(self, metagene_matrix, method='fdr'):
        """Run a GO enrichment study per component and save all results.

        Parameters:
        - metagene_matrix: matrix whose second dimension (``shape[1]``) is the
          number of components; passed on to ``ranked_genes_by_component``.
        - method: multiple-testing correction method handed to
          ``GOEnrichmentStudy`` (default ``'fdr'``).

        The merged per-component results are written as a tab-separated file
        ``<cache_dir><prefix>_gea_all.tsv``.  Nothing is returned.
        """
        n_comps = metagene_matrix.shape[1]

        # Download ontology and annotations, if necessary
        self.download_and_cache_resources()
        gene_ontology = obo_parser.GODag('../DownloadedResources/go-basic.obo')

        # Build gene symbol -> set of GO IDs from the human annotations.
        # Every GAF entry contributes its GO ID; keeping only one entry per
        # symbol (as a plain dict keyed on the symbol does) would discard all
        # but the last annotation of each gene.
        associations = {}
        with gzip.open('../DownloadedResources/goa_human.gaf.gz', 'rt') as gaf:
            for entry in GOA.gafiterator(gaf):
                symbol = entry['DB_Object_Symbol']
                associations.setdefault(symbol, set()).add(str(entry['GO_ID']))

        # Our population is the set of genes we are analysing
        population = self.gene_symbols()
        print("We have %d genes in our population" % len(population))

        gea = GOEnrichmentStudy(population,
                                associations,
                                gene_ontology,
                                propagate_counts=True,
                                alpha=0.05,
                                methods=[method])
        gea_results_by_component = {}
        rankings = self.ranked_genes_by_component(metagene_matrix)
        for ci in range(n_comps):
            study_genes = rankings[ci]
            print('\nComp. %d: %s...' % (ci, str(study_genes[:10])))
            gea_results_by_component[ci] = gea.run_study(study_genes)

        # Get results into a dataframe per component; components for which
        # the helper returns None are skipped.
        gea_results_df_by_component = []
        for ci in range(n_comps):
            ge_df = self._perform_gene_enrichment_analysis_one_component(
                ci, gea_results_by_component, gea)
            if ge_df is not None:
                gea_results_df_by_component.append(ge_df)

        # Merge the per-component dataframes into a single one.
        # DataFrame.append was removed in pandas 2.0; pd.concat rejects an
        # empty list, so fall back to an empty frame in that case.
        if gea_results_df_by_component:
            gea_all_sig_results_df = pd.concat(gea_results_df_by_component)
        else:
            gea_all_sig_results_df = pd.DataFrame()

        gea_all_sig_results_df.to_csv(self.cache_dir +
                                      '%s_gea_all.tsv' % self.prefix,
                                      sep='\t')
예제 #7
0
    def __init__(self,
                 tax_id=9606,
                 logger=None,
                 force_update=False,
                 go_dir=DEFAULT_GO_DIR,
                 bg_genes=None):
        """Prepare the GO directory and parse the ontology and gene2go files.

        Parameters:
        - tax_id: NCBI taxonomy id used to filter the gene2go associations.
        - logger: logger to use; a console logger named after the class is
          created when omitted.
        - force_update: forwarded to ``check_and_get_obo`` and
          ``check_and_get_gaf``.
        - go_dir: directory holding the GO files; created when missing.
        - bg_genes: optional background genes, registered via
          ``set_bg_genes`` when given.
        """
        # gene_converter can be used to enable automatic gene conversion
        self.gene_converter = None
        self.logger = logger or log.get_console_logger(self.__class__.__name__)
        self.tax_id = tax_id
        if not os.path.isdir(go_dir):
            # Logger.warn is a deprecated alias; warning() is the supported name.
            self.logger.warning("Creating master GO directory at %s.", go_dir)
            os.makedirs(go_dir)
        else:
            self.logger.info("Using existing GO directory at %s.", go_dir)
        self.base_dir = go_dir

        # get filenames and parse both GAF and OBO
        self.obo_fn = self.check_and_get_obo(force_update=force_update)
        self.gaf_fn = self.check_and_get_gaf(force_update=force_update)
        self.obo = obo_parser.GODag(self.obo_fn)

        self.gaf = associations.read_ncbi_gene2go(self.gaf_fn,
                                                  taxids=[self.tax_id])
        # NOTE(review): the message says "human" regardless of tax_id.
        self.logger.info("{N:,} annotated human genes".format(N=len(self.gaf)))

        self.bg_genes = bg_genes
        if self.bg_genes is not None:
            self.set_bg_genes(bg_genes)
예제 #8
0
    def get_GO_data(self):
        """
        Make the GO basic OBO file available locally and parse it.

        The file is expected at ``<self.data_path>/data/go-basic.obo`` and is
        downloaded with ``wget`` only when it is not there yet.  The file
        path is stored in ``self.go_obo`` and the parsed graph in
        ``self.go_db``.

        Credits:
        https://nbviewer.jupyter.org/urls/dessimozlab.github.io/go-handbook/GO%20Tutorial%20in%20Python%20-%20Solutions.ipynb

        Returns
        -------
        None
        """
        print("Getting GO data folder.")

        target_dir = join(self.data_path, "data")
        create_dir(target_dir)
        local_obo = join(target_dir, "go-basic.obo")

        if isfile(local_obo):
            # Already downloaded on a previous run.
            self.go_obo = local_obo
        else:
            self.go_obo = wget.download(
                'http://purl.obolibrary.org/obo/go/go-basic.obo', local_obo)
        self.go_db = obo_parser.GODag(self.go_obo)
예제 #9
0
def prepare_GO_data(adata, gene2go, GO_file, GO_min_genes=500, GO_max_genes=None, GO_min_level=3, GO_max_level=3):
    """
    Select the GO terms whose gene sets pass the size and level filters.

    Annotations are first restricted to the genes present in ``adata`` and
    then propagated through the GO hierarchy, so parent categories are
    considered as well.

    adata: object whose ``var_names`` lists the gene IDs to keep
    gene2go: mapping of gene IDs to GO terms
    GO_file: GO ontology obo file
    GO_min_genes: minimum number of genes assigned to a GO term (default: 500)
    GO_max_genes: maximum number of genes assigned to a GO term (default: None)
    GO_min_level: minimum level required to keep a GO term (default: 3)
    GO_max_level: maximum level allowed to keep a GO term (default: 3)
    return: dictionary mapping each retained GO term to its genes; the
            filter decision itself is made by ``check_conditions``
    """
    dag = obo_parser.GODag(obo_file=GO_file)
    known_genes = set(adata.var_names)

    gene2go = {
        gene: terms for gene, terms in gene2go.items() if gene in known_genes
    }
    dag.update_association(gene2go)  # propagate through hierarchy
    go2gene = reverse_association(gene2go)

    filtered_go2gene = {
        go_id: members
        for go_id, members in go2gene.items()
        if check_conditions(dag.get(go_id), len(members), GO_min_genes,
                            GO_max_genes, GO_min_level, GO_max_level)
    }
    print("Num filtered GOs:", len(filtered_go2gene))
    return filtered_go2gene
예제 #10
0
파일: goa.py 프로젝트: nffaruk/ProDy
def parseOBO(**kwargs):
    """Parse a GO OBO file containing the GO itself.
    See `OBO`_ for more information on the file format.

    The file is stored as ``go-basic.obo`` inside the data folder and is
    downloaded first when it is not there yet.

    :arg go_obo_url: URL to download the OBO file from, default is
        ``http://purl.obolibrary.org/obo/go/go-basic.obo``
    :arg data_folder: folder for the OBO file, default is ``./Data``
        relative to the current working directory

    Raises :exc:`ImportError` when GOATools is not installed and
    :exc:`Exception` when the data folder path exists as a regular file.

    .. _OBO: http://owlcollab.github.io/oboformat/doc/obo-syntax.html
    """
    try:
        from goatools import obo_parser
    except ImportError:
        raise ImportError('GOATools needs to be installed to use parseOBO')

    go_obo_url = kwargs.get('go_obo_url', None)
    if go_obo_url is None:
        go_obo_url = 'http://purl.obolibrary.org/obo/go/go-basic.obo'

    data_folder = kwargs.get('data_folder', None)
    if data_folder is None:
        data_folder = os.getcwd() + '/Data'

    # Check if we have the data directory already
    if os.path.isfile(data_folder):
        raise Exception(
            'Data path (' + data_folder + ') exists as a file. '
            'Please rename, remove or change the desired location of the data path.'
        )
    # Emulate mkdir -p (no error if folder exists)
    try:
        os.mkdir(data_folder)
    except OSError as e:
        if e.errno != 17:  # 17 == EEXIST
            raise

    # Defined before the download branch: it used to be assigned only when
    # the file already existed, so a fresh download ended in an
    # UnboundLocalError at the final GODag call.
    go_obo = data_folder + '/go-basic.obo'

    # Download only if the file does not exist already
    if not os.path.isfile(go_obo):
        try:
            handle = openURL(go_obo_url)
        except Exception as err:
            LOGGER.warn('{0} download failed ({1}).'.format(
                go_obo_url, str(err)))
        else:
            data = handle.read()
            if len(data):
                with open(go_obo, 'w+b') as obofile:
                    obofile.write(data)

                LOGGER.debug('{0} downloaded ({1})'.format(
                    go_obo_url, sympath(go_obo)))
            else:
                LOGGER.warn(
                    '{0} download failed, reason unknown.'.format(go_obo_url))

    # When the download failed the file is still missing and the parser
    # reports that itself.
    return obo_parser.GODag(go_obo)
예제 #11
0
def fetch_go_hierarcy():
    """Export gene, is-a and relationship links of the GO graph.

    The OBO file and the gene-GO association file are downloaded into
    ``constants.GO_DIR`` when missing.  For every GO id that has human
    (taxid 9606) gene associations and is present in the OBO graph, three
    kinds of tab-separated rows are written to a time-stamped file in
    ``constants.OUTPUT_GLOBAL_DIR``:

    - ``genes <gene id> <GO id>``
    - ``is a <child GO id> <GO id>``
    - ``<relationship type> <related GO id> <GO id>``

    Returns the tuple ``(genes, isa, relship)`` holding the same links as
    lists of tuples.
    """
    obo_file_location = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    if not os.path.exists(obo_file_location):
        wget.download(constants.GO_OBO_URL, obo_file_location)

    # Also load the 'relationship' edges, not only is_a.
    go = obo_parser.GODag(obo_file_location,
                          optional_attrs=['relationship'])

    # print(...) with one argument behaves the same on Python 2 and 3.
    print("Downloading gene-GO associations")
    association_file_location = os.path.join(
        constants.GO_DIR, constants.GO_ASSOCIATION_FILE_NAME)
    if not os.path.exists(association_file_location):
        wget.download(constants.GO_ASSOCIATION_GENE2GEO_URL,
                      association_file_location)

    print("Loading gene-GO associations")
    go2geneids_human = read_ncbi_gene2go(association_file_location,
                                         taxids=[9606],
                                         go2geneids=True)

    print("Writing out GO child-parent links")
    if not os.path.exists(constants.OUTPUT_GLOBAL_DIR):
        os.makedirs(constants.OUTPUT_GLOBAL_DIR)

    out_fname = "go_output_{}_{}.txt".format(constants.CANCER_TYPE,
                                             time.time())
    genes = []
    isa = []
    relship = []
    with open(os.path.join(constants.OUTPUT_GLOBAL_DIR, out_fname), 'w') as o:
        for goid in go2geneids_human.keys():
            if goid not in go:
                print("GO obo file does not contain {}".format(goid))
                continue
            entry = go[goid]
            for gene in go2geneids_human[entry.id]:
                genes.append((str(gene), entry.id))
                o.write("{}\t{}\t{}\n".format("genes", *genes[-1]))
            for c in entry.children:
                isa.append((c.id, entry.id))
                o.write("{}\t{}\t{}\n".format("is a", *isa[-1]))
            rels = entry.relationship_rev
            for rtype in rels.keys():
                for r in rels[rtype]:
                    relship.append((rtype, r.id, entry.id))
                    # The tuple already starts with rtype; passing rtype
                    # again wrote "rtype, rtype, r.id" and dropped entry.id.
                    o.write("{}\t{}\t{}\n".format(*relship[-1]))

    return (genes, isa, relship)
def test_top_parent(prt=sys.stdout):
    """Semantic Similarity test for Issue #86.

    Loads ``data/i86.obo`` from the directory above this file and runs the
    three path checks on it; ``prt`` is passed on to each check.
    """
    this_dir = os.path.dirname(os.path.abspath(__file__))
    obo_path = os.path.join(this_dir, "..", "data/i86.obo")
    godag = obo_parser.GODag(obo_path)

    # Calculate the semantic distance and semantic similarity:
    for check in (_test_path_same, _test_path_parallel):
        check(godag, prt)
    # The last check additionally receives the branch distance of 5.
    _test_path_bp_mf(5, godag, prt)
    sys.stdout.write("TESTS PASSed: similarity_top_parent\n")
예제 #13
0
    def _load_go_db(data_dir, slim_down):
        """
        Load the GO databases with goatools' obo_parser.

        The full GO is always loaded; the slim GO is loaded only when
        slim_down is true.  Missing files are reported through
        logging.error, after which loading is still attempted.

        :param data_dir: Data directory
        :param slim_down: Whether slim database is going to be used or not
        :return: A tuple (full GO, slim GO); the second item is None when
            slim_down is false
        """
        obo_path, slim_path = GeneOntologyDb._define_data_paths(data_dir)
        both_present = os.path.exists(obo_path) and os.path.exists(slim_path)
        if not both_present:
            logging.error(
                'GO files not found in specified directory.\n'
                'Please use the command >metaquantome db ...  to download the files.'
            )
        # read gos
        full_dag = obo_parser.GODag(obo_path)
        slim_dag = obo_parser.GODag(slim_path) if slim_down else None
        return full_dag, slim_dag
예제 #14
0
    def __init__(self, basename, method):
        """Set up cache and plot directories and load the Gene Ontology.

        basename selects the sub-directories below ../Cache and ../Plots and
        decides the gene column name ('Gene_ID' when it contains 'Canon',
        otherwise 'GeneENSG').  method is stored unchanged.
        """
        self.basename = basename
        self.method = method
        self._gene_symbols = None
        self.cache_dir = '../Cache/%s/GeneEnrichment/' % self.basename
        self.plots_dir = '../Plots/%s/GeneEnrichment/' % self.basename
        if 'Canon' in self.basename:
            self.gene_column_name = 'Gene_ID'
        else:
            self.gene_column_name = 'GeneENSG'
        self._go_enrichment_study = None  # will be lazily evaluated

        for directory in (self.cache_dir, self.plots_dir):
            os.makedirs(directory, exist_ok=True)

        # Download ontology and annotations, if necessary
        self.download_and_cache_resources()
        ontology_path = '../DownloadedResources/go-basic.obo'
        self._gene_ontology = obo_parser.GODag(ontology_path)
예제 #15
0
def main(args):
    """Compute one similarity value per path and save them with numpy.

    ``args`` supplies ``paths_file``, ``obo_file``, ``gene2go_file``,
    ``namespace`` (None, 'cc', 'mf' or 'bp'), ``n_cores`` and
    ``out_sims_file``.  Raises ValueError for any other namespace value.
    """
    log_handlers = [
        logging.FileHandler("../logs/report.log"),
        logging.StreamHandler()
    ]
    logging.basicConfig(
        level=logging.INFO,
        format='%(module)s:%(levelname)s:%(asctime)s:%(message)s',
        handlers=log_handlers)
    logging.info(args)

    paths = utils.read_paths(args.paths_file)

    go = obo_parser.GODag(args.obo_file)
    gene2go = read_ncbi_gene2go(args.gene2go_file, taxids=[9606])
    # Term counts are built from the full ontology, before any filtering.
    termcounts = TermCounts(go, gene2go)

    namespace_names = {
        'cc': 'cellular_component',
        'mf': 'molecular_function',
        'bp': 'biological_process',
    }
    if args.namespace is not None:
        if args.namespace not in namespace_names:
            raise ValueError('namespace can be only cc, mf or bp')
        wanted = namespace_names[args.namespace]
        # From here on ``go`` is a plain dict restricted to one namespace.
        go = {
            go_term: values
            for go_term, values in go.items()
            if values.namespace == wanted
        }

    wrapped = [[path, go, gene2go, termcounts] for path in paths]
    if args.n_cores > 1:
        sims = list(p_map(wrap, wrapped))
    else:
        sims = list(map(wrap, tqdm(wrapped)))

    utils.create_dir_if_not_exist(dirname(args.out_sims_file))
    np.savetxt(args.out_sims_file, sims)
예제 #16
0
 def __init__(self, gaf_file_path: os.path, obo_file_path: os.path = None):
     """Read a gzip-compressed GAF table and parse the GO graph.

     gaf_file_path is read with pandas: lines starting with "!" are treated
     as comments, every column is a string, and the 17 column names are
     assigned below.  When obo_file_path is falsy the OBO file is obtained
     from ``self.retrieveOBOFile()`` instead.
     """
     gaf_columns = [
         "DB", "DB_Object_ID", "DB_Object_Symbol", "Qualifier", "GO_ID",
         "DB:Reference", "Evidence Code", "With (or) From", "Aspect",
         "DB_Object_Name", "DB_Object_Synonym", "DB_Object_Type",
         "Taxon and Interacting taxon", "Date", "Assigned_By",
         "Annotation_Extension", "Gene_Product_Form_ID"
     ]
     self.go_annotations = pd.read_table(gaf_file_path,
                                         names=gaf_columns,
                                         header=None,
                                         dtype="string",
                                         comment="!",
                                         compression="gzip")
     if obo_file_path:
         obo_source = obo_file_path
     else:
         obo_source = self.retrieveOBOFile()
     self.go = obo_parser.GODag(obo_source, optional_attrs="relationship")
예제 #17
0
def test_semantic_i88():
    """Computing basic semantic similarities between GO terms.

    Loads ``go-basic.obo`` and the ``tair.gaf`` associations, then prints the
    semantic similarity, the information content, and the Resnik and Lin
    similarity scores for two fixed GO terms.
    """
    godag = obo_parser.GODag("go-basic.obo")
    # Every key of the DAG is used.  An earlier assignment that kept only
    # keys equal to the term's own id was dead code (immediately overwritten)
    # and has been removed.
    goids = set(godag.keys())
    # Get all the annotations from arabidopsis.
    fin_gaf = os.path.join(REPO, "tair.gaf")
    # dnld_assc includes read_gaf
    associations = dnld_assc(fin_gaf, godag, prt=None)

    # First get the counts and information content for each GO term.
    termcounts = TermCounts(godag, associations)
    gosubdag = GoSubDag(goids, godag, tcntobj=termcounts)

    # Now we can calculate the semantic distance and semantic similarity, as so:
    #       "The semantic similarity between terms GO:0048364 and GO:0044707 is 0.25.
    go_id3 = 'GO:0048364'  # BP level-03 depth-04 root development
    go_id4 = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process
    go_root = deepest_common_ancestor([go_id3, go_id4], godag)
    sim = semantic_similarity(go_id3, go_id4, godag)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.
          format(GO1=go_id3, GO2=go_id4, VAL=sim))
    gosubdag.prt_goids([go_root, go_id3, go_id4])

    # Calculate the information content
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=go_id,
                                                           INFO=infocontent))

    # Resnik's similarity measure is defined as the information content of the most
    # informative common ancestor. That is, the most specific common parent-term in
    # the GO. Then we can calculate this as follows:
    #       "Resnik similarity score (GO:0048364, GO:0044707) = 4.0540784252
    sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(GO1=go_id3,
                                                                  GO2=go_id4,
                                                                  VAL=sim_r))

    # Lin similarity score (GO:0048364, GO:0044707) = -0.607721957763
    sim_l = lin_sim(go_id3, go_id4, godag, termcounts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(GO1=go_id3,
                                                               GO2=go_id4,
                                                               VAL=sim_l))
예제 #18
0
def download_and_process_go(species='hsa'):
    """Build the pickled GO lookup tables for one species prefix.

    The ontology file ``go.obo`` in ``id_mapping_dir`` is downloaded when
    missing.  Five pickle files, each prefixed with ``species``, are written
    to ``network_data_dir``: GO id -> genes, GO id -> name, GO id -> depth,
    GO id -> namespace, and gene -> GO ids.

    A GO id listed in the gene file but absent from the ontology raises
    ``KeyError``.
    """
    print("Creating GO files")
    from goatools import obo_parser
    obo_file = os.path.join(id_mapping_dir, 'go.obo')
    if not os.path.exists(obo_file):
        download_current_go()
    go = obo_parser.GODag(obo_file)
    gene_to_go, go_to_gene, goid_to_name = download_ncbi_gene_file()

    go_aspect = dict()
    go_depth = dict()

    dirname = network_data_dir

    go_to_gene_name = os.path.join(dirname,
                                   '{}_goids_to_genes.p'.format(species))
    go_to_go_name = os.path.join(dirname,
                                 '{}_goids_to_goname.p'.format(species))
    gene_to_go_name = os.path.join(dirname, '{}_gene_to_go.p'.format(species))
    go_depth_name = os.path.join(dirname, '{}_godepth.p'.format(species))
    go_aspect_name = os.path.join(dirname, '{}_go_aspect.p'.format(species))
    for go_id in go_to_gene:
        term = go[go_id]
        go_depth[go_id] = term.depth
        go_aspect[go_id] = term.namespace

    # Each file is written inside a with-block so the handle is flushed and
    # closed deterministically instead of being left to the garbage collector.
    for payload, path in ((go_to_gene, go_to_gene_name),
                          (goid_to_name, go_to_go_name),
                          (go_depth, go_depth_name),
                          (go_aspect, go_aspect_name)):
        with open(path, 'wb') as handle:
            pickle.dump(payload, handle)

    # Make sure every gene listed under a GO id also maps back to that id.
    for go_id, genes in go_to_gene.items():
        for g in genes:
            gene_to_go.setdefault(g, set()).add(go_id)
    with open(gene_to_go_name, 'wb') as handle:
        pickle.dump(gene_to_go, handle)
    print("Done creating GO files")
예제 #19
0
def createGOjsFile():
    """Recreate js/GO.js from data/go-basic.obo so the ontology is up to date.

    The file holds one JSON object keyed by GO id.  Every non-obsolete term
    maps to ``{'p': parent ids, 'c': namespace, 'n': name}``.
    """
    import json
    from goatools import obo_parser

    # Named go_dag rather than ``file`` to avoid shadowing the builtin.
    go_dag = obo_parser.GODag('data/go-basic.obo')

    GOjsDict = {}
    for goID in go_dag:
        term = go_dag[goID]
        if term.is_obsolete:
            continue
        # NOTE(review): ``unicode`` is the Python 2 text type; presumably
        # this function targets Python 2 — confirm before porting.
        GOjsDict[unicode(term.id)] = {
            'p': [parent.id for parent in term.parents],
            'c': term.namespace,
            'n': term.name
        }
    # The with-block guarantees the output is flushed and closed.
    with open('js/GO.js', 'w') as GOjs:
        json.dump(GOjsDict, GOjs)
예제 #20
0
def createGOjsFile():
    """Recreate js/GO.js from data/go-basic.obo, including relationships.

    The file holds one JSON object keyed by GO id.  Every non-obsolete term
    maps to ``{'p': ..., 'c': namespace, 'n': name, 'u': ...}`` where

    - ``'p'`` lists the ids of the is_a parents followed by one related term
      id per relationship type, and
    - ``'u'`` lists the same ids paired with ``'is_a'`` or the relationship
      type.
    """
    import json
    from goatools import obo_parser

    # Named go_dag rather than ``file`` to avoid shadowing the builtin.
    go_dag = obo_parser.GODag('data/go-basic.obo', 'relationship')

    GOjsDict = {}
    for goID in go_dag:
        term = go_dag[goID]
        if term.is_obsolete:
            continue

        parents = []
        uppers = []
        for parent in term.parents:
            parents.append(parent.id)
            uppers.append((parent.id, 'is_a'))

        # One loop covers both the single- and the multi-relationship case
        # that were previously handled by two duplicated branches.
        # NOTE(review): only the first member of each relationship set is
        # exported, and set order is arbitrary — confirm this is intended.
        for rel_type, related_terms in term.relationship.items():
            relation = list(related_terms)[0].id
            parents.append(relation)
            uppers.append([relation, rel_type])

        # NOTE(review): ``unicode`` is the Python 2 text type; presumably
        # this function targets Python 2 — confirm before porting.
        GOjsDict[unicode(term.id)] = {
            'p': parents,
            'c': term.namespace,
            'n': term.name,
            'u': uppers
        }
    # The with-block guarantees the output is flushed and closed.
    with open('js/GO.js', 'w') as GOjs:
        json.dump(GOjsDict, GOjs)
def test_semantic_similarity():
    """Computing basic semantic similarities between GO terms.

    Prints the semantic similarity, the information content, and the Resnik
    and Lin similarity scores for two fixed GO terms, using ``go-basic.obo``
    and the arabidopsis (TAIR) associations.
    """
    root_development = 'GO:0048364'  # BP level-03 depth-04 root development
    single_multicellular = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process

    godag = obo_parser.GODag("go-basic.obo")
    # Get all the annotations from arabidopsis.
    associations = read_gaf("http://geneontology.org/gene-associations/gene_association.tair.gz")

    # Semantic similarity, e.g.
    #       "The semantic similarity between terms GO:0048364 and GO:0044707 is 0.25.
    similarity = semantic_similarity(root_development, single_multicellular, godag)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.format(
        GO1=root_development, GO2=single_multicellular, VAL=similarity))
    for shown_id in (root_development, single_multicellular):
        print(godag[shown_id])

    # Information content of the single term GO:0048364, e.g.
    #       "Information content (GO:0048364) = 7.75481392334
    # This needs the counts of each GO term first.
    termcounts = TermCounts(godag, associations)
    info_id = "GO:0048364"
    info_value = get_info_content(info_id, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=info_id, INFO=info_value))

    # Resnik's similarity measure is defined as the information content of the
    # most informative common ancestor, i.e. the most specific common
    # parent-term in the GO, e.g.
    #       "Resnik similarity score (GO:0048364, GO:0044707) = 4.0540784252
    resnik_value = resnik_sim(root_development, single_multicellular, godag, termcounts)
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(
        GO1=root_development, GO2=single_multicellular, VAL=resnik_value))

    # Lin similarity score (GO:0048364, GO:0044707) = -0.607721957763
    lin_value = lin_sim(root_development, single_multicellular, godag, termcounts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(
        GO1=root_development, GO2=single_multicellular, VAL=lin_value))
    def __init__(self,
                 go_file,
                 go_terms,
                 gaf,
                 omadb=None,
                 tarfile_ortho=None,
                 TermCountsFile=None):
        """Open the orthology source and load the GO resources.

        Parameters:
        - go_file: OBO file; the parsed ``GODag`` is stored in
          ``self.go_file``.
        - go_terms: pickle file whose content is stored in ``self.godf``.
        - gaf: input handed to ``goatools_utils.buildGAF``.
        - omadb: OMA database file; takes precedence over tarfile_ortho.
        - tarfile_ortho: gzip-compressed tar archive, used when omadb is
          not given.
        - TermCountsFile: optional pickle with precomputed term counts;
          when omitted the counts are computed from the ontology and GAF.

        Raises ``Exception`` when neither omadb nor tarfile_ortho is given.
        """
        if omadb:
            print('open oma db obj')
            from pyoma.browser import db
            h5_oma = open_file(omadb, mode="r")
            self.db_obj = db.Database(h5_oma)
            print('done')
        elif tarfile_ortho:
            # retrieve hog members from tarfile_ortho
            self.tar = tarfile.open(tarfile_ortho, "r:gz")
        else:
            raise Exception('please provide input dataset')

        # NOTE: unpickling executes arbitrary code from the file; only use
        # go_terms / TermCountsFile from trusted sources.
        # The with-blocks close the files, which were previously left open.
        with open(go_terms, 'rb') as handle:
            self.godf = pickle.load(handle)
        self.go_file = obo_parser.GODag(go_file)
        print('building gaf')
        self.gaf = goatools_utils.buildGAF(gaf)
        print('done')
        if TermCountsFile is None:
            self.termcounts = TermCounts(self.go_file, self.gaf)
        else:
            with open(TermCountsFile, 'rb') as handle:
                self.termcounts = pickle.load(handle)
        # Resnik similarity with the term table and counts already bound
        self.resniksimpreconf = partial(goatools_utils.resnik_sim_pandas,
                                        df=self.godf,
                                        termcounts=self.termcounts)
예제 #23
0
Created on Wed Jul  3 09:58:37 2019

@author: benlitterer
"""

#TODO think about how to present GO terms that overlap with the GO terms of the BLOSUM output. Keep them in the algorithm or not?
#TODO also consider not letting B62 match with itself, and using that as a reference?
from goatools import obo_parser
import argparse
import os
from collections import Counter
from bokeh.plotting import figure, show, output_file
from bokeh.palettes import RdYlBu, Category20

# Directory that holds go-basic.obo (hard-coded, machine-specific path).
go_obo = "/home/benlitterer/Academic/Research/ProjectMatrices"
# The ontology is parsed at module level, before the command line is read.
go = obo_parser.GODag(go_obo + "/go-basic.obo")

# Command line: two required positional arguments, both directories.
parser = argparse.ArgumentParser(
    description="path to directory containing your Gene Annotation Output")
parser.add_argument("blast_directory")
parser.add_argument("go_directory")
args = parser.parse_args()
#cwd = "/home/benlitterer/Academic/Research/Summer2019/referance"
goannots = {}

# First positional argument, kept under a constant-style name.
BLAST_OUT_DIR = args.blast_directory

GO_dict = {}
# Second positional argument; the trailing comment shows an example value.
db_path = args.go_directory  #"/home/benlitterer/Academic/Research/Summer2019/testAllMatricesMediumInput/MappedGOs01Cutoff/"

예제 #24
0
    genes = _gene2go.keys()
    query_result_list = []
    for genes_chunk in np.array_split(genes, max(genes.shape[0] // 1000, 1)):
        query_res = mg.querymany(genes_chunk, scopes='entrezgene', fields='entrezgene,symbol',
                                 species='human', entrezonly=True, as_dataframe=True,
                                 df_index=False, verbose=False)
        if 'notfound' in query_res.columns:
            query_res = query_res[query_res.notfound != True]  # ignore PEP8 warnings.
        query_result_list.append(query_res)
    df_res = pd.concat(query_result_list)
    res = dict(zip(df_res.entrezgene, df_res.symbol))
    return res


# Module-level resources, loaded once at import time.  The loading runs
# inside HidePrints() (presumably to silence the loaders' console output —
# confirm against the HidePrints definition).
with HidePrints():
    _go_dag = obo_parser.GODag(go_obo_path)  # parsed ontology
    _gaf = read_gaf(gaf_path, prt=None)  # GAF associations
    _termcounts = TermCounts(_go_dag, _gaf)  # term counts from ontology + GAF
    _gene2go = read_ncbi_gene2go(gene2go_path)  # gene -> GO ids
    # NOTE(review): appears to rely on _gene2go being assigned first — verify.
    _gene2symbol = _init_gene2symbol_dict()
    # Inverse of _gene2symbol; a symbol shared by several genes keeps the last one.
    _symbol2gene = {symbol: gene for gene, symbol in _gene2symbol.items()}


def get_genes():
    """Return the keys of the module-level ``_gene2go`` mapping as a new list."""
    gene_ids = _gene2go.keys()
    return list(gene_ids)


def get_symbols():
    """Return the values of the module-level ``_gene2symbol`` mapping as a new list."""
    symbols = _gene2symbol.values()
    return list(symbols)

예제 #25
0
from goatools.semantic import TermCounts, ic, resnik_sim, semantic_similarity
from magine.enrichment.ontology_analysis import MagineGO

from magine.data.storage import id_mapping_dir

# Location of the ontology file inside the id-mapping directory.
obo_file = os.path.join(id_mapping_dir, 'go.obo')

# First use: the ontology file is absent, so download and process it now.
if not os.path.exists(obo_file):
    print("Using ontology for first time")
    print("Downloading files")
    # Imported here, so the download module is only loaded when needed.
    from magine.enrichment.databases.gene_ontology import \
        download_and_process_go
    download_and_process_go()
    # NOTE(review): this check is skipped when Python runs with -O.
    assert os.path.exists(obo_file)

# Parsed ontology, shared by the functions of this module.
go = obo_parser.GODag(obo_file)

mg = MagineGO()
print("Loading termcounts")
# Term counts are built from the parsed ontology and MagineGO's gene_to_go.
associations = mg.gene_to_go
termcounts = TermCounts(go, associations)
print("Loaded termcounts")


def path_to_root(go_term):
    """
    Creates networkx graph from provided term to root term

    Parameters
    ----------
    go_term : str
예제 #26
0
lib.log.info("Compiling all annotations for each genome")

# Get orthology into a dictionary: gene ID -> first column of its line.
# Only done when more than one input was given.
orthoDict = {}
if len(args.input) > 1:
    with open(orthologs, 'rU') as input:
        for line in input:
            line = line.replace('\n', '')
            col = line.split('\t')
            # The last column holds the member genes, separated by ", ".
            genes = col[-1].split(', ')
            for i in genes:
                orthoDict[i] = col[0]

# Get GO associations into a dictionary as well: first column -> list of
# "<GO id> <GO name>" strings.  The ontology is parsed with stdout/stderr
# suppressed.
with lib.suppress_stdout_stderr():
    goLookup = obo_parser.GODag(os.path.join(parentdir, 'DB', 'go.obo'))
goDict = {}
with open(os.path.join(go_folder, 'associations.txt'), 'rU') as input:
    for line in input:
        line = line.replace('\n', '')
        col = line.split('\t')
        # The second column holds the GO ids, separated by ";".
        gos = col[1].split(';')
        goList = []
        for i in gos:
            try:
                description = i + ' ' + goLookup[i].name
            except KeyError:
                # Unknown GO id: keep the bare id as its description.
                print '%s not found in go.obo, try to download updated go file' % i
                description = i
            goList.append(description)
        goDict[col[0]] = goList
예제 #27
0
                    if go_id not in go_anno_dict[gene_id][0]:
                        go_anno_dict[gene_id][0].append(go_id)
                        go_anno_dict[gene_id][1].append(go_domain)
                        go_anno_dict[gene_id][2].append(go_des)
                    if gene_inter_id not in go_anno_dict[gene_id][2]:
                        go_anno_dict[gene_id][3].append(gene_inter_id)
                        go_anno_dict[gene_id][4].append(gene_inter_des)
        json_file = '%s.json' % args.biomart
        with open(json_file, 'w') as json_file_info:
            json.dump(go_anno_dict, json_file_info)
elif args.go:
    if args.go.endswith('json'):
        with open(args.go) as go_info:
            go_anno_dict = json.load(go_info)
    else:
        go_db_info = obo_parser.GODag(go_db)
        reader = csv.reader(file(args.go, 'rb'))
        for n, each_record in enumerate(reader):
            if n != 0:
                gene_id = each_record[0]
                go_id = each_record[1]
                go_domain = go_db_info[go_id].namespace
                go_des = go_db_info[go_id].name
                if gene_id not in go_anno_dict:
                    go_anno_dict[gene_id] = [[go_id], [go_domain], [go_des]]
                else:
                    if go_id not in go_anno_dict[gene_id][0]:
                        go_anno_dict[gene_id][0].append(go_id)
                        go_anno_dict[gene_id][1].append(go_domain)
                        go_anno_dict[gene_id][2].append(go_des)
        go_json_file = '%s.json' % args.go
예제 #28
0
# IDs were converted to UniProtKB with https://www.uniprot.org/uploadlists/ and
# saved as "UniProtIDs.csv"; proteins that could not be converted are listed in
# 'not_in_proteins.txt'.
UP_ID = np.genfromtxt(fname='UniProtIDs.csv', names=True, delimiter=',', dtype=['U15','U6','U25','U25','U25','U25'])

# Two-way lookup between the EnsemblID and UniProtID columns (row-aligned).
ensembl_to_up = dict(zip(UP_ID['EnsemblID'], UP_ID['UniProtID']))
up_to_ensembl = dict(zip(UP_ID['UniProtID'], UP_ID['EnsemblID']))


# Keep only the Ensembl IDs that have a UniProt conversion. The keys of
# ensembl_to_up are exactly the EnsemblID column, so a dict lookup replaces
# scanning the whole array for every ID.
newEnsemblIDs = [i for i in EnsemblIDs if i in ensembl_to_up]

C_int_UP = [ensembl_to_up[x] for x in newEnsemblIDs]


#Enrichment Analysis
go = obo.GODag('/disks/strw13/DBDM/A4_2/go-basic.obo')

# funcs: UniProt ID -> the last GAF entry seen for it (unchanged behavior).
# assoc: UniProt ID -> set of ALL GO IDs annotated to it.
# A GAF file repeats an ID once per annotation, so assoc has to be filled
# while reading the file; deriving it from funcs afterwards would keep only
# a single GO ID per protein because funcs overwrites earlier entries.
funcs = {}
assoc = {}
with gzip.open('goa_human.gaf.gz', 'rt') as fp:
    for entry in gafiterator(fp):
        uniprot_id = entry.pop('DB_Object_ID')
        funcs[uniprot_id] = entry
        assoc.setdefault(uniprot_id, set()).add(str(entry['GO_ID']))

# Population: every UniProt ID present in the GAF file.
pop = funcs.keys()
    
            utils.parallel_process(
                get_sps, destnodes_sample, n_jobs=args.N_cores), [])

    logging.info('Num of all paths: {}'.format(len(all_paths)))

    fc_paths = []
    for i in trange(len(all_paths)):
        fullpath = all_paths[i]
        if len(fullpath) > 2:
            path = all_paths[i][1:-1]
            if np.all([node in fcnodes for node in path]):
                fc_paths.append(fullpath)

    logging.info('Num of FC paths: {}'.format(len(fc_paths)))

    go = obo_parser.GODag(args.obo_file)
    gene2go = read_ncbi_gene2go(args.gene2go_file, taxids=[9606])
    termcounts = TermCounts(go, gene2go)

    def get_sim(genes_pair):
        # sim_measure = lin_sim
        i, j = genes_pair[0], genes_pair[1]
        i_go = [goterm for goterm in gene2go[i] if goterm in go]
        j_go = [goterm for goterm in gene2go[j] if goterm in go]
        sims = []
        for i_go_term in i_go:

            def wrap(j_go_term):
                return resnik_sim(i_go_term, j_go_term, go, termcounts)

            simlist = [sim for sim in map(wrap, j_go) if sim is not None]
예제 #30
0
    except OSError as e:
        if (e.errno != 17):
            raise e
else:
    raise Exception(
        'Data path (' + data_folder + ') exists as a file. '
        'Please rename, remove or change the desired location of the data path.'
    )

# Reuse an ontology file that is already in the data folder; download it
# only when it is missing.
_go_obo_target = data_folder + '/go-basic.obo'
if os.path.isfile(_go_obo_target):
    go_obo = _go_obo_target
else:
    # go_obo is whatever wget.download returns (expected: the saved file name).
    go_obo = wget.download(go_obo_url, _go_obo_target)
print(go_obo)

# Parse the ontology and look up one example term.
go = obo_parser.GODag(go_obo)
go_id = 'GO:0048527'
go_term = go[go_id]
print(go_term)
print('GO term name: {}'.format(go_term.name))
print('GO term namespace: {}'.format(go_term.namespace))

# Print the term's parents first, then its children.
for relatives in (go_term.parents, go_term.children):
    for term in relatives:
        print(term)


def transitive_closure(go_term, go):
    go_term_set = set()
    find_parents(go_term, go, go_term_set)
    find_children(go_term, go, go_term_set)