예제 #1
0
 def __init__(self, genes, resource_manager=None):
     """Store the genes, create an empty multigraph and load the GO resources.

     Args:
         genes: Stored unchanged on ``self.genes``.
         resource_manager: Object providing ``get_go_obo()``. Any falsy value
             (including the default ``None``) makes the constructor create
             its own ``ResourceManager()``.
     """
     self.genes = genes
     self.graph = nx.MultiGraph()
     # Fall back to a fresh manager when the caller did not supply one.
     self.resource_manager = resource_manager or ResourceManager()
     # The ontology is parsed from whatever get_go_obo() returns.
     self.go_dag = GODag(self.resource_manager.get_go_obo())
     self.goa = self._load_goa_gaf()
예제 #2
0
def main(path_to_pkl, path_to_godag, path_to_go_list_pkl):
    """Evaluate pickled predictions against their labels and print the metrics.

    Args:
        path_to_pkl: Pickled DataFrame; its 'predictions', 'labels' and 'gos'
            columns are read.
        path_to_godag: Ontology file handed to ``GODag`` (loaded with the
            optional 'relationship' attribute).
        path_to_go_list_pkl: Pickled DataFrame whose 'functions' column
            provides the GO ids collected into ``func_set``.

    Side effects:
        Rebinds the module globals ``GODag`` and ``func_set`` and prints the
        values labelled f, p, r, AUROC, AUPRC and MCC to stdout.
    """
    global GODag, func_set

    frame = pd.read_pickle(path_to_pkl)
    preds = reshape(frame['predictions'].values)
    labels = reshape(frame['labels'].values)
    # NOTE(review): this replaces the global GODag *class* with the loaded
    # instance, so a second call to main() in the same process would try to
    # call the instance -- confirm other code relies on this global.
    GODag = GODag(path_to_godag, optional_attrs=['relationship'])

    functions = pd.read_pickle(path_to_go_list_pkl)['functions'].values
    # Position of each GO id within the functions array.
    func_index = {go_id: position for position, go_id in enumerate(functions)}
    func_set = set(func_index)

    gos = frame['gos'].values
    f, p, r, _threshold, preds_max = compute_performance(preds, labels, gos)
    print('f: \t{}\np: \t{}\nr: \t{}'.format(f, p, r))

    print('AUROC: \t{}'.format(compute_roc(preds, labels)))
    print('AUPRC: \t{}'.format(compute_prc(preds, labels)))
    print('MCC: \t{}'.format(compute_mcc(preds_max, labels)))
예제 #3
0
def init_goea(**kws):
    """Initialize GODag and GOEnrichmentStudy.

    Keyword Args:
        methods: Forwarded as the ``methods`` argument of
            ``GOEnrichmentStudy``; defaults to ``['not_bonferroni']``.

    Returns:
        tuple: ``(goea_object, study_ids)`` where ``study_ids`` holds the
        right-stripped lines of the ``study`` file under ``ROOT``.
    """
    obo_dag = GODag(ROOT + "go-basic.obo")
    assoc = read_associations(ROOT + "association", no_top=True)
    # Read the id files through context managers so the handles are closed
    # deterministically, even if reading fails part-way.
    with open(ROOT + "population") as popul_file:
        popul_ids = [line.rstrip() for line in popul_file]
    methods = kws.get('methods', ['not_bonferroni'])
    with open(ROOT + "study") as study_file:
        study_ids = [line.rstrip() for line in study_file]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=methods)
    return goeaobj, study_ids
예제 #4
0
 def load_dag(self, opt_fields=None):
     """Load ``self.obo`` into a GODag and report the load time on stdout.

     Args:
         opt_fields: Passed to ``GODag`` as its second positional argument
             and echoed in the timing message.

     Returns:
         The loaded GODag.
     """
     start = timeit.default_timer()
     dag = GODag(self.obo, opt_fields)
     elapsed = datetime.timedelta(seconds=(timeit.default_timer() - start))
     sys.stdout.write(
         "Elapsed HMS for OBO DAG load: {HMS} OPTIONAL_ATTR({O})\n".format(
             HMS=str(elapsed), O=opt_fields))
     return dag
예제 #5
0
def test_tcntobj_relationships(prt=sys.stdout):
    """Compare TermCounts built without and with relationships (e.g. part_of).

    Args:
        prt: Stream passed to ``download_go_basic_obo``.
    """
    obo_path = os.path.join(REPO, "go-basic.obo")
    gpad_path = os.path.join(REPO, 'goa_human.gpad')

    # Run the download helpers for the ontology and the annotation file
    download_go_basic_obo(obo_path, prt, loading_bar=None)
    dnld_annotation(gpad_path)

    # Two loads of the ontology: plain, and with the optional 'relationship' attribute
    godag_plain = GODag(obo_path)
    godag_rels = GODag(obo_path, optional_attrs=['relationship'])

    # Annotations are read against the plain DAG
    annoobj = GpadReader(gpad_path, godag=godag_plain)

    # One TermCounts object per namespace, for each flavour of the DAG
    counts_plain = {}
    for ns in NSS:
        counts_plain[ns] = TermCounts(godag_plain, annoobj.get_id2gos(ns))
    counts_rels = {}
    for ns in NSS:
        counts_rels[ns] = TermCounts(godag_rels, annoobj.get_id2gos(ns), RELS)
    _chk_pass_fail(counts_plain, counts_rels)
예제 #6
0
def _chk_godag(go2obj_act, obo):
    """Assert that go2obj_act matches a GODag freshly loaded from obo.

    Compares the number of terms, the set of GO ids, and the parent ids of
    every term.

    Args:
        go2obj_act: Mapping of GO id to term object under test.
        obo: OBO file name, resolved two directories above this file.
    """
    this_dir = os.path.dirname(os.path.abspath(__file__))
    go2obj_exp = GODag(os.path.join(this_dir, "../..", obo))
    assert len(go2obj_act) == len(go2obj_exp)
    assert set(go2obj_act) == set(go2obj_exp)
    for goid, term_act in go2obj_act.items():
        term_exp = go2obj_exp[goid]
        parents_act = {parent.id for parent in term_act.parents}
        parents_exp = {parent.id for parent in term_exp.parents}
        assert parents_act == parents_exp, "\nACT: {A}\nEXP: {E}".format(
            A=parents_act, E=parents_exp)
예제 #7
0
def get_highest_ic():
    """Return the data stored in the JSON file at HIGHEST_IC_FILE_PATH.

    When that file does not exist yet, the GO DAG is loaded and
    ``compute_highest_inc_parallel`` is run over all of its keys first
    (presumably that helper writes the file -- verify against its source).

    Returns:
        The object decoded from the JSON file.
    """
    if not os.path.isfile(HIGHEST_IC_FILE_PATH):
        # Send the loader's output to devnull and close that handle as soon
        # as the DAG has been built.
        with open(os.devnull, 'w') as devnull:
            go_dag = GODag(GO_DAG_FILE_PATH, prt=devnull)
        compute_highest_inc_parallel(list(go_dag.keys()))

    # The context manager closes the file even if decoding raises.
    with open(HIGHEST_IC_FILE_PATH, 'r') as ic_file:
        return json.load(ic_file)
예제 #8
0
 def __init__(self, fin_obo):
     """Load the ontology file and index its non-obsolete terms.

     Args:
         fin_obo: OBO file path, joined onto the parent of this file's
             directory.

     Sets ``repo``, ``fin_obo``, ``dag``, ``go2obj`` (terms whose
     ``is_obsolete`` is false) and ``goids_all`` (``go2obj.keys()``).
     """
     here = os.path.dirname(os.path.abspath(__file__))
     self.repo = os.path.join(here, "../")
     self.fin_obo = os.path.join(self.repo, fin_obo)
     self.dag = GODag(self.fin_obo)
     # Keep only the terms that are not flagged obsolete.
     current_terms = {}
     for goid, term in self.dag.items():
         if not term.is_obsolete:
             current_terms[goid] = term
     self.go2obj = current_terms
     self.goids_all = self.go2obj.keys()
예제 #9
0
def load_GODag():
    """Download so.obo and parse it with GODag.

    The OBO file is retrieved from
    http://obo.cvs.sourceforge.net/viewvc/obo/obo/ontology/genomic-proteomic/so.obo

    Returns:
        The GODag built from the downloaded file.
    """
    # Imported at call time rather than at module import.
    from jcvi.apps.base import download

    so_file_url = "http://obo.cvs.sourceforge.net/viewvc/obo/obo/ontology/genomic-proteomic/so.obo"
    return GODag(download(so_file_url, debug=False))
예제 #10
0
def test_semantic_similarity():
    """Check that terms I, L, M and N of the fig1a ontology each count 50."""
    fin_obo = os.path.join(REPO, 'tests/data/yangRWC/fig1a.obo')
    fin_anno = os.path.join(REPO, 'tests/data/yangRWC/fig1a.anno')
    godag = GODag(fin_obo)
    # The test data refers to terms by name, so map each name to its item id.
    name2go = {}
    for term in godag.values():
        name2go[term.name] = term.item_id
    assoc = _get_id2gos(fin_anno, godag, name2go)
    tcntobj = TermCounts(godag, assoc)
    for name in ('I', 'L', 'M', 'N'):
        assert tcntobj.gocnts[name2go[name]] == 50
예제 #11
0
 def __init__(self, fin_go_basic_obo, go2items, log=None):
     """Load the ontology with the searchable optional attributes.

     Args:
         fin_go_basic_obo: OBO file passed to ``GODag``.
         go2items: Stored unchanged on ``self.go2items``.
         log: Output stream; ``sys.stdout`` is used when it is None.
     """
     no_log_given = log is None
     self.log = sys.stdout if no_log_given else log
     # NOTE(review): bstdout is True only when no log was given; otherwise it
     # holds the log object itself -- confirm this is intended.
     self.bstdout = True if no_log_given else log
     # OBO fields often used in searching; many are optional when reading an obo.
     self.goa_srch_hdrs = [
         'defn',
         'comment',
         'name',
         'is_a',
         'relationship',
         'synonym',
         'xref',
     ]
     self.obo_dag = GODag(fin_go_basic_obo, optional_attrs=self.goa_srch_hdrs)
     self.go2items = go2items
예제 #12
0
def test_paths_to_top():
    """Check the paths returned by paths_to_top for GO:0000010, then print them."""
    dag = GODag(ROOT + "mini_obo.obo")
    actual_paths = dag.paths_to_top("GO:0000010")
    # Three expected routes, each listed from GO:0000001 down to GO:0000010.
    expected_paths = [
        ['GO:0000001', 'GO:0000002', 'GO:0000005', 'GO:0000010'],
        ['GO:0000001', 'GO:0000003', 'GO:0000005', 'GO:0000010'],
        ['GO:0000001', 'GO:0000003', 'GO:0000006', 'GO:0000008', 'GO:0000010'],
    ]
    chk_results(actual_paths, expected_paths)
    print_paths(actual_paths)
예제 #13
0
def _load_dag(dag_fin, opt_fields=None, out=None):
    """Load REPO/dag_fin into a GODag and report how long the load took.

    Args:
        dag_fin: OBO file name, joined onto ``REPO``.
        opt_fields: Passed to ``GODag`` as its second positional argument.
        out: Stream for the timing message; ``sys.stdout`` when None.

    Returns:
        The loaded GODag.
    """
    start = timeit.default_timer()
    dag = GODag(os.path.join(REPO, dag_fin), opt_fields)
    elapsed = datetime.timedelta(seconds=(timeit.default_timer() - start))
    stream = sys.stdout if out is None else out
    stream.write("Elapsed HMS for OBO DAG load: {}\n\n".format(str(elapsed)))
    return dag
예제 #14
0
def get_goea_results(method="fdr_bh"):
    """Get GOEA results.

    Args:
        method: Name passed, as a one-element list, as ``methods`` to both
            ``GOEnrichmentStudy`` and ``run_study``.

    Returns:
        Whatever ``run_study`` returns for the ids listed in ``small_study``.
    """
    root_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "data")
    obo_dag = GODag(os.path.join(root_dir, "goslim_generic.obo"))
    assoc = read_associations(os.path.join(root_dir, "slim_association"), no_top=True)
    # Context managers close the id files deterministically.
    with open(os.path.join(root_dir, "small_population")) as popul_file:
        popul_ids = [line.rstrip() for line in popul_file]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=[method])
    with open(os.path.join(root_dir, "small_study")) as study_file:
        study_ids = [line.rstrip() for line in study_file]
    return goeaobj.run_study(study_ids, methods=[method])
예제 #15
0
 def __init__(self):
     """Load the ontology with and without relationships and build a GoSubDag for each.

     Sets ``godag_r0``/``godag_r1`` (the latter loaded with the optional
     'relationship' attribute), ``goids`` (the distinct ids of all r0 terms)
     and ``gosubdag_r0``/``gosubdag_r1``, and reports the elapsed time of
     each GoSubDag build through ``prt_hms``.
     """
     download_go_basic_obo(self.obo, sys.stdout, loading_bar=None)
     self.godag_r0 = GODag(self.obo)
     self.godag_r1 = GODag(self.obo, optional_attrs={'relationship'})
     self.goids = list({o.id for o in self.godag_r0.values()})
     # GoSubDag (plain)
     tic = timeit.default_timer()
     self.gosubdag_r0 = GoSubDag(self.goids, self.godag_r0, prt=None)
     prt_hms(
         tic, "GoSubDag r0 {N:4} GOs {S:3} srcs".format(
             N=len(self.gosubdag_r0.go2obj),
             S=len(self.gosubdag_r0.go_sources)))
     # GoSubDag with relationships
     # Restart the timer so the r1 figure covers only the r1 build instead of
     # also including the r0 build and its report.
     tic = timeit.default_timer()
     self.gosubdag_r1 = GoSubDag(self.goids,
                                 self.godag_r1,
                                 prt=None,
                                 relationships=True)
     prt_hms(
         tic, "GoSubDag r1 {N:4} GOs {S:3} srcs".format(
             N=len(self.gosubdag_r1.go2obj),
             S=len(self.gosubdag_r1.go_sources)))
예제 #16
0
def intialize_term_counts():
    """Write the TermCounts count of every GO term to a JSON file.

    Loads ``go-basic.obo`` from ``DATA_DIR``, reads the associations at
    ``UNIPROT_ASSOCIATIONS_FILE_PATH`` and dumps a ``{GO id: count}`` dict
    to ``JSON_INDEXED_FILE_PATH``.
    """
    go_dag = GODag(os.path.join(DATA_DIR, "go-basic.obo"))

    reader = IdToGosReader(UNIPROT_ASSOCIATIONS_FILE_PATH, godag=go_dag)
    term_counts = TermCounts(go_dag, reader.get_id2gos('all'))
    go_freq_dict = {
        term.id: term_counts.get_count(term.id)
        for term in go_dag.values()
    }
    # Persist the frequency dict as JSON
    with open(JSON_INDEXED_FILE_PATH, 'w') as json_file:
        json.dump(go_freq_dict, json_file)
예제 #17
0
def get_objgoea(pop, assoc, args):
    """Build the GOEnrichmentStudy object configured from ``args``.

    Args:
        pop: Forwarded as the first positional argument.
        assoc: Forwarded as the second positional argument.
        args: Object providing ``obo``, ``method`` (comma-separated string),
            ``no_propagate_counts``, ``alpha`` and ``pvalcalc``.

    Returns:
        The constructed ``GOEnrichmentStudy``.
    """
    obo_dag = GODag(obo_file=args.obo)
    methods = args.method.split(",")
    # The command-line flag is negative ("no propagate"), the keyword is positive.
    study_options = {
        'propagate_counts': not args.no_propagate_counts,
        'alpha': args.alpha,
        'pvalcalc': args.pvalcalc,
        'methods': methods,
    }
    return GOEnrichmentStudy(pop, assoc, obo_dag, **study_options)
    def load_go_dag(self):
        """Load the GO DAG from ``self.go_dag_path`` into ``self.go_dag``.

        # Raises
            GeneOntologyError: if ``self.go_dag_path`` does not exist.
        """
        filepath = self.go_dag_path
        if os.path.exists(filepath):
            self.go_dag = GODag(filepath)
            return
        raise GeneOntologyError(f"{os.path.basename(filepath)} does not exist at {os.path.dirname(filepath)}")
예제 #19
0
파일: megago.py 프로젝트: hreinwal/MegaGO
def run_comparison(go_list_1, go_list_2, go_dag=None, progress=None):
    """ Compute the per-domain similarity between two lists of GO terms.

    Parameters
    ----------
    go_list_1 : a list with GO-identifiers as strings
        All GO-terms present in the first sample.
    go_list_2 : a list with GO-identifiers as strings
        All GO-terms present in the second sample.
    go_dag : GODag object
        GODag object from the goatools package. When None, the DAG is loaded
        from GO_DAG_FILE_PATH.
    progress : function (number) => void
        is called with the current progress value (a floating point value between 0 and 1)

    Returns
    -------
    tuple
        A tuple with one value per entry of GO_DOMAINS. These correspond to the similarity scores of biological
        process, cellular component and molecular function respectively.
    """

    freq_dict = get_frequency_counts()
    highest_ic_anc = get_highest_ic()
    if go_dag is None:
        # Send the loader's output to devnull and close that handle as soon
        # as the DAG has been built.
        with open(os.devnull, 'w') as devnull:
            go_dag = GODag(GO_DAG_FILE_PATH, prt=devnull)

    split_per_domain_1 = split_per_domain(go_list_1, go_dag)
    split_per_domain_2 = split_per_domain(go_list_2, go_dag)

    total_comparisons = len(set(go_list_1)) * len(set(go_list_2))
    done = 0

    def progress_reporter(batch_size):
        """Accumulate finished comparisons and forward the fraction done."""
        nonlocal done
        if progress:
            done += batch_size
            # With an empty input there is nothing to divide by; the final
            # progress(1) call below still signals completion.
            if total_comparisons:
                progress(done / total_comparisons)

    output = [
        compute_bma_metric(split_per_domain_1[i],
                           split_per_domain_2[i],
                           freq_dict,
                           highest_ic_anc,
                           progress_reporter,
                           similarity_method="lin")
        for i in range(len(GO_DOMAINS))
    ]

    if progress:
        progress(1)

    return tuple(output)
def test_paths_to_top():
    """Check the paths returned by paths_to_top for GO:0000010, then print them."""
    # Path is relative to the working directory (alternative: "./tests/data/mini_obo.obo").
    dag = GODag("./data/mini_obo.obo")
    actual_paths = dag.paths_to_top("GO:0000010")
    # Three expected routes, each listed from GO:0000001 down to GO:0000010.
    expected_paths = [
        ['GO:0000001', 'GO:0000002', 'GO:0000005', 'GO:0000010'],
        ['GO:0000001', 'GO:0000003', 'GO:0000005', 'GO:0000010'],
        ['GO:0000001', 'GO:0000003', 'GO:0000006', 'GO:0000008', 'GO:0000010'],
    ]
    chk_results(actual_paths, expected_paths)
    prt_paths(actual_paths)
예제 #21
0
파일: metrics.py 프로젝트: hreinwal/MegaGO
def compute_similarity_method(params):
    """Score every pair of ids drawn from two GO id lists.

    Args:
        params: 6-tuple ``(go_list1, go_list2, term_counts, go_dag_path,
            highest_ic_anc, similarity_method)``. ``similarity_method`` is
            called as ``similarity_method(id1, id2, go_dag, term_counts,
            highest_ic_anc)``.

    Returns:
        dict: Maps each id pair, ordered so the smaller id comes first, to
        the value returned by ``similarity_method``.
    """
    (go_list1, go_list2, term_counts, _go_dag_path, highest_ic_anc,
     similarity_method) = params
    # NOTE(review): the DAG path carried in params is not used; the
    # module-level GO_DAG_FILE_PATH is loaded instead -- confirm which one
    # is intended.
    # The devnull handle is closed as soon as the DAG has been built.
    with open(os.devnull, 'w') as devnull:
        go_dag = GODag(GO_DAG_FILE_PATH, prt=devnull)
    result = dict()

    for id1 in go_list1:
        for id2 in go_list2:
            # Normalise the key so (a, b) and (b, a) share one entry.
            key = (id1, id2) if id1 < id2 else (id2, id1)
            result[key] = similarity_method(id1, id2, go_dag, term_counts,
                                            highest_ic_anc)
    return result
예제 #22
0
def get_goeaobj(methods=None):
    """Build a GOEnrichmentStudy from go-basic and the small test data set.

    Args:
        methods: Forwarded as the ``methods`` argument of
            ``GOEnrichmentStudy``.

    Returns:
        The constructed ``GOEnrichmentStudy``.
    """
    obo_fin = "{REPO}/go-basic.obo".format(REPO=REPO)
    if not os.path.isfile(obo_fin):
        get_godag("go-basic.obo")
    obo_dag = GODag(obo_fin)
    assoc = read_associations(
        "{REPO}/tests/data/small_association".format(REPO=REPO), no_top=True)
    popul_fin = "{REPO}/tests/data/small_population".format(REPO=REPO)
    # The context manager closes the population file deterministically.
    with open(popul_fin) as popul_file:
        popul_ids = [line.rstrip() for line in popul_file]
    goeaobj = GOEnrichmentStudy(popul_ids, assoc, obo_dag, methods=methods)
    return goeaobj
예제 #23
0
    def load_ontologies_and_associations(self):
        """Download (if needed) and load the GO DAG and the human gene2go associations.

        Returns:
            tuple: ``(obodag, geneid2gos_human)`` -- the loaded ``GODag`` and
            the result of ``read_ncbi_gene2go`` restricted to taxid 9606.
        """
        # Parenthesised single-argument print works on both Python 2 and 3.
        print("---LOADING ONTOLOGIES AND ASSOCIATIONS---")
        # Check if files exist and download if not
        obo_fname = download_go_basic_obo()
        gene2go = download_ncbi_associations()

        # Load ontologies and associations
        obodag = GODag(obo_fname)
        # Read the file that was actually downloaded rather than a
        # hard-coded name (the two coincide for the helper's default).
        geneid2gos_human = read_ncbi_gene2go(gene2go, taxids=[9606])
        print("{N:,} annotated human genes".format(N=len(geneid2gos_human)))

        return obodag, geneid2gos_human
예제 #24
0
    def init(self):
        """Load the ontology and obtain the term graph, using the pickle file when present.

        Reads ``self.obo_file`` into ``self.go_dag``. If ``self.graph_file``
        exists the graph is read from it; otherwise ``_build_graph()`` is run
        and ``self.graph`` is written to that file.
        """
        _log.debug("Cargando archivo de ontologias:" + self.obo_file)
        self.go_dag = GODag(self.obo_file)
        _log.debug("Se cargo el archivo:" + self.obo_file)

        graph_is_cached = os.path.exists(self.graph_file)
        if not graph_is_cached:
            # No pickled graph yet: build it and persist it for later runs.
            self._build_graph()
            nx.write_gpickle(self.graph, self.graph_file)
        else:
            self.graph = nx.read_gpickle(self.graph_file)

        _log.debug("Se genero el grafo de terminos")
예제 #25
0
    def __init__(
        self,
        work_dir: str = '.',
        clean_work_dir: bool = False,
        organism: str = 'human',
        study_parameters: Union[Dict[str, Union[int, float, str, List, Dict]], None] = None
    ) -> None:
        """A GOEngine that can be used for performing analysis using GOATOOLS

        Args:
            work_dir (str, optional): The path to an existing directory where intermediate results and raw data will be downloaded/written to. Defaults to the current working directory.
            clean_work_dir (bool, optional): Stored on ``self._clean_work_dir``; described as whether to remove data written to the work directory at class termination. Defaults to False.
            organism (str, optional): Either 'human' or 'mouse'. Defaults to 'human'.
            study_parameters (Dict[str,Union[int,float,str,List,Dict]], optional): Keyword arguments forwarded to GOEnrichmentStudyNS. When None, {'propagate_counts':False,'alpha':0.05, 'methods':['fdr_bh']} is used.

        Raises:
            ValueError: if ``work_dir`` does not exist or ``organism`` is not supported.
        """
        print("Creating a GO Engine ...")
        if not os.path.exists(work_dir):
            raise ValueError(
                f"The provided work path: {work_dir} does not exist!!!")
        self.work_dir = work_dir
        if organism not in ('human', 'mouse'):
            raise ValueError(
                f"The provided organism: {organism} is not support, current engine mainly work with human and moues only"
            )
        # Build the default per call so no dict object is shared between instances.
        if study_parameters is None:
            study_parameters = {
                'propagate_counts': False,
                'alpha': 0.05,
                'methods': ['fdr_bh']
            }
        print("\t --> Downloading data ...")
        obo_fname = download_go_basic_obo(
            os.path.join(work_dir, 'go-basic.obo'))
        gene2go_fname = download_ncbi_associations(
            os.path.join(work_dir, 'gene2go'))
        ## parse the GO term
        print(
            "\t --> parsing the data and intializing the base GOEA object...")
        obo_dag = GODag(obo_fname)
        # The two organisms differ only in the taxid used to read the associations.
        taxid = 9606 if organism == 'human' else 10090
        # NOTE(review): the population is gene2iden_human for mouse as well --
        # confirm this is intended.
        self._goea_obj = GOEnrichmentStudyNS(
            gene2iden_human.keys(),
            Gene2GoReader(gene2go_fname, taxids=[taxid]).get_ns2assc(),
            obo_dag, **study_parameters)
        self._clean_work_dir = clean_work_dir
        self._gene_ids = None
예제 #26
0
def init_goea(log):
    """Read Ontologies and Annotations once.

    Args:
        log: Passed as the third argument to ``GOEA``.

    Returns:
        The ``GOEA`` object with its population set from ``../data/population``.
    """
    # ---------------------------------------------------------------------
    # Run Gene Ontology Analysis (GOEA)
    #
    # 1. Initialize
    obo_dag = GODag("go-basic.obo")
    assoc = read_associations("../data/association", no_top=True)
    # The context manager closes the population file deterministically.
    with open("../data/population") as popul_file:
        popul_ids = [line.rstrip() for line in popul_file]
    # 2. Create the enrichment object and register the population
    goeaobj = GOEA(obo_dag, assoc, log)
    goeaobj.set_population(popul_ids)
    return goeaobj
예제 #27
0
def test_all():
    """Load the mini ontology, write each hierarchy report to stdout and print the load time."""
    here = os.path.dirname(os.path.abspath(__file__))
    dag_fin = os.path.join(here, "data/mini_obo.obo")
    # Only the DAG load is timed, not the report writing.
    start = timeit.default_timer()
    dag = GODag(dag_fin)
    stop = timeit.default_timer()
    out = sys.stdout
    write_hier_all(dag, out)
    write_hier_norep(dag, out)
    write_hier_lim(dag, out)
    write_hier_mrk(dag, out)
    elapsed = datetime.timedelta(seconds=(stop - start))
    sys.stdout.write("Elapsed HMS: {}\n\n".format(str(elapsed)))
예제 #28
0
def get_pathway_mapping(organism=9606, ontology='basic', exclude=None, force=False):
	"""Map GO term ids to term names for the selected ontology file.

	Args:
		organism: Not used by this function.
		ontology: 'goslim_generic.obo' is used when this contains 'slim',
			otherwise 'go-basic.obo'.
		exclude: Passed to ``get_namespace_filter``; the resulting predicate
			is applied to each term's namespace.
		force: When truthy, an existing local copy of the OBO file is
			deleted before the download helper runs.

	Returns:
		dict: ``{term_id: term.name}`` for every term whose namespace passes
		the filter.
	"""
	obo = 'goslim_generic.obo' if 'slim' in ontology else 'go-basic.obo'

	namespace_filter = get_namespace_filter(exclude)

	# Logical `and`, not bitwise `&`: with `&` a truthy non-bool such as
	# force=2 evaluates to 0 and the stale file would be kept.
	if force and os.path.isfile(obo):
		os.remove(obo)

	# NOTE(review): the go-basic download helper is used for the slim file
	# name as well -- confirm it fetches the slim ontology in that case.
	obo_fname = goatools.base.download_go_basic_obo(obo)

	obodag = GODag(obo_fname)

	return {
		term_id: term.name
		for term_id, term in obodag.items()
		if namespace_filter(term.namespace)
	}
예제 #29
0
def get_goeaobj(method, geneids_pop, taxid):
    """Load: ontologies, associations, and population geneids.

    Args:
        method: Passed, as a one-element list, as ``methods``.
        geneids_pop: Population gene ids, forwarded as the first argument.
        taxid: Taxonomy id passed (in a list) to ``get_assoc_ncbi_taxids``.

    Returns:
        The constructed ``GOEnrichmentStudy`` (``propagate_counts=False``,
        ``alpha=0.05``).
    """
    fin_obo = "go-basic.obo"
    if not os.path.isfile(fin_obo):
        # wget.download expects a bare URL; a leading "wget " command name
        # would make the URL invalid.
        wget.download("http://geneontology.org/ontology/go-basic.obo")
    obo_dag = GODag(fin_obo)
    assoc_geneid2gos = get_assoc_ncbi_taxids([taxid])
    goeaobj = GOEnrichmentStudy(geneids_pop,
                                assoc_geneid2gos,
                                obo_dag,
                                propagate_counts=False,
                                alpha=0.05,
                                methods=[method])
    return goeaobj
예제 #30
0
def test_semantic_i150():
    """Check that every GO term has a semantic similarity of 1.0 with itself."""
    # Only the ontology is read; the GAF annotations / TermCounts set-up
    # (tests/data/yangRWC/fig2a_nonleaf0.gaf) is left disabled.
    godag = GODag(os.path.join(REPO, 'tests/data/yangRWC/fig1a.obo'))
    # A set, so each term object is compared once however many keys map to it.
    unique_terms = set(godag.values())
    for term in unique_terms:
        goid = term.item_id
        assert semantic_similarity(goid, goid, godag) == 1.0