예제 #1
0
def gen_anno_small():
    """Generate smaller annotations containing 10% of the original genes.

    Every count in NAME2NUM is divided by 10, the scaled mapping is handed
    to _get_id2gos together with the fig2a DAG, and is then printed.
    """
    fin_obo = os.path.join(REPO, '../goatools/tests/data/yangRWC/fig2a.obo')
    godag = GODag(fin_obo)
    name2go = dict((goterm.name, goterm.item_id) for goterm in godag.values())
    file_id2gos = os.path.join(REPO, '../goatools/tests/data/yangRWC/fig2a_small.anno')
    # NOTE(review): "/" is true division, so the scaled counts are floats
    # under Python 3 -- confirm that _get_id2gos accepts non-integer counts.
    name2num = {name: num/10 for name, num in NAME2NUM.items()}
    _get_id2gos(file_id2gos, godag, name2go, name2num)
    print(name2num)
예제 #2
0
class _Run(object):
    """Group entire go-basic.obo

    Loads go-basic.obo twice (without and with the optional 'relationship'
    attribute) and builds one GoSubDag over all GO IDs for each variant.
    Helper methods build GoSubDags for caller-chosen GO IDs and summarize
    count statistics.
    """

    obo = os.path.join(os.path.dirname(os.path.abspath(__file__)), "../../go-basic.obo")

    def __init__(self):
        download_go_basic_obo(self.obo, sys.stdout, loading_bar=None)
        self.godag_r0 = GODag(self.obo)
        self.godag_r1 = GODag(self.obo, optional_attrs={'relationship'})
        self.goids = list({o.id for o in self.godag_r0.values()})
        # GoSubDag (plain)
        tic = timeit.default_timer()
        self.gosubdag_r0 = GoSubDag(self.goids, self.godag_r0, prt=None)
        prt_hms(tic, "GoSubDag r0 {N:4} GOs {S:3} srcs".format(
            N=len(self.gosubdag_r0.go2obj), S=len(self.gosubdag_r0.go_sources)))
        # GoSubDag with relationships
        # Restart the timer so the r1 report does not also include the r0 build
        tic = timeit.default_timer()
        self.gosubdag_r1 = GoSubDag(self.goids, self.godag_r1, prt=None, relationships=True)
        prt_hms(tic, "GoSubDag r1 {N:4} GOs {S:3} srcs".format(
            N=len(self.gosubdag_r1.go2obj), S=len(self.gosubdag_r1.go_sources)))

    def prt_cnts(self, cnts):
        """Print a dict mapping each key of cnts to a summary of its values."""
        k2v = {k: self.str_stats(v) for k, v in cnts.items()}
        print(k2v)

    @staticmethod
    def str_stats(vals):
        """Return "(min max) STD=n" for vals, with STD rounded to an integer."""
        ntd = stats.describe(vals)
        # describe() reports the variance; the standard deviation is its root
        std = int(round(np.sqrt(ntd.variance)))
        return "({m} {M}) STD={STD:,}".format(m=ntd.minmax[0], M=ntd.minmax[1], STD=std)

    def get_gosubdag_r0(self, goids):
        """Return a GoSubDag for goids built on the DAG loaded without relationships."""
        tic = timeit.default_timer()
        gosubdag = GoSubDag(goids, self.godag_r0, relationships=None,
                            #rcntobj=self.gosubdag_r0.rcntobj,
                            prt=None)
        prt_hms(tic, "GoSubDag r0 {N:4} GOs {S:3} srcs".format(
            N=len(gosubdag.go2obj), S=len(gosubdag.go_sources)))
        return gosubdag

    def get_gosubdag_r1(self, goids):
        """Return a GoSubDag for goids built on the DAG loaded with relationships."""
        tic = timeit.default_timer()
        gosubdag = GoSubDag(goids, self.godag_r1, relationships=True,
                            #rcntobj=self.gosubdag_r1.rcntobj,
                            prt=None)
        prt_hms(tic, "GoSubDag r1 {N:4} GOs {S:3} srcs".format(
            N=len(gosubdag.go2obj), S=len(gosubdag.go_sources)))
        return gosubdag

    def get_goids_rand(self, qty):
        """Return qty randomly chosen GO IDs.

        Note: self.goids is shuffled in place as a side effect.
        """
        shuffle(self.goids)
        return self.goids[:qty]
예제 #3
0
def test_semantic_similarity():
    """Test faster version of semantic similarity.

    Builds TermCounts from the fig1a DAG and annotations, then checks that
    the terms named I, L, M and N each have a count of 50.
    """
    fin_obo = os.path.join(REPO, 'tests/data/yangRWC/fig1a.obo')
    fin_anno = os.path.join(REPO, 'tests/data/yangRWC/fig1a.anno')
    godag = GODag(fin_obo)
    name2go = dict((goterm.name, goterm.item_id) for goterm in godag.values())
    assoc = _get_id2gos(fin_anno, godag, name2go)
    tcntobj = TermCounts(godag, assoc)
    for name in ('I', 'L', 'M', 'N'):
        assert tcntobj.gocnts[name2go[name]] == 50
예제 #4
0
def intialize_term_counts():
    """Compute a count for every GO term and dump the mapping to JSON.

    The GO DAG is read from go-basic.obo under DATA_DIR, the associations
    from UNIPROT_ASSOCIATIONS_FILE_PATH, and the resulting {GO id: count}
    dict is written to JSON_INDEXED_FILE_PATH.
    """
    obo_path = os.path.join(DATA_DIR, "go-basic.obo")
    go_dag = GODag(obo_path)

    reader = IdToGosReader(UNIPROT_ASSOCIATIONS_FILE_PATH, godag=go_dag)
    associations = reader.get_id2gos('all')
    term_counts = TermCounts(go_dag, associations)
    go_freq_dict = {
        term.id: term_counts.get_count(term.id) for term in go_dag.values()
    }
    # write frequency dict to JSON file
    with open(JSON_INDEXED_FILE_PATH, 'w') as json_file:
        json.dump(go_freq_dict, json_file)
예제 #5
0
def test_semantic_i150():
    """Test that comparing a GO ID with itself gives a similarity of 1.0"""
    fin_dag = os.path.join(REPO, 'tests/data/yangRWC/fig1a.obo')
    # Read files
    godag = GODag(fin_dag)
    # Annotation loading and TermCounts are currently disabled:
    ## fin_gaf = os.path.join(REPO, 'tests/data/yangRWC/fig2a_nonleaf0.gaf')
    ## objanno = GafReader(fin_gaf)
    ## gene2gos = objanno.get_id2gos(namespace='CC')
    ## termcounts = TermCounts(godag, gene2gos, prt=sys.stdout)
    # Compare every unique GO term with itself
    unique_terms = set(godag.values())
    for term in unique_terms:
        assert semantic_similarity(term.item_id, term.item_id, godag) == 1.0
예제 #6
0
def test_semantic_similarity():
    """Test TermCounts gene counts for the fig2a DAG and annotations."""
    fin_obo = os.path.join(REPO, '../goatools/tests/data/yangRWC/fig2a.obo')
    godag = GODag(fin_obo)
    file_id2gos = os.path.join(REPO, '../goatools/tests/data/yangRWC/fig2a.anno')
    name2go = dict((goterm.name, goterm.item_id) for goterm in godag.values())
    assoc = _get_id2gos(file_id2gos, godag, name2go, NAME2NUM)
    tcntobj = TermCounts(godag, assoc)
    # N_v: Test accuracy of Python equivalent to Java: getNumberOfAnnotations
    # Number of unique genes annotated to a GO term PLUS genes annotated to a descendant
    name2expected = (
        ('A', 100),
        ('B', 40),
        ('C', 50),
        ('D', 10),
        ('E', 10),
        ('F', 10),
        ('G', 30),
    )
    for name, expected in name2expected:
        assert tcntobj.gocnts[name2go[name]] == expected, tcntobj.gocnts
예제 #7
0
def _precompute_term_frequencies():
    """Precompute a count for every GO id (including alt ids) and save as JSON.

    The GO DAG is read from GO_DAG_FILE_PATH, the associations from
    UNIPROT_ASSOCIATIONS_FILE_PATH, and the resulting {GO id: count} dict is
    written to FREQUENCY_COUNTS_FILE_PATH. Each alternate id receives the
    count of its primary term.
    """
    print("Start precomputations of term frequencies...")
    go_freq_dict = dict()
    # Loader output is sent to the null device; the context manager makes
    # sure the handle is closed again instead of being leaked.
    # NOTE(review): assumes GODag only writes to prt while loading -- confirm.
    with open(os.devnull, 'w') as devnull:
        go_dag = GODag(GO_DAG_FILE_PATH, prt=devnull)

    associations = IdToGosReader(UNIPROT_ASSOCIATIONS_FILE_PATH,
                                 godag=go_dag).get_id2gos('all')
    term_counts = TermCounts(go_dag, associations)

    for i in go_dag.values():
        # Look the count up once and reuse it for all alternate ids
        count = term_counts.get_count(i.id)
        go_freq_dict[i.id] = count
        for alt_id in i.alt_ids:
            go_freq_dict[alt_id] = count
    # write frequency dict to JSON file
    with open(FREQUENCY_COUNTS_FILE_PATH, 'w') as json_file:
        json.dump(go_freq_dict, json_file)
예제 #8
0
    p.add_option(
        "--term",
        help="Write the parents and children of this query term",
    )

    opts, args = p.parse_args()

    if len(args) != 1:
        sys.exit(p.print_help())

    (obo_file, ) = args

    def description(rec):
        """Return one tab-separated row: id, level, name/namespace, alt ids."""
        label = "{} [{}]".format(rec.name, rec.namespace)
        # Obsolete records are flagged right after the namespace
        suffix = " obsolete" if rec.is_obsolete else ""
        columns = [
            rec.item_id,
            "level-{:>02}".format(rec.level),
            label + suffix,
            ",".join(rec.alt_ids),
        ]
        return "\t".join(columns)

    g = GODag(obo_file, prt=None)
    header = "\t".join(("#id", "level", "name", "alt_ids"))
    print(header)
    for rec in sorted(set(g.values()), key=lambda x: x.item_id):
        print(description(rec))

    # run a test case
    if opts.term:
        rec = g.query_term(opts.term, verbose=True)
        g.draw_lineage([rec], verbose=True)
예제 #9
0
class NxMgAssembler(object):
    """Class which assembles a networkx MultiGraph based on a list of genes.

    Parameters
    ----------
    genes : list of dict
        A list of gene references based on which the graph is assembled.
    resource_manager : Optional[ResourceManager]
        Provider of the GO OBO file and the GOA GAF file. A new
        ResourceManager is created when none is given.

    Attributes
    ----------
    graph : networkx.MultiGraph
        The assembled graph containing links for interactions between genes,
        GO annotations for genes, and the GO ontology.
    go_dag : GODag
        The GO ontology loaded from the resource manager's OBO file.
    goa : pandas.DataFrame
        The filtered gene/GO annotation table (see _load_goa_gaf).
    """

    def __init__(self, genes, resource_manager=None):
        self.genes = genes
        self.graph = nx.MultiGraph()
        if not resource_manager:
            self.resource_manager = ResourceManager()
        else:
            self.resource_manager = resource_manager
        self.go_dag = GODag(self.resource_manager.get_go_obo())
        self.goa = self._load_goa_gaf()

    def _get_go_terms_for_gene(self, gene):
        """Return the sorted, unique GO IDs annotated to the given gene.

        An empty list is returned when the gene reference lacks a 'UP' or
        'HGNC_SYMBOL' entry, or when its HGNC symbol is not a node of the
        graph.
        """
        if ('UP' not in gene) or ('HGNC_SYMBOL' not in gene):
            return []
        if gene['HGNC_SYMBOL'] not in self.graph:
            return []
        # Filter to rows with the given gene's UniProt ID
        df = self.goa[self.goa['DB_ID'] == gene['UP']]
        return sorted(set(df['GO_ID']))

    def _add_go_node(self, go_term):
        """Add (or refresh) the graph node representing a GO term."""
        self.graph.add_node(go_term.id,
                            name=go_term.name,
                            GO=go_term.id,
                            domain=go_term.namespace)

    def add_go_annotations(self):
        """Add edges between gene nodes and GO nodes based on GO
        annotations."""
        logger.info('Adding GO annotations for genes in graph.')
        for gene in self.genes:
            for go_id in self._get_go_terms_for_gene(gene):
                # Skip annotations to terms missing from the loaded ontology
                if go_id not in self.go_dag:
                    continue
                go_term = self.go_dag[go_id]
                if go_term.is_obsolete:
                    continue
                self._add_go_node(go_term)
                self.graph.add_edge(gene['HGNC_SYMBOL'], go_term.id,
                                    label='GO:annotation')

    def add_go_ontology(self):
        """Add edges between GO nodes based on the GO ontology."""
        logger.info('Adding GO ontology edges to graph.')
        for go_term in self.go_dag.values():
            if go_term.is_obsolete:
                continue
            # The child node is added once here; it was previously re-added
            # with identical attributes for every parent, which had no effect.
            self._add_go_node(go_term)
            for parent_term in go_term.parents:
                if parent_term.is_obsolete:
                    continue
                self.graph.add_edge(go_term.id, parent_term.id,
                                    label='GO:is_a')

    def node2edges(self, node_key):
        """Return the edges corresponding to a node."""
        return self.graph.edges(node_key, keys=True)

    def save_graph(self, fname):
        """Save the file into a GraphML file.

        Parameters
        ----------
        fname : str
            The name of the file to save the graph into.
        """
        nx.write_graphml(self.graph, fname)

    def _load_goa_gaf(self):
        """Load the gene/GO annotations as a pandas data frame.

        The table is sorted by DB_ID and GO_ID. Rows whose qualifier starts
        with "NOT" are dropped, and only rows whose evidence code is in the
        whitelist below are kept.
        """
        goa_ec = {'EXP', 'IDA', 'IPI', 'IMP', 'IGI', 'IEP', 'HTP', 'HDA',
                  'HMP', 'HGI', 'HEP', 'IBA', 'IBD'}
        # NOTE(review): skiprows=23 presumably skips the GAF header lines;
        # verify the count against the file actually served.
        goa = pd.read_csv(self.resource_manager.get_goa_gaf(), sep='\t',
                          skiprows=23, dtype=str,
                          header=None,
                          names=['DB',
                                 'DB_ID',
                                 'DB_Symbol',
                                 'Qualifier',
                                 'GO_ID',
                                 'DB_Reference',
                                 'Evidence_Code',
                                 'With_From',
                                 'Aspect',
                                 'DB_Object_Name',
                                 'DB_Object_Synonym',
                                 'DB_Object_Type',
                                 'Taxon',
                                 'Date',
                                 'Assigned',
                                 'Annotation_Extension',
                                 'Gene_Product_Form_ID'])
        goa = goa.sort_values(by=['DB_ID', 'GO_ID'])
        # Filter out all "NOT" negative evidences.
        # Assign the filled column back instead of the chained
        # fillna(inplace=True), which pandas deprecates and which does not
        # update the frame under copy-on-write (missing qualifiers would
        # then break the boolean negation below).
        goa['Qualifier'] = goa['Qualifier'].fillna('')
        goa = goa[~goa['Qualifier'].str.startswith('NOT')]
        # Filter to rows with evidence code corresponding to experimental
        # evidence
        goa = goa[goa['Evidence_Code'].isin(goa_ec)]
        return goa
예제 #10
0
class _Run:
    """Group entire go-basic.obo

    Loads go-basic.obo twice (without and with the optional 'relationship'
    attribute) and builds one GoSubDag over all GO IDs for each variant.
    Helper methods build GoSubDags for caller-chosen GO IDs and summarize
    count statistics.
    """

    obo = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                       "../../go-basic.obo")

    def __init__(self):
        download_go_basic_obo(self.obo, sys.stdout, loading_bar=None)
        self.godag_r0 = GODag(self.obo)
        self.godag_r1 = GODag(self.obo, optional_attrs={'relationship'})
        self.goids = list({o.id for o in self.godag_r0.values()})
        # GoSubDag (plain)
        tic = timeit.default_timer()
        self.gosubdag_r0 = GoSubDag(self.goids, self.godag_r0, prt=None)
        self._prt_elapsed(tic, "GoSubDag r0 {N:4} GOs {S:3} srcs",
                          self.gosubdag_r0)
        # GoSubDag with relationships
        # Restart the timer so the r1 report does not also include the r0 build
        tic = timeit.default_timer()
        self.gosubdag_r1 = GoSubDag(self.goids,
                                    self.godag_r1,
                                    prt=None,
                                    relationships=True)
        self._prt_elapsed(tic, "GoSubDag r1 {N:4} GOs {S:3} srcs",
                          self.gosubdag_r1)

    @staticmethod
    def _prt_elapsed(tic, pattern, gosubdag):
        """Report the time since tic with the GO and source counts of gosubdag."""
        prt_hms(
            tic,
            pattern.format(N=len(gosubdag.go2obj),
                           S=len(gosubdag.go_sources)))

    def prt_cnts(self, cnts):
        """Print a dict mapping each key of cnts to a summary of its values."""
        k2v = {k: self.str_stats(v) for k, v in cnts.items()}
        print(k2v)

    @staticmethod
    def str_stats(vals):
        """Return "(min max) STD=n" for vals, with STD rounded to an integer."""
        ntd = stats.describe(vals)
        # describe() reports the variance; the standard deviation is its root
        std = int(round(np.sqrt(ntd.variance)))
        return "({m} {M}) STD={STD:,}".format(m=ntd.minmax[0],
                                              M=ntd.minmax[1],
                                              STD=std)

    def get_gosubdag_r0(self, goids):
        """Return a GoSubDag for goids built on the DAG loaded without relationships."""
        tic = timeit.default_timer()
        gosubdag = GoSubDag(
            goids,
            self.godag_r0,
            relationships=None,
            #rcntobj=self.gosubdag_r0.rcntobj,
            prt=None)
        self._prt_elapsed(tic, "GoSubDag r0 {N:4} GOs {S:3} srcs", gosubdag)
        return gosubdag

    def get_gosubdag_r1(self, goids):
        """Return a GoSubDag for goids built on the DAG loaded with relationships."""
        tic = timeit.default_timer()
        gosubdag = GoSubDag(
            goids,
            self.godag_r1,
            relationships=True,
            #rcntobj=self.gosubdag_r1.rcntobj,
            prt=None)
        self._prt_elapsed(tic, "GoSubDag r1 {N:4} GOs {S:3} srcs", gosubdag)
        return gosubdag

    def get_goids_rand(self, qty):
        """Return qty randomly chosen GO IDs.

        Note: self.goids is shuffled in place as a side effect.
        """
        shuffle(self.goids)
        return self.goids[:qty]
예제 #11
0
def test_parents_ancestors():
    """Test getting parents, children, ancestors and descendants.

    Direct parents/children come from get_go2parents/get_go2children;
    ancestors/descendants come from GoSubDag.rcntobj. Both are checked with
    and without the optional "regulates" relationships.
    """
    # Load a small GO DAG to demonstrate getting parents and ancestors
    fin_obo = os.path.join(REPO, 'tests/data/i126/viral_gene_silence.obo')

    # First pass: DAG loaded without optional attributes
    godag = GODag(fin_obo)
    rels_none = set()  # Don't trace any optional relationships
    go2parents_isa = get_go2parents(godag, rels_none)
    go2children_isa = get_go2children(godag, rels_none)
    # TODO: Add more tests for only is_a

    # Second pass: load all relationships using the optional attribute
    godag = GODag(fin_obo, optional_attrs={'relationship'})
    goids = {goterm.item_id for goterm in godag.values()}

    # Get parents through "is_a" only
    rels_none = set()  # Don't trace any optional relationships
    go2parents_isa = get_go2parents(godag, rels_none)
    go2children_isa = get_go2children(godag, rels_none)

    # Get parents through "is_a" and all the "regulates" relationships
    rels_regulates = {
        'regulates', 'negatively_regulates', 'positively_regulates'
    }
    go2parents_reg = get_go2parents(godag, rels_regulates)
    go2children_reg = get_go2children(godag, rels_regulates)

    # Direct parents and children, with and without "regulates"
    goid = 'GO:0019222'  # regulation of metabolic process
    assert go2parents_isa[goid] == {'GO:0050789'}
    assert go2parents_reg[goid] == {'GO:0050789', 'GO:0008152'}

    children_exp = {'GO:0009892', 'GO:0060255'}
    assert go2children_isa[goid] == children_exp
    assert go2children_reg[goid] == children_exp
    assert go2children_isa['GO:0008152'] == {'GO:0071704'}
    assert go2children_reg['GO:0008152'] == {
        'GO:0071704', 'GO:0019222', 'GO:0009892'
    }

    # GoSubDag created without a relationships argument
    gosubdag_r0 = GoSubDag(goids, godag)
    ancestors_r0 = gosubdag_r0.rcntobj.go2ancestors[goid]
    assert ancestors_r0 == {'GO:0050789', 'GO:0065007', 'GO:0008150'}

    # GoSubDag using the user-selected "regulates" relationships
    gosubdag_r1 = GoSubDag(goids, godag, relationships=rels_regulates)
    assert gosubdag_r1.rcntobj.go2ancestors[goid] == \
        {'GO:0050789', 'GO:0008152', 'GO:0065007', 'GO:0008150'}, \
        gosubdag_r1.rcntobj.go2ancestors[goid]

    descendants_r0 = gosubdag_r0.rcntobj.go2descendants
    assert descendants_r0['GO:0008152'] == {
        'GO:0071704', 'GO:0010467', 'GO:0043170'
    }
    assert descendants_r0['GO:0043170'] == {'GO:0010467'}

    descendants_r1_exp = {
        'GO:0010467', 'GO:0010468', 'GO:0010605', 'GO:0010608', 'GO:0010629',
        'GO:0016441', 'GO:0016458', 'GO:0040029', 'GO:0060147', 'GO:0060148',
        'GO:0060150', 'GO:0060255', 'GO:0060968'
    }
    assert gosubdag_r1.rcntobj.go2descendants['GO:0043170'] == descendants_r1_exp

    # GoSubDag restricted to a single optional relationship
    gosubdag_r1n = GoSubDag(goids,
                            godag,
                            relationships={'negatively_regulates'})
    descendants_r1n_exp = {'GO:0010629', 'GO:0016441', 'GO:0016458'}
    assert gosubdag_r1n.rcntobj.go2descendants['GO:0010467'] == descendants_r1n_exp