def test_semantic_similarity():
    """Exercise semantic, Resnik, and Lin similarity on the TAIR annotations.

    Loads the GO DAG and the ``tair.gaf`` associations found under REPO,
    prints the scores for the pair GO:0048364 / GO:0044707, and asserts that
    both the Resnik and the Lin score of that pair are 0.0. Finally the CC
    root is compared with itself: Resnik must be 0.0 and Lin must be 1.0.
    """
    dag = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None)
    # Annotations from arabidopsis
    gene2gos = dnld_assc(os.path.join(REPO, 'tair.gaf'), dag)

    # Semantic similarity of the two terms under test
    go_root_dev = 'GO:0048364'   # BP level-03 depth-04 root development
    go_multicell = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process
    pair = {'GO1': go_root_dev, 'GO2': go_multicell}
    sem_val = semantic_similarity(go_root_dev, go_multicell, dag)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.format(
        VAL=sem_val, **pair))
    for goid in (go_root_dev, go_multicell):
        print(dag[goid])

    # Information content of a single term needs the per-term counts first
    counts = TermCounts(dag, gene2gos)
    go_id = "GO:0048364"
    info = get_info_content(go_id, counts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=go_id, INFO=info))
    assert info, "FATAL INFORMATION CONTENT"

    # Resnik: information content of the most informative common ancestor.
    # The score is expected to be 0.0 because the deepest common ancestor
    # is the BP top term.
    resnik = resnik_sim(go_root_dev, go_multicell, dag, counts)
    ancestor = deepest_common_ancestor([go_root_dev, go_multicell], dag)
    assert ancestor == NS2GO['BP']
    assert resnik == get_info_content(ancestor, counts)
    assert resnik == 0.0
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(VAL=resnik, **pair))

    # Lin: expected to be 0.0 because the terms are only related through BP top
    lin = lin_sim(go_root_dev, go_multicell, dag, counts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(VAL=lin, **pair))
    assert lin == 0.0, "FATAL LIN SCORE"

    # A namespace root compared with itself
    cc_root = NS2GO['CC']
    resnik = resnik_sim(cc_root, cc_root, dag, counts)
    assert resnik == 0.0
    lin = lin_sim(cc_root, cc_root, dag, counts)
    assert lin == 1.0
def test_i148b_semsim_lin(do_plt=False):
    """Test for issue 148, Lin Similarity if a term has no annotations.

    Builds term counts from the CC annotations in the ``yangRWC`` fixture
    files under REPO, then computes, prints, and (for Lin) checks the
    similarity of every pair of GO IDs in the DAG, including each ID
    paired with itself.

    Args:
        do_plt: when true, ``_do_plt`` is called with the term counts and
            the DAG before the similarities are computed.
    """
    fin_gaf = os.path.join(REPO, 'tests/data/yangRWC/fig2a_nonleaf0.gaf')
    godag = GODag(os.path.join(REPO, "tests/data/yangRWC/fig2a.obo"))
    annoobj = GafReader(fin_gaf, godag=godag)

    associations = annoobj.get_id2gos('CC')
    tcntobj = TermCounts(godag, associations)

    if do_plt:
        _do_plt(tcntobj, godag)

    goids = list(godag.keys())

    # Calculate Resnik values; frozenset keys make (a, b) and (b, a) one entry
    p2r = {
        frozenset([a, b]): resnik_sim(a, b, godag, tcntobj)
        for a, b in combo_w_rplc(goids, 2)
    }
    _prt_values('Resnik', goids, p2r)

    # Calculate Lin values
    p2l = {
        frozenset([a, b]): lin_sim(a, b, godag, tcntobj)
        for a, b in combo_w_rplc(goids, 2)
    }
    _prt_values('Lin', goids, p2l)
    _chk_lin(p2l)
def createMatrix(goTerms, background, method):
    """Build a square semantic-similarity matrix for a list of GO terms.

    Keyword arguments:
    goTerms -- list of go terms
    background -- flattened background: lists of genes and GO Terms
    method -- semantic similarity method, either "Lin", "Resnik", "Wang"
              or "Edge-based"; any value other than the first three is
              scored with ``semantic_similarity``

    Returns a list of rows. Only the cells above the diagonal (row index
    smaller than column index) hold a computed score; every other cell is
    filled with -1. ``godag`` is read from the enclosing scope.
    """
    termcounts = TermCounts(godag, background)
    # The Wang scorer is only constructed when it is actually requested
    wang_r1 = SsWang(goTerms, godag) if method == "Wang" else None

    def _score(term_a, term_b):
        """Score one pair of terms with the selected method."""
        if method == "Lin":
            return lin_sim(term_a, term_b, godag, termcounts)
        if method == "Resnik":
            return resnik_sim(term_a, term_b, godag, termcounts)
        if method == "Wang":
            return wang_r1.get_sim(term_a, term_b)
        return semantic_similarity(term_a, term_b, godag)

    matrix = []
    for row_idx, term_a in enumerate(goTerms):
        matrix.append([
            _score(term_a, term_b) if row_idx < col_idx else -1
            for col_idx, term_b in enumerate(goTerms)
        ])
    return matrix
def test_semantic_similarity():
    """Print and check Resnik/Lin similarities for a fixed set of GO IDs.

    For each (species, annotation file) pair the associations are loaded
    from the current working directory, the information content of every
    GO ID is printed, and the Resnik and Lin scores of every pair of GO IDs
    are printed and asserted to be truthy.
    """
    goids = [
        "GO:0140101",
        "GO:0140097",
        "GO:0140096",
        "GO:0140098",
        "GO:0015318",
        "GO:0140110",
    ]
    # (species label, annotation file name)
    associations = [
        ('human', 'goa_human.gaf'),
        ('yeast', 'gene_association.sgd'),
    ]

    cwd = os.getcwd()  # current working directory
    godag = get_godag(os.path.join(cwd, "go-basic.obo"), loading_bar=None)
    info_fmt = '{SPECIES} Information content {INFO:8.6f} {GO} {NAME}'
    pair_fmt = '{GO1} {GO2} {RESNIK:6.4f} {LIN:7.4f}'

    for species, assc_name in associations:
        print()
        # Annotations and per-term counts for the current species
        fin_assc = os.path.join(cwd, assc_name)
        assc_gene2gos = dnld_assc(fin_assc, godag, prt=None)
        termcounts = TermCounts(godag, assc_gene2gos)

        # Information content of each GO term
        for goid in sorted(goids):
            print(info_fmt.format(
                SPECIES=species,
                GO=goid,
                INFO=get_info_content(goid, termcounts),
                NAME=godag[goid].name))

        # Similarity table for every pair of GO terms
        print("GO #1 GO #2 Resnik Lin")
        print("---------- ---------- ------ -------")
        for go_a, go_b in itertools.combinations(sorted(goids), 2):
            # Resnik: information content of the most informative common ancestor
            sim_r = resnik_sim(go_a, go_b, godag, termcounts)
            sim_l = lin_sim(go_a, go_b, godag, termcounts)
            print(pair_fmt.format(GO1=go_a, GO2=go_b, RESNIK=sim_r, LIN=sim_l))
            assert sim_r, "FATAL RESNIK SCORE"
            assert sim_l, "FATAL LIN SCORE"
def sim_sem(x, y):
    """Return the mean Lin similarity over all pairs drawn from x and y.

    Every element of ``x`` is paired with every element of ``y`` and scored
    with ``lin_sim`` using ``godag`` and ``termcounts`` from the enclosing
    scope. Pairs for which ``lin_sim`` raises ``KeyError`` or returns
    ``None`` are left out of the mean.

    Args:
        x: iterable of GO IDs.
        y: iterable of GO IDs; it is iterated once per element of ``x``.

    Returns:
        The mean of the collected scores, or NaN when no pair produced a
        score (including when either input is empty).
    """
    scores = []
    for goid_x in x:
        for goid_y in y:
            # Keep the try body down to the one call that can raise
            try:
                score = lin_sim(goid_x, goid_y, godag, termcounts)
            except KeyError:
                continue
            if score is not None:
                scores.append(score)
    if not scores:
        # np.mean([]) also yields NaN but emits a RuntimeWarning on the way
        return np.float64(np.nan)
    return np.mean(scores)
def test_semantic_similarity():
    """Compute basic similarities between two GO terms using TAIR annotations.

    Reads ``go-basic.obo`` and ``gene_association.tair`` from the current
    working directory, prints the semantic, Resnik, and Lin similarity of
    GO:0048364 and GO:0044707, and asserts that the information content,
    the Resnik score, and the Lin score are all truthy.
    """
    cwd = os.getcwd()
    dag = get_godag(os.path.join(cwd, "go-basic.obo"), loading_bar=None)
    # Annotations from arabidopsis
    gene2gos = dnld_assc(os.path.join(cwd, 'gene_association.tair'), dag)

    # Semantic similarity of the two terms under test
    term_a = 'GO:0048364'  # BP level-03 depth-04 root development
    term_b = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process
    ids = {'GO1': term_a, 'GO2': term_b}
    sem_val = semantic_similarity(term_a, term_b, dag)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.format(
        VAL=sem_val, **ids))
    for goid in (term_a, term_b):
        print(dag[goid])

    # Information content of a single term needs the per-term counts first
    counts = TermCounts(dag, gene2gos)
    go_id = "GO:0048364"
    info = get_info_content(go_id, counts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=go_id, INFO=info))
    assert info, "FATAL INFORMATION CONTENT"

    # Resnik: information content of the most informative common ancestor,
    # i.e. the most specific common parent term in the GO
    resnik = resnik_sim(term_a, term_b, dag, counts)
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(VAL=resnik, **ids))
    assert resnik, "FATAL RESNIK SCORE"

    # Lin similarity of the same pair
    lin = lin_sim(term_a, term_b, dag, counts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(VAL=lin, **ids))
    assert lin, "FATAL LIN SCORE"
def test_semantic_similarity():
    """Print and check Resnik/Lin similarities using MF annotations.

    For each (species, annotation file) pair the ``MF`` associations are
    loaded from REPO, the information content of every GO ID is printed
    together with its namespace and name, and the Resnik and Lin scores of
    every pair of GO IDs are printed and asserted to be truthy.
    """
    goids = [
        "GO:0140101",
        "GO:0140097",
        "GO:0140096",
        "GO:0140098",
        "GO:0015318",
        "GO:0140110",
    ]
    # (species label, annotation file name)
    associations = [
        ('human', 'goa_human.gaf'),
        ('yeast', 'sgd.gaf'),
    ]

    godag = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None)
    info_fmt = '{SPECIES} Information content {INFO:8.6f} {NS} {GO} {NAME}'
    pair_fmt = '{GO1} {GO2} {RESNIK:6.4f} {LIN:7.4f}'

    for species, assc_name in associations:
        print()
        # Annotations and per-term counts for the current species
        assc_gene2gos = dnld_assc(
            os.path.join(REPO, assc_name), godag, namespace='MF', prt=None)
        termcounts = TermCounts(godag, assc_gene2gos)

        # Information content of each GO term
        for goid in sorted(goids):
            info = get_info_content(goid, termcounts)
            goterm = godag[goid]
            print(info_fmt.format(
                SPECIES=species,
                GO=goid,
                INFO=info,
                NS=goterm.namespace,
                NAME=goterm.name))

        # Similarity table for every pair of GO terms
        print("GO #1 GO #2 Resnik Lin")
        print("---------- ---------- ------ -------")
        for go_a, go_b in itertools.combinations(sorted(goids), 2):
            # Resnik: information content of the most informative common ancestor
            sim_r = resnik_sim(go_a, go_b, godag, termcounts)
            sim_l = lin_sim(go_a, go_b, godag, termcounts)
            print(pair_fmt.format(GO1=go_a, GO2=go_b, RESNIK=sim_r, LIN=sim_l))
            assert sim_r, "FATAL RESNIK SCORE"
            assert sim_l, "FATAL LIN SCORE"
def test_semantic_i88():
    """Computing basic semantic similarities between GO terms.

    Loads ``go-basic.obo`` and the ``tair.gaf`` associations, builds a
    GoSubDag over every key of the DAG with the term counts attached, and
    prints the semantic, Resnik, and Lin similarity of GO:0048364 and
    GO:0044707 together with the information content of GO:0048364.
    Nothing is asserted; the test only requires the calls to complete.
    """
    godag = obo_parser.GODag("go-basic.obo")
    # Every key of the DAG. An earlier filtered assignment (keys equal to
    # the term's own id) was overwritten by this one and had no effect.
    goids = set(godag.keys())
    # Get all the annotations from arabidopsis.
    fin_gaf = os.path.join(REPO, "tair.gaf")
    # dnld_assc includes read_gaf
    associations = dnld_assc(fin_gaf, godag, prt=None)

    # First get the counts and information content for each GO term.
    termcounts = TermCounts(godag, associations)
    gosubdag = GoSubDag(goids, godag, tcntobj=termcounts)

    # Semantic similarity of the two terms under test
    go_id3 = 'GO:0048364'  # BP level-03 depth-04 root development
    go_id4 = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process
    go_root = deepest_common_ancestor([go_id3, go_id4], godag)
    sim = semantic_similarity(go_id3, go_id4, godag)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim))
    gosubdag.prt_goids([go_root, go_id3, go_id4])

    # Calculate the information content
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=go_id, INFO=infocontent))

    # Resnik's similarity measure is defined as the information content of
    # the most informative common ancestor, i.e. the most specific common
    # parent-term in the GO.
    sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim_r))

    # Lin similarity of the same pair
    sim_l = lin_sim(go_id3, go_id4, godag, termcounts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim_l))
def test_semantic_similarity():
    """Compute basic similarities between two GO terms using ``tair.gaf``.

    Reads ``go-basic.obo`` and ``tair.gaf`` from REPO, prints the semantic,
    Resnik, and Lin similarity of GO:0048364 and GO:0044707, and asserts
    that the information content, the Resnik score, and the Lin score are
    all truthy.
    """
    fin_obo = os.path.join(REPO, "go-basic.obo")
    dag = get_godag(fin_obo, loading_bar=None)
    # Annotations from arabidopsis
    fin_gaf = os.path.join(REPO, 'tair.gaf')
    gene2gos = dnld_assc(fin_gaf, dag)

    # Semantic similarity of the two terms under test
    go_root_dev = 'GO:0048364'   # BP level-03 depth-04 root development
    go_multicell = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process
    sem_val = semantic_similarity(go_root_dev, go_multicell, dag)
    sem_msg = '\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'
    print(sem_msg.format(GO1=go_root_dev, GO2=go_multicell, VAL=sem_val))
    print(dag[go_root_dev])
    print(dag[go_multicell])

    # Information content of a single term needs the per-term counts first
    counts = TermCounts(dag, gene2gos)
    go_id = "GO:0048364"
    info = get_info_content(go_id, counts)
    info_msg = '\nInformation content ({GO}) = {INFO}\n'
    print(info_msg.format(GO=go_id, INFO=info))
    assert info, "FATAL INFORMATION CONTENT"

    # Resnik: information content of the most informative common ancestor,
    # i.e. the most specific common parent term in the GO
    resnik = resnik_sim(go_root_dev, go_multicell, dag, counts)
    resnik_msg = 'Resnik similarity score ({GO1}, {GO2}) = {VAL}'
    print(resnik_msg.format(GO1=go_root_dev, GO2=go_multicell, VAL=resnik))
    assert resnik, "FATAL RESNIK SCORE"

    # Lin similarity of the same pair
    lin = lin_sim(go_root_dev, go_multicell, dag, counts)
    lin_msg = 'Lin similarity score ({GO1}, {GO2}) = {VAL}'
    print(lin_msg.format(GO1=go_root_dev, GO2=go_multicell, VAL=lin))
    assert lin, "FATAL LIN SCORE"
def _test_path_bp_mf(branch_dist, godag, prt):
    """Check scores between a molecular_function and a biological_process term.

    Without a branch distance, the semantic distance, semantic similarity,
    Resnik similarity, and Lin similarity of the two terms must all be
    None. With ``branch_dist`` the semantic distance must equal the sum of
    both term depths plus ``branch_dist``.

    Args:
        branch_dist: value passed to ``semantic_distance`` as its fourth argument.
        godag: GO DAG used for all lookups and scores.
        prt: writable stream for the score report, or None to stay silent.
    """
    go_mf = 'GO:0003676'  # level-03 depth-03 nucleic acid binding [molecular_function]
    go_bp = 'GO:0007516'  # level-04 depth-05 hemocyte development [biological_process]
    fmt = '({GO1}, {GO2}) {TYPE:6} score = {VAL}\n'

    def _report(label, value):
        """Write one score line unless reporting is turned off."""
        if prt is not None:
            prt.write(fmt.format(TYPE=label, GO1=go_mf, GO2=go_bp, VAL=value))

    dst_none = semantic_distance(go_mf, go_bp, godag)
    sim_none = semantic_similarity(go_mf, go_bp, godag)
    termcounts = TermCounts(godag, dnld_assc("gene_association.tair", godag))
    sim_r = resnik_sim(go_mf, go_bp, godag, termcounts)
    sim_l = lin_sim(go_mf, go_bp, godag, termcounts)

    scores = [
        ('semantic distance', dst_none),
        ('semantic similarity', sim_none),
        ('Resnik similarity', sim_r),
        ('Lin similarity', sim_l),
    ]
    for label, value in scores:
        _report(label, value)
    # Same assertion order as the report order above
    for _, value in scores:
        assert value is None

    sim_d = semantic_distance(go_mf, go_bp, godag, branch_dist)
    _report('semantic distance', sim_d)
    expected = godag[go_mf].depth + godag[go_bp].depth + branch_dist
    assert sim_d == expected
def test_semantic_i88():
    """Computing basic semantic similarities between GO terms.

    Loads ``go-basic.obo`` and the ``tair.gaf`` associations, builds a
    GoSubDag over every key of the DAG with the term counts attached, and
    prints the semantic, Resnik, and Lin similarity of GO:0048364 and
    GO:0044707 together with the information content of GO:0048364.
    Nothing is asserted; the test only requires the calls to complete.
    """
    godag = obo_parser.GODag("go-basic.obo")
    # Every key of the DAG. An earlier filtered assignment (keys equal to
    # the term's own id) was overwritten by this one and had no effect.
    goids = set(godag.keys())
    # Get all the annotations from arabidopsis.
    fin_gaf = os.path.join(REPO, "tair.gaf")
    # dnld_assc includes read_gaf
    associations = dnld_assc(fin_gaf, godag, prt=None)

    # First get the counts and information content for each GO term.
    termcounts = TermCounts(godag, associations)
    gosubdag = GoSubDag(goids, godag, tcntobj=termcounts)

    # Semantic similarity of the two terms under test
    go_id3 = 'GO:0048364'  # BP level-03 depth-04 root development
    go_id4 = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process
    go_root = deepest_common_ancestor([go_id3, go_id4], godag)
    sim = semantic_similarity(go_id3, go_id4, godag)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim))
    gosubdag.prt_goids([go_root, go_id3, go_id4])

    # Calculate the information content
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=go_id, INFO=infocontent))

    # Resnik's similarity measure is defined as the information content of
    # the most informative common ancestor, i.e. the most specific common
    # parent-term in the GO.
    sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim_r))

    # Lin similarity of the same pair
    sim_l = lin_sim(go_id3, go_id4, godag, termcounts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim_l))
def test_i148_semsim_lin(prt=sys.stdout):
    """Test for issue 148, Lin Similarity if a term has no annotations.

    Computes the Lin similarity of every pair of the listed GO IDs
    (including each ID paired with itself) from the CC annotations in
    ``goa_human.gpad`` and prints the values.

    Args:
        prt: stream handed to ``_prt_values`` for the report. It is now
            actually forwarded; previously ``sys.stdout`` was hard-coded
            and this argument was ignored.
    """
    fin_gpad = os.path.join(REPO, 'goa_human.gpad')
    # presumably fetches the annotation file when it is missing -- confirm
    dnld_annofile(fin_gpad, 'gpad')

    godag = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None)
    annoobj = GpadReader(fin_gpad, godag=godag)

    goids = [
        'GO:0042581',
        'GO:0101002',
        'GO:0042582',
        'GO:0070820',
        'GO:0008021',
        'GO:0005766',
        'GO:0016591',
    ]

    associations = annoobj.get_id2gos('CC')
    termcounts = TermCounts(godag, associations)

    # Calculate Lin values; frozenset keys make (a, b) and (b, a) one entry
    p2v = {
        frozenset([a, b]): lin_sim(a, b, godag, termcounts)
        for a, b in combo_w_rplc(goids, 2)
    }
    _prt_values(goids, p2v, prt=prt)
def test_semantic_similarity():
    """Compute basic similarities between two GO terms from a remote GAF.

    Parses ``go-basic.obo``, reads the arabidopsis associations from the
    geneontology.org URL, and prints the semantic, Resnik, and Lin
    similarity of GO:0048364 and GO:0044707 together with the information
    content of GO:0048364. Nothing is asserted.
    """
    dag = obo_parser.GODag("go-basic.obo")
    # Annotations from arabidopsis
    gaf_url = "http://geneontology.org/gene-associations/gene_association.tair.gz"
    gene2gos = read_gaf(gaf_url)

    # Semantic similarity of the two terms under test
    term_a = 'GO:0048364'  # BP level-03 depth-04 root development
    term_b = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process
    ids = {'GO1': term_a, 'GO2': term_b}
    sem_val = semantic_similarity(term_a, term_b, dag)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.format(
        VAL=sem_val, **ids))
    for goid in (term_a, term_b):
        print(dag[goid])

    # Information content of a single term needs the per-term counts first
    counts = TermCounts(dag, gene2gos)
    go_id = "GO:0048364"
    info = get_info_content(go_id, counts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=go_id, INFO=info))

    # Resnik: information content of the most informative common ancestor,
    # i.e. the most specific common parent term in the GO
    resnik = resnik_sim(term_a, term_b, dag, counts)
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(VAL=resnik, **ids))

    # Lin similarity of the same pair
    lin = lin_sim(term_a, term_b, dag, counts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(VAL=lin, **ids))
continue dots[state].setdefault(r.name, [None] * len(species)) dots[state][r.name][nspe] = r all_names = list(set(dots['Activated'].keys() + dots['Repressed'].keys())) go2name = dict([(r.GO, n) for r in dots['Activated'].get(n, dots['Repressed'].get(n, '')) if r][0] for n in all_names) name2go = dict([(n, r.GO) for r in dots['Activated'].get(n, dots['Repressed'].get(n, '')) if r][0] for n in all_names) termcount = TermCounts(obodag, geneid2gos) dist_matrix = [[((lin_sim(name2go[go1], name2go[go2], obodag, termcount) if go1 != go2 else 0) or 0) for go1 in all_names] for go2 in all_names] dist_cond = [ dist_matrix[i][j] for i in range(len(dist_matrix)) for j in range(i + 1, len(dist_matrix)) ] dist_matrix = [[(1 + dist_matrix[i][j]) if i != j else 0 for i in range(len(dist_matrix))] for j in range(len(dist_matrix))] for i in range(len(dist_matrix)): for j in range(i + 1, len(dist_matrix)): if dist_matrix[i][j] != dist_matrix[j][i]: print i, j, dist_matrix[i][j], dist_matrix[j][i]