示例#1
0
def createMatrix(goTerms, background, method):
    """ Return a numerical matrix

    Keyword arguments:
    goTerms -- list of go terms
    background -- flattened background: lists of genes and GO Terms
    method -- semantic similarity method, either "Lin", "Resnik", "Wang" or "Edge-based"
    Creates semantic similarity matrix
    """
    termcounts = TermCounts(godag, background)
    matrix = list()
    wang_r1 = None
    if method == "Wang":
        wang_r1 = SsWang(goTerms, godag)
    # only create half of matrix, fill rest with -1
    i = 0
    for termA in goTerms:
        j = 0
        row = list()
        for termB in goTerms:
            sim = -1
            if i < j:
                if method == "Lin":
                    sim = lin_sim(termA, termB, godag, termcounts)
                elif method == "Resnik":
                    sim = resnik_sim(termA, termB, godag, termcounts)
                elif method == "Wang":
                    sim = wang_r1.get_sim(termA, termB)
                else:
                    sim = semantic_similarity(termA, termB, godag)
            row.append(sim)
            j += 1
        matrix.append(row)
        i += 1
    return matrix
def test_semantic_similarity():
    """Computing basic semantic similarities between GO terms."""
    godag = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None)
    # Get all the annotations from arabidopsis.
    associations = dnld_assc(os.path.join(REPO, 'tair.gaf'), godag)


    # Now we can calculate the semantic distance and semantic similarity, as so:
    #       "The semantic similarity between terms GO:0048364 and GO:0044707 is 0.25.
    go_id3 = 'GO:0048364' # BP level-03 depth-04 root development
    go_id4 = 'GO:0044707' # BP level-02 depth-02 single-multicellular organism process
    sim = semantic_similarity(go_id3, go_id4, godag)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim))
    print(godag[go_id3])
    print(godag[go_id4])

    # Then we can calculate the information content of the single term, <code>GO:0048364</code>.
    #       "Information content (GO:0048364) = 7.75481392334

    # First get the counts of each GO term.
    termcounts = TermCounts(godag, associations)

    # Calculate the information content
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=go_id, INFO=infocontent))
    assert infocontent, "FATAL INFORMATION CONTENT"

    # Resnik's similarity measure is defined as the information content of the most
    # informative common ancestor. That is, the most specific common parent-term in
    # the GO. Then we can calculate this as follows:
    #       Resnik similarity score (GO:0048364, GO:0044707) = 0.0 because DCA is BP top
    sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)
    dca = deepest_common_ancestor([go_id3, go_id4], godag)
    assert dca == NS2GO['BP']
    assert sim_r == get_info_content(dca, termcounts)
    assert sim_r == 0.0
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim_r))

    # Lin similarity score (GO:0048364, GO:0044707) = 0.0 because they are similar through BP top
    sim_l = lin_sim(go_id3, go_id4, godag, termcounts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(GO1=go_id3, GO2=go_id4, VAL=sim_l))
    assert sim_l == 0.0, "FATAL LIN SCORE"

    # 
    go_top_cc = NS2GO['CC']
    sim_r = resnik_sim(go_top_cc, go_top_cc, godag, termcounts)
    assert sim_r == 0.0
    sim_l = lin_sim(go_top_cc, go_top_cc, godag, termcounts)
    assert sim_l == 1.0
示例#3
0
def test_semantic_i150():
    """Test that comparing two identical GO IDs returns true"""
    fin_dag = os.path.join(REPO, 'tests/data/yangRWC/fig1a.obo')
    ## fin_gaf = os.path.join(REPO, 'tests/data/yangRWC/fig2a_nonleaf0.gaf')
    # Read files
    godag = GODag(fin_dag)
    ## objanno = GafReader(fin_gaf)
    ## gene2gos = objanno.get_id2gos(namespace='CC')
    ## # Termcounts
    ## termcounts = TermCounts(godag, gene2gos, prt=sys.stdout)
    # Compare all GO terms with itself
    for goterm in set(godag.values()):
        goid = goterm.item_id
        assert semantic_similarity(goid, goid, godag) == 1.0
示例#4
0
def test_semantic_similarity():
    """Computing basic semantic similarities between GO terms."""
    godag = get_godag(os.path.join(os.getcwd(), "go-basic.obo"),
                      loading_bar=None)
    # Get all the annotations from arabidopsis.
    associations = dnld_assc(
        os.path.join(os.getcwd(), 'gene_association.tair'), godag)

    # Now we can calculate the semantic distance and semantic similarity, as so:
    #       "The semantic similarity between terms GO:0048364 and GO:0044707 is 0.25.
    go_id3 = 'GO:0048364'  # BP level-03 depth-04 root development
    go_id4 = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process
    sim = semantic_similarity(go_id3, go_id4, godag)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.
          format(GO1=go_id3, GO2=go_id4, VAL=sim))
    print(godag[go_id3])
    print(godag[go_id4])

    # Then we can calculate the information content of the single term, <code>GO:0048364</code>.
    #       "Information content (GO:0048364) = 7.75481392334

    # First get the counts of each GO term.
    termcounts = TermCounts(godag, associations)

    # Calculate the information content
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=go_id,
                                                           INFO=infocontent))
    assert infocontent, "FATAL INFORMATION CONTENT"

    # Resnik's similarity measure is defined as the information content of the most
    # informative common ancestor. That is, the most specific common parent-term in
    # the GO. Then we can calculate this as follows:
    #       "Resnik similarity score (GO:0048364, GO:0044707) = 4.0540784252
    sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(GO1=go_id3,
                                                                  GO2=go_id4,
                                                                  VAL=sim_r))
    assert sim_r, "FATAL RESNIK SCORE"

    # Lin similarity score (GO:0048364, GO:0044707) = -0.607721957763
    sim_l = lin_sim(go_id3, go_id4, godag, termcounts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(GO1=go_id3,
                                                               GO2=go_id4,
                                                               VAL=sim_l))
    assert sim_l, "FATAL LIN SCORE"
示例#5
0
def test_semantic_i88():
    """Computing basic semantic similarities between GO terms."""
    godag = obo_parser.GODag("go-basic.obo")
    goids = set(go for go, o in godag.items() if go == o.id)
    goids = set(godag.keys())
    # Get all the annotations from arabidopsis.
    fin_gaf = os.path.join(REPO, "tair.gaf")
    # dnld_assc includes read_gaf
    associations = dnld_assc(fin_gaf, godag, prt=None)

    # First get the counts and information content for each GO term.
    termcounts = TermCounts(godag, associations)
    gosubdag = GoSubDag(goids, godag, tcntobj=termcounts)

    # Now we can calculate the semantic distance and semantic similarity, as so:
    #       "The semantic similarity between terms GO:0048364 and GO:0044707 is 0.25.
    go_id3 = 'GO:0048364'  # BP level-03 depth-04 root development
    go_id4 = 'GO:0044707'  # BP level-02 depth-02 single-multicellular organism process
    go_root = deepest_common_ancestor([go_id3, go_id4], godag)
    sim = semantic_similarity(go_id3, go_id4, godag)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.
          format(GO1=go_id3, GO2=go_id4, VAL=sim))
    gosubdag.prt_goids([go_root, go_id3, go_id4])

    # Calculate the information content
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=go_id,
                                                           INFO=infocontent))

    # Resnik's similarity measure is defined as the information content of the most
    # informative common ancestor. That is, the most specific common parent-term in
    # the GO. Then we can calculate this as follows:
    #       "Resnik similarity score (GO:0048364, GO:0044707) = 4.0540784252
    sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(GO1=go_id3,
                                                                  GO2=go_id4,
                                                                  VAL=sim_r))

    # Lin similarity score (GO:0048364, GO:0044707) = -0.607721957763
    sim_l = lin_sim(go_id3, go_id4, godag, termcounts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(GO1=go_id3,
                                                               GO2=go_id4,
                                                               VAL=sim_l))
def test_semantic_similarity():
    """Computing basic semantic similarities between GO terms."""
    godag = get_godag(os.path.join(REPO, "go-basic.obo"), loading_bar=None)
    # Get all the annotations from arabidopsis.
    associations = dnld_assc(os.path.join(REPO, 'tair.gaf'), godag)


    # Now we can calculate the semantic distance and semantic similarity, as so:
    #       "The semantic similarity between terms GO:0048364 and GO:0044707 is 0.25.
    go_id3 = 'GO:0048364' # BP level-03 depth-04 root development
    go_id4 = 'GO:0044707' # BP level-02 depth-02 single-multicellular organism process
    sim = semantic_similarity(go_id3, go_id4, godag)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim))
    print(godag[go_id3])
    print(godag[go_id4])

    # Then we can calculate the information content of the single term, <code>GO:0048364</code>.
    #       "Information content (GO:0048364) = 7.75481392334

    # First get the counts of each GO term.
    termcounts = TermCounts(godag, associations)

    # Calculate the information content
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=go_id, INFO=infocontent))
    assert infocontent, "FATAL INFORMATION CONTENT"

    # Resnik's similarity measure is defined as the information content of the most
    # informative common ancestor. That is, the most specific common parent-term in
    # the GO. Then we can calculate this as follows:
    #       "Resnik similarity score (GO:0048364, GO:0044707) = 4.0540784252
    sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim_r))
    assert sim_r, "FATAL RESNIK SCORE"

    # Lin similarity score (GO:0048364, GO:0044707) = -0.607721957763
    sim_l = lin_sim(go_id3, go_id4, godag, termcounts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(GO1=go_id3, GO2=go_id4, VAL=sim_l))
    assert sim_l, "FATAL LIN SCORE"
def _test_path_bp_mf(branch_dist, godag, prt):
    """Test distances between BP branch and MF branch."""
    go_mf = 'GO:0003676'  # level-03 depth-03 nucleic acid binding [molecular_function]
    go_bp = 'GO:0007516'  # level-04 depth-05 hemocyte development [biological_process]
    dst_none = semantic_distance(go_mf, go_bp, godag)
    sim_none = semantic_similarity(go_mf, go_bp, godag)
    assc = dnld_assc("gene_association.tair", godag)
    termcounts = TermCounts(godag, assc)
    fmt = '({GO1}, {GO2}) {TYPE:6} score = {VAL}\n'
    sim_r = resnik_sim(go_mf, go_bp, godag, termcounts)
    sim_l = lin_sim(go_mf, go_bp, godag, termcounts)
    if prt is not None:
        prt.write(
            fmt.format(TYPE='semantic distance',
                       GO1=go_mf,
                       GO2=go_bp,
                       VAL=dst_none))
        prt.write(
            fmt.format(TYPE='semantic similarity',
                       GO1=go_mf,
                       GO2=go_bp,
                       VAL=sim_none))
        prt.write(
            fmt.format(TYPE='Resnik similarity',
                       GO1=go_mf,
                       GO2=go_bp,
                       VAL=sim_r))
        prt.write(
            fmt.format(TYPE='Lin similarity', GO1=go_mf, GO2=go_bp, VAL=sim_l))
    assert dst_none is None
    assert sim_none is None
    assert sim_r is None
    assert sim_l is None
    sim_d = semantic_distance(go_mf, go_bp, godag, branch_dist)
    if prt is not None:
        prt.write(
            fmt.format(TYPE='semantic distance',
                       GO1=go_mf,
                       GO2=go_bp,
                       VAL=sim_d))
    assert sim_d == godag[go_mf].depth + godag[go_bp].depth + branch_dist
示例#8
0
def test_semantic_i88():
    """Computing basic semantic similarities between GO terms."""
    godag = obo_parser.GODag("go-basic.obo")
    goids = set(go for go, o in godag.items() if go == o.id)
    goids = set(godag.keys())
    # Get all the annotations from arabidopsis.
    fin_gaf = os.path.join(REPO, "tair.gaf")
    # dnld_assc includes read_gaf
    associations = dnld_assc(fin_gaf, godag, prt=None)

    # First get the counts and information content for each GO term.
    termcounts = TermCounts(godag, associations)
    gosubdag = GoSubDag(goids, godag, tcntobj=termcounts)

    # Now we can calculate the semantic distance and semantic similarity, as so:
    #       "The semantic similarity between terms GO:0048364 and GO:0044707 is 0.25.
    go_id3 = 'GO:0048364' # BP level-03 depth-04 root development
    go_id4 = 'GO:0044707' # BP level-02 depth-02 single-multicellular organism process
    go_root = deepest_common_ancestor([go_id3, go_id4], godag)
    sim = semantic_similarity(go_id3, go_id4, godag)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim))
    gosubdag.prt_goids([go_root, go_id3, go_id4])

    # Calculate the information content
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=go_id, INFO=infocontent))

    # Resnik's similarity measure is defined as the information content of the most
    # informative common ancestor. That is, the most specific common parent-term in
    # the GO. Then we can calculate this as follows:
    #       "Resnik similarity score (GO:0048364, GO:0044707) = 4.0540784252
    sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim_r))

    # Lin similarity score (GO:0048364, GO:0044707) = -0.607721957763
    sim_l = lin_sim(go_id3, go_id4, godag, termcounts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim_l))
def test_semantic_similarity():
    """Computing basic semantic similarities between GO terms."""
    godag = obo_parser.GODag("go-basic.obo")
    # Get all the annotations from arabidopsis.
    associations = read_gaf("http://geneontology.org/gene-associations/gene_association.tair.gz")


    # Now we can calculate the semantic distance and semantic similarity, as so:
    #       "The semantic similarity between terms GO:0048364 and GO:0044707 is 0.25.
    go_id3 = 'GO:0048364' # BP level-03 depth-04 root development
    go_id4 = 'GO:0044707' # BP level-02 depth-02 single-multicellular organism process
    sim = semantic_similarity(go_id3, go_id4, godag)
    print('\nThe semantic similarity between terms {GO1} and {GO2} is {VAL}.'.format(
        GO1=go_id3, GO2=go_id4, VAL=sim))
    print(godag[go_id3])
    print(godag[go_id4])

    # Then we can calculate the information content of the single term, <code>GO:0048364</code>.
    #       "Information content (GO:0048364) = 7.75481392334

    # First get the counts of each GO term.
    termcounts = TermCounts(godag, associations)

    # Calculate the information content
    go_id = "GO:0048364"
    infocontent = get_info_content(go_id, termcounts)
    print('\nInformation content ({GO}) = {INFO}\n'.format(GO=go_id, INFO=infocontent))

    # Resnik's similarity measure is defined as the information content of the most
    # informative common ancestor. That is, the most specific common parent-term in
    # the GO. Then we can calculate this as follows:
    #       "Resnik similarity score (GO:0048364, GO:0044707) = 4.0540784252
    sim_r = resnik_sim(go_id3, go_id4, godag, termcounts)
    print('Resnik similarity score ({GO1}, {GO2}) = {VAL}'.format(GO1=go_id3, GO2=go_id4, VAL=sim_r))

    # Lin similarity score (GO:0048364, GO:0044707) = -0.607721957763
    sim_l = lin_sim(go_id3, go_id4, godag, termcounts)
    print('Lin similarity score ({GO1}, {GO2}) = {VAL}'.format(GO1=go_id3, GO2=go_id4, VAL=sim_l))
示例#10
0
def check_term_list(list_of_terms, verbose=False):
    """

    Parameters
    ----------
    list_of_terms : list
    verbose : bool

    Returns
    -------

    """
    list_of_terms = check_depth_and_children(list_of_terms)
    combos = itertools.combinations(list_of_terms, 2)
    to_remove = set()
    for i, j in combos:

        sim2 = semantic_similarity(i, j, go)
        dca = deepest_common_ancestor([i, j])

        sim = resnik_sim(i, j, go, termcounts)
        ic_i = ic(i, termcounts)
        ic_j = ic(j, termcounts)
        ic_dca = ic(dca, termcounts)

        if verbose:
            print("\nGO 1\t\t GO 2\t\t GO3")
            print("Min branch length = {}".format(min_branch_length(i, j)))
            print("{}\t{}\t{}".format(go[i].name, go[j].name, go[dca].name))
            print("{}\t{}\t{}".format(i, j, dca))
            print("{}\t{}\t{}".format(go[i].depth, go[j].depth, go[dca].depth))
            print("Resnik {}\tSemantic {}".format(sim, sim2))
            # print("{}\t{}\n".format(sim, sim2))

            print("IC\t\t{}\t\t\t{}\t\t\t{}".format(ic_i, ic_j, ic_dca))

        # remove if two terms of similar by distance in graph
        if sim2 > .5:
            if go[i].depth > go[j].depth:
                to_remove.add(i)
            else:
                to_remove.add(j)
        # remove if deepest common ancestor has at least 4 IC
        elif sim > 9:
            if go[dca].depth < 3:
                continue
            list_of_terms.add(dca)
            to_remove.add(i)
            to_remove.add(j)
            if dca in to_remove or dca in list_of_terms:
                list_of_terms.add(dca)
                to_remove.add(i)
                to_remove.add(j)
    if verbose:
        print(to_remove)
        print('Number of term before = {}'.format(len(list_of_terms)))

    list_of_terms = list_of_terms.difference(to_remove)

    if verbose:
        print('Number of term after = {}'.format(len(list_of_terms)))

    return list_of_terms
示例#11
0
def check_term_list(list_of_terms, verbose=False):
    """

    Parameters
    ----------
    list_of_terms : list
    verbose : bool

    Returns
    -------

    """
    list_of_terms = check_depth_and_children(list_of_terms)
    combos = itertools.combinations(list_of_terms, 2)
    to_remove = set()
    for i, j in combos:

        sim2 = semantic_similarity(i, j, go)
        dca = deepest_common_ancestor([i, j])

        sim = resnik_sim(i, j, go, termcounts)
        ic_i = ic(i, termcounts)
        ic_j = ic(j, termcounts)
        ic_dca = ic(dca, termcounts)

        if verbose:
            print("\nGO 1\t\t GO 2\t\t GO3")
            print("Min branch length = {}".format(min_branch_length(i, j)))
            print("{}\t{}\t{}".format(go[i].name, go[j].name, go[dca].name))
            print("{}\t{}\t{}".format(i, j, dca))
            print("{}\t{}\t{}".format(go[i].depth, go[j].depth, go[dca].depth))
            print("Resnik {}\tSemantic {}".format(sim, sim2))
            # print("{}\t{}\n".format(sim, sim2))

            print("IC\t\t{}\t\t\t{}\t\t\t{}".format(ic_i, ic_j, ic_dca))

        # remove if two terms of similar by distance in graph
        if sim2 > .5:
            if go[i].depth > go[j].depth:
                to_remove.add(i)
            else:
                to_remove.add(j)
        # remove if deepest common ancestor has at least 4 IC
        elif sim > 9:
            if go[dca].depth < 3:
                continue
            list_of_terms.add(dca)
            to_remove.add(i)
            to_remove.add(j)
            if dca in to_remove or dca in list_of_terms:
                list_of_terms.add(dca)
                to_remove.add(i)
                to_remove.add(j)
    if verbose:
        print(to_remove)
        print('Number of term before = {}'.format(len(list_of_terms)))

    list_of_terms = list_of_terms.difference(to_remove)

    if verbose:
        print('Number of term after = {}'.format(len(list_of_terms)))

    return list_of_terms