Example #1
def test_vector_space_wrapper(frame=None):
    """
    Check if VectorSpaceWrapper's index is sorted and its elements are concepts.
    """

    # Load a VSW from a user-supplied embeddings file
    if frame:
        frame = load_any_embeddings(frame)
        wrap = VectorSpaceWrapper(frame=frame)
        wrap.load()
        ok_(all(is_term(label) for label in wrap.frame.index[1:]))
        ok_(wrap.frame.index.is_monotonic_increasing)

    # Load a VSW from a filename
    vector_filename = DATA + '/vectors/glove12-840B.h5'
    wrap = VectorSpaceWrapper(vector_filename=vector_filename)
    wrap.load()
    ok_(all(is_term(label) for label in wrap.frame.index[1:]))
    ok_(wrap.frame.index.is_monotonic_increasing)

    # Load a VSW from a frame
    frame = load_any_embeddings(DATA + '/vectors/glove12-840B.h5')
    wrap = VectorSpaceWrapper(frame=frame)
    wrap.load()
    ok_(all(is_term(label) for label in wrap.frame.index[1:]))
    ok_(wrap.frame.index.is_monotonic_increasing)
Example #2
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    if label is None:
        label = uri_to_label(uri)
    ld = {'@id': uri, 'label': label}
    if is_term(uri):
        pieces = split_uri(uri)
        ld['language'] = get_uri_language(uri)
        if len(pieces) > 3:
            ld['sense_label'] = '/'.join(pieces[3:])
        ld['term'] = uri_prefix(uri)
        ld['@type'] = 'Node'
    elif uri.startswith('http'):
        domain = urlparse(uri).netloc
        ld['site'] = domain
        ld['term'] = uri

        # OpenCyc is down and UMBEL doesn't host their vocabulary on the
        # Web. This property indicates whether you can follow a link
        # via HTTP and retrieve more information.
        ld['site_available'] = domain not in {'sw.opencyc.org', 'umbel.org'}
        ld['@type'] = 'Node'
    elif uri.startswith('/r/'):
        ld['@type'] = 'Relation'
    return ld
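
A hedged usage sketch of the version above, using hypothetical URIs. The labels are passed explicitly, so the only assumptions are the outputs of split_uri, get_uri_language, and uri_prefix; the dictionaries in the comments are illustrative expectations, not verified output.

# Illustrative only: 'term' assumes uri_prefix() truncates a concept URI
# to its /c/<language>/<text> prefix.
node = ld_node('/c/en/example/n', label='example')
# Expected shape (assumption):
# {'@id': '/c/en/example/n', 'label': 'example', 'language': 'en',
#  'sense_label': 'n', 'term': '/c/en/example', '@type': 'Node'}

ext = ld_node('http://umbel.org/umbel/rc/Example', label='Example')
# Expected shape (assumption): 'site' is 'umbel.org', 'term' is the URL
# itself, 'site_available' is False because 'umbel.org' is in the
# unavailable set above, and '@type' is 'Node'.
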
Example #3
def test_standardize_row_labels():
    vec1 = TEST_FRAME.loc['island']
    vec2 = TEST_FRAME.loc['Island']
    vec3 = TEST_FRAME.loc['thing']
    standardized_vectors = standardize_row_labels(TEST_FRAME)

    # Check if all labels are terms
    ok_(all(is_term(label) for label in standardized_vectors.index))

    # Check if all terms standardized to the same concept are merged
    ok_(standardized_vectors.index.is_unique)
    ok_('/c/en/Island' not in standardized_vectors.index)
    ok_('/c/en/island' in standardized_vectors.index)
    ok_('/c/en/thing' in standardized_vectors.index)
    ok_(standardized_vectors.loc['/c/en/island'].equals(pd.Series([3.0, 3.0, 3.0])))
    ok_(not standardized_vectors.loc['/c/en/island'].equals(vec1))
    ok_(not standardized_vectors.loc['/c/en/island'].equals(vec2))
    ok_(not standardized_vectors.loc['/c/en/thing'].equals(vec3))

    # Check if numbers are substituted with '#'
    ok_('/c/en/##' in standardized_vectors.index)
Example #4
def test_vector_space_wrapper():
    """
    Check if VectorSpaceWrapper's index is sorted and its elements are concepts.
    """
    wrap = VectorSpaceWrapper(frame=TEST_FRAME)
    wrap.load()
    ok_(all(is_term(label) for label in wrap.frame.index))
    ok_(wrap.frame.index.is_monotonic_increasing)

    # Test that the only transformation applied to raw terms is adding the English language tag
    ok_('/c/en/figure skater' in wrap.frame.index) # no underscore
    ok_('/c/en/Island' in wrap.frame.index) # no case folding

    # test index_prefix_range
    ok_(wrap.index_prefix_range('/c/en/figure') == (3, 6))
    ok_(wrap.index_prefix_range('/c/en/skating') == (0, 0))

    # Test similar_terms
    ok_('/c/en/figure skating' in wrap.similar_terms('/c/en/figure skating', limit=3).index)
    ok_('/c/en/figure skater' in wrap.similar_terms('/c/en/figure skating', limit=3).index)
    ok_('/c/en/figure' in wrap.similar_terms('/c/en/figure skating', limit=3).index)
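
The assertions above rely only on the .index of whatever similar_terms returns. A hedged interactive sketch, where the ordering and any similarity scores are assumptions rather than guarantees of the test:

wrap = VectorSpaceWrapper(frame=TEST_FRAME)
wrap.load()
neighbors = wrap.similar_terms('/c/en/figure skating', limit=3)
# The three terms below are the ones asserted above; their order is not
# guaranteed by the test.
print(list(neighbors.index))
# e.g. ['/c/en/figure skating', '/c/en/figure skater', '/c/en/figure']
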
Example #5
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    if label is None:
        label = uri_to_label(uri)
    ld = {
        '@id': uri,
        'label': label
    }
    if is_term(uri):
        pieces = split_uri(uri)
        ld['language'] = get_uri_language(uri)
        if len(pieces) > 3:
            ld['sense_label'] = '/'.join(pieces[3:])
        ld['term'] = uri_prefix(uri)
    elif uri.startswith('http'):
        domain = urlparse(uri).netloc
        ld['site'] = domain
        ld['term'] = uri
    return ld
Example #6
def test_standardize_row_labels(simple_frame):
    vec1 = simple_frame.loc['island']
    vec2 = simple_frame.loc['Island']
    vec3 = simple_frame.loc['thing']
    standardized_vectors = standardize_row_labels(simple_frame)

    # Check if all labels are terms
    assert all(is_term(label) for label in standardized_vectors.index)

    # Check if all terms standardized to the same concept are merged
    assert standardized_vectors.index.is_unique
    assert '/c/en/Island' not in standardized_vectors.index
    assert '/c/en/island' in standardized_vectors.index
    assert '/c/en/thing' in standardized_vectors.index
    assert standardized_vectors.loc['/c/en/island'].equals(
        pd.Series([3.0, 3.0, 3.0]))
    assert not standardized_vectors.loc['/c/en/island'].equals(vec1)
    assert not standardized_vectors.loc['/c/en/island'].equals(vec2)
    assert not standardized_vectors.loc['/c/en/thing'].equals(vec3)

    # Check if numbers are substituted with '#'
    assert '/c/en/##' in standardized_vectors.index
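
A minimal sketch of a simple_frame fixture with the row labels this test relies on: case variants of 'island', variants of 'thing', and a numeric label that should standardize to '/c/en/##'. The values are placeholders chosen under the assumption that merged rows are averaged; exactly how standardize_row_labels combines them is not shown here, so the project's actual fixture will differ.

import pandas as pd
import pytest

@pytest.fixture
def simple_frame():
    # Hypothetical fixture: 'island' and 'Island' should both standardize
    # to '/c/en/island'; 'Thing' merges with 'thing' so that the combined
    # row no longer equals the original 'thing' vector; '24' should become
    # '/c/en/##'. The numbers assume merged rows are averaged.
    data = {
        'island': [2.0, 2.0, 2.0],
        'Island': [4.0, 4.0, 4.0],
        'thing': [1.0, 0.0, 1.0],
        'Thing': [3.0, 2.0, 1.0],
        '24': [0.0, 1.0, 0.0],
    }
    return pd.DataFrame.from_dict(data, orient='index')
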
Example #7
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    if label is None:
        label = uri_to_label(uri)
    ld = {'@id': uri, 'label': label}
    if is_term(uri):
        pieces = split_uri(uri)
        ld['language'] = get_uri_language(uri)

        # Get a reasonably-distinct sense label for the term.
        # Usually it will be the part of speech, but when we have fine-grained
        # information from Wikipedia or WordNet, it'll include the last
        # component as well.
        if len(pieces) > 3:
            ld['sense_label'] = pieces[3]

        if len(pieces) > 4 and pieces[4] in ('wp', 'wn'):
            ld['sense_label'] += ', ' + pieces[-1]

        ld['term'] = uri_prefix(uri)
        ld['@type'] = 'Node'
    elif uri.startswith('http'):
        domain = urlparse(uri).netloc
        ld['site'] = domain
        ld['term'] = uri

        # OpenCyc is down and UMBEL doesn't host their vocabulary on the
        # Web. This property indicates whether you can follow a link
        # via HTTP and retrieve more information.
        ld['site_available'] = True
        if domain in {'sw.opencyc.org', 'umbel.org', 'wikidata.dbpedia.org'}:
            ld['site_available'] = False
        ld['path'] = urlparse(uri).path
        ld['@type'] = 'Node'
    elif uri.startswith('/r/'):
        ld['@type'] = 'Relation'
    return ld
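
Tracing the sense-label and external-URI branches of the version above with hypothetical inputs; labels are supplied explicitly, so only split_uri and urlparse behavior is relied on.

# Hypothetical WordNet-style URI: split_uri gives
# ['c', 'en', 'cat', 'n', 'wn', 'animal'], so sense_label is first set to
# 'n' and then extended to 'n, animal' by the pieces[4] == 'wn' branch.
wn_node = ld_node('/c/en/cat/n/wn/animal', label='cat')

# Hypothetical external URI: 'sw.opencyc.org' is in the unavailable set,
# so site_available is False and 'path' holds '/concept/example'.
cyc_node = ld_node('http://sw.opencyc.org/concept/example', label='example')
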
Example #8
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    if label is None:
        label = uri_to_label(uri)
    ld = {'@id': uri, 'label': label}
    if is_term(uri):
        pieces = split_uri(uri)
        ld['language'] = get_uri_language(uri)

        # Get a reasonably-distinct sense label for the term.
        # Usually it will be the part of speech, but when we have fine-grained
        # information from Wikipedia or WordNet, it'll include the last
        # component as well.
        if len(pieces) > 3:
            ld['sense_label'] = pieces[3]

        if len(pieces) > 4 and pieces[4] in ('wp', 'wn'):
            ld['sense_label'] += ', ' + pieces[-1]

        ld['term'] = uri_prefix(uri)
        ld['@type'] = 'Node'
    elif uri.startswith('http'):
        domain = urlparse(uri).netloc
        ld['site'] = domain
        ld['term'] = uri

        # OpenCyc is down and UMBEL doesn't host their vocabulary on the
        # Web. This property indicates whether you can follow a link
        # via HTTP and retrieve more information.
        ld['site_available'] = domain not in {'sw.opencyc.org', 'umbel.org'}
        ld['@type'] = 'Node'
    elif uri.startswith('/r/'):
        ld['@type'] = 'Relation'
    return ld
Example #9
def test_standardize_row_labels(frame=None):
    if not frame:
        frame = DATA + '/raw/vectors/glove12.840B.300d.txt.gz'
    vectors = load_any_embeddings(frame)

    vec1 = vectors.loc['island']
    vec2 = vectors.loc['Island']
    vec3 = vectors.loc['things']
    standardized_vectors = standardize_row_labels(vectors)

    # Check if all labels are terms
    ok_(all(is_term(label) for label in standardized_vectors.index[1:]))

    # Check if all terms standardized to the same concept are merged
    ok_(standardized_vectors.index.is_unique)
    ok_('/c/en/Island' not in standardized_vectors.index)
    ok_('/c/en/island' in standardized_vectors.index)
    ok_('/c/en/thing' in standardized_vectors.index)
    ok_(not standardized_vectors.loc['/c/en/island'].equals(vec1))
    ok_(not standardized_vectors.loc['/c/en/island'].equals(vec2))
    ok_(not standardized_vectors.loc['/c/en/thing'].equals(vec3))

    # Check if numbers are substituted with '#'
    ok_('/c/en/##' in standardized_vectors.index)