def test_vector_space_wrapper(frame=None):
    """
    Check if VectorSpaceWrapper's index is sorted and its elements are
    concepts, for each of the ways a wrapper can be built: from a
    user-supplied frame, from a vector filename, and from a frame loaded
    from disk.
    """
    def check_wrapper(wrap):
        # Shared assertions for every construction path: after load(),
        # all labels (past index 0) are ConceptNet terms and the index
        # is sorted in increasing order.
        wrap.load()
        ok_(all(is_term(label) for label in wrap.frame.index[1:]))
        ok_(wrap.frame.index.is_monotonic_increasing)

    # Load a VSW from a user-supplied frame
    if frame:
        frame = load_any_embeddings(frame)
        check_wrapper(VectorSpaceWrapper(frame=frame))

    # Load a VSW from a filename
    vector_filename = DATA + '/vectors/glove12-840B.h5'
    check_wrapper(VectorSpaceWrapper(vector_filename=vector_filename))

    # Load a VSW from a frame
    frame = load_any_embeddings(DATA + '/vectors/glove12-840B.h5')
    check_wrapper(VectorSpaceWrapper(frame=frame))
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    ld = {
        '@id': uri,
        'label': uri_to_label(uri) if label is None else label,
    }
    if is_term(uri):
        uri_pieces = split_uri(uri)
        ld['language'] = get_uri_language(uri)
        if len(uri_pieces) > 3:
            ld['sense_label'] = '/'.join(uri_pieces[3:])
        ld['term'] = uri_prefix(uri)
        ld['@type'] = 'Node'
    elif uri.startswith('http'):
        domain = urlparse(uri).netloc
        ld['site'] = domain
        ld['term'] = uri
        # OpenCyc is down and UMBEL doesn't host their vocabulary on the
        # Web. This property indicates whether you can follow a link
        # via HTTP and retrieve more information.
        unreachable_domains = {'sw.opencyc.org', 'umbel.org'}
        ld['site_available'] = domain not in unreachable_domains
        ld['@type'] = 'Node'
    elif uri.startswith('/r/'):
        ld['@type'] = 'Relation'
    return ld
def test_standardize_row_labels():
    island_vec = TEST_FRAME.loc['island']
    capitalized_island_vec = TEST_FRAME.loc['Island']
    thing_vec = TEST_FRAME.loc['thing']
    standardized = standardize_row_labels(TEST_FRAME)

    # Every row label should now be a ConceptNet term.
    ok_(all(is_term(row_label) for row_label in standardized.index))

    # Labels that standardize to the same concept are merged into one row.
    ok_(standardized.index.is_unique)
    ok_('/c/en/Island' not in standardized.index)
    ok_('/c/en/island' in standardized.index)
    ok_('/c/en/thing' in standardized.index)

    # The merged '/c/en/island' row has combined values, so it should not
    # equal either of the original source vectors.
    merged_island = standardized.loc['/c/en/island']
    ok_(merged_island.equals(pd.Series([3.0, 3.0, 3.0])))
    ok_(not merged_island.equals(island_vec))
    ok_(not merged_island.equals(capitalized_island_vec))
    ok_(not standardized.loc['/c/en/thing'].equals(thing_vec))

    # Digit sequences should be replaced with '#' characters.
    ok_('/c/en/##' in standardized.index)
def test_vector_space_wrapper():
    """
    Check if VectorSpaceWrapper's index is sorted and its elements are
    concepts.
    """
    wrap = VectorSpaceWrapper(frame=TEST_FRAME)
    wrap.load()
    ok_(all(is_term(term) for term in wrap.frame.index))
    ok_(wrap.frame.index.is_monotonic_increasing)

    # Raw terms get only an English language tag -- no other transformation:
    ok_('/c/en/figure skater' in wrap.frame.index)  # no underscore
    ok_('/c/en/Island' in wrap.frame.index)  # no case folding

    # test index_prefix_range
    ok_(wrap.index_prefix_range('/c/en/figure') == (3, 6))
    ok_(wrap.index_prefix_range('/c/en/skating') == (0, 0))

    # test_similar_terms
    neighbors = wrap.similar_terms('/c/en/figure skating', limit=3).index
    ok_('/c/en/figure skating' in neighbors)
    ok_('/c/en/figure skater' in neighbors)
    ok_('/c/en/figure' in neighbors)
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    ld = {
        '@id': uri,
        'label': uri_to_label(uri) if label is None else label,
    }
    if is_term(uri):
        uri_pieces = split_uri(uri)
        ld['language'] = get_uri_language(uri)
        if len(uri_pieces) > 3:
            ld['sense_label'] = '/'.join(uri_pieces[3:])
        ld['term'] = uri_prefix(uri)
    elif uri.startswith('http'):
        ld['site'] = urlparse(uri).netloc
        ld['term'] = uri
    return ld
def test_standardize_row_labels(simple_frame):
    island_vec = simple_frame.loc['island']
    capitalized_island_vec = simple_frame.loc['Island']
    thing_vec = simple_frame.loc['thing']
    standardized = standardize_row_labels(simple_frame)

    # Every row label should now be a ConceptNet term.
    assert all(is_term(row_label) for row_label in standardized.index)

    # Labels that standardize to the same concept are merged into one row.
    assert standardized.index.is_unique
    assert '/c/en/Island' not in standardized.index
    assert '/c/en/island' in standardized.index
    assert '/c/en/thing' in standardized.index

    # The merged '/c/en/island' row has combined values, so it should not
    # equal either of the original source vectors.
    merged_island = standardized.loc['/c/en/island']
    assert merged_island.equals(pd.Series([3.0, 3.0, 3.0]))
    assert not merged_island.equals(island_vec)
    assert not merged_island.equals(capitalized_island_vec)
    assert not standardized.loc['/c/en/thing'].equals(thing_vec)

    # Digit sequences should be replaced with '#' characters.
    assert '/c/en/##' in standardized.index
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.

    If `label` is not given, a label is derived from the URI with
    `uri_to_label`.
    """
    if label is None:
        label = uri_to_label(uri)
    ld = {'@id': uri, 'label': label}
    if is_term(uri):
        pieces = split_uri(uri)
        ld['language'] = get_uri_language(uri)
        # Get a reasonably-distinct sense label for the term.
        # Usually it will be the part of speech, but when we have fine-grained
        # information from Wikipedia or WordNet, it'll include the last
        # component as well.
        if len(pieces) > 3:
            ld['sense_label'] = pieces[3]
            if len(pieces) > 4 and pieces[4] in ('wp', 'wn'):
                ld['sense_label'] += ', ' + pieces[-1]
        ld['term'] = uri_prefix(uri)
        ld['@type'] = 'Node'
    elif uri.startswith('http'):
        # Parse once; we need both the domain and the path below.
        parsed = urlparse(uri)
        domain = parsed.netloc
        ld['site'] = domain
        ld['term'] = uri
        # OpenCyc is down and UMBEL doesn't host their vocabulary on the
        # Web. This property indicates whether you can follow a link
        # via HTTP and retrieve more information.
        ld['site_available'] = domain not in {
            'sw.opencyc.org', 'umbel.org', 'wikidata.dbpedia.org'
        }
        ld['path'] = parsed.path
        ld['@type'] = 'Node'
    elif uri.startswith('/r/'):
        ld['@type'] = 'Relation'
    return ld
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    ld = {
        '@id': uri,
        'label': uri_to_label(uri) if label is None else label,
    }
    if is_term(uri):
        uri_pieces = split_uri(uri)
        ld['language'] = get_uri_language(uri)
        # Get a reasonably-distinct sense label for the term.
        # Usually it will be the part of speech, but when we have fine-grained
        # information from Wikipedia or WordNet, it'll include the last
        # component as well.
        if len(uri_pieces) > 3:
            sense = uri_pieces[3]
            if len(uri_pieces) > 4 and uri_pieces[4] in ('wp', 'wn'):
                sense += ', ' + uri_pieces[-1]
            ld['sense_label'] = sense
        ld['term'] = uri_prefix(uri)
        ld['@type'] = 'Node'
    elif uri.startswith('http'):
        domain = urlparse(uri).netloc
        ld['site'] = domain
        ld['term'] = uri
        # OpenCyc is down and UMBEL doesn't host their vocabulary on the
        # Web. This property indicates whether you can follow a link
        # via HTTP and retrieve more information.
        unreachable_domains = {'sw.opencyc.org', 'umbel.org'}
        ld['site_available'] = domain not in unreachable_domains
        ld['@type'] = 'Node'
    elif uri.startswith('/r/'):
        ld['@type'] = 'Relation'
    return ld
def test_standardize_row_labels(frame=None):
    # Fall back to the raw GloVe vectors in the test data directory.
    if not frame:
        frame = DATA + '/raw/vectors/glove12.840B.300d.txt.gz'
    vectors = load_any_embeddings(frame)
    island_vec = vectors.loc['island']
    capitalized_island_vec = vectors.loc['Island']
    things_vec = vectors.loc['things']
    standardized = standardize_row_labels(vectors)

    # Every label (past index 0) should be a ConceptNet term.
    ok_(all(is_term(row_label) for row_label in standardized.index[1:]))

    # Labels that standardize to the same concept are merged into one row,
    # so the standardized rows no longer equal the original vectors.
    ok_(standardized.index.is_unique)
    ok_('/c/en/Island' not in standardized.index)
    ok_('/c/en/island' in standardized.index)
    ok_('/c/en/thing' in standardized.index)
    ok_(not standardized.loc['/c/en/island'].equals(island_vec))
    ok_(not standardized.loc['/c/en/island'].equals(capitalized_island_vec))
    ok_(not standardized.loc['/c/en/things'].equals(things_vec))

    # Digit sequences should be replaced with '#' characters.
    ok_('/c/en/##' in standardized.index)