def merge_intersect(frames, subsample=20, k=300):
    """
    Combine the vector knowledge contained in `frames` over the vocabulary
    that they agree on, and use dimensionality reduction to mitigate the
    redundancy of learning the same thing multiple ways.

    If their vocabularies result from retrofitting, then the resulting
    vocabulary will be the vocabulary of the retrofit knowledge graph, plus
    any other terms that happen to be in all of the frames.
    """
    # Find the intersected vocabulary of the frames, and concatenate their
    # vectors over that vocabulary.
    joined = concat_intersect(frames)

    # Find a subset of the labels that we'll use for calculating the
    # dimensionality-reduced version. The labels we particularly care about
    # are single words in our CORE_LANGUAGES. Even those are too numerous,
    # so we take an arbitrary 1/n sample of them, where n is given by the
    # `subsample` parameter.
    filtered_labels = pd.Series([
        label for (i, label) in enumerate(joined.index)
        if i % subsample == 0
        and '_' not in label
        and get_language(label) in CORE_LANGUAGES
    ])

    # Mean-center and L_2-normalize the data, to prevent artifacts
    # in dimensionality reduction.
    adjusted = joined.loc[filtered_labels]
    adjusted -= joined.mean(0)
    normalize(adjusted.values, norm='l2', copy=False)

    # The SVD of this normalized matrix will give us its projection into
    # a lower-dimensional space (`projected`), as well as the operator that
    # performs that projection (`projection`) and the relative weights of the
    # columns (`eigenvalues`).
    projected, eigenvalues, projection = dataframe_svd_projection(adjusted, k)

    # We don't actually need this smaller matrix or its projection anymore;
    # what we learned is how to project _any_ matrix into this space.
    del adjusted
    del projected

    # Project the original `joined` matrix into this space using the
    # `projection` operator.
    reprojected = joined.dot(projection)
    del joined

    # `projection` (V) is an orthogonal matrix, so when we multiply by it, we
    # get a `reprojected` that approximately preserves distances (U * Σ).
    #
    # But these distances reflect redundant features among the input matrices.
    # To mitigate this redundancy, and to match Levy and Goldberg's observation
    # that U * Σ ** (1/2) is a better SVD projection for word-representation
    # purposes than U * Σ, we divide by Σ ** (1/2).
    np.divide(reprojected.values, eigenvalues ** .5, out=reprojected.values)
    normalize(reprojected.values, norm='l2', copy=False)

    # Return our unified vectors, and the projection that could map other
    # concatenated vectors into the same vector space.
    return reprojected, projection
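
# A minimal sketch (not the pipeline code above) of the reweighting step,
# using plain NumPy arrays in place of DataFrames and of the
# `dataframe_svd_projection` helper. If M ≈ U Σ Vᵀ, then M · V = U Σ, and
# dividing each column by sqrt(Σ) leaves U Σ ** (1/2), the Levy-and-Goldberg
# style projection described in the comments above.
import numpy as np


def svd_projection_sketch(matrix, k):
    # matrix: a 2-D float array; k: number of dimensions to keep
    u, s, vt = np.linalg.svd(matrix, full_matrices=False)
    projection = vt[:k].T                 # plays the role of `projection` (V)
    eigenvalues = s[:k]                   # plays the role of `eigenvalues` (Σ)
    reprojected = (matrix @ projection) / np.sqrt(eigenvalues)  # U * Σ ** (1/2)
    return reprojected, projection
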
def expand_terms(self, terms, limit_per_term=10, include_neighbors=True):
    """
    Given a list of weighted terms as (term, weight) tuples, add terms that
    are one step away in ConceptNet at a lower weight, terms in English that
    share the surface form with these terms, and, if a term is out of
    vocabulary, terms that share a prefix with it.

    This increases the recall power of the vector space: a term that is too
    infrequent to have its own vector can still be looked up through its
    neighbors, which gives a reasonable approximation of the vector that
    term would have had anyway.
    """
    self.load()
    expanded = terms[:]
    for term, weight in terms:
        if include_neighbors and term not in self.frame.index and self.finder is not None:
            for edge in self.finder.lookup(term, limit=limit_per_term):
                if field_match(edge['start']['term'], term) and not field_match(
                        edge['end']['term'], term):
                    neighbor = edge['end']['term']
                elif field_match(edge['end']['term'], term) and not field_match(
                        edge['start']['term'], term):
                    neighbor = edge['start']['term']
                else:
                    continue
                # Scale the neighbor's weight by the edge weight, capped at
                # 10, then take 1% of the product, so a neighbor receives at
                # most 10% of the original term's weight.
                neighbor_weight = weight * min(10, edge['weight']) * 0.01
                expanded.append((neighbor, neighbor_weight))

        prefix_weight = 0.01
        if get_language(term) != 'en':
            englishified = '/c/en/' + split_uri(term)[2]
            expanded.append((englishified, prefix_weight))

        while term:
            # Skip excessively general lookups, for either an entire
            # language, or all terms starting with a single
            # non-ideographic letter
            if term.endswith('/') or (term[-2] == '/' and term[-1] < chr(0x3000)):
                break
            prefixed = self.terms_with_prefix(term)
            if prefixed:
                n_prefixed = len(prefixed)
                for prefixed_term in prefixed:
                    expanded.append((prefixed_term, prefix_weight / n_prefixed))
                break
            term = term[:-1]

    total_weight = sum(abs(weight) for term, weight in expanded)
    if total_weight == 0:
        return []
    else:
        return [
            (uri_prefix(term), weight / total_weight)
            for (term, weight) in expanded
        ]
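
# A small, self-contained illustration of the neighbor-weight formula used in
# expand_terms above. The edge weights here are made-up example values.
def neighbor_weight(term_weight, edge_weight):
    # Cap the edge weight at 10, then take 1% of the product, so a neighbor
    # never receives more than 10% of the original term's weight.
    return term_weight * min(10, edge_weight) * 0.01


assert abs(neighbor_weight(1.0, 25.0) - 0.10) < 1e-9  # capped at 10% of the weight
assert abs(neighbor_weight(1.0, 2.0) - 0.02) < 1e-9   # weaker edges pass along less
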
def make_big_frame(frame, language):
    """
    Choose the vocabulary for the big frame and build it, keeping only the
    terms that are in the specified language.
    """
    vocabulary = [term for term in frame.index if get_language(term) == language]
    big_frame = frame.loc[vocabulary]
    return big_frame
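
# A quick, made-up usage sketch of the filtering above, with a stand-in for
# conceptnet5's get_language (the language is the second piece of a
# '/c/<lang>/<term>' URI):
import pandas as pd


def get_language_sketch(uri):
    return uri.split('/')[2]


frame = pd.DataFrame(0., index=['/c/en/cat', '/c/fr/chat'], columns=range(2))
english_rows = [term for term in frame.index if get_language_sketch(term) == 'en']
print(frame.loc[english_rows])  # only '/c/en/cat' remains
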
def merge_intersect(frames, subsample=20, vocab_cutoff=200000, k=300):
    """
    Combine the vector knowledge in `frames` over their intersected
    vocabulary, add vectors for other significant terms (see the comment
    below), and reduce the result to `k` dimensions.
    """
    label_intersection = set(frames[0].index)
    for frame in frames[1:]:
        label_intersection &= set(frame.index)
    filtered_labels = pd.Series([
        label for label in sorted(label_intersection)
        if '_' not in label and get_language(label) in CORE_LANGUAGES
    ])
    # Keep the original frames around for the vocabulary search below;
    # `intersected` holds each frame restricted to the shared, filtered
    # vocabulary.
    intersected = [frame.loc[filtered_labels].astype('f') for frame in frames]
    joined = pd.concat(intersected, join='inner', axis=1, ignore_index=True)
    joined = joined.fillna(0.)
    adjusted = l2_normalize_rows(joined.iloc[::subsample] - joined.mean(0))

    # Search the frames for significant terms that we've missed.
    # Significant terms are those that appear in 3 different vocabularies,
    # or in 2 different vocabularies and in the first `vocab_cutoff` rows of
    # one of them.
    vocabulary = frames[0].index
    for frame in frames[1:]:
        vocabulary = vocabulary.union(frame.index)
    term_scores = pd.Series(0, index=vocabulary)
    for frame in frames[1:]:
        term_scores.loc[frame.index] += 1
        term_scores.loc[frame.index[:vocab_cutoff]] += 1
    new_terms = vocabulary[term_scores >= 3].difference(joined.index)
    new_vecs = [frame.reindex(new_terms) for frame in frames]
    joined2 = pd.concat([
        joined,
        pd.concat(new_vecs, join='outer', axis=1,
                  ignore_index=True).astype('f').fillna(0.)
    ])
    del new_vecs

    projected, eigenvalues, projection = dataframe_svd_projection(adjusted, k)
    del adjusted
    del projected

    reprojected = joined2.dot(projection)
    reprojected /= (eigenvalues ** .5)
    del joined2

    reprojected = l2_normalize_rows(reprojected, offset=1e-6)
    reprojected.sort_index(inplace=True)
    return reprojected, projection
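
# A toy, self-contained sketch of the "significant terms" scoring used above:
# a term gets one point for each frame after the first whose vocabulary
# contains it, plus one more point if it sits in that frame's first
# `vocab_cutoff` rows; terms scoring at least 3 join the merged vocabulary.
# The tiny frames and the cutoff below are made-up data for illustration only.
import pandas as pd

toy_frames = [
    pd.DataFrame(0., index=['/c/en/a', '/c/en/b'], columns=range(2)),
    pd.DataFrame(0., index=['/c/en/b', '/c/en/c'], columns=range(2)),
    pd.DataFrame(0., index=['/c/en/c', '/c/en/d'], columns=range(2)),
]
toy_cutoff = 1
toy_vocab = toy_frames[0].index
for toy_frame in toy_frames[1:]:
    toy_vocab = toy_vocab.union(toy_frame.index)
toy_scores = pd.Series(0, index=toy_vocab)
for toy_frame in toy_frames[1:]:
    toy_scores.loc[toy_frame.index] += 1
    toy_scores.loc[toy_frame.index[:toy_cutoff]] += 1
print(toy_scores.sort_values(ascending=False))
# '/c/en/c' scores 3: it appears in two later frames and leads one of them.
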
def ld_node(uri, label=None):
    """
    Convert a ConceptNet URI into a dictionary suitable for Linked Data.
    """
    if label is None:
        label = uri_to_label(uri)
    ld = {'@id': uri, 'label': label}
    if uri.startswith('/c/'):
        pieces = split_uri(uri)
        ld['language'] = get_language(uri)
        if len(pieces) > 3:
            ld['sense_label'] = '/'.join(pieces[3:])
        ld['term'] = uri_prefix(uri)
    elif uri.startswith('http'):
        domain = urlparse(uri).netloc
        ld['site'] = domain
        ld['term'] = uri
    return ld
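
# A hedged usage note for ld_node: for a made-up ConceptNet term URI such as
# '/c/en/example/n/wn/communication', the result carries '@id', 'label',
# 'language' ('en'), 'sense_label' ('n/wn/communication'), and 'term'
# ('/c/en/example'). For an external URL, 'site' is just the network location
# of the URL, which can be checked with the standard library alone:
from urllib.parse import urlparse

assert urlparse('http://dbpedia.org/resource/Example').netloc == 'dbpedia.org'
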
def build_features_from_conceptnet_table(filename):
    """
    Read a tab-separated table of ConceptNet edges and build a sparse matrix
    of (concept, feature) associations, where a feature pairs a term with a
    relation and a direction marker ('~' for symmetric relations, '-' for the
    missing side of a directed relation). Returns the matrix in CSR form,
    plus Pandas indexes for its concept rows and feature columns.
    """
    mat = SparseMatrixBuilder()
    concept_labels = OrderedSet()
    feature_labels = OrderedSet()

    with open(str(filename), encoding='utf-8') as infile:
        for line in infile:
            concept1, concept2, value_str, dataset, relation = line.strip().split('\t')
            concept1 = replace_numbers(concept1)
            concept2 = replace_numbers(concept2)
            value = float(value_str)

            feature_pairs = []
            if relation in SYMMETRIC_RELATIONS:
                if get_language(concept1) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('{} {} ~'.format(uri_prefix(concept1), relation), concept2))
                if get_language(concept2) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('{} {} ~'.format(uri_prefix(concept2), relation), concept1))
            else:
                if get_language(concept1) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('{} {} -'.format(uri_prefix(concept1), relation), concept2))
                if get_language(concept2) in CORE_LANGUAGES:
                    feature_pairs.append(
                        ('- {} {}'.format(uri_prefix(concept2), relation), concept1))

            feature_counts = defaultdict(int)
            for feature, concept in feature_pairs:
                feature_counts[feature] += 1

            for feature, concept in feature_pairs:
                prefixes = list(uri_prefixes(concept, 3))
                if feature_counts[feature] > 1:
                    for prefix in prefixes:
                        concept_index = concept_labels.add(prefix)
                        feature_index = feature_labels.add(feature)
                        mat[concept_index, feature_index] = value

    # Link nodes to their more general versions
    for concept in concept_labels:
        prefixes = list(uri_prefixes(concept, 3))
        for prefix in prefixes:
            auto_features = [
                '{} {} ~'.format(prefix, 'SimilarTo'),
                '{} {} ~'.format(prefix, 'RelatedTo'),
                '{} {} -'.format(prefix, 'FormOf'),
                '- {} {}'.format(prefix, 'FormOf'),
            ]
            for feature in auto_features:
                concept_index = concept_labels.add(prefix)
                feature_index = feature_labels.add(feature)
                mat[concept_index, feature_index] = value

    shape = (len(concept_labels), len(feature_labels))
    c_index = pd.Index(concept_labels)
    f_index = pd.Index(feature_labels)
    return mat.tocsr(shape), c_index, f_index
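
# A self-contained illustration of the feature strings built above: symmetric
# relations yield an order-insensitive "~" feature, while directed relations
# distinguish the left-hand ("... -") and right-hand ("- ...") positions. The
# URIs are made-up examples, and the '/r/...' relation form is an assumption
# about the input table.
print('{} {} ~'.format('/c/en/cat', '/r/RelatedTo'))  # /c/en/cat /r/RelatedTo ~
print('{} {} -'.format('/c/en/cat', '/r/IsA'))        # /c/en/cat /r/IsA -
print('- {} {}'.format('/c/en/animal', '/r/IsA'))     # - /c/en/animal /r/IsA
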
def retrofit(row_labels, dense_frame, sparse_csr, iterations=5, verbosity=0):
    """
    Retrofitting is a process of combining information from a machine-learned
    space of term vectors with further structured information about those
    terms. It was originally presented in this 2015 NAACL paper by Manaal
    Faruqui, Jesse Dodge, Sujay Jauhar, Chris Dyer, Eduard Hovy, and Noah
    Smith, "Retrofitting Word Vectors to Semantic Lexicons":

        https://www.cs.cmu.edu/~hovy/papers/15HLT-retrofitting-word-vectors.pdf

    This function implements a variant that I've been calling "wide
    retrofitting", which extends the process to learn vectors for terms that
    were outside the original space.

    `row_labels` is the list of terms that we want to have vectors for.

    `dense_frame` is a DataFrame assigning vectors to some of these terms.

    `sparse_csr` is a SciPy sparse square matrix, whose rows and columns are
    implicitly labeled with `row_labels`. The entries of this matrix are
    positive for terms that we know are related from our structured data.

    (This is an awkward form of input, but unfortunately there is no good way
    to represent sparse labeled data in Pandas.)

    `sharded_retrofit` is responsible for building `row_labels` and
    `sparse_csr` appropriately.
    """
    # Initialize a DataFrame with rows that we know
    retroframe = pd.DataFrame(
        index=row_labels, columns=dense_frame.columns, dtype='f'
    )
    retroframe.update(dense_frame)

    # orig_weights = 1 for known vectors, 0 for unknown vectors
    orig_weights = 1 - retroframe.iloc[:, 0].isnull()
    weight_array = orig_weights.values[:, np.newaxis].astype('f')
    orig_vecs = retroframe.fillna(0).values

    # Divide up the labels by what language they're in -- we'll use this to
    # subtract the mean of each language, reducing clumping by language and
    # improving multilingual alignment.
    rows_by_language = defaultdict(list)
    for i, label in enumerate(row_labels):
        lang = get_language(label)
        rows_by_language[lang].append(i)
    all_languages = sorted(rows_by_language)
    row_groups = [rows_by_language[lang] for lang in all_languages]

    # Subtract the mean so that vectors don't just clump around common
    # hypernyms
    for row_group in row_groups:
        orig_vecs[row_group] -= orig_vecs[row_group].mean(0)

    # Delete the frame we built, we won't need its indices again until the end
    del retroframe

    vecs = orig_vecs
    for iteration in range(iterations):
        if verbosity >= 1:
            print('Retrofitting: Iteration %s of %s' % (iteration + 1, iterations))

        vecs = sparse_csr.dot(vecs)
        # Re-center each language's rows in the updated vectors
        for row_group in row_groups:
            vecs[row_group] -= vecs[row_group].mean(0)

        # use sklearn's normalize, because it normalizes in place and
        # leaves zero-rows at 0
        normalize(vecs, norm='l2', copy=False)

        # Average known rows with original vectors
        vecs += orig_vecs
        vecs /= (weight_array + 1.)

    retroframe = pd.DataFrame(data=vecs, index=row_labels, columns=dense_frame.columns)
    return retroframe
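
# A minimal, self-contained sketch of the retrofitting update loop above, on
# made-up data: spread vectors along the sparse adjacency matrix, normalize,
# then pull rows that had known vectors back toward their originals
# (weight_array is 1 for known rows, 0 for unknown ones). The per-language
# mean-centering step is omitted here for brevity.
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize

orig_vecs = np.array([[1., 0.], [0., 1.], [0., 0.]], dtype='f')  # third row unknown
weight_array = np.array([[1.], [1.], [0.]], dtype='f')
sparse_csr = csr_matrix(np.array([
    [1., 0., 1.],
    [0., 1., 1.],
    [1., 1., 1.],
], dtype='f'))

vecs = orig_vecs
for _ in range(5):
    vecs = sparse_csr.dot(vecs)
    normalize(vecs, norm='l2', copy=False)
    vecs += orig_vecs
    vecs /= (weight_array + 1.)
print(vecs)  # the unknown third row now has a vector between its two neighbors
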