def score(self, X):
    """
    The goal of GloVe is to learn vectors whose dot products are
    proportional to the log co-occurrence probability. This score
    method assesses that directly using the current `self.embedding`.

    Parameters
    ----------
    X : pd.DataFrame or np.array, shape `(n_words, n_words)`
        The original count matrix.

    Returns
    -------
    float
        The Pearson correlation.

    """
    X = self.convert_input_to_array(X)
    G = self.convert_input_to_array(self.embedding)
    mask = X > 0
    M = G.dot(G.T)
    X_log = utils.log_of_array_ignoring_zeros(X)
    row_log_prob = np.log(X.sum(axis=1))
    row_log_prob = np.outer(row_log_prob, np.ones(X.shape[1]))
    prob = X_log - row_log_prob
    return np.corrcoef(prob[mask].ravel(), M[mask].ravel())[0, 1]
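# A worked micro-example (in comments; the values are illustrative) of the
# probability computation in `score` above:
#
#     X = np.array([[4.0, 2.0],
#                   [2.0, 1.0]])
#     # X_log - row_log_prob gives, for row 0:
#     #   [log 4 - log 6, log 2 - log 6] = [log P(0|0), log P(1|0)]
#
# That is, `prob[i, j]` is the log conditional co-occurrence probability
# log P(j | i), and the returned Pearson r measures how linearly related
# the dot products in `G.dot(G.T)` are to those log probabilities.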
def fit(self, X):
    """
    Prepares `X` to permit learning against the GloVe objective, and
    then uses the superclass `fit` method to train the model
    parameters. Unlike the supervised models in this repository, this
    method returns the learned embedding (W + C) rather than `self`,
    so that it acts like a model that transforms a vector space (see
    also the autoencoder models).

    Parameters
    ----------
    X : np.array, shape `(n_words, n_words)`
        This should be a square matrix of possibly scaled
        co-occurrence counts.

    Attributes
    ----------
    self.embedding : np.array, shape `(n_words, embed_dim)`
        The same matrix that is returned by the method.

    Returns
    -------
    embedding : np.array, shape `(n_words, embed_dim)`
        The same matrix that is stored as `self.embedding`.

    """
    X_vals = self.convert_input_to_array(X)
    self.n_words = len(X_vals)
    # This applies the function
    #
    #   f(x) = (x/self.xmax)**self.alpha if x < self.xmax, else 1.0
    #
    # to the full count matrix:
    bounded = np.minimum(X_vals, self.xmax)
    weights = (bounded / self.xmax)**self.alpha
    # Precompute log X[i, j] for all i, j:
    X_log = utils.log_of_array_ignoring_zeros(X_vals)
    super().fit(X_log, weights)
    # Per the advice in the paper, use the sum of the word and
    # context embeddings:
    embedding = self.model.W + self.model.C
    embedding = embedding.detach().cpu().numpy()
    # If the input was a `pd.DataFrame`, return one as well:
    self.embedding = self.convert_output(embedding, X)
    return self.embedding
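# A minimal usage sketch for the `fit`/`score` pair above, assuming the
# enclosing class is constructed like the other models in this repository.
# The class name `TorchGloVe` and the keyword argument shown are
# assumptions, not guaranteed by this file:
#
#     rng = np.random.RandomState(42)
#     counts = rng.poisson(lam=2.0, size=(50, 50)).astype(float)
#     counts += counts.T               # co-occurrence counts are symmetric
#
#     mod = TorchGloVe(embed_dim=10)   # hypothetical constructor
#     embedding = mod.fit(counts)      # shape (50, 10)
#     r = mod.score(counts)            # Pearson r against log P(j|i)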
def test_log_of_array_ignoring_zeros(arg, expected):
    result = utils.log_of_array_ignoring_zeros(arg)
    assert np.array_equal(result, expected)
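# For reference, a minimal sketch of the helper under test, on the
# assumption that "ignoring zeros" means the elementwise log is applied
# only to strictly positive entries, with zeros passed through unchanged
# (consistent with the `X[i, j] > 0` masking in the GloVe code below).
# This sketch is illustrative, not the repository's implementation:

def _log_of_array_ignoring_zeros_sketch(M):
    """Elementwise log of `M`, leaving zero entries as 0."""
    out = np.array(M, dtype=float)
    mask = out > 0
    out[mask] = np.log(out[mask])
    return out

# Under that assumption, a hypothetical parametrized case for the test:
#
#     @pytest.mark.parametrize("arg, expected", [
#         (np.array([[1.0, 0.0], [np.e, 4.0]]),
#          np.array([[0.0, 0.0], [1.0, np.log(4.0)]])),
#     ])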
def fit(self, df):
    """
    Learn the GloVe matrix.

    Parameters
    ----------
    df : pd.DataFrame or np.array, shape `(n_vocab, n_vocab)`
        This should be a matrix of (possibly scaled) co-occurrence
        counts.

    Returns
    -------
    pd.DataFrame or np.array, shape `(n_vocab, self.n)`
        The type will be the same as the user's `df`. If it's a
        `pd.DataFrame`, the index will be the same as `df.index`.

    """
    X = self.convert_input_to_array(df)
    m = X.shape[0]
    # Parameters:
    W = utils.randmatrix(m, self.n)  # Word weights.
    C = utils.randmatrix(m, self.n)  # Context weights.
    B = utils.randmatrix(2, m)       # Word and context biases.
    # Precomputable GloVe values:
    X_log = utils.log_of_array_ignoring_zeros(X)
    X_weights = (np.minimum(X, self.xmax) / self.xmax)**self.alpha  # eq. (9)
    # Learning:
    indices = list(range(m))
    for iteration in range(self.max_iter):
        epoch_error = 0.0
        random.shuffle(indices)
        for i, j in itertools.product(indices, indices):
            if X[i, j] > 0.0:
                weight = X_weights[i, j]
                # Cost is J' based on eq. (8) in the paper:
                diff = W[i].dot(C[j]) + B[0, i] + B[1, j] - X_log[i, j]
                fdiff = diff * weight
                # Gradients:
                wgrad = fdiff * C[j]
                cgrad = fdiff * W[i]
                wbgrad = fdiff
                wcgrad = fdiff
                # Updates:
                W[i] -= self.eta * wgrad
                C[j] -= self.eta * cgrad
                B[0, i] -= self.eta * wbgrad
                B[1, j] -= self.eta * wcgrad
                # One-half squared error term:
                epoch_error += 0.5 * weight * (diff**2)
        epoch_error /= m
        if epoch_error <= self.tol:
            utils.progress_bar(
                "Converged on iteration {} with error {}".format(
                    iteration, epoch_error),
                self.display_progress)
            break
        utils.progress_bar(
            "Finished epoch {} of {}; error is {}".format(
                iteration, self.max_iter, epoch_error),
            self.display_progress)
    # Return the sum of the word and context matrices, per the advice
    # in section 4.2:
    G = W + C
    self.embedding = self.convert_output(G, df)
    return self.embedding
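# Quick numeric illustration of the eq. (9) weighting computed above,
# using the common defaults xmax=100 and alpha=0.75 (values rounded):
#
#     count x:                      1        10       100      1000
#     (min(x, xmax)/xmax)**alpha:   0.0316   0.178    1.0      1.0
#
# Rare pairs are strongly down-weighted, and everything at or above
# `xmax` contributes with weight 1.0.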
import itertools
import random
import sys

import numpy as np
import pandas as pd

import utils


def glove(df, n=100, xmax=100, alpha=0.75, max_iter=100, eta=0.05,
          tol=1e-4, display_progress=True):
    """Basic GloVe. This is mainly here as a reference implementation.
    We recommend using `mittens.GloVe` instead.

    Parameters
    ----------
    df : pd.DataFrame or np.array
        This must be a square matrix.
    n : int (default: 100)
        The dimensionality of the output vectors.
    xmax : int (default: 100)
        Words with frequency greater than this are given weight 1.0.
        Words with frequency under this are given weight
        (c/xmax)**alpha, where c is their count in `df` (see the
        paper, eq. (9)).
    alpha : float (default: 0.75)
        Exponent in the weighting function (see the paper, eq. (9)).
    max_iter : int (default: 100)
        Number of training epochs.
    eta : float (default: 0.05)
        Controls the rate of SGD weight updates.
    tol : float (default: 1e-4)
        Stopping criterion for the loss.
    display_progress : bool (default: True)
        Whether to print iteration number and current error to stderr.

    Returns
    -------
    pd.DataFrame or np.array, shape `(df.shape[0], n)`
        The type will be the same as the user's `df`.

    """
    X = df.values if isinstance(df, pd.DataFrame) else df
    m = X.shape[0]
    # Parameters:
    W = utils.randmatrix(m, n)  # Word weights.
    C = utils.randmatrix(m, n)  # Context weights.
    B = utils.randmatrix(2, m)  # Word and context biases.
    # Precomputable GloVe values:
    X_log = utils.log_of_array_ignoring_zeros(X)
    X_weights = (np.minimum(X, xmax) / xmax)**alpha  # eq. (9)
    # Learning:
    indices = list(range(m))
    for iteration in range(max_iter):
        error = 0.0
        random.shuffle(indices)
        for i, j in itertools.product(indices, indices):
            if X[i, j] > 0.0:
                weight = X_weights[i, j]
                # Cost is J' based on eq. (8) in the paper:
                diff = W[i].dot(C[j]) + B[0, i] + B[1, j] - X_log[i, j]
                fdiff = diff * weight
                # Gradients:
                wgrad = fdiff * C[j]
                cgrad = fdiff * W[i]
                wbgrad = fdiff
                wcgrad = fdiff
                # Updates:
                W[i] -= eta * wgrad
                C[j] -= eta * cgrad
                B[0, i] -= eta * wbgrad
                B[1, j] -= eta * wcgrad
                # One-half squared error term:
                error += 0.5 * weight * (diff**2)
        error /= m
        if error < tol:
            if display_progress:
                utils.progress_bar(
                    "Stopping at iteration {} with error {}".format(
                        iteration, error))
            break
        elif display_progress:
            utils.progress_bar(
                "Iteration {}: error {}".format(iteration, error))
    if display_progress:
        sys.stderr.write('\n')
    # Return the sum of the word and context matrices, per the advice
    # in section 4.2:
    G = W + C
    if isinstance(df, pd.DataFrame):
        G = pd.DataFrame(G, index=df.index)
    return G
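# Example use of `glove` on a small synthetic count matrix. This is a
# sketch: the dimensions and hyperparameters are illustrative only.
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    counts = rng.poisson(lam=1.0, size=(20, 20)).astype(float)
    counts += counts.T  # co-occurrence counts are symmetric
    embedding = glove(counts, n=5, max_iter=10, display_progress=False)
    print(embedding.shape)  # (20, 5)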