Example #1
    def score(self, X):
        """
        The goal of GloVe is to learn vectors whose dot products are
        proportional to the log co-occurrence probability. This score
        method assesses that directly using the current `self.embedding`.

        Parameters
        ----------
        X : pd.DataFrame or np.array, shape `(self.n_words, self.n_words)`
            The original count matrix, which must be square.

        Returns
        -------
        float
            The Pearson correlation between the embedding dot products
            and the log co-occurrence probabilities, computed over the
            nonzero cells of `X`.

        """
        X = self.convert_input_to_array(X)
        G = self.convert_input_to_array(self.embedding)
        mask = X > 0
        M = G.dot(G.T)
        X_log = utils.log_of_array_ignoring_zeros(X)
        row_log_prob = np.log(X.sum(axis=1))
        row_log_prob = np.outer(row_log_prob, np.ones(X.shape[1]))
        prob = X_log - row_log_prob
        return np.corrcoef(prob[mask].ravel(), M[mask].ravel())[0, 1]
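For reference, the same check is easy to replicate outside the class. Below is a minimal standalone sketch on a toy matrix; the masked-log line is an assumption about what `utils.log_of_array_ignoring_zeros` computes (elementwise log, with zeros left at 0):

import numpy as np

# Toy co-occurrence counts and a random stand-in "embedding":
X = np.array([[10., 2., 0.],
              [ 2., 6., 1.],
              [ 0., 1., 4.]])
G = np.random.RandomState(0).normal(size=(3, 2))

mask = X > 0
M = G.dot(G.T)
# Assumed behavior of utils.log_of_array_ignoring_zeros:
X_log = np.where(mask, np.log(np.where(mask, X, 1.0)), 0.0)
row_log_prob = np.outer(np.log(X.sum(axis=1)), np.ones(X.shape[1]))
prob = X_log - row_log_prob
print(np.corrcoef(prob[mask].ravel(), M[mask].ravel())[0, 1])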
Example #2
    def fit(self, X):
        """
        Prepares `X` to permit learning against the GloVe objective,
        and then uses the superclass `fit` method to train the model
        parameters. Unlike the supervised models in this repository,
        this method returns the learned embedding (W + C) rather than
        `self`, so that it acts like a model that transforms a vector
        space (see also the autoencoder models).

        Parameters
        ----------
        X : np.array, shape `(n_words, n_words)`
            This should be a square matrix of (possibly scaled)
            co-occurrence counts.

        Attributes
        ----------
        self.embedding : np.array, shape `(n_words, embed_dim)`
            The same matrix that is returned by the method.

        Returns
        -------
        embedding : np.array, shape `(n_words, embed_dim)`
            The same matrix that is stored as `self.embedding`.

        """
        X_vals = self.convert_input_to_array(X)
        self.n_words = len(X_vals)
        # This applies the function
        #
        #  f(x) = (x/self.xmax)**self.alpha if x < self.xmax, else 1.0
        #
        # to the full count matrix:
        bounded = np.minimum(X_vals, self.xmax)
        weights = (bounded / self.xmax)**self.alpha
        # Precompute log X[i, j] for all i, j:
        X_log = utils.log_of_array_ignoring_zeros(X_vals)
        super().fit(X_log, weights)
        # Per the advice in the paper, use the sum of the word and
        # context embeddings:
        embedding = self.model.W + self.model.C
        embedding = embedding.detach().cpu().numpy()
        # If the input was a `pd.DataFrame`, return one as well:
        self.embedding = self.convert_output(embedding, X)
        return self.embedding
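The weighting function described in the comment above is simple enough to sanity-check in isolation. A quick sketch (`glove_weight` is a hypothetical helper name, not part of the class):

import numpy as np

def glove_weight(x, xmax=100, alpha=0.75):
    # Hypothetical helper: f(x) = (x / xmax)**alpha for x < xmax, else 1.0,
    # computed the same way as in `fit`: clip at xmax, then raise to alpha.
    return (np.minimum(x, xmax) / xmax)**alpha

print(glove_weight(np.array([1., 10., 100., 1000.])))
# [0.03162278 0.17782794 1.         1.        ]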
Example #3
def test_log_of_array_ignoring_zeros(arg, expected):
    # `arg` and `expected` come from a pytest parametrization not shown here.
    result = utils.log_of_array_ignoring_zeros(arg)
    # Assert rather than return: pytest ignores a returned boolean, so the
    # original `return` could never fail the test.
    assert np.array_equal(result, expected)
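The helper under test is used throughout these examples, and its behavior can be inferred from how the GloVe code calls it: log of the nonzero counts, zeros ignored. A sketch of a plausible implementation, not the repository's actual `utils` source:

import numpy as np

def log_of_array_ignoring_zeros(M):
    # Plausible reading: elementwise natural log of the nonzero entries,
    # with zero entries left at 0, so unobserved co-occurrences drop out
    # of the objective.
    mask = M > 0
    out = np.zeros_like(M, dtype=float)
    out[mask] = np.log(M[mask])
    return out

print(log_of_array_ignoring_zeros(np.array([[1., 0.], [np.e, 0.]])))
# [[0. 0.]
#  [1. 0.]]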
Example #4
    def fit(self, df):
        """
        Learn the GloVe matrix.

        Parameters
        ----------
        df : pd.DataFrame or np.array, shape `(n_vocab, n_vocab)`
            This should be a square matrix of (possibly scaled)
            co-occurrence counts.

        Returns
        -------
        pd.DataFrame or np.array, shape `(n_vocab, self.n)`
           The type will be the same as the user's `df`. If it's a
           `pd.DataFrame`, the index will be the same as `df.index`.

        """
        X = self.convert_input_to_array(df)
        m = X.shape[0]
        # Parameters:
        W = utils.randmatrix(m, self.n)  # Word weights.
        C = utils.randmatrix(m, self.n)  # Context weights.
        B = utils.randmatrix(2, m)  # Word and context biases.
        # Precomputable GloVe values:
        X_log = utils.log_of_array_ignoring_zeros(X)
        X_weights = (np.minimum(X, self.xmax) /
                     self.xmax)**self.alpha  # eq. (9)
        # Learning:
        indices = list(range(m))
        for iteration in range(self.max_iter):
            epoch_error = 0.0
            random.shuffle(indices)
            for i, j in itertools.product(indices, indices):
                if X[i, j] > 0.0:
                    weight = X_weights[i, j]
                    # Cost is J' based on eq. (8) in the paper:
                    diff = W[i].dot(C[j]) + B[0, i] + B[1, j] - X_log[i, j]
                    fdiff = diff * weight
                    # Gradients:
                    wgrad = fdiff * C[j]
                    cgrad = fdiff * W[i]
                    wbgrad = fdiff
                    wcgrad = fdiff
                    # Updates:
                    W[i] -= self.eta * wgrad
                    C[j] -= self.eta * cgrad
                    B[0, i] -= self.eta * wbgrad
                    B[1, j] -= self.eta * wcgrad
                    # One-half squared error term:
                    epoch_error += 0.5 * weight * (diff**2)

            epoch_error /= m

            if epoch_error <= self.tol:
                utils.progress_bar(
                    "Converged on iteration {} with error {}".format(
                        iteration, epoch_error),
                    self.display_progress)
                break

            utils.progress_bar(
                "Finished epoch {} of {}; error is {}".format(
                    iteration, self.max_iter, epoch_error),
                self.display_progress)

        # Return the sum of the word and context matrices, per the advice
        # in section 4.2:
        G = W + C
        self.embedding = self.convert_output(G, df)
        return self.embedding
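The gradients in the inner loop follow from differentiating the weighted half squared error with respect to each parameter. A self-contained sketch that reproduces one (i, j) update's cost and confirms the W gradient numerically (all values here are made up):

import numpy as np

# Made-up values for a single (i, j) pair:
w, c = np.array([0.1, -0.2]), np.array([0.3, 0.05])
b_w, b_c = 0.01, -0.02
x_log, weight = np.log(12.0), 0.5

diff = w.dot(c) + b_w + b_c - x_log
cost = 0.5 * weight * diff**2
# d(cost)/dw = weight * diff * c, matching `wgrad = fdiff * C[j]` above:
wgrad = weight * diff * c

# Finite-difference check of the first gradient component:
eps = 1e-6
diff_eps = (w + np.array([eps, 0.0])).dot(c) + b_w + b_c - x_log
print(wgrad[0], (0.5 * weight * diff_eps**2 - cost) / eps)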
Example #5
import itertools
import random
import sys

import numpy as np
import pandas as pd

import utils  # local helper module providing randmatrix, progress_bar, etc.


def glove(df,
          n=100,
          xmax=100,
          alpha=0.75,
          max_iter=100,
          eta=0.05,
          tol=1e-4,
          display_progress=True):
    """Basic GloVe. This is mainly here as a reference implementation.
    We recommend using `mittens.GloVe` instead.

    Parameters
    ----------
    df : pd.DataFrame or np.array
        This must be a square matrix.
    n : int (default: 100)
        The dimensionality of the output vectors.
    xmax : int (default: 100)
        Words with frequency greater than this are given weight 1.0.
        Words with frequency under this are given weight (c/xmax)**alpha,
        where c is their count in `df` (see the paper, eq. (9)).
    alpha : float (default: 0.75)
        Exponent in the weighting function (see the paper, eq. (9)).
    max_iter : int (default: 100)
        Number of training epochs.
    eta : float (default: 0.05)
        Controls the rate of SGD weight updates.
    tol : float (default: 1e-4)
        Stopping criterion for the loss.
    display_progress : bool (default: True)
        Whether to print iteration number and current error to stdout.

    Returns
    -------
    pd.DataFrame or np.array, shape `(df.shape[0], n)`
        The type matches the user's `df`. If it is a `pd.DataFrame`,
        the index will be the same as `df.index`.
    """
    X = df.values if isinstance(df, pd.DataFrame) else df
    m = X.shape[0]
    # Parameters:
    W = utils.randmatrix(m, n)  # Word weights.
    C = utils.randmatrix(m, n)  # Context weights.
    B = utils.randmatrix(2, m)  # Word and context biases.
    # Precomputable GloVe values:
    X_log = utils.log_of_array_ignoring_zeros(X)
    X_weights = (np.minimum(X, xmax) / xmax)**alpha  # eq. (9)
    # Learning:
    indices = list(range(m))
    for iteration in range(max_iter):
        error = 0.0
        random.shuffle(indices)
        for i, j in itertools.product(indices, indices):
            if X[i, j] > 0.0:
                weight = X_weights[i, j]
                # Cost is J' based on eq. (8) in the paper:
                diff = W[i].dot(C[j]) + B[0, i] + B[1, j] - X_log[i, j]
                fdiff = diff * weight
                # Gradients:
                wgrad = fdiff * C[j]
                cgrad = fdiff * W[i]
                wbgrad = fdiff
                wcgrad = fdiff
                # Updates:
                W[i] -= eta * wgrad
                C[j] -= eta * cgrad
                B[0, i] -= eta * wbgrad
                B[1, j] -= eta * wcgrad
                # One-half squared error term:
                error += 0.5 * weight * (diff**2)
        error /= m
        if error < tol:
            if display_progress:
                utils.progress_bar("Stopping at iteration {} with "
                                   "error {}".format(iteration, error))
            break
        elif display_progress:
            utils.progress_bar("Iteration {}: error {}".format(
                iteration, error))
    if display_progress:
        sys.stderr.write('\n')
    # Return the sum of the word and context matrices, per the advice
    # in section 4.2:
    G = W + C
    if isinstance(df, pd.DataFrame):
        G = pd.DataFrame(G, index=df.index)
    return G
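A quick usage sketch, assuming `glove` and its `utils` dependency are importable as above (the hyperparameters are illustrative, not tuned):

import numpy as np

rng = np.random.RandomState(42)
counts = rng.poisson(3.0, size=(20, 20)).astype(float)
counts = counts + counts.T  # co-occurrence matrices are symmetric

G = glove(counts, n=5, max_iter=10, display_progress=False)
print(G.shape)  # (20, 5)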