Example #1
File: grarep.py  Project: netrias/CSRGraph
def grarep_main(src,
                dst,
                weights,
                n_components=2,
                embedder=TruncatedSVD(n_iter=10, random_state=42),
                order=5,
                verbose=True):
    """An implementation of `"GraRep" <https://dl.acm.org/citation.cfm?id=2806512>`
    from the CIKM '15 paper "GraRep: Learning Graph Representations with Global
    Structural Information". The procedure uses sparse truncated SVD to learn
    embeddings for the powers of the PMI matrix computed from powers of the
    normalized adjacency matrix.
    Parameters : 
    ----------------
    n_components (int): 
        Number of individual embedding dimensions.
    order (int): 
        Number of PMI matrix powers.
    embedder : (instance of sklearn API compatible model)
        Should implement the `fit_transform` method: 
            https://scikit-learn.org/stable/glossary.html#term-fit-transform
        The model should also have `n_components` as a parameter
        for number of resulting embedding dimensions. See:
            https://scikit-learn.org/stable/modules/manifold.html#manifold
        If not compatible, set resulting dimensions in the model instance directly

    TODO: add lambda parameter in [0, 1]
            Which is negative sampling ratio

    Returns :
    ---------------
    list[np.array]
    Containing one matrix size (n_nodes, n_components) per order
    """
    embedder.n_components = n_components
    # create transition matrix
    norm_weights = _row_norm(weights, src)
    tranmat = csr_matrix((norm_weights, dst, src))
    target_matrix = tranmat.copy()
    res = []
    if verbose:
        order_range = tqdm.trange(0, order)
    else:
        order_range = range(0, order)
    for _ in order_range:
        # advance one matrix power and embed it with the supplied embedder
        target_matrix = _create_target_matrix(target_matrix, tranmat)
        res.append(embedder.fit_transform(target_matrix))
    return res
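
The two helpers used above, `_row_norm` and `_create_target_matrix`, are internal to the project and not shown here. Below is a minimal sketch of what they plausibly do, assuming CSR-format inputs and a shifted log-PMI construction in the spirit of the GraRep paper; the `_sketch` names are hypothetical, not project code.

import numpy as np

def _row_norm_sketch(weights, indptr):
    # Hypothetical stand-in for `_row_norm`: divide each CSR row's
    # weights by its row sum so outgoing edges form a distribution.
    out = np.asarray(weights, dtype=np.float64).copy()
    for i in range(len(indptr) - 1):
        start, end = indptr[i], indptr[i + 1]
        row_sum = out[start:end].sum()
        if row_sum != 0:
            out[start:end] /= row_sum
    return out

def _create_target_matrix_sketch(current, tranmat):
    # Hypothetical stand-in for `_create_target_matrix`: advance one
    # power of the transition matrix, then keep the positive entries
    # of the shifted log matrix (one common PMI-style choice; the
    # exact shift varies between implementations).
    mat = current.dot(tranmat).tocsr()
    mat.data = np.log(mat.data) - np.log(mat.shape[0])
    mat.data[mat.data < 0] = 0
    mat.eliminate_zeros()
    return mat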
Example #2
File: graph.py  Project: adeliia/CSRGraph
    def normalize(self, return_self=True):
        """
        Normalizes edge weights per node

        For any node in the Graph, the new edges' weights will sum to 1

        return_self : bool
            whether to change the graph's values and return itself
            this lets us call `G.normalize()` directly
        """
        new_weights = _row_norm(self.weights, self.indptr)
        if return_self:
            self.weights = new_weights
            if hasattr(self, 'mat'):
                self.mat = sparse.csr_matrix(
                    (self.weights, self.dst, self.indptr))
            return self
        else:
            return CSRGraph(
                sparse.csr_matrix((new_weights, self.dst, self.indptr)))
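
A sketch of how this method might be used, assuming the `CSRGraph` constructor accepts a scipy CSR matrix as in the `else` branch above; this usage snippet is illustrative, not project code.

import numpy as np
from scipy import sparse

# Build a small weighted graph and normalize it in place.
mat = sparse.csr_matrix(np.array([[0.0, 2.0, 2.0],
                                  [1.0, 0.0, 3.0],
                                  [5.0, 0.0, 0.0]]))
G = CSRGraph(mat)
G.normalize()    # returns self, so calls can be chained
# each node's outgoing weights now sum to 1
row_sums = np.add.reduceat(G.weights, G.indptr[:-1])
print(row_sums)  # -> [1. 1. 1.]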
Example #3
    def normalize(self, return_self=True):
        """
        Normalizes edge weights per node

        For any node in the Graph, the new edges' weights will sum to 1

        return_self : bool
            whether to change the graph's values and return itself
            this lets us call `G.normalize()` directly
        """
        new_weights = _row_norm(self.weights, self.src)
        if return_self:
            self.mat = sparse.csr_matrix((new_weights, self.dst, self.src))
            # Point objects to the correct places
            self.weights = self.mat.data
            self.src = self.mat.indptr
            self.dst = self.mat.indices
            gc.collect()
            return self
        else:
            return csrgraph(sparse.csr_matrix(
                (new_weights, self.dst, self.src)), 
                nodenames=self.names)
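
Unlike Example #2, this variant rebuilds `weights`, `src`, and `dst` as views into the freshly built `self.mat`, so the flat arrays and the scipy matrix cannot drift apart, and `gc.collect()` reclaims the old arrays right away. A quick check of that invariant, assuming `G` is a `csrgraph` instance as above (hypothetical snippet, not project code):

import numpy as np

G.normalize()
# the flat arrays alias the csr_matrix internals after normalization
assert G.weights is G.mat.data
assert G.src is G.mat.indptr
# and every non-empty row of the normalized matrix sums to 1
row_sums = np.asarray(G.mat.sum(axis=1)).ravel()
assert np.allclose(row_sums[row_sums > 0], 1.0)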
Example #4
    def ggvec(self, n_components=2,
              learning_rate=0.05,
              tol="auto",
              max_epoch=500,
              negative_ratio=1.0,
              order=1,
              negative_decay=0.,
              exponent=0.5,
              max_loss=30.,
              tol_samples=100,
              verbose=True):
        """
        GGVec: Fast global first (and higher) order local embeddings.

        This algorithm directly minimizes related nodes' distances.
        It uses a relaxation pass (negative sample) + contraction pass (loss minimization)
        To find stable embeddings based on the minimal dot product of edge weights.

        Parameters:
        -------------
        n_components (int): 
            Number of individual embedding dimensions.
        order : int >= 1
            Meta-level of the embeddings. Improves link prediction performance.
            Setting this higher than 1 ~quadratically slows down algorithm
                Order = 1 directly optimizes the graph.
                Order = 2 optimizes graph plus 2nd order edges
                    (eg. neighbours of neighbours)
                Order = 3 optimizes up to 3rd order edges
            Higher order edges are automatically weighed using GraRep-style graph formation
            Eg. the higher-order graph is from stable high-order random walk distribution.
        negative_ratio : float in [0, 1]
            Negative sampling ratio.
            Setting this higher will do more negative sampling.
            This is slower, but can lead to higher quality embeddings.
        exponent : float
            Weighing exponent in loss function. 
            Having this lower reduces effect of large edge weights.
        tol : float in [0, 1] or "auto"
            Optimization early stopping criterion.
            Stops average loss < tol for tol_samples epochs.
            "auto" sets tol as a function of learning_rate
        tol_samples : int
            Optimization early stopping criterion.
            This is the number of epochs to sample for loss stability.
            Once loss is stable over this number of epochs we stop early.
        negative_decay : float in [0, 1]
            Decay on negative ratio.
            If >0 then negative ratio will decay by (1-negative_decay) ** epoch
            You should usually leave this to 0.
        max_epoch : int
            Stopping criterion.
        max_count : int
            Ceiling value on edge weights for numerical stability
        learning_rate : float in [0, 1]
            Optimization learning rate.
        max_loss : float
            Loss value ceiling for numerical stability.
        """
        if tol == 'auto':
            tol = max(learning_rate / 2, 0.05)
        # Higher order graph embeddings
        # Method inspired by GraRep (powers of transition matrix)
        if order > 1:
            norm_weights = _row_norm(self.weights, self.src)
            tranmat = sparse.csr_matrix((norm_weights, self.dst, self.src))
            target_matrix = tranmat.copy()
            res = np.zeros((self.nnodes, n_components))
            for _ in range(order - 1):
                # advance one power of the transition matrix
                target_matrix = target_matrix.dot(tranmat)
                w = ggvec.ggvec_main(
                    data=target_matrix.data, 
                    src=target_matrix.indptr, 
                    dst=target_matrix.indices,
                    n_components=n_components, tol=tol,
                    tol_samples=tol_samples,
                    max_epoch=max_epoch, learning_rate=learning_rate, 
                    negative_ratio=negative_ratio,
                    negative_decay=negative_decay,
                    exponent=exponent,
                    max_loss=max_loss, verbose=verbose)
                res = np.sum([res, w], axis=0)  # accumulate embeddings across orders
            return res
        else:
            return ggvec.ggvec_main(
                data=self.weights, src=self.src, dst=self.dst,
                n_components=n_components, tol=tol,
                tol_samples=tol_samples,
                max_epoch=max_epoch, learning_rate=learning_rate, 
                negative_ratio=negative_ratio,
                negative_decay=negative_decay,
                exponent=exponent,
                max_loss=max_loss, verbose=verbose)
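
A sketch of calling the method, assuming the `csrgraph` class wraps a scipy CSR matrix as in Example #3; the construction below is illustrative, not project code.

import numpy as np
from scipy import sparse

# Embed a small random graph with GGVec.
adj = sparse.random(100, 100, density=0.05, format='csr')
G = csrgraph(adj)
emb = G.ggvec(n_components=8, order=1, verbose=False)
print(emb.shape)   # (100, 8): one embedding row per node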