Example #1
0
    def fit_predict(self, features, adjacency_matrix):
        """Train a gvnr model on the graph and return its node embeddings.

        The ``features`` argument is accepted for interface compatibility
        with the other wrappers but is not used by this model.

        NOTE(review): ``number_of_walks``, ``batch_size``, ``num_neg`` and
        ``x_min`` are free variables resolved from an enclosing scope —
        confirm they are defined wherever this method is used.
        """
        adjacency_matrix = scipy.sparse.csr_matrix(adjacency_matrix)

        if self.X is not None:
            # A precomputed co-occurrence matrix was supplied; reuse it.
            cooccurrences = self.X
        else:
            # Otherwise build it: random walks over the graph, then a
            # distance-weighted sliding window over those walks.
            walker = gvnr.preprocessing.random_walker.RandomWalker(
                adjacency_matrix,
                walks_length=40,
                walks_number=number_of_walks)
            slider = gvnr.preprocessing.window_slider.WindowSlider(
                walker.build_random_walks(),
                adjacency_matrix.shape[0],
                window_size=5,
                window_factor="decreasing")
            cooccurrences = slider.build_cooccurrence_matrix()

        model = gvnr.models.gvnr.gvnr()
        model.fit(cooccurrences,
                  learn_rate=0.001,
                  embedding_size=80,
                  batch_size=batch_size,
                  n_epochs=2,
                  k_neg=num_neg,
                  x_min=x_min)

        # Sum the two learned embedding matrices ('I' and 'J'), then
        # normalize along axis 0 before returning.
        summed = (model.get_embeddings(embedding='I') +
                  model.get_embeddings(embedding='J'))
        return normalize(summed, axis=0)
Example #2
0
def _load_or_build_cooccurrences(dataset, adjacency_matrix):
    """Return the co-occurrence matrix for ``dataset``, building and caching it on first use.

    The matrix is cached as a package resource ``resources/{dataset}_X.npz``.
    """
    X_file = pkg_resources.resource_filename(
        "gvnr", 'resources/{0}_X.npz'.format(dataset))
    if os.path.isfile(X_file):
        return scipy.sparse.load_npz(X_file)
    # Cache miss: 80 random walks of length 40 per node, then a +/-5
    # distance-weighted window to accumulate co-occurrence counts.
    random_walker = gvnr.preprocessing.random_walker.RandomWalker(
        scipy.sparse.csr_matrix(adjacency_matrix),
        walks_length=40,
        walks_number=80)
    random_walks = random_walker.build_random_walks()
    slider = gvnr.preprocessing.window_slider.WindowSlider(
        random_walks,
        adjacency_matrix.shape[0],
        window_size=5,
        window_factor="decreasing")
    X = slider.build_cooccurrence_matrix()
    scipy.sparse.save_npz(X_file, X)
    return X


def run(baselines, save=False):
    """Evaluate every (dataset, baseline) pair described in ``baselines``.

    Parameters
    ----------
    baselines : dict
        Maps a dataset name to an iterable of baseline names to evaluate.
    save : bool, optional
        When True, learned embeddings are also written to
        ``embeddings/{dataset}_{baseline}_vectors.csv``.

    Raises
    ------
    ValueError
        If a baseline name is not recognized (the original code silently
        left ``model`` as None and crashed later with AttributeError).
    """
    logger.info("Running evaluations with parameters:")
    for d in baselines:
        logger.info(f"{d}: {baselines[d]}")

    for dataset in baselines:
        (binary_vectors, tfidf_vectors, svd_vectors,
         adjacency_matrix, labels, gt_mask) = gvnr.data.datasets.get_dataset(dataset)

        X = _load_or_build_cooccurrences(dataset, adjacency_matrix)

        # Dispatch table: baseline name -> (model factory, feature matrix).
        # Lambdas defer construction of the X-dependent wrappers until the
        # baseline is actually requested.
        setups = {
            "binary": (gvnr.models.wrappers.binary_wrapper, binary_vectors),
            "tfidf": (gvnr.models.wrappers.tfidf_wrapper, tfidf_vectors),
            "svd": (gvnr.models.wrappers.svd_wrapper, svd_vectors),
            "tadw": (gvnr.models.wrappers.tadw_wrapper, svd_vectors),
            "netmf": (gvnr.models.wrappers.netmf_wrapper, None),
            "netmf_small": (gvnr.models.wrappers.netmf_small_wrapper, None),
            "deepwalk": (gvnr.models.wrappers.deepwalk_wrapper, None),
            "deepwalk_svd": (gvnr.models.wrappers.deepwalk_svd_wrapper, svd_vectors),
            "netmf_svd": (gvnr.models.wrappers.netmf_svd_wrapper, svd_vectors),
            "glove": (lambda: gvnr.models.wrappers.glove_wrapper(X=X), binary_vectors),
            "gvnr_no_filter": (lambda: gvnr.models.wrappers.gvnr_no_filter_wrapper(X=X), binary_vectors),
            "gvnr": (lambda: gvnr.models.wrappers.gvnr_wrapper(X=X), binary_vectors),
            "gvnrt": (lambda: gvnr.models.wrappers.gvnrt_wrapper(X=X), binary_vectors),
        }

        for baseline in baselines[dataset]:
            if baseline not in setups:
                raise ValueError(
                    f"Unknown baseline '{baseline}' for dataset '{dataset}'")
            factory, features = setups[baseline]
            model = factory()

            vectors = model.fit_predict(features, adjacency_matrix)
            if save:
                filename = os.path.abspath(
                    os.path.join(
                        os.path.dirname(__file__), os.path.pardir,
                        f"embeddings/{dataset}_{baseline}_vectors.csv"))
                # BUG FIX: the f-string previously contained no placeholder
                # and always logged "(unknown)".
                logger.info(f"Saving embeddings to {filename}...")
                np.savetxt(filename, vectors)

            # BUG FIX: the masked rows were assigned to an unused local
            # ('vector'); apply the groundtruth mask so evaluation uses only
            # labeled nodes (only restrictive for aminer per the dataset docs).
            vectors = vectors[gt_mask]
            scores = None
            if dataset in ["cora", "aminer", "wiki", "citeseer"]:
                scores = gvnr.evaluation.get_score(vectors, labels)
            if dataset in ["flickr", "blogcatalog", "wikipedia", "ppi"]:
                scores = gvnr.evaluation.get_score(vectors,
                                                   labels,
                                                   multilabels=True)
            # BUG FIX: the previous message referenced the undefined name
            # 'proportions' (NameError) and reused placeholder {2} twice.
            logger.info("DATASET: {0}, BASELINE: {1} => f1_micro / f1_macro:".format(
                dataset, baseline))
            logger.info("   ".join([
                "&{0:.1f}".format(s * 100)
                for s in list(scores["f1_micro"]) + list(scores["f1_macro"])
            ]))
Example #3
0
labels:             groundtruth labels for classification
gt_mask:            mask to select nodes that are linked with a label (only useful with aminer)
"""
# Load the "cora" dataset: feature matrices (binary / tf-idf / SVD), the graph
# adjacency matrix, groundtruth labels and the label mask.
binary_vectors, tfidf_vectors, svd_vectors, adjacency_matrix, labels, gt_mask = gvnr.data.datasets.get_dataset(
    "cora")

logger.info("2) PREPROCESSING THE DATA")
"""
We perform random walks on the network. N being the number of nodes, we get N*10 sequences of nodes of lengths 40.
These are stored in a numpy.ndarray  
"""
# Generate 10 random walks of length 40 per node over the (sparse) adjacency
# matrix; the dense matrix is converted to CSR first.
random_walker = gvnr.preprocessing.random_walker.RandomWalker(
    scipy.sparse.csr_matrix(adjacency_matrix),
    walks_length=40,
    walks_number=10)
random_walks = random_walker.build_random_walks()
"""
From these sequences of nodes, we slide a window to increment a matrix of counts of co-occurring nodes. We look 5
context nodes on the left and on the right of a target node. The window_factor is chosen such that co-occurrence counts
are decreasingly incremented given the distance to the target node e.g:
[1/5, 1/4, 1/3, 1/2, 1, target_node, 1, 1/2, 1/3, 1/4, 1/5] 
"""
# Slide a +/-5 window over the walks; counts are weighted decreasingly with
# distance to the target node, producing the co-occurrence matrix X.
slider = gvnr.preprocessing.window_slider.WindowSlider(
    random_walks,
    adjacency_matrix.shape[0],
    window_size=5,
    window_factor="decreasing")
X = slider.build_cooccurrence_matrix()

logger.info("3) TRAINING THE MODEL")
"""