Exemplo n.º 1
0
def create_HinSAGE_model(graph, link_prediction=False):

    if link_prediction:
        generator = HinSAGELinkGenerator(graph,
                                         batch_size=2,
                                         num_samples=[2, 1])
        edge_ids_train = np.array([[1, 2], [2, 3], [1, 3]])
        train_gen = generator.flow(edge_ids_train, np.array([1, 1, 0]))
    else:
        generator = HinSAGENodeGenerator(graph,
                                         batch_size=2,
                                         num_samples=[2, 2])
        train_gen = generator.flow([1, 2], np.array([[1, 0], [0, 1]]))

    base_model = HinSAGE(layer_sizes=[8, 8],
                         generator=train_gen,
                         bias=True,
                         dropout=0.5)

    if link_prediction:
        # Define input and output sockets of hinsage:
        x_inp, x_out = base_model.default_model()

        # Final estimator layer
        prediction = link_regression(edge_embedding_method="ip")(x_out)
    else:
        x_inp, x_out = base_model.default_model(flatten_output=True)
        prediction = layers.Dense(units=2, activation="softmax")(x_out)

    keras_model = Model(inputs=x_inp, outputs=prediction)

    return base_model, keras_model, generator, train_gen
    def train(
        self,
        layer_size,
        num_samples,
        train_size=0.7,
        batch_size: int = 200,
        num_epochs: int = 20,
        learning_rate=5e-3,
        dropout=0.0,
        use_bias=True,
    ):
        """
        Build and train the HinSAGE model for link attribute prediction on the specified graph G
        with given parameters.

        Args:
            layer_size: a list of number of hidden nodes in each layer
            num_samples: number of neighbours to sample at each layer
            batch_size: size of mini batch
            num_epochs: number of epochs to train the model (epoch = all training batches are streamed through the model once)
            learning_rate: initial learning rate
            dropout: dropout probability in the range [0, 1)
            use_bias: tells whether to use a bias terms in HinSAGE model

        Returns:

        """

        # Training and test edges
        edges = list(self.g.edges(data=True))
        edges_train, edges_test = model_selection.train_test_split(
            edges, train_size=train_size)

        #  Edgelists:
        edgelist_train = [(e[0], e[1]) for e in edges_train]
        edgelist_test = [(e[0], e[1]) for e in edges_test]

        labels_train = [e[2]["score"] for e in edges_train]
        labels_test = [e[2]["score"] for e in edges_test]

        # Our machine learning task of learning user-movie ratings can be framed as a supervised Link Attribute Inference:
        # given a graph of user-movie ratings, we train a model for rating prediction using the ratings edges_train,
        # and evaluate it using the test ratings edges_test. The model also requires the user-movie graph structure.
        # To proceed, we need to create a StellarGraph object from the ingested graph, for training the model:
        # When sampling the GraphSAGE subgraphs, we want to treat user-movie links as undirected
        self.g = sg.StellarGraph(self.g, node_features="feature")

        # Next, we create the link generators for preparing and streaming training and testing data to the model.
        # The mappers essentially sample k-hop subgraphs of G with randomly selected head nodes, as required by
        # the HinSAGE algorithm, and generate minibatches of those samples to be fed to the input layer of the HinSAGE model.
        generator = HinSAGELinkGenerator(
            self.g,
            batch_size,
            num_samples,
        )
        train_gen = generator.flow(edgelist_train, labels_train)
        test_gen = generator.flow(edgelist_test, labels_test)

        # Build the model by stacking a two-layer HinSAGE model and a link regression layer on top.
        assert len(layer_size) == len(
            num_samples
        ), "layer_size and num_samples must be of the same length! Stopping."
        hinsage = HinSAGE(layer_sizes=layer_size,
                          generator=train_gen,
                          bias=use_bias,
                          dropout=dropout)

        # Define input and output sockets of hinsage:
        x_inp, x_out = hinsage.default_model()

        # Final estimator layer
        score_prediction = link_regression(
            edge_embedding_method=args.edge_embedding_method)(x_out)

        # Create Keras model for training
        model = Model(inputs=x_inp, outputs=score_prediction)
        model.compile(
            optimizer=optimizers.Adam(lr=learning_rate),
            loss=losses.mean_squared_error,
            metrics=[root_mean_square_error, metrics.mae],
        )

        # Train model
        print("Training the model for {} epochs with initial learning rate {}".
              format(num_epochs, learning_rate))
        history = model.fit_generator(
            train_gen,
            validation_data=test_gen,
            epochs=num_epochs,
            verbose=2,
            shuffle=True,
            use_multiprocessing=True,
            workers=multiprocessing.cpu_count() // 2,
        )

        # Evaluate and print metrics
        test_metrics = model.evaluate_generator(test_gen)

        print("Test Evaluation:")
        for name, val in zip(model.metrics_names, test_metrics):
            print("\t{}: {:0.4f}".format(name, val))