Example #1
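This snippet assumes StellarGraph's HinSAGE components and TensorFlow's Keras are already in scope; a minimal import preamble consistent with the calls it makes (assuming StellarGraph 1.x, where build() is the older spelling of in_out_tensors()):

import numpy as np
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from stellargraph.layer import HinSAGE, link_regression
from stellargraph.mapper import HinSAGELinkGenerator, HinSAGENodeGenerator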
def create_HinSAGE_model(graph, link_prediction=False):

    if link_prediction:
        generator = HinSAGELinkGenerator(graph,
                                         batch_size=2,
                                         num_samples=[2, 1])
        edge_ids_train = np.array([[1, 2], [2, 3], [1, 3]])
        train_gen = generator.flow(edge_ids_train, np.array([1, 1, 0]))
    else:
        generator = HinSAGENodeGenerator(graph,
                                         batch_size=2,
                                         num_samples=[2, 2])
        train_gen = generator.flow([1, 2], np.array([[1, 0], [0, 1]]))

    base_model = HinSAGE(layer_sizes=[8, 8],
                         generator=train_gen,
                         bias=True,
                         dropout=0.5)

    # Define input and output sockets of hinsage:
    x_inp, x_out = base_model.build()

    # Final estimator layer
    if link_prediction:
        prediction = link_regression(edge_embedding_method="ip")(x_out)
    else:
        prediction = layers.Dense(units=2, activation="softmax")(x_out)

    keras_model = Model(inputs=x_inp, outputs=prediction)

    return base_model, keras_model, generator, train_gen
Example #2
    def train_hinsage(self, S, node_identifiers, label, batch_size, epochs):

        """
        
        This function trains a HinSAGE model, implemented in Tensorflow.
        It returns the trained HinSAGE model and a pandas datarame containing the embeddings generated for the train nodes.
        
        Parameters
        ----------
        S : StellarGraph Object
            The graph on which HinSAGE trains its aggregator functions.
        node_identifiers : list
            Defines the nodes that HinSAGE uses to train its aggregation functions.
        label: Pandas dataframe
            Defines the label of the nodes used for training, with the index representing the nodes.
        batch_size: int
            batch size to train the neural network in which HinSAGE is implemented.
        epochs: int
            Number of epochs for the neural network.
        
        """
        # The generator feeds data from the sampled subgraphs to the HinSAGE model
        train_node_identifiers = node_identifiers[:round(0.8 * len(node_identifiers))]
        train_labels = label.loc[train_node_identifiers]
        validation_node_identifiers = node_identifiers[round(0.8 * len(node_identifiers)):]
        validation_labels = label.loc[validation_node_identifiers]
        generator = HinSAGENodeGenerator(S, batch_size, self.num_samples,
                                         head_node_type=self.embedding_for_node_type)
        train_gen = generator.flow(train_node_identifiers, train_labels, shuffle=True)
        test_gen = generator.flow(validation_node_identifiers, validation_labels)

        # HinSAGE model
        hinsage = HinSAGE(layer_sizes=[self.embedding_size] * len(self.num_samples),
                          generator=generator, dropout=0)
        x_inp, x_out = hinsage.build()

        # Final estimator layer
        prediction = layers.Dense(units=1, activation="sigmoid", dtype='float32')(x_out)

        # Create Keras model for training
        model = Model(inputs=x_inp, outputs=prediction)
        model.compile(
            optimizer=optimizers.Adam(lr=1e-3),
            loss=binary_crossentropy,
        )

        # Train model
        model.fit(
            train_gen, epochs=epochs, verbose=1, validation_data=test_gen, shuffle=False
        )

        # Recover embeddings for all nodes, in their original order
        trained_model = Model(inputs=x_inp, outputs=x_out)
        train_gen_not_shuffled = generator.flow(node_identifiers, label, shuffle=False)
        embeddings_train = trained_model.predict(train_gen_not_shuffled)

        train_emb = pd.DataFrame(embeddings_train, index=node_identifiers)

        return trained_model, train_emb
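The method above reads self.num_samples, self.embedding_for_node_type, and self.embedding_size from an enclosing class the snippet omits. A minimal sketch of such a wrapper (the class name and constructor are hypothetical):

class HinSAGEEmbedder:  # hypothetical name; the real enclosing class is not shown
    def __init__(self, embedding_size, num_samples, embedding_for_node_type):
        self.embedding_size = embedding_size                     # width of each HinSAGE layer
        self.num_samples = num_samples                           # neighbours sampled per hop, e.g. [8, 4]
        self.embedding_for_node_type = embedding_for_node_type   # head node type to embed

    # train_hinsage (defined above) would live here.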
Example #3
def get_hinsage_model(generator,
                      train_gen,
                      test_gen,
                      num_samples=[8, 4],
                      hinsage_layer_sizes=[32, 32],
                      bias=True,
                      dropout=0.0,
                      lr=1e-2,
                      edge_embedding_method='concat',
                      output_act='sigmoid'):

    assert len(hinsage_layer_sizes) == len(num_samples)

    hinsage = HinSAGE(layer_sizes=hinsage_layer_sizes,
                      generator=generator,
                      bias=bias,
                      dropout=dropout)

    # Expose input and output sockets of hinsage:
    x_inp, x_out = hinsage.in_out_tensors()

    # Final estimator layer (respects the output_act argument)
    score_prediction = link_classification(
        output_dim=1,
        output_act=output_act,
        edge_embedding_method=edge_embedding_method)(x_out)

    def root_mean_square_error(s_true, s_pred):
        return K.sqrt(K.mean(K.pow(s_true - s_pred, 2)))

    def recall_m(y_true, y_pred):
        y_pred = tf.where(y_pred > 0.5, 1.0, 0.0)
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision_m(y_true, y_pred):
        y_pred = tf.where(y_pred > 0.5, 1.0, 0.0)
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision

    def f1_m(y_true, y_pred):
        y_pred = tf.where(y_pred > 0.5, 1.0, 0.0)
        precision = precision_m(y_true, y_pred)
        recall = recall_m(y_true, y_pred)
        return 2 * ((precision * recall) / (precision + recall + K.epsilon()))

    model = Model(inputs=x_inp, outputs=score_prediction)
    model.compile(
        optimizer=optimizers.Adam(lr=lr),
        # loss=losses.mean_squared_error,
        loss=losses.binary_crossentropy,
        metrics=[
            metrics.binary_accuracy,
            metrics.Precision(),
            metrics.Recall()
        ],
        # metrics=[root_mean_square_error, metrics.mae, 'acc'],
    )

    return model
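A hedged usage sketch for get_hinsage_model; the graph, edge ID arrays, labels, and head node types below are placeholders, not part of the original (note that train_gen and test_gen are accepted but unused inside the function, so the caller drives fit/evaluate):

generator = HinSAGELinkGenerator(graph, batch_size=200, num_samples=[8, 4],
                                 head_node_types=["user", "item"])  # placeholder types
train_gen = generator.flow(train_edge_ids, train_labels, shuffle=True)
test_gen = generator.flow(test_edge_ids, test_labels)

model = get_hinsage_model(generator, train_gen, test_gen)
model.fit(train_gen, validation_data=test_gen, epochs=10, verbose=2)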
Example #4
def train(
    G,
    user_targets,
    layer_size,
    num_samples,
    batch_size,
    num_epochs,
    learning_rate,
    dropout,
):
    """
    Train a HinSAGE model on the specified graph G with given parameters.

    Args:
        G: A StellarGraph object ready for machine learning
        user_targets: DataFrame of target labels for the "user" nodes
        layer_size: A list of the number of hidden units in each layer
        num_samples: Number of neighbours to sample at each layer
        batch_size: Size of batch for inference
        num_epochs: Number of epochs to train the model
        learning_rate: Initial learning rate
        dropout: The dropout rate (in [0, 1))
    """
    print(G.info())

    # Split "user" nodes into train/test
    # Split nodes into train/test using stratification.
    train_targets, test_targets = model_selection.train_test_split(
        user_targets, train_size=0.25, test_size=None
    )

    print("Train targets:\n", train_targets.iloc[:, 0].value_counts())
    print("Test targets:\n", test_targets.iloc[:, 0].value_counts())

    # The generator feeds data from the sampled subgraphs to the HinSAGE model
    generator = HinSAGENodeGenerator(G, batch_size, num_samples, head_node_type="user")
    train_gen = generator.flow_from_dataframe(train_targets, shuffle=True)
    test_gen = generator.flow_from_dataframe(test_targets)

    # HinSAGE model
    model = HinSAGE(layer_sizes=layer_size, generator=generator, dropout=dropout)
    x_inp, x_out = model.build()

    # Final estimator layer
    prediction = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)

    # The elite label is only true for a small fraction of the total users,
    # so weight the training loss to ensure that model learns to predict
    # the positive class.
    # class_count = train_targets.values.sum(axis=0)
    # weights = class_count.sum()/class_count
    weights = [0.01, 1.0]
    print("Weighting loss by: {}".format(weights))

    # Create Keras model for training
    model = keras.Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=optimizers.Adam(lr=learning_rate),
        loss=weighted_binary_crossentropy(weights),
        metrics=[metrics.binary_accuracy],
    )

    # Train model
    history = model.fit_generator(
        train_gen, epochs=num_epochs, verbose=2, shuffle=False
    )

    # Evaluate on test set and print metrics
    predictions = model.predict_generator(test_gen)
    binary_predictions = predictions[:, 1] > 0.5
    print("\nTest Set Metrics (on {} nodes)".format(len(predictions)))

    # Calculate metrics using Scikit-Learn
    cm = sk_metrics.confusion_matrix(test_targets.iloc[:, 1], binary_predictions)
    print("Confusion matrix:")
    print(cm)

    accuracy = sk_metrics.accuracy_score(test_targets.iloc[:, 1], binary_predictions)
    precision = sk_metrics.precision_score(test_targets.iloc[:, 1], binary_predictions)
    recall = sk_metrics.recall_score(test_targets.iloc[:, 1], binary_predictions)
    f1 = sk_metrics.f1_score(test_targets.iloc[:, 1], binary_predictions)
    roc_auc = sk_metrics.roc_auc_score(test_targets.iloc[:, 1], binary_predictions)

    print(
        "accuracy = {:0.3}, precision = {:0.3}, recall = {:0.3}, f1 = {:0.3}".format(
            accuracy, precision, recall, f1
        )
    )
    print("ROC AUC = {:0.3}".format(roc_auc))

    # Save model
    save_str = "_n{}_l{}_d{}_r{}".format(
        "_".join([str(x) for x in num_samples]),
        "_".join([str(x) for x in layer_size]),
        dropout,
        learning_rate,
    )
    model.save("yelp_model" + save_str + ".h5")
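weighted_binary_crossentropy is referenced above but not defined in this snippet; a plausible implementation consistent with the two-column targets used here (an assumption, not the original helper):

from tensorflow.keras import backend as K

def weighted_binary_crossentropy(weights):
    """Binary cross-entropy scaled by a fixed per-class weight vector (assumed)."""
    weights = K.constant(weights)

    def loss(y_true, y_pred):
        # Per-class cross-entropy, up-weighting the rare positive class
        return K.mean(K.binary_crossentropy(y_true, y_pred) * weights, axis=-1)

    return loss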
Example #5
    def train(
        self,
        layer_size,
        num_samples,
        train_size=0.7,
        batch_size: int = 200,
        num_epochs: int = 20,
        learning_rate=5e-3,
        dropout=0.0,
        use_bias=True,
    ):
        """
        Build and train the HinSAGE model for link attribute prediction on the specified graph G
        with given parameters.

        Args:
            layer_size: a list of the number of hidden units in each layer
            num_samples: number of neighbours to sample at each layer
            train_size: fraction of edges to use for training
            batch_size: size of each mini-batch
            num_epochs: number of epochs to train the model (one epoch streams all training batches through the model once)
            learning_rate: initial learning rate
            dropout: dropout probability in the range [0, 1)
            use_bias: whether to use bias terms in the HinSAGE model
        """

        # Training and test edges
        edges = list(self.g.edges(data=True))
        edges_train, edges_test = model_selection.train_test_split(
            edges, train_size=train_size)

        #  Edgelists:
        edgelist_train = [(e[0], e[1]) for e in edges_train]
        edgelist_test = [(e[0], e[1]) for e in edges_test]

        labels_train = [e[2]["score"] for e in edges_train]
        labels_test = [e[2]["score"] for e in edges_test]

        # Our machine learning task of learning user-movie ratings can be framed as a supervised Link Attribute Inference:
        # given a graph of user-movie ratings, we train a model for rating prediction using the ratings edges_train,
        # and evaluate it using the test ratings edges_test. The model also requires the user-movie graph structure.
        # To proceed, we need to create a StellarGraph object from the ingested graph, for training the model:
        # When sampling the HinSAGE subgraphs, we want to treat user-movie links as undirected
        self.g = sg.StellarGraph(self.g, node_features="feature")

        # Next, we create the link generators for preparing and streaming training and testing data to the model.
        # The mappers essentially sample k-hop subgraphs of G with randomly selected head nodes, as required by
        # the HinSAGE algorithm, and generate minibatches of those samples to be fed to the input layer of the HinSAGE model.
        generator = HinSAGELinkGenerator(self.g,
                                         batch_size,
                                         num_samples,
                                         head_node_types=["user", "movie"])
        train_gen = generator.flow(edgelist_train, labels_train)
        test_gen = generator.flow(edgelist_test, labels_test)

        # Build the model by stacking a two-layer HinSAGE model and a link regression layer on top.
        assert len(layer_size) == len(
            num_samples
        ), "layer_size and num_samples must be of the same length! Stopping."

        hinsage = HinSAGE(layer_sizes=layer_size,
                          generator=generator,
                          bias=use_bias,
                          dropout=dropout)

        # Define input and output sockets of hinsage:
        x_inp, x_out = hinsage.build()

        # Final estimator layer (`args` is assumed to be a module-level
        # argparse namespace that supplies the edge embedding method)
        score_prediction = link_regression(
            edge_embedding_method=args.edge_embedding_method)(x_out)

        # Create Keras model for training
        model = Model(inputs=x_inp, outputs=score_prediction)
        model.compile(
            optimizer=optimizers.Adam(lr=learning_rate),
            loss=losses.mean_squared_error,
            metrics=[root_mean_square_error, metrics.mae],
        )

        # Train model
        print("Training the model for {} epochs with initial learning rate {}".
              format(num_epochs, learning_rate))
        history = model.fit_generator(
            train_gen,
            validation_data=test_gen,
            epochs=num_epochs,
            verbose=2,
            shuffle=True,
            use_multiprocessing=True,
            workers=multiprocessing.cpu_count() // 2,
        )

        # Evaluate and print metrics
        test_metrics = model.evaluate_generator(test_gen)

        print("Test Evaluation:")
        for name, val in zip(model.metrics_names, test_metrics):
            print("\t{}: {:0.4f}".format(name, val))
Example #6
    def create_embeddings(node_type, num_samples, hinsage_layer_sizes, epochs,
                          patience, batch_size, dropout, activations):

        # Check if num_samples and layer_size are compatible
        assert len(hinsage_layer_sizes) == len(num_samples)

        generator = HinSAGENodeGenerator(G,
                                         batch_size,
                                         num_samples=num_samples,
                                         head_node_type=node_type)

        # HinSAGE layers
        hinsage = HinSAGE(layer_sizes=hinsage_layer_sizes,
                          activations=activations,
                          generator=generator,
                          bias=True,
                          normalize="l2",
                          dropout=dropout)

        def run_deep_graph_infomax(base_model, generator, epochs, node_type):
            corrupted_generator = CorruptedGenerator(generator)
            gen = corrupted_generator.flow(G.nodes(node_type=node_type))
            infomax = DeepGraphInfomax(base_model, corrupted_generator)

            x_in, x_out = infomax.in_out_tensors()

            print("Starting Training")
            ttrain = time.time()
            # Train
            model = Model(inputs=x_in, outputs=x_out)
            model.compile(loss=tf.nn.sigmoid_cross_entropy_with_logits,
                          optimizer=Adam(lr=1e-3))
            es = EarlyStopping(monitor="loss", min_delta=0, patience=patience)

            history = model.fit(gen,
                                epochs=epochs,
                                verbose=verbose,
                                callbacks=[es])
            # sg.utils.plot_history(history)

            ttrain1 = time.time()
            print(
                f"Training complete in {(ttrain1-ttrain):.2f} s ({(ttrain1-ttrain)/60:.2f} min)"
            )

            x_emb_in, x_emb_out = base_model.in_out_tensors()
            # for full batch models, squeeze out the batch dim (which is 1)
            if generator.num_batch_dims() == 2:
                x_emb_out = tf.squeeze(x_emb_out, axis=0)

            return x_emb_in, x_emb_out

        # Run Deep Graph Infomax
        x_emb_in, x_emb_out = run_deep_graph_infomax(hinsage,
                                                     generator,
                                                     epochs=epochs,
                                                     node_type=node_type)

        emb_model = Model(inputs=x_emb_in, outputs=x_emb_out)
        all_embeddings = emb_model.predict(
            generator.flow(G.nodes(node_type=node_type)))

        # TSNE visualization of embeddings
        ttsne = time.time()
        print("Creating TSNE")
        embeddings_2d = pd.DataFrame(
            TSNE(n_components=2).fit_transform(all_embeddings),
            index=G.nodes(node_type=node_type))

        # Draw the points (colors based on ExtendedCaseGraphID)
        node_ids = G.nodes(node_type=node_type).tolist()
        ext_targets = v_sample.loc[[int(node_id)
                                    for node_id in node_ids]].ExtendedCaseGraphID

        label_map = {
            l: i * 10
            for i, l in enumerate(np.unique(ext_targets), start=10)
            if pd.notna(l)
        }
        node_colours = [
            label_map[target] if pd.notna(target) else 0
            for target in ext_targets
        ]

        ttsne1 = time.time()
        print(
            f"TSNE completed in {(ttsne1-ttsne):.2f} s ({(ttsne1-ttsne)/60:.2f} min)"
        )

        alpha = 0.7
        fig, ax = plt.subplots(figsize=(15, 15))
        ax.scatter(
            embeddings_2d[0],
            embeddings_2d[1],
            c=node_colours,
            cmap="jet",
            alpha=alpha,
        )
        ax.set(aspect="equal")
        plt.title(
            f'TSNE visualization of HinSAGE "{node_type}" embeddings with Deep Graph Infomax'
        )
        plt.savefig(f"./embeddings/HinSAGE_DGI_embeddings_{node_type}.pdf")

        return all_embeddings, embeddings_2d
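A hedged call sketch for create_embeddings; the node type string and hyperparameter values are placeholders, and G, v_sample, and verbose must already be in scope:

all_embeddings, embeddings_2d = create_embeddings(node_type="account",  # placeholder type
                                                  num_samples=[8, 4],
                                                  hinsage_layer_sizes=[32, 32],
                                                  epochs=100,
                                                  patience=20,
                                                  batch_size=100,
                                                  dropout=0.2,
                                                  activations=["relu", "linear"])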
Example #7
    def run_for_node_type(v_type, hinsage_layer_sizes, num_samples,
                          activations, epochs):
        nan_tflag = data_splits[v_type].iloc[0].values[0]
        train_tflag = data_splits[v_type].iloc[1].values[0]
        test_tflag = data_splits[v_type].iloc[2].values[0]

        train_cv_set = v_sets[v_type][nan_tflag:nan_tflag + train_tflag]
        train_cv_ids = train_cv_set.index.values.tolist()
        train_cv_labels = v_data.loc[[int(node_id)
                                      for node_id in train_cv_ids]].ExtendedCaseGraphID

        test_set = v_sets[v_type][-test_tflag:]
        test_ids = test_set.index.values.tolist()

        generator = HinSAGENodeGenerator(G,
                                         batch_size,
                                         num_samples,
                                         head_node_type=v_type)

        hinsage = HinSAGE(layer_sizes=hinsage_layer_sizes,
                          activations=activations,
                          generator=generator,
                          bias=True,
                          normalize="l2",
                          dropout=dropout)

        def run_deep_graph_infomax(base_model, generator, epochs):
            print(f"Starting training for {v_type} type: ")
            t0 = time.time()
            corrupted_generator = CorruptedGenerator(generator)
            gen = corrupted_generator.flow(G.nodes(node_type=v_type))
            infomax = DeepGraphInfomax(base_model, corrupted_generator)

            x_in, x_out = infomax.in_out_tensors()

            # Train with DGI
            model = Model(inputs=x_in, outputs=x_out)
            model.compile(loss=tf.nn.sigmoid_cross_entropy_with_logits,
                          optimizer=Adam(lr=1e-3))
            es = EarlyStopping(monitor="loss", min_delta=0, patience=10)
            history = model.fit(gen,
                                epochs=epochs,
                                verbose=verbose,
                                callbacks=[es])
            #sg.utils.plot_history(history)

            x_emb_in, x_emb_out = base_model.in_out_tensors()
            if generator.num_batch_dims() == 2:
                x_emb_out = tf.squeeze(x_emb_out, axis=0)

            t1 = time.time()
            print(f'Time required: {t1-t0:.2f} s ({(t1-t0)/60:.1f} min)')

            return x_emb_in, x_emb_out, model

        #? Train HinSAGE model:
        x_emb_in, x_emb_out, _model = run_deep_graph_infomax(hinsage,
                                                             generator,
                                                             epochs=epochs)

        emb_model = Model(inputs=x_emb_in, outputs=x_emb_out)

        train_cv_embs = emb_model.predict(
            generator.flow(train_cv_set.index.values))

        #? Optional: plot embeddings of the training and CV set of the current
        #? node type, then return early (classification below is skipped)
        if visualize:
            train_cv_embs_2d = pd.DataFrame(
                TSNE(n_components=2).fit_transform(train_cv_embs),
                index=train_cv_set.index.values)
            label_map = {
                l: i * 10
                for i, l in enumerate(np.unique(train_cv_labels), start=10)
                if pd.notna(l)
            }
            node_colours = [
                label_map[target] if pd.notna(target) else 0
                for target in train_cv_labels
            ]

            alpha = 0.7
            fig, ax = plt.subplots(figsize=(15, 15))
            ax.scatter(
                train_cv_embs_2d[0],
                train_cv_embs_2d[1],
                c=node_colours,
                cmap="jet",
                alpha=alpha,
            )
            ax.set(aspect="equal")
            plt.title(
                f"TSNE of HinSAGE {v_type} embeddings with DGI - coloring on ExtendedCaseGraphID"
            )
            plt.show()

            return 1

        #? Split training and cross-validation sets using a simple ordered 80%/20% split
        n_embs = train_cv_embs.shape[0]
        train_size = int(n_embs * 0.80)
        cv_size = int(n_embs * 0.20)

        train_set = train_cv_embs[:train_size]
        train_labels = np.ravel(
            pd.DataFrame(train_cv_labels.values[:train_size]).fillna(0))

        cv_set = train_cv_embs[-cv_size:]
        cv_labels = np.ravel(
            pd.DataFrame(train_cv_labels.values[-cv_size:]).fillna(0))

        #? CLASSIFY
        print(f"Running Classifier for {v_type} type")
        classifier = DecisionTreeClassifier()
        classifier.fit(
            X=train_set,
            y=train_labels,
        )
        cv_pred = classifier.predict(cv_set)
        f1_avg = f1_score(cv_labels, cv_pred, average='weighted')
        acc = (cv_pred == cv_labels).mean()
        print(f"{v_type} CV Metrics: f1: {f1_avg:.6f} - acc: {acc:.6f}")

        #? Now Run on test set
        test_embs = emb_model.predict(generator.flow(test_set.index.values))
        test_pred = classifier.predict(test_embs)

        #? Save predictions
        outdir = './output'
        outname = f"{v_type}_predictions.csv"
        if not os.path.exists(outdir):
            os.mkdir(outdir)
        fullname = os.path.join(outdir, outname)

        output = pd.DataFrame(test_ids)
        output = output.rename(columns={0: 'node_id'})
        output['ExtendedCaseGraphID'] = test_pred
        output = output.set_index('node_id')

        output.to_csv(fullname)

        return output
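A hedged driver sketch for run_for_node_type; v_sets, data_splits, v_data, G, batch_size, dropout, verbose, and visualize are free variables the closure expects in scope, and the hyperparameters are placeholders:

predictions = {}
for v_type in v_sets:
    predictions[v_type] = run_for_node_type(v_type,
                                            hinsage_layer_sizes=[32, 32],
                                            num_samples=[8, 4],
                                            activations=["relu", "linear"],
                                            epochs=100)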