Example #1
def graph_node_classifier(name, train_data, layer_sizes=[32, 32], verbose=1):
    """
    Build and return a neural node classification model.
    Notes: Only mutually-exclusive class labels are supported.
    Args:
        name (string): one of:
                      - 'graphsage' for GraphSAGE model
                      (only GraphSAGE currently supported)
        train_data (NodeSequenceWrapper): a deepwrap.graph.sg_wrappers.NodeSequenceWrapper object
        verbose (boolean): verbosity of output
    Return:
        model (Model): A Keras Model instance
    """
    from .sg_wrappers import NodeSequenceWrapper

    # check argument
    if not isinstance(train_data, NodeSequenceWrapper):
        err = """
            train_data must be a deepwrap.graph.sg_wrappers.NodeSequenceWrapper object
            """
        raise Exception(err)
    if len(layer_sizes) != 2:
        raise ValueError('layer_sizes must be of length 2')

    num_classes = U.nclasses_from_data(train_data)

    # determine multilabel
    multilabel = U.is_multilabel(train_data)
    if multilabel:
        raise ValueError(
            'Multi-label classification not currently supported for graphs.')
    U.vprint("Is Multi-Label? %s" % (multilabel), verbose=verbose)

    # set loss and activations
    loss_func = 'categorical_crossentropy'
    activation = 'softmax'

    # import stellargraph
    try:
        import stellargraph as sg
        from stellargraph.layer import GraphSAGE
    except ImportError:
        raise Exception(SG_ERRMSG)
    if version.parse(sg.__version__) < version.parse('0.8'):
        raise Exception(SG_ERRMSG)

    # build a GraphSAGE node classification model
    graphsage_model = GraphSAGE(
        layer_sizes=layer_sizes,
        generator=train_data,
        bias=True,
        dropout=0.5,
    )
    # x_inp, x_out = graphsage_model.default_model(flatten_output=True)
    x_inp, x_out = graphsage_model.build()
    prediction = Dense(units=num_classes, activation=activation)(x_out)
    model = Model(inputs=x_inp, outputs=prediction)
    model.compile(optimizer='adam', loss=loss_func, metrics=["accuracy"])
    U.vprint('done', verbose=verbose)
    return model
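A minimal usage sketch (hypothetical call; assumes `train_data` is a
NodeSequenceWrapper produced by the library's graph preprocessing step, and
notes that a NodeSequenceWrapper wraps a Keras Sequence, so it can be passed
to fit_generator directly):

# Hypothetical usage -- `train_data` built elsewhere by the library:
model = graph_node_classifier('graphsage', train_data, layer_sizes=[32, 32])
model.fit_generator(train_data, epochs=10)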
Example #2
def graph_link_predictor(name,
                         train_data,
                         preproc,
                         layer_sizes=[20, 20],
                         verbose=1):
    """
    Build and return a neural link prediction model.

    Args:
        name (string): one of:
                      - 'graphsage' for GraphSAGE model 
                      (only GraphSAGE currently supported)

        train_data (LinkSequenceWrapper): a ktrain.graph.sg_wrappers.LinkSequenceWrapper object
        preproc (LinkPreprocessor): a LinkPreprocessor instance
        verbose (boolean): verbosity of output
    Return:
        model (Model): A Keras Model instance
    """
    from .sg_wrappers import LinkSequenceWrapper

    # check argument
    if not isinstance(train_data, LinkSequenceWrapper):
        err = """
            train_data must be a ktrain.graph.sg_wrappers.LinkSequenceWrapper object
            """
        raise Exception(err)
    if len(layer_sizes) != len(preproc.sample_sizes):
        raise ValueError(
            'number of layer_sizes must match len(preproc.sample_sizes)')

    num_classes = U.nclasses_from_data(train_data)

    # set loss and activations
    loss_func = 'categorical_crossentropy'
    activation = 'softmax'

    # import stellargraph
    try:
        import stellargraph as sg
        from stellargraph.layer import GraphSAGE, link_classification
    except ImportError:
        raise Exception(SG_ERRMSG)
    if version.parse(sg.__version__) < version.parse('0.8'):
        raise Exception(SG_ERRMSG)

    # build a GraphSAGE link prediction model
    graphsage = GraphSAGE(layer_sizes=layer_sizes,
                          generator=train_data,
                          bias=True,
                          dropout=0.3)
    x_inp, x_out = graphsage.build()
    prediction = link_classification(output_dim=1,
                                     output_act="relu",
                                     edge_embedding_method='ip')(x_out)
    model = Model(inputs=x_inp, outputs=prediction)
    model.compile(optimizer=U.DEFAULT_OPT,
                  loss='binary_crossentropy',
                  metrics=["accuracy"])
    return model
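A similar hypothetical usage sketch (assumes `train_data` and `preproc` come
from the library's link-prediction preprocessing step):

model = graph_link_predictor('graphsage', train_data, preproc)
model.fit_generator(train_data, epochs=5)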
Example #3
    def train_clf(self, graph, L):
        '''
        Train a GraphSAGE model with the updated labeled pool L.
        Return the newly trained model.
        '''
        train_targets = self.target_encoding.transform(
            self.df_targets.loc[L].to_dict("records"))
        train_gen = self.generator.flow(L, train_targets)

        gsage = GraphSAGE(layer_sizes=[32, 32],
                          generator=self.generator,
                          bias=True,
                          dropout=0.5)

        x_inp, x_out = gsage.build()
        predictions = layers.Dense(units=train_targets.shape[1],
                                   activation="softmax")(x_out)

        class_support = dict(Counter(self.df_targets.loc[L]["label"]))
        classes = sorted(self.data.class_labels)
        counts = [
            class_support[c] if c in class_support else 0 for c in classes
        ]
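        # Inverse-frequency class weights (assumes every class in `classes`
        # appears at least once in L; otherwise this divides by zero).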
        weights = np.sum(counts) / np.array(counts)
        weighted_loss = self.weighted_categorical_crossentropy(weights)

        model = Model(inputs=x_inp, outputs=predictions)
        model.compile(
            optimizer=optimizers.Adam(lr=0.2),
            # loss=losses.categorical_crossentropy,
            loss=weighted_loss,
            metrics=["acc"],
        )

        # if not os.path.isdir("model_logs"):
        #     os.makedirs("model_logs")
        # es_callback = EarlyStopping(
        #     monitor="acc", patience=50
        # )  # patience is the number of epochs to wait before early stopping in case of no further improvement
        # mc_callback = ModelCheckpoint(
        #     "model_logs/best_model.h5", monitor="acc", save_best_only=True, save_weights_only=True
        # )

        history = model.fit_generator(
            train_gen,
            epochs=50,
            verbose=0,
            # shuffle must stay False: shuffling batches here would amount to
            # shuffling the whole graph
            shuffle=False,
            # callbacks=[es_callback, mc_callback],
        )

        # model.load_weights("model_logs/best_model.h5")

        return model
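The method above calls a `self.weighted_categorical_crossentropy` helper that
is not shown. A common Keras implementation of that pattern (a sketch, not
necessarily the author's version) is:

from keras import backend as K

def weighted_categorical_crossentropy(weights):
    """Categorical cross-entropy with per-class weights, shape (num_classes,)."""
    weights = K.variable(weights)

    def loss(y_true, y_pred):
        # Renormalize predictions and clip to avoid log(0).
        y_pred = y_pred / K.sum(y_pred, axis=-1, keepdims=True)
        y_pred = K.clip(y_pred, K.epsilon(), 1 - K.epsilon())
        # Cross-entropy, scaled per class and summed over classes.
        return -K.sum(y_true * K.log(y_pred) * weights, axis=-1)

    return loss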
Example #4
def create_graphSAGE_model(graph, link_prediction=False):

    if link_prediction:
        # We are going to train on the original graph
        generator = GraphSAGELinkGenerator(graph,
                                           batch_size=2,
                                           num_samples=[2, 2])
        edge_ids_train = np.array([[1, 2], [2, 3], [1, 3]])
        train_gen = generator.flow(edge_ids_train, np.array([1, 1, 0]))
    else:
        generator = GraphSAGENodeGenerator(graph,
                                           batch_size=2,
                                           num_samples=[2, 2])
        train_gen = generator.flow([1, 2], np.array([[1, 0], [0, 1]]))


    base_model = GraphSAGE(layer_sizes=[8, 8],
                           generator=generator,
                           bias=True,
                           dropout=0.5)

    # Expose input and output sockets of graphsage (for link prediction these
    # come in source/destination pairs):
    x_inp, x_out = base_model.build()

    if link_prediction:
        prediction = link_classification(output_dim=1,
                                         output_act="relu",
                                         edge_embedding_method="ip")(x_out)
    else:
        prediction = layers.Dense(units=2, activation="softmax")(x_out)

    keras_model = Model(inputs=x_inp, outputs=prediction)

    return base_model, keras_model, generator, train_gen
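A toy usage sketch for the factory above (uses the pre-1.0 StellarGraph
constructor that these examples assume; the graph and features are made up):

import networkx as nx
import numpy as np
import stellargraph as sg

Gnx = nx.cycle_graph(4)
for n in Gnx.nodes():
    Gnx.nodes[n]["feature"] = np.ones(4, dtype=np.float32)
graph = sg.StellarGraph(Gnx, node_features="feature")

base_model, keras_model, generator, train_gen = create_graphSAGE_model(graph)
keras_model.compile(optimizer="adam", loss="categorical_crossentropy",
                    metrics=["acc"])
keras_model.fit_generator(train_gen, epochs=1, verbose=0)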
Example #5
# In[17]:


layer_sizes = [20, 20]
assert len(layer_sizes) == len(num_samples)

graphsage = GraphSAGE(
    layer_sizes=layer_sizes, generator=generator, bias=True, dropout=0.5
)


# In[18]:


# Build the model and expose the input and output tensors.
x_inp, x_out = graphsage.build()


# Final link classification layer that takes a pair of node embeddings produced by graphsage, applies a binary
# operator to them to produce the corresponding link embedding ('ip' for inner product; other options for the binary
# operator can be seen by running a cell with `?link_classification` in it), and passes it through a dense layer:

# In[19]:


prediction = link_classification(
    output_dim=1, output_act="relu", edge_embedding_method="ip"
)(x_out)


# Stack the GraphSAGE and prediction layers into a Keras model.
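The cell that typically follows stacks these layers into a Keras model and
compiles it; a sketch consistent with the snippet above (the cell number is
assumed):

# In[20]:

model = keras.Model(inputs=x_inp, outputs=prediction)
model.compile(
    optimizer=keras.optimizers.Adam(lr=1e-3),
    loss=keras.losses.binary_crossentropy,
    metrics=["acc"],
)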
Example #6
def train(
    G,
    layer_size: List[int],
    num_samples: List[int],
    batch_size: int = 100,
    num_epochs: int = 10,
    learning_rate: float = 0.001,
    dropout: float = 0.0,
):
    """
    Train the GraphSAGE model on the specified graph G
    with given parameters.

    Args:
        G: NetworkX graph
        layer_size: A list of number of hidden units in each layer of the GraphSAGE model
        num_samples: Number of neighbours to sample at each layer of the GraphSAGE model
        batch_size: Size of batch for inference
        num_epochs: Number of epochs to train the model
        learning_rate: Initial learning rate
        dropout: The dropout rate (0 -> 1)

        Note: negative-link sampling and edge-embedding options are read from
        a module-level `args` object (see the argparse sketch after this
        function).
    """

    # Split links into train/test
    print("Using '{}' method to sample negative links".format(
        args.edge_sampling_method))

    # From the original graph, extract E_test and the reduced graph G_test:
    edge_splitter_test = EdgeSplitter(G)
    # Randomly sample a fraction p=0.1 of all positive links, and same number of negative links, from G, and obtain the
    # reduced graph G_test with the sampled links removed:
    G_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
        p=0.1,
        keep_connected=True,
        method=args.edge_sampling_method,
        probs=args.edge_sampling_probs,
    )

    # From G_test, extract E_train and the reduced graph G_train:
    edge_splitter_train = EdgeSplitter(G_test, G)
    # Randomly sample a fraction p=0.1 of all positive links, and same number of negative links, from G_test, and obtain the
    # further reduced graph G_train with the sampled links removed:
    G_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
        p=0.1,
        keep_connected=True,
        method=args.edge_sampling_method,
        probs=args.edge_sampling_probs,
    )

    # G_train, edge_ds_train, edge_labels_train will be used for model training
    # G_test, edge_ds_test, edge_labels_test will be used for model testing

    # Convert G_train and G_test to StellarGraph objects (undirected, as required by GraphSAGE) for ML:
    G_train = sg.StellarGraph(G_train, node_features="feature")
    G_test = sg.StellarGraph(G_test, node_features="feature")

    # Mapper feeds link data from sampled subgraphs to GraphSAGE model
    # We need to create two mappers: for training and testing of the model
    train_gen = GraphSAGELinkGenerator(G_train, batch_size, num_samples)
    train_flow = train_gen.flow(edge_ids_train,
                                edge_labels_train,
                                shuffle=True)

    test_gen = GraphSAGELinkGenerator(G_test, batch_size, num_samples)
    test_flow = test_gen.flow(edge_ids_test, edge_labels_test)

    # GraphSAGE model
    graphsage = GraphSAGE(layer_sizes=layer_size,
                          generator=train_gen,
                          bias=True,
                          dropout=dropout)

    # Construct input and output tensors for the link prediction model
    x_inp, x_out = graphsage.build()

    # Final estimator layer
    prediction = link_classification(
        output_dim=1,
        output_act="sigmoid",
        edge_embedding_method=args.edge_embedding_method,
    )(x_out)

    # Stack the GraphSAGE and prediction layers into a Keras model, and specify the loss
    model = keras.Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=optimizers.Adam(lr=learning_rate),
        loss=losses.binary_crossentropy,
        metrics=[metrics.binary_accuracy],
    )

    # Evaluate the initial (untrained) model on the train and test set:
    init_train_metrics = model.evaluate_generator(train_flow)
    init_test_metrics = model.evaluate_generator(test_flow)

    print("\nTrain Set Metrics of the initial (untrained) model:")
    for name, val in zip(model.metrics_names, init_train_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    print("\nTest Set Metrics of the initial (untrained) model:")
    for name, val in zip(model.metrics_names, init_test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Train model
    print("\nTraining the model for {} epochs...".format(num_epochs))
    history = model.fit_generator(
        train_flow,
        epochs=num_epochs,
        validation_data=test_flow,
        verbose=2,
        shuffle=False,
    )

    # Evaluate and print metrics
    train_metrics = model.evaluate_generator(train_flow)
    test_metrics = model.evaluate_generator(test_flow)

    print("\nTrain Set Metrics of the trained model:")
    for name, val in zip(model.metrics_names, train_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    print("\nTest Set Metrics of the trained model:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Save the trained model
    save_str = "_n{}_l{}_d{}_r{}".format(
        "_".join([str(x) for x in num_samples]),
        "_".join([str(x) for x in layer_size]),
        dropout,
        learning_rate,
    )
    model.save("graphsage_link_pred" + save_str + ".h5")
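The function above reads its sampling options from a module-level `args`
object. A hypothetical argparse block consistent with those references
(argument names inferred from the code; defaults are guesses):

import argparse

parser = argparse.ArgumentParser(description="GraphSAGE link prediction")
parser.add_argument("--edge_sampling_method", default="global",
                    help="method for sampling negative links")
parser.add_argument("--edge_sampling_probs", default=None,
                    help="per-distance probabilities for 'local' sampling")
parser.add_argument("--edge_embedding_method", default="ip",
                    help="binary operator for edge embeddings ('ip', 'l1', ...)")
args = parser.parse_args()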
Example #7
def train(
    edgelist,
    node_data,
    layer_size,
    num_samples,
    batch_size=100,
    num_epochs=10,
    learning_rate=0.005,
    dropout=0.0,
    target_name="subject",
):
    """
    Train a GraphSAGE model on the specified graph G with given parameters, evaluate it, and save the model.

    Args:
        edgelist: Graph edgelist
        node_data: Feature and target data for nodes
        layer_size: A list of number of hidden nodes in each layer
        num_samples: Number of neighbours to sample at each layer
        batch_size: Size of batch for inference
        num_epochs: Number of epochs to train the model
        learning_rate: Initial learning rate
        dropout: The dropout rate (0 -> 1)
        target_name: Name of the target node attribute to predict
    """
    # Extract target and encode as a one-hot vector
    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    node_targets = target_encoding.fit_transform(
        node_data[[target_name]].to_dict("records"))
    node_ids = node_data.index

    # Extract the feature data. These are the feature vectors that the Keras model will use as input.
    # The CORA dataset contains attributes 'w_x' that correspond to words found in that publication.
    node_features = node_data[feature_names]

    # Create graph from edgelist and set node features and node type
    Gnx = nx.from_pandas_edgelist(edgelist, edge_attr="label")
    nx.set_node_attributes(Gnx, "paper", "label")

    # Convert to StellarGraph and prepare for ML
    G = sg.StellarGraph(Gnx,
                        node_type_name="label",
                        node_features=node_features)

    # Split nodes into train/test using stratification.
    train_nodes, test_nodes, train_targets, test_targets = model_selection.train_test_split(
        node_ids,
        node_targets,
        train_size=140,
        test_size=None,
        stratify=node_targets,
        random_state=5232,
    )

    # Split test set into test and validation
    val_nodes, test_nodes, val_targets, test_targets = model_selection.train_test_split(
        test_nodes,
        test_targets,
        train_size=500,
        test_size=None,
        random_state=5214)

    # Create mappers for GraphSAGE that input data from the graph to the model
    generator = GraphSAGENodeGenerator(G, batch_size, num_samples, seed=5312)
    train_gen = generator.flow(train_nodes, train_targets, shuffle=True)
    val_gen = generator.flow(val_nodes, val_targets)

    # GraphSAGE model
    model = GraphSAGE(
        layer_sizes=layer_size,
        generator=train_gen,
        bias=True,
        dropout=dropout,
        aggregator=MeanAggregator,
    )
    # Expose the input and output sockets of the model:
    x_inp, x_out = model.build()

    # Snap the final estimator layer to x_out
    prediction = layers.Dense(units=train_targets.shape[1],
                              activation="softmax")(x_out)

    # Create Keras model for training
    model = keras.Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=optimizers.Adam(lr=learning_rate, decay=0.001),
        loss=losses.categorical_crossentropy,
        metrics=[metrics.categorical_accuracy],
    )
    print(model.summary())

    # Train model
    history = model.fit_generator(train_gen,
                                  epochs=num_epochs,
                                  validation_data=val_gen,
                                  verbose=2,
                                  shuffle=False)

    # Evaluate on test set and print metrics
    test_metrics = model.evaluate_generator(
        generator.flow(test_nodes, test_targets))
    print("\nTest Set Metrics:")
    for name, val in zip(model.metrics_names, test_metrics):
        print("\t{}: {:0.4f}".format(name, val))

    # Get predictions for all nodes
    all_predictions = model.predict_generator(generator.flow(node_ids))

    # Turn predictions back into the original categories
    node_predictions = pd.DataFrame(
        target_encoding.inverse_transform(all_predictions), index=node_ids)
    accuracy = np.mean([
        "subject=" + gt_subject == p for gt_subject, p in zip(
            node_data["subject"], node_predictions.idxmax(axis=1))
    ])
    print("All-node accuracy: {:3f}".format(accuracy))

    # TODO: extract the GraphSAGE embeddings from x_out, and save/plot them

    # Save the trained model
    save_str = "_n{}_l{}_d{}_r{}".format(
        "_".join([str(x) for x in num_samples]),
        "_".join([str(x) for x in layer_size]),
        dropout,
        learning_rate,
    )
    model.save("cora_example_model" + save_str + ".h5")

    # We must also save the target encoding to convert model predictions
    with open("cora_example_encoding" + save_str + ".pkl", "wb") as f:
        pickle.dump([target_encoding], f)
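Example #7 also leans on module-level imports and a `feature_names` list that
are not shown. For the CORA dataset this context usually looks like the
following (a sketch of assumed surroundings, using the same pre-1.0 APIs):

import pickle

import keras
import networkx as nx
import numpy as np
import pandas as pd
import stellargraph as sg
from keras import layers, losses, metrics, optimizers
from sklearn import feature_extraction, model_selection
from stellargraph.layer.graphsage import GraphSAGE, MeanAggregator
from stellargraph.mapper import GraphSAGENodeGenerator

# CORA publications carry 1433 binary word attributes named w_0 .. w_1432.
feature_names = ["w_{}".format(i) for i in range(1433)]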
Example #8
    test_ids = nodes[5000:]
    train_labels = [graph.nodes[id]["_class"] for id in train_ids]
    test_labels = [graph.nodes[id]["_class"] for id in test_ids]
    all_labels = train_labels + test_labels
    train_labels = np.array(train_labels).reshape(len(train_ids), 1)
    test_labels = np.array(test_labels).reshape(len(test_ids), 1)
    print(np.unique(train_labels, return_counts=True))
    print(np.unique(test_labels, return_counts=True))
    generator = GraphSAGENodeGenerator(G, batch_size=50, num_samples=[10, 10])
    train_data_gen = generator.flow(train_ids, train_labels)
    test_data_gen = generator.flow(test_ids, test_labels)
    all_gen = generator.flow(list(nodes), all_labels)

    print("Node Gen done!")
    base_model = GraphSAGE(layer_sizes=[32, 32], generator=generator, bias=True, dropout=0.8)
    x_in, x_out = base_model.build()
    prediction = layers.Dense(units=2, activation="softmax")(x_out)

    print("model building done")

    model = Model(inputs=x_in, outputs=prediction)
    model.compile(optimizer=optimizers.Adam(lr=0.005),
                  loss=losses.categorical_crossentropy, metrics=["acc"])
    tensorboard = callbacks.TensorBoard(log_dir="logs", embeddings_freq=1,
                                        update_freq=1, histogram_freq=1)
    history = model.fit(train_data_gen, epochs=4,
                        validation_data=test_data_gen, verbose=True,
                        shuffle=False, callbacks=[tensorboard])
    print(history.history)

    y_pred = model.predict(train_data_gen, verbose=1)
    labels = np.argmax(y_pred, axis=1)
    print("prediction done")
    # classification_report expects (y_true, y_pred)
    print(classification_report(train_labels, labels))
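A natural follow-up sketch: score the held-out split the same way (this
assumes the label encodings in the two arrays match, as in the lines above):

    y_pred_test = model.predict(test_data_gen, verbose=1)
    test_pred = np.argmax(y_pred_test, axis=1)
    print(classification_report(test_labels, test_pred))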
Example #9
assert set(nodes_train).issuperset(actual_nodes_train)
unsupervised_samples = UnsupervisedSampler(Gtrain,
                                           nodes=actual_nodes_train,
                                           length=length_of_walks,
                                           number_of_walks=number_of_walks)
train_gen = GraphSAGELinkGenerator(Gtrain, batch_size,
                                   num_samples).flow(unsupervised_samples)

# Build the model
assert len(layer_sizes) == len(num_samples)
graphsage = GraphSAGE(layer_sizes=layer_sizes,
                      generator=train_gen,
                      bias=bias,
                      dropout=0.0,
                      normalize="l2")
x_inp, x_out = graphsage.build(flatten_output=False)
prediction = link_classification(output_dim=1,
                                 output_act="sigmoid",
                                 edge_embedding_method='ip')(x_out)
model = keras.Model(inputs=x_inp, outputs=prediction)
model.compile(
    optimizer=keras.optimizers.Adam(lr=1e-3),
    loss=keras.losses.binary_crossentropy,
    metrics=[keras.metrics.binary_accuracy],
)

# Train the model
history = model.fit_generator(
    train_gen,
    epochs=nepochs,
    verbose=verbose,
)
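With `flatten_output=False`, `x_inp` and `x_out` hold interleaved tensors for
the source and destination nodes of each candidate link. The usual next step
in the unsupervised workflow extracts per-node embeddings from the source
half (a sketch, assuming GraphSAGENodeGenerator is also imported):

x_inp_src = x_inp[0::2]  # even-indexed inputs belong to the source nodes
embedding_model = keras.Model(inputs=x_inp_src, outputs=x_out[0])

node_gen = GraphSAGENodeGenerator(Gtrain, batch_size,
                                  num_samples).flow(list(Gtrain.nodes()))
node_embeddings = embedding_model.predict_generator(node_gen)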
Example #10
def train_model(Gnx, train_data, test_data, all_features):
    output_results = {}
    from collections import Counter
    #TODO: save size of dataset, train_data, and test data
    #save the count of each subject in the blocks
    print(len(train_data), len(test_data))
    subject_groups_train = Counter(train_data['subject'])
    subject_groups_test = Counter(test_data['subject'])
    output_results['train_size'] = len(train_data)
    output_results['test_size'] = len(test_data)
    output_results['subject_groups_train'] = subject_groups_train
    output_results['subject_groups_test'] = subject_groups_test

    #node_features = train_data[feature_names]
    #print (feature_names)
    G = sg.StellarGraph(Gnx, node_features=all_features)
    #TODO: save graph info
    print(G.info())
    print("writing graph.dot")
    #write_dot(Gnx,"graph.dot")
    output_results['graph_info'] = G.info()
    print("building the graph generator...")

    batch_size = 50
    num_samples = [10, 5]
    generator = GraphSAGENodeGenerator(G, batch_size, num_samples)
    #generator = HinSAGENodeGenerator(G, batch_size, num_samples)

    target_encoding = feature_extraction.DictVectorizer(sparse=False)
    train_targets = target_encoding.fit_transform(
        train_data[["subject"]].to_dict('records'))
    print(np.unique(train_data["subject"].to_list()))
    class_weights = class_weight.compute_class_weight(
        class_weight='balanced',
        classes=np.unique(train_data["subject"].to_list()),
        y=train_data["subject"].to_list())
    # Keras expects class_weight as a dict mapping class index -> weight
    class_weights = dict(enumerate(class_weights))
    print('class_weights', class_weights)
    test_targets = target_encoding.transform(test_data[["subject"
                                                        ]].to_dict('records'))
    train_gen = generator.flow(train_data.index, train_targets, shuffle=True)
    graphsage_model = GraphSAGE(
        #graphsage_model = HinSAGE(
        #layer_sizes=[32, 32],
        layer_sizes=[80, 80],
        generator=generator,  #train_gen,
        bias=True,
        dropout=0.5,
    )
    print("building model...")
    #x_inp, x_out = graphsage_model.build(flatten_output=True)
    x_inp, x_out = graphsage_model.build()
    prediction = layers.Dense(units=train_targets.shape[1],
                              activation="softmax")(x_out)

    model = Model(inputs=x_inp, outputs=prediction)
    print("compiling model...")
    model.compile(
        optimizer=optimizers.Adam(lr=0.005),
        loss=losses.categorical_crossentropy,
        metrics=["acc", metrics.categorical_accuracy],
    )
    print("testing the model...")
    test_gen = generator.flow(test_data.index, test_targets)
    history = model.fit_generator(
        train_gen,
        epochs=EPOCH,
        validation_data=test_gen,
        verbose=2,
        shuffle=True,
        class_weight=class_weights,
    )
    # save test metrics
    test_metrics = model.evaluate_generator(test_gen)
    print("\nTest Set Metrics:")
    output_results['test_metrics'] = []
    for name, val in zip(model.metrics_names, test_metrics):
        output_results['test_metrics'].append({'name': name, 'val': val})
        print("\t{}: {:0.4f}".format(name, val))

    test_nodes = test_data.index
    test_mapper = generator.flow(test_nodes)
    test_predictions = model.predict_generator(test_mapper)
    node_predictions = target_encoding.inverse_transform(test_predictions)
    results = pd.DataFrame(node_predictions, index=test_nodes).idxmax(axis=1)
    df = pd.DataFrame({
        "Predicted": results,
        "True": test_data['subject']
    })  #, "program":test_data['program']})
    clean_result_labels = df["Predicted"].map(
        lambda x: x.replace('subject=', ''))
    # save predicted labels
    pred_labels = np.unique(clean_result_labels.values)
    #pred_program = np.unique(df['program'].values)
    # save predictions per label
    precision, recall, f1, _ = skmetrics.precision_recall_fscore_support(
        df['True'].values,
        clean_result_labels.values,
        average=None,
        labels=pred_labels)
    output_results['classifier'] = []
    for lbl, prec, rec, fm in zip(pred_labels, precision, recall, f1):
        output_results['classifier'].append({
            'label': lbl,
            'precision': prec,
            'recall': rec,
            'fscore': fm
        })
    print(output_results['classifier'])
    print(pred_labels)
    print('precision: {}'.format(precision))
    print('recall: {}'.format(recall))
    print('fscore: {}'.format(f1))

    return generator, model, x_inp, x_out, history, target_encoding, output_results
Example #11
    def _train_model(self, gnx, train_data, test_data, all_features,
                     target_feature_name):
        subject_groups_train = Counter(train_data[target_feature_name])
        subject_groups_test = Counter(test_data[target_feature_name])

        graph = sg.StellarGraph(gnx, node_features=all_features)

        output_results = {
            'train_size': len(train_data),
            'test_size': len(test_data),
            'subject_groups_train': subject_groups_train,
            'subject_groups_test': subject_groups_test,
            'graph_info': graph.info()
        }

        num_samples = [10, 5]
        generator = GraphSAGENodeGenerator(graph, self.batch_size, num_samples)

        target_encoding = feature_extraction.DictVectorizer(sparse=False)
        train_targets = target_encoding.fit_transform(
            train_data[[target_feature_name]].to_dict('records'))
        class_weights = class_weight.compute_class_weight(
            class_weight='balanced',
            classes=np.unique(train_data[target_feature_name].to_list()),
            y=train_data[target_feature_name].to_list())
        class_weights = dict(enumerate(class_weights))
        test_targets = target_encoding.transform(
            test_data[[target_feature_name]].to_dict('records'))
        train_gen = generator.flow(train_data.index,
                                   train_targets,
                                   shuffle=True)
        graph_sage_model = GraphSAGE(
            layer_sizes=[80, 80],
            generator=generator,  # train_gen,
            bias=True,
            dropout=0.5,
        )
        print('building model...')

        x_inp, x_out = graph_sage_model.build()
        prediction = layers.Dense(units=train_targets.shape[1],
                                  activation="softmax")(x_out)

        model = Model(inputs=x_inp, outputs=prediction)
        print('compiling model...')
        model.compile(
            optimizer=optimizers.Adam(learning_rate=0.005),
            loss=losses.categorical_crossentropy,
            metrics=['acc', metrics.categorical_accuracy],
        )
        print('testing the model...')
        test_gen = generator.flow(test_data.index, test_targets)
        history = model.fit(
            train_gen,
            epochs=self.num_epochs,
            validation_data=test_gen,
            verbose=2,
            shuffle=True,
            class_weight=class_weights,
        )
        # save test metrics
        test_metrics = model.evaluate(test_gen)
        print('Test Set Metrics:')
        output_results['test_metrics'] = []
        for name, val in zip(model.metrics_names, test_metrics):
            output_results['test_metrics'].append({'name': name, 'val': val})
            print("\t{}: {:0.4f}".format(name, val))

        test_nodes = test_data.index
        test_mapper = generator.flow(test_nodes)
        test_predictions = model.predict(test_mapper)
        node_predictions = target_encoding.inverse_transform(test_predictions)
        results = pd.DataFrame(node_predictions,
                               index=test_nodes).idxmax(axis=1)
        df = pd.DataFrame({
            'Predicted': results,
            'True': test_data[target_feature_name]
        })
        clean_result_labels = df['Predicted'].map(
            lambda x: x.replace('subject=', ''))

        # save predicted labels
        pred_labels = np.unique(clean_result_labels.values)
        precision, recall, f1, _ = skmetrics.precision_recall_fscore_support(
            df['True'].values,
            clean_result_labels.values,
            average=None,
            labels=pred_labels)
        output_results['classifier'] = []
        for lbl, prec, rec, fm in zip(pred_labels, precision, recall, f1):
            output_results['classifier'].append({
                'label': lbl,
                'precision': prec,
                'recall': rec,
                'fscore': fm
            })

        print(output_results['classifier'])
        print(pred_labels)
        print('precision: {}'.format(precision))
        print('recall: {}'.format(recall))
        print('fscore: {}'.format(f1))

        output_results['history'] = {
            'epochs': history.epoch,
            'training_log': history.history,
            'training_params': history.params
        }

        return generator, model, x_inp, x_out, history, target_encoding, output_results