# Shared imports assumed by the snippets below (exact module paths depend on
# the StellarGraph / TensorFlow versions these snippets were written against):
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import model_selection
from sklearn import metrics as sk_metrics
from sklearn.manifold import TSNE
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from stellargraph.layer import DeepGraphInfomax, HinSAGE, link_regression
from stellargraph.mapper import (
    CorruptedGenerator,
    HinSAGELinkGenerator,
    HinSAGENodeGenerator,
)
from tensorflow import keras
from tensorflow.keras import Model, layers, metrics, optimizers
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import binary_crossentropy
from tensorflow.keras.optimizers import Adam


def inductive_step_hinsage(self, S, trained_model, inductive_node_identifiers, batch_size):
    """
    This function generates embeddings for unseen nodes using a trained HinSAGE model.
    It returns the embeddings for these unseen nodes.

    Parameters
    ----------
    S : StellarGraph Object
        The graph on which HinSAGE is deployed.
    trained_model : Neural Network
        The trained HinSAGE model, containing the trained and optimized
        aggregation functions per depth.
    inductive_node_identifiers : list
        Defines the nodes for which HinSAGE needs to generate embeddings.
    batch_size : int
        Batch size for the neural network in which HinSAGE is implemented.
    """

    # The mapper feeds data from the sampled subgraph to the HinSAGE model
    generator = HinSAGENodeGenerator(
        S, batch_size, self.num_samples, head_node_type=self.embedding_for_node_type
    )
    test_gen_not_shuffled = generator.flow(inductive_node_identifiers, shuffle=False)

    inductive_emb = trained_model.predict(test_gen_not_shuffled, verbose=1)
    inductive_emb = pd.DataFrame(inductive_emb, index=inductive_node_identifiers)

    return inductive_emb
def train_hinsage(self, S, node_identifiers, label, batch_size, epochs):
    """
    This function trains a HinSAGE model, implemented in TensorFlow.
    It returns the trained HinSAGE model and a pandas dataframe containing
    the embeddings generated for the train nodes.

    Parameters
    ----------
    S : StellarGraph Object
        The graph on which HinSAGE trains its aggregator functions.
    node_identifiers : list
        Defines the nodes that HinSAGE uses to train its aggregation functions.
    label : Pandas dataframe
        Defines the label of the nodes used for training, with the index
        representing the nodes.
    batch_size : int
        Batch size to train the neural network in which HinSAGE is implemented.
    epochs : int
        Number of epochs for the neural network.
    """

    # Train/validation split: first 80% of the nodes for training, last 20% for validation
    train_node_identifiers = node_identifiers[:round(0.8 * len(node_identifiers))]
    train_labels = label.loc[train_node_identifiers]
    validation_node_identifiers = node_identifiers[round(0.8 * len(node_identifiers)):]
    validation_labels = label.loc[validation_node_identifiers]

    # The mapper feeds data from the sampled subgraph to the HinSAGE model
    generator = HinSAGENodeGenerator(
        S, batch_size, self.num_samples, head_node_type=self.embedding_for_node_type
    )
    train_gen = generator.flow(train_node_identifiers, train_labels, shuffle=True)
    test_gen = generator.flow(validation_node_identifiers, validation_labels)

    # HinSAGE model
    model = HinSAGE(
        layer_sizes=[self.embedding_size] * len(self.num_samples),
        generator=generator,
        dropout=0,
    )
    x_inp, x_out = model.build()

    # Final estimator layer
    prediction = layers.Dense(units=1, activation="sigmoid", dtype="float32")(x_out)

    # Create Keras model for training
    model = Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=optimizers.Adam(learning_rate=1e-3),
        loss=binary_crossentropy,
    )

    # Train the model
    model.fit(
        train_gen, epochs=epochs, verbose=1, validation_data=test_gen, shuffle=False
    )

    # Strip off the prediction head to obtain the embedding model, then compute
    # embeddings for all training nodes in their original order
    trained_model = Model(inputs=x_inp, outputs=x_out)
    train_gen_not_shuffled = generator.flow(node_identifiers, label, shuffle=False)
    embeddings_train = trained_model.predict(train_gen_not_shuffled)
    train_emb = pd.DataFrame(embeddings_train, index=node_identifiers)

    return trained_model, train_emb
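# Usage sketch for the two methods above (illustrative only): they are written
# as methods of a class carrying `num_samples`, `embedding_for_node_type`, and
# `embedding_size`, so a minimal hypothetical holder is defined here. The names
# `S` (a prepared StellarGraph), `train_ids`, `labels`, and `new_ids` must come
# from your own data pipeline and are not defined in this snippet.
class HinSAGEEmbedder:  # hypothetical wrapper, named here for illustration only
    def __init__(self, embedding_size, num_samples, embedding_for_node_type):
        self.embedding_size = embedding_size
        self.num_samples = num_samples
        self.embedding_for_node_type = embedding_for_node_type

    # bind the module-level functions above as methods
    train_hinsage = train_hinsage
    inductive_step_hinsage = inductive_step_hinsage

# embedder = HinSAGEEmbedder(64, [8, 4], "transaction")  # placeholder values
# trained_model, train_emb = embedder.train_hinsage(S, train_ids, labels, batch_size=128, epochs=10)
# new_emb = embedder.inductive_step_hinsage(S, trained_model, new_ids, batch_size=128)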
def create_HinSAGE_model(graph, link_prediction=False):
    # Build the appropriate data generator and training flow for the task
    if link_prediction:
        generator = HinSAGELinkGenerator(graph, batch_size=2, num_samples=[2, 1])
        edge_ids_train = np.array([[1, 2], [2, 3], [1, 3]])
        train_gen = generator.flow(edge_ids_train, np.array([1, 1, 0]))
    else:
        generator = HinSAGENodeGenerator(graph, batch_size=2, num_samples=[2, 2])
        train_gen = generator.flow([1, 2], np.array([[1, 0], [0, 1]]))

    base_model = HinSAGE(layer_sizes=[8, 8], generator=train_gen, bias=True, dropout=0.5)

    # Define input and output sockets of HinSAGE:
    x_inp, x_out = base_model.build()

    # Final estimator layer
    if link_prediction:
        prediction = link_regression(edge_embedding_method="ip")(x_out)
    else:
        prediction = layers.Dense(units=2, activation="softmax")(x_out)

    keras_model = Model(inputs=x_inp, outputs=prediction)

    return base_model, keras_model, generator, train_gen
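# Usage sketch (illustrative): `create_HinSAGE_model` hard-codes small node/edge
# id sets (1, 2, 3) and two-layer sampling, so `graph` must be a StellarGraph
# containing at least those nodes with feature vectors. The compile/fit lines
# assume the same (older) StellarGraph API as the function itself, which passes
# the training flow as `generator` and calls `build()`.
#
# base_model, keras_model, generator, train_gen = create_HinSAGE_model(graph)
# keras_model.compile(optimizer="adam", loss="categorical_crossentropy")
# keras_model.fit(train_gen, epochs=5)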
def train(
    G,
    user_targets,
    layer_size,
    num_samples,
    batch_size,
    num_epochs,
    learning_rate,
    dropout,
):
    """
    Train a HinSAGE model on the specified graph G with given parameters.

    Args:
        G: A StellarGraph object ready for machine learning
        user_targets: A pandas DataFrame of target labels for the "user" nodes
        layer_size: A list of number of hidden nodes in each layer
        num_samples: Number of neighbours to sample at each layer
        batch_size: Size of batch for inference
        num_epochs: Number of epochs to train the model
        learning_rate: Initial learning rate
        dropout: The dropout rate (0 to 1)
    """
    print(G.info())

    # Split "user" nodes into train/test sets
    train_targets, test_targets = model_selection.train_test_split(
        user_targets, train_size=0.25, test_size=None
    )
    print("Train targets:\n", train_targets.iloc[:, 0].value_counts())
    print("Test targets:\n", test_targets.iloc[:, 0].value_counts())

    # The mapper feeds data from the sampled subgraph to the HinSAGE model
    generator = HinSAGENodeGenerator(G, batch_size, num_samples, head_node_type="user")
    train_gen = generator.flow_from_dataframe(train_targets, shuffle=True)
    test_gen = generator.flow_from_dataframe(test_targets)

    # HinSAGE model
    model = HinSAGE(layer_sizes=layer_size, generator=generator, dropout=dropout)
    x_inp, x_out = model.build()

    # Final estimator layer
    prediction = layers.Dense(units=train_targets.shape[1], activation="softmax")(x_out)

    # The elite label is only true for a small fraction of the total users,
    # so weight the training loss to ensure that the model learns to predict
    # the positive class.
    # class_count = train_targets.values.sum(axis=0)
    # weights = class_count.sum()/class_count
    weights = [0.01, 1.0]
    print("Weighting loss by: {}".format(weights))

    # Create Keras model for training
    model = keras.Model(inputs=x_inp, outputs=prediction)
    model.compile(
        optimizer=optimizers.Adam(learning_rate=learning_rate),
        loss=weighted_binary_crossentropy(weights),
        metrics=[metrics.binary_accuracy],
    )

    # Train the model
    history = model.fit(train_gen, epochs=num_epochs, verbose=2, shuffle=False)

    # Evaluate on the test set and print metrics
    predictions = model.predict(test_gen)
    binary_predictions = predictions[:, 1] > 0.5

    print("\nTest Set Metrics (on {} nodes)".format(len(predictions)))

    # Calculate metrics using scikit-learn
    cm = sk_metrics.confusion_matrix(test_targets.iloc[:, 1], binary_predictions)
    print("Confusion matrix:")
    print(cm)

    accuracy = sk_metrics.accuracy_score(test_targets.iloc[:, 1], binary_predictions)
    precision = sk_metrics.precision_score(test_targets.iloc[:, 1], binary_predictions)
    recall = sk_metrics.recall_score(test_targets.iloc[:, 1], binary_predictions)
    f1 = sk_metrics.f1_score(test_targets.iloc[:, 1], binary_predictions)
    roc_auc = sk_metrics.roc_auc_score(test_targets.iloc[:, 1], binary_predictions)
    print(
        "accuracy = {:0.3}, precision = {:0.3}, recall = {:0.3}, f1 = {:0.3}".format(
            accuracy, precision, recall, f1
        )
    )
    print("ROC AUC = {:0.3}".format(roc_auc))

    # Save the model, encoding the hyperparameters in the filename
    save_str = "_n{}_l{}_d{}_r{}".format(
        "_".join([str(x) for x in num_samples]),
        "_".join([str(x) for x in layer_size]),
        dropout,
        learning_rate,
    )
    model.save("yelp_model" + save_str + ".h5")
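# `weighted_binary_crossentropy` above is defined elsewhere in the original
# script. A minimal sketch of such a per-class weighted loss, assuming
# `weights` has one entry per output column as in the two-column targets
# above (an illustration, not necessarily the original implementation):
from tensorflow.keras import backend as K

def weighted_binary_crossentropy(weights):
    weights = K.constant(weights)

    def loss(y_true, y_pred):
        # per-column binary cross-entropy, scaled by the class weights
        bce = K.binary_crossentropy(y_true, y_pred)
        return K.mean(bce * weights, axis=-1)

    return loss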
def create_embeddings(node_type, num_samples, hinsage_layer_sizes, epochs,
                      patience, batch_size, dropout, activations):
    # Check that num_samples and hinsage_layer_sizes are compatible
    assert len(hinsage_layer_sizes) == len(num_samples)

    generator = HinSAGENodeGenerator(
        G, batch_size, num_samples=num_samples, head_node_type=node_type
    )

    # HinSAGE layers
    hinsage = HinSAGE(
        layer_sizes=hinsage_layer_sizes,
        activations=activations,
        generator=generator,
        bias=True,
        normalize="l2",
        dropout=dropout,
    )

    def run_deep_graph_infomax(base_model, generator, epochs, node_type):
        corrupted_generator = CorruptedGenerator(generator)
        gen = corrupted_generator.flow(G.nodes(node_type=node_type))
        infomax = DeepGraphInfomax(base_model, corrupted_generator)
        x_in, x_out = infomax.in_out_tensors()

        print("Starting Training")
        ttrain = time.time()

        # Train the Deep Graph Infomax model
        model = Model(inputs=x_in, outputs=x_out)
        model.compile(
            loss=tf.nn.sigmoid_cross_entropy_with_logits,
            optimizer=Adam(learning_rate=1e-3),
        )
        es = EarlyStopping(monitor="loss", min_delta=0, patience=patience)
        history = model.fit(gen, epochs=epochs, verbose=verbose, callbacks=[es])
        # sg.utils.plot_history(history)

        ttrain1 = time.time()
        print(
            f"Training complete in {(ttrain1-ttrain):.2f} s ({(ttrain1-ttrain)/60:.2f} min)"
        )

        x_emb_in, x_emb_out = base_model.in_out_tensors()
        # for full-batch models, squeeze out the batch dim (which is 1)
        if generator.num_batch_dims() == 2:
            x_emb_out = tf.squeeze(x_emb_out, axis=0)

        return x_emb_in, x_emb_out

    # Run Deep Graph Infomax
    x_emb_in, x_emb_out = run_deep_graph_infomax(
        hinsage, generator, epochs=epochs, node_type=node_type
    )
    emb_model = Model(inputs=x_emb_in, outputs=x_emb_out)
    all_embeddings = emb_model.predict(generator.flow(G.nodes(node_type=node_type)))

    # TSNE visualization of embeddings
    ttsne = time.time()
    print("Creating TSNE")
    embeddings_2d = pd.DataFrame(
        TSNE(n_components=2).fit_transform(all_embeddings),
        index=G.nodes(node_type=node_type),
    )

    # draw the points (colors based on ExtendedCaseGraphID)
    node_ids = G.nodes(node_type=node_type).tolist()
    ext_targets = v_sample.loc[[int(node_id) for node_id in node_ids]].ExtendedCaseGraphID
    label_map = {
        l: i * 10
        for i, l in enumerate(np.unique(ext_targets), start=10)
        if pd.notna(l)
    }
    node_colours = [
        label_map[target] if pd.notna(target) else 0 for target in ext_targets
    ]

    ttsne1 = time.time()
    print(f"TSNE completed in {(ttsne1-ttsne):.2f} s ({(ttsne1-ttsne)/60:.2f} min)")

    alpha = 0.7
    fig, ax = plt.subplots(figsize=(15, 15))
    ax.scatter(
        embeddings_2d[0],
        embeddings_2d[1],
        c=node_colours,
        cmap="jet",
        alpha=alpha,
    )
    ax.set(aspect="equal")
    plt.title(
        f'TSNE visualization of HinSAGE "{node_type}" embeddings with Deep Graph Infomax'
    )
    plt.savefig(f"./embeddings/HinSAGE_DGI_embeddings_{node_type}.pdf")

    return all_embeddings, embeddings_2d
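# Example invocation (hyperparameter values here are placeholders, and the
# function relies on the globals `G`, `v_sample`, and `verbose` being defined):
#
# all_embeddings, embeddings_2d = create_embeddings(
#     node_type="account",
#     num_samples=[8, 4],
#     hinsage_layer_sizes=[32, 32],
#     epochs=100,
#     patience=20,
#     batch_size=128,
#     dropout=0.4,
#     activations=["relu", "relu"],
# )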
def run_for_node_type(v_type, hinsage_layer_sizes, num_samples, activations, epochs):
    # Recover the split sizes for this node type: number of unlabelled (NaN),
    # training, and test nodes
    nan_tflag = data_splits[v_type].iloc[0].values[0]
    train_tflag = data_splits[v_type].iloc[1].values[0]
    test_tflag = data_splits[v_type].iloc[2].values[0]

    train_cv_set = v_sets[v_type][nan_tflag:nan_tflag + train_tflag]
    train_cv_ids = train_cv_set.index.values.tolist()
    train_cv_labels = v_data.loc[[
        int(node_id) for node_id in train_cv_ids
    ]].ExtendedCaseGraphID

    test_set = v_sets[v_type][-test_tflag:]
    test_ids = test_set.index.values.tolist()

    generator = HinSAGENodeGenerator(G, batch_size, num_samples, head_node_type=v_type)
    hinsage = HinSAGE(
        layer_sizes=hinsage_layer_sizes,
        activations=activations,
        generator=generator,
        bias=True,
        normalize="l2",
        dropout=dropout,
    )

    def run_deep_graph_infomax(base_model, generator, epochs):
        print(f"Starting training for {v_type} type: ")
        t0 = time.time()

        corrupted_generator = CorruptedGenerator(generator)
        gen = corrupted_generator.flow(G.nodes(node_type=v_type))
        infomax = DeepGraphInfomax(base_model, corrupted_generator)
        x_in, x_out = infomax.in_out_tensors()

        # Train with DGI
        model = Model(inputs=x_in, outputs=x_out)
        model.compile(
            loss=tf.nn.sigmoid_cross_entropy_with_logits,
            optimizer=Adam(learning_rate=1e-3),
        )
        es = EarlyStopping(monitor="loss", min_delta=0, patience=10)
        history = model.fit(gen, epochs=epochs, verbose=verbose, callbacks=[es])
        # sg.utils.plot_history(history)

        x_emb_in, x_emb_out = base_model.in_out_tensors()
        # for full-batch models, squeeze out the batch dim (which is 1)
        if generator.num_batch_dims() == 2:
            x_emb_out = tf.squeeze(x_emb_out, axis=0)

        t1 = time.time()
        print(f"Time required: {t1-t0:.2f} s ({(t1-t0)/60:.1f} min)")

        return x_emb_in, x_emb_out, model

    #? Train HinSAGE model:
    x_emb_in, x_emb_out, _model = run_deep_graph_infomax(hinsage, generator, epochs=epochs)
    emb_model = Model(inputs=x_emb_in, outputs=x_emb_out)
    train_cv_embs = emb_model.predict(generator.flow(train_cv_set.index.values))

    #? Optional: plot embeddings of the training and CV set of the current node
    #? type, then return early without classifying
    if visualize:
        train_cv_embs_2d = pd.DataFrame(
            TSNE(n_components=2).fit_transform(train_cv_embs),
            index=train_cv_set.index.values,
        )
        label_map = {
            l: i * 10
            for i, l in enumerate(np.unique(train_cv_labels), start=10)
            if pd.notna(l)
        }
        node_colours = [
            label_map[target] if pd.notna(target) else 0
            for target in train_cv_labels
        ]

        alpha = 0.7
        fig, ax = plt.subplots(figsize=(15, 15))
        ax.scatter(
            train_cv_embs_2d[0],
            train_cv_embs_2d[1],
            c=node_colours,
            cmap="jet",
            alpha=alpha,
        )
        ax.set(aspect="equal")
        plt.title(
            f"TSNE of HinSAGE {v_type} embeddings with DGI - coloring on ExtendedCaseGraphID"
        )
        plt.show()
        return 1

    #? Split training and cross-validation sets using a simple ordered 80/20 split
    n_embs = train_cv_embs.shape[0]
    train_size = int(n_embs * 0.80)
    cv_size = int(n_embs * 0.20)

    train_set = train_cv_embs[:train_size]
    train_labels = np.ravel(pd.DataFrame(train_cv_labels.values[:train_size]).fillna(0))
    cv_set = train_cv_embs[-cv_size:]
    cv_labels = np.ravel(pd.DataFrame(train_cv_labels.values[-cv_size:]).fillna(0))

    #? Classify
    print(f"Running Classifier for {v_type} type")
    classifier = DecisionTreeClassifier()
    classifier.fit(X=train_set, y=train_labels)
    cv_pred = classifier.predict(cv_set)

    f1_avg = f1_score(cv_labels, cv_pred, average="weighted")
    acc = (cv_pred == cv_labels).mean()
    print(f"{v_type} CV Metrics: f1: {f1_avg:.6f} - acc: {acc:.6f}")

    #? Now run on the test set
    test_embs = emb_model.predict(generator.flow(test_set.index.values))
    test_pred = classifier.predict(test_embs)
    #? Save predictions
    outdir = "./output"
    outname = f"{v_type}_predictions.csv"
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    fullname = os.path.join(outdir, outname)

    output = pd.DataFrame(test_ids)
    output = output.rename(columns={0: "node_id"})
    output["ExtendedCaseGraphID"] = test_pred
    output = output.set_index("node_id")
    output.to_csv(fullname)

    return output
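# Example driver (illustrative): run the pipeline once per node type. It
# assumes the globals used above (G, v_sets, v_data, data_splits, batch_size,
# dropout, verbose, visualize) are defined and that the node types are the
# keys of `v_sets`; the layer sizes and sample counts are placeholders.
#
# predictions = {
#     v_type: run_for_node_type(
#         v_type,
#         hinsage_layer_sizes=[32, 32],
#         num_samples=[8, 4],
#         activations=["relu", "relu"],
#         epochs=100,
#     )
#     for v_type in v_sets
# }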