def encode_types(graph, node_types, edge_types):
    """Attach a categorical encoding of the 'type' attribute to nodes and edges.

    The node data of ``graph`` is encoded against ``node_types`` first, then
    the edge data against ``edge_types``. In both cases the 'type' attribute is
    read and the result is stored under 'categorical_type' by
    ``encode_categorically``.

    Args:
        graph: The graph whose node and edge data should be encoded.
        node_types: The categories used to encode node types.
        edge_types: The categories used to encode edge types.

    Returns:
        The same ``graph`` object that was passed in.
    """
    encoding_passes = (
        (multidigraph_node_data_iterator, node_types),
        (multidigraph_edge_data_iterator, edge_types),
    )
    for make_data_iterator, categories in encoding_passes:
        encode_categorically(make_data_iterator(graph), categories, 'type', 'categorical_type')
    return graph
def __call__(self, graph):
    """Turn an attributed graph into one carrying only features and targets.

    Steps, in order: optional label obfuscation, attribute value encoding,
    relabelling of nodes to integers, optional duplication of edges in
    reverse, type encoding, and finally replacement of every node and edge
    data dict by its feature vector and target.

    Args:
        graph: The graph to transform.

    Returns:
        The transformed graph. Node data holds exactly the keys "x" and "y";
        edge data holds exactly the keys "edge_attr" and "y_edge".

    Raises:
        KeyError: If a node or edge data dict lacks ``self.target_name``.
    """
    if self.obfuscate:
        # Return value is ignored; presumably this mutates graph in place - TODO confirm
        obfuscate_labels(graph, self.obfuscate)

    # Encode attribute values as number
    graph = encode_values(graph, self.categorical, self.continuous)
    graph = nx.convert_node_labels_to_integers(graph, label_attribute=self.label_attribute)

    if self.duplicate:
        graph = duplicate_edges_in_reverse(graph)

    # Node or Edge Type as int
    # NOTE(review): encode_types is called as (graph, iterator_func, types) here;
    # confirm this matches the encode_types definition that is actually in scope.
    graph = encode_types(graph, multidigraph_node_data_iterator, self.node_types)
    graph = encode_types(graph, multidigraph_edge_data_iterator, self.edge_types)

    # Nodes are processed before edges; each entry names the keys the data dict ends up with
    output_layout = (
        (multidigraph_node_data_iterator, "x", "y"),
        (multidigraph_edge_data_iterator, "edge_attr", "y_edge"),
    )
    for iterate_data, feature_key, target_key in output_layout:
        for element_data in iterate_data(graph):
            # Both values must be extracted before the dict is emptied
            feature_vector = create_feature_vector(element_data)
            target_value = element_data[self.target_name]
            element_data.clear()
            element_data[feature_key] = feature_vector
            element_data[target_key] = target_value

    return graph
def encode_values(graph, categorical_attributes, continuous_attributes):
    """Write a numeric 'encoded_value' onto every node and edge of ``graph``.

    For each node, the node's 'type' selects the encoding:

    * type found in ``categorical_attributes``: the position of the node's
      'value' within that type's sequence of categories;
    * otherwise, type found in ``continuous_attributes``: the node's 'value'
      rescaled with that type's ``(min, max)`` pair as
      ``(value - min) / (max - min)``;
    * otherwise: 0.

    Every edge receives an 'encoded_value' of 0.

    Args:
        graph: The graph to annotate; its data dicts are modified in place.
        categorical_attributes: Mapping from type to its category values, or
            None to skip categorical encoding.
        continuous_attributes: Mapping from type to a ``(min, max)`` pair, or
            None to skip continuous encoding.

    Returns:
        The same ``graph`` object that was passed in.
    """
    use_categorical = categorical_attributes is not None
    use_continuous = continuous_attributes is not None

    for node_data in multidigraph_node_data_iterator(graph):
        node_type = node_data['type']

        if use_categorical and node_type in categorical_attributes:
            # Add the integer value of the category for each categorical attribute instance
            encoded = categorical_attributes[node_type].index(node_data['value'])
        elif use_continuous and node_type in continuous_attributes:
            lower, upper = continuous_attributes[node_type]
            encoded = (node_data['value'] - lower) / (upper - lower)
        else:
            encoded = 0

        node_data['encoded_value'] = encoded

    for edge_data in multidigraph_edge_data_iterator(graph):
        edge_data['encoded_value'] = 0

    return graph
def pipeline(graphs,
             tr_ge_split,
             node_types,
             edge_types,
             num_processing_steps_tr=10,
             num_processing_steps_ge=10,
             num_training_iterations=10000,
             continuous_attributes=None,
             categorical_attributes=None,
             type_embedding_dim=5,
             attr_embedding_dim=6,
             edge_output_size=3,
             node_output_size=3,
             output_dir=None):
    """Encode the graphs, train a KGCN on them and annotate the held-out graphs.

    Args:
        graphs: The graphs to learn from. They are iterated more than once and
            their node and edge data is modified in place (an 'encoded_value'
            is added to each).
        tr_ge_split: Split index: graphs before this position form the ``tr``
            set, the remaining graphs form the ``ge`` set.
        node_types: The node types; used for type encoding and to size the KGCN.
        edge_types: The edge types; used for type encoding and to size the KGCN.
        num_processing_steps_tr: Forwarded to ``KGCNLearner``.
        num_processing_steps_ge: Forwarded to ``KGCNLearner`` and to
            ``plot_predictions``.
        num_training_iterations: Forwarded to the learner call.
        continuous_attributes: Mapping from node type to a ``(min, max)`` pair
            used to rescale the node's 'value', or None.
        categorical_attributes: Mapping from node type to its category values;
            the node's 'value' is encoded as its position among them. May be
            None. Takes precedence over ``continuous_attributes`` for a type
            present in both.
        type_embedding_dim: Forwarded to ``KGCN``.
        attr_embedding_dim: Forwarded to ``configure_embedders`` and ``KGCN``.
        edge_output_size: Forwarded to ``KGCN``.
        node_output_size: Forwarded to ``KGCN``.
        output_dir: Passed to the learner as ``log_dir`` and used as a plain
            string prefix for the plot files 'learning.png' and 'graph.png', so
            a directory must include its trailing path separator. When None,
            no prefix is applied to the plot filenames.

    Returns:
        A tuple ``(ge_graphs, solveds_tr, solveds_ge)``: the ``ge`` graphs with
        'probabilities' and 'prediction' added to their data, and the last two
        entries of the training info returned by the learner.

    Raises:
        ValueError: If a categorical node's 'value' is not among its categories
            (assuming the categories are given as a list).
        ZeroDivisionError: If a continuous attribute has equal min and max.
    """

    ############################################################
    # Manipulate the graph data
    ############################################################

    # Encode attribute values
    for graph in graphs:
        for node_data in multidigraph_node_data_iterator(graph):
            typ = node_data['type']

            if categorical_attributes is not None and typ in categorical_attributes:
                # Add the integer value of the category for each categorical attribute instance
                category_values = categorical_attributes[typ]
                node_data['encoded_value'] = category_values.index(node_data['value'])

            elif continuous_attributes is not None and typ in continuous_attributes:
                min_val, max_val = continuous_attributes[typ]
                node_data['encoded_value'] = (node_data['value'] - min_val) / (max_val - min_val)

            else:
                node_data['encoded_value'] = 0

        for edge_data in multidigraph_edge_data_iterator(graph):
            edge_data['encoded_value'] = 0

    indexed_graphs = [
        nx.convert_node_labels_to_integers(graph, label_attribute='concept')
        for graph in graphs
    ]
    graphs = [duplicate_edges_in_reverse(graph) for graph in indexed_graphs]

    graphs = [encode_types(graph, node_types, edge_types) for graph in graphs]

    input_graphs = [create_input_graph(graph) for graph in graphs]
    target_graphs = [create_target_graph(graph) for graph in graphs]

    tr_input_graphs = input_graphs[:tr_ge_split]
    tr_target_graphs = target_graphs[:tr_ge_split]
    ge_input_graphs = input_graphs[tr_ge_split:]
    ge_target_graphs = target_graphs[tr_ge_split:]

    ############################################################
    # Build and run the KGCN
    ############################################################

    attr_embedders = configure_embedders(node_types, attr_embedding_dim,
                                         categorical_attributes, continuous_attributes)

    kgcn = KGCN(len(node_types),
                len(edge_types),
                type_embedding_dim,
                attr_embedding_dim,
                attr_embedders,
                edge_output_size=edge_output_size,
                node_output_size=node_output_size)

    learner = KGCNLearner(kgcn,
                          num_processing_steps_tr=num_processing_steps_tr,
                          num_processing_steps_ge=num_processing_steps_ge)

    train_values, test_values, tr_info = learner(tr_input_graphs,
                                                 tr_target_graphs,
                                                 ge_input_graphs,
                                                 ge_target_graphs,
                                                 num_training_iterations=num_training_iterations,
                                                 log_dir=output_dir)

    # Formatting None into the f-strings would name the files "Nonelearning.png" and
    # "Nonegraph.png", so an unset output_dir contributes an empty prefix instead.
    output_prefix = '' if output_dir is None else output_dir

    plot_across_training(*tr_info, output_file=f'{output_prefix}learning.png')
    plot_predictions(ge_input_graphs, test_values, num_processing_steps_ge,
                     output_file=f'{output_prefix}graph.png')

    # Only the outputs of the final entry in test_values["outputs"] are converted
    logit_graphs = graphs_tuple_to_networkxs(test_values["outputs"][-1])

    # The logits are applied to the indexed graphs as they were before edge
    # duplication and type encoding
    indexed_ge_graphs = indexed_graphs[tr_ge_split:]
    ge_graphs = [
        apply_logits_to_graphs(graph, logit_graph)
        for graph, logit_graph in zip(indexed_ge_graphs, logit_graphs)
    ]

    for ge_graph in ge_graphs:
        for data in multidigraph_data_iterator(ge_graph):
            data['probabilities'] = softmax(data['logits'])
            data['prediction'] = int(np.argmax(data['probabilities']))

    _, _, _, _, _, solveds_tr, solveds_ge = tr_info
    return ge_graphs, solveds_tr, solveds_ge
def pipeline(graphs,
             tr_ge_split,
             node_types,
             edge_types,
             num_processing_steps_tr=10,
             num_processing_steps_ge=10,
             num_training_iterations=10000,
             categorical_attributes=None,
             type_embedding_dim=5,
             attr_embedding_dim=6,
             edge_output_size=3,
             node_output_size=3):
    """Encode the graphs, train a KGCN on them and annotate the held-out graphs.

    Args:
        graphs: The graphs to learn from. They are iterated more than once and
            their node and edge data is modified in place (an 'encoded_value'
            is added to each).
        tr_ge_split: Split index: graphs before this position form the ``tr``
            set, the remaining graphs form the ``ge`` set.
        node_types: List of node types; used for type encoding, to size the
            KGCN, and to look up the index of each categorical attribute type.
        edge_types: The edge types; used for type encoding and to size the KGCN.
        num_processing_steps_tr: Forwarded to ``KGCNLearner``.
        num_processing_steps_ge: Forwarded to ``KGCNLearner`` and to
            ``plot_predictions``.
        num_training_iterations: Forwarded to the learner call.
        categorical_attributes: Mapping from node type to its list of category
            values; the node's 'value' is encoded as its position in that list.
            None is treated as "no categorical attributes".
        type_embedding_dim: Forwarded to ``KGCN``.
        attr_embedding_dim: Output dimension given to every attribute embedder,
            and forwarded to ``KGCN``.
        edge_output_size: Forwarded to ``KGCN``.
        node_output_size: Forwarded to ``KGCN``.

    Returns:
        A tuple ``(ge_graphs, solveds_tr, solveds_ge)``: the ``ge`` graphs with
        'probabilities' and 'prediction' added to their data, and the last two
        entries of the training info returned by the learner.

    Raises:
        ValueError: If a categorical node's 'value' is not among its categories,
            or a categorical attribute type is not present in ``node_types``.
    """
    # The default of None must behave like an empty mapping rather than fail on .items()
    if categorical_attributes is None:
        categorical_attributes = {}

    ############################################################
    # Manipulate the graph data
    ############################################################

    # Encode attribute values
    for graph in graphs:

        for data in multidigraph_data_iterator(graph):
            data['encoded_value'] = 0

        for node_data in multidigraph_node_data_iterator(graph):
            typ = node_data['type']

            # Add the integer value of the category for each categorical attribute instance
            if typ in categorical_attributes:
                node_data['encoded_value'] = categorical_attributes[typ].index(node_data['value'])

    indexed_graphs = [
        nx.convert_node_labels_to_integers(graph, label_attribute='concept')
        for graph in graphs
    ]
    graphs = [duplicate_edges_in_reverse(graph) for graph in indexed_graphs]

    graphs = [encode_types(graph, node_types, edge_types) for graph in graphs]

    input_graphs = [create_input_graph(graph) for graph in graphs]
    target_graphs = [create_target_graph(graph) for graph in graphs]

    tr_input_graphs = input_graphs[:tr_ge_split]
    tr_target_graphs = target_graphs[:tr_ge_split]
    ge_input_graphs = input_graphs[tr_ge_split:]
    ge_target_graphs = target_graphs[tr_ge_split:]

    ############################################################
    # Build and run the KGCN
    ############################################################

    # Indices of all node types; categorical attribute types are removed below
    non_attribute_nodes = list(range(len(node_types)))

    attr_embedders = dict()

    def categorical_embedder_factory(num_categories, attr_typ):
        # The factories are only stored here and invoked later, after the loop below
        # has finished. Binding the per-attribute values as arguments of this outer
        # function gives each factory its own values instead of the loop's last ones.
        def make_embedder():
            return CategoricalAttribute(num_categories, attr_embedding_dim,
                                        name=attr_typ + '_cat_embedder')
        return make_embedder

    # Construct categorical attribute embedders
    for attr_typ, category_values in categorical_attributes.items():
        attr_typ_index = node_types.index(attr_typ)

        # Record the embedder, and the index of the type that it should encode
        make_embedder = categorical_embedder_factory(len(category_values), attr_typ)
        attr_embedders[make_embedder] = [attr_typ_index]

        # Remove by value: once an entry has been removed, list positions no longer
        # coincide with the type indices stored in the list.
        non_attribute_nodes.remove(attr_typ_index)

    # All entities and relations (non-attributes) also need an embedder with matching output dimension, which does
    # nothing. This is provided as a list of their indices
    def make_blank_embedder():
        return BlankAttribute(attr_embedding_dim)

    attr_embedders[make_blank_embedder] = non_attribute_nodes

    kgcn = KGCN(len(node_types),
                len(edge_types),
                type_embedding_dim,
                attr_embedding_dim,
                attr_embedders,
                edge_output_size=edge_output_size,
                node_output_size=node_output_size)

    learner = KGCNLearner(kgcn,
                          num_processing_steps_tr=num_processing_steps_tr,
                          num_processing_steps_ge=num_processing_steps_ge)

    train_values, test_values, tr_info = learner(tr_input_graphs,
                                                 tr_target_graphs,
                                                 ge_input_graphs,
                                                 ge_target_graphs,
                                                 num_training_iterations=num_training_iterations)

    plot_across_training(*tr_info)
    plot_predictions(ge_input_graphs, test_values, num_processing_steps_ge)

    # Only the outputs of the final entry in test_values["outputs"] are converted
    logit_graphs = graphs_tuple_to_networkxs(test_values["outputs"][-1])

    # The logits are applied to the indexed graphs as they were before edge
    # duplication and type encoding
    indexed_ge_graphs = indexed_graphs[tr_ge_split:]
    ge_graphs = [
        apply_logits_to_graphs(graph, logit_graph)
        for graph, logit_graph in zip(indexed_ge_graphs, logit_graphs)
    ]

    for ge_graph in ge_graphs:
        for data in multidigraph_data_iterator(ge_graph):
            data['probabilities'] = softmax(data['logits'])
            data['prediction'] = int(np.argmax(data['probabilities']))

    _, _, _, _, _, solveds_tr, solveds_ge = tr_info
    return ge_graphs, solveds_tr, solveds_ge