def get_tensorflow_object_last(self, dst_units):
    """
    Parameters
    ----------
    dst_units:    int
        Number of units that the recurrent cell will have
    """
    try:
        c_ = getattr(tf.keras.layers, self.type)
    except Exception:
        print_failure("The layer of layer_type '" + self.type + "' is not a valid tf.keras layer. "
                      "Please check the documentation to write the correct way to define this layer.")

    self.parameters['units'] = dst_units  # can we assume that it will always be units??
    try:
        layer = c_(**self.parameters)
    except Exception:
        parameters_string = ''
        for k, v in self.parameters.items():
            parameters_string += k + ': ' + str(v) + '\n'
        print_failure("One of the parameters passed to the layer of layer_type '" + self.type + "' is incorrect.\n"
                      "You have defined the following parameters:\n" + parameters_string)
    return layer
def calculate(self, product_input1, product_input2):
    """
    Parameters
    ----------
    product_input1:    tensor
        Input 1
    product_input2:    tensor
        Input 2
    """
    try:
        if self.type_product == 'dot_product':
            result = tf.tensordot(product_input1, product_input2, axes=[[1], [1]])

            # the correct values are in the diagonal (IMPROVE THIS)
            # This does the dot product row by row (so independently for each adjacency)
            result = tf.linalg.tensor_diag_part(result)
            result = tf.expand_dims(result, axis=-1)

        elif self.type_product == 'element_wise':
            result = tf.math.multiply(product_input1, product_input2)

        elif self.type_product == 'mat_mult':
            result = tf.tensordot(product_input1, product_input2, axes=[[2], [1]])
            result = tf.squeeze(result, axis=2)

        result = tf.cast(result, tf.float32)
        return result

    except Exception:
        print_failure('The product operation between ' + str(product_input1) + ' and ' + str(product_input2) +
                      ' failed. Check that the dimensions are compatible.')
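# Illustrative sketch (not part of IGNNITION): the 'dot_product' branch above computes a
# row-by-row dot product. tensordot over the feature axis yields an [n, n] matrix whose
# diagonal holds the per-row products, hence the tensor_diag_part + expand_dims, giving a
# final shape of [n, 1]. All names below are made up for the illustration.
def _sketch_row_wise_dot_product():
    import tensorflow as tf
    a = tf.constant([[1.0, 2.0], [3.0, 4.0]])
    b = tf.constant([[5.0, 6.0], [7.0, 8.0]])
    full = tf.tensordot(a, b, axes=[[1], [1]])                         # shape [2, 2]
    row_wise = tf.expand_dims(tf.linalg.tensor_diag_part(full), axis=-1)
    return row_wise                                                    # [[17.], [53.]]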
def get_tensorflow_object(self, dst_dim):
    """
    Parameters
    ----------
    dst_dim:    int
        Dimension of the destination nodes. Thus, number of units of the RNN model
    """
    self.parameters['units'] = dst_dim
    try:
        c_ = getattr(tf.keras.layers, self.type + 'Cell')
    except Exception:
        print_failure("Error when trying to define a RNN of layer_type '" + self.type + "' since this layer_type "
                      "does not exist. Check the valid RNN cells that Keras allows to define.")

    try:
        layer = c_(**self.parameters)
    except Exception:
        print_failure("Error when creating the RNN of layer_type '" + self.type + "' since invalid parameters were "
                      "passed. Check the documentation to see which parameters are acceptable, or check the spelling "
                      "of the parameters' names.")
    return layer
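# Illustrative sketch (not part of IGNNITION): the update function above resolves, e.g.,
# type 'GRU' to tf.keras.layers.GRUCell and sets its number of units to the destination
# state dimension. The concrete type and dimension below are assumptions.
def _sketch_build_rnn_cell():
    import tensorflow as tf
    rnn_type, dst_dim = 'GRU', 32
    cell_cls = getattr(tf.keras.layers, rnn_type + 'Cell')   # resolves to tf.keras.layers.GRUCell
    return cell_cls(units=dst_dim)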
def find_total_input_dim(self, dimensions, calculations):
    """
    Parameters
    ----------
    dimensions:    dict
        Dictionary with the dimensions of each tensor (indexed by name)
    calculations:    dict
        Dictionary with the current calculations throughout the execution of the GNN model
    """
    if self.input is not None:
        input_nn = self.input
        input_dim = 0
        dimension = None
        for i in input_nn:
            if '_initial_state' in i:
                i = i.split('_initial_state')[0]
            if i in dimensions:
                dimension = dimensions[i]
            elif i + '_out_dim' in calculations:
                dimension = calculations[i + '_out_dim']  # take the dimension from here or from self.dimensions
            else:
                print_failure("Keyword " + i + " used in the model definition was not recognized")
            input_dim += dimension
        return input_dim
def stream_read_json(self, f):
    """
    Parameters
    ----------
    f:    Input data
    """
    # check that it is a valid array of objects
    pos1 = f.read(1)
    if pos1 != '[':
        print_failure("Error because the dataset files must be an array of json objects, and not single json objects")

    start_pos = 1
    while True:
        try:
            obj = json.load(f)
            yield obj
            return
        except json.JSONDecodeError as e:
            f.seek(start_pos)
            json_str = f.read(e.pos)
            obj = json.loads(json_str)
            start_pos += e.pos + 1
            a = f.read(1)  # this 1 is the comma or the final symbol
            if a == self.end_symbol or a == ']':
                yield obj
                return
            yield obj
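# Illustrative sketch (not part of IGNNITION): the same incremental-parsing idea as
# stream_read_json above, shown on an in-memory file. Each json.JSONDecodeError reports
# where the first complete object ends (e.pos), so objects can be parsed one at a time
# instead of loading the whole array into memory. All names here are made up.
def _sketch_stream_json_array():
    import io
    import json

    def stream_array(f):
        if f.read(1) != '[':
            raise ValueError('expected a JSON array')
        start_pos = 1
        while True:
            try:
                yield json.load(f)          # succeeds only if the remainder is a single object
                return
            except json.JSONDecodeError as e:
                f.seek(start_pos)
                yield json.loads(f.read(e.pos))
                start_pos += e.pos + 1
                if f.read(1) == ']':        # the next character is either a comma or the closing bracket
                    return

    return list(stream_array(io.StringIO('[{"a": 1}, {"a": 2}]')))   # [{'a': 1}, {'a': 2}]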
def obtain_total_input_dim_message(self, dimensions, calculations, dst_name, src):
    """
    Parameters
    ----------
    dimensions:    dict
        Dictionary with the dimensions of each tensor (indexed by name)
    calculations:    dict
        Dictionary with the current calculations throughout the execution of the GNN model
    dst_name:    str
        Name of the destination entity
    src:    Source_mp object
        Object that includes the information about the source entity of the mp
    """
    # Find out the dimension of the model
    input_nn = self.input
    input_dim = 0
    for i in input_nn:
        if i == 'source':
            input_dim += int(dimensions.get(src.name))
        elif i == 'destination':
            input_dim += int(dimensions.get(dst_name))
        elif i in dimensions:
            input_dim += int(dimensions[i])
        elif i + '_dim' in calculations:
            input_dim += int(calculations[i + '_dim'])
        else:
            print_failure("Keyword " + i + " used in the message passing was not recognized.")
    return input_dim
def calculate(self, src_states, adj_src, dst_states, adj_dst):
    """
    Parameters
    ----------
    src_states:    tensor
        Input 1
    adj_src:    tensor
        Adj src -> dest
    dst_states:    tensor
        Input 2
    adj_dst:    tensor
        Adj dst -> src
    """
    # obtain the extended input (by extending it to the number of adjacencies between them)
    try:
        extended_src = tf.gather(src_states, adj_src)
    except Exception:
        print_failure('Extending the adjacency list ' + str(self.adj_list) + ' was not possible. Check that the '
                      'indexes of the source of the adjacency list match the input given.')

    try:
        extended_dst = tf.gather(dst_states, adj_dst)
    except Exception:
        print_failure('Extending the adjacency list ' + str(self.adj_list) + ' was not possible. Check that the '
                      'indexes of the destination of the adjacency list match the input given.')

    return extended_src, extended_dst
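# Illustrative sketch (not part of IGNNITION): how tf.gather "extends" node states to
# edges, as done above for both endpoints of each edge. Given one state per node and an
# adjacency list of node indices, the result has one row per edge, repeating states as
# needed. The toy values are assumptions.
def _sketch_extend_states_to_edges():
    import tensorflow as tf
    node_states = tf.constant([[1.0, 1.0],     # node 0
                               [2.0, 2.0],     # node 1
                               [3.0, 3.0]])    # node 2
    adj_src = tf.constant([0, 0, 2])           # source node of each edge
    return tf.gather(node_states, adj_src)     # [[1, 1], [1, 1], [3, 3]]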
def __prepocess_parameters(self):
    for k, v in self.parameters.items():
        if v == 'None':
            self.parameters[k] = None

        elif v == 'True':
            self.parameters[k] = True

        elif v == 'False':
            self.parameters[k] = False

        elif 'regularizer' in k:
            try:
                self.parameters[k] = tf.keras.regularizers.l2(float(self.parameters.get(k)))
            except Exception:
                print_failure("The " + k + " parameter '" + str(self.parameters.get(k)) + "' in layer of layer_type " +
                              self.type + " is invalid. Please make sure it is a numerical value.")

        elif 'activation' in k:  # already ensures that it was not None
            try:
                self.parameters['activation'] = getattr(tf.nn, v)
            except Exception:
                print_failure("The activation '" + v + "' is not a valid function from the tf.nn library. Please "
                              "check the documentation and the spelling of the function.")
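# Illustrative sketch (not part of IGNNITION): how string parameters coming from a YAML
# definition can be turned into the objects Keras expects, in the same spirit as
# __prepocess_parameters above. The parameter values below are assumptions.
def _sketch_parameter_preprocessing():
    import tensorflow as tf
    params = {'units': 16, 'activation': 'relu', 'kernel_regularizer': '0.01'}
    params['activation'] = getattr(tf.nn, params['activation'])                            # tf.nn.relu
    params['kernel_regularizer'] = tf.keras.regularizers.l2(float(params['kernel_regularizer']))
    return tf.keras.layers.Dense(**params)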
def __init__(self, op):
    """
    Parameters
    ----------
    op:    dict
        Dictionary with the data defining this general operation
    """
    self.type = op.get('type')
    self.output_name = op.get('output_name', None)
    self.output_label = op.get('output_label', None)

    if self.output_label is not None:
        # There may be more than one output_label
        self.output_label = [output.split('$')[-1] for output in self.output_label]  # delete the $ from the labels

    # parse the input of the operation
    self.input = []
    self.source_dataset = False
    self.destination_dataset = False
    if 'input' in op:
        for input_item in op.get('input'):
            if '$source' == input_item or '$destination' == input_item:
                print_failure('The keywords source and destination are reserved keywords. Thus, they cannot name '
                              'features from the dataset. Check that you really meant to use $, indicating that it '
                              'is a feature from the dataset.')
            else:
                self.input.append(input_item.split('$')[-1])  # delete the $ from the inputs (if any)
def create_aggregations(self, attrs):
    """
    Parameters
    ----------
    attrs:    dict
        Dictionary with the required attributes for the aggregation (defining the set of operations)
    """
    aggregations = []
    single_embedding = None
    multiple_embedding = None
    for attr in attrs:
        attr_type = attr.get('type')
        if attr_type == 'interleave':
            aggregations.append(InterleaveAggr(attr))
            multiple_embedding = True

        elif attr_type == 'neural_network':
            aggregations.append(FeedForwardOperation(attr, model_role='aggregation'))
            single_embedding = True

        elif attr_type == 'concat':
            aggregations.append(ConcatAggr(attr))
            multiple_embedding = True

        elif attr_type == 'sum':
            aggregations.append(SumAggr(attr))
            single_embedding = True

        elif attr_type == 'mean':
            aggregations.append(MeanAggr(attr))
            single_embedding = True

        elif attr_type == 'min':
            aggregations.append(MinAggr(attr))
            single_embedding = True

        elif attr_type == 'max':
            aggregations.append(MaxAggr(attr))
            single_embedding = True

        elif attr_type == 'std':
            aggregations.append(StdAggr(attr))
            single_embedding = True

        elif attr_type == 'attention':
            aggregations.append(AttentionAggr(attr))
            single_embedding = True

        elif attr_type == 'edge_attention':
            aggregations.append(EdgeAttentionAggr(attr))
            single_embedding = True

        elif attr_type == 'convolution':
            aggregations.append(ConvAggr(attr))
            single_embedding = True

        else:  # this is for the ordered aggregation
            multiple_embedding = True

    if single_embedding and multiple_embedding:
        print_failure("You cannot combine aggregations which return a sequence of tensors "
                      "with aggregations that return a single embedding.")
    elif single_embedding:
        return aggregations, 0
    else:
        return aggregations, 1
def generate_from_array(self, data_samples, entity_names, feature_names, output_names, adj_names, interleave_names,
                        additional_input, training, shuffle=False):
    """
    Parameters
    ----------
    data_samples:    [array]
        Array of samples to be processed
    entity_names:    [array]
        Name of the entities to be found in the dataset
    feature_names:    [array]
        Name of the features to be found in the dataset
    output_names:    [str]
        Names of the output data to be found in the dataset
    adj_names:    [array]
        Names of the adjacencies to be found in the dataset
    interleave_names:    [array]
        First parameter is the name of the interleave, and the second the destination entity
    additional_input:    [array]
        Name of other vectors that need to be retrieved because they appear in other parts of the model definition
    training:    bool
        Indicates if we are training, and thus a label is required.
    shuffle:    bool
        Shuffle parameter of the dataset
    """
    data_samples = [json.loads(x) for x in data_samples]
    self.entity_names = [x for x in entity_names]
    self.feature_names = [x for x in feature_names]
    self.output_names = output_names
    self.adj_names = adj_names
    self.interleave_names = [[i[0], i[1]] for i in interleave_names]
    self.additional_input = [x for x in additional_input]
    self.training = training

    for sample in data_samples:
        try:
            processed_sample = self.__process_sample(sample)
            yield processed_sample

        except StopIteration:
            pass

        except KeyboardInterrupt:
            sys.exit()

        except Exception as inf:
            print_failure("\n There was an unexpected error: \n" + str(inf) +
                          "\n Please make sure that all the names used in the sample passed are correct.")
def calculate(self, inputs):
    """
    Parameters
    ----------
    inputs:    tensor
    """
    try:
        result = tf.concat(inputs, axis=self.axis)
        return result
    except Exception:
        print_failure('The concat operation failed. Check that the dimensions are compatible.')
        sys.exit(1)
def __read_yaml(self, path, file_name=''):
    """
    Parameters
    ----------
    path:    str
        Path of the yaml file with the model description
    file_name:    str
        Name of the file we aim to read
    """
    if os.path.isfile(path):
        with open(path, 'r') as stream:
            try:
                return yaml.safe_load(stream)
            except yaml.YAMLError as exc:
                print_failure("There was the following error in the " + file_name + " file.\n" + str(exc))
    else:
        print_failure("The " + file_name + " file was not found in: " + path)
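# Illustrative sketch (not part of IGNNITION): yaml.safe_load turns a YAML document into
# plain Python dicts and lists, which is the form in which the model description is then
# consumed. The minimal document below is made up for the illustration.
def _sketch_read_yaml():
    import yaml
    text = "entities:\n  - name: node\n    state_dimension: 16\n"
    description = yaml.safe_load(text)
    return description['entities'][0]['name']   # 'node'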
def get_tensorflow_object(self):
    try:
        c_ = getattr(tf.keras.layers, self.type)
    except Exception:
        print_failure("The layer of layer_type '" + self.type + "' is not a valid tf.keras layer. "
                      "Please check the documentation to write the correct way to define this layer.")

    try:
        layer = c_(**self.parameters)
    except Exception:
        parameters_string = ''
        for k, v in self.parameters.items():
            parameters_string += k + ': ' + str(v) + '\n'
        print_failure("One of the parameters passed to the layer of layer_type '" + self.type + "' is incorrect.\n"
                      "You have defined the following parameters:\n" + parameters_string)
    return layer
def construct_tf_model(self, input_dim, dst_dim=None, is_readout=False, dst_name=None):
    """
    Parameters
    ----------
    input_dim:    int
        Dimension of the input of the model
    dst_dim:    int
        Dimension of the destination hs if any
    is_readout:    bool
        Is a model used for the readout?
    dst_name:    str
        Name of the destination entity
    """
    model = tf.keras.models.Sequential()
    model.add(tf.keras.Input(shape=input_dim))

    layer_counter = 1
    n = len(self.layers)
    for j in range(n):
        current_layer = self.layers[j]
        try:
            # if it's the last layer and we have defined an output dimension
            if j == (n - 1) and dst_dim is not None:
                layer_model = current_layer.get_tensorflow_object_last(dst_dim)
            else:
                layer_model = current_layer.get_tensorflow_object()
            model.add(layer_model)

        except Exception:
            if dst_dim is None:
                if is_readout:
                    print_failure('The layer ' + str(layer_counter) + ' of the readout is not correctly defined. '
                                  'Check the Keras documentation to make sure all the parameters are correct.')
                else:
                    print_failure('The layer ' + str(layer_counter) + ' of the message creation neural network in '
                                  'the message passing to ' + str(dst_name) + ' is not correctly defined. Check the '
                                  'Keras documentation to make sure all the parameters are correct.')
            else:
                print_failure('The layer ' + str(layer_counter) + ' of the update neural network in the message '
                              'passing to ' + str(dst_name) + ' is not correctly defined. Check the Keras '
                              'documentation to make sure all the parameters are correct.')
        layer_counter += 1

    output_shape = model.output_shape[-1]
    return [model, output_shape]
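# Illustrative sketch (not part of IGNNITION): building a Sequential model from layer
# descriptions given as (layer_type, parameters) pairs, which is the same getattr-based
# mechanism used by get_tensorflow_object* and construct_tf_model above. The layer specs
# and dimensions here are assumptions.
def _sketch_build_sequential():
    import tensorflow as tf
    layer_specs = [('Dense', {'units': 32, 'activation': 'relu'}),
                   ('Dropout', {'rate': 0.5}),
                   ('Dense', {'units': 8})]
    model = tf.keras.models.Sequential()
    model.add(tf.keras.Input(shape=(16,)))
    for layer_type, params in layer_specs:
        layer_cls = getattr(tf.keras.layers, layer_type)   # e.g. tf.keras.layers.Dense
        model.add(layer_cls(**params))
    return model, model.output_shape[-1]                   # output dimension: 8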
def __process_sample(self, sample, file=None):
    """
    Parameters
    ----------
    sample:    dict
        Input sample which is a serialized version (in JSON) of a networkx graph.
    file:    str
        Path to the file (which is useful for error-checking purposes)
    """
    # load the model
    G = json_graph.node_link_graph(sample)

    # Only directed graphs are supported. Error-checking message if the graph is undirected
    if not G.is_directed():
        print_failure("IGNNITION received as input an undirected graph, even though it only supports "
                      "(at the moment) directed graphs. Please consider reformatting your code accordingly. "
                      "You can double the edges between two nodes (e.g., edge 1-2 can be transformed into 1->2 "
                      "and 2->1) to simulate the same behaviour.")

    if G.is_multigraph():
        print_failure("IGNNITION received as input a multigraph, while these are not yet supported. This means "
                      "that for every pair of nodes, only one edge with the same source and destination can exist "
                      "(e.g., you cannot have two edges 1->2 and 1->2. Notice that 1->2 and 2->1 does not incur "
                      "in this problem).")

    entity_counter = {}
    mapping = {}
    data = {}

    for name in self.entity_names:
        entity_counter[name] = 0

    list_nodes = list(G.nodes())
    for node_name in list_nodes:
        attributes = G.nodes[node_name]

        if 'entity' not in attributes:
            print_failure("Error in the dataset file located in '" + file + "'. The node named '" + str(node_name) +
                          "' was not assigned an entity.")

        entity_name = attributes['entity']
        new_node_name = entity_name + '_{}'
        num_node = entity_counter[entity_name]
        entity_counter[entity_name] += 1

        mapping[node_name] = new_node_name.format(num_node)

    # save the number of nodes of each entity
    for name in self.entity_names:
        data['num_' + name] = entity_counter[name]

    # rename the nodes using a mapping that also indicates their entity type
    D_G = nx.relabel_nodes(G, mapping)

    # discard if the graph is empty
    if not D_G.edges():
        print_info("\nA sample was discarded because the graph is empty (has no edges).")
        raise StopIteration

    # load the features (all the features are set to be lists. So we always return a list of lists)
    for f in self.feature_names:
        try:
            features_dict = nx.get_node_attributes(D_G, f)
            feature_vals = np.array(list(features_dict.values()))
            # indicates the (unique) names of the entities that have that feature
            entity_names = set([name.split('_')[0] for name in features_dict.keys()])

            if len(entity_names) > 1:
                entities_string = functools.reduce(lambda x, y: str(x) + ',' + str(y), entity_names)
                print_failure("The feature " + f + " was defined in several entities (" + entities_string + "). "
                              "The feature names should be unique for each type of node.")

            # it should always be a 2d array
            if len(np.shape(feature_vals)) == 1:
                feature_vals = np.expand_dims(feature_vals, axis=-1)

            if feature_vals.size == 0:
                message = "The feature " + f + " was used in the model_description.yaml file " \
                                               "but was not defined in the dataset."
                if file is not None:
                    message = "Error in the dataset file located in '" + file + "'.\n" + message
                raise Exception(message)
            else:
                data[f] = feature_vals

        except Exception:
            message = "The feature " + f + " was used in the model_description.yaml file " \
                                           "but was not defined in the dataset."
            if file is not None:
                message = "Error in the dataset file located in '" + file + "'.\n" + message
            raise Exception(message)

    # take other inputs if needed (check that they might be global features)
    for a in self.additional_input:
        # 1) try to see if this name has been defined as a node attribute
        node_dict = nx.get_node_attributes(D_G, a)
        node_attr = np.array(list(node_dict.values()))
        # indicates the (unique) names of the entities that have that feature
        entity_names = set([name.split('_')[0] for name in node_dict.keys()])

        if len(entity_names) > 1:
            entities_string = functools.reduce(lambda x, y: str(x) + ',' + str(y), entity_names)
            print_failure("The feature " + a + " was defined in several entities (" + entities_string + "). "
                          "The feature names should be unique for each type of node.")

        # it should always be a 2d array
        if len(np.shape(node_attr)) == 1:
            node_attr = np.expand_dims(node_attr, axis=-1)

        # 2) try to see if this name has been defined as an edge feature
        edge_dict = nx.get_edge_attributes(D_G, a)
        edge_attr = np.array(list(edge_dict.values()))
        # indicates the (unique) pairs of entities that have that feature
        entity_names = set([(pair[0].split('_')[0], pair[1].split('_')[0]) for pair in edge_dict.keys()])

        # obtain the entities, with a small token indicating if it is source or destination
        # Problem: When we transform an undirected graph to directed, we double all the edges. Hence,
        # we still need to differentiate between source and destination entities?? Solution: Allow only directed??
        # For now, check that the name is unique for every src-dst. Problem: One node connected to another but
        # the reverse to other nodes??
        if len(entity_names) > 2:
            entities_string = functools.reduce(lambda x, y: str(x) + ',' + str(y), entity_names)
            print_failure("The edge feature " + a + " was defined connecting two different source-destination "
                          "entities (" + entities_string + "). Make sure that an edge feature is unique for a given "
                          "pair of entities (types of nodes).")

        # it should always be a 2d array
        if len(np.shape(edge_attr)) == 1:
            edge_attr = np.expand_dims(edge_attr, axis=-1)

        # 3) try to see if this name has been defined as a graph feature
        graph_attr = [D_G.graph[a]] if a in D_G.graph else []

        # Check that this name has not been defined as more than one of node, edge or graph feature
        if node_attr.size != 0 and edge_attr.size != 0 and len(graph_attr) != 0:
            print_failure("The feature " + a + " was defined as a node feature, an edge feature and a graph feature. "
                          "Please use unique names in this case.")
        elif node_attr.size != 0 and edge_attr.size != 0:
            print_failure("The feature " + a + " was defined both as a node feature and as an edge feature. "
                          "Please use unique names in this case.")
        elif node_attr.size != 0 and len(graph_attr) != 0:
            print_failure("The feature " + a + " was defined both as a node feature and as a graph feature. "
                          "Please use unique names in this case.")

        # Return the correct value
        if node_attr.size != 0:
            data[a] = node_attr
        elif edge_attr.size != 0:
            data[a] = edge_attr
        elif a in D_G.graph:
            data[a] = graph_attr
        else:
            message = 'The data named "' + a + '" was used in the model_description.yaml file ' \
                                               'but was not defined in the dataset.'
            if file is not None:
                message = "Error in the dataset file located in '" + file + "'.\n" + message
            raise Exception(message)

    if self.training:
        # collect the output (if there is more than one, concatenate them on axis=1)
        # limitation: all the outputs must be of the same type (same number of elements)
        final_output = []
        for output in self.output_names:
            try:
                aux = list(nx.get_node_attributes(D_G, output).values())
                if not aux:
                    # When having global/graph-level output
                    aux = D_G.graph[output]
                    aux = aux if isinstance(aux, list) else [aux]
            except Exception:
                print_failure(f"Error when trying to get output with name: {output}. "
                              "Check the data which corresponds to the output_label in the readout block.")

            # if it is a 1d array, transform it into a 2d array
            if len(np.array(aux).shape) == 1:
                aux = np.expand_dims(aux, -1)

            # try to concatenate them together. If error, it means that the two labels are incompatible
            final_output.extend(aux)
            data['__ignnition_{}_len'.format(output)] = len(aux)

    # find the adjacencies
    edges_list = list(D_G.edges())
    processed_neighbours = {}

    # create the adjacency lists that we are required to pass
    for adj_name_item in self.adj_names:
        src_entity = adj_name_item.split('_to_')[0]
        dst_entity = adj_name_item.split('_to_')[1]
        data['src_' + src_entity + '_to_' + dst_entity] = []
        data['dst_' + src_entity + '_to_' + dst_entity] = []
        data['seq_' + src_entity + '_to_' + dst_entity] = []

    for e in edges_list:
        src_node, dst_node = e
        src_num = int(src_node.split('_')[-1])
        dst_num = int(dst_node.split('_')[-1])
        src_entity = D_G.nodes[src_node]['entity']
        dst_entity = D_G.nodes[dst_node]['entity']

        if dst_node not in processed_neighbours:
            processed_neighbours[dst_node] = 0

        if src_entity + '_to_' + dst_entity in self.adj_names:
            data['src_' + src_entity + '_to_' + dst_entity].append(src_num)
            data['dst_' + src_entity + '_to_' + dst_entity].append(dst_num)
            data['seq_' + src_entity + '_to_' + dst_entity].append(processed_neighbours[dst_node])

            processed_neighbours[dst_node] += 1  # this is useful to check which sequence number to use

    # check that the dataset contains all the adjacencies needed
    if not self.warnings_shown:
        for adj_name_item in self.adj_names:
            if data['src_' + adj_name_item] == []:
                src_entity = adj_name_item.split('_to_')[0]
                dst_entity = adj_name_item.split('_to_')[1]
                print_info("WARNING: The GNN definition uses edges between " + src_entity + " and " + dst_entity +
                           " but these were not found in the input graph. The MP defined between these two entities "
                           "will be ignored.\nIn case the graph ought to contain such edges, one reason for this "
                           "error is a mistake in defining the graph as directional, when the edges have been "
                           "defined as undirected. Please check the documentation.")
                self.warnings_shown = True

    # this collects the sequence for the interleave aggregation (if any)
    for i in self.interleave_names:
        name, dst_entity = i
        interleave_definition = list(D_G.graph[name].values())  # this must be a graph variable

        involved_entities = {}
        total_sequence = []
        total_size, n_total, counter = 0, 0, 0

        for src_entity in interleave_definition:
            total_size += 1
            if src_entity not in involved_entities:
                involved_entities[src_entity] = counter  # each entity a different value (identifier)

                seq = data['seq_' + src_entity + '_to_' + dst_entity]
                n_total += max(seq) + 1  # superior limit of the size of any destination

                counter += 1

            # obtain all the original definition in a numeric format
            total_sequence.append(involved_entities[src_entity])

        # we exceed the maximum length so that it is a multiple of the pattern; then we cut it
        repetitions = math.ceil(float(n_total) / total_size)
        result = np.array((total_sequence * repetitions)[:n_total])

        for entity in involved_entities:
            entity_id = involved_entities[entity]
            data['indices_' + entity + '_to_' + dst_entity] = np.where(result == entity_id)[0].tolist()

    if self.training:
        return data, final_output
    else:
        return data
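# Illustrative sketch (not part of IGNNITION): how the interleave pattern at the end of
# __process_sample is expanded. The pattern is repeated until it covers n_total positions,
# and np.where then recovers, for each source entity, the positions it occupies in the
# interleaved sequence. The pattern and n_total below are made up.
def _sketch_interleave_indices():
    import math
    import numpy as np
    pattern = [0, 1, 0]            # e.g. numeric ids for a ['path', 'link', 'path'] interleave
    n_total = 7                    # total number of aggregated messages
    repetitions = math.ceil(float(n_total) / len(pattern))
    result = np.array((pattern * repetitions)[:n_total])   # [0 1 0 0 1 0 0]
    return {entity_id: np.where(result == entity_id)[0].tolist() for entity_id in set(pattern)}
    # {0: [0, 2, 3, 5, 6], 1: [1, 4]}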
def __validate_model_description(self, data):
    """
    Parameters
    ----------
    data:    dict
        Dictionary with the initial data
    """
    entities = data['entities']
    stages = data['message_passing']['stages']

    src_names, dst_names, called_nn_names, input_names = [], [], [], []
    output_names = ['source', 'destination']

    # check the hidden state creation
    for entity_item in entities:
        state_ops = entity_item['initial_state']
        for op in state_ops:
            if 'input' in op:
                input_names += op['input']
            if 'output' in op:
                output_names += op['output']

    # check the message passing
    for stage in stages:
        stage_mp = stage.get('stage_message_passings')
        for mp in stage_mp:  # for every message-passing
            dst_names.append(mp.get('destination_entity'))
            sources = mp.get('source_entities')

            # check the message functions
            for src in sources:
                src_names.append(src.get('name'))
                messages = src.get('message', None)
                if messages is not None:
                    for op in messages:  # for every operation
                        if op.get('type') == 'neural_network':
                            called_nn_names.append(op.get('nn_name'))
                            input_names += op.get('input')
                        if 'output_name' in op:
                            output_names.append(op.get('output_name'))

            # check the aggregation functions
            aggregations = mp.get('aggregation')
            for aggr in aggregations:
                if aggr.get('type') == 'neural_network':
                    input_names += aggr.get('input')
                if 'output_name' in aggr:
                    output_names.append(aggr.get('output_name'))

    readout_op = data.get('readout')
    called_nn_names += [op.get('nn_name') for op in readout_op if op.get('type') == 'neural_network']

    if 'output_label' not in readout_op[-1]:
        print_failure('The last operation of the readout MUST contain the definition of the output_label')
    else:
        input_names += readout_op[-1]['output_label']

    # now check the entities
    entity_names = [a.get('name') for a in data.get('entities')]
    nn_names = [n.get('nn_name') for n in data.get('neural_networks')]

    # check whether the names of two defined NNs match
    if len(nn_names) != len(set(nn_names)):
        print_failure("The names of two NNs are repeated. Please ensure that each NN has a unique name.")

    # check the source entities
    for a in src_names:
        if a not in entity_names:
            print_failure('The source entity "' + a + '" was used in a message passing. However, there is no such '
                          'entity. \n Please check the spelling or define a new entity.')

    # check the destination entities
    for d in dst_names:
        if d not in entity_names:
            print_failure('The destination entity "' + d + '" was used in a message passing. However, there is no '
                          'such entity. \n Please check the spelling or define a new entity.')

    # check the nn_names
    for name in called_nn_names:
        if name not in nn_names:
            print_failure('The name "' + name + '" is used as a reference to a neural network (nn_name), even though '
                          'the neural network was not defined. \n Please make sure the name is correctly spelled or '
                          'define a neural network named ' + name)

    # ensure that all the inputs (that are not the output of another operation) start with a $
    for i in input_names:
        if i not in output_names and i[0] != '$':
            print_failure('The input name ' + i + ' references data from the dataset but does not start with $')

    for i in output_names:
        if i[0] == '$':
            print_failure('The keyword ' + i + ' starts with $ even though it does not represent data from the dataset.')