def get_sub_graph_data(self):
    """Build a synthetic SubGraph fixture.

    The feature spec holds 3 dense features followed by 7 sparse features
    whose embedding dimensions are drawn at random from [8, 10].

    Returns:
      A tuple (graph, vertex_count, total_dim), where total_dim is the number
      of dense features plus the sum of the sparse dimensions.
    """
    dense_num, sparse_num, vertex_count = 3, 7, 10

    feature_spec = FeatureSpec(10)
    for _ in range(dense_num):
        feature_spec.append_dense()
    feature_width = dense_num
    for slot in range(sparse_num):
        width = random.randint(8, 10)
        feature_spec.append_sparse(20 + 10 * slot, width, False)
        feature_width += width

    # Float features, laid out as [f_num, batch_size] = [3, N].
    float_values = np.random.random([dense_num, vertex_count])
    float_tensor = tf.convert_to_tensor(float_values, dtype=tf.float32)

    # Int features, laid out as [i_num, batch_size] = [7, N].
    int_values = np.array(
        [[row * col for col in range(vertex_count)]
         for row in range(sparse_num)])
    int_tensor = tf.convert_to_tensor(int_values, dtype=tf.int64)

    nodes = Vertex(floats=float_tensor, ints=int_tensor)

    # Every entry of row r in the adjacency matrix carries the value r.
    adjacency = np.zeros([vertex_count, vertex_count], dtype=np.float32)
    for row in range(vertex_count):
        adjacency[row, :] = row
    adjacency_tensor = tf.convert_to_tensor(adjacency, dtype=tf.float32)

    sub_graph = SubGraph(nodes, adjacency_tensor,
                         schema=("nodes", feature_spec))
    return sub_graph, vertex_count, feature_width
def test_homogeneous_graph(self):
    """Feed a synthetic 2-hop homogeneous EgoGraph through HomoEgoGIN.

    The root batch has 2 vertices and the hop fan-outs are [4, 5]. The test
    asserts that the evaluated embeddings have shape [2, 8], i.e.
    [batch_size, output_dim].
    """
    feature_spec = FeatureSpec(10)
    for _ in range(3):
        feature_spec.append_dense()
    input_dim = 3
    for slot in range(7):
        width = random.randint(8, 10)
        feature_spec.append_sparse(20 + 10 * slot, width, False)
        input_dim += width

    hops = [4, 5]
    # Root vertices and both neighbor hops all use the same feature spec.
    schema = [("nodes", feature_spec) for _ in range(3)]

    def make_vertex(repeat):
        # floats: [f_num, size] = [3, 2 * repeat]
        float_rows = np.array(
            [[1.0 * k, 2.0 * k] * repeat for k in range(3)])
        float_tensor = tf.convert_to_tensor(float_rows, dtype=tf.float32)
        # ints: [i_num, size] = [7, 2 * repeat]
        int_rows = np.array([[k, 2 * k] * repeat for k in range(7)])
        int_tensor = tf.convert_to_tensor(int_rows, dtype=tf.int64)
        return Vertex(floats=float_tensor, ints=int_tensor)

    roots = make_vertex(1)                      # size 2
    first_hop = make_vertex(hops[0])            # size 2 * 4
    second_hop = make_vertex(hops[0] * hops[1])  # size 2 * 4 * 5

    ego_graph = EgoGraph(roots, [first_hop, second_hop], schema, hops)

    layer_dims = np.array([input_dim, 16, 8])
    gin = HomoEgoGIN(layer_dims,
                     num_head=5,
                     bn_fn=None,
                     active_fn=tf.nn.relu,
                     droput=0.1)
    output = gin.forward(ego_graph)

    with tf.Session() as sess:
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())
        result = sess.run(output)
        # [batch_size, output_dim]
        self.assertListEqual([2, 8], list(result.shape))
def get_model_and_graph(self):
    """Create a HomoEgoGraphSAGE model and a matching 2-hop EgoGraph.

    The root batch has 2 labeled vertices; hop fan-outs are [4, 5].

    Returns:
      A tuple (model, graph, batch_size) with batch_size equal to 2.
    """
    feature_spec = FeatureSpec(10)
    for _ in range(3):
        feature_spec.append_dense()
    input_dim = 3
    for slot in range(7):
        width = random.randint(8, 10)
        feature_spec.append_sparse(20 + 10 * slot, width, False)
        input_dim += width

    hops = [4, 5]
    # Root vertices and both neighbor hops all use the same feature spec.
    schema = [("nodes", feature_spec) for _ in range(3)]

    def feature_tensors(repeat):
        # floats: [f_num, size] = [3, 2 * repeat]
        float_rows = np.array(
            [[1.0 * k, 2.0 * k] * repeat for k in range(3)])
        float_tensor = tf.convert_to_tensor(float_rows, dtype=tf.float32)
        # ints: [i_num, size] = [7, 2 * repeat]
        int_rows = np.array([[k, 2 * k] * repeat for k in range(7)])
        int_tensor = tf.convert_to_tensor(int_rows, dtype=tf.int64)
        return float_tensor, int_tensor

    # Root batch: features plus one label per vertex, [batch_size] = [2].
    root_floats, root_ints = feature_tensors(1)
    root_labels = tf.convert_to_tensor(np.array([1, 0]), dtype=tf.int32)
    roots = Vertex(floats=root_floats, ints=root_ints, labels=root_labels)

    # First hop: size 2 * 4.
    hop1_floats, hop1_ints = feature_tensors(hops[0])
    first_hop = Vertex(floats=hop1_floats, ints=hop1_ints)

    # Second hop: size 2 * 4 * 5.
    hop2_floats, hop2_ints = feature_tensors(hops[0] * hops[1])
    second_hop = Vertex(floats=hop2_floats, ints=hop2_ints)

    ego_graph = EgoGraph(roots, [first_hop, second_hop], schema, hops)

    layer_dims = np.array([input_dim, 16, 8])
    sage = HomoEgoGraphSAGE(layer_dims,
                            bn_fn=None,
                            active_fn=tf.nn.relu,
                            droput=0.1)
    return sage, ego_graph, 2  # batch_size
def get_graph(cls, hops, neg=None):
    """Build a synthetic 2-hop homogeneous EgoGraph fixture.

    The feature spec has 3 dense features and 7 sparse features whose
    embedding dimensions are 1..7, so the returned total_dim is deterministic.

    Args:
      hops: Sequence of two fan-out counts, [hop0, hop1]. It is passed to
        EgoGraph unchanged and is never modified by this function.
      neg: Optional multiplier applied to the root batch and, through the
        first hop, to every neighbor tensor. A falsy value means 1.

    Returns:
      A tuple (graph, total_dim).
    """
    spec = FeatureSpec(10)
    for _ in range(3):
        spec.append_dense()
    total_dim = 3
    for i in range(7):
        dim = i + 1
        spec.append_sparse(20 + 10 * i, dim, False)
        total_dim += dim

    neg = 1 if not neg else int(neg)
    # Keep the neg-scaled fan-outs in locals. Overwriting hops[0] and
    # restoring it afterwards would leave the caller's list corrupted if
    # anything raised in between.
    hop1_repeat = int(hops[0] * neg)
    hop2_repeat = hop1_repeat * hops[1]

    # The centric vertices share the same spec with the 2-hop neighbors.
    schema = [("nodes", spec), ("nodes", spec), ("nodes", spec)]

    # batch_size = 2
    # [f_num, batch_size] = [3, 2 * neg]
    batch_floats = np.array([[1.0 * i, 2.0 * i] * neg for i in range(3)])
    batch_floats = tf.convert_to_tensor(batch_floats, dtype=tf.float32)
    # [i_num, batch_size] = [7, 2 * neg]
    batch_ints = np.array([[i, 2 * i] * neg for i in range(7)])
    batch_ints = tf.convert_to_tensor(batch_ints, dtype=tf.int64)
    # Labels are not scaled by neg: [2]
    batch_labels = np.array([1, 0])
    batch_labels = tf.convert_to_tensor(batch_labels, dtype=tf.int32)
    vertices = Vertex(floats=batch_floats,
                      ints=batch_ints,
                      labels=batch_labels)

    # [f_num, batch_size] = [3, 2 * neg * hop0]
    hop1_floats = np.array(
        [[1.0 * i, 2.0 * i] * hop1_repeat for i in range(3)])
    hop1_floats = tf.convert_to_tensor(hop1_floats, dtype=tf.float32)
    # [i_num, batch_size] = [7, 2 * neg * hop0]
    hop1_ints = np.array([[i, 2 * i] * hop1_repeat for i in range(7)])
    hop1_ints = tf.convert_to_tensor(hop1_ints, dtype=tf.int64)
    neighbor_hop_1 = Vertex(floats=hop1_floats, ints=hop1_ints)

    # [f_num, batch_size] = [3, 2 * neg * hop0 * hop1]
    hop2_floats = np.array(
        [[1.0 * i, 2.0 * i] * hop2_repeat for i in range(3)])
    hop2_floats = tf.convert_to_tensor(hop2_floats, dtype=tf.float32)
    # [i_num, batch_size] = [7, 2 * neg * hop0 * hop1]
    hop2_ints = np.array([[i, 2 * i] * hop2_repeat for i in range(7)])
    hop2_ints = tf.convert_to_tensor(hop2_ints, dtype=tf.int64)
    neighbor_hop_2 = Vertex(floats=hop2_floats, ints=hop2_ints)

    g = EgoGraph(vertices, [neighbor_hop_1, neighbor_hop_2], schema, hops)
    return g, total_dim
def test_homogeneous_graph(self):
    """Check EgoGraph.forward() on a synthetic 2-hop homogeneous graph.

    The test asserts that the forwarded graph reports the original hop
    fan-outs, and that the root, first-hop and second-hop tensors have
    2, 2 * 4 and 2 * 4 * 5 rows of width total_dim.
    """
    feature_spec = FeatureSpec(10)
    for _ in range(3):
        feature_spec.append_dense()
    feature_width = 3
    for slot in range(7):
        width = random.randint(8, 10)
        feature_spec.append_sparse(20 + 10 * slot, width, False)
        feature_width += width

    hops = [4, 5]
    # Root vertices and both neighbor hops all use the same feature spec.
    schema = [("nodes", feature_spec) for _ in range(3)]

    def make_vertex(repeat):
        # floats: [f_num, size] = [3, 2 * repeat]
        float_rows = np.array(
            [[1.0 * k, 2.0 * k] * repeat for k in range(3)])
        float_tensor = tf.convert_to_tensor(float_rows, dtype=tf.float32)
        # ints: [i_num, size] = [7, 2 * repeat]
        int_rows = np.array([[k, 2 * k] * repeat for k in range(7)])
        int_tensor = tf.convert_to_tensor(int_rows, dtype=tf.int64)
        return Vertex(floats=float_tensor, ints=int_tensor)

    roots = make_vertex(1)
    first_hop = make_vertex(hops[0])
    second_hop = make_vertex(hops[0] * hops[1])

    raw_graph = EgoGraph(roots, [first_hop, second_hop], schema, hops)
    forwarded = raw_graph.forward()
    self.assertListEqual(list(forwarded.expands), hops)

    fetches = [forwarded.nodes, forwarded.hop(0), forwarded.hop(1)]
    expected_rows = [2, 2 * hops[0], 2 * hops[0] * hops[1]]
    with tf.Session() as sess:
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())
        values = sess.run(fetches)
        for value, rows in zip(values, expected_rows):
            self.assertListEqual(list(value.shape), [rows, feature_width])
class Decoder(object):
    """ Decoder is used for Graph.node() and Graph.edge() to describe the
    schema of data source.

    It records whether the source carries weights, labels and attributes,
    how the raw attribute string is delimited, and how many attributes are
    counted as int, float and string after parsing `attr_types`.
    """

    def __init__(self,
                 weighted=False,
                 labeled=False,
                 attr_types=None,
                 attr_delimiter=":",
                 attr_dims=None):
        """ Initialize a data source decoder.

        Args:
          weighted (boolean, Optional): Whether the data source has weights.
            Default is False.
          labeled (boolean, Optional): Whether the data source has labels.
            Default is False.
          attr_types (list, Optional): Attribute type list if attributes
            exist. Default is None, which means no attribute exists. Each
            element must be a string, tuple or list. Valid types are like:

              attr_types = ['int', 'float', 'string']

              attr_types = ['int', ('string', 10), 'string']
              # 10 means bucket size. For ('string', 10), we will get the
              # string and hash it into 10 buckets directly. The raw
              # attribute can be any string.

              attr_types = ['int', ('string', 10, True)]
              # True means multi-value split by ',', only for string
              # attributes. 10 means bucket size.

              attr_types = ['int', ('int', 10), 'string']
              # For ('int', 10), we will cast the string to int first and
              # then hash it into 10 buckets. In this way, the raw attribute
              # must be an integer.

            When `attr_dims` is assigned, be sure that the string attribute
            is configured with a bucket size. Bucket size for an int
            attribute is optional, and the attribute is considered
            continuous if no bucket size is assigned.
          attr_delimiter (string, Optional): The delimiter to separate
            attributes. Default is ':'. If attributes exist, all of them are
            concatenated together with a delimiter in the raw storage. We
            need to know how to parse them.
          attr_dims (list, Optional): An integer list, each element of which
            is the dimension the corresponding attribute will be encoded
            into. Default is None, which means no attribute exists.

        All valid configurations of attr_type and attr_dim are shown below.

        |     attr_type      |attr_dim|               encoded into                   |
        | ------------------ | ------ | -------------------------------------------- |
        | "string"           |   8    | Dynamic bucket embedding variable, dim=8     |
        | ("string",10)      |   8    | Embedding variable, bucketsize=10, dim=8     |
        | ("string",10,True) |   8    | Sparse embedding variable, bucketsize=10,dim=8|
        |("string",None,True)|   8    | Sparse dynamic embedding variable, dim=8     |
        | "int"              |  None  | Continuous numeric tensor                    |
        | "int"              |   8    | Dynamic bucket embedding variable, dim=8     |
        | ("int",10)         |   8    | Embedding variable, bucket size=10, dim=8    |
        | "float"            |  None  | Continuous numeric tensor                    |

        Note that dynamic bucket embedding variable is only supported in
        PAI-TF. For a continuous numeric attribute, attr_dim should be either
        None or 0.

        Raises:
          ValueError: If `attr_types` is non-empty but not a list, or a
            multi-value attribute is not of string type.
        """
        self._weighted = weighted
        self._labeled = labeled
        self._attr_types = attr_types
        self._attr_delimiter = attr_delimiter
        self._attr_dims = attr_dims

        self._int_attr_num = 0
        self._float_attr_num = 0
        self._string_attr_num = 0
        # Built lazily on first access of `feature_spec`.
        self._fspec = None

        self._attributed = self._parse_attributes()

    def _parse_attributes(self):
        """ Count attributes per storage type.

        "int" and single-value bucketed "string" attributes count as int,
        "float" counts as float, multi-value strings and un-bucketed strings
        count as string.

        Return:
          True if any attribute type is configured, otherwise False.
        """
        if not self._attr_types:
            return False

        if not isinstance(self._attr_types, list):
            raise ValueError(
                "attr_types for Decoder must be a list, got {}.".format(
                    type(self._attr_types)))

        for attr_type in self._attr_types:
            type_name, bucket_size, is_multival = self.parse(attr_type)
            self._int_attr_num += int(type_name == "int")
            self._float_attr_num += int(type_name == "float")
            if is_multival:
                self._string_attr_num += 1
            else:
                is_string = type_name == "string"
                # A bucketed single-value string is counted with the ints,
                # an un-bucketed one stays a string.
                self._int_attr_num += int(
                    is_string and bucket_size is not None)
                self._string_attr_num += int(
                    is_string and bucket_size is None)
        return True

    def _build_feature_spec(self):
        """ Build the FeatureSpec from `attr_types` and `attr_dims`.

        The result is stored in `self._fspec` only after every attribute has
        been validated and appended, so a failed build never leaves a
        partially filled spec behind.

        Raises:
          ValueError: If `attr_dims` and `attr_types` differ in length.
          AssertionError: If an attribute's type, bucket size and dim are
            not a valid combination.
        """
        num_attrs = len(self._attr_types)
        numeric_types = ("float", "int")
        embedding_types = ("int", "string")

        fspec = FeatureSpec(num_attrs, self._weighted, self._labeled)

        if not self._attr_dims:
            self._attr_dims = [None for _ in range(num_attrs)]
        if num_attrs != len(self._attr_dims):
            raise ValueError(
                "The size of attr_dims must be equal with attr_types.")

        def check(dim, type_name, bucket_size):
            # Validate using the arguments only, never the enclosing loop
            # variables.
            if not dim:
                assert type_name in numeric_types and bucket_size is None, \
                    "Must assign an attr_dim for {}, and bucket_size should None." \
                    .format(type_name)
            else:
                assert type_name in embedding_types, \
                    "Must assign an attr_dim with None for {}".format(type_name)

        for attr_type, dim in zip(self._attr_types, self._attr_dims):
            type_name, bucket_size, is_multival = self.parse(attr_type)
            check(dim, type_name, bucket_size)

            if is_multival:
                fspec.append_multival(bucket_size, dim, ",")
            elif dim:
                fspec.append_sparse(bucket_size, dim, type_name == "int")
            else:
                fspec.append_dense(type_name == "float")

        self._fspec = fspec

    def parse(self, attr_type):
        """ Split one `attr_types` element into its components.

        Args:
          attr_type: Either a type name ("int", "float" or "string"), or a
            tuple/list of (type_name[, bucket_size[, is_multival]]).

        Return:
          A tuple (type_name, bucket_size, is_multival). Missing bucket_size
          defaults to None and missing is_multival defaults to False.

        Raises:
          AssertionError: If the type name is not "int", "float" or "string".
          ValueError: If is_multival is set for a non-string type.
        """
        if isinstance(attr_type, (tuple, list)):
            type_name = attr_type[0]
            bucket_size = attr_type[1] if len(attr_type) >= 2 else None
            is_multival = attr_type[2] if len(attr_type) >= 3 else False
        else:
            type_name = attr_type
            bucket_size = None
            is_multival = False

        assert type_name in {"int", "float", "string"}, \
            "attribute type must be one of int, float or string, got {}." \
            .format(type_name)

        if is_multival and type_name != "string":
            raise ValueError("multi-value attribute must be string type.")
        return type_name, bucket_size, is_multival

    @property
    def weighted(self):
        """ Whether the data source has weights. """
        return self._weighted

    @property
    def labeled(self):
        """ Whether the data source has labels. """
        return self._labeled

    @property
    def attributed(self):
        """ Whether any attribute type is configured. """
        return self._attributed

    @property
    def attr_types(self):
        """ The attribute type list given at construction. """
        return self._attr_types

    @property
    def attr_delimiter(self):
        """ The delimiter that separates raw attributes. """
        return self._attr_delimiter

    @property
    def data_format(self):
        """ Bit flags: attributed << 3 | labeled << 2 | weighted << 1. """
        return int(self._weighted * 2 +
                   self._labeled * 4 +
                   self._attributed * 8)

    @property
    def int_attr_num(self):
        """ Number of attributes counted as int. """
        return self._int_attr_num

    @property
    def float_attr_num(self):
        """ Number of attributes counted as float. """
        return self._float_attr_num

    @property
    def string_attr_num(self):
        """ Number of attributes counted as string. """
        return self._string_attr_num

    @property
    def feature_spec(self):
        """ The FeatureSpec of this decoder, built on first access. """
        if self._fspec is None:
            self._build_feature_spec()
        return self._fspec

    def format_attrs(self, int_attrs, float_attrs, string_attrs):
        """ Reshape and format attributes with int_attr_num, float_attr_num
        and string_attr_num calculated by decoder.attr_types.

        Each argument that is not None is reshaped to two dimensions with
        the matching attribute count as the second dimension; None is passed
        through unchanged.

        Return:
          Reshaped int_attrs, float_attrs, string_attrs
        """
        if int_attrs is not None:
            int_attrs = int_attrs.reshape(-1, self._int_attr_num)
        if float_attrs is not None:
            float_attrs = float_attrs.reshape(-1, self._float_attr_num)
        if string_attrs is not None:
            string_attrs = string_attrs.reshape(-1, self._string_attr_num)
        return int_attrs, float_attrs, string_attrs
def test_heterogeneous_graph(self):
    """Run a two-layer EgoGraphSAGE over a synthetic u--i--i EgoGraph.

    Root ("u") vertices use a 3-dense / 7-sparse spec, while both neighbor
    hops ("i") use a 6-dense / 13-sparse spec. The hop fan-outs are [4, 5]
    and the test asserts that the embeddings have shape [2, 9], i.e.
    [batch_size, output_dim].
    """
    def build_spec(capacity, dense_num, sparse_num, first_bucket, max_dim):
        # Returns the spec and the total feature width it describes.
        spec = FeatureSpec(capacity)
        for _ in range(dense_num):
            spec.append_dense()
        width = dense_num
        for slot in range(sparse_num):
            dim = random.randint(8, max_dim)
            spec.append_sparse(first_bucket + 10 * slot, dim, False)
            width += dim
        return spec, width

    def make_vertex(float_num, int_num, repeat):
        # floats: [float_num, 2 * repeat]; ints: [int_num, 2 * repeat]
        float_rows = np.array(
            [[1.0 * k, 2.0 * k] * repeat for k in range(float_num)])
        float_tensor = tf.convert_to_tensor(float_rows, dtype=tf.float32)
        int_rows = np.array([[k, 2 * k] * repeat for k in range(int_num)])
        int_tensor = tf.convert_to_tensor(int_rows, dtype=tf.int64)
        return Vertex(floats=float_tensor, ints=int_tensor)

    u_spec, u_total_dim = build_spec(10, 3, 7, 20, 10)
    i_spec, i_total_dim = build_spec(19, 6, 13, 30, 11)

    u_out_dim, i_out_dim, out_dim = 16, 12, 9
    hops = [4, 5]

    # metapath: u--i--i
    schema = [("u_nodes", u_spec), ("nbr", i_spec), ("nbr", i_spec)]

    roots = make_vertex(3, 7, 1)                       # size 2
    first_hop = make_vertex(6, 13, hops[0])            # size 2 * 4
    second_hop = make_vertex(6, 13, hops[0] * hops[1])  # size 2 * 4 * 5

    ego_graph = EgoGraph(roots, [first_hop, second_hop], schema, hops)

    layer_ui = EgoSAGELayer("heter_uv",
                            input_dim=(u_total_dim, i_total_dim),
                            output_dim=u_out_dim,
                            agg_type="mean",
                            com_type="concat")
    layer_ii = EgoSAGELayer("heter_ii",
                            input_dim=i_total_dim,
                            output_dim=i_out_dim,
                            agg_type="mean",
                            com_type="concat")
    layer_uii = EgoSAGELayer("heter_uii",
                             input_dim=(u_out_dim, i_out_dim),
                             output_dim=out_dim,
                             agg_type="sum",
                             com_type="concat")

    first_group = EgoSAGELayerGroup([layer_ui, layer_ii])
    second_group = EgoSAGELayerGroup([layer_uii])
    sage = EgoGraphSAGE([first_group, second_group],
                        bn_fn=None,
                        active_fn=tf.nn.relu,
                        droput=0.1)
    output = sage.forward(ego_graph)

    with tf.Session() as sess:
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())
        result = sess.run(output)
        # [batch_size, output_dim]
        self.assertListEqual([2, 9], list(result.shape))