Example #1
    def _build_feature_spec(self):
        num_attrs = len(self._attr_types)
        numeric_types = ("float", "int")
        embedding_types = ("int", "string")

        self._fspec = FeatureSpec(num_attrs, self._weighted, self._labeled)

        if not self._attr_dims:
            self._attr_dims = [None for _ in range(num_attrs)]

        if num_attrs != len(self._attr_dims):
            raise ValueError(
                "The size of attr_dims must be equal to that of attr_types.")

        def check(dim, type_name, bucket_size):
            if not dim:
                assert type_name in numeric_types and bucket_size is None, \
                  "Must assign an attr_dim for {}; bucket_size must be None " \
                  "when attr_dim is missing.".format(type_name)
            else:
                assert type_name in embedding_types, \
                  "attr_dim must be None for {}.".format(type_name)

        for attr_type, dim in zip(self._attr_types, self._attr_dims):
            type_name, bucket_size, is_multival = self.parse(attr_type)
            check(dim, type_name, bucket_size)
            if is_multival:
                self._fspec.append_multival(bucket_size, dim, ",")
            elif dim:
                self._fspec.append_sparse(bucket_size, dim, type_name == "int")
            else:
                self._fspec.append_dense(type_name == "float")
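
The pairing rules enforced by `check` are easiest to see with concrete inputs. Below is a minimal sketch under the assumption that the surrounding Decoder class (shown in full in Example #7) is exposed as `graphlearn.Decoder`; the import alias is an assumption, and the commented-out line shows a pairing the assertions reject.

    import graphlearn as gl  # assumed import path

    # Valid pairings: continuous attrs take attr_dim=None,
    # embedded attrs take a positive dim.
    decoder = gl.Decoder(
        attr_types=["float", "int", ("int", 10), ("string", 100)],
        attr_dims=[None, None, 8, 8])
    spec = decoder.feature_spec  # lazily calls _build_feature_spec()

    # Invalid pairing: a "float" attribute with a dim trips the
    # embedding-type assertion in check().
    # gl.Decoder(attr_types=["float"], attr_dims=[8]).feature_spec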
Example #2
    def get_sub_graph_data(self):
        spec = FeatureSpec(10)
        for i in range(3):
            spec.append_dense()

        total_dim = 3
        for i in range(7):
            dim = random.randint(8, 10)
            spec.append_sparse(20 + 10 * i, dim, False)
            total_dim += dim

        N = 10
        # [f_num, batch_size] = [3, N]
        batch_floats = np.random.random([3, N])
        batch_floats = tf.convert_to_tensor(batch_floats, dtype=tf.float32)
        # [i_num, batch_size] = [7, N]
        batch_ints = np.array([[i * j for j in range(N)] for i in range(7)])
        batch_ints = tf.convert_to_tensor(batch_ints, dtype=tf.int64)

        vertices = Vertex(floats=batch_floats, ints=batch_ints)

        # Fill row i of the dense adjacency matrix with the value i.
        adj = np.zeros([N, N], dtype=np.float32)
        for i in range(N):
            for j in range(N):
                adj[i][j] = i

        tf_adj = tf.convert_to_tensor(adj, dtype=tf.float32)

        g = SubGraph(vertices, tf_adj, schema=("nodes", spec))
        return g, N, total_dim
Example #3
    def test_homogeneous_graph(self):
        spec = FeatureSpec(10)
        for i in range(3):
            spec.append_dense()

        total_dim = 3
        for i in range(7):
            dim = random.randint(8, 10)
            spec.append_sparse(20 + 10 * i, dim, False)
            total_dim += dim

        hops = [4, 5]
        # the centric vertices share the same spec with 2-hop neighbors
        schema = [("nodes", spec), ("nodes", spec), ("nodes", spec)]

        # [f_num, batch_size] = [3, 2]
        batch_floats = np.array([[1.0 * i, 2.0 * i] for i in range(3)])
        batch_floats = tf.convert_to_tensor(batch_floats, dtype=tf.float32)
        # [i_num, batch_size] = [7, 2]
        batch_ints = np.array([[i, 2 * i] for i in range(7)])
        batch_ints = tf.convert_to_tensor(batch_ints, dtype=tf.int64)
        vertices = Vertex(floats=batch_floats, ints=batch_ints)

        # [f_num, batch_size] = [3, 2 * 4]
        hop1_floats = np.array([[1.0 * i, 2.0 * i] * hops[0]
                                for i in range(3)])
        hop1_floats = tf.convert_to_tensor(hop1_floats, dtype=tf.float32)
        # [i_num, batch_size] = [7, 2 * 4]
        hop1_ints = np.array([[i, 2 * i] * hops[0] for i in range(7)])
        hop1_ints = tf.convert_to_tensor(hop1_ints, dtype=tf.int64)
        neighbor_hop_1 = Vertex(floats=hop1_floats, ints=hop1_ints)

        # [f_num, batch_size] = [3, 2 * 4 * 5]
        hop2_floats = np.array([[1.0 * i, 2.0 * i] * hops[0] * hops[1]
                                for i in range(3)])
        hop2_floats = tf.convert_to_tensor(hop2_floats, dtype=tf.float32)
        # [i_num, batch_size] = [7, 2 * 4 * 5]
        hop2_ints = np.array([[i, 2 * i] * hops[0] * hops[1]
                              for i in range(7)])
        hop2_ints = tf.convert_to_tensor(hop2_ints, dtype=tf.int64)
        neighbor_hop_2 = Vertex(floats=hop2_floats, ints=hop2_ints)

        g = EgoGraph(vertices, [neighbor_hop_1, neighbor_hop_2], schema, hops)

        dims = np.array([total_dim, 16, 8])
        model = HomoEgoGIN(dims,
                           num_head=5,
                           bn_fn=None,
                           active_fn=tf.nn.relu,
                           dropout=0.1)
        embeddings = model.forward(g)

        with tf.Session() as sess:
            sess.run(tf.local_variables_initializer())
            sess.run(tf.global_variables_initializer())
            ret = sess.run(embeddings)
            self.assertListEqual([2, 8],
                                 list(ret.shape))  # [batch_size, output_dim]
Example #4
    def get_model_and_graph(self):
        spec = FeatureSpec(10)
        for i in range(3):
            spec.append_dense()

        total_dim = 3
        for i in range(7):
            dim = random.randint(8, 10)
            spec.append_sparse(20 + 10 * i, dim, False)
            total_dim += dim

        hops = [4, 5]
        # the centric vertices share the same spec with 2-hop neighbors
        schema = [("nodes", spec), ("nodes", spec), ("nodes", spec)]

        # [f_num, batch_size] = [3, 2]
        batch_floats = np.array([[1.0 * i, 2.0 * i] for i in range(3)])
        batch_floats = tf.convert_to_tensor(batch_floats, dtype=tf.float32)
        # [i_num, batch_size] = [7, 2]
        batch_ints = np.array([[i, 2 * i] for i in range(7)])
        batch_ints = tf.convert_to_tensor(batch_ints, dtype=tf.int64)
        # [batch_size] = [2]
        batch_labels = np.array([1, 0])
        batch_labels = tf.convert_to_tensor(batch_labels, dtype=tf.int32)
        vertices = Vertex(floats=batch_floats,
                          ints=batch_ints,
                          labels=batch_labels)

        # [f_num, batch_size] = [3, 2 * 4]
        hop1_floats = np.array([[1.0 * i, 2.0 * i] * hops[0]
                                for i in range(3)])
        hop1_floats = tf.convert_to_tensor(hop1_floats, dtype=tf.float32)
        # [i_num, batch_size] = [7, 2 * 4]
        hop1_ints = np.array([[i, 2 * i] * hops[0] for i in range(7)])
        hop1_ints = tf.convert_to_tensor(hop1_ints, dtype=tf.int64)
        neighbor_hop_1 = Vertex(floats=hop1_floats, ints=hop1_ints)

        # [f_num, batch_size] = [3, 2 * 4 * 5]
        hop2_floats = np.array([[1.0 * i, 2.0 * i] * hops[0] * hops[1]
                                for i in range(3)])
        hop2_floats = tf.convert_to_tensor(hop2_floats, dtype=tf.float32)
        # [i_num, batch_size] = [7, 2 * 4 * 5]
        hop2_ints = np.array([[i, 2 * i] * hops[0] * hops[1]
                              for i in range(7)])
        hop2_ints = tf.convert_to_tensor(hop2_ints, dtype=tf.int64)
        neighbor_hop_2 = Vertex(floats=hop2_floats, ints=hop2_ints)

        g = EgoGraph(vertices, [neighbor_hop_1, neighbor_hop_2], schema, hops)

        dims = np.array([total_dim, 16, 8])
        model = HomoEgoGraphSAGE(dims,
                                 bn_fn=None,
                                 active_fn=tf.nn.relu,
                                 dropout=0.1)
        return model, g, 2  # batch_size
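
A hedged sketch of how the returned triple might be consumed, mirroring the session pattern of Example #3 (the surrounding test method is assumed; the expected output dimension 8 follows from dims = [total_dim, 16, 8]):

    model, g, batch_size = self.get_model_and_graph()
    embeddings = model.forward(g)

    with tf.Session() as sess:
        sess.run(tf.local_variables_initializer())
        sess.run(tf.global_variables_initializer())
        ret = sess.run(embeddings)
        # dims ends in 8, so the embedding shape is [batch_size, 8].
        self.assertListEqual([batch_size, 8], list(ret.shape))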
Example #5
    def get_graph(cls, hops, neg=None):
        spec = FeatureSpec(10)
        for i in range(3):
            spec.append_dense()

        total_dim = 3
        for i in range(7):
            dim = i + 1
            spec.append_sparse(20 + 10 * i, dim, False)
            total_dim += dim

        neg = 1 if not neg else int(neg)
        # Temporarily scale the first hop by the negative-sampling factor.
        hops[0] = int(hops[0] * neg)
        # the centric vertices share the same spec with 2-hop neighbors
        schema = [("nodes", spec), ("nodes", spec), ("nodes", spec)]

        # batch_size = 2
        # [f_num, batch_size] = [3, 2 * neg]
        batch_floats = np.array([[1.0 * i, 2.0 * i] * neg for i in range(3)])
        batch_floats = tf.convert_to_tensor(batch_floats, dtype=tf.float32)
        # [i_num, batch_size] = [7, 2 * neg]
        batch_ints = np.array([[i, 2 * i] * neg for i in range(7)])
        batch_ints = tf.convert_to_tensor(batch_ints, dtype=tf.int64)
        # [batch_size] = [2]
        batch_labels = np.array([1, 0])
        batch_labels = tf.convert_to_tensor(batch_labels, dtype=tf.int32)
        vertices = Vertex(floats=batch_floats,
                          ints=batch_ints,
                          labels=batch_labels)

        # [f_num, batch_size] = [3, 2 * neg * hop0]
        hop1_floats = np.array([[1.0 * i, 2.0 * i] * hops[0]
                                for i in range(3)])
        hop1_floats = tf.convert_to_tensor(hop1_floats, dtype=tf.float32)
        # [i_num, batch_size] = [7, 2 * neg * hop0]
        hop1_ints = np.array([[i, 2 * i] * hops[0] for i in range(7)])
        hop1_ints = tf.convert_to_tensor(hop1_ints, dtype=tf.int64)
        neighbor_hop_1 = Vertex(floats=hop1_floats, ints=hop1_ints)

        # [f_num, batch_size] = [3, 2 * neg * hop0 * hop1]
        hop2_floats = np.array([[1.0 * i, 2.0 * i] * hops[0] * hops[1]
                                for i in range(3)])
        hop2_floats = tf.convert_to_tensor(hop2_floats, dtype=tf.float32)
        # [i_num, batch_size] = [7, 2 * neg * hop0 * hop1]
        hop2_ints = np.array([[i, 2 * i] * hops[0] * hops[1]
                              for i in range(7)])
        hop2_ints = tf.convert_to_tensor(hop2_ints, dtype=tf.int64)
        neighbor_hop_2 = Vertex(floats=hop2_floats, ints=hop2_ints)

        # Restore the original first-hop count before building the graph.
        hops[0] = int(hops[0] / neg)
        g = EgoGraph(vertices, [neighbor_hop_1, neighbor_hop_2], schema, hops)
        return g, total_dim
Example #6
  def test_homogeneous_graph(self):
    spec = FeatureSpec(10)
    for i in range(3):
      spec.append_dense()

    total_dim = 3
    for i in range(7):
      dim = random.randint(8, 10)
      spec.append_sparse(20 + 10 * i, dim, False)
      total_dim += dim

    hops = [4, 5]
    # the centric vertices share the same spec with 2-hop neighbors
    schema = [("nodes", spec), ("nodes", spec), ("nodes", spec)]

    # [f_num, batch_size] = [3, 2]
    batch_floats = np.array([[1.0 * i, 2.0 * i] for i in range(3)])
    batch_floats = tf.convert_to_tensor(batch_floats, dtype=tf.float32)
    # [i_num, batch_size] = [7, 2]
    batch_ints = np.array([[i, 2 * i] for i in range(7)])
    batch_ints = tf.convert_to_tensor(batch_ints, dtype=tf.int64)
    vertices = Vertex(floats=batch_floats, ints=batch_ints)

    # [f_num, batch_size] = [3, 2 * 4]
    hop1_floats = np.array([[1.0 * i, 2.0 * i] * hops[0] for i in range(3)])
    hop1_floats = tf.convert_to_tensor(hop1_floats, dtype=tf.float32)
    # [i_num, batch_size] = [7, 2 * 4]
    hop1_ints = np.array([[i, 2 * i] * hops[0] for i in range(7)])
    hop1_ints = tf.convert_to_tensor(hop1_ints, dtype=tf.int64)
    neighbor_hop_1 = Vertex(floats=hop1_floats, ints=hop1_ints)

    # [f_num, batch_size] = [3, 2 * 4 * 5]
    hop2_floats = np.array([[1.0 * i, 2.0 * i] * hops[0] * hops[1] for i in range(3)])
    hop2_floats = tf.convert_to_tensor(hop2_floats, dtype=tf.float32)
    # [i_num, batch_size] = [7, 2 * 4 * 5]
    hop2_ints = np.array([[i, 2 * i] * hops[0] * hops[1] for i in range(7)])
    hop2_ints = tf.convert_to_tensor(hop2_ints, dtype=tf.int64)
    neighbor_hop_2 = Vertex(floats=hop2_floats, ints=hop2_ints)

    g1 = EgoGraph(vertices, [neighbor_hop_1, neighbor_hop_2], schema, hops)
    g2 = g1.forward()

    self.assertListEqual(list(g2.expands), hops)
    x_list = [g2.nodes, g2.hop(0), g2.hop(1)]

    with tf.Session() as sess:
      sess.run(tf.local_variables_initializer())
      sess.run(tf.global_variables_initializer())
      ret = sess.run(x_list)
      self.assertListEqual(list(ret[0].shape), [2, total_dim])
      self.assertListEqual(list(ret[1].shape), [2 * hops[0], total_dim])
      self.assertListEqual(list(ret[2].shape), [2 * hops[0] * hops[1], total_dim])
Example #7
class Decoder(object):
    """ Decoder is used for Graph.node() and Graph.edge() to describe the schema
      of data source.
  """
    def __init__(self,
                 weighted=False,
                 labeled=False,
                 attr_types=None,
                 attr_delimiter=":",
                 attr_dims=None):
        """ Initialize a data source decoder.

    Args:
      weighted (boolean, Optional): Whether the data source has weights.
        Default is False.
      labeled (boolean, Optional): Whether the data source has labels.
        Default is False.
      attr_types (list, Optional): Attribute type list if attributes exist.
        Default is None, which means no attribute exists. Each element must 
        be string, tuple or list.

        Valid types like below:
        attr_types = ['int', 'float', 'string']
        attr_types = ['int', ('string', 10), 'string'] # 10 means bucket size
        For ('string', 10), we will get the string and hash it into 10 buckets
        directly. The raw attribute can by any string.

        # True means multi-val splited by ',', only for string attribute
        attr_types = ['int', ('string', 10, True)]

        # 10 means bucket size
        attr_types = ['int', ('int', 10), 'string']
        For ('int', 10), we will cast the string to int first and then hash it
        into 10 buckets. In this way, the raw attribute must be an integer.

        When `attr_dims` is assigned, be sure that the string attribute must
        be configured with a bucket size. Bucket size for int attribute is
        optional, and it will be considered as a continuous attribute if bucket
        size is not assigned.

      attr_delimiter (string, Optional): The delimiter to seperate attributes.
        Default is ':'. If attributes exist, all of them are concatenated
        together with a delimiter in the raw storage. We need to know how to
        parse them.
      attr_dims (list, Optional): An integer list, the element of which
        represents the dimension of the corresponding attribute that will be
        encodeding to. Default is None, which means no attribute exists.

        All valid configurations of attr_type and attr_dim are as shown below.
        |    attr_type     |attr_dim|             encoded into                 |
        |    ---------     | -- |                  --------                    |
        |     "string"     | 8  |   Dynamic bucket embedding variable, dim=8   |
        |   ("string",10)  | 8  |   Embedding variable, bucketsize=10, dim=8   |
        |("string",10,True)| 8  |Sparse embedding variable, bucketsize=10,dim=8|
        |("string",None,True)| 8|   Sparse dynamic embedding variable, dim=8   |
        |       "int"      |None|           Continues numeric tensor           |
        |       "int"      | 8  |   Dynamic bucket embedding variable, dim=8   |
        |    ("int",10)    | 8  |   Embedding variable, bucket size=10, dim=8  |
        |      "float"     |None|           Continues numeric tensor           |
        Note that dynamic bucket embedding variable is only supported in PAI-TF.
        For continues numeric attribute, attr_dim should be either None or 0.
    """
        self._weighted = weighted
        self._labeled = labeled
        self._attr_types = attr_types
        self._attr_delimiter = attr_delimiter
        self._attr_dims = attr_dims

        self._int_attr_num = 0
        self._float_attr_num = 0
        self._string_attr_num = 0
        self._fspec = None

        self._attributed = self._parse_attributes()

    def _parse_attributes(self):
        if not self._attr_types:
            return False
        if not isinstance(self._attr_types, list):
            raise ValueError(
                "attr_types for Decoder must be a list, got {}.".format(
                    type(self._attr_types)))
        for i in range(len(self._attr_types)):
            type_name, bucket_size, is_multival = self.parse(
                self._attr_types[i])
            self._int_attr_num += int(type_name == "int")
            self._float_attr_num += int(type_name == "float")
            if is_multival:
                self._string_attr_num += 1
            else:
                self._int_attr_num += int(type_name == "string"
                                          and bucket_size is not None)
                self._string_attr_num += int(type_name == "string"
                                             and bucket_size is None)
        return True

    def _build_feature_spec(self):
        num_attrs = len(self._attr_types)
        numeric_types = ("float", "int")
        embedding_types = ("int", "string")

        self._fspec = FeatureSpec(num_attrs, self._weighted, self._labeled)

        if not self._attr_dims:
            self._attr_dims = [None for _ in range(num_attrs)]

        if num_attrs != len(self._attr_dims):
            raise ValueError(
                "The size of attr_dims must be equal to that of attr_types.")

        def check(dim, type_name, bucket_size):
            if not dim:
                assert type_name in numeric_types and bucket_size is None, \
                  "Must assign an attr_dim for {}; bucket_size must be None " \
                  "when attr_dim is missing.".format(type_name)
            else:
                assert type_name in embedding_types, \
                  "attr_dim must be None for {}.".format(type_name)

        for attr_type, dim in zip(self._attr_types, self._attr_dims):
            type_name, bucket_size, is_multival = self.parse(attr_type)
            check(dim, type_name, bucket_size)
            if is_multival:
                self._fspec.append_multival(bucket_size, dim, ",")
            elif dim:
                self._fspec.append_sparse(bucket_size, dim, type_name == "int")
            else:
                self._fspec.append_dense(type_name == "float")

    def parse(self, attr_type):
        if isinstance(attr_type, (tuple, list)):
            type_name = attr_type[0]
            bucket_size = attr_type[1] if len(attr_type) >= 2 else None
            is_multival = attr_type[2] if len(attr_type) >= 3 else False
        else:
            type_name = attr_type
            bucket_size = None
            is_multival = False

        assert type_name in {"int", "float", "string"}

        if is_multival and type_name != "string":
            raise ValueError("multi-value attribute must be string type.")
        return type_name, bucket_size, is_multival

    @property
    def weighted(self):
        return self._weighted

    @property
    def labeled(self):
        return self._labeled

    @property
    def attributed(self):
        return self._attributed

    @property
    def attr_types(self):
        return self._attr_types

    @property
    def attr_delimiter(self):
        return self._attr_delimiter

    @property
    def data_format(self):
        # attributed << 3 | labeled << 2 | weighted << 1
        return int(self._weighted * 2 + \
                   self._labeled * 4 + self._attributed * 8)

    @property
    def int_attr_num(self):
        return self._int_attr_num

    @property
    def float_attr_num(self):
        return self._float_attr_num

    @property
    def string_attr_num(self):
        return self._string_attr_num

    @property
    def feature_spec(self):
        if not self._fspec:
            self._build_feature_spec()
        return self._fspec

    def format_attrs(self, int_attrs, float_attrs, string_attrs):
        """ Reshape and format attributes with int_attr_num, float_attr_num
    and string_attr_num calculated by decoder.attr_types.

    Return:
      Reshaped int_attrs, float_attrs, string_attrs
    """
        if int_attrs is not None:
            int_attrs = int_attrs.reshape(-1, self._int_attr_num)

        if float_attrs is not None:
            float_attrs = float_attrs.reshape(-1, self._float_attr_num)

        if string_attrs is not None:
            string_attrs = string_attrs.reshape(-1, self._string_attr_num)

        return int_attrs, float_attrs, string_attrs
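
The docstring table above maps one-to-one onto the constructor arguments. The sketch below is a minimal usage example, assuming the class is exposed as `graphlearn.Decoder` (the import alias is an assumption; every call used is defined in the class above):

    import graphlearn as gl  # assumed package alias

    decoder = gl.Decoder(
        weighted=True,
        labeled=True,
        # "float" and "int" are continuous; ("int", 10) and ("string", 100)
        # are hashed into buckets; ("string", 50, True) is multi-value.
        attr_types=["float", "int", ("int", 10), ("string", 100),
                    ("string", 50, True)],
        attr_dims=[None, None, 8, 8, 8])

    # _parse_attributes counts bucketed strings as int attributes.
    assert decoder.int_attr_num == 3     # "int", ("int", 10), ("string", 100)
    assert decoder.float_attr_num == 1   # "float"
    assert decoder.string_attr_num == 1  # the multi-value string
    assert decoder.data_format == 14     # weighted(2)+labeled(4)+attributed(8)
    spec = decoder.feature_spec          # lazily builds the FeatureSpec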
Example #8
    def test_heterogeneous_graph(self):
        u_spec = FeatureSpec(10)
        for i in range(3):
            u_spec.append_dense()

        u_total_dim = 3
        for i in range(7):
            dim = random.randint(8, 10)
            u_spec.append_sparse(20 + 10 * i, dim, False)
            u_total_dim += dim

        i_spec = FeatureSpec(19)
        for i in range(6):
            i_spec.append_dense()

        i_total_dim = 6
        for i in range(13):
            dim = random.randint(8, 11)
            i_spec.append_sparse(30 + 10 * i, dim, False)
            i_total_dim += dim

        u_out_dim = 16
        i_out_dim = 12
        out_dim = 9
        hops = [4, 5]
        # the centric vertices share the same spec with 2-hop neighbors
        # metapath: u--i--i
        schema = [("u_nodes", u_spec), ("nbr", i_spec), ("nbr", i_spec)]

        # [f_num, batch_size] = [3, 2]
        batch_floats = np.array([[1.0 * i, 2.0 * i] for i in range(3)])
        batch_floats = tf.convert_to_tensor(batch_floats, dtype=tf.float32)
        # [i_num, batch_size] = [7, 2]
        batch_ints = np.array([[i, 2 * i] for i in range(7)])
        batch_ints = tf.convert_to_tensor(batch_ints, dtype=tf.int64)
        vertices = Vertex(floats=batch_floats, ints=batch_ints)

        # [f_num, batch_size] = [6, 2 * 4]
        hop1_floats = np.array([[1.0 * i, 2.0 * i] * hops[0]
                                for i in range(6)])
        hop1_floats = tf.convert_to_tensor(hop1_floats, dtype=tf.float32)
        # [i_num, batch_size] = [13, 2 * 4]
        hop1_ints = np.array([[i, 2 * i] * hops[0] for i in range(13)])
        hop1_ints = tf.convert_to_tensor(hop1_ints, dtype=tf.int64)
        neighbor_hop_1 = Vertex(floats=hop1_floats, ints=hop1_ints)

        # [f_num, batch_size] = [6, 2 * 4 * 5]
        hop2_floats = np.array([[1.0 * i, 2.0 * i] * hops[0] * hops[1]
                                for i in range(6)])
        hop2_floats = tf.convert_to_tensor(hop2_floats, dtype=tf.float32)
        # [i_num, batch_size] = [13, 2 * 4 * 5]
        hop2_ints = np.array([[i, 2 * i] * hops[0] * hops[1]
                              for i in range(13)])
        hop2_ints = tf.convert_to_tensor(hop2_ints, dtype=tf.int64)
        neighbor_hop_2 = Vertex(floats=hop2_floats, ints=hop2_ints)

        g = EgoGraph(vertices, [neighbor_hop_1, neighbor_hop_2], schema, hops)

        layer_ui = EgoSAGELayer("heter_uv",
                                input_dim=(u_total_dim, i_total_dim),
                                output_dim=u_out_dim,
                                agg_type="mean",
                                com_type="concat")
        layer_ii = EgoSAGELayer("heter_ii",
                                input_dim=i_total_dim,
                                output_dim=i_out_dim,
                                agg_type="mean",
                                com_type="concat")
        layer_uii = EgoSAGELayer("heter_uii",
                                 input_dim=(u_out_dim, i_out_dim),
                                 output_dim=out_dim,
                                 agg_type="sum",
                                 com_type="concat")
        layer_group_1 = EgoSAGELayerGroup([layer_ui, layer_ii])
        layer_group_2 = EgoSAGELayerGroup([layer_uii])

        model = EgoGraphSAGE([layer_group_1, layer_group_2],
                             bn_fn=None,
                             active_fn=tf.nn.relu,
                             dropout=0.1)
        embeddings = model.forward(g)

        with tf.Session() as sess:
            sess.run(tf.local_variables_initializer())
            sess.run(tf.global_variables_initializer())
            ret = sess.run(embeddings)
            self.assertListEqual([2, 9],
                                 list(ret.shape))  # [batch_size, output_dim]