Пример #1
0
    def __call__(self):
        """Generate skip-gram training batches with node features attached.

        For every group of walks produced by ``self.walk_generator()`` the
        walks are turned into (source, positive) pairs with
        ``skip_gram_gen_pair``, negatives are drawn according to
        ``self.neg_sample_type`` and the node ids are concatenated with the
        rows of ``self.node_feat`` they index.

        Yields:
            tuple: ``(src_feat, dst_feat)``, each truncated to at most
            ``max_len`` rows along the first axis and carrying one extra
            trailing axis of size 1.

        Raises:
            ValueError: if ``self.neg_sample_type`` is neither ``"average"``
                nor ``"outdegree"`` (raised when the first non-empty batch
                is built).
        """
        # Seed numpy's global RNG with the process id
        # (presumably so parallel reader processes draw different samples).
        np.random.seed(os.getpid())
        if self.neg_sample_type == "outdegree":
            # Negative sampling distribution proportional to node out-degree.
            outdegree = self.graph.outdegree()
            distribution = 1. * outdegree / outdegree.sum()
            alias, events = alias_sample_build_table(distribution)
        # Upper bound on the number of pairs emitted per batch.
        max_len = int(self.batch_size * self.walk_len *
                      ((1 + self.win_size) - 0.3))
        for walks in self.walk_generator():
            src, pos = [], []
            for walk in walks:
                s, p = skip_gram_gen_pair(walk, self.win_size)
                src.extend(s)
                pos.extend(p)
            # NOTE: the original had a stray trailing comma here that turned
            # ``src`` into a 1-tuple; build proper arrays instead.
            src = np.array(src, dtype=np.int64).reshape([-1, 1, 1])
            pos = np.array(pos, dtype=np.int64).reshape([-1, 1, 1])

            if src.shape[0] == 0:
                continue
            neg_sample_size = [len(pos), self.neg_num, 1]
            if self.neg_sample_type == "average":
                negs = self.graph.sample_nodes(neg_sample_size)
            elif self.neg_sample_type == "outdegree":
                negs = alias_sample(neg_sample_size, alias, events)
            else:
                # Previously this fell through and failed later with an
                # unbound ``negs``; fail with an explicit message instead.
                raise ValueError("Unsupported neg_sample_type: %r" %
                                 (self.neg_sample_type, ))
            # [batch_size, 1, 1] [batch_size, neg_num+1, 1]
            dst = np.concatenate([pos, negs], 1)
            # Append the feature rows looked up by node id after the id itself
            # (assumes ``self.node_feat`` is a 2-D array indexed by node id
            # -- TODO confirm).
            src_feat = np.concatenate([src, self.node_feat[src[:, :, 0]]], -1)
            dst_feat = np.concatenate([dst, self.node_feat[dst[:, :, 0]]], -1)
            src_feat = np.expand_dims(src_feat, -1)
            dst_feat = np.expand_dims(dst_feat, -1)
            yield src_feat[:max_len], dst_feat[:max_len]
Пример #2
0
    def batch_fn(self, batch_ex):
        """Turn a list of examples into a feed dict for one batch.

        Each element of ``batch_ex`` is indexed as ``(src, dst)`` or
        ``(src, dst, neg)``.  Negatives supplied with the examples are
        de-duplicated and extended with ids drawn by ``alias_sample``; the
        union of all ids is passed to ``graphsage_sample`` and the batch ids
        are re-indexed into the first returned subgraph.

        Returns:
            dict or None: the feed dict, or ``None`` when the batch does not
            contain exactly ``self.batch_size`` examples and
            ``self.phase == "train"``.
        """
        batch_src, batch_dst, batch_neg = [], [], []
        for example in batch_ex:
            batch_src.append(example[0])
            batch_dst.append(example[1])
            if len(example) == 3:  # default neg samples
                batch_neg.append(example[2])

        # Incomplete batches are dropped, but only while training.
        if len(batch_src) != self.batch_size and self.phase == "train":
            return None

        if len(batch_neg) > 0:
            batch_neg = np.unique(np.concatenate(batch_neg))
        batch_src = np.array(batch_src, dtype="int64")
        batch_dst = np.array(batch_dst, dtype="int64")

        # Draw as many extra negatives as there are positive targets.
        sampled_batch_neg = alias_sample(batch_dst.shape, self.alias,
                                         self.events)
        batch_neg = (np.concatenate([batch_neg, sampled_batch_neg], 0)
                     if len(batch_neg) > 0 else sampled_batch_neg)

        # No edges are ignored in any phase.
        ignore_edges = set()

        all_ids = np.concatenate([batch_src, batch_dst, batch_neg], 0)
        nodes = np.unique(all_ids)
        subgraphs = graphsage_sample(self.graph,
                                     nodes,
                                     self.samples,
                                     ignore_edges=ignore_edges)

        feed_dict = {}
        for layer in range(self.num_layers):
            wrapper = self.graph_wrappers[layer]
            feed_dict.update(wrapper.to_feed(subgraphs[layer]))

        # only reindex from first subgraph
        first_subgraph = subgraphs[0]
        reindex = first_subgraph.reindex_from_parrent_nodes
        for key, parent_ids in (("user_index", batch_src),
                                ("item_index", batch_dst),
                                ("neg_item_index", batch_neg)):
            feed_dict[key] = np.array(reindex(parent_ids), dtype="int64")
        feed_dict["term_ids"] = self.term_ids[first_subgraph.node_feat["index"]]
        return feed_dict
Пример #3
0
 def choose_neighbor_alias_method(self, node, layer):
     """
     Choose one neighbor of ``node`` in ``layer`` by alias sampling.

     A single index is drawn with ``alias_sample`` from the tables stored in
     ``self.sample_alias[layer][node]`` and
     ``self.sample_events[layer][node]`` and used to pick the entry of
     ``self.layer_message[layer][node]`` that is returned.
     """
     neighbors = self.layer_message[layer][node]
     select_idx = alias_sample(1, self.sample_alias[layer][node],
                               self.sample_events[layer][node])
     return neighbors[select_idx[0]]
Пример #4
0
    def test_speed(self):
        """Alias sampling (table build included) must beat np.random.choice.

        Both samplers draw 100 batches of shape ``[10240, 1, 5]`` from the
        same random distribution over 1000 outcomes; the comparison is on
        wall-clock time.
        """
        rounds = 100
        num = 1000
        size = [10240, 1, 5]
        probs = np.random.uniform(0.0, 1.0, [num])
        probs /= np.sum(probs)

        # Alias method: the table construction is part of the measured time.
        tic = time.time()
        alias, events = alias_sample_build_table(probs)
        for _ in range(rounds):
            alias_sample(size, alias, events)
        alias_sample_time = time.time() - tic

        # Baseline: numpy's built-in weighted choice.
        tic = time.time()
        for _ in range(rounds):
            np.random.choice(num, size, p=probs)
        np_sample_time = time.time() - tic

        self.assertTrue(alias_sample_time < np_sample_time)
Пример #5
0
 def test_resut(self):
     """Sampled frequencies must follow the order of the probabilities.

     The distribution is proportional to 1..9 over outcomes 0..8, so after
     450000 draws the outcomes sorted by how often they occurred are
     expected to be exactly 0, 1, ..., 8.
     """
     num = 10
     size = [450000]
     probs = np.arange(1, num).astype(np.float64)
     probs /= np.sum(probs)

     alias, events = alias_sample_build_table(probs)
     ret = alias_sample(size, alias, events)

     cnt = Counter(ret)
     # Order outcomes by (count, outcome), least frequent first.
     by_frequency = sorted(cnt.items(), key=lambda kv: (kv[1], kv[0]))
     sort_cnt_keys = [outcome for outcome, _ in by_frequency]
     expected = np.arange(0, num - 1).tolist()
     self.assertEqual(sort_cnt_keys, expected)
Пример #6
0
    def __call__(self):
        """Generate skip-gram ``(src, dst)`` id batches from random walks.

        For every group of walks produced by ``self.walk_generator()`` the
        walks are turned into (source, positive) pairs with
        ``skip_gram_gen_pair`` and negatives are added according to
        ``self.neg_sample_type``:

        * ``"average"``   -- uniform ids in ``[0, self.graph.num_nodes)``;
        * ``"outdegree"`` -- alias sampling proportional to node out-degree;
        * ``"inbatch"``   -- no negatives are sampled here, ``dst`` holds
          only the positives.

        Any exception raised while building a batch is logged with
        ``log.exception`` and the batch is skipped.

        Yields:
            tuple: ``(src, dst)`` int64 arrays truncated to at most
            ``max_len`` rows; ``src`` is ``[n, 1, 1]`` and ``dst`` is
            ``[n, neg_num + 1, 1]`` (``[n, 1, 1]`` for ``"inbatch"``).
        """
        # Seed numpy's global RNG with the process id
        # (presumably so parallel reader processes draw different samples).
        np.random.seed(os.getpid())
        if self.neg_sample_type == "outdegree":
            outdegree = self.graph.outdegree()
            distribution = 1. * outdegree / outdegree.sum()
            alias, events = alias_sample_build_table(distribution)
        # Upper bound on the number of pairs kept per walk and per batch.
        max_len = int(self.batch_size * self.walk_len *
                      ((1 + self.win_size) - 0.3))
        for walks in self.walk_generator():
            try:
                src_list, pos_list = [], []
                for walk in walks:
                    s, p = skip_gram_gen_pair(walk, self.win_size)
                    src_list.append(s[:max_len])
                    pos_list.append(p[:max_len])
                src = [s for x in src_list for s in x]
                pos = [s for x in pos_list for s in x]
                # NOTE: the original had a stray trailing comma here that
                # turned ``src`` into a 1-tuple; build proper arrays instead.
                src = np.array(src, dtype=np.int64).reshape([-1, 1, 1])
                pos = np.array(pos, dtype=np.int64).reshape([-1, 1, 1])

                if src.shape[0] == 0:
                    continue
                neg_sample_size = [len(pos), self.neg_num, 1]
                if self.neg_sample_type == "average":
                    negs = np.random.randint(low=0,
                                             high=self.graph.num_nodes,
                                             size=neg_sample_size)
                elif self.neg_sample_type == "outdegree":
                    negs = alias_sample(neg_sample_size, alias, events)
                elif self.neg_sample_type == "inbatch":
                    # NOTE(review): this branch used to leave ``negs``
                    # unbound, so every batch failed and was silently
                    # dropped.  Emitting positives only assumes the consumer
                    # builds negatives from the batch itself -- confirm.
                    negs = None
                else:
                    raise ValueError("Unsupported neg_sample_type: %r" %
                                     (self.neg_sample_type, ))
                # [batch_size, 1, 1] [batch_size, neg_num+1, 1]
                if negs is None:
                    dst = pos
                else:
                    dst = np.concatenate([pos, negs], 1)
                yield src[:max_len], dst[:max_len]
            except Exception as e:
                # Best effort: log the failure and move on to the next batch.
                log.exception(e)
Пример #7
0
    def batch_fn(self, batch_ex):
        """Turn a list of examples into the arrays fed to the model.

        Each element of ``batch_ex`` is indexed as ``(src, dst)`` or
        ``(src, dst, neg)``.  Negatives are either the positive targets of
        the batch (``self.neg_type == "batch_neg"``) or the de-duplicated
        negatives supplied with the examples extended by ids drawn with
        ``alias_sample``.  The union of all ids is passed to
        ``graphsage_sample`` and the batch ids are re-indexed into the first
        returned subgraph.

        Returns:
            tuple or None: ``None`` when the batch does not contain exactly
            ``self.batch_size`` examples and ``self.phase == "train"``.
            Otherwise ``(num_nodes, num_edges, edges, node_index, term_ids,
            user_index, pos_item_index, neg_item_index, user_real_index,
            pos_item_real_index)``, followed by ``fake_label`` (all ones,
            same shape as ``user_index``) when ``self.phase == "train"``.
        """
        batch_src = []
        batch_dst = []
        batch_neg = []
        for example in batch_ex:
            batch_src.append(example[0])
            batch_dst.append(example[1])
            if len(example) == 3:  # default neg samples
                batch_neg.append(example[2])

        # Incomplete batches are dropped, but only while training.
        if len(batch_src) != self.batch_size and self.phase == "train":
            return None

        if len(batch_neg) > 0:
            batch_neg = np.unique(np.concatenate(batch_neg))
        batch_src = np.array(batch_src, dtype="int64")
        batch_dst = np.array(batch_dst, dtype="int64")

        if self.neg_type == "batch_neg":
            batch_neg = batch_dst
        else:
            # TODO user define shape of neg_sample
            neg_shape = batch_dst.shape
            sampled_batch_neg = alias_sample(neg_shape, self.alias,
                                             self.events)
            if len(batch_neg) > 0:
                batch_neg = np.concatenate([batch_neg, sampled_batch_neg], 0)
            else:
                # Concatenating with an empty Python list would promote the
                # node ids to float64; use the sampled ids directly.
                batch_neg = sampled_batch_neg

        # TODO user define ignore edges or not (a candidate for training is
        # every (src, dst) pair of the batch in both directions).
        ignore_edges = set()

        nodes = np.unique(np.concatenate([batch_src, batch_dst, batch_neg], 0))
        subgraphs = graphsage_sample(self.graph,
                                     nodes,
                                     self.samples,
                                     ignore_edges=ignore_edges)
        subgraph = subgraphs[0]
        # Store the parent-graph id of every subgraph node and the matching
        # rows of ``self.term_ids`` as node features.
        subgraph.node_feat["index"] = subgraph.reindex_to_parrent_nodes(
            subgraph.nodes).astype(np.int64)
        subgraph.node_feat["term_ids"] = self.term_ids[
            subgraph.node_feat["index"]].astype(np.int64)

        # only reindex from first subgraph
        sub_src_idx = subgraph.reindex_from_parrent_nodes(batch_src)
        sub_dst_idx = subgraph.reindex_from_parrent_nodes(batch_dst)
        sub_neg_idx = subgraph.reindex_from_parrent_nodes(batch_neg)

        user_index = np.array(sub_src_idx, dtype="int64")
        pos_item_index = np.array(sub_dst_idx, dtype="int64")
        neg_item_index = np.array(sub_neg_idx, dtype="int64")

        user_real_index = np.array(batch_src, dtype="int64")
        pos_item_real_index = np.array(batch_dst, dtype="int64")

        num_nodes = np.array([len(subgraph.nodes)], np.int32)
        num_edges = np.array([len(subgraph.edges)], np.int32)
        edges = subgraph.edges
        node_feat = subgraph.node_feat

        outputs = (num_nodes, num_edges, edges, node_feat["index"],
                   node_feat["term_ids"], user_index, pos_item_index,
                   neg_item_index, user_real_index, pos_item_real_index)
        if self.phase == "train":
            # pairwise training with label 1.
            fake_label = np.ones_like(user_index)
            return outputs + (fake_label, )
        return outputs