def __call__(self):
    """Yield skip-gram training batches with negative samples and features.

    For each group of walks produced by ``self.walk_generator()`` the
    (source, positive) pairs returned by ``skip_gram_gen_pair`` are
    collected, ``self.neg_num`` negatives are drawn per pair, and the node
    ids are concatenated with the matching rows of ``self.node_feat``.

    Yields:
        tuple: ``(src_feat, dst_feat)`` numpy arrays, each truncated to at
        most ``max_len`` rows along the first axis.

    Raises:
        ValueError: if a non-empty batch is reached while
            ``self.neg_sample_type`` is neither ``"average"`` nor
            ``"outdegree"``.
    """
    # Seed numpy's global RNG from the process id (presumably so that
    # separate worker processes do not share one random stream).
    np.random.seed(os.getpid())

    if self.neg_sample_type == "outdegree":
        # Negatives are drawn proportionally to node out-degree.
        outdegree = self.graph.outdegree()
        distribution = 1. * outdegree / outdegree.sum()
        alias, events = alias_sample_build_table(distribution)

    max_len = int(self.batch_size * self.walk_len * (
        (1 + self.win_size) - 0.3))

    for walks in self.walk_generator():
        src, pos = [], []
        for walk in walks:
            s, p = skip_gram_gen_pair(walk, self.win_size)
            src.extend(s)
            pos.extend(p)

        # One column vector of ids each: shape [num_pairs, 1, 1].
        src = np.reshape(np.array(src, dtype=np.int64), [-1, 1, 1])
        pos = np.reshape(np.array(pos, dtype=np.int64), [-1, 1, 1])
        if src.shape[0] == 0:
            continue

        neg_sample_size = [len(pos), self.neg_num, 1]
        if self.neg_sample_type == "average":
            negs = self.graph.sample_nodes(neg_sample_size)
        elif self.neg_sample_type == "outdegree":
            negs = alias_sample(neg_sample_size, alias, events)
        else:
            # Fail with a clear message instead of an unbound-variable
            # error on the concatenate below.
            raise ValueError(
                "unsupported neg_sample_type %r; expected 'average' or "
                "'outdegree'" % (self.neg_sample_type, ))

        # src: [num_pairs, 1, 1]; dst: [num_pairs, neg_num + 1, 1]
        # (assuming the sampler returns an array of neg_sample_size).
        dst = np.concatenate([pos, negs], 1)

        # Append the feature row of every node after its id on the last
        # axis, then add a trailing singleton axis.
        src_feat = np.concatenate([src, self.node_feat[src[:, :, 0]]], -1)
        dst_feat = np.concatenate([dst, self.node_feat[dst[:, :, 0]]], -1)
        src_feat = np.expand_dims(src_feat, -1)
        dst_feat = np.expand_dims(dst_feat, -1)
        yield src_feat[:max_len], dst_feat[:max_len]
def batch_fn(self, batch_ex):
    """Assemble one batch of examples into a feed dict.

    Args:
        batch_ex: iterable of examples, each ``(src, dst)`` or
            ``(src, dst, neg)``; the optional third item holds negative
            samples supplied with the example.

    Returns:
        dict or None: the feed dict for the sampled subgraphs plus the
        ``"user_index"``, ``"item_index"``, ``"neg_item_index"`` and
        ``"term_ids"`` entries. ``None`` is returned when the phase is
        ``"train"`` and the batch does not contain exactly
        ``self.batch_size`` examples.
    """
    batch_src = []
    batch_dst = []
    batch_neg = []
    for example in batch_ex:
        batch_src.append(example[0])
        batch_dst.append(example[1])
        if len(example) == 3:  # example carries its own negative samples
            batch_neg.append(example[2])

    # Incomplete batches are skipped, but only while training.
    if len(batch_src) != self.batch_size and self.phase == "train":
        return None

    if len(batch_neg) > 0:
        batch_neg = np.unique(np.concatenate(batch_neg))
    batch_src = np.array(batch_src, dtype="int64")
    batch_dst = np.array(batch_dst, dtype="int64")

    # Always draw one extra negative per positive from the alias table.
    sampled_batch_neg = alias_sample(batch_dst.shape, self.alias,
                                     self.events)
    if len(batch_neg) > 0:
        batch_neg = np.concatenate([batch_neg, sampled_batch_neg], 0)
    else:
        batch_neg = sampled_batch_neg

    # No edges are excluded from neighbour sampling in any phase.
    ignore_edges = set()

    nodes = np.unique(np.concatenate([batch_src, batch_dst, batch_neg], 0))
    subgraphs = graphsage_sample(
        self.graph, nodes, self.samples, ignore_edges=ignore_edges)

    feed_dict = {}
    for i in range(self.num_layers):
        feed_dict.update(self.graph_wrappers[i].to_feed(subgraphs[i]))

    # Only reindex from the first subgraph.
    first_subgraph = subgraphs[0]
    sub_src_idx = first_subgraph.reindex_from_parrent_nodes(batch_src)
    sub_dst_idx = first_subgraph.reindex_from_parrent_nodes(batch_dst)
    sub_neg_idx = first_subgraph.reindex_from_parrent_nodes(batch_neg)

    feed_dict["user_index"] = np.array(sub_src_idx, dtype="int64")
    feed_dict["item_index"] = np.array(sub_dst_idx, dtype="int64")
    feed_dict["neg_item_index"] = np.array(sub_neg_idx, dtype="int64")
    feed_dict["term_ids"] = self.term_ids[first_subgraph.node_feat["index"]]
    return feed_dict
def choose_neighbor_alias_method(self, node, layer):
    """Choose one neighbor of ``node`` in ``layer`` via alias sampling.

    Args:
        node: key of the current node in the per-layer tables.
        layer: key of the layer whose tables are used.

    Returns:
        The entry of ``self.layer_message[layer][node]`` selected by a
        single draw from the alias table stored for ``(layer, node)``.
    """
    neighbors = self.layer_message[layer][node]
    # Draw a single index from this node's alias/events tables.
    select_idx = alias_sample(1, self.sample_alias[layer][node],
                              self.sample_events[layer][node])
    return neighbors[select_idx[0]]
def test_speed(self):
    """Check that alias sampling is faster than ``np.random.choice``.

    The alias timing deliberately includes building the table, so the
    comparison covers the whole cost of the alias method.
    NOTE(review): this is a wall-clock comparison and may be sensitive to
    machine load.
    """
    num = 1000
    size = [10240, 1, 5]
    probs = np.random.uniform(0.0, 1.0, [num])
    probs /= np.sum(probs)

    start = time.time()
    alias, events = alias_sample_build_table(probs)
    for _ in range(100):
        alias_sample(size, alias, events)
    alias_sample_time = time.time() - start

    start = time.time()
    for _ in range(100):
        np.random.choice(num, size, p=probs)
    np_sample_time = time.time() - start

    # assertLess reports both timings when the check fails.
    self.assertLess(alias_sample_time, np_sample_time)
def test_resut(self):
    """test_result

    Samples from a distribution whose probabilities grow with the index
    and checks that ranking the outcomes by observed count gives
    ``0, 1, ..., num - 2``.
    """
    num = 10
    size = [450000]

    # Unnormalised weights 1 .. num-1, normalised in place.
    weights = np.arange(1, num).astype(np.float64)
    weights /= np.sum(weights)

    alias, events = alias_sample_build_table(weights)
    samples = alias_sample(size, alias, events)

    freq = Counter(samples)
    # Order the (count, outcome) pairs by count, then keep the outcomes.
    ranked = sorted(zip(freq.values(), freq.keys()))
    ranked_keys = [outcome for _, outcome in ranked]

    expected = np.arange(0, num - 1).tolist()
    self.assertEqual(ranked_keys, expected)
def __call__(self):
    """Yield skip-gram ``(src, dst)`` id batches with negative samples.

    For each group of walks produced by ``self.walk_generator()`` the
    (source, positive) pairs returned by ``skip_gram_gen_pair`` are
    collected (at most ``max_len`` per walk) and ``self.neg_num``
    negatives are drawn per pair.

    Any exception raised while building a batch is logged and that batch
    is skipped; generation then continues with the next group of walks.

    Yields:
        tuple: ``(src, dst)`` int64 arrays truncated to ``max_len`` rows;
        ``src`` has shape ``[n, 1, 1]`` and ``dst`` holds the positive id
        followed by the negatives along axis 1.
    """
    # Seed numpy's global RNG from the process id (presumably so that
    # separate worker processes do not share one random stream).
    np.random.seed(os.getpid())

    if self.neg_sample_type == "outdegree":
        # Negatives are drawn proportionally to node out-degree.
        outdegree = self.graph.outdegree()
        distribution = 1. * outdegree / outdegree.sum()
        alias, events = alias_sample_build_table(distribution)

    max_len = int(self.batch_size * self.walk_len * (
        (1 + self.win_size) - 0.3))

    for walks in self.walk_generator():
        try:
            src, pos = [], []
            for walk in walks:
                s, p = skip_gram_gen_pair(walk, self.win_size)
                src.extend(s[:max_len])
                pos.extend(p[:max_len])

            # One column vector of ids each: shape [num_pairs, 1, 1].
            src = np.reshape(np.array(src, dtype=np.int64), [-1, 1, 1])
            pos = np.reshape(np.array(pos, dtype=np.int64), [-1, 1, 1])
            if src.shape[0] == 0:
                continue

            neg_sample_size = [len(pos), self.neg_num, 1]
            if self.neg_sample_type == "average":
                negs = np.random.randint(
                    low=0, high=self.graph.num_nodes, size=neg_sample_size)
            elif self.neg_sample_type == "outdegree":
                negs = alias_sample(neg_sample_size, alias, events)
            else:
                # NOTE(review): "inbatch" produces no negatives here, so
                # it lands in this branch too. Raising inside the try
                # keeps the log-and-skip behaviour but gives a readable
                # message instead of an unbound-variable error.
                raise ValueError(
                    "neg_sample_type %r produces no negative samples in "
                    "this generator; expected 'average' or 'outdegree'" %
                    (self.neg_sample_type, ))

            # src: [num_pairs, 1, 1]; dst: [num_pairs, neg_num + 1, 1]
            # (assuming the sampler returns an array of neg_sample_size).
            dst = np.concatenate([pos, negs], 1)
            yield src[:max_len], dst[:max_len]
        except Exception as e:
            log.exception(e)
def batch_fn(self, batch_ex):
    """Assemble one batch of examples into the arrays fed to the model.

    Args:
        batch_ex: iterable of examples, each ``(src, dst)`` or
            ``(src, dst, neg)``; the optional third item holds negative
            samples supplied with the example.

    Returns:
        tuple or None: ``(num_nodes, num_edges, edges, index, term_ids,
        user_index, pos_item_index, neg_item_index, user_real_index,
        pos_item_real_index)``, with an all-ones ``fake_label`` appended
        as the last item when the phase is ``"train"``. ``None`` is
        returned when the phase is ``"train"`` and the batch does not
        contain exactly ``self.batch_size`` examples.
    """
    batch_src = []
    batch_dst = []
    batch_neg = []
    for example in batch_ex:
        batch_src.append(example[0])
        batch_dst.append(example[1])
        if len(example) == 3:  # example carries its own negative samples
            batch_neg.append(example[2])

    # Incomplete batches are skipped, but only while training.
    if len(batch_src) != self.batch_size and self.phase == "train":
        return None

    if len(batch_neg) > 0:
        batch_neg = np.unique(np.concatenate(batch_neg))
    batch_src = np.array(batch_src, dtype="int64")
    batch_dst = np.array(batch_dst, dtype="int64")

    if self.neg_type == "batch_neg":
        # Use the positives of the batch as the negative pool.
        batch_neg = batch_dst
    else:
        # TODO user define shape of neg_sample
        neg_shape = batch_dst.shape
        sampled_batch_neg = alias_sample(neg_shape, self.alias, self.events)
        if len(batch_neg) > 0:
            batch_neg = np.concatenate([batch_neg, sampled_batch_neg], 0)
        else:
            # Concatenating with the empty list would promote the ids to
            # float64, so use the sampled negatives as they are.
            batch_neg = sampled_batch_neg

    # TODO user define ignore edges or not (e.g. the src/dst pairs of the
    # batch in both directions); currently nothing is ignored in any phase.
    ignore_edges = set()

    nodes = np.unique(np.concatenate([batch_src, batch_dst, batch_neg], 0))
    subgraphs = graphsage_sample(
        self.graph, nodes, self.samples, ignore_edges=ignore_edges)

    # Only the first subgraph is used from here on.
    subgraph = subgraphs[0]
    subgraph.node_feat["index"] = subgraph.reindex_to_parrent_nodes(
        subgraph.nodes).astype(np.int64)
    subgraph.node_feat["term_ids"] = self.term_ids[subgraph.node_feat[
        "index"]].astype(np.int64)

    # Positions of the batch nodes inside the subgraph.
    sub_src_idx = subgraph.reindex_from_parrent_nodes(batch_src)
    sub_dst_idx = subgraph.reindex_from_parrent_nodes(batch_dst)
    sub_neg_idx = subgraph.reindex_from_parrent_nodes(batch_neg)

    user_index = np.array(sub_src_idx, dtype="int64")
    pos_item_index = np.array(sub_dst_idx, dtype="int64")
    neg_item_index = np.array(sub_neg_idx, dtype="int64")

    # Ids as given in the batch, before reindexing into the subgraph.
    user_real_index = np.array(batch_src, dtype="int64")
    pos_item_real_index = np.array(batch_dst, dtype="int64")

    num_nodes = np.array([len(subgraph.nodes)], np.int32)
    num_edges = np.array([len(subgraph.edges)], np.int32)
    edges = subgraph.edges
    node_feat = subgraph.node_feat

    outputs = (num_nodes, num_edges, edges, node_feat["index"],
               node_feat["term_ids"], user_index, pos_item_index,
               neg_item_index, user_real_index, pos_item_real_index)
    if self.phase == "train":
        # Pairwise training with label 1.
        fake_label = np.ones_like(user_index)
        return outputs + (fake_label, )
    return outputs