def __sample_neighbor_nodes(self, sample_size, nodeId):
    """
    Draw a random collection of neighbor node ids for ``nodeId``.

    Candidate ids are requested from ``random.sample_range`` (stream
    "neighbor sampler") in batches of ``sample_size * 2`` out of
    ``self._N``.  A candidate is rejected when it equals ``nodeId``, when
    the edge it forms with ``nodeId`` is in the held-out or test set of
    ``self._network``, or when it has already been picked.

    NOTE: the scan of a batch only stops once the countdown is strictly
    negative, so the result holds ``sample_size`` or ``sample_size + 1``
    ids.  This is kept as is -- presumably so runs match the C++
    implementation; confirm before changing it.

    Returns the picked ids as a ``Set``, or as a sorted list when
    ``self._compatibility_mode`` is true.
    """
    held_out_set = self._network.get_held_out_set()
    test_set = self._network.get_test_set()

    picked = Set()
    remaining = sample_size
    while remaining > 0:
        candidates = random.sample_range("neighbor sampler", self._N,
                                         sample_size * 2)
        if self._compatibility_mode:
            # A fixed visiting order makes the run replayable from C++.
            candidates = sorted(candidates)

        for candidate in candidates:
            if remaining < 0:
                # Disabled diagnostic; "p" in the text is the countdown.
                if False:
                    print(sys._getframe().f_code.co_name + ": Are you sure p < 0 is a good idea?")
                break
            if candidate == nodeId:
                continue

            # Edges are keyed as (smaller id, larger id).
            edge = (min(nodeId, candidate), max(nodeId, candidate))
            reserved = edge in held_out_set or edge in test_set
            if reserved or candidate in picked:
                continue

            picked.add(candidate)
            remaining -= 1

    # Sorted output, again for replay from C++.
    return sorted(picked) if self._compatibility_mode else picked
def __sample_neighbor_nodes(self, sample_size, nodeId):
    """
    Draw a random collection of neighbor node ids for ``nodeId``.

    Candidate ids are requested from ``random.sample_range`` (stream
    "neighbor sampler") in batches of ``sample_size * 2`` out of
    ``self._N``.  A candidate is rejected when it equals ``nodeId``, when
    the edge it forms with ``nodeId`` is in the held-out or test set of
    ``self._network``, or when it has already been picked.

    NOTE: the scan of a batch only stops once the countdown is strictly
    negative, so the result holds ``sample_size`` or ``sample_size + 1``
    ids.  This is kept as is -- presumably so runs match the C++
    implementation; confirm before changing it.

    Returns the picked ids as a ``Set``, or as a sorted list when
    ``self._compatibility_mode`` is true.
    """
    held_out_set = self._network.get_held_out_set()
    test_set = self._network.get_test_set()

    picked = Set()
    remaining = sample_size
    while remaining > 0:
        candidates = random.sample_range("neighbor sampler", self._N,
                                         sample_size * 2)
        if self._compatibility_mode:
            # A fixed visiting order makes the run replayable from C++.
            candidates = sorted(candidates)

        for candidate in candidates:
            if remaining < 0:
                # Disabled diagnostic; "p" in the text is the countdown.
                if False:
                    print(sys._getframe().f_code.co_name + ": Are you sure p < 0 is a good idea?")
                break
            if candidate == nodeId:
                continue

            # Edges are keyed as (smaller id, larger id).
            edge = (min(nodeId, candidate), max(nodeId, candidate))
            reserved = edge in held_out_set or edge in test_set
            if reserved or candidate in picked:
                continue

            picked.add(candidate)
            remaining -= 1

    # Sorted output, again for replay from C++.
    return sorted(picked) if self._compatibility_mode else picked
def __stratified_random_node_sampling(self, num_pieces):
    """
    Build a mini-batch of edges around one randomly chosen node.

    Steps, all driven by the "minibatch sampler" random stream:
      a) draw a node id uniformly from 0 .. N-1;
      b) draw a 0/1 flag to choose between link and non-link edges;
      c) flag 1: return every edge listed for the node in the training
         link map, with scale N;
         flag 0: collect edges from the node to randomly drawn other
         nodes, skipping edges that are linked, held out, in the test
         map, or already collected.  The target count is
         int(N / self.__num_pieces) and the scale is
         N * self.__num_pieces.

    NOTE: the ``num_pieces`` argument is not read; the instance attribute
    ``self.__num_pieces`` is used instead.

    NOTE: the non-link scan only stops once its countdown is strictly
    negative, so that branch may return one edge more than the target.

    Returns a tuple ``(edge_set, scale)``; each edge is a
    ``(smaller id, larger id)`` pair.  A one-line summary is printed
    before returning.
    """
    # Draw order matters for reproducibility: node first, then the flag.
    nodeId = random.get("minibatch sampler").randint(0, self.__N - 1)
    want_links = random.get("minibatch sampler").randint(0, 1) != 0

    if want_links:
        # Link branch: take all training links of the chosen node.
        link_edges = Set()
        for linkedId in self.__train_link_map[nodeId]:
            link_edges.add((min(nodeId, linkedId), max(nodeId, linkedId)))
        print("B Create mini batch size " + str(len(link_edges)) + " scale " + str(self.__N))
        return (link_edges, self.__N)

    # Non-link branch.  The target size is an approximation; the original
    # author notes the node's link list is much smaller than N.
    target = int(self.__N / self.__num_pieces)
    non_link_edges = Set()
    remaining = target
    while remaining > 0:
        # Twice the target is requested because, given the sparsity, such
        # a batch likely contains at least `target` valid nodes.
        candidates = random.sample_range("minibatch sampler", self.__N,
                                         target * 2)
        for candidate in candidates:
            if remaining < 0:
                # Disabled diagnostic; "p" in the text is the countdown.
                if False:
                    print(sys._getframe().f_code.co_name + ": Are you sure p < 0 is a good idea?")
                break
            if candidate == nodeId:
                continue

            edge = (min(nodeId, candidate), max(nodeId, candidate))
            unusable = (edge in self.__linked_edges
                        or edge in self.__held_out_map
                        or edge in self.__test_map
                        or edge in non_link_edges)
            if unusable:
                continue

            non_link_edges.add(edge)
            remaining -= 1

    scale = self.__N * self.__num_pieces
    print("A Create mini batch size " + str(len(non_link_edges)) + " scale " + str(scale))
    return (non_link_edges, scale)