def pretrain_begin_iteration(self):
    """Run the parent hook, then append one list of triangular samples to ``self._neg``.

    Candidate positives are those entries ``p`` of ``self._pos[0]`` with
    ``p[0] + 1 < self._pos_range[1]`` (i.e. everything except the last time
    slice).  The appended list always has ``len(self._pos[0])`` entries:

    * if there is no candidate, it is filled with ``None`` placeholders;
    * otherwise slices of the candidates are pushed through a
      ``utils.ParMap`` round after round until enough samples were returned,
      and the surplus is cut off.

    Raises:
        NotImplementedError: if there are candidates and ``self.__enable_cache``
            is truthy (cached triangular sampling is not implemented).
    """
    super(Sampler, self).pretrain_begin_iteration()

    wanted = len(self._pos[0])

    # sample triangular data
    # except the last time slice!
    candidates = [p for p in self._pos[0] if p[0] + 1 < self._pos_range[1]]

    if not candidates:
        print(
            "No possible triangular samples, given positive range {} to {}"
            .format(self._pos_range[0], self._pos_range[1]))
        # in order to pass assertion in datagen_pos_neg.batches()
        triads = [None] * wanted
    elif self.__enable_cache:
        raise NotImplementedError()
    else:
        runner = utils.ParMap(self.__uncached_sampler_factory(),
                              self.__sample_uncached_monitor,
                              njobs=gconf.njobs)
        triads = []
        round_no = 0
        while len(triads) < wanted:
            missing = wanted - len(triads)
            # verboses
            print("sample round {}, target #samples {}".format(
                round_no, missing))
            round_no += 1

            # increase the probability of finish sampling in a single round:
            # scale the request by the observed trials-per-success ratio
            # plus a 0.2 margin.
            # NOTE(review): divides by self.__succ_trial; assumes it is
            # non-zero on entry -- confirm where it is initialised.
            oversample = float(self.__all_trial) / self.__succ_trial + 0.2
            request = int(missing * oversample)
            if request < 100:
                # small requests are bumped to 100 and run with a single job
                request = 100
                runner.njobs = 1

            # random window of at most `request` candidates
            # NOTE(review): crandint may receive a non-positive argument
            # when request >= len(candidates); behaviour depends on
            # utils.crandint -- TODO confirm.
            start = max(0, utils.crandint(len(candidates) - request))
            stop = min(start + request, len(candidates))
            fresh = runner.run(candidates[start:stop])

            self.__all_trial += (stop - start)
            self.__succ_trial += len(fresh)
            triads.extend(fresh)

        # drop whatever the last round produced beyond the target
        triads = triads[:wanted]

    self._neg.append(triads)  # neg, triangdata_int, triangdata_float
def __make_neg(self, posdata, negdup=1):
    """Draw ``negdup`` negative (non-edge) pairs for every positive sample.

    For each ``(k, src, tgt)`` in ``posdata`` and each of the ``negdup``
    draws, either the source or the target (chosen by ``utils.crandint(2)``)
    is replaced:

    * with caching enabled, the replacement is picked at random from
      ``self._rep_cache(k)[kept_endpoint]``;
    * otherwise vertices of ``self.dataset.gtgraphs[k]`` are drawn at random
      until ``self.dataset.mygraphs[k]`` reports no edge between the
      replacement and the kept endpoint.

    Args:
        posdata: sized iterable of ``(k, src, tgt)`` triples.
        negdup: number of negative pairs generated per positive sample.

    Returns:
        ``numpy`` array of shape ``(len(posdata), 2 * negdup)``; each row is
        the concatenation of ``negdup`` ``[src, tgt]`` pairs.  An empty
        ``posdata`` yields an empty integer array of shape
        ``(0, 2 * negdup)``.
    """
    width = 2 * negdup
    if len(posdata) == 0:
        # np.array([]) has shape (0,), which would trip the shape assertion
        # below; build the correctly shaped empty result explicitly.
        return np.empty((0, width), dtype=int)

    # TODO: this is an ugly fix, try to add indexing support in mygraph
    nodenames = list(self.dataset.gtgraphs['any'].vp['name'])

    rows = []
    for k, src, tgt in posdata:
        row = []
        for _ in range(negdup):
            if utils.crandint(2) == 0:  # replace source
                if self.__enable_cache:
                    curcache = self._rep_cache(k)[tgt]
                    new_src = curcache[utils.crandint(len(curcache))]
                else:
                    # TODO: although it is almost impossible for a node to
                    # have all edges, check this in advance (otherwise the
                    # rejection loop below never terminates)
                    gtgraph = self.dataset.gtgraphs[k]
                    mygraph = self.dataset.mygraphs[k]
                    new_src = utils.crandint(gtgraph.num_vertices())
                    assert not gtgraph.is_directed()
                    while mygraph.exists(nodenames[new_src], nodenames[tgt]):
                        new_src = utils.crandint(gtgraph.num_vertices())
                row.extend([new_src, tgt])
            else:  # replace target
                if self.__enable_cache:
                    curcache = self._rep_cache(k)[src]
                    new_tgt = curcache[utils.crandint(len(curcache))]
                else:
                    gtgraph = self.dataset.gtgraphs[k]
                    mygraph = self.dataset.mygraphs[k]
                    new_tgt = utils.crandint(gtgraph.num_vertices())
                    while mygraph.exists(nodenames[src], nodenames[new_tgt]):
                        new_tgt = utils.crandint(gtgraph.num_vertices())
                row.extend([src, new_tgt])
        rows.append(row)

    negdata = np.array(rows)
    assert negdata.shape == (len(posdata), width), "{}, {}".format(
        negdata.shape, (len(posdata), width))
    return negdata
def __sample_one_uncached(data, nodenames, name2idx, mygraphs, localstep):
    """Try to build one triangular sample from a positive edge ``(k, src, tgt)``.

    One endpoint of the edge is chosen at random as the *key point*; the
    other endpoint is the *other* vertex.  A third vertex is then drawn from
    the key point's out-neighbours in ``mygraphs[k - localstep]`` such that
    it differs from both endpoints and is not connected to the other vertex.
    Random draws are retried up to 5 times; after that the valid candidates
    are enumerated and one of them is picked.

    Args:
        data: triple ``(k, src, tgt)``; values are converted to ``int``.
        nodenames: maps a vertex index to the name used by the graphs.
        name2idx: maps a vertex name back to its index.
        mygraphs: indexable collection of graphs; positions
            ``k - localstep`` and ``k + 1 - localstep`` are read.
        localstep: offset subtracted from ``k`` when indexing ``mygraphs``.

    Returns:
        ``(sample, trycnt)``.  ``sample`` is ``None`` when the key point has
        no usable neighbour, otherwise the list
        ``[k, key, other, new, linked_next, e_key_other, e_key_new]`` where
        ``linked_next`` is ``mygraphs[k + 1 - localstep].exists(other, new)``
        and the last two entries are the values of ``myg.edge`` for the two
        existing edges (presumably edge weights -- TODO confirm).
        ``trycnt`` is the number of random redraws performed.
    """
    max_tries = 5

    # convert from np types to int, to avoid problems in c extensions
    k, src, tgt = [int(d) for d in data]
    localstep = int(localstep)

    myg = mygraphs[k - localstep]
    mynextg = mygraphs[k + 1 - localstep]

    # 0 -> target as key point, otherwise source as key point
    if utils.crandint(2) == 0:
        key, other = tgt, src
    else:
        key, other = src, tgt
    key_name = nodenames[key]
    other_name = nodenames[other]

    def usable(v):
        # a third vertex must be new, adjacent to the key point and not
        # adjacent to the other endpoint (open triad at step k)
        return (v != key and v != other
                and myg.exists(key_name, nodenames[v])
                and not myg.exists(other_name, nodenames[v]))

    nbr = myg.out_neighbours(key_name)
    trycnt = 0
    new = name2idx[nbr[utils.crandint(len(nbr))]]
    while not usable(new) and trycnt < max_tries:
        new = name2idx[nbr[utils.crandint(len(nbr))]]
        trycnt += 1

    if trycnt >= max_tries:
        # random probing gave up: enumerate the valid neighbours instead
        cand = [name2idx[n] for n in nbr]
        cand = [
            n for n in cand
            if n != other and n != key
            and not myg.exists(nodenames[n], other_name)
        ]
        if len(cand) <= 0:
            return None, trycnt
        new = cand[utils.crandint(len(cand))]

    new_name = nodenames[new]
    ret = [
        k, key, other, new,
        mynextg.exists(other_name, new_name),
        myg.edge(key_name, other_name),
        myg.edge(key_name, new_name)
    ]
    # the three vertices must be distinct and both key-point edges must
    # carry a positive value (the original checked ret[5] twice)
    assert len(set(ret[1:4])) == 3 and ret[5] > 0 and ret[6] > 0, ret
    return ret, trycnt