def __init_kmeansPP(self, X):
    # k-means++ initialisation: the first centroid is drawn uniformly, then
    # each further candidate is drawn with probability proportional to its
    # squared distance to the closest centroid selected so far.
    distribution = torch.ones(len(X)) / len(X)
    frequency = pytorch_categorical.Categorical(distribution)
    centroids_index = []
    N, D = X.shape
    while len(centroids_index) != self._n_c:
        f = frequency.sample(sample_shape=(1, 1)).item()
        if f not in centroids_index:
            centroids_index.append(f)
        centroids = X[centroids_index]
        x = X.unsqueeze(1).expand(N, len(centroids_index), D)
        # distance of every point to every centroid selected so far
        dst = self._distance(centroids, x)
        value, indexes = dst.min(-1)
        vs = value ** 2
        distribution = vs / vs.sum()
        frequency = pytorch_categorical.Categorical(distribution)
    self.centroids_index = torch.tensor(centroids_index, device=X.device).long()
    self.centroids = X[self.centroids_index]
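# A minimal, self-contained sketch of the same k-means++ seeding scheme,
# using torch.distributions.Categorical instead of the repo-specific
# pytorch_categorical wrapper. The Euclidean distance (torch.cdist) is an
# assumption for illustration only; the method above delegates to
# self._distance, which may be a hyperbolic metric.
import torch


def kmeanspp_init(X, n_clusters):
    """Return the indices of n_clusters seeds chosen k-means++ style."""
    N, _ = X.shape
    probs = torch.ones(N) / N  # the first seed is drawn uniformly
    chosen = []
    while len(chosen) < n_clusters:
        idx = torch.distributions.Categorical(probs).sample().item()
        if idx not in chosen:
            chosen.append(idx)
        # squared distance of every point to its closest selected seed
        d2 = torch.cdist(X, X[chosen]).min(dim=-1).values ** 2
        probs = d2 / d2.sum()
    return torch.tensor(chosen, dtype=torch.long)


# usage: pick 4 seeds among 100 random 2-D points
seeds = kmeanspp_init(torch.randn(100, 2), 4)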
alpha, beta = args.init_alpha, args.init_beta
print("Loading Corpus ")
D, X, Y = dataset_dict[args.dataset]()
print("Creating dataset")

# index of examples dataset
dataset_index = corpora_tools.from_indexable(
    torch.arange(0, len(D), 1).unsqueeze(-1))
print("Dataset Size -> ", len(D))
D.set_path(False)

# negative sampling distribution: unigram frequency raised to 3/4
frequency = D.getFrequency()**(3 / 4)
frequency[:, 1] /= frequency[:, 1].sum()
frequency = pytorch_categorical.Categorical(frequency[:, 1])

# random walk dataset: reuse precomputed walks when available
d_rw = D.light_copy()
rw_log = logger.JSONLogger("ressources/random_walk.conf", mod="continue")
if args.force_rw:
    key = args.dataset + "_" + str(args.context_size) + "_" + \
        str(args.walk_lenght) + "_" + str(args.seed)
    if key in rw_log:
        try:
            print('Loading random walks from files')
            d_rw = torch.load(rw_log[key]["file"])
            print('Loaded')
        except Exception:
            os.makedirs("/local/gerald/KMEANS_RESULTS/", exist_ok=True)
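# A standalone sketch of the negative-sampling distribution built above: raw
# node frequencies are raised to the power 3/4 (the word2vec smoothing
# heuristic), which flattens the distribution so that rare nodes are drawn
# more often than their raw counts would allow. The counts below are made up
# for illustration; in the repo they come from D.getFrequency().
import torch

counts = torch.tensor([1000.0, 100.0, 10.0, 1.0])  # hypothetical frequencies
smoothed = counts ** (3 / 4)
probs = smoothed / smoothed.sum()
sampler = torch.distributions.Categorical(probs)
negatives = sampler.sample((5,))  # draw 5 negative node indices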
alpha, beta = args.init_alpha, args.init_beta
print("Loading Corpus ")
D, X, Y = dataset_dict[args.dataset]()
print("Creating dataset")

# index of examples dataset
dataset_index = corpora_tools.from_indexable(
    torch.arange(0, len(D), 1).unsqueeze(-1))
print("Dataset Size -> ", len(D))
D.set_path(False)

# negative sampling distribution (note: the 3/4-smoothed frequencies are
# computed but the sampler below is uniform in this variant)
frequency = D.getFrequency()**(3 / 4)
frequency[:, 1] /= frequency[:, 1].sum()
frequency = pytorch_categorical.Categorical(
    torch.ones(len(frequency)) / len(frequency))

# random walk dataset
d_rw = D.light_copy()
d_rw.set_walk(args.walk_lenght, 1.0)
d_rw.set_path(True)
d_rw = corpora.ContextCorpus(d_rw,
                             context_size=args.context_size,
                             precompute=args.precompute_rw)

# neighbor dataset
d_v = D.light_copy()
d_v.set_walk(1, 1.0)

print(d_rw[1][0].size())
print("Merging dataset")
embedding_dataset = corpora_tools.zip_datasets(
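# ContextCorpus is repo-specific; as a rough sketch of what a context-window
# dataset over random walks provides, the helper below pairs each node of a
# walk with its neighbours inside a fixed window, skip-gram style. The name
# and exact pairing are assumptions for illustration, not the repo's actual
# implementation.
def walk_to_pairs(walk, context_size):
    """Yield (center, context) node pairs from one random walk."""
    for i, center in enumerate(walk):
        lo = max(0, i - context_size)
        hi = min(len(walk), i + context_size + 1)
        for j in range(lo, hi):
            if j != i:
                yield center, walk[j]


# usage: a toy walk over node ids with a window of 2
pairs = list(walk_to_pairs([0, 3, 7, 3, 5], context_size=2))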