def __init_kmeansPP(self, X):
    # k-means++ initialisation: the first centroid is drawn uniformly;
    # each subsequent centroid is drawn with probability proportional
    # to the squared distance to the closest centroid chosen so far.
    distribution = torch.ones(len(X)) / len(X)
    frequency = pytorch_categorical.Categorical(distribution)
    centroids_index = []
    N, D = X.shape
    while (len(centroids_index) != self._n_c):
        f = frequency.sample(sample_shape=(1, 1)).item()
        if (f not in centroids_index):
            centroids_index.append(f)
            centroids = X[centroids_index]
            # distance from every point to every current centroid
            x = X.unsqueeze(1).expand(N, len(centroids_index), D)
            dst = self._distance(centroids, x)
            # keep only the distance to the closest centroid
            value, _ = dst.min(-1)
            vs = value**2
            distribution = vs / vs.sum()
            frequency = pytorch_categorical.Categorical(distribution)
    self.centroids_index = torch.tensor(centroids_index,
                                        device=X.device).long()
    self.centroids = X[self.centroids_index]
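
For readers who want to try the initializer outside its class, here is a minimal standalone sketch of the same k-means++ scheme; the Euclidean distance and the demo call at the end are assumptions, since the original method delegates to the class's own `self._distance`:

import torch
import pytorch_categorical

def kmeans_pp_init(X, n_clusters):
    # Standalone sketch of the initializer above (assumption:
    # Euclidean distance stands in for the class's self._distance).
    N, D = X.shape
    frequency = pytorch_categorical.Categorical(torch.ones(N) / N)
    chosen = []
    while len(chosen) != n_clusters:
        f = frequency.sample(sample_shape=(1, 1)).item()
        if f not in chosen:
            chosen.append(f)
            centroids = X[chosen]
            x = X.unsqueeze(1).expand(N, len(chosen), D)
            dst = ((x - centroids)**2).sum(-1).sqrt()
            value, _ = dst.min(-1)
            vs = value**2
            frequency = pytorch_categorical.Categorical(vs / vs.sum())
    return torch.tensor(chosen).long()

# hypothetical demo on random 2-D points
centroid_idx = kmeans_pp_init(torch.randn(200, 2), n_clusters=4)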
Example #2
alpha, beta = args.init_alpha, args.init_beta

print("Loading Corpus ")
D, X, Y = dataset_dict[args.dataset]()
print("Creating dataset")
# dataset holding the index of each example
dataset_index = corpora_tools.from_indexable(
    torch.arange(0, len(D), 1).unsqueeze(-1))
print("Dataset Size -> ", len(D))

D.set_path(False)

# negative sampling distribution: unigram frequencies raised to the
# 3/4 power (word2vec-style smoothing), counts normalised in column 1
frequency = D.getFrequency()**(3 / 4)
frequency[:, 1] /= frequency[:, 1].sum()
frequency = pytorch_categorical.Categorical(frequency[:, 1])
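# Hypothetical usage (not in the original script): negative example
# indices could now be drawn from this sampler, e.g.
# negatives = frequency.sample(sample_shape=(1, 10))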
# random walk dataset
d_rw = D.light_copy()

rw_log = logger.JSONLogger("ressources/random_walk.conf", mod="continue")
if (args.force_rw):
    # cache key identifying this random-walk configuration
    key = args.dataset + "_" + str(args.context_size) + "_" + str(
        args.walk_lenght) + "_" + str(args.seed)
    if (key in rw_log):
        try:
            print('Loading random walks from files')
            d_rw = torch.load(rw_log[key]["file"])
            print('Loaded')
        except Exception:
            os.makedirs("/local/gerald/KMEANS_RESULTS/", exist_ok=True)
Example #3
alpha, beta = args.init_alpha, args.init_beta

print("Loading Corpus ")
D, X, Y = dataset_dict[args.dataset]()
print("Creating dataset")
# dataset holding the index of each example
dataset_index = corpora_tools.from_indexable(
    torch.arange(0, len(D), 1).unsqueeze(-1))
print("Dataset Size -> ", len(D))
D.set_path(False)

# negative sampling distribution: the 3/4-power smoothed frequencies
# are computed, but this variant passes a uniform distribution to the
# sampler instead
frequency = D.getFrequency()**(3 / 4)
frequency[:, 1] /= frequency[:, 1].sum()
frequency = pytorch_categorical.Categorical(
    torch.ones(len(frequency)) / len(frequency))
# random walk dataset
d_rw = D.light_copy()
d_rw.set_walk(args.walk_lenght, 1.0)
d_rw.set_path(True)
d_rw = corpora.ContextCorpus(d_rw,
                             context_size=args.context_size,
                             precompute=args.precompute_rw)
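# Note (assumption, not stated in the script): ContextCorpus
# presumably yields (node, context) pairs extracted from the
# precomputed random walks.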
# neighbor dataset (walks of length 1)
d_v = D.light_copy()
d_v.set_walk(1, 1.0)

# sanity check: size of one sample from the random-walk dataset
print(d_rw[1][0].size())

print("Merging dataset")
embedding_dataset = corpora_tools.zip_datasets(