def fit(self, X):
    """Fit a Gaussian mixture to X with the EM algorithm.

    Reads n_clusters, max_iter and tol from the instance. Always sets
    cluster_centers_, labels_, covars_ and w_ to the last EM state
    (the original only set them on convergence, leaving the model
    half-fitted when max_iter was exhausted). Sets convergence = -1
    when the tolerance was not reached within max_iter iterations.
    """
    n_objects, n_features = X.shape
    # Uniform mixture weights; identity covariance per cluster.
    w = np.full(self.n_clusters, 1.0 / self.n_clusters)
    sigma = np.zeros((self.n_clusters, n_features, n_features))
    for cluster in range(self.n_clusters):
        # Fixed slice typo: was sigma[cluster :, :], which broadcast the
        # identity over all trailing clusters on every pass.
        sigma[cluster, :, :] = np.eye(n_features)
    # Random data points as initial centers.
    centers_idx = np.random.choice(n_objects, size=self.n_clusters, replace=False)
    mu = X[centers_idx, :]
    ll = log_likelihood(X, w, mu, sigma)
    converged = False
    for i in range(self.max_iter):
        ll_new = log_likelihood(X, w, mu, sigma)
        self.save_logs(compute_labels(X, mu), w, mu, sigma, ll)
        # Skip the check on iteration 0: ll and ll_new are trivially equal.
        if i > 0 and abs(ll_new - ll) < self.tol:
            converged = True
            break
        gamma = self.estep(X, w, mu, sigma)
        w, mu, sigma = self.mstep(X, gamma)
        ll = ll_new
    # Expose the fitted parameters whether or not EM converged, so callers
    # always find the model attributes after fit().
    self.cluster_centers_ = mu.copy()
    self.labels_ = compute_labels(X, mu)
    self.covars_ = sigma.copy()
    self.w_ = w.copy()
    if not converged:
        # Same sentinel the original set via its manual i+=1 counter.
        self.convergence = -1
def fit(self, X):
    """Fit the mixture with EM, optionally recording per-iteration logs.

    Reads n_clusters, max_iter, tol and logging from the instance; stores
    the fitted state in w_, cluster_centers_, covars_, ll and labels_.
    """
    n_objects, n_features = X.shape
    self.covars_ = np.zeros((self.n_clusters, n_features, n_features))
    self.w_ = np.full(self.n_clusters, 1.0 / self.n_clusters)
    centers_idx = np.random.choice(n_objects, size=self.n_clusters, replace=False)
    self.cluster_centers_ = X[centers_idx, :]
    for cluster in range(self.n_clusters):
        # Fixed slice typo: was covars_[cluster :, :]; each cluster gets
        # one identity covariance matrix.
        self.covars_[cluster, :, :] = np.eye(n_features)
    self.ll = log_likelihood(X, self.w_, self.cluster_centers_, self.covars_)
    for i in range(self.max_iter):
        # Compute once and reuse for both the log and the convergence test
        # (the original evaluated log_likelihood twice per iteration on
        # identical state).
        ll_new = log_likelihood(X, self.w_, self.cluster_centers_, self.covars_)
        if self.logging:
            # NOTE(review): these append references, not copies — if m_step
            # mutates the arrays in place the history is retroactively
            # overwritten; confirm m_step rebinds fresh arrays.
            self.logs['log_likelihood'].append(ll_new)
            self.logs['labels'].append(compute_labels(X, self.cluster_centers_))
            self.logs['w'].append(self.w_)
            self.logs['mu'].append(self.cluster_centers_)
            self.logs['sigma'].append(self.covars_)
        if i > 0 and abs(ll_new - self.ll) < self.tol:
            break
        g = self.e_step(X)
        self.m_step(X, g)
        self.ll = ll_new
    self.labels_ = compute_labels(X, self.cluster_centers_)
def fit(self, X):
    """Run n_init random restarts and keep the best one.

    Each restart draws n_clusters distinct rows of X as candidate centers;
    the restart whose induced labeling scores the highest label-based
    log-likelihood wins and is stored in cluster_centers_ and labels_.
    """
    n_samples = X.shape[0]
    best_ll = float('-inf')
    for _ in range(self.n_init):
        chosen = np.random.choice(n_samples, size=self.n_clusters, replace=False)
        candidate_mu = X[chosen, :]
        candidate_labels = compute_labels(X, candidate_mu)
        candidate_ll = log_likelihood_from_labels(X, candidate_labels)
        if candidate_ll > best_ll:
            best_ll = candidate_ll
            self.cluster_centers_ = candidate_mu.copy()
            self.labels_ = candidate_labels
docvecs_fname = "dataset/processed_data/docvecs.csv" docvecs.to_csv(docvecs_fname, index=False, header=False) end = int(round(time.time() * 1000)) print("Process input done! - Elapsed time: %d" % (end - begin)) # load pre-processed data begin = int(round(time.time() * 1000)) print("Loading pre-processed data ...") train_tfidfvecs = utils.load_vecs( "dataset/processed_data/train/tfidfvecs.csv") test_tfidfvecs = utils.load_vecs( "dataset/processed_data/test/tfidfvecs.csv") train_docvecs = utils.load_vecs("dataset/processed_data/train/docvecs.csv") test_docvecs = utils.load_vecs("dataset/processed_data/test/docvecs.csv") train_labels = utils.compute_labels("dataset/textdata/train/data.csv") test_labels = utils.compute_labels("dataset/textdata/test/data.csv") train_dataset = { 'inputs': [train_tfidfvecs, train_docvecs], 'outputs': [train_labels] } test_dataset = { 'inputs': [test_tfidfvecs, test_docvecs], 'outputs': [test_labels] } end = int(round(time.time() * 1000)) print("Compute labels done! - Elapsed time: %d" % (end - begin)) if not context_only: checkpoint_dir = "pretrained/multiview/" else: