import numpy as np
from sklearn.mixture import GaussianMixture


def gmm_scale(gmm, shift=None, scale=None, reverse=False, params=None):
    """
    Apply scaling factors to GMM instances.

    Parameters
    ----------
    gmm : GaussianMixture
        GMM instance to be scaled.
    shift : int, float, optional
        Shift for the entire model. Default is None (no shift).
    scale : int, float, optional
        Scale for all components. Default is None (no scale).
    reverse : bool, optional
        Whether the GMM should be reversed.
    params : dict, optional
        GaussianMixture params for initialization of the new instance.

    Returns
    -------
    GaussianMixture
        Modified GMM instance.
    """

    # Fetch parameters if not supplied
    if params is None:
        # noinspection PyUnresolvedReferences
        params = gmm.get_params()

    # Instantiate new GMM
    gmm_new = GaussianMixture(**params)

    # Create scaled fitted GMM model
    gmm_new.weights_ = gmm.weights_

    # Apply shift if set (copy the means so the original model is not modified in place)
    gmm_new.means_ = gmm.means_ + shift if shift is not None else gmm.means_.copy()

    # Apply scale
    if scale is not None:
        gmm_new.means_ /= scale
    gmm_new.covariances_ = gmm.covariances_ / scale ** 2 if scale is not None else gmm.covariances_
    gmm_new.precisions_ = np.linalg.inv(gmm_new.covariances_) if scale is not None else gmm.precisions_
    gmm_new.precisions_cholesky_ = np.linalg.cholesky(gmm_new.precisions_) if scale is not None \
        else gmm.precisions_cholesky_

    # Reverse if set
    if reverse:
        gmm_new.means_ *= -1

    # Add converged attribute if available
    if gmm.converged_:
        gmm_new.converged_ = gmm.converged_

    # Return scaled GMM
    return gmm_new
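# Minimal usage sketch for gmm_scale (hypothetical data and parameter choices, not part
# of the original function): fit a GMM on standardized samples, then map the fitted model
# back to the original units. Since gmm_scale computes (means + shift) / scale, setting
# shift = feat_mean / feat_std and scale = 1 / feat_std inverts x_std = (x - feat_mean) / feat_std.
def _gmm_scale_usage_sketch():
    rng = np.random.default_rng(0)
    x = np.concatenate([rng.normal(-2.0, 0.5, 500), rng.normal(3.0, 1.0, 500)]).reshape(-1, 1)
    feat_mean, feat_std = x.mean(), x.std()

    gmm_std = GaussianMixture(n_components=2, random_state=0).fit((x - feat_mean) / feat_std)
    gmm_back = gmm_scale(gmm_std, shift=feat_mean / feat_std, scale=1.0 / feat_std)
    print(gmm_back.means_.ravel())  # roughly [-2, 3] (component order may vary)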
# EM_using_mono, merge, modify_gaussian and the GMM alias are project-specific helpers
# assumed to be defined elsewhere.
def update_GMM(g_x, N_samples, chunk):
    M_samples = chunk.shape[0]
    a_x = EM_using_mono(chunk)
    m = g_x.weights_.shape[0]  # number of components in the original model
    C = merge(g_x, N_samples, a_x, M_samples)
    g_x2 = GMM(n_components=1, covariance_type='spherical')
    g_x2.fit(chunk)
    modify_gaussian(g_x2, C)
    g_x2.converged_ = True
    return N_samples + M_samples, g_x2
def _sample_rows_same(self, X):
    """Uses the efficient sklearn implementation to sample from a Gaussian mixture;
    only works if all rows of X are the same."""
    weights, locs, scales = self._get_mixture_components(np.expand_dims(X[0], axis=0))

    # normalize the weights so they sum to 1
    weights = weights.astype(np.float64)
    weights = weights / np.sum(weights)

    gmm = GaussianMixture(n_components=self.n_centers, covariance_type='diag',
                          max_iter=5, tol=1e-1)
    gmm.fit(np.random.normal(size=(100, self.ndim_y)))  # dummy fit so sklearn treats the model as fitted

    # overriding the GMM parameters with own params
    gmm.converged_ = True
    gmm.weights_ = weights[0]
    gmm.means_ = locs[0]
    gmm.covariances_ = scales[0]
    y_sample, _ = gmm.sample(X.shape[0])
    assert y_sample.shape == (X.shape[0], self.ndim_y)
    return X, y_sample
import numpy
from sklearn.mixture import GaussianMixture


def test_once_by_random_features():
    Xtrain = numpy.random.random_sample((5000)).reshape(-1, 10)
    Xtest = numpy.random.random_sample((500)).reshape(-1, 10)
    gmm_orig = GaussianMixture(n_components=8, random_state=1)
    gmm_copy = GaussianMixture()
    gmm_orig.fit(Xtrain)

    # copy all fitted attributes so the unfitted copy scores identically
    gmm_copy.weights_ = gmm_orig.weights_
    gmm_copy.means_ = gmm_orig.means_
    gmm_copy.covariances_ = gmm_orig.covariances_
    gmm_copy.precisions_ = gmm_orig.precisions_
    gmm_copy.precisions_cholesky_ = gmm_orig.precisions_cholesky_
    gmm_copy.converged_ = gmm_orig.converged_
    gmm_copy.n_iter_ = gmm_orig.n_iter_
    gmm_copy.lower_bound_ = gmm_orig.lower_bound_

    y_orig = gmm_orig.score_samples(Xtest)
    y_copy = gmm_copy.score_samples(Xtest)
    return all(y_orig == y_copy)
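# Hypothetical driver for the test above (not part of the original snippet): score_samples
# depends only on weights_, means_ and precisions_cholesky_, so the attribute-copied model
# is expected to reproduce the fitted model's scores exactly.
def _run_test_once_by_random_features():
    assert test_once_by_random_features()
    print("attribute-copied GaussianMixture reproduces score_samples exactly")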
from scipy import stats
import matplotlib.pyplot as plt


def fit_markov_chain(y, plot=False):
    y_0 = y[:-1]
    y_1 = y[1:]
    grad_0 = np.gradient(y_0)
    grad_1 = np.gradient(y_1)
    state_1 = grad_1[np.where(grad_0 < 0)]  # instances where the previous gradient was negative
    state_2 = grad_1[np.where(grad_0 > 0)]  # instances where the previous gradient was positive
    mean_1, std_1 = stats.norm.fit(state_1)
    mean_2, std_2 = stats.norm.fit(state_2)

    # Reshaping parameters to be suitable for sklearn.GaussianMixture
    means = np.array([mean_1, mean_2])
    means = means.reshape(2, 1)
    y_GM = np.concatenate((state_2.reshape(-1, 1), state_1.reshape(-1, 1)))
    precisions = [1 / (std_1 ** 2), 1 / (std_2 ** 2)]

    GM = GaussianMixture(n_components=2, covariance_type='spherical')
    GM.weights_ = [0.5, 0.5]
    GM.means_ = means
    # For covariance_type='spherical', covariances_ holds one variance per component and
    # precisions_cholesky_ holds 1/std (the Cholesky factor of each scalar precision).
    GM.covariances_ = np.array([std_1 ** 2, std_2 ** 2])
    GM.precisions_ = np.array(precisions)
    GM.precisions_cholesky_ = np.array([1 / std_1, 1 / std_2])
    GM.converged_ = True

    if plot:
        samples = GM.sample(5000)[0]
        fig, ax_list = plt.subplots(3, 1)
        fig.set_size_inches(20, 20)
        ax_list[0].hist(state_1, bins=70)
        ax_list[1].hist(state_2, bins=70)
        lnspc_1 = np.linspace(state_1.min(), state_1.max(), y.shape[0])
        gauss_1 = stats.norm.pdf(lnspc_1, mean_1, std_1)
        lnspc_2 = np.linspace(state_2.min(), state_2.max(), y.shape[0])
        gauss_2 = stats.norm.pdf(lnspc_2, mean_2, std_2)
        ax_list[0].plot(lnspc_1, gauss_1)
        ax_list[1].plot(lnspc_2, gauss_2)
        ax_list[0].scatter(mean_1, 30)
        ax_list[1].scatter(mean_2, 30)
        ax_list[2].hist(samples, bins=100)
        plt.show()
    return GM
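# Hypothetical usage sketch (synthetic data, not from the original source): fit the
# two-state gradient model on a noisy random walk and draw new gradient samples from
# the hand-assembled GaussianMixture.
def _fit_markov_chain_usage_sketch():
    rng = np.random.default_rng(42)
    y = np.cumsum(rng.normal(0.0, 1.0, size=2000))  # synthetic random-walk series
    GM = fit_markov_chain(y, plot=False)
    gradients, states = GM.sample(10)
    print(gradients.ravel(), states)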
import os
import pickle as pk

from sklearn.decomposition import PCA
from sklearn.cluster import KMeans


# cosine_similarity_embeddings and INTERMEDIATE_DATA_FOLDER_PATH are project-specific
# utilities/constants assumed to be imported elsewhere.
def main(dataset_name, pca, cluster_method, lm_type, document_repr_type, random_state):
    save_dict_data = {}

    # pca = 0 means no pca
    do_pca = pca != 0

    save_dict_data["dataset_name"] = dataset_name
    save_dict_data["pca"] = pca
    save_dict_data["cluster_method"] = cluster_method
    save_dict_data["lm_type"] = lm_type
    save_dict_data["document_repr_type"] = document_repr_type
    save_dict_data["random_state"] = random_state

    naming_suffix = f"pca{pca}.clus{cluster_method}.{lm_type}.{document_repr_type}.{random_state}"
    print(naming_suffix)

    data_dir = os.path.join(INTERMEDIATE_DATA_FOLDER_PATH, dataset_name)
    print(data_dir)

    with open(os.path.join(data_dir, "dataset.pk"), "rb") as f:
        dictionary = pk.load(f)

    class_names = dictionary["class_names"]
    num_classes = len(class_names)
    print(class_names)

    with open(os.path.join(data_dir, f"document_repr_lm-{lm_type}-{document_repr_type}.pk"), "rb") as f:
        dictionary = pk.load(f)

    document_representations = dictionary["document_representations"]
    class_representations = dictionary["class_representations"]
    repr_prediction = np.argmax(
        cosine_similarity_embeddings(document_representations, class_representations), axis=1)
    save_dict_data["repr_prediction"] = repr_prediction

    if do_pca:
        _pca = PCA(n_components=pca, random_state=random_state)
        document_representations = _pca.fit_transform(document_representations)
        class_representations = _pca.transform(class_representations)
        print(f"Explained variance: {sum(_pca.explained_variance_ratio_)}")

    if cluster_method == 'gmm':
        cosine_similarities = cosine_similarity_embeddings(document_representations, class_representations)
        document_class_assignment = np.argmax(cosine_similarities, axis=1)
        document_class_assignment_matrix = np.zeros((document_representations.shape[0], num_classes))
        for i in range(document_representations.shape[0]):
            document_class_assignment_matrix[i][document_class_assignment[i]] = 1.0

        gmm = GaussianMixture(n_components=num_classes, covariance_type='tied',
                              random_state=random_state, n_init=999, warm_start=True)
        gmm.converged_ = "HACK"

        gmm._initialize(document_representations, document_class_assignment_matrix)
        gmm.lower_bound_ = -np.inf
        gmm.fit(document_representations)

        documents_to_class = gmm.predict(document_representations)
        centers = gmm.means_
        save_dict_data["centers"] = centers
        distance = -gmm.predict_proba(document_representations) + 1
    elif cluster_method == 'kmeans':
        kmeans = KMeans(n_clusters=num_classes, init=class_representations, random_state=random_state)
        kmeans.fit(document_representations)

        documents_to_class = kmeans.predict(document_representations)
        centers = kmeans.cluster_centers_
        save_dict_data["centers"] = centers
        distance = np.zeros((document_representations.shape[0], centers.shape[0]), dtype=float)
        for i, _emb_a in enumerate(document_representations):
            for j, _emb_b in enumerate(centers):
                distance[i][j] = np.linalg.norm(_emb_a - _emb_b)

    save_dict_data["documents_to_class"] = documents_to_class
    save_dict_data["distance"] = distance

    with open(os.path.join(data_dir, f"data.{naming_suffix}.pk"), "wb") as f:
        pk.dump(save_dict_data, f)