def semimarkov_sufficient_stats(feature_list, label_list, covariance_type, n_classes, max_k=None):
    assert len(feature_list) == len(label_list)
    tied_diag = covariance_type == 'tied_diag'
    if tied_diag:
        # fit per-class diagonal covariances first; overwritten below with a single tied diagonal
        emissions = GaussianMixture(n_classes, covariance_type='diag')
    else:
        emissions = GaussianMixture(n_classes, covariance_type=covariance_type)
    X_l = []
    r_l = []
    span_counts = np.zeros(n_classes, dtype=np.float32)
    span_lengths = np.zeros(n_classes, dtype=np.float32)
    span_start_counts = np.zeros(n_classes, dtype=np.float32)
    # indexed [to, from]
    span_transition_counts = np.zeros((n_classes, n_classes), dtype=np.float32)
    instance_count = 0
    for X, labels in zip(feature_list, label_list):
        X_l.append(X)
        # hard (one-hot) responsibilities from the gold frame labels
        r = np.zeros((X.shape[0], n_classes))
        r[np.arange(X.shape[0]), labels] = 1
        assert r.sum() == X.shape[0]
        r_l.append(r)
        spans = labels_to_spans(labels.unsqueeze(0), max_k)
        # run-length encode into (symbol, length) pairs
        spans = rle_spans(spans, torch.LongTensor([spans.size(1)]))[0]
        last_symbol = None
        for index, (symbol, length) in enumerate(spans):
            if index == 0:
                span_start_counts[symbol] += 1
            span_counts[symbol] += 1
            span_lengths[symbol] += length
            if last_symbol is not None:
                span_transition_counts[symbol, last_symbol] += 1
            last_symbol = symbol
        instance_count += 1
    X_arr = np.vstack(X_l)
    r_arr = np.vstack(r_l)
    # seed the emission model from the hard label assignments
    emissions._initialize(X_arr, r_arr)
    if tied_diag:
        cov, prec_chol = get_diagonal_covariances(X_arr)
        emissions.covariances_[:] = np.copy(cov)
        emissions.precisions_cholesky_[:] = np.copy(prec_chol)
    return emissions, {
        'span_counts': span_counts,
        'span_lengths': span_lengths,
        'span_start_counts': span_start_counts,
        'span_transition_counts': span_transition_counts,
        'instance_count': instance_count,
    }
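# Minimal, self-contained sketch of the emission-initialization trick used above:
# `GaussianMixture._initialize` is a private scikit-learn method, but given a one-hot
# responsibility matrix it sets weights_, means_, covariances_ and precisions_cholesky_
# to the per-class statistics of the labeled points, without running EM. The toy data,
# class count, and shapes below are made up for illustration.
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
n_classes = 3
X = rng.normal(size=(60, 4)) + np.repeat(np.arange(n_classes), 20)[:, None]
labels = np.repeat(np.arange(n_classes), 20)

resp = np.zeros((X.shape[0], n_classes))
resp[np.arange(X.shape[0]), labels] = 1.0

gm = GaussianMixture(n_components=n_classes, covariance_type='diag')
gm._initialize(X, resp)

# means_ now match the per-class sample means of the labeled points
for c in range(n_classes):
    assert np.allclose(gm.means_[c], X[labels == c].mean(axis=0))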
def semimarkov_sufficient_stats(feature_list, label_list, length_list, covariance_type, n_classes, max_k=None):
    assert len(feature_list) == len(label_list) == len(length_list)
    emissions = GaussianMixture(n_classes, covariance_type=covariance_type)
    X_l = []
    r_l = []
    span_counts = np.zeros(n_classes, dtype=np.float32)
    span_lengths = np.zeros(n_classes, dtype=np.float32)
    span_start_counts = np.zeros(n_classes, dtype=np.float32)
    # indexed [to, from]
    span_transition_counts = np.zeros((n_classes, n_classes), dtype=np.float32)
    instance_count = 0
    for X, labels, seq_len in zip(feature_list, label_list, length_list):
        X = X.cpu()
        labels = labels.cpu()
        seq_len = seq_len.cpu().numpy()
        X_l.append(X)
        # hard (one-hot) responsibilities from the gold frame labels
        r = np.zeros((X.shape[0], n_classes))
        r[np.arange(X.shape[0]), labels] = 1
        assert r.sum() == X.shape[0]
        r_l.append(r)
        spans = labels_to_spans(labels.unsqueeze(0), max_k)
        # run-length encode into (symbol, length) pairs
        spans = rle_spans(spans, torch.LongTensor([spans.size(1)]))[0]
        prev = None
        length_ = 0
        for idx, (symbol, length) in enumerate(spans):
            if idx == 0:
                span_start_counts[symbol] += 1
                length_ = 0
            length_ += length
            # stop once the cumulative span length runs past the unpadded sequence length
            if length_ > seq_len:
                break
            span_counts[symbol] += 1
            span_lengths[symbol] += length
            if prev is not None:
                span_transition_counts[symbol, prev] += 1
            prev = symbol
        instance_count += 1
    X_arr = np.vstack(X_l)
    r_arr = np.vstack(r_l)
    emissions._initialize(X_arr, r_arr)
    return emissions, {
        'span_counts': span_counts,
        'span_lengths': span_lengths,
        'span_start_counts': span_start_counts,
        'span_transition_counts': span_transition_counts,
        'instance_count': instance_count,
    }
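# Illustrative sketch (made-up labels and lengths, not from the repo) of what the span
# loop above accumulates, using a plain run-length encoding in place of the project's
# labels_to_spans/rle_spans helpers, and how the `length_ > seq_len` check drops spans
# that run into the padded region.
import numpy as np

labels = [0, 0, 2, 2, 2, 1, 1, 1, 1, 1]  # frames beyond seq_len are padding
seq_len, n_classes = 7, 3

# run-length encode into (symbol, length) pairs
spans, prev = [], None
for s in labels:
    if s == prev:
        spans[-1][1] += 1
    else:
        spans.append([s, 1])
    prev = s

span_counts = np.zeros(n_classes)
span_lengths = np.zeros(n_classes)
span_start_counts = np.zeros(n_classes)
span_transition_counts = np.zeros((n_classes, n_classes))  # [to, from]

prev, covered = None, 0
for idx, (symbol, length) in enumerate(spans):
    if idx == 0:
        span_start_counts[symbol] += 1
    covered += length
    if covered > seq_len:
        break  # this span crosses into padding and is skipped
    span_counts[symbol] += 1
    span_lengths[symbol] += length
    if prev is not None:
        span_transition_counts[symbol, prev] += 1
    prev = symbol

print(span_counts)   # [1. 0. 1.] -- the trailing class-1 span is dropped by the break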
def main(dataset_name, pca, cluster_method, lm_type, document_repr_type, random_state):
    save_dict_data = {}

    # pca = 0 means no pca
    do_pca = pca != 0

    save_dict_data["dataset_name"] = dataset_name
    save_dict_data["pca"] = pca
    save_dict_data["cluster_method"] = cluster_method
    save_dict_data["lm_type"] = lm_type
    save_dict_data["document_repr_type"] = document_repr_type
    save_dict_data["random_state"] = random_state

    naming_suffix = f"pca{pca}.clus{cluster_method}.{lm_type}.{document_repr_type}.{random_state}"
    print(naming_suffix)

    data_dir = os.path.join(INTERMEDIATE_DATA_FOLDER_PATH, dataset_name)
    print(data_dir)

    with open(os.path.join(data_dir, "dataset.pk"), "rb") as f:
        dictionary = pk.load(f)

    class_names = dictionary["class_names"]
    num_classes = len(class_names)
    print(class_names)

    with open(os.path.join(data_dir, f"document_repr_lm-{lm_type}-{document_repr_type}.pk"), "rb") as f:
        dictionary = pk.load(f)

    document_representations = dictionary["document_representations"]
    class_representations = dictionary["class_representations"]
    repr_prediction = np.argmax(
        cosine_similarity_embeddings(document_representations, class_representations), axis=1)
    save_dict_data["repr_prediction"] = repr_prediction

    if do_pca:
        _pca = PCA(n_components=pca, random_state=random_state)
        document_representations = _pca.fit_transform(document_representations)
        class_representations = _pca.transform(class_representations)
        print(f"Explained variance: {sum(_pca.explained_variance_ratio_)}")

    if cluster_method == 'gmm':
        cosine_similarities = cosine_similarity_embeddings(document_representations, class_representations)
        document_class_assignment = np.argmax(cosine_similarities, axis=1)
        # one-hot responsibility matrix from the nearest-class assignment
        document_class_assignment_matrix = np.zeros((document_representations.shape[0], num_classes))
        for i in range(document_representations.shape[0]):
            document_class_assignment_matrix[i][document_class_assignment[i]] = 1.0

        gmm = GaussianMixture(n_components=num_classes, covariance_type='tied',
                              random_state=random_state, n_init=999, warm_start=True)
        # setting converged_ together with warm_start makes fit() skip its own
        # initialization and continue EM from the parameters seeded below
        gmm.converged_ = "HACK"

        gmm._initialize(document_representations, document_class_assignment_matrix)
        gmm.lower_bound_ = -np.infty

        gmm.fit(document_representations)

        documents_to_class = gmm.predict(document_representations)
        centers = gmm.means_
        save_dict_data["centers"] = centers
        distance = -gmm.predict_proba(document_representations) + 1
    elif cluster_method == 'kmeans':
        kmeans = KMeans(n_clusters=num_classes, init=class_representations, random_state=random_state)
        kmeans.fit(document_representations)

        documents_to_class = kmeans.predict(document_representations)
        centers = kmeans.cluster_centers_
        save_dict_data["centers"] = centers
        # pairwise Euclidean distance between documents and cluster centers
        distance = np.zeros((document_representations.shape[0], centers.shape[0]), dtype=float)
        for i, _emb_a in enumerate(document_representations):
            for j, _emb_b in enumerate(centers):
                distance[i][j] = np.linalg.norm(_emb_a - _emb_b)

    save_dict_data["documents_to_class"] = documents_to_class
    save_dict_data["distance"] = distance

    with open(os.path.join(data_dir, f"data.{naming_suffix}.pk"), "wb") as f:
        pk.dump(save_dict_data, f)
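# Compact, self-contained sketch (with made-up data) of the GMM seeding trick in the
# 'gmm' branch above: with warm_start=True, scikit-learn's fit() skips its own
# initialization whenever the estimator already has a converged_ attribute, so EM
# continues from the parameters installed by _initialize() rather than from k-means.
import numpy as np
from sklearn.mixture import GaussianMixture

rng = np.random.default_rng(0)
num_classes = 2
docs = np.vstack([rng.normal(-2.0, 1.0, size=(50, 5)), rng.normal(2.0, 1.0, size=(50, 5))])
hard_assignment = (docs.mean(axis=1) > 0).astype(int)  # stand-in for the cosine-similarity argmax

resp = np.zeros((docs.shape[0], num_classes))
resp[np.arange(docs.shape[0]), hard_assignment] = 1.0

gmm = GaussianMixture(n_components=num_classes, covariance_type='tied', warm_start=True, random_state=0)
gmm.converged_ = True          # any value works; fit() only checks that the attribute exists
gmm._initialize(docs, resp)
gmm.lower_bound_ = -np.inf
gmm.fit(docs)                  # EM refines the seeded parameters instead of re-initializing

print(gmm.means_)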
gmm_reference = GaussianMixture(n_components=2, covariance_type="spherical", tol=0, random_state=np_random)

_initialize_orig = gmm_reference._initialize
weights_init, means_init, precisions_init = None, None, None


def _patched_initialize(X, resp):
    global weights_init, means_init, precisions_init
    _initialize_orig(X, resp)
    weights_init = gmm_reference.weights_
    means_init = gmm_reference.means_
    precisions_init = gmm_reference.precisions_cholesky_


gmm_reference._initialize = _patched_initialize

batch_size = 32
n_samples, n_features = 250, 2
mu1, mu2 = -1.0, 5.0
sigma1, sigma2 = 1.0, 2.0

X_batch = []
weights_init_batch, means_init_batch, precisions_init_batch = [], [], []
expected_weights, expected_means, expected_covariances = [], [], []
for _ in range(batch_size):
    n1 = int(n_samples * 0.7) * n_features
    n2 = n_features * n_samples - n1
    X = np_random.normal(np.r_[np.full(n1, mu1), np.full(n2, mu2)],
                         np.r_[np.full(n1, sigma1), np.full(n2, sigma2)])
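# Assumed usage of the monkey-patch above (not from the original test; np_random is
# taken to be the seeded NumPy RandomState from the surrounding setup): because fit()
# calls _initialize() internally during its initialization step, fitting the patched
# reference model stores the initial weights, means, and precision Cholesky factors
# in the module-level variables, where another estimator can later be started from
# the identical initialization.
X_ref = np_random.normal(size=(n_samples, n_features))
gmm_reference.fit(X_ref)
print(weights_init, means_init.shape, precisions_init.shape)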
def get_diagonal_covariances(data):
    # data: num_points x feat_dim
    model = GaussianMixture(n_components=1, covariance_type='diag')
    responsibilities = np.ones((data.shape[0], 1))
    model._initialize(data, responsibilities)
    return model.covariances_, model.precisions_cholesky_
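# Minimal usage sketch (made-up data): with a single component and uniform
# responsibilities, the returned covariances are just the per-feature variances of the
# data (plus scikit-learn's default reg_covar of 1e-6), and the precision Cholesky
# factors are their inverse square roots.
import numpy as np

rng = np.random.default_rng(0)
data = rng.normal(size=(200, 8))

cov, prec_chol = get_diagonal_covariances(data)
assert cov.shape == (1, data.shape[1])
assert np.allclose(cov[0], data.var(axis=0) + 1e-6)
assert np.allclose(prec_chol, 1.0 / np.sqrt(cov))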