def save_memmap_data(output_dirpath, dataname, data_size, nb_features, Xy_gen):
    output_path_obs = project_dir / output_dirpath / (dataname + ".dat")
    output_path_labels = project_dir / output_dirpath / (dataname + ".lab")
    fp_obs = np.memmap(output_path_obs, dtype='float32', mode='w+',
                       shape=(data_size, nb_features))
    fp_labels = np.memmap(output_path_labels, mode='w+', shape=(data_size,))

    logger.info("{} data will be created in file: {}; labels stored in file: {}"
                .format(dataname, output_path_obs, output_path_labels))
    logger.info("About to create {}: total {} examples.".format(dataname, data_size))

    curr_idx = 0
    for i, (batch_X, batch_y) in enumerate(Xy_gen):
        curr_batch_size = batch_X.shape[0]
        fp_obs[curr_idx:curr_idx + curr_batch_size] = batch_X
        if batch_y is not None:
            fp_labels[curr_idx:curr_idx + curr_batch_size] = batch_y
        curr_idx += curr_batch_size

    if batch_y is None:
        os.remove(str(output_path_labels))
def _download_single_dataset(output_dirpath, dataname):
    regex_million = re.compile(r"blobs_(\d+)_million")
    match = regex_million.match(dataname)
    if match:
        size_batch = 10000
        data_size = int(1e6) * int(match.group(1))
        nb_features = 2000
        nb_centers = 1000
        save_memmap_data(output_dirpath, dataname, data_size, nb_features,
                         generator_blobs_data(data_size, size_batch,
                                              nb_features, nb_centers))
    else:
        if dataname in MAP_NAME_DATASET_DD.keys():
            MAP_NAME_DATASET_DD[dataname](output_dirpath)
            return
        elif MAP_NAME_CLASSES_PRESENCE_RAM[dataname]:
            (x_train, y_train), (x_test, y_test) = MAP_NAME_DATASET_RAM[dataname]()
            map_savez = {"x_train": x_train,
                         "y_train": y_train,
                         "x_test": x_test,
                         "y_test": y_test}
        else:
            X = MAP_NAME_DATASET_RAM[dataname]()
            map_savez = {"x_train": X}

        output_path = project_dir / output_dirpath / dataname
        logger.info(f"Save {dataname} to {output_path}")
        np.savez(output_path, **map_savez)
def generator_blobs_data(data_size, size_batch, nb_features, nb_centers):
    total_nb_chunks = int(data_size // size_batch)
    init_centers = np.random.uniform(-10.0, 10.0, (nb_centers, nb_features))
    for i in range(total_nb_chunks):
        logger.info("Chunk {}/{}".format(i + 1, total_nb_chunks))
        X, y = make_blobs(size_batch, n_features=nb_features,
                          centers=init_centers, cluster_std=12.)
        yield X, y
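# Usage sketch (not from the original source): writing a small synthetic blobs
# dataset to memory-mapped files with the two helpers above. The sizes and the
# output directory below are illustrative; `project_dir` and `logger` are
# assumed to be defined as in the rest of the module.
def _example_write_blobs_memmap():
    data_size, size_batch = 100000, 10000
    nb_features, nb_centers = 2000, 1000
    save_memmap_data("data/external", "blobs_example", data_size, nb_features,
                     generator_blobs_data(data_size, size_batch, nb_features,
                                          nb_centers))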
def make_nystrom_evaluation(x_train, y_train, x_test, y_test, gamma, landmarks):
    # verify sample size for evaluation
    n_sample = paraman["--nystrom"]
    if n_sample > x_train.shape[0]:
        logger.warning("Batch size for nystrom evaluation is bigger than data size. "
                       "{} > {}. Using data size instead.".format(n_sample, x_train.shape[0]))
        n_sample = x_train.shape[0]
        paraman["--nystrom"] = n_sample

    indexes_samples = np.random.permutation(x_train.shape[0])[:n_sample]
    sample = x_train[indexes_samples]
    samples_norm = None

    # Make nystrom approximation
    # nys_obj = Nystroem(gamma=gamma, n_components=landmarks.shape[0])
    # nys_obj.fit(landmarks)
    # nystrom_embedding = nys_obj.transform(sample)
    landmarks_norm = get_squared_froebenius_norm_line_wise(landmarks)[:, np.newaxis]
    metric = prepare_nystrom(landmarks, landmarks_norm, gamma=gamma)
    nystrom_embedding = nystrom_transformation(sample, landmarks, metric,
                                               landmarks_norm, samples_norm, gamma=gamma)
    nystrom_approx_kernel_value = nystrom_embedding @ nystrom_embedding.T

    # Create real kernel matrix
    real_kernel_special = special_rbf_kernel(sample, sample, gamma,
                                             norm_X=samples_norm, norm_Y=samples_norm)
    # real_kernel = rbf_kernel(sample, sample, gamma)
    real_kernel_norm = np.linalg.norm(real_kernel_special)

    # evaluation reconstruction error
    reconstruction_error_nystrom = np.linalg.norm(
        nystrom_approx_kernel_value - real_kernel_special) / real_kernel_norm

    # start svm + nystrom classification
    if x_test is not None:
        logger.info("Start classification")
        x_train_nystrom_embedding = nystrom_transformation(x_train, landmarks, metric,
                                                           landmarks_norm, None, gamma=gamma)
        x_test_nystrom_embedding = nystrom_transformation(x_test, landmarks, metric,
                                                          landmarks_norm, None, gamma=gamma)

        linear_svc_clf = LinearSVC(class_weight="balanced")
        linear_svc_clf.fit(x_train_nystrom_embedding, y_train)
        predictions = linear_svc_clf.predict(x_test_nystrom_embedding)

        if paraman["--kddcup04"]:
            # compute recall: nb_true_positive/real_nb_positive
            recall = np.sum(predictions[y_test == 1]) / np.sum(y_test[y_test == 1])
            # compute precision: nb_true_positive/nb_positive
            precision = np.sum(predictions[y_test == 1]) / np.sum(predictions[predictions == 1])
            f1 = 2 * precision * recall / (precision + recall)
            accuracy_nystrom_svm = f1
        else:
            accuracy_nystrom_svm = np.sum(predictions == y_test) / y_test.shape[0]
    else:
        accuracy_nystrom_svm = None

    return reconstruction_error_nystrom, accuracy_nystrom_svm
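# Self-contained sketch of the approximation whose error is measured above
# (assumption: prepare_nystrom / nystrom_transformation implement the standard
# Nystrom scheme K ~= C W^+ C^T, with W the landmark kernel and C the
# sample-to-landmark kernel); sklearn's rbf_kernel stands in for
# special_rbf_kernel here.
def _sketch_nystrom_relative_error(sample, landmarks, gamma):
    import numpy as np
    from sklearn.metrics.pairwise import rbf_kernel

    W = rbf_kernel(landmarks, landmarks, gamma=gamma)   # (m, m) landmark kernel
    C = rbf_kernel(sample, landmarks, gamma=gamma)      # (n, m) cross kernel
    U, S, Vt = np.linalg.svd(W)
    metric = np.dot(U / np.sqrt(np.maximum(S, 1e-12)), Vt)  # ~ W^{-1/2}
    embedding = C @ metric                               # Nystrom feature map
    K_approx = embedding @ embedding.T                   # ~= C W^+ C^T
    K_exact = rbf_kernel(sample, sample, gamma=gamma)
    return np.linalg.norm(K_approx - K_exact) / np.linalg.norm(K_exact)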
def get_objective_value(X_data, op_centroids, indicator_vector):
    logger.info("Compute objective")
    if paraman["--minibatch"]:
        final_objective_value = compute_objective_by_batch(
            X_data, op_centroids, indicator_vector, paraman["--minibatch"])
    else:
        final_objective_value = compute_objective(X_data, op_centroids, indicator_vector)

    resprinter.add({
        "final_objective_value": final_objective_value,
    })

    return final_objective_value
def compute_objective_by_batch(X_data, op_centroids, indicator_vector, batch_size):
    total_nb_of_minibatch = X_data.shape[0] // batch_size
    objective_value_so_far = 0
    for i_minibatch, example_batch_indexes in enumerate(
            DataGenerator(X_data, batch_size=batch_size, return_indexes=True)):
        logger.info("Minibatch number {}/{};".format(i_minibatch, total_nb_of_minibatch))
        example_batch = X_data[example_batch_indexes]
        indicator_vector_batch = indicator_vector[example_batch_indexes]
        objective_value_so_far += compute_objective(example_batch, op_centroids,
                                                    indicator_vector_batch)
    final_objective_value = objective_value_so_far
    return final_objective_value
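# Hedged note: compute_objective is defined elsewhere in the project; if it is
# the usual k-means criterion sum_i ||x_i - c_{t(i)}||^2 (an assumption), then
# summing it over disjoint minibatches as above yields exactly the full-data
# objective. A minimal sketch of such a criterion for ndarray centroids:
def _sketch_compute_objective(X, centroids, indicator_vector):
    import numpy as np
    return np.sum((X - centroids[indicator_vector]) ** 2)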
def generator_data(data_load_func, size_batch=10000):
    X, y = data_load_func()
    data_size = X.shape[0]
    total_nb_chunks = int(data_size // size_batch)
    remaining = int(data_size % size_batch)
    i = -1  # guard for datasets smaller than size_batch (loop below not entered)
    for i in range(total_nb_chunks):
        logger.info("Chunk {}/{}".format(i + 1, total_nb_chunks))
        if y is None:
            yield X[i * size_batch:(i + 1) * size_batch], None
        else:
            yield (X[i * size_batch:(i + 1) * size_batch],
                   y[i * size_batch:(i + 1) * size_batch])
    if remaining > 0:
        if y is None:
            yield X[(i + 1) * size_batch:], None
        else:
            yield X[(i + 1) * size_batch:], y[(i + 1) * size_batch:]
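# Usage sketch (illustrative, not from the original source): consuming
# generator_data chunk by chunk. `load_my_dataset` is a hypothetical loader
# returning (X, y), where y may be None for unlabeled data.
def _example_iterate_chunks(load_my_dataset):
    for batch_X, batch_y in generator_data(load_my_dataset, size_batch=10000):
        # each batch_X has at most 10000 rows; batch_y is None for unlabeled data
        print(batch_X.shape, None if batch_y is None else batch_y.shape)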
def make_1nn_evaluation(x_train, y_train, x_test, y_test, U_centroids, indicator_vector):

    def scikit_evaluation(str_type):
        clf = KNeighborsClassifier(n_neighbors=1, algorithm=str_type)
        clf.fit(x_train, y_train)

        start_inference_time = time.time()
        predictions = np.empty_like(y_test)
        for obs_idx, obs_test in enumerate(x_test):
            predictions[obs_idx] = clf.predict(obs_test.reshape(1, -1))[0]
        stop_inference_time = time.time()

        inference_time = (stop_inference_time - start_inference_time)
        accuracy = np.sum(predictions == y_test) / y_test.shape[0]

        results_1nn = {
            "1nn_{}_inference_time".format(str_type): inference_time,
            "1nn_{}_accuracy".format(str_type): accuracy
        }
        resprinter.add(results_1nn)
        return inference_time

    def kmean_tree_evaluation():
        lst_clf_by_cluster = [
            KNeighborsClassifier(n_neighbors=1, algorithm="brute").fit(
                x_train[indicator_vector == i], y_train[indicator_vector == i])
            for i in range(U_centroids.shape[0])
        ]

        start_inference_time = time.time()
        distances = get_distances(x_test, U_centroids)
        indicator_vector_test = np.argmin(distances, axis=1)
        predictions = np.empty_like(y_test)
        for obs_idx, obs_test in enumerate(x_test):
            idx_cluster = indicator_vector_test[obs_idx]
            clf_cluster = lst_clf_by_cluster[idx_cluster]
            predictions[obs_idx] = clf_cluster.predict(obs_test.reshape(1, -1))[0]
        stop_inference_time = time.time()

        inference_time = (stop_inference_time - start_inference_time)
        accuracy = np.sum(predictions == y_test) / y_test.shape[0]

        results_1nn = {
            "1nn_kmean_inference_time": inference_time,
            "1nn_kmean_accuracy": accuracy
        }
        resprinter.add(results_1nn)
        return inference_time

    logger.info("1 nearest neighbor with k-means search")
    kmean_tree_time = kmean_tree_evaluation()

    if paraman["kmeans"]:
        lst_knn_types = ["brute", "ball_tree", "kd_tree"]
        for knn_type in lst_knn_types:
            signal.signal(signal.SIGALRM, timeout_signal_handler)
            signal.alarm(int(kmean_tree_time * 10))
            try:
                logger.info("1 nearest neighbor with {} search".format(knn_type))
                scikit_evaluation(knn_type)
            except TimeoutError as te:
                logger.warning("Timeout during execution of 1-nn with {} version: {}"
                               .format(knn_type, te))
            signal.alarm(0)
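# `timeout_signal_handler` is defined elsewhere in the project; a minimal
# handler consistent with the `except TimeoutError` clause above could look
# like this (sketch; SIGALRM-based timeouts are Unix-only).
def _sketch_timeout_signal_handler(signum, frame):
    raise TimeoutError("1-NN evaluation exceeded the allotted time budget")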
def hierarchical_palm4msa(arr_X_target: np.array,
                          lst_S_init: list,
                          lst_dct_projection_function: list,
                          nb_iter: int,
                          f_lambda_init: float = 1,
                          residual_on_right: bool = True,
                          update_right_to_left=True,
                          track_objective_palm=False,
                          return_objective_function=False,
                          delta_objective_error_threshold_palm=1e-6):
    """
    :param arr_X_target: The target matrix to approximate.
    :param lst_S_init: The initial sparse factors, given right to left in all cases.
    :param lst_dct_projection_function: For each factor, a dict giving the projection functions of the "split" and "finetune" steps.
    :param nb_iter: The maximum number of iterations of the inner palm4msa calls.
    :param f_lambda_init: The initial scaling factor.
    :param update_right_to_left: Order in which the factors are updated in the inner palm4msa algorithm. If True, the factors are updated right to left (i.e. the last factor in the list first); otherwise the contrary.
    :param residual_on_right: During the split step, the residual can be computed as a right or a left factor. If True, the residuals are computed as right factors. This can also be seen as the update order of the hierarchical strategy: when the residual is on the right, the last factor is computed first (left to right in the paper: the factor with the biggest index first).
    :param track_objective_palm: Track the objective of the inner palm4msa calls.
    :param return_objective_function: Compute and return the objective function value at each hierarchical step.
    :param delta_objective_error_threshold_palm: Stopping threshold passed to the inner palm4msa calls.
    :return:
    """
    if not update_right_to_left:
        raise NotImplementedError  # todo: find out why this crashes... dimension mismatch

    arr_residual = arr_X_target

    op_S_factors = SparseFactors(deepcopy(lst_S_init))
    nb_factors = op_S_factors.n_factors

    # check that lst_dct_projection_function contains one dict with "split" and "finetune" params per factor
    assert len(lst_dct_projection_function) == nb_factors - 1, \
        "Number of factors {} and number of constraints {} are different".format(
            len(lst_dct_projection_function), nb_factors - 1)
    assert all(len({"split", "finetune"}.difference(dct.keys())) == 0
               for dct in lst_dct_projection_function)

    f_lambda = f_lambda_init

    if return_objective_function:
        objective_function = np.empty((nb_factors, 3))
    else:
        objective_function = None

    lst_objectives = []

    # main loop
    for k in range(nb_factors - 1):
        lst_objective_split_fine_fac_k = []

        nb_factors_so_far = k + 1

        logger.info("Working on factor: {}".format(k))
        logger.info("Step split")

        ########################## Step split ##########################
        if return_objective_function:
            # compute objective before split step
            objective_function[k, 0] = compute_objective_function(
                arr_X_target, f_lambda, op_S_factors)

        # compute a 2-factor decomposition of the previous residual
        if k == 0:
            f_lambda_init_split = f_lambda_init
        else:
            f_lambda_init_split = 1.

        func_split_step_palm4msa = lambda lst_S_init: palm4msa(
            arr_X_target=arr_residual,
            lst_S_init=lst_S_init,  # eye for factor and zeros for residual
            nb_factors=2,
            lst_projection_functions=lst_dct_projection_function[k]["split"],
            # constraints: ||.||_0 = d for T1; relaxed constraint on ||.||_0 for T2
            f_lambda_init=f_lambda_init_split,
            nb_iter=nb_iter,
            update_right_to_left=update_right_to_left,
            track_objective=track_objective_palm,
            delta_objective_error_threshold=delta_objective_error_threshold_palm)

        if residual_on_right:
            op_S_factors_init = SparseFactors(lst_S_init[nb_factors_so_far:])
            residual_init = op_S_factors_init.compute_product()  # todo I think this product can be prepared before and save computation
            lst_S_init_split_step = [lst_S_init[k], residual_init]
            f_lambda_prime, S_out, unscaled_residual_reconstruction, objective_palm_split, _ = \
                func_split_step_palm4msa(lst_S_init=lst_S_init_split_step)
            new_factor = S_out.get_factor(0)
            new_residual = S_out.get_factor(1)
            op_S_factors.set_factor(k, new_factor)
        else:
            op_S_factors_init = SparseFactors(lst_S_init[:-nb_factors_so_far])
            residual_init = op_S_factors_init.compute_product()  # todo I think this product can be prepared before and save computation
            lst_S_init_split_step = [residual_init, lst_S_init[-nb_factors_so_far]]
            f_lambda_prime, S_out, unscaled_residual_reconstruction, objective_palm_split, _ = \
                func_split_step_palm4msa(lst_S_init=lst_S_init_split_step)
            new_residual = S_out.get_factor(0)
            new_factor = S_out.get_factor(1)
            op_S_factors.set_factor(nb_factors - nb_factors_so_far, new_factor)

        if k == 0:
            f_lambda = f_lambda_prime
        else:
            f_lambda *= f_lambda_prime

        lst_objective_split_fine_fac_k.append(objective_palm_split)

        # take the k first elements [:k+1] and the (k+1)-th one as arr_residual
        # (depending on the residual_on_right option)
        logger.info("Step finetuning")

        ########################## Step finetuning ##########################
        if return_objective_function:
            objective_function[k, 1] = compute_objective_function(
                arr_X_target, f_lambda, op_S_factors)

        func_fine_tune_step_palm4msa = lambda lst_S_init: palm4msa(
            arr_X_target=arr_X_target,
            lst_S_init=lst_S_init,
            nb_factors=nb_factors_so_far + 1,
            lst_projection_functions=lst_dct_projection_function[k]["finetune"],
            f_lambda_init=f_lambda,
            nb_iter=nb_iter,
            update_right_to_left=update_right_to_left,
            track_objective=track_objective_palm,
            delta_objective_error_threshold=delta_objective_error_threshold_palm)

        if residual_on_right:
            lst_S_in = op_S_factors.get_list_of_factors()[:nb_factors_so_far]
            f_lambda, lst_S_out, _, objective_palm_fine, _ = \
                func_fine_tune_step_palm4msa(lst_S_init=lst_S_in + [new_residual])
            for i in range(nb_factors_so_far):
                op_S_factors.set_factor(i, lst_S_out.get_factor(i))
            # TODO remove .toarray()?
            arr_residual = lst_S_out.get_factor(nb_factors_so_far).toarray()
        else:
            lst_S_in = op_S_factors.get_list_of_factors()[-nb_factors_so_far:]
            f_lambda, lst_S_out, _, objective_palm_fine, _ = \
                func_fine_tune_step_palm4msa(lst_S_init=[new_residual] + lst_S_in)
            for i in range(nb_factors_so_far):
                op_S_factors.set_factor(-nb_factors_so_far + i, lst_S_out.get_factor(i + 1))
            # TODO remove .toarray()?
            arr_residual = lst_S_out.get_factor(0).toarray()

        lst_objective_split_fine_fac_k.append(objective_palm_fine)
        lst_objectives.append(tuple(lst_objective_split_fine_fac_k))

        if return_objective_function:
            objective_function[k, 2] = compute_objective_function(
                arr_X_target, f_lambda, op_S_factors)

    # last factor is the residual of the last palm4msa (palm4LED) run
    if residual_on_right:
        op_S_factors.set_factor(-1, arr_residual)
    else:
        op_S_factors.set_factor(0, arr_residual)

    if return_objective_function:
        objective_function[nb_factors - 1, :] = np.array(
            [compute_objective_function(arr_X_target, f_lambda, op_S_factors)] * 3)

    arr_X_curr = f_lambda * op_S_factors.compute_product()

    return f_lambda, op_S_factors, arr_X_curr, lst_objectives, objective_function
    nystrom_inference_time = nystrom_inference_time_stop - nystrom_inference_time_start

    sampled_froebenius_norm = np.linalg.norm(nystrom_approx_kernel_value - real_kernel)

    nystrom_results = {
        "nystrom_build_time": nystrom_build_time,
        "nystrom_inference_time": nystrom_inference_time,
        "nystrom_sampled_error_reconstruction": sampled_froebenius_norm
    }
    resprinter.add(nystrom_results)


if __name__ == "__main__":
    logger.info("Command line: " + " ".join(sys.argv))
    arguments = docopt.docopt(__doc__)
    paraman = ParameterManager(arguments)
    initialized_results = dict((v, None) for v in lst_results_header)
    resprinter = ResultPrinter(output_file=paraman["--output-file_resprinter"])
    resprinter.add(initialized_results)
    resprinter.add(paraman)
    objprinter = ObjectiveFunctionPrinter(output_file=paraman["--output-file_objprinter"])
    has_failed = False
    if paraman["--verbose"]:
        daiquiri.setup(level=logging.DEBUG)
    else:
        daiquiri.setup(level=logging.INFO)

    try:
    lst_factors[-1] = np.random.rand(min(X.shape), X.shape[1])
    lst_factors[0] = np.eye(X.shape[0], min(X.shape))
    _lambda = 1.

    lst_proj_op_by_fac_step = []
    factor = 10
    nb_keep_values = factor * d
    for k in range(nb_factors - 1):
        nb_values_residual = max(nb_keep_values, int(d / 2 ** (k + 1)) * d)
        dct_step_lst_nb_keep_values = {
            "split": [get_lambda_proxsplincol(nb_keep_values),
                      get_lambda_proxsplincol(nb_values_residual)]
            if residual_on_right
            else [get_lambda_proxsplincol(nb_values_residual),
                  get_lambda_proxsplincol(nb_keep_values)],
            "finetune": [get_lambda_proxsplincol(nb_keep_values)] * (k + 1)
            + [get_lambda_proxsplincol(nb_values_residual)]
            if residual_on_right
            else [get_lambda_proxsplincol(nb_values_residual)]
            + [get_lambda_proxsplincol(nb_keep_values)] * (k + 1)
        }
        lst_proj_op_by_fac_step.append(dct_step_lst_nb_keep_values)

    logger.info("Sparsity parameter by factor: {}".format(pformat(lst_proj_op_by_fac_step)))

    # final_lambda, final_factors, final_X = PALM4LED(H, lst_factors, [nb_keep_values for _ in range(nb_factors)], _lambda, nb_iter)

    final_lambda, final_factors, final_X, nb_iter_by_factor, _ = hierarchical_palm4msa(
        arr_X_target=X,
        lst_S_init=lst_factors,
        lst_dct_projection_function=lst_proj_op_by_fac_step,
        f_lambda_init=_lambda,
        nb_iter=nb_iter,
        update_right_to_left=True,
        residual_on_right=residual_on_right,
        graphical_display=False)

    logger.info("Number of iteration for each factor: {}; Total: {}".format(
        nb_iter_by_factor, sum(nb_iter_by_factor)))

    visual_evaluation_palm4msa(X, lst_factors, final_factors, final_X)
def make_nystrom_evaluation(x_train, y_train, x_test, y_test, U_centroids): """ Evaluation Nystrom construction time and approximation precision. The approximation is based on a subsample of size n_sample of the input data set. :param x_train: Input dataset as ndarray. :param U_centroids: The matrix of centroids as ndarray or SparseFactor object :param n_sample: The number of sample to take into account in the reconstruction (can't be too large) :return: """ def prepare_nystrom(landmarks, landmarks_norm): basis_kernel_W = special_rbf_kernel(landmarks, landmarks, gamma, landmarks_norm, landmarks_norm) U, S, V = np.linalg.svd(basis_kernel_W) S = np.maximum(S, 1e-12) normalization_ = np.dot(U / np.sqrt(S), V) return normalization_ def nystrom_transformation(x_input, landmarks, p_metric, landmarks_norm, x_input_norm): nystrom_embedding = special_rbf_kernel(landmarks, x_input, gamma, landmarks_norm, x_input_norm).T @ p_metric return nystrom_embedding n_sample = paraman["--nystrom"] if n_sample > x_train.shape[0]: logger.warning( "Batch size for nystrom evaluation is bigger than data size. {} > {}. Using " "data size instead.".format(n_sample, x_train.shape[0])) n_sample = x_train.shape[0] paraman["--nystrom"] = n_sample # Compute euristic gamma as the mean of euclidian distance between example gamma = compute_euristic_gamma(x_train) log_memory_usage( "Memory after euristic gamma computation in make_nystrom_evaluation") # precompute the centroids norm for later use (optimization) centroids_norm = get_squared_froebenius_norm_line_wise(U_centroids) # centroids_norm = None indexes_samples = np.random.permutation(x_train.shape[0])[:n_sample] sample = x_train[indexes_samples] samples_norm = None log_memory_usage( "Memory after sample selection in make_nystrom_evaluation") ######################## # Nystrom on centroids # ######################## logger.info("Build Nystrom on centroids") ## TIME: nystrom build time # nystrom build time is Nystrom preparation time for later use. ## START nystrom_build_start_time = time.process_time() metric = prepare_nystrom(U_centroids, centroids_norm) nystrom_build_stop_time = time.process_time() log_memory_usage("Memory after SVD computation in make_nystrom_evaluation") # STOP nystrom_build_time = nystrom_build_stop_time - nystrom_build_start_time ## TIME: nystrom inference time # Nystrom inference time is the time for Nystrom transformation for all the samples. 
## START nystrom_inference_time_start = time.process_time() nystrom_embedding = nystrom_transformation(sample, U_centroids, metric, centroids_norm, samples_norm) nystrom_approx_kernel_value = nystrom_embedding @ nystrom_embedding.T nystrom_inference_time_stop = time.process_time() log_memory_usage( "Memory after kernel matrix approximation in make_nystrom_evaluation") ## STOP nystrom_inference_time = (nystrom_inference_time_stop - nystrom_inference_time_start) / n_sample ################################################################ ###################### # Nystrom on uniform # ###################### logger.info("Build Nystrom on uniform sampling") indexes_uniform_samples = np.random.permutation( x_train.shape[0])[:U_centroids.shape[0]] uniform_sample = x_train[indexes_uniform_samples] uniform_sample_norm = None log_memory_usage( "Memory after uniform sample selection in make_nystrom_evaluation") metric_uniform = prepare_nystrom(uniform_sample, uniform_sample_norm) log_memory_usage( "Memory after SVD computation in uniform part of make_nystrom_evaluation" ) nystrom_embedding_uniform = nystrom_transformation(sample, uniform_sample, metric_uniform, uniform_sample_norm, samples_norm) nystrom_approx_kernel_value_uniform = nystrom_embedding_uniform @ nystrom_embedding_uniform.T ################################################################# ############### # Real Kernel # ############### logger.info("Compute real kernel matrix") real_kernel = special_rbf_kernel(sample, sample, gamma, samples_norm, samples_norm) real_kernel_norm = np.linalg.norm(real_kernel) log_memory_usage( "Memory after real kernel computation in make_nystrom_evaluation") ################################################################ #################### # Error evaluation # #################### sampled_froebenius_norm = np.linalg.norm(nystrom_approx_kernel_value - real_kernel) / real_kernel_norm sampled_froebenius_norm_uniform = np.linalg.norm( nystrom_approx_kernel_value_uniform - real_kernel) / real_kernel_norm # svm evaluation if x_test is not None: logger.info("Start classification") time_classification_start = time.process_time() x_train_nystrom_embedding = nystrom_transformation( x_train, U_centroids, metric, centroids_norm, None) x_test_nystrom_embedding = nystrom_transformation( x_test, U_centroids, metric, centroids_norm, None) linear_svc_clf = LinearSVC() linear_svc_clf.fit(x_train_nystrom_embedding, y_train) accuracy_nystrom_svm = linear_svc_clf.score(x_test_nystrom_embedding, y_test) time_classification_stop = time.process_time() delta_time_classification = time_classification_stop - time_classification_start else: accuracy_nystrom_svm = None delta_time_classification = None nystrom_results = { "nystrom_build_time": nystrom_build_time, "nystrom_inference_time": nystrom_inference_time, "nystrom_sampled_error_reconstruction": sampled_froebenius_norm, "nystrom_sampled_error_reconstruction_uniform": sampled_froebenius_norm_uniform, "nystrom_svm_accuracy": accuracy_nystrom_svm, "nystrom_svm_time": delta_time_classification } resprinter.add(nystrom_results)
            K_nb_cluster=U_init.shape[0],
            nb_iter=paraman["--nb-iteration"],
            initialization=U_init,
            batch_size=paraman["--minibatch"])
    else:
        objective_values_k, final_centroids, indicator_vector_final = kmeans(
            X_data=X,
            K_nb_cluster=U_init.shape[0],
            nb_iter=paraman["--nb-iteration"],
            initialization=U_init)

    return final_centroids, indicator_vector_final


if __name__ == "__main__":
    logger.info("Command line: " + " ".join(sys.argv))
    log_memory_usage("Memory at startup")
    arguments = docopt.docopt(__doc__)
    paraman = ParameterManagerEfficientNystrom(arguments)
    initialized_results = dict((v, None) for v in lst_results_header)
    resprinter = ResultPrinter(output_file=paraman["--output-file_resprinter"])
    resprinter.add(initialized_results)
    resprinter.add(paraman)
    has_failed = False
    if paraman["-v"] >= 2:
        daiquiri.setup(level=logging.DEBUG)
    elif paraman["-v"] >= 1:
        daiquiri.setup(level=logging.INFO)
    else:
        daiquiri.setup(level=logging.WARNING)
def kmeans(X_data, K_nb_cluster, nb_iter, initialization,
           delta_objective_error_threshold=1e-6,
           proj_l1=False, _lambda=None, epsilon=None):
    """
    :param X_data: The data matrix of n examples in dimension d, in shape (n, d).
    :param K_nb_cluster: The number of clusters to look for.
    :param nb_iter: The maximum number of iterations.
    :param initialization: The (K, d) matrix of centroids at initialization.
    :param delta_objective_error_threshold: Stop when the normalized difference between the objective values of two successive iterations falls below this value.

    :return:
    """
    X_data_norms = get_squared_froebenius_norm_line_wise(X_data)

    # Initialize our centroids by picking random data points
    U_centroids_hat = copy.deepcopy(initialization)
    U_centroids = U_centroids_hat

    objective_function = np.empty((nb_iter,))

    # Loop for the maximum number of iterations
    i_iter = 0
    delta_objective_error = np.inf
    while (i_iter == 0) or ((i_iter < nb_iter)
                            and (delta_objective_error > delta_objective_error_threshold)):
        logger.info("Iteration Kmeans {}".format(i_iter))

        indicator_vector, distances = assign_points_to_clusters(X_data, U_centroids,
                                                                X_norms=X_data_norms)

        cluster_names, counts = np.unique(indicator_vector, return_counts=True)
        cluster_names_sorted = np.argsort(cluster_names)

        # Update centroid location using the new indicator vector
        counts, cluster_names_sorted = update_clusters_with_integrity_check(
            X_data, X_data_norms, U_centroids_hat, K_nb_cluster, counts,
            indicator_vector, distances, cluster_names, cluster_names_sorted)

        U_centroids = U_centroids_hat

        if proj_l1:
            if _lambda is None or epsilon is None:
                raise ValueError("epsilon and lambda must be set if proj_l1 is True")
            for i_centroid, centroid in enumerate(U_centroids):
                U_centroids[i_centroid, :] = proj_onto_l1_ball(_lambda=_lambda,
                                                               epsilon_tol=epsilon,
                                                               vec=centroid)

        objective_function[i_iter] = compute_objective(X_data, U_centroids, indicator_vector)

        if i_iter >= 1:
            delta_objective_error = np.abs(objective_function[i_iter]
                                           - objective_function[i_iter - 1]) \
                                    / objective_function[i_iter - 1]

        i_iter += 1

    return objective_function[:i_iter], U_centroids, indicator_vector
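# Usage sketch (assumes numpy and scikit-learn are available, as elsewhere in
# the project): running the k-means above on a small synthetic problem with a
# random-point initialization.
def _example_run_kmeans():
    import numpy as np
    from sklearn.datasets import make_blobs

    X, _ = make_blobs(n_samples=1000, n_features=2, centers=10)
    init = X[np.random.permutation(X.shape[0])[:10]]
    objective, centroids, indicator = kmeans(X, K_nb_cluster=10, nb_iter=20,
                                             initialization=init)
    return objective, centroids, indicator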
def kmeans_minibatch(X_data, K_nb_cluster, nb_iter, initialization, batch_size): """ :param X_data: The data matrix of n examples in dimensions d in shape (n, d). :param K_nb_cluster: The number of clusters to look for. :param nb_iter: The maximum number of iteration. :param initialization: The (K, d) matrix of centroids at initialization. :param batch_size: The size of each batch. :return: """ X_data_norms = get_squared_froebenius_norm_line_wise(X_data) # Initialize our centroids by picking random data points U_centroids_hat = copy.deepcopy(initialization) U_centroids = U_centroids_hat full_indicator_vector = np.zeros(X_data.shape[0], dtype=int) full_count_vector = np.zeros(K_nb_cluster, dtype=int) objective_function = np.empty((nb_iter, )) # Loop for the maximum number of iterations i_iter = 0 delta_objective_error_threshold = 1e-6 delta_objective_error = np.inf while True: for i_iter, example_batch_indexes in enumerate( DataGenerator(X_data, batch_size=batch_size, return_indexes=True)): if not (delta_objective_error > delta_objective_error_threshold): logger.info( "not (delta_objective_error {}-{}={} > delta_objective_error_threshold {})" .format(objective_function[i_iter], objective_function[i_iter - 1], delta_objective_error, delta_objective_error_threshold)) break example_batch = X_data[example_batch_indexes] logger.info("Iteration Kmeans {}".format(i_iter)) indicator_vector, distances = assign_points_to_clusters( example_batch, U_centroids, X_norms=X_data_norms[example_batch_indexes]) full_indicator_vector[example_batch_indexes] = indicator_vector cluster_names, counts = np.unique(indicator_vector, return_counts=True) # cluster_names_sorted = np.argsort(cluster_names) # count_vector = np.zeros(K_nb_cluster, dtype=int) count_vector[cluster_names] = counts full_count_vector += count_vector # previous_full_count_vector = full_count_vector - count_vector # Update centroid location using the newly # assigned data point classes # This way of updating the centroids (centroid index wise) is better than the one proposed in the paper "Web-Scale K-Means Clustering" # as the number of update with always be <= batch_size for c in range(K_nb_cluster): if full_count_vector[c] != 0 and count_vector[c] != 0: U_centroids_hat[c] += (1 / full_count_vector[c]) * np.sum( example_batch[indicator_vector == c] - U_centroids_hat[c], axis=0) # this is exactly equivalent to an update of the mean: # U_centroids_hat[c] = (previous_full_count_vector[c] / full_count_vector[c]) * U_centroids_hat[c] + (1 / full_count_vector[c]) * np.sum(example_batch[indicator_vector == c], axis=0) # for i_ex, ex in enumerate(example_batch): # c = indicator_vector[i_ex] # full_count_vector[c] += 1 # eta = 1./full_count_vector[c] # U_centroids_hat[c] = (1-eta) * U_centroids_hat[c] + eta * ex # counts, cluster_names_sorted = assess_clusters_integrity(X_data, # X_data_norms, # U_centroids_hat, # K_nb_cluster, # counts, # indicator_vector, # distances, # cluster_names, # cluster_names_sorted) # check if all clusters still have points # for c in range(K_nb_cluster): # biggest_cluster_index = np.argmax(counts) # type: int # biggest_cluster = cluster_names[biggest_cluster_index] # biggest_cluster_data = X_data[indicator_vector == biggest_cluster] # # cluster_data = X_data[indicator_vector == c] # if len(cluster_data) == 0: # logger.warning("cluster has lost data, add new cluster. 
cluster idx: {}".format(c)) # U_centroids_hat[c] = biggest_cluster_data[np.random.randint(len(biggest_cluster_data))].reshape(1, -1) # counts = list(counts) # counts[biggest_cluster_index] -= 1 # counts.append(1) # counts = np.array(counts) # cluster_names_sorted = list(cluster_names_sorted) # cluster_names_sorted.append(c) # cluster_names_sorted = np.array(cluster_names_sorted) # else: # U_centroids_hat[c] = np.mean(X_data[indicator_vector == c], 0) U_centroids = U_centroids_hat objective_function[i_iter, ] = compute_objective( X_data, U_centroids, full_indicator_vector) if i_iter >= 1: delta_objective_error = np.abs( objective_function[i_iter] - objective_function[i_iter - 1] ) / objective_function[ i_iter - 1] # todo vérifier que l'erreur absolue est plus petite que le threshold plusieurs fois d'affilée i_iter += 1 else: continue break return objective_function[:i_iter], U_centroids, indicator_vector
def make_nystrom_evaluation(x_train, y_train, x_test, y_test, U_centroids): """ Evaluation Nystrom construction time and approximation precision. The approximation is based on a subsample of size n_sample of the input data set. :param x_train: Input dataset as ndarray. :param U_centroids: The matrix of centroids as ndarray or SparseFactor object :param n_sample: The number of sample to take into account in the reconstruction (can't be too large) :return: """ n_sample = paraman["--nystrom"] if n_sample > x_train.shape[0]: logger.warning("Batch size for nystrom evaluation is bigger than data size. {} > {}. Using " "data size instead.".format(n_sample, x_train.shape[0])) n_sample = x_train.shape[0] paraman["--nystrom"] = n_sample # Compute euristic gamma as the mean of euclidian distance between example gamma = compute_euristic_gamma(x_train) log_memory_usage("Memory after euristic gamma computation in make_nystrom_evaluation") # precompute the centroids norm for later use (optimization) centroids_norm = get_squared_froebenius_norm_line_wise(U_centroids)[:, np.newaxis] # centroids_norm = None indexes_samples = np.random.permutation(x_train.shape[0])[:n_sample] sample = x_train[indexes_samples] samples_norm = None log_memory_usage("Memory after sample selection in make_nystrom_evaluation") ######################## # Nystrom on centroids # ######################## logger.info("Build Nystrom on centroids") ## TIME: nystrom build time # nystrom build time is Nystrom preparation time for later use. ## START nystrom_build_start_time = time.process_time() metric = prepare_nystrom(U_centroids, centroids_norm, gamma=gamma) nystrom_build_stop_time = time.process_time() log_memory_usage("Memory after SVD computation in make_nystrom_evaluation") # STOP nystrom_build_time = nystrom_build_stop_time - nystrom_build_start_time ## TIME: nystrom inference time # Nystrom inference time is the time for Nystrom transformation for all the samples. 
## START nystrom_inference_time_start = time.process_time() nystrom_embedding = nystrom_transformation(sample, U_centroids, metric, centroids_norm, samples_norm, gamma=gamma) nystrom_approx_kernel_value = nystrom_embedding @ nystrom_embedding.T nystrom_inference_time_stop = time.process_time() log_memory_usage("Memory after kernel matrix approximation in make_nystrom_evaluation") ## STOP nystrom_inference_time = (nystrom_inference_time_stop - nystrom_inference_time_start) / n_sample ################################################################ ###################### # Nystrom on uniform # ###################### logger.info("Build Nystrom on uniform sampling") indexes_uniform_samples = np.random.permutation(x_train.shape[0])[:U_centroids.shape[0]] uniform_sample = x_train[indexes_uniform_samples] uniform_sample_norm = get_squared_froebenius_norm_line_wise(uniform_sample)[:, np.newaxis] log_memory_usage("Memory after uniform sample selection in make_nystrom_evaluation") metric_uniform = prepare_nystrom(uniform_sample, uniform_sample_norm, gamma=gamma) log_memory_usage("Memory after SVD computation in uniform part of make_nystrom_evaluation") nystrom_embedding_uniform = nystrom_transformation(sample, uniform_sample, metric_uniform, uniform_sample_norm, samples_norm, gamma=gamma) nystrom_approx_kernel_value_uniform = nystrom_embedding_uniform @ nystrom_embedding_uniform.T ################################################################# ############### # Real Kernel # ############### logger.info("Compute real kernel matrix") real_kernel_special = special_rbf_kernel(sample, sample, gamma, norm_X=samples_norm, norm_Y=samples_norm) # real_kernel = rbf_kernel(sample, sample, gamma) real_kernel_norm = np.linalg.norm(real_kernel_special) log_memory_usage("Memory after real kernel computation in make_nystrom_evaluation") ################################# # Sklearn based Nystrom uniform # ################################# # sklearn_nystrom = Nystroem(gamma=gamma, n_components=uniform_sample.shape[0]) # sklearn_nystrom = sklearn_nystrom.fit(uniform_sample) # sklearn_transfo = sklearn_nystrom.transform(sample) # kernel_sklearn_nys = sklearn_transfo @ sklearn_transfo.T ################################################################ #################### # Error evaluation # #################### sampled_froebenius_norm = np.linalg.norm(nystrom_approx_kernel_value - real_kernel_special) / real_kernel_norm sampled_froebenius_norm_uniform = np.linalg.norm(nystrom_approx_kernel_value_uniform - real_kernel_special) / real_kernel_norm # svm evaluation if x_test is not None: logger.info("Start classification") time_classification_start = time.process_time() x_train_nystrom_embedding = nystrom_transformation(x_train, U_centroids, metric, centroids_norm, None, gamma=gamma) x_test_nystrom_embedding = nystrom_transformation(x_test, U_centroids, metric, centroids_norm, None, gamma=gamma) linear_svc_clf = LinearSVC(class_weight="balanced") linear_svc_clf.fit(x_train_nystrom_embedding, y_train) predictions = linear_svc_clf.predict(x_test_nystrom_embedding) time_classification_stop = time.process_time() if paraman["--kddcup04"]: # compute recall: nb_true_positive/real_nb_positive recall = np.sum(predictions[y_test == 1])/np.sum(y_test[y_test == 1]) # compute precision: nb_true_positive/nb_positive precision = np.sum(predictions[y_test == 1])/np.sum(predictions[predictions==1]) f1 = 2 * precision * recall / (precision + recall) accuracy_nystrom_svm = f1 else: accuracy_nystrom_svm = np.sum(predictions == y_test) / 
y_test.shape[0] delta_time_classification = time_classification_stop - time_classification_start else: accuracy_nystrom_svm = None delta_time_classification = None nystrom_results = { "nystrom_build_time": nystrom_build_time, "nystrom_inference_time": nystrom_inference_time, "nystrom_sampled_error_reconstruction": sampled_froebenius_norm, "nystrom_sampled_error_reconstruction_uniform": sampled_froebenius_norm_uniform, "nystrom_svm_accuracy": accuracy_nystrom_svm, "nystrom_svm_time": delta_time_classification } resprinter.add(nystrom_results)
def qmeans(X_data: np.ndarray, K_nb_cluster: int, nb_iter: int, nb_factors: int, params_palm4msa: dict, initialization: np.ndarray, hierarchical_inside=False, graphical_display=False): """ :param X_data: The data matrix of n examples in dimensions d in shape (n, d). :param K_nb_cluster: The number of clusters to look for. :param nb_iter: The maximum number of iteration. :param nb_factors: The number of factors for the decomposition. :param initialization: The initial matrix of centroids not yet factorized. :param params_palm4msa: The dictionnary of parameters for the palm4msa algorithm. :param hierarchical_inside: Tell the algorithm if the hierarchical version of palm4msa should be used. :param graphical_display: Tell the algorithm to display the results. :return: """ assert K_nb_cluster == initialization.shape[0] X_data_norms = get_squared_froebenius_norm_line_wise(X_data) init_lambda = params_palm4msa["init_lambda"] nb_iter_palm = params_palm4msa["nb_iter"] lst_proj_op_by_fac_step = params_palm4msa["lst_constraint_sets"] residual_on_right = params_palm4msa["residual_on_right"] X_centroids_hat = copy.deepcopy(initialization) min_K_d = min(X_centroids_hat.shape) lst_factors = [np.eye(min_K_d) for _ in range(nb_factors)] eye_norm = np.sqrt(K_nb_cluster) lst_factors[0] = np.eye(K_nb_cluster) / eye_norm lst_factors[1] = np.eye(K_nb_cluster, min_K_d) lst_factors[-1] = np.zeros((min_K_d, X_centroids_hat.shape[1])) if graphical_display: lst_factors_init = copy.deepcopy(lst_factors) _lambda_tmp, lst_factors, U_centroids, nb_iter_by_factor, objective_palm = hierarchical_palm4msa( arr_X_target=np.eye(K_nb_cluster) @ X_centroids_hat, lst_S_init=lst_factors, lst_dct_projection_function=lst_proj_op_by_fac_step, f_lambda_init=init_lambda * eye_norm, nb_iter=nb_iter_palm, update_right_to_left=True, residual_on_right=residual_on_right, graphical_display=False) _lambda = _lambda_tmp / eye_norm if graphical_display: if hierarchical_inside: plt.figure() plt.yscale("log") plt.scatter(np.arange(len(objective_palm) * 3, step=3), objective_palm[:, 0], marker="x", label="before split") plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 1, objective_palm[:, 1], marker="x", label="between") plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 2, objective_palm[:, 2], marker="x", label="after finetune") plt.plot(np.arange(len(objective_palm) * 3), objective_palm.flatten(), color="k") plt.legend() plt.show() visual_evaluation_palm4msa( np.eye(K_nb_cluster) @ X_centroids_hat, lst_factors_init, lst_factors, _lambda * multi_dot(lst_factors)) objective_function = np.empty((nb_iter, 2)) # Loop for the maximum number of iterations i_iter = 0 delta_objective_error_threshold = 1e-6 delta_objective_error = np.inf while (i_iter <= 1) or ( (i_iter < nb_iter) and (delta_objective_error > delta_objective_error_threshold)): logger.info("Iteration Qmeans {}".format(i_iter)) U_centroids = _lambda * multi_dot(lst_factors[1:]) if i_iter > 0: objective_function[i_iter, 0] = compute_objective(X_data, U_centroids, indicator_vector) # Assign all points to the nearest centroid # first get distance from all points to all centroids distances = get_distances(X_data, U_centroids, precomputed_data_points_norm=X_data_norms) # then, Determine class membership of each point # by picking the closest centroid indicator_vector = np.argmin(distances, axis=1) objective_function[i_iter, 1] = compute_objective(X_data, U_centroids, indicator_vector) # Update centroid location using the newly # assigned data point classes for c in 
range(K_nb_cluster): X_centroids_hat[c] = np.mean(X_data[indicator_vector == c], 0) # get the number of observation in each cluster cluster_names, counts = np.unique(indicator_vector, return_counts=True) cluster_names_sorted = np.argsort(cluster_names) if len(counts) < K_nb_cluster: raise ValueError( "Some clusters have no point. Aborting iteration {}".format( i_iter)) diag_counts_sqrt = np.diag(np.sqrt( counts[cluster_names_sorted])) # todo use sparse matrix object diag_counts_sqrt_norm = np.linalg.norm( diag_counts_sqrt ) # todo analytic sqrt(n) instead of cumputing it with norm diag_counts_sqrt_normalized = diag_counts_sqrt / diag_counts_sqrt_norm # set it as first factor lst_factors[0] = diag_counts_sqrt_normalized if graphical_display: lst_factors_init = copy.deepcopy(lst_factors) if hierarchical_inside: _lambda_tmp, lst_factors, _, nb_iter_by_factor, objective_palm = hierarchical_palm4msa( arr_X_target=diag_counts_sqrt @ X_centroids_hat, lst_S_init=lst_factors, lst_dct_projection_function=lst_proj_op_by_fac_step, # f_lambda_init=_lambda, f_lambda_init=_lambda * diag_counts_sqrt_norm, nb_iter=nb_iter_palm, update_right_to_left=True, residual_on_right=residual_on_right, graphical_display=False) loss_palm_before = objective_palm[0, 0] loss_palm_after = objective_palm[-1, -1] else: _lambda_tmp, lst_factors, _, objective_palm, nb_iter_palm = palm4msa( arr_X_target=diag_counts_sqrt @ X_centroids_hat, lst_S_init=lst_factors, nb_factors=len(lst_factors), lst_projection_functions=lst_proj_op_by_fac_step[-1] ["finetune"], f_lambda_init=_lambda * diag_counts_sqrt_norm, nb_iter=nb_iter_palm, update_right_to_left=True, graphical_display=False) loss_palm_before = objective_palm[0, -1] loss_palm_after = objective_palm[-1, -1] logger.debug("Loss palm before: {}".format(loss_palm_before)) logger.debug("Loss palm after: {}".format(loss_palm_after)) if graphical_display: if hierarchical_inside: plt.figure() plt.yscale("log") plt.scatter(np.arange(len(objective_palm) * 3, step=3), objective_palm[:, 0], marker="x", label="before split") plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 1, objective_palm[:, 1], marker="x", label="between") plt.scatter(np.arange(len(objective_palm) * 3, step=3) + 2, objective_palm[:, 2], marker="x", label="after finetune") plt.plot(np.arange(len(objective_palm) * 3), objective_palm.flatten(), color="k") plt.legend() plt.show() visual_evaluation_palm4msa(diag_counts_sqrt @ X_centroids_hat, lst_factors_init, lst_factors, _lambda_tmp * multi_dot(lst_factors)) _lambda = _lambda_tmp / diag_counts_sqrt_norm logger.debug("Returned loss (with diag) palm: {}".format( objective_palm[-1, 0])) if i_iter >= 2: delta_objective_error = np.abs( objective_function[i_iter, 0] - objective_function[i_iter - 1, 0] ) / objective_function[ i_iter - 1, 0] # todo vérifier que l'erreur absolue est plus petite que le threshold plusieurs fois d'affilée i_iter += 1 U_centroids = _lambda * multi_dot(lst_factors[1:]) distances = get_distances(X_data, U_centroids, precomputed_data_points_norm=X_data_norms) indicator_vector = np.argmin(distances, axis=1) return objective_function[:i_iter], U_centroids, indicator_vector
    nb_factors = 5
    sparsity_factor = 2
    nb_iter_palm = 300
    residual_on_right = False

    # lst_constraints, lst_constraints_vals = build_constraint_sets(U_centroids_hat.shape[0], U_centroids_hat.shape[1], nb_factors, sparsity_factor=sparsity_factor)
    K = U_centroids_hat.shape[0]
    d = U_centroids_hat.shape[1]
    lst_constraints, lst_constraints_vals = build_constraint_set_smart(
        K, d, nb_factors,
        sparsity_factor=sparsity_factor,
        residual_on_right=residual_on_right)
    logger.info("constraints: {}".format(pformat(lst_constraints_vals)))

    hierarchical_palm_init = {
        "init_lambda": 1.,
        "nb_iter": nb_iter_palm,
        "lst_constraint_sets": lst_constraints,
        "residual_on_right": residual_on_right
    }

    # try:
    objective_values_q_hier, centroids_finaux_q_hier, indicator_hier = qmeans(
        X, nb_clusters, nb_iter_kmeans, nb_factors, hierarchical_palm_init,
logger.info("Compute objective") if paraman["--minibatch"]: final_objective_value = compute_objective_by_batch( X_data, op_centroids, indicator_vector, paraman["--minibatch"]) else: final_objective_value = compute_objective(X_data, op_centroids, indicator_vector) resprinter.add({ "final_objective_value": final_objective_value, }) return final_objective_value if __name__ == "__main__": logger.info("Command line: " + " ".join(sys.argv)) log_memory_usage("Memory at startup") arguments = docopt.docopt(__doc__) paraman = ParameterManager(arguments) initialized_results = dict((v, None) for v in lst_results_header) resprinter = ResultPrinter(output_file=paraman["--output-file_resprinter"]) resprinter.add(initialized_results) resprinter.add(paraman) objprinter = ObjectiveFunctionPrinter( output_file=paraman["--output-file_objprinter"]) has_failed = False if paraman["-v"] >= 2: daiquiri.setup(level=logging.DEBUG) elif paraman["-v"] >= 1: daiquiri.setup(level=logging.INFO) else:
            i_iter += 1
        else:
            continue
        break

    return objective_function[:i_iter], U_centroids, indicator_vector


if __name__ == "__main__":
    n_samples = 1000
    n_features = 2
    n_centers = 500
    batch_size = 100
    nb_clust = 10
    nb_iter = 20

    X, _ = datasets.make_blobs(n_samples=n_samples, n_features=n_features, centers=n_centers)
    centroids_init = X[np.random.permutation(X.shape[0])[:nb_clust]]

    actual_nb_iter = (n_samples // batch_size) * nb_iter
    logger.info("Nb iteration: {}".format(actual_nb_iter))
    obj, _, _ = kmeans_minibatch(X, nb_clust, actual_nb_iter, centroids_init, batch_size)
    plt.plot(obj)
    plt.show()
def qmeans(X_data: np.ndarray,
           K_nb_cluster: int,
           nb_iter: int,
           nb_factors: int,
           params_palm4msa: dict,
           initialization: np.ndarray,
           hierarchical_inside=False,
           delta_objective_error_threshold=1e-6,
           hierarchical_init=False):
    """
    :param X_data: The data matrix of n examples in dimension d, in shape (n, d).
    :param K_nb_cluster: The number of clusters to look for.
    :param nb_iter: The maximum number of iterations.
    :param nb_factors: The number of factors for the decomposition.
    :param initialization: The initial matrix of centroids, not yet factorized.
    :param params_palm4msa: The dictionary of parameters for the palm4msa algorithm.
    :param hierarchical_inside: Tell the algorithm whether the hierarchical version of palm4msa should be used.
    :param delta_objective_error_threshold: Stop when the normalized difference between the objective values of two successive iterations falls below this value.
    :param hierarchical_init: Tell the algorithm whether the initialization of the sparse factors should use the hierarchical version of palm or not.

    :return:
    """
    assert K_nb_cluster == initialization.shape[0], \
        "The number of clusters {} is not equal to the number of centroids in the initialization {}.".format(
            K_nb_cluster, initialization.shape[0])

    X_data_norms = get_squared_froebenius_norm_line_wise(X_data)
    nb_examples = X_data.shape[0]

    logger.info("Initializing Qmeans")

    init_lambda = params_palm4msa["init_lambda"]
    nb_iter_palm = params_palm4msa["nb_iter"]
    lst_proj_op_by_fac_step = params_palm4msa["lst_constraint_sets"]
    residual_on_right = params_palm4msa["residual_on_right"]
    delta_objective_error_threshold_inner_palm = params_palm4msa["delta_objective_error_threshold"]
    track_objective_palm = params_palm4msa["track_objective"]

    X_centroids_hat = copy.deepcopy(initialization)

    lst_factors = init_lst_factors(K_nb_cluster, X_centroids_hat.shape[1], nb_factors)

    eye_norm = np.sqrt(K_nb_cluster)

    if hierarchical_inside or hierarchical_init:
        _lambda_tmp, op_factors, U_centroids, objective_palm, array_objective_hierarchical = \
            hierarchical_palm4msa(
                arr_X_target=np.eye(K_nb_cluster) @ X_centroids_hat,
                lst_S_init=lst_factors,
                lst_dct_projection_function=lst_proj_op_by_fac_step,
                f_lambda_init=init_lambda * eye_norm,
                nb_iter=nb_iter_palm,
                update_right_to_left=True,
                residual_on_right=residual_on_right,
                track_objective_palm=track_objective_palm,
                delta_objective_error_threshold_palm=delta_objective_error_threshold_inner_palm,
                return_objective_function=track_objective_palm)
    else:
        _lambda_tmp, op_factors, U_centroids, objective_palm, nb_iter_palm = \
            palm4msa(
                arr_X_target=np.eye(K_nb_cluster) @ X_centroids_hat,
                lst_S_init=lst_factors,
                nb_factors=len(lst_factors),
                lst_projection_functions=lst_proj_op_by_fac_step[-1]["finetune"],
                f_lambda_init=init_lambda * eye_norm,
                nb_iter=nb_iter_palm,
                update_right_to_left=True,
                track_objective=track_objective_palm,
                delta_objective_error_threshold=delta_objective_error_threshold_inner_palm)

    lst_factors = None  # safe assignment for debug

    _lambda = _lambda_tmp / eye_norm

    objective_function = np.ones(nb_iter) * -1

    lst_all_objective_functions_palm = []
    lst_all_objective_functions_palm.append(objective_palm)

    i_iter = 0
    delta_objective_error = np.inf
    while (i_iter < nb_iter) and (delta_objective_error > delta_objective_error_threshold):
        logger.info("Iteration Qmeans {}".format(i_iter))

        lst_factors_ = op_factors.get_list_of_factors()
        op_centroids = SparseFactors([lst_factors_[1] * _lambda] + lst_factors_[2:])

        ###########################
        # Cluster assignment step #
        ###########################
        indicator_vector, distances = assign_points_to_clusters(X_data, op_centroids,
                                                                X_norms=X_data_norms)

        #######################
        # Cluster update step #
        #######################

        # get the number of observations in each cluster
        cluster_names, counts = np.unique(indicator_vector, return_counts=True)
        cluster_names_sorted = np.argsort(cluster_names)

        # Update centroid locations using the newly assigned data point classes
        # (done inside update_clusters_with_integrity_check), check that all
        # clusters still have points, and modify X_centroids_hat in place if a
        # cluster has lost its points (reusing the biggest cluster).
        counts, cluster_names_sorted = update_clusters_with_integrity_check(
            X_data,
            X_data_norms,
            X_centroids_hat,  # in-place changes
            K_nb_cluster,
            counts,
            indicator_vector,
            distances,
            cluster_names,
            cluster_names_sorted)

        #################
        # PALM4MSA step #
        #################

        # create the diagonal of the sqrt of those counts
        diag_counts_sqrt_normalized = csr_matrix(
            (np.sqrt(counts[cluster_names_sorted] / nb_examples),
             (np.arange(K_nb_cluster), np.arange(K_nb_cluster))))
        diag_counts_sqrt = np.sqrt(counts[cluster_names_sorted])

        # set it as first factor
        op_factors.set_factor(0, diag_counts_sqrt_normalized)

        if hierarchical_inside:
            _lambda_tmp, op_factors, _, objective_palm, array_objective_hierarchical = \
                hierarchical_palm4msa(
                    arr_X_target=diag_counts_sqrt[:, None] * X_centroids_hat,
                    lst_S_init=op_factors.get_list_of_factors(),
                    lst_dct_projection_function=lst_proj_op_by_fac_step,
                    f_lambda_init=_lambda * np.sqrt(nb_examples),
                    nb_iter=nb_iter_palm,
                    update_right_to_left=True,
                    residual_on_right=residual_on_right,
                    return_objective_function=track_objective_palm,
                    track_objective_palm=track_objective_palm,
                    delta_objective_error_threshold_palm=delta_objective_error_threshold_inner_palm)
        else:
            _lambda_tmp, op_factors, _, objective_palm, nb_iter_palm = \
                palm4msa(
                    arr_X_target=diag_counts_sqrt[:, None] * X_centroids_hat,
                    lst_S_init=op_factors.get_list_of_factors(),
                    nb_factors=op_factors.n_factors,
                    lst_projection_functions=lst_proj_op_by_fac_step[-1]["finetune"],
                    f_lambda_init=_lambda * np.sqrt(nb_examples),
                    nb_iter=nb_iter_palm,
                    update_right_to_left=True,
                    track_objective=track_objective_palm,
                    delta_objective_error_threshold=delta_objective_error_threshold_inner_palm)

        lst_all_objective_functions_palm.append(objective_palm)

        _lambda = _lambda_tmp / np.sqrt(nb_examples)

        objective_function[i_iter] = compute_objective(X_data, op_centroids, indicator_vector)
        if i_iter >= 1:
            # todo: check that the relative error stays below the threshold several iterations in a row
            delta_objective_error = np.abs(objective_function[i_iter]
                                           - objective_function[i_iter - 1]) \
                                    / objective_function[i_iter - 1]

        i_iter += 1

    lst_factors_ = op_factors.get_list_of_factors()
    op_centroids = SparseFactors([lst_factors_[1] * _lambda] + lst_factors_[2:])

    return objective_function[:i_iter], op_centroids, indicator_vector, lst_all_objective_functions_palm
def hierarchical_palm4msa(arr_X_target: np.array,
                          lst_S_init: list,
                          lst_dct_projection_function: list,
                          nb_iter: int,
                          f_lambda_init: float = 1,
                          residual_on_right: bool = True,
                          update_right_to_left=True,
                          graphical_display=False):
    """
    lst_S_init contains the factors in decreasing index order (i.e. the order
    along which they are multiplied in the product).
    Example: for the product S5 S4 S3 S2 S1, lst_S_init[-j] = Sj.

    :param arr_X_target: The target to approximate.
    :param lst_S_init: The initial list of sparse factors. The factors are
        given right to left, in all cases. The number of factors is inferred
        from the length of this list.
    :param lst_dct_projection_function: The list of dictionaries of projection
        functions (keys "split" and "finetune"), one per factorization step.
    :param f_lambda_init: The initial scaling factor.
    :param nb_iter: The number of iterations before stopping.
    :param update_right_to_left: Way in which the factors are updated in the
        inner palm4msa algorithm. If update_right_to_left is True, the factors
        are updated right to left (i.e. the last factor in the list first);
        otherwise left to right.
    :param residual_on_right: During the split step, the residual can be
        computed as a right or left factor. If residual_on_right is True, the
        residuals are computed as right factors. We can also see this option
        as the update direction of the hierarchical strategy: when the
        residual is computed on the right, it corresponds to computing the
        last factor first (left to right according to the paper: the factor
        with the largest index first).
    :param graphical_display: Make a graphical representation of results.
    :return: The tuple (f_lambda, lst_S, arr_X_curr, lst_nb_iter_by_factor,
        objective_function).
    """
    if not update_right_to_left:
        raise NotImplementedError  # TODO: find out why it crashes... dimension mismatch

    arr_residual = arr_X_target

    lst_S = deepcopy(lst_S_init)

    nb_factors = len(lst_S)

    # check that lst_dct_projection_function contains a list of dicts with
    # the parameters for the "split" and "finetune" steps
    assert len(lst_dct_projection_function) == nb_factors - 1
    assert all(
        len({"split", "finetune"}.difference(dct.keys())) == 0
        for dct in lst_dct_projection_function)

    lst_nb_iter_by_factor = []

    f_lambda = f_lambda_init  # TODO: remove?

    objective_function = np.empty((nb_factors, 3))

    # main loop
    for k in range(nb_factors - 1):
        nb_factors_so_far = k + 1

        logger.info("Working on factor: {}".format(k))

        logger.info("Step split")

        objective_function[k, 0] = compute_objective_function(
            arr_X_target, f_lambda, lst_S)

        # compute the 2-factor decomposition of the previous residual
        if k == 0:
            f_lambda_init_split = f_lambda_init
        else:
            f_lambda_init_split = 1.

        func_split_step_palm4msa = lambda lst_S_init: palm4msa(
            arr_X_target=arr_residual,
            lst_S_init=lst_S_init,  # eye for factor and zeros for residual
            nb_factors=2,
            lst_projection_functions=lst_dct_projection_function[k]["split"],
            # define constraints: ||.||_0 = d for T1; relaxed constraint on ||.||_0 for T2
            f_lambda_init=f_lambda_init_split,
            nb_iter=nb_iter,
            update_right_to_left=update_right_to_left,
            graphical_display=graphical_display)

        if residual_on_right:
            residual_init = get_side_prod(lst_S_init[nb_factors_so_far:])
            S_init = lst_S_init[k]
            lst_S_init_split_step = [S_init, residual_init]
        else:
            residual_init = get_side_prod(lst_S_init[:-nb_factors_so_far])
            S_init = lst_S_init[-nb_factors_so_far]
            lst_S_init_split_step = [residual_init, S_init]

        if residual_on_right:
            f_lambda_prime, (
                new_factor, new_residual
            ), unscaled_residual_reconstruction, _, nb_iter_this_factor = func_split_step_palm4msa(
                lst_S_init=lst_S_init_split_step)
        else:
            f_lambda_prime, (
                new_residual, new_factor
            ), unscaled_residual_reconstruction, _, nb_iter_this_factor = func_split_step_palm4msa(
                lst_S_init=lst_S_init_split_step)

        if k == 0:
            f_lambda = f_lambda_prime
            # f_lambda = f_lambda
        else:
            f_lambda *= f_lambda_prime

        if residual_on_right:
            lst_S[k] = new_factor
        else:
            lst_S[nb_factors - nb_factors_so_far] = new_factor

        if graphical_display:
            plt.figure()
            plt.subplot(221)
            plt.title('Input residual Iteration {}, split step'.format(k))
            plt.imshow(arr_residual)
            plt.colorbar()

            plt.subplot(222)
            if residual_on_right:
                plt.imshow(f_lambda_prime * (new_factor @ new_residual))
                plt.title('lambda * new_factor @ new_residual')
            else:
                plt.imshow(f_lambda_prime * (new_residual @ new_factor))
                plt.title('lambda * new_residual @ new_factor')
            plt.colorbar()

            plt.subplot(223)
            plt.imshow(f_lambda_prime * new_factor)
            plt.colorbar()
            plt.title('lambda*new_factor')

            plt.subplot(224)
            plt.imshow(new_residual)
            plt.colorbar()
            plt.title('new_residual')

            plt.show()

        # get the k first elements [:k+1] and the next one (the (k+1)-th) as
        # arr_residual (depending on the residual_on_right option)
        logger.info("Step finetuning")

        objective_function[k, 1] = compute_objective_function(
            arr_X_target, f_lambda, lst_S)

        func_fine_tune_step_palm4msa = lambda lst_S_init: palm4msa(
            arr_X_target=arr_X_target,
            lst_S_init=lst_S_init,
            nb_factors=nb_factors_so_far + 1,
            lst_projection_functions=lst_dct_projection_function[k]["finetune"],
            f_lambda_init=f_lambda,
            nb_iter=nb_iter,
            update_right_to_left=update_right_to_left,
            graphical_display=graphical_display)

        if residual_on_right:
            f_lambda, (
                *lst_S[:nb_factors_so_far], arr_residual
            ), _, _, nb_iter_this_factor_bis = func_fine_tune_step_palm4msa(
                lst_S_init=lst_S[:nb_factors_so_far] + [new_residual])
        else:
            f_lambda, (
                arr_residual, *lst_S[-nb_factors_so_far:]
            ), _, _, nb_iter_this_factor_bis = func_fine_tune_step_palm4msa(
                lst_S_init=[new_residual] + lst_S[-nb_factors_so_far:])

        lst_nb_iter_by_factor.append(nb_iter_this_factor +
                                     nb_iter_this_factor_bis)

        objective_function[k, 2] = compute_objective_function(
            arr_X_target, f_lambda, lst_S)

        if graphical_display:
            plt.figure()
            plt.subplot(221)
            plt.title('Residual Iteration {}, finetune step'.format(k))
            plt.imshow(arr_residual)
            plt.colorbar()

            plt.subplot(222)
            plt.imshow(f_lambda * get_side_prod(lst_S[:nb_factors_so_far] +
                                                [arr_residual]))
            plt.colorbar()
            plt.title('reconstructed')

            plt.subplot(223)
            plt.imshow(lst_S[k])
            plt.colorbar()
            plt.title('current factor')

            plt.subplot(224)
            plt.imshow(arr_residual)
            plt.colorbar()
            plt.title('residual (right factor)')

            plt.show()

    # the last factor is the residual of the last palm4msa run
    if residual_on_right:
        lst_S[-1] = arr_residual
    else:
        lst_S[0] = arr_residual

    objective_function[nb_factors - 1, :] = np.array(
        [compute_objective_function(arr_X_target, f_lambda, lst_S)] * 3)

    if len(lst_S) == 1:
        arr_X_curr = f_lambda * lst_S[0]
    else:
        arr_X_curr = f_lambda * multi_dot(lst_S)

    return f_lambda, lst_S, arr_X_curr, lst_nb_iter_by_factor, objective_function
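# The objective_function array returned above has shape (nb_factors, 3): row k
# stores the objective value before the split step, after the split step and
# after the finetune step of factorization step k, and the last row repeats
# the final objective value three times. The helper below is only a minimal
# sketch showing how this log could be inspected; the name
# `print_hierarchical_objective` is hypothetical and not part of the code base.
def print_hierarchical_objective(objective_function):
    """Print the per-step objective log produced by hierarchical_palm4msa."""
    for k, (before_split, after_split, after_finetune) in enumerate(
            objective_function):
        print("step {}: before split={:.4e}, after split={:.4e}, "
              "after finetune={:.4e}".format(k, before_split, after_split,
                                             after_finetune))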
            graphical_display=False,
            track_objective=False)
    log_memory_usage(
        "Memory after palm on top of kmeans in process_palm_on_top_of_kmeans")
    _lambda = _lambda_tmp / eye_norm
    lst_factors_ = op_factors.get_list_of_factors()
    op_centroids = SparseFactors([lst_factors_[1] * _lambda] +
                                 lst_factors_[2:])
    return op_centroids


if __name__ == "__main__":
    logger.info("Command line: " + " ".join(sys.argv))
    log_memory_usage("Memory at startup")
    arguments = docopt.docopt(__doc__)
    paraman = ParameterManager(arguments)
    initialized_results = dict((v, None) for v in lst_results_header)
    resprinter = ResultPrinter(output_file=paraman["--output-file_resprinter"])
    resprinter.add(initialized_results)
    resprinter.add(paraman)
    objprinter = ObjectiveFunctionPrinter(
        output_file=paraman["--output-file_objprinter"])
    has_failed = False
    if paraman["--verbose"]:
        daiquiri.setup(level=logging.DEBUG)
    else:
        daiquiri.setup(level=logging.INFO)
                  n_features=n_features, centers=n_centers)
U_centroids_hat = X[np.random.permutation(X.shape[0])[:nb_clusters]]
# kmeans++ initialization is not feasible because complexity is O(ndk)...

residual_on_right = True
sparsity_factor = 2
nb_iter_palm = 30
delta_objective_error_threshold_in_palm = 1e-6
track_objective_in_palm = True

lst_constraints, lst_constraints_vals = build_constraint_set_smart(
    U_centroids_hat.shape[0],
    U_centroids_hat.shape[1],
    nb_factors,
    sparsity_factor=sparsity_factor,
    residual_on_right=residual_on_right)
logger.info("constraints: {}".format(pformat(lst_constraints_vals)))

hierarchical_palm_init = {
    "init_lambda": 1.,
    "nb_iter": nb_iter_palm,
    "lst_constraint_sets": lst_constraints,
    "residual_on_right": residual_on_right,
    "delta_objective_error_threshold": delta_objective_error_threshold_in_palm,
    "track_objective": track_objective_in_palm
}

logger.info('Running QuicK-means with H-Palm')
objective_function_with_hier_palm, op_centroids_hier, indicator_hier, lst_objective_function_hier_palm = \
    qmeans(X,
           nb_clusters,
def kmeans_minibatch(X_data,
                     K_nb_cluster,
                     nb_iter,
                     initialization,
                     batch_size,
                     delta_objective_error_threshold=1e-6,
                     proj_l1=False,
                     _lambda=None,
                     epsilon=None):
    """
    :param X_data: The data matrix of n examples in dimension d, in shape (n, d).
    :param K_nb_cluster: The number of clusters to look for.
    :param nb_iter: The maximum number of iterations.
    :param initialization: The (K, d) matrix of centroids at initialization.
    :param batch_size: The size of each batch.
    :param delta_objective_error_threshold: Stopping criterion: the algorithm
        stops as soon as the normalized difference between the error criterion
        at two successive iterations falls below this value.
    :param proj_l1: If True, project each centroid onto the l1 ball after the update.
    :param _lambda: Parameter of the l1 ball projection (required if proj_l1 is True).
    :param epsilon: Tolerance parameter of the l1 ball projection (required if proj_l1 is True).

    :return:
    """

    logger.debug("Compute squared Frobenius norm of data")
    X_data_norms = get_squared_froebenius_norm_line_wise_batch_by_batch(
        X_data, batch_size)

    # Initialize our centroids by picking random data points
    U_centroids = copy.deepcopy(initialization)

    objective_function = np.empty((nb_iter, ))

    total_nb_of_minibatch = X_data.shape[0] // batch_size

    # Loop for the maximum number of iterations
    i_iter = 0
    delta_objective_error = np.inf
    while i_iter < nb_iter and (delta_objective_error >
                                delta_objective_error_threshold):

        logger.info("Iteration number {}/{}".format(i_iter, nb_iter))

        # Prepare next epoch
        full_count_vector = np.zeros(K_nb_cluster, dtype=int)
        full_indicator_vector = np.zeros(X_data.shape[0], dtype=int)

        U_centroids_before = np.copy(U_centroids)
        U_centroids = np.zeros_like(U_centroids_before)

        for i_minibatch, example_batch_indexes in enumerate(
                DataGenerator(X_data,
                              batch_size=batch_size,
                              return_indexes=True)):
            logger.info(
                "Minibatch number {}/{}; Iteration number {}/{}".format(
                    i_minibatch, total_nb_of_minibatch, i_iter, nb_iter))

            example_batch = X_data[example_batch_indexes]
            example_batch_norms = X_data_norms[example_batch_indexes]

            indicator_vector, distances = assign_points_to_clusters(
                example_batch, U_centroids_before,
                X_norms=example_batch_norms)
            full_indicator_vector[example_batch_indexes] = indicator_vector

            cluster_names, counts = np.unique(indicator_vector,
                                              return_counts=True)
            count_vector = np.zeros(K_nb_cluster)
            count_vector[cluster_names] = counts

            full_count_vector = update_clusters(example_batch,
                                                U_centroids,
                                                K_nb_cluster,
                                                full_count_vector,
                                                count_vector,
                                                indicator_vector)

        # Update centroid location using the newly assigned data point classes
        if proj_l1:
            if _lambda is None or epsilon is None:
                raise ValueError(
                    "epsilon and lambda must be set if proj_l1 is True")
            for i_centroid, centroid in enumerate(U_centroids):
                U_centroids[i_centroid, :] = proj_onto_l1_ball(
                    _lambda=_lambda, epsilon_tol=epsilon, vec=centroid)

        objective_function[i_iter, ] = compute_objective_by_batch(
            X_data, U_centroids, full_indicator_vector, batch_size)

        if i_iter >= 1:
            delta_objective_error = np.abs(
                objective_function[i_iter] - objective_function[i_iter - 1]
            ) / objective_function[i_iter - 1]
            # TODO: check that the absolute error stays below the threshold
            # for several consecutive iterations

        i_iter += 1

    return objective_function[:i_iter], U_centroids, full_indicator_vector
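# Minimal usage sketch for kmeans_minibatch. It follows the pattern of the
# example scripts above (centroids initialized with randomly chosen data
# points); the dataset and the numeric values are illustrative assumptions.
import numpy as np
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=10000, n_features=20, centers=10)
nb_clusters = 10
init_centroids = X[np.random.permutation(X.shape[0])[:nb_clusters]]

objective_values, centroids, indicator = kmeans_minibatch(
    X_data=X,
    K_nb_cluster=nb_clusters,
    nb_iter=10,
    initialization=init_centroids,
    batch_size=1000)
print("final objective: {}".format(objective_values[-1]))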
lst_factors = [np.eye(d) for _ in range(nb_factors)]
lst_factors[-1] = np.zeros((d, d))
_lambda = 1.  # init the scaling factor at 1

# Create the projection operators for each factor
lst_proj_op_by_fac_step, lst_proj_op_by_fac_step_desc = build_constraint_set_smart(
    left_dim=d,
    right_dim=d,
    nb_factors=nb_factors,
    sparsity_factor=sparsity_factor,
    residual_on_right=True,
    fast_unstable_proj=False,
    constant_first=False)

logger.info(
    "Description of projection operators for each iteration of hierarchical_palm: \n{}"
    .format(pprint.pformat(lst_proj_op_by_fac_step_desc)))

print(np.__version__)

# Call the algorithm
final_lambda, final_factors, final_X, _, _ = hierarchical_palm4msa(
    arr_X_target=H,
    lst_S_init=lst_factors,
    lst_dct_projection_function=lst_proj_op_by_fac_step,
    f_lambda_init=_lambda,
    nb_iter=nb_iter,
    update_right_to_left=True,
    residual_on_right=True)

# Visualization utility
visual_evaluation_palm4msa(H, lst_factors, final_factors, final_X)
nb_clusters = 10
nb_iter_kmeans = 10
nb_factors = 5
U_centroids_hat = X[np.random.permutation(X.shape[0])[:nb_clusters]]
# kmeans++ initialization is not feasible because complexity is O(ndk)...

# Initialize palm4msa
sparsity_factor = 2
nb_iter_palm = 30
delta_objective_error_threshold_in_palm = 1e-6

# Create constraints for palm4msa
lst_constraints, lst_constraints_vals = build_constraint_set_smart(
    U_centroids_hat.shape[0],
    U_centroids_hat.shape[1],
    nb_factors,
    sparsity_factor=sparsity_factor,
    residual_on_right=True)
logger.info("Description of constraints: \n{}".format(
    pformat(lst_constraints_vals)))

hierarchical_palm_init = {
    "init_lambda": 1.,
    "nb_iter": nb_iter_palm,
    "lst_constraint_sets": lst_constraints,
    "residual_on_right": True,
    "delta_objective_error_threshold": delta_objective_error_threshold_in_palm,
    "track_objective": False
}

logger.info('Running QuicK-means with H-Palm')
# QKmeans with hierarchical palm4msa
objective_function_with_hier_palm, op_centroids_hier, indicator_hier, lst_objective_function_hier_palm = \
    qmeans(X,
def make_1nn_evaluation(x_train, y_train, x_test, y_test, U_centroids,
                        indicator_vector):
    """
    Do the 1-nearest neighbor classification using `x_train`, `y_train` as
    support and `x_test`, `y_test` as evaluation set.

    The scikit-learn classifiers (brute, kd_tree and ball_tree) are called
    only when the kmeans version of the program is run (for simplicity: so
    that they are not evaluated many times).

    Time is recorded.
    Classification accuracy is recorded.

    :param x_train: Train data set as ndarray.
    :param y_train: Train labels as categories in ndarray.
    :param x_test: Test data as ndarray.
    :param y_test: Test labels as categories.
    :param U_centroids: The matrix of centroids as ndarray or SparseFactors object.
    :param indicator_vector: The indicator vector for this matrix of centroids
        and this train data.

    :return:
    """

    def scikit_evaluation(str_type):
        """
        Do the scikit-learn version of nearest neighbor (used for comparison).

        :param str_type: The `algorithm` parameter of KNeighborsClassifier.
        :return: The inference time.
        """
        clf = KNeighborsClassifier(n_neighbors=1, algorithm=str_type)
        clf.fit(x_train, y_train)
        log_memory_usage(
            "Memory after definition of neighbors classifiers in scikit_evaluation of make_1nn_evaluation"
        )

        start_inference_time = time.time()
        predictions = np.empty_like(y_test)
        for obs_idx, obs_test in enumerate(x_test):
            predictions[obs_idx] = clf.predict(obs_test.reshape(1, -1))[0]
        stop_inference_time = time.time()
        log_memory_usage(
            "Memory after label assignation in scikit_evaluation of make_1nn_evaluation"
        )
        inference_time = (stop_inference_time - start_inference_time)

        accuracy = np.sum(predictions == y_test) / y_test.shape[0]

        results_1nn = {
            "1nn_{}_inference_time".format(str_type): inference_time,
            "1nn_{}_accuracy".format(str_type): accuracy
        }
        resprinter.add(results_1nn)
        return inference_time

    def kmean_tree_evaluation():
        """
        Do the K-means partitioning version of nearest neighbor.

        :return: The inference time.
        """
        # for each cluster, there is a sub nearest neighbor classifier for
        # the points in that cluster
        lst_clf_by_cluster = [
            KNeighborsClassifier(n_neighbors=1, algorithm="brute").fit(
                x_train[indicator_vector == i],
                y_train[indicator_vector == i])
            for i in range(U_centroids.shape[0])
        ]

        log_memory_usage(
            "Memory after definition of neighbors classifiers in kmean_tree_evaluation of make_1nn_evaluation"
        )

        # precomputed_centroid_norms = get_squared_froebenius_norm(landmarks)
        precomputed_centroid_norms = None

        start_inference_time = time.time()
        distances = get_distances(
            x_test,
            U_centroids,
            precomputed_centroids_norm=precomputed_centroid_norms)
        stop_get_distances_time = time.time()
        get_distance_time = stop_get_distances_time - start_inference_time
        log_memory_usage(
            "Memory after distances computation with clusters in kmean_tree_evaluation of make_1nn_evaluation"
        )

        indicator_vector_test = np.argmin(distances, axis=1)

        predictions = np.empty_like(y_test)
        for obs_idx, obs_test in enumerate(x_test):
            # get the cluster to which this data point belongs and call the
            # associated nearest neighbor classifier
            idx_cluster = indicator_vector_test[obs_idx]
            clf_cluster = lst_clf_by_cluster[idx_cluster]
            predictions[obs_idx] = clf_cluster.predict(
                obs_test.reshape(1, -1))[0]
        stop_inference_time = time.time()
        log_memory_usage(
            "Memory after label assignation in kmean_tree_evaluation of make_1nn_evaluation"
        )
        inference_time = (stop_inference_time - start_inference_time)

        accuracy = np.sum(predictions == y_test) / y_test.shape[0]

        results_1nn = {
            "1nn_kmean_inference_time": inference_time,
            "1nn_get_distance_time": get_distance_time / x_test.shape[0],
            "1nn_kmean_accuracy": accuracy
        }
        resprinter.add(results_1nn)
        return inference_time

    logger.info("1 nearest neighbor with k-means search")
    kmean_tree_time = kmean_tree_evaluation()

    # if paraman["kmeans"]:
    lst_knn_types = ["brute", "ball_tree", "kd_tree"]
    for knn_type in lst_knn_types:
        # the classification must not take more than 10 times the time taken
        # by the k-means 1-nn classification, otherwise it is stopped
        signal.signal(signal.SIGALRM, timeout_signal_handler)
        signal.alarm(int(kmean_tree_time * 10))  # start alarm
        try:
            logger.info(
                "1 nearest neighbor with {} search".format(knn_type))
            scikit_evaluation(knn_type)
        except TimeoutError as te:
            logger.warning(
                "Timeout during execution of 1-nn with {} version: {}".format(
                    knn_type, te))
        signal.alarm(0)  # stop alarm for next evaluation
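# The timeout mechanism above relies on a SIGALRM handler that raises
# TimeoutError. The sketch below is a minimal, self-contained illustration of
# that pattern (Unix only): the handler name mirrors `timeout_signal_handler`
# used above, but its body and the `run_with_time_budget` helper are
# assumptions for illustration, not the project's implementation.
import signal


def timeout_signal_handler(signum, frame):
    # Raise so that the surrounding try/except can abort the slow evaluation.
    raise TimeoutError("evaluation exceeded its time budget")


def run_with_time_budget(fn, budget_in_seconds):
    """Run `fn()` but abort it with TimeoutError after `budget_in_seconds`."""
    signal.signal(signal.SIGALRM, timeout_signal_handler)
    signal.alarm(int(budget_in_seconds))  # start alarm
    try:
        return fn()
    finally:
        signal.alarm(0)  # always disarm the alarm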
def qkmeans_minibatch(X_data: np.ndarray,
                      K_nb_cluster: int,
                      nb_iter: int,
                      nb_factors: int,
                      params_palm4msa: dict,
                      initialization: np.ndarray,
                      batch_size: int,
                      hierarchical_inside=False,
                      delta_objective_error_threshold=1e-6,
                      hierarchical_init=False):
    """
    :param X_data: The data matrix of n examples in dimension d, in shape (n, d).
    :param K_nb_cluster: The number of clusters to look for.
    :param nb_iter: The maximum number of iterations.
    :param nb_factors: The number of factors for the decomposition.
    :param initialization: The initial matrix of centroids, not yet factorized.
    :param params_palm4msa: The dictionary of parameters for the palm4msa algorithm.
    :param hierarchical_inside: Tells the algorithm whether the hierarchical
        version of palm4msa should be used at each iteration.
    :param delta_objective_error_threshold: Stopping criterion: the algorithm
        stops as soon as the normalized difference between the objective
        values at two successive iterations falls below this value.
    :param hierarchical_init: Tells the algorithm whether the initialization
        of the sparse factors should be done with the hierarchical version of
        palm4msa or not.
    :param batch_size: The size of each batch.

    :return:
    """
    assert K_nb_cluster == initialization.shape[0]

    logger.debug("Compute squared Frobenius norm of data")
    X_data_norms = get_squared_froebenius_norm_line_wise_batch_by_batch(
        X_data, batch_size)
    nb_examples = X_data.shape[0]
    total_nb_of_minibatch = X_data.shape[0] // batch_size

    X_centroids_hat = copy.deepcopy(initialization)

    # ################################ INIT PALM4MSA ###############################
    logger.info("Initializing QKmeans with PALM algorithm")

    lst_factors = init_lst_factors(K_nb_cluster, X_centroids_hat.shape[1],
                                   nb_factors)

    eye_norm = np.sqrt(K_nb_cluster)

    ##########################
    # GET PARAMS OF PALM4MSA #
    ##########################
    init_lambda = params_palm4msa["init_lambda"]
    nb_iter_palm = params_palm4msa["nb_iter"]
    lst_proj_op_by_fac_step = params_palm4msa["lst_constraint_sets"]
    residual_on_right = params_palm4msa["residual_on_right"]
    delta_objective_error_threshold_inner_palm = params_palm4msa[
        "delta_objective_error_threshold"]
    track_objective_palm = params_palm4msa["track_objective"]

    ####################
    # INIT RUN OF PALM #
    ####################
    if hierarchical_inside or hierarchical_init:
        _lambda_tmp, op_factors, _, objective_palm, array_objective_hierarchical = \
            hierarchical_palm4msa(
                arr_X_target=np.eye(K_nb_cluster) @ X_centroids_hat,
                lst_S_init=lst_factors,
                lst_dct_projection_function=lst_proj_op_by_fac_step,
                f_lambda_init=init_lambda * eye_norm,
                nb_iter=nb_iter_palm,
                update_right_to_left=True,
                residual_on_right=residual_on_right,
                track_objective_palm=track_objective_palm,
                delta_objective_error_threshold_palm=delta_objective_error_threshold_inner_palm,
                return_objective_function=track_objective_palm)
    else:
        _lambda_tmp, op_factors, _, objective_palm, nb_iter_palm = \
            palm4msa(
                arr_X_target=np.eye(K_nb_cluster) @ X_centroids_hat,
                lst_S_init=lst_factors,
                nb_factors=len(lst_factors),
                lst_projection_functions=lst_proj_op_by_fac_step[-1][
                    "finetune"],
                f_lambda_init=init_lambda * eye_norm,
                nb_iter=nb_iter_palm,
                update_right_to_left=True,
                track_objective=track_objective_palm,
                delta_objective_error_threshold=delta_objective_error_threshold_inner_palm)

    # ################################################################

    lst_factors = None  # safe assignment for debug

    _lambda = _lambda_tmp / eye_norm

    objective_function = np.ones(nb_iter) * -1
    lst_all_objective_functions_palm = []
    lst_all_objective_functions_palm.append(objective_palm)

    i_iter = 0
    delta_objective_error = np.inf
    while ((i_iter < nb_iter)
           and (delta_objective_error > delta_objective_error_threshold)):

        logger.info("Iteration number {}/{}".format(i_iter, nb_iter))

        # Re-init palm factors for iteration
        lst_factors_ = op_factors.get_list_of_factors()
        op_centroids = SparseFactors([lst_factors_[1] * _lambda] +
                                     lst_factors_[2:])

        # Prepare next epoch
        full_count_vector = np.zeros(K_nb_cluster, dtype=int)
        full_indicator_vector = np.zeros(X_data.shape[0], dtype=int)

        X_centroids_hat = np.zeros_like(X_centroids_hat)

        for i_minibatch, example_batch_indexes in enumerate(
                DataGenerator(X_data,
                              batch_size=batch_size,
                              return_indexes=True)):
            logger.info(
                "Minibatch number {}/{}; Iteration number {}/{}".format(
                    i_minibatch, total_nb_of_minibatch, i_iter, nb_iter))
            example_batch = X_data[example_batch_indexes]
            example_batch_norms = X_data_norms[example_batch_indexes]

            ##########################
            # Update centroid oracle #
            ##########################
            indicator_vector, distances = assign_points_to_clusters(
                example_batch, op_centroids, X_norms=example_batch_norms)
            full_indicator_vector[example_batch_indexes] = indicator_vector

            cluster_names, counts = np.unique(indicator_vector,
                                              return_counts=True)
            count_vector = np.zeros(K_nb_cluster)
            count_vector[cluster_names] = counts

            full_count_vector = update_clusters(example_batch,
                                                X_centroids_hat,
                                                K_nb_cluster,
                                                full_count_vector,
                                                count_vector,
                                                indicator_vector)

        objective_function[i_iter] = compute_objective_by_batch(
            X_data, op_centroids, full_indicator_vector, batch_size)

        # inplace modification of X_centroids_hat, full_count_vector and
        # full_indicator_vector
        check_cluster_integrity(X_data, X_centroids_hat, K_nb_cluster,
                                full_count_vector, full_indicator_vector)

        #########################
        # Do palm for iteration #
        #########################
        # create the diagonal of the sqrt of those counts
        diag_counts_sqrt_normalized = csr_matrix(
            (np.sqrt(full_count_vector / nb_examples),
             (np.arange(K_nb_cluster), np.arange(K_nb_cluster))))
        diag_counts_sqrt = np.sqrt(full_count_vector)

        # set it as first factor
        op_factors.set_factor(0, diag_counts_sqrt_normalized)

        if hierarchical_inside:
            _lambda_tmp, op_factors, _, objective_palm, array_objective_hierarchical = \
                hierarchical_palm4msa(
                    arr_X_target=diag_counts_sqrt[:, None] * X_centroids_hat,
                    lst_S_init=op_factors.get_list_of_factors(),
                    lst_dct_projection_function=lst_proj_op_by_fac_step,
                    f_lambda_init=_lambda * np.sqrt(nb_examples),
                    nb_iter=nb_iter_palm,
                    update_right_to_left=True,
                    residual_on_right=residual_on_right,
                    return_objective_function=track_objective_palm,
                    track_objective_palm=track_objective_palm,
                    delta_objective_error_threshold_palm=delta_objective_error_threshold_inner_palm)
        else:
            _lambda_tmp, op_factors, _, objective_palm, nb_iter_palm = \
                palm4msa(
                    arr_X_target=diag_counts_sqrt[:, None] * X_centroids_hat,
                    lst_S_init=op_factors.get_list_of_factors(),
                    nb_factors=op_factors.n_factors,
                    lst_projection_functions=lst_proj_op_by_fac_step[-1][
                        "finetune"],
                    f_lambda_init=_lambda * np.sqrt(nb_examples),
                    nb_iter=nb_iter_palm,
                    update_right_to_left=True,
                    track_objective=track_objective_palm,
                    delta_objective_error_threshold=delta_objective_error_threshold_inner_palm)

        _lambda = _lambda_tmp / np.sqrt(nb_examples)

        ############################

        lst_all_objective_functions_palm.append(objective_palm)

        if i_iter >= 1:
            delta_objective_error = np.abs(
                objective_function[i_iter] - objective_function[i_iter - 1]
            ) / objective_function[i_iter - 1]
            # TODO: check that the absolute error stays below the threshold
            # for several consecutive iterations

        i_iter += 1

    op_centroids = SparseFactors([lst_factors_[1] * _lambda] +
                                 lst_factors_[2:])

    return objective_function[:i_iter], op_centroids, full_indicator_vector, \
        lst_all_objective_functions_palm
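# Minimal usage sketch for qkmeans_minibatch, assembling the palm4msa parameter
# dictionary the same way as the QuicK-means script fragments above (same keys
# as `hierarchical_palm_init`). The dataset and numeric values below are
# illustrative assumptions only.
import numpy as np
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=10000, n_features=20, centers=10)
nb_clusters = 10
nb_factors = 5
init_centroids = X[np.random.permutation(X.shape[0])[:nb_clusters]]

lst_constraints, _ = build_constraint_set_smart(
    init_centroids.shape[0],
    init_centroids.shape[1],
    nb_factors,
    sparsity_factor=2,
    residual_on_right=True)

params_palm4msa = {
    "init_lambda": 1.,
    "nb_iter": 30,
    "lst_constraint_sets": lst_constraints,
    "residual_on_right": True,
    "delta_objective_error_threshold": 1e-6,
    "track_objective": False
}

objective_values, op_centroids, indicator, palm_objectives = qkmeans_minibatch(
    X_data=X,
    K_nb_cluster=nb_clusters,
    nb_iter=10,
    nb_factors=nb_factors,
    params_palm4msa=params_palm4msa,
    initialization=init_centroids,
    batch_size=1000)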