def tes_decompose_M_eigsh(self):
    """Tests the whole framework and the resulting SVD/Eigendecomposition of M.

    For a grid of synthetic problem sizes, seeds and lambda values, builds a
    dense M via ``self.get_dense_M`` and checks that the custom
    ``Operations`` implementations agree with dense NumPy references:

    * ``decompose_M_eigsh(k)`` vs. the top-k singular triplets of dense M,
    * ``multiply_by_V_left`` / ``multiply_by_V_T_left`` vs. dense ``vecs @ v``
      and ``vecs.T @ v``,
    * ``multiply_by_M2_left`` vs. a dense ``dense_M2 @ v``,
    * ``decompose_M2_eigsh(k)`` vs. the top-k singular triplets of dense M2.

    Eigenvector comparisons are done on absolute values since eigenvectors
    are only defined up to sign.
    """
    # NOTE(review): the name looks like a typo for "test_decompose_M_eigsh" —
    # as written, unittest/pytest discovery will skip this method. Confirm and
    # rename if so (kept as-is here to avoid changing the public name).

    # Each index i across these parallel lists is one problem configuration.
    num_documents_per_language_list = [[13], [4], [12, 22, 10], [132, 123, 123], [123, 120, 130]]  # controls Z.shape[0], Y.shape[0] and n
    voc_size_per_lang_list = [[6], [10], [23, 30, 4], [40, 70, 50], [123, 20, 120]]  # controls Z.shape[1]
    num_concepts_list = [3, 4, 15, 100, 130]  # controls Y.shape[1]
    first_k_list = [1, 2, 6, 50, 30]  # how many leading eigenpairs to request
    lambda_values = [1, 0.33, 1.66]
    seeds_z = [12345, 1111, 222222]  # generates different Z's
    seeds_y = [1, 123123123, 50000]  # generates different Y's
    seeds_v = [123, 1111, 22222, 333333]  # generates different v's for the multiplication test
    for i in range(len(num_documents_per_language_list)):
        print("Starting %d" % i)
        num_docs_per_lang = num_documents_per_language_list[i]
        voc_size_per_lang = voc_size_per_lang_list[i]
        num_concepts = num_concepts_list[i]  # dimensions of vector v
        total_vocabulary = np.sum(voc_size_per_lang)
        k = first_k_list[i]
        for seed_z in seeds_z:
            for seed_y in seeds_y:
                data_obj = self.generate_data_obj(num_concepts, num_docs_per_lang, voc_size_per_lang, seed_y = seed_y, seed_z = seed_z)
                for lambda_ in lambda_values:
                    operations_obj = Operations(data_obj, lambda_)
                    dense_M = self.get_dense_M(operations_obj)
                    # Dense reference decomposition of M. eigsh returns
                    # eigenpairs in ascending order, while svd returns
                    # singular values descending — flip to align the two.
                    u, s, vh = np.linalg.svd(dense_M, full_matrices=False)
                    e_vals = s[:k]
                    e_vals = np.flip(e_vals, axis=0)
                    e_vecs = np.flip(vh[:k].T, axis=1)
                    vals, vecs = operations_obj.decompose_M_eigsh(k)
                    self.assertTrue(np.allclose(vals, e_vals))
                    # Compare magnitudes only: eigenvector sign is arbitrary.
                    self.assertTrue(np.allclose(np.abs(e_vecs), np.abs(vecs)))
                    ## Test multiply_by_V
                    for seed_v in seeds_v:
                        v = self.generate_v(k, seed_v)
                        dense_multiply = vecs @ v
                        custom_multiply = operations_obj.multiply_by_V_left(v)
                        self.assertTrue(np.allclose(custom_multiply, dense_multiply))
                    ## Test multiply_by_V_T
                    for seed_v in seeds_v:
                        v = self.generate_v(num_concepts, seed_v)
                        dense_multiply = vecs.T @ v
                        custom_multiply = operations_obj.multiply_by_V_T_left(v)
                        self.assertTrue(np.allclose(custom_multiply, dense_multiply))
                    ## Test multiply_by_M2
                    dense_M2 = self.get_dense_M2(operations_obj)
                    for seed_v in seeds_v:
                        v = self.generate_v(total_vocabulary, seed_v)
                        dense_multiply = dense_M2 @ v
                        custom_multiply = Operations.multiply_by_M2_left(v, operations_obj)
                        self.assertTrue(np.allclose(custom_multiply, dense_multiply))
                    ## Test M2 eigendecomposition
                    u, s, vh = np.linalg.svd(dense_M2, full_matrices=False)
                    e_vals = s[:k]
                    e_vals = np.flip(e_vals, axis=0)
                    e_vecs = np.flip(vh[:k].T, axis=1)
                    vals, vecs = operations_obj.decompose_M2_eigsh(k)
                    self.assertTrue(np.allclose(vals, e_vals))
                    self.assertTrue(np.allclose(np.abs(e_vecs), np.abs(vecs)))
def _decompose_with_fallback(self, decompose, dims, eigs_max_iter, eigs_tol):
    """Run a decomposition, retrying once in fallback mode on ArpackError.

    ``decompose`` is one of ``operations_obj.decompose_M_eigsh`` /
    ``decompose_M2_eigsh``; on a first ``ArpackError`` it is retried with the
    extra fallback flag set to ``True``. Returns the ``(vals, vecs)`` tuple,
    or ``None`` if both attempts fail.
    """
    try:
        return decompose(dims, eigs_max_iter, eigs_tol)
    except ArpackError as e:
        try:
            print("ERROR occured!")
            print(e)
            # Retry with the fallback flag enabled.
            return decompose(dims, eigs_max_iter, eigs_tol, True)
        except ArpackError:
            print("FAIL! Can't complete the decomposition!")
            return None

def run_experiment(self):
    """Completes an actual run of the full pipeline, with the parameters
    corresponding to the arguments passed in the constructor.

    Loads the training data, eigendecomposes M and M2 (each with one
    fallback retry on ArpackError), and pickles intermediate and final
    results to ``self.results_dump_path`` after each stage so a crash in
    stage two still leaves stage-one output on disk. Returns early (None)
    if either decomposition fails twice.
    """
    params = self.params
    print(params)
    cg_max_iter = 500
    eigs_max_iter = 250
    training_concepts_file_name = params['training_concepts_file_name']
    validation_set_file_name = params['validation_set_file_name']
    case_folding_flag = params['case_folding_flag']
    _lambda = params['lambda']
    # Tolerances are given as negative powers of ten, e.g. 3 -> 1e-3.
    cg_tol_1 = 10**(-1 * params['cg_tol_1'])
    eigs_tol_1 = 10**(-1 * params['eigs_tol_1'])
    # Same for now — computed but not used below; kept for planned stage-two
    # tolerances. TODO(review): confirm whether the second decomposition
    # should use cg_tol_2/eigs_tol_2 instead of the *_1 values.
    cg_tol_2 = 10**(-1 * params['cg_tol_2'])
    eigs_tol_2 = 10**(-1 * params['eigs_tol_2'])
    dims = params['dimensions']
    vocabulary_size = params['vocabulary_size']

    data_obj = Data()
    data_obj.load_training(training_concepts_file_name, validation_set_file_name,
                           case_folding_flag, vocabulary_size)
    operations_obj = Operations(data_obj, _lambda, cg_max_iter, cg_tol_1)

    # --- Stage one: decompose M ---
    start = default_timer()
    outcome = self._decompose_with_fallback(
        operations_obj.decompose_M_eigsh, dims, eigs_max_iter, eigs_tol_1)
    if outcome is None:
        return
    vals, vecs = outcome
    end = default_timer()
    time_elapsed = end - start
    print("Finished decomposition one: ", time_elapsed)

    training_outcome = {}
    training_outcome['e_vals'] = vals
    training_outcome['e_vecs'] = vecs
    results_obj = {}
    results_obj['training_outcome'] = training_outcome
    results_obj['parameters'] = params
    results_obj['data'] = data_obj.final_dataset_dump_name
    # Dump stage-one results immediately so they survive a stage-two failure.
    with open(self.results_dump_path, 'wb') as f:
        pickle.dump(results_obj, f, protocol=4)

    # --- Stage two: decompose M2 ---
    start = default_timer()
    outcome = self._decompose_with_fallback(
        operations_obj.decompose_M2_eigsh, dims, eigs_max_iter, eigs_tol_1)
    if outcome is None:
        return
    vals_m2, vecs_m2 = outcome
    print(vals_m2)  # Visual sanity check
    end = default_timer()
    time_elapsed = end - start
    print("Finished decomposition two: ", time_elapsed)

    training_outcome['M2_e_vals'] = vals_m2
    training_outcome['M2_e_vecs'] = vecs_m2
    training_outcome['num_iter'] = operations_obj.num_iter
    training_outcome['num_iter2'] = operations_obj.num_iter2
    training_outcome['time_consumed'] = operations_obj.time_consumed
    results_obj['training_outcome'] = training_outcome
    with open(self.results_dump_path, 'wb') as f:
        pickle.dump(results_obj, f, protocol=4)

    self.logger.revert_standard_output()
    self.logger.log_run()