def tes_multiply_by_B_left(self):
    """Tests the functionality of multiplying a vector with A from the left.

    The tested routine is used in the CG method. For each configuration a
    dense reference matrix B is built and the custom sparse product is
    compared against the dense product with ``np.allclose``.
    """
    # Parallel lists: entry i of each list defines one test configuration.
    num_documents_per_language_list = [[13], [4], [12, 22, 10], [132, 123, 123], [123, 120, 130]]  # controls Z.shape[0] and the value of n
    voc_size_per_lang_list = [[6], [10], [23, 30, 4], [40, 70, 50], [123, 20, 120]]  # controls Z.shape[1]
    num_concepts_list = np.ones_like(voc_size_per_lang_list)  # dummy used to create the data object
    lambda_values = [1, 0.5, 0.33, 1.66, 2]
    seeds_z = [12345, 1111, 222222]  # generates different Z's
    seeds_v = [123, 1111, 22222, 333333]  # generates different v's
    for num_docs_per_lang, voc_size_per_lang, num_concepts in zip(
            num_documents_per_language_list, voc_size_per_lang_list, num_concepts_list):
        total_vocabulary = np.sum(voc_size_per_lang)  # dimension of vector v
        for seed_z in seeds_z:
            data_obj = self.generate_data_obj(num_concepts, num_docs_per_lang,
                                              voc_size_per_lang, seed_z=seed_z)
            for lambda_ in lambda_values:
                operations_obj = Operations(data_obj, lambda_)
                # B depends only on operations_obj, so build it once per lambda.
                dense_B = self.get_dense_B(operations_obj)
                for seed_v in seeds_v:
                    v = self.generate_v(total_vocabulary, seed_v)
                    dense_multiply = dense_B @ v
                    v = v.reshape(v.shape[0])  # flatten to 1-D for the custom routine
                    custom_multiply = Operations.multiply_by_B_left(v, operations_obj)
                    self.assertTrue(np.allclose(custom_multiply,
                                                dense_multiply.reshape(v.shape[0])))
def tes_multiply_by_M(self):
    """Tests the functionality of multiplying a vector by M from the left.

    The tested routine is used by the iterative method for finding the
    SVD/Eigendecomposition of M. A dense reference M is built per
    configuration and the custom product is compared with ``np.allclose``.
    """
    # Parallel lists: entry i of each list defines one test configuration.
    num_documents_per_language_list = [[13], [4], [12, 22, 10], [132, 123, 123], [123, 120, 130]]  # controls Z.shape[0], Y.shape[0] and n
    voc_size_per_lang_list = [[6], [10], [23, 30, 4], [40, 70, 50], [123, 20, 120]]  # controls Z.shape[1]
    num_concepts_list = [3, 4, 15, 100, 130]  # controls Y.shape[1]
    lambda_values = [1, 0.5, 0.33, 1.66, 2]
    seeds_z = [12345, 1111, 222222]  # generates different Z's
    seeds_y = [1, 123123123, 50000]  # generates different Y's
    seeds_v = [123, 1111, 22222, 333333]  # generates different v's
    for num_docs_per_lang, voc_size_per_lang, num_concepts in zip(
            num_documents_per_language_list, voc_size_per_lang_list, num_concepts_list):
        # num_concepts is also the dimension of vector v
        for seed_z in seeds_z:
            for seed_y in seeds_y:
                data_obj = self.generate_data_obj(num_concepts, num_docs_per_lang,
                                                  voc_size_per_lang,
                                                  seed_y=seed_y, seed_z=seed_z)
                for lambda_ in lambda_values:
                    operations_obj = Operations(data_obj, lambda_)
                    # M depends only on operations_obj; build it once per lambda.
                    dense_M = self.get_dense_M(operations_obj)
                    for seed_v in seeds_v:
                        v = self.generate_v(num_concepts, seed_v)
                        v = v.reshape(v.shape[0])  # flatten to 1-D
                        dense_multiply = dense_M @ v
                        custom_multiply = Operations.multiply_by_M_left(v, operations_obj)
                        self.assertTrue(np.allclose(custom_multiply, dense_multiply))
def tes_multiply_by_Z_T(self):
    """Tests the functionality of multiplying a vector with Z transpose from the left.

    Compares three implementations against each other: the dense product,
    the custom sparse routine, and the MKL-backed routine.
    """
    # Parallel lists: entry i of each list defines one test configuration.
    num_documents_per_language_list = [[13], [4], [12, 22, 10], [132, 123, 123], [123, 120, 130]]  # controls Z.shape[0]
    voc_size_per_lang_list = [[6], [10], [23, 30, 4], [40, 70, 50], [123, 20, 120]]  # controls Z.shape[1]
    num_concepts_list = np.ones_like(voc_size_per_lang_list)  # dummy used to create the data object
    seeds_z = [12345, 1111, 222222]  # generates different Z's
    seeds_v = [123, 1111, 22222, 333333]  # generates different v's
    for num_docs_per_lang, voc_size_per_lang, num_concepts in zip(
            num_documents_per_language_list, voc_size_per_lang_list, num_concepts_list):
        total_num_of_docs = np.sum(num_docs_per_lang)  # dimension of vector v
        for seed_z in seeds_z:
            data_obj = self.generate_data_obj(num_concepts, num_docs_per_lang,
                                              voc_size_per_lang, seed_z=seed_z)
            operations_obj = Operations(data_obj)
            Z_dense_T = data_obj.Z_T.todense()  # dense reference for Z^T
            for seed_v in seeds_v:
                v = self.generate_v(total_num_of_docs, seed_v)
                dense_multiply = Z_dense_T @ v
                custom_multiply = operations_obj.multiply_by_Z_T_left(v)
                mkl_multiply = operations_obj.multiply_by_Z_T_viaMKL(v)
                # All three implementations must agree.
                self.assertTrue(np.allclose(mkl_multiply, custom_multiply))
                self.assertTrue(np.allclose(custom_multiply, dense_multiply))
def tes_multiply_by_Y_T_left(self):
    """Tests the functionality of multiplying a vector with Y transpose from the left.

    Builds a dense reference Y^T per configuration and compares the custom
    routine's result with ``np.allclose``.
    """
    # Parallel lists: entry i of each list defines one test configuration.
    num_documents_per_language_list = [[8], [8], [100], [123]]  # controls Y.shape[0]
    num_concepts_list = [5, 8, 35, 62]  # controls Y.shape[1]
    seeds_y = [12345, 1111, 222222]  # generates different Y's
    seeds_v = [1233, 111111, 22, 22222]  # generates different v's
    for num_docs_per_lang, num_concepts in zip(num_documents_per_language_list,
                                               num_concepts_list):
        # The vocabulary size is irrelevant here; reuse the document counts.
        voc_size_per_lang = num_docs_per_lang
        total_num_of_docs = np.sum(num_docs_per_lang)  # dimension of vector v
        for seed_y in seeds_y:
            data_obj = self.generate_data_obj(num_concepts, num_docs_per_lang,
                                              voc_size_per_lang, seed_y=seed_y)
            operations_obj = Operations(data_obj)
            Y_dense = data_obj.Y.todense()
            Y_dense_T = Y_dense.T  # dense reference for Y^T
            for seed_v in seeds_v:
                v = self.generate_v(total_num_of_docs, seed_v)
                dense_multiply = Y_dense_T @ v
                custom_multiply = operations_obj.multiply_by_Y_T_left(v)
                self.assertTrue(np.allclose(custom_multiply, dense_multiply))
def test_multiply_by_inverse_cg(self):
    """Tests the functionality of multiplying a vector by A inverse from the left.

    The product is generated using the CG method and compared against the
    explicit dense inverse with ``np.allclose``.
    """
    # Parallel lists: entry i of each list defines one test configuration.
    num_documents_per_language_list = [[13], [4], [12, 22, 10], [132, 123, 123], [123, 120, 130]]  # controls Z.shape[0] and the value of n
    voc_size_per_lang_list = [[6], [10], [23, 30, 4], [40, 70, 50], [123, 20, 120]]  # controls Z.shape[1]
    num_concepts_list = np.ones_like(voc_size_per_lang_list)  # dummy used to create the data object
    lambda_values = [1, 0.5, 0.33, 1.66, 2]
    seeds_z = [12345, 1111, 222222]  # generates different Z's
    seeds_v = [123, 1111, 22222, 333333]  # generates different v's
    for num_docs_per_lang, voc_size_per_lang, num_concepts in zip(
            num_documents_per_language_list, voc_size_per_lang_list, num_concepts_list):
        total_vocabulary = np.sum(voc_size_per_lang)  # dimension of vector v
        for seed_z in seeds_z:
            data_obj = self.generate_data_obj(num_concepts, num_docs_per_lang,
                                              voc_size_per_lang, seed_z=seed_z)
            for lambda_ in lambda_values:
                operations_obj = Operations(data_obj, lambda_)
                # B (and its inverse) depend only on operations_obj,
                # so compute them once per lambda.
                dense_B = self.get_dense_B(operations_obj)
                dense_B_inv = np.linalg.inv(dense_B)
                for seed_v in seeds_v:
                    v = self.generate_v(total_vocabulary, seed_v)
                    dense_multiply = dense_B_inv @ v
                    return_obj = operations_obj.multiply_by_inverse_cg(v)
                    # return_obj[0] is the CG solution vector.
                    custom_multiply = return_obj[0].reshape(total_vocabulary, 1)
                    self.assertTrue(np.allclose(custom_multiply, dense_multiply))
def tes_multiply_by_const_left(self):
    """Tests the functionality of multiplying a vector with the constant factor from the left.

    The constant factor is the centering matrix (I - 1/n); both the custom
    and the Cython implementation are compared against the dense product.
    """
    num_documents_per_language_list = [[8], [8], [100], [123]]  # controls the value of n in the constant factor
    seeds_v = [123, 1111, 22222, 333333]  # generates different v's
    seed_y = 123
    for num_docs_per_lang in num_documents_per_language_list:
        # The vocabulary size is irrelevant here; reuse the document counts.
        voc_size_per_lang = num_docs_per_lang
        num_concepts = num_docs_per_lang[0]
        data_obj = self.generate_data_obj(num_concepts, num_docs_per_lang,
                                          voc_size_per_lang, seed_y=seed_y)
        operations_obj = Operations(data_obj)
        n = data_obj.Z.shape[0]
        # Dense reference: centering matrix I - (1/n) * ones.
        const_dense = np.eye(n) - 1 / n
        for seed_v in seeds_v:
            v = self.generate_v(num_concepts, seed_v)
            dense_multiply = const_dense @ v
            custom_multiply = operations_obj.multiply_by_const_left(v)
            cython_multiply = operations_obj.multiply_by_const_left_cython(v)
            self.assertTrue(np.allclose(custom_multiply, dense_multiply))
            self.assertTrue(np.allclose(dense_multiply,
                                        cython_multiply.reshape(v.shape[0], 1)))
def tes_decompose_M_eigsh(self):
    """Tests the whole framework and the resulting SVD/Eigendecomposition of M.

    For each configuration: decomposes M and M2 via eigsh, compares
    eigenvalues/eigenvectors against a dense SVD reference (eigenvectors are
    compared up to sign via ``np.abs``), and additionally exercises the
    multiply_by_V, multiply_by_V_T and multiply_by_M2 routines.
    """
    # Parallel lists: entry i of each list defines one test configuration.
    num_documents_per_language_list = [[13], [4], [12, 22, 10], [132, 123, 123], [123, 120, 130]]  # controls Z.shape[0], Y.shape[0] and n
    voc_size_per_lang_list = [[6], [10], [23, 30, 4], [40, 70, 50], [123, 20, 120]]  # controls Z.shape[1]
    num_concepts_list = [3, 4, 15, 100, 130]  # controls Y.shape[1]
    first_k_list = [1, 2, 6, 50, 30]  # number of leading components to compare
    lambda_values = [1, 0.33, 1.66]
    seeds_z = [12345, 1111, 222222]  # generates different Z's
    seeds_y = [1, 123123123, 50000]  # generates different Y's
    seeds_v = [123, 1111, 22222, 333333]  # generates different v's for the multiplication test
    for i, (num_docs_per_lang, voc_size_per_lang, num_concepts, k) in enumerate(
            zip(num_documents_per_language_list, voc_size_per_lang_list,
                num_concepts_list, first_k_list)):
        print("Starting %d" % i)
        # num_concepts is also the dimension of vector v in the V^T test.
        total_vocabulary = np.sum(voc_size_per_lang)
        for seed_z in seeds_z:
            for seed_y in seeds_y:
                data_obj = self.generate_data_obj(num_concepts, num_docs_per_lang,
                                                  voc_size_per_lang,
                                                  seed_y=seed_y, seed_z=seed_z)
                for lambda_ in lambda_values:
                    operations_obj = Operations(data_obj, lambda_)
                    dense_M = self.get_dense_M(operations_obj)
                    # Dense SVD reference; eigsh returns values in ascending
                    # order, hence the flips.
                    u, s, vh = np.linalg.svd(dense_M, full_matrices=False)
                    e_vals = np.flip(s[:k], axis=0)
                    e_vecs = np.flip(vh[:k].T, axis=1)
                    vals, vecs = operations_obj.decompose_M_eigsh(k)
                    self.assertTrue(np.allclose(vals, e_vals))
                    # Eigenvectors are defined only up to sign.
                    self.assertTrue(np.allclose(np.abs(e_vecs), np.abs(vecs)))
                    ## Test multiply_by_V
                    for seed_v in seeds_v:
                        v = self.generate_v(k, seed_v)
                        dense_multiply = vecs @ v
                        custom_multiply = operations_obj.multiply_by_V_left(v)
                        self.assertTrue(np.allclose(custom_multiply, dense_multiply))
                    ## Test multiply_by_V_T
                    for seed_v in seeds_v:
                        v = self.generate_v(num_concepts, seed_v)
                        dense_multiply = vecs.T @ v
                        custom_multiply = operations_obj.multiply_by_V_T_left(v)
                        self.assertTrue(np.allclose(custom_multiply, dense_multiply))
                    ## Test multiply_by_M2
                    dense_M2 = self.get_dense_M2(operations_obj)
                    for seed_v in seeds_v:
                        v = self.generate_v(total_vocabulary, seed_v)
                        dense_multiply = dense_M2 @ v
                        custom_multiply = Operations.multiply_by_M2_left(v, operations_obj)
                        self.assertTrue(np.allclose(custom_multiply, dense_multiply))
                    ## Test M2 eigendecomposition
                    u, s, vh = np.linalg.svd(dense_M2, full_matrices=False)
                    e_vals = np.flip(s[:k], axis=0)
                    e_vecs = np.flip(vh[:k].T, axis=1)
                    vals, vecs = operations_obj.decompose_M2_eigsh(k)
                    self.assertTrue(np.allclose(vals, e_vals))
                    self.assertTrue(np.allclose(np.abs(e_vecs), np.abs(vecs)))
def _decompose_with_retry(self, decompose_fn, dims, max_iter, tol):
    """Run one eigendecomposition, retrying once in fallback mode on ArpackError.

    :param decompose_fn: bound decomposition method, e.g.
        ``operations_obj.decompose_M_eigsh`` — called as
        ``decompose_fn(dims, max_iter, tol[, True])``.
    :returns: the ``(vals, vecs)`` tuple from ``decompose_fn``, or ``None``
        if both the first attempt and the fallback retry fail.
    """
    try:
        return decompose_fn(dims, max_iter, tol)
    except ArpackError as e:
        try:
            print("ERROR occured!")
            print(e)
            # Retry once with the fallback flag enabled.
            return decompose_fn(dims, max_iter, tol, True)
        except ArpackError:
            print("FAIL! Can't complete the decomposition!")
            return None

def run_experiment(self):
    """Completes an actual run of the full pipeline, with the parameters
    corresponding to the arguments passed in the constructor.

    Loads the training data, decomposes M and then M2, and pickles the
    accumulated results to ``self.results_dump_path`` after each stage so
    partial results survive a later failure. Returns early (with nothing
    further dumped) if a decomposition cannot be completed.
    """
    params = self.params
    print(params)
    cg_max_iter = 500
    eigs_max_iter = 250
    training_concepts_file_name = params['training_concepts_file_name']
    validation_set_file_name = params['validation_set_file_name']
    case_folding_flag = params['case_folding_flag']
    _lambda = params['lambda']
    # Tolerances are passed as negative powers of ten.
    cg_tol_1 = 10 ** (-1 * params['cg_tol_1'])
    eigs_tol_1 = 10 ** (-1 * params['eigs_tol_1'])
    # Same for now (secondary tolerances currently unused below).
    cg_tol_2 = 10 ** (-1 * params['cg_tol_2'])
    eigs_tol_2 = 10 ** (-1 * params['eigs_tol_2'])
    dims = params['dimensions']
    vocabulary_size = params['vocabulary_size']

    data_obj = Data()
    data_obj.load_training(training_concepts_file_name, validation_set_file_name,
                           case_folding_flag, vocabulary_size)
    operations_obj = Operations(data_obj, _lambda, cg_max_iter, cg_tol_1)

    # --- Decomposition of M ---
    start = default_timer()
    result = self._decompose_with_retry(operations_obj.decompose_M_eigsh,
                                        dims, eigs_max_iter, eigs_tol_1)
    if result is None:
        return
    vals, vecs = result
    time_elapsed = default_timer() - start
    print("Finished decomposition one: ", time_elapsed)

    training_outcome = {}
    training_outcome['e_vals'] = vals
    training_outcome['e_vecs'] = vecs
    results_obj = {}
    results_obj['training_outcome'] = training_outcome
    results_obj['parameters'] = params
    results_obj['data'] = data_obj.final_dataset_dump_name
    # Dump intermediate results so stage one survives a later failure.
    with open(self.results_dump_path, 'wb') as f:
        pickle.dump(results_obj, f, protocol=4)

    # --- Decomposition of M2 ---
    start = default_timer()
    result = self._decompose_with_retry(operations_obj.decompose_M2_eigsh,
                                        dims, eigs_max_iter, eigs_tol_1)
    if result is None:
        return
    vals_m2, vecs_m2 = result
    print(vals_m2)  # Visual sanity check
    time_elapsed = default_timer() - start
    print("Finished decomposition two: ", time_elapsed)

    training_outcome['M2_e_vals'] = vals_m2
    training_outcome['M2_e_vecs'] = vecs_m2
    training_outcome['num_iter'] = operations_obj.num_iter
    training_outcome['num_iter2'] = operations_obj.num_iter2
    training_outcome['time_consumed'] = operations_obj.time_consumed
    results_obj['training_outcome'] = training_outcome
    with open(self.results_dump_path, 'wb') as f:
        pickle.dump(results_obj, f, protocol=4)

    self.logger.revert_standard_output()
    self.logger.log_run()