def experiment_2b(W_init, H_init, results_folder, ranks, no_iterations):
    """Run divergence NMF once per rank from a shared, truncated initialisation.

    For each rank ``r`` the factorisation is seeded with the first ``r``
    columns of ``W_init`` and the first ``r`` rows of ``H_init``. The final
    factors and the divergence record are written with ``save_results``
    under the name ``'r=<r>_final'``.

    The data matrix and its metadata are not parameters: this function reads
    the module-level names ``data_matrix``, ``n``, ``m``, ``genes`` and
    ``patients``, which must be defined before it is called.

    Args:
        W_init: Initial left factor, indexed as ``W_init[:, :r]``.
        H_init: Initial right factor, indexed as ``H_init[:r, :]``.
        results_folder: Forwarded to ``NMF_divergence`` as
            ``save_progress_to`` and to ``save_results`` as the output
            location.
        ranks: Iterable of ranks to factorise at. A rank larger than the
            inner dimension of the supplied factors is silently clipped by
            the slice, so callers should keep ranks within that dimension.
        no_iterations: Iteration count forwarded to ``NMF_divergence``.

    Returns:
        None. Results are persisted through ``save_results``.
    """
    for r in ranks:
        # Basic slicing yields views onto W_init / H_init. Other call sites
        # in this file hand NMF_divergence explicit copies, so copy here as
        # well: if the solver updates its arguments in place, a view would
        # corrupt the shared initialisation for every later rank.
        W_init_r = W_init[:, :r].copy()
        H_init_r = H_init[:r, :].copy()

        # The positional ``1`` is presumably the divergence-calculation
        # frequency (same position as ``divergence_calc_frequency`` at other
        # call sites) -- TODO confirm against NMF_divergence.
        W, H, divergence_by_it = NMF_divergence(
            data_matrix, W_init_r, H_init_r, n, m, r, no_iterations, 1,
            report_progress=True,
            save_progress_to=results_folder,
        )

        save_results(
            results_folder, W, H,
            unique_name='r={}_final'.format(r),
            additional_data_name_to_array_dict={
                'divergence_record': divergence_by_it,
            },
            row_names_list=genes,
            column_names_list=patients,
        )
def experiment_2c(ranks, no_iterations, rpkm_matrix_path, ensg_to_go_path,
                  results_folder):
    """Run divergence NMF per rank on GO-annotated genes, NNDSVDar-initialised.

    The gene list is taken from the ``'ENSEMBL'`` column of the CSV at
    ``ensg_to_go_path`` and used to select rows of the RPKM matrix.
    ``NNDSVDar_initialization`` is called once for ``max(ranks)``; each rank
    ``r`` then starts from the leading ``r`` columns/rows of that result.
    Final factors and the divergence record are written with
    ``save_results`` under the name ``'r=<r>_final'``.

    Args:
        ranks: Non-empty collection of ranks (``max(ranks)`` is taken, and
            it is iterated afterwards, so it must be re-iterable).
        no_iterations: Iteration count forwarded to ``NMF_divergence``.
        rpkm_matrix_path: CSV path of the expression matrix; the first
            column is used as the row index.
        ensg_to_go_path: CSV path of a table with an ``'ENSEMBL'`` column.
        results_folder: Output location forwarded to ``save_results``.

    Returns:
        None. Results are persisted through ``save_results``.
    """
    go_table = pd.read_csv(ensg_to_go_path, index_col=0)
    # De-duplicate while keeping first-occurrence order. Going through
    # ``set`` gave an order that changes between interpreter runs (string
    # hashes are randomised), so the row order of the saved factors was not
    # reproducible.
    genes_with_GO_annotations = go_table['ENSEMBL'].drop_duplicates().tolist()

    rpkm_frame = pd.read_csv(rpkm_matrix_path, index_col=0)
    rpkm_frame = rpkm_frame.loc[genes_with_GO_annotations]
    patients = list(rpkm_frame.columns)
    n = len(genes_with_GO_annotations)
    m = len(patients)
    rpkm_matrix = np.array(rpkm_frame)

    # One initialisation at the largest rank, truncated for smaller ranks.
    W_init, H_init = NNDSVDar_initialization(rpkm_matrix, n, m, max(ranks))

    for r in ranks:
        # Basic slicing yields views onto W_init / H_init. Other call sites
        # in this file hand NMF_divergence explicit copies, so copy here as
        # well: if the solver updates its arguments in place, a view would
        # corrupt the shared initialisation for every later rank.
        W_init_r = W_init[:, :r].copy()
        H_init_r = H_init[:r, :].copy()

        # The positional ``100`` is presumably the divergence-calculation
        # frequency -- TODO confirm against NMF_divergence.
        W, H, divergence_by_it = NMF_divergence(
            rpkm_matrix, W_init_r, H_init_r, n, m, r, no_iterations, 100,
            report_progress=True,
        )

        save_results(
            results_folder, W, H,
            unique_name='r={}_final'.format(r),
            additional_data_name_to_array_dict={
                'divergence_record': divergence_by_it,
            },
            row_names_list=genes_with_GO_annotations,
            column_names_list=patients,
        )
def experiment_2d(ranks, no_iterations, rpkm_matrix_path,
                  genes_with_GO_annotations, results_folder, repeat_no):
    """Run one randomly initialised repeat of divergence NMF for every rank.

    Rows of the RPKM matrix are selected with ``genes_with_GO_annotations``.
    A single ``np.random.RandomState`` seeded with ``repeat_no`` supplies
    the uniform initial factors for all ranks in turn, so the draws for a
    given rank depend on the ranks processed before it. Each result is
    written with ``save_results`` under ``'r=<r>_repeat=<repeat_no>'``.

    Args:
        ranks: Iterable of ranks to factorise at.
        no_iterations: Iteration count forwarded to ``NMF_divergence``.
        rpkm_matrix_path: CSV path of the expression matrix; the first
            column is used as the row index.
        genes_with_GO_annotations: Row labels to keep; also used as the row
            names of the saved results.
        results_folder: Output location forwarded to ``save_results``.
        repeat_no: Seed of the random generator and label of this repeat.

    Returns:
        None. Results are persisted through ``save_results``.
    """
    expression = pd.read_csv(rpkm_matrix_path, index_col=0)
    expression = expression.loc[genes_with_GO_annotations]
    sample_ids = list(expression.columns)
    n_genes = len(genes_with_GO_annotations)
    n_samples = len(sample_ids)
    V = np.array(expression)

    rng = np.random.RandomState(repeat_no)
    banner = '\n\nRANK {}, REPEAT {}\n\n'

    for rank in ranks:
        print(banner.format(rank, repeat_no))

        # W is drawn before H; swapping the two would change both matrices.
        W_start = rng.uniform(size=(n_genes, rank))
        H_start = rng.uniform(size=(rank, n_samples))

        W, H, divergence_by_it = NMF_divergence(
            V, W_start, H_start, n_genes, n_samples, rank, no_iterations,
            100,
            report_progress=True,
        )

        run_label = 'r={}_repeat={}'.format(rank, repeat_no)
        save_results(
            results_folder, W, H,
            unique_name=run_label,
            additional_data_name_to_array_dict={
                'divergence_record': divergence_by_it,
            },
            row_names_list=genes_with_GO_annotations,
            column_names_list=sample_ids,
        )
np.save(initializations_path + 'W_init_' + code, W_init) np.save(initializations_path + 'H_init_' + code, H_init) # REAL NMF W, H, divergence_by_it = NMF_divergence(V_15000, W_init.copy(), H_init.copy(), n, m, r, iterations, divergence_calc_frequency, report_progress=True) save_results(W, H, {'divergence_record': divergence_by_it}, row_names_list=genes, column_names_list=patients, main_results_folder=results_path_main) # PERMUTATION NMF W, H, divergence_by_it = NMF_divergence(V_15000_columns_permuted, W_init, H_init, n, m, r, iterations, divergence_calc_frequency, report_progress=True) save_results(W, H, {'divergence_record': divergence_by_it},
# Experiment 2a: divergence NMF from a random uniform initialisation at a
# range of ranks, saving the final factors and divergence record per rank.
data_path = "C:/Users/hanne/Documents/PROJECT/Project Data/CM_experiment_2_data.csv"
main_results_folder = "C:/Users/hanne/Documents/PROJECT/Project Data/Experiment_2a_results/"
ranks = [2, 5, 10, 15, 20, 25, 30, 40, 50, 60, 70, 80, 100, 150]

# Load the labelled matrix, keep its labels, then drop down to a plain array.
data_matrix = pd.read_csv(data_path, index_col=0)
genes, patients = list(data_matrix.index), list(data_matrix.columns)
n, m = len(genes), len(patients)
data_matrix = np.array(data_matrix)

for r in ranks:
    # A fresh generator with the same seed is created for every rank, so a
    # rank's initialisation does not depend on which ranks ran before it.
    random = np.random.RandomState(42)
    W_init = random.uniform(low=0, high=1, size=(n, r))
    H_init = random.uniform(low=0, high=1, size=(r, m))

    W, H, divergence_by_it = NMF_divergence(
        data_matrix, W_init, H_init, n, m, r, 5000, 1,
        report_progress=True,
        save_progress_to=main_results_folder,
    )

    save_results(
        main_results_folder, W, H,
        unique_name='r={}_final'.format(r),
        additional_data_name_to_array_dict={
            'divergence_record': divergence_by_it,
        },
        row_names_list=genes,
        column_names_list=patients,
    )
# Long run for the rank at position ``idx``, seeded with the stored
# initialisation of the repeat that had the best divergence after 100
# iterations.
r = ranks[idx]
repeat = best_repeats[idx]

code = 'rep={}_rank={}'.format(repeat, r)
print(code)

# Initialisations are stored as <initializations_path>{W,H}_init_<code>.npy.
W_init = np.load(initializations_path + 'W_init_' + code + '.npy')
H_init = np.load(initializations_path + 'H_init_' + code + '.npy')

# REAL NMF: the solver receives copies, leaving the loaded arrays untouched.
W, H, divergence_by_it = NMF_divergence(
    V_15000, W_init.copy(), H_init.copy(), n, m, r,
    iterations, divergence_calc_frequency,
    report_progress=True,
)

save_results(
    results_path_main, W, H,
    unique_name='rank={}'.format(r),
    additional_data_name_to_array_dict={
        'divergence_record': divergence_by_it,
    },
    row_names_list=genes,
    column_names_list=patients,
)