def main():
    """Collect model ids from .tar checkpoints and draw the k-graph figure.

    NOTE(review): everything after exit() is unreachable dead code, kept
    verbatim — it appears to be an older per-k ROC-drawing path.
    """
    # Presumably needed so TensorFlow is available on the cluster — TODO confirm.
    os.system("module load tensorflow")
    project = test_CNN.get_project_and_check_arguments(sys.argv, 'display_graphs.py')
    model_ids = []
    checkpoints_folder = project.checkpoints_folder_tmp
    # Each checkpoint archive "<model_id>.tar" contributes one model id.
    for filename in os.listdir(checkpoints_folder):
        if filename.endswith(".tar"):
            model_ids.append(filename[:-len(".tar")])
    print("len model_ids: ", len(model_ids))
    draw_k_graph(project, model_ids)
    exit()
    ###############################################################################
    # Unreachable from here on (see docstring).
    colors = sns.color_palette("hls", project.MAXIMAL_K)
    sorted_ids = [None] * project.MAXIMAL_K
    for model_validation_id in model_ids:
        split_id = model_validation_id.split("_")
        if len(split_id) == 4:
            print("split_id: ", split_id)
            # Third underscore-separated field is taken as the k value.
            k = int(model_validation_id.split("_")[2])
        else:
            k = 4
        sorted_ids[k-1] = model_validation_id
    print("sorted_ids: ", sorted_ids)
    for k in range(1, project.MAXIMAL_K+1):
        model_validation_id = sorted_ids[k-1]
        draw_ROC_graphs(project, model_validation_id, k, colors[k-1])
    print("End!!!")
def main():
    """Dump first-layer CNN filters per model and run Homer compareMotifs on them."""
    project = test_CNN.get_project_and_check_arguments(
        sys.argv, 'read_filters_and_run_Homer_compare_motifs.py')
    sorted_models_list, map_model_ids = test_CNN.get_sorted_models_list(project)
    for best_model_validation_id in sorted_models_list:
        train_species = map_model_ids[best_model_validation_id]
        if trained_on_all_species_only:
            # Skip models not trained on the pooled all-species set.
            trained_species_index = len(project.species) - 2  # TODO update
            if train_species != project.species[trained_species_index]:
                continue
        print("train_name = ", train_species)
        model_dir = test_CNN.create_directories(project, best_model_validation_id)
        conv_results_dir = os.path.join(model_dir, 'convolution_results')
        filters_folder = os.path.join(conv_results_dir, "filters", "layer_1")
        # [1] picks the filter-count component of (shape, number) — TODO confirm.
        num_filters_in_first_layer = project.CNN_structure.get_kernels_shape_and_number(1)[1]
        create_filters_file_for_Homer_compareMotifs(
            filters_folder, num_filters_in_first_layer, train_species)
        # run Homer:
        # NOTE(review): shell string built by concatenation; paths are
        # project-internal here, but subprocess.run([...]) would be safer.
        script = "~tommy/Work/HOMER/bin/compareMotifs.pl"
        motifs_file = os.path.join(filters_folder, "filters_file.txt")
        output_directory = os.path.join(filters_folder, "Homer_compareMotifs/")
        known_motifs = "/cs/cbio/tommy/HOMER/data/knownTFs/vertebrates/all.motifs"
        homer_results_file = os.path.join(filters_folder, "homer_results.txt")
        os.system(script + " " + motifs_file + " " + output_directory +
                  " -known " + known_motifs + " -cpu 2 2>&1 | tee " +
                  homer_results_file)
    print("End!")
def main():
    """Run Homer findMotifs.pl (de novo motif search) for every project species."""
    project = test_CNN.get_project_and_check_arguments(
        sys.argv, 'run_Homer_find_denovo_motifs.py')
    for species_name in project.species:
        print("species_name = ", species_name, " : ")
        if species_name != "simulated":
            # species_dir = os.path.join(project.text_samples_base_dir, species_name)
            Homer_dir = create_data_for_Homer.create_Homer_directory(project, species_name)
        else:
            # species_dir = project.text_samples_base_dir
            Homer_dir = create_data_for_Homer.create_Homer_directory(project)
        # run Homer findMotifs:
        # NOTE(review): shell string via concatenation; paths are internal.
        script = "~tommy/Work/HOMER/bin/findMotifs.pl"
        positive_samples_path = os.path.join(Homer_dir, species_name + "_positive.fa")
        negative_samples_path = os.path.join(Homer_dir, species_name + "_negative.fa")
        output_directory = os.path.join(Homer_dir, "motifResults/")
        homer_results_file = os.path.join(Homer_dir, "homer_results.txt")
        os.system(script + " " + positive_samples_path + " fasta " +
                  output_directory + " -fasta " + negative_samples_path +
                  " -S 5 -p 16 -bits -mset vertebrates 2>&1 | tee " +
                  homer_results_file)
    print("End!")
def main():
    """Create per-species sample directories and run the H3K27ac-vs-expanded-negatives loader.

    Fix: the original used Python 2 ``print`` statements (a SyntaxError under
    Python 3) while sibling scripts in this file use ``print()`` calls; output
    now consistently uses the print function (same text either way).
    """
    num_times_negative_data_is_taken = 0
    project = test_CNN.get_project_and_check_arguments(
        sys.argv, "run_data_loader.py",
        num_times_negative_data_is_taken=num_times_negative_data_is_taken)
    print("start creating data for project: ", project.project_name)
    for species in project.species:
        species_text_samples_dir = os.path.join(project.text_samples_base_dir, species)
        species_npy_samples_dir = os.path.join(project.samples_base_dir, species)
        # Create each missing sample directory (text and npy).
        for directory in (species_text_samples_dir, species_npy_samples_dir):
            if not os.path.exists(directory) and not os.path.isdir(directory):
                print("make directory: ", directory)
                os.makedirs(directory)
    data_loader = DataLoaderH3K27acvsExpandedNeg(project)
    data_loader.get_all_positive_and_negative_samples(num_times_negative_data_is_taken)
    data_loader.create_data_for_each_species()
    data_loader.create_data_from_all_species_together()
    print("End!")
def main():
    """Write Homer-format positive/negative FASTA files for every species."""
    project = test_CNN.get_project_and_check_arguments(
        sys.argv, 'create_data_for_Homer.py')
    for species_name in project.species:
        print("species_name = ", species_name, " : ")
        if species_name != "simulated":
            species_dir = os.path.join(project.text_samples_base_dir, species_name)
            Homer_dir = create_Homer_directory(project, species_name)
        else:
            # Simulated data lives directly under the base dir.
            species_dir = project.text_samples_base_dir
            Homer_dir = create_Homer_directory(project)
        out_positive_samples_path = os.path.join(Homer_dir, species_name + "_positive.fa")
        out_negative_samples_path = os.path.join(Homer_dir, species_name + "_negative.fa")
        input_positive_samples_path = os.path.join(species_dir, "positive_samples")
        if project.k:
            # k-shuffle negatives are kept in a per-k subdirectory.
            input_negative_samples_path = os.path.join(
                species_dir, project.k_let_dirs[project.k - 1], "negative_samples")
        else:
            input_negative_samples_path = os.path.join(species_dir, "negative_samples")
        # First argument flags whether the samples are positives — TODO confirm.
        write_data_file_for_Homer(True, input_positive_samples_path,
                                  out_positive_samples_path)
        write_data_file_for_Homer(False, input_negative_samples_path,
                                  out_negative_samples_path)
    print("End!")
def main():
    """Run the basic DataLoader to produce .npy sample files for the project.

    Fix: Python 2 ``print`` statements replaced by ``print()`` calls for
    consistency with the rest of these scripts (SyntaxError under Python 3).
    """
    project = test_CNN.get_project_and_check_arguments(sys.argv, "run_data_loader.py")
    print("start creating data for project: ", project.project_name)
    data_loader = DataLoader(project, motifs_base_path)
    data_loader.create_npy_files()
    print("End!")
def main():
    """Run the negative-data-vs-shuffle loader over all species together.

    Fix: Python 2 ``print`` statements replaced by ``print()`` calls for
    consistency with the rest of these scripts (SyntaxError under Python 3).
    """
    project = test_CNN.get_project_and_check_arguments(sys.argv, "run_data_loader.py")
    print("start creating data for project: ", project.project_name)
    data_loader = DataLoaderNegDatavsShuffle(project)
    data_loader.get_all_positive_and_negative_samples()
    data_loader.create_data_from_all_species_together()
    print("End!")
def main():
    """Run the H3K27ac-vs-negatives loader per species and combined.

    Fix: Python 2 ``print`` statements replaced by ``print()`` calls for
    consistency with the rest of these scripts (SyntaxError under Python 3).
    """
    project = test_CNN.get_project_and_check_arguments(sys.argv, "run_data_loader.py")
    # data_handle.remove_files(project.text_samples_base_dir, ".txt", project.species)
    print("start creating data for project: ", project.project_name)
    data_loader = DataLoaderH3K27acvsNeg(project)
    data_loader.get_all_positive_and_negative_samples()
    data_loader.create_data_for_each_species()
    data_loader.create_data_from_all_species_together()
    print("End!")
def main():
    """Create the npy_files/ and samples/ per-species directory tree for the project."""
    project = test_CNN.get_project_and_check_arguments(sys.argv, 'create_species_dirs.py')
    project_data_dir = os.path.join(base_path, project.project_name, 'data')
    for dir_name in ['npy_files', 'samples']:
        dir_path = os.path.join(project_data_dir, dir_name)
        for species in project.species:
            species_dir = os.path.join(dir_path, species)
            # Skip anything that already exists in any form.
            if os.path.exists(species_dir) or os.path.isdir(species_dir):
                continue
            print("make directory: ", species_dir)
            os.makedirs(species_dir)
def main():
    """Create k-let sample directories and run the TF-vs-shuffle data loader.

    Fix: Python 2 ``print`` statements replaced by ``print()`` calls for
    consistency with the rest of these scripts (SyntaxError under Python 3).
    """
    project = test_CNN.get_project_and_check_arguments(sys.argv, "run_data_loader.py")
    print("start creating data for project: ", project.project_name)
    for species in project.species:
        # Text samples live under a per-k subdirectory for this loader.
        species_text_samples_dir = os.path.join(
            project.text_samples_base_dir, species, project.k_let_dirs[project.k - 1])
        species_npy_samples_dir = os.path.join(project.samples_base_dir, species)
        for directory in (species_text_samples_dir, species_npy_samples_dir):
            if not os.path.exists(directory) and not os.path.isdir(directory):
                print("make directory: ", directory)
                os.makedirs(directory)
    data_loader = DataLoaderTFvsShuffle(project)
    data_loader.get_all_positive_and_negative_samples()
    data_loader.create_data_from_all_species_together()
    print("End!")
def main():
    """Extract and visualize first-layer convolution filters for each trained model."""
    project = test_CNN.get_project_and_check_arguments(
        sys.argv, 'tensor_visualization.py')
    sorted_models_list, map_model_ids = test_CNN.get_sorted_models_list(project)
    number_of_species = len(project.species)
    # with open(conv1_results_output_file, 'w') as conv_file:
    for index_train_species in range(len(project.species)):
        if trained_on_all_species_only:
            # Pooled-species model: handle only the first iteration, then stop.
            trained_species_index = number_of_species - 2  # all species 238000
            # change if needed
            train_species = project.species[trained_species_index]
            best_model_validation_id = (list(map_model_ids.keys()))[0]
            if index_train_species != 0:
                break
        else:
            best_model_validation_id = sorted_models_list[index_train_species]
            train_species = map_model_ids[best_model_validation_id]
        print("start show convolution on species: ", train_species)
        model_dir = test_CNN.create_directories(project, best_model_validation_id)
        conv_results_dir = os.path.join(model_dir, 'convolution_results')
        get_filters(project, train_species, conv_results_dir, best_model_validation_id)
def main():
    """Parse test_results.txt into a train-by-test AUC matrix and draw a heatmap.

    NOTE(review): assumes the file alternates species-name lines with rows of
    numeric values ordered by old_species_name_order — verify against the
    writer of test_results.txt.
    """
    project = test_CNN.get_project_and_check_arguments(
        sys.argv, 'display_heatmap_enhancers.py')
    test_results_file_path = os.path.join(project.CNN_output_dir, "test_results.txt")
    # Last two entries of project.species are apparently non-test (pooled) sets.
    number_of_test_species = len(project.species) - 2
    tested_on_all_species = []
    for i in range(number_of_test_species):
        tested_on_one_species = [None] * len(project.species)
        tested_on_all_species.append(tested_on_one_species)
    test_counter = 0
    # number_of_values / sum_v are accumulated but never used afterwards.
    number_of_values = 0
    sum_v = 0
    with open(test_results_file_path) as results_file:
        for line in results_file:
            if "\n" in line:
                new_line = line[:-1]
            if new_line in project.species:
                # A bare species name marks which model (train species) follows.
                train_index = project.species.index(new_line)
                continue
            split_line = new_line.split()
            for value in split_line:
                number_of_values += 1
                sum_v += float(value)
                # Map the legacy column order onto the current species order.
                old_test_species = old_species_name_order[test_counter]
                new_test_index = project.species.index(old_test_species)
                tested_on_all_species[new_test_index][train_index] = float(value)
                test_counter += 1
                if test_counter == number_of_test_species:
                    test_counter = 0
    figure_path = os.path.join(project.CNN_output_dir,
                               "heatmap_enhancers_vs_negative_data_cbar.pdf")
    create_heatmap(project, tested_on_all_species, figure_path)
def main():
    """Show convolution activations for the best- and worst-scoring test samples.

    Two near-identical branches: one for the single pooled-species model,
    one looping over every per-species model.
    """
    project = test_CNN.get_project_and_check_arguments(sys.argv, 'show_convolution.py')
    sorted_models_list, map_model_ids = test_CNN.get_sorted_models_list(project)
    number_of_species = len(project.species)
    if trained_on_all_species_only:
        trained_species_index = number_of_species - 2  # all species 238000
        train_species = project.species[trained_species_index]
        best_model_validation_id = (list(map_model_ids.keys()))[0]
        test_species = train_species
        array_true_labels, array_prediction_scores = test_CNN.import_model_and_test(
            project, best_model_validation_id, test_species, train_species)
        # Locate the highest- and lowest-scoring samples and their labels.
        max_score = max(array_prediction_scores)
        print("max_score = ", max_score)
        argmax = np.argmax(array_prediction_scores)
        print("argmax = ", argmax)
        label_max = array_true_labels[argmax]
        print("label_max = ", label_max)
        min_score = min(array_prediction_scores)
        print("min_score = ", min_score)
        argmin = np.argmin(array_prediction_scores)
        print("argmin = ", argmin)
        label_min = array_true_labels[argmin]
        print("label_min = ", label_min)
        conv_results_all_layers_max, sample_max_score, conv_results_all_layers_min, sample_min_score = \
            restore_model_and_get_conv_results(project, train_species,
                                               best_model_validation_id,
                                               argmin, argmax)
        # True/False flag distinguishes the max-score vs min-score figure set.
        create_convolution_figures_and_text_files(project, best_model_validation_id,
                                                  train_species,
                                                  conv_results_all_layers_max,
                                                  sample_max_score, True)
        create_convolution_figures_and_text_files(project, best_model_validation_id,
                                                  train_species,
                                                  conv_results_all_layers_min,
                                                  sample_min_score, False)
    else:
        for best_model_validation_id in sorted_models_list:
            train_species = map_model_ids[best_model_validation_id]
            test_species = train_species
            array_true_labels, array_prediction_scores = test_CNN.import_model_and_test(
                project, best_model_validation_id, test_species, train_species)
            max_score = max(array_prediction_scores)
            print("max_score = ", max_score)
            argmax = np.argmax(array_prediction_scores)
            print("argmax = ", argmax)
            label_max = array_true_labels[argmax]
            print("label_max = ", label_max)
            min_score = min(array_prediction_scores)
            print("min_score = ", min_score)
            argmin = np.argmin(array_prediction_scores)
            print("argmin = ", argmin)
            label_min = array_true_labels[argmin]
            print("label_min = ", label_min)
            conv_results_all_layers_max, sample_max_score, conv_results_all_layers_min, sample_min_score =\
                restore_model_and_get_conv_results(project, train_species,
                                                   best_model_validation_id,
                                                   argmin, argmax)
            create_convolution_figures_and_text_files(
                project, best_model_validation_id, train_species,
                conv_results_all_layers_max, sample_max_score, True)
            create_convolution_figures_and_text_files(
                project, best_model_validation_id, train_species,
                conv_results_all_layers_min, sample_min_score, False)
    print("End!!!")
def main():
    """Plot one ROC figure comparing the CNN against PSSM models (with/without prior).

    NOTE(review): the grouping of the title lines under ``if titles:`` is
    inferred from the flattened source — confirm against the original layout.
    """
    fig = plt.figure(1)
    # Diagonal chance line.
    x = [0, 1]
    plt.plot(x, x, 'k--')
    project = test_CNN.get_project_and_check_arguments(sys.argv, "roc_comparison.py")
    TF_name = project.PWM.split("_")[0]  # CEBPA for example
    figure_roc_path = os.path.join(
        project.basic_output_dir,
        "ROC_comparison_between_models_" + project.distribution_samples_center_dir +
        "_sigma_" + str(project.sigma) + ".pdf")
    # CNN:
    sorted_models_list, map_model_ids = test_CNN.get_sorted_models_list(project)
    index_train_species = 0
    best_model_validation_id = sorted_models_list[index_train_species]
    train_species = map_model_ids[best_model_validation_id]
    print("train_species: ", train_species)
    model_dir = test_CNN.create_directories(project, best_model_validation_id)
    CNN_scores, CNN_labels = read_scores_and_labels_files(
        model_dir, project, best_model_validation_id=best_model_validation_id)
    model_label = "CNN: , AUC: "
    add_roc_curve(CNN_labels, CNN_scores, fig, model_label)
    # PSSM models:
    for pssm_model in PSSM_models:  # ["denovo", "JASPAR"]
        for pr in ["with_prior", "without_prior"]:
            dir_name = "CEBPA_" + pssm_model
            PSSM_output_dir = os.path.join(
                project.PSSM_output_dir,
                project.distribution_samples_center_dir, dir_name)
            PSSM_scores, PSSM_labels = read_scores_and_labels_files(
                PSSM_output_dir, project, pssm_model=pssm_model, pr=pr)
            if pr == "with_prior":
                # model_label = "PSSM, " + pssm_model + ", with location prior, AUC: "
                if pssm_model == "denovo":
                    model_label = "denovo motif w/ location prior, AUC: "
                elif pssm_model == "JASPAR":
                    model_label = "Gold standard model - True motif w/ location prior, AUC: "
            else:
                # model_label = "PSSM, " + pssm_model + ", AUC: "
                if pssm_model == "denovo":
                    model_label = "denovo motif w/o location prior, AUC: "
                elif pssm_model == "JASPAR":
                    model_label = "True motif w/o location prior, AUC: "
            add_roc_curve(PSSM_labels, PSSM_scores, fig, model_label)
    plt.xlabel('False positive rate', fontsize=20)
    plt.ylabel('True positive rate', fontsize=20)
    if titles:
        plt.suptitle('ROC curve')
        data_name = " ".join(project.project_name.split("_"))
        plt.title('Comparison between models, ' + data_name +
                  ' of single TF: ' + TF_name)
    if legend:
        plt.legend(loc='best')
        new_figure_path = figure_roc_path[:-len(".pdf")] + "_with_legend.pdf"
    else:
        new_figure_path = figure_roc_path
    plt.savefig(new_figure_path, format='pdf')
    print("saving figure: ", new_figure_path, "\n\n")
def main():
    """Parse per-model test AUCs from the test file and draw the TF heatmap.

    NOTE(review): only the Chicken row is emitted conditionally on
    "Gallus_gallus" being present; the remaining rows are written
    unconditionally — inferred from the flattened source, confirm.
    """
    project = test_CNN.get_project_and_check_arguments(
        sys.argv, 'display_heatmap_TF.py')
    sorted_models_list, map_model_ids = test_CNN.get_sorted_models_list(project)
    # Column index per training species; shifts when chicken is included.
    if "Gallus_gallus" in project.species:
        train_indices_map = {
            "Canis_familiaris": 2, "Gallus_gallus": 4, "Homo_sapiens": 0,
            "Monodelphis_domestica": 3, "Mus_musculus": 1,
            "All_species_12000": 5, "All_species_60000": 6
        }
    else:
        train_indices_map = {
            "Canis_familiaris": 2, "Homo_sapiens": 0,
            "Monodelphis_domestica": 3, "Mus_musculus": 1,
            "All_species_12000": 4, "All_species_60000": 5
        }
    test_results_file_1 = project.test_file
    tested_on_Cfam_results = [None] * len(project.species)
    tested_on_Mmus_results = [None] * len(project.species)
    if "Gallus_gallus" in project.species:
        tested_on_Ggal_results = [None] * len(project.species)
    tested_on_Mdom_results = [None] * len(project.species)
    tested_on_Hsap_results = [None] * len(project.species)
    matrix_results_CEBPA_k_4 = []
    auc = None
    for test_results_file in [test_results_file_1]:
        with open(test_results_file) as results_file:
            # Stateful parse: remembers the latest train model and test species,
            # stores the AUC once one has been seen.
            for line in results_file:
                if re.match("^\s$", line):
                    continue
                elif line.startswith("train:") or \
                        line.startswith("finish test"):
                    continue
                split_line = line.split()
                if line.startswith("best_model_validation_id"):
                    model_id = split_line[2]
                    train = map_model_ids[model_id]
                elif line.startswith("test:"):
                    test = split_line[1]
                elif line.startswith("auc:"):
                    auc = float(split_line[1])
                # NOTE(review): auc is never reset to None, so after the first
                # auc line this block runs for every subsequent line — confirm
                # this re-assignment is harmless for the expected file format.
                if auc:
                    train_index = train_indices_map[train]
                    if test == "Canis_familiaris":
                        tested_on_Cfam_results[train_index] = auc
                    elif test == "Mus_musculus":
                        tested_on_Mmus_results[train_index] = auc
                    elif test == "Gallus_gallus":
                        tested_on_Ggal_results[train_index] = auc
                    elif test == "Monodelphis_domestica":
                        tested_on_Mdom_results[train_index] = auc
                    elif test == "Homo_sapiens":
                        tested_on_Hsap_results[train_index] = auc
    if project.k:
        figure_path = os.path.join(
            project.CNN_output_dir,
            "heatmap_TF_vs_k_shuffle_k_" + str(project.k) + ".pdf")
    else:
        figure_path = os.path.join(project.CNN_output_dir,
                                   "heatmap_TF_vs_negative_data.pdf")
    results_path = os.path.join(project.CNN_output_dir,
                                "heatmap_results_TF_vs_negative_data.txt")
    with open(results_path, "w") as out_results:
        if "Gallus_gallus" in project.species:
            matrix_results_CEBPA_k_4.append(tested_on_Ggal_results)
            string_list = "Chicken: "
            for i in tested_on_Ggal_results:
                string_list += str(i) + ","
            out_results.write(string_list + "\n")
        matrix_results_CEBPA_k_4.append(tested_on_Mdom_results)
        string_list = "Opossum: "
        for i in tested_on_Mdom_results:
            string_list += str(i) + ","
        out_results.write(string_list + "\n")
        matrix_results_CEBPA_k_4.append(tested_on_Cfam_results)
        string_list = "Dog: "
        for i in tested_on_Cfam_results:
            string_list += str(i) + ","
        out_results.write(string_list + "\n")
        matrix_results_CEBPA_k_4.append(tested_on_Mmus_results)
        string_list = "Mouse: "
        for i in tested_on_Mmus_results:
            string_list += str(i) + ","
        out_results.write(string_list + "\n")
        matrix_results_CEBPA_k_4.append(tested_on_Hsap_results)
        string_list = "Human: "
        for i in tested_on_Hsap_results:
            string_list += str(i) + ","
        out_results.write(string_list + "\n")
    create_heatmap(project, matrix_results_CEBPA_k_4, figure_path)
def main():
    """Test every trained model on every test species and record AUCs.

    The outer loop runs one extra iteration (index == number_of_species) used
    only to plot the average AUC, reusing variables left over from the last
    real iteration — order-sensitive, so the logic is kept verbatim.
    """
    # os.system("module load tensorflow;")
    project = test_CNN.get_project_and_check_arguments(sys.argv, 'run_test_CNN.py')
    # `n` is a module-level setting — TODO confirm where it is defined.
    project.num_times_negative_data_is_taken = n
    sorted_models_list, map_model_ids = test_CNN.get_sorted_models_list(project)
    sum_auc = 0
    with open(project.test_file, 'a+') as out_file:
        number_of_species = len(project.species)
        number_species_tested_on = 0
        for index_to_test_on in range(number_of_species + 1):
            if index_to_test_on != number_of_species:
                test_species = project.species[index_to_test_on]
                # Pooled "All_species" sets are never used as test species.
                if "All_species" in test_species:
                    continue
                number_species_tested_on += 1
                print("test on species: ", test_species)
            for best_model_validation_id in sorted_models_list:
                # last_iteration - just plot the average auc of all tested species on the ROC figure
                if index_to_test_on == number_of_species:
                    # Relies on array_true_labels / model_dir / train_species
                    # etc. surviving from the previous iteration.
                    average_auc_returned = test_CNN.draw_roc_curve(
                        array_true_labels, array_prediction_scores, project,
                        model_dir, best_model_validation_id, out_file,
                        train_species, test_species, plot_ROC=plot_ROC,
                        average_auc=average_auc)
                    print("average_auc_returned : ", average_auc_returned)
                    out_file.write("average_auc_returned: {0:.3f}".format(
                        average_auc_returned) + "\n")
                    break
                train_species = map_model_ids[best_model_validation_id]
                if trained_on_one_species_only:
                    # Override which single training species is evaluated,
                    # based on module-level flags.
                    if train_on_human:
                        if "Homo_sapiens" in project.species:
                            trained_species_index = project.species.index(
                                "Homo_sapiens")
                        elif "Human" in project.species:
                            trained_species_index = project.species.index(
                                "Human")
                    elif train_on_all_samples:
                        trained_species_index = number_of_species - 2  # all species 238000
                    elif train_on_dog:
                        if "Canis_familiaris" in project.species:
                            trained_species_index = project.species.index(
                                "Canis_familiaris")
                        elif "Dog" in project.species:
                            trained_species_index = project.species.index(
                                "Dog")
                    train_species = project.species[trained_species_index]
                    print("train_species: ", train_species)
                # if project.project_name == "TF_vs_k_shuffle":
                #     model_dir = os.path.join(project.checkpoints_folder, best_model_validation_id)
                # else:
                model_dir = os.path.join(project.checkpoints_folder_tmp,
                                         best_model_validation_id)
                # Unpack the checkpoint archive on first use.
                if not os.path.exists(model_dir) and not os.path.isdir(model_dir):
                    tar = tarfile.open(model_dir + ".tar")
                    tar.extractall(path=model_dir)
                    tar.close()
                print("train on species: ", train_species)
                model_dir = test_CNN.create_directories(
                    project, best_model_validation_id)
                array_true_labels, array_prediction_scores = \
                    test_CNN.import_model_and_test(project,
                                                   best_model_validation_id,
                                                   test_species, train_species,
                                                   out_file)
                test_CNN.write_labels_and_scores(model_dir,
                                                 best_model_validation_id,
                                                 array_prediction_scores,
                                                 array_true_labels,
                                                 test_species)
                auc = test_CNN.draw_roc_curve(array_true_labels,
                                              array_prediction_scores, project,
                                              model_dir,
                                              best_model_validation_id,
                                              out_file, train_species,
                                              test_species, plot_ROC=plot_ROC)
                print("auc: ", auc)
                # print("sum_auc: ", sum_auc)
                sum_auc += auc
                # Running average over species tested so far.
                average_auc = (sum_auc / number_species_tested_on)
            if index_to_test_on != number_of_species:
                out_file.write('finish test on species: ' + test_species + '\n\n\n')
                out_file.flush()
            print()
        # print("average_auc : ", average_auc)
        # out_file.write("average AUC: {0:.3f}".format(average_auc) + "\n")
    print("end")