Пример #1
0
def main():
    """Collect model ids from checkpoint tarballs and draw the k-graph.

    Fix: uses ``sys.exit()`` instead of the interactive-only ``exit()``
    builtin (provided by the ``site`` module, not guaranteed in scripts).
    NOTE(review): everything after the exit below is unreachable dead
    code, preserved as-is — delete it or re-enable it deliberately.
    """
    # Make TensorFlow available on the cluster before any model is loaded.
    os.system("module load tensorflow")
    project = test_CNN.get_project_and_check_arguments(sys.argv, 'display_graphs.py')
    model_ids = []
    checkpoints_folder = project.checkpoints_folder_tmp
    # Every "<id>.tar" archive in the checkpoints folder is one trained model.
    for filename in os.listdir(checkpoints_folder):
        if filename.endswith(".tar"):
            model_ids.append(filename[:-len(".tar")])
    print("len model_ids: ", len(model_ids))
    draw_k_graph(project, model_ids)
    sys.exit()

    ###############################################################################
    # --- unreachable from here on (early exit above) ---
    colors = sns.color_palette("hls", project.MAXIMAL_K)
    sorted_ids = [None] * project.MAXIMAL_K
    for model_validation_id in model_ids:
        split_id = model_validation_id.split("_")
        if len(split_id) == 4:
            print("split_id: ", split_id)
            # ids shaped "<a>_<b>_<k>_<c>" carry their k in the third field.
            k = int(model_validation_id.split("_")[2])
        else:
            k = 4
        sorted_ids[k-1] = model_validation_id
    print("sorted_ids: ", sorted_ids)

    for k in range(1, project.MAXIMAL_K+1):
        model_validation_id = sorted_ids[k-1]
        draw_ROC_graphs(project, model_validation_id, k, colors[k-1])

    print("End!!!")
Пример #2
0
def main():
    """Run Homer's compareMotifs.pl on the first-layer filters of each model."""
    project = test_CNN.get_project_and_check_arguments(
        sys.argv, 'read_filters_and_run_Homer_compare_motifs.py')
    sorted_models_list, map_model_ids = test_CNN.get_sorted_models_list(project)

    for best_model_validation_id in sorted_models_list:
        train_species = map_model_ids[best_model_validation_id]
        if trained_on_all_species_only:
            # Keep only the model trained on the all-species set.
            trained_species_index = len(project.species) - 2  # TODO update
            if train_species != project.species[trained_species_index]:
                continue
        print("train_name = ", train_species)

        model_dir = test_CNN.create_directories(project, best_model_validation_id)
        conv_results_dir = os.path.join(model_dir, 'convolution_results')
        filters_folder = os.path.join(conv_results_dir, "filters", "layer_1")
        num_filters_in_first_layer = \
            project.CNN_structure.get_kernels_shape_and_number(1)[1]
        create_filters_file_for_Homer_compareMotifs(
            filters_folder, num_filters_in_first_layer, train_species)

        # Invoke Homer, teeing its combined stdout+stderr into a results file.
        script = "~tommy/Work/HOMER/bin/compareMotifs.pl"
        motifs_file = os.path.join(filters_folder, "filters_file.txt")
        output_directory = os.path.join(filters_folder, "Homer_compareMotifs/")
        known_motifs = "/cs/cbio/tommy/HOMER/data/knownTFs/vertebrates/all.motifs"
        homer_results_file = os.path.join(filters_folder, "homer_results.txt")
        command = (script + " " + motifs_file + " " + output_directory +
                   " -known " + known_motifs + " -cpu 2 2>&1 | tee " +
                   homer_results_file)
        os.system(command)

    print("End!")
Пример #3
0
def main():
    """Run Homer's findMotifs.pl de-novo motif search for every species."""
    project = test_CNN.get_project_and_check_arguments(
        sys.argv, 'run_Homer_find_denovo_motifs.py')

    for species_name in project.species:
        print("species_name = ", species_name, " : ")
        if species_name == "simulated":
            # species_dir = project.text_samples_base_dir
            Homer_dir = create_data_for_Homer.create_Homer_directory(project)
        else:
            # species_dir = os.path.join(project.text_samples_base_dir, species_name)
            Homer_dir = create_data_for_Homer.create_Homer_directory(
                project, species_name)

        # Run Homer findMotifs, teeing its output into a results file.
        script = "~tommy/Work/HOMER/bin/findMotifs.pl"
        positive_samples_path = os.path.join(Homer_dir,
                                             species_name + "_positive.fa")
        negative_samples_path = os.path.join(Homer_dir,
                                             species_name + "_negative.fa")
        output_directory = os.path.join(Homer_dir, "motifResults/")
        homer_results_file = os.path.join(Homer_dir, "homer_results.txt")
        command = (f"{script} {positive_samples_path} fasta {output_directory}"
                   f" -fasta {negative_samples_path}"
                   f" -S 5 -p 16 -bits -mset vertebrates 2>&1 | tee "
                   f"{homer_results_file}")
        os.system(command)
    print("End!")
Пример #4
0
def main():
    """Create per-species sample directories and build all npy data.

    Fix: the original used Python 2 ``print`` statements while the rest
    of the file uses Python 3 ``print()`` calls (a SyntaxError under
    Python 3); converted for consistency.
    """
    num_times_negative_data_is_taken = 0
    project = test_CNN.get_project_and_check_arguments(
        sys.argv,
        "run_data_loader.py",
        num_times_negative_data_is_taken=num_times_negative_data_is_taken)
    print("start creating data for project: ", project.project_name)
    for species in project.species:
        species_text_samples_dir = os.path.join(project.text_samples_base_dir,
                                                species)
        species_npy_samples_dir = os.path.join(project.samples_base_dir,
                                               species)
        # Create the per-species directories on first run.
        if not os.path.exists(species_text_samples_dir) and not os.path.isdir(
                species_text_samples_dir):
            print("make directory: ", species_text_samples_dir)
            os.makedirs(species_text_samples_dir)
        if not os.path.exists(species_npy_samples_dir) and not os.path.isdir(
                species_npy_samples_dir):
            print("make directory: ", species_npy_samples_dir)
            os.makedirs(species_npy_samples_dir)
    data_loader = DataLoaderH3K27acvsExpandedNeg(project)
    data_loader.get_all_positive_and_negative_samples(
        num_times_negative_data_is_taken)

    data_loader.create_data_for_each_species()
    data_loader.create_data_from_all_species_together()
    print("End!")
Пример #5
0
def main():
    """Write Homer-formatted positive/negative FASTA files per species."""
    project = test_CNN.get_project_and_check_arguments(
        sys.argv, 'create_data_for_Homer.py')
    for species_name in project.species:
        print("species_name = ", species_name, " : ")
        if species_name == "simulated":
            species_dir = project.text_samples_base_dir
            Homer_dir = create_Homer_directory(project)
        else:
            species_dir = os.path.join(project.text_samples_base_dir,
                                       species_name)
            Homer_dir = create_Homer_directory(project, species_name)

        out_positive_samples_path = os.path.join(Homer_dir,
                                                 species_name + "_positive.fa")
        out_negative_samples_path = os.path.join(Homer_dir,
                                                 species_name + "_negative.fa")
        input_positive_samples_path = os.path.join(species_dir,
                                                   "positive_samples")
        # Negative samples live under an extra k-let subdirectory when
        # the project uses k-shuffled data.
        negative_parts = [species_dir]
        if project.k:
            negative_parts.append(project.k_let_dirs[project.k - 1])
        negative_parts.append("negative_samples")
        input_negative_samples_path = os.path.join(*negative_parts)

        write_data_file_for_Homer(True, input_positive_samples_path,
                                  out_positive_samples_path)
        write_data_file_for_Homer(False, input_negative_samples_path,
                                  out_negative_samples_path)

    print("End!")
Пример #6
0
def main():
    """Create npy files for the project via the generic DataLoader.

    Fix: converted Python 2 ``print`` statements to Python 3 calls for
    consistency with the rest of the file.
    """
    project = test_CNN.get_project_and_check_arguments(sys.argv,
                                                       "run_data_loader.py")
    print("start creating data for project: ", project.project_name)
    # NOTE(review): motifs_base_path is a module-level name — confirm it
    # is defined at file scope.
    data_loader = DataLoader(project, motifs_base_path)
    data_loader.create_npy_files()
    print("End!")
Пример #7
0
def main():
    """Build combined-species data for the negative-vs-shuffle project.

    Fix: converted Python 2 ``print`` statements to Python 3 calls for
    consistency with the rest of the file.
    """
    project = test_CNN.get_project_and_check_arguments(sys.argv,
                                                       "run_data_loader.py")
    print("start creating data for project: ", project.project_name)
    data_loader = DataLoaderNegDatavsShuffle(project)
    data_loader.get_all_positive_and_negative_samples()

    data_loader.create_data_from_all_species_together()
    print("End!")
Пример #8
0
def main():
    """Build per-species and combined data for the H3K27ac-vs-negative project.

    Fix: converted Python 2 ``print`` statements to Python 3 calls for
    consistency with the rest of the file.
    """
    project = test_CNN.get_project_and_check_arguments(sys.argv, "run_data_loader.py")
    # data_handle.remove_files(project.text_samples_base_dir, ".txt", project.species)
    print("start creating data for project: ", project.project_name)
    data_loader = DataLoaderH3K27acvsNeg(project)
    data_loader.get_all_positive_and_negative_samples()
    data_loader.create_data_for_each_species()
    data_loader.create_data_from_all_species_together()
    print("End!")
Пример #9
0
def main():
    """Create the per-species 'npy_files' and 'samples' directory trees."""
    project = test_CNN.get_project_and_check_arguments(sys.argv, 'create_species_dirs.py')
    project_data_dir = os.path.join(base_path, project.project_name, 'data')
    for dir_name in ['npy_files', 'samples']:
        dir_path = os.path.join(project_data_dir, dir_name)
        for species in project.species:
            species_dir = os.path.join(dir_path, species)
            # Skip anything that already exists in any form.
            if os.path.exists(species_dir) or os.path.isdir(species_dir):
                continue
            print("make directory: ", species_dir)
            os.makedirs(species_dir)
Пример #10
0
def main():
    """Create sample directories (with k-let subdirs) and build TF-vs-shuffle data.

    Fix: converted Python 2 ``print`` statements to Python 3 calls for
    consistency with the rest of the file.
    """
    project = test_CNN.get_project_and_check_arguments(sys.argv,
                                                       "run_data_loader.py")
    print("start creating data for project: ", project.project_name)
    for species in project.species:
        # Text samples live under a per-k subdirectory for this project.
        species_text_samples_dir = os.path.join(
            project.text_samples_base_dir, species,
            project.k_let_dirs[project.k - 1])
        species_npy_samples_dir = os.path.join(project.samples_base_dir,
                                               species)
        if not os.path.exists(species_text_samples_dir) and not os.path.isdir(
                species_text_samples_dir):
            print("make directory: ", species_text_samples_dir)
            os.makedirs(species_text_samples_dir)
        if not os.path.exists(species_npy_samples_dir) and not os.path.isdir(
                species_npy_samples_dir):
            print("make directory: ", species_npy_samples_dir)
            os.makedirs(species_npy_samples_dir)

    data_loader = DataLoaderTFvsShuffle(project)
    data_loader.get_all_positive_and_negative_samples()

    data_loader.create_data_from_all_species_together()
    print("End!")
Пример #11
0
def main():
    """Extract first-layer filters for each trained model (or only the
    all-species model when ``trained_on_all_species_only`` is set)."""
    project = test_CNN.get_project_and_check_arguments(
        sys.argv, 'tensor_visualization.py')
    sorted_models_list, map_model_ids = test_CNN.get_sorted_models_list(
        project)
    number_of_species = len(project.species)
    # with open(conv1_results_output_file, 'w') as conv_file:

    for index_train_species in range(number_of_species):
        if trained_on_all_species_only:
            # Only one pass is needed for the all-species model.
            if index_train_species > 0:
                break
            trained_species_index = number_of_species - 2  # all species 238000 # change if needed
            train_species = project.species[trained_species_index]
            best_model_validation_id = next(iter(map_model_ids))
        else:
            best_model_validation_id = sorted_models_list[index_train_species]
            train_species = map_model_ids[best_model_validation_id]

        print("start show convolution on species: ", train_species)
        model_dir = test_CNN.create_directories(project,
                                                best_model_validation_id)
        conv_results_dir = os.path.join(model_dir, 'convolution_results')
        get_filters(project, train_species, conv_results_dir,
                    best_model_validation_id)
def main():
    """Parse test_results.txt into a [test][train] AUC matrix and plot a heatmap.

    Bug fix: ``new_line`` was only assigned when the line contained a
    newline, so a results file whose first line lacked one raised
    NameError (and later lines silently reused a stale value); it now
    falls back to the raw line.
    """
    project = test_CNN.get_project_and_check_arguments(
        sys.argv, 'display_heatmap_enhancers.py')
    test_results_file_path = os.path.join(project.CNN_output_dir,
                                          "test_results.txt")
    number_of_test_species = len(project.species) - 2
    # Matrix rows = test species, columns = train species.
    tested_on_all_species = []
    for i in range(number_of_test_species):
        tested_on_all_species.append([None] * len(project.species))
    test_counter = 0
    number_of_values = 0
    sum_v = 0
    # NOTE(review): assumes a species-name header line precedes the first
    # value line; otherwise train_index is None below — confirm format.
    train_index = None
    with open(test_results_file_path) as results_file:
        for line in results_file:
            # Strip the trailing newline when present (was: NameError).
            new_line = line[:-1] if "\n" in line else line
            if new_line in project.species:
                # Header line: following values were trained on this species.
                train_index = project.species.index(new_line)
                continue
            split_line = new_line.split()
            for value in split_line:
                number_of_values += 1
                sum_v += float(value)
                # Remap rows from the legacy species order to project order.
                old_test_species = old_species_name_order[test_counter]
                new_test_index = project.species.index(old_test_species)
                tested_on_all_species[new_test_index][train_index] = float(
                    value)
                test_counter += 1
                if test_counter == number_of_test_species:
                    test_counter = 0

    figure_path = os.path.join(project.CNN_output_dir,
                               "heatmap_enhancers_vs_negative_data_cbar.pdf")

    create_heatmap(project, tested_on_all_species, figure_path)
Пример #13
0
def main():
    """Dump convolution figures for the all-species model or for every model.

    The per-model work (test, report score extremes, write convolution
    figures) was duplicated verbatim in both branches; it now lives in
    ``_show_convolution_for_model``.
    """
    project = test_CNN.get_project_and_check_arguments(sys.argv,
                                                       'show_convolution.py')
    sorted_models_list, map_model_ids = test_CNN.get_sorted_models_list(
        project)
    number_of_species = len(project.species)
    if trained_on_all_species_only:
        trained_species_index = number_of_species - 2  # all species 238000
        train_species = project.species[trained_species_index]
        best_model_validation_id = (list(map_model_ids.keys()))[0]
        _show_convolution_for_model(project, best_model_validation_id,
                                    train_species)
    else:
        for best_model_validation_id in sorted_models_list:
            train_species = map_model_ids[best_model_validation_id]
            _show_convolution_for_model(project, best_model_validation_id,
                                        train_species)
    print("End!!!")


def _show_convolution_for_model(project, best_model_validation_id,
                                train_species):
    """Test one model, print its score extremes, and write convolution
    figures/text files for the best- and worst-scoring samples."""
    # Models are evaluated on their own training species.
    test_species = train_species
    array_true_labels, array_prediction_scores = test_CNN.import_model_and_test(
        project, best_model_validation_id, test_species, train_species)

    max_score = max(array_prediction_scores)
    print("max_score = ", max_score)
    argmax = np.argmax(array_prediction_scores)
    print("argmax = ", argmax)
    label_max = array_true_labels[argmax]
    print("label_max = ", label_max)
    min_score = min(array_prediction_scores)
    print("min_score = ", min_score)
    argmin = np.argmin(array_prediction_scores)
    print("argmin = ", argmin)
    label_min = array_true_labels[argmin]
    print("label_min = ", label_min)
    conv_results_all_layers_max, sample_max_score, conv_results_all_layers_min, sample_min_score = \
        restore_model_and_get_conv_results(project, train_species,
                                           best_model_validation_id, argmin, argmax)
    # True/False selects the "max"/"min" output flavour respectively.
    create_convolution_figures_and_text_files(project,
                                              best_model_validation_id,
                                              train_species,
                                              conv_results_all_layers_max,
                                              sample_max_score, True)
    create_convolution_figures_and_text_files(project,
                                              best_model_validation_id,
                                              train_species,
                                              conv_results_all_layers_min,
                                              sample_min_score, False)
Пример #14
0
def main():
    """Plot a single figure comparing ROC curves of the CNN and PSSM models.

    Relies on module-level names: ``PSSM_models`` (e.g. ["denovo",
    "JASPAR"]) plus the ``titles`` and ``legend`` flags — assumed
    defined at file scope; confirm against the full module.
    """
    fig = plt.figure(1)
    # Diagonal reference line (random classifier).
    x = [0, 1]
    plt.plot(x, x, 'k--')
    project = test_CNN.get_project_and_check_arguments(sys.argv,
                                                       "roc_comparison.py")
    TF_name = project.PWM.split("_")[0]  # CEBPA for example
    figure_roc_path = os.path.join(
        project.basic_output_dir, "ROC_comparison_between_models_" +
        project.distribution_samples_center_dir + "_sigma_" +
        str(project.sigma) + ".pdf")

    # CNN: ROC of the best model of the first train species.
    sorted_models_list, map_model_ids = test_CNN.get_sorted_models_list(
        project)
    index_train_species = 0
    best_model_validation_id = sorted_models_list[index_train_species]
    train_species = map_model_ids[best_model_validation_id]
    print("train_species: ", train_species)
    model_dir = test_CNN.create_directories(project, best_model_validation_id)
    CNN_scores, CNN_labels = read_scores_and_labels_files(
        model_dir, project, best_model_validation_id=best_model_validation_id)
    model_label = "CNN: , AUC: "
    add_roc_curve(CNN_labels, CNN_scores, fig, model_label)

    # PSSM models: each evaluated with and without a location prior.
    for pssm_model in PSSM_models:  # ["denovo", "JASPAR"]

        for pr in ["with_prior", "without_prior"]:
            dir_name = "CEBPA_" + pssm_model

            PSSM_output_dir = os.path.join(
                project.PSSM_output_dir,
                project.distribution_samples_center_dir, dir_name)
            PSSM_scores, PSSM_labels = read_scores_and_labels_files(
                PSSM_output_dir, project, pssm_model=pssm_model, pr=pr)
            # Human-readable legend label for this model variant.
            if pr == "with_prior":
                # model_label = "PSSM, " + pssm_model + ", with location prior, AUC: "
                if pssm_model == "denovo":
                    model_label = "denovo motif w/ location prior, AUC: "
                elif pssm_model == "JASPAR":
                    model_label = "Gold standard model - True motif w/ location prior, AUC: "
            else:
                # model_label = "PSSM, " + pssm_model + ", AUC: "
                if pssm_model == "denovo":
                    model_label = "denovo motif w/o location prior, AUC: "
                elif pssm_model == "JASPAR":
                    model_label = "True motif w/o location prior, AUC: "
            add_roc_curve(PSSM_labels, PSSM_scores, fig, model_label)

    plt.xlabel('False positive rate', fontsize=20)
    plt.ylabel('True positive rate', fontsize=20)

    if titles:
        plt.suptitle('ROC curve')
        data_name = " ".join(project.project_name.split("_"))
        plt.title('Comparison between models, ' + data_name +
                  ' of single TF: ' + TF_name)
    if legend:
        plt.legend(loc='best')
        # A legend changes the layout, so it gets its own output file.
        new_figure_path = figure_roc_path[:-len(".pdf")] + "_with_legend.pdf"
    else:
        new_figure_path = figure_roc_path
    plt.savefig(new_figure_path, format='pdf')
    print("saving figure: ", new_figure_path, "\n\n")
Пример #15
0
def main():
    """Build a train-vs-test AUC heatmap (and results text file) for the TF project.

    Bug fixes:
    - ``results_path`` was only assigned in the ``else`` branch, so any
      project with ``project.k`` set crashed with NameError at
      ``open(results_path, ...)``; it is now set in both branches.
    - ``tested_on_Ggal_results`` was only defined when "Gallus_gallus"
      is in ``project.species`` but referenced whenever a results line
      tests on it; it is now always initialised.
    """
    project = test_CNN.get_project_and_check_arguments(
        sys.argv, 'display_heatmap_TF.py')
    sorted_models_list, map_model_ids = test_CNN.get_sorted_models_list(
        project)

    # Fixed heatmap column order, keyed by training species.
    if "Gallus_gallus" in project.species:
        train_indices_map = {
            "Canis_familiaris": 2,
            "Gallus_gallus": 4,
            "Homo_sapiens": 0,
            "Monodelphis_domestica": 3,
            "Mus_musculus": 1,
            "All_species_12000": 5,
            "All_species_60000": 6
        }
    else:
        train_indices_map = {
            "Canis_familiaris": 2,
            "Homo_sapiens": 0,
            "Monodelphis_domestica": 3,
            "Mus_musculus": 1,
            "All_species_12000": 4,
            "All_species_60000": 5
        }

    test_results_file_1 = project.test_file
    # One row of AUCs per test species; Ggal is initialised
    # unconditionally (see docstring) but only written out when present.
    tested_on_Cfam_results = [None] * len(project.species)
    tested_on_Mmus_results = [None] * len(project.species)
    tested_on_Ggal_results = [None] * len(project.species)
    tested_on_Mdom_results = [None] * len(project.species)
    tested_on_Hsap_results = [None] * len(project.species)

    matrix_results_CEBPA_k_4 = []
    auc = None
    for test_results_file in [test_results_file_1]:
        with open(test_results_file) as results_file:
            for line in results_file:
                if re.match("^\s$", line):
                    continue
                elif line.startswith("train:") or \
                        line.startswith("finish test"):
                    continue
                split_line = line.split()
                if line.startswith("best_model_validation_id"):
                    model_id = split_line[2]
                    train = map_model_ids[model_id]
                elif line.startswith("test:"):
                    test = split_line[1]

                elif line.startswith("auc:"):
                    auc = float(split_line[1])
                # NOTE(review): once auc is set this also fires for the
                # model-id/test lines with a stale auc; the stale writes
                # are overwritten by later "auc:" lines when the file is
                # complete — kept as-is to preserve behaviour.
                if auc:
                    train_index = train_indices_map[train]
                    if test == "Canis_familiaris":
                        tested_on_Cfam_results[train_index] = auc
                    elif test == "Mus_musculus":
                        tested_on_Mmus_results[train_index] = auc
                    elif test == "Gallus_gallus":
                        tested_on_Ggal_results[train_index] = auc
                    elif test == "Monodelphis_domestica":
                        tested_on_Mdom_results[train_index] = auc
                    elif test == "Homo_sapiens":
                        tested_on_Hsap_results[train_index] = auc

    if project.k:
        figure_path = os.path.join(
            project.CNN_output_dir,
            "heatmap_TF_vs_k_shuffle_k_" + str(project.k) + ".pdf")
        # Was missing in the original -> NameError when project.k is set.
        results_path = os.path.join(
            project.CNN_output_dir,
            "heatmap_results_TF_vs_k_shuffle_k_" + str(project.k) + ".txt")
    else:
        figure_path = os.path.join(project.CNN_output_dir,
                                   "heatmap_TF_vs_negative_data.pdf")
        results_path = os.path.join(project.CNN_output_dir,
                                    "heatmap_results_TF_vs_negative_data.txt")
    with open(results_path, "w") as out_results:

        def _append_row(label, row):
            # Record one heatmap row and mirror it as "<label>v1,v2,...,".
            matrix_results_CEBPA_k_4.append(row)
            out_results.write(label + "".join(str(v) + "," for v in row) + "\n")

        if "Gallus_gallus" in project.species:
            _append_row("Chicken: ", tested_on_Ggal_results)
        _append_row("Opossum: ", tested_on_Mdom_results)
        _append_row("Dog: ", tested_on_Cfam_results)
        _append_row("Mouse: ", tested_on_Mmus_results)
        _append_row("Human: ", tested_on_Hsap_results)

    create_heatmap(project, matrix_results_CEBPA_k_4, figure_path)
Пример #16
0
def main():
    """Test every trained model on every test species, logging AUCs and
    ROC curves, then plot the average AUC over all tested species.

    NOTE(review): this loop is deliberately order-dependent —
    ``train_species``, ``model_dir``, ``array_true_labels``,
    ``array_prediction_scores`` and ``average_auc`` from the LAST normal
    iteration are reused by the extra final pass
    (``index_to_test_on == number_of_species``); it assumes at least one
    normal pass ran first. ``n``, ``trained_on_one_species_only``,
    ``train_on_human``, ``train_on_all_samples``, ``train_on_dog`` and
    ``plot_ROC`` are module-level settings — confirm against the full
    module.
    """
    # os.system("module load tensorflow;")
    project = test_CNN.get_project_and_check_arguments(sys.argv,
                                                       'run_test_CNN.py')
    project.num_times_negative_data_is_taken = n
    sorted_models_list, map_model_ids = test_CNN.get_sorted_models_list(
        project)
    sum_auc = 0
    with open(project.test_file, 'a+') as out_file:
        number_of_species = len(project.species)
        number_species_tested_on = 0
        # One extra iteration (== number_of_species) is the "average" pass.
        for index_to_test_on in range(number_of_species + 1):
            if index_to_test_on != number_of_species:
                test_species = project.species[index_to_test_on]
                if "All_species" in test_species:
                    continue
                number_species_tested_on += 1
                print("test on species: ", test_species)
            for best_model_validation_id in sorted_models_list:
                # last_iteration - just plot the average auc of all tested species on the ROC figure
                if index_to_test_on == number_of_species:
                    average_auc_returned = test_CNN.draw_roc_curve(
                        array_true_labels,
                        array_prediction_scores,
                        project,
                        model_dir,
                        best_model_validation_id,
                        out_file,
                        train_species,
                        test_species,
                        plot_ROC=plot_ROC,
                        average_auc=average_auc)
                    print("average_auc_returned : ", average_auc_returned)
                    out_file.write("average_auc_returned: {0:.3f}".format(
                        average_auc_returned) + "\n")
                    break
                train_species = map_model_ids[best_model_validation_id]
                if trained_on_one_species_only:
                    # Override which species' model counts as "trained",
                    # driven by the module-level train_on_* flags.
                    if train_on_human:
                        if "Homo_sapiens" in project.species:
                            trained_species_index = project.species.index(
                                "Homo_sapiens")
                        elif "Human" in project.species:
                            trained_species_index = project.species.index(
                                "Human")
                    elif train_on_all_samples:
                        trained_species_index = number_of_species - 2  # all species 238000
                    elif train_on_dog:
                        if "Canis_familiaris" in project.species:
                            trained_species_index = project.species.index(
                                "Canis_familiaris")
                        elif "Dog" in project.species:
                            trained_species_index = project.species.index(
                                "Dog")
                    train_species = project.species[trained_species_index]
                    print("train_species: ", train_species)
                # if project.project_name == "TF_vs_k_shuffle":
                #     model_dir = os.path.join(project.checkpoints_folder, best_model_validation_id)
                # else:
                model_dir = os.path.join(project.checkpoints_folder_tmp,
                                         best_model_validation_id)
                # Unpack the model's checkpoint tarball on first use.
                if not os.path.exists(model_dir) and not os.path.isdir(
                        model_dir):
                    tar = tarfile.open(model_dir + ".tar")
                    tar.extractall(path=model_dir)
                    tar.close()
                print("train on species: ", train_species)
                model_dir = test_CNN.create_directories(
                    project, best_model_validation_id)
                array_true_labels, array_prediction_scores = \
                    test_CNN.import_model_and_test(project, best_model_validation_id,
                                                   test_species, train_species, out_file)
                test_CNN.write_labels_and_scores(model_dir,
                                                 best_model_validation_id,
                                                 array_prediction_scores,
                                                 array_true_labels,
                                                 test_species)
                auc = test_CNN.draw_roc_curve(array_true_labels,
                                              array_prediction_scores,
                                              project,
                                              model_dir,
                                              best_model_validation_id,
                                              out_file,
                                              train_species,
                                              test_species,
                                              plot_ROC=plot_ROC)
                print("auc: ", auc)
                # print("sum_auc: ", sum_auc)
                sum_auc += auc
                # Running mean over the species tested so far.
                average_auc = (sum_auc / number_species_tested_on)
            if index_to_test_on != number_of_species:
                out_file.write('finish test on species: ' + test_species +
                               '\n\n\n')
                out_file.flush()
            print()
        # print("average_auc : ", average_auc)
        # out_file.write("average AUC: {0:.3f}".format(average_auc) + "\n")

    print("end")