def main():
    """
    Build the stretch figures and tables for each (factor type, regulation)
    pair analysed.

    For every pair, the exon sets returned by ``initiate_list_of_factor``
    are scored with ``get_stretch_score_list``. The values of the control
    exons (``exon_type``) are not recomputed: they are read from the
    ``<exon_type>_stretches`` module found in the ``control_dictionaries``
    folder. One figure and one table are then produced per stretch
    definition of ``config.stretches`` and per nucleotide of
    ``config.nt_list``.
    """
    from contextlib import closing

    exon_class_bp.set_debug(0)
    exon_type = "CCE"
    src_dir = os.path.realpath(os.path.dirname(__file__))
    src_suffix = "src/stretch_calculator"
    output = src_dir.replace(src_suffix, "result/stretch_calculator/")
    file_dir = src_dir.replace(src_suffix, "result/")
    if not os.path.isdir(output):
        os.mkdir(output)
    # The control module is imported by name below, so its folder has to be
    # on the import path.
    ctrl_dir = src_dir + "/control_dictionaries/"
    sys.path.insert(0, ctrl_dir)
    fasterdb = src_dir.replace(src_suffix, "data/fasterDB_lite.db")
    seddb = src_dir.replace(src_suffix, "data/sed.db")
    analyses = [("exon", "down"), ("spliceosome", "down"),
                ("spliceosome", "up")]
    # ``closing`` guarantees that both connections are released, even when
    # one of the analyses raises.
    with closing(sqlite3.connect(fasterdb)) as cnx, \
            closing(sqlite3.connect(seddb)) as cnx_sed:
        for type_analysis, regulation in analyses:
            name_file, list_file = initiate_list_of_factor(
                file_dir, exon_type, type_analysis)
            dict_stretch_3ss = {
                "X".join(map(str, stretch_data)):
                    {nt: [] for nt in config.nt_list}
                for stretch_data in config.stretches
            }
            for j, cur_name in enumerate(name_file):
                if cur_name != exon_type:
                    exon_list = extract_data(cnx, cnx_sed, list_file,
                                             name_file, j, regulation)
                    for stretch_data in config.stretches:
                        st_name = "X".join(map(str, stretch_data))
                        stretch_dic = get_stretch_score_list(
                            exon_list, stretch_data)
                        for nt in config.nt_list:
                            dict_stretch_3ss[st_name][nt].append(
                                stretch_dic[nt])
                else:
                    # Control exons: pre-computed values stored as
                    # ``stretch_<name>`` attributes of the control module.
                    mod = __import__("%s_stretches" % exon_type)
                    for stretch_data in config.stretches:
                        st_name = "X".join(map(str, stretch_data))
                        ctrl_dic = getattr(mod, "stretch_%s" % st_name)
                        for nt in config.nt_list:
                            dict_stretch_3ss[st_name][nt].append(
                                ctrl_dic[nt])
            for stretch_data in config.stretches:
                st_name = "X".join(map(str, stretch_data))
                for nt in config.nt_list:
                    fig_name = "nb_stretch_%s-%s_%s_nt" % (
                        stretch_data[1], stretch_data[0], nt)
                    create_figure(dict_stretch_3ss[st_name][nt], name_file,
                                  output, regulation, fig_name,
                                  type_analysis)
                    dataframe_creator(dict_stretch_3ss[st_name][nt],
                                      name_file, output, regulation,
                                      fig_name, type_analysis)
def main_2g(list_file, name_file, exon_type, fasterdb, seddb, output):
    """
    Create the stretch figures and tables (prefixed ``2.2G_``) for the
    given exon files and for the control exons.

    The control exons are added as a last data set named ``exon_type``;
    their values are read from the ``<exon_type>_stretches`` module of the
    ``control_dictionaries`` folder instead of being recomputed.

    :param list_file: (list of str) list of exons files in the form
        of GC_rich_exon file. The list is not modified.
    :param name_file: (list of str) the name of each files of exons
        given in ``list_file``. The list is not modified.
    :param exon_type: (str) the control exons
    :param fasterdb: (str) path to fasterdb database
    :param seddb: (str) path to sed database
    :param output: (str) folder where the result will be created
    """
    from contextlib import closing

    # Work on copies: appending the control entry to the caller's lists
    # would duplicate it if the same lists are reused for another call.
    list_file = list(list_file) + [None]
    name_file = list(name_file) + [exon_type]
    exon_class_bp.set_debug(0)
    # The control module is imported by name below, so its folder has to be
    # on the import path.
    ctrl_dir = os.path.realpath(os.path.dirname(__file__)) + \
        "/control_dictionaries/"
    sys.path.insert(0, ctrl_dir)
    type_analysis = "exon"
    regulation = "down"
    dict_stretch_3ss = {
        "X".join(map(str, stretch_data)): {nt: [] for nt in config.nt_list}
        for stretch_data in config.stretches
    }
    # ``closing`` guarantees that both connections are released, even when
    # the extraction raises.
    with closing(sqlite3.connect(fasterdb)) as cnx, \
            closing(sqlite3.connect(seddb)) as cnx_sed:
        for j, cur_name in enumerate(name_file):
            if cur_name != exon_type:
                exon_list = extract_data(cnx, cnx_sed, list_file, name_file,
                                         j, regulation)
                for stretch_data in config.stretches:
                    st_name = "X".join(map(str, stretch_data))
                    stretch_dic = get_stretch_score_list(exon_list,
                                                         stretch_data)
                    for nt in config.nt_list:
                        dict_stretch_3ss[st_name][nt].append(
                            stretch_dic[nt])
            else:
                # Control exons: pre-computed values stored as
                # ``stretch_<name>`` attributes of the control module.
                mod = __import__("%s_stretches" % exon_type)
                for stretch_data in config.stretches:
                    st_name = "X".join(map(str, stretch_data))
                    ctrl_dic = getattr(mod, "stretch_%s" % st_name)
                    for nt in config.nt_list:
                        dict_stretch_3ss[st_name][nt].append(ctrl_dic[nt])
    for stretch_data in config.stretches:
        st_name = "X".join(map(str, stretch_data))
        for nt in config.nt_list:
            fig_name = "2.2G_nb_stretch_%s-%s_%s_nt" % (
                stretch_data[1], stretch_data[0], nt)
            create_figure(dict_stretch_3ss[st_name][nt], name_file, output,
                          regulation, fig_name, type_analysis)
            dataframe_creator(dict_stretch_3ss[st_name][nt], name_file,
                              output, regulation, fig_name, type_analysis)
def main(exon_file, name_table, list_sf, sed, fasterdb, output, ss="5'ss"):
    """
    Create a table describing the exons shared by ``exon_file`` and the
    exons down-regulated by every factor of ``list_sf``.

    The exon lists are combined with ``get_union_exon`` and described with
    ``get_exon_data``. When ``ss`` is ``"5'ss"`` the table is completed by
    ``computing_mfe``; for any other value it is merged with the result of
    ``svm_bp_finder_launcher``. The table is written, tab separated, to
    ``<output>/<name_table>_<sf1>_..._commons_down_exons.csv``.

    :param exon_file: (str) a file containing gc/at exons
    :param name_table: (str) the name of the resulting table
    :param list_sf: (List(vtype=str)) list of sf name
    :param sed: (str) path to sed database
    :param fasterdb: (str) path to fasterdb database
    :param output: (str) folder where the output will be created
    :param ss: (str) the splicing site of interest
    """
    from contextlib import closing

    sf_names = "_".join([name_table] + list_sf)
    exon_class.set_debug(1)
    exon_class_bp.set_debug(debug=1)
    # ``closing`` guarantees that both connections are released, even when
    # one of the steps below raises.
    with closing(sqlite3.connect(sed)) as cnx_sed, \
            closing(sqlite3.connect(fasterdb)) as cnx_fasterdb:
        print("Getting exon from file")
        exon_list = [get_exon(exon_file)]
        print("Getting regulated exons")
        for sf in list_sf:
            regulated = [
                [int(v[0]), int(v[1])]
                for v in udf.get_every_events_4_a_sl(cnx_sed, sf, "down")
            ]
            exon_list.append(regulated)
            print("\t%s : %s down-regulated exons" % (sf, len(regulated)))
        new_exon_list = reduce(get_union_exon, exon_list)
        print("Commons exons : %s" % len(new_exon_list))
        print("Getting commons exons data !")
        df = get_exon_data(cnx_sed, new_exon_list, ss)
        if ss == "5'ss":
            noutput = output + "/rnafold_" + sf_names + \
                "_commons_down_exons/"
            print("Computing MFE")
            df = computing_mfe(cnx_fasterdb, df, noutput)
        else:
            # Code to compute number of good branch point
            print("Computing Good branch point")
            nexon_list = df[["gene_name", "gene_id", "pos"]].values
            df2 = svm_bp_finder_launcher(cnx_fasterdb, nexon_list, output)
            print(df2.head())
            print(df.head())
            df = pd.merge(df, df2, how="right", on=["gene_id", "pos"])
    print("Writing results !")
    df.to_csv("%s/%s_commons_down_exons.csv" % (output, sf_names),
              sep="\t", index=False)
def main():
    """
    Write the bed file of the branch points predicted for the GC exons,
    the AT exons and the control exons.

    The exons are read from the ``result/GC_rich_exons`` and
    ``result/AT_rich_exons`` files; the control exons come from
    ``get_ctrl_exons``, which is given the exons returned by
    ``udf.get_exon_regulated_by_sf``. Each line returned by
    ``function_bp.goob_bp_only`` is written to
    ``result/experimental_branch_point/predicted_branch_points.bed`` with
    the kind of exon appended to its fourth field and ``chr`` prepended to
    its first field. A progress counter is displayed on the standard output.
    """
    from contextlib import closing

    regulation = "down"
    exon_class_bp.set_debug(0)
    base = os.path.dirname(
        os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    output = base + "/result/experimental_branch_point"
    at_exon_file = base + "/result/AT_rich_exons"
    gc_exon_file = base + "/result/GC_rich_exons"
    fasterdb = base + "/data/fasterDB_lite.db"
    seddb = base + "/data/sed.db"
    exon_type = "CCE"
    # ``closing`` guarantees that both connections are released, even when
    # the prediction raises.
    with closing(sqlite3.connect(fasterdb)) as cnx, \
            closing(sqlite3.connect(seddb)) as cnx_sed:
        at_exon = read_file(at_exon_file)
        gc_exon = read_file(gc_exon_file)
        exon2remove = [
            list(map(int, exon))
            for exon in udf.get_exon_regulated_by_sf(cnx_sed, regulation)
        ]
        ctrl_exons = get_ctrl_exons(cnx, exon_type, exon2remove)
        exon_list = gc_exon + at_exon + ctrl_exons
        type_exon = ["GC-exons"] * len(gc_exon) + \
            ["AT-exons"] * len(at_exon) + \
            ["%s-exons" % exon_type] * len(ctrl_exons)
        tot = len(exon_list)
        count_none = 0
        print("Creating bed of predicted branch points")
        # Without this the ``open`` below fails when the folder is missing.
        if not os.path.isdir(output):
            os.mkdir(output)
        with open("%s/predicted_branch_points.bed" % output, "w") as outf:
            for count, (exon, name_exon) in enumerate(
                    zip(exon_list, type_exon), 1):
                exon = exon_class_bp.ExonClass(cnx, str(exon[0]), exon[0],
                                               exon[1])
                # Only the list of lines is needed here; the first returned
                # value is ignored.
                _, list_pos = function_bp.goob_bp_only(exon)
                if list_pos is None:
                    count_none += 1
                else:
                    for line in list_pos:
                        line[3] += "_" + name_exon
                        line[0] = "chr" + str(line[0])
                        outf.write("\t".join(map(str, line)) + "\n")
                sys.stdout.write("%s/%s (%s)         \r" %
                                 (count, tot, count_none))
def control_dictionaries_creator():
    """
    Create the control dictionary containing the values corresponding to
    the score of bp and ppt for every control exons.

    For each control exon type and each size, the nine lists returned by
    ``function_bp.bp_ppt_calculator`` are written as python assignments
    (one ``name=list`` per line) in the file
    ``control_dictionaries/<exon_type>_<size>_bp_ppt_score.py``, so they can
    later be imported as a module.
    """
    from contextlib import closing

    exon_class_bp.set_debug(0)
    dir_path = os.path.dirname(os.path.realpath(__file__))
    src_suffix = "src/make_control_files_bp_ppt"
    fasterdb = dir_path.replace(src_suffix, "data/fasterDB_lite.db")
    seddb = dir_path.replace(src_suffix, "data/sed.db")
    ctrl_dir = dir_path + "/control_dictionaries/"
    exon_type = ["CCE"]
    sizes = [100, 50, 35, 25]
    # ``closing`` guarantees that both connections are released, even when
    # the computation raises.
    with closing(sqlite3.connect(fasterdb)) as cnx, \
            closing(sqlite3.connect(seddb)) as cnx_sed:
        exon2remove = union_dataset_function.get_exon_regulated_by_sf(
            cnx_sed, "down")
        if not os.path.isdir(ctrl_dir):
            os.mkdir(ctrl_dir)
        for cur_exon_type in exon_type:
            ctrl_exon_list = get_control_exon_information(
                cnx, cur_exon_type, exon2remove)
            print("retrieving upstream intron sequence")
            list_exon = [
                exon_class_bp.ExonClass(cnx, exon[0], exon[1], exon[2])
                for exon in ctrl_exon_list
            ]
            for size in sizes:
                print("calculating bp and ppt score")
                bp_score_list, ppt_score_list, nb_bp_list, \
                    nb_good_bp_list, sequence_list, ag_count_list, \
                    hbound_list, uaa_list, una_list = \
                    function_bp.bp_ppt_calculator(list_exon, size)
                # Variable names of the generated module, in writing order.
                content = [
                    ("bp_score", bp_score_list),
                    ("ppt_score", ppt_score_list),
                    ("nb_bp", nb_bp_list),
                    ("nb_good_bp", nb_good_bp_list),
                    ("bp_seq", sequence_list),
                    ("ag_count", ag_count_list),
                    ("hbound", hbound_list),
                    ("uaa_count", uaa_list),
                    ("una_count", una_list),
                ]
                file_name = ctrl_dir + cur_exon_type + "_" + str(size) + \
                    "_bp_ppt_score.py"
                # ``with`` closes the file even if a write fails.
                with open(file_name, "w") as cur_file:
                    for var_name, values in content:
                        cur_file.write(var_name + "=" + str(values) + "\n")
""" Description: Create a bed file containing every GC and AT exons and their GC frequency """ import sqlite3 import os import sys mydir = os.path.dirname(os.path.dirname(__file__)) sys.path.insert(0, mydir) import union_dataset_function as udf from figure_creator import get_exons_list bp_dir = mydir + "/make_control_files_bp_ppt" sys.path.insert(0, bp_dir) import exon_class_bp exon_class_bp.set_debug(0) from function_bp import bp_ppt_calculator mfe_dir = mydir + "/minimum_free_energy" sys.path.insert(0, mfe_dir) import exon_class exon_class.set_debug(0) from function import mfe_calculator stretch_dir = mydir + "/stretch_calculator" sys.path.insert(0, stretch_dir) from stretch_calculator import stretch_counter def get_exon_from_file(exon_file): """ Get evey exon stored into ``exon_file``.
def main_2efg(list_file, name_file, exon_type, seddb, fasterdb, output):
    """
    Create the branch point figures and tables (prefixed ``2.1E``,
    ``2.1F``, ``2.1G``, ``2.2E`` and ``2.2F``) for the given exon files and
    for the control exons.

    The values are gathered for the sizes 100, 50 and 25. The control exons
    are added as a last data set named ``exon_type``; their values are read
    from the ``<exon_type>_<size>_bp_ppt_score`` modules of the
    ``control_dictionaries`` folder instead of being recomputed.

    :param list_file: (list of str) list of exons files in the form
        of GC_rich_exon file. The list is not modified.
    :param name_file: (list of str) the name of each files of exons
        given in ``list_file``. The list is not modified.
    :param exon_type: (str) the control exons of interest
    :param seddb: (str) path to sed database
    :param fasterdb: (str) path to fasterdb database
    :param output: (str) path where the result will be created
    """
    from contextlib import closing

    # Work on copies: appending the control entry to the caller's lists
    # would duplicate it if the same lists are reused for another call.
    list_file = list(list_file) + [None]
    name_file = list(name_file) + [exon_type]
    exon_class_bp.set_debug(0)
    src_dir = os.path.realpath(os.path.dirname(__file__))
    ctrl_output = src_dir.replace(
        "src/make_control_files_bp_ppt", "result/make_control_files_bp_ppt/")
    if not os.path.isdir(ctrl_output):
        os.mkdir(ctrl_output)
    # The control modules are imported by name below, so their folder has
    # to be on the import path.
    ctrl_dir = src_dir + "/control_dictionaries/"
    sys.path.insert(0, ctrl_dir)
    type_analysis = "exon"
    regulation = "down"
    dict_score_3ss = {
        k: {
            "nb_good_bp_list": [],
            "gc_weblogo": [],
            "hbound": [],
            "una_count": []
        }
        for k in [100, 50, 25]
    }
    # ``closing`` guarantees that both connections are released, even when
    # the extraction raises.
    with closing(sqlite3.connect(fasterdb)) as cnx, \
            closing(sqlite3.connect(seddb)) as cnx_sed:
        for i, cur_name in enumerate(name_file):
            if cur_name != exon_type:
                exon_list = extract_data(cnx, cnx_sed, list_file, name_file,
                                         i, regulation)
                for size, scores in dict_score_3ss.items():
                    # Only four of the nine returned lists are used here.
                    _, _, _, nb_good_bp_list, bp_seq_list, _, \
                        hbound_list, _, una_list = \
                        get_bp_ppt_score_list(ctrl_output, exon_list,
                                              cur_name, size, regulation)
                    scores["nb_good_bp_list"].append(nb_good_bp_list)
                    if size == 25:
                        web_logo_creator(
                            bp_seq_list, "2.1F_%s_%s_exons_%s_nt" %
                            (cur_name, regulation, size), output)
                    scores["gc_weblogo"].append(
                        get_weblogo_gc_count(bp_seq_list))
                    scores["hbound"].append(hbound_list)
                    scores["una_count"].append(una_list)
            else:
                # Control exons: pre-computed values read from a module.
                for size, scores in dict_score_3ss.items():
                    mod = __import__("%s_%s_bp_ppt_score" %
                                     (exon_type, size))
                    scores["nb_good_bp_list"].append(mod.nb_good_bp)
                    scores["gc_weblogo"].append(
                        get_weblogo_gc_count(mod.bp_seq))
                    scores["hbound"].append(mod.hbound)
                    scores["una_count"].append(mod.una_count)
    # Each size feeds its own set of figures.
    for size, scores in dict_score_3ss.items():
        if size == 25:
            create_figure(scores["hbound"], name_file, output, regulation,
                          "2.2E_nb_h_bound_%s_nt" % size, type_analysis,
                          "violin")
            dataframe_creator(scores["hbound"], name_file, output,
                              regulation, "2.2E_nb_h_bound_%s_nt" % size,
                              type_analysis)
            create_figure(scores["gc_weblogo"], name_file, output,
                          regulation, "2.2F_gc_weblogo_%s_nt" % size,
                          type_analysis, "box")
            dataframe_creator(scores["gc_weblogo"], name_file, output,
                              regulation, "2.2F_gc_weblogo_%s_nt" % size,
                              type_analysis)
        if size == 50:
            create_figure(scores["una_count"], name_file, output,
                          regulation, "2.1G_UNA_count(%snt)" % size,
                          type_analysis, "box")
            dataframe_creator(scores["una_count"], name_file, output,
                              regulation, "2.1G_UNA_count(%snt)" % size,
                              type_analysis)
        if size == 100:
            create_barplot(scores["nb_good_bp_list"], name_file, output,
                           regulation,
                           "2.1E_prop_nb_good_branch_point_(%snt)" % size,
                           type_analysis)
            write_proportion_pvalues(
                scores["nb_good_bp_list"], name_file, output, regulation,
                "2.1E_prop_nb_good_branch_point_(%snt)" % size,
                type_analysis)
def main():
    """
    Build the branch point figures and tables for each (factor type,
    regulation) pair analysed.

    For every pair and every size (100, 50, 35 and 25), the values of each
    exon set returned by ``initiate_list_of_factor`` are obtained with
    ``get_bp_ppt_score_list``; the values of the control exons
    (``exon_type``) are read from the ``<exon_type>_<size>_bp_ppt_score``
    modules of the ``control_dictionaries`` folder. A weblogo is created for
    each exon set, then a bar plot, figures and tables are produced for
    each size.
    """
    from contextlib import closing

    exon_class_bp.set_debug(0)
    exon_type = "CCE"
    src_dir = os.path.realpath(os.path.dirname(__file__))
    src_suffix = "src/make_control_files_bp_ppt"
    ctrl_output = src_dir.replace(
        src_suffix, "result/make_control_files_bp_ppt/")
    output = src_dir.replace(src_suffix, "result/bp_ppt_score/")
    file_dir = src_dir.replace(src_suffix, "result/")
    if not os.path.isdir(ctrl_output):
        os.mkdir(ctrl_output)
    if not os.path.isdir(output):
        os.mkdir(output)
    # The control modules are imported by name below, so their folder has
    # to be on the import path.
    ctrl_dir = src_dir + "/control_dictionaries/"
    sys.path.insert(0, ctrl_dir)
    fasterdb = src_dir.replace(src_suffix, "data/fasterDB_lite.db")
    seddb = src_dir.replace(src_suffix, "data/sed.db")
    analyses = [("exon", "down"), ("spliceosome", "down"),
                ("spliceosome", "up")]
    # ``closing`` guarantees that both connections are released, even when
    # one of the analyses raises.
    with closing(sqlite3.connect(fasterdb)) as cnx, \
            closing(sqlite3.connect(seddb)) as cnx_sed:
        for type_analysis, regulation in analyses:
            name_file, list_file = initiate_list_of_factor(
                file_dir, exon_type, type_analysis)
            dict_score_3ss = {
                k: {
                    "nb_good_bp_list": [],
                    "gc_weblogo": [],
                    "hbound": [],
                    "una_count": []
                }
                for k in [100, 50, 35, 25]
            }
            for i, cur_name in enumerate(name_file):
                if cur_name != exon_type:
                    exon_list = extract_data(cnx, cnx_sed, list_file,
                                             name_file, i, regulation)
                    for size, scores in dict_score_3ss.items():
                        # Only four of the nine returned lists are used.
                        _, _, _, nb_good_bp_list, bp_seq_list, _, \
                            hbound_list, _, una_list = \
                            get_bp_ppt_score_list(ctrl_output, exon_list,
                                                  cur_name, size,
                                                  regulation)
                        scores["nb_good_bp_list"].append(nb_good_bp_list)
                        web_logo_creator(
                            bp_seq_list, "%s_%s_exons_%s_nt" %
                            (cur_name, regulation, size), output)
                        scores["gc_weblogo"].append(
                            get_weblogo_gc_count(bp_seq_list))
                        scores["hbound"].append(hbound_list)
                        scores["una_count"].append(una_list)
                else:
                    # Control exons: pre-computed values read from a module.
                    for size, scores in dict_score_3ss.items():
                        mod = __import__("%s_%s_bp_ppt_score" %
                                         (exon_type, size))
                        scores["nb_good_bp_list"].append(mod.nb_good_bp)
                        web_logo_creator(
                            mod.bp_seq, "%s_%s_exons_%s_nt" %
                            (cur_name, regulation, size), output)
                        scores["gc_weblogo"].append(
                            get_weblogo_gc_count(mod.bp_seq))
                        scores["hbound"].append(mod.hbound)
                        scores["una_count"].append(mod.una_count)
            for size, scores in dict_score_3ss.items():
                print("------------> %s nt " % size)
                create_barplot(scores["nb_good_bp_list"], name_file, output,
                               regulation,
                               "prop_nb_good_branch_point_(%snt)" % size,
                               type_analysis)
                dataframe_creator(scores["nb_good_bp_list"], name_file,
                                  output, regulation,
                                  "prop_nb_good_branch_point_(%snt)" % size,
                                  type_analysis)
                create_figure(scores["una_count"], name_file, output,
                              regulation, "UNA_count(%snt)" % size,
                              type_analysis, "box")
                dataframe_creator(scores["una_count"], name_file, output,
                                  regulation, "UNA_count(%snt)" % size,
                                  type_analysis)
                create_figure(scores["hbound"], name_file, output,
                              regulation, "nb_h_bound_%s_nt" % size,
                              type_analysis, "violin")
                dataframe_creator(scores["hbound"], name_file, output,
                                  regulation, "nb_h_bound_%s_nt" % size,
                                  type_analysis)
                create_figure(scores["gc_weblogo"], name_file, output,
                              regulation, "gc_weblogo_%s_nt" % size,
                              type_analysis, "box")
                dataframe_creator(scores["gc_weblogo"], name_file, output,
                                  regulation, "gc_weblogo_%s_nt" % size,
                                  type_analysis)