def create_othergc_exon_file(cnx, output):
    """
    Create the GC-exons file regulated by SRSF2 HNRNPC and SRSF3.

    Exon lists are fetched with ``get_exons_list`` (regulation argument
    "down") for ``group_factor.gc_rich_down``, ``group_factor.at_rich_down``,
    ``group_factor.other`` as a whole, and each factor of
    ``group_factor.other`` individually.  A Venn diagram of the three
    groups is produced, then for each individual factor list and for the
    GC / AT lists two files are written with ``file_writer``:

    * ``<name>_all``: the unfiltered list;
    * ``<name>``: the list without the exons returned by
      ``get_multi_list_common_exons``.

    :param cnx: (sqlite3 object) allow connection to sed database
    :param output: (str) path were the exon files will be created; \
    the files are written in the sub-folder ``other_GC_exons/`` of this path
    """
    output = output + "/other_GC_exons/"
    # makedirs also creates missing parent folders and does not fail when
    # the folder already exists.
    os.makedirs(output, exist_ok=True)

    gc_exon_all = get_exons_list(cnx, group_factor.gc_rich_down, "down")
    print("GC exons : %s" % len(gc_exon_all))
    at_exon_all = get_exons_list(cnx, group_factor.at_rich_down, "down")
    print("Number of at exons all: %s" % len(at_exon_all))
    other_gc_all = get_exons_list(cnx, group_factor.other, "down")
    venn_diagram_creator(
        [other_gc_all, gc_exon_all, at_exon_all],
        ["hnRNPC & SRSF3 & SRSF2", "GC-exons", "AT-exons"],
        output)

    # One exon list per individual factor of the "other" group.
    other_gc_list = [get_exons_list(cnx, [fact], "down")
                     for fact in group_factor.other]
    full_list = other_gc_list + [gc_exon_all, at_exon_all]
    full_names = list(group_factor.other) + ["GC_exons", "AT_exons"]
    common = get_multi_list_common_exons(full_list)

    for exon_list, list_name in zip(full_list, full_names):
        start = len(exon_list)
        print("%s : %s exons" % (list_name, start))
        file_writer(exon_list, list_name + "_all", output)
        filtered = [exon for exon in exon_list if exon not in common]
        stop = len(filtered)
        # An empty list has nothing to lose: avoid dividing by zero.
        loss = round((start - stop) / start * 100, 1) if start else 0.0
        print("%s : %s exons after filter (loss of %s percent)" %
              (list_name, stop, loss))
        file_writer(filtered, list_name, output)
def create_unregulated_exon_list(cnx, output, exon_type):
    """
    Create the list of GC/AT rich unregulated exons.

    The exons found only in the GC group or only in the AT group
    (``group_factor.gc_rich_down`` / ``group_factor.at_rich_down``), plus
    the exons returned by ``find_last_exons``, are handed to
    ``get_control_exon_information`` and ``get_exons_of_interest`` as the
    exons to remove.  The "GC" and "AT" exons of interest that are not
    shared by both lists are written with ``file_writer`` under the names
    ``GC_unregulated`` and ``AT_unregulated``.

    The connection ``cnx`` is closed at the end of the function.

    :param cnx: (sqlite3 connect object) connection to sed database
    :param output: (str) path were the exon files will be created
    :param exon_type: (str) the type of control exons
    """
    gc_exon_all = get_exons_list(cnx, group_factor.gc_rich_down, "down")
    at_exon_all = get_exons_list(cnx, group_factor.at_rich_down, "down")
    gc_exon = [exon for exon in gc_exon_all if exon not in at_exon_all]
    at_exon = [exon for exon in at_exon_all if exon not in gc_exon_all]

    # NOTE(review): first_exons is fetched but never used below (only the
    # last exons are removed) - confirm whether first exons should also be
    # added to exon2remove.
    first_exons = find_first_exons(cnx, exon_type)
    last_exons = find_last_exons(cnx, exon_type)
    exon2remove = gc_exon + at_exon + last_exons
    print("exon to remove : %s" % len(exon2remove))

    min_intron_size, gc_content = \
        get_control_exon_information(cnx, exon_type, exon2remove)
    gc_candidates = get_exons_of_interest(cnx, exon_type, exon2remove,
                                          min_intron_size, gc_content, "GC")
    at_candidates = get_exons_of_interest(cnx, exon_type, exon2remove,
                                          min_intron_size, gc_content, "AT")

    # Both filters must use the unfiltered candidate lists: filtering the
    # AT list against the already-filtered GC list would never remove
    # anything, leaving the shared exons in the AT file.
    gc_exon = [exon for exon in gc_candidates if exon not in at_candidates]
    at_exon = [exon for exon in at_candidates if exon not in gc_candidates]
    print("gc_exon : %s exons" % len(gc_exon))
    print("at_exon : %s exons" % len(at_exon))

    file_writer(gc_exon, "GC_unregulated", output)
    file_writer(at_exon, "AT_unregulated", output)
    cnx.close()
def main():
    """
    Run the enrichment analysis of the exons regulated by spliceosome
    factors within the AT and the GC exon lists.

    The sed database and the result folder are located relative to this
    script.  The result of ``analysis_maker`` is written with
    ``file_writer`` and the returned ``super_dict`` is dumped in
    ``super_dict_<nb_iteration>.txt`` inside the result folder.
    """
    nb_iteration = 10
    script_dir = os.path.realpath(os.path.dirname(__file__))
    seddb = script_dir.replace("src/GC_AT_group_regulated_U1_U2",
                               "data/sed.db")
    cnx = sqlite3.connect(seddb)
    output = script_dir.replace("src/GC_AT_group_regulated_U1_U2",
                                "result/GC_AT_group_regulated_U1_U2/")
    output_exists = os.path.isdir(output)
    if not output_exists:
        os.mkdir(output)

    # Name of each group -> list of factors used to query the database.
    div_group = {
        "AT_rich": group_factor.at_rich_down,
        "GC_rich": group_factor.gc_rich_down,
        "SNRPC": ["SNRPC"],
        "SNRNP70": ["SNRNP70"],
        "DDX5_17": ["DDX5_DDX17"],
        "SF1": ["SF1"],
        "U2AF1": ["U2AF1"],
        "U2AF2": ['U2AF2'],
        "SF3A3": ["SF3A3"],
        "SF3B4": ["SF3B4"],
    }
    dic_exon = {}
    for group_name, factors in div_group.items():
        print("Getting all exon regulated by %s factor" % group_name)
        dic_exon[group_name] = get_exons_list(cnx, factors, "down")

    at_gc_intersection = exon_intersection(dic_exon["AT_rich"],
                                           dic_exon["GC_rich"])
    u1_exons = get_exons_list(cnx, group_factor.u1_factors, "down")
    u2_exons = get_exons_list(cnx, group_factor.u2_factors, "down")
    u1_u2_intersection = exon_intersection(u1_exons, u2_exons)
    print("GC-AT group intersection : %s exons" % len(at_gc_intersection))
    print("U1-U2 interesection : %s exons" % len(u1_u2_intersection))

    # "Pure" groups: exons of a group that are not shared with the other.
    dic_exon["GC_pure"] = exon_difference(dic_exon["GC_rich"],
                                          at_gc_intersection)
    dic_exon["AT_pure"] = exon_difference(dic_exon["AT_rich"],
                                          at_gc_intersection)
    for group_name, exons in dic_exon.items():
        print("%s : %s" % (group_name, len(exons)))

    # Keep every group whose name mentions neither AT nor GC.
    dic_spliceosome = {
        group_name: exons
        for group_name, exons in dic_exon.items()
        if not ("AT" in group_name or "GC" in group_name)
    }
    analysis_dic, super_dict = analysis_maker(
        dic_exon["GC_pure"], dic_exon["AT_pure"], dic_spliceosome,
        u1_u2_intersection, nb_iteration)
    file_writer(output, analysis_dic, nb_iteration, "GC", "AT")

    dump_path = "%ssuper_dict_%s.txt" % (output, nb_iteration)
    dump_text = "super_dict=%s\n" % str(super_dict)
    with open(dump_path, "w") as outfile:
        outfile.write(dump_text)
    cnx.close()
def create_ct_ga_rich_exon_list(cnx, output):
    """
    Write the GA rich and the CT rich exon lists.

    Only the exons that belong to a single group are written: an exon
    found in both the GA and the CT list is dropped from both files.

    :param cnx: (sqlite3 object) allow connection to sed database
    :param output: (str) path were the exon files will be created
    """
    def keep_absent(exons, excluded):
        """Return the exons, in order, that are not in ``excluded``."""
        kept = []
        for exon in exons:
            if exon not in excluded:
                kept.append(exon)
        return kept

    ga_all = get_exons_list(cnx, group_factor.ga_rich_down, "down")
    ct_all = get_exons_list(cnx, group_factor.ct_rich_down, "down")
    ga_only = keep_absent(ga_all, ct_all)
    ct_only = keep_absent(ct_all, ga_all)

    report = (
        ("ga_exon_all : %s exons", ga_all),
        ("ga_exon : %s exons", ga_only),
        ("ct_exon_all : %s exons", ct_all),
        ("ct_exon : %s exons", ct_only),
    )
    for message, exons in report:
        print(message % len(exons))

    file_writer(ga_only, "GA_rich", output)
    file_writer(ct_only, "CT_rich", output)
def main():
    """
    Write bed files holding the data returned by ``get_exon_info``.

    Two files are produced in ``result/correlation_GC-AT-exons_TAD``:
    ``data_for_regulated_exons`` for the exons returned by
    ``udf.get_exon_regulated_by_sf`` and ``data_for_regulated_CCE_exons``
    for those exons plus the CCE control exons.
    """
    # The project root is three levels above this script.
    base = os.path.realpath(__file__)
    for _ in range(3):
        base = os.path.dirname(base)
    seddb = base + "/data/sed.db"
    fasterdb = base + "/data/fasterDB_lite.db"
    output = base + "/result/correlation_GC-AT-exons_TAD"
    output_exists = os.path.isdir(output)
    if not output_exists:
        os.mkdir(output)

    cnx = sqlite3.connect(seddb)
    u1_factors = ["SNRPC", "SNRNP70", "DDX5_DDX17"]
    u2_factors = ["U2AF2", "SF1", "SF3A3", "SF3B4"]
    # Every value of an exon is converted to int.
    u1_exons = [[int(value) for value in exon]
                for exon in get_exons_list(cnx, u1_factors, "down")]
    u2_exons = [[int(value) for value in exon]
                for exon in get_exons_list(cnx, u2_factors, "down")]
    print("U1-exons : %s exons" % len(u1_exons))
    print("U2-exons : %s exons" % len(u2_exons))

    # First bed file: regulated exons only.
    exon_list = udf.get_exon_regulated_by_sf(cnx, "down")
    print("Getting exon data ...")
    exon_data = get_exon_info(cnx, seddb, fasterdb, exon_list,
                              u1_exons, u2_exons)
    print("Writing bed")
    write_bed(output, exon_data, "data_for_regulated_exons")
    cnx.close()

    # Second bed file: CCE control exons + regulated exons.
    cnx = sqlite3.connect(seddb)
    exon2remove = udf.get_exon_regulated_by_sf(cnx, "down")
    cnx_fasterdb = sqlite3.connect(fasterdb)
    control_exons = get_control_exon_information(cnx_fasterdb, "CCE",
                                                 exon2remove)
    exon_list = control_exons + exon2remove
    cnx_fasterdb.close()
    print("CCE exons + regulated exons : %s" % len(exon_list))
    print("Getting CCE + regulated exon data ...")
    exon_data = get_exon_info(cnx, seddb, fasterdb, exon_list,
                              u1_exons, u2_exons)
    print("Writing bed")
    write_bed(output, exon_data, "data_for_regulated_CCE_exons")
    cnx.close()
def main_3g(list_file, name_file, seddb, output, reverse):
    """
    Run the enrichment analysis of the exons regulated by spliceosome
    factors for two exon lists, then build the figure with ``fig_3g``.

    :param list_file: forwarded as is to ``analysis_maker_bis``
    :param name_file: forwarded to ``analysis_maker_bis``; its two first \
    items are also given to ``file_writer``
    :param seddb: (str) path to the sed database
    :param output: (str) prefix of the ``super_dict_<nb_iteration>.txt`` \
    file, also forwarded to ``file_writer`` and ``fig_3g``
    :param reverse: forwarded as is to ``fig_3g``
    """
    nb_iteration = 10000
    cnx = sqlite3.connect(seddb)

    # Each group holds the single factor it is named after.
    factor_names = (
        "SNRPC",
        "SNRNP70",
        # "DDX5_17": ["DDX5_DDX17"] is deliberately left out
        "SF1",
        "U2AF1",
        "U2AF2",
        # "SF3A3": ["SF3A3"] is deliberately left out
        "SF3B4",
    )
    div_group = {factor: [factor] for factor in factor_names}

    dic_exon = {}
    for group_name, factors in div_group.items():
        print("Getting all exon regulated by %s factor" % group_name)
        dic_exon[group_name] = get_exons_list(cnx, factors, "down")
    for group_name, exons in dic_exon.items():
        print("%s : %s" % (group_name, len(exons)))

    # Keep every group whose name mentions neither AT nor GC.
    dic_spliceosome = {
        group_name: exons
        for group_name, exons in dic_exon.items()
        if not ("AT" in group_name or "GC" in group_name)
    }
    analysis_dic, super_dict = analysis_maker_bis(
        list_file, name_file, dic_spliceosome, nb_iteration)
    filename = file_writer(output, analysis_dic, nb_iteration,
                           name_file[0], name_file[1])
    fig_3g(filename, output, reverse)

    dump_path = "%ssuper_dict_%s.txt" % (output, nb_iteration)
    dump_text = "super_dict=%s\n" % str(super_dict)
    with open(dump_path, "w") as outfile:
        outfile.write(dump_text)
    cnx.close()