def control_dictionaries_creator():
    """
    Create the control file holding the mfe values computed at the 3'ss and
    5'ss of every control exon.

    The exon type is fixed to "CCE". The exons returned by
    ``union_dataset_function.get_exon_regulated_by_sf(cnx_sed, "down")`` are
    handed to ``get_control_exon_information`` as the exons to remove.
    The result is written to ``control_dictionaries/CCE_mfe.py`` (next to
    this file) as two assignments: ``mfe_3ss=...`` and ``mfe_5ss=...``.
    """
    # Local import: keeps the block self-contained without touching the
    # file-level import section.
    from contextlib import closing

    exon_class.set_debug(0)
    dir_path = os.path.dirname(os.path.realpath(__file__))
    fasterdb = dir_path.replace("src/minimum_free_energy",
                                "data/fasterDB_lite.db")
    seddb = dir_path.replace("src/minimum_free_energy", "data/sed.db")
    ctrl_dir = dir_path + "/control_dictionaries/"
    if not os.path.isdir(ctrl_dir):
        os.mkdir(ctrl_dir)
    exon_type = "CCE"
    # Both connections are closed even if a step below raises; the mfe
    # computation stays inside the block because the exon objects were
    # built from ``cnx``.
    with closing(sqlite3.connect(fasterdb)) as cnx, \
            closing(sqlite3.connect(seddb)) as cnx_sed:
        exon2remove = union_dataset_function.get_exon_regulated_by_sf(
            cnx_sed, "down")
        ctrl_exon_list = get_control_exon_information(cnx, exon_type,
                                                      exon2remove)
        print("retrieving upstream intron sequence")
        list_exon = [
            exon_class.ExonClass(cnx, exon[0], exon[1], exon[2])
            for exon in ctrl_exon_list
        ]
        print("calculating mfe")
        mfe_list_3ss, mfe_list_5ss = function.mfe_calculator(list_exon)
    # Context manager guarantees the file is flushed and closed on error.
    with open(ctrl_dir + exon_type + "_mfe.py", "w") as cur_file:
        cur_file.write("mfe_3ss=" + str(mfe_list_3ss) + "\n")
        cur_file.write("mfe_5ss=" + str(mfe_list_5ss) + "\n")
def computing_mfe(cnx, df, output):
    """
    Append an ``mfe_5ss`` column to a table of exons.

    One ``exon_class.ExonClass`` is built per row from the ``gene_name``,
    ``gene_id`` and ``pos`` columns, then all of them are passed to
    ``function.mfe_calculator`` together with ``output`` and ``ps=True``.
    Only the second value it returns (5'ss) is stored.

    :param cnx: (sqlite3 connect object) connection to fasterdb
    :param df: (pandas dataframe) table of exons; modified in place
    :param output: (str) forwarded to ``function.mfe_calculator``
        (described by the original author as where the mfe result files
        will be created)
    :return: (pandas dataframe) the same ``df`` object with the new
        ``mfe_5ss`` column
    """
    exon_rows = df[["gene_name", "gene_id", "pos"]].values
    exon_objects = [
        exon_class.ExonClass(cnx, row[0], row[1], row[2])
        for row in exon_rows
    ]
    # The 3'ss values are returned too but are not needed here.
    _unused_3ss, scores_5ss = function.mfe_calculator(
        exon_objects, output, ps=True)
    df["mfe_5ss"] = scores_5ss
    return df
def get_mfe_score_list(output, exon_list, name_list):
    """
    Calculate the mfe scores of a list of exons, or reload them if they
    were already stored.

    The scores are cached in ``<output><name_list>_mfe_score.py`` as two
    assignments (``mfe_3ss=...`` and ``mfe_5ss=...``). ``output`` is
    concatenated as-is, so it must already end with a path separator.

    :param output: (string) path where the result will be stored or \
    retrieved
    :param exon_list: (list of ExonClass object) List of exons; only used \
    when the cache file does not exist yet
    :param name_list: (string) the name of the exons list
    :return: (2 list of floats) list of mfe 3'ss scores and list of \
    mfe 5'ss scores
    """
    # Local import: keeps the block self-contained without touching the
    # file-level import section.
    import importlib.util

    name_store_file = "%s%s_mfe_score.py" % (output, name_list)
    print(name_store_file)
    if not os.path.isfile(name_store_file):
        print("Calculating mfe score using RNAfold")
        mfe_3ss, mfe_5ss = function.mfe_calculator(exon_list)
        with open(name_store_file, "w") as outfile:
            outfile.write("mfe_3ss=" + str(mfe_3ss) + "\n")
            outfile.write("mfe_5ss=" + str(mfe_5ss) + "\n")
        return mfe_3ss, mfe_5ss

    print("recovering mfe score already stored in %s" % name_store_file)
    # Load the cache file by its exact path. A plain ``__import__`` is
    # cached by module name in ``sys.modules``, so a second call with the
    # same ``name_list`` but another ``output`` directory would return the
    # scores of the first directory; it also required growing ``sys.path``
    # on every call.
    # NOTE: the stored file is executed as Python code, so it must only
    # ever be a file written by this function.
    spec = importlib.util.spec_from_file_location(
        "%s_mfe_score" % name_list, name_store_file)
    stored = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(stored)
    return stored.mfe_3ss, stored.mfe_5ss
def _build_exon_row(cnx, exon_key, fields, u1_exons, u2_exons):
    """
    Build one output row for a single exon.

    :param cnx: (sqlite3 connect object) connexion to fasterdb
    :param exon_key: (sequence of 2 values) gene id and exon position
    :param fields: (sequence of 7 values) chromosome, start, end, strand, \
    iupac_exon, upstream intron size, downstream intron size
    :param u1_exons: (list of list of 2 int) list of exons regulated by U1
    :param u2_exons: (list of list of 2 int) list of exons regulated by U2
    :return: (list of value) chromosome, start - 1, end, \
    "<gene_id>_<pos>", "0", strand symbol and the stringified dictionary \
    of exon features
    """
    strand_symbol = {-1: "-", 1: "+"}
    gene_id, exon_pos = exon_key[0], exon_key[1]
    (chromosome, start, end, strand, iupac_exon, upstream_size,
     downstream_size) = fields
    # The start coordinate is shifted by one (presumably to get a 0-based,
    # BED-like start -- TODO confirm).
    start = int(start) - 1
    cexon = exon_class_bp.ExonClass(cnx, str(gene_id), gene_id, exon_pos)
    exon_data = bp_ppt_calculator([cexon])
    mexon = exon_class.ExonClass(cnx, str(gene_id), gene_id, exon_pos)
    # NOTE(review): the other call sites in this file unpack
    # function.mfe_calculator as (mfe_3ss, mfe_5ss). The order used here is
    # kept unchanged; confirm the labels are not swapped.
    mfe_5ss, mfe_3ss = mfe_calculator([mexon])
    stretch = catch_index_error(stretch_counter([cexon])["T"], 0)
    dic_info = {
        "GC_content": iupac_exon.split(";")[4],
        "upstream_intron_size": upstream_size,
        "downstream_intron_size": downstream_size,
        "UNA_count": catch_index_error(exon_data[8], 0),
        "Hbound_count": catch_index_error(exon_data[6], 0),
        "good_bp": catch_index_error(exon_data[3], 0),
        "MFE_5SS": catch_index_error(mfe_5ss, 0),
        "MFE_3SS": catch_index_error(mfe_3ss, 0),
        "T_stretch": stretch,
        "U1-regulated": is_in(exon_key, u1_exons),
        "U2-regulated": is_in(exon_key, u2_exons),
    }
    return [chromosome, start, end, "%s_%s" % (gene_id, exon_pos),
            "0", strand_symbol[strand], str(dic_info)]


def get_exon_info(cnx, sedb, fasterdb_file, exon_list, u1_exons, u2_exons):
    """
    Collect location and feature data for a list of exons.

    When ``exon_list`` is None, every exon whose ``exon_type`` in the sed
    table matches ``'%CCE%'`` is returned instead.

    The sed and fasterdb files are attached to ``cnx`` under the aliases
    ``sed`` and ``fasterdb`` at each call and are not detached afterwards.

    :param cnx: (sqlite3 connect object) connexion to fasterdb
    :param sedb: (str) path to sed database
    :param fasterdb_file: (str) an sqlite3 database file
    :param exon_list: (list of list of 2 int or None) list of exons \
    (gene id, exon position)
    :param u1_exons: (list of list of 2 int) list of exons regulated by U1
    :param u2_exons: (list of list of 2 int) list of exons regulated by U2
    :return: (list of list of value) list of data, one row per exon
    :raises IndexError: if an exon of ``exon_list`` matches no row or more \
    than one row
    """
    cursor = cnx.cursor()
    cursor.execute("ATTACH DATABASE ? as sed", (sedb, ))
    cursor.execute("ATTACH DATABASE ? as fasterdb", (fasterdb_file, ))
    if exon_list is None:
        query = """
        SELECT t1.id_gene, t1.pos_on_gene, t1.chromosome,
               t1.start_on_chromosome, t1.end_on_chromosome, t2.strand,
               t3.iupac_exon, t3.upstream_intron_size,
               t3.downstream_intron_size
        FROM fasterdb.exons as t1, fasterdb.genes as t2, sed.sed as t3
        WHERE t3.gene_id = t1.id_gene
        AND t3.exon_pos = t1.pos_on_gene
        AND t1.id_gene = t2.id
        AND t3.exon_type LIKE '%CCE%'
        """
        cursor.execute(query)
        new_res = []
        for row in cursor.fetchall():
            row = list(row)
            new_res.append(
                _build_exon_row(cnx, row[0:2], row[2:], u1_exons, u2_exons))
        return new_res

    # Built once and parameterized, instead of being re-formatted with the
    # exon values at every iteration.
    query = """
    SELECT t1.chromosome, t1.start_on_chromosome,
           t1.end_on_chromosome, t2.strand, t3.iupac_exon,
           t3.upstream_intron_size, t3.downstream_intron_size
    FROM fasterdb.exons as t1, fasterdb.genes as t2, sed.sed as t3
    WHERE t3.gene_id = t1.id_gene
    AND t3.exon_pos = t1.pos_on_gene
    AND t1.id_gene = t2.id
    AND t3.gene_id = ?
    AND t3.exon_pos = ?
    """
    tot = len(exon_list)
    result = []
    for count, exon in enumerate(exon_list, start=1):
        # int() turns integer-like values (e.g. numpy integers or digit
        # strings) into a type sqlite3 can bind.
        cursor.execute(query, (int(exon[0]), int(exon[1])))
        res = cursor.fetchall()
        # ``% (exon,)`` keeps the message working when exon is a tuple.
        if not res:
            raise IndexError("Error no row returned for %s" % (exon,))
        if len(res) > 1:
            raise IndexError(
                "Error only one row shoud be return for %s" % (exon,))
        result.append(
            _build_exon_row(cnx, exon[0:2], res[0], u1_exons, u2_exons))
        sys.stdout.write("Processing %s/%s\t\t\t\r" % (count, tot))
        sys.stdout.flush()
    return result