def _collect_washed_events(cnx, factors, regulation):
    """
    Concatenate the events of every factor in ``factors`` and wash the result.

    :param cnx: (sqlite3 connect object) allow connection to sed database
    :param factors: (iterable of string) names of the splicing factors
    :param regulation: (string) the regulation wanted for the exons
    :return: the value returned by \
    ``union_dataset_function.washing_events_all`` for the concatenated events
    """
    events = []
    for sf_name in factors:
        events += union_dataset_function.get_every_events_4_a_sl(
            cnx, sf_name, regulation)
    return union_dataset_function.washing_events_all(events)


def _exons_in_both(exons_a, exons_b):
    """
    Return the exons of ``exons_a`` that are also present in ``exons_b``.

    The order (and the objects) of ``exons_a`` are kept.

    :param exons_a: (list of list of 2 int) first list of exons
    :param exons_b: (list of list of 2 int) second list of exons
    :return: (list of list of 2 int) the exons found in both lists
    """
    # Exons are looked up as tuples so that membership is a hash lookup
    # instead of a scan of the whole list for every exon.
    # NOTE(review): assumes every exon is a sequence of hashable values
    # (e.g. [gene_id, exon_pos]) - confirm against washing_events_all.
    lookup = {tuple(exon) for exon in exons_b}
    return [exon for exon in exons_a if tuple(exon) in lookup]


def redundant_ag_at_and_u1_u2(cnx, regulation):
    """
    Create the list of redundant exons between the AT and GC rich list of \
    exons and between the U1 and U2 list of exons.

    The function returns nothing: the two lists are stored in the module \
    level variables ``redundant_gc_at`` and ``redundant_u1_u2`` and their \
    sizes are printed.

    :param cnx: (sqlite3 connect object) allow connection to sed database
    :param regulation: (string) the regulation we want for the common exons
    :return: None
    """
    global redundant_gc_at, redundant_u1_u2

    exon_at = _collect_washed_events(
        cnx, group_factor.at_rich_down, regulation)
    exon_gc = _collect_washed_events(
        cnx, group_factor.gc_rich_down, regulation)
    redundant_gc_at = _exons_in_both(exon_at, exon_gc)
    print("redundant exon GC and AT rich : %s" % len(redundant_gc_at))

    exon_u1 = _collect_washed_events(cnx, group_factor.u1_factors, regulation)
    exon_u2 = _collect_washed_events(cnx, group_factor.u2_factors, regulation)
    redundant_u1_u2 = _exons_in_both(exon_u1, exon_u2)
    print("redundant exon U1 and U2 rich : %s" % len(redundant_u1_u2))
def get_exon_list(cnx, annotation_name, regulation):
    """
    Build the list of exons matching ``annotation_name``.

    Three kinds of annotation are recognised, tested on the full name:

    * a name containing ``GC`` or ``AT``: the exons are read from the file \
    ``<prefix>_rich_exons`` through ``extract_exon_list``
    * a name containing ``U1-FACTORS`` or ``U2-FACTORS``: the washed union \
    of the events of every factor of the group
    * anything else: the prefix is treated as a splicing factor name

    In every case the prefix is the part of ``annotation_name`` located \
    before the first ``_``.

    :param cnx: (sqlite3 connect object) connection to sed database
    :param annotation_name: (string) GC-AT, a group of factors or a sf_name
    :param regulation: (string) the regulation of an exon list by a factor(s)
    :return: the exon list produced by the selected branch
    """
    prefix = annotation_name.split("_")[0]

    if any(tag in annotation_name for tag in ("GC", "AT")):
        # The data folder is derived from the location of this file.
        data_dir = os.path.realpath(os.path.dirname(__file__)).replace(
            "src", "data/")
        return extract_exon_list("%s%s_rich_exons" % (data_dir, prefix))

    if any(tag in annotation_name for tag in ("U1-FACTORS", "U2-FACTORS")):
        factors_by_group = {
            "U1-FACTORS": ["SNRPC", "SNRNP70", "DDX5_DDX17"],
            "U2-FACTORS": ["U2AF2", "SF1", "SF3A3", "SF3B4"],
        }
        events = []
        for factor in factors_by_group[prefix]:
            events.extend(union_dataset_function.get_every_events_4_a_sl(
                cnx, factor, regulation))
        return union_dataset_function.washing_events_all(events)

    # Single factor: upper-case the prefix, then rewrite some names.
    factor = prefix.upper()
    renamings = (
        ("SFRS", "SRSF"),
        ("TRA2A", "TRA2A_B"),
        ("DDX5-17", "DDX5_DDX17"),
    )
    for old, new in renamings:
        factor = factor.replace(old, new)
    return union_dataset_function.get_every_events_4_a_sl(
        cnx, factor, regulation)
def extract_data(cnx, cnx_sed, list_files, list_names, pos, regulation):
    """
    Load the exon set located at position ``pos``.

    When ``list_files`` is not empty the exons come from the file \
    ``list_files[pos]`` (through ``extract_exon_files``). Otherwise \
    ``list_names[pos]`` must be ``U1-factors`` or ``U2-factors`` and the \
    exons are the washed union of the events of every factor of that group, \
    each one wrapped in an ``ExonClass`` object.

    The name of the set, its size and the regulation are printed.

    :param cnx: (sqlite3 connect object) connection to fasterDB lite
    :param cnx_sed: (sqlite3 connect object) connection to sed
    :param list_files: (list of string) list of files containing exon set
    :param list_names: (list of string) the name of exon set
    :param pos: (int) the position of interest within the list \
    ``list_files`` and ``list_names``. Those 2 lists must have the same length
    :param regulation: (string) up or down
    :return: (list of ExonClass object) list of exon.
    """
    if not list_files:
        factors_by_group = {
            "U1-factors": ["SNRPC", "SNRNP70", "DDX5_DDX17"],
            "U2-factors": ["U2AF2", "SF1", "SF3A3", "SF3B4"],
        }
        raw_exons = []
        for factor in factors_by_group[list_names[pos]]:
            raw_exons.extend(union_dataset_function.get_every_events_4_a_sl(
                cnx_sed, factor, regulation))
        raw_exons = union_dataset_function.washing_events_all(raw_exons)

        exon_list = []
        for raw in raw_exons:
            # The first item is given twice: once as a string, once as an int.
            exon_list.append(exon_class_bp.ExonClass(
                cnx, str(raw[0]), int(raw[0]), int(raw[1])))
    else:
        exon_list = extract_exon_files(cnx, list_files[pos])

    summary = (list_names[pos], len(exon_list), regulation)
    print("%s : %s %s exons" % summary)
    return exon_list
def difference(cnx, list1, list2, regulation):
    """
    Return the exons regulated by the factors in ``list1`` if they are not \
    regulated by the factors in ``list2``.

    :param cnx: (sqlite3 connect object) connection to sed database
    :param list1: (list of string) list of splicing factors
    :param list2: (list of string) list of splicing factors
    :param regulation: (string) the exons with the regulation \
    ``regulation`` regulated by the splicing factors in ``list1`` or ``list2``
    :return: (list of list of 2 int) the exons of ``list1`` absent from \
    the exons of ``list2``, in the order they have for ``list1``
    """
    exon_list1 = []
    for sf_name in list1:
        exon_list1 += union_dataset_function.get_every_events_4_a_sl(
            cnx, sf_name, regulation)
    exon_list1 = union_dataset_function.washing_events_all(exon_list1)

    exon_list2 = []
    for sf_name in list2:
        exon_list2 += union_dataset_function.get_every_events_4_a_sl(
            cnx, sf_name, regulation)
    exon_list2 = union_dataset_function.washing_events_all(exon_list2)

    # Exons are looked up as tuples so that membership is a hash lookup
    # instead of a scan of exon_list2 for every exon of exon_list1.
    # NOTE(review): assumes every exon is a sequence of hashable values
    # (e.g. [gene_id, exon_pos]) - confirm against washing_events_all.
    excluded = {tuple(exon) for exon in exon_list2}
    return [exon for exon in exon_list1 if tuple(exon) not in excluded]
def get_gene_values(cnx, sf_list, target_column1, target_column2, regulation):
    """
    Return the values of two target columns for a set of exons, along with \
    the names of the genes holding those exons.

    ``sf_list`` must not be empty (its first item is inspected). If that \
    first item is a string, ``sf_list`` is read as a list of splicing factor \
    names and the exons are the washed union of their ``regulation`` events; \
    otherwise ``sf_list`` is used directly as the list of exons.

    A target column may be written ``column$nt``: the part located after \
    the ``$`` is then given to ``get_list_of_value_iupac_dnt``. An empty \
    ``nt`` is handled like a column without ``$``.

    :param cnx: (sqlite3 connexion object) allow connexion to sed database
    :param sf_list: (list of string or list of exons) the splicing factors \
    studied or directly the exons of interest
    :param target_column1: (string) the first column of interest
    :param target_column2: (string) the second column of interest
    :param regulation: (list of string) up or down or up + down
    :return: 3 items :
        * values1 : the values obtained for ``target_column1``
        * values2 : the values obtained for ``target_column2``
        * gene_name : (list) the name of every distinct gene (first item \
        of each exon), in order of first appearance in the exon list
    """
    if "$" in target_column1:
        target_column1, nt1 = target_column1.split("$")
    else:
        nt1 = None
    if "$" in target_column2:
        target_column2, nt2 = target_column2.split("$")
    else:
        nt2 = None

    if isinstance(sf_list[0], str):
        exon_list = []
        for sf_name in sf_list:
            exon_list += union_dataset_function.get_every_events_4_a_sl(
                cnx, sf_name, regulation)
        exon_list = union_dataset_function.washing_events_all(exon_list)
    else:
        exon_list = sf_list

    # Order-preserving de-duplication of the gene ids; the set makes the
    # "already seen" test a hash lookup instead of a scan of the list.
    seen = set()
    gene_id = []
    for val in exon_list:
        if val[0] not in seen:
            seen.add(val[0])
            gene_id.append(val[0])
    gene_name = [union_dataset_function.get_gene_name(cnx, my_id)
                 for my_id in gene_id]

    if nt1:
        values1 = get_list_of_value_iupac_dnt(cnx, exon_list, target_column1,
                                              nt1)
    else:
        values1 = functions.get_list_of_value(cnx, exon_list, target_column1)
    if nt2:
        values2 = get_list_of_value_iupac_dnt(cnx, exon_list, target_column2,
                                              nt2)
    else:
        values2 = functions.get_list_of_value(cnx, exon_list, target_column2)
    return values1, values2, gene_name
def get_exons_list(cnx, sf_list, regulation):
    """
    Gather the exons regulated (with the regulation ``regulation``) by at \
    least one factor of ``sf_list``.

    The events of every factor are concatenated, then handed to \
    ``union_dataset_function.washing_events_all``.

    :param cnx: (sqlite3 connexion object) allow connexion to sed database
    :param sf_list: (list of string) the list of splicing factor studied
    :param regulation: (list of string) up or down or up + down
    :return: (list of list of int) list of exons showing the regulation \
    ``regulation`` at least for a factor in ``sf_list``
    """
    gathered = []
    for factor in sf_list:
        gathered.extend(union_dataset_function.get_every_events_4_a_sl(
            cnx, factor, regulation))
    return union_dataset_function.washing_events_all(gathered)
def get_exons_values(cnx, sf_list, target_column1, target_column2, regulation):
    """
    Return the values of two target columns for a set of exons, along with \
    a label for each of those exons.

    ``sf_list`` must not be empty (its first item is inspected). If that \
    first item is a string, ``sf_list`` is read as a list of splicing factor \
    names and the exons are the washed union of their ``regulation`` events; \
    otherwise ``sf_list`` is used directly as the list of exons. The number \
    of exons is printed.

    A target column may be written ``column$nt``; the part located after \
    the ``$`` (or None without ``$``) is given to ``get_interest_values``.

    :param cnx: (sqlite3 connexion object) allow connexion to sed database
    :param sf_list: (list of string or list of exons) the splicing factors \
    studied or directly the exons of interest
    :param target_column1: (string) the first column of interest
    :param target_column2: (string) the second column of interest
    :param regulation: (list of string) up or down or up + down
    :return: 3 items :
        * values1 : the values obtained for ``target_column1``
        * values2 : the values obtained for ``target_column2``
        * exon_name : (list of string) ``<gene name>_<second item>`` for \
        each exon. When ``values1`` is twice as long as the exon list, the \
        labels are given twice: first suffixed with ``_upstream`` then with \
        ``_downstream``
    """
    column1, nt1 = (target_column1.split("$") if "$" in target_column1
                    else (target_column1, None))
    column2, nt2 = (target_column2.split("$") if "$" in target_column2
                    else (target_column2, None))

    if isinstance(sf_list[0], str):
        exons = []
        for factor in sf_list:
            exons.extend(union_dataset_function.get_every_events_4_a_sl(
                cnx, factor, regulation))
        exons = union_dataset_function.washing_events_all(exons)
    else:
        exons = sf_list
    print(len(exons))

    labels = []
    for exon in exons:
        gene = union_dataset_function.get_gene_name(cnx, exon[0])
        labels.append("%s_%s" % (gene, exon[1]))

    values1 = get_interest_values(cnx, exons, column1, nt1)
    values2 = get_interest_values(cnx, exons, column2, nt2)

    # Two values per exon: one label per side of the exon is needed.
    if len(labels) * 2 == len(values1):
        labels = [template % label
                  for template in ("%s_upstream", "%s_downstream")
                  for label in labels]
    return values1, values2, labels