def single_element_cleaner(list_of_datasets, remove_single_domain_elements, filtered_elements):
    """Build a boolean keep-mask over the elements of the first listed dataset.

    For every chromosome present in the dataset, the "left" and "right"
    domain matrices are requested from ``interacting_domain`` and an
    element is kept when, after hiding the diagonal and the columns of
    elements that are not flagged in ``filtered_elements``, it still has a
    partner in a domain. Chromosomes with no coordinates or with no row in
    'report_hESC_Combined_converted.csv' keep all of their elements.

    Parameters:
        list_of_datasets: names of datasets; only the first one is read
            from ``dataset_time_series_dict``.
        remove_single_domain_elements: accepted but not used by this
            function.
        filtered_elements: boolean array aligned with the dataset's
            elements; only flagged elements count as partners.

    Returns:
        1-D boolean array made of the per-chromosome masks, concatenated in
        ``np.unique(chroms)`` order.

    NOTE(review): the arithmetic below only makes sense if the domain
    matrices are boolean (``+`` as OR, ``-`` as XOR). Recent NumPy releases
    reject ``-`` on boolean arrays -- confirm the dtype returned by
    ``interacting_domain.interacting_domains``.
    """
    import itertools
    import interacting_domain

    mode = config_variables.mode
    upstream = config_variables.upstream
    TSS_or_intra_genic_for_domain_filter = config_variables.TSS_or_intra_genic_for_domain_filter

    reference_dataset = list_of_datasets[0]
    chroms = dataset_time_series_dict[reference_dataset][0]
    coordinates = dataset_time_series_dict[reference_dataset][1]

    interacting_domains = np.loadtxt('report_hESC_Combined_converted.csv',
                                     dtype=str, usecols=(4, 12, 13), delimiter=',')

    # In promoter-enhancer mode with the TSS-only filter, every element is
    # represented by a two-base window around its TSS.
    if mode == "promoter_enhancer_interactions" and TSS_or_intra_genic_for_domain_filter == "TSS_only":
        TSS_coordinates = extract_TSS_coordinates(upstream)
        coordinates = np.column_stack((TSS_coordinates - 1, TSS_coordinates + 1))

    # np.unique returns the chromosomes sorted, so appending here gives the
    # same ordering as looking them up again afterwards.
    masks_in_chrom_order = []
    for chrom_ in np.unique(chroms):
        on_chrom = chroms == chrom_
        filtered_elements_chrom = filtered_elements[on_chrom]
        chrom_coordinates = coordinates[on_chrom]

        if not (len(chrom_coordinates) and sum(interacting_domains[:, 0] == chrom_)):
            # Nothing to test against: keep every element of this chromosome.
            masks_in_chrom_order.append(np.ones(sum(on_chrom), bool))
            continue

        matrix_left = interacting_domain.interacting_domains(
            chrom_coordinates, np.array([]).reshape(0, 2), chrom_,
            state="left", matrix_version=True)
        matrix_right = interacting_domain.interacting_domains(
            chrom_coordinates, np.array([]).reshape(0, 2), chrom_,
            state="right", matrix_version=True)

        mask = np.ones_like(matrix_left)
        diagonal = np.arange(len(mask))
        mask[diagonal, diagonal] = False  # an element is not its own partner
        mask[:, np.invert(filtered_elements_chrom)] = False  # hide filtered-out partners

        left_partners = matrix_left * mask
        right_partners = matrix_right * mask
        joint = left_partners + right_partners
        only_left = joint - right_partners
        only_right = joint - left_partners
        shared = joint - (only_left + only_right)

        promoter_is_shared = shared.sum(1).astype(bool)  # any shared partner
        not_shared = np.invert(promoter_is_shared)
        has_exclusive_partner = (only_left.sum(1) > 0) + (only_right.sum(1) > 0)
        has_left_partner = left_partners.sum(1) > 0

        survived = np.zeros(len(chrom_coordinates), bool)
        survived[promoter_is_shared] = has_exclusive_partner[promoter_is_shared]
        survived[not_shared] = has_left_partner[not_shared]
        masks_in_chrom_order.append(survived)

    return np.array(list(itertools.chain.from_iterable(masks_in_chrom_order)))
def chrom_specific_negative_interactions(chrom, mode, prior_mode=False):
    """Enumerate candidate negative (non-interacting) pairs on one chromosome.

    A square boolean matrix over the chromosome's promoters followed by its
    enhancers is filled with True wherever a pair must NOT be reported
    (lower triangle, wrong feature-type block, filtered-out features, known
    true interactions, ...). Every remaining False cell that also passes
    ``domain_matrix`` is returned as a negative pair.

    Parameters:
        chrom: chromosome key understood by ``initialise_variables`` and by
            the per-chromosome dictionaries of the enclosing scope.
        mode: tested with ``in`` against "promoter_enhancer_interactions"
            and "enhancer_enhancer_interactions". When both match, both
            generators run and the enhancer-enhancer result is returned.
        prior_mode: when true, only enhancers that take part in a known
            interaction are allowed as columns (same effect as the
            ``interacting_enhancers_only`` flag).

    Returns:
        (n, 2) integer array. Promoter-enhancer rows are
        (promoter row, enhancer position); enhancer-enhancer rows are
        (enhancer position, enhancer position); positions are
        chromosome-local.

    Raises:
        ValueError: if ``mode`` selects neither interaction type, or if the
            domain filter is enabled with an unrecognised
            ``TSS_or_intra_genic_for_domain_filter`` (previously these
            surfaced as an unbound-name error further down).

    NOTE(review): the ``if distant_enh_only ...`` statements are treated as
    independent of the preceding ``if/elif``; confirm against the original
    layout.
    """
    wants_pro_enh = "promoter_enhancer_interactions" in mode
    wants_enh_enh = "enhancer_enhancer_interactions" in mode
    if not (wants_pro_enh or wants_enh_enh):
        raise ValueError("mode selects no supported interaction type: {0!r}".format(mode))

    enh_coordinates, pro_coordinates, indexes_p, indexes_e, total_p, total_e = initialise_variables(chrom)
    n_pro = len(indexes_p)
    n_enh = len(indexes_e)
    length_chr = n_pro + n_enh
    # Added to a global enhancer index to get its row/column in the matrix.
    enh_shift = n_pro - total_e

    chrom_pro_not_survived = dict_chrom_pro_not_survived[chrom]
    chrom_enh_not_survived = dict_chrom_enh_not_survived[chrom]

    # interaction_domains_adjustments beginning -----------------------
    if domain:
        if TSS_or_intra_genic_for_domain_filter == "Intra_genic":
            coords_pro_domain = pro_coordinates[indexes_p]
        elif TSS_or_intra_genic_for_domain_filter == "TSS_only":
            coords_pro_domain = np.column_stack(
                (TSS_coordinates[indexes_p] - 1, TSS_coordinates[indexes_p] + 1))
        else:
            raise ValueError("unsupported TSS_or_intra_genic_for_domain_filter: {0!r}".format(
                TSS_or_intra_genic_for_domain_filter))
        domain_matrix = interacting_domain.interacting_domains(
            coords_pro_domain, enh_coordinates[indexes_e], chrom, 'left', True)
        domain_matrix = domain_matrix + interacting_domain.interacting_domains(
            coords_pro_domain, enh_coordinates[indexes_e], chrom, 'right', True)
    else:
        domain_matrix = True
    # interaction_domains_adjustments_ending -----------------------

    def _report(label, count):
        # Same text as the Python 2 statement `print label, count`.
        print(label + ' ' + str(count))

    def _negative_pairs(blocked, row_shift):
        # Cells that are not blocked and equal True after the domain filter.
        rows, columns = np.where(True == np.invert(blocked) * domain_matrix)
        return np.concatenate(
            ((rows - row_shift)[:, None], (columns - n_pro)[:, None]), axis=1)

    def promoter_enhancer_interactions_generator():
        chr_interactions_pro_enh = chr_interactions_dict_pro_enh[chrom]
        true_pro_enh_indexes = un_string(chr_interactions_pro_enh)

        if config_variables.alternative_classificator_outside_enhancers:
            chrom_interacting_enhancers_pro = config_variables.chrom_interacting_enhancers_pro[chrom]
        else:
            chrom_interacting_enhancers_pro = np.unique(true_pro_enh_indexes[:, 1])
        chrom_interacting_promoters_pro = np.unique(true_pro_enh_indexes[:, 0])

        blocked = np.zeros((length_chr, length_chr), bool)
        # Lower triangle including the diagonal: removes self pairs and
        # symmetric duplicates.
        blocked[np.tril_indices(length_chr)] = True
        blocked[:n_pro, :n_pro] = True  # gets rid of promoter_promoter_interactions

        _report('number of pro_enh true interactions: ', len(chr_interactions_pro_enh))

        if len(chrom_pro_not_survived):
            # gets rid of negative interactions which could be generated by filtered promoters
            blocked[chrom_pro_not_survived - total_p, :] = True

        if interacting_negatives:
            # Only promoters with at least one true interaction may give negatives.
            mask_interacting_promoters = np.zeros(length_chr, bool)
            mask_interacting_promoters[chrom_interacting_promoters_pro - total_p] = True
            blocked[np.invert(mask_interacting_promoters), n_pro:] = True

        mask_interacting_enhancers = np.zeros(length_chr, bool)
        mask_interacting_enhancers[chrom_interacting_enhancers_pro + enh_shift] = True
        mask_non_interacting_enhancers = np.invert(mask_interacting_enhancers)

        if interacting_enhancers_only or prior_mode:
            blocked[:n_pro, mask_non_interacting_enhancers] = True
        elif len(chrom_enh_not_survived):
            # gets rid of filtered out enhancers which could be causing nans due to their correlations
            blocked[:n_pro, chrom_enh_not_survived + enh_shift] = True

        if distant_enh_only and len(dict_chrom_proximal[chrom]):
            blocked[:n_pro, dict_chrom_proximal[chrom] + enh_shift] = True

        # Known true interactions are never negatives.
        blocked[true_pro_enh_indexes[:, 0] - total_p,
                true_pro_enh_indexes[:, 1] + enh_shift] = True
        blocked[n_pro:length_chr, n_pro:length_chr] = True  # gets rid of enhancers-enhancer block

        return _negative_pairs(blocked, 0)

    def enhancer_enhancer_interactions_generator():
        chr_interactions_enh_enh = chr_interactions_dict_enh_enh[chrom]
        true_enh_enh_indexes = un_string(chr_interactions_enh_enh)
        # Enhancers appearing on either side of a true interaction.
        chrom_interacting_enhancers_enh = np.unique(true_enh_enh_indexes[:, :2])

        blocked = np.zeros((length_chr, length_chr), bool)
        # Promoter rows cover both the promoter-promoter and the
        # promoter-enhancer block.
        blocked[:n_pro, :] = True

        _report('number of enh_enh true interactions: ', len(chr_interactions_enh_enh))

        # Rows -------------------------------------------------------------
        if len(chrom_enh_not_survived):
            blocked[chrom_enh_not_survived + enh_shift, :] = True
        if distant_enh_only and len(dict_chrom_proximal[chrom]):
            blocked[dict_chrom_proximal[chrom] + enh_shift, :] = True

        mask_interacting_enhancers = np.zeros(length_chr, bool)
        mask_interacting_enhancers[chrom_interacting_enhancers_enh + enh_shift] = True
        mask_non_interacting_enhancers = np.invert(mask_interacting_enhancers)

        if interacting_negatives:
            blocked[mask_non_interacting_enhancers, n_pro:] = True

        # Columns ----------------------------------------------------------
        if interacting_enhancers_only or prior_mode:
            blocked[n_pro:, mask_non_interacting_enhancers] = True
        elif len(chrom_enh_not_survived):
            # gets rid of filtered out enhancers which could be causing nans due to their correlations
            blocked[n_pro:, chrom_enh_not_survived + enh_shift] = True
        if distant_enh_only and len(dict_chrom_proximal[chrom]):
            blocked[n_pro:, dict_chrom_proximal[chrom] + enh_shift] = True

        # Known true interactions, in both orientations.
        first = true_enh_enh_indexes[:, 0] + enh_shift
        second = true_enh_enh_indexes[:, 1] + enh_shift
        blocked[first, second] = True
        blocked[second, first] = True

        # Lower triangle including the diagonal: self pairs and symmetric duplicates.
        blocked[np.tril_indices(length_chr)] = True

        return _negative_pairs(blocked, n_pro)

    if wants_pro_enh:
        negative_of_type_of_interactions = promoter_enhancer_interactions_generator()
        _report('number of pro_enh false interactions: ', len(negative_of_type_of_interactions))

    if wants_enh_enh:
        negative_of_type_of_interactions = enhancer_enhancer_interactions_generator()
        _report('number of enh_enh false interactions: ', len(negative_of_type_of_interactions))

    return negative_of_type_of_interactions
def generator(pro_survived, enh_survived, domain, max_path):
    """List feature pairs connected by interaction paths of increasing length.

    Steps:
      1. Read the enhancer-enhancer, promoter-promoter and promoter-enhancer
         interaction tables (tab separated, columns 0-2) from
         ``config_variables.temp_output``.
      2. Keep rows whose two feature indexes are in ``pro_survived`` /
         ``enh_survived`` and rename the features to ``p<index>`` /
         ``e<index>``.
      3. If ``domain`` is true, keep per chromosome only the interactions
         that pass the left/right matrices from ``interacting_domain``.
      4. Build a symmetric boolean adjacency matrix (with self loops) over
         all features that occur, and multiply it repeatedly; pairs that
         become connected at step ``k`` are recorded with path ``k``.

    Parameters:
        pro_survived: promoter indexes that passed filtering.
        enh_survived: enhancer indexes that passed filtering.
        domain: truthy to apply the interaction-domain filter.
        max_path: the expansion stops when the path counter reaches this
            value; pairs of that length are not recorded. Otherwise it runs
            until no new pair appears.

    Returns:
        2-D string array with columns (chromosome, feature 1, feature 2,
        path); pairs where both features are promoters are removed.

    Notes:
        * ``cumulative - cumulative_old`` on boolean arrays was replaced by
          ``np.logical_xor``: NumPy >= 1.13 refuses the subtraction, and
          XOR is what older releases computed for it.
        * In the domain loop every chromosome now advances the running
          promoter/enhancer offsets and gets a (possibly empty) entry, so a
          chromosome lacking one interaction type no longer shifts the
          indexes of later chromosomes or breaks the merge.
        * Input tables are read with ``ndmin=2`` so a single-row file keeps
          its 2-D shape.
    """
    import numpy as np
    import re
    import config_variables

    promoter_overlaps_enhancer_file = config_variables.promoter_overlaps_enhancer_file
    upstream = config_variables.upstream
    downstream = config_variables.downstream
    link_data_set_name_to_file_name = config_variables.link_data_set_name_to_file_name
    dataset_time_series_dict = config_variables.dataset_time_series_dict
    TSS_or_intra_genic_for_domain_filter = config_variables.TSS_or_intra_genic_for_domain_filter
    name_of_time_series_promoter_file_for_TSS_start = config_variables.name_of_time_series_promoter_file_for_TSS_start
    temp_output = config_variables.temp_output

    #parameters-------------------------
    #ovenh_ovenh_pro_pro_version = False
    #max_pro_enh_mode = True
    #-----------------------------------

    # chr1 .. chr22, chrX, chrY -- the order in which chromosomes are processed.
    chromosome_names = ['chr{0}'.format(label)
                        for label in [str(number) for number in range(1, 23)] + ['X', 'Y']]

    def load_interactions(name_template):
        # ndmin=2 keeps a one-row table two dimensional.
        return np.loadtxt(temp_output + name_template.format(upstream, downstream),
                          usecols=(0, 1, 2), dtype=str, delimiter='\t', ndmin=2)

    enhancer_enhancer_inter = load_interactions('enhancer_enhancer_interactions_{0}_{1}')
    promoter_promoter_inter = load_interactions('promoter_promoter_interactions_{0}_{1}')
    promoter_enhancer_inter = load_interactions('promoter_enhancer_interactions_{0}_{1}')

    def un_stringer(x):
        # First run of digits in a feature name, as an int.
        return int(re.findall(r'\d+', x)[0])

    def un_featurer(array):
        # First run of non-digits (the feature prefix) of every name.
        return np.array([re.findall(r'\D+', x)[0] for x in array], dtype=str)

    def un_string(array_):
        # Numeric indexes of the first two columns, as an (n, 2) int array.
        first = np.array([un_stringer(x) for x in array_[:, 0]], dtype=int)
        second = np.array([un_stringer(x) for x in array_[:, 1]], dtype=int)
        return np.c_[first[:, None], second[:, None]]

    def filter_(array_, filt_1, filt_2):
        # Rows whose first feature index is in filt_1 and second in filt_2.
        first_ok = np.in1d([un_stringer(x) for x in array_[:, 1]], filt_1)
        second_ok = np.in1d([un_stringer(x) for x in array_[:, 2]], filt_2)
        return first_ok & second_ok

    #cleans interactions:
    #----------------------------------------------------------------------------------------------------------------------------------------
    promoter_promoter_inter = promoter_promoter_inter[filter_(promoter_promoter_inter, pro_survived, pro_survived)]
    enhancer_enhancer_inter = enhancer_enhancer_inter[filter_(enhancer_enhancer_inter, enh_survived, enh_survived)]
    promoter_enhancer_inter = promoter_enhancer_inter[filter_(promoter_enhancer_inter, pro_survived, enh_survived)]
    #----------------------------------------------------------------------------------------------------------------------------------------

    # Index pairs of the cleaned tables; only the optional adders below take them.
    enh_enh_indexes_list = un_string(enhancer_enhancer_inter[:, 1:])
    pro_enh_indexes_list = un_string(promoter_enhancer_inter[:, 1:])
    pro_pro_indexes_list = un_string(promoter_promoter_inter[:, 1:])

    def prepare_overlaps():
        # Maps each value of file column 8 to the list of column-4 values
        # sharing it, in file order (single pass).
        overlaps = np.loadtxt(promoter_overlaps_enhancer_file, delimiter='\t',
                              usecols=(4, 8), dtype=int, ndmin=2)
        diction_overlaps_ovenh = {}
        for promoter_index, overl in overlaps:
            diction_overlaps_ovenh.setdefault(overl, []).append(promoter_index)
        return diction_overlaps_ovenh

    diction_overlaps_ovenh = prepare_overlaps()

    # ------------------------------------------------------------------
    # The three helpers below are only reachable through the commented-out
    # calls that follow them; they are kept for reference.
    # NOTE(review): as written they would fail if enabled -- they read
    # `ovenh_ovenh_pro_pro_version` and `filtered_promoter_promoter_inter`,
    # which are not defined here, and promoter_promoter_adder indexes plain
    # lists with boolean arrays.
    # ------------------------------------------------------------------
    def promoter_promoter_adder(diction_overlaps_ovenh, index_1, index_2, chro,
                                pro_pro_indexes_list, promoter_promoter_inter):
        print('ovenh-ovenh')
        promoters_1 = diction_overlaps_ovenh[index_1]
        promoters_2 = diction_overlaps_ovenh[index_2]
        # converts ER signals which overlap filtered out promoters into distant peaks
        promoters_1 = promoters_1[np.in1d(promoters_1, pro_survived)]
        promoters_2 = promoters_2[np.in1d(promoters_2, pro_survived)]
        legend = np.r_[promoters_1, promoters_2]
        if len(legend) > 1:
            counts, bins = np.histogram(legend, np.arange(min(legend), max(legend) + 2))
            if len(np.where(counts > 1)[0]):
                print('ambigous allocation - peak overlaps two promoters which interact through ChIA-PET index {0}, {1}'.format(
                    index_1, index_2))
            matrix = np.ones((len(legend), len(legend)), dtype=bool)
            matrix[:len(promoters_1), :len(promoters_1)] = False
            matrix[len(promoters_1):len(promoters_1) + len(promoters_2),
                   len(promoters_1):len(promoters_1) + len(promoters_2)] = False
            matrix[np.tril_indices(len(legend))] = False  # upper triangular elements
            pro_inter = np.c_[legend[np.where(matrix)[0]], legend[np.where(matrix)[1]]]
            pro_inter_symbolic = [[chro, 'ovpro{0}'.format(ind1), 'ovpro{0}'.format(ind2)]
                                  for ind1, ind2 in pro_inter]
            for el_1, el_2 in zip(pro_inter, pro_inter_symbolic):
                lista = list(el_1)
                if lista not in [list(row) for row in pro_pro_indexes_list]:
                    pro_pro_indexes_list = np.r_[pro_pro_indexes_list, el_1]
                    promoter_promoter_inter = np.r_[promoter_promoter_inter, el_2]
        return pro_pro_indexes_list, promoter_promoter_inter

    def ER_enhancer_Non_enhancer_pro_adder(pro_enh_indexes_list, promoter_enhancer_inter,
                                           pro_pro_indexes_list, promoter_promoter_inter):
        enhancer_enhancer_inter_filtered = []
        pro_enh_indexes_list_added = []
        pro_enh_indexes_list_symbolic_added = []

        def add_promoter_links(overlapping_index, enhancer_index, chro):
            # takes promoters corresponding to ER overlapping with them
            for pro_dict in diction_overlaps_ovenh[overlapping_index]:
                pro_enh_int = [pro_dict, enhancer_index]
                pro_enh_int_symbolic = [chro, 'ovpro{0}'.format(pro_dict), 'enh{0}'.format(enhancer_index)]
                known = [list(row) for row in pro_enh_indexes_list]
                if pro_enh_int not in known and pro_dict in pro_survived:
                    pro_enh_indexes_list_added.append(pro_enh_int)
                    pro_enh_indexes_list_symbolic_added.append(pro_enh_int_symbolic)

        for el in enhancer_enhancer_inter:
            chro = el[0]
            index_1 = int(un_stringer(el[1]))
            index_2 = int(un_stringer(el[2]))
            feature_1 = re.findall(r'\D+', el[1])[0]
            feature_2 = re.findall(r'\D+', el[2])[0]
            if feature_1 == 'ovenh' and feature_2 == 'enh':
                add_promoter_links(index_1, index_2, chro)
            elif feature_1 == 'enh' and feature_2 == 'ovenh':
                # here you've got to check what index has got the promoter.
                add_promoter_links(index_2, index_1, chro)
            elif feature_1 == 'ovenh' and feature_2 == 'ovenh':
                if ovenh_ovenh_pro_pro_version:
                    # converts ovenh-ovenh interactions to promoter-promoter interactions
                    pro_pro_indexes_list, promoter_promoter_inter = promoter_promoter_adder(
                        diction_overlaps_ovenh, index_1, index_2, chro,
                        pro_pro_indexes_list, promoter_promoter_inter)
                else:
                    # keeps ovenh-ovenh interactions as enh-enh interactions
                    enhancer_enhancer_inter_filtered.append(el)
            else:
                enhancer_enhancer_inter_filtered.append(el)

        if len(pro_enh_indexes_list_added):
            pro_enh_indexes_list = np.r_[pro_enh_indexes_list, pro_enh_indexes_list_added]
            promoter_enhancer_inter = np.r_[promoter_enhancer_inter, pro_enh_indexes_list_symbolic_added]
        return pro_enh_indexes_list, promoter_enhancer_inter, pro_pro_indexes_list, promoter_promoter_inter

    #pro_enh_indexes_list, promoter_enhancer_inter, pro_pro_indexes_list, promoter_promoter_inter = ER_enhancer_Non_enhancer_pro_adder(pro_enh_indexes_list, promoter_enhancer_inter, pro_pro_indexes_list, promoter_promoter_inter)
    # it doesn't do a thing if you take only distal enhancers. Overlapping enhancers are then not among
    # distal enhancers which you allow to form with with promoters.

    def ER_ovenh_pro_pro_adder():
        filtered_promoter_enhancer_inter = []
        diction_overlaps_ovenh = prepare_overlaps()
        for el in promoter_enhancer_inter:
            index_1 = int(re.findall(r'\d+', el[1])[0])
            index_2 = int(re.findall(r'\d+', el[2])[0])
            feature_2 = re.findall(r'\D+', el[1])[0]
            if feature_2 == 'ovenh':
                for pro_dict in diction_overlaps_ovenh[index_2]:
                    #pro_pro_int = [index_1, dict_pro_survived[pro_dict]]
                    pro_pro_int = [index_1, pro_dict]
                    if pro_pro_int not in filtered_promoter_promoter_inter and pro_dict in pro_survived:
                        filtered_promoter_promoter_inter.append(pro_pro_int)
            else:
                filtered_promoter_enhancer_inter.append(el)
        return np.array(filtered_promoter_enhancer_inter)

    #if not(max_pro_enh_mode): promoter_enhancer_inter = ER_ovenh_pro_pro_adder()
    #if the aim is to maximise number of promoter-enhancer interactions it could be best to set it to false

    def scan_through_pro_pro_inter_and_add_pro_enh_inter():
        # it's kind of arbitrary what we are trying to get rid of.. so this part can actually be coded,
        # so that when pro-pro then pro-enh and enh-enh... but it may be best to do it with interaction matrix
        pass

    def into_string(array, s1, s2):
        # Replace the two feature names by <s1><index> and <s2><index>.
        indexes = un_string(array[:, 1:])
        first = np.array([s1 + str(index) for index in indexes[:, 0]], dtype=str)
        second = np.array([s2 + str(index) for index in indexes[:, 1]], dtype=str)
        return np.c_[array[:, 0][:, None], first[:, None], second[:, None]]

    promoter_enhancer_inter = into_string(promoter_enhancer_inter, 'p', 'e')
    promoter_promoter_inter = into_string(promoter_promoter_inter, 'p', 'p')
    enhancer_enhancer_inter = into_string(enhancer_enhancer_inter, 'e', 'e')

    #--------------------------------------------------------------------------------------------------------------------------------------------------------
    def stringer(array, st):
        return np.array(['{0}{1}'.format(st, x) for x in array], dtype=str)

    def filter_domains(symb_1, symb_2, chr_interactions, length_chr, row_indexes_plus,
                       column_indexes_plus, chrom_name, domain_matrix):
        # Keep the interactions of one chromosome that pass domain_matrix and
        # return them as (chromosome, feature 1, feature 2) rows.
        if len(chr_interactions) == 0:
            return []
        interaction_matrix = np.zeros((length_chr, length_chr), bool)
        true_indexes = un_string(chr_interactions)
        rows = true_indexes[:, 0] + row_indexes_plus
        columns = true_indexes[:, 1] + column_indexes_plus
        interaction_matrix[rows, columns] = True
        interaction_matrix[columns, rows] = True
        interaction_matrix[np.tril_indices(length_chr)] = False  # gets rid of symmetric interactions
        inside_domain = interaction_matrix * domain_matrix
        print('number of original: {0} {1} - {2} true survived domain filtering: {3}'.format(
            len(true_indexes), symb_1, symb_2, np.sum(inside_domain)))
        kept_rows, kept_columns = np.where(inside_domain)
        domain_raws = kept_rows - row_indexes_plus
        domain_columns = kept_columns - column_indexes_plus
        true_indexes_dom_symb = np.c_[stringer(domain_raws, symb_1)[:, None],
                                      stringer(domain_columns, symb_2)[:, None]]
        s_1 = len(true_indexes_dom_symb)
        return np.c_[np.array([chrom_name] * s_1)[:, None], true_indexes_dom_symb]

    if domain:
        import interacting_domain

        def extract_TSS_coordinates(upstream):
            data = np.loadtxt(name_of_time_series_promoter_file_for_TSS_start, dtype=str, delimiter='\t')
            plus_strand = data[:, 4] == '+'
            minus_strand = np.invert(plus_strand)
            TSS_coordinates = np.zeros(len(plus_strand), int)
            TSS_coordinates[plus_strand] = data[plus_strand, 1].astype(int) - upstream
            TSS_coordinates[minus_strand] = data[minus_strand, 2].astype(int) + upstream
            return TSS_coordinates

        TSS_coordinates = extract_TSS_coordinates(upstream)

        def initialise_variables(chrom):
            name_of_pro_t_s = link_data_set_name_to_file_name["promoters"]["ER"]
            name_of_enh_t_s = link_data_set_name_to_file_name["enhancers"]["ER"]
            pro_chroms, pro_coordinates, ts_p = dataset_time_series_dict[name_of_pro_t_s]
            enh_chroms, enh_coordinates, ts_e = dataset_time_series_dict[name_of_enh_t_s]
            indexes_p = np.where(pro_chroms == chrom)[0]  # positions of this chromosome's promoters
            indexes_e = np.where(enh_chroms == chrom)[0]  # positions of this chromosome's enhancers
            return pro_chroms, enh_chroms, pro_coordinates, enh_coordinates, indexes_p, indexes_e

        enhancer_enhancer_inter_dom = {}
        promoter_enhancer_inter_dom = {}
        promoter_promoter_inter_dom = {}
        # Promoters / enhancers on the chromosomes already visited; used to
        # turn global feature indexes into chromosome-local ones.
        total_p = total_e = 0

        for chrom_name in chromosome_names:
            chr_interactions_enh_enh = enhancer_enhancer_inter[enhancer_enhancer_inter[:, 0] == chrom_name][:, 1:]
            chr_interactions_pro_enh = promoter_enhancer_inter[promoter_enhancer_inter[:, 0] == chrom_name][:, 1:]
            chr_interactions_pro_pro = promoter_promoter_inter[promoter_promoter_inter[:, 0] == chrom_name][:, 1:]

            pro_chroms, enh_chroms, pro_coord, enh_coord, indexes_p, indexes_e = initialise_variables(chrom_name)

            if len(chr_interactions_enh_enh) and len(chr_interactions_pro_enh) and len(chr_interactions_pro_pro):
                if TSS_or_intra_genic_for_domain_filter == "TSS_only":
                    pro_coord = np.column_stack((TSS_coordinates, TSS_coordinates + 2))
                length_chr = len(indexes_p) + len(indexes_e)
                chrom_pro_coord = pro_coord[indexes_p]
                chrom_ER_coord = enh_coord[indexes_e]

                domain_matrix = interacting_domain.interacting_domains(
                    chrom_pro_coord, chrom_ER_coord, chrom_name, 'left')
                domain_matrix = domain_matrix + interacting_domain.interacting_domains(
                    chrom_pro_coord, chrom_ER_coord, chrom_name, 'right')

                promoter_shift = -total_p
                enhancer_shift = -total_e + len(indexes_p)
                promoter_enhancer_inter_dom[chrom_name] = filter_domains(
                    'p', 'e', chr_interactions_pro_enh, length_chr,
                    promoter_shift, enhancer_shift, chrom_name, domain_matrix)
                enhancer_enhancer_inter_dom[chrom_name] = filter_domains(
                    'e', 'e', chr_interactions_enh_enh, length_chr,
                    enhancer_shift, enhancer_shift, chrom_name, domain_matrix)
                promoter_promoter_inter_dom[chrom_name] = filter_domains(
                    'p', 'p', chr_interactions_pro_pro, length_chr,
                    promoter_shift, promoter_shift, chrom_name, domain_matrix)
            else:
                # A chromosome lacking any of the three interaction types
                # contributes nothing, as before.
                promoter_enhancer_inter_dom[chrom_name] = []
                enhancer_enhancer_inter_dom[chrom_name] = []
                promoter_promoter_inter_dom[chrom_name] = []
            # interaction_domains_adjustments end -----------------------

            # Always advance, so later chromosomes get the right offsets.
            total_p += len(indexes_p)
            total_e += len(indexes_e)

        # Kept from the original: chrY results are always discarded.
        promoter_enhancer_inter_dom['chrY'] = []
        enhancer_enhancer_inter_dom['chrY'] = []
        promoter_promoter_inter_dom['chrY'] = []

        def merge_chromosomes(per_chromosome):
            # Stack the non-empty per-chromosome tables in chromosome order.
            pieces = [per_chromosome[name] for name in chromosome_names if len(per_chromosome[name])]
            if not pieces:
                return np.empty((0, 3), dtype=str)
            return np.concatenate(pieces)

        enhancer_enhancer_inter = merge_chromosomes(enhancer_enhancer_inter_dom)
        promoter_enhancer_inter = merge_chromosomes(promoter_enhancer_inter_dom)
        promoter_promoter_inter = merge_chromosomes(promoter_promoter_inter_dom)

    #if not(generate_intermediates): return interactions_of_path

    def prepares_reverse_map_and_uniqueness():
        def unique_sorted_by_index(rows):
            # Distinct (chromosome, feature) rows ordered by feature index.
            unique_rows = np.array(list(set(map(tuple, rows))))
            order = np.argsort([un_stringer(name) for name in unique_rows[:, 1]])
            return unique_rows[order]

        promoters = unique_sorted_by_index(np.r_[promoter_promoter_inter[:, [0, 1]],
                                                 promoter_promoter_inter[:, [0, 2]],
                                                 promoter_enhancer_inter[:, [0, 1]]])
        enhancers = unique_sorted_by_index(np.r_[enhancer_enhancer_inter[:, [0, 1]],
                                                 enhancer_enhancer_inter[:, [0, 2]],
                                                 promoter_enhancer_inter[:, [0, 2]]])
        pro_enh_unique = np.r_[promoters, enhancers]

        # Group by chromosome; inside a chromosome promoters precede enhancers.
        chrom_pieces, feature_pieces = [], []
        for chrom_name in chromosome_names:
            chr_mask = pro_enh_unique[:, 0] == chrom_name
            chrom_pieces.append(pro_enh_unique[chr_mask, 0])
            feature_pieces.append(pro_enh_unique[chr_mask, 1])
        chroms_frame = np.concatenate(chrom_pieces)
        pro_enh_unique_ordered = np.concatenate(feature_pieces)

        dict_inter = {}
        for index, el in enumerate(pro_enh_unique_ordered):
            dict_inter[el] = index
        return pro_enh_unique_ordered, chroms_frame, dict_inter

    unique_features, chroms_frame, dict_inter = prepares_reverse_map_and_uniqueness()
    number_of_features = len(unique_features)

    def difference_interactions_prod(difference_arr, path):
        # Rows (chromosome, feature 1, feature 2, path) for every True cell
        # strictly above the diagonal; the input array is left untouched.
        first, second = np.where(np.triu(difference_arr, 1))
        column_chr = chroms_frame[first].astype(str)
        column_1st = unique_features[first]
        column_2nd = unique_features[second]
        column_path = np.array([path] * len(first), str)
        return np.c_[column_chr[:, None], column_1st[:, None],
                     column_2nd[:, None], column_path[:, None]]

    def report_size(path, reach):
        # Number of connected unordered pairs, self loops excluded.
        size = (int(np.count_nonzero(reach)) - number_of_features) // 2
        print(' '.join(['path:', str(path), 'size =', str(size)]))

    def gets_rid_of_promoter_promoter_inter(array_):
        both_promoters = (un_featurer(array_[:, 0]) == 'p') & (un_featurer(array_[:, 1]) == 'p')
        return np.invert(both_promoters)

    # Adjacency matrix of the direct interactions, symmetric, with self loops.
    matrix_of_interactions = np.zeros((number_of_features, number_of_features), bool)
    diagonal = np.arange(number_of_features)
    matrix_of_interactions[diagonal, diagonal] = True

    concat = np.r_[promoter_enhancer_inter, enhancer_enhancer_inter, promoter_promoter_inter]
    indexes_1 = np.array([dict_inter[feature] for feature in concat[:, 1]], dtype=int)
    indexes_2 = np.array([dict_inter[feature] for feature in concat[:, 2]], dtype=int)
    matrix_of_interactions[indexes_1, indexes_2] = True
    matrix_of_interactions[indexes_2, indexes_1] = True

    cumulative_old = np.zeros((number_of_features, number_of_features), bool)
    matrix_of_interactions_so_far = matrix_of_interactions
    cumulative = matrix_of_interactions

    path = 1
    # XOR of boolean arrays is what `-` meant before NumPy started rejecting it.
    found = [difference_interactions_prod(np.logical_xor(cumulative, cumulative_old), path)]
    report_size(path, matrix_of_interactions_so_far)

    while not np.array_equal(cumulative, cumulative_old):
        path += 1
        cumulative_old = cumulative
        # Boolean matrix product: extends every known connection by one step.
        matrix_of_interactions_so_far = np.dot(matrix_of_interactions, matrix_of_interactions_so_far)
        cumulative = matrix_of_interactions_so_far
        difference = np.logical_xor(cumulative, cumulative_old)
        report_size(path, matrix_of_interactions_so_far)
        if path == max_path:
            break
        found.append(difference_interactions_prod(difference, path))

    interactions_of_path = np.concatenate(found)
    mask = gets_rid_of_promoter_promoter_inter(interactions_of_path[:, [1, 2]])
    return interactions_of_path[mask]
def chrom_specific_negative_interactions(chrom, mode, prior_mode = False):
	"""Return the negative (non-interacting) index pairs for one chromosome.

	A square boolean mask of side ``len(indexes_p) + len(indexes_e)`` is built
	in which promoters occupy the first ``len(indexes_p)`` rows/columns and
	enhancers the remaining ones. Every cell set to True is *excluded*; the
	negatives are the cells that are still False at the end (and, when the
	module-level ``domain`` flag is truthy, are also allowed by
	``domain_matrix``).

	Parameters
	----------
	chrom :
		Key used for every per-chromosome lookup (``dict_chrom_*``,
		``chr_interactions_dict_*``) and passed on to ``initialise_variables``
		and ``interacting_domain.interacting_domains``.
	mode :
		Tested with ``in`` against "promoter_enhancer_interactions" and
		"enhancer_enhancer_interactions". If both match, the enhancer-enhancer
		result overwrites the promoter-enhancer one; if neither matches, the
		final ``return`` raises UnboundLocalError.
	prior_mode : bool
		When true, columns of non-interacting enhancers are masked, exactly as
		when the module-level ``interacting_enhancers_only`` flag is set.

	Returns
	-------
	2-column array of index pairs. For promoter-enhancer negatives column 0 is
	the row of the mask and column 1 is the column of the mask minus
	``len(indexes_p)``; for enhancer-enhancer negatives both columns are
	shifted by ``len(indexes_p)``.

	Notes
	-----
	Relies on names that are not defined in this function: ``np``,
	``initialise_variables``, ``un_string``, ``config_variables``,
	``interacting_domain``, ``TSS_coordinates``, ``domain``,
	``TSS_or_intra_genic_for_domain_filter``, ``interacting_negatives``,
	``interacting_enhancers_only``, ``distant_enh_only``,
	``dict_chrom_proximal``, ``dict_chrom_pro_survived``,
	``dict_chrom_pro_not_survived``, ``dict_chrom_enh_survived``,
	``dict_chrom_enh_not_survived``, ``chr_interactions_dict_pro_enh`` and
	``chr_interactions_dict_enh_enh``.

	NOTE(review): ``total_p`` / ``total_e`` are subtracted from the stored
	indexes before they are used as mask positions, so they are presumably the
	number of promoters / enhancers on the preceding chromosomes - confirm
	against ``initialise_variables``.
	"""

	enh_coordinates, pro_coordinates, indexes_p, indexes_e, total_p, total_e = initialise_variables(chrom)

	length_chr = len(indexes_p) + len(indexes_e)

	chrom_pro_not_survived = dict_chrom_pro_not_survived[chrom]
	chrom_pro_survived = dict_chrom_pro_survived[chrom]  # looked up but not used below
	chrom_enh_not_survived = dict_chrom_enh_not_survived[chrom]
	chrom_enh_survived = dict_chrom_enh_survived[chrom]  # looked up but not used below

	# interaction_domains_adjustments beginning -----------------------
	if domain:
		# NOTE(review): any other value of the filter setting leaves
		# coords_pro_domain unbound and the next statement fails.
		if TSS_or_intra_genic_for_domain_filter == "Intra_genic":
			coords_pro_domain = pro_coordinates[indexes_p]
		elif TSS_or_intra_genic_for_domain_filter == "TSS_only":
			# two-column window built from TSS - 1 and TSS + 1
			coords_pro_domain = np.column_stack((TSS_coordinates[indexes_p]-1, TSS_coordinates[indexes_p]+1))

		# the 'left' and 'right' results are combined with `+`
		domain_matrix = interacting_domain.interacting_domains(coords_pro_domain, enh_coordinates[indexes_e], chrom, 'left', True)
		domain_matrix = domain_matrix + interacting_domain.interacting_domains(coords_pro_domain, enh_coordinates[indexes_e], chrom, 'right', True)
	else:
		# scalar True leaves the mask untouched in the products below
		domain_matrix = True
	# interaction_domains_adjustments_ending -----------------------

	def promoter_enhancer_interactions_generator():
		"""Return the promoter-enhancer negatives of ``chrom`` as (row, column - len(indexes_p)) pairs."""

		chr_interactions_pro_enh = chr_interactions_dict_pro_enh[chrom]

		if config_variables.alternative_classificator_outside_enhancers:
			chrom_interacting_enhancers_pro = config_variables.chrom_interacting_enhancers_pro[chrom]
		else:
			chrom_interacting_enhancers_pro = np.unique(un_string(chr_interactions_dict_pro_enh[chrom])[:,1])

		chrom_interacting_promoters_pro = np.unique(un_string(chr_interactions_dict_pro_enh[chrom])[:,0])

		# True means "excluded from the negatives"
		interaction_matrix = np.zeros((length_chr, length_chr), bool)
		interaction_matrix[range(length_chr), range(length_chr)] = True # gets rid of the diagonal
		interaction_matrix[np.tril_indices(length_chr)] = True # gets rid of symmetric interactions (lower triangle)
		interaction_matrix[0:len(indexes_p), 0:len(indexes_p)] = True # gets rid of promoter-promoter interactions

		# feature labels for this chromosome; not used further below
		features = np.array(['p{0}'.format(ind) for ind in indexes_p] + ['e{0}'.format(ind) for ind in indexes_e])

		true_pro_enh_indexes = un_string(chr_interactions_pro_enh)

		print 'number of pro_enh true interactions: ', len(chr_interactions_pro_enh)

		if len(chrom_pro_not_survived):
			# gets rid of negative interactions which could be generated by filtered-out promoters
			interaction_matrix[chrom_pro_not_survived - total_p, :] = True

		if interacting_negatives:
			# rows of promoters without any true interaction are removed from the enhancer columns
			mask_interacting_promoters = np.zeros(length_chr).astype(bool)
			mask_interacting_promoters[chrom_interacting_promoters_pro - total_p] = True
			mask_non_interacting_promoters = np.invert(mask_interacting_promoters)
			interaction_matrix[mask_non_interacting_promoters, len(indexes_p):] = True

		# Enhancers that did not pass the filter threshold need no separate
		# handling here: only interacting enhancers are considered, and those are
		# a subset of the surviving enhancers (original author's remark).
		# NOTE(review): the file's indentation was lost; this mask is laid out
		# unconditionally, mirroring enhancer_enhancer_interactions_generator,
		# because it is needed below even when interacting_negatives is false -
		# confirm against the original layout.
		mask_interacting_enhancers = np.zeros(length_chr).astype(bool)
		mask_interacting_enhancers[chrom_interacting_enhancers_pro - total_e + len(indexes_p)] = True
		mask_non_interacting_enhancers = np.invert(mask_interacting_enhancers)
		#interaction_matrix[:len(indexes_p), mask_non_interacting_enhancers] = True # it's equivalent to interacting_enhancers_mask_invert

		if interacting_enhancers_only or prior_mode:
			# only columns of interacting enhancers stay available
			interaction_matrix[:len(indexes_p), mask_non_interacting_enhancers] = True
		elif len(chrom_enh_not_survived):
			# gets rid of filtered-out enhancers which could be causing nans due to their correlations
			interaction_matrix[:len(indexes_p), len(indexes_p) + chrom_enh_not_survived - total_e] = True

		# NOTE(review): laid out as an independent check (not nested in the
		# elif above) - confirm against the original layout.
		if distant_enh_only and len(dict_chrom_proximal[chrom]):
			interaction_matrix[:len(indexes_p), len(indexes_p) + dict_chrom_proximal[chrom] - total_e] = True

		# true interactions can never be negatives
		interaction_matrix[true_pro_enh_indexes[:, 0] - total_p, true_pro_enh_indexes[:, 1] - total_e + len(indexes_p)] = True

		interaction_matrix[len(indexes_p): len(indexes_p) + len(indexes_e), len(indexes_p): len(indexes_p) + len(indexes_e)] = True # gets rid of the enhancer-enhancer block

		# whatever is still False (and allowed by domain_matrix) is a negative
		indexes_of_zero_interactions = np.where(True == np.invert(interaction_matrix)*domain_matrix)
		column_1st = indexes_of_zero_interactions[0]
		column_2nd = indexes_of_zero_interactions[1] - len(indexes_p)  # back to a 0-based enhancer position

		prom_enh_false_interactions = np.concatenate((column_1st[:,None], column_2nd[:,None]), axis=1)
		#pro-enh interactions end-----------------------------------------------------------------

		return prom_enh_false_interactions

	if "promoter_enhancer_interactions" in mode:
		negative_of_type_of_interactions = promoter_enhancer_interactions_generator()
		print 'number of pro_enh false interactions: ', len(negative_of_type_of_interactions)

	def enhancer_enhancer_interactions_generator():
		"""Return the enhancer-enhancer negatives of ``chrom``; both columns are shifted by len(indexes_p)."""

		chr_interactions_enh_enh = chr_interactions_dict_enh_enh[chrom]

		# enhancers appearing on either side of a true enhancer-enhancer interaction
		chrom_interacting_enhancers_enh = np.unique(un_string(chr_interactions_dict_enh_enh[chrom])[:,0])
		chrom_interacting_enhancers_enh = np.unique(np.r_[chrom_interacting_enhancers_enh, np.unique(un_string(chr_interactions_dict_enh_enh[chrom])[:,1])])

		# True means "excluded from the negatives"
		interaction_matrix = np.zeros((length_chr, length_chr), bool)
		interaction_matrix[range(length_chr), range(length_chr)] = True # gets rid of the diagonal
		interaction_matrix[0:len(indexes_p), 0:len(indexes_p)] = True # gets rid of promoter-promoter interactions
		interaction_matrix[:len(indexes_p), len(indexes_p): len(indexes_p) + len(indexes_e)] = True # gets rid of the promoter-enhancer block

		print 'number of enh_enh true interactions: ', len(chr_interactions_enh_enh)

		#enh-enh interactions start-----------------------------------------------------------------
		if len(chrom_enh_not_survived):
			interaction_matrix[len(indexes_p) + chrom_enh_not_survived - total_e, :] = True # sorts out rows
		if distant_enh_only and len(dict_chrom_proximal[chrom]):
			interaction_matrix[len(indexes_p) + dict_chrom_proximal[chrom] - total_e, :] = True

		if interacting_negatives:
			# rows of enhancers without any true interaction are removed
			mask_interacting_enhancers = np.zeros(length_chr).astype(bool)
			mask_interacting_enhancers[chrom_interacting_enhancers_enh - total_e + len(indexes_p)] = True
			mask_non_interacting_enhancers = np.invert(mask_interacting_enhancers)
			interaction_matrix[mask_non_interacting_enhancers, len(indexes_p):] = True

		#sort out columns--------------------------------------
		mask_interacting_enhancers = np.zeros(length_chr).astype(bool)
		mask_interacting_enhancers[chrom_interacting_enhancers_enh - total_e + len(indexes_p)] = True
		mask_non_interacting_enhancers = np.invert(mask_interacting_enhancers)

		if interacting_enhancers_only or prior_mode:
			# only columns of interacting enhancers stay available
			interaction_matrix[len(indexes_p):, mask_non_interacting_enhancers] = True
		elif len(chrom_enh_not_survived):
			# gets rid of filtered-out enhancers which could be causing nans due to their correlations
			interaction_matrix[len(indexes_p):, len(indexes_p) + chrom_enh_not_survived - total_e] = True

		# NOTE(review): laid out as an independent check (not nested in the
		# elif above) - confirm against the original layout.
		if distant_enh_only and len(dict_chrom_proximal[chrom]):
			interaction_matrix[len(indexes_p):, len(indexes_p) + dict_chrom_proximal[chrom] - total_e] = True
		#sort out columns--------------------------------------end

		# true interactions are excluded in both orientations
		true_enh_enh_indexes = un_string(chr_interactions_enh_enh)
		interaction_matrix[true_enh_enh_indexes[:,0] - total_e + len(indexes_p), true_enh_enh_indexes[:,1] - total_e + len(indexes_p)] = True
		interaction_matrix[true_enh_enh_indexes[:,1] - total_e + len(indexes_p), true_enh_enh_indexes[:,0] - total_e + len(indexes_p)] = True

		interaction_matrix[np.tril_indices(length_chr)] = True # gets rid of symmetric interactions (lower triangle)

		# whatever is still False (and allowed by domain_matrix) is a negative
		indexes_of_zero_interactions = np.where(True == np.invert(interaction_matrix)*domain_matrix)
		column_1st = indexes_of_zero_interactions[0] - len(indexes_p)
		column_2nd = indexes_of_zero_interactions[1] - len(indexes_p)

		enh_enh_false_interactions = np.concatenate((column_1st[:, None], column_2nd[:, None]), axis=1)

		return enh_enh_false_interactions

	if "enhancer_enhancer_interactions" in mode:
		negative_of_type_of_interactions = enhancer_enhancer_interactions_generator()
		print 'number of enh_enh false interactions: ', len(negative_of_type_of_interactions)

	return negative_of_type_of_interactions
def single_element_cleaner(list_of_datasets, remove_single_domain_elements, filtered_elements):
	"""Build one boolean flag per element from the interacting-domain matrices.

	The chromosome labels and coordinates are taken from the first dataset in
	``list_of_datasets`` (via the module-level ``dataset_time_series_dict``).
	Chromosomes are processed in ``np.unique(chroms)`` order and the
	per-chromosome flags are concatenated in that same order.

	For a chromosome that has coordinates and at least one row in
	'report_hESC_Combined_converted.csv', the "left" and "right" matrices
	returned by ``interacting_domain.interacting_domains`` are masked so that
	the diagonal and the columns of elements with a false entry in
	``filtered_elements`` are dropped, and the flag of each row is derived
	from the row sums of the masked matrices. Any other chromosome gets
	all-True flags.

	Parameters
	----------
	list_of_datasets : sequence of keys of ``dataset_time_series_dict``; only
		the first one is read.
	remove_single_domain_elements : accepted but not used by this function.
	filtered_elements : boolean array aligned with the elements of the first
		dataset.

	Returns
	-------
	1-D array with the concatenated per-chromosome flags.

	NOTE(review): the `+` / `-` arithmetic on the masked matrices is kept
	exactly as written; its meaning depends on the dtype returned by
	``interacting_domain.interacting_domains`` (on boolean arrays `-` is
	rejected by recent NumPy releases) - confirm.
	"""
	import interacting_domain

	mode = config_variables.mode
	upstream = config_variables.upstream
	domain_filter_kind = config_variables.TSS_or_intra_genic_for_domain_filter

	first_dataset = dataset_time_series_dict[list_of_datasets[0]]
	chroms, coordinates = first_dataset[0], first_dataset[1]

	domain_table = np.loadtxt('report_hESC_Combined_converted.csv', dtype=str, usecols=(4, 12, 13), delimiter=',')

	# For promoters filtered on the TSS only, swap the stored coordinates for a
	# two-column window built from TSS - 1 and TSS + 1.
	if mode == "promoter_enhancer_interactions" and domain_filter_kind == "TSS_only":
		tss = extract_TSS_coordinates(upstream)
		coordinates = np.column_stack((tss - 1, tss + 1))

	chrom_names = np.unique(chroms)
	flags_by_chrom = {}

	for chrom_ in chrom_names:
		on_chrom = chroms == chrom_
		kept_on_chrom = filtered_elements[on_chrom]
		chrom_coordinates = coordinates[on_chrom]

		# Nothing to compare against: every element of the chromosome is kept.
		if not (len(chrom_coordinates) and sum(domain_table[:, 0] == chrom_)):
			flags_by_chrom[chrom_] = np.ones(sum(on_chrom), bool)
			continue

		left = interacting_domain.interacting_domains(
			chrom_coordinates, np.array([]).reshape(0, 2), chrom_, state="left", matrix_version=True)
		right = interacting_domain.interacting_domains(
			chrom_coordinates, np.array([]).reshape(0, 2), chrom_, state="right", matrix_version=True)

		# Drop self-pairs and the columns of elements that were filtered out.
		keep = np.ones_like(left)
		diagonal = range(len(keep))
		keep[diagonal, diagonal] = False
		keep[:, np.invert(kept_on_chrom)] = False

		left_kept = left * keep
		right_kept = right * keep

		joint = left_kept + right_kept
		only_left = joint - right_kept
		only_right = joint - left_kept
		shared = joint - (only_left + only_right)

		is_shared = shared.sum(1).astype(bool)  # True where any entry of the row is set
		not_shared = np.invert(is_shared)

		survived = np.zeros(len(chrom_coordinates), bool)
		survived[is_shared] = ((only_left.sum(1) > 0) + (only_right.sum(1) > 0))[is_shared]
		survived[not_shared] = (left_kept.sum(1) > 0)[not_shared]

		flags_by_chrom[chrom_] = survived

	# Flatten the per-chromosome flags, chromosome by chromosome.
	return np.array([flag for name in chrom_names for flag in flags_by_chrom[name]])
def generator(pro_survived, enh_survived, domain, max_path):
	"""Enumerate feature pairs linked by chains of interactions, with the chain length.

	Steps, in order:

	1. Load the enhancer-enhancer, promoter-promoter and promoter-enhancer
	   tables from ``config_variables.temp_output`` (file names are built from
	   ``upstream`` / ``downstream``; columns 0-2, tab separated).
	2. Keep only interactions whose two indexes are in ``pro_survived`` /
	   ``enh_survived`` (promoter side / enhancer side respectively).
	3. For every enhancer-enhancer interaction between an 'ovenh' and an
	   'enh' feature, add a promoter-enhancer interaction for each surviving
	   promoter listed for that 'ovenh' index in the overlaps file.
	4. Relabel all features as 'p<index>' / 'e<index>'.
	5. If ``domain`` is truthy, keep only interactions allowed by the matrices
	   returned by ``interacting_domain.interacting_domains``.
	6. Build a symmetric boolean adjacency matrix (diagonal included) over all
	   features and multiply it repeatedly; the pairs that become connected
	   at step ``k`` are recorded with path ``k``.

	Parameters
	----------
	pro_survived, enh_survived : collections of promoter / enhancer indexes
		that passed filtering (used with ``np.in1d`` and ``in``).
	domain : truthy to apply the interacting-domain filter.
	max_path : the loop stops when the path length reaches this value; pairs
		first connected at exactly ``max_path`` are *not* added (see the
		NOTE(review) in the loop).

	Returns
	-------
	2-D array of strings with columns [chromosome, feature 1, feature 2, path],
	upper-triangle pairs only, with promoter-promoter pairs removed.

	Progress lines ('path: .. size = ..' and the domain-filter summary) are
	printed to standard output.
	"""
	import numpy as np
	import re
	import config_variables

	promoter_overlaps_enhancer_file = config_variables.promoter_overlaps_enhancer_file
	upstream = config_variables.upstream
	downstream = config_variables.downstream
	link_data_set_name_to_file_name = config_variables.link_data_set_name_to_file_name
	dataset_time_series_dict = config_variables.dataset_time_series_dict
	TSS_or_intra_genic_for_domain_filter = config_variables.TSS_or_intra_genic_for_domain_filter
	name_of_time_series_promoter_file_for_TSS_start = config_variables.name_of_time_series_promoter_file_for_TSS_start
	temp_output = config_variables.temp_output

	#parameters-------------------------
	ovenh_ovenh_pro_pro_version = False
	max_pro_enh_mode = True
	#-----------------------------------

	# chr1 .. chr22, chrX, chrY - the processing order used throughout
	chromosome_names = ['chr{0}'.format(label) for label in list(range(1, 23)) + ['X', 'Y']]

	# raw, precompiled patterns (the original used non-raw '\d+' / '\D+')
	digits_pattern = re.compile(r'\d+')
	non_digits_pattern = re.compile(r'\D+')

	def load_interactions(kind):
		"""Load columns 0-2 of one '<kind>_interactions_<upstream>_<downstream>' table."""
		file_name = temp_output + '{0}_interactions_{1}_{2}'.format(kind, upstream, downstream)
		return np.loadtxt(file_name, usecols = (0,1,2), dtype = str, delimiter = '\t')

	enhancer_enhancer_inter = load_interactions('enhancer_enhancer')
	promoter_promoter_inter = load_interactions('promoter_promoter')
	promoter_enhancer_inter = load_interactions('promoter_enhancer')

	def un_stringer(x):
		"""Return the first run of digits in a feature label as an int."""
		return int(digits_pattern.findall(x)[0])

	def un_featurer(array):
		"""Return the first run of non-digits of every label in ``array``."""
		return np.array([non_digits_pattern.findall(x)[0] for x in array])

	def un_string(array_):
		"""Turn two label columns into a two-column array of their integer indexes."""
		first = np.array([un_stringer(x) for x in array_[:, 0]])
		second = np.array([un_stringer(x) for x in array_[:, 1]])
		return np.c_[first[:, None], second[:, None]]

	def filter_(array_, filt_1, filt_2):
		"""Mask of rows whose column-1 index is in filt_1 and column-2 index is in filt_2."""
		in_first = np.in1d([un_stringer(x) for x in array_[:, 1]], filt_1)
		in_second = np.in1d([un_stringer(x) for x in array_[:, 2]], filt_2)
		return in_first & in_second

	#cleans interactions: both partners must have survived filtering
	promoter_promoter_inter = promoter_promoter_inter[filter_(promoter_promoter_inter, pro_survived, pro_survived)]
	enhancer_enhancer_inter = enhancer_enhancer_inter[filter_(enhancer_enhancer_inter, enh_survived, enh_survived)]
	promoter_enhancer_inter = promoter_enhancer_inter[filter_(promoter_enhancer_inter, pro_survived, enh_survived)]

	pro_enh_indexes_list = un_string(promoter_enhancer_inter[:,1:])
	pro_pro_indexes_list = un_string(promoter_promoter_inter[:,1:])

	def prepare_overlaps():
		"""Map every value of overlaps column 1 to the list of column-0 values on its rows."""
		overlaps = np.loadtxt(promoter_overlaps_enhancer_file, delimiter = '\t', usecols = (4, 8), dtype = int, ndmin = 2)
		diction = {}
		# single pass; per-key order follows file order, as in the original rescan
		for promoter, peak in overlaps:
			diction.setdefault(peak, []).append(promoter)
		return diction

	diction_overlaps_ovenh = prepare_overlaps()

	def promoter_promoter_adder(diction_overlaps_ovenh, index_1, index_2, chro, pro_pro_indexes_list, promoter_promoter_inter):
		"""Convert an ovenh-ovenh interaction into promoter-promoter interactions.

		NOTE(review): only reachable when ovenh_ovenh_pro_pro_version is True
		(hard-coded False above). The logic is left as found: the dictionary
		values are plain lists, so the boolean-mask indexing below fails, and
		np.r_ is given a 2-D and a 1-D array - fix before enabling.
		"""
		print('ovenh-ovenh')
		promoters_1 = diction_overlaps_ovenh[index_1]
		promoters_2 = diction_overlaps_ovenh[index_2]

		promoters_1 = promoters_1[np.in1d(promoters_1, pro_survived)] # converts ER signals which overlap filtered out promoters into distant peaks
		promoters_2 = promoters_2[np.in1d(promoters_2, pro_survived)]

		legend = np.r_[promoters_1, promoters_2]

		if len(legend) > 1:
			counts, bins = np.histogram(legend, np.arange(min(legend), max(legend) + 2))
			if len(np.where(counts > 1)[0]):
				print('ambigous allocation - peak overlaps two promoters which interact through ChIA-PET index {0}, {1}'.format(index_1, index_2))

			matrix = np.ones((len(legend), len(legend)), dtype = bool)
			matrix[:len(promoters_1), :len(promoters_1)] = False
			matrix[len(promoters_1): len(promoters_1) + len(promoters_2), len(promoters_1): len(promoters_1) + len(promoters_2)] = False
			matrix[np.tril_indices(len(legend))] = False # upper triangular elements

			pro_inter = np.c_[legend[np.where(matrix)[0]], legend[np.where(matrix)[1]]]
			pro_inter_symbolic = [[chro ,'ovpro{0}'.format(ind1), 'ovpro{0}'.format(ind2)] for ind1, ind2 in pro_inter]

			for el_1, el_2 in zip(pro_inter, pro_inter_symbolic):
				lista = list(el_1)
				if lista not in map(list, pro_pro_indexes_list):
					pro_pro_indexes_list = np.r_[pro_pro_indexes_list, el_1]
					promoter_promoter_inter = np.r_[promoter_promoter_inter, el_2]

		return pro_pro_indexes_list, promoter_promoter_inter

	def ER_enhancer_Non_enhancer_pro_adder(pro_enh_indexes_list, promoter_enhancer_inter, pro_pro_indexes_list, promoter_promoter_inter):
		"""Derive extra promoter-enhancer interactions from ovenh-enh interactions."""
		# built once instead of once per candidate pair
		known_pairs = set(map(tuple, pro_enh_indexes_list))

		pro_enh_indexes_list_added = []
		pro_enh_indexes_list_symbolic_added = []

		for el in enhancer_enhancer_inter:
			chro = el[0]
			index_1 = int(un_stringer(el[1]))
			index_2 = int(un_stringer(el[2]))
			feature_1 = non_digits_pattern.findall(el[1])[0]
			feature_2 = non_digits_pattern.findall(el[2])[0]

			if feature_1 == 'ovenh' and feature_2 == 'enh':
				overlapping_index, enhancer_index = index_1, index_2
			elif feature_1 == 'enh' and feature_2 == 'ovenh':
				overlapping_index, enhancer_index = index_2, index_1
			elif feature_1 == 'ovenh' and feature_2 == 'ovenh':
				if ovenh_ovenh_pro_pro_version:
					# converts ovenh-ovenh interactions to promoter-promoter interactions
					pro_pro_indexes_list, promoter_promoter_inter = promoter_promoter_adder(diction_overlaps_ovenh, index_1, index_2, chro, pro_pro_indexes_list, promoter_promoter_inter)
				continue
			else:
				continue

			# promoters listed for the overlapping peak take its place in the pair
			for pro_dict in diction_overlaps_ovenh[overlapping_index]:
				if (pro_dict, enhancer_index) not in known_pairs and pro_dict in pro_survived:
					pro_enh_indexes_list_added.append([pro_dict, enhancer_index])
					pro_enh_indexes_list_symbolic_added.append([chro, 'ovpro{0}'.format(pro_dict), 'enh{0}'.format(enhancer_index)])

		if len(pro_enh_indexes_list_added):
			pro_enh_indexes_list = np.r_[pro_enh_indexes_list, pro_enh_indexes_list_added]
			promoter_enhancer_inter = np.r_[promoter_enhancer_inter, pro_enh_indexes_list_symbolic_added]

		return pro_enh_indexes_list, promoter_enhancer_inter, pro_pro_indexes_list, promoter_promoter_inter

	pro_enh_indexes_list, promoter_enhancer_inter, pro_pro_indexes_list, promoter_promoter_inter = ER_enhancer_Non_enhancer_pro_adder(pro_enh_indexes_list, promoter_enhancer_inter, pro_pro_indexes_list, promoter_promoter_inter)

	def ER_ovenh_pro_pro_adder():
		"""Return the promoter-enhancer interactions whose first feature is not 'ovenh'.

		NOTE(review): only reachable when max_pro_enh_mode is False (hard-coded
		True above). Left as found: ``filtered_promoter_promoter_inter`` is
		never defined, and the feature is read from column 1 while the
		overlaps lookup uses the column-2 index - fix before enabling.
		"""
		filtered_promoter_enhancer_inter = []
		diction_overlaps_ovenh = prepare_overlaps()

		for el in promoter_enhancer_inter:
			index_1 = int(digits_pattern.findall(el[1])[0])
			index_2 = int(digits_pattern.findall(el[2])[0])
			feature_2 = non_digits_pattern.findall(el[1])[0]
			if feature_2 == 'ovenh':
				for pro_dict in diction_overlaps_ovenh[index_2]:
					pro_pro_int = [index_1, pro_dict]
					if pro_pro_int not in filtered_promoter_promoter_inter and pro_dict in pro_survived:
						filtered_promoter_promoter_inter.append(pro_pro_int)
			else:
				filtered_promoter_enhancer_inter.append(el)

		return np.array(filtered_promoter_enhancer_inter)

	if not(max_pro_enh_mode):
		# if the aim is to maximise the number of promoter-enhancer interactions it could be best to leave this off
		promoter_enhancer_inter = ER_ovenh_pro_pro_adder()

	def scan_through_pro_pro_inter_and_add_pro_enh_inter():
		# placeholder kept from the original; intentionally does nothing
		pass

	def into_string(array, s1, s2):
		"""Relabel the two feature columns as s1<index> and s2<index>, keeping column 0."""
		indexes = un_string(array[:,1:])
		first = np.array([s1 + str(index) for index in indexes[:,0]])
		second = np.array([s2 + str(index) for index in indexes[:,1]])
		return np.c_[array[:,0][:,None], first[:, None], second[:, None]]

	promoter_enhancer_inter = into_string(promoter_enhancer_inter,'p', 'e')
	promoter_promoter_inter = into_string(promoter_promoter_inter,'p', 'p')
	enhancer_enhancer_inter = into_string(enhancer_enhancer_inter,'e', 'e')

	#--------------------------------------------------------------------------------------------------------------------------------------------------------
	def stringer(array, st):
		"""Prefix every element of ``array`` with ``st``."""
		return np.array(['{0}{1}'.format(st, x) for x in array])

	def filter_domains(symb_1, symb_2, chr_interactions, length_chr, row_indexes_plus, column_indexes_plus, chrom_name, domain_matrix):
		"""Return the interactions of one chromosome that survive ``domain_matrix``.

		The chromosome name and the domain matrix are passed in explicitly
		(the original read them from the enclosing loop through a closure).
		Returns [] when there are no interactions, otherwise a 3-column array
		[chromosome, symb_1<row>, symb_2<column>].
		"""
		if len(chr_interactions) == 0:
			return []

		interaction_matrix = np.zeros((length_chr, length_chr), bool)
		true_indexes = un_string(chr_interactions)
		rows = true_indexes[:,0] + row_indexes_plus
		columns = true_indexes[:,1] + column_indexes_plus

		interaction_matrix[rows, columns] = True
		interaction_matrix[columns, rows] = True
		interaction_matrix[np.tril_indices(length_chr)] = False # gets rid of symmetric interactions

		surviving = interaction_matrix*domain_matrix
		print('number of original: {0} {1} - {2} true survived domain filtering: {3}'.format(len(true_indexes), symb_1, symb_2, np.sum(surviving)))

		kept_rows, kept_columns = np.where(surviving)
		domain_raws = kept_rows - row_indexes_plus
		domain_columns = kept_columns - column_indexes_plus

		true_indexes_dom_symb = np.c_[stringer(domain_raws, symb_1)[:, None], stringer(domain_columns, symb_2)[:, None]]
		s_1 = len(true_indexes_dom_symb)

		return np.c_[np.array([chrom_name]*s_1)[:,None], true_indexes_dom_symb]

	if domain:
		import interacting_domain

		def extract_TSS_coordinates(upstream):
			"""Return one coordinate per row: column 1 + upstream on '+' rows, column 2 + upstream otherwise."""
			data = np.loadtxt(name_of_time_series_promoter_file_for_TSS_start, dtype = str, delimiter = '\t')
			plus_strand = data[:, 4] == '+'
			minus_strand = np.invert(plus_strand)
			TSS_coordinates = np.zeros(len(plus_strand), int)
			TSS_coordinates[plus_strand] = data[plus_strand, 1].astype(int) + upstream
			TSS_coordinates[minus_strand] = data[minus_strand, 2].astype(int) + upstream
			return TSS_coordinates

		TSS_coordinates = extract_TSS_coordinates(upstream)

		def initialise_variables(chrom):
			"""Return the ER promoter / enhancer chromosomes and coordinates plus the row indexes on ``chrom``."""
			name_of_pro_t_s = link_data_set_name_to_file_name["promoters"]["ER"]
			name_of_enh_t_s = link_data_set_name_to_file_name["enhancers"]["ER"]
			pro_chroms, pro_coordinates, ts_p = dataset_time_series_dict[name_of_pro_t_s]
			enh_chroms, enh_coordinates, ts_e = dataset_time_series_dict[name_of_enh_t_s]
			indexes_p = np.where(pro_chroms==chrom)[0] # rows of the promoters on this chromosome
			indexes_e = np.where(enh_chroms==chrom)[0] # rows of the enhancers on this chromosome
			return pro_chroms, enh_chroms, pro_coordinates, enh_coordinates, indexes_p, indexes_e

		enhancer_enhancer_inter_dom = {}
		promoter_enhancer_inter_dom = {}
		promoter_promoter_inter_dom = {}
		total_p = total_e = 0  # promoters / enhancers counted on the chromosomes processed so far

		for chrom_name in chromosome_names:
			chr_interactions_enh_enh = enhancer_enhancer_inter[enhancer_enhancer_inter[:,0]==chrom_name][:,1:]
			chr_interactions_pro_enh = promoter_enhancer_inter[promoter_enhancer_inter[:,0]==chrom_name][:,1:]
			chr_interactions_pro_pro = promoter_promoter_inter[promoter_promoter_inter[:,0]==chrom_name][:,1:]

			# NOTE(review): a chromosome lacking any one of the three kinds is
			# skipped entirely: its entries are never stored (the concatenation
			# below then raises KeyError unless it is chrY) and total_p /
			# total_e are not advanced for it. Behaviour kept as found - confirm
			# the intended handling.
			if not (len(chr_interactions_enh_enh) and len(chr_interactions_pro_enh) and len(chr_interactions_pro_pro)):
				continue

			pro_chroms, enh_chroms, pro_coord, enh_coord, indexes_p, indexes_e = initialise_variables(chrom_name)

			if TSS_or_intra_genic_for_domain_filter == "TSS_only":
				# NOTE(review): window is (TSS, TSS + 2) here - confirm it is meant to differ from (TSS - 1, TSS + 1) used elsewhere in the file
				pro_coord = np.column_stack((TSS_coordinates, TSS_coordinates + 2))

			length_chr = len(indexes_p) + len(indexes_e)
			chrom_pro_coord = pro_coord[indexes_p]
			chrom_ER_coord = enh_coord[indexes_e]

			domain_matrix = interacting_domain.interacting_domains(chrom_pro_coord, chrom_ER_coord, chrom_name, 'left')
			domain_matrix = domain_matrix + interacting_domain.interacting_domains(chrom_pro_coord, chrom_ER_coord, chrom_name, 'right')

			# offsets place promoters in the first len(indexes_p) rows/columns and enhancers after them
			promoter_enhancer_inter_dom[chrom_name] = filter_domains('p','e', chr_interactions_pro_enh, length_chr, - total_p, - total_e + len(indexes_p), chrom_name, domain_matrix)
			enhancer_enhancer_inter_dom[chrom_name] = filter_domains('e','e', chr_interactions_enh_enh, length_chr, - total_e + len(indexes_p), - total_e + len(indexes_p), chrom_name, domain_matrix)
			promoter_promoter_inter_dom[chrom_name] = filter_domains('p','p', chr_interactions_pro_pro, length_chr, - total_p, - total_p, chrom_name, domain_matrix)
			# interaction_domains_adjustments end -----------------------

			total_p += len(indexes_p)
			total_e += len(indexes_e)

		# chrY results are always discarded
		promoter_enhancer_inter_dom['chrY'] = []
		enhancer_enhancer_inter_dom['chrY'] = []
		promoter_promoter_inter_dom['chrY'] = []

		enhancer_enhancer_inter = enhancer_enhancer_inter_dom['chr1']
		promoter_enhancer_inter = promoter_enhancer_inter_dom['chr1']
		promoter_promoter_inter = promoter_promoter_inter_dom['chr1']

		for chrom_name in chromosome_names[1:]:
			add_1 = promoter_enhancer_inter_dom[chrom_name]
			if len(add_1):
				promoter_enhancer_inter = np.r_[promoter_enhancer_inter, add_1]

			add_2 = enhancer_enhancer_inter_dom[chrom_name]
			if len(add_2):
				enhancer_enhancer_inter = np.r_[enhancer_enhancer_inter, add_2]

			add_3 = promoter_promoter_inter_dom[chrom_name]
			if len(add_3):
				promoter_promoter_inter = np.r_[promoter_promoter_inter, add_3]

	def prepares_reverse_map_and_uniqueness():
		"""Return the unique features ordered by chromosome, their chromosomes, and a feature -> position map."""
		def unique_sorted(*column_pairs):
			# unique (chromosome, feature) rows, ordered by the feature's numeric index
			unique_rows = np.array(list(set(map(tuple, np.r_[column_pairs]))))
			order = np.argsort([un_stringer(x) for x in unique_rows[:,1]])
			return unique_rows[order]

		promoters = unique_sorted(promoter_promoter_inter[:,[0,1]], promoter_promoter_inter[:,[0,2]], promoter_enhancer_inter[:,[0,1]])
		enhancers = unique_sorted(enhancer_enhancer_inter[:,[0,1]], enhancer_enhancer_inter[:,[0,2]], promoter_enhancer_inter[:,[0,2]])

		pro_enh_unique = np.r_[promoters, enhancers]

		chroms_frame, pro_enh_unique_ordered = [], []
		for chrom_name in chromosome_names:
			chr_mask = pro_enh_unique[:,0] == chrom_name
			chroms_frame = np.r_[chroms_frame, pro_enh_unique[chr_mask,0]]
			pro_enh_unique_ordered = np.r_[pro_enh_unique_ordered, pro_enh_unique[chr_mask,1]]

		dict_inter = {}
		for index, el in enumerate(pro_enh_unique_ordered):
			dict_inter[el] = index

		return pro_enh_unique_ordered, chroms_frame, dict_inter

	unique_features, chroms_frame, dict_inter = prepares_reverse_map_and_uniqueness()
	feature_count = len(unique_features)

	def difference_interactions_prod(difference_arr, path):
		"""Return [chromosome, feature 1, feature 2, path] rows for the strictly-upper-triangle True cells."""
		# np.triu(..., k=1) drops the diagonal and lower triangle without mutating the argument
		rows, columns = np.where(np.triu(difference_arr, k=1))
		column_chr = chroms_frame[rows].astype(str)
		column_1st = unique_features[rows]
		column_2nd = unique_features[columns]
		column_path = np.array([path]*len(rows), str)
		return np.c_[column_chr[:,None], column_1st[:,None], column_2nd[:,None], column_path[:,None]]

	def gets_rid_of_promoter_promoter_inter(array_):
		"""Mask that is False exactly on rows where both features are of kind 'p'."""
		kinds = np.c_[un_featurer(array_[:,0])[:,None], un_featurer(array_[:,1])[:,None]]
		return np.logical_not(np.all(kinds == ['p','p'], axis = 1))

	def report_size(path_length, reach_matrix):
		"""Print the number of connected off-diagonal pairs (each pair counted once)."""
		pair_count = (int(np.count_nonzero(reach_matrix)) - feature_count) // 2
		print('path: {0} size = {1}'.format(path_length, pair_count))

	# symmetric adjacency matrix with the diagonal set, so matrix powers accumulate reachability
	matrix_of_interactions = np.zeros((feature_count, feature_count), bool)
	matrix_of_interactions[np.arange(feature_count), np.arange(feature_count)] = True

	concat = np.r_[promoter_enhancer_inter, enhancer_enhancer_inter, promoter_promoter_inter]
	indexes_1 = np.array([dict_inter[row[1]] for row in concat], dtype = int)
	indexes_2 = np.array([dict_inter[row[2]] for row in concat], dtype = int)
	matrix_of_interactions[indexes_1, indexes_2] = True
	matrix_of_interactions[indexes_2, indexes_1] = True

	cumulative_old = np.zeros((feature_count, feature_count), bool)
	cumulative = matrix_of_interactions

	path = 1
	# XOR of boolean arrays: what `-` used to mean before NumPy removed boolean subtraction
	interactions_of_path = difference_interactions_prod(np.logical_xor(cumulative, cumulative_old), path)
	report_size(path, cumulative)

	while not np.array_equal(cumulative, cumulative_old):
		path += 1
		cumulative_old = cumulative
		# boolean matrix product: pairs reachable in at most `path` steps
		cumulative = np.dot(matrix_of_interactions, cumulative_old)
		difference = np.logical_xor(cumulative, cumulative_old)
		report_size(path, cumulative)

		# NOTE(review): the loop leaves before recording the pairs found at
		# path == max_path, so max_path acts as an exclusive bound - confirm.
		if path == max_path:
			break

		interactions_of_path = np.r_[interactions_of_path, difference_interactions_prod(difference, path)]

	mask = gets_rid_of_promoter_promoter_inter(interactions_of_path[:,[1,2]])
	interactions_of_path = interactions_of_path[mask]

	return interactions_of_path