def get_recurrence_go_no_rec_array_cluster_id_ls(self, curs, pattern_table, mcl_id2go_no_set):
	"""
	2006-09-26
		from pattern_table and use mcl_id2go_no_set
		go_no_list is the go_id Set
		mcl_id2enc_recurrence is for get_recurrence_rec_array_bs_no_list()
	"""
	sys.stderr.write("Getting recurrence_go_no_rec_array_cluster_id_ls...\n")
	no_of_datasets = 0
	go_no2recurrence_cluster_id = {}
	mcl_id2enc_recurrence = {}
	curs.execute("DECLARE crs CURSOR FOR SELECT id, recurrence_array from %s"\
		%pattern_table)
	curs.execute("fetch 5000 from crs")
	rows = curs.fetchall()
	counter = 0
	real_counter = 0
	while rows:
		for row in rows:
			mcl_id, recurrence_array = row
			if mcl_id in mcl_id2go_no_set:	#if this pattern has functions predicted
				recurrence_array = recurrence_array[1:-1].split(',')
				recurrence_array = map(float, recurrence_array)	#this is not a binary 0/1 array
				occurrence_cutoff_func = lambda x: int(x>=0.8)	#0.8 is arbitrary
				recurrence_array = map(occurrence_cutoff_func, recurrence_array)
				if no_of_datasets == 0:
					no_of_datasets = len(recurrence_array)
				go_no_list = mcl_id2go_no_set[mcl_id]
				encoded_recurrence = encodeOccurrenceBv(recurrence_array)
				mcl_id2enc_recurrence[mcl_id] = encoded_recurrence	#2006-09-26
				for go_no in go_no_list:
					if go_no not in go_no2recurrence_cluster_id:
						go_no2recurrence_cluster_id[go_no] = [encoded_recurrence, Set([mcl_id])]
						#use Set() because mcl_id has duplicates due to different p-values
					else:
						go_no2recurrence_cluster_id[go_no][0] = \
							go_no2recurrence_cluster_id[go_no][0] | encoded_recurrence
						go_no2recurrence_cluster_id[go_no][1].add(mcl_id)
				real_counter += 1
			counter += 1
		if self.report:
			sys.stderr.write("%s%s\t%s"%('\x08'*20, counter, real_counter))
		curs.execute("fetch 5000 from crs")
		rows = curs.fetchall()
	curs.execute("close crs")
	recurrence_go_no_rec_array_cluster_id_ls = []
	for go_no in go_no2recurrence_cluster_id:
		encoded_recurrence, mcl_id_set = go_no2recurrence_cluster_id[go_no]
		recurrence_array = decodeOccurrence(encoded_recurrence)	#not binary vector
		recurrence = len(recurrence_array)
		recurrence_go_no_rec_array_cluster_id_ls.append([recurrence, go_no, recurrence_array, mcl_id_set])
	recurrence_go_no_rec_array_cluster_id_ls.sort()
	sys.stderr.write("End getting recurrence_go_no_rec_array_cluster_id_ls.\n")
	return recurrence_go_no_rec_array_cluster_id_ls, no_of_datasets, mcl_id2enc_recurrence
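
#--- Hedged sketch: encodeOccurrenceBv() and decodeOccurrence() are defined elsewhere in
#	this codebase and only called above. The helpers below are NOT the real implementations;
#	they are a minimal sketch of the bit-vector contract inferred from how the method above
#	uses them: encodings can be OR-ed to union occurrences, and decoding yields 1-based
#	dataset indices whose count equals the recurrence.
def encodeOccurrenceBv_sketch(binary_array):
	#assumption: position i (0-based) of the 0/1 array maps to bit i of an integer
	encoded = 0
	for i, flag in enumerate(binary_array):
		if flag:
			encoded |= 1 << i
	return encoded

def decodeOccurrence_sketch(encoded):
	#assumption: returns the 1-based indices of the set bits, e.g. 0b101 -> [1, 3]
	index_ls = []
	i = 1
	while encoded:
		if encoded & 1:
			index_ls.append(i)
		encoded >>= 1
		i += 1
	return index_ls
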
def get_recurrence_rec_array_bs_no_list(self, curs, cluster_bs_table, mcl_id2enc_recurrence):
	"""
	11-01-05
		for each binding site number (bs_no), union the recurrence bit-vectors of all
		patterns (mcl_id) it appears in, then decode them into recurrence arrays
	"""
	sys.stderr.write("Getting recurrence_rec_array_bs_no_list...\n")
	bs_no2enc_recurrence = {}
	curs.execute("DECLARE crs CURSOR FOR select c.mcl_id, c.bs_no_list from %s c"%(cluster_bs_table))
	curs.execute("fetch 5000 from crs")
	rows = curs.fetchall()
	counter = 0
	real_counter = 0
	while rows:
		for row in rows:
			mcl_id, bs_no_list = row
			if mcl_id in mcl_id2enc_recurrence:
				encoded_recurrence = mcl_id2enc_recurrence[mcl_id]
				bs_no_list = bs_no_list[1:-1].split(',')
				bs_no_list = map(int, bs_no_list)
				for bs_no in bs_no_list:
					if bs_no not in bs_no2enc_recurrence:
						bs_no2enc_recurrence[bs_no] = encoded_recurrence
					else:
						bs_no2enc_recurrence[bs_no] |= encoded_recurrence
				real_counter += 1
			counter += 1
		if self.report:
			sys.stderr.write("%s%s/%s"%('\x08'*20, counter, real_counter))
		curs.execute("fetch 10000 from crs")
		rows = curs.fetchall()
	curs.execute("close crs")
	recurrence_rec_array_bs_no_list = []
	for bs_no, enc_recurrence in bs_no2enc_recurrence.iteritems():
		recurrence_array = decodeOccurrence(enc_recurrence)	#not binary vector
		recurrence = len(recurrence_array)
		recurrence_rec_array_bs_no_list.append([recurrence, recurrence_array, bs_no])
	recurrence_rec_array_bs_no_list.sort()
	sys.stderr.write("End getting recurrence_rec_array_bs_no_list.\n")
	return recurrence_rec_array_bs_no_list
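
#--- Hypothetical usage sketch, not part of this module: it only shows how the two
#	methods above are meant to be chained through mcl_id2enc_recurrence; the argument
#	names (self_obj, curs, pattern_table, cluster_bs_table, mcl_id2go_no_set) are
#	assumptions mirroring the method signatures.
def chain_recurrence_queries_sketch(self_obj, curs, pattern_table, cluster_bs_table, mcl_id2go_no_set):
	#first pass: per-GO recurrence plus the pattern-id -> encoded-recurrence map
	recurrence_go_no_ls, no_of_datasets, mcl_id2enc_recurrence = \
		self_obj.get_recurrence_go_no_rec_array_cluster_id_ls(curs, pattern_table, mcl_id2go_no_set)
	#second pass: per-binding-site recurrence, reusing the encoded map
	recurrence_rec_array_bs_no_list = \
		self_obj.get_recurrence_rec_array_bs_no_list(curs, cluster_bs_table, mcl_id2enc_recurrence)
	return recurrence_go_no_ls, no_of_datasets, recurrence_rec_array_bs_no_list
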
def get_core_vertex_set(self, vertex_list, recurrence_array, degree_cut_off):
	"""
	12-16-05
		global structures used:
		--self.edge2encodedOccurrence
	12-18-05
		expand to all datasets
		--init_graph_from_vertex_set()
		--decodeOccurrence()
		--remove_loose_part_of_graph()
			--remove_singleton_vertices()
			--get_vertex_min_degree()
	"""
	no_of_datasets = len(recurrence_array)
	#initialize all graphs
	graph_list = [None]*no_of_datasets
	anti_vertex_id_list = [None]*no_of_datasets
	recurrence_set = Set()
	for i in range(no_of_datasets):
		graph_list[i], anti_vertex_id_list[i] = self.init_graph_from_vertex_set(vertex_list)
		if recurrence_array[i] == 1:
			recurrence_set.add(i)
	no_of_vertices = len(vertex_list)
	#vertex_list.sort()	#presorted
	#construct graphs for each 'on' dataset
	for i in range(no_of_vertices):
		for j in range(i+1, no_of_vertices):
			edge_tuple = (vertex_list[i], vertex_list[j])
			"""
			if self.debug:
				print "checking", edge_tuple
			"""
			if edge_tuple in self.edge2encodedOccurrence:
				edge_recurrence = decodeOccurrence(self.edge2encodedOccurrence[edge_tuple])	#starting from 1
				"""
				if self.debug:
					print "edge_recurrence", edge_recurrence
				"""
				for k in edge_recurrence:
					index = k-1
					v_descriptor1 = anti_vertex_id_list[index][vertex_list[i]]
					v_descriptor2 = anti_vertex_id_list[index][vertex_list[j]]
					graph_list[index].add_edge(v_descriptor1, v_descriptor2)
	#remove loose part for each graph
	on_dataset_index_ls = [0]*no_of_datasets
	for i in range(no_of_datasets):
		if graph_list[i].num_edges()>1:	#at least the graph has two edges
			degree_percentage = self.remove_loose_part_of_graph(graph_list[i], degree_cut_off)
			if graph_list[i].num_vertices()>=4:	#min graph size
				"""
				if self.debug:
					print "graph %s has %s vertices remaining with degree_percentage: %s."%(i, graph_list[i].num_vertices(), degree_percentage)
				"""
				on_dataset_index_ls[i] = 1	#this dataset should be counted as 'on'
			else:
				on_dataset_index_ls[i] = 0
	#find core vertex_list only in those recurrent 'on' datasets
	vertex_id2occurrence = {}
	recurrent_and_on_datasets_ls = []
	on_but_not_recurrent_dataset2vertex_set = {}
	for i in range(no_of_datasets):
		if recurrence_array[i] == 1 and on_dataset_index_ls[i] == 1:
			recurrent_and_on_datasets_ls.append(i)
			for v in graph_list[i].vertices:
				vertex_id = graph_list[i].vertex_properties['vertex_id'][v]
				if vertex_id not in vertex_id2occurrence:
					vertex_id2occurrence[vertex_id] = 0
				vertex_id2occurrence[vertex_id] += 1
		if recurrence_array[i] == 0 and on_dataset_index_ls[i] == 1:
			on_but_not_recurrent_dataset2vertex_set[i] = Set()
			for v in graph_list[i].vertices:
				vertex_id = graph_list[i].vertex_properties['vertex_id'][v]
				on_but_not_recurrent_dataset2vertex_set[i].add(vertex_id)
	#only vertices in recurrent and 'on' datasets go into core_vertex_set
	core_vertex_set = Set()
	for vertex_id in vertex_id2occurrence:
		if vertex_id2occurrence[vertex_id] == len(recurrent_and_on_datasets_ls):
			core_vertex_set.add(vertex_id)
	#find other on datasets from on_but_not_recurrent_dataset2vertex_set
	for dataset_no, vertex_set in on_but_not_recurrent_dataset2vertex_set.iteritems():
		intersection_set = core_vertex_set & vertex_set
		if len(intersection_set)==len(core_vertex_set):
			recurrent_and_on_datasets_ls.append(dataset_no)
	core_vertex_ls = list(core_vertex_set)
	core_vertex_ls.sort()
	recurrent_and_on_datasets_ls.sort()
	return core_vertex_ls, recurrent_and_on_datasets_ls
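
#--- Illustration only, not called anywhere in this module: once the per-dataset graphs have
#	been pruned, the core-vertex logic of get_core_vertex_set() reduces to set intersections.
#	The sketch below restates that logic with plain built-in sets, assuming each recurrent
#	'on' dataset and each other 'on' dataset has already been reduced to its surviving
#	vertex ids (the container names are assumptions).
def core_vertex_sketch(recurrent_on_vertex_sets, other_on_dataset2vertex_set):
	#core vertices are the ones present in every recurrent-and-'on' dataset,
	#which is what the occurrence-count == len(recurrent_and_on_datasets_ls) test amounts to
	core_vertex_set = set()
	if recurrent_on_vertex_sets:
		core_vertex_set = set.intersection(*[set(s) for s in recurrent_on_vertex_sets])
	#a non-recurrent but 'on' dataset is also kept if it contains the whole core
	extra_on_datasets = [dataset_no for dataset_no, vertex_set in other_on_dataset2vertex_set.items() \
		if core_vertex_set <= set(vertex_set)]
	return sorted(core_vertex_set), sorted(extra_on_datasets)
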