def fill_edge2encodedOccurrence(self, sig_vector_fname, min_sup, max_sup, total_vertex_set=None):
    """
    04-04-06
        Build edge2encodedOccurrence from a tab-delimited signature-vector file.

        Each non-empty row is read as two integer vertex ids followed by an
        integer signature vector. An edge is kept when the sum of its
        signature vector lies within [min_sup, max_sup] (both inclusive).
    04-06-06
        Edges with a vertex missing from total_vertex_set are skipped (only
        when total_vertex_set is given and non-empty). The two vertex ids are
        stored in ascending order.

        sig_vector_fname: path of the tab-delimited input file.
        min_sup, max_sup: inclusive bounds on sum(sig_vector).
        total_vertex_set: optional container of allowed vertex ids.

        Returns (edge2encodedOccurrence, no_of_datasets) where
        edge2encodedOccurrence maps (vertex1, vertex2) to
        encodeOccurrenceBv(sig_vector) and no_of_datasets is the length of the
        first signature vector that passed the vertex filter (0 if none did).
    """
    sys.stderr.write("Getting edge2encodedOccurrence...\n")
    from MpiFromDatasetSignatureToPattern import encodeOccurrenceBv
    edge2encodedOccurrence = {}
    no_of_datasets = 0
    # 'with' guarantees the input file is closed, even if a row fails to parse.
    with open(sig_vector_fname) as inf:
        for row in csv.reader(inf, delimiter='\t'):
            if not row:
                # blank line in the input: nothing to parse
                continue
            # list comprehension (not map) so that .sort()/len() work on any Python version
            edge = [int(vertex) for vertex in row[:2]]
            # 04-06-06 any vertex of the edge doesn't appear in total_vertex_set, skip the edge
            if total_vertex_set and (edge[0] not in total_vertex_set or edge[1] not in total_vertex_set):
                continue
            edge.sort()  # 04-06-06 in ascending order
            sig_vector = [int(value) for value in row[2:]]
            if no_of_datasets == 0:
                no_of_datasets = len(sig_vector)
            support = sum(sig_vector)  # computed once, used for both bounds
            if min_sup <= support <= max_sup:
                edge2encodedOccurrence[tuple(edge)] = encodeOccurrenceBv(sig_vector)
    sys.stderr.write("Done.\n")
    return edge2encodedOccurrence, no_of_datasets
def fill_edge2encodedOccurrence(
    self, hostname, dbname, schema, edge2encodedOccurrence, min_sup, max_sup, edge_table="edge_cor_vector"
):
    """
    09-05-05
        get the edge2encodedOccurrence from the database

        Connects through db_connect(hostname, dbname, schema), walks
        edge_table 5000 rows at a time through a server-side cursor and, for
        every edge whose sum(sig_vector) lies within [min_sup, max_sup]
        (inclusive), stores encodeOccurrenceBv(sig_vector) in the caller's
        edge2encodedOccurrence dictionary under the key tuple(edge).

        edge_name and sig_vector are read as text of the form "{a,b,...}":
        the first and last characters are stripped and the rest is split on
        commas and converted to integers.

        Returns no_of_datasets, the length of the first signature vector
        read (0 when the table is empty).
    """
    sys.stderr.write("Getting edges...\n")
    (conn, curs) = db_connect(hostname, dbname, schema)
    # NOTE(review): conn is neither closed nor returned here; presumably it is
    # released when it goes out of scope -- confirm against db_connect.
    # The table name is an identifier and cannot be sent as a bound parameter,
    # so edge_table must come from a trusted source.
    curs.execute("DECLARE crs CURSOR FOR select edge_name,sig_vector from %s" % (edge_table))
    curs.execute("fetch 5000 from crs")
    rows = curs.fetchall()
    no_of_datasets = 0
    while rows:
        for edge_name, sig_vector_text in rows:
            # list comprehensions (not map) so that len()/sum() work on any Python version
            edge = [int(vertex) for vertex in edge_name[1:-1].split(",")]
            sig_vector = [int(value) for value in sig_vector_text[1:-1].split(",")]
            if no_of_datasets == 0:
                no_of_datasets = len(sig_vector)
            support = sum(sig_vector)  # computed once, used for both bounds
            if min_sup <= support <= max_sup:
                edge2encodedOccurrence[tuple(edge)] = encodeOccurrenceBv(sig_vector)
        curs.execute("fetch 5000 from crs")
        rows = curs.fetchall()
    # Release the server-side cursor, as the other cursor-based readers in this file do.
    curs.execute("close crs")
    sys.stderr.write("Done.\n")
    return no_of_datasets
def get_recurrence_go_no_rec_array_cluster_id_ls(self, curs, pattern_table, mcl_id2go_no_set, occurrence_cutoff=0.8):
    """
    2006-09-26
        from pattern_table and use mcl_id2go_no_set
        go_no_list is the go_id Set
        mcl_id2enc_recurrence is for get_recurrence_rec_array_bs_no_list()

        Reads (id, recurrence_array) from pattern_table 5000 rows at a time.
        Only patterns whose id is a key of mcl_id2go_no_set are used. Their
        recurrence_array (text of the form "{f1,f2,...}" holding floats) is
        turned into a 0/1 vector: 1 where the value >= occurrence_cutoff.
        The encoded vectors of all patterns sharing a go_no are OR-ed
        together and the pattern ids are collected per go_no.

        curs: database cursor exposing execute() and fetchall().
        pattern_table: name of the table to read; it is interpolated into the
            SQL text, so it must come from a trusted source.
        mcl_id2go_no_set: maps pattern id to an iterable of go_no.
        occurrence_cutoff: threshold applied to each recurrence value.
            Defaults to 0.8, the (arbitrary) value that used to be hard-coded.

        Reads self.report: when true, a progress counter is written to stderr
        after every fetched batch.

        Returns (recurrence_go_no_rec_array_cluster_id_ls, no_of_datasets,
        mcl_id2enc_recurrence):
            - a sorted list of [recurrence, go_no, recurrence_array, mcl_id_set]
              where recurrence_array = decodeOccurrence(OR-ed encoding) and
              recurrence is its length,
            - the length of the first recurrence vector used (0 if none),
            - a dictionary from pattern id to its encoded recurrence.
    """
    sys.stderr.write("Getting recurrence_go_no_rec_array_cluster_id_ls...\n")
    no_of_datasets = 0
    go_no2recurrence_cluster_id = {}
    mcl_id2enc_recurrence = {}
    curs.execute("DECLARE crs CURSOR FOR SELECT id, recurrence_array from %s" % pattern_table)
    curs.execute("fetch 5000 from crs")
    rows = curs.fetchall()
    counter = 0  # rows seen
    real_counter = 0  # rows whose pattern has functions predicted
    while rows:
        for mcl_id, recurrence_text in rows:
            counter += 1
            if mcl_id not in mcl_id2go_no_set:
                # this pattern has no functions predicted
                continue
            # the stored array is not binary 0/1; threshold it into one
            recurrence_array = [
                int(float(value) >= occurrence_cutoff)
                for value in recurrence_text[1:-1].split(',')
            ]
            if no_of_datasets == 0:
                no_of_datasets = len(recurrence_array)
            encoded_recurrence = encodeOccurrenceBv(recurrence_array)
            mcl_id2enc_recurrence[mcl_id] = encoded_recurrence  # 2006-09-26
            for go_no in mcl_id2go_no_set[mcl_id]:
                entry = go_no2recurrence_cluster_id.get(go_no)
                if entry is None:
                    # use Set() because mcl_id has duplicates due to different p-values
                    go_no2recurrence_cluster_id[go_no] = [encoded_recurrence, Set([mcl_id])]
                else:
                    entry[0] = entry[0] | encoded_recurrence
                    entry[1].add(mcl_id)
            real_counter += 1
        if self.report:
            # '\x08' is backspace: overwrite the previous progress text in place
            sys.stderr.write("%s%s\t%s" % ('\x08'*20, counter, real_counter))
        curs.execute("fetch 5000 from crs")
        rows = curs.fetchall()
    curs.execute("close crs")

    recurrence_go_no_rec_array_cluster_id_ls = []
    for go_no in go_no2recurrence_cluster_id:
        encoded_recurrence, mcl_id_set = go_no2recurrence_cluster_id[go_no]
        # not a binary vector; presumably the positions of the set bits -- confirm against decodeOccurrence
        recurrence_array = decodeOccurrence(encoded_recurrence)
        recurrence = len(recurrence_array)
        recurrence_go_no_rec_array_cluster_id_ls.append([recurrence, go_no, recurrence_array, mcl_id_set])
    recurrence_go_no_rec_array_cluster_id_ls.sort()
    sys.stderr.write("End getting recurrence_go_no_rec_array_cluster_id_ls.\n")
    return recurrence_go_no_rec_array_cluster_id_ls, no_of_datasets, mcl_id2enc_recurrence
def get_gene2enc_array(self, gim_inputfname, gene_id2no):
    """
    Build gene2enc_array from a tab-delimited file.

    Each non-empty row is read as: first column (ignored here), then the
    occurrence array as integers, then the gene id in the last column.
    Rows whose gene id is not a key of gene_id2no are ignored; blank rows
    are skipped.

        gim_inputfname: path of the tab-delimited input file.
        gene_id2no: maps gene id (as it appears in the file) to gene number.

        Returns a dictionary mapping gene number to
        encodeOccurrenceBv(occurrence array).
    """
    sys.stderr.write("Getting gene2enc_array...\n")
    gene2enc_array = {}
    # 'with' guarantees the input file is closed, even if a row fails to parse.
    with open(gim_inputfname) as inf:
        for row in csv.reader(inf, delimiter='\t'):
            if not row:
                # blank line in the input: there is no gene id to look up
                continue
            gene_id = row[-1]
            if gene_id in gene_id2no:
                # row[0] is not needed; the occurrence array sits between it and the gene id
                occ_array = [int(value) for value in row[1:-1]]
                gene2enc_array[gene_id2no[gene_id]] = encodeOccurrenceBv(occ_array)
    sys.stderr.write("End getting gene2enc_array.\n")
    return gene2enc_array