def dstruc_loadin(self, curs):
    """
    Fill this object's lookup tables from the database behind curs.

    Sets the following attributes, each from the codense.common helper
    of the matching name:
        go_no2go_id, go_no2go_name, gene_no2gene_id, gene_id2gene_no,
        global_gene_to_go_dict (from get_gene_no2go_no).
    Also sets label_dict, which maps
        1 -> gene_no2gene_id
        2 -> an identity mapping over the same gene_no keys.
    When self.type is 3, no_of_datasets is set from the upper bound of
    recurrence_array in the first row of self.table.

    curs: a database cursor offering execute() and fetchall().
    """
    sys.stderr.write("Loading Data STructure...\n")
    from codense.common import get_go_no2go_id, get_gene_no2gene_id, get_go_no2name, \
        get_gene_id2gene_no, get_gene_no2go_no

    self.go_no2go_id = get_go_no2go_id(curs)
    self.go_no2go_name = get_go_no2name(curs)
    self.gene_no2gene_id = get_gene_no2gene_id(curs)
    self.gene_id2gene_no = get_gene_id2gene_no(curs)
    self.global_gene_to_go_dict = get_gene_no2go_no(curs)

    # 04-01-05 second labelling scheme: every gene_no is labelled by itself
    identity_labels = dict((gene_no, gene_no) for gene_no in self.gene_no2gene_id)
    self.label_dict = {1: self.gene_no2gene_id, 2: identity_labels}

    # NOTE(review): the result of this query is never fetched in this method;
    # confirm whether it is still needed before removing it.
    curs.execute("select gene_no,go_functions from gene")

    if self.type == 3:
        # the table name cannot be a bound parameter, hence the string formatting
        dataset_count_query = "select array_upper(recurrence_array,1) from %s limit 1"%self.table
        curs.execute(dataset_count_query)
        first_row = curs.fetchall()[0]
        self.no_of_datasets = int(first_row[0])
    sys.stderr.write("Done\n")
def run(self):
    """
    Run the MPI job; what a process does depends on its rank.

    rank 0
        connects to the database, builds gene2enc_array from
        self.gim_inputfname, sends it (pickled) to every computing node,
        sends gene_no2id and gene_no2go_no (pickled) to the last node,
        then declares the cursor 'crs' over the pattern/good_cluster join
        and hands it to input_node().
    ranks 1 .. size-2 (free_computing_nodes)
        receive gene2enc_array and run computing_node() with
        self.computing_node_handler.
    rank size-1
        receives gene_no2id and gene_no2go_no, makes sure
        self.pic_output_dir exists, opens a tab-delimited output file
        named '<good_cluster_table>_p<p_value_cut_off>' in it and runs
        output_node() with self.output_node_handler.

    All ranks meet at mpi_synchronize() between the set-up phase and the
    processing phase.

    The output file is closed explicitly when output_node() returns or
    raises, rather than relying on 'del writer' to release it.
    """
    communicator = MPI.world.duplicate()
    node_rank = communicator.rank
    free_computing_nodes = range(1,communicator.size-1)    #exclude the last node

    # ---- phase 1: distribute the shared data structures ----
    if node_rank == 0:
        (conn, curs) = db_connect(self.hostname, self.dbname, self.schema)
        schema_instance = form_schema_tables(self.fname, self.acc_cutoff, self.lm_bit)
        gene_id2no = get_gene_id2gene_no(curs)
        gene2enc_array = self.get_gene2enc_array(self.gim_inputfname, gene_id2no)
        gene2enc_array_pickle = cPickle.dumps(gene2enc_array, -1)

        gene_no2id = get_gene_no2gene_id(curs)
        gene_no2go_no = get_gene_no2go_no(curs)
        gene_no2id_pickle = cPickle.dumps(gene_no2id, -1)
        gene_no2go_no_pickle = cPickle.dumps(gene_no2go_no, -1)
        for node in free_computing_nodes:    #send it to the computing_node
            communicator.send(gene2enc_array_pickle, node, 0)

        # The last node unpickles these in the same order they are sent here.
        communicator.send(gene_no2id_pickle, communicator.size-1, 0)
        communicator.send(gene_no2go_no_pickle, communicator.size-1, 0)
    elif node_rank in free_computing_nodes:
        data, source, tag = communicator.receiveString(0, 0)
        gene2enc_array = cPickle.loads(data)    #take the data
    elif node_rank==communicator.size-1:
        schema_instance = form_schema_tables(self.fname, self.acc_cutoff, self.lm_bit)
        data, source, tag = communicator.receiveString(0, 0)
        gene_no2id = cPickle.loads(data)
        data, source, tag = communicator.receiveString(0, 0)
        gene_no2go_no = cPickle.loads(data)

    mpi_synchronize(communicator)

    # ---- phase 2: input / compute / output ----
    if node_rank == 0:
        # table names cannot be bound parameters, hence the string formatting
        curs.execute("DECLARE crs CURSOR FOR SELECT p.id, p.vertex_set, p.edge_set, p.recurrence_array, "
            "g.go_no_list from %s p, %s g where g.mcl_id=p.id"%(schema_instance.pattern_table, schema_instance.good_cluster_table))
        input_node(communicator, curs, free_computing_nodes, self.message_size, self.report)
    elif node_rank in free_computing_nodes:
        parameter_list = [gene2enc_array, self.dataset_signature_set, self.p_value_cut_off]
        computing_node(communicator, parameter_list, self.computing_node_handler, report=self.report)
    elif node_rank==communicator.size-1:
        if not os.path.isdir(self.pic_output_dir):
            os.makedirs(self.pic_output_dir)
        cluster_info_instance = cluster_info()
        ofname = os.path.join(self.pic_output_dir, '%s_p%s'%(schema_instance.good_cluster_table, self.p_value_cut_off))
        # Keep our own handle on the file so it can be closed (and flushed)
        # deterministically, even if output_node() raises.
        outf = open(ofname, 'w')
        try:
            writer = csv.writer(outf, delimiter='\t')
            parameter_list = [self.pic_output_dir, cluster_info_instance, gene_no2id, gene_no2go_no, writer]
            output_node(communicator, free_computing_nodes, parameter_list, self.output_node_handler, self.report)
        finally:
            outf.close()
def parse_cluster_fname(self, curs, cluster_fname, gim_inputfname, cluster_id_set, schema_instance):
    """
    01-24-06
        a lot of analogy to codense2db.py's run()

    Read the tab-delimited cluster_fname, parse only the lines whose
    1-based line number is in cluster_id_set, submit every resulting
    cluster to schema_instance.pattern_table and return
        {cluster_id: [connectivity, unknown_gene_ratio, vertex_set]}
    where cluster_id is the line number.

    curs: database cursor passed on to the codense2db helpers.
    cluster_fname: path of the cluster file.
    gim_inputfname: passed to get_gene_no2incidence_array().
    cluster_id_set: line numbers (starting from 1) to keep.
    schema_instance: supplies splat_table, mcl_table and pattern_table.

    Reading stops once every line number in cluster_id_set has been seen.
    The stop condition counts matched lines, not parsed clusters, so it
    stays correct if fimbfs_parser() returns zero or several clusters for
    one line. Note that several clusters from one line share a cluster_id,
    so only the last one remains in the returned dictionary.

    The input file is closed explicitly, also when parsing raises.
    """
    sys.stderr.write("Parsing cluster_fname: %s ...\n"%os.path.basename(cluster_fname))
    codense2db_instance = codense2db()
    codense2db_instance.create_tables(curs, schema_instance.splat_table, \
        schema_instance.mcl_table, schema_instance.pattern_table)

    gene_id2gene_no = get_gene_id2gene_no(curs)
    gene_no2incidence_array = get_gene_no2incidence_array(gim_inputfname, gene_id2gene_no)
    known_gene_no2go_no_set = get_known_genes_dict(curs)

    counter = 0    #current line number
    real_counter = 0    #clusters submitted so far, used in the progress report
    matched_lines = 0    #lines whose number was in cluster_id_set
    no_of_wanted_lines = len(cluster_id_set)
    cluster_id2properties = {}    #additional properties for prediction_pair2instance
    inf = open(cluster_fname, 'r')
    try:
        reader = csv.reader(inf, delimiter='\t')
        for row in reader:
            counter += 1
            #only those who are in cluster_id_set
            if counter in cluster_id_set:    #cluster_id starts from 1
                matched_lines += 1
                cluster_list = codense2db_instance.fimbfs_parser(row, gene_no2incidence_array, curs)
                for cluster in cluster_list:
                    real_counter += 1
                    cluster.unknown_gene_ratio = codense2db_instance.calculate_unknown_gene_ratio(cluster.vertex_set, \
                        known_gene_no2go_no_set)
                    cluster.cluster_id = counter    #line number is the cluster_id
                    codense2db_instance.db_submit(curs, cluster, schema_instance.pattern_table)
                    cluster_id2properties[cluster.cluster_id] = [cluster.connectivity, cluster.unknown_gene_ratio, cluster.vertex_set]
            if matched_lines==no_of_wanted_lines:    #all relevant lines have been read, ignore the remaining ones
                break
            if self.report and counter%2000==0:
                sys.stderr.write("%s%s/%s"%('\x08'*20, counter, real_counter))
    finally:
        inf.close()
    if self.report:
        sys.stderr.write("%s%s/%s"%('\x08'*20, counter, real_counter))
    sys.stderr.write("Done.\n")
    return cluster_id2properties