def patternFormation(self, communicator, signature2pattern, of, no_cc,
                     edge_sig_matrix, no_of_datasets, min_cluster_size, debug):
    """
    Distribute edges over dataset signatures and emit completed patterns.

    History:
        08-07-05 datasetSignatureFname is outputed by fim_closed;
            intermediateFile is outputed by outputEdgeData()
        08-08-05 store the occurrence_vector in edge2occurrence_vector
        08-24-05 edge_sig_matrix replaces the intermediateFile
        01-04-06 1st part split out to be readin_signature2pattern()
        01-07-06 back to edge_sig_matrix; add min_cluster_size

    Each row of edge_sig_matrix is read as: the first two entries form the
    edge, the third entry is the edge's occurrence in binary (bit-mask) form.
    The edge is appended to every entry of signature2pattern whose signature
    is fully contained in that bit mask. Once an entry holds as many edges as
    its leading frequency value announces, it is handed to
    outputCcFromEdgeList() and removed from signature2pattern.

    Arguments:
        communicator: only its .rank is read, for the log messages.
        signature2pattern: mapping signature -> [frequency, edge, edge, ...];
            modified in place (edges appended, finished signatures deleted).
        of, no_cc, min_cluster_size: passed through to outputCcFromEdgeList().
        edge_sig_matrix: iterable of rows as described above.
        no_of_datasets, debug: not used by the active code (the debug
            tracing that consumed them is disabled).

    Calls:
        (loop)
            --outputCcFromEdgeList()
    """
    sys.stderr.write("node %s starts patternFormation ...\n"%(communicator.rank))
    codense2db_instance = codense2db()

    for matrix_row in edge_sig_matrix:
        edge = matrix_row[:2]
        # 08-24-05 already encoded when edge_sig_matrix is filled in
        occurrence_bits = matrix_row[2]

        completed_signatures = []
        for signature in signature2pattern:
            pattern = signature2pattern[signature]
            frequency = pattern[0]
            if (occurrence_bits & signature) == signature:
                pattern.append(edge)
                # the 1st entry is the frequency, hence the +1
                if len(pattern) == frequency + 1:
                    completed_signatures.append(signature)

        # Deletion is deferred to here so the dictionary is never resized
        # while it is being iterated above.
        for signature in completed_signatures:
            edge_list = signature2pattern[signature][1:]
            outputCcFromEdgeList(of, signature, edge_list, codense2db_instance,
                                 min_cluster_size, no_cc)
            del signature2pattern[signature]

    sys.stderr.write("node %s patternFormation done.\n"%(communicator.rank))
def init(self):
    """
    Prepare helper objects, the working directory and the table names.

    History:
        02-24-05 instantiate a class, create the temp directory if necessary
        03-20-05 name two descending tables

    Reads self.dir_files, self.debug and self.table. Sets the three helper
    instances, self.tmpinfname, self.tmpoutfname, self.crack_dict,
    self.argument1_dict, self.splat_table and self.mcl_table.

    Exits the process with status 2 when the derived mcl table name is
    identical to the splat table name.
    """
    from splat_to_db import splat_to_db
    from visualize.clustering_test import clustering_test
    from codense.codense2db import codense2db

    self.splat_to_db_instance = splat_to_db()
    self.clustering_test_instance = clustering_test()
    self.codense2db_instance = codense2db()

    working_dir = self.dir_files
    if os.path.isdir(working_dir):
        sys.stderr.write("Warning, directory %s already exists.\n"%(working_dir))
    else:
        os.makedirs(working_dir)

    self.tmpinfname = os.path.join(self.dir_files, 'input')
    self.tmpoutfname = os.path.join(self.dir_files, 'output')

    # Both dictionaries are keyed by the same integer codes (1 and 2).
    modes_cracker = crack_by_modes(self.debug)
    splat_cracker = crack_by_splat(self.debug)
    self.crack_dict = {1: modes_cracker, 2: splat_cracker}
    self.argument1_dict = {
        1: self.clustering_test_instance,
        2: self.splat_to_db_instance,
    }

    # two descending tables
    splat_table = '%ss'%self.table
    mcl_table = splat_table.replace('splat', 'mcl')
    self.splat_table = splat_table
    self.mcl_table = mcl_table
    if mcl_table == splat_table:
        # replace() found no 'splat' in the name, so both tables would collide
        sys.stderr.write("Error: new splat and mcl tables have the same name, %s\n"%self.splat_table)
        sys.exit(2)
def run(self):
    """
    Drive the MPI job: one input node, computing nodes, one output node.

    History:
        09-05-05 Watch: when sending via MPI, tag 0 means from node 0,
            tag 1 means goes to the last node.
        10-21-05 replace output_node() with the one from codense.common
            for better scheduling

    Roles by rank:
        rank 0              -> input_node(), and afterwards uniqueSort() of
                               the intermediate file into self.outputfile
        rank 1 .. size-2    -> computing_node()
        rank size-1 (last)  -> fills edge2encodedOccurrence, then collects
                               results through output_node() and writes them
                               to "<outputfile>.unsorted"

    The intermediate file is closed explicitly before the final barrier so
    that its contents are on disk before rank 0 reads it back; the original
    code relied on `del writer` to release the underlying file object.

    Calls:
        --fill_edge2encodedOccurrence()
        --input_node()
            --get_cluster_block()
        --computing_node()
            --node_fire()
        --output_node()
            --output_cluster()
        --uniqueSort()
    """
    communicator = MPI.world.duplicate()
    node_rank = communicator.rank
    last_rank = communicator.size - 1
    # intermediateFile to store concatenated results
    intermediateFile = "%s.unsorted" % self.outputfile

    if node_rank == last_rank:
        edge2encodedOccurrence = {}
        no_of_datasets = self.fill_edge2encodedOccurrence(
            self.hostname, self.dbname, self.schema, edge2encodedOccurrence, self.min_sup, self.max_sup
        )

    mpi_synchronize(communicator)

    if node_rank == 0:
        self.input_node(
            communicator, self.inputfile, self.min_size, self.cluster_block_size, self.cluster_block_edges
        )
    elif node_rank <= communicator.size - 2:  # exclude the last node
        self.computing_node(communicator, self.cluster_block_size, self.min_size, self.min_con)
    elif node_rank == last_rank:
        codense2db_instance = codense2db()
        free_computing_nodes = range(1, communicator.size - 1)
        intermediate_handle = open(intermediateFile, "w")
        try:
            writer = csv.writer(intermediate_handle, delimiter="\t")
            parameter_list = [writer, codense2db_instance, edge2encodedOccurrence, no_of_datasets]
            output_node(
                communicator,
                free_computing_nodes,
                parameter_list,
                self.output_cluster,
                report=self.report,
                type=Numeric.Int,
            )
        finally:
            # Flush and close deterministically, also when output_node() raises.
            intermediate_handle.close()

    mpi_synchronize(communicator)

    # collecting
    if node_rank == 0:
        MpiFromDatasetSignatureToPattern_instance = MpiFromDatasetSignatureToPattern()
        MpiFromDatasetSignatureToPattern_instance.uniqueSort(intermediateFile, self.outputfile)
def parse_cluster_fname(self, curs, cluster_fname, gim_inputfname, cluster_id_set, schema_instance):
    """
    Parse the selected clusters of cluster_fname and submit them to the database.

    History:
        01-24-06 a lot of analogy to codense2db.py's run()

    The file is read as tab-delimited rows. The 1-based row number is the
    cluster_id; only rows whose number is in cluster_id_set are parsed with
    fimbfs_parser(). Every cluster returned for such a row gets its
    unknown_gene_ratio and cluster_id set, is submitted to
    schema_instance.pattern_table and is recorded in the returned dictionary.

    Arguments:
        curs: database cursor handed to the codense2db helpers.
        cluster_fname: path of the tab-delimited cluster file.
        gim_inputfname: input for get_gene_no2incidence_array().
        cluster_id_set: container of wanted cluster ids (row numbers from 1).
        schema_instance: provides splat_table, mcl_table and pattern_table.

    Returns:
        dict mapping cluster_id -> [connectivity, unknown_gene_ratio, vertex_set].

    The input file is closed explicitly, also when parsing or the database
    submission raises; the original relied on `del reader` for that.
    """
    sys.stderr.write("Parsing cluster_fname: %s ...\n"%os.path.basename(cluster_fname))
    codense2db_instance = codense2db()
    codense2db_instance.create_tables(curs, schema_instance.splat_table,
                                      schema_instance.mcl_table, schema_instance.pattern_table)
    gene_id2gene_no = get_gene_id2gene_no(curs)
    gene_no2incidence_array = get_gene_no2incidence_array(gim_inputfname, gene_id2gene_no)
    known_gene_no2go_no_set = get_known_genes_dict(curs)

    counter = 0
    real_counter = 0
    cluster_id2properties = {}  # additional properties for prediction_pair2instance
    no_of_wanted_clusters = len(cluster_id_set)

    cluster_file = open(cluster_fname, 'r')
    try:
        reader = csv.reader(cluster_file, delimiter='\t')
        for row in reader:
            counter += 1
            # only those who are in cluster_id_set; cluster_id starts from 1
            if counter in cluster_id_set:
                cluster_list = codense2db_instance.fimbfs_parser(row, gene_no2incidence_array, curs)
                for cluster in cluster_list:
                    real_counter += 1
                    cluster.unknown_gene_ratio = codense2db_instance.calculate_unknown_gene_ratio(
                        cluster.vertex_set, known_gene_no2go_no_set)
                    cluster.cluster_id = counter  # line number is the cluster_id
                    codense2db_instance.db_submit(curs, cluster, schema_instance.pattern_table)
                    cluster_id2properties[cluster.cluster_id] = [
                        cluster.connectivity, cluster.unknown_gene_ratio, cluster.vertex_set]
            # NOTE(review): real_counter counts clusters, not rows. This early
            # exit presumably assumes fimbfs_parser() yields one cluster per
            # row - confirm against codense2db.
            if real_counter == no_of_wanted_clusters:
                # all relevant clusters have been got, ignore remaining clusters
                break
            if self.report and counter%2000 == 0:
                sys.stderr.write("%s%s/%s"%('\x08'*20, counter, real_counter))
        if self.report:
            sys.stderr.write("%s%s/%s"%('\x08'*20, counter, real_counter))
    finally:
        cluster_file.close()

    sys.stderr.write("Done.\n")
    return cluster_id2properties
def __init__(self, hostname='zhoudb', dbname='graphdb', schema=None, table=None, mcl_table=None,
             gene_p_table=None, gene_table=None, function=0, functioncolor='green', centralnode=1, mcl_id=1,
             type=1, output_fname=None, plot_type="dot", label=1):
    """
    Store the constructor arguments and set up empty lookup structures.

    function, centralnode, mcl_id, type and label are converted with int();
    every other argument is stored unchanged under an attribute of the same
    name.

    History:
        04-06-05 other initializations (edge table name, lookup
            dictionaries, codense2db helper)
    """
    # database location and table names
    self.hostname = hostname
    self.dbname = dbname
    self.schema = schema
    self.table = table
    self.mcl_table = mcl_table
    self.gene_p_table = gene_p_table
    self.gene_table = gene_table

    # selection and rendering options; numeric ones are coerced to int
    self.function = int(function)
    self.functioncolor = functioncolor
    self.centralnode = int(centralnode)
    self.mcl_id = int(mcl_id)
    self.type = int(type)
    self.output_fname = output_fname
    self.plot_type = plot_type
    self.label = int(label)

    # 04-06-05 other initializations
    # the table for edge_correlation_vector
    self.edge_table = 'edge_cor_vector'
    # mapping between go_no and go_id
    self.go_no2go_id = {}
    # mapping between go_no and go's name
    self.go_no2go_name = {}
    # mapping between gene_no and gene_id, in both directions
    self.gene_no2gene_id = {}
    self.gene_id2gene_no = {}
    self.global_gene_to_go_dict = {}
    self.label_dict = {}
    self.order_1st_id2all_clusters = {}
    self.codense2db_instance = codense2db()
def patternFormation(self, signature2pattern, node_outputfile, no_cc,
                     edge_sig_vector_queue, no_of_datasets, debug):
    """
    Consume edges from a queue, assign them to signatures, write finished patterns.

    History:
        08-07-05 datasetSignatureFname is outputed by fim_closed;
            intermediateFile is outputed by outputEdgeData()
        08-08-05 store the occurrence_vector in edge2occurrence_vector
        08-24-05 edge_sig_matrix replaces the intermediateFile
        01-04-06 1st part split out to be readin_signature2pattern()

    Rows are taken from edge_sig_vector_queue until the value -1 arrives.
    For each row the first two entries form the edge and the third entry is
    its occurrence in binary (bit-mask) form. The edge is appended to every
    entry of signature2pattern whose signature is fully contained in that
    mask. When an entry holds as many edges as its leading frequency value
    announces, it is written through outputCcFromEdgeList() to
    node_outputfile and removed from signature2pattern.

    Arguments:
        signature2pattern: mapping signature -> [frequency, edge, edge, ...];
            modified in place.
        node_outputfile: path opened for writing (truncated).
        no_cc: passed through to outputCcFromEdgeList().
        edge_sig_vector_queue: object with a blocking get(); -1 ends the loop.
        no_of_datasets: only used to decode the occurrence for debug output.
        debug: when true, trace matches and deletions on stderr.

    The output file is closed in a finally clause so it is released even
    when the queue or outputCcFromEdgeList() raises.

    Calls:
        (loop)
            --decodeOccurrenceToBv()  (debug only)
            --outputCcFromEdgeList()
    """
    sys.stderr.write("Thread of node %s starts patternFormation ...\n"%(self.rank))
    of = open(node_outputfile, 'w')
    try:
        codense2db_instance = codense2db()
        edge_occurrenceBinaryForm_row = edge_sig_vector_queue.get()
        # Keep the explicit `!= -1` comparison: the exact row type is not
        # visible here, so the sentinel test is left exactly as it was.
        while edge_occurrenceBinaryForm_row != -1:
            edge = edge_occurrenceBinaryForm_row[:2]
            # 08-24-05 already encoded when edge_sig_matrix is filled in
            occurrenceBinaryForm = edge_occurrenceBinaryForm_row[2]

            signatureToBeDeleted = []
            for signature in signature2pattern:
                pattern = signature2pattern[signature]
                frequency = pattern[0]
                if (occurrenceBinaryForm & signature) == signature:
                    pattern.append(edge)
                    if debug:
                        sys.stderr.write("the occurrence_vector of edge %s is %s\n"%(repr(edge),
                            repr(decodeOccurrenceToBv(occurrenceBinaryForm, no_of_datasets))))
                        sys.stderr.write("occurrence_vector's binary form is %s, signature is %s\n"%(occurrenceBinaryForm, signature))
                    if len(pattern) == frequency + 1:  # the 1st entry is frequency
                        signatureToBeDeleted.append(signature)
                        if debug:
                            sys.stderr.write("signature %s to be deleted, its pattern is %s\n"%(signature, repr(pattern)))

            # Deletion is deferred so the dictionary is not resized while it
            # is being iterated above.
            for signature in signatureToBeDeleted:
                edge_list = signature2pattern[signature][1:]
                outputCcFromEdgeList(of, signature, edge_list, codense2db_instance, no_cc)
                del signature2pattern[signature]

            edge_occurrenceBinaryForm_row = edge_sig_vector_queue.get()

        # NOTE(review): the threshold is 1, not 0, so a single leftover
        # signature is not reported - confirm whether that is intended.
        if len(signature2pattern) > 1:
            sys.stderr.write('Weird %s signatures are still available\n'%len(signature2pattern))
            if debug:
                sys.stderr.write('%s\n'%repr(signature2pattern))
    finally:
        of.close()
    sys.stderr.write("Thread of node %s patternFormation done.\n"%(self.rank))