def seed_column_members(data_matrix, row_membership, num_clusters,
                        num_clusters_per_column):
    """Default column membership seeder ('best')

    In case of multiple input ratio matrices, we assume that these
    matrices have been combined into data_matrix"""
    num_rows = data_matrix.num_rows
    num_cols = data_matrix.num_columns
    # create a submatrix for each cluster
    cscores = np.zeros([data_matrix.num_columns, num_clusters])
    for cluster_num in xrange(1, num_clusters + 1):
        current_cluster_rows = []
        for row_index in xrange(num_rows):
            if row_membership[row_index][0] == cluster_num:
                current_cluster_rows.append(data_matrix.row_names[row_index])
        submatrix = data_matrix.submatrix_by_name(row_names=current_cluster_rows)
        _, scores = scoring.compute_column_scores_submatrix(submatrix)
        cscores.T[cluster_num - 1] = -scores

    start_time = util.current_millis()
    column_members = [util.rorder(cscores[i], num_clusters_per_column)
                      for i in xrange(num_cols)]
    elapsed = util.current_millis() - start_time
    logging.debug("seed column members in %f s.", elapsed / 1000.0)
    return column_members
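
# Illustrative sketch (hypothetical helper, not util.rorder itself): what the
# seeding step above needs from rorder. Each column's score vector holds one
# negated column score per cluster, so the "best" clusters are the ones with
# the largest values; we assume a 1-based, descending convention similar to
# R's order(..., decreasing=TRUE) -- util.rorder's actual contract lives in util.
import numpy as np

def _rorder_demo(values, num_values):
    """indices (1-based) of the num_values largest entries, best first"""
    order = np.argsort(values)[::-1]          # descending by value
    return [int(i) + 1 for i in order[:num_values]]

# each column would be seeded into its two best-scoring clusters:
# _rorder_demo(np.array([0.1, 2.5, -0.3, 1.7]), 2)  ->  [2, 4]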
def __init__(self, id, organism, membership, ratios, seqtype, config_params=None):
    """creates a ScoringFunction"""
    scoring.ScoringFunctionBase.__init__(self, id, organism, membership,
                                         ratios, config_params=config_params)
    # attributes accessible by subclasses
    self.seqtype = seqtype
    self.__setup_meme_suite(config_params)
    self.num_motif_func = util.get_iter_fun(config_params['MEME'], "nmotifs",
                                            config_params['num_iterations'])

    self.__last_motif_infos = None
    self.__last_iteration_result = {}
    self.all_pvalues = None
    self.last_result = None

    self.update_log = scoring.RunLog("motif-score-" + seqtype, config_params)
    self.motif_log = scoring.RunLog("motif-motif-" + seqtype, config_params)

    used_genes = sorted(ratios.row_names)
    self.used_seqs = organism.sequences_for_genes_scan(used_genes,
                                                       seqtype=self.seqtype)

    logging.debug("building reverse map...")
    start_time = util.current_millis()
    self.reverse_map = self.__build_reverse_map(ratios)
    logging.debug("reverse map built in %d ms.",
                  util.current_millis() - start_time)

    self.__last_results = None  # caches the results of the previous meme run
def compute_row_scores(membership, matrix, num_clusters, config_params):
    """for each cluster 1, 2, .. num_clusters compute the row scores
    for each row name in the input name matrix"""
    start_time = util.current_millis()
    cluster_row_scores = __compute_row_scores_for_clusters(
        membership, matrix, num_clusters, config_params)
    # TODO: replace the nan/inf-Values with the quantile-thingy in the R-version

    logging.debug("__compute_row_scores_for_clusters() in %f s.",
                  (util.current_millis() - start_time) / 1000.0)

    # rearrange result into a DataMatrix, where rows are indexed by gene
    # and columns represent clusters
    start_time = util.current_millis()
    values = np.zeros((matrix.num_rows, num_clusters))

    # note that cluster indices are 0-based in the matrix
    for cluster in xrange(num_clusters):
        row_scores = cluster_row_scores[cluster]
        values[:, cluster] = row_scores
    result = dm.DataMatrix(matrix.num_rows, num_clusters,
                           row_names=matrix.row_names,
                           values=values)
    logging.debug("made result matrix in %f s.",
                  (util.current_millis() - start_time) / 1000.0)
    return result
def run_iteration(self, iteration, force=False):
    """Run a single cMonkey iteration

    Keyword arguments:
    iteration -- The iteration number to run
    force     -- Set to True to force recalculations (default: False)
    """
    logging.info("Iteration # %d", iteration)
    iteration_result = {'iteration': iteration, 'score_means': {}}
    if force:
        rscores = self.row_scoring.compute_force(iteration_result)
    else:
        rscores = self.row_scoring.compute(iteration_result)

    start_time = util.current_millis()
    if force:
        cscores = self.column_scoring.compute_force(iteration_result)
    else:
        cscores = self.column_scoring.compute(iteration_result)
    elapsed = util.current_millis() - start_time
    if elapsed > 0.0001:
        logging.debug("computed column_scores in %f s.", elapsed / 1000.0)

    self.membership().update(self.ratios, rscores, cscores,
                             self['num_iterations'], iteration_result)

    mean_net_score = 0.0
    mean_mot_pvalue = "NA"
    if 'networks' in iteration_result:
        mean_net_score = iteration_result['networks']
    if 'motif-pvalue' in iteration_result:
        mean_mot_pvalue = ""
        mean_mot_pvalues = iteration_result['motif-pvalue']
        for seqtype in mean_mot_pvalues:
            mean_mot_pvalue += " '%s' = %f" % (seqtype, mean_mot_pvalues[seqtype])

    logging.debug('mean net = %s | mean mot = %s',
                  str(mean_net_score), mean_mot_pvalue)

    # Reduce I/O: skip writing results to the database when minimize_io is set
    if not self['minimize_io']:
        if iteration == 1 or (iteration % self['result_freq'] == 0):
            self.write_results(iteration_result)

    # This should not be too much writing, so we can keep it out of the
    # minimize_io option...?
    if iteration == 1 or (iteration % self['stats_freq'] == 0):
        self.write_stats(iteration_result)

    self.update_iteration(iteration)

    if 'dump_results' in self['debug'] and (iteration == 1 or
                                            (iteration % self['debug_freq'] == 0)):
        # write complete result into a cmresults.tsv
        conn = self.__dbconn()
        path = os.path.join(self['output_dir'], 'cmresults-%04d.tsv.bz2' % iteration)
        with bz2.BZ2File(path, 'w') as outfile:
            debug.write_iteration(conn, outfile, iteration,
                                  self['num_clusters'], self['output_dir'])
def set_config_general(config, params):
    """Process General section"""
    # override directories
    tmp_dir = config.get('General', 'tmp_dir')
    if tmp_dir:
        tempfile.tempdir = tmp_dir

    params['output_dir'] = config.get('General', 'output_dir')
    params['cache_dir'] = config.get('General', 'cache_dir')
    params['tmp_dir'] = tmp_dir
    params['dbfile_name'] = config.get('General', 'dbfile_name')
    params['normalize_ratios'] = config.getboolean('General', 'normalize_ratios')
    params['num_iterations'] = config.getint('General', 'num_iterations')
    params['start_iteration'] = config.getint('General', 'start_iteration')
    params['multiprocessing'] = config.getboolean('General', 'use_multiprocessing')
    params['case_sensitive'] = config.getboolean('General', 'case_sensitive')
    params['num_cores'] = get_config_int(config, 'General', 'num_cores', None)
    params['postadjust'] = config.getboolean('General', 'postadjust')
    params['log_subresults'] = config.getboolean('General', 'log_subresults')
    params['add_fuzz'] = config.get('General', 'add_fuzz')
    # Python can have large seeds; R, however, seems to have a 32 bit limit
    params['random_seed'] = get_config_int(config, 'General', 'random_seed',
                                           util.current_millis() % 2147483647)
    params['stats_freq'] = config.getint('General', 'stats_frequency')
    params['result_freq'] = config.getint('General', 'result_frequency')
    params['debug_freq'] = config.getint('General', 'debug_frequency')

    # implicit parameters for compatibility
    params['use_operons'] = get_config_boolean(config, 'General', 'use_operons', True)
    params['use_string'] = get_config_boolean(config, 'General', 'use_string', True)
    params['checkratios'] = get_config_boolean(config, 'General', 'checkratios', False)
    params['organism_code'] = get_config_str(config, 'General', 'organism_code', None)
def do_compute(self, iteration_result, ref_matrix=None):
    """compute method, iteration is the 0-based iteration number"""
    matrix = dm.DataMatrix(len(self.gene_names()), self.num_clusters(),
                           self.gene_names())
    network_scores = {}
    for network in self.networks():
        logging.debug("Compute scores for network '%s', WEIGHT: %f",
                      network.name, network.weight)
        start_time = util.current_millis()
        network_score = self.__compute_network_cluster_scores(network)
        network_scores[network.name] = network_score
        self.__update_score_matrix(matrix, network_score, network.weight)
        elapsed = util.current_millis() - start_time
        logging.debug("NETWORK '%s' SCORING TIME: %f s.",
                      network.name, (elapsed / 1000.0))

    # compute and store score means
    self.score_means = self.__update_score_means(network_scores)
    return matrix
def get_col_density_scores(membership, col_scores):
    """column-score analog of get_row_density_scores()"""
    num_clusters = membership.num_clusters()
    cscore_range = abs(col_scores.max() - col_scores.min())
    colscore_bandwidth = max(cscore_range / 100.0, 0.001)
    cd_scores = dm.DataMatrix(col_scores.num_rows, col_scores.num_columns,
                              col_scores.row_names, col_scores.column_names)
    cds_values = cd_scores.values

    start_time = util.current_millis()
    for cluster in xrange(1, num_clusters + 1):
        # instead of assigning the cc_scores values per row, we can assign
        # to the transpose and let numpy do the assignment
        cds_values.T[cluster - 1] = get_cc_scores(membership, col_scores,
                                                  colscore_bandwidth, cluster)
    elapsed = util.current_millis() - start_time
    logging.debug("CC_SCORES IN %f s.", elapsed / 1000.0)
    return cd_scores
def quantile_normalize_scores(matrices, weights=None):
    """quantile normalize scores against each other"""
    logging.info("COMPUTING WEIGHTED MEANS...")
    start_time = util.current_millis()

    # rearrange the scores in the input matrices into a matrix with
    # |matrices| columns, where each column contains the values of one
    # matrix in sorted order
    flat_values = np.transpose(np.asarray([np.sort(matrix.values.flatten())
                                           for matrix in matrices]))
    elapsed = util.current_millis() - start_time
    logging.info("flattened/sorted score matrices in %f s.", elapsed / 1000.0)

    start_time = util.current_millis()
    if weights is not None:
        # multiply each column of the matrix with each component of the
        # weight vector: using matrix multiplication resulted in a speedup
        # from 125 s. to 0.125 s. over apply_along_axis() (1000x faster)!
        scaled = weights * flat_values
        scale = np.sum(np.ma.masked_array(weights, np.isnan(weights)))
        tmp_mean = util.row_means(scaled) / scale
    else:
        tmp_mean = util.row_means(flat_values)
    elapsed = util.current_millis() - start_time
    logging.info("weighted means in %f s.", elapsed / 1000.0)

    start_time = util.current_millis()
    result = qm_result_matrices(matrices, tmp_mean)
    elapsed = util.current_millis() - start_time
    logging.info("result matrices built in %f s.", elapsed / 1000.0)
    return result
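
# Illustrative sketch (not part of the module): the idea behind quantile
# normalization on two small matrices. Each value is replaced by the mean of
# the equally-ranked values across all matrices, so every matrix ends up with
# the same value distribution while keeping its own ordering. The real code
# additionally supports weights and delegates the final ranking step to
# qm_result_matrices().
import numpy as np

def _quantile_normalize_demo(a, b):
    flat = np.transpose(np.asarray([np.sort(a.flatten()),
                                    np.sort(b.flatten())]))
    ref = flat.mean(axis=1)                           # reference distribution
    out = []
    for m in (a, b):
        ranks = np.argsort(np.argsort(m.flatten()))   # rank of each entry
        out.append(ref[ranks].reshape(m.shape))
    return out

# both results share one value distribution but keep their own orderings:
# _quantile_normalize_demo(np.array([[1., 5.], [3., 2.]]),
#                          np.array([[10., 40.], [20., 30.]]))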
def get_row_density_scores(membership, row_scores):
    """getting density scores improves small clusters"""
    num_clusters = membership.num_clusters()
    rscore_range = abs(row_scores.max() - row_scores.min())
    rowscore_bandwidth = max(rscore_range / 100.0, 0.001)
    rd_scores = dm.DataMatrix(row_scores.num_rows, row_scores.num_columns,
                              row_scores.row_names, row_scores.column_names)
    rds_values = rd_scores.values

    start_time = util.current_millis()
    for cluster in xrange(1, num_clusters + 1):
        # instead of assigning the rr_scores values per row, we can assign
        # to the transpose and let numpy do the assignment
        rds_values.T[cluster - 1] = get_rr_scores(membership, row_scores,
                                                  rowscore_bandwidth, cluster)
    elapsed = util.current_millis() - start_time
    logging.debug("RR_SCORES IN %f s.", elapsed / 1000.0)
    return rd_scores
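
# Illustrative sketch (hypothetical helper, not the module's get_rr_scores()):
# a density score estimates how typical each row's score is among the current
# cluster members' scores, via a Gaussian kernel density estimate with the
# bandwidth chosen above (score range / 100, floored at 0.001). Small clusters
# benefit because even a few members define a usable density.
import numpy as np

def _density_scores_demo(all_scores, member_scores, bandwidth):
    """Gaussian KDE of member_scores, evaluated at every entry of all_scores"""
    diffs = (all_scores[:, None] - member_scores[None, :]) / bandwidth
    kernels = np.exp(-0.5 * diffs ** 2) / np.sqrt(2.0 * np.pi)
    return kernels.mean(axis=1) / bandwidth

# rows scoring near the members (around -2) get high density, far rows near 0:
# _density_scores_demo(np.array([-2.0, 0.0, 3.0]),
#                      np.array([-1.9, -2.1, -1.8]), 0.05)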
def update(self, matrix, row_scores, column_scores, num_iterations,
           iteration_result):
    """top-level update method"""
    start = util.current_millis()
    row_scores, column_scores = fuzzify(self, row_scores, column_scores,
                                        num_iterations, iteration_result,
                                        self.__config_params['add_fuzz'])
    elapsed = util.current_millis() - start
    logging.debug("fuzzify took %f s.", elapsed / 1000.0)

    # pickle the (potentially fuzzed) row scores to use them in the
    # post adjustment step. We only need to do that in the last iteration
    iteration = iteration_result['iteration']
    if iteration == num_iterations:
        with open(self.pickle_path(), 'wb') as outfile:
            pickle.dump(row_scores, outfile)

    start = util.current_millis()
    rd_scores, cd_scores = get_density_scores(self, row_scores, column_scores)
    elapsed = util.current_millis() - start
    logging.debug("GET_DENSITY_SCORES() took %f s.", elapsed / 1000.0)

    start = util.current_millis()
    compensate_size(self, matrix, rd_scores, cd_scores)
    elapsed = util.current_millis() - start
    logging.debug("COMPENSATE_SIZE() took %f s.", elapsed / 1000.0)

    start_time = util.current_millis()
    update_for_rows(self, rd_scores, self.__config_params['multiprocessing'])
    elapsed = util.current_millis() - start_time
    logging.debug("update_for rdscores finished in %f s.", elapsed / 1000.0)

    start_time = util.current_millis()
    update_for_cols(self, cd_scores, self.__config_params['multiprocessing'])
    elapsed = util.current_millis() - start_time
    logging.debug("update_for cdscores finished in %f s.", elapsed / 1000.0)
def set_config_general(config, params):
    """Process General section"""
    # override directories
    tmp_dir = config.get("General", "tmp_dir")
    if tmp_dir:
        tempfile.tempdir = tmp_dir

    try:
        # Only resumed or final runs should have a stored command line
        params["command_line"] = config.get("General", "command_line")
    except Exception:
        pass

    params["output_dir"] = config.get("General", "output_dir")
    params["cache_dir"] = config.get("General", "cache_dir")
    params["tmp_dir"] = tmp_dir
    params["pipeline_file"] = config.get("General", "pipeline_file")
    params["dbfile_name"] = config.get("General", "dbfile_name")
    params["rsat_base_url"] = config.get("General", "rsat_base_url")
    params["rsat_features"] = config.get("General", "rsat_features")
    params["rsat_organism"] = config.get("General", "rsat_organism")
    params["rsat_dir"] = config.get("General", "rsat_dir")
    params["normalize_ratios"] = config.getboolean("General", "normalize_ratios")
    params["num_iterations"] = config.getint("General", "num_iterations")
    params["start_iteration"] = config.getint("General", "start_iteration")
    params["multiprocessing"] = config.getboolean("General", "use_multiprocessing")
    params["case_sensitive"] = config.getboolean("General", "case_sensitive")
    params["num_cores"] = get_config_int(config, "General", "num_cores", None)
    params["postadjust"] = config.getboolean("General", "postadjust")
    params["log_subresults"] = config.getboolean("General", "log_subresults")
    params["add_fuzz"] = config.get("General", "add_fuzz")
    # Python can have large seeds; R, however, seems to have a 32 bit limit
    params["random_seed"] = get_config_int(config, "General", "random_seed",
                                           util.current_millis() % 2147483647)
    params["stats_freq"] = config.getint("General", "stats_frequency")
    params["result_freq"] = config.getint("General", "result_frequency")
    params["debug_freq"] = config.getint("General", "debug_frequency")

    # implicit parameters for compatibility
    params["use_operons"] = get_config_boolean(config, "General", "use_operons", True)
    params["use_string"] = get_config_boolean(config, "General", "use_string", True)
    params["checkratios"] = get_config_boolean(config, "General", "checkratios", False)
    params["organism_code"] = get_config_str(config, "General", "organism_code", None)
    params["use_BSCM"] = get_config_boolean(config, "General", "use_BSCM", False)
    params["use_chi2"] = get_config_boolean(config, "General", "use_chi2", False)
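
# Illustrative sketch (not part of the module): loading a 'General' section
# like the one parsed above with Python 2's ConfigParser, matching the
# xrange-era dialect of this codebase. Only a handful of options are shown
# and their values are made up; a real file must define every option that
# set_config_general() reads without a default.
from ConfigParser import ConfigParser
from StringIO import StringIO

_DEMO_INI = """[General]
num_iterations = 2000
start_iteration = 1
use_multiprocessing = True
normalize_ratios = True
"""

_demo_config = ConfigParser()
_demo_config.readfp(StringIO(_DEMO_INI))
assert _demo_config.getint('General', 'num_iterations') == 2000
assert _demo_config.getboolean('General', 'use_multiprocessing')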
def do_compute(self, iteration_result, ref_matrix):
    """compute method
    Note: will return None if not computed yet and the result of a previous
    scoring if the function is not supposed to actually run in this iteration
    """
    global SET_MATRIX, SET_MEMBERSHIP, SET_SET_TYPE, SET_SYNONYMS
    global CANONICAL_ROWNAMES, CANONICAL_ROW_INDEXES

    logging.info("Compute scores for set enrichment...")
    start_time = util.current_millis()
    matrix = dm.DataMatrix(len(self.gene_names()), self.num_clusters(),
                           self.gene_names())
    use_multiprocessing = self.config_params[scoring.KEY_MULTIPROCESSING]
    SET_MATRIX = self.ratios
    SET_MEMBERSHIP = self.membership
    SET_SYNONYMS = self.organism.thesaurus()

    if CANONICAL_ROWNAMES is None:
        CANONICAL_ROWNAMES = set(
            map(lambda n: SET_SYNONYMS[n] if n in SET_SYNONYMS else n,
                self.ratios.row_names))

    if CANONICAL_ROW_INDEXES is None:
        CANONICAL_ROW_INDEXES = {}
        for index, row in enumerate(self.ratios.row_names):
            if row in SET_SYNONYMS:
                CANONICAL_ROW_INDEXES[SET_SYNONYMS[row]] = index
            else:
                CANONICAL_ROW_INDEXES[row] = index

    ref_min_score = np.nanpercentile(ref_matrix.values, 10.0)
    logging.info('REF_MIN_SCORE: %f', ref_min_score)

    set_filepath = os.path.join(self.config_params['output_dir'],
                                'setEnrichment_set.csv')
    pval_filepath = os.path.join(self.config_params['output_dir'],
                                 'setEnrichment_pvalue.csv')

    for set_type in self.__set_types:
        SET_SET_TYPE = set_type
        logging.info("PROCESSING SET TYPE '%s'", set_type.name)
        start1 = util.current_millis()
        cutoff = self.bonferroni_cutoff()
        if use_multiprocessing:
            with util.get_mp_pool(self.config_params) as pool:
                results = pool.map(
                    compute_cluster_score,
                    [(cluster, cutoff, ref_min_score)
                     for cluster in xrange(1, self.num_clusters() + 1)])
        else:
            results = []
            for cluster in xrange(1, self.num_clusters() + 1):
                results.append(
                    compute_cluster_score((cluster, cutoff, ref_min_score)))

        elapsed1 = util.current_millis() - start1
        logging.info("ENRICHMENT SCORES COMPUTED in %f s, STORING...",
                     elapsed1 / 1000.0)

        if not os.path.exists(set_filepath):
            setFile = open(set_filepath, 'w')
            setFile.write(',' + ','.join(
                [str(i) for i in xrange(1, self.num_clusters() + 1)]))
            pvFile = open(pval_filepath, 'w')
            pvFile.write(',' + ','.join(
                [str(i) for i in xrange(1, self.num_clusters() + 1)]))
        else:
            setFile = open(set_filepath, 'a')
            pvFile = open(pval_filepath, 'a')

        minSets = []
        pValues = []
        for cluster in xrange(1, self.num_clusters() + 1):
            # store the best enriched set determined
            scores, min_set, min_pvalue = results[cluster - 1]
            minSets.append(min_set)
            pValues.append(min_pvalue)
            for row in xrange(len(self.gene_names())):
                matrix.values[row][cluster - 1] += scores[row] * set_type.weight

        setFile.write('\n' + str(iteration_result['iteration']) + ',' +
                      ','.join([str(i) for i in minSets]))
        pvFile.write('\n' + str(iteration_result['iteration']) + ',' +
                     ','.join([str(i) for i in pValues]))
        setFile.close()
        pvFile.close()

    logging.info("SET ENRICHMENT FINISHED IN %f s.\n",
                 (util.current_millis() - start_time) / 1000.0)
    # cleanup
    SET_SET_TYPE = None
    SET_MATRIX = None
    SET_MEMBERSHIP = None
    SET_SYNONYMS = None
    return matrix
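
# Illustrative sketch (hypothetical; the class's bonferroni_cutoff() is
# defined elsewhere and not shown here): a Bonferroni-style cutoff divides the
# significance level by the number of hypotheses, here one test per candidate
# set, so that only strongly enriched sets pass for a cluster.
def _bonferroni_cutoff_demo(num_sets, alpha=0.05):
    """per-test p-value threshold after correcting for num_sets tests"""
    return float(alpha) / num_sets

# with 200 candidate sets, a set must reach p < 0.00025 to count:
# _bonferroni_cutoff_demo(200)  ->  0.00025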
def combine(result_matrices, score_scalings, membership, iteration, config_params):
    """This is the combining function, taking n result matrices and scalings"""
    quantile_normalize = config_params['quantile_normalize']

    for i, m in enumerate(result_matrices):
        m.fix_extreme_values()
        m.subtract_with_quantile(0.99)

        # debug mode: print scoring matrices before combining
        if ('dump_scores' in config_params['debug'] and
                (iteration == 1 or (iteration % config_params['debug_freq'] == 0))):
            funs = config_params['pipeline']['row-scoring']['args']['functions']
            m.write_tsv_file(os.path.join(
                config_params['output_dir'],
                'score-%s-%04d.tsv' % (funs[i]['id'], iteration)),
                compressed=False)

    if quantile_normalize:
        if len(result_matrices) > 1:
            start_time = util.current_millis()
            result_matrices = dm.quantile_normalize_scores(result_matrices,
                                                           score_scalings)
            elapsed = util.current_millis() - start_time
            logging.debug("quantile normalize in %f s.", elapsed / 1000.0)
        in_matrices = [m.values for m in result_matrices]
    else:
        in_matrices = []
        num_clusters = membership.num_clusters()
        mat = result_matrices[0]
        index_map = {name: index for index, name in enumerate(mat.row_names)}

        # we assume matrix 0 is always the gene expression score
        # we also assume that the matrices are already extreme value fixed
        rsm = []
        for cluster in range(1, num_clusters + 1):
            row_members = sorted(membership.rows_for_cluster(cluster))
            rsm.extend([mat.values[index_map[row], cluster - 1]
                        for row in row_members])
        scale = util.mad(rsm)
        if scale == 0:  # avoid that we are dividing by 0
            scale = util.r_stddev(rsm)
        if scale != 0:
            median_rsm = util.median(rsm)
            rsvalues = (mat.values - median_rsm) / scale
            num_rows, num_cols = rsvalues.shape
            rscores = dm.DataMatrix(num_rows, num_cols,
                                    mat.row_names, mat.column_names,
                                    values=rsvalues)
            rscores.fix_extreme_values()
        else:
            logging.warn("combiner scaling -> scale == 0 !!!")
            rscores = mat
        in_matrices.append(rscores.values)

        if len(result_matrices) > 1:
            rs_quant = util.quantile(rscores.values, 0.01)
            logging.debug("RS_QUANT = %f", rs_quant)
            for i in range(1, len(result_matrices)):
                values = result_matrices[i].values
                qqq = abs(util.quantile(values, 0.01))
                if qqq == 0:
                    logging.debug('SPARSE SCORES - %d attempt 1: pick from sorted values', i)
                    qqq = sorted(values.ravel())[9]
                if qqq == 0:
                    logging.debug('SPARSE SCORES - %d attempt 2: pick minimum value', i)
                    qqq = abs(values.min())
                if qqq != 0:
                    values = values / qqq * abs(rs_quant)
                else:
                    logging.debug('SPARSE SCORES - %d not normalizing!', i)
                in_matrices.append(values)

    if len(result_matrices) > 0:
        start_time = util.current_millis()
        # assuming same format of all matrices
        combined_score = np.zeros(in_matrices[0].shape)
        for i in xrange(len(in_matrices)):
            combined_score += in_matrices[i] * score_scalings[i]
        elapsed = util.current_millis() - start_time
        logging.debug("combined score in %f s.", elapsed / 1000.0)
        matrix0 = result_matrices[0]  # as reference for names
        return dm.DataMatrix(matrix0.num_rows, matrix0.num_columns,
                             matrix0.row_names, matrix0.column_names,
                             values=combined_score)
    else:
        return None
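
# Illustrative sketch (not part of the module): the robust standardization
# applied above to the expression score matrix. Centering uses the median of
# the in-cluster score values; the scale is their MAD, falling back to the
# standard deviation when the MAD is 0. Note this simplified MAD omits the
# 1.4826 consistency constant that R-compatible implementations such as
# util.mad may apply.
import numpy as np

def _robust_scale_demo(values, in_cluster_values):
    """(values - median) / MAD over the in-cluster score sample"""
    sample = np.asarray(in_cluster_values, dtype=float)
    med = np.median(sample)
    scale = np.median(np.abs(sample - med))
    if scale == 0:  # avoid dividing by 0, mirroring the fallback above
        scale = np.std(sample)
    return (np.asarray(values, dtype=float) - med) / scale

# _robust_scale_demo([[0.5, -1.0], [2.0, 0.0]], [0.0, 0.5, -0.5, 1.0])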
def do_compute(self, iteration_result, ref_matrix):
    """compute method
    Note: will return None if not computed yet and the result of a previous
    scoring if the function is not supposed to actually run in this iteration
    """
    global SET_MATRIX, SET_MEMBERSHIP, SET_SET_TYPE, SET_SYNONYMS
    global CANONICAL_ROWNAMES, CANONICAL_ROW_INDEXES

    logging.info("Compute scores for set enrichment...")
    start_time = util.current_millis()
    matrix = dm.DataMatrix(len(self.gene_names()), self.num_clusters(),
                           self.gene_names())
    use_multiprocessing = self.config_params[scoring.KEY_MULTIPROCESSING]
    SET_MATRIX = self.ratios
    SET_MEMBERSHIP = self.membership
    SET_SYNONYMS = self.organism.thesaurus()

    if CANONICAL_ROWNAMES is None:
        CANONICAL_ROWNAMES = set(
            map(lambda n: SET_SYNONYMS[n] if n in SET_SYNONYMS else n,
                self.ratios.row_names))

    if CANONICAL_ROW_INDEXES is None:
        CANONICAL_ROW_INDEXES = {}
        for index, row in enumerate(self.ratios.row_names):
            if row in SET_SYNONYMS:
                CANONICAL_ROW_INDEXES[SET_SYNONYMS[row]] = index
            else:
                CANONICAL_ROW_INDEXES[row] = index

    ref_min_score = ref_matrix.min()
    logging.info('REF_MIN_SCORE: %f', ref_min_score)

    set_filepath = os.path.join(self.config_params['output_dir'],
                                'setEnrichment_set.csv')
    pval_filepath = os.path.join(self.config_params['output_dir'],
                                 'setEnrichment_pvalue.csv')

    for set_type in self.__set_types:
        SET_SET_TYPE = set_type
        logging.info("PROCESSING SET TYPE '%s'", set_type.name)
        start1 = util.current_millis()
        if use_multiprocessing:
            with util.get_mp_pool(self.config_params) as pool:
                results = pool.map(
                    compute_cluster_score,
                    [(cluster, self.bonferroni_cutoff(), ref_min_score)
                     for cluster in xrange(1, self.num_clusters() + 1)])
        else:
            results = []
            for cluster in xrange(1, self.num_clusters() + 1):
                results.append(compute_cluster_score(
                    (cluster, self.bonferroni_cutoff(), ref_min_score)))

        elapsed1 = util.current_millis() - start1
        logging.info("ENRICHMENT SCORES COMPUTED in %f s, STORING...",
                     elapsed1 / 1000.0)

        if not os.path.exists(set_filepath):
            setFile = open(set_filepath, 'w')
            setFile.write(',' + ','.join(
                [str(i) for i in xrange(1, self.num_clusters() + 1)]))
            pvFile = open(pval_filepath, 'w')
            pvFile.write(',' + ','.join(
                [str(i) for i in xrange(1, self.num_clusters() + 1)]))
        else:
            setFile = open(set_filepath, 'a')
            pvFile = open(pval_filepath, 'a')

        minSets = []
        pValues = []
        for cluster in xrange(1, self.num_clusters() + 1):
            # store the best enriched set determined
            scores, min_set, min_pvalue = results[cluster - 1]
            minSets.append(min_set)
            pValues.append(min_pvalue)
            for row in xrange(len(self.gene_names())):
                matrix.values[row][cluster - 1] += scores[row] * set_type.weight

        setFile.write('\n' + str(iteration_result['iteration']) + ',' +
                      ','.join([str(i) for i in minSets]))
        pvFile.write('\n' + str(iteration_result['iteration']) + ',' +
                     ','.join([str(i) for i in pValues]))
        setFile.close()
        pvFile.close()

    logging.info("SET ENRICHMENT FINISHED IN %f s.\n",
                 (util.current_millis() - start_time) / 1000.0)
    # cleanup
    SET_SET_TYPE = None
    SET_MATRIX = None
    SET_MEMBERSHIP = None
    SET_SYNONYMS = None
    return matrix
def compute_pvalues(self, iteration_result, num_motifs, force):
    """Compute motif scores.
    The result is a dictionary from cluster -> (feature_id, pvalue)
    containing a sparse gene-to-pvalue mapping for each cluster

    In order to influence the sequences that go into meme,
    the user can specify a list of sequence filter functions that have
    the signature
    (seqs, feature_ids, distance) -> seqs
    These filters are applied in the order they appear in the list.
    """
    global SEQUENCE_FILTERS, ORGANISM, MEMBERSHIP

    cluster_pvalues = {}
    min_cluster_rows_allowed = self.config_params['memb.min_cluster_rows_allowed']
    max_cluster_rows_allowed = self.config_params['memb.max_cluster_rows_allowed']
    use_multiprocessing = self.config_params[scoring.KEY_MULTIPROCESSING]

    # extract the sequences for each cluster, slow
    start_time = util.current_millis()
    SEQUENCE_FILTERS = self.__sequence_filters
    ORGANISM = self.organism
    MEMBERSHIP = self.membership
    cluster_seqs_params = [(cluster, self.seqtype)
                           for cluster in xrange(1, self.num_clusters() + 1)]
    if use_multiprocessing:
        with util.get_mp_pool(self.config_params) as pool:
            seqs_list = pool.map(cluster_seqs, cluster_seqs_params)
    else:
        seqs_list = [cluster_seqs(p) for p in cluster_seqs_params]
    SEQUENCE_FILTERS = None
    ORGANISM = None
    MEMBERSHIP = None
    logging.debug("prepared sequences in %d ms.",
                  util.current_millis() - start_time)

    # Make the parameters, this is fast enough
    start_time = util.current_millis()
    params = {}
    for cluster in xrange(1, self.num_clusters() + 1):
        # Pass the previous run's seed if possible
        if self.__last_motif_infos is not None:
            previous_motif_infos = self.__last_motif_infos.get(cluster, None)
        else:
            previous_motif_infos = None

        seqs, feature_ids = seqs_list[cluster - 1]
        params[cluster] = ComputeScoreParams(
            iteration_result['iteration'], cluster,
            feature_ids, seqs, self.used_seqs, self.meme_runner(),
            min_cluster_rows_allowed, max_cluster_rows_allowed,
            num_motifs, previous_motif_infos,
            self.config_params['output_dir'],
            self.config_params['num_iterations'],
            self.config_params['debug'])
    logging.debug("prepared MEME parameters in %d ms.",
                  util.current_millis() - start_time)

    # create motif result map if necessary
    for cluster in xrange(1, self.num_clusters() + 1):
        if cluster not in iteration_result:
            iteration_result[cluster] = {}

    # Optimization:
    # if a cluster hasn't changed since last time, reuse the last results;
    # we do this by filtering out the parameters of the clusters that
    # did not change
    if not force and self.__last_results is not None:
        oldlen = len(params)
        params = {cluster: params[cluster]
                  for cluster in xrange(1, self.num_clusters() + 1)
                  if params[cluster].feature_ids != self.__last_results[cluster][0]}
        newlen = len(params)
        if oldlen - newlen > 0:
            logging.debug("%d clusters did not change !!!", oldlen - newlen)

    # compute and store motif results
    self.__last_motif_infos = {}
    if self.__last_results is None:
        self.__last_results = {}

    if use_multiprocessing:
        with util.get_mp_pool(self.config_params) as pool:
            results = pool.map(compute_cluster_score, params.values())
            results = {r[0]: r[1:] for r in results}  # indexed by cluster
            for cluster in xrange(1, self.num_clusters() + 1):
                if cluster in results:
                    pvalues, run_result = results[cluster]
                    self.__last_results[cluster] = (params[cluster].feature_ids,
                                                    pvalues, run_result)
                else:
                    feature_ids, pvalues, run_result = self.__last_results[cluster]

                cluster_pvalues[cluster] = pvalues
                if run_result:
                    self.__last_motif_infos[cluster] = run_result.motif_infos
                iteration_result[cluster]['motif-info'] = meme_json(run_result)
                iteration_result[cluster]['pvalues'] = pvalues
    else:
        for cluster in xrange(1, self.num_clusters() + 1):
            if cluster in params:
                _, pvalues, run_result = compute_cluster_score(params[cluster])
                self.__last_results[cluster] = (params[cluster].feature_ids,
                                                pvalues, run_result)
            else:
                _, pvalues, run_result = self.__last_results[cluster]

            cluster_pvalues[cluster] = pvalues
            if run_result:
                self.__last_motif_infos[cluster] = run_result.motif_infos
            iteration_result[cluster]['motif-info'] = meme_json(run_result)
            iteration_result[cluster]['pvalues'] = pvalues

    return cluster_pvalues
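
# Illustrative sketch (not part of the module): the caching optimization used
# above. A cluster's expensive MEME run is skipped whenever its member
# feature ids are unchanged from the previous iteration; the cached
# (pvalues, run_result) pair is reused instead. All names are hypothetical.
_demo_cache = {}

def _score_cluster_demo(cluster, feature_ids, compute):
    """run compute(feature_ids) only if the cluster's membership changed"""
    cached = _demo_cache.get(cluster)
    if cached is not None and cached[0] == feature_ids:
        return cached[1]              # membership unchanged: reuse old result
    result = compute(feature_ids)     # the expensive call (MEME in the module)
    _demo_cache[cluster] = (feature_ids, result)
    return result

# the second call with the same ids returns the cached result; compute is
# never invoked again:
# _score_cluster_demo(1, ['geneA', 'geneB'], lambda ids: len(ids))
# _score_cluster_demo(1, ['geneA', 'geneB'], lambda ids: 1 / 0)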
def run_iterations(self, start_iter=None, num_iter=None):
    if start_iter is None:
        start_iter = self.config_params['start_iteration']
    if num_iter is None:
        num_iter = self.config_params['num_iterations'] + 1

    if self.config_params['interactive']:  # stop here in interactive mode
        return

    for iteration in range(start_iter, num_iter):
        start_time = util.current_millis()
        force = self.config_params['resume'] and iteration == start_iter
        self.run_iteration(iteration, force=force)

        # garbage collection after everything in iteration went out of scope
        gc.collect()
        elapsed = util.current_millis() - start_time
        logging.debug("performed iteration %d in %f s.",
                      iteration, elapsed / 1000.0)

    # run post processing after the last iteration; we store the results
    # in num_iterations + 1 to have a clean separation
    if self.config_params['postadjust']:
        logging.info("Postprocessing: Adjusting the clusters....")
        # run combiner using the weights of the last iteration
        rscores = self.row_scoring.combine_cached(self.config_params['num_iterations'])
        rd_scores = memb.get_row_density_scores(self.membership(), rscores)
        logging.info("Recomputed combined + density scores.")
        memb.postadjust(self.membership(), rd_scores)

        BSCM_obj = self.column_scoring.get_BSCM()
        if BSCM_obj is not None:
            new_membership = BSCM_obj.resplit_clusters(self.membership(),
                                                       cutoff=0.05)

        logging.info("Adjusted. Now re-run scoring (iteration: %d)",
                     self.config_params['num_iterations'])
        iteration_result = {'iteration': self.config_params['num_iterations'] + 1,
                            'score_means': {}}

        combined_scores = self.row_scoring.compute_force(iteration_result)

        # write the combined scores for benchmarking/diagnostics
        with open(self.combined_rscores_pickle_path(), 'wb') as outfile:
            pickle.dump(combined_scores, outfile)

        self.write_results(iteration_result)
        self.write_stats(iteration_result)
        self.update_iteration(iteration)

        # default behaviour: always write the complete result into a
        # cmresults.tsv for R/cmonkey compatibility
        session = self.dbsession()
        path = os.path.join(self.config_params['output_dir'],
                            'cmresults-postproc.tsv.bz2')
        with bz2.BZ2File(path, 'w') as outfile:
            debug.write_iteration(session, outfile,
                                  self.config_params['num_iterations'] + 1,
                                  self.config_params['num_clusters'],
                                  self.config_params['output_dir'])

        # additionally: run tomtom on the motifs if requested
        if (self.config_params['MEME']['global_background'] == 'True' and
                self.config_params['Postprocessing']['run_tomtom'] == 'True'):
            meme.run_tomtom(session, self.config_params['output_dir'],
                            self.config_params['MEME']['version'])

    self.write_finish_info()
    logging.info("Done !!!!")
def run_iterations(self, start_iter=None, num_iter=None):
    if start_iter is None:
        start_iter = self['start_iteration']
    if num_iter is None:
        num_iter = self['num_iterations'] + 1

    if self.config_params['interactive']:  # stop here in interactive mode
        return

    for iteration in range(start_iter, num_iter):
        start_time = util.current_millis()
        force = self['resume'] and iteration == start_iter
        self.run_iteration(iteration, force=force)

        # garbage collection after everything in iteration went out of scope
        gc.collect()
        elapsed = util.current_millis() - start_time
        logging.debug("performed iteration %d in %f s.",
                      iteration, elapsed / 1000.0)

        if 'profile_mem' in self['debug'] and (iteration == 1 or iteration % 100 == 0):
            with open(os.path.join(self['output_dir'], 'memprofile.tsv'), 'a') as outfile:
                self.write_mem_profile(outfile, iteration)

    # run post processing after the last iteration; we store the results
    # in num_iterations + 1 to have a clean separation
    if self['postadjust']:
        logging.info("Postprocessing: Adjusting the clusters....")
        # run combiner using the weights of the last iteration
        rscores = self.row_scoring.combine_cached(self['num_iterations'])
        rd_scores = memb.get_row_density_scores(self.membership(), rscores)
        logging.info("Recomputed combined + density scores.")
        memb.postadjust(self.membership(), rd_scores)

        BSCM_obj = self.column_scoring.get_BSCM()
        if BSCM_obj is not None:
            new_membership = BSCM_obj.resplit_clusters(self.membership(),
                                                       cutoff=0.05)

        logging.info("Adjusted. Now re-run scoring (iteration: %d)",
                     self['num_iterations'])
        iteration_result = {'iteration': self['num_iterations'] + 1,
                            'score_means': {}}

        combined_scores = self.row_scoring.compute_force(iteration_result)

        # write the combined scores for benchmarking/diagnostics
        with open(self.combined_rscores_pickle_path(), 'wb') as outfile:
            pickle.dump(combined_scores, outfile)

        self.write_results(iteration_result)
        self.write_stats(iteration_result)
        self.update_iteration(iteration)

        # default behaviour: always write the complete result into a
        # cmresults.tsv for R/cmonkey compatibility
        conn = self.__dbconn()
        path = os.path.join(self['output_dir'], 'cmresults-postproc.tsv.bz2')
        with bz2.BZ2File(path, 'w') as outfile:
            debug.write_iteration(conn, outfile, self['num_iterations'] + 1,
                                  self['num_clusters'], self['output_dir'])
        # TODO: Why is conn never closed? Where does it write to the db?

        # additionally: run tomtom on the motifs if requested
        if (self['MEME']['global_background'] == 'True' and
                self['Postprocessing']['run_tomtom'] == 'True'):
            meme.run_tomtom(conn, self['output_dir'], self['MEME']['version'])

    self.write_finish_info()
    logging.info("Done !!!!")