def score(self, table):
    prepared_table, __ = prepare_data_table(table, score_columns=self.score_columns)
    texp = Experiment(prepared_table)
    score = self.classifier.score(texp, True)
    texp["d_score"] = (score - self.mu) / self.nu

    s_values, q_values = lookup_s_and_q_values_from_error_table(texp["d_score"].values,
                                                                self.error_stat.df)
    texp["m_score"] = q_values
    texp["s_value"] = s_values
    logging.info("mean m_score = %e, std_dev m_score = %e" % (np.mean(q_values),
                                                              np.std(q_values, ddof=1)))
    logging.info("mean s_value = %e, std_dev s_value = %e" % (np.mean(s_values),
                                                              np.std(s_values, ddof=1)))
    texp.add_peak_group_rank()

    df = table.join(texp[["d_score", "m_score", "peak_group_rank"]])

    if CONFIG.get("compute.probabilities"):
        df = self.add_probabilities(df, texp)

    if CONFIG.get("target.compress_results"):
        to_drop = [n for n in df.columns if n.startswith("var_") or n.startswith("main_")]
        df.drop(to_drop, axis=1, inplace=True)

    return df

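# A minimal sketch (not part of the original module) of the d_score normalization
# used in score() above: the raw classifier score is standardized against a score
# distribution whose mean and standard deviation (self.mu, self.nu) were estimated
# during training (learn_randomized below normalizes against the top decoy scores).
# All numbers here are made up for illustration.
def _demo_d_score_normalization():
    raw = np.array([2.0, 5.0, -1.0])
    mu, nu = 0.5, 1.5                    # hypothetical mean / std dev
    d_score = (raw - mu) / nu
    assert np.allclose(d_score, [1.0, 3.0, -1.0])
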
def cleanup_and_check(df):
    score_columns = ["main_score"] + [c for c in df.columns if c.startswith("var_")]
    # this is fast but not easy to read
    # find peak groups with invalid scores:
    sub_df = df.loc[:, score_columns]
    flags = ~pd.isnull(sub_df)
    valid_rows = flags.all(axis=1)

    df_cleaned = df.loc[valid_rows, :]

    # decoy / non decoy sub tables
    df_decoy = df_cleaned[df_cleaned["is_decoy"].eq(True)]
    df_target = df_cleaned[df_cleaned["is_decoy"].eq(False)]

    # groups
    decoy_groups = set(df_decoy["tg_id"])
    target_groups = set(df_target["tg_id"])

    n_decoy = len(decoy_groups)
    n_target = len(target_groups)

    msg = "data set contains %d decoy and %d target transition groups" % (n_decoy, n_target)
    logging.info(msg)
    if n_decoy < 10 or n_target < 10:
        logging.error("need at least 10 decoy groups and 10 non decoy groups")
        raise Exception("need at least 10 decoy groups and 10 non decoy groups. %s" % msg)

    return df_cleaned

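# A minimal sketch (not part of the original module) of the row filtering in
# cleanup_and_check(): a peak group survives only if every score column is
# non-null. The column names and values are made up for illustration.
def _demo_invalid_score_filtering():
    df = pd.DataFrame({"main_score": [1.0, np.nan, 0.5],
                       "var_xcorr": [0.2, 0.3, np.nan]})
    flags = ~pd.isnull(df[["main_score", "var_xcorr"]])
    valid_rows = flags.all(axis=1)
    assert list(valid_rows) == [True, False, False]
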
def apply_scorer_out_of_core(self, pathes, delim, check_cols, loaded_scorer):
    self.check_table_headers(pathes, delim, check_cols)
    with timer():
        logging.info("apply scorer to input data")
        result, __, used_weights = self._apply_scorer_out_of_core(pathes, delim, loaded_scorer)
        logging.info("processing input data finished")

    return result, None, used_weights

def apply_weights(self, pathes, delim_in, check_cols, loaded_weights):
    self.check_table_headers(pathes, delim_in, check_cols)
    tables = list(self.read_tables_iter(pathes, delim_in))
    with timer():
        logging.info("apply weights")
        result, scorer, trained_weights = self._apply_weights(tables, loaded_weights)
        logging.info("processing input data finished")

    return result, scorer, trained_weights

def learn_and_apply_out_of_core(self, pathes, delim, check_cols):
    self.check_table_headers(pathes, delim, check_cols)
    with timer():
        logging.info("learn and apply classifier out of core")
        result, scorer, trained_weights = self._learn_and_apply_out_of_core(pathes, delim)
        logging.info("processing input data finished")

    return result, scorer, trained_weights

def apply_scorer(self, pathes, delim, check_cols, loaded_scorer):
    self.check_table_headers(pathes, delim, check_cols)
    tables = list(self.read_tables_iter(pathes, delim))
    with timer():
        logging.info("apply scorer to input data")
        result, __, trained_weights = self._apply_scorer(tables, loaded_scorer)
        scorer = None
        logging.info("processing input data finished")

    return result, scorer, trained_weights

def apply_weights_out_of_core(self, pathes, delim, check_cols, loaded_weights):
    self.check_table_headers(pathes, delim, check_cols)
    with timer():
        logging.info("apply weights out of core")
        result, scorer, trained_weights = self._apply_weights_out_of_core(pathes, delim,
                                                                          loaded_weights)
        logging.info("processing input data finished")

    return result, scorer, trained_weights

def learn_and_apply(self, pathes, delim, check_cols):
    self.check_table_headers(pathes, delim, check_cols)
    tables = list(self.read_tables_iter(pathes, delim))
    with timer():
        logging.info("learn and apply classifier from input data")
        result, scorer, trained_weights = self._learn_and_apply(tables)
        logging.info("processing input data finished")

    return result, scorer, trained_weights

def run(self):
    self.prefix = self.check_pathes()
    dirname = self.determine_output_dir_name()
    out_pathes = self.create_out_pathes(dirname)

    extra_writes = dict(self.extra_writes(dirname))

    to_check = list(v for p in out_pathes for v in p.values())
    to_check.extend(extra_writes.values())
    if not CONFIG.get("target.overwrite"):
        error = check_if_any_exists(to_check)
        if error:
            return False

    self.check_cols = ["transition_group_id", "run_id", "decoy"]
    if CONFIG.get("export.mayu"):
        self.check_cols += mayu_cols()
    if 'm_score' in self.check_cols:
        # The m_score is calculated by the learner and should not be in the
        # OpenSwathWorkflow output:
        self.check_cols.remove('m_score')

    logging.info("config settings:")
    for k, v in sorted(CONFIG.config.items()):
        logging.info("    %s: %s" % (k, v))

    start_at = time.time()
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        (result, scorer, weights) = self.run_algo()

    needed = time.time() - start_at

    set_pandas_print_options()
    self.print_summary(result)

    pvalues = None if scorer is None else scorer.target_pvalues
    self.save_results(result, extra_writes, out_pathes, pvalues)
    self.save_scorer(scorer, extra_writes)
    self.save_weights(weights, extra_writes)

    seconds = int(needed)
    msecs = int(1000 * (needed - seconds))
    minutes = int(needed / 60.0)

    print "NEEDED",
    if minutes:
        print minutes, "minutes and",
    print "%d seconds and %d msecs wall time" % (seconds, msecs)
    print

def _main(args):
    pathes = parse_cmdline(args)

    apply_scorer = CONFIG.get("apply_scorer")
    apply_weights = CONFIG.get("apply_weights")
    prefix = CONFIG.get("target.prefix")
    merge_results = CONFIG.get("multiple_files.merge_results")
    delim_in = CONFIG.get("delim.in")
    delim_out = CONFIG.get("delim.out")
    out_of_core = CONFIG.get("out_of_core")

    random_seed = CONFIG.get("random_seed")
    num_processes = CONFIG.get("num_processes")

    if random_seed is not None and num_processes != 1:
        raise Exception("Setting random seed does not work if you run pyprophet with multiple "
                        "processes. Using more than one process is rarely faster.")

    if random_seed is not None:
        np.random.seed(random_seed)

    if apply_scorer and apply_weights:
        raise Exception("can not apply scorer and weights at the same time")

    learning_mode = not apply_scorer and not apply_weights

    if learning_mode:
        if out_of_core:
            PyProphetOutOfCoreLearner(pathes, prefix, merge_results, delim_in, delim_out).run()
        else:
            PyProphetLearner(pathes, prefix, merge_results, delim_in, delim_out).run()
    elif apply_weights:
        if out_of_core:
            PyProphetOutOfCoreWeightApplier(pathes, prefix, merge_results, apply_weights,
                                            delim_in, delim_out).run()
        else:
            PyProphetWeightApplier(pathes, prefix, merge_results, apply_weights,
                                   delim_in, delim_out).run()
    else:
        if out_of_core:
            logging.info("out_of_core setting ignored: this parameter has no influence for "
                         "applying a persisted scorer")
        PyProphetOutOfCoreScorerApplier(pathes, prefix, merge_results, apply_scorer,
                                        delim_in, delim_out).run()

def run(self):
    self.prefix = self.check_pathes()
    dirname = self.determine_output_dir_name()
    out_pathes = self.create_out_pathes(dirname)

    extra_writes = dict(self.extra_writes(dirname))

    to_check = list(v for p in out_pathes for v in p.values())
    to_check.extend(extra_writes.values())
    if not CONFIG.get("target.overwrite"):
        error = check_if_any_exists(to_check)
        if error:
            return False

    self.check_cols = ["transition_group_id", "run_id", "decoy"]
    if CONFIG.get("export.mayu"):
        self.check_cols += mayu_cols()

    logging.info("config settings:")
    for k, v in sorted(CONFIG.config.items()):
        logging.info("    %s: %s" % (k, v))

    start_at = time.time()
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        (result, scorer, weights) = self.run_algo()
    compress = CONFIG.get("target.compress_results")  # note: read but not used in this method

    needed = time.time() - start_at

    set_pandas_print_options()
    self.print_summary(result)
    self.save_results(result, extra_writes, out_pathes)
    self.save_scorer(scorer, extra_writes)
    self.save_weights(weights, extra_writes)

    seconds = int(needed)
    msecs = int(1000 * (needed - seconds))
    minutes = int(needed / 60.0)

    print "NEEDED",
    if minutes:
        print minutes, "minutes and",
    print "%d seconds and %d msecs wall time" % (seconds, msecs)
    print

def _build_result(self, tables, final_classifier, score_columns, experiment,
                  all_test_target_scores, all_test_decoy_scores):
    merge_results = CONFIG.get("multiple_files.merge_results")
    weights = final_classifier.get_parameters()
    scorer = Scorer(final_classifier, score_columns, experiment, all_test_target_scores,
                    all_test_decoy_scores, merge_results)

    scored_tables = list(scorer.score_many(tables))

    final_statistics, summary_statistics = scorer.get_error_stats()

    result = Result(summary_statistics, final_statistics, scored_tables)

    logging.info("calculated scoring and statistics")
    return result, scorer, weights

def _apply_weights_on_exp(self, experiment, loaded_weights):
    learner = self.semi_supervised_learner

    logging.info("start application of pretrained weights")
    clf_scores = learner.score(experiment, loaded_weights)
    experiment.set_and_rerank("classifier_score", clf_scores)

    all_test_target_scores = experiment.get_top_target_peaks()["classifier_score"]
    all_test_decoy_scores = experiment.get_top_decoy_peaks()["classifier_score"]
    logging.info("finished pretrained scoring")

    ws = [loaded_weights.flatten()]
    final_classifier = self.semi_supervised_learner.averaged_learner(ws)

    return final_classifier, all_test_target_scores, all_test_decoy_scores

def timer(name=""): start_at = time.time() yield needed = time.time() - start_at hours = int(needed / 3600) needed -= hours * 3600 minutes = int(needed / 60) needed -= minutes * 60 if name: logging.info("time needed for %s: %02d:%02d:%.1f" % (name, hours, minutes, needed)) else: logging.info("time needed: %02d:%02d:%.1f" % (hours, minutes, needed))
def determine_output_dir_name(self):
    # note: the parameter checks above apply only in learning mode
    dirname = CONFIG.get("target.dir")
    if dirname is None:
        dirnames = set(os.path.dirname(path) for path in self.pathes)
        # outside learning mode pathes has exactly one entry, so this check
        # can only trigger in learning mode:
        if len(dirnames) > 1:
            raise Exception("could not derive common directory name of input files, "
                            "please use --target.dir option")
        dirname = dirnames.pop()

    if dirname and not os.path.exists(dirname):
        os.makedirs(dirname)
        logging.info("created folder %s" % dirname)
    return dirname

def learn_randomized(self, experiment):
    assert isinstance(experiment, Experiment)

    num_iter = CONFIG.get("semi_supervised_learner.num_iter")
    logging.info("start learn_randomized")

    fraction = CONFIG.get("xeval.fraction")
    is_test = CONFIG.get("is_test")
    experiment.split_for_xval(fraction, is_test)
    train = experiment.get_train_peaks()

    train.rank_by("main_score")

    params, clf_scores = self.start_semi_supervised_learning(train)
    train.set_and_rerank("classifier_score", clf_scores)

    # semi supervised iteration:
    for inner in range(num_iter):
        params, clf_scores = self.iter_semi_supervised_learning(train)
        train.set_and_rerank("classifier_score", clf_scores)

    # after semi supervised iteration: classify full dataset
    clf_scores = self.score(experiment, params)
    mu, nu = mean_and_std_dev(clf_scores)  # (overwritten below; this estimate is unused)
    experiment.set_and_rerank("classifier_score", clf_scores)

    td_scores = experiment.get_top_decoy_peaks()["classifier_score"]

    # normalize classifier scores against the top decoy score distribution:
    mu, nu = mean_and_std_dev(td_scores)
    experiment["classifier_score"] = (experiment["classifier_score"] - mu) / nu
    experiment.rank_by("classifier_score")

    top_test_peaks = experiment.get_top_test_peaks()

    top_test_target_scores = top_test_peaks.get_target_peaks()["classifier_score"]
    top_test_decoy_scores = top_test_peaks.get_decoy_peaks()["classifier_score"]

    logging.info("end learn_randomized")

    return top_test_target_scores, top_test_decoy_scores, params

def _learn(self, experiment):
    is_test = CONFIG.get("is_test")
    if is_test:  # for reliable results
        experiment.df.sort("tg_id", ascending=True, inplace=True)

    learner = self.semi_supervised_learner
    ws = []

    neval = CONFIG.get("xeval.num_iter")
    num_processes = CONFIG.get("num_processes")
    all_test_target_scores = []
    all_test_decoy_scores = []

    logging.info("learn and apply scorer")
    logging.info("start %d cross evals using %d processes" % (neval, num_processes))

    if num_processes == 1:
        for k in range(neval):
            (ttt_scores, ttd_scores, w) = learner.learn_randomized(experiment)
            all_test_target_scores.extend(ttt_scores)
            all_test_decoy_scores.extend(ttd_scores)
            ws.append(w.flatten())
    else:
        pool = multiprocessing.Pool(processes=num_processes)
        while neval:
            remaining = max(0, neval - num_processes)
            todo = neval - remaining
            neval -= todo
            args = ((learner, "learn_randomized", (experiment, )), ) * todo
            res = pool.map(unwrap_self_for_multiprocessing, args)
            ttt_scores = [ti for r in res for ti in r[0]]
            ttd_scores = [ti for r in res for ti in r[1]]
            ws.extend([r[2] for r in res])
            all_test_target_scores.extend(ttt_scores)
            all_test_decoy_scores.extend(ttd_scores)

    logging.info("finished cross evals")
    logging.info("")

    # only use scores from the last iteration to build the statistical model:
    if CONFIG.get("semi_supervised_learner.stat_best"):
        all_test_target_scores = ttt_scores
        all_test_decoy_scores = ttd_scores

    # only use weights from the last iteration if indicated:
    if CONFIG.get("semi_supervised_learner.use_best"):
        ws = [ws[-1]]

    final_classifier = self.semi_supervised_learner.averaged_learner(ws)

    return final_classifier, all_test_target_scores, all_test_decoy_scores

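# multiprocessing.Pool.map can only dispatch to picklable module-level functions,
# not bound methods, which is why _learn() packs (instance, method_name, args)
# tuples. A plausible sketch of the unwrap_self_for_multiprocessing helper,
# inferred from its call site above (an assumption, not the module's actual code):
def unwrap_self_for_multiprocessing(arg):
    inst, method_name, args = arg
    return getattr(inst, method_name)(*args)
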
def _main(args):
    pathes = parse_cmdline(args)

    apply_scorer = CONFIG.get("apply_scorer")
    apply_weights = CONFIG.get("apply_weights")
    prefix = CONFIG.get("target.prefix")
    merge_results = CONFIG.get("multiple_files.merge_results")
    delim_in = CONFIG.get("delim.in")
    delim_out = CONFIG.get("delim.out")
    out_of_core = CONFIG.get("out_of_core")

    random_seed = CONFIG.get("random_seed")
    if random_seed is not None:
        random.seed(random_seed)

    if apply_scorer and apply_weights:
        raise Exception("can not apply scorer and weights at the same time")

    learning_mode = not apply_scorer and not apply_weights

    if learning_mode:
        if out_of_core:
            PyProphetOutOfCoreLearner(pathes, prefix, merge_results, delim_in, delim_out).run()
        else:
            PyProphetLearner(pathes, prefix, merge_results, delim_in, delim_out).run()
    elif apply_weights:
        if out_of_core:
            PyProphetOutOfCoreWeightApplier(pathes, prefix, merge_results, apply_weights,
                                            delim_in, delim_out).run()
        else:
            PyProphetWeightApplier(pathes, prefix, merge_results, apply_weights,
                                   delim_in, delim_out).run()
    else:
        if out_of_core:
            logging.info("out_of_core setting ignored: this parameter has no influence for "
                         "applying a persisted scorer")
        PyProphetOutOfCoreScorerApplier(pathes, prefix, merge_results, apply_scorer,
                                        delim_in, delim_out).run()

def posterior_pg_prob(dvals, target_scores, decoy_scores, error_stat, number_target_peaks,
                      number_target_pg, given_scores, lambda_):
    """Compute posterior probabilities for each peakgroup

    - Estimate the true distribution from all target peakgroups above the given
      cutoff (the estimated FDR passed as input). Assume a Gaussian distribution.

    - Estimate the false/decoy distribution from all decoy peakgroups. Assume a
      Gaussian distribution.
    """
    # Note that num_null and num_total are the sum of the cross-validated
    # statistics computed before, therefore the total number of data points
    # selected will be len(data) / xeval.fraction * xeval.num_iter
    logging.info("Posterior Probability estimation:")
    logging.info("Estimated number of null %.2f out of a total of %s."
                 % (error_stat.num_null, error_stat.num_total))

    prior_chrom_null = error_stat.num_null / error_stat.num_total
    number_true_chromatograms = (1.0 - prior_chrom_null) * number_target_peaks
    prior_peakgroup_true = number_true_chromatograms / number_target_pg

    logging.info("Prior for a peakgroup: %s" % (number_true_chromatograms / number_target_pg))
    logging.info("Prior for a chromatogram: %s" % (1.0 - prior_chrom_null))
    logging.info("Estimated number of true chromatograms: %s out of %s"
                 % (number_true_chromatograms, number_target_peaks))
    logging.info("Number of target data: %s" % number_target_pg)
    logging.info("")

    # Estimate a suitable cutoff in discriminant score (d_score)
    # target_scores = experiment.get_top_target_peaks().df["d_score"]
    # decoy_scores = experiment.get_top_decoy_peaks().df["d_score"]
    estimated_cutoff = find_cutoff(target_scores, decoy_scores, lambda_, 0.15, False, False)
    target_scores_above = target_scores[target_scores > estimated_cutoff]

    # Use all decoys and top peaks of top target chromatograms to parametrically
    # estimate the two distributions
    p_decoy = scipy.stats.norm.pdf(given_scores, np.mean(dvals), np.std(dvals, ddof=1))
    p_target = scipy.stats.norm.pdf(given_scores, np.mean(target_scores_above),
                                    np.std(target_scores_above, ddof=1))

    # Bayesian inference: posterior probability for each peakgroup
    pp_pg_pvalues = p_target * prior_peakgroup_true / (
        p_target * prior_peakgroup_true + p_decoy * (1.0 - prior_peakgroup_true))
    return pp_pg_pvalues

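# A minimal numeric sketch (not part of the original module) of the Bayesian
# update in posterior_pg_prob(): combine the prior probability that a peakgroup
# is true with Gaussian likelihoods under the target and decoy score models.
# All numbers are made up for illustration.
def _demo_posterior_update():
    prior_true = 0.3
    p_target = scipy.stats.norm.pdf(2.0, 2.5, 1.0)  # likelihood under target model
    p_decoy = scipy.stats.norm.pdf(2.0, 0.0, 1.0)   # likelihood under decoy model
    posterior = p_target * prior_true / (p_target * prior_true +
                                         p_decoy * (1.0 - prior_true))
    assert 0.0 < posterior < 1.0
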
def read_tables_iter(self, pathes, delim):
    logging.info("process %s" % ", ".join(pathes))
    for path in pathes:
        part = read_csv(path, delim)
        yield part

def log_summary(self):
    logging.info("summary input file:")
    logging.info("   %d lines" % len(self.df))
    logging.info("   %d transition groups" % len(self.df.tg_id.unique()))
    # six of the columns are metadata, the rest are scores:
    logging.info("   %d scores including main score" % (len(self.df.columns.values) - 6))
