# Excerpted from pyprophet. At module level these functions additionally rely
# on the global CONFIG dict and on helpers such as save_report, export_mayu,
# cleanup_and_check, check_for_unique_blocks and filterChromByLabels, which
# are defined elsewhere in the package.
import logging
import os

import pandas as pd


def check_pathes(self):
    if len(self.pathes) > 1 and not self.merge_results and self.prefix:
        logging.warn("ignore --target.prefix=%r" % self.prefix)
    # fall back to the configured prefix, otherwise derive one below:
    prefix = self.prefix
    if self.prefix is None:
        prefixes = [os.path.splitext(os.path.basename(path))[0] for path in self.pathes]
        common_prefix = os.path.commonprefix(prefixes)
        # this is always ok when not in learning mode, which implies that
        # pathes has only one entry:
        if not common_prefix:
            raise Exception("could not derive common prefix of input file names, please use "
                            "--target.prefix option")
        prefix = common_prefix
    return prefix
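
# Hedged illustration, not part of the module above: how check_pathes derives
# a default prefix. os.path.commonprefix compares strings character by
# character, so the basenames (extensions stripped) only need to share a
# leading run of characters. The file names below are invented.
def _demo_common_prefix():
    pathes = ["data/run1_part1.csv", "data/run1_part2.csv"]
    prefixes = [os.path.splitext(os.path.basename(p))[0] for p in pathes]
    # prefixes == ["run1_part1", "run1_part2"]
    return os.path.commonprefix(prefixes)  # -> "run1_part"
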
def save_results(self, result, extra_writes, out_pathes):
    summ_stat_path = extra_writes.get("summ_stat_path")
    if summ_stat_path is not None:
        result.summary_statistics.to_csv(summ_stat_path, sep=self.delim_out, index=False)
        print "WRITTEN: ", summ_stat_path

    full_stat_path = extra_writes.get("full_stat_path")
    if full_stat_path is not None:
        result.final_statistics.to_csv(full_stat_path, sep=self.delim_out, index=False)
        print "WRITTEN: ", full_stat_path

    for scored_table, out_path in zip(result.scored_tables, out_pathes):
        cutoff = CONFIG.get("d_score.cutoff")
        scored_table.to_csv(out_path.scored_table, out_path.filtered_table, cutoff,
                            sep=self.delim_out, index=False)
        print "WRITTEN: ", out_path.scored_table
        print "WRITTEN: ", out_path.filtered_table

        if result.final_statistics is not None:
            cutoffs = result.final_statistics["cutoff"].values
            svalues = result.final_statistics["svalue"].values
            qvalues = result.final_statistics["qvalue"].values
            decoys, targets, top_decoys, top_targets = scored_table.scores()
            plot_data = save_report(out_path.report, self.prefix, decoys, targets,
                                    top_decoys, top_targets, cutoffs, svalues, qvalues)
            print "WRITTEN: ", out_path.report

            cutoffs, svalues, qvalues, top_targets, top_decoys = plot_data
            for (name, values) in [("cutoffs", cutoffs),
                                   ("svalues", svalues),
                                   ("qvalues", qvalues),
                                   ("d_scores_top_target_peaks", top_targets),
                                   ("d_scores_top_decoy_peaks", top_decoys)]:
                path = out_path[name]
                with open(path, "w") as fp:
                    fp.write(" ".join("%e" % v for v in values))
                print "WRITTEN: ", path

    if CONFIG.get("export.mayu"):
        # testing a DataFrame for truth raises in pandas, so compare to None:
        if result.final_statistics is not None:
            # out_pathes is a list; the mayu files live on its first entry:
            export_mayu(out_pathes[0].mayu_cutoff, out_pathes[0].mayu_fasta,
                        out_pathes[0].mayu_csv, scored_table, result.final_statistics)
            print "WRITTEN: ", out_pathes[0].mayu_cutoff
            print "WRITTEN: ", out_pathes[0].mayu_fasta
            print "WRITTEN: ", out_pathes[0].mayu_csv
        else:
            logging.warn("can not write mayu table in this case")
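
# Hedged sketch of a reader for the value files written above (this helper is
# not part of pyprophet): each file holds one line of space-separated floats
# in "%e" notation, so reading it back is a split plus float().
def _read_values_file(path):
    with open(path, "r") as fp:
        return [float(tok) for tok in fp.read().split()]
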
def prepare_data_table(table, tg_id_name="transition_group_id",
                       decoy_name="decoy",
                       main_score_name=None,
                       score_columns=None,
                       ):

    N = len(table)
    if not N:
        raise Exception("got empty input file")

    header = table.columns.values
    if score_columns is not None:
        missing = set(score_columns) - set(header)
        if missing:
            missing_txt = ", ".join(["'%s'" % m for m in missing])
            msg = "column(s) %s missing in input file for applying existing scorer" % missing_txt
            raise Exception(msg)

    assert tg_id_name in header, "column %s not in table" % tg_id_name
    assert decoy_name in header, "column %s not in table" % decoy_name

    if score_columns is not None:
        # we assume there is exactly one main_score in score_columns as we checked that in
        # the run which persisted the classifier:
        var_column_names = [c for c in score_columns if c.startswith("var_")]
        main_score_name = [c for c in score_columns if c.startswith("main_")][0]
    else:
        if main_score_name is not None:
            assert main_score_name in header, "column %s not in table" % main_score_name
        # if no main_score_name is provided, look for a unique column whose
        # name starts with "main_":
        else:
            main_columns = set(c for c in header if c.startswith("main_"))
            if not main_columns:
                raise Exception("no column with main_* in table(s)")
            if len(main_columns) > 1:
                raise Exception("multiple columns with name main_* in table(s)")
            main_score_name = main_columns.pop()

        # get all other score columns, their names begin with "var_":
        var_column_names = tuple(h for h in header if h.startswith("var_"))

    if not var_column_names:
        raise Exception("no column with name var_* in table(s)")

    # collect needed data:
    empty_col = [0] * N
    empty_none_col = [None] * N

    tg_ids = table[tg_id_name]

    if not check_for_unique_blocks(tg_ids):
        raise Exception("transition group ids do not form unique blocks in data file")

    tg_map = dict()
    for i, tg_id in enumerate(tg_ids.unique()):
        tg_map[tg_id] = i
    tg_num_ids = [tg_map[tg_id] for tg_id in tg_ids]

    data = dict(tg_id=tg_ids.values,
                tg_num_id=tg_num_ids,
                is_decoy=table[decoy_name].values.astype(bool),
                is_top_peak=empty_col,
                is_train=empty_none_col,
                main_score=table[main_score_name].values,
                )

    ignore_invalid_scores = CONFIG["ignore.invalid_score_columns"]
    column_names = ["tg_id", "tg_num_id", "is_decoy", "is_top_peak", "is_train", "main_score"]
    for i, v in enumerate(var_column_names):
        col_name = "var_%d" % i
        col_data = table[v]
        if pd.isnull(col_data).all():
            msg = "column %s contains only invalid/missing values" % v
            if ignore_invalid_scores:
                logging.warn("%s. pyprophet skips this." % msg)
                continue
            raise Exception("%s. you may use --ignore.invalid_score_columns" % msg)
        data[col_name] = col_data
        column_names.append(col_name)

    data["classifier_score"] = empty_col
    column_names.append("classifier_score")

    # build data frame:
    df = pd.DataFrame(data, columns=column_names)

    all_score_columns = (main_score_name,) + tuple(var_column_names)
    df = cleanup_and_check(df)
    return df, all_score_columns
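
# check_for_unique_blocks is defined elsewhere in pyprophet; the sketch below
# is an assumption reconstructed from its name and the error message above,
# not the original implementation. It returns True iff every transition group
# id occupies one contiguous block of rows, i.e. no id reappears after a
# different id has been seen.
def _check_for_unique_blocks_sketch(tg_ids):
    seen = set()
    last_tg_id = object()  # sentinel which compares unequal to any real id
    for tg_id in tg_ids:
        if tg_id != last_tg_id:
            if tg_id in seen:
                # the id already showed up in an earlier block:
                return False
            seen.add(tg_id)
            last_tg_id = tg_id
    return True
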
def save_results(self, result, extra_writes, out_pathes, pvalues):
    summ_stat_path = extra_writes.get("summ_stat_path")
    if summ_stat_path is not None:
        result.summary_statistics.to_csv(summ_stat_path, sep=self.delim_out, index=False)
        print "WRITTEN: ", summ_stat_path

    full_stat_path = extra_writes.get("full_stat_path")
    if full_stat_path is not None:
        result.final_statistics.to_csv(full_stat_path, sep=self.delim_out, index=False)
        print "WRITTEN: ", full_stat_path

    for input_path, scored_table, out_path in zip(self.pathes, result.scored_tables,
                                                  out_pathes):
        cutoff = CONFIG.get("d_score.cutoff")
        scored_table.to_csv(out_path.scored_table, out_path.filtered_table, cutoff,
                            sep=self.delim_out, index=False)
        print "WRITTEN: ", out_path.scored_table
        print "WRITTEN: ", out_path.filtered_table

        if CONFIG.get("rewrite_sqmass"):
            # get the base path by stripping a known text extension:
            basepath = input_path.split(".tsv")[0]
            basepath = basepath.split(".txt")[0]
            basepath = basepath.split(".csv")[0]

            # try to find a matching sqMass file:
            sqmass_file = None
            if os.path.exists(basepath + ".chrom.sqMass"):
                sqmass_file = basepath + ".chrom.sqMass"
            elif os.path.exists(basepath + ".sqMass"):
                sqmass_file = basepath + ".sqMass"

            if sqmass_file is None:
                # without a matching sqMass file there is nothing to rewrite:
                logging.warn("no sqMass file found for %s, skip chromatogram rewrite"
                             % input_path)
            else:
                # get selected chromatograms from the filtered table:
                df = scored_table.df[scored_table.df.d_score > cutoff]
                fragment_anno = df.aggr_Fragment_Annotation.unique()
                prec_anno = df.aggr_prec_Fragment_Annotation.unique()

                labels = []
                for l in fragment_anno:
                    labels.extend(l.split(";"))
                for l in prec_anno:
                    labels.extend(l.split(";"))

                filterChromByLabels(sqmass_file, out_path.filtered_chroms, labels)

        if result.final_statistics is not None:
            cutoffs = result.final_statistics["cutoff"].values
            svalues = result.final_statistics["svalue"].values
            qvalues = result.final_statistics["qvalue"].values
            # pvalues = result.final_statistics["pvalue"].values
            decoys, targets, top_decoys, top_targets = scored_table.scores()
            lambda_ = CONFIG.get("final_statistics.lambda")
            plot_data = save_report(out_path.report, self.prefix, decoys, targets,
                                    top_decoys, top_targets, cutoffs, svalues, qvalues,
                                    pvalues, lambda_)
            print "WRITTEN: ", out_path.report

            cutoffs, svalues, qvalues, top_targets, top_decoys = plot_data
            for (name, values) in [("cutoffs", cutoffs),
                                   ("svalues", svalues),
                                   ("qvalues", qvalues),
                                   ("d_scores_top_target_peaks", top_targets),
                                   ("d_scores_top_decoy_peaks", top_decoys)]:
                path = out_path[name]
                with open(path, "w") as fp:
                    fp.write(" ".join("%e" % v for v in values))
                print "WRITTEN: ", path

    if CONFIG.get("export.mayu"):
        if result.final_statistics is not None:
            export_mayu(out_pathes[0]['mayu_cutoff'], out_pathes[0]['mayu_fasta'],
                        out_pathes[0]['mayu_csv'], scored_table, result.final_statistics)
            print "WRITTEN: ", out_pathes[0]['mayu_cutoff']
            print "WRITTEN: ", out_pathes[0]['mayu_fasta']
            print "WRITTEN: ", out_pathes[0]['mayu_csv']
        else:
            logging.warn("can not write mayu table in this case")
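
# Illustration with invented data, not part of the module above: how the
# chromatogram labels for filterChromByLabels are collected. Both annotation
# columns hold semicolon-joined transition ids per row; splitting and
# concatenating them yields one flat label list.
def _demo_collect_labels():
    fragment_anno = ["tr1;tr2", "tr3"]
    prec_anno = ["prec1;prec2"]
    labels = []
    for l in fragment_anno:
        labels.extend(l.split(";"))
    for l in prec_anno:
        labels.extend(l.split(";"))
    return labels  # -> ["tr1", "tr2", "tr3", "prec1", "prec2"]
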
def prepare_data_table(
        table,
        tg_id_name="transition_group_id",
        decoy_name="decoy",
        main_score_name=None,
        score_columns=None,
):

    N = len(table)
    if not N:
        raise Exception("got empty input file")

    header = table.columns.values
    if score_columns is not None:
        missing = set(score_columns) - set(header)
        if missing:
            missing_txt = ", ".join(["'%s'" % m for m in missing])
            msg = "column(s) %s missing in input file for applying existing scorer" % missing_txt
            raise Exception(msg)

    assert tg_id_name in header, "column %s not in table" % tg_id_name
    assert decoy_name in header, "column %s not in table" % decoy_name

    if score_columns is not None:
        # we assume there is exactly one main_score in score_columns as we checked that in
        # the run which persisted the classifier:
        var_column_names = [c for c in score_columns if c.startswith("var_")]
        main_score_name = [c for c in score_columns if c.startswith("main_")][0]
    else:
        if main_score_name is not None:
            assert main_score_name in header, "column %s not in table" % main_score_name
        # if no main_score_name is provided, look for a unique column whose
        # name starts with "main_":
        else:
            main_columns = set(c for c in header if c.startswith("main_"))
            if not main_columns:
                raise Exception("no column with main_* in table(s)")
            if len(main_columns) > 1:
                raise Exception("multiple columns with name main_* in table(s)")
            main_score_name = main_columns.pop()

        # get all other score columns, their names begin with "var_":
        var_column_names = tuple(h for h in header if h.startswith("var_"))

    if not var_column_names:
        raise Exception("no column with name var_* in table(s)")

    # collect needed data:
    empty_col = [0] * N
    empty_none_col = [None] * N

    tg_ids = table[tg_id_name]

    tg_map = dict()
    for i, tg_id in enumerate(tg_ids.unique()):
        tg_map[tg_id] = i
    tg_num_ids = [tg_map[tg_id] for tg_id in tg_ids]

    data = dict(
        tg_id=tg_ids.values,
        tg_num_id=tg_num_ids,
        is_decoy=table[decoy_name].values.astype(bool),
        is_top_peak=empty_col,
        is_train=empty_none_col,
        main_score=table[main_score_name].values,
    )

    ignore_invalid_scores = CONFIG["ignore.invalid_score_columns"]
    column_names = [
        "tg_id", "tg_num_id", "is_decoy", "is_top_peak", "is_train", "main_score"
    ]
    for i, v in enumerate(var_column_names):
        col_name = "var_%d" % i
        col_data = table[v]
        if pd.isnull(col_data).all():
            msg = "column %s contains only invalid/missing values" % v
            if ignore_invalid_scores:
                logging.warn("%s. pyprophet skips this." % msg)
                continue
            raise Exception("%s. you may use --ignore.invalid_score_columns" % msg)
        data[col_name] = col_data
        column_names.append(col_name)

    data["classifier_score"] = empty_col
    column_names.append("classifier_score")

    # build data frame:
    df = pd.DataFrame(data, columns=column_names)

    all_score_columns = tuple(var_column_names) + (main_score_name,)
    df = cleanup_and_check(df)
    return df, all_score_columns
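
# Hedged usage sketch with toy data (assumes CONFIG and the helper functions
# used by prepare_data_table are set up as in the full package; column values
# are invented): the input table needs the transition group id column, a
# decoy flag, exactly one main_* score and at least one var_* score. Note
# that the two variants above order all_score_columns differently, so callers
# should rely on the main_/var_ name prefixes rather than on position.
def _demo_prepare_data_table():
    table = pd.DataFrame(dict(
        transition_group_id=["tg1", "tg1", "tg2"],
        decoy=[0, 0, 1],
        main_score_xic=[2.0, 1.5, 0.3],
        var_rt_delta=[0.1, 0.4, 0.9],
    ))
    df, score_columns = prepare_data_table(table)
    # score_columns contains "main_score_xic" and "var_rt_delta"
    return df, score_columns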