Пример #1
0
    def check_pathes(self):
        """Determine the prefix used for naming output files.

        Returns ``self.prefix`` when the user supplied one; otherwise
        derives a prefix from the longest common start of the input file
        base names (extension stripped).

        Raises:
            Exception: if no prefix was given and the input file names
                share no common prefix.
        """
        if len(self.pathes) > 1 and not self.merge_results and self.prefix:
            logging.warn("ignore --target.prefix=%r" % self.prefix)

        # Fix: the original only assigned `prefix` inside the branch below,
        # so `return prefix` raised NameError whenever self.prefix was set.
        # Start from the configured value and only derive one when missing.
        prefix = self.prefix
        if prefix is None:
            prefixes = [os.path.splitext(os.path.basename(path))[0] for path in self.pathes]
            common_prefix = os.path.commonprefix(prefixes)
            # is always ok for not learning_mode, which includes that pathes has only one entry
            if not common_prefix:
                raise Exception("could not derive common prefix of input file names, please use "
                                "--target.prefix option")
            prefix = common_prefix
        return prefix
Пример #2
0
    def save_results(self, result, extra_writes, out_pathes):
        summ_stat_path = extra_writes.get("summ_stat_path")
        if summ_stat_path is not None:
            result.summary_statistics.to_csv(summ_stat_path, self.delim_out, index=False)
            print "WRITTEN: ", summ_stat_path

        full_stat_path = extra_writes.get("full_stat_path")
        if full_stat_path is not None:
            result.final_statistics.to_csv(full_stat_path, sep=self.delim_out, index=False)
            print "WRITTEN: ", full_stat_path

        for scored_table, out_path in zip(result.scored_tables, out_pathes):

            cutoff = CONFIG.get("d_score.cutoff")
            scored_table.to_csv(out_path.scored_table, out_path.filtered_table, cutoff, sep=self.delim_out, index=False)
            print "WRITTEN: ", out_path.scored_table
            print "WRITTEN: ", out_path.filtered_table

            if result.final_statistics is not None:

                cutoffs = result.final_statistics["cutoff"].values
                svalues = result.final_statistics["svalue"].values
                qvalues = result.final_statistics["qvalue"].values
                decoys, targets, top_decoys, top_targets = scored_table.scores()
                plot_data = save_report(
                    out_path.report, self.prefix, decoys, targets, top_decoys, top_targets, cutoffs, svalues, qvalues)
                print "WRITTEN: ", out_path.report

                cutoffs, svalues, qvalues, top_targets, top_decoys = plot_data
                for (name, values) in [("cutoffs", cutoffs), ("svalues", svalues), ("qvalues", qvalues),
                                       ("d_scores_top_target_peaks", top_targets),
                                       ("d_scores_top_decoy_peaks", top_decoys)]:
                    path = out_path[name]
                    with open(path, "w") as fp:
                        fp.write(" ".join("%e" % v for v in values))
                    print "WRITTEN: ", path

            if CONFIG.get("export.mayu"):
                if result.final_statistics:
                    export_mayu(out_pathes.mayu_cutoff, out_pathes.mayu_fasta,
                                out_pathes.mayu_csv, scored_table, result.final_statistics)
                    print "WRITTEN: ", out_pathes.mayu_cutoff
                    print "WRITTEN: ", out_pathes.mayu_fasta
                    print "WRITTEN: ", out_pathes.mayu_csv
                else:
                    logging.warn("can not write mayu table in this case")
Пример #3
0
def prepare_data_table(table, tg_id_name="transition_group_id",
                       decoy_name="decoy",
                       main_score_name=None,
                       score_columns=None,
                       ):
    """Build the canonical scoring data frame from a raw input table.

    Parameters:
        table: input data frame; must contain `tg_id_name` and
            `decoy_name` columns plus one "main_*" and one or more
            "var_*" score columns (or exactly the given `score_columns`).
        tg_id_name: name of the transition-group id column.
        decoy_name: name of the boolean decoy indicator column.
        main_score_name: explicit main score column; if None, a unique
            column starting with "main_" is looked up.
        score_columns: when applying a persisted classifier, the exact
            score columns that classifier was trained on.

    Returns:
        (df, all_score_columns): the prepared data frame and the tuple
        of score column names, main score first.

    Raises:
        Exception: on empty input, missing/ambiguous score columns,
            non-contiguous transition-group id blocks, or all-invalid
            score columns (unless configured to be ignored).
    """
    N = len(table)
    if not N:
        raise Exception("got empty input file")
    header = table.columns.values
    if score_columns is not None:
        missing = set(score_columns) - set(header)
        if missing:
            missing_txt = ", ".join(["'%s'" % m for m in missing])
            msg = "column(s) %s missing in input file for applying existing scorer" % missing_txt
            raise Exception(msg)

    assert tg_id_name in header, "colum %s not in table" % tg_id_name
    assert decoy_name in header, "colum %s not in table" % decoy_name

    if score_columns is not None:
        # we assume there is exactly one main_score in score_columns as we checked that in
        # the run which persisted the classifier:
        var_column_names = [c for c in score_columns if c.startswith("var_")]
        main_score_name = [c for c in score_columns if c.startswith("main_")][0]
    else:
        if main_score_name is not None:
            assert main_score_name in header, "colum %s not in table" % main_score_name

        # if no main_score_name provided, look for unique column with name
        # starting with "main_":
        else:
            main_columns = set(c for c in header if c.startswith("main_"))
            if not main_columns:
                raise Exception("no column with main_* in table(s)")
            if len(main_columns) > 1:
                raise Exception("multiple columns with name main_* in table(s)")
            main_score_name = main_columns.pop()

        # get all other score columns, name beginning with "var_"
        var_column_names = tuple(h for h in header if h.startswith("var_"))

        if not var_column_names:
            raise Exception("no column with name var_* in table(s)")

    # collect needed data:
    empty_col = [0] * N
    empty_none_col = [None] * N

    tg_ids = table[tg_id_name]

    if not check_for_unique_blocks(tg_ids):
        raise Exception("transition group ids do not form unique blocks in data file")

    # map each transition group id to a dense numeric id:
    tg_map = dict()
    for i, tg_id in enumerate(tg_ids.unique()):
        tg_map[tg_id] = i
    tg_num_ids = [tg_map[tg_id] for tg_id in tg_ids]

    data = dict(tg_id=tg_ids.values,
                tg_num_id=tg_num_ids,
                is_decoy=table[decoy_name].values.astype(bool),
                is_top_peak=empty_col,
                is_train=empty_none_col,
                main_score=table[main_score_name].values,
                )

    ignore_invalid_scores = CONFIG["ignore.invalid_score_columns"]
    column_names = ["tg_id", "tg_num_id", "is_decoy", "is_top_peak", "is_train", "main_score"]
    for i, v in enumerate(var_column_names):
        col_name = "var_%d" % i
        col_data = table[v]
        if pd.isnull(col_data).all():
            msg = "column %s contains only invalid/missing values" % v
            # Fix: the format strings below were missing the "% msg" argument,
            # so the literal "%s" was logged / raised instead of the message.
            if ignore_invalid_scores:
                logging.warn("%s. pyprophet skips this." % msg)
                continue
            raise Exception("%s. you may use --ignore.invalid_score_columns" % msg)
        data[col_name] = col_data
        column_names.append(col_name)

    data["classifier_score"] = empty_col
    column_names.append("classifier_score")

    # build data frame:
    df = pd.DataFrame(data, columns=column_names)

    all_score_columns = (main_score_name,) + tuple(var_column_names)
    df = cleanup_and_check(df)
    return df, all_score_columns
Пример #4
0
    def save_results(self, result, extra_writes, out_pathes, pvalues):
        summ_stat_path = extra_writes.get("summ_stat_path")
        if summ_stat_path is not None:
            result.summary_statistics.to_csv(summ_stat_path,
                                             self.delim_out,
                                             index=False)
            print "WRITTEN: ", summ_stat_path

        full_stat_path = extra_writes.get("full_stat_path")
        if full_stat_path is not None:
            result.final_statistics.to_csv(full_stat_path,
                                           sep=self.delim_out,
                                           index=False)
            print "WRITTEN: ", full_stat_path

        for input_path, scored_table, out_path in zip(self.pathes,
                                                      result.scored_tables,
                                                      out_pathes):

            cutoff = CONFIG.get("d_score.cutoff")
            scored_table.to_csv(out_path.scored_table,
                                out_path.filtered_table,
                                cutoff,
                                sep=self.delim_out,
                                index=False)
            print "WRITTEN: ", out_path.scored_table
            print "WRITTEN: ", out_path.filtered_table

            if CONFIG.get("rewrite_sqmass"):

                # get basepath
                basepath = input_path.split(".tsv")[0]
                basepath = basepath.split(".txt")[0]
                basepath = basepath.split(".csv")[0]

                # try to find a matching sqMass file
                sqmass_file = None
                if os.path.exists(basepath + ".chrom.sqMass"):
                    sqmass_file = basepath + ".chrom.sqMass"
                elif os.path.exists(basepath + ".sqMass"):
                    sqmass_file = basepath + ".sqMass"

                # get selected chromatograms on the filtered table
                df = scored_table.df[scored_table.df.d_score > cutoff]
                fragment_anno = df.aggr_Fragment_Annotation.unique()
                prec_anno = df.aggr_prec_Fragment_Annotation.unique()

                labels = []
                for l in fragment_anno:
                    labels.extend(l.split(";"))
                for l in prec_anno:
                    labels.extend(l.split(";"))

                filterChromByLabels(sqmass_file, out_path.filtered_chroms,
                                    labels)

            if result.final_statistics is not None:

                cutoffs = result.final_statistics["cutoff"].values
                svalues = result.final_statistics["svalue"].values
                qvalues = result.final_statistics["qvalue"].values
                # pvalues = result.final_statistics["pvalue"].values
                decoys, targets, top_decoys, top_targets = scored_table.scores(
                )
                lambda_ = CONFIG.get("final_statistics.lambda")
                plot_data = save_report(out_path.report, self.prefix, decoys,
                                        targets, top_decoys, top_targets,
                                        cutoffs, svalues, qvalues, pvalues,
                                        lambda_)
                print "WRITTEN: ", out_path.report

                cutoffs, svalues, qvalues, top_targets, top_decoys = plot_data
                for (name,
                     values) in [("cutoffs", cutoffs), ("svalues", svalues),
                                 ("qvalues", qvalues),
                                 ("d_scores_top_target_peaks", top_targets),
                                 ("d_scores_top_decoy_peaks", top_decoys)]:
                    path = out_path[name]
                    with open(path, "w") as fp:
                        fp.write(" ".join("%e" % v for v in values))
                    print "WRITTEN: ", path

            if CONFIG.get("export.mayu"):
                if result.final_statistics is not None:
                    export_mayu(out_pathes[0]['mayu_cutoff'],
                                out_pathes[0]['mayu_fasta'],
                                out_pathes[0]['mayu_csv'], scored_table,
                                result.final_statistics)
                    print "WRITTEN: ", out_pathes[0]['mayu_cutoff']
                    print "WRITTEN: ", out_pathes[0]['mayu_fasta']
                    print "WRITTEN: ", out_pathes[0]['mayu_csv']
                else:
                    logging.warn("can not write mayu table in this case")
Пример #5
0
def prepare_data_table(
    table,
    tg_id_name="transition_group_id",
    decoy_name="decoy",
    main_score_name=None,
    score_columns=None,
):
    """Build the canonical scoring data frame from a raw input table.

    Parameters:
        table: input data frame; must contain `tg_id_name` and
            `decoy_name` columns plus one "main_*" and one or more
            "var_*" score columns (or exactly the given `score_columns`).
        tg_id_name: name of the transition-group id column.
        decoy_name: name of the boolean decoy indicator column.
        main_score_name: explicit main score column; if None, a unique
            column starting with "main_" is looked up.
        score_columns: when applying a persisted classifier, the exact
            score columns that classifier was trained on.

    Returns:
        (df, all_score_columns): the prepared data frame and the tuple
        of score column names, main score last.

    Raises:
        Exception: on empty input, missing/ambiguous score columns, or
            all-invalid score columns (unless configured to be ignored).
    """
    N = len(table)
    if not N:
        raise Exception("got empty input file")
    header = table.columns.values
    if score_columns is not None:
        missing = set(score_columns) - set(header)
        if missing:
            missing_txt = ", ".join(["'%s'" % m for m in missing])
            msg = "column(s) %s missing in input file for applying existing scorer" % missing_txt
            raise Exception(msg)

    assert tg_id_name in header, "colum %s not in table" % tg_id_name
    assert decoy_name in header, "colum %s not in table" % decoy_name

    if score_columns is not None:
        # we assume there is exactly one main_score in score_columns as we checked that in
        # the run which persisted the classifier:
        var_column_names = [c for c in score_columns if c.startswith("var_")]
        main_score_name = [c for c in score_columns
                           if c.startswith("main_")][0]
    else:
        if main_score_name is not None:
            assert main_score_name in header, "colum %s not in table" % main_score_name

        # if no main_score_name provided, look for unique column with name
        # starting with "main_":
        else:
            main_columns = set(c for c in header if c.startswith("main_"))
            if not main_columns:
                raise Exception("no column with main_* in table(s)")
            if len(main_columns) > 1:
                raise Exception(
                    "multiple columns with name main_* in table(s)")
            main_score_name = main_columns.pop()

        # get all other score columns, name beginning with "var_"
        var_column_names = tuple(h for h in header if h.startswith("var_"))

        if not var_column_names:
            raise Exception("no column with name var_* in table(s)")

    # collect needed data:
    empty_col = [0] * N
    empty_none_col = [None] * N

    tg_ids = table[tg_id_name]

    # map each transition group id to a dense numeric id:
    tg_map = dict()
    for i, tg_id in enumerate(tg_ids.unique()):
        tg_map[tg_id] = i
    tg_num_ids = [tg_map[tg_id] for tg_id in tg_ids]

    data = dict(
        tg_id=tg_ids.values,
        tg_num_id=tg_num_ids,
        is_decoy=table[decoy_name].values.astype(bool),
        is_top_peak=empty_col,
        is_train=empty_none_col,
        main_score=table[main_score_name].values,
    )

    ignore_invalid_scores = CONFIG["ignore.invalid_score_columns"]
    column_names = [
        "tg_id", "tg_num_id", "is_decoy", "is_top_peak", "is_train",
        "main_score"
    ]
    for i, v in enumerate(var_column_names):
        col_name = "var_%d" % i
        col_data = table[v]
        if pd.isnull(col_data).all():
            msg = "column %s contains only invalid/missing values" % v
            # Fix: the format strings below were missing the "% msg" argument,
            # so the literal "%s" was logged / raised instead of the message.
            if ignore_invalid_scores:
                logging.warn("%s. pyprophet skips this." % msg)
                continue
            raise Exception("%s. you may use --ignore.invalid_score_columns" % msg)
        data[col_name] = col_data
        column_names.append(col_name)

    data["classifier_score"] = empty_col
    column_names.append("classifier_score")

    # build data frame:
    df = pd.DataFrame(data, columns=column_names)

    all_score_columns = tuple(var_column_names) + (main_score_name, )
    df = cleanup_and_check(df)
    return df, all_score_columns