示例#1
0
def test_nn(test_nn_params,
            prep_result,
            sim_result,
            progress=None,
            pipeline=None):
    n_channels, n_cycles = sim_result.params.n_channels_and_cycles

    n_phases = 6 if test_nn_params.include_training_set else 3
    if pipeline is not None:
        pipeline.set_phase(0, n_phases)

    shape = sim_result.test_radmat.shape
    assert len(shape) == 4
    test_radmat = sim_result.test_radmat.reshape(
        (shape[0] * shape[1], shape[2], shape[3]))
    test_dyemat = sim_result.test_dyemat.reshape(
        (shape[0] * shape[1], shape[2], shape[3]))
    test_result = nn(
        test_nn_params,
        sim_result,
        radmat=test_radmat,
        true_dyemat=test_dyemat,
        progress=progress,
    )

    test_result.true_pep_iz = ArrayResult(
        filename="test_true_pep_iz",
        shape=(shape[0] * shape[1], ),
        dtype=IndexType,
        mode="w+",
    )
    test_result.true_pep_iz[:] = np.repeat(
        np.arange(shape[0]).astype(IndexType), shape[1])
    check.t(test_result.true_pep_iz, ArrayResult)
    check.t(test_result.pred_pep_iz, ArrayResult)

    call_bag = CallBag(
        true_pep_iz=test_result.true_pep_iz.arr(),
        pred_pep_iz=test_result.pred_pep_iz.arr(),
        scores=test_result.scores.arr(),
        prep_result=prep_result,
        sim_result=sim_result,
    )

    if pipeline is not None:
        pipeline.set_phase(1, n_phases)

    test_result.peps_pr = call_bag.pr_curve_by_pep(progress=progress)

    # If there is abundance information, compute the abundance-adjusted PR
    # This call returns None if there is no abundance info avail.
    if pipeline is not None:
        pipeline.set_phase(2, n_phases)

    test_result.peps_pr_abund = call_bag.pr_curve_by_pep_with_abundance(
        progress=progress)

    if test_nn_params.include_training_set:
        # Permit testing for over-fitting by classifying on the train data

        if pipeline is not None:
            pipeline.set_phase(3, n_phases)

        real_pep_iz = prep_result.peps__no_decoys().pep_i.values
        keep_rows = np.isin(sim_result.train_true_pep_iz, real_pep_iz)
        train_radmat = sim_result.train_radmat[keep_rows]
        train_dyemat = sim_result.train_dyemat[keep_rows]

        assert train_radmat.shape == shape

        train_result = nn(
            test_nn_params.use_gmm,
            sim_result,
            radmat=train_radmat,
            true_dyemat=train_dyemat,
            progress=progress,
        )
        train_result.true_pep_iz = sim_result.train_true_pep_iz
        train_result.true_pep_iz = ArrayResult(
            filename="train_true_pep_iz",
            shape=(shape[0] * shape[1], ),
            dtype=IndexType,
            mode="w+",
        )
        train_result.true_pep_iz[:] = np.repeat(
            np.arange(shape[0]).astype(IndexType), shape[1])
        check.t(train_result.true_pep_iz, ArrayResult)
        check.t(train_result.pred_pep_iz, ArrayResult)

        call_bag = CallBag(
            true_pep_iz=train_result.true_pep_iz.arr(),
            pred_pep_iz=train_result.pred_pep_iz.arr(),
            scores=train_result.scores.arr(),
            prep_result=prep_result,
            sim_result=sim_result,
        )

        if pipeline is not None:
            pipeline.set_phase(4, n_phases)

        train_result.peps_pr = call_bag.pr_curve_by_pep(progress=progress)

        if pipeline is not None:
            pipeline.set_phase(5, n_phases)

        train_result.peps_pr_abund = call_bag.pr_curve_by_pep_with_abundance(
            progress=progress)

    else:
        train_result = {k: None for k in test_result.keys()}

    def rename(d, prefix):
        return {f"{prefix}{k}": v for k, v in d.items()}

    return TestNNResult(
        params=test_nn_params,
        **rename(test_result, "test_"),
        **rename(train_result, "train_"),
    )
示例#2
0
def test_rf(
    test_rf_params,
    prep_result,
    sim_result,
    train_rf_result,
    progress=None,
    pipeline=None,
):
    n_phases = 6 if test_rf_params.include_training_set else 3
    classifier = train_rf_result.classifier

    if pipeline is not None:
        pipeline.set_phase(0, n_phases)

    test_pred_pep_iz, test_scores, test_all_class_scores = classifier.classify(
        sim_result.flat_test_radmat(), test_rf_params.keep_all_class_scores,
        progress)
    test_true_pep_iz = sim_result.test_true_pep_iz()

    # We do some PR calculation during the task so that this information is readily
    # available in results & notebooks don't need to recompute it (costly).
    # TODO: it is probably worth optimizing this by only doing PR for proteins of
    # interest if this has been specified for the run, since otherwise we'll be
    # computing full PR curves for every peptide in the background which is
    # probably not interesting.
    #
    call_bag = CallBag(
        true_pep_iz=test_true_pep_iz,
        pred_pep_iz=test_pred_pep_iz,
        scores=test_scores,
        all_class_scores=test_all_class_scores,
        prep_result=prep_result,
        sim_result=sim_result,
    )

    if pipeline is not None:
        pipeline.set_phase(1, n_phases)

    if pipeline is not None:
        pipeline.set_phase(2, n_phases)

    test_peps_pr = call_bag.pr_curve_by_pep(progress=progress)

    # If there is abundance information, compute the abundance-adjusted PR
    # This call returns None if there is no abundance info avail.
    test_peps_pr_abund = call_bag.pr_curve_by_pep_with_abundance(
        progress=progress)

    if test_rf_params.include_training_set:
        # Permit testing for over-fitting by classifying on the train data

        if pipeline is not None:
            pipeline.set_phase(3, n_phases)

        real_pep_iz = prep_result.peps__no_decoys().pep_i.values

        keep_rows = np.isin(sim_result.train_true_pep_iz, real_pep_iz)

        train_true_pep_iz = sim_result.train_true_pep_iz[keep_rows]
        train_radmat = sim_result.train_radmat[keep_rows]

        train_pred_pep_iz, train_scores, train_all_class_scores = classifier.classify(
            train_radmat, test_rf_params.keep_all_class_scores, progress)

        call_bag = CallBag(
            true_pep_iz=train_true_pep_iz,
            pred_pep_iz=train_pred_pep_iz,
            scores=train_scores,
            all_class_scores=train_all_class_scores,
            prep_result=prep_result,
            sim_result=sim_result,
        )

        if pipeline is not None:
            pipeline.set_phase(4, n_phases)

        train_peps_pr = call_bag.pr_curve_by_pep(progress=progress)

        if pipeline is not None:
            pipeline.set_phase(5, n_phases)

        train_peps_pr_abund = call_bag.pr_curve_by_pep_with_abundance(
            progress=progress)

    else:
        (
            train_pred_pep_iz,
            train_scores,
            train_all_class_scores,
            train_true_pep_iz,
            train_peps_pr,
            train_peps_pr_abund,
        ) = (
            None,
            None,
            None,
            None,
            None,
            None,
        )

    return TestRFResult(
        params=test_rf_params,
        test_true_pep_iz=test_true_pep_iz,
        test_pred_pep_iz=test_pred_pep_iz,
        test_scores=test_scores,
        test_all_class_scores=test_all_class_scores,
        test_peps_pr=test_peps_pr,
        test_peps_pr_abund=test_peps_pr_abund,
        train_true_pep_iz=train_true_pep_iz,
        train_pred_pep_iz=train_pred_pep_iz,
        train_scores=train_scores,
        train_all_class_scores=train_all_class_scores,
        train_peps_pr=train_peps_pr,
        train_peps_pr_abund=train_peps_pr_abund,
    )