示例#1
0
    def _fit_example(self, **kwargs):
        """Fit a ``CoxnetSurvivalAnalysis`` model on the data in ``EXAMPLE_FILE``.

        Parameters
        ----------
        **kwargs
            Forwarded unchanged to the ``CoxnetSurvivalAnalysis`` constructor.

        Returns
        -------
        tuple
            ``(x, y, coxnet)``: the features and outcome returned by
            ``get_x_y`` and the estimator after ``fit`` has been called.
        """
        data = pandas.read_csv(EXAMPLE_FILE)
        features, outcome = get_x_y(data, ["status", "time"], pos_label=1)

        model = CoxnetSurvivalAnalysis(**kwargs)
        # The estimator is fitted on the underlying values, not on the frame.
        model.fit(features.values, outcome)

        return features, outcome, model
示例#2
0
    def test_alpha_too_small():
        """Fitting a row subset with one fixed alpha must raise ``ArithmeticError``."""
        features, outcome = load_breast_cancer()
        encoded = OneHotEncoder().fit_transform(features)

        # Row positions of the subset the model is fitted on.
        rows = numpy.array([
            0, 1, 2, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21,
            22, 23, 25, 26, 27, 28, 29, 30, 31, 33, 34, 36, 39, 40, 41, 42, 43,
            44, 45, 46, 47, 48, 49, 51, 52, 53, 54, 56, 57, 58, 60, 61, 62, 63,
            64, 65, 66, 68, 70, 71, 72, 75, 76, 78, 79, 80, 82, 84, 85, 86, 87,
            88, 90, 91, 92, 93, 94, 95, 98, 99, 100, 102, 103, 104, 105, 107,
            108, 109, 110, 111, 113, 114, 115, 116, 117, 118, 119, 120, 121,
            124, 125, 126, 127, 128, 130, 131, 132, 133, 135, 136, 137, 138,
            139, 140, 143, 144, 145, 147, 148, 150, 151, 153, 154, 155, 156,
            157, 158, 160, 161, 164, 165, 166, 167, 168, 169, 170, 171, 172,
            174, 175, 177, 178, 180, 181, 182, 183, 184, 185, 186, 187, 188,
            190, 191, 192, 193, 194, 195, 196, 197
        ])

        coxnet = CoxnetSurvivalAnalysis(l1_ratio=1.0,
                                        alphas=[0.007295025406624247])
        subset_x = encoded.iloc[rows]
        subset_y = outcome[rows]

        expected_message = "Numerical error, because weights are too large. Consider increasing alpha."
        with pytest.raises(ArithmeticError, match=expected_message):
            coxnet.fit(subset_x, subset_y)
    def test_simple(self):
        """Fit with ``l1_ratio=1.0`` on five samples and compare against reference values.

        The leading entries of ``alphas_`` are compared with a hard-coded
        reference array, and the matching columns of ``coef_`` with the
        contents of ``SIMPLE_COEF_FILE``.
        """
        # Builtin ``bool`` instead of ``numpy.bool``: that alias was deprecated
        # in NumPy 1.20 and is missing from NumPy 1.24-1.26, where accessing it
        # raises AttributeError. ``bool`` yields the same dtype on all versions.
        y = numpy.empty(dtype=[("D", bool), ("Y", numpy.float64)], shape=5)
        y["D"] = [True, False, False, True, False]
        y["Y"] = [7., 8., 11., 11., 23.]

        x = pandas.DataFrame({"F1": [1, 1, 1, 0, 0],
                              "F2": [23, 43, 54, 75, 67],
                              "F3": [120, 98, 78, 91, 79],
                              "F4": [0.123, 0.541, 0.784, 0.846, 0.331]})

        coxnet = CoxnetSurvivalAnalysis(l1_ratio=1.0)
        coxnet.fit(x.values, y)

        expected_alphas = numpy.array(
            [7.02666666666667, 6.40243696630484, 5.83366211207401, 5.31541564828386, 4.84320877198972, 4.41295145312887,
             4.02091700863675, 3.66370982370111, 3.3382359405709, 3.04167626017436, 2.77146212443153, 2.52525306776672,
             2.30091654511542, 2.09650946083909, 1.91026133856035, 1.74055898614351, 1.5859325229961, 1.44504264866632,
             1.31666904246323, 1.19969979362274, 1.09312177046848, 0.996011845149902, 0.907528897950459,
             0.826906531910992, 0.753446434665921, 0.686512329995589, 0.625524466706047, 0.569954597101554,
             0.519321401555745, 0.473186319551291, 0.431149751078499, 0.392847595491192, 0.357948097841098,
             0.326148975375191, 0.297174799307102, 0.270774609184727, 0.24671973919085, 0.22480183754923,
             0.204831061881182, 0.186634434881721, 0.170054346072885, 0.154947186657187, 0.141182105646904,
             0.128639876495421, 0.117211864413924, 0.106799085428826, 0.0973113490299429, 0.0886664769834391,
             0.0807895915432809, 0.0736124668960205, 0.0670729382214382])
        n_expected = len(expected_alphas)

        # FIXME: only the first ``n_expected`` entries of ``alphas_`` (and the
        # matching ``coef_`` columns below) are compared; presumably the fitted
        # path is longer than the reference -- confirm and resolve.
        assert_array_almost_equal(expected_alphas, coxnet.alphas_[:n_expected])

        coef = pandas.DataFrame(coxnet.coef_[:, :n_expected],
                                dtype=float)
        expected_coef = pandas.read_csv(SIMPLE_COEF_FILE, header=None, skiprows=1)

        assert_columns_almost_equal(coef, expected_coef)
    def test_breast_example(self):
        """Fit with ``l1_ratio=1.0`` on the breast cancer data and compare against reference values.

        Checks ``alphas_``, ``deviance_ratio_`` and ``coef_`` of the fitted
        model; the coefficients are compared with the contents of
        ``BREAST_CANCER_COEFFICIENTS_FILE``.
        """
        alphas_ref = numpy.array([
            0.207764947265866, 0.189307681974955, 0.172490109262135, 0.157166563357949, 0.143204319038428,
            0.130482442022696, 0.118890741498079, 0.108328815700004, 0.0987051822799425, 0.0899364859290742,
            0.0819467763944772, 0.0746668506343715, 0.0680336534144775, 0.0619897311537413, 0.0564827342889011,
            0.051464963847614, 0.046892958302776, 0.0427271171295661, 0.0389313578046448, 0.0354728032765984,
            0.0323214972006479, 0.0294501444711215, 0.0268338748043064, 0.0244500273239498, 0.0222779542835891,
            0.0202988422256499, 0.0184955490282766, 0.0168524554284737, 0.0153553297355215, 0.0139912045628799,
            0.0127482645108893, 0.0116157438274312, 0.0105838331601337, 0.00964359459245389, 0.00878688422772072,
            0.00800628165059773, 0.0072950256549955, 0.0066469556817389, 0.00605645845875073, 0.00551841938157428,
            0.00502817821311635, 0.00458148871890295, 0.00417448188822764, 0.00380363242263169, 0.00346572820145532,
            0.00315784245998521, 0.00287730843921864, 0.00262169628767281, 0.00238879201517371, 0.00217657831633235,
            0.00198321709761059, 0.00180703355663423, 0.00164650167585602, 0.00150023100492174, 0.0013669546172544,
            0.00124551813654232, 0.00113486973808373, 0.00103405103838443, 0.000942188794098442, 0.000858487338411865,
            0.000782221689357606, 0.00071273127036839, 0.000649414188678556, 0.000591722022016858, 0.00053915506843511,
            0.00049125801812897, 0.000447616009762226, 0.000407851037136367, 0.000371618675081733, 0.000338605096211458,
            0.000308524352698783, 0.00028111589953377, 0.000256142337807075, 0.000233387358474159, 0.000212653868789829,
            0.000193762285185162, 0.000176548977800548, 0.000160864853202119, 0.000146574063005757,
            0.000133552827223371, 0.000121688362139862, 0.000110877903434536, 0.000101027816085719,
            9.20527833489927e-05, 8.38750677843702e-05, 7.64238379317803e-05, 6.96345548028444e-05, 6.34484128750348e-05
        ])
        deviance_ratio_ref = numpy.array([
            0, 0.00741462796207568, 0.0135178719105177, 0.0183232499901932, 0.0221250029051101, 0.0251530137843965,
            0.0275599035016693, 0.0298664819929119, 0.033763232356598, 0.0374249162331977, 0.0409637006907067,
            0.0454486054162627, 0.0551615080395675, 0.0651612844343542, 0.0736024993960834, 0.0808820441173129,
            0.0894426534710234, 0.0992239010000626, 0.108910229105339, 0.121376204780063, 0.134004998770465,
            0.145079557491685, 0.156667501995989, 0.167543840680748, 0.178622131991811, 0.189365153169168,
            0.199027839424271, 0.20909726215036, 0.218610320633419, 0.228024278642459, 0.238171883969976,
            0.248070501745195, 0.258480494697342, 0.268971907277929, 0.280744803445048, 0.291329662029924,
            0.300942928439923, 0.309972153913063, 0.318315812887558, 0.325822700491885, 0.332992506325249,
            0.339665277042211, 0.345876707002969, 0.351605625998246, 0.357206102668659, 0.362484660673399,
            0.367624391654207, 0.372275248793233, 0.37674043994605, 0.380887801196039, 0.384795899779142,
            0.388569806807258, 0.392075171498793, 0.395375481018565, 0.398377579969751, 0.400997300805061,
            0.403375467852471, 0.405431976972633, 0.407443593366561, 0.409668341757423, 0.411628734365416,
            0.413367576771339, 0.414896999887021, 0.416268233594787, 0.417475290203319, 0.418554781508749,
            0.419526121036389, 0.420522904669104, 0.421455233639571, 0.422296101083462, 0.423049677446171,
            0.423716974236606, 0.424302533927477, 0.424825925226932, 0.425286695396174, 0.425693415010937,
            0.426052733081791, 0.426369464812111, 0.426652822940747, 0.42686317150694, 0.427072533094355,
            0.427264216646862, 0.427427314063872, 0.427573225735422, 0.427700379783919, 0.427814235325525,
            0.427912925916531, 0.427998148400703
        ])

        raw_features, outcome = load_breast_cancer()
        features = column.encode_categorical(raw_features)

        model = CoxnetSurvivalAnalysis(l1_ratio=1.0)
        model.fit(features.values, outcome)

        assert_array_almost_equal(alphas_ref, model.alphas_)
        assert_array_almost_equal(deviance_ratio_ref, model.deviance_ratio_)

        actual_coef = pandas.DataFrame(model.coef_, index=features.columns, dtype=float)
        reference_coef = pandas.read_csv(BREAST_CANCER_COEFFICIENTS_FILE, index_col=0)
        # Relabel the reference columns 0..n-1 before the column-wise comparison.
        _, n_columns = reference_coef.shape
        reference_coef.columns = numpy.arange(n_columns)

        assert_columns_almost_equal(actual_coef, reference_coef, 5)
    def test_example_2_standardize(self):
        """Fit with ``l1_ratio=0.9`` behind a ``StandardScaler`` and compare against reference values.

        ``alphas_`` is compared with a hard-coded reference array; ``coef_`` is
        divided by the scaler's ``scale_`` before being compared with the
        contents of the "2-std" coefficient file.
        """
        from sklearn.pipeline import Pipeline
        from sklearn.preprocessing import StandardScaler

        alphas_ref = numpy.array(
            [0.263066005037211, 0.239695946189997, 0.218402018960187, 0.198999785536952, 0.18132119305624,
             0.165213118007272, 0.15053603994994, 0.137162833055499, 0.124977665003457, 0.113874993697428,
             0.103758653109983, 0.0945410203385227, 0.0861422566576188, 0.0784896159941638, 0.0715168148356887,
             0.0651634581142872, 0.0593745160934302, 0.0540998477267133, 0.0492937663601004, 0.0449146440159821,
             0.0409245508315483, 0.037288926528462, 0.0339762810682614, 0.0309579219007116, 0.0282077054426604,
             0.0257018106348284, 0.0234185326151886, 0.0213380947218357, 0.0194424771970012, 0.0177152611085322,
             0.0161414861369557, 0.0147075209963485, 0.0134009453666594, 0.0122104423148384, 0.0111257002729774,
             0.0101373237244409, 0.00923675182439653, 0.00841618424987191, 0.00766851363708906, 0.00698726402087929,
             0.00636653474297097, 0.00580094934331044, 0.00528560899173708, 0.00481605005665997, 0.00438820544321646,
             0.0039983693660421, 0.00364316525153066, 0.00331951649156886, 0.0030246197954287])

        data = pandas.read_csv(EXAMPLE_FILE)
        features, outcome = get_x_y(data, ["status", "time"], pos_label=1)

        # Keep references to both steps: their fitted attributes are read below.
        standardizer = StandardScaler()
        model = CoxnetSurvivalAnalysis(l1_ratio=0.9)
        steps = [("standardize", standardizer), ("coxnet", model)]
        Pipeline(steps).fit(features.values, outcome)

        assert_array_almost_equal(alphas_ref, model.alphas_)

        reference_coef = pandas.read_csv(EXAMPLE_COEF_FILE.format("2-std"))
        # Rescale coefficients: divide each row by the matching scale_ entry.
        rescaled = model.coef_ / standardizer.scale_[:, numpy.newaxis]
        actual_coef = pandas.DataFrame(rescaled,
                                       columns=reference_coef.columns,
                                       dtype=float)
        assert_columns_almost_equal(actual_coef, reference_coef, 5)
def test_pipeline_predict(func):
    """Check that a one-hot-encoding pipeline predicts like an estimator fitted on pre-encoded data.

    Both models are fitted on all rows except the first ten, and the method
    named by ``func`` is then called on those first ten rows. The ``x`` and
    ``y`` attributes of the returned objects must agree pairwise.

    Parameters
    ----------
    func : str
        Name of the prediction method, looked up with ``getattr`` on both the
        plain estimator and the pipeline.
    """
    X_str, y = load_breast_cancer()
    X_num = column.encode_categorical(X_str)

    # Shared settings so both estimators are guaranteed to be configured alike.
    coxnet_params = {
        "alpha_min_ratio": 0.0001,
        "l1_ratio": 1.0,
        "fit_baseline_model": True,
    }

    est = CoxnetSurvivalAnalysis(**coxnet_params)
    est.fit(X_num[10:], y[10:])

    pipe = make_pipeline(OneHotEncoder(),
                         CoxnetSurvivalAnalysis(**coxnet_params))
    pipe.fit(X_str[10:], y[10:])

    est_pred = getattr(est, func)(X_num[:10])
    pipe_pred = getattr(pipe, func)(X_str[:10])

    # zip() stops at the shorter input, so without this check a length
    # mismatch between the two results would go unnoticed.
    assert len(est_pred) == len(pipe_pred)

    for s1, s2 in zip(est_pred, pipe_pred):
        assert_array_almost_equal(s1.x, s2.x)
        assert_array_almost_equal(s1.y, s2.y)