def test_alpha_too_small(self):
    """Expect ``fit`` to raise ``ArithmeticError`` for a single small alpha.

    The breast cancer data is one-hot encoded, restricted to a fixed set of
    row positions, and passed to a ``CoxnetSurvivalAnalysis`` configured with
    one explicit alpha value and ``l1_ratio=1.0``.
    """
    features, outcome = load_breast_cancer()
    encoded = OneHotEncoder().fit_transform(features)

    # Positional indices of the rows used for fitting.
    keep = numpy.array([
        0, 1, 2, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
        19, 20, 21, 22, 23, 25, 26, 27, 28, 29, 30, 31, 33, 34, 36, 39,
        40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 51, 52, 53, 54, 56, 57,
        58, 60, 61, 62, 63, 64, 65, 66, 68, 70, 71, 72, 75, 76, 78, 79,
        80, 82, 84, 85, 86, 87, 88, 90, 91, 92, 93, 94, 95, 98, 99, 100,
        102, 103, 104, 105, 107, 108, 109, 110, 111, 113, 114, 115, 116, 117, 118, 119,
        120, 121, 124, 125, 126, 127, 128, 130, 131, 132, 133, 135, 136, 137, 138, 139,
        140, 143, 144, 145, 147, 148, 150, 151, 153, 154, 155, 156, 157, 158, 160, 161,
        164, 165, 166, 167, 168, 169, 170, 171, 172, 174, 175, 177, 178, 180, 181, 182,
        183, 184, 185, 186, 187, 188, 190, 191, 192, 193, 194, 195, 196, 197,
    ])
    subset_x = encoded.iloc[keep]
    subset_y = outcome[keep]

    model = CoxnetSurvivalAnalysis(alphas=[0.007295025406624247], l1_ratio=1.0)

    # The message is used as a regular expression by assertRaisesRegex.
    pattern = "Numerical error, because weights are too large. Consider increasing alpha."
    with self.assertRaisesRegex(ArithmeticError, pattern):
        model.fit(subset_x, subset_y)
def test_breast_cancer_2():
    """Check the structure of a randomly split ``SurvivalTree`` on breast cancer data.

    The ``er`` and ``grade`` columns are recoded to integers, a tree with
    ``splitter="random"`` and ``max_features="log2"`` is fitted, and its
    capacity, split features, node sample counts and thresholds are compared
    with fixed reference values.
    """
    X, y = load_breast_cancer()

    recoding = (
        ("er", {"negative": 0, "positive": 1}),
        ("grade", {
            "poorly differentiated": 0,
            "intermediate": 1,
            "well differentiated": 2,
            "unkown": 3,
        }),
    )
    for col_name, codes in recoding:
        X.loc[:, col_name] = X.loc[:, col_name].replace(codes)

    estimator = SurvivalTree(
        max_features="log2",
        splitter="random",
        max_depth=5,
        min_samples_split=30,
        min_samples_leaf=15,
        random_state=6,
    )
    estimator.fit(X.values, y)
    fitted = estimator.tree_

    undefined = TREE_UNDEFINED
    expected_feature = numpy.array([
        55, 14, undefined, 60, 23, undefined,
        undefined, 31, undefined, undefined, undefined,
    ])
    expected_n_samples = numpy.array([198, 153, 76, 77, 46, 16, 30, 31, 16, 15, 45])
    expected_threshold = numpy.array([
        11.3019, 9.0768, undefined, 8.6903, 6.83564, undefined,
        undefined, 10.66262, undefined, undefined, undefined,
    ])

    assert fitted.capacity == 11
    assert_array_equal(fitted.feature, expected_feature)
    assert_array_equal(fitted.n_node_samples, expected_n_samples)
    # Thresholds are compared to 5 decimals.
    assert_array_almost_equal(fitted.threshold, expected_threshold, 5)
def test_breast_example(self):
    """Compare a default-path lasso Coxnet fit on breast cancer data with references.

    Checks, in this order, the alpha sequence, the deviance ratios and the
    coefficient matrix (the latter against values read from
    ``BREAST_CANCER_COEFFICIENTS_FILE``).
    """
    features, outcome = load_breast_cancer()
    features = column.encode_categorical(features)

    model = CoxnetSurvivalAnalysis(l1_ratio=1.0)
    model.fit(features.values, outcome)

    reference_alphas = numpy.array([
        0.207764947265866, 0.189307681974955, 0.172490109262135,
        0.157166563357949, 0.143204319038428, 0.130482442022696,
        0.118890741498079, 0.108328815700004, 0.0987051822799425,
        0.0899364859290742, 0.0819467763944772, 0.0746668506343715,
        0.0680336534144775, 0.0619897311537413, 0.0564827342889011,
        0.051464963847614, 0.046892958302776, 0.0427271171295661,
        0.0389313578046448, 0.0354728032765984, 0.0323214972006479,
        0.0294501444711215, 0.0268338748043064, 0.0244500273239498,
        0.0222779542835891, 0.0202988422256499, 0.0184955490282766,
        0.0168524554284737, 0.0153553297355215, 0.0139912045628799,
        0.0127482645108893, 0.0116157438274312, 0.0105838331601337,
        0.00964359459245389, 0.00878688422772072, 0.00800628165059773,
        0.0072950256549955, 0.0066469556817389, 0.00605645845875073,
        0.00551841938157428, 0.00502817821311635, 0.00458148871890295,
        0.00417448188822764, 0.00380363242263169, 0.00346572820145532,
        0.00315784245998521, 0.00287730843921864, 0.00262169628767281,
        0.00238879201517371, 0.00217657831633235, 0.00198321709761059,
        0.00180703355663423, 0.00164650167585602, 0.00150023100492174,
        0.0013669546172544, 0.00124551813654232, 0.00113486973808373,
        0.00103405103838443, 0.000942188794098442, 0.000858487338411865,
        0.000782221689357606, 0.00071273127036839, 0.000649414188678556,
        0.000591722022016858, 0.00053915506843511, 0.00049125801812897,
        0.000447616009762226, 0.000407851037136367, 0.000371618675081733,
        0.000338605096211458, 0.000308524352698783, 0.00028111589953377,
        0.000256142337807075, 0.000233387358474159, 0.000212653868789829,
        0.000193762285185162, 0.000176548977800548, 0.000160864853202119,
        0.000146574063005757, 0.000133552827223371, 0.000121688362139862,
        0.000110877903434536, 0.000101027816085719, 9.20527833489927e-05,
        8.38750677843702e-05, 7.64238379317803e-05, 6.96345548028444e-05,
        6.34484128750348e-05,
    ])
    assert_array_almost_equal(reference_alphas, model.alphas_)

    reference_deviance_ratio = numpy.array([
        0, 0.00741462796207568, 0.0135178719105177,
        0.0183232499901932, 0.0221250029051101, 0.0251530137843965,
        0.0275599035016693, 0.0298664819929119, 0.033763232356598,
        0.0374249162331977, 0.0409637006907067, 0.0454486054162627,
        0.0551615080395675, 0.0651612844343542, 0.0736024993960834,
        0.0808820441173129, 0.0894426534710234, 0.0992239010000626,
        0.108910229105339, 0.121376204780063, 0.134004998770465,
        0.145079557491685, 0.156667501995989, 0.167543840680748,
        0.178622131991811, 0.189365153169168, 0.199027839424271,
        0.20909726215036, 0.218610320633419, 0.228024278642459,
        0.238171883969976, 0.248070501745195, 0.258480494697342,
        0.268971907277929, 0.280744803445048, 0.291329662029924,
        0.300942928439923, 0.309972153913063, 0.318315812887558,
        0.325822700491885, 0.332992506325249, 0.339665277042211,
        0.345876707002969, 0.351605625998246, 0.357206102668659,
        0.362484660673399, 0.367624391654207, 0.372275248793233,
        0.37674043994605, 0.380887801196039, 0.384795899779142,
        0.388569806807258, 0.392075171498793, 0.395375481018565,
        0.398377579969751, 0.400997300805061, 0.403375467852471,
        0.405431976972633, 0.407443593366561, 0.409668341757423,
        0.411628734365416, 0.413367576771339, 0.414896999887021,
        0.416268233594787, 0.417475290203319, 0.418554781508749,
        0.419526121036389, 0.420522904669104, 0.421455233639571,
        0.422296101083462, 0.423049677446171, 0.423716974236606,
        0.424302533927477, 0.424825925226932, 0.425286695396174,
        0.425693415010937, 0.426052733081791, 0.426369464812111,
        0.426652822940747, 0.42686317150694, 0.427072533094355,
        0.427264216646862, 0.427427314063872, 0.427573225735422,
        0.427700379783919, 0.427814235325525, 0.427912925916531,
        0.427998148400703,
    ])
    assert_array_almost_equal(reference_deviance_ratio, model.deviance_ratio_)

    # Label the fitted coefficients by feature name so they line up with the
    # reference table, whose columns are renumbered 0..n-1.
    fitted_coef = pandas.DataFrame(model.coef_, index=features.columns, dtype=float)
    reference_coef = pandas.read_csv(BREAST_CANCER_COEFFICIENTS_FILE, index_col=0)
    n_reference_columns = reference_coef.shape[1]
    reference_coef.columns = numpy.arange(n_reference_columns)

    assert_columns_almost_equal(fitted_coef, reference_coef, 5)
def breast_cancer():
    """Return the breast cancer data with ``er`` and ``grade`` recoded as integers.

    Returns
    -------
    tuple
        ``(X, y)`` as returned by ``load_breast_cancer``, with the two
        columns replaced in place by their integer codes.
    """
    data, target = load_breast_cancer()

    # Column name -> {original label: integer code}.
    recoding = {
        "er": {"negative": 0, "positive": 1},
        "grade": {
            "intermediate": 0,
            "poorly differentiated": 1,
            "unkown": 2,
            "well differentiated": 3,
        },
    }
    for col_name, codes in recoding.items():
        data.loc[:, col_name] = data.loc[:, col_name].replace(codes)

    return data, target
def test_pipeline_predict(breast_cancer, name, func):
    """Check a forest gives the same predictions inside and outside a pipeline.

    One estimator from ``FORESTS[name]`` is fitted directly on the data
    supplied by ``breast_cancer``; an identically configured one is fitted on
    the raw data behind a ``OneHotEncoder``. The method named ``func`` is then
    called on the first 10 rows of each and the results compared.
    """
    raw_features, _ = load_breast_cancer()
    prepared_features, outcome = breast_cancer

    forest_cls = FORESTS[name]
    forest_kwargs = {"n_estimators": 10, "random_state": 1}

    # Rows 10 onward are used for fitting, the first 10 for prediction.
    fit_rows = slice(10, None)
    predict_rows = slice(None, 10)

    direct = forest_cls(**forest_kwargs)
    direct.fit(prepared_features[fit_rows], outcome[fit_rows])

    piped = make_pipeline(OneHotEncoder(), forest_cls(**forest_kwargs))
    piped.fit(raw_features[fit_rows], outcome[fit_rows])

    direct_pred = getattr(direct, func)(prepared_features[predict_rows], return_array=True)
    piped_pred = getattr(piped, func)(raw_features[predict_rows], return_array=True)

    assert_array_almost_equal(direct_pred, piped_pred)
def test_fit_unpenalized():
    """Fit ``CoxPHSurvivalAnalysis`` with per-feature penalties, some of them zero.

    Rows whose ``grade`` is ``"unkown"`` are dropped and ``grade`` is rebuilt
    as a categorical with the three remaining levels before one-hot encoding.
    Five columns are moved to the front and given a penalty of 0.0; all other
    columns get a penalty of 1.0. The fitted coefficients are compared with
    fixed reference values.
    """
    X, y = load_breast_cancer()

    included = X["grade"] != "unkown"
    # Take an explicit copy: a column is assigned on X right below, and
    # assigning to a filtered selection of the original frame would raise
    # pandas' SettingWithCopyWarning.
    X = X.loc[included, :].copy()
    y = y[included.values]

    # Re-create the categorical so that "unkown" is no longer one of its levels.
    X["grade"] = pandas.Series(
        pandas.Categorical(
            X["grade"].astype(object),
            categories=["intermediate", "poorly differentiated", "well differentiated"],
        ),
        index=X.index,
        name="grade",
    )

    enc = OneHotEncoder()
    X = enc.fit_transform(X)

    cols_unpen = [
        'age',
        'size',
        'grade=poorly differentiated',
        'grade=well differentiated',
        'er=positive',
    ]
    # Put the unpenalized columns first so they match the leading zeros in `alphas`.
    X = pandas.concat((X.loc[:, cols_unpen], X.drop(cols_unpen, axis=1)), axis=1)

    alphas = numpy.ones(X.shape[1])
    alphas[:len(cols_unpen)] = 0.0

    cph = CoxPHSurvivalAnalysis(alpha=alphas)
    cph.fit(X, y)

    coef = numpy.array([
        -0.0228825990482334, 0.635554486750423, -0.242079636336473,
        -1.30197563647684, -2.27790151300312, 0.291950212930807,
        0.210861165049552, -0.612456645638769, -0.453414844486013,
        -0.1239424190253, 0.196855946938761, 1.08724198521351,
        -0.313645443818603, -0.660016141198812, 1.07104977404073,
        0.559632480471393, -0.47740746012516, -1.26199769642326,
        -1.40486191330444, -0.418517018253652, 0.284936091689505,
        -0.215531076378674, -0.200889269720281, 0.341231176941461,
        0.0307350667648337, -0.212527052910377, -0.3019678509188,
        0.54491723178866, -0.286914381308269, 0.370374100647823,
        -0.496258248067704, 0.624528657777646, 0.287884026214139,
        0.022095151910937, 0.910293732936019, -0.13076488639207,
        0.0857209529827562, -0.0922302696963889, 0.498136631416287,
        0.937133644376614, 0.395090607856869, -1.04727952099579,
        -0.54974694800345, 0.442372971174454, -0.745558450753062,
        -0.0920496108021893, 0.75549238586293, 0.562496351046743,
        0.259183349320614, 0.405816113039412, -0.0969485695700491,
        -0.507388915258978, -0.474246597197329, -0.209335517183595,
        0.187390427612498, -0.0522568530719332, 0.0806559868641646,
        -0.0397654339013217, -0.269582356665396, 0.791793553908743,
        0.344208857844796, -0.180165785909583, -0.7927695046551,
        0.0311635012097026, -0.579429950080662, -0.264770995160963,
        0.869512689697827, 0.765479119494175, -0.173588059680979,
        -0.199781736503338, -0.58712767650975, -0.457389854855,
        0.3891865514653, 0.707309743580534, -0.121997864690072,
        0.0447174402649954, 0.0319336975869795, 0.0117988435665652,
        -0.593691059339064, -0.838107176656365, -0.247955128152877,
    ])
    assert_array_almost_equal(cph.coef_, coef)
def test_pipeline_predict(func):
    """Check a random survival forest predicts the same with and without a pipeline.

    One forest is fitted on the output of ``encode_categorical``; an
    identically configured one is fitted on the raw data behind a
    ``OneHotEncoder``. The method named ``func`` is called on the first 10
    rows of each and the results compared.
    """
    raw_features, outcome = load_breast_cancer()
    encoded_features = encode_categorical(raw_features)

    forest_kwargs = {"n_estimators": 10, "random_state": 1}

    # Rows 10 onward are used for fitting, the first 10 for prediction.
    fit_rows = slice(10, None)
    predict_rows = slice(None, 10)

    direct = RandomSurvivalForest(**forest_kwargs)
    direct.fit(encoded_features[fit_rows], outcome[fit_rows])

    piped = make_pipeline(OneHotEncoder(), RandomSurvivalForest(**forest_kwargs))
    piped.fit(raw_features[fit_rows], outcome[fit_rows])

    direct_pred = getattr(direct, func)(encoded_features[predict_rows])
    piped_pred = getattr(piped, func)(raw_features[predict_rows])

    assert_array_almost_equal(direct_pred, piped_pred)
def test_pipeline_predict(breast_cancer, func):
    """Check a survival tree predicts the same with and without a pipeline.

    Only the ``er`` and ``grade`` columns are used. One tree is fitted on the
    values supplied by ``breast_cancer``; another is fitted on the raw values
    behind an ``OrdinalEncoder``. The method named ``func`` is called on the
    first 10 rows of each and the results compared.
    """
    selected = ["er", "grade"]

    # Rows 10 onward are used for fitting, the first 10 for prediction.
    fit_rows = slice(10, None)
    predict_rows = slice(None, 10)

    prepared, outcome = breast_cancer
    prepared_values = prepared.loc[:, selected].values
    direct = SurvivalTree().fit(prepared_values[fit_rows], outcome[fit_rows])

    raw, _ = load_breast_cancer()
    raw_values = raw.loc[:, selected].values
    piped = make_pipeline(OrdinalEncoder(), SurvivalTree())
    piped.fit(raw_values[fit_rows], outcome[fit_rows])

    direct_pred = getattr(direct, func)(prepared_values[predict_rows], return_array=True)
    piped_pred = getattr(piped, func)(raw_values[predict_rows], return_array=True)

    assert_array_almost_equal(direct_pred, piped_pred)
def test_breast_cancer_1():
    """Check the structure of a ``SurvivalTree`` limited to 10 leaf nodes.

    The ``er`` and ``grade`` columns are recoded to integers, a tree with
    fractional ``min_samples_split``/``min_samples_leaf`` is fitted, and its
    capacity, split features, node sample counts and thresholds are compared
    with fixed reference values.
    """
    X, y = load_breast_cancer()

    recoding = (
        ("er", {"negative": 0, "positive": 1}),
        ("grade", {
            "poorly differentiated": 0,
            "intermediate": 1,
            "well differentiated": 2,
            "unkown": 3,
        }),
    )
    for col_name, codes in recoding:
        X.loc[:, col_name] = X.loc[:, col_name].replace(codes)

    estimator = SurvivalTree(
        max_features="auto",
        max_depth=5,
        max_leaf_nodes=10,
        min_samples_split=0.06,
        min_samples_leaf=0.03,
        random_state=6,
    )
    estimator.fit(X.values, y)
    fitted = estimator.tree_

    undefined = TREE_UNDEFINED
    expected_feature = numpy.array([
        61, 29, 5, undefined, 40, 65, undefined, 10, 12, 4,
        undefined, undefined, undefined, undefined, undefined, undefined,
        10, undefined, undefined,
    ])
    expected_n_samples = numpy.array([
        198, 170, 28, 8, 20, 164, 6, 59, 105, 74,
        31, 9, 65, 13, 7, 39, 20, 7, 13,
    ])
    expected_threshold = numpy.array([
        10.97448, 11.10251, 11.34859, undefined, 10.53533, 8.08848, undefined,
        10.86403, 10.14138, 11.49171,
        undefined, undefined, undefined, undefined, undefined, undefined,
        11.01874, undefined, undefined,
    ])

    assert fitted.capacity == 19
    assert_array_equal(fitted.feature, expected_feature)
    assert_array_equal(fitted.n_node_samples, expected_n_samples)
    # Thresholds are compared to 5 decimals.
    assert_array_almost_equal(fitted.threshold, expected_threshold, 5)
def test_pipeline_predict(func):
    """Check Coxnet returns the same step functions with and without a pipeline.

    One model is fitted on the output of ``column.encode_categorical``; an
    identically configured one is fitted on the raw data behind a
    ``OneHotEncoder``. The method named ``func`` is called on the first 10
    rows of each, and the ``x`` and ``y`` attributes of the paired results
    are compared.
    """
    raw_features, outcome = load_breast_cancer()
    encoded_features = column.encode_categorical(raw_features)

    coxnet_kwargs = {
        "alpha_min_ratio": 0.0001,
        "l1_ratio": 1.0,
        "fit_baseline_model": True,
    }

    # Rows 10 onward are used for fitting, the first 10 for prediction.
    fit_rows = slice(10, None)
    predict_rows = slice(None, 10)

    direct = CoxnetSurvivalAnalysis(**coxnet_kwargs)
    direct.fit(encoded_features[fit_rows], outcome[fit_rows])

    piped = make_pipeline(OneHotEncoder(), CoxnetSurvivalAnalysis(**coxnet_kwargs))
    piped.fit(raw_features[fit_rows], outcome[fit_rows])

    direct_pred = getattr(direct, func)(encoded_features[predict_rows])
    piped_pred = getattr(piped, func)(raw_features[predict_rows])

    for from_direct, from_pipe in zip(direct_pred, piped_pred):
        assert_array_almost_equal(from_direct.x, from_pipe.x)
        assert_array_almost_equal(from_direct.y, from_pipe.y)
def breast_cancer():
    """Return the breast cancer data after ``encode_categorical``.

    Returns
    -------
    tuple
        ``(X_num, y)``: the encoded features and the unmodified outcome from
        ``load_breast_cancer``.
    """
    raw_features, outcome = load_breast_cancer()
    return encode_categorical(raw_features), outcome
def test_load_breast_cancer():
    """Check the shapes and outcome dtype of ``sdata.load_breast_cancer``."""
    features, outcome = sdata.load_breast_cancer()

    n_rows, n_columns = 198, 80
    assert features.shape == (n_rows, n_columns)
    assert outcome.shape == (n_rows,)

    # NOTE(review): the meaning of the trailing 51 depends on
    # assert_structured_array_dtype, which is defined elsewhere; confirm there.
    assert_structured_array_dtype(outcome, 'e.tdm', 't.tdm', 51)