def ARDRegression_on_fold(feature_sets, train, test, y, y_all, X, dim, dimsum, learn_options):
    """Fit an ARD regressor on one fold's training rows and predict its test rows.

    Only ``train``, ``test``, ``y`` and ``X`` are read here; the remaining
    parameters are accepted but not used by this function.

    Returns:
        A ``(y_pred, clf)`` tuple: the predictions for ``X[test]`` with a
        trailing axis appended, and the fitted estimator.
    """
    model = ARDRegression()
    # The target is taken from the first column of the training rows of y.
    train_targets = y[train][:, 0]
    model.fit(X[train], train_targets)
    fold_predictions = model.predict(X[test])
    return fold_predictions[:, None], model
def predict_features(self, df_features, df_target, idx=0, **kwargs):
    """Score features by the magnitude of their ARD regression coefficients.

    Args:
        df_features: DataFrame of candidate features (one column per feature).
        df_target: DataFrame/Series holding the regression target.
        idx: Accepted but not used by this implementation.
        **kwargs: Accepted but not used by this implementation.

    Returns:
        Array with the absolute value of the fitted coefficient of each feature.
    """
    # DataFrame.as_matrix() was removed in pandas 1.0; .values is the
    # equivalent accessor and works on old and new pandas alike.
    X = df_features.values
    # Flatten the target so a single-column frame is passed to fit() as 1-D.
    y = df_target.values.ravel()
    clf = ARDRegression(compute_score=True)
    clf.fit(X, y)
    return np.abs(clf.coef_)
def getTrainedClassifier(ticker, sd, ed, save=True):
    """Train an ARD regressor that predicts a shifted adjusted close price.

    Downloads the ``WIKI/<ticker>`` dataset from Quandl between ``sd`` and
    ``ed``, uses the five adjusted OHLCV columns as features and the adjusted
    close shifted by the module-level ``shift`` as the target.

    Args:
        ticker: Ticker symbol appended to ``'WIKI/'``.
        sd: Start date forwarded to ``quandl.get``.
        ed: End date forwarded to ``quandl.get``.
        save: Accepted for interface compatibility; not used here.

    Returns:
        The fitted ``ARDRegression`` estimator.
    """
    df = quandl.get('WIKI/' + ticker, start_date=sd, end_date=ed)
    feature_columns = [
        'Adj. Open', 'Adj. High', 'Adj. Low', 'Adj. Close', 'Adj. Volume'
    ]
    # Work on an independent copy so adding the target column below never
    # writes into a view of the downloaded frame.
    df = df[feature_columns].copy()
    df['future'] = df['Adj. Close'].shift(-shift)
    # Rows left without a target by the shift (and any row with a missing
    # value) are dropped.
    df.dropna(inplace=True)
    # The axis must be given by keyword: the positional form
    # ``drop(labels, 1)`` is rejected by pandas 2.0.
    X_train = np.array(df.drop(columns=['future']))
    y_train = np.array(df['future'])
    clf = ARDRegression()
    clf.fit(X_train, y_train)
    return clf
def train(self):
    """Fit the Bayesian linear model on the stored observations.

    Hyperparameters are estimated with ``ARDRegression``; the posterior over
    the weights is then recomputed explicitly from them.

    Returns:
        ``(m, S, sigma, alpha)``: posterior mean and covariance of the
        weights, the noise standard deviation, and the weight precisions
        taken from the regressor's ``lambda_``.
    """
    if self.normalize_output:
        normalized = zero_mean_unit_var_normalization(self.y)
        (self.y, self.norm_mean, self.norm_sd) = normalized

    # Optionally prepend a constant column so the model has an intercept.
    design = sm.add_constant(self.X) if self.intercept else self.X

    ard = ARDRegression()
    ard.fit(design, self.y)

    # Noise standard deviation from the estimated noise precision.
    self.sigma = np.sqrt(1. / ard.alpha_)
    # Precisions of the weights as estimated by the regressor.
    self.alpha = ard.lambda_

    n_weights = design.shape[1]
    precision = np.dot(design.T, design) / self.sigma**2. + self.alpha * np.eye(n_weights)
    # Small diagonal jitter before the Cholesky factorization.
    precision = precision + np.eye(precision.shape[0]) * 1e-5
    factor = scipy.linalg.cho_factor(precision)

    # The posterior mean of w
    self.m = scipy.linalg.cho_solve(factor, np.dot(design.T, self.y) / self.sigma**2)
    # The posterior covariance of w
    self.S = scipy.linalg.cho_solve(factor, np.eye(n_weights))

    return self.m, self.S, self.sigma, self.alpha
def test_check_is_fitted():
    """Exercise ``check_is_fitted`` with explicit attribute names."""
    # A class or a plain string (rather than an estimator instance) is rejected.
    assert_raises(ValueError, check_is_fitted, ARDRegression, "coef_")
    assert_raises(TypeError, check_is_fitted, "SVR", "support_")

    ard_model = ARDRegression()
    svr_model = SVR()

    try:
        assert_raises(NotFittedError, check_is_fitted, ard_model, "coef_")
        assert_raises(NotFittedError, check_is_fitted, svr_model, "support_")
    except ValueError:
        assert False, "check_is_fitted failed with ValueError"

    # NotFittedError is a subclass of both ValueError and AttributeError,
    # so it can be caught as either; the custom message is formatted with
    # the estimator's class name.
    try:
        check_is_fitted(ard_model, "coef_", "Random message %(name)s, %(name)s")
    except ValueError as err:
        assert_equal(str(err), "Random message ARDRegression, ARDRegression")

    try:
        check_is_fitted(svr_model, "support_", "Another message %(name)s, %(name)s")
    except AttributeError as err:
        assert_equal(str(err), "Another message SVR, SVR")

    # Once fitted, the check passes and returns None.
    ard_model.fit(*make_blobs())
    svr_model.fit(*make_blobs())

    for fitted, attribute in ((ard_model, "coef_"), (svr_model, "support_")):
        assert_equal(None, check_is_fitted(fitted, attribute))
class ARDRegressionPrim(primitive):
    """Primitive that wraps scikit-learn's ``ARDRegression`` as a regressor step."""

    def __init__(self, random_state=0):
        """Create the primitive with a default-configured ``ARDRegression`` model.

        ``random_state`` is stored on the instance; it is not passed to the model.
        """
        super(ARDRegressionPrim, self).__init__(name='ARDRegression')
        self.type = 'Regressor'
        self.accept_type = 'c_r'
        self.hyperparams = []
        self.hyperparams_run = {'default': True}
        self.description = "Bayesian ARD regression. Fit the weights of a regression model, using an ARD prior. The weights of the regression model are assumed to be in Gaussian distributions. Also estimate the parameters lambda (precisions of the distributions of the weights) and alpha (precision of the distribution of the noise). The estimation is done by an iterative procedures (Evidence Maximization)"
        self.random_state = random_state
        self.model = ARDRegression()

    def can_accept(self, data):
        """Delegate the acceptance check to ``can_accept_c`` for a 'Regression' task."""
        return self.can_accept_c(data, 'Regression')

    def is_needed(self, data):
        """Always report this primitive as needed."""
        return True

    def fit(self, data):
        """Fit the wrapped model on the 'X' and 'Y' entries of the handled data."""
        prepared = handle_data(data)
        self.model.fit(prepared['X'], prepared['Y'])

    def produce(self, data):
        """Predict on 'X' and return the result keyed by ``0``.

        The handled data gains a 'predictions' entry, and its 'X' entry is
        replaced by a one-column DataFrame of those predictions named
        ``<self.name>Pred``.
        """
        result = handle_data(data)
        result['predictions'] = self.model.predict(result['X'])
        prediction_column = self.name + "Pred"
        result['X'] = pd.DataFrame(result['predictions'], columns=[prediction_column])
        return {0: result}
def test_return_std():
    """Check the ``return_std`` option of both Bayesian regressors.

    For decreasing noise levels, the predicted standard deviation must match
    the noise multiplier with increasing precision.
    """
    n_features = 5
    n_train = 50
    n_test = 10

    true_weights = np.array([1.0, 0.0, 1.0, -1.0, 0.0])
    true_bias = 1.0

    def f(X):
        return np.dot(X, true_weights) + true_bias

    def f_noise(X, noise_mult):
        return f(X) + np.random.randn(X.shape[0]) * noise_mult

    X = np.random.random((n_train, n_features))
    X_test = np.random.random((n_test, n_features))

    # The loop index doubles as the number of decimals required.
    for decimal, noise_mult in enumerate([1, 0.1, 0.01]):
        y = f_noise(X, noise_mult)

        for estimator_cls in (BayesianRidge, ARDRegression):
            estimator = estimator_cls()
            estimator.fit(X, y)
            y_mean, y_std = estimator.predict(X_test, return_std=True)
            assert_array_almost_equal(y_std, noise_mult, decimal=decimal)
def test_check_is_fitted():
    """Exercise ``check_is_fitted`` without explicit attribute names."""
    # A class or a plain string (rather than an estimator instance) must
    # raise TypeError.
    for not_an_estimator in (ARDRegression, "SVR"):
        with pytest.raises(TypeError):
            check_is_fitted(not_an_estimator)

    ard_model = ARDRegression()
    svr_model = SVR()

    try:
        for unfitted in (ard_model, svr_model):
            with pytest.raises(NotFittedError):
                check_is_fitted(unfitted)
    except ValueError:
        assert False, "check_is_fitted failed with ValueError"

    # NotFittedError is a subclass of both ValueError and AttributeError,
    # so it can be caught as either.
    try:
        check_is_fitted(ard_model, msg="Random message %(name)s, %(name)s")
    except ValueError as err:
        assert str(err) == "Random message ARDRegression, ARDRegression"

    try:
        check_is_fitted(svr_model, msg="Another message %(name)s, %(name)s")
    except AttributeError as err:
        assert str(err) == "Another message SVR, SVR"

    # Once fitted, the check passes and returns None.
    ard_model.fit(*make_blobs())
    svr_model.fit(*make_blobs())

    for fitted in (ard_model, svr_model):
        assert check_is_fitted(fitted) is None
def predict_features(self, df_features, df_target, idx=0, **kwargs):
    """Return the absolute ARD regression coefficient of every feature.

    ``idx`` and ``**kwargs`` are accepted but not used by this implementation.
    """
    feature_matrix = df_features.values
    target_values = df_target.values

    ard = ARDRegression(compute_score=True)
    # The target is flattened to 1-D before fitting.
    ard.fit(feature_matrix, target_values.ravel())

    coefficient_magnitudes = np.abs(ard.coef_)
    return coefficient_magnitudes
def ard_regression(train, test):
    """Fit ARD regression on ``train`` and print error metrics for ``test``.

    Both frames must contain a ``'views'`` column, which is used as the
    target; every other column is used as a feature.

    Args:
        train: DataFrame with the training rows.
        test: DataFrame with the evaluation rows.

    Returns:
        None. The mean squared error and the median absolute error on
        ``test`` are printed.
    """
    train = train.copy()
    test = test.copy()

    X = train.to_numpy()
    X_train = np.delete(X, [train.columns.get_loc('views')], axis=1)
    y_train = train['views']

    X = test.to_numpy()
    X_test = np.delete(X, [test.columns.get_loc('views')], axis=1)
    y_test = test['views']

    reg = ARDRegression(compute_score=True)
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)

    # The mean squared error. ``squared=True`` is the default, and the
    # keyword is deprecated/removed in recent scikit-learn, so it is omitted.
    print('Mean squared error: %.2f' % mean_squared_error(y_test, y_pred))
    # The median absolute error: 0 is perfect prediction.
    print('median absolute error: %.2f' % median_absolute_error(y_test, y_pred))
    return None
def test_check_is_fitted():
    """Exercise ``check_is_fitted`` with explicit attribute names."""
    # A class or a plain string (rather than an estimator instance) is rejected.
    assert_raises(ValueError, check_is_fitted, ARDRegression, "coef_")
    assert_raises(TypeError, check_is_fitted, "SVR", "support_")

    ard_model = ARDRegression()
    svr_model = SVR(gamma='scale')

    try:
        assert_raises(NotFittedError, check_is_fitted, ard_model, "coef_")
        assert_raises(NotFittedError, check_is_fitted, svr_model, "support_")
    except ValueError:
        assert False, "check_is_fitted failed with ValueError"

    # NotFittedError is a subclass of both ValueError and AttributeError,
    # so it can be caught as either; the custom message is formatted with
    # the estimator's class name.
    try:
        check_is_fitted(ard_model, "coef_", "Random message %(name)s, %(name)s")
    except ValueError as err:
        assert_equal(str(err), "Random message ARDRegression, ARDRegression")

    try:
        check_is_fitted(svr_model, "support_", "Another message %(name)s, %(name)s")
    except AttributeError as err:
        assert_equal(str(err), "Another message SVR, SVR")

    # Once fitted, the check passes and returns None.
    ard_model.fit(*make_blobs())
    svr_model.fit(*make_blobs())

    for fitted, attribute in ((ard_model, "coef_"), (svr_model, "support_")):
        assert_equal(None, check_is_fitted(fitted, attribute))
def bayeslr_python(fname, threshold):
    """Run ARD (Bayesian) linear regression for every target column of a workbook.

    Data is exchanged with MATLAB through an Excel file: the first sheet of
    ``fname`` holds the design matrix X and the second sheet holds the
    targets Y, one target per column, both without header or index. The
    workbook at ``fname`` is then rewritten with a single sheet named ``'0'``
    that has one row per target column: the fitted coefficients followed by
    the intercept. Target columns whose sum is zero are not fitted and keep
    an all-zero row.

    Args:
        fname: Path of the Excel workbook to read and then rewrite.
        threshold: Value used as the regressor's ``threshold_lambda``.
    """
    # ``sheetname`` was renamed to ``sheet_name`` and ``read_excel`` has no
    # ``index`` parameter in current pandas, so only supported keywords are used.
    X = pd.read_excel(fname, sheet_name=0, header=None)
    Y = pd.read_excel(fname, sheet_name=1, header=None)

    n_features = X.shape[1]
    n_targets = Y.shape[1]

    # True for target columns whose sum is non-zero.
    has_signal = (~(Y.sum(axis=0) == np.zeros(n_targets))).values

    X_blr = np.zeros((n_targets, n_features + 1))
    for i in range(n_targets):
        if not has_signal[i]:
            continue
        # ``.ix`` was removed in pandas 1.0; columns are selected by position.
        y = Y.iloc[:, i]
        clf = ARDRegression(threshold_lambda=threshold)
        clf.fit(X, y)
        X_blr[i, :] = np.hstack((clf.coef_, clf.intercept_))

    result = pd.DataFrame(X_blr)
    with pd.ExcelWriter(fname) as writer:
        result.to_excel(writer, sheet_name=str(0), index=False, header=False)
def main_bak():
    """Trial run comparing ``iterative_ard`` with scikit-learn's ``ARDRegression``.

    Generates a random linear problem, fits it with both methods, and prints
    the wall-clock time of each fit followed by the estimated and the true
    weights.
    """
    # trial
    noiseVar = 0.01
    n = 500
    d = 10
    x = np.random.normal(0, 1, size=d * n).reshape((n, d))
    w = np.random.normal(10, 1, size=d)
    # NOTE(review): np.random.normal takes a standard deviation as its second
    # argument, while the same value is passed below as ``noiseVar`` --
    # confirm which one is intended.
    y = np.dot(x, w) + np.random.normal(0, noiseVar, size=n)

    # The single-argument print(...) form gives the same output on
    # Python 2 and Python 3; the bare print statement only parses on Python 2.
    t1 = time.time()
    print("Running iterative ard")
    (witer, gamma) = iterative_ard(Xtrain=x, ytrain=y, noiseVar=noiseVar)
    t2 = time.time()
    print("Running scikit ARD")
    ard = ARDRegression(compute_score=True)
    ard.fit(x, y)
    t3 = time.time()

    print("Time taken ")
    print("Iterative:" + str(t2 - t1))
    print("scikit ard:" + str(t3 - t2))
    print("ALL W :")
    print(witer)
    print(ard.coef_)
    print(w)
def test_ard_regression_predict_normalize_true():
    """Predicting with ``normalize=True`` and ``return_std=True`` must not raise.

    Non-regression test for:
    https://github.com/scikit-learn/scikit-learn/issues/18605
    """
    X_train = [[0, 0], [1, 1], [2, 2]]
    y_train = [0, 1, 2]

    ard = ARDRegression(normalize=True)
    ard.fit(X_train, y_train)

    # Only the absence of an exception is checked; the result is discarded.
    ard.predict([[1, 1]], return_std=True)
def autorelevancedetermination(self):
    """Fit ARD regression on the training split and predict the test split.

    Prints the fraction of test targets that are exactly equal to their
    prediction and returns the predictions.
    """
    ard = ARDRegression(compute_score=True)
    ard.fit(self.x_train, self.y_train)

    predictions = ard.predict(self.x_test)
    exact_match_rate = np.mean(self.y_test == predictions)
    print(exact_match_rate)
    return predictions
def test_toy_ard_object():
    """ARD regression should approximately learn the identity on a toy problem."""
    train_inputs = np.array([[1], [2], [3]])
    train_targets = np.array([1, 2, 3])

    ard = ARDRegression(compute_score=True)
    ard.fit(train_inputs, train_targets)

    # Predictions must match the inputs to two decimals.
    query_points = [[1], [3], [4]]
    predicted = ard.predict(query_points)
    assert_array_almost_equal(predicted, [1, 3, 4], 2)
def test_ard_accuracy_on_easy_problem(seed, n_samples, n_features):
    """ARD must converge accurately on an easy problem (Github issue #14055).

    The target is exactly the second feature, so its coefficient should be 1.
    ``n_samples`` and ``n_features`` are accepted but the data shape used
    here is fixed at (250, 3).
    """
    rng = np.random.RandomState(seed=seed)
    X = rng.normal(size=(250, 3))
    y = X[:, 1]

    ard = ARDRegression()
    ard.fit(X, y)

    coefficient_error = np.abs(1 - ard.coef_[1])
    assert coefficient_error < 1e-10
def make_linear(X, y):
    """Fit an ARD regressor and an ordinary least-squares model on the same data.

    Returns:
        ``(ard, ols)``: the fitted ``ARDRegression`` and ``LinearRegression``.
    """
    # The shape is read (so X must have at least two dimensions) but the
    # values are not used afterwards.
    data_shape = np.shape(X)
    n_samples = data_shape[0]
    n_features = data_shape[1]

    ard_model = ARDRegression(compute_score=True)
    ard_model.fit(X, y)

    ols_model = LinearRegression()
    ols_model.fit(X, y)

    return ard_model, ols_model
def runARDRegressor(self):
    """Fit an ARD regressor on the training split and display its results.

    The model is fitted on ``m_X_train``/``m_y_train``, scored on the test
    split, and the predictions are handed to the instance's display helpers.
    """
    regressor = ARDRegression(fit_intercept=True, normalize=True)
    print("runARDRegressor\n")
    regressor.fit(self.m_X_train, self.m_y_train)

    test_predictions = regressor.predict(self.m_X_test)
    test_score = regressor.score(self.m_X_test, self.m_y_test)
    train_predictions = regressor.predict(self.m_X_train)

    self.displayPredictPlot(test_predictions)
    self.displayResidualPlot(test_predictions, train_predictions)
    self.dispalyModelResult(regressor, test_predictions, test_score)
def test_update_of_sigma_in_ard():
    """``sigma_`` must be updated after the last ARD iteration (issue #10128)."""
    features = np.array([[1, 0], [0, 0]])
    targets = np.array([0, 0])

    ard = ARDRegression(n_iter=1)
    ard.fit(features, targets)

    # With the inputs above, ARDRegression prunes both of the two coefficients
    # in the first iteration. Hence, the expected shape of `sigma_` is (0, 0).
    expected_shape = (0, 0)
    assert ard.sigma_.shape == expected_shape

    # Ensure that no error is thrown at prediction stage
    ard.predict(features, return_std=True)
def test_check_is_fitted_with_attributes(wrap):
    """``check_is_fitted`` with explicit attributes, wrapped by ``wrap``.

    ``wrap`` is applied to the attribute list before it is passed on.
    """
    estimator = ARDRegression()

    # Unfitted estimator: the attribute is missing.
    with pytest.raises(NotFittedError, match="is not fitted yet"):
        check_is_fitted(estimator, wrap(["coef_"]))

    estimator.fit(*make_blobs())

    # Does not raise
    check_is_fitted(estimator, wrap(["coef_"]))

    # Raises when using attribute that is not defined
    with pytest.raises(NotFittedError, match="is not fitted yet"):
        check_is_fitted(estimator, wrap(["coef_bad_"]))
def fit_model_16(self, toWrite=False):
    """Fit an ARD regressor on each cross-validation split and print its score.

    The same estimator is refitted on every ``(X_train, X_test, Y_train,
    Y_test)`` tuple in ``self.cv_data`` and its ``logloss`` on the held-out
    part is printed.

    Args:
        toWrite: When true, pickle the estimator (as left by the last fit)
            to ``model16/model.pkl``.
    """
    model = ARDRegression()
    for X_train, X_test, Y_train, Y_test in self.cv_data:
        model.fit(X_train, Y_train)
        pred = model.predict(X_test)
        print("Model 16 score %f" % (logloss(Y_test, pred),))

    if toWrite:
        # pickle writes bytes, so the file must be opened in binary mode
        # (text mode fails on Python 3); the with-block also guarantees the
        # handle is closed if dumping raises.
        with open('model16/model.pkl', 'wb') as f2:
            pickle.dump(model, f2)
class _ARDRegressionImpl:
    """Thin wrapper that forwards ``fit``/``predict`` to an ``Op`` instance."""

    def __init__(self, **hyperparams):
        """Store the hyperparameters and build the wrapped model from them."""
        self._hyperparams = hyperparams
        self._wrapped_model = Op(**self._hyperparams)

    def fit(self, X, y=None):
        """Fit the wrapped model, passing ``y`` only when it is given; return self."""
        fit_args = (X,) if y is None else (X, y)
        self._wrapped_model.fit(*fit_args)
        return self

    def predict(self, X):
        """Return the wrapped model's predictions for ``X``."""
        predictions = self._wrapped_model.predict(X)
        return predictions
def _ard(*, train, test, x_predict=None, metrics, n_iter=300, tol=0.001, alpha_1=1e-06, alpha_2=1e-06,
         lambda_1=1e-06, lambda_2=1e-06, compute_score=False, threshold_lambda=10000.0, fit_intercept=True,
         normalize=False, copy_X=True, verbose=False):
    """Fit an ``ARDRegression`` model, score it, and optionally predict new data.

    For more info visit :
    https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.ARDRegression.html#sklearn.linear_model.ARDRegression

    Args:
        train: Pair ``(X, y)`` used for fitting.
        test: Pair ``(X, y)`` used for scoring.
        x_predict: Optional extra inputs to predict after scoring.
        metrics: One of ``'mse'``, ``'rmse'`` or ``'mae'``.
        n_iter, tol, alpha_1, alpha_2, lambda_1, lambda_2, compute_score,
        threshold_lambda, fit_intercept, normalize, copy_X, verbose:
            Forwarded unchanged to ``ARDRegression``.

    Returns:
        ``(model_name, accuracy, y_predict)`` where ``y_predict`` is ``None``
        when ``x_predict`` is not given.

    Raises:
        ValueError: If ``metrics`` is not one of the supported names.
    """
    # Validate before fitting: previously an unknown metric only surfaced
    # after the fit, as an UnboundLocalError on ``accuracy``.
    if metrics not in ('mse', 'rmse', 'mae'):
        raise ValueError(
            "Unknown metrics %r; expected 'mse', 'rmse' or 'mae'" % (metrics,))

    model = ARDRegression(n_iter=n_iter, tol=tol, alpha_1=alpha_1, alpha_2=alpha_2,
                          lambda_1=lambda_1, lambda_2=lambda_2, compute_score=compute_score,
                          threshold_lambda=threshold_lambda, fit_intercept=fit_intercept,
                          normalize=normalize, copy_X=copy_X, verbose=verbose)
    model.fit(train[0], train[1])
    model_name = 'ARDRegression'
    y_hat = model.predict(test[0])

    if metrics == 'mse':
        accuracy = _mse(test[1], y_hat)
    elif metrics == 'rmse':
        accuracy = _rmse(test[1], y_hat)
    else:
        accuracy = _mae(test[1], y_hat)

    if x_predict is None:
        return (model_name, accuracy, None)

    y_predict = model.predict(x_predict)
    return (model_name, accuracy, y_predict)
def test_ard_accuracy_on_easy_problem():
    """ARD must converge with reasonable accuracy on an easy problem.

    See Github issue #14055. The target is exactly the second feature, so
    its coefficient should be close to 1.
    """
    # This particular seed seems to converge poorly in the failure-case
    # (scipy==1.3.0, sklearn==0.21.2)
    seed = 45
    rng = np.random.RandomState(seed=seed)
    X = rng.normal(size=(250, 3))
    y = X[:, 1]

    ard = ARDRegression(n_iter=600)
    ard.fit(X, y)

    coefficient_error = np.abs(1 - ard.coef_[1])
    # Expect an accuracy of better than 1E-4 in most cases -
    # Failure-case produces 0.16!
    assert coefficient_error < 0.01
def predict(self):
    """Train an ARD regressor and return its accuracy on the test split.

    The scikit-learn (https://scikit-learn.org) ``ARDRegression`` estimator
    is fitted on ``X_train``/``y_train``; its predictions for ``X_test`` are
    compared with ``y_test`` via ``OneHotPredictor.get_accuracy``. The result
    is stored in ``self.acc`` and returned.
    """
    regressor = ARDRegression(threshold_lambda=1e5)
    regressor.fit(self.X_train, self.y_train)

    predicted = list(regressor.predict(self.X_test))
    self.acc = OneHotPredictor.get_accuracy(predicted, self.y_test)
    return self.acc
def _fit_ardr(X: np.ndarray, y: np.ndarray, threshold_lambda: float = None, line_scan: bool = False, fit_intercept: bool = False, **kwargs) -> Dict[str, Any]:
    """
    Solves the linear problem `Xa=y` with automatic relevance determination
    regression (ARDR) as implemented in scikit-learn and returns the
    solution `a` in a dictionary under the key `parameters`.

    Parameters
    -----------
    X
        fit matrix
    y
        target array
    threshold_lambda
        threshold lambda parameter forwarded to sklearn; 1e4 is used
        when it is not given
    line_scan
        whether or not to perform line-scan in order to find optimal
        threshold-lambda; cannot be combined with an explicit
        threshold_lambda
    fit_intercept
        center data or not, forwarded to sklearn
    """
    if line_scan and threshold_lambda is not None:
        raise ValueError('Specify threshold_lambda or set line_scan=True, not both')

    # The line scan chooses its own threshold, so it is dispatched first.
    if line_scan:
        return _fit_ardr_line_scan(X, y, fit_intercept=fit_intercept, **kwargs)

    if threshold_lambda is None:
        threshold_lambda = 1e4

    model = ARDRegression(threshold_lambda=threshold_lambda,
                          fit_intercept=fit_intercept, **kwargs)
    model.fit(X, y)
    return {'parameters': model.coef_}
def test_check_is_fitted():
    """Exercise ``check_is_fitted``, including its deprecated arguments."""
    # A class or a plain string (rather than an estimator instance) must
    # raise TypeError.
    for not_an_estimator in (ARDRegression, "SVR"):
        assert_raises(TypeError, check_is_fitted, not_an_estimator)

    ard_model = ARDRegression()
    svr_model = SVR()

    try:
        for unfitted in (ard_model, svr_model):
            assert_raises(NotFittedError, check_is_fitted, unfitted)
    except ValueError:
        assert False, "check_is_fitted failed with ValueError"

    # NotFittedError is a subclass of both ValueError and AttributeError,
    # so it can be caught as either.
    try:
        check_is_fitted(ard_model, msg="Random message %(name)s, %(name)s")
    except ValueError as err:
        assert str(err) == "Random message ARDRegression, ARDRegression"

    try:
        check_is_fitted(svr_model, msg="Another message %(name)s, %(name)s")
    except AttributeError as err:
        assert str(err) == "Another message SVR, SVR"

    # Once fitted, the check passes and returns None.
    ard_model.fit(*make_blobs())
    svr_model.fit(*make_blobs())

    for fitted in (ard_model, svr_model):
        assert check_is_fitted(fitted) is None

    # to be removed in 0.23
    assert_warns_message(
        DeprecationWarning,
        "Passing attributes to check_is_fitted is deprecated",
        check_is_fitted, ard_model, ['coef_'])
    assert_warns_message(
        DeprecationWarning,
        "Passing all_or_any to check_is_fitted is deprecated",
        check_is_fitted, ard_model, all_or_any=any)
class ARDR():
    """Wrapper that fits an ARD regression model on a random subset of columns."""

    def __init__(self, ARDRegression, N):
        """Build the wrapped model.

        Args:
            ARDRegression: Estimator class to instantiate (called with the
                keyword arguments listed below).
            N: Divisor applied to the machine's CPU count to derive
                ``cores_number``.
        """
        self.cores_number = int(np.ceil(multiprocessing.cpu_count() / N))
        # Columns chosen by the last fit(); empty means "use every column".
        self.selected_columns = []
        self.model = ARDRegression(alpha_1=1e-06, alpha_2=1e-06, compute_score=False,
                                   copy_X=True, fit_intercept=True, lambda_1=1e-06,
                                   lambda_2=1e-06, n_iter=300, normalize=False,
                                   threshold_lambda=10000.0, tol=0.001, verbose=False)
        print("ARDRegression Cores: ", np.nan)

    def fit(self, X_train, y_train, X_test, y_test, error_type="MAE"):
        """Fit the model on up to 100 randomly chosen columns of ``X_train``.

        When 100 distinct columns cannot be drawn (fewer columns available,
        or ``X_train`` has no ``columns``), the model is fitted on all of
        ``X_train`` instead.

        Args:
            X_train: Training features.
            y_train: Training targets.
            X_test: Accepted but not used.
            y_test: Accepted but not used.
            error_type: One of ``"MSE"``, ``"R2"``, ``"MAE"``, ``"LOGLOSS"``;
                only validated, the value does not influence the fit.

        Raises:
            KeyError: If ``error_type`` is not one of the known names.
        """
        try:
            chosen = np.random.choice(X_train.columns, 100, replace=False)
            X_train = X_train[chosen]
        except Exception:
            # Best-effort subsampling. Reset the selection so predict() uses
            # the same (full) set of columns the model is fitted on, instead
            # of an empty or stale selection.
            chosen = []
        self.selected_columns = chosen

        error_dict = {
            "MSE": "rmse",
            "R2": {"l1", "l2"},
            "MAE": "mae",
            "LOGLOSS": "multi_logloss"
        }
        # The mapped value is not used by this model; the check only keeps
        # the KeyError for unknown error types.
        if error_type not in error_dict:
            raise KeyError(error_type)

        self.model.fit(X_train, y_train)

    def predict(self, X_test):
        """Predict with the fitted model on the columns used during fit()."""
        if len(self.selected_columns) > 0:
            X_test = X_test[self.selected_columns]
        prediction = self.model.predict(X_test)
        return (prediction)
def ARD(X_train, y_train, X_test, y_test):
    """Compute an accuracy score for ARD regression.

    Input: X_train, y_train, X_test, y_test
    Output: accuracy_score of the rounded test predictions against y_test
    """
    fitted = ARDRegression(compute_score=True).fit(X_train, y_train)
    # Predictions are rounded before being compared with y_test.
    rounded_predictions = fitted.predict(X_test).round()
    return metrics.accuracy_score(y_test, rounded_predictions)
from sklearn.linear_model import ARDRegression
from sklearn.model_selection import cross_val_predict
from sklearn.datasets import load_boston
from sklearn.metrics import explained_variance_score, mean_squared_error
import numpy as np
import pylab as pl
# Script: fit ARD regression on the Boston housing data and report RMSE and
# explained variance, both on the full dataset and under 10-fold CV.
# NOTE(review): load_boston is not available in recent scikit-learn
# releases -- confirm the pinned version before running.
# Load the Boston dataset
boston = load_boston()
# Regression design matrix
x = boston.data
# Regression target
y = boston.target
# Create the ARDRegression object
ARD= ARDRegression(alpha_1=0.01, alpha_2=0.01, lambda_1=1e-06, lambda_2=1e-06)
# Fit the model on the whole dataset
ARD.fit(x,y)
# Predictions on the same data the model was fitted on
yp = ARD.predict(x)
# Predictions from 10-fold cross-validation
yp_cv = cross_val_predict(ARD, x, y, cv=10)
# Compute and print RMSE and explained variance for both sets of predictions
Evariance=explained_variance_score(y,yp)
Evariance_cv=explained_variance_score(y,yp_cv)
RMSE =np.sqrt(mean_squared_error(y,yp))
RMSECV=np.sqrt(mean_squared_error(y,yp_cv))
print('Method: ARDRegression Regression')
print('RMSE on the dataset: %.4f' %RMSE)
print('RMSE on 10-fold CV: %.4f' %RMSECV)
print('Explained Variance Regression Score on the dataset: %.4f' %Evariance)
print('Explained Variance Regression 10-fold CV: %.4f' %Evariance_cv)
# Plotting real vs predicted data
def ard_regressor(self):
    """Fit ARD regression on the preprocessed split and report the test predictions.

    The four-way split comes from ``self.preprocessing()``; the test targets
    and predictions are passed to ``self.printing`` with the label ``'ARD'``.
    """
    splits = self.preprocessing()
    x_train, x_test, y_train, y_test = splits

    regressor = ARDRegression()
    predictions = regressor.fit(x_train, y_train).predict(x_test)

    self.printing(y_test, predictions, 'ARD')
# Build sparse ground-truth weights: each selected entry is drawn from a
# zero-mean Gaussian with precision lambda_ (scale 1 / sqrt(lambda_)).
# n_features, n_samples and X are expected to be defined earlier in the file.
lambda_ = 4.
w = np.zeros(n_features)
# Only keep 10 weights of interest (indices are drawn with randint)
relevant_features = np.random.randint(0, n_features, 10)
for i in relevant_features:
    w[i] = stats.norm.rvs(loc=0, scale=1. / np.sqrt(lambda_))
# Create noise with a precision alpha of 50.
alpha_ = 50.
noise = stats.norm.rvs(loc=0, scale=1. / np.sqrt(alpha_), size=n_samples)
# Create the target
y = np.dot(X, w) + noise

###############################################################################
# Fit the ARD Regression, and an ordinary least-squares model on the same data
clf = ARDRegression(compute_score=True)
clf.fit(X, y)

ols = LinearRegression()
ols.fit(X, y)

###############################################################################
# Plot the estimated weights of both models against the true weights
plt.figure(figsize=(6, 5))
plt.title("Weights of the model")
plt.plot(clf.coef_, 'b-', label="ARD estimate")
plt.plot(ols.coef_, 'r--', label="OLS estimate")
plt.plot(w, 'g-', label="Ground truth")
plt.xlabel("Features")
plt.ylabel("Values of the weights")
plt.legend(loc=1)
def learn_model(x_mat, y):
    """Fit an ``ARDRegression`` model on ``x_mat`` and ``y`` and return it."""
    regressor = ARDRegression()
    regressor.fit(x_mat, y)
    return regressor
# Build sparse ground-truth weights: each selected entry is drawn from a
# zero-mean Gaussian with precision lambda_ (scale 1 / sqrt(lambda_)).
# n_features, n_samples and X are expected to be defined earlier in the file.
lambda_ = 4.
w = np.zeros(n_features)
# Only keep 10 weights of interest (indices are drawn with randint)
relevant_features = np.random.randint(0, n_features, 10)
for i in relevant_features:
    w[i] = stats.norm.rvs(loc=0, scale=1. / np.sqrt(lambda_))
# Create noise with a precision alpha of 50.
alpha_ = 50.
noise = stats.norm.rvs(loc=0, scale=1. / np.sqrt(alpha_), size=n_samples)
# Create the target
y = np.dot(X, w) + noise

###############################################################################
# Fit the ARD Regression, and an ordinary least-squares model on the same data
clf = ARDRegression(compute_score=True)
clf.fit(X, y)

ols = LinearRegression()
ols.fit(X, y)

###############################################################################
# Plot the estimated weights of both models against the true weights
plt.figure(figsize=(6, 5))
plt.title("Weights of the model")
plt.plot(clf.coef_, color='darkblue', linestyle='-', linewidth=2, label="ARD estimate")
plt.plot(ols.coef_, color='yellowgreen', linestyle=':', linewidth=2, label="OLS estimate")
plt.plot(w, color='orange', linestyle='-', linewidth=2, label="Ground truth")
plt.xlabel("Features")
#Train normalizer on RNA seq, apply to rescaled gene expression if standardizeByTCGA: rnaSeqExpressionNormalized, L2Normalizer = standardizeExpression(rnaSeqExpression, L2Normalizer, log10Normalize) rescaledExpressionClinical = L2Normalizer.transform(np.log10(rescaledExpressionClinical+1)) # else: # prunedRnaSeqExpressionNormalized, L2Normalizer = standardizeExpression(prunedRnaSeqExpression.ix[cellExpression.shape[0],;], L2Normalizer, log10Normalize) # prunedArrayExpressionNormalized = L2Normalizer.transform(np.log10(prunedRescaledExpressionClinical+1)) #Load Docetaxel IC50 Data docetaxelData = getDrugIC50('Docetaxel', inputFolder) #Assemble training data with both IC50 and expression data docetaxelData = pd.merge(docetaxelData, rnaSeqExpressionNormalized, how='inner', left_index=True, right_index=True).drop('cell_line', axis=1) #Train Docetaxel model clf.fit(docetaxelData.drop(['IC50'], axis=1), docetaxelData['IC50']) #Validate on Clinical Data resistance_predictions = clf.predict(rescaledExpressionClinical) #Calculates ROC, first 11 samples correspond to sensitive patients, last 13 are resistant roc_auc_score(np.hstack((np.repeat(0,11), np.repeat(1,13))), resistance_predictions) roc_data = pd.DataFrame() roc_data['fpr'], roc_data['tpr'],roc_data['thresholds'] = roc_curve(np.hstack((np.repeat(0,11), np.repeat(1,13))), resistance_predictions) #Plot Results from bokeh.charts import show, output_file from bokeh.plotting import figure