def test_icp_regression_tree(self):
    """Run inductive conformal regression on a decision tree, twice.

    The Boston housing data is shuffled and cut into three consecutive
    thirds (train / calibrate / test).  An ICP regressor is then built,
    fitted, calibrated and used to predict intervals at significance 0.1,
    first with the plain absolute-error nonconformity function and then
    with a 1-nearest-neighbour normalizer.  Each run prints a table with
    the interval bounds, the true target and the interval width.
    """
    boston = load_boston()
    features, target = boston.data, boston.target

    # -------------------------------------------------------------------------
    # Setup training, calibration and test indices
    # -------------------------------------------------------------------------
    shuffled = np.random.permutation(target.size)
    first_cut = int(shuffled.size / 3)
    second_cut = int(2 * shuffled.size / 3)
    train = shuffled[:first_cut]
    calibrate = shuffled[first_cut:second_cut]
    test = shuffled[second_cut:]

    def fit_and_report(nc_function):
        """Train, calibrate and predict with one nonconformity function."""
        icp = IcpRegressor(nc_function)
        icp.fit(features[train, :], target[train])
        icp.calibrate(features[calibrate, :], target[calibrate])

        intervals = icp.predict(features[test, :], significance=0.1)
        widths = intervals[:, 1] - intervals[:, 0]
        rows = np.vstack([intervals.T, target[test], widths.T]).T
        print(pd.DataFrame(rows, columns=["min", "max", "truth", "size"]))

    # -------------------------------------------------------------------------
    # Without normalization
    # -------------------------------------------------------------------------
    plain_model = RegressorAdapter(DecisionTreeRegressor(min_samples_leaf=5))
    fit_and_report(RegressorNc(plain_model, AbsErrorErrFunc()))

    # -------------------------------------------------------------------------
    # With normalization
    # -------------------------------------------------------------------------
    base_model = RegressorAdapter(DecisionTreeRegressor(min_samples_leaf=5))
    difficulty_model = RegressorAdapter(KNeighborsRegressor(n_neighbors=1))
    normalizer = RegressorNormalizer(base_model,
                                     difficulty_model,
                                     AbsErrorErrFunc())
    fit_and_report(RegressorNc(base_model, AbsErrorErrFunc(), normalizer))
from nonconformist.icp import IcpRegressor
from nonconformist.nc import RegressorNc, abs_error, abs_error_inv

import pandas

# Example: inductive conformal regression with a decision tree on the Boston
# housing data.  Prints the predicted interval bounds next to the true target.
data = load_boston()

# -----------------------------------------------------------------------------
# Setup training, calibration and test indices
# -----------------------------------------------------------------------------
idx = np.random.permutation(data.target.size)
train = idx[:int(idx.size / 3)]
calibrate = idx[int(idx.size / 3):int(2 * idx.size / 3)]
test = idx[int(2 * idx.size / 3):]

# -----------------------------------------------------------------------------
# Train and calibrate
# -----------------------------------------------------------------------------
icp = IcpRegressor(RegressorNc(DecisionTreeRegressor, abs_error, abs_error_inv))
icp.fit(data.data[train, :], data.target[train])
icp.calibrate(data.data[calibrate, :], data.target[calibrate])

# -----------------------------------------------------------------------------
# Predict
# -----------------------------------------------------------------------------
prediction = icp.predict(data.data[test, :], significance=0.1)
header = ['min', 'max', 'Truth']
table = np.vstack([prediction.T, data.target[test]]).T
# Pass the labels as column names.  Stacking them on top of the numeric table
# (as a first row) would coerce every value in the frame to a string.
df = pandas.DataFrame(table, columns=header)
print(df)
from nonconformist.base import RegressorAdapter
from nonconformist.icp import IcpRegressor
from nonconformist.nc import RegressorNc, AbsErrorErrFunc, SignErrorErrFunc

# Example: inductive conformal regression with a decision tree and the signed
# error function on the Boston housing data.  Prints the predicted interval
# bounds next to the true target.

# -----------------------------------------------------------------------------
# Setup training, calibration and test indices
# -----------------------------------------------------------------------------
data = load_boston()
idx = np.random.permutation(data.target.size)
train = idx[:int(idx.size / 3)]
calibrate = idx[int(idx.size / 3):int(2 * idx.size / 3)]
test = idx[int(2 * idx.size / 3):]

# -----------------------------------------------------------------------------
# Train and calibrate
# -----------------------------------------------------------------------------
icp = IcpRegressor(
    RegressorNc(RegressorAdapter(DecisionTreeRegressor()), SignErrorErrFunc()))
icp.fit(data.data[train, :], data.target[train])
icp.calibrate(data.data[calibrate, :], data.target[calibrate])

# -----------------------------------------------------------------------------
# Predict
# -----------------------------------------------------------------------------
prediction = icp.predict(data.data[test, :], significance=0.05)
header = ['min', 'max', 'Truth']
table = np.vstack([prediction.T, data.target[test]]).T
# Pass the labels as column names.  Stacking them on top of the numeric table
# (as a first row) would coerce every value in the frame to a string.
df = pd.DataFrame(table, columns=header)
print(df)
def train_and_test_cp_algo(i):
    """Fit normalized and plain ICP regressors on dataset *i* and dump intervals.

    Reads ``data\\EURUSD_NETPOSUSD_hourly_for_regresion<i>.csv``, transforms
    the ``NetPosUsd`` column with ``mlog_trans`` and builds sliding windows of
    96 consecutive values as features, with the value following each window as
    the target.  The last 120 windows form the test set, the 3480 before them
    the calibration set and everything earlier the training set.

    Two inductive conformal regressors are fitted on a ``BiLSTM`` model: one
    with a 50-nearest-neighbour normalizer ("NCP") and one without ("CP").
    For every significance level 5 %, 10 %, ..., 95 % the back-transformed
    interval bounds, the raw test targets and the interval midpoint are
    written to ``<NCP|CP>_cudaLSTM_<level>_calibrationwindow3480.csv``.

    :param i: dataset number; ``0`` creates the CSV files (with header), any
        other value appends rows to the existing files.
    :raises ValueError: if the file has too few rows to leave any training
        windows after reserving the calibration and test sets.
    """
    window = 96
    test_size = 120
    calibration_size = 3480
    p = {'window': window}

    # Raw string: "\E" is not a valid escape sequence, so the plain literal
    # only worked by accident (and warns on recent Python versions).
    path = r'data\EURUSD_NETPOSUSD_hourly_for_regresion' + str(i) + '.csv'
    df = pd.read_csv(path).drop(['QdfTime', 'Unnamed: 0'], axis=1).fillna(0)

    # Keep the untransformed targets and the statistics needed to undo the
    # transform before the column is overwritten.
    y_raw_test = df.NetPosUsd[-test_size:]
    median_ = df.NetPosUsd.median()
    mad_ = mad(df.NetPosUsd.values)
    df.NetPosUsd = mlog_trans(df.NetPosUsd.values)
    data = df.NetPosUsd.values

    # Row k holds data[k:k + window]; its target is data[k + window].
    X = np.array([data[stop - window:stop]
                  for stop in range(window, data.shape[0])])
    y = data[window:]

    split = X.shape[0] - test_size - calibration_size
    if split <= 0:
        raise ValueError(
            'dataset %s has only %d windows; more than %d are required'
            % (i, X.shape[0], test_size + calibration_size))

    train, ytrain = X[:split, :], y[:split]
    calibrate = X[split:split + calibration_size, :]
    ycalibrate = y[split:split + calibration_size]
    test = X[-test_size:]

    # Normalized conformal predictor.
    underlying_model = RegressorAdapter(BiLSTM(p))
    normalizing_model = RegressorAdapter(KNeighborsRegressor(n_neighbors=50))
    normalizer = RegressorNormalizer(underlying_model,
                                     normalizing_model,
                                     AbsErrorErrFunc())
    icp = IcpRegressor(
        RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer))
    icp.fit(train, ytrain)
    icp.calibrate(calibrate, ycalibrate)

    # Plain conformal predictor.  It gets its own estimator instance: sharing
    # one instance would refit the model the first predictor was calibrated
    # against (presumably changing its weights -- TODO confirm for BiLSTM).
    underlying_model2 = RegressorAdapter(BiLSTM(p))
    icp2 = IcpRegressor(RegressorNc(underlying_model2, AbsErrorErrFunc()))
    icp2.fit(train, ytrain)
    icp2.calibrate(calibrate, ycalibrate)

    for a in tqdm(np.linspace(5, 95, 19)):
        level = str(np.round(a).astype(int))
        for label, predictor in (('NCP', icp), ('CP', icp2)):
            # -----------------------------------------------------------------
            # Predict
            # -----------------------------------------------------------------
            prediction = predictor.predict(test, significance=a / 100)
            lower = mlog_inverse(prediction[:, 0], median_, mad_)
            upper = mlog_inverse(prediction[:, 1], median_, mad_)
            midpoint = upper / 2 + lower / 2

            header = [label + '_lower', label + '_upper',
                      'NetPosUsd', 'prediction']
            table = np.vstack([lower, upper, y_raw_test, midpoint]).T
            frame = pd.DataFrame(table, columns=header)

            out_path = (label + '_' + 'cudaLSTM' + '_' + level + '_'
                        + 'calibrationwindow' + str(calibration_size) + '.csv')
            if i == 0:
                # First dataset: create the file and write the header.
                frame.to_csv(out_path, encoding='utf-8', index=False)
            else:
                frame.to_csv(out_path, mode='a', header=False, index=False)
def train_and_test_cp_algo(parameters):
    """Fit an ICP regressor on each of 29 datasets and append its intervals.

    For every dataset ``data\\EURUSD_NETPOSUSD_hourly_for_regresion<i>.csv``
    (``i`` in 0..28) all columns are standardized, the last 120 rows are held
    out as the test set, and an inductive conformal regressor is fitted,
    calibrated and used to predict intervals for the test rows.  The bounds,
    the true ``NetPosUsd`` values and the interval midpoints are mapped back to
    the original scale and written to
    ``<WhichCP>_<algorithm>_<alpha*100>_calibrationwindow<calibration_size>.csv``
    (created for ``i == 0``, appended to afterwards).

    :param parameters: dict with the control keys ``algorithm``,
        ``randomized_calibration``, ``alpha_``, ``calibration_size`` and
        ``WhichCP``; every remaining entry is passed to the estimator.
    :raises KeyError: if one of the control keys is missing.
    :raises ValueError: if ``algorithm`` is not a supported name.
    """
    p = parameters.copy()
    for control_key in ('algorithm', 'randomized_calibration', 'alpha_',
                        'calibration_size', 'WhichCP'):
        p.pop(control_key)

    # Lazy factories: an estimator class is only looked up when selected.
    factories = {
        'RandomForest': lambda: RandomForestRegressor(**p),
        'K-NearestNeighbours': lambda: KNeighborsRegressor(**p),
        'LightGBM': lambda: LGBMRegressor(**p),
        'LassoRegression': lambda: Lasso(**p),
        'NeuralNetwork': lambda: NeuralNetworkAlgorithm(p),
        'LSTM': lambda: BiLSTM(**p),
        'GradientBoosting': lambda: GradientBoostingRegressor(**p),
    }
    algorithm_name = parameters.get('algorithm')
    if algorithm_name not in factories:
        # Previously this surfaced later as a NameError on `algorithm`.
        raise ValueError('unsupported algorithm: %r' % (algorithm_name,))

    which_cp = parameters.get("WhichCP")
    alpha = parameters.get('alpha_')
    calibration_size = parameters.get("calibration_size")
    test_size = 120

    for i in tqdm(range(29)):
        algorithm = factories[algorithm_name]()

        # Raw string: "\E" is not a valid escape sequence.
        path = r'data\EURUSD_NETPOSUSD_hourly_for_regresion' + str(i) + '.csv'
        df = pd.read_csv(path).drop(['Unnamed: 0', 'QdfTime'],
                                    axis=1).fillna(0)

        # Target statistics, kept to map predictions back to original units.
        m, s = df['NetPosUsd'].mean(), df['NetPosUsd'].std()
        df = (df - df.mean(axis=0)) / df.std(axis=0)
        features = df.drop(['NetPosUsd'], axis=1)
        target = df['NetPosUsd']

        # Equality (not truthiness) is kept on purpose to match prior behavior.
        if parameters.get('randomized_calibration') == True:
            # Calibration rows are drawn at random from everything before the
            # test set; the rows not drawn are used for training.
            split = len(df) - test_size
            pool_x = features.iloc[:split, :].values
            pool_y = target.iloc[:split].values
            choose = np.random.choice(len(pool_x), calibration_size,
                                      replace=False)
            mask = np.ones(len(pool_x), dtype=bool)
            mask[choose] = False
            calibrate, ycalibrate = pool_x[choose, :], pool_y[choose]
            train, ytrain = pool_x[mask, :], pool_y[mask]
            test = features.iloc[split:, :].values
            ytest = target.iloc[split:].values
        else:
            # Chronological split: train | calibrate | test.
            split = len(df) - test_size - calibration_size
            calibration_end = split + calibration_size
            train = features.iloc[:split, :].values
            calibrate = features.iloc[split:calibration_end, :].values
            test = features.iloc[-test_size:, :].values
            ytrain = target.iloc[:split].values
            # Plain arrays everywhere, consistent with the randomized branch.
            ycalibrate = target.iloc[split:calibration_end].values
            ytest = target.iloc[-test_size:].values

        underlying_model = RegressorAdapter(algorithm)
        if which_cp == 'NCP':
            normalizing_model = RegressorAdapter(
                KNeighborsRegressor(n_neighbors=50))
            normalizer = RegressorNormalizer(underlying_model,
                                             normalizing_model,
                                             AbsErrorErrFunc())
            nc = RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer)
            header = ['NCP_lower', 'NCP_upper', 'NetPosUsd', 'prediction']
        else:
            nc = RegressorNc(underlying_model, AbsErrorErrFunc())
            header = ['CP_lower', 'CP_upper', 'NetPosUsd', 'prediction']

        icp = IcpRegressor(nc)
        icp.fit(train, ytrain)
        icp.calibrate(calibrate, ycalibrate)

        # ---------------------------------------------------------------------
        # Predict
        # ---------------------------------------------------------------------
        prediction = icp.predict(test, significance=alpha)
        midpoint = prediction[:, 1] / 2 + prediction[:, 0] / 2
        # Undo the standardization of the target column.
        prediction = prediction * s + m
        ytest = ytest * s + m
        midpoint = midpoint * s + m
        table = np.vstack([prediction.T, ytest, midpoint.T]).T
        dfncp = pd.DataFrame(table, columns=header)

        out_path = (which_cp + '_' + algorithm_name + '_'
                    + str(np.round(alpha * 100).astype(int)) + '_'
                    + 'calibrationwindow' + str(calibration_size) + '.csv')
        if i == 0:
            # First dataset: create the file and write the header.
            dfncp.to_csv(out_path, encoding='utf-8', index=False)
        else:
            dfncp.to_csv(out_path, mode='a', header=False, index=False)

        # Drop the fitted estimator before the next dataset is loaded.
        del algorithm
def cv(df, parameters):
    """Score one hyper-parameter setting with three forward-chaining folds.

    The last 120 rows of *df* are excluded.  For each of the ratios 0.5, 0.66
    and 0.84 of the remaining length, the rows before the cut are used for
    training and calibration and the following sixth for testing.  Both a
    normalized (50-nearest-neighbour normalizer) and a plain inductive
    conformal regressor are fitted, and their intervals are scored with
    ``qd_objective``.  The estimator settings, ``alpha_`` and the mean loss of
    each variant are appended as one row to ``<algorithm>_cv.csv`` (the file
    is created with a header if it does not exist).

    :param df: DataFrame with a ``QdfTime`` column (dropped), a ``NetPosUsd``
        target column and feature columns.
    :param parameters: dict with the control keys ``algorithm``,
        ``randomized_calibration`` and ``alpha_``; every remaining entry is
        passed to the estimator.
    :raises KeyError: if one of the control keys is missing.
    :raises ValueError: if ``algorithm`` is not a supported name.
    """
    end = len(df) - 120
    out = np.zeros(3)   # per-fold loss of the plain predictor (CP)
    out2 = np.zeros(3)  # per-fold loss of the normalized predictor (NCP)

    p = parameters.copy()
    for control_key in ('algorithm', 'randomized_calibration', 'alpha_'):
        p.pop(control_key)

    # Lazy factories: an estimator class is only looked up when selected.
    factories = {
        'RandomForest': lambda: RandomForestRegressor(**p),
        'K-NearestNeighbours': lambda: KNeighborsRegressor(**p),
        'LightGBM': lambda: LGBMRegressor(**p),
        'LassoRegression': lambda: Lasso(**p),
        'NeuralNetwork': lambda: NeuralNetworkAlgorithm(p),
        'LSTM': lambda: BiLSTM(**p),
    }
    algorithm_name = parameters.get('algorithm')
    if algorithm_name not in factories:
        # Previously this surfaced later as a NameError on `algorithm`.
        raise ValueError('unsupported algorithm: %r' % (algorithm_name,))
    algorithm = factories[algorithm_name]()

    alpha = parameters.get('alpha_')

    # Result row: estimator settings first, then alpha_ and the two losses.
    # A copy is used so the dict handed to the estimator is not mutated.
    d = dict(p)
    d['alpha_'] = alpha

    # Target statistics, kept to map predictions back to original units.
    m, s = df['NetPosUsd'].mean(), df['NetPosUsd'].std()
    df = df.drop(['QdfTime'], axis=1)
    df = (df - df.mean(axis=0)) / df.std(axis=0)
    features = df.drop(['NetPosUsd'], axis=1)
    target = df['NetPosUsd']
    fold = int(end / 6)

    for i, ratio in enumerate([.5, 0.66, .84]):
        cut = int(end * ratio)
        test = features.iloc[cut:cut + fold, :].values
        ytest = target.iloc[cut:cut + fold].values

        # Equality (not truthiness) is kept on purpose to match prior behavior.
        if parameters.get('randomized_calibration') == True:
            # Calibration rows are drawn at random from everything before the
            # cut; the rows not drawn are used for training.
            pool_x = features.iloc[:cut, :].values
            pool_y = target.iloc[:cut].values
            choose = np.random.choice(len(pool_x), fold, replace=False)
            mask = np.ones(len(pool_x), dtype=bool)
            mask[choose] = False
            calibrate, ycalibrate = pool_x[choose, :], pool_y[choose]
            train, ytrain = pool_x[mask, :], pool_y[mask]
        else:
            # Chronological split: train | calibrate | test.
            train = features.iloc[:cut - fold, :].values
            calibrate = features.iloc[cut - fold:cut, :].values
            ytrain = target.iloc[:cut - fold].values
            ycalibrate = target.iloc[cut - fold:cut].values

        # ---------------------------------------------------------------------
        # Normalized conformal predictor
        # ---------------------------------------------------------------------
        underlying_model = RegressorAdapter(algorithm)
        normalizing_model = RegressorAdapter(
            KNeighborsRegressor(n_neighbors=50))
        normalizer = RegressorNormalizer(underlying_model,
                                         normalizing_model,
                                         AbsErrorErrFunc())
        icp = IcpRegressor(
            RegressorNc(underlying_model, AbsErrorErrFunc(), normalizer))
        icp.fit(train, ytrain)
        icp.calibrate(calibrate, ycalibrate)

        prediction = icp.predict(test, significance=alpha)
        midpoint = prediction[:, 1] / 2 + prediction[:, 0] / 2
        # Undo the standardization of the target column.
        prediction = prediction * s + m
        ytest = ytest * s + m
        midpoint = midpoint * s + m
        header = ['NCP_lower', 'NCP_upper', 'NetPosUsd', 'prediction']
        table = np.vstack([prediction.T, ytest, midpoint.T]).T
        dfncp = pd.DataFrame(table, columns=header)

        # ---------------------------------------------------------------------
        # Plain conformal predictor (refits the same estimator)
        # ---------------------------------------------------------------------
        underlying_model = RegressorAdapter(algorithm)
        icp = IcpRegressor(RegressorNc(underlying_model, AbsErrorErrFunc()))
        icp.fit(train, ytrain)
        icp.calibrate(calibrate, ycalibrate)

        prediction = icp.predict(test, significance=alpha)
        prediction = prediction * s + m
        dfncp['CP_lower'] = prediction[:, 0]
        dfncp['CP_upper'] = prediction[:, 1]

        out[i] = qd_objective(dfncp.NetPosUsd, dfncp['CP_lower'],
                              dfncp['CP_upper'], alpha)
        out2[i] = qd_objective(dfncp.NetPosUsd, dfncp['NCP_lower'],
                               dfncp['NCP_upper'], alpha)

    d['CP_loss'] = np.mean(out)
    d['NCP_loss'] = np.mean(out2)

    results_path = algorithm_name + '_cv.csv'
    row = pd.DataFrame(data=d, index=[0])
    if os.path.exists(results_path):
        row.to_csv(results_path, mode='a', header=False, index=False)
    else:
        row.to_csv(results_path, encoding='utf-8', index=False)
from nonconformist.nc import RegressorNc, abs_error, abs_error_inv


def split_data(data, n_train, n_test):
    """Randomly split *data* into two parts sized in the ratio n_train:n_test.

    The first part receives ``n_train * len(data) // (n_train + n_test)``
    randomly chosen rows; the second part receives all remaining rows.
    """
    total = len(data)
    first_size = n_train * total // (n_train + n_test)
    order = np.random.permutation(total)
    return data[order[:first_size]], data[order[first_size:]]


# Load the auto-mpg table and fill in missing values.
data = Orange.data.Table("auto-mpg")
imp = Impute()
data = imp(data)

# For each significance level, average the error rate and the interval width
# of a decision-tree ICP regressor over 10 random splits.
for sig in np.linspace(0.01, 0.1, 10):
    errs = []
    szs = []
    for rep in range(10):
        # 2:1 train/test, then 2:1 proper-train/calibration.
        train, test = split_data(data, 2, 1)
        train, calib = split_data(train, 2, 1)

        nc = RegressorNc(DecisionTreeRegressor(), abs_error, abs_error_inv)
        icp = IcpRegressor(nc)
        icp.fit(train.X, train.Y)
        icp.calibrate(calib.X, calib.Y)
        pred = icp.predict(test.X, significance=sig)

        covered = [p[0] <= y <= p[1] for p, y in zip(pred, test.Y)]
        widths = [p[1] - p[0] for p in pred]
        acc = sum(covered) / len(pred)
        errs.append(1 - acc)
        szs.append(sum(widths) / len(pred))

    print(sig, np.mean(errs), np.mean(szs))
# create_conformal_model: end-to-end driver. It reads the input table (args.i),
# computes descriptors from its 'smiles' column, and then, for args.num_models
# rounds, fits and calibrates one inductive conformal regressor on the current
# train/calibration/test indices, stores the predicted interval bounds for the
# test and calibration rows, and re-randomizes the indices for the next round.
# The per-row medians of the stored bounds are used to print summary statistics,
# to optionally plot predicted vs. experimental values (args.plot), and are
# written out through write_csv_with_data. Optionally it also prints descriptor
# min/max/mean (args.phys) and plots a 2-component PCA of six descriptors
# (args.pca).
#
# NOTE(review): `icp` is only assigned when args.m is 'RF' or 'SVM'; any other
#   value reaches icp.fit() with the name unbound.
# NOTE(review): the 'SVM' branch prints an error message but still builds an
#   SVR-based regressor with model_params={'n_estimators': 100} -- confirm SVR
#   accepts that argument.
# NOTE(review): icp_s collects copy.copy(icp), a shallow copy of the single
#   `icp` object that is refitted every round -- confirm the saved models do not
#   all share the state of the last fit.
# NOTE(review): both "outside range" counting loops wrap the lookup in a bare
#   `except: pass`, so any failure (not only a missing index) is skipped
#   silently, and .dropna() is recomputed on every iteration.
# NOTE(review): in the PCA section scaler.fit_transform() is called again on the
#   test-set subset before pca.transform(), so X3 is scaled with different
#   statistics than the data the PCA was fitted on -- confirm this is intended.
def create_conformal_model(): """ Description - Create conformal model - Main loop """ #Read data from file data = read_data(args.i) #Calculate descriptors using RD-kit descriptors_df = calculate_descriptors(data['smiles']) #Assign indices train_i, calibrate_i, test_i = create_indices_test_training_calibration( data) # Create indices for test,training, calibration sets test_index_total = [x for x in test_i] calibrate_index_total = [x for x in calibrate_i] #Create inductive conformal prediction regressor if args.m == 'RF': icp = IcpRegressor( NormalizedRegressorNc(RandomForestRegressor, KNeighborsRegressor, abs_error, abs_error_inv, model_params={'n_estimators': 100})) if args.m == 'SVM': #No support vector regressor print('error - no SVM-regressor avliable') icp = IcpRegressor( NormalizedRegressorNc(SVR, KNeighborsRegressor, abs_error, abs_error_inv, model_params={'n_estimators': 100})) #Create DataFrames to store data A = pandas.DataFrame(index=range(len(data))) B = pandas.DataFrame(index=range(len(data))) C = pandas.DataFrame(index=range(len(data))) iA = pandas.DataFrame(index=range(len(data))) iB = pandas.DataFrame(index=range(len(data))) iC = pandas.DataFrame(index=range(len(data))) if args.verbose: print('Number of models to create: ' + args.num_models) print('############## Starting calculations ##############') icp_s = [] for i in range(int(args.num_models)): #DEBUG 100 Xtrain, Xtest, Xcalibrate, ytrain, ytest, ycalibrate = create_train_test_calibrate_sets( data, descriptors_df, train_i, calibrate_i, test_i) #Create nornal model icp.fit(Xtrain, ytrain) #Calibrate normal model icp.calibrate(asanyarray(Xcalibrate), asanyarray(ycalibrate)) #Predrict test and training sets prediction_test = icp.predict(asanyarray(Xtest), significance=args.significance) # 0.2 prediction_calibrate = icp.predict(asanyarray(Xcalibrate), significance=args.significance) #Create DF with data blob = pandas.DataFrame(prediction_test, index=test_i) iblob = 
pandas.DataFrame(prediction_calibrate, index=calibrate_i) A[i] = blob[0] B[i] = blob[1] iA[i] = iblob[0] iB[i] = iblob[1] #Create new indices for next model test_index_total = np.unique( np.concatenate((test_index_total, test_i), axis=0)) calibrate_index_total = np.unique( np.concatenate((calibrate_index_total, calibrate_i), axis=0)) train_i, calibrate_i, test_i = randomize_new_indices( train_i, calibrate_i, test_i, data, i) #temp = sklearn.base.clone(icp) icp_s.append(copy.copy(icp)) ### Save models ### save_models(icp_s) if args.verbose: print( '################## Loop finished, model created, test set predicted #################' ) experimental_values = data['Observed'][test_index_total] iexperimental_values = data['Observed'][calibrate_index_total] C['median_prediction_0'] = A.median(axis=1) C['median_prediction_1'] = B.median(axis=1) C['median_prediction'] = (C['median_prediction_0'] + C['median_prediction_1']) / 2 C['median_prediction_size'] = C['median_prediction'] - C[ 'median_prediction_0'] Y_pred_median_test = C['median_prediction'].dropna() median_prediction_size = C['median_prediction_size'].dropna().tolist() num_outside_median = 0 for i in range(len(data)): try: if C['median_prediction_0'].dropna()[i] < experimental_values[ i] < C['median_prediction_1'].dropna()[i]: pass else: num_outside_median += 1 #print('Outside range') except: pass #print('error') #Internal prediction iC['median_prediction_0'] = iA.median(axis=1) iC['median_prediction_1'] = iB.median(axis=1) iC['median_prediction'] = (iC['median_prediction_0'] + iC['median_prediction_1']) / 2 iC['median_prediction_size'] = iC['median_prediction'] - iC[ 'median_prediction_0'] iY_pred_median_test = iC['median_prediction'].dropna() imedian_prediction_size = iC['median_prediction_size'].dropna().tolist() inum_outside_median = 0 for i in range(len(data)): try: if iC['median_prediction_0'].dropna()[i] < iexperimental_values[ i] < iC['median_prediction_1'].dropna()[i]: pass else: inum_outside_median += 1 
#print('Outside range') except: pass #print('error') if args.verbose: print( '########################## Prediction statistics external test ##########################' ) print('') print('Number of compounds predicted in test set: ' + str(C['median_prediction'].notnull().sum())) if args.t != 'full_model': ex_r2_score = r2_score(experimental_values, Y_pred_median_test) print('R^2 (coefficient of determination): %.3f' % ex_r2_score) ex_mean_squared_error = mean_squared_error(experimental_values, Y_pred_median_test) ex_rmse = sqrt(ex_mean_squared_error) print('RMSE: %.3f' % ex_rmse) ex_MAE = mean_absolute_error(experimental_values, Y_pred_median_test) print('Mean absolute error: %.3f' % ex_MAE) print('Mean squared error: %.3f' % ex_mean_squared_error) #Average prediction range print('Mean of median prediction range: %.3f' % mean(median_prediction_size)) percent_num_outside_median = 100 * float(num_outside_median) / float( len(experimental_values)) print('Number of compounds outside of prediction range: ' + str(num_outside_median)) print('% of compounds predicted outside of prediction range: ' + str(percent_num_outside_median) + ' %') print(' ') #####Internal Prediction ######## print('Number of compounds predicted in training set: ' + str(iC['median_prediction'].notnull().sum())) iex_r2_score = r2_score(iexperimental_values, iY_pred_median_test) print('R^2 (coefficient of determination): %.3f' % iex_r2_score) iex_mean_squared_error = mean_squared_error(iexperimental_values, iY_pred_median_test) iex_rmse = sqrt(iex_mean_squared_error) print('RMSE: %.3f' % iex_rmse) print('Mean squared error: %.3f' % iex_mean_squared_error) iex_MAE = mean_absolute_error(iexperimental_values, iY_pred_median_test) print('Mean absolute error: %.3f' % iex_MAE) #Average prediction range print('Mean of median prediction range: %.3f' % mean(imedian_prediction_size)) ipercent_num_outside_median = 100 * float(inum_outside_median) / float( len(iexperimental_values)) print('Number of compounds 
outside of prediction range: ' + str(inum_outside_median)) print('% of compounds predicted outside of prediction range: ' + str(ipercent_num_outside_median) + ' %') print(' ') #### Plot results - plot test set if args.plot: if args.verbose: print(' ################ Plotting testset #################') fig, ax = plt.subplots() ax.errorbar(experimental_values, Y_pred_median_test, yerr=median_prediction_size, fmt='o', markeredgecolor='black', markersize=6, mew=1, ecolor='black', elinewidth=0.3, capsize=3, capthick=1, errorevery=1) #Set the size ax.set_ylim([-10, -3]) ax.set_xlim([-10, -3]) # Plot title and lables #plt.title('Median predictions with prediction ranges for the testset') plt.ylabel('Predicted log Kp') plt.xlabel('Experimental log Kp') # Draw line fit = np.polyfit(experimental_values, Y_pred_median_test, 1) x = [-10, -3] #Regression line #ax.plot(experimental_values, fit[0]*asanyarray(experimental_values)+ fit[1], color='black') #ax.plot(x, fit[0]*asanyarray(x)+ fit[1], color='black') #Creating colored dots for ref 10 #ref10_experimental = data.loc[data['Ref.'] == 10]['Observed'] #ref10_predicted = C['median_prediction'][ref10_experimental.index] #ax.scatter(ref10_experimental, ref10_predicted,marker = 'o', color ='red', s = 100) ax.plot(x, x, color='black') plt.show() #Print data in CSV-file descriptors_df['Median prediction low range'] = C['median_prediction_0'] descriptors_df['Median prediction high range'] = C['median_prediction_1'] descriptors_df['Median prediction'] = C['median_prediction'] descriptors_df['size prediction range'] = C['median_prediction_1'] - C[ 'median_prediction_0'] write_csv_with_data(data, descriptors_df, args.d) #Calculate min, max and mean values for descriptors if args.phys: print(args.phys) print('Min: ') print(descriptors_df.min()) print('Max: ') print(descriptors_df.max()) print('Mean:') print(descriptors_df.mean()) if args.pca: print('Starting PCA') print(descriptors_df[[ 'logP', 'PSA', 'MolWt', 'RingCount', 
'HeavyAtomCount', 'NumRotatableBonds' ]].head(3)) print(len(descriptors_df[['size prediction range']])) #Define typ of PCA pca = PCA(n_components=2) #Select desctiptors to use in PCA df_small = descriptors_df[[ 'logP', 'PSA', 'MolWt', 'RingCount', 'HeavyAtomCount', 'NumRotatableBonds' ]] #Convert descritor values to numeric/float df_X = df_small.apply(pandas.to_numeric, errors='raise') #Scale data scaler = preprocessing.RobustScaler() #Normalizer() # MaxAbsScaler() df_X_scaled = scaler.fit_transform(df_X) #Calculate PCA pca.fit(df_X_scaled) X2 = pca.transform( df_X_scaled ) #descriptors_df[['logP','PSA','MolWt','RingCount','HeavyAtomCount','NumRotatableBonds']]) #----------------------------------------------------------- desc_testset_large = descriptors_df.dropna( subset=['size prediction range']) desc_testset_small = desc_testset_large[[ 'logP', 'PSA', 'MolWt', 'RingCount', 'HeavyAtomCount', 'NumRotatableBonds' ]] desc_testset_num = desc_testset_small.apply(pandas.to_numeric, errors='raise') desc_testset_scaled = scaler.fit_transform(desc_testset_num) X3 = pca.transform( desc_testset_scaled ) #desc_testset[['logP','PSA','MolWt','RingCount','HeavyAtomCount','NumRotatableBonds']]) #----------------------------------------------------------- #desc_testset = descriptors_df.dropna(subset = ['size prediction range']) Yerr_num = desc_testset_large[['size prediction range' ]].apply(pandas.to_numeric, errors='coerce') #print(pandas.Series(Yerr['size prediction range'])) yerr = list(pandas.Series(Yerr_num['size prediction range']) / 4) plt.errorbar(X3[:, 0], X3[:, 1], yerr=yerr, fmt='o', markeredgecolor='black', markersize=6, mew=1, ecolor='black', elinewidth=0.3, capsize=3, capthick=1, errorevery=1) plt.scatter(X2[:, 0], X2[:, 1]) plt.xlabel('PC1') plt.ylabel('PC2') plt.title('PCA of descriptors') plt.show()
# -----------------------------------------------------------------------------
# Load iris; the first three columns are the inputs, the fourth the target
# -----------------------------------------------------------------------------
data = Orange.data.Table('iris')
X = data.X[:, :3]
y = data.X[:, 3]

# Random three-way split of the row indices: train / calibrate / test.
idx = np.random.permutation(y.size)
train, calibrate, test = (idx[:idx.size // 3],
                          idx[idx.size // 3:2 * idx.size // 3],
                          idx[2 * idx.size // 3:])

# -----------------------------------------------------------------------------
# Train and calibrate
# -----------------------------------------------------------------------------
# Inductive conformal regressor: fitted on the training part and calibrated
# on the calibration part.
inductive_nc = RegressorNc(DecisionTreeRegressor(), abs_error, abs_error_inv)
icp = IcpRegressor(inductive_nc)
icp.fit(X[train, :], y[train])
icp.calibrate(X[calibrate, :], y[calibrate])

# Aggregated conformal regressor with a cross sampler: fitted on the training
# part only.
aggregated_nc = RegressorNc(DecisionTreeRegressor(), abs_error, abs_error_inv)
acp = AggregatedCp(IcpRegressor(aggregated_nc), sampler=CrossSampler())
acp.fit(X[train, :], y[train])

# -----------------------------------------------------------------------------
# Predict
# -----------------------------------------------------------------------------
print('# Inductive')
prediction = icp.predict(X[test, :], significance=0.1)
# Show the first five intervals next to the true values.
for pred, actual in zip(prediction[:5], y[test]):
    print(pred, actual)

print('\n# Cross')
def _add_median_interval_columns(summary, lower_bounds, upper_bounds):
    """Fill *summary* with the per-row median interval, its midpoint and half-width.

    ``lower_bounds`` / ``upper_bounds`` hold one column per model; rows that were
    never predicted stay NaN in every derived column.
    """
    summary['median_prediction_0'] = lower_bounds.median(axis=1)
    summary['median_prediction_1'] = upper_bounds.median(axis=1)
    summary['median_prediction'] = (summary['median_prediction_0'] + summary['median_prediction_1'])/2
    summary['median_prediction_size'] = summary['median_prediction'] - summary['median_prediction_0']


def _count_outside_interval(lower, upper, observed):
    """Count rows whose observed value is not strictly inside (lower, upper).

    Only labels present in all three series (after dropping NaN bounds) are
    considered; a NaN observation counts as outside, because both comparisons
    evaluate to False.
    """
    lower = lower.dropna()
    upper = upper.dropna()
    shared = lower.index.intersection(upper.index).intersection(observed.index)
    low = lower.loc[shared].values
    high = upper.loc[shared].values
    truth = observed.loc[shared].values
    inside = (low < truth) & (truth < high)
    return int((~inside).sum())


def create_conformal_model():
    """Build an ensemble of inductive conformal regressors and report on it.

    Main loop of the script. All options are read from the module-level
    ``args`` object (``i``, ``m``, ``num_models``, ``significance``, ``t``,
    ``d``, ``verbose``, ``plot``, ``phys``, ``pca``).

    Steps:
      1. Read the data and calculate descriptors from the ``smiles`` column.
      2. For each of ``args.num_models`` iterations: fit and calibrate the ICP,
         predict intervals for the test and calibration sets, store them, and
         draw new indices for the next iteration.
      3. Save an independent copy of every fitted model via ``save_models``.
      4. Aggregate the per-model intervals by their median, optionally print
         statistics, plot the test set, write the CSV and run a PCA overview.

    Raises:
        ValueError: if ``args.m`` is neither ``'RF'`` nor ``'SVM'``.

    NOTE(review): the nesting of the statistics section (what sits under
    ``if args.verbose`` / ``if args.t != 'full_model'``) was reconstructed from
    a source whose indentation was lost - confirm against the intended layout.
    """
    # Read data from file
    data = read_data(args.i)

    # Calculate descriptors using RD-kit
    descriptors_df = calculate_descriptors(data['smiles'])

    # Assign indices
    train_i, calibrate_i, test_i = create_indices_test_training_calibration(data)

    # Running totals of every index ever used as test / calibration compound
    test_index_total = list(test_i)
    calibrate_index_total = list(calibrate_i)

    # Create inductive conformal prediction regressor
    if args.m == 'RF':
        icp = IcpRegressor(NormalizedRegressorNc(RandomForestRegressor, KNeighborsRegressor,
                                                 abs_error, abs_error_inv,
                                                 model_params={'n_estimators': 100}))
    elif args.m == 'SVM':
        # NOTE(review): SVR has no `n_estimators` parameter, so this branch
        # presumably fails once the underlying model is instantiated; the
        # message below suggests SVM is knowingly unsupported - confirm intent.
        print('error - no SVM-regressor avliable')
        icp = IcpRegressor(NormalizedRegressorNc(SVR, KNeighborsRegressor,
                                                 abs_error, abs_error_inv,
                                                 model_params={'n_estimators': 100}))
    else:
        # Fail here with a clear message instead of a NameError on first use of `icp`.
        raise ValueError("Unsupported model type %r, expected 'RF' or 'SVM'" % (args.m,))

    # DataFrames holding one column per model: A/B = lower/upper bound for the
    # test predictions, iA/iB = the same for the calibration predictions.
    # C/iC receive the aggregated (median) results.
    A = pandas.DataFrame(index=range(len(data)))
    B = pandas.DataFrame(index=range(len(data)))
    C = pandas.DataFrame(index=range(len(data)))
    iA = pandas.DataFrame(index=range(len(data)))
    iB = pandas.DataFrame(index=range(len(data)))
    iC = pandas.DataFrame(index=range(len(data)))

    if args.verbose:
        # str() keeps the message working whether the option was parsed as str or int.
        print('Number of models to create: ' + str(args.num_models))
        print('############## Starting calculations ##############')

    icp_s = []
    for i in range(int(args.num_models)):
        Xtrain, Xtest, Xcalibrate, ytrain, ytest, ycalibrate = create_train_test_calibrate_sets(
            data, descriptors_df, train_i, calibrate_i, test_i)

        # Fit and calibrate the model
        icp.fit(Xtrain, ytrain)
        icp.calibrate(asanyarray(Xcalibrate), asanyarray(ycalibrate))

        # Predict intervals for the test set and for the calibration set
        prediction_test = icp.predict(asanyarray(Xtest), significance=args.significance)
        prediction_calibrate = icp.predict(asanyarray(Xcalibrate), significance=args.significance)

        # Column 0 is the lower bound, column 1 the upper bound; assignment
        # aligns on the index, so rows outside the current split stay NaN.
        blob = pandas.DataFrame(prediction_test, index=test_i)
        iblob = pandas.DataFrame(prediction_calibrate, index=calibrate_i)
        A[i] = blob[0]
        B[i] = blob[1]
        iA[i] = iblob[0]
        iB[i] = iblob[1]

        # Create new indices for next model
        test_index_total = np.unique(np.concatenate((test_index_total, test_i), axis=0))
        calibrate_index_total = np.unique(np.concatenate((calibrate_index_total, calibrate_i), axis=0))
        train_i, calibrate_i, test_i = randomize_new_indices(train_i, calibrate_i, test_i, data, i)

        # A deep copy is required: a shallow copy shares the nonconformity
        # function (and its fitted model) with `icp`, which is refitted on the
        # next iteration, so every stored entry would end up being the last model.
        icp_s.append(copy.deepcopy(icp))

    # Save models
    save_models(icp_s)

    if args.verbose:
        print('################## Loop finished, model created, test set predicted #################')

    experimental_values = data['Observed'][test_index_total]
    iexperimental_values = data['Observed'][calibrate_index_total]

    # External (test set) aggregation
    _add_median_interval_columns(C, A, B)
    Y_pred_median_test = C['median_prediction'].dropna()
    median_prediction_size = C['median_prediction_size'].dropna().tolist()
    num_outside_median = _count_outside_interval(
        C['median_prediction_0'], C['median_prediction_1'], experimental_values)

    # Internal aggregation, based on the calibration-set predictions
    _add_median_interval_columns(iC, iA, iB)
    iY_pred_median_test = iC['median_prediction'].dropna()
    imedian_prediction_size = iC['median_prediction_size'].dropna().tolist()
    inum_outside_median = _count_outside_interval(
        iC['median_prediction_0'], iC['median_prediction_1'], iexperimental_values)

    if args.verbose:
        print('########################## Prediction statistics external test ##########################')
        print('')
        print('Number of compounds predicted in test set: '+ str(C['median_prediction'].notnull().sum()))
        if args.t != 'full_model':
            ex_r2_score = r2_score(experimental_values, Y_pred_median_test)
            print('R^2 (coefficient of determination): %.3f' % ex_r2_score)
            ex_mean_squared_error = mean_squared_error(experimental_values, Y_pred_median_test)
            ex_rmse = sqrt(ex_mean_squared_error)
            print('RMSE: %.3f' % ex_rmse)
            ex_MAE = mean_absolute_error(experimental_values, Y_pred_median_test)
            print('Mean absolute error: %.3f' % ex_MAE)
            print('Mean squared error: %.3f' % ex_mean_squared_error)
            # Average prediction range
            print('Mean of median prediction range: %.3f' % mean(median_prediction_size))
            percent_num_outside_median = 100*float(num_outside_median)/float(len(experimental_values))
            print('Number of compounds outside of prediction range: '+str(num_outside_median))
            print('% of compounds predicted outside of prediction range: '+str(percent_num_outside_median) +' %')
            print(' ')

        # Internal prediction. The output calls this the "training set", but
        # the numbers come from the calibration-set predictions stored in iA/iB.
        print('Number of compounds predicted in training set: '+ str(iC['median_prediction'].notnull().sum()))
        iex_r2_score = r2_score(iexperimental_values, iY_pred_median_test)
        print('R^2 (coefficient of determination): %.3f' % iex_r2_score)
        iex_mean_squared_error = mean_squared_error(iexperimental_values, iY_pred_median_test)
        iex_rmse = sqrt(iex_mean_squared_error)
        print('RMSE: %.3f' % iex_rmse)
        print('Mean squared error: %.3f' % iex_mean_squared_error)
        iex_MAE = mean_absolute_error(iexperimental_values, iY_pred_median_test)
        print('Mean absolute error: %.3f' % iex_MAE)
        # Average prediction range
        print('Mean of median prediction range: %.3f' % mean(imedian_prediction_size))
        ipercent_num_outside_median = 100*float(inum_outside_median)/float(len(iexperimental_values))
        print('Number of compounds outside of prediction range: '+str(inum_outside_median))
        print('% of compounds predicted outside of prediction range: '+str(ipercent_num_outside_median) +' %')
        print(' ')

    # Plot results - test set: median prediction vs. experiment, with the
    # interval half-width as error bar and the identity line for reference.
    if args.plot:
        if args.verbose:
            print(' ################ Plotting testset #################')
        fig, ax = plt.subplots()
        ax.errorbar(experimental_values, Y_pred_median_test, yerr=median_prediction_size,
                    fmt='o', markeredgecolor='black', markersize=6, mew=1, ecolor='black',
                    elinewidth=0.3, capsize=3, capthick=1, errorevery=1)
        # Set the size
        ax.set_ylim([-10,-3])
        ax.set_xlim([-10,-3])
        # Axis labels
        plt.ylabel('Predicted log Kp')
        plt.xlabel('Experimental log Kp')
        # Identity line
        x = [-10,-3]
        ax.plot(x, x, color='black')
        plt.show()

    # Print data in CSV-file
    descriptors_df['Median prediction low range'] = C['median_prediction_0']
    descriptors_df['Median prediction high range'] = C['median_prediction_1']
    descriptors_df['Median prediction'] = C['median_prediction']
    descriptors_df['size prediction range'] = C['median_prediction_1'] - C['median_prediction_0']
    write_csv_with_data(data, descriptors_df, args.d)

    # Min, max and mean values for descriptors
    if args.phys:
        print(args.phys)
        print('Min: ')
        print(descriptors_df.min())
        print('Max: ')
        print(descriptors_df.max())
        print('Mean:')
        print(descriptors_df.mean())

    if args.pca:
        pca_columns = ['logP','PSA','MolWt','RingCount','HeavyAtomCount','NumRotatableBonds']
        print('Starting PCA')
        print(descriptors_df[pca_columns].head(3))
        print(len(descriptors_df[['size prediction range']]))

        # Two-component PCA on robust-scaled numeric descriptors of all compounds
        pca = PCA(n_components=2)
        df_X = descriptors_df[pca_columns].apply(pandas.to_numeric, errors='raise')
        scaler = preprocessing.RobustScaler()
        df_X_scaled = scaler.fit_transform(df_X)
        pca.fit(df_X_scaled)
        X2 = pca.transform(df_X_scaled)

        # Compounds that received a prediction interval (the test compounds)
        desc_testset_large = descriptors_df.dropna(subset=['size prediction range'])
        desc_testset_num = desc_testset_large[pca_columns].apply(pandas.to_numeric, errors='raise')
        # Reuse the scaler fitted on all compounds: re-fitting it on the subset
        # would put these points in a different space than the PCA was fitted
        # in, so they would no longer coincide with their own scatter markers.
        desc_testset_scaled = scaler.transform(desc_testset_num)
        X3 = pca.transform(desc_testset_scaled)

        # Error bars show a quarter of the full interval width
        Yerr_num = desc_testset_large[['size prediction range']].apply(pandas.to_numeric, errors='coerce')
        yerr = list(pandas.Series(Yerr_num['size prediction range'])/4)
        plt.errorbar(X3[:,0], X3[:,1], yerr=yerr, fmt='o', markeredgecolor='black',
                     markersize=6, mew=1, ecolor='black', elinewidth=0.3, capsize=3,
                     capthick=1, errorevery=1)
        plt.scatter(X2[:,0], X2[:,1])
        plt.xlabel('PC1')
        plt.ylabel('PC2')
        plt.title('PCA of descriptors')
        plt.show()