def estimate(respfile, covfile, maskfile=None, cvfolds=None,
             testcov=None, testresp=None, alg='gpr', configparam=None,
             saveoutput=True, outputsuffix=None):
    """ Estimate a normative model

    This will estimate a model in one of three settings according to the
    particular parameters specified (see below):

    * under k-fold cross-validation
        required settings 1) respfile 2) covfile 3) cvfolds>=2
    * estimating a training dataset then applying to a second test dataset
        required settings 1) respfile 2) covfile 3) testcov 4) testresp
    * estimating on a training dataset, output of forward maps (mean and se)
        required settings 1) respfile 2) covfile 3) testcov

    The models are estimated on the basis of data stored on disk in ascii or
    neuroimaging data formats (nifti or cifti). Ascii data should be in
    tab or space delimited format with the number of subjects in rows and the
    number of variables in columns. Neuroimaging data will be reshaped
    into the appropriate format

    Basic usage::

        estimate(respfile, covfile, [extra_arguments])

    where the variables are defined below. Note that either the cvfolds
    parameter or (testcov, testresp) should be specified, but not both.

    :param respfile: response variables for the normative model
    :param covfile: covariates used to predict the response variable
    :param maskfile: mask used to apply to the data (nifti only)
    :param cvfolds: Number of cross-validation folds
    :param testcov: Test covariates
    :param testresp: Test responses
    :param alg: Algorithm for normative model
    :param configparam: Parameters controlling the estimation algorithm
    :param saveoutput: Save the output to disk? Otherwise returned as arrays
    :param outputsuffix: Text string to add to the output filenames

    All outputs are written to disk in the same format as the input. These are:

    :outputs: * yhat - predictive mean
              * ys2 - predictive variance
              * Hyp - hyperparameters
              * Z - deviance scores
              * Rho - Pearson correlation between true and predicted responses
              * pRho - parametric p-value for this correlation
              * rmse - root mean squared error between true/predicted responses
              * smse - standardised mean squared error

    The outputsuffix may be useful to estimate multiple normative models in the
    same directory (e.g. for custom cross-validation schemes)

    :returns: None when saveoutput is true. Otherwise a tuple
        (Yhat, S2, Hyp, Z, Rho, pRho, RMSE, SMSE), or (Yhat, S2, Hyp) when
        test covariates are given without test responses.
    :raises ValueError: if neither cvfolds nor testcov is specified
    """

    # load data
    print("Processing data in " + respfile)
    X = fileio.load(covfile)
    Y, maskvol = load_response_vars(respfile, maskfile)
    if len(Y.shape) == 1:
        Y = Y[:, np.newaxis]
    if len(X.shape) == 1:
        X = X[:, np.newaxis]
    Nmod = Y.shape[1]

    if testcov is not None:
        # we have a separate test dataset
        Xte = fileio.load(testcov)
        testids = range(X.shape[0], X.shape[0] + Xte.shape[0])
        if len(Xte.shape) == 1:
            Xte = Xte[:, np.newaxis]
        if testresp is not None:
            Yte, _ = load_response_vars(testresp, maskfile)
            if len(Yte.shape) == 1:
                Yte = Yte[:, np.newaxis]
        else:
            # forward model only: placeholder responses, never evaluated
            Yte = np.zeros([Xte.shape[0], Nmod])

        # treat as a single train-test split
        splits = CustomCV((range(0, X.shape[0]), ), (testids, ))

        Y = np.concatenate((Y, Yte), axis=0)
        X = np.concatenate((X, Xte), axis=0)

        # force the number of cross-validation folds to 1
        if cvfolds is not None and cvfolds != 1:
            print("Ignoring cross-valdation specification (test data given)")
        cvfolds = 1
    else:
        # we are running under cross-validation
        if cvfolds is None:
            raise ValueError("Either cvfolds or testcov must be specified")
        splits = KFold(n_splits=cvfolds)
        testids = range(0, X.shape[0])

    # True whenever real responses exist for the test rows, i.e. Z scores and
    # performance metrics are meaningful (cross-validation or testresp given)
    has_targets = testcov is None or testresp is not None

    # find and remove bad variables from the response variables
    # note: the covariates are assumed to have already been checked
    nz = np.where(np.bitwise_and(np.isfinite(Y).any(axis=0),
                                 np.var(Y, axis=0) != 0))[0]

    # Initialise normative model (used here only to size the Hyp array)
    nm = norm_init(X, alg=alg, configparam=configparam)

    # run cross-validation loop
    Yhat = np.zeros_like(Y)
    S2 = np.zeros_like(Y)
    Hyp = np.zeros((Nmod, nm.n_params, cvfolds))
    Z = np.zeros_like(Y)
    nlZ = np.zeros((Nmod, cvfolds))

    for fold, (tr, te) in enumerate(splits.split(X)):

        # standardize responses and covariates, ignoring invalid entries;
        # the statistics come from the training rows only
        iy, jy = np.ix_(tr, nz)
        mY = np.mean(Y[iy, jy], axis=0)
        sY = np.std(Y[iy, jy], axis=0)
        Yz = np.zeros_like(Y)
        Yz[:, nz] = (Y[:, nz] - mY) / sY
        mX = np.mean(X[tr, :], axis=0)
        sX = np.std(X[tr, :], axis=0)
        Xz = (X - mX) / sX

        # estimate the models for all subjects
        for i in range(0, len(nz)):
            col = nz[i]  # column in Y; i indexes the compacted mY / sY
            print("Estimating model ", i + 1, "of", len(nz))
            try:
                nm = norm_init(Xz[tr, :], Yz[tr, col],
                               alg=alg, configparam=configparam)
                Hyp[col, :, fold] = nm.estimate(Xz[tr, :], Yz[tr, col])
                yhat, s2 = nm.predict(Xz[tr, :], Yz[tr, col],
                                      Xz[te, :], Hyp[col, :, fold])

                # map predictions back to the original response units
                Yhat[te, col] = yhat * sY[i] + mY[i]
                S2[te, col] = s2 * sY[i]**2
                nlZ[col, fold] = nm.neg_log_lik
                if has_targets:
                    Z[te, col] = (Y[te, col] - Yhat[te, col]) / \
                                 np.sqrt(S2[te, col])

            except Exception as e:
                # deliberate best-effort: one failing model must not abort
                # the remaining ones, so report it and write NaN instead
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                print("Model ", i + 1, "of", len(nz),
                      "FAILED!..skipping and writing NaN to outputs")
                print("Exception:")
                print(e)
                print(exc_type, fname, exc_tb.tb_lineno)
                Hyp[col, :, fold] = float('nan')

                Yhat[te, col] = float('nan')
                S2[te, col] = float('nan')
                nlZ[col, fold] = float('nan')
                if has_targets:
                    Z[te, col] = float('nan')

    # compute performance metrics
    if has_targets:
        MSE = np.mean((Y[testids, :] - Yhat[testids, :])**2, axis=0)
        RMSE = np.sqrt(MSE)
        # for the remaining variables, we need to ignore zero variances
        SMSE = np.zeros_like(MSE)
        Rho = np.zeros(Nmod)
        pRho = np.ones(Nmod)
        iy, jy = np.ix_(testids, nz)  # ids for tested samples nonzero values
        SMSE[nz] = MSE[nz] / np.var(Y[iy, jy], axis=0)
        Rho[nz], pRho[nz] = compute_pearsonr(Y[iy, jy], Yhat[iy, jy])

    # Set writing options
    if saveoutput:
        print("Writing output ...")
        if fileio.file_type(respfile) in ('cifti', 'nifti'):
            exfile = respfile
        else:
            exfile = None
        if outputsuffix is not None:
            ext = str(outputsuffix) + fileio.file_extension(respfile)
        else:
            ext = fileio.file_extension(respfile)

        # Write output
        fileio.save(Yhat[testids, :].T, 'yhat' + ext,
                    example=exfile, mask=maskvol)
        fileio.save(S2[testids, :].T, 'ys2' + ext,
                    example=exfile, mask=maskvol)
        if has_targets:
            fileio.save(Z[testids, :].T, 'Z' + ext,
                        example=exfile, mask=maskvol)
            fileio.save(Rho, 'Rho' + ext, example=exfile, mask=maskvol)
            fileio.save(pRho, 'pRho' + ext, example=exfile, mask=maskvol)
            fileio.save(RMSE, 'rmse' + ext, example=exfile, mask=maskvol)
            fileio.save(SMSE, 'smse' + ext, example=exfile, mask=maskvol)
            # cvfolds is always an integer here (it sized Hyp above), so
            # one hyperparameter file is written per fold
            for fold in range(Hyp.shape[2]):
                fileio.save(Hyp[:, :, fold], 'Hyp_' + str(fold + 1) + ext,
                            example=exfile, mask=maskvol)
        else:
            fileio.save(Hyp[:, :, 0], 'Hyp' + ext,
                        example=exfile, mask=maskvol)
        return None

    if not has_targets:
        # Hyp is indexed by model, not by subject, so it is returned whole
        return (Yhat[testids, :], S2[testids, :], Hyp)
    return (Yhat[testids, :], S2[testids, :], Hyp, Z[testids, :],
            Rho, pRho, RMSE, SMSE)
def predict(covfile, respfile=None, maskfile=None, **kwargs):
    """ Make predictions with previously saved normative models

    Loads 'meta_data.md' and the 'NM_0_<i>.pkl' model files from the model
    directory, predicts the mean and variance for the covariates in covfile
    and, when respfile is given, also computes deviance scores and
    evaluation metrics and saves everything with save_results.

    Basic usage::

        predict(covfile, [extra_arguments])

    :param covfile: test covariates used to predict the response variable
    :param respfile: test response variables (optional)
    :param maskfile: mask used to apply to the data
    :param model_path: directory containing the models (default 'Models')
    :param output_path: directory for the outputs (default '', created if
        it does not exist)
    :param outputsuffix: text string to add to the output filenames
    :param job_id: parsed and removed from kwargs, not otherwise used here
    :param batch_size: parsed and removed from kwargs, not otherwise used here

    Any remaining keyword arguments are forwarded to the model's predict().

    :returns: None if the model directory does not exist; (Yhat, S2) when
        respfile is None; otherwise (Yhat, S2, Z)
    """

    model_path = kwargs.pop('model_path', 'Models')
    job_id = kwargs.pop('job_id', None)
    batch_size = kwargs.pop('batch_size', None)
    output_path = kwargs.pop('output_path', '')
    outputsuffix = kwargs.pop('outputsuffix', None)

    if not os.path.isdir(model_path):
        print('Models directory does not exist!')
        return

    # NOTE: pickle can execute arbitrary code on load; only point model_path
    # at model directories from a trusted source
    with open(os.path.join(model_path, 'meta_data.md'), 'rb') as file:
        meta_data = pickle.load(file)
    standardize = meta_data['standardize']
    mY = meta_data['mean_resp']
    sY = meta_data['std_resp']
    mX = meta_data['mean_cov']
    sX = meta_data['std_cov']

    if batch_size is not None:
        batch_size = int(batch_size)
        job_id = int(job_id) - 1

    # compare by value: identity comparison against a string literal is
    # implementation-dependent
    if (output_path != '') and (not os.path.isdir(output_path)):
        os.makedirs(output_path, exist_ok=True)

    # load data
    print("Loading data ...")
    X = fileio.load(covfile)
    if len(X.shape) == 1:
        X = X[:, np.newaxis]

    sample_num = X.shape[0]
    # NOTE(review): the number of model files is used as the feature count
    # and the files are addressed as NM_0_0 .. NM_0_<n-1>; this presumably
    # assumes contiguous model indices - confirm against the writer
    feature_num = len(glob.glob(os.path.join(model_path, 'NM_*.pkl')))

    Yhat = np.zeros([sample_num, feature_num])
    S2 = np.zeros([sample_num, feature_num])

    if standardize:
        # entry 0 holds the statistics of the first (only) training split
        Xz = (X - mX[0]) / sX[0]
    else:
        Xz = X

    # predict with the model of every feature
    for i in range(feature_num):
        print("Prediction by model ", i + 1, "of", feature_num)
        nm = norm_init(Xz)
        nm = nm.load(os.path.join(model_path,
                                  'NM_' + str(0) + '_' + str(i) + '.pkl'))
        yhat, s2 = nm.predict(Xz, **kwargs)
        if standardize:
            # map predictions back to the original response units
            Yhat[:, i] = yhat * sY[0][i] + mY[0][i]
            S2[:, i] = s2 * sY[0][i]**2
        else:
            Yhat[:, i] = yhat
            S2[:, i] = s2

    if respfile is None:
        return (Yhat, S2)

    Y, maskvol = load_response_vars(respfile, maskfile)
    if len(Y.shape) == 1:
        Y = Y[:, np.newaxis]
    Z = (Y - Yhat) / np.sqrt(S2)

    print("Evaluating the model ...")
    results = evaluate(Y, Yhat, S2=S2,
                       metrics=['Rho', 'RMSE', 'SMSE', 'EXPV'])

    print("Evaluations Writing outputs ...")
    save_results(respfile, Yhat, S2, maskvol, Z=Z,
                 outputsuffix=outputsuffix, results=results,
                 save_path=output_path)

    return (Yhat, S2, Z)
def transfer(covfile, respfile, testcov=None, testresp=None, maskfile=None,
             **kwargs):
    """ Transfer previously estimated models to new sites

    For every response variable the model 'NM_0_<index>.pkl' is loaded from
    model_path, re-estimated on the new data with estimate_on_new_sites and
    saved as 'NM_transfered_<index>.pkl' under <output_path>/Models. If test
    covariates are given, predictions for them are made as well.

    :param covfile: covariates of the new (transfer) data
    :param respfile: response variables of the new (transfer) data
    :param testcov: test covariates (optional)
    :param testresp: test responses (optional)
    :param maskfile: mask used to apply to the data
    :param model_path: (mandatory) directory containing the source models
    :param output_path: (mandatory) directory for the transferred models
    :param trbefile: (mandatory) training batch effects file; if its value
        is None, an all-zero two-column array is used
    :param tsbefile: test batch effects file; zeros are used if absent
    :param outputsuffix: text string to add to the output filenames
    :param job_id: 1-based job number, used together with batch_size
    :param batch_size: number of models per job; when given, the model
        index is (job_id - 1) * batch_size + i

    :returns: None if a mandatory argument is missing or if no test
        covariates were given; (Yhat, S2) when testresp is None; otherwise
        (Yhat, S2, Z)
    """

    required = ('model_path', 'output_path', 'trbefile')
    if any(key not in kwargs for key in required):
        print('InputError: Some mandatory arguments are missing.')
        return

    model_path = kwargs.pop('model_path')
    output_path = kwargs.pop('output_path')
    trbefile = kwargs.pop('trbefile')
    outputsuffix = kwargs.pop('outputsuffix', None)
    tsbefile = kwargs.pop('tsbefile', None)

    job_id = kwargs.pop('job_id', None)
    batch_size = kwargs.pop('batch_size', None)
    if batch_size is not None:
        batch_size = int(batch_size)
        job_id = int(job_id) - 1

    # creates output_path as well when it does not exist yet
    transferred_models_path = os.path.join(output_path, 'Models')
    os.makedirs(transferred_models_path, exist_ok=True)

    # load data
    print("Loading data ...")
    X = fileio.load(covfile)
    Y, maskvol = load_response_vars(respfile, maskfile)
    if len(Y.shape) == 1:
        Y = Y[:, np.newaxis]
    if len(X.shape) == 1:
        X = X[:, np.newaxis]
    feature_num = Y.shape[1]
    mY = np.mean(Y, axis=0)
    sY = np.std(Y, axis=0)

    if trbefile is not None:
        batch_effects_train = fileio.load(trbefile)
    else:
        batch_effects_train = np.zeros([X.shape[0], 2])

    if testcov is not None:
        # we have a separate test dataset
        Xte = fileio.load(testcov)
        if len(Xte.shape) == 1:
            Xte = Xte[:, np.newaxis]
        ts_sample_num = Xte.shape[0]
        if testresp is not None:
            Yte, _ = load_response_vars(testresp, maskfile)
            if len(Yte.shape) == 1:
                Yte = Yte[:, np.newaxis]
        else:
            Yte = np.zeros([ts_sample_num, feature_num])

        if tsbefile is not None:
            batch_effects_test = fileio.load(tsbefile)
        else:
            batch_effects_test = np.zeros([Xte.shape[0], 2])

        Yhat = np.zeros([ts_sample_num, feature_num])
        S2 = np.zeros([ts_sample_num, feature_num])

    # transfer the models for all response variables
    for i in range(feature_num):

        nm = norm_init(X)
        if batch_size is not None:  # when using nirmative_parallel
            model_id = job_id * batch_size + i
            print("Transferting model ", model_id)
        else:
            model_id = i
            print("Transferting model ", i + 1, "of", feature_num)
        nm = nm.load(os.path.join(model_path,
                                  'NM_0_' + str(model_id) + '.pkl'))

        nm = nm.estimate_on_new_sites(X, Y[:, i], batch_effects_train)
        nm.save(os.path.join(transferred_models_path,
                             'NM_transfered_' + str(model_id) + '.pkl'))

        if testcov is not None:
            yhat, s2 = nm.predict_on_new_sites(Xte, batch_effects_test)
            Yhat[:, i] = yhat
            S2[:, i] = s2

    if testcov is None:
        # nothing to predict: the prediction arrays were never created, so
        # stop here instead of failing on undefined names below
        return None

    if testresp is None:
        save_results(respfile, Yhat, S2, maskvol, outputsuffix=outputsuffix)
        return (Yhat, S2)

    Z = (Yte - Yhat) / np.sqrt(S2)

    print("Evaluating the model ...")
    results = evaluate(Yte, Yhat, S2=S2, mY=mY, sY=sY)

    save_results(respfile, Yhat, S2, maskvol, Z=Z, results=results,
                 outputsuffix=outputsuffix)

    return (Yhat, S2, Z)
# Simulate test responses for the rows whose first batch-effect column is 0:
# slope 0.2 with noise whose scale is proportional to Xs.
Ys[configparam['batch_effects_test'][:,0]==0,] = Xs[configparam['batch_effects_test'][:,0]==0,] * \
    0.2 + Xs[configparam['batch_effects_test'][:,0]==0,] * \
    0.25 * np.random.randn(np.sum(configparam['batch_effects_test'][:,0]==0))
# Rows whose first batch-effect column is 1: slope 0.85, offset 2 and
# noise with a fixed scale of 5.
Ys[configparam['batch_effects_test'][:,0]==1,] = Xs[configparam['batch_effects_test'][:,0]==1,] * \
    0.85 + 2 + 5 * np.random.randn(np.sum(configparam['batch_effects_test'][:,0]==1))

# Trivial Model
# (linear model with every random/heteroscedastic option switched off)
configparam['model_type'] = 'linear'
configparam['random_intercept'] = False
configparam['random_slope'] = False
configparam['random_noise'] = False
configparam['hetero_noise'] = False

# The configuration is handed to norm_init as a path to a pickle file.
with open('configs.pkl', 'wb') as file:
    pickle.dump(configparam, file)

nm = norm_init(X, Y, alg='hbr', configparam='configs.pkl')
nm.estimate(X, Y)
yhat_trivial, s2_trivial = nm.predict(Xs)

# Calibration error is computed separately for the two batch values and
# summed; the predictive standard deviation is sqrt of the variance.
cal_er_trivial = calibration_error(Ys[configparam['batch_effects_test'][:,0]==0,],
                                   yhat_trivial[configparam['batch_effects_test'][:,0]==0,],
                                   np.sqrt(s2_trivial[configparam['batch_effects_test'][:,0]==0,]),
                                   [0.05,0.25,0.5,0.75,0.95]) + \
    calibration_error(Ys[configparam['batch_effects_test'][:,0]==1,],
                      yhat_trivial[configparam['batch_effects_test'][:,0]==1,],
                      np.sqrt(s2_trivial[configparam['batch_effects_test'][:,0]==1,]),
                      [0.05,0.25,0.5,0.75,0.95])
rmse_trivial = np.sqrt(np.mean((Ys - yhat_trivial)**2, axis=0))

# Random Intercept and Slope
configparam['model_type'] = 'linear'
configparam['random_intercept'] = True
def estimate(covfile, respfile, **kwargs):
    """ Estimate a normative model

    This will estimate a model in one of three settings according to the
    particular parameters specified (see below):

    * under k-fold cross-validation
        required settings 1) respfile 2) covfile 3) cvfolds>=2
    * estimating a training dataset then applying to a second test dataset
        required settings 1) respfile 2) covfile 3) testcov 4) testresp
    * estimating on a training dataset, output of forward maps (mean and se)
        required settings 1) respfile 2) covfile 3) testcov

    The models are estimated on the basis of data stored on disk in ascii or
    neuroimaging data formats (nifti or cifti). Ascii data should be in
    tab or space delimited format with the number of subjects in rows and the
    number of variables in columns. Neuroimaging data will be reshaped
    into the appropriate format

    Basic usage::

        estimate(covfile, respfile, [extra_arguments])

    where the variables are defined below. Note that either the cvfolds
    parameter or (testcov, testresp) should be specified, but not both.

    :param respfile: response variables for the normative model
    :param covfile: covariates used to predict the response variable
    :param maskfile: mask used to apply to the data (nifti only)
    :param cvfolds: Number of cross-validation folds
    :param testcov: Test covariates
    :param testresp: Test responses
    :param alg: Algorithm for normative model
    :param saveoutput: Save the output to disk? Otherwise returned as arrays.
        Compared against the string 'True' (the default)
    :param savemodel: Save the fitted models and meta-data under 'Models'?
        Compared against the string 'True' (default 'False')
    :param outputsuffix: Text string to add to the output filenames
    :param standardize: Standardize covariates and responses using the
        training data statistics (default True)

    Any remaining keyword arguments are forwarded to norm_init and to the
    model's predict().

    All outputs are written to disk in the same format as the input. These are:

    :outputs: * yhat - predictive mean
              * ys2 - predictive variance
              * nm - normative model
              * Z - deviance scores
              * Rho - Pearson correlation between true and predicted responses
              * pRho - parametric p-value for this correlation
              * rmse - root mean squared error between true/predicted responses
              * smse - standardised mean squared error

    The outputsuffix may be useful to estimate multiple normative models in the
    same directory (e.g. for custom cross-validation schemes)

    :returns: None when the output is saved. Otherwise (Yhat, S2, nm, Z,
        results), or (Yhat, S2, nm) when test covariates are given without
        test responses; nm is the last model that was estimated (None if no
        response variable was valid).
    """

    # parse keyword arguments
    maskfile = kwargs.pop('maskfile', None)
    cvfolds = kwargs.pop('cvfolds', None)
    testcov = kwargs.pop('testcov', None)
    testresp = kwargs.pop('testresp', None)
    alg = kwargs.pop('alg', 'gpr')
    # these two flags are expected as the strings 'True' / 'False'
    saveoutput = kwargs.pop('saveoutput', 'True') == 'True'
    savemodel = kwargs.pop('savemodel', 'False') == 'True'
    outputsuffix = kwargs.pop('outputsuffix', None)
    standardize = kwargs.pop('standardize', True)

    if savemodel:
        os.makedirs('Models', exist_ok=True)

    # load data
    print("Processing data in " + respfile)
    X = fileio.load(covfile)
    Y, maskvol = load_response_vars(respfile, maskfile)
    if len(Y.shape) == 1:
        Y = Y[:, np.newaxis]
    if len(X.shape) == 1:
        X = X[:, np.newaxis]
    Nmod = Y.shape[1]

    if testcov is not None:
        # we have a separate test dataset
        run_cv = False
        cvfolds = 1
        Xte = fileio.load(testcov)
        testids = range(X.shape[0], X.shape[0] + Xte.shape[0])
        if len(Xte.shape) == 1:
            Xte = Xte[:, np.newaxis]
        if testresp is not None:
            Yte, _ = load_response_vars(testresp, maskfile)
            if len(Yte.shape) == 1:
                Yte = Yte[:, np.newaxis]
        else:
            # forward model only: placeholder responses, never evaluated
            Yte = np.zeros([Xte.shape[0], Nmod])

        # treat as a single train-test split
        splits = CustomCV((range(0, X.shape[0]), ), (testids, ))

        Y = np.concatenate((Y, Yte), axis=0)
        X = np.concatenate((X, Xte), axis=0)
    else:
        run_cv = True
        # we are running under cross-validation
        splits = KFold(n_splits=cvfolds)
        testids = range(0, X.shape[0])

    # True whenever real responses exist for the test rows, i.e. Z scores
    # and evaluation metrics are meaningful
    has_targets = run_cv or testresp is not None

    # find and remove bad variables from the response variables
    # note: the covariates are assumed to have already been checked
    nz = np.where(np.bitwise_and(np.isfinite(Y).any(axis=0),
                                 np.var(Y, axis=0) != 0))[0]

    # run cross-validation loop
    Yhat = np.zeros_like(Y)
    S2 = np.zeros_like(Y)
    Z = np.zeros_like(Y)
    nlZ = np.zeros((Nmod, cvfolds))

    # per-fold standardization statistics (one entry per fold)
    mean_resp = []
    std_resp = []
    mean_cov = []
    std_cov = []

    nm = None  # stays None only if there is no valid response variable

    for fold, (tr, te) in enumerate(splits.split(X)):

        # standardize responses and covariates, ignoring invalid entries;
        # the statistics come from the training rows only
        iy, jy = np.ix_(tr, nz)
        mY = np.mean(Y[iy, jy], axis=0)
        sY = np.std(Y[iy, jy], axis=0)
        # recorded exactly once per fold so the saved meta-data lines up
        # with the fold index
        mean_resp.append(mY)
        std_resp.append(sY)
        if standardize:
            Yz = np.zeros_like(Y)
            Yz[:, nz] = (Y[:, nz] - mY) / sY
            mX = np.mean(X[tr, :], axis=0)
            sX = np.std(X[tr, :], axis=0)
            Xz = (X - mX) / sX
            mean_cov.append(mX)
            std_cov.append(sX)
        else:
            Yz = Y
            Xz = X

        # estimate the models for all subjects
        for i in range(0, len(nz)):
            col = nz[i]  # column in Y; i indexes the compacted mY / sY
            print("Estimating model ", i + 1, "of", len(nz))
            nm = norm_init(Xz[tr, :], Yz[tr, col], alg=alg, **kwargs)
            try:
                nm = nm.estimate(Xz[tr, :], Yz[tr, col])

                yhat, s2 = nm.predict(Xz[te, :], Xz[tr, :],
                                      Yz[tr, col], **kwargs)

                if savemodel:
                    nm.save('Models/NM_' + str(fold) + '_' + str(col) +
                            '.pkl')

                if standardize:
                    # map predictions back to the original response units
                    Yhat[te, col] = yhat * sY[i] + mY[i]
                    S2[te, col] = s2 * sY[i]**2
                else:
                    Yhat[te, col] = yhat
                    S2[te, col] = s2

                nlZ[col, fold] = nm.neg_log_lik
                if has_targets:
                    Z[te, col] = (Y[te, col] - Yhat[te, col]) / \
                                 np.sqrt(S2[te, col])

            except Exception as e:
                # deliberate best-effort: one failing model must not abort
                # the remaining ones, so report it and write NaN instead
                exc_type, exc_obj, exc_tb = sys.exc_info()
                fname = os.path.split(exc_tb.tb_frame.f_code.co_filename)[1]
                print("Model ", i + 1, "of", len(nz),
                      "FAILED!..skipping and writing NaN to outputs")
                print("Exception:")
                print(e)
                print(exc_type, fname, exc_tb.tb_lineno)

                Yhat[te, col] = float('nan')
                S2[te, col] = float('nan')
                nlZ[col, fold] = float('nan')
                if has_targets:
                    Z[te, col] = float('nan')

    if savemodel:
        print('Saving model meta-data...')
        with open('Models/meta_data.md', 'wb') as file:
            pickle.dump({'valid_voxels': nz, 'fold_num': cvfolds,
                         'mean_resp': mean_resp, 'std_resp': std_resp,
                         'mean_cov': mean_cov, 'std_cov': std_cov,
                         'regressor': alg, 'standardize': standardize}, file)

    # compute performance metrics
    if has_targets:
        print("Evaluating the model ...")
        results = evaluate(Y[testids, :], Yhat[testids, :],
                           S2=S2[testids, :], mY=mean_resp[0],
                           sY=std_resp[0])

    # Set writing options
    if saveoutput:
        if has_targets:
            save_results(respfile, Yhat[testids, :], S2[testids, :],
                         maskvol, Z=Z[testids, :], results=results,
                         outputsuffix=outputsuffix)
        else:
            save_results(respfile, Yhat[testids, :], S2[testids, :],
                         maskvol, outputsuffix=outputsuffix)
        return None

    if has_targets:
        return (Yhat[testids, :], S2[testids, :], nm, Z[testids, :],
                results)
    return (Yhat[testids, :], S2[testids, :], nm)
def extend(covfile, respfile, maskfile=None, **kwargs):
    """ Extend previously estimated HBR models with new data

    For every response variable the model 'NM_0_<index>.pkl' is loaded from
    model_path, extended through its extend() method with the new data and
    the dummy covariates / batch effects, and saved under the same file name
    in output_path.

    :param covfile: covariates of the new data
    :param respfile: response variables of the new data
    :param maskfile: mask used to apply to the data
    :param alg: (mandatory) algorithm; only 'hbr' is supported
    :param model_path: (mandatory) directory containing the source models
    :param output_path: (mandatory) directory for the extended models
    :param trbefile: (mandatory) batch effects file for the new data
    :param dummycovfile: (mandatory) dummy covariates file
    :param dummybefile: (mandatory) dummy batch effects file
    :param informative_prior: the string 'True' enables it (default 'False');
        forwarded to the model's extend()
    :param generation_factor: converted to int and forwarded to the model's
        extend() as `samples` (default '10')
    :param job_id: 1-based job number, used together with batch_size
    :param batch_size: number of models per job; when given, the model
        index is (job_id - 1) * batch_size + i

    :returns: None
    :raises KeyError: if 'alg' is not given
    """

    alg = kwargs.pop('alg')
    if alg != 'hbr':
        print('Model extention is only possible for HBR models.')
        return

    required = ('model_path', 'output_path', 'trbefile',
                'dummycovfile', 'dummybefile')
    if any(key not in kwargs for key in required):
        print('InputError: Some mandatory arguments are missing.')
        return

    model_path = kwargs.pop('model_path')
    output_path = kwargs.pop('output_path')
    trbefile = kwargs.pop('trbefile')
    dummycovfile = kwargs.pop('dummycovfile')
    dummybefile = kwargs.pop('dummybefile')

    # read from its own key; it must not consume 'job_id', which is needed
    # below for the batch model index
    informative_prior = kwargs.pop('informative_prior', 'False') == 'True'
    generation_factor = int(kwargs.pop('generation_factor', '10'))
    job_id = kwargs.pop('job_id', None)
    batch_size = kwargs.pop('batch_size', None)
    if batch_size is not None:
        batch_size = int(batch_size)
        job_id = int(job_id) - 1

    if not os.path.isdir(output_path):
        os.makedirs(output_path, exist_ok=True)

    # load data
    print("Loading data ...")
    X = fileio.load(covfile)
    Y, maskvol = load_response_vars(respfile, maskfile)
    batch_effects_train = fileio.load(trbefile)
    X_dummy = fileio.load(dummycovfile)
    batch_effects_dummy = fileio.load(dummybefile)

    if len(Y.shape) == 1:
        Y = Y[:, np.newaxis]
    if len(X.shape) == 1:
        X = X[:, np.newaxis]
    if len(X_dummy.shape) == 1:
        X_dummy = X_dummy[:, np.newaxis]
    feature_num = Y.shape[1]

    # extend the models for all response variables
    for i in range(feature_num):

        nm = norm_init(X)
        if batch_size is not None:  # when using nirmative_parallel
            model_id = job_id * batch_size + i
            print("Extending model ", model_id)
        else:
            model_id = i
            print("Extending model ", i + 1, "of", feature_num)
        model_file = 'NM_0_' + str(model_id) + '.pkl'
        nm = nm.load(os.path.join(model_path, model_file))

        # the response is passed as a single-column 2-D slice
        nm = nm.extend(X, Y[:, i:i + 1], batch_effects_train,
                       X_dummy, batch_effects_dummy,
                       samples=generation_factor,
                       informative_prior=informative_prior)

        nm.save(os.path.join(output_path, model_file))
def fit(covfile, respfile, **kwargs):
    """ Fit a normative model to the whole dataset

    One model is estimated per valid response variable using all rows of the
    data (no cross-validation and no test set). The models and their
    meta-data are optionally saved in the 'Models' directory.

    :param covfile: covariates used to predict the response variable
    :param respfile: response variables for the normative model
    :param maskfile: mask used to apply to the data
    :param alg: Algorithm for normative model (default 'gpr')
    :param savemodel: Save the fitted models and meta-data under 'Models'?
        Compared against the string 'True' (the default)
    :param standardize: Standardize covariates and responses (default True)

    Any remaining keyword arguments are forwarded to norm_init and to the
    model's estimate().

    :returns: the last model that was estimated, or None if no response
        variable was valid
    """

    # parse keyword arguments
    maskfile = kwargs.pop('maskfile', None)
    alg = kwargs.pop('alg', 'gpr')
    # expected as the string 'True' / 'False'
    savemodel = kwargs.pop('savemodel', 'True') == 'True'
    standardize = kwargs.pop('standardize', True)

    if savemodel:
        os.makedirs('Models', exist_ok=True)

    # load data
    print("Processing data in " + respfile)
    X = fileio.load(covfile)
    Y, maskvol = load_response_vars(respfile, maskfile)
    if len(Y.shape) == 1:
        Y = Y[:, np.newaxis]
    if len(X.shape) == 1:
        X = X[:, np.newaxis]

    # find and remove bad variables from the response variables
    # note: the covariates are assumed to have already been checked
    nz = np.where(np.bitwise_and(np.isfinite(Y).any(axis=0),
                                 np.var(Y, axis=0) != 0))[0]

    mean_resp = []
    std_resp = []
    mean_cov = []
    std_cov = []

    # standardize responses and covariates, ignoring invalid entries
    mY = np.mean(Y[:, nz], axis=0)
    sY = np.std(Y[:, nz], axis=0)
    # recorded exactly once: the meta-data lists hold one entry per split
    mean_resp.append(mY)
    std_resp.append(sY)
    if standardize:
        Yz = np.zeros_like(Y)
        Yz[:, nz] = (Y[:, nz] - mY) / sY
        mX = np.mean(X, axis=0)
        sX = np.std(X, axis=0)
        Xz = (X - mX) / sX
        mean_cov.append(mX)
        std_cov.append(sX)
    else:
        Yz = Y
        Xz = X

    nm = None  # stays None only if there is no valid response variable

    # estimate the models for all subjects
    for i in range(0, len(nz)):
        print("Estimating model ", i + 1, "of", len(nz))
        nm = norm_init(Xz, Yz[:, nz[i]], alg=alg, **kwargs)
        nm = nm.estimate(Xz, Yz[:, nz[i]], **kwargs)

        if savemodel:
            nm.save('Models/NM_' + str(0) + '_' + str(nz[i]) + '.pkl')

    if savemodel:
        print('Saving model meta-data...')
        with open('Models/meta_data.md', 'wb') as file:
            pickle.dump({'valid_voxels': nz,
                         'mean_resp': mean_resp, 'std_resp': std_resp,
                         'mean_cov': mean_cov, 'std_cov': std_cov,
                         'regressor': alg, 'standardize': standardize}, file)

    return nm
# Test responses: a quadratic function of X_test plus Gaussian noise whose
# scale is proportional to X_test.
Y_test = -2 * X_test**2 + 2 * X_test + 1 + X_test * np.random.randn(
    sample_num, 1)

# Settings for the 'np' algorithm; they are handed to norm_init as a path
# to a pickle file rather than as a dictionary.
configparam = dict()
configparam['batch_size'] = 10
configparam['epochs'] = 100
configparam['m'] = 200
configparam['hidden_neuron_num'] = 10
configparam['r_dim'] = 5
configparam['z_dim'] = 3
configparam['nv'] = 0.01
configparam['device'] = torch.device('cpu')

with open('NP_configs.pkl', 'wb') as file:
    pickle.dump(configparam, file)

# Estimate on the training data, then predict mean and variance for X_test.
nm = norm_init(X_train, Y_train, alg='np', configparam='NP_configs.pkl')
nm.estimate(X_train, Y_train)
y_hat, ys2 = nm.predict(X_test)

# Plot the test data, the predictions and +/- 1.96 predictive standard
# deviations as error bars.
fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.scatter(X_test, Y_test, label='Test Data')
ax1.errorbar(X_test, y_hat, yerr=1.96 * np.sqrt(ys2).squeeze(),
             fmt='.', c='y', alpha=0.2, label='95% Prediction Intervals')
ax1.scatter(X_test, y_hat, c='r', label='Prediction')
ax1.set_title('Estimated Function')
# Create evenly spaced X values for prediction # In[8]: # Range of X X_range = [np.min(X), np.max(X)] Xsy = np.arange(X_range[0], X_range[1], 1) Xsy = Xsy.reshape(-1, 1) # Standardize using training data params Xsyz = (Xsy - mX) / sX # Train gaussian process regression and generate predictions # In[9]: nm = norm_init(Xz, Yz, alg='gpr', configparam=None) Hyp = nm.estimate(Xz, Yz) yhat, s2 = nm.predict(Xz, Yz, Xsyz, Hyp) Yhat = yhat * sY + mY # get the predictions back in original (unstandardized) units nlZ = nm.neg_log_lik S2 = s2 * sY**2 # get predictive variance # Plot predictions and predictive variance # In[10]: f, axes = plt.subplots(1, 1) f.set_figwidth(5) f.set_figheight(5) axes.plot(Xsy, Yhat, linestyle='solid', color='b', linewidth=1.5)
############################## Data Simulation ################################ X_train, Y_train, grp_id_train, X_test, Y_test, grp_id_test, coef = \ simulate_data(simulation_method, n_samples, n_features, n_grps, working_dir=working_dir, plot=True, noise='hetero_gaussian') ################################# Methods Tests ############################### for model_type in model_types: nm = norm_init(X_train, Y_train, alg='hbr', model_type=model_type, random_intercept='True', random_slope='True', random_noise='True', hetero_noise='True', skewed_likelihood='False', order='3') nm.estimate(X_train, Y_train, trbefile=working_dir + 'trbefile.pkl') yhat, ys2 = nm.predict(X_test, tsbefile=working_dir + 'tsbefile.pkl') for i in range(n_features): sorted_idx = X_test[:, i].argsort(axis=0).squeeze() temp_X = X_test[sorted_idx, i] temp_Y = Y_test[sorted_idx, ] temp_be = grp_id_test[sorted_idx, :].squeeze() temp_yhat = yhat[sorted_idx, ] temp_s2 = ys2[sorted_idx, ]