def setUp(self):
    """
    Load the two classification datasets (generating them first if absent)
    and build the PLS-DA models used by the tests.
    """
    try:
        two_class = pds.read_csv(
            os.path.join(os.path.dirname(__file__), './test_data/classification_twoclass.csv'))
        multiclass = pds.read_csv(
            os.path.join(os.path.dirname(__file__), './test_data/classification_multiclass.csv'))
    except OSError:
        # Importing the generator module creates the synthetic datasets,
        # presumably as an import side effect -- TODO confirm.
        import tests.gen_synthetic_datasets
        two_class = pds.read_csv(
            os.path.join(os.path.dirname(__file__), './test_data/classification_twoclass.csv'))
        multiclass = pds.read_csv(
            os.path.join(os.path.dirname(__file__), './test_data/classification_multiclass.csv'))
    finally:
        # Load expected values for a PLS da with 2 classes
        # NOTE(review): 'pls_da_cvoarams.csv' looks like a typo for
        # 'pls_da_cvparams.csv' -- left unchanged; verify against test_data.
        self.expected_cvParams = pds.read_csv(
            os.path.join(os.path.dirname(__file__), './test_data/pls_da_cvoarams.csv'))
        # check this
        self.da_mat = multiclass['Class_Vector'].values
        self.da = two_class['Class'].values
        self.xmat_multi = multiclass.iloc[:, 5::].values
        self.xmat = two_class.iloc[:, 1::].values
        x_scaler = ChemometricsScaler(1)
        # Y block is mean-centred only (no scaling).
        y_scaler = ChemometricsScaler(1, with_mean=True, with_std=False)
        # Fixed: keywords were 'n_comps='/'y_scaler=', inconsistent with the
        # 'ncomps='/'yscaler=' signature used by every other constructor call
        # in this test suite.
        self.plsda = ChemometricsPLSDA(ncomps=3, xscaler=x_scaler, yscaler=y_scaler)
        self.plsda_multiy = ChemometricsPLSDA(ncomps=3, xscaler=x_scaler, yscaler=y_scaler)
def test_scalers(self):
    """
    Check that PCA results under Pareto and mean-centring scaling match
    the stored reference loadings and scores.
    """
    # Pareto scaling (power 1/2).
    model_pareto = ChemometricsPCA(ncomps=3, scaler=ChemometricsScaler(1 / 2))
    model_pareto.fit(self.xmat)
    assert_allclose(model_pareto.loadings, self.expected_loadings_par)
    assert_allclose(model_pareto.scores, self.expected_scores_par)
    # Mean-centring only (power 0).
    model_centred = ChemometricsPCA(ncomps=3, scaler=ChemometricsScaler(0))
    model_centred.fit(self.xmat)
    assert_allclose(model_centred.loadings, self.expected_loadings_mc)
    assert_allclose(model_centred.scores, self.expected_scores_mc)
def test_scalers(self):
    """
    Check that PLS results under Pareto and mean-centring scaling match
    the stored reference scores, betas and VIPs.
    """
    def build_pair(power):
        # Single-Y and multi-Y models sharing the same scaler instances,
        # mirroring how the reference values were produced.
        xs = ChemometricsScaler(power)
        ys = ChemometricsScaler(power)
        single = ChemometricsPLS(ncomps=3, xscaler=xs, yscaler=ys)
        multi = ChemometricsPLS(ncomps=3, xscaler=xs, yscaler=ys)
        return single, multi

    par_model, par_model_multiy = build_pair(1 / 2)
    mc_model, mc_model_multiy = build_pair(0)

    par_model.fit(self.xmat, self.y)
    par_model_multiy.fit(self.xmat_multiy, self.ymat)
    mc_model.fit(self.xmat, self.y)
    mc_model_multiy.fit(self.xmat_multiy, self.ymat)

    assert_allclose(par_model.scores_t, self.expected_scores_t_par)
    assert_allclose(par_model.beta_coeffs, self.expected_betas_par)
    assert_allclose(par_model.VIP(), self.expected_vip_par)
    # Multi-Y reference values are not checked here (no stored expectations).
    assert_allclose(mc_model.scores_t, self.expected_scores_t_mc)
    assert_allclose(mc_model.beta_coeffs, self.expected_betas_mc)
    assert_allclose(mc_model.VIP(), self.expected_vip_mc)
def setUp(self):
    """
    Generate two synthetic classification datasets (2 and 3 classes) and
    the PLS regression / PLS-logistic models used by the tests.
    """
    # Generate 2 fake classification datasets, one with 2 classes and another with 3:
    # 40 samples x 100 features, 5 informative + 5 redundant features each.
    self.twoclass_dataset = make_classification(40, n_features=100, n_informative=5,
                                                n_redundant=5, n_classes=2)
    self.three_classdataset = make_classification(40, n_features=100, n_informative=5,
                                                  n_redundant=5, n_classes=3)
    # Y is left untouched (no centring or scaling).
    y_scaler = ChemometricsScaler(with_mean=False, with_std=False)
    # Fixed: 'n_comps=' -> 'ncomps=', matching the keyword these constructors
    # take everywhere else in the test suite.
    self.plsreg = ChemometricsPLS(ncomps=3, yscaler=y_scaler)
    self.plslog = ChemometricsPLS_Logistic(ncomps=3)
def setUp(self):
    """
    Load the two classification datasets (generating them first if absent)
    and build matched PLS regression and PLS-DA models.
    """
    here = os.path.dirname(__file__)
    multi_path = os.path.join(here, './test_data/classification_multiclass.csv')
    two_path = os.path.join(here, './test_data/classification_twoclass.csv')
    try:
        multiclass = pds.read_csv(multi_path)
        twoclass = pds.read_csv(two_path)
    except OSError:
        # Data files missing: importing the generator module creates them,
        # presumably as an import side effect -- TODO confirm.
        import tests.gen_synthetic_datasets
        multiclass = pds.read_csv(multi_path)
        twoclass = pds.read_csv(two_path)
    finally:
        # check this
        self.da_mat = multiclass['Class_Vector'].values
        self.da = twoclass['Class'].values
        self.xmat_multi = multiclass.iloc[:, 5:].values
        self.xmat = twoclass.iloc[:, 1:].values
        # Y block is mean-centred only.
        y_scaler = ChemometricsScaler(0, with_std=False, with_mean=True)
        self.plsreg = ChemometricsPLS(ncomps=3, yscaler=y_scaler)
        self.plsda = ChemometricsPLSDA(ncomps=3)
        # Dummy (one-hot) matrix so the plain PLS regression objects can be run
        # under the same conditions as the discriminant ones.
        self.dummy_y = pds.get_dummies(self.da_mat).values
def setUp(self):
    """
    Load the regression dataset (generating it first if absent), the three
    scalers under test, and the pre-computed scaled reference matrices.
    """
    here = os.path.dirname(__file__)
    try:
        regression_problem = pds.read_csv(os.path.join(here, './test_data/regression.csv'))
    except (IOError, OSError):
        # Data file missing: importing the generator module creates it,
        # presumably as an import side effect -- TODO confirm.
        import tests.gen_synthetic_datasets
        regression_problem = pds.read_csv(os.path.join(here, './test_data/regression.csv'))

    # Mean-centring, unit-variance and Pareto scalers.
    self.mc_scaler = ChemometricsScaler(0)
    self.uv_scaler = ChemometricsScaler(1)
    self.par_scaler = ChemometricsScaler(1 / 2)

    # Column 0 is the response (kept as a column vector); columns 1-3 are X.
    self.y = regression_problem.values[:, 0][np.newaxis].T
    self.xmat = regression_problem.values[:, 1:4]

    def load_reference(relpath):
        # Expected scaled matrices/vectors stored alongside the tests.
        return np.loadtxt(os.path.join(here, relpath), delimiter=',')

    self.xmat_mc = load_reference('./test_data/scaler_xmat_mc.csv')
    self.xmat_uv = load_reference('./test_data/scaler_xmat_uv.csv')
    self.xmat_par = load_reference('./test_data/scaler_xmat_par.csv')
    self.y_mc = load_reference('./test_data/scaler_y_mc.csv')
    self.y_uv = load_reference('./test_data/scaler_y_uv.csv')
    self.y_par = load_reference('./test_data/scaler_y_par.csv')
# Script that generates PCA reference results for the test suite.
from pyChemometrics import ChemometricsScaler, ChemometricsPCA
import numpy as np
import pandas as pds

# Two-class synthetic dataset; column 0 is the class label, the rest are X.
t_dset = pds.read_csv('./tests/test_data/classification_twoclass.csv')
xmat = t_dset.iloc[:, 1::].values

# Fit a 3-component PCA with unit-variance scaling.
x_scaler = ChemometricsScaler(1)
pcamodel = ChemometricsPCA(ncomps=3, scaler=x_scaler)
pcamodel.fit(xmat)
#pcamodel._screecv_optimize_ncomps(xmat, 10, stopping_condition=0.05)
# Fix the RNG so the cross-validation split is reproducible.
np.random.seed(0)
pcamodel.cross_validation(xmat)
pcamodel._screecv_optimize_ncomps(xmat, 10, stopping_condition=0.05)
# Persist the fitted loadings as reference values.
np.savetxt('./tests/test_data/pca_loadings.csv', pcamodel.loadings, fmt='%.18e',
           delimiter=',', newline='\n', header='', footer='', comments='#')
# NOTE(review): the following call is truncated in this chunk of the file.
np.savetxt('./tests/test_data/pca_scores.csv',
class TestScalerObject(unittest.TestCase):
    """
    Tests for the scaler object, using a made up (synthetic regression)
    dataset and pre-computed reference matrices for each scaling power.
    """

    def setUp(self):
        """
        Load the regression dataset (generating it first if absent), create
        the mean-centring (0), unit-variance (1) and Pareto (1/2) scalers,
        and load the expected scaled versions of X and y for each.
        """
        try:
            regression_problem = pds.read_csv(
                os.path.join(os.path.dirname(__file__), './test_data/regression.csv'))
        except (IOError, OSError) as ioerr:
            # Importing the generator module regenerates the synthetic datasets,
            # presumably as an import side effect -- TODO confirm.
            import tests.gen_synthetic_datasets
            regression_problem = pds.read_csv(
                os.path.join(os.path.dirname(__file__), './test_data/regression.csv'))
        # Scalers under test.
        self.mc_scaler = ChemometricsScaler(0)
        self.uv_scaler = ChemometricsScaler(1)
        self.par_scaler = ChemometricsScaler(1 / 2)
        # Column 0 is the response (kept as a column vector); columns 1-3 are X.
        self.y = regression_problem.values[:, 0][np.newaxis].T
        self.xmat = regression_problem.values[:, 1:4]
        # Expected scaled matrices/vectors for each scaler.
        self.xmat_mc = np.loadtxt(os.path.join(
            os.path.dirname(__file__), './test_data/scaler_xmat_mc.csv'), delimiter=',')
        self.xmat_uv = np.loadtxt(os.path.join(
            os.path.dirname(__file__), './test_data/scaler_xmat_uv.csv'), delimiter=',')
        self.xmat_par = np.loadtxt(os.path.join(
            os.path.dirname(__file__), './test_data/scaler_xmat_par.csv'), delimiter=',')
        self.y_mc = np.loadtxt(os.path.join(os.path.dirname(__file__),
                               './test_data/scaler_y_mc.csv'), delimiter=',')
        self.y_uv = np.loadtxt(os.path.join(os.path.dirname(__file__),
                               './test_data/scaler_y_uv.csv'), delimiter=',')
        self.y_par = np.loadtxt(os.path.join(os.path.dirname(__file__),
                                './test_data/scaler_y_par.csv'), delimiter=',')

    def test_scaleVector(self):
        """
        Check that scaling works with arbitrary value between 0 and 1
        as expected on a single vector.
        """
        assert_allclose(self.mc_scaler.fit_transform(self.y).squeeze(), self.y_mc)
        assert_allclose(self.uv_scaler.fit_transform(self.y).squeeze(), self.y_uv)
        assert_allclose(self.par_scaler.fit_transform(self.y).squeeze(), self.y_par)

    def test_scaleMatrix(self):
        """
        Check that scaling works with arbitrary value between 0 and 1
        as expected on a matrix of m samples by n features.
        """
        assert_allclose(self.mc_scaler.fit_transform(self.xmat), self.xmat_mc)
        assert_allclose(self.uv_scaler.fit_transform(self.xmat), self.xmat_uv)
        assert_allclose(self.par_scaler.fit_transform(self.xmat), self.xmat_par)

    def test_inverseTransformVector(self):
        """
        Test inverse transform of a vector: after fitting on y, applying the
        inverse transform to the scaled reference must recover y.
        """
        self.mc_scaler.fit(self.y)
        self.uv_scaler.fit(self.y)
        self.par_scaler.fit(self.y)
        assert_allclose(self.mc_scaler.inverse_transform(self.y_mc), self.y.squeeze())
        assert_allclose(self.uv_scaler.inverse_transform(self.y_uv), self.y.squeeze())
        assert_allclose(self.par_scaler.inverse_transform(self.y_par), self.y.squeeze())

    def test_inverseTransformMatrix(self):
        """
        Test inverse transform of a matrix: after fitting on X, applying the
        inverse transform to the scaled reference must recover X.
        """
        self.mc_scaler.fit(self.xmat)
        self.uv_scaler.fit(self.xmat)
        self.par_scaler.fit(self.xmat)
        assert_allclose(self.mc_scaler.inverse_transform(self.xmat_mc), self.xmat)
        assert_allclose(self.uv_scaler.inverse_transform(self.xmat_uv), self.xmat)
        assert_allclose(self.par_scaler.inverse_transform(self.xmat_par), self.xmat)
def setUp(self):
    """
    Load the regression datasets (generating them first if absent) and all
    pre-computed expected values for the PLS regression tests.
    """
    try:
        regression_problem = pds.read_csv(
            os.path.join(os.path.dirname(__file__), './test_data/regression.csv'))
        multiblock_regression_problem = pds.read_csv(
            os.path.join(os.path.dirname(__file__), './test_data/regression_multiblock.csv'))
    except (IOError, OSError) as ioerr:
        #os.system("python gen_synthetic_datasets.py")
        # Importing the generator module regenerates the datasets,
        # presumably as an import side effect -- TODO confirm.
        import tests.gen_synthetic_datasets
        regression_problem = pds.read_csv(
            os.path.join(os.path.dirname(__file__), './test_data/regression.csv'))
        multiblock_regression_problem = pds.read_csv(
            os.path.join(os.path.dirname(__file__), './test_data/regression_multiblock.csv'))
    finally:
        # Load expected values for a PLS regression against a Y vector
        self.expected_loadings_p = np.loadtxt(os.path.join(
            os.path.dirname(__file__), './test_data/pls_loadings_p.csv'), delimiter=',')
        # Single-Y quantities are reshaped to row/column vectors where needed.
        self.expected_loadings_q = np.loadtxt(os.path.join(
            os.path.dirname(__file__), './test_data/pls_loadings_q.csv'), delimiter=',')[np.newaxis, :]
        self.expected_weights_w = np.loadtxt(os.path.join(
            os.path.dirname(__file__), './test_data/pls_weights_w.csv'), delimiter=',')
        self.expected_weights_c = np.loadtxt(os.path.join(
            os.path.dirname(__file__), './test_data/pls_weights_c.csv'), delimiter=',')[np.newaxis, :]
        self.expected_scores_t = np.loadtxt(os.path.join(
            os.path.dirname(__file__), './test_data/pls_scores_t.csv'), delimiter=',')
        self.expected_scores_u = np.loadtxt(os.path.join(
            os.path.dirname(__file__), './test_data/pls_scores_u.csv'), delimiter=',')
        self.expected_betacoefs = np.loadtxt(os.path.join(
            os.path.dirname(__file__), './test_data/pls_betas.csv'), delimiter=',')[:, np.newaxis]
        self.expected_vips = np.loadtxt(os.path.join(
            os.path.dirname(__file__), './test_data/pls_vip.csv'), delimiter=',')
        self.expected_dmodx = np.loadtxt(os.path.join(
            os.path.dirname(__file__), './test_data/pls_dmodx.csv'), delimiter=',')
        # Load expected values for a PLS regression model against a Y matrix
        # (currently commented out):
        #self.expected_loadings_p_yblock = np.loadtxt('./test_data/pls_reg_yblock_loadings_p.csv', delimiter=',')
        #self.expected_weights_w_yblock = np.loadtxt('./test_data/pls_reg_yblock_weights_w.csv', delimiter=',')
        #self.expected_scores_t_yblock = np.loadtxt('./test_data/pls_reg_yblock_scores_t.csv', delimiter=',')
        #self.expected_scores_u_yblock = np.loadtxt('./test_data/pls_reg_yblock_scores_u.csv', delimiter=',')
        #self.expected_weights_c_yblock = np.loadtxt('./test_data/pls_reg_yblock_weights_c.csv', delimiter=',')
        #self.expected_loadings_q_yblock = np.loadtxt('./test_data/pls_reg_yblock_loadings_q.csv', delimiter=',')
        #self.expected_betacoefs_yblock = np.loadtxt('./test_data/pls_reg_yblock_betacoefs.csv', delimiter=',')
        # Expected overall fit statistics for the single-Y model.
        self.expected_modelParameters = {
            'R2Y': 0.99442967438303576,
            'R2X': 0.022903901163376705,
            'SSYcomp': np.array([5.42418672, 1.20742786, 0.27851628]),
            'SSXcomp': np.array([9750.59475071, 9779.57249348, 9770.96098837])
        }
        # Expected cross-validation statistics.
        self.expected_cvParameters = {
            'Q2Y': 0.069284226071602006,
            'Q2X': -0.12391667143436425,
            'MeanR2X_Training': 0.025896665665079883,
            'MeanR2Y_Training': 0.99636477396947942,
            'StdevR2Y_Training': 0.00091660538957527582,
            'StdevR2X_Training': 0.0010098198504153058,
            'StdevR2X_Test': 0.02386260538832127,
            'StdevR2Y_Test': 0.25034195769401973,
            'MeanR2X_Test': -0.022542842216950101,
            'MeanR2Y_Test': 0.096991536519031446
        }
        # Expected Hotelling T2 limits and outlier indices.
        self.expected_t2 = np.array([7.00212848, 6.63400492, 5.6325462])
        self.expected_outliers_t2 = np.array([5, 33])
        self.expected_outliers_dmodx = np.array([])
        # Expected results under Pareto and mean-centring scaling.
        self.expected_scores_t_par = np.loadtxt(os.path.join(
            os.path.dirname(__file__), './test_data/pls_scores_t_par.csv'), delimiter=',')
        self.expected_betas_par = np.loadtxt(os.path.join(
            os.path.dirname(__file__), './test_data/pls_betas_par.csv'), delimiter=',')[:, np.newaxis]
        self.expected_scores_t_mc = np.loadtxt(os.path.join(
            os.path.dirname(__file__), './test_data/pls_scores_t_mc.csv'), delimiter=',')
        self.expected_betas_mc = np.loadtxt(os.path.join(
            os.path.dirname(__file__), './test_data/pls_betas_mc.csv'),
            delimiter=',')[:, np.newaxis]
        self.expected_vip_mc = np.loadtxt(os.path.join(
            os.path.dirname(__file__), './test_data/pls_vip_mc.csv'), delimiter=',')
        self.expected_vip_par = np.loadtxt(os.path.join(
            os.path.dirname(__file__), './test_data/pls_vip_par.csv'), delimiter=',')
        # check this
        self.y = regression_problem.iloc[:, 0].values
        # NOTE(review): ymat and xmat_multiy are both assigned the full
        # multiblock matrix -- looks suspicious; confirm the intended
        # column split for the multi-Y fit.
        self.ymat = multiblock_regression_problem.values
        self.xmat = regression_problem.iloc[:, 1::].values
        self.xmat_multiy = multiblock_regression_problem.values
        self.expected_permutation = {}
        # Unit-variance scaling on both blocks for the models under test.
        x_scaler = ChemometricsScaler(1)
        y_scaler = ChemometricsScaler(1)
        self.plsreg = ChemometricsPLS(ncomps=3, xscaler=x_scaler, yscaler=y_scaler)
        self.plsreg_multiblock = ChemometricsPLS(ncomps=3, xscaler=x_scaler, yscaler=y_scaler)
def setUp(self):
    """
    Load the two-class dataset (generating it first if absent) and all
    pre-computed expected values for the PCA tests.
    """
    try:
        # Generate a fake classification dataset
        t_dset = pds.read_csv(
            os.path.join(os.path.dirname(__file__), './test_data/classification_twoclass.csv'))
        self.xmat = t_dset.iloc[:, 1::].values
    except (IOError, OSError, FileNotFoundError) as ioerr:
        # Importing the generator module regenerates the datasets,
        # presumably as an import side effect -- TODO confirm.
        import tests.gen_synthetic_datasets
        #os.system('python gen_synthetic_datasets.py')
        t_dset = pds.read_csv(
            os.path.join(os.path.dirname(__file__), './test_data/classification_twoclass.csv'))
        self.xmat = t_dset.iloc[:, 1::].values
    # Expected overall fit statistics.
    self.expected_modelParameters = {
        'R2X': 0.12913056143673818,
        'S0': 0.9803124001345157,
        'VarExp': np.array([9.44045066, 8.79710591, 8.11561924]),
        'VarExpRatio': np.array([0.04625821, 0.04310582, 0.03976653])
    }
    # Expected cross-validation statistics.
    self.expected_cvParameters = {
        'Q2X': -0.10571035538454221,
        'Mean_VarExp_Test': -0.0090083829247783621,
        'Stdev_VarExp_Test': 0.0037778709253728452,
        'Mean_VarExpRatio_Training': np.array([0.05108043, 0.04669199, 0.04380617]),
        'Stdev_VarExpRatio_Training': np.array([0.00130025, 0.00094489, 0.00044059])
    }
    # Expected scores/loadings for UV (default), mean-centred and Pareto scaling.
    self.expected_scores = np.loadtxt(os.path.join(
        os.path.dirname(__file__), './test_data/pca_scores.csv'), delimiter=',')
    self.expected_loadings = np.loadtxt(os.path.join(
        os.path.dirname(__file__), './test_data/pca_loadings.csv'), delimiter=',')
    self.expected_scores_mc = np.loadtxt(os.path.join(
        os.path.dirname(__file__), './test_data/pca_scores_mc.csv'), delimiter=',')
    self.expected_loadings_mc = np.loadtxt(os.path.join(
        os.path.dirname(__file__), './test_data/pca_loadings_mc.csv'), delimiter=',')
    self.expected_scores_par = np.loadtxt(os.path.join(
        os.path.dirname(__file__), './test_data/pca_scores_par.csv'), delimiter=',')
    self.expected_loadings_par = np.loadtxt(os.path.join(
        os.path.dirname(__file__), './test_data/pca_loadings_par.csv'), delimiter=',')
    self.expected_dmodx = np.loadtxt(os.path.join(
        os.path.dirname(__file__), './test_data/pca_dmodx.csv'), delimiter=',')
    # First 3 rows are the mean CV loadings, the remaining rows their st. devs.
    cvloadings = np.loadtxt(os.path.join(os.path.dirname(__file__),
                            './test_data/pca_cvloads.csv'), delimiter=',')
    self.expected_cv_meanloadings = cvloadings[0:3, :]
    self.expected_cv_stdevloadings = cvloadings[3::, :]
    # Expected Hotelling T2 limits and outlier indices.
    self.expected_t2 = np.array([9.00313686, 8.69095296, 8.34753638])
    self.expected_outlier_dmodx = np.array([])
    self.expected_outlier_t2 = np.array([14])
    # 3-component PCA with unit-variance scaling, as used by the tests.
    self.x_scaler = ChemometricsScaler(1)
    self.pcamodel = ChemometricsPCA(ncomps=3, scaler=self.x_scaler)
# Script that generates scaler reference data for the test suite.
from pyChemometrics import ChemometricsScaler, ChemometricsPLS
import numpy as np
np.random.seed(0)
import pandas as pds

# Use the standard datasets
t_dset = pds.read_csv('./tests/test_data/regression.csv')
# Columns 1-3 are X; column 0 is the response, kept as a column vector.
xmat = t_dset.iloc[:, 1:4].values
y = t_dset.iloc[:, 0].values
y = y[np.newaxis].T

# Mean-centring, unit-variance and Pareto scalers.
mc_scaler = ChemometricsScaler(0)
uv_scaler = ChemometricsScaler(1)
par_scaler = ChemometricsScaler(1 / 2)

# Scale X and y under each mode; these become the expected reference values.
xmat_mc = mc_scaler.fit_transform(xmat)
y_mc = mc_scaler.fit_transform(y)
xmat_uv = uv_scaler.fit_transform(xmat)
y_uv = uv_scaler.fit_transform(y)
xmat_par = par_scaler.fit_transform(xmat)
y_par = par_scaler.fit_transform(y)

# NOTE(review): the following call is truncated in this chunk of the file.
np.savetxt('./tests/test_data/scaler_xmat_mc.csv', xmat_mc, fmt='%.18e', delimiter=',', newline='\n',