def test_sisso_regressor_omp(mocker):
    """Check a simple SISSO run driven through the OMP preset of SISSORegressor.

    The custodian run is mocked: instead of executing SISSO, a reference
    SISSO.out file is copied into the current directory. The test then checks
    the regressor settings, the generated SISSO.in file and the descriptors
    parsed from the (reference) output.

    Args:
        mocker: The pytest-mock fixture used to patch ``Custodian.run``.
    """
    # Simple SISSO run with OMP

    # Mock the run of the custodian by just copying a reference SISSO.out file
    def copy_sisso_out():
        shutil.copy(
            os.path.join(TEST_FILES_DIR, "runs", "OMP", "SISSO.out"),
            "SISSO.out",
        )

    mocker.patch.object(
        pysisso.sklearn.Custodian,
        "run",
        return_value=[],
        side_effect=copy_sisso_out,
    )

    with ScratchDir("."):
        sisso_reg = SISSORegressor.OMP(desc_dim=4)
        assert sisso_reg.rung == 0
        assert sisso_reg.subs_sis == 1
        assert sisso_reg.desc_dim == 4
        assert sisso_reg.method == "L0"
        assert sisso_reg.L1L0_size4L0 is None

        X = np.array(
            [
                [8, 1, 3.01, 4],
                [6, 2, 3.02, 3],
                [2, 3, 3.01, 0],
                [10, 4, 3.02, -8],
                [4, 5, 3.01, 10],
            ]
        )
        y = 0.9 * X[:, 1] + 0.1 * X[:, 3] - 1.0
        sisso_reg.fit(X, y)

        # The generated input file must match the reference line by line.
        # Both files are opened in a context manager so that the handles are
        # closed even when the assertion fails.
        actual_sin = "SISSO_dir/SISSO.in"
        ref_sin = os.path.join(TEST_FILES_DIR, "runs", "OMP", "SISSO.in")
        with open(actual_sin) as actual_file, open(ref_sin) as ref_file:
            assert actual_file.readlines() == ref_file.readlines()

        # The parameters read back from the output must agree with the regressor.
        sisso_out = SISSOOut.from_file(filepath="SISSO_dir/SISSO.out")
        assert sisso_out.params.n_rungs == sisso_reg.rung
        assert sisso_out.params.SIS_subspaces_sizes == [sisso_reg.subs_sis]
        assert sisso_out.params.descriptor_dimension == sisso_reg.desc_dim
        assert sisso_out.params.sparsification_method == sisso_reg.method

        sisso_model = sisso_out.model
        assert str(sisso_model.descriptors[0]) == "(feature_1)"
        assert str(sisso_model.descriptors[1]) == "(feature_3)"
# Example: rank features with the OMP preset of SISSORegressor.
import numpy as np

from pysisso.outputs import SISSOOut
from pysisso.sklearn import SISSORegressor

# Define the data set
X = np.array(
    [
        [8, 1, 3.01, 4],
        [6, 2, 3.02, 3],
        [2, 3, 3.01, 0],
        [10, 4, 3.02, -8],
        [4, 5, 3.01, 10],
    ]
)
y = 0.9 * X[:, 1] + 0.1 * X[:, 3] - 1.0

# Define the regressor and fit the data
sisso_reg = SISSORegressor.OMP(desc_dim=4)
sisso_reg.fit(X, y, columns=["feature_0", "feature_1", "feature_2", "feature_3"])

# Get the final model obtained
sisso_out = SISSOOut.from_file(filepath="SISSO_dir/SISSO.out")
sisso_model = sisso_out.model

# Get the descriptors
descriptors = [str(d) for d in sisso_model.descriptors]

# Print the order of the OMP features
# Should start with feature_1, then feature_3.
# feature_0 and feature_2 might be interchanged.
for idesc, desc in enumerate(descriptors):
    print(f"#{idesc+1}: {desc} ({sisso_model.coefficients[0][idesc]})")
def test_sisso_out():
    """Check the parsing of a finished and of an unfinished SISSO.out file.

    The first part reads the "cubic_function" reference run and verifies the
    version, the parameters, the iterations, the models (errors, coefficients,
    intercepts, descriptors, predictions) and the cpu time. The second part
    reads a partial output, which must raise unless ``allow_unfinished`` is set.
    """
    cubic_out_path = os.path.join(
        TEST_FILES_DIR, "runs", "cubic_function", "SISSO.out"
    )
    sisso_out = SISSOOut.from_file(filepath=cubic_out_path)

    # --- Version header ---
    version = sisso_out.version
    assert isinstance(version, SISSOVersion)
    assert version.version == (3, 0, 2)
    assert version.header_string == "Version SISSO.3.0.2, June, 2020."

    # --- Parameters ---
    params = sisso_out.params
    assert isinstance(params, SISSOParams)
    assert params.number_of_samples == [100]
    assert params.sparsification_method == "L0"
    # NOTE(review): the expected text below is reproduced exactly as it appears
    # in this view of the file; if the original literal spans several lines,
    # its line breaks must be kept -- confirm against str(SISSOParams).
    expected_params_text = """Parameters for SISSO : - property_type : 3 - descriptor_dimension : 3 - total_number_properties : 1 - task_weighting : [1] - number_of_samples : [100] - n_scalar_features : 1 - n_rungs : 1 - max_feature_complexity : 10 - n_dimension_types : 0 - dimension_types : [[]] - lower_bound_maxabs_value : 0.001 - upper_bound_maxabs_value : 100000.0 - SIS_subspaces_sizes : [20] - operators : ['(+)(*)(^2)(^3)(^-1)(cos)(sin)'] - sparsification_method : L0 - n_topmodels : 100 - fit_intercept : True - metric : RMSE"""
    assert str(params) == expected_params_text

    # --- Iterations ---
    iterations = sisso_out.iterations
    assert isinstance(iterations, list)
    assert len(iterations) == params.descriptor_dimension
    first_iter, final_iter = iterations[0], iterations[-1]
    for iteration in (first_iter, final_iter):
        assert isinstance(iteration, SISSOIteration)
    assert len(first_iter.sisso_model.descriptors) == 1
    assert len(final_iter.sisso_model.descriptors) == params.descriptor_dimension
    assert first_iter.iteration_number == 1
    assert final_iter.iteration_number == 3
    assert first_iter.SIS_subspace_size == 6
    assert final_iter.SIS_subspace_size == 0

    # --- Models of the first and of the last iteration ---
    first_model = first_iter.sisso_model
    final_model = final_iter.sisso_model
    assert first_model.dimension == 1
    assert final_model.dimension == 3
    assert len(first_model.descriptors) == 1
    assert len(final_model.descriptors) == 3

    # One property only, hence one error value of each kind per model.
    for model in (first_model, final_model):
        assert len(model.rmse) == 1
        assert len(model.maxae) == 1
    expected_errors = (
        (first_model, 0.7959386860e01, 0.1858248525e02),
        (final_model, 0.1757799850e01, 0.4267977958e01),
    )
    for model, expected_rmse, expected_maxae in expected_errors:
        assert model.rmse[0] == pytest.approx(expected_rmse)
        assert model.maxae[0] == pytest.approx(expected_maxae)

    for model in (first_model, final_model):
        assert len(model.coefficients) == 1
    assert len(first_model.coefficients[0]) == 1
    assert len(final_model.coefficients[0]) == 3
    assert first_model.coefficients[0] == pytest.approx([0.2553319133e00])
    assert final_model.coefficients[0] == pytest.approx(
        [0.9856312325e00, -0.3842863966e01, -0.1417565675e01]
    )

    for model in (first_model, final_model):
        assert len(model.intercept) == 1
    assert first_model.intercept[0] == pytest.approx(-0.5364436924e01)
    assert final_model.intercept[0] == pytest.approx(0.3890294191e01)

    # --- Descriptors ---
    first_descriptors = first_model.descriptors
    assert len(first_descriptors) == 1
    final_descriptors = final_model.descriptors
    assert len(final_descriptors) == 3

    cubic_descriptor = first_descriptors[0]
    assert isinstance(cubic_descriptor, SISSODescriptor)
    assert cubic_descriptor.descriptor_id == 1
    assert cubic_descriptor.descriptor_string == "(myx)^3"

    # Only the "myx" column is referenced by the descriptors checked here.
    df = pd.DataFrame([[1, 2, 3], [4, 5, 6]], columns=["XX", "myx", "ZZ"])
    cubic_values = cubic_descriptor.evaluate(df)
    assert len(cubic_values) == 2
    assert cubic_values[0] == pytest.approx(8)
    assert cubic_values[1] == pytest.approx(125)

    expected_strings = ("(myx)^3", "(myx)^2", "(myx)")
    for expected_id, descriptor in enumerate(final_descriptors, start=1):
        assert descriptor.descriptor_id == expected_id
    for descriptor, expected_string in zip(final_descriptors, expected_strings):
        assert descriptor.descriptor_string == expected_string
    linear_descriptor = final_descriptors[2]
    assert str(linear_descriptor) == linear_descriptor.descriptor_string

    final_values = [descriptor.evaluate(df) for descriptor in final_descriptors]
    expected_values = ((8, 125), (4, 25), (2, 5))
    for values, (expected_row0, expected_row1) in zip(final_values, expected_values):
        assert values[0] == pytest.approx(expected_row0)
        assert values[1] == pytest.approx(expected_row1)

    # --- Predictions ---
    first_prediction = first_model.predict(df)
    assert first_prediction[0] == pytest.approx(-3.3217816175999997)
    assert first_prediction[1] == pytest.approx(26.5520522385)
    final_prediction = final_model.predict(df)
    assert final_prediction[0] == pytest.approx(-6.431243163)
    assert final_prediction[1] == pytest.approx(23.9347707285)

    # --- Timing and list of models ---
    assert sisso_out.cpu_time == pytest.approx(0.64)
    models = sisso_out.models
    assert len(models) == 3
    for model in models:
        assert isinstance(model, SISSOModel)

    # Partial SISSO output
    unfinished_path = os.path.join(
        TEST_FILES_DIR, "outputs", "SISSO.3.0.2.out_not_finished"
    )
    with pytest.raises(
        ValueError,
        match=r"Should get exactly one total cpu time in the string, got 0.",
    ):
        SISSOOut.from_file(filepath=unfinished_path)

    sisso_out = SISSOOut.from_file(filepath=unfinished_path, allow_unfinished=True)
    assert len(sisso_out.iterations) == 2
    assert sisso_out.cpu_time is None
    models = sisso_out.models
    assert len(models) == 2
    for model in models:
        assert isinstance(model, SISSOModel)
from pysisso.outputs import (
    SISSODescriptor,
    SISSOIteration,
    SISSOModel,
    SISSOOut,
    SISSOParams,
    SISSOVersion,
    scd,
)

# Absolute path of the "test_files" directory, resolved by going up two path
# components from ``pysisso.__file__``.
TEST_FILES_DIR = os.path.abspath(
    os.path.join(pysisso.__file__, "..", "..", "test_files")
)

# Module-level parse of the "cubic_function" reference output. This runs when
# the module is imported. The test below builds its own local object with the
# same name from the same file.
sisso_out = SISSOOut.from_file(
    filepath=os.path.join(TEST_FILES_DIR, "runs", "cubic_function", "SISSO.out")
)


@pytest.mark.unit
def test_sisso_out():
    """Check the version and parameter objects parsed from a SISSO.out file."""
    # Local object; shadows the module-level ``sisso_out`` inside this test.
    sisso_out = SISSOOut.from_file(
        filepath=os.path.join(TEST_FILES_DIR, "runs", "cubic_function", "SISSO.out")
    )

    # The version header is exposed both as a tuple and as the raw string.
    sisso_version = sisso_out.version
    assert isinstance(sisso_version, SISSOVersion)
    assert sisso_version.version == (3, 0, 2)
    assert sisso_version.header_string == "Version SISSO.3.0.2, June, 2020."

    sisso_params = sisso_out.params
    assert isinstance(sisso_params, SISSOParams)
def fit(self, X, y, index=None, columns=None, tasks=None):
    """Fit a SISSO regression based on inputs X and output y.

    This method supports Multi-Task SISSO. For Single-Task SISSO, y must have a
    shape (n_samples) or (n_samples, 1).
    For Multi-Task SISSO, y must have a shape (n_samples, n_tasks). The arrays
    will be reshaped to fit SISSO's input files.
    For example, with 10 samples and 3 properties, the output array (y) will be
    reshaped to (30, 1). The input array (X) is left unchanged.
    It is also possible to provide samples without an output for some properties
    by setting that property to NaN. In that case, the corresponding values in
    the input (X) and output (y) arrays will be removed from the SISSO inputs.
    In the previous example, if 2 of the samples have NaN for the first
    property, 1 sample has NaN for the second property and 4 samples have NaN
    for the third property, the final output array (y) will have a shape
    (30-2-1-4, 1), i.e. (23, 1), while the final input array (X) will have a
    shape (23, n_features).

    Args:
        X: Feature vectors as an array-like of shape (n_samples, n_features).
        y: Target values as an array-like of shape (n_samples,)
            or (n_samples, n_tasks).
        index: List of string identifiers for each sample. If None, "sampleN"
            with N=[1, ..., n_samples] will be used.
        columns: List of string names of the features. If None, the column
            names of X are used when X is a DataFrame, otherwise "featN" with
            N=[1, ..., n_features] will be used.
        tasks: When Multi-Task SISSO is used, this is the list of string names
            that will be used for each task/property. If None, "taskN" with
            N=[1, ..., n_tasks] will be used.

    Raises:
        NotImplementedError: If ``use_custodian`` is not set.
        ValueError: If the number of columns does not match the second axis of
            X, if y has an unsupported shape, or if index, X and y do not have
            the same number of samples.
    """
    if not self.use_custodian:
        raise NotImplementedError

    self.sisso_in = SISSOIn.from_sisso_keywords(  # pylint: disable=W0201
        ptype=1,
        ntask=self.ntask,
        task_weighting=self.task_weighting,
        desc_dim=self.desc_dim,
        restart=self.restart,
        rung=self.rung,
        opset=self.opset,
        maxcomplexity=self.maxcomplexity,
        dimclass=self.dimclass,
        maxfval_lb=self.maxfval_lb,
        maxfval_ub=self.maxfval_ub,
        subs_sis=self.subs_sis,
        method=self.method,
        L1L0_size4L0=self.L1L0_size4L0,
        fit_intercept=self.fit_intercept,
        metric=self.metric,
        nm_output=self.nm_output,
        isconvex=self.isconvex,
        width=self.width,
        nvf=self.nvf,
        vfsize=self.vfsize,
        vf2sf=self.vf2sf,
        npf_must=self.npf_must,
        L1_max_iter=self.L1_max_iter,
        L1_tole=self.L1_tole,
        L1_dens=self.L1_dens,
        L1_nlambda=self.L1_nlambda,
        L1_minrmse=self.L1_minrmse,
        L1_warm_start=self.L1_warm_start,
        L1_weighted=self.L1_weighted,
    )

    # Take the feature names from the DataFrame before it is turned into a
    # plain array (the names would be lost otherwise).
    if columns is None and isinstance(X, pd.DataFrame):
        columns = list(X.columns)

    # Set up data. The conversion is done before any use of ``.shape`` so that
    # plain nested lists are accepted as well.
    X = np.array(X)
    y = np.array(y)

    # Set up columns. These columns are used by the SISSO model wrapper
    # afterwards for the prediction.
    # Names are normalised to a list first: ``or`` on a numpy array or on a
    # pandas Index would raise because their truth value is ambiguous.
    columns = list(columns) if columns is not None else []
    self.columns = columns or [  # pylint: disable=W0201
        f"feat{ifeat:d}" for ifeat in range(1, X.shape[1] + 1)
    ]
    if len(self.columns) != X.shape[1]:
        raise ValueError("Columns should be of the size of the second axis of X.")

    single_task = y.ndim == 1 or (y.ndim == 2 and y.shape[1] == 1)
    multi_task = y.ndim == 2 and y.shape[1] > 1
    if not (single_task or multi_task):
        raise ValueError("Wrong shapes.")

    samples_index = list(index) if index is not None else []
    if not samples_index:
        samples_index = [f"sample{ii:d}" for ii in range(1, X.shape[0] + 1)]
    # Checked for both modes, so that a mismatch is reported instead of being
    # silently truncated when the per-task arrays are assembled.
    if len(samples_index) != len(y) or len(samples_index) != len(X):
        raise ValueError("Index, X and y should have same size.")

    if single_task:
        # Single-Task SISSO
        self.ntasks = 1  # pylint: disable=W0201
        index = samples_index
        nsample = None
    else:
        # Multi-Task SISSO
        self.ntasks = y.shape[1]  # pylint: disable=W0201
        tasks = list(tasks) if tasks is not None else []
        if not tasks:
            tasks = [f"task{ii:d}" for ii in range(1, self.ntasks + 1)]

        # The empty float arrays seed the stacks so that the resulting dtype
        # is promoted exactly as when starting from empty float arrays.
        X_blocks = [np.zeros((0, X.shape[1]))]
        y_blocks = [np.array([])]
        index = []
        nsample = []
        for itask in range(self.ntasks):
            y_task = y[:, itask]
            # Samples without a value (NaN) for this task are dropped.
            keep = ~np.isnan(y_task)
            y_blocks.append(y_task[keep])
            X_blocks.append(X[keep])
            nsample.append(int(np.count_nonzero(keep)))
            task_name = tasks[itask]
            index.extend(
                f"{sample_index}_{task_name}"
                for sample_index, kept in zip(samples_index, keep)
                if kept
            )
        X = np.vstack(X_blocks)
        y = np.concatenate(y_blocks)

    data = pd.DataFrame(X, index=index, columns=self.columns)
    data.insert(0, "target", y)
    data.insert(0, "identifier", index)

    # Set up SISSODat and SISSOIn
    sisso_dat = SISSODat(
        data=data, features_dimensions=self.features_dimensions, nsample=nsample
    )
    self.sisso_in.set_keywords_for_SISSO_dat(sisso_dat=sisso_dat)

    # Run SISSO
    if self.run_dir is None:
        # No run directory given: create a fresh, timestamped one.
        makedirs_p("SISSO_runs")
        timestamp = get_timestamp()
        self.run_dir = tempfile.mkdtemp(
            suffix=None, prefix=f"SISSO_dir_{timestamp}_", dir="SISSO_runs"
        )
    else:
        makedirs_p(self.run_dir)
    with cd(self.run_dir):
        self.sisso_in.to_file(filename="SISSO.in")
        sisso_dat.to_file(filename="train.dat")
        job = SISSOJob()
        c = Custodian(jobs=[job], handlers=[], validators=[])
        c.run()
        self.sisso_out = SISSOOut.from_file(  # pylint: disable=W0201
            filepath="SISSO.out"
        )

    # Clean run directory
    if (
        self.clean_run_dir
    ):  # TODO: add check here to not remove "." if the user passes . ?
        shutil.rmtree(self.run_dir)