def get_error_distribution_dataframe(self, model, cofactors=None):
    """
    Get signed residual distribution per patient, per sub-score & per visit.

    Each residual is equal to the modeled data minus the observed data.

    Parameters
    ----------
    model : leaspy.models.abstract_model.AbstractModel
    cofactors : str, list [str], optional (default None)
        Contains the cofactors' names to be included in the DataFrame.
        By default, no cofactors are returned.
        If cofactors == "all", all the available cofactors are returned.

    Returns
    -------
    residuals_dataframe : pandas.DataFrame

    Raises
    ------
    TypeError
        If `cofactors` is neither a string nor a list of strings.

    Examples
    --------
    Get mean absolute error per feature:

    >>> from leaspy import AlgorithmSettings, Data, Leaspy
    >>> data = Data.from_csv_file("/my/data/path")
    >>> leaspy_logistic = Leaspy('logistic')
    >>> settings = AlgorithmSettings("mcmc_saem")
    >>> leaspy_logistic.calibrate(data, settings)
    >>> settings = AlgorithmSettings("mode_real")
    >>> results = leaspy_logistic.personalize(data, settings)
    >>> residuals_dataframe = results.get_error_distribution_dataframe(model)
    >>> residuals_dataframe[results.data.headers].abs().mean()
    """
    residuals_dataset = Dataset(self.data)
    # Signed residuals: model reconstruction minus observed values.
    residuals_dataset.values = model.compute_individual_tensorized(
        residuals_dataset.timepoints, self.individual_parameters) \
        - residuals_dataset.values
    residuals_dataframe = residuals_dataset.to_pandas().set_index('ID')

    if cofactors is not None:
        # Normalize the `cofactors` argument into a list of cofactor names.
        # isinstance (not type(...) ==) so str/list subclasses are accepted too.
        if isinstance(cofactors, str):
            if cofactors == "all":
                cofactors_list = self.data.cofactors
            else:
                cofactors_list = [cofactors]
        elif isinstance(cofactors, list):
            cofactors_list = cofactors
        else:
            raise TypeError(
                "The given `cofactors` input must be a string or a list of strings! "
                "You gave an object of type %s" % str(type(cofactors)))
        # One row per patient: cofactor values are constant across visits,
        # so `first()` after the groupby keeps a single value per ID.
        cofactors_df = self.data.to_dataframe(
            cofactors=cofactors).groupby('ID').first()[cofactors_list]
        residuals_dataframe = residuals_dataframe.join(cofactors_df)

    return residuals_dataframe
def test_constructor_multivariate(self):
    """Dataset built from a multivariate CSV: check sizes, padded values & timepoints."""
    path_to_data = os.path.join(test_data_dir, 'io', "data",
                                'multivariate_data_for_dataset.csv')
    data = Data.from_csv_file(path_to_data)
    dataset = Dataset(data)

    self.assertEqual(dataset.n_individuals, 3)
    self.assertEqual(dataset.max_observations, 4)
    self.assertEqual(data.dimension, 2)

    # Expected padded tensors: visits beyond a patient's follow-up are zero-filled.
    values = torch.tensor([[[1., 1.], [5., 2.], [2., 3.], [0., 0.]],
                           [[1., 1.], [5., 8.], [0., 0.], [0., 0.]],
                           [[1., 4.], [8., 1.], [1., 1.], [3., 2.]]])
    mask = torch.tensor([[[1.], [1.], [1.], [0.]],
                         [[1.], [1.], [0.], [0.]],
                         [[1.], [1.], [1.], [1.]]])
    timepoints = torch.tensor([[1., 2., 3., 0.],
                               [1., 2., 0., 0.],
                               [1., 2., 4., 5.]])

    self.assertTrue(torch.equal(dataset.values, values))
    # TODO: confirm the expected mask shape/content before re-enabling this check
    # self.assertTrue(torch.equal(dataset.mask, mask))
    # Use the absolute sum so that positive and negative deviations cannot
    # cancel each other out and silently hide a mismatch.
    self.assertAlmostEqual((dataset.timepoints - timepoints).abs().sum(), 0,
                           delta=10e-5)
def plot_patients(self, model, data, indices, ax=None):
    """
    Plot modeled (solid) vs observed (dashed, 'o' markers) trajectories
    for the patients whose IDs are listed in `indices`.

    Parameters
    ----------
    model : leaspy model exposing `compute_individual_tensorized`
    data : leaspy.io.data.data.Data
        Must carry a `realizations` attribute with 'xi', 'tau', 'sources'.
    indices : iterable of patient IDs (keys of `data.individuals`)
    ax : matplotlib axis, optional
        Created on the fly when not provided.

    Returns
    -------
    int
        Always 0 (kept for backward compatibility with existing callers).
    """
    # Get dataset from data
    dataset = Dataset(data=data, model=model, algo=None)
    # Individual random-effect realizations attached to the data
    realizations = data.realizations

    colors = cm.rainbow(np.linspace(0, 1, len(indices) + 2))
    if ax is None:
        fig, ax = plt.subplots(1, 1)

    xi = realizations['xi'].tensor_realizations
    tau = realizations['tau'].tensor_realizations
    sources = realizations['sources'].tensor_realizations
    patient_values = model.compute_individual_tensorized(
        dataset.timepoints, (xi, tau, sources))

    # Map each patient ID to its row index in the tensors.
    dict_correspondence = {idx: i for i, idx in enumerate(data.individuals.keys())}

    for p, idx in enumerate(indices):
        i = dict_correspondence[idx]
        # Hoist the patient's number of visits: it bounds every slice below.
        n_obs = dataset.nb_observations_per_individuals[i]
        times = dataset.timepoints[i, 0:n_obs].detach().numpy()
        model_value = patient_values[i, 0:n_obs, :]
        score = dataset.values[i, 0:n_obs, :]
        # Solid line: model prediction; dashed line with markers: observations.
        ax.plot(times, model_value.detach().numpy(), c=colors[p])
        ax.plot(times, score.detach().numpy(),
                c=colors[p], linestyle='--', marker='o')

    # Plot average model
    # tensor_timepoints = torch.Tensor(np.linspace(data.time_min, data.time_max, 40).reshape(-1,1))
    # model_average = model.compute_average(tensor_timepoints)
    # ax.plot(tensor_timepoints.detach().numpy(), model_average.detach().numpy(), c='black', linewidth=4, alpha=0.3)

    return 0
def personalize(self, data, settings, return_noise=False):
    r"""
    Estimate, for each `ID` of the dataset, the individual parameters of the model.

    These individual parameters are the random effects :math:`(z_{i,j})`
    of the mixed-effect model.

    Parameters
    ----------
    data: leaspy.io.data.data.Data
        Individual data: time-points :math:`(t_{i,j})` and observations
        :math:`(y_{i,j})`.
    settings: leaspy.io.settings.algorithm_settings.AlgorithmSettings
        Settings of the personalization algorithm.
    return_noise: boolean (default False)
        When True, return the tuple (individual_parameters, noise_std).

    Returns
    -------
    ips: leaspy.io.outputs.individual_parameters.IndividualParameters
        The estimated individual parameters.
    if return_noise is True: tuple(ips, noise_std: torch.FloatTensor)

    Examples
    --------
    Compute individual parameters on a longitudinal dataset.

    >>> from leaspy import AlgorithmSettings, Data, Leaspy, Plotter
    >>> leaspy_logistic = Leaspy('logistic')
    >>> data = Data.from_csv_file('data/my_leaspy_data.csv')
    >>> model_settings = AlgorithmSettings('mcmc_saem', seed=0)
    >>> personalize_settings = AlgorithmSettings('mode_real', seed=0)
    >>> leaspy_logistic.fit(data, model_settings)
    >>> individual_parameters = leaspy_logistic.personalize(data, personalize_settings)
    >>> individual_parameters.to_dataframe()
    """
    # Personalization requires a fitted (or loaded) model.
    self.check_if_initialized()

    perso_algorithm = AlgoFactory.algo("personalize", settings)
    perso_dataset = Dataset(data, algo=perso_algorithm, model=self.model)
    individual_parameters, noise_std = perso_algorithm.run(self.model, perso_dataset)

    if return_noise:
        return individual_parameters, noise_std
    # Default: only the individual parameters.
    return individual_parameters
def fit(self, data, algorithm_settings):
    r"""
    Estimate the model's parameters :math:`\theta` on the given dataset.

    These model parameters are the fixed effects of the mixed-effect model.

    Parameters
    ----------
    data: leaspy.io.data.data.Data
        Individual data: time-points :math:`(t_{i,j})` and observations
        :math:`(y_{i,j})`.
    algorithm_settings: leaspy.io.settings.algorithm_settings.AlgorithmSettings
        Settings of the fit algorithm.

    Examples
    --------
    Fit a logistic model on a longitudinal dataset, then inspect the group
    parameters and plot the average trajectory.

    >>> from leaspy import AlgorithmSettings, Data, Leaspy, Plotter
    >>> leaspy_logistic = Leaspy('logistic')
    >>> data = Data.from_csv_file('data/my_leaspy_data.csv')
    >>> settings = AlgorithmSettings('mcmc_saem', seed=0)
    >>> leaspy_logistic.fit(data, settings)
    >>> print(leaspy_logistic.model.parameters)
    {'g': tensor([-0.4441,  1.9722,  1.6657,  0.1368,  0.8728]),
     'v0': tensor([-3.2442, -3.2942, -3.3763, -2.4901, -3.0032]),
     'betas': tensor([[ 0.0196,  0.0910], [ 0.0559,  0.0291],
                      [-0.0038, -0.1261], [ 0.0988,  0.0767]]),
     'tau_mean': tensor(80.5250), 'tau_std': tensor(8.1284),
     'xi_mean': 0.0, 'xi_std': tensor(0.6834),
     'sources_mean': 0.0, 'sources_std': 1.0,
     'noise_std': tensor(0.0972)}
    >>> leaspy_logistic.plotting.average_trajectory()
    """
    fit_algorithm = AlgoFactory.algo("fit", algorithm_settings)
    fit_dataset = Dataset(data, algo=fit_algorithm, model=self.model)

    # Initialize the model from the data unless it was already initialized/loaded.
    if not self.model.is_initialized:
        self.model.initialize(fit_dataset)

    fit_algorithm.run(self.model, fit_dataset)

    # Keep the plotting helper in sync with the freshly fitted model.
    self.plotting.update_model(self.model)
def test_constructor_univariate(self):
    """Dataset built from a univariate CSV: check sizes, padded values and mask."""
    csv_path = os.path.join(test_data_dir, 'io', "data",
                            'univariate_data_for_dataset.csv')
    data = Data.from_csv_file(csv_path)
    dataset = Dataset(data)

    self.assertEqual(dataset.n_individuals, 3)
    self.assertEqual(dataset.max_observations, 4)
    self.assertEqual(data.dimension, 1)

    # One feature per visit; shorter follow-ups are zero-padded and masked out.
    expected_values = torch.tensor([[[1.], [5.], [2.], [0.]],
                                    [[1.], [5.], [0.], [0.]],
                                    [[1.], [8.], [1.], [3.]]])
    expected_mask = torch.tensor([[[1.], [1.], [1.], [0.]],
                                  [[1.], [1.], [0.], [0.]],
                                  [[1.], [1.], [1.], [1.]]])

    self.assertTrue(torch.equal(dataset.values, expected_values))
    self.assertTrue(torch.equal(dataset.mask, expected_mask))
def plot_patients_mapped_on_mean_trajectory(self, model, results, max_patients=50):
    """
    Plot, feature by feature, observed values ('x' markers) and model
    predictions (lines) against each patient's reparametrized time.

    One matplotlib figure is created per feature.

    Parameters
    ----------
    model : leaspy model
        Exposes `compute_individual_tensorized`, `time_reparametrization`,
        `parameters` and `name`.
    results : leaspy.io.outputs.result.Result
        Holds the data and the computed individual parameters.
    max_patients : int, optional (default 50)
        Maximum number of patients plotted; clamped to the dataset size.
    """
    dataset = Dataset(results.data, model)
    patient_values = model.compute_individual_tensorized(
        dataset.timepoints, results.individual_parameters)

    xi = results.individual_parameters['xi']
    tau = results.individual_parameters['tau']
    # Map individual ages onto the common (population) time-line.
    reparametrized_time = model.time_reparametrization(
        dataset.timepoints, xi, tau) / torch.exp(
        model.parameters['xi_mean']) + model.parameters['tau_mean']

    # Clamp to the number of individuals: the previous hard-coded `range(50)`
    # raised an IndexError on datasets with fewer than 50 patients.
    n_patients = min(max_patients, dataset.n_individuals)

    for i in range(dataset.values.shape[-1]):
        fig, ax = plt.subplots(1, 1)
        for idx in range(n_patients):
            n_obs = dataset.nb_observations_per_individuals[idx]
            times = reparametrized_time[idx, 0:n_obs].detach().numpy()
            # Observations as crosses, model reconstruction as a line.
            ax.plot(times,
                    dataset.values[idx, 0:n_obs, i].detach().numpy(),
                    'x')
            ax.plot(times,
                    patient_values[idx, 0:n_obs, i].detach().numpy(),
                    alpha=0.8)
        if model.name in ['logistic', 'logistic_parallel']:
            # Logistic scores live in [0, 1].
            plt.ylim(0, 1)
def _get_noise_generator(self, model, results): """ Compute the level of L2 error per feature and return a noise generator or size n_features. Parameters ---------- model : leaspy.models.abstract_model.AbstractModel Subclass object of AbstractModel. results : leaspy.io.outputs.result.Result Object containing the computed individual parameters. Returns ------- torch.distributions.Normal or None A gaussian noise generator. If self.noise is None, the function returns None. Raises ------ ValueError If the attribute self.noise is an iterable of float of a length different than the number of features. """ if self.noise: if self.noise == "default": dataset = Dataset(results.data) squared_diff_per_ft = model.compute_sum_squared_per_ft_tensorized( dataset, results.individual_parameters).sum(dim=0) noise = torch.sqrt(squared_diff_per_ft / dataset.n_observations_per_ft.float()) else: if hasattr(self.noise, '__len__'): if len(self.noise) != len(results.data.headers): raise ValueError("The attribute 'noise' you gave is {}. If you want to specify the level of" " noise for each feature score, you must give an iterable object of size " "the number of features, here {}.".format(self.noise, len(results.data.headers))) noise = torch.tensor(self.noise) return torch.distributions.Normal(loc=0., scale=noise) # diagonal noise (per feature)
def test_sample(self):
    """
    Test if samples values are the one expected.

    For each sampled variable ('tau', then 'g'), draw `n_draw` successive
    Gibbs samples and check the mean/std of consecutive-draw differences
    against reference values.
    """
    # TODO change this instanciation
    n_patients = 17
    n_draw = 50
    temperature_inv = 1.0

    path_model_sampler = os.path.join(test_data_dir, "model_parameters",
                                      "multivariate_model_sampler.json")
    path_data = os.path.join(test_data_dir, "io", "data", "data_tiny.csv")

    data = Dataset(Data.from_csv_file(path_data))
    leaspy = Leaspy.load(path_model_sampler)
    realizations = leaspy.model.get_realization_object(n_patients)

    def _increment_stats(var_name):
        # Draw `n_draw` samples of `var_name`; return (mean, std) of the
        # differences between consecutive draws (shared by both variables
        # below — trailing-full slices make [1:]-[:-1] rank-agnostic).
        gsampler = GibbsSampler(
            leaspy.model.random_variable_informations()[var_name], n_patients)
        draws = []
        for _ in range(n_draw):
            gsampler.sample(data, leaspy.model, realizations, temperature_inv)
            draws.append(realizations[var_name].tensor_realizations.clone())
        stacked = torch.stack(draws)
        diffs = stacked[1:] - stacked[:-1]
        return diffs.mean(dim=0), diffs.std(dim=0)

    # Test with taus
    mean_tau, std_tau = _increment_stats('tau')
    self.assertAlmostEqual(mean_tau.mean(), 0.0160, delta=0.05)
    self.assertAlmostEqual(std_tau.mean(), 0.0861, delta=0.05)

    # Test with g
    mean_g, std_g = _increment_stats('g')
    self.assertAlmostEqual(mean_g.mean(), 4.2792e-05, delta=0.05)
    self.assertAlmostEqual(std_g.mean(), 0.0045, delta=0.05)