def select_univariate(X, candidates):
    """Pick the candidate distribution that best matches the data.

    Each candidate is instantiated through ``get_instance``, fitted to ``X``
    and scored with the statistic returned by ``kstest`` against the fitted
    ``cdf``. The candidate with the smallest statistic wins. Candidates that
    raise ``ValueError`` along the way are skipped.

    Args:
        X (pandas.DataFrame):
            Data for which the best univariate must be found.
        candidates (list[Univariate]):
            List of Univariate subclasses (or instances of those) to
            choose from.

    Returns:
        Univariate:
            Instance of the selected candidate.
    """
    winner = None
    lowest_ks = np.inf

    for candidate in candidates:
        try:
            fitted = get_instance(candidate)
            fitted.fit(X)
            statistic, _pvalue = kstest(X, fitted.cdf)
            if statistic < lowest_ks:
                lowest_ks = statistic
                winner = candidate
        except ValueError:
            # Distribution not supported
            continue

    return get_instance(winner)
def select_univariate(X, candidates):
    """Return a new instance of the candidate with the lowest KS statistic.

    Each candidate is instantiated through ``get_instance``, fitted to ``X``
    and scored with the statistic returned by ``kstest`` against the fitted
    ``cdf``.

    Args:
        X: Data every candidate is fitted to and tested against.
        candidates: Iterable of specifications accepted by ``get_instance``.

    Returns:
        The result of ``get_instance`` on the best scoring candidate.
    """
    winner = None
    lowest_ks = np.inf

    for candidate in candidates:
        fitted = get_instance(candidate)
        fitted.fit(X)
        statistic, _pvalue = kstest(X, fitted.cdf)
        if statistic < lowest_ks:
            lowest_ks, winner = statistic, candidate

    return get_instance(winner)
def _validate_distribution(cls, distribution):
    """Resolve a distribution specification into a usable value.

    Non-string specifications are returned untouched. A string that is a key
    of ``cls._DISTRIBUTIONS`` is replaced by the mapped value. Any other
    string is returned as-is, provided ``copulas.get_instance`` accepts it.

    Args:
        distribution: Distribution specification to check.

    Returns:
        The input itself, or the ``cls._DISTRIBUTIONS`` entry it names.

    Raises:
        ValueError: If ``copulas.get_instance`` raises ``ValueError`` or
            ``ImportError`` for the given string.
    """
    if isinstance(distribution, str):
        known = cls._DISTRIBUTIONS
        if distribution in known:
            return known[distribution]

        try:
            # Only called to check that the string can be resolved.
            copulas.get_instance(distribution)
        except (ValueError, ImportError):
            message = 'Invalid distribution specification {}'.format(distribution)
            raise ValueError(message) from None

    return distribution
def fit(self, X):
    """Fit one univariate distribution per column, then the covariance matrix.

    Args:
        X (numpy.ndarray or pandas.DataFrame):
            Data to model. Anything that is not a DataFrame is wrapped in one.

    Returns:
        None
    """
    LOGGER.debug('Fitting Gaussian Copula')

    data = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)

    for name, values in data.items():
        spec = self.distribution
        if isinstance(spec, dict):
            # Per-column mapping: columns without an entry use the default.
            spec = spec.get(name, DEFAULT_DISTRIBUTION)

        fitted_column = get_instance(spec)
        fitted_column.fit(values)
        self.distribs[name] = fitted_column

    self.covariance = self._get_covariance(data)
    self.fitted = True
def test_pdf(config_path):
    """Compare the ``cdf`` output of an instance against R and Matlab references.

    NOTE(review): despite the ``pdf`` in its name, this test evaluates
    ``instance.cdf`` and reads the ``"cdf"`` column of the expected outputs.

    Args:
        config_path: Path of the JSON file describing the test case.
    """
    with open(config_path, 'r') as handle:
        config = json.load(handle)

    # Setup
    spec = config['test']
    instance = get_instance(spec['class'], **spec['kwargs'])

    inputs = config['test_case_inputs']
    expected = config['expected_output']
    points = pd.read_csv(os.path.join(BASE, 'input', inputs['points']))
    expected_r = pd.read_csv(os.path.join(BASE, 'output', expected['R']))
    expected_matlab = pd.read_csv(os.path.join(BASE, 'output', expected['Matlab']))

    # Run
    instance.theta = inputs['theta']

    # Asserts
    computed = instance.cdf(points.values)
    tolerance = config['settings']['rtol']
    for reference in (expected_r, expected_matlab):
        assert np.all(np.isclose(reference["cdf"], computed, rtol=tolerance))
def test_get_instance_instance(self):
    """get_instance on an instance yields an unfitted GaussianMultivariate."""
    # Run
    source = GaussianMultivariate()
    result = get_instance(source)

    # Asserts
    assert not result.fitted
    assert isinstance(result, GaussianMultivariate)
def from_dict(cls, param_dict):
    """Create new instance from dictionary.

    The object built from ``param_dict['type']`` is populated here when its
    qualified name matches that of ``Univariate``. Otherwise the work is
    delegated to that object's own ``from_dict``.

    Args:
        param_dict (dict): Parameters; reads ``type`` and, depending on the
            branch, ``fitted``, ``constant_value`` and ``instance_type``.

    Returns:
        The rebuilt distribution instance.
    """
    distribution = get_instance(param_dict['type'])

    is_generic = get_qualified_name(distribution) == get_qualified_name(Univariate)
    if not is_generic:
        # Any other type rebuilds itself through its own from_dict.
        return distribution.from_dict(param_dict)

    distribution.fitted = param_dict['fitted']
    if not distribution.fitted:
        return distribution

    constant = param_dict.get("constant_value")
    if constant is not None:
        distribution.constant_value = constant
        distribution._replace_constant_methods()
    else:
        wrapped = get_instance(param_dict['instance_type'])
        distribution._instance = wrapped.from_dict(param_dict)

    return distribution
def test_get_instance_str(self):
    """get_instance on a dotted class path yields an unfitted GaussianMultivariate."""
    # Run
    class_path = 'copulas.multivariate.gaussian.GaussianMultivariate'
    result = get_instance(class_path)

    # Asserts
    assert not result.fitted
    assert isinstance(result, GaussianMultivariate)
def test_get_instance_instance_fitted(self):
    """get_instance on a fitted instance yields a new, unfitted one."""
    # Run
    fitted_copula = GaussianMultivariate()
    training = pd.DataFrame({'a_field': list(range(10))})
    fitted_copula.fit(training)
    result = get_instance(fitted_copula)

    # Asserts
    assert not result.fitted
    assert isinstance(result, GaussianMultivariate)
def select_univariate(X, candidates):
    """Return a new instance of the candidate with the lowest ``ks_statistic``.

    Args:
        X: Data handed to ``ks_statistic`` for every candidate.
        candidates: Iterable of specifications accepted by ``ks_statistic``
            and ``get_instance``.

    Returns:
        The result of ``get_instance`` on the best scoring candidate.
    """
    winner = None
    lowest_ks = np.inf

    for candidate in candidates:
        statistic = ks_statistic(candidate, X)
        if statistic < lowest_ks:
            lowest_ks, winner = statistic, candidate

    return get_instance(winner)
def test_get_instance_instance_distribution(self):
    """get_instance on a configured instance keeps its distribution setting."""
    # Run
    source = GaussianMultivariate(
        distribution='copulas.univariate.truncnorm.TruncNorm'
    )
    result = get_instance(source)

    # Asserts
    assert not result.fitted
    assert isinstance(result, GaussianMultivariate)
    assert result.distribution == 'copulas.univariate.truncnorm.TruncNorm'
def evaluate_model_dataset(model_name, dataset_name, max_rows, max_columns):
    """Fit a model on one dataset, sample from it and score the samples.

    Any exception raised while fitting, sampling, checking or scoring is
    logged and reported through ``error_message`` instead of being
    propagated. ``load_data`` runs outside that guard.

    Args:
        model_name: Key of ``AVAILABLE_MODELS``. When it is not a key, the
            value itself is passed on to ``get_instance``.
        dataset_name: Name handed to ``load_data``.
        max_rows: Forwarded to ``load_data``.
        max_columns: Forwarded to ``load_data``.

    Returns:
        dict: ``model_name``, ``dataset_name``, ``elapsed_time``,
        ``error_message``, ``score``, ``num_columns`` and ``num_rows``.
    """
    data = load_data(dataset_name, max_rows, max_columns)
    started_at = datetime.utcnow()

    LOGGER.info('Testing dataset %s (shape: %s)', dataset_name, data.shape)
    LOGGER.debug('dtypes for dataset %s:\n%s', dataset_name, data.dtypes)

    score = None
    error_message = None
    try:
        model = AVAILABLE_MODELS.get(model_name, model_name)
        instance = get_instance(model)

        LOGGER.info('Fitting dataset %s (shape: %s)', dataset_name, data.shape)
        instance.fit(data)

        LOGGER.info('Sampling %s rows for dataset %s', len(data), dataset_name)
        sampled = instance.sample(len(data))
        assert sampled.shape == data.shape

        try:
            LOGGER.info('Computing PDF for dataset %s', dataset_name)
            densities = instance.pdf(sampled)
            assert (0 <= densities).all()

            LOGGER.info('Computing CDF for dataset %s', dataset_name)
            cumulative = instance.cdf(sampled)
            assert (0 <= cumulative).all() and (cumulative <= 1).all()
        except NotImplementedError:
            # Models without pdf/cdf are still scored on their samples.
            pass

        LOGGER.info('Evaluating scores for dataset %s', dataset_name)
        # NOTE(review): ks_2samp returns a (statistic, pvalue) pair, so the
        # mean below blends both values - confirm this is intended.
        scores = [
            ks_2samp(sampled[column].values, data[column].values)
            for column in data.columns
        ]
        score = np.mean(scores)
        LOGGER.info("Dataset %s score: %s", dataset_name, score)

    except Exception as ex:
        error_message = '{}: {}'.format(ex.__class__.__name__, ex)
        LOGGER.exception("Dataset %s failed: %s", dataset_name, error_message)

    elapsed_time = datetime.utcnow() - started_at

    report = {
        'model_name': model_name,
        'dataset_name': dataset_name,
        'elapsed_time': elapsed_time,
        'error_message': error_message,
        'score': score,
        'num_columns': len(data.columns),
        'num_rows': len(data),
    }
    return report
def test_get_instance_with_kwargs(self):
    """get_instance forwards keyword arguments to the class it instantiates."""
    # Run
    options = {'distribution': 'copulas.univariate.truncnorm.TruncNorm'}
    result = get_instance(GaussianMultivariate, **options)

    # Asserts
    assert not result.fitted
    assert isinstance(result, GaussianMultivariate)
    assert result.distribution == 'copulas.univariate.truncnorm.TruncNorm'
def from_dict(cls, copula_dict):
    """Create a new instance from dictionary.

    The object built from ``copula_dict['type']`` does the actual rebuilding
    through its own ``from_dict``.

    Args:
        copula_dict: `dict` with the parameters to replicate the copula.
            Like the output of `Multivariate.to_dict`

    Returns:
        Multivariate: Instance of the copula defined on the parameters.
    """
    type_name = copula_dict['type']
    builder = get_instance(type_name)
    return builder.from_dict(copula_dict)
def from_dict(cls, params):
    """Create a new instance from a parameters dictionary.

    The object built from ``params['type']`` does the actual rebuilding
    through its own ``from_dict``.

    Args:
        params (dict):
            Parameters of the distribution, in the same format as the one
            returned by the ``to_dict`` method.

    Returns:
        Multivariate:
            Instance of the distribution defined on the parameters.
    """
    type_name = params['type']
    builder = get_instance(type_name)
    return builder.from_dict(params)
def from_dict(cls, params):
    """Build a distribution from its params dict.

    Args:
        params (dict):
            Dictionary containing the FQN of the distribution and the
            necessary parameters to rebuild it. The input format is exactly
            the same that is outputted by the distribution class
            ``to_dict`` method.

    Returns:
        Univariate:
            Distribution instance, flagged as fitted.
    """
    # Work on a copy so that popping 'type' leaves the caller's dict intact.
    remaining = params.copy()
    type_name = remaining.pop('type')

    rebuilt = get_instance(type_name)
    rebuilt._set_params(remaining)
    rebuilt.fitted = True

    return rebuilt
def test_pdf(config_path):
    """Compare the ``pdf`` output of an instance against the R and M references.

    Args:
        config_path: Path of the JSON file describing the test case. It is
            also used as the assertion message so a failure names the case.
    """
    with open(config_path, 'r') as config_file:
        config = json.load(config_file)

    # Setup
    test_obj = config['test']
    instance = get_instance(test_obj['class'], **test_obj['kwargs'])
    data = pd.read_csv(os.path.join(BASE, 'input', config['input']['points']))
    output_R = pd.read_csv(os.path.join(BASE, 'output', config['output']['R']))
    output_M = pd.read_csv(os.path.join(BASE, 'output', config['output']['M']))

    # Run
    instance.theta = config['input']['theta']

    # Asserts
    pdfs = instance.pdf(data.values)
    rtol = config['settings']['rtol']

    # Report the config path on failure, not the already-closed file object.
    assert np.all(np.isclose(output_R["pdf"], pdfs, rtol=rtol)), config_path
    assert np.all(np.isclose(output_M["pdf"], pdfs, rtol=rtol)), config_path
def test_fit(config_path):
    """Fit an instance on the configured input and compare its parameters.

    Every value listed under the config ``output`` section is checked
    against the matching key of ``instance.to_dict()``.

    Args:
        config_path: Path of the JSON file describing the test case.
    """
    with open(config_path, 'r') as handle:
        config = json.load(handle)

    # Setup
    spec = config['test']
    instance = get_instance(spec['class'], **spec['kwargs'])
    data = pd.read_csv(os.path.join(BASE, 'input', config['input']))

    # Run
    instance.fit(data.values)

    # Asserts
    fitted_params = instance.to_dict()
    tolerance = config['settings']['rtol']
    for source, expected_params in config['output'].items():
        failure = "Mismatch against {} on {}".format(source, config_path)
        for name, expected_value in expected_params.items():
            observed = fitted_params[name]
            assert np.isclose(expected_value, observed, rtol=tolerance), failure
def fit(self, X):
    """Fit one univariate per column and then compute the covariance matrix.

    Arguments:
        X (pandas.DataFrame):
            Values of the random variables. Anything that is not a
            DataFrame is wrapped in one.
    """
    LOGGER.info('Fitting %s', self)

    data = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)

    names = []
    fitted_univariates = []
    for name, values in data.items():
        spec = self.distribution
        if isinstance(spec, dict):
            # Per-column mapping: columns without an entry use the default.
            spec = spec.get(name, DEFAULT_DISTRIBUTION)

        LOGGER.debug('Fitting column %s to %s', name, spec)

        univariate = get_instance(spec)
        univariate.fit(values)

        names.append(name)
        fitted_univariates.append(univariate)

    self.columns = names
    self.univariates = fitted_univariates

    LOGGER.debug('Computing covariance')
    self.covariance = self._get_covariance(data)
    self.fitted = True

    LOGGER.debug('GaussianMultivariate fitted successfully')
def ks_statistic(model, X):
    """Score how closely a fitted model's CDF follows an evenly spaced grid.

    The model is instantiated through ``get_instance`` and fitted to ``X``.
    Its sorted ``cdf`` values on ``X`` are then compared with ``len(X)``
    evenly spaced points between 0 and 1.

    Args:
        model: Specification accepted by ``get_instance``.
        X: Data the model is fitted to and evaluated on.

    Returns:
        The largest absolute difference between the two sequences.
    """
    fitted = get_instance(model)
    fitted.fit(X)

    model_cdf = np.sort(fitted.cdf(X))
    empirical_cdf = np.linspace(0.0, 1.0, num=len(X))
    gaps = np.abs(model_cdf - empirical_cdf)

    return max(gaps)
def from_dict(cls, param_dict):
    """Create new instance from dictionary.

    The object built from ``param_dict['type']`` does the actual rebuilding
    through its own ``from_dict``.
    """
    type_name = param_dict['type']
    builder = get_instance(type_name)
    return builder.from_dict(param_dict)