示例#1
0
def select_univariate(X, candidates):
    """Pick the candidate whose fitted CDF best matches the data.

    Each candidate is turned into an instance with ``get_instance``, fitted
    to ``X`` and scored with the statistic returned by ``kstest``. The
    candidate with the lowest statistic wins; on ties the earliest one is
    kept. Candidates that raise ``ValueError`` at any of those steps are
    skipped.

    Args:
        X (pandas.DataFrame):
            Data for which the best univariate must be found.
        candidates (list[Univariate]):
            List of Univariate subclasses (or instances of those) to choose from.

    Returns:
        Univariate:
            Result of calling ``get_instance`` on the winning candidate. If no
            candidate could be scored, ``get_instance`` is called with ``None``.
    """
    lowest_ks = np.inf
    winner = None
    for candidate in candidates:
        try:
            fitted = get_instance(candidate)
            fitted.fit(X)
            statistic, _pvalue = kstest(X, fitted.cdf)
            if statistic < lowest_ks:
                lowest_ks, winner = statistic, candidate
        except ValueError:
            # Distribution not supported
            continue

    return get_instance(winner)
示例#2
0
def select_univariate(X, candidates):
    """Return an instance of the candidate with the lowest KS statistic.

    Every candidate is instantiated through ``get_instance``, fitted to ``X``
    and scored with the first value returned by ``kstest``. Exceptions raised
    while fitting or scoring are propagated to the caller.

    Args:
        X: Data the candidates are fitted to and scored against.
        candidates: Iterable of models accepted by ``get_instance``.

    Returns:
        The result of ``get_instance`` applied to the best candidate (or to
        ``None`` when no candidate improved on the initial infinite score).
    """
    lowest_ks = np.inf
    winner = None
    for candidate in candidates:
        fitted = get_instance(candidate)
        fitted.fit(X)
        statistic, _pvalue = kstest(X, fitted.cdf)
        if statistic < lowest_ks:
            lowest_ks, winner = statistic, candidate

    return get_instance(winner)
示例#3
0
    def _validate_distribution(cls, distribution):
        if not isinstance(distribution, str):
            return distribution
        if distribution in cls._DISTRIBUTIONS:
            return cls._DISTRIBUTIONS[distribution]

        try:
            copulas.get_instance(distribution)
            return distribution
        except (ValueError, ImportError):
            error_message = 'Invalid distribution specification {}'.format(distribution)
            raise ValueError(error_message) from None
示例#4
0
    def fit(self, X):
        """Fit one univariate distribution per column, then the covariance.

        For every column the distribution is taken from ``self.distribution``:
        when that attribute is a dict it is looked up by column name, falling
        back to ``DEFAULT_DISTRIBUTION``; otherwise it is used as is. The
        fitted instances are stored in ``self.distribs`` keyed by column name.

        Args:
            X(numpy.ndarray or pandas.DataFrame): Data to model. Anything that
                is not a ``pandas.DataFrame`` is wrapped in one.

        Returns:
            None
        """
        LOGGER.debug('Fitting Gaussian Copula')

        data = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)

        for name, values in data.items():
            spec = (
                self.distribution.get(name, DEFAULT_DISTRIBUTION)
                if isinstance(self.distribution, dict)
                else self.distribution
            )

            fitted = get_instance(spec)
            fitted.fit(values)
            self.distribs[name] = fitted

        self.covariance = self._get_covariance(data)
        self.fitted = True
示例#5
0
def test_pdf(config_path):
    """Compare the CDF of a configured instance with the R and Matlab outputs.

    Despite its name, this test evaluates ``instance.cdf`` and checks the
    ``cdf`` column of both reference files.

    Args:
        config_path (str):
            Path to a JSON file providing ``test`` (``class`` and ``kwargs``
            handed to ``get_instance``), ``test_case_inputs`` (``points`` CSV
            name and ``theta``), ``expected_output`` (``R`` and ``Matlab`` CSV
            names) and ``settings`` (``rtol``).
    """
    with open(config_path, 'r') as config_file:
        config = json.load(config_file)

    # Setup
    spec = config['test']
    instance = get_instance(spec['class'], **spec['kwargs'])

    inputs = config['test_case_inputs']
    outputs = config['expected_output']
    points_path = os.path.join(BASE, 'input', inputs['points'])
    input_points = pd.read_csv(points_path)
    r_path = os.path.join(BASE, 'output', outputs['R'])
    output_r = pd.read_csv(r_path)
    matlab_path = os.path.join(BASE, 'output', outputs['Matlab'])
    output_matlab = pd.read_csv(matlab_path)

    # Run
    instance.theta = inputs['theta']

    # Asserts
    cdfs = instance.cdf(input_points.values)
    rtol = config['settings']['rtol']

    for reference in (output_r, output_matlab):
        assert np.all(np.isclose(reference["cdf"], cdfs, rtol=rtol))
示例#6
0
    def test_get_instance_instance(self):
        """get_instance given an instance returns an unfitted GaussianMultivariate."""
        # Run
        source = GaussianMultivariate()
        result = get_instance(source)

        # Asserts
        assert not result.fitted
        assert isinstance(result, GaussianMultivariate)
示例#7
0
 def from_dict(cls, param_dict):
     """Rebuild a distribution from its dictionary representation.

     The object named by ``param_dict['type']`` is created with
     ``get_instance``. When its qualified name differs from that of
     ``Univariate`` the work is delegated to its own ``from_dict``.
     Otherwise the ``fitted`` flag is restored and, if set, either the
     constant value or the wrapped ``_instance`` is restored as well.
     """
     distribution = get_instance(param_dict['type'])
     is_base = get_qualified_name(distribution) == get_qualified_name(Univariate)
     if not is_base:
         return distribution.from_dict(param_dict)

     distribution.fitted = param_dict['fitted']
     if not distribution.fitted:
         return distribution

     if param_dict.get("constant_value", None) is None:
         # No constant stored: rebuild the wrapped distribution instead.
         delegate = get_instance(param_dict['instance_type'])
         distribution._instance = delegate.from_dict(param_dict)
     else:
         distribution.constant_value = param_dict["constant_value"]
         distribution._replace_constant_methods()

     return distribution
示例#8
0
    def test_get_instance_str(self):
        """get_instance given a qualified name returns an unfitted GaussianMultivariate."""
        # Run
        qualified_name = 'copulas.multivariate.gaussian.GaussianMultivariate'
        result = get_instance(qualified_name)

        # Asserts
        assert not result.fitted
        assert isinstance(result, GaussianMultivariate)
示例#9
0
    def test_get_instance_instance_fitted(self):
        """get_instance given a fitted instance returns a new, unfitted one."""
        # Run
        fitted_model = GaussianMultivariate()
        training_data = pd.DataFrame({'a_field': list(range(10))})
        fitted_model.fit(training_data)
        result = get_instance(fitted_model)

        # Asserts
        assert not result.fitted
        assert isinstance(result, GaussianMultivariate)
示例#10
0
def select_univariate(X, candidates):
    """Return an instance of the candidate with the lowest ``ks_statistic``.

    Args:
        X: Data passed to ``ks_statistic`` for every candidate.
        candidates: Iterable of models accepted by ``ks_statistic`` and
            ``get_instance``.

    Returns:
        The result of ``get_instance`` applied to the first candidate that
        reached the lowest score (or to ``None`` when no candidate scored
        below infinity).
    """
    lowest_ks = np.inf
    winner = None
    for candidate in candidates:
        statistic = ks_statistic(candidate, X)
        if statistic < lowest_ks:
            lowest_ks, winner = statistic, candidate

    return get_instance(winner)
示例#11
0
    def test_get_instance_instance_distribution(self):
        """get_instance keeps the ``distribution`` of the instance it is given."""
        # Run
        distribution = 'copulas.univariate.truncnorm.TruncNorm'
        source = GaussianMultivariate(distribution=distribution)
        result = get_instance(source)

        # Asserts
        assert not result.fitted
        assert isinstance(result, GaussianMultivariate)
        assert result.distribution == distribution
示例#12
0
def evaluate_model_dataset(model_name, dataset_name, max_rows, max_columns):
    """Fit, sample and score one model on one dataset, timing the run.

    Exceptions raised after the data has been loaded (while instantiating,
    fitting, sampling or scoring) are logged and reported through
    ``error_message`` instead of being propagated. ``NotImplementedError``
    raised by the PDF/CDF checks only skips those checks.

    Args:
        model_name: Key of ``AVAILABLE_MODELS``; when it is not a key, the
            value itself is handed to ``get_instance``.
        dataset_name: Name forwarded to ``load_data`` and used in log messages.
        max_rows: Forwarded to ``load_data``.
        max_columns: Forwarded to ``load_data``.

    Returns:
        dict: ``model_name``, ``dataset_name``, ``elapsed_time``,
        ``error_message`` (``None`` on success), ``score`` (``None`` on
        failure), ``num_columns`` and ``num_rows``.
    """
    data = load_data(dataset_name, max_rows, max_columns)
    start = datetime.utcnow()

    LOGGER.info('Testing dataset %s (shape: %s)', dataset_name, data.shape)
    LOGGER.debug('dtypes for dataset %s:\n%s', dataset_name, data.dtypes)

    error_message = None
    score = None
    try:
        instance = get_instance(AVAILABLE_MODELS.get(model_name, model_name))
        LOGGER.info('Fitting dataset %s (shape: %s)', dataset_name, data.shape)
        instance.fit(data)

        LOGGER.info('Sampling %s rows for dataset %s', len(data), dataset_name)
        sampled = instance.sample(len(data))
        assert sampled.shape == data.shape

        # PDF and CDF are optional: models that do not implement them are
        # still scored below.
        try:
            LOGGER.info('Computing PDF for dataset %s', dataset_name)
            pdf = instance.pdf(sampled)
            assert (pdf >= 0).all()

            LOGGER.info('Computing CDF for dataset %s', dataset_name)
            cdf = instance.cdf(sampled)
            assert (cdf >= 0).all() and (cdf <= 1).all()
        except NotImplementedError:
            pass

        LOGGER.info('Evaluating scores for dataset %s', dataset_name)
        # NOTE(review): if this is scipy.stats.ks_2samp, each entry holds both
        # the statistic and the p-value, so the mean below mixes the two --
        # confirm this is intended.
        scores = [
            ks_2samp(sampled[column].values, data[column].values)
            for column in data.columns
        ]

        score = np.mean(scores)
        LOGGER.info("Dataset %s score: %s", dataset_name, score)

    except Exception as ex:
        error_message = '{}: {}'.format(ex.__class__.__name__, ex)
        LOGGER.exception("Dataset %s failed: %s", dataset_name, error_message)

    elapsed_time = datetime.utcnow() - start

    result = {
        'model_name': model_name,
        'dataset_name': dataset_name,
        'elapsed_time': elapsed_time,
        'error_message': error_message,
        'score': score,
        'num_columns': len(data.columns),
        'num_rows': len(data)
    }
    return result
示例#13
0
    def test_get_instance_with_kwargs(self):
        """get_instance forwards keyword arguments to the class it instantiates."""
        # Run
        distribution = 'copulas.univariate.truncnorm.TruncNorm'
        result = get_instance(GaussianMultivariate, distribution=distribution)

        # Asserts
        assert not result.fitted
        assert isinstance(result, GaussianMultivariate)
        assert result.distribution == distribution
示例#14
0
    def from_dict(cls, copula_dict):
        """Rebuild a copula from its dictionary representation.

        The object named by ``copula_dict['type']`` is created with
        ``get_instance`` and its own ``from_dict`` does the actual work.

        Args:
            copula_dict: `dict` with the parameters to replicate the copula.
            Like the output of `Multivariate.to_dict`

        Returns:
            Multivariate: Instance of the copula defined on the parameters.
        """
        return get_instance(copula_dict['type']).from_dict(copula_dict)
示例#15
0
文件: base.py 项目: zeta1999/Copulas
    def from_dict(cls, params):
        """Rebuild a multivariate distribution from a parameters dictionary.

        The object named by ``params['type']`` is created with
        ``get_instance`` and the call is delegated to its ``from_dict``.

        Args:
            params (dict):
                Parameters of the distribution, in the same format as the one
                returned by the ``to_dict`` method.

        Returns:
            Multivariate:
                Instance of the distribution defined on the parameters.
        """
        return get_instance(params['type']).from_dict(params)
示例#16
0
文件: base.py 项目: zeta1999/Copulas
    def from_dict(cls, params):
        """Rebuild a fitted distribution from its params dict.

        The ``type`` entry selects what ``get_instance`` creates; every other
        entry is handed to ``_set_params``. The input dictionary is copied
        first, so the caller's object is left unmodified.

        Args:
            params (dict):
                Dictionary containing the FQN of the distribution and the
                necessary parameters to rebuild it.
                The input format is exactly the same that is outputted by
                the distribution class ``to_dict`` method.

        Returns:
            Univariate:
                Distribution instance, marked as fitted.
        """
        remaining = params.copy()
        type_name = remaining.pop('type')

        instance = get_instance(type_name)
        instance._set_params(remaining)
        instance.fitted = True

        return instance
示例#17
0
def test_pdf(config_path):
    """Check ``instance.pdf`` against the reference outputs named in the config.

    Args:
        config_path (str):
            Path to a JSON file providing ``test`` (``class`` and ``kwargs``
            handed to ``get_instance``), ``input`` (``points`` CSV name and
            ``theta``), ``output`` (``R`` and ``M`` CSV names) and
            ``settings`` (``rtol``).
    """
    with open(config_path, 'r') as config_file:
        config = json.load(config_file)

    # Setup
    test_obj = config['test']
    instance = get_instance(test_obj['class'], **test_obj['kwargs'])
    data = pd.read_csv(os.path.join(BASE, 'input', config['input']['points']))
    output_R = pd.read_csv(os.path.join(BASE, 'output', config['output']['R']))
    output_M = pd.read_csv(os.path.join(BASE, 'output', config['output']['M']))

    # Run
    instance.theta = config['input']['theta']

    # Asserts
    pdfs = instance.pdf(data.values)

    rtol = config['settings']['rtol']

    # Report the config path on failure: ``config_file`` is already closed at
    # this point and its repr is not a meaningful assertion message.
    for other, expected in (('R', output_R), ('M', output_M)):
        msg = "Mismatch against {} on {}".format(other, config_path)
        assert np.all(np.isclose(expected["pdf"], pdfs, rtol=rtol)), msg
示例#18
0
def test_fit(config_path):
    """Fit the configured instance and compare its parameters to references.

    Args:
        config_path (str):
            Path to a JSON file providing ``test`` (``class`` and ``kwargs``
            handed to ``get_instance``), ``input`` (CSV name), ``output``
            (mapping of reference name to expected parameters) and
            ``settings`` (``rtol``).
    """
    with open(config_path, 'r') as config_file:
        config = json.load(config_file)

    # Setup
    spec = config['test']
    instance = get_instance(spec['class'], **spec['kwargs'])
    input_path = os.path.join(BASE, 'input', config['input'])
    data = pd.read_csv(input_path)

    # Run
    instance.fit(data.values)

    # Asserts
    fitted_params = instance.to_dict()
    rtol = config['settings']['rtol']

    for source, expected_params in config['output'].items():
        msg = "Mismatch against {} on {}".format(source, config_path)
        for name, expected_value in expected_params.items():
            observed = fitted_params[name]
            assert np.isclose(expected_value, observed, rtol=rtol), msg
示例#19
0
    def fit(self, X):
        """Fit one univariate per column and then the covariance matrix.

        For every column the distribution is taken from ``self.distribution``:
        when that attribute is a dict it is looked up by column name, falling
        back to ``DEFAULT_DISTRIBUTION``; otherwise it is used as is. Column
        names and fitted univariates are stored, in column order, in
        ``self.columns`` and ``self.univariates``.

        Arguments:
            X (pandas.DataFrame):
                Values of the random variables. Anything that is not a
                ``pandas.DataFrame`` is wrapped in one.
        """
        LOGGER.info('Fitting %s', self)

        data = X if isinstance(X, pd.DataFrame) else pd.DataFrame(X)

        names = []
        fitted_univariates = []
        for name, values in data.items():
            spec = (
                self.distribution.get(name, DEFAULT_DISTRIBUTION)
                if isinstance(self.distribution, dict)
                else self.distribution
            )

            LOGGER.debug('Fitting column %s to %s', name, spec)

            fitted = get_instance(spec)
            fitted.fit(values)

            names.append(name)
            fitted_univariates.append(fitted)

        self.columns = names
        self.univariates = fitted_univariates

        LOGGER.debug('Computing covariance')
        self.covariance = self._get_covariance(data)
        self.fitted = True

        LOGGER.debug('GaussianMultivariate fitted successfully')
示例#20
0
def ks_statistic(model, X):
    """Score how far a fitted model's CDF is from an evenly spaced grid.

    The model is instantiated with ``get_instance`` and fitted to ``X``. Its
    CDF values on ``X`` are sorted and compared element-wise with
    ``len(X)`` evenly spaced points between 0 and 1.

    Args:
        model: Model accepted by ``get_instance``.
        X: Data used both to fit the model and to evaluate its CDF.

    Returns:
        The largest absolute difference between the two sequences.
    """
    fitted = get_instance(model)
    fitted.fit(X)

    model_cdf = np.sort(fitted.cdf(X))
    uniform_grid = np.linspace(0.0, 1.0, num=len(X))
    deviations = np.abs(model_cdf - uniform_grid)
    return max(deviations)
示例#21
0
文件: base.py 项目: remem9527/Copulas
 def from_dict(cls, param_dict):
     """Rebuild a distribution by delegating to the type named in ``param_dict``.

     The object named by ``param_dict['type']`` is created with
     ``get_instance`` and its own ``from_dict`` receives the full dictionary.
     """
     return get_instance(param_dict['type']).from_dict(param_dict)