Пример #1
0
 def setUp(self):
     x1 = numpy.random.normal(size=TEST_SET_SIZE)
     x2 = x1 + numpy.random.normal(size=TEST_SET_SIZE)
     x3 = x1 + numpy.random.normal(size=TEST_SET_SIZE)
     x4 = x2 + x3 + numpy.random.normal(size=TEST_SET_SIZE)
     x5 = x4 + numpy.random.normal(size=TEST_SET_SIZE)
     self.X = pd.DataFrame({
         "x1": x1,
         "x2": x2,
         "x3": x3,
         "x4": x4,
         "x5": x5
     })
     self.variable_types = {
         "x1": "c",
         "x2": "c",
         "x3": "c",
         "x4": "c",
         "x5": "c"
     }
     self.true_neighbors = {
         "x1": set(["x2", "x3"]),
         "x2": set(["x1", "x4"]),
         "x3": set(["x1", "x4"]),
         "x4": set(["x2", "x3", "x5"]),
         "x5": set(["x4"]),
     }
     self.true_colliders = set([("x3", "x4"), ("x2", "x4")])
     self.true_marked = set([("x4", "x5")])
     self.ic = IC(RobustRegressionTest)
     self.ic.search(self.X, self.variable_types)
Пример #2
0
 def setUp(self):
     x1 = numpy.random.normal(size=TEST_SET_SIZE)
     x2 = x1 + numpy.random.normal(size=TEST_SET_SIZE)
     x3 = x1 + numpy.random.normal(size=TEST_SET_SIZE)
     x4 = x2 + x3 + numpy.random.normal(size=TEST_SET_SIZE)
     x5 = x4 + numpy.random.normal(size=TEST_SET_SIZE)
     self.X = pd.DataFrame({
         'x1': x1,
         'x2': x2,
         'x3': x3,
         'x4': x4,
         'x5': x5
     })
     self.variable_types = {
         'x1': 'c',
         'x2': 'c',
         'x3': 'c',
         'x4': 'c',
         'x5': 'c'
     }
     self.true_neighbors = {
         'x1': set(['x2', 'x3']),
         'x2': set(['x1', 'x4']),
         'x3': set(['x1', 'x4']),
         'x4': set(['x2', 'x3', 'x5']),
         'x5': set(['x4'])
     }
     self.true_colliders = set([('x3', 'x4'), ('x2', 'x4')])
     self.true_marked = set([('x4', 'x5')])
     self.ic = IC(RobustRegressionTest)
     self.ic.search(self.X, self.variable_types)
Пример #3
0
def runSearch(data,algo_var_types):
    # run the search
    ic_algorithm = IC(RobustRegressionTest)
    graph = ic_algorithm.search(data, algo_var_types)

    results = graph.edges(data=True)
    #pprint(results)
    return (results)
Пример #4
0
SIZE = 2000
x1 = np.random.normal(size=SIZE)
x2 = x1 + np.random.normal(size=SIZE)
x3 = x1 + np.random.normal(size=SIZE)
x4 = x2 + x3 + np.random.normal(size=SIZE)
x5 = x4 + np.random.normal(size=SIZE)

# load the data into a dataframe:
X = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3, 'x4': x4, 'x5': x5})

# define the variable types: 'c' is 'continuous'.  The variables defined here
# are the ones the search is performed over  -- NOT all the variables defined
# in the data frame.
variable_types = {'x1': 'c', 'x2': 'c', 'x3': 'c', 'x4': 'c', 'x5': 'c'}

ic_algorithm = IC(RobustRegressionTest)
graph = ic_algorithm.search(X, variable_types)

e = graph.edges(data=True)
print(f"{e}")

SIZE = 2000
x1 = np.random.normal(size=SIZE)
x2 = x1 + np.random.normal(size=SIZE)
x3 = x1 + np.random.normal(size=SIZE)
x6 = np.random.normal(size=SIZE)
x4 = x2 + x3 + x6 + np.random.normal(size=SIZE)
x5 = x6 + np.random.normal(size=SIZE)

# load the data into a dataframe:
X = pd.DataFrame({'x1': x1, 'x2': x2, 'x3': x3, 'x4': x4, 'x5': x5})
Пример #5
0
    def fit(self, X):
        '''
      Copulafit using Gaussian copula with marginals evaluated by Gaussian KDE
      Precision matrix is evaluated using specified method, default to graphical LASSO
      :param X: input dataset
      :return: estimated precision matrix rho
      '''

        N, d = X.shape
        if self.scaler is not None:
            X_scale = self.scaler.fit_transform(X)
        else:
            X_scale = X
        if len(self.vertexes) == 0:
            self.vertexes = [str(id) for id in range(d)]

        self.theta = 1.0 / N
        cum_marginals = np.zeros_like(X)
        inv_norm_cdf = np.zeros_like(X)
        # inv_norm_cdf_scaled = np.zeros_like(X)
        self.kernels = list([])
        # TODO: complexity O(Nd) is high
        if self.verbose:
            colored('>> Computing marginals', color='blue')
        for j in range(cum_marginals.shape[1]):
            self.kernels.append(gaussian_kde(X_scale[:, j]))
            cum_pdf_overall = self.kernels[-1].integrate_box_1d(
                X_scale[:, j].min(), X_scale[:, j].max())
            for i in range(cum_marginals.shape[0]):
                cum_marginals[i, j] = self.kernels[-1].integrate_box_1d(
                    X_scale[:, j].min(), X_scale[i, j]) / cum_pdf_overall
                # truncate cumulative marginals
                if cum_marginals[i, j] < self.theta:
                    cum_marginals[i, j] = self.theta
                elif cum_marginals[i, j] > 1 - self.theta:
                    cum_marginals[i, j] = 1 - self.theta
                # inverse of normal CDF: \Phi(F_j(x))^{-1}
                inv_norm_cdf[i, j] = norm.ppf(cum_marginals[i, j])
                # scaled to preserve mean and variance: u_j + \sigma_j*\Phi(F_j(x))^{-1}
                # inv_norm_cdf_scaled[i, j] = X_scale[:, j].mean() + X_scale[:, j].std() * inv_norm_cdf[i, j]

        if self.method == 'mle':
            # maximum-likelihood estiamtor
            empirical_cov = EmpiricalCovariance()
            empirical_cov.fit(inv_norm_cdf)
            if self.verbose:
                print colored('>> Running MLE to estiamte precision matrix',
                              color='blue')

            self.est_cov = empirical_cov.covariance_
            self.corr = scale_matrix(self.est_cov)
            self.precision_ = inv(empirical_cov.covariance_)

        if self.method == 'glasso':
            if self.verbose:
                print colored('>> Running glasso to estiamte precision matrix',
                              color='blue')

            empirical_cov = EmpiricalCovariance()
            empirical_cov.fit(inv_norm_cdf)
            # shrunk convariance to avoid numerical instability
            shrunk_cov = shrunk_covariance(empirical_cov.covariance_,
                                           shrinkage=0.8)
            self.est_cov, self.precision_ = graph_lasso(emp_cov=shrunk_cov,
                                                        alpha=self.penalty,
                                                        verbose=self.verbose,
                                                        max_iter=self.max_iter)
            self.corr = scale_matrix(self.est_cov)

        if self.method == 'ledoit_wolf':
            if self.verbose:
                print colored(
                    '>> Running ledoit_wolf to estiamte precision matrix',
                    color='blue')

            self.est_cov, _ = ledoit_wolf(inv_norm_cdf)
            self.corr = scale_matrix(self.est_cov)
            self.precision_ = linalg.inv(self.est_cov)

        if self.method == 'spectral':
            '''L2 mehtod, use paper Inverse covariance estimation for high dimension data in linear time and space
         :formular: in paper eq(8)
         '''
            if self.verbose:
                print colored(
                    '>> Running Riccati to estiamte precision matrix',
                    color='blue')

            # TODO: note estimated cov is sample cov
            self.est_cov, self.precision_ = spectral(inv_norm_cdf,
                                                     rho=2 * self.penalty,
                                                     assume_centered=False)
            self.corr = scale_matrix(self.est_cov)

        if self.method == 'pc':
            clf = pgmlearner.PGMLearner()
            data_list = list([])
            for row_id in range(X_scale.shape[0]):
                instance = dict()
                for i, n in enumerate(self.vertexes):
                    instance[n] = X_scale[row_id, i]
                data_list.append(instance)
            graph = clf.lg_constraint_estimatestruct(data=data_list,
                                                     pvalparam=self.pval,
                                                     bins=self.bins)
            dag = np.zeros(shape=(len(graph.V), len(graph.V)))
            for e in graph.E:
                dag[self.vertexes.index(e[0]), self.vertexes.index(e[1])] = 1
            self.conditional_independences_ = dag

        if self.method == 'ic':
            df = dict()
            variable_types = dict()
            for j in range(X_scale.shape[1]):
                df[self.vertexes[j]] = X_scale[:, j]
                variable_types[self.vertexes[j]] = 'c'
            data = pd.DataFrame(df)
            # run the search
            ic_algorithm = IC(RobustRegressionTest,
                              data,
                              variable_types,
                              alpha=self.pval)
            graph = ic_algorithm.search()
            dag = np.zeros(shape=(X_scale.shape[1], X_scale.shape[1]))
            for e in graph.edges(data=True):
                i = self.vertexes.index(e[0])
                j = self.vertexes.index(e[1])
                dag[i, j] = 1
                dag[j, i] = 1
                arrows = set(e[2]['arrows'])
                head_len = len(arrows)
                if head_len > 0:
                    head = arrows.pop()
                    if head_len == 1 and head == e[0]:
                        dag[i, j] = 0
                    if head_len == 1 and head == e[1]:
                        dag[j, i] = 0
            self.conditional_independences_ = dag

        # finally we fit the structure
        self.fit_structure(self.precision_)
Пример #6
0
import numpy
import pandas as pd

from causality.inference.search import IC
from causality.inference.independence_tests import RobustRegressionTest

# generate some toy data:
SIZE = 2000
x1 = numpy.random.normal(size=SIZE)
x2 = x1 + numpy.random.normal(size=SIZE)
x3 = x1 + numpy.random.normal(size=SIZE)
x4 = x2 + x3 + numpy.random.normal(size=SIZE)
x5 = x4 + numpy.random.normal(size=SIZE)

# load the data into a dataframe:
X = pd.DataFrame({'x1' : x1, 'x2' : x2, 'x3' : x3, 'x4' : x4, 'x5' : x5})

# define the variable types: 'c' is 'continuous'.  The variables defined here
# are the ones the search is performed over  -- NOT all the variables defined
# in the data frame.
variable_types = {'x1' : 'c', 'x2' : 'c', 'x3' : 'c', 'x4' : 'c', 'x5' : 'c'}

# run the search
ic_algorithm = IC(RobustRegressionTest, X, variable_types)
graph = ic_algorithm.search()