Example #1
def test_covariance():
    """Tests Covariance module on a simple dataset.

    """
    # test covariance fit from data
    cov = EmpiricalCovariance()
    cov.fit(X)
    assert_array_almost_equal(empirical_covariance(X), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), norm='spectral'), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), norm='frobenius'), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), scaling=False), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), squared=False), 0)
    # Mahalanobis distances computation test
    mahal_dist = cov.mahalanobis(X)
    assert(np.amax(mahal_dist) < 250)
    assert(np.amin(mahal_dist) > 50)

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = EmpiricalCovariance()
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X_1d), norm='spectral'), 0)

    # test integer type
    X_integer = np.asarray([[0, 1], [1, 0]])
    result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(X_integer), result)
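
The test above exercises the basic EmpiricalCovariance API (fit, covariance_, error_norm, mahalanobis). A minimal standalone sketch of those calls on synthetic data, assuming only numpy and scikit-learn:

import numpy as np
from sklearn.covariance import EmpiricalCovariance, empirical_covariance

# toy data: 100 samples, 3 loosely correlated features (illustrative only)
rng = np.random.RandomState(0)
X = rng.randn(100, 3) @ np.array([[1.0, 0.5, 0.0],
                                  [0.0, 1.0, 0.3],
                                  [0.0, 0.0, 1.0]])

cov = EmpiricalCovariance().fit(X)
print(cov.covariance_)                          # fitted covariance matrix
print(cov.error_norm(empirical_covariance(X)))  # ~0, both are the same estimator
print(cov.mahalanobis(X)[:5])                   # squared Mahalanobis distances
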
Example #2
def test_suffstat_sk_full():
    # compare the EmpiricalCovariance.covariance fitted on X*sqrt(resp)
    # with _sufficient_sk_full, n_components=1
    rng = np.random.RandomState(0)
    n_samples, n_features = 500, 2

    # special case 1, assuming data is "centered"
    X = rng.rand(n_samples, n_features)
    resp = rng.rand(n_samples, 1)
    X_resp = np.sqrt(resp) * X
    nk = np.array([n_samples])
    xk = np.zeros((1, n_features))
    covars_pred = _estimate_gaussian_covariance_full(resp, X, nk, xk, 0)
    ecov = EmpiricalCovariance(assume_centered=True)
    ecov.fit(X_resp)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='frobenius'), 0)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='spectral'), 0)

    # special case 2, assuming resp are all ones
    resp = np.ones((n_samples, 1))
    nk = np.array([n_samples])
    xk = X.mean(axis=0).reshape((1, -1))
    covars_pred = _estimate_gaussian_covariance_full(resp, X, nk, xk, 0)
    ecov = EmpiricalCovariance(assume_centered=False)
    ecov.fit(X)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='frobenius'), 0)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='spectral'), 0)
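
The assertions above rest on a simple identity: with nk = n_samples and xk = 0, the weighted scatter matrix computed by the GMM sufficient-statistics helper equals the covariance that EmpiricalCovariance(assume_centered=True) reports for sqrt(resp) * X. A self-contained check of that identity, without the private helper:

import numpy as np
from sklearn.covariance import EmpiricalCovariance

rng = np.random.RandomState(0)
n_samples, n_features = 500, 2
X = rng.rand(n_samples, n_features)
resp = rng.rand(n_samples, 1)

# weighted scatter matrix: sum_i resp_i * x_i x_i^T / n_samples
weighted = (resp * X).T @ X / n_samples

# same quantity via EmpiricalCovariance on sqrt(resp) * X, treating the data as centered
ecov = EmpiricalCovariance(assume_centered=True).fit(np.sqrt(resp) * X)
print(np.allclose(weighted, ecov.covariance_))  # True
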
Example #3
def fit(self, X, n_jobs=-1):
    EmpiricalCovariance.fit(self, X)
    if not self.no_fit:
        CovarianceOutlierDetectionMixin.set_threshold(self,
                                                      X,
                                                      n_jobs=n_jobs)
    return self
Example #4
def calc_full_covs(net, trainloader, n_classes, layers):
    net.eval()
    layers_centers = []
    layers_precisions = []
    for l in range(layers):
        outputs_list = []
        target_list = []
        with torch.no_grad():
            for (inputs, targets) in trainloader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = net.intermediate_forward(inputs, layer_index=l)
                outputs_list.append(outputs)
                target_list.append(targets)
            outputs = torch.cat(outputs_list, axis=0)
            target_list = torch.cat(target_list)
            x_dim = outputs.size(1)
            centers = torch.zeros(n_classes, x_dim).cuda()
            normlized_outputs = []
            for c in range(n_classes):
                class_points = outputs[c == target_list]
                centers[c] = torch.mean(class_points, axis=0)
                normlized_outputs.append(
                    class_points -
                    centers[c].unsqueeze(0).expand(class_points.size(0), -1))
            normlized_outputs = torch.cat(normlized_outputs, axis=0).cpu()
            covs_lasso = EmpiricalCovariance(assume_centered=False)
            covs_lasso.fit(normlized_outputs.cpu().numpy())
            precision = torch.from_numpy(covs_lasso.precision_).float().cuda()
            layers_centers.append(centers)
            layers_precisions.append(precision)
    return layers_precisions, layers_centers
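
The precisions and centers returned by calc_full_covs are typically used afterwards to score samples by their class-conditional Mahalanobis distance. A small numpy-only sketch of that scoring step (the names here are illustrative, not from the original code):

import numpy as np

def mahalanobis_scores(x, centers, precision):
    # squared Mahalanobis distance of a feature vector x to each class center,
    # using a shared precision (inverse covariance) matrix
    diffs = centers - x                                # (n_classes, x_dim)
    return np.einsum('ij,jk,ik->i', diffs, precision, diffs)

centers = np.array([[0.0, 0.0], [3.0, 3.0]])           # illustrative class centers
precision = np.eye(2)                                   # illustrative precision matrix
print(mahalanobis_scores(np.array([2.5, 2.8]), centers, precision))
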
Example #5
def test_covariance():
    """Tests Covariance module on a simple dataset.

    """
    # test covariance fit from data
    cov = EmpiricalCovariance()
    cov.fit(X)
    assert_array_almost_equal(empirical_covariance(X), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), norm='spectral'), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), norm='frobenius'), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), scaling=False), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X), squared=False), 0)
    # Mahalanobis distances computation test
    mahal_dist = cov.mahalanobis(X)
    assert(np.amax(mahal_dist) < 250)
    assert(np.amin(mahal_dist) > 50)

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = EmpiricalCovariance()
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X_1d), norm='spectral'), 0)

    # test integer type
    X_integer = np.asarray([[0, 1], [1, 0]])
    result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(X_integer), result)
Example #6
class CovEmbedding(BaseEstimator, TransformerMixin):
    """ Tranformer that returns the coefficients on a flat space to
    perform the analysis.
    """

    def __init__(self, base_estimator=None, kind='tangent'):
        self.base_estimator = base_estimator
        self.kind = kind
#        if self.base_estimator == None:
#            self.base_estimator_ = ...
#        else:
#            self.base_estimator_ = clone(base_estimator)

    def fit(self, X, y=None):
        if self.base_estimator is None:
            self.base_estimator_ = EmpiricalCovariance(
                assume_centered=True)
        else:
            self.base_estimator_ = clone(self.base_estimator)

        if self.kind == 'tangent':
            # self.mean_cov = mean_cov = spd_manifold.log_mean(covs)
            # Euclidean mean as an approximation to the geodesic
            covs = [self.base_estimator_.fit(x).covariance_ for x in X]
            covs = my_stack(covs)
            mean_cov = np.mean(covs, axis=0)
            self.whitening_ = inv_sqrtm(mean_cov)
        return self

    def transform(self, X):
        """Apply transform to covariances

        Parameters
        ----------
        covs: list of array
            list of covariance matrices, shape (n_rois, n_rois)

        Returns
        -------
        list of array, transformed covariance matrices,
        shape (n_rois * (n_rois+1)/2,)
        """
        covs = [self.base_estimator_.fit(x).covariance_ for x in X]
        covs = my_stack(covs)
        p = covs.shape[-1]
        if self.kind == 'tangent':
            id_ = np.identity(p)
            covs = [self.whitening_.dot(c.dot(self.whitening_)) - id_
                    for c in covs]
        elif self.kind == 'partial correlation':
            covs = [cov_to_corr(inv(g)) for g in covs]
        elif self.kind == 'correlation':
            covs = [cov_to_corr(g) for g in covs]
        return np.array([sym_to_vec(c) for c in covs])
Example #7
def printSciKitCovarianceMatrixs():
      #does not work, ValueError: setting an array element with a sequence.
      xMaker = RSTCovarianceMatrixMaker()
      nums, data, ilabels = getLabeledRSTData(False)
      for i,d in enumerate(data):
          d['ratio'] = ilabels[i]
      xMaker.setInstanceNums(nums)
      xMaker.fit(data)
      X = xMaker.transform(data)
      correlator = EmpiricalCovariance()
      correlator.fit(X)

      print(correlator.covariance_)
Example #8
class CovEmbedding(BaseEstimator, TransformerMixin):
    """ Tranformer that returns the coefficients on a flat space to
    perform the analysis.
    """

    def __init__(self, cov_estimator=None, kind='tangent'):
        self.cov_estimator = cov_estimator
        self.kind = kind

    def fit(self, X, y=None):
        if self.cov_estimator is None:
            self.cov_estimator_ = EmpiricalCovariance(
                assume_centered=True)
        else:
            self.cov_estimator_ = clone(self.cov_estimator)

        if self.kind == 'tangent':
            covs = [self.cov_estimator_.fit(x).covariance_ for x in X]
            self.mean_cov_ = spd_mfd.frechet_mean(covs, max_iter=30, tol=1e-7)
            self.whitening_ = spd_mfd.inv_sqrtm(self.mean_cov_)
        return self

    def transform(self, X):
        """Apply transform to covariances

        Parameters
        ----------
        covs: list of array
            list of covariance matrices, shape (n_rois, n_rois)

        Returns
        -------
        list of array, transformed covariance matrices,
        shape (n_rois * (n_rois+1)/2,)
        """
        covs = [self.cov_estimator_.fit(x).covariance_ for x in X]
        covs = spd_mfd.my_stack(covs)
        if self.kind == 'tangent':
            covs = [spd_mfd.logm(self.whitening_.dot(c).dot(self.whitening_))
                    for c in covs]
        elif self.kind == 'precision':
            covs = [spd_mfd.inv(g) for g in covs]
        elif self.kind == 'partial correlation':
            covs = [prec_to_partial(spd_mfd.inv(g)) for g in covs]
        elif self.kind == 'correlation':
            covs = [cov_to_corr(g) for g in covs]
        else:
            raise ValueError("Unknown connectivity measure.")

        return np.array([sym_to_vec(c) for c in covs])
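
A standalone sketch of the tangent-space step this transformer performs, using scipy's matrix functions in place of the spd_mfd helpers; this is a simplified illustration under that substitution, not the project's own implementation:

import numpy as np
from scipy.linalg import logm, fractional_matrix_power

def tangent_embed(covs, mean_cov):
    # map SPD matrices to the tangent space at mean_cov: logm(W @ C @ W), W = mean_cov^(-1/2)
    whitening = fractional_matrix_power(mean_cov, -0.5)
    return [logm(whitening @ c @ whitening) for c in covs]

rng = np.random.RandomState(0)
A = rng.randn(4, 4)
covs = [A @ A.T + np.eye(4), 2.0 * np.eye(4)]   # two SPD matrices
mean_cov = np.mean(covs, axis=0)                # Euclidean mean as a simple reference point
print(tangent_embed(covs, mean_cov)[0].shape)   # (4, 4)
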
Example #9
def init_w_kmeans(data):
    """Calculate initialization values using K-Means"""
    # initialize means
    km = KMeans(n_clusters=CLUSTERS)
    labs = km.fit_predict(data)
    means = km.cluster_centers_
    # initialize covariaces
    covs = np.empty((CLUSTERS, DIMENSIONS, DIMENSIONS))
    for l in np.unique(labs.ravel()):
        ce = EmpiricalCovariance()
        ce.fit(data[labs == l, :])
        covs[l,:,:] = ce.covariance_
    return means, covs
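
A hedged usage sketch for init_w_kmeans; CLUSTERS and DIMENSIONS are module-level constants in the original, so they are set explicitly here and the data is synthetic:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.covariance import EmpiricalCovariance

CLUSTERS, DIMENSIONS = 3, 2                          # assumed module-level constants

rng = np.random.RandomState(0)
data = np.vstack([rng.randn(100, DIMENSIONS) + offset
                  for offset in (0.0, 5.0, 10.0)])   # three well-separated blobs

means, covs = init_w_kmeans(data)
print(means.shape, covs.shape)                       # (3, 2) (3, 2, 2)
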
Example #10
def main():
    print ("Running CV on Mahalanobis Distance based approach.")
    mahanalobis()

    start_time = time.time()
    totalX = []
    totalY = []
    flag = True
    countTrain = 228000
    print ("\n\nNow testing on separate data.")
    with open("creditcard.csv", "rb") as f:
        data = csv.reader(f)
        for row in data:
            if flag:
                flag = False
                continue
            countTrain += 1
            if countTrain > 228000:          #CV on 80% of data
                totalX.append([float(i) for i in row[:-1]])
                totalY.append(int(row[-1]))
    print ("Data Loaded")
    totalX = scalar.fit_transform(totalX)
    clf = EmpiricalCovariance()
    clf.fit(totalX)
    distances = clf.mahalanobis(totalX)

    Y = []
    for i in range(len(totalY)):

        if np.log10(distances[i]) > 1.838:
            Y.append(1)
        else:
            Y.append(0)
    print("%s seconds" % (time.time() - start_time))
    print ("Results")
    auc = roc_auc_score(totalY, Y)
    print("Area under curve : " + str(auc))
    fpr, tpr, _ = roc_curve(totalY, Y)
    print ("False Positive Rate : " + str(fpr[1]))
    _, recall, _ = precision_recall_curve(totalY, Y)
    print ("Recall : " + str(recall[1]))

    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, color='darkorange', label='ROC curve (area = %0.3f)' % auc)
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.legend(loc="lower right")
    plt.show()
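
Both mahanalobis() and main() boil down to the same core: standardize the features, fit EmpiricalCovariance, and flag rows whose log10 Mahalanobis distance exceeds a tuned threshold. A minimal sketch of that core with synthetic data in place of creditcard.csv (the 1.838 threshold is the one used above and would need retuning on other data):

import numpy as np
from sklearn.covariance import EmpiricalCovariance
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
X = np.vstack([rng.randn(1000, 5),               # "normal" rows
               rng.randn(10, 5) * 4 + 8])        # a few gross outliers

X_std = StandardScaler().fit_transform(X)
clf = EmpiricalCovariance().fit(X_std)
distances = clf.mahalanobis(X_std)               # squared Mahalanobis distances

threshold = 1.838                                # log10-distance threshold from the example
y_pred = (np.log10(distances) > threshold).astype(int)
print(y_pred.sum(), "rows flagged as anomalous")
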
Example #11
class Mahalanobis(BaseEstimator):
    """Mahalanobis distance estimator. Uses Covariance estimate
    to compute mahalanobis distance of the observations
    from the model.

    Parameters
    ----------
    robust : boolean to determine whether to use robust estimator
        based on Minimum Covariance Determinant computation
    """
    def __init__(self, robust=False):
        if not robust:
            from sklearn.covariance import EmpiricalCovariance as CovarianceEstimator  #
        else:
            from sklearn.covariance import MinCovDet as CovarianceEstimator  #
        self.model = CovarianceEstimator()
        self.cov = None

    def fit(self, X, y=None, **params):
        """Fits the covariance model according to the given training
        data and parameters.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training data, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        self : object
            Returns self.
        """
        self.cov = self.model.fit(X)
        return self

    def score(self, X, y=None):
        """Computes the mahalanobis distances of given observations.

        The provided observations are assumed to be centered. One may want to
        center them using a location estimate first.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
          The observations, the Mahalanobis distances of which we compute.

        Returns
        -------
        mahalanobis_distance : array, shape = [n_observations,]
            Mahalanobis distances of the observations.
        """

        #return self.model.score(X,assume_centered=True)
        return -self.model.mahalanobis(X - self.model.location_)**0.33
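
A hedged usage sketch for this Mahalanobis estimator, assuming the class definition above is in scope:

import numpy as np

rng = np.random.RandomState(0)
X_train = rng.randn(200, 3)
X_test = rng.randn(5, 3)

m = Mahalanobis(robust=False).fit(X_train)
print(m.score(X_test))   # negative, dampened distances; values closer to 0 mean closer to the model
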
Example #12
def phd(X, Y, **kwargs):
    """
    Parameters
    ----------
    X : array-like, shape = [n_features, n_samples]
        Training data, where n_samples is the number of samples and
        n_features is the number of features.
    Y : array-like, shape = [n_samples]
        Response variable, where n_samples is the number of samples


    Argument dictionary should contain:
    kwargs = {
        'd' : intrinsic dimension (int)
        'residuals' : If True, creates PHDs from the residuals of linear regression
                    (defaults to False)
        'return_mat' : Boolean whether key PHD matrix should be returned (defaults
                    to False).
    }

    Returns
    -----------
    proj_vecs : array-like, shape = [n_features, d]
        Orthonormal system spanning the sufficient dimension subspace, where
        d refers to the intrinsic dimension.
    """
    # Extract arguments from dictionary
    d = kwargs['d']
    residuals = kwargs.get('residuals', False)
    return_mat = kwargs.get('return_mat', False)
    D, N = X.shape
    # Calculate covariance matrix and empirical covariance matrix
    emc = EmpiricalCovariance()
    emc = emc.fit(X.T)  # Covariance of all samples
    cov_all = emc.covariance_
    weighted_cov = np.zeros(cov_all.shape)
    if residuals:
        linreg = LinearRegression()
        linreg = linreg.fit(X.T, Y)
        res = Y - linreg.predict(X.T)
        Y = res
    Ymean = np.mean(Y)
    mean_all = np.mean(X, axis=1)
    for i in range(N):
        weighted_cov += (Y[i] - Ymean) * np.outer(X[:, i] - mean_all,
                                                  X[:, i] - mean_all)
    weighted_cov = weighted_cov / float(N)
    vals, vecs = eig(weighted_cov, cov_all)
    order = np.argsort(np.abs(vals))[::-1]
    proj_vecs = vecs[:, order[:d]]
    if return_mat:
        return proj_vecs, weighted_cov
    else:
        return proj_vecs
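
A usage sketch for phd() on a synthetic single-index model; it assumes the imports the function relies on (numpy, scipy.linalg.eig, and sklearn's EmpiricalCovariance and LinearRegression) are available in the module:

import numpy as np

rng = np.random.RandomState(0)
D, N = 5, 500
X = rng.randn(D, N)                        # note the (n_features, n_samples) layout phd expects
beta = np.zeros(D)
beta[0] = 1.0
Y = (beta @ X) ** 2 + 0.1 * rng.randn(N)   # curvature only along the first coordinate

vecs = phd(X, Y, d=1, residuals=False)
print(vecs.shape)                          # (5, 1); should roughly align with beta up to sign
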
Example #13
File: models.py, Project: pborky/pynfsa
class Mahalanobis (BaseEstimator):
    """Mahalanobis distance estimator. Uses Covariance estimate
    to compute mahalanobis distance of the observations
    from the model.

    Parameters
    ----------
    robust : boolean to determine whether to use robust estimator
        based on Minimum Covariance Determinant computation
    """
    def __init__(self, robust=False):
        if not robust:
            from sklearn.covariance import EmpiricalCovariance as CovarianceEstimator #
        else:
            from sklearn.covariance import MinCovDet as CovarianceEstimator #
        self.model = CovarianceEstimator()
        self.cov = None
    def fit(self, X, y=None, **params):
        """Fits the covariance model according to the given training
        data and parameters.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
            Training data, where n_samples is the number of samples and
            n_features is the number of features.

        Returns
        -------
        self : object
            Returns self.
        """
        self.cov = self.model.fit(X)
        return self
    def score(self, X, y=None):
        """Computes the mahalanobis distances of given observations.

        The provided observations are assumed to be centered. One may want to
        center them using a location estimate first.

        Parameters
        ----------
        X : array-like, shape = [n_samples, n_features]
          The observations, the Mahalanobis distances of which we compute.

        Returns
        -------
        mahalanobis_distance : array, shape = [n_observations,]
            Mahalanobis distances of the observations.
        """

        #return self.model.score(X,assume_centered=True)
        return - self.model.mahalanobis(X-self.model.location_) ** 0.33
Example #14
def test_suffstat_sk_full():
    # compare the precision matrix compute from the
    # EmpiricalCovariance.covariance fitted on X*sqrt(resp)
    # with _sufficient_sk_full, n_components=1
    rng = np.random.RandomState(0)
    n_samples, n_features = 500, 2

    # special case 1, assuming data is "centered"
    X = rng.rand(n_samples, n_features)
    resp = rng.rand(n_samples, 1)
    X_resp = np.sqrt(resp) * X
    nk = np.array([n_samples])
    xk = np.zeros((1, n_features))
    covars_pred = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0)
    ecov = EmpiricalCovariance(assume_centered=True)
    ecov.fit(X_resp)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='frobenius'), 0)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='spectral'), 0)

    # check the precision computation
    precs_chol_pred = _compute_precision_cholesky(covars_pred, 'full')
    precs_pred = np.array([np.dot(prec, prec.T) for prec in precs_chol_pred])
    precs_est = np.array([linalg.inv(cov) for cov in covars_pred])
    assert_array_almost_equal(precs_est, precs_pred)

    # special case 2, assuming resp are all ones
    resp = np.ones((n_samples, 1))
    nk = np.array([n_samples])
    xk = X.mean(axis=0).reshape((1, -1))
    covars_pred = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0)
    ecov = EmpiricalCovariance(assume_centered=False)
    ecov.fit(X)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='frobenius'), 0)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='spectral'), 0)

    # check the precision computation
    precs_chol_pred = _compute_precision_cholesky(covars_pred, 'full')
    precs_pred = np.array([np.dot(prec, prec.T) for prec in precs_chol_pred])
    precs_est = np.array([linalg.inv(cov) for cov in covars_pred])
    assert_array_almost_equal(precs_est, precs_pred)
Example #15
def test_suffstat_sk_full():
    # compare the precision matrix compute from the
    # EmpiricalCovariance.covariance fitted on X*sqrt(resp)
    # with _sufficient_sk_full, n_components=1
    rng = np.random.RandomState(0)
    n_samples, n_features = 500, 2

    # special case 1, assuming data is "centered"
    X = rng.rand(n_samples, n_features)
    resp = rng.rand(n_samples, 1)
    X_resp = np.sqrt(resp) * X
    nk = np.array([n_samples])
    xk = np.zeros((1, n_features))
    covars_pred = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0)
    ecov = EmpiricalCovariance(assume_centered=True)
    ecov.fit(X_resp)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='frobenius'), 0)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='spectral'), 0)

    # check the precision computation
    precs_chol_pred = _compute_precision_cholesky(covars_pred, 'full')
    precs_pred = np.array([np.dot(prec, prec.T) for prec in precs_chol_pred])
    precs_est = np.array([linalg.inv(cov) for cov in covars_pred])
    assert_array_almost_equal(precs_est, precs_pred)

    # special case 2, assuming resp are all ones
    resp = np.ones((n_samples, 1))
    nk = np.array([n_samples])
    xk = X.mean(axis=0).reshape((1, -1))
    covars_pred = _estimate_gaussian_covariances_full(resp, X, nk, xk, 0)
    ecov = EmpiricalCovariance(assume_centered=False)
    ecov.fit(X)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='frobenius'), 0)
    assert_almost_equal(ecov.error_norm(covars_pred[0], norm='spectral'), 0)

    # check the precision computation
    precs_chol_pred = _compute_precision_cholesky(covars_pred, 'full')
    precs_pred = np.array([np.dot(prec, prec.T) for prec in precs_chol_pred])
    precs_est = np.array([linalg.inv(cov) for cov in covars_pred])
    assert_array_almost_equal(precs_est, precs_pred)
Example #16
def peakSSV(X, Y, **kwargs):
    """
    Parameters
    ----------
    X : array-like, shape = [n_features, n_samples]
        Training data, where n_samples is the number of samples and
        n_features is the number of features.
    Y : array-like, shape = [n_samples]
        Response variable, where n_samples is the number of samples


    Argument dictionary should contain:
    kwargs = {
        'd' : intrinsic dimension (int)
        'n_samples' : Number of samples around the maximum Y to take.
        'rescale' : Boolean whether standardization should be performed (True
                    for yes).
    }

    Returns
    -----------
    proj_vecs : array-like, shape = [n_features, d]
        Orthonormal system spanning the sufficient dimension subspace, where
        d refers to the intrinsic dimension.
    """
    # Extract arguments from dictionary
    d = kwargs['d']
    n_samples = kwargs['n_samples']
    rescale = kwargs['rescale']
    return_mat = kwargs.get('return_mat', False)
    D, N = X.shape
    # Standardize X
    emc = EmpiricalCovariance()
    emc = emc.fit(X.T)  # Covariance of all samples
    mean_all = np.mean(X, axis=0)
    cov_all = emc.covariance_
    scaler = StandardScaler()
    if rescale:
        Z = scaler.fit_transform(X.T).T
    pca = PCA()
    order = np.argsort(Y)
    XO = X[:, order]
    pca = pca.fit(X[:, -n_samples:].T)
    U = pca.components_[-d:, :].T
    if rescale:
        # Apply inverse transformation
        vecs = sqrtm(scipy.linalg.inv(cov_all)).dot(U[:, :d])
        proj_vecs, dummy = np.linalg.qr(vecs)
    else:
        proj_vecs = U[:, :d]
    return proj_vecs
Example #17
def shape_(data):
    ec = EC()
    centre = np.mean(data, axis=0)
    covar = ec.fit(data).covariance_
    v, w = linalg.eigh(covar)
    v = 2. * np.sqrt(2.) * np.sqrt(v)
    u = w[0] / linalg.norm(w[0])
    angle = np.arctan(u[1] / u[0])
    angle = 180. * angle / np.pi
    circ = geometry.Point(centre).buffer(1)
    ellipse = affinity.scale(circ, float(v[0]), float(v[1]))

    return affinity.rotate(ellipse, angle)
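
A hedged usage sketch for shape_(), assuming the aliases in the function body are imported at module level (EC for sklearn's EmpiricalCovariance, linalg from scipy, and shapely's geometry and affinity modules):

import numpy as np

rng = np.random.RandomState(0)
data = rng.randn(500, 2) @ np.array([[2.0, 0.0],
                                     [1.0, 0.5]])   # elongated 2-D point cloud

ellipse = shape_(data)           # shapely polygon approximating the covariance ellipse
print(ellipse.bounds)            # (minx, miny, maxx, maxy)
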
Example #18
def test_covariance():
    """Tests Covariance module on a simple dataset.

    """
    # test covariance fit from data
    cov = EmpiricalCovariance()
    cov.fit(X)
    emp_cov = empirical_covariance(X)
    assert_array_almost_equal(emp_cov, cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(emp_cov), 0)
    assert_almost_equal(
        cov.error_norm(emp_cov, norm='spectral'), 0)
    assert_almost_equal(
        cov.error_norm(emp_cov, norm='frobenius'), 0)
    assert_almost_equal(
        cov.error_norm(emp_cov, scaling=False), 0)
    assert_almost_equal(
        cov.error_norm(emp_cov, squared=False), 0)
    assert_raises(NotImplementedError,
                  cov.error_norm, emp_cov, norm='foo')
    # Mahalanobis distances computation test
    mahal_dist = cov.mahalanobis(X)
    print(np.amin(mahal_dist), np.amax(mahal_dist))
    assert(np.amin(mahal_dist) > 0)

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = EmpiricalCovariance()
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X_1d), norm='spectral'), 0)

    # test with one sample
    X_1sample = np.arange(5)
    cov = EmpiricalCovariance()
    with warnings.catch_warnings(record=True):
        cov.fit(X_1sample)

    # test integer type
    X_integer = np.asarray([[0, 1], [1, 0]])
    result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(X_integer), result)

    # test centered case
    cov = EmpiricalCovariance(assume_centered=True)
    cov.fit(X)
    assert_equal(cov.location_, np.zeros(X.shape[1]))
Example #19
def test_covariance():
    # Tests Covariance module on a simple dataset.
    # test covariance fit from data
    cov = EmpiricalCovariance()
    cov.fit(X)
    emp_cov = empirical_covariance(X)
    assert_array_almost_equal(emp_cov, cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(emp_cov), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm='spectral'), 0)
    assert_almost_equal(cov.error_norm(emp_cov, norm='frobenius'), 0)
    assert_almost_equal(cov.error_norm(emp_cov, scaling=False), 0)
    assert_almost_equal(cov.error_norm(emp_cov, squared=False), 0)
    with pytest.raises(NotImplementedError):
        cov.error_norm(emp_cov, norm='foo')
    # Mahalanobis distances computation test
    mahal_dist = cov.mahalanobis(X)
    assert np.amin(mahal_dist) > 0

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = EmpiricalCovariance()
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X_1d), norm='spectral'), 0)

    # test with one sample
    # Create X with 1 sample and 5 features
    X_1sample = np.arange(5).reshape(1, 5)
    cov = EmpiricalCovariance()
    warn_msg = (
        "Only one sample available. You may want to reshape your data array")
    with pytest.warns(UserWarning, match=warn_msg):
        cov.fit(X_1sample)

    assert_array_almost_equal(cov.covariance_,
                              np.zeros(shape=(5, 5), dtype=np.float64))

    # test integer type
    X_integer = np.asarray([[0, 1], [1, 0]])
    result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(X_integer), result)

    # test centered case
    cov = EmpiricalCovariance(assume_centered=True)
    cov.fit(X)
    assert_array_equal(cov.location_, np.zeros(X.shape[1]))
Example #20
class MahalanobisDistance(DimReducer):
    """
    Computes a person's Mahalanobis distance 
    using the mean and covariance estimated from a set of young people.
    Uses sklearn; verified this matches up with the normal matrix computation.
    """
    def __init__(self, age_lower, age_upper):
        self.age_lower = age_lower
        self.age_upper = age_upper
        self.need_ages = True
        self.k = 1

    def _fit_from_processed_data(self, X, ages):
        young_people = (ages >= self.age_lower) & (ages <= self.age_upper)
        print("%i people between %s and %s used for mean/cov calculation" %
              (young_people.sum(), self.age_lower, self.age_upper))
        assert young_people.sum() > 1000
        self.model = EmpiricalCovariance(assume_centered=False)
        self.model.fit(X[young_people, :])

    def _get_projections_from_processed_data(self, X):
        md = np.sqrt(self.model.mahalanobis(X)).reshape([-1, 1])
        return md
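
MahalanobisDistance relies on a DimReducer base class not shown here; stripped of that scaffolding, the computation is to fit EmpiricalCovariance on a reference age band and take the square-root Mahalanobis distance for everyone. A standalone sketch of that idea with simulated ages and features:

import numpy as np
from sklearn.covariance import EmpiricalCovariance

rng = np.random.RandomState(0)
ages = rng.randint(20, 80, size=1000)
X = rng.randn(1000, 4) + 0.02 * ages[:, None]        # features drift mildly with age

young = (ages >= 20) & (ages <= 30)                  # reference group
model = EmpiricalCovariance(assume_centered=False).fit(X[young, :])
md = np.sqrt(model.mahalanobis(X)).reshape(-1, 1)    # one distance per person
print(md.shape, float(md.min()), float(md.max()))
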
Example #21
def mahanalobis():
    totalX = []
    totalY = []
    flag = True
    countTrain = 0
    with open("creditcard.csv", "rb") as f:
        data = csv.reader(f)
        for row in data:
            if flag:
                flag = False
                continue
            if countTrain >= 228000:                            #test on 20% of data
                break
            countTrain += 1
            totalX.append([float(i) for i in row[:-1]])
            totalY.append(int(row[-1]))
    totalX = scalar.fit_transform(totalX)
    print ("Data Loaded")
    clf = EmpiricalCovariance()
    clf.fit(totalX)
    distances = clf.mahalanobis(totalX)

    Y = []
    for i in range(len(totalY)):

        if np.log10(distances[i]) > 1.838:
            Y.append(1)
        else:
            Y.append(0)
    print ("Results")
    auc = roc_auc_score(totalY, Y)
    print(auc)
    fpr, _, _ = roc_curve(totalY, Y)
    print (fpr[1])
    _, recall, _ = precision_recall_curve(totalY, Y)
    print (recall[1])
    return auc, fpr[1], recall[1]
Example #22
def detect_bad_channels(inst, pick_types=None, threshold=.2):
    from sklearn.preprocessing import RobustScaler
    from sklearn.covariance import EmpiricalCovariance
    from jr.stats import median_abs_deviation
    if pick_types is None:
        pick_types = dict(meg='mag')
    inst = inst.pick_types(copy=True, **pick_types)
    cov = EmpiricalCovariance()
    cov.fit(inst._data.T)
    cov = cov.covariance_
    # center
    scaler = RobustScaler()
    cov = scaler.fit_transform(cov).T
    cov /= median_abs_deviation(cov)
    cov -= np.median(cov)
    # compute robust summary metrics
    mu = np.median(cov, axis=0)
    sigma = median_abs_deviation(cov, axis=0)
    mu /= median_abs_deviation(mu)
    sigma /= median_abs_deviation(sigma)
    distance = np.sqrt(mu ** 2 + sigma ** 2)
    bad = np.where(distance < threshold)[0]
    bad = [inst.ch_names[ch] for ch in bad]
    return bad
Example #23
def detect_bad_channels(inst, pick_types=None, threshold=.2):
    from sklearn.preprocessing import RobustScaler
    from sklearn.covariance import EmpiricalCovariance
    from jr.stats import median_abs_deviation
    if pick_types is None:
        pick_types = dict(meg='mag')
    inst = inst.pick_types(copy=True, **pick_types)
    cov = EmpiricalCovariance()
    cov.fit(inst._data.T)
    cov = cov.covariance_
    # center
    scaler = RobustScaler()
    cov = scaler.fit_transform(cov).T
    cov /= median_abs_deviation(cov)
    cov -= np.median(cov)
    # compute robust summary metrics
    mu = np.median(cov, axis=0)
    sigma = median_abs_deviation(cov, axis=0)
    mu /= median_abs_deviation(mu)
    sigma /= median_abs_deviation(sigma)
    distance = np.sqrt(mu ** 2 + sigma ** 2)
    bad = np.where(distance < threshold)[0]
    bad = [inst.ch_names[ch] for ch in bad]
    return bad
Example #24
def pca(X, **kwargs):
    """
    Parameters
    ----------
    X : array-like, shape = [n_features, n_samples]
        Training data, where n_samples is the number of samples and
        n_features is the number of features.


    Argument dictionary should contain:
    kwargs = {
        'd' : intrinsic dimension (int)
        'rescale' : Boolean whether standardization should be performed (True
                    for yes).
        'return_mat' : Boolean whether key SIR matrix should be returned (defaults
                    to False).
    }

    Returns
    -----------
    proj_vecs : array-like, shape = [n_features, d]
        Orthonormal system spanning the sufficient dimension subspace, where
        d refers to the intrinsic dimension.
    """
    d = kwargs['d']
    return_mat = kwargs['return_mat']
    rescale = kwargs['rescale']
    scaler = StandardScaler()
    if rescale:
        emc = EmpiricalCovariance()
        emc = emc.fit(X.T)  # Covariance of all samples
        cov_all = emc.covariance_
        Z = scaler.fit_transform(X.T).T
        pca = PCA(svd_solver='full')
        pca = pca.fit(Z.T)
        proj_vecs = pca.components_[:d, :].T
        # Apply inverse transformation
        vecs = sqrtm(scipy.linalg.inv(cov_all)).dot(proj_vecs)
        proj_vecs, dummy = np.linalg.qr(vecs)
    else:
        pca = PCA(svd_solver='full')
        pca = pca.fit(X.T)
        proj_vecs = pca.components_[:d, :].T
    if return_mat:
        return proj_vecs, X.dot(X.T)
    else:
        return proj_vecs
Example #25
class ChangeDetector(object):
    """
    Joint Gaussian Change detector using a scikit learn style interface
    
    This class is really a wrapper around the methods in scikit learn for estimating covariance using
    robust or empirical methods and calculating the mahalanobis distances.

    """
    def __init__(self, method='robust', estimator_kw_args={}):
        if method == 'robust':
            self.covariance_estimator_ = MinCovDet(**estimator_kw_args)
        elif method == 'empirical':
            self.covariance_estimator_ = EmpiricalCovariance(
                **estimator_kw_args)
        else:
            raise ValueError(
                "{} is not a valid method. Must be one of 'robust' or 'empirical'"
                .format(method))

    def fit(self, X):
        """
        Fits the estimator.

        Parameters:
        -----------
        X - array of time series, shape (n_series, len_series)
        """
        self.covariance_estimator_ = self.covariance_estimator_.fit(X)
        return self

    def predict(self, X, threshold):
        """
        Returns true for each time series predicted as change. Also returns the mahalanobis distances

        parameters:
        -----------
        X - array of time series, shape (n_series, len_series)
        threshold - float

        returns:
        y_pred - shape (n_time_series), True if change detected
        distances - shape (n_time_series). The Mahalanobis distances of each time series under the fitted distribution
        """
        distances = self.covariance_estimator_.mahalanobis(X)
        return distances > threshold, distances
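
A hedged usage sketch for ChangeDetector; it assumes MinCovDet and EmpiricalCovariance have been imported from sklearn.covariance, and uses synthetic rows in place of real time series:

import numpy as np

rng = np.random.RandomState(0)
baseline = rng.randn(300, 10)                 # 300 series of length 10, no change
shifted = rng.randn(5, 10) + 3.0              # 5 series with a clear level shift

detector = ChangeDetector(method='empirical').fit(baseline)
y_pred, distances = detector.predict(np.vstack([baseline[:5], shifted]), threshold=30.0)
print(y_pred)                                 # first five False, last five True (typically)
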
Example #26
def test_covariance():
    # Tests Covariance module on a simple dataset.
    # test covariance fit from data
    cov = EmpiricalCovariance()
    cov.fit(X)
    emp_cov = empirical_covariance(X)
    assert_array_almost_equal(emp_cov, cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(emp_cov), 0)
    assert_almost_equal(
        cov.error_norm(emp_cov, norm='spectral'), 0)
    assert_almost_equal(
        cov.error_norm(emp_cov, norm='frobenius'), 0)
    assert_almost_equal(
        cov.error_norm(emp_cov, scaling=False), 0)
    assert_almost_equal(
        cov.error_norm(emp_cov, squared=False), 0)
    assert_raises(NotImplementedError,
                  cov.error_norm, emp_cov, norm='foo')
    # Mahalanobis distances computation test
    mahal_dist = cov.mahalanobis(X)
    assert_greater(np.amin(mahal_dist), 0)

    # test with n_features = 1
    X_1d = X[:, 0].reshape((-1, 1))
    cov = EmpiricalCovariance()
    cov.fit(X_1d)
    assert_array_almost_equal(empirical_covariance(X_1d), cov.covariance_, 4)
    assert_almost_equal(cov.error_norm(empirical_covariance(X_1d)), 0)
    assert_almost_equal(
        cov.error_norm(empirical_covariance(X_1d), norm='spectral'), 0)

    # test with one sample
    # Create X with 1 sample and 5 features
    X_1sample = np.arange(5).reshape(1, 5)
    cov = EmpiricalCovariance()
    assert_warns(UserWarning, cov.fit, X_1sample)
    assert_array_almost_equal(cov.covariance_,
                              np.zeros(shape=(5, 5), dtype=np.float64))

    # test integer type
    X_integer = np.asarray([[0, 1], [1, 0]])
    result = np.asarray([[0.25, -0.25], [-0.25, 0.25]])
    assert_array_almost_equal(empirical_covariance(X_integer), result)

    # test centered case
    cov = EmpiricalCovariance(assume_centered=True)
    cov.fit(X)
    assert_array_equal(cov.location_, np.zeros(X.shape[1]))
Example #27
def save(X, Y, **kwargs):
    """
    Parameters
    ----------
    X : array-like, shape = [n_features, n_samples]
        Training data, where n_samples is the number of samples and
        n_features is the number of features.
    Y : array-like, shape = [n_samples]
        Response variable, where n_samples is the number of samples


    Argument dictionary should contain:
    kwargs = {
        'd' : intrinsic dimension (int)
        'n_levelsets' : number of slices to use (int)
        'rescale' : Boolean whether standardization should be performed (True
                    for yes).
        'return_mat' : Boolean whether key SIR matrix should be returned (defaults
                    to False).
    }

    Returns
    -----------
    proj_vecs : array-like, shape = [n_features, d]
        Orthonormal system spanning the sufficient dimension subspace, where
        d refers to the intrinsic dimension.
    """
    # Extract arguments from dictionary
    d = kwargs['d']
    n_levelsets = kwargs['n_levelsets']
    rescale = kwargs['rescale']
    return_mat = kwargs.get('return_mat', False)


    D, N = X.shape
    # Standardize X
    emc = EmpiricalCovariance()
    emc = emc.fit(X.T) # Covariance of all samples
    mean_all = np.mean(X, axis = 0)
    cov_all = emc.covariance_
    scaler = StandardScaler()
    if rescale:
        Z = scaler.fit_transform(X.T).T
    labels, n_levelsets = split_statistically_equivalent_blocks(X, Y, n_levelsets)
    M = np.zeros((D, D)) # Key matrix in SAVE
    empirical_probabilities = np.zeros(n_levelsets)
    for i in range(n_levelsets):
        empirical_probabilities[i] = float(len(np.where(labels == i)[0]))/float(N)
        if rescale:
            emc = emc.fit(Z[:,labels == i].T) # Covariance of all samples
            cov_sub = emc.covariance_
            M += empirical_probabilities[i] * (np.eye(D) - cov_sub).dot((cov_all - cov_sub))
        else:
            emc = emc.fit(X[:,labels == i].T) # Covariance of all samples
            cov_sub = emc.covariance_
            M += empirical_probabilities[i] * (cov_all - cov_sub).dot((cov_all - cov_sub))
    U, S, V = np.linalg.svd(M)
    if rescale:
        # Apply inverse transformation
        vecs = sqrtm(scipy.linalg.inv(cov_all)).dot(U[:,:d])
        proj_vecs, dummy = np.linalg.qr(vecs)
    else:
        proj_vecs = U[:,:d]
    if return_mat:
        return proj_vecs, M
    else:
        return proj_vecs
Example #28
def optimalShrinkage(X, return_covariance=False, method='rie'):
    """This function computes a cleaned, optimal shrinkage, 
       rotationally-invariant estimator (RIE) of the true correlation 
       matrix C underlying the noisy, in-sample estimate 
       E = 1/T X * transpose(X)
       associated to a design matrix X of shape (T, N) (T measurements 
       and N features).

       One approach to getting a cleaned estimator that predates the
       optimal shrinkage, RIE estimator consists in inverting the 
       Marcenko-Pastur equation so as to replace the eigenvalues
       from the spectrum of E by an estimation of the true ones.

       This approach is known to be numerically-unstable, in addition
       to failing to account for the overlap between the sample eigenvectors
       and the true eigenvectors. How to compute such overlaps was first
       explained by Ledoit and Peche (cf. reference below). Their procedure
       was extended by Bun, Bouchaud and Potters, who also correct
       for a systematic downward bias in small eigenvalues.
       
       It is this debiased, optimal shrinkage, rotationally-invariant
       estimator that the function at hand implements.
       
       In addition to the above method, this function also provides access to:
       - The finite N regularization of the optimal RIE for small eigenvalues
         as provided in section 8.1 of [3] a.k.a the inverse wishart (IW) regularization.
       - The direct kernel method of O. Ledoit and M. Wolf in their 2017 paper [4]. 
         This is a direct port of their Matlab code.
        
         
       Parameters
       ----------
       X: design matrix, of shape (T, N), where T denotes the number
           of samples (think measurements in a time series), while N
           stands for the number of features (think of stock tickers).
           
       return_covariance: type bool (default: False)
           If set to True, compute the standard deviations of each individual
           feature across observations, clean the underlying matrix
           of pairwise correlations, then re-apply the standard
           deviations and return a cleaned variance-covariance matrix.
       
       method: type string, optional (default="rie")
           - If "rie" : optimal shrinkage in the manner of Bun & al.
            with no regularisation  
           - If "iw" : optimal shrinkage in the manner of Bun & al.
            with the so called Inverse Wishart regularization
           - If 'kernel': Direct kernel method of Ledoit and Wolf.

       Returns
       -------
       E_RIE: type numpy.ndarray, shape (N, N)
           Cleaned estimator of the true correlation matrix C. A sample
           estimator of C is the empirical covariance matrix E 
           estimated from X. E is corrupted by in-sample noise.
           E_RIE is the optimal shrinkage, rotationally-invariant estimator 
           (RIE) of C computed following the procedure of Joel Bun 
           and colleagues (cf. references below).
           
           If return_covariance=True, E_RIE corresponds to a cleaned
           variance-covariance matrix.

       References
       ----------
       1 "Eigenvectors of some large sample covariance matrix ensembles",
         O. Ledoit and S. Peche
         Probability Theory and Related Fields, Vol. 151 (1), pp 233-264
       2 "Rotational invariant estimator for general noisy matrices",
         J. Bun, R. Allez, J.-P. Bouchaud and M. Potters
         arXiv: 1502.06736 [cond-mat.stat-mech]
       3 "Cleaning large Correlation Matrices: tools from Random Matrix Theory",
         J. Bun, J.-P. Bouchaud and M. Potters
         arXiv: 1610.08104 [cond-mat.stat-mech]
       4 "Direct Nonlinear Shrinkage Estimation of Large-Dimensional Covariance Matrices (September 2017)", 
         O. Ledoit and M. Wolf https://ssrn.com/abstract=3047302 or http://dx.doi.org/10.2139/ssrn.3047302
    """
    
    try:
        assert isinstance(return_covariance, bool)
    except AssertionError:
        raise
        sys.exit(1)

    T, N, transpose_flag = checkDesignMatrix(X)
    if transpose_flag:
        X = X.T
        
    if not return_covariance:
        X = StandardScaler(with_mean=False,
                           with_std=True).fit_transform(X)

    ec = EmpiricalCovariance(store_precision=False,
                             assume_centered=True)
    ec.fit(X)
    E = ec.covariance_
    
    if return_covariance:
        inverse_std = 1./np.sqrt(np.diag(E))
        E *= inverse_std
        E *= inverse_std.reshape(-1, 1)

    eigvals, eigvecs = np.linalg.eigh(E)
    eigvecs = eigvecs.T

    q = N / float(T)
    lambda_N = eigvals[0]  # The smallest empirical eigenvalue,
                           # given that the function used to compute
                           # the spectrum of a Hermitian or symmetric
                           # matrix - namely np.linalg.eigh - returns
                           # the eigenvalues in ascending order.
    lambda_hats = None
    
    if method != 'kernel':
        use_inverse_wishart = (method == 'iw')
        xis = map(lambda x: xiHelper(x, q, E), eigvals)
        Gammas = map(lambda x: gammaHelper(x, q, N, lambda_N, inverse_wishart=use_inverse_wishart), eigvals)
        xi_hats = map(lambda a, b: a * b if b > 1 else a, xis, Gammas)
        lambda_hats = xi_hats
    else:
         lambda_hats = directKernel(q, T, N, eigvals)
        
    E_RIE = np.zeros((N, N), dtype=float)
    for lambda_hat, eigvec in zip(lambda_hats, eigvecs):
        eigvec = eigvec.reshape(-1, 1)
        E_RIE += lambda_hat * eigvec.dot(eigvec.T)

    # bp()
    tmp = 1./np.sqrt(np.diag(E_RIE))
    E_RIE *= tmp
    E_RIE *= tmp.reshape(-1, 1)
    
    if return_covariance:
        std = 1./inverse_std
        E_RIE *= std
        E_RIE *= std.reshape(-1, 1)

    return E_RIE
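
A hedged usage sketch for optimalShrinkage; the helper functions it references (checkDesignMatrix, xiHelper, gammaHelper, directKernel) are assumed to be defined in the same module, so only the call itself is shown:

import numpy as np

rng = np.random.RandomState(0)
T, N = 400, 50
X = rng.randn(T, N)                          # T observations of N features

E_rie = optimalShrinkage(X, return_covariance=False, method='iw')
print(E_rie.shape)                           # (50, 50) cleaned correlation matrix
print(np.allclose(np.diag(E_rie), 1.0))      # unit diagonal after the final re-normalization
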
Example #29
def clipped(X, alpha=None, return_covariance=False):
    """Clips the eigenvalues of an empirical correlation matrix E 
       in order to provide a cleaned estimator E_clipped of the 
       underlying correlation matrix.
       Proceeds by keeping the [N * alpha] top eigenvalues and shrinking
       the remaining ones by a trace-preserving constant 
       (i.e. Tr(E_clipped) = Tr(E)).

       Parameters
       ----------
       X: design matrix, of shape (T, N), where T denotes the number
           of samples (think measurements in a time series), while N
           stands for the number of features (think of stock tickers).

       alpha: type float or derived from numbers.Real (default: None)
           Parameter between 0 and 1, inclusive, determining the fraction
           to keep of the top eigenvalues of an empirical correlation matrix.

           If left unspecified, alpha is chosen so as to keep all the
           empirical eigenvalues greater than the upper limit of 
           the support to the Marcenko-Pastur spectrum. Indeed, such 
           eigenvalues can be considered as associated with some signal,
           whereas the ones falling inside the Marcenko-Pastur range
           should be considered as corrupted with noise and indistinguishable
           from the spectrum of the correlation of a random matrix.

           This ignores finite-size effects that make it possible
           for the eigenvalues to exceed the upper and lower edges
           defined by the Marcenko-Pastur spectrum (cf. a set of results
           revolving around the Tracy-Widom distribution)
           
       return_covariance: type bool (default: False)
           If set to True, compute the standard deviations of each individual
           feature across observations, clean the underlying matrix
           of pairwise correlations, then re-apply the standard
           deviations and return a cleaned variance-covariance matrix.

       Returns
       -------
       E_clipped: type numpy.ndarray, shape (N, N)
           Cleaned estimator of the true correlation matrix C underlying
           a noisy, in-sample estimate E (empirical correlation matrix
           estimated from X). This cleaned estimator proceeds through
           a simple eigenvalue clipping procedure (cf. reference below).
           
           If return_covariance=True, E_clipped corresponds to a cleaned 
           variance-covariance matrix.

       Reference
       ---------
       "Financial Applications of Random Matrix Theory: a short review",
       J.-P. Bouchaud and M. Potters
       arXiv: 0910.1205 [q-fin.ST]
    """

    try:
        if alpha is not None:
            assert isinstance(alpha, Real) and 0 <= alpha <= 1
            
        assert isinstance(return_covariance, bool)
    except AssertionError:
        raise
        sys.exit(1)
    
    T, N, transpose_flag = checkDesignMatrix(X)
    if transpose_flag:
        X = X.T
        
    if not return_covariance:
        X = StandardScaler(with_mean=False,
                           with_std=True).fit_transform(X)

    ec = EmpiricalCovariance(store_precision=False,
                             assume_centered=True)
    ec.fit(X)
    E = ec.covariance_
    
    if return_covariance:
        inverse_std = 1./np.sqrt(np.diag(E))
        E *= inverse_std
        E *= inverse_std.reshape(-1, 1)

    eigvals, eigvecs = np.linalg.eigh(E)
    eigvecs = eigvecs.T

    if alpha is None:
        (lambda_min, lambda_max), _ = marcenkoPastur(X)
        xi_clipped = np.where(eigvals >= lambda_max, eigvals, np.nan)
    else:
        xi_clipped = np.full(N, np.nan)
        threshold = int(ceil(alpha * N))
        if threshold > 0:
            xi_clipped[-threshold:] = eigvals[-threshold:]

    gamma = float(E.trace() - np.nansum(xi_clipped))
    gamma /= np.isnan(xi_clipped).sum()
    xi_clipped = np.where(np.isnan(xi_clipped), gamma, xi_clipped)

    E_clipped = np.zeros((N, N), dtype=float)
    for xi, eigvec in zip(xi_clipped, eigvecs):
        eigvec = eigvec.reshape(-1, 1)
        E_clipped += xi * eigvec.dot(eigvec.T)
        
    tmp = 1./np.sqrt(np.diag(E_clipped))
    E_clipped *= tmp
    E_clipped *= tmp.reshape(-1, 1)
    
    if return_covariance:
        std = 1./inverse_std
        E_clipped *= std
        E_clipped *= std.reshape(-1, 1)

    return E_clipped
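
And a matching sketch for clipped(), under the same assumption that the module-level helpers (checkDesignMatrix, marcenkoPastur) are available:

import numpy as np

rng = np.random.RandomState(0)
T, N = 400, 50
X = rng.randn(T, N)

E_clipped = clipped(X, alpha=0.1)            # keep the top 10% of eigenvalues, shrink the rest
print(E_clipped.shape)                       # (50, 50)
print(np.allclose(np.diag(E_clipped), 1.0))  # unit diagonal, as for a correlation matrix
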
Example #30
class Matching(IndividualOutcomeEstimator):
    def __init__(
        self,
        propensity_transform=None,
        caliper=None,
        with_replacement=True,
        n_neighbors=1,
        matching_mode="both",
        metric="mahalanobis",
        knn_backend="sklearn",
        estimate_observed_outcome=False,
    ):
        """Match treatment and control samples with similar covariates.

        Args:
            propensity_transform (causallib.transformers.PropensityTransformer):
                an object for data preprocessing which adds the propensity
                score as a feature (default: None)
            caliper (float) : maximal distance for a match to be accepted. If
                not defined, all matches will be accepted. If defined, some
                samples may not be matched and their outcomes will not be
                estimated. (default: None)
            with_replacement (bool): whether samples can be used multiple times
                for matching. If set to False, the matching process will optimize
                the linear sum of distances between pairs of treatment and
                control samples and only `min(N_treatment, N_control)` samples
                will be estimated. Matching with no replacement does not make
                use of the `fit` data and is therefore not implemented for
                out-of-sample data (default: True)
            n_neighbors (int) : number of nearest neighbors to include in match.
                Must be 1 if `with_replacement` is `False.` If larger than 1, the
                estimate is calculated using the `regress_agg_function` or 
                `classify_agg_function` across the `n_neighbors`. Note that when
                the `caliper` variable is set, some samples will have fewer than
                `n_neighbors` matches. (default: 1).
            matching_mode (str) : Direction of matching: `treatment_to_control`,
                `control_to_treatment` or `both` to indicate which set should
                be matched to which. All sets are cross-matched in `match`
                and when `with_replacement` is `False` all matching modes 
                coincide. With replacement there is a difference.
            metric (str) : Distance metric string for calculating distance
                between samples. Note: if an external built `knn_backend`
                object with a different metric is supplied, `metric` needs to
                be changed to reflect that, because `Matching` will set its 
                inverse covariance matrix if "mahalanobis" is set. (default: 
                "mahalanobis", also supported: "euclidean")
            knn_backend (str or callable) : Backend to use for nearest neighbor
                search. Options are "sklearn"  or a callable  which returns an 
                object implementing `fit`, `kneighbors` and `set_params` 
                like the sklearn `NearestNeighbors` object. (default: "sklearn"). 
            estimate_observed_outcome (bool) : Whether to allow a match of a
                sample to a sample other than itself when looking within its own
                treatment value. If True, the estimated potential outcome for the
                observed outcome may differ from the true observed outcome.
                (default: False)

        Attributes:
            classify_agg_function (callable) : Aggregating function for outcome
                estimation when classifying. (default: majority_rule)
                Usage is determined by type of `y` during `fit`
            regress_agg_function (callable) : Aggregating function for outcome
                estimation when regressing or predicting prob_a. (default: np.mean)
                Usage is determined by type of `y` during `fit`
            treatments_ (pd.DataFrame) : DataFrame of treatments (created after `fit`)
            outcomes_ (pd.DataFrame) : DataFrame of outcomes (created after `fit`)
            match_df_ (pd.DataFrame) : Dataframe of most recently calculated
                matches. For details, see `match`. (created after `match`)
            samples_used_ (pd.Series) : Series with count of samples used
                during most recent match. Series includes a count for each
                treatment value. (created after `match`)
        """
        self.propensity_transform = propensity_transform
        self.covariance_conditioner = EmpiricalCovariance()
        self.caliper = caliper
        self.with_replacement = with_replacement
        self.n_neighbors = n_neighbors
        self.matching_mode = matching_mode
        self.metric = metric
        # if classify task, default aggregation function is majority
        self.classify_agg_function = majority_rule
        # if regress task,  default aggregation function is mean
        self.regress_agg_function = np.mean
        self.knn_backend = knn_backend
        self.estimate_observed_outcome = estimate_observed_outcome

    def fit(self, X, a, y, sample_weight=None):
        """Load the treatments and outcomes and fit search trees.

        Applies transform to covariates X, initializes search trees for each
        treatment value for performing nearest neighbor searches.
        Note: Running `fit` a second time overwrites any information from
        previous `fit` or `match` and re-fits the propensity_transform object.

        Args:
            X (pd.DataFrame): DataFrame of shape (n,m) containing m covariates
                for n samples.
            a (pd.Series): Series of shape (n,) containing discrete treatment
                values for the n samples.
            y (pd.Series): Series of shape (n,) containing outcomes for
                the n samples.
            sample_weight: IGNORED In signature for compatibility with other
                estimators.


        Note: `X`, `a` and `y` must share the same index.

        Returns:
            self (Matching) the fitted object
        """
        self._clear_post_fit_variables()
        self.outcome_ = y.copy()
        self.treatments_ = a.copy()

        if self.propensity_transform:
            self.propensity_transform.fit(X, a)
            X = self.propensity_transform.transform(X)

        self.conditioned_covariance_ = self._calculate_covariance(X)

        self.treatment_knns_ = {}
        for a in self.treatments_.unique():
            haystack = X[self.treatments_ == a]
            self.treatment_knns_[a] = self._fit_sknn(haystack)

        return self

    def _execute_matching(self, X, a):
        """Execute matching of samples in X according to the treatment values in a.

        Returns a DataFrame including all the results, which is also set as
        the attribute `self.match_df_`. The arguments `X` and `a` define the
        "needle" where the "haystack" is the data that was previously passed
        to fit, for matching with replacement. As such, treatment and control 
        samples from within `X` will not be matched with each other, unless
        the same `X` and `a` were passed to `fit`. For matching without
        replacement, the `X` and `a` passed to `match` provide the "needle" and
        the "haystack". If the attribute `caliper` is set, the matches are
        limited to those with a distance less than `caliper`.

        This function ignores the existing `match_df_` and will overwrite it.
        It is thus useful for if you have changed the settings and need to
        rematch the samples. For most applications, the `match` function is
        more convenient.

        Args:
            X (pd.DataFrame): DataFrame of shape (n,m) containing m covariates
                for n samples.
            a (pd.Series): Series of shape (n,) containing discrete treatment
                values for the n samples.

        Note: The args are assumed to share the same index.

        Returns:
            match_df: The resulting matches DataFrame is indexed so that
              ` match_df.loc[treatment_value, sample_id]` has columns `matches`
               and `distances` containing lists of indices to samples and the
               respective distances for the matches discovered for `sample_id`
               from within the fitted samples with the given `treatment_value`.
               The indices in the `matches` column are from the fitted data,
               not the X argument in `match`. If `sample_id` had no match,
               `match_df.loc[treatment_value, sample_id].matches = []`.
               The DataFrame has shape (n * len(a.unique()), 2).

        Raises:
            NotImplementedError: Raised when with_replacement is False and
               n_neighbors is not 1.
        """
        if self.n_neighbors != 1 and not self.with_replacement:
            raise NotImplementedError(
                "Matching more than one neighbor is only implemented for "
                "matching with replacement")

        if self.propensity_transform:
            X = self.propensity_transform.transform(X)
        if self.with_replacement:
            self.match_df_ = self._withreplacement_match(X, a)
        else:
            self.match_df_ = self._noreplacement_match(X, a)
        sample_id_name = X.index.name if X.index.name is not None else "sample_id"
        self.match_df_.index.set_names(["match_to_treatment", sample_id_name],
                                       inplace=True)
        # we record the number of samples that were successfully matched of
        # each treatment value
        self.samples_used_ = self._count_samples_used_by_treatment_value(a)

        return self.match_df_

    def estimate_individual_outcome(self,
                                    X,
                                    a,
                                    y=None,
                                    treatment_values=None,
                                    predict_proba=True,
                                    dropna=True):
        """
        Calculate the potential outcome for each sample and treatment value.

        Execute match and calculate, for each treatment value and each sample,
        the expected outcome. 

        Note: Out of sample estimation for matching without replacement requires
        passing a `y` vector here. If no `y` is passed here, the values received
        by `fit` are used, and if the estimation indices are not a subset of the 
        fitted indices, the estimation will fail.

        If the attribute `estimate_observed_outcome` is 
        `True`, estimates will be calculated for the observed outcomes as well.
        If not, then the observed outcome will be passed through from the 
        corresponding element of `y` passed to `fit`.


        Args:
            X (pd.DataFrame): DataFrame of shape (n,m) containing m covariates
                for n samples.
            a (pd.Series): Series of shape (n,) containing discrete treatment
                values for the n samples.
            y (pd.Series): Series of shape (n,) containing outcome values for
                n samples. This is only used when `with_replacement=False`.
                Otherwise, the outcome values passed to `fit` are used.
            predict_proba (bool) : whether to output classifications or
                probabilities for a classification task. If set to False and
                data is non-integer, a warning is issued. (default True)
            dropna (bool) : If True, drop samples that were unmatched due to
                caliper restrictions from outcome_df, leading to a potentially
                smaller output; if False, include them as NaN. (default: True)
            treatment_values : IGNORED

        Note: The args are assumed to share the same index.

        Returns:
            outcome_df (pd.DataFrame)
        """
        match_df = self.match(X, a, use_cached_result=True)

        outcome_df = self._aggregate_match_df_to_generate_outcome_df(
            match_df, a, predict_proba)
        outcome_df = self._filter_outcome_df_by_matching_mode(outcome_df, a)
        if outcome_df.isna().all(axis=None):
            raise ValueError("Matching was not successful and no outcomes can "
                             "be estimated. Check caliper value.")
        if dropna:
            outcome_df = outcome_df.dropna()

        return outcome_df

    def match(self,
              X,
              a,
              use_cached_result=True,
              successful_matches_only=False):
        """Matching the samples in X according to the treatment values in a.

        Returns a DataFrame including all the results, which is also set as
        the attribute `self.match_df_`. The arguments `X` and `a` define the
        "needle" where the "haystack" is the data that was previously passed
        to fit, for matching with replacement. As such, treatment and control 
        samples from within `X` will not be matched with each other, unless
        the same `X` and `a` were passed to `fit`. For matching without
        replacement, the `X` and `a` passed to `match` provide the "needle" and
        the "haystack". If the attribute `caliper` is set, the matches are
        limited to those with a distance less than `caliper`.

        Args:
            X (pd.DataFrame): DataFrame of shape (n,m) containing m covariates
                for n samples.
            a (pd.Series): Series of shape (n,) containing discrete treatment
                values for the n samples.
            use_cached_result (bool): Whether or not to return the `match_df` 
                from the most recent matching operation. The cached result will
                only be used if the sample indices of `X` and those of `match_df`
                are identical, otherwise it will rematch.
            successful_matches_only (bool): Whether or not to filter the matches
                to those which matched successfully. If set to `False`, the
                resulting DataFrame will have shape (n* len(a.unique()), 2 ),
                otherwise it may have a smaller shape due to unsuccessful matches.

        Note: The args are assumed to share the same index.

        Returns:
            match_df: The resulting matches DataFrame is indexed so that
              `match_df.loc[treatment_value, sample_id]` has columns `matches`
               and `distances` containing lists of indices to samples and the
               respective distances for the matches discovered for `sample_id`
               from within the fitted samples with the given `treatment_value`.
               The indices in the `matches` column are from the fitted data,
               not the X argument in `match`. If `sample_id` had no match,
               `match_df.loc[treatment_value, sample_id].matches = []`.
               The DataFrame has shape (n * len(a.unique()), 2), if
               `successful_matches_only` is set to `False`.

        Raises:
            NotImplementedError: Raised when with_replacement is False and
               n_neighbors is not 1.
        """
        cached_result_available = (hasattr(self, "match_df_") and
                                   X.index.equals(self.match_df_.loc[0].index))
        if not (use_cached_result and cached_result_available):
            self._execute_matching(X, a)

        return self._get_match_df(
            successful_matches_only=successful_matches_only)

    def matches_to_weights(self, match_df=None):
        """Calculate weights based on a given set of matches.

        For each matching from one treatment value to another, a weight vector
        is generated. The weights are calculated as the number of times a
        sample was selected in a matching, with each occurrence weighted
        according to the number of other samples in that matching. The weights
        can be used to estimate outcomes or to check covariate balancing. The 
        function can only be called after `match` has been run.

        Args:
            match_df (pd.DataFrame) : a DataFrame of matches returned from
                `match`. If not supplied, will use the `match_df_` attribute if
                available, else raises ValueError. Will not execute `match` to
                generate a `match_df`.

        Returns:
            weights_df (pd.DataFrame): DataFrame of shape (n,M) where M is the
                number of permutations of `a.unique()`.
        """
        if match_df is None:
            match_df = self._get_match_df(successful_matches_only=False)

        match_permutations = sorted(permutations(self.treatments_.unique()))
        weights_df = pd.DataFrame([
            self._matches_to_weights_single_matching(s, t, match_df)
            for s, t in match_permutations
        ], ).T

        return weights_df

    def get_covariates_of_matches(self, s, t, covariates):
        """
        Look up covariates of closest matches for a given matching.

        Using `self.match_df_` and the supplied `covariates`, look up
        the covariates of the nearest match. The function can only be called after
        `match` has been run.

            Args:
                s (int) : source treatment value
                t (int) : target treatment value
                covariates (pd.DataFrame) : The same covariates which were
                   passed to `fit`.

            Returns:
                covariate_df (pd.DataFrame) : a DataFrame of size
                (n_matched_samples, n_covariates * 3 + 2) with the covariate
                values of the sample, covariates of its match, calculated
                distance and number of neighbors found within the given
                caliper (with no caliper this will equal self.n_neighbors).

        """
        match_df = self._get_match_df()
        subdf = match_df.loc[s][self.treatments_ == t]
        sample_id_name = subdf.index.name

        def get_covariate_difference_from_nearest_match(source_row_index):
            j = subdf.loc[source_row_index].matches[0]
            delta_series = pd.Series(covariates.loc[source_row_index] -
                                     covariates.loc[j])
            source_row = covariates.loc[j].copy()
            source_row.at[sample_id_name] = j
            target_row = covariates.loc[source_row_index].copy()
            covariate_differences = pd.concat({
                t: target_row,
                s: source_row,
                "delta": delta_series,
                "outcomes": pd.Series({
                    t: self.outcome_.loc[source_row_index],
                    s: self.outcome_.loc[j]
                }),
                "match": pd.Series(dict(
                    n_neighbors=len(subdf.loc[source_row_index].matches),
                    distance=subdf.loc[source_row_index].distances[0],
                )),
            })
            return covariate_differences

        covdf = pd.DataFrame(data=[
            get_covariate_difference_from_nearest_match(i) for i in subdf.index
        ],
                             index=subdf.index)
        covdf = covdf.reset_index()
        cols = covdf.columns
        covdf.columns = pd.MultiIndex.from_tuples([(t, sample_id_name)] +
                                                  list(cols[1:]))
        return covdf

    def _clear_post_fit_variables(self):
        for var in list(vars(self)):
            if var[-1] == "_":
                self.__delattr__(var)

    def _calculate_covariance(self, X):
        if len(X.shape) > 1 and X.shape[1] > 1:
            V_list = []
            for a in self.treatments_.unique():
                X_at_a = X[self.treatments_ == a].copy()
                current_V = self.covariance_conditioner.fit(X_at_a).covariance_
                V_list.append(current_V)
            # following Imbens&Rubin, we average across treatment groups
            V = np.mean(V_list, axis=0)
        else:
            # for 1d data revert to euclidean metric
            V = np.array(1).reshape(1, 1)
        return V

    def _aggregate_match_df_to_generate_outcome_df(self, match_df, a,
                                                   predict_proba):
        agg_function = self._get_agg_function(predict_proba)

        def outcome_from_matches_by_idx(x):
            return agg_function(self.outcome_.loc[x])

        outcomes = {}
        for i in sorted(a.unique()):
            outcomes[i] = match_df.loc[i].matches.apply(
                outcome_from_matches_by_idx)
        outcome_df = pd.DataFrame(outcomes)
        return outcome_df

    def _get_match_df(self, successful_matches_only=True):
        if not hasattr(self, "match_df_") or self.match_df_ is None:
            raise NotFittedError("You need to run `match` first")
        match_df = self.match_df_.copy()
        if successful_matches_only:
            match_df = match_df[match_df.matches.apply(bool)]
        if match_df.empty:
            raise ValueError(
                "Matching was not successful and no outcomes can be "
                "estimated. Check caliper value.")
        return match_df

    def _filter_outcome_df_by_matching_mode(self, outcome_df, a):
        if self.matching_mode == "treatment_to_control":
            outcome_df = outcome_df[a == 1]
        elif self.matching_mode == "control_to_treatment":
            outcome_df = outcome_df[a == 0]
        elif self.matching_mode == "both":
            pass
        else:
            raise NotImplementedError(
                "Matching mode {} is not implemented. Please select one of "
                "'treatment_to_control', 'control_to_treatment, "
                "or 'both'.".format(self.matching_mode))
        return outcome_df

    def _get_agg_function(self, predict_proba):
        if predict_proba:
            agg_function = self.regress_agg_function
        else:
            agg_function = self.classify_agg_function
            try:
                isoutputinteger = np.allclose(self.outcome_.apply(int),
                                              self.outcome_)
                if not isoutputinteger:
                    warnings.warn("Classifying non-categorical outcomes. "
                                  "This is probably a mistake.")
            except Exception:
                warnings.warn(
                    "Unable to detect whether outcome is integer-like. ")
        return agg_function

    def _instantiate_nearest_neighbors_object(self):
        backend = self.knn_backend
        if backend == "sklearn":
            backend_instance = NearestNeighbors(algorithm="auto")
        elif callable(backend):
            backend_instance = backend()
            self.metric = backend_instance.metric
        elif hasattr(backend, "fit") and hasattr(backend, "kneighbors"):
            backend_instance = sk_clone(backend)
            self.metric = backend_instance.metric
        else:
            raise NotImplementedError(
                "`knn_backend` must be either an NearestNeighbors-like object,"
                " a callable returning such an object, or the string \"sklearn\""
            )
        backend_instance.set_params(**self._get_metric_dict())
        return backend_instance

    def _fit_sknn(self, target_df):
        """
        Fit scikit-learn NearestNeighbors object with samples in target_df.

        Fits the object, adds metric parameters, and returns a namedtuple which
        also includes the DataFrame index so that identities can be looked up.

        Args:
            target_df (pd.DataFrame) : DataFrame of covariates to fit

        Returns:
            KNN (namedtuple) : Namedtuple with members `learner` and `index`
            containing the fitted sklearn object and an index lookup vector,
            respectively.
        """
        target_array = target_df.values

        sknn = self._instantiate_nearest_neighbors_object()

        target_array = self._ensure_array_columnlike(target_array)

        sknn.fit(target_array)
        return KNN(sknn, target_df.index)

    @staticmethod
    def _ensure_array_columnlike(target_array):
        if len(target_array.shape) < 2 or target_array.shape[1] == 1:
            target_array = target_array.reshape(-1, 1)
        return target_array

    def _get_metric_dict(
        self,
        VI_in_metric_params=True,
    ):
        metric_dict = dict(metric=self.metric)
        if self.metric == "mahalanobis":
            VI = np.linalg.inv(self.conditioned_covariance_)
            if VI_in_metric_params:
                metric_dict["metric_params"] = {"VI": VI}
            else:
                metric_dict["VI"] = VI

        return metric_dict

    def _kneighbors(self, knn, source_df):
        """Lookup neighbors in knn object.

        Args:
           knn (namedtuple) : knn named tuple to look for neighbors in. The
               object has `learner` and `index` attributes to reference the
               original df index.
           source_df (pd.DataFrame) : a DataFrame of source data points to use
               as "needles" for the knn "haystack."

        Returns:
            match_df (pd.DataFrame) : a DataFrame of matches
        """
        source_array = source_df.values
        # 1d data must be in shape (-1, 1) for sklearn.knn
        source_array = self._ensure_array_columnlike(source_array)

        distances, neighbor_array_indices = knn.learner.kneighbors(
            source_array, n_neighbors=self.n_neighbors)

        return self._generate_match_df(source_df, knn.index, distances,
                                       neighbor_array_indices)

    def _generate_match_df(self, source_df, target_df_index, distances,
                           neighbor_array_indices):
        """
        Take results of matching and build into match_df DataFrame.

        For clarity we'll call the samples that are being matched "needles" and
        the set of samples that they looked for matches in the "haystack".

        Args:
            source_df (pd.DataFrame) : Covariate dataframe of N "needles"
            target_df_index (np.array) : An array of M indices of the haystack
                samples in their original dataframe.
            distances (np.array) : An array of N arrays of floats of length K
                where K is `self.n_neighbors`.
            neighbor_array_indices (np.array) : An array of N arrays of ints of
                length K where K is `self.n_neighbors`.
        """
        # target is the haystack, source is the needle(s)
        # translate array indices back to original indices
        matches_dict = {}
        for source_idx, distance_row, neighbor_array_index_row in zip(
                source_df.index, distances, neighbor_array_indices):
            neighbor_df_indices = \
                target_df_index[neighbor_array_index_row.flatten()]
            if self.caliper is not None:
                neighbor_df_indices = [
                    n for i, n in enumerate(neighbor_df_indices)
                    if distance_row[i] < self.caliper
                ]
                distance_row = [d for d in distance_row if d < self.caliper]
            matches_dict[source_idx] = dict(matches=list(neighbor_df_indices),
                                            distances=list(distance_row))
        # convert dict of dicts like { 1: {'matches':[], 'distances':[]}} to df
        return pd.DataFrame(matches_dict).T

    def _matches_to_weights_single_matching(self, s, t, match_df):
        """
        For a given match, calculate the resulting weight vector.

        The weight vector adds a count each time a sample is used, weighted by
        the number of other neighbors when it was used. This is necessary to
        make the weighted sum return the correct effect estimate.
        """
        weights = pd.Series(self.treatments_.copy() * 0)
        name = {0: "control", 1: "treatment"}
        weights.name = "{s}_to_{t}".format(s=name[s], t=name[t])
        s_to_t_matches = match_df.loc[t][self.treatments_ == s].matches
        for source_idx, matches_list in s_to_t_matches.items():
            if matches_list:
                weights.loc[source_idx] += 1
            for match in matches_list:
                weights.loc[match] += 1 / len(matches_list)
        return weights

    def _get_distance_matrix(self, source_df, target_df):
        """
        Create distance matrix for no replacement match.

        Combines metric, caliper and source/target data into a
        precalculated distance matrix which can be passed to
        scipy.optimize.linear_sum_assignment.
        """

        cdist_args = dict(
            XA=self._ensure_array_columnlike(source_df.values),
            XB=self._ensure_array_columnlike(target_df.values),
        )
        cdist_args.update(self._get_metric_dict(False))
        distance_matrix = distance.cdist(**cdist_args)

        if self.caliper is not None:
            distance_matrix[distance_matrix > self.caliper] = VERY_LARGE_NUMBER
        return distance_matrix

    def _withreplacement_match(self, X, a):
        matches = {}  # maps treatment value to list of matches TO that value

        for treatment_value, knn in self.treatment_knns_.items():
            matches[treatment_value] = self._kneighbors(knn, X)
            # when producing potential outcomes we may want to force the
            # value of the observed outcome to be the actual observed
            # outcome, and not an average of the k nearest samples.
            if not self.estimate_observed_outcome:

                def limit_within_treatment_matches_to_self_only(row):
                    if (a.loc[row.name] == treatment_value
                            and row.name in row.matches):
                        row.matches = [row.name]
                        row.distances = [0]
                    return row

                matches[treatment_value] = matches[treatment_value].apply(
                    limit_within_treatment_matches_to_self_only, axis=1)

        return pd.concat(matches, sort=True)

    def _noreplacement_match(self, X, a):

        match_combinations = sorted(combinations(a.unique(), 2))
        matches = {}

        for s, t in match_combinations:
            distance_matrix = self._get_distance_matrix(X[a == s], X[a == t])
            source_array, neighbor_array_indices, distances = \
                self._optimally_match_distance_matrix(distance_matrix)
            source_df = X[a == s].iloc[np.array(source_array)]
            target_df = X[a == t].iloc[np.array(neighbor_array_indices)]
            if t in matches or s in matches:
                warnings.warn("No-replacement matching for more than "
                              "2 treatment values is not supported")

            matches[t] = self._create_match_df_for_no_replacement(
                a, source_df, target_df, distances)
            matches[s] = self._create_match_df_for_no_replacement(
                a, target_df, source_df, distances)

        match_df = pd.concat(matches, sort=True)
        return match_df

    def _optimally_match_distance_matrix(self, distance_matrix):
        source_array, neighbor_array_indices = linear_sum_assignment(
            distance_matrix)
        distances = [[
            distance_matrix[s_idx, t_idx]
        ] for s_idx, t_idx in zip(source_array, neighbor_array_indices)]
        source_array, neighbor_array_indices, distances = \
            self._filter_noreplacement_matches_using_caliper(
                source_array, neighbor_array_indices, distances)
        return source_array, neighbor_array_indices, distances

    def _filter_noreplacement_matches_using_caliper(self, source_array,
                                                    neighbor_array_indices,
                                                    distances):
        if self.caliper is None:
            return source_array, neighbor_array_indices, distances
        keep_indices = [
            i for i, d in enumerate(distances) if d[0] <= self.caliper
        ]
        source_array = source_array[keep_indices]
        neighbor_array_indices = neighbor_array_indices[keep_indices]
        distances = [distances[i] for i in keep_indices]
        if not keep_indices:
            warnings.warn("No matches found, check caliper."
                          "No estimation possible.")
        return source_array, neighbor_array_indices, distances

    @staticmethod
    def _create_match_df_for_no_replacement(base_series, source_df, target_df,
                                            distances):
        match_sub_df = pd.DataFrame(
            index=base_series.index,
            columns=[
                "matches",
                "distances",
            ],
            data=base_series.apply(lambda x: pd.Series([[], []])).values,
            dtype="object",
        )

        # matching from source to target: read distances
        match_sub_df.loc[source_df.index] = pd.DataFrame(
            data=dict(
                matches=[[tidx] for tidx in target_df.index],
                distances=distances,
            ),
            index=source_df.index,
        )

        # matching from target to target: fill with zeros
        match_sub_df.loc[target_df.index] = pd.DataFrame(
            data=dict(
                matches=[[tidx] for tidx in target_df.index],
                distances=[[0]] * len(distances),
            ),
            index=target_df.index,
        )
        return match_sub_df

    def _count_samples_used_by_treatment_value(self, a):
        # we record the number of samples that were successfully matched of
        # each treatment value
        samples_used = {
            treatment_value: self.match_df_.loc[treatment_value][
                a != treatment_value].matches.apply(bool).sum()
            for treatment_value in sorted(a.unique(), reverse=True)
        }

        return pd.Series(samples_used)
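# --- Hedged usage sketch (not part of the original example) ---
# A minimal sketch of how the Matching estimator defined above might be used,
# assuming the class is in scope, that its constructor accepts these keyword
# arguments with causallib-style defaults, and using synthetic data purely for
# illustration.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
n = 200
X = pd.DataFrame(rng.normal(size=(n, 3)), columns=["x0", "x1", "x2"])
a = pd.Series(rng.binomial(1, 0.5, size=n), index=X.index, name="treatment")
y = pd.Series(X["x0"] + 2 * a + rng.normal(size=n), index=X.index, name="outcome")

matcher = Matching(with_replacement=True, n_neighbors=1, metric="mahalanobis")
matcher.fit(X, a, y)
match_df = matcher.match(X, a)                        # matches and distances per sample
outcomes = matcher.estimate_individual_outcome(X, a)  # one column per treatment value
print(outcomes.mean())                                # crude average potential outcomes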
예제 #31
0
###### Likelihood Computation ######
# Fold the angles in params into the proper range, so that
# they are centered at the mean.
N_CYCLE_FOLD_ANGLE = 10
for j in xrange(N_CYCLE_FOLD_ANGLE):
    mean = np.mean(params, axis=0)
    for i in xrange(3, 6):  # indices 3, 4, 5 are angles, the others are distances
        params[:, i][params[:, i] > mean[i] + np.pi] -= 2 * np.pi
        params[:, i][params[:, i] < mean[i] - np.pi] += 2 * np.pi
        if PARAMS_TLR[i] > mean[i] + np.pi:
            PARAMS_TLR[i] += 2 * np.pi
        if PARAMS_TLR[i] < mean[i] - np.pi:
            PARAMS_TLR[i] -= 2 * np.pi

est = EmpiricalCovariance(store_precision=True, assume_centered=False)
est.fit(params)
log_likelihood = est.score(PARAMS_TLR[None, :])
KT = 0.59
free_e = -log_likelihood * KT

print 'Log likelihood score:', log_likelihood
print 'Free energy:', free_e


###### Output the best conformer to pdb ######
def generate_bp_par_file(params, bps, out_name):
    assert(len(params) == len(bps))
    n_bp = len(params)
    # convert from radians to degrees
    params[:, 3:] = np.degrees(params[:, 3:])
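# --- Hedged companion sketch (not part of the original example) ---
# EmpiricalCovariance.score, used above, returns the mean Gaussian log-likelihood
# of the query points under the fitted mean and covariance. The sketch below
# checks that against scipy's multivariate normal on toy data; the data and
# shapes are illustrative assumptions.
import numpy as np
from scipy.stats import multivariate_normal
from sklearn.covariance import EmpiricalCovariance

rng = np.random.RandomState(0)
params_demo = rng.normal(size=(500, 6))
query = rng.normal(size=(1, 6))

est_demo = EmpiricalCovariance(store_precision=True, assume_centered=False)
est_demo.fit(params_demo)
sklearn_score = est_demo.score(query)
manual_score = multivariate_normal(mean=est_demo.location_,
                                   cov=est_demo.covariance_).logpdf(query).mean()
print(sklearn_score, manual_score)  # these should agree closely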
예제 #32
0
import numpy as np
from scipy.io import loadmat
from sklearn.covariance import EmpiricalCovariance, EllipticEnvelope
from sklearn.metrics import accuracy_score, classification_report

cardio_data = loadmat('cardio.mat')
estimator = EmpiricalCovariance()
cov = estimator.fit(cardio_data['X'])
mahal_cov = cov.mahalanobis(cardio_data['X'])
# take the indices of the 176 largest Mahalanobis distances
# (number of outliers in the cardio data = 176)
indexes = np.argpartition(mahal_cov, -176)[-176:]
y_pred = np.zeros(cardio_data['y'].shape)
y_pred[indexes] = 1
print(classification_report(cardio_data['y'], y_pred))
print(accuracy_score(cardio_data['y'], y_pred))

cov = EllipticEnvelope().fit(np.dot(cardio_data['X'].T, cardio_data['X']))
mahal_cov = cov.mahalanobis(cardio_data['X'])

indexes = np.argpartition(mahal_cov, -176)[-176:]
y_pred = np.zeros(cardio_data['y'].shape)
y_pred[indexes] = 1
print(classification_report(cardio_data['y'], y_pred))
print(accuracy_score(cardio_data['y'], y_pred))
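# --- Hedged alternative sketch (not part of the original example) ---
# Instead of cutting at a fixed count of 176 outliers as above, a chi-squared
# quantile can serve as a threshold: under a Gaussian assumption, the squared
# Mahalanobis distances follow a chi-squared distribution with d degrees of
# freedom. The random data below is an illustrative assumption.
import numpy as np
from scipy.stats import chi2
from sklearn.covariance import EmpiricalCovariance

rng = np.random.RandomState(0)
X_demo = rng.normal(size=(1000, 21))
cov_demo = EmpiricalCovariance().fit(X_demo)
d2 = cov_demo.mahalanobis(X_demo)                # squared Mahalanobis distances
threshold = chi2.ppf(0.99, df=X_demo.shape[1])   # 99th-percentile cutoff
y_flag = (d2 > threshold).astype(int)            # 1 marks a suspected outlier
print(y_flag.sum(), "points flagged")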
예제 #33
0
    # ####################################
    # PLSR - Marco
    # ####################################
    plsr = PLSR(X, Y)
    plsr.Initialize()
    plsr.EvaluateComponents()
    weights = plsr.GetWeights()
    comps = plsr.ReturnComponents()
    print('Covariance X (XX\'):\n %s' % str(X.dot(X.T).shape))
    print('Covariance X (plsr):\n %s' % str(plsr._covX.shape))

    print('Covariance X (numpy):\n %s' % np.cov(X, rowvar=False))
    print('Covariance X (pandas):\n %s' % dfx.cov().values)

    cov = EmpiricalCovariance(assume_centered=True)
    cov.fit(X)

    print('Covariance X (sklearn):\n %s' % cov.covariance_)

    # print('weights 0:\n %s' % weights[0])
    # print('weights 1:\n %s' % weights[1])

    # print('Y Scores:\n %s' % comps[0])
    # print('Components Y:\n %s' % comps[1])

    # ####################################
    # PLSR - SKLEARN
    # ####################################
    # print('\n\nPLS-SVD')
    # plsr = PLSSVD(n_components=2, scale=False)
    # plsr.fit(X,Y)
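# --- Hedged side note (not part of the original example) ---
# The comparison above mixes normalizations: EmpiricalCovariance divides by n
# (the maximum-likelihood estimate), while np.cov and pandas .cov() divide by
# n - 1, and assume_centered=True additionally skips mean subtraction. The toy
# data below is an illustrative assumption.
import numpy as np
from sklearn.covariance import EmpiricalCovariance

rng = np.random.RandomState(0)
X_demo = rng.normal(size=(50, 4))
skl_centered = EmpiricalCovariance(assume_centered=True).fit(X_demo).covariance_
print(np.allclose(skl_centered, X_demo.T @ X_demo / len(X_demo)))        # True: no mean removal
skl_default = EmpiricalCovariance(assume_centered=False).fit(X_demo).covariance_
print(np.allclose(skl_default, np.cov(X_demo, rowvar=False, bias=True)))  # True: 1/n, mean removed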
예제 #34
0
def get_covariance(X):
    cov = EmpiricalCovariance(assume_centered=True)
    cov.fit(X)
    return cov.covariance_
예제 #35
0
 def fit(self, X, n_jobs=-1):
     EmpiricalCovariance.fit(self, X)
     if not self.no_fit:
         CovarianceOutlierDetectionMixin.set_threshold(
             self, X, n_jobs=n_jobs)
     return self
예제 #36
0
hPurity_disc.Divide(hPurity_discDen)
hPurity_disc.Draw()
c.Print("purity_disc.png")

hMVAdisc_pt.Draw("colz")
c.Print("discriminator_vs_candPt.png")

from sklearn.covariance import EmpiricalCovariance

npRocInput = numpy.array(rocInput)
npRocAnswers = numpy.array(rocScore)
slimNpData0 = npRocInput[npRocAnswers == 0]
slimNpData1 = npRocInput[npRocAnswers == 1]

ecv = EmpiricalCovariance()
ecv.fit(slimNpData0)

from scipy.linalg import fractional_matrix_power


def diagElements(m):
    size = m.shape[0]
    return numpy.matrix(numpy.diag([m[i, i] for i in xrange(size)]))


def corrMat(m):
    sqrt_diag = fractional_matrix_power(diagElements(m), -0.5)
    return numpy.array(sqrt_diag * m * sqrt_diag)


corr0 = corrMat(numpy.matrix(ecv.covariance_))
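# --- Hedged check (not part of the original example) ---
# The corrMat helper above rescales a covariance matrix as D^(-1/2) C D^(-1/2)
# with D the diagonal of C; that is the same as dividing each entry by the
# product of the two standard deviations. The random data is an illustrative
# assumption.
import numpy
from scipy.linalg import fractional_matrix_power
from sklearn.covariance import EmpiricalCovariance

rng = numpy.random.RandomState(0)
demoData = rng.normal(size=(200, 5))
demoCov = EmpiricalCovariance().fit(demoData).covariance_
std = numpy.sqrt(numpy.diag(demoCov))
D = numpy.diag(numpy.diag(demoCov))
sqrt_inv = fractional_matrix_power(D, -0.5)
corr_a = sqrt_inv @ demoCov @ sqrt_inv
corr_b = demoCov / numpy.outer(std, std)
print(numpy.allclose(corr_a, corr_b))  # expected True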
예제 #37
0
###### Likelihood Computation ######
# Fold the angles in params into the proper range, so that
# they are centered at the mean.
N_CYCLE_FOLD_ANGLE = 10
for j in xrange(N_CYCLE_FOLD_ANGLE):
    mean = np.mean(params, axis=0)
    for i in xrange(3, 6):  # indices 3, 4, 5 are angles, the others are distances
        params[:, i][params[:, i] > mean[i] + np.pi] -= 2 * np.pi
        params[:, i][params[:, i] < mean[i] - np.pi] += 2 * np.pi
        if PARAMS_TLR[i] > mean[i] + np.pi:
            PARAMS_TLR[i] += 2 * np.pi
        if PARAMS_TLR[i] < mean[i] - np.pi:
            PARAMS_TLR[i] -= 2 * np.pi

est = EmpiricalCovariance(store_precision=True, assume_centered=False)
est.fit(params)
log_likelihood = est.score(PARAMS_TLR[None, :])
KT = 0.59
free_e = -log_likelihood * KT

print 'Log likelihood score:', log_likelihood
print 'Free energy:', free_e


###### Output the best conformer to pdb ######
def generate_bp_par_file(params, bps, out_name):
    assert (len(params) == len(bps))
    n_bp = len(params)
    # convert from radians to degrees
    params[:, 3:] = np.degrees(params[:, 3:])
예제 #38
0
    def fit(self, X):
        '''
        Copula fit using a Gaussian copula with marginals evaluated by Gaussian KDE.
        The precision matrix is estimated using the specified method, defaulting to graphical LASSO.
        :param X: input dataset
        :return: estimated precision matrix rho
        '''

        N, d = X.shape
        if self.scaler is not None:
            X_scale = self.scaler.fit_transform(X)
        else:
            X_scale = X
        if len(self.vertexes) == 0:
            self.vertexes = [str(id) for id in range(d)]

        self.theta = 1.0 / N
        cum_marginals = np.zeros_like(X)
        inv_norm_cdf = np.zeros_like(X)
        # inv_norm_cdf_scaled = np.zeros_like(X)
        self.kernels = list([])
        # TODO: complexity O(Nd) is high
        if self.verbose:
            print colored('>> Computing marginals', color='blue')
        for j in range(cum_marginals.shape[1]):
            self.kernels.append(gaussian_kde(X_scale[:, j]))
            cum_pdf_overall = self.kernels[-1].integrate_box_1d(
                X_scale[:, j].min(), X_scale[:, j].max())
            for i in range(cum_marginals.shape[0]):
                cum_marginals[i, j] = self.kernels[-1].integrate_box_1d(
                    X_scale[:, j].min(), X_scale[i, j]) / cum_pdf_overall
                # truncate cumulative marginals
                if cum_marginals[i, j] < self.theta:
                    cum_marginals[i, j] = self.theta
                elif cum_marginals[i, j] > 1 - self.theta:
                    cum_marginals[i, j] = 1 - self.theta
                # inverse of normal CDF: \Phi(F_j(x))^{-1}
                inv_norm_cdf[i, j] = norm.ppf(cum_marginals[i, j])
                # scaled to preserve mean and variance: u_j + \sigma_j*\Phi(F_j(x))^{-1}
                # inv_norm_cdf_scaled[i, j] = X_scale[:, j].mean() + X_scale[:, j].std() * inv_norm_cdf[i, j]

        if self.method == 'mle':
            # maximum-likelihood estimator
            empirical_cov = EmpiricalCovariance()
            empirical_cov.fit(inv_norm_cdf)
            if self.verbose:
                print colored('>> Running MLE to estimate precision matrix',
                              color='blue')

            self.est_cov = empirical_cov.covariance_
            self.corr = scale_matrix(self.est_cov)
            self.precision_ = inv(empirical_cov.covariance_)

        if self.method == 'glasso':
            if self.verbose:
                print colored('>> Running glasso to estimate precision matrix',
                              color='blue')

            empirical_cov = EmpiricalCovariance()
            empirical_cov.fit(inv_norm_cdf)
            # shrink the covariance to avoid numerical instability
            shrunk_cov = shrunk_covariance(empirical_cov.covariance_,
                                           shrinkage=0.8)
            self.est_cov, self.precision_ = graph_lasso(emp_cov=shrunk_cov,
                                                        alpha=self.penalty,
                                                        verbose=self.verbose,
                                                        max_iter=self.max_iter)
            self.corr = scale_matrix(self.est_cov)

        if self.method == 'ledoit_wolf':
            if self.verbose:
                print colored(
                    '>> Running ledoit_wolf to estimate precision matrix',
                    color='blue')

            self.est_cov, _ = ledoit_wolf(inv_norm_cdf)
            self.corr = scale_matrix(self.est_cov)
            self.precision_ = linalg.inv(self.est_cov)

        if self.method == 'spectral':
            '''L2 method; see the paper "Inverse covariance estimation for
            high-dimensional data in linear time and space", eq. (8).
            '''
            if self.verbose:
                print colored(
                    '>> Running Riccati to estimate precision matrix',
                    color='blue')

            # TODO: note estimated cov is sample cov
            self.est_cov, self.precision_ = spectral(inv_norm_cdf,
                                                     rho=2 * self.penalty,
                                                     assume_centered=False)
            self.corr = scale_matrix(self.est_cov)

        if self.method == 'pc':
            clf = pgmlearner.PGMLearner()
            data_list = list([])
            for row_id in range(X_scale.shape[0]):
                instance = dict()
                for i, n in enumerate(self.vertexes):
                    instance[n] = X_scale[row_id, i]
                data_list.append(instance)
            graph = clf.lg_constraint_estimatestruct(data=data_list,
                                                     pvalparam=self.pval,
                                                     bins=self.bins)
            dag = np.zeros(shape=(len(graph.V), len(graph.V)))
            for e in graph.E:
                dag[self.vertexes.index(e[0]), self.vertexes.index(e[1])] = 1
            self.conditional_independences_ = dag

        if self.method == 'ic':
            df = dict()
            variable_types = dict()
            for j in range(X_scale.shape[1]):
                df[self.vertexes[j]] = X_scale[:, j]
                variable_types[self.vertexes[j]] = 'c'
            data = pd.DataFrame(df)
            # run the search
            ic_algorithm = IC(RobustRegressionTest,
                              data,
                              variable_types,
                              alpha=self.pval)
            graph = ic_algorithm.search()
            dag = np.zeros(shape=(X_scale.shape[1], X_scale.shape[1]))
            for e in graph.edges(data=True):
                i = self.vertexes.index(e[0])
                j = self.vertexes.index(e[1])
                dag[i, j] = 1
                dag[j, i] = 1
                arrows = set(e[2]['arrows'])
                head_len = len(arrows)
                if head_len > 0:
                    head = arrows.pop()
                    if head_len == 1 and head == e[0]:
                        dag[i, j] = 0
                    if head_len == 1 and head == e[1]:
                        dag[j, i] = 0
            self.conditional_independences_ = dag

        # finally we fit the structure
        self.fit_structure(self.precision_)
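# --- Hedged, simplified sketch (not part of the original example) ---
# The core of the copula fit above: map each marginal to (0, 1), push it through
# the inverse normal CDF, then estimate the dependence structure on the
# transformed data. Ranks replace the Gaussian-KDE marginals used above; all
# names and data here are illustrative assumptions.
import numpy as np
from scipy.stats import norm, rankdata
from sklearn.covariance import EmpiricalCovariance

rng = np.random.RandomState(0)
X_demo = rng.lognormal(size=(300, 4))                   # skewed marginals
n_obs = X_demo.shape[0]
u = np.column_stack([rankdata(X_demo[:, j]) / (n_obs + 1)
                     for j in range(X_demo.shape[1])])  # empirical CDF values
z = norm.ppf(u)                                         # Gaussianized marginals
emp = EmpiricalCovariance().fit(z)
precision = np.linalg.inv(emp.covariance_)              # copula precision matrix
print(precision.shape)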
예제 #39
0
File: pyRMT.py  Project: akapocsi/pyRMT
def optimalShrinkage(X, return_covariance=False):
    """This function computes a cleaned, optimal shrinkage, 
       rotationally-invariant estimator (RIE) of the true correlation 
       matrix C underlying the noisy, in-sample estimate 
       E = 1/T X * transpose(X)
       associated to a design matrix X of shape (T, N) (T measurements 
       and N features).

       One approach to getting a cleaned estimator that predates the
       optimal shrinkage, RIE estimator consists in inverting the 
       Marcenko-Pastur equation so as to replace the eigenvalues
       from the spectrum of E by an estimation of the true ones.

       This approach is known to be numerically-unstable, in addition
       to failing to account for the overlap between the sample eigenvectors
       and the true eigenvectors. How to compute such overlaps was first
       explained by Ledoit and Peche (cf. reference below). Their procedure
       was extended by Bun, Bouchaud and Potters, who also correct
       for a systematic downward bias in small eigenvalues.
       It is this debiased, optimal shrinkage, rotationally-invariant
       estimator that the function at hand implements. 
         
       Parameters
       ----------
       X: design matrix, of shape (T, N), where T denotes the number
           of samples (think measurements in a time series), while N
           stands for the number of features (think of stock tickers).
           
        return_covariance: type bool (default: False)
           If set to True, compute the standard deviations of each individual
           feature across observations, clean the underlying matrix
           of pairwise correlations, then re-apply the standard
           deviations and return a cleaned variance-covariance matrix.

       Returns
       -------
       E_RIE: type numpy.ndarray, shape (N, N)
           Cleaned estimator of the true correlation matrix C. A sample
           estimator of C is the empirical covariance matrix E 
           estimated from X. E is corrupted by in-sample noise.
           E_RIE is the optimal shrinkage, rotationally-invariant estimator 
           (RIE) of C computed following the procedure of Joel Bun 
           and colleagues (cf. references below).
           
           If return_covariance=True, E_RIE corresponds to a cleaned
           variance-covariance matrix.

       References
       ----------
       * "Eigenvectors of some large sample covariance matrix ensembles",
         O. Ledoit and S. Peche
         Probability Theory and Related Fields, Vol. 151 (1), pp 233-264
       * "Rotational invariant estimator for general noisy matrices",
         J. Bun, R. Allez, J.-P. Bouchaud and M. Potters
         arXiv: 1502.06736 [cond-mat.stat-mech]
       * "Cleaning large Correlation Matrices: tools from Random Matrix Theory",
         J. Bun, J.-P. Bouchaud and M. Potters
         arXiv: 1610.08104 [cond-mat.stat-mech]
    """
    
    if not isinstance(return_covariance, bool):
        raise TypeError("return_covariance must be a bool")

    T, N, transpose_flag = checkDesignMatrix(X)
    if transpose_flag:
        X = X.T
        
    if not return_covariance:
        X = StandardScaler(with_mean=False,
                           with_std=True).fit_transform(X)

    ec = EmpiricalCovariance(store_precision=False,
                             assume_centered=True)
    ec.fit(X)
    E = ec.covariance_
    
    if return_covariance:
        inverse_std = 1./np.sqrt(np.diag(E))
        E *= inverse_std
        E *= inverse_std.reshape(-1, 1)

    eigvals, eigvecs = np.linalg.eigh(E)
    eigvecs = eigvecs.T

    q = N / float(T)
    lambda_N = eigvals[0]  # The smallest empirical eigenvalue,
                           # given that the function used to compute
                           # the spectrum of a Hermitian or symmetric
                           # matrix - namely np.linalg.eigh - returns
                           # the eigenvalues in ascending order.

    xis = map(lambda x: xiHelper(x, q, E), eigvals)
    Gammas = map(lambda x: gammaHelper(x, q, N, lambda_N), eigvals)
    xi_hats = map(lambda a, b: a * b if b > 1 else a, xis, Gammas)

    E_RIE = np.zeros((N, N), dtype=float)
    for xi_hat, eigvec in zip(xi_hats, eigvecs):
        eigvec = eigvec.reshape(-1, 1)
        E_RIE += xi_hat * eigvec.dot(eigvec.T)
        
    tmp = 1./np.sqrt(np.diag(E_RIE))
    E_RIE *= tmp
    E_RIE *= tmp.reshape(-1, 1)
    
    if return_covariance:
        std = 1./inverse_std
        E_RIE *= std
        E_RIE *= std.reshape(-1, 1)

    return E_RIE
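# --- Hedged baseline sketch (not part of the original example) ---
# sklearn's LedoitWolf estimator is a simpler linear-shrinkage alternative to
# the rotationally-invariant estimator above and makes a handy sanity baseline
# when T is not much larger than N. The synthetic design matrix is an
# illustrative assumption.
import numpy as np
from sklearn.covariance import EmpiricalCovariance, LedoitWolf

rng = np.random.RandomState(0)
T_demo, N_demo = 120, 100                  # few samples per feature: noisy spectrum
X_demo = rng.normal(size=(T_demo, N_demo))
emp_cov = EmpiricalCovariance().fit(X_demo).covariance_
lw = LedoitWolf().fit(X_demo)
print("empirical eigenvalue range:  ", np.linalg.eigvalsh(emp_cov)[[0, -1]])
print("Ledoit-Wolf eigenvalue range:", np.linalg.eigvalsh(lw.covariance_)[[0, -1]])
print("Ledoit-Wolf shrinkage coefficient:", lw.shrinkage_)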
예제 #40
0
def rand_pts_overall_cov_init(X,
                              n_components,
                              cov_est_method='LW',
                              covariance_type='full',
                              random_state=None):
    """
    Sets the means to randomly selected points. Sets the covariances to the overall covariance matrix.

    Parameters
    ----------
    X: (n_samples, n_features)

    n_components: int

    cov_est_method: str
        Must be one of ['empirical', 'LW', 'OAS'] for the
        empirical covariance matrix estimate, Ledoit-Wolf, and
        Oracle Approximating Shrinkage estimators. See
        sklearn.covariance for details.

    random_state: None, int, random seed
        Random seed.

    """
    assert cov_est_method in ['empirical', 'LW', 'OAS']
    assert covariance_type in ['full', 'diag', 'tied', 'spherical']
    n_samples = X.shape[0]

    # randomly select data points to start cluster centers from
    rng = check_random_state(random_state)

    # estimate global covariance
    if cov_est_method == 'empirical':
        cov_estimator = EmpiricalCovariance(store_precision=False)
    elif cov_est_method == 'LW':
        cov_estimator = LedoitWolf(store_precision=False)
    elif cov_est_method == 'OAS':
        cov_estimator = OAS(store_precision=False)
    cov_estimator.fit(X)
    cov_est = cov_estimator.covariance_

    # set covariance matrix for each cluster
    if covariance_type == 'tied':
        covs = cov_est

    elif covariance_type == 'full':
        covs = np.stack([cov_est for _ in range(n_components)])

    elif covariance_type == 'diag':
        # each component gets the diagonal of the estimated covariance matrix
        covs = np.diag(cov_est)
        covs = np.repeat(covs.reshape(1, -1), repeats=n_components, axis=0)

    elif covariance_type == 'spherical':
        # each component gets the average of the variances
        covs = np.diag(cov_est).mean()
        covs = np.repeat(covs, repeats=n_components)

    # set means to random data points
    rand_idxs = rng.choice(range(n_samples), replace=False, size=n_components)

    means = [X[pt_idx, ] for pt_idx in rand_idxs]
    means = np.array(means)

    return means, covs
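# --- Hedged usage sketch (not part of the original example) ---
# How rand_pts_overall_cov_init above might be called, assuming the imports it
# relies on (EmpiricalCovariance, LedoitWolf, OAS, check_random_state) are in
# scope; the random data and component count are illustrative assumptions.
import numpy as np

rng = np.random.RandomState(0)
X_demo = rng.normal(size=(500, 8))
means, covs = rand_pts_overall_cov_init(X_demo,
                                        n_components=3,
                                        cov_est_method='LW',
                                        covariance_type='full',
                                        random_state=0)
print(means.shape)  # (3, 8): one randomly chosen data point per component
print(covs.shape)   # (3, 8, 8): the shared Ledoit-Wolf estimate, stacked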
예제 #41
0
# save for heuristic correction
age = df_test['var15']
age_ecdf = ECDF(df_train['var15'])
df_train['var15'] = age_ecdf(df_train['var15'])
df_test['var15'] = age_ecdf(df_test['var15'])

# feature engineering
df_train.loc[df_train['var3'] == -999999.000000, 'var3'] = 2.0
df_train['num_zeros'] = (df_train == 0).sum(axis=1)
df_test.loc[df_test['var3'] == -999999.000000, 'var3'] = 2.0
df_test['num_zeros'] = (df_test == 0).sum(axis=1)

# outliers
ec = EmpiricalCovariance()
ec = ec.fit(df_train)
m2 = ec.mahalanobis(df_train)
df_train = df_train[m2 < 40000]
df_target = df_target[m2 < 40000]

# clip
# df_test = df_test.clip(df_train.min(), df_train.max(), axis=1)

# standard preprocessing
prep = Pipeline([
    ('cd', ColumnDropper(drop=ZERO_VARIANCE_COLUMNS + CORRELATED_COLUMNS)),
    ('std', StandardScaler())
])

X_train = prep.fit_transform(df_train)
X_test = prep.transform(df_test)
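# --- Hedged sketch (not part of the original example) ---
# ColumnDropper in the Pipeline above is not a scikit-learn built-in; a minimal
# sketch of what such a transformer might look like is given below, assuming it
# simply drops the named columns from a DataFrame.
from sklearn.base import BaseEstimator, TransformerMixin


class ColumnDropper(BaseEstimator, TransformerMixin):
    """Drop the given columns from a pandas DataFrame."""

    def __init__(self, drop=()):
        self.drop = drop

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X.drop(columns=[c for c in self.drop if c in X.columns])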