def calculate_distance_metrics(df, time_delta=pd.Timedelta(7, "D")):
    """Compute distribution-drift metrics for each date versus ``time_delta`` earlier.

    :param df: DataFrame of LoB (line-of-business) fraction columns, indexed by
        date; the index is joined on as ``date_`` (assumes the index name is
        ``date_`` — TODO confirm against callers).
    :param time_delta: Offset used to pair each row with an earlier row
        (default: 7 days).
    :return: DataFrame with the LoB columns dropped and metric columns
        ``ks_distance``, ``wasserstein``, ``energy_dist`` and ``mahalanobis``
        added. Rows with no matching row ``time_delta`` earlier are dropped by
        the inner merge.
    """
    lobs = df.columns.tolist()
    prev_cols = [x + "_prev" for x in lobs]

    # Self-join LoB fractions, offset by time_delta, so each row carries the
    # current and the previous distribution side by side.
    df = df.merge(df.set_index(df.index.to_series() + time_delta),
                  on='date_', suffixes=('', '_prev'))

    # For each cumulative-distribution-fraction based metric, compare the
    # current distribution to the previous one.
    # Kolmogorov-Smirnov distance (statistic only; the p-value is discarded).
    df['ks_distance'] = df.apply(
        lambda row: scipy.stats.ks_2samp(row[lobs], row[prev_cols])[0],
        axis=1)
    # Wasserstein distance (aka Earth-Mover's distance).
    df["wasserstein"] = df.apply(
        lambda row: scipy.stats.wasserstein_distance(row[lobs], row[prev_cols]),
        axis=1)
    # Cramer-von Mises distance (aka energy distance).
    df["energy_dist"] = df.apply(
        lambda row: scipy.stats.energy_distance(row[lobs], row[prev_cols]),
        axis=1)
    df.drop(columns=prev_cols, inplace=True)

    # Mahalanobis distance (the multivariate equivalent of the standard
    # deviation): robust (MinCovDet) distances over the first 10 principal
    # components of the standardized LoB fractions.
    pca = decomposition.PCA().fit_transform(
        preprocessing.StandardScaler().fit_transform(df[lobs]))[:, :10]
    df['mahalanobis'] = covariance.MinCovDet().fit(pca).mahalanobis(pca)
    return df.drop(columns=lobs)
def robust_mahalanobis_with_chi2(feat, prob_reject, ret_dist=False):
    '''Reject outliers using one-class classification based on the mahalanobis
    distance estimate from a robust covariance as calculated by minimum
    covariance determinant.

    :Parameters:

    feat : array
           2D array where each row is a feature and each column a factor
    prob_reject : float
                  Probability threshold for rejecting outliers
    ret_dist : bool
               If True, also return the squared Mahalanobis distances

    :Returns:

    sel : array
          Boolean selection array for each feature
    dist : array
           Squared Mahalanobis distances (only when ``ret_dist`` is True)
    '''
    # Center each factor on its median (robust to outliers). Use a new array
    # rather than `-=` so the caller's input is not mutated in place.
    feat = feat - numpy.median(feat, axis=0)
    try:
        robust_cov = skcov.MinCovDet().fit(feat)
    except Exception:
        # MinCovDet can fail on degenerate data; fall back to the classical
        # maximum-likelihood covariance estimate.
        robust_cov = skcov.EmpiricalCovariance().fit(feat)
    dist = robust_cov.mahalanobis(feat)
    # Squared Mahalanobis distances of Gaussian data follow a chi-squared
    # distribution with degrees of freedom equal to the number of factors.
    cut = scipy.stats.chi2.ppf(prob_reject, feat.shape[1])
    sel = dist < cut
    return (sel, dist) if ret_dist else sel
def AddMahalanobis(df):
    """Return a copy of ``df`` with one robust Mahalanobis-distance column per
    type.

    For each distinct value in ``df.type``, a MinCovDet estimator is fitted on
    the NUMERIC_FEATURES of the rows of that type, and the resulting distance
    for every row of ``df`` is stored in a new column named ``'<type>_md'``.
    """
    result = df.copy()
    for type_value in set(df.type):
        rows_of_type = df[df.type == type_value]
        estimator = covariance.MinCovDet()
        estimator.fit(rows_of_type[NUMERIC_FEATURES])
        result[type_value + '_md'] = estimator.mahalanobis(df[NUMERIC_FEATURES])
    return result
def robustcovest(df, covtype):
    """Estimate a covariance matrix for the columns of ``df``.

    :param df: DataFrame of observations (rows) by variables (columns).
    :param covtype: Estimator to use:
        ``'sample'`` — unbiased sample covariance,
        ``'LedoitWolf'`` — Ledoit-Wolf shrinkage estimate,
        ``'MinDet'`` — minimum covariance determinant (robust to outliers).
    :return: Covariance estimate as a DataFrame with df's columns as both
        index and columns.
    :raises ValueError: If ``covtype`` is not one of the names above
        (previously the function silently returned None).
    """
    if covtype == 'sample':
        return pd.DataFrame(np.cov(df, rowvar=False, ddof=1),
                            index=df.columns,
                            columns=df.columns)
    if covtype == 'LedoitWolf':
        # np.matrix is deprecated; pass a plain ndarray to the estimator.
        lw = skc.LedoitWolf()
        return pd.DataFrame(lw.fit(np.asarray(df)).covariance_,
                            index=df.columns,
                            columns=df.columns)
    if covtype == 'MinDet':
        return pd.DataFrame(skc.MinCovDet().fit(df).covariance_,
                            index=df.columns,
                            columns=df.columns)
    raise ValueError('Unknown covtype: {}'.format(covtype))
def __init__(self, dim, estimator='OAS', **kwargs):
    """Initialise the parameter object and select a covariance estimator.

    ``estimator`` names one of sklearn's covariance estimators; the chosen
    estimator is constructed with ``assume_centered=True`` and stored on
    ``self._estimator``. ``dim`` and ``kwargs`` are forwarded to the base
    class. Raises ValueError for an unrecognised estimator name.
    """
    super(SKGaussianParams, self).__init__(dim, **kwargs)
    estimator_classes = {
        'EmpiricalCovariance': covariance.EmpiricalCovariance,
        'LedoitWolf': covariance.LedoitWolf,
        'MinCovDet': covariance.MinCovDet,
        'OAS': covariance.OAS,
        'ShrunkCovariance': covariance.ShrunkCovariance,
    }
    if estimator not in estimator_classes:
        raise ValueError('Unknown estimator: {}'.format(estimator))
    self._estimator = estimator_classes[estimator](assume_centered=True)
def get_covariance(var_iter, shrinkage=0.1):
    """Build a shrunk, robustified covariance matrix from genotype vectors.

    :param var_iter: Iterable of ``(samples, genos, quals, variant)`` tuples;
        only ``genos`` (per-sample genotype values for one variant) is used.
    :param shrinkage: Shrinkage coefficient passed to sklearn's
        ``shrunk_covariance`` (default 0.1).
    :return: Covariance matrix (numpy array) across samples.
    """
    rows = []
    for samples, genos, quals, variant in var_iter:
        # Skip variants that are missing, contain NaNs, or are monomorphic
        # (a constant row contributes no covariance signal).
        if genos is None:
            continue
        # .any() on the ndarray is safe for any dimensionality, unlike the
        # builtin any() which the original used.
        if np.isnan(genos).any():
            continue
        if len(np.unique(genos)) == 1:
            continue
        rows.append(genos)
    cov = np.cov(np.array(rows, dtype='f').T)
    # Force a unit diagonal before shrinking.
    cov[np.diag_indices_from(cov)] = 1
    # Shrink toward the scaled identity for numerical stability.
    cov = covariance.shrunk_covariance(cov, shrinkage=shrinkage)
    # Robustify via minimum covariance determinant; best-effort — keep the
    # shrunk estimate if MCD cannot be fitted.
    try:
        cov = covariance.MinCovDet().fit(cov).covariance_
    except ValueError:
        pass
    return cov
# Pull the query results into a DataFrame keyed by the cursor's column names.
cur.execute(sql)
colnames = [desc[0] for desc in cur.description]
raw_data = cur.fetchall()
df = pd.DataFrame(raw_data, columns=colnames)

# Keep only the dependent metrics: drop the identifier and the error /
# confidence columns, and treat missing values as zero. The positional-axis
# form df.drop(label, 1) was removed in pandas 2.0; use columns= explicitly.
dependents = df.drop(columns=['userid',
                              'distance_pct_path_error',
                              'classification_confidence',
                              'suspension_fit_error']).fillna(0)

# Min-max scale each column to [0, 1].
# NOTE(review): a constant column would divide by zero here — confirm inputs.
dependents = dependents.apply(
    lambda x: (x - np.min(x)) / (np.max(x) - np.min(x)))

# Robust location/scatter estimate, then Mahalanobis distances.
mcd = covariance.MinCovDet()
mcd.fit(dependents)
# mahalanobis() already centers observations on mcd.location_; the original
# subtracted the location a second time, shifting every point off-center.
distances = mcd.mahalanobis(dependents) ** 0.5

# Rank observations by distance and keep the 21 most extreme ones.
distances_with_idx = list(zip(range(len(distances)), distances))
pctile_cutoff = 0  # originally np.percentile(..., 90); threshold disabled
filtered_distances = [i for i in distances_with_idx if i[1] > pctile_cutoff]
filtered_distances.sort(key=lambda x: -x[1])
filtered_distances = filtered_distances[:21]

# Undo the min-max scaling on the first two factors to recover the robust
# center in original units (assumes trip_cnt / avg_minutes are the first two
# columns of dependents — TODO confirm column order from the query).
max_trips = max(df['trip_cnt'])
trip_center = mcd.location_[0] * max_trips
max_minutes = max(df['avg_minutes'])
minutes_center = mcd.location_[1] * max_minutes