def two_point(data, bins, random_state=None):
    """Two-point correlation function, using the Landy-Szalay method.

    The background ("random") sample is generated from ``data`` itself by
    independently shuffling every feature column except the last one.

    Parameters
    ----------
    data : array_like
        Input data, shape = [n_samples, n_features].  A 1D array is treated
        as n_samples points with a single feature.
    bins : array_like
        Bin edges within which to compute the 2-point correlation
        (1D, shape = Nbins + 1).
    random_state : int, np.random.RandomState or None, optional
        Forwarded to ``check_random_state`` to build the generator used for
        shuffling.  The default (None) is what this function always used
        before the parameter existed.

    Returns
    -------
    corr : ndarray
        The estimate of the correlation function within each bin,
        shape = Nbins.  Bins that contain no random-random pairs are nan.

    Raises
    ------
    ValueError
        If ``bins`` is not 1D or ``data`` is neither 1D nor 2D.
    """
    data = np.asarray(data)
    bins = np.asarray(bins)

    if bins.ndim != 1:
        raise ValueError("bins must be a 1D array")

    if data.ndim == 1:
        data = data[:, np.newaxis]
    elif data.ndim != 2:
        raise ValueError("data should be 1D or 2D")

    rng = check_random_state(random_state)
    n_features = data.shape[1]

    # Shuffle all but one axis to make the background distribution.
    data_R = data.copy()
    for i in range(n_features - 1):
        rng.shuffle(data_R[:, i])

    factor = len(data_R) * 1. / len(data)

    # Trees embed the points so that pair counting is fast.
    KDT_D = KDTree(data)    # actual points
    KDT_R = KDTree(data_R)  # randomized background points

    # Cumulative pair counts within each radius bins[i] ...
    counts_DD = KDT_D.two_point_correlation(data, bins)
    counts_RR = KDT_R.two_point_correlation(data_R, bins)

    # ... turned into counts per bin (between bins[i] and bins[i + 1]).
    DD = np.diff(counts_DD)
    RR = np.diff(counts_RR)

    # Avoid dividing by zero: remember the empty RR bins, divide by 1 there,
    # and overwrite those entries with nan once corr is computed.
    RR_zero = (RR == 0)
    RR[RR_zero] = 1

    # Cross counts between the actual points and the background.
    counts_DR = KDT_R.two_point_correlation(data, bins)
    DR = np.diff(counts_DR)

    # The Landy-Szalay estimator.
    corr = (factor ** 2 * DD - 2 * factor * DR + RR) / RR
    corr[RR_zero] = np.nan

    return corr
def two_point(data, data_R, bins, method='landy-szalay', seed=1234,
              saverandom=False):
    """Evaluate the two-point correlation function using KD-tree pair counts.

    Errors are not returned.  A bootstrap sampling can be run N times by the
    caller to evaluate errors.

    Parameters
    ----------
    data : array_like
        n samples x m features data array, e.g. x, y, z positions.  A 1D
        array is treated as n samples with a single feature.
    data_R : array_like or None
        Random comparison sample with the same number of features as
        ``data``.  A 1D array is treated as a single-feature sample.  If
        None, a background sample is built by shuffling every feature column
        of ``data`` except the last one.
    bins : array_like
        1D array of bin edges.
    method : {'landy-szalay', 'standard'}
        Estimator to use.
    seed : int
        Seed of the ``np.random.RandomState`` used for the shuffling.
    saverandom : bool
        Accepted for backward compatibility; not used by this function.

    Returns
    -------
    corr : ndarray
        Correlation estimate for each bin (``len(bins) - 1`` values).  Bins
        without random-random pairs are nan.

    Raises
    ------
    ValueError
        If ``method`` is unknown, ``bins`` is not 1D, ``data`` is neither 1D
        nor 2D, or ``data_R`` does not match the feature count of ``data``.
    """
    data = np.asarray(data)
    bins = np.asarray(bins)

    if method not in ['standard', 'landy-szalay']:
        raise ValueError("method must be 'standard' or 'landy-szalay'")

    if bins.ndim != 1:
        raise ValueError("bins must be a 1D array")

    if data.ndim == 1:
        data = data[:, np.newaxis]
    elif data.ndim != 2:
        raise ValueError("data should be 1D or 2D")

    rng = np.random.RandomState(seed)
    n_features = data.shape[1]

    if data_R is None:
        # Shuffle all but one axis to get the background distribution.
        data_R = data.copy()
        for i in range(n_features - 1):
            rng.shuffle(data_R[:, i])
    else:
        data_R = np.asarray(data_R)
        # Mirror the treatment of 1D ``data`` so that a 1D random sample is
        # accepted for single-feature data.
        if data_R.ndim == 1:
            data_R = data_R[:, np.newaxis]
        if (data_R.ndim != 2) or (data_R.shape[-1] != n_features):
            raise ValueError('data_R must have same n_features as data')

    # Imported only once the arguments are known to be valid.
    from sklearn.neighbors import KDTree

    factor = len(data_R) * 1. / len(data)

    KDT_D = KDTree(data)
    KDT_R = KDTree(data_R)

    print("Correlating Data, data size: {}".format(len(data)))
    counts_DD = KDT_D.two_point_correlation(data, bins)

    print('Correlating Random, random size: {}'.format(len(data_R)))
    counts_RR = KDT_R.two_point_correlation(data_R, bins)

    # Cumulative counts -> counts per bin.
    DD = np.diff(counts_DD)
    RR = np.diff(counts_RR)

    # Guard the division: empty RR bins are divided by 1 and set to nan below.
    RR_zero = (RR == 0)
    RR[RR_zero] = 1

    if method == 'standard':
        corr = factor ** 2 * DD / RR - 1
    elif method == 'landy-szalay':
        print("Cross Correlating")
        counts_DR = KDT_R.two_point_correlation(data, bins)
        DR = np.diff(counts_DR)
        print("Evaluating correlation using {}".format(method))
        corr = (factor ** 2 * DD - 2 * factor * DR + RR) / RR

    corr[RR_zero] = np.nan

    return corr
def two_point(data, bins, method='standard', data_R=None, random_state=None):
    """Two-point correlation function

    Parameters
    ----------
    data : array_like
        input data, shape = [n_samples, n_features].  A 1D array is treated
        as n_samples points with a single feature.
    bins : array_like
        bins within which to compute the 2-point correlation.
        shape = Nbins + 1
    method : string
        "standard" or "landy-szalay".
    data_R : array_like (optional)
        if specified, use this as the random comparison sample.  It must
        have the same number of features as ``data``; a 1D array is treated
        as a single-feature sample.  If not specified, the background is
        built by shuffling all but the last feature column of ``data``.
    random_state : integer, np.random.RandomState, or None
        specify the random state to use for generating background

    Returns
    -------
    corr : ndarray
        the estimate of the correlation function within each bin
        shape = Nbins.  Bins without random-random pairs are nan.

    Raises
    ------
    ValueError
        if ``method`` is unknown, ``bins`` is not 1D, ``data`` is neither 1D
        nor 2D, or ``data_R`` does not match the feature count of ``data``.
    """
    data = np.asarray(data)
    bins = np.asarray(bins)

    # Reject bad arguments before doing any other work.
    if method not in ['standard', 'landy-szalay']:
        raise ValueError("method must be 'standard' or 'landy-szalay'")

    if bins.ndim != 1:
        raise ValueError("bins must be a 1D array")

    if data.ndim == 1:
        data = data[:, np.newaxis]
    elif data.ndim != 2:
        raise ValueError("data should be 1D or 2D")

    rng = check_random_state(random_state)
    n_features = data.shape[1]
    Nbins = len(bins) - 1

    # shuffle all but one axis to get background distribution
    if data_R is None:
        data_R = data.copy()
        for i in range(n_features - 1):
            rng.shuffle(data_R[:, i])
    else:
        data_R = np.asarray(data_R)
        # Mirror the treatment of 1D ``data`` so that a 1D random sample is
        # accepted for single-feature data.
        if data_R.ndim == 1:
            data_R = data_R[:, np.newaxis]
        if (data_R.ndim != 2) or (data_R.shape[-1] != n_features):
            raise ValueError('data_R must have same n_features as data')

    factor = len(data_R) * 1. / len(data)

    if sklearn_has_two_point:
        # Fast two-point correlation functions added in scikit-learn v. 0.14
        KDT_D = KDTree(data)
        KDT_R = KDTree(data_R)

        counts_DD = KDT_D.two_point_correlation(data, bins)
        counts_RR = KDT_R.two_point_correlation(data_R, bins)

    else:
        warnings.warn("Version 0.3 of astroML will require scikit-learn "
                      "version 0.14 or higher for correlation function "
                      "calculations. "
                      "Upgrade to sklearn 0.14+ now for much "
                      "faster correlation function calculations.")

        BT_D = BallTree(data)
        BT_R = BallTree(data_R)

        # One radius query per bin edge; summing the per-point neighbor
        # counts gives the cumulative pair count for that radius.
        counts_DD = np.zeros(Nbins + 1)
        counts_RR = np.zeros(Nbins + 1)

        for i, radius in enumerate(bins):
            counts_DD[i] = np.sum(BT_D.query_radius(data, radius,
                                                    count_only=True))
            counts_RR[i] = np.sum(BT_R.query_radius(data_R, radius,
                                                    count_only=True))

    # Cumulative counts -> counts per bin.
    DD = np.diff(counts_DD)
    RR = np.diff(counts_RR)

    # check for zero in the denominator: divide by 1 there, set nan below
    RR_zero = (RR == 0)
    RR[RR_zero] = 1

    if method == 'standard':
        corr = factor ** 2 * DD / RR - 1
    elif method == 'landy-szalay':
        if sklearn_has_two_point:
            counts_DR = KDT_R.two_point_correlation(data, bins)
        else:
            counts_DR = np.zeros(Nbins + 1)
            for i, radius in enumerate(bins):
                counts_DR[i] = np.sum(BT_R.query_radius(data, radius,
                                                        count_only=True))
        DR = np.diff(counts_DR)

        corr = (factor ** 2 * DD - 2 * factor * DR + RR) / RR

    corr[RR_zero] = np.nan

    return corr
# Pair counts between two data samples (Data_D, Data_D1) and two random
# samples (Data_R, Data_R1); one tree per sample.
KDT_D = KDTree(Data_D)
KDT_D1 = KDTree(Data_D1)
KDT_R = KDTree(Data_R)
KDT_R1 = KDTree(Data_R1)

# NOTE(review): Nbins is 30, but ``bins`` below holds 30 edges and therefore
# yields 29 differenced bins -- confirm which of the two is intended.
Nbins = 30
bins = np.arange(0, 30)
print(bins)

# Calculating two-point correlation counts using the sklearn function.
# NOTE(review): counts_DD1 is computed from Data_D against its own tree and
# KDT_D1 is never used in this fragment -- confirm whether the D x D1 cross
# count (querying with Data_D1) was intended here.
counts_DD1 = KDT_D.two_point_correlation(Data_D, bins)
counts_RR1 = KDT_R.two_point_correlation(Data_R1, bins)

# Cumulative counts -> counts per bin.
DD1 = np.diff(counts_DD1)
RR1 = np.diff(counts_RR1)

# Replace empty random-random bins by 1; RR1_zero records where they were.
RR1_zero = (RR1 == 0)
RR1[RR1_zero] = 1

# Data-random cross counts in both directions.
counts_DR1 = KDT_R1.two_point_correlation(Data_D, bins)
counts_D1R = KDT_R.two_point_correlation(Data_D1, bins)

DR1 = np.diff(counts_DR1)
D1R = np.diff(counts_D1R)
from pylab import plot, xlabel, ylabel, show
import time

NRAND = 10000  # how many randoms?

# get the randoms in this box (32 Mpc/h on a side)
rng = np.random.RandomState(0)
ran = rng.random_sample((NRAND, 3)) * 32
# rescale the third coordinate from [0, 32) to [0, 4)
ran[:, 2] = ran[:, 2] * 4. / 32.

# logarithmically-spaced bins
r = np.linspace(-1, 0.3, 10)
r = 10.0 ** r
print(r)

tree = KDTree(ran)
xi = tree.two_point_correlation(ran, r)

# remove self-pairs
xi = xi - NRAND

# make these a bin, rather than Npairs(<r).  The differences must be taken
# from the untouched cumulative counts: an in-place forward loop
# (xi[i+1] -= xi[i]) would subtract values that were already differenced.
# xi[0] stays Npairs(<r[0]).
nbin = len(r)
xi[1:] = np.diff(xi)

print(xi)
print(r)
def _ball_tree_pair_counts(tree, points, radii):
    """Return, for each radius, the summed neighbor counts of ``points``."""
    return np.array(
        [np.sum(tree.query_radius(points, radius, count_only=True))
         for radius in radii],
        dtype=float)


def two_point(data, bins, method='standard', data_R=None, random_state=None):
    """Two-point correlation function

    Parameters
    ----------
    data : array_like
        input data, shape = [n_samples, n_features].  A 1D array is treated
        as n_samples points with a single feature.
    bins : array_like
        bins within which to compute the 2-point correlation.
        shape = Nbins + 1
    method : string
        "standard" or "landy-szalay".
    data_R : array_like (optional)
        if specified, use this as the random comparison sample.  It must
        have the same number of features as ``data``; a 1D array is treated
        as a single-feature sample.  If not specified, the background is
        built by shuffling all but the last feature column of ``data``.
    random_state : integer, np.random.RandomState, or None
        specify the random state to use for generating background

    Returns
    -------
    corr : ndarray
        the estimate of the correlation function within each bin
        shape = Nbins.  Bins without random-random pairs are nan.

    Raises
    ------
    ValueError
        if ``method`` is unknown, ``bins`` is not 1D, ``data`` is neither 1D
        nor 2D, or ``data_R`` does not match the feature count of ``data``.
    """
    data = np.asarray(data)
    bins = np.asarray(bins)

    # Reject bad arguments before doing any other work.
    if method not in ['standard', 'landy-szalay']:
        raise ValueError("method must be 'standard' or 'landy-szalay'")

    if bins.ndim != 1:
        raise ValueError("bins must be a 1D array")

    if data.ndim == 1:
        data = data[:, np.newaxis]
    elif data.ndim != 2:
        raise ValueError("data should be 1D or 2D")

    rng = check_random_state(random_state)
    n_features = data.shape[1]

    # shuffle all but one axis to get background distribution
    if data_R is None:
        data_R = data.copy()
        for column in range(n_features - 1):
            rng.shuffle(data_R[:, column])
    else:
        data_R = np.asarray(data_R)
        # Mirror the treatment of 1D ``data`` so that a 1D random sample is
        # accepted for single-feature data.
        if data_R.ndim == 1:
            data_R = data_R[:, np.newaxis]
        if (data_R.ndim != 2) or (data_R.shape[-1] != n_features):
            raise ValueError('data_R must have same n_features as data')

    factor = len(data_R) * 1. / len(data)

    if sklearn_has_two_point:
        # Fast two-point correlation functions added in scikit-learn v. 0.14
        tree_D = KDTree(data)
        tree_R = KDTree(data_R)

        counts_DD = tree_D.two_point_correlation(data, bins)
        counts_RR = tree_R.two_point_correlation(data_R, bins)

    else:
        warnings.warn("Version 0.3 of astroML will require scikit-learn "
                      "version 0.14 or higher for correlation function "
                      "calculations. "
                      "Upgrade to sklearn 0.14+ now for much "
                      "faster correlation function calculations.")

        tree_D = BallTree(data)
        tree_R = BallTree(data_R)

        counts_DD = _ball_tree_pair_counts(tree_D, data, bins)
        counts_RR = _ball_tree_pair_counts(tree_R, data_R, bins)

    # Cumulative counts -> counts per bin.
    DD = np.diff(counts_DD)
    RR = np.diff(counts_RR)

    # check for zero in the denominator: divide by 1 there, set nan below
    RR_zero = (RR == 0)
    RR[RR_zero] = 1

    if method == 'standard':
        corr = factor**2 * DD / RR - 1
    elif method == 'landy-szalay':
        if sklearn_has_two_point:
            counts_DR = tree_R.two_point_correlation(data, bins)
        else:
            counts_DR = _ball_tree_pair_counts(tree_R, data, bins)
        DR = np.diff(counts_DR)

        corr = (factor**2 * DD - 2 * factor * DR + RR) / RR

    corr[RR_zero] = np.nan

    return corr
# Landy-Szalay estimate for Data_D against the random sample Data_R1.

# Ratio of random to data sample sizes, used to weight the pair counts.
factor = len(Data_R1) * (1.0 / len(Data_D))
print(factor)

KDT_D = KDTree(Data_D)
KDT_R = KDTree(Data_R1)

# NOTE(review): Nbins is 30, but ``bins`` below holds 30 edges and therefore
# yields 29 differenced bins -- confirm which of the two is intended.
Nbins = 30
bins = np.arange(0, 30)
print(bins)

# Cumulative pair counts at each bin edge.
counts_DD = KDT_D.two_point_correlation(Data_D, bins)
counts_RR = KDT_R.two_point_correlation(Data_R1, bins)

# Cumulative counts -> counts per bin.
DD = np.diff(counts_DD)
RR = np.diff(counts_RR)

# Guard the division: empty RR bins are divided by 1 and set to nan below.
RR_zero = (RR == 0)
RR[RR_zero] = 1

# Data-random cross counts.
counts_DR = KDT_R.two_point_correlation(Data_D, bins)
DR = np.diff(counts_DR)

corr = (factor**2 * DD - 2 * factor * DR + RR) / RR
corr[RR_zero] = np.nan