def get_tcc(configuration, tccrawfile, box, rcut=1., criterium="not marked"):
    """Get the connected clusters formed by marked or not marked particles.

    Coordinates are read from columns 1-3 of ``configuration`` (two header
    lines skipped) and one-character labels from ``tccrawfile`` (three header
    lines skipped).  Particles labelled 'A' or 'B' count as "not marked",
    particles labelled 'C' or 'D' as "marked".  Two selected particles are
    linked when ``PeriodicCKDTree(box, ...)`` reports them within ``rcut`` of
    each other, and the connected components of that graph are returned.

    Parameters
    ----------
    configuration : str or file
        Coordinate file readable by ``pl.loadtxt``.
    tccrawfile : str or file
        Label file readable by ``pl.loadtxt``.
    box : array_like
        Box bounds handed to ``PeriodicCKDTree``.
    rcut : float
        Linking distance.
    criterium : {"not marked", "marked"}
        Which family of labels to cluster.

    Returns
    -------
    list of list of int
        One list per cluster.  The indices refer to the *selected* subset of
        particles, not to rows of the full configuration.

    Raises
    ------
    ValueError
        If ``criterium`` is neither "marked" nor "not marked".
    """
    wanted_labels = {"not marked": (b"A", b"B"), "marked": (b"C", b"D")}
    if criterium not in wanted_labels:
        raise ValueError(
            "criterium must be 'marked' or 'not marked', got %r" % (criterium,))

    xyz = pl.loadtxt(configuration, skiprows=2, usecols=[1, 2, 3])
    # dtype "S1" yields byte strings, so the labels are compared against byte
    # literals; on Python 2 these are ordinary str, on Python 3 the
    # comparison keeps working.
    cl = pl.loadtxt(tccrawfile, skiprows=3, dtype="S1")

    first, second = wanted_labels[criterium]
    select = xyz[(cl == first) | (cl == second)]
    if select.shape[0] == 0:
        return []

    T = PeriodicCKDTree(box, select)
    # Find neighbors within a fixed distance of every selected particle
    balls = T.query_ball_point(select, r=rcut)

    # Flood fill with an explicit stack: a recursive version overflows the
    # interpreter's recursion limit on large clusters.
    visited = [False] * select.shape[0]
    clusters = []
    for seed in range(select.shape[0]):
        if visited[seed]:
            continue
        cluster = []
        stack = [seed]
        while stack:
            p = stack.pop()
            if visited[p]:
                continue
            visited[p] = True
            cluster.append(p)
            # Reversed so neighbours are expanded in the order listed,
            # matching a depth-first recursive traversal.
            stack.extend(reversed(balls[p]))
        clusters.append(cluster)
    return clusters
def get_marked(xyz, labels, box, marker=True, rcut=1.4, periodic=False):
    """Cluster the particles whose label equals ``marker``.

    The rows of ``xyz`` with ``labels == marker`` are selected, linked when
    they lie within ``rcut`` of each other, and grouped into connected
    components.

    Parameters
    ----------
    xyz : ndarray
        Particle coordinates, one row per particle.
    labels : ndarray
        One label per row of ``xyz``.
    box : array_like
        Box bounds handed to ``PeriodicCKDTree``; only used when
        ``periodic`` is true.
    marker : object
        Label value that selects a particle.
    rcut : float
        Linking distance.
    periodic : bool
        Use ``PeriodicCKDTree`` when true, a plain ``cKDTree`` otherwise.

    Returns
    -------
    list of list of int
        One list per cluster.  The indices refer to the *selected* subset of
        particles, not to rows of ``xyz``.
    """
    select = xyz[labels == marker]
    if select.shape[0] == 0:
        return []

    if periodic:
        T = PeriodicCKDTree(box, select)
    else:
        T = cKDTree(select)
    # Find neighbors within a fixed distance of every selected particle
    balls = T.query_ball_point(select, r=rcut)

    # Flood fill with an explicit stack: a recursive version overflows the
    # interpreter's recursion limit on large clusters.
    visited = [False] * select.shape[0]
    clusters = []
    for seed in range(select.shape[0]):
        if visited[seed]:
            continue
        cluster = []
        stack = [seed]
        while stack:
            p = stack.pop()
            if visited[p]:
                continue
            visited[p] = True
            cluster.append(p)
            # Reversed so neighbours are expanded in the order listed,
            # matching a depth-first recursive traversal.
            stack.extend(reversed(balls[p]))
        clusters.append(cluster)
    return clusters
def test_random_ball_vectorized_compiled():
    """Check that a vectorized ball query returns one neighbour list per point.

    A (2, 3, m) array of query points is passed to ``query_ball_point``; the
    result must have the leading shape (2, 3) and hold a list in each entry.
    """
    n = 20
    m = 5
    bounds = np.ones(m)
    T = PeriodicCKDTree(bounds, np.random.randn(n, m))

    r = T.query_ball_point(np.random.randn(2, 3, m), 1)
    # Without these checks the test only proved that the call does not raise.
    assert r.shape == (2, 3)
    assert isinstance(r[0, 0], list)
    def velocity_profile(self):
        """Compute, plot and save the void-stacked radial velocity profile.

        For every void centre in ``self.void_cat`` the galaxies inside spheres
        of growing radius are looked up in a periodic k-d tree built on
        ``self.galaxy_cat``.  Differencing consecutive spheres gives the
        per-shell sum of galaxy velocities projected on the direction from
        the galaxy to the void centre; each shell's mean is accumulated and
        finally divided by the number of voids.

        Reads ``self.N``, ``self.box_size``, ``self.galaxy_cat``,
        ``self.velocity_cat``, ``self.void_cat`` and ``self.handle``.

        Side effects: saves the profile with ``np.save`` under
        ``datafiles/velocity_profiles/`` and the figure as a PDF under
        ``figures/velocity_profiles/``.  Returns nothing.
        """
        radius_array = np.linspace(0, 200, self.N + 1)
        profile_sum = np.zeros(self.N + 1)
        n_voids = len(self.void_cat[:, 0])

        bounds = np.array([self.box_size, self.box_size, self.box_size])
        tree = PeriodicCKDTree(bounds, self.galaxy_cat)
        print("Calculating velocity profile")
        for i in range(n_voids):
            r_void = self.void_cat[i]
            # Running totals for the sphere handled in the previous step, so
            # that each step can isolate the contribution of the new shell.
            current_number_of_galaxies = 0
            current_velocity = 0
            for j in range(1, self.N + 1):
                neighbor_inds = tree.query_ball_point(r_void,
                                                      r=radius_array[j])
                v_galaxy = self.velocity_cat[neighbor_inds]
                # NOTE(review): the tree is periodic but this separation is
                # not wrapped, so neighbours found across a box face get a
                # separation of about one box length -- confirm whether a
                # minimum-image correction is needed here.
                r_vec = r_void - self.galaxy_cat[neighbor_inds]
                galaxies_in_shell = (len(neighbor_inds) -
                                     current_number_of_galaxies)

                # Velocity component along the galaxy -> void-centre direction
                radial_velocity = (v_galaxy * r_vec).sum(
                    axis=1) / np.linalg.norm(r_vec, axis=1)
                shell_velocity = np.sum(radial_velocity) - current_velocity

                # Mean over the shell; the max() guards empty shells
                profile_sum[j] += shell_velocity / np.maximum(
                    1.0, galaxies_in_shell)

                current_velocity += shell_velocity
                current_number_of_galaxies += galaxies_in_shell

        v_final = profile_sum / n_voids
        fig, ax = plt.subplots()
        ax.plot(radius_array, v_final)
        ax.set_xlabel("radius [Mpc/h]")
        # This label belongs on the y axis; setting xlabel twice overwrote
        # the radius label and left the y axis unlabeled.
        ax.set_ylabel(r"$v_r(r)$ km/s")
        np.save("datafiles/velocity_profiles/velocity_profile" + self.handle,
                v_final)
        fig.savefig("figures/velocity_profiles/velocity_profile" +
                    self.handle + ".pdf")
# `np.int` was only an alias of the builtin `int` and has been removed from
# NumPy, so the builtin is used directly.
Nlist = np.zeros(s, dtype=int)
# Boundaries (0 or negative means open boundaries in that dimension)
# NOTE(review): an earlier comment here said "xy periodic, open along z", but
# all three lengths are passed, so z is periodic too whenever dz > 0 --
# confirm which is intended.
bounds = np.array([dx, dy, dz])

# Build kd-tree
T = PeriodicCKDTree(bounds, x)

# Find neighbors within a fixed distance of every point
print("Building Neighborlist...")

neighbors = []
for i, point in enumerate(x):
    localneigh = T.query_ball_point(
        point, r=2.1)  #r = cutoff (Angstrom) for making Nlist
    # Put the atom's own index first in its entry
    localneigh.insert(0, i)
    # Store the entry; without this the list built above was discarded
    neighbors.append(localneigh)

#print neighbors

# Progress messages, then open the output file; its name is built from
# `outputfile`, which is defined outside this fragment.
print "Neighborlist built! Writing data to file...."
print "***********writing with atom types*****************"
#print neighbors
# NOTE(review): no matching close() for `outFile` is visible in this fragment.
outFile = open('Nlist-types' + '-' + outputfile, 'w')

# Loop over the first dimension of `s`.
# NOTE(review): the loop body is missing from this copy of the script -- only
# the two comments below remain, so the loop does not parse as it stands.
for i in xrange(s[0]):
    #Slice the atomtypes using the neighbor indices, have to subtract 1
    #from index because you added it in your neighborlist build
# This will take any void and build shells around it up to 2*R_v
# and find the number density per shell using the volume of the shell.

# Index of the void being profiled and the position of its density minimum.
# The builtin int replaces `np.int`, which has been removed from NumPy.
void_id = int(zone[arb_ind])
void_centre = [x_denmin[void_id], y_denmin[void_id], z_denmin[void_id]]

R_shell = np.linspace(0.001, 2*zone_rad[void_id], 20) #shells from ~0 to 2*R_v in units of Mpc/h
V_shell = ((4.*pi)/3.)*R_shell**3. #volume of the sphere out to each shell radius
tot_numden = numpart/(Lbox**3.)

count = []       # number of objects in each shell
count_void = []  # number of objects inside each concentric sphere
nden = []        # number density in each shell

for radius in R_shell:
    # Find number of halos in each concentric sphere of radius given by R_shell
    count_void.append(len(periodic_tree.query_ball_point(void_centre, radius)))

for i in range(len(R_shell)):
    # This gives the number density in each shell.  The sphere counts from
    # the loop above are reused instead of querying the tree again.
    if i == 0:
        # Innermost "shell" is the full sphere of radius R_shell[0]
        count_temp = count_void[0]
        nden_temp = count_temp/V_shell[0]
    else:
        # Without this else-branch, shell 0 was differenced against the
        # outermost sphere (index -1).
        count_temp = count_void[i] - count_void[i-1]
        nden_temp = count_temp/(V_shell[i]-V_shell[i-1])
    count.append(count_temp)
    nden.append(nden_temp)

def overdensity_cylinder(gals,
    # NOTE(review): this copy of the function is damaged.  The parameter list
    # above stops after `gals,`, the quotes around the docstring text below
    # are missing, and several multi-line expressions further down have lost
    # their opening or closing lines (each is flagged where it occurs).
    # Restore them from the upstream source before running this function.
    Find overdensity statistics over the whole simulation box for cylindrical apertures.

        gals - dataframe of galaxy properties
        coods - coordinates to calculate statistcis at. Typically defined as galaxy or random coordinates.
        R - aperture radius, cMpc
        dc - half aperture depth, cMpc
        L - box length, cMpc
        pc_stats - bool, calculate completeness and purity of each region
        cluster_mass_lim - limiting descendant mass above which to classify clusters, z0_central_mcrit200
        n - chunk length
        out_stats - output statistics, numpy array of shape [len(coods), 4]
                    0 - overdensity
                    1 - completeness
                    2 - purity
                    3 - descendant mass

    # Box bounds for the periodic tree: the same length L along all three axes.
    dimensions = np.array([L, L, L])

    # NOTE(review): `verbose` is used below but does not appear in the visible
    # part of the signature.
    if verbose: print "Building KDtree..."
    # Periodic k-d tree over the galaxy positions (columns zn_x, zn_y, zn_z).
    T = PeriodicCKDTree(dimensions, gals[['zn_x', 'zn_y', 'zn_z']])

    # Mean galaxy number density: rows of `gals` divided by the box volume.
    avg = float(gals.shape[0]) / L**3  # average overdensity cMpc^-3

    # One row per aperture, columns as listed in the header text above.
    out_stats = np.zeros((len(coods), 4))

    # Galaxy count expected in one aperture at mean density: the cylinder
    # volume pi * R**2 * (2 * dc) multiplied by `avg`.
    vol_avg = np.pi * R**2 * (2 *
                              dc) * avg  # average overdensity in chosen volume

    # Work through the apertures in chunks of `n` rows: `j` is the chunk
    # number and `c` holds the rows of that chunk.
    for j, c in coods.groupby(
            np.arange(len(coods)) //
            n):  # can't calculate distances all in one go, so need to chunk

        if verbose:  # print progress
            if j % 100 == 0:
                print round(
                    float(c.shape[0] * (j + 1)) / coods.shape[0] * 100, 2), '%'

        # find all galaxies within a sphere of radius the max extent of the cylinder
        # (sqrt(R**2 + dc**2) is the distance from the centre to the cylinder rim)
        gal_index = T.query_ball_point(c, r=(R**2 + dc**2)**0.5)

        # filter by cylinder using norm_coods()
        # NOTE(review): the first line of this list comprehension is missing;
        # presumably it indexes gal_index[k] with the result of
        # norm_coods(...) -- confirm against the upstream source.
        # NOTE(review): `.ix` has been removed from pandas; this line needs
        # `.loc`/`.iloc` on current versions.
        gal_index = [
                gals.iloc[gal_index[k]][['zn_x', 'zn_y', 'zn_z']].values,
                c.ix[k + j * n].values,
                L=L)] for k in range(len(c))

        # Row of `out_stats` that corresponds to the first aperture of this chunk
        start_index = (j * n)  # save start index

        # calculate dgal
        # (galaxies found minus expected count, relative to the expected count)
        out_stats[start_index:(start_index + len(c)),
                  0] = (np.array([len(x)
                                  for x in gal_index]) - vol_avg) / vol_avg

        if pc_stats:  # calculate completeness and purity statistics

            for i in range(len(gal_index)):

                # Galaxies that fall inside aperture `i` of this chunk
                cluster_ids = gals.iloc[gal_index[i]]
                # NOTE(review): the rest of this Counter(...) call is missing;
                # presumably it keeps galaxies whose z0_central_mcrit200
                # exceeds cluster_mass_lim and counts them per cluster id --
                # confirm against the upstream source.
                cluster_ids = Counter(
                    cluster_ids[cluster_ids['z0_central_mcrit200'] >

                if len(cluster_ids) > 0:

                    # One row per candidate cluster: column 0 is completeness,
                    # column 1 is purity.
                    cstats = np.zeros((len(cluster_ids), 2))

                    # `q` is matched against gals['z0_centralId'] and `no` is
                    # its count within this aperture.
                    # NOTE(review): `.ix` has been removed from pandas.
                    for k, (q, no) in enumerate(cluster_ids.items()):
                        cluster_gals = gals.ix[gals['z0_centralId'] == q]
                        cstats[k, 0] = float(no) / len(
                            cluster_gals)  # completeness
                        cstats[k, 1] = float(no) / len(gal_index[i])  # purity

                    # find id of max completeness and purity in cstats array
                    max_completeness = np.where(
                        cstats[:, 0] == cstats[:, 0].max())[0]
                    # NOTE(review): the continuation of this expression is
                    # missing; presumably it mirrors the completeness line
                    # above for column 1.
                    max_purity = np.where(cstats[:, 1] == cstats[:,

                    # sometimes multiple clusters can have same completeness or purity in a single candidate
                    # - use the cluster with the highest complementary completeness/purity
                    if len(max_completeness) > 1:

                        # get matches between completeness and purity
                        matches = [x in max_purity for x in max_completeness]

                        if np.sum(matches) > 0:
                            # just use the first one
                            max_completeness = [np.where(matches)[0][0]]
                            max_purity = [np.where(matches)[0][0]]
                            # NOTE(review): lines are missing around the
                            # assignment below; it looks like the body of a
                            # lost `else:` that picks the tied candidate with
                            # the highest purity -- confirm.
                            max_completeness = [
                                    cstats[max_completeness, 1])]

                    if len(max_purity) > 1:

                        matches = [x in max_completeness for x in max_purity]

                        if np.sum(matches) > 0:
                            max_completeness = [np.where(matches)[0][0]]
                            max_purity = [np.where(matches)[0][0]]

                            # NOTE(review): the rest of this assignment is
                            # missing; presumably the purity counterpart of
                            # the tie-break above -- confirm.
                            max_purity = [

                    # sometimes the cluster with the highest completeness does not have the highest purity, or vice versa
                    # - use the cluster with the highest combined purity/completeness added in quadrature
                    # NOTE(review): the closing bracket of the list below is missing.
                    if max_completeness[0] != max_purity[0]:
                        max_completeness = [
                            np.argmax([pow(np.sum(x**2), 0.5) for x in cstats])
                        max_purity = max_completeness

                    # save completeness and purity values
                    out_stats[start_index + i, 1] = cstats[max_completeness[0],
                                                           0]  # completeness
                    out_stats[start_index + i, 2] = cstats[max_purity[0],
                                                           1]  # purity

                    # save descendant mass
                    # filter by cluster id, save z0 halo mass
                    # can use either max_completeness or max_purity, both equal by this point

                    # NOTE(review): the end of this expression is missing, so
                    # which column is read for the mass cannot be seen here.
                    out_stats[start_index + i,
                              3] = gals.loc[gals['z0_centralId'] == cluster_ids

                else:  # if no galaxies in aperture
                    out_stats[start_index + i, 1] = 0.
                    out_stats[start_index + i, 2] = 0.
                    out_stats[start_index + i, 3] = np.nan

    return out_stats
# Nearest-neighbour lookup with T2; the timer `t` used for the report is not
# set inside this fragment.
w = T2.query(queries)
elapsed = time.time() - t
print("PeriodicCKDTree %d lookups:\t%g" % (r, elapsed))
del w

# Same lookup on a tree built with leafsize=n (the "flat" tree).
T3 = PeriodicCKDTree(bounds, data, leafsize=n)
t = time.time()
w = T3.query(queries)
elapsed = time.time() - t
print("flat PeriodicCKDTree %d lookups:\t%g" % (r, elapsed))
del w

# Ball queries of radius 0.2 on all three trees; the results are kept so
# that they can be compared below.
t = time.time()
w1 = T1.query_ball_point(queries, 0.2)
elapsed = time.time() - t
print("PeriodicKDTree %d ball lookups:\t%g" % (r, elapsed))

t = time.time()
w2 = T2.query_ball_point(queries, 0.2)
elapsed = time.time() - t
print("PeriodicCKDTree %d ball lookups:\t%g" % (r, elapsed))

t = time.time()
w3 = T3.query_ball_point(queries, 0.2)
elapsed = time.time() - t
print("flat PeriodicCKDTree %d ball lookups:\t%g" % (r, elapsed))

# The T2 and T3 results must hold the same indices as the T1 results for
# every query point; order within each neighbour list is ignored.
all_good = True
for reference, candidate in ((w1, w2), (w1, w3)):
    for a, b in zip(reference, candidate):
        if sorted(a) != sorted(b):
            all_good = False

print("Ball lookups agree? %s" % str(all_good))
    def delta_and_sigma_vz_galaxy(self, array_files=None, dictionary=False):
        """
        Calculates the density profile and velocity dispersion of voids in real space.

        Both profiles live on a fixed grid of ``self.N + 1`` radii between 1
        and 200.  They are either computed by stacking galaxy counts and
        ``self.galaxy_vz`` in shells around every void in ``self.void_cat``,
        or loaded from ``array_files``.  The result is stored as cubic
        interpolators in ``self.delta`` and ``self.sigma_vz``.

        Parameters
        ----------
        array_files : sequence of str, optional
            ``array_files[0]`` holds the density contrast and
            ``array_files[1]`` the velocity dispersion, both readable by
            ``np.load``.  When None the profiles are computed and saved
            under ``datafiles/``.
        dictionary : bool
            Only used when computing: additionally resample both profiles
            onto 30 radii between 2.11 and 118.0 and save them as
            dictionaries.

        Returns
        -------
        tuple
            ``(self.delta, self.sigma_vz)``.
        """
        radius_array = np.linspace(1, 200, self.N + 1)
        if array_files is None:
            bounds = np.array([self.box_size, self.box_size, self.box_size])
            tree = PeriodicCKDTree(bounds, self.galaxy_cat)
            n_voids = len(self.void_cat[:, 0])

            delta = np.zeros(self.N + 1)
            E_vz = np.zeros(self.N + 1)
            E_vz2 = np.zeros(self.N + 1)
            galaxies_in_shell_arr = np.zeros(self.N + 1)

            print("Starting density profile and velocity dispersion calculation")
            for i in range(n_voids):
                # Running totals of the previous sphere, used to isolate the
                # contribution of each new shell.
                current_number_of_galaxies = 0
                current_E_vz = 0
                current_E_vz2 = 0
                E_vz_in_shell = 0
                E_vz2_in_shell = 0

                for j in range(1, self.N + 1):
                    # Find galaxy position and velocity in a given radius around the current void
                    neighbor_inds = tree.query_ball_point(self.void_cat[i, :],
                                                          r=radius_array[j])
                    shell_volume = 4.0 * np.pi * (radius_array[j]**3 -
                                                  radius_array[j - 1]**3) / 3.0
                    velocity_near_point = self.galaxy_vz[neighbor_inds]
                    galaxies_near_point = len(neighbor_inds)
                    # Subtracting previous sphere to get galaxies in current shell.
                    galaxies_in_shell = galaxies_near_point - current_number_of_galaxies

                    # Terms used in the expectation values E[v_z**2] and E[v_z]**2
                    if galaxies_near_point > 0:
                        E_vz2_in_shell = (np.sum(velocity_near_point**2) -
                                          current_E_vz2)
                        E_vz_in_shell = (np.sum(velocity_near_point) -
                                         current_E_vz)

                    galaxies_in_shell_arr[j] += galaxies_in_shell

                    E_vz[j] += E_vz_in_shell
                    E_vz2[j] += E_vz2_in_shell
                    delta[j] += galaxies_in_shell / shell_volume

                    current_E_vz += E_vz_in_shell
                    current_E_vz2 += E_vz2_in_shell
                    current_number_of_galaxies += galaxies_in_shell

            # Normalise by the number of voids times the mean galaxy density
            # of the box; float() avoids integer division on Python 2.
            delta /= (n_voids * len(self.galaxy_cat[:, 0]) /
                      float(self.box_size**3))
            delta -= 1

            # Turn the stacked sums into means, skipping empty shells
            populated = galaxies_in_shell_arr > 0
            E_vz[populated] /= galaxies_in_shell_arr[populated]
            E_vz2[populated] /= galaxies_in_shell_arr[populated]
            # Rounding can push the variance slightly below zero; clamp it so
            # sqrt() does not return NaN, which the floor below would miss.
            sigma_vz = np.sqrt(np.maximum(E_vz2 - E_vz**2, 0.0))

            # Replace dispersions below 10 by 100 so that later divisions by
            # sigma_vz cannot hit zero.
            sigma_vz[sigma_vz < 10.0] = 100.0

            if dictionary:
                #Output for victor code
                r_dict = np.linspace(2.11, 118.0, 30)
                sigma_vz_spline = interpolate.interp1d(radius_array, sigma_vz)
                delta_spline = interpolate.interp1d(radius_array, delta)

                vr_dict = {
                    "rvals": r_dict,
                    "sigma_v_los": sigma_vz_spline(r_dict),
                }
                np.save(
                    "datafiles/velocity_profiles/sigma_vz_dict" + self.handle,
                    vr_dict)

                delta_dict = {
                    "rvals": r_dict,
                    "delta": delta_spline(r_dict),
                }
                np.save("datafiles/density_profiles/delta_dict" + self.handle,
                        delta_dict)

            print(len(delta))
            np.save("datafiles/density_profiles/delta" + self.handle, delta)
            np.save("datafiles/velocity_profiles/sigma_vz" + self.handle,
                    sigma_vz)
        else:
            delta = np.load(array_files[0])
            sigma_vz = np.load(array_files[1])

        # One figure per profile; both branches drew the same two plots
        fig, ax = plt.subplots()
        ax.plot(radius_array, delta)
        fig, ax = plt.subplots()
        ax.plot(radius_array, sigma_vz)

        print("Splining density profile")
        print("%d %d" % (len(radius_array), len(delta)))
        self.delta = interpolate.interp1d(radius_array, delta, kind="cubic")
        self.sigma_vz = interpolate.interp1d(radius_array,
                                             sigma_vz,
                                             kind="cubic")

        return self.delta, self.sigma_vz
def NN_finder_all(initial_config_data, cut_off_distance, box_dim, path_to_test_dir, atom_list = None, save_results = False, re_calc = False):
    """
    Find the nearest neighbors of many atoms at once with a periodic k-d tree.

    The atoms of interest are those whose item number is listed in
    atom_list; by default every atom in initial_config_data is used.
    Putting all points in a tree and querying them in bulk is much faster
    than searching atom by atom.

    Input arguments:
    initial_config_data: instance of pandas.Dataframe
        configuration data; the columns 'item', 'x', 'y' and 'z' are read
    cut_off_distance: dict
        pair cut-off distances keyed by tuples of atom types, e.g.
        {(1,1):3.7,(1,2):2.7,(2,2):3.0} means that the cut-off between
        atom_type 1 and 1 is 3.7, etc.  A pair may be given in either order.
    box_dim: list
        spatial dimension of the simulation box in x, y, z
    path_to_test_dir: str
        current test result dir; results are cached in
        nn_results_dict.pkl under it
    atom_list: list
        item numbers of the atoms whose nearest neighbors are wanted
    save_results: boolean, default False
        whether to save the results dictionary into nn_results_dict.pkl
    re_calc: boolean, default False
        when False and nn_results_dict.pkl exists, return its content
        instead of recalculating

    Returns:
    nn: dict()
        key is the item id of an atom of interest,
        value is the pandas.Dataframe of its nearest neighbors

    Raises:
    KeyError
        if no cut-off distance is defined for a pair of atom types
    """
    path_to_nn_results = path_to_test_dir + "/nn_results_dict.pkl"
    if not re_calc and os.path.exists(path_to_nn_results):
        print("nn results dictionary already calculated and saved in pkl file, skip calculation")
        # NOTE: unpickling can execute arbitrary code; only use this with
        # result directories you trust.  Binary mode is what pickle requires.
        with open(path_to_nn_results, 'rb') as f:
            return pickle.load(f)

    # Local import keeps this block self-contained; it is placed after the
    # cached-result path so that path does not need pandas.
    import pandas as pd

    nn = dict()
    # if there is no atom_list specified, use all atoms in initial_config_data
    if atom_list is None:
        atom_list = (initial_config_data["item"]).tolist()
    _data = initial_config_data
    groups = Atom.classify_df(_data)
    _interested_data = _data.loc[_data['item'].isin(atom_list)]
    interested_groups = Atom.classify_df(_interested_data)

    # One tree per atom group, built on first use and reused for every group
    # of interest.  Default distance metric is the Euclidean norm (p = 2).
    trees = {}
    for (i, int_group) in interested_groups.items():
        query_points = int_group[['x','y','z']].values
        for (j, atom_group) in groups.items():
            # Cut-off for atom_type_i against atom_type_j; as before, the
            # (j, i) entry takes precedence when both orders are present.
            curr_cut_off = cut_off_distance.get((j, i), cut_off_distance.get((i, j)))
            if curr_cut_off is None:
                # Previously a missing pair silently reused the cut-off of
                # the preceding pair (or failed on an unbound name).
                raise KeyError("no cut-off distance defined for atom types %r and %r" % (i, j))

            if j not in trees:
                trees[j] = PeriodicCKDTree(box_dim, atom_group[['x','y','z']].values)
            result_groups = trees[j].query_ball_point(query_points, curr_cut_off)

            for k, (index, int_atom) in enumerate(int_group.iterrows()):
                # result_groups[k] lists neighbor positions in the order of
                # atom_group; curr_NN holds those rows.
                item = int_atom["item"]
                curr_NN = atom_group.iloc[result_groups[k]]
                if item in nn:
                    # DataFrame.append was removed in pandas 2.0
                    nn[item] = pd.concat([nn[item], curr_NN])
                else:
                    nn[item] = curr_NN

    # Save the NN dictionary into a pkl file so a rerun can be skipped
    if save_results:
        with open(path_to_nn_results, 'wb') as f:
            pickle.dump(nn, f)
    return nn