def match_from_biglist(self, cv, iter, match):
    """Write two files to Lexicons/: a *_info.txt file with match info and a
    simple lexicon file *_lex.txt."""
    # Assumes: import random; kd is a KD-tree constructor
    # (e.g. scipy.spatial.cKDTree).
    a = [i.strip().split(",") for i in open(self.f).readlines()]
    word, blick, ngram = [], [], []
    realword, realblick, realngram = [], [], []
    a = a[1:]  # drop the header row
    random.shuffle(a)
    word_spot = 0
    if self.disc == 1:
        word_spot = -1
    # Fixed: iterating over a[1:] here dropped a second (post-shuffle, hence
    # random) row, since the header was already removed above.
    for row in a:
        if row[3] != "Real":
            word += [row[word_spot]]
            blick += [float(row[1])]
            ngram += [float(row[2])]
        else:
            realword += [row[word_spot]]
            realblick += [float(row[1])]
            realngram += [float(row[2])]
    if match == "ngram":
        p = zip(ngram)
        realpoints = zip(realngram)
    elif match == "blick":
        p = zip(blick)
        realpoints = zip(realblick)
    else:
        p = zip(blick, ngram)
        realpoints = zip(realblick, realngram)
    minpoint = .03  # same threshold for all three match modes
    tree = kd(p)
    if cv == 0:
        minpoint = minpoint * .75
    print self.f
    base = ("Lexicons/lex_" + self.f.split("/")[-1][:-12] + "_cv" + str(cv) +
            "_iter" + str(iter) + "_mbigmatch_" + match)
    outfile = open(base + "_info.txt", "w")
    outfile2 = open(base + "_lex.txt", "w")
    self.match_loop(outfile, outfile2, realpoints, realword, realblick,
                    realngram, self.f, cv, iter, minpoint, blick, ngram,
                    word, tree)
    outfile.close()
    outfile2.close()
    return
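# The match_loop helper called above is not included in this snippet. As a
# rough illustration only: a minimal sketch of the kind of nearest-neighbour
# matching it presumably performs, assuming kd is scipy.spatial.cKDTree and
# that each real-word point is paired with the closest fake-word point within
# minpoint. nearest_match_sketch and its arguments are hypothetical names,
# not part of the original code.
from scipy.spatial import cKDTree


def nearest_match_sketch(fake_points, fake_words, real_points, minpoint):
    tree = cKDTree(fake_points)
    matches = []
    for rp in real_points:
        dist, idx = tree.query(rp)  # nearest fake point to this real point
        if dist <= minpoint:
            matches.append((rp, fake_words[idx], dist))
    return matches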
def evalNovKDTree(data, archive):
    """Score the novelty of each row of `data` against data + archive."""
    # Assumes: import numpy as np; kd is a KD-tree constructor
    # (e.g. scipy.spatial.cKDTree).
    tot_data = np.vstack((data, archive))
    tree = kd(tot_data)
    nov = np.zeros(data.shape[0])
    for idx in xrange(data.shape[0]):
        nov[idx] = eval_ind_k(tree, data[idx])
    return nov
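# eval_ind_k is not defined in this snippet. Under the usual novelty-search
# definition (mean distance to the k nearest neighbours in behaviour space),
# it plausibly looks like the sketch below; the body and the k default are
# assumptions, not the original implementation.
import numpy as np
from scipy.spatial import cKDTree


def eval_ind_k_sketch(tree, point, k=15):
    # Query k + 1 neighbours because `point` itself is in the tree, then
    # drop the zero self-distance before averaging.
    dists, _ = tree.query(point, k=min(k + 1, tree.n))
    return np.mean(np.atleast_1d(dists)[1:])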
def find_duplicates(craters, radius=1737.4, k=10, rcd=5., ddiam=0.25,
                    filter_pairs=False):
    """Finds duplicate pairs within a crater catalogue. Triples or more will
    show up as multiple pairs.

    Parameters
    ----------
    craters : pandas.DataFrame
        Crater catalogue.
    radius : float, optional
        Radius of the world. Default is 1737.4 (the Moon's radius in km).
    k : int, optional
        Number of nearest neighbours to search for duplicates. Default is 10.
    rcd : float, optional
        Minimum value of min(crater pair diameters) / pair distance for a
        pair to be considered duplicates. The minimum rather than the average
        diameter is used to help weed out satellite craters. This criterion
        is asymmetric between pairs, and when filter_pairs=False may lead to
        single pair entries.
    ddiam : float, optional
        Maximum value of abs(diameter1 - diameter2) / avg(diameters) for a
        pair to be considered duplicates.
    filter_pairs : bool, optional
        If `True`, filters the data frame and keeps only one entry per
        crater pair.

    Returns
    -------
    outframe : pandas.DataFrame
        Data frame of crater duplicate pairs.
    """
    # Assumes: import numpy as np; import pandas as pd; geodesic is
    # cartopy.geodesic; kd is a KD-tree constructor
    # (e.g. scipy.spatial.cKDTree).
    mgeod = geodesic.Geodesic(radius=radius, flattening=0.)

    # Convert to 3D Cartesian coordinates (<https://en.wikipedia.org/wiki/
    # Spherical_coordinate_system#Cartesian_coordinates>); phi = [-180, 180)
    # is equivalent to [0, 360).
    craters['phi'] = np.pi / 180. * craters['Long']
    craters['theta'] = np.pi / 180. * (90 - craters['Lat'])
    craters['x'] = radius * np.sin(craters['theta']) * np.cos(craters['phi'])
    craters['y'] = radius * np.sin(craters['theta']) * np.sin(craters['phi'])
    craters['z'] = radius * np.cos(craters['theta'])

    # Create tree.
    kdt = kd(craters[["x", "y", "z"]].as_matrix(), leafsize=10)

    # Loop over all craters to find duplicates. First, find k + 1 nearest
    # neighbours (k + 1 because the query will include the crater itself).
    Lnn, inn = kdt.query(craters[["x", "y", "z"]].as_matrix(), k=k + 1)

    # Remove each crater's match with itself (by checking the id).
    Lnn_remove_self = np.empty([Lnn.shape[0], Lnn.shape[1] - 1])
    inn_remove_self = np.empty([Lnn.shape[0], Lnn.shape[1] - 1], dtype=int)
    for i in range(Lnn_remove_self.shape[0]):
        not_self = (inn[i] != i)
        inn_remove_self[i] = inn[i][not_self]
        Lnn_remove_self[i] = Lnn[i][not_self]
    craters['Lnn'] = list(Lnn_remove_self)
    craters['inn'] = list(inn_remove_self)

    # Get properties of the nearest neighbours. (Fixed: use the
    # self-filtered indices rather than inn[:, 1:], which assumed self is
    # always column 0, and reshape by k instead of a hardcoded 10 so that
    # k != 10 works.)
    inn_ravel = inn_remove_self.ravel()
    craters['dnn'] = list(
        craters['Diameter (km)'].as_matrix()[inn_ravel].reshape(-1, k))
    craters['long_nn'] = list(
        craters['Long'].as_matrix()[inn_ravel].reshape(-1, k))
    craters['lat_nn'] = list(
        craters['Lat'].as_matrix()[inn_ravel].reshape(-1, k))
    craters['set_nn'] = list(
        craters['Dataset'].as_matrix()[inn_ravel].reshape(-1, k))

    # Prepare empty lists.
    dup_id1 = []
    dup_id2 = []
    dup_D1 = []
    dup_D2 = []
    dup_L = []
    dup_LEuclid = []
    dup_ll1 = []
    dup_ll2 = []
    dup_source1 = []
    dup_source2 = []

    # Iterate over craters to determine if any are duplicate pairs.
    for index, row in craters.iterrows():
        # For each pair, record the smaller crater diameter.
        pair_diameter_min = np.array(
            [min(x, row['Diameter (km)']) for x in row['dnn']])
        proper_dist = np.asarray(
            mgeod.inverse(np.array([row['Long'], row['Lat']]),
                          np.vstack([row['long_nn'],
                                     row['lat_nn']]).T))[:, 0]

        # Duplicate pair criteria: 1) min(diameter) / distance > rcd;
        # 2) abs(diameter1 - diameter2) / average(diameter) < ddiam. I.e.
        # the separation of the centres must be much smaller than either
        # diameter, and the diameters must be very similar.
        rcd_crit = (pair_diameter_min / row['Lnn'] > rcd)
        diam_sim_crit = ((2. * abs(row['dnn'] - row['Diameter (km)']) /
                          (row['dnn'] + row['Diameter (km)'])) < ddiam)
        dup_candidates, = np.where(rcd_crit & diam_sim_crit)
        if dup_candidates.size:
            for i in dup_candidates:
                if index == row['inn'][i]:
                    raise AssertionError("Two craters with identical IDs.")
                dup_id1.append(index)
                dup_id2.append(row['inn'][i])
                dup_D1.append(row['Diameter (km)'])
                dup_D2.append(row['dnn'][i])
                dup_L.append(proper_dist[i])
                dup_LEuclid.append(row['Lnn'][i])
                dup_ll1.append((row['Long'], row['Lat']))
                dup_ll2.append((row['long_nn'][i], row['lat_nn'][i]))
                dup_source1.append(row['Dataset'])
                dup_source2.append(row['set_nn'][i])

    # Multi-index pandas table; see
    # <https://pandas.pydata.org/pandas-docs/stable/advanced.html>.
    outframe = pd.DataFrame(
        {
            'ID1': dup_id1,
            'ID2': dup_id2,
            'Diameter1 (km)': dup_D1,
            'Diameter2 (km)': dup_D2,
            'Separation (km)': dup_L,
            'Euclidean Separation (km)': dup_LEuclid,
            'Lat/Long1': dup_ll1,
            'Lat/Long2': dup_ll2,
            'Dataset1': dup_source1,
            'Dataset2': dup_source2
        },
        columns=('ID1', 'ID2', 'Diameter1 (km)', 'Diameter2 (km)',
                 'Separation (km)', 'Euclidean Separation (km)',
                 'Lat/Long1', 'Lat/Long2', 'Dataset1', 'Dataset2'))

    # Hacky, O(N^2) duplicate entry removal.
    if filter_pairs:
        osub = outframe[["ID1", "ID2"]].as_matrix()
        osub = np.array([set(x) for x in osub])
        indices_to_remove = []
        for i in range(osub.shape[0]):
            if i not in indices_to_remove:
                dups = np.where(osub[i + 1:] == osub[i])[0] + i + 1
                indices_to_remove += list(dups)
        indices_to_remove = list(set(indices_to_remove))
        outframe.drop(indices_to_remove, inplace=True)
        outframe.reset_index(inplace=True, drop=True)

    return outframe
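# A minimal usage sketch for find_duplicates on a hypothetical three-crater
# catalogue: the first two entries are the same ~10 km crater reported by two
# datasets roughly 1 km apart and should register as one duplicate pair,
# while the third crater is far away. Only the four input columns below are
# required; the data are invented for illustration.
import pandas as pd

cat = pd.DataFrame({
    'Long': [45.0, 45.03, -120.0],
    'Lat': [10.0, 10.01, -30.0],
    'Diameter (km)': [10.0, 10.2, 55.0],
    'Dataset': ['A', 'B', 'A'],
})
pairs = find_duplicates(cat, k=2, filter_pairs=True)
print(pairs[['ID1', 'ID2', 'Diameter1 (km)', 'Separation (km)']])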
def make_density_map(craters, img, kernel=None, k_support=8, k_sig=4.,
                     knn=10, beta=0.3, kdict={}, truncate=True):
    """Makes Gaussian kernel density maps.

    Parameters
    ----------
    craters : pandas.DataFrame
        Craters dataframe that includes pixel x and y columns.
    img : numpy.ndarray
        Original image; assumes the colour channel is the last axis
        (tf standard).
    kernel : function, "knn" or None
        If a function is passed in, it must return an array of length
        craters.shape[0]. If "knn", uses a variable kernel with
        sigma = beta * <d_knn>, where <d_knn> is the mean Euclidean distance
        to the k = knn nearest neighbouring craters. Anything else uses a
        constant kernel size with sigma = k_sig.
    k_support : int
        Kernel support (i.e. size of the kernel stencil) coefficient.
        Support is determined by kernel_support = k_support * sigma.
        Defaults to 8.
    k_sig : float
        Sigma for the constant-sigma kernel. Defaults to 4.
    knn : int
        Number of nearest neighbours used for the "knn" kernel.
        Defaults to 10.
    beta : float
        Beta value used to calculate sigma for the "knn" kernel.
        Defaults to 0.3.
    kdict : dict
        If kernel is a custom function, dictionary of arguments passed to it.
    truncate : bool
        If True, truncate the mask where the image truncates.
    """
    # Assumes: import numpy as np; kd is a KD-tree constructor (e.g.
    # scipy.spatial.cKDTree); gkern and get_merge_indices are helpers
    # defined elsewhere (sketched below).

    # Load blank density map.
    imgshape = img.shape[:2]
    dmap = np.zeros(imgshape)

    # Get number of craters.
    N_ctrs = craters.shape[0]

    # Obtain Gaussian kernel sigma values; callable() checks whether kernel
    # is a function.
    if callable(kernel):
        sigma = kernel(**kdict)
    # If the knn kernel is used.
    elif kernel == "knn":
        # If we have more than one crater, use the mean distance to the knn
        # nearest neighbours (or all of them, if there are fewer than knn).
        if N_ctrs > 1:
            kdt = kd(craters[["x", "y"]].as_matrix(), leafsize=10)
            dnn = kdt.query(craters[["x", "y"]].as_matrix(),
                            k=min(N_ctrs, knn + 1))[0][:, 1:].mean(axis=1)
        # Otherwise, assume there are craters "offscreen" half an image away.
        else:
            dnn = 0.5 * imgshape[0] * np.ones(1)
        sigma = beta * dnn
    else:
        sigma = k_sig * np.ones(N_ctrs)

    # Gaussian adding loop.
    for i in range(N_ctrs):
        cx = int(craters["x"][i])
        cy = int(craters["y"][i])

        # A bit convoluted, but this ensures that kernel_support is always
        # odd, so that the centre of the Gaussian falls on a pixel.
        ks_half = int(k_support * sigma[i] / 2)
        kernel_support = ks_half * 2 + 1
        # Named kernel_stencil to avoid shadowing the kernel argument.
        kernel_stencil = gkern(kernel_support, sigma[i])

        # Calculate the indices on the image where the kernel should be
        # added.
        [imxl, imxr, gxl, gxr] = get_merge_indices(cx, imgshape[1],
                                                   ks_half, kernel_support)
        [imyl, imyr, gyl, gyr] = get_merge_indices(cy, imgshape[0],
                                                   ks_half, kernel_support)

        # Add the kernel to the image.
        dmap[imyl:imyr, imxl:imxr] += kernel_stencil[gyl:gyr, gxl:gxr]

    # Remove density where the image itself is blank (truncated).
    if truncate:
        if img.ndim == 3:
            dmap[img[:, :, 0] == 0] = 0
        else:
            dmap[img == 0] = 0

    return dmap
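# gkern and get_merge_indices are referenced above but not included in this
# snippet. The sketches below are plausible stand-in implementations
# consistent with how they are called; the exact originals may differ.
import numpy as np


def gkern_sketch(kernlen, sigma):
    # A (kernlen x kernlen) 2D Gaussian centred on the middle pixel
    # (kernlen is constructed to be odd above), normalised to sum to 1.
    ax = np.arange(kernlen) - kernlen // 2
    xx, yy = np.meshgrid(ax, ax)
    g = np.exp(-(xx ** 2 + yy ** 2) / (2. * sigma ** 2))
    return g / g.sum()


def get_merge_indices_sketch(c, imglen, ks_half, kernlen):
    # Clip the kernel stencil [c - ks_half, c + ks_half + 1) against the
    # image axis [0, imglen), returning matching image- and kernel-space
    # slice bounds so that the two slices always have equal shape.
    iml = max(c - ks_half, 0)
    imr = min(c + ks_half + 1, imglen)
    gl = iml - (c - ks_half)
    gr = kernlen - ((c + ks_half + 1) - imr)
    return [iml, imr, gl, gr]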
def eval_pop(population):
    """Score the novelty of each individual against population + archive."""
    # Assumes: import numpy; kd is a KD-tree constructor (e.g.
    # scipy.spatial.cKDTree); archive is a module-level list of previously
    # archived individuals.
    data = numpy.vstack([k.behavior for k in population + archive])
    tree = kd(data)
    for art in population:
        eval_ind_k(art, tree)
    return tree
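# A minimal usage sketch for eval_pop. The Art class, the module-level
# archive, and this eval_ind_k body are hypothetical scaffolding that shows
# the expected wiring: individuals expose a .behavior array and are scored
# in place against the population + archive tree.
import numpy
from scipy.spatial import cKDTree as kd


class Art(object):
    def __init__(self, behavior):
        self.behavior = numpy.asarray(behavior, dtype=float)


def eval_ind_k(art, tree, k=10):
    # Hypothetical scorer: store the mean distance to the k nearest
    # neighbours (skipping the zero self-distance) on the individual.
    dists, _ = tree.query(art.behavior, k=min(k + 1, tree.n))
    art.novelty = numpy.mean(numpy.atleast_1d(dists)[1:])


archive = [Art([0.0, 0.0])]
population = [Art([1.0, 2.0]), Art([3.0, 1.0])]
tree = eval_pop(population)
print([round(a.novelty, 3) for a in population])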