def gower_distance(X):
    """Compute the pairwise Gower distance between the rows of a pandas DataFrame.

    The DataFrame is expected to contain the features along the columns. Columns of
    object dtype are treated as nominal variables; all other columns are treated as
    numeric variables.

    Distance metrics used:
    - Nominal variables: Dice distance
      (https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient)
    - Numeric variables: Manhattan distance normalized by the range of the variable
      (https://en.wikipedia.org/wiki/Taxicab_geometry)
    """
    individual_variable_distances = []
    for i in range(X.shape[1]):
        feature = X.iloc[:, [i]]
        if feature.dtypes[0] == object:  # np.object is removed in recent NumPy
            feature_dist = DistanceMetric.get_metric('dice').pairwise(pd.get_dummies(feature))
        else:
            feature_dist = DistanceMetric.get_metric('manhattan').pairwise(feature) / np.ptp(feature.values)
        individual_variable_distances.append(feature_dist)
    return np.array(individual_variable_distances).mean(0)
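# A minimal usage sketch for gower_distance with an illustrative toy DataFrame.
# It assumes the same imports the snippet relies on: numpy as np, pandas as pd,
# and DistanceMetric from sklearn.neighbors (moved to sklearn.metrics in newer
# scikit-learn releases).
import numpy as np
import pandas as pd
from sklearn.neighbors import DistanceMetric

toy = pd.DataFrame({
    "color": ["red", "blue", "red"],     # object dtype -> Dice distance
    "height": [150.0, 180.0, 165.0],     # numeric -> range-normalized Manhattan
})
D = gower_distance(toy)
print(D.shape)  # (3, 3) symmetric matrix with zeros on the diagonal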
def calc_mahalanobis(x, y, n_neighbors):
    """Find the n_neighbors nearest rows of x for each row of y under the Mahalanobis metric."""
    from sklearn.neighbors import NearestNeighbors

    # Note: np.cov treats rows as variables by default; pass rowvar=False if x is
    # shaped (n_samples, n_features) and the covariance of the features is intended.
    nn = NearestNeighbors(n_neighbors=n_neighbors, algorithm='brute',
                          metric='mahalanobis', metric_params={'V': np.cov(x)})
    return nn.fit(x).kneighbors(y)
from numpy.linalg import LinAlgError  # needed for the singular-covariance fallback


def mahalonobis(X):
    cov = np.cov(X, rowvar=0)
    try:
        # Fall back to Euclidean when there is only a single sample (covariance undefined).
        metric = DistanceMetric.get_metric('mahalanobis', V=cov) if X.shape[0] > 1 \
            else DistanceMetric.get_metric('euclidean')
    except LinAlgError:
        # Singular covariance matrix: fall back to the Euclidean distance.
        metric = DistanceMetric.get_metric('euclidean')

    def distance(x, y):
        return metric.pairwise([x], [y])[0][0]
    return distance
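# A brief usage sketch for the mahalonobis factory above, on synthetic data; the
# returned callable can also be passed to BallTree or NearestNeighbors as a
# user-defined metric.
import numpy as np

rng = np.random.default_rng(1)
X = rng.normal(size=(50, 4))
d = mahalonobis(X)
print(d(X[0], X[1]))  # scalar Mahalanobis distance between the first two rows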
def get_full_metric(self, train_pairs):
    # Flatten the list of pairs, fit PCA, and project the points.
    train_pairs_flat = [item for subtuple in train_pairs for item in subtuple]
    pca = PCA(n_components=self.pca_components)
    pca.fit(train_pairs_flat)
    train_pairs_pca_flat = pca.transform(train_pairs_flat)

    # Re-group the projected points into (a, b) pairs.
    train_pairs_pca = list()
    for i in range(0, len(train_pairs_pca_flat), 2):
        a = i
        b = i + 1
        train_pairs_pca.append((train_pairs_pca_flat[a], train_pairs_pca_flat[b]))

    # Learn the Mahalanobis matrix M by solving the CVX problem externally.
    ys = ys_from_pairs(train_pairs_pca)
    file_id = str(random.random())[2:]
    save_cvx_params(ys, file_id)
    run_cvx(file_id)
    M = load_cvx_result(file_id)
    dist = DistanceMetric.get_metric('mahalanobis', VI=M)
    return dist, M, pca
def dist(X, Y, distance_function = "euclidean"): """calculate X, Y distance matrix [Args] ------ X : m samples Y : n samples distance_function : user_defined distance [Returns] --------- distance_matrix: n * m distance matrix we have those built-in function. Default = euclidean "euclidean" EuclideanDistance sqrt(sum((x - y)^2)) "manhattan" ManhattanDistance sum(|x - y|) "chebyshev" ChebyshevDistance sum(max(|x - y|)) "minkowski" MinkowskiDistance sum(|x - y|^p)^(1/p) "wminkowski" WMinkowskiDistance sum(w * |x - y|^p)^(1/p) "seuclidean" SEuclideanDistance sqrt(sum((x - y)^2 / V)) "mahalanobis" MahalanobisDistance sqrt((x - y)' V^-1 (x - y)) """ distance_calculator = DistanceMetric.get_metric(distance_function) return distance_calculator.pairwise(X, Y)
def euclid(_):
    metric = DistanceMetric.get_metric('euclidean')

    def distance(x, y):
        return metric.pairwise([x], [y])[0][0]
    return distance
def run_single_trial(self, train_pairs, test_pairs, train_tune_data, test_tune_data): print "Running PCA..." train_pairs_pca, test_pairs_pca = self.fit_pca(train_pairs, test_pairs) ys = ys_from_pairs(train_pairs_pca) file_id = str(random.random())[2:] save_cvx_params(ys, file_id) run_cvx(file_id) M = load_cvx_result(file_id) dist = DistanceMetric.get_metric('mahalanobis', VI = M) train_a_sections = [x[0] for x in train_pairs_pca] train_b_sections = [x[1] for x in train_pairs_pca] test_a_sections = [x[0] for x in test_pairs_pca] test_b_sections = [x[1] for x in test_pairs_pca] train_given_sections = train_a_sections train_to_match_sections = train_b_sections test_given_sections = test_a_sections test_to_match_sections = test_b_sections if self.match_a_to_b: train_given_sections = train_b_sections train_to_match_sections = train_a_sections test_given_sections = test_b_sections test_to_match_sections = test_a_sections print "Constructing BallTrees..." train_bt = BallTree(train_to_match_sections, metric=dist) test_bt = BallTree(test_to_match_sections, metric=dist) train_top_fraction = int(len(train_given_sections) * self.correct_within_top_fraction) test_top_fraction = int(len(test_given_sections) * self.correct_within_top_fraction) print "Querying the BallTrees..." train_result = train_bt.query(train_given_sections, train_top_fraction) test_result = test_bt.query(test_given_sections, test_top_fraction) print "Looking at correctness of results..." train_correct = sum([int(i in train_result[1][i]) for i in xrange(len(train_given_sections))]) test_correct = sum([int(i in test_result[1][i]) for i in xrange(len(test_given_sections))]) print "Finding indices of correct matches..." test_result_full = test_bt.query(test_given_sections, len(test_given_sections)) def default_index(lst, i): ind = -1 try: ind = lst.index(i) except: pass return ind test_indices = [default_index(list(test_result_full[1][i]), i) for i in xrange(len(test_given_sections))] test_indices = [x for x in test_indices if x != -1] with open("successful_tunes_{}".format(file_id), 'w') as successful_tunes_f: for i, index in enumerate(test_indices): if index == 0: successful_tunes_f.write(str(test_tune_data[i]) + '\n\n') return [[train_correct, len(train_given_sections)], [test_correct, len(test_given_sections)]], test_indices
def standardizedEulideanDistance(wide, p): """ Calculate the standardized Euclidean distance and return an array of distances to the center and a matrix of pairwise distances. :Arguments: :type wide: pandas.DataFrame :param wide: A wide formatted data frame with samples as columns and compounds as rows. :Returns: :return: Return 4 pd.DataFrames with SED values and cutoffs. :rtype: pd.DataFrames """ # Estimated Variance from the data varHat = wide.var(axis=1, ddof=1) varHat[varHat==0] = 1 dist = DistanceMetric.get_metric('seuclidean', V=varHat) # Column means colMean = wide.mean(axis=1) # Calculate the standardized Euclidean Distance from all samples to the center SEDtoCenter = dist.pairwise(wide.values.T, pd.DataFrame(colMean).T) SEDtoCenter = pd.DataFrame(SEDtoCenter, columns = ['SED_to_Center'], index = wide.columns) # Calculate the pairwise standardized Euclidean Distance of all samples SEDpairwise = dist.pairwise(wide.values.T) SEDpairwise = pd.DataFrame(SEDpairwise, columns = wide.columns, index = wide.columns) for index, row in SEDpairwise.iterrows(): SEDpairwise.loc[index, index] = np.nan # Calculate cutoffs # For SEDtoCenter: # Beta: sqrt((p-1)^2/p*(sum of n iid Beta(1/2, p/2))); (It's the exact distribution.) # Normal: sqrt(N((p-1)/p*n, 2*(p-2)*(p-1)^2/p^2/(p+1)*n)); (It's normal approximation. Works well when n is large.) # Chisq: sqrt((p-1)/p*Chi-sq(n)); (It's Chi-sq approximation. Works well when p is decent and p/n is not small.) # For SEDpairwise: # Beta: sqrt(2*(p-1)*(sum of n iid Beta(1/2, p/2))); # Normal: sqrt(N(2*n, 8*(p-2)/(p+1)*n)); # Chisq: sqrt(2*Chi-sq(n)); # where n = # of compounds and p = # of samples pSamples = float(wide.shape[1]) nFeatures = float(wide.shape[0]) nIterate = 20000 #100000 #p = 0.95 betaP = np.percentile(pd.DataFrame(stats.beta.rvs(0.5, 0.5*(pSamples-2), size=nIterate*nFeatures).reshape(nIterate, nFeatures)).sum(axis=1), p*100) betaCut1 = np.sqrt((pSamples-1)**2/pSamples*betaP) normCut1 = np.sqrt(stats.norm.ppf(p, (pSamples-1)/pSamples*nFeatures, np.sqrt(2*nFeatures*(pSamples-2)*(pSamples-1)**2/pSamples**2/(pSamples+1)))) chisqCut1 = np.sqrt((pSamples-1)/pSamples*stats.chi2.ppf(p, nFeatures)) betaCut2 = np.sqrt((pSamples-1)*2*betaP) normCut2 = np.sqrt(stats.norm.ppf(p, 2*nFeatures, np.sqrt(8*nFeatures*(pSamples-2)/(pSamples+1)))) chisqCut2 = np.sqrt(2*stats.chi2.ppf(p, nFeatures)) cutoff1 = pd.DataFrame([[betaCut1, normCut1, chisqCut1]], columns=['Beta(Exact)', 'Normal', 'Chi-sq']) cutoff2 = pd.DataFrame([[betaCut2, normCut2, chisqCut2]], columns=['Beta(Exact)', 'Normal', 'Chi-sq']) # TODO: Create a flag based on values greater than one of the cutoffs. return SEDtoCenter, cutoff1, SEDpairwise, cutoff2
def example2():
    """Using a customized distance."""
    from HSH.Misc.shgeo import dist

    def earthdist(x, y):
        # latitude, longitude -> earth surface distance
        return dist((x[0], x[1]), (y[0], y[1]))

    dist_cal = DistanceMetric.get_metric(earthdist)
    train = np.array([[32.5, 101.0], [32.5, 102.0]])
    test = np.array([[31.5, 101.0], [39.5, 101.0]])
    print(dist_cal.pairwise(train, test))
def example1():
    dist = DistanceMetric.get_metric("euclidean")
    train = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    test = np.array([[0.5, 0.5], [-0.5, -0.5]])

    distance_matrix = dist.pairwise(train, test)
    print(distance_matrix)

    # For Euclidean, the reduced distance is the squared distance.
    reduced_distance_matrix = dist.dist_to_rdist(distance_matrix)
    print(reduced_distance_matrix)
    print(dist.rdist_to_dist(reduced_distance_matrix))
def load_model(self): if self.file_cache and os.path.isfile(self.file_cache): self._log.debug("Loading mode: %s", self.file_cache) with numpy.load(self.file_cache) as cache: tail = tuple(cache['tail']) s = (cache['data_arr'], cache['idx_array_arr'], cache['node_data_arr'], cache['node_bounds_arr']) +\ tail + (DistanceMetric.get_metric('hamming'),) #: :type: sklearn.neighbors.BallTree self.bt = BallTree.__new__(BallTree) self.bt.__setstate__(s) self._log.debug("Loading mode: Done")
def distance(X, distance_measure='euclidean'):
    X = np.array(X)
    if distance_measure in SKLEARN_METRICS:
        distance_ = DistanceMetric.get_metric(distance_measure).pairwise(X)
    elif distance_measure == 'pearson':  # equality check, not identity ('is')
        # Note: np.corrcoef returns a correlation (similarity) matrix, not a distance matrix.
        distance_ = np.corrcoef(X)
    else:
        distance_ = None
    return distance_
def entropy(x, k=3, base=np.exp(1), intens=1e-10):
    """The classic K-L (Kozachenko-Leonenko) k-nearest-neighbor continuous entropy estimator.

    x should be a list of vectors, e.g. x = [[1.3], [3.7], [5.1], [2.4]] if x is
    one-dimensional and we have four samples.
    """
    assert k <= len(x) - 1, "Set k smaller than num. samples - 1"
    d = len(x[0])
    N = len(x)
    x = np.asarray(x, dtype=float) + intens * nr.rand(N, d)  # add small noise to break ties
    tree = KDTree(x, metric=DistanceMetric.get_metric("minkowski", p=np.float64('inf')))
    nn = tree.query(x, k + 1)[0][:, k]  # distance to the k-th nearest neighbor
    const = digamma(N) - digamma(k) + d * log(2)
    return (const + d * np.mean(np.log(nn))) / log(base)
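# A hedged sanity check for the estimator above: for N(0, 1) the differential
# entropy is 0.5 * log(2 * pi * e) ~= 1.42 nats, so the estimate should land near
# that value for a reasonably large sample. The snippet's own imports are assumed
# (numpy.random as nr, scipy.special.digamma, math.log, sklearn's KDTree and
# DistanceMetric); note that some scikit-learn versions reject p=inf for the
# Minkowski metric and expect 'chebyshev' instead.
import numpy as np

samples = [[v] for v in np.random.randn(2000)]
print(entropy(samples, k=3))           # roughly 1.42
print(0.5 * np.log(2 * np.pi * np.e))  # analytic value for a standard normal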
def find_computed_cluster_metrics(self): """Initialises cluster metric computation over every cluster that is found by the given clustering algorithm. """ for cluster in self.computed_clusters: cluster.compute_metrics(self.original_corpus, self.original_article_pos) centroid_locs = [x.centroid for x in self.computed_clusters] dist = DistanceMetric.get_metric('euclidean') dist_pair = dist.pairwise(centroid_locs) self.max_centroid_dist = max(list(itertools.chain.from_iterable( dist_pair)))
def __init__(self, proc, Xss, yss, valid_set=0.1, validation_set=None): self.seen_states = set() self.state_set = [] self.proc = proc self.valid_set = 0.1 self.surr_loss = DistanceMetric.get_metric('hamming') if validation_set is None: self._split(Xss, yss) else: self.Xss = Xss self.yss = yss self.valid_Xss, self.valid_yss = validation_set
def __max_score_mapping(rr, predicted, test, max_angle=1.0e-2):
    # max_angle: maximum angular separation (radians) for a match
    angle = DistanceMetric.get_metric("pyfunc", func=spherical_angle)
    d = angle.pairwise(predicted, test)
    # Each true sample maps to the closest prediction.
    test_mapping = np.zeros(shape=(test.shape[0],), dtype=float)
    for i in range(test.shape[0]):
        if np.any(d[:, i] < max_angle):
            close_predictions = d[:, i] < max_angle
            scores = [rr(p) for p in predicted[close_predictions, :]]
            test_mapping[i] = np.max(scores)
    return test_mapping
def update_prediction(prediction, real_pic, metric_name='euclidean'): """ Update a prediction after receiving the actual picture from the webcam. Parameters ---------- prediction : Prediction The model object of the prediction to update real_pic : Picture The model object of the actual picture received Return ------ float : the prediction error """ pred_pic = prediction.as_picture() cam_id = prediction.params.webcam.webcam_id if metric_name == 'wminkowski-pca': with webcam_fs.get_dataset(cam_id) as dataset: if 'pca' not in dataset.imgset.feature_sets: raise ValueError("""wminkowski-pca cannnot be used without a PCA feature set""") pca_extractor = dataset.imgset.feature_sets['pca'].extractor weights = pca_extractor.pca.explained_variance_ratio_ pred_data = pca_extractor.extract(pred_pic.pixels) real_data = pca_extractor.extract(real_pic.pixels) metric = DistanceMetric.get_metric('wminkowski', p=2, w=weights) else: pred_data = pred_pic.pixels real_data = real_pic.pixels metric = DistanceMetric.get_metric(metric_name) error = metric.pairwise([pred_data], [real_data])[0] prediction.error = error prediction.save() return error
def entropy(data, ball='euclidean', k=1, units='nats'): """ Estimates the entropy of the given data using the k-nearest neighbors method input ----- data (nd-array): An (n by p) matrix containing n samples of p-dimensional data ball (string): Which ball (e.g. l1, euclidean, etc.) to use when computing the volume. Acceptable strings include: 'l1' : l1 or Manhattan distance 'l2' : l2 or Euclidean distance; default 'linf' : l-infinity or Chebyshev distance k (integer): How many nearest-neighbors to use when computing radii. Must be at least 1. units (string): Which unit the entropy output has. Acceptable strings include: 'nats' : base e 'bits' : base 2 """ # Get number of samples and dimensionality (n,p) = data.shape # Determine radii and volumes for a given metric space metric = getball(ball) if metric == 1: m = 'manhattan' elif metric == 2: m = 'euclidean' elif metric == inf: m = 'chebyshev' dist = DistanceMetric.get_metric(m) D_mat = dist.pairwise(data) D_mat.sort(axis=1) radii = D_mat[:,k] Vs = volume(radii, ball=str(metric), dimension=p) if units.lower() == 'nats': return sum([np.log(vol) for vol in Vs])/float(n) + np.log(n) - L(k - 1) + 0.577215665 if units.lower() == 'bits': return sum([np.log2(vol) for vol in Vs])/float(n) + np.log2(n) - L(k - 1) + 0.577215665
def __mappings(predicted, test, max_angle=1.0e-2):
    # max_angle: maximum angular separation (radians) for a match
    angle = DistanceMetric.get_metric("pyfunc", func=spherical_angle)
    d = angle.pairwise(predicted, test)
    # Each true sample maps to the closest prediction.
    test_mapping = np.zeros(shape=(test.shape[0],), dtype=int)
    predicted_mapping = np.zeros(shape=(predicted.shape[0],), dtype=int)
    for i in range(test.shape[0]):
        test_mapping[i] = 1 if np.any(d[:, i] < max_angle) else 0
    for i in range(predicted.shape[0]):
        predicted_mapping[i] = 1 if np.any(d[i, :] < max_angle) else 0
    return predicted_mapping, test_mapping
def mi_LNC(x,y,k=5,base=np.exp(1),alpha=0.25,intens = 1e-10,metric='minkowski',p=np.float64('inf')): '''The mutual information estimator by PCA-based local non-uniform correction(LNC) ith row of X represents ith dimension of the data, e.g. X = [[1.0,3.0,3.0],[0.1,1.2,5.4]], if X has two dimensions and we have three samples alpha is a threshold parameter related to k and d(dimensionality), please refer to our paper for details about this parameter ''' #N is the number of samples N = x.shape[0] #First Step: calculate the mutual information using the Kraskov mutual information estimator #adding small noise to X, e.g., x<-X+noise x += intens*nr.rand(x.shape[0],x.shape[1]) y += intens*nr.rand(x.shape[0],x.shape[1]) points = np.hstack((x,y)) tree = KDTree(points, metric=DistanceMetric.get_metric(metric, p=p)) try: dvec, knn_idx = tree.query(points, k+1) # no need to reshape with new query_radius method except ValueError: return (float("NaN")) a = MI.avgdigamma(x,dvec[:,-1]*x.shape[1]/points.shape[1], metric=metric, p=p) b = MI.avgdigamma(y,dvec[:,-1]*y.shape[1]/points.shape[1], metric=metric, p=p) c = digamma(k) d = digamma(len(x)) # a,b,c,d = MI.avgdigamma(x,dvec), MI.avgdigamma(y,dvec), digamma(k), digamma(len(x)) # print("ee_acc: %s, %s, %s, %s" %( a,b,c,d)) ret = (-a-b+c+d)/np.log(base) # LNC correction logV_knn = np.sum(np.log(np.abs(points - points[knn_idx[:,-1],:])), axis=1) logV_projected = np.zeros(logV_knn.shape) for i in range(points.shape[0]): knn_points = points[knn_idx[i,:],:] knn_centered = knn_points - points[i,:] u,s,v = la.svd(knn_centered) knn_proj = knn_centered.dot(v.T) max_dims = np.max(np.abs(knn_proj), axis=0) # max-norm per dimension logV_projected[i] = np.sum(np.log(max_dims)) diff = logV_projected - logV_knn if (alpha>1): alpha = 1 diff[diff >= log(alpha)] = 0 e = -np.sum(diff) / N return (ret + e)/log(base);
def distance_from_most_visited_place(place, user): q = select([func.count(),visits_10min.c.placeid]).where(visits_10min.c.userid == user).group_by(visits_10min.c.placeid).order_by(func.count().desc()) most_visited_places = [r[1] for r in connection.execute(q).fetchall()] def get_lat_long(place_q): try: return connection.execute(select([places_location.c.longitude, places_location.c.latitude]).where(and_(places_location.c.placeid == place_q, places_location.c.userid == user))).fetchall()[0] except Exception as e: return None dist = DistanceMetric.get_metric('haversine') X = [] X.append(get_lat_long(place)) for p in most_visited_places: ret = get_lat_long(p) if ret is not None: X.append((ret[0], ret[1])) break return dist.pairwise(X)[0][1]
def calcMDS(pltnum, flag, dmetric): if flag == 1: clf = PCA(n_components=5) Y = clf.fit_transform(X) title = 'PCA-MDS' elif flag == 2: clf = TruncatedSVD(n_components=5) Y = clf.fit_transform(X) else: Y = X title = 'MDS DistanceMetric: ' + str(dmetric) dist = DistanceMetric.get_metric(dmetric) Y = dist.pairwise(Y) # Y = euclidean_distances(Y) mds = manifold.MDS(n_components=2, dissimilarity='precomputed')#, init='pca', random_state=0) Y = mds.fit_transform(Y) for i in range(1, 3): mdsPlot(int(str(pltnum) + str(i)), i, Y, title)
def MDS(self,typeof='classic',dist=False,groups=None,dpi=300,textsize=10,interactive=False, samemarker=False,markersize=8,numbered=False,legend=False,of='pdf',rotate=0,MD=False): ''' Perform Multidimensional Scaling wither classic (PCoA) or non-metric. If you have the upper triangle of a distance matrix as a dictionary, pass the dictionary as dist. ''' # Rotation instance self.clf = PCA(n_components=self.ncomp) seed = np.random.RandomState(seed=3) if typeof == 'classic': metric = True self.type = 'cMDS' else: metric = False self.type = "nMDS" if dist: similarities=self.dict2array2matrix(dist) else: #similarities = euclidean_distances(self.data) dist = DistanceMetric.get_metric('euclidean') similarities = dist.pairwise(self.data) # Initiate multidimensional scaling mds = manifold.MDS(n_components=self.ncomp, metric = metric, max_iter=3000, eps=1e-9, random_state=seed, dissimilarity="precomputed", n_jobs=-1) #fit the data the MDS pos = mds.fit(similarities).embedding_ if typeof != 'classic': pos = mds.fit_transform(similarities, init=pos) # Rescale the data pos *= np.sqrt((np.array(self.data)** 2).sum()) / np.sqrt((pos ** 2).sum()) # Rotate the data self.fit = self.clf.fit_transform(pos) self.Plot(dpi=dpi,textsize=textsize,interactive=interactive,samemarker=samemarker, markersize=markersize,numbered=numbered,legend=legend,of=of,rotate=rotate, groups=groups,MD=MD)
def compute_metrics(self, corpus, article_pos): """Computes metrics for the given cluster. Metrics computed are: diameter, radius, centroid, closest article to centroid, the distance of the closest article to the centroid. Args: corpus: A corpus in LSI space article_pos (dict): Maps the article id to the actual positions of the article in the corpus """ dist_corpus = [corpus[article_pos[x]] for x in self.articles_id] # Centroid calculation self.centroid = np.average(dist_corpus, axis=0) # Diameter calculation dist = DistanceMetric.get_metric('euclidean') dist_pair = dist.pairwise(dist_corpus) self.diameter = max(list(itertools.chain.from_iterable(dist_pair))) # Radius calculation dist_corpus.append(self.centroid) dist_pair = dist.pairwise(dist_corpus) centroid_dist = [x for x in dist_pair[-1] if x > 0] if len(centroid_dist) > 0: self.radius = max(centroid_dist) # Closest article computation closest_article = self.articles_id[0] min_dist = self.radius tmp_content = [] for k, id in enumerate(self.articles_id): if centroid_dist[k] < min_dist: closest_article = id min_dist = centroid_dist[k] tmp_content = self.data[k] self.closest_article_id = closest_article self.closest_article_distance = min_dist self.closest_article_content = tmp_content
def predict_proba(self, X): # Check is fit had been called by confirming that the distances_ dictionary has been set up check_is_fitted(self, ['distances_']) # Check that the input features match the type and shape of the training features X = check_array(X) # Initialise an array to store the prediction scores generated predictions = np.zeros((len(X), len(self.classes_))) distance_metric_model = DistanceMetric.get_metric(self.dist_param) unique = [] dist = [] for i in self.distances_.keys(): unique.append(i) dist.append(self.distances_.get(i)) # Iterate through the query instances in the query dataset predictions_prob = [] my_dict = dict(zip(unique, dist)) for instance in X: prob_dist = [] for item in my_dict: current_label = my_dict[item] array = np.vstack((current_label, instance)) dist = distance_metric_model.pairwise(array) var = np.amin(np.array(dist)[dist != np.amin(dist)]) prob_dist.append(1 / var) sum_value = sum(prob_dist) dict_model_prob = dict(zip(unique, (prob_dist / sum_value))) predictions_prob.append(dict_model_prob) return predictions_prob
def _test(y_pred, y, batch_size): def update_fn(engine, batch): idx = (engine.state.iteration - 1) * batch_size y_true_batch = np_y[idx:idx + batch_size] y_pred_batch = np_y_pred[idx:idx + batch_size] return torch.from_numpy(y_pred_batch), torch.from_numpy( y_true_batch) engine = Engine(update_fn) m = CanberraMetric() m.attach(engine, "cm") np_y = y.numpy().ravel() np_y_pred = y_pred.numpy().ravel() canberra = DistanceMetric.get_metric("canberra") data = list(range(y_pred.shape[0] // batch_size)) cm = engine.run(data, max_epochs=1).metrics["cm"] assert canberra.pairwise([np_y_pred, np_y])[0][1] == pytest.approx(cm)
def fit_predict(self, X): # definition of distance metric dist = DistanceMetric.get_metric(self.metric) # initialization of KDTree with corresponding metric tree = KDTree(X, metric=dist) cluster_counter = -1 X = self.data2Point(X) for i, point in enumerate(X): if point.label == -2: neigh_ind, _ = tree.query_radius([point.coordinate], r=self.eps, return_distance=True, sort_results=True) neigh_ind = neigh_ind[0] # we mark points with less neighbors min_samples as noise/outlier if neigh_ind.size < self.min_samples: X[i].label = -1 else: cluster_counter += 1 X[i].label = cluster_counter neigh_ind = neigh_ind[1:].tolist() for j in neigh_ind: # we mark neighbors that don't belong to any cluster as current cluster if X[j].label < 0: X[j].label = cluster_counter q_neigh_ind, _ = tree.query_radius( [X[j].coordinate], r=self.eps, return_distance=True, sort_results=True) q_neigh_ind = q_neigh_ind[0].tolist() # we add current node as neighbor if density of neighbors is high enough if len(q_neigh_ind) >= self.min_samples: new_el = list( set(q_neigh_ind).difference( set(neigh_ind))) neigh_ind.extend(new_el) return np.array([x.label for x in X])
def closest_target(self, agent_states): """ Compute the nearest neighbor based on the manhattan distance """ agent_states = np.array(list(agent_states.values())) all_together = np.vstack((self.current_state, agent_states)) dist = DistanceMetric.get_metric('chebyshev') distances = dist.pairwise(all_together) # Compute distances for two dirs (Periodic Boundary) - row 0 = pred id_dist_1 = np.argsort(distances[0, :])[1] id_dist_2 = np.flip(np.argsort(self.obs_space_size - distances[0, :]))[1] if id_dist_1 == id_dist_2: target_agent_id = id_dist_1 else: d1 = distances[0, id_dist_1] d2 = self.obs_space_size - distances[0, id_dist_2] target_agent_id = id_dist_1 if d1 < d2 else id_dist_2 # Subtract one (since we included pred in dist calc) to get agent_id return target_agent_id - 1
def _test(y_pred, y, batch_size): def update_fn(engine, batch): idx = (engine.state.iteration - 1) * batch_size y_true_batch = np_y[idx:idx + batch_size] y_pred_batch = np_y_pred[idx:idx + batch_size] return idx, torch.from_numpy(y_pred_batch), torch.from_numpy( y_true_batch) engine = Engine(update_fn) m = ManhattanDistance(output_transform=lambda x: (x[1], x[2])) m.attach(engine, "md") np_y = y.numpy() np_y_pred = y_pred.numpy() manhattan = DistanceMetric.get_metric("manhattan") data = list(range(y_pred.shape[0] // batch_size)) md = engine.run(data, max_epochs=1).metrics["md"] assert manhattan.pairwise([np_y_pred, np_y])[0][1] == pytest.approx(md)
def predict(self, X_arr):
    """Conventional kNN prediction that uses a weighted summation of distances.

    :param X_arr: list of per-view feature arrays for the query samples
    :return: predicted labels
    """
    from sklearn.neighbors import DistanceMetric as DM
    dis = DM.get_metric('euclidean')

    distances = []
    for i in range(len(X_arr)):
        # Force both query and stored features into the same range before comparing.
        X_arr[i], self.X_arr[i] = normalize(X_arr[i]), normalize(self.X_arr[i])
        distances.append(dis.pairwise(X_arr[i], self.X_arr[i]))
    distances = np.array(distances)

    multi_dis = np.zeros((distances.shape[1], distances.shape[2]))
    for w, d in zip(self.weights, distances):
        multi_dis += w * d  # weighted sum of per-view distances

    sorted_ss_indicies = multi_dis.argsort()  # sort by distance
    k_neighbors_lables = self.y[sorted_ss_indicies][:, :self.k]

    from scipy.stats import mode
    y_predicted, t = mode(k_neighbors_lables, axis=1)
    return y_predicted.reshape(-1)
def euclidean_distance(dataset1: pd.DataFrame, dataset2: pd.DataFrame) -> float:
    """Pair up the datasets and compute the Euclidean distances between the sequences of values.

    Both datasets must have the same columns.

    :param dataset1: First DataFrame.
    :param dataset2: Second DataFrame.
    :return: Sum of Euclidean distances over all paired slices, or -1 if the
        datasets have a different number of rows.
    """
    dist = DistanceMetric.get_metric('euclidean')
    if not len(dataset1.index) == len(dataset2.index):
        return -1
    distance = 0
    for i in range(0, len(dataset1.index)):
        # Note: iloc[:, i:i+1] selects the i-th column; use iloc[i:i+1, :] if the
        # intention is to compare row by row.
        data_row1 = dataset1.iloc[:, i:i + 1].transpose()
        data_row2 = dataset2.iloc[:, i:i + 1].transpose()
        ecl_dist = dist.pairwise(data_row1, data_row2)
        distance = distance + ecl_dist
    return distance
def compute_normals(_pcd, height_encoded=True, max_dist=0.1, n_neighbors=9, n_iter=20): _zero_array = np.array([0.0, 0.0, 0.0]) n_neighbors += 1 # It will always find itself too if len(_pcd.shape) == 3: n_elem = _pcd.shape[0] * _pcd.shape[1] else: n_elem = _pcd.shape[0] pcd = _pcd.copy() pcd = pcd.reshape(n_elem, 3) tree = KDTree(pcd, leaf_size=10) metric = DistanceMetric.get_metric('euclidean') distances, indices = tree.query(pcd, k=n_neighbors) normals = numba_compute_normals(n_elem, indices, distances, max_dist, pcd) normals = normals.reshape(_pcd.shape) return normals
def calc_distance_matrix(X, method):
    if method in ['chebyshev', 'euclidean', 'l1', 'l2']:
        DM = DistanceMetric.get_metric(method).pairwise(X)
    elif method in ['cosine']:
        DM = pairwise.cosine_distances(X)
    elif method in ['correlation', 'cityblock', 'braycurtis', 'canberra',
                    'hamming', 'jaccard', 'kulsinski']:
        DM = squareform(pdist(X, method))
    elif method in ['minkowski3']:
        DM = squareform(pdist(X, 'minkowski', p=3))
    elif method in ['dot']:
        DM = squareform(pdist(X, lambda u, v: np.dot(u, v)))
    elif method in ['emd']:
        from scipy.stats import wasserstein_distance
        l = len(X)
        DM = np.zeros((l, l))
        for x in range(l):
            for y in range(l):
                DM[x, y] = wasserstein_distance(X[x], X[y])
    else:
        return None
    return DM
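# A short usage sketch for calc_distance_matrix on illustrative data; it assumes
# the imports the snippet relies on (DistanceMetric, sklearn.metrics.pairwise as
# `pairwise`, and scipy's pdist/squareform).
import numpy as np

X = np.random.rand(5, 3)
print(calc_distance_matrix(X, 'euclidean').shape)  # (5, 5)
print(calc_distance_matrix(X, 'cosine').shape)     # (5, 5)
print(calc_distance_matrix(X, 'unknown'))          # None for unsupported methods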
def __init__(self, t=100, n=16, contamination=0.1, metric='euclidean', tol=1e-8, verbose=False): super(iNNe, self).__init__() self.t = int(t) self.n = int(n) self.contamination = float(contamination) if metric == 'cosine': self.metric = cosine_similarity else: try: self.metric = DistanceMetric.get_metric(metric) except ValueError as e: raise BaseException(e) self.tol = float(tol) self.verbose = bool(verbose)
def year_lat_lon(self, x, y):
    # Note: this DistanceMetric object is shadowed by the scalar haversine value
    # computed explicitly below.
    haversine = DistanceMetric.get_metric("haversine")
    #try:
    x_year = x['year']
    x_lat = radians(x['artist_latitude'])
    x_lon = radians(x['artist_longitude'])
    y_year = y['year']
    y_lat = radians(y['artist_latitude'])
    y_lon = radians(y['artist_longitude'])
    #except:
    #    raise IOError("Problem parsing features.")
    #    return None

    # Great-circle (haversine) distance in kilometres.
    rad = 6367.44
    haversine = 2 * rad * asin(
        sqrt(
            sin((y_lat - x_lat) / 2) ** 2 +
            cos(x_lat) * cos(y_lat) * sin((y_lon - x_lon) / 2) ** 2))

    # Year difference normalised by the year range of the dataset.
    norm_year = (abs(x_year - y_year)) / ((2010 - 1926) * 2.0)

    # Combine the normalised geographic and year distances.
    dist = (1.0 / (20003 * 2)) * haversine + norm_year
    return dist
def train(num_result_images=25): # Convert 2D image matrix => 1D bottleneck vector print('\n** GENERATING BOTTLENECKS bottlenecks.csv **') # Setup model to convert 2D image matrix => 1D bottleneck vector base_model = Xception(include_top=False, weights='imagenet', input_shape=(img_w_size, img_h_size, 3), pooling='avg') bottlenecks = base_model.predict(images) # TODO: Change this to json np.savetxt("bottleneck.csv", bottlenecks, delimiter=",") print('\n** GENERATED BOTTLENECKS to bottleneck.csv **') bottlenecks = np.loadtxt("bottleneck.csv", delimiter=",") print('\n** GENERATING PAIRWISE pairwise_top_25.json **') dist = DistanceMetric.get_metric('euclidean') # Calculate pairwise distance -- O(n^2) bottleneck_pairwise_dist = dist.pairwise(bottlenecks) # Find the top 100 similar images per image retrieved_images = [] for image_idx in range(0, len(bottleneck_pairwise_dist)): retrieved_indexes = pd.Series( bottleneck_pairwise_dist[image_idx]).sort_values().head( num_result_images).index.tolist() retrieved_indexes_int = list( map(lambda index: int(index), retrieved_indexes)) pairwise_top_25[image_idx] = retrieved_indexes_int with open('pairwise_top_25.json', 'w') as fp: json.dump(pairwise_top_25, fp) print('\n** GENERATED PAIRWISE to pairwise_top_25.json **')
def transform(self, X): """ Compute the topological vector for each persistence diagram individually and concatenate the results. Parameters: X (list of n x 2 numpy arrays): input persistence diagrams. Returns: numpy array with shape (number of diagrams) x (**threshold**): output topological vectors. """ if self.threshold == -1: thresh = np.array([X[i].shape[0] for i in range(len(X))]).max() else: thresh = self.threshold num_diag = len(X) Xfit = np.zeros([num_diag, thresh]) for i in range(num_diag): diagram, num_pts_in_diag = X[i], X[i].shape[0] pers = 0.5 * (diagram[:, 1] - diagram[:, 0]) min_pers = np.minimum(pers, np.transpose(pers)) # Works fine with sklearn 1.0, but an ValueError exception is thrown on past versions try: distances = DistanceMetric.get_metric("chebyshev").pairwise( diagram) except ValueError: # Empty persistence diagram case - https://github.com/GUDHI/gudhi-devel/issues/507 assert len(diagram) == 0 distances = np.empty(shape=[0, 0]) vect = np.flip( np.sort(np.triu(np.minimum(distances, min_pers)), axis=None), 0) dim = min(len(vect), thresh) Xfit[i, :dim] = vect[:dim] return Xfit
def findClusters(self): reassignedClusterPoints = 0 v = np.cov(self.dataFrame.iloc[:, :-1]) #print(v) #print(self.__centroidsAsDataFrame__()) #distance_metric = DistanceMetric.get_metric('euclidean') #dist = distance_metric.pairwise(self.dataFrame.loc[:, :'petal_width'], self.__centroidsAsDataFrame__()) #print(dist) for index, row in self.dataFrame.iterrows(): currentCluster = row['cluster'] nearestCluster = {'name': "", 'distance': None} for cluster in self.clusterLabels: distance_metric = DistanceMetric.get_metric('chebyshev') dist = distance_metric.pairwise([row.iloc[:-1]], [self.centroids[cluster]]) #print(dist) #dist = (row[0] - self.centroids[cluster][0])**2 +\ #(row[1] - self.centroids[cluster][1])**2 +\ #(row[2] - self.centroids[cluster][2])**2 +\ #(row[3] - self.centroids[cluster][3])**2 if not nearestCluster[ 'distance'] or dist < nearestCluster['distance']: nearestCluster['name'] = cluster nearestCluster['distance'] = dist self.dataFrame.at[index, 'cluster'] = nearestCluster['name'] if currentCluster != nearestCluster['name']: reassignedClusterPoints += 1 #print("cluster assignment for row {} changed from {} to {}".format(index, currentCluster, nearestCluster['name'])) self.__computeCentroids__( ) # UPDATE CENTROIDS AFTER REASSIGNMENT OF CLUSTER LABELS return reassignedClusterPoints
def ClusterCV(features, linkage='single', n_folds=10): """ Cluster-cross-validation. linkage - 'single', 'complete', or 'average' """ # Get distance matrix dist = DistanceMetric.get_metric('jaccard') distmatrix = dist.pairwise(features) # Find best number of clusters num_samples = features.shape[0] scores = np.zeros(num_samples) for i in range(n_folds * 2, num_samples): clustering = AgglomerativeClustering(n_clusters=i, affinity='precomputed', linkage=linkage) clustering = clustering.fit(distmatrix) scores[i] = silhouette_score(distmatrix, labels=clustering.labels_, metric='precomputed') max_score = max(scores) n_clusters = np.where(scores == max(scores))[0][0] #print(scores) print("Number of clusters:", n_clusters) # Cluster clustering = AgglomerativeClustering(n_clusters=n_clusters, affinity='precomputed', linkage=linkage) clustering = clustering.fit(distmatrix) # Randomly assign each cluster to one fold cluster_nums = list(range(n_clusters)) shuffle(cluster_nums) fold_assignments = {} for n in range(n_clusters): fold_assignments[cluster_nums[n]] = n % n_folds # Assign samples to folds folds = np.zeros(num_samples) for j in range(num_samples): folds[j] = fold_assignments[clustering.labels_[j]] return folds, max_score
def __init__( self, t=100, # number of ensemble members n=16, # sample for each ensemble member contamination=0.1, # expected proportion of anomalies in the data metric='euclidean', # distance metric to use tol=1e-8, # tolerance verbose=False): super().__init__() # instantiate the parameters self.t = int(t) self.n = int(n) self.contamination = float(contamination) if metric == 'cosine': self.metric = cosine_similarity else: try: self.metric = DistanceMetric.get_metric(metric) except ValueError as e: raise BaseException(e) self.tol = float(tol) self.verbose = bool(verbose)
def load_hdf(self, fname, copydata=None, noisy=False):
    if noisy:
        print('start load_hdf')
    f = h5py.File(fname, "r")
    state = []
    for i in range(4):
        array_data = f["state_%s" % (i)][:]
        state.append(array_data)
        if noisy:
            print("load state_%s" % (i))
            print(array_data)
    if noisy:
        print('done')
    int_array = f["int_values"][:]
    for val in int_array:
        state.append(int(val))  # ensure type int, not numpy.int64
    euc_dist = DistanceMetric.get_metric('euclidean')
    state.append(euc_dist)
    state = tuple(state)
    self.check_state_compatibility(state)
    self.__setstate__(state)
    if noisy:
        print('end load_hdf')
def __init__(self, contamination=0.1, metric='euclidean', tol=1e-10, verbose=False): super().__init__() # contamination if not(0.0 < contamination <= 1.0): raise ValueError(contamination, 'is not a float in (0.0, 1.0]') self.c = float(contamination) # distance metric try: self.metric = DistanceMetric.get_metric(metric) except: raise ValueError(metric, 'is not an accepted distance metric') self.tol = float(tol) self.verbose = bool(verbose) # internal self.derived_squashed_ = False
def knn_mahalanobis(x_train, x_test, y_train, y_test, k=50):
    # Note: despite the function name, the distance metric used here is Manhattan.
    dist = DistanceMetric.get_metric('manhattan')
    x_train_temp = x_train.reset_index(drop=True)
    x_dev_temp = x_test.reset_index(drop=True)
    y_train_temp = y_train.reset_index(drop=True)
    y_dev_temp = y_test.reset_index(drop=True)

    # Training-set RMSE of the k-NN regression.
    preds_train = np.zeros((y_train_temp.shape[0], 1))
    similarity_dist = dist.pairwise(x_train_temp, x_train_temp)
    for i in range(similarity_dist.shape[1]):
        min_ind_dev = similarity_dist[:, i].argsort()[:int(k)]
        preds_train[i] = np.mean(y_train_temp[y_train_temp.index.isin(min_ind_dev)])
    mse_train_reg = math.sqrt(mean_squared_error(y_train_temp, preds_train))

    # Dev/test-set RMSE of the k-NN regression.
    preds_dev = np.zeros((y_dev_temp.shape[0], 1))
    similarity_dist = dist.pairwise(x_train_temp, x_dev_temp)
    for i in range(similarity_dist.shape[1]):
        min_ind_dev = similarity_dist[:, i].argsort()[:int(k)]
        preds_dev[i] = np.mean(y_train_temp[y_train_temp.index.isin(min_ind_dev)])
    mse_dev_reg = math.sqrt(mean_squared_error(y_dev_temp, preds_dev))

    return mse_train_reg, mse_dev_reg
def mi_Kraskov(x,y,k=5,base=np.exp(1),intens=1e-10,metric="minkowski",p=np.float64('inf')): '''The mutual information estimator by Kraskov et al. Inputs are 2D arrays, with each column being a dimension and each row being a data point ''' assert len(x)==len(y), "Lists should have same length" assert k <= len(x) - 1, "Set k smaller than num. samples - 1" x += intens*nr.rand(x.shape[0],x.shape[1]) y += intens*nr.rand(x.shape[0],x.shape[1]) points = np.hstack((x,y)) #Find nearest neighbors in joint space, p=inf means max-norm tree = KDTree(points, metric=DistanceMetric.get_metric(metric,p=p)) try: dvec = tree.query(points,k+1)[0][:,k] # no need to reshape with new query_radius method except ValueError: return (float("NaN")) a = MI.avgdigamma(x,dvec*x.shape[1]/points.shape[1],metric=metric,p=p) b = MI.avgdigamma(y,dvec*y.shape[1]/points.shape[1],metric=metric,p=p) c = digamma(k) d = digamma(len(x)) # print("ee_acc: %s, %s, %s, %s" %( a,b,c,d)) return (-a-b+c+d)/np.log(base)
def mahalanobis(X,y,nr_bins,mode='continuous'): """ Estimate the mahalanobis distance between the average stimulus pattern defined by classes in y across features in X. For every time point """ stim_mat,y = gen_RSA_matrix(y,nr_bins,mode=mode) evidence_RSA = np.zeros(X.shape[2])*np.nan matrix = np.zeros((nr_bins,nr_bins,X.shape[2]))*np.nan for tp in range(X.shape[2]): X_in = X[:,:,tp] #pca pca = PCA(n_components=.95) X_in = pca.fit(X_in).transform(X_in) #estimate covariance emp_cov = EmpiricalCovariance().fit(X_in) maha = DistanceMetric.get_metric('mahalanobis',VI=emp_cov.covariance_) #scale data scaler = StandardScaler().fit(X_in) X_s = scaler.transform(X_in) X_stim = np.zeros((nr_bins,X_s.shape[1])) for i, stim in enumerate(np.unique(y)): X_stim[i,:] = X_s[(y==stim) ,:].mean(0).T matrix[:,:,tp] = maha.pairwise(X_stim) evidence_RSA[tp] = np.mean(np.mean(np.multiply(matrix[:,:,tp],stim_mat))) return evidence_RSA, matrix
def __main__(): data = load_data('pd_speech_features.csv') # params = {'max_depth': 2, 'eta': 0.1, 'n_estimators': 50, 'booster': 'dart', # 'gamma': 0, 'reg_lambda': 0.05, 'objective': 'binary:logistic'} features = [ "gender", "baselineFeats", "intensityFeats", "formantFeats", "bandwidthFeats", "vocalFeats", "mfccFeats", "waveletFeats", "tqwtFeats" ] X = convert_data(data, features) params = { "hid_layers": [(280, "relu"), (1, "sigmoid")], "compile": { "loss": 'binary_crossentropy', "optimizer": tf.keras.optimizers.Adam(lr=0.001), "metrics": ["accuracy"] }, "epochs": 10, "batch_size": None } # scores = run_model("mlp", X.values, data['label'], **params) # params = {'hid_dim': 150, 'func': 'sigm', 'init': {'norm': 0.05}, # 'train_args': ['OP', 'c'], 'train_kwargs': {'kmax': 100}} # # scores = run_model("elm", X, data['label'], **params) params = {"kernel": "poly", "degree": 20} # scores = run_model("svm", X, data['label'], **params) params = { "n_neighbors": 1, "metric": DistanceMetric.get_metric("manhattan") } scores = run_model("knn", X, data['label'], **params) print_results(scores, features, params)
def linearRect(loc, norm): indarr = np.arange(loc.shape[0]) flag = np.zeros(loc.shape[0], dtype=int) dist = dm.get_metric("euclidean") dist_matrix = dist.pairwise(loc) nbrs = NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(loc) nndistances, nnindices = nbrs.kneighbors(loc) # dist_sorted_ind = np.argsort(dist_matrix, axis=1)[:,1:] flag_closest_rected = np.ones_like(flag, dtype=int) * -1 flag_closest_rected_dist = np.ones_like(flag, dtype=float) * 10 ind = np.argmax(np.linalg.norm(loc, axis=1)) right_drct = loc[ind] / np.linalg.norm(loc[ind]) for i in range(loc.shape[0]): if i != 0: unmarkind = np.argmin(flag_closest_rected_dist[flag<1]) ind = indarr[flag<1][unmarkind] right_drct = get_right_drct(norm, ind, nnindices[ind], nndistances[ind], flag) # print(ind, flag_closest_rected[ind], "flag_closest_rected_dist[ind]", flag_closest_rected_dist[ind]) norm[ind] = rectify(norm[ind], right_drct) flag[ind] = 1 flag_closest_rected_dist_new = np.minimum(dist_matrix[:, ind],flag_closest_rected_dist) flag_closest_rected = np.where(flag_closest_rected_dist_new<flag_closest_rected_dist, ind, flag_closest_rected) flag_closest_rected_dist = flag_closest_rected_dist_new return norm
def test_mahattan_distance(): a = np.random.randn(4) b = np.random.randn(4) c = np.random.randn(4) d = np.random.randn(4) ground_truth = np.random.randn(4) m = ManhattanDistance() manhattan = DistanceMetric.get_metric("manhattan") m.update((torch.from_numpy(a), torch.from_numpy(ground_truth))) np_sum = np.abs(ground_truth - a).sum() assert m.compute() == pytest.approx(np_sum) assert manhattan.pairwise([a, ground_truth])[0][1] == pytest.approx(np_sum) m.update((torch.from_numpy(b), torch.from_numpy(ground_truth))) np_sum += np.abs(ground_truth - b).sum() assert m.compute() == pytest.approx(np_sum) v1 = np.hstack([a, b]) v2 = np.hstack([ground_truth, ground_truth]) assert manhattan.pairwise([v1, v2])[0][1] == pytest.approx(np_sum) m.update((torch.from_numpy(c), torch.from_numpy(ground_truth))) np_sum += np.abs(ground_truth - c).sum() assert m.compute() == pytest.approx(np_sum) v1 = np.hstack([v1, c]) v2 = np.hstack([v2, ground_truth]) assert manhattan.pairwise([v1, v2])[0][1] == pytest.approx(np_sum) m.update((torch.from_numpy(d), torch.from_numpy(ground_truth))) np_sum += np.abs(ground_truth - d).sum() assert m.compute() == pytest.approx(np_sum) v1 = np.hstack([v1, d]) v2 = np.hstack([v2, ground_truth]) assert manhattan.pairwise([v1, v2])[0][1] == pytest.approx(np_sum)
def _plot_kmeans_distortions(df_ct_stats, path_to_figure=None): clusters = np.arange(1, 21) max_stat = df_ct_stats.loc['gl_max', :].dropna() max_stat = max_stat.values[:, np.newaxis] euc_dist = DistanceMetric.get_metric('euclidean') dist_mat = euc_dist.pairwise(max_stat) distortions = [] for cluster in clusters: model = KMeans(n_clusters=cluster, random_state=0).fit(dist_mat) distortions.append( sum( np.min(cdist(dist_mat, model.cluster_centers_, 'euclidean'), axis=1)) / dist_mat.shape[0]) plt.figure() plt.plot(clusters, distortions, marker='o', color='yellow') plt.xlabel('Number of Clusters') plt.ylabel('Distortion') #plt.xlim([0.01, 20.01]) x_coords = np.linspace(1, np.size(clusters), 21, dtype=int) y_coords = np.linspace(min(distortions), max(distortions), 6) y_ticks = _ticks(y_coords) plt.xticks(x_coords, x_coords) plt.yticks(y_coords, y_ticks) if path_to_figure is not None: plt.savefig( path_to_figure, bbox_inches='tight', dpi=CONFIG.DPI, )
def load_model(self): """ Load a btree index from the configured cache element. This only occurs if there is a cache element configured and there are bytes there to read. """ with self._model_lock: if self.cache_element and not self.cache_element.is_empty(): self._log.debug("Loading model from cache: %s", self.cache_element) buff = BytesIO(self.cache_element.get_bytes()) # noinspection PyTypeChecker with np.load(buff, allow_pickle=True) as cache: tail = tuple(cache['tail']) s = [cache['data_arr'], cache['idx_array_arr'], cache['node_data_arr'], cache['node_bounds_arr']] s.extend(tail) s[11] = DistanceMetric.get_metric('hamming') s = tuple(s) # noinspection PyTypeChecker #: :type: sklearn.neighbors.BallTree self.bt = BallTree.__new__(BallTree) self.bt.__setstate__(s) self._log.debug("Loading mode: Done")
def transform(self, X): if self.threshold_ == -1: thresh = np.array([X[i].shape[0] for i in range(len(X))]).max() else: thresh = self.threshold_ num_diag = len(X) Xfit = np.zeros([num_diag, thresh]) for i in range(num_diag): diagram, num_pts_in_diag = X[i], X[i].shape[0] pers = 0.5 * np.matmul(diagram, np.array([[-1.0], [1.0]])) min_pers = np.minimum(pers, np.transpose(pers)) distances = DistanceMetric.get_metric("chebyshev").pairwise( diagram) vect = np.flip( np.sort(np.triu(np.minimum(distances, min_pers)), axis=None), 0) dim = min(len(vect), thresh) Xfit[i, :dim] = vect[:dim] return Xfit
def test_compute(): a = np.random.randn(4) b = np.random.randn(4) c = np.random.randn(4) d = np.random.randn(4) ground_truth = np.random.randn(4) m = CanberraMetric() canberra = DistanceMetric.get_metric("canberra") m.update((torch.from_numpy(a), torch.from_numpy(ground_truth))) np_sum = (np.abs(ground_truth - a) / (np.abs(a) + np.abs(ground_truth))).sum() assert m.compute() == pytest.approx(np_sum) assert canberra.pairwise([a, ground_truth])[0][1] == pytest.approx(np_sum) m.update((torch.from_numpy(b), torch.from_numpy(ground_truth))) np_sum += ((np.abs(ground_truth - b)) / (np.abs(b) + np.abs(ground_truth))).sum() assert m.compute() == pytest.approx(np_sum) v1 = np.hstack([a, b]) v2 = np.hstack([ground_truth, ground_truth]) assert canberra.pairwise([v1, v2])[0][1] == pytest.approx(np_sum) m.update((torch.from_numpy(c), torch.from_numpy(ground_truth))) np_sum += ((np.abs(ground_truth - c)) / (np.abs(c) + np.abs(ground_truth))).sum() assert m.compute() == pytest.approx(np_sum) v1 = np.hstack([v1, c]) v2 = np.hstack([v2, ground_truth]) assert canberra.pairwise([v1, v2])[0][1] == pytest.approx(np_sum) m.update((torch.from_numpy(d), torch.from_numpy(ground_truth))) np_sum += (np.abs(ground_truth - d) / (np.abs(d) + np.abs(ground_truth))).sum() assert m.compute() == pytest.approx(np_sum) v1 = np.hstack([v1, d]) v2 = np.hstack([v2, ground_truth]) assert canberra.pairwise([v1, v2])[0][1] == pytest.approx(np_sum)
time.sleep(10.0)
if (init == 0):
    np.save('maternal_fetal_feature_vectors1k', maternal_fetal_feature_vectors, allow_pickle=False)
    np.save('maternal_feature_vectors1k', maternal_feature_vectors, allow_pickle=False)
    np.save('linear_regression_coefs1k', linear_regression_coefs, allow_pickle=False)
    np.save('linear_regression_intercepts1k', linear_regression_intercepts, allow_pickle=False)
    # figz.data = []
if ((n_svrs % 25) == 0):
    print(['n_svrs: ' + str(n_svrs)])
    # Get histogram of token-token distances for clustering:
    # dist = DistanceMetric.get_metric('manhattan')
    # (this pairwise result is overwritten by the pdist call below)
    token_dists = dist.pairwise(maternal_fetal_feature_vectors[0:200, :])
    # token_dists = distance_matrix(maternal_fetal_feature_vectors, maternal_fetal_feature_vectors, p=1, threshold=100000000)
    # token_dists = distance.cdist(maternal_fetal_feature_vectors[0:5,:], maternal_fetal_feature_vectors[0:5,:], metric='cityblock')
    token_dists = distance.pdist(maternal_fetal_feature_vectors, metric='cityblock')
    # token_dist_hist = np.histogram(token_dists, bins=1000)
    # token_dist_hist_idxs = np.arange(len(token_dist_hist))
    token_dists_sorted = np.sort(token_dists)
    token_dist_idxs = np.arange(len(token_dists_sorted))
    fig = make_subplots(rows=1, cols=1)
    fig.append_trace(go.Scatter(x=token_dist_idxs, y=token_dists_sorted), row=1, col=1)
    fig.show()
[0,0,1,1,0,1,0,1,1,0,1,1,0,1,1,0,0,1,1,1,1,1], [1,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0], [1,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0], [1,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0], [1,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0], [0,0,1,0,0,1,0,1,0,1,1,0,0,0,0,0,0,1,1,1,1,0], [1,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ] AttributeClassifier = joblib.load('Dumps/AttributeClassifierKnowledgeTransfer.pkl') features = scipy.io.loadmat("./UIUC1/UIUC1_win_feature.mat") labels = scipy.io.loadmat("./UIUC1/UIUC1_labels.mat") action_actor = open("./UIUC1/action_actor.txt") dist = DistanceMetric.get_metric('euclidean') mapping = [{}] for line in action_actor: line = line.split() actionvector = numpy.zeros(14, dtype=numpy.int) actionvector[int(line[0])]=1 mapping.append({'action':int(line[0]),'actionvector':actionvector, 'actor':int(line[1])}) total = len(labels['vlabels'][0]) ConfusionMatrix=numpy.array([[0,0],[0,0]]) NovelClassList=[[0,1],[2,3],[4,5],[6,7],[8,9],[10,11],[12,13]] for NovelClass in NovelClassList: ConfusionMatrix2=numpy.array([[0,0],[0,0]])
def brute_force_neighbors(X, Y, k, metric, **kwargs):
    # Pairwise distances from each row of Y to each row of X, shape (n_Y, n_X).
    D = DistanceMetric.get_metric(metric, **kwargs).pairwise(Y, X)
    ind = np.argsort(D, axis=1)[:, :k]                 # indices of the k nearest rows of X
    dist = D[np.arange(Y.shape[0])[:, None], ind]      # corresponding distances
    return dist, ind
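# A hedged sanity check for brute_force_neighbors: on random data its output
# should agree with sklearn's NearestNeighbors under the same metric (indices may
# differ only where distances are tied).
import numpy as np
from sklearn.neighbors import NearestNeighbors

rng = np.random.RandomState(0)
X = rng.rand(20, 3)
Y = rng.rand(4, 3)
dist, ind = brute_force_neighbors(X, Y, k=3, metric="euclidean")
nn = NearestNeighbors(n_neighbors=3, metric="euclidean").fit(X)
ref_dist, ref_ind = nn.kneighbors(Y)
print(np.allclose(dist, ref_dist), np.array_equal(ind, ref_ind))  # True True (up to ties)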
def haversine_distance(p1, p2):
    d = DistanceMetric.get_metric('haversine')
    X = [p1, p2]
    return d.pairwise(X)[0][1]
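# Usage note (hedged): sklearn's 'haversine' metric expects (latitude, longitude)
# pairs in radians and returns the central angle, so multiply by the Earth's
# radius (roughly 6371 km) to obtain a distance in kilometres. The coordinates
# below are illustrative.
import numpy as np

paris = np.radians([48.8566, 2.3522])
london = np.radians([51.5074, -0.1278])
print(haversine_distance(paris, london) * 6371)  # roughly 340-345 km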
from scipy.spatial import distance
import numpy as np

distance.euclidean([1, 0, 0], [0, 1, 0])
distance.euclidean([20, 25], [25, 22])        # closest: S1 with S2
np.sqrt(((20 - 25)**2 + (25 - 22)**2))        # sqrt(sum((x - y)^2))
distance.euclidean([20, 25], [35, 40])
distance.euclidean([20, 25], [40, 35])
distance.euclidean([35, 40], [40, 35])

# Distances between all points in the DataFrame
from sklearn.neighbors import DistanceMetric
dist = DistanceMetric.get_metric('euclidean')
dist
df.to_numpy()
dist.pairwise(df.to_numpy())

# K-means clustering
df
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=2).fit(df)
centroids = kmeans.cluster_centers_
print(centroids)
df
plt.scatter(df['math'], df['science'], c=kmeans.labels_.astype(float),
def dist(X_1, X_2, param='euclidean'):
    dist = DistanceMetric.get_metric(param)
    X = [X_1, X_2]
    return dist.pairwise(X)[0, 1]
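# A tiny usage sketch for the dist() helper above, with hypothetical points; any
# metric name accepted by DistanceMetric.get_metric that needs no extra
# parameters works.
print(dist([0, 0], [3, 4]))                # 5.0 (Euclidean)
print(dist([0, 0], [3, 4], 'manhattan'))   # 7.0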
def mean_distance_to_closest(predicted, event):
    angle = DistanceMetric.get_metric("pyfunc", func=spherical_angle)
    nn = BallTree(event.tracks, leaf_size=5, metric=angle)
    return np.sum([nn.query(predicted[i, :], k=1)
                   for i in range(predicted.shape[0])]) / event.tracks.shape[0]
## KNN PREDICTOR ##
# do some lambda magic on text columns
traindata = list(train.apply(lambda x: '%s %s %s' % (x['query'], x['product_title'], x['product_description']), axis=1))
testdata = list(test.apply(lambda x: '%s %s %s' % (x['query'], x['product_title'], x['product_description']), axis=1))

# Fit TFIDF
tfv.fit(traindata)
X = tfv.transform(traindata)
X_test = tfv.transform(testdata)

clf = pipeline.Pipeline([('tSVD', tSVD), ('scl', scl), ('knn', knn)])
param_grid = {'knn__n_neighbors': [2],
              'knn__metric': [DistanceMetric.get_metric('manhattan')],
              'tSVD__n_components': [400]}
model = grid_search.GridSearchCV(estimator=clf, param_grid=param_grid,
                                 scoring=kappa_scorer, refit=True, cv=2, n_jobs=-1)

# Fit Model
model.fit(X, y)
model.best_estimator_.fit(X, y)
trainPred = model.best_estimator_.predict(X_test)

# Averaging predicted relevance values
finalPred = [int(floor((int(stemPred[i]) + trainPred[i]) * 0.5)) for i in range(len(stemPred))]
#print "Kappa Score for Training Data\nStemming+KNN\nScore=%f" %(quadratic_weighted_kappa(y, finalPred))