def gower_distance(X):
    """Return the pairwise Gower distance between the rows of a data frame.

    Each column of ``X`` is one feature. A distance matrix is computed per
    feature and the matrices are averaged.

    * Columns of ``object`` dtype are treated as nominal: they are one-hot
      encoded with ``pd.get_dummies`` and compared with the Dice distance
      (https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient).
    * All other columns are treated as numeric: Manhattan distance
      (https://en.wikipedia.org/wiki/Taxicab_geometry) divided by the range
      of the column.

    Parameters
    ----------
    X : pandas.DataFrame
        Features along the columns, observations along the rows.

    Returns
    -------
    numpy.ndarray
        Square matrix holding the mean per-feature distance for each pair of
        rows.
    """
    individual_variable_distances = []
    for i in range(X.shape[1]):
        feature = X.iloc[:, [i]]
        # ``np.object`` was removed from NumPy; the builtin ``object`` is the
        # same dtype. ``.iloc[0]`` avoids label-based lookup on the dtypes.
        if feature.dtypes.iloc[0] == object:
            feature_dist = DistanceMetric.get_metric('dice').pairwise(
                pd.get_dummies(feature))
        else:
            feature_dist = DistanceMetric.get_metric('manhattan').pairwise(feature)
            value_range = np.ptp(feature.values)
            # A constant column has range 0 and all-zero distances; dividing
            # would turn them into NaN, so leave the zeros untouched.
            if value_range != 0:
                feature_dist = feature_dist / value_range
        individual_variable_distances.append(feature_dist)
    return np.array(individual_variable_distances).mean(0)
def calc_mahalanobis(x, y, n_neighbors):
    """Find the nearest neighbours of ``y`` in ``x`` under a Mahalanobis metric.

    Parameters
    ----------
    x : array-like
        Reference points, one sample per row (it is passed to ``fit``).
    y : array-like
        Query points, one sample per row.
    n_neighbors : int
        Number of neighbours to return per query point.

    Returns
    -------
    Whatever ``NearestNeighbors.kneighbors`` returns for ``y``.
    """
    from sklearn.neighbors import NearestNeighbors

    # Rows of ``x`` are samples, so the covariance must be taken over columns
    # (rowvar=False); the default would yield an n_samples x n_samples matrix.
    covariance = np.cov(x, rowvar=False)
    nn = NearestNeighbors(n_neighbors=n_neighbors,
                          algorithm='brute',
                          metric='mahalanobis',
                          metric_params={'V': covariance})
    return nn.fit(x).kneighbors(y)
def mahalonobis(X):
    """Build a two-point distance function from the covariance of ``X``.

    A Mahalanobis metric (``V`` = column covariance of ``X``) is used when
    ``X`` has more than one row; otherwise, or when building the metric
    raises ``LinAlgError``, a Euclidean metric is used instead.

    Returns a callable ``distance(x, y)`` giving the scalar distance between
    two vectors.
    """
    cov = np.cov(X, rowvar=0)
    if X.shape[0] > 1:
        try:
            metric = DistanceMetric.get_metric('mahalanobis', V=cov)
        except LinAlgError:
            metric = DistanceMetric.get_metric('euclidean')
    else:
        metric = DistanceMetric.get_metric('euclidean')

    def distance(x, y):
        # pairwise() yields a 1x1 matrix for a single pair of points.
        pair_matrix = metric.pairwise([x], [y])
        return pair_matrix[0][0]

    return distance
def euclid(_):
    """Return a two-point Euclidean distance function; the argument is ignored."""
    euclidean_metric = DistanceMetric.get_metric('euclidean')

    def distance(x, y):
        # pairwise() yields a 1x1 matrix for a single pair of points.
        pair_matrix = euclidean_metric.pairwise([x], [y])
        return pair_matrix[0][0]

    return distance
def dist(X, Y, distance_function="euclidean", **metric_params):
    """Calculate the distance matrix between the samples of X and Y.

    [Args]
    ------
    X : m samples
    Y : n samples
    distance_function : metric name (or anything else accepted by
        ``DistanceMetric.get_metric``). Default = "euclidean"
    **metric_params : extra keyword arguments forwarded to
        ``DistanceMetric.get_metric``; needed by the parametrised metrics
        listed below (e.g. ``p``, ``w``, ``V``).

    [Returns]
    ---------
    distance_matrix : result of ``pairwise(X, Y)``, one row per sample of X
        and one column per sample of Y.

    Built-in metric names:

        "euclidean"    EuclideanDistance    sqrt(sum((x - y)^2))
        "manhattan"    ManhattanDistance    sum(|x - y|)
        "chebyshev"    ChebyshevDistance    max(|x - y|)
        "minkowski"    MinkowskiDistance    sum(|x - y|^p)^(1/p)
        "wminkowski"   WMinkowskiDistance   sum(w * |x - y|^p)^(1/p)
        "seuclidean"   SEuclideanDistance   sqrt(sum((x - y)^2 / V))
        "mahalanobis"  MahalanobisDistance  sqrt((x - y)' V^-1 (x - y))
    """
    distance_calculator = DistanceMetric.get_metric(distance_function,
                                                    **metric_params)
    return distance_calculator.pairwise(X, Y)
def get_full_metric(self, train_pairs):
    """Fit PCA on the training pairs and learn a Mahalanobis metric on them.

    The pairs are flattened, projected with a PCA of ``self.pca_components``
    components and re-paired. The re-paired data is handed to the external
    CVX helpers (``save_cvx_params`` / ``run_cvx`` / ``load_cvx_result``),
    whose result ``M`` is used as the ``VI`` matrix of the metric.

    Returns
    -------
    (dist, M, pca) : the Mahalanobis ``DistanceMetric``, the matrix loaded
        from the CVX result, and the fitted PCA object.
    """
    train_pairs_flat = [item for subtuple in train_pairs for item in subtuple]
    pca = PCA(n_components=self.pca_components)
    pca.fit(train_pairs_flat)
    train_pairs_pca_flat = pca.transform(train_pairs_flat)

    # Consecutive rows (2i, 2i + 1) form one pair again. ``range`` replaces
    # the Python-2-only ``xrange``.
    train_pairs_pca = [
        (train_pairs_pca_flat[i], train_pairs_pca_flat[i + 1])
        for i in range(0, len(train_pairs_pca_flat), 2)
    ]

    ys = ys_from_pairs(train_pairs_pca)
    # Random digit string used to name the files exchanged with CVX.
    file_id = str(random.random())[2:]
    save_cvx_params(ys, file_id)
    run_cvx(file_id)
    M = load_cvx_result(file_id)
    dist = DistanceMetric.get_metric('mahalanobis', VI=M)
    return dist, M, pca
def run_single_trial(self, train_pairs, test_pairs, train_tune_data, test_tune_data):
    """Run one train/test trial of metric learning and nearest-neighbour matching.

    The pairs are projected with ``self.fit_pca``, a Mahalanobis metric is
    learned through the external CVX helpers, and BallTrees built with that
    metric are queried to see how often the i-th given section retrieves the
    i-th section to match within the top ``self.correct_within_top_fraction``
    of results. Test tunes whose match is ranked first are written to
    ``successful_tunes_<file_id>``.

    ``train_tune_data`` is accepted but not used by this method.

    Returns
    -------
    ([[train_correct, n_train], [test_correct, n_test]], test_indices)
        ``test_indices`` holds, per test query, the rank at which its own
        index appears in the full neighbour list.
    """
    print("Running PCA...")
    train_pairs_pca, test_pairs_pca = self.fit_pca(train_pairs, test_pairs)

    ys = ys_from_pairs(train_pairs_pca)
    # Random digit string used to name the CVX exchange files and the output file.
    file_id = str(random.random())[2:]
    save_cvx_params(ys, file_id)
    run_cvx(file_id)
    M = load_cvx_result(file_id)
    dist = DistanceMetric.get_metric('mahalanobis', VI=M)

    train_a_sections = [x[0] for x in train_pairs_pca]
    train_b_sections = [x[1] for x in train_pairs_pca]
    test_a_sections = [x[0] for x in test_pairs_pca]
    test_b_sections = [x[1] for x in test_pairs_pca]

    train_given_sections = train_a_sections
    train_to_match_sections = train_b_sections
    test_given_sections = test_a_sections
    test_to_match_sections = test_b_sections
    if self.match_a_to_b:
        train_given_sections = train_b_sections
        train_to_match_sections = train_a_sections
        test_given_sections = test_b_sections
        test_to_match_sections = test_a_sections

    print("Constructing BallTrees...")
    train_bt = BallTree(train_to_match_sections, metric=dist)
    test_bt = BallTree(test_to_match_sections, metric=dist)

    train_top_fraction = int(len(train_given_sections) * self.correct_within_top_fraction)
    test_top_fraction = int(len(test_given_sections) * self.correct_within_top_fraction)

    print("Querying the BallTrees...")
    train_result = train_bt.query(train_given_sections, train_top_fraction)
    test_result = test_bt.query(test_given_sections, test_top_fraction)

    print("Looking at correctness of results...")
    # query() returns (distances, indices); a query is correct when its own
    # index is among the returned neighbour indices.
    train_correct = sum(int(i in train_result[1][i])
                        for i in range(len(train_given_sections)))
    test_correct = sum(int(i in test_result[1][i])
                       for i in range(len(test_given_sections)))

    print("Finding indices of correct matches...")
    test_result_full = test_bt.query(test_given_sections, len(test_given_sections))

    def default_index(lst, i):
        # Rank of ``i`` in ``lst``, or -1 when it is absent.
        try:
            return lst.index(i)
        except ValueError:
            return -1

    test_indices = [default_index(list(test_result_full[1][i]), i)
                    for i in range(len(test_given_sections))]
    test_indices = [x for x in test_indices if x != -1]

    with open("successful_tunes_{}".format(file_id), 'w') as successful_tunes_f:
        for i, index in enumerate(test_indices):
            if index == 0:
                successful_tunes_f.write(str(test_tune_data[i]) + '\n\n')

    return [[train_correct, len(train_given_sections)],
            [test_correct, len(test_given_sections)]], test_indices
def standardizedEulideanDistance(wide, p):
    """
    Calculate the standardized Euclidean distance and return an array of
    distances to the center and a matrix of pairwise distances.

    :Arguments:
        :type wide: pandas.DataFrame
        :param wide: A wide formatted data frame with samples as columns and
            compounds as rows.

        :type p: float
        :param p: Probability at which the cutoffs are evaluated (used as
            ``p * 100`` for the simulated percentile and as the argument of
            the ``ppf`` calls).

    :Returns:
        :return: ``(SEDtoCenter, cutoff1, SEDpairwise, cutoff2)`` - SED of
            every sample to the center, cutoffs for it, pairwise SED between
            samples (diagonal set to NaN), cutoffs for it.
        :rtype: pd.DataFrames
    """
    # Estimated variance from the data; zero variances are replaced by 1 so
    # that they do not cause a division by zero inside the metric.
    varHat = wide.var(axis=1, ddof=1)
    varHat[varHat == 0] = 1
    dist = DistanceMetric.get_metric('seuclidean', V=varHat)

    # Mean of every compound (row) across samples: the "center".
    colMean = wide.mean(axis=1)

    # Standardized Euclidean distance from all samples to the center.
    SEDtoCenter = dist.pairwise(wide.values.T, pd.DataFrame(colMean).T)
    SEDtoCenter = pd.DataFrame(SEDtoCenter, columns=['SED_to_Center'],
                               index=wide.columns)

    # Pairwise standardized Euclidean distance of all samples.
    SEDpairwise = dist.pairwise(wide.values.T)
    SEDpairwise = pd.DataFrame(SEDpairwise, columns=wide.columns,
                               index=wide.columns)
    # A sample's distance to itself carries no information; blank it out.
    for sample in SEDpairwise.index:
        SEDpairwise.loc[sample, sample] = np.nan

    # Calculate cutoffs
    # For SEDtoCenter:
    #   Beta:   sqrt((p-1)^2/p*(sum of n iid Beta(1/2, p/2))); (It's the exact distribution.)
    #   Normal: sqrt(N((p-1)/p*n, 2*(p-2)*(p-1)^2/p^2/(p+1)*n)); (It's normal approximation. Works well when n is large.)
    #   Chisq:  sqrt((p-1)/p*Chi-sq(n)); (It's Chi-sq approximation. Works well when p is decent and p/n is not small.)
    # For SEDpairwise:
    #   Beta:   sqrt(2*(p-1)*(sum of n iid Beta(1/2, p/2)));
    #   Normal: sqrt(N(2*n, 8*(p-2)/(p+1)*n));
    #   Chisq:  sqrt(2*Chi-sq(n));
    # where n = # of compounds and p = # of samples
    # NOTE(review): the formulas above say Beta(1/2, p/2) while the code draws
    # from Beta(1/2, (p-2)/2) - confirm which one is intended.
    pSamples = float(wide.shape[1])
    # Kept as an int: it is used as an array dimension below, and float
    # sizes are rejected by current NumPy.
    nFeatures = int(wide.shape[0])
    nIterate = 20000

    # Monte-Carlo estimate of the percentile of a sum of nFeatures Beta draws.
    betaDraws = stats.beta.rvs(0.5, 0.5 * (pSamples - 2),
                               size=(nIterate, nFeatures))
    betaP = np.percentile(betaDraws.sum(axis=1), p * 100)

    betaCut1 = np.sqrt((pSamples - 1) ** 2 / pSamples * betaP)
    normCut1 = np.sqrt(stats.norm.ppf(
        p,
        (pSamples - 1) / pSamples * nFeatures,
        np.sqrt(2 * nFeatures * (pSamples - 2) * (pSamples - 1) ** 2
                / pSamples ** 2 / (pSamples + 1))))
    chisqCut1 = np.sqrt((pSamples - 1) / pSamples * stats.chi2.ppf(p, nFeatures))

    betaCut2 = np.sqrt((pSamples - 1) * 2 * betaP)
    normCut2 = np.sqrt(stats.norm.ppf(
        p,
        2 * nFeatures,
        np.sqrt(8 * nFeatures * (pSamples - 2) / (pSamples + 1))))
    chisqCut2 = np.sqrt(2 * stats.chi2.ppf(p, nFeatures))

    cutoff1 = pd.DataFrame([[betaCut1, normCut1, chisqCut1]],
                           columns=['Beta(Exact)', 'Normal', 'Chi-sq'])
    cutoff2 = pd.DataFrame([[betaCut2, normCut2, chisqCut2]],
                           columns=['Beta(Exact)', 'Normal', 'Chi-sq'])

    # TODO: Create a flag based on values greater than one of the cutoffs.
    return SEDtoCenter, cutoff1, SEDpairwise, cutoff2
def example2():
    """Demonstrate ``DistanceMetric`` with a user-defined distance callable."""
    from HSH.Misc.shgeo import dist

    def earthdist(x, y):
        # Forward the first two components of each point (latitude,
        # longitude) to the project's earth-surface distance helper.
        return dist((x[0], x[1]), (y[0], y[1]))

    train = np.array([[32.5, 101.0], [32.5, 102.0]])
    test = np.array([[31.5, 101.0], [39.5, 101.0]])

    dist_cal = DistanceMetric.get_metric(earthdist)
    print(dist_cal.pairwise(train, test))
def distance(X, distance_measure='euclidean'):
    """Return a square matrix comparing the rows of ``X`` with each other.

    Parameters
    ----------
    X : array-like
        Converted with ``np.array``; one observation per row.
    distance_measure : str
        A name listed in ``SKLEARN_METRICS`` (pairwise distances through
        ``DistanceMetric``) or ``'pearson'`` (``np.corrcoef`` of the rows -
        note this is a correlation matrix, not a distance).

    Returns
    -------
    numpy.ndarray or None
        ``None`` when ``distance_measure`` is not recognised.
    """
    X = np.array(X)
    if distance_measure in SKLEARN_METRICS:
        return DistanceMetric.get_metric(distance_measure).pairwise(X)
    # Compare by value: ``is`` only worked when the string happened to be
    # interned.
    if distance_measure == 'pearson':
        return np.corrcoef(X)
    return None
def load_model(self):
    """Restore ``self.bt`` (a BallTree) from the cache file, if one exists.

    Nothing happens when ``self.file_cache`` is unset or is not an existing
    file. Otherwise the arrays stored in the cache are combined with a
    Hamming ``DistanceMetric`` into the state tuple handed to
    ``BallTree.__setstate__``.
    """
    if not (self.file_cache and os.path.isfile(self.file_cache)):
        return

    self._log.debug("Loading mode: %s", self.file_cache)
    with numpy.load(self.file_cache) as cache:
        tail = tuple(cache['tail'])
        arrays = (cache['data_arr'],
                  cache['idx_array_arr'],
                  cache['node_data_arr'],
                  cache['node_bounds_arr'])
        # The metric object is not stored in the cache; it is appended as
        # the last element of the state.
        state = arrays + tail + (DistanceMetric.get_metric('hamming'),)
        #: :type: sklearn.neighbors.BallTree
        self.bt = BallTree.__new__(BallTree)
        self.bt.__setstate__(state)
    self._log.debug("Loading mode: Done")
def example1():
    """Demonstrate Euclidean pairwise distances and the reduced-distance round trip."""
    train = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
    test = np.array([[0.5, 0.5], [-0.5, -0.5]])

    dist = DistanceMetric.get_metric("euclidean")

    # Plain distance matrix between train and test points.
    distance_matrix = dist.pairwise(train, test)
    print(distance_matrix)

    # Reduced distance; for euclidean, it's the squared distance_matrix.
    reduced_distance_matrix = dist.dist_to_rdist(distance_matrix)
    print(reduced_distance_matrix)

    # Converting back from the reduced form.
    print(dist.rdist_to_dist(reduced_distance_matrix))
def entropy(x, k=3, base=np.exp(1), intens=1e-10):
    """The classic K-L k-nearest neighbor continuous entropy estimator.

    Parameters
    ----------
    x : list of vectors or 2-D array
        One sample per row, e.g. ``x = [[1.3], [3.7], [5.1], [2.4]]`` for
        four samples of a one-dimensional variable. The input is not
        modified.
    k : int
        Which nearest neighbour to use; must be at most ``len(x) - 1``.
    base : float
        Logarithm base of the returned entropy.
    intens : float
        Scale of the uniform noise added to every coordinate before the
        neighbour search.

    Returns
    -------
    float
        Entropy estimate in units of ``base``.
    """
    assert k <= len(x) - 1, "Set k smaller than num. samples - 1"
    d = len(x[0])
    N = len(x)
    # Work on a noisy copy. The former ``x += noise`` extended a list input
    # with the noise rows and mutated array inputs in place.
    x = np.asarray(x, dtype=float) + intens * nr.rand(N, d)
    tree = KDTree(x, metric=DistanceMetric.get_metric("minkowski",
                                                      p=np.float64('inf')))
    # Column 0 of the query result is each point itself, so column k is the
    # distance to the k-th neighbour.
    nn = tree.query(x, k + 1)[0][:, k]
    const = digamma(N) - digamma(k) + d * np.log(2)
    # np.log works element-wise; ``np.mean(map(...))`` fails on Python 3
    # where ``map`` returns an iterator.
    return (const + d * np.mean(np.log(nn))) / np.log(base)
def find_computed_cluster_metrics(self):
    """Compute the metrics of every computed cluster and the largest centroid distance.

    Calls ``compute_metrics`` on each cluster in ``self.computed_clusters``
    and stores the maximum pairwise Euclidean distance between the cluster
    centroids in ``self.max_centroid_dist``.
    """
    for cluster in self.computed_clusters:
        cluster.compute_metrics(self.original_corpus, self.original_article_pos)

    centroids = [cluster.centroid for cluster in self.computed_clusters]
    euclidean = DistanceMetric.get_metric('euclidean')
    centroid_distances = euclidean.pairwise(centroids)

    # Flatten the square matrix row by row and keep the largest entry.
    flattened = itertools.chain.from_iterable(centroid_distances)
    self.max_centroid_dist = max(flattened)
def __init__(self, proc, Xss, yss, valid_set=0.1, validation_set=None):
    """Set up the learner state and its training / validation data.

    Parameters
    ----------
    proc : stored as ``self.proc``.
    Xss, yss : training inputs and targets.
    valid_set : stored as ``self.valid_set`` (default 0.1).
        # presumably the fraction held out by ``_split`` - verify there.
    validation_set : optional ``(valid_Xss, valid_yss)`` pair. When given,
        ``Xss`` / ``yss`` are stored as-is; when ``None``, ``self._split``
        is called with ``Xss`` and ``yss``.
    """
    self.seen_states = set()
    self.state_set = []
    self.proc = proc
    # Honour the argument; it used to be overwritten with a hard-coded 0.1.
    self.valid_set = valid_set
    self.surr_loss = DistanceMetric.get_metric('hamming')
    if validation_set is None:
        self._split(Xss, yss)
    else:
        self.Xss = Xss
        self.yss = yss
        self.valid_Xss, self.valid_yss = validation_set
def __max_score_mapping(rr, predicted, test, max_angle=1.0 - 2):
    """Give every test sample the best score among the predictions close to it.

    Parameters
    ----------
    rr : callable
        Scoring function applied to a single prediction row.
    predicted, test : 2-D arrays
        Compared row-wise with the ``spherical_angle`` metric.
    max_angle : float
        A prediction is "close" to a test sample when their angle is below
        this value.
        NOTE(review): the default ``1.0 - 2`` evaluates to -1.0, which no
        angle can be below if angles are non-negative; ``1.0e-2`` may have
        been intended - confirm before changing.

    Returns
    -------
    numpy.ndarray
        One float per test row: the maximum ``rr`` score of its close
        predictions, or 0 when it has none.
    """
    angle = DistanceMetric.get_metric("pyfunc", func=spherical_angle)
    d = angle.pairwise(predicted, test)

    test_mapping = np.zeros(shape=(test.shape[0],), dtype=float)
    # ``range`` replaces the Python-2-only ``xrange``.
    for i in range(test.shape[0]):
        close_predictions = d[:, i] < max_angle
        if np.any(close_predictions):
            scores = [rr(p) for p in predicted[close_predictions, :]]
            test_mapping[i] = np.max(scores)
    return test_mapping
def update_prediction(prediction, real_pic, metric_name='euclidean'):
    """
    Update a prediction after receiving the actual picture from the webcam.

    Parameters
    ----------
    prediction : Prediction
        The model object of the prediction to update
    real_pic : Picture
        The model object of the actual picture received
    metric_name : str
        Name passed to ``DistanceMetric.get_metric``, or the special value
        ``'wminkowski-pca'`` to compare PCA features with a Minkowski metric
        (p=2) weighted by the PCA explained-variance ratios.

    Return
    ------
    The prediction error: the first row of the 1x1 pairwise distance matrix
    (i.e. a one-element array). It is also stored in ``prediction.error``
    and the prediction is saved.

    Raises
    ------
    ValueError
        If ``'wminkowski-pca'`` is requested but the webcam dataset has no
        ``'pca'`` feature set.
    """
    pred_pic = prediction.as_picture()
    cam_id = prediction.params.webcam.webcam_id

    if metric_name != 'wminkowski-pca':
        # Plain metric on the raw pixels.
        pred_data = pred_pic.pixels
        real_data = real_pic.pixels
        metric = DistanceMetric.get_metric(metric_name)
    else:
        with webcam_fs.get_dataset(cam_id) as dataset:
            feature_sets = dataset.imgset.feature_sets
            if 'pca' not in feature_sets:
                raise ValueError("""wminkowski-pca cannnot be used without a PCA feature set""")
            pca_extractor = feature_sets['pca'].extractor
            weights = pca_extractor.pca.explained_variance_ratio_
            pred_data = pca_extractor.extract(pred_pic.pixels)
            real_data = pca_extractor.extract(real_pic.pixels)
            metric = DistanceMetric.get_metric('wminkowski', p=2, w=weights)

    error = metric.pairwise([pred_data], [real_data])[0]
    prediction.error = error
    prediction.save()
    return error
def entropy(data, ball='euclidean', k=1, units='nats'):
    """
    Estimates the entropy of the given data using the k-nearest neighbors method

    input
    -----
    data (nd-array):
        An (n by p) matrix containing n samples of p-dimensional data

    ball (string):
        Which ball (e.g. l1, euclidean, etc.) to use when computing the
        volume. It is resolved by ``getball``; the resolved order must be
        1 (Manhattan), 2 (Euclidean; default) or inf (Chebyshev).

    k (integer):
        How many nearest-neighbors to use when computing radii. Must be at
        least 1.

    units (string):
        Which unit the entropy output has (case-insensitive):
        'nats' : base e
        'bits' : base 2

    raises
    ------
    ValueError if the ball or the units are not supported.
    """
    # Validate the requested unit before doing any work. Unknown units used
    # to fall through and silently return None.
    unit = units.lower()
    if unit == 'nats':
        log = np.log
    elif unit == 'bits':
        log = np.log2
    else:
        raise ValueError("Unsupported units: %r" % (units,))

    # Get number of samples and dimensionality
    (n, p) = data.shape

    # Map the ball order onto a DistanceMetric name. An unknown order used
    # to surface as an UnboundLocalError further down.
    metric = getball(ball)
    if metric == 1:
        m = 'manhattan'
    elif metric == 2:
        m = 'euclidean'
    elif metric == inf:
        m = 'chebyshev'
    else:
        raise ValueError("Unsupported ball: %r" % (ball,))

    # Determine radii and volumes for the chosen metric space. After sorting
    # each row, column 0 is the zero self-distance, so column k is the
    # distance to the k-th neighbour.
    dist = DistanceMetric.get_metric(m)
    D_mat = dist.pairwise(data)
    D_mat.sort(axis=1)
    radii = D_mat[:, k]
    Vs = volume(radii, ball=str(metric), dimension=p)

    # NOTE(review): L(k - 1) and the constant 0.577215665 are applied
    # unchanged for both units, as in the original code.
    return sum([log(vol) for vol in Vs]) / float(n) + log(n) - L(k - 1) + 0.577215665
def __mappings(predicted, test, max_angle=1.0 - 2):
    """Flag which predictions and which test samples have a close counterpart.

    Parameters
    ----------
    predicted, test : 2-D arrays
        Compared row-wise with the ``spherical_angle`` metric.
    max_angle : float
        Two rows are "close" when their angle is below this value.
        NOTE(review): the default ``1.0 - 2`` evaluates to -1.0;
        ``1.0e-2`` may have been intended - confirm before changing.

    Returns
    -------
    (predicted_mapping, test_mapping) : integer arrays of 0/1 flags, one per
        predicted row and one per test row respectively.
    """
    angle = DistanceMetric.get_metric("pyfunc", func=spherical_angle)
    # d[i, j] is the angle between predicted row i and test row j.
    d = angle.pairwise(predicted, test)
    close = d < max_angle

    # A test sample is mapped when any prediction is close to it (column
    # reduction); a prediction is mapped when any test sample is close to it
    # (row reduction).
    test_mapping = np.any(close, axis=0).astype(int)
    predicted_mapping = np.any(close, axis=1).astype(int)
    return predicted_mapping, test_mapping
def mi_LNC(x, y, k=5, base=np.exp(1), alpha=0.25, intens=1e-10, metric='minkowski', p=np.float64('inf')):
    '''The mutual information estimator by PCA-based local non-uniform correction (LNC).

    Parameters
    ----------
    x, y : 2-D arrays
        One sample per row (the number of samples is taken from
        ``x.shape[0]`` and the two arrays are stacked column-wise). The
        inputs are not modified.
    k : int
        Number of nearest neighbours.
    base : float
        Logarithm base of the result.
    alpha : float
        Threshold parameter related to k and d (dimensionality); values
        above 1 are clamped to 1. Please refer to the LNC paper for details.
    intens : float
        Scale of the uniform noise added to every coordinate.
    metric, p :
        Passed to ``DistanceMetric.get_metric`` and ``MI.avgdigamma``.

    Returns
    -------
    float
        The corrected estimate, or NaN when the neighbour query raises
        ``ValueError``.
    '''
    # N is the number of samples
    N = x.shape[0]

    # First step: Kraskov mutual information estimate on noisy copies of the
    # inputs. Each noise array is sized from its own input (y used to get
    # noise shaped like x), and the caller's arrays are left untouched.
    x = x + intens * nr.rand(x.shape[0], x.shape[1])
    y = y + intens * nr.rand(y.shape[0], y.shape[1])
    points = np.hstack((x, y))
    tree = KDTree(points, metric=DistanceMetric.get_metric(metric, p=p))
    try:
        dvec, knn_idx = tree.query(points, k + 1)
    except ValueError:
        return float("NaN")

    a = MI.avgdigamma(x, dvec[:, -1] * x.shape[1] / points.shape[1], metric=metric, p=p)
    b = MI.avgdigamma(y, dvec[:, -1] * y.shape[1] / points.shape[1], metric=metric, p=p)
    c = digamma(k)
    d = digamma(len(x))
    # Kept in nats here; the base conversion happens exactly once, at the
    # end (it used to be applied twice for non-default bases).
    ret = -a - b + c + d

    # LNC correction: compare, per point, the log-volume of the axis-aligned
    # box to its k-th neighbour with the log-volume of the box aligned to
    # the principal axes of its neighbourhood.
    logV_knn = np.sum(np.log(np.abs(points - points[knn_idx[:, -1], :])), axis=1)
    logV_projected = np.zeros(logV_knn.shape)
    for i in range(points.shape[0]):
        knn_points = points[knn_idx[i, :], :]
        knn_centered = knn_points - points[i, :]
        _, _, v = la.svd(knn_centered)
        knn_proj = knn_centered.dot(v.T)
        max_dims = np.max(np.abs(knn_proj), axis=0)  # max-norm per dimension
        logV_projected[i] = np.sum(np.log(max_dims))

    diff = logV_projected - logV_knn
    if alpha > 1:
        alpha = 1
    # Only apply the correction where the projected volume is sufficiently
    # smaller than the k-NN volume.
    diff[diff >= np.log(alpha)] = 0
    e = -np.sum(diff) / N

    return (ret + e) / np.log(base)
def calcMDS(pltnum, flag, dmetric):
    """Embed the module-level data ``X`` in 2-D with MDS and plot it.

    Parameters
    ----------
    pltnum : value whose string form is prefixed to the sub-plot number
        passed to ``mdsPlot``.
    flag : 1 to reduce ``X`` with a 5-component PCA first, 2 to reduce it
        with a 5-component TruncatedSVD first, anything else to use ``X``
        directly.
    dmetric : metric name handed to ``DistanceMetric.get_metric`` for the
        dissimilarity matrix.
    """
    if flag == 1:
        Y = PCA(n_components=5).fit_transform(X)
        title = 'PCA-MDS'
    elif flag == 2:
        Y = TruncatedSVD(n_components=5).fit_transform(X)
        # This branch used to leave ``title`` unbound, which crashed the
        # plotting call below.
        title = 'TruncatedSVD-MDS'
    else:
        Y = X
        title = 'MDS DistanceMetric: ' + str(dmetric)

    # Pairwise dissimilarities, then a 2-D MDS embedding of them.
    dist = DistanceMetric.get_metric(dmetric)
    Y = dist.pairwise(Y)
    mds = manifold.MDS(n_components=2, dissimilarity='precomputed')
    Y = mds.fit_transform(Y)

    for i in range(1, 3):
        mdsPlot(int(str(pltnum) + str(i)), i, Y, title)
def distance_from_most_visited_place(place, user):
    """Return the haversine distance between ``place`` and the user's most visited place.

    Places are ranked by the user's visit count in ``visits_10min``; the
    first ranked place that has a location in ``places_location`` is used.

    NOTE(review): the coordinates are passed to the metric as
    (longitude, latitude) exactly as stored; scikit-learn's haversine metric
    documents (latitude, longitude) in radians - confirm order and units.
    NOTE(review): a missing location for ``place``, or no located visited
    place at all, makes the final lookup fail - confirm callers guard this.
    """
    q = (select([func.count(), visits_10min.c.placeid])
         .where(visits_10min.c.userid == user)
         .group_by(visits_10min.c.placeid)
         .order_by(func.count().desc()))
    most_visited_places = [row[1] for row in connection.execute(q).fetchall()]

    def get_lat_long(place_q):
        # First location row for this place and user, or None on any failure
        # (including "no such row").
        try:
            location_query = select(
                [places_location.c.longitude, places_location.c.latitude]
            ).where(and_(places_location.c.placeid == place_q,
                         places_location.c.userid == user))
            return connection.execute(location_query).fetchall()[0]
        except Exception:
            return None

    dist = DistanceMetric.get_metric('haversine')

    X = [get_lat_long(place)]
    for candidate in most_visited_places:
        location = get_lat_long(candidate)
        if location is None:
            continue
        X.append((location[0], location[1]))
        break

    # Entry [0][1] is the distance between the two collected points.
    return dist.pairwise(X)[0][1]
def MDS(self, typeof='classic', dist=False, groups=None, dpi=300, textsize=10, interactive=False,
        samemarker=False, markersize=8, numbered=False, legend=False, of='pdf', rotate=0, MD=False):
    '''
    Perform Multidimensional Scaling, either classic (PCoA) or non-metric.

    If you have the upper triangle of a distance matrix as a dictionary,
    pass the dictionary as dist; otherwise Euclidean distances between the
    rows of ``self.data`` are used.

    The embedding is rescaled to the overall norm of ``self.data``, rotated
    with PCA, stored in ``self.fit`` and handed to ``self.Plot`` together
    with the plotting options.
    '''
    # Rotation instance
    self.clf = PCA(n_components=self.ncomp)
    seed = np.random.RandomState(seed=3)

    metric = typeof == 'classic'
    self.type = 'cMDS' if metric else "nMDS"

    if dist:
        similarities = self.dict2array2matrix(dist)
    else:
        euclidean = DistanceMetric.get_metric('euclidean')
        similarities = euclidean.pairwise(self.data)

    # Initiate multidimensional scaling
    mds = manifold.MDS(n_components=self.ncomp, metric=metric, max_iter=3000,
                       eps=1e-9, random_state=seed,
                       dissimilarity="precomputed", n_jobs=-1)

    # Fit the data with MDS; the non-classic variant is re-run starting from
    # the first embedding.
    pos = mds.fit(similarities).embedding_
    if not metric:
        pos = mds.fit_transform(similarities, init=pos)

    # Rescale the data
    data_norm = np.sqrt((np.array(self.data) ** 2).sum())
    pos *= data_norm / np.sqrt((pos ** 2).sum())

    # Rotate the data
    self.fit = self.clf.fit_transform(pos)

    self.Plot(dpi=dpi, textsize=textsize, interactive=interactive,
              samemarker=samemarker, markersize=markersize, numbered=numbered,
              legend=legend, of=of, rotate=rotate, groups=groups, MD=MD)
def compute_metrics(self, corpus, article_pos):
    """Computes metrics for the given cluster.

    Metrics computed are: diameter, radius, centroid, closest article to
    centroid, the distance of the closest article to the centroid.

    Sets ``self.centroid``, ``self.diameter``, ``self.radius``,
    ``self.closest_article_id``, ``self.closest_article_distance`` and
    ``self.closest_article_content``.

    Args:
        corpus: A corpus in LSI space
        article_pos (dict): Maps the article id to the actual positions of
            the article in the corpus
    """
    dist_corpus = [corpus[article_pos[x]] for x in self.articles_id]

    # Centroid calculation
    self.centroid = np.average(dist_corpus, axis=0)

    # Diameter calculation: largest distance between any two articles.
    dist = DistanceMetric.get_metric('euclidean')
    self.diameter = dist.pairwise(dist_corpus).max()

    # Distance of every article to the centroid, kept aligned with
    # ``self.articles_id``. (Zero distances used to be filtered out first,
    # which shifted the positions used below and could leave the radius
    # unset.)
    centroid_dist = dist.pairwise(dist_corpus, [self.centroid])[:, 0]

    # Radius calculation: farthest article from the centroid.
    self.radius = centroid_dist.max()

    # Closest article computation (first one wins on ties).
    closest = int(np.argmin(centroid_dist))
    self.closest_article_id = self.articles_id[closest]
    self.closest_article_distance = centroid_dist[closest]
    self.closest_article_content = self.data[closest]
def mi_Kraskov(x, y, k=5, base=np.exp(1), intens=1e-10, metric="minkowski", p=np.float64('inf')):
    '''The mutual information estimator by Kraskov et al.

    Inputs are 2D arrays, with each column being a dimension and each row
    being a data point. The inputs are not modified.

    Parameters
    ----------
    x, y : 2-D arrays with the same number of rows.
    k : int
        Number of nearest neighbours; must be at most ``len(x) - 1``.
    base : float
        Logarithm base of the result.
    intens : float
        Scale of the uniform noise added to every coordinate.
    metric, p :
        Passed to ``DistanceMetric.get_metric`` and ``MI.avgdigamma``.

    Returns
    -------
    float
        The estimate, or NaN when the neighbour query raises ``ValueError``.
    '''
    assert len(x) == len(y), "Lists should have same length"
    assert k <= len(x) - 1, "Set k smaller than num. samples - 1"

    # Noisy copies: each noise array is sized from its own input (y used to
    # get noise shaped like x), and the caller's arrays are left untouched.
    x = x + intens * nr.rand(x.shape[0], x.shape[1])
    y = y + intens * nr.rand(y.shape[0], y.shape[1])
    points = np.hstack((x, y))

    # Find nearest neighbors in joint space, p=inf means max-norm
    tree = KDTree(points, metric=DistanceMetric.get_metric(metric, p=p))
    try:
        # Column 0 is each point itself, so column k is the k-th neighbour.
        dvec = tree.query(points, k + 1)[0][:, k]
    except ValueError:
        return float("NaN")

    a = MI.avgdigamma(x, dvec * x.shape[1] / points.shape[1], metric=metric, p=p)
    b = MI.avgdigamma(y, dvec * y.shape[1] / points.shape[1], metric=metric, p=p)
    c = digamma(k)
    d = digamma(len(x))
    return (-a - b + c + d) / np.log(base)
def compute_medoid(raw_points):
    """Return the index of the point with the smallest total haversine distance to all points.

    Only the first two components of each raw point are used; they are
    converted from degrees to radians before the distances are computed.
    """
    coordinates = [[point[0], point[1]] for point in raw_points]
    points = numpy.radians(coordinates)

    haversine_metric = DistanceMetric.get_metric('haversine')
    pairwise_distances = haversine_metric.pairwise(points)

    # Column sums give, per point, its summed distance to every point.
    totals = pairwise_distances.sum(axis=0)
    return numpy.argmin(totals)
def mst(data_path, k):
    """Cluster the rows of a CSV file into ``k`` groups by growing a minimum spanning forest.

    Edges between data points are visited in order of increasing Euclidean
    distance (Kruskal style). An edge that joins two different components is
    accepted, and the process stops as soon as ``k`` components remain.
    Points never touched by an accepted edge become one-point clusters.

    Parameters
    ----------
    data_path : path of the CSV file; every column is used as a coordinate.
    k : int
        Desired number of clusters.

    Returns
    -------
    pandas.DataFrame
        The data with columns renamed to 0..d-1 plus a ``'cluster'`` column
        holding an integer cluster id per row. The elapsed clustering time
        is printed.
    """
    df_pd = pd.read_csv(data_path)  # import the data
    N = len(df_pd)  # length of the data
    df_pd.columns = list(range(len(df_pd.columns)))  # name the columns with numbers

    # Pairwise euclidean distance between all data points.
    dist = DistanceMetric.get_metric('euclidean')
    distance = dist.pairwise(df_pd.to_numpy())

    # Each undirected edge once (upper triangle, no diagonal), sorted by
    # ascending distance. This replaces slicing the argsort of the full
    # matrix with [N::2], which relied on zero self-distances sorting first
    # and on mirrored entries being adjacent.
    rows, cols = np.triu_indices(N, k=1)
    order = np.argsort(distance[rows, cols], kind='stable')
    sorted_edges = [(int(rows[j]), int(cols[j])) for j in order]

    # Cluster column: tracks each point's cluster; 'Non visited' until the
    # point is reached by an accepted edge.
    df_pd['cluster'] = ['Non visited' for _ in range(N)]

    # Cluster id -> list of member points. The -1 entry only seeds max().
    cluster_dic = {-1: []}
    # Number of accepted edges; N - counter is the current cluster count.
    counter = 0

    t0 = time.time()
    for edge in sorted_edges:
        # Stop once k clusters remain (also covers k >= N, where no edge
        # must be accepted at all).
        if N - counter <= k:
            break

        parent_0 = df_pd.loc[edge[0], 'cluster']
        parent_1 = df_pd.loc[edge[1], 'cluster']

        if (parent_0 == parent_1) and (parent_0 != 'Non visited'):
            # Both ends already share a cluster: the edge would close a cycle.
            continue

        counter += 1
        if (parent_0 == 'Non visited') and (parent_1 == 'Non visited'):
            # Neither end has a cluster: start a new one with both points.
            new_cluster = max(cluster_dic) + 1
            cluster_dic[new_cluster] = [edge[0], edge[1]]
            df_pd.loc[edge[0], 'cluster'] = new_cluster
            df_pd.loc[edge[1], 'cluster'] = new_cluster
        elif parent_0 == 'Non visited':
            # Only the first end is new: it joins the second end's cluster.
            cluster_dic[parent_1] += [edge[0]]
            df_pd.loc[edge[0], 'cluster'] = parent_1
        elif parent_1 == 'Non visited':
            # Only the second end is new: it joins the first end's cluster.
            cluster_dic[parent_0] += [edge[1]]
            df_pd.loc[edge[1], 'cluster'] = parent_0
        else:
            # Two different clusters: merge the second into the first.
            cluster_dic[parent_0] += cluster_dic[parent_1]
            del cluster_dic[parent_1]
            df_pd.loc[df_pd['cluster'] == parent_1, 'cluster'] = parent_0

    # Points never reached by an accepted edge are 'alone' clusters. Each
    # gets its own fresh id (they used to share one id and overwrite an
    # existing entry of cluster_dic).
    for i in range(N):
        if df_pd.loc[i, 'cluster'] == 'Non visited':
            new_cluster = max(cluster_dic) + 1
            cluster_dic[new_cluster] = [i]
            df_pd.loc[i, 'cluster'] = new_cluster

    t1 = time.time()
    print(t1 - t0)
    return df_pd
def dist(X_1, X_2, param='euclidean'):
    """Return the distance between two points under the metric named ``param``."""
    metric = DistanceMetric.get_metric(param)
    # Pairwise matrix of the two points; entry [0, 1] is their distance.
    pair_matrix = metric.pairwise([X_1, X_2])
    return pair_matrix[0, 1]
test_error_cheb = 1 - accuracy_score(y_test, y_pred_test)

# Mahalanobis
from sklearn.datasets import make_classification
from sklearn.neighbors import DistanceMetric

x = x_train
y = y_train
xt = x_train
yt = y_train

# Test error for every candidate K = 1, 6, 11, ..., 96.
test_error = []
for K in range(1, 99, 5):
    # NOTE(review): fresh synthetic data replaces x_train / y_train on every
    # iteration, so errors for different K come from different data sets -
    # confirm this is intended.
    x, y = make_classification()
    xt, yt = make_classification()

    # Rows are samples, so the covariance is taken over the columns.
    covariance = np.cov(x, rowvar=False)
    neigh = KNeighborsClassifier(n_neighbors=K, algorithm='brute',
                                 metric='mahalanobis',
                                 metric_params={'V': covariance})
    neigh.fit(x, y)
    y_pred_test = neigh.predict(xt)

    # Indices of misclassified test samples. Predictions are for ``xt``, so
    # they are compared with ``yt`` (they used to be compared with the
    # training labels).
    last = [i for i in range(len(yt)) if yt[i] != y_pred_test[i]]

    # Store the error rate itself, so that the minimum below really selects
    # the best K (the list used to hold accuracies).
    test_error.append(len(last) / float(len(y_pred_test)))

minpos = test_error.index(min(test_error))
# Map the list position back to the K value it was computed for.
k_use = (minpos * 5) + 1
def euclidean(x, y):
    """Return the matrix of Euclidean distances between the rows of ``x`` and ``y``."""
    return DistanceMetric.get_metric('euclidean').pairwise(x, y)
def _drop_stops_near(stops, others, min_distance=100):
    """Return the stops whose ``haversine`` distance to every point in ``others`` is at least ``min_distance``."""
    return [stop for stop in stops
            if not any(haversine(stop, other) < min_distance for other in others)]


def aglo(processed_files, n_clusters=80):
    '''
    This function will agglomerate the group of closed coordinates into
    single coordinate, plot into KML and mark it as Left or Right or Stop

    Two KML files are written: 'Output_before_aglo.kml' with the raw points
    and 'Output.kml' with the cluster centroids.

    :param processed_files: Containing full path of the three generated text files
    :param n_clusters: By default set to 80
    :return: None
    '''
    # Parse the text files; the dictionary is keyed by the full path name of
    # the text file.
    dataframes = {}
    for file in processed_files:
        dataframes[file] = readFile(file)

    # Plot the raw points before agglomeration.
    kml1 = simplekml.Kml(open=1)
    for df in dataframes:
        for row in dataframes[df]:
            kmlPoint(kml1, df, row)
    kml1.save('Output_before_aglo.kml')

    # Single-linkage agglomeration on a precomputed haversine distance
    # matrix; the cluster labels are stored per file.
    # NOTE(review): scikit-learn's haversine metric documents inputs as
    # (latitude, longitude) in radians - confirm what readFile returns.
    model_labels = {}
    for df in dataframes:
        dist = DistanceMetric.get_metric('haversine')
        dist_matrix = dist.pairwise(dataframes[df])
        hc = AgglomerativeClustering(n_clusters=n_clusters,
                                     affinity='precomputed', linkage='single')
        hc.fit(dist_matrix)
        model_labels[df] = hc.labels_

    # Centroid (mean of column 0, mean of column 1) of every cluster of
    # every file.
    centroids = {}
    for model in model_labels:
        centroid = []
        for cluster in range(n_clusters):
            members = model_labels[model] == cluster
            latitude = dataframes[model][members, 0]
            longitude = dataframes[model][members, 1]
            centroid.append([np.sum(latitude) / len(latitude),
                             np.sum(longitude) / len(longitude)])
        centroids[model] = centroid

    # Centroid lists of the individual tracks, recognised by file name.
    stops = []
    left = []
    right = []
    for model in centroids:
        if 'stops' in model:
            stops = centroids[model]
        if 'left' in model:
            left = centroids[model]
        if 'right' in model:
            right = centroids[model]

    # Remove stops that lie within 100 of a left or right coordinate. New
    # lists are built: deleting from ``stops`` while enumerating it skipped
    # the element following every removal.
    # NOTE(review): the original comments call the threshold "100 meters";
    # confirm the unit returned by ``haversine``.
    stops = _drop_stops_near(stops, left)
    stops = _drop_stops_near(stops, right)

    # Store the filtered stop list back into the centroids dictionary.
    for model in centroids:
        if 'stops' in model:
            centroids[model] = stops

    # Plot the centroids of every track with its label and save the result.
    kml = simplekml.Kml(open=1)
    for centroid in centroids:
        for coordinates in centroids[centroid]:
            kmlPoint(kml, centroid, coordinates)
    kml.save('Output.kml')
def create_data_model(self):
    """Stores the data for the problem.

    Builds the problem dictionary (locations, the same locations as lists,
    vehicle count, depot index and the Euclidean distance matrix under
    'line') and stores it in ``self.data`` under a key made of
    "circuit_board" followed by the number of locations.
    """
    # Locations in block units
    locations = [
        (288, 149), (288, 129), (270, 133), (256, 141), (256, 157), (246, 157),
        (236, 169), (228, 169), (228, 161), (220, 169), (212, 169), (204, 169),
        (196, 169), (188, 169), (196, 161), (188, 145), (172, 145), (164, 145),
        (156, 145), (148, 145), (140, 145), (148, 169), (164, 169), (172, 169),
        (156, 169), (140, 169), (132, 169), (124, 169), (116, 161), (104, 153),
        (104, 161), (104, 169), (90, 165), (80, 157), (64, 157), (64, 165),
        (56, 169), (56, 161), (56, 153), (56, 145), (56, 137), (56, 129),
        (56, 121), (40, 121), (40, 129), (40, 137), (40, 145), (40, 153),
        (40, 161), (40, 169), (32, 169), (32, 161), (32, 153), (32, 145),
        (32, 137), (32, 129), (32, 121), (32, 113), (40, 113), (56, 113),
        (56, 105), (48, 99), (40, 99), (32, 97), (32, 89), (24, 89),
        (16, 97), (16, 109), (8, 109), (8, 97), (8, 89), (8, 81),
        (8, 73), (8, 65), (8, 57), (16, 57), (8, 49), (8, 41),
        (24, 45), (32, 41), (32, 49), (32, 57), (32, 65), (32, 73),
        (32, 81), (40, 83), (40, 73), (40, 63), (40, 51), (44, 43),
        (44, 35), (44, 27), (32, 25), (24, 25), (16, 25), (16, 17),
        (24, 17), (32, 17), (44, 11), (56, 9), (56, 17), (56, 25),
        (56, 33), (56, 41), (64, 41), (72, 41), (72, 49), (56, 49),
        (48, 51), (56, 57), (56, 65), (48, 63), (48, 73), (56, 73),
        (56, 81), (48, 83), (56, 89), (56, 97), (104, 97), (104, 105),
        (104, 113), (104, 121), (104, 129), (104, 137), (104, 145), (116, 145),
        (124, 145), (132, 145), (132, 137), (140, 137), (148, 137), (156, 137),
        (164, 137), (172, 125), (172, 117), (172, 109), (172, 101), (172, 93),
        (172, 85), (180, 85), (180, 77), (180, 69), (180, 61), (180, 53),
        (172, 53), (172, 61), (172, 69), (172, 77), (164, 81), (148, 85),
        (124, 85), (124, 93), (124, 109), (124, 125), (124, 117), (124, 101),
        (104, 89), (104, 81), (104, 73), (104, 65), (104, 49), (104, 41),
        (104, 33), (104, 25), (104, 17), (92, 9), (80, 9), (72, 9),
        (64, 21), (72, 25), (80, 25), (80, 25), (80, 41), (88, 49),
        (104, 57), (124, 69), (124, 77), (132, 81), (140, 65), (132, 61),
        (124, 61), (124, 53), (124, 45), (124, 37), (124, 29), (132, 21),
        (124, 21), (120, 9), (128, 9), (136, 9), (148, 9), (162, 9),
        (156, 25), (172, 21), (180, 21), (180, 29), (172, 29), (172, 37),
        (172, 45), (180, 45), (180, 37), (188, 41), (196, 49), (204, 57),
        (212, 65), (220, 73), (228, 69), (228, 77), (236, 77), (236, 69),
        (236, 61), (228, 61), (228, 53), (236, 53), (236, 45), (228, 45),
        (228, 37), (236, 37), (236, 29), (228, 29), (228, 21), (236, 21),
        (252, 21), (260, 29), (260, 37), (260, 45), (260, 53), (260, 61),
        (260, 69), (260, 77), (276, 77), (276, 69), (276, 61), (276, 53),
        (284, 53), (284, 61), (284, 69), (284, 77), (284, 85), (284, 93),
        (284, 101), (288, 109), (280, 109), (276, 101), (276, 93), (276, 85),
        (268, 97), (260, 109), (252, 101), (260, 93), (260, 85), (236, 85),
        (228, 85), (228, 93), (236, 93), (236, 101), (228, 101), (228, 109),
        (228, 117), (228, 125), (220, 125), (212, 117), (204, 109), (196, 101),
        (188, 93), (180, 93), (180, 101), (180, 109), (180, 117), (180, 125),
        (196, 145), (204, 145), (212, 145), (220, 145), (228, 145), (236, 145),
        (246, 141), (252, 125), (260, 129), (280, 133)
    ]  # yapf: disable

    data = {}
    data['locations'] = locations
    # Same coordinates as mutable [x, y] lists.
    data['data'] = [list(point) for point in locations]
    data['num_vehicles'] = 1
    data['depot'] = 0

    # Full pairwise Euclidean distance matrix between the locations.
    euclidean = DistanceMetric.get_metric('euclidean')
    data['line'] = euclidean.pairwise(locations)

    self.data = {f"circuit_board{len(data['line'])}": data}
# Demo script: fits two small k-nearest-neighbour classifiers on toy data and
# prints their predictions for a single query point each.
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.neighbors import DistanceMetric
from sklearn.metrics import accuracy_score

# NOTE(review): this Manhattan metric object is created but never passed to
# either classifier below -- confirm whether it was meant to be used.
dist = DistanceMetric.get_metric('manhattan')

# First example: 3-nearest-neighbour classifier with the default metric.
X = np.asarray([[1, 4, 2], [5, 4, 8], [2, 6, 5], [1, 1, 1], [2, 9, 6]])
y = np.asarray([2, 3, 3, 1, 2])
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X, y)
X_test = np.asarray([[5, 3, 8]])
print(neigh.predict(X_test))
print('-----------------')

# Second example: 1-nearest-neighbour classifier with the 'l2' metric.
# X is rebound here to a plain list of lists.
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier(n_neighbors=1, metric='l2')
X = [[1, 4, 1], [5, 4, 8], [2, 6, 5], [1, 1, 1], [2, 9, 6]]
Y = [1, 1, 1, 0, 0]
to_pred = [[1, 22, 1]]
model.fit(X, Y)
pred = model.predict(to_pred)
print(pred)
print(' ---------------- ')
print('date de train si date de test')  # to be modified above!!
prop = np.mean(y_test == 1.0) # Compute the appropriate threshold threshold = np.quantile(scores, prop) # Print the confusion matrix for the thresholded scores print(confusion_matrix(y_test, scores > threshold)) ## Find the neighbor # It is clear that the local outlier factor algorithm depends a lot on the idea of a nearest neighbor, which in turn depends on the choice of distance metric. So you decide to experiment some more with the hepatitis dataset introduced in the previous lesson. You are given three examples stored in features, whose classes are stored in labels. You will identify the nearest neighbor to the first example (row with index 0) using three different distance metrics, Euclidean, Hamming and Chebyshev, and on the basis of that choose which distance metric to use. You will import the necessary module as part of the exercise, but pandas and numpy already available, as are features and their labels labels. # Import DistanceMetric as dm from sklearn.neighbors import DistanceMetric as dm # Find the Euclidean distance between all pairs dist_eucl = dm.get_metric('euclidean').pairwise(features) # Find the Hamming distance between all pairs dist_hamm = dm.get_metric('hamming').pairwise(features) # Find the Chebyshev distance between all pairs dist_cheb = dm.get_metric('chebyshev').pairwise(features) ## Not all metrics agree # In the previous exercise you saw that not all metrics agree when it comes to identifying nearest neighbors. But does this mean they might disagree on outliers, too? You decide to put this to the test. You use the same data as before, but this time feed it into a local outlier factor outlier detector. The module LocalOutlierFactor has been made available to you as lof, and the data is available as features. # Instructions # 100 XP # Detect outliers in features using the euclidean metric. # Detect outliers in features using the hamming metric. # Detect outliers in features using the jaccard metric.
def check_pickle(metric, kwargs):
    """Assert that a metric survives a pickle round trip unchanged.

    Builds the metric named ``metric`` with keyword arguments ``kwargs``,
    computes its pairwise distances on the module-level ``X1``, then checks
    that an unpickled copy produces (almost) the same matrix.
    """
    original = DistanceMetric.get_metric(metric, **kwargs)
    expected = original.pairwise(X1)

    # Round-trip through pickle and recompute on the same data.
    restored = pickle.loads(pickle.dumps(original))
    actual = restored.pairwise(X1)

    assert_array_almost_equal(expected, actual)
def _calculate_batch_avg_distance_school(self, points):
    """Return the mean pairwise haversine distance between ``points``.

    ``points`` is converted with ``np.radians`` before the haversine metric
    is applied (presumably (lat, lon) pairs in degrees -- confirm with the
    caller). Only entries strictly below the diagonal of the distance matrix
    are averaged, and the mean is scaled by the ``earth_radius`` constant.
    """
    earth_radius = 6371.0088
    haversine = DistanceMetric.get_metric('haversine')
    angular = haversine.pairwise(np.radians(points))

    # Strictly-lower triangle: skips the zero diagonal and mirrored entries.
    n_rows, n_cols = angular.shape[0], angular.shape[1]
    lower = np.tril_indices(n=n_rows, k=-1, m=n_cols)

    mean_angular = np.mean(angular[lower])
    return earth_radius * mean_angular
def seuclidean(x, y, V):
    """Return the pairwise standardized Euclidean distances between x and y.

    Parameters
    ----------
    x, y : array-like
        Sample matrices handed to ``DistanceMetric.pairwise``.
    V : array-like
        Per-feature variance vector used by the 'seuclidean' metric.

    Returns
    -------
    The distance matrix produced by ``pairwise(x, y)``.
    """
    # Metric parameters must be given by keyword: get_metric only accepts the
    # metric name positionally, so a positional V is rejected or misread.
    metric = DistanceMetric.get_metric('seuclidean', V=V)
    return metric.pairwise(x, y)
def brute_force_neighbors(X, Y, k, metric, **kwargs):
    """Find the ``k`` nearest rows of ``X`` for every row of ``Y`` by brute force.

    Both inputs are validated with ``check_array``. The full distance matrix
    between ``Y`` and ``X`` is computed with the named ``metric`` (extra
    keyword arguments are forwarded to ``DistanceMetric.get_metric``).

    Returns
    -------
    (dist, ind) : distances to, and indices in ``X`` of, the ``k`` closest
    points for each row of ``Y``, ordered by increasing distance.
    """
    X = check_array(X)
    Y = check_array(Y)

    metric_obj = DistanceMetric.get_metric(metric, **kwargs)
    D = metric_obj.pairwise(Y, X)

    # Sort every row by distance and keep the first k columns.
    order = np.argsort(D, axis=1)
    ind = order[:, :k]

    # Column vector of row numbers so fancy indexing pairs row i with ind[i].
    rows = np.arange(Y.shape[0])[:, None]
    return D[rows, ind], ind
def _distancemetric_factory(X):
    """Return the ``DistanceMetric`` for the outer-scope variable ``name``.

    ``X`` is accepted but not used; ``name`` is not a parameter and must be
    defined in an enclosing or module scope.
    """
    metric = DistanceMetric.get_metric(name)
    return metric
def chebyshev(x, y):
    """Return the pairwise Chebyshev distance matrix between ``x`` and ``y``."""
    return DistanceMetric.get_metric('chebyshev').pairwise(x, y)
def sklearn_haversine(y, x):
    """Return all pairwise haversine distances between points ``(y[i], x[i])``.

    ``y`` and ``x`` are stacked as the two columns of one coordinate matrix
    and the haversine result is multiplied by 6371. No unit conversion is
    done here (presumably the inputs are latitude/longitude already in
    radians -- confirm with the caller).
    """
    metric = DistanceMetric.get_metric('haversine')

    # Turn each 1-D input into a column and place them side by side.
    y_col = y[:, None]
    x_col = x[:, None]
    coords = np.hstack((y_col, x_col))

    pairwise = metric.pairwise(coords)
    return 6371 * pairwise  # 6371 is for distances in kms
def compute_diameter(raw_points):
    """Return the largest pairwise haversine distance among ``raw_points``.

    The first two components of every point are converted with
    ``numpy.radians`` and the maximum entry of the haversine distance matrix
    is multiplied by 6372795 (presumably the Earth radius in metres --
    confirm).
    """
    coords = numpy.radians([(point[0], point[1]) for point in raw_points])
    pairwise = DistanceMetric.get_metric('haversine').pairwise(coords)
    # The diameter is the single largest entry of the distance matrix.
    return pairwise.max() * 6372795
def __init__(self):
    """Set up the batch size, feature model, distance metric and scaler."""
    self.batch_size = 10

    # ImageNet-weighted ResNet50 without its top layer, average-pooled output.
    backbone_options = {
        'weights': 'imagenet',
        'include_top': False,
        'pooling': 'avg',
    }
    self.model = ResNet50(**backbone_options)

    self.dist = DistanceMetric.get_metric('euclidean')
    self.min_max_scaler = MinMaxScaler()
def dist(X_1, X_2, param='euclidean'):
    """Return the distance between the two samples ``X_1`` and ``X_2``.

    ``param`` names the ``DistanceMetric`` to use (Euclidean by default).
    """
    metric = DistanceMetric.get_metric(param)
    # 2x2 matrix over the pair; the off-diagonal entry is the distance sought.
    pair_matrix = metric.pairwise([X_1, X_2])
    return pair_matrix[0, 1]
def manhattan(x, y):
    """Return the pairwise Manhattan distance matrix between ``x`` and ``y``."""
    return DistanceMetric.get_metric('manhattan').pairwise(x, y)
def fit(self, X, y):
    """Fit one centroid per class, optionally pruning outliers first.

    For every class the centroid is the mean of its instances. When
    ``self.optimize`` is enabled, instances lying farther from the centroid
    than the point ``centroid + self.sigma * std`` are dropped and the
    centroid is recomputed, repeating until no instance is dropped.

    Sets ``classes_``, ``X_``, ``y_``, ``dm_`` and ``centroids_`` (and, when
    optimizing, ``borderline_`` and ``outliers_mask_`` from the last
    iteration) and returns ``self``.
    """
    # Check that X and y have the right shape.
    X, y = check_X_y(X, y)
    # Store the unique classes of the problem.
    self.classes_ = np.unique(y)
    # Remember the training data.
    self.X_, self.y_ = X, y
    # Tool used to compute distances.
    self.dm_ = DistanceMetric.get_metric(self.metric)
    # Container for the class centroids.
    self.centroids_ = []

    for label in self.classes_:
        # Keep only the instances belonging to this class.
        members = self.X_[self.y_ == label]

        while True:
            # Centroid of the instances still assigned to the class.
            class_centroid = np.mean(members, axis=0)

            # Without optimization the plain mean is the final centroid.
            if self.optimize == False:
                break

            # Per-feature standard deviation of the class instances.
            spread = np.std(members, axis=0)
            # Farthest point still considered acceptable.
            self.borderline_ = class_centroid + (self.sigma * spread)

            n_features = members.shape[1]
            centroid_row = class_centroid.reshape(1, n_features)
            borderline_row = self.borderline_.reshape(1, n_features)

            # Maximum accepted distance from the centroid.
            accepted_distances = np.squeeze(
                self.dm_.pairwise(centroid_row, borderline_row))
            # Distances of all class instances from the centroid.
            distances = np.squeeze(
                self.dm_.pairwise(centroid_row, members))

            # Outliers are the instances beyond the accepted distance.
            self.outliers_mask_ = np.array(distances > accepted_distances)

            # Stop optimizing once no outliers remain.
            if np.sum(self.outliers_mask_) == 0:
                break

            # Otherwise discard the outliers and iterate again.
            members = members[self.outliers_mask_ == False]

        # Record the centroid computed for this class.
        self.centroids_.append(class_centroid)

    # Return the fitted classifier.
    return self
def haversine_distance(p1, p2):
    """Return the haversine distance between the two points ``p1`` and ``p2``.

    The points are passed to the metric unchanged; the result is the raw
    value from ``pairwise`` (no radius scaling is applied here).
    """
    metric = DistanceMetric.get_metric('haversine')
    pair_matrix = metric.pairwise([p1, p2])
    # Entry [0][1] is the distance from p1 to p2.
    return pair_matrix[0][1]
def wminkowski(x, y, p, w):
    """Return the pairwise weighted Minkowski distances between x and y.

    Parameters
    ----------
    x, y : array-like
        Sample matrices handed to ``DistanceMetric.pairwise``.
    p : number
        Order of the Minkowski norm.
    w : array-like
        Per-feature weight vector.

    Returns
    -------
    The distance matrix produced by ``pairwise(x, y)``.
    """
    # Metric parameters must be given by keyword: get_metric only accepts the
    # metric name positionally, so positional p/w are rejected or misread.
    # NOTE(review): newer scikit-learn releases dropped 'wminkowski' in favour
    # of 'minkowski' with a `w` argument -- confirm against the installed version.
    metric = DistanceMetric.get_metric('wminkowski', p=p, w=w)
    return metric.pairwise(x, y)
'Is this the first document?',
]
# Python 2 script fragment (print statements): vectorize `corpus`, show the
# vocabulary and counts, then compare two ways of computing TF-IDF.
X = vectorizer.fit_transform(corpus)
print vectorizer.get_feature_names()
print X.toarray()
print "first index:", vectorizer.vocabulary_.get("first")

# TF-IDF computed from the dense count matrix, without IDF smoothing.
transformer = TfidfTransformer(smooth_idf=False)
tfidf = transformer.fit_transform(X.toarray())
print tfidf.toarray()
print "idf vector:", transformer.idf_

# The same weighting computed directly from the raw corpus.
transformer2 = TfidfVectorizer(smooth_idf=False)
tfidf2 = transformer2.fit_transform(corpus)
print "\n", tfidf2.toarray()

# NOTE(review): mink_metric is created but not used in the visible code.
mink_metric = DistanceMetric.get_metric("minkowski")
eucl_metric = DistanceMetric.get_metric("euclidean")
# X is rebound here to a small dense example.
X = [[0, 1, 2], [3, 4, 5]]
eucl_pairs = eucl_metric.pairwise(X)

# Round-trip the distance matrix through `save` / `load`; the ".npy" suffix is
# appended on load (presumably numpy.save / numpy.load -- confirm the imports).
filename = "eucl_pairs"
print "before\n" + str(eucl_pairs)
save(filename, eucl_pairs)
print "after"
loaded_eucl_pairs = load(filename + ".npy")
print loaded_eucl_pairs
import collections
import graphlab as gl
import pandas as pd
import json
from pymongo import MongoClient
from sklearn.neighbors import DistanceMetric

# Module-level haversine metric used by recommender.build.
dist = DistanceMetric.get_metric('haversine')


class recommender(object):
    """Recommender built from a tips data frame and a businesses data frame."""

    def __init__(self, df_tip, df_biz):
        """Store the tips and businesses frames; the location starts unset."""
        self.tip = df_tip
        self.biz = df_biz
        self.loc = None

    def build(self, community, city):
        """Select the tips of ``community`` and, when ``city`` is None, the
        businesses they refer to together with their distance from ``self.loc``.
        """
        ### Get information of community
        tip_community = self.tip.loc[self.tip['user_id'].isin(community)]
        if city is None:
            # Get a list of business.
            businesses = list(set(tip_community['business_id']))
            # Get information of businesses for biz.
            # NOTE(review): `df` is not defined in the visible code; this
            # probably should be `self.biz` -- confirm.
            biz_community = self.biz.loc[df['business_id'].isin(businesses)]
            # Calculate distances from loc to business_id.
            # NOTE(review): `np` is not among the visible imports, and the
            # columns are passed as (longitude, latitude) without conversion --
            # verify the order and units expected by the haversine metric.
            biz_community.loc[:,'dist'] = biz_community[['longitude','latitude']].apply(lambda x: dist.pairwise(self.loc,x[np.newaxis,:])[0][0], axis = 1)
            # Get a list of relevant businesses in diameter of 10 unit of distance.
## KNN PREDICTOR ## # do some lambda magic on text columns traindata = list(train.apply(lambda x:'%s %s %s' % (x['query'],x['product_title'], x['product_description']),axis=1)) testdata = list(test.apply(lambda x:'%s %s %s' % (x['query'],x['product_title'], x['product_description']),axis=1)) # Fit TFIDF tfv.fit(traindata) X = tfv.transform(traindata) X_test = tfv.transform(testdata) clf = pipeline.Pipeline([('tSVD',tSVD),('scl',scl),('knn',knn)]) param_grid = {'knn__n_neighbors':[2],'knn__metric':[DistanceMetric.get_metric('manhattan')],'tSVD__n_components':[400]} model = grid_search.GridSearchCV(estimator = clf, param_grid = param_grid, scoring = kappa_scorer, refit = True, cv = 2, n_jobs = -1) # Fit Model model.fit(X, y) model.best_estimator_.fit(X,y) trainPred = model.best_estimator_.predict(X_test) # Averaging predicted relevance values finalPred = [int(floor((int(stemPred[i])+trainPred[i])*0.5)) for i in range(len(stemPred))] #print "Kappa Score for Training Data\nStemming+KNN\nScore=%f" %(quadratic_weighted_kappa(y, finalPred))
def minkowski(x, y, p):
    """Return the pairwise Minkowski distances of order ``p`` between x and y.

    Parameters
    ----------
    x, y : array-like
        Sample matrices handed to ``DistanceMetric.pairwise``.
    p : number
        Order of the Minkowski norm.

    Returns
    -------
    The distance matrix produced by ``pairwise(x, y)``.
    """
    # Metric parameters must be given by keyword: get_metric only accepts the
    # metric name positionally, so a positional p is rejected or misread.
    metric = DistanceMetric.get_metric('minkowski', p=p)
    return metric.pairwise(x, y)
def check_cdist_bool(metric, D_true):
    """Assert that ``metric`` reproduces ``D_true`` on the boolean fixtures.

    The pairwise distances between the module-level ``X1_bool`` and
    ``X2_bool`` must be almost equal to ``D_true``.
    """
    computed = DistanceMetric.get_metric(metric).pairwise(X1_bool, X2_bool)
    assert_array_almost_equal(computed, D_true)
def main(args):
    """Train and save a 1-nearest-neighbour word model, or use it to predict.

    When ``args.predict`` is None, a ``KNeighborsClassifier`` pipeline is
    fitted on vectors from ``makeVectors`` and pickled (lzma-compressed) to
    ``args.model_path``; nothing is returned.

    Otherwise the model is loaded from ``args.model_path``, every word of the
    test text is replaced by its nearest known word (restoring the original
    capitalisation), a few hard-coded word-level corrections are applied, the
    result is written to ``pred.txt`` and returned as a string.
    """
    if args.predict is None:
        # We are training a model.
        np.random.seed(args.seed)
        train = Dataset()

        train_data, train_target = makeVectors(train.data, train.target)

        # The step is named 'mlp' but holds a 1-nearest-neighbour classifier.
        model = Pipeline(steps=[
            #('trans', OneHotEncoder()),
            ('mlp', KNeighborsClassifier(n_neighbors=1, p=2, n_jobs=-1))
        ])
        model.fit(train_data, train_target)
        '''
        print(neigh.predict(np.array([vectorize('reci', 20)])))
        print(neigh.predict_proba(np.array([vectorize('reci', 20)])))
        scaler = MinMaxScaler()
        scaledTrain = scaler.fit_transform(trainVectors[1])
        scaledTest = scaler.transform(testVector)
        mlp = MLPClassifier(activation='relu', hidden_layer_sizes=(200,), max_iter=300)
        mlp.fit(scaledTrain, targets[1])
        print(mlp.classes_)
        probs = mlp.predict_proba(scaledTest)
        weights = np.array([1.469, 3.131])
        for prob in probs:
            print(prob)
            print(np.argmax(prob * weights))
        testVector = np.array([vectorize(5, 'naramek', 40), vectorize(1, 'sedmnact', 40), vectorize(6, 'vypravel', 40)]);
        print(testVector)
        scaler = MinMaxScaler()
        scaledTrain = scaler.fit_transform(trainVectors[3])
        scaledTest = scaler.transform(testVector)
        mlp = MLPClassifier(activation='tanh', hidden_layer_sizes=(150), max_iter=300)
        mlp.fit(scaledTrain, targets[3])
        print(mlp.predict_proba(scaledTest))
        print(mlp.predict(scaledTest))
        '''
        # TODO: Train a model on the given dataset and store it in `model`.
        model = model

        # Serialize the model.
        with lzma.open(args.model_path, "wb") as model_file:
            pickle.dump(model, model_file)

    else:
        # Use the model and return test set predictions, as either a Python list or a NumPy array.
        test = Dataset(args.predict)

        # NOTE(review): pickle.load executes arbitrary code from the file;
        # only load model files from a trusted source.
        with lzma.open(args.model_path, "rb") as model_file:
            model = pickle.load(model_file)

        #ignored = [' ', '\n', '-', ':', ',', '.', '?', '!', '"']
        # Characters that separate words and are copied through unchanged.
        ignored = [' ', '\n']
        i = 0
        predictions = ''
        defaultWords = []
        words = []
        wordsLong = []
        wasCaps = []
        wasUpper = []
        testLower = test.data.lower()
        prevWord = ''
        # First pass: collect the lower-cased words, their vectors (alone and
        # prefixed with the previous word) and their capitalisation flags.
        while i < len(testLower):
            if testLower[i] in ignored:
                predictions += testLower[i]
                i += 1
            else:
                startIndex = i
                endIndex = i + 1
                # NOTE(review): this scan has no bounds check; it raises
                # IndexError if the text does not end with an ignored character.
                while testLower[endIndex] not in ignored:
                    endIndex += 1
                # Single-character tokens are skipped (not vectorized).
                if endIndex == startIndex + 1:
                    i = endIndex
                    predictions += test.data[startIndex:endIndex]
                    continue
                word = testLower[startIndex:endIndex]
                defaultWords.append(word)
                words.append(vectorize(word, 41))
                if (prevWord != ''):
                    wordsLong.append(vectorize(prevWord + ' ' + word, 41))
                else:
                    wordsLong.append(vectorize(word, 41))
                # Compare the original first character with its lower-cased
                # form to detect a capitalised word; a differing second
                # character as well marks the word as upper-case.
                capsC = test.data[startIndex]
                if capsC == word[0]:
                    wasCaps.append(0)
                    wasUpper.append(0)
                else:
                    wasCaps.append(1)
                    if len(word) > 1 and startIndex + 1 < len(
                            test.data) and test.data[startIndex + 1] != word[1]:
                        wasUpper.append(1)
                    else:
                        wasUpper.append(0)
                i = endIndex
                prevWord = word

        # The text accumulated during the first pass is discarded; the output
        # is rebuilt from scratch in the second pass.
        predictions = ''
        # Nearest training neighbour for every word vector (distance, index).
        dist1, newWords = model['mlp'].kneighbors(np.array(words), 1)
        dist2, newWordsLong = model['mlp'].kneighbors(np.array(wordsLong), 1)
        wordIndex = 0
        # NOTE(review): distt is not used anywhere else in this function.
        distt = DistanceMetric.get_metric('hamming')
        i = 0
        # Second pass: walk the text again and emit the replacement words.
        while i < len(testLower):
            if testLower[i] in ignored:
                predictions += testLower[i]
                i += 1
            else:
                startIndex = i
                endIndex = i + 1
                while testLower[endIndex] not in ignored:
                    endIndex += 1
                if endIndex == startIndex + 1:
                    i = endIndex
                    predictions += test.data[startIndex:endIndex]
                    continue
                # Map the neighbour index to its class label (uses the
                # classifier's private `_y` attribute).
                newWord = model['mlp'].classes_[model['mlp']._y[
                    newWords[wordIndex]]][0]
                newWordLong = model['mlp'].classes_[model['mlp']._y[
                    newWordsLong[wordIndex]]][0]
                dist = dist1[wordIndex][0]
                distd = dist2[wordIndex][0]
                # Prefer the context-aware candidate when it is an exact match.
                if (newWord != newWordLong and distd == 0):
                    #diff = mymetric(vectorize(newWord, 20), vectorize(newWordLong, 20))
                    #if diff < 2:
                    #print(newWord, 'za', newWordLong)
                    newWord = newWordLong
                    dist = dist2[wordIndex][0]
                '''
                else:
                    if dist > dist2[wordIndex][0]/1000:
                        dist = dist2[wordIndex][0]
                        newWord = newWordLong
                if defaultWords[wordIndex] == 'odpovedel':
                    print(defaultWords[wordIndex], newWord, newWordLong)
                    print(dist, distd)
                '''
                # Too far from any known word: keep the input word unchanged.
                if dist > 0.1:
                    newWord = defaultWords[wordIndex]
                # Restore the capitalisation recorded in the first pass.
                if wasCaps[wordIndex]:
                    newWord = newWord.capitalize()
                if wasUpper[wordIndex]:
                    newWord = newWord.upper()
                predictions += newWord
                wordIndex += 1
                i = endIndex

        i = 0
        # NOTE(review): count is not used anywhere else in this function.
        count = 0
        prevWord = ''
        # Third pass: hard-coded corrections that depend on the previous word
        # or on nearby characters. Each replacement has the same length as the
        # word it replaces, so the indices stay valid.
        # NOTE(review): startIndex - 2 / startIndex - 3 become negative near
        # the start of the text and then index from the end of the string.
        while i < len(predictions):
            if predictions[i] in ignored:
                i += 1
            else:
                startIndex = i
                endIndex = i + 1
                while predictions[endIndex] not in ignored:
                    endIndex += 1
                word = predictions[startIndex:endIndex]
                if word == 'že' and prevWord != 'ale' and prevWord != 'Ale' and predictions[
                        startIndex - 2] != ',' and predictions[
                            startIndex - 2] != 'a' and predictions[startIndex - 3] != ' ':
                    predictions = predictions[:startIndex] + 'ze' + predictions[
                        endIndex:]
                if word == 'ze' and prevWord == 'ale':
                    predictions = predictions[:startIndex] + 'že' + predictions[
                        endIndex:]
                if word == 'Že':
                    predictions = predictions[:startIndex] + 'Ze' + predictions[
                        endIndex:]
                if word == 'ně' and predictions[startIndex - 2] == ',':
                    predictions = predictions[:startIndex] + 'ne' + predictions[
                        endIndex:]
                if word == 'mne' and (prevWord == 'ke' or prevWord == 'o'):
                    predictions = predictions[:
                                              startIndex] + 'mně' + predictions[
                                                  endIndex:]
                if word == 'té' and (prevWord.lower() == 'prosím'
                                     or prevWord.lower() == 'jsme'
                                     or prevWord.lower() == 'aby'
                                     or prevWord.lower() == 'abych'
                                     or prevWord.lower() == 'co'
                                     or prevWord.lower() == 'kdo'
                                     or prevWord.lower() == 'který'):
                    predictions = predictions[:startIndex] + 'tě' + predictions[
                        endIndex:]
                if word == 'Té' and (prevWord.lower() == 'prosím'):
                    predictions = predictions[:startIndex] + 'Tě' + predictions[
                        endIndex:]
                i = endIndex
                # prevWord keeps the word as it was before any correction.
                prevWord = word

        # Also write the final text to pred.txt in the working directory.
        f = open("pred.txt", "w", encoding='utf8')
        f.write(predictions)
        f.close()
        return predictions
def mean_distance_to_closest(predicted, event):
    """Return the summed nearest-track angle of ``predicted``, per event track.

    Parameters
    ----------
    predicted : 2-D array
        One query point per row.
    event : object with a ``tracks`` array
        The tracks are indexed in a ball tree using the ``spherical_angle``
        distance function.

    Returns
    -------
    The sum over all rows of ``predicted`` of the distance to the closest
    track, divided by the number of tracks in ``event.tracks``.
    """
    angle = DistanceMetric.get_metric("pyfunc", func=spherical_angle)
    nn = BallTree(event.tracks, leaf_size=5, metric=angle)
    # query() returns (distances, indices). Only the distances are summed;
    # summing the whole tuple would add neighbour indices into the total.
    # All rows are queried in one call, avoiding per-row 1-D queries.
    nearest_dist, _ = nn.query(predicted, k=1)
    return np.sum(nearest_dist) / event.tracks.shape[0]
def check_pdist(metric, kwargs, D_true):
    """Assert that ``metric`` reproduces ``D_true`` on the ``X1`` fixture.

    The metric is built with keyword arguments ``kwargs`` and its pairwise
    distances over the module-level ``X1`` must be almost equal to ``D_true``.
    """
    metric_obj = DistanceMetric.get_metric(metric, **kwargs)
    computed = metric_obj.pairwise(X1)
    assert_array_almost_equal(computed, D_true)
# Notebook-style fragment: finishes a plot, then explores Euclidean distances
# with scipy and scikit-learn before loading the iris data.
plt.ylabel('Euclidean distance')
plt.show()
df

from scipy.spatial import distance
import numpy as np
# Distance between two unit vectors along different axes.
distance.euclidean([1, 0, 0], [0, 1, 0])
distance.euclidean([20, 25], [25, 22])  # closest: S1 with S2
# The same distance computed by hand.
np.sqrt(((20 - 25)**2 + (25 - 22)**2))  # sqrt(sum((x - y)^2))
distance.euclidean([20, 25], [35, 40])
distance.euclidean([20, 25], [40, 35])
distance.euclidean([35, 40], [40, 35])

# Full pairwise distance matrix of the data frame via scikit-learn.
from sklearn.neighbors import DistanceMetric
dist = DistanceMetric.get_metric('euclidean')
dist
df.to_numpy()
dist.pairwise(df.to_numpy())

# iris dataset
from sklearn.datasets import load_iris
from sklearn.cluster import AgglomerativeClustering
import numpy as np
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage
# Getting the data ready
from pydataset import data
iris = data('iris')
df2 = iris.copy()
def manhalanobis(x, y, V):
    """Return the pairwise Mahalanobis distances between x and y.

    The function name keeps its historical spelling so existing callers
    continue to work.

    Parameters
    ----------
    x, y : array-like
        Sample matrices handed to ``DistanceMetric.pairwise``.
    V : array-like
        Covariance matrix used by the 'mahalanobis' metric.

    Returns
    -------
    The distance matrix produced by ``pairwise(x, y)``.
    """
    # The registered metric name is 'mahalanobis' (the misspelled name is not
    # a known metric), and V must be passed by keyword to get_metric.
    metric = DistanceMetric.get_metric('mahalanobis', V=V)
    return metric.pairwise(x, y)
# Remaining rows of a 0/1 matrix whose opening lies above this fragment.
[0,0,1,1,0,1,0,1,1,0,1,1,0,1,1,0,0,1,1,1,1,1],
[1,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0],
[1,1,0,0,1,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0],
[1,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0],
[1,1,0,0,0,0,0,0,0,1,0,0,1,0,0,1,1,0,0,0,0,0],
[0,0,1,0,0,1,0,1,0,1,1,0,0,0,0,0,0,1,1,1,1,0],
[1,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]
]

# Load the previously dumped attribute classifier and the UIUC1 features/labels.
AttributeClassifier = joblib.load('Dumps/AttributeClassifierKnowledgeTransfer.pkl')
features = scipy.io.loadmat("./UIUC1/UIUC1_win_feature.mat")
labels = scipy.io.loadmat("./UIUC1/UIUC1_labels.mat")
# NOTE(review): this file handle is never closed in the visible code.
action_actor = open("./UIUC1/action_actor.txt")
dist = DistanceMetric.get_metric('euclidean')

# mapping[0] is an empty placeholder, so the entry for the first line of the
# file lands at index 1.
mapping = [{}]
for line in action_actor:
    line = line.split()
    # One-hot vector of length 14 marking the action given in the first column.
    # NOTE(review): numpy.int was removed in recent NumPy releases -- confirm
    # the NumPy version this script targets.
    actionvector = numpy.zeros(14, dtype=numpy.int)
    actionvector[int(line[0])]=1
    mapping.append({'action':int(line[0]),'actionvector':actionvector, 'actor':int(line[1])})

total = len(labels['vlabels'][0])
# 2x2 confusion matrix accumulated over all novel-class pairs.
ConfusionMatrix=numpy.array([[0,0],[0,0]])
# Seven pairs of class indices, each pair treated in turn as the novel classes.
NovelClassList=[[0,1],[2,3],[4,5],[6,7],[8,9],[10,11],[12,13]]
for NovelClass in NovelClassList:
    # Per-pair 2x2 confusion matrix, reset at the start of every iteration.
    ConfusionMatrix2=numpy.array([[0,0],[0,0]])
def test_pickle_bool_metrics(metric):
    """Check that a boolean metric gives the same distances after pickling.

    Distances over the module-level ``X1_bool`` are computed before and
    after a pickle round trip of the metric object and compared.
    """
    original = DistanceMetric.get_metric(metric)
    expected = original.pairwise(X1_bool)

    # Serialize and restore the metric, then recompute on the same data.
    restored = pickle.loads(pickle.dumps(original))
    actual = restored.pairwise(X1_bool)

    assert_array_almost_equal(expected, actual)