def distance(vector1, vector2, alpha=2, metric='euclidean'):
    '''
    Helper function that calculates the norm between two vectors.
    :param vector1: a vector
    :type vector1: list of doubles
    :param vector2: a vector
    :type vector2: list of doubles
    :param alpha: scaling parameter (currently unused)
    :type alpha: double
    :param metric: euclidean, mahalanobis, seuclidean, cityblock or hamming
    :type metric: string
    :rtype: norm between vectors A and B
    '''
    alpha = numpy.float64(1.0 * alpha)
    vector1 = numpy.float64(numpy.array(vector1))
    vector2 = numpy.float64(numpy.array(vector2))
    if metric == 'euclidean':
        vector_norm = distances.euclidean(vector1, vector2)
    elif metric == 'mahalanobis':
        vi = numpy.linalg.inv(
            numpy.cov(numpy.concatenate((vector1, vector2)).T))
        vector_norm = distances.mahalanobis(vector1, vector2, vi)
    elif metric == 'seuclidean':
        vector_norm = distances.seuclidean(vector1, vector2)
    elif metric == 'cityblock':
        vector_norm = distances.cityblock(vector1, vector2)
    elif metric == 'hamming':
        vector_norm = distances.hamming(vector1, vector2)
    else:
        print("Unknown metric")
        return None
    return vector_norm
def match_state_seq(sts_true, sts_pred, K):
    """ Matches the set of states in sts_pred such that it minimizes the
        hamming distance between sts_pred and sts_true. We assume here that
        the states are labeled 0, ..., K - 1.

        sts_true : A numpy array of integers.
        sts_pred : A numpy array of integers.
        K : Number of states in case sts_true doesn't cover all states.
    """
    sts = np.arange(K, dtype='int')
    sts_true = sts_true.astype('int')
    sts_pred = sts_pred.astype('int')
    min_perm = None
    min_hd = np.inf
    for p in itertools.permutations(sts):
        cur_sts = np.array(p)[sts_pred]
        hd = distance.hamming(sts_true, cur_sts)
        if hd < min_hd:
            min_hd = hd
            min_perm = p
    return np.array(min_perm)
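# Usage sketch for match_state_seq with made-up toy arrays, assuming numpy as
# np, itertools, and scipy.spatial's distance module are available as the
# function above expects:
import itertools
import numpy as np
from scipy.spatial import distance

sts_true = np.array([0, 0, 1, 1, 2])
sts_pred = np.array([1, 1, 0, 0, 2])
perm = match_state_seq(sts_true, sts_pred, K=3)
# perm relabels the predicted states so they line up with the true ones:
assert np.array_equal(perm[sts_pred], sts_true)  # perm == array([1, 0, 2])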
def eval_ind(individual, initial_pop, model, base_biomass, exp_ess, distance):
    # Set this as warning
    model.solver = 'gurobi'
    old_biomass = list(linear_reaction_coefficients(model).keys())[0]  # index removed
    old_biomass.remove_from_model()
    # Make a biomass reaction and optimize for it
    biomass = Reaction('BIOMASS')
    model.add_reaction(biomass)
    index = initial_pop.index
    for i in range(len(index)):
        if individual[i] == 1:
            biomass.add_metabolites({initial_pop.index[i]: -0.1})
    biomass.add_metabolites(base_biomass)
    biomass.objective_coefficient = 1.
    # Generate deletion results --> BOTTLENECK FOR SURE
    deletion_results = single_gene_deletion(model, model.genes, processes=1)
    # Filter the results to get a boolean result
    a = [(str(next(iter(i))), 1)
         for i in deletion_results[deletion_results['growth'] > 1e-3].index]
    b = [(str(next(iter(i))), 0)
         for i in deletion_results[deletion_results['growth'] <= 1e-3].index]
    c = a + b
    pred_ess = pd.DataFrame(c, columns=['Genes', 'Predicted_growth'])
    compare_df = pd.merge(right=exp_ess, left=pred_ess, on='Genes', how='inner')
    # Apply hamming distance
    u = np.array([f for f in compare_df.Measured_growth])
    v = np.array([x for x in compare_df.Predicted_growth])
    if distance == 'hd':
        dist = hamming(u, v)
    elif distance == 'mcc':
        dist = matthews_corrcoef(u, v)
    else:
        raise ValueError('Invalid distance metric')
    return dist, sum(individual)
def k_means(data, k=2, distance='e'):
    centers = np.array(random.sample(list(data), k))
    centers_steps = [centers.tolist()]
    changed = True
    while changed:
        prev_centers = np.copy(centers)
        data_nr = data.shape[0]
        clusters = np.empty((data_nr, k))
        for i in range(data_nr):
            if distance == 'e':
                clusters[i] = np.array([euclidean(data[i], centers[j]) for j in range(k)])
            elif distance == 'm':
                clusters[i] = np.array([cityblock(data[i], centers[j]) for j in range(k)])
            elif distance == 'h':
                clusters[i] = np.array([hamming(data[i], centers[j]) for j in range(k)])
            else:
                raise ValueError('Unrecognized distance')
        clusters = np.argmin(clusters, axis=1)
        for i in range(k):
            centers[i] = np.mean(data[np.where(clusters == i)], axis=0)
        changed = not np.array_equal(prev_centers, centers)
        centers_steps.append(centers.tolist())
    return centers, centers_steps
def hamming_z_score(seq1, seq2, p1=None, p2=None):
    """
    Return the z score of the hamming distance under the following 3
    assumptions:

    1. P(X_i = Y_i) = P(X_j = Y_j). In words, this means (X_i, Y_i) and
    (X_j, Y_j) are identically jointly distributed. In other words, all data
    points are equally easy (or hard) to learn (this is an empirically false
    assumption).

    2. X_i and Y_i are conditionally independent (conditioned on i). In other
    words, the predictions between any two learned models on the same test
    example are independent (obviously false assumption).

    3. (X_i = Y_i) and (X_j = Y_j) are independent. Given assumptions 1 and 2,
    this amounts to adding the assumption that the ith and jth predictions
    from the same classifier are independent (obviously false assumption).
    """
    # Use given p1 and p2
    if p1 is not None and p2 is not None:
        expected = expected_hamming(p1, p2)
    # Use p1 as single p
    elif p1 is not None:
        expected = expected_hamming(p1)
    # Calculate and use p1 and p2
    elif p1 is None and p2 is None:
        p1 = torch.mean(seq1)
        p2 = torch.mean(seq2)
        expected = expected_hamming(p1, p2)
    else:
        raise ValueError('Invalid arguments: p1 is None and p2 is not None')
    std = hamming_std(len(seq1), p1, p2)
    return (hamming(seq1, seq2) - expected) / std
def similarity_function(x, y):
    """ Similarity function for comparing user features.

    This really should be implemented in taar.similarity_recommender and then
    imported here for consistency.
    """
    def safe_get(field, row, default_value):
        # Safely get a value from the Row. If the value is None, return the
        # default value.
        return row[field] if row[field] is not None else default_value

    # Extract the values of the categorical and continuous features for both
    # the x and y samples. Use an empty string as the default value for missing
    # categorical fields and 0 for the continuous ones.
    x_categorical_features = [safe_get(k, x, "") for k in CATEGORICAL_FEATURES]
    y_categorical_features = [safe_get(k, y, "") for k in CATEGORICAL_FEATURES]
    x_continuous_features = [
        float(safe_get(k, x, 0)) for k in CONTINUOUS_FEATURES
    ]
    y_continuous_features = [
        float(safe_get(k, y, 0)) for k in CONTINUOUS_FEATURES
    ]

    # Here a larger distance indicates a poorer match between categorical variables.
    j_d = distance.hamming(x_categorical_features, y_categorical_features)
    j_c = distance.canberra(x_continuous_features, y_continuous_features)

    # Take the product of similarities to attain a univariate similarity score.
    # Add a minimal constant to prevent zero values from categorical features.
    # Note: since both distance functions return a Numpy type, we need to
    # call the |item| function to get the underlying Python type. If we don't
    # do that this job will fail when performing KDE due to SPARK-20803 on
    # Spark 2.2.0.
    return abs((j_c + 0.001) * j_d).item()
def one_simulation(number, opt_k):
    print("\nThe result for %dth experiment with optimal k=%d:" % (number, opt_k))
    kMeans_model = KMeans(n_clusters=opt_k).fit(data_X)
    kMeans_labels = kMeans_model.labels_
    df['cluster'] = kMeans_labels
    labels = ["Family", "Genus", "Species"]
    df['Family_label'] = df['cluster'].apply(family_label)
    df['Genus_label'] = df['cluster'].apply(genus_label)
    df['Species_label'] = df['cluster'].apply(species_label)
    ham_dist_once = []
    for cluster_id in range(0, opt_k):
        cluster = df[df['cluster'] == cluster_id]
        for label in labels:
            cluster_label = label + "_label"
            dist = distance.hamming(cluster[label], cluster[cluster_label])
            ham_dist_once.append(dist)
            print("Hamming score of cluster %d for label %s is %f" % (cluster_id, label, dist))
    avg_dist = np.mean(ham_dist_once)
    ham_dist_all.append(avg_dist)
    print("The average hamming score for %dth experiment is %f\n\n" % (number, avg_dist))
def hamming_diff(seq1, seq2, p1=None, p2=None, ps=None):
    """
    Return the difference between the hamming distance of the two sequences
    and the expected hamming distance between two random vectors.

    If p is specified, use it as the common p for both vectors. Otherwise,
    calculate each vector's respective p and use those.
    """
    # Use the list of ps (not identically distributed)
    if ps is not None:
        if isinstance(ps, torch.FloatTensor):
            assert (len(ps) == len(seq1))
            expected = expected_hamming_nonid(ps)
        else:
            raise ValueError('ps is not a FloatTensor')
    # Use given p1 and p2
    elif p1 is not None and p2 is not None:
        expected = expected_hamming(p1, p2)
    # Use p1 as single p
    elif p1 is not None:
        expected = expected_hamming(p1)
    # Calculate and use p1 and p2
    elif p1 is None and p2 is None:
        p1 = torch.mean(seq1)
        p2 = torch.mean(seq2)
        expected = expected_hamming(p1, p2)
    else:
        raise ValueError('Invalid arguments: p1 is None and p2 is not None')
    return hamming(seq1, seq2) - expected
def find_matches(
        pred,                 # features from user selected image
        collection_features,  # list of features in the collection
        images,               # list of filenames associated with the features
        dist='cosine'         # distance metric - only cosine is good
):
    '''
    Finds matches for the features of the selected image, according to the
    distance metric specified. Distance metrics use the scipy package.
    '''
    pred = pred.flatten()
    nimages = len(collection_features)
    # vectorized cosine similarity:
    # sims = inner(pred, collection_features) / norm(pred) / norm(collection_features, axis=1)
    sims = []
    for i in range(0, nimages):
        if dist == 'euclidean':
            sims.append(
                distance.euclidean(pred, collection_features[i].flatten()))
        elif dist == 'hamming':
            pred[pred > 0] = 1
            sims.append(
                distance.hamming(pred, collection_features[i].flatten()))
        else:  # default to cosine
            sims.append(
                distance.cosine(pred, collection_features[i].flatten()))
    print('max sim = ' + str(max(sims)))
    similar_images = pd.DataFrame({'imgfile': images, 'simscore': sims})
    return similar_images
def example_of_cross_validation_with_detailed_info(raw_data, labels, num_subjects,
                                                   num_epochs_per_subj):
    # no shrinking, set C=1
    svm_clf = svm.SVC(kernel='precomputed', shrinking=False, C=1)
    #logit_clf = LogisticRegression()
    clf = Classifier(svm_clf, epochs_per_subj=num_epochs_per_subj)
    # doing leave-one-subject-out cross validation
    for i in range(num_subjects):
        leave_start = i * num_epochs_per_subj
        leave_end = (i + 1) * num_epochs_per_subj
        training_data = raw_data[0:leave_start] + raw_data[leave_end:]
        test_data = raw_data[leave_start:leave_end]
        training_labels = labels[0:leave_start] + labels[leave_end:]
        test_labels = labels[leave_start:leave_end]
        clf.fit(list(zip(training_data, training_data)), training_labels)
        # joblib can be used for saving and loading models
        #joblib.dump(clf, 'model/logistic.pkl')
        #clf = joblib.load('model/svm.pkl')
        predict = clf.predict(list(zip(test_data, test_data)))
        print(predict)
        print(clf.decision_function(list(zip(test_data, test_data))))
        incorrect_predict = hamming(predict, np.asanyarray(test_labels)) * num_epochs_per_subj
        logger.info(
            'when leaving subject %d out for testing, the accuracy is %d / %d = %.2f' %
            (i, num_epochs_per_subj - incorrect_predict, num_epochs_per_subj,
             (num_epochs_per_subj - incorrect_predict) * 1.0 / num_epochs_per_subj)
        )
        print(clf.score(list(zip(test_data, test_data)), test_labels))
def compute_clients_dist(self, client_data, cache):
    client_categorical_feats = [
        client_data.get(specified_key) for specified_key in CATEGORICAL_FEATURES
    ]
    client_continuous_feats = [
        client_data.get(specified_key) for specified_key in CONTINUOUS_FEATURES
    ]

    # Compute the distances between the user and the cached continuous features.
    cont_features = distance.cdist(
        cache["continuous_features"],
        np.array([client_continuous_feats]),
        "canberra",
    )

    # Compute the distances between the user and the cached categorical features.
    # See the "Note about cdist optimization" in README.md for why we only use
    # cdist once.
    cat_features = np.array(
        [[distance.hamming(x, client_categorical_feats)]
         for x in cache["categorical_features"]])

    # Take the product of similarities to attain a univariate similarity score.
    # Note that the addition of 0.001 to the continuous features sets a floor
    # value to the distance in continuous similarity scores. There is no such
    # floor value set for categorical features, so this adjustment prioritizes
    # categorical similarity over continuous similarity.
    return (cont_features + FLOOR_DISTANCE_ADJUSTMENT) * cat_features
def get_error_hamming_distributions_from_results(results: Sequence[Sequence[Sequence[int]]]) \
        -> Sequence[Sequence[float]]:
    """
    Get the distribution of the hamming weight of the error vector (number of bits flipped
    between output and expected answer) for each possible pair of two n_bit summands using
    results output by get_n_bit_adder_results.

    :param results: a list of results output from a call to get_n_bit_adder_results
    :return: the relative frequency of observing each hamming weight, 0 to n_bits + 1, for the
        error that occurred when adding each pair of two n_bit summands
    """
    num_shots = len(results[0])
    n_bits = len(results[0][0]) - 1

    hamming_wt_distrs = []
    # loop over all binary strings of length 2 * n_bits, i.e. all summand pairs
    for result, bits in zip(results, all_bitstrings(2 * n_bits)):
        # Input nums are written from (MSB .... LSB) = (a_n, ..., a_1, a_0)
        num_a = bit_array_to_int(bits[:n_bits])
        num_b = bit_array_to_int(bits[n_bits:])

        # add the numbers
        ans = num_a + num_b
        ans_bits = int_to_bit_array(ans, n_bits + 1)

        # record the fraction of shots that resulted in an error of the given weight
        hamming_wt_distr = [0. for _ in range(len(ans_bits) + 1)]
        for shot in result:
            # multiply the relative hamming distance by the length of the output
            # to recover the hamming weight
            wt = len(ans_bits) * hamming(ans_bits, shot)
            hamming_wt_distr[int(wt)] += 1. / num_shots

        hamming_wt_distrs.append(hamming_wt_distr)

    return hamming_wt_distrs
def hmmg(X):
    hashcode = np.sign(X)
    hammin = np.zeros((X.shape[0], X.shape[0]))  # penalized hamming(Ci, Cj)
    for i in range(X.shape[0] - 1):
        for j in range(i + 1, X.shape[0]):
            hammin[i, j] = hamming(hashcode[i], hashcode[j])
            hammin[j, i] = hammin[i, j]
    return hammin
def get_class_from_ECOC(testing_predictions, class_codes):
    label = []
    for i in range(testing_predictions.shape[0]):
        hamming_distances = []
        for j in range(class_codes.shape[0]):
            hamming_distances.append(
                distance.hamming(testing_predictions[i], class_codes[j]))
        label.append(np.array(hamming_distances).argmin())
    return np.array(label)
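# Usage sketch for get_class_from_ECOC with made-up toy codewords, assuming
# numpy as np and scipy.spatial's distance module are available:
import numpy as np
from scipy.spatial import distance

class_codes = np.array([[0, 0, 0],   # codeword for class 0
                        [1, 1, 0],   # codeword for class 1
                        [0, 1, 1]])  # codeword for class 2
testing_predictions = np.array([[1, 1, 0],
                                [0, 1, 1]])
labels = get_class_from_ECOC(testing_predictions, class_codes)
# each row decodes to the class with the nearest codeword: array([1, 2])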
def distance(user1, user2):
    try:
        user1Ratings = userItemRatingMatrix.transpose()[str(user1)]
        user2Ratings = userItemRatingMatrix.transpose()[str(user2)]
        distance = hamming(user1Ratings, user2Ratings)
    except Exception:
        distance = np.nan
    return distance
def computeDistance(user1, user2):
    try:
        user1Ratings = userItemRatingMatrix.transpose()[user1]
        user2Ratings = userItemRatingMatrix.transpose()[user2]
        distance = hamming(user1Ratings, user2Ratings)
    except Exception:
        distance = np.nan
    return distance
def get_score(motifs):
    motif_mx = np.vstack(motifs)
    profile = get_profile(motif_mx)
    consensus = get_consensus(profile)
    return sum(
        distance.hamming(motif, consensus) for motif in motif_mx
    )
def distance(x, y):
    d = 0.
    for i in range(len(x)):
        a = np.int32(list(binary(x[i])))
        b = np.int32(list(binary(y[i])))
        d = d + hamming(a, b)
    d = d / len(x)
    return d
def hamming_distance(training_set_vectors, query_vector, top_n=50):
    distances = []
    # compare the query image to every vector in the training set
    for i in range(len(training_set_vectors)):
        distances.append(hamming(training_set_vectors[i], query_vector[0]))
    # return sorted indices of the top_n most similar images
    return np.argsort(distances)[:top_n]
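# Usage sketch for hamming_distance with made-up random binary codes, assuming
# numpy as np and scipy.spatial.distance.hamming are available:
import numpy as np
from scipy.spatial.distance import hamming

rng = np.random.default_rng(0)
training_set_vectors = rng.integers(0, 2, size=(100, 32))  # 100 binary codes
query_vector = rng.integers(0, 2, size=(1, 32))            # a single query code
nearest = hamming_distance(training_set_vectors, query_vector, top_n=5)
# `nearest` holds the indices of the 5 codes with the fewest differing bits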
def mapfn(k, v):
    import kdt_config  # import has to be under function. cf. mincemeat README.
    from dkdt_chunk_ends import chunk_ends
    from scipy.spatial.distance import hamming
    import numpy as np
    sims = [(hamming(np.asarray(chunk), np.asarray(v)), i)
            for i, chunk in enumerate(chunk_ends)]
    sims = sorted(sims)
    # yield one pair so each record is indexed at only one leaf tree
    yield sims[-1][1], v
def _init_similar_matrix(self):
    for a1 in self.agents:
        for a2 in self.agents:
            similarity = 1 - hamming(a1.traits, a2.traits)
            similar = similarity >= self.similarity_threshold
            self.similar_matrix[a1.index, a2.index] = similar
def hamming_distance(v1, v2):
    print('v1: ', v1.shape)
    print('v2: ', v2.shape)
    # zero-pad the shorter vector so both have equal length
    if len(v2) < len(v1):
        v2 = np.concatenate((v2, np.zeros(len(v1) - len(v2))), axis=0)
    if len(v2) > len(v1):
        v1 = np.concatenate((v1, np.zeros(len(v2) - len(v1))), axis=0)
    return distance.hamming(v1, v2)
def get_ratio(self, line_list, CLASS_list):
    # replaces the matches with an empty string and subtracts that from the
    # total --> fraction of matches
    #SIZE = float(len(line_list))
    #replace = line_list.replace(str(CLASS_list), "")
    #print(str(CLASS_list).replace("]","").replace("[",""))
    ratio = distance.hamming(line_list, CLASS_list)
    #float(((SIZE - float(len(replace)))) / SIZE)
    return 1 - ratio
def _instability_score(predicted_label, test_label, k):
    """Computes the instability score (see Lange) for `predicted_label` and
    `test_label` assuming `k` possible labels.
    """
    # find the optimal permutation of labels between predicted and test
    test_label_ = _permute(
        test_label,
        _get_optimal_permutation(predicted_label, test_label, k))
    # return the hamming distance
    return hamming(predicted_label, test_label_)
def jaccard(self, id1, id2):
    """
    Approximate Jaccard coefficient using minhash

    :param id1: Doc ID (key)
    :param id2: Doc ID (key)
    :return j: Approximate Jaccard coefficient
    """
    j = 1 - hamming(self.signatures[id1], self.signatures[id2])
    return j
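# Sketch of the identity behind jaccard() above, with made-up toy signatures:
# the fraction of positions where two minhash signatures agree estimates the
# Jaccard coefficient of the underlying sets. Assumes numpy and scipy:
import numpy as np
from scipy.spatial.distance import hamming

sig_a = np.array([3, 7, 1, 9])
sig_b = np.array([3, 2, 1, 9])
approx_j = 1 - hamming(sig_a, sig_b)  # 3 of 4 positions agree -> 0.75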
def get_displacement(image0, image1):
    """
    Gets the displacement (in pixels, I think) between 2 images using
    scikit-image; not as accurate as the OpenCV version, I think.

    :param image0: reference image
    :param image1: target image
    :return:
    """
    from skimage.feature import (match_descriptors, ORB, plot_matches)
    from skimage.color import rgb2gray
    from scipy.spatial.distance import hamming
    from scipy import misc
    image0_gray = rgb2gray(image0)
    image1_gray = rgb2gray(image1)
    descriptor_extractor = ORB(n_keypoints=200)

    descriptor_extractor.detect_and_extract(image0_gray)
    keypoints1 = descriptor_extractor.keypoints
    descriptors1 = descriptor_extractor.descriptors

    descriptor_extractor.detect_and_extract(image1_gray)
    keypoints2 = descriptor_extractor.keypoints
    descriptors2 = descriptor_extractor.descriptors

    matches12 = match_descriptors(descriptors1, descriptors2, cross_check=True)

    # Sort the matches based on distance. Least distance is better.
    distances12 = []
    for match in matches12:
        distance = hamming(descriptors1[match[0]], descriptors2[match[1]])
        distances12.append(distance)

    indices = np.arange(len(matches12))
    indices = [index for (_, index) in sorted(zip(distances12, indices))]
    matches12 = matches12[indices]

    # collect displacement from the first 10 matches
    dx_list = []
    dy_list = []
    for mat in matches12[:10]:
        # Get the matching key points for each of the images
        img1_idx = mat[0]
        img2_idx = mat[1]
        # x - columns, y - rows
        (x1, y1) = keypoints1[img1_idx]
        (x2, y2) = keypoints2[img2_idx]
        dx_list.append(abs(x1 - x2))
        dy_list.append(abs(y1 - y2))

    dx_median = np.median(np.asarray(dx_list, dtype=np.double))
    dy_median = np.median(np.asarray(dy_list, dtype=np.double))
    # plot_matches(image0, image1, descriptors1, descriptors2, matches12[:10])
    return dx_median, dy_median
def get_hamming_distance_matrix(spike_nums_dur):
    n_cells = spike_nums_dur.shape[0]
    hamm_dist_matrix = np.zeros((n_cells, n_cells))
    for i in np.arange(n_cells):
        for j in np.arange(n_cells):
            hamm_dist_matrix[i, j] = sci_sp_dist.hamming(spike_nums_dur[i, :],
                                                         spike_nums_dur[j, :])
    return hamm_dist_matrix
def calculateHEM(self):
    from scipy.spatial.distance import hamming
    N = self.Errors.shape[0]
    self.HEM = np.zeros((N, N))
    for i, ei in enumerate(self.Errors):
        for j, ej in enumerate(self.Errors):
            self.HEM[i, j] = hamming(ei, ej)
def vote_hamming_distance(votes1, votes2):
    ids = np.array(range(len(votes1)))
    both_voted = ids[(votes1 != 0) & (votes2 != 0)]
    if len(both_voted) == 0:
        return 0
    v1 = votes1[both_voted]
    v2 = votes2[both_voted]
    distance = hamming(v1, v2)
    return distance
def compute_accuracy(self, X, y):
    """
    Computes prediction accuracy as a fraction in [0, 1]. Uses the scipy lib.

    :param X: Data batch. d x n
    :param y: Labels batch. n vector
    :return: Prediction accuracy
    """
    return 1 - hamming(y, self.predict(X))
def match_iris(self, iris, irisList):
    check = False
    img = ''
    for i in irisList:
        if distance.hamming(iris.ravel(), i.ravel()) == 0:
            check = True
            img = i.ravel()
            break
    return (check, img)
def ComputeFeatureDistance(F1, F2, dis='L2'):
    res = np.zeros([F1.shape[0], F2.shape[0]])
    for i in range(F1.shape[0]):
        for j in range(F2.shape[0]):
            if dis == 'L2':
                res[i][j] = np.linalg.norm(F1[i] - F2[j])
            elif dis == 'hamming':
                res[i][j] = hamming(F1[i], F2[j])
    return res
def distance(self, otherPoint):
    max_len = max(len(otherPoint.data), len(self.data))
    dist = 0.
    for i in range(max_len):
        l1, l2 = pad_to_match(self.get_data(i), otherPoint.get_data(i))
        l1.sort()
        l2.sort()
        dist += hamming(l1, l2) / max_len
    return dist
def dist_between_matrices(A, B):
    # binarize both matrices so only their sparsity patterns are compared
    A_unweighted = np.zeros(A.shape)
    B_unweighted = np.zeros(B.shape)
    A_unweighted[A != 0] = 1
    B_unweighted[B != 0] = 1
    return hamming(A_unweighted.flatten(), B_unweighted.flatten())
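# Usage sketch for dist_between_matrices with made-up toy matrices, assuming
# numpy as np and scipy.spatial.distance.hamming are available. Only the
# sparsity patterns are compared, not the weights:
import numpy as np
from scipy.spatial.distance import hamming

A = np.array([[0.5, 0.0], [0.0, 2.0]])
B = np.array([[1.0, 0.0], [3.0, 0.0]])
d = dist_between_matrices(A, B)  # patterns differ in 2 of 4 entries -> 0.5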
def hamming_distance(a, b):
    '''
    compares distance for binary arrays;
    returns the fraction of features that are not the same
    '''
    if max(a) > 1:
        a[a > 0] = 1
        b[b > 0] = 1
    return distance.hamming(a, b)
def stability_score(predicted_label, test_label, k):
    """Computes the stability score (see Lange) for `predicted_label` and
    `test_label` assuming `k` possible labels.
    """
    # find the optimal permutation of labels between predicted and test
    test_label_ = permute(
        test_label,
        get_optimal_permutation(predicted_label, test_label, k))
    # return the hamming distance
    return hamming(predicted_label, test_label_)
def cam_corr(M, cell, test):
    cor = []
    for i in test:
        #r = pearsonr(M[cell, :], M[i, :])[0]
        r = hamming(M[cell, :], M[i, :])
        if np.isnan(r):
            continue
        else:
            cor.append(r)
    return cor
def compute_similarity(self, arr1, arr2):
    if self.simfcn == "cosine":
        return self.d_to_sim(cosine(arr1, arr2))
    elif self.simfcn == "pearson":
        return self.d_to_sim(correlation(arr1, arr2))
    elif self.simfcn == "hamming":
        return 1 - hamming(arr1, arr2)
    elif self.simfcn == "jaccard":
        return 1 - jaccard(arr1, arr2)
    else:
        print("Similarity function not yet supported")
        exit()
def dist(self, other):
    """Return the hamming distance between self and other.

    Attempts to have other provide its own numpy array, but is able to
    produce one if necessary.
    """
    try:
        other_seq_array = other.get_seq_array()
    except AttributeError:
        other_seq_array = sp.array(list(other))
    return hamming(self.get_seq_array(), other_seq_array)
def __find__(self, videorequest):
    """
    :param videorequest: videorequest object
    :type: object
    :rtype: tuple of 2 numpy.array (frame numbers, hamming distances)
    """
    reqsig = videorequest.get_feature('bindct')
    lreq = len(reqsig)
    dists = []
    for i in range(len(self.sigs) - lreq):
        print("frame", i)
        hdist = ssd.hamming(self.sigs[i:i + lreq], reqsig)
        dists.append(hdist)
    return self.__local_minima_fancy__(dists, window=lreq), lreq
def pair_distance(genome_sig_list, file_list):
    n = genome_sig_list.shape[0]
    H_dist = [[0 for x in range(n)] for x in range(n)]
    E_dist = [[0 for x in range(n)] for x in range(n)]
    C_dist = [[0 for x in range(n)] for x in range(n)]
    for i in range(0, n):
        for j in range(0, n):
            H_dist[i][j] = distance.hamming(genome_sig_list[i], genome_sig_list[j])
            E_dist[i][j] = distance.euclidean(genome_sig_list[i], genome_sig_list[j])
            C_dist[i][j] = distance.cosine(genome_sig_list[i], genome_sig_list[j])
    output_distance(H_dist, file_list, "hamming_distance.csv")
    output_distance(E_dist, file_list, "euclidean_distance.csv")
    output_distance(C_dist, file_list, "cosine_distance.csv")
def rd_dist(new_perm, dist_metric):
    # some metric where
    # [0, 9, 8, 1, 2, 3, 4, 7, 5, 6, 10, 11]
    # is worse than
    # [0, 1, 2, 3, 8, 9, 4, 7, 5, 6, 10, 11] or [0, 3, 2, 1, 4, 9, 8, 7, 5, 6, 10, 11]
    # beta = 0.5
    goal = range(len(new_perm))
    # metric = ((beta**2 + 1) * ssd.hamming(goal, new_perm) * ssd.euclidean(goal, new_perm)) / \
    #          ((beta**2) * ssd.hamming(goal, new_perm) + ssd.euclidean(goal, new_perm))
    # metric = ssd.hamming(goal, new_perm) + ssd.euclidean(goal, new_perm)
    if dist_metric == "hamming":
        metric = ssd.hamming(goal, new_perm)
    elif dist_metric == "euclidean":
        metric = ssd.euclidean(goal, new_perm)
    else:
        raise ValueError("Unknown dist_metric: %s" % dist_metric)
    return metric
def test_hamming_loss():
    true = np.random.binomial(n=1, p=.5, size=10).astype('float32')
    predicted = np.round(np.random.random(10))
    refscore = hamming(true, predicted)
    yt = T.fvector('yt')
    yp = T.fvector('yp')
    f = theano.function([yt, yp], tmetrics.classification.hamming_loss(yt, yp),
                        allow_input_downcast=True)
    score = f(true, predicted)
    print('true')
    print(true)
    print('predicted')
    print(predicted)
    print('refscore {}'.format(refscore))
    print('score {}'.format(score))
    assert np.allclose(refscore, score)
def analyze_performance(predicted_labels, actual_labels):
    """ Returns the proportion of total labels that are accurately matched

    Parameters
    ----------
    predicted_labels : 1d array
        Consists of numeric labels
    actual_labels : 1d array
        Consists of numeric labels

    Returns
    -------
    distance : float
        'Distance' here equals #matching entries / length(predicted_labels)
    """
    normed_distance = hamming(predicted_labels, actual_labels)  # between 0 - 1
    return 1 - normed_distance
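# Usage sketch for analyze_performance with made-up toy labels, assuming numpy
# as np and scipy.spatial.distance.hamming are available:
import numpy as np
from scipy.spatial.distance import hamming

predicted_labels = np.array([0, 1, 1, 0])
actual_labels = np.array([0, 1, 0, 0])
accuracy = analyze_performance(predicted_labels, actual_labels)
# 3 of 4 entries match, so hamming == 0.25 and the function returns 0.75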
def test_classification():
    fake_raw_data = [create_epoch(i) for i in range(20)]
    labels = [0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1]
    # 4 subjects, 4 epochs per subject
    epochs_per_subj = 4
    # svm
    svm_clf = svm.SVC(kernel='precomputed', shrinking=False, C=1)
    training_data = fake_raw_data[0:12]
    clf = Classifier(svm_clf, epochs_per_subj=epochs_per_subj)
    clf.fit(training_data, labels)
    expected_confidence = np.array([-1.18234421, 0.97403604, -1.04005679,
                                    0.92403019, -0.95567738, 1.11746593,
                                    -0.83275891, 0.9486868])
    recomputed_confidence = clf.decision_function(fake_raw_data[12:])
    hamming_distance = hamming(np.sign(expected_confidence),
                               np.sign(recomputed_confidence))
    assert hamming_distance <= 1, \
        'decision function of SVM with recomputation ' \
        'does not provide correct results'
    y_pred = clf.predict(fake_raw_data[12:])
    expected_output = [0, 0, 0, 1, 0, 1, 0, 1]
    hamming_distance = hamming(y_pred, expected_output) * len(y_pred)
    assert hamming_distance <= 1, \
        'classification via SVM does not provide correct results'
    confidence = clf.decision_function(fake_raw_data[12:])
    hamming_distance = hamming(np.sign(expected_confidence),
                               np.sign(confidence))
    assert hamming_distance <= 1, \
        'decision function of SVM without recomputation ' \
        'does not provide correct results'
    # logistic regression
    lr_clf = LogisticRegression()
    clf = Classifier(lr_clf, epochs_per_subj=epochs_per_subj)
    clf.fit(training_data, labels[0:12])
    expected_confidence = np.array([-4.49666484, 3.73025553, -4.04181695,
                                    3.73027436, -3.77043872, 4.42613412,
                                    -3.35616616, 3.77716609])
    recomputed_confidence = clf.decision_function(fake_raw_data[12:])
    hamming_distance = hamming(np.sign(expected_confidence),
                               np.sign(recomputed_confidence))
    assert hamming_distance <= 1, \
        'decision function of logistic regression with recomputation ' \
        'does not provide correct results'
    y_pred = clf.predict(fake_raw_data[12:])
    expected_output = [0, 0, 0, 1, 0, 1, 0, 1]
    hamming_distance = hamming(y_pred, expected_output) * len(y_pred)
    assert hamming_distance <= 1, \
        'classification via logistic regression ' \
        'does not provide correct results'
    confidence = clf.decision_function(fake_raw_data[12:])
    hamming_distance = hamming(np.sign(expected_confidence),
                               np.sign(confidence))
    assert hamming_distance <= 1, \
        'decision function of logistic regression without precomputation ' \
        'does not provide correct results'
def generate_one_hit_codon_table():
    """Creates a lookup table for one-hit codon changes using a numbered index

    Returns
    -------
    OrderedDict
    """
    _one_hit_codon = OrderedDict()
    for i in range(1, 65):
        for j in range(1, 65):
            anc = GENETIC_CODE.by_index(i)[0]
            der = GENETIC_CODE.by_index(j)[0]
            dist = hamming(list(anc), list(der))
            # a single-site change in a 3-letter codon has hamming distance 1/3
            if dist <= 1 / float(3):
                _one_hit_codon[(i, j)] = Codon_change(GENETIC_CODE.by_index(i),
                                                      GENETIC_CODE.by_index(j))
    return _one_hit_codon
def hamming_dist(self, full_var_x, true_sts):
    """ This function returns the hamming distance between the full
        variational distribution on the states and the true state sequence,
        after matching via the munkres algorithm.

        full_var_x : variational distribution of the state sequence.
                     Generate it with self.full_local_update().
        true_sts : true state sequence

        Returns a float with the hamming distance and the best permutation to
        match the states.
    """
    state_sq = np.argmax(full_var_x, axis=1).astype(int)  # these are the learned states
    best_match = util.munkres_match(true_sts, state_sq, self.K)
    return dist.hamming(true_sts, best_match[state_sq]), best_match
def eval(self, heldout):
    """ evaluate under two metrics: 1-best error rate and hamming """
    mistakes = dd(int)
    err, ham = 0, 0
    N = float(len(heldout))
    for x, y, e, c in heldout:
        y_pred = self.predict(x, e, c)
        err += 0.0 if (y == y_pred).all() else 1
        ham += hamming(y, y_pred) * len(y)
        for i, (y1, y2) in enumerate(zip(y, y_pred)):
            if y1 != y2:
                mistakes[i] += 1
    return err / N, ham / N, dict(mistakes)
def getDisplacement(Image0, Image1):
    Image0Gray = rgb2gray(Image0)
    Image1Gray = rgb2gray(Image1)
    descriptor_extractor = ORB(n_keypoints=200)

    descriptor_extractor.detect_and_extract(Image0Gray)
    keypoints1 = descriptor_extractor.keypoints
    descriptors1 = descriptor_extractor.descriptors

    descriptor_extractor.detect_and_extract(Image1Gray)
    keypoints2 = descriptor_extractor.keypoints
    descriptors2 = descriptor_extractor.descriptors

    matches12 = match_descriptors(descriptors1, descriptors2, cross_check=True)

    # Sort the matches based on distance. Least distance is better.
    distances12 = []
    for match in matches12:
        distance = hamming(descriptors1[match[0]], descriptors2[match[1]])
        distances12.append(distance)

    indices = np.arange(len(matches12))
    indices = [index for (_, index) in sorted(zip(distances12, indices))]
    matches12 = matches12[indices]

    # collect displacement from the first 10 matches
    dxList = []
    dyList = []
    for mat in matches12[:10]:
        # Get the matching keypoints for each of the images
        img1_idx = mat[0]
        img2_idx = mat[1]
        # x - columns, y - rows
        (x1, y1) = keypoints1[img1_idx]
        (x2, y2) = keypoints2[img2_idx]
        dxList.append(abs(x1 - x2))
        dyList.append(abs(y1 - y2))

    dxMedian = np.median(np.asarray(dxList, dtype=np.double))
    dyMedian = np.median(np.asarray(dyList, dtype=np.double))
    plot_matches(Image0, Image1, descriptors1, descriptors2, matches12[:10])
    return dxMedian, dyMedian
def predict(self, x_test):
    from scipy.spatial.distance import hamming
    prediction = np.zeros(x_test.shape[0])
    for i in range(x_test.shape[0]):
        x = x_test[i]
        new_code = np.zeros(self.P)
        for p in range(self.P):
            new_code[p] = self.forests[p].predict(x)
        # predict the class whose codeword has the smallest hamming distance
        # to new_code
        min_dist = float(np.inf)
        c_predicted = -1
        for c in range(self.C):
            dist = hamming(new_code, self.code_matrix[c, :])
            if dist < min_dist:
                min_dist = dist
                c_predicted = c
        prediction[i] = c_predicted
    return prediction
def detect(self, video, hamming_threshold=21.0 / 64):
    """
    :param video: video object
    :type: video
    """
    dctcuts = []
    dctcuts.append(0)
    sigs = video.get_feature('bindct')
    s = sigs[0]
    # TODO use GPGPU module if possible or numpy.ediff1d
    for i, s_ in enumerate(sigs[1:]):
        if ssd.hamming(s, s_) > hamming_threshold:
            dctcuts.append(i + 1)
            s = s_
    dctcuts.append(len(sigs))
    self.cuts = numpy.array(dctcuts)
    self.video = video
    return self.cuts
def ecoc_test():
    svms = loader.load_pickle_file(model_path)
    te_data = loader.load_pickle_file(te_data_path)
    pred = []
    for f in te_data[0]:
        min_hamming_dist = 1.
        match_label = 0
        code = []
        for s in svms:
            c_pred = s.predict([f])[0]
            code.append(1 if c_pred == 1 else 0)  # replace -1 with 0
        for ind, c in enumerate(ecoc):
            cur_hd = hamming(c, code)
            if cur_hd < min_hamming_dist:
                min_hamming_dist = cur_hd
                match_label = ind
        pred.append(match_label)
    return (pred == te_data[1]).sum() / len(te_data[1])
def example_of_correlating_two_components(raw_data, raw_data2, labels,
                                          num_subjects, num_epochs_per_subj):
    # aggregate the kernel matrix to save memory
    svm_clf = svm.SVC(kernel='precomputed', shrinking=False, C=1)
    clf = Classifier(svm_clf, epochs_per_subj=num_epochs_per_subj)
    num_training_samples = num_epochs_per_subj * (num_subjects - 1)
    clf.fit(list(zip(raw_data[0:num_training_samples],
                     raw_data2[0:num_training_samples])),
            labels[0:num_training_samples])
    X = list(zip(raw_data[num_training_samples:], raw_data2[num_training_samples:]))
    predict = clf.predict(X)
    print(predict)
    print(clf.decision_function(X))
    test_labels = labels[num_training_samples:]
    incorrect_predict = hamming(predict, np.asanyarray(test_labels)) * num_epochs_per_subj
    logger.info(
        'when aggregating the similarity matrix to save memory, '
        'the accuracy is %d / %d = %.2f' %
        (num_epochs_per_subj - incorrect_predict, num_epochs_per_subj,
         (num_epochs_per_subj - incorrect_predict) * 1.0 / num_epochs_per_subj)
    )
    # when the kernel matrix is computed in portions, the test data is already in
    print(clf.score(X, test_labels))
def ecoc_prediction_single(feature, boosts, ecoc):
    '''
    :param feature: feature vector of a single sample
    :param boosts: list of trained boosters, one per ECOC bit
    :param ecoc: ecoc codewords for predicting
    :return: the predicted label
    '''
    min_hamming_dist = 1.
    match_label = 0
    code = []
    for b in boosts:
        c_pred = b.predict_single(feature)
        code.append(1 if c_pred == 1 else 0)  # replace -1 with 0
    for ind, c in enumerate(ecoc):
        cur_hd = hamming(c, code)
        if cur_hd < min_hamming_dist:
            min_hamming_dist = cur_hd
            match_label = ind
    return match_label
def example_of_aggregating_sim_matrix(raw_data, labels, num_subjects, num_epochs_per_subj):
    # aggregate the kernel matrix to save memory
    svm_clf = svm.SVC(kernel='precomputed', shrinking=False, C=1)
    clf = Classifier(svm_clf, num_processed_voxels=1000, epochs_per_subj=num_epochs_per_subj)
    rearranged_data = raw_data[num_epochs_per_subj:] + raw_data[0:num_epochs_per_subj]
    rearranged_labels = labels[num_epochs_per_subj:] + labels[0:num_epochs_per_subj]
    clf.fit(list(zip(rearranged_data, rearranged_data)), rearranged_labels,
            num_training_samples=num_epochs_per_subj * (num_subjects - 1))
    predict = clf.predict()
    print(predict)
    print(clf.decision_function())
    test_labels = labels[0:num_epochs_per_subj]
    incorrect_predict = hamming(predict, np.asanyarray(test_labels)) * num_epochs_per_subj
    logger.info(
        'when aggregating the similarity matrix to save memory, '
        'the accuracy is %d / %d = %.2f' %
        (num_epochs_per_subj - incorrect_predict, num_epochs_per_subj,
         (num_epochs_per_subj - incorrect_predict) * 1.0 / num_epochs_per_subj)
    )
    # when the kernel matrix is computed in portions, the test data is already in
    print(clf.score(None, test_labels))
def load_data(data_name, dir_to_data):
    """ Reads data, transforming string and binary data into pairwise
        distance matrices where necessary.
    """
    if data_name in ["actg1", "actg2", "actg3"]:
        data = np.genfromtxt(op.join(dir_to_data, data_name + ".data.gz"), dtype='str')
        N = len(data)
        data_input = np.zeros((N, N))
        for i in range(N):
            for j in range(N):
                data_input[i][j] = Levenshtein.distance(data[i], data[j])
    elif data_name in ["binstr1", "binstr2", "binstr3"]:
        data = np.genfromtxt(op.join(dir_to_data, data_name + ".data.gz"), dtype='str')
        N = len(data)
        data_input = np.zeros((N, N))
        for i in range(N):
            for j in range(N):
                data_input[i][j] = hamming(list(data[i]), list(data[j]))
    else:
        data_input = np.loadtxt(op.join(dir_to_data, data_name + ".data.gz"), ndmin=2)
    labels = np.loadtxt(op.join(dir_to_data, data_name + ".labels.gz"), dtype='int')
    return (data_input, labels)
def mapfn(k, v):
    # k is 0, ..., M; v is a serialized KDTree
    import kdt_config  # import has to be under function. cf. mincemeat README.
    import cPickle
    import numpy as np
    from dkdt_chunk_ends import chunk_ends
    from scipy.spatial import kdtree
    from scipy.spatial.distance import hamming
    kdtree.node = kdtree.KDTree.node
    kdtree.leafnode = kdtree.KDTree.leafnode
    kdtree.innernode = kdtree.KDTree.innernode
    dkdt = cPickle.loads(v)
    for i, q in enumerate(kdt_config.queries):
        # the comprehension variable is named j so it cannot clobber the loop
        # variable i under Python 2 scoping rules
        sims = [(hamming(np.asarray(chunk), np.asarray(q)), j)
                for j, chunk in enumerate(chunk_ends)]
        sims = sorted(sims)
        sims = sims[-kdt_config.dkdt_S:]
        sims = [str(s[1]) for s in sims]
        if str(k) in sims:
            # the current tree is attached to a similar top leaf;
            # check whether the ith kdtree is close to query q
            nearestNeighbors = dkdt.query(q)
            yield i, nearestNeighbors
def get_ecoc(ecoc_path, num_ecoc, class_num):
    if path.isfile(ecoc_path):
        print('Loading the ecoc...')
        best_ecoc = loader.load_pickle_file(ecoc_path)
    else:
        print('Creating the ecoc...')
        best_ecoc = [0, [], []]  # distance, ecoc for training, ecoc for predicting
        for _ in range(100):
            n = int(math.pow(2, num_ecoc))
            codes = choice(n, class_num)
            ecoc_func_codes = []
            for i in range(num_ecoc):
                ecoc_func_codes.append([])
            c_ecoc = []
            for c in codes:
                # render c as a zero-padded binary string of width num_ecoc
                bin_s = '{0:0{1}b}'.format(c, num_ecoc)
                bin_s = [int(ss) for ss in bin_s]
                c_ecoc.append(bin_s)
                for i in range(num_ecoc):
                    ecoc_func_codes[i].append(bin_s[i])
            c_hamming_dist = 0
            has_same_code = False
            for j in range(len(c_ecoc)):
                for k in range(len(c_ecoc)):
                    if j != k:
                        c_hd = hamming(c_ecoc[j], c_ecoc[k])
                        if c_hd == 0:
                            has_same_code = True
                        c_hamming_dist += c_hd
            if has_same_code:
                continue
            if c_hamming_dist > best_ecoc[0]:
                best_ecoc[0] = c_hamming_dist
                best_ecoc[1] = ecoc_func_codes
                best_ecoc[2] = c_ecoc
        # serialize the best ecoc
        loader.save(ecoc_path, best_ecoc)
    return best_ecoc
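# Sanity check for the width-parameterized binary format used in get_ecoc above,
# with made-up toy values: '{0:0{1}b}' zero-pads the binary rendering of the
# first argument to the width given by the second.
assert '{0:0{1}b}'.format(5, 4) == '0101'
assert '{0:0{1}b}'.format(1, 8) == '00000001'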
ids = 0
for i in range(1, niter):
    data = pickle.load(open(model % (eta, i), 'rb'))
    Amedi = data['Ubs'][1][:, :rk]
    Acodei = data['Ubs'][2][:, :rk]
    for j in range(i + 1, niter):
        data = pickle.load(open(model % (eta, j), 'rb'))
        Amedj = data['Ubs'][1][:, :rk]
        Acodej = data['Ubs'][2][:, :rk]
        # Med stats
        cc = [1 - cosine(Amedi[:, r], Amedj[:, r]) for r in range(rk)]
        c['med_cosine'] = c['med_cosine'] + cc
        cmean['med_cosine'] = cmean['med_cosine'] + [np.mean(cc)]
        cc = [1 - hamming(Amedi[:, r] >= 1e-15, Amedj[:, r] >= 1e-15) for r in range(rk)]
        c['med_hamming'] = c['med_hamming'] + cc
        cmean['med_hamming'] = cmean['med_hamming'] + [np.mean(cc)]
        t1 = [np.argsort(Amedi[:, r])[::-1][:K] for r in range(rk)]
        t2 = [np.argsort(Amedj[:, r])[::-1][:K] for r in range(rk)]
        cc = [len(np.intersect1d(t1[r], t2[r])) / float(len(np.union1d(t1[r], t2[r])))
              for r in range(rk)]
        c['med_jaccard_topK'] = c['med_jaccard_topK'] + cc
        cmean['med_jaccard_topK'] = cmean['med_jaccard_topK'] + [np.mean(cc)]
        cc = [len(np.intersect1d(t1[r], t2[r])) / float(K) for r in range(rk)]
        c['med_hamming_topK'] = c['med_hamming_topK'] + cc
        cmean['med_hamming_topK'] = cmean['med_hamming_topK'] + [np.mean(cc)]
        # code stats