def main():
    emd_list = []
    file = open('result.txt', 'wb')
    test_list = get_features('svips_detect_visual.pkl')
    print "len of test_list is", len(test_list)
    raw_input("test")
    black_list = get_features('vips_black_visual.pkl')
    print "length of black", len(black_list)
    raw_input("goal")
    goal_list = get_features('vips_white_visual.pkl')
    for test_url in test_list.keys():
        Min_goal_location = 1
        Min_goal_lurl = ''
        Min_goal_visual = 1
        Min_goal_vurl = ''
        Min_black_visual = 1
        Min_black_vurl = ''
        Min_black_location = 1
        Min_black_lurl = ''
        features1 = test_list[test_url]
        if len(features1) > 50:
            continue
        # weight of each element in features1 (1.0 avoids Python 2 integer division)
        weight1 = [1.0 / len(features1) for i in range(len(features1))]
        for goal_url in goal_list.keys():
            features2 = goal_list[goal_url]
            if len(features2) > 50:
                continue
            weight2 = [1.0 / len(features2) for i in range(len(features2))]
            emd_goal_location = emd((features1, weight1), (features2, weight2), dis_location)
            if emd_goal_location < Min_goal_location:
                Min_goal_location = emd_goal_location
                Min_goal_lurl = goal_url
            emd_goal_visual = emd((features1, weight1), (features2, weight2), dis_visual)
            if emd_goal_visual < Min_goal_visual:
                Min_goal_visual = emd_goal_visual
                Min_goal_vurl = goal_url
        for black_url in black_list.keys():
            features2 = black_list[black_url]
            if len(features2) > 50:
                continue
            weight2 = [1.0 / len(features2) for i in range(len(features2))]
            emd_black_location = emd((features1, weight1), (features2, weight2), dis_location)
            if emd_black_location < Min_black_location:
                Min_black_location = emd_black_location
                Min_black_lurl = black_url
            emd_black_visual = emd((features1, weight1), (features2, weight2), dis_visual)
            if emd_black_visual < Min_black_visual:
                Min_black_visual = emd_black_visual
                Min_black_vurl = black_url
        file.write(str(test_url) + ' ' + str(Min_goal_lurl) + ' ' + str(1 - Min_goal_location) + ' '
                   + str(Min_goal_vurl) + ' ' + str(1 - Min_goal_visual) + ' '
                   + str(Min_black_lurl) + ' ' + str(1 - Min_black_location) + ' '
                   + str(Min_black_vurl) + ' ' + str(1 - Min_black_visual) + '\n')
    file.close()
def report(prediction_hapls, answer_hapls, dist):
    # For the dataset it reports:
    # 1) the count of predicted haplotypes with no errors (TP);
    # 2) the count of predicted haplotypes with at least one error (FP);
    # 3) total count of haplotypes (TP+FP);
    # 4) sensitivity (TP/(TP+FN));
    # 5) precision (PPV = TP/(TP+FP));
    # 6) EMD to a consensus.
    # For every true variant it reports:
    # 1) true frequency (TF);
    # 2) editing distance to the closest predicted variant (ECP);
    # 3) frequency of the closest predicted variant (FCP);
    # 4) explanation error for a true variant (EEV).
    # For every predicted variant it reports:
    # 1) editing distance to the closest true variant (ECT).
    emd_res = emd(X=np.ones(len(prediction_hapls.freqs)),
                  Y=np.ones(len(answer_hapls.freqs)),
                  X_weights=prediction_hapls.freqs,
                  Y_weights=answer_hapls.freqs,
                  distance='precomputed', D=dist, return_flows=True)
    pred_freqs_unif = np.array([1. / len(prediction_hapls.freqs)
                                for _ in range(len(prediction_hapls.freqs))])
    answer_freqs_unif = np.array([1. / len(answer_hapls.freqs)
                                  for _ in range(len(answer_hapls.freqs))])
    emd_unif = emd(X=np.ones(len(prediction_hapls.freqs)),
                   Y=np.ones(len(answer_hapls.freqs)),
                   X_weights=pred_freqs_unif,
                   Y_weights=answer_freqs_unif,
                   distance='precomputed', D=dist, return_flows=True)
    ans_hapl_count = len(answer_hapls.seqs)
    pred_hapl_count = len(prediction_hapls.seqs)
    predictions_closest_to_answer = get_prediction_closest_to_answer(dist)
    answer_closest_to_prediction = get_answer_closest_to_prediction(dist)
    predictions_closest_to_answer_freqs = get_closest_freq(
        prediction_hapls.freqs, predictions_closest_to_answer, answer_hapls.freqs)
    freq_adjusted_mismatches = get_freq_adjusted_mismatches(emd_res[1], dist)
    report_dict = dict()
    report_dict["TP"] = sum([x[0] == 0 for x in predictions_closest_to_answer[:ans_hapl_count]])
    report_dict["FP"] = len(prediction_hapls.seqs) - report_dict["TP"]
    report_dict["TotalPredicted"] = len(prediction_hapls.seqs)
    report_dict["Sensitivity"] = float(report_dict["TP"]) / ans_hapl_count
    report_dict["PPV"] = float(report_dict["TP"]) / report_dict["TotalPredicted"]
    report_dict["EMD"] = emd_res[0]  # Fractional accuracy
    report_dict["UEMD"] = emd_unif[0]
    report_dict["TF"] = [x for x in answer_hapls.freqs[:ans_hapl_count]]
    report_dict["ECP"] = [x[0] for x in predictions_closest_to_answer[:ans_hapl_count]]
    report_dict["ECT"] = [x[0] for x in answer_closest_to_prediction[:pred_hapl_count]]
    report_dict["FCP"] = [x for x in predictions_closest_to_answer_freqs[:ans_hapl_count]]
    report_dict["EEV"] = [x for x in freq_adjusted_mismatches[:ans_hapl_count]]
    report_dict["PCA"] = [x[1][0] for x in predictions_closest_to_answer]
    report_dict["ACP"] = [x[1][0] for x in answer_closest_to_prediction]
    report_dict["ADC"] = get_adc(predictions_closest_to_answer, answer_hapls.freqs)
    report_dict["APE"] = get_adc(answer_closest_to_prediction, prediction_hapls.freqs)
    report_dict["UADC"] = get_adc(predictions_closest_to_answer, answer_freqs_unif)
    report_dict["UAPE"] = get_adc(answer_closest_to_prediction, pred_freqs_unif)
    json.dump(report_dict, sys.stdout)
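# A minimal sketch (not from the original code) of the emd() interface report()
# above assumes -- the pyemd package at https://github.com/garydoranjr/pyemd, where
# distance='precomputed' takes a ground-distance matrix D and return_flows=True
# yields (distance, flows). All numbers here are made up.
import numpy as np
from emd import emd  # assumption: this import provides the emd() used above

D = np.array([[0., 1., 2.],
              [1., 0., 1.],
              [2., 1., 0.],
              [3., 2., 1.]])
x_w = np.array([0.4, 0.3, 0.2, 0.1])  # hypothetical prediction frequencies
y_w = np.array([0.5, 0.3, 0.2])       # hypothetical answer frequencies
dist, flows = emd(X=np.ones(4), Y=np.ones(3), X_weights=x_w, Y_weights=y_w,
                  distance='precomputed', D=D, return_flows=True)
print(dist)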
def main():
    features1 = [Feature(100, 40, 22), Feature(211, 20, 2),
                 Feature(32, 190, 150), Feature(2, 100, 100)]
    weights1 = [0.4, 0.3, 0.2, 0.1]
    features2 = [Feature(0, 0, 0), Feature(50, 100, 80), Feature(255, 255, 255)]
    weights2 = [0.5, 0.3, 0.2]
    print emd((features1, weights1), (features2, weights2), distance)
def arc_emd_choice(t, y, method='spline'):
    """
    Denoise the data in y by returning the intrinsic mode (or residual)
    with the largest variance as found using empirical mode decomposition.

    Parameters
    ----------
    t : 1D array-like
        Sample locations corresponding to y.
    y : 1D array-like
        The data to be denoised.
    method : {'spline'|'saw'}
        Which intrinsic mode identification process to employ.

    Result
    ------
    y_denoised : 1D array
        The denoised data.
    """
    if method == 'spline':
        modes, residual = emd.emd(t, y)
    if method == 'saw':
        modes, residual = emd.saw_emd(t, y)
    choices = np.append(modes, residual[:, np.newaxis], axis=1)
    stds = np.var(choices, axis=0)
    i_choice = np.argmax(stds)
    return choices[:, i_choice]
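# A hedged usage sketch for arc_emd_choice: decompose a noisy sine and keep the
# highest-variance component. Assumes an emd module whose emd.emd(t, y) returns
# (modes, residual), as the function above expects.
import numpy as np

t = np.linspace(0, 10, 500)
y = np.sin(2 * np.pi * t) + 0.3 * np.random.randn(500)
y_denoised = arc_emd_choice(t, y, method='spline')
print(y_denoised.shape)  # (500,)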
def emd_nd(u, v):
    """
    Computes Earth Mover's Distance in N dimensions.

    Uses https://github.com/garydoranjr/pyemd
    The inputs are log-space distributions, so exponentiate them back to
    probability space before computing the EMD.
    """
    return emd(np.exp(u), np.exp(v))
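# A toy call of emd_nd (illustrative only): the inputs are log-space arrays, so
# they are exponentiated before being handed to the pyemd-style emd(X, Y), which
# treats the rows of X and Y as points with uniform weights.
import numpy as np

u = np.log(np.array([[0.2, 0.8]]))
v = np.log(np.array([[0.5, 0.5]]))
# print(emd_nd(u, v))  # requires the garydoranjr/pyemd emd() import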
def calc_similarity(comparisonpair, distance=cosine_distance, cosine_adjustment=True):
    # each element of comparisonpair is a (word_vector_array, BOW_weights) signature
    word_vectors1, word_weights1 = comparisonpair[0]
    word_vectors2, word_weights2 = comparisonpair[1]
    # if both signatures are identical, the two users are the same
    if (word_vectors1 == word_vectors2) and (word_weights1 == word_weights2):
        return 1.0
    else:
        # calculate the earth mover's distance (EMD) between two 'signatures'
        # (generalized distributions) with the given ground distance.
        # signature format: (list of vectors [number of vectors x embedding
        # dimension], list of their weights)
        emd_result = emd((word_vectors1, word_weights1),
                         (word_vectors2, word_weights2), distance)
        if cosine_adjustment:
            # cosine distance lies in [0, 2], so map the EMD output to [0, 1]
            similarity = 1.0 - emd_result / 2.0
        else:
            # take the reciprocal for an estimate of the similarity (instead of
            # distance) to adjust for euclidean distance
            similarity = 1.0 / float(emd_result)
        return similarity
def calculate_emd(hidict, endict, ensentence, hindisentence):
    x = []
    y = []
    for word in ensentence:
        word = word.lower()
        if word not in ('!', '.', ':', ';', ','):
            try:
                x.append(endict[word])
            except KeyError:  # word has no embedding
                continue
    for word in hindisentence:
        if word not in ('!', '.', ':', ';', ','):
            try:
                y.append(hidict[word])
            except KeyError:  # word has no embedding
                continue
    distance = 99
    if len(y) > 0 and len(x) > 0:
        distance = emd(np.array(x), np.array(y))
    return distance
def emd_nd(u, v):
    tot = 0
    U = np.reshape(u, (-1, 101))
    V = np.reshape(v, (-1, 101))
    for s in zip(U, V):
        tot += emd(np.atleast_2d(s[0]), np.atleast_2d(s[1]))
    return tot
def WMD(document1, document2, embeddings):
    '''
    Compute WMD.

    Input:
        document1: List of words.
        document2: List of words.
        embeddings: word2vec embeddings of words.

    Returns:
        WMD between documents, float.
    '''
    # Compute nBOW representation of documents.
    d1 = nBOW(document1)
    d2 = nBOW(document2)
    # Get features.
    features1 = [tuple(embeddings[token]) for token in document1]
    features2 = [tuple(embeddings[token]) for token in document2]
    # Pad the shorter document with zero vectors of zero weight.
    if len(features1) > len(features2):
        diff = abs(len(features1) - len(features2))
        d2.extend([0] * diff)
        features2.extend([tuple([0] * len(features1[0]))] * diff)
    elif len(features1) < len(features2):
        diff = abs(len(features1) - len(features2))
        d1.extend([0] * diff)
        features1.extend([tuple([0] * len(features2[0]))] * diff)
    # Return WMD. (Bug fixes: the second signature used d1 instead of d2, and the
    # padding appended a single flattened tuple instead of diff zero vectors.)
    return emd((features1, d1), (features2, d2), distance)
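# A quick illustration of why the padding fix above matters: extending with diff
# copies of a zero tuple keeps features2 a list of equal-length vectors, whereas
# the original append(tuple(...) * diff) appended one flattened tuple of the
# wrong shape.
diff, dim = 2, 3
print([tuple([0] * dim)] * diff)   # [(0, 0, 0), (0, 0, 0)] -- two zero vectors
print(tuple([0] * dim) * diff)     # (0, 0, 0, 0, 0, 0)     -- one 6-tuple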
def solve_by_emd():
    """Solve the problem where the whole mass must be moved."""
    ltheta = DIM * [1, ]
    return emd((lP, fw1), (lQ, fw2),
               lambda a, b: float(dist_for_emd(a, b, ltheta)))
def get_dist(n_samples):
    mu_X, mu_Y = 1.0, -1.0
    X = np.random.randn(n_samples, 2) + mu_X
    Y = np.random.randn(n_samples, 2) + mu_Y
    return emd(X, Y)
def WMD_bt_queryAndLib(text, Pos, X, BOW_X):
    # X: w2v vectors matrix; the i-th column is the i-th document's w2v vectors matrix
    # BOW_X: BOW (word frequency) vectors matrix; the i-th column is the i-th
    #        document's BOW vectors matrix
    # words: stores the documents without repeating words
    # with open('src/STDvectors.pk') as f:
    #     [Pos, X, BOW_X, words] = pickle.load(f)
    n = np.shape(X)
    n = n[0]  # number of documents
    Dist = []
    (Fs, wordOrders, weight) = Weight(segmentation(text.lower()).split())
    text_BOWVec = weight.tolist()  # BOW word vector of text
    text_w2vVec = Fs.T.T.tolist()  # w2v word vector of text
    for j in xrange(n):
        # calculate the EMD of the two documents
        emdDist = emd((text_w2vVec, text_BOWVec),
                      (X[j].T.tolist(), BOW_X[j].tolist()), distance)
        Dist.append((Pos[j], emdDist))
        if emdDist == 0:
            # already found the same name in the standard library; break out of the loop
            break
    sort_Dist = sorted(Dist, key=lambda d: d[1])
    print '-----------------------------------------------'
    for i in range(min(5, len(sort_Dist))):
        print '%f %s' % (sort_Dist[i][1], sort_Dist[i][0].encode('utf-8'))
    return sort_Dist[0][0]
def WMD(document1, document2, embeddings):
    '''
    Compute WMD.

    Input:
        document1: List of words.
        document2: List of words.
        embeddings: word2vec embeddings of words.

    Returns:
        WMD between documents, float.
    '''
    # Compute nBOW representation of documents.
    d1 = nBOW(document1)
    d2 = nBOW(document2)
    # Get features.
    features1 = [tuple(embeddings[token]) for token in document1]
    features2 = [tuple(embeddings[token]) for token in document2]
    # Pad the shorter document with zero vectors of zero weight.
    if len(features1) > len(features2):
        diff = abs(len(features1) - len(features2))
        d2.extend([0] * diff)
        features2.extend([tuple([0] * len(features1[0]))] * diff)
    elif len(features1) < len(features2):
        diff = abs(len(features1) - len(features2))
        d1.extend([0] * diff)
        features1.extend([tuple([0] * len(features2[0]))] * diff)
    # Return WMD (same padding and d2 fixes as the earlier copy of this function).
    return emd((features1, d1), (features2, d2), distance)
def WMD_bt_queryAndLib(query, Pos, X, Weight_X):
    # X: w2v vectors matrix; the i-th column is the i-th document's w2v vectors matrix (list type)
    # Weight_X: weight vectors matrix; the i-th column is the i-th document's TF vectors matrix (list type)
    n = np.shape(X)
    n = n[0]  # number of documents
    Dist = []
    query = clean(query)
    (Fs, weight) = generateVec(segmentation(query.lower()).split())
    query_weightVec = weight.tolist()  # weight word vector of query
    query_w2vVec = Fs.T.T.tolist()     # w2v word vector of query
    for j in xrange(n):
        # calculate the EMD of the two documents
        emdDist = emd((query_w2vVec, query_weightVec),
                      (X[j].T.tolist(), Weight_X[j].tolist()), distance)
        if math.isnan(emdDist):
            # NOTICE: this guard used to be missing and triggered a bug
            emdDist = 9999
        Dist.append((Pos[j], emdDist))
    sort_Dist = sorted(Dist, key=lambda d: d[1])
    print '-----------------------------------------------'
    for i in range(min(5, len(sort_Dist))):
        print '%f %s' % (sort_Dist[i][1], sort_Dist[i][0].encode('utf-8'))
    return sort_Dist
def pre_phi_future(current, future, current_part, future_part, state, tpm, base=2):
    whole_rep = iit.effect_repertoire(current, future, state, tpm, base)
    part1_state = iit.convert_to_subset(state, current_part, base)
    part1_rep = iit.effect_repertoire(current_part, future_part, part1_state, tpm, base)
    part2_state = iit.convert_to_subset(state, current - current_part, base)
    part2_rep = iit.effect_repertoire(current - current_part, future - future_part,
                                      part2_state, tpm, base)
    partitioned_rep = iit.multiply_repertoires(future_part, future - future_part,
                                               part1_rep, part2_rep, base)
    future_nnodes = len(future)
    d = np.array(range(2**future_nnodes))
    # binary expansion of each state index, one row per state
    locs = ((d[:, None] & (1 << np.arange(future_nnodes - 1, -1, -1))) > 0).astype(int)
    return emd(locs, locs, whole_rep, partitioned_rep, distance='cityblock')
def main():
    features1 = [Feature(100, 40, 22), Feature(211, 20, 2),
                 Feature(32, 190, 150), Feature(2, 100, 100)]
    weights1 = [0.4, 0.3, 0.2, 0.1]
    features2 = [Feature(0, 0, 0), Feature(50, 100, 80), Feature(255, 255, 255)]
    weights2 = [0.5, 0.3, 0.2]
    print emd((features1, weights1), (features2, weights2), distance)
def populate_EMD_chunk(coordinates):
    with open("out", 'a') as log:
        log.write("%s: populating (%d, %d) through (%d, %d)...\n"
                  % (datetime.now().isoformat(),
                     coordinates[0][0], coordinates[0][1],
                     coordinates[-1][0], coordinates[-1][1]))
    return [(i, j, emd(features_by_bid[business_ids[i]],
                       features_by_bid[business_ids[j]]))
            for i, j in coordinates]
def main():
    features1 = check_rect(get_features('emd_mxye.pkl'))
    # weight of each element in features1 (1.0 avoids Python 2 integer division)
    weight1 = [1.0 / len(features1) for i in range(len(features1))]
    features2 = check_rect(get_features('emd_nhwd.pkl'))
    weight2 = [1.0 / len(features2) for i in range(len(features2))]
    print "result"
    R = emd((features1, weight1), (features2, weight2), Distance)
    print "Result is", R
def cei(subset, state, tpm, base=2):
    '''
    This function calculates the cause-effect information resulting from the
    state of a particular subset being known.

    subset is the set of nodes for which the state is known.
    state is an integer which describes the state of these nodes.
    tpm is the 2^n x n transition probability matrix.
    '''
    nnodes = np.size(tpm, 1)
    full_set = set(range(nnodes))
    f_uncon = iit.uncon_effect_repertoire(tpm, base)
    p_uncon = iit.uncon_cause_repertoire(nnodes, base)
    f = iit.effect_repertoire(subset, full_set, state, tpm, base)
    p = iit.cause_repertoire(subset, full_set, state, tpm, base)
    # locations are the binary expansions of the state indices, so the cityblock
    # ground distance equals the Hamming distance between states
    d = np.array(range(2**nnodes))
    locs = ((d[:, None] & (1 << np.arange(nnodes - 1, -1, -1))) > 0).astype(int)
    cause_information = emd(locs, locs, p_uncon, p, distance='cityblock')
    effect_information = emd(locs, locs, f_uncon, f, distance='cityblock')
    return np.minimum(cause_information, effect_information)
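# A quick check of the locs construction used in cei: for nnodes = 2 each row is
# the binary expansion of a state index, so the cityblock ground distance between
# rows equals the Hamming distance between states.
import numpy as np

nnodes = 2
d = np.array(range(2**nnodes))
locs = ((d[:, None] & (1 << np.arange(nnodes - 1, -1, -1))) > 0).astype(int)
print(locs)  # [[0 0] [0 1] [1 0] [1 1]]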
def WMD_bt_2texts(text1, text2):
    (Fs, wordOrders, weight) = Weight(segmentation(text1.lower()).split())
    text1_BOWVec = weight.tolist()  # BOW word vector of text1
    text1_w2vVec = Fs.T.T.tolist()  # w2v word vector of text1
    (Fs, wordOrders, weight) = Weight(segmentation(text2.lower()).split())
    text2_BOWVec = weight.tolist()  # BOW word vector of text2
    text2_w2vVec = Fs.T.T.tolist()  # w2v word vector of text2
    # calculate the EMD of the two documents
    Dist = emd((text1_w2vVec, text1_BOWVec), (text2_w2vVec, text2_BOWVec), distance)
    return Dist
def get_wmd(ix):
    n = np.shape(X)
    n = n[0]
    Di = np.zeros((1, n))
    i = ix
    print '%d out of %d' % (i, n)
    for j in xrange(i):
        Di[0, j] = emd((X[i], BOW_X[i]), (X[j], BOW_X[j]), distance)
    return Di
def earth_mover(points1, points2, normer=np.sum):
    xs1 = np.linspace(0, 1, len(points1), endpoint=True)[np.array(np.isfinite(points1))]
    xs2 = np.linspace(0, 1, len(points2), endpoint=True)[np.array(np.isfinite(points2))]
    points1 = points1[np.isfinite(points1)]
    points2 = points2[np.isfinite(points2)]
    return emd.emd(xs1, xs2, points1 / normer(points1), points2 / normer(points2))
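# A small sketch of the alignment step in earth_mover: NaN entries are dropped
# from the positions and the weights together, then the weights are normalized.
# The emd.emd(xs1, xs2, w1, w2) call itself is whatever 1-D EMD package the
# snippet imports, so only the preprocessing is shown here.
import numpy as np

points1 = np.array([0.2, np.nan, 0.5, 0.3])
xs1 = np.linspace(0, 1, len(points1), endpoint=True)[np.isfinite(points1)]
finite = points1[np.isfinite(points1)]
print(xs1)                       # positions of the finite samples
print(finite / np.sum(finite))   # weights normalized to sum to 1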
def wae_loss(x, x_hat, mu, logvar, batch_size=128):
    # KL divergence
    KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    # Normalise by the same number of elements as in the reconstruction
    KLD /= batch_size
    EMDist = emd(x, x_hat)
    return EMDist + KLD
def AdvancedKernelEMD(X1, X1_labels, X2, X2_labels, dist):
    sz1 = np.shape(X1)[0]
    sz2 = np.shape(X2)[0]
    D = np.zeros((sz1, sz2))
    for i in range(0, sz1):
        for j in range(i, sz2):
            s1 = X1[i]
            X1_positive = s1[X1_labels[i] == 1]
            X1_negative = s1[X1_labels[i] == 0]
            s2 = X2[j]
            X2_positive = s2[X2_labels[j] == 1]
            X2_negative = s2[X2_labels[j] == 0]
            D[i, j] = (emd(X1_positive, X2_positive, distance=dist)
                       + emd(X1_negative, X2_negative, distance=dist))
    # mirror the upper triangle to make D symmetric
    D = D + np.transpose(np.triu(D, k=1))
    return D
def get_wmd(i):
    Di = np.zeros((1, n))
    for j in range(n):
        if len(X[i]) > 0 and len(X[j]) > 0:
            Di[0, j] = emd(X[i], X[j], X_weights=BOW_X[i], Y_weights=BOW_X[j])
        else:
            Di[0, j] = 2.0
    return Di
def calc_emd(G_orig, G_anon):
    """
    Calc the Earth Mover's distance of the degree distribution between the
    original graph and the sanitized one.

    PyEMD package required! https://github.com/garydoranjr/pyemd

    :param G_orig: the original graph
    :param G_anon: the sanitized graph
    :return: the EMD between the two degree distributions
    """
    assert (G_orig.number_of_nodes() == G_anon.number_of_nodes())
    return emd(G_orig.degree().items(), G_anon.degree().items())
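# A hedged usage sketch for calc_emd: under networkx 1.x, G.degree() returns a
# dict, so .items() yields (node, degree) pairs for the pyemd call above; under
# networkx 2.x you would need dict(G.degree()).items() instead.
import networkx as nx

G1 = nx.path_graph(5)
G2 = nx.cycle_graph(5)
# print(calc_emd(G1, G2))  # requires the emd import used by the snippet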
def cal_sentence_distance(sentence1_word_vector_list, sentence1_word_freq_list,
                          sentence2_word_vector_list, sentence2_word_freq_list):
    start = time()
    sentence_distance = emd(
        (sentence1_word_vector_list, sentence1_word_freq_list),
        (sentence2_word_vector_list, sentence2_word_freq_list),
        distance)
    end = time()
    # print 'cal time :', end - start
    return sentence_distance
def ComputeKernelEMD(X1, X2, dist):
    sz1 = np.shape(X1)[0]
    sz2 = np.shape(X2)[0]
    D = np.zeros((sz1, sz2))
    for i in range(0, sz1):
        for j in range(i, sz2):
            D[i, j] = emd(X1[i], X2[j], distance=dist)
    # mirror the upper triangle to make D symmetric
    D = D + np.transpose(np.triu(D, k=1))
    return D
def ComputeKernelEMD1D(X1, X2, dist):
    sz1 = np.shape(X1)[0]
    sz2 = np.shape(X2)[0]
    D = np.zeros((sz1, sz2))
    for i in range(0, sz1):
        for j in range(i, sz2):
            D[i, j] = emd(X1[i], X2[j], distance=dist)
    D = np.squeeze(D)
    return D
def earth_mover(points1, points2):
    xs1 = np.linspace(0, 1, len(points1), endpoint=True)[np.array(np.isfinite(points1))]
    xs2 = np.linspace(0, 1, len(points2), endpoint=True)[np.array(np.isfinite(points2))]
    points1 = points1[np.isfinite(points1)]
    points2 = points2[np.isfinite(points2)]
    return emd.emd(xs1, xs2, points1 / np.sum(points1), points2 / np.sum(points2))
def WMD_bt_2texts(text1, text2):
    (Fs, weight) = generateVec(segmentation(text1.lower()).split())
    text1_BOWVec = weight.tolist()  # BOW word vector of text1
    text1_w2vVec = Fs.T.T.tolist()  # w2v word vector of text1
    (Fs, weight) = generateVec(segmentation(text2.lower()).split())
    text2_BOWVec = weight.tolist()  # BOW word vector of text2
    text2_w2vVec = Fs.T.T.tolist()  # w2v word vector of text2
    # calculate the EMD of the two documents
    Dist = emd((text1_w2vVec, text1_BOWVec), (text2_w2vVec, text2_BOWVec), distance)
    if math.isnan(Dist):
        Dist = 9999
    return Dist
def ensemble_process(x, data_length, max_modes, max_siftings, noise_std,
                     ensembles_per_process, output):
    imfs = np.zeros((max_modes + 1, data_length))
    for i in range(ensembles_per_process):
        noise = np.multiply(np.random.randn(data_length), noise_std)
        noise_assisted_data = np.add(x, noise)
        ensemble = emd.emd(noise_assisted_data, max_modes, max_siftings)
        imfs = np.add(imfs, ensemble)
    output.put(imfs)
def get_all_emd(grouped_chrm, type):
    for comb in combs:
        s1 = [getattr(v, type) for v in grouped_chrm.windows[comb[0]].variants]
        s2 = [getattr(v, type) for v in grouped_chrm.windows[comb[1]].variants]
        s1 = [abs(v - 0.5) for v in s1]
        s2 = [abs(v - 0.5) for v in s2]
        if s1 != [] and s2 != []:
            p = scipy.stats.ks_2samp(s1, s2).pvalue
            emd_object = emd_obj(comb, emd.emd(emd.cumsum(s1), emd.cumsum(s2)),
                                 p * bonf_corr)
        else:
            emd_object = emd_obj(comb, None, None)
        all_emd.append(emd_object)
def ComputeKernelEMD1D(X1, X2, dist):
    sz1 = np.shape(X1)[0]
    sz2 = np.shape(X2)[0]
    D = np.zeros((sz1, sz2))
    for i in range(0, sz1):
        for j in range(i, sz2):
            print i, j
            startT = time.time()
            D[i, j] = emd(X1[i], X2[j], distance=dist)
            endT = time.time() - startT
            # print 'EMD took ' + str(endT) + ' seconds.'
    # D2 = np.exp((-1 / np.mean(D[np.nonzero(D)])) * D)
    return D
def emdist(a, b, distance_function):
    """
    Return the Earth Mover's distance between two weighted point sets.

    :type a: numpy matrix, column 0 holds the weights, the rest the features
    :type b: numpy matrix, same layout as a
    :type distance_function: function, the ground distance between two points
    """
    w1, w2 = a[:, 0], b[:, 0]
    feature1, feature2 = a[:, 1:], b[:, 1:]
    feature1 = (feature1.tolist(), w1.tolist())
    feature2 = (feature2.tolist(), w2.tolist())
    return emd(feature1, feature2, distance_function)
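# A toy sketch of the layout emdist expects: column 0 of each array holds the
# point weights and the remaining columns hold the coordinates. The euclidean
# ground distance below is illustrative, not the original's.
import numpy as np

a = np.array([[0.5, 0.0, 0.0],
              [0.5, 1.0, 0.0]])   # two points with weight 0.5 each
b = np.array([[1.0, 0.5, 0.0]])   # one point with weight 1.0
euclid = lambda p, q: float(np.linalg.norm(np.array(p) - np.array(q)))
# print(emdist(a, b, euclid))  # requires the signature-style emd import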
def calculateEMDMetric(histContainer1, histContainer2, angles, distanceFcn):
    emdMetricWT = {}
    for fileName1, histVal1 in histContainer1:
        for fileName2, histVal2 in histContainer2:
            # take the minimum EMD over all rotations of the angle histogram
            minVal = 1000000000
            for rotNum in range(len(angles)):
                anglesR = rotate(angles, rotNum)
                cVal = emd((angles, histVal1.tolist()),
                           (anglesR, histVal2.tolist()), distanceFcn)
                if cVal < minVal:
                    minVal = cVal
            if emdMetricWT.get(fileName1) is None:
                emdMetricWT[fileName1] = []
            emdMetricWT[fileName1].append((fileName2, minVal))
    return emdMetricWT
def get_wmd(ix):
    # calculate the WMD distance between documents
    print '***', ix
    n = np.shape(X)
    n = n[0]  # number of documents
    Di = np.zeros((1, n))  # (1 x NoOfDocs) matrix
    i = ix
    print '%d out of %d' % (i, n)
    for j in xrange(i):
        print '***'
        print 'X[i] Size = ', np.shape(X[i])
        print 'X[j] Size = ', np.shape(X[j])
        print 'BOW_X[i] Size = ', np.shape(BOW_X[i])
        print 'BOW_X[j] Size = ', np.shape(BOW_X[j])
        # calculate the EMD of the two documents
        Di[0, j] = emd((X[i], BOW_X[i]), (X[j], BOW_X[j]), distance)
    print Di
    return Di
def calculate_emd(signature1, signature2):
    D = calculate_distance_matrix(signature1, signature2,
                                  ground_distance=GROUND_DISTANCES["euclidean"])
    signature1_points = [cl[0] for cl in signature1]
    signature1_weights = [cl[2] for cl in signature1]
    signature1_weights = np.array(map(lambda x: float(x) / sum(signature1_weights),
                                      signature1_weights))
    signature2_points = [cl[0] for cl in signature2]
    signature2_weights = [cl[2] for cl in signature2]
    signature2_weights = np.array(map(lambda x: float(x) / sum(signature2_weights),
                                      signature2_weights))
    return emd(signature1_points, signature2_points,
               X_weights=signature1_weights, Y_weights=signature2_weights,
               distance='precomputed', D=D)
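# A minimal sketch of the distance='precomputed' pattern used above, assuming the
# garydoranjr/pyemd interface: build the ground-distance matrix yourself (here
# with scipy's cdist) and pass it through D.
import numpy as np
from scipy.spatial.distance import cdist

pts1 = np.array([[0.0, 0.0], [1.0, 0.0]])
pts2 = np.array([[0.0, 1.0]])
w1 = np.array([0.5, 0.5])
w2 = np.array([1.0])
D = cdist(pts1, pts2)  # euclidean ground distances
# emd(pts1, pts2, X_weights=w1, Y_weights=w2, distance='precomputed', D=D)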
def calWD(d1, d2):
    st = time.time()
    # 6 conv layers, generating random parameters
    n_layers = int(np.log2(64))
    n_features = [64, 128, 256, 512, 512, 4]
    r_convs = range(n_layers)
    # number of images
    size = d1.shape[0]
    D = tf.placeholder(tf.float32, [d1.shape[0], d1.shape[1], d1.shape[2], d1.shape[3]])
    D2 = tf.placeholder(tf.float32, [d1.shape[0], d1.shape[1], d1.shape[2], d1.shape[3]])
    with tf.Session() as sess:
        # init_op = tf.global_variables_initializer()
        # sess.run(init_op)
        # convolution ops would go here; the stride determines how much the image is downscaled
        resultP6 = D
        resultQ6 = D2
        Q = sess.run(resultP6, feed_dict={D: d1})
        P = sess.run(resultQ6, feed_dict={D2: d2})
    Q = Q.reshape(size, -1)
    P = P.reshape(size, -1)
    print Q.shape
    t = time.time() - st
    # odd return value: the trailing comma makes dis a 1-tuple (value,)
    dis = emd(Q, P),
    # print 'spend time: %0.2f' % (t)
    return dis[0]
def get_score(s1, s2, model):
    s1 = re.sub('[^a-zA-Z\s]+', ' ', s1)
    s2 = re.sub('[^a-zA-Z\s]+', ' ', s2)
    set1 = [word for word in set(s1.strip().lower().split() + s1.strip().split())
            if word in model]
    set2 = [word for word in set(s2.strip().lower().split() + s2.strip().split())
            if word in model]
    c1 = collections.Counter(s1.strip().split() + s1.lower().strip().split())
    c2 = collections.Counter(s2.strip().split() + s2.lower().strip().split())
    w1 = np.array([c1[word] * 1.00 for word in set1])
    w2 = np.array([c2[word] * 1.00 for word in set2])
    w1 /= sum(w1)
    w2 /= sum(w2)
    v1 = [model[word] * 1.00 for word in set1]
    v2 = [model[word] * 1.00 for word in set2]
    return emd(v1, v2, X_weights=w1, Y_weights=w2)
def metric_emd_sig(vec1, vec2):
    ndim = 4
    nclusters = vec1.shape[0] // ndim
    vec1 = vec1.reshape(ndim * nclusters, 1)
    vec2 = vec2.reshape(ndim * nclusters, 1)
    sig1 = vec1.reshape((ndim, nclusters)).T
    sig2 = vec2.reshape((ndim, nclusters)).T
    X_weights = np.empty((nclusters, 1))
    Y_weights = np.empty((nclusters, 1))
    X = np.empty((nclusters, ndim - 1))
    Y = np.empty((nclusters, ndim - 1))
    X_weights[:, 0] = sig1[:, 0]
    Y_weights[:, 0] = sig2[:, 0]
    X[:, 0:ndim - 1] = sig1[:, 1:ndim]
    Y[:, 0:ndim - 1] = sig2[:, 1:ndim]
    # The next lines remove the biggest cluster, assuming it represents the background.
    max_index = np.argmax(X_weights)
    X_weights_red = np.delete(X_weights, max_index, axis=0)
    X_weights_red = X_weights_red / np.sum(X_weights_red, axis=0)
    X_red = np.delete(X, max_index, axis=0)
    max_index = np.argmax(Y_weights)
    Y_weights_red = np.delete(Y_weights, max_index, axis=0)
    Y_weights_red = Y_weights_red / np.sum(Y_weights_red, axis=0)
    Y_red = np.delete(Y, max_index, axis=0)
    distance = emd(X_red, Y_red, X_weights_red, Y_weights_red, distance='euclidean')
    # distance = emd(X, Y, X_weights, Y_weights, distance='euclidean')
    return distance
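# A quick illustration of the flat signature layout metric_emd_sig assumes: the
# vector stores the weight block first, then each coordinate block, so
# reshape((ndim, nclusters)).T gives one (weight, x, y, z) row per cluster.
# The numbers are made up.
import numpy as np

ndim, nclusters = 4, 3
vec = np.arange(ndim * nclusters, dtype=float)
sig = vec.reshape((ndim, nclusters)).T
print(sig)  # row i is (weight_i, x_i, y_i, z_i): [[0 3 6 9] [1 4 7 10] [2 5 8 11]]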
# (truncated snippet: the head is missing; feature1, feature2 and d are defined
# above the cut, and the elided literal is a long uniform weight vector whose
# entries all equal 0.005434782608695652)
#     ..., 0.005434782608695652, 0.005434782608695652])
st = time.time()
print emd(feature1, feature2, d)
print (time.time() - st) * 1000
del content[len(content) - 1]
label = []
for i in range(0, len(content)):
    content[i] = content[i].split(":")
    label.append(content[i][0])
    content[i][1] = content[i][1].split(";")
    if len(content[i][1]) != 16:
        print(content[i][0])
    for j in range(0, len(content[i][1])):
        content[i][1][j] = int(content[i][1][j])

# Build distance matrix
weights = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]
distance_matrix = np.empty([len(content), len(content)])
for i in range(0, len(content)):
    for j in range(i, len(content)):
        distance = emd.emd(content[i][1], content[j][1], weights, weights)
        distance_matrix[i][j] = distance
        distance_matrix[j][i] = distance

link_matrix = linkage(distance_matrix, method='weighted')
plt.figure()
plt.title("Dendrogram of " + str(len(label)) + " teams")
plt.xlabel("teams")
plt.ylabel("distance")
# distinct name so the imported dendrogram function is not shadowed
dn = dendrogram(link_matrix, labels=label, leaf_rotation=30., leaf_font_size=5.)
plt.savefig("/robocup/implementation/dendro")

max_d = 0.5
clusters = fcluster(link_matrix, max_d, criterion='distance')
file = open("clusters", "w")
for i in range(0, len(content)):
    file.write(content[i][0] + " : " + str(clusters[i]) + "\n")
file.close()
def solve_by_emd():
    """Solve the problem where the whole mass must be moved."""
    ltheta = DIM * [1, ]
    return emd((lP, fw1), (lQ, fw2),
               lambda a, b: float(dist_for_emd(a, b, ltheta)))
def hist_compare(self):
    print 'BANDS', self.bands
    bins = 32
    emd_list = []
    out_dir = os.path.join(self.plot_dir, self.name)
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    hist_dir = os.path.join(out_dir, 'histograms')
    if not os.path.exists(hist_dir):
        os.mkdir(hist_dir)
    archaeology = self.arc
    background = self.bac
    minima = np.min(archaeology)
    if minima > np.min(background):
        minima = np.min(background)
    maxima = np.max(archaeology)  # bug fix: was np.min(archaeology)
    if maxima < np.max(background):
        maxima = np.max(background)
    hist_arch = np.histogram(archaeology, bins=bins, range=(minima, maxima))
    hist_back = np.histogram(background, bins=bins, range=(minima, maxima))
    print 'Totals'
    print 'hist_arch', np.sum(hist_arch[0])
    print archaeology.shape
    hist_arch_norm = np.true_divide(hist_arch[0], archaeology.shape)
    hist_back_norm = np.true_divide(hist_back[0], background.shape)
    os.chdir(hist_dir)
    sum_of_difference = np.sum(np.abs(hist_arch_norm - hist_back_norm))
    print sum_of_difference
    contrast_emd = emd.emd(range(bins), range(bins), hist_arch_norm, hist_back_norm)
    print 'EMD', contrast_emd
    emd_list.append(contrast_emd)
    emd_comp = np.array(emd_list)
    print emd_comp.shape
    os.chdir(self.plot_dir)
    np.savetxt(self.name + '_emd.txt', emd_comp, delimiter=',')
def regions_distance(r_features, r_weigths):
    if len(r_features) >= MAX_EMD_POINTS:
        return 1e20
    return emd((query_num, map(float, weights)),
               (r_features, map(float, r_weigths)),
               lambda a, b: float(dist_for_emd(a, b, ltheta)))
def discoverfks(self, theta):
    # phase 1
    fs = []
    fm = []
    # bksketches will contain bottom-k sketches for each column, indexed on
    # (<schemaname>, <tablename>, <columnname>)
    bksketches = {}
    quantiles = {}
    s = {}
    # calculate the bottom-k sketch for all columns and store in <bksketches>
    for column in self.columns:
        bksketches[(column.db_schema, column.tablename, column.columnname)] = \
            self.bottomksketch(self.getDataFn(column.db_schema, column.columnname,
                                              column.tablename))
    pkall = self.pksingle
    pkall.extend(self.pkmulti)
    for pk in pkall:  # foreach primary key (single and multi)
        pkcollst = pk.db_columns.split(self.colseparator)
        n = len(pkcollst)
        for keycolumn_name in pkcollst:  # foreach column in primary key
            for candidate in self.columns:  # foreach column as foreign key candidate
                this = bksketches[(candidate.db_schema, candidate.tablename,
                                   candidate.columnname)]
                that = bksketches[(pk.db_schema, pk.tablename, keycolumn_name)]
                if self.inclusion(this, that) >= theta and (candidate.tablename != pk.tablename):
                    if n == 1:  # single column pk
                        fs.append(([candidate], pk))
                    if n > 1:  # multi column pk
                        if (pk.db_columns, keycolumn_name) not in s:
                            s[(pk.db_columns, keycolumn_name)] = []
                        # dictionary s indexes on (<pk name>, <pk column>) where the pk
                        # name is generic (can be just a concatenation of the column
                        # names), e.g.: ('id|name', 'id') and ('id|name', 'name')
                        # indicate the two entries in s for PK 'id|name'. For each
                        # entry we store a list of candidate columns found in other
                        # tables.
                        s[(pk.db_columns, keycolumn_name)].append(candidate)
        if n > 1:
            bksketches[(pk.db_schema, pk.tablename, pk.db_columns)] = \
                self.bottomksketch(self.getDataFn(
                    pk.db_schema, pk.db_columns.split(self.colseparator), pk.tablename))
            quantiles[(pk.db_schema, pk.tablename, pk.db_columns)] = \
                self.quantilehistogram(self.getDataFn(
                    pk.db_schema, pk.db_columns.split(self.colseparator), pk.tablename))

    # phase 2
    # fks: dictionary that indexes on (<foreignkey table>, <primary key column>);
    # the value is the list of candidate columns in <foreignkey table> for
    # <primary key column>
    # TBD: remove the table loop
    fks = {}
    for kvp in s:
        spkcolname = kvp[1]
        for e in s[kvp]:
            key = (e.tablename, spkcolname)
            if key not in fks:
                fks[key] = []
            fks[key].append(e)
    for pkm in self.pkmulti:
        pkcollst = pkm.db_columns.split(self.colseparator)
        print(pkm)
        # for each table in the database, check if we have candidates in fks for
        # this PK; if we do, take the cartesian product and store it in fm
        for table in self.tables:
            tname = table.tablename
            L = []
            for pkcolumn in pkcollst:
                key = (tname, pkcolumn)
                if key not in fks:
                    continue
                L.append(fks[key])
            if len(L) == len(pkcollst):
                cart = self.cartesian(L)
                for prod in cart:
                    fm.append((prod, pkm))
    for flst, pk in fm:
        pkcollst = pk.db_columns.split(self.colseparator)
        fcols = [c.columnname for c in flst]
        fschema = flst[0].db_schema  # TBD: ugly indices here
        ftable = flst[0].tablename   # TBD: and here
        fsample = self.bottomksketch(self.getDataFn(fschema, fcols, ftable))
        if self.inclusion(fsample, bksketches[(pk.db_schema, pk.tablename, pk.db_columns)]) >= theta:
            quantiles[(pk.db_schema, pk.tablename, pk.db_columns)] = \
                self.quantilehistogram(self.getDataFn(
                    pk.db_schema, pk.db_columns.split(self.colseparator), pk.tablename))
            quantiles[(fschema, ftable, "|".join(fcols))] = \
                self.quantilehistogram(self.getDataFn(fschema, fcols, ftable))
        else:
            fm.remove((flst, pk))
    for flst, pk in fs:
        # only index zero because every fs entry has exactly one candidate column
        quantiles[(flst[0].db_schema, flst[0].tablename, flst[0].columnname)] = \
            self.quantilehistogram(self.getDataFn(
                flst[0].db_schema, flst[0].columnname, flst[0].tablename))
    result = []
    fall = fs
    fall.extend(fm)
    for f, pk in fall:
        fcols = []
        for cdict in f:
            fcols.append(cdict.columnname)
        fschema = f[0].db_schema  # TBD: ugly indices here
        ftable = f[0].tablename   # TBD: and here
        if quantiles[(fschema, ftable, "|".join(fcols))] is not None and \
                quantiles[(pk.db_schema, pk.tablename, pk.db_columns)] is not None:
            # empty columns...
            qfk = quantiles[(fschema, ftable, "|".join(fcols))]
            qpk = quantiles[(pk.db_schema, pk.tablename, pk.db_columns)]
            emdscore = 0
            try:
                for i in range(len(qfk)):
                    fkhist = qfk[i][0]
                    pkhist = qpk[i][0]
                    fkbins = qfk[i][1]
                    pkbins = qpk[i][1]
                    emdscore += emd.emd(fkhist, pkhist, fkbins[0:-1], pkbins[0:-1])
                emdscore = emdscore / len(qfk[0])
            except:
                emdscore = -1
            if math.isnan(emdscore):
                emdscore = -1
            nfk = ForeignKey(db_catalog=pk.db_catalog, pkdb_schema=pk.db_schema,
                             fkdb_schema=fschema, pktablename=pk.tablename,
                             fktablename=ftable, fk_columns=fcols,
                             keyname='implicit_fk', type='implicit')
            nfk.pk_columns = pk.db_columns
            nfk.score = emdscore
            result.append((nfk, emdscore))
    # print("## len(Q): " + str(len(q)))
    return sorted(result, key=lambda kvp: kvp[1], reverse=False)
def emdcalculate(self, gray_url):
    update_num = 0
    find_flags = 0
    location_value = 0.15
    visual_value = 0.2
    features1 = self.mongo_operate.get_web_view(gray_url, 'gray')
    if features1 is False or features1 == []:
        return 0
    if len(features1) > 50:
        features1 = features1[:50]
    weight1 = [1.0 / len(features1) for i in range(len(features1))]
    for protect_url in self.protected_title_dict.keys():
        features2 = self.mongo_operate.get_web_view(protect_url, 'protected')
        if not features2:
            continue
        if len(features2) > 50:
            features2 = features2[:50]
        # weight of each element in features2
        weight2 = [1.0 / len(features2) for i in range(len(features2))]
        emd_goal_location = emd((features1, weight1), (features2, weight2),
                                self.dis_location)
        if math.isnan(emd_goal_location):
            continue
        if emd_goal_location < location_value:
            find_flags = 1
            self.mysql_handle.undate_gray_list_check_result(
                gray_url, 'view', source_url=protect_url)
            self.mysql_handle.undate_task_result_check_result(
                self.task_id, self.task_start_time, gray_url, 'view_location')
        emd_goal_visual = emd((features1, weight1), (features2, weight2),
                              self.dis_visual)
        if math.isnan(emd_goal_visual):
            continue
        if emd_goal_visual < visual_value:
            find_flags = 1
            self.mysql_handle.undate_gray_list_check_result(
                gray_url, 'view', source_url=protect_url)
            self.mysql_handle.undate_task_result_check_result(
                self.task_id, self.task_start_time, gray_url, 'view_visual')
    for counterfeit_url in self.counterfeit_title_dict.keys():
        features2 = self.mongo_operate.get_web_view(counterfeit_url, 'counterfeit')
        if not features2:
            continue
        if len(features2) > 50:
            features2 = features2[:50]
        # weight of each element in features2
        weight2 = [1.0 / len(features2) for i in range(len(features2))]
        emd_goal_location = emd((features1, weight1), (features2, weight2),
                                self.dis_location)
        if math.isnan(emd_goal_location):
            continue
        if emd_goal_location < location_value:
            find_flags = 1
            self.mysql_handle.undate_gray_list_check_result(
                gray_url, 'view', counterfeit_url=counterfeit_url)
            self.mysql_handle.undate_task_result_check_result(
                self.task_id, self.task_start_time, gray_url, 'view_location')
        emd_goal_visual = emd((features1, weight1), (features2, weight2),
                              self.dis_visual)
        if math.isnan(emd_goal_visual):
            continue
        if emd_goal_visual < visual_value:
            find_flags = 1
            self.mysql_handle.undate_gray_list_check_result(
                gray_url, 'view', counterfeit_url=counterfeit_url)
            self.mysql_handle.undate_task_result_check_result(
                self.task_id, self.task_start_time, gray_url, 'view_visual')
    if update_num >= 5:
        # update_running_state(view_check_num, view_find_num)
        pass
    return find_flags
features_prefix = sys.argv[1]
fdata = FeatureData(features_prefix)
features_by_bid = {}
for f, bid in izip(fdata.features, fdata.business_ids):
    if bid in features_by_bid:
        features_by_bid[bid].append(f)
    else:
        features_by_bid[bid] = [f]
for bid in features_by_bid.keys():
    features_by_bid[bid] = np.array(features_by_bid[bid])

business_ids = np.array(sorted(features_by_bid.keys())[:2])
print "bids: ", business_ids
print "recalculated:"
for bid1 in business_ids:
    for bid2 in business_ids:
        print "D(%d, %d): %.4f" % (bid1, bid2,
                                   emd(features_by_bid[bid1], features_by_bid[bid2]))
print "from file:"
emd_matrix = EMDMatrix.load(sys.argv[2])
print emd_matrix.for_business_ids(business_ids, business_ids)
def my_emd(a, b):
    import emd
    pos = range(len(a))
    return emd.emd((pos, list(a)), (pos, list(b)), lambda x, y: abs(x - y) + 0.0)
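# A toy call of my_emd: positions are bin indices and the ground distance is
# |i - j|, so shifting all mass one bin over costs exactly 1.0 (assuming the
# signature-style emd.emd imported inside my_emd).
# my_emd([0.5, 0.5, 0.0], [0.0, 0.5, 0.5])  # -> 1.0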
def wmd(doc1, doc1_nbow, doc2, doc2_nbow):
    # transpose before converting to lists for the signature-style EMD solver
    doc1 = doc1.T
    doc2 = doc2.T
    wmd_dist = emd((doc1.tolist(), doc1_nbow.tolist()),
                   (doc2.tolist(), doc2_nbow.tolist()), distance)
    return wmd_dist