示例#1
0
文件: text_dis.py 项目: wyl-hit/job
def main():
    emd_list=[]
    Min_goal_location =1
    Min_goal_visual = 1
    Min_black_location = 1
    Min_black_visual=1
    file = open('result.txt','wb')
    test_list= get_features('svips_detect_visual.pkl')
    print "len of test_list is",len(test_list)
    raw_input("test")
    black_list = get_features('vips_black_visual.pkl')
    print "length of white",len(black_list)
    raw_input("goal")
    #weight1 = [1/len(features1) for i in range(len(features1))]
    goal_list =get_features('vips_white_visual.pkl')
    for test_url in test_list.keys():
    	Min_goal_location =1
    	Min_goal_lurl =''
    	Min_goal_visual = 1
	Min_goal_vurl = ''
    	Min_black_visual=1
	Min_black_vurl = ''
    	Min_black_location = 1
	Min_black_lurl = ''
	features1=test_list[test_url]
    	if len(features1)>50:
	    continue
	weight1 = [1/len(features1) for i in range(len(features1))]
    	for goal_url in goal_list.keys():
		features2 =goal_list[goal_url]
        	if len(features2)>50:
		    continue
		weight2 =[1/len(features2) for i in range(len(features2))]  #features1中每个元素的权重
        	#emd_dis = emd((features1,weight1),(features2,weight2),distance)
        	emd_goal_location = emd((features1,weight1),(features2,weight2),dis_location)
		if emd_goal_location < Min_goal_location:
			Min_goal_location = emd_goal_location
			Min_goal_lurl = goal_url
        	emd_goal_visual = emd((features1,weight1),(features2,weight2),dis_visual)
		if emd_goal_visual < Min_goal_visual:
			Min_goal_visual = emd_goal_visual
			Min_goal_vurl = goal_url
	for black_url in black_list.keys():
		features2 = black_list[black_url]
		if len(features2)>50:
		    continue
		weight2 = [1/len(features2) for i in range(len(features2))]
        	emd_black_location = emd((features1,weight1),(features2,weight2),dis_location)
		if emd_black_location < Min_black_location:
			Min_black_location = emd_black_location
			Min_black_lurl = black_url
        	emd_black_visual = emd((features1,weight1),(features2,weight2),dis_visual)
		if emd_black_visual < Min_black_visual:
			Min_black_visual = emd_black_visual
			Min_black_vurl = black_url
	file.write(str(test_url)+' '+str(Min_goal_lurl)+' '+str(1-Min_goal_location)+' '+str(Min_goal_vurl)+' '+str(1-Min_goal_visual)+' '+
				     str(Min_black_lurl)+' '+str(1-Min_black_location)+' '+str(Min_black_vurl)+' '+str(1-Min_black_visual)+'\n')
#	print emd_dis_location,emd_dis_visual
#	raw_input("item")
    file.close()
def report(prediction_hapls, answer_hapls, dist):
    """Dump a JSON accuracy report for predicted haplotypes to stdout.

    ``dist`` is handed to ``emd`` as a precomputed distance matrix and to the
    ``get_*`` helpers that pair predictions with answers.

    Dataset-level entries:
      TP              predicted haplotypes with no errors
      FP              predicted haplotypes with at least one error
      TotalPredicted  total number of predicted haplotypes (TP + FP)
      Sensitivity     TP divided by the number of true haplotypes
      PPV             precision, TP / (TP + FP)
      EMD / UEMD      EMD with the given / with uniform frequencies

    Per true variant: TF (true frequency), ECP (edit distance to the closest
    prediction), FCP (frequency of that prediction), EEV (explanation error).
    Per predicted variant: ECT (edit distance to the closest true variant).
    """
    pred_freqs = prediction_hapls.freqs
    answer_freqs = answer_hapls.freqs
    precomputed = dict(distance='precomputed', D=dist, return_flows=True)

    # EMD using the reported frequencies as weights.
    emd_res = emd(X=np.ones(len(pred_freqs)), Y=np.ones(len(answer_freqs)),
                  X_weights=pred_freqs, Y_weights=answer_freqs, **precomputed)

    # Same computation with every haplotype weighted equally.
    pred_freqs_unif = np.array(
        [1. / len(pred_freqs) for _ in range(len(pred_freqs))])
    answer_freqs_unif = np.array(
        [1. / len(answer_freqs) for _ in range(len(answer_freqs))])
    emd_unif = emd(X=np.ones(len(pred_freqs)), Y=np.ones(len(answer_freqs)),
                   X_weights=pred_freqs_unif, Y_weights=answer_freqs_unif,
                   **precomputed)

    ans_hapl_count = len(answer_hapls.seqs)
    pred_hapl_count = len(prediction_hapls.seqs)
    closest_pred = get_prediction_closest_to_answer(dist)
    closest_answer = get_answer_closest_to_prediction(dist)
    closest_pred_freqs = get_closest_freq(pred_freqs, closest_pred, answer_freqs)
    freq_adjusted_mismatches = get_freq_adjusted_mismatches(emd_res[1], dist)

    true_positives = sum([x[0] == 0 for x in closest_pred[:ans_hapl_count]])

    # Entries are inserted in a fixed order so the JSON layout stays stable.
    entries = [
        ("TP", true_positives),
        ("FP", pred_hapl_count - true_positives),
        ("TotalPredicted", pred_hapl_count),
        ("Sensitivity", float(true_positives) / ans_hapl_count),
        ("PPV", float(true_positives) / pred_hapl_count),
        ("EMD", emd_res[0]),
        ("UEMD", emd_unif[0]),
        ("TF", list(answer_freqs[:ans_hapl_count])),
        ("ECP", [x[0] for x in closest_pred[:ans_hapl_count]]),
        ("ECT", [x[0] for x in closest_answer[:pred_hapl_count]]),
        ("FCP", list(closest_pred_freqs[:ans_hapl_count])),
        ("EEV", list(freq_adjusted_mismatches[:ans_hapl_count])),
        ("PCA", [x[1][0] for x in closest_pred]),
        ("ACP", [x[1][0] for x in closest_answer]),
        ("ADC", get_adc(closest_pred, answer_freqs)),
        ("APE", get_adc(closest_answer, pred_freqs)),
        ("UADC", get_adc(closest_pred, answer_freqs_unif)),
        ("UAPE", get_adc(closest_answer, pred_freqs_unif)),
    ]
    report_dict = dict()
    for key, value in entries:
        report_dict[key] = value
    json.dump(report_dict, sys.stdout)
示例#3
0
def main():
    """Print the EMD between two hard-coded weighted feature signatures."""
    # Each signature is a (features, weights) pair.
    signature1 = (
        [Feature(100, 40, 22), Feature(211, 20, 2),
         Feature(32, 190, 150), Feature(2, 100, 100)],
        [0.4, 0.3, 0.2, 0.1],
    )
    signature2 = (
        [Feature(0, 0, 0), Feature(50, 100, 80), Feature(255, 255, 255)],
        [0.5, 0.3, 0.2],
    )
    print(emd(signature1, signature2, distance))
示例#4
0
def arc_emd_choice(t, y, method='spline'):
    """
    Denoise the data in y by returning the intrinsic mode (or residual) with
    the largest variance as found using empirical mode decomposition.

    Parameters
    ----------
    t : 1D array-like
        Passed to the decomposition routine together with ``y``
        (presumably the sample positions of ``y`` -- confirm against the
        ``emd`` module).
    y : 1D array-like
        The data to be denoised.
    method : {'spline'|'saw'}
        Which intrinsic mode identification process to employ.

    Result
    ------
    y_denoised : 1D array
        The denoised data.

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names.
    """
    if method == 'spline':
        modes, residual = emd.emd(t, y)
    elif method == 'saw':
        modes, residual = emd.saw_emd(t, y)
    else:
        # Previously an unknown method fell through and failed later with an
        # UnboundLocalError on ``modes``.
        raise ValueError(
            "method must be 'spline' or 'saw', got %r" % (method,))
    # Treat the residual as one more candidate column next to the modes.
    choices = np.append(modes, residual[:, np.newaxis], axis=1)
    variances = np.var(choices, axis=0)
    i_choice = np.argmax(variances)
    return choices[:, i_choice]
示例#5
0
def emd_nd(u, v):
    """
    Earth Mover's Distance between two N-dimensional distributions.

    Relies on https://github.com/garydoranjr/pyemd.  The inputs are given in
    log space, so they are exponentiated before being passed to ``emd``.
    """
    u_prob = np.exp(u)
    v_prob = np.exp(v)
    return emd(u_prob, v_prob)
示例#6
0
def calc_similarity(comparisonpair,
                    distance=cosine_distance,
                    cosine_adjustment=True):
    """Return a similarity score for a pair of weighted word-vector signatures.

    ``comparisonpair`` holds two ``(word_vectors, word_weights)`` entries.
    Identical entries score 1.0 without computing anything.  Otherwise the
    earth mover's distance between the two signatures is computed with
    ``distance`` as ground distance and converted to a similarity:
    ``1 - emd / 2`` when ``cosine_adjustment`` is true, ``1 / emd`` otherwise.
    """
    first, second = comparisonpair[0], comparisonpair[1]
    word_vectors1, word_weights1 = first
    word_vectors2, word_weights2 = second

    # Identical inputs need no EMD computation.
    if (word_vectors1 == word_vectors2) and (word_weights1 == word_weights2):
        return 1.0

    # Signature format: (list of vectors, list of their weights).
    emd_result = emd((word_vectors1, word_weights1),
                     (word_vectors2, word_weights2), distance)

    if not cosine_adjustment:
        # Reciprocal as a similarity estimate for unbounded distances.
        return float(1 / float(emd_result))
    # Halve the distance and subtract from 1 to obtain a similarity.
    return float(float(1) - (emd_result / 2 * 1.0))
示例#7
0
def calculate_emd(hidict, endict, ensentence, hindisentence):
    """Return the EMD between the word vectors of two sentences.

    English words are lower-cased before being looked up in ``endict``;
    Hindi words are looked up in ``hidict`` as given.  The punctuation tokens
    ``! . : ; ,`` and words missing from their dictionary are skipped.

    Returns ``emd(x, y)`` on the collected vectors, or the sentinel ``99``
    when either sentence contributes no vectors.
    """
    punctuation = ('!', '.', ':', ';', ',')

    x = []
    for word in ensentence:
        word = word.lower()
        if word in punctuation:
            continue
        try:
            x.append(endict[word])
        except KeyError:
            # Only a missing word is expected here; any other error is a
            # real problem and must not be silently swallowed.
            continue

    y = []
    for word in hindisentence:
        if word in punctuation:
            continue
        try:
            y.append(hidict[word])
        except KeyError:
            continue

    distance = 99
    if len(y) > 0 and len(x) > 0:
        distance = emd(np.array(x), np.array(y))
    return distance
示例#8
0
 def emd_nd(u, v):
     """Sum the row-wise EMD of ``u`` and ``v`` after reshaping both to rows of 101 values."""
     rows_u = np.reshape(u, (-1, 101))
     rows_v = np.reshape(v, (-1, 101))
     # Each row pair is promoted to 2-D before being handed to ``emd``.
     return sum(emd(np.atleast_2d(row_u), np.atleast_2d(row_v))
                for row_u, row_v in zip(rows_u, rows_v))
def WMD(document1, document2, embeddings):
    '''
    Compute WMD.

    Input:
    document1:      List of words.
    document2:      List of words.
    embeddings:     word2vec embeddings of words.

    Returns:        WMD between documents, float.
    '''

    # Compute nBOW representation of documents.
    d1 = nBOW(document1)
    d2 = nBOW(document2)

    # Get features.
    features1 = [tuple(embeddings[token]) for token in document1]
    features2 = [tuple(embeddings[token]) for token in document2]

    # Pad the shorter signature with zero-weight zero vectors so both have
    # the same number of entries.  ``extend`` adds one zero vector per missing
    # entry; the previous ``append(tuple(...) * diff)`` added a single vector
    # of the wrong dimension.
    gap = len(features1) - len(features2)
    if gap > 0:
        zero_vector = tuple([0] * len(features1[0]))
        d2.extend([0] * gap)
        features2.extend([zero_vector] * gap)
    elif gap < 0:
        # features2 is the non-empty side here, so take the dimension from it.
        zero_vector = tuple([0] * len(features2[0]))
        d1.extend([0] * -gap)
        features1.extend([zero_vector] * -gap)

    # Return WMD.  The second signature must carry its own weights (d2).
    return emd((features1, d1), (features2, d2), distance)
示例#10
0
def solve_by_emd():
    """Solve the problem where the whole mass must be moved."""
    unit_theta = [1] * DIM

    def ground_distance(a, b):
        # ``emd`` is given a plain float for every pair.
        return float(dist_for_emd(a, b, unit_theta))

    return emd((lP, fw1), (lQ, fw2), ground_distance)
def get_dist(n_samples):
    """Return the EMD between two random 2-D samples of ``n_samples`` points.

    Both samples are standard-normal draws, shifted by +1.0 and -1.0
    respectively.
    """
    shift_x = 1.0
    shift_y = -1.0
    sample_x = np.random.randn(n_samples, 2) + shift_x
    sample_y = np.random.randn(n_samples, 2) + shift_y
    return emd(sample_x, sample_y)
示例#12
0
def WMD_bt_queryAndLib(text, Pos, X, BOW_X):
    """Rank library documents by EMD to ``text`` and return the closest label.

    X:      per-document w2v vector matrices (``X[j]`` belongs to document j)
    BOW_X:  per-document BOW (word frequency) vectors
    Pos:    per-document labels; ``Pos[j]`` is paired with the distance to
            document j

    The scan stops early once a document at distance 0 is found.  The five
    closest (distance, label) pairs are printed, and the label of the closest
    document is returned.
    """
    doc_count = np.shape(X)[0]  # number of documents
    (Fs, wordOrders, weight) = Weight(segmentation(text.lower()).split())
    text_BOWVec = weight.tolist()   # BOW word vector of text
    text_w2vVec = Fs.T.T.tolist()   # w2v word vector of text
    query_signature = (text_w2vVec, text_BOWVec)

    scored = []
    for j in xrange(doc_count):
        emdDist = emd(query_signature,
                      (X[j].T.tolist(), BOW_X[j].tolist()), distance)
        scored.append((Pos[j], emdDist))
        if emdDist == 0:
            # Exact match already found in the library; stop scanning.
            break

    ranked = sorted(scored, key=lambda pair: pair[1])
    print('-----------------------------------------------')
    for label, dist_value in ranked[:5]:
        print('%f    %s' % (dist_value, label.encode('utf-8')))
    return ranked[0][0]
def WMD(document1, document2, embeddings):
    '''
    Compute WMD.

    Input:
    document1:      List of words.
    document2:      List of words.
    embeddings:     word2vec embeddings of words.

    Returns:        WMD between documents, float.
    '''

    # Compute nBOW representation of documents.
    d1 = nBOW(document1)
    d2 = nBOW(document2)

    # Get features.
    features1 = [tuple(embeddings[token]) for token in document1]
    features2 = [tuple(embeddings[token]) for token in document2]

    # Equalise the signature lengths by padding the shorter one with
    # zero-weight zero vectors.  One vector is added per missing entry; the
    # earlier ``append(tuple(...) * diff)`` added a single over-long tuple.
    missing = abs(len(features1) - len(features2))
    if len(features1) > len(features2):
        padding = [tuple([0] * len(features1[0]))] * missing
        d2.extend([0] * missing)
        features2.extend(padding)
    elif len(features1) < len(features2):
        # Take the dimension from features2, which cannot be empty here.
        padding = [tuple([0] * len(features2[0]))] * missing
        d1.extend([0] * missing)
        features1.extend(padding)

    # Return WMD.  Each signature is paired with its own nBOW weights.
    return emd((features1, d1), (features2, d2), distance)
示例#14
0
def WMD_bt_queryAndLib(query, Pos, X, Weight_X):
    """Rank every library document by EMD to ``query``.

    X:         per-document w2v vector matrices (``X[j]`` belongs to document j)
    Weight_X:  per-document weight vectors
    Pos:       per-document labels; ``Pos[j]`` is paired with the distance to
               document j

    NaN distances are replaced by 9999.  The five closest (distance, label)
    pairs are printed and the full list of ``(label, distance)`` pairs, sorted
    by distance, is returned.
    """
    doc_count = np.shape(X)[0]  # number of documents
    query = clean(query)
    (Fs, weight) = generateVec(segmentation(query.lower()).split())
    query_weightVec = weight.tolist()   # weight word vector of query
    query_w2vVec = Fs.T.T.tolist()      # w2v word vector of query
    query_signature = (query_w2vVec, query_weightVec)

    scored = []
    for j in xrange(doc_count):
        emdDist = emd(query_signature,
                      (X[j].T.tolist(), Weight_X[j].tolist()), distance)
        # A NaN result is mapped to a large sentinel; the original author
        # noted that omitting this check triggered a bug.
        if math.isnan(emdDist):
            emdDist = 9999
        scored.append((Pos[j], emdDist))

    ranked = sorted(scored, key=lambda pair: pair[1])
    print('-----------------------------------------------')
    for label, dist_value in ranked[:5]:
        print('%f    %s' % (dist_value, label.encode('utf-8')))
    return ranked
示例#15
0
def pre_phi_future(current, future, current_part, future_part, state, tpm,
                   base=2):
    """Return the EMD between the whole and the partitioned effect repertoire.

    The partition splits ``current``/``future`` into ``current_part``/
    ``future_part`` and their complements.  An effect repertoire is computed
    for each half, the halves are multiplied together, and the product is
    compared with the unpartitioned repertoire using a cityblock ground
    distance over the binary state labels.
    """
    rest_current = current - current_part
    rest_future = future - future_part

    whole_rep = iit.effect_repertoire(current, future, state, tpm, base)

    part1_state = iit.convert_to_subset(state, current_part, base)
    part1_rep = iit.effect_repertoire(current_part, future_part, part1_state,
                                      tpm, base)

    part2_state = iit.convert_to_subset(state, rest_current, base)
    part2_rep = iit.effect_repertoire(rest_current, rest_future, part2_state,
                                      tpm, base)

    partitioned_rep = iit.multiply_repertoires(future_part, rest_future,
                                               part1_rep, part2_rep, base)

    # One row per state index, one 0/1 column per bit, highest bit first.
    future_nnodes = len(future)
    state_ids = np.array(range(2**future_nnodes))
    bit_masks = 1 << np.arange(future_nnodes - 1, -1, -1)
    locs = ((state_ids[:, None] & bit_masks) > 0).astype(int)

    return emd(locs, locs, whole_rep, partitioned_rep, distance='cityblock')
示例#16
0
def main():
    """Print the EMD between two hard-coded weighted feature signatures."""
    # A signature is a (features, weights) pair.
    first_signature = (
        [Feature(100, 40, 22), Feature(211, 20, 2),
         Feature(32, 190, 150), Feature(2, 100, 100)],
        [0.4, 0.3, 0.2, 0.1],
    )
    second_signature = (
        [Feature(0, 0, 0), Feature(50, 100, 80), Feature(255, 255, 255)],
        [0.5, 0.3, 0.2],
    )
    print(emd(first_signature, second_signature, distance))
def populate_EMD_chunk(coordinates):
    """Compute the EMD for every ``(i, j)`` pair in ``coordinates``.

    A progress line naming the first and last pair is appended to the file
    ``out`` before any distance is computed.  Returns a list of
    ``(i, j, distance)`` tuples in the order of ``coordinates``.
    """
    with open("out", 'a') as log:
        first, last = coordinates[0], coordinates[-1]
        log.write("%s: populating (%d, %d) through (%d, %d)...\n" %
                  (datetime.now().isoformat(), first[0], first[1],
                   last[0], last[1]))

    chunk = []
    for i, j in coordinates:
        features_i = features_by_bid[business_ids[i]]
        features_j = features_by_bid[business_ids[j]]
        chunk.append((i, j, emd(features_i, features_j)))
    return chunk
示例#18
0
def populate_EMD_chunk(coordinates):
    """Return ``(i, j, emd)`` for each index pair in ``coordinates``.

    Before computing anything, one progress line with the first and last
    pair is appended to the file ``out``.
    """
    with open("out", 'a') as log:
        start_pair = coordinates[0]
        end_pair = coordinates[-1]
        message = "%s: populating (%d, %d) through (%d, %d)...\n" % (
            datetime.now().isoformat(),
            start_pair[0], start_pair[1],
            end_pair[0], end_pair[1])
        log.write(message)

    def pair_distance(i, j):
        # Features are looked up through the business id of each index.
        return emd(features_by_bid[business_ids[i]],
                   features_by_bid[business_ids[j]])

    return [(i, j, pair_distance(i, j)) for i, j in coordinates]
示例#19
0
文件: emd_dis.py 项目: wyl-hit/job
def main():
	    features1 =check_rect(get_features('emd_mxye.pkl'))
	    weight1 =[1/len(features1) for i in range(len(features1))]  #features1中每个元素的权重
	    features2 = check_rect(get_features('emd_nhwd.pkl'))
	    weight2 = [1/len(features2) for i in range(len(features2))]
	    print "result"
	    R= emd((features1,weight1),(features2,weight2),Distance)
    	    print "Result is",R
示例#20
0
def cei(subset, state, tpm, base=2):
    '''
    Calculate the cause-effect information resulting from the state of a
    particular subset being known.

    subset is the set of nodes for which the state is known.
    state is an integer which describes the state of these nodes.
    tpm is the 2^n x n transition probability matrix.

    Returns the smaller of the cause information and the effect information,
    each measured as the EMD (cityblock ground distance) between the
    unconstrained repertoire and the repertoire constrained by the subset.
    '''
    node_count = np.size(tpm, 1)
    all_nodes = set(range(node_count))

    effect_uncon = iit.uncon_effect_repertoire(tpm, base)
    cause_uncon = iit.uncon_cause_repertoire(node_count, base)

    effect_rep = iit.effect_repertoire(subset, all_nodes, state, tpm, base)
    cause_rep = iit.cause_repertoire(subset, all_nodes, state, tpm, base)

    # One row per state index, one 0/1 column per bit, highest bit first.
    state_ids = np.array(range(2**node_count))
    bit_masks = 1 << np.arange(node_count - 1, -1, -1)
    locs = ((state_ids[:, None] & bit_masks) > 0).astype(int)

    cause_information = emd(locs, locs, cause_uncon, cause_rep,
                            distance='cityblock')
    effect_information = emd(locs, locs, effect_uncon, effect_rep,
                             distance='cityblock')

    return np.minimum(cause_information, effect_information)
示例#21
0
def WMD_bt_2texts(text1, text2):
    """Return the EMD between the weighted word-vector signatures of two texts."""
    signatures = []
    for text in (text1, text2):
        (Fs, wordOrders, weight) = Weight(segmentation(text.lower()).split())
        bow_vec = weight.tolist()   # BOW word vector of the text
        w2v_vec = Fs.T.T.tolist()   # w2v word vectors of the text
        signatures.append((w2v_vec, bow_vec))
    # EMD of the two documents.
    return emd(signatures[0], signatures[1], distance)
示例#22
0
def get_wmd(ix):
    """Return a (1, n) row of EMDs between document ``ix`` and documents 0..ix-1.

    ``n`` is the number of documents in the module-level ``X``; entries from
    column ``ix`` onwards stay 0.
    """
    doc_count = np.shape(X)[0]
    row = np.zeros((1, doc_count))
    print('%d out of %d' % (ix, doc_count))
    target = (X[ix], BOW_X[ix]) if ix > 0 else None
    for j in xrange(ix):
        row[0, j] = emd(target, (X[j], BOW_X[j]), distance)
    return row
def earth_mover(points1, points2, normer=np.sum):
    """Return the EMD between two value arrays laid out on the interval [0, 1].

    Each array is placed on an evenly spaced grid from 0 to 1, non-finite
    entries are dropped together with their grid positions, and the remaining
    values are divided by ``normer(values)`` before being used as weights.
    """
    finite1 = np.array(np.isfinite(points1))
    grid1 = np.linspace(0, 1, len(points1), endpoint=True)
    xs1 = grid1[finite1]

    finite2 = np.array(np.isfinite(points2))
    grid2 = np.linspace(0, 1, len(points2), endpoint=True)
    xs2 = grid2[finite2]

    points1 = points1[np.isfinite(points1)]
    points2 = points2[np.isfinite(points2)]

    weights1 = points1 / normer(points1)
    weights2 = points2 / normer(points2)
    return emd.emd(xs1, xs2, weights1, weights2)
示例#24
0
def wae_loss(x, x_hat, mu, logvar, batch_size=128):
    """Return ``emd(x, x_hat)`` plus a KL-divergence term scaled by batch size.

    The KL term is ``-0.5 * sum(1 + logvar - mu^2 - exp(logvar))`` divided by
    ``batch_size``.
    """
    kl_terms = 1 + logvar - mu.pow(2) - logvar.exp()
    kld = -0.5 * torch.sum(kl_terms)  # KL divergence
    # Scale by the batch size.
    kld /= batch_size
    reconstruction = emd(x, x_hat)
    return reconstruction + kld
示例#25
0
def AdvancedKernelEMD(X1, X1_labels, X2, X2_labels, dist):
    """Build a matrix of label-aware EMDs between the samples of X1 and X2.

    For every pair ``(i, j)`` with ``j >= i`` the entry is the EMD between the
    rows labelled 1 plus the EMD between the rows labelled 0.  The strict
    upper triangle is then mirrored onto the lower one.
    """
    rows = np.shape(X1)[0]
    cols = np.shape(X2)[0]
    D = np.zeros((rows, cols))

    def split_by_label(sample, labels):
        # Rows labelled 1 first, rows labelled 0 second.
        return sample[labels == 1], sample[labels == 0]

    for i in range(rows):
        for j in range(i, cols):
            X1_positive, X1_negative = split_by_label(X1[i], X1_labels[i])
            X2_positive, X2_negative = split_by_label(X2[j], X2_labels[j])
            positive_part = emd(X1_positive, X2_positive, distance=dist)
            negative_part = emd(X1_negative, X2_negative, distance=dist)
            D[i, j] = positive_part + negative_part

    # Copy the strict upper triangle below the diagonal.
    return D + np.transpose(np.triu(D, k=1))
示例#26
0
文件: wmd.py 项目: COMPASS-WANG/wmd
def get_wmd(ix):
    """Compute the EMD from document ``ix`` to every earlier document.

    Returns a (1, n) array, where n is the number of documents in the
    module-level ``X``; only columns ``0 .. ix-1`` are filled in.
    """
    total = np.shape(X)[0]
    distances = np.zeros((1, total))
    print('%d out of %d' % (ix, total))
    for other in xrange(ix):
        distances[0, other] = emd((X[ix], BOW_X[ix]),
                                  (X[other], BOW_X[other]), distance)
    return distances
示例#27
0
文件: wmd.py 项目: htt210/wmd
def get_wmd(i):
    """Return a (1, n) row of EMDs between document ``i`` and every document.

    Pairs in which either document is empty get the fixed value 2.0 instead
    of an EMD.
    """
    row = np.zeros((1, n))
    for j in range(n):
        if len(X[i]) <= 0 or len(X[j]) <= 0:
            row[0, j] = 2.0
            continue
        row[0, j] = emd(X[i], X[j], X_weights=BOW_X[i], Y_weights=BOW_X[j])
    return row
示例#28
0
def get_wmd(i):
    """Compute the EMD from document ``i`` to each of the ``n`` documents.

    The result is a (1, n) array.  When document ``i`` or the other document
    is empty, the entry is set to 2.0 rather than computed.
    """
    distances = np.zeros((1, n))
    for j in range(n):
        both_nonempty = len(X[i]) > 0 and len(X[j]) > 0
        if both_nonempty:
            value = emd(X[i], X[j], X_weights=BOW_X[i], Y_weights=BOW_X[j])
        else:
            value = 2.0
        distances[0, j] = value
    return distances
示例#29
0
def calc_emd(G_orig, G_anon):
    """
    Earth Mover's Distance between the degree listings of the original and
    the sanitized graph.
    PyEMD package required! https://github.com/garydoranjr/pyemd
    :param G_orig: original graph
    :param G_anon: sanitized graph; must have as many nodes as ``G_orig``
    :return: result of ``emd`` on the two ``degree().items()`` listings
    """
    assert (G_orig.number_of_nodes() == G_anon.number_of_nodes())
    degrees_orig = G_orig.degree().items()
    degrees_anon = G_anon.degree().items()
    return emd(degrees_orig, degrees_anon)
示例#30
0
def cal_sentence_distance(sentence1_word_vector_list, sentence1_word_freq_list,
                          sentence2_word_vector_list,
                          sentence2_word_freq_list):
    """Return the EMD between two sentences.

    Each sentence is a signature made of its word vectors and the matching
    word frequencies; ``distance`` is used as the ground distance.
    """
    # The unused start/end timing locals were dropped; they only served a
    # print statement that had already been commented out.
    return emd(
        (sentence1_word_vector_list, sentence1_word_freq_list),
        (sentence2_word_vector_list, sentence2_word_freq_list), distance)
示例#31
0
def ComputeKernelEMD(X1, X2, dist):
    """Build a matrix of pairwise EMDs between the samples of X1 and X2.

    Only pairs ``(i, j)`` with ``j >= i`` are computed; the strict upper
    triangle is then mirrored onto the lower one.
    """
    rows = np.shape(X1)[0]
    cols = np.shape(X2)[0]
    D = np.zeros((rows, cols))
    for i in range(rows):
        for j in range(i, cols):
            D[i, j] = emd(X1[i], X2[j], distance=dist)
    # Copy the strict upper triangle below the diagonal.
    mirrored = np.transpose(np.triu(D, k=1))
    return D + mirrored
示例#32
0
def ComputeKernelEMD1D(X1, X2, dist):
    """Return the upper-triangular matrix of EMDs between X1 and X2 samples.

    Entries with ``j < i`` stay 0.  Length-1 axes are squeezed out of the
    result before it is returned.
    """
    rows, cols = np.shape(X1)[0], np.shape(X2)[0]
    D = np.zeros((rows, cols))
    for i in range(rows):
        for j in range(i, cols):
            D[i, j] = emd(X1[i], X2[j], distance=dist)
    return np.squeeze(D)
def earth_mover(points1, points2):
    """Return the EMD between two value arrays laid out on the interval [0, 1].

    Each array is placed on an evenly spaced grid from 0 to 1; non-finite
    entries are dropped together with their grid positions, and the remaining
    values are normalised to sum to 1 before being used as weights.
    """
    keep1 = np.array(np.isfinite(points1))
    keep2 = np.array(np.isfinite(points2))
    xs1 = np.linspace(0, 1, len(points1), endpoint=True)[keep1]
    xs2 = np.linspace(0, 1, len(points2), endpoint=True)[keep2]

    points1 = points1[np.isfinite(points1)]
    points2 = points2[np.isfinite(points2)]

    weights1 = points1 / np.sum(points1)
    weights2 = points2 / np.sum(points2)
    return emd.emd(xs1, xs2, weights1, weights2)
示例#34
0
def WMD_bt_2texts(text1, text2):
    """Return the EMD between two texts, or 9999 when the result is NaN."""
    signatures = []
    for text in (text1, text2):
        (Fs, weight) = generateVec(segmentation(text.lower()).split())
        bow_vec = weight.tolist()   # BOW word vector of the text
        w2v_vec = Fs.T.T.tolist()   # w2v word vectors of the text
        signatures.append((w2v_vec, bow_vec))
    # EMD of the two documents.
    Dist = emd(signatures[0], signatures[1], distance)
    return 9999 if math.isnan(Dist) else Dist
示例#35
0
def ensemble_process(x, data_length, max_modes, max_siftings, noise_std,
                     ensembles_per_process, output):
    """Accumulate the decompositions of noise-perturbed copies of ``x``.

    For each of ``ensembles_per_process`` rounds, Gaussian noise scaled by
    ``noise_std`` is added to ``x`` and the result of ``emd.emd`` on the
    noisy data is added to a running ``(max_modes + 1, data_length)`` sum.
    The sum is delivered through ``output.put`` rather than returned.
    """
    imf_sum = np.zeros((max_modes + 1, data_length))
    for _ in range(ensembles_per_process):
        noise = np.random.randn(data_length) * noise_std
        noisy_data = x + noise
        imf_sum = imf_sum + emd.emd(noisy_data, max_modes, max_siftings)
    output.put(imf_sum)
def get_all_emd(grouped_chrm, type):
    """Append one ``emd_obj`` per window pair in ``combs`` to ``all_emd``.

    For each pair, the attribute named ``type`` is read from every variant of
    both windows and replaced by its absolute offset from 0.5.  When both
    windows have variants, the object carries the EMD of the two cumulative
    sums and the two-sample KS p-value multiplied by ``bonf_corr``; otherwise
    both fields are ``None``.
    """
    def offsets_from_half(window_key):
        variants = grouped_chrm.windows[window_key].variants
        return [abs(getattr(v, type) - 0.5) for v in variants]

    for comb in combs:
        s1 = offsets_from_half(comb[0])
        s2 = offsets_from_half(comb[1])
        if not s1 or not s2:
            all_emd.append(emd_obj(comb, None, None))
            continue
        p = scipy.stats.ks_2samp(s1, s2).pvalue
        distance_value = emd.emd(emd.cumsum(s1), emd.cumsum(s2))
        all_emd.append(emd_obj(comb, distance_value, p * bonf_corr))
示例#37
0
def ComputeKernelEMD1D(X1, X2, dist):
    sz1 = np.shape(X1)[0]
    sz2 = np.shape(X2)[0]
    D = np.zeros((sz1, sz2))
    for i in range(0, sz1):
        for j in range(i, sz2):
            print i, j
            startT = time.time()
            D[i, j] = (emd(X1[i], X2[j], distance=dist))
            endT = time.time() - startT
            #print 'EMD took ' + str(endT) + ' seconds.'
    #D2 = np.exp((-1/np.mean(D[np.nonzero(D)]))*D)
    return D
示例#38
0
def emdist(a, b, distance_function):
    """
    Return Earth Mover's distance

    Column 0 of each matrix holds the weights; the remaining columns hold the
    features.

    :type distance_function: function
    :type b: numpy matrix
    :type a: numpy matrix
    """
    def as_signature(matrix):
        weights = matrix[:, 0]
        features = matrix[:, 1:]
        return (features.tolist(), weights.tolist())

    return emd(as_signature(a), as_signature(b), distance_function)
def calculateEMDMetric(histContainer1, histContainer2, angles, distanceFcn):
    """Find, for every histogram pair, the smallest EMD over all rotations.

    Both containers yield ``(file name, histogram)`` pairs.  For each pair
    the second histogram is compared against the first under every rotation
    of ``angles``, and the minimum is kept (1000000000 when ``angles`` is
    empty).

    Returns a dict mapping each file name of the first container to a list
    of ``(second file name, minimum EMD)`` tuples.
    """
    results = {}
    for name1, hist1 in histContainer1:
        for name2, hist2 in histContainer2:
            best = 1000000000
            for shift in range(len(angles)):
                rotated = rotate(angles, shift)
                score = emd((angles, hist1.tolist()),
                            (rotated, hist2.tolist()), distanceFcn)
                if score < best:
                    best = score
            results.setdefault(name1, []).append((name2, best))
    return results
示例#40
0
def get_wmd(ix):                                 # calculate the WMD distance between documents
    print '***', ix
    n = np.shape(X)
    n = n[0]                                            # number of documents 
    Di = np.zeros((1,n))                          # (1 x NoOfDocs) matrix
    i = ix
    print '%d out of %d' % (i, n)
    for j in xrange(i):
        print '***'
        print 'X[i] Size = ', np.shape(X[i])
        print 'X[j] Size = ', np.shape(X[j])
        print 'BOW_X[i] Size = ', np.shape(BOW_X[i])
        print 'BOW_X[j] Size = ', np.shape(BOW_X[j])
        Di[0,j] = emd((X[i], BOW_X[i]), (X[j], BOW_X[j]), distance)     # calculate the EMD of two documents
    print Di
    return Di 
示例#41
0
文件: model.py 项目: ssherko/musicman
def _normalised_weights(signature):
    """Return the third field of every cluster, scaled to sum to 1, as an array."""
    raw = [cl[2] for cl in signature]
    # Sum once instead of once per element, and build the array from a list:
    # ``np.array(map(...))`` wraps an iterator on Python 3 and does not
    # produce a numeric array.
    total = sum(raw)
    return np.array([float(weight) / total for weight in raw])


def calculate_emd(signature1, signature2):
    """Return the EMD between two signatures using a precomputed distance matrix.

    Each signature is a sequence of clusters; field 0 of a cluster is used as
    its point and field 2 as its weight.  Weights are normalised to sum to 1
    and the ground distances come from ``calculate_distance_matrix`` with the
    "euclidean" entry of ``GROUND_DISTANCES``.
    """
    D = calculate_distance_matrix(
        signature1, signature2,
        ground_distance=GROUND_DISTANCES["euclidean"])

    signature1_points = [cl[0] for cl in signature1]
    signature1_weights = _normalised_weights(signature1)

    signature2_points = [cl[0] for cl in signature2]
    signature2_weights = _normalised_weights(signature2)

    return emd(
        signature1_points, signature2_points,
        X_weights=signature1_weights, Y_weights=signature2_weights,
        distance='precomputed', D=D,
    )
def calWD(d1,d2):
    """Return the EMD between two image batches, one flattened row per image.

    Both batches are fed through float32 TensorFlow placeholders shaped like
    ``d1`` and fetched back unchanged, then reshaped to
    ``(number of images in d1, -1)`` and passed to ``emd``.  The flattened
    shape of the first batch is printed.
    """
    # Number of images.  d2 is reshaped with the same count, so both batches
    # are expected to contain the same number of images.
    size = d1.shape[0]
    batch_shape = [d1.shape[0], d1.shape[1], d1.shape[2], d1.shape[3]]

    D = tf.placeholder(tf.float32, batch_shape)
    D2 = tf.placeholder(tf.float32, batch_shape)

    with tf.Session() as sess:
        # No convolution is applied any more: each placeholder is fetched
        # back as is.
        Q = sess.run(D, feed_dict={D: d1})
        P = sess.run(D2, feed_dict={D2: d2})

        Q = Q.reshape(size, -1)
        P = P.reshape(size, -1)

        print(Q.shape)

        # The old code wrapped this in a one-element tuple through a stray
        # trailing comma and then unpacked it again with [0].
        return emd(Q, P)
def _weighted_vectors(sentence, model):
    """Return ``(vectors, weights)`` for the words of ``sentence`` found in ``model``.

    Every run of characters other than ASCII letters and whitespace is
    replaced by a space.  Tokens are counted in both their original and
    lower-cased form; weights are those counts normalised to sum to 1.
    """
    # Raw string: '\s' inside a plain literal is an invalid escape sequence
    # and triggers a warning on recent Python 3 versions.
    cleaned = re.sub(r'[^a-zA-Z\s]+', ' ', sentence)
    lowered = cleaned.strip().lower().split()
    original = cleaned.strip().split()

    words = [word for word in set(lowered + original) if word in model]
    counts = collections.Counter(original + lowered)

    weights = np.array([counts[word] * 1.00 for word in words])
    weights /= sum(weights)
    vectors = [model[word] * 1.00 for word in words]
    return vectors, weights


def get_score(s1, s2, model):
    """Return the EMD between the weighted word vectors of two sentences.

    Words are looked up in ``model``; words not present in it are ignored.
    """
    v1, w1 = _weighted_vectors(s1, model)
    v2, w2 = _weighted_vectors(s2, model)
    return emd(v1, v2, X_weights=w1, Y_weights=w2)
示例#44
0
def metric_emd_sig(vec1,vec2):
    """Return the EMD between two flattened cluster signatures.

    Each input holds ``ndim = 4`` consecutive groups of ``nclusters`` values:
    the first group is used as the cluster weights and the remaining three
    groups as the cluster coordinates. The heaviest cluster of each
    signature is removed (it is assumed to represent the background) and the
    remaining weights are renormalised before calling ``emd`` with the
    euclidean ground distance.

    Args:
        vec1: array whose first dimension has length ``4 * nclusters``.
        vec2: array with the same number of elements as ``vec1``.

    Returns:
        The value returned by ``emd`` for the reduced signatures.
    """
    ndim = 4
    # Floor division keeps the cluster count an integer under true division
    # as well; a float here breaks reshape().
    nclusters = vec1.shape[0] // ndim

    def _without_background(vec):
        # Rows are clusters; column 0 is the weight, columns 1..3 the point.
        sig = vec.reshape((ndim, nclusters)).T
        weights = np.array(sig[:, 0:1], dtype=float)
        points = np.array(sig[:, 1:ndim], dtype=float)

        # Drop the biggest cluster and renormalise the remaining weights.
        biggest = np.argmax(weights)
        weights = np.delete(weights, biggest, axis=0)
        weights = weights / np.sum(weights, axis=0)
        points = np.delete(points, biggest, axis=0)
        return points, weights

    X_red, X_weights_red = _without_background(vec1)
    Y_red, Y_weights_red = _without_background(vec2)

    return emd(X_red, Y_red, X_weights_red, Y_weights_red, distance='euclidean')
示例#45
0
                 0.005434782608695652, 0.005434782608695652, 0.005434782608695652, 0.005434782608695652,
                 0.005434782608695652, 0.005434782608695652, 0.005434782608695652, 0.005434782608695652,
                 0.005434782608695652, 0.005434782608695652, 0.005434782608695652, 0.005434782608695652,
                 0.005434782608695652, 0.005434782608695652, 0.005434782608695652, 0.005434782608695652,
                 0.005434782608695652, 0.005434782608695652, 0.005434782608695652, 0.005434782608695652,
                 0.005434782608695652, 0.005434782608695652, 0.005434782608695652, 0.005434782608695652,
                 0.005434782608695652, 0.005434782608695652, 0.005434782608695652, 0.005434782608695652,
                 0.005434782608695652, 0.005434782608695652, 0.005434782608695652, 0.005434782608695652,
                 0.005434782608695652, 0.005434782608695652, 0.005434782608695652, 0.005434782608695652,
                 0.005434782608695652, 0.005434782608695652, 0.005434782608695652, 0.005434782608695652,
                 0.005434782608695652, 0.005434782608695652, 0.005434782608695652, 0.005434782608695652,
                 0.005434782608695652, 0.005434782608695652, 0.005434782608695652, 0.005434782608695652,
                 0.005434782608695652, 0.005434782608695652, 0.005434782608695652, 0.005434782608695652,
                 0.005434782608695652, 0.005434782608695652, 0.005434782608695652, 0.005434782608695652,
                 0.005434782608695652, 0.005434782608695652, 0.005434782608695652, 0.005434782608695652,
                 0.005434782608695652, 0.005434782608695652, 0.005434782608695652, 0.005434782608695652,
                 0.005434782608695652, 0.005434782608695652, 0.005434782608695652, 0.005434782608695652,
                 0.005434782608695652, 0.005434782608695652, 0.005434782608695652, 0.005434782608695652,
                 0.005434782608695652, 0.005434782608695652, 0.005434782608695652, 0.005434782608695652,
                 0.005434782608695652, 0.005434782608695652, 0.005434782608695652, 0.005434782608695652,
                 0.005434782608695652, 0.005434782608695652, 0.005434782608695652, 0.005434782608695652,
                 0.005434782608695652, 0.005434782608695652, 0.005434782608695652, 0.005434782608695652,
                 0.005434782608695652, 0.005434782608695652, 0.005434782608695652, 0.005434782608695652,
                 0.005434782608695652, 0.005434782608695652, 0.005434782608695652, 0.005434782608695652,
                 0.005434782608695652, 0.005434782608695652, 0.005434782608695652, 0.005434782608695652,
                 0.005434782608695652, 0.005434782608695652, 0.005434782608695652, 0.005434782608695652,
                 0.005434782608695652, 0.005434782608695652, 0.005434782608695652, 0.005434782608695652])

    st = time.time()
    print emd(feature1, feature2, d)
    print (time.time() - st) * 1000
    del content[len(content) - 1]
from scipy.spatial.distance import squareform

# Parse every "<name>:<v1>;<v2>;..." record into [name, [int, ...]] in place
# and collect the names as dendrogram labels.
label = []
for i, record in enumerate(content):
    fields = record.split(":")
    values = fields[1].split(";")
    if len(values) != 16:
        # Report records that do not carry the expected 16 values.
        print(fields[0])
    fields[1] = [int(value) for value in values]
    content[i] = fields
    label.append(fields[0])

# Build distance matrix
# NOTE(review): the last weight is 0 while all others are 1 - confirm this is intended.
weights = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]
n_records = len(content)
distance_matrix = np.zeros([n_records, n_records])
for i in range(n_records):
    for j in range(i, n_records):
        distance = emd.emd(content[i][1], content[j][1], weights, weights)
        distance_matrix[i][j] = distance
        distance_matrix[j][i] = distance

# linkage() treats a 2-D array as raw observation vectors, not as pairwise
# distances, so the square matrix must be condensed first. checks=False
# tolerates tiny non-zero self-distances on the diagonal.
link_matrix = linkage(squareform(distance_matrix, checks=False), method='weighted')

plt.figure()
plt.title("Dendrogram of " + str(len(label)) + " teams")
plt.xlabel("teams")
plt.ylabel("distance")
# Do not rebind the name `dendrogram`; that would shadow the function.
dendrogram(link_matrix, labels=label, leaf_rotation=30., leaf_font_size=5.)
plt.savefig("/robocup/implementation/dendro")

# Cut the tree at a fixed distance and write one "<name> : <cluster>" line per record.
max_d = 0.5
clusters = fcluster(link_matrix, max_d, criterion='distance')
with open("clusters", "w") as cluster_file:
    for fields, cluster_id in zip(content, clusters):
        cluster_file.write(fields[0] + " : " + str(cluster_id) + "\n")
示例#47
0
文件: alt_emd.py 项目: daureg/illalla
def solve_by_emd():
    """Solve the problem where the whole mass must be moved.

    Calls ``emd`` on the module-level signatures ``(lP, fw1)`` and
    ``(lQ, fw2)``, using ``dist_for_emd`` with an all-ones ``theta`` of
    length ``DIM`` as ground distance, and returns its result.
    """
    unit_theta = [1] * DIM

    def ground_distance(a, b):
        # emd is given a plain float for every pair of points.
        return float(dist_for_emd(a, b, unit_theta))

    first_signature = (lP, fw1)
    second_signature = (lQ, fw2)
    return emd(first_signature, second_signature, ground_distance)
示例#48
0
    def hist_compare(self):
        """Compare the value distributions of ``self.arc`` and ``self.bac``.

        Builds 32-bin histograms of both arrays over their combined value
        range, normalises each by its number of samples, prints the summed
        absolute bin difference and the EMD between the two histograms, and
        saves the EMD to ``<self.name>_emd.txt`` inside ``self.plot_dir``.

        Side effects: creates ``<plot_dir>/<name>/histograms`` when missing
        and changes the process working directory (first to that directory,
        finally to ``self.plot_dir``).
        """
        print('BANDS %s' % (self.bands,))

        bins = 32

        # makedirs also creates the intermediate <plot_dir>/<name> level.
        hist_dir = os.path.join(self.plot_dir, self.name, 'histograms')
        if not os.path.exists(hist_dir):
            os.makedirs(hist_dir)

        archaeology = self.arc
        background = self.bac

        # Shared bin range covering both samples. The upper bound was
        # previously seeded with np.min(archaeology), which dropped every
        # archaeology value above the background maximum from the histogram.
        minima = min(np.min(archaeology), np.min(background))
        maxima = max(np.max(archaeology), np.max(background))

        hist_arch = np.histogram(archaeology,
                                 bins=bins,
                                 range=(minima, maxima))
        hist_back = np.histogram(background,
                                 bins=bins,
                                 range=(minima, maxima))

        print('Totals')
        print('hist_arch %s' % (np.sum(hist_arch[0]),))
        print(archaeology.shape)

        # Normalise by the sample count. ``.size`` equals the old
        # ``.shape``-based divisor for 1-D input and also works for
        # multi-dimensional arrays.
        hist_arch_norm = np.true_divide(hist_arch[0], archaeology.size)
        hist_back_norm = np.true_divide(hist_back[0], background.size)

        os.chdir(hist_dir)

        sum_of_difference = np.sum(np.abs(hist_arch_norm - hist_back_norm))
        print(sum_of_difference)

        # Bin indices act as the point positions, the normalised counts as weights.
        contrast_emd = emd.emd(list(range(bins)), list(range(bins)),
                               hist_arch_norm, hist_back_norm)
        print('EMD %s' % (contrast_emd,))

        emd_comp = np.array([contrast_emd])
        print(emd_comp.shape)

        os.chdir(self.plot_dir)
        np.savetxt(self.name + '_emd.txt', emd_comp, delimiter=',')
示例#49
0
 def regions_distance(r_features, r_weigths):
     """Return the EMD between the query signature and a region signature.

     Regions with ``MAX_EMD_POINTS`` or more features are not evaluated and
     get the sentinel distance ``1e20``. Otherwise ``emd`` is called on
     ``(query_num, weights)`` and ``(r_features, r_weigths)`` (weights cast
     to float) with ``dist_for_emd(..., ltheta)`` as ground distance.
     """
     too_many_points = len(r_features) >= MAX_EMD_POINTS
     if too_many_points:
         return 1e20

     def ground_distance(a, b):
         return float(dist_for_emd(a, b, ltheta))

     query_signature = (query_num, [float(w) for w in weights])
     region_signature = (r_features, [float(w) for w in r_weigths])
     return emd(query_signature, region_signature, ground_distance)
示例#50
0
	def discoverfks(self, theta):
		"""Find implicit foreign-key candidates and score them with EMD.

		Phase 1 sketches every column with ``self.bottomksketch`` and, for
		each primary-key column, keeps the columns of *other* tables for which
		``self.inclusion(candidate_sketch, key_sketch) >= theta``. Candidates
		for single-column keys are final; candidates for multi-column keys are
		grouped per key column.

		Phase 2 combines the per-column candidates of each multi-column key
		table by table (``self.cartesian``), re-checks inclusion on the
		combined columns, and finally scores every surviving candidate by
		summing ``emd.emd`` over the quantile histograms of the foreign-key
		and primary-key columns.

		Args:
			theta: minimum ``self.inclusion`` value for a candidate to be kept.

		Returns:
			List of ``(ForeignKey, score)`` tuples sorted by ascending score.
			The score is -1 when the EMD computation raised or produced NaN.
		"""
		# phase 1
		fs = []  # single-column candidates: ([column], pk)
		fm = []  # multi-column candidates: (columns, pk)
		# bottom-k sketches, indexed on (<schemaname>, <tablename>, <columnname>)
		bksketches = {}
		quantiles = {}
		# s indexes on (<pk name>, <pk column>) where the pk name is generic (can be
		# just the concatenation of the column names), e.g. ('id|name', 'id') and
		# ('id|name', 'name') are the two entries for PK 'id|name'. Each entry
		# holds the candidate columns found in other tables.
		s = {}

		# calculate the bottom-k sketch for all columns
		for column in self.columns:
			bksketches[(column.db_schema, column.tablename, column.columnname)] = self.bottomksketch(self.getDataFn(column.db_schema, column.columnname, column.tablename))

		# Build a new list: extending self.pksingle in place would append the
		# multi-column keys to it again on every call.
		pkall = list(self.pksingle) + list(self.pkmulti)
		for pk in pkall: # foreach primary key (single and multi)
			pkcollst = pk.db_columns.split(self.colseparator)
			n = len(pkcollst)

			for keycolumn_name in pkcollst: # foreach column in primary key
				for candidate in self.columns: # foreach column as foreign key candidate
					this = bksketches[(candidate.db_schema, candidate.tablename, candidate.columnname)]
					that = bksketches[(pk.db_schema, pk.tablename, keycolumn_name)]
					if self.inclusion(this, that) >= theta and (candidate.tablename != pk.tablename):
						if n == 1: # single column pk
							fs.append(([candidate], pk))
						if n > 1: # multi column pk
							s.setdefault((pk.db_columns, keycolumn_name), []).append(candidate)
			if n > 1:
				bksketches[(pk.db_schema, pk.tablename, pk.db_columns)] = self.bottomksketch(self.getDataFn(pk.db_schema, pk.db_columns.split(self.colseparator), pk.tablename))

			quantiles[(pk.db_schema, pk.tablename, pk.db_columns)] = self.quantilehistogram(self.getDataFn(pk.db_schema, pk.db_columns.split(self.colseparator), pk.tablename))

		# phase 2

		# fks: dictionary that indexes on (<foreignkey table>, <primary key column>);
		# the values are the candidate columns in <foreignkey table> for that column
		# TBD: remove the table loop
		fks = {}
		for kvp in s:
			spkcolname = kvp[1]
			for e in s[kvp]:
				fks.setdefault((e.tablename, spkcolname), []).append(e)

		for pkm in self.pkmulti:
			pkcollst = pkm.db_columns.split(self.colseparator)
			print(pkm)

			# for each table, check whether fks has candidates for every column
			# of this PK; if so, store the cartesian product in fm
			for table in self.tables:
				tname = table.tablename
				L = [fks[(tname, pkcolumn)] for pkcolumn in pkcollst if (tname, pkcolumn) in fks]
				if len(L) == len(pkcollst):
					for prod in self.cartesian(L):
						fm.append((prod, pkm))

		# Keep only the multi-column candidates that pass the inclusion test on
		# the combined columns. A new list is built because removing from fm
		# while iterating over it skips the element after each removal.
		fm_kept = []
		for flst,pk in fm:
			fcols = [ c.columnname for c in flst ]

			fschema = flst[0].db_schema # TBD: ugly indices here
			ftable = flst[0].tablename # TBD: and here

			fsample = self.bottomksketch(self.getDataFn(fschema, fcols, ftable))
			if self.inclusion(fsample, bksketches[(pk.db_schema, pk.tablename, pk.db_columns)]) >= theta:
				quantiles[(pk.db_schema, pk.tablename, pk.db_columns)] = self.quantilehistogram(self.getDataFn(pk.db_schema, pk.db_columns.split(self.colseparator), pk.tablename))
				quantiles[(fschema, ftable, "|".join(fcols))] = self.quantilehistogram(self.getDataFn(fschema, fcols, ftable))
				fm_kept.append((flst, pk))
		fm = fm_kept

		for flst,pk in fs:
			# only index zero because every fs has only one candidate column...
			quantiles[(flst[0].db_schema, flst[0].tablename, flst[0].columnname)] = self.quantilehistogram(self.getDataFn(flst[0].db_schema, flst[0].columnname, flst[0].tablename))

		result = []
		fall = fs + fm

		for f,pk in fall:
			fcols = [ cdict.columnname for cdict in f ]
			fschema = f[0].db_schema # TBD: ugly indices here
			ftable = f[0].tablename # TBD: and here

			qfk = quantiles[(fschema, ftable, "|".join(fcols))]
			qpk = quantiles[(pk.db_schema, pk.tablename, pk.db_columns)]
			if qfk is None or qpk is None: # empty columns....
				continue

			emdscore = 0
			try:
				for i in range(len(qfk)):
					fkhist = qfk[i][0]
					pkhist = qpk[i][0]

					fkbins = qfk[i][1]
					pkbins = qpk[i][1]

					emdscore += emd.emd(fkhist, pkhist, fkbins[0:-1], pkbins[0:-1])
				# NOTE(review): divides by len(qfk[0]), not len(qfk) - confirm intended.
				emdscore = emdscore/len(qfk[0])
			except Exception:
				# Best effort: a failed EMD computation is reported as score -1.
				emdscore = -1

			if math.isnan(emdscore):
				emdscore = -1

			nfk = ForeignKey(db_catalog=pk.db_catalog, pkdb_schema=pk.db_schema, fkdb_schema=fschema, pktablename=pk.tablename, fktablename=ftable, fk_columns=fcols, keyname='implicit_fk', type='implicit')
			nfk.pk_columns=pk.db_columns
			nfk.score = emdscore

			result.append((nfk, emdscore))

		return sorted(result, key=lambda kvp: kvp[1], reverse=False)
示例#51
0
    def emdcalculate(self, gray_url):
        """Compare the page view of ``gray_url`` with all reference pages.

        The view features of ``gray_url`` are compared, via ``emd``, with the
        features of every URL in ``self.protected_title_dict`` and
        ``self.counterfeit_title_dict``, once with ``self.dis_location`` and
        once with ``self.dis_visual`` as ground distance. Whenever a distance
        falls below its threshold, the match is recorded through
        ``self.mysql_handle``.

        Args:
            gray_url: URL of the page under inspection.

        Returns:
            1 if at least one reference page matched, otherwise 0 (also when
            no features are available for ``gray_url``).
        """
        max_features = 50
        location_value = 0.15
        visual_value = 0.2

        features1 = self.mongo_operate.get_web_view(gray_url, 'gray')
        if not features1:
            return 0
        features1 = features1[:max_features]
        # Weight of each element. 1.0 forces true division: under Python 2,
        # 1 / len(...) is integer division and yields 0 for every weight.
        weight1 = [1.0 / len(features1)] * len(features1)

        def _matches(reference_url, view_type, url_keyword):
            # Compare gray_url with one reference page; record and report hits.
            features2 = self.mongo_operate.get_web_view(reference_url, view_type)
            if not features2:
                return False
            features2 = features2[:max_features]
            weight2 = [1.0 / len(features2)] * len(features2)

            checks = (
                (self.dis_location, location_value, 'view_location'),
                (self.dis_visual, visual_value, 'view_visual'),
            )
            matched = False
            for ground_distance, threshold, result_tag in checks:
                score = emd(
                    (features1, weight1), (features2, weight2), ground_distance)
                if math.isnan(score):
                    # As before, a NaN score ends the checks for this URL.
                    break
                if score < threshold:
                    matched = True
                    self.mysql_handle.undate_gray_list_check_result(
                        gray_url, 'view', **{url_keyword: reference_url})
                    self.mysql_handle.undate_task_result_check_result(
                        self.task_id, self.task_start_time, gray_url, result_tag)
            return matched

        find_flags = 0
        for protect_url in list(self.protected_title_dict):
            if _matches(protect_url, 'protected', 'source_url'):
                find_flags = 1
        for counterfeit_url in list(self.counterfeit_title_dict):
            if _matches(counterfeit_url, 'counterfeit', 'counterfeit_url'):
                find_flags = 1
        return find_flags
features_prefix = sys.argv[1]
fdata = FeatureData(features_prefix)

features_by_bid = {}
for f, bid in izip(fdata.features, fdata.business_ids):
	if bid in features_by_bid:
		features_by_bid[bid].append(f)
	else:
		features_by_bid[bid] = [f]

for bid in features_by_bid.keys():
	features_by_bid[bid] = np.array(features_by_bid[bid])

business_ids = np.array(sorted(features_by_bid.keys())[:2])
print "bids: ", business_ids

print "recalculated:"

for bid1 in business_ids:
	for bid2 in business_ids:
		print "D(%d, %d): %.4f" % (bid1, bid2,
		                           emd(features_by_bid[bid1], features_by_bid[bid2]))

print "from file:"

emd_matrix = EMDMatrix.load(sys.argv[2])
print emd_matrix.for_business_ids(business_ids, business_ids)



def my_emd(a, b):
    """Return ``emd.emd`` between two sequences of weights.

    Both signatures use the indices ``0 .. len(a) - 1`` as point positions;
    ``a`` and ``b`` supply the weights and the ground distance between two
    positions is their absolute difference as a float.
    """
    import emd

    def ground_distance(x, y):
        return abs(x - y) + 0.0

    positions = list(range(len(a)))
    first_signature = (positions, list(a))
    second_signature = (positions, list(b))
    return emd.emd(first_signature, second_signature, ground_distance)
示例#54
0
def wmd(doc1, doc1_nbow, doc2, doc2_nbow):
	"""Return ``emd`` between two documents given as arrays plus nBOW weights.

	Each document array is transposed and converted to nested lists, paired
	with its ``*_nbow`` weights (also as a list), and the two signatures are
	passed to ``emd`` with the module-level ``distance`` as ground distance.
	"""
	# The EMD solver takes plain Python lists.
	first_signature = (doc1.T.tolist(), doc1_nbow.tolist())
	second_signature = (doc2.T.tolist(), doc2_nbow.tolist())
	return emd(first_signature, second_signature, distance)