def main():
    emd_list = []
    file = open('result.txt', 'wb')
    test_list = get_features('svips_detect_visual.pkl')
    print "len of test_list is", len(test_list)
    raw_input("test")
    black_list = get_features('vips_black_visual.pkl')
    print "length of black", len(black_list)
    raw_input("goal")
    goal_list = get_features('vips_white_visual.pkl')
    for test_url in test_list.keys():
        Min_goal_location = 1
        Min_goal_lurl = ''
        Min_goal_visual = 1
        Min_goal_vurl = ''
        Min_black_visual = 1
        Min_black_vurl = ''
        Min_black_location = 1
        Min_black_lurl = ''
        features1 = test_list[test_url]
        if len(features1) > 50:
            continue
        # weight of each element in features1 (1.0 avoids Python 2 integer division)
        weight1 = [1.0 / len(features1) for i in range(len(features1))]
        for goal_url in goal_list.keys():
            features2 = goal_list[goal_url]
            if len(features2) > 50:
                continue
            weight2 = [1.0 / len(features2) for i in range(len(features2))]
            emd_goal_location = emd((features1, weight1), (features2, weight2), dis_location)
            if emd_goal_location < Min_goal_location:
                Min_goal_location = emd_goal_location
                Min_goal_lurl = goal_url
            emd_goal_visual = emd((features1, weight1), (features2, weight2), dis_visual)
            if emd_goal_visual < Min_goal_visual:
                Min_goal_visual = emd_goal_visual
                Min_goal_vurl = goal_url
        for black_url in black_list.keys():
            features2 = black_list[black_url]
            if len(features2) > 50:
                continue
            weight2 = [1.0 / len(features2) for i in range(len(features2))]
            emd_black_location = emd((features1, weight1), (features2, weight2), dis_location)
            if emd_black_location < Min_black_location:
                Min_black_location = emd_black_location
                Min_black_lurl = black_url
            emd_black_visual = emd((features1, weight1), (features2, weight2), dis_visual)
            if emd_black_visual < Min_black_visual:
                Min_black_visual = emd_black_visual
                Min_black_vurl = black_url
        file.write(str(test_url) + ' ' + str(Min_goal_lurl) + ' ' + str(1 - Min_goal_location) + ' '
                   + str(Min_goal_vurl) + ' ' + str(1 - Min_goal_visual) + ' '
                   + str(Min_black_lurl) + ' ' + str(1 - Min_black_location) + ' '
                   + str(Min_black_vurl) + ' ' + str(1 - Min_black_visual) + '\n')
    file.close()
def report(prediction_hapls, answer_hapls, dist):
    # For the dataset it reports:
    # 1) the count of predicted haplotypes with no errors (TP);
    # 2) the count of predicted haplotypes with at least one error (FP);
    # 3) total count of haplotypes (TP+FP);
    # 4) sensitivity (TP/(TP+FN));
    # 5) precision (PPV = TP/(TP+FP));
    # 6) EMD to a consensus.
    # For every true variant it reports:
    # 1) true frequency (TF);
    # 2) editing distance to the closest predicted variant (ECP);
    # 3) frequency of the closest predicted variant (FCP);
    # 4) explanation error for a true variant (EEV).
    # For every predicted variant it reports:
    # 1) editing distance to the closest true variant (ECT).
    emd_res = emd(X=np.ones(len(prediction_hapls.freqs)),
                  Y=np.ones(len(answer_hapls.freqs)),
                  X_weights=prediction_hapls.freqs,
                  Y_weights=answer_hapls.freqs,
                  distance='precomputed', D=dist, return_flows=True)
    pred_freqs_unif = np.array([1. / len(prediction_hapls.freqs)
                                for _ in range(len(prediction_hapls.freqs))])
    answer_freqs_unif = np.array([1. / len(answer_hapls.freqs)
                                  for _ in range(len(answer_hapls.freqs))])
    emd_unif = emd(X=np.ones(len(prediction_hapls.freqs)),
                   Y=np.ones(len(answer_hapls.freqs)),
                   X_weights=pred_freqs_unif,
                   Y_weights=answer_freqs_unif,
                   distance='precomputed', D=dist, return_flows=True)
    ans_hapl_count = len(answer_hapls.seqs)
    pred_hapl_count = len(prediction_hapls.seqs)
    predictions_closest_to_answer = get_prediction_closest_to_answer(dist)
    answer_closest_to_prediction = get_answer_closest_to_prediction(dist)
    predictions_closest_to_answer_freqs = get_closest_freq(
        prediction_hapls.freqs, predictions_closest_to_answer, answer_hapls.freqs)
    freq_adjusted_mismatches = get_freq_adjusted_mismatches(emd_res[1], dist)
    report_dict = dict()
    report_dict["TP"] = sum([x[0] == 0 for x in predictions_closest_to_answer[:ans_hapl_count]])
    report_dict["FP"] = len(prediction_hapls.seqs) - report_dict["TP"]
    report_dict["TotalPredicted"] = len(prediction_hapls.seqs)
    report_dict["Sensitivity"] = float(report_dict["TP"]) / ans_hapl_count
    report_dict["PPV"] = float(report_dict["TP"]) / report_dict["TotalPredicted"]
    report_dict["EMD"] = emd_res[0]  # Fractional accuracy
    report_dict["UEMD"] = emd_unif[0]
    report_dict["TF"] = [x for x in answer_hapls.freqs[:ans_hapl_count]]
    report_dict["ECP"] = [x[0] for x in predictions_closest_to_answer[:ans_hapl_count]]
    report_dict["ECT"] = [x[0] for x in answer_closest_to_prediction[:pred_hapl_count]]
    report_dict["FCP"] = [x for x in predictions_closest_to_answer_freqs[:ans_hapl_count]]
    report_dict["EEV"] = [x for x in freq_adjusted_mismatches[:ans_hapl_count]]
    report_dict["PCA"] = [x[1][0] for x in predictions_closest_to_answer]
    report_dict["ACP"] = [x[1][0] for x in answer_closest_to_prediction]
    report_dict["ADC"] = get_adc(predictions_closest_to_answer, answer_hapls.freqs)
    report_dict["APE"] = get_adc(answer_closest_to_prediction, prediction_hapls.freqs)
    report_dict["UADC"] = get_adc(predictions_closest_to_answer, answer_freqs_unif)
    report_dict["UAPE"] = get_adc(answer_closest_to_prediction, pred_freqs_unif)
    json.dump(report_dict, sys.stdout)
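# A minimal sketch (not from the original code) of the emd() interface report()
# above assumes -- the pyemd package at https://github.com/garydoranjr/pyemd, where
# distance='precomputed' takes a ground-distance matrix D and return_flows=True
# yields (distance, flows). All numbers here are made up.
import numpy as np
from emd import emd  # assumption: this import provides the emd() used above

D = np.array([[0., 1., 2.],
              [1., 0., 1.],
              [2., 1., 0.],
              [3., 2., 1.]])
x_w = np.array([0.4, 0.3, 0.2, 0.1])  # hypothetical prediction frequencies
y_w = np.array([0.5, 0.3, 0.2])       # hypothetical answer frequencies
dist, flows = emd(X=np.ones(4), Y=np.ones(3), X_weights=x_w, Y_weights=y_w,
                  distance='precomputed', D=D, return_flows=True)
print(dist)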
def main():
    features1 = [Feature(100, 40, 22), Feature(211, 20, 2),
                 Feature(32, 190, 150), Feature(2, 100, 100)]
    weights1 = [0.4, 0.3, 0.2, 0.1]
    features2 = [Feature(0, 0, 0), Feature(50, 100, 80), Feature(255, 255, 255)]
    weights2 = [0.5, 0.3, 0.2]
    print emd((features1, weights1), (features2, weights2), distance)
def arc_emd_choice(t, y, method='spline'):
    """
    Denoise the data in y by returning the intrinsic mode (or residual)
    with the largest variance as found using empirical mode decomposition.

    Parameters
    ----------
    t : 1D array-like
        Sample locations corresponding to y.
    y : 1D array-like
        The data to be denoised.
    method : {'spline'|'saw'}
        Which intrinsic mode identification process to employ.

    Result
    ------
    y_denoised : 1D array
        The denoised data.
    """
    if method == 'spline':
        modes, residual = emd.emd(t, y)
    if method == 'saw':
        modes, residual = emd.saw_emd(t, y)
    choices = np.append(modes, residual[:, np.newaxis], axis=1)
    stds = np.var(choices, axis=0)
    i_choice = np.argmax(stds)
    return choices[:, i_choice]
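# A hedged usage sketch for arc_emd_choice: decompose a noisy sine and keep the
# highest-variance component. Assumes an emd module whose emd.emd(t, y) returns
# (modes, residual), as the function above expects.
import numpy as np

t = np.linspace(0, 10, 500)
y = np.sin(2 * np.pi * t) + 0.3 * np.random.randn(500)
y_denoised = arc_emd_choice(t, y, method='spline')
print(y_denoised.shape)  # (500,)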
def emd_nd(u, v):
    """
    Computes Earth Mover's Distance in N dimensions.

    Uses https://github.com/garydoranjr/pyemd
    The inputs are log-space distributions, so exponentiate them back to
    probability space before computing the EMD.
    """
    return emd(np.exp(u), np.exp(v))
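# A toy call of emd_nd (illustrative only): the inputs are log-space arrays, so
# they are exponentiated before being handed to the pyemd-style emd(X, Y), which
# treats the rows of X and Y as points with uniform weights.
import numpy as np

u = np.log(np.array([[0.2, 0.8]]))
v = np.log(np.array([[0.5, 0.5]]))
# print(emd_nd(u, v))  # requires the garydoranjr/pyemd emd() import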
def calc_similarity(comparisonpair, distance=cosine_distance, cosine_adjustment=True):
    # each element of comparisonpair is a (word_vector_array, BOW_weights) signature
    word_vectors1, word_weights1 = comparisonpair[0]
    word_vectors2, word_weights2 = comparisonpair[1]
    # if both signatures are identical, the two users are the same
    if (word_vectors1 == word_vectors2) and (word_weights1 == word_weights2):
        return 1.0
    else:
        # calculate the earth mover's distance (EMD) between two 'signatures'
        # (generalized distributions) with the given ground distance.
        # signature format: (list of vectors [number of vectors x embedding
        # dimension], list of their weights)
        emd_result = emd((word_vectors1, word_weights1),
                         (word_vectors2, word_weights2), distance)
        if cosine_adjustment:
            # cosine distance lies in [0, 2], so map the EMD output to [0, 1]
            similarity = 1.0 - emd_result / 2.0
        else:
            # take the reciprocal for an estimate of the similarity (instead of
            # distance) to adjust for euclidean distance
            similarity = 1.0 / float(emd_result)
        return similarity
def calculate_emd(hidict, endict, ensentence, hindisentence):
    x = []
    y = []
    for word in ensentence:
        word = word.lower()
        if word not in ('!', '.', ':', ';', ','):
            try:
                x.append(endict[word])
            except KeyError:  # word has no embedding
                continue
    for word in hindisentence:
        if word not in ('!', '.', ':', ';', ','):
            try:
                y.append(hidict[word])
            except KeyError:  # word has no embedding
                continue
    distance = 99
    if len(y) > 0 and len(x) > 0:
        distance = emd(np.array(x), np.array(y))
    return distance
def emd_nd(u, v):
    tot = 0
    U = np.reshape(u, (-1, 101))
    V = np.reshape(v, (-1, 101))
    for s in zip(U, V):
        tot += emd(np.atleast_2d(s[0]), np.atleast_2d(s[1]))
    return tot
def WMD(document1, document2, embeddings):
    '''
    Compute WMD.

    Input:
        document1: List of words.
        document2: List of words.
        embeddings: word2vec embeddings of words.

    Returns:
        WMD between documents, float.
    '''
    # Compute nBOW representation of documents.
    d1 = nBOW(document1)
    d2 = nBOW(document2)
    # Get features.
    features1 = [tuple(embeddings[token]) for token in document1]
    features2 = [tuple(embeddings[token]) for token in document2]
    # Pad the shorter document with zero vectors of zero weight.
    if len(features1) > len(features2):
        diff = abs(len(features1) - len(features2))
        d2.extend([0] * diff)
        features2.extend([tuple([0] * len(features1[0]))] * diff)
    elif len(features1) < len(features2):
        diff = abs(len(features1) - len(features2))
        d1.extend([0] * diff)
        features1.extend([tuple([0] * len(features2[0]))] * diff)
    # Return WMD. (Bug fixes: the second signature used d1 instead of d2, and the
    # padding appended a single flattened tuple instead of diff zero vectors.)
    return emd((features1, d1), (features2, d2), distance)
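# A quick illustration of why the padding fix above matters: extending with diff
# copies of a zero tuple keeps features2 a list of equal-length vectors, whereas
# the original append(tuple(...) * diff) appended one flattened tuple of the
# wrong shape.
diff, dim = 2, 3
print([tuple([0] * dim)] * diff)   # [(0, 0, 0), (0, 0, 0)] -- two zero vectors
print(tuple([0] * dim) * diff)     # (0, 0, 0, 0, 0, 0)     -- one 6-tuple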
def solve_by_emd():
    """Solve the problem where the whole mass must be moved."""
    ltheta = DIM * [1, ]
    return emd((lP, fw1), (lQ, fw2),
               lambda a, b: float(dist_for_emd(a, b, ltheta)))
def get_dist(n_samples):
    mu_X, mu_Y = 1.0, -1.0
    X = np.random.randn(n_samples, 2) + mu_X
    Y = np.random.randn(n_samples, 2) + mu_Y
    return emd(X, Y)
def WMD_bt_queryAndLib(text, Pos, X, BOW_X):
    # X: w2v vectors matrix; the i-th column is the i-th document's w2v vectors matrix
    # BOW_X: BOW (word frequency) vectors matrix; the i-th column is the i-th
    #        document's BOW vectors matrix
    # words: stores the documents without repeating words
    # with open('src/STDvectors.pk') as f:
    #     [Pos, X, BOW_X, words] = pickle.load(f)
    n = np.shape(X)
    n = n[0]  # number of documents
    Dist = []
    (Fs, wordOrders, weight) = Weight(segmentation(text.lower()).split())
    text_BOWVec = weight.tolist()  # BOW word vector of text
    text_w2vVec = Fs.T.T.tolist()  # w2v word vector of text
    for j in xrange(n):
        # calculate the EMD of the two documents
        emdDist = emd((text_w2vVec, text_BOWVec),
                      (X[j].T.tolist(), BOW_X[j].tolist()), distance)
        Dist.append((Pos[j], emdDist))
        if emdDist == 0:
            # already found the same name in the standard library; break out of the loop
            break
    sort_Dist = sorted(Dist, key=lambda d: d[1])
    print '-----------------------------------------------'
    for i in range(min(5, len(sort_Dist))):
        print '%f %s' % (sort_Dist[i][1], sort_Dist[i][0].encode('utf-8'))
    return sort_Dist[0][0]
def WMD(document1, document2, embeddings):
    '''
    Compute WMD.

    Input:
        document1: List of words.
        document2: List of words.
        embeddings: word2vec embeddings of words.

    Returns:
        WMD between documents, float.
    '''
    # Compute nBOW representation of documents.
    d1 = nBOW(document1)
    d2 = nBOW(document2)
    # Get features.
    features1 = [tuple(embeddings[token]) for token in document1]
    features2 = [tuple(embeddings[token]) for token in document2]
    # Pad the shorter document with zero vectors of zero weight.
    if len(features1) > len(features2):
        diff = abs(len(features1) - len(features2))
        d2.extend([0] * diff)
        features2.extend([tuple([0] * len(features1[0]))] * diff)
    elif len(features1) < len(features2):
        diff = abs(len(features1) - len(features2))
        d1.extend([0] * diff)
        features1.extend([tuple([0] * len(features2[0]))] * diff)
    # Return WMD (same padding and d2 fixes as the earlier copy of this function).
    return emd((features1, d1), (features2, d2), distance)
def WMD_bt_queryAndLib(query, Pos, X, Weight_X):
    # X: w2v vectors matrix; the i-th column is the i-th document's w2v vectors matrix (list type)
    # Weight_X: weight vectors matrix; the i-th column is the i-th document's TF vectors matrix (list type)
    n = np.shape(X)
    n = n[0]  # number of documents
    Dist = []
    query = clean(query)
    (Fs, weight) = generateVec(segmentation(query.lower()).split())
    query_weightVec = weight.tolist()  # weight word vector of query
    query_w2vVec = Fs.T.T.tolist()     # w2v word vector of query
    for j in xrange(n):
        # calculate the EMD of the two documents
        emdDist = emd((query_w2vVec, query_weightVec),
                      (X[j].T.tolist(), Weight_X[j].tolist()), distance)
        if math.isnan(emdDist):
            # NOTICE: this guard used to be missing and triggered a bug
            emdDist = 9999
        Dist.append((Pos[j], emdDist))
    sort_Dist = sorted(Dist, key=lambda d: d[1])
    print '-----------------------------------------------'
    for i in range(min(5, len(sort_Dist))):
        print '%f %s' % (sort_Dist[i][1], sort_Dist[i][0].encode('utf-8'))
    return sort_Dist
def pre_phi_future(current, future, current_part, future_part, state, tpm, base=2):
    whole_rep = iit.effect_repertoire(current, future, state, tpm, base)
    part1_state = iit.convert_to_subset(state, current_part, base)
    part1_rep = iit.effect_repertoire(current_part, future_part, part1_state, tpm, base)
    part2_state = iit.convert_to_subset(state, current - current_part, base)
    part2_rep = iit.effect_repertoire(current - current_part, future - future_part,
                                      part2_state, tpm, base)
    partitioned_rep = iit.multiply_repertoires(future_part, future - future_part,
                                               part1_rep, part2_rep, base)
    future_nnodes = len(future)
    d = np.array(range(2**future_nnodes))
    # binary expansion of each state index, one row per state
    locs = ((d[:, None] & (1 << np.arange(future_nnodes - 1, -1, -1))) > 0).astype(int)
    return emd(locs, locs, whole_rep, partitioned_rep, distance='cityblock')
def main():
    features1 = [Feature(100, 40, 22), Feature(211, 20, 2),
                 Feature(32, 190, 150), Feature(2, 100, 100)]
    weights1 = [0.4, 0.3, 0.2, 0.1]
    features2 = [Feature(0, 0, 0), Feature(50, 100, 80), Feature(255, 255, 255)]
    weights2 = [0.5, 0.3, 0.2]
    print emd((features1, weights1), (features2, weights2), distance)
def populate_EMD_chunk(coordinates):
    with open("out", 'a') as log:
        log.write("%s: populating (%d, %d) through (%d, %d)...\n"
                  % (datetime.now().isoformat(),
                     coordinates[0][0], coordinates[0][1],
                     coordinates[-1][0], coordinates[-1][1]))
    return [(i, j, emd(features_by_bid[business_ids[i]],
                       features_by_bid[business_ids[j]]))
            for i, j in coordinates]
def main():
    features1 = check_rect(get_features('emd_mxye.pkl'))
    # weight of each element in features1 (1.0 avoids Python 2 integer division)
    weight1 = [1.0 / len(features1) for i in range(len(features1))]
    features2 = check_rect(get_features('emd_nhwd.pkl'))
    weight2 = [1.0 / len(features2) for i in range(len(features2))]
    print "result"
    R = emd((features1, weight1), (features2, weight2), Distance)
    print "Result is", R
def cei(subset, state, tpm, base=2):
    '''
    This function calculates the cause-effect information resulting from the
    state of a particular subset being known.

    subset is the set of nodes for which the state is known.
    state is an integer which describes the state of these nodes.
    tpm is the 2^n x n transition probability matrix.
    '''
    nnodes = np.size(tpm, 1)
    full_set = set(range(nnodes))
    f_uncon = iit.uncon_effect_repertoire(tpm, base)
    p_uncon = iit.uncon_cause_repertoire(nnodes, base)
    f = iit.effect_repertoire(subset, full_set, state, tpm, base)
    p = iit.cause_repertoire(subset, full_set, state, tpm, base)
    # locations are the binary expansions of the state indices, so the cityblock
    # ground distance equals the Hamming distance between states
    d = np.array(range(2**nnodes))
    locs = ((d[:, None] & (1 << np.arange(nnodes - 1, -1, -1))) > 0).astype(int)
    cause_information = emd(locs, locs, p_uncon, p, distance='cityblock')
    effect_information = emd(locs, locs, f_uncon, f, distance='cityblock')
    return np.minimum(cause_information, effect_information)
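# A quick check of the locs construction used in cei: for nnodes = 2 each row is
# the binary expansion of a state index, so the cityblock ground distance between
# rows equals the Hamming distance between states.
import numpy as np

nnodes = 2
d = np.array(range(2**nnodes))
locs = ((d[:, None] & (1 << np.arange(nnodes - 1, -1, -1))) > 0).astype(int)
print(locs)  # [[0 0] [0 1] [1 0] [1 1]]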
def WMD_bt_2texts(text1, text2):
    (Fs, wordOrders, weight) = Weight(segmentation(text1.lower()).split())
    text1_BOWVec = weight.tolist()  # BOW word vector of text1
    text1_w2vVec = Fs.T.T.tolist()  # w2v word vector of text1
    (Fs, wordOrders, weight) = Weight(segmentation(text2.lower()).split())
    text2_BOWVec = weight.tolist()  # BOW word vector of text2
    text2_w2vVec = Fs.T.T.tolist()  # w2v word vector of text2
    # calculate the EMD of the two documents
    Dist = emd((text1_w2vVec, text1_BOWVec), (text2_w2vVec, text2_BOWVec), distance)
    return Dist
def get_wmd(ix):
    n = np.shape(X)
    n = n[0]
    Di = np.zeros((1, n))
    i = ix
    print '%d out of %d' % (i, n)
    for j in xrange(i):
        Di[0, j] = emd((X[i], BOW_X[i]), (X[j], BOW_X[j]), distance)
    return Di
def earth_mover(points1, points2, normer=np.sum):
    xs1 = np.linspace(0, 1, len(points1), endpoint=True)[np.array(np.isfinite(points1))]
    xs2 = np.linspace(0, 1, len(points2), endpoint=True)[np.array(np.isfinite(points2))]
    points1 = points1[np.isfinite(points1)]
    points2 = points2[np.isfinite(points2)]
    return emd.emd(xs1, xs2, points1 / normer(points1), points2 / normer(points2))
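# A small sketch of the alignment step in earth_mover: NaN entries are dropped
# from the positions and the weights together, then the weights are normalized.
# The emd.emd(xs1, xs2, w1, w2) call itself is whatever 1-D EMD package the
# snippet imports, so only the preprocessing is shown here.
import numpy as np

points1 = np.array([0.2, np.nan, 0.5, 0.3])
xs1 = np.linspace(0, 1, len(points1), endpoint=True)[np.isfinite(points1)]
finite = points1[np.isfinite(points1)]
print(xs1)                       # positions of the finite samples
print(finite / np.sum(finite))   # weights normalized to sum to 1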
def wae_loss(x, x_hat, mu, logvar, batch_size=128):
    # KL divergence
    KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    # Normalise by the same number of elements as in the reconstruction
    KLD /= batch_size
    EMDist = emd(x, x_hat)
    return EMDist + KLD
def AdvancedKernelEMD(X1, X1_labels, X2, X2_labels, dist):
    sz1 = np.shape(X1)[0]
    sz2 = np.shape(X2)[0]
    D = np.zeros((sz1, sz2))
    for i in range(0, sz1):
        for j in range(i, sz2):
            s1 = X1[i]
            X1_positive = s1[X1_labels[i] == 1]
            X1_negative = s1[X1_labels[i] == 0]
            s2 = X2[j]
            X2_positive = s2[X2_labels[j] == 1]
            X2_negative = s2[X2_labels[j] == 0]
            D[i, j] = (emd(X1_positive, X2_positive, distance=dist)
                       + emd(X1_negative, X2_negative, distance=dist))
    # mirror the upper triangle to make D symmetric
    D = D + np.transpose(np.triu(D, k=1))
    return D
def get_wmd(i):
    Di = np.zeros((1, n))
    for j in range(n):
        if len(X[i]) > 0 and len(X[j]) > 0:
            Di[0, j] = emd(X[i], X[j], X_weights=BOW_X[i], Y_weights=BOW_X[j])
        else:
            Di[0, j] = 2.0
    return Di
def calc_emd(G_orig, G_anon):
    """
    Calc the Earth Mover's distance of the degree distribution between the
    original graph and the sanitized one.

    PyEMD package required! https://github.com/garydoranjr/pyemd

    :param G_orig: the original graph
    :param G_anon: the sanitized graph
    :return: the EMD between the two degree distributions
    """
    assert (G_orig.number_of_nodes() == G_anon.number_of_nodes())
    return emd(G_orig.degree().items(), G_anon.degree().items())
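# A hedged usage sketch for calc_emd: under networkx 1.x, G.degree() returns a
# dict, so .items() yields (node, degree) pairs for the pyemd call above; under
# networkx 2.x you would need dict(G.degree()).items() instead.
import networkx as nx

G1 = nx.path_graph(5)
G2 = nx.cycle_graph(5)
# print(calc_emd(G1, G2))  # requires the emd import used by the snippet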
def cal_sentence_distance(sentence1_word_vector_list, sentence1_word_freq_list,
                          sentence2_word_vector_list, sentence2_word_freq_list):
    start = time()
    sentence_distance = emd(
        (sentence1_word_vector_list, sentence1_word_freq_list),
        (sentence2_word_vector_list, sentence2_word_freq_list),
        distance)
    end = time()
    # print 'cal time :', end - start
    return sentence_distance
def ComputeKernelEMD(X1, X2, dist):
    sz1 = np.shape(X1)[0]
    sz2 = np.shape(X2)[0]
    D = np.zeros((sz1, sz2))
    for i in range(0, sz1):
        for j in range(i, sz2):
            D[i, j] = emd(X1[i], X2[j], distance=dist)
    # mirror the upper triangle to make D symmetric
    D = D + np.transpose(np.triu(D, k=1))
    return D
def ComputeKernelEMD1D(X1, X2, dist):
    sz1 = np.shape(X1)[0]
    sz2 = np.shape(X2)[0]
    D = np.zeros((sz1, sz2))
    for i in range(0, sz1):
        for j in range(i, sz2):
            D[i, j] = emd(X1[i], X2[j], distance=dist)
    D = np.squeeze(D)
    return D
def earth_mover(points1, points2):
    xs1 = np.linspace(0, 1, len(points1), endpoint=True)[np.array(np.isfinite(points1))]
    xs2 = np.linspace(0, 1, len(points2), endpoint=True)[np.array(np.isfinite(points2))]
    points1 = points1[np.isfinite(points1)]
    points2 = points2[np.isfinite(points2)]
    return emd.emd(xs1, xs2, points1 / np.sum(points1), points2 / np.sum(points2))
def WMD_bt_2texts(text1, text2):
    (Fs, weight) = generateVec(segmentation(text1.lower()).split())
    text1_BOWVec = weight.tolist()  # BOW word vector of text1
    text1_w2vVec = Fs.T.T.tolist()  # w2v word vector of text1
    (Fs, weight) = generateVec(segmentation(text2.lower()).split())
    text2_BOWVec = weight.tolist()  # BOW word vector of text2
    text2_w2vVec = Fs.T.T.tolist()  # w2v word vector of text2
    # calculate the EMD of the two documents
    Dist = emd((text1_w2vVec, text1_BOWVec), (text2_w2vVec, text2_BOWVec), distance)
    if math.isnan(Dist):
        Dist = 9999
    return Dist
def ensemble_process(x, data_length, max_modes, max_siftings, noise_std,
                     ensembles_per_process, output):
    imfs = np.zeros((max_modes + 1, data_length))
    for i in range(ensembles_per_process):
        noise = np.multiply(np.random.randn(data_length), noise_std)
        noise_assisted_data = np.add(x, noise)
        ensemble = emd.emd(noise_assisted_data, max_modes, max_siftings)
        imfs = np.add(imfs, ensemble)
    output.put(imfs)
def get_all_emd(grouped_chrm, type):
    for comb in combs:
        s1 = [getattr(v, type) for v in grouped_chrm.windows[comb[0]].variants]
        s2 = [getattr(v, type) for v in grouped_chrm.windows[comb[1]].variants]
        s1 = [abs(v - 0.5) for v in s1]
        s2 = [abs(v - 0.5) for v in s2]
        if s1 != [] and s2 != []:
            p = scipy.stats.ks_2samp(s1, s2).pvalue
            emd_object = emd_obj(comb, emd.emd(emd.cumsum(s1), emd.cumsum(s2)),
                                 p * bonf_corr)
        else:
            emd_object = emd_obj(comb, None, None)
        all_emd.append(emd_object)
def ComputeKernelEMD1D(X1, X2, dist):
    sz1 = np.shape(X1)[0]
    sz2 = np.shape(X2)[0]
    D = np.zeros((sz1, sz2))
    for i in range(0, sz1):
        for j in range(i, sz2):
            print i, j
            startT = time.time()
            D[i, j] = emd(X1[i], X2[j], distance=dist)
            endT = time.time() - startT
            # print 'EMD took ' + str(endT) + ' seconds.'
    # D2 = np.exp((-1 / np.mean(D[np.nonzero(D)])) * D)
    return D
def emdist(a, b, distance_function):
    """
    Return the Earth Mover's distance between two weighted point sets.

    :type a: numpy matrix, column 0 holds the weights, the rest the features
    :type b: numpy matrix, same layout as a
    :type distance_function: function, the ground distance between two points
    """
    w1, w2 = a[:, 0], b[:, 0]
    feature1, feature2 = a[:, 1:], b[:, 1:]
    feature1 = (feature1.tolist(), w1.tolist())
    feature2 = (feature2.tolist(), w2.tolist())
    return emd(feature1, feature2, distance_function)
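# A toy sketch of the layout emdist expects: column 0 of each array holds the
# point weights and the remaining columns hold the coordinates. The euclidean
# ground distance below is illustrative, not the original's.
import numpy as np

a = np.array([[0.5, 0.0, 0.0],
              [0.5, 1.0, 0.0]])   # two points with weight 0.5 each
b = np.array([[1.0, 0.5, 0.0]])   # one point with weight 1.0
euclid = lambda p, q: float(np.linalg.norm(np.array(p) - np.array(q)))
# print(emdist(a, b, euclid))  # requires the signature-style emd import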
def calculateEMDMetric(histContainer1, histContainer2, angles, distanceFcn):
    emdMetricWT = {}
    for fileName1, histVal1 in histContainer1:
        for fileName2, histVal2 in histContainer2:
            # take the minimum EMD over all rotations of the angle histogram
            minVal = 1000000000
            for rotNum in range(len(angles)):
                anglesR = rotate(angles, rotNum)
                cVal = emd((angles, histVal1.tolist()),
                           (anglesR, histVal2.tolist()), distanceFcn)
                if cVal < minVal:
                    minVal = cVal
            if emdMetricWT.get(fileName1) is None:
                emdMetricWT[fileName1] = []
            emdMetricWT[fileName1].append((fileName2, minVal))
    return emdMetricWT
def get_wmd(ix):
    # calculate the WMD distance between documents
    print '***', ix
    n = np.shape(X)
    n = n[0]  # number of documents
    Di = np.zeros((1, n))  # (1 x NoOfDocs) matrix
    i = ix
    print '%d out of %d' % (i, n)
    for j in xrange(i):
        print '***'
        print 'X[i] Size = ', np.shape(X[i])
        print 'X[j] Size = ', np.shape(X[j])
        print 'BOW_X[i] Size = ', np.shape(BOW_X[i])
        print 'BOW_X[j] Size = ', np.shape(BOW_X[j])
        # calculate the EMD of the two documents
        Di[0, j] = emd((X[i], BOW_X[i]), (X[j], BOW_X[j]), distance)
    print Di
    return Di
def calculate_emd(signature1, signature2):
    D = calculate_distance_matrix(signature1, signature2,
                                  ground_distance=GROUND_DISTANCES["euclidean"])
    signature1_points = [cl[0] for cl in signature1]
    signature1_weights = [cl[2] for cl in signature1]
    signature1_weights = np.array(map(lambda x: float(x) / sum(signature1_weights),
                                      signature1_weights))
    signature2_points = [cl[0] for cl in signature2]
    signature2_weights = [cl[2] for cl in signature2]
    signature2_weights = np.array(map(lambda x: float(x) / sum(signature2_weights),
                                      signature2_weights))
    return emd(signature1_points, signature2_points,
               X_weights=signature1_weights, Y_weights=signature2_weights,
               distance='precomputed', D=D)
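# A minimal sketch of the distance='precomputed' pattern used above, assuming the
# garydoranjr/pyemd interface: build the ground-distance matrix yourself (here
# with scipy's cdist) and pass it through D.
import numpy as np
from scipy.spatial.distance import cdist

pts1 = np.array([[0.0, 0.0], [1.0, 0.0]])
pts2 = np.array([[0.0, 1.0]])
w1 = np.array([0.5, 0.5])
w2 = np.array([1.0])
D = cdist(pts1, pts2)  # euclidean ground distances
# emd(pts1, pts2, X_weights=w1, Y_weights=w2, distance='precomputed', D=D)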
def calWD(d1, d2):
    st = time.time()
    # 6 conv layers, generating random parameters
    n_layers = int(np.log2(64))
    n_features = [64, 128, 256, 512, 512, 4]
    r_convs = range(n_layers)
    # number of images
    size = d1.shape[0]
    D = tf.placeholder(tf.float32, [d1.shape[0], d1.shape[1], d1.shape[2], d1.shape[3]])
    D2 = tf.placeholder(tf.float32, [d1.shape[0], d1.shape[1], d1.shape[2], d1.shape[3]])
    with tf.Session() as sess:
        # init_op = tf.global_variables_initializer()
        # sess.run(init_op)
        # convolution ops would go here; the stride determines how much the image is downscaled
        resultP6 = D
        resultQ6 = D2
        Q = sess.run(resultP6, feed_dict={D: d1})
        P = sess.run(resultQ6, feed_dict={D2: d2})
    Q = Q.reshape(size, -1)
    P = P.reshape(size, -1)
    print Q.shape
    t = time.time() - st
    # odd return value: the trailing comma makes dis a 1-tuple (value,)
    dis = emd(Q, P),
    # print 'spend time: %0.2f' % (t)
    return dis[0]
def get_score(s1, s2, model):
    s1 = re.sub('[^a-zA-Z\s]+', ' ', s1)
    s2 = re.sub('[^a-zA-Z\s]+', ' ', s2)
    set1 = [word for word in set(s1.strip().lower().split() + s1.strip().split())
            if word in model]
    set2 = [word for word in set(s2.strip().lower().split() + s2.strip().split())
            if word in model]
    c1 = collections.Counter(s1.strip().split() + s1.lower().strip().split())
    c2 = collections.Counter(s2.strip().split() + s2.lower().strip().split())
    w1 = np.array([c1[word] * 1.00 for word in set1])
    w2 = np.array([c2[word] * 1.00 for word in set2])
    w1 /= sum(w1)
    w2 /= sum(w2)
    v1 = [model[word] * 1.00 for word in set1]
    v2 = [model[word] * 1.00 for word in set2]
    return emd(v1, v2, X_weights=w1, Y_weights=w2)
def metric_emd_sig(vec1, vec2):
    ndim = 4
    nclusters = vec1.shape[0] // ndim
    vec1 = vec1.reshape(ndim * nclusters, 1)
    vec2 = vec2.reshape(ndim * nclusters, 1)
    sig1 = vec1.reshape((ndim, nclusters)).T
    sig2 = vec2.reshape((ndim, nclusters)).T
    X_weights = np.empty((nclusters, 1))
    Y_weights = np.empty((nclusters, 1))
    X = np.empty((nclusters, ndim - 1))
    Y = np.empty((nclusters, ndim - 1))
    X_weights[:, 0] = sig1[:, 0]
    Y_weights[:, 0] = sig2[:, 0]
    X[:, 0:ndim - 1] = sig1[:, 1:ndim]
    Y[:, 0:ndim - 1] = sig2[:, 1:ndim]
    # The next lines remove the biggest cluster, assuming it represents the background.
    max_index = np.argmax(X_weights)
    X_weights_red = np.delete(X_weights, max_index, axis=0)
    X_weights_red = X_weights_red / np.sum(X_weights_red, axis=0)
    X_red = np.delete(X, max_index, axis=0)
    max_index = np.argmax(Y_weights)
    Y_weights_red = np.delete(Y_weights, max_index, axis=0)
    Y_weights_red = Y_weights_red / np.sum(Y_weights_red, axis=0)
    Y_red = np.delete(Y, max_index, axis=0)
    distance = emd(X_red, Y_red, X_weights_red, Y_weights_red, distance='euclidean')
    # distance = emd(X, Y, X_weights, Y_weights, distance='euclidean')
    return distance
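# A quick illustration of the flat signature layout metric_emd_sig assumes: the
# vector stores the weight block first, then each coordinate block, so
# reshape((ndim, nclusters)).T gives one (weight, x, y, z) row per cluster.
# The numbers are made up.
import numpy as np

ndim, nclusters = 4, 3
vec = np.arange(ndim * nclusters, dtype=float)
sig = vec.reshape((ndim, nclusters)).T
print(sig)  # row i is (weight_i, x_i, y_i, z_i): [[0 3 6 9] [1 4 7 10] [2 5 8 11]]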
# (truncated snippet: the head is missing; feature1, feature2 and d are defined
# above the cut, and the elided literal is a long uniform weight vector whose
# entries all equal 0.005434782608695652)
#     ..., 0.005434782608695652, 0.005434782608695652])
st = time.time()
print emd(feature1, feature2, d)
print (time.time() - st) * 1000
del content[len(content) - 1]
label = []
for i in range(0, len(content)):
    content[i] = content[i].split(":")
    label.append(content[i][0])
    content[i][1] = content[i][1].split(";")
    if len(content[i][1]) != 16:
        print(content[i][0])
    for j in range(0, len(content[i][1])):
        content[i][1][j] = int(content[i][1][j])

# Build distance matrix
weights = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0]
distance_matrix = np.empty([len(content), len(content)])
for i in range(0, len(content)):
    for j in range(i, len(content)):
        distance = emd.emd(content[i][1], content[j][1], weights, weights)
        distance_matrix[i][j] = distance
        distance_matrix[j][i] = distance

link_matrix = linkage(distance_matrix, method='weighted')
plt.figure()
plt.title("Dendrogram of " + str(len(label)) + " teams")
plt.xlabel("teams")
plt.ylabel("distance")
# distinct name so the imported dendrogram function is not shadowed
dn = dendrogram(link_matrix, labels=label, leaf_rotation=30., leaf_font_size=5.)
plt.savefig("/robocup/implementation/dendro")

max_d = 0.5
clusters = fcluster(link_matrix, max_d, criterion='distance')
file = open("clusters", "w")
for i in range(0, len(content)):
    file.write(content[i][0] + " : " + str(clusters[i]) + "\n")
file.close()
def solve_by_emd():
    """Solve the problem where the whole mass must be moved."""
    ltheta = DIM * [1, ]
    return emd((lP, fw1), (lQ, fw2),
               lambda a, b: float(dist_for_emd(a, b, ltheta)))
def hist_compare(self):
    print 'BANDS', self.bands
    bins = 32
    emd_list = []
    out_dir = os.path.join(self.plot_dir, self.name)
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    hist_dir = os.path.join(out_dir, 'histograms')
    if not os.path.exists(hist_dir):
        os.mkdir(hist_dir)
    archaeology = self.arc
    background = self.bac
    minima = np.min(archaeology)
    if minima > np.min(background):
        minima = np.min(background)
    maxima = np.max(archaeology)  # bug fix: was np.min(archaeology)
    if maxima < np.max(background):
        maxima = np.max(background)
    hist_arch = np.histogram(archaeology, bins=bins, range=(minima, maxima))
    hist_back = np.histogram(background, bins=bins, range=(minima, maxima))
    print 'Totals'
    print 'hist_arch', np.sum(hist_arch[0])
    print archaeology.shape
    hist_arch_norm = np.true_divide(hist_arch[0], archaeology.shape)
    hist_back_norm = np.true_divide(hist_back[0], background.shape)
    os.chdir(hist_dir)
    sum_of_difference = np.sum(np.abs(hist_arch_norm - hist_back_norm))
    print sum_of_difference
    contrast_emd = emd.emd(range(bins), range(bins), hist_arch_norm, hist_back_norm)
    print 'EMD', contrast_emd
    emd_list.append(contrast_emd)
    emd_comp = np.array(emd_list)
    print emd_comp.shape
    os.chdir(self.plot_dir)
    np.savetxt(self.name + '_emd.txt', emd_comp, delimiter=',')
def regions_distance(r_features, r_weigths):
    if len(r_features) >= MAX_EMD_POINTS:
        return 1e20
    return emd((query_num, map(float, weights)),
               (r_features, map(float, r_weigths)),
               lambda a, b: float(dist_for_emd(a, b, ltheta)))
def discoverfks(self, theta):
    # phase 1
    fs = []
    fm = []
    # bksketches will contain bottom-k sketches for each column, indexed on
    # (<schemaname>, <tablename>, <columnname>)
    bksketches = {}
    quantiles = {}
    s = {}
    # calculate the bottom-k sketch for all columns and store in <bksketches>
    for column in self.columns:
        bksketches[(column.db_schema, column.tablename, column.columnname)] = \
            self.bottomksketch(self.getDataFn(column.db_schema, column.columnname,
                                              column.tablename))
    pkall = self.pksingle
    pkall.extend(self.pkmulti)
    for pk in pkall:  # foreach primary key (single and multi)
        pkcollst = pk.db_columns.split(self.colseparator)
        n = len(pkcollst)
        for keycolumn_name in pkcollst:  # foreach column in primary key
            for candidate in self.columns:  # foreach column as foreign key candidate
                this = bksketches[(candidate.db_schema, candidate.tablename,
                                   candidate.columnname)]
                that = bksketches[(pk.db_schema, pk.tablename, keycolumn_name)]
                if self.inclusion(this, that) >= theta and (candidate.tablename != pk.tablename):
                    if n == 1:  # single column pk
                        fs.append(([candidate], pk))
                    if n > 1:  # multi column pk
                        if (pk.db_columns, keycolumn_name) not in s:
                            s[(pk.db_columns, keycolumn_name)] = []
                        # dictionary s indexes on (<pk name>, <pk column>) where the pk
                        # name is generic (can be just a concatenation of the column
                        # names), e.g.: ('id|name', 'id') and ('id|name', 'name')
                        # indicate the two entries in s for PK 'id|name'. For each
                        # entry we store a list of candidate columns found in other
                        # tables.
                        s[(pk.db_columns, keycolumn_name)].append(candidate)
        if n > 1:
            bksketches[(pk.db_schema, pk.tablename, pk.db_columns)] = \
                self.bottomksketch(self.getDataFn(
                    pk.db_schema, pk.db_columns.split(self.colseparator), pk.tablename))
            quantiles[(pk.db_schema, pk.tablename, pk.db_columns)] = \
                self.quantilehistogram(self.getDataFn(
                    pk.db_schema, pk.db_columns.split(self.colseparator), pk.tablename))

    # phase 2
    # fks: dictionary that indexes on (<foreignkey table>, <primary key column>);
    # the value is the list of candidate columns in <foreignkey table> for
    # <primary key column>
    # TBD: remove the table loop
    fks = {}
    for kvp in s:
        spkcolname = kvp[1]
        for e in s[kvp]:
            key = (e.tablename, spkcolname)
            if key not in fks:
                fks[key] = []
            fks[key].append(e)
    for pkm in self.pkmulti:
        pkcollst = pkm.db_columns.split(self.colseparator)
        print(pkm)
        # for each table in the database, check if we have candidates in fks for
        # this PK; if we do, take the cartesian product and store it in fm
        for table in self.tables:
            tname = table.tablename
            L = []
            for pkcolumn in pkcollst:
                key = (tname, pkcolumn)
                if key not in fks:
                    continue
                L.append(fks[key])
            if len(L) == len(pkcollst):
                cart = self.cartesian(L)
                for prod in cart:
                    fm.append((prod, pkm))
    for flst, pk in fm:
        pkcollst = pk.db_columns.split(self.colseparator)
        fcols = [c.columnname for c in flst]
        fschema = flst[0].db_schema  # TBD: ugly indices here
        ftable = flst[0].tablename   # TBD: and here
        fsample = self.bottomksketch(self.getDataFn(fschema, fcols, ftable))
        if self.inclusion(fsample, bksketches[(pk.db_schema, pk.tablename, pk.db_columns)]) >= theta:
            quantiles[(pk.db_schema, pk.tablename, pk.db_columns)] = \
                self.quantilehistogram(self.getDataFn(
                    pk.db_schema, pk.db_columns.split(self.colseparator), pk.tablename))
            quantiles[(fschema, ftable, "|".join(fcols))] = \
                self.quantilehistogram(self.getDataFn(fschema, fcols, ftable))
        else:
            fm.remove((flst, pk))
    for flst, pk in fs:
        # only index zero because every fs entry has exactly one candidate column
        quantiles[(flst[0].db_schema, flst[0].tablename, flst[0].columnname)] = \
            self.quantilehistogram(self.getDataFn(
                flst[0].db_schema, flst[0].columnname, flst[0].tablename))
    result = []
    fall = fs
    fall.extend(fm)
    for f, pk in fall:
        fcols = []
        for cdict in f:
            fcols.append(cdict.columnname)
        fschema = f[0].db_schema  # TBD: ugly indices here
        ftable = f[0].tablename   # TBD: and here
        if quantiles[(fschema, ftable, "|".join(fcols))] is not None and \
                quantiles[(pk.db_schema, pk.tablename, pk.db_columns)] is not None:
            # empty columns...
            qfk = quantiles[(fschema, ftable, "|".join(fcols))]
            qpk = quantiles[(pk.db_schema, pk.tablename, pk.db_columns)]
            emdscore = 0
            try:
                for i in range(len(qfk)):
                    fkhist = qfk[i][0]
                    pkhist = qpk[i][0]
                    fkbins = qfk[i][1]
                    pkbins = qpk[i][1]
                    emdscore += emd.emd(fkhist, pkhist, fkbins[0:-1], pkbins[0:-1])
                emdscore = emdscore / len(qfk[0])
            except:
                emdscore = -1
            if math.isnan(emdscore):
                emdscore = -1
            nfk = ForeignKey(db_catalog=pk.db_catalog, pkdb_schema=pk.db_schema,
                             fkdb_schema=fschema, pktablename=pk.tablename,
                             fktablename=ftable, fk_columns=fcols,
                             keyname='implicit_fk', type='implicit')
            nfk.pk_columns = pk.db_columns
            nfk.score = emdscore
            result.append((nfk, emdscore))
    # print("## len(Q): " + str(len(q)))
    return sorted(result, key=lambda kvp: kvp[1], reverse=False)
def emdcalculate(self, gray_url):
    update_num = 0
    find_flags = 0
    location_value = 0.15
    visual_value = 0.2
    features1 = self.mongo_operate.get_web_view(gray_url, 'gray')
    if features1 is False or features1 == []:
        return 0
    if len(features1) > 50:
        features1 = features1[:50]
    weight1 = [1.0 / len(features1) for i in range(len(features1))]
    for protect_url in self.protected_title_dict.keys():
        features2 = self.mongo_operate.get_web_view(protect_url, 'protected')
        if not features2:
            continue
        if len(features2) > 50:
            features2 = features2[:50]
        # weight of each element in features2
        weight2 = [1.0 / len(features2) for i in range(len(features2))]
        emd_goal_location = emd((features1, weight1), (features2, weight2),
                                self.dis_location)
        if math.isnan(emd_goal_location):
            continue
        if emd_goal_location < location_value:
            find_flags = 1
            self.mysql_handle.undate_gray_list_check_result(
                gray_url, 'view', source_url=protect_url)
            self.mysql_handle.undate_task_result_check_result(
                self.task_id, self.task_start_time, gray_url, 'view_location')
        emd_goal_visual = emd((features1, weight1), (features2, weight2),
                              self.dis_visual)
        if math.isnan(emd_goal_visual):
            continue
        if emd_goal_visual < visual_value:
            find_flags = 1
            self.mysql_handle.undate_gray_list_check_result(
                gray_url, 'view', source_url=protect_url)
            self.mysql_handle.undate_task_result_check_result(
                self.task_id, self.task_start_time, gray_url, 'view_visual')
    for counterfeit_url in self.counterfeit_title_dict.keys():
        features2 = self.mongo_operate.get_web_view(counterfeit_url, 'counterfeit')
        if not features2:
            continue
        if len(features2) > 50:
            features2 = features2[:50]
        # weight of each element in features2
        weight2 = [1.0 / len(features2) for i in range(len(features2))]
        emd_goal_location = emd((features1, weight1), (features2, weight2),
                                self.dis_location)
        if math.isnan(emd_goal_location):
            continue
        if emd_goal_location < location_value:
            find_flags = 1
            self.mysql_handle.undate_gray_list_check_result(
                gray_url, 'view', counterfeit_url=counterfeit_url)
            self.mysql_handle.undate_task_result_check_result(
                self.task_id, self.task_start_time, gray_url, 'view_location')
        emd_goal_visual = emd((features1, weight1), (features2, weight2),
                              self.dis_visual)
        if math.isnan(emd_goal_visual):
            continue
        if emd_goal_visual < visual_value:
            find_flags = 1
            self.mysql_handle.undate_gray_list_check_result(
                gray_url, 'view', counterfeit_url=counterfeit_url)
            self.mysql_handle.undate_task_result_check_result(
                self.task_id, self.task_start_time, gray_url, 'view_visual')
    if update_num >= 5:
        # update_running_state(view_check_num, view_find_num)
        pass
    return find_flags
features_prefix = sys.argv[1]
fdata = FeatureData(features_prefix)
features_by_bid = {}
for f, bid in izip(fdata.features, fdata.business_ids):
    if bid in features_by_bid:
        features_by_bid[bid].append(f)
    else:
        features_by_bid[bid] = [f]
for bid in features_by_bid.keys():
    features_by_bid[bid] = np.array(features_by_bid[bid])

business_ids = np.array(sorted(features_by_bid.keys())[:2])
print "bids: ", business_ids
print "recalculated:"
for bid1 in business_ids:
    for bid2 in business_ids:
        print "D(%d, %d): %.4f" % (bid1, bid2,
                                   emd(features_by_bid[bid1], features_by_bid[bid2]))
print "from file:"
emd_matrix = EMDMatrix.load(sys.argv[2])
print emd_matrix.for_business_ids(business_ids, business_ids)
def my_emd(a, b):
    import emd
    pos = range(len(a))
    return emd.emd((pos, list(a)), (pos, list(b)), lambda x, y: abs(x - y) + 0.0)
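# A toy call of my_emd: positions are bin indices and the ground distance is
# |i - j|, so shifting all mass one bin over costs exactly 1.0 (assuming the
# signature-style emd.emd imported inside my_emd).
# my_emd([0.5, 0.5, 0.0], [0.0, 0.5, 0.5])  # -> 1.0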
def wmd(doc1, doc1_nbow, doc2, doc2_nbow):
    # transpose before converting to lists for the signature-style EMD solver
    doc1 = doc1.T
    doc2 = doc2.T
    wmd_dist = emd((doc1.tolist(), doc1_nbow.tolist()),
                   (doc2.tolist(), doc2_nbow.tolist()), distance)
    return wmd_dist