Example #1
def calculate_asymmetry(frame, training_data, headpose_data):
	# These are the pairs from left to right features across the face
	# All of these following numbers are 1 less than their documentation, because they are indices in the data array
	pairs = [[4, 5], [3, 6], [2, 7], [1, 8], [0, 9],											# Eyebrows
			 [19, 28], [20, 27], [21, 26], [22, 25], [23, 30], [24, 29],						# Eyes
			 [14, 18], [24, 29],																# Nose
			 [31, 37], [32, 36], [33, 35], [42, 38], [41, 39], [43, 45], [46, 48]]				# Lips
	middle = [10, 11, 12, 13, 16, 34, 40, 44, 47]

	result_val = 0
	mid_feature = 12  # index of the midline feature used as the reference point
	for frame in range(len(training_data)):
		for pair in pairs:

			p1 = training_data[frame][pair[0]]
			# print p1
			#p1 = mapPoint(headpose_data[frame], p1)

			p2 = training_data[frame][pair[1]]
			#p2 = mapPoint(headpose_data[frame], p2)

			mid_point = training_data[frame][mid_feature]
			#mid_point = mapPoint(headpose_data[frame], mid_point)

			dis1 = distance(p1, mid_point)
			dis2 = distance(p2, mid_point)
			result_val += abs(dis1 - dis2)
			
	return result_val
Example #2
 def test_points_near_linf(self):
     x = self.x
     d = self.d
     dd, ii = self.kdtree.query(x, k=self.kdtree.n, p=np.inf, distance_upper_bound=d)
     eps = 1e-8
     hits = 0
     for near_d, near_i in zip(dd,ii):
         if near_d == np.inf:
             continue
         hits += 1
         assert_almost_equal(near_d,distance(x,self.data[near_i],np.inf))
         assert_(near_d < d+eps, "near_d=%g should be less than %g" % (near_d,d))
     assert_equal(np.sum(distance(self.data,x,np.inf) < d+eps),hits)
Example #3
def NewNodeEdgeDistibutions(graphs):   # node + intra-edge
    nodedistnfilename = sys.argv[1]+"_NewNodeDistnProbabilities.txt"
    intraedgedistnfilename = sys.argv[1]+"_NewIntraEdgeDistnProbabilities.txt"
    if os.path.exists(nodedistnfilename):
        os.remove(nodedistnfilename)
    if os.path.exists(intraedgedistnfilename):
        os.remove(intraedgedistnfilename)
    
    computationtime = {}
    for g in graphs:
       print('Processing graph:', g)
       #graph = Graph.Read_Ncol(join(dir_path, g), directed=False)  
       print ("Extracting Node _ IntraEdge Distn Probabilities: %s" % g)
       graph = graphs[g] #.simplify()
       start_time = time.time()
       coreness = GraphBase.coreness(graph)
       highestcore = max(coreness)
       kshell = [[] for k in range(highestcore+1)] 
       for v in graph.vs:
          i = coreness[v.index]
          kshell[i].append(v) 
       
       nodedistn = []
       intraedgedistn = []
       
       for k in range(highestcore+1):
          subgraph = graph.subgraph(kshell[k])
          subgraph.simplify()
          nodedistn.append(float(subgraph.vcount()))
          intraedgedistn.append(float(subgraph.ecount()))
       
       sumn = sum(nodedistn)
       normalizednodedistn = []   
       if sumn != 0:
           normalizednodedistn = [x/sumn for x in nodedistn]

       sumn = sum(intraedgedistn)
       normalizedintraedgedistn = []   
       if sumn != 0:
           normalizedintraedgedistn = [x/sumn for x in intraedgedistn]
           
       computationtime[g] = time.time() - start_time
      
       saveFeature(g,normalizednodedistn, nodedistnfilename)
       saveFeature(g,normalizedintraedgedistn, intraedgedistnfilename)

    distance(nodedistnfilename)
    distance(intraedgedistnfilename)
    saveDict(computationtime, sys.argv[1]+"_NewNCKDTimings.txt")   
    return
Example #4
    def _compute_df(self, feature, timeline, distance):
        """Pre-compute feature distance dendrogram

        Parameters
        ----------
        feature :
            Per-segment feature values, indexable by segment
        timeline :
            Temporal units
        distance :
            Callable returning the distance between two segments' features
        """

        # initialize feature distance matrix with zeros
        n = len(timeline)
        M = np.zeros((n, n))

        for s, segment in enumerate(timeline):

            s_feature = feature[segment]

            for t in range(s+1, n):

                t_feature = feature[timeline[t]]

                # feature distance between two segments
                M[s, t] = distance(s_feature, t_feature)

                # feature distance is symmetric
                M[t, s] = M[s, t]

        y = scipy.spatial.distance.squareform(M, checks=False)
        df = scipy.cluster.hierarchy.complete(y)
        return df
Example #5
def generate_distance_matrix(dist, peak_idxs, mode_dists, method='euclidean'):
	"""-------------------------------------------------------------------------
	Iteratively calculates the distance of the input distribution from each
	(mode candidate, tonic candidate) pair. This is a generic function that is
	independent of the distribution type or any other parameter value.
	----------------------------------------------------------------------------
	dist       : Input distribution that is to be estimated
	peak_idxs  : List of indices of dist's peaks
	mode_dists : List of candidate mode distributions
	method     : The distance method to be used. The available distances are
	             listed in distance() function.
	-------------------------------------------------------------------------"""

	result = np.zeros((len(peak_idxs), len(mode_dists)))

	# Iterates over the peaks, i.e. the tonic candidates
	for i, cur_peak_idx in enumerate(peak_idxs):
		trial = dist.shift(cur_peak_idx)

		# Iterates over mode candidates
		for j, cur_mode_dist in enumerate(mode_dists):

			# Calls the distance function for each entry of the matrix
			result[i, j] = distance(trial.vals, cur_mode_dist.vals, method=method)
	return result
Example #6
 def find_nearest_connector(self, detection_coord):
     """
     Search the given buckets of connectors for the one that is nearest to the given coordinates.
     Blocks farther than SEARCH_RADIUS are not searched; if no connector is found nearby, a default ConnectorInfo object is returned.
     
     Returns: nearest_connector, distance to the nearest connector
     """
     # Find nearby blocks
     detection_coord_int = detection_coord.astype(int)
     search_roi = ( detection_coord_int - self.SEARCH_RADIUS,
                    detection_coord_int + self.SEARCH_RADIUS )
     nearby_block_starts = getIntersectingBlocks(self._blockshape, search_roi)
     nearby_block_starts = map(tuple, nearby_block_starts)
 
     # Accumulate connectors found in nearby blocks
     nearby_connectors = []
     for block_start in nearby_block_starts:
         if block_start in self._blocks:
             nearby_connectors += self._blocks[block_start]
 
     # Closure.  Distance from current point to given connector.
     def distance( conn ):
         return scipy.spatial.distance.euclidean( (conn.x_nm, conn.y_nm, conn.z_nm), detection_coord )
 
     # Find closest connector.
     if nearby_connectors:
         nearest_connector = min(nearby_connectors, key=distance)
         min_distance = distance( nearest_connector )
     else:
         # No connectors nearby.  Emit default values.
         nearest_connector = ConnectorInfo(-1, -1, -1, -1, [], [])
         min_distance = 9999999.0
     
     return nearest_connector, min_distance
Example #7
 def test_consistency_with_neighbors(self):
     M = self.T1.sparse_distance_matrix(self.T2, self.r)
     r = self.T1.query_ball_tree(self.T2, self.r)
     for i,l in enumerate(r):
         for j in l:
             assert_equal(M[i,j],distance(self.T1.data[i],self.T2.data[j]))
     for ((i,j),d) in M.items():
         assert_(j in r[i])
Example #8
	def nearest_neighbors(self, item_ratings, distance=distance.euclidean, limit=5):
		distances = {}
		with h5py.File(self.h5filename, 'r') as model:
			user_vector = self.__item_rating_dictionary_to_user_vector(item_ratings)
			ratings = model['ratings']
			for i in range(len(ratings)):
				distances[str(i + 1)] = distance(user_vector, ratings[i])

		return sorted(distances.items(), key=operator.itemgetter(1))[:limit]
Example #9
def testFunction(v):
	total = 0.0
	for item in data_set_done:
		tmp = np.array([item])
		dotProduct = np.dot(tmp, v)  # scalar projection of the item onto v
		dotProduct = dotProduct * v  # component of the item along v
		total = total + distance(tmp, dotProduct)**2
	return total
Example #10
def data_clustering(data, distance=Euclidean,
                    linkage=AVERAGE):
    """
    Return the hierarchical clustering of the data set's rows.

    :param Orange.data.Table data: Data set to cluster.
    :param Orange.distance.Distance distance: A distance measure.
    :param str linkage: Linkage method used for clustering (default AVERAGE).
    """
    matrix = distance(data)
    return dist_matrix_clustering(matrix, linkage=linkage)
Example #11
def feature_clustering(data, distance=PearsonR,
                       linkage=AVERAGE):
    """
    Return the hierarchical clustering of the data set's columns.

    :param Orange.data.Table data: Data set to cluster.
    :param Orange.distance.Distance distance: A distance measure.
    :param str linkage: Linkage method used for clustering (default AVERAGE).
    """
    matrix = distance(data, axis=0)
    return dist_matrix_clustering(matrix, linkage=linkage)
Example #12
def test_distance_matrix():
    m = 10
    n = 11
    k = 4
    xs = np.random.randn(m,k)
    ys = np.random.randn(n,k)
    ds = distance_matrix(xs,ys)
    assert_equal(ds.shape, (m,n))
    for i in range(m):
        for j in range(n):
            assert_almost_equal(distance(xs[i],ys[j]),ds[i,j])
Example #13
def centroide(matrix, dim):
	tam = matrix.shape[1]
	coord = np.zeros(shape=(1, dim))
	for i in range(dim):
		for j in range(tam):
			coord[0][i] = coord[0][i] + matrix[i][j]

	coord = coord * 1.0 / tam

	dist, r = distance(matrix, coord)
	return coord, r
Example #14
def similarity(model,phr1,phr2,opts={}):
    num = distance(model,phr1,phr2,opts)
    res = 1-num
    
    # if opts.distance=="correlation":
    #     res = 1 - ((num + 1)/2)
    # elif opts.distance=="euclidean":        
    #     res = 1 / (1 + (num))
    # elif opts.distance=="seuclidean":
    #     res = 1 / (1 + (num))
    # else: # cosine
    #     res = 1-num

    return res
Example #15
def SGD(alpha, is_base_model, model_file, is_batch_mode):

    matrix, y_label = readInMatrix(model_file, is_base_model)

    lambda_regularization = 0.05
    if is_batch_mode:
        threshold = 1
    else:
        threshold = 0.01

    W = np.zeros((5, feature_vector_length))  # feature_vector_length is a module-level constant
    matrix_csr = matrix.tocsr()
    lines_train_file = matrix.shape[0]
    count_threshold = 5
    count = 0
    for iteration in range(100000):
        if iteration % 10000 == 0:
            print(iteration)

        if not is_batch_mode:
            random_row = random_pick_data_vector(0, lines_train_file - 1) # 1 * 1000
            x_i = matrix_csr.getrow(random_row)
            y_c = y_label[random_row]

        else:
            random_100_rows = random_pick_data_vectors_100(0, lines_train_file - 1)
            x_i = matrix_csr[random_100_rows,:]
            y_c = y_label[random_100_rows]

        new_l_splash = gradient(y_c, W, x_i, lambda_regularization)
        newW = W + alpha * new_l_splash
        if iteration == 0:
            W = newW
        else:
            dist = distance(newW, W)

            # stop once the update distance has stayed below the threshold for
            # more than count_threshold consecutive iterations
            if dist < threshold:
                count += 1
                if count > count_threshold:
                    break
            else:
                count = 0
        W = newW
    return W
Example #16
def mode_estimate(dist, mode_dists, distance_method='euclidean', metric='pcd', step_size=7.5):
	"""-------------------------------------------------------------------------
	Compares the recording's distribution with each candidate mode with respect
	to the given tonic and returns the resultant distance vector to higher level
	functions. Here the input distribution is expected to be aligned according to
	the tonic, and the tonic isn't explicitly used in this function. This is a wrapper
	function that handles the required preliminary tasks and calls
	generate_distance_matrix() accordingly.
	----------------------------------------------------------------------------
	dist            : Distribution of the input recording
	mode_dists      : List of PitchDistribution objects. These are the model
	                  pitch distributions of candidate modes.
	distance_method : The choice of distance method. See the full list at
	                  distance()
	metric          : Whether PCD or PD is used
	step_size         : The step-size of the pitch distribution. Unit is cents
	-------------------------------------------------------------------------"""

	#TODO: step_size and pD/pcd information can be retrieved from the dist object
	#try and test that

	# There are no preliminaries, simply generate the distance vector.
	if metric == 'pcd':
		distance_vector = np.array(generate_distance_matrix(dist, [0], mode_dists, method=distance_method))[0]

	elif metric == 'pD':
		distance_vector = np.zeros(len(mode_dists))

		# For each trial, a new instance of PitchDistribution is created and its
		# attributes are copied from dist. Each trial is zero-padded according
		# to the current mode distribution's length. The entries of the vector
		# are generated iteratively, one by one.
		for i in range(len(mode_dists)):
			trial = pD.PitchDistribution(dist.bins, dist.vals, kernel_width=dist.kernel_width,
				                          source=dist.source, ref_freq=dist.ref_freq, segment=dist.segmentation)
			trial, mode_trial = pd_zero_pad(trial, mode_dists[i], step_size=step_size)
			distance_vector[i] = distance(trial, mode_trial, method=distance_method)
	return distance_vector
Example #17
def test_distance_vectorization():
    np.random.seed(1234)
    x = np.random.randn(10,1,3)
    y = np.random.randn(1,7,3)
    assert_equal(distance(x,y).shape,(10,7))
Example #18
def test_distance_l1():
    assert_almost_equal(distance([0,0],[1,1],1),2)
Example #19
 def test_found_all(self):
     r = self.T1.query_ball_tree(self.T2, self.d, p=self.p, eps=self.eps)
     for i, l in enumerate(r):
         c = np.ones(self.T2.n,dtype=bool)
         c[l] = False
         assert_(np.all(distance(self.data2[c],self.data1[i],self.p) >= self.d/(1.+self.eps)))
Example #20
 def test_found_all(self):
     c = np.ones(self.T.n,dtype=bool)
     l = self.T.query_ball_point(self.x, self.d, p=self.p, eps=self.eps)
     c[l] = False
     assert_(np.all(distance(self.data[c],self.x,self.p) >= self.d/(1.+self.eps)))
Example #21
    def simulate_step(self, action=None, loc=None, probability=1):
        done = False
        wall = False
        if action is None:
            action = self.brain.get_action()
        if loc is None:
            loc = self.location
        last_loc = deepcopy(loc)
        if action == 0:  # go up
            loc[0] = loc[0] - 1
        elif action == 1:  # go right
            loc[1] = loc[1] + 1
        elif action == 2:  # go down
            loc[0] = loc[0] + 1
        elif action == 3:  # go left
            loc[1] = loc[1] - 1
        elif action == 4:  # stay in place
            pass
        elif action == 5:
            pass  #self.communicate()
        elif action == 6:
            pass  #self.communicate()

        if loc[0] <= 0:
            loc[0] = 0
        if loc[1] <= 0:
            loc[1] = 0
        if loc[0] > self.grid.size1 - 1:
            loc[0] = self.grid.size1 - 1
        if loc[1] > self.grid.size1 - 1:
            loc[1] = self.grid.size1 - 1
        if self.grid.Map[loc[0], loc[1]] > 0:
            loc = last_loc
            action = 10
            wall = True

        #### The reward
        r = 0  # total reward for step
        k = 50  # Gain on distance
        k2 = -.2  # Gain on the gradient
        if self.meeting_point is None:
            r += -1
        else:
            r += k / (
                distance([self.location], [self.meeting_point]).squeeze() + 1)
        if action is None:
            action = self.last_action
        if not wall:
            if action == 6:
                r -= .03
            elif action == 5:
                r -= .03
            elif action == 4:
                r += -.01
            else:
                r += -.03
            if self.grid.have_met():
                r += 50
                done = True
        else:
            r += -1
            action = 4

        r_fromgrad = k2 * (self.grid.Map[loc[0], loc[1]])
        #print('The reward from the gradient is %f'%r_fromgrad)
        r += r_fromgrad  #The gradient contribution to the reward
        return loc, action, r, done
Example #22
 def compute_distance(_list, distance):
     # pairwise distances over all unordered pairs of elements in _list
     return [
         distance(el1, el2) for i, el1 in enumerate(_list[:-1])
         for el2 in _list[i + 1:]
     ]
Example #23
 def min_dist_label_and_model(self):
     features_data = np.array(self.data[['Latitude', 'Longitude', 'Country', 'Province', 'City']])
     # note: the return value of distance() is discarded here
     for d in features_data:
         distance(d)
Example #24
        # fix the learning rate problem
        learning_rate = 1.0/math.sqrt(index)
        index = index + 1
        tmp = data_set_done[i] # convert to right form
        tmp = np.array([tmp])
        #print "dataset ", tmp # array([[]])
        #print "tranpose xt is ", np.transpose(tmp)
        #print "dot result ", np.dot(np.transpose(tmp),tmp)
        b = np.dot(np.transpose(tmp),tmp)
        b = np.dot(b,v1) # result will be a 110 * 1 vector
        #print "times learning plus ", tmp + learning_rate * b
        v1 = v1 + learning_rate * b 
        #print "v1 only sum ", v1
        v1 = normalizeVector(v1)
        if i % 400 == 0:
            tPoint.append(i)
            test = testFunction(v1)
            print "test " ,test
            testResult.append(test)
    print testResult
    plt.plot(tPoint, testResult, '-')
    #plt.axis([0, len(data_set_done), 0, 5000])
    plt.ylabel('test function result')
    plt.show()
    print "global v1, ", v1
    dis = distance(prev_v,v1)
    print "distance ", dis
    if dis <= 0.001:
        break
    prev_v = v1
	
Example #25
    ## Iterate over corpora and phrases
    train_output={}

    # [Pseudo: 4 ] For each training corpus
    for (filename, phrases) in train_data:
        filename_old=filename.replace('input', 'gs')
        train_output[filename_old]=[]
        # [Pseudo: 4.a ] For each phrase of the training corpus
        for phr1,phr2 in phrases:
            # [Pseudo: 4.a.i ] Preprocessing
            phr1,phr2=preprocessing(phr1,phr2,opts)
            # [Pseudo: 4.a.ii ] Sum the vectors of phrase one
            # [Pseudo: 4.a.iii ] Sum the vectors of phrase two
            # [Pseudo: 4.a.iv ] Compute the distance
            num=distance(model,phr1,phr2,opts)
            train_output[filename_old].append([num])

    # [Pseudo: 5 ] Train the regressor
    verbose('Training model')
    if opts.method=="svr":
        method = train_model_srv(train_gs, train_output,args={'kernel':'rbf'})
    ## TO ADD ANOTHER METHOD
    # if opts.method == "nombre":
    #   method =  train_model_nombre(train_gs, train_output,args={'kernel':'rbf'})
    #   train_model_nombre has to be in utils


    filenames_sys=[]
    distances=[]
    # [Pseudo: 6 ] For each test corpus
Example #26
def test_distance_vectorization():
    x = np.random.randn(10, 1, 3)
    y = np.random.randn(1, 7, 3)
    assert_equal(distance(x, y).shape, (10, 7))
Example #27
def test_distance_linf():
    assert_almost_equal(distance([0, 0], [1, 1], np.inf), 1)
Example #28
def test_distance_l1():
    assert_almost_equal(distance([0, 0], [1, 1], 1), 2)
Example #29
def test_distance_l2():
    assert_almost_equal(distance([0, 0], [1, 1], 2), np.sqrt(2))
Example #30
 def test_found_all(self):
     c = np.ones(self.T.n, dtype=bool)
     l = self.T.query_ball_point(self.x, self.d, p=self.p, eps=self.eps)
     c[l] = False
     assert_(np.all(distance(self.data[c],self.x,self.p) >= self.d/(1.+self.eps)))
Example #31
test_file = ['test1.csv', 'test2.csv', 'test3.csv', 'test4.csv', 'test5.csv']
#train_file = ['train1.csv']
#test_file = ['test1.csv']
distances = ['euclidean', 'cityblock', 'cosine']
k = 50
length = 0
MAD_list1 = []
MAD_list2 = []
for i in range(len(distances)):
    for j in range(len(train_file)):
        train, test = reading_csv(train_file[j], test_file[j])
        train_pivot = train.pivot_table(index='user',
                                        columns='movie',
                                        values='rating',
                                        aggfunc='first',
                                        fill_value=0)
        distance_df = distance(train_pivot, distances[i])
        users_similar = similar_user(train)
        users_neighbor = neighbors(distance_df)
        MAD_list1 += madlist1(test, train_pivot, k)
        MAD_list2 += madlist2(test, train_pivot, k)
        length += len(test.index)
    MAD1 = find_MAD1(MAD_list1, test, length)
    MAD2 = find_MAD2(MAD_list2, test, length)
    # The "proper" algorithm (sketched after this snippet): for each user i and each
    # movie j they have not seen, take the top k users most similar to i who have
    # seen j and use their ratings to infer i's rating on j
    print("MAD of proper algorithm with", distances[i], "distance is:", MAD2)
    # The basic algorithm gives each (user, movie) pair a rating equal to the
    # average score over all users who rated that movie
    print("MAD of basic algorithm with", distances[i], "distance is:", MAD1)

print(time.clock() - start)  # note: time.clock() was removed in Python 3.8; time.perf_counter() is the modern replacement
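
A hedged sketch of the "proper" algorithm described in the comments above, assuming distance_df is a square user-by-user distance DataFrame (as the distance() helper appears to return) and train_pivot is the zero-filled user-by-movie pivot table built in this script; the helper name predict_rating is ours, not part of the original:

import numpy as np

def predict_rating(train_pivot, distance_df, user, movie, k=50):
    # users who actually rated this movie (nonzero entries in the pivot table)
    raters = [u for u in train_pivot.index[train_pivot[movie] > 0] if u != user]
    if not raters:
        # fall back to the movie's mean rating over its raters, or 0.0 if nobody rated it
        rated = train_pivot[movie][train_pivot[movie] > 0]
        return float(rated.mean()) if len(rated) else 0.0
    # take the k raters closest to `user` in the precomputed distance matrix
    nearest = distance_df.loc[user, raters].nsmallest(min(k, len(raters))).index
    # infer i's rating on j as the mean rating of those nearest neighbors
    return float(np.mean([train_pivot.loc[u, movie] for u in nearest]))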
Example #32
File: clus.py Project: taer/popcorn
def getClustersHier(positions, diameter):
    cl = HierarchicalClustering(positions, distance)
    return cl.getlevel(diameter)
Example #33



#------------------------------
'''
DISTANCE FORMULA
Representing Points
In this lesson, you will learn three different ways to define the distance between two points:

Euclidean Distance
Manhattan Distance
Hamming Distance
Before diving into the distance formulas, it is first important to consider how to represent points in your code.

In this exercise, we will use a list, where each item in the list represents a dimension of the point. For example, the point (5, 8) could be represented in Python like this:

pt1 = [5, 8]
Points aren’t limited to just two dimensions. For example, a five-dimensional point could be represented as [4, 8, 15, 16, 23].

Ultimately, we want to find the distance between two points. We’ll be writing functions that look like this:

distance([1, 2, 3], [5, 8, 9])
Note that we can only find the distance between two points if they have the same number of dimensions!
'''

'''DISTANCE FORMULA
Euclidean Distance
Euclidean Distance is the most commonly used distance formula. To find the Euclidean distance between two points, we first calculate the squared difference between each dimension. If we add up all of these squared differences and take the square root, we've computed the Euclidean distance.

Let's take a look at the equation that represents what we just learned:

	distance = sqrt((a1 - b1)^2 + (a2 - b2)^2 + ... + (an - bn)^2)
'''
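
A minimal sketch of that formula in code, using the list-of-dimensions point representation from the lesson (the function name euclidean_distance is our own, not part of the lesson):

import math

def euclidean_distance(pt1, pt2):
    # the two points must have the same number of dimensions
    assert len(pt1) == len(pt2)
    squared_differences = 0
    for dim1, dim2 in zip(pt1, pt2):
        squared_differences += (dim1 - dim2) ** 2
    return math.sqrt(squared_differences)

print(euclidean_distance([5, 8], [1, 5]))   # 5.0
print(euclidean_distance([4, 8, 15, 16, 23], [1, 2, 3, 4, 5]))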
Example #34
def re_ranking(feat, k1, k2, lambda_value, MemorySave=False, Minibatch=2000):
    # function included from https://github.com/zhunzhong07/person-re-ranking/tree/master/python-version
    # inputs:
    # feat: appearance feature from the last layer for each gallery image
    # k1: number of k-nearest neighbors to consider for a given test image
    # k2: number of subset images to consider for each neighbor of the test image
    # lambda_value: weight of the original distance matrix; if 1, only the appearance feature is used
    all_num = feat.shape[0]
    query_num = feat.shape[0]
    feat = feat.astype(np.float16)
    print('computing original distance')
    if MemorySave:
        original_dist = np.zeros(shape=[all_num, all_num], dtype=np.float16)
        i = 0
        while True:
            it = i + Minibatch
            if it < np.shape(feat)[0]:
                original_dist[i:it, ] = np.power(cdist(feat[i:it, ], feat),
                                                 2).astype(np.float16)
            else:
                original_dist[i:, :] = np.power(cdist(feat[i:, ], feat),
                                                2).astype(np.float16)
                break
            i = it
    else:
        # original_dist = cdist(feat,feat).astype(np.float16)
        # original_dist = np.power(original_dist,2).astype(np.float16)
        original_dist = distance(feat).astype(np.float16)
    del feat
    gallery_num = original_dist.shape[0]
    original_dist = np.transpose(original_dist / np.max(original_dist, axis=0))
    V = np.zeros_like(original_dist).astype(np.float16)
    initial_rank = np.argsort(original_dist).astype(np.int32)

    print('starting re_ranking')
    for i in range(all_num):
        # k-reciprocal neighbors
        forward_k_neigh_index = initial_rank[i, :k1 + 1]
        backward_k_neigh_index = initial_rank[forward_k_neigh_index, :k1 + 1]
        fi = np.where(backward_k_neigh_index == i)[0]
        k_reciprocal_index = forward_k_neigh_index[fi]
        k_reciprocal_expansion_index = k_reciprocal_index
        for j in range(len(k_reciprocal_index)):
            candidate = k_reciprocal_index[j]
            candidate_forward_k_neigh_index = initial_rank[
                candidate, :int(np.around(k1 / 2)) + 1]
            candidate_backward_k_neigh_index = initial_rank[
                candidate_forward_k_neigh_index, :int(np.around(k1 / 2)) + 1]
            fi_candidate = np.where(
                candidate_backward_k_neigh_index == candidate)[0]
            candidate_k_reciprocal_index = candidate_forward_k_neigh_index[
                fi_candidate]
            if len(
                    np.intersect1d(candidate_k_reciprocal_index,
                                   k_reciprocal_index)
            ) > 2 / 3 * len(candidate_k_reciprocal_index):
                k_reciprocal_expansion_index = np.append(
                    k_reciprocal_expansion_index, candidate_k_reciprocal_index)

        k_reciprocal_expansion_index = np.unique(k_reciprocal_expansion_index)
        weight = np.exp(-original_dist[i, k_reciprocal_expansion_index])
        V[i, k_reciprocal_expansion_index] = weight / np.sum(weight)
    original_dist = original_dist[:query_num, ]
    if k2 != 1:
        V_qe = np.zeros_like(V, dtype=np.float16)
        for i in range(all_num):
            V_qe[i, :] = np.mean(V[initial_rank[i, :k2], :], axis=0)
        V = V_qe
        del V_qe
    del initial_rank
    invIndex = []
    for i in range(gallery_num):
        invIndex.append(np.where(V[:, i] != 0)[0])

    jaccard_dist = np.zeros_like(original_dist, dtype=np.float16)

    for i in range(query_num):
        temp_min = np.zeros(shape=[1, gallery_num], dtype=np.float16)
        indNonZero = np.where(V[i, :] != 0)[0]
        indImages = []
        indImages = [invIndex[ind] for ind in indNonZero]
        for j in range(len(indNonZero)):
            temp_min[0, indImages[j]] = temp_min[0, indImages[j]] + np.minimum(
                V[i, indNonZero[j]], V[indImages[j], indNonZero[j]])
        jaccard_dist[i] = 1 - temp_min / (2 - temp_min)

    final_dist = jaccard_dist * (1 -
                                 lambda_value) + original_dist * lambda_value
    #del original_dist
    #del V
    #del r
    # final_dist = final_dist[:query_num,query_num:]
    return final_dist
Example #35
def all_features(vec1, sent1, vec2, sent2):

    #dist1=distance.cdist(vecList,vecList,'euclidean')
    from scipy.spatial import distance
    eu_dist1 = distance.euclidean(vec1, vec2)
    eu_dist2 = distance.cityblock(vec1, vec2)
    eu_dist3 = distance.cosine(vec1, vec2)
    #eu_dist4=distance.correlation(vec1,vec2)
    #eu_dist5=distance.chebyshev(vec1,vec2)

    eu_dist6 = distance.dice(vec1, vec2)
    eu_dist7 = distance.jaccard(vec1, vec2)

    #eu_dist8=distance.hamming(vec1,vec2)

    #print 'euclidean dist = ',eu_dist1
    #print 'cityblock dist = ',eu_dist2
    #print 'cosine dist = ',eu_dist3
    #print 'correlation dist = ',eu_dist4
    #print 'chebyshev dist = ',eu_dist5
    #print 'dice dist = ',eu_dist6
    #print 'jaccard dist = ',eu_dist7
    #print 'hamming dist = ',eu_dist8
    #print DocSentWordMap[-1]

    #--------------------------------------------------------------
    def lcs_length(a, b):
        table = [[0] * (len(b) + 1) for _ in range(len(a) + 1)]
        for i, ca in enumerate(a, 1):
            for j, cb in enumerate(b, 1):
                table[i][j] = (table[i - 1][j - 1] + 1 if ca == cb else max(
                    table[i][j - 1], table[i - 1][j]))
        return table[-1][-1]

    #-----------------------------------------------------------------

    #---------------------------------------------------------------------
    sent1 = set(sent1)
    sent2 = set(sent2)

    common = len(sent1.intersection(sent2))

    s1ins2 = (common * 1.00) / len(sent1)
    s2ins1 = (common * 1.00) / len(sent2)

    #print '% word of S1 in s2 = ',s1ins2
    #print '% word of S2 in s2 = ',s2ins1

    in_and_ex = common + (len(sent1) + len(sent2) - common)

    #print 'inclusion and exclution = ',in_and_ex

    word_overlap = (common * 1.00) / min(len(sent1), len(sent2))

    #print 'word overlap coefficient = ',word_overlap

    v1 = [str(x) for x in vec1]
    v2 = [str(x) for x in vec2]

    v1 = ' '.join(v1)
    v2 = ' '.join(v2)

    from Levenshtein import distance
    Levenshtein_dist = distance(v1, v2)
    #print 'levenshtein distance',Levenshtein_dist

    # note: sent1/sent2 were converted to sets above, so this LCS runs over an arbitrary ordering
    lcs_dist = lcs_length(sent1, sent2)
    #print 'LCS =',lcs_dist

    # Python 2 idiom: decode the joined strings for the (disabled) jellyfish call
    v1 = unicode(v1, 'utf-8')
    v2 = unicode(v2, 'utf-8')
    import jellyfish
    jaro_dist = 0
    #jaro_dist=jellyfish.jaro_distance(v1,v2)
    #print 'jaro distance',jaro_dist

    total_dist = (eu_dist1 + eu_dist2 + eu_dist3 + eu_dist6 + eu_dist7 +
                  s1ins2 + s2ins1 + in_and_ex + word_overlap + jaro_dist +
                  Levenshtein_dist + lcs_dist)
    #print 'total dist = ',total_dist
    total_dist = (total_dist) / 12

    #print 'total dist = ',total_dist

    return total_dist
Example #36
 def test_in_ball(self):
     l = self.T.query_ball_point(self.x, self.d, p=self.p, eps=self.eps)
     for i in l:
         assert_(distance(self.data[i],self.x,self.p) <= self.d*(1.+self.eps))
Example #37
def k_means(data, K):
    """
    Implements the K-means algorithm following the slides' notation.
    """
    MAX_ITERATIONS = 50
    N = len(data)
    C, C_with_labels, m = {}, {}, [None] * N
    iters = 0

    # initialize cluster representatives
    random_idxs = np.random.randint(0, N, size=K)
    # print('K = {}, random_idxs = {}'.format(K,random_idxs))
    # return(0,0,0)

    for i in range(len(random_idxs)):
        point = data[random_idxs[i], 2:]
        point_idx, point_label = data[random_idxs[i], 0], data[random_idxs[i],
                                                               1]
        C.setdefault(i, []).append((point[0], point[1]))
        C_with_labels.setdefault(i, []).append(
            [point_idx, point_label, point[0], point[1]])

    # If a centroid is not the closest to ANY point, its cluster will be empty and
    # its mean would be nan. To prevent this, store the old clusters and, if a
    # cluster in the updated clustering is empty, keep its old value (i.e. leave
    # the centroid unchanged).
    C_old = C.copy()

    while iters < MAX_ITERATIONS:
        cluster_means = dict()
        for c in C:
            if len(C[c]) > 0:  # at least one point belongs to this cluster
                cluster_means[c] = get_cluster_mean(C[c])
                C_old[c] = C[c]
            else:  # use the old centroid and do not update its value
                cluster_means[c] = get_cluster_mean(C_old[c])

        # print('C =')
        # for c in C: print(C[c], len(C[c]));
        # print ('cluster_means = {}'.format(cluster_means))
        # input('...\n')

        # reassign points in D to closest cluster mean
        C = {c: [] for c in C}
        C_with_labels = {c: [] for c in C}
        for i in range(N):
            point = data[i, 2:]
            point = (point[0], point[1])
            point_idx, point_label = data[i, 0], data[i, 1]

            # get the closest cluster to this point
            closest_dist = float('inf')
            closest_cluster = None
            for c in cluster_means:
                dist = distance(point, cluster_means[c])
                if dist < closest_dist:
                    closest_dist = dist
                    closest_cluster = c

            # print ('Closest cluster/centroid to the point {} is {}:{}'.format(point, closest_cluster, cluster_means[closest_cluster]))

            # assign this point to the closest cluster
            C[closest_cluster].append(point)
            C_with_labels[closest_cluster].append(
                [point_idx, point_label, point[0], point[1]])
            # update m s.t. m_i is cluster ID of ith point in D
            m[i] = closest_cluster

            # print ('Closest centroid to the point {} is {}'.format(point, m[i]))
            # input('...')

        # if there is no change in the centroids, then stop
        new_cluster_means = {c: get_cluster_mean(C[c]) for c in C}
        if centroids_did_not_change(new_cluster_means, cluster_means):
            if DEBUG:
                print('Old cluster_means = {}'.format(cluster_means))
                print('New cluster_means = {}'.format(new_cluster_means))
                print('No change in the centroids, breaking out of the loop.')
            break

        # if this is the last iteration then break out of the loop after the assignment of points to the cluster and before recomputing the new centroids
        if iters == MAX_ITERATIONS - 1:
            if DEBUG:
                print(
                    'this is the last ({}) iteration, break out of the loop after the assignment of points to the cluster and before recomputing the new centroids'
                    .format(iters))
            break

        iters += 1

    if DEBUG:
        print('iters = ', iters)

    for c in C_with_labels:
        C_with_labels[c] = np.asarray(C_with_labels[c])

    return (C, C_with_labels, m)
Example #38
 def test_all_in_ball(self):
     r = self.T1.query_ball_tree(self.T2, self.d, p=self.p, eps=self.eps)
     for i, l in enumerate(r):
         for j in l:
             assert_(distance(self.data1[i],self.data2[j],self.p) <= self.d*(1.+self.eps))
Example #39
 def test_found_all(self):
     r = self.T1.query_ball_tree(self.T2, self.d, p=self.p, eps=self.eps)
     for i, l in enumerate(r):
         c = np.ones(self.T2.n, dtype=bool)
         c[l] = False
         assert_(np.all(distance(self.data2[c],self.data1[i],self.p) >= self.d/(1.+self.eps)))
Example #40
def test_distance_l2():
    assert_almost_equal(distance([0,0],[1,1],2),np.sqrt(2))
Example #41
def NodeEdgeDistribution(graphs):
    #graphs = {}
    Coreness = {}
    KCoreSignatures = {}
    KCoreTimings = {}
    CountTimings = {}
    NodeDistnTimings = {}   
    EdgeMatrices = {}
    EdgeDistnTimings = {}
    AllTimings = {}
  
    nodedistnfilename = sys.argv[1]+"_NodeDistnProbabilities.txt"
    edgedistnfilename = sys.argv[1]+"_EdgeProbabilities.txt"
    if os.path.exists(nodedistnfilename):
        os.remove(nodedistnfilename)
    if os.path.exists(edgedistnfilename):
        os.remove(edgedistnfilename)
    
    maximumcore = 0
    for g in graphs:
       print('Processing graph:', g)
       #graph = Graph.Read_Ncol(join(dir_path, g), directed=False)  
       print ("Extracting Node Distn Probabilities: %s" % g)
       graph = graphs[g].simplify()
       start_time = time.time()
       coreness = GraphBase.coreness(graph)
       KCoreTimings[g] = time.time() - start_time
       Coreness[g] = coreness
       
       l = len(coreness)
       
       start_time = time.time()
       d = {n:coreness.count(n) for n in range(max(coreness)+1)}
       CountTimings[g] = time.time() - start_time
       print('max(coreness): ', max(coreness))
       start_time = time.time()
       KCoreSignature = [d[key] / (l * 1.0) for key in sorted(d)]
       NodeDistnTimings[g] = time.time() - start_time
       
       KCoreSignatures[g] = KCoreSignature
       
       saveFeature(g,KCoreSignature, nodedistnfilename)
       highestcore = max(Coreness[g])
       if (maximumcore < highestcore):
           maximumcore = highestcore    
       
    saveDict(NodeDistnTimings, sys.argv[1]+"_NodeDistnTimings.txt") 
          
 
    for g in graphs:
       start_time = time.time()
       EdgeMatrix = getEdgeProbabilities(g, graphs[g], Coreness[g], maximumcore)
       EdgeMatrices[g] = EdgeMatrix
       EdgeDistnTimings[g] = time.time() - start_time
       saveFeature(g,EdgeMatrix, edgedistnfilename)
    saveDict(EdgeDistnTimings, sys.argv[1]+"_EdgeProbabilitiesTiming.txt")    
    
    for g in graphs:
       t0= KCoreTimings[g]
       t1 = CountTimings[g]
       t2 = NodeDistnTimings[g]
       t3 = EdgeDistnTimings[g]
       
       t4 = t0 + t1 + t2 + t3
       AllTimings[g] = [t0, t1, t2, t3, t4]
       #AllTimings[g]=[t3]
    
    distance(nodedistnfilename)
    distance(edgedistnfilename)
    saveDict(AllTimings, sys.argv[1]+"_AllNCKDTimings.txt")  
    return
Example #42
def test_distance_linf():
    assert_almost_equal(distance([0,0],[1,1],np.inf),1)
Example #43
 def test_all_in_ball(self):
     r = self.T1.query_ball_tree(self.T2, self.d, p=self.p, eps=self.eps)
     for i, l in enumerate(r):
         for j in l:
             assert_(distance(self.data1[i],self.data2[j],self.p) <= self.d*(1.+self.eps))
Example #44
def recommendationScores(ratings_dict_train, ratings_dict_test):
    """
    Calculates recommendation scores for each user/business combination in the
    test set using collaborative filtering.
    :param ratings_dict_train: Nested dictionary with structure {user_id: {business_id: rating}}
    :param ratings_dict_test: Nested dictionary with structure {user_id: {business_id: rating}}
    :return: nested dictionary, identical to ratings_dict_test except that it contains
    recommendation scores instead of ratings
    """

    # Initialize recommendation scores dictionary
    rec_scores = {}

    # For every user/business combo in the test set,
    # calculate recommendation score based on top 5 most similar users
    for test_user, test_ratings in ratings_dict_test.items():
        #print test_user, test_ratings

        # Initialize dictionary to hold distances between current test user and all training users
        distances = {}
        # Initialize key-value pair in recommendation scores dictionary to hold the
        # recommendation scores for this user for all businesses
        rec_scores[test_user] = {}

        # For each training user, calculate the distance between them and current test user
        for train_user, train_ratings in ratings_dict_train.items():
            # Do not consider the same user
            if train_user == test_user:
                continue
            # Do not consider users who haven't reviewed any of the same businesses
            elif len(set(test_ratings.keys()) & set(train_ratings.keys())) == 0:
                continue
            # Calculate distance based on common business ratings
            else:
                distances[train_user] = distance(test_ratings, train_ratings)

        # For each business for the current user, calculate recommendation score
        for bus_id in test_ratings:

            # First narrow down similar users based on who has also reviewed the current question
            similar_users = [user_id for user_id, ratings in ratings_dict_train.items() if bus_id in ratings]

            # Subset distance dictionary created in previous step to only hold these users
            similar_users_dist = {user_id: distances[user_id] for user_id in similar_users if user_id in distances}

            # Sort these users by distance to take the most similar 5
            # The "top 5" will have less than 5 if fewer than 5 other users have reviewed the business
            top5 = sorted(similar_users_dist, key=similar_users_dist.get, reverse=True)[0:5]

            # Calculate recommendation score as sum(similarity * rating) for top 5's ratings of this business
            rec_score_unscaled = np.sum([1.0/(distances[user_id]+0.1) * ratings_dict_train[user_id][bus_id] for user_id in top5])

            # Scale the score by the number of users who actually contributed to it
            rec_score = rec_score_unscaled/len(top5)

            # Add to recommendation scores dictionary
            rec_scores[test_user][bus_id] = rec_score

            # Print if there's a strange value
            if rec_score == 0 or rec_score == np.inf:
                print "Test User: "******", Business: " + str(bus_id)
                print "    " + str(len(similar_users_dist)) + " users also reviewed this business"
                print "    " + str(top5)
                print "    Recommendation Score: " + str(rec_score_unscaled) + "/" + str(len(top5)) + '=' + str(rec_score)
                print "\n"

    return rec_scores
Example #45
 def test_in_ball(self):
     l = self.T.query_ball_point(self.x, self.d, p=self.p, eps=self.eps)
     for i in l:
         assert_(
             distance(self.data[i], self.x, self.p) <= self.d *
             (1. + self.eps))
Example #46
def distance(user, measurement):
    measurementsDB = shelve.open('measurmentsDB')
    savedMeasurement = measurementsDB[user]
    meanMeasurement = mean(savedMeasurement, axis=0)
    # note: this call shadows the enclosing function's name; it presumably refers
    # to a pairwise-distance helper such as scipy.spatial.distance.cdist
    distMatrix = distance(meanMeasurement, measurement, 'cityblock')
    distArray = distMatrix.diagonal()
    return distArray
Example #47
   
    coords_n=[]
    
    for i in range(len(coords)):
        
        coords_n.append([x[i],y[i]])  #store the new coordinates in a list
        

#    N=[] #list to store count of each root hair
D=[] #list to store the distances


# centroid length (cl)
for (x, y) in coords_n:

    k = distance(x, y, xc, yc)
    D.append(k)


    
cs = (sum(D)) ** 0.5
CSN1.append(cs)
print("Area of the shape")
print(cs)
Filename.append('42.jpg')

#print(len(Filename))
#print(len(CSN1))

data = {'ID': Filename, 'Area': CSN1}
NS=pd.DataFrame(data)