Example #1
def GeneralizedNormalKernel(X, Y=None, gamma=None, beta=1):
    """Compute the generalized normal kernel between X and Y.
    The generalized normal kernel is defined as::
        K(x, y) = exp(-gamma ||x-y||_1^beta)
    for each pair of rows x in X and y in Y.
    Parameters
    ----------
    X : array of shape (n_samples_X, n_features)
    Y : array of shape (n_samples_Y, n_features)
    gamma : float, defaults to 1 / n_features
    beta : float, shape parameter (beta = 1 gives the Laplacian kernel)
    Returns
    -------
    kernel_matrix : array of shape (n_samples_X, n_samples_Y)
    """

    X, Y = check_pairwise_arrays(X, Y)
    if gamma is None:
        gamma = 1.0 / X.shape[1]

    if beta == 1:
        K = -gamma * manhattan_distances(X, Y)
    else:
        K = -gamma * manhattan_distances(X, Y) ** beta
    np.exp(K, K)    # exponentiate K in-place
    return K
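A quick usage sketch (hypothetical data; the function relies on numpy and the sklearn.metrics.pairwise helpers, so the imports are shown explicitly):

import numpy as np
from sklearn.metrics.pairwise import check_pairwise_arrays, manhattan_distances

X = np.random.RandomState(0).rand(5, 3)
K = GeneralizedNormalKernel(X, gamma=0.5, beta=1.5)
# K[i, j] == exp(-0.5 * ||X[i] - X[j]||_1 ** 1.5); the diagonal is all ones.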
Example #2
def med_distances(X, Y):
	"""Componentwise Manhattan (L1) distances between the rows of X and Y."""
	print('medutilDist: custom distance function')
	D = manhattan_distances(X, Y, sum_over_features=False)

	return D
Example #3
def mean_absolute_error(x, y):
    vector = manhattan_distances(x, y)
    summation = np.sum(vector)
                     
    mae = summation / y.shape[0]
    
    return mae
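Note that manhattan_distances returns the full pairwise distance matrix, so the helper above matches the textbook mean absolute error only for particular input shapes. A minimal sketch of a paired-row MAE (hypothetical data, not part of the original snippet):

import numpy as np
from sklearn.metrics.pairwise import manhattan_distances

x = np.array([[1.0], [2.0], [3.0]])
y = np.array([[1.5], [2.0], [2.0]])
# the paired |x_i - y_i| values sit on the diagonal of the pairwise matrix
paired = np.diag(manhattan_distances(x, y))
mae = paired.sum() / y.shape[0]   # (0.5 + 0.0 + 1.0) / 3 = 0.5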
Example #4
def MaternKernel(X, Y=None, gamma=None, p=0):
    """Compute the Matern kernel between X and Y.
    For p = 0, 1, 2 this is the Matern kernel with smoothness
    nu = 1/2, 3/2, 5/2, with r = ||x-y||_1::
        p = 0: K(x, y) = exp(-gamma r)
        p = 1: K(x, y) = (1 + sqrt(3) gamma r) exp(-sqrt(3) gamma r)
        p = 2: K(x, y) = (1 + sqrt(5) gamma r + 5/3 (gamma r)^2) exp(-sqrt(5) gamma r)
    for each pair of rows x in X and y in Y.
    Parameters
    ----------
    X : array of shape (n_samples_X, n_features)
    Y : array of shape (n_samples_Y, n_features)
    gamma : float, defaults to 1 / n_features
    p : int, smoothness index (nu = p + 1/2)
    Returns
    -------
    kernel_matrix : array of shape (n_samples_X, n_samples_Y)
    """
    assert p == int(p), "p must be an integer"

    X, Y = check_pairwise_arrays(X, Y)
    if gamma is None:
        gamma = 1.0 / X.shape[1]

    r = manhattan_distances(X, Y)
    if p == 0:
        K = -gamma * r
        np.exp(K, K)    # exponentiate K in-place
    if p == 1:
        K = -gamma * r * math.sqrt(3)
        np.exp(K, K)    # exponentiate K in-place
        K *= (1+gamma * r * math.sqrt(3))
    if p == 1:
        K = -gamma * r * math.sqrt(5)
        np.exp(K, K)    # exponentiate K in-place
        K *= (1+gamma * r * math.sqrt(5) + 5./3. * (r*gamma)**2)
    return K
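As a sanity check, the p = 1 and p = 2 branches reproduce the standard Matern 3/2 and 5/2 forms with r taken as the L1 distance; a sketch with hypothetical data:

import numpy as np
from sklearn.metrics.pairwise import manhattan_distances

X = np.random.RandomState(0).rand(4, 3)
gamma = 1.0 / X.shape[1]
r = manhattan_distances(X, X)

# Matern 3/2: (1 + sqrt(3) g r) exp(-sqrt(3) g r)  == MaternKernel(X, p=1)
K32 = (1 + np.sqrt(3) * gamma * r) * np.exp(-np.sqrt(3) * gamma * r)

# Matern 5/2: (1 + sqrt(5) g r + 5/3 (g r)^2) exp(-sqrt(5) g r)  == MaternKernel(X, p=2)
K52 = (1 + np.sqrt(5) * gamma * r + 5.0 / 3.0 * (gamma * r) ** 2) * np.exp(-np.sqrt(5) * gamma * r)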
Example #5
def search_query(images, query, proximity_by, rank_size, coding_kind):
    
    # print('The search has begun')
    if coding_kind == 1:
        # frequency of each color in the image, grouped into 32 clusters per color, 96 in total
        t = 'rgb_hist'
    elif coding_kind == 2:
        # describe the image from the colors of the sampled pixels and the colors in the codebook
        t = 'feature_vector_pixels_hard'
    else:
        t = 'feature_vector_hist_hard'
    
    distances = None
    if proximity_by == 'ed':
        for img in images:
            distances=np.append(distances,euclidean_distances(img[t],query[t]))            
    elif proximity_by == 'md':
        for img in images:
            distances=np.append(distances,manhattan_distances(img[t],query[t],sum_over_features=False))

    distances = np.array(np.delete(distances, 0), dtype=float)
    #print('We have the distances')
    #print(distances)
    proximity_vector = None
    for i in range(rank_size):
        a = np.nanargmin(distances)
        proximity_vector = np.append(proximity_vector, int(a))
        distances[a] = np.nan

    proximity_vector = np.array(np.delete(proximity_vector, 0), dtype=int)
    #print(proximity_vector)
    
    return [proximity_vector]# ['obj1__0', 'obj1__10', ...]
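The nanargmin loop above extracts the indices of the rank_size smallest distances in ascending order; an equivalent and more idiomatic selection (a sketch, not part of the original) is a single argsort:

import numpy as np

# indices of the rank_size smallest distances, smallest first
proximity_vector = np.argsort(distances)[:rank_size]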
Example #6
def compute_distance(matrix, v, distance_metric):
    if (distance_metric == 'cosine'):
        return cosine_similarity(matrix, v.reshape(1, -1)).reshape(-1)
    elif (distance_metric == 'l2'):
        return euclidean_distances(matrix, v.reshape(1, -1)).reshape(-1)
    elif (distance_metric == 'l1'):
        return manhattan_distances(matrix, v.reshape(1, -1)).reshape(-1)
    else:
        raise ValueError(
            'Invalid distance metric, must be in [cosine, l1, l2]')
Example #7
def compute_distance_matrix(model_matrix, pred_matrix, distance_matrix_type):
    distance_matrix = []
    if (distance_matrix_type == 'euclidean'):
        distance_matrix = euclidean_distances(model_matrix, pred_matrix)
    elif (distance_matrix_type == 'cosine'):
        distance_matrix = cosine_distances(model_matrix, pred_matrix)
    elif (distance_matrix_type == 'manhatten'):
        distance_matrix = manhattan_distances(model_matrix, pred_matrix)

    return distance_matrix
Example #8
def l1_dist(X, Y):
    '''
        Computes l1 metric between X and Y. May need to modify since I swapped the rows and columns.
        X: A.layers['log1p']
        Y: B.layers['log1p']
    '''
    dist_AA = manhattan_distances(X, X)
    dist_AB = manhattan_distances(X, Y)

    # nkc: kallisto-cellranger distances -- each cell in A vs. the matching
    # cell in B, i.e. the diagonal of the pairwise matrix
    dist_AB = np.diagonal(dist_AB)

    # ncc: kallisto-kallisto distances -- for each cell, the distance to its
    # nearest *other* cell; np.partition(row, 1)[1] is the second-smallest
    # entry of the row (the smallest is the zero self-distance)
    AA = []
    for row in dist_AA:
        val = np.partition(row, 1)[1]
        AA.append(val)
    dist_AA = AA

    return dist_AA, dist_AB
Example #9
def ehd_calculations(features, vector, query_vector):
    dim = calculate_dimensions(vector, 'EHD')
    data_feature_ehd, query_feature_ehd = fit_pca(dim, vector, query_vector)
    # manhattan distance between query image and complete dataset
    man_dis_ehd = pd.DataFrame(manhattan_distances(
        query_feature_ehd, data_feature_ehd, sum_over_features=True).flatten(),
                               columns=['ehd_dis'])
    dataset = pd.concat([features, man_dis_ehd], axis=1)
    # Manhattan Distance sorting
    # man_dis_result = dataset_ehd.sort_values(by='ehd_dis', ascending=True)[[0, 'ehd_dis']].head(20)
    return dataset
Example #10
def scalable_align(cs_lat, cs_lon, swath_lat, swath_lon):
    """  """
    (n, m) = swath_lat.shape

    swath_points = np.stack((swath_lat.flatten(), swath_lon.flatten())).T
    track_points = np.stack((cs_lat, cs_lon), axis=1)

    dist = manhattan_distances(swath_points, track_points)
    mapping = np.unravel_index(np.argmin(dist, axis=0), (n, m))

    return mapping
Example #11
    def __init__(self):
        '''
        Sets the scaling and distance functions that define the Manhattan
        Similarity Metric.

        (1 / (.0000000001 + d)) is from https://stats.stackexchange.com/a/158285
        2000 is manually chosen, by @Zoran, to help scale.
        '''
        self.set_name("Manhattan")
        self.set_scaling(lambda dst: np.tanh(400 * (1 / (.0000000001 + dst))))
        self.set_dist(lambda wm, uv: manhattan_distances(wm, uv)[0][0])
Example #12
 def measure_distance(self, sentences:list):
     
     transformed_sentences:list=self.transformer.extract_text_features_simple(sentences)
         
     #first= manhattan_distances(transformed_sentences,self.featurized_ex1)
     #second=manhattan_distances(transformed_sentences,self.featurized_ex2)
     third=manhattan_distances(transformed_sentences,self.featurized_cl1)
     
     return third #, np.var((first,second,third),axis=0)
     
     
     
Example #13
    def _predict_derivatives(self, x, kx):
        """
        Evaluates the derivatives at a set of points.

        Arguments
        ---------
        x : np.ndarray [n_evals, dim]
            Evaluation point input variable values
        kx : int
            The 0-based index of the input variable with respect to which derivatives are desired.

        Returns
        -------
        y : np.ndarray
            Derivative values.
        """
        kx += 1

        # Initialization
        n_eval, n_features_x = x.shape
        x = (x - self.X_mean) / self.X_std
        # Get pairwise componentwise L1-distances to the input training set
        dx = manhattan_distances(x, Y=self.X_norma.copy(),
                                 sum_over_features=False)
        d = self._componentwise_distance(dx)
        # Compute the correlation function
        r = self.options['corr'](self.optimal_theta, d).reshape(n_eval,self.nt)

        if self.options['corr'].__name__ != 'squar_exp':
            raise ValueError(
            'The derivative is only available for square exponential kernel')
        if self.options['poly'].__name__ == 'constant':
            df = np.array([0])
        elif self.options['poly'].__name__ == 'linear':
            df = np.zeros((self.nx + 1, self.nx))
            df[1:,:] = 1
        else:
            raise ValueError(
                'The derivative is only available for ordinary kriging or '+
                'universal kriging using a linear trend')

        # Beta and gamma = R^-1(y-FBeta)
        beta = self.optimal_par['beta']
        gamma = self.optimal_par['gamma']
        df_dx = np.dot(df.T, beta)
        d_dx=x[:,kx-1].reshape((n_eval,1))-self.X_norma[:,kx-1].reshape((1,self.nt))
        if self.name != 'Kriging' and 'KPLSK' not in self.name:
            theta = np.sum(self.optimal_theta * self.coeff_pls**2,axis=1)
        else:
            theta = self.optimal_theta
        y = (df_dx[0]-2*theta[kx-1]*np.dot(d_dx*r,gamma))*self.y_std/self.X_std[kx-1]

        return y
Example #14
def compute_distances(X, Y, metric, metric_params):

    if metric == 'manhattan':
        distances = manhattan_distances(X, Y)
    elif metric == 'euclidean':
        distances = euclidean_distances(X, Y)
    elif metric == 'cosine':
        distances = cosine_distances(X, Y)
    elif metric == 'bm25':
        distances = bm25_similarity(X, Y, metric_params)

    return distances
Example #15
def evaluate(test_emb, test_id, params):
    unique_ids, unique_counts = np.unique(test_id, return_counts=True)
    unique_ids = unique_ids[unique_counts >= 2]
    good_test_indices = np.in1d(test_id, unique_ids)
    valid_test_embs = test_emb[good_test_indices]
    valid_test_ids = test_id[good_test_indices]
    n_correct_at_k = np.zeros(params.max_k)
    if len(test_emb) < 40000:
        if params.dist == 'cos':
            #distances = 1.-np.dot(valid_test_embs, test_emb.T)
            distances = find_cos_distances(valid_test_embs, test_emb)
        elif params.dist == 'l2':
            distances = find_l2_distances(valid_test_embs, test_emb)
        elif params.dist == 'l1':
            distances = manhattan_distances(valid_test_embs, test_emb)
        elif params.dist == 'max_l1' or params.dist == 'max_l2':
            distances = max_distances(valid_test_embs, test_emb, params.dist)
        elif params.dist == 'softmax_l1':
            distances = softmax_distances(valid_test_embs, test_emb,
                                          params.softmax_l1_beta)
        elif params.dist == 'p_norm':
            distances = p_norm_distances(valid_test_embs, test_emb,
                                         params.p_norm_p)
        elif params.dist == 'max_n':
            distances = max_n_distances(valid_test_embs, test_emb,
                                        params.max_n)
        for idx, valid_test_id in enumerate(valid_test_ids):
            k_sorted_indices = np.argsort(distances[idx])[1:]
            first_correct_position = np.where(
                test_id[k_sorted_indices] == valid_test_id)[0][0]
            if first_correct_position < params.max_k:
                n_correct_at_k[first_correct_position:] += 1
        return 100. * n_correct_at_k / len(valid_test_ids)
    else:
        #if params.dist == 'cos':
        #    metric='cosine'
        #else: metric = 'l2'
        metric = 'l2'
        nn = NearestNeighbors(n_neighbors=params.max_k + 1,
                              metric=metric,
                              algorithm='kd_tree',
                              n_jobs=-1).fit(test_emb)
        distances, indices = nn.kneighbors(valid_test_embs)
        for idx, valid_test_id in enumerate(valid_test_ids):
            k_sorted_indices = indices[idx]
            correct_positions = np.where(
                test_id[k_sorted_indices] == valid_test_id)[0][1:]
            first_correct_position = params.max_k
            if len(correct_positions) > 0:
                first_correct_position = correct_positions[0] - 1
            if first_correct_position < params.max_k:
                n_correct_at_k[first_correct_position:] += 1
        return 100. * n_correct_at_k / len(valid_test_ids)
Example #16
def _multi_gini_seg(data, groups):
    """
    Calculation of Multigroup Gini Segregation index

    Parameters
    ----------

    data   : a pandas DataFrame

    groups : list of strings.
             The variables names in data of the groups of interest of the analysis.

    Returns
    -------

    statistic : float
                Multigroup Gini Segregation Index

    core_data : a pandas DataFrame
                A pandas DataFrame that contains the columns used to perform the estimate.

    Notes
    -----
    Based on Reardon, Sean F., and Glenn Firebaugh. "Measures of multigroup segregation." Sociological methodology 32.1 (2002): 33-67.

    Reference: :cite:`reardon2002measures`.

    """

    core_data = data[groups]
    data = _nan_handle(core_data)

    df = np.array(data)

    K = df.shape[1]

    T = df.sum()

    ti = df.sum(axis=1)
    pik = df / ti[:, None]
    Pk = df.sum(axis=0) / df.sum()
    Is = (Pk * (1 - Pk)).sum()

    elements_sum = np.empty(K)
    for k in range(K):
        aux = np.multiply(np.outer(ti, ti),
                          manhattan_distances(pik[:, k].reshape(-1, 1))).sum()
        elements_sum[k] = aux

    multi_Gini_Seg = elements_sum.sum() / (2 * (T**2) * Is)

    return multi_Gini_Seg, core_data, groups
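For reference, the statistic assembled above is the Reardon-Firebaugh multigroup Gini index, where t_i is the population of unit i, pi_ik the proportion of group k in unit i, P_k the overall share of group k, T the total population, and I the interaction index; the manhattan_distances call on the single column pik[:, k] produces exactly the |pi_ik - pi_jk| matrix:

G = \frac{\sum_{k=1}^{K} \sum_{i=1}^{n} \sum_{j=1}^{n} t_i t_j \, \lvert \pi_{ik} - \pi_{jk} \rvert}{2 T^2 I},
\qquad I = \sum_{k=1}^{K} P_k (1 - P_k)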
Example #17
def manhattan_dist(a, b):
    ######## MANHATTAN DISTANCE ########
    # INPUT
    # a: 1-D array
    # b: 1-D array
    # a and b have the same length
    # OUTPUT
    # d: Manhattan distance between a and b

    # manhattan_distances expects 2-D inputs, so wrap the vectors and
    # unwrap the resulting 1x1 matrix
    d = manhattan_distances([a], [b])[0][0]
    return d
Example #18
 def countError(self, data):
     error = 0
     for cluster in range(self.n_clusters):
         clusterData = []
         for i in range(len(data)):
             if (self.labels_[i] == cluster):
                 clusterData.append(data[i])
         distanceMatrix = manhattan_distances(
             clusterData, [self.clusters_centers_[cluster]])
         # distanceMatrix[i][0]: index 0 because the distance is computed
         # against a single cluster center
         for i in range(len(distanceMatrix)):
             error += distanceMatrix[i][0]
     return error
Example #19
def distance(instance1, instance2, dis_type):
    if instance1 is None or instance2 is None:
        return float("inf")
    sumOfSquares = 0
    if dis_type == 1:
        for i in range(1, len(instance1)):
            sumOfSquares += (instance1[i] - instance2[i])**2
    elif dis_type == 2:
        # drop the leading id/label element before computing the distance
        instance1 = list(instance1)
        instance2 = list(instance2)
        instance1.pop(0)
        instance2.pop(0)
        # manhattan_distances returns a 1x1 matrix for two single rows
        sumOfSquares += manhattan_distances([instance1], [instance2])[0][0]
    elif dis_type == 3:
        instance1 = list(instance1)
        instance2 = list(instance2)
        instance1.pop(0)
        instance2.pop(0)
        sumOfSquares += (1 - cosine_similarity([instance1], [instance2])[0][0])
    elif dis_type == 4:
        instance1 = list(instance1)
        instance2 = list(instance2)
        instance1.pop(0)
        instance2.pop(0)
        sumOfSquares += (1 - jaccard(instance1, instance2))
    return sumOfSquares
Example #20
def calculate_similarity(beer1, beer2):
    # find common reviewers
    beer_1_reviewers = df[df.beer_name==beer1].review_profilename.unique()
    beer_2_reviewers = df[df.beer_name==beer2].review_profilename.unique()
    common_reviewers = set(beer_1_reviewers).intersection(beer_2_reviewers)

    # get reviews
    beer_1_reviews = get_beer_reviews(beer1, common_reviewers)
    beer_2_reviews = get_beer_reviews(beer2, common_reviewers)
    dists = []
    for f in ALL_FEATURES:
        dists.append(manhattan_distances(beer_1_reviews[f], beer_2_reviews[f])[0][0])
    
    return dists
Example #21
def investor_helper(betas):
    # weights for the market portfolio
    mkt = betas.sum(axis=0) / betas.sum()
    # "AUM" weights to aggregate market portfolio
    x = betas.sum(axis=1)
    aum = x / x.sum()
    nbetas = betas / x[:, None]

    # distance to AUM weighted market portfolio
    l2 = cosine_similarity(X=betas, Y=np.expand_dims(mkt, axis=0)).flatten()
    l1 = 1 - manhattan_distances(X=nbetas,
                                 Y=np.expand_dims(mkt, axis=0),
                                 sum_over_features=True).flatten() / 2
    return (aum, l2, l1)
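Since each row of nbetas and the market weight vector mkt both sum to one, half of their L1 distance is the total variation distance, which lies in [0, 1]; the l1 score above is therefore a similarity:

\mathrm{sim}(p, q) = 1 - \tfrac{1}{2} \lVert p - q \rVert_1 \in [0, 1]
\quad \text{for weight vectors with } \textstyle\sum_i p_i = \sum_i q_i = 1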
Example #22
 def uclidean_distance(self, centroids, vector, key, cluster):
     # note: despite its name, this method uses the Manhattan distance
     result = {}
     for i in range(len(centroids)):
         try:
             result['c' + str(i)] = manhattan_distances(
                 np.array(centroids[i]).reshape(1, -1),
                 np.array(vector).reshape(1, -1))
             # result['c'+str(i)] = distance.euclidean(centroids[i], vector)
         except Exception:
             print(centroids[i])
             exit()
     result = sorted(result.items(), key=lambda x: x[1])
     result = dict(result)
     cluster[list(result.keys())[0]].append(key)
Example #23
def get_actor_actor_similarity_matrix(similarity_measure=3, consider_zero_tag_vectors=True):
    actor_list, actor_tag_matrix = __get_actor_tag_matrix__(consider_zero_tag_vectors)
    actor_actor_similarity_matrix = [[[] for _ in range(len(actor_tag_matrix))]
                                     for _ in range(len(actor_tag_matrix))]
    if similarity_measure == 0:

        # inverse of manhattan distance matrix generation
        actor_actor_similarity_matrix = \
            __get_similarity_from_distance_matrix__(manhattan_distances(actor_tag_matrix))

    elif similarity_measure == 1:

        # cosine similarity matrix generation
        actor_actor_similarity_matrix = cosine_similarity(actor_tag_matrix)

    elif similarity_measure == 2:

        # inverse of cosine distance matrix generation
        actor_actor_similarity_matrix = \
            __get_similarity_from_distance_matrix__(cosine_distances(actor_tag_matrix))

    elif similarity_measure == 3:

        # Mahalanobis distance, inverted into a similarity as 1 / (1 + distance)
        # once the distance is calculated; note that scipy expects the
        # *inverse* covariance matrix as VI
        cov_matrix = np.cov(np.transpose(actor_tag_matrix))
        inv_cov_matrix = np.linalg.inv(cov_matrix)
        for actor_index in range(len(actor_tag_matrix)):
            for actor_index_2 in range(len(actor_tag_matrix)):
                dist = distance.mahalanobis(u=actor_tag_matrix[actor_index],
                                            v=actor_tag_matrix[actor_index_2],
                                            VI=inv_cov_matrix)
                if dist == 0:
                    actor_actor_similarity_matrix[actor_index][actor_index_2] = 1
                else:
                    actor_actor_similarity_matrix[actor_index][actor_index_2] = 1/(1 + dist)

    else:

        # Euclidean distance measure
        # uses the distance function given above that returns reciprocal values of distance
        # making it mimic a similarity measure
        for actor_index in range(len(actor_tag_matrix)):
            for actor_index_2 in range(len(actor_tag_matrix)):
                dist = __get_actor_actor_euclidean_similarity__(
                    actor_tag_matrix[actor_index],
                    actor_tag_matrix[actor_index_2])
                actor_actor_similarity_matrix[actor_index][actor_index_2] = dist

    return actor_list, actor_actor_similarity_matrix
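The helper __get_similarity_from_distance_matrix__ is referenced above but not shown; a minimal sketch consistent with the "inverse of distance" comments (a hypothetical implementation, mirroring the 1 / (1 + d) transform used in the Mahalanobis branch) could be:

import numpy as np

def __get_similarity_from_distance_matrix__(distance_matrix):
    # hypothetical sketch: map each distance d to a similarity in (0, 1],
    # with d == 0 giving similarity 1
    return 1.0 / (1.0 + np.asarray(distance_matrix))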
Example #24
def TFIDFManhattanDistance(doc, alldoc):
    cleantext = alldocclean_(alldoc)
    tfidf = TfidfVectorizer()
    tfs = tfidf.fit_transform(cleantext)
    df = pd.DataFrame(tfs.toarray(), columns=tfidf.get_feature_names_out())
    mansim = manhattan_distances(df)
    mansimdf = pd.DataFrame(mansim)
    mansimpair = []
    for index in range(len(alldoc)):
        if (doc != index):
            temp = []
            temp.append(index)
            temp.append(mansimdf.at[index, doc])
            mansimpair.append(temp)
    return mansimpair
Example #25
def _multi_gini_seg(data, groups):
    """Calculate Multigroup Gini Segregation index.

    Parameters
    ----------
    data   : a pandas DataFrame
        dataframe holding group data
    groups : list of strings.
        The variables names in data of the groups of interest of the analysis.

    Returns
    -------
    statistic : float
        Multigroup Gini Segregation Index
    core_data : a pandas DataFrame
        A pandas DataFrame that contains the columns used to perform the estimate.

    Notes
    -----
    Based on Reardon, Sean F., and Glenn Firebaugh. "Measures of multigroup segregation." Sociological methodology 32.1 (2002): 33-67.

    Reference: :cite:`reardon2002measures`.

    """
    core_data = data[groups]
    df = np.array(core_data)

    K = df.shape[1]

    T = df.sum()

    ti = df.sum(axis=1)
    pik = df / ti[:, None]
    pik = np.nan_to_num(
        pik)  # Replace NaN from zerodivision when unit has no population
    Pk = df.sum(axis=0) / df.sum()
    Is = (Pk * (1 - Pk)).sum()

    elements_sum = np.empty(K)
    for k in range(K):
        aux = np.multiply(np.outer(ti, ti),
                          manhattan_distances(pik[:, k].reshape(-1, 1))).sum()
        elements_sum[k] = aux

    multi_Gini_Seg = elements_sum.sum() / (2 * (T**2) * Is)
    if isinstance(data, GeoDataFrame):
        core_data = data[[data.geometry.name]].join(core_data)
    return multi_Gini_Seg, core_data, groups
Example #26
def write_distances_manhattan(visual_desc, file2):
	distance_list = []
	id_list = []
	for input in visual_desc:
		for first in file2:
			dist = manhattan_distances(input[2:].reshape(1, -1), first[2:].reshape(1, -1))[0][0]
			distance_list.append(dist)
			id_list.append([input[0], input[1], first[0], first[1]])
	# build the result frame directly (DataFrame.append is deprecated)
	distance = pd.DataFrame(id_list, columns=['Input Location', 'Input Id', 'Second Location', 'Second Id'])
	distance.insert(loc=0, column='Distance', value=distance_list)
	distance_sorted = distance.sort_values('Distance', ascending=True)
	return distance_sorted
Example #27
 def nn_ind(self, color_hist, num):
     """
     Exact nearest neighbor search through exhaustive comparison.
     """
     if self.distance_metric == 'manhattan':
         dists = manhattan_distances(color_hist, self.hists_reduced)
     elif self.distance_metric == 'euclidean':
         dists = euclidean_distances(color_hist, self.hists_reduced, squared=True)
     elif self.distance_metric == 'chi_square':
         dists = -additive_chi2_kernel(color_hist, self.hists_reduced)
     
     dists = dists.flatten()
     nn_ind = np.argsort(dists).flatten()[:num]
     nn_dists = dists[nn_ind]
     
     return nn_ind, nn_dists
Example #29
def KDE(x, kn_negs, Z, beta):
    '''
    Calculate the density estimate with a Laplacian-kernel-based kernel
    density estimator, p(x) = mean_i exp(-beta * ||x - x_i||_1) / Z
    '''
    num_neg_instances = 0
    p = 0
    x = np.reshape(x, (1, x.shape[0]))

    for instance in kn_negs:
        num_neg_instances += 1
        inst = np.reshape(instance, (1, instance.shape[0]))
        dist = manhattan_distances(x, inst)[0][0]   # scalar L1 distance
        p = p + math.exp(-1 * beta * dist)

    p = p / (Z * num_neg_instances)
    return p
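In formula form, with n the number of instances in kn_negs and Z the normalization constant passed in, the estimate returned above is:

\hat{p}(x) = \frac{1}{Z \, n} \sum_{i=1}^{n} \exp\!\left(-\beta \, \lVert x - x_i \rVert_1\right)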
Example #30
    def manhattan_from_data(training_data, test_data):
        """Manhattan distance

        Parameters
        ----------
        training_data : dict

        test_data : dict

        Returns
        -------
        distance : dict
        """
        training_data = pd.DataFrame.from_dict(training_data, orient='index')
        test_data = pd.DataFrame.from_dict(test_data, orient='index')
        return manhattan_distances(test_data, training_data)
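A minimal usage sketch (hypothetical keys and values, assuming the method is a static helper so it can be called without an instance):

import pandas as pd
from sklearn.metrics.pairwise import manhattan_distances

training_data = {'a': [0.0, 1.0], 'b': [2.0, 2.0]}
test_data = {'q': [1.0, 1.0]}
# rows follow the dict keys; the result has shape (n_test, n_train)
D = manhattan_from_data(training_data, test_data)   # [[1.0, 2.0]]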
Example #31
    def gradient(self, x):
        """
        Calculate the gradient of the posterior mean and variance
        Note that the nugget effect will not the change the computation below
        """

        check_is_fitted(self, 'X')

        # Check input shapes
        x = np.atleast_2d(x)
        n_eval, _ = x.shape
        n_samples, n_features = self.X.shape

        if _ != n_features:
            raise Exception('x does not have the right size!')

        if n_eval != 1:
            raise Exception('x must be a vector!')

        # trend and its Jacobian
        f = self.regr(x).T
        f_dx = self.regr_dx(x)

        # correlation and its Jacobian
        d = manhattan_distances(x, Y=self.X, sum_over_features=False)
        r = self.corr(self.theta_, d).reshape(n_eval, n_samples)
        r_dx = self.corr_dx(x, X=self.X, r=r)

        # gradient of the posterior mean
        y_dx = dot(f_dx, self.beta) + dot(r_dx, self.gamma)

        # auxiliary variable: rt = C^-1 * r
        rt = solve_triangular(self.C, r.T, lower=True)
        rt_dx = solve_triangular(self.C, r_dx.T, lower=True).T

        # auxiliary variable: u = Ft^T * rt - f
        u = dot(self.Ft.T, rt) - f
        u_dx = dot(rt_dx, self.Ft) - f_dx

        mse_dx = -dot(rt_dx, rt)  # for Simple Kriging
        if self.beta0 is None:  # for Universal Kriging
            Ft2inv = inv(dot(self.Ft.T, self.Ft))
            mse_dx += dot(u_dx, Ft2inv).dot(u)

        mse_dx = 2.0 * self.sigma2 * mse_dx

        return y_dx, mse_dx
Example #32
 def process_similarity(self, similarity):
     if similarity == "cosine":
         x, y = np.triu_indices(self._similarity_matrix.shape[0], k=1)
         self._similarity_matrix[x, y] = cosine_similarity(
             self._data.sp_i_train_ratings.T)[x, y]
     elif similarity == "dot":
         self._similarity_matrix = (
             self._data.sp_i_train_ratings.T
             @ self._data.sp_i_train_ratings).toarray()
     elif similarity == "euclidean":
         x, y = np.triu_indices(self._similarity_matrix.shape[0], k=1)
         self._similarity_matrix[x, y] = (
             1 /
             (1 + euclidean_distances(self._data.sp_i_train_ratings.T)))[x,
                                                                         y]
     elif similarity == "manhattan":
         x, y = np.triu_indices(self._similarity_matrix.shape[0], k=1)
         self._similarity_matrix[x, y] = (
             1 /
             (1 + manhattan_distances(self._data.sp_i_train_ratings.T)))[x,
                                                                         y]
     elif similarity == "haversine":
         x, y = np.triu_indices(self._similarity_matrix.shape[0], k=1)
         self._similarity_matrix[x, y] = (
             1 /
             (1 + haversine_distances(self._data.sp_i_train_ratings.T)))[x,
                                                                         y]
     elif similarity == "chi2":
         x, y = np.triu_indices(self._similarity_matrix.shape[0], k=1)
         self._similarity_matrix[x, y] = (
             1 / (1 + chi2_kernel(self._data.sp_i_train_ratings.T)))[x, y]
     elif similarity in ['cityblock', 'l1', 'l2']:
         x, y = np.triu_indices(self._similarity_matrix.shape[0], k=1)
         self._similarity_matrix[x, y] = (1 / (1 + pairwise_distances(
             self._data.sp_i_train_ratings.T, metric=similarity)))[x, y]
     elif similarity in [
             'braycurtis', 'canberra', 'chebyshev', 'correlation', 'dice',
             'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'minkowski',
             'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener',
             'sokalsneath', 'sqeuclidean', 'yule'
     ]:
         x, y = np.triu_indices(self._similarity_matrix.shape[0], k=1)
         self._similarity_matrix[x, y] = (1 / (1 + pairwise_distances(
             self._data.sp_i_train_ratings.T.toarray(), metric=similarity))
                                          )[x, y]
     else:
         raise Exception("Not implemented similarity")
    def predict(self, df):
        predictions = np.zeros(len(df))
        callersInPipeline = list()
        
        for i in range(0, self.num_clusters):
            weight = len([1 for num in self.clusters  if num == i])
            distances = manhattan_distances(df.T, self.centers[i])
            bestIndex = np.argmin(distances)
            bestCallerName = df.columns[bestIndex]
            representativeCaller = df[bestCallerName]
            callersInPipeline.append(bestCallerName)
            predictions += weight * representativeCaller

        predictions /= len(self.clusters)
        predictions = self.transformToPredictions(predictions) 
        
        return predictions, callersInPipeline
Example #34
def get_similarities(dtm, labels, sim_measure):
    # Try different distance metrics
    if sim_measure == 'cosine':
        dist = 1 - cosine_similarity(dtm)
    elif sim_measure == 'euclidean':
        dist = euclidean_distances(dtm)
    elif sim_measure == 'manhattan':
        dist = manhattan_distances(dtm)
    dist_df = pd.DataFrame((1 - dist), columns=labels, index=labels)
    # dist_df = dist_df[sorted(labels)]
    # dist_df = dist_df.sort_index()
    dist_df.to_csv('{}/distance-{}-matrix.csv'.format(path, sim_measure))

    # Hierarchical Clustering
    get_hierarchical_clustering(dist, labels, path)

    return dist_df
Example #35
def get_course_specific_similarities(course, dtm, labels, sim_measure):
    dtm = dtm.toarray()
    index = labels.index(course)
    # Try different distance metrics
    if sim_measure == 'cosine':
        dist = 1 - cosine_similarity(dtm, [dtm[index]])
    elif sim_measure == 'euclidean':
        dist = euclidean_distances(dtm, [dtm[index]])
    elif sim_measure == 'manhattan':
        dist = manhattan_distances(dtm, [dtm[index]])
    dist_df = pd.DataFrame((1 - dist), columns=[course], index=labels)
    # dist_df = dist_df[sorted(labels)]
    # dist_df = dist_df.sort_index()
    dist_df.to_csv('{}/distance-{}-matrix-{}.csv'.format(
        path, sim_measure, course))

    return dist_df
Example #36
    def __init__(self,
                 base_estimator=DecisionTreeClassifier(max_depth=1),
                 base_weighter=KernelMeanMatching(),
                 similarity="euclidean"):
        self._base_estimator = base_estimator
        self._base_weighter = base_weighter

        if similarity == "euclidean":
            self._similarity = lambda x, y: 1 / (1 + pw.euclidean_distances(
                x, y))
        elif similarity == "cosine":
            self._similarity = pw.cosine_similarity
        elif similarity == "l1":
            self._similarity = lambda x, y: 1 / (1 + pw.manhattan_distances(
                x, y))
        else:
            self._similarity = similarity
Example #37
def CM(imgsvec1, imgsvec2, imgpairs=True):

    # calculate the pairwise Manhattan distances between the two vector matrices
    allDis = manhattan_distances(imgsvec1, imgsvec2)

    # for each vector of the first matrix, the average distance to all vectors of the second matrix
    imgDis = numpy.mean(allDis, axis=1)

    # average the per-image distances from the previous step to get the final combined distance
    finalDis = numpy.mean(imgDis, axis=0)

    #if return most similar image pairs is True, also return the most similar images
    if imgpairs:

        return finalDis, sim_images(allDis)

    return finalDis
Example #38
def nn(feat, feats, distance='euclidean', K=-1):
    """
    Exact nearest neighbor search through exhaustive comparison.
    """
    if distance == 'manhattan':
        dists = metrics.manhattan_distances(feat, feats)
    elif distance == 'euclidean':
        dists = metrics.euclidean_distances(feat, feats, squared=True)
    elif distance == 'chi_square':
        dists = -metrics.additive_chi2_kernel(feat, feats)

    dists = dists.flatten()
    if K > 0:
        # bottleneck's old argpartsort API (newer versions provide argpartition)
        nn_ind = bn.argpartsort(dists, K).flatten()[:K]
        nn_ind = nn_ind[np.argsort(dists[nn_ind])]
    else:
        nn_ind = np.argsort(dists)
    nn_dist = dists[nn_ind]

    return nn_ind, nn_dist
Example #39
    def get_nearst_n_hist_index(self, color_hist, num=10):
        """
        return n closest nearest to the color_hist and the distance to it
        :param color_hist: hist of the color palette
        :param num: n of the nearest to return
        :return:
        nearst_index: the index of the neighbors
        nearst_dists: the distance to the neighbors
        """
        #------ manhattan_distance ------
        #------ 曼哈顿距离
        #------ 直方图相似度评估
        dists = manhattan_distances(color_hist, self.img_set.img_set_hist)
        dists = dists.flatten()
        nearst_index = np.argsort(dists).flatten()[:num]
        nearst_dists = dists[nearst_index]
        nearst_file_name = []
        for i in nearst_index:
            nearst_file_name.append(self.img_set.img_load_order[i])
        if self.debug:

            print 'pic file names', nearst_file_name
        return nearst_index, nearst_dists
Example #40
    def _nn(self, image_id, feature, distance='cosine', K=-1):
        """
        Exact nearest neighbor seach through exhaustive comparison.
        """
        # S = self.S[feature]
        feats = self.features[feature]
        feat = feats[self.index.index(image_id)]

        if distance == 'manhattan':
            dists = metrics.manhattan_distances(feat, feats)

        elif distance == 'euclidean':
            dists = metrics.euclidean_distances(feat, feats, squared=True)

        elif distance == 'chi_square':
            dists = -metrics.additive_chi2_kernel(feat, feats)

        elif distance == 'dot':
            dists = -np.dot(feats, feat)

        elif distance == 'cosine':
            feats_norm = self.features_norm[feature]
            dists = -np.dot(feats, feat) / feats_norm / np.linalg.norm(feat, 2)

        elif distance == 'projected':
            feats = self.features_proj[feature]
            feat = feats[self.index.index(image_id)]
            dists = sklearn.utils.extmath.row_norms(feats - feat)

        dists = dists.flatten()
        if K > 0:
            nn_ind = np.argsort(dists).flatten()[:K]
        else:
            nn_ind = np.argsort(dists)
        nn_dist = dists[nn_ind]

        return nn_ind, nn_dist
Example #41
def test_pairwise_distances():
    # Test the pairwise_distance helper function.
    rng = np.random.RandomState(0)
    # Euclidean distance should be equivalent to calling the function.
    X = rng.random_sample((5, 4))
    S = pairwise_distances(X, metric="euclidean")
    S2 = euclidean_distances(X)
    assert_array_almost_equal(S, S2)
    # Euclidean distance, with Y != X.
    Y = rng.random_sample((2, 4))
    S = pairwise_distances(X, Y, metric="euclidean")
    S2 = euclidean_distances(X, Y)
    assert_array_almost_equal(S, S2)
    # Test with tuples as X and Y
    X_tuples = tuple([tuple([v for v in row]) for row in X])
    Y_tuples = tuple([tuple([v for v in row]) for row in Y])
    S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean")
    assert_array_almost_equal(S, S2)
    # "cityblock" uses sklearn metric, cityblock (function) is scipy.spatial.
    S = pairwise_distances(X, metric="cityblock")
    S2 = pairwise_distances(X, metric=cityblock)
    assert_equal(S.shape[0], S.shape[1])
    assert_equal(S.shape[0], X.shape[0])
    assert_array_almost_equal(S, S2)
    # The manhattan metric should be equivalent to cityblock.
    S = pairwise_distances(X, Y, metric="manhattan")
    S2 = pairwise_distances(X, Y, metric=cityblock)
    assert_equal(S.shape[0], X.shape[0])
    assert_equal(S.shape[1], Y.shape[0])
    assert_array_almost_equal(S, S2)
    # Low-level function for manhattan can divide in blocks to avoid
    # using too much memory during the broadcasting
    S3 = manhattan_distances(X, Y, size_threshold=10)
    assert_array_almost_equal(S, S3)
    # Test cosine as a string metric versus cosine callable
    # "cosine" uses sklearn metric, cosine (function) is scipy.spatial
    S = pairwise_distances(X, Y, metric="cosine")
    S2 = pairwise_distances(X, Y, metric=cosine)
    assert_equal(S.shape[0], X.shape[0])
    assert_equal(S.shape[1], Y.shape[0])
    assert_array_almost_equal(S, S2)
    # Test with sparse X and Y,
    # currently only supported for Euclidean, L1 and cosine.
    X_sparse = csr_matrix(X)
    Y_sparse = csr_matrix(Y)
    S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean")
    S2 = euclidean_distances(X_sparse, Y_sparse)
    assert_array_almost_equal(S, S2)
    S = pairwise_distances(X_sparse, Y_sparse, metric="cosine")
    S2 = cosine_distances(X_sparse, Y_sparse)
    assert_array_almost_equal(S, S2)
    S = pairwise_distances(X_sparse, Y_sparse.tocsc(), metric="manhattan")
    S2 = manhattan_distances(X_sparse.tobsr(), Y_sparse.tocoo())
    assert_array_almost_equal(S, S2)
    S2 = manhattan_distances(X, Y)
    assert_array_almost_equal(S, S2)
    # Test with scipy.spatial.distance metric, with a kwd
    kwds = {"p": 2.0}
    S = pairwise_distances(X, Y, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, Y, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)
    # same with Y = None
    kwds = {"p": 2.0}
    S = pairwise_distances(X, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)
    # Test that scipy distance metrics throw an error if sparse matrix given
    assert_raises(TypeError, pairwise_distances, X_sparse, metric="minkowski")
    assert_raises(TypeError, pairwise_distances, X, Y_sparse,
                  metric="minkowski")

    # Test that a value error is raised if the metric is unknown
    assert_raises(ValueError, pairwise_distances, X, Y, metric="blah")
Example #42
from flask import Flask, render_template, request
from flaskext.markdown import Markdown
import json
import folium
import glob

# imports for the lyrics classifier example
from lyrics_classifier import LyricsClf
lclf = LyricsClf('classifier.p')

# imports for beer similarity
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import manhattan_distances
beers = pd.read_csv('beer.csv').set_index('Beer')
distances = manhattan_distances(beers)

# setup application
app = Flask(__name__)
app.debug = True
Markdown(app)

def blog_posts():
    """this function should search for the blog posts that exist in
    the folder /posts, and create a list of tuples with the cleaned up title
    and the filename.
    ex: ['2015-04-06_my_first_blog_post']
    """
    files = glob.glob("posts/*.md")
    return [f.split('/')[1].split('.')[0] for f in files]
Example #43
def test_pairwise_distances():
    # Test the pairwise_distance helper function.
    rng = np.random.RandomState(0)

    # Euclidean distance should be equivalent to calling the function.
    X = rng.random_sample((5, 4))
    S = pairwise_distances(X, metric="euclidean")
    S2 = euclidean_distances(X)
    assert_array_almost_equal(S, S2)

    # Euclidean distance, with Y != X.
    Y = rng.random_sample((2, 4))
    S = pairwise_distances(X, Y, metric="euclidean")
    S2 = euclidean_distances(X, Y)
    assert_array_almost_equal(S, S2)

    # Test with tuples as X and Y
    X_tuples = tuple([tuple([v for v in row]) for row in X])
    Y_tuples = tuple([tuple([v for v in row]) for row in Y])
    S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean")
    assert_array_almost_equal(S, S2)

    # Test haversine distance
    # The data should be valid latitude and longitude
    X = rng.random_sample((5, 2))
    X[:, 0] = (X[:, 0] - 0.5) * 2 * np.pi/2
    X[:, 1] = (X[:, 1] - 0.5) * 2 * np.pi
    S = pairwise_distances(X, metric="haversine")
    S2 = haversine_distances(X)
    assert_array_almost_equal(S, S2)

    # Test haversine distance, with Y != X
    Y = rng.random_sample((2, 2))
    Y[:, 0] = (Y[:, 0] - 0.5)*2*np.pi/2
    Y[:, 1] = (Y[:, 1] - 0.5)*2*np.pi
    S = pairwise_distances(X, Y, metric="haversine")
    S2 = haversine_distances(X, Y)
    assert_array_almost_equal(S, S2)

    # "cityblock" uses scikit-learn metric, cityblock (function) is
    # scipy.spatial.
    S = pairwise_distances(X, metric="cityblock")
    S2 = pairwise_distances(X, metric=cityblock)
    assert_equal(S.shape[0], S.shape[1])
    assert_equal(S.shape[0], X.shape[0])
    assert_array_almost_equal(S, S2)

    # The manhattan metric should be equivalent to cityblock.
    S = pairwise_distances(X, Y, metric="manhattan")
    S2 = pairwise_distances(X, Y, metric=cityblock)
    assert_equal(S.shape[0], X.shape[0])
    assert_equal(S.shape[1], Y.shape[0])
    assert_array_almost_equal(S, S2)

    # Test cosine as a string metric versus cosine callable
    # The string "cosine" uses sklearn.metric,
    # while the function cosine is scipy.spatial
    S = pairwise_distances(X, Y, metric="cosine")
    S2 = pairwise_distances(X, Y, metric=cosine)
    assert_equal(S.shape[0], X.shape[0])
    assert_equal(S.shape[1], Y.shape[0])
    assert_array_almost_equal(S, S2)

    # Test with sparse X and Y,
    # currently only supported for Euclidean, L1 and cosine.
    X_sparse = csr_matrix(X)
    Y_sparse = csr_matrix(Y)
    S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean")
    S2 = euclidean_distances(X_sparse, Y_sparse)
    assert_array_almost_equal(S, S2)
    S = pairwise_distances(X_sparse, Y_sparse, metric="cosine")
    S2 = cosine_distances(X_sparse, Y_sparse)
    assert_array_almost_equal(S, S2)
    S = pairwise_distances(X_sparse, Y_sparse.tocsc(), metric="manhattan")
    S2 = manhattan_distances(X_sparse.tobsr(), Y_sparse.tocoo())
    assert_array_almost_equal(S, S2)
    S2 = manhattan_distances(X, Y)
    assert_array_almost_equal(S, S2)

    # Test with scipy.spatial.distance metric, with a kwd
    kwds = {"p": 2.0}
    S = pairwise_distances(X, Y, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, Y, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)

    # same with Y = None
    kwds = {"p": 2.0}
    S = pairwise_distances(X, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)

    # Test that scipy distance metrics throw an error if sparse matrix given
    assert_raises(TypeError, pairwise_distances, X_sparse, metric="minkowski")
    assert_raises(TypeError, pairwise_distances, X, Y_sparse,
                  metric="minkowski")

    # Test that a value error is raised if the metric is unknown
    assert_raises(ValueError, pairwise_distances, X, Y, metric="blah")
 def _e_step(self, X, w):
     self.labels_ = array([v[1] * v[0] for v in zip(w, manhattan_distances(X, self.cluster_centers_))]).argmin(axis=1)
Example #45
 def _e_step(self):
     self.labels_ = manhattan_distances(self.vectors, self.cluster_centers_).argmin(axis=1)
Example #46
def med_distance_test(X, Y):
	print('medutilDist: testing cross distances function (equivalent to manhattan_distances)')
	D = manhattan_distances(X, Y, sum_over_features=False)

	return D
Example #47
def euclidean_by_hand(dtm):
    # hypothetical name: the opening of this function was truncated in the
    # source -- it computes pairwise Euclidean distances row by row
    n = dtm.shape[0]
    dist = np.zeros((n, n))
    for i in range(n):
        for j in range(n):
            x, y = dtm[i, :], dtm[j, :]
            dist[i, j] = np.sqrt(np.sum((x - y)**2))
    return dist


dist_eukl = euclidean_distances(dtm)
np.round(dist_eukl, 1)

dist_cos = 1 - cosine_similarity(dtm)
np.round(dist_cos, 2)

dist_man = manhattan_distances(dtm)
np.round(dist_man, 1)

norms = np.sqrt(np.sum(dtm * dtm, axis=1, keepdims=True))

dtm_normed = dtm / norms
similarities = np.dot(dtm_normed, dtm_normed.T)
np.round(similarities, 2)


# Visualizing distances
Example #48
print(dells_activities_data_frame.shape)

# use dictionary object for mapping the response/target variable
activity_to_binary = {'NO' : 0, 'YES' : 1, '': 0}
for iname in binary_variable_names:
    dells_activities_data_frame[iname] = \
        dells_activities_data_frame[iname].map(activity_to_binary)
print(dells_activities_data_frame[0:10])  # examine the first 10 rows of data    
 
# convert DataFrame to numpy array representation of activities matrix
activities_binary_matrix = dells_activities_data_frame.to_numpy().transpose()
print(type(activities_binary_matrix))
print(activities_binary_matrix.shape)

# compute distance matrix
distance_matrix = manhattan_distances(activities_binary_matrix)
print(distance_matrix.shape)

# apply the multidimensional scaling algorithm and plot the map
mds_method = manifold.MDS(n_components = 2, random_state = 9999,\
    dissimilarity = 'precomputed')
mds_fit = mds_method.fit(distance_matrix)  
mds_coordinates = mds_method.fit_transform(distance_matrix) 
                                                                                                                                  
activity_names = ['Shopping', 'Antiquing',
'Site Seeing', 'Fine Dining', 'Casual Dining',
'Family Style Dining', 'Fast Food Dining', 'Museums',
'Indoor Pool', 'Outdoor Pool', 'Hiking', 'Gambling',
'Boating/Swimming', 'Fishing', 'Golfing', 'Boat Tours',
'Ride the Ducks', 'Amusement Park', 'Minigolf', 'Go-carting',
'Waterpark', 'Circus World', 'Tommy Bartlett Ski Show']
# (the remaining activity names were truncated in the source)
Example #49
 def _e_step(self, X):
     self.labels_ = manhattan_distances(X, self.cluster_centers_).argmin(axis=1)
Example #50
def test_pairwise_distances():
    """ Test the pairwise_distance helper function. """
    rng = np.random.RandomState(0)
    # Euclidean distance should be equivalent to calling the function.
    X = rng.random_sample((5, 4))
    S = pairwise_distances(X, metric="euclidean")
    S2 = euclidean_distances(X)
    assert_array_almost_equal(S, S2)
    # Euclidean distance, with Y != X.
    Y = rng.random_sample((2, 4))
    S = pairwise_distances(X, Y, metric="euclidean")
    S2 = euclidean_distances(X, Y)
    assert_array_almost_equal(S, S2)
    # Test with tuples as X and Y
    X_tuples = tuple([tuple([v for v in row]) for row in X])
    Y_tuples = tuple([tuple([v for v in row]) for row in Y])
    S2 = pairwise_distances(X_tuples, Y_tuples, metric="euclidean")
    assert_array_almost_equal(S, S2)
    # "cityblock" uses sklearn metric, cityblock (function) is scipy.spatial.
    S = pairwise_distances(X, metric="cityblock")
    S2 = pairwise_distances(X, metric=cityblock)
    assert_equal(S.shape[0], S.shape[1])
    assert_equal(S.shape[0], X.shape[0])
    assert_array_almost_equal(S, S2)
    # The manhattan metric should be equivalent to cityblock.
    S = pairwise_distances(X, Y, metric="manhattan")
    S2 = pairwise_distances(X, Y, metric=cityblock)
    assert_equal(S.shape[0], X.shape[0])
    assert_equal(S.shape[1], Y.shape[0])
    assert_array_almost_equal(S, S2)
    # manhattan does not support sparse matrices atm.
    assert_raises(ValueError, pairwise_distances, csr_matrix(X),
                  metric="manhattan")
    # Low-level function for manhattan can divide in blocks to avoid
    # using too much memory during the broadcasting
    S3 = manhattan_distances(X, Y, size_threshold=10)
    assert_array_almost_equal(S, S3)
    # Test cosine as a string metric versus cosine callable
    # "cosine" uses sklearn metric, cosine (function) is scipy.spatial
    S = pairwise_distances(X, Y, metric="cosine")
    S2 = pairwise_distances(X, Y, metric=cosine)
    assert_equal(S.shape[0], X.shape[0])
    assert_equal(S.shape[1], Y.shape[0])
    assert_array_almost_equal(S, S2)
    # Tests that precomputed metric returns pointer to, and not copy of, X.
    S = np.dot(X, X.T)
    S2 = pairwise_distances(S, metric="precomputed")
    assert_true(S is S2)
    # Test with sparse X and Y,
    # currently only supported for euclidean and cosine
    X_sparse = csr_matrix(X)
    Y_sparse = csr_matrix(Y)
    S = pairwise_distances(X_sparse, Y_sparse, metric="euclidean")
    S2 = euclidean_distances(X_sparse, Y_sparse)
    assert_array_almost_equal(S, S2)
    S = pairwise_distances(X_sparse, Y_sparse, metric="cosine")
    S2 = cosine_distances(X_sparse, Y_sparse)
    assert_array_almost_equal(S, S2)
    # Test with scipy.spatial.distance metric, with a kwd
    kwds = {"p": 2.0}
    S = pairwise_distances(X, Y, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, Y, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)
    # same with Y = None
    kwds = {"p": 2.0}
    S = pairwise_distances(X, metric="minkowski", **kwds)
    S2 = pairwise_distances(X, metric=minkowski, **kwds)
    assert_array_almost_equal(S, S2)
    # Test that scipy distance metrics throw an error if sparse matrix given
    assert_raises(TypeError, pairwise_distances, X_sparse, metric="minkowski")
    assert_raises(TypeError, pairwise_distances, X, Y_sparse,
                  metric="minkowski")
 def predict(self, X):
     return manhattan_distances(X, self.cluster_centers_).argmin(axis=1)
Example #52
    def predict(self, X, eval_MSE=False, batch_size=None):
        """
        This function evaluates the Gaussian Process model at x.

        Parameters
        ----------
        X : array_like
            An array with shape (n_eval, n_features) giving the point(s) at
            which the prediction(s) should be made.

        eval_MSE : boolean, optional
            A boolean specifying whether the Mean Squared Error should be
            evaluated or not.
            Default assumes evalMSE = False and evaluates only the BLUP (mean
            prediction).

        batch_size : integer, optional
            An integer giving the maximum number of points that can be
            evaluated simultaneously (depending on the available memory).
            Default is None so that all given points are evaluated at the same
            time.

        Returns
        -------
        y : array_like, shape (n_samples, ) or (n_samples, n_targets)
            An array with shape (n_eval, ) if the Gaussian Process was trained
            on an array of shape (n_samples, ) or an array with shape
            (n_eval, n_targets) if the Gaussian Process was trained on an array
            of shape (n_samples, n_targets) with the Best Linear Unbiased
            Prediction at x.

        MSE : array_like, optional (if eval_MSE == True)
            An array with shape (n_eval, ) or (n_eval, n_targets) as with y,
            with the Mean Squared Error at x.
        """
        check_is_fitted(self, "X")

        # Check input shapes
        X = check_array(X)
        n_eval, _ = X.shape
        n_samples, n_features = self.X.shape
        n_samples_y, n_targets = self.y.shape

        # Run input checks
        self._check_params(n_samples)

        if X.shape[1] != n_features:
            raise ValueError(("The number of features in X (X.shape[1] = %d) "
                              "should match the number of features used "
                              "for fit() "
                              "which is %d.") % (X.shape[1], n_features))

        if batch_size is None:
            # No memory management
            # (evaluates all given points in a single batch run)

            # Normalize input
            X = (X - self.X_mean) / self.X_std

            # Initialize output
            y = np.zeros(n_eval)
            if eval_MSE:
                MSE = np.zeros(n_eval)

            # Get pairwise componentwise L1-distances to the input training set
            dx = manhattan_distances(X, Y=self.X, sum_over_features=False)
            # Get regression function and correlation
            f = self.regr(X)
            r = self.corr(self.theta_, dx).reshape(n_eval, n_samples)

            # Scaled predictor
            y_ = np.dot(f, self.beta) + np.dot(r, self.gamma)

            # Predictor
            y = (self.y_mean + self.y_std * y_).reshape(n_eval, n_targets)

            if self.y_ndim_ == 1:
                y = y.ravel()

            # Mean Squared Error
            if eval_MSE:
                C = self.C
                if C is None:
                    # Light storage mode (need to recompute C, F, Ft and G)
                    if self.verbose:
                        print("This GaussianProcess used 'light' storage mode "
                              "at instantiation. Need to recompute "
                              "autocorrelation matrix...")
                    reduced_likelihood_function_value, par = \
                        self.reduced_likelihood_function()
                    self.C = par['C']
                    self.Ft = par['Ft']
                    self.G = par['G']

                rt = linalg.solve_triangular(self.C, r.T, lower=True)

                if self.beta0 is None:
                    # Universal Kriging
                    u = linalg.solve_triangular(self.G.T,
                                                np.dot(self.Ft.T, rt) - f.T,
                                                lower=True)
                else:
                    # Ordinary Kriging
                    u = np.zeros((n_targets, n_eval))

                MSE = np.dot(self.sigma2.reshape(n_targets, 1),
                             (1. - (rt ** 2.).sum(axis=0)
                              + (u ** 2.).sum(axis=0))[np.newaxis, :])
                MSE = np.sqrt((MSE ** 2.).sum(axis=0) / n_targets)

                # Mean Squared Error might be slightly negative depending on
                # machine precision: force to zero!
                MSE[MSE < 0.] = 0.

                if self.y_ndim_ == 1:
                    MSE = MSE.ravel()

                return y, MSE

            else:

                return y

        else:
            # Memory management

            if type(batch_size) is not int or batch_size <= 0:
                raise Exception("batch_size must be a positive integer")

            if eval_MSE:

                y, MSE = np.zeros(n_eval), np.zeros(n_eval)
                for k in range(max(1, n_eval // batch_size)):
                    batch_from = k * batch_size
                    batch_to = min([(k + 1) * batch_size + 1, n_eval + 1])
                    y[batch_from:batch_to], MSE[batch_from:batch_to] = \
                        self.predict(X[batch_from:batch_to],
                                     eval_MSE=eval_MSE, batch_size=None)

                return y, MSE

            else:

                y = np.zeros(n_eval)
                for k in range(max(1, n_eval // batch_size)):
                    batch_from = k * batch_size
                    batch_to = min([(k + 1) * batch_size + 1, n_eval + 1])
                    y[batch_from:batch_to] = \
                        self.predict(X[batch_from:batch_to],
                                     eval_MSE=eval_MSE, batch_size=None)

                return y
Example #53
	def predict(self, X, eval_MSE=False, transformY=True, returnRV=False, integratedPrediction= False, eval_confidence_bounds=False,coef_bound=1.96, batch_size=None):
		"""
		This function evaluates the Gaussian Process model at x.

		Parameters
		----------
		X : array_like
			An array with shape (n_eval, n_features) giving the point(s) at
			which the prediction(s) should be made.

		eval_MSE : boolean, optional
			A boolean specifying whether the Mean Squared Error should be
			evaluated or not.
			Default assumes evalMSE = False and evaluates only the BLUP (mean
			prediction).

		transformY : boolean, optional
			A boolean specifying if the predicted values should correspond to
			the same space as the data given to the fit method, or to the
			warped space (in which the GP is fitted).
			Default is True. Setting to False can be useful to compute the Expected
			Improvement in an optimization process.

		returnRV : boolean, optional
			A boolean specifying if the method should return the predicted random variables
			at x instead of a float number.
			Default is False.

		integratedPrediction : boolean, optional
			A boolean specifying if the method should return the fully Bayesian
			prediction, i.e. the expectation under the posterior in the
			original space. If False, the returned value is the inverse value
			(by the mapping function) of the GP prediction, which is much
			faster, since the integrated prediction has to compute the
			integral numerically.
			Default is False.

		eval_confidence_bounds : boolean, optional
			A boolean specifying if the method should return the confidence bounds.
			Because of the non-linearity of the mapping function, this cannot be computed
			directly with the MSE, but needs to invert the mapping function.
			Default is False. If True, coef_bound specifies the boundary to compute.

		coef_bound : float, optional
			A float specifying the confidence bounds to compute. Upper and lower
			confidence bounds are computed as the inverse of m + coef_bound*sigma
			where m and sigma are the mean and the std of the posterior in the GP
			space.
			Default is 1.96 which corresponds to the 95% confidence bounds.

		batch_size : integer, optional
			An integer giving the maximum number of points that can be
			evaluated simultaneously (depending on the available memory).
			Default is None so that all given points are evaluated at the same
			time.

		Returns
		-------
		y : array_like, shape (n_samples,)
			Prediction at x.

		MSE : array_like, optional (if eval_MSE == True)
			Mean Squared Error at x.

		LCB : array_like, optional (if eval_confidence_bounds == True)
			Lower confidence bound.

		UCB : array_like, optional (if eval_confidence_bounds == True)
			Upper confidence bound.
		"""

		# Check input shapes
		X = sk_utils.array2d(X)
		n_eval, _ = X.shape
		n_samples, n_features = self.X.shape
		n_samples_y, n_targets = self.y.shape

		if(n_targets > 1):
			raise ValueError('More than one target in the Y outputs. '
							 'Currently only 1D outputs are handled')

		# Run input checks
		self._check_params(n_samples)

		if X.shape[1] != n_features:
			raise ValueError(("The number of features in X (X.shape[1] = %d) "
							  "should match the number of features used "
							  "for fit() "
							  "which is %d.") % (X.shape[1], n_features))

		# Normalize input
		if self.normalize:
			X = (X - self.X_mean) / self.X_std
			
		# Initialize output
		y = np.zeros(n_eval)
		if eval_MSE:
			MSE = np.zeros(n_eval)

		# Get pairwise componentwise L1-distances to the input training set
		dx = manhattan_distances(X, Y=self.X, sum_over_features=False)
		# Get regression function and correlation
		f = self.regr(X)
		r = self.corr(self.theta, dx).reshape(n_eval, n_samples)

		# Scaled predictor
		y_ = np.dot(f, self.beta) + np.dot(r, self.gamma)

		# Predictor
		y = (self.y_mean + self.y_std * y_).reshape(n_eval, n_targets)

		# transform the warped y, modeled as a Gaussian, to the real y
		size = y.shape[0]
		warped_y = np.copy(y)
		
		if(transformY):
			if( np.sum([ y[i][0] > 8.2 for i in range(size)]) >0):
				print('Warning : mapping_inversion failed')
			real_y = [ self.mapping_inv(X[i],y[i][0]) for i in range(size)]
			real_y = self.raw_y_std * np.asarray(real_y) +self.raw_y_mean
			y = real_y.reshape(n_eval, n_targets)
		
		if self.y_ndim_ == 1:
			y = y.ravel()
			warped_y = warped_y.ravel()

		# Mean Squared Error
		if eval_MSE:
			C = self.C
			if C is None:
				# Light storage mode (need to recompute C, F, Ft and G)
				if self.verbose:
					print("This GaussianProcess used 'light' storage mode "
						  "at instantiation. Need to recompute "
						  "autocorrelation matrix...")
				reduced_likelihood_function_value, par = \
					self.reduced_likelihood_function()
				self.C = par['C']
				self.Ft = par['Ft']
				self.G = par['G']

			rt = linalg.solve_triangular(self.C, r.T, lower=True)

			if self.beta0 is None:
				# Universal Kriging
				u = linalg.solve_triangular(self.G.T,
											np.dot(self.Ft.T, rt) - f.T)
			else:
				# Ordinary Kriging
				u = np.zeros((n_targets, n_eval))

			MSE = np.dot(self.sigma2.reshape(n_targets, 1),
						 (1. - (rt ** 2.).sum(axis=0)
						  + (u ** 2.).sum(axis=0))[np.newaxis, :])
			MSE = np.sqrt((MSE ** 2.).sum(axis=0) / n_targets)

			# Mean Squared Error might be slightly negative depending on
			# machine precision: force to zero!
			MSE[MSE < 0.] = 0.

			if self.y_ndim_ == 1:
				MSE = MSE.ravel()
				sigma = np.sqrt(MSE)
				if(returnRV):
					return [ self.predicted_RV([warped_y[i]],sigma[i],X[i]) for i in range(size)]
				else:
					if(eval_confidence_bounds):
						if not(transformY):
							print('Warning, transformY set to False but trying to evaluate conf bounds')
						warped_y_with_boundL = warped_y - coef_bound * sigma
						warped_y_with_boundU = warped_y + coef_bound * sigma
						pred_with_boundL = self.raw_y_std * np.asarray( [ self.mapping_inv(X[i],warped_y_with_boundL[i])[0] for i in range(size) ] ) +self.raw_y_mean
						pred_with_boundU =  self.raw_y_std * np.asarray( [ self.mapping_inv(X[i],warped_y_with_boundU[i])[0] for i in range(size)] ) +self.raw_y_mean
						
						if(integratedPrediction):
							lb = self.raw_y_min - 3.*(self.raw_y_max-self.raw_y_min)
							ub = self.raw_y_max + 3.*(self.raw_y_max-self.raw_y_min)
							print(lb,ub)
							integrated_real_y = [ self.integrate_prediction([warped_y[i]],sigma[i],X[i],lb,ub) for i in range(size)]
							integrated_real_y =  np.asarray(integrated_real_y)
							print('Integrated prediction')
							return integrated_real_y,MSE,pred_with_boundL,pred_with_boundU

						else:
							return y,MSE,pred_with_boundL,pred_with_boundU

						
					else:
						return y, MSE
			
			else:
				return y, MSE

		else:
			return y