Example #1

from sklearn.metrics.pairwise import distance_metrics

def get_metric_function(metric=None):
    """
    Parameters
    ----------
    metric : str or function or None, default: None
        Using None is equivalent to using "euclidean".

        If str, then this string specifies the distance metric (from
        scikit-learn) to use for calculating the objective function.
        Possible values are:

        * "cityblock" for sklearn.metrics.pairwise.manhattan_distances
        * "cosine" for sklearn.metrics.pairwise.cosine_distances
        * "euclidean" for sklearn.metrics.pairwise.euclidean_distances
        * "l1" for sklearn.metrics.pairwise.manhattan_distances
        * "l2" for sklearn.metrics.pairwise.euclidean_distances
        * "manhattan" for sklearn.metrics.pairwise.manhattan_distances

        If function, then this function should take two arguments and return a
        scalar value. Furthermore, the following conditions must be fulfilled:

        1. d(a, b) >= 0, for all a and b
        2. d(a, b) == 0, if and only if a = b, positive definiteness
        3. d(a, b) == d(b, a), symmetry
        4. d(a, c) <= d(a, b) + d(b, c), the triangle inequality

    Returns
    -------
    metric_func : function
        If the `metric` argument is a function, it is returned.
        If the `metric` argument is a string, then the corresponding distance
        metric function from `sklearn.metrics.pairwise` is returned.
    """
    if metric is None:
        metric = "manhattan"

    if isinstance(metric, str):
        try:
            return distance_metrics()[metric]
        except KeyError:
            raise ValueError(
                "{} is not a known metric. Please use rather one of the "
                "following metrics: {}".format(
                    metric,
                    tuple(
                        name
                        for name in distance_metrics().keys()
                        if name != "precomputed"
                    ),
                )
            )
    elif callable(metric):
        return metric
    else:
        raise ValueError(
            "A {} was passed as `metric` argument. "
            "Please pass a string or a function "
            "instead.".format(type(metric))
        )
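A minimal usage sketch, assuming scikit-learn and NumPy are available:

import numpy as np

a = np.array([[0.0, 0.0]])
b = np.array([[3.0, 4.0]])

# A string is looked up in sklearn.metrics.pairwise.distance_metrics()
dist = get_metric_function("euclidean")
print(dist(a, b))  # [[5.]]

# A callable satisfying the metric axioms is returned unchanged
manhattan = get_metric_function(lambda u, v: np.abs(u - v).sum())
print(manhattan(a, b))  # 7.0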
Example #3

import sys
import numpy
from sklearn.metrics import pairwise
from sklearn.metrics.pairwise import pairwise_distances

    def run_embedding_lookup_distance(self, querys, metric):
        """
        Calculate embedding distance of all querys against the lookup database
        :param querys: querys for which distances should be calculated
        :param metric: metric to use to calculate distances
        :return: distances, query ids
        """

        if metric in pairwise.distance_metrics():
            if isinstance(querys, dict):
                query_ids, raw_data_query = zip(*querys.items())
            else:
                raw_data_query = querys
                query_ids = range(0, numpy.shape(querys)[0])

            raw_data_query = numpy.array(raw_data_query).squeeze()
            if len(query_ids) == 1:
                raw_data_query = raw_data_query.reshape(1, -1)

            distances = pairwise_distances(raw_data_query,
                                           self.raw_data,
                                           metric=metric)
        else:
            sys.exit("{} is not a correct distance metric\n"
                     "See <sklearn.metrics.pairwise.distance_metrics()> "
                     "for all possible distance metrics".format(metric))

        return distances, query_ids
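A standalone sketch of the same lookup pattern; the lookup_db array is a hypothetical stand-in for self.raw_data:

lookup_db = numpy.random.rand(10, 4)
querys = {"q1": numpy.random.rand(4), "q2": numpy.random.rand(4)}

query_ids, raw_data_query = zip(*querys.items())
raw_data_query = numpy.array(raw_data_query).squeeze()
if len(query_ids) == 1:
    raw_data_query = raw_data_query.reshape(1, -1)
distances = pairwise_distances(raw_data_query, lookup_db, metric="euclidean")
print(distances.shape)  # (2, 10): one row of distances per query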
Example #4

import numpy as np
import torch
from numpy import linalg as LA  # conventional alias for numpy.linalg
from sklearn.metrics.pairwise import distance_metrics

def get_adj(train_data, test_data, k, alpha, kappa):
    eps = np.finfo(float).eps
    emb_all = np.append(train_data, test_data, axis=0)
    N = emb_all.shape[0]
    metric = distance_metrics()['cosine']
    S = 1 - metric(emb_all, emb_all)
    S = torch.tensor(S)
    S = S - torch.eye(S.shape[0])

    if k > 0:
        topk, indices = torch.topk(S, k)
        mask = torch.zeros_like(S)
        mask = mask.scatter(1, indices, 1)
        mask = ((mask + torch.t(mask)) > 0).type(torch.float32)

        S = S * mask

    D = S.sum(0)
    Dnorm = torch.diag(torch.pow(D, -0.5))
    E = torch.matmul(Dnorm, torch.matmul(S, Dnorm))

    E = alpha * torch.eye(E.shape[0]) + E
    E = torch.matrix_power(E, kappa)

    E = E.cuda()

    train_data = train_data - train_data.mean(0)
    train_data_norm = train_data / LA.norm(train_data, 2, 1)[:, None]
    test_data = test_data - test_data.mean(0)
    test_data_norm = test_data / LA.norm(test_data, 2, 1)[:, None]
    features = np.append(train_data_norm, test_data_norm, axis=0)

    features = torch.tensor(features).cuda()
    return E, features
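A minimal invocation sketch with hypothetical random inputs; note that E.cuda() requires a CUDA-capable device:

train = np.random.rand(5, 8).astype(np.float32)
test = np.random.rand(3, 8).astype(np.float32)
E, feats = get_adj(train, test, k=2, alpha=0.5, kappa=2)
print(E.shape, feats.shape)  # torch.Size([8, 8]) torch.Size([8, 8])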
Example #5
    def _localReadMoreXML(self, xmlNode):
        """
      Method that reads the portion of the xml input that belongs to this specialized class
      and initializes internal parameters
      @ In, xmlNode, xml.etree.Element, Xml element node
      @ Out, None
    """
        self.distParams = {}
        for child in xmlNode:
            if child.tag == 'metricType':
                self.metricType = child.text
            else:
                self.distParams[str(child.tag)] = utils.tryParse(child.text)
        # list() is needed: dict views cannot be concatenated with + in Python 3
        availableMetrics = list(pairwise.kernel_metrics()) + list(
            pairwise.distance_metrics()) + list(scores)
        if self.metricType not in availableMetrics:
            metricList = ', '.join(
                availableMetrics[:-1]) + ', or ' + availableMetrics[-1]
            self.raiseAnError(
                IOError,
                'Metric SKL error: metricType ' + str(self.metricType) +
                ' is not available. Available metrics are: ' + metricList +
                '.')

        for key, value in self.distParams.items():
            try:
                newValue = ast.literal_eval(value)
                if type(newValue) == list:
                    newValue = np.asarray(newValue)
                self.distParams[key] = newValue
            except (ValueError, SyntaxError):  # not a Python literal; keep the raw string
                self.distParams[key] = value
Example #6

import numpy as np
from sklearn.metrics.pairwise import distance_metrics, pairwise_distances

def kNN(X, k, measure='euclidean'):
    """
    Construct pairwise weights by finding the k nearest neighbors to each point
    and assigning a Gaussian-based distance.

    Parameters
    ----------
    X : [n_samples, n_dim] array
    k : int
        number of neighbors for each sample in X
    measure : str, default 'euclidean'
        distance metric name; any key of sklearn.metrics.pairwise.distance_metrics()
        (computed in parallel) or any metric accepted by scipy.spatial.distance.cdist
    """
    from scipy.spatial import distance

    weights = []
    parallelized_metrics = list(distance_metrics().keys())

    if (measure in parallelized_metrics):
        w = pairwise_distances(X=X, Y=X, metric=measure, n_jobs=-1)
    else:
        w = distance.cdist(X, X, measure)

    y = np.argsort(w, axis=1)

    for i, x in enumerate(X):
        distances, indices = w[i, y[i, 1:k + 1]], y[i, 1:k + 1]
        for (d, j) in zip(distances, indices):
            if i < j:
                weights.append((i, j, d * d))
            else:
                weights.append((j, i, d * d))
    weights = sorted(weights, key=lambda r: (r[0], r[1]))
    return np.unique(np.asarray(weights), axis=0)
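A quick sketch on random data:

X = np.random.rand(20, 3)
edges = kNN(X, k=4)
# each row is (i, j, squared distance) with i < j; mutual pairs are deduplicated
print(edges.shape)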
Example #7

    def __init__(self,
                 sourcedataset: Dataset,
                 alpha: Optional[float] = .5,
                 length: Optional[str] = "auto",
                 batchsize: Optional[int] = 5000,
                 store: Optional[bool] = True,
                 seed: Optional[int] = 0,
                 averageneighbors: Optional[int] = 2,
                 maxneighbors: Optional[int] = 100,
                 distmetric: Optional[str] = "l2",
                 transform: Optional[Callable] = None,
                 limitdata: Optional[int] = None):
        """
        Args: 
            sourcedataset : an iterable Dataset
            alpha (float): concentration parameter for the Dirichlet distribution
            length (integer or "auto") : length of this Dataset; auto will create averageneighbors*(length of input dataset) size Dataset
            batchsize (integer): number of points per batch to use in computing the interpoint distances
            store (binary) : whether to store the data or not
            seed (integer): seed used to generate this Dataset
            averageneighbors (integer): number of neighbors to average over for each sample
            maxneighbors (integer): maximum number of nearest neighbors to store for each point
            distmetric (function): pairwise distance function used to select nearby points, must be a key in sklearn.metrics.pairwise.distance_metrics()
            transform (function): transform for image data sets
            limitdata (integer): only works with this initial poriton of the source dataset 
        """

        self.sourcedataset = sourcedataset

        if limitdata is None:
            self.sourcelen = len(self.sourcedataset)
        else:
            self.sourcelen = limitdata

        self.alpha = alpha
        if length == "auto":
            self.len = averageneighbors * self.sourcelen
        else:
            self.len = length

        self.batchsize = batchsize
        self.store = store
        self.seed = seed
        self.averageneighbors = averageneighbors
        self.maxneighbors = maxneighbors
        self.distmetric = distance_metrics()[distmetric]
        self.transform = transform

        self.testpoint = sourcedataset[0][0]
        self.testtarget = sourcedataset[0][1]

        self.computedistances()
        self.generator = torch.manual_seed(self.seed)
        self.computesamples(self.store)

        # copied from the VisionDataset class to handle transforms
        has_separate_transform = transform is not None
Example #8

from statistics import mean, stdev
from sklearn.metrics.pairwise import distance_metrics

def optimal_fuzzifier(data):
    """
    Determines the optimal value of fuzzifier for fuzzy c-means algorithm. This
    method employs the method described in Dembele & Kastner 2003.
    Reference: Dembele, D., & Kastner, P. (2003). Fuzzy C-means method for 
               clustering microarray data. bioinformatics, 19(8), 973-980.
    Params:
        data: ndarray of shape (n_samples, n_features)
            - original data which needs to be clustered by Fuzzy C-Means
    Returns:
        mOpt: float
    """
    p = data.shape[1]  # number of dimensions
    min_m = 1.0001  # Minimum possible value of m
    max_m = 50  # Realistic value corresponding to m->np.inf
    m = 2  # Initialize m for numerical approximation
    delta = 0.0001  # Acceptable error range for coefficient of variation of Ym
    Ym = []
    for i in range(data.shape[0]):
        for j in range(i + 1, data.shape[0]):
            temp = distance_metrics()['euclidean'](data[i].reshape(1, -1),
                                                   data[j].reshape(1,
                                                                   -1))[0][0]
            Ym.append(pow(temp, 1 / (m - 1)))
    cv_Ym = stdev(Ym) / mean(Ym)
    while not (cv_Ym > (0.03 * p) - delta and cv_Ym < (0.03 * p) + delta):
        if cv_Ym <= (0.03 * p) - delta:
            max_m = m
            m = (min_m + max_m) / 2
        else:
            min_m = m
            m = (min_m + max_m) / 2
        Ym = []
        for i in range(data.shape[0]):
            for j in range(i + 1, data.shape[0]):
                temp = distance_metrics()['euclidean'](data[i].reshape(
                    1, -1), data[j].reshape(1, -1))[0][0]
                Ym.append(pow(temp, 1 / (m - 1)))
        cv_Ym = stdev(Ym) / mean(Ym)
    if m > 10:
        mOpt = 2
    else:
        mOpt = 1 + (m / 10)
    return mOpt
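A small usage sketch; the pairwise loop is O(n_samples^2) per bisection step, so keep the input modest:

import numpy as np

data = np.random.rand(30, 5)
print(optimal_fuzzifier(data))  # a fuzzifier value in (1, 2]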
Example #9

import numpy as np
from sklearn.metrics.pairwise import distance_metrics

def silhouette_samples(X, labels, metric='euclidean'):
    metric = distance_metrics()[metric]
    n = labels.shape[0]
    A = np.array(
        [intra_cluster_distance(X, labels, metric, i) for i in range(n)])
    B = np.array(
        [nearest_cluster_distance(X, labels, metric, i) for i in range(n)])
    sil_samples = (B - A) / np.maximum(A, B)
    # nan values are for clusters of size 1, and should be 0
    return np.nan_to_num(sil_samples)
Example #10
 def distance(self, x, y=None, **kwargs):
     """
   This method returns the distance between two points x and y. If y is not provided then x is a pointSet and a distance matrix is returned
   @ In, x, dict, dictionary containing data of x
   @ In, y, dict, dictionary containing data of y
   @ Out, value, float or numpy.ndarray, distance between x and y (if y is provided) or a square distance matrix if y is None
 """
     if y is not None:
         if isinstance(x, np.ndarray) and isinstance(y, np.ndarray):
             dictTemp = utils.mergeDictionaries(kwargs, self.distParams)
              if self.metricType in pairwise.kernel_metrics().keys():
                  value = pairwise.pairwise_kernels(X=x,
                                                    Y=y,
                                                    metric=self.metricType,
                                                    **dictTemp)
             elif self.metricType in pairwise.distance_metrics():
                 value = pairwise.pairwise_distances(X=x,
                                                     Y=y,
                                                     metric=self.metricType,
                                                     **dictTemp)
             return value
         else:
             self.raiseAnError(
                 IOError,
                 'Metric SKL error: SKL metrics support only PointSets and not HistorySets'
             )
     else:
          if self.metricType == 'mahalanobis':
              covMatrix = np.cov(x.T)
              kwargs['VI'] = np.linalg.inv(covMatrix)
         dictTemp = utils.mergeDictionaries(kwargs, self.distParams)
         if self.metricType in pairwise.kernel_metrics().keys():
             value = pairwise.pairwise_kernels(X=x,
                                               metric=self.metricType,
                                               **dictTemp)
         elif self.metricType in pairwise.distance_metrics().keys():
             value = pairwise.pairwise_distances(X=x,
                                                 metric=self.metricType,
                                                 **dictTemp)
         return value
Example #11
 def distance(self, x, y=None, **kwargs):
   """
     This method returns the distance between two points x and y. If y is not provided then x is a pointSet and a distance matrix is returned
     @ In, x, numpy.ndarray, array containing data of x, if 1D array is provided, the array will be reshaped via x.reshape(1,-1)
     @ In, y, numpy.ndarray, array containing data of y, if 1D array is provided, the array will be reshaped via y.reshape(1,-1)
     @ Out, value, numpy.ndarray, distance between x and y (if y is provided) or a square distance matrix if y is None
   """
   if y is not None:
     if isinstance(x,np.ndarray) and isinstance(y,np.ndarray):
       if len(x.shape) == 1:
         x = x.reshape(1,-1)
         #self.raiseAWarning(self, "1D array is provided. For consistence, this array is reshaped via x.reshape(1,-1) ")
       if len(y.shape) == 1:
         y = y.reshape(1,-1)
         #self.raiseAWarning(self, "1D array is provided. For consistence, this array is reshaped via y.reshape(1,-1) ")
       dictTemp = utils.mergeDictionaries(kwargs,self.distParams)
       if self.metricType in pairwise.kernel_metrics().keys():
         value = pairwise.pairwise_kernels(X=x, Y=y, metric=self.metricType, **dictTemp)
       elif self.metricType in pairwise.distance_metrics():
         value = pairwise.pairwise_distances(X=x, Y=y, metric=self.metricType, **dictTemp)
       if value.shape == (1,1):
         return value[0]
       else:
         return value
     else:
       self.raiseAnError(IOError,'Metric SKL error: SKL metrics support only PointSets and not HistorySets')
   else:
      if self.metricType == 'mahalanobis':
        covMatrix = np.cov(x.T)
        kwargs['VI'] = np.linalg.inv(covMatrix)
     dictTemp = utils.mergeDictionaries(kwargs,self.distParams)
     if self.metricType in pairwise.kernel_metrics().keys():
       value = pairwise.pairwise_kernels(X=x, metric=self.metricType, **dictTemp)
     elif self.metricType in pairwise.distance_metrics().keys():
       value = pairwise.pairwise_distances(X=x, metric=self.metricType, **dictTemp)
     if value.shape == (1,1):
       return value[0]
     else:
       return value
Example #12

import numpy as np
from sklearn.metrics.pairwise import distance_metrics, kernel_metrics

def edge_weight(x, y, mode='rbf', gamma=0.5):
    dists = distance_metrics()
    kernels = kernel_metrics()
    kernels['bhattacharya'] = bhattacharya
    kernels['intersection'] = intersection
    if mode in dists:
        diff = dists[mode](x, y)
    elif mode in kernels:
        diff = kernels[mode](x, y, gamma=gamma)
    else:
        raise ValueError('Mode {!r} not recognised'.format(mode))

    return np.float64(diff)
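A quick sketch; bhattacharya and intersection are custom kernels defined elsewhere in the source, so hypothetical stand-ins are used here:

def bhattacharya(x, y, gamma=0.5):  # hypothetical stand-in
    return np.sqrt(x * y).sum()

def intersection(x, y, gamma=0.5):  # hypothetical stand-in
    return np.minimum(x, y).sum()

x = np.random.rand(1, 8)
y = np.random.rand(1, 8)
print(edge_weight(x, y, mode='euclidean'))  # distance-based weight
print(edge_weight(x, y, mode='rbf'))        # kernel-based weight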
Example #13

import numpy as np
from networkx import Graph, from_numpy_matrix  # renamed from_numpy_array in networkx >= 3.0
from sklearn.metrics.pairwise import distance_metrics

def make_disk_graph(X, radius, metric='euclidean'):
    """Make a generalized disk graph, in which points whose distance is less
    than a certain radius are considered adjacent.
    
    Params:
        X: a 2D numpy array of shape (n_observations, n_features).
        radius: the radius of disks for adjacency. 
        metric: string, representing which metric. Options are given by
            sklearn.metrics.pairwise.distance_metrics. Default is 'euclidean'.
        
    Returns: a networkx simple Graph
    """
    metric = distance_metrics()[metric]
    dist = metric(X)
    adj = np.asarray(dist < radius, dtype=float)  # np.float was removed in NumPy 1.24
    return from_numpy_matrix(adj, create_using=Graph)
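A minimal sketch on a random point cloud; points closer than radius become adjacent:

X = np.random.rand(30, 2)
G = make_disk_graph(X, radius=0.3)
print(G.number_of_nodes(), G.number_of_edges())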
Example #14

import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import distance_metrics

def agglomerative(X, affinity, thres, n_clusters=None, p=-3):
    '''
    X: A list of n data features. Each entry is an array of data of size n_samples * n_dimension. A different affinity is applied to each feature.
    thres: The distance threshold to separate two clusters.
    affinity: A list of n affinities. The distance metric used to separate clusters. Can be 'p_norm', 'euclidean', or 'cosine'.
    n_clusters: The resulting number of clusters.
    p: the p value if p-norm is used as affinity

    return: The estimated labels for each datapoint.
    '''
    if len(set(
            affinity)) == 1:  # Apply the same affinity to different features.
        affinity = affinity[0]
        if isinstance(X, list):
            X = np.concatenate(X, axis=1)
        if affinity in distance_metrics():
            ac = AgglomerativeClustering(n_clusters=n_clusters,
                                         affinity=affinity,
                                         linkage='average',
                                         distance_threshold=thres)
            estimated_labels = ac.fit_predict(X)
        elif affinity == 'p_norm':
            ac = AgglomerativeClustering(n_clusters=n_clusters,
                                         affinity='precomputed',
                                         linkage='average',
                                         distance_threshold=thres)
            distances = p_norm_distance(X, p=p)
            estimated_labels = ac.fit_predict(distances)
    else:  # Apply different affinities to different features.
        ac = AgglomerativeClustering(n_clusters=n_clusters,
                                     affinity='precomputed',
                                     linkage='average',
                                     distance_threshold=thres)
        n_data = X[0].shape[0]
        distances = np.zeros((n_data, n_data))
        for i, data in enumerate(X):
            if affinity[i] == 'p_norm':
                distances += p_norm_distance(X[i], p=p)
            elif affinity[i] == 'euclidean':
                distances += euclidean_distance(X[i])
            elif affinity[i] == 'cosine':
                distances += cos_distance(X[i])
        estimated_labels = ac.fit_predict(distances)
    return estimated_labels
Example #15

import numpy as np
import pandas as pd
import networkx as nx
from sklearn.metrics.pairwise import distance_metrics

def plot_graph_layout(embedding_set, kind="cosine", **kwargs):
    """
    Handles the plotting of a layout graph using the embeddings in an embeddingset as input.

    **Input**

    - embeddings: a set of `whatlies.Embedding` objects to plot
    - kind: distance metric; options are 'cityblock', 'cosine', 'euclidean', 'l2', 'l1', and 'manhattan'
    """

    vectors = [token.vector for k, token in embedding_set.items()]
    label_dict = {i: w for i, (w, _) in enumerate(embedding_set.items())}
    dist_fnc = distance_metrics()[kind]
    dist = dist_fnc(np.array(vectors), np.array(vectors))
    # Create graph
    graph = nx.from_numpy_matrix(dist)
    distance = pd.DataFrame(dist).to_dict()
    # Change layout positions of the graph
    pos = nx.kamada_kawai_layout(graph, dist=distance)
    # Draw nodes and labels
    nx.draw_networkx_nodes(graph, pos, node_color="b", alpha=0.5)
    nx.draw_networkx_labels(graph, pos, labels=label_dict, **kwargs)
Example #16

import numpy as np
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.pairwise import distance_metrics

def agglomerative(X, affinity, thres, n_clusters=None, p=-3):
    '''
    X: An array of data of size n_samples * n_dimension
    thres: The distance threshold to separate two clusters.
    affinity: The distance metric used to separate clusters. Can be 'p_norm', 'euclidean', or 'cosine'.
    n_clusters: The resulting number of clusters.
    p: the p value if p-norm is used as affinity

    return: The estimated labels for each datapoint.
    '''
    if affinity in distance_metrics():
        ac = AgglomerativeClustering(n_clusters=n_clusters,
                                     affinity=affinity,
                                     linkage='average',
                                     distance_threshold=thres)
        estimated_labels = ac.fit_predict(X)
    elif affinity == 'p_norm':
        ac = AgglomerativeClustering(n_clusters=n_clusters,
                                     affinity='precomputed',
                                     linkage='average',
                                     distance_threshold=thres)
        distances = p_norm_distance(X, p=p)
        estimated_labels = ac.fit_predict(distances)
    else:
        raise ValueError("Unsupported affinity: {!r}".format(affinity))
    return estimated_labels
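A usage sketch; note that the affinity keyword targets older scikit-learn (it was renamed metric in 1.2 and removed in 1.4):

X = np.random.rand(50, 4)
labels = agglomerative(X, affinity='euclidean', thres=2.0)
print(np.unique(labels))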
Example #17

import numpy as np
from sklearn.metrics.pairwise import distance_metrics
# calc_cluster_centroids is defined elsewhere in the source

def cluster_diameter(data, labelsMat, mode='max', dist_metric='euclidean'):
    """
    Computes the diameter for each cluster from the given dataset and 
    clustering labels.
    Params:
        data: ndarray of shape (n_samples, n_features)
            - original data elements which were clustered
        labelsMat: ndarray of shape (n_samples, n_clusteringAlgorithms)
            - labels assigned by each clustering algorithms stored in columns
            - assigned labels are in the range [0, n_clusters]
        mode: Either of the four defined below
            - 'max' :: Maximum distance between any 2 points of a cluster
            - 'avg' :: Mean distance between all pairs within the cluster
            - 'avg_centroid' :: Twice the mean distance of every point from 
              cluster centroid
            - 'far_centroid' :: Twice the distance between the centroid and the
              farthest point from it within the same cluster
        dist_metric: string, can be one of the following:
            - 'manhattan' or 'l1'
            - 'euclidean' or 'l2'
            - 'cosine'
            - 'haversine'
    Returns:
        diamCluster: list[ndarray (n_clusters,)] of length n_clusteringAlgorithms
            - every element in the list is a numpy array containing the diameters of
              the clusters defined by a clustering algorithm
            - list elements are arranged according to the labelsMat
    """
    # Change labelsMat shape if only one clustering algorithm is used
    if len(labelsMat.shape) == 1:
        labelsMat = np.expand_dims(labelsMat, 1)
    diamCluster = []
    for j in range(labelsMat.shape[1]):  #Iterate over n_clusteringAlgorithms
        nClusters = np.max(
            labelsMat[:, j]
        ) + 1  #Find number of clusters corresponding to each algorithm
        clusterDiameter = np.zeros((nClusters))
        clusters = dict(
        )  # Dictionary of clusters with labels as key and list of numpy vectors (samples corresponding to that label) as values
        for i in range(data.shape[0]):  #Iterate over n_samples
            clusters.setdefault(labelsMat[i, j], []).append(data[i, :])
        for k in range(nClusters):
            if mode == 'max':
                if len(clusters[k]) == 1:
                    clusterDiameter[k] = 0
                else:
                    clusterDiameter[k] = np.max(
                        distance_metrics()[dist_metric](clusters[k]))
            elif mode == 'avg':
                if len(clusters[k]) == 1:
                    clusterDiameter[k] = 0
                else:
                    clusterDiameter[k] = np.sum(
                        distance_metrics()[dist_metric](
                            clusters[k])) / (2 * len(clusters[k]))
            elif mode == 'avg_centroid':
                if len(clusters[k]) == 1:
                    clusterDiameter[k] = 0
                else:
                    centersMat = calc_cluster_centroids(data, labelsMat)
                    clusterDiameter[k] = 2 * np.sum(
                        distance_metrics()[dist_metric](
                            np.array(clusters[k]), centersMat[j][k].reshape(
                                1, -1))) / (len(clusters[k]))
            elif mode == 'far_centroid':
                if len(clusters[k]) == 1:
                    clusterDiameter[k] = 0
                else:
                    centersMat = calc_cluster_centroids(data, labelsMat)
                    clusterDiameter[k] = 2 * np.max(
                        distance_metrics()[dist_metric](np.array(
                            clusters[k]), centersMat[j][k].reshape(1, -1)))
            else:
                raise Exception(
                    "Unsupported MODE to calculate cluster diameter")
        diamCluster.append(clusterDiameter)
    return diamCluster
Example #18

import numpy as np
from warnings import warn
from sklearn.metrics.pairwise import distance_metrics

def xie_beni_score(data, labels=None, fuzzyMembershipMat=None, fuzzifier=2):
    """
    Computes the Xie-Beni index score for the clustering algorithm whose
    corresponding labels are defined in the vector 'labels' or the membership
    of each sample point is defined in the fuzzy membership matrix (i.e.
    'fuzzyMembershipMat'). If the value of 'fuzzyMembershipMat' remains 'None'
    upon call to this function, 'labels' vector is used to define membership of
    each sample to the labelled cluster as 1 while others as 0. In that case,
    cluster center will also be computed as the mean of all the points belonging
    to that cluster.
    Params:
        data: ndarray of shape (n_samples, n_features)
            - original data elements which were clustered
        labels: ndarray of shape (n_samples,)
            - labels assigned by the clustering algorithms stored in columns
            - 'None' signifies that the provided algorithm is Fuzzy C-Means
        fuzzyMembershipMat: ndarray of shape (n_clusters, n_samples)
            - matrix defining the fuzzy membership of each sample point to the
              generated clusters
            - 'None' signifies that the provided algorithm is NOT Fuzzy C-Means
        fuzzifier: float in range (1,2]
            - fuzzifier used in Fuzzy C-Means Algorithm
            - Only considered if 'fuzzyMembershipMat' is not None
    Returns:
        score: float
            - The resulting Xie-Beni Index score corresponding to the given
              clustering algorithm.
    """
    # Check that all input parameters are specified correctly
    if fuzzyMembershipMat is None:
        if labels is None:
            raise Exception(
                "Neither fuzzy membership matrix, nor fixed labels provided.")
        else:  # Create fuzzyMembershipMat based on labels
            nClusters = np.max(labels) + 1  # Find number of clusters
            fuzzyMembershipMat = np.zeros((nClusters, data.shape[0]))
            for i in range(data.shape[0]):
                fuzzyMembershipMat[labels[i], i] = 1
    else:
        nClusters = fuzzyMembershipMat.shape[0]  # Find number of clusters
        if labels is not None:
            warn(
                "Since both labels and fuzzy membership matrix are provided, latter one is used."
            )
    if fuzzifier <= 1 or fuzzifier > 2:
        raise Exception("Value of fuzzifier must lie in range: (1,2]")

    # Compute Fuzzy Centroids (n_clusters, n_features)
    fuzzyCentroids = np.zeros((nClusters, data.shape[1]))
    for i in range(nClusters):
        fuzzyCentroids[i] = np.matmul(
            np.power(fuzzyMembershipMat[i], fuzzifier), data) / np.sum(
                np.power(fuzzyMembershipMat[i], fuzzifier))

    # Compute Separation
    interClusterDist = distance_metrics()['euclidean'](fuzzyCentroids)
    np.fill_diagonal(interClusterDist, np.inf)
    separation = np.power(np.min(interClusterDist), 2)

    # Compute Total Variance
    sigma = 0
    for i in range(nClusters):
        sigma += np.sum(
            np.multiply(
                np.power(fuzzyMembershipMat[i], fuzzifier),
                np.power(
                    distance_metrics()['euclidean'](np.expand_dims(
                        fuzzyCentroids[i], axis=0), data), 2)))

    # Compute Compactness
    compactness = sigma / data.shape[0]

    # Compute Score
    score = compactness / separation
    return score
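A quick sketch with hard labels from k-means; lower Xie-Beni scores indicate more compact, better-separated clusters:

from sklearn.cluster import KMeans

data = np.random.rand(100, 3)
labels = KMeans(n_clusters=4, n_init=10).fit_predict(data)
print(xie_beni_score(data, labels=labels))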
Example #19

import numpy as np
from sklearn.metrics.pairwise import distance_metrics
# calc_cluster_centroids is defined elsewhere in the source

def intercluster_dist(data, labelsMat, mode='min', dist_metric='euclidean'):
    """
    Computes the intercluster distance between each pair of clusters from the 
    given dataset and clustering labels.
    Params:
        data: ndarray of shape (n_samples, n_features)
            - original data elements which were clustered
        labelsMat: ndarray of shape (n_samples, n_clusteringAlgorithms)
            - labels assigned by each clustering algorithms stored in columns
            - assigned labels are in the range [0, n_clusters]
        mode: Either of the three defined below
            - 'min' :: Minimum distance between any pair of points (1 from
              each cluster)
            - 'max' :: Maximum distance between any pair of points (1 from
              each cluster)
            - 'centroid' :: Distance between centroids of the 2 clusters
        dist_metric: string, can be one of the following:
            - 'manhattan' or 'l1'
            - 'euclidean' or 'l2'
            - 'cosine'
            - 'haversine'
    Returns:
        interClusterDist: list[ndarray (n_clusters, n_clusters)] of length n_clusteringAlgorithms
            - every element in the list is a numpy array containing the intercluster
              distance matrix for the clusters defined by a clustering algorithm
            - list elements are arranged according to the labelsMat
    """
    # Change labelsMat shape if only one clustering algorithm is used
    if len(labelsMat.shape) == 1:
        labelsMat = np.expand_dims(labelsMat, 1)
    interClusterDist = []
    if mode == 'centroid':
        centersMat = calc_cluster_centroids(data, labelsMat)
    for j in range(labelsMat.shape[1]):  #Iterate over n_clusteringAlgorithms
        nClusters = np.max(
            labelsMat[:, j]
        ) + 1  #Find number of clusters corresponding to each algorithm
        clusterDistMat = np.zeros((nClusters, nClusters))
        if mode == 'min' or mode == 'max':
            clusters = dict()
            for i in range(data.shape[0]):  #Iterate over n_samples
                clusters.setdefault(labelsMat[i, j], []).append(data[i, :])
        for m in range(nClusters):
            for n in range(m + 1, nClusters):
                if mode == 'min':
                    clusterDistMat[m][n] = np.min(
                        distance_metrics()[dist_metric](clusters[m],
                                                        clusters[n]))
                    clusterDistMat[n][m] = clusterDistMat[m][n]
                elif mode == 'max':
                    clusterDistMat[m][n] = np.max(
                        distance_metrics()[dist_metric](clusters[m],
                                                        clusters[n]))
                    clusterDistMat[n][m] = clusterDistMat[m][n]
                elif mode == 'centroid':
                    clusterDistMat[m][n] = distance_metrics()[dist_metric](
                        centersMat[j][m].reshape(1, -1),
                        centersMat[j][n].reshape(1, -1))[0][0]
                    clusterDistMat[n][m] = clusterDistMat[m][n]
                else:
                    raise Exception(
                        "Unsupported MODE to calculate intercluster distance")
        interClusterDist.append(clusterDistMat)
    return interClusterDist
Example #20

from sklearn.metrics.pairwise import distance_metrics

def is_distance(mode):
    return mode in distance_metrics()
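For example:

print(is_distance('cosine'))  # True
print(is_distance('rbf'))     # False: 'rbf' is a kernel, not a distance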
Example #21
#  'laplacian': sklearn.metrics.pairwise.laplacian_kernel,
#  'sigmoid': sklearn.metrics.pairwise.sigmoid_kernel,
#  'cosine': sklearn.metrics.pairwise.cosine_similarity}
# (Last Updated: sklearn.__version__ == 0.19.1)
_METRICS_MISC_PAIRWISE.update(sk_pairwise.kernel_metrics())
# Update with dict of distance names and functions.
# >>> distance_metrics()
# {'cityblock': sklearn.metrics.pairwise.manhattan_distances,  # \/
#  'cosine': sklearn.metrics.pairwise.cosine_distances,
#  'euclidean': sklearn.metrics.pairwise.euclidean_distances,  # \/
#  'l2': sklearn.metrics.pairwise.euclidean_distances,  # /\
#  'l1': sklearn.metrics.pairwise.manhattan_distances,  # \/
#  'manhattan': sklearn.metrics.pairwise.manhattan_distances,  # /\
#  'precomputed': None}
# (Last Updated: sklearn.__version__ == 0.19.1)
_METRICS_MISC_PAIRWISE.update(sk_pairwise.distance_metrics())
# Update with paired distance names (prepend "paired_") and functions.
# >>> {'paired_' + k: v for k, v in
# ...  iteritems(sk_pairwise.PAIRED_DISTANCES.copy())}
# {'paired_cosine': sklearn.metrics.pairwise.paired_cosine_distances,
#  'paired_euclidean': sklearn.metrics.pairwise.paired_euclidean_distances,
#  'paired_l2': sklearn.metrics.pairwise.paired_euclidean_distances,
#  'paired_l1': sklearn.metrics.pairwise.paired_manhattan_distances,
#  'paired_manhattan': sklearn.metrics.pairwise.paired_manhattan_distances,
#  'paired_cityblock': sklearn.metrics.pairwise.paired_manhattan_distances}
# (Last Updated: sklearn.__version__ == 0.19.1)
_METRICS_MISC_PAIRWISE.update({
    'paired_' + k: v
    for k, v in iteritems(sk_pairwise.PAIRED_DISTANCES.copy())  # iteritems from six, for Python 2/3 compatibility
})
Example #22
df1 = ratings.iloc[1, :150]  # ratings (and x, used below) are defined earlier in the source
x1 = ratings.iloc[100:175, 2]
df2 = ratings.iloc[2, :75]
cs111 = laplacian_kernel(x.values.reshape(1, -1), [df1])
cs112 = laplacian_kernel(x1.values.reshape(1, -1), [df2])
cs111[0][0] = 1
print(cs111)  ### [[1.]]
print(cs112)  ### [[0.10699213]]
print(np.argmax(cs111))  ## 0
print(np.argmax(cs112))  ### 0
r = [0, 0]
cs_55 = [cs111[0][0], cs112[0][0]]
print(cs_55)  ############# [1.0, 0.10699212985311443]
np.argmax(cs_55)  ### 0
from sklearn.metrics.pairwise import distance_metrics
cs113 = distance_metrics()  # a dict mapping metric names to functions, not an array
##cs114 = distance_metrics(x1.values.reshape(1,-1),[df2])
print(list(cs113.keys()))  # np.argmax() is not applicable to this dict
print(cs113)
from sklearn.metrics.pairwise import _parallel_pairwise  # private helper; signature may change between versions
cs114 = _parallel_pairwise(x.values.reshape(1, -1), [df1],
                           n_jobs=1,
                           func=euclidean_distances)
cs115 = _parallel_pairwise(x1.values.reshape(1, -1), [df2],
                           n_jobs=2,
                           func=euclidean_distances)
cs114[0][0] = 1
print(cs114)  ### [[1.]]
print(cs115)  ### [[28.53037718]]
print(np.argmax(cs114))  ## 0
print(np.argmax(cs115))  ### 0
Example #23

import numpy as np
from sklearn.metrics.pairwise import distance_metrics, pairwise_distances

def mkNN(X, k, measure='euclidean'):
    """
    Construct mutual_kNN for large scale dataset

    If j is one of i's closest neighbors and i is also one of j's closest neighbors,
    the edge will appear once with (i,j) where i < j.

    Parameters
    ----------
    X : [n_samples, n_dim] array
    k : int
      number of neighbors for each sample in X
    """
    from scipy.spatial import distance
    from scipy.sparse import csr_matrix, triu, find
    from scipy.sparse.csgraph import minimum_spanning_tree

    samples = X.shape[0]
    batchsize = 10000
    b = np.arange(k + 1)
    b = tuple(b[1:].ravel())

    z = np.zeros((samples, k))
    weigh = np.zeros_like(z)

    # This loop speeds up the computation by operating in batches
    # This can be parallelized to further utilize CPU/GPU resource
    for x in np.arange(0, samples, batchsize):
        start = x
        end = min(x + batchsize, samples)

        parallelized_metrics = list(distance_metrics().keys())

        if (measure in parallelized_metrics):
            w = pairwise_distances(X=X[start:end],
                                   Y=X,
                                   metric=measure,
                                   n_jobs=-1)
        else:
            w = distance.cdist(X[start:end], X, measure)

        y = np.argpartition(w, b, axis=1)

        z[start:end, :] = y[:, 1:k + 1]
        weigh[start:end, :] = np.reshape(
            w[tuple(np.repeat(np.arange(end - start), k)),
              tuple(y[:, 1:k + 1].ravel())], (end - start, k))
        del w

    ind = np.repeat(np.arange(samples), k)

    P = csr_matrix((np.ones((samples * k)), (ind.ravel(), z.ravel())),
                   shape=(samples, samples))
    Q = csr_matrix((weigh.ravel(), (ind.ravel(), z.ravel())),
                   shape=(samples, samples))

    Tcsr = minimum_spanning_tree(Q)
    P = P.minimum(P.transpose()) + Tcsr.maximum(Tcsr.transpose())
    P = triu(P, k=1)

    return np.asarray(find(P)).T
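A quick usage sketch on random data:

X = np.random.rand(200, 5)
edges = mkNN(X, k=5)
# each row is (i, j, weight) with i < j, combining the mutual kNN graph
# with minimum-spanning-tree edges that keep the graph connected
print(edges.shape)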
Example #24
 def distance(self, x, y=None, **kwargs):
     """
   This method returns the distance between two points x and y. If y is not provided then x is a pointSet and a distance matrix is returned
   @ In, x, numpy.ndarray, array containing data of x, if 1D array is provided, the array will be reshaped via x.reshape(1,-1)
   @ In, y, numpy.ndarray, array containing data of y, if 1D array is provided, the array will be reshaped via y.reshape(1,-1)
   @ Out, value, numpy.ndarray, distance between x and y (if y is provided) or a square distance matrix if y is None
 """
     if y is not None:
         if isinstance(x, np.ndarray) and isinstance(y, np.ndarray):
             if len(x.shape) == 1 and self.metricType not in scores.keys():
                 x = x.reshape(1, -1)
                 #self.raiseAWarning(self, "1D array is provided. For consistence, this array is reshaped via x.reshape(1,-1) ")
             if len(y.shape) == 1 and self.metricType not in scores.keys():
                 y = y.reshape(1, -1)
                 #self.raiseAWarning(self, "1D array is provided. For consistence, this array is reshaped via y.reshape(1,-1) ")
             dictTemp = utils.mergeDictionaries(kwargs, self.distParams)
             try:
                 if self.metricType in pairwise.kernel_metrics().keys():
                     value = pairwise.pairwise_kernels(
                         X=x, Y=y, metric=self.metricType, **dictTemp)
                 elif self.metricType in pairwise.distance_metrics():
                     value = pairwise.pairwise_distances(
                         X=x, Y=y, metric=self.metricType, **dictTemp)
                 elif self.metricType in scores.keys():
                     value = np.zeros((1, 1))
                     value[:, :] = scores[self.metricType](x, y, **dictTemp)
             except TypeError as e:
                 self.raiseAWarning(
                     'There are some unexpected keyword arguments found in Metric with type "',
                     self.metricType, '"!')
                 self.raiseAnError(TypeError, 'Input parameters error:\n',
                                   str(e), '\n')
             if value.shape == (1, 1):
                 return value[0]
             else:
                 return value
         else:
             self.raiseAnError(
                 IOError,
                 'Metric SKL error: SKL metrics support only PointSets and not HistorySets'
             )
     else:
          if self.metricType == 'mahalanobis':
              covMatrix = np.cov(x.T)
              kwargs['VI'] = np.linalg.inv(covMatrix)
         dictTemp = utils.mergeDictionaries(kwargs, self.distParams)
         try:
             if self.metricType in pairwise.kernel_metrics().keys():
                 value = pairwise.pairwise_kernels(X=x,
                                                   metric=self.metricType,
                                                   **dictTemp)
             elif self.metricType in pairwise.distance_metrics().keys():
                 value = pairwise.pairwise_distances(X=x,
                                                     metric=self.metricType,
                                                     **dictTemp)
         except TypeError as e:
             self.raiseAWarning(
                 'There are some unexpected keyword arguments found in Metric with type "',
                 self.metricType, '"!')
             self.raiseAnError(TypeError, 'Input parameters error:\n',
                               str(e), '\n')
         if value.shape == (1, 1):
             return value[0]
         else:
             return value
Example #25

import numpy as np
from sklearn.metrics.pairwise import distance_metrics

def silhouette_samples_slow(X, labels, metric='euclidean', **kwds):
    """Compute the Silhouette Coefficient for each sample.
 
    The Silhouette Coefficient is a measure of how well samples are clustered
    with samples that are similar to themselves. Clustering models with a high
    Silhouette Coefficient are said to be dense, where samples in the same
    cluster are similar to each other, and well separated, where samples in
    different clusters are not very similar to each other.
 
    The Silhouette Coefficient is calculated using the mean intra-cluster
    distance (a) and the mean nearest-cluster distance (b) for each sample.
    The Silhouette Coefficient for a sample is ``(b - a) / max(a, b)``.
 
    This function returns the Silhouette Coefficient for each sample.
 
    The best value is 1 and the worst value is -1. Values near 0 indicate
    overlapping clusters.
 
    Parameters
    ----------
    X : array [n_samples_a, n_features]
        Feature array.
 
    labels : array, shape = [n_samples]
             label values for each sample
 
    metric : string, or callable
        The metric to use when calculating distance between instances in a
        feature array. If metric is a string, it must be one of the options
        allowed by metrics.pairwise.pairwise_distances. If X is the distance
        array itself, use "precomputed" as the metric.
 
    `**kwds` : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a scipy.spatial.distance metric, the parameters are still
        metric dependent. See the scipy docs for usage examples.
 
    Returns
    -------
    silhouette : array, shape = [n_samples]
        Silhouette Coefficient for each sample.
 
    References
    ----------
 
    Peter J. Rousseeuw (1987). "Silhouettes: a Graphical Aid to the
        Interpretation and Validation of Cluster Analysis". Computational
        and Applied Mathematics 20: 53-65. doi:10.1016/0377-0427(87)90125-7.
 
    http://en.wikipedia.org/wiki/Silhouette_(clustering)
 
    """
    metric = distance_metrics()[metric]
    n = labels.shape[0]
    A = np.array([_intra_cluster_distance_slow(X, labels, metric, i)
                  for i in range(n)])
    B = np.array([_nearest_cluster_distance_slow(X, labels, metric, i)
                  for i in range(n)])
    sil_samples = (B - A) / np.maximum(A, B)
    # nan values are for clusters of size 1, and should be 0
    return np.nan_to_num(sil_samples)
Example #27

import pandas as pd
from operator import itemgetter
from sklearn.metrics.pairwise import distance_metrics

# SKLEARN_FUNC_NAMES is a module-level constant in the source; a hypothetical example:
# SKLEARN_FUNC_NAMES = ('cityblock', 'cosine', 'euclidean', 'l1', 'l2', 'manhattan')

def GetSklearnPairwiseMetrics():
    while True:
        sklearn_dict = distance_metrics()
        yield pd.Series(data=itemgetter(*SKLEARN_FUNC_NAMES)(sklearn_dict),
                        index=SKLEARN_FUNC_NAMES,
                        name='sklearn_funcs')
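A usage sketch, assuming the hypothetical SKLEARN_FUNC_NAMES above; the generator yields the same Series on every call:

SKLEARN_FUNC_NAMES = ('cityblock', 'cosine', 'euclidean')
gen = GetSklearnPairwiseMetrics()
print(next(gen))  # pandas Series mapping each name to its pairwise function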
Example #28
    def display_word_similarity(
        pipe,  # nlu component_list
        default_texts: Tuple[str, str] = ("Donald Trump likes to party!",
                                          "Angela Merkel likes to party!"),
        threshold: float = 0.5,
        title: Optional[
            str] = "Embeddings Similarity Matrix &  Visualizations  ",
        sub_tile: Optional[
            str] = "Visualize `word-wise similarity matrix` and calculate `similarity scores` for `2 texts` and every `word embedding` loaded",
        write_raw_pandas: bool = False,
        display_embed_information: bool = True,
        similarity_matrix=True,
        show_algo_select: bool = True,
        dist_metrics: List[str] = ('cosine',),  # note the comma: ('cosine') is just a string
        set_wide_layout_CSS: bool = True,
        generate_code_sample: bool = False,
        key: str = "NLU_streamlit",
        num_cols: int = 2,
        display_scalar_similarities: bool = False,
        display_similarity_summary: bool = False,
        model_select_position: str = 'side',  # main or side
        show_infos: bool = True,
        show_logo: bool = True,
    ):
        """We visualize the following cases :
        1. Simmilarity between 2 words - > sim (word_emb1, word_emb2)
        2. Simmilarity between 2 sentences -> let weTW stand word word_emb of token T and sentence S
            2.1. Raw token level with merged embeddings -> sim([we11,we21,weT1], [we12,we22,weT2])
            2.2  Autogenerate sentemb, basically does 2.1 in the Spark NLP backend
            2.3 Already using sentence_embedder model -> sim(se1,se2)
        3. Simmilarity between token and sentence -> sim([we11,w21,wT1], se2)
        4. Mirrored 3
         """
        # https://scikit-learn.org/stable/modules/classes.html#module-sklearn.metrics.pairwise
        StreamlitVizTracker.footer_displayed = False
        try:
            import plotly.express as px
            from sklearn.metrics.pairwise import distance_metrics
        except ImportError:
            st.error(
                "You need the scikit-learn and plotly packages installed in your Python environment for similarity visualizations. Run <pip install scikit-learn plotly>"
            )
        if set_wide_layout_CSS: _set_block_container_style()
        if title: st.header(title)
        if show_logo: StreamlitVizTracker.show_logo()
        if sub_tile: st.subheader(sub_tile)

        StreamlitVizTracker.loaded_word_embeding_pipes = []
        dist_metric_algos = distance_metrics()
        dist_algos = list(dist_metric_algos.keys())
        if 'haversine' in dist_algos:
            dist_algos.remove('haversine')  # not applicable in >2D
        if 'precomputed' in dist_algos:
            dist_algos.remove('precomputed')  # Not a dist
        cols = st.columns(2)
        text1 = cols[0].text_input("Text or word1",
                                   default_texts[0],
                                   key=key + 'field_1')
        text2 = cols[1].text_input(
            "Text or word2", default_texts[1], key=key +
            'field_2') if len(default_texts) > 1 else cols[1].text_input(
                "Text or word2", 'Please enter second string', key=key)
        # exp = st.sidebar.beta_expander("Select additional Embedding Models and distance metric to compare ")
        e_coms = StreamlitUtilsOS.find_all_embed_components(pipe)
        embed_algos_to_load = []
        embed_pipes = [pipe]
        dist_algo_selection = dist_metrics
        if show_algo_select:
            # emb_components_usable = Discoverer.get_components('embed')
            emb_components_usable = [
                e for e in Discoverer.get_components(
                    'embed', True, include_aliases=True)
                if 'chunk' not in e and 'sentence' not in e
            ]
            loaded_embed_nlu_refs = []
            loaded_storage_refs = []
            loaded_embed_nlu_refs = list(set(loaded_embed_nlu_refs))

            for c in e_coms:
                r = c.nlu_ref
                if 'en.' not in r and 'embed.' not in r and 'ner' not in r:
                    loaded_embed_nlu_refs.append('en.embed.' + r)
                elif 'en.' in r and 'embed.' not in r and 'ner' not in r:
                    r = r.split('en.')[0]
                    loaded_embed_nlu_refs.append('en.embed.' + r)
                else:
                    loaded_embed_nlu_refs.append(
                        StorageRefUtils.extract_storage_ref(c))
                loaded_storage_refs.append(
                    StorageRefUtils.extract_storage_ref(c))
            for p in StreamlitVizTracker.loaded_word_embeding_pipes:
                if p != pipe: loaded_embed_nlu_refs.append(p.nlu_ref)
            for l in loaded_embed_nlu_refs:
                if l not in emb_components_usable:
                    emb_components_usable.append(l)
            # embed_algo_selection = exp.multiselect("Click to pick additional Embedding Algorithm",options=emb_components_usable,default=loaded_embed_nlu_refs,key = key)
            # dist_algo_selection = exp.multiselect("Click to pick additional Distance Metric", options=dist_algos, default=dist_metrics, key = key)
            emb_components_usable.sort()
            loaded_embed_nlu_refs.sort()
            dist_algos.sort()
            if model_select_position == 'side':
                embed_algo_selection = st.sidebar.multiselect(
                    "Pick additional Word Embeddings for the Similarity Matrix",
                    options=emb_components_usable,
                    default=loaded_embed_nlu_refs,
                    key=key)
                dist_algo_selection = st.sidebar.multiselect(
                    "Pick additional Similarity Metrics ",
                    options=dist_algos,
                    default=dist_metrics,
                    key=key)
            else:
                exp = st.expander(
                    "Pick additional Word Embeddings and Similarity Metrics")
                embed_algo_selection = exp.multiselect(
                    "Pick additional Word Embeddings for the Similarity Matrix",
                    options=emb_components_usable,
                    default=loaded_embed_nlu_refs,
                    key=key)
                dist_algo_selection = exp.multiselect(
                    "Pick additional Similarity Metrics ",
                    options=dist_algos,
                    default=dist_metrics,
                    key=key)
            embed_algos_to_load = list(
                set(embed_algo_selection) - set(loaded_embed_nlu_refs))

        for embedder in embed_algos_to_load:
            embed_pipes.append(nlu.load(embedder))

        if generate_code_sample:
            st.code(
                get_code_for_viz(
                    'SIMILARITY',
                    [StreamlitUtilsOS.extract_name(p)
                     for p in embed_pipes], default_texts))

        StreamlitVizTracker.loaded_word_embeding_pipes += embed_pipes
        similarity_metrics = {}
        embed_vector_info = {}
        cols_full = True
        col_index = 0
        # for p in embed_pipes :
        for p in StreamlitVizTracker.loaded_word_embeding_pipes:
            data1 = p.predict(text1, output_level='token',
                              get_embeddings=True).dropna()
            data2 = p.predict(text2, output_level='token',
                              get_embeddings=True).dropna()
            e_coms = StreamlitUtilsOS.find_all_embed_components(p)
            modelhub_links = [
                ModelHubUtils.get_url_by_nlu_refrence(c.nlu_ref)
                for c in e_coms
            ]
            e_cols = StreamlitUtilsOS.get_embed_cols(p)
            for num_emb, e_col in enumerate(e_cols):
                if col_index == num_cols - 1: cols_full = True
                if cols_full:
                    cols = st.columns(num_cols)
                    col_index = 0
                    cols_full = False
                else:
                    col_index += 1
                tok1 = data1['token']
                tok2 = data2['token']
                emb1 = data1[e_col]
                emb2 = data2[e_col]

                def normalize_matrix(m):
                    return np.nan_to_num(
                        m / np.linalg.norm(m, axis=1, keepdims=True))

                embed_mat1 = normalize_matrix(np.array([x for x in emb1]))
                embed_mat2 = normalize_matrix(np.array([x for x in emb2]))
                # e_name = e_col.split('word_embedding_')[-1]
                e_name = e_coms[num_emb].nlu_ref
                e_name = e_name.split(
                    'embed.')[-1] if 'en.' in e_name else e_name
                if 'ner' in e_name: e_name = loaded_storage_refs[num_emb]

                embed_vector_info[e_name] = {
                    "Vector Dimension ":
                    embed_mat1.shape[1],
                    "Num Vectors":
                    embed_mat1.shape[0] + embed_mat1.shape[0],
                    "NLU_reference":
                    e_coms[num_emb].nlu_ref,
                    "Spark_NLP_reference":
                    ModelHubUtils.NLU_ref_to_NLP_ref(e_coms[num_emb].nlu_ref),
                    "Storage Reference":
                    loaded_storage_refs[num_emb],
                    'Modelhub info':
                    modelhub_links[num_emb]
                }
                for dist_algo in dist_algo_selection:
                    # scalar_similarities[e_col][dist_algo]={}
                    sim_score = ((dist_metric_algos[dist_algo]
                                  (embed_mat1, embed_mat2) - 1) * -1)

                    sim_score = pd.DataFrame(sim_score)
                    sim_score.index = tok1.values
                    sim_score.columns = tok2.values
                    sim_score.columns = StreamlitVizTracker.pad_duplicate_tokens(
                        list(sim_score.columns))
                    sim_score.index = StreamlitVizTracker.pad_duplicate_tokens(
                        list(sim_score.index))
                    if write_raw_pandas: st.write(sim_score, key=key)
                    if sim_score.shape == (1, 1):
                        sim_score = sim_score.iloc[0][0]
                        sim_score = round(sim_score, 2)
                        if sim_score > threshold:
                            st.success(sim_score)
                            st.success(
                                f'Scalar Similarity={sim_score} for distance metric={dist_algo}'
                            )
                            st.error(
                                'No similarity matrix for only 2 tokens. Try entering at least 1 sentence in a field'
                            )
                        else:
                            st.error(
                                f'Scalar Similarity={sim_score} for distance metric={dist_algo}'
                            )
                    else:
                        plotly_available = True
                        # for tok emb, sum rows and norm by rows, then sum cols and norm by cols to generate a scalar from matrix
                        scalar_sim_score = np.sum(
                            (np.sum(sim_score, axis=0) /
                             sim_score.shape[0])) / sim_score.shape[1]
                        scalar_sim_score = round(scalar_sim_score, 2)

                        if display_scalar_similarities:
                            if scalar_sim_score > threshold:
                                st.success(
                                    f'Scalar Similarity :{scalar_sim_score} for distance metric={dist_algo}'
                                )
                            else:
                                st.error(
                                    f'Scalar Similarity :{scalar_sim_score} for embedder={e_col} distance metric={dist_algo}'
                                )
                        if similarity_matrix:
                            if plotly_available:
                                fig = px.imshow(
                                    sim_score, labels=dict(color="similarity")
                                )  # , title=f'Simmilarity Matrix for embedding_model={e_name} distance metric={dist_algo}')
                                # st.write(fig,key =key)
                                similarity_metrics[
                                    f'{e_name}_{dist_algo}_similarity'] = {
                                        'scalar_similarity': scalar_sim_score,
                                        'dist_metric': dist_algo,
                                        'embedding_model': e_name,
                                        'modelhub_info':
                                        modelhub_links[num_emb],
                                    }
                                subh = f"""Embedding-Model=`{e_name}`, Similarity-Score=`{scalar_sim_score}`,  distance metric=`{dist_algo}`"""
                                cols[col_index].markdown(subh)
                                cols[col_index].write(fig, key=key)
                            else:
                                pass  # todo fallback plots

        if display_similarity_summary:
            exp = st.expander("Similarity summary")
            exp.write(similarity_metrics)
        if display_embed_information:
            exp = st.expander("Embedding vector information")
            exp.write(embed_vector_info)
        if show_infos:
            # VizUtilsStreamlitOS.display_infos()
            StreamlitVizTracker.display_model_info(pipe.nlu_ref, pipes=[pipe])
            StreamlitVizTracker.display_footer()