Example #1
def _find_nearest(query_vectors, embeds, n_nearby, batch_size):
    num_embeds = embeds.shape[0]
    num_batches = (num_embeds //
                   batch_size) + (1 if num_embeds % batch_size != 0 else 0)
    nearest_k_fn = _make_nearest_k_fn(k=n_nearby)
    indices = []
    dists = []
    for b in range(num_batches):
        print('Running batch {} of {}\r'.format(1 + b, num_batches), end='')
        sys.stdout.flush()
        start = b * batch_size
        end = min(start + batch_size, embeds.shape[0])
        batch = embeds[start:end, :]
        batch_indices, batch_dists = zip(*pairwise_distances_chunked(
            query_vectors, batch, reduce_func=nearest_k_fn,
            metric='euclidean'))
        indices.append(np.concatenate(batch_indices) + start)
        dists.append(np.concatenate(batch_dists))
    # Combine results from each batch, sort them and take the n_nearby closest
    indices = np.concatenate(indices, axis=1)
    dists = np.concatenate(dists, axis=1)
    sorted_idx = np.argsort(dists, axis=1)[:, :n_nearby]
    indices = np.concatenate([[row[idx_row]]
                              for row, idx_row in zip(indices, sorted_idx)])
    dists = np.concatenate([[row[idx_row]]
                            for row, idx_row in zip(dists, sorted_idx)])
    return indices, dists
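The snippet above relies on a _make_nearest_k_fn factory that is not shown. A minimal sketch of what such a reduce-function factory could look like, inferred from how its output is used above (the behaviour is an assumption, not the original implementation):

import numpy as np

def _make_nearest_k_fn(k):
    """Build a reduce_func for pairwise_distances_chunked that keeps the k
    smallest distances (and their column indices) in each row of a chunk."""
    def nearest_k(dist_chunk, start):
        # Partition so the k smallest distances of each row come first (unsorted);
        # the caller sorts globally after all batches are combined.
        kth = min(k, dist_chunk.shape[1]) - 1
        idx = np.argpartition(dist_chunk, kth=kth, axis=1)[:, :k]
        dist = np.take_along_axis(dist_chunk, idx, axis=1)
        return idx, dist
    return nearest_k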
Example #2
def silhouette_score_(X, labels, metric='euclidean'):
    '''
    The Silhouette Coefficient is calculated using the mean intra-cluster
    distance (``a``) and the mean nearest-cluster distance (``b``) for each
    sample.  The Silhouette Coefficient for a sample is ``(b - a) / max(a,
    b)``
    '''
    le = LabelEncoder()
    labels = le.fit_transform(labels)  # resequence labels: 0,1,...
    n_samples = len(labels)
    label_freqs = np.bincount(labels)
    # check_number_of_labels(len(le.classes_), n_samples)

    reduce_func = functools.partial(_silhouette_reduce,
                                    labels=labels,
                                    label_freqs=label_freqs)
    results = zip(*pairwise_distances_chunked(X, reduce_func=reduce_func))
    intra_clust_dists, inter_clust_dists = results
    intra_clust_dists = np.concatenate(intra_clust_dists)
    inter_clust_dists = np.concatenate(inter_clust_dists)

    denom = (label_freqs - 1).take(labels, mode='clip')
    with np.errstate(divide="ignore", invalid="ignore"):
        intra_clust_dists /= denom

    sil_samples = inter_clust_dists - intra_clust_dists
    with np.errstate(divide="ignore", invalid="ignore"):
        sil_samples /= np.maximum(intra_clust_dists, inter_clust_dists)

    # nan values are for clusters of size 1, and should be 0
    return np.mean(np.nan_to_num(intra_clust_dists)), np.mean(
        np.nan_to_num(inter_clust_dists)), np.mean(np.nan_to_num(sil_samples))
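A hypothetical quick use of silhouette_score_ above (a sketch; it assumes numpy, functools, LabelEncoder, pairwise_distances_chunked and scikit-learn's private _silhouette_reduce helper are already in scope, and that the helper follows the older API where normalisation happens in this wrapper):

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=500, centers=4, random_state=0)
labels = KMeans(n_clusters=4, n_init=10, random_state=0).fit_predict(X)

# Returns mean intra-cluster distance, mean nearest-cluster distance and mean silhouette.
mean_intra, mean_inter, mean_sil = silhouette_score_(X, labels)
print(mean_intra, mean_inter, mean_sil)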
Example #3
def min_radius(n, data, target, dist_metric):
    """Compute minimum radius of hypersphere such that for each example in
    the data matrix as the centre the sphere will contain at least n examples from
    same class and n examples from a different class.

    --- Parameters: ---

    n: minimum number of examples from the same class and from a different class
    that a hypersphere centred on each example in the dataset should contain

    data: Matrix containing examples' features as rows

    target: Matrix of target variable values

    dist_metric: distance metric for distance matrix computation

    (see documentation on function pairwise_distances from scikit-learn for 
    valid distance metric specifiers)

    ------

    Returns:
    Minimum acceptable radius of the hypersphere

    Author: Jernej Vivod

    """

    # Allocate array for storing minimum acceptable radius for each example in dataset.
    min_r = np.empty(data.shape[0], dtype=float)

    # Construct distances matrix. Force generation by rows.
    dist_mat = sk_metrics.pairwise_distances_chunked(data,
                                                     metric=dist_metric,
                                                     n_jobs=-1,
                                                     working_memory=0)

    # Go over examples and compute minimum acceptable radius for each example.
    for k in np.arange(data.shape[0]):
        dist_from_e = next(dist_mat)[0]  # Get next row of distances matrix
        msk = target == target[k]  # Get mask for examples from same class.
        dist_same = dist_from_e[
            msk]  # Distances to examples from the same class.
        dist_diff = dist_from_e[
            ~msk]  # Distances to examples from a different class.
        try:
            min_r[k] = np.max((np.sort(dist_same)[n], np.sort(dist_diff)[n - 1]
                               ))  # Compute minimum radius for this example.
        except IndexError:
            raise ValueError(
                'Insufficient examples with class {0} for given value of n (n = {1})'
                .format(target[k], n))

    return np.max(
        min_r
    )  # Return the maximum of the per-example minimum acceptable radii.
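A hypothetical call of min_radius on random data (purely illustrative values; the snippet assumes numpy as np and sklearn.metrics as sk_metrics are imported):

import numpy as np
import sklearn.metrics as sk_metrics

rng = np.random.default_rng(0)
data = rng.normal(size=(200, 5))
target = rng.integers(0, 2, size=200)

# Radius large enough that every example's hypersphere contains at least
# 3 examples of the same class and 3 of a different class.
print(min_radius(3, data, target, 'euclidean'))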
Example #4
def get_score_for_item(content, all_content, taxon):
    embedded_sentences_for_taxon = get_embedded_sentences_for_taxon(all_content, taxon)
    if not embedded_sentences_for_taxon:
        return [], -1
    content_generator = pairwise_distances_chunked(
        X=[content],
        Y=embedded_sentences_for_taxon,
        working_memory=0,
        metric='cosine',
        n_jobs=-1)
    cosine_scores = next(content_generator)[0]
    cosine_scores.sort()
    return cosine_scores, cosine_scores.mean()
Example #5
def haus(data, sample, max_mem=float('inf')):
    if max_mem == float('inf'):
        dist = pairwise_distances(data[sample, :], data, n_jobs=-1)
        return dist.min(0).max()
    else:

        dists = pairwise_distances_chunked(data, data[sample, :],
                                           reduce_func=lambda x, y: x.min(1),
                                           working_memory=max_mem)

        h = 0
        for x in dists:
            h = max([h, max(x)])
            print(h)
        return h
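A hypothetical usage of haus (a sketch): the directed Hausdorff distance from the full data set to a random subsample, computed once with the full distance matrix and once through the chunked path; both calls should return the same value.

import numpy as np
from sklearn.metrics import pairwise_distances, pairwise_distances_chunked

rng = np.random.default_rng(0)
data = rng.normal(size=(5000, 16))
sample = rng.choice(data.shape[0], size=250, replace=False)

print(haus(data, sample))              # exact: builds the full (250 x 5000) matrix
print(haus(data, sample, max_mem=64))  # chunked path with ~64 MB working memory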
Example #6
    def _get_mus_scaling(self, range_scaling):

        reduce_func = partial(self._mus_scaling_reduce_func,
                              range_scaling=range_scaling)

        kwds = {'squared': True}
        chunked_results = list(
            pairwise_distances_chunked(self.X,
                                       self.X,
                                       reduce_func=reduce_func,
                                       metric='euclidean',
                                       n_jobs=self.njobs,
                                       working_memory=1024,
                                       **kwds))

        neigh_dist, neigh_ind, mus, rs = zip(*chunked_results)
        return np.vstack(neigh_dist), np.vstack(neigh_ind), np.vstack(
            mus), np.vstack(rs)
Example #7
    def _return_mus_scaling(self, range_scaling):
        """
        Description:
            adapted from kneighbors function of sklearn
            https://github.com/scikit-learn/scikit-learn/blob/95119c13af77c76e150b753485c662b7c52a41a2/sklearn/neighbors/_base.py#L596
            It allows keeping a nearest neighbor matrix up to rank 'maxk' (a few tens of points)
            instead of 'range_scaling' (a few thousands), while computing the ratios between neighbors' distances
            up to neighbors' rank 'range_scaling'.
            For big datasets it avoids out-of-memory errors.

        Args:
            range_scaling (int): maximum neighbor rank considered in the computation of the mu ratios

        Returns:
            dist (np.ndarray(float)): the FULL distance matrix sorted in increasing order of neighbor distances up to maxk
            neigh_ind (np.ndarray(int)): the FULL matrix of the indices of the nearest neighbors up to maxk
            mus (np.ndarray(float)): the FULL matrix of the ratios of the neighbor distances of order 2**(i+1) and 2**i
            rs (np.ndarray(float)): the FULL matrix of the distances of the neighbors involved in the mu estimates
        """

        reduce_func = partial(self._mus_scaling_reduce_func,
                              range_scaling=range_scaling)

        kwds = {"squared": True}
        chunked_results = list(
            pairwise_distances_chunked(
                self.X,
                self.X,
                reduce_func=reduce_func,
                metric=self.metric,
                n_jobs=self.njobs,
                working_memory=1024,
                **kwds,
            ))

        neigh_dist, neigh_ind, mus, rs = zip(*chunked_results)

        return (
            np.vstack(neigh_dist),
            np.vstack(neigh_ind),
            np.vstack(mus),
            np.vstack(rs),
        )
Example #8
        def gen_pairwise_distances(label, **kwargs):
            """
            (1) Represent the presence of cuis in a binary array.
            (2) Calculate pairwise distances between all records in the array.
            (3) Select record pairs with distance below the threshold.
            """
            def apply_threshold(chunk, _):
                similar = chunk < self.distance_threshold
                return pd.DataFrame(similar,
                                    columns=self.preprocessed_data.index)

            binary = self.binarize(self.preprocessed_data[label],
                                   sparse_output=self.sparse)
            return pairwise_distances_chunked(
                binary,
                metric=self.metric,
                reduce_func=apply_threshold,
                **kwargs,
            )
Example #9
        def func(i, X):
            t0 = perf_counter()

            # D = pairwise_distances(X, n_jobs=-1)

            dtype = [('values', 'f8'), ('indices', 'u8')]
            D = np.memmap(tmp_dir + f"D_{i}.memmap",
                          dtype=dtype,
                          mode="w+",
                          shape=(X.shape[0]**2, ))
            for j in range(0, X.shape[0]**2, chunk_size):
                D["indices"][j:j + chunk_size] = np.arange(j, j + chunk_size)

            row_ptr = 0
            for chunk in pairwise_distances_chunked(X,
                                                    working_memory=chunk_size):
                length = chunk.ravel().shape[0]
                D["values"][row_ptr:row_ptr + length] = chunk.ravel()
                row_ptr += length

            t1 = perf_counter() - t0
            print(f"Done with distances ({i}) t={t1}")
            t0 = perf_counter()

            D.sort(order='values', axis=0)

            t1 = perf_counter() - t0
            print(f"Done with sorting ({i}) t={t1}")
            t0 = perf_counter()

            for j in range(0, X.shape[0]**2, chunk_size):
                idx = D["indices"][j:j + chunk_size]
                D["values"][idx] = np.arange(j + 1, j + chunk_size + 1)

            ranked = D["values"]

            # np.float64 avoids overflows in the computation that comes next
            # ranked = stats.rankdata(D["values"], method="ordinal").astype(np.float64)

            t1 = perf_counter() - t0
            print(f"Done with ranking ({i}) t={t1}")

            return ranked
Example #10
def pairwise_jaccard_distances(doclist_X, doclist_Y=None):
    """Calculates the matrix with the pairwise Jaccard *DISTANCES* matrix for one (or two) given document list(s).
    If `doclist_Y` is `None`, the matrix is created between every document of the `doclist_X`, else between `doclist_X` 
    and `doclist_Y`.

    Args:
        doclist_X (list of list of words): X - list of documents (words separated by empty space)
        doclist_Y (list of list of words, optional): Y - list of documents. Defaults to None.

    Returns:
        generator: generator of vertical chunks of the distance matrix
    """
    # Transform into sparse document-word-matrix
    wordlist_x = doclist_X
    wordlist_y = doclist_Y if doclist_Y is not None else []
    all_words = wordlist_x + wordlist_y

    # split into chunks
    lexicon = corpora.Dictionary()
    for index in range(0, len(all_words), 1000):
        lexicon.add_documents(all_words[index:index+1000])
        lexicon.filter_extremes(no_below=5, no_above=1, keep_n=None)

    bow_x = []
    for t in wordlist_x:
        bow_x.append(lexicon.doc2bow(t))

    vocab_matrix_x = matutils.corpus2dense(bow_x, num_terms=len(lexicon.token2id)).T.astype(bool)

    if doclist_Y is not None:
        bow_y = []
        for t in wordlist_y:
            bow_y.append(lexicon.doc2bow(t))
        vocab_matrix_y = matutils.corpus2dense(bow_y, num_terms=len(lexicon.token2id)).T.astype(bool)
    else:
        vocab_matrix_y = None
    
    # Calculate jaccard *DISTANCES* (no similarities yet)
    matrix_gen = pairwise_distances_chunked(vocab_matrix_x, vocab_matrix_y, metric='jaccard', n_jobs=-1, working_memory=256)
    return matrix_gen
Example #11
def get_matrix_params(embeddings,
                      *,
                      ka,
                      neighborhood_size,
                      metric='euclidean',
                      n_jobs=1,
                      working_memory=None):

    reduce_function = partial(reduce_distance_matrix_func,
                              ka=ka,
                              neighborhood_size=neighborhood_size)

    neighbor_coors, neighbor_distances, kernel_width = list(
        map(
            np.concatenate,
            zip(*pairwise_distances_chunked(embeddings,
                                            reduce_func=reduce_function,
                                            metric=metric,
                                            n_jobs=n_jobs,
                                            working_memory=working_memory))))

    return neighbor_coors, neighbor_distances, kernel_width
Example #12
    def fit(self, X, y):

        self.Y = np.unique(y)
        self.L = len(self.Y)

        if self.L < 2:
            raise ValueError(
                "There is only one unique value in target vector y.")

        Y_idx = [np.where(y == k) for k in self.Y]

        self.XY = [X[Y_idx[i]] for i in range(self.L)]

        self.A = np.zeros((self.L, self.L))

        for i in range(self.L):
            for j in range(i, self.L):

                Xi, Xj = self.XY[i], self.XY[j]
                ni, nj = Xi.shape[0], Xj.shape[0]
                self.A[i, j] = 1.0 / (ni * nj) * sum(
                    np.sum(M) for M in pairwise_distances_chunked(Xi, Xj))
                if j > i:
                    self.A[j, i] = self.A[i, j]

        if self.L > 2:
            k = self.L - 1
            self.B = np.zeros((k, k))

            for i in range(k):
                for j in range(i, k):
                    self.B[i, j] = -self.A[i, j] + self.A[i, k] + self.A[
                        k, j] - self.A[k, k]
                    if j > i:
                        self.B[j, i] = self.B[i, j]

        return self
Example #13
    def predict(self, X):

        s = np.zeros(self.L)
        n = X.shape[0]

        for i in range(self.L):
            Xi = self.XY[i]
            ni = Xi.shape[0]
            s[i] = 1.0 / (ni * n) * sum(
                np.sum(M) for M in pairwise_distances_chunked(Xi, X))

        if self.L < 3:
            p = (s[1] - s[0] + self.A[0, 1] - self.A[1, 1]) / (
                -self.A[0, 0] + 2 * self.A[0, 1] - self.A[1, 1])

            if p < 0:
                return np.array([0, 1])
            if p > 1:
                return np.array([1, 0])

            return np.array([p, 1 - p])

        else:
            k = self.L - 1
            t = np.zeros(k)
            for i in range(k):
                t[i] = -s[i] + self.A[i, k] + s[k] - self.A[k, k]

            P = cvx.Variable(k)
            constraints = [P >= 0, cvx.sum(P) <= 1.0]
            problem = cvx.Problem(
                cvx.Minimize(cvx.quad_form(P, self.B) - 2 * P.T @ t),
                constraints)
            problem.solve()

            P = np.array(P.value).squeeze()
            return np.append(P, 1.0 - sum(P))
Example #14
def corint(x, k1=10, k2=20, DM=None):

    n_elements = len(x)**2  #number of elements

    dists, _ = get_nn(x, k2)

    if DM is None:
        chunked_distmat = pairwise_distances_chunked(x)
    else:
        chunked_distmat = DM

    r1 = np.median(dists[:, k1 - 1])
    r2 = np.median(dists[:, -1])

    n_diagonal_entries = len(x)  #remove diagonal from sum count
    s1 = -n_diagonal_entries
    s2 = -n_diagonal_entries
    for chunk in chunked_distmat:
        s1 += (chunk < r1).sum()
        s2 += (chunk < r2).sum()

    Cr = np.array([s1 / n_elements, s2 / n_elements])
    estq = np.diff(np.log(Cr)) / np.log(r2 / r1)
    return estq
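corint depends on a get_nn helper that is not part of the snippet; a plausible sketch using scikit-learn's NearestNeighbors (the name, signature and self-exclusion behaviour are assumptions):

import numpy as np
from sklearn.neighbors import NearestNeighbors

def get_nn(x, k):
    # Distances and indices of the k nearest neighbours of each point in x.
    # Calling kneighbors() without a query excludes each point from its own neighbours.
    nn = NearestNeighbors(n_neighbors=k).fit(x)
    dists, inds = nn.kneighbors()
    return dists, inds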
Example #15
def main(*, gtex):
    store = h5py.File(gtex, "r", rdcc_nbytes=CACHE_SIZE)
    genes = list(store["genes"][:])
    expressions = store["expressions"]
    n_genes, n_cells = expressions.shape

    Z = transform_rows(expressions, lambda x: np.log10(1 + x), CHUNK_SIZE)
    Z = Z - np.mean(Z, axis=1)[:, None]

    gene_indices = np.arange(n_genes, dtype=np.int32)
    threshold = R2_THRESHOLD
    top_k = MAX_NEIGHBORS
    i = 0

    for chunk in pairwise_distances_chunked(Z,
                                            metric="cosine",
                                            working_memory=0):
        # Transform cosine distance to cosine correlation.
        score = 1 - chunk[0]

        # Adjust correlation against bias.
        score = score - score.mean()

        # Then, obtain a score similar to coefficient of determination.
        score = score**2

        # Do not count for self correlation.
        score[i] = 0

        tops = np.argsort(score)[::-1][:top_k]
        tops = tops[score[tops] > threshold]

        tops_text = " ".join(f"{j}" for j in tops)
        print(f"{i}: {tops_text}")

        i += 1
Example #16
    def _corrint(self, X):

        n_elements = len(X) ** 2  # number of elements

        dists, _ = get_nn(X, min(self.k2, len(X) - 1))

        if self.DM is False:
            chunked_distmat = pairwise_distances_chunked(X)
        else:
            chunked_distmat = X

        r1 = np.median(dists[:, min(self.k1 - 1, len(X) - 2)])
        r2 = np.median(dists[:, -1])

        n_diagonal_entries = len(X)  # remove diagonal from sum count
        s1 = -n_diagonal_entries
        s2 = -n_diagonal_entries
        for chunk in chunked_distmat:
            s1 += (chunk < r1).sum()
            s2 += (chunk < r2).sum()

        Cr = np.array([s1 / n_elements, s2 / n_elements])
        estq = np.diff(np.log(Cr)) / np.log(r2 / r1)
        return estq[0]
Example #17
    def kneighbors(self, X, n_neighbors=None, return_distance=True):
        """Find the K-neighbors of a point.

        Returns indices of and distances to the neighbors of each point.

        Parameters
        ----------
        X : sktime-format pandas dataframe with shape([n_cases,n_dimensions]),
        or numpy ndarray with shape([n_cases,n_readings,n_dimensions])
        y : {array-like, sparse matrix}
            Target values of shape = [n_samples]
        n_neighbors : int
            Number of neighbors to get (default is the value
            passed to the constructor).
        return_distance : boolean, optional. Defaults to True.
            If False, distances will not be returned

        Returns
        -------
        dist : array
            Array representing the lengths to points, only present if
            return_distance=True
        ind : array
            Indices of the nearest points in the population matrix.
        """
        self.check_is_fitted()
        # Transpose to work correctly with distance functions
        X = X.transpose((0, 2, 1))

        if n_neighbors is None:
            n_neighbors = self.n_neighbors
        elif n_neighbors <= 0:
            raise ValueError("Expected n_neighbors > 0. Got %d" % n_neighbors)
        else:
            if not np.issubdtype(type(n_neighbors), np.integer):
                raise TypeError(
                    "n_neighbors does not take %s value, "
                    "enter integer value" % type(n_neighbors)
                )

        if X is not None:
            query_is_train = False
            X = check_array(X, accept_sparse="csr", allow_nd=True)
        else:
            query_is_train = True
            X = self._fit_X
            # Include an extra neighbor to account for the sample itself being
            # returned, which is removed later
            n_neighbors += 1

        train_size = self._fit_X.shape[0]
        if n_neighbors > train_size:
            raise ValueError(
                "Expected n_neighbors <= n_samples, "
                " but n_samples = %d, n_neighbors = %d" % (train_size, n_neighbors)
            )
        n_samples = X.shape[0]
        sample_range = np.arange(n_samples)[:, None]

        n_jobs = effective_n_jobs(self.n_jobs)
        if self._fit_method == "brute":

            reduce_func = partial(
                self._kneighbors_reduce_func,
                n_neighbors=n_neighbors,
                return_distance=return_distance,
            )

            # for efficiency, use squared euclidean distances
            kwds = (
                {"squared": True}
                if self.effective_metric_ == "euclidean"
                else self.effective_metric_params_
            )

            result = pairwise_distances_chunked(
                X,
                self._fit_X,
                reduce_func=reduce_func,
                metric=self.effective_metric_,
                n_jobs=n_jobs,
                **kwds
            )
        else:
            raise ValueError("internal: _fit_method not recognized")

        if return_distance:
            dist, neigh_ind = zip(*result)
            result = np.vstack(dist), np.vstack(neigh_ind)
        else:
            result = np.vstack(result)

        if not query_is_train:
            return result
        else:
            # If the query data is the same as the indexed data, we would like
            # to ignore the first nearest neighbor of every sample, i.e
            # the sample itself.
            if return_distance:
                dist, neigh_ind = result
            else:
                neigh_ind = result

            sample_mask = neigh_ind != sample_range

            # Corner case: When the number of duplicates are more
            # than the number of neighbors, the first NN will not
            # be the sample, but a duplicate.
            # In that case mask the first duplicate.
            dup_gr_nbrs = np.all(sample_mask, axis=1)
            sample_mask[:, 0][dup_gr_nbrs] = False

            neigh_ind = np.reshape(neigh_ind[sample_mask], (n_samples, n_neighbors - 1))

            if return_distance:
                dist = np.reshape(dist[sample_mask], (n_samples, n_neighbors - 1))
                return dist, neigh_ind
            return neigh_ind
Example #18
vectordf = df[0].apply(lambda x:list(map(int,x)))

final = pd.DataFrame(vectordf)[0].apply(pd.Series)
#removing more garbage from memory
del(df)
del(vectordf)
gc.collect()
print("Performing hamming distance analysis on high frequency kmers. {}".format(time.asctime()))

if myargs.temp:
    #writes distance matrix to specified location
    filename = os.path.join(myargs.temp,"distance.dat")
    kmerdist=np.memmap(filename, dtype='float32',mode='w+',shape=(comb(final.shape[0],2,exact=True),))
    if myargs.mem:
        dist_gen = pairwise_distances_chunked(final,metric='hamming',n_jobs=-1,working_memory=myargs.mem)
        position = final.shape[0]-1
        remaining = 1
        total = 0
        del(kmerdist)
        for temp in dist_gen:
            print("Processing distance matrix chunk. {}".format(time.asctime()))
            kmerdist=np.memmap(filename,dtype='float32',mode='r+')
            for r in range(len(temp)):
                kmerdist[total:total+position] = temp[r][remaining:]
                total+=position
                position-=1
                remaining+=1
            del(kmerdist)
            gc.collect()
        kmerdist=np.memmap(filename,dtype='float32',mode='r')
Example #19
    def kneighbors(self,
                   X,
                   n_neighbors=None,
                   sketch_method=None,
                   candidates_scale=None,
                   return_distance=False):
        """Fast finds the approximate K-neighbors of each point using sketch.
        Returns indices of and distances to the neighbors of each point.

        Parameters
        ----------
        X : array-like, shape (n_query, n_features).
            The query point or points.
        n_neighbors : :obj:`int`
            Number of neighbors to get.
        sketch_method : {:obj:`None`, 'symmetric', 'asymmetric', 'g_asymmetric', 'PCA'}, default = :obj:`None`
            Method used to filter candidates before ranking the real distances.
            If a non-None value was passed to the constructor, this argument is
            ignored. If both the constructor and this method get None, no sketch
            filter is used and the search behaves like normal KNN. See the
            constructor for more details.
        candidates_scale : :obj:`int`, default is the value passed to the constructor
            Scale factor applied to n_neighbors to set the number of candidates
            kept when filtering with a sketch.
        return_distance : :obj:`boolean`, default = :obj:`False`.
            If False, distances will not be returned

        Returns
        -------
        dist : :obj:`array`
            Array representing the lengths to points, only present if
            return_distance= :obj:`True`
        ind : :obj:`array`
            Indices of the nearest points in the population matrix.
        """
        check_is_fitted(self, ["_fit_X"])

        if n_neighbors is None:
            n_neighbors = self.n_neighbors

        X = check_array(X)

        if self.sketch_method is not None:
            sketch_method = self.sketch_method

        # reduce_func for neighbors
        reduce_func_k = partial(self._kneighbors_reduce_func,
                                n_neighbors=n_neighbors,
                                return_distance=return_distance)
        kwds = ({'squared': True})
        n_jobs = effective_n_jobs(self.n_jobs)

        # find candidates
        if sketch_method is None:  # KNN
            pass
        else:
            if candidates_scale is None:
                candidates_scale = self.candidates_scale
            n_candidates = self.n_neighbors * candidates_scale
            reduce_func_1 = partial(self._kneighbors_reduce_func,
                                    n_neighbors=n_candidates,
                                    return_distance=False)
            if sketch_method == 'symmetric':
                sketch_X = self._sketch(X)
                candidates = list(
                    pairwise_distances_chunked(sketch_X,
                                               self._sketch_X,
                                               reduce_func=reduce_func_1,
                                               metric=paired_hamming_distance,
                                               n_jobs=n_jobs))
            elif sketch_method == 'asymmetric':
                # TODO: sketch X (query points)
                sketch_X, weight = self._sketch(X, return_weight=True)
                _sketch_X_weight = sketch_X + weight  # encode sketch_X and weight together
                # TODO: filter candidates
                candidates = list(
                    pairwise_distances_chunked(
                        _sketch_X_weight,
                        self._sketch_X,
                        reduce_func=reduce_func_1,
                        metric=paired_asymmetric_distance,
                        n_jobs=n_jobs))
            elif sketch_method == 'PCA':
                # sketch X (query points)
                sketch_X = self._pca.transform(X)
                # filter candidates
                candidates = list(
                    pairwise_distances_chunked(sketch_X,
                                               self._pca_X,
                                               reduce_func=reduce_func_1,
                                               metric=self.effective_metric_,
                                               n_jobs=n_jobs,
                                               **kwds))
            elif sketch_method == 'g_asymmetric':
                # TODO: sketch X (query points)
                sketch_X, weight, g_sketch_X, g_weight = self._sketch(
                    X, return_weight=True, return_label=True)
                _sketch_X_weight = sketch_X + weight  # encode sketch_X and weight together
                # TODO: filter label
                Candidate_inds = []
                for g_sketch_X_i, g_weight_i in zip(
                        g_sketch_X, g_weight):  # for each query point
                    labels = self._getlabels(
                        g_weight_i, g_sketch_X_i,
                        self.g_threshold)  # get query point's labels
                    inds = set()
                    for label in labels:
                        inds |= self._g_dict[label]
                    sketchlist = g_sketch_X_i
                    wlist = g_weight_i
                    while len(inds) < n_candidates:
                        wlist = [
                            w if w >= self.g_threshold else math.inf
                            for w in g_weight_i
                        ]
                        ind = np.argmin(wlist)  # find index of smallest weight
                        sketchlist[ind] = 1 - sketchlist[ind]
                        label = ''.join(str(elm) for elm in sketchlist)
                        inds |= self._g_dict[label]
                    del wlist
                    Candidate_inds.append(
                        inds
                    )  # get row number of data points that matched query point's labels
                # TODO: filter candidates
                candidates = []
                for i in range(len(Candidate_inds)):  # for each query point
                    candidate_inds = sorted(list(
                        Candidate_inds[i]))  # get matched inds
                    tmp1 = self._sketch_X[candidate_inds, :]
                    tmp2 = _sketch_X_weight[[i]]
                    iinds = list(
                        pairwise_distances_chunked(
                            tmp2,
                            tmp1,
                            reduce_func=reduce_func_1,
                            metric=paired_asymmetric_distance,
                            n_jobs=n_jobs))
                    iinds[0][0] = np.array(
                        [candidate_inds[ii] for ii in list(iinds[0][0])])
                    candidates += iinds
            else:
                raise ValueError(
                    "{} sketch_method has not been implemented.".format(
                        sketch_method))
            candidates = np.vstack(candidates)

        # result to return
        if return_distance:
            dists = np.empty([0, n_neighbors])
        neight_inds = np.empty([0, n_neighbors], dtype=int)

        # find neighbors
        if sketch_method is None:  # KNN
            # find neighbors from all data points
            result = list(
                pairwise_distances_chunked(X,
                                           self._fit_X,
                                           reduce_func=reduce_func_k,
                                           metric=self.effective_metric_,
                                           n_jobs=n_jobs,
                                           **kwds))
            if return_distance:
                dist, neigh_ind = zip(*result)
                result = np.vstack(dist), np.vstack(neigh_ind)
            else:
                result = np.vstack(result)
        else:
            # find neighbors from the candidate points.
            for i in range(len(candidates)):
                result = list(
                    pairwise_distances_chunked(X[[i], :],
                                               self._fit_X[candidates[i]],
                                               reduce_func=reduce_func_k,
                                               metric=self.effective_metric_,
                                               n_jobs=n_jobs,
                                               **kwds))
                if return_distance:
                    dist, neigh_ind = zip(*result)
                    dist = np.vstack(dist)
                    neigh_ind = candidates[i][np.vstack(neigh_ind).reshape(-1)]
                    dists = np.concatenate((dists, dist), axis=0)
                    neight_inds = np.vstack((neight_inds, neigh_ind))
                else:
                    neigh_ind = candidates[i][np.vstack(result)[0]]
                    neight_inds = np.vstack((neight_inds, neigh_ind))
            if return_distance:
                result = dists, neight_inds
            else:
                result = neight_inds

        return result
Example #20



# ------------------------------------------------------------------
# Generate taxon_branch_homogeneity_scores.csv
# ------------------------------------------------------------------

# Calculate size and mean cosine scores for each branch
branch_homogeneity = []
for branch in labelled.level1taxon.unique():
    total_cosine_for_branch = np.zeros(1)
    denominator_for_branch = np.zeros(1)
    branch_embeddings = embedded_clean_content[labelled['level1taxon'] == branch]
    branch_size = branch_embeddings.shape[0]
    for chunk in pairwise_distances_chunked(branch_embeddings, metric='cosine', n_jobs=-1):
        total_cosine_for_branch += np.sum(chunk)
        denominator_for_branch += np.prod(chunk.shape)
    mean_cosine_for_branch = (total_cosine_for_branch / denominator_for_branch).item()
    branch_homogeneity.append([branch, branch_size, mean_cosine_for_branch])

# Put mean cosine scores for each branch into a sorted dataframe
branch_homogeneity_scores = pd.DataFrame(branch_homogeneity, columns=['branch', 'branch_size', 'mean_cosine_score']) \
    .sort_values('mean_cosine_score', ascending=False)

# Add weighted score
branch_homogeneity_scores['min_max_branch_size'] = (
     branch_homogeneity_scores['branch_size'] -
     branch_homogeneity_scores['branch_size'].min()) / (
     branch_homogeneity_scores['branch_size'].max() -
     branch_homogeneity_scores['branch_size'].min())
Example #21
    def kneighbors(self, X, n_neighbors=None, return_distance=True):
        """Finds the K-neighbors of a point.
        Returns indices of and distances to the neighbors of each point.

        Parameters
        ----------
        X : sktime-format pandas dataframe with shape([n_cases,n_dimensions]),
        or numpy ndarray with shape([n_cases,n_readings,n_dimensions])

        y : {array-like, sparse matrix}
            Target values of shape = [n_samples]

        n_neighbors : int
            Number of neighbors to get (default is the value
            passed to the constructor).

        return_distance : boolean, optional. Defaults to True.
            If False, distances will not be returned

        Returns
        -------
        dist : array
            Array representing the lengths to points, only present if
            return_distance=True

        ind : array
            Indices of the nearest points in the population matrix.
        """
        self.check_is_fitted()
        X = check_X(X, enforce_univariate=False)
        X = nested_to_3d_numpy(X)

        if n_neighbors is None:
            n_neighbors = self.n_neighbors
        elif n_neighbors <= 0:
            raise ValueError(
                "Expected n_neighbors > 0. Got %d" %
                n_neighbors
            )
        else:
            if not np.issubdtype(type(n_neighbors), np.integer):
                raise TypeError(
                    "n_neighbors does not take %s value, "
                    "enter integer value" %
                    type(n_neighbors))

        if X is not None:
            query_is_train = False
            X = check_array(X, accept_sparse='csr', allow_nd=True)
        else:
            query_is_train = True
            X = self._fit_X
            # Include an extra neighbor to account for the sample itself being
            # returned, which is removed later
            n_neighbors += 1

        train_size = self._fit_X.shape[0]
        if n_neighbors > train_size:
            raise ValueError(
                "Expected n_neighbors <= n_samples, "
                " but n_samples = %d, n_neighbors = %d" %
                (train_size, n_neighbors)
            )
        n_samples = X.shape[0]
        sample_range = np.arange(n_samples)[:, None]

        n_jobs = effective_n_jobs(self.n_jobs)
        if self._fit_method == 'brute':

            reduce_func = partial(self._kneighbors_reduce_func,
                                  n_neighbors=n_neighbors,
                                  return_distance=return_distance)

            # for efficiency, use squared euclidean distances
            kwds = ({'squared': True} if self.effective_metric_ == 'euclidean'
                    else self.effective_metric_params_)

            result = pairwise_distances_chunked(
                X, self._fit_X, reduce_func=reduce_func,
                metric=self.effective_metric_, n_jobs=n_jobs,
                **kwds)

        elif self._fit_method in ['ball_tree', 'kd_tree']:
            if issparse(X):
                raise ValueError(
                    "%s does not work with sparse matrices. Densify the data, "
                    "or set algorithm='brute'" % self._fit_method)
            if LooseVersion(joblib_version) < LooseVersion('0.12'):
                # Deal with change of API in joblib
                delayed_query = delayed(self._tree.query,
                                        check_pickle=False)
                parallel_kwargs = {"backend": "threading"}
            else:
                delayed_query = delayed(self._tree.query)
                parallel_kwargs = {"prefer": "threads"}
            result = Parallel(n_jobs, **parallel_kwargs)(
                delayed_query(
                    X[s], n_neighbors, return_distance)
                for s in gen_even_slices(X.shape[0], n_jobs)
            )
        else:
            raise ValueError("internal: _fit_method not recognized")

        if return_distance:
            dist, neigh_ind = zip(*result)
            result = np.vstack(dist), np.vstack(neigh_ind)
        else:
            result = np.vstack(result)

        if not query_is_train:
            return result
        else:
            # If the query data is the same as the indexed data, we would like
            # to ignore the first nearest neighbor of every sample, i.e
            # the sample itself.
            if return_distance:
                dist, neigh_ind = result
            else:
                neigh_ind = result

            sample_mask = neigh_ind != sample_range

            # Corner case: When the number of duplicates are more
            # than the number of neighbors, the first NN will not
            # be the sample, but a duplicate.
            # In that case mask the first duplicate.
            dup_gr_nbrs = np.all(sample_mask, axis=1)
            sample_mask[:, 0][dup_gr_nbrs] = False

            neigh_ind = np.reshape(
                neigh_ind[sample_mask], (n_samples, n_neighbors - 1))

            if return_distance:
                dist = np.reshape(
                    dist[sample_mask], (n_samples, n_neighbors - 1))
                return dist, neigh_ind
            return neigh_ind
Example #22
A = np.load("audio.npy")
#A = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])

threshold = 6e-12


def get_edges(D_chunk, start):
    below_threshold = np.where(D_chunk < threshold)
    edges = np.array(below_threshold)
    edges[0] += start
    dist = D_chunk[below_threshold]
    return edges, dist


g = pairwise_distances_chunked(A, metric='cosine')

edge_source = []
edge_dest = []
edge_weight = []
start = 0
while 1:
    try:
        D_chunk = next(g)
        edges, dist = get_edges(D_chunk, start)
        start += D_chunk.shape[0]
        edge_source.extend(edges[0])
        edge_dest.extend(edges[1])
        edge_weight.extend(dist)
    except StopIteration:
        break
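The while/StopIteration loop above can be written more idiomatically as a plain for loop over the chunk generator (a sketch with the same behaviour):

edge_source, edge_dest, edge_weight = [], [], []
start = 0
for D_chunk in pairwise_distances_chunked(A, metric='cosine'):
    edges, dist = get_edges(D_chunk, start)
    start += D_chunk.shape[0]
    edge_source.extend(edges[0])
    edge_dest.extend(edges[1])
    edge_weight.extend(dist)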
Example #23
from dask.multiprocessing import get
from multiprocessing import cpu_count
nCores = cpu_count()


from sklearn.metrics import pairwise_distances_chunked
from sklearn.metrics.pairwise import euclidean_distances
from scipy.spatial.distance import wminkowski


def rf(dchunk, start):
    return dchunk.argsort(axis=1)[:, :knnk]


t1 = time.time()
distMat = pairwise_distances_chunked(kdf, distSample, reduce_func=rf, metric='nan_euclidean', n_jobs=-1, force_all_finite=False)
distMat = list(distMat)[0]
imputeIndices = pd.DataFrame(distMat)
t2 = time.time()

print(t2-t1)

def KNN(kdf):
    #dist = euclidean_distances(kdf , distSample)
    dist = pairwise_distances(kdf, distSample, metric='nan_euclidean', force_all_finite=False)
    indices =  dist.argsort(axis=1)[:,:knnk]
    #kdf = kdf[].fillna()
    return indices


kdf.shape
Example #24
import numpy as np
from sklearn.metrics import pairwise_distances_chunked
import pandas as pd
df = pd.read_csv("data/human/04_sl_input/cell_line_compare_expr.csv")

X = df.iloc[:, 1:10721].transpose()
gen = pairwise_distances_chunked(X)
gen2 = next(gen)  # this was enough
#gen3 = next(gen)

np.savetxt("cell_dist_mat.txt", gen2)
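Calling next(gen) only returns the first chunk; with the default working_memory it happens to cover the whole matrix here. A version that does not depend on the chunk size (a sketch):

dist_mat = np.vstack(list(pairwise_distances_chunked(X)))
np.savetxt("cell_dist_mat.txt", dist_mat)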
Example #25
def _evaluate_map(data_loader, model, writer, epoch, logging_label, no_cuda,
                  log_interval, map, **kwargs):
    """
    The evaluation routine

    Parameters
    ----------
    data_loader : torch.utils.data.DataLoader
        The dataloader of the evaluation set
    model : torch.nn.module
        The network model being used
    writer : tensorboardX.writer.SummaryWriter
        The tensorboard writer object. Used to log values on file for the tensorboard visualization.
    epoch : int
        Number of the epoch (for logging purposes)
    logging_label : string
        Label for logging purposes. Typically 'test' or 'valid'. It's prepended to the logging output path and messages.
    no_cuda : boolean
        Specifies whether the GPU should be used or not. A value of 'True' means the CPU will be used.
    log_interval : int
        Interval limiting the logging of mini-batches. Default value of 10.
    map : str
        Specify value for mAP computation. Possible values are "auto", "full", or an integer K for AP@K.

    Returns
    -------
    mAP : float
        Mean average precision evaluated on this split

    """
    multi_run = kwargs['run'] if 'run' in kwargs else None

    # Switch to evaluate mode (turn off dropout & such )
    model.eval()

    labels, outputs = [], []

    # For use with the multi-crop transform
    multi_crop = False

    # Iterate over whole evaluation set
    pbar = tqdm(enumerate(data_loader),
                total=len(data_loader),
                unit='batch',
                ncols=150,
                leave=False)
    with torch.no_grad():
        for batch_idx, (data, label) in pbar:

            # Check if data is provided in multi-crop form and process accordingly
            if len(data.size()) == 5:
                multi_crop = True
                bs, ncrops, c, h, w = data.size()
                data = data.view(-1, c, h, w)

            if not no_cuda:
                data = data.cuda()

            # Compute output
            out = model(data)

            if multi_crop:
                out = out.view(bs, ncrops, -1).mean(1)

            # Store output
            outputs.append(out.data.cpu().numpy())
            labels.append(label.data.cpu().numpy())

            # Log progress to console
            if batch_idx % log_interval == 0:
                pbar.set_description(logging_label +
                                     ' Epoch: {} [{}/{} ({:.0f}%)]'.format(
                                         epoch, batch_idx * len(data),
                                         len(data_loader.dataset), 100. *
                                         batch_idx / len(data_loader)))

    # Gather labels and outputs for mAP computation
    num_tests = len(data_loader.dataset.file_names)
    labels = np.concatenate(labels, 0).reshape(num_tests)
    outputs = np.concatenate(outputs, 0)

    # Cosine similarity distance
    distances = pairwise_distances_chunked(outputs, metric='cosine', n_jobs=16)
    logging.debug('Computed pairwise distances')
    t = time.time()
    mAP, per_class_mAP = compute_mapk(distances, labels, k=map)
    writer.add_text('Per class mAP at epoch {}\n'.format(epoch),
                    json.dumps(per_class_mAP, indent=2, sort_keys=True))

    logging.debug('Completed evaluation of mAP in {}'.format(
        datetime.timedelta(seconds=int(time.time() - t))))

    logging.info('\33[91m ' + logging_label +
                 ' set: mAP: {}\n\33[0m'.format(mAP))

    # Logging the epoch-wise accuracy
    if multi_run is None:
        writer.add_scalar(logging_label + '/mAP', mAP, epoch)
    else:
        writer.add_scalar(logging_label + '/mAP{}'.format(multi_run), mAP,
                          epoch)

    return mAP
Example #26
    def create_intra_samples_model(self):
        """Create intra sample model
        This model compute depth correlation within samples. 
        This will be used later to compute a new intra z-score in a new sample
        """
        # Keep row every step line
        # reset index because we are going to work on integer index

        logging.info(f"Create intra model")

        sub_raw = self.raw.reset_index()
        # sub_raw = sub_raw[sub_raw.index % self.sampling == 0]

        # Create mask index
        # This is used to avoid pairwise comparison within the same name.
        # For example, if name = [A,A,A,B,B,C], it computes the following mask
        # (True/1 where the two names are equal, i.e. the pair is skipped):

        #   A A A B B C
        # A 1 1 1 0 0 0
        # A 1 1 1 0 0 0
        # A 1 1 1 0 0 0
        # B 0 0 0 1 1 0
        # B 0 0 0 1 1 0
        # C 0 0 0 0 0 1

        index = sub_raw["name"]
        mask = np.array([i[0] == i[1] for i in product(index, index)
                         ]).reshape(len(index), len(index))

        # return to multiindex
        sub_raw = sub_raw.set_index(["name", "chrom", "pos"])

        def _reduce(chunk, start):
            """This function is called internally by pairwise_distances_chunked
            @see https://scikit-learn.org/stable/modules/generated/sklearn.metrics.pairwise_distances_chunked.html
            
            This function looks for the maximum correlation  value in the chunk matrix and return the id 
            Same name in pairwise are skiped by the mask 
            
            For example:
                      A   B   C 
                A    NA  0.9  0.8
                B    0.5 NA  0.4
                C    0.3 0.7  NA
            
            Will return a dataframe:
             id   idx  corr 
             A     B    0.9
             B     C    0.4
             C     B    0.9
            
            Args:
                chunk (TYPE): Description
                start (TYPE): Description
            
            Returns:
                TYPE: Description
            
            """
            # replace NaN distances with 1 (i.e. zero correlation)
            chunk[np.isnan(chunk)] = 1
            # the correlation metric from sklearn is 1 - corr; convert back to correlation
            chunk = 1 - chunk
            rows_size = chunk.shape[0]

            select_mask = mask[start:start + rows_size]
            # looks for id of maximum correlation value
            idx = np.argmax(np.ma.masked_array(chunk, select_mask), axis=1)

            # We only get idx, let's get correlation value
            corr = []
            for i, index in enumerate(idx):
                corr.append(chunk[i][index])

            # Create a dataframe
            return pd.DataFrame({
                "idx": idx,
                "corr": corr
            },
                                index=range(start, start + rows_size))

        # Perform pairwise correlation by using pairwise_distances_chunked to avoid memory limit

        all_reduce_chunk = []

        # -1 mean all jobs
        for chunk in pairwise_distances_chunked(sub_raw,
                                                metric="correlation",
                                                reduce_func=_reduce,
                                                n_jobs=self.threads):
            all_reduce_chunk.append(chunk)

        self.intra_model = pd.concat(all_reduce_chunk)
        ss = sub_raw.reset_index(drop=True)

        # avoid warning : polynomial.py:630: RuntimeWarning: invalid value encountered in true_divide

        with np.errstate(divide="ignore", invalid="ignore"):
            for i, row in self.intra_model.iterrows():

                j = row["idx"]

                x = ss.loc[i, :]
                y = ss.loc[j, :]

                try:
                    coef, intercept = tuple(np.polyfit(x, y, 1))
                    yp = x * coef + intercept
                    error = yp - y
                    std = error.std()
                except Exception:
                    coef, intercept = 0, 0
                    std = np.NaN

                self.intra_model.loc[i, "coef"] = coef
                self.intra_model.loc[i, "intercept"] = intercept
                self.intra_model.loc[i, "std2"] = std

        self.intra_model = self.intra_model.set_index(sub_raw.index)
Example #27
def interlist_diversity(predicted_results: pd.DataFrame,
                        click_column: str,
                        k: int,
                        user_id_column: str = Constants.user_id,
                        item_id_column: str = Constants.item_id,
                        user_sample_size: Union[int, float, None] = 10000,
                        seed: int = Constants.default_seed,
                        metric: Union[str, Callable] = 'cosine',
                        num_runs: int = 10,
                        n_jobs: int = 1,
                        working_memory: int = None) -> Tuple[float, int]:
    """
    Inter-List Diversity@k measures the inter-list diversity of the recommendations when only k recommendations are
    made to the user. It measures how user's lists of recommendations are different from each other. This metric has a
    range in :math:`[0, 1]`. The higher this metric is, the more diversified lists of items are recommended to different
    users. Let :math:`U` denote the set of :math:`N` unique users, :math:`u_i`, :math:`u_j \in U` denote the i-th and
    j-th user in the user set, :math:`i, j \in \{1,2,\cdots,N\}`. :math:`R_{u_i}` is the binary indicator vector
    representing provided recommendations for :math:`u_i`. :math:`I` is the set of all unique user pairs,
    :math:`\\forall~i<j, \{u_i, u_j\} \in I`.

    .. math::
            Inter \mbox{-} list~diversity = \\frac{\sum_{i,j, \{u_i, u_j\} \in I}(cosine\_distance(R_{u_i}, R_{u_j}))}{|I|}

    By default, the reported metric is averaged over a number of ``num_runs`` (default=10) evaluations with each run
    using ``user_sample_size`` (default=10000) users, to ease the computing process and meanwhile get close
    approximation of this metric. When ``user_sample_size=None``, all users will be used in evaluation.

    Parameters
    ----------
    predicted_results: pd.DataFrame
        Recommendations data frame with (user_id, item_id, score) in each row.
    k: int
        Top-k recommendations to consider.
    user_id_column: str
        User id column name.
    item_id_column: str
        Item id column name.
    click_column: str
        Recommendation score column name.
    user_sample_size: Union[int, float, None]
        When input is an integer, it defines the number of randomly sampled users. When input is float, it defines the
        proportion of users to randomly sample for evaluation. If it is None, all users are included. Default=10,000.
    seed: int
        The seed used to create random state.
    metric: Union[str, Callable]
        Default = 'cosine'. The distance metric leveraged by sklearn.metrics.pairwise_distances_chunked.
        The metric to use when calculating distance between instances in a feature array.
        If metric is a string, it must be one of the options allowed by scipy.spatial.distance.pdist for its metric
        parameter, or a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS. If metric is a callable function,
        it is called on each pair of instances (rows) and the resulting value recorded.
        The callable should take two arrays from X as input and return a value indicating the distance between them.
    num_runs: int
        num_runs is used to report the approximation of Inter-List Diversity over multiple runs on smaller
        samples of users, default=10, for a speed-up on evaluations. The sampling size is defined by
        user_sample_size. The final result is averaged over the multiple runs.
    n_jobs: int
        Number of jobs to use for computation in parallel, leveraged by sklearn.metrics.pairwise_distances_chunked.
        -1 means using all processors. Default=1.
    working_memory: Union[int, None]
        Maximum memory for temporary distance matrix chunks, leveraged by sklearn.metrics.pairwise_distances_chunked.
        Example input: working_memory = 1024. When None (default), the value of sklearn.get_config()['working_memory'],
        i.e. 1024M, is used.

    Returns
    -------
    Inter-list diversity metric, number of unique users as the support to get the metric
    """

    # Sample users
    if user_sample_size is not None:
        results_over_runs = []
        supports_over_runs = []

        # Create a different seed for each run
        rng = np.random.default_rng(seed)
        seeds = rng.integers(0, num_runs * 10, num_runs)

        for i in range(num_runs):

            df = sample_users(predicted_results,
                              user_id_column,
                              user_sample_size,
                              seed=seeds[i])

            res, support = interlist_diversity(df,
                                               click_column,
                                               k,
                                               user_id_column=user_id_column,
                                               item_id_column=item_id_column,
                                               user_sample_size=None,
                                               metric=metric,
                                               n_jobs=n_jobs,
                                               working_memory=working_memory)
            results_over_runs.append(res)
            supports_over_runs.append(support)

        inter_list_diversity = np.mean(results_over_runs)
        support = int(np.mean(supports_over_runs))

        return inter_list_diversity, support

    df = predicted_results

    # Sort by user and score, and take the top k scores.
    df = get_sorted_clicks(df, user_id_column, click_column, k)

    # Given user/item id column names, create sparse matrix as the new representation of user-item interactions.
    sparse_matrix = tocsr(df, user_id_column, item_id_column)

    # Get pairwise cosine distances
    chunked_sum_cosine_distances = map(
        sum,
        pairwise_distances_chunked(sparse_matrix,
                                   reduce_func=reduce_func,
                                   metric=metric,
                                   n_jobs=n_jobs,
                                   working_memory=working_memory))

    # Sum of all cosine distances of unique pairs
    sum_cosine_distances = sum(list(chunked_sum_cosine_distances)) / 2.0

    # Get number of pairs
    num_pairs = np.sum(range(sparse_matrix.shape[0]))

    # Calculate metric
    if num_pairs == 0:
        inter_list_diversity = np.nan
        warnings.warn(
            'Inter-List Diversity will be nan when there is only one single user.'
        )
    else:
        inter_list_diversity = sum_cosine_distances / num_pairs
        if np.abs(inter_list_diversity) <= 1e-06:
            inter_list_diversity = 0.0

    # Calculate support, set it to be the number of users
    support = len(df[user_id_column].unique())

    return inter_list_diversity, support
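interlist_diversity relies on a module-level reduce_func defined outside this snippet; a plausible sketch consistent with how its output is consumed above (the row-sum behaviour is an assumption):

import numpy as np

def reduce_func(D_chunk, start):
    # Row-wise sums of the cosine distances in this chunk; the caller sums these
    # per-chunk vectors and halves the grand total so each unique pair counts once.
    return np.sum(D_chunk, axis=1)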
Example #28
    def min_radius(self, n, data, target, dist_metric, mode, **kwargs):
        """
        Compute the minimum radius of a hypersphere such that, with each example
        in the data matrix as its centre, the sphere contains at least n examples
        from the same class and n examples from a different class.

        Args:
            n : int -- minimum number of examples from same class and different class a hypersphere with centre in
            each example in the dataset should contain

            data : Array[np.float64] -- Matrix containing examples' features as rows

            target : Array[np.int] -- Matrix of target variable values

            dist_metric : Callable[[Array[np.float64], Array[np.float64]], np.float64] -- distance metric for distance matrix computation

            mode : str -- equal to 'index' if selecting examples by their index and equal to 'example' if passing in explicit examples.

            **kwargs -- argument with keyword learned_metric_func can contain a learned metric function.

        Returns:
        np.float64 : Minimum acceptable radius of the hypersphere

        """

        # Allocate array for storing minimum acceptable radius for each example in dataset.
        min_r = np.empty(data.shape[0], dtype=np.float64)

        # Initialize distance matrix.
        dist_mat = None

        # If operating in learned metric space.
        if mode == "index":
            dist_metric_aux = lambda x1, x2: dist_metric(
                np.ones(data.shape[1], dtype=np.float), x1[np.newaxis], x2[
                    np.newaxis])
            dist_func = partial(kwargs['learned_metric_func'], dist_metric_aux)
            dist_func_adapter = lambda x1, x2: dist_func(
                np.int(
                    np.where(np.sum(np.equal(x1, data), 1) == data.shape[1])[0]
                    [0]),
                np.int(
                    np.where(np.sum(np.equal(x2, data), 1) == data.shape[1])[0]
                    [0]))
            dist_mat = sk_metrics.pairwise_distances_chunked(
                data, metric=dist_func_adapter, working_memory=0)
        elif mode == "example":  # else
            dist_func = lambda x1, x2: dist_metric(
                np.ones(data.shape[1], dtype=np.float), x1[np.newaxis], x2[
                    np.newaxis])
            dist_mat = sk_metrics.pairwise_distances_chunked(data,
                                                             metric=dist_func,
                                                             n_jobs=-1,
                                                             working_memory=0)
        else:
            raise ValueError('Unknown mode specifier {0}'.format(mode))

        # Go over examples and compute minimum acceptable radius for each example.
        for k in np.arange(data.shape[0]):
            dist_from_e = next(dist_mat)[0]  # Next row of the distance matrix.
            msk = target == target[k]  # Mask for examples from the same class.
            dist_same = dist_from_e[msk]  # Distances to examples from the same class.
            dist_diff = dist_from_e[~msk]  # Distances to examples from a different class.
            try:
                # The radius must reach the n-th nearest same-class neighbour
                # (index n skips the example itself, which is at distance 0) and
                # the n-th nearest different-class neighbour.
                min_r[k] = np.max((np.sort(dist_same)[n], np.sort(dist_diff)[n - 1]))
            except IndexError:
                raise ValueError(
                    'Insufficient examples with class {0} for given value of n (n = {1})'
                    .format(target[k], n))

        # Return the maximum over all examples' minimum acceptable radii.
        return np.max(min_r)
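The loop above relies on pairwise_distances_chunked acting as a generator that, with working_memory=0, yields the distance matrix one row at a time; a minimal standalone sketch of that access pattern on synthetic data:

import numpy as np
from sklearn import metrics as sk_metrics

X = np.random.default_rng(0).normal(size=(5, 3))
# With working_memory=0, sklearn warns that it cannot honour the 0 MiB budget
# and falls back to the smallest possible chunk: a single row.
gen = sk_metrics.pairwise_distances_chunked(X, metric='euclidean',
                                            working_memory=0)
for k in range(X.shape[0]):
    row = next(gen)[0]                 # distances from example k to all examples
    assert row.shape == (X.shape[0],)
    assert np.isclose(row[k], 0.0)     # distance of an example to itself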
Example #29
0
    def calculate_distances(self):
        print('\n Calculating distances - TFIDF')
        self.tfidf_distances = pairwise_distances_chunked(
            self.tfidf_embeddings, metric='cosine', n_jobs=-1)
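Note that pairwise_distances_chunked returns a lazy generator, so the attribute assigned above holds unevaluated chunks rather than a matrix; a small sketch of materialising the full matrix (the embedding array below is an illustrative stand-in):

import numpy as np
from sklearn.metrics import pairwise_distances_chunked

tfidf_embeddings = np.random.default_rng(0).random((100, 20))
chunks = pairwise_distances_chunked(tfidf_embeddings, metric='cosine', n_jobs=-1)
tfidf_distances = np.vstack(list(chunks))   # dense (100, 100) cosine distance matrix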
Example #30
0
    def __init__(self, configs: Union[Configuration, dict, nx.DiGraph]):
        if type(configs) == nx.DiGraph:  # Assume we're creating a copy
            super().__init__(configs)
            return
        elif type(configs) == dict:
            configs = SmallWorldTopology.Configuration(**configs)

        super().__init__()
        self.__dict__.update(asdict(configs))

        assert (
            len(self.minicolumn_shape) == 3
        ), "Minicolumn shape must be of dimension 3 (3D)"
        assert (
            len(self.macrocolumn_shape) == 3
        ), "Macrocolumn shape must be of dimension 3 (3D)"

        # Initial neuron positions (all separated by neuron_spacing)
        i, j, k = np.multiply(self.macrocolumn_shape, self.minicolumn_shape)
        grid = np.mgrid[:i, :j, :k].reshape(3, -1)
        x, y, z = grid * self.neuron_spacing

        # Adding minicolumnSpacing (from random to small world topology)
        if self.minicolumn_spacing > 0:
            for d in range(3):  # For each dimension
                grid[d] //= self.minicolumn_shape[d]
            x += grid[0] * self.minicolumn_spacing
            y += grid[1] * self.minicolumn_spacing
            z += grid[2] * self.minicolumn_spacing

        positions = map(lambda p: {"position": p}, zip(x, y, z))
        self.add_nodes_from(zip(range(len(x)), positions))

        # Distance-based random connectivity
        positions = np.stack(np.asarray(self.nodes.data("position"))[:, 1])

        if (
            self.sparse_init
        ):  # Slower but iterative (for adjacency matrices that don't fit in memory)
            distances = pairwise_distances_chunked(
                positions,
                metric="euclidean",
                n_jobs=-1,
                reduce_func=lambda chunk, start: bsr_matrix(
                    np.random.random(chunk.shape)
                    < self.p_max * np.exp(-chunk / self.intracolumnar_sparseness)
                ),
                working_memory=self.mem_available,
            )
            adjacency_matrix = vstack(list(distances))
            adjacency_matrix.setdiag(0)  # Avoid self-connections
            self.add_edges_from(zip(*adjacency_matrix.nonzero()))
        else:
            distances = cdist(positions, positions, "euclidean")
            probabilities = self.p_max * np.exp(
                -distances / self.intracolumnar_sparseness
            )
            np.fill_diagonal(probabilities, 0)  # Avoid self-connections
            rand_matrix = np.random.random(probabilities.shape)
            i, j = np.nonzero(rand_matrix < probabilities)
            self.add_edges_from(zip(i, j))

        n_neurons = self.number_of_nodes()
        self.inhibitory_neurons = set(
            np.random.permutation(n_neurons)[: int(n_neurons * self.inhibitory_prob)]
        )

        for u, v in self.edges:
            if u in self.inhibitory_neurons:
                self.edges[u, v]["weight"] = -np.random.uniform(
                    *self.inhibitory_init_weight_range
                )
            else:
                self.edges[u, v]["weight"] = np.random.uniform(
                    *self.excitatory_init_weight_range
                )

        if self.spectral_radius_norm:
            spectral_radius = lambda matrix: np.max(np.abs(np.linalg.eigvals(matrix)))
            adj = nx.adjacency_matrix(self, weight="weight").todense()
            scale = 1.0 / spectral_radius(np.abs(adj))

            for i, (u, v) in enumerate(self.edges):
                self.edges[u, v]["weight"] = self.edges[u, v]["weight"] * scale

        if _logger.isEnabledFor(logging.INFO):
            # Some extra info about the topology
            out_degrees = np.array(self.out_degree())[:, 1]
            reporter.log_metrics(
                {
                    "number-of-neurons": n_neurons,
                    "number-of-synapses": self.number_of_edges(),
                    "excitatory-ratio": 100.0
                    * (1.0 - len(self.inhibitory_neurons) / n_neurons),
                    "avg-out-degree": np.mean(out_degrees),
                    "nb-out-degree-0": len(out_degrees) - np.count_nonzero(out_degrees),
                    "nb-isolates": nx.number_of_isolates(self),
                }
            )
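A condensed, self-contained sketch of the sparse_init branch above: the reduce_func turns each block of distances into a sparse Boolean connection matrix so the full dense distance matrix never has to be held in memory (the parameter values here are illustrative assumptions, not the class defaults):

import numpy as np
from scipy.sparse import bsr_matrix, vstack
from sklearn.metrics import pairwise_distances_chunked

rng = np.random.default_rng(0)
positions = rng.random((500, 3))        # neuron coordinates
p_max, sparseness = 0.1, 0.5            # assumed connectivity parameters

def connect_chunk(chunk, start):
    # Turn one block of distances into Boolean connection decisions.
    return bsr_matrix(rng.random(chunk.shape) < p_max * np.exp(-chunk / sparseness))

chunks = pairwise_distances_chunked(positions, metric="euclidean",
                                    reduce_func=connect_chunk,
                                    working_memory=64)
adjacency = vstack(list(chunks)).tolil()
adjacency.setdiag(0)                    # no self-connections
sources, targets = adjacency.nonzero()
edges = list(zip(sources, targets))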