예제 #1
0
    def __call__(
            self, q_vects: Union[csr_matrix, List]
    ) -> Tuple[List[List[str]], List[List[float]]]:
        """Rank the training labels by cosine similarity to each question.

        Parameters:
            q_vects: vectorized questions. Either a sparse matrix with one
                row per question, a list of dense ``np.ndarray`` vectors, or
                a list whose first element is ``None`` (no usable vectors).

        Returns:
            Tuple ``(answers, scores)``. Both are lists with one entry per
            question; each entry lists the ``self.top_n`` best labels
            (respectively their normalized scores rounded to 2 decimals),
            best first.

        Raises:
            NotImplementedError: if the vectors are of an unsupported type.
        """

        # NOTE: the query norm is taken without ``axis``, i.e. it is a single
        # scale factor for the whole batch; it cancels out in the per-row
        # normalization further down.
        if isinstance(q_vects[0], csr_matrix):
            norm = sparse_norm(q_vects) * sparse_norm(self.x_train_features,
                                                      axis=1)
            cos_similarities = np.array(
                q_vects.dot(self.x_train_features.T).todense()) / norm
        elif isinstance(q_vects[0], np.ndarray):
            q_vects = np.array(q_vects)
            norm = np.linalg.norm(q_vects) * np.linalg.norm(
                self.x_train_features, axis=1)
            cos_similarities = q_vects.dot(self.x_train_features.T) / norm
        elif q_vects[0] is None:
            # One all-zero row per question and one column per training
            # sample. A 1-D array here would break the column indexing below,
            # and ``len()`` is not defined for sparse feature matrices.
            cos_similarities = np.zeros((len(q_vects), len(self.y_train)))
        else:
            raise NotImplementedError('Not implemented this type of vectors')

        # Score of a class = best similarity among its training samples.
        y_labels = np.unique(self.y_train)
        labels_scores = np.zeros((len(cos_similarities), len(y_labels)))
        for label_idx, label in enumerate(y_labels):
            labels_scores[:, label_idx] = np.max(
                [
                    cos_similarities[:, sample_idx]
                    for sample_idx, value in enumerate(self.y_train)
                    if value == label
                ],
                axis=0)

        # Normalize the class scores of every question so they sum to 1.
        # Rows that sum to zero are left at zero instead of becoming NaN.
        row_sums = labels_scores.sum(axis=1, keepdims=True)
        labels_scores = np.divide(labels_scores,
                                  row_sums,
                                  out=np.zeros_like(labels_scores),
                                  where=row_sums != 0)
        answer_ids = np.argsort(labels_scores)[:, -self.top_n:]

        # Collect the top_n answers and scores, best first.
        answers = []
        scores = []
        for row_scores, ranked in zip(labels_scores, answer_ids[:, ::-1]):
            answers.append([y_labels[idx] for idx in ranked])
            scores.append([np.round(row_scores[idx], 2) for idx in ranked])

        return answers, scores
예제 #2
0
    def _similarity(self, q_vect: Union[csr_matrix, List]) -> List[float]:
        """Compute the cosine similarity of a query against every stored item.

        Parameters:
            q_vect: sparse vector of the user's query; only its first row is
                reported.

        Returns:
            cos_similarities: one similarity score per row of
                ``self.x_train_features``; NaN values (e.g. from zero-norm
                vectors) are replaced via ``np.nan_to_num``.
        """

        item_vectors = self.x_train_features

        # Dot product of the query with every item, as a dense 2-D array.
        dot_products = np.array(q_vect.dot(item_vectors.T).todense())

        # Product of the query norm and each item's row norm.
        denominators = sparse_norm(q_vect) * sparse_norm(item_vectors, axis=1)

        first_row = (dot_products / denominators)[0]
        return np.nan_to_num(first_row)
예제 #3
0
    def __call__(self, q_vects: Union[csr_matrix, List]) -> Tuple[List[str], List[float]]:
        """Rank the training labels by cosine similarity to each question.

        Parameters:
            q_vects: vectorized questions. Either a sparse matrix with one
                row per question, a list of dense ``np.ndarray`` vectors, or
                a list whose first element is ``None`` (no usable vectors).

        Returns:
            Tuple ``(answers, scores)`` of flat lists: for every question, in
            order, its ``self.top_n`` best labels (respectively their
            normalized scores rounded to 2 decimals), best first.

        Raises:
            NotImplementedError: if the vectors are of an unsupported type.
        """

        # NOTE: the query norm is taken without ``axis``, i.e. it is a single
        # scale factor for the whole batch; it cancels out in the per-row
        # normalization further down.
        if isinstance(q_vects[0], csr_matrix):
            norm = sparse_norm(q_vects) * sparse_norm(self.x_train_features, axis=1)
            cos_similarities = np.array(q_vects.dot(self.x_train_features.T).todense())/norm
        elif isinstance(q_vects[0], np.ndarray):
            q_vects = np.array(q_vects)
            norm = np.linalg.norm(q_vects)*np.linalg.norm(self.x_train_features, axis=1)
            cos_similarities = q_vects.dot(self.x_train_features.T)/norm
        elif q_vects[0] is None:
            # One all-zero row per question and one column per training
            # sample. A 1-D array here would break the column indexing below,
            # and ``len()`` is not defined for sparse feature matrices.
            cos_similarities = np.zeros((len(q_vects), len(self.y_train)))
        else:
            raise NotImplementedError('Not implemented this type of vectors')

        # Score of a class = best similarity among its training samples.
        y_labels = np.unique(self.y_train)
        labels_scores = np.zeros((len(cos_similarities), len(y_labels)))
        for label_idx, label in enumerate(y_labels):
            labels_scores[:, label_idx] = np.max(
                [cos_similarities[:, sample_idx]
                 for sample_idx, value in enumerate(self.y_train) if value == label],
                axis=0)

        # Normalize the class scores of every question so they sum to 1.
        # Rows that sum to zero are left at zero instead of becoming NaN.
        row_sums = labels_scores.sum(axis=1, keepdims=True)
        labels_scores = np.divide(labels_scores, row_sums,
                                  out=np.zeros_like(labels_scores), where=row_sums != 0)
        answer_ids = np.argsort(labels_scores)[:, -self.top_n:]

        # Collect the top_n answers and scores (flattened), best first.
        answers = []
        scores = []
        for row_scores, ranked in zip(labels_scores, answer_ids[:, ::-1]):
            answers.extend(y_labels[idx] for idx in ranked)
            scores.extend(np.round(row_scores[idx], 2) for idx in ranked)

        return answers, scores
예제 #4
0
    def __init_probability(self, adjacency):
        """Compute the normalized importance of every target node (column).

            The norm of each column of the adjacency matrix is divided by the
            sum of all column norms and stored on ``self.probability``.

            Input:
            ------
            adjacency: sparse matrix, the initial adjacency matrix
                (rows are source nodes, columns are target nodes)

        """

        column_norms = sparse_norm(adjacency, axis=0)
        total_norm = np.sum(column_norms)

        # Scale so that the stored values sum to 1.
        self.probability = column_norms / total_norm
예제 #5
0
    def predict(self, X):
        """Pick the text block most likely to hold the ingredient list.

        Each block is scored by the total count reported by
        ``self._count_vect`` for that block (presumably a vectorizer fitted
        on an ingredient vocabulary -- confirm), divided by the norm of the
        block's own bag-of-words vector. Blocks whose norm is zero score 0.

        Parameters:
            X: list of text blocks.

        Returns:
            Index of the highest-scoring block. The per-block scores are also
            stored on ``self.similarity_``.
        """
        # Per-block total of the counts produced by the fitted vectorizer.
        ingredient_counts = self._count_vect.transform(X)
        ingredient_hits = np.array(ingredient_counts.sum(axis=1)).squeeze()

        # Norm of each block over its own full vocabulary.
        block_norms = sparse_norm(CountVectorizer().fit_transform(X), axis=1)

        # Divide only where the norm is non-zero; other entries stay at 0.
        has_tokens = block_norms != 0
        scores = np.zeros(block_norms.shape)
        np.divide(ingredient_hits, block_norms, out=scores, where=has_tokens)

        self.similarity_ = scores
        return np.argmax(scores)
예제 #6
0
def _scale_rows(matrix, divisors):
    """Divide every row of a sparse matrix by its divisor without densifying.

    Rows whose divisor is zero end up as all-zero rows instead of NaN.
    """
    divisors = np.asarray(divisors, dtype=np.float64).reshape(-1, 1)
    factors = np.divide(1.0, divisors, out=np.zeros_like(divisors), where=divisors != 0)
    return csr_matrix(matrix.multiply(factors))


def tfidf(data, sublinear_tf = False, norm = 'l2', use_idf = True, smooth_idf = True, **kwargs):
    """Build a tf-idf matrix from raw documents.

    Parameters:
        data: iterable of raw text documents.
        sublinear_tf: replace each non-zero count ``c`` by ``1 + log(c)``.
        norm: ``'l1'``, ``'l2'`` (any ``'l<ord>'``) for row normalization, or
            ``None`` to skip normalization.
        use_idf: multiply term frequencies by the inverse document frequency.
        smooth_idf: add one to document counts, as if an extra document
            contained every term once.
        **kwargs: forwarded to ``CountVectorizer``.

    Returns:
        ``csr_matrix`` with one row per document and one column per term.
    """

    # Count the occurrences of every term in every document.
    tf = CountVectorizer(**kwargs).fit_transform(data)

    # The counts are integers unless a float dtype was requested; convert so
    # the log-scaled values and frequencies below are not truncated.
    if not np.issubdtype(tf.dtype, np.floating):
        tf = tf.astype(np.float64)

    # Sublinear tf: apply the logarithm to the stored non-zero counts only.
    if sublinear_tf:
        stored = tf.data != 0
        tf.data[stored] = 1 + np.log(tf.data[stored])

    # Divide by the per-document total to obtain term frequencies. Empty
    # documents stay all-zero rather than becoming NaN, which would otherwise
    # be counted as occurrences in the document frequencies below.
    tf = _scale_rows(tf, tf.sum(axis = 1))

    # D: total number of documents; d: number of documents containing each term.
    D = tf.shape[0] + int(smooth_idf)
    d = tf.getnnz(axis = 0) + int(smooth_idf)

    # idf formula.
    idf = 1 + np.log(D / d)

    # Multiply tf * idf when requested.
    res = (tf.multiply(idf) if use_idf else tf)

    # Normalize each row with the requested norm ('l2' -> ord 2, 'l1' -> ord 1).
    if norm is None:
        return csr_matrix(res)
    return _scale_rows(res, sparse_norm(res, int(norm[1:]), axis = 1))
예제 #7
0
def algebra_view(request):
    """Render the subreddit-algebra page.

    Reads ``sub0 .. sub{N_SUBS-1}`` and ``op0 .. op{N_SUBS-2}`` from the query
    string, combines the selected subreddit vectors with the requested
    plus/minus operations and lists the subreddits most similar to the
    result. When no subreddit is selected the empty form is rendered. An
    invalid operation or an unknown subreddit name renders the form with an
    ``error`` message instead of failing.

    Side effects: stores ``sorted_indices`` and ``sims`` in the session.
    """
    subs = [
        request.GET.get('sub{}'.format(subnum), NULL_SUB)
        for subnum in range(N_SUBS)
    ]
    subsset = set(subs)
    ops = [
        request.GET.get('op{}'.format(subnum), 'plus')
        for subnum in range(N_SUBS - 1)
    ]
    if all(sub == NULL_SUB for sub in subs):
        return render(request, 'algebra.html', {
            'subs': subs,
            'ops': ops,
            'view': 'algebra'
        })

    def render_error(message):
        # Re-render the form with the user's selections and an error message.
        return render(
            request, 'algebra.html', {
                'subs': subs,
                'ops': ops,
                'error': message,
                'view': 'algebra'
            })

    # Combine each subreddit's vector using the defined operations
    X = get_X()
    sub_to_index = get_sub_to_index()
    if subs[0] != NULL_SUB:
        # Names come straight from the query string, so they may be unknown.
        if subs[0] not in sub_to_index:
            return render_error('Unknown subreddit ({})'.format(subs[0]))
        vec = X[sub_to_index[subs[0]]]
    else:
        # NOTE(review): X looks like a sparse matrix (see sparse_norm and
        # .toarray() below); confirm np.zeros_like behaves as intended here.
        vec = np.zeros_like(X[0])
    for sub, op in zip(subs[1:], ops):
        if sub != NULL_SUB:
            if op not in ('plus', 'minus'):
                return render_error('Invalid operation ({})'.format(op))
            if sub not in sub_to_index:
                return render_error('Unknown subreddit ({})'.format(sub))
            if op == 'plus':
                vec = vec + X[sub_to_index[sub]]
            else:
                vec = vec - X[sub_to_index[sub]]

    # Need to renormalize this vector because a normalized vector plus/minus a normalized vector is not normalized
    norm = sparse_norm(vec)
    if norm != 0:
        vec = vec / norm

    # Get the similarity of this vector to every subreddit vector
    sims = X.dot(vec.T).toarray().ravel().astype(np.float16)
    # Sort the results in descending order
    sorted_indices = sims.argsort().astype(np.uint16)[::-1]
    index_to_sub = get_index_to_sub()
    # Store the results in the session so that they can be used in the refined search
    request.session['sorted_indices'] = sorted_indices
    request.session['sims'] = sims
    # Limit the results and don't include subreddits used in the initial algebra
    subsims = ((index_to_sub[i], round(sims[i] * 100, 1))
               for i in sorted_indices if index_to_sub[i] not in subsset)
    subsims = list(islice(subsims, N_RESULTS))
    return render(request, 'algebra.html', {
        'subs': subs,
        'subsims': subsims,
        'ops': ops,
        'view': 'algebra'
    })
예제 #8
0
 def __init__(self, pre_probs, features, adj, **kwargs):
     """Initialize the parent and derive per-column probabilities from ``adj``.

     Parameters:
         pre_probs: accepted but not used by this initializer.
         features: forwarded to the parent initializer.
         adj: adjacency matrix; forwarded to the parent and used to compute
             ``self.probs``.
         **kwargs: forwarded to the parent initializer.
     """
     super().__init__(features, adj, **kwargs)

     # Column norms of the adjacency matrix, scaled to sum to 1.
     column_norms = sparse_norm(adj, axis=0)
     total_norm = np.sum(column_norms)
     self.probs = column_norms / total_norm
예제 #9
0
 def cost(X):
     """Return half the squared norm of ``X[sigma_set] - a_sparse``.

     ``sigma_set`` and ``a_sparse`` are taken from the enclosing scope.
     """
     residual = X[sigma_set] - a_sparse
     residual_norm = sparse_norm(residual)
     return 0.5 * residual_norm ** 2