def __call__(self, q_vects: Union[csr_matrix, List]) -> Tuple[List[str], List[int]]:
    """Find the most similar answers for the input vectorized questions.

    Parameters:
        q_vects: vectorized questions

    Returns:
        Tuple of answers and scores
    """
    if isinstance(q_vects[0], csr_matrix):
        norm = sparse_norm(q_vects) * sparse_norm(self.x_train_features, axis=1)
        cos_similarities = np.array(q_vects.dot(self.x_train_features.T).todense()) / norm
    elif isinstance(q_vects[0], np.ndarray):
        q_vects = np.array(q_vects)
        norm = np.linalg.norm(q_vects) * np.linalg.norm(self.x_train_features, axis=1)
        cos_similarities = q_vects.dot(self.x_train_features.T) / norm
    elif q_vects[0] is None:
        # no usable vectors: one row of zero similarities per question
        cos_similarities = np.zeros((len(q_vects), len(self.x_train_features)))
    else:
        raise NotImplementedError('This type of vectors is not supported')

    # get the cosine similarity for each class
    y_labels = np.unique(self.y_train)
    labels_scores = np.zeros((len(cos_similarities), len(y_labels)))
    for i, label in enumerate(y_labels):
        # class score = max similarity over the training examples of that class
        labels_scores[:, i] = np.max(
            [cos_similarities[:, j] for j, value in enumerate(self.y_train) if value == label],
            axis=0)

    # normalize the scores across classes
    labels_scores = labels_scores / labels_scores.sum(axis=1, keepdims=True)
    answer_ids = np.argsort(labels_scores)[:, -self.top_n:]

    # generate top_n answers and scores
    answers = []
    scores = []
    for i in range(len(answer_ids)):
        answers.append([y_labels[idx] for idx in answer_ids[i, ::-1]])
        scores.append([np.round(labels_scores[i, idx], 2) for idx in answer_ids[i, ::-1]])

    return answers, scores
def _similarity(self, q_vect: Union[csr_matrix, List]) -> List[float]:
    """Calculate cosine similarity between the user's query and the product items.

    Parameters:
        q_vect: user's query

    Returns:
        cos_similarities: list of similarity scores
    """
    norm = sparse_norm(q_vect) * sparse_norm(self.x_train_features, axis=1)
    cos_similarities = np.array(q_vect.dot(self.x_train_features.T).todense()) / norm
    cos_similarities = cos_similarities[0]
    # replace NaNs produced by zero-norm rows with zeros
    cos_similarities = np.nan_to_num(cos_similarities)

    return cos_similarities
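A minimal, self-contained sketch of the same sparse cosine-similarity computation; the toy matrices standing in for `self.x_train_features` and the query are assumptions for illustration, not data from the original project:

import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import norm as sparse_norm

# Toy stand-ins: three "training" rows and one query row (assumed data).
x_train_features = csr_matrix(np.array([[1., 0., 2.],
                                        [0., 3., 0.],
                                        [1., 1., 1.]]))
q_vect = csr_matrix(np.array([[1., 0., 1.]]))

# cos(q, x_i) = <q, x_i> / (||q|| * ||x_i||), computed row by row.
norm = sparse_norm(q_vect) * sparse_norm(x_train_features, axis=1)
cos_similarities = np.array(q_vect.dot(x_train_features.T).todense()) / norm
cos_similarities = np.nan_to_num(cos_similarities[0])
print(cos_similarities)  # approximately [0.949, 0., 0.816]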
def __call__(self, q_vects: Union[csr_matrix, List]) -> Tuple[List[str], List[int]]:
    """Find the most similar answers for the input vectorized questions.

    Parameters:
        q_vects: vectorized questions

    Returns:
        Tuple of answers and scores
    """
    if isinstance(q_vects[0], csr_matrix):
        norm = sparse_norm(q_vects) * sparse_norm(self.x_train_features, axis=1)
        cos_similarities = np.array(q_vects.dot(self.x_train_features.T).todense()) / norm
    elif isinstance(q_vects[0], np.ndarray):
        q_vects = np.array(q_vects)
        norm = np.linalg.norm(q_vects) * np.linalg.norm(self.x_train_features, axis=1)
        cos_similarities = q_vects.dot(self.x_train_features.T) / norm
    elif q_vects[0] is None:
        # no usable vectors: one row of zero similarities per question
        cos_similarities = np.zeros((len(q_vects), len(self.x_train_features)))
    else:
        raise NotImplementedError('This type of vectors is not supported')

    # get the cosine similarity for each class
    y_labels = np.unique(self.y_train)
    labels_scores = np.zeros((len(cos_similarities), len(y_labels)))
    for i, label in enumerate(y_labels):
        # class score = max similarity over the training examples of that class
        labels_scores[:, i] = np.max(
            [cos_similarities[:, j] for j, value in enumerate(self.y_train) if value == label],
            axis=0)

    # normalize the scores across classes
    labels_scores = labels_scores / labels_scores.sum(axis=1, keepdims=True)
    answer_ids = np.argsort(labels_scores)[:, -self.top_n:]

    # generate top_n answers and scores (flattened across questions in this variant)
    answers = []
    scores = []
    for i in range(len(answer_ids)):
        answers.extend([y_labels[idx] for idx in answer_ids[i, ::-1]])
        scores.extend([np.round(labels_scores[i, idx], 2) for idx in answer_ids[i, ::-1]])

    return answers, scores
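The per-class aggregation used by both `__call__` variants above is easy to misread because of the nested comprehension. A standalone sketch with made-up similarities and labels (toy data, not from the original project):

import numpy as np

# Assumed toy inputs: similarities of 2 questions to 4 training examples,
# plus the class label of each training example.
cos_similarities = np.array([[0.9, 0.1, 0.4, 0.3],
                             [0.2, 0.8, 0.1, 0.7]])
y_train = ['faq_a', 'faq_b', 'faq_a', 'faq_b']
top_n = 2

y_labels = np.unique(y_train)
labels_scores = np.zeros((len(cos_similarities), len(y_labels)))
for i, label in enumerate(y_labels):
    # score of a class = max similarity over its training examples
    labels_scores[:, i] = np.max(
        [cos_similarities[:, j] for j, value in enumerate(y_train) if value == label],
        axis=0)

# normalize across classes, then keep the top_n classes per question
labels_scores = labels_scores / labels_scores.sum(axis=1, keepdims=True)
answer_ids = np.argsort(labels_scores)[:, -top_n:]
print([[y_labels[idx] for idx in row[::-1]] for row in answer_ids])
# -> [['faq_a', 'faq_b'], ['faq_b', 'faq_a']]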
def __init_probability(self, adjacency):
    """Compute the normalized importance of each target node (column)
    with respect to the source nodes (rows).

    Input:
    ------
    adjacency: sparse matrix, the initial adjacency matrix
    """
    norm_adjacency = sparse_norm(adjacency, axis=0)
    self.probability = norm_adjacency / np.sum(norm_adjacency)
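A quick numeric check of this normalization, using a small made-up adjacency matrix (an assumption for illustration); the same column-norm pattern reappears in the `__init__` example further down:

import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import norm as sparse_norm

adjacency = csr_matrix(np.array([[0., 1., 1.],
                                 [1., 0., 0.],
                                 [1., 1., 0.]]))
norm_adjacency = sparse_norm(adjacency, axis=0)        # l2 norm of each column
probability = norm_adjacency / np.sum(norm_adjacency)
print(probability, probability.sum())                  # entries sum to 1.0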
def predict(self, X):
    """Predict method of the ingredient extractor.

    X is a list of text blocks. This method returns the index of the text
    block that is most likely to hold the ingredient list.
    """
    # counts of ingredient-vocabulary terms in each block
    X_against_ingred_voc = self._count_vect.transform(X)
    # l2 norm of each block over the full corpus vocabulary
    X_norms = sparse_norm(CountVectorizer().fit_transform(X), axis=1)
    X_dot_ingred = np.array(X_against_ingred_voc.sum(axis=1)).squeeze()
    # cosine-like score; zero wherever a block has no tokens at all
    pseudo_cosine_sim = np.divide(X_dot_ingred, X_norms,
                                  out=np.zeros(X_norms.shape),
                                  where=X_norms != 0)
    self.similarity_ = pseudo_cosine_sim

    return np.argmax(pseudo_cosine_sim)
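A runnable sketch of the same scoring idea; the vocabulary, the text blocks, and the `vocabulary=` shortcut standing in for the fitted `self._count_vect` are all assumptions:

import numpy as np
from scipy.sparse.linalg import norm as sparse_norm
from sklearn.feature_extraction.text import CountVectorizer

# Hypothetical ingredient vocabulary, standing in for the fitted _count_vect.
ingred_vect = CountVectorizer(vocabulary=['flour', 'sugar', 'salt', 'butter'])
blocks = ['preheat the oven to 180 degrees',
          'flour sugar salt butter eggs',
          'mix well and bake for twenty minutes']

hits = ingred_vect.transform(blocks)                    # ingredient-term counts
norms = sparse_norm(CountVectorizer().fit_transform(blocks), axis=1)
dots = np.array(hits.sum(axis=1)).squeeze()
sim = np.divide(dots, norms, out=np.zeros(norms.shape), where=norms != 0)
print(int(np.argmax(sim)))                              # -> 1, the ingredient list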
def tfidf(data, sublinear_tf=False, norm='l2', use_idf=True, smooth_idf=True, **kwargs):
    # Count term occurrences.
    tf = CountVectorizer(**kwargs).fit_transform(data)
    # For sublinear tf, apply the logarithm to the nonzero counts.
    if sublinear_tf:
        tf = tf.astype(np.float64)  # avoid integer truncation of the log values
        tf.data = 1 + np.log(tf.data)
    # Divide by each document's total count to obtain the term frequency tf.
    tf = csr_matrix(tf / tf.sum(axis=1))
    # D: total number of documents; d: number of documents containing each word.
    D = tf.shape[0] + int(smooth_idf)
    d = tf.getnnz(axis=0) + int(smooth_idf)
    # Apply the idf formula.
    idf = 1 + np.log(D / d)
    # Multiply tf * idf when requested.
    res = tf.multiply(idf) if use_idf else tf
    # Normalize each row with the requested norm (e.g. 'l2' -> ord 2).
    return csr_matrix(res / sparse_norm(res, int(norm[1:]), axis=1).reshape(-1, 1))
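A quick usage check with a made-up corpus. Under the default arguments, dividing each row by its sum cancels out under the final l2 normalization, so the result should coincide with scikit-learn's smoothed tf-idf; the comparison below assumes scikit-learn is available and uses its defaults:

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = ['the cat sat on the mat', 'the dog sat', 'cats and dogs play']
X = tfidf(corpus)                                   # smoothed idf, l2 rows
X_ref = TfidfVectorizer().fit_transform(corpus)     # same formula internally
print(np.allclose(X.toarray(), X_ref.toarray()))    # expected: True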
def algebra_view(request):
    subs = [request.GET.get('sub{}'.format(subnum), NULL_SUB)
            for subnum in range(N_SUBS)]
    subsset = set(subs)
    ops = [request.GET.get('op{}'.format(subnum), 'plus')
           for subnum in range(N_SUBS - 1)]
    if all(sub == NULL_SUB for sub in subs):
        return render(request, 'algebra.html',
                      {'subs': subs, 'ops': ops, 'view': 'algebra'})

    # Combine each subreddit's vector using the requested operations
    X = get_X()
    sub_to_index = get_sub_to_index()
    if subs[0] != NULL_SUB:
        vec = X[sub_to_index[subs[0]]]
    else:
        # empty sparse row (np.zeros_like does not produce a sparse matrix)
        vec = csr_matrix(X[0].shape)
    for sub, op in zip(subs[1:], ops):
        if sub != NULL_SUB:
            if op == 'plus':
                vec = vec + X[sub_to_index[sub]]
            elif op == 'minus':
                vec = vec - X[sub_to_index[sub]]
            else:
                return render(request, 'algebra.html',
                              {'subs': subs, 'ops': ops,
                               'error': 'Invalid operation ({})'.format(op),
                               'view': 'algebra'})

    # Renormalize: the sum or difference of normalized vectors is not itself normalized
    norm = sparse_norm(vec)
    if norm != 0:
        vec = vec / norm

    # Get the similarity of this vector to every subreddit vector
    sims = X.dot(vec.T).toarray().ravel().astype(np.float16)
    # Sort the results in descending order
    sorted_indices = sims.argsort().astype(np.uint16)[::-1]
    index_to_sub = get_index_to_sub()
    # Store the results in the session so they can be reused by the refined search
    request.session['sorted_indices'] = sorted_indices
    request.session['sims'] = sims
    # Limit the results and exclude subreddits used in the initial algebra
    subsims = ((index_to_sub[i], round(sims[i] * 100, 1))
               for i in sorted_indices if index_to_sub[i] not in subsset)
    subsims = list(islice(subsims, N_RESULTS))

    return render(request, 'algebra.html',
                  {'subs': subs, 'subsims': subsims, 'ops': ops, 'view': 'algebra'})
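Why the renormalization step is needed: the sum or difference of two unit vectors is generally not a unit vector. A tiny illustration with made-up vectors:

from scipy.sparse import csr_matrix
from scipy.sparse.linalg import norm as sparse_norm

a = csr_matrix([[1.0, 0.0]])          # unit vector
b = csr_matrix([[0.0, 1.0]])          # unit vector
vec = a + b
print(sparse_norm(vec))               # ~1.414, no longer unit length
vec = vec / sparse_norm(vec)
print(sparse_norm(vec))               # 1.0 after renormalizing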
def __init__(self, pre_probs, features, adj, **kwargs):
    super().__init__(features, adj, **kwargs)
    # column norms of the adjacency matrix, normalized into a probability distribution
    col_norm = sparse_norm(adj, axis=0)
    self.probs = col_norm / np.sum(col_norm)
def cost(X):
    # 0.5 * squared Frobenius norm of (X restricted to sigma_set) minus a_sparse
    return 0.5 * sparse_norm(X[sigma_set] - a_sparse) ** 2
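This looks like the data-fit term of a matrix-completion objective, 0.5 * ||P_omega(X) - a||_F^2, evaluated only on the observed entries. A self-contained sketch where `sigma_set` (the observed indices) and the observed values are made-up stand-ins for the originals:

import numpy as np
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import norm as sparse_norm

# Assumed setup: indices and values of the observed entries.
rows = np.array([0, 1, 2])
cols = np.array([1, 0, 2])
vals = np.array([3.0, 1.0, 2.0])
sigma_set = (rows, cols)

def cost(X):
    # residual on the observed entries only, kept as a sparse matrix
    residual = csr_matrix((X[sigma_set] - vals, sigma_set), shape=X.shape)
    return 0.5 * sparse_norm(residual) ** 2

X = np.ones((3, 3))   # current dense estimate
print(cost(X))        # 0.5 * ((1-3)**2 + (1-1)**2 + (1-2)**2) = 2.5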