def __getitem__(self, bow, scaled=False, chunksize=512):
    """
    Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

    This is done by folding input document into the latent topic space.
    """
    assert self.projection.u is not None, "decomposition not initialized yet"

    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus and chunksize:
        # by default, transform `chunksize` documents at once, when called as `lsi[corpus]`.
        # this chunking is completely transparent to the user, but it speeds
        # up internal computations (one mat * mat multiplication, instead of
        # `chunksize` smaller mat * vec multiplications).
        return self._apply(bow, chunksize=chunksize)

    if not is_corpus:
        bow = [bow]
    vec = matutils.corpus2csc(bow, num_terms=self.num_terms)
    topic_dist = (vec.T * self.projection.u[:, :self.num_topics]).T  # (x^T * u).T = u^-1 * x
    if scaled:
        topic_dist = (1.0 / self.projection.s[:self.num_topics]) * topic_dist  # s^-1 * u^-1 * x

    # convert a numpy array to gensim sparse vector = tuples of (feature_id, feature_weight),
    # with no zero weights.
    if not is_corpus:
        # lsi[single_document]
        result = matutils.full2sparse(topic_dist.flat)
    else:
        # lsi[chunk of documents]
        result = matutils.Dense2Corpus(topic_dist)
    return result
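# A minimal usage sketch of this fold-in path, assuming a standard gensim
# install; the toy texts and variable names below are made up for illustration.
from gensim import corpora, models

texts = [["human", "interface", "computer"], ["graph", "minors", "survey"]]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=2)
query = dictionary.doc2bow(["human", "computer"])
print(lsi[query])         # single document -> list of (topic_id, topic_value)
print(list(lsi[corpus]))  # whole corpus -> transformed corpus, computed in chunks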
def __init__(self, corpus, num_features=None, num_terms=None, num_docs=None, num_nnz=None,
             num_best=None, chunksize=500, dtype=numpy.float32, maintain_sparsity=False):
    self.num_best = num_best
    self.normalize = True
    self.chunksize = chunksize
    self.maintain_sparsity = maintain_sparsity

    if corpus is not None:
        logger.info("creating sparse index")

        # iterate over input corpus, populating the sparse index matrix
        try:
            # use the more efficient corpus generation version, if the input
            # `corpus` is MmCorpus-like (knows its shape and number of non-zeroes).
            num_terms, num_docs, num_nnz = corpus.num_terms, corpus.num_docs, corpus.num_nnz
            logger.debug("using efficient sparse index creation")
        except AttributeError:
            # no MmCorpus, use the slower version (or maybe user supplied the
            # num_* params in constructor)
            pass
        if num_features is not None:
            # num_terms is just an alias for num_features, for compatibility with MatrixSimilarity
            num_terms = num_features
        if num_terms is None:
            raise ValueError("refusing to guess the number of sparse features: specify num_features explicitly")
        corpus = (matutils.scipy2sparse(v) if scipy.sparse.issparse(v) else
                  (matutils.full2sparse(v) if isinstance(v, numpy.ndarray) else
                   matutils.unitvec(v)) for v in corpus)
        self.index = matutils.corpus2csc(
            corpus, num_terms=num_terms, num_docs=num_docs, num_nnz=num_nnz,
            dtype=dtype, printprogress=10000
        ).T

        # convert to Compressed Sparse Row for efficient row slicing and multiplications
        self.index = self.index.tocsr()  # currently no-op, CSC.T is already CSR
        logger.info("created %r", self.index)
def __init__(self, uci_dir, dictionary, n_topics):
    bv = artm.BatchVectorizer(data_format='bow_uci', data_path=uci_dir,
                              collection_name='corpus',
                              target_folder=uci_dir + '/artm_batches')
    bv_dict = bv.dictionary

    logging.info("Fitting the ARTM model")
    model = artm.ARTM(dictionary=bv_dict, num_topics=n_topics)
    model.fit_offline(batch_vectorizer=bv, num_collection_passes=10)

    logging.info("Processing word-topic matrices")
    # Create a new word-topic matrix according to dictionary indices
    self.phi = np.zeros(model.phi_.shape, dtype=np.float64)
    for word, vec in model.phi_.iterrows():
        idx = dictionary.token2id[word[1]]
        self.phi[idx, :] = vec

    logging.info("Building the index for ARTM")
    corpus = model.transform(bv).T.sort_index()
    corpus = [matutils.full2sparse(row) for index, row in corpus.iterrows()]
    self.index = similarities.MatrixSimilarity(corpus, num_features=n_topics,
                                               num_best=self.N_BEST)
    self.model = model
    self.dictionary = dictionary
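# A minimal numpy sketch (hypothetical toy data) of the phi reindexing above:
# each row of the source word-topic matrix is written to the row index that
# the gensim dictionary assigns to its token.
import numpy as np

token2id = {'cat': 0, 'dog': 1, 'fish': 2}  # gensim-style token2id mapping
source_rows = {'dog': [0.2, 0.8], 'fish': [0.5, 0.5], 'cat': [0.9, 0.1]}

phi = np.zeros((len(token2id), 2))
for word, vec in source_rows.items():
    phi[token2id[word], :] = vec
print(phi)  # rows now ordered by the dictionary's ids: cat, dog, fish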
def train(self, read_article_ids=None, unread_article_ids=None):
    # Load user feedback if needed
    if read_article_ids is None:
        read_article_ids = (r.article.id for r in
                            ReadArticleFeedback.objects(user_id=self.user.id).only("article"))

    user_feedback = Article.objects(id__in=read_article_ids)

    # TODO: cluster feedback articles and save more than one profile
    num_loaded_articles = 0
    centroid = numpy.zeros(self.num_features_, dtype=numpy.float32)

    for article in user_feedback:
        try:
            article_features_as_full_vec = self.get_features(article)
        except Exception as inst:
            logger.error("Could not get features for article %s: %s", article.id, inst)
            continue

        # do we need this?
        tmp_doc = matutils.unitvec(article_features_as_full_vec)

        # add up tmp_doc
        centroid = numpy.add(centroid, tmp_doc)
        num_loaded_articles += 1

    # average each element
    if num_loaded_articles != 0:
        centroid = centroid / num_loaded_articles

    centroid = matutils.full2sparse(centroid)

    # set user model data
    self.user_model_features = [centroid]
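# Hedged, self-contained sketch of the centroid computation above, with the
# Mongo models stripped out: unit-normalize each feature vector, average, then
# convert the dense mean back to gensim's sparse (feature_id, weight) format.
import numpy
from gensim import matutils

features = [numpy.array([1.0, 0.0, 2.0]), numpy.array([0.0, 3.0, 1.0])]
centroid = numpy.zeros(3, dtype=numpy.float32)
for vec in features:
    centroid += matutils.unitvec(vec)
centroid /= len(features)
print(matutils.full2sparse(centroid))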
def __getitem__(self, bow, scaled=False, chunksize=512):
    """
    Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

    This is done by folding input document into the latent topic space.

    If `scaled` is set, scale topics by the inverse of singular values (default: no scaling).
    """
    assert self.projection.u is not None, "decomposition not initialized yet"

    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus and chunksize:
        # by default, transform `chunksize` documents at once, when called as `lsi[corpus]`.
        # this chunking is completely transparent to the user, but it speeds
        # up internal computations (one mat * mat multiplication, instead of
        # `chunksize` smaller mat * vec multiplications).
        return self._apply(bow, chunksize=chunksize)

    if not is_corpus:
        bow = [bow]

    # convert input to scipy.sparse CSC, then do "sparse * dense = dense" multiplication
    vec = matutils.corpus2csc(bow, num_terms=self.num_terms, dtype=self.projection.u.dtype)
    topic_dist = (vec.T * self.projection.u[:, :self.num_topics]).T  # (x^T * u).T = u^-1 * x

    # # convert input to dense, then do dense * dense multiplication
    # # ± same performance as above (BLAS dense * dense is better optimized than scipy.sparse),
    # # but consumes more memory
    # vec = matutils.corpus2dense(bow, num_terms=self.num_terms, num_docs=len(bow))
    # topic_dist = np.dot(self.projection.u[:, :self.num_topics].T, vec)

    # # use np's advanced indexing to simulate sparse * dense
    # # ± same speed again
    # u = self.projection.u[:, :self.num_topics]
    # topic_dist = np.empty((u.shape[1], len(bow)), dtype=u.dtype)
    # for vecno, vec in enumerate(bow):
    #     indices, data = zip(*vec) if vec else ([], [])
    #     topic_dist[:, vecno] = np.dot(u.take(indices, axis=0).T, np.array(data, dtype=u.dtype))

    if not is_corpus:
        # convert back from matrix into a 1d vec
        topic_dist = topic_dist.reshape(-1)

    if scaled:
        topic_dist = (1.0 / self.projection.s[:self.num_topics]) * topic_dist  # s^-1 * u^-1 * x

    # convert a np array to gensim sparse vector = tuples of (feature_id, feature_weight),
    # with no zero weights.
    if not is_corpus:
        # lsi[single_document]
        result = matutils.full2sparse(topic_dist)
    else:
        # lsi[chunk of documents]
        result = matutils.Dense2Corpus(topic_dist)
    return result
def __iter__(self):
    """The function that defines a corpus.

    Iterating over the corpus must yield sparse vectors, one for each document.
    """
    for i, image in enumerate(self.get_images()):
        logging.debug('__iter__ Yielding image no. {0}'.format(i))
        yield matutils.full2sparse(image, self.eps)
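# Quick illustration of the eps threshold used above: full2sparse drops
# entries whose absolute value does not exceed eps when converting a dense
# vector to the sparse (feature_id, weight) format.
import numpy as np
from gensim import matutils

image = np.array([0.9, 0.0, 1e-12, 0.4])
print(matutils.full2sparse(image, eps=1e-9))  # [(0, 0.9...), (3, 0.4...)]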
def sparse_mean(sparse_vectors_list):
    dense_vectors_list = []
    for vec in sparse_vectors_list:
        # NB: the hardcoded length is the corpus-specific number of features
        dense_vectors_list.append(matutils.sparse2full(vec, length=1013243))
    mean = np.mean(dense_vectors_list, axis=0)
    return matutils.unitvec(matutils.full2sparse(mean))
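# A small hedged demo of sparse_mean's logic, using a toy vector length in
# place of the corpus-specific 1013243 above.
import numpy as np
from gensim import matutils

vecs = [[(0, 1.0)], [(1, 1.0)]]
dense = [matutils.sparse2full(v, length=4) for v in vecs]
mean = np.mean(dense, axis=0)                        # [0.5, 0.5, 0.0, 0.0]
print(matutils.unitvec(matutils.full2sparse(mean)))  # ~[(0, 0.707), (1, 0.707)]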
def value_for_text(self, t, rp=default_rp):
    space = rp.lsa_space()
    num_topics = space.num_topics

    tokens = rp.tokens(t)
    tokens = [[token.lower() for token in sentence] for sentence in tokens]

    if len(tokens) < 2:
        return 0

    spans = np.zeros(len(tokens) - 1)
    for i in range(1, len(tokens)):
        past_sentences = tokens[:i]
        span_dim = len(past_sentences)
        if span_dim > num_topics - 1:
            # It's not clear, from the papers I read, what should be done
            # in this case. I did what seemed not to lose information:
            # merge the earliest sentences into one.
            beginning = past_sentences[0:span_dim - num_topics]
            past_sentences[0] = list(chain.from_iterable(beginning))
        past_vectors = [sparse2full(space.get_vector(sent), num_topics)
                        for sent in past_sentences]
        curr_vector = sparse2full(space.get_vector(tokens[i]), num_topics)
        curr_array = np.array(curr_vector).reshape(num_topics, 1)

        A = np.array(past_vectors).transpose()
        projection_matrix = dot(dot(A, pinv(dot(A.transpose(), A))), A.transpose())
        projection = dot(projection_matrix, curr_array).ravel()

        spans[i - 1] = cossim(full2sparse(curr_vector), full2sparse(projection))

    return self.get_value(spans)
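# Minimal numpy check of the projection step above: P = A (A^T A)^+ A^T
# projects a vector onto the column space of A, and the span score is the
# cosine between the vector and its projection.
import numpy as np
from numpy import dot
from numpy.linalg import pinv

A = np.array([[1.0, 0.0], [0.0, 1.0], [0.0, 0.0]])  # columns span the xy-plane
v = np.array([1.0, 1.0, 1.0])
P = dot(dot(A, pinv(dot(A.T, A))), A.T)
proj = dot(P, v)                                     # [1., 1., 0.]
print(dot(v, proj) / (np.linalg.norm(v) * np.linalg.norm(proj)))  # ~0.816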
def sparse2matrix(inpath1, inpath2, topics_num, file_name):
    destpath = '/data/mallet_tests/hellinger/tmp_matrice_' + topics_num + '_' + file_name
    with open(inpath1, 'r') as comparator, io.open(inpath2, 'r') as comparable:
        for line_tor in comparator:
            print(line_tor.split()[:2])
            # parse "topic:weight" pairs into gensim-style sparse tuples
            l_tor = line_tor.split()[2:]
            l_tor = tuple(tuple(map(int, pair.split(':'))) for pair in l_tor)
            len_tor = int(topics_num.split('x')[0])
            mat_tor = mat.sparse2full(doc=l_tor, length=len_tor)

            line_ble = comparable.readline()
            print(line_ble.split()[:2])
            l_ble = line_ble.split()[2:]
            l_ble = tuple(tuple(map(int, pair.split(':'))) for pair in l_ble)
            len_ble = int(topics_num.split('x')[1])
            mat_ble = mat.sparse2full(doc=l_ble, length=len_ble)

            # squared difference of square roots: the per-cell term of the
            # Hellinger distance between the two topic distributions
            matrix = n.zeros(shape=(len_ble, len_tor))
            for k in range(len_tor):
                for j in range(len_ble):
                    matrix[j][k] = (math.sqrt(mat_tor[k]) - math.sqrt(mat_ble[j])) ** 2

            with open(destpath + '_' + line_tor.split()[1] + '.txt', 'w') as matrixfile:
                matrixfile.write(str(mat.full2sparse(matrix)))
            print('word %s done' % line_ble.split()[:2])
    print('matrices done')
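# The inner double loop above fills in per-topic terms of a Hellinger-style
# distance. For two complete distributions, recent gensim versions ship a
# ready-made helper (assumed available here as gensim.matutils.hellinger):
import numpy as np
from gensim.matutils import hellinger

p = np.array([0.7, 0.2, 0.1])
q = np.array([0.4, 0.4, 0.2])
# hellinger(p, q) = sqrt(0.5 * sum((sqrt(p) - sqrt(q)) ** 2))
print(hellinger(p, q))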
def __get_centroid(self, cluster):
    # averages all docs in cluster
    count = 0
    centroid = numpy.zeros(self.num_features, dtype=numpy.float32)
    for doc_id in cluster:
        doc = self.similarity_index.vector_by_id(doc_id).toarray().flatten()
        centroid = centroid + doc
        count += 1

    if count != 0:
        centroid = centroid / count

    return matutils.full2sparse(centroid)
def __getitem__(self, bow, scaled=False, chunksize=256):
    """
    Return latent representation, as a list of (topic_id, topic_value) 2-tuples.

    This is done by folding input document into the latent topic space.
    """
    # if the input vector is in fact a corpus, return a transformed corpus as a result
    is_corpus, bow = utils.is_corpus(bow)
    if is_corpus and chunksize:
        # by default, transform 256 documents at once, when called as `lsi[corpus]`.
        # this chunking is completely transparent to the user, but it speeds
        # up internal computations (one mat * mat multiplication, instead of
        # 256 smaller mat * vec multiplications, better use of cache).
        return self._apply(bow, chunksize=chunksize)

    if is_corpus:
        vec = numpy.vstack(matutils.sparse2full(doc, self.num_terms).astype(self.projection.u.dtype)
                           for doc in bow).T
    else:
        vec = matutils.sparse2full(bow, self.num_terms).astype(self.projection.u.dtype)

    assert self.projection.u is not None, "decomposition not initialized yet"

    # automatically convert U to memory order suitable for column slicing
    # this will ideally be done only once, at the very first lsi[query] transformation
    self.projection.u = asfarray(self.projection.u)

    topic_dist = numpy.dot(self.projection.u[:, :self.num_topics].T, vec)  # u^-1 * x
    if scaled:
        topic_dist = (1.0 / self.projection.s[:self.num_topics]) * topic_dist  # s^-1 * u^-1 * x

    # convert a numpy array to gensim sparse vector = tuples of (feature_id, feature_weight),
    # with no zero weights.
    if not is_corpus:
        # lsi[single_document]
        result = matutils.full2sparse(topic_dist)
    else:
        # lsi[chunk of documents]
        result = matutils.Dense2Corpus(topic_dist)
    return result
def __init__(self, corpus, num_features=None, num_terms=None, num_docs=None, num_nnz=None,
             num_best=None, chunksize=500, dtype=numpy.float32, maintain_sparsity=False):
    """
    Parameters
    ----------
    corpus : iterable of list of (int, float)
        A list of documents in the BoW format.
    num_features : int, optional
        Size of the dictionary. Must be either specified, or present in `corpus.num_terms`.
    num_terms : int, optional
        Alias for `num_features`, you can use either.
    num_docs : int, optional
        Number of documents in `corpus`. Will be calculated if not provided.
    num_nnz : int, optional
        Number of non-zero elements in `corpus`. Will be calculated if not provided.
    num_best : int, optional
        If set, return only the `num_best` most similar documents, always leaving out documents
        with similarity = 0. Otherwise, return a full vector with one float for every document
        in the index.
    chunksize : int, optional
        Size of query chunks. Used internally when the query is an entire corpus.
    dtype : numpy.dtype, optional
        Data type of the internal matrix.
    maintain_sparsity : bool, optional
        Return sparse arrays from
        :meth:`~gensim.similarities.docsim.SparseMatrixSimilarity.get_similarities`?

    """
    self.num_best = num_best
    self.normalize = True
    self.chunksize = chunksize
    self.maintain_sparsity = maintain_sparsity

    if corpus is not None:
        logger.info("creating sparse index")

        # iterate over input corpus, populating the sparse index matrix
        try:
            # use the more efficient corpus generation version, if the input
            # `corpus` is MmCorpus-like (knows its shape and number of non-zeroes).
            num_terms, num_docs, num_nnz = corpus.num_terms, corpus.num_docs, corpus.num_nnz
            logger.debug("using efficient sparse index creation")
        except AttributeError:
            # no MmCorpus, use the slower version (or maybe user supplied the
            # num_* params in constructor)
            pass
        if num_features is not None:
            # num_terms is just an alias for num_features, for compatibility with MatrixSimilarity
            num_terms = num_features
        if num_terms is None:
            raise ValueError("refusing to guess the number of sparse features: specify num_features explicitly")
        corpus = (matutils.scipy2sparse(v) if scipy.sparse.issparse(v) else
                  (matutils.full2sparse(v) if isinstance(v, numpy.ndarray) else
                   matutils.unitvec(v)) for v in corpus)
        self.index = matutils.corpus2csc(
            corpus, num_terms=num_terms, num_docs=num_docs, num_nnz=num_nnz,
            dtype=dtype, printprogress=10000
        ).T

        # convert to Compressed Sparse Row for efficient row slicing and multiplications
        self.index = self.index.tocsr()  # currently no-op, CSC.T is already CSR
        logger.info("created %r", self.index)
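# Hedged usage sketch of SparseMatrixSimilarity with a tiny toy corpus.
from gensim import similarities

corpus = [[(0, 1.0), (2, 1.0)], [(1, 1.0)], [(0, 1.0), (1, 1.0)]]
index = similarities.SparseMatrixSimilarity(corpus, num_features=3, num_best=2)
query = [(0, 1.0)]
print(index[query])  # top-2 hits, e.g. [(0, 0.707...), (2, 0.707...)]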
def get_similar(self, doc, topn=10):
    m = np.asarray([matutils.sparse2full(doc, len(self.dictionary))])
    bv = artm.BatchVectorizer(data_format='bow_n_wd', n_wd=m.T, vocabulary=self.dictionary)
    sims = self.index[matutils.full2sparse(self.model.transform(bv))]
    return [t[0] for t in sims[:topn]]
def ndarray2gensim(array):
    """Convert a numpy ndarray into a gensim-style generator of lists of tuples."""
    return (full2sparse(row) for row in array)
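# Quick check of the converter above (assumes full2sparse is imported from
# gensim.matutils, as in the function's own body).
import numpy as np

arr = np.array([[0.0, 1.5], [2.0, 0.0]])
print([doc for doc in ndarray2gensim(arr)])  # [[(1, 1.5)], [(0, 2.0)]]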
def __getitem__(self, item):
    # full2sparse expects a 1-d vector, so draw a flat random vector
    return full2sparse(np.random.randn(self.dims))