def __init__(self, output_prefix, corpus, num_features, num_best=None, chunksize=256, shardsize=32768, norm='l2'):
    """Build a sharded similarity index over `corpus`.

    The index can be extended later by calling the `add_documents` method.

    Parameters
    ----------
    output_prefix : str or None
        Shards are stored to disk under `output_prefix.shard_number`, so write
        access to that location is required. If None, a random filename in temp
        is used instead.
    corpus : iterable or None
        Documents to index right away. If None, the index starts out empty.
    num_features : int
        Number of features in `corpus` (e.g. size of the dictionary, or the number
        of latent topics for latent semantic models).
    num_best : int, optional
        If set, queries return only the `num_best` most similar documents, always
        leaving out documents for which the similarity is 0. If left unspecified,
        queries return a full vector with one float for every document in the index.
    chunksize : int, optional
        Stored on the instance after conversion with `int()`.
    shardsize : int, optional
        Number of documents per shard. Choose it so that a `shardsize x chunksize`
        matrix of floats fits comfortably into main memory.
    norm : {'l1', 'l2'}, optional
        The user-chosen normalization to use.

    Notes
    -----
    Documents are split (internally, transparently) into shards of `shardsize`
    documents each, converted to a matrix, for faster BLAS calls.

    If the input vector itself only has features with zero values (=the sparse
    representation is empty), the list returned with `num_best` set is always empty.

    `num_best` can also be overridden dynamically, simply by setting e.g.
    `self.num_best = 10` before doing a query.

    Examples
    --------
    >>> index = Similarity('/path/to/index', corpus, num_features=400)  # if corpus has 7 documents...
    >>> index[query]  # ... then result will have 7 floats
    [0.0, 0.0, 0.2, 0.13, 0.8, 0.0, 0.1]

    >>> index.num_best = 3
    >>> index[query]  # return at most "num_best" of `(index_of_document, similarity)` tuples
    [(4, 0.8), (2, 0.13), (3, 0.13)]

    """
    # undocumented feature: set output_prefix=None to create the server in temp
    self.output_prefix = utils.randfname(prefix='simserver') if output_prefix is None else output_prefix
    logger.info("starting similarity index under %s", self.output_prefix)

    # Query / storage configuration.
    self.num_features = num_features
    self.num_best = num_best
    self.norm = norm
    self.chunksize = int(chunksize)
    self.shardsize = shardsize

    # Nothing is indexed yet: no shards on record, no pending ("fresh") documents.
    self.shards = []
    self.fresh_docs = []
    self.fresh_nnz = 0

    if corpus is None:
        return
    self.add_documents(corpus)
def __init__(self, output_prefix, corpus, num_features, num_best=None, chunksize=256, shardsize=32768, norm='l2'):
    """Set up the index and, if a corpus is given, add its documents.

    Parameters
    ----------
    output_prefix : str
        Prefix used for the shard filenames. Pass None to get a random filename in temp.
    corpus : iterable of list of (int, number)
        Corpus in streamed Gensim bag-of-words format. If None, no documents are added.
    num_features : int
        Number of features, i.e. the size of the dictionary.
    num_best : int, optional
        When set, only the `num_best` most similar documents are returned, and documents
        with similarity = 0 are always left out. When unset, a full vector is returned,
        holding one float for every document in the index.
    chunksize : int, optional
        Size of query chunks; used internally when the query is an entire corpus.
        Converted with `int()` before being stored.
    shardsize : int, optional
        Upper bound on the shard size, counted in documents. Pick it so that a
        `shardsize x chunksize` matrix of floats fits comfortably into your RAM.
    norm : {'l1', 'l2'}, optional
        Normalization to use.

    Notes
    -----
    Internally and transparently, documents are divided into shards of `shardsize`
    documents each; every shard is converted to a matrix, for faster BLAS calls, and
    stored to disk under `output_prefix.shard_number`. Without an output prefix, a
    random filename in temp is used.

    If your entire index fits in memory (~1 million documents per 1GB of RAM), you can
    also use the :class:`~gensim.similarities.docsim.MatrixSimilarity` or
    :class:`~gensim.similarities.docsim.SparseMatrixSimilarity` classes directly.
    They are simpler but do not scale as well (the entire index is kept in RAM, with
    no sharding), and they do not support adding new documents dynamically.

    """
    prefix = output_prefix
    if prefix is None:
        # undocumented feature: set output_prefix=None to create the server in temp
        prefix = utils.randfname(prefix='simserver')
    self.output_prefix = prefix
    logger.info("starting similarity index under %s", self.output_prefix)

    self.num_features = num_features
    self.num_best = num_best
    self.norm = norm
    self.chunksize = int(chunksize)
    self.shardsize = shardsize

    # Start from an empty index; documents only arrive through `add_documents`.
    self.shards = []
    self.fresh_docs = []
    self.fresh_nnz = 0

    if corpus is not None:
        self.add_documents(corpus)
def __init__(self, output_prefix, corpus, num_features, num_best=None, chunksize=256, shardsize=32768,
             use_reverse_index=False):
    """Construct the index from `corpus`.

    The index can be later extended by calling the `add_documents` method.

    Parameters
    ----------
    output_prefix : str or None
        Each shard is stored to disk under `output_prefix.shard_number` (=you need
        write access to that location). If None, a random filename in temp is used.
    corpus : iterable or None
        Documents to index right away. If None, no documents are added.
    num_features : int
        Number of features in the `corpus` (e.g. size of the dictionary, or the
        number of latent topics for latent semantic models).
    num_best : int, optional
        If set, queries return only the `num_best` most similar documents, always
        leaving out documents for which the similarity is 0. If left unspecified,
        similarity queries return a full vector with one float for every document
        in the index.
    chunksize : int, optional
        Stored on the instance after conversion with `int()`.
    shardsize : int, optional
        Number of documents per shard. Should be chosen so that a
        `shardsize x chunksize` matrix of floats fits comfortably into main memory.
    use_reverse_index : bool, optional
        Stored on the instance as `self.use_reverse_index`. This constructor does
        not build a reverse index itself.

    Notes
    -----
    Documents are split (internally, transparently) into shards of `shardsize`
    documents each, converted to a matrix, for faster BLAS calls.

    If the input vector itself only has features with zero values (=the sparse
    representation is empty), the list returned with `num_best` set is always empty.

    You can also override `num_best` dynamically, simply by setting e.g.
    `self.num_best = 10` before doing a query.

    Examples
    --------
    >>> index = Similarity('/path/to/index', corpus, num_features=400)  # if corpus has 7 documents...
    >>> index[query]  # ... then result will have 7 floats
    [0.0, 0.0, 0.2, 0.13, 0.8, 0.0, 0.1]

    >>> index.num_best = 3
    >>> index[query]  # return at most "num_best" of `(index_of_document, similarity)` tuples
    [(4, 0.8), (2, 0.13), (3, 0.13)]

    """
    if output_prefix is None:
        # undocumented feature: set output_prefix=None to create the server in temp
        self.output_prefix = utils.randfname(prefix='simserver')
    else:
        self.output_prefix = output_prefix
    # Pass the prefix as a logger argument instead of pre-formatting with `%`:
    # formatting is deferred until the record is actually emitted, and a tuple-valued
    # prefix can no longer break the format operation.
    logger.info("starting similarity index under %s", self.output_prefix)

    self.num_features = num_features
    self.num_best = num_best
    self.normalize = True
    self.chunksize = int(chunksize)
    self.shardsize = shardsize
    self.shards = []
    self.fresh_docs, self.fresh_nnz = [], 0

    # Only the flag is recorded here; constructing a reverse index from it is not
    # implemented in this constructor.
    self.use_reverse_index = use_reverse_index

    if corpus is not None:
        self.add_documents(corpus)
def __init__(self, output_prefix, corpus, num_features, num_best=None, chunksize=1024, shardsize=32768):
    """Build the index from `corpus`.

    Further documents can be added later through the `add_documents` method.

    Parameters
    ----------
    output_prefix : str or None
        Shards are stored to disk under `output_prefix.shard_number` (=you need write
        access to that location). If None, a random filename in temp is used.
    corpus : iterable or None
        Documents to index right away. If None, no documents are added.
    num_features : int
        Number of features in the `corpus` (e.g. size of the dictionary, or the number
        of latent topics for latent semantic models).
    num_best : int, optional
        If set, queries return only the `num_best` most similar documents. If left
        unspecified, queries return a full vector with one float for every document
        in the index.
    chunksize : int, optional
        Stored on the instance after conversion with `int()`.
    shardsize : int, optional
        Number of documents per shard. Choose it so that a `shardsize x chunksize`
        matrix of floats fits comfortably into main memory.

    Notes
    -----
    Documents are split into shards of `shardsize` documents each, converted to a
    matrix (for fast BLAS calls) and stored to disk.

    Examples
    --------
    >>> index = Similarity('/path/to/index', corpus, num_features=400)  # if corpus has 7 documents...
    >>> index[query]  # ... then result will have 7 floats
    [0.0, 0.0, 0.2, 0.13, 0.8, 0.0, 0.1]

    >>> index.num_best = 3
    >>> index[query]  # return at most "num_best" of `(index_of_document, similarity)` tuples
    [(4, 0.8), (2, 0.13), (3, 0.13)]

    """
    # undocumented feature: set output_prefix=None to create the server in temp
    self.output_prefix = utils.randfname(prefix='simserver') if output_prefix is None else output_prefix

    self.num_features = num_features
    self.num_best = num_best
    self.normalize = True
    self.chunksize = int(chunksize)
    self.shardsize = shardsize

    # Empty index to begin with: no shards, no pending ("fresh") documents.
    self.shards = []
    self.fresh_docs = []
    self.fresh_nnz = 0

    if corpus is None:
        return
    self.add_documents(corpus)