def docs_store(self, field='doc_id'):
    # NOTE: the MS MARCO v2 documents have this really neat quality that they contain the offset
    # position in the source file: <https://microsoft.github.io/msmarco/TREC-Deep-Learning.html>.
    # Unfortunately, it points to the position in the *uncompressed* file, so for this to work, we'd
    # need to decompress the source files, inflating the size ~3.3x. The options would be to:
    #  1) Always de-compress the source files, costing everybody ~3.3x the storage. Ouch.
    #  2) De-compress the source files the first time the docstore is requested. This would
    #     only cost the users who use the docstore 3.3x, but increases the complexity of the
    #     iteration code to handle both compressed and non-compressed versions. It would also need
    #     code to handle stuff like fancy slicing, which wouldn't be trivial. Would we also keep
    #     the original source file around? If so, it actually ends up being 4.3x.
    #  3) Build a PickleLz4FullStore on demand, as normal. This would only cost the users who use
    #     the docstore ~2.7x (accounting for the worse lz4 compression rate and keeping around the
    #     original copy of the data), but is also slightly slower because of the O(log n) position
    #     lookups and decompression. (This may be offset because pickle parsing is faster than
    #     json, though.) It also reduces the complexity of the code, as it does not require a new
    #     docstore implementation for this dataset, and is just doing the normal procedure.
    return PickleLz4FullStore(
        path=f'{self._dlc.path(force=False)}.pklz4',
        init_iter_fn=self.docs_iter,
        data_cls=self.docs_cls(),
        lookup_field=field,
        index_fields=['doc_id'],
        key_field_prefix='msmarco_doc_',  # cut down on storage by removing the prefix in the lookup structure
        size_hint=66500029281,
        count_hint=ir_datasets.util.count_hint(NAME),
    )
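# Usage sketch (illustrative, not part of the module): how a store built by the
# method above is typically consumed. The dataset id and document id below are
# assumptions for illustration; random access goes through the O(log n) position
# lookups mentioned in the NOTE, and callers still pass the full id even though
# key_field_prefix strips 'msmarco_doc_' from the keys kept in the lookup structure.
import ir_datasets

dataset = ir_datasets.load('msmarco-document-v2')  # example dataset id
store = dataset.docs_store()
doc = store.get('msmarco_doc_00_0')  # example doc_id; the prefix is included as usual
print(doc.doc_id)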
def docs_store(self, field='doc_id'):
    return PickleLz4FullStore(
        path=f'{self.docs_path()}.pklz4',
        init_iter_fn=self.docs_iter,
        data_cls=self.docs_cls(),
        lookup_field=field,
        index_fields=['doc_id'],
    )
def docs_store(self, field='doc_id'):
    return PickleLz4FullStore(
        path=f'{ir_datasets.util.home_path()/NAME}/docs.pklz4',
        init_iter_fn=self.docs_iter,
        data_cls=self.docs_cls(),
        lookup_field=field,
        index_fields=['doc_id'],
    )
def docs_store(self, field='doc_id'):
    return PickleLz4FullStore(
        path=f'{self.docs_path(force=False)}.pklz4',
        init_iter_fn=self._docs_iter,
        data_cls=self.docs_cls(),
        lookup_field=field,
        index_fields=['doc_id'],
        count_hint=ir_datasets.util.count_hint(NAME),
    )
def docs_store(self, field="doc_id"): return PickleLz4FullStore( path=f"{self.docs_path()}.pklz4", init_iter_fn=self.docs_iter, data_cls=self.docs_cls(), lookup_field=field, index_fields=["doc_id"], count_hint=self._count_hint, )
def docs_store(self, field='doc_id'):
    return PickleLz4FullStore(
        path=f'{ir_datasets.util.home_path()}/{NAME}/anchor-text.pklz4',
        init_iter_fn=self.docs_iter,
        data_cls=self.docs_cls(),
        lookup_field=field,
        index_fields=['doc_id'],
        count_hint=self._count_hint,
    )
def docs_store(self, field='doc_id'):
    return PickleLz4FullStore(
        path=f'{ir_datasets.util.home_path()/NAME}/docs.pklz4',
        init_iter_fn=self._docs_iter,
        data_cls=self.docs_cls(),
        lookup_field=field,
        size_hint=30735927055,  # expected size (in bytes) of the built store
        index_fields=['doc_id'],
        count_hint=ir_datasets.util.count_hint(NAME),
    )
def docs_store(self, field='doc_id'):
    return PickleLz4FullStore(
        path=f'{self.docs_path(force=False)}.pklz4',
        init_iter_fn=self.docs_iter,
        data_cls=self.docs_cls(),
        lookup_field=field,
        index_fields=['doc_id'],
        size_hint=self._docstore_size_hint,
        count_hint=self._count_hint,
    )
def docs_store(self, field='doc_id'):
    # fall back to indexing only doc_id when the dataset doesn't override the index fields
    fields = (self._doc_store_index_fields or ['doc_id'])
    return PickleLz4FullStore(
        path=self._docstore_path,
        init_iter_fn=self.docs_iter,
        data_cls=self.docs_cls(),
        lookup_field=field,
        index_fields=fields,
        count_hint=self._count_hint,
    )
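# Standalone sketch (illustrative): the same PickleLz4FullStore pattern outside a
# dataset class, with a toy document type. ToyDoc, iter_toy_docs, and the path are
# assumptions for illustration; the constructor keywords are the ones used in the
# snippets above.
from typing import NamedTuple

from ir_datasets.indices import PickleLz4FullStore

class ToyDoc(NamedTuple):
    doc_id: str
    url: str
    text: str

def iter_toy_docs():
    yield ToyDoc('d1', 'https://example.org/1', 'hello')
    yield ToyDoc('d2', 'https://example.org/2', 'world')

# Index both fields so lookups can be keyed on either one, mirroring the
# overridable index_fields pattern above; lookup_field picks the key for get().
store = PickleLz4FullStore(
    path='/tmp/toy.pklz4',
    init_iter_fn=iter_toy_docs,
    data_cls=ToyDoc,
    lookup_field='url',
    index_fields=['doc_id', 'url'],
)
print(store.get('https://example.org/2').text)  # iterates init_iter_fn to build the store on first access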