示例#1
0
 def docs_store(self, field='doc_id'):
     """Construct the PickleLz4FullStore for this collection's documents.

     Args:
         field: Value forwarded to the store as ``lookup_field``.

     Returns:
         The ``PickleLz4FullStore`` built from the arguments assembled below.
     """
     # Design note: MS MARCO v2 documents carry their own offset position within the source file
     # (see <https://microsoft.github.io/msmarco/TREC-Deep-Learning.html>), which is a really neat
     # property. Unfortunately that offset refers to the *uncompressed* file, so relying on it
     # would mean decompressing the source files, inflating their size ~3.3x. Options considered:
     #  1) Always decompress the source files. Everybody pays ~3.3x the storage. Ouch.
     #  2) Decompress the source files the first time the docstore is requested. Only docstore
     #     users pay the 3.3x, but the iteration code gets more complex because it has to handle
     #     both compressed and non-compressed versions, and it would also need code for things
     #     like fancy slicing, which would not be trivial. If the original source file is kept
     #     around too, the real cost is 4.3x.
     #  3) Build a PickleLz4FullStore on demand, as normal (the approach taken below). Only
     #     docstore users pay, about ~2.7x (accounting for the worse lz4 compression rate and for
     #     keeping the original copy of the data). It is slightly slower because of the O(log n)
     #     position lookups and decompression, although that may be offset by pickle parsing
     #     being faster than json. It also keeps the code simpler: no new docstore implementation
     #     is needed for this dataset, just the normal procedure.
     store_path = f'{self._dlc.path(force=False)}.pklz4'
     store_kwargs = {
         'path': store_path,
         'init_iter_fn': self.docs_iter,
         'data_cls': self.docs_cls(),
         'lookup_field': field,
         'index_fields': ['doc_id'],
         # Dropping this prefix in the lookup structure cuts down on storage.
         'key_field_prefix': 'msmarco_doc_',
         'size_hint': 66500029281,
         'count_hint': ir_datasets.util.count_hint(NAME),
     }
     return PickleLz4FullStore(**store_kwargs)
示例#2
0
 def docs_store(self, field='doc_id'):
     """Construct the PickleLz4FullStore for this dataset's documents.

     Args:
         field: Value forwarded to the store as ``lookup_field``.

     Returns:
         A ``PickleLz4FullStore`` whose path is ``self.docs_path()`` with
         ``.pklz4`` appended.
     """
     store_path = f'{self.docs_path()}.pklz4'
     doc_iter_fn = self.docs_iter
     doc_cls = self.docs_cls()
     return PickleLz4FullStore(
         path=store_path, init_iter_fn=doc_iter_fn, data_cls=doc_cls,
         lookup_field=field, index_fields=['doc_id'],
     )
 def docs_store(self, field='doc_id'):
     """Construct the PickleLz4FullStore for this dataset's documents.

     Args:
         field: Value forwarded to the store as ``lookup_field``.

     Returns:
         A ``PickleLz4FullStore`` whose path is ``docs.pklz4`` inside the
         ``NAME`` directory under ``ir_datasets.util.home_path()``.
     """
     store_kwargs = {
         'path': f'{ir_datasets.util.home_path()/NAME}/docs.pklz4',
         'init_iter_fn': self.docs_iter,
         'data_cls': self.docs_cls(),
         'lookup_field': field,
         'index_fields': ['doc_id'],
     }
     return PickleLz4FullStore(**store_kwargs)
示例#4
0
 def docs_store(self, field='doc_id'):
     """Construct the PickleLz4FullStore for this dataset's documents.

     Args:
         field: Value forwarded to the store as ``lookup_field``.

     Returns:
         A ``PickleLz4FullStore`` whose path is ``self.docs_path(force=False)``
         with ``.pklz4`` appended, populated from ``self._docs_iter``.
     """
     store_path = f'{self.docs_path(force=False)}.pklz4'
     doc_iter_fn = self._docs_iter
     doc_cls = self.docs_cls()
     n_docs_hint = ir_datasets.util.count_hint(NAME)
     return PickleLz4FullStore(
         path=store_path, init_iter_fn=doc_iter_fn, data_cls=doc_cls,
         lookup_field=field, index_fields=['doc_id'], count_hint=n_docs_hint,
     )
示例#5
0
 def docs_store(self, field="doc_id"):
     """Construct the PickleLz4FullStore for this dataset's documents.

     Args:
         field: Value forwarded to the store as ``lookup_field``.

     Returns:
         A ``PickleLz4FullStore`` whose path is ``self.docs_path()`` with
         ``.pklz4`` appended; ``self._count_hint`` is passed as the count hint.
     """
     store_kwargs = {
         "path": f"{self.docs_path()}.pklz4",
         "init_iter_fn": self.docs_iter,
         "data_cls": self.docs_cls(),
         "lookup_field": field,
         "index_fields": ["doc_id"],
         "count_hint": self._count_hint,
     }
     return PickleLz4FullStore(**store_kwargs)
示例#6
0
 def docs_store(self, field='doc_id'):
     """Construct the PickleLz4FullStore for this dataset's documents.

     Args:
         field: Value forwarded to the store as ``lookup_field``.

     Returns:
         A ``PickleLz4FullStore`` whose path is ``anchor-text.pklz4`` inside
         the ``NAME`` directory under ``ir_datasets.util.home_path()``.
     """
     store_path = f'{ir_datasets.util.home_path()}/{NAME}/anchor-text.pklz4'
     doc_iter_fn = self.docs_iter
     doc_cls = self.docs_cls()
     return PickleLz4FullStore(
         path=store_path, init_iter_fn=doc_iter_fn, data_cls=doc_cls,
         lookup_field=field, index_fields=['doc_id'],
         count_hint=self._count_hint,
     )
示例#7
0
 def docs_store(self, field='doc_id'):
     """Construct the PickleLz4FullStore for this dataset's documents.

     Args:
         field: Value forwarded to the store as ``lookup_field``.

     Returns:
         A ``PickleLz4FullStore`` whose path is ``docs.pklz4`` inside the
         ``NAME`` directory under ``ir_datasets.util.home_path()``, populated
         from ``self._docs_iter``.
     """
     store_kwargs = {
         'path': f'{ir_datasets.util.home_path()/NAME}/docs.pklz4',
         'init_iter_fn': self._docs_iter,
         'data_cls': self.docs_cls(),
         'lookup_field': field,
         'size_hint': 30735927055,
         'index_fields': ['doc_id'],
         'count_hint': ir_datasets.util.count_hint(NAME),
     }
     return PickleLz4FullStore(**store_kwargs)
示例#8
0
 def docs_store(self, field='doc_id'):
     """Construct the PickleLz4FullStore for this dataset's documents.

     Args:
         field: Value forwarded to the store as ``lookup_field``.

     Returns:
         A ``PickleLz4FullStore`` whose path is ``self.docs_path(force=False)``
         with ``.pklz4`` appended; the size and count hints come from
         ``self._docstore_size_hint`` and ``self._count_hint``.
     """
     store_path = f'{self.docs_path(force=False)}.pklz4'
     doc_iter_fn = self.docs_iter
     doc_cls = self.docs_cls()
     return PickleLz4FullStore(
         path=store_path, init_iter_fn=doc_iter_fn, data_cls=doc_cls,
         lookup_field=field, index_fields=['doc_id'],
         size_hint=self._docstore_size_hint, count_hint=self._count_hint,
     )
示例#9
0
 def docs_store(self, field='doc_id'):
     """Construct the PickleLz4FullStore for this dataset's documents.

     Args:
         field: Value forwarded to the store as ``lookup_field``.

     Returns:
         A ``PickleLz4FullStore`` located at ``self._docstore_path``.
     """
     indexed = self._doc_store_index_fields
     if not indexed:
         # Nothing configured (or an empty/falsy value): fall back to doc_id.
         indexed = ['doc_id']
     store_kwargs = {
         'path': self._docstore_path,
         'init_iter_fn': self.docs_iter,
         'data_cls': self.docs_cls(),
         'lookup_field': field,
         'index_fields': indexed,
         'count_hint': self._count_hint,
     }
     return PickleLz4FullStore(**store_kwargs)