def setUp(self):
    # create indexer
    file_dir_path = os.path.dirname(__file__)
    data_dir = 'data_samples/ms_marco_passage_retrieval'
    self.abs_data_dir = os.path.abspath(
        os.path.join(file_dir_path, *([os.pardir] * 4), data_dir))
    self.index_name = "final"

    indexer_config = {
        "batch_size": 5,
        "fields": ["doc_id", "content", "pack_info"],
        "indexer": {
            "name": "ElasticSearchIndexer",
            "hparams": {
                "index_name": self.index_name,
                "hosts": "localhost:9200",
                "algorithm": "bm25"
            },
            "other_kwargs": {
                "request_timeout": 10,
                "refresh": True
            }
        }
    }
    self.indexer = ElasticSearchIndexer(
        config={"index_name": self.index_name})

    nlp: Pipeline[DataPack] = Pipeline()
    nlp.set_reader(MSMarcoPassageReader())
    nlp.add(DataSelectorIndexProcessor(), config=indexer_config)
    nlp.initialize()

    self.size = 0
    for _ in nlp.process_dataset(self.abs_data_dir):
        self.size += 1

    self.test_dir = tempfile.mkdtemp()
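setUp above creates a scratch directory via tempfile.mkdtemp() that is never removed in the snippet. A minimal matching tearDown sketch, assuming shutil is imported and that only the temporary directory needs cleanup (dropping the Elasticsearch test index would require an extra client call not shown here):

def tearDown(self):
    # Remove the scratch directory created in setUp; the "final" test index
    # in Elasticsearch is left untouched in this minimal sketch.
    shutil.rmtree(self.test_dir, ignore_errors=True)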
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--config_file", default="./config.yml",
                        help="Config YAML filepath")
    args = parser.parse_args()

    # loading config
    config = yaml.safe_load(open(args.config_file, "r"))

    file_dir_path = os.path.dirname(__file__)
    data_dir = 'data_samples/ms_marco_passage_retrieval'
    abs_data_dir = os.path.abspath(
        os.path.join(file_dir_path, *([os.pardir] * 3), data_dir))

    reader = MSMarcoPassageReader()
    nlp = CreateIndexerPipeline(reader=reader, reader_config=None,
                                indexer_config=config["indexer_config"])
    nlp.create_index(abs_data_dir)
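main() only consumes config["indexer_config"] from the loaded YAML. As an illustrative sketch, the loaded dict presumably mirrors the indexer_config used in the test setUp above; the values below are taken from that test configuration, not from the shipped config.yml:

# Illustrative shape of yaml.safe_load(config_file); the real config.yml may differ.
config = {
    "indexer_config": {
        "batch_size": 5,
        "fields": ["doc_id", "content", "pack_info"],
        "indexer": {
            "name": "ElasticSearchIndexer",
            "hparams": {
                "index_name": "final",
                "hosts": "localhost:9200",
                "algorithm": "bm25",
            },
            "other_kwargs": {"request_timeout": 10, "refresh": True},
        },
    },
}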
def setUp(self):
    file_dir_path = os.path.dirname(__file__)
    data_dir = 'data_samples/ms_marco_passage_retrieval'
    self.abs_data_dir = os.path.abspath(
        os.path.join(file_dir_path, *([os.pardir] * 4), data_dir))
    corpus_file = os.path.join(self.abs_data_dir, 'collection.tsv')

    self.expected_content = set()
    with open(corpus_file, 'r') as f:
        for line in f.readlines():
            key, value = tuple(line.split('\t', 1))
            self.expected_content.add(value)

    self.index_name = "test_indexer"
    indexer_config = {
        "batch_size": 5,
        "fields": ["doc_id", "content", "pack_info"],
        "indexer": {
            "name": "ElasticSearchIndexer",
            "hparams": {
                "index_name": self.index_name,
                "hosts": "localhost:9200",
                "algorithm": "bm25"
            },
            "other_kwargs": {
                "request_timeout": 10,
                "refresh": True
            }
        }
    }
    self.indexer = ElasticSearchIndexer(
        config={"index_name": self.index_name})

    self.nlp: Pipeline[DataPack] = Pipeline()
    self.reader = MSMarcoPassageReader()
    self.processor = DataSelectorIndexProcessor()
    self.nlp.set_reader(self.reader)
    self.nlp.add(self.processor, config=indexer_config)
    self.nlp.initialize()
def setUp(self):
    self.pipeline = Pipeline()
    self.pipeline.set_reader(MSMarcoPassageReader())
    self.pipeline.initialize()

    root_path = os.path.abspath(
        os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            os.pardir,
            os.pardir,
            os.pardir,
            os.pardir,
        ))
    self.data_dir = os.path.join(
        root_path, "data_samples/ms_marco_passage_retrieval")

    corpus_file = os.path.join(self.data_dir, "collection.tsv")
    self.expected_content = {}
    with open(corpus_file, "r") as f:
        for line in f.readlines():
            key, value = tuple(line.split("\t", 1))
            self.expected_content[key] = value
import os
import unittest
from typing import Dict

from forte.data.data_pack import DataPack
from forte.data.readers import MSMarcoPassageReader
from ft.onto.base_ontology import Document


class MSMarcoPassageReaderTest(unittest.TestCase):

    def setUp(self):
        self.reader = MSMarcoPassageReader()
        self.data_dir = 'data_samples/ms_marco_passage_retrieval'

        corpus_file = os.path.join(self.data_dir, 'collection.tsv')
        self.expected_content = {}
        with open(corpus_file, 'r') as f:
            for line in f.readlines():
                key, value = tuple(line.split('\t', 1))
                self.expected_content[key] = value

    def test_ms_marco_passage_reader(self):
        actual_content: Dict[str, str] = {}
        for data_pack in self.reader.iter(self.data_dir):
            self.assertIsInstance(data_pack, DataPack)

            # Each pack should carry exactly one Document covering the passage.
            doc_entries = list(data_pack.get_entries_by_type(Document))
            self.assertTrue(len(doc_entries) == 1)

            doc_entry: Document = doc_entries[0]
            self.assertIsInstance(doc_entry, Document)
            actual_content[data_pack.meta.doc_id] = doc_entry.text

        self.assertDictEqual(actual_content, self.expected_content)
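If the test module is meant to be runnable on its own (rather than only through a test runner), the standard unittest entry point can be appended; a minimal sketch:

if __name__ == "__main__":
    unittest.main()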
import argparse
import logging
import os

import yaml

from forte.common.configuration import Config
from forte.data.data_pack import DataPack
from forte.data.readers import MSMarcoPassageReader
from forte.pipeline import Pipeline
from forte.processors.ir import ElasticSearchTextIndexProcessor

logging.basicConfig(level=logging.INFO)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--config_file", default="./config.yml",
                        help="Config YAML filepath")
    args = parser.parse_args()

    config = yaml.safe_load(open(args.config_file, "r"))
    config = Config(config, default_hparams=None)

    nlp: Pipeline[DataPack] = Pipeline()
    nlp.set_reader(MSMarcoPassageReader())
    nlp.add(ElasticSearchTextIndexProcessor(), config=config.create_index)
    nlp.initialize()

    data_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                             config.data.relative_path)

    # Report progress every 10,000 indexed packs.
    for idx, pack in enumerate(nlp.process_dataset(data_path)):
        if (idx + 1) % 10000 == 0:
            print(f"Indexed {idx + 1} packs")
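After the pipeline finishes, it can be useful to confirm that documents actually reached Elasticsearch. A quick sanity-check sketch using the official elasticsearch Python client, assuming it is installed and that "ms_marco" stands in for the index_name configured under create_index in config.yml:

from elasticsearch import Elasticsearch

# Count the documents in the target index on the same host the pipeline used.
# "ms_marco" is a placeholder for the index name set in config.yml.
es = Elasticsearch(hosts="localhost:9200")
print(es.count(index="ms_marco")["count"])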