def test_forward_index(self):
    """Read both documents sequentially and check metadata and term IDs."""
    rdr = IndexFactory.from_path(self.meta_path).reader()
    with self.subTest(document=1):
        doc = rdr.next_document()
        self.assertEqual(doc.title, "Document1")
        self.assertEqual(doc.doc_id, 0)
        self.assertEqual(doc.count, 3)
        self.assertEqual(doc.next_term_id(), 0)
        self.assertEqual(doc.next_term_id(), 1)
        # Intentionally leaving out the next lines
        # self.assertEqual(document.next_term_id(), 2)
        # self.assertEqual(document.next_term_id(), None)
    with self.subTest(document=2):
        doc = rdr.next_document()
        self.assertEqual(doc.title, "Document2")
        self.assertEqual(doc.doc_id, 1)
        self.assertEqual(doc.count, 3)
        # Term IDs come back in document order, then None once exhausted.
        for expected in (3, 4, 2, None):
            self.assertEqual(doc.next_term_id(), expected)
    self.assertEqual(rdr.next_document(), None)
def test_forward_index_skip_all(self):
    """Skipping past every document leaves the reader exhausted."""
    index = IndexFactory.from_path(self.meta_path)
    rdr = index.reader()
    rdr.skip(2)
    self.assertEqual(rdr.next_document(), None)
def test_forward_index_skip_first(self):
    """Skipping one document positions the reader at the second one."""
    rdr = IndexFactory.from_path(self.meta_path).reader()
    rdr.skip(1)
    doc = rdr.next_document()
    # The second document's metadata should be intact after the skip.
    for attr, expected in (("title", "Document2"), ("doc_id", 1), ("count", 3)):
        self.assertEqual(getattr(doc, attr), expected)
def test_pruning(self):
    """Prune all terms containing a character greater than "2" and verify
    that only the surviving document (with its surviving terms) remains.
    """
    meta_path = path.join(self.test_dir, 'f-metadata')
    doc_info_path = path.join(self.test_dir, 'f-doc_info')
    collection_path = path.join(self.test_dir, 'f-collection')
    terms_path = path.join(self.test_dir, 'f-terms')
    # Fix: use a context manager so the metadata file is closed (and
    # flushed) even if write() raises; the original leaked the handle.
    with open(meta_path, 'w') as f:
        f.write('''
        {{
            "type" : "research.index.forward.ForwardIndex",
            "name" : "ofi",
            "paths": {{
                "doc_info": "{0}",
                "collection": "{1}",
                "terms": "{2}"
            }}
        }}
        '''.format(doc_info_path, collection_path, terms_path))

    forward_index = IndexFactory.from_path(self.meta_path)
    output_index = IndexFactory.from_path(meta_path)

    class TermPruner:
        # Keep only terms whose characters are all <= "2".
        def test(self, term):
            return all(ord(ch) <= ord("2") for ch in term)

    forward_index.prune(TermPruner(), output_index)

    reader = output_index.reader()
    document = reader.next_document()
    self.assertEqual(document.title, "Document2")
    self.assertEqual(document.doc_id, 0)
    self.assertEqual(document.count, 2)
    self.assertEqual(document.next_term_id(), 0)
    self.assertEqual(document.next_term_id(), 1)
    self.assertEqual(document.next_term_id(), None)
    self.assertEqual(reader.next_document(), None)
def test_forward_index_read_terms(self):
    """next_term() yields each document's terms in order, then None."""
    rdr = IndexFactory.from_path(self.meta_path).reader()
    expected_by_doc = (["0", "1", "2", None], ["3", "4", "2", None])
    for doc_no, expected_terms in enumerate(expected_by_doc, start=1):
        with self.subTest(document=doc_no):
            doc = rdr.next_document()
            for term in expected_terms:
                self.assertEqual(doc.next_term(), term)
import argparse
import sys

from research.index.common import IndexFactory

# NOTE(review): the original description ("Flip the most significant bit in
# every byte of the file.") was copy-pasted from an unrelated tool; corrected
# to describe what this script actually does.
parser = argparse.ArgumentParser(
    description='Look up a single document in an index by title or by ID.')
parser.add_argument('index', type=str, help='JSON file of the input index')
g = parser.add_mutually_exclusive_group(required=True)
g.add_argument('--title', '-t', type=str, help='title to search for')
g.add_argument('--docid', '-d', type=int, help='document ID to search for')
args = parser.parse_args()

index = IndexFactory.from_path(args.index)
sys.stderr.write('Loading index...')
sys.stderr.flush()
reader = index.reader()
sys.stderr.write(' Done.\n')

document = None
if args.title is not None:
    document = reader.find_by_title(args.title)
else:
    document = reader.find_by_id(args.docid)

if document is None:
    # Fix: report whichever query was actually used; previously this
    # printed "None" when searching by --docid.
    query = args.title if args.title is not None else args.docid
    print("Couldn't find document {}".format(query))
else:
    # NOTE(review): the source was truncated mid-call here; the second
    # format argument is presumably the document title -- TODO confirm.
    sys.stdout.write("\n# Document {0}: {1}\n".format(document.doc_id,
                                                     document.title))
def get_index(test_dir) -> Index:
    """Write a minimal three-term index into *test_dir* and load it back
    through IndexFactory.
    """

    def _write_encoded(name, header, values):
        # Write one varbyte-coded file: header first, then each value.
        with open(path.join(test_dir, name), 'bw') as stream:
            write_header(header, stream)
            enc = Encoder(stream)
            for v in values:
                enc.encode(v)

    with open(path.join(test_dir, 'simple.terms'), 'w') as f:
        f.write("a\nb\nc\n")

    # Postings are gap-coded: a -> 1,5,9; b -> 1,2; c -> 5,6,8.
    _write_encoded('simple.docs',
                   {"coding": "research.coding.varbyte"},
                   [1, 4, 4, 1, 1, 5, 1, 2])
    _write_encoded('simple.frequencies',
                   {"count": 3, "coding": "research.coding.varbyte"},
                   [3, 2, 3])
    _write_encoded('simple.docs#offsets',
                   {"count": 3, "coding": "research.coding.varbyte"},
                   [0, 3, 5])
    # Per-posting term counts: a -> 1,1,3; b -> 2,2; c -> 2,5,20.
    _write_encoded('simple.counts',
                   {"coding": "research.coding.varbyte"},
                   [1, 1, 3, 2, 2, 2, 5, 20])
    _write_encoded('simple.counts#offsets',
                   {"count": 3, "coding": "research.coding.varbyte"},
                   [0, 3, 5])

    return IndexFactory.from_json({
        'type': 'research.index.simple.Index',
        'name': 'simple',
        'dir': test_dir
    })
import argparse
import json

import research.utils as utils
from research.index.common import IndexFactory

# NOTE(review): two fixes --
#  * `import argparse` (and `json`) were missing even though argparse is used;
#  * the description ("Flip the most significant bit in every byte of the
#    file.") was copy-pasted from an unrelated tool.
parser = argparse.ArgumentParser(
    description='Prune an index with the given pruner, writing the result '
                'to an output index.')
parser.add_argument('index', type=str,
                    help='Properties file of the input index')
parser.add_argument('--pruner', '-p', required=True, type=str,
                    help='A pruner to be used')
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument('--output', '-o', type=str,
                   help='Properties file of the output index')
group.add_argument('--json', '-j', type=str,
                   help='JSON string defining output index')
args = parser.parse_args()

input_index = IndexFactory.from_path(args.index)

# Fix: --json was accepted (mutually exclusive with --output) but ignored,
# so IndexFactory.from_path(None) would be called when only --json was given.
if args.output is not None:
    output_index = IndexFactory.from_path(args.output)
else:
    output_index = IndexFactory.from_json(json.loads(args.json))

input_index.prune(utils.get_object_of(args.pruner), output_index)