示例#1
0
    def test_forward_index(self):
        """Read the fixture index front to back, checking each document.

        For both documents the title, document ID and term count are
        verified, followed by the term IDs handed out by ``next_term_id``.
        After the second document the reader must return ``None``.
        """
        index = IndexFactory.from_path(self.meta_path)
        reader = index.reader()

        with self.subTest(document=1):
            first = reader.next_document()
            self.assertEqual(first.title, "Document1")
            self.assertEqual(first.doc_id, 0)
            self.assertEqual(first.count, 3)
            # Deliberately stop after two of the three term IDs: the third
            # ID (2) and the terminating None are left unread before the
            # reader is asked for the next document.
            for expected_id in (0, 1):
                self.assertEqual(first.next_term_id(), expected_id)

        with self.subTest(document=2):
            second = reader.next_document()
            self.assertEqual(second.title, "Document2")
            self.assertEqual(second.doc_id, 1)
            self.assertEqual(second.count, 3)
            # All three term IDs, then None once the document is exhausted.
            for expected_id in (3, 4, 2, None):
                self.assertEqual(second.next_term_id(), expected_id)

        # No third document exists.
        self.assertEqual(reader.next_document(), None)
示例#2
0
    def test_forward_index_skip_all(self):
        """Skipping two documents must leave the reader with nothing to return."""
        index = IndexFactory.from_path(self.meta_path)
        reader = index.reader()
        reader.skip(2)

        # After the skip, the very next read already yields None.
        self.assertEqual(reader.next_document(), None)
示例#3
0
    def test_forward_index_skip_first(self):
        """Skipping one document must position the reader on "Document2"."""
        index = IndexFactory.from_path(self.meta_path)
        reader = index.reader()
        reader.skip(1)

        second = reader.next_document()
        # The document keeps its own ID (1) even though it is the first one read.
        self.assertEqual(second.title, "Document2")
        self.assertEqual(second.doc_id, 1)
        self.assertEqual(second.count, 3)
示例#4
0
    def test_pruning(self):
        """Prune the fixture index into a new forward index and inspect it.

        A metadata file describing the output index ("ofi") is written into
        ``self.test_dir``, the input index is pruned into it with a
        character-based pruner, and the output is then read back.  The
        output is expected to hold a single document, "Document2", with
        ID 0 and two terms whose IDs are 0 and 1.
        """
        # Local import: only this test serialises JSON.
        import json

        meta_path = path.join(self.test_dir, 'f-metadata')
        output_spec = {
            "type": "research.index.forward.ForwardIndex",
            "name": "ofi",
            "paths": {
                "doc_info": path.join(self.test_dir, 'f-doc_info'),
                "collection": path.join(self.test_dir, 'f-collection'),
                "terms": path.join(self.test_dir, 'f-terms'),
            },
        }
        # json.dump escapes backslashes and quotes inside the paths, which a
        # hand-formatted template would emit verbatim (invalid JSON for e.g.
        # Windows paths).  The context manager closes the file even if the
        # write fails.
        with open(meta_path, 'w') as f:
            json.dump(output_spec, f)

        forward_index = IndexFactory.from_path(self.meta_path)
        output_index = IndexFactory.from_path(meta_path)

        class TermPruner:
            """Pruner whose test passes only for terms made of characters <= "2"."""

            def test(self, term):
                # NOTE(review): judging by the assertions below, terms for
                # which this returns True are the ones removed by prune()
                # -- confirm against the prune() implementation.
                return all(ord(ch) <= ord("2") for ch in term)

        forward_index.prune(TermPruner(), output_index)
        reader = output_index.reader()

        document = reader.next_document()
        self.assertEqual(document.title, "Document2")
        self.assertEqual(document.doc_id, 0)
        self.assertEqual(document.count, 2)
        self.assertEqual(document.next_term_id(), 0)
        self.assertEqual(document.next_term_id(), 1)
        self.assertEqual(document.next_term_id(), None)

        # Nothing else survives in the output index.
        self.assertEqual(reader.next_document(), None)
示例#5
0
    def test_forward_index_read_terms(self):
        """Check the term strings each fixture document yields via ``next_term``.

        Every document must produce its three terms in order and then
        ``None`` once it is exhausted.
        """
        index = IndexFactory.from_path(self.meta_path)
        reader = index.reader()

        with self.subTest(document=1):
            first = reader.next_document()
            for expected_term in ("0", "1", "2", None):
                self.assertEqual(first.next_term(), expected_term)

        with self.subTest(document=2):
            second = reader.next_document()
            for expected_term in ("3", "4", "2", None):
                self.assertEqual(second.next_term(), expected_term)
示例#6
0
import argparse
import sys

from research.index.common import IndexFactory

# Command-line interface: look up a single document in an index, either by
# its title or by its numeric document ID.
parser = argparse.ArgumentParser(
    description='Find a document in an index by its title or document ID.')
parser.add_argument('index', type=str, help='JSON file of the input index')
# Exactly one lookup key must be given: argparse rejects both "none" and "both".
g = parser.add_mutually_exclusive_group(required=True)
g.add_argument('--title', '-t', type=str, help='title to search for')
g.add_argument('--docid', '-d', type=int, help='document ID to search for')

args = parser.parse_args()

index = IndexFactory.from_path(args.index)
# Progress messages go to stderr, keeping them apart from anything written
# to stdout; the explicit flush shows the message before the reader is built.
sys.stderr.write('Loading index...')
sys.stderr.flush()
reader = index.reader()
sys.stderr.write(' Done.\n')

# The required exclusive group guarantees that when --title is absent,
# --docid was supplied.
if args.title is not None:
    document = reader.find_by_title(args.title)
else:
    document = reader.find_by_id(args.docid)

if document is None:
    print("Couldn't find document {}".format(args.title))
else:
    sys.stdout.write("\n# Document {0}: {1}\n".format(document.doc_id,
示例#7
0
def get_index(test_dir) -> Index:
    """Write a small three-term index fixture into ``test_dir`` and load it.

    The fixture consists of a plain-text term list (``a``, ``b``, ``c``)
    and five binary ``simple.*`` files, each made of a header followed by
    a sequence of integers passed through an ``Encoder``.

    :param test_dir: directory in which the fixture files are created
    :return: the object built by ``IndexFactory.from_json`` for a
        ``research.index.simple.Index`` named ``simple`` in ``test_dir``
    """
    coding = "research.coding.varbyte"

    def dump(file_name, header, values):
        # One binary file: the header first, then each value in order.
        with open(path.join(test_dir, file_name), 'bw') as out:
            write_header(header, out)
            encoder = Encoder(out)
            for value in values:
                encoder.encode(value)

    with open(path.join(test_dir, 'simple.terms'), 'w') as terms_out:
        for line in ("a\n", "b\n", "c\n"):
            terms_out.write(line)

    # Values are grouped per term; within a group each value is an
    # increment, and the trailing comments give the cumulative values.
    dump('simple.docs', {"coding": coding}, [
        1, 4, 4,  # a: 1, 5, 9
        1, 1,     # b: 1, 2
        5, 1, 2,  # c: 5, 6, 8
    ])

    # One entry per term (3, 2, 3 match the group sizes above).
    dump('simple.frequencies', {"count": 3, "coding": coding}, [3, 2, 3])

    # One offset per term (0, 3, 5 are the positions where the groups start).
    dump('simple.docs#offsets', {"count": 3, "coding": coding}, [0, 3, 5])

    dump('simple.counts', {"coding": coding}, [
        1, 1, 3,   # a
        2, 2,      # b
        2, 5, 20,  # c
    ])

    dump('simple.counts#offsets', {"count": 3, "coding": coding}, [0, 3, 5])

    return IndexFactory.from_json({
        'type': 'research.index.simple.Index',
        'name': 'simple',
        'dir': test_dir
    })
示例#8
0
import json

import research.utils as utils
from research.index.common import IndexFactory

# Command-line interface: prune an input index with a named pruner and
# write the result into an output index.  The output index is described
# either by a properties file (--output) or by an inline JSON string (--json).
parser = argparse.ArgumentParser(
    description='Prune an index with the given pruner and store the result '
                'in an output index.')
parser.add_argument('index',
                    type=str,
                    help='Properties file of the input index')
parser.add_argument('--pruner',
                    '-p',
                    required=True,
                    type=str,
                    help='A pruner to be used')

# Exactly one way of describing the output index must be chosen.
group = parser.add_mutually_exclusive_group(required=True)
group.add_argument('--output',
                   '-o',
                   type=str,
                   help='Properties file of the output index')
group.add_argument('--json',
                   '-j',
                   type=str,
                   help='JSON string defining output index')

args = parser.parse_args()

input_index = IndexFactory.from_path(args.index)
# Honour whichever option was supplied; with --json, args.output is None
# and must not be handed to from_path.
if args.output is not None:
    output_index = IndexFactory.from_path(args.output)
else:
    # NOTE(review): assumes IndexFactory.from_json accepts the parsed
    # JSON object (a dict) -- confirm against its definition.
    output_index = IndexFactory.from_json(json.loads(args.json))

input_index.prune(utils.get_object_of(args.pruner), output_index)