Exemplo n.º 1
0
def test_postings_file_write_entry_overwrite():
    """Re-writing an entry at its own pointer replaces what is stored there.

    One entry object is written three times at location 0 -- bare, then with
    ``next_pointer`` set, then with ``skip_pointer`` set.  After each write
    the record read back from that location is compared with the string form
    of a freshly constructed, equivalent entry.
    """
    path = 'test'
    with PostingsFile(path, 'w+') as postings:
        assert_eq(0, postings.pointer)
        location = 0

        record = PostingsFileEntry(1)
        record.own_pointer = location
        postings.write_entry(record)
        expected = PostingsFileEntry(1).to_string()
        assert_eq(expected, postings.read_entry(location))

        # Same record, now carrying a next pointer.
        record.next_pointer = 2
        postings.write_entry(record)
        expected = PostingsFileEntry(1, 2).to_string()
        assert_eq(expected, postings.read_entry(location))

        # Same record, now also carrying a skip pointer.
        record.skip_pointer = 3
        postings.write_entry(record)
        expected = PostingsFileEntry(1, 2, 3).to_string()
        assert_eq(expected, postings.read_entry(location))

    os.remove(path)
Exemplo n.º 2
0
def test_postings_file_seek():
    """``seek`` moves the file's ``pointer`` from 0 to the requested offset."""
    path = 'test'
    target = 10
    with PostingsFile(path, 'w+') as postings:
        # A freshly opened file starts at offset 0.
        assert_eq(0, postings.pointer)
        postings.seek(target)
        assert_eq(target, postings.pointer)
    os.remove(path)
Exemplo n.º 3
0
def test_postings_file_get_entry_reset_false():
    """Walk a chain of linked entries starting from ``get_entry(reset=False)``.

    Entries with doc ids 1..11 are written one after another.  Every entry
    except the last is written a second time with ``next_pointer`` set to the
    file pointer observed after its first write.  The chain is then followed
    from the head with ``entry.next()`` and the collected doc ids must come
    back in order.
    """
    filename = 'test'
    with PostingsFile(filename, 'w+') as pfile:
        head = pfile.pointer

        last = 12
        for i in xrange(1, last):
            current_entry = PostingsFileEntry(i)
            current_entry.own_pointer = pfile.pointer
            pfile.write_entry(current_entry)

            # Link to the current file pointer (presumably where the
            # following entry will be written); the final entry stays
            # unlinked so that the walk below terminates.
            if i != last - 1:
                current_entry.next_pointer = pfile.pointer
                pfile.write_entry(current_entry)

        # Follow the chain until the loop condition sees a falsy entry.
        chain = []
        entry = pfile.get_entry(head, reset=False)
        while entry:
            chain.append(entry)
            entry = entry.next()

        # Distinct names here: under Python 2 a list-comprehension variable
        # leaks into the enclosing scope and would clobber ``entry``.
        doc_ids = [node.doc_id for node in chain]
        assert_eq([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], doc_ids)

    os.remove(filename)
Exemplo n.º 4
0
def build(training_dir, dict_file, postings_file):
    """Index every document in `training_dir`.

    Each file name in `training_dir` is parsed as an integer doc id and the
    files are processed in ascending doc-id order.  For every term returned
    by `process_file`, a postings entry is created the first time the
    `(term, doc_id)` pair is seen (and linked from the term's previous tail
    entry, if any); the entry's `term_freq` is then incremented once per
    occurrence.  Postings are written to the path `postings_file`, and the
    dictionary is written as JSON to `dict_file`.
    """
    dictionary = Dictionary()

    def doc_id_of(path):
        # TODO(michael): Making assumption that document is an int.
        return int(os.path.basename(path))

    # Collect every file in the training dir, ordered by doc_id.
    filepaths = sorted(
        [os.path.join(training_dir, name)
         for name in os.listdir(training_dir)],
        key=doc_id_of)

    # NOTE(michael): for testing.
    # filepaths = filepaths[:10]

    with PostingsFile(
            postings_file, mode='w+',
            entry_cls=PostingsFileEntryWithFrequencies) as postings:
        for filepath in filepaths:
            doc_id = doc_id_of(filepath)
            for term in process_file(filepath):
                if not dictionary.has_entry(term, doc_id):
                    # First occurrence of `term` in this document.  If the
                    # term already has postings, make its current tail entry
                    # point at the entry we are about to add.
                    if dictionary.get_frequency(term) != 0:
                        tail_location = dictionary.get_tail(term)
                        tail_entry = postings.get_entry(tail_location)
                        tail_entry.next_pointer = postings.pointer
                        postings.write_entry(tail_entry)

                    # Register and write the new `(term, doc_id)` entry.
                    dictionary.add_term(term, doc_id, postings.pointer)
                    postings.write_entry(
                        PostingsFileEntryWithFrequencies(doc_id))

                # Count this occurrence.
                # NOTE(michael): We can safely use the tail pointer since we
                # process documents in order and not at random.
                occurrence_entry = postings.get_entry(
                    dictionary.get_tail(term))
                occurrence_entry.term_freq += 1
                postings.write_entry(occurrence_entry)

    # Persist the dictionary.
    with open(dict_file, 'w') as dictionary_file:
        dictionary_file.write(dictionary.to_json())
Exemplo n.º 5
0
def test_postings_file_get_entry():
    """``get_entry`` returns entries whose ``own_pointer`` is the read offset."""
    path = 'test'
    with PostingsFile(path, 'w+') as postings:
        first_location = postings.pointer
        postings.write_entry(PostingsFileEntry(1))

        # The entry fetched from the head must report the head as its own
        # pointer.
        fetched = postings.get_entry(first_location)
        assert_eq(first_location, fetched.own_pointer)

        # Same check for a second entry, written at the then-current pointer.
        second_location = postings.pointer
        postings.write_entry(PostingsFileEntry(2))
        fetched = postings.get_entry(second_location)
        assert_eq(second_location, fetched.own_pointer)

    os.remove(path)
Exemplo n.º 6
0
def test_postings_file_get_entry_from_pointer():
    """Build a linked chain of entries and read it back from the head.

    Uses the positional form of ``write_entry`` (doc id, optional next
    pointer, ``write_location`` keyword).
    """
    path = 'test'
    with PostingsFile(path, 'w+') as postings:
        head = postings.pointer
        tail_location = head
        postings.write_entry(1)

        for step in xrange(10):
            new_location = postings.pointer
            # Re-write the current tail so that it links to `new_location`,
            # then write the new tail entry at that location.
            postings.write_entry(
                step + 1, new_location, write_location=tail_location)
            postings.write_entry(step + 2, write_location=new_location)
            tail_location = new_location

        chain = postings.get_entry_list_from_pointer(head)
        doc_ids = [node.doc_id for node in chain]
        assert_eq([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], doc_ids)

    os.remove(path)
Exemplo n.º 7
0
def test_postings_file_write_entry():
    """Entries written back to back are read at multiples of ``SIZE``."""
    path = 'test'
    doc_ids = (1, 2, 3)
    with PostingsFile(path, 'w+') as postings:
        assert_eq(0, postings.pointer)
        for doc_id in doc_ids:
            postings.write_entry(PostingsFileEntry(doc_id))

        # The k-th entry written is expected at offset k * SIZE.
        for index, doc_id in enumerate(doc_ids):
            assert_eq(
                PostingsFileEntry(doc_id).to_string(),
                postings.read_entry(PostingsFileEntry.SIZE * index))

    os.remove(path)
Exemplo n.º 8
0
def test_postings_file_get_entry_from_pointer():
    """Read a chain of linked entries with ``get_entry_list_from_pointer``.

    Entries with doc ids 1..11 are written one after another.  Every entry
    except the last is written a second time with ``next_pointer`` set to the
    file pointer observed after its first write.  The list obtained from the
    head pointer must then yield the doc ids in order.
    """
    filename = 'test'
    with PostingsFile(filename, 'w+') as pfile:
        head = pfile.pointer

        last = 12
        for i in xrange(1, last):
            current_entry = PostingsFileEntry(i)
            current_entry.own_pointer = pfile.pointer
            pfile.write_entry(current_entry)

            # Link to the current file pointer (presumably where the
            # following entry will be written); the final entry stays
            # unlinked.
            if i != last - 1:
                current_entry.next_pointer = pfile.pointer
                pfile.write_entry(current_entry)

        entries = pfile.get_entry_list_from_pointer(head)
        doc_ids = [entry.doc_id for entry in entries]
        assert_eq([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], doc_ids)

    os.remove(filename)
Exemplo n.º 9
0
def test_postings_file_write_entry_out_of_order():
    """Updating an earlier entry must not disturb entries written after it.

    Two entries are written, the first one is modified and re-written at its
    own pointer, a third entry is added, and the second entry is then checked
    to be unchanged.
    """
    path = 'test'
    with PostingsFile(path, 'w+') as postings:
        # First entry, written at the initial pointer.
        location_a = postings.pointer
        entry_a = PostingsFileEntry(1)
        entry_a.own_pointer = location_a
        postings.write_entry(entry_a)
        assert_eq(
            PostingsFileEntry(1).to_string(),
            postings.read_entry(location_a))

        # Second entry, written at the pointer reached after the first.
        location_b = postings.pointer
        entry_b = PostingsFileEntry(2)
        entry_b.own_pointer = location_b
        postings.write_entry(entry_b)
        assert_eq(
            PostingsFileEntry(2).to_string(),
            postings.read_entry(location_b))

        # Change the first entry and write it again at its own pointer.
        entry_a.doc_id = 4
        postings.write_entry(entry_a)
        assert_eq(entry_a.to_string(), postings.read_entry(location_a))

        # Add a third entry (no own pointer assigned).
        postings.write_entry(PostingsFileEntry(3))

        # The second entry must still be intact.
        assert_eq(
            PostingsFileEntry(2).to_string(),
            postings.read_entry(location_b))

    os.remove(path)
Exemplo n.º 10
0
def test_postings_file_write_entry_overwrite():
    """Writing repeatedly at one ``write_location`` replaces the stored entry.

    Uses the positional form of ``write_entry``: the same location is written
    with one, two and then three positional fields, and after each write the
    record read back is compared with an entry built from the same fields.
    """
    path = 'test'
    with PostingsFile(path, 'w+') as postings:
        assert_eq(0, postings.pointer)
        location = 0

        for fields in ((1,), (1, 2), (1, 2, 3)):
            postings.write_entry(*fields, write_location=location)
            assert_eq(
                PostingsFileEntry(*fields).to_string(),
                postings.read_entry(location))

    os.remove(path)
Exemplo n.º 11
0
def test_postings_file_noop():
    """Open and close a postings file without touching it, then delete it."""
    path = 'test'
    with PostingsFile(path, 'w+'):
        # Nothing is done with the open file on purpose.
        pass
    os.remove(path)