예제 #1
0
def create_document(doc_id, text, weight, length):
    """Build a ``Document`` carrying the given id, text, weight and length.

    The id is ``str(doc_id)`` repeated 16 times and cut to its first 16
    characters, assigned on the wrapped ``_document`` object. ``text`` is
    stored UTF-8 encoded in ``buffer``.

    :param doc_id: value whose string form is repeated to form the id
    :param text: string to encode into the document buffer
    :param weight: value assigned to ``weight``
    :param length: value assigned to ``length``
    :return: the populated ``Document``
    """
    document = Document()
    repeated_id = str(doc_id) * 16
    # Only the first 16 characters of the repeated id are kept.
    document._document.id = repeated_id[:16]
    payload = text.encode('utf8')
    document.buffer = payload
    document.weight = weight
    document.length = length
    return document
예제 #2
0
def test_segment_driver():
    """Run ``SimpleSegmentDriver`` on one document and check its first three chunks.

    After ``_apply_all`` the parent document keeps ``length == 2`` and each of
    the first three chunks is checked for its ``tags['id']``, ``parent_id``,
    ``blob``, ``weight``, ``length`` and ``mime_type``.
    """
    document = Document()
    document.text = 'valid'
    document.length = 2
    document.mime_type = 'image/png'

    segment_driver = SimpleSegmentDriver()
    segmenter = MockSegmenter()
    segment_driver.attach(executor=segmenter, runtime=None)
    segment_driver._apply_all(DocumentSet([document]))

    # The parent's own length must be untouched by segmentation.
    assert document.length == 2

    # Per chunk, in order: (tags['id'], value used for both blob entries and
    # weight, mime type).
    expected_chunks = (
        (3, 0.0, 'text/plain'),
        (4, 1.0, 'image/png'),
        (5, 2.0, 'image/png'),
    )
    for position, (tag_id, fill, mime_type) in enumerate(expected_chunks):
        assert document.chunks[position].tags['id'] == tag_id
        assert document.chunks[position].parent_id == document.id
        np.testing.assert_equal(
            document.chunks[position].blob, np.array([fill, fill, fill])
        )
        assert document.chunks[position].weight == fill
        assert document.chunks[position].length == 3
        assert document.chunks[position].mime_type == mime_type
예제 #3
0
def create_chunk_matches_to_score():
    """Build a document with two chunks, each holding two scored matches.

    Values assigned by this function:

        doc: (id: '1', granularity: 0)
        |- chunk: (id: '10')
        |  |- match: (id: '12', parent_id: '1', score.value: 2)
        |  |- match: (id: '13', parent_id: '1', score.value: 3)
        |- chunk: (id: '20')
           |- match: (id: '24', parent_id: '2', score.value: 4)
           |- match: (id: '25', parent_id: '2', score.value: 5)

    Each chunk's granularity is ``doc.granularity + 1`` and each match copies
    its chunk's granularity. Every match has ``length`` 4 and a score whose
    ``ref_id`` is the id of the chunk it is appended to.

    :return: the root ``Document``
    """
    root = Document()
    root.id = '1'
    root.granularity = 0
    matches_per_chunk = 2
    for parent_index in (1, 2):
        chunk = Document()
        chunk.id = str(parent_index * 10)
        chunk.granularity = root.granularity + 1
        # Scores continue across chunks: 2, 3 for the first and 4, 5 for the second.
        first_score = parent_index * 2
        for offset in range(matches_per_chunk):
            score_value = first_score + offset
            match = Document()
            match.granularity = chunk.granularity
            match.parent_id = str(parent_index)
            match.score = NamedScore(value=score_value, ref_id=chunk.id)
            match.id = str(10 * parent_index + score_value)
            match.length = 4
            chunk.matches.append(match)
        root.chunks.append(chunk)
    return root
예제 #4
0
def create_document_to_score():
    """Build a document with two chunks, each holding two scored and tagged matches.

    Values assigned by this function:

        doc: (id: '1')
        |- chunk: (id: '2')
        |  |- match: (id: '4', parent_id: '40', score.value: 4)
        |  |- match: (id: '5', parent_id: '50', score.value: 5)
        |- chunk: (id: '3')
           |- match: (id: '6', parent_id: '60', score.value: 6)
           |- match: (id: '7', parent_id: '70', score.value: 7)

    Each match's ``length`` equals its numeric id, its score's ``ref_id`` is
    the chunk id, ``tags['price']`` is read back from ``score.value`` and
    ``tags['discount']`` is ``DISCOUNT_VAL``.

    :return: the root ``Document``
    """
    root = Document()
    root.id = '1'
    for chunk_number in range(2, 4):
        chunk = Document()
        chunk.id = str(chunk_number)
        first_match = 2 * chunk_number
        for match_number in range(first_match, first_match + 2):
            match = Document()
            match.id = str(match_number)
            match.parent_id = str(10 * match_number)
            match.length = match_number
            # The score value is what MaxRanker and MinRanker consume.
            match.score = NamedScore(value=match_number, ref_id=chunk.id)
            match.tags['price'] = match.score.value
            match.tags['discount'] = DISCOUNT_VAL
            chunk.matches.append(match)
        root.chunks.append(chunk)
    return root
예제 #5
0
def create_document(doc_id, text, weight, length):
    """Build a ``Document`` carrying the given id, text, weight and length.

    ``doc_id`` is assigned to ``id`` unchanged and ``text`` is stored UTF-8
    encoded in ``buffer``.

    :param doc_id: value assigned to ``id``
    :param text: string to encode into the document buffer
    :param weight: value assigned to ``weight``
    :param length: value assigned to ``length``
    :return: the populated ``Document``
    """
    document = Document()
    document.id = doc_id
    payload = text.encode('utf8')
    document.buffer = payload
    document.weight = weight
    document.length = length
    return document
예제 #6
0
def test_broken_document():
    """Expect ``AttributeError`` when the segment driver is applied to the document below.

    The document gets an integer id, text and a length, and is handed to
    ``_apply_all`` inside a plain list.
    """
    segment_driver = SimpleSegmentDriver()
    segmenter = MockSegmenter()
    segment_driver.attach(executor=segmenter, pea=None)

    broken = Document()
    broken.id = 1
    broken.text = 'invalid'
    broken.length = 2

    # Sanity check that the assigned length is readable before the driver runs.
    assert broken.length == 2

    # NOTE(review): which attribute access fails is not visible from this
    # test alone; confirm against the driver implementation.
    with pytest.raises(AttributeError):
        segment_driver._apply_all([broken])
예제 #7
0
def create_document_to_score():
    """Build a document with four matches that differ in score and length.

    Values assigned by this function (each id is one digit repeated 16 times):

        doc: (id: '1' * 16, length: 5)
        |- match: (id: '2' * 16, length: 3, score.value: 3)
        |- match: (id: '3' * 16, length: 6, score.value: 6)
        |- match: (id: '4' * 16, length: 1, score.value: 1)
        |- match: (id: '5' * 16, length: 8, score.value: 8)

    :return: the root ``Document``
    """
    root = Document()
    root.id = '1' * 16
    root.length = 5
    # Maps the digit used to build the match id to the match's score;
    # the same number is also used as the match length.
    score_by_digit = {2: 3, 3: 6, 4: 1, 5: 8}
    for digit, score in score_by_digit.items():
        with Document() as match:
            match.id = str(digit) * 16
            match.length = score
            match.score.value = score
            root.matches.append(match)
    return root
def create_document_to_score():
    """Build a document with four matches whose ids have different lengths.

    Values assigned by this function:

        doc: (id: '1' * 20, length: 5)
        |- match: (id: '2' * 16, length: 3, score.value: 3)
        |- match: (id: '3' * 24, length: 6, score.value: 6)
        |- match: (id: '4' * 8,  length: 1, score.value: 1)
        |- match: (id: '5' * 16, length: 8, score.value: 8)

    Every score's ``ref_id`` is the root document's id.

    :return: the root ``Document``
    """
    root = Document()
    root.id = '1' * 20
    root.length = 5
    # Each row: (digit repeated to form the id, score and length, id character count)
    match_specs = (
        (2, 3, 16),
        (3, 6, 24),
        (4, 1, 8),
        (5, 8, 16),
    )
    for digit, score, id_width in match_specs:
        with Document() as match:
            match.id = str(digit) * id_width
            match.length = score
            match.score = NamedScore(value=score, ref_id=root.id)
            root.matches.append(match)
    return root
def create_document_to_score_same_depth_level():
    """Build a document whose four matches are attached directly to it.

    Values assigned by this function:

        doc: (id: 1)
        |  match: (id: 2, parent_id: 20, score.value: 30, length: 3)
        |  match: (id: 3, parent_id: 20, score.value: 40, length: 4)
        |  match: (id: 4, parent_id: 30, score.value: 20, length: 2)
        |  match: (id: 5, parent_id: 30, score.value: 10, length: 1)

    Every score's ``ref_id`` is the root document's id.

    :return: the root ``Document``
    """
    root = Document()
    root.id = 1

    # Each row: (match id, parent id, score value, length)
    match_rows = (
        (2, 20, 30, 3),
        (3, 20, 40, 4),
        (4, 30, 20, 2),
        (5, 30, 10, 1),
    )
    for row in match_rows:
        identifier, parent, score, size = row
        match = Document()
        match.id = identifier
        match.parent_id = parent
        match.length = size
        match.score = NamedScore(value=score, ref_id=root.id)
        root.matches.append(match)
    return root