Example #1
def test_search_intervals():
    idx_dir = os.path.join(TMP_DIR, TEST_INDEX_SUBDIR)
    idx_path = os.path.join(idx_dir, 'index.bin')
    documents, lexicon = get_docs_and_lexicon(idx_dir)

    with captions.CaptionIndex(idx_path, lexicon, documents) as index:
        test_document = documents['test.srt']

        # unigrams
        for i in range(10):
            (d, ) = list(index.search([str(i + 1)], [test_document]))
            (p, ) = d.postings
            assert _is_close(p.start, i * 5.)
            assert _is_close(p.end, (i + 1) * 5.)

        # bigrams
        for i in range(9):
            bigram = [str(i + 1), str(i + 2)]
            (d, ) = list(index.search(bigram, [test_document]))
            (p, ) = d.postings
            assert _is_close(p.start, i * 5.), bigram
            assert _is_close(p.end, (i + 2) * 5.), bigram

        # 3-grams
        for i in range(8):
            trigram = [str(i + 1), str(i + 2), str(i + 3)]
            (d, ) = list(index.search(trigram, [test_document]))
            (p, ) = d.postings
            assert _is_close(p.start, i * 5.), trigram
            assert _is_close(p.end, (i + 3) * 5.), trigram
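These snippets share a test-module preamble that the source page omits. Below is a minimal sketch of what they appear to assume: the stdlib imports follow directly from the calls in the tests, while the constant values and the import layout for decode/util are guesses taken from the names used in the examples, and get_docs_and_lexicon is left as a stub because its body is not shown.

import math
import os
import shutil
import tempfile
from subprocess import CalledProcessError, check_call

import captions                     # caption-index package under test
from captions import decode, util   # assumption: decode/util ship as submodules

# Module-level names referenced by the tests; the values here are placeholders.
TMP_DIR = tempfile.gettempdir()
TEST_INDEX_SUBDIR = 'caption-index-test'
TEST_DATA_PATH = 'testdata/subs.tar.gz'
BUILD_INDEX_SCRIPT = 'scripts/build_index.py'
UPDATE_INDEX_SCRIPT = 'scripts/update_index.py'


def _is_close(a, b, tol=1e-6):
    # Tolerance check used when comparing posting start/end times.
    return math.fabs(a - b) < tol


def get_docs_and_lexicon(idx_dir):
    # Helper used throughout: loads the document list and lexicon that the
    # index builder wrote into idx_dir. Its body is not shown on the source
    # page, so it is left as a stub here.
    ...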
Example #2
def test_lemmatize():
    lemmatizer = captions.default_lemmatizer()
    assert 'tree' in lemmatizer.lemma('tree')
    assert 'tree' in lemmatizer.lemma('trees')
    assert 'duck' in lemmatizer.lemma('duck')
    assert 'duck' in lemmatizer.lemma('ducks')

    # Force lemmatization in the lexicon
    idx_dir = os.path.join(TMP_DIR, TEST_INDEX_SUBDIR)
    _, lexicon = get_docs_and_lexicon(idx_dir)
    assert lexicon['DUCK'].id in lexicon.similar('DUCKS')
Example #3
def test_token_data():
    idx_dir = os.path.join(TMP_DIR, TEST_INDEX_SUBDIR)
    documents, lexicon = get_docs_and_lexicon(idx_dir)
    for i in range(len(documents)):
        dh = documents.open(i)
        doc_len = dh.length
        tokens = dh.tokens()
        assert len(tokens) == doc_len, \
            '{} has an inconsistent number of tokens'.format(documents[i].name)
        for t in tokens:
            lexicon.decode(t)
Example #4
def test_search_position():
    idx_dir = os.path.join(TMP_DIR, TEST_INDEX_SUBDIR)
    idx_path = os.path.join(idx_dir, 'index.bin')
    documents, lexicon = get_docs_and_lexicon(idx_dir)

    with captions.CaptionIndex(idx_path, lexicon, documents) as index:
        test_document = documents['test.srt']
        dh = documents.open(test_document)

        # In range
        for i in range(10):
            assert dh.position(5 * i + 2.5) == i

        # Out of range
        assert dh.position(51) == 10
        assert dh.position(100) == 10
Example #5
def test_intervals_data():
    idx_dir = os.path.join(TMP_DIR, TEST_INDEX_SUBDIR)
    documents, _ = get_docs_and_lexicon(idx_dir)
    for i in range(len(documents)):
        dh = documents.open(i)

        assert len(dh.lines(0, 0)) == 0
        duration = dh.duration
        lines = dh.lines()
        assert len(lines) > 0, \
            '{} has no intervals'.format(documents[i].name)
        length_from_intervals = 0
        for line in lines:
            length_from_intervals += line.len
        assert math.fabs(lines[-1].end - duration) < 1e-6
        assert length_from_intervals == dh.length, \
            '{} has an inconsistent number of tokens'.format(documents[i].name)
Example #6
def test_inverted_index():
    idx_dir = os.path.join(TMP_DIR, TEST_INDEX_SUBDIR)
    idx_path = os.path.join(idx_dir, 'index.bin')
    documents, lexicon = get_docs_and_lexicon(idx_dir)

    def test_search_and_contains(tokens, doc_ids=None):
        ids = index.contains(tokens, doc_ids)
        search_ids = set()
        for d in index.search(tokens, doc_ids):
            assert len(d.postings) > 0
            for l in d.postings:
                assert l.len == len(tokens)
                assert abs(l.end - l.start) <= 10.0, 'ngram time too large'
            search_ids.add(d.id)
        assert ids == search_ids

    all_doc_ids = [d.id for d in documents]
    with captions.CaptionIndex(idx_path, lexicon, documents) as index:
        # Unigram search
        test_search_and_contains(['THE'])
        test_search_and_contains(['UNITED'])
        test_search_and_contains(['STATES'])
        test_search_and_contains(['AND'])
        test_search_and_contains(['THE'], all_doc_ids)
        test_search_and_contains(['UNITED'], all_doc_ids)
        test_search_and_contains(['STATES'], all_doc_ids)
        test_search_and_contains(['AND'], all_doc_ids)

        # Bigram search
        test_search_and_contains(['UNITED', 'STATES'])
        test_search_and_contains(['UNITED', 'KINGDOM'])
        test_search_and_contains(['UNITED', 'STATES'], all_doc_ids)
        test_search_and_contains(['UNITED', 'KINGDOM'], all_doc_ids)

        # N-gram search
        test_search_and_contains(['UNITED', 'STATES', 'OF', 'AMERICA'])
        test_search_and_contains(['UNITED', 'STATES', 'OF', 'AMERICA'],
                                 all_doc_ids)

        test_search_and_contains(['THE', 'GREAT', 'WAR'])
        test_search_and_contains(['THE', 'GREAT', 'WAR'], all_doc_ids)
Example #7
def test_search():
    idx_dir = os.path.join(TMP_DIR, TEST_INDEX_SUBDIR)
    idx_path = os.path.join(idx_dir, 'index.bin')
    documents, lexicon = get_docs_and_lexicon(idx_dir)

    def count_and_test(index, document, tokens):
        ids = index.contains(tokens, [document])
        assert len(ids) == 1

        count = 0
        (d, ) = list(index.search(tokens, [document]))
        assert len(d.postings) > 0
        dh = documents.open(document)
        for l in d.postings:
            assert l.len == len(tokens)
            assert abs(l.end - l.start) < 10.0, 'ngram time too large'
            count += 1

            # Check that we actually found the right ngrams
            assert [lexicon.decode(t)
                    for t in dh.tokens(l.idx, l.len)] == tokens

        return count

    test_document = documents['cnn.srt']

    with captions.CaptionIndex(idx_path, lexicon, documents) as index:
        assert count_and_test(index, test_document, ['THEY']) == 12
        assert count_and_test(index, test_document, ['PEOPLE']) == 12
        assert count_and_test(index, test_document,
                              ['TO', 'THE']) == 9  # one wraps
        assert count_and_test(index, test_document,
                              ['GIBSON', 'GUITAR', 'DROP']) == 1
        assert count_and_test(index, test_document,
                              ['PUT', 'THAT', 'DOWN']) == 1
        assert count_and_test(index, test_document, ['CLOCK', 'STRIKES']) == 2
        assert count_and_test(index, test_document, ['>>']) == 149
        assert count_and_test(index, test_document, ['SEE', '?']) == 1
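For reference, the query pattern exercised by the two search tests above, distilled into a standalone sketch; the result attributes (id, postings, start, end, idx, len) come from the assertions themselves rather than separate documentation.

idx_dir = os.path.join(TMP_DIR, TEST_INDEX_SUBDIR)
idx_path = os.path.join(idx_dir, 'index.bin')
documents, lexicon = get_docs_and_lexicon(idx_dir)

with captions.CaptionIndex(idx_path, lexicon, documents) as index:
    doc = documents['cnn.srt']
    # contains() returns the set of matching document ids; search() yields
    # one result per document, each carrying its posting intervals.
    matched_ids = index.contains(['UNITED', 'STATES'], [doc])
    for result in index.search(['UNITED', 'STATES'], [doc]):
        assert result.id in matched_ids
        for posting in result.postings:
            print(result.id, posting.start, posting.end, posting.idx, posting.len)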
Example #8
def test_update_index():
    tmp_dir = tempfile.mkdtemp(suffix=None, prefix='caption-index-unittest-',
                               dir=None)
    subs_dir = os.path.join(tmp_dir, 'subs')
    idx_dir = os.path.join(tmp_dir, 'index')

    # Unpack the test data
    os.makedirs(subs_dir)
    check_call(['tar', '-xzf', TEST_DATA_PATH, '-C', subs_dir])

    # Build an index
    check_call([BUILD_INDEX_SCRIPT, '-d', subs_dir, '-o', idx_dir])

    # Update the index (should fail due to duplicate files)
    try:
        check_call([UPDATE_INDEX_SCRIPT, '-d', subs_dir, idx_dir])
        raise Exception('Uh oh, an exception should have been thrown...')
    except CalledProcessError:
        pass

    # Update the index (should do nothing since all of them are duplicates)
    check_call([UPDATE_INDEX_SCRIPT, '--skip-existing-names', '-d', subs_dir,
                idx_dir])

    # Update the index
    for fname in os.listdir(subs_dir):
        src_path = os.path.join(subs_dir, fname)
        dst_path = os.path.join(subs_dir, 'copy::' + fname)
        shutil.move(src_path, dst_path)
    check_call([UPDATE_INDEX_SCRIPT, '-d', subs_dir, idx_dir])
    assert os.path.isfile(os.path.join(idx_dir, 'documents.txt.old'))

    # Test the new index
    def count_and_test(index, document, tokens):
        ids = index.contains(tokens, [document])
        assert len(ids) == 1

        count = 0
        (d,) = list(index.search(tokens, [document]))
        dh = documents.open(document)
        assert len(d.postings) > 0
        for l in d.postings:
            assert l.len == len(tokens)
            assert abs(l.end - l.start) < 10.0, 'ngram time too large'
            count += 1

            # Check that we actually found the right ngrams
            assert [lexicon.decode(t) for t in dh.tokens(l.idx, l.len)] == tokens

        return count

    documents, lexicon = get_docs_and_lexicon(idx_dir)
    idx_path = os.path.join(idx_dir, 'index.bin')
    assert os.path.isdir(idx_path)
    assert len(os.listdir(idx_path)) == 2, os.listdir(idx_path)

    test_document = documents['copy::cnn.srt']
    with captions.CaptionIndex(idx_path, lexicon, documents) as index:
        assert count_and_test(index, test_document, ['THEY']) == 12
        assert count_and_test(index, test_document, ['PEOPLE']) == 12
        assert count_and_test(index, test_document, ['TO', 'THE']) == 9    # one wraps
        assert count_and_test(index, test_document, ['GIBSON', 'GUITAR', 'DROP']) == 1
        assert count_and_test(index, test_document, ['PUT', 'THAT', 'DOWN']) == 1
        assert count_and_test(index, test_document, ['CLOCK', 'STRIKES']) == 2
        assert count_and_test(index, test_document, ['>>']) == 149
        assert count_and_test(index, test_document, ['SEE', '?']) == 1
Example #9
def test_decode():
    idx_dir = os.path.join(TMP_DIR, TEST_INDEX_SUBDIR)
    documents, lexicon = get_docs_and_lexicon(idx_dir)
    doc_handle = documents.open(0)
    print(decode.get_vtt(lexicon, doc_handle))
    print(decode.get_srt(lexicon, doc_handle))
Example #10
def test_frequent_words():
    idx_dir = os.path.join(TMP_DIR, TEST_INDEX_SUBDIR)
    _, lexicon = get_docs_and_lexicon(idx_dir)
    assert len(util.frequent_words(lexicon, 100)) == 1
    assert len(util.frequent_words(lexicon, 0)) == len(lexicon)
    assert len(util.frequent_words(lexicon, 99)) > 0