def test_search_intervals():
    idx_dir = os.path.join(TMP_DIR, TEST_INDEX_SUBDIR)
    idx_path = os.path.join(idx_dir, 'index.bin')
    documents, lexicon = get_docs_and_lexicon(idx_dir)
    with captions.CaptionIndex(idx_path, lexicon, documents) as index:
        test_document = documents['test.srt']

        # unigrams
        for i in range(10):
            (d, ) = list(index.search([str(i + 1)], [test_document]))
            (p, ) = d.postings
            assert _is_close(p.start, i * 5.)
            assert _is_close(p.end, (i + 1) * 5.)

        # bigrams
        for i in range(9):
            bigram = [str(i + 1), str(i + 2)]
            (d, ) = list(index.search(bigram, [test_document]))
            (p, ) = d.postings
            assert _is_close(p.start, i * 5.), bigram
            assert _is_close(p.end, (i + 2) * 5.), bigram

        # 3-grams
        for i in range(8):
            trigram = [str(i + 1), str(i + 2), str(i + 3)]
            (d, ) = list(index.search(trigram, [test_document]))
            (p, ) = d.postings
            assert _is_close(p.start, i * 5.), trigram
            assert _is_close(p.end, (i + 3) * 5.), trigram


def test_lemmatize():
    lemmatizer = captions.default_lemmatizer()
    assert 'tree' in lemmatizer.lemma('tree')
    assert 'tree' in lemmatizer.lemma('trees')
    assert 'duck' in lemmatizer.lemma('duck')
    assert 'duck' in lemmatizer.lemma('ducks')

    # Force lemmatization in the lexicon
    idx_dir = os.path.join(TMP_DIR, TEST_INDEX_SUBDIR)
    _, lexicon = get_docs_and_lexicon(idx_dir)
    assert lexicon['DUCK'].id in lexicon.similar('DUCKS')


def test_token_data():
    idx_dir = os.path.join(TMP_DIR, TEST_INDEX_SUBDIR)
    documents, lexicon = get_docs_and_lexicon(idx_dir)
    for i in range(len(documents)):
        dh = documents.open(i)
        doc_len = dh.length
        tokens = dh.tokens()
        assert len(tokens) == doc_len, \
            '{} has an inconsistent number of tokens'.format(documents[i].name)
        for t in tokens:
            lexicon.decode(t)


def test_search_position():
    idx_dir = os.path.join(TMP_DIR, TEST_INDEX_SUBDIR)
    idx_path = os.path.join(idx_dir, 'index.bin')
    documents, lexicon = get_docs_and_lexicon(idx_dir)
    with captions.CaptionIndex(idx_path, lexicon, documents) as index:
        test_document = documents['test.srt']
        dh = documents.open(test_document)

        # In range
        for i in range(10):
            assert dh.position(5 * i + 2.5) == i

        # Out of range
        assert dh.position(51) == 10
        assert dh.position(100) == 10


def test_intervals_data():
    idx_dir = os.path.join(TMP_DIR, TEST_INDEX_SUBDIR)
    documents, _ = get_docs_and_lexicon(idx_dir)
    for i in range(len(documents)):
        dh = documents.open(i)
        assert len(dh.lines(0, 0)) == 0

        duration = dh.duration
        lines = dh.lines()
        assert len(lines) > 0, \
            '{} has no intervals'.format(documents[i].name)

        length_from_intervals = 0
        for line in lines:
            length_from_intervals += line.len
        assert math.fabs(lines[-1].end - duration) < 1e-6
        assert length_from_intervals == dh.length, \
            '{} has an inconsistent number of tokens'.format(documents[i].name)


def test_inverted_index():
    idx_dir = os.path.join(TMP_DIR, TEST_INDEX_SUBDIR)
    idx_path = os.path.join(idx_dir, 'index.bin')
    documents, lexicon = get_docs_and_lexicon(idx_dir)

    def test_search_and_contains(tokens, doc_ids=None):
        ids = index.contains(tokens, doc_ids)
        search_ids = set()
        for d in index.search(tokens, doc_ids):
            assert len(d.postings) > 0
            for l in d.postings:
                assert l.len == len(tokens)
                assert abs(l.end - l.start) <= 10.0, 'ngram time too large'
            search_ids.add(d.id)
        assert ids == search_ids

    all_doc_ids = [d.id for d in documents]
    with captions.CaptionIndex(idx_path, lexicon, documents) as index:
        # Unigram search
        test_search_and_contains(['THE'])
        test_search_and_contains(['UNITED'])
        test_search_and_contains(['STATES'])
        test_search_and_contains(['AND'])
        test_search_and_contains(['THE'], all_doc_ids)
        test_search_and_contains(['UNITED'], all_doc_ids)
        test_search_and_contains(['STATES'], all_doc_ids)
        test_search_and_contains(['AND'], all_doc_ids)

        # Bigram search
        test_search_and_contains(['UNITED', 'STATES'])
        test_search_and_contains(['UNITED', 'KINGDOM'])
        test_search_and_contains(['UNITED', 'STATES'], all_doc_ids)
        test_search_and_contains(['UNITED', 'KINGDOM'], all_doc_ids)

        # N-gram search
        test_search_and_contains(['UNITED', 'STATES', 'OF', 'AMERICA'])
        test_search_and_contains(['UNITED', 'STATES', 'OF', 'AMERICA'],
                                 all_doc_ids)
        test_search_and_contains(['THE', 'GREAT', 'WAR'])
        test_search_and_contains(['THE', 'GREAT', 'WAR'], all_doc_ids)


def test_search():
    idx_dir = os.path.join(TMP_DIR, TEST_INDEX_SUBDIR)
    idx_path = os.path.join(idx_dir, 'index.bin')
    documents, lexicon = get_docs_and_lexicon(idx_dir)

    def count_and_test(index, document, tokens):
        ids = index.contains(tokens, [document])
        assert len(ids) == 1

        count = 0
        (d, ) = list(index.search(tokens, [document]))
        assert len(d.postings) > 0
        dh = documents.open(document)
        for l in d.postings:
            assert l.len == len(tokens)
            assert abs(l.end - l.start) < 10.0, 'ngram time too large'
            count += 1

            # Check that we actually found the right ngrams
            assert [lexicon.decode(t)
                    for t in dh.tokens(l.idx, l.len)] == tokens
        return count

    test_document = documents['cnn.srt']
    with captions.CaptionIndex(idx_path, lexicon, documents) as index:
        assert count_and_test(index, test_document, ['THEY']) == 12
        assert count_and_test(index, test_document, ['PEOPLE']) == 12
        assert count_and_test(index, test_document, ['TO', 'THE']) == 9  # one wraps
        assert count_and_test(index, test_document,
                              ['GIBSON', 'GUITAR', 'DROP']) == 1
        assert count_and_test(index, test_document,
                              ['PUT', 'THAT', 'DOWN']) == 1
        assert count_and_test(index, test_document, ['CLOCK', 'STRIKES']) == 2
        assert count_and_test(index, test_document, ['>>']) == 149
        assert count_and_test(index, test_document, ['SEE', '?']) == 1


def test_update_index():
    tmp_dir = tempfile.mkdtemp(suffix=None, prefix='caption-index-unittest-',
                               dir=None)
    subs_dir = os.path.join(tmp_dir, 'subs')
    idx_dir = os.path.join(tmp_dir, 'index')

    # Unpack the test data
    os.makedirs(subs_dir)
    check_call(['tar', '-xzf', TEST_DATA_PATH, '-C', subs_dir])

    # Build an index
    check_call([BUILD_INDEX_SCRIPT, '-d', subs_dir, '-o', idx_dir])

    # Update the index (should fail due to duplicate files)
    try:
        check_call([UPDATE_INDEX_SCRIPT, '-d', subs_dir, idx_dir])
        raise Exception('Uh oh, an exception should have been thrown...')
    except CalledProcessError:
        pass

    # Update the index (should do nothing since all of them are duplicates)
    check_call([UPDATE_INDEX_SCRIPT, '--skip-existing-names', '-d', subs_dir,
                idx_dir])

    # Rename the files and update the index again, so they are added as new
    # documents
    for fname in os.listdir(subs_dir):
        src_path = os.path.join(subs_dir, fname)
        dst_path = os.path.join(subs_dir, 'copy::' + fname)
        shutil.move(src_path, dst_path)
    check_call([UPDATE_INDEX_SCRIPT, '-d', subs_dir, idx_dir])
    assert os.path.isfile(os.path.join(idx_dir, 'documents.txt.old'))

    # Test the new index
    def count_and_test(index, document, tokens):
        ids = index.contains(tokens, [document])
        assert len(ids) == 1

        count = 0
        (d,) = list(index.search(tokens, [document]))
        dh = documents.open(document)
        assert len(d.postings) > 0
        for l in d.postings:
            assert l.len == len(tokens)
            assert abs(l.end - l.start) < 10.0, 'ngram time too large'
            count += 1

            # Check that we actually found the right ngrams
            assert [lexicon.decode(t)
                    for t in dh.tokens(l.idx, l.len)] == tokens
        return count

    documents, lexicon = get_docs_and_lexicon(idx_dir)
    idx_path = os.path.join(idx_dir, 'index.bin')
    assert os.path.isdir(idx_path)
    assert len(os.listdir(idx_path)) == 2, os.listdir(idx_path)

    test_document = documents['copy::cnn.srt']
    with captions.CaptionIndex(idx_path, lexicon, documents) as index:
        assert count_and_test(index, test_document, ['THEY']) == 12
        assert count_and_test(index, test_document, ['PEOPLE']) == 12
        assert count_and_test(index, test_document, ['TO', 'THE']) == 9  # one wraps
        assert count_and_test(index, test_document,
                              ['GIBSON', 'GUITAR', 'DROP']) == 1
        assert count_and_test(index, test_document,
                              ['PUT', 'THAT', 'DOWN']) == 1
        assert count_and_test(index, test_document, ['CLOCK', 'STRIKES']) == 2
        assert count_and_test(index, test_document, ['>>']) == 149
        assert count_and_test(index, test_document, ['SEE', '?']) == 1


def test_decode():
    idx_dir = os.path.join(TMP_DIR, TEST_INDEX_SUBDIR)
    documents, lexicon = get_docs_and_lexicon(idx_dir)
    doc_handle = documents.open(0)
    print(decode.get_vtt(lexicon, doc_handle))
    print(decode.get_srt(lexicon, doc_handle))


def test_frequent_words():
    idx_dir = os.path.join(TMP_DIR, TEST_INDEX_SUBDIR)
    _, lexicon = get_docs_and_lexicon(idx_dir)
    assert len(util.frequent_words(lexicon, 100)) == 1
    assert len(util.frequent_words(lexicon, 0)) == len(lexicon)
    assert len(util.frequent_words(lexicon, 99)) > 0