def test_mini_greek_search_corpus_freqs(minipop, mini_greek_metadata):
    """Check a corpus-frequency lemmata search on the mini Greek texts.

    Matches the first two texts found by title at 'phrase' granularity,
    stores the matches, and compares the score-ordered results with the
    reference results loaded from 'mini_greek_corpus_results.tab'.
    """
    titles = [meta['title'] for meta in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    # see tesserae.utils.search for how to actually set up Search
    search_result = Search(results_id=results_id, status=Search.INIT, msg='')
    minipop.insert(search_result)

    search_options = {
        'stopwords': 10,
        'stopword_basis': 'corpus',
        'score_basis': 'stem',
        'frequency_basis': 'corpus',
        'max_distance': 10,
        'distance_metric': 'span',
        'min_score': 0,
    }
    searcher = SparseMatrixSearch(minipop)
    v5_matches = searcher.match(
        search_result.id,
        TextOptions(texts[0], 'phrase'),
        TextOptions(texts[1], 'phrase'),
        'lemmata',
        **search_options)
    minipop.insert_nocheck(v5_matches)

    # Flag the search as done, then read its results back.
    search_result.status = Search.DONE
    minipop.update(search_result)

    # Highest score first.
    v5_results = list(get_results(minipop, results_id))
    v5_results.sort(key=lambda match: -match['score'])
    v3_results = _load_v3_results(texts[0].path,
                                  'mini_greek_corpus_results.tab')
    _check_search_results(v5_results, v3_results)
def test_mini_latin_search_text_freqs(minipop, mini_latin_metadata):
    """Check a text-frequency lemmata search on the mini Latin texts.

    Matches the first two texts found by title at 'line' granularity with
    an explicit stopword list, then compares the score-ordered results
    with the reference results loaded from 'mini_latin_results.tab'.
    """
    titles = [meta['title'] for meta in mini_latin_metadata]
    texts = minipop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)

    search_options = {
        'stopwords': ['et', 'neque', 'qui'],
        'stopword_basis': 'texts',
        'score_basis': 'stem',
        'frequency_basis': 'texts',
        'max_distance': 10,
        'distance_metric': 'frequency',
        'min_score': 0,
    }
    searcher = SparseMatrixSearch(minipop)
    v5_matches = searcher.match(
        search_result.id,
        TextOptions(texts[0], 'line'),
        TextOptions(texts[1], 'line'),
        'lemmata',
        **search_options)
    minipop.insert_nocheck(v5_matches)

    # Flag the search as done, then read its results back.
    search_result.status = Search.DONE
    minipop.update(search_result)

    # Highest score first.
    v5_results = list(get_results(minipop, results_id))
    v5_results.sort(key=lambda match: -match['score'])
    v3_results = _load_v3_results(texts[0].path, 'mini_latin_results.tab')
    _check_search_results(v5_results, v3_results)
def test_mini_greek_search_text_freqs(minipop, mini_greek_metadata):
    """Check a text-frequency lemmata search on the mini Greek texts.

    Matches the first two texts found by title at 'phrase' granularity
    with an explicit stopword list, then compares the score-ordered
    results with the reference results loaded from
    'mini_greek_results.tab'.
    """
    titles = [meta['title'] for meta in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)

    search_options = {
        'stopwords': [
            'ὁ', 'ὅς', 'καί', 'αβγ', 'ἐγώ', 'δέ', 'οὗτος', 'ἐμός',
        ],
        'stopword_basis': 'texts',
        'score_basis': 'stem',
        'frequency_basis': 'texts',
        'max_distance': 10,
        'distance_metric': 'span',
        'min_score': 0,
    }
    searcher = SparseMatrixSearch(minipop)
    v5_matches = searcher.match(
        search_result.id,
        TextOptions(texts[0], 'phrase'),
        TextOptions(texts[1], 'phrase'),
        'lemmata',
        **search_options)
    minipop.insert_nocheck(v5_matches)

    # Flag the search as done, then read its results back.
    search_result.status = Search.DONE
    minipop.update(search_result)

    # Highest score first.
    v5_results = list(get_results(minipop, results_id))
    v5_results.sort(key=lambda match: -match['score'])
    v3_results = _load_v3_results(texts[0].path, 'mini_greek_results.tab')
    # Emit both result counts ahead of the comparison.
    print(len(v5_results), len(v3_results))
    _check_search_results(v5_results, v3_results)
# Example no. 4
# 0
def test_greek_semantic(minipop, mini_greek_metadata):
    """Check a 'semantic' feature search on the mini Greek texts.

    Matches the first two texts found by title at 'phrase' granularity,
    then compares the score-ordered results with the reference results
    loaded from 'mini_greek_results_syn.tab'.
    """
    titles = [meta['title'] for meta in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)

    search_options = {
        'stopwords': [
            'τις', 'οὗτος', 'καί', 'αβγ', 'ἐγώ',
            'τηνόθι', 'τηνικαῦτα', 'τέκνον',
        ],
        'stopword_basis': 'texts',
        'score_basis': 'stem',
        'freq_basis': 'texts',
        'max_distance': 999,
        'distance_basis': 'span',
        'min_score': 0,
    }
    searcher = SparseMatrixSearch(minipop)
    v5_matches = searcher.match(
        search_result,
        TextOptions(texts[0], 'phrase'),
        TextOptions(texts[1], 'phrase'),
        'semantic',
        **search_options)
    minipop.insert_nocheck(v5_matches)

    # Flag the search as done, then read its results back.
    search_result.status = Search.DONE
    minipop.update(search_result)

    # Highest score first.
    v5_results = list(get_results(minipop, search_result.id, PageOptions()))
    v5_results.sort(key=lambda match: -match['score'])
    v3_results = _load_v3_results(texts[0].path, 'mini_greek_results_syn.tab')
    # Emit both result counts ahead of the comparison.
    print(len(v5_results), len(v3_results))
    _check_search_results(v5_results, v3_results)
# Example no. 5
# 0
def test_lucverg(lucvergpop, lucverg_metadata):
    """Check a text-frequency lemmata search on the lucverg texts.

    Matches the first two texts found by title at 'line' granularity with
    an explicit stopword list, then compares the score-ordered results
    with the reference results loaded from 'lucverg_time.tab'.
    """
    titles = [meta['title'] for meta in lucverg_metadata]
    texts = lucvergpop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    lucvergpop.insert(search_result)

    search_options = {
        'stopwords': [
            "et", "qui", "quis", "in", "sum", "hic",
            "non", "tu", "neque", "ego",
        ],
        'stopword_basis': 'texts',
        'score_basis': 'stem',
        'freq_basis': 'texts',
        'max_distance': 10,
        'distance_basis': 'frequency',
        'min_score': 0,
    }
    searcher = SparseMatrixSearch(lucvergpop)
    v5_matches = searcher.match(
        search_result,
        TextOptions(texts[0], 'line'),
        TextOptions(texts[1], 'line'),
        'lemmata',
        **search_options)
    lucvergpop.insert_nocheck(v5_matches)

    # Flag the search as done, then read its results back.
    search_result.status = Search.DONE
    lucvergpop.update(search_result)

    # Highest score first.
    v5_results = list(
        get_results(lucvergpop, search_result.id, PageOptions()))
    v5_results.sort(key=lambda match: -match['score'])
    v3_results = _load_v3_results(texts[0].path, 'lucverg_time.tab')
    _check_search_results(v5_results, v3_results)
# Example no. 6
# 0
def test_greek_sound(minipop, mini_greek_metadata):
    """Check a 'sound' feature search on the mini Greek texts.

    Matches the first two texts found by title at 'phrase' granularity,
    prints the matched features of both result sets, and compares the
    score-ordered results with the reference results loaded from
    'mini_greek_results_3gr.tab'.
    """
    titles = [meta['title'] for meta in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)

    search_options = {
        'stopwords': [
            'και', 'του', 'αλλ', 'ειν', 'μεν', 'μοι', 'αυτ', 'ους',
        ],
        'stopword_basis': 'texts',
        'score_basis': '3gr',
        'freq_basis': 'texts',
        'max_distance': 999,
        'distance_basis': 'span',
        'min_score': 0,
    }
    searcher = SparseMatrixSearch(minipop)
    v5_matches = searcher.match(
        search_result,
        TextOptions(texts[0], 'phrase'),
        TextOptions(texts[1], 'phrase'),
        'sound',
        **search_options)
    minipop.insert_nocheck(v5_matches)

    # Flag the search as done, then read its results back.
    search_result.status = Search.DONE
    minipop.update(search_result)

    # Highest score first.
    v5_results = list(get_results(minipop, search_result.id, PageOptions()))
    v5_results.sort(key=lambda match: -match['score'])
    v3_results = _load_v3_results(texts[0].path, 'mini_greek_results_3gr.tab')

    # Dump the matched features and sizes of both result sets.
    for reference in v3_results:
        print('v3 trigrams:', reference['matched_features'])
    for found in v5_results:
        print('v5 trigrams:', found['matched_features'])
    print('v5 length:', len(v5_results), 'v3 length:', len(v3_results))
    _check_search_results(v5_results, v3_results)
# Example no. 7
# 0
def test_mini_greek_search_text_freqs(minipop, mini_greek_metadata, v3checker):
    """Check a text-frequency lemmata search on the mini Greek texts.

    Matches the first two texts found by title at 'phrase' granularity
    with an explicit stopword list, stores the matches, and hands the
    finished search to ``v3checker`` together with
    'mini_greek_results.tab'.
    """
    titles = [meta['title'] for meta in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)

    search_options = {
        'stopwords': [
            'ὁ', 'ὅς', 'καί', 'αβγ', 'ἐγώ', 'δέ', 'οὗτος', 'ἐμός',
        ],
        'stopword_basis': 'texts',
        'score_basis': 'lemmata',
        'freq_basis': 'texts',
        'max_distance': 10,
        'distance_basis': 'span',
        'min_score': 0,
    }
    searcher = SparseMatrixSearch(minipop)
    v5_matches = searcher.match(
        search_result,
        TextOptions(texts[0], 'phrase'),
        TextOptions(texts[1], 'phrase'),
        'lemmata',
        **search_options)
    minipop.insert_nocheck(v5_matches)

    # Flag the search as done, then run the comparison.
    search_result.status = Search.DONE
    minipop.update(search_result)
    v3checker.check_search_results(
        minipop, search_result.id, texts[0].path, 'mini_greek_results.tab')
# Example no. 8
# 0
def test_english(engpop, eng_metadata, v3checker):
    """Check a 'form' feature search on the English texts.

    Matches the first two texts found by title at 'line' granularity with
    an explicit stopword list and a minimum score of 6.0, stores the
    matches, and hands the finished search to ``v3checker`` together with
    'eng_time.tab'.
    """
    titles = [meta['title'] for meta in eng_metadata]
    texts = engpop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    engpop.insert(search_result)

    search_options = {
        'stopwords': [
            "the", "and", "of", "a", "to",
            "in", "that", "with", "i", "by",
        ],
        'stopword_basis': 'texts',
        'score_basis': 'form',
        'freq_basis': 'texts',
        'max_distance': 10,
        'distance_basis': 'frequency',
        'min_score': 6.0,
    }
    searcher = SparseMatrixSearch(engpop)
    v5_matches = searcher.match(
        search_result,
        TextOptions(texts[0], 'line'),
        TextOptions(texts[1], 'line'),
        'form',
        **search_options)
    engpop.insert_nocheck(v5_matches)

    # Flag the search as done, then run the comparison.
    search_result.status = Search.DONE
    engpop.update(search_result)
    v3checker.check_search_results(
        engpop, search_result.id, texts[0].path, 'eng_time.tab')
# Example no. 9
# 0
def test_greek_to_latin_corpus_basis(g2lpop, mini_g2l_metadata, v3checker):
    """Check a Greek-to-Latin search using corpus-wide frequencies.

    Runs ``GreekToLatinSearch`` on the first two texts found by title at
    'line' granularity, stores the matches, and hands the finished search
    to ``v3checker`` together with 'mini_g2l_corpus.tab'.
    """
    titles = [meta['title'] for meta in mini_g2l_metadata]
    texts = g2lpop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    g2lpop.insert(search_result)

    search_options = {
        'greek_stopwords': [],
        'latin_stopwords': ['et', 'non', 'iam'],
        'freq_basis': 'corpus',
        'max_distance': 999,
        'distance_basis': 'frequency',
        'min_score': 0,
    }
    searcher = GreekToLatinSearch(g2lpop)
    v5_matches = searcher.match(
        search_result,
        TextOptions(texts[0], 'line'),
        TextOptions(texts[1], 'line'),
        **search_options)
    g2lpop.insert_nocheck(v5_matches)

    # Flag the search as done, then run the comparison.
    search_result.status = Search.DONE
    g2lpop.update(search_result)
    v3checker.check_search_results(
        g2lpop, search_result.id, texts[0].path, 'mini_g2l_corpus.tab')
# Example no. 10
# 0
def test_mini_latin_search_text_freqs(minipop, mini_latin_metadata, v3checker):
    """Check a text-frequency lemmata search on the mini Latin texts.

    Matches the first two texts found by title at 'line' granularity with
    an explicit stopword list, stores the matches, and hands the finished
    search to ``v3checker`` together with 'mini_latin_results.tab'.
    """
    titles = [meta['title'] for meta in mini_latin_metadata]
    texts = minipop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)

    search_options = {
        'stopwords': ['et', 'neque', 'qui'],
        'stopword_basis': 'texts',
        'score_basis': 'lemmata',
        'freq_basis': 'texts',
        'max_distance': 10,
        'distance_basis': 'frequency',
        'min_score': 0,
    }
    searcher = SparseMatrixSearch(minipop)
    v5_matches = searcher.match(
        search_result,
        TextOptions(texts[0], 'line'),
        TextOptions(texts[1], 'line'),
        'lemmata',
        **search_options)
    minipop.insert_nocheck(v5_matches)

    # Flag the search as done, then run the comparison.
    search_result.status = Search.DONE
    minipop.update(search_result)
    v3checker.check_search_results(
        minipop, search_result.id, texts[0].path, 'mini_latin_results.tab')
# Example no. 11
# 0
def test_greek_sound(minipop, mini_greek_metadata, v3checker):
    """Check a 'sound' feature search on the mini Greek texts.

    Matches the first two texts found by title at 'phrase' granularity
    with an explicit stopword list, stores the matches, and hands the
    finished search to ``v3checker`` together with
    'mini_greek_results_3gr.tab'.
    """
    titles = [meta['title'] for meta in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=titles)

    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)

    search_options = {
        'stopwords': [
            'και', 'του', 'αλλ', 'ειν', 'μεν', 'μοι', 'αυτ', 'ους',
        ],
        'stopword_basis': 'texts',
        'score_basis': 'sound',
        'freq_basis': 'texts',
        'max_distance': 999,
        'distance_basis': 'span',
        'min_score': 0,
    }
    searcher = SparseMatrixSearch(minipop)
    v5_matches = searcher.match(
        search_result,
        TextOptions(texts[0], 'phrase'),
        TextOptions(texts[1], 'phrase'),
        'sound',
        **search_options)
    minipop.insert_nocheck(v5_matches)

    # Flag the search as done, then run the comparison.
    search_result.status = Search.DONE
    minipop.update(search_result)
    v3checker.check_search_results(
        minipop, search_result.id, texts[0].path,
        'mini_greek_results_3gr.tab')