Example #1
0
def test_greek_semlem(minipop, mini_greek_metadata):
    """Compare v5 'semantic + lemma' phrase search on the Greek mini-corpus
    against the stored v3 reference results."""
    titles = [meta['title'] for meta in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=titles)
    search = Search(results_id=uuid.uuid4())
    minipop.insert(search)
    stoplist = [
        'οὗτος', 'τις', 'ὁ', 'ὅς', 'καί',
        'αβγ', 'ἐγώ', 'τέκνον'
    ]
    matches = SparseMatrixSearch(minipop).match(
        search,
        TextOptions(texts[0], 'phrase'),
        TextOptions(texts[1], 'phrase'),
        'semantic + lemma',
        stopwords=stoplist,
        stopword_basis='texts',
        score_basis='stem',
        freq_basis='texts',
        max_distance=999,
        distance_basis='span',
        min_score=0)
    minipop.insert_nocheck(matches)
    search.status = Search.DONE
    minipop.update(search)
    v5_results = sorted(get_results(minipop, search.id, PageOptions()),
                        key=lambda hit: hit['score'], reverse=True)
    v3_results = _load_v3_results(texts[0].path,
                                  'mini_greek_results_syn_lem.tab')
    print(len(v5_results), len(v3_results))
    _check_search_results(v5_results, v3_results)
Example #2
0
def test_lucverg(lucvergpop, lucverg_metadata):
    """Compare v5 'lemmata' line search on the Lucan/Vergil corpus against
    the stored v3 reference results."""
    texts = lucvergpop.find(
        Text.collection,
        title=[meta['title'] for meta in lucverg_metadata])
    search = Search(results_id=uuid.uuid4())
    lucvergpop.insert(search)
    match_kwargs = {
        'stopwords': [
            "et", "qui", "quis", "in", "sum", "hic",
            "non", "tu", "neque", "ego"
        ],
        'stopword_basis': 'texts',
        'score_basis': 'stem',
        'freq_basis': 'texts',
        'max_distance': 10,
        'distance_basis': 'frequency',
        'min_score': 0,
    }
    matcher = SparseMatrixSearch(lucvergpop)
    matches = matcher.match(search,
                            TextOptions(texts[0], 'line'),
                            TextOptions(texts[1], 'line'),
                            'lemmata',
                            **match_kwargs)
    lucvergpop.insert_nocheck(matches)
    search.status = Search.DONE
    lucvergpop.update(search)
    v5_results = get_results(lucvergpop, search.id, PageOptions())
    v5_results.sort(key=lambda hit: hit['score'], reverse=True)
    v3_results = _load_v3_results(texts[0].path, 'lucverg_time.tab')
    _check_search_results(v5_results, v3_results)
Example #3
0
def test_greek_sound(minipop, mini_greek_metadata):
    """Compare v5 'sound' (character-trigram) phrase search on the Greek
    mini-corpus against the stored v3 reference results, printing the
    matched trigrams on both sides for debugging."""
    texts = minipop.find(
        Text.collection,
        title=[meta['title'] for meta in mini_greek_metadata])
    search = Search(results_id=uuid.uuid4())
    minipop.insert(search)
    matcher = SparseMatrixSearch(minipop)
    matches = matcher.match(
        search,
        TextOptions(texts[0], 'phrase'),
        TextOptions(texts[1], 'phrase'),
        'sound',
        stopwords=['και', 'του', 'αλλ', 'ειν', 'μεν', 'μοι', 'αυτ', 'ους'],
        stopword_basis='texts',
        score_basis='3gr',
        freq_basis='texts',
        max_distance=999,
        distance_basis='span',
        min_score=0)
    minipop.insert_nocheck(matches)
    search.status = Search.DONE
    minipop.update(search)
    v5_results = sorted(get_results(minipop, search.id, PageOptions()),
                        key=lambda hit: hit['score'], reverse=True)
    v3_results = _load_v3_results(texts[0].path, 'mini_greek_results_3gr.tab')
    # Debug output: show which trigrams each side matched on.
    for hit in v3_results:
        print('v3 trigrams:', hit['matched_features'])
    for hit in v5_results:
        print('v5 trigrams:', hit['matched_features'])
    print('v5 length:', len(v5_results), 'v3 length:', len(v3_results))
    _check_search_results(v5_results, v3_results)
Example #4
0
def test_latin_semlem(minipop, mini_latin_metadata):
    """Compare v5 'semantic + lemma' line search on the Latin mini-corpus
    against the stored v3 reference results."""
    wanted_titles = [meta['title'] for meta in mini_latin_metadata]
    texts = minipop.find(Text.collection, title=wanted_titles)
    search = Search(results_id=uuid.uuid4())
    minipop.insert(search)
    matches = SparseMatrixSearch(minipop).match(
        search,
        TextOptions(texts[0], 'line'),
        TextOptions(texts[1], 'line'),
        'semantic + lemma',
        stopwords=['et', 'neque', 'per'],
        stopword_basis='texts',
        score_basis='stem',
        freq_basis='texts',
        max_distance=999,
        distance_basis='frequency',
        min_score=0)
    minipop.insert_nocheck(matches)
    search.status = Search.DONE
    minipop.update(search)
    v5_results = get_results(minipop, search.id, PageOptions())
    v5_results.sort(key=lambda hit: hit['score'], reverse=True)
    v3_results = _load_v3_results(texts[0].path,
                                  'mini_latin_results_syn_lem.tab')
    _check_search_results(v5_results, v3_results)
Example #5
0
def test_mini_greek_search_corpus_freqs(minipop, mini_greek_metadata):
    """'lemmata' phrase search on the Greek mini-corpus using corpus-wide
    stoplist and frequency bases, checked against stored v3 results."""
    texts = minipop.find(
        Text.collection,
        title=[meta['title'] for meta in mini_greek_metadata])
    # see tesserae.utils.search for how to actually set up Search
    search = Search(results_id=uuid.uuid4(), status=Search.INIT, msg='')
    minipop.insert(search)
    matcher = SparseMatrixSearch(minipop)
    matches = matcher.match(search,
                            TextOptions(texts[0], 'phrase'),
                            TextOptions(texts[1], 'phrase'),
                            'lemmata',
                            stopwords=10,
                            stopword_basis='corpus',
                            score_basis='stem',
                            freq_basis='corpus',
                            max_distance=10,
                            distance_basis='span',
                            min_score=0)
    minipop.insert_nocheck(matches)
    search.status = Search.DONE
    minipop.update(search)
    v5_results = sorted(get_results(minipop, search.id, PageOptions()),
                        key=lambda hit: hit['score'], reverse=True)
    v3_results = _load_v3_results(texts[0].path,
                                  'mini_greek_corpus_results.tab')
    _check_search_results(v5_results, v3_results)
Example #6
0
def test_get_results_sort_source_tag(resultsdb):
    """get_results honors sorting by ``source_tag`` together with paging.

    Exercises three pages -- descending page 0 (20 per page), descending
    page 1 (50 per page), and ascending page 2 (20 per page) -- and checks
    each against the corresponding slice of an explicitly sorted copy of
    all matches for the search.
    """
    search = resultsdb.find(Search.collection)[0]

    # Descending order, first page of 20.
    page_options = PageOptions(sort_by='source_tag',
                               sort_order='descending',
                               per_page=20,
                               page_number=0)
    got_results = get_results(resultsdb, search.id, page_options)
    true_results = resultsdb.find(Match.collection, search_id=search.id)
    true_results.sort(key=lambda x: x.source_tag, reverse=True)
    _assert_equivalent_results(got_results, true_results[0:20])

    # Descending order, second page of 50 (rows 50-99).
    page_options = PageOptions(sort_by='source_tag',
                               sort_order='descending',
                               per_page=50,
                               page_number=1)
    got_results = get_results(resultsdb, search.id, page_options)
    _assert_equivalent_results(got_results, true_results[50:100])

    # Ascending order, third page of 20 (rows 40-59).
    page_options = PageOptions(sort_by='source_tag',
                               sort_order='ascending',
                               per_page=20,
                               page_number=2)
    got_results = get_results(resultsdb, search.id, page_options)
    true_results.sort(key=lambda x: x.source_tag, reverse=False)
    _assert_equivalent_results(got_results, true_results[40:60])
    # Removed dead trailing statement `page_options.sort_order = 1`: it
    # mutated a local after the final assertion and had no effect.
Example #7
0
 def check_search_results(conn, search_id, textpath, tabname):
     """Cross-check v5 search results against v3 reference results.

     Loads the stored v5 results for ``search_id`` and the v3 tab file
     ``tabname``, indexes both by (target locus, source locus), and
     asserts that scores agree to three decimal places, that matched
     features agree, and that the same locus pairs appear on both sides.
     All discrepancies are printed before the assertions fire so that a
     failure shows the full picture.
     """
     v5_results = get_results(conn, search_id, PageOptions())
     v5_results.sort(key=lambda hit: -hit['score'])
     v3_results = _load_v3_results(textpath, tabname)
     v3_relations = _build_relations(v3_results)
     v5_relations = _build_relations(v5_results)
     score_discrepancies = []
     match_discrepancies = []
     in_v5_not_in_v3 = []
     in_v3_not_in_v5 = []
     for target_loc, v3_by_source in v3_relations.items():
         for source_loc, v3_match in v3_by_source.items():
             v5_by_source = v5_relations.get(target_loc)
             if v5_by_source is None or source_loc not in v5_by_source:
                 in_v3_not_in_v5.append(v3_match)
                 continue
             v5_match = v5_by_source[source_loc]
             v3_score = v3_match['score']
             v5_score = v5_match['score']
             # Scores only need to agree to three decimal places.
             if f'{v5_score:.3f}' != f'{v3_score:.3f}':
                 score_discrepancies.append(
                     (target_loc, source_loc, v5_score - v3_score))
             v5_match_features = set(v5_match['matched_features'])
             # v3 stores joined feature pairs ("a-b"); split them apart
             # before comparing against v5's flat feature list.
             v3_match_features = {
                 f
                 for match_f in v3_match['matched_features']
                 for f in match_f.split('-')
             }
             only_in_v5 = v5_match_features - v3_match_features
             only_in_v3 = v3_match_features - v5_match_features
             if only_in_v5 or only_in_v3:
                 match_discrepancies.append(
                     (target_loc, source_loc, only_in_v5, only_in_v3))
     for target_loc, v5_by_source in v5_relations.items():
         for source_loc, v5_match in v5_by_source.items():
             if target_loc not in v3_relations or \
                     source_loc not in v3_relations[target_loc]:
                 in_v5_not_in_v3.append(v5_match)
     print('# Score discrepancies')
     pprint.pprint(score_discrepancies)
     print('# Match discrepancies')
     pprint.pprint(match_discrepancies)
     print('# In v5 but not in v3')
     pprint.pprint(in_v5_not_in_v3)
     print('# In v3 but not in v5')
     pprint.pprint(in_v3_not_in_v5)
     assert not score_discrepancies
     assert not match_discrepancies
     assert not in_v5_not_in_v3
     assert not in_v3_not_in_v5
Example #8
0
def main():
    """Run a Tesserae search from CLI arguments and print the top 10 matches.

    Looks up the source and target texts, builds a stoplist, reuses cached
    results when an identical search is already stored, otherwise runs a
    fresh search, then prints the ten highest-scoring matches.
    """
    args = parse_args()
    # NOTE(review): the password/connection lines below were garbled in the
    # checked-in source (a credential-redaction artifact collapsed several
    # lines into one, leaving `connection` undefined); reconstructed here.
    # TODO: confirm the attribute names (host, port, user, database) against
    # parse_args() and the TessMongoConnection signature.
    if args.password:
        password = getpass(prompt='Tesserae MongoDB Password: ')
    else:
        password = ''
    connection = TessMongoConnection(args.host, args.port, args.user,
                                     password, db=args.database)
    # CLI authors/titles use a separator in place of spaces; normalize them
    # to match the stored metadata.  NOTE(review): source uses '-' while
    # target uses '_' -- confirm which separator parse_args() produces.
    source_author = args.source_author.lower().replace('-', ' ')
    source_title = args.source_title.lower().replace('-', ' ')
    source = TextOptions(text=connection.find('texts',
                                              author=source_author,
                                              title=source_title)[0],
                         unit_type=args.source_unit)
    target_author = args.target_author.lower().replace('_', ' ')
    target_title = args.target_title.lower().replace('_', ' ')
    target = TextOptions(text=connection.find('texts',
                                              author=target_author,
                                              title=target_title)[0],
                         unit_type=args.target_unit)

    start = time.time()
    # Stoplist is drawn either from the whole corpus or from just the two
    # texts being compared.
    stopword_indices = create_stoplist(
        connection,
        args.n_stopwords,
        args.feature,
        source.text.language,
        basis='corpus' if args.stopword_basis == 'corpus' else
        [source.text.id, target.text.id])
    stopword_tokens = get_stoplist_tokens(connection, stopword_indices,
                                          args.feature, source.text.language)
    # Parameter block doubles as the cache key for previously-run searches.
    parameters = {
        'source': {
            'object_id': str(source.text.id),
            'units': source.unit_type
        },
        'target': {
            'object_id': str(target.text.id),
            'units': target.unit_type
        },
        'method': {
            'name': SparseMatrixSearch.matcher_type,
            'feature': args.feature,
            'stopwords': stopword_tokens,
            'freq_basis': args.freq_basis,
            'max_distance': args.max_distance,
            'distance_basis': args.distance_basis
        }
    }
    results_id = check_cache(connection, parameters['source'],
                             parameters['target'], parameters['method'])
    if results_id:
        print('Cached results found.')
        search = connection.find(Search.collection,
                                 results_id=results_id,
                                 search_type=NORMAL_SEARCH)[0]
    else:
        search = Search(results_id=uuid.uuid4().hex,
                        search_type=NORMAL_SEARCH,
                        parameters=parameters)
        connection.insert(search)
        search_params = {
            'source': source,
            'target': target,
            'feature': parameters['method']['feature'],
            'stopwords': parameters['method']['stopwords'],
            'freq_basis': parameters['method']['freq_basis'],
            'max_distance': parameters['method']['max_distance'],
            'distance_basis': parameters['method']['distance_basis'],
            'min_score': 0
        }
        _run_search(connection, search, SparseMatrixSearch.matcher_type,
                    search_params)
    matches = get_results(connection, search.id, PageOptions())
    end = time.time() - start
    matches.sort(key=lambda x: x['score'], reverse=True)
    print(f'Search found {len(matches)} matches in {end}s.')
    display_count = 10 if len(matches) >= 10 else len(matches)
    print(f'The Top {display_count} Matches')
    print('------------------')
    print()
    print("Result\tScore\tSource Locus\tTarget Locus\tShared")
    for i, m in enumerate(matches[:10]):
        shared = m['matched_features']
        print(f'{i}.\t{m["score"]}\t{m["source_tag"]}\t{m["target_tag"]}\t'
              f'{list(shared)}')
Example #9
0
def test_get_results_dump(resultsdb):
    """Default PageOptions returns every match stored for the search."""
    search = resultsdb.find(Search.collection)[0]
    fetched = get_results(resultsdb, search.id, PageOptions())
    expected = resultsdb.find(Match.collection, search_id=search.id)
    _assert_equivalent_results(fetched, expected)