def test_greek_semlem(minipop, mini_greek_metadata):
    texts = minipop.find(
        Text.collection,
        title=[m['title'] for m in mini_greek_metadata])
    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)
    matcher = SparseMatrixSearch(minipop)
    v5_matches = matcher.match(
        search_result,
        TextOptions(texts[0], 'phrase'),
        TextOptions(texts[1], 'phrase'),
        'semantic + lemma',
        stopwords=[
            'οὗτος', 'τις', 'ὁ', 'ὅς', 'καί', 'αβγ', 'ἐγώ', 'τέκνον'
        ],
        stopword_basis='texts', score_basis='stem',
        freq_basis='texts', max_distance=999,
        distance_basis='span', min_score=0)
    minipop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    minipop.update(search_result)
    v5_results = get_results(minipop, search_result.id, PageOptions())
    v5_results = sorted(v5_results, key=lambda x: -x['score'])
    v3_results = _load_v3_results(
        texts[0].path, 'mini_greek_results_syn_lem.tab')
    print(len(v5_results), len(v3_results))
    _check_search_results(v5_results, v3_results)

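# The tests in this module call a _load_v3_results helper that is defined
# elsewhere in the suite. The sketch below shows one plausible shape for it,
# assuming the v3 benchmark file sits in the same directory as the source
# text and follows the classic Tesserae v3 .tab layout ('#'-prefixed header
# lines, then tab-separated columns: result number, target tag, target
# snippet, source tag, source snippet, shared features, score). The column
# order and the '; ' feature delimiter are assumptions, not a verbatim copy
# of the real helper.
import os


def _load_v3_results_sketch(text_path, tab_filename):
    tab_path = os.path.join(os.path.dirname(text_path), tab_filename)
    results = []
    with open(tab_path, encoding='utf-8') as ifh:
        for line in ifh:
            line = line.strip()
            # skip blank lines and the commented header block
            if not line or line.startswith('#'):
                continue
            fields = line.split('\t')
            results.append({
                'target_tag': fields[1].strip('"'),
                'source_tag': fields[3].strip('"'),
                # shared features arrive as a single delimited field
                'matched_features': fields[5].strip('"').split('; '),
                'score': float(fields[6]),
            })
    return results
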
def test_lucverg(lucvergpop, lucverg_metadata):
    texts = lucvergpop.find(
        Text.collection,
        title=[m['title'] for m in lucverg_metadata])
    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    lucvergpop.insert(search_result)
    matcher = SparseMatrixSearch(lucvergpop)
    v5_matches = matcher.match(
        search_result,
        TextOptions(texts[0], 'line'),
        TextOptions(texts[1], 'line'),
        'lemmata',
        stopwords=[
            "et", "qui", "quis", "in", "sum", "hic", "non", "tu",
            "neque", "ego"
        ],
        stopword_basis='texts', score_basis='stem',
        freq_basis='texts', max_distance=10,
        distance_basis='frequency', min_score=0)
    lucvergpop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    lucvergpop.update(search_result)
    v5_results = get_results(lucvergpop, search_result.id, PageOptions())
    v5_results = sorted(v5_results, key=lambda x: -x['score'])
    v3_results = _load_v3_results(texts[0].path, 'lucverg_time.tab')
    _check_search_results(v5_results, v3_results)

def test_greek_sound(minipop, mini_greek_metadata):
    texts = minipop.find(
        Text.collection,
        title=[m['title'] for m in mini_greek_metadata])
    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)
    matcher = SparseMatrixSearch(minipop)
    v5_matches = matcher.match(
        search_result,
        TextOptions(texts[0], 'phrase'),
        TextOptions(texts[1], 'phrase'),
        'sound',
        stopwords=['και', 'του', 'αλλ', 'ειν', 'μεν', 'μοι', 'αυτ', 'ους'],
        stopword_basis='texts', score_basis='3gr',
        freq_basis='texts', max_distance=999,
        distance_basis='span', min_score=0)
    minipop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    minipop.update(search_result)
    v5_results = get_results(minipop, search_result.id, PageOptions())
    v5_results = sorted(v5_results, key=lambda x: -x['score'])
    v3_results = _load_v3_results(texts[0].path, 'mini_greek_results_3gr.tab')
    for p in v3_results:
        print('v3 trigrams:', p['matched_features'])
    for p in v5_results:
        print('v5 trigrams:', p['matched_features'])
    print('v5 length:', len(v5_results), 'v3 length:', len(v3_results))
    _check_search_results(v5_results, v3_results)

def test_latin_semlem(minipop, mini_latin_metadata):
    texts = minipop.find(
        Text.collection,
        title=[m['title'] for m in mini_latin_metadata])
    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)
    matcher = SparseMatrixSearch(minipop)
    v5_matches = matcher.match(
        search_result,
        TextOptions(texts[0], 'line'),
        TextOptions(texts[1], 'line'),
        'semantic + lemma',
        stopwords=['et', 'neque', 'per'],
        stopword_basis='texts', score_basis='stem',
        freq_basis='texts', max_distance=999,
        distance_basis='frequency', min_score=0)
    minipop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    minipop.update(search_result)
    v5_results = get_results(minipop, search_result.id, PageOptions())
    v5_results = sorted(v5_results, key=lambda x: -x['score'])
    v3_results = _load_v3_results(
        texts[0].path, 'mini_latin_results_syn_lem.tab')
    _check_search_results(v5_results, v3_results)

def test_mini_greek_search_corpus_freqs(minipop, mini_greek_metadata):
    texts = minipop.find(
        Text.collection,
        title=[m['title'] for m in mini_greek_metadata])
    results_id = uuid.uuid4()
    search_result = Search(
        results_id=results_id,
        status=Search.INIT,
        msg='',
        # see tesserae.utils.search for how to actually set up Search
    )
    minipop.insert(search_result)
    matcher = SparseMatrixSearch(minipop)
    v5_matches = matcher.match(
        search_result,
        TextOptions(texts[0], 'phrase'),
        TextOptions(texts[1], 'phrase'),
        'lemmata',
        stopwords=10,
        stopword_basis='corpus', score_basis='stem',
        freq_basis='corpus', max_distance=10,
        distance_basis='span', min_score=0)
    minipop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    minipop.update(search_result)
    v5_results = get_results(minipop, search_result.id, PageOptions())
    v5_results = sorted(v5_results, key=lambda x: -x['score'])
    v3_results = _load_v3_results(
        texts[0].path, 'mini_greek_corpus_results.tab')
    _check_search_results(v5_results, v3_results)

def test_get_results_sort_source_tag(resultsdb):
    search = resultsdb.find(Search.collection)[0]
    page_options = PageOptions(sort_by='source_tag',
                               sort_order='descending',
                               per_page=20,
                               page_number=0)
    got_results = get_results(resultsdb, search.id, page_options)
    true_results = resultsdb.find(Match.collection, search_id=search.id)
    true_results.sort(key=lambda x: x.source_tag, reverse=True)
    _assert_equivalent_results(got_results, true_results[0:20])

    page_options = PageOptions(sort_by='source_tag',
                               sort_order='descending',
                               per_page=50,
                               page_number=1)
    got_results = get_results(resultsdb, search.id, page_options)
    _assert_equivalent_results(got_results, true_results[50:100])

    page_options = PageOptions(sort_by='source_tag',
                               sort_order='ascending',
                               per_page=20,
                               page_number=2)
    got_results = get_results(resultsdb, search.id, page_options)
    true_results.sort(key=lambda x: x.source_tag, reverse=False)
    _assert_equivalent_results(got_results, true_results[40:60])

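# _assert_equivalent_results is used by the pagination tests above and below
# but is not defined in this section. A plausible sketch is given here,
# assuming get_results returns dicts whose tag and score fields mirror the
# Match entities; the exact fields compared by the real helper are an
# assumption.
import math


def _assert_equivalent_results_sketch(got_results, true_results):
    assert len(got_results) == len(true_results)
    for got, true in zip(got_results, true_results):
        # compare the display fields the sort tests key on
        assert got['source_tag'] == true.source_tag
        assert got['target_tag'] == true.target_tag
        # scores pass through float serialization, so compare approximately
        assert math.isclose(got['score'], true.score)
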
def check_search_results(conn, search_id, textpath, tabname):
    v5_results = get_results(conn, search_id, PageOptions())
    v5_results.sort(key=lambda x: -x['score'])
    v3_results = _load_v3_results(textpath, tabname)
    v3_relations = _build_relations(v3_results)
    v5_relations = _build_relations(v5_results)
    score_discrepancies = []
    match_discrepancies = []
    in_v5_not_in_v3 = []
    in_v3_not_in_v5 = []
    for target_loc in v3_relations:
        for source_loc in v3_relations[target_loc]:
            if target_loc not in v5_relations or \
                    source_loc not in v5_relations[target_loc]:
                in_v3_not_in_v5.append(v3_relations[target_loc][source_loc])
                continue
            v3_match = v3_relations[target_loc][source_loc]
            v5_match = v5_relations[target_loc][source_loc]
            v3_score = v3_match['score']
            v5_score = v5_match['score']
            # scores only need to agree to three decimal places
            if f'{v5_score:.3f}' != f'{v3_score:.3f}':
                score_discrepancies.append(
                    (target_loc, source_loc, v5_score - v3_score))
            v5_match_features = set(v5_match['matched_features'])
            v3_match_features = set()
            for match_f in v3_match['matched_features']:
                for f in match_f.split('-'):
                    v3_match_features.add(f)
            only_in_v5 = v5_match_features - v3_match_features
            only_in_v3 = v3_match_features - v5_match_features
            if only_in_v5 or only_in_v3:
                match_discrepancies.append(
                    (target_loc, source_loc, only_in_v5, only_in_v3))
    for target_loc in v5_relations:
        for source_loc in v5_relations[target_loc]:
            if target_loc not in v3_relations or \
                    source_loc not in v3_relations[target_loc]:
                in_v5_not_in_v3.append(v5_relations[target_loc][source_loc])
    print('# Score discrepancies')
    pprint.pprint(score_discrepancies)
    print('# Match discrepancies')
    pprint.pprint(match_discrepancies)
    print('# In v5 but not in v3')
    pprint.pprint(in_v5_not_in_v3)
    print('# In v3 but not in v5')
    pprint.pprint(in_v3_not_in_v5)
    assert not score_discrepancies
    assert not match_discrepancies
    assert not in_v5_not_in_v3
    assert not in_v3_not_in_v5

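# check_search_results relies on _build_relations to index a flat result list
# by (target locus, source locus) so that v3 and v5 hits at the same loci can
# be compared pairwise. A minimal sketch follows, assuming the locus is the
# final whitespace-separated token of the tag (e.g. 'lucan pharsalia 1.1'
# yields '1.1'); the exact tag-parsing rule is an assumption.
def _build_relations_sketch(results):
    relations = {}
    for match in results:
        target_loc = match['target_tag'].split()[-1]
        source_loc = match['source_tag'].split()[-1]
        # nested dict keyed by target locus, then source locus
        relations.setdefault(target_loc, {})[source_loc] = match
    return relations
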
def main():
    """Perform Tesserae search and display the top 10 results"""
    args = parse_args()
    if args.password:
        password = getpass(prompt='Tesserae MongoDB Password: ')
    else:
        password = None
    connection = TessMongoConnection(args.host, args.port, args.user,
                                     password, db=args.database)

    source_author = args.source_author.lower().replace('-', ' ')
    source_title = args.source_title.lower().replace('-', ' ')
    source = TextOptions(text=connection.find('texts',
                                              author=source_author,
                                              title=source_title)[0],
                         unit_type=args.source_unit)
    target_author = args.target_author.lower().replace('_', ' ')
    target_title = args.target_title.lower().replace('_', ' ')
    target = TextOptions(text=connection.find('texts',
                                              author=target_author,
                                              title=target_title)[0],
                         unit_type=args.target_unit)

    start = time.time()
    stopword_indices = create_stoplist(
        connection,
        args.n_stopwords,
        args.feature,
        source.text.language,
        basis='corpus' if args.stopword_basis == 'corpus'
        else [source.text.id, target.text.id])
    stopword_tokens = get_stoplist_tokens(connection, stopword_indices,
                                          args.feature, source.text.language)
    parameters = {
        'source': {
            'object_id': str(source.text.id),
            'units': source.unit_type
        },
        'target': {
            'object_id': str(target.text.id),
            'units': target.unit_type
        },
        'method': {
            'name': SparseMatrixSearch.matcher_type,
            'feature': args.feature,
            'stopwords': stopword_tokens,
            'freq_basis': args.freq_basis,
            'max_distance': args.max_distance,
            'distance_basis': args.distance_basis
        }
    }
    results_id = check_cache(connection, parameters['source'],
                             parameters['target'], parameters['method'])
    if results_id:
        print('Cached results found.')
        search = connection.find(Search.collection,
                                 results_id=results_id,
                                 search_type=NORMAL_SEARCH)[0]
    else:
        search = Search(results_id=uuid.uuid4().hex,
                        search_type=NORMAL_SEARCH,
                        parameters=parameters)
        connection.insert(search)
        search_params = {
            'source': source,
            'target': target,
            'feature': parameters['method']['feature'],
            'stopwords': parameters['method']['stopwords'],
            'freq_basis': parameters['method']['freq_basis'],
            'max_distance': parameters['method']['max_distance'],
            'distance_basis': parameters['method']['distance_basis'],
            'min_score': 0
        }
        _run_search(connection, search, SparseMatrixSearch.matcher_type,
                    search_params)
    matches = get_results(connection, search.id, PageOptions())
    end = time.time() - start
    matches.sort(key=lambda x: x['score'], reverse=True)
    print(f'Search found {len(matches)} matches in {end}s.')

    display_count = min(10, len(matches))
    print(f'The Top {display_count} Matches')
    print('------------------')
    print()
    print('Result\tScore\tSource Locus\tTarget Locus\tShared')
    for i, m in enumerate(matches[:10]):
        shared = m['matched_features']
        print(f'{i}.\t{m["score"]}\t{m["source_tag"]}\t{m["target_tag"]}\t'
              f'{shared}')

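# main() depends on a parse_args helper that is not part of this section. The
# sketch below reconstructs a plausible parser purely from the args.*
# attributes accessed above; every flag name, default, and help string is an
# assumption rather than the script's real interface.
import argparse


def parse_args_sketch():
    p = argparse.ArgumentParser(
        description='Run a Tesserae search between two texts')
    p.add_argument('source_author')
    p.add_argument('source_title')
    p.add_argument('target_author')
    p.add_argument('target_title')
    p.add_argument('--source-unit', default='line')
    p.add_argument('--target-unit', default='line')
    p.add_argument('--feature', default='lemmata')
    p.add_argument('--n-stopwords', type=int, default=10)
    p.add_argument('--stopword-basis', default='corpus')
    p.add_argument('--freq-basis', default='texts')
    p.add_argument('--max-distance', type=int, default=10)
    p.add_argument('--distance-basis', default='frequency')
    p.add_argument('--host', default='localhost')
    p.add_argument('--port', type=int, default=27017)
    p.add_argument('--user', default=None)
    # prompt for the password interactively rather than taking it as a flag
    p.add_argument('--password', action='store_true')
    p.add_argument('--database', default='tesserae')
    return p.parse_args()
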
def test_get_results_dump(resultsdb):
    search = resultsdb.find(Search.collection)[0]
    got_results = get_results(resultsdb, search.id, PageOptions())
    true_results = resultsdb.find(Match.collection, search_id=search.id)
    _assert_equivalent_results(got_results, true_results)