def test_mini_greek_search_corpus_freqs(minipop, mini_greek_metadata):
    """Corpus-frequency lemmata search on mini Greek texts vs. v3 baseline."""
    found_texts = minipop.find(Text.collection,
                               title=[m['title'] for m in mini_greek_metadata])
    results_id = uuid.uuid4()
    search_result = Search(
        results_id=results_id,
        status=Search.INIT,
        msg='',
        # see tesserae.utils.search for how to actually set up Search
    )
    minipop.insert(search_result)
    engine = SparseMatrixSearch(minipop)
    v5_matches = engine.match(search_result.id,
                              TextOptions(found_texts[0], 'phrase'),
                              TextOptions(found_texts[1], 'phrase'),
                              'lemmata',
                              stopwords=10,
                              stopword_basis='corpus',
                              score_basis='stem',
                              frequency_basis='corpus',
                              max_distance=10,
                              distance_metric='span',
                              min_score=0)
    minipop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    minipop.update(search_result)
    # sort v5 hits by descending score before comparing with v3
    v5_results = sorted(get_results(minipop, results_id),
                        key=lambda hit: -hit['score'])
    v3_results = _load_v3_results(found_texts[0].path,
                                  'mini_greek_corpus_results.tab')
    _check_search_results(v5_results, v3_results)
def test_mini_greek_search_text_freqs(minipop, mini_greek_metadata):
    """Text-frequency lemmata search on mini Greek texts vs. v3 baseline."""
    greek_texts = minipop.find(Text.collection,
                               title=[m['title'] for m in mini_greek_metadata])
    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)
    engine = SparseMatrixSearch(minipop)
    v5_matches = engine.match(search_result.id,
                              TextOptions(greek_texts[0], 'phrase'),
                              TextOptions(greek_texts[1], 'phrase'),
                              'lemmata',
                              stopwords=[
                                  'ὁ', 'ὅς', 'καί', 'αβγ', 'ἐγώ', 'δέ',
                                  'οὗτος', 'ἐμός'
                              ],
                              stopword_basis='texts',
                              score_basis='stem',
                              frequency_basis='texts',
                              max_distance=10,
                              distance_metric='span',
                              min_score=0)
    minipop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    minipop.update(search_result)
    # sort v5 hits by descending score before comparing with v3
    v5_results = sorted(get_results(minipop, results_id),
                        key=lambda hit: -hit['score'])
    v3_results = _load_v3_results(greek_texts[0].path,
                                  'mini_greek_results.tab')
    print(len(v5_results), len(v3_results))
    _check_search_results(v5_results, v3_results)
def test_greek_sound(minipop, mini_greek_metadata):
    """Sound-trigram search on mini Greek texts, compared with v3 output."""
    greek_texts = minipop.find(Text.collection,
                               title=[m['title'] for m in mini_greek_metadata])
    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)
    engine = SparseMatrixSearch(minipop)
    v5_matches = engine.match(
        search_result,
        TextOptions(greek_texts[0], 'phrase'),
        TextOptions(greek_texts[1], 'phrase'),
        'sound',
        stopwords=['και', 'του', 'αλλ', 'ειν', 'μεν', 'μοι', 'αυτ', 'ους'],
        stopword_basis='texts',
        score_basis='3gr',
        freq_basis='texts',
        max_distance=999,
        distance_basis='span',
        min_score=0)
    minipop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    minipop.update(search_result)
    # sort v5 hits by descending score before comparing with v3
    v5_results = sorted(get_results(minipop, search_result.id, PageOptions()),
                        key=lambda hit: -hit['score'])
    v3_results = _load_v3_results(greek_texts[0].path,
                                  'mini_greek_results_3gr.tab')
    # dump matched trigrams from both versions for debugging
    for parallel in v3_results:
        print('v3 trigrams:', parallel['matched_features'])
    for parallel in v5_results:
        print('v5 trigrams:', parallel['matched_features'])
    print('v5 length:', len(v5_results), 'v3 length:', len(v3_results))
    _check_search_results(v5_results, v3_results)
def test_greek_semantic(minipop, mini_greek_metadata):
    """Semantic-feature search on mini Greek texts, compared with v3 output."""
    greek_texts = minipop.find(Text.collection,
                               title=[m['title'] for m in mini_greek_metadata])
    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)
    engine = SparseMatrixSearch(minipop)
    v5_matches = engine.match(search_result,
                              TextOptions(greek_texts[0], 'phrase'),
                              TextOptions(greek_texts[1], 'phrase'),
                              'semantic',
                              stopwords=[
                                  'τις', 'οὗτος', 'καί', 'αβγ', 'ἐγώ',
                                  'τηνόθι', 'τηνικαῦτα', 'τέκνον'
                              ],
                              stopword_basis='texts',
                              score_basis='stem',
                              freq_basis='texts',
                              max_distance=999,
                              distance_basis='span',
                              min_score=0)
    minipop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    minipop.update(search_result)
    # sort v5 hits by descending score before comparing with v3
    v5_results = sorted(get_results(minipop, search_result.id, PageOptions()),
                        key=lambda hit: -hit['score'])
    v3_results = _load_v3_results(greek_texts[0].path,
                                  'mini_greek_results_syn.tab')
    print(len(v5_results), len(v3_results))
    _check_search_results(v5_results, v3_results)
def test_mini_greek_search_text_freqs(minipop, mini_greek_metadata, v3checker):
    """Text-frequency lemmata search verified through the v3checker fixture."""
    greek_texts = minipop.find(Text.collection,
                               title=[m['title'] for m in mini_greek_metadata])
    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)
    engine = SparseMatrixSearch(minipop)
    v5_matches = engine.match(search_result,
                              TextOptions(greek_texts[0], 'phrase'),
                              TextOptions(greek_texts[1], 'phrase'),
                              'lemmata',
                              stopwords=[
                                  'ὁ', 'ὅς', 'καί', 'αβγ', 'ἐγώ', 'δέ',
                                  'οὗτος', 'ἐμός'
                              ],
                              stopword_basis='texts',
                              score_basis='lemmata',
                              freq_basis='texts',
                              max_distance=10,
                              distance_basis='span',
                              min_score=0)
    minipop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    minipop.update(search_result)
    # v3checker handles loading the v3 file and comparing results
    v3checker.check_search_results(minipop, search_result.id,
                                   greek_texts[0].path,
                                   'mini_greek_results.tab')
def test_english(engpop, eng_metadata, v3checker):
    """English form-based line search, verified through the v3checker fixture."""
    eng_texts = engpop.find(Text.collection,
                            title=[m['title'] for m in eng_metadata])
    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    engpop.insert(search_result)
    engine = SparseMatrixSearch(engpop)
    v5_matches = engine.match(search_result,
                              TextOptions(eng_texts[0], 'line'),
                              TextOptions(eng_texts[1], 'line'),
                              'form',
                              stopwords=[
                                  "the", "and", "of", "a", "to", "in",
                                  "that", "with", "i", "by",
                              ],
                              stopword_basis='texts',
                              score_basis='form',
                              freq_basis='texts',
                              max_distance=10,
                              distance_basis='frequency',
                              min_score=6.0)
    engpop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    engpop.update(search_result)
    # v3checker handles loading the v3 file and comparing results
    v3checker.check_search_results(engpop, search_result.id,
                                   eng_texts[0].path, 'eng_time.tab')
def test_mini_latin_search_text_freqs(minipop, mini_latin_metadata):
    """Text-frequency lemmata search on mini Latin texts vs. v3 baseline."""
    latin_texts = minipop.find(Text.collection,
                               title=[m['title'] for m in mini_latin_metadata])
    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)
    engine = SparseMatrixSearch(minipop)
    v5_matches = engine.match(search_result.id,
                              TextOptions(latin_texts[0], 'line'),
                              TextOptions(latin_texts[1], 'line'),
                              'lemmata',
                              stopwords=['et', 'neque', 'qui'],
                              stopword_basis='texts',
                              score_basis='stem',
                              frequency_basis='texts',
                              max_distance=10,
                              distance_metric='frequency',
                              min_score=0)
    minipop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    minipop.update(search_result)
    # sort v5 hits by descending score before comparing with v3
    v5_results = sorted(get_results(minipop, results_id),
                        key=lambda hit: -hit['score'])
    v3_results = _load_v3_results(latin_texts[0].path,
                                  'mini_latin_results.tab')
    _check_search_results(v5_results, v3_results)
def test_lucverg(lucvergpop, lucverg_metadata):
    """Lucan/Vergil lemmata line search compared against the v3 benchmark."""
    lv_texts = lucvergpop.find(Text.collection,
                               title=[m['title'] for m in lucverg_metadata])
    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    lucvergpop.insert(search_result)
    engine = SparseMatrixSearch(lucvergpop)
    v5_matches = engine.match(search_result,
                              TextOptions(lv_texts[0], 'line'),
                              TextOptions(lv_texts[1], 'line'),
                              'lemmata',
                              stopwords=[
                                  "et", "qui", "quis", "in", "sum", "hic",
                                  "non", "tu", "neque", "ego"
                              ],
                              stopword_basis='texts',
                              score_basis='stem',
                              freq_basis='texts',
                              max_distance=10,
                              distance_basis='frequency',
                              min_score=0)
    lucvergpop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    lucvergpop.update(search_result)
    # sort v5 hits by descending score before comparing with v3
    v5_results = sorted(get_results(lucvergpop, search_result.id,
                                    PageOptions()),
                        key=lambda hit: -hit['score'])
    v3_results = _load_v3_results(lv_texts[0].path, 'lucverg_time.tab')
    _check_search_results(v5_results, v3_results)
def test_latin_trigrams(minipop, mini_latin_metadata):
    # Debugging/visualization test: prints the sound trigrams stored in the
    # database for the mini Latin texts next to the trigrams recorded in the
    # v3 results file.  Ends with `assert False` so pytest always shows the
    # captured output; this is not a pass/fail correctness check.
    texts = minipop.find(Text.collection,
                         title=[m['title'] for m in mini_latin_metadata])
    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)
    v5_results = []
    v3_results = []
    raw_v5_results = []
    # sound-feature indices for each 'line' unit of the first text
    target_units = _get_units(minipop, TextOptions(texts[0], 'line'), 'sound')
    for b in target_units:
        raw_v5_results.append(b['features'])
    raw_v3_results = _load_v3_results(texts[0].path,
                                      'mini_latin_results_3gr.tab')
    for a in raw_v3_results:
        v3_results.append(a['matched_features'])
    print('v5 results:')
    for a in raw_v5_results:
        print(a)
        for n in a:
            print(n)
            n = np.asarray(n)
            print('array', n)
            print(np.shape(n))
            # translate feature indices back into human-readable tokens
            b = get_stoplist_tokens(minipop, n, 'sound', 'latin')
            v5_results.append(b)
    print(v5_results)
    print('v3 results:')
    for a in v3_results:
        print(a)
    print('v5 length:', len(v5_results), 'v3 length:', len(v3_results))
    assert False
def test_greek_to_latin_corpus_basis(g2lpop, mini_g2l_metadata, v3checker):
    """Greek-to-Latin search with corpus frequency basis, checked via v3checker."""
    g2l_texts = g2lpop.find(Text.collection,
                            title=[m['title'] for m in mini_g2l_metadata])
    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    g2lpop.insert(search_result)
    engine = GreekToLatinSearch(g2lpop)
    v5_matches = engine.match(search_result,
                              TextOptions(g2l_texts[0], 'line'),
                              TextOptions(g2l_texts[1], 'line'),
                              greek_stopwords=[],
                              latin_stopwords=['et', 'non', 'iam'],
                              freq_basis='corpus',
                              max_distance=999,
                              distance_basis='frequency',
                              min_score=0)
    g2lpop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    g2lpop.update(search_result)
    # v3checker handles loading the v3 file and comparing results
    v3checker.check_search_results(g2lpop, search_result.id,
                                   g2l_texts[0].path, 'mini_g2l_corpus.tab')
def test_greek_multitext_search(minipop):
    """Multitext search must report every feature bigram of each match."""
    feature = 'lemmata'
    language = 'greek'
    greek_texts = minipop.find(Text.collection, language=language)
    results_id = uuid.uuid4()
    search_result = Search(
        results_id=results_id,
        status=Search.INIT,
        msg='',
        # see tesserae.utils.search for how to actually set up Search
    )
    minipop.insert(search_result)
    engine = SparseMatrixSearch(minipop)
    matches = engine.match(search_result,
                           TextOptions(greek_texts[0], 'line'),
                           TextOptions(greek_texts[1], 'line'),
                           'lemmata',
                           stopwords=[
                               'ὁ', 'ὅς', 'καί', 'αβγ', 'ἐγώ', 'δέ',
                               'οὗτος', 'ἐμός'
                           ],
                           stopword_basis='corpus',
                           score_basis='lemmata',
                           freq_basis='corpus',
                           max_distance=10,
                           distance_basis='span',
                           min_score=0)
    results = multitext_search(search_result, minipop, matches, feature,
                               'line', greek_texts)
    assert len(results) == len(matches)
    for multi_result, match in zip(results, matches):
        # every 2-combination of the match's sorted features must be present
        expected_bigrams = list(
            itertools.combinations(sorted(match.matched_features), 2))
        assert len(expected_bigrams) == len(multi_result)
        for bigram in expected_bigrams:
            assert bigram in multi_result
def test_mini_latin_search_text_freqs(minipop, mini_latin_metadata, v3checker):
    """Latin text-frequency lemmata search verified via the v3checker fixture."""
    latin_texts = minipop.find(Text.collection,
                               title=[m['title'] for m in mini_latin_metadata])
    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)
    engine = SparseMatrixSearch(minipop)
    v5_matches = engine.match(search_result,
                              TextOptions(latin_texts[0], 'line'),
                              TextOptions(latin_texts[1], 'line'),
                              'lemmata',
                              stopwords=['et', 'neque', 'qui'],
                              stopword_basis='texts',
                              score_basis='lemmata',
                              freq_basis='texts',
                              max_distance=10,
                              distance_basis='frequency',
                              min_score=0)
    minipop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    minipop.update(search_result)
    # v3checker handles loading the v3 file and comparing results
    v3checker.check_search_results(minipop, search_result.id,
                                   latin_texts[0].path,
                                   'mini_latin_results.tab')
def test_mini_punctuation(punctpop, mini_punctuation_metadata):
    """Search over punctuation-heavy texts should complete without raising."""
    punct_texts = punctpop.find(
        Text.collection,
        title=[m['title'] for m in mini_punctuation_metadata])
    results_id = uuid.uuid4()
    search_result = Search(
        results_id=results_id,
        status=Search.INIT,
        msg='',
        # see tesserae.utils.search for how to actually set up Search
    )
    punctpop.insert(search_result)
    # result value is irrelevant here; the test passes if no exception is raised
    engine = SparseMatrixSearch(punctpop)
    engine.match(search_result.id,
                 TextOptions(punct_texts[0], 'phrase'),
                 TextOptions(punct_texts[1], 'phrase'),
                 'lemmata',
                 stopwords=10,
                 stopword_basis='corpus',
                 score_basis='stem',
                 frequency_basis='corpus',
                 max_distance=10,
                 distance_metric='span',
                 min_score=0)
def test_greek_sound(minipop, mini_greek_metadata, v3checker):
    """Sound-feature search on mini Greek texts, verified via v3checker."""
    greek_texts = minipop.find(Text.collection,
                               title=[m['title'] for m in mini_greek_metadata])
    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)
    engine = SparseMatrixSearch(minipop)
    v5_matches = engine.match(
        search_result,
        TextOptions(greek_texts[0], 'phrase'),
        TextOptions(greek_texts[1], 'phrase'),
        'sound',
        stopwords=['και', 'του', 'αλλ', 'ειν', 'μεν', 'μοι', 'αυτ', 'ους'],
        stopword_basis='texts',
        score_basis='sound',
        freq_basis='texts',
        max_distance=999,
        distance_basis='span',
        min_score=0)
    minipop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    minipop.update(search_result)
    # v3checker handles loading the v3 file and comparing results
    v3checker.check_search_results(minipop, search_result.id,
                                   greek_texts[0].path,
                                   'mini_greek_results_3gr.tab')
def test_greek_trigrams(minipop, mini_greek_metadata):
    """
    For the purpose of visualization. Use to confirm that trigrams are being
    stored in the database correctly.

    It should be noted that v5 results do not have stopwords filtered out,
    while v3 results probably do.

    Ends with ``assert False`` so pytest always displays the printed output;
    this is a debugging aid, not a pass/fail correctness check.
    """
    texts = minipop.find(Text.collection,
                         title=[m['title'] for m in mini_greek_metadata])
    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)
    v5_results = []
    v3_results = []
    raw_v5_results = []
    # sound-feature indices for each 'line' unit of the first text
    target_units = _get_units(minipop, TextOptions(texts[0], 'line'), 'sound')
    for b in target_units:
        raw_v5_results.append(b['features'])
    raw_v3_results = _load_v3_results(texts[0].path,
                                      'mini_greek_results_3gr.tab')
    for a in raw_v3_results:
        v3_results.append(a['matched_features'])
    print('v5 results:')
    for a in raw_v5_results:
        print(a)
        for n in a:
            # print(n)
            n = np.asarray(n)
            # print('array',n)
            # print('shape', np.shape(n))
            # translate feature indices back into human-readable tokens
            b = get_stoplist_tokens(minipop, n, 'sound', 'greek')
            v5_results.append(b)
    print(v5_results)
    print('v3 results:')
    for a in v3_results:
        print(a)
    print('v5 length:', len(v5_results), 'v3 length:', len(v3_results))
    assert False
def test_greek_to_latin_inv_freq_by_text(g2lpop, v3checker):
    """Per-text Greek-to-Latin inverse frequencies must agree with v3 counts."""
    greek_to_latin = load_greek_to_latin()
    greek_ind_to_other_greek_inds = _build_greek_ind_to_other_greek_inds(
        g2lpop, greek_to_latin)
    greek_text = g2lpop.find(Text.collection, language='greek')[0]
    greek_text_options = TextOptions(greek_text, 'line')
    # total token count across all lemmata units of the text
    greek_text_length = sum(
        len(unit['forms'])
        for unit in _get_units(g2lpop, greek_text_options, 'lemmata'))
    inv_freqs = _get_greek_to_latin_inv_freqs_by_text(
        g2lpop, greek_text_options, greek_text_length,
        greek_ind_to_other_greek_inds)
    v3_total, v3_counts = v3checker._load_v3_mini_text_freqs_file(
        g2lpop, greek_text, 'g_l')
    assert len(v3_counts) == len(inv_freqs)
    # map feature index -> token, used to label any assertion failure
    greek_forms = {
        feat.index: feat.token
        for feat in g2lpop.find(
            Feature.collection, language='greek', feature='form')
    }
    for token, count in v3_counts.items():
        assert token in inv_freqs
        assert math.isclose(inv_freqs[token],
                            float(v3_total) / count), greek_forms[token]
def main():
    """Perform Tesserae search and display the top 10 results"""
    args = parse_args()
    if args.password:
        # NOTE(review): the remainder of this statement appears corrupted —
        # a credential/connection-setup block seems to have been redacted
        # ('******') when this file was sanitized.  `connection` and
        # `source_author`, both used below, are never defined in the visible
        # code; restore the original setup before running.
        password = getpass(prompt='Tesserae MongoDB Password: '******'-', ' ')
    source_title = args.source_title.lower().replace('-', ' ')
    # resolve the source text by author/title and wrap it with its unit type
    source = TextOptions(text=connection.find('texts',
                                              author=source_author,
                                              title=source_title)[0],
                         unit_type=args.source_unit)
    # CLI uses '_' separators for the target; normalize to spaces for lookup
    target_author = args.target_author.lower().replace('_', ' ')
    target_title = args.target_title.lower().replace('_', ' ')
    target = TextOptions(text=connection.find('texts',
                                              author=target_author,
                                              title=target_title)[0],
                         unit_type=args.target_unit)
    start = time.time()
    # stoplist basis: whole corpus, or just the two texts being searched
    stopword_indices = create_stoplist(
        connection,
        args.n_stopwords,
        args.feature,
        source.text.language,
        basis='corpus' if args.stopword_basis == 'corpus' else
        [source.text.id, target.text.id])
    stopword_tokens = get_stoplist_tokens(connection, stopword_indices,
                                          args.feature, source.text.language)
    # parameters dict doubles as the cache key for check_cache below
    parameters = {
        'source': {
            'object_id': str(source.text.id),
            'units': source.unit_type
        },
        'target': {
            'object_id': str(target.text.id),
            'units': target.unit_type
        },
        'method': {
            'name': SparseMatrixSearch.matcher_type,
            'feature': args.feature,
            'stopwords': stopword_tokens,
            'freq_basis': args.freq_basis,
            'max_distance': args.max_distance,
            'distance_basis': args.distance_basis
        }
    }
    results_id = check_cache(connection, parameters['source'],
                             parameters['target'], parameters['method'])
    if results_id:
        # an identical search already ran; reuse its stored results
        print('Cached results found.')
        search = connection.find(Search.collection,
                                 results_id=results_id,
                                 search_type=NORMAL_SEARCH)[0]
    else:
        search = Search(results_id=uuid.uuid4().hex,
                        search_type=NORMAL_SEARCH,
                        parameters=parameters)
        connection.insert(search)
        search_params = {
            'source': source,
            'target': target,
            'feature': parameters['method']['feature'],
            'stopwords': parameters['method']['stopwords'],
            'freq_basis': parameters['method']['freq_basis'],
            'max_distance': parameters['method']['max_distance'],
            'distance_basis': parameters['method']['distance_basis'],
            'min_score': 0
        }
        _run_search(connection, search, SparseMatrixSearch.matcher_type,
                    search_params)
    matches = get_results(connection, search.id, PageOptions())
    end = time.time() - start
    # display highest-scoring matches first
    matches.sort(key=lambda x: x['score'], reverse=True)
    print(f'Search found {len(matches)} matches in {end}s.')
    display_count = 10 if len(matches) >= 10 else len(matches)
    print(f'The Top {display_count} Matches')
    print('------------------')
    print()
    print("Result\tScore\tSource Locus\tTarget Locus\tShared")
    for i, m in enumerate(matches[:10]):
        shared = m['matched_features']
        print(f'{i}.\t{m["score"]}\t{m["source_tag"]}\t{m["target_tag"]}\t'
              f'{[t for t in shared]}')
def submit_search():
    """Run a Tesserae search.

    Validates the JSON request body (source/target unit specifications,
    referenced text object_ids, and method parameters), then either
    redirects (303) to cached results for an identical prior search or
    queues a new search job and responds 201 with a Location header
    pointing at the results resource.
    """
    error_response, received = apitess.errors.check_body(flask.request)
    if error_response:
        return error_response
    requireds = {'source', 'target', 'method'}
    miss_error = apitess.errors.check_requireds(received, requireds)
    if miss_error:
        return miss_error

    source = received['source']
    target = received['target']
    errors = _validate_units(source, 'source')
    errors.extend(_validate_units(target, 'target'))
    if errors:
        return apitess.errors.error(
            400,
            data=received,
            message=('The following errors were found in source and target '
                     'unit specifications:\n{}'.format('\n\t'.join(errors))))

    # both referenced texts must exist in the database
    source_object_id = source['object_id']
    target_object_id = target['object_id']
    results = flask.g.db.find(
        tesserae.db.entities.Text.collection,
        _id=[ObjectId(source_object_id), ObjectId(target_object_id)])
    results = {str(t.id): t for t in results}
    errors = []
    if source_object_id not in results:
        errors.append(source_object_id)
    if target_object_id not in results:
        errors.append(target_object_id)
    if errors:
        return apitess.errors.error(
            400,
            data=received,
            message=('Unable to find the following object_id(s) among the '
                     'texts in the database:\n\t{}'.format(
                         '\n\t'.join(errors))))
    source_text = results[source_object_id]
    target_text = results[target_object_id]

    # required method keys, by search type
    method_requireds = {
        'original': {
            'name', 'feature', 'stopwords', 'score_basis', 'freq_basis',
            'max_distance', 'distance_basis'
        },
        'greek_to_latin': {
            'name', 'greek_stopwords', 'latin_stopwords', 'freq_basis',
            'max_distance', 'distance_basis'
        },
    }
    method = received['method']
    if 'name' not in method:
        return apitess.errors.error(400,
                                    data=received,
                                    message='No specified method name.')
    # guard unknown method names; previously this raised KeyError (HTTP 500)
    if method['name'] not in method_requireds:
        return apitess.errors.error(
            400,
            data=received,
            message='Unknown method name: {}'.format(method['name']))
    missing = [
        req for req in method_requireds[method['name']] if req not in method
    ]
    if missing:
        return apitess.errors.error(
            400,
            data=received,
            message=('The specified method is missing the following required '
                     'key(s): {}'.format(', '.join(missing))))
    if 'min_score' in method:
        try:
            method['min_score'] = float(method['min_score'])
        # TypeError covers non-numeric JSON values (e.g. null, list)
        except (TypeError, ValueError):
            # fixed: was apitess.error.error (the module is apitess.errors,
            # as used everywhere else in this handler)
            return apitess.errors.error(
                400,
                data=received,
                message=(f'Specified minimum score ({method["min_score"]}) '
                         'could not be converted into a number'))
    else:
        method['min_score'] = 0

    results_id = tesserae.utils.search.check_cache(flask.g.db, source, target,
                                                   method)
    if results_id:
        # identical search already ran; point the client at cached results
        response = flask.Response()
        response.status_code = 303
        response.status = '303 See Other'
        # Redirect should point to paginated results
        response.headers['Location'] = os.path.join(
            flask.request.base_url, results_id, '?' + '&'.join(
                f'{a}={b}' for a, b in {
                    'sort_by': 'score',
                    'sort_order': 'descending',
                    'per_page': '100',
                    'page_number': '0'
                }.items()))
        return response

    response = flask.Response()
    response.status_code = 201
    response.status = '201 Created'
    results_id = uuid.uuid4().hex
    # we want the final '/' on the URL
    response.headers['Location'] = os.path.join(flask.request.base_url,
                                                results_id, '')
    try:
        search_params = {
            'source': TextOptions(source_text, source['units']),
            'target': TextOptions(target_text, target['units']),
        }
        search_params.update(
            {key: method[key] for key in method if key != 'name'})
        tesserae.utils.search.submit_search(flask.g.jobqueue, flask.g.db,
                                            results_id, method['name'],
                                            search_params)
    except queue.Full:
        # fixed: was apitess.error.error (the module is apitess.errors)
        return apitess.errors.error(
            500,
            data=received,
            message=('The search request could not be added to the queue. '
                     'Please try again in a few minutes'))
    return response