def test_mini_greek_search_corpus_freqs(minipop, mini_greek_metadata):
    """Check a corpus-basis Greek lemmata search against a v3 results file.

    Matches the first two texts found for the titles in
    ``mini_greek_metadata`` phrase-to-phrase, stores the matches, and hands
    the score-sorted v5 results plus the rows loaded from
    ``mini_greek_corpus_results.tab`` to ``_check_search_results``.
    """
    wanted_titles = [entry['title'] for entry in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=wanted_titles)
    results_id = uuid.uuid4()
    # see tesserae.utils.search for how to actually set up Search
    search_result = Search(results_id=results_id, status=Search.INIT, msg='')
    minipop.insert(search_result)

    matcher = SparseMatrixSearch(minipop)
    match_options = {
        'stopwords': 10,
        'stopword_basis': 'corpus',
        'score_basis': 'stem',
        'frequency_basis': 'corpus',
        'max_distance': 10,
        'distance_metric': 'span',
        'min_score': 0,
    }
    v5_matches = matcher.match(search_result.id,
                               TextOptions(texts[0], 'phrase'),
                               TextOptions(texts[1], 'phrase'),
                               'lemmata',
                               **match_options)

    # Persist the matches and mark the search finished before reading back.
    minipop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    minipop.update(search_result)

    # Highest score first.
    v5_results = sorted(get_results(minipop, results_id),
                        key=lambda row: -row['score'])
    v3_results = _load_v3_results(texts[0].path,
                                  'mini_greek_corpus_results.tab')
    _check_search_results(v5_results, v3_results)
def test_mini_greek_search_text_freqs(minipop, mini_greek_metadata):
    """Check a texts-basis Greek lemmata search against a v3 results file.

    Uses an explicit stopword list with phrase units, stores the matches,
    prints the v5 and v3 result counts, and passes the score-sorted v5
    results plus the rows loaded from ``mini_greek_results.tab`` to
    ``_check_search_results``.
    """
    wanted_titles = [entry['title'] for entry in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=wanted_titles)
    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)

    matcher = SparseMatrixSearch(minipop)
    match_options = {
        'stopwords': [
            'ὁ', 'ὅς', 'καί', 'αβγ', 'ἐγώ', 'δέ',
            'οὗτος', 'ἐμός'
        ],
        'stopword_basis': 'texts',
        'score_basis': 'stem',
        'frequency_basis': 'texts',
        'max_distance': 10,
        'distance_metric': 'span',
        'min_score': 0,
    }
    v5_matches = matcher.match(search_result.id,
                               TextOptions(texts[0], 'phrase'),
                               TextOptions(texts[1], 'phrase'),
                               'lemmata',
                               **match_options)

    # Persist the matches and mark the search finished before reading back.
    minipop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    minipop.update(search_result)

    # Highest score first.
    v5_results = sorted(get_results(minipop, results_id),
                        key=lambda row: -row['score'])
    v3_results = _load_v3_results(texts[0].path, 'mini_greek_results.tab')
    print(len(v5_results), len(v3_results))
    _check_search_results(v5_results, v3_results)
Пример #3
0
def test_greek_sound(minipop, mini_greek_metadata):
    """Check a Greek 'sound' feature search against a v3 results file.

    Prints the matched features of every v3 and v5 result, then both result
    counts, before passing the score-sorted v5 results and the rows loaded
    from ``mini_greek_results_3gr.tab`` to ``_check_search_results``.
    """
    wanted_titles = [entry['title'] for entry in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=wanted_titles)
    search_result = Search(results_id=uuid.uuid4())
    minipop.insert(search_result)

    matcher = SparseMatrixSearch(minipop)
    match_options = {
        'stopwords': ['και', 'του', 'αλλ', 'ειν', 'μεν', 'μοι', 'αυτ', 'ους'],
        'stopword_basis': 'texts',
        'score_basis': '3gr',
        'freq_basis': 'texts',
        'max_distance': 999,
        'distance_basis': 'span',
        'min_score': 0,
    }
    v5_matches = matcher.match(search_result,
                               TextOptions(texts[0], 'phrase'),
                               TextOptions(texts[1], 'phrase'),
                               'sound',
                               **match_options)

    # Persist the matches and mark the search finished before reading back.
    minipop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    minipop.update(search_result)

    # Highest score first.
    v5_results = sorted(get_results(minipop, search_result.id, PageOptions()),
                        key=lambda row: -row['score'])
    v3_results = _load_v3_results(texts[0].path, 'mini_greek_results_3gr.tab')

    # Dump both sides (v3 first) so a failing run shows what was compared.
    for label, rows in (('v3 trigrams:', v3_results),
                        ('v5 trigrams:', v5_results)):
        for row in rows:
            print(label, row['matched_features'])
    print('v5 length:', len(v5_results), 'v3 length:', len(v3_results))
    _check_search_results(v5_results, v3_results)
Пример #4
0
def test_greek_semantic(minipop, mini_greek_metadata):
    """Check a Greek 'semantic' feature search against a v3 results file.

    Stores the matches, prints the v5 and v3 result counts, and passes the
    score-sorted v5 results plus the rows loaded from
    ``mini_greek_results_syn.tab`` to ``_check_search_results``.
    """
    wanted_titles = [entry['title'] for entry in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=wanted_titles)
    search_result = Search(results_id=uuid.uuid4())
    minipop.insert(search_result)

    matcher = SparseMatrixSearch(minipop)
    match_options = {
        'stopwords': [
            'τις', 'οὗτος', 'καί', 'αβγ', 'ἐγώ',
            'τηνόθι', 'τηνικαῦτα', 'τέκνον'
        ],
        'stopword_basis': 'texts',
        'score_basis': 'stem',
        'freq_basis': 'texts',
        'max_distance': 999,
        'distance_basis': 'span',
        'min_score': 0,
    }
    v5_matches = matcher.match(search_result,
                               TextOptions(texts[0], 'phrase'),
                               TextOptions(texts[1], 'phrase'),
                               'semantic',
                               **match_options)

    # Persist the matches and mark the search finished before reading back.
    minipop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    minipop.update(search_result)

    # Highest score first.
    v5_results = sorted(get_results(minipop, search_result.id, PageOptions()),
                        key=lambda row: -row['score'])
    v3_results = _load_v3_results(texts[0].path, 'mini_greek_results_syn.tab')
    print(len(v5_results), len(v3_results))
    _check_search_results(v5_results, v3_results)
Пример #5
0
def test_mini_greek_search_text_freqs(minipop, mini_greek_metadata, v3checker):
    """Run a texts-basis Greek lemmata search and verify it via ``v3checker``.

    The stored matches are checked by ``v3checker.check_search_results``
    using ``mini_greek_results.tab``.
    """
    wanted_titles = [entry['title'] for entry in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=wanted_titles)
    search_result = Search(results_id=uuid.uuid4())
    minipop.insert(search_result)

    matcher = SparseMatrixSearch(minipop)
    match_options = {
        'stopwords': [
            'ὁ', 'ὅς', 'καί', 'αβγ', 'ἐγώ', 'δέ',
            'οὗτος', 'ἐμός'
        ],
        'stopword_basis': 'texts',
        'score_basis': 'lemmata',
        'freq_basis': 'texts',
        'max_distance': 10,
        'distance_basis': 'span',
        'min_score': 0,
    }
    v5_matches = matcher.match(search_result,
                               TextOptions(texts[0], 'phrase'),
                               TextOptions(texts[1], 'phrase'),
                               'lemmata',
                               **match_options)

    # Persist the matches and mark the search finished before checking.
    minipop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    minipop.update(search_result)

    v3checker.check_search_results(minipop, search_result.id, texts[0].path,
                                   'mini_greek_results.tab')
Пример #6
0
def test_english(engpop, eng_metadata, v3checker):
    """Run a line-based English 'form' search and verify it via ``v3checker``.

    Uses an explicit ten-word stopword list and ``min_score=6.0``; the
    stored matches are checked against ``eng_time.tab``.
    """
    wanted_titles = [entry['title'] for entry in eng_metadata]
    texts = engpop.find(Text.collection, title=wanted_titles)
    search_result = Search(results_id=uuid.uuid4())
    engpop.insert(search_result)

    matcher = SparseMatrixSearch(engpop)
    english_stopwords = [
        "the", "and", "of", "a", "to",
        "in", "that", "with", "i", "by",
    ]
    match_options = {
        'stopwords': english_stopwords,
        'stopword_basis': 'texts',
        'score_basis': 'form',
        'freq_basis': 'texts',
        'max_distance': 10,
        'distance_basis': 'frequency',
        'min_score': 6.0,
    }
    v5_matches = matcher.match(search_result,
                               TextOptions(texts[0], 'line'),
                               TextOptions(texts[1], 'line'),
                               'form',
                               **match_options)

    # Persist the matches and mark the search finished before checking.
    engpop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    engpop.update(search_result)

    v3checker.check_search_results(engpop, search_result.id, texts[0].path,
                                   'eng_time.tab')
def test_mini_latin_search_text_freqs(minipop, mini_latin_metadata):
    """Check a texts-basis Latin lemmata search against a v3 results file.

    Matches line units with an explicit stopword list, stores the matches,
    and passes the score-sorted v5 results plus the rows loaded from
    ``mini_latin_results.tab`` to ``_check_search_results``.
    """
    wanted_titles = [entry['title'] for entry in mini_latin_metadata]
    texts = minipop.find(Text.collection, title=wanted_titles)
    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)

    matcher = SparseMatrixSearch(minipop)
    match_options = {
        'stopwords': ['et', 'neque', 'qui'],
        'stopword_basis': 'texts',
        'score_basis': 'stem',
        'frequency_basis': 'texts',
        'max_distance': 10,
        'distance_metric': 'frequency',
        'min_score': 0,
    }
    v5_matches = matcher.match(search_result.id,
                               TextOptions(texts[0], 'line'),
                               TextOptions(texts[1], 'line'),
                               'lemmata',
                               **match_options)

    # Persist the matches and mark the search finished before reading back.
    minipop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    minipop.update(search_result)

    # Highest score first.
    v5_results = sorted(get_results(minipop, results_id),
                        key=lambda row: -row['score'])
    v3_results = _load_v3_results(texts[0].path, 'mini_latin_results.tab')
    _check_search_results(v5_results, v3_results)
Пример #8
0
def test_lucverg(lucvergpop, lucverg_metadata):
    """Check a line-based lemmata search on the lucverg texts against v3.

    Stores the matches and passes the score-sorted v5 results plus the rows
    loaded from ``lucverg_time.tab`` to ``_check_search_results``.
    """
    wanted_titles = [entry['title'] for entry in lucverg_metadata]
    texts = lucvergpop.find(Text.collection, title=wanted_titles)
    search_result = Search(results_id=uuid.uuid4())
    lucvergpop.insert(search_result)

    matcher = SparseMatrixSearch(lucvergpop)
    match_options = {
        'stopwords': [
            "et", "qui", "quis", "in", "sum", "hic",
            "non", "tu", "neque", "ego"
        ],
        'stopword_basis': 'texts',
        'score_basis': 'stem',
        'freq_basis': 'texts',
        'max_distance': 10,
        'distance_basis': 'frequency',
        'min_score': 0,
    }
    v5_matches = matcher.match(search_result,
                               TextOptions(texts[0], 'line'),
                               TextOptions(texts[1], 'line'),
                               'lemmata',
                               **match_options)

    # Persist the matches and mark the search finished before reading back.
    lucvergpop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    lucvergpop.update(search_result)

    # Highest score first.
    v5_results = sorted(
        get_results(lucvergpop, search_result.id, PageOptions()),
        key=lambda row: -row['score'])
    v3_results = _load_v3_results(texts[0].path, 'lucverg_time.tab')
    _check_search_results(v5_results, v3_results)
Пример #9
0
def test_latin_trigrams(minipop, mini_latin_metadata):
    """Print stored Latin 'sound' features next to the v3 matched features.

    This test only prints; it always ends in ``assert False``, so it never
    passes as written.
    """
    wanted_titles = [entry['title'] for entry in mini_latin_metadata]
    texts = minipop.find(Text.collection, title=wanted_titles)
    search_result = Search(results_id=uuid.uuid4())
    minipop.insert(search_result)

    target_units = _get_units(minipop, TextOptions(texts[0], 'line'), 'sound')
    raw_v5_results = [unit['features'] for unit in target_units]
    raw_v3_results = _load_v3_results(texts[0].path,
                                      'mini_latin_results_3gr.tab')
    v3_results = [row['matched_features'] for row in raw_v3_results]

    v5_results = []
    print('v5 results:')
    for unit_features in raw_v5_results:
        print(unit_features)
        for raw_indices in unit_features:
            print(raw_indices)
            indices = np.asarray(raw_indices)
            print('array', indices)
            print(np.shape(indices))
            tokens = get_stoplist_tokens(minipop, indices, 'sound', 'latin')
            v5_results.append(tokens)
    print(v5_results)

    print('v3 results:')
    for matched in v3_results:
        print(matched)
    print('v5 length:', len(v5_results), 'v3 length:', len(v3_results))
    assert False
Пример #10
0
def test_greek_to_latin_corpus_basis(g2lpop, mini_g2l_metadata, v3checker):
    """Run a corpus-basis ``GreekToLatinSearch`` and verify via ``v3checker``.

    Matches line units with no Greek stopwords and three Latin stopwords;
    the stored matches are checked against ``mini_g2l_corpus.tab``.
    """
    wanted_titles = [entry['title'] for entry in mini_g2l_metadata]
    texts = g2lpop.find(Text.collection, title=wanted_titles)
    search_result = Search(results_id=uuid.uuid4())
    g2lpop.insert(search_result)

    matcher = GreekToLatinSearch(g2lpop)
    match_options = {
        'greek_stopwords': [],
        'latin_stopwords': ['et', 'non', 'iam'],
        'freq_basis': 'corpus',
        'max_distance': 999,
        'distance_basis': 'frequency',
        'min_score': 0,
    }
    v5_matches = matcher.match(search_result,
                               TextOptions(texts[0], 'line'),
                               TextOptions(texts[1], 'line'),
                               **match_options)

    # Persist the matches and mark the search finished before checking.
    g2lpop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    g2lpop.update(search_result)

    v3checker.check_search_results(g2lpop, search_result.id, texts[0].path,
                                   'mini_g2l_corpus.tab')
Пример #11
0
def test_greek_multitext_search(minipop):
    """Check ``multitext_search`` output against the matches it was given.

    Asserts there is one result per match, and that each result has one
    entry for every pair drawn from that match's sorted matched features.
    """
    feature = 'lemmata'
    language = 'greek'
    texts = minipop.find(Text.collection, language=language)

    results_id = uuid.uuid4()
    # see tesserae.utils.search for how to actually set up Search
    search_result = Search(results_id=results_id, status=Search.INIT, msg='')
    minipop.insert(search_result)

    matcher = SparseMatrixSearch(minipop)
    match_options = {
        'stopwords': [
            'ὁ', 'ὅς', 'καί', 'αβγ', 'ἐγώ', 'δέ',
            'οὗτος', 'ἐμός'
        ],
        'stopword_basis': 'corpus',
        'score_basis': 'lemmata',
        'freq_basis': 'corpus',
        'max_distance': 10,
        'distance_basis': 'span',
        'min_score': 0,
    }
    matches = matcher.match(search_result,
                            TextOptions(texts[0], 'line'),
                            TextOptions(texts[1], 'line'),
                            'lemmata',
                            **match_options)

    results = multitext_search(search_result, minipop, matches, feature,
                               'line', texts)
    assert len(results) == len(matches)
    for result, match in zip(results, matches):
        expected_pairs = list(
            itertools.combinations(sorted(match.matched_features), 2))
        assert len(expected_pairs) == len(result)
        for pair in expected_pairs:
            assert pair in result
Пример #12
0
def test_mini_latin_search_text_freqs(minipop, mini_latin_metadata, v3checker):
    """Run a texts-basis Latin lemmata search and verify it via ``v3checker``.

    The stored matches are checked by ``v3checker.check_search_results``
    using ``mini_latin_results.tab``.
    """
    wanted_titles = [entry['title'] for entry in mini_latin_metadata]
    texts = minipop.find(Text.collection, title=wanted_titles)
    search_result = Search(results_id=uuid.uuid4())
    minipop.insert(search_result)

    matcher = SparseMatrixSearch(minipop)
    match_options = {
        'stopwords': ['et', 'neque', 'qui'],
        'stopword_basis': 'texts',
        'score_basis': 'lemmata',
        'freq_basis': 'texts',
        'max_distance': 10,
        'distance_basis': 'frequency',
        'min_score': 0,
    }
    v5_matches = matcher.match(search_result,
                               TextOptions(texts[0], 'line'),
                               TextOptions(texts[1], 'line'),
                               'lemmata',
                               **match_options)

    # Persist the matches and mark the search finished before checking.
    minipop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    minipop.update(search_result)

    v3checker.check_search_results(minipop, search_result.id, texts[0].path,
                                   'mini_latin_results.tab')
def test_mini_punctuation(punctpop, mini_punctuation_metadata):
    """Run a phrase-based lemmata search over the punctuation texts.

    The matches are discarded and nothing is asserted, so the test passes
    whenever ``match`` returns without raising.
    """
    wanted_titles = [entry['title'] for entry in mini_punctuation_metadata]
    texts = punctpop.find(Text.collection, title=wanted_titles)
    results_id = uuid.uuid4()
    # see tesserae.utils.search for how to actually set up Search
    search_result = Search(results_id=results_id, status=Search.INIT, msg='')
    punctpop.insert(search_result)

    matcher = SparseMatrixSearch(punctpop)
    match_options = {
        'stopwords': 10,
        'stopword_basis': 'corpus',
        'score_basis': 'stem',
        'frequency_basis': 'corpus',
        'max_distance': 10,
        'distance_metric': 'span',
        'min_score': 0,
    }
    matcher.match(search_result.id,
                  TextOptions(texts[0], 'phrase'),
                  TextOptions(texts[1], 'phrase'),
                  'lemmata',
                  **match_options)
Пример #14
0
def test_greek_sound(minipop, mini_greek_metadata, v3checker):
    """Run a Greek 'sound' feature search and verify it via ``v3checker``.

    The stored matches are checked by ``v3checker.check_search_results``
    using ``mini_greek_results_3gr.tab``.
    """
    wanted_titles = [entry['title'] for entry in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=wanted_titles)
    search_result = Search(results_id=uuid.uuid4())
    minipop.insert(search_result)

    matcher = SparseMatrixSearch(minipop)
    match_options = {
        'stopwords': ['και', 'του', 'αλλ', 'ειν', 'μεν', 'μοι', 'αυτ', 'ους'],
        'stopword_basis': 'texts',
        'score_basis': 'sound',
        'freq_basis': 'texts',
        'max_distance': 999,
        'distance_basis': 'span',
        'min_score': 0,
    }
    v5_matches = matcher.match(search_result,
                               TextOptions(texts[0], 'phrase'),
                               TextOptions(texts[1], 'phrase'),
                               'sound',
                               **match_options)

    # Persist the matches and mark the search finished before checking.
    minipop.insert_nocheck(v5_matches)
    search_result.status = Search.DONE
    minipop.update(search_result)

    v3checker.check_search_results(minipop, search_result.id, texts[0].path,
                                   'mini_greek_results_3gr.tab')
Пример #15
0
def test_greek_trigrams(minipop, mini_greek_metadata):
    """
    Visualization aid rather than a real check.

    Use it to confirm that trigrams are stored in the database correctly.
    Note that the v5 results do not have stopwords filtered out, while the
    v3 results probably do. The test always ends in ``assert False``.
    """
    wanted_titles = [entry['title'] for entry in mini_greek_metadata]
    texts = minipop.find(Text.collection, title=wanted_titles)
    search_result = Search(results_id=uuid.uuid4())
    minipop.insert(search_result)

    target_units = _get_units(minipop, TextOptions(texts[0], 'line'), 'sound')
    raw_v5_results = [unit['features'] for unit in target_units]
    raw_v3_results = _load_v3_results(texts[0].path,
                                      'mini_greek_results_3gr.tab')
    v3_results = [row['matched_features'] for row in raw_v3_results]

    v5_results = []
    print('v5 results:')
    for unit_features in raw_v5_results:
        print(unit_features)
        for raw_indices in unit_features:
            indices = np.asarray(raw_indices)
            tokens = get_stoplist_tokens(minipop, indices, 'sound', 'greek')
            v5_results.append(tokens)
    print(v5_results)

    print('v3 results:')
    for matched in v3_results:
        print(matched)
    print('v5 length:', len(v5_results), 'v3 length:', len(v3_results))
    assert False
Пример #16
0
def test_greek_to_latin_inv_freq_by_text(g2lpop, v3checker):
    """Compare per-text Greek-to-Latin inverse frequencies with v3 counts.

    For every token in the v3 counts, the computed inverse frequency must be
    close to ``v3_total / count``. A failing comparison reports the Greek
    form token stored under that index.
    """
    translation_table = load_greek_to_latin()
    greek_ind_to_other_greek_inds = _build_greek_ind_to_other_greek_inds(
        g2lpop, translation_table)

    greek_text = g2lpop.find(Text.collection, language='greek')[0]
    greek_text_options = TextOptions(greek_text, 'line')
    lemmata_units = _get_units(g2lpop, greek_text_options, 'lemmata')
    greek_text_length = sum(len(unit['forms']) for unit in lemmata_units)

    inv_freqs = _get_greek_to_latin_inv_freqs_by_text(
        g2lpop, greek_text_options, greek_text_length,
        greek_ind_to_other_greek_inds)
    v3_total, v3_counts = v3checker._load_v3_mini_text_freqs_file(
        g2lpop, greek_text, 'g_l')
    assert len(v3_counts) == len(inv_freqs)

    # Index -> token lookup, used only for the failure message below.
    form_features = g2lpop.find(
        Feature.collection, language='greek', feature='form')
    greek_forms = {form.index: form.token for form in form_features}

    for token, count in v3_counts.items():
        assert token in inv_freqs
        expected_inv_freq = float(v3_total) / count
        assert math.isclose(inv_freqs[token],
                            expected_inv_freq), greek_forms[token]
Пример #17
0
def main():
    """Perform Tesserae search and display the top 10 results

    Looks up the source and target texts, builds a stoplist, and asks
    ``check_cache`` for an existing results id. When none is found, a new
    ``Search`` is inserted and ``_run_search`` is called with the
    ``SparseMatrixSearch`` matcher type. Finally the matches are sorted by
    score (descending) and up to ten are printed as tab-separated rows.
    """
    args = parse_args()
    if args.password:
        # NOTE(review): the next line is not valid Python. The '******' run
        # looks like a credential scrubber swallowed the end of the getpass()
        # call and the statements after it. `connection` and `source_author`
        # are used below but never assigned in the visible code, so their
        # definitions were presumably in the lost span. Restore it from
        # version control instead of guessing.
        password = getpass(prompt='Tesserae MongoDB Password: '******'-', ' ')
    # NOTE(review): source names replace '-' with ' ' while target names
    # replace '_' with ' ' -- confirm which separator is intended.
    source_title = args.source_title.lower().replace('-', ' ')
    # Take the first text matching author/title; raises IndexError if none.
    source = TextOptions(text=connection.find('texts',
                                              author=source_author,
                                              title=source_title)[0],
                         unit_type=args.source_unit)
    target_author = args.target_author.lower().replace('_', ' ')
    target_title = args.target_title.lower().replace('_', ' ')
    target = TextOptions(text=connection.find('texts',
                                              author=target_author,
                                              title=target_title)[0],
                         unit_type=args.target_unit)

    # Timing covers stoplist creation, the cache check, the search itself
    # and result retrieval.
    start = time.time()
    # The stoplist basis is either the literal 'corpus' or the ids of the
    # two texts being compared.
    stopword_indices = create_stoplist(
        connection,
        args.n_stopwords,
        args.feature,
        source.text.language,
        basis='corpus' if args.stopword_basis == 'corpus' else
        [source.text.id, target.text.id])
    stopword_tokens = get_stoplist_tokens(connection, stopword_indices,
                                          args.feature, source.text.language)
    # These parameters are passed to check_cache and stored on a new Search.
    parameters = {
        'source': {
            'object_id': str(source.text.id),
            'units': source.unit_type
        },
        'target': {
            'object_id': str(target.text.id),
            'units': target.unit_type
        },
        'method': {
            'name': SparseMatrixSearch.matcher_type,
            'feature': args.feature,
            'stopwords': stopword_tokens,
            'freq_basis': args.freq_basis,
            'max_distance': args.max_distance,
            'distance_basis': args.distance_basis
        }
    }
    results_id = check_cache(connection, parameters['source'],
                             parameters['target'], parameters['method'])
    if results_id:
        # Reuse the stored Search instead of running the search again.
        print('Cached results found.')
        search = connection.find(Search.collection,
                                 results_id=results_id,
                                 search_type=NORMAL_SEARCH)[0]
    else:
        search = Search(results_id=uuid.uuid4().hex,
                        search_type=NORMAL_SEARCH,
                        parameters=parameters)
        connection.insert(search)
        # _run_search gets the TextOptions objects themselves, not the
        # string ids stored in `parameters`.
        search_params = {
            'source': source,
            'target': target,
            'feature': parameters['method']['feature'],
            'stopwords': parameters['method']['stopwords'],
            'freq_basis': parameters['method']['freq_basis'],
            'max_distance': parameters['method']['max_distance'],
            'distance_basis': parameters['method']['distance_basis'],
            'min_score': 0
        }
        _run_search(connection, search, SparseMatrixSearch.matcher_type,
                    search_params)
    matches = get_results(connection, search.id, PageOptions())
    # `end` holds the elapsed seconds, not a timestamp.
    end = time.time() - start
    matches.sort(key=lambda x: x['score'], reverse=True)
    print(f'Search found {len(matches)} matches in {end}s.')
    display_count = 10 if len(matches) >= 10 else len(matches)
    print(f'The Top {display_count} Matches')
    print('------------------')
    print()
    print("Result\tScore\tSource Locus\tTarget Locus\tShared")
    # enumerate() starts at 0, so the first printed row is numbered "0.".
    for i, m in enumerate(matches[:10]):
        shared = m['matched_features']
        print(f'{i}.\t{m["score"]}\t{m["source_tag"]}\t{m["target_tag"]}\t'
              f'{[t for t in shared]}')
Пример #18
0
def submit_search():
    """Validate a search request and queue it as a Tesserae search job.

    The body of the current Flask request must contain ``source``,
    ``target`` and ``method``. Both texts are looked up by ``object_id`` and
    the method is checked for the keys its ``name`` requires. ``min_score``
    is optional: it is coerced to ``float`` and defaults to 0.

    Returns:
        One of the following responses:

        * the error response from ``apitess.errors.check_body`` or
          ``apitess.errors.check_requireds`` when either reports a problem;
        * a 400 error for invalid unit specifications, unknown text ids, a
          missing or unrecognized method name, missing method keys, or a
          ``min_score`` that cannot be converted to a number;
        * ``303 See Other`` pointing at the paginated results when
          ``check_cache`` finds results for the same parameters;
        * ``201 Created`` with a ``Location`` header for the new results
          when the search was submitted to the job queue;
        * a 500 error when the job queue is full.
    """
    error_response, received = apitess.errors.check_body(flask.request)
    if error_response:
        return error_response
    requireds = {'source', 'target', 'method'}
    miss_error = apitess.errors.check_requireds(received, requireds)
    if miss_error:
        return miss_error

    source = received['source']
    target = received['target']

    errors = _validate_units(source, 'source')
    errors.extend(_validate_units(target, 'target'))
    if errors:
        return apitess.errors.error(
            400,
            data=received,
            message=('The following errors were found in source and target '
                     'unit specifications:\n{}'.format('\n\t'.join(errors))))

    source_object_id = source['object_id']
    target_object_id = target['object_id']
    found_texts = flask.g.db.find(
        tesserae.db.entities.Text.collection,
        _id=[ObjectId(source_object_id),
             ObjectId(target_object_id)])
    texts_by_id = {str(t.id): t for t in found_texts}
    unknown_ids = [
        object_id for object_id in (source_object_id, target_object_id)
        if object_id not in texts_by_id
    ]
    if unknown_ids:
        return apitess.errors.error(
            400,
            data=received,
            message=('Unable to find the following object_id(s) among the '
                     'texts in the database:\n\t{}'.format(
                         '\n\t'.join(unknown_ids))))
    source_text = texts_by_id[source_object_id]
    target_text = texts_by_id[target_object_id]

    method_requireds = {
        'original': {
            'name', 'feature', 'stopwords', 'score_basis', 'freq_basis',
            'max_distance', 'distance_basis'
        },
        'greek_to_latin': {
            'name', 'greek_stopwords', 'latin_stopwords', 'freq_basis',
            'max_distance', 'distance_basis'
        },
    }
    method = received['method']
    if 'name' not in method:
        return apitess.errors.error(400,
                                    data=received,
                                    message='No specified method name.')
    method_name = method['name']
    # An unknown (or non-string, possibly unhashable) name is a client
    # error; without this guard the lookup below raises and surfaces as a
    # 500.
    required_keys = (method_requireds.get(method_name) if isinstance(
        method_name, str) else None)
    if required_keys is None:
        return apitess.errors.error(
            400,
            data=received,
            message=('Unrecognized method name ({!r}); expected one of: '
                     '{}'.format(method_name,
                                 ', '.join(sorted(method_requireds)))))
    # Sorted so the error message is stable across runs (set order is not).
    missing = sorted(req for req in required_keys if req not in method)
    if missing:
        return apitess.errors.error(
            400,
            data=received,
            message=('The specified method is missing the following required '
                     'key(s): {}'.format(', '.join(missing))))

    if 'min_score' in method:
        try:
            method['min_score'] = float(method['min_score'])
        except (TypeError, ValueError):
            # TypeError covers JSON values such as null, lists and objects.
            return apitess.errors.error(
                400,
                data=received,
                message=(f'Specified minimum score ({method["min_score"]}) '
                         'could not be converted into a number'))
    else:
        method['min_score'] = 0

    results_id = tesserae.utils.search.check_cache(flask.g.db, source, target,
                                                   method)
    if results_id:
        response = flask.Response()
        response.status_code = 303
        response.status = '303 See Other'
        # Redirect should point to paginated results
        paging_defaults = {
            'sort_by': 'score',
            'sort_order': 'descending',
            'per_page': '100',
            'page_number': '0'
        }
        query = '&'.join(f'{a}={b}' for a, b in paging_defaults.items())
        response.headers['Location'] = os.path.join(flask.request.base_url,
                                                    results_id, '?' + query)
        return response

    response = flask.Response()
    response.status_code = 201
    response.status = '201 Created'
    results_id = uuid.uuid4().hex
    # we want the final '/' on the URL
    response.headers['Location'] = os.path.join(flask.request.base_url,
                                                results_id, '')

    search_params = {
        'source': TextOptions(source_text, source['units']),
        'target': TextOptions(target_text, target['units']),
    }
    # Every method key except its name is forwarded as a search parameter.
    search_params.update(
        {key: method[key]
         for key in method if key != 'name'})
    try:
        tesserae.utils.search.submit_search(flask.g.jobqueue, flask.g.db,
                                            results_id, method['name'],
                                            search_params)
    except queue.Full:
        return apitess.errors.error(
            500,
            data=received,
            message=('The search request could not be added to the queue. '
                     'Please try again in a few minutes'))
    return response