Example #1
def test_latin_trigrams(minipop, mini_latin_metadata):
    """
    For the purpose of visualization.
    Use to confirm that trigrams are being stored in the database correctly.
    Note that v5 results do not have stopwords filtered out, while v3
    results probably do.
    """
    texts = minipop.find(Text.collection,
                         title=[m['title'] for m in mini_latin_metadata])
    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)
    v5_results = []
    v3_results = []
    raw_v5_results = []
    target_units = _get_units(minipop, TextOptions(texts[0], 'line'), 'sound')
    for b in target_units:
        raw_v5_results.append(b['features'])
    raw_v3_results = _load_v3_results(texts[0].path,
                                      'mini_latin_results_3gr.tab')
    for a in raw_v3_results:
        v3_results.append(a['matched_features'])
    print('v5 results:')
    for a in raw_v5_results:
        print(a)
        for n in a:
            print(n)
            n = np.asarray(n)
            print('array', n)
            print(np.shape(n))
            b = get_stoplist_tokens(minipop, n, 'sound', 'latin')
            v5_results.append(b)
    print(v5_results)
    print('v3 results:')
    for a in v3_results:
        print(a)
    print('v5 length:', len(v5_results), 'v3 length:', len(v3_results))
    assert False  # fail deliberately so pytest shows the captured output
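This test (and its Greek twin below) leans on get_stoplist_tokens to turn stored integer feature indices back into readable strings. A minimal sketch of that mapping in isolation, assuming a populated connection (the name conn is hypothetical) and numpy imported as np:

import numpy as np

# each unit stores its 'sound' features as integer indices; resolving
# them recovers the trigram strings those indices encode
indices = np.asarray([0, 1, 2])  # hypothetical index values
trigrams = get_stoplist_tokens(conn, indices, 'sound', 'latin')
print(trigrams)  # one trigram string per index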
Example #2
def test_greek_trigrams(minipop, mini_greek_metadata):
    """
    For the purpose of visualization.
    Use to confirm that trigrams are being stored in the database correctly.
    It should be noted that v5 results do not have stopwords filtered out,
    while v3 results probably do.
    """
    texts = minipop.find(Text.collection,
                         title=[m['title'] for m in mini_greek_metadata])
    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)
    v5_results = []
    v3_results = []
    raw_v5_results = []
    target_units = _get_units(minipop, TextOptions(texts[0], 'line'), 'sound')
    for b in target_units:
        raw_v5_results.append(b['features'])
    raw_v3_results = _load_v3_results(texts[0].path,
                                      'mini_greek_results_3gr.tab')
    for a in raw_v3_results:
        v3_results.append(a['matched_features'])
    print('v5 results:')
    for a in raw_v5_results:
        print(a)
        for n in a:
            n = np.asarray(n)
            b = get_stoplist_tokens(minipop, n, 'sound', 'greek')
            v5_results.append(b)
    print(v5_results)
    print('v3 results:')
    for a in v3_results:
        print(a)
    print('v5 length:', len(v5_results), 'v3 length:', len(v3_results))
    assert False  # fail deliberately so pytest shows the captured output
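As the docstring notes, the v5 lists still contain stopwords while the v3 reference output does not. A hedged sketch of one way to level that before comparing, reusing the names from the test above and assuming create_stoplist accepts the arguments seen in Example #3:

# hypothetical: drop stopword indices from each unit's feature arrays so
# the v5 lists line up with v3 output, which already had stopwords removed
stop_indices = set(create_stoplist(minipop, 10, 'sound', 'greek'))
filtered_v5 = [[i for i in np.asarray(n).tolist() if i not in stop_indices]
               for unit_features in raw_v5_results
               for n in unit_features]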
Example #3
def main():
    """Perform Tesserae search and display the top 10 results"""
    args = parse_args()
    if args.password:
        password = getpass(prompt='Tesserae MongoDB Password: ')
    else:
        password = None
    # assumes TessMongoConnection(host, port, user, password, db=...) as in
    # other Tesserae v5 scripts
    connection = TessMongoConnection(args.host, args.port, args.user,
                                     password, db=args.database)
    source_author = args.source_author.lower().replace('-', ' ')
    source_title = args.source_title.lower().replace('-', ' ')
    source = TextOptions(text=connection.find('texts',
                                              author=source_author,
                                              title=source_title)[0],
                         unit_type=args.source_unit)
    target_author = args.target_author.lower().replace('_', ' ')
    target_title = args.target_title.lower().replace('_', ' ')
    target = TextOptions(text=connection.find('texts',
                                              author=target_author,
                                              title=target_title)[0],
                         unit_type=args.target_unit)

    start = time.time()
    stopword_indices = create_stoplist(
        connection,
        args.n_stopwords,
        args.feature,
        source.text.language,
        basis='corpus' if args.stopword_basis == 'corpus' else
        [source.text.id, target.text.id])
    stopword_tokens = get_stoplist_tokens(connection, stopword_indices,
                                          args.feature, source.text.language)
    parameters = {
        'source': {
            'object_id': str(source.text.id),
            'units': source.unit_type
        },
        'target': {
            'object_id': str(target.text.id),
            'units': target.unit_type
        },
        'method': {
            'name': SparseMatrixSearch.matcher_type,
            'feature': args.feature,
            'stopwords': stopword_tokens,
            'freq_basis': args.freq_basis,
            'max_distance': args.max_distance,
            'distance_basis': args.distance_basis
        }
    }
    results_id = check_cache(connection, parameters['source'],
                             parameters['target'], parameters['method'])
    if results_id:
        print('Cached results found.')
        search = connection.find(Search.collection,
                                 results_id=results_id,
                                 search_type=NORMAL_SEARCH)[0]
    else:
        search = Search(results_id=uuid.uuid4().hex,
                        search_type=NORMAL_SEARCH,
                        parameters=parameters)
        connection.insert(search)
        search_params = {
            'source': source,
            'target': target,
            'feature': parameters['method']['feature'],
            'stopwords': parameters['method']['stopwords'],
            'freq_basis': parameters['method']['freq_basis'],
            'max_distance': parameters['method']['max_distance'],
            'distance_basis': parameters['method']['distance_basis'],
            'min_score': 0
        }
        _run_search(connection, search, SparseMatrixSearch.matcher_type,
                    search_params)
    matches = get_results(connection, search.id, PageOptions())
    end = time.time() - start
    matches.sort(key=lambda x: x['score'], reverse=True)
    print(f'Search found {len(matches)} matches in {end}s.')
    display_count = min(len(matches), 10)
    print(f'The Top {display_count} Matches')
    print('------------------')
    print()
    print("Result\tScore\tSource Locus\tTarget Locus\tShared")
    for i, m in enumerate(matches[:display_count]):
        shared = m['matched_features']
        print(f'{i}.\t{m["score"]}\t{m["source_tag"]}\t{m["target_tag"]}\t'
              f'{list(shared)}')
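parse_args is not shown in this excerpt, so the script name and every flag below are assumptions inferred from the attributes the function reads; a hypothetical invocation might look like:

python tesserae_search.py --host 127.0.0.1 --port 27017 \
    --source_author lucan --source_title bellum_civile --source_unit line \
    --target_author vergil --target_title aeneid --target_unit phrase \
    --feature lemmata --n_stopwords 10 --stopword_basis corpus \
    --freq_basis corpus --max_distance 10 --distance_basis frequency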
Example #4
def query_stopwords():
    """Build a stopwords list"""
    if len(flask.request.args) == 0:
        # default response when no arguments are given
        return flask.jsonify({'stopwords': []})

    feature = flask.request.args.get('feature', 'lemmata')
    list_size = flask.request.args.get('list_size', 10)
    try:
        list_size = int(list_size)
    except ValueError:
        return apitess.errors.error(
            400,
            data={k: v
                  for k, v in flask.request.args.items()},
            message='"list_size" must be an integer')

    # language takes precedence over works
    language = flask.request.args.get('language', None)
    if language:
        stopword_indices = create_stoplist(flask.g.db, list_size, feature,
                                           language)
        if len(stopword_indices) == 0:
            return apitess.errors.error(
                400,
                data={k: v
                      for k, v in flask.request.args.items()},
                message='No stopwords found for feature "{}" in language "{}".'
                .format(feature, language))
        return flask.jsonify({
            'stopwords':
            get_stoplist_tokens(flask.g.db, stopword_indices, feature,
                                language)
        })

    works = flask.request.args.get('works', None)
    if works:
        oids, fails = apitess.utils.parse_works_arg(works)
        if fails:
            return apitess.errors.bad_object_ids(fails, flask.request.args)
        text_results = flask.g.db.find(tesserae.db.entities.Text.collection,
                                       _id=oids)
        if len(text_results) != len(oids):
            # figure out which works were not found in the database and report
            found = {str(r.id) for r in text_results}
            not_found = []
            for obj_id in oids:
                if obj_id not in found:
                    not_found.append(obj_id)
            # report every missing id at once, after the loop has checked
            # all of them
            return apitess.errors.error(
                400,
                data={k: v
                      for k, v in flask.request.args.items()},
                message=('The following works could not be found '
                         f'in the database: {not_found}'))
        stopword_indices = create_stoplist(
            flask.g.db,
            list_size,
            feature,
            text_results[0].language,
            basis=[str(t.id) for t in text_results])
        return flask.jsonify({
            'stopwords':
            get_stoplist_tokens(flask.g.db, stopword_indices, feature,
                                text_results[0].language)
        })

    # if we get here, then we didn't get enough information
    return apitess.errors.error(
        400,
        data={k: v
              for k, v in flask.request.args.items()},
        message=(
            'Insufficient information was given to calculate a stopwords '
            'list (perhaps you forgot to specify "language" or "works").'))
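The URL prefix for this handler is registered elsewhere, so the endpoint below is an assumption; a hypothetical client call:

import requests

# 'language' takes precedence over 'works', as in the handler above
resp = requests.get('http://localhost:5000/stopwords/',
                    params={'feature': 'lemmata',
                            'list_size': 10,
                            'language': 'latin'})
print(resp.json()['stopwords'])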