Example #1
def test_latin_trigrams(minipop, mini_latin_metadata):
    """
    For the purpose of visualization.
    Use to confirm that trigrams are being stored in the database correctly.
    Note that v5 results do not have stopwords filtered out, while v3
    results probably do.
    """
    texts = minipop.find(Text.collection,
                         title=[m['title'] for m in mini_latin_metadata])
    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)
    v5_results = []
    v3_results = []
    raw_v5_results = []
    target_units = _get_units(minipop, TextOptions(texts[0], 'line'), 'sound')
    for b in target_units:
        raw_v5_results.append(b['features'])
    raw_v3_results = _load_v3_results(texts[0].path,
                                      'mini_latin_results_3gr.tab')
    for a in raw_v3_results:
        v3_results.append(a['matched_features'])
    print('v5 results:')
    for a in raw_v5_results:
        print(a)
        for n in a:
            print(n)
            n = np.asarray(n)
            print('array', n)
            print(np.shape(n))
            b = get_stoplist_tokens(minipop, n, 'sound', 'latin')
            v5_results.append(b)
    print(v5_results)
    print('v3 results:')
    for a in v3_results:
        print(a)
    print('v5 length:', len(v5_results), 'v3 length:', len(v3_results))
    assert False  # fail deliberately so pytest shows the captured output
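This test (and its Greek twin below) leans on get_stoplist_tokens to turn stored integer feature indices back into readable strings. A minimal sketch of that mapping in isolation, assuming a populated connection (the name conn is hypothetical) and numpy imported as np:

import numpy as np

# each unit stores its 'sound' features as integer indices; resolving
# them recovers the trigram strings those indices encode
indices = np.asarray([0, 1, 2])  # hypothetical index values
trigrams = get_stoplist_tokens(conn, indices, 'sound', 'latin')
print(trigrams)  # one trigram string per index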
Example #2
def test_greek_trigrams(minipop, mini_greek_metadata):
    """
    For the purpose of visualization.
    Use to confirm that trigrams are being stored in the database correctly.
    It should be noted that v5 results do not have stopwords filtered out,
    while v3 results probably do.
    """
    texts = minipop.find(Text.collection,
                         title=[m['title'] for m in mini_greek_metadata])
    results_id = uuid.uuid4()
    search_result = Search(results_id=results_id)
    minipop.insert(search_result)
    v5_results = []
    v3_results = []
    raw_v5_results = []
    target_units = _get_units(minipop, TextOptions(texts[0], 'line'), 'sound')
    for b in target_units:
        raw_v5_results.append(b['features'])
    raw_v3_results = _load_v3_results(texts[0].path,
                                      'mini_greek_results_3gr.tab')
    for a in raw_v3_results:
        v3_results.append(a['matched_features'])
    print('v5 results:')
    for a in raw_v5_results:
        print(a)
        for n in a:
            n = np.asarray(n)
            b = get_stoplist_tokens(minipop, n, 'sound', 'greek')
            v5_results.append(b)
    print(v5_results)
    print('v3 results:')
    for a in v3_results:
        print(a)
    print('v5 length:', len(v5_results), 'v3 length:', len(v3_results))
    assert False  # fail deliberately so pytest shows the captured output
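As the docstring notes, the v5 lists still contain stopwords while the v3 reference output does not. A hedged sketch of one way to level that before comparing, reusing the names from the test above and assuming create_stoplist accepts the arguments seen in Example #3:

# hypothetical: drop stopword indices from each unit's feature arrays so
# the v5 lists line up with v3 output, which already had stopwords removed
stop_indices = set(create_stoplist(minipop, 10, 'sound', 'greek'))
filtered_v5 = [[i for i in np.asarray(n).tolist() if i not in stop_indices]
               for unit_features in raw_v5_results
               for n in unit_features]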
Example #3
def main():
    """Perform Tesserae search and display the top 10 results"""
    args = parse_args()
    if args.password:
        password = getpass(prompt='Tesserae MongoDB Password: ')
    else:
        password = None
    # assumes TessMongoConnection(host, port, user, password, db=...) as in
    # other Tesserae v5 scripts
    connection = TessMongoConnection(args.host, args.port, args.user,
                                     password, db=args.database)
    source_author = args.source_author.lower().replace('-', ' ')
    source_title = args.source_title.lower().replace('-', ' ')
    source = TextOptions(text=connection.find('texts',
                                              author=source_author,
                                              title=source_title)[0],
                         unit_type=args.source_unit)
    target_author = args.target_author.lower().replace('_', ' ')
    target_title = args.target_title.lower().replace('_', ' ')
    target = TextOptions(text=connection.find('texts',
                                              author=target_author,
                                              title=target_title)[0],
                         unit_type=args.target_unit)

    start = time.time()
    stopword_indices = create_stoplist(
        connection,
        args.n_stopwords,
        args.feature,
        source.text.language,
        basis='corpus' if args.stopword_basis == 'corpus' else
        [source.text.id, target.text.id])
    stopword_tokens = get_stoplist_tokens(connection, stopword_indices,
                                          args.feature, source.text.language)
    parameters = {
        'source': {
            'object_id': str(source.text.id),
            'units': source.unit_type
        },
        'target': {
            'object_id': str(target.text.id),
            'units': target.unit_type
        },
        'method': {
            'name': SparseMatrixSearch.matcher_type,
            'feature': args.feature,
            'stopwords': stopword_tokens,
            'freq_basis': args.freq_basis,
            'max_distance': args.max_distance,
            'distance_basis': args.distance_basis
        }
    }
    results_id = check_cache(connection, parameters['source'],
                             parameters['target'], parameters['method'])
    if results_id:
        print('Cached results found.')
        search = connection.find(Search.collection,
                                 results_id=results_id,
                                 search_type=NORMAL_SEARCH)[0]
    else:
        search = Search(results_id=uuid.uuid4().hex,
                        search_type=NORMAL_SEARCH,
                        parameters=parameters)
        connection.insert(search)
        search_params = {
            'source': source,
            'target': target,
            'feature': parameters['method']['feature'],
            'stopwords': parameters['method']['stopwords'],
            'freq_basis': parameters['method']['freq_basis'],
            'max_distance': parameters['method']['max_distance'],
            'distance_basis': parameters['method']['distance_basis'],
            'min_score': 0
        }
        _run_search(connection, search, SparseMatrixSearch.matcher_type,
                    search_params)
    matches = get_results(connection, search.id, PageOptions())
    end = time.time() - start
    matches.sort(key=lambda x: x['score'], reverse=True)
    print(f'Search found {len(matches)} matches in {end}s.')
    display_count = min(len(matches), 10)
    print(f'The Top {display_count} Matches')
    print('------------------')
    print()
    print("Result\tScore\tSource Locus\tTarget Locus\tShared")
    for i, m in enumerate(matches[:display_count]):
        shared = m['matched_features']
        print(f'{i}.\t{m["score"]}\t{m["source_tag"]}\t{m["target_tag"]}\t'
              f'{list(shared)}')
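parse_args is not shown in this excerpt, so the script name and every flag below are assumptions inferred from the attributes the function reads; a hypothetical invocation might look like:

python tesserae_search.py --host 127.0.0.1 --port 27017 \
    --source_author lucan --source_title bellum_civile --source_unit line \
    --target_author vergil --target_title aeneid --target_unit phrase \
    --feature lemmata --n_stopwords 10 --stopword_basis corpus \
    --freq_basis corpus --max_distance 10 --distance_basis frequency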
Example #4
def query_stopwords():
    """Build a stopwords list"""
    if len(flask.request.args) == 0:
        # default response when no arguments are given
        return flask.jsonify({'stopwords': []})

    feature = flask.request.args.get('feature', 'lemmata')
    list_size = flask.request.args.get('list_size', 10)
    try:
        list_size = int(list_size)
    except ValueError:
        return apitess.errors.error(
            400,
            data={k: v
                  for k, v in flask.request.args.items()},
            message='"list_size" must be an integer')

    # language takes precedence over works
    language = flask.request.args.get('language', None)
    if language:
        stopword_indices = create_stoplist(flask.g.db, list_size, feature,
                                           language)
        if len(stopword_indices) == 0:
            return apitess.errors.error(
                400,
                data={k: v
                      for k, v in flask.request.args.items()},
                message='No stopwords found for feature "{}" in language "{}".'
                .format(feature, language))
        return flask.jsonify({
            'stopwords':
            get_stoplist_tokens(flask.g.db, stopword_indices, feature,
                                language)
        })

    works = flask.request.args.get('works', None)
    if works:
        oids, fails = apitess.utils.parse_works_arg(works)
        if fails:
            return apitess.errors.bad_object_ids(fails, flask.request.args)
        text_results = flask.g.db.find(tesserae.db.entities.Text.collection,
                                       _id=oids)
        if len(text_results) != len(oids):
            # figure out which works were not found in the database and report
            found = {str(r.id) for r in text_results}
            not_found = []
            for obj_id in oids:
                if obj_id not in found:
                    not_found.append(obj_id)
            # report every missing id at once, after the loop has checked
            # all of them
            return apitess.errors.error(
                400,
                data={k: v
                      for k, v in flask.request.args.items()},
                message=('The following works could not be found '
                         f'in the database: {not_found}'))
        stopword_indices = create_stoplist(
            flask.g.db,
            list_size,
            feature,
            text_results[0].language,
            basis=[str(t.id) for t in text_results])
        return flask.jsonify({
            'stopwords':
            get_stoplist_tokens(flask.g.db, stopword_indices, feature,
                                text_results[0].language)
        })

    # if we get here, then we didn't get enough information
    return apitess.errors.error(
        400,
        data={k: v
              for k, v in flask.request.args.items()},
        message=(
            'Insufficient information was given to calculate a stopwords '
            'list (perhaps you forgot to specify "language" or "works").'))
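The URL prefix for this handler is registered elsewhere, so the endpoint below is an assumption; a hypothetical client call:

import requests

# 'language' takes precedence over 'works', as in the handler above
resp = requests.get('http://localhost:5000/stopwords/',
                    params={'feature': 'lemmata',
                            'list_size': 10,
                            'language': 'latin'})
print(resp.json()['stopwords'])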