示例#1
0
def test_hitfilter_zero_score(subject_index):
    """A default HitFilter leaves no hits when the only hit scores 0.0."""
    zero_hit = AnalysisHit(uri='uri', label='label', score=0.0)
    unfiltered = ListAnalysisResult([zero_hit], subject_index)
    default_filter = HitFilter()
    filtered = default_filter(unfiltered)
    # The filter must hand back an AnalysisResult, and it must be empty.
    assert isinstance(filtered, AnalysisResult)
    assert len(filtered) == 0
示例#2
0
文件: cli.py 项目: UB-Mannheim/Annif
def generate_filter_batches(subjects):
    """Build a (HitFilter, EvaluationBatch) pair for every limit/threshold.

    Limits run over 1..15 and thresholds over ``i * 0.05`` for i in 0..19.
    The returned OrderedDict is keyed by ``(limit, threshold)`` tuples, in
    limit-major order; the threshold part of each key is the raw float
    product (so it is subject to floating point rounding).
    Every entry gets its own EvaluationBatch created from ``subjects``.
    """
    # The threshold grid is the same for every limit, so build it once.
    thresholds = [step * 0.05 for step in range(20)]
    batches = collections.OrderedDict()
    for limit in range(1, 16):
        for threshold in thresholds:
            batches[(limit, threshold)] = (
                HitFilter(limit, threshold),
                annif.eval.EvaluationBatch(subjects),
            )
    return batches
示例#3
0
文件: cli.py 项目: UB-Mannheim/Annif
def run_analyze(project_id, limit, threshold, backend_param):
    """
    Analyze a single document from standard input.
    """
    # NOTE(review): the docstring above is kept verbatim because it is
    # presumably used as the CLI help text (decorators are not visible here).
    #
    # Reads the whole of stdin as the document, analyzes it with the project
    # identified by project_id, filters the result through
    # HitFilter(limit, threshold) and echoes one line per remaining hit in
    # the form "<uri>\tlabel\tscore".
    project = get_project(project_id)
    document = sys.stdin.read()
    params = parse_backend_params(backend_param)
    keep = HitFilter(limit, threshold)
    raw_result = project.analyze(document, params)
    row = "<{}>\t{}\t{}"
    for hit in keep(raw_result):
        click.echo(row.format(hit.uri, hit.label, hit.score))
示例#4
0
文件: rest.py 项目: UB-Mannheim/Annif
def analyze(project_id, text, limit, threshold):
    """Analyze text with a project and return the filtered hits as a dict.

    The result is formatted according to the Swagger spec: a dict with a
    single 'results' key holding one ``_asdict()`` mapping per hit that
    passes HitFilter(limit, threshold).

    If looking up the project raises ValueError, the value of
    project_not_found_error(project_id) is returned instead; if the
    analysis raises AnnifException, the value of server_error(err) is
    returned.
    """
    try:
        project = annif.project.get_project(project_id)
    except ValueError:
        return project_not_found_error(project_id)

    keep = HitFilter(limit, threshold)
    try:
        unfiltered = project.analyze(text)
    except AnnifException as failure:
        return server_error(failure)
    else:
        # Filtering and serialization happen outside the guarded call, so
        # only errors from the analysis itself become a server error.
        serialized = [entry._asdict() for entry in keep(unfiltered)]
        return {'results': serialized}
示例#5
0
文件: cli.py 项目: UB-Mannheim/Annif
def run_eval(project_id, paths, limit, threshold, backend_param):
    """
    Analyze documents and evaluate the result.

    Compare the results of automated indexing against a gold standard. The
    path may be either a TSV file with short documents or a directory with
    documents in separate files.
    """
    # NOTE(review): the docstring above is kept verbatim because it is
    # presumably used as the CLI help text (decorators are not visible here).
    project = get_project(project_id)
    params = parse_backend_params(backend_param)

    keep = HitFilter(limit=limit, threshold=threshold)
    batch = annif.eval.EvaluationBatch(project.subjects)

    # Each document is analyzed, filtered, and then scored against the
    # subject set built from the document's own uris and labels.
    for document in open_documents(paths).documents:
        predicted = keep(project.analyze(document.text, params))
        gold = annif.corpus.SubjectSet((document.uris, document.labels))
        batch.evaluate(predicted, gold)

    # One line per metric: the name (with a trailing colon) left-aligned in
    # a 20 character column, a tab, then the score.
    row = "{0:<20}\t{1}"
    for metric, score in batch.results().items():
        click.echo(row.format(metric + ":", score))
示例#6
0
文件: cli.py 项目: UB-Mannheim/Annif
def run_analyzedir(project_id, directory, suffix, force, limit, threshold,
                   backend_param):
    """
    Analyze a directory with documents. Write the results in TSV files
    with the given suffix.
    """
    # NOTE(review): the docstring above is kept verbatim because it is
    # presumably used as the CLI help text (decorators are not visible here).
    #
    # For every document yielded by DocumentDirectory the output file name is
    # the document file name with a trailing ".txt" replaced by `suffix`.
    # Existing output files are skipped with a message unless `force` is
    # true. Each output line has the form "<uri>\tlabel\tscore".
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param)
    hit_filter = HitFilter(limit, threshold)

    for docfilename, dummy_subjectfn in annif.corpus.DocumentDirectory(
            directory, require_subjects=False):
        with open(docfilename) as docfile:
            text = docfile.read()
        # Use a callable replacement so that the suffix is inserted
        # literally; a plain replacement string would have its backslash
        # escapes and group references interpreted by re.sub.
        subjectfilename = re.sub(r'\.txt$', lambda _match: suffix,
                                 docfilename)
        # NOTE(review): if docfilename does not end in ".txt" the output
        # name equals the input name; presumably DocumentDirectory only
        # yields ".txt" files - confirm.
        if os.path.exists(subjectfilename) and not force:
            click.echo("Not overwriting {} (use --force to override)".format(
                subjectfilename))
            continue
        # Analyze before opening the output file: opening with 'w' creates
        # or truncates the file, so a failing analysis would otherwise leave
        # an empty results file behind (or wipe an existing one).
        hits = hit_filter(project.analyze(text, backend_params))
        with open(subjectfilename, 'w') as subjfile:
            for hit in hits:
                line = "<{}>\t{}\t{}".format(hit.uri, hit.label, hit.score)
                click.echo(line, file=subjfile)
示例#7
0
def test_hitfilter_threshold(subject_index):
    """With threshold=0.5, filtering the 10 generated hits leaves exactly 2."""
    unfiltered = generate_hits(10, subject_index)
    threshold_filter = HitFilter(threshold=0.5)
    filtered = threshold_filter(unfiltered)
    assert isinstance(filtered, AnalysisResult)
    assert len(filtered) == 2
示例#8
0
def test_hitfilter_limit(subject_index):
    """With limit=5, filtering the 10 generated hits leaves exactly 5."""
    unfiltered = generate_hits(10, subject_index)
    limit_filter = HitFilter(limit=5)
    filtered = limit_filter(unfiltered)
    assert isinstance(filtered, AnalysisResult)
    assert len(filtered) == 5