def test_hitfilter_zero_score(subject_index):
    """A hit whose score is exactly 0.0 must be dropped by HitFilter."""
    source = ListAnalysisResult(
        [AnalysisHit(uri='uri', label='label', score=0.0)],
        subject_index)
    filtered = HitFilter()(source)
    assert isinstance(filtered, AnalysisResult)
    assert len(filtered) == 0
def generate_filter_batches(subjects):
    """Build an ordered mapping from (limit, threshold) parameter pairs to
    (HitFilter, EvaluationBatch) tuples, covering limits 1..15 and
    thresholds 0.00, 0.05, ..., 0.95."""
    batches = collections.OrderedDict()
    # Threshold grid is loop-invariant; compute it once.
    thresholds = [step * 0.05 for step in range(20)]
    for lim in range(1, 16):
        for thr in thresholds:
            batches[(lim, thr)] = (HitFilter(lim, thr),
                                   annif.eval.EvaluationBatch(subjects))
    return batches
def run_analyze(project_id, limit, threshold, backend_param):
    """
    Analyze a single document from standard input.
    """
    project = get_project(project_id)
    # Read the whole document from stdin before analysis.
    text = sys.stdin.read()
    params = parse_backend_params(backend_param)
    filtered_hits = HitFilter(limit, threshold)(project.analyze(text, params))
    template = "<{}>\t{}\t{}"
    for hit in filtered_hits:
        click.echo(template.format(hit.uri, hit.label, hit.score))
def analyze(project_id, text, limit, threshold):
    """analyze text and return a dict with results formatted according to
    Swagger spec"""
    # Unknown project id -> structured "not found" error response.
    try:
        project = annif.project.get_project(project_id)
    except ValueError:
        return project_not_found_error(project_id)
    # Backend failures surface as AnnifException -> server error response.
    try:
        raw_result = project.analyze(text)
    except AnnifException as err:
        return server_error(err)
    filtered = HitFilter(limit, threshold)(raw_result)
    return {'results': [hit._asdict() for hit in filtered]}
def run_eval(project_id, paths, limit, threshold, backend_param):
    """
    Analyze documents and evaluate the result.

    Compare the results of automated indexing against a gold standard. The
    path may be either a TSV file with short documents or a directory with
    documents in separate files.
    """
    project = get_project(project_id)
    params = parse_backend_params(backend_param)
    filt = HitFilter(limit=limit, threshold=threshold)
    batch = annif.eval.EvaluationBatch(project.subjects)
    for doc in open_documents(paths).documents:
        hits = filt(project.analyze(doc.text, params))
        gold = annif.corpus.SubjectSet((doc.uris, doc.labels))
        batch.evaluate(hits, gold)
    # Report each evaluation metric as "name:<tab>score".
    line_fmt = "{0:<20}\t{1}"
    for metric, score in batch.results().items():
        click.echo(line_fmt.format(metric + ":", score))
def run_analyzedir(project_id, directory, suffix, force,
                   limit, threshold, backend_param):
    """
    Analyze a directory with documents. Write the results in TSV files
    with the given suffix.

    Existing output files are skipped unless ``force`` is true.
    """
    project = get_project(project_id)
    backend_params = parse_backend_params(backend_param)
    hit_filter = HitFilter(limit, threshold)
    for docfilename, dummy_subjectfn in annif.corpus.DocumentDirectory(
            directory, require_subjects=False):
        subjectfilename = re.sub(r'\.txt$', suffix, docfilename)
        # Check for an existing output file *before* reading the document,
        # so skipped documents are not needlessly loaded into memory.
        if os.path.exists(subjectfilename) and not force:
            click.echo("Not overwriting {} (use --force to override)".format(
                subjectfilename))
            continue
        # Read the document fully, then close it before writing output.
        with open(docfilename) as docfile:
            text = docfile.read()
        with open(subjectfilename, 'w') as subjfile:
            results = project.analyze(text, backend_params)
            for hit in hit_filter(results):
                line = "<{}>\t{}\t{}".format(hit.uri, hit.label, hit.score)
                click.echo(line, file=subjfile)
def test_hitfilter_threshold(subject_index):
    """Hits scoring below the 0.5 threshold are filtered out."""
    source = generate_hits(10, subject_index)
    filtered = HitFilter(threshold=0.5)(source)
    assert isinstance(filtered, AnalysisResult)
    assert len(filtered) == 2
def test_hitfilter_limit(subject_index):
    """At most ``limit`` hits survive filtering."""
    source = generate_hits(10, subject_index)
    filtered = HitFilter(limit=5)(source)
    assert isinstance(filtered, AnalysisResult)
    assert len(filtered) == 5