Example #1
0
def test_taxonomy_error(app, demo_text):
    """Test that a non-existing taxonomy name raises TaxonomyError."""
    with app.app_context():
        with pytest.raises(TaxonomyError):
            # Fix: drop the unused `out =` binding (flake8 F841); the call
            # is expected to raise, so its return value is never used.
            get_keywords_from_text(text_lines=[demo_text],
                                   taxonomy_name="foo",
                                   output_mode="dict")
def test_taxonomy_error(app, demo_text):
    """Test that a non-existing taxonomy name raises TaxonomyError."""
    with app.app_context():
        with pytest.raises(TaxonomyError):
            # Fix: drop the unused `out =` binding (flake8 F841); the call
            # is expected to raise, so its return value is never used.
            get_keywords_from_text(
                text_lines=[demo_text],
                taxonomy_name="foo",
                output_mode="dict"
            )
Example #3
0
    def _classify_paper(obj, eng):
        """Run the classifier on the object's PDF, or on its titles/abstracts.

        Stores the cleaned result in ``obj.extra_data['classifier_results']``
        when the classifier produced any output.
        """
        from invenio_classifier.errors import ClassifierException
        from invenio_classifier import (
            get_keywords_from_text,
            get_keywords_from_local_file,
        )

        classifier_kwargs = {
            'taxonomy_name': taxonomy,
            'output_mode': 'dict',
            'output_limit': output_limit,
            'spires': spires,
            'match_mode': match_mode,
            'no_cache': no_cache,
            'with_author_keywords': with_author_keywords,
            'rebuild_cache': rebuild_cache,
            'only_core_tags': only_core_tags,
            'extract_acronyms': extract_acronyms,
        }

        fast_mode = False
        try:
            # FIXME: May need to find another canonical way of getting PDF
            if "pdf" in obj.extra_data:
                result = get_keywords_from_local_file(
                    obj.extra_data["pdf"], **classifier_kwargs
                )
            else:
                # No fulltext available: classify titles and abstracts only.
                data = []
                for title_entry in obj.data.get('titles') or []:
                    data.append(title_entry.get('title', ''))
                for abstract_entry in obj.data.get('abstracts') or []:
                    data.append(abstract_entry.get('value', ''))
                if not data:
                    obj.log.error("No classification done due to missing data.")
                    return
                result = get_keywords_from_text(data, **classifier_kwargs)
                fast_mode = True
        except ClassifierException as e:
            obj.log.exception(e)
            return

        result['complete_output'] = clean_instances_from_data(
            result.get("complete_output", {})
        )
        result["fast_mode"] = fast_mode

        # Only persist non-empty classifier output.
        if any(result.get("complete_output", {}).values()):
            obj.extra_data['classifier_results'] = result
Example #4
0
    def _classify_paper(obj, eng):
        """Classify the record's PDF if present, else its titles/abstracts.

        On success the cleaned result lands in
        ``obj.extra_data['classifier_results']``.
        """
        from invenio_classifier.errors import ClassifierException
        from invenio_classifier import (
            get_keywords_from_text,
            get_keywords_from_local_file,
        )

        params = {
            'taxonomy_name': taxonomy,
            'output_mode': 'dict',
            'output_limit': output_limit,
            'spires': spires,
            'match_mode': match_mode,
            'no_cache': no_cache,
            'with_author_keywords': with_author_keywords,
            'rebuild_cache': rebuild_cache,
            'only_core_tags': only_core_tags,
            'extract_acronyms': extract_acronyms,
        }

        fast_mode = False
        try:
            # FIXME: May need to find another canonical way of getting PDF
            if "pdf" in obj.extra_data:
                result = get_keywords_from_local_file(obj.extra_data["pdf"],
                                                      **params)
            else:
                # Fall back to the record metadata ("fast mode").
                data = []
                titles = obj.data.get('titles')
                if titles:
                    data += [t.get('title', '') for t in titles]
                abstracts = obj.data.get('abstracts')
                if abstracts:
                    data += [t.get('value', '') for t in abstracts]
                if not data:
                    obj.log.error(
                        "No classification done due to missing data.")
                    return
                result = get_keywords_from_text(data, **params)
                fast_mode = True
        except ClassifierException as e:
            obj.log.exception(e)
            return

        # Clean the raw output before storing it alongside the fast-mode flag.
        result['complete_output'] = clean_instances_from_data(
            result.get("complete_output", {}))
        result["fast_mode"] = fast_mode

        # Only persist non-empty classifier output.
        if any(result.get("complete_output", {}).values()):
            obj.extra_data['classifier_results'] = result
Example #5
0
def test_keywords(app, demo_taxonomy, demo_text):
    """Test keyword extraction from text."""
    with app.app_context():
        result = get_keywords_from_text(
            text_lines=[demo_text],
            taxonomy_name=demo_taxonomy,
            output_mode="dict",
        )
        complete = result.get("complete_output")

        # Expected counts for the demo text against the demo taxonomy.
        single = complete.get("single_keywords", [])
        assert len(single) == 3
        assert {'keyword': "aberration", 'number': 2} in single

        core = complete.get("core_keywords", [])
        assert len(core) == 2
        assert {'keyword': "supersymmetry", 'number': 1} in core
Example #6
0
def test_taxonomy_workdir(app, demo_text, demo_taxonomy):
    """Test grabbing taxonomy from the CLASSIFIER_WORKDIR."""
    # Point the workdir at the directory holding the demo taxonomy so the
    # bare filename "test.rdf" can be resolved.
    app.config.update({"CLASSIFIER_WORKDIR": os.path.dirname(demo_taxonomy)})
    with app.app_context():
        result = get_keywords_from_text(
            text_lines=[demo_text],
            taxonomy_name="test.rdf",
            output_mode="dict",
        )
        complete = result.get("complete_output")

        single = complete.get("single_keywords", [])
        assert len(single) == 3
        assert {'keyword': "aberration", 'number': 2} in single

        core = complete.get("core_keywords", [])
        assert len(core) == 2
        assert {'keyword': "supersymmetry", 'number': 1} in core
Example #7
0
    def _classify_paper(obj, eng):
        """Classify the workflow object's PDF (or metadata) and store results.

        A temporary PDF copy is fetched via ``get_pdf_in_workflow`` and is
        always removed afterwards.
        """
        params = {
            'taxonomy_name': taxonomy,
            'output_mode': 'dict',
            'output_limit': output_limit,
            'spires': spires,
            'match_mode': match_mode,
            'no_cache': no_cache,
            'with_author_keywords': with_author_keywords,
            'rebuild_cache': rebuild_cache,
            'only_core_tags': only_core_tags,
            'extract_acronyms': extract_acronyms,
        }

        fast_mode = False
        tmp_pdf = get_pdf_in_workflow(obj)
        try:
            if tmp_pdf:
                result = get_keywords_from_local_file(tmp_pdf, **params)
            else:
                # No PDF: classify the textual metadata ("fast mode").
                texts = [t.get('title', '')
                         for t in obj.data.get('titles') or []]
                texts += [a.get('value', '')
                          for a in obj.data.get('abstracts') or []]
                if not texts:
                    obj.log.error(
                        "No classification done due to missing data.")
                    return
                result = get_keywords_from_text(texts, **params)
                fast_mode = True
        except ClassifierException as e:
            obj.log.exception(e)
            return
        finally:
            # Always remove the temporary PDF copy, even on error.
            if tmp_pdf and os.path.exists(tmp_pdf):
                os.unlink(tmp_pdf)

        result['complete_output'] = clean_instances_from_data(
            result.get("complete_output", {}))
        result["fast_mode"] = fast_mode

        # Only persist non-empty classifier output.
        if any(result.get("complete_output", {}).values()):
            obj.extra_data['classifier_results'] = result
def test_keywords(app, demo_taxonomy, demo_text):
    """Test keyword extraction from text."""
    with app.app_context():
        result = get_keywords_from_text(
            text_lines=[demo_text],
            taxonomy_name=demo_taxonomy,
            output_mode="dict",
        )
        complete = result.get("complete_output")

        # This output format maps display labels to keyword dicts.
        single = complete.get("Single keywords", []).keys()
        assert len(single) == 3
        assert "aberration" in single

        core = complete.get("Core keywords", []).keys()
        assert len(core) == 2
        assert "supersymmetry" in core
Example #9
0
def test_keywords(app, demo_taxonomy, demo_text):
    """Test keyword extraction from text.

    NOTE(review): the original docstring said "Test version import.", which
    does not match the body — it exercises keyword extraction.
    """
    with app.app_context():
        out = get_keywords_from_text(
            text_lines=[demo_text],
            taxonomy_name=demo_taxonomy,
            output_mode="dict"
        )
        output = out.get("complete_output")
        # Output maps display labels ("Single keywords") to keyword dicts.
        single_keywords = output.get("Single keywords", []).keys()

        assert len(single_keywords) == 3
        assert "aberration" in single_keywords

        core_keywords = output.get("Core keywords", []).keys()

        assert len(core_keywords) == 2
        assert "supersymmetry" in core_keywords
def test_taxonomy_workdir(app, demo_text, demo_taxonomy):
    """Test grabbing taxonomy from the CLASSIFIER_WORKDIR."""
    # Point the workdir at the demo taxonomy's directory so the bare
    # filename "test.rdf" resolves.
    app.config.update({"CLASSIFIER_WORKDIR": os.path.dirname(demo_taxonomy)})
    with app.app_context():
        result = get_keywords_from_text(
            text_lines=[demo_text],
            taxonomy_name="test.rdf",
            output_mode="dict",
        )
        complete = result.get("complete_output")

        single = complete.get("Single keywords", []).keys()
        assert len(single) == 3
        assert "aberration" in single

        core = complete.get("Core keywords", []).keys()
        assert len(core) == 2
        assert "supersymmetry" in core
Example #11
0
    def _classify_paper(obj, eng):
        """Classify a record using its fulltext document when available.

        Falls back to titles, subtitles, abstracts and keywords when no
        document is attached, recording which path was taken in
        ``fulltext_used``.
        """
        from flask import current_app

        # Short-circuit: the configured ontology is only consulted when no
        # explicit taxonomy was given.
        params = {
            'taxonomy_name':
                taxonomy or current_app.config['HEP_ONTOLOGY_FILE'],
            'output_mode': 'dict',
            'output_limit': output_limit,
            'spires': spires,
            'match_mode': match_mode,
            'no_cache': no_cache,
            'with_author_keywords': with_author_keywords,
            'rebuild_cache': rebuild_cache,
            'only_core_tags': only_core_tags,
            'extract_acronyms': extract_acronyms,
        }

        fulltext_used = True
        with get_document_in_workflow(obj) as doc:
            try:
                if doc:
                    result = get_keywords_from_local_file(doc, **params)
                else:
                    # No document: gather all textual metadata fields.
                    data = get_value(obj.data, 'titles.title', [])
                    for path in ('titles.subtitle',
                                 'abstracts.value',
                                 'keywords.value'):
                        data.extend(get_value(obj.data, path, []))
                    if not data:
                        obj.log.error("No classification done due to missing data.")
                        return
                    result = get_keywords_from_text(data, **params)
                    fulltext_used = False
            except ClassifierException as e:
                obj.log.exception(e)
                return

        result['complete_output'] = clean_instances_from_data(
            result.get("complete_output", {})
        )
        result["fulltext_used"] = fulltext_used

        # Only persist non-empty classifier output.
        if any(result.get("complete_output", {}).values()):
            obj.extra_data['classifier_results'] = result
Example #12
0
    def _classify_paper(obj, eng):
        """Run the HEP classifier over the fulltext document or the metadata.

        Sets ``fulltext_used`` to record whether the document path was taken
        and stores the cleaned result on the workflow object.
        """
        from flask import current_app

        # Short-circuit keeps the config lookup lazy when a taxonomy is given.
        params = dict(
            taxonomy_name=taxonomy or current_app.config['HEP_ONTOLOGY_FILE'],
            output_mode='dict',
            output_limit=output_limit,
            spires=spires,
            match_mode=match_mode,
            no_cache=no_cache,
            with_author_keywords=with_author_keywords,
            rebuild_cache=rebuild_cache,
            only_core_tags=only_core_tags,
            extract_acronyms=extract_acronyms,
        )

        fulltext_used = True
        with get_document_in_workflow(obj) as document:
            try:
                if document:
                    result = get_keywords_from_local_file(
                        document, **params)
                else:
                    # No attached document: classify all textual metadata.
                    data = get_value(obj.data, 'titles.title', [])
                    for path in ('titles.subtitle',
                                 'abstracts.value',
                                 'keywords.value'):
                        data.extend(get_value(obj.data, path, []))
                    if not data:
                        obj.log.error(
                            "No classification done due to missing data.")
                        return
                    result = get_keywords_from_text(data, **params)
                    fulltext_used = False
            except ClassifierException as e:
                obj.log.exception(e)
                return

        result['complete_output'] = clean_instances_from_data(
            result.get("complete_output", {}))
        result["fulltext_used"] = fulltext_used

        # Only persist non-empty classifier output.
        if any(result.get("complete_output", {}).values()):
            obj.extra_data['classifier_results'] = result