Exemplo n.º 1
0
    def _classify_paper(obj, eng):
        """Run the classifier over a workflow object and store the results.

        Prefers the full PDF when one is recorded in ``extra_data``;
        otherwise classifies the titles and abstracts ("fast mode").
        Stores the outcome in ``obj.extra_data['classifier_results']``
        only when the classifier produced non-empty output.
        """
        from invenio_classifier.errors import ClassifierException
        from invenio_classifier import (
            get_keywords_from_text,
            get_keywords_from_local_file,
        )

        # Options forwarded verbatim to the classifier; values come from
        # the enclosing scope.
        params = {
            'taxonomy_name': taxonomy,
            'output_mode': 'dict',
            'output_limit': output_limit,
            'spires': spires,
            'match_mode': match_mode,
            'no_cache': no_cache,
            'with_author_keywords': with_author_keywords,
            'rebuild_cache': rebuild_cache,
            'only_core_tags': only_core_tags,
            'extract_acronyms': extract_acronyms,
        }

        fast_mode = False
        try:
            # FIXME: May need to find another canonical way of getting PDF
            if "pdf" in obj.extra_data:
                result = get_keywords_from_local_file(
                    obj.extra_data["pdf"], **params
                )
            else:
                # Fall back to metadata text: titles first, then abstracts.
                text_parts = []
                for title in obj.data.get('titles') or []:
                    text_parts.append(title.get('title', ''))
                for abstract in obj.data.get('abstracts') or []:
                    text_parts.append(abstract.get('value', ''))
                if not text_parts:
                    obj.log.error("No classification done due to missing data.")
                    return
                result = get_keywords_from_text(text_parts, **params)
                fast_mode = True
        except ClassifierException as exc:
            obj.log.exception(exc)
            return

        result['complete_output'] = clean_instances_from_data(
            result.get("complete_output", {})
        )
        result["fast_mode"] = fast_mode

        # Skip storing entirely empty classifier output.
        if any(result.get("complete_output", {}).values()):
            obj.extra_data['classifier_results'] = result
Exemplo n.º 2
0
def test_composite_keywords(app, hep_taxonomy, pdf_with_composite_keywords):
    """Composite keywords are extracted from the sample PDF."""
    with app.app_context():
        result = get_keywords_from_local_file(
            pdf_with_composite_keywords,
            output_mode='dict',
            taxonomy_name=hep_taxonomy,
        )
        complete = result.get('complete_output')
        composites = complete.get('composite_keywords', [])

        assert len(composites) == 20, complete
        expected = {
            'details': [64, 132],
            'keyword': 'electronics: noise',
            'number': 23,
        }
        assert expected in composites
Exemplo n.º 3
0
def test_funny_author_keywords(app, demo_pdf_file_with_funny_author_kw_sep,
                               demo_taxonomy):
    """Author keywords separated by '·' are extracted correctly."""
    with app.app_context():
        result = get_keywords_from_local_file(
            demo_pdf_file_with_funny_author_kw_sep,
            with_author_keywords=True,
            output_mode="dict",
            taxonomy_name=demo_taxonomy,
        )
        complete = result.get("complete_output")
        found = complete.get("author_keywords", [])

        assert len(found) == 4, complete
        assert {'author_keyword': 'Depth cameras'} in found
Exemplo n.º 4
0
    def _classify_paper(obj, eng):
        """Attach classifier keywords to the workflow object.

        Runs over the PDF when one is present in ``extra_data``; otherwise
        classifies the concatenated titles and abstracts ("fast mode").
        Results are only stored when the classifier output is non-empty.
        """
        from invenio_classifier.errors import ClassifierException
        from invenio_classifier import (
            get_keywords_from_text,
            get_keywords_from_local_file,
        )

        # Classifier options, taken from the enclosing scope.
        params = {
            'taxonomy_name': taxonomy,
            'output_mode': 'dict',
            'output_limit': output_limit,
            'spires': spires,
            'match_mode': match_mode,
            'no_cache': no_cache,
            'with_author_keywords': with_author_keywords,
            'rebuild_cache': rebuild_cache,
            'only_core_tags': only_core_tags,
            'extract_acronyms': extract_acronyms,
        }

        fast_mode = False
        try:
            # FIXME: May need to find another canonical way of getting PDF
            if "pdf" not in obj.extra_data:
                # No PDF: build a text corpus from titles and abstracts.
                corpus = [t.get('title', '')
                          for t in obj.data.get('titles') or []]
                corpus += [a.get('value', '')
                           for a in obj.data.get('abstracts') or []]
                if not corpus:
                    obj.log.error(
                        "No classification done due to missing data.")
                    return
                result = get_keywords_from_text(corpus, **params)
                fast_mode = True
            else:
                result = get_keywords_from_local_file(obj.extra_data["pdf"],
                                                      **params)
        except ClassifierException as exc:
            obj.log.exception(exc)
            return

        result['complete_output'] = clean_instances_from_data(
            result.get("complete_output", {}))
        result["fast_mode"] = fast_mode

        # Store only when the classifier produced some output.
        if any(result.get("complete_output", {}).values()):
            obj.extra_data['classifier_results'] = result
Exemplo n.º 5
0
def test_author_keywords(app, demo_pdf_file_with_author_keywords,
                         demo_taxonomy):
    """Author keywords are extracted and matched from the PDF."""
    with app.app_context():
        result = get_keywords_from_local_file(
            demo_pdf_file_with_author_keywords,
            output_mode="dict",
            taxonomy_name=demo_taxonomy,
            with_author_keywords=True,
        )
        complete = result.get("complete_output")
        found = complete.get("author_keywords", [])

        assert len(found) == 4, complete
        expected = {'author_keyword': 'Dyson model',
                    'matched_keywords': ['model']}
        assert expected in found
Exemplo n.º 6
0
def test_file_extration(app, demo_pdf_file, demo_taxonomy):
    """Single and core keywords are extracted from a demo PDF."""
    with app.app_context():
        result = get_keywords_from_local_file(
            demo_pdf_file,
            output_mode="dict",
            taxonomy_name=demo_taxonomy,
        )
        complete = result.get("complete_output")

        singles = complete.get("single_keywords", [])
        assert len(singles) == 4
        expected_single = {'keyword': "gauge field theory Yang-Mills",
                           'number': 9}
        assert expected_single in singles

        cores = complete.get("core_keywords", [])
        assert len(cores) == 3
        assert {'keyword': "Yang-Mills", 'number': 12} in cores
Exemplo n.º 7
0
    def _classify_paper(obj, eng):
        """Classify a workflow record, preferring an attached PDF.

        Falls back to titles/abstracts text ("fast mode") when no PDF is
        found. The temporary PDF is always removed afterwards, even when
        classification fails.
        """
        # Classifier options, taken from the enclosing scope.
        params = {
            'taxonomy_name': taxonomy,
            'output_mode': 'dict',
            'output_limit': output_limit,
            'spires': spires,
            'match_mode': match_mode,
            'no_cache': no_cache,
            'with_author_keywords': with_author_keywords,
            'rebuild_cache': rebuild_cache,
            'only_core_tags': only_core_tags,
            'extract_acronyms': extract_acronyms,
        }

        fast_mode = False
        tmp_pdf = get_pdf_in_workflow(obj)
        try:
            if not tmp_pdf:
                # No PDF available: classify the metadata text instead.
                fragments = []
                for title in obj.data.get('titles') or []:
                    fragments.append(title.get('title', ''))
                for abstract in obj.data.get('abstracts') or []:
                    fragments.append(abstract.get('value', ''))
                if not fragments:
                    obj.log.error(
                        "No classification done due to missing data.")
                    return
                result = get_keywords_from_text(fragments, **params)
                fast_mode = True
            else:
                result = get_keywords_from_local_file(tmp_pdf, **params)
        except ClassifierException as exc:
            obj.log.exception(exc)
            return
        finally:
            # Never leave the temporary PDF behind, even on failure.
            if tmp_pdf and os.path.exists(tmp_pdf):
                os.unlink(tmp_pdf)

        result['complete_output'] = clean_instances_from_data(
            result.get("complete_output", {}))
        result["fast_mode"] = fast_mode

        # Only persist non-empty classifier output.
        if any(result.get("complete_output", {}).values()):
            obj.extra_data['classifier_results'] = result
def test_file_extration(app, demo_pdf_file, demo_taxonomy):
    """Test extracting keywords from PDF."""
    with app.app_context():
        out = get_keywords_from_local_file(
            demo_pdf_file,
            taxonomy_name=demo_taxonomy,
            output_mode="dict"
        )
        output = out.get("complete_output")
        # Default to {} (not []): a list has no .keys(), so a missing key
        # would raise AttributeError instead of failing the assertion.
        single_keywords = output.get("Single keywords", {}).keys()

        assert len(single_keywords) == 4
        assert "gauge field theory Yang-Mills" in single_keywords

        core_keywords = output.get("Core keywords", {}).keys()

        assert len(core_keywords) == 3
        assert "Yang-Mills" in core_keywords
Exemplo n.º 9
0
    def _classify_paper(obj, eng):
        """Run keyword extraction over the record and store the result.

        Prefers the fulltext document from the workflow; otherwise
        classifies the text gathered from titles, subtitles, abstracts
        and keywords of the record.
        """
        from flask import current_app
        # Classifier options; the ontology falls back to the app config.
        params = dict(
            taxonomy_name=taxonomy or current_app.config['HEP_ONTOLOGY_FILE'],
            output_mode='dict',
            output_limit=output_limit,
            spires=spires,
            match_mode=match_mode,
            no_cache=no_cache,
            with_author_keywords=with_author_keywords,
            rebuild_cache=rebuild_cache,
            only_core_tags=only_core_tags,
            extract_acronyms=extract_acronyms,
        )

        fulltext_used = True
        with get_document_in_workflow(obj) as tmp_document:
            try:
                if not tmp_document:
                    data = get_value(obj.data, 'titles.title', [])
                    for path in ('titles.subtitle', 'abstracts.value',
                                 'keywords.value'):
                        data.extend(get_value(obj.data, path, []))
                    if not data:
                        obj.log.error("No classification done due to missing data.")
                        return
                    result = get_keywords_from_text(data, **params)
                    fulltext_used = False
                else:
                    result = get_keywords_from_local_file(tmp_document,
                                                          **params)
            except ClassifierException as exc:
                obj.log.exception(exc)
                return

        result['complete_output'] = clean_instances_from_data(
            result.get("complete_output", {}))
        result["fulltext_used"] = fulltext_used

        # Add to extra_data only when the output is non-empty.
        if any(result.get("complete_output", {}).values()):
            obj.extra_data['classifier_results'] = result
Exemplo n.º 10
0
    def _classify_paper(obj, eng):
        """Classify the record, preferring the workflow document.

        When no fulltext document is available, classification runs over
        the titles, subtitles, abstracts and keywords of the record.
        """
        from flask import current_app
        # Classifier options; the ontology falls back to the app config.
        params = {
            'taxonomy_name': (taxonomy
                              or current_app.config['HEP_ONTOLOGY_FILE']),
            'output_mode': 'dict',
            'output_limit': output_limit,
            'spires': spires,
            'match_mode': match_mode,
            'no_cache': no_cache,
            'with_author_keywords': with_author_keywords,
            'rebuild_cache': rebuild_cache,
            'only_core_tags': only_core_tags,
            'extract_acronyms': extract_acronyms,
        }

        fulltext_used = True
        with get_document_in_workflow(obj) as tmp_document:
            try:
                if tmp_document:
                    result = get_keywords_from_local_file(
                        tmp_document, **params)
                else:
                    text_data = get_value(obj.data, 'titles.title', [])
                    text_data.extend(get_value(obj.data, 'titles.subtitle', []))
                    text_data.extend(get_value(obj.data, 'abstracts.value', []))
                    text_data.extend(get_value(obj.data, 'keywords.value', []))
                    if not text_data:
                        obj.log.error(
                            "No classification done due to missing data.")
                        return
                    result = get_keywords_from_text(text_data, **params)
                    fulltext_used = False
            except ClassifierException as exc:
                obj.log.exception(exc)
                return

        result['complete_output'] = clean_instances_from_data(
            result.get("complete_output", {}))
        result["fulltext_used"] = fulltext_used

        # Persist only non-empty classifier output.
        if any(result.get("complete_output", {}).values()):
            obj.extra_data['classifier_results'] = result