Example #1
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive."""
    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    arxiv_id = get_arxiv_id_from_record(record)
    existing_file = get_file_by_name(model, arxiv_id)

    if not existing_file:
        # We download it
        tarball = get_tarball_for_model(eng, arxiv_id)

        if tarball is None:
            obj.log.error("No tarball found")
            return
        add_file_by_name(model, tarball)
    else:
        tarball = existing_file.get_syspath()

    try:
        marcxml = get_marcxml_plots_from_tarball(tarball)
    except Timeout:
        eng.log.error("Timeout during tarball extraction on {0}".format(tarball))
        return

    if marcxml:
        # We store the path to the directory where the tarball contents live
        new_dict = get_json_from_marcxml(marcxml)[0]
        record.update(new_dict)
        obj.update_task_results(
            "Plots", [{"name": "Plots", "result": new_dict["fft"], "template": "workflows/results/plots.html"}]
        )
        obj.log.info("Added {0} plots.".format(len(new_dict["fft"])))
        model.update()
Example #2
def extract_journal_info(obj, eng):
    """Extract journal, volume etc. from any freetext publication info."""
    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)

    publication_info = record.get("publication_info")
    if not publication_info:
        return

    new_publication_info = []
    for pubnote in publication_info:
        freetext = pubnote.get("pubinfo_freetext")
        if freetext:
            extracted_publication_info = extract_journal_reference(freetext)
            if extracted_publication_info:
                if "volume" in extracted_publication_info:
                    pubnote["journal_volume"] = extracted_publication_info.get(
                        "volume"
                    )
                if "title" in extracted_publication_info:
                    pubnote["journal_title"] = extracted_publication_info.get(
                        "title"
                    )
                if "year" in extracted_publication_info:
                    pubnote["year"] = extracted_publication_info.get(
                        "year"
                    )
                if "page" in extracted_publication_info:
                    pubnote["page_artid"] = extracted_publication_info.get(
                        "page"
                    )
        new_publication_info.append(pubnote)

    record["publication_info"] = new_publication_info
    model.update()
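The loop above copies whichever of the keys "volume", "title", "year" and "page" the extractor returns onto the pubnote under the corresponding INSPIRE field names. A minimal standalone sketch of that mapping, using a made-up extraction result in place of a real extract_journal_reference() call:

extracted = {"title": "Phys.Rev.D", "volume": "91", "year": "2015", "page": "032009"}  # hypothetical output
pubnote = {"pubinfo_freetext": "Phys.Rev. D91 (2015) 032009"}

field_map = {
    "volume": "journal_volume",
    "title": "journal_title",
    "year": "year",
    "page": "page_artid",
}
pubnote.update(
    {target: extracted[source] for source, target in field_map.items() if source in extracted}
)
# pubnote now carries journal_title, journal_volume, year and page_artid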
Example #3
    def get_description(bwo):
        """Get the description column part."""
        if not isinstance(bwo.data, dict):
            return "No description found."
        model = process_record_arxiv.model(bwo)
        record = get_record_from_model(model)
        abstract = ""
        authors = []
        categories = []
        final_identifiers = []
        if hasattr(record, "get"):
            # Get identifiers
            doi = record.get("doi.doi", [])
            if doi:
                final_identifiers.extend(doi)

            system_no = record.get("system_control_number.system_control_number", [])
            if system_no:
                final_identifiers.extend(system_no)

            # Get subject categories, adding main one first. Order matters here.
            categories = record.get("report_number.arxiv_category", [])
            categories.extend(record.get("subject_term.value", []))
            categories = list(OrderedDict.fromkeys(categories))  # Unique only
            abstract = record.get("abstract.summary", [""])[0]
            authors = record.get("authors", [])
        return render_template('workflows/styles/harvesting_record.html',
                               object=bwo,
                               authors=authors,
                               categories=categories,
                               abstract=abstract,
                               identifiers=final_identifiers)
Example #4
    def _guess_coreness(obj, eng):
        from invenio.base.globals import cfg
        from .arxiv import predict

        if os.path.basename(model_path) == model_path:
            # Just the name is given, so we fill in the rest
            full_model_path = os.path.join(cfg.get("CLASSIFIER_MODEL_PATH"), model_path)
        else:
            # The new variable is needed due to how parameters in closures work
            full_model_path = model_path

        if not os.path.isfile(full_model_path):
            obj.log.error("Model file {0} not found! Skipping prediction...".format(full_model_path))
            return
        model = eng.workflow_definition.model(obj)
        record = get_record_from_model(model)

        prepared_record = prepare_prediction_record(record)

        pipeline = load_model(full_model_path)
        decision, scores = predict(pipeline, prepared_record)
        obj.log.info("Successfully predicted as {0} with {1}".format(decision, max(scores)))

        result = {
            "decision": decision,
            "max_score": max(scores),
            "all_scores": scores,
        }
        task_result = {"name": "arxiv_guessing", "result": result, "template": "workflows/results/arxiv_guessing.html"}
        obj.update_task_results(task_result.get("name"), [task_result])
Example #5
def match(obj, eng):
    """Return True if the record already exists in INSPIRE.

    Searches by arXiv identifier and DOI, updates extra_data with the
    first id returned by the search.
    """
    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)

    response = list(
        set(match_by_arxiv_id(record)) | set(match_by_doi(record))
    )

    if response:
        # FIXME(jacquerie): use more than just the first id.
        obj.extra_data['recid'] = response[0]
        obj.extra_data['url'] = os.path.join(
            cfg["CFG_ROBOTUPLOAD_SUBMISSION_BASEURL"],
            'record',
            str(response[0])
        )

        return True

    return False
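Note that the set union above discards ordering, so the id picked up by the FIXME branch as response[0] is effectively arbitrary. A small illustration with made-up ids:

arxiv_hits = ["1203456", "1198765"]   # hypothetical result of match_by_arxiv_id()
doi_hits = ["1198765", "1312345"]     # hypothetical result of match_by_doi()

merged = list(set(arxiv_hits) | set(doi_hits))
# merged holds each id exactly once, but in no guaranteed order; sorting it
# (or keeping the original search order) would make merged[0] deterministic.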
Example #6
    def _author_list(obj, eng):
        from inspire.modules.converter.xslt import convert

        model = eng.workflow_definition.model(obj)
        record = get_record_from_model(model)
        arxiv_id = get_arxiv_id_from_record(record)
        existing_file = get_file_by_name(model, "{0}.tar.gz".format(arxiv_id))

        if not existing_file:
            # We download it
            tarball = get_tarball_for_model(eng, arxiv_id)

            if tarball is None:
                obj.log.error("No tarball found")
                return
            add_file_by_name(model, tarball)
        else:
            tarball = existing_file.get_syspath()

        sub_dir = os.path.abspath("{0}_files".format(tarball))
        try:
            file_list = untar(tarball, sub_dir)
        except InvalidTarball:
            obj.log.error("Invalid tarball {0}".format(tarball))
            return
        obj.log.info("Extracted tarball to: {0}".format(sub_dir))

        xml_files_list = [filename for filename in file_list
                          if filename.endswith(".xml")]
        obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

        for xml_file in xml_files_list:
            with open(xml_file, "r") as xml_file_fd:
                xml_content = xml_file_fd.read()

            match = REGEXP_AUTHLIST.findall(xml_content)
            if match:
                obj.log.info("Found a match for author extraction")
                authors_xml = convert(xml_content, stylesheet)
                authorlist_record = get_json_from_marcxml(authors_xml)[0]
                record.update(authorlist_record)
                obj.update_task_results(
                    "authors",
                    [{
                        "name": "authors",
                        "results": authorlist_record["authors"]
                    }]
                )
                obj.update_task_results(
                    "number_of_authors",
                    [{
                        "name": "number_of_authors",
                        "results": authorlist_record["number_of_authors"]
                    }]
                )
                break
        model.update()
Example #7
    def get_title(bwo):
        """Return title."""
        if isinstance(bwo.data, dict):
            model = hep_ingestion.model(bwo)
            record = get_record_from_model(model)
            titles = record.get("titles.title")
            if titles:
                return titles[0]
        return "No title available"
Example #8
File: arxiv.py Project: jma/inspire-next
    def _arxiv_fulltext_download(obj, eng):
        """Perform the fulltext download step for arXiv records.

        :param obj: Bibworkflow Object to process
        :param eng: BibWorkflowEngine processing the object
        """
        model = eng.workflow_definition.model(obj)
        record = get_record_from_model(model)
        arxiv_id = get_arxiv_id_from_record(record)
        existing_file = get_file_by_name(model, "{0}.pdf".format(arxiv_id))

        if not existing_file:
            # We download it
            pdf = get_pdf_for_model(eng, arxiv_id)

            if pdf is None:
                obj.log.error("No pdf found")
                return
            pdf = add_file_by_name(model, pdf)
            obj.extra_data["pdf"] = pdf
        else:
            pdf = existing_file.get_syspath()

        if pdf:
            if "fft" in record:
                record["fft"].append({
                    "url": pdf,
                    "docfile_type": doctype
                })
            else:
                new_dict_representation = {
                    "fft": [
                        {
                            "url": pdf,
                            "docfile_type": doctype
                        }
                    ]
                }
                record.update(new_dict_representation)

            fileinfo = {
                "type": "fulltext",
                "filename": os.path.basename(pdf),
                "full_path": pdf,
            }
            obj.update_task_results(
                os.path.basename(pdf),
                [{
                    "name": "PDF",
                    "result": fileinfo,
                    "template": "workflows/results/files.html"
                }]
            )
            model.update()
        else:
            obj.log.info("No PDF found.")
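The if/else that appends to record["fft"] above is the usual "append or create" idiom; when the record behaves like a plain dict it can be collapsed with setdefault(). A sketch with stand-in values:

record = {}                      # stand-in for the record object, assumed dict-like
pdf = "/tmp/1501.00001.pdf"      # hypothetical file path
doctype = "arXiv"                # hypothetical docfile type

# Equivalent to the if/else branch above when the record supports setdefault():
record.setdefault("fft", []).append({"url": pdf, "docfile_type": doctype})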
Example #9
    def get_record(cls, obj, **kwargs):
        """Return a dictionary-like object representing the current object.

        This object will be used for indexing and be the basis for display
        in Holding Pen.
        """
        if isinstance(obj.data, six.text_type):
            return {}
        model = cls.model(obj)
        return get_record_from_model(model).dumps()  # Turn into pure dict
Example #10
    def _author_list(obj, eng):
        from invenio_oaiharvester.utils import find_matching_files

        from invenio_utils.plotextractor.cli import get_defaults
        from invenio_utils.plotextractor.converter import untar
        from invenio_utils.shell import Timeout

        from inspire.modules.converter.xslt import convert

        model = eng.workflow_definition.model(obj)
        record = get_record_from_model(model)
        arxiv_id = get_arxiv_id_from_record(record)
        existing_file = get_file_by_name(model, arxiv_id)

        if not existing_file:
            # We download it
            tarball = get_tarball_for_model(eng, arxiv_id)

            if tarball is None:
                obj.log.error("No tarball found")
                return
            add_file_by_name(model, tarball)
        else:
            tarball = existing_file.get_syspath()

        sub_dir, dummy = get_defaults(str(tarball), cfg["CFG_TMPDIR"], "")

        try:
            untar(str(tarball), sub_dir)
            obj.log.info("Extracted tarball to: {0}".format(sub_dir))
        except Timeout:
            eng.log.error("Timeout during tarball extraction on {0}".format(tarball))
            return

        xml_files_list = find_matching_files(sub_dir, ["xml"])
        obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

        for xml_file in xml_files_list:
            with open(xml_file, "r") as xml_file_fd:
                xml_content = xml_file_fd.read()

            match = REGEXP_AUTHLIST.findall(xml_content)
            if match:
                obj.log.info("Found a match for author extraction")
                authors_xml = convert(xml_content, stylesheet)
                authorlist_record = get_json_from_marcxml(authors_xml)[0]
                record.update(authorlist_record)
                obj.update_task_results("authors", [{"name": "authors", "results": authorlist_record["authors"]}])
                obj.update_task_results(
                    "number_of_authors",
                    [{"name": "number_of_authors", "results": authorlist_record["number_of_authors"]}],
                )
                break
        model.update()
Example #11
    def get_record(cls, obj, **kwargs):
        """Return a dictionary-like object representing the current object.

        This object will be used for indexing and be the basis for display
        in Holding Pen.
        """
        model = cls.model(obj)
        record = get_record_from_model(model)
        if record:
            return record.dumps()
        return {}
Example #12
    def formatter(bwo, **kwargs):
        """Nicely format the record."""
        try:
            model = process_record_arxiv.model(bwo)
            record = get_record_from_model(model)
        except TypeError as err:
            return "Error: {0}".format(err)
        return render_template(
            'format/record/Holding_Pen_HTML_detailed.tpl',
            record=record
        )
Example #13
    def get_record(cls, obj, **kwargs):
        """Return a dictionary-like object representing the current object.

        This object will be used for indexing and be the basis for display
        in Holding Pen.
        """
        try:
            model = cls.model(obj)
            return get_record_from_model(model).dumps()  # Turn into pure dict
        except Exception as err:
            current_app.logger.exception(err)
            return {}
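Examples #9, #11 and #13 are variants of the same Holding Pen hook, each guarding against a different failure mode (text-only payloads, an empty record, and model construction errors). A combined sketch, assuming the same cls.model and get_record_from_model helpers used in those snippets:

    def get_record(cls, obj, **kwargs):
        """Return a plain dict for indexing/display, or {} when the object is unusable."""
        if isinstance(obj.data, six.text_type):
            # Raw text payloads carry no record (cf. example #9).
            return {}
        try:
            record = get_record_from_model(cls.model(obj))
        except Exception as err:
            # Model construction can fail on malformed objects (cf. example #13).
            current_app.logger.exception(err)
            return {}
        # The record may be missing or empty (cf. example #11).
        return record.dumps() if record else {}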
Example #14
    def _classify_paper(obj, eng):
        from invenio_classifier.errors import TaxonomyError

        model = eng.workflow_definition.model(obj)
        record = get_record_from_model(model)
        data = None
        is_fast_mode = fast_mode
        if not is_fast_mode:
            if "pdf" in obj.extra_data:
                # Getting path to PDF file
                data = obj.extra_data["pdf"]
                callback = get_keywords_from_local_file
        if not data:
            data = [record.get("titles.title", "")] + record.get("abstracts.value", [])
            callback = get_keywords_from_text
            is_fast_mode = True

        if not data:
            obj.log.error("No classification done due to missing data.")
            return

        try:
            result = callback(data, taxonomy,
                              output_mode='dict',
                              output_limit=output_limit,
                              spires=spires,
                              match_mode=match_mode,
                              no_cache=no_cache,
                              with_author_keywords=with_author_keywords,
                              rebuild_cache=rebuild_cache,
                              only_core_tags=only_core_tags,
                              extract_acronyms=extract_acronyms)
        except TaxonomyError as e:
            obj.log.exception(e)
            return

        clean_instances_from_data(result.get("complete_output", {}))

        final_result = {"dict": result}
        # Report the mode actually used; is_fast_mode may have been switched on above.
        final_result["fast_mode"] = is_fast_mode
        # Check if it is not empty output before adding
        output = result.get("complete_output", {}).values()
        if not any(output):
            final_result["dict"] = {}
        name = "classification"
        obj.update_task_results(
            name,
            [{
                "name": name,
                "result": final_result,
                "template": "workflows/results/classifier.html"
            }]
        )
Example #15
def add_core(obj, eng):
    """Add CORE collection tag to collections."""
    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    collections = record.get("collections", [])
    # Do not add it again if already there
    has_core = [v for c in collections
                for v in c.values()
                if v.lower() == "core"]
    if not has_core:
        collections.append({"primary": "CORE"})
        record["collections"] = collections
    model.update()
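The membership check above builds a throwaway list just to test for truthiness; any() over a generator expresses the same test a bit more directly. A behaviour-equivalent sketch on plain dicts:

collections = [{"primary": "HEP"}, {"primary": "Citeable"}]  # sample data

has_core = any(
    value.lower() == "core"
    for entry in collections
    for value in entry.values()
)
if not has_core:
    collections.append({"primary": "CORE"})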
Example #16
def arxiv_refextract(obj, eng):
    """Perform the reference extraction step.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.refextract.api import extract_references_from_file_xml

    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    arxiv_id = get_arxiv_id_from_record(record)
    existing_file = get_file_by_name(model, "{0}.pdf".format(arxiv_id))

    if not existing_file:
        # We download it
        pdf = get_pdf_for_model(eng, arxiv_id)

        if pdf is None:
            obj.log.error("No pdf found")
            return
        add_file_by_name(model, pdf)
    else:
        pdf = existing_file.get_syspath()

    if pdf and os.path.isfile(pdf):
        references_xml = extract_references_from_file_xml(pdf)
        if references_xml:
            updated_xml = (
                '<?xml version="1.0" encoding="UTF-8"?>\n' "<collection>\n" + references_xml + "\n</collection>"
            )
            new_dict = get_json_from_marcxml(updated_xml)[0]
            if "references" in new_dict:
                record["references"] = new_dict["references"]
                obj.log.info("Extracted {0} references".format(len(record["references"])))
                obj.update_task_results(
                    "References",
                    [
                        {
                            "name": "References",
                            "result": new_dict["references"],
                            "template": "workflows/results/refextract.html",
                        }
                    ],
                )
                model.update()
        else:
            obj.log.info("No references extracted")
    else:
        obj.log.error("Not able to download and process the PDF")
Example #17
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive."""
    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    arxiv_id = get_arxiv_id_from_record(record)
    existing_file = get_file_by_name(model, "{0}.tar.gz".format(arxiv_id))

    if not existing_file:
        # We download it
        tarball = get_tarball_for_model(eng, arxiv_id)

        if tarball is None:
            obj.log.error("No tarball found")
            return
        add_file_by_name(model, tarball)
    else:
        tarball = existing_file.get_syspath()

    try:
        plots = process_tarball(tarball)
    except InvalidTarball:
        eng.log.error(
            'Invalid tarball {0}'.format(tarball)
        )
        return

    if plots:
        # We store the path to the directory where the tarball contents live
        new_dict = get_json_for_plots(plots)
        record.update(new_dict)
        obj.update_task_results(
            "Plots",
            [{
                "name": "Plots",
                "result": new_dict["fft"],
                "template": "workflows/results/plots.html"
            }]
        )
        obj.log.info("Added {0} plots.".format(len(new_dict["fft"])))
        model.update()
Example #18
    def _exists_in_inspire_or_rejected(obj, eng):
        if match(obj, eng):
            obj.log.info("Record already exists in INSPIRE.")
            return True

        if cfg.get('PRODUCTION_MODE'):
            model = eng.workflow_definition.model(obj)
            record = get_record_from_model(model)

            if was_already_harvested(record):
                obj.log.info('Record is already being harvested on INSPIRE.')
                return True

            if days_ago is None:
                _days_ago = cfg.get('INSPIRE_ACCEPTANCE_TIMEOUT', 5)
            else:
                _days_ago = days_ago

            if is_too_old(record, days_ago=_days_ago):
                obj.log.info("Record is likely rejected previously.")
                return True
        return False
Example #19
    def get_description(bwo):
        """Get the description column part."""
        if not isinstance(bwo.data, dict):
            return "No description found."
        model = process_record_arxiv.model(bwo)
        record = get_record_from_model(model)
        abstract = ""
        authors = []
        categories = []
        final_identifiers = []
        if hasattr(record, "get"):
            # Get identifiers
            dois = record.get("dois.value", [])
            if dois:
                final_identifiers.extend(dois)

            system_no = record.get("external_system_numbers.value", [])
            if system_no:
                final_identifiers.extend(system_no)

            # Get subject categories, adding main one first. Order matters here.
            record_categories = record.get("arxiv_eprints.categories", []) + \
                record.get("subject_terms.term", [])
            for category_list in record_categories:
                if isinstance(category_list, list):
                    categories.extend(category_list)
                else:
                    categories.append(category_list)
            categories = list(OrderedDict.fromkeys(categories))  # Unique only
            abstract = record.get("abstracts.value", [""])[0]
            authors = record.get("authors", [])
        return render_template('workflows/styles/harvesting_record.html',
                               object=bwo,
                               authors=authors,
                               categories=categories,
                               abstract=abstract,
                               identifiers=final_identifiers)
Example #20
    def get_title(bwo):
        """Return the record title(s) for display."""
        if not isinstance(bwo.data, dict):
            return "No title found."
        model = process_record_arxiv.model(bwo)
        record = get_record_from_model(model)
        return "; ".join(record.get("titles.title", ["No title found"]))
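Finally, many of these snippets call record.get() with dotted paths such as "titles.title" or "abstracts.value"; that behaviour comes from the record/SmartJSON model, not from plain dicts. A rough, standalone approximation of the lookup for experimenting outside the project (the real model also flattens lists of sub-records, which this sketch does not):

def dotted_get(record, path, default=None):
    """Illustrative stand-in for the dotted-path get() used by the record objects above."""
    current = record
    for key in path.split("."):
        if isinstance(current, dict) and key in current:
            current = current[key]
        else:
            return default
    return current

# dotted_get({"titles": {"title": "A paper"}}, "titles.title") -> "A paper"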