예제 #1
0
    def _match_with_invenio_matcher(obj, eng):
        """Collect records matching the record held by this workflow object.

        Runs ``_match`` on the record built from ``obj`` and stores the
        collected ids, dumped records and scores under
        ``obj.extra_data["record_matches"]``.

        ``queries`` and ``index`` come from the enclosing scope.

        :param obj: workflow object being processed
        :param eng: workflow engine providing ``workflow_definition.model``
        :return: True if at least one match was collected, False otherwise
        """
        record = get_record_from_model(eng.workflow_definition.model(obj))

        # Fall back to exact DOI / arXiv eprint queries when none were given.
        active_queries = queries
        if active_queries is None:
            active_queries = [
                {'type': 'exact', 'match': 'dois.value'},
                {'type': 'exact', 'match': 'arxiv_eprints.value'}
            ]

        base_url = os.path.join(cfg["CFG_SITE_URL"], 'record')
        found_recids = []
        found_records = []

        hits = _match(record, queries=active_queries, index=index, doc_type='record')
        for hit in hits:
            found_recids.append(hit.record.get('control_number'))
            found_records.append({
                "source": hit.record.dumps(),
                "score": hit.score
            })

        obj.extra_data["record_matches"] = {
            "recids": found_recids,
            "records": found_records,
            "base_url": base_url
        }

        return bool(found_recids)
예제 #2
0
def _get_record(obj, model):
    """Return the dumped record for ``obj``, or an empty dict.

    :param obj: object handed to ``model``
    :param model: callable that wraps ``obj`` into a model
    :return: ``record.dumps()`` when a truthy record is found, else ``{}``
    """
    from inspirehep.utils.helpers import get_record_from_model

    found = get_record_from_model(model(obj))
    return found.dumps() if found else {}
예제 #3
0
def _get_record(obj, model):
    """Build the model for ``obj`` and return its record as a plain dump.

    :param obj: object handed to ``model``
    :param model: callable that wraps ``obj`` into a model
    :return: ``record.dumps()`` when a truthy record is found, else ``{}``
    """
    from inspirehep.utils.helpers import get_record_from_model

    wrapped = model(obj)
    found = get_record_from_model(wrapped)
    if not found:
        return {}
    return found.dumps()
예제 #4
0
    def _author_list(obj, eng):
        """Extract an author list from XML files inside the arXiv tarball.

        The tarball named ``<arxiv_id>.tar.gz`` is taken from the files already
        attached to the model, or fetched and attached otherwise. It is
        unpacked next to itself, and the first ``.xml`` file whose content
        matches ``REGEXP_AUTHLIST`` is converted with ``stylesheet`` (from the
        enclosing scope) and merged into the record.

        :param obj: workflow object being processed
        :param eng: workflow engine providing ``workflow_definition.model``
        :return: None. Returns early, after logging an error, when no tarball
            is available or the tarball is invalid.
        """
        from inspirehep.modules.converter.xslt import convert

        model = eng.workflow_definition.model(obj)
        record = get_record_from_model(model)
        arxiv_id = get_arxiv_id_from_record(record)
        existing_file = get_file_by_name(model, "{0}.tar.gz".format(arxiv_id))

        if not existing_file:
            # We download it
            tarball = get_tarball_for_model(eng, arxiv_id)

            if tarball is None:
                obj.log.error("No tarball found")
                return
            add_file_by_name(model, tarball)
        else:
            tarball = existing_file.get_syspath()

        sub_dir = os.path.abspath("{0}_files".format(tarball))
        try:
            file_list = untar(tarball, sub_dir)
        except InvalidTarball:
            obj.log.error("Invalid tarball {0}".format(tarball))
            return
        obj.log.info("Extracted tarball to: {0}".format(sub_dir))

        xml_files_list = [filename for filename in file_list
                          if filename.endswith(".xml")]
        obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

        for xml_file in xml_files_list:
            # The context manager closes the handle even if read() raises.
            with open(xml_file, "r") as xml_file_fd:
                xml_content = xml_file_fd.read()

            match = REGEXP_AUTHLIST.findall(xml_content)
            if match:
                obj.log.info("Found a match for author extraction")
                authors_xml = convert(xml_content, stylesheet)
                authorlist_record = get_json_from_marcxml(authors_xml)[0]
                record.update(authorlist_record)
                obj.update_task_results(
                    "authors",
                    [{
                        "name": "authors",
                        "results": authorlist_record["authors"]
                    }]
                )
                obj.update_task_results(
                    "number_of_authors",
                    [{
                        "name": "number_of_authors",
                        "results": authorlist_record["number_of_authors"]
                    }]
                )
                # Only the first matching XML file is used.
                break
        model.update()
예제 #5
0
def already_harvested(obj, eng):
    """Tell whether the record of ``obj`` was already harvested.

    The check only runs when ``PRODUCTION_MODE`` is set in ``cfg``.

    :param obj: workflow object being processed
    :param eng: workflow engine providing ``workflow_definition.model``
    :return: True when ``was_already_harvested`` accepts the record,
        False otherwise (including outside production mode)
    """
    if not cfg.get("PRODUCTION_MODE"):
        return False

    record = get_record_from_model(eng.workflow_definition.model(obj))
    if not was_already_harvested(record):
        return False

    obj.log.info("Record is already being harvested on INSPIRE.")
    return True
예제 #6
0
    def _classify_paper(obj, eng):
        """Run keyword classification and store it as a task result.

        When ``fast_mode`` (from the enclosing scope) is off and
        ``obj.extra_data`` has a ``"pdf"`` entry, that value is classified with
        ``get_keywords_from_local_file``. Otherwise the record's titles and
        abstracts are classified as text with ``get_keywords_from_text``.

        The remaining classifier options (``taxonomy``, ``output_limit``,
        ``spires``, ...) are taken from the enclosing scope.

        :param obj: workflow object being processed
        :param eng: workflow engine providing ``workflow_definition.model``
        :return: None. Returns early when there is nothing to classify or the
            classifier raises ``TaxonomyError``.
        """
        from invenio_classifier.errors import TaxonomyError

        model = eng.workflow_definition.model(obj)
        record = get_record_from_model(model)
        data = None
        is_fast_mode = fast_mode
        if not is_fast_mode:
            if "pdf" in obj.extra_data:
                # Getting path to PDF file
                data = obj.extra_data["pdf"]
                callback = get_keywords_from_local_file
        if not data:
            # Build one flat list of non-empty text lines. The previous
            # ``[...].extend(...)`` expression evaluated to None because
            # list.extend() works in place, so this fallback never had data.
            data = []
            for key in ("titles.title", "abstracts.value"):
                values = record.get(key, [])
                if isinstance(values, (list, tuple)):
                    data.extend(value for value in values if value)
                elif values:
                    data.append(values)
            callback = get_keywords_from_text
            is_fast_mode = True

        if not data:
            obj.log.error("No classification done due to missing data.")
            return

        try:
            result = callback(data, taxonomy,
                              output_mode='dict',
                              output_limit=output_limit,
                              spires=spires,
                              match_mode=match_mode,
                              no_cache=no_cache,
                              with_author_keywords=with_author_keywords,
                              rebuild_cache=rebuild_cache,
                              only_core_tags=only_core_tags,
                              extract_acronyms=extract_acronyms)
        except TaxonomyError as e:
            obj.log.exception(e)
            return

        clean_instances_from_data(result.get("complete_output", {}))

        final_result = {"dict": result}
        # NOTE(review): this reports the configured ``fast_mode``, not the
        # effective ``is_fast_mode`` (which is True after the text fallback).
        # Kept as is; confirm which value the template should show.
        final_result["fast_mode"] = fast_mode
        # Check if it is not empty output before adding
        output = result.get("complete_output", {}).values()
        if not any(output):
            final_result["dict"] = {}
        name = "classification"
        obj.update_task_results(
            name,
            [{
                "name": name,
                "result": final_result,
                "template": "workflows/results/classifier.html"
            }]
        )
예제 #7
0
def add_core(obj, eng):
    """Tag the record with the CORE collection unless it already has it.

    A record counts as tagged when any value of any collection entry equals
    "core", compared case-insensitively. The model is updated in either case.

    :param obj: workflow object being processed
    :param eng: workflow engine providing ``workflow_definition.model``
    """
    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    record_collections = record.get("collections", [])

    # Gather every value that already marks the record as CORE.
    core_values = []
    for entry in record_collections:
        for value in entry.values():
            if value.lower() == "core":
                core_values.append(value)

    if not core_values:
        record_collections.append({"primary": "CORE"})
        record["collections"] = record_collections
    model.update()
예제 #8
0
    def _arxiv_fulltext_download(obj, eng):
        """Attach the arXiv PDF to the record as an ``fft`` entry.

        Uses the file named ``<arxiv_id>.pdf`` already attached to the model
        when there is one. Otherwise the PDF is fetched, attached, and its
        value stored in ``obj.extra_data["pdf"]``. ``doctype`` comes from the
        enclosing scope.

        :param obj: workflow object being processed
        :param eng: workflow engine providing ``workflow_definition.model``
        :return: None
        """
        model = eng.workflow_definition.model(obj)
        record = get_record_from_model(model)
        arxiv_id = get_arxiv_id_from_record(record)
        stored_file = get_file_by_name(model, "{0}.pdf".format(arxiv_id))

        if stored_file:
            pdf = stored_file.get_syspath()
        else:
            # Nothing attached yet: fetch it and attach it to the model.
            pdf = get_pdf_for_model(eng, arxiv_id)
            if pdf is None:
                obj.log.error("No pdf found")
                return
            pdf = add_file_by_name(model, pdf)
            obj.extra_data["pdf"] = pdf

        if not pdf:
            obj.log.info("No PDF found.")
            return

        fft_entry = {
            "url": pdf,
            "docfile_type": doctype
        }
        if "fft" in record:
            record["fft"].append(fft_entry)
        else:
            record.update({"fft": [fft_entry]})

        pdf_name = os.path.basename(pdf)
        obj.update_task_results(
            pdf_name,
            [{
                "name": "PDF",
                "result": {
                    "type": "fulltext",
                    "filename": pdf_name,
                    "full_path": pdf,
                },
                "template": "workflows/results/files.html"
            }]
        )
        model.update()
예제 #9
0
    def _previously_rejected(obj, eng):
        """Tell whether the record is old enough to count as rejected before.

        The check only runs when ``PRODUCTION_MODE`` is set in ``cfg``. The
        age limit is ``days_ago`` from the enclosing scope, or
        ``INSPIRE_ACCEPTANCE_TIMEOUT`` from ``cfg`` (default 5) when
        ``days_ago`` is None.

        :param obj: workflow object being processed
        :param eng: workflow engine providing ``workflow_definition.model``
        :return: True when ``is_too_old`` accepts the record, False otherwise
        """
        if not cfg.get("PRODUCTION_MODE"):
            return False

        record = get_record_from_model(eng.workflow_definition.model(obj))

        age_limit = days_ago
        if age_limit is None:
            age_limit = cfg.get("INSPIRE_ACCEPTANCE_TIMEOUT", 5)

        if not is_too_old(record, days_ago=age_limit):
            return False

        obj.log.info("Record is likely rejected previously.")
        return True
예제 #10
0
    def get_record(cls, obj, **kwargs):
        """Return a dictionary-like representation of the given object.

        The result is used for indexing and as the basis for display in
        Holding Pen.

        :param obj: workflow object; ``obj.data`` is loaded through
            ``obj.get_data()`` when the attribute is missing
        :return: ``record.dumps()`` for a truthy record; ``{}`` when the data
            is a text string or no record is found
        """
        if not hasattr(obj, "data"):
            obj.data = obj.get_data()

        # Text data cannot be wrapped into a model.
        if isinstance(obj.data, six.text_type):
            return {}

        found = get_record_from_model(cls.model(obj))
        return found.dumps() if found else {}
예제 #11
0
 def get_title(cls, obj, **kwargs):
     """Return the value to put in the title column of Holding Pen.

     :param obj: workflow object; ``obj.data`` is loaded through
         ``obj.get_data()`` when the attribute is missing
     :return: the first truthy title of the record, a "disabled" notice when
         building the model raises ``InvalidDepositionType``, or
         "No title available" otherwise
     """
     if not hasattr(obj, "data"):
         obj.data = obj.get_data()
     if isinstance(obj.data, dict):
         try:
             model = cls.model(obj)
         except InvalidDepositionType:
             return "This submission is disabled: {0}.".format(obj.workflow.name)
         record = get_record_from_model(model)
         if record:
             # Build a real list: on Python 3 ``filter`` returns a lazy
             # iterator that is always truthy and cannot be indexed.
             titles = [title for title in record.get("titles.title", [])
                       if title]
             if titles:
                 # Show first title that evaluates to True
                 return titles[0]
     return "No title available"
예제 #12
0
 def formatter(cls, obj, **kwargs):
     """Render the record held by ``obj`` for display.

     :param obj: workflow object to format
     :param kwargs: when ``of`` is given and truthy, the record is passed to
         ``format_record`` with that output format; otherwise the detailed
         Holding Pen template is rendered
     :return: the rendered output, ``""`` when there is no record, or an
         "Error: ..." string when building the record raises ``TypeError``
     """
     try:
         record = get_record_from_model(cls.model(obj))
     except TypeError as err:
         return "Error: {0}".format(err)

     if not record:
         return ""

     output_format = kwargs.get('of')
     if not output_format:
         return render_template(
             'format/record/Holding_Pen_HTML_detailed.tpl',
             record=record
         )

     # Set a placeholder recid when the record has none.
     if "recid" not in record:
         record['recid'] = None
     return format_record(record=record, of=output_format)
예제 #13
0
def match(obj, eng):
    """Return True if the record already exists in INSPIRE.

    Searches by arXiv identifier and by DOI, and stores the de-duplicated
    ids (as strings) under ``obj.extra_data["record_matches"]``.

    :param obj: workflow object being processed
    :param eng: workflow engine providing ``workflow_definition.model``
    :return: True when at least one id was found, False otherwise
    """
    record = get_record_from_model(eng.workflow_definition.model(obj))

    arxiv_hits = set(match_by_arxiv_id(record))
    doi_hits = set(match_by_doi(record))
    recids = [str(recid) for recid in list(arxiv_hits | doi_hits)]

    obj.extra_data["record_matches"] = {
        "recids": recids,
        "records": [],
        "base_url": os.path.join(cfg["CFG_ROBOTUPLOAD_SUBMISSION_BASEURL"], "record"),
    }
    return bool(recids)
예제 #14
0
def arxiv_refextract(obj, eng):
    """Extract references from the arXiv PDF and store them on the record.

    Uses the file named ``<arxiv_id>.pdf`` already attached to the model when
    there is one; otherwise the PDF is fetched and attached first.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    arxiv_id = get_arxiv_id_from_record(record)
    stored_file = get_file_by_name(model, "{0}.pdf".format(arxiv_id))

    if stored_file:
        pdf = stored_file.get_syspath()
    else:
        # Nothing attached yet: fetch it and attach it to the model.
        pdf = get_pdf_for_model(eng, arxiv_id)
        if pdf is None:
            obj.log.error("No pdf found")
            return
        add_file_by_name(model, pdf)

    if not (pdf and os.path.isfile(pdf)):
        obj.log.error("Not able to download and process the PDF")
        return

    mapped_references = extract_references(pdf)
    if not mapped_references:
        obj.log.info("No references extracted")
        return

    record["references"] = mapped_references
    reference_count = len(mapped_references)
    obj.log.info("Extracted {0} references".format(reference_count))
    obj.update_task_results(
        "References",
        [{"name": "References",
          "result": mapped_references,
          "template": "workflows/results/refextract.html"}]
    )
    model.update()
예제 #15
0
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive and add them to the record.

    Uses the file named ``<arxiv_id>.tar.gz`` already attached to the model
    when there is one; otherwise the tarball is fetched and attached first.

    :param obj: workflow object being processed
    :param eng: workflow engine providing ``workflow_definition.model``
    """
    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    arxiv_id = get_arxiv_id_from_record(record)
    stored_file = get_file_by_name(model, "{0}.tar.gz".format(arxiv_id))

    if stored_file:
        tarball = stored_file.get_syspath()
    else:
        # Nothing attached yet: fetch it and attach it to the model.
        tarball = get_tarball_for_model(eng, arxiv_id)
        if tarball is None:
            obj.log.error("No tarball found")
            return
        add_file_by_name(model, tarball)

    try:
        plots = process_tarball(tarball)
    except InvalidTarball:
        eng.log.error('Invalid tarball {0}'.format(tarball))
        return

    if not plots:
        return

    # Merge the JSON built from the extracted plots into the record.
    plots_json = get_json_for_plots(plots)
    record.update(plots_json)
    obj.update_task_results(
        "Plots",
        [{
            "name": "Plots",
            "result": plots_json["fft"],
            "template": "workflows/results/plots.html"
        }]
    )
    plot_count = len(plots_json["fft"])
    obj.log.info("Added {0} plots.".format(plot_count))
    model.update()
예제 #16
0
def extract_journal_info(obj, eng):
    """Extract journal, volume etc. from any freetext publication info.

    Each ``publication_info`` entry that has a ``pubinfo_freetext`` value is
    passed to ``extract_journal_reference``; the volume, title, year and page
    it returns are copied onto the entry. Does nothing when the record has no
    publication info.

    :param obj: workflow object being processed
    :param eng: workflow engine providing ``workflow_definition.model``
    """
    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)

    pubnotes = record.get("publication_info")
    if not pubnotes:
        return

    # (key in the extraction result, key written on the pubnote), in order.
    field_map = (
        ("volume", "journal_volume"),
        ("title", "journal_title"),
        ("year", "year"),
        ("page", "page_artid"),
    )

    updated_pubnotes = []
    for pubnote in pubnotes:
        extracted = None
        freetext = pubnote.get("pubinfo_freetext")
        if freetext:
            extracted = extract_journal_reference(
                freetext,
                override_kbs_files={'journals': get_mappings_from_kbname(cfg['REFEXTRACT_KB_NAME'])}
            )
        if extracted:
            for source_key, target_key in field_map:
                if source_key in extracted:
                    pubnote[target_key] = extracted.get(source_key)
        updated_pubnotes.append(pubnote)

    record["publication_info"] = updated_pubnotes
    model.update()
예제 #17
0
    def get_description(bwo):
        """Get the description column part.

        Collects identifiers (DOIs and external system numbers), de-duplicated
        subject categories, the first abstract and the authors from the
        record, and renders them with the harvesting record template.

        :param bwo: workflow object; ``bwo.data`` must be a dict for a
            description to be built
        :return: the rendered template, or "No description found." when
            ``bwo.data`` is not a dict
        """
        if not isinstance(bwo.data, dict):
            return "No description found."
        model = hep_ingestion.model(bwo)
        record = get_record_from_model(model)
        abstract = ""
        authors = []
        categories = []
        final_identifiers = []
        if hasattr(record, "get"):
            # Get identifiers
            dois = record.get("dois.value", [])
            if dois:
                final_identifiers.extend(dois)

            system_no = record.get("external_system_numbers.value", [])
            if system_no:
                final_identifiers.extend(system_no)

            # Get subject categories, adding main one first. Order matters here.
            record_categories = record.get("arxiv_eprints.categories", []) + \
                record.get("subject_terms.term", [])
            for category_list in record_categories:
                if isinstance(category_list, list):
                    categories.extend(category_list)
                else:
                    categories.append(category_list)
            categories = list(OrderedDict.fromkeys(categories))  # Unique only
            # Guard against an empty list: indexing it directly would raise
            # IndexError and break the whole description.
            abstracts = record.get("abstracts.value", [])
            abstract = abstracts[0] if abstracts else ""
            authors = record.get("authors", [])
        return render_template('workflows/styles/harvesting_record.html',
                               object=bwo,
                               authors=authors,
                               categories=categories,
                               abstract=abstract,
                               identifiers=final_identifiers)