예제 #1
0
    def _workflows_reviews(obj, eng):
        """Publish the sub-workflow counters on ``obj`` and react to failures.

        The ``_nb_workflow``, ``_nb_workflow_finish`` and
        ``_nb_workflow_failed`` counters are read from ``eng.extra_data``,
        stored as the "review_workflow" task result of ``obj`` and logged.

        When the total is zero the function returns right after logging.
        Otherwise a WorkflowError is raised if some workflows failed and the
        enclosing ``stop_if_error`` flag is set, and the three counters are
        reset to zero if the enclosing ``clean`` flag is set.

        :param obj: object receiving the "review_workflow" task result
        :param eng: engine providing ``extra_data``, ``log`` and ``uuid``
        """
        finished = eng.extra_data["_nb_workflow_finish"]
        failed = eng.extra_data["_nb_workflow_failed"]
        total = eng.extra_data["_nb_workflow"]

        review = {
            "name": "wait_for_a_workflow_to_complete",
            "template": "workflows/results/default.html",
            "result": {"finished": finished, "failed": failed, "total": total},
        }
        obj.update_task_results("review_workflow", [review])

        eng.log.info("{0}/{1} finished successfully".format(finished, total))

        if total == 0:
            # Nothing has been harvested, so there is nothing to review.
            eng.log.info("Nothing harvested.")
            return

        if failed and stop_if_error:
            raise WorkflowError(
                "%s / %s failed" % (failed, total),
                eng.uuid,
                obj.id,
                payload=eng.extra_data["_uuid_workflow_crashed"])

        if clean:
            # Same reset order as the counters are listed here.
            for counter in ("_nb_workflow_failed",
                            "_nb_workflow",
                            "_nb_workflow_finish"):
                eng.extra_data[counter] = 0
예제 #2
0
def harvest_records(obj, eng):
    """Run the harvesting task.

    Harvests per identifier when ``obj.extra_data["options"]["identifiers"]``
    is non-empty and per date range otherwise, then stores the resulting
    list under ``obj.extra_data['harvested_files_list']``.

    Nothing is returned; a failure is reported by raising.

    :param obj: BibworkflowObject being processed
    :param eng: BibWorkflowEngine processing the object
    :raises WorkflowError: when the number of harvested files differs from
        the number of identifiers collected from them
    """
    from invenio.modules.oaiharvester.utils import collect_identifiers
    from invenio.modules.workflows.errors import WorkflowError

    harvestpath = "%s_%d_%s_" % ("%s/oaiharvest_%s" %
                                 (cfg['CFG_TMPSHAREDDIR'], eng.uuid), 1,
                                 time.strftime("%Y%m%d%H%M%S"))

    # ## go ahead: check if user requested from-until harvesting
    try:
        if "dates" not in obj.extra_data["options"]:
            obj.extra_data["options"]["dates"] = []
        if "identifiers" not in obj.extra_data["options"]:
            obj.extra_data["options"]["identifiers"] = []
    except (KeyError, TypeError):
        # KeyError: no "options" entry at all; TypeError: the entry is not
        # a container (e.g. None). Both mean "no options were given".
        obj.extra_data["options"] = {"dates": [], "identifiers": []}

    arguments = obj.extra_data["repository"]["arguments"]
    if arguments:
        eng.log.info("running with post-processes: %r" % (arguments, ))
    else:
        eng.log.error(
            "No arguments found... It can be causing major error after this point."
        )

    # Harvest phase
    if obj.extra_data["options"]["identifiers"]:
        # Harvesting is done per identifier instead of server-updates
        harvested_files_list = harvest_by_identifiers(obj, harvestpath)
    else:
        harvested_files_list = harvest_by_dates(obj, harvestpath)

    if len(harvested_files_list) == 0:
        # Only logged: the consistency check below still runs on the
        # empty list.
        eng.log.info("No records harvested for %s" % (obj.data["name"], ))

    # Retrieve all OAI IDs found in the harvested files
    harvested_identifiers = collect_identifiers(harvested_files_list)

    if len(harvested_files_list) != len(harvested_identifiers):
        # Harvested files and its identifiers are 'out of sync', abort harvest
        raise WorkflowError("Harvested files miss identifiers for %s" %
                            (arguments, ),
                            id_workflow=eng.uuid,
                            id_object=obj.id)
    obj.extra_data['harvested_files_list'] = harvested_files_list
    eng.log.info(
        "%d files harvested and processed \n End harvest records task" %
        (len(harvested_files_list), ))
예제 #3
0
    def _convert_record(obj, eng):
        """Replace ``obj.data`` with its conversion through ``stylesheet``.

        ``stylesheet`` comes from the enclosing scope. Any exception raised
        during the conversion is re-raised as a WorkflowError carrying the
        original message and traceback text.

        :param obj: object whose ``data`` is converted in place
        :param eng: engine used for logging and for the workflow uuid
        :raises WorkflowError: when ``obj.data`` is empty or the conversion
            fails
        """
        from invenio.modules.workflows.errors import WorkflowError
        from invenio.legacy.bibconvert.xslt_engine import convert

        eng.log.info(
            "Starting conversion using %s stylesheet" % (stylesheet, ))

        if not obj.data:
            obj.log.error("Not valid conversion data!")
            raise WorkflowError(
                "Error: conversion data missing",
                id_workflow=eng.uuid,
                id_object=obj.id)

        try:
            obj.data = convert(obj.data, stylesheet)
        except Exception as error:
            # Keep the full traceback in the message: the original exception
            # object is not attached to the WorkflowError.
            details = "Could not convert record: %s\n%s" % (
                str(error), traceback.format_exc())
            failure = "Error: %s" % (details, )
            raise WorkflowError(
                failure,
                id_workflow=eng.uuid,
                id_object=obj.id)
예제 #4
0
    def _send_robotupload(obj, eng):
        """Submit the latest SIP of the deposition as a robotupload.

        The target is the enclosing ``url`` when one was given, otherwise
        the ``CFG_ROBOTUPLOAD_SUBMISSION_BASEURL`` configuration value.
        On success the workflow is halted until the callback arrives.

        :param obj: workflow object wrapping the deposition
        :param eng: engine processing the object
        :raises WorkflowError: when no SIP is found or when the response
            text does not contain "[INFO]"
        """
        from invenio.modules.deposit.models import Deposition
        from invenio.modules.workflows.errors import WorkflowError
        from inspire.utils.robotupload import make_robotupload_marcxml
        from invenio.base.globals import cfg

        d = Deposition(obj)

        sip = d.get_latest_sip(d.submitted)
        if not sip:
            raise WorkflowError("No sip found", eng.uuid, obj.id)
        if not d.submitted:
            sip.seal()
            d.update()

        # An explicit ``url`` wins; the configured base URL is only the
        # fallback. ``base_url`` must be bound on both paths.
        base_url = url
        if base_url is None:
            base_url = cfg.get("CFG_ROBOTUPLOAD_SUBMISSION_BASEURL")

        # NOTE(review): os.path.join builds a URL here, which only yields
        # "/" separators on POSIX hosts.
        callback_url = os.path.join(cfg["CFG_SITE_URL"],
                                    "callback/workflows/robotupload")
        obj.log.info("Sending Robotupload to {0} with callback {1}".format(
            base_url, callback_url))
        result = make_robotupload_marcxml(url=base_url,
                                          marcxml=sip.package,
                                          callback_url=callback_url,
                                          nonce=obj.id)
        if "[INFO]" not in result.text:
            if "cannot use the service" in result.text:
                # IP not in the list
                obj.log.error("Your IP is not in "
                              "CFG_BATCHUPLOADER_WEB_ROBOT_RIGHTS "
                              "on host")
                obj.log.error(result.text)
            txt = "Error while submitting robotupload: {0}".format(result.text)
            raise WorkflowError(txt, eng.uuid, obj.id)
        else:
            obj.log.info("Robotupload sent!")
            obj.log.info(result.text)
            eng.halt("Waiting for robotupload: {0}".format(result.text))
예제 #5
0
def author_list(obj, eng):
    """Perform the special authorlist extraction step.

    Harvests the record's tarball unless its path is already cached in
    ``obj.extra_data["_result"]["tarball"]``, extracts it and scans the
    XML files found. The first file matching ``REGEXP_AUTHLIST`` is
    converted to MARCXML and its authors are stored in ``obj.data`` and
    in the task results.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    :raises WorkflowError: when no tarball could be harvested
    """
    from invenio.legacy.oaiharvest.utils import (translate_fieldvalues_from_latex,
                                                 find_matching_files)
    from invenio.legacy.bibrecord import create_records, record_xml_output
    from invenio.legacy.bibconvert.xslt_engine import convert
    from invenio.utils.plotextractor.cli import get_defaults
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.plotextractor.getter import harvest_single
    from invenio.modules.workflows.errors import WorkflowError
    from invenio.utils.plotextractor.converter import untar
    from invenio.utils.shell import Timeout

    identifiers = obj.data["system_control_number"]["value"]
    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}
    if "tarball" not in obj.extra_data["_result"]:
        extract_path = os.path.join(
            cfg['CFG_TMPSHAREDDIR'],
            str(eng.uuid)
        )
        if not os.path.exists(extract_path):
            os.makedirs(extract_path)
        tarball, dummy_pdf = harvest_single(
            identifiers, extract_path, ["tarball"])
        # Test for None *before* str(): str(None) is the non-None string
        # "None", which would hide a failed harvest.
        if tarball is None:
            raise WorkflowError(str(
                "Error harvesting tarball from id: %s %s" % (
                    identifiers, extract_path)), eng.uuid, id_object=obj.id)
        obj.extra_data["_result"]["tarball"] = str(tarball)

    sub_dir, dummy = get_defaults(obj.extra_data["_result"]["tarball"],
                                  cfg['CFG_TMPDIR'], "")

    try:
        untar(obj.extra_data["_result"]["tarball"], sub_dir)
        obj.log.info("Extracted tarball to: {0}".format(sub_dir))
    except Timeout:
        # Best effort: whatever was extracted before the timeout is still
        # scanned below.
        eng.log.error('Timeout during tarball extraction on %s' % (
            obj.extra_data["_result"]["tarball"]))

    xml_files_list = find_matching_files(sub_dir, ["xml"])

    obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

    for xml_file in xml_files_list:
        # The context manager closes the file even if read() raises.
        with open(xml_file, "r") as xml_file_fd:
            xml_content = xml_file_fd.read()

        if not REGEXP_AUTHLIST.findall(xml_content):
            continue

        obj.log.info("Found a match for author extraction")

        a_stylesheet = obj.extra_data["repository"]["arguments"].get(
            "a_stylesheet"
        ) or "authorlist2marcxml.xsl"
        authors = convert(xml_content, a_stylesheet)
        authorlist_record = create_records(authors)
        if len(authorlist_record) == 1:
            if authorlist_record[0][0] is None:
                eng.log.error("Error parsing authorlist record for id: %s" % (
                    identifiers,))
            authorlist_record = authorlist_record[0][0]
        # NOTE(review): when create_records() does not return exactly one
        # entry, the whole list is passed on below -- confirm this is
        # intended.
        # Convert any LaTeX symbols in authornames
        translate_fieldvalues_from_latex(authorlist_record, '100', code='a')
        translate_fieldvalues_from_latex(authorlist_record, '700', code='a')

        updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n' \
                      + record_xml_output(authorlist_record) + '</collection>'

        new_dict_representation = convert_marcxml_to_bibfield(updated_xml)
        obj.data['authors'] = new_dict_representation["authors"]
        obj.data['number_of_authors'] = new_dict_representation[
            "number_of_authors"]
        obj.add_task_result("authors", new_dict_representation["authors"])
        obj.add_task_result("number_of_authors",
                            new_dict_representation["number_of_authors"])
        # Only the first matching author list is used.
        break
예제 #6
0
    def _plot_extract(obj, eng):
        """Perform the plotextraction step.

        Download tarball for each harvested/converted record,
        then run plotextractor.

        Update converted xml files with generated xml or add it for upload.

        The step only runs when 'latex' is among the extraction sources,
        taken from the repository argument 'p_extraction-source' or, when
        that is absent, from the enclosing ``plotextractor_types``.

        :param obj: workflow object to process
        :param eng: engine processing the object
        :raises WorkflowError: when no tarball could be harvested
        """
        from invenio.utils.plotextractor.output_utils import (create_MARC,
                                                              create_contextfiles,
                                                              prepare_image_data,
                                                              remove_dups)
        from invenio.utils.plotextractor.cli import (get_defaults, extract_captions,
                                                     extract_context)
        from invenio.utils.plotextractor.converter import convert_images
        from invenio.utils.plotextractor.getter import harvest_single
        from invenio.utils.plotextractor.converter import untar
        from invenio.modules.workflows.errors import WorkflowError
        from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
        from invenio.utils.shell import run_shell_command, Timeout

        if "_result" not in obj.extra_data:
            obj.extra_data["_result"] = {}

        repository = obj.extra_data.get("repository", {})
        arguments = repository.get("arguments", {})

        p_extraction_source = arguments.get('p_extraction-source',
                                            plotextractor_types)
        if not isinstance(p_extraction_source, list):
            p_extraction_source = [p_extraction_source]

        if 'latex' not in p_extraction_source:
            return

        # Run LaTeX plotextractor
        if "tarball" not in obj.extra_data["_result"]:
            record_id = obj.data["system_control_number"]["value"]
            extract_path = os.path.join(
                cfg['CFG_TMPSHAREDDIR'],
                str(eng.uuid)
            )
            if not os.path.exists(extract_path):
                os.makedirs(extract_path)
            tarball, dummy_pdf = harvest_single(
                record_id, extract_path, ["tarball"])
            # Test for None *before* str(): str(None) is the non-None
            # string "None", which would hide a failed harvest.
            if tarball is None:
                raise WorkflowError(
                    str("Error harvesting tarball from id: %s %s" %
                        (record_id, extract_path)),
                    eng.uuid,
                    id_object=obj.id)
            tarball = str(tarball)
            obj.extra_data["_result"]["tarball"] = tarball
        else:
            tarball = obj.extra_data["_result"]["tarball"]

        sub_dir, dummy_refno = get_defaults(tarball, cfg['CFG_TMPDIR'], "")

        tex_files = None
        # Default to an empty list so that a timeout below does not end in
        # len(None) when the conversion summary is logged.
        image_list = []
        try:
            dummy_files, image_list, tex_files = untar(tarball, sub_dir)
        except Timeout:
            eng.log.error(
                'Timeout during tarball extraction on %s' % (tarball,))

        converted_image_list = convert_images(image_list)
        eng.log.info('converted %d of %d images found for %s' % (
            len(converted_image_list),
            len(image_list),
            os.path.basename(tarball)))

        extracted_image_data = []
        if not tex_files:
            eng.log.error(
                '%s is not a tarball' % (os.path.split(tarball)[-1],))
            run_shell_command('rm -r %s', (sub_dir,))
        else:
            for tex_file in tex_files:
                # Extract images, captions and labels
                partly_extracted_image_data = extract_captions(
                    tex_file, sub_dir, converted_image_list)
                if partly_extracted_image_data:
                    # Add proper filepaths and do various cleaning
                    cleaned_image_data = prepare_image_data(
                        partly_extracted_image_data,
                        tex_file, converted_image_list)
                    # Using prev. extracted info, get contexts for each
                    # image found
                    extracted_image_data.extend(
                        extract_context(tex_file, cleaned_image_data))

        if not extracted_image_data:
            return

        extracted_image_data = remove_dups(extracted_image_data)
        create_contextfiles(extracted_image_data)
        marc_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n'
        marc_xml += create_MARC(extracted_image_data, tarball, None)
        marc_xml += "\n</collection>"

        # Read and grab MARCXML from plotextractor run
        new_dict = convert_marcxml_to_bibfield(marc_xml)

        new_fft = new_dict["fft"]
        # Normalise to a flat list so that both branches below store the
        # entries the same way (no nested list when 'fft' is created).
        fft_entries = new_fft if isinstance(new_fft, list) else [new_fft]
        try:
            for element in fft_entries:
                obj.data['fft'].append(element)
        except KeyError:
            # The record had no 'fft' field yet.
            obj.data['fft'] = list(fft_entries)

        obj.add_task_result("filesfft", new_fft)
        obj.add_task_result("number_picture_converted",
                            len(converted_image_list))
        obj.add_task_result("number_of_picture_total",
                            len(image_list))