def _plot_extract(obj, eng):
        from invenio.utils.plotextractor.api import (
            get_tarball_from_arxiv,
            get_marcxml_plots_from_tarball
        )
        from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
        from invenio.utils.shell import Timeout

        if "_result" not in obj.extra_data:
            obj.extra_data["_result"] = {}

        repository = obj.extra_data.get("repository", {})
        arguments = repository.get("arguments", {})

        chosen_type = plotextractor_types

        if not chosen_type:
            chosen_type = arguments.get('p_extraction-source', [])

        if not isinstance(chosen_type, list):
            chosen_type = [chosen_type]

        if 'latex' in chosen_type:
            # Run LaTeX plotextractor
            if "tarball" not in obj.extra_data["_result"]:
                extract_path = os.path.join(
                    cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
                    str(eng.uuid)
                )
                tarball = get_tarball_from_arxiv(
                    obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
                    extract_path
                )
                if tarball is None:
                    obj.log.error("No tarball found")
                    return
                obj.extra_data["_result"]["tarball"] = tarball
            else:
                tarball = obj.extra_data["_result"]["tarball"]

            marcxml = None
            try:
                marcxml = get_marcxml_plots_from_tarball(tarball)
            except Timeout:
                eng.log.error(
                    'Timeout during tarball extraction on {0}'.format(tarball)
                )
            if marcxml:
                # Convert the generated MARCXML into a BibField dict and
                # attach the extracted plot files to the object
                new_dict = convert_marcxml_to_bibfield(marcxml)
                _attach_files_to_obj(obj, new_dict)
                obj.update_task_results(
                    "Plots",
                    [{
                        "name": "Plots",
                        "result": new_dict["fft"],
                        "template": "workflows/results/plots.html"
                    }]
                )
Example #2
def refextract(obj, eng):
    """Perform the reference extraction step.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.refextract.api import extract_references_from_file_xml
    from invenio.utils.plotextractor.getter import harvest_single
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    pdf = obj.extra_data["_result"].get("pdf")

    if not pdf:
        extract_path = os.path.join(
            cfg['CFG_TMPSHAREDDIR'],
            str(eng.uuid)
        )
        if not os.path.exists(extract_path):
            os.makedirs(extract_path)
        tarball, pdf = harvest_single(
            obj.data["system_control_number"]["value"], extract_path, ["pdf"]
        )
        obj.extra_data["_result"]["pdf"] = pdf

    if pdf and os.path.isfile(obj.extra_data["_result"]["pdf"]):
        references_xml = extract_references_from_file_xml(
            obj.extra_data["_result"]["pdf"])
        if references_xml:
            obj.log.info("Found references: {0}".format(references_xml))
            updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n' \
                          '<collection>\n' + references_xml + \
                          "\n</collection>"

            new_dict_representation = convert_marcxml_to_bibfield(updated_xml)
            if "reference" in new_dict_representation:
                try:
                    obj.data['reference'].append(
                        new_dict_representation["reference"])
                except KeyError:
                    obj.data['reference'] = [
                        new_dict_representation['reference']]
                obj.add_task_result("References",
                                    new_dict_representation['reference'],
                                    "workflows/results/refextract.html")
        else:
            obj.log.info("No references")
    else:
        obj.log.error("Not able to download and process the PDF ")
Example #3
def arxiv_refextract(obj, eng):
    """Perform the reference extraction step.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.refextract.api import extract_references_from_file_xml
    from invenio.utils.plotextractor.api import get_pdf_from_arxiv
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    try:
        pdf = obj.extra_data["_result"]["pdf"]
    except KeyError:
        pdf = None

    if not pdf:
        extract_path = os.path.join(
            cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
            str(eng.uuid)
        )
        pdf = get_pdf_from_arxiv(
            obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
            extract_path
        )
        obj.extra_data["_result"]["pdf"] = pdf

    if pdf and os.path.isfile(pdf):
        references_xml = extract_references_from_file_xml(
            obj.extra_data["_result"]["pdf"]
        )
        if references_xml:
            updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n' \
                          '<collection>\n' + references_xml + \
                          "\n</collection>"
            new_dict_representation = convert_marcxml_to_bibfield(updated_xml)
            if "reference" in new_dict_representation:
                obj.data["reference"] = new_dict_representation["reference"]
                obj.log.info("Extracted {0} references".format(len(obj.data["reference"])))
                obj.update_task_results(
                    "References",
                    [{"name": "References",
                      "result": new_dict_representation['reference'],
                      "template": "workflows/results/refextract.html"}]
                )
                return
        else:
            obj.log.info("No references extracted")
    else:
        obj.log.error("Not able to download and process the PDF")
Example #4
    def _plot_extract(obj, eng):
        from invenio.utils.plotextractor.api import (
            get_tarball_from_arxiv, get_marcxml_plots_from_tarball)
        from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
        from invenio.utils.shell import Timeout

        if "_result" not in obj.extra_data:
            obj.extra_data["_result"] = {}

        repository = obj.extra_data.get("repository", {})
        arguments = repository.get("arguments", {})

        chosen_type = plotextractor_types

        if not chosen_type:
            chosen_type = arguments.get('p_extraction-source', [])

        if not isinstance(chosen_type, list):
            chosen_type = [chosen_type]

        if 'latex' in chosen_type:
            # Run LaTeX plotextractor
            if "tarball" not in obj.extra_data["_result"]:
                extract_path = os.path.join(
                    cfg.get('OAIHARVESTER_STORAGEDIR',
                            cfg.get('CFG_TMPSHAREDDIR')), str(eng.uuid))
                tarball = get_tarball_from_arxiv(
                    obj.data.get(
                        cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
                    extract_path)
                if tarball is None:
                    obj.log.error("No tarball found")
                    return
                obj.extra_data["_result"]["tarball"] = tarball
            else:
                tarball = obj.extra_data["_result"]["tarball"]

            marcxml = None
            try:
                marcxml = get_marcxml_plots_from_tarball(tarball)
            except Timeout:
                eng.log.error(
                    'Timeout during tarball extraction on {0}'.format(tarball))
            if marcxml:
                # Convert the generated MARCXML into a BibField dict and
                # attach the extracted plot files to the object
                new_dict = convert_marcxml_to_bibfield(marcxml)
                _attach_files_to_obj(obj, new_dict)
                obj.update_task_results(
                    "Plots", [{
                        "name": "Plots",
                        "result": new_dict["fft"],
                        "template": "workflows/results/plots.html"
                    }])
Example #5
def arxiv_refextract(obj, eng):
    """Perform the reference extraction step.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.refextract.api import extract_references_from_file_xml
    from invenio.utils.plotextractor.api import get_pdf_from_arxiv
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    try:
        pdf = obj.extra_data["_result"]["pdf"]
    except KeyError:
        pdf = None

    if not pdf:
        extract_path = os.path.join(
            cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
            str(eng.uuid))
        pdf = get_pdf_from_arxiv(
            obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
            extract_path)
        obj.extra_data["_result"]["pdf"] = pdf

    if pdf and os.path.isfile(pdf):
        references_xml = extract_references_from_file_xml(
            obj.extra_data["_result"]["pdf"])
        if references_xml:
            updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n' \
                          '<collection>\n' + references_xml + \
                          "\n</collection>"
            new_dict_representation = convert_marcxml_to_bibfield(updated_xml)
            if "reference" in new_dict_representation:
                obj.data["reference"] = new_dict_representation["reference"]
                obj.log.info("Extracted {0} references".format(
                    len(obj.data["reference"])))
                obj.update_task_results(
                    "References",
                    [{
                        "name": "References",
                        "result": new_dict_representation['reference'],
                        "template": "workflows/results/refextract.html"
                    }])
                return
        else:
            obj.log.info("No references extracted")
    else:
        obj.log.error("Not able to download and process the PDF")
Example #6
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive."""
    from invenio.utils.plotextractor.api import (
        get_tarball_from_arxiv,
        get_marcxml_plots_from_tarball
    )
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.shell import Timeout

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    if "tarball" not in obj.extra_data["_result"]:
        extract_path = os.path.join(
            cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
            str(eng.uuid)
        )
        tarball = get_tarball_from_arxiv(
            obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
            extract_path
        )
        if tarball is None:
            obj.log.error("No tarball found")
            return
        obj.extra_data["_result"]["tarball"] = tarball
    else:
        tarball = obj.extra_data["_result"]["tarball"]

    marcxml = None
    try:
        marcxml = get_marcxml_plots_from_tarball(tarball)
    except Timeout:
        eng.log.error(
            'Timeout during tarball extraction on {0}'.format(tarball)
        )
    if marcxml:
        # Convert the generated MARCXML into a BibField dict and
        # attach the extracted plot files to the object
        new_dict = convert_marcxml_to_bibfield(marcxml)
        _attach_files_to_obj(obj, new_dict)
        obj.update_task_results(
            "Plots",
            [{
                "name": "Plots",
                "result": new_dict["fft"],
                "template": "workflows/results/plots.html"
            }]
        )
        obj.log.info("Added {0} plots.".format(len(new_dict["fft"])))
Example #7
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive."""
    from invenio.utils.plotextractor.api import (get_tarball_from_arxiv,
                                                 get_marcxml_plots_from_tarball
                                                 )
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.shell import Timeout

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    if "tarball" not in obj.extra_data["_result"]:
        extract_path = os.path.join(
            cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
            str(eng.uuid))
        tarball = get_tarball_from_arxiv(
            obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
            extract_path)
        if tarball is None:
            obj.log.error("No tarball found")
            return
        obj.extra_data["_result"]["tarball"] = tarball
    else:
        tarball = obj.extra_data["_result"]["tarball"]

    marcxml = None
    try:
        marcxml = get_marcxml_plots_from_tarball(tarball)
    except Timeout:
        eng.log.error(
            'Timeout during tarball extraction on {0}'.format(tarball))
    if marcxml:
        # Convert the generated MARCXML into a BibField dict and
        # attach the extracted plot files to the object
        new_dict = convert_marcxml_to_bibfield(marcxml)
        _attach_files_to_obj(obj, new_dict)
        obj.update_task_results("Plots",
                                [{
                                    "name": "Plots",
                                    "result": new_dict["fft"],
                                    "template": "workflows/results/plots.html"
                                }])
        obj.log.info("Added {0} plots.".format(len(new_dict["fft"])))
Example #8
    def _author_list(obj, eng):
        from invenio.legacy.bibrecord import create_records, record_xml_output
        from invenio.legacy.bibconvert.xslt_engine import convert
        from invenio.utils.plotextractor.api import get_tarball_from_arxiv
        from invenio.utils.plotextractor.cli import get_defaults
        from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
        from invenio.utils.plotextractor.converter import untar
        from invenio.utils.shell import Timeout

        from ..utils import find_matching_files

        identifiers = obj.data.get(
            cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP'), "")
        if "_result" not in obj.extra_data:
            obj.extra_data["_result"] = {}
        if "tarball" not in obj.extra_data["_result"]:
            extract_path = os.path.join(
                cfg.get('OAIHARVESTER_STORAGEDIR',
                        cfg.get('CFG_TMPSHAREDDIR')), str(eng.uuid))
            tarball = get_tarball_from_arxiv(
                obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
                extract_path)
            if tarball is None:
                obj.log.error("No tarball found")
                return
            obj.extra_data["_result"]["tarball"] = tarball
        else:
            tarball = obj.extra_data["_result"]["tarball"]

        # FIXME
        tarball = str(tarball)
        sub_dir, dummy = get_defaults(tarball, cfg['CFG_TMPDIR'], "")

        try:
            untar(tarball, sub_dir)
            obj.log.info("Extracted tarball to: {0}".format(sub_dir))
        except Timeout:
            eng.log.error('Timeout during tarball extraction on %s' %
                          (obj.extra_data["_result"]["tarball"]))

        xml_files_list = find_matching_files(sub_dir, ["xml"])

        obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

        authors = ""

        for xml_file in xml_files_list:
            with open(xml_file, "r") as xml_file_fd:
                xml_content = xml_file_fd.read()

            match = REGEXP_AUTHLIST.findall(xml_content)
            if match:
                obj.log.info("Found a match for author extraction")
                authors = convert(xml_content, stylesheet)
                authorlist_record = create_records(authors)
                if len(authorlist_record) == 1:
                    if authorlist_record[0][0] is None:
                        eng.log.error(
                            "Error parsing authorlist record for id: %s" %
                            (identifiers, ))
                    authorlist_record = authorlist_record[0][0]

                author_xml = record_xml_output(authorlist_record)
                if author_xml:
                    updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n' \
                                  + record_xml_output(authorlist_record) + '</collection>'
                    new_dict_representation = convert_marcxml_to_bibfield(
                        updated_xml)
                    obj.data["authors"] = new_dict_representation["authors"]
                    obj.update_task_results(
                        "authors",
                        [{
                            "name": "authors",
                            "results": new_dict_representation["authors"]
                        }])
                    obj.update_task_results("number_of_authors", [{
                        "name":
                        "number_of_authors",
                        "results":
                        new_dict_representation["number_of_authors"]
                    }])
                    break
Example #9
    def _convert_record_to_bibfield(obj, eng):
        from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
        obj.data = convert_marcxml_to_bibfield(obj.data, model)
        eng.log.info("Field conversion succeeded")
Example #10
    def _author_list(obj, eng):
        from invenio.legacy.bibrecord import create_records, record_xml_output
        from invenio.legacy.bibconvert.xslt_engine import convert
        from invenio.utils.plotextractor.api import get_tarball_from_arxiv
        from invenio.utils.plotextractor.cli import get_defaults
        from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
        from invenio.utils.plotextractor.converter import untar
        from invenio.utils.shell import Timeout

        from ..utils import find_matching_files

        identifiers = obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP'), "")
        if "_result" not in obj.extra_data:
            obj.extra_data["_result"] = {}
        if "tarball" not in obj.extra_data["_result"]:
            extract_path = os.path.join(
                cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
                str(eng.uuid)
            )
            tarball = get_tarball_from_arxiv(
                obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
                extract_path
            )
            if tarball is None:
                obj.log.error("No tarball found")
                return
            obj.extra_data["_result"]["tarball"] = tarball
        else:
            tarball = obj.extra_data["_result"]["tarball"]

        # FIXME
        tarball = str(tarball)
        sub_dir, dummy = get_defaults(tarball,
                                      cfg['CFG_TMPDIR'], "")

        try:
            untar(tarball, sub_dir)
            obj.log.info("Extracted tarball to: {0}".format(sub_dir))
        except Timeout:
            eng.log.error('Timeout during tarball extraction on %s' % (
                obj.extra_data["_result"]["tarball"]))

        xml_files_list = find_matching_files(sub_dir, ["xml"])

        obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

        authors = ""

        for xml_file in xml_files_list:
            with open(xml_file, "r") as xml_file_fd:
                xml_content = xml_file_fd.read()

            match = REGEXP_AUTHLIST.findall(xml_content)
            if match:
                obj.log.info("Found a match for author extraction")
                authors = convert(xml_content, stylesheet)
                authorlist_record = create_records(authors)
                if len(authorlist_record) == 1:
                    if authorlist_record[0][0] is None:
                        eng.log.error("Error parsing authorlist record for id: %s" % (
                            identifiers,))
                    authorlist_record = authorlist_record[0][0]

                author_xml = record_xml_output(authorlist_record)
                if author_xml:
                    updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n' \
                                  + record_xml_output(authorlist_record) + '</collection>'
                    new_dict_representation = convert_marcxml_to_bibfield(updated_xml)
                    obj.data["authors"] = new_dict_representation["authors"]
                    obj.update_task_results(
                        "authors",
                        [{
                            "name": "authors",
                            "results": new_dict_representation["authors"]
                        }]
                    )
                    obj.update_task_results(
                        "number_of_authors",
                        [{
                            "name": "number_of_authors",
                            "results": new_dict_representation["number_of_authors"]
                        }]
                    )
                    break
Example #11
    def _update(obj, eng):
        import dictdiffer

        from lxml import objectify, etree

        from invenio.base.globals import cfg
        from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
        from invenio.modules.records.api import Record

        from inspire.utils.robotupload import make_robotupload_marcxml

        try:
            recid = obj.extra_data["recid"]
        except KeyError:
            obj.log.error("Cannot locate record ID")
            return

        callback_url = os.path.join(cfg["CFG_SITE_URL"],
                                    "callback/workflows/continue")

        search_url = "%s?p=recid:%s&of=xm" % (cfg["WORKFLOWS_MATCH_REMOTE_SERVER_URL"], recid)

        prod_data = objectify.parse(search_url)
        # remove controlfields
        root = prod_data.getroot()
        record = root['record']
        while True:
            try:
                record.remove(record['controlfield'])
            except AttributeError:
                break
        prod_data = etree.tostring(record)
        prod_data = convert_marcxml_to_bibfield(prod_data, model=["hep"])
        new_data = dict(obj.data.dumps(clean=True))
        prod_data = dict(prod_data.dumps(clean=True))
        updated_keys = []
        diff = dictdiffer.diff(prod_data, new_data)
        for diff_type, new_key, content in diff:
            if diff_type == 'add':
                if new_key:
                    if isinstance(new_key, list):
                        # ['subject_term', 0]
                        updated_keys.append(new_key[0])
                    else:
                        # 'subject_term'
                        updated_keys.append(new_key)
                else:
                    # content must be list of new adds
                    for key in content:
                        updated_keys.append(key)

        updates = dictdiffer.patch(diff, new_data)
        for key in updates.keys():
            if key not in updated_keys:
                del updates[key]
        if updates:
            updates['recid'] = recid
            marcxml = Record(updates).legacy_export_as_marc()
            result = make_robotupload_marcxml(
                url=url,
                marcxml=marcxml,
                callback_url=callback_url,
                mode='correct',
                nonce=obj.id
            )
            if "[INFO]" not in result.text:
                if "cannot use the service" in result.text:
                    # IP not in the list
                    obj.log.error("Your IP is not in "
                                  "CFG_BATCHUPLOADER_WEB_ROBOT_RIGHTS "
                                  "on host")
                    obj.log.error(result.text)
                from invenio.modules.workflows.errors import WorkflowError
                txt = "Error while submitting robotupload: {0}".format(result.text)
                raise WorkflowError(txt, eng.uuid, obj.id)
            else:
                obj.log.info("Robotupload sent!")
                obj.log.info(result.text)
                eng.halt("Waiting for robotupload: {0}".format(result.text))
            obj.log.info("end of upload")
        else:
            obj.log.info("No updates to do.")
Example #12
def arxiv_fulltext_download(obj, eng):
    """Perform the fulltext download step for arXiv records.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.utils.plotextractor.getter import harvest_single
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield

    if "result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    if "pdf" not in obj.extra_data["_result"]:
        extract_path = os.path.join(
            cfg['CFG_TMPSHAREDDIR'],
            str(eng.uuid)
        )
        if not os.path.exists(extract_path):
            os.makedirs(extract_path)
        tarball, pdf = harvest_single(
            obj.data["system_control_number"]["value"],
            extract_path, ["pdf"])
        arguments = obj.extra_data["repository"]["arguments"]
        try:
            if arguments['t_doctype'] != '':
                doctype = arguments['t_doctype']
            else:
                doctype = 'arXiv'
        except KeyError:
            eng.log.error("WARNING: t_doctype was not specified in the "
                          "arguments for fulltext_download; "
                          "falling back to the default doctype.")
            doctype = 'arXiv'
        if pdf:
            obj.extra_data["_result"]["pdf"] = pdf
            fulltext_xml = (
                "  <datafield tag=\"FFT\" ind1=\" \" ind2=\" \">\n"
                "    <subfield code=\"a\">%(url)s</subfield>\n"
                "    <subfield code=\"t\">%(doctype)s</subfield>\n"
                "    </datafield>"
            ) % {'url': obj.extra_data["_result"]["pdf"],
                 'doctype': doctype}
            updated_xml = '<?xml version="1.0"?>\n' \
                          '<collection>\n<record>\n' + fulltext_xml + \
                          '</record>\n</collection>'

            new_dict_representation = convert_marcxml_to_bibfield(updated_xml)
            try:
                if isinstance(new_dict_representation["fft"], list):
                    for element in new_dict_representation["fft"]:
                        obj.data['fft'].append(element)
                else:
                    obj.data['fft'].append(new_dict_representation["fft"])
            except (KeyError, TypeError):
                obj.data['fft'] = [new_dict_representation['fft']]

            filename = os.path.basename(pdf)
            fileinfo = {
                "type": "Fulltext",
                "filename": filename,
                "full_path": pdf,
            }

            obj.add_task_result(filename,
                                fileinfo,
                                "workflows/results/files.html")
        else:
            obj.log.error("No PDF found.")
    else:
        eng.log.info("There was already a PDF registered for this record; "
                     "perhaps a duplicate task in your workflow.")
Example #13
def author_list(obj, eng):
    """Perform the special authorlist extraction step.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.oaiharvest.utils import (translate_fieldvalues_from_latex,
                                                 find_matching_files)
    from invenio.legacy.bibrecord import create_records, record_xml_output
    from invenio.legacy.bibconvert.xslt_engine import convert
    from invenio.utils.plotextractor.cli import get_defaults
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.plotextractor.getter import harvest_single
    from invenio.modules.workflows.errors import WorkflowError
    from invenio.utils.plotextractor.converter import untar
    from invenio.utils.shell import Timeout

    identifiers = obj.data["system_control_number"]["value"]
    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}
    if "tarball" not in obj.extra_data["_result"]:
        extract_path = os.path.join(
            cfg['CFG_TMPSHAREDDIR'],
            str(eng.uuid)
        )
        if not os.path.exists(extract_path):
            os.makedirs(extract_path)
        tarball, pdf = harvest_single(
            obj.data["system_control_number"]["value"], extract_path,
            ["tarball"])
        if tarball is None:
            raise WorkflowError(str(
                "Error harvesting tarball from id: %s %s" % (
                    identifiers, extract_path)), eng.uuid, id_object=obj.id)
        tarball = str(tarball)
        obj.extra_data["_result"]["tarball"] = tarball

    sub_dir, dummy = get_defaults(obj.extra_data["_result"]["tarball"],
                                  cfg['CFG_TMPDIR'], "")

    try:
        untar(obj.extra_data["_result"]["tarball"], sub_dir)
        obj.log.info("Extracted tarball to: {0}".format(sub_dir))
    except Timeout:
        eng.log.error('Timeout during tarball extraction on %s' % (
            obj.extra_data["_result"]["tarball"]))

    xml_files_list = find_matching_files(sub_dir, ["xml"])

    obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

    authors = ""

    for xml_file in xml_files_list:
        with open(xml_file, "r") as xml_file_fd:
            xml_content = xml_file_fd.read()

        match = REGEXP_AUTHLIST.findall(xml_content)
        if match:
            obj.log.info("Found a match for author extraction")

            a_stylesheet = obj.extra_data["repository"]["arguments"].get(
                "a_stylesheet"
            ) or "authorlist2marcxml.xsl"
            authors = convert(xml_content, a_stylesheet)
            authorlist_record = create_records(authors)
            if len(authorlist_record) == 1:
                if authorlist_record[0][0] is None:
                    eng.log.error("Error parsing authorlist record for id: %s" % (
                        identifiers,))
                authorlist_record = authorlist_record[0][0]
            # Convert any LaTeX symbols in author names
            translate_fieldvalues_from_latex(authorlist_record, '100', code='a')
            translate_fieldvalues_from_latex(authorlist_record, '700', code='a')

            updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n' \
                          + record_xml_output(authorlist_record) + '</collection>'
            if updated_xml:
                # Convert the generated author list MARCXML into a BibField
                # dict and store the extracted authors on the record
                new_dict_representation = convert_marcxml_to_bibfield(updated_xml)
                obj.data['authors'] = new_dict_representation["authors"]
                obj.data['number_of_authors'] = new_dict_representation[
                    "number_of_authors"]
                obj.add_task_result("authors", new_dict_representation["authors"])
                obj.add_task_result("number_of_authors",
                                    new_dict_representation["number_of_authors"])
                break
Example #14
    def _plot_extract(obj, eng):
        """Perform the plotextraction step.

        Download tarball for each harvested/converted record,
        then run plotextrator.

        Update converted xml files with generated xml or add it for upload.
        """
        from invenio.utils.plotextractor.output_utils import (create_MARC,
                                                              create_contextfiles,
                                                              prepare_image_data,
                                                              remove_dups)
        from invenio.utils.plotextractor.cli import (get_defaults, extract_captions,
                                                     extract_context)
        from invenio.utils.plotextractor.converter import convert_images
        from invenio.utils.plotextractor.getter import harvest_single
        from invenio.utils.plotextractor.converter import untar
        from invenio.modules.workflows.errors import WorkflowError
        from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
        from invenio.utils.shell import run_shell_command, Timeout

        if "_result" not in obj.extra_data:
            obj.extra_data["_result"] = {}

        repository = obj.extra_data.get("repository", {})
        arguments = repository.get("arguments", {})

        if 'p_extraction-source' not in arguments:
            p_extraction_source = plotextractor_types
        else:
            p_extraction_source = arguments.get('p_extraction-source', "")

        if not isinstance(p_extraction_source, list):
            p_extraction_source = [p_extraction_source]

        if 'latex' in p_extraction_source:
            # Run LaTeX plotextractor
            if "tarball" not in obj.extra_data["_result"]:
                extract_path = os.path.join(
                    cfg['CFG_TMPSHAREDDIR'],
                    str(eng.uuid)
                )
                if not os.path.exists(extract_path):
                    os.makedirs(extract_path)
                tarball, pdf = harvest_single(
                    obj.data["system_control_number"]["value"], extract_path,
                    ["tarball"])
                if tarball is None:
                    raise WorkflowError(
                        str("Error harvesting tarball from id: %s %s" %
                            (obj.data["system_control_number"]["value"],
                             extract_path)),
                        eng.uuid,
                        id_object=obj.id)
                tarball = str(tarball)

                obj.extra_data["_result"]["tarball"] = tarball
            else:
                tarball = obj.extra_data["_result"]["tarball"]

            sub_dir, refno = get_defaults(tarball, cfg['CFG_TMPDIR'], "")

            tex_files = []
            image_list = []
            try:
                extracted_files_list, image_list, tex_files = untar(tarball,
                                                                    sub_dir)
            except Timeout:
                # Fall through with empty lists so the checks below can
                # handle the failed extraction gracefully
                eng.log.error(
                    'Timeout during tarball extraction on %s' % (tarball,))

            converted_image_list = convert_images(image_list)
            eng.log.info('converted %d of %d images found for %s' % (
                len(converted_image_list),
                len(image_list),
                os.path.basename(tarball)))
            extracted_image_data = []
            if tex_files == [] or tex_files is None:
                eng.log.error(
                    '%s is not a tarball' % (os.path.split(tarball)[-1],))
                run_shell_command('rm -r %s', (sub_dir,))
            else:
                for tex_file in tex_files:
                    # Extract images, captions and labels
                    partly_extracted_image_data = extract_captions(tex_file,
                                                                   sub_dir,
                                                                   converted_image_list)
                    if partly_extracted_image_data:
                        # Add proper filepaths and do various cleaning
                        cleaned_image_data = prepare_image_data(
                            partly_extracted_image_data,
                            tex_file, converted_image_list)
                        # Using prev. extracted info, get contexts for each
                        # image found
                        extracted_image_data.extend(
                            (extract_context(tex_file, cleaned_image_data)))

            if extracted_image_data:
                extracted_image_data = remove_dups(extracted_image_data)
                create_contextfiles(extracted_image_data)
                marc_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n'
                marc_xml += create_MARC(extracted_image_data, tarball, None)
                marc_xml += "\n</collection>"

                if marc_xml:
                    # Read the MARCXML generated by the plotextractor run
                    # and convert it into a BibField dict
                    new_dict = convert_marcxml_to_bibfield(marc_xml)

                    try:
                        if isinstance(new_dict["fft"], list):
                            for element in new_dict["fft"]:
                                obj.data['fft'].append(element)
                        else:
                            obj.data['fft'].append(new_dict["fft"])

                    except KeyError:
                        obj.data['fft'] = [new_dict['fft']]
                    obj.add_task_result("filesfft", new_dict["fft"])
                    obj.add_task_result("number_picture_converted",
                                        len(converted_image_list))
                    obj.add_task_result("number_of_picture_total",
                                        len(image_list))