示例#1
0
def refextract(obj, eng):
    """Perform the reference extraction step.

    Reuses the PDF path stored under ``obj.extra_data["_result"]["pdf"]`` when
    one is present; otherwise harvests the PDF for the record's
    ``system_control_number`` into a per-engine directory below
    ``CFG_TMPSHAREDDIR`` and stores the resulting path. References extracted
    from the PDF are appended to ``obj.data['reference']`` and published as
    a "References" task result.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.refextract.api import extract_references_from_file_xml
    from invenio.utils.plotextractor.getter import harvest_single
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    pdf = None
    if "pdf" in obj.extra_data["_result"]:
        pdf = obj.extra_data["_result"]["pdf"]

    if not pdf:
        extract_path = os.path.join(
            cfg['CFG_TMPSHAREDDIR'],
            str(eng.uuid)
        )
        if not os.path.exists(extract_path):
            os.makedirs(extract_path)
        # harvest_single hands back a (tarball, pdf) pair; only the PDF path
        # is needed here.
        _tarball, pdf = harvest_single(
            obj.data["system_control_number"]["value"], extract_path, ["pdf"]
        )
        obj.extra_data["_result"]["pdf"] = pdf

    if not (pdf and os.path.isfile(pdf)):
        obj.log.error("Not able to download and process the PDF ")
        return

    references_xml = extract_references_from_file_xml(pdf)
    if not references_xml:
        obj.log.info("No references")
        return

    obj.log.info("Found references: {0}".format(references_xml))
    updated_xml = '<?xml version="1.0" encoding="UTF-8"?>\n' \
                  '<collection>\n' + references_xml + \
                  "\n</collection>"
    new_dict_representation = convert_marcxml_to_bibfield(updated_xml)

    if 'reference' not in new_dict_representation:
        # The extractor produced XML but the converted record carries no
        # reference field: there is nothing to store or publish, and reading
        # the key below would raise KeyError.
        obj.log.info("No references")
        return

    references = new_dict_representation['reference']
    try:
        obj.data['reference'].append(references)
    except KeyError:
        # The record had no reference list yet; start one.
        obj.data['reference'] = [references]
    obj.add_task_result("References",
                        references,
                        "workflows/results/refextract.html")
示例#2
0
def arxiv_refextract(obj, eng):
    """Extract references from the arXiv PDF attached to a workflow object.

    The PDF path is read from ``obj.extra_data["_result"]["pdf"]``; when it is
    missing or empty, ``get_pdf_from_arxiv`` is asked for it and the answer is
    stored back under the same key. When the converted record contains a
    ``reference`` field, it replaces ``obj.data["reference"]`` and is
    published through ``obj.update_task_results``.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.refextract.api import extract_references_from_file_xml
    from invenio.utils.plotextractor.api import get_pdf_from_arxiv
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    try:
        pdf = obj.extra_data["_result"]["pdf"]
    except KeyError:
        pdf = None

    if not pdf:
        # Prefer the harvester storage directory, falling back to the shared
        # temporary directory when it is not configured.
        storage_root = cfg.get('OAIHARVESTER_STORAGEDIR',
                               cfg.get('CFG_TMPSHAREDDIR'))
        extract_path = os.path.join(storage_root, str(eng.uuid))
        arxiv_id = obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP'))
        pdf = get_pdf_from_arxiv(arxiv_id, extract_path)
        obj.extra_data["_result"]["pdf"] = pdf

    if not (pdf and os.path.isfile(pdf)):
        obj.log.error("Not able to download and process the PDF")
        return

    references_xml = extract_references_from_file_xml(
        obj.extra_data["_result"]["pdf"]
    )
    if not references_xml:
        obj.log.info("No references extracted")
        return

    xml_header = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n'
    updated_xml = xml_header + references_xml + "\n</collection>"
    new_dict_representation = convert_marcxml_to_bibfield(updated_xml)

    if "reference" in new_dict_representation:
        obj.data["reference"] = new_dict_representation["reference"]
        reference_count = len(obj.data["reference"])
        obj.log.info("Extracted {0} references".format(reference_count))
        task_result = {
            "name": "References",
            "result": new_dict_representation['reference'],
            "template": "workflows/results/refextract.html",
        }
        obj.update_task_results("References", [task_result])
示例#3
0
def arxiv_refextract(obj, eng):
    """Run reference extraction on the arXiv PDF of a workflow object.

    Uses the path cached in ``obj.extra_data["_result"]["pdf"]`` if there is
    one; otherwise fetches the PDF with ``get_pdf_from_arxiv`` and caches the
    returned path. References found in the converted record overwrite
    ``obj.data["reference"]`` and are reported via
    ``obj.update_task_results``.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.refextract.api import extract_references_from_file_xml
    from invenio.utils.plotextractor.api import get_pdf_from_arxiv
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    try:
        pdf = obj.extra_data["_result"]["pdf"]
    except KeyError:
        pdf = None

    if not pdf:
        base_dir = cfg.get('OAIHARVESTER_STORAGEDIR',
                           cfg.get('CFG_TMPSHAREDDIR'))
        target_dir = os.path.join(base_dir, str(eng.uuid))
        lookup_key = cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')
        pdf = get_pdf_from_arxiv(obj.data.get(lookup_key), target_dir)
        obj.extra_data["_result"]["pdf"] = pdf

    pdf_is_usable = bool(pdf) and os.path.isfile(pdf)
    if not pdf_is_usable:
        obj.log.error("Not able to download and process the PDF")
        return

    references_xml = extract_references_from_file_xml(
        obj.extra_data["_result"]["pdf"])
    if not references_xml:
        obj.log.info("No references extracted")
        return

    # Wrap the extractor output in a MARCXML collection envelope before
    # converting it.
    prolog = '<?xml version="1.0" encoding="UTF-8"?>\n'
    updated_xml = prolog + '<collection>\n' + references_xml + \
        "\n</collection>"
    parsed = convert_marcxml_to_bibfield(updated_xml)
    if "reference" not in parsed:
        return

    obj.data["reference"] = parsed["reference"]
    message = "Extracted {0} references".format(len(obj.data["reference"]))
    obj.log.info(message)
    results = [{
        "name": "References",
        "result": parsed['reference'],
        "template": "workflows/results/refextract.html"
    }]
    obj.update_task_results("References", results)
示例#4
0
def arxiv_refextract(obj, eng):
    """Perform the reference extraction step.

    Looks for a file named ``<arxiv_id>.pdf`` already attached to the model;
    when there is none, the PDF is fetched with ``get_pdf_for_model`` and
    attached. References extracted from the PDF are written to
    ``record["references"]``, published as a "References" task result, and
    ``model.update()`` is called afterwards.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.refextract.api import extract_references_from_file_xml

    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    arxiv_id = get_arxiv_id_from_record(record)
    existing_file = get_file_by_name(model, "{0}.pdf".format(arxiv_id))

    if existing_file:
        pdf = existing_file.get_syspath()
    else:
        # Nothing attached yet: download the PDF and attach it to the model.
        pdf = get_pdf_for_model(eng, arxiv_id)
        if pdf is None:
            obj.log.error("No pdf found")
            return
        add_file_by_name(model, pdf)

    if not (pdf and os.path.isfile(pdf)):
        obj.log.error("Not able to download and process the PDF")
        return

    references_xml = extract_references_from_file_xml(pdf)
    if not references_xml:
        obj.log.info("No references extracted")
        return

    updated_xml = (
        '<?xml version="1.0" encoding="UTF-8"?>\n'
        "<collection>\n" + references_xml + "\n</collection>"
    )
    new_dict = get_json_from_marcxml(updated_xml)[0]
    if "references" in new_dict:
        references = new_dict["references"]
        record["references"] = references
        # Count what was just extracted. obj.data is not guaranteed to carry
        # a "references" key at this point, because model.update() has not
        # run yet.
        obj.log.info("Extracted {0} references".format(len(references)))
        obj.update_task_results(
            "References",
            [
                {
                    "name": "References",
                    "result": references,
                    "template": "workflows/results/refextract.html",
                }
            ],
        )
        # NOTE(review): presumably writes the modified record back to the
        # workflow object -- confirm against the model implementation.
        model.update()
示例#5
0
def extract_from_pdf_string(pdf):
    """Extract references from a pdf stored in a string

    The content is written to a temporary file created under
    ``CFG_TMPSHAREDDIR`` and that file's path is passed to refextract.
    A temporary file is required because pdf2text has to be run on it.
    """
    # Leaving the ``with`` block closes the temporary file, which also
    # deletes it.
    with NamedTemporaryFile(prefix='docextract-pdf',
                            dir=CFG_TMPSHAREDDIR) as pdf_file:
        pdf_file.write(pdf)
        # Flush so the extractor reads the full content from disk
        pdf_file.flush()
        return extract_references_from_file_xml(pdf_file.name)
示例#6
0
def extract_from_pdf_string(pdf):
    """Extract references from a pdf stored in a string

    Writes the given string to a temporary file below ``CFG_TMPSHAREDDIR``
    and runs refextract on that file. The detour through the disk is needed
    because pdf2text must be run on a real file.
    """
    scratch = NamedTemporaryFile(prefix='docextract-pdf', dir=CFG_TMPSHAREDDIR)
    with scratch:
        # Persist the content before handing the path over
        scratch.write(pdf)
        scratch.flush()
        extracted = extract_references_from_file_xml(scratch.name)
    # The temporary file is closed (and thereby deleted) once the block exits

    return extracted
示例#7
0
def refextract(obj, eng):
    """
    Perform the reference extraction step.

    Uses the PDF path stored under ``obj.extra_data["_result"]["pdf"]`` when
    it points at an existing file; otherwise harvests the PDF for the
    record's ``system_number_external`` value into a per-engine directory
    below ``CFG_TMPSHAREDDIR``. References matched by ``REGEXP_REFS`` in the
    extractor output are appended to ``obj.data['reference']`` and published
    as a "reference" task result.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.refextract.api import extract_references_from_file_xml

    bibtask.task_sleep_now_if_required()
    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    pdf = obj.extra_data["_result"].get("pdf")

    # Harvest when no path is stored yet or the stored file has disappeared.
    if not pdf or not os.path.isfile(pdf):
        extract_path = plotextractor_getter.make_single_directory(
            cfg['CFG_TMPSHAREDDIR'], eng.uuid)
        _tarball, harvested_pdf = plotextractor_getter.harvest_single(
            obj.data["system_number_external"]["value"],
            extract_path,
            ["pdf"])
        if harvested_pdf is not None:
            obj.extra_data["_result"]["pdf"] = harvested_pdf
            pdf = harvested_pdf

    # Work from the local variable: the "pdf" key is absent when harvesting
    # failed, so indexing extra_data here would raise KeyError instead of
    # reporting the failure.
    if not pdf or not os.path.isfile(pdf):
        obj.log.error("Not able to download and process the PDF ")
        return

    cmd_stdout = extract_references_from_file_xml(pdf)
    # Guard against empty extractor output before running the regex on it.
    references_xml = REGEXP_REFS.search(cmd_stdout) if cmd_stdout else None
    if not references_xml:
        obj.log.info("No references")
        return

    updated_xml = (
        '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n<record>'
        + references_xml.group(1)
        + "</record>\n</collection>"
    )
    new_dict_representation = records_api.create_record(
        updated_xml, master_format="marc").dumps()

    if 'reference' not in new_dict_representation:
        # Nothing to store or publish; reading the key below would raise.
        obj.log.info("No references")
        return

    references = new_dict_representation['reference']
    try:
        obj.data['reference'].append(references)
    except KeyError:
        # The record had no reference list yet; start one.
        obj.data['reference'] = [references]
    obj.add_task_result("reference", references)