def refextract(obj, eng):
    """Perform the reference extraction step.

    The PDF path cached in ``obj.extra_data["_result"]["pdf"]`` is reused
    when present. Otherwise the PDF is harvested for
    ``obj.data["system_control_number"]["value"]`` into a directory named
    after ``eng.uuid`` below ``cfg['CFG_TMPSHAREDDIR']`` and the resulting
    path is cached. References extracted from the PDF are appended to
    ``obj.data['reference']`` and published as a "References" task result.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.refextract.api import extract_references_from_file_xml
    from invenio.utils.plotextractor.getter import harvest_single
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    pdf = obj.extra_data["_result"].get("pdf")
    if not pdf:
        extract_path = os.path.join(cfg['CFG_TMPSHAREDDIR'], str(eng.uuid))
        if not os.path.exists(extract_path):
            os.makedirs(extract_path)
        # harvest_single hands back a (tarball, pdf) pair; only the PDF
        # path is used here.
        _tarball, pdf = harvest_single(
            obj.data["system_control_number"]["value"],
            extract_path,
            ["pdf"]
        )
        obj.extra_data["_result"]["pdf"] = pdf

    if not (pdf and os.path.isfile(pdf)):
        obj.log.error("Not able to download and process the PDF ")
        return

    references_xml = extract_references_from_file_xml(pdf)
    if not references_xml:
        obj.log.info("No references")
        return

    obj.log.info("Found references: {0}".format(references_xml))
    updated_xml = (
        '<?xml version="1.0" encoding="UTF-8"?>\n'
        '<collection>\n' + references_xml +
        "\n</collection>"
    )
    new_dict_representation = convert_marcxml_to_bibfield(updated_xml)

    # Check the converted record explicitly: reading a missing 'reference'
    # key used to be masked by the KeyError handler meant for obj.data and
    # then crashed when the task result was added.
    if 'reference' not in new_dict_representation:
        obj.log.info("No references")
        return
    extracted = new_dict_representation['reference']

    try:
        obj.data['reference'].append(extracted)
    except KeyError:
        # obj.data has no 'reference' entry yet: start one.
        obj.data['reference'] = [extracted]

    obj.add_task_result("References",
                        extracted,
                        "workflows/results/refextract.html")
def arxiv_refextract(obj, eng):
    """Perform the reference extraction step.

    The PDF path cached in ``obj.extra_data["_result"]["pdf"]`` is reused
    when present. Otherwise the PDF is fetched with ``get_pdf_from_arxiv``
    into a directory named after ``eng.uuid`` below the configured
    ``OAIHARVESTER_STORAGEDIR`` (falling back to ``CFG_TMPSHAREDDIR``) and
    the returned path is cached. Extracted references replace
    ``obj.data["reference"]`` and are published as the "References" task
    result.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.refextract.api import extract_references_from_file_xml
    from invenio.utils.plotextractor.api import get_pdf_from_arxiv
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    pdf = obj.extra_data["_result"].get("pdf")
    if not pdf:
        extract_path = os.path.join(
            cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
            str(eng.uuid)
        )
        pdf = get_pdf_from_arxiv(
            obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
            extract_path
        )
        obj.extra_data["_result"]["pdf"] = pdf

    if not (pdf and os.path.isfile(pdf)):
        obj.log.error("Not able to download and process the PDF")
        return

    references_xml = extract_references_from_file_xml(pdf)
    if references_xml:
        updated_xml = (
            '<?xml version="1.0" encoding="UTF-8"?>\n'
            '<collection>\n' + references_xml +
            "\n</collection>"
        )
        new_dict_representation = convert_marcxml_to_bibfield(updated_xml)
        if "reference" in new_dict_representation:
            obj.data["reference"] = new_dict_representation["reference"]
            obj.log.info("Extracted {0} references".format(
                len(obj.data["reference"])))
            obj.update_task_results(
                "References",
                [{"name": "References",
                  "result": new_dict_representation['reference'],
                  "template": "workflows/results/refextract.html"}]
            )
            return

    # Reached both when no reference XML came back and when the converted
    # record carries no "reference" field, so neither case ends silently.
    obj.log.info("No references extracted")
def arxiv_refextract(obj, eng):
    """Perform the reference extraction step.

    A PDF path already stored under ``obj.extra_data["_result"]["pdf"]`` is
    used as is. When there is none, ``get_pdf_from_arxiv`` is asked for the
    PDF, targeting a per-engine directory (``eng.uuid``) inside
    ``OAIHARVESTER_STORAGEDIR`` or, failing that setting,
    ``CFG_TMPSHAREDDIR``; the path it returns is stored for later runs.
    On success ``obj.data["reference"]`` is overwritten with the extracted
    references and the "References" task result is updated.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.refextract.api import extract_references_from_file_xml
    from invenio.utils.plotextractor.api import get_pdf_from_arxiv
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    pdf = obj.extra_data["_result"].get("pdf")
    if not pdf:
        storage_dir = cfg.get('OAIHARVESTER_STORAGEDIR',
                              cfg.get('CFG_TMPSHAREDDIR'))
        extract_path = os.path.join(storage_dir, str(eng.uuid))
        arxiv_id = obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP'))
        pdf = get_pdf_from_arxiv(arxiv_id, extract_path)
        obj.extra_data["_result"]["pdf"] = pdf

    # Guard clause: nothing can be extracted without a PDF on disk.
    if not (pdf and os.path.isfile(pdf)):
        obj.log.error("Not able to download and process the PDF")
        return

    references_xml = extract_references_from_file_xml(pdf)
    if references_xml:
        updated_xml = (
            '<?xml version="1.0" encoding="UTF-8"?>\n'
            '<collection>\n' + references_xml +
            "\n</collection>"
        )
        new_dict_representation = convert_marcxml_to_bibfield(updated_xml)
        if "reference" in new_dict_representation:
            obj.data["reference"] = new_dict_representation["reference"]
            obj.log.info("Extracted {0} references".format(
                len(obj.data["reference"])))
            obj.update_task_results(
                "References",
                [{
                    "name": "References",
                    "result": new_dict_representation['reference'],
                    "template": "workflows/results/refextract.html"
                }])
            return

    # Shared fall-through for "no XML" and "XML without a reference field";
    # the latter previously finished without any log entry.
    obj.log.info("No references extracted")
def arxiv_refextract(obj, eng):
    """Perform the reference extraction step.

    Looks for a file named ``<arxiv_id>.pdf`` on the workflow model; when
    there is none the PDF is obtained through ``get_pdf_for_model`` and
    attached to the model. References extracted from the PDF are stored
    under ``record["references"]``, published as the "References" task
    result, and ``model.update()`` is called afterwards.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.refextract.api import extract_references_from_file_xml

    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    arxiv_id = get_arxiv_id_from_record(record)
    existing_file = get_file_by_name(model, "{0}.pdf".format(arxiv_id))

    if existing_file:
        pdf = existing_file.get_syspath()
    else:
        # We download it
        pdf = get_pdf_for_model(eng, arxiv_id)
        if pdf is None:
            obj.log.error("No pdf found")
            return
        add_file_by_name(model, pdf)

    if not (pdf and os.path.isfile(pdf)):
        obj.log.error("Not able to download and process the PDF")
        return

    references_xml = extract_references_from_file_xml(pdf)
    if references_xml:
        updated_xml = (
            '<?xml version="1.0" encoding="UTF-8"?>\n'
            "<collection>\n" + references_xml + "\n</collection>"
        )
        new_dict = get_json_from_marcxml(updated_xml)[0]
        if "references" in new_dict:
            references = new_dict["references"]
            record["references"] = references
            # Count what was actually extracted. The references are stored
            # on ``record``, not on ``obj.data``, so reading the count back
            # from ``obj.data["references"]`` could fail or be stale.
            obj.log.info(
                "Extracted {0} references".format(len(references))
            )
            obj.update_task_results(
                "References",
                [
                    {
                        "name": "References",
                        "result": references,
                        "template": "workflows/results/refextract.html",
                    }
                ],
            )
            # NOTE(review): presumably persists the modified record held by
            # the model -- confirm against the model implementation.
            model.update()
            return

    # Covers both an empty extraction result and a converted record
    # without a "references" field.
    obj.log.info("No references extracted")
def extract_from_pdf_string(pdf):
    """Extract references from a PDF held in a string.

    The string is written to a temporary file created in CFG_TMPSHAREDDIR
    and that file's path is passed to refextract. A file on disk is needed
    because pdf2text has to be run on it.
    """
    # Leaving the ``with`` block closes the temporary file, which also
    # deletes it.
    with NamedTemporaryFile(prefix='docextract-pdf',
                            dir=CFG_TMPSHAREDDIR) as pdf_file:
        pdf_file.write(pdf)
        # Flush so the whole content is on disk before it is read by path.
        pdf_file.flush()
        return extract_references_from_file_xml(pdf_file.name)
def refextract(obj, eng):
    """Perform the reference extraction step.

    The PDF path cached in ``obj.extra_data["_result"]["pdf"]`` is reused
    when it points to an existing file. Otherwise the PDF is harvested for
    ``obj.data["system_number_external"]["value"]`` into a directory made
    for ``eng.uuid`` below ``cfg['CFG_TMPSHAREDDIR']``, and a successful
    download is cached. The references matched by ``REGEXP_REFS`` in the
    extractor output are converted to a record, appended to
    ``obj.data['reference']`` and added as the "reference" task result.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.refextract.api import extract_references_from_file_xml

    bibtask.task_sleep_now_if_required()

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    # A single download path serves both "nothing cached" and "cached path
    # no longer exists"; the two cases used to duplicate this code.
    pdf = obj.extra_data["_result"].get("pdf")
    if not pdf or not os.path.isfile(pdf):
        extract_path = plotextractor_getter.make_single_directory(
            cfg['CFG_TMPSHAREDDIR'], eng.uuid)
        _tarball, pdf = plotextractor_getter.harvest_single(
            obj.data["system_number_external"]["value"],
            extract_path,
            ["pdf"])
        if pdf is not None:
            obj.extra_data["_result"]["pdf"] = pdf

    # Test the local path: when harvesting fails and nothing was cached
    # there is no "pdf" key to read back from extra_data.
    if not pdf or not os.path.isfile(pdf):
        obj.log.error("Not able to download and process the PDF ")
        return

    cmd_stdout = extract_references_from_file_xml(pdf)
    # Skip the regexp when the extractor returned nothing to search.
    references_xml = REGEXP_REFS.search(cmd_stdout) if cmd_stdout else None
    if not references_xml:
        return

    updated_xml = (
        '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n<record>' +
        references_xml.group(1) +
        "</record>\n</collection>"
    )
    new_dict_representation = records_api.create_record(
        updated_xml, master_format="marc").dumps()

    # Without a 'reference' field there is nothing to store or report;
    # indexing it unconditionally raised KeyError.
    if 'reference' not in new_dict_representation:
        return
    extracted = new_dict_representation['reference']

    try:
        obj.data['reference'].append(extracted)
    except KeyError:
        # obj.data has no 'reference' entry yet: start one.
        obj.data['reference'] = [extracted]

    obj.add_task_result("reference", extracted)