def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    Downloads (or reuses) the arXiv tarball attached to the workflow model,
    runs plot extraction on it, and merges the resulting metadata into the
    record.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    arxiv_id = get_arxiv_id_from_record(record)
    existing_file = get_file_by_name(model, arxiv_id)

    if not existing_file:
        # We download it
        tarball = get_tarball_for_model(eng, arxiv_id)
        if tarball is None:
            obj.log.error("No tarball found")
            return
        add_file_by_name(model, tarball)
    else:
        tarball = existing_file.get_syspath()

    try:
        marcxml = get_marcxml_plots_from_tarball(tarball)
    except Timeout:
        eng.log.error(
            "Timeout during tarball extraction on {0}".format(tarball)
        )
        # BUGFIX: previously execution fell through with `marcxml` unbound,
        # so the `if marcxml:` check below raised UnboundLocalError.
        return

    if marcxml:
        # We store the path to the directory the tarball contents lives
        new_dict = get_json_from_marcxml(marcxml)[0]
        record.update(new_dict)
        obj.update_task_results(
            "Plots",
            [{
                "name": "Plots",
                "result": new_dict["fft"],
                "template": "workflows/results/plots.html",
            }]
        )
        obj.log.info("Added {0} plots.".format(len(new_dict["fft"])))
        model.update()
def _author_list(obj, eng):
    """Extract the author list from XML files inside the arXiv tarball.

    Untars the archive next to the tarball, scans the extracted ``.xml``
    files for an authorlist marker, converts the first match via XSLT and
    merges the resulting author data into the record.

    NOTE(review): another function with the same name is defined later in
    this module; at import time the later definition shadows this one —
    confirm which variant is intended to be active.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    # Removed unused local import of ``find_matching_files``; this variant
    # filters the untar file list directly.
    from inspire.modules.converter.xslt import convert

    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    arxiv_id = get_arxiv_id_from_record(record)
    existing_file = get_file_by_name(model, "{0}.tar.gz".format(arxiv_id))
    if not existing_file:
        # We download it
        tarball = get_tarball_for_model(eng, arxiv_id)
        if tarball is None:
            obj.log.error("No tarball found")
            return
        add_file_by_name(model, tarball)
    else:
        tarball = existing_file.get_syspath()

    sub_dir = os.path.abspath("{0}_files".format(tarball))
    try:
        file_list = untar(tarball, sub_dir)
    except InvalidTarball:
        obj.log.error("Invalid tarball {0}".format(tarball))
        return
    obj.log.info("Extracted tarball to: {0}".format(sub_dir))

    xml_files_list = [
        filename for filename in file_list if filename.endswith(".xml")
    ]
    obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

    for xml_file in xml_files_list:
        # Context manager guarantees the handle is closed even if read fails.
        with open(xml_file, "r") as xml_file_fd:
            xml_content = xml_file_fd.read()

        match = REGEXP_AUTHLIST.findall(xml_content)
        if match:
            obj.log.info("Found a match for author extraction")
            authors_xml = convert(xml_content, stylesheet)
            authorlist_record = get_json_from_marcxml(authors_xml)[0]
            record.update(authorlist_record)
            obj.update_task_results(
                "authors",
                [{
                    "name": "authors",
                    "results": authorlist_record["authors"],
                }]
            )
            obj.update_task_results(
                "number_of_authors",
                [{
                    "name": "number_of_authors",
                    "results": authorlist_record["number_of_authors"],
                }]
            )
            # Only the first matching XML file is processed.
            break
    model.update()
def _author_list(obj, eng):
    """Extract the author list from XML files inside the arXiv tarball.

    Untars the archive into a temp directory derived from the tarball name,
    scans extracted ``.xml`` files for an authorlist marker, converts the
    first match via XSLT and merges the author data into the record.

    NOTE(review): this shadows an earlier function of the same name in this
    module — confirm the duplication is intentional.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio_oaiharvester.utils import find_matching_files
    from invenio_utils.plotextractor.cli import get_defaults
    from invenio_utils.plotextractor.converter import untar
    from invenio_utils.shell import Timeout
    from inspire.modules.converter.xslt import convert

    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    arxiv_id = get_arxiv_id_from_record(record)
    existing_file = get_file_by_name(model, arxiv_id)
    if not existing_file:
        # We download it
        tarball = get_tarball_for_model(eng, arxiv_id)
        if tarball is None:
            obj.log.error("No tarball found")
            return
        add_file_by_name(model, tarball)
    else:
        tarball = existing_file.get_syspath()

    sub_dir, dummy = get_defaults(str(tarball), cfg["CFG_TMPDIR"], "")
    try:
        untar(str(tarball), sub_dir)
        obj.log.info("Extracted tarball to: {0}".format(sub_dir))
    except Timeout:
        # NOTE(review): deliberately continues after a timeout and scans
        # whatever was extracted so far — confirm this best-effort behavior
        # is intended.
        eng.log.error(
            "Timeout during tarball extraction on {0}".format(tarball)
        )

    xml_files_list = find_matching_files(sub_dir, ["xml"])
    obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

    for xml_file in xml_files_list:
        # Context manager guarantees the handle is closed even if read fails.
        with open(xml_file, "r") as xml_file_fd:
            xml_content = xml_file_fd.read()

        match = REGEXP_AUTHLIST.findall(xml_content)
        if match:
            obj.log.info("Found a match for author extraction")
            authors_xml = convert(xml_content, stylesheet)
            authorlist_record = get_json_from_marcxml(authors_xml)[0]
            record.update(authorlist_record)
            obj.update_task_results(
                "authors",
                [{
                    "name": "authors",
                    "results": authorlist_record["authors"],
                }]
            )
            obj.update_task_results(
                "number_of_authors",
                [{
                    "name": "number_of_authors",
                    "results": authorlist_record["number_of_authors"],
                }],
            )
            # Only the first matching XML file is processed.
            break
    model.update()
def arxiv_refextract(obj, eng):
    """Perform the reference extraction step.

    Downloads (or reuses) the article PDF, runs refextract on it, and stores
    any extracted references in the record.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.refextract.api import extract_references_from_file_xml

    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    arxiv_id = get_arxiv_id_from_record(record)
    existing_file = get_file_by_name(model, "{0}.pdf".format(arxiv_id))
    if not existing_file:
        # We download it
        pdf = get_pdf_for_model(eng, arxiv_id)
        if pdf is None:
            obj.log.error("No pdf found")
            return
        add_file_by_name(model, pdf)
    else:
        pdf = existing_file.get_syspath()

    if pdf and os.path.isfile(pdf):
        references_xml = extract_references_from_file_xml(pdf)
        if references_xml:
            # Wrap the bare record XML in a collection envelope for parsing.
            updated_xml = (
                '<?xml version="1.0" encoding="UTF-8"?>\n'
                "<collection>\n" + references_xml + "\n</collection>"
            )
            new_dict = get_json_from_marcxml(updated_xml)[0]
            if "references" in new_dict:
                record["references"] = new_dict["references"]
                # BUGFIX: count references from the freshly extracted data;
                # the old code read obj.data["references"], which this task
                # never sets and which could raise KeyError.
                obj.log.info(
                    "Extracted {0} references".format(
                        len(new_dict["references"])
                    )
                )
                obj.update_task_results(
                    "References",
                    [{
                        "name": "References",
                        "result": new_dict["references"],
                        "template": "workflows/results/refextract.html",
                    }],
                )
                model.update()
            else:
                obj.log.info("No references extracted")
    else:
        obj.log.error("Not able to download and process the PDF")