def _author_list(obj, eng):
    """Extract an author list from the record's arXiv tarball.

    Fetches (or reuses) ``<arxiv_id>.tar.gz`` attached to the workflow
    model, untars it next to the tarball, scans the contained ``.xml``
    files for author-list markup, converts the first match via XSLT and
    merges the resulting authors into the record.

    :param obj: BibWorkflowObject to process
    :param eng: BibWorkflowEngine processing the object
    """
    from inspirehep.modules.converter.xslt import convert

    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    arxiv_id = get_arxiv_id_from_record(record)

    existing_file = get_file_by_name(model, "{0}.tar.gz".format(arxiv_id))
    if not existing_file:
        # We download it
        tarball = get_tarball_for_model(eng, arxiv_id)
        if tarball is None:
            obj.log.error("No tarball found")
            return
        add_file_by_name(model, tarball)
    else:
        tarball = existing_file.get_syspath()

    sub_dir = os.path.abspath("{0}_files".format(tarball))
    try:
        file_list = untar(tarball, sub_dir)
    except InvalidTarball:
        obj.log.error("Invalid tarball {0}".format(tarball))
        return
    obj.log.info("Extracted tarball to: {0}".format(sub_dir))

    xml_files_list = [filename for filename in file_list
                      if filename.endswith(".xml")]
    obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

    for xml_file in xml_files_list:
        # FIX: use a context manager so the file handle is closed even
        # if read() raises (original used bare open()/read()/close()).
        with open(xml_file, "r") as xml_file_fd:
            xml_content = xml_file_fd.read()

        match = REGEXP_AUTHLIST.findall(xml_content)
        if match:
            obj.log.info("Found a match for author extraction")
            authors_xml = convert(xml_content, stylesheet)
            authorlist_record = get_json_from_marcxml(authors_xml)[0]
            record.update(authorlist_record)
            obj.update_task_results(
                "authors",
                [{
                    "name": "authors",
                    "results": authorlist_record["authors"]
                }]
            )
            obj.update_task_results(
                "number_of_authors",
                [{
                    "name": "number_of_authors",
                    "results": authorlist_record["number_of_authors"]
                }]
            )
            # Only the first matching XML file is converted.
            break
    model.update()
def test_payload_file_creation(self):
    """Can add a file to a Payload."""
    from invenio_workflows.models import BibWorkflowObject
    from inspirehep.modules.workflows.models import Payload
    from inspirehep.utils.helpers import (
        get_file_by_name,
        add_file_by_name,
    )

    obj = BibWorkflowObject.create_object()
    obj.save()
    obj.data = obj.get_data()  # FIXME hack until workflow 2.0
    payload = Payload.create(workflow_object=obj, type="payload_fixture")
    payload.save()

    # mkstemp leaves the file on disk; the caller must remove it.
    fd, filename = tempfile.mkstemp()
    os.close(fd)
    try:
        newpath = add_file_by_name(payload, filename)
        self.assertTrue(newpath)
        self.assertTrue(
            get_file_by_name(payload, os.path.basename(filename))
        )
    finally:
        # FIX: the original never deleted the temp file and ran the
        # cleanup outside a finally, so a failing assertion leaked
        # both the file and the workflow object.
        if os.path.exists(filename):
            os.remove(filename)
        BibWorkflowObject.delete(obj)
def arxiv_refextract(obj, eng):
    """Perform the reference extraction step.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    arxiv_id = get_arxiv_id_from_record(record)

    existing_file = get_file_by_name(model, "{0}.pdf".format(arxiv_id))
    if existing_file:
        pdf = existing_file.get_syspath()
    else:
        # We download it
        pdf = get_pdf_for_model(eng, arxiv_id)
        if pdf is None:
            obj.log.error("No pdf found")
            return
        add_file_by_name(model, pdf)

    # Guard clause: bail out unless a real file is on disk.
    if not pdf or not os.path.isfile(pdf):
        obj.log.error("Not able to download and process the PDF")
        return

    mapped_references = extract_references(pdf)
    if not mapped_references:
        obj.log.info("No references extracted")
        return

    record["references"] = mapped_references
    obj.log.info("Extracted {0} references".format(
        len(mapped_references)
    ))
    obj.update_task_results(
        "References",
        [{"name": "References",
          "result": mapped_references,
          "template": "workflows/results/refextract.html"}]
    )
    model.update()
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    Fetches (or reuses) ``<arxiv_id>.tar.gz`` attached to the workflow
    model, processes it for plots and, if any are found, merges their
    FFT metadata into the record and records a task result.

    :param obj: BibWorkflowObject to process
    :param eng: BibWorkflowEngine processing the object
    """
    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    arxiv_id = get_arxiv_id_from_record(record)

    existing_file = get_file_by_name(model, "{0}.tar.gz".format(arxiv_id))
    if not existing_file:
        # We download it
        tarball = get_tarball_for_model(eng, arxiv_id)
        if tarball is None:
            obj.log.error("No tarball found")
            return
        add_file_by_name(model, tarball)
    else:
        tarball = existing_file.get_syspath()

    try:
        plots = process_tarball(tarball)
    except InvalidTarball:
        # FIX: log on the object's logger like the sibling tasks do
        # (was eng.log), so the error is attached to the workflow object.
        obj.log.error(
            'Invalid tarball {0}'.format(tarball)
        )
        return

    if plots:
        # We store the path to the directory the tarball contents lives
        new_dict = get_json_for_plots(plots)
        record.update(new_dict)
        obj.update_task_results(
            "Plots",
            [{
                "name": "Plots",
                "result": new_dict["fft"],
                "template": "workflows/results/plots.html"
            }]
        )
        obj.log.info("Added {0} plots.".format(len(new_dict["fft"])))
        model.update()
def _arxiv_fulltext_download(obj, eng):
    """Attach the arXiv fulltext PDF to the record.

    Reuses an already-attached ``<arxiv_id>.pdf`` or downloads one,
    appends an ``fft`` entry for it to the record and records a task
    result pointing at the file.

    :param obj: BibWorkflowObject to process
    :param eng: BibWorkflowEngine processing the object
    """
    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    arxiv_id = get_arxiv_id_from_record(record)

    existing_file = get_file_by_name(model, "{0}.pdf".format(arxiv_id))
    if existing_file:
        pdf = existing_file.get_syspath()
    else:
        # We download it
        pdf = get_pdf_for_model(eng, arxiv_id)
        if pdf is None:
            obj.log.error("No pdf found")
            return
        pdf = add_file_by_name(model, pdf)
        obj.extra_data["pdf"] = pdf

    if not pdf:
        obj.log.info("No PDF found.")
        return

    # One fft entry for the fulltext; create the list if missing.
    fft_entry = {
        "url": pdf,
        "docfile_type": doctype
    }
    if "fft" in record:
        record["fft"].append(fft_entry)
    else:
        record.update({"fft": [fft_entry]})

    fileinfo = {
        "type": "fulltext",
        "filename": os.path.basename(pdf),
        "full_path": pdf,
    }
    obj.update_task_results(
        os.path.basename(pdf),
        [{
            "name": "PDF",
            "result": fileinfo,
            "template": "workflows/results/files.html"
        }]
    )
    model.update()