def _plot_extract(obj, eng):
    """Extract plots from the record's arXiv tarball and attach them.

    The step only runs when ``'latex'`` is among the selected extraction
    types: ``plotextractor_types`` if it is truthy, otherwise the
    repository argument ``p_extraction-source``. The tarball location is
    read from / stored in ``obj.extra_data["_result"]["tarball"]`` so it
    is fetched at most once per object.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.utils.plotextractor.api import (
        get_tarball_from_arxiv,
        get_marcxml_plots_from_tarball,
    )
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.shell import Timeout

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    repository = obj.extra_data.get("repository", {})
    arguments = repository.get("arguments", {})

    chosen_type = plotextractor_types
    if not chosen_type:
        chosen_type = arguments.get('p_extraction-source', [])

    # NOTE(review): only ``list`` counts as "already a sequence" here; a
    # tuple such as ("latex",) would be wrapped and never match 'latex'.
    # Confirm what callers pass as ``plotextractor_types``.
    if not isinstance(chosen_type, list):
        chosen_type = [chosen_type]

    if 'latex' not in chosen_type:
        return

    # Run LaTeX plotextractor
    if "tarball" not in obj.extra_data["_result"]:
        extract_path = os.path.join(
            cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
            str(eng.uuid)
        )
        tarball = get_tarball_from_arxiv(
            obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
            extract_path
        )
        if tarball is None:
            obj.log.error("No tarball found")
            return
        obj.extra_data["_result"]["tarball"] = tarball
    else:
        tarball = obj.extra_data["_result"]["tarball"]

    # Pre-bind the result so a timeout leaves a defined (empty) value
    # instead of an unbound local in the check below.
    marcxml = None
    try:
        marcxml = get_marcxml_plots_from_tarball(tarball)
    except Timeout:
        eng.log.error(
            'Timeout during tarball extraction on {0}'.format(tarball)
        )

    if marcxml:
        # We store the path to the directory the tarball contents lives
        new_dict = convert_marcxml_to_bibfield(marcxml)
        _attach_files_to_obj(obj, new_dict)
        obj.update_task_results(
            "Plots",
            [{
                "name": "Plots",
                "result": new_dict["fft"],
                "template": "workflows/results/plots.html"
            }]
        )
def refextract(obj, eng):
    """Perform the reference extraction step.

    Uses the PDF recorded in ``obj.extra_data["_result"]["pdf"]`` or, when
    none is recorded, harvests it with ``harvest_single`` into a directory
    named after the engine UUID. Extracted references are appended to
    ``obj.data['reference']`` and published as a "References" task result.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.refextract.api import extract_references_from_file_xml
    from invenio.utils.plotextractor.getter import harvest_single
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    pdf = None
    if "pdf" in obj.extra_data["_result"]:
        pdf = obj.extra_data["_result"]["pdf"]

    if not pdf:
        extract_path = os.path.join(
            cfg['CFG_TMPSHAREDDIR'],
            str(eng.uuid)
        )
        if not os.path.exists(extract_path):
            os.makedirs(extract_path)
        # Only the PDF is requested; the tarball slot is not used here.
        dummy_tarball, pdf = harvest_single(
            obj.data["system_control_number"]["value"],
            extract_path,
            ["pdf"]
        )
        obj.extra_data["_result"]["pdf"] = pdf

    if not (pdf and os.path.isfile(pdf)):
        obj.log.error("Not able to download and process the PDF ")
        return

    references_xml = extract_references_from_file_xml(pdf)
    if not references_xml:
        obj.log.info("No references")
        return

    obj.log.info("Found references: {0}".format(references_xml))
    updated_xml = (
        '<?xml version="1.0" encoding="UTF-8"?>\n'
        '<collection>\n' + references_xml +
        "\n</collection>"
    )
    new_dict_representation = convert_marcxml_to_bibfield(updated_xml)

    # The converted record may carry no 'reference' field at all; treat
    # that as "nothing found" rather than failing with a KeyError.
    if 'reference' not in new_dict_representation:
        obj.log.info("No references")
        return

    references = new_dict_representation['reference']
    try:
        obj.data['reference'].append(references)
    except KeyError:
        # The record had no reference field yet.
        obj.data['reference'] = [references]

    obj.add_task_result("References",
                        references,
                        "workflows/results/refextract.html")
def arxiv_refextract(obj, eng):
    """Perform the reference extraction step.

    Uses the PDF recorded in ``obj.extra_data["_result"]["pdf"]`` or, when
    none is recorded, fetches it with ``get_pdf_from_arxiv``. On success
    ``obj.data["reference"]`` is replaced by the extracted references and
    a "References" task result is published. Every path that yields no
    references is logged.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.refextract.api import extract_references_from_file_xml
    from invenio.utils.plotextractor.api import get_pdf_from_arxiv
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    try:
        pdf = obj.extra_data["_result"]["pdf"]
    except KeyError:
        pdf = None

    if not pdf:
        extract_path = os.path.join(
            cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
            str(eng.uuid)
        )
        pdf = get_pdf_from_arxiv(
            obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
            extract_path
        )
        obj.extra_data["_result"]["pdf"] = pdf

    if not (pdf and os.path.isfile(pdf)):
        obj.log.error("Not able to download and process the PDF")
        return

    references_xml = extract_references_from_file_xml(pdf)
    if not references_xml:
        obj.log.info("No references extracted")
        return

    updated_xml = (
        '<?xml version="1.0" encoding="UTF-8"?>\n'
        '<collection>\n' + references_xml +
        "\n</collection>"
    )
    new_dict_representation = convert_marcxml_to_bibfield(updated_xml)
    if "reference" not in new_dict_representation:
        # XML was produced but it converted to a record with no
        # reference field; report it like the empty-XML case.
        obj.log.info("No references extracted")
        return

    obj.data["reference"] = new_dict_representation["reference"]
    obj.log.info(
        "Extracted {0} references".format(len(obj.data["reference"]))
    )
    obj.update_task_results(
        "References",
        [{
            "name": "References",
            "result": new_dict_representation['reference'],
            "template": "workflows/results/refextract.html"
        }]
    )
def _plot_extract(obj, eng):
    """Extract plots from the record's arXiv tarball and attach them.

    The step only runs when ``'latex'`` is among the selected extraction
    types: ``plotextractor_types`` if it is truthy, otherwise the
    repository argument ``p_extraction-source``. The tarball location is
    read from / stored in ``obj.extra_data["_result"]["tarball"]`` so it
    is fetched at most once per object.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.utils.plotextractor.api import (
        get_tarball_from_arxiv,
        get_marcxml_plots_from_tarball)
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.shell import Timeout

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    repository = obj.extra_data.get("repository", {})
    arguments = repository.get("arguments", {})

    chosen_type = plotextractor_types
    if not chosen_type:
        chosen_type = arguments.get('p_extraction-source', [])

    # NOTE(review): only ``list`` counts as "already a sequence" here; a
    # tuple such as ("latex",) would be wrapped and never match 'latex'.
    # Confirm what callers pass as ``plotextractor_types``.
    if not isinstance(chosen_type, list):
        chosen_type = [chosen_type]

    if 'latex' not in chosen_type:
        return

    # Run LaTeX plotextractor
    if "tarball" not in obj.extra_data["_result"]:
        extract_path = os.path.join(
            cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
            str(eng.uuid))
        tarball = get_tarball_from_arxiv(
            obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
            extract_path)
        if tarball is None:
            obj.log.error("No tarball found")
            return
        obj.extra_data["_result"]["tarball"] = tarball
    else:
        tarball = obj.extra_data["_result"]["tarball"]

    # Pre-bind the result so a timeout leaves a defined (empty) value
    # instead of an unbound local in the check below.
    marcxml = None
    try:
        marcxml = get_marcxml_plots_from_tarball(tarball)
    except Timeout:
        eng.log.error(
            'Timeout during tarball extraction on {0}'.format(tarball))

    if marcxml:
        # We store the path to the directory the tarball contents lives
        new_dict = convert_marcxml_to_bibfield(marcxml)
        _attach_files_to_obj(obj, new_dict)
        obj.update_task_results(
            "Plots", [{
                "name": "Plots",
                "result": new_dict["fft"],
                "template": "workflows/results/plots.html"
            }])
def arxiv_refextract(obj, eng):
    """Perform the reference extraction step.

    Uses the PDF recorded in ``obj.extra_data["_result"]["pdf"]`` or, when
    none is recorded, fetches it with ``get_pdf_from_arxiv``. On success
    ``obj.data["reference"]`` is replaced by the extracted references and
    a "References" task result is published. Every path that yields no
    references is logged.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.refextract.api import extract_references_from_file_xml
    from invenio.utils.plotextractor.api import get_pdf_from_arxiv
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    try:
        pdf = obj.extra_data["_result"]["pdf"]
    except KeyError:
        pdf = None

    if not pdf:
        extract_path = os.path.join(
            cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
            str(eng.uuid))
        pdf = get_pdf_from_arxiv(
            obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
            extract_path)
        obj.extra_data["_result"]["pdf"] = pdf

    if not (pdf and os.path.isfile(pdf)):
        obj.log.error("Not able to download and process the PDF")
        return

    references_xml = extract_references_from_file_xml(pdf)
    if not references_xml:
        obj.log.info("No references extracted")
        return

    updated_xml = (
        '<?xml version="1.0" encoding="UTF-8"?>\n'
        '<collection>\n' + references_xml +
        "\n</collection>")
    new_dict_representation = convert_marcxml_to_bibfield(updated_xml)
    if "reference" not in new_dict_representation:
        # XML was produced but it converted to a record with no
        # reference field; report it like the empty-XML case.
        obj.log.info("No references extracted")
        return

    obj.data["reference"] = new_dict_representation["reference"]
    obj.log.info("Extracted {0} references".format(
        len(obj.data["reference"])))
    obj.update_task_results(
        "References", [{
            "name": "References",
            "result": new_dict_representation['reference'],
            "template": "workflows/results/refextract.html"
        }])
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    The tarball location is read from / stored in
    ``obj.extra_data["_result"]["tarball"]``. When plot MARCXML is
    produced, the files are attached to the object and a "Plots" task
    result is published. A timeout during extraction is logged and the
    step ends without adding plots.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.utils.plotextractor.api import (
        get_tarball_from_arxiv,
        get_marcxml_plots_from_tarball
    )
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.shell import Timeout

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    if "tarball" not in obj.extra_data["_result"]:
        extract_path = os.path.join(
            cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
            str(eng.uuid)
        )
        tarball = get_tarball_from_arxiv(
            obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
            extract_path
        )
        if tarball is None:
            obj.log.error("No tarball found")
            return
        obj.extra_data["_result"]["tarball"] = tarball
    else:
        tarball = obj.extra_data["_result"]["tarball"]

    # Pre-bind the result so a timeout leaves a defined (empty) value
    # instead of an unbound local in the check below.
    marcxml = None
    try:
        marcxml = get_marcxml_plots_from_tarball(tarball)
    except Timeout:
        eng.log.error(
            'Timeout during tarball extraction on {0}'.format(tarball)
        )

    if not marcxml:
        return

    # We store the path to the directory the tarball contents lives
    new_dict = convert_marcxml_to_bibfield(marcxml)
    _attach_files_to_obj(obj, new_dict)
    obj.update_task_results(
        "Plots",
        [{
            "name": "Plots",
            "result": new_dict["fft"],
            "template": "workflows/results/plots.html"
        }]
    )
    obj.log.info("Added {0} plots.".format(len(new_dict["fft"])))
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    The tarball location is read from / stored in
    ``obj.extra_data["_result"]["tarball"]``. When plot MARCXML is
    produced, the files are attached to the object and a "Plots" task
    result is published. A timeout during extraction is logged and the
    step ends without adding plots.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.utils.plotextractor.api import (
        get_tarball_from_arxiv,
        get_marcxml_plots_from_tarball)
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.shell import Timeout

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    if "tarball" not in obj.extra_data["_result"]:
        extract_path = os.path.join(
            cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
            str(eng.uuid))
        tarball = get_tarball_from_arxiv(
            obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
            extract_path)
        if tarball is None:
            obj.log.error("No tarball found")
            return
        obj.extra_data["_result"]["tarball"] = tarball
    else:
        tarball = obj.extra_data["_result"]["tarball"]

    # Pre-bind the result so a timeout leaves a defined (empty) value
    # instead of an unbound local in the check below.
    marcxml = None
    try:
        marcxml = get_marcxml_plots_from_tarball(tarball)
    except Timeout:
        eng.log.error(
            'Timeout during tarball extraction on {0}'.format(tarball))

    if not marcxml:
        return

    # We store the path to the directory the tarball contents lives
    new_dict = convert_marcxml_to_bibfield(marcxml)
    _attach_files_to_obj(obj, new_dict)
    obj.update_task_results("Plots", [{
        "name": "Plots",
        "result": new_dict["fft"],
        "template": "workflows/results/plots.html"
    }])
    obj.log.info("Added {0} plots.".format(len(new_dict["fft"])))
def _author_list(obj, eng):
    """Extract an author list from XML files shipped in the arXiv tarball.

    The tarball is taken from ``obj.extra_data["_result"]["tarball"]`` or
    fetched with ``get_tarball_from_arxiv``, unpacked, and every ``xml``
    file found is tested against ``REGEXP_AUTHLIST``. The first file that
    converts (via ``stylesheet``) to exactly one non-empty record sets
    ``obj.data["authors"]`` and publishes the "authors" and
    "number_of_authors" task results; the search then stops.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.bibrecord import create_records, record_xml_output
    from invenio.legacy.bibconvert.xslt_engine import convert
    from invenio.utils.plotextractor.api import get_tarball_from_arxiv
    from invenio.utils.plotextractor.cli import get_defaults
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.plotextractor.converter import untar
    from invenio.utils.shell import Timeout
    from ..utils import find_matching_files

    identifiers = obj.data.get(
        cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP'), "")

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    if "tarball" not in obj.extra_data["_result"]:
        extract_path = os.path.join(
            cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
            str(eng.uuid))
        tarball = get_tarball_from_arxiv(
            obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
            extract_path)
        if tarball is None:
            obj.log.error("No tarball found")
            return
    else:
        tarball = obj.extra_data["_result"]["tarball"]

    # FIXME
    tarball = str(tarball)
    sub_dir, dummy = get_defaults(tarball, cfg['CFG_TMPDIR'], "")

    try:
        untar(tarball, sub_dir)
        obj.log.info("Extracted tarball to: {0}".format(sub_dir))
    except Timeout:
        # Report the local path: the extra_data entry is not set when the
        # tarball was downloaded just above, so reading it could fail.
        eng.log.error('Timeout during tarball extraction on %s' % (tarball,))

    xml_files_list = find_matching_files(sub_dir, ["xml"])
    obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

    for xml_file in xml_files_list:
        with open(xml_file, "r") as xml_file_fd:
            xml_content = xml_file_fd.read()

        if not REGEXP_AUTHLIST.findall(xml_content):
            continue

        obj.log.info("Found a match for author extraction")
        authors = convert(xml_content, stylesheet)
        authorlist_record = create_records(authors)
        if len(authorlist_record) != 1:
            continue

        authorlist_record = authorlist_record[0][0]
        if authorlist_record is None:
            eng.log.error(
                "Error parsing authorlist record for id: %s" % (
                    identifiers, ))
            # Nothing usable in this file; try the next one.
            continue

        author_xml = record_xml_output(authorlist_record)
        if not author_xml:
            continue

        updated_xml = (
            '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n'
            + author_xml + '</collection>')
        new_dict_representation = convert_marcxml_to_bibfield(updated_xml)
        obj.data["authors"] = new_dict_representation["authors"]
        obj.update_task_results(
            "authors", [{
                "name": "authors",
                "results": new_dict_representation["authors"]
            }])
        obj.update_task_results("number_of_authors", [{
            "name": "number_of_authors",
            "results": new_dict_representation["number_of_authors"]
        }])
        break
def _convert_record_to_bibfield(obj, eng):
    """Replace ``obj.data`` with its BibField conversion.

    The current ``obj.data`` is passed to ``convert_marcxml_to_bibfield``
    together with ``model`` (a name taken from the enclosing scope), and
    the result is stored back on the object.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.modules.workflows.utils import (
        convert_marcxml_to_bibfield as to_bibfield,
    )

    converted = to_bibfield(obj.data, model)
    obj.data = converted
    eng.log.info("Field conversion succeeded")
def _author_list(obj, eng):
    """Extract an author list from XML files shipped in the arXiv tarball.

    The tarball is taken from ``obj.extra_data["_result"]["tarball"]`` or
    fetched with ``get_tarball_from_arxiv``, unpacked, and every ``xml``
    file found is tested against ``REGEXP_AUTHLIST``. The first file that
    converts (via ``stylesheet``) to exactly one non-empty record sets
    ``obj.data["authors"]`` and publishes the "authors" and
    "number_of_authors" task results; the search then stops.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.bibrecord import create_records, record_xml_output
    from invenio.legacy.bibconvert.xslt_engine import convert
    from invenio.utils.plotextractor.api import get_tarball_from_arxiv
    from invenio.utils.plotextractor.cli import get_defaults
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.plotextractor.converter import untar
    from invenio.utils.shell import Timeout
    from ..utils import find_matching_files

    identifiers = obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP'), "")

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    if "tarball" not in obj.extra_data["_result"]:
        extract_path = os.path.join(
            cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
            str(eng.uuid)
        )
        tarball = get_tarball_from_arxiv(
            obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
            extract_path
        )
        if tarball is None:
            obj.log.error("No tarball found")
            return
    else:
        tarball = obj.extra_data["_result"]["tarball"]

    # FIXME
    tarball = str(tarball)
    sub_dir, dummy = get_defaults(tarball, cfg['CFG_TMPDIR'], "")

    try:
        untar(tarball, sub_dir)
        obj.log.info("Extracted tarball to: {0}".format(sub_dir))
    except Timeout:
        # Report the local path: the extra_data entry is not set when the
        # tarball was downloaded just above, so reading it could fail.
        eng.log.error('Timeout during tarball extraction on %s' % (tarball,))

    xml_files_list = find_matching_files(sub_dir, ["xml"])
    obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

    for xml_file in xml_files_list:
        with open(xml_file, "r") as xml_file_fd:
            xml_content = xml_file_fd.read()

        if not REGEXP_AUTHLIST.findall(xml_content):
            continue

        obj.log.info("Found a match for author extraction")
        authors = convert(xml_content, stylesheet)
        authorlist_record = create_records(authors)
        if len(authorlist_record) != 1:
            continue

        authorlist_record = authorlist_record[0][0]
        if authorlist_record is None:
            eng.log.error("Error parsing authorlist record for id: %s" % (
                identifiers,))
            # Nothing usable in this file; try the next one.
            continue

        author_xml = record_xml_output(authorlist_record)
        if not author_xml:
            continue

        updated_xml = (
            '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n'
            + author_xml + '</collection>'
        )
        new_dict_representation = convert_marcxml_to_bibfield(updated_xml)
        obj.data["authors"] = new_dict_representation["authors"]
        obj.update_task_results(
            "authors",
            [{
                "name": "authors",
                "results": new_dict_representation["authors"]
            }]
        )
        obj.update_task_results(
            "number_of_authors",
            [{
                "name": "number_of_authors",
                "results": new_dict_representation["number_of_authors"]
            }]
        )
        break
def _update(obj, eng):
    """Send fields added in the workflow record to the remote record.

    The remote record identified by ``obj.extra_data["recid"]`` is fetched
    as MARCXML from ``WORKFLOWS_MATCH_REMOTE_SERVER_URL``, stripped of its
    controlfields and compared with ``obj.data``. Top-level keys touched by
    ``add`` entries of the diff are uploaded through
    ``make_robotupload_marcxml`` in ``correct`` mode, after which the
    engine is halted until the callback arrives.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    :raises WorkflowError: when the robotupload response text does not
        contain ``[INFO]``
    """
    import dictdiffer

    from lxml import objectify, etree

    from invenio.base.globals import cfg
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.modules.records.api import Record

    from inspire.utils.robotupload import make_robotupload_marcxml

    try:
        recid = obj.extra_data["recid"]
    except KeyError:
        obj.log.error("Cannot locate record ID")
        return

    callback_url = os.path.join(cfg["CFG_SITE_URL"],
                                "callback/workflows/continue")

    search_url = "%s?p=recid:%s&of=xm" % (
        cfg["WORKFLOWS_MATCH_REMOTE_SERVER_URL"], recid)

    prod_data = objectify.parse(search_url)
    # remove controlfields
    root = prod_data.getroot()
    record = root['record']
    while True:
        try:
            record.remove(record['controlfield'])
        except AttributeError:
            # No controlfield child is left on the record.
            break

    prod_data = etree.tostring(record)
    prod_data = convert_marcxml_to_bibfield(prod_data, model=["hep"])
    new_data = dict(obj.data.dumps(clean=True))
    prod_data = dict(prod_data.dumps(clean=True))
    updated_keys = []
    diff = dictdiffer.diff(prod_data, new_data)
    for diff_type, new_key, content in diff:
        if diff_type == 'add':
            if new_key:
                if isinstance(new_key, list):
                    # ['subject_term', 0]
                    updated_keys.append(new_key[0])
                else:
                    # 'subject_term'
                    updated_keys.append(new_key)
            else:
                # content must be list of new adds; dictdiffer documents
                # each entry as a (key, value) pair, so keep only the key.
                for entry in content:
                    if isinstance(entry, tuple):
                        updated_keys.append(entry[0])
                    else:
                        updated_keys.append(entry)

    # NOTE(review): ``diff`` was already iterated by the loop above; if it
    # is a one-shot generator, patch() presumably applies nothing and just
    # returns a copy of ``new_data`` -- confirm this is intended.
    updates = dictdiffer.patch(diff, new_data)
    # Iterate over a snapshot of the keys because entries are deleted
    # from ``updates`` inside the loop.
    for key in list(updates.keys()):
        if key not in updated_keys:
            del updates[key]

    if updates:
        updates['recid'] = recid
        marcxml = Record(updates).legacy_export_as_marc()
        result = make_robotupload_marcxml(
            url=url,
            marcxml=marcxml,
            callback_url=callback_url,
            mode='correct',
            nonce=obj.id
        )
        if "[INFO]" not in result.text:
            if "cannot use the service" in result.text:
                # IP not in the list
                obj.log.error("Your IP is not in "
                              "CFG_BATCHUPLOADER_WEB_ROBOT_RIGHTS "
                              "on host")
                obj.log.error(result.text)
            from invenio.modules.workflows.errors import WorkflowError
            txt = "Error while submitting robotupload: {0}".format(result.text)
            raise WorkflowError(txt, eng.uuid, obj.id)
        else:
            obj.log.info("Robotupload sent!")
            obj.log.info(result.text)
            eng.halt("Waiting for robotupload: {0}".format(result.text))
        obj.log.info("end of upload")
    else:
        obj.log.info("No updates to do.")
def arxiv_fulltext_download(obj, eng):
    """Perform the fulltext download step for arXiv records.

    When no PDF is recorded in ``obj.extra_data["_result"]["pdf"]`` the
    PDF is harvested with ``harvest_single``, recorded there, added to
    ``obj.data['fft']`` as an FFT entry and published as a task result.
    The FFT doctype comes from the repository argument ``t_doctype`` and
    falls back to ``'arXiv'``.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.utils.plotextractor.getter import harvest_single
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield

    # Initialise the result store only when it is really missing, so that
    # results recorded by earlier tasks (including a previous PDF) survive.
    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    if "pdf" in obj.extra_data["_result"]:
        eng.log.info("There was already a pdf register for this record,"
                     "perhaps a duplicate task in you workflow.")
        return

    extract_path = os.path.join(
        cfg['CFG_TMPSHAREDDIR'],
        str(eng.uuid)
    )
    if not os.path.exists(extract_path):
        os.makedirs(extract_path)
    # Only the PDF is requested; the tarball slot is not used here.
    dummy_tarball, pdf = harvest_single(
        obj.data["system_control_number"]["value"],
        extract_path, ["pdf"])

    # A missing repository/arguments entry is handled like a missing
    # t_doctype: warn and fall back to the default doctype.
    arguments = obj.extra_data.get("repository", {}).get("arguments", {})
    try:
        if not arguments['t_doctype'] == '':
            doctype = arguments['t_doctype']
        else:
            doctype = 'arXiv'
    except KeyError:
        eng.log.error("WARNING: HASARDOUS BEHAVIOUR EXPECTED, "
                      "You didn't specified t_doctype in argument"
                      " for fulltext_download,"
                      "try to recover by using the default one!")
        doctype = 'arXiv'

    if not pdf:
        obj.log.error("No PDF found.")
        return

    obj.extra_data["_result"]["pdf"] = pdf
    fulltext_xml = (
        " <datafield tag=\"FFT\" ind1=\" \" ind2=\" \">\n"
        " <subfield code=\"a\">%(url)s</subfield>\n"
        " <subfield code=\"t\">%(doctype)s</subfield>\n"
        " </datafield>"
    ) % {'url': obj.extra_data["_result"]["pdf"],
         'doctype': doctype}

    updated_xml = (
        '<?xml version="1.0"?>\n'
        '<collection>\n<record>\n' + fulltext_xml +
        '</record>\n</collection>'
    )

    new_dict_representation = convert_marcxml_to_bibfield(updated_xml)

    # Normalise to a list so the "no fft yet" fallback below stores a flat
    # list instead of wrapping an existing list in another list.
    new_fft = new_dict_representation["fft"]
    if not isinstance(new_fft, list):
        new_fft = [new_fft]
    try:
        for element in new_fft:
            obj.data['fft'].append(element)
    except (KeyError, TypeError):
        obj.data['fft'] = list(new_fft)

    filename = os.path.basename(pdf)
    fileinfo = {
        "type": "Fulltext",
        "filename": filename,
        "full_path": pdf,
    }
    obj.add_task_result(filename,
                        fileinfo,
                        "workflows/results/files.html")
def author_list(obj, eng):
    """Perform the special authorlist extraction step.

    The tarball is taken from ``obj.extra_data["_result"]["tarball"]`` or
    harvested with ``harvest_single``, unpacked, and every ``xml`` file
    found is tested against ``REGEXP_AUTHLIST``. The first file that
    converts to exactly one non-empty record sets ``obj.data['authors']``
    and ``obj.data['number_of_authors']`` and publishes both as task
    results; the search then stops. The stylesheet is the repository
    argument ``a_stylesheet`` or ``authorlist2marcxml.xsl``.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    :raises WorkflowError: when no tarball could be harvested
    """
    from invenio.legacy.oaiharvest.utils import (
        translate_fieldvalues_from_latex,
        find_matching_files)
    from invenio.legacy.bibrecord import create_records, record_xml_output
    from invenio.legacy.bibconvert.xslt_engine import convert
    from invenio.utils.plotextractor.cli import get_defaults
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.plotextractor.getter import harvest_single
    from invenio.modules.workflows.errors import WorkflowError
    from invenio.utils.plotextractor.converter import untar
    from invenio.utils.shell import Timeout

    identifiers = obj.data["system_control_number"]["value"]

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    if "tarball" not in obj.extra_data["_result"]:
        extract_path = os.path.join(
            cfg['CFG_TMPSHAREDDIR'],
            str(eng.uuid)
        )
        if not os.path.exists(extract_path):
            os.makedirs(extract_path)
        tarball, dummy_pdf = harvest_single(
            identifiers,
            extract_path, ["tarball"])
        # Test for a missing tarball before the str() conversion:
        # str(None) is the non-None string "None".
        if tarball is None:
            raise WorkflowError(str(
                "Error harvesting tarball from id: %s %s" % (
                    identifiers, extract_path)),
                eng.uuid,
                id_object=obj.id)
        obj.extra_data["_result"]["tarball"] = str(tarball)

    tarball = obj.extra_data["_result"]["tarball"]
    sub_dir, dummy = get_defaults(tarball, cfg['CFG_TMPDIR'], "")

    try:
        untar(tarball, sub_dir)
        obj.log.info("Extracted tarball to: {0}".format(sub_dir))
    except Timeout:
        eng.log.error('Timeout during tarball extraction on %s' % (
            tarball))

    xml_files_list = find_matching_files(sub_dir, ["xml"])
    obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

    for xml_file in xml_files_list:
        with open(xml_file, "r") as xml_file_fd:
            xml_content = xml_file_fd.read()

        if not REGEXP_AUTHLIST.findall(xml_content):
            continue

        obj.log.info("Found a match for author extraction")
        a_stylesheet = obj.extra_data["repository"]["arguments"].get(
            "a_stylesheet"
        ) or "authorlist2marcxml.xsl"
        authors = convert(xml_content, a_stylesheet)
        authorlist_record = create_records(authors)
        if len(authorlist_record) != 1:
            continue

        authorlist_record = authorlist_record[0][0]
        if authorlist_record is None:
            eng.log.error("Error parsing authorlist record for id: %s" % (
                identifiers,))
            # Nothing usable in this file; try the next one.
            continue

        # Convert any LaTeX symbols in authornames
        translate_fieldvalues_from_latex(authorlist_record, '100', code='a')
        translate_fieldvalues_from_latex(authorlist_record, '700', code='a')

        updated_xml = (
            '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n'
            + record_xml_output(authorlist_record) + '</collection>'
        )
        new_dict_representation = convert_marcxml_to_bibfield(updated_xml)
        obj.data['authors'] = new_dict_representation["authors"]
        obj.data['number_of_authors'] = new_dict_representation[
            "number_of_authors"]
        obj.add_task_result("authors", new_dict_representation["authors"])
        obj.add_task_result("number_of_authors",
                            new_dict_representation["number_of_authors"])
        break
def _plot_extract(obj, eng):
    """Perform the plotextraction step.

    Download tarball for each harvested/converted record,
    then run plotextrator.

    Update converted xml files with generated xml or add it for upload.

    The step only runs when ``'latex'`` is among the extraction sources
    (repository argument ``p_extraction-source``, else
    ``plotextractor_types``). A timeout while unpacking the tarball is
    logged and ends the step.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    :raises WorkflowError: when no tarball could be harvested
    """
    from invenio.utils.plotextractor.output_utils import (
        create_MARC,
        create_contextfiles,
        prepare_image_data,
        remove_dups)
    from invenio.utils.plotextractor.cli import (
        get_defaults,
        extract_captions,
        extract_context)
    from invenio.utils.plotextractor.converter import convert_images
    from invenio.utils.plotextractor.getter import harvest_single
    from invenio.utils.plotextractor.converter import untar
    from invenio.modules.workflows.errors import WorkflowError
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.shell import run_shell_command, Timeout

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    repository = obj.extra_data.get("repository", {})
    arguments = repository.get("arguments", {})

    if 'p_extraction-source' not in arguments:
        p_extraction_source = plotextractor_types
    else:
        p_extraction_source = arguments.get('p_extraction-source', "")

    if not isinstance(p_extraction_source, list):
        p_extraction_source = [p_extraction_source]

    if 'latex' not in p_extraction_source:
        return

    # Run LaTeX plotextractor
    if "tarball" not in obj.extra_data["_result"]:
        extract_path = os.path.join(
            cfg['CFG_TMPSHAREDDIR'],
            str(eng.uuid)
        )
        if not os.path.exists(extract_path):
            os.makedirs(extract_path)
        tarball, dummy_pdf = harvest_single(
            obj.data["system_control_number"]["value"],
            extract_path, ["tarball"])
        # Test for a missing tarball before the str() conversion:
        # str(None) is the non-None string "None".
        if tarball is None:
            raise WorkflowError(
                str("Error harvesting tarball from id: %s %s" %
                    (obj.data["system_control_number"]["value"],
                     extract_path)),
                eng.uuid,
                id_object=obj.id)
        tarball = str(tarball)
        obj.extra_data["_result"]["tarball"] = tarball
    else:
        tarball = obj.extra_data["_result"]["tarball"]

    sub_dir, refno = get_defaults(tarball, cfg['CFG_TMPDIR'], "")

    try:
        dummy_files, image_list, tex_files = untar(tarball, sub_dir)
    except Timeout:
        eng.log.error(
            'Timeout during tarball extraction on %s' % (tarball,))
        # Without an image list the conversion and counting steps below
        # cannot run, so stop here.
        return

    converted_image_list = convert_images(image_list)
    eng.log.info('converted %d of %d images found for %s' % (
        len(converted_image_list),
        len(image_list),
        os.path.basename(tarball)))

    extracted_image_data = []
    if tex_files == [] or tex_files is None:
        eng.log.error(
            '%s is not a tarball' % (os.path.split(tarball)[-1],))
        run_shell_command('rm -r %s', (sub_dir,))
    else:
        for tex_file in tex_files:
            # Extract images, captions and labels
            partly_extracted_image_data = extract_captions(
                tex_file, sub_dir, converted_image_list)
            if partly_extracted_image_data:
                # Add proper filepaths and do various cleaning
                cleaned_image_data = prepare_image_data(
                    partly_extracted_image_data,
                    tex_file, converted_image_list)
                # Using prev. extracted info, get contexts for each
                # image found
                extracted_image_data.extend(
                    (extract_context(tex_file, cleaned_image_data)))

    if not extracted_image_data:
        return

    extracted_image_data = remove_dups(extracted_image_data)
    create_contextfiles(extracted_image_data)
    marc_xml = '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n'
    marc_xml += create_MARC(extracted_image_data, tarball, None)
    marc_xml += "\n</collection>"

    # We store the path to the directory the tarball
    # contents live
    # Read and grab MARCXML from plotextractor run
    new_dict = convert_marcxml_to_bibfield(marc_xml)

    # Normalise to a list so the "no fft yet" fallback below stores a flat
    # list instead of wrapping an existing list in another list.
    new_fft = new_dict["fft"]
    if not isinstance(new_fft, list):
        new_fft = [new_fft]
    try:
        for element in new_fft:
            obj.data['fft'].append(element)
    except KeyError:
        obj.data['fft'] = list(new_fft)

    obj.add_task_result("filesfft", new_dict["fft"])
    obj.add_task_result("number_picture_converted",
                        len(converted_image_list))
    obj.add_task_result("number_of_picture_total",
                        len(image_list))