def _plot_extract(obj, eng):
    """Extract plots from the record's arXiv tarball and record them on ``obj``.

    Extraction only runs when ``'latex'`` is among the chosen extraction
    types. These come from ``plotextractor_types`` or, when that is empty,
    from the ``p_extraction-source`` argument of the repository stored in
    ``obj.extra_data``.

    The tarball path is cached in ``obj.extra_data["_result"]["tarball"]``
    and only fetched through ``get_tarball_from_arxiv`` when it is missing.

    :param obj: workflow object; its ``data``, ``extra_data`` and ``log``
        attributes are used here.
    :param eng: workflow engine; ``eng.uuid`` names the download directory
        and ``eng.log`` receives timeout errors.
    :return: ``None``; results are stored on ``obj``.
    """
    from invenio.utils.plotextractor.api import (
        get_tarball_from_arxiv,
        get_marcxml_plots_from_tarball
    )
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.shell import Timeout

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    repository = obj.extra_data.get("repository", {})
    arguments = repository.get("arguments", {})

    chosen_type = plotextractor_types

    if not chosen_type:
        chosen_type = arguments.get('p_extraction-source', [])

    # NOTE(review): any non-list value (including a tuple) is wrapped whole,
    # so a tuple such as ("latex",) would not match the 'latex' check below.
    # Confirm callers pass a list or a plain string.
    if not isinstance(chosen_type, list):
        chosen_type = [chosen_type]

    if 'latex' in chosen_type:
        # Run LaTeX plotextractor
        if "tarball" not in obj.extra_data["_result"]:
            extract_path = os.path.join(
                cfg.get('OAIHARVESTER_STORAGEDIR',
                        cfg.get('CFG_TMPSHAREDDIR')),
                str(eng.uuid)
            )
            tarball = get_tarball_from_arxiv(
                obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
                extract_path
            )
            if tarball is None:
                obj.log.error("No tarball found")
                return
            obj.extra_data["_result"]["tarball"] = tarball
        else:
            tarball = obj.extra_data["_result"]["tarball"]

        try:
            marcxml = get_marcxml_plots_from_tarball(tarball)
        except Timeout:
            eng.log.error(
                'Timeout during tarball extraction on {0}'.format(tarball)
            )
            # Without this binding the check below would raise
            # UnboundLocalError after a timeout.
            marcxml = None

        if marcxml:
            # Convert the plot MARCXML and hand the resulting dict to
            # _attach_files_to_obj before publishing the task result.
            new_dict = convert_marcxml_to_bibfield(marcxml)
            _attach_files_to_obj(obj, new_dict)
            obj.update_task_results(
                "Plots",
                [{
                    "name": "Plots",
                    "result": new_dict["fft"],
                    "template": "workflows/results/plots.html"
                }]
            )
def get_tarball_for_model(eng, arxiv_id):
    """Fetch the arXiv tarball for ``arxiv_id`` into an engine-specific folder.

    The target folder is ``<storage dir>/<eng.uuid>``, where the storage
    directory is read from ``OAIHARVESTER_STORAGEDIR`` and falls back to
    ``CFG_TMPSHAREDDIR``.

    :param eng: workflow engine; only ``eng.uuid`` is used.
    :param arxiv_id: identifier passed through to ``get_tarball_from_arxiv``.
    :return: whatever ``get_tarball_from_arxiv`` returns for that identifier.
    """
    storage_root = cfg.get(
        'OAIHARVESTER_STORAGEDIR',
        cfg.get('CFG_TMPSHAREDDIR'),
    )
    target_dir = os.path.join(storage_root, str(eng.uuid))
    return get_tarball_from_arxiv(arxiv_id, target_dir)
def _plot_extract(obj, eng):
    """Run the LaTeX plot extractor on the record's arXiv tarball.

    The extraction types are taken from ``plotextractor_types``; when that
    is empty, the ``p_extraction-source`` repository argument found in
    ``obj.extra_data`` is used instead. Nothing happens unless ``'latex'``
    is one of the chosen types.

    The tarball path is read from ``obj.extra_data["_result"]["tarball"]``
    when present; otherwise it is obtained with ``get_tarball_from_arxiv``
    and stored there.

    :param obj: workflow object (``data``, ``extra_data`` and ``log`` are
        used).
    :param eng: workflow engine (``uuid`` and ``log`` are used).
    :return: ``None``; extracted plots are reported through
        ``obj.update_task_results``.
    """
    from invenio.utils.plotextractor.api import (
        get_tarball_from_arxiv,
        get_marcxml_plots_from_tarball)
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.shell import Timeout

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    repository = obj.extra_data.get("repository", {})
    arguments = repository.get("arguments", {})

    chosen_type = plotextractor_types
    if not chosen_type:
        chosen_type = arguments.get('p_extraction-source', [])

    # NOTE(review): a tuple is not a list, so it gets wrapped as a single
    # element and 'latex' would not be found in it. Verify against callers.
    if not isinstance(chosen_type, list):
        chosen_type = [chosen_type]

    if 'latex' in chosen_type:
        # Run LaTeX plotextractor
        if "tarball" not in obj.extra_data["_result"]:
            extract_path = os.path.join(
                cfg.get('OAIHARVESTER_STORAGEDIR',
                        cfg.get('CFG_TMPSHAREDDIR')),
                str(eng.uuid))
            tarball = get_tarball_from_arxiv(
                obj.data.get(
                    cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
                extract_path)
            if tarball is None:
                obj.log.error("No tarball found")
                return
            obj.extra_data["_result"]["tarball"] = tarball
        else:
            tarball = obj.extra_data["_result"]["tarball"]

        # Pre-bind the result so a timeout leaves a defined, falsy value
        # instead of an unbound name for the check below.
        marcxml = None
        try:
            marcxml = get_marcxml_plots_from_tarball(tarball)
        except Timeout:
            eng.log.error(
                'Timeout during tarball extraction on {0}'.format(tarball))

        if marcxml:
            # Convert the plot MARCXML, pass the dict to
            # _attach_files_to_obj, then publish the "fft" entries.
            new_dict = convert_marcxml_to_bibfield(marcxml)
            _attach_files_to_obj(obj, new_dict)
            obj.update_task_results(
                "Plots", [{
                    "name": "Plots",
                    "result": new_dict["fft"],
                    "template": "workflows/results/plots.html"
                }])
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    The tarball path is taken from ``obj.extra_data["_result"]["tarball"]``
    when present; otherwise it is fetched with ``get_tarball_from_arxiv``
    into ``<storage dir>/<eng.uuid>`` and cached under that key. The plot
    MARCXML produced from the tarball is converted, passed to
    ``_attach_files_to_obj`` and published as the ``"Plots"`` task result.

    :param obj: workflow object (``data``, ``extra_data`` and ``log`` are
        used).
    :param eng: workflow engine (``uuid`` and ``log`` are used).
    :return: ``None``. Returns early when no tarball is found.
    """
    from invenio.utils.plotextractor.api import (
        get_tarball_from_arxiv,
        get_marcxml_plots_from_tarball
    )
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.shell import Timeout

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    if "tarball" not in obj.extra_data["_result"]:
        extract_path = os.path.join(
            cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
            str(eng.uuid)
        )
        tarball = get_tarball_from_arxiv(
            obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
            extract_path
        )
        if tarball is None:
            obj.log.error("No tarball found")
            return
        obj.extra_data["_result"]["tarball"] = tarball
    else:
        tarball = obj.extra_data["_result"]["tarball"]

    try:
        marcxml = get_marcxml_plots_from_tarball(tarball)
    except Timeout:
        eng.log.error(
            'Timeout during tarball extraction on {0}'.format(tarball)
        )
        # Keep the name bound: the check below would otherwise raise
        # UnboundLocalError after a timeout.
        marcxml = None

    if marcxml:
        # Convert the plot MARCXML and hand the dict to
        # _attach_files_to_obj before publishing the result.
        new_dict = convert_marcxml_to_bibfield(marcxml)
        _attach_files_to_obj(obj, new_dict)
        obj.update_task_results(
            "Plots",
            [{
                "name": "Plots",
                "result": new_dict["fft"],
                "template": "workflows/results/plots.html"
            }]
        )
        obj.log.info("Added {0} plots.".format(len(new_dict["fft"])))
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    Uses the tarball path cached in
    ``obj.extra_data["_result"]["tarball"]`` or, when absent, obtains one
    with ``get_tarball_from_arxiv`` and caches it. The MARCXML returned by
    ``get_marcxml_plots_from_tarball`` is converted and its ``"fft"``
    entries are published as the ``"Plots"`` task result.

    :param obj: workflow object (``data``, ``extra_data`` and ``log`` are
        used).
    :param eng: workflow engine (``uuid`` and ``log`` are used).
    :return: ``None``. Returns early when no tarball is found.
    """
    from invenio.utils.plotextractor.api import (
        get_tarball_from_arxiv,
        get_marcxml_plots_from_tarball)
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.shell import Timeout

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    if "tarball" not in obj.extra_data["_result"]:
        extract_path = os.path.join(
            cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
            str(eng.uuid))
        tarball = get_tarball_from_arxiv(
            obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
            extract_path)
        if tarball is None:
            obj.log.error("No tarball found")
            return
        obj.extra_data["_result"]["tarball"] = tarball
    else:
        tarball = obj.extra_data["_result"]["tarball"]

    # Pre-bind so that a timeout leaves a defined, falsy value rather than
    # an unbound local for the check further down.
    marcxml = None
    try:
        marcxml = get_marcxml_plots_from_tarball(tarball)
    except Timeout:
        eng.log.error(
            'Timeout during tarball extraction on {0}'.format(tarball))

    if marcxml:
        # Convert the plot MARCXML, pass the dict to _attach_files_to_obj,
        # then report the "fft" entries.
        new_dict = convert_marcxml_to_bibfield(marcxml)
        _attach_files_to_obj(obj, new_dict)
        obj.update_task_results("Plots", [{
            "name": "Plots",
            "result": new_dict["fft"],
            "template": "workflows/results/plots.html"
        }])
        obj.log.info("Added {0} plots.".format(len(new_dict["fft"])))
def _author_list(obj, eng):
    """Extract an author list from XML files inside the arXiv tarball.

    The tarball is untarred and every ``xml`` file found is checked against
    ``REGEXP_AUTHLIST``. A matching file is converted with ``stylesheet``
    and parsed with ``create_records``; when the record serialises to
    non-empty XML, ``obj.data["authors"]`` is set and the ``"authors"`` and
    ``"number_of_authors"`` task results are published. Processing stops
    at that point.

    The tarball path is read from ``obj.extra_data["_result"]["tarball"]``
    or, when absent, fetched with ``get_tarball_from_arxiv`` and cached
    under that key.

    :param obj: workflow object (``data``, ``extra_data`` and ``log`` are
        used).
    :param eng: workflow engine (``uuid`` and ``log`` are used).
    :return: ``None``. Returns early when no tarball is found.
    """
    from invenio.legacy.bibrecord import create_records, record_xml_output
    from invenio.legacy.bibconvert.xslt_engine import convert
    from invenio.utils.plotextractor.api import get_tarball_from_arxiv
    from invenio.utils.plotextractor.cli import get_defaults
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.plotextractor.converter import untar
    from invenio.utils.shell import Timeout
    from ..utils import find_matching_files

    identifiers = obj.data.get(
        cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP'), "")

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    if "tarball" not in obj.extra_data["_result"]:
        extract_path = os.path.join(
            cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
            str(eng.uuid))
        tarball = get_tarball_from_arxiv(
            obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
            extract_path)
        if tarball is None:
            obj.log.error("No tarball found")
            return
        # Cache the path, as the plot-extraction tasks in this file do, so
        # the "_result" entry exists for later readers.
        obj.extra_data["_result"]["tarball"] = tarball
    else:
        tarball = obj.extra_data["_result"]["tarball"]

    # FIXME
    tarball = str(tarball)
    sub_dir, _ = get_defaults(tarball, cfg['CFG_TMPDIR'], "")

    try:
        untar(tarball, sub_dir)
        obj.log.info("Extracted tarball to: {0}".format(sub_dir))
    except Timeout:
        # Log the local path: looking it up in extra_data here could raise
        # KeyError inside the handler when the entry was never written.
        eng.log.error('Timeout during tarball extraction on %s' % (tarball,))

    xml_files_list = find_matching_files(sub_dir, ["xml"])
    obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

    for xml_file in xml_files_list:
        # The context manager closes the handle even if read() fails.
        with open(xml_file, "r") as xml_file_fd:
            xml_content = xml_file_fd.read()

        match = REGEXP_AUTHLIST.findall(xml_content)
        if match:
            obj.log.info("Found a match for author extraction")
            authors = convert(xml_content, stylesheet)
            authorlist_record = create_records(authors)
            if len(authorlist_record) == 1:
                if authorlist_record[0][0] is None:
                    eng.log.error(
                        "Error parsing authorlist record for id: %s"
                        % (identifiers, ))
                authorlist_record = authorlist_record[0][0]

            author_xml = record_xml_output(authorlist_record)
            if author_xml:
                # Reuse the XML serialised above instead of serialising
                # the same record a second time.
                updated_xml = (
                    '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n'
                    + author_xml + '</collection>'
                )
                new_dict_representation = convert_marcxml_to_bibfield(
                    updated_xml)
                obj.data["authors"] = new_dict_representation["authors"]
                obj.update_task_results(
                    "authors", [{
                        "name": "authors",
                        "results": new_dict_representation["authors"]
                    }])
                obj.update_task_results("number_of_authors", [{
                    "name": "number_of_authors",
                    "results": new_dict_representation["number_of_authors"]
                }])
                # The first file yielding an author list wins.
                break
def _author_list(obj, eng):
    """Look for an author-list XML file in the arXiv tarball and apply it.

    After untarring, each ``xml`` file is matched against
    ``REGEXP_AUTHLIST``. The first matching file whose converted record
    serialises to non-empty XML sets ``obj.data["authors"]`` and produces
    the ``"authors"`` and ``"number_of_authors"`` task results; the loop
    then stops.

    The tarball path comes from ``obj.extra_data["_result"]["tarball"]``
    or, when that key is missing, from ``get_tarball_from_arxiv``. A
    fetched path is cached under the same key.

    :param obj: workflow object (``data``, ``extra_data`` and ``log`` are
        used).
    :param eng: workflow engine (``uuid`` and ``log`` are used).
    :return: ``None``. Returns early when no tarball is found.
    """
    from invenio.legacy.bibrecord import create_records, record_xml_output
    from invenio.legacy.bibconvert.xslt_engine import convert
    from invenio.utils.plotextractor.api import get_tarball_from_arxiv
    from invenio.utils.plotextractor.cli import get_defaults
    from invenio.modules.workflows.utils import convert_marcxml_to_bibfield
    from invenio.utils.plotextractor.converter import untar
    from invenio.utils.shell import Timeout
    from ..utils import find_matching_files

    identifiers = obj.data.get(
        cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP'), "")

    if "_result" not in obj.extra_data:
        obj.extra_data["_result"] = {}

    if "tarball" not in obj.extra_data["_result"]:
        extract_path = os.path.join(
            cfg.get('OAIHARVESTER_STORAGEDIR', cfg.get('CFG_TMPSHAREDDIR')),
            str(eng.uuid)
        )
        tarball = get_tarball_from_arxiv(
            obj.data.get(cfg.get('OAIHARVESTER_RECORD_ARXIV_ID_LOOKUP')),
            extract_path
        )
        if tarball is None:
            obj.log.error("No tarball found")
            return
        # Store the fetched path so the "_result" entry is populated,
        # matching what the plot-extraction tasks in this file do.
        obj.extra_data["_result"]["tarball"] = tarball
    else:
        tarball = obj.extra_data["_result"]["tarball"]

    # FIXME
    tarball = str(tarball)
    sub_dir, _ = get_defaults(tarball, cfg['CFG_TMPDIR'], "")

    try:
        untar(tarball, sub_dir)
        obj.log.info("Extracted tarball to: {0}".format(sub_dir))
    except Timeout:
        # Use the local variable; reading the extra_data entry here could
        # raise KeyError inside the handler when it was never written.
        eng.log.error('Timeout during tarball extraction on %s' % (tarball,))

    xml_files_list = find_matching_files(sub_dir, ["xml"])
    obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

    for xml_file in xml_files_list:
        # Context manager guarantees the file is closed if read() raises.
        with open(xml_file, "r") as xml_file_fd:
            xml_content = xml_file_fd.read()

        match = REGEXP_AUTHLIST.findall(xml_content)
        if match:
            obj.log.info("Found a match for author extraction")
            authors = convert(xml_content, stylesheet)
            authorlist_record = create_records(authors)
            if len(authorlist_record) == 1:
                if authorlist_record[0][0] is None:
                    eng.log.error(
                        "Error parsing authorlist record for id: %s" % (
                            identifiers,))
                authorlist_record = authorlist_record[0][0]

            author_xml = record_xml_output(authorlist_record)
            if author_xml:
                # The record was already serialised just above; reuse it.
                updated_xml = (
                    '<?xml version="1.0" encoding="UTF-8"?>\n<collection>\n'
                    + author_xml + '</collection>'
                )
                new_dict_representation = convert_marcxml_to_bibfield(
                    updated_xml)
                obj.data["authors"] = new_dict_representation["authors"]
                obj.update_task_results(
                    "authors",
                    [{
                        "name": "authors",
                        "results": new_dict_representation["authors"]
                    }]
                )
                obj.update_task_results(
                    "number_of_authors",
                    [{
                        "name": "number_of_authors",
                        "results":
                            new_dict_representation["number_of_authors"]
                    }]
                )
                # Stop after the first file that yields an author list.
                break