def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    :param obj: BibWorkflowObject to process
    :param eng: BibWorkflowEngine processing the object
    """
    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    arxiv_id = get_arxiv_id_from_record(record)
    existing_file = get_file_by_name(model, arxiv_id)
    if not existing_file:
        # We download it
        tarball = get_tarball_for_model(eng, arxiv_id)
        if tarball is None:
            obj.log.error("No tarball found")
            return
        add_file_by_name(model, tarball)
    else:
        tarball = existing_file.get_syspath()
    try:
        marcxml = get_marcxml_plots_from_tarball(tarball)
    except Timeout:
        # FIX: previously execution fell through to `if marcxml` with the
        # variable unbound, raising UnboundLocalError. Abort instead.
        eng.log.error(
            "Timeout during tarball extraction on {0}".format(tarball)
        )
        return
    if marcxml:
        # We store the path to the directory the tarball contents lives
        new_dict = get_json_from_marcxml(marcxml)[0]
        record.update(new_dict)
        obj.update_task_results(
            "Plots",
            [{
                "name": "Plots",
                "result": new_dict["fft"],
                "template": "workflows/results/plots.html"
            }]
        )
        obj.log.info("Added {0} plots.".format(len(new_dict["fft"])))
        model.update()
def extract_journal_info(obj, eng):
    """Extract journal, volume etc. from any freetext publication info."""
    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    publication_info = record.get("publication_info")
    if not publication_info:
        return
    # Maps keys produced by the reference extractor onto pubnote keys.
    # Order matters: fields are written in this sequence.
    field_map = (
        ("volume", "journal_volume"),
        ("title", "journal_title"),
        ("year", "year"),
        ("page", "page_artid"),
    )
    updated_pubnotes = []
    for pubnote in publication_info:
        freetext = pubnote.get("pubinfo_freetext")
        if freetext:
            extracted = extract_journal_reference(freetext)
            if extracted:
                for source_key, target_key in field_map:
                    if source_key in extracted:
                        pubnote[target_key] = extracted.get(source_key)
        updated_pubnotes.append(pubnote)
    record["publication_info"] = updated_pubnotes
    model.update()
def get_description(bwo):
    """Get the description column part."""
    if not isinstance(bwo.data, dict):
        return "No description found."
    model = process_record_arxiv.model(bwo)
    record = get_record_from_model(model)
    abstract = ""
    authors = []
    categories = []
    final_identifiers = []
    if hasattr(record, "get"):
        # Collect identifiers: DOIs first, then system control numbers.
        for identifier_path in ("doi.doi",
                                "system_control_number.system_control_number"):
            values = record.get(identifier_path, [])
            if values:
                final_identifiers.extend(values)
        # Get subject categories, adding main one first. Order matters here.
        categories = record.get("report_number.arxiv_category", [])
        categories.extend(record.get("subject_term.value", []))
        categories = list(OrderedDict.fromkeys(categories))  # Unique only
        abstract = record.get("abstract.summary", [""])[0]
        authors = record.get("authors", [])
    return render_template(
        'workflows/styles/harvesting_record.html',
        object=bwo,
        authors=authors,
        categories=categories,
        abstract=abstract,
        identifiers=final_identifiers,
    )
def _guess_coreness(obj, eng):
    """Run the classifier model over the record and store the prediction."""
    from invenio.base.globals import cfg
    from .arxiv import predict

    if os.path.basename(model_path) == model_path:
        # Only a bare filename was given; resolve it against the
        # configured classifier model directory.
        full_model_path = os.path.join(
            cfg.get("CLASSIFIER_MODEL_PATH"), model_path
        )
    else:
        # The new variable is needed due to how parameters in closures work
        full_model_path = model_path

    if not os.path.isfile(full_model_path):
        obj.log.error(
            "Model file {0} not found! Skipping prediction...".format(
                full_model_path
            )
        )
        return

    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    prepared_record = prepare_prediction_record(record)
    pipeline = load_model(full_model_path)
    decision, scores = predict(pipeline, prepared_record)
    obj.log.info(
        "Successfully predicted as {0} with {1}".format(decision, max(scores))
    )

    result = {
        "decision": decision,
        "max_score": max(scores),
        "all_scores": scores,
    }
    task_result = {
        "name": "arxiv_guessing",
        "result": result,
        "template": "workflows/results/arxiv_guessing.html",
    }
    obj.update_task_results(task_result.get("name"), [task_result])
def match(obj, eng):
    """Return True if the record already exists in INSPIRE.

    Searches by arXiv identifier and DOI, updates extra_data with the
    first id returned by the search.
    """
    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    matched_ids = set(match_by_arxiv_id(record)) | set(match_by_doi(record))
    response = list(matched_ids)
    if not response:
        return False
    # FIXME(jacquerie): use more than just the first id.
    first_recid = response[0]
    obj.extra_data['recid'] = first_recid
    obj.extra_data['url'] = os.path.join(
        cfg["CFG_ROBOTUPLOAD_SUBMISSION_BASEURL"],
        'record',
        str(first_recid)
    )
    return True
def _author_list(obj, eng):
    """Extract an author list from XML files inside the arXiv tarball.

    :param obj: BibWorkflowObject to process
    :param eng: BibWorkflowEngine processing the object
    """
    from inspire.modules.converter.xslt import convert

    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    arxiv_id = get_arxiv_id_from_record(record)
    existing_file = get_file_by_name(model, "{0}.tar.gz".format(arxiv_id))
    if not existing_file:
        # We download it
        tarball = get_tarball_for_model(eng, arxiv_id)
        if tarball is None:
            obj.log.error("No tarball found")
            return
        add_file_by_name(model, tarball)
    else:
        tarball = existing_file.get_syspath()

    sub_dir = os.path.abspath("{0}_files".format(tarball))
    try:
        file_list = untar(tarball, sub_dir)
    except InvalidTarball:
        obj.log.error("Invalid tarball {0}".format(tarball))
        return
    obj.log.info("Extracted tarball to: {0}".format(sub_dir))

    xml_files_list = [filename for filename in file_list
                      if filename.endswith(".xml")]
    obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

    for xml_file in xml_files_list:
        # FIX: use a context manager so the file handle is closed even if
        # read() raises (the original leaked the descriptor on error).
        with open(xml_file, "r") as xml_file_fd:
            xml_content = xml_file_fd.read()
        match = REGEXP_AUTHLIST.findall(xml_content)
        if match:
            obj.log.info("Found a match for author extraction")
            authors_xml = convert(xml_content, stylesheet)
            authorlist_record = get_json_from_marcxml(authors_xml)[0]
            record.update(authorlist_record)
            obj.update_task_results(
                "authors",
                [{
                    "name": "authors",
                    "results": authorlist_record["authors"]
                }]
            )
            obj.update_task_results(
                "number_of_authors",
                [{
                    "name": "number_of_authors",
                    "results": authorlist_record["number_of_authors"]
                }]
            )
            # Stop after the first XML file that contains an author list.
            break
    model.update()
def get_title(bwo):
    """Return title."""
    if not isinstance(bwo.data, dict):
        return "No title available"
    model = hep_ingestion.model(bwo)
    record = get_record_from_model(model)
    titles = record.get("titles.title")
    return titles[0] if titles else "No title available"
def _arxiv_fulltext_download(obj, eng):
    """Perform the fulltext download step for arXiv records.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    arxiv_id = get_arxiv_id_from_record(record)
    existing_file = get_file_by_name(model, "{0}.pdf".format(arxiv_id))
    if existing_file:
        pdf = existing_file.get_syspath()
    else:
        # We download it
        pdf = get_pdf_for_model(eng, arxiv_id)
        if pdf is None:
            obj.log.error("No pdf found")
            return
        pdf = add_file_by_name(model, pdf)
        obj.extra_data["pdf"] = pdf

    if not pdf:
        obj.log.info("No PDF found.")
        return

    fft_entry = {"url": pdf, "docfile_type": doctype}
    if "fft" in record:
        record["fft"].append(fft_entry)
    else:
        record.update({"fft": [fft_entry]})
    fileinfo = {
        "type": "fulltext",
        "filename": os.path.basename(pdf),
        "full_path": pdf,
    }
    obj.update_task_results(
        os.path.basename(pdf),
        [{
            "name": "PDF",
            "result": fileinfo,
            "template": "workflows/results/files.html"
        }]
    )
    model.update()
def get_record(cls, obj, **kwargs):
    """Return a dictionary-like object representing the current object.

    This object will be used for indexing and be the basis for display
    in Holding Pen.
    """
    if isinstance(obj.data, six.text_type):
        return {}
    # Turn into pure dict
    return get_record_from_model(cls.model(obj)).dumps()
def _author_list(obj, eng):
    """Extract an author list from XML files inside the arXiv tarball.

    :param obj: BibWorkflowObject to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio_oaiharvester.utils import find_matching_files
    from invenio_utils.plotextractor.cli import get_defaults
    from invenio_utils.plotextractor.converter import untar
    from invenio_utils.shell import Timeout
    from inspire.modules.converter.xslt import convert

    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    arxiv_id = get_arxiv_id_from_record(record)
    existing_file = get_file_by_name(model, arxiv_id)
    if not existing_file:
        # We download it
        tarball = get_tarball_for_model(eng, arxiv_id)
        if tarball is None:
            obj.log.error("No tarball found")
            return
        add_file_by_name(model, tarball)
    else:
        tarball = existing_file.get_syspath()

    sub_dir, dummy = get_defaults(str(tarball), cfg["CFG_TMPDIR"], "")
    try:
        untar(str(tarball), sub_dir)
        obj.log.info("Extracted tarball to: {0}".format(sub_dir))
    except Timeout:
        # Best effort: keep going and scan whatever was extracted so far.
        eng.log.error(
            "Timeout during tarball extraction on {0}".format(tarball)
        )

    xml_files_list = find_matching_files(sub_dir, ["xml"])
    obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

    for xml_file in xml_files_list:
        # FIX: use a context manager so the file handle is closed even if
        # read() raises (the original leaked the descriptor on error).
        with open(xml_file, "r") as xml_file_fd:
            xml_content = xml_file_fd.read()
        match = REGEXP_AUTHLIST.findall(xml_content)
        if match:
            obj.log.info("Found a match for author extraction")
            authors_xml = convert(xml_content, stylesheet)
            authorlist_record = get_json_from_marcxml(authors_xml)[0]
            record.update(authorlist_record)
            obj.update_task_results(
                "authors",
                [{"name": "authors",
                  "results": authorlist_record["authors"]}]
            )
            obj.update_task_results(
                "number_of_authors",
                [{"name": "number_of_authors",
                  "results": authorlist_record["number_of_authors"]}],
            )
            # Stop after the first XML file that contains an author list.
            break
    model.update()
def get_record(cls, obj, **kwargs):
    """Return a dictionary-like object representing the current object.

    This object will be used for indexing and be the basis for display
    in Holding Pen.
    """
    record = get_record_from_model(cls.model(obj))
    return record.dumps() if record else {}
def formatter(bwo, **kwargs):
    """Nicely format the record."""
    try:
        record = get_record_from_model(process_record_arxiv.model(bwo))
    except TypeError as err:
        return "Error: {0}".format(err)
    return render_template(
        'format/record/Holding_Pen_HTML_detailed.tpl',
        record=record,
    )
def get_record(cls, obj, **kwargs):
    """Return a dictionary-like object representing the current object.

    This object will be used for indexing and be the basis for display
    in Holding Pen.
    """
    try:
        # Turn into pure dict
        return get_record_from_model(cls.model(obj)).dumps()
    except Exception as err:
        # Indexing must not break on a malformed object; log and fall back.
        current_app.logger.exception(err)
        return {}
def _classify_paper(obj, eng):
    """Run keyword classification on the record against the taxonomy.

    Uses the PDF stored in ``obj.extra_data`` when available (and not in
    fast mode); otherwise falls back to title + abstract text.
    The configuration names (``fast_mode``, ``taxonomy``, ``output_limit``,
    etc.) come from the enclosing closure.
    """
    from invenio_classifier.errors import TaxonomyError

    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    data = None
    is_fast_mode = fast_mode  # closure value; may be flipped below
    if not is_fast_mode:
        if "pdf" in obj.extra_data:
            # Getting path to PDF file
            data = obj.extra_data["pdf"]
            callback = get_keywords_from_local_file
    if not data:
        # No PDF available: classify from title and abstract text instead.
        data = [record.get("titles.title", "")] + record.get("abstracts.value", [])
        callback = get_keywords_from_text
        is_fast_mode = True
    if not data:
        obj.log.error("No classification done due to missing data.")
        return
    try:
        result = callback(data, taxonomy,
                          output_mode='dict',
                          output_limit=output_limit,
                          spires=spires,
                          match_mode=match_mode,
                          no_cache=no_cache,
                          with_author_keywords=with_author_keywords,
                          rebuild_cache=rebuild_cache,
                          only_core_tags=only_core_tags,
                          extract_acronyms=extract_acronyms)
    except TaxonomyError as e:
        obj.log.exception(e)
        return
    # NOTE(review): return value is discarded — presumably this mutates the
    # nested output dicts in place; confirm against its definition.
    clean_instances_from_data(result.get("complete_output", {}))
    final_result = {"dict": result}
    # NOTE(review): records the closure's `fast_mode`, not the possibly
    # flipped local `is_fast_mode` — confirm this is intentional.
    final_result["fast_mode"] = fast_mode
    # Check if it is not empty output before adding
    output = result.get("complete_output", {}).values()
    if not any(output):
        final_result["dict"] = {}
    name = "classification"
    obj.update_task_results(
        name,
        [{
            "name": name,
            "result": final_result,
            "template": "workflows/results/classifier.html"
        }]
    )
def add_core(obj, eng):
    """Add CORE collection tag to collections."""
    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    collections = record.get("collections", [])
    # Skip when a CORE tag (any casing) is already present.
    already_core = any(
        value.lower() == "core"
        for collection in collections
        for value in collection.values()
    )
    if not already_core:
        collections.append({"primary": "CORE"})
        record["collections"] = collections
        model.update()
def arxiv_refextract(obj, eng):
    """Perform the reference extraction step.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    from invenio.legacy.refextract.api import extract_references_from_file_xml

    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    arxiv_id = get_arxiv_id_from_record(record)
    existing_file = get_file_by_name(model, "{0}.pdf".format(arxiv_id))
    if not existing_file:
        # We download it
        pdf = get_pdf_for_model(eng, arxiv_id)
        if pdf is None:
            obj.log.error("No pdf found")
            return
        add_file_by_name(model, pdf)
    else:
        pdf = existing_file.get_syspath()
    if pdf and os.path.isfile(pdf):
        references_xml = extract_references_from_file_xml(pdf)
        if references_xml:
            updated_xml = (
                '<?xml version="1.0" encoding="UTF-8"?>\n'
                "<collection>\n" + references_xml + "\n</collection>"
            )
            new_dict = get_json_from_marcxml(updated_xml)[0]
            if "references" in new_dict:
                record["references"] = new_dict["references"]
                # FIX: count the freshly extracted references. The original
                # read obj.data["references"], but the update above was
                # applied to `record`; obj.data is not guaranteed to carry
                # that key here, so it could raise KeyError/TypeError.
                obj.log.info("Extracted {0} references".format(
                    len(new_dict["references"])
                ))
                obj.update_task_results(
                    "References",
                    [
                        {
                            "name": "References",
                            "result": new_dict["references"],
                            "template": "workflows/results/refextract.html",
                        }
                    ],
                )
                model.update()
            else:
                obj.log.info("No references extracted")
    else:
        obj.log.error("Not able to download and process the PDF")
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive."""
    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    arxiv_id = get_arxiv_id_from_record(record)
    existing_file = get_file_by_name(model, "{0}.tar.gz".format(arxiv_id))
    if existing_file:
        tarball = existing_file.get_syspath()
    else:
        # We download it
        tarball = get_tarball_for_model(eng, arxiv_id)
        if tarball is None:
            obj.log.error("No tarball found")
            return
        add_file_by_name(model, tarball)
    try:
        plots = process_tarball(tarball)
    except InvalidTarball:
        eng.log.error('Invalid tarball {0}'.format(tarball))
        return
    if not plots:
        return
    # We store the path to the directory the tarball contents lives
    plots_dict = get_json_for_plots(plots)
    record.update(plots_dict)
    obj.update_task_results(
        "Plots",
        [{
            "name": "Plots",
            "result": plots_dict["fft"],
            "template": "workflows/results/plots.html"
        }]
    )
    obj.log.info("Added {0} plots.".format(len(plots_dict["fft"])))
    model.update()
def _exists_in_inspire_or_rejected(obj, eng):
    """Return True when the record is already in INSPIRE or likely rejected."""
    if match(obj, eng):
        obj.log.info("Record already exists in INSPIRE.")
        return True
    if cfg.get('PRODUCTION_MODE'):
        model = eng.workflow_definition.model(obj)
        record = get_record_from_model(model)
        if was_already_harvested(record):
            obj.log.info('Record is already being harvested on INSPIRE.')
            return True
        # `days_ago` comes from the enclosing closure; fall back to config.
        if days_ago is None:
            _days_ago = cfg.get('INSPIRE_ACCEPTANCE_TIMEOUT', 5)
        else:
            _days_ago = days_ago
        if is_too_old(record, days_ago=_days_ago):
            obj.log.info("Record is likely rejected previously.")
            return True
    return False
def get_description(bwo):
    """Get the description column part."""
    if not isinstance(bwo.data, dict):
        return "No description found."
    model = process_record_arxiv.model(bwo)
    record = get_record_from_model(model)
    abstract = ""
    authors = []
    categories = []
    final_identifiers = []
    if hasattr(record, "get"):
        # Collect identifiers: DOIs first, then external system numbers.
        for identifier_path in ("dois.value", "external_system_numbers.value"):
            found = record.get(identifier_path, [])
            if found:
                final_identifiers.extend(found)
        # Get subject categories, adding main one first. Order matters here.
        raw_categories = (record.get("arxiv_eprints.categories", []) +
                          record.get("subject_terms.term", []))
        for entry in raw_categories:
            # Entries may be single values or nested lists; flatten both.
            if isinstance(entry, list):
                categories.extend(entry)
            else:
                categories.append(entry)
        categories = list(OrderedDict.fromkeys(categories))  # Unique only
        abstract = record.get("abstracts.value", [""])[0]
        authors = record.get("authors", [])
    return render_template(
        'workflows/styles/harvesting_record.html',
        object=bwo,
        authors=authors,
        categories=categories,
        abstract=abstract,
        identifiers=final_identifiers,
    )
def get_title(bwo):
    """Return the record titles joined, or a placeholder when unavailable."""
    if not isinstance(bwo.data, dict):
        return "No title found."
    record = get_record_from_model(process_record_arxiv.model(bwo))
    return "; ".join(record.get("titles.title", ["No title found"]))