def _match_with_invenio_matcher(obj, eng):
    """Look for existing records matching this one and store the hits.

    Uses the closed-over ``queries``/``index`` (or default DOI and arXiv-id
    exact-match queries) and records every hit's recid, source dump and
    score in ``obj.extra_data["record_matches"]``.

    Returns True when at least one match was found.
    """
    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)

    if queries is None:
        # Default: exact match on DOI or arXiv identifier.
        search_queries = [
            {'type': 'exact', 'match': 'dois.value'},
            {'type': 'exact', 'match': 'arxiv_eprints.value'}
        ]
    else:
        search_queries = queries

    matches = {
        "recids": [],
        "records": [],
        "base_url": os.path.join(
            cfg["CFG_SITE_URL"],
            'record'
        )
    }
    hits = _match(record, queries=search_queries, index=index,
                  doc_type='record')
    for hit in hits:
        matches['recids'].append(hit.record.get('control_number'))
        matches['records'].append({
            "source": hit.record.dumps(),
            "score": hit.score
        })

    obj.extra_data["record_matches"] = matches
    return bool(matches['recids'])
def _get_record(obj, model):
    """Return the record attached to ``obj`` as a plain dict, or ``{}``."""
    from inspirehep.utils.helpers import get_record_from_model

    record = get_record_from_model(model(obj))
    return record.dumps() if record else {}
def _author_list(obj, eng):
    """Extract an author list from the arXiv source tarball, if present.

    Downloads the tarball when it is not already attached to the model,
    untars it and scans the contained ``.xml`` files for an author-list
    document (matched via ``REGEXP_AUTHLIST``).  On the first match, the
    XML is converted with the closed-over ``stylesheet``, merged into the
    record, and the authors / number_of_authors task results are updated.
    """
    from inspirehep.modules.converter.xslt import convert

    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    arxiv_id = get_arxiv_id_from_record(record)
    existing_file = get_file_by_name(model, "{0}.tar.gz".format(arxiv_id))
    if not existing_file:
        # We download it
        tarball = get_tarball_for_model(eng, arxiv_id)
        if tarball is None:
            obj.log.error("No tarball found")
            return
        add_file_by_name(model, tarball)
    else:
        tarball = existing_file.get_syspath()
    sub_dir = os.path.abspath("{0}_files".format(tarball))
    try:
        file_list = untar(tarball, sub_dir)
    except InvalidTarball:
        obj.log.error("Invalid tarball {0}".format(tarball))
        return
    obj.log.info("Extracted tarball to: {0}".format(sub_dir))

    xml_files_list = [filename for filename in file_list
                      if filename.endswith(".xml")]
    obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

    for xml_file in xml_files_list:
        # Use a context manager so the handle is closed even if read()
        # raises (the original open/read/close leaked it on exceptions).
        with open(xml_file, "r") as xml_file_fd:
            xml_content = xml_file_fd.read()

        match = REGEXP_AUTHLIST.findall(xml_content)
        if match:
            obj.log.info("Found a match for author extraction")
            authors_xml = convert(xml_content, stylesheet)
            authorlist_record = get_json_from_marcxml(authors_xml)[0]
            record.update(authorlist_record)
            obj.update_task_results(
                "authors",
                [{
                    "name": "authors",
                    "results": authorlist_record["authors"]
                }]
            )
            obj.update_task_results(
                "number_of_authors",
                [{
                    "name": "number_of_authors",
                    "results": authorlist_record["number_of_authors"]
                }]
            )
            break
    model.update()
def already_harvested(obj, eng):
    """Check if record is already harvested."""
    if not cfg.get("PRODUCTION_MODE"):
        # Outside production we never treat anything as harvested.
        return False
    record = get_record_from_model(eng.workflow_definition.model(obj))
    if not was_already_harvested(record):
        return False
    obj.log.info("Record is already being harvested on INSPIRE.")
    return True
def _classify_paper(obj, eng):
    """Run the keyword classifier over the record's PDF or metadata text.

    Prefers the attached PDF (slow path); otherwise falls back to the
    record's titles + abstracts (fast path).  The classifier output is
    cleaned and stored as the "classification" task result.  All tuning
    knobs (``taxonomy``, ``fast_mode``, ``output_limit``, …) are
    closed-over variables from the enclosing factory.
    """
    from invenio_classifier.errors import TaxonomyError

    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    data = None
    is_fast_mode = fast_mode

    if not is_fast_mode:
        if "pdf" in obj.extra_data:
            # Getting path to PDF file
            data = obj.extra_data["pdf"]
            callback = get_keywords_from_local_file
    if not data:
        # BUG FIX: the original did ``[title].extend(abstracts)``, but
        # list.extend() returns None, so ``data`` was always None here and
        # classification silently aborted as "missing data".  Build the
        # text list with concatenation instead.
        data = ([record.get("titles.title", "")] +
                list(record.get("abstracts.value", [])))
        callback = get_keywords_from_text
        is_fast_mode = True

    if not data:
        obj.log.error("No classification done due to missing data.")
        return

    try:
        result = callback(data, taxonomy,
                          output_mode='dict',
                          output_limit=output_limit,
                          spires=spires,
                          match_mode=match_mode,
                          no_cache=no_cache,
                          with_author_keywords=with_author_keywords,
                          rebuild_cache=rebuild_cache,
                          only_core_tags=only_core_tags,
                          extract_acronyms=extract_acronyms)
    except TaxonomyError as e:
        obj.log.exception(e)
        return

    clean_instances_from_data(result.get("complete_output", {}))

    final_result = {"dict": result}
    final_result["fast_mode"] = fast_mode
    # Check if it is not empty output before adding
    output = result.get("complete_output", {}).values()
    if not any(output):
        final_result["dict"] = {}
    name = "classification"
    obj.update_task_results(
        name,
        [{
            "name": name,
            "result": final_result,
            "template": "workflows/results/classifier.html"
        }]
    )
def add_core(obj, eng):
    """Add CORE collection tag to collections."""
    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    collections = record.get("collections", [])
    # Do not add it again if already there
    for entry in collections:
        if any(value.lower() == "core" for value in entry.values()):
            return
    collections.append({"primary": "CORE"})
    record["collections"] = collections
    model.update()
def _arxiv_fulltext_download(obj, eng):
    """Attach the arXiv PDF to the record as an FFT entry.

    Reuses an already-attached ``<arxiv_id>.pdf`` file when present,
    otherwise downloads it.  On success the path is appended to the
    record's ``fft`` list (created if missing), a file task result is
    recorded and the model is persisted.
    """
    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    arxiv_id = get_arxiv_id_from_record(record)
    existing_file = get_file_by_name(model, "{0}.pdf".format(arxiv_id))
    if existing_file:
        pdf = existing_file.get_syspath()
    else:
        # Not attached yet: fetch it from arXiv first.
        pdf = get_pdf_for_model(eng, arxiv_id)
        if pdf is None:
            obj.log.error("No pdf found")
            return
        pdf = add_file_by_name(model, pdf)
        obj.extra_data["pdf"] = pdf

    if not pdf:
        obj.log.info("No PDF found.")
        return

    fft_entry = {
        "url": pdf,
        "docfile_type": doctype
    }
    if "fft" in record:
        record["fft"].append(fft_entry)
    else:
        record.update({"fft": [fft_entry]})

    fileinfo = {
        "type": "fulltext",
        "filename": os.path.basename(pdf),
        "full_path": pdf,
    }
    obj.update_task_results(
        os.path.basename(pdf),
        [{
            "name": "PDF",
            "result": fileinfo,
            "template": "workflows/results/files.html"
        }]
    )
    model.update()
def _previously_rejected(obj, eng):
    """Return True when the record looks too old, i.e. likely rejected before.

    Only active in production; the age threshold comes from the
    closed-over ``days_ago`` or, failing that, the
    ``INSPIRE_ACCEPTANCE_TIMEOUT`` config (default 5 days).
    """
    if not cfg.get("PRODUCTION_MODE"):
        return False
    record = get_record_from_model(eng.workflow_definition.model(obj))
    if days_ago is None:
        threshold = cfg.get("INSPIRE_ACCEPTANCE_TIMEOUT", 5)
    else:
        threshold = days_ago
    if is_too_old(record, days_ago=threshold):
        obj.log.info("Record is likely rejected previously.")
        return True
    return False
def get_record(cls, obj, **kwargs):
    """Return a dictionary-like object representing the current object.

    This object will be used for indexing and be the basis for display
    in Holding Pen.
    """
    if not hasattr(obj, "data"):
        obj.data = obj.get_data()
    # Text payloads carry no structured record to index.
    if isinstance(obj.data, six.text_type):
        return {}
    record = get_record_from_model(cls.model(obj))
    return record.dumps() if record else {}
def get_title(cls, obj, **kwargs):
    """Return the value to put in the title column of Holding Pen."""
    if not hasattr(obj, "data"):
        obj.data = obj.get_data()
    if isinstance(obj.data, dict):
        try:
            model = cls.model(obj)
        except InvalidDepositionType:
            return "This submission is disabled: {0}.".format(obj.workflow.name)
        record = get_record_from_model(model)
        if record:
            # Materialize the filter: on Python 3 filter() returns a lazy
            # iterator, so the original ``filter(...)[0]`` would raise
            # TypeError.  This file targets both 2 and 3 (it uses six).
            titles = list(filter(None, record.get("titles.title", [])))
            if titles:
                # Show first title that evaluates to True
                return titles[0]
    return "No title available"
def formatter(cls, obj, **kwargs):
    """Nicely format the record."""
    try:
        record = get_record_from_model(cls.model(obj))
    except TypeError as err:
        return "Error: {0}".format(err)
    if not record:
        return ""
    output_format = kwargs.get('of')
    if output_format:
        # format_record needs a recid key, even if empty.
        if "recid" not in record:
            record['recid'] = None
        return format_record(record=record, of=output_format)
    return render_template(
        'format/record/Holding_Pen_HTML_detailed.tpl',
        record=record
    )
def match(obj, eng):
    """Return True if the record already exists in INSPIRE.

    Searches by arXiv identifier and DOI, updates extra_data with the
    first id returned by the search.
    """
    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    found = set(match_by_arxiv_id(record))
    found |= set(match_by_doi(record))
    matches = {
        "recids": [str(recid) for recid in found],
        "records": [],
        "base_url": os.path.join(
            cfg["CFG_ROBOTUPLOAD_SUBMISSION_BASEURL"], "record"),
    }
    obj.extra_data["record_matches"] = matches
    return bool(matches["recids"])
def arxiv_refextract(obj, eng):
    """Perform the reference extraction step.

    :param obj: Bibworkflow Object to process
    :param eng: BibWorkflowEngine processing the object
    """
    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    arxiv_id = get_arxiv_id_from_record(record)
    existing_file = get_file_by_name(model, "{0}.pdf".format(arxiv_id))
    if existing_file:
        pdf = existing_file.get_syspath()
    else:
        # We download it
        pdf = get_pdf_for_model(eng, arxiv_id)
        if pdf is None:
            obj.log.error("No pdf found")
            return
        add_file_by_name(model, pdf)

    if not (pdf and os.path.isfile(pdf)):
        obj.log.error("Not able to download and process the PDF")
        return

    mapped_references = extract_references(pdf)
    if not mapped_references:
        obj.log.info("No references extracted")
        return

    record["references"] = mapped_references
    obj.log.info("Extracted {0} references".format(
        len(mapped_references)
    ))
    obj.update_task_results(
        "References",
        [{"name": "References",
          "result": mapped_references,
          "template": "workflows/results/refextract.html"}]
    )
    model.update()
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    Reuses an already-attached ``<arxiv_id>.tar.gz`` when present,
    otherwise downloads it.  Extracted plots are merged into the record
    as ``fft`` entries and reported through the "Plots" task result.
    """
    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    arxiv_id = get_arxiv_id_from_record(record)
    existing_file = get_file_by_name(model, "{0}.tar.gz".format(arxiv_id))
    if not existing_file:
        # We download it
        tarball = get_tarball_for_model(eng, arxiv_id)
        if tarball is None:
            obj.log.error("No tarball found")
            return
        add_file_by_name(model, tarball)
    else:
        tarball = existing_file.get_syspath()
    try:
        plots = process_tarball(tarball)
    except InvalidTarball:
        # Consistency fix: every other task in this module logs errors on
        # obj.log; the original inconsistently used eng.log here.
        obj.log.error(
            'Invalid tarball {0}'.format(tarball)
        )
        return

    if plots:
        # We store the path to the directory the tarball contents lives
        new_dict = get_json_for_plots(plots)
        record.update(new_dict)
        obj.update_task_results(
            "Plots",
            [{
                "name": "Plots",
                "result": new_dict["fft"],
                "template": "workflows/results/plots.html"
            }]
        )
        obj.log.info("Added {0} plots.".format(len(new_dict["fft"])))
        model.update()
def extract_journal_info(obj, eng):
    """Extract journal, volume etc. from any freetext publication info."""
    model = eng.workflow_definition.model(obj)
    record = get_record_from_model(model)
    publication_info = record.get("publication_info")
    if not publication_info:
        return

    # refextract result key -> pubnote key to copy over.
    field_map = (
        ("volume", "journal_volume"),
        ("title", "journal_title"),
        ("year", "year"),
        ("page", "page_artid"),
    )
    updated_pubnotes = []
    for pubnote in publication_info:
        freetext = pubnote.get("pubinfo_freetext")
        if freetext:
            extracted = extract_journal_reference(
                freetext,
                override_kbs_files={
                    'journals': get_mappings_from_kbname(
                        cfg['REFEXTRACT_KB_NAME'])
                }
            )
            if extracted:
                for source_key, target_key in field_map:
                    if source_key in extracted:
                        pubnote[target_key] = extracted.get(source_key)
        updated_pubnotes.append(pubnote)
    record["publication_info"] = updated_pubnotes
    model.update()
def get_description(bwo):
    """Get the description column part."""
    if not isinstance(bwo.data, dict):
        return "No description found."
    model = hep_ingestion.model(bwo)
    record = get_record_from_model(model)
    abstract = ""
    authors = []
    categories = []
    final_identifiers = []
    if hasattr(record, "get"):
        # Get identifiers
        for id_field in ("dois.value", "external_system_numbers.value"):
            identifiers = record.get(id_field, [])
            if identifiers:
                final_identifiers.extend(identifiers)

        # Get subject categories, adding main one first. Order matters here.
        record_categories = (record.get("arxiv_eprints.categories", []) +
                             record.get("subject_terms.term", []))
        for category_entry in record_categories:
            if isinstance(category_entry, list):
                categories.extend(category_entry)
            else:
                categories.append(category_entry)
        categories = list(OrderedDict.fromkeys(categories))  # Unique only

        abstract = record.get("abstracts.value", [""])[0]
        authors = record.get("authors", [])
    return render_template('workflows/styles/harvesting_record.html',
                           object=bwo,
                           authors=authors,
                           categories=categories,
                           abstract=abstract,
                           identifiers=final_identifiers)