def arxiv_refextract(obj, eng):
    """Pull the reference list out of the arXiv PDF of a workflow object.

    The PDF is taken from ``obj.files`` when a file with the expected name
    is already attached; otherwise it is downloaded from the configured
    ``ARXIV_PDF_URL``. Extracted references are stored under
    ``obj.data["references"]``.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_clean_arXiv_id(obj.data)
    filename = secure_filename("{0}.pdf".format(arxiv_id))

    if filename in obj.files:
        pdf = obj.files[filename]
    else:
        pdf_url = current_app.config['ARXIV_PDF_URL'].format(
            arxiv_id=arxiv_id
        )
        pdf = download_file_to_workflow(
            workflow=obj,
            name=filename,
            url=pdf_url,
        )

    # Guard clause: without a usable PDF there is nothing to extract from.
    if not pdf:
        obj.log.error("Not able to download and process the PDF")
        return

    mapped_references = extract_references(pdf.file.uri)
    if not mapped_references:
        obj.log.info("No references extracted")
        return

    obj.data["references"] = mapped_references
    obj.log.info("Extracted {0} references".format(
        len(mapped_references)
    ))
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    The source tarball is taken from ``obj.files`` when already attached,
    otherwise it is downloaded from the configured ``ARXIV_TARBALL_URL``.
    Every plot returned by ``process_tarball`` is attached to ``obj.files``
    with doctype ``"Plot"`` and a description made of its zero-padded index
    followed by its concatenated captions.

    Failures (no tarball, invalid tarball, no TeX files, conversion error)
    are logged and the step returns without attaching anything.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    from wand.exceptions import DelegateError

    arxiv_id = get_clean_arXiv_id(obj.data)
    filename = secure_filename("{0}.tar.gz".format(arxiv_id))

    if filename not in obj.files:
        tarball = download_file_to_record(
            record=obj,
            name=filename,
            url=current_app.config['ARXIV_TARBALL_URL'].format(
                arxiv_id=arxiv_id))
    else:
        tarball = obj.files[filename]

    # The refextract task in this module treats a falsy download result as
    # a failed download; do the same here instead of failing on
    # ``tarball.file``.
    if not tarball:
        obj.log.error("Not able to download the tarball")
        return

    try:
        plots = process_tarball(tarball.file.uri)
    except (InvalidTarball, NoTexFilesFound):
        obj.log.error('Invalid tarball {0}'.format(tarball.file.uri))
        return
    except DelegateError as err:
        obj.log.error("Error extracting plots. Report and skip.")
        current_app.logger.exception(err)
        return

    for idx, plot in enumerate(plots):
        plot_name = plot.get('name')
        # Read the plot as bytes and close the handle right away: BytesIO
        # needs the file *content*, not the file object itself.
        with open(plot.get('url'), 'rb') as plot_fd:
            obj.files[plot_name] = BytesIO(plot_fd.read())
        obj.files[plot_name]["doctype"] = "Plot"
        obj.files[plot_name]["description"] = "{0:05d} {1}".format(
            idx, "".join(plot.get('captions', [])))

    obj.log.info("Added {0} plots.".format(len(plots)))
def arxiv_refextract(obj, eng):
    """Pull the reference list out of the arXiv PDF of a workflow object.

    The PDF is taken from ``obj.files`` when a file with the expected name
    is already attached; otherwise it is downloaded from the configured
    ``ARXIV_PDF_URL``. Extracted references are kept under
    ``obj.extra_data["references"]``.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_clean_arXiv_id(obj.data)
    filename = secure_filename("{0}.pdf".format(arxiv_id))

    if filename in obj.files:
        pdf = obj.files[filename]
    else:
        pdf_url = current_app.config['ARXIV_PDF_URL'].format(
            arxiv_id=arxiv_id
        )
        pdf = download_file_to_record(
            record=obj,
            name=filename,
            url=pdf_url,
        )

    # Guard clause: without a usable PDF there is nothing to extract from.
    if not pdf:
        obj.log.error("Not able to download and process the PDF")
        return

    mapped_references = extract_references(pdf.file.uri)
    if not mapped_references:
        obj.log.info("No references extracted")
        return

    # FIXME For now we do not add these references to the final record.
    obj.extra_data["references"] = mapped_references
    obj.log.info("Extracted {0} references".format(
        len(mapped_references)
    ))
def is_arxiv_paper(obj, *args, **kwargs):
    """Tell whether the record held by ``obj`` comes from arXiv.

    The record qualifies when a clean arXiv identifier can be obtained from
    it or when its ``arxiv_eprints.categories`` lookup is truthy.

    :param obj: object whose ``data`` attribute holds the record
    :return: ``True`` for arXiv records, ``False`` otherwise
    """
    arxiv_id = get_clean_arXiv_id(obj.data)
    categories = get_value(obj.data, 'arxiv_eprints.categories')
    return bool(arxiv_id or categories)
def match_by_arxiv_id(record):
    """Search for records carrying the same arXiv identifier.

    :param record: record to take the arXiv identifier from
    :return: the result of ``search`` on field ``035`` for the identifier,
        or an empty list when the record has no arXiv identifier
    """
    arxiv_id = get_clean_arXiv_id(record)
    if not arxiv_id:
        # Nothing to match on.
        return []

    return search('035:"{0}"'.format(arxiv_id))
def test_get_clean_arXiv_id_from_arxiv_eprints_with_oai_prefix():
    """An ``oai:arXiv.org:`` prefixed eprint value yields the bare id."""
    eprint = {'value': 'oai:arXiv.org:physics/0112006'}
    record = {'arxiv_eprints': [eprint]}

    assert get_clean_arXiv_id(record) == 'physics/0112006'
def test_get_clean_arXiv_id_from_arxiv_eprints_using_new_style():
    """An ``arxiv:`` prefixed new-style eprint value yields the bare id."""
    eprint = {'value': 'arxiv:1002.2647'}
    record = {'arxiv_eprints': [eprint]}

    assert get_clean_arXiv_id(record) == '1002.2647'
def test_get_clean_arXiv_id_from_arxiv_eprints_using_old_style():
    """An unprefixed old-style eprint value is returned unchanged."""
    eprint = {'value': 'physics/0112006'}
    record = {'arxiv_eprints': [eprint]}

    assert get_clean_arXiv_id(record) == 'physics/0112006'
def arxiv_fulltext_download(obj, eng):
    """Perform the fulltext download step for arXiv records.

    When no file with the expected PDF name is attached to the object yet,
    the PDF is downloaded from the configured ``ARXIV_PDF_URL`` and tagged
    with doctype ``"arXiv"``. An already attached PDF is left untouched.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_clean_arXiv_id(obj.data)
    filename = secure_filename("{0}.pdf".format(arxiv_id))

    if filename in obj.files:
        # Already attached, nothing to fetch.
        return

    pdf = download_file_to_record(
        record=obj,
        name=filename,
        url=current_app.config['ARXIV_PDF_URL'].format(arxiv_id=arxiv_id))

    # The refextract task in this module treats a falsy download result as
    # a failed download; tagging such a result would raise, so log and stop.
    if not pdf:
        obj.log.error("Not able to download the PDF")
        return

    pdf['doctype'] = "arXiv"
def test_get_clean_arXiv_id_from_arxiv_eprints_selects_first():
    """With several eprints, the id of the first one is returned."""
    eprints = [
        {'value': 'oai:arXiv.org:0801.4782'},
        {'value': 'oai:arXiv.org:0805.1410'},
    ]
    record = {'arxiv_eprints': eprints}

    assert get_clean_arXiv_id(record) == '0801.4782'
def arxiv_fulltext_download(obj, eng):
    """Perform the fulltext download step for arXiv records.

    The PDF is downloaded from the configured ``ARXIV_PDF_URL`` and tagged
    with doctype ``"arXiv"``, unless a file with the expected name is
    already attached to the object.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_clean_arXiv_id(obj.data)
    filename = secure_filename("{0}.pdf".format(arxiv_id))

    if filename not in obj.files:
        pdf = download_file_to_record(
            record=obj,
            name=filename,
            url=current_app.config['ARXIV_PDF_URL'].format(
                arxiv_id=arxiv_id
            )
        )
        if pdf:
            pdf['doctype'] = "arXiv"
        else:
            # Other tasks in this module check the download result for
            # truthiness before using it; without this check a failed
            # download would raise on the item assignment above.
            obj.log.error("Not able to download the PDF")
def _author_list(obj, eng):
    """Update the record with an author list found in the arXiv tarball.

    The tarball is taken from ``obj.files`` or downloaded from the
    configured ``ARXIV_TARBALL_URL``, unpacked next to itself into a
    ``<uri>_files`` directory, and its ``.xml`` members are scanned with
    ``REGEXP_AUTHLIST``. The first matching file is converted with
    ``stylesheet`` and the resulting record is merged into ``obj.data``.

    NOTE(review): ``stylesheet`` is not defined in this block; presumably it
    comes from an enclosing scope -- confirm.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    from inspirehep.modules.converter import convert

    arxiv_id = get_clean_arXiv_id(obj.data)
    filename = secure_filename("{0}.tar.gz".format(arxiv_id))

    if filename not in obj.files:
        tarball = download_file_to_workflow(
            workflow=obj,
            name=filename,
            url=current_app.config['ARXIV_TARBALL_URL'].format(
                arxiv_id=arxiv_id
            )
        )
    else:
        tarball = obj.files[filename]

    # The refextract task in this module treats a falsy download result as
    # a failed download; do the same here instead of failing on
    # ``tarball.file``.
    if not tarball:
        obj.log.error("Not able to download the tarball")
        return

    sub_dir = os.path.abspath("{0}_files".format(tarball.file.uri))
    try:
        file_list = untar(tarball.file.uri, sub_dir)
    except InvalidTarball:
        obj.log.error("Invalid tarball {0}".format(tarball.file.uri))
        return
    obj.log.info("Extracted tarball to: {0}".format(sub_dir))

    xml_files_list = [path for path in file_list if path.endswith(".xml")]
    obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

    for xml_file in xml_files_list:
        # Context manager so the descriptor is released even if read() fails.
        with open(xml_file, "r") as xml_file_fd:
            xml_content = xml_file_fd.read()

        match = REGEXP_AUTHLIST.findall(xml_content)
        if match:
            obj.log.info("Found a match for author extraction")
            authors_xml = convert(xml_content, stylesheet)
            authors_rec = create_record(authors_xml)
            authorlist_record = hep.do(authors_rec)
            obj.data.update(authorlist_record)
            # Only the first matching XML file is used.
            break
def _author_list(obj, eng):
    """Update the record with an author list found in the arXiv tarball.

    The tarball is taken from ``obj.files`` or downloaded from the
    configured ``ARXIV_TARBALL_URL``, unpacked next to itself into a
    ``<uri>_files`` directory, and its ``.xml`` members are scanned with
    ``REGEXP_AUTHLIST``. The first matching file is converted with
    ``stylesheet`` and the resulting record is merged into ``obj.data``.

    NOTE(review): ``stylesheet`` is not defined in this block; presumably it
    comes from an enclosing scope -- confirm.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    from inspirehep.modules.converter import convert

    arxiv_id = get_clean_arXiv_id(obj.data)
    filename = secure_filename("{0}.tar.gz".format(arxiv_id))

    if filename not in obj.files:
        tarball = download_file_to_record(
            record=obj,
            name=filename,
            url=current_app.config['ARXIV_TARBALL_URL'].format(
                arxiv_id=arxiv_id
            )
        )
    else:
        tarball = obj.files[filename]

    # The refextract task in this module treats a falsy download result as
    # a failed download; do the same here instead of failing on
    # ``tarball.file``.
    if not tarball:
        obj.log.error("Not able to download the tarball")
        return

    sub_dir = os.path.abspath("{0}_files".format(tarball.file.uri))
    try:
        file_list = untar(tarball.file.uri, sub_dir)
    except InvalidTarball:
        obj.log.error("Invalid tarball {0}".format(tarball.file.uri))
        return
    obj.log.info("Extracted tarball to: {0}".format(sub_dir))

    xml_files_list = [path for path in file_list if path.endswith(".xml")]
    obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

    for xml_file in xml_files_list:
        # Context manager so the descriptor is released even if read() fails.
        with open(xml_file, "r") as xml_file_fd:
            xml_content = xml_file_fd.read()

        match = REGEXP_AUTHLIST.findall(xml_content)
        if match:
            obj.log.info("Found a match for author extraction")
            authors_xml = convert(xml_content, stylesheet)
            authors_rec = create_record(authors_xml)
            authorlist_record = hep.do(authors_rec)
            obj.data.update(authorlist_record)
            # Only the first matching XML file is used.
            break
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    The source tarball is taken from ``obj.files`` when already attached,
    otherwise it is downloaded from the configured ``ARXIV_TARBALL_URL``.
    Every plot returned by ``process_tarball`` is attached to ``obj.files``
    with doctype ``"Plot"`` and a description made of its zero-padded index
    followed by its concatenated captions.

    Failures (no tarball, invalid tarball, no TeX files, conversion error)
    are logged and the step returns without attaching anything.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    from wand.exceptions import DelegateError

    arxiv_id = get_clean_arXiv_id(obj.data)
    filename = secure_filename("{0}.tar.gz".format(arxiv_id))

    if filename not in obj.files:
        tarball = download_file_to_record(
            record=obj,
            name=filename,
            url=current_app.config['ARXIV_TARBALL_URL'].format(
                arxiv_id=arxiv_id
            )
        )
    else:
        tarball = obj.files[filename]

    # The refextract task in this module treats a falsy download result as
    # a failed download; do the same here instead of failing on
    # ``tarball.file``.
    if not tarball:
        obj.log.error("Not able to download the tarball")
        return

    try:
        plots = process_tarball(tarball.file.uri)
    except (InvalidTarball, NoTexFilesFound):
        obj.log.error(
            'Invalid tarball {0}'.format(tarball.file.uri)
        )
        return
    except DelegateError as err:
        obj.log.error("Error extracting plots. Report and skip.")
        current_app.logger.exception(err)
        return

    for idx, plot in enumerate(plots):
        plot_name = plot.get('name')
        # Read the plot as bytes and close the handle right away: BytesIO
        # needs the file *content*, not the file object itself.
        with open(plot.get('url'), 'rb') as plot_fd:
            obj.files[plot_name] = BytesIO(plot_fd.read())
        obj.files[plot_name]["doctype"] = "Plot"
        obj.files[plot_name]["description"] = "{0:05d} {1}".format(
            idx, "".join(plot.get('captions', []))
        )

    obj.log.info("Added {0} plots.".format(len(plots)))
def test_get_clean_arXiv_id_returns_none_when_no_arxiv_eprints():
    """A record without ``arxiv_eprints`` has no arXiv id."""
    record = {}

    result = get_clean_arXiv_id(record)

    assert result is None
def is_arxiv_paper(obj, *args, **kwargs):
    """Check if the record is from arXiv.

    :param obj: object whose ``data`` attribute holds the record
    :return: ``True`` when the first ``arxiv_eprints.categories`` entry is
        non-empty or a clean arXiv identifier can be obtained from the
        record, ``False`` otherwise
    :rtype: bool
    """
    categories = get_value(obj.data, "arxiv_eprints.categories", [[]])
    # Check the container before indexing: an empty lookup result has no
    # first element to inspect.
    if categories and categories[0]:
        return True

    # Coerce to bool so this predicate never leaks the identifier (or None).
    return bool(get_clean_arXiv_id(obj.data))
def test_arxiv_id_getter(arxiv_record, arxiv_record_old, arxiv_record_oai):
    """Check the arXiv ID obtained from each kind of record fixture."""
    expectations = (
        ("1002.2647", arxiv_record),
        ("physics/0112006", arxiv_record_old),
        ("physics/0112006", arxiv_record_oai),
    )
    for expected_id, record in expectations:
        assert expected_id == get_clean_arXiv_id(record)

    # A record with no eprint information yields no identifier.
    assert get_clean_arXiv_id({}) is None