def populate_arxiv_document(obj, eng):
    """Attach the arXiv PDF URL as a fulltext document on the record.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_arxiv_id(obj.data)
    pdf_url = current_app.config['ARXIV_PDF_URL'].format(arxiv_id=arxiv_id)

    if not is_pdf_link(pdf_url):
        # arXiv serves an HTML placeholder page when no PDF exists.
        if NO_PDF_ON_ARXIV in requests.get(pdf_url).content:
            obj.log.info('No PDF is available for %s', arxiv_id)
            return
        raise DownloadError(
            "{url} is not serving a PDF file.".format(url=pdf_url))

    key = secure_filename('{0}.pdf'.format(arxiv_id))

    # Drop any stale document entry with the same key before re-adding it,
    # so repeated runs do not accumulate duplicates.
    kept_documents = []
    for document in obj.data.get('documents', ()):
        if document.get('key') != key:
            kept_documents.append(document)
    obj.data['documents'] = kept_documents

    builder = LiteratureBuilder(source='arxiv', record=obj.data)
    builder.add_document(
        key,
        fulltext=True,
        hidden=True,
        material='preprint',
        original_url=pdf_url,
        url=pdf_url,
    )
    obj.data = builder.record
def test_get_arxiv_id_returns_empty_string_when_no_arxiv_eprints():
    """A record without ``arxiv_eprints`` yields an empty identifier."""
    assert get_arxiv_id({}) == ''
def test_get_arxiv_id_returns_empty_string_when_arxiv_eprints_is_empty():
    """An empty ``arxiv_eprints`` list yields an empty identifier."""
    assert get_arxiv_id({'arxiv_eprints': []}) == ''
def is_arxiv_paper(obj, *args, **kwargs):
    """Check if the record is from arXiv.

    :param obj: Workflow Object whose ``data`` is inspected
    :return: ``True`` if the record has an arXiv id or arXiv categories
    """
    arxiv_id = get_arxiv_id(obj.data)
    categories = get_value(obj.data, 'arxiv_eprints.categories')
    # Idiom: collapse `if x: return True / return False` into a single
    # expression; bool() guarantees a real bool even though the operands
    # may be a string or a list.
    return bool(arxiv_id or categories)
def is_arxiv_paper(obj, *args, **kwargs):
    """Check if the record is from arXiv.

    :param obj: Workflow Object whose ``data`` is inspected
    :return: ``True`` if the record has an arXiv id or arXiv categories
    """
    arxiv_id = get_arxiv_id(obj.data)
    categories = get_value(obj.data, 'arxiv_eprints.categories')
    # Idiom: collapse `if x: return True / return False` into a single
    # expression; bool() guarantees a real bool even though the operands
    # may be a string or a list.
    return bool(arxiv_id or categories)
def match_by_arxiv_id(record):
    """Match by arXiv identifier.

    :param record: record dict possibly carrying ``arxiv_eprints``
    :return: search results for the OAI arXiv query, or an empty list when
        the record has no arXiv identifier
    """
    arxiv_id = get_arxiv_id(record)
    if arxiv_id:
        query = '035__a:oai:arXiv.org:{0}'.format(arxiv_id)
        return search(query)
    # Idiom: literal `[]` instead of `list()`.
    return []
def already_harvested(obj, eng):
    """Check if record is already harvested."""
    # Guard clause: nothing to log when Legacy is not harvesting this record.
    if not is_being_harvested_on_legacy(obj.data):
        return False
    obj.log.info(
        ('Record with arXiv id {arxiv_id} is'
         ' already being harvested on Legacy.').format(
            arxiv_id=get_arxiv_id(obj.data)))
    return True
def match_by_arxiv_id(record):
    """Match by arXiv identifier.

    :param record: record dict possibly carrying ``arxiv_eprints``
    :return: search results for the 035 identifier query, or an empty list
        when the record has no arXiv identifier
    """
    arxiv_id = get_arxiv_id(record)
    if arxiv_id:
        query = '035:"{0}"'.format(arxiv_id)
        return search(query)
    # Idiom: literal `[]` instead of `list()`.
    return []
def already_harvested(obj, eng):
    """Check if record is already harvested."""
    # Guard clause: nothing to log when Legacy is not harvesting this record.
    if not is_being_harvested_on_legacy(obj.data):
        return False
    obj.log.info((
        'Record with arXiv id {arxiv_id} is'
        ' already being harvested on Legacy.'
    ).format(arxiv_id=get_arxiv_id(obj.data)))
    return True
def _author_list(obj, eng):
    """Extract authors from author-list XML files inside the record tarball.

    Looks up ``<arxiv_id>.tar.gz`` among the workflow files, untars it into
    a scratch directory, and for every ``.xml`` member matching
    REGEXP_AUTHLIST converts it with the authorlist stylesheet; the
    collected authors replace ``obj.data['authors']``.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_arxiv_id(obj.data)
    filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
    try:
        # obj.files raises KeyError when the tarball was never downloaded.
        tarball = obj.files[filename]
    except KeyError:
        obj.log.info(
            'Skipping author list extraction, no tarball with name "%s" found'
            % filename)
        return

    with TemporaryDirectory(prefix='author_list') as scratch_space, \
            retrieve_uri(tarball.file.uri, outdir=scratch_space) as tarball_file:
        try:
            file_list = untar(tarball_file, scratch_space)
        except InvalidTarball:
            # Corrupt/unreadable archive: skip extraction, keep the workflow going.
            obj.log.info(
                'Invalid tarball %s for arxiv_id %s',
                tarball.file.uri,
                arxiv_id,
            )
            return
        obj.log.info('Extracted tarball to: {0}'.format(scratch_space))

        # Only XML members can contain an author list.
        xml_files_list = [
            path for path in file_list
            if path.endswith('.xml')
        ]
        obj.log.info('Found xmlfiles: {0}'.format(xml_files_list))

        extracted_authors = []
        for xml_file in xml_files_list:
            with open(xml_file, 'r') as xml_file_fd:
                xml_content = xml_file_fd.read()

            match = REGEXP_AUTHLIST.findall(xml_content)
            if match:
                obj.log.info('Found a match for author extraction')
                try:
                    authors_xml = convert(xml_content, stylesheet)
                except XMLSyntaxError:
                    # Probably the %auto-ignore comment exists, so we skip the
                    # first line. See: inspirehep/inspire-next/issues/2195
                    authors_xml = convert(
                        xml_content.split('\n', 1)[1],
                        stylesheet,
                    )
                # Accumulate authors from every matching XML file.
                extracted_authors.extend(
                    marcxml2record(authors_xml).get('authors', []))

        if extracted_authors:
            # Normalize LaTeX escapes in names before storing them.
            for author in extracted_authors:
                author['full_name'] = decode_latex(author['full_name'])
            obj.data['authors'] = extracted_authors
def test_get_arxiv_id_returns_first_arxiv_identifier():
    """Only the first entry of ``arxiv_eprints`` is used."""
    record = {
        'arxiv_eprints': [
            {'value': 'first arXiv identifier'},
            {'value': 'second arXiv identifier'},
        ],
    }

    assert get_arxiv_id(record) == 'first arXiv identifier'
def test_get_arxiv_id_returns_first_arxiv_identifier():
    """Only the first entry of ``arxiv_eprints`` is used."""
    record = {
        'arxiv_eprints': [
            {'value': 'first arXiv identifier'},
            {'value': 'second arXiv identifier'},
        ],
    }

    assert get_arxiv_id(record) == 'first arXiv identifier'
def _author_list(obj, eng):
    """Extract authors from author-list XML files inside the record tarball.

    Looks up ``<arxiv_id>.tar.gz`` among the workflow files, untars it into
    a scratch directory, and for every ``.xml`` member matching
    REGEXP_AUTHLIST converts it with the authorlist stylesheet; the
    collected authors replace ``obj.data['authors']``.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_arxiv_id(obj.data)
    filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
    try:
        # Bug fix: obj.files[...] raises KeyError for a missing key rather
        # than returning a falsy value, so the previous `if not tarball:`
        # guard could never fire and the task crashed when no tarball had
        # been downloaded.  Guard the lookup with try/except instead.
        tarball = obj.files[filename]
    except KeyError:
        obj.log.info(
            'Skipping author list extraction, no tarball with name "%s" found'
            % filename
        )
        return

    with TemporaryDirectory(prefix='author_list') as scratch_space, \
            retrieve_uri(tarball.file.uri, outdir=scratch_space) as tarball_file:
        try:
            file_list = untar(tarball_file, scratch_space)
        except InvalidTarball:
            # Corrupt/unreadable archive: skip extraction, keep the workflow going.
            obj.log.info(
                'Invalid tarball %s for arxiv_id %s',
                tarball.file.uri,
                arxiv_id,
            )
            return
        obj.log.info('Extracted tarball to: {0}'.format(scratch_space))

        # Only XML members can contain an author list.
        xml_files_list = [path for path in file_list if path.endswith('.xml')]
        obj.log.info('Found xmlfiles: {0}'.format(xml_files_list))

        extracted_authors = []
        for xml_file in xml_files_list:
            with open(xml_file, 'r') as xml_file_fd:
                xml_content = xml_file_fd.read()

            match = REGEXP_AUTHLIST.findall(xml_content)
            if match:
                obj.log.info('Found a match for author extraction')
                try:
                    authors_xml = convert(xml_content, stylesheet)
                except XMLSyntaxError:
                    # Probably the %auto-ignore comment exists, so we skip the
                    # first line. See: inspirehep/inspire-next/issues/2195
                    authors_xml = convert(
                        xml_content.split('\n', 1)[1],
                        stylesheet,
                    )
                extracted_authors.extend(
                    marcxml2record(authors_xml).get('authors', []))

        if extracted_authors:
            obj.data['authors'] = extracted_authors
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_arxiv_id(obj.data)
    filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
    # NOTE(review): a sibling task guards this same lookup with
    # `except KeyError` — confirm obj.files[...] can ever return a falsy
    # value here instead of raising when the tarball is absent.
    tarball = obj.files[filename]
    if tarball:
        with TemporaryDirectory(prefix='plot_extract') as scratch_space:
            try:
                plots = process_tarball(
                    tarball.file.uri,
                    output_directory=scratch_space)
            except (InvalidTarball, NoTexFilesFound):
                # Not a usable source archive: nothing to extract.
                obj.log.info(
                    'Invalid tarball %s for arxiv_id %s',
                    tarball.file.uri,
                    arxiv_id)
                return
            except DelegateError as err:
                # Plot extraction itself failed: log and skip this record.
                obj.log.error(
                    'Error extracting plots for %s. Report and skip.',
                    arxiv_id)
                current_app.logger.exception(err)
                return

            lb = LiteratureBuilder(source='arxiv', record=obj.data)
            for index, plot in enumerate(plots):
                plot_name = os.path.basename(plot.get('url'))
                files_keys = obj.files.keys
                key = plot_name
                # Disambiguate duplicate figure names with the plot index.
                if plot_name in files_keys:
                    key = '{number}_{name}'.format(
                        number=index, name=plot_name)
                # Store the extracted image in the workflow bucket.
                with open(plot.get('url')) as plot_file:
                    obj.files[key] = plot_file
                lb.add_figure(
                    key=key,
                    caption=''.join(plot.get('captions', [])),
                    label=plot.get('label'),
                    material='preprint',
                    url='/api/files/{bucket}/{key}'.format(
                        bucket=obj.files[key].bucket_id,
                        key=key))
            obj.data = lb.record
            obj.log.info('Added {0} plots.'.format(len(plots)))
def _author_list(obj, eng):
    """Extract an author list from XML files in the record's source tarball.

    The first XML member matching REGEXP_AUTHLIST is converted with the
    authorlist stylesheet and merged into ``obj.data``.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_arxiv_id(obj.data)
    filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
    # NOTE(review): a sibling variant guards this same lookup with
    # `except KeyError` — confirm obj.files[...] can ever return a falsy
    # value here instead of raising when the tarball is absent.
    tarball = obj.files[filename]
    if tarball:
        with TemporaryDirectory(prefix='author_list') as scratch_space:
            # Fetch the tarball into local scratch space before untarring.
            tarball_file = retrieve_uri(
                tarball.file.uri,
                outdir=scratch_space,
            )
            try:
                file_list = untar(tarball_file, scratch_space)
            except InvalidTarball:
                # Corrupt/unreadable archive: skip extraction.
                obj.log.info(
                    'Invalid tarball %s for arxiv_id %s',
                    tarball.file.uri,
                    arxiv_id,
                )
                return
            obj.log.info('Extracted tarball to: {0}'.format(scratch_space))
            # Only XML members can contain an author list.
            xml_files_list = [
                path for path in file_list
                if path.endswith('.xml')
            ]
            obj.log.info('Found xmlfiles: {0}'.format(xml_files_list))
            for xml_file in xml_files_list:
                with open(xml_file, 'r') as xml_file_fd:
                    xml_content = xml_file_fd.read()
                match = REGEXP_AUTHLIST.findall(xml_content)
                if match:
                    obj.log.info('Found a match for author extraction')
                    try:
                        authors_xml = convert(xml_content, stylesheet)
                    except XMLSyntaxError:
                        # Probably the %auto-ignore comment exists, so we skip the
                        # first line. See: inspirehep/inspire-next/issues/2195
                        authors_xml = convert(
                            xml_content.split('\n', 1)[1],
                            stylesheet,
                        )
                    # Merge the converted record into the workflow data and
                    # stop after the first matching XML file.
                    authorlist_record = marcxml2record(authors_xml)
                    obj.data.update(authorlist_record)
                    break
def arxiv_package_download(obj, eng):
    """Perform the package download step for arXiv records.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_arxiv_id(obj.data)
    tarball_name = secure_filename('{0}.tar.gz'.format(arxiv_id))
    tarball_url = current_app.config['ARXIV_TARBALL_URL'].format(
        arxiv_id=arxiv_id)

    tarball = download_file_to_workflow(
        workflow=obj,
        name=tarball_name,
        url=tarball_url,
    )

    # Guard clause: report the failure and bail out early.
    if not tarball:
        obj.log.error('Cannot retrieve tarball from arXiv for %s', arxiv_id)
        return
    obj.log.info('Tarball retrieved from arXiv for %s', arxiv_id)
def arxiv_package_download(obj, eng):
    """Perform the package download step for arXiv records.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_arxiv_id(obj.data)
    tarball_name = secure_filename('{0}.tar.gz'.format(arxiv_id))
    tarball_url = current_app.config['ARXIV_TARBALL_URL'].format(
        arxiv_id=arxiv_id)

    tarball = download_file_to_workflow(
        workflow=obj,
        name=tarball_name,
        url=tarball_url,
    )

    # Guard clause: report the failure and bail out early.
    if not tarball:
        obj.log.error('Cannot retrieve tarball from arXiv for %s', arxiv_id)
        return
    obj.log.info('Tarball retrieved from arXiv for %s', arxiv_id)
def test_get_arxiv_id():
    """The identifier comes from the first ``arxiv_eprints`` value."""
    schema = load_schema('hep')
    subschema = schema['properties']['arxiv_eprints']

    record = {
        'arxiv_eprints': [
            {
                'categories': ['hep-th', 'hep-ph'],
                'value': '1612.08928',
            },
        ],
    }
    assert validate(record['arxiv_eprints'], subschema) is None

    assert get_arxiv_id(record) == '1612.08928'
def _get_preprint_context(record):
    """Collect the template variables for rendering a preprint."""
    abstract = get_abstract(record)
    try:
        language_of_abstract = detect(abstract)
    except LangDetectException:
        # Undetectable (e.g. empty) abstracts get no language tag.
        language_of_abstract = ''

    context = {
        'abstract': abstract,
        'abstract_language': language_of_abstract,
    }
    context.update(
        arxiv_id=get_arxiv_id(record),
        authors=get_authors(record),
        collaborations=get_collaborations(record),
        divulgation=get_divulgation(record),
        domains=get_domains(record),
        inspire_id=get_inspire_id(record),
        keywords=get_keywords(record),
        language=get_language(record),
        subtitle=get_subtitle(record),
        title=get_title(record),
    )
    return context
def _get_comm_context(record):
    """Collect the template variables for rendering a communication."""
    abstract = get_abstract(record)
    try:
        language_of_abstract = detect(abstract)
    except LangDetectException:
        # Undetectable (e.g. empty) abstracts get no language tag.
        language_of_abstract = ''

    # Conference metadata is read from the linked conference record.
    conference_record = get_conference_record(record)

    context = {
        'abstract': abstract,
        'abstract_language': language_of_abstract,
        'arxiv_id': get_arxiv_id(record),
        'authors': get_authors(record),
        'collaborations': get_collaborations(record),
        'conference_city': get_conference_city(conference_record),
        'conference_country': get_conference_country(conference_record),
        'conference_end_date': get_conference_end_date(conference_record),
        'conference_start_date': get_conference_start_date(conference_record),
        'conference_title': get_conference_title(conference_record),
        'divulgation': get_divulgation(record),
        'doi': get_doi(record),
        'domains': get_domains(record),
        'inspire_id': get_inspire_id(record),
        'journal_issue': get_journal_issue(record),
        'journal_title': get_journal_title(record),
        'journal_volume': get_journal_volume(record),
        'keywords': get_keywords(record),
        'language': get_language(record),
        'page_artid': get_page_artid(record),
        'peer_reviewed': get_peer_reviewed(record),
        'publication_date': get_publication_date(record),
        'subtitle': get_subtitle(record),
        'title': get_title(record),
    }
    return context
def arxiv_fulltext_download(obj, eng):
    """Perform the fulltext download step for arXiv records.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_arxiv_id(obj.data)
    filename = secure_filename('{0}.pdf'.format(arxiv_id))
    url = current_app.config['ARXIV_PDF_URL'].format(arxiv_id=arxiv_id)

    if not is_pdf_link(url):
        # arXiv serves an HTML placeholder page when no PDF exists.
        if NO_PDF_ON_ARXIV in requests.get(url).content:
            obj.log.info('No PDF is available for %s', arxiv_id)
            return
        # Reachable URL but neither a PDF nor the "no PDF" page: fail loudly.
        raise DownloadError("{url} is not serving a PDF file.".format(url=url))

    pdf = download_file_to_workflow(
        workflow=obj,
        name=filename,
        url=url,
    )

    if pdf:
        # Drop any previously attached document with the same key before
        # re-adding it, so repeated runs do not duplicate entries.
        obj.data['documents'] = [
            document for document in obj.data.get('documents', ())
            if document.get('key') != filename
        ]
        lb = LiteratureBuilder(source='arxiv', record=obj.data)
        lb.add_document(
            filename,
            fulltext=True,
            hidden=True,
            material='preprint',
            original_url=url,
            # Serve the stored copy from the workflow bucket, not arXiv.
            url='/api/files/{bucket}/{key}'.format(
                bucket=obj.files[filename].bucket_id,
                key=filename))
        obj.data = lb.record
        obj.log.info('PDF retrieved from arXiv for %s', arxiv_id)
    else:
        obj.log.error('Cannot retrieve PDF from arXiv for %s', arxiv_id)
def _author_list(obj, eng):
    """Extract an author list from XML files in the record's source tarball.

    The first XML member matching REGEXP_AUTHLIST is converted with the
    authorlist stylesheet, turned into a record via ``create_record`` +
    ``hep.do``, and merged into ``obj.data``.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_arxiv_id(obj.data)
    filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
    # NOTE(review): a sibling variant guards this same lookup with
    # `except KeyError` — confirm obj.files[...] can ever return a falsy
    # value here instead of raising when the tarball is absent.
    tarball = obj.files[filename]
    if tarball:
        # NOTE(review): extraction directory sits next to the stored file and
        # is never cleaned up here — presumably removed elsewhere; verify.
        sub_dir = os.path.abspath('{0}_files'.format(tarball.file.uri))
        try:
            file_list = untar(tarball.file.uri, sub_dir)
        except InvalidTarball:
            # Corrupt/unreadable archive: skip extraction.
            obj.log.error('Invalid tarball %s for arxiv_id %s',
                          tarball.file.uri, arxiv_id)
            return
        obj.log.info('Extracted tarball to: {0}'.format(sub_dir))
        # Only XML members can contain an author list.
        xml_files_list = [path for path in file_list if path.endswith('.xml')]
        obj.log.info('Found xmlfiles: {0}'.format(xml_files_list))
        for xml_file in xml_files_list:
            with open(xml_file, 'r') as xml_file_fd:
                xml_content = xml_file_fd.read()
            match = REGEXP_AUTHLIST.findall(xml_content)
            if match:
                obj.log.info('Found a match for author extraction')
                try:
                    authors_xml = convert(xml_content, stylesheet)
                except XMLSyntaxError:
                    # Probably the %auto-ignore comment exists, so we skip the
                    # first line. See: inspirehep/inspire-next/issues/2195
                    authors_xml = convert(
                        xml_content.split('\n', 1)[1],
                        stylesheet,
                    )
                # Convert MARCXML to a HEP record and merge it into the
                # workflow data; stop after the first matching XML file.
                authors_rec = create_record(authors_xml)
                authorlist_record = hep.do(authors_rec)
                obj.data.update(authorlist_record)
                break
def _get_art_context(record):
    """Collect the template variables for rendering an article.

    :param record: the record to render
    :return: dict of template variables
    """
    abstract = get_abstract(record)
    # Robustness/consistency fix: langdetect raises LangDetectException on
    # empty or undetectable text.  The other context builders fall back to
    # an empty language tag instead of crashing, so do the same here.
    try:
        abstract_language = langdetect.detect(abstract)
    except langdetect.lang_detect_exception.LangDetectException:
        abstract_language = ''
    return {
        'abstract': abstract,
        'abstract_language': abstract_language,
        'arxiv_id': get_arxiv_id(record),
        'authors': get_authors(record),
        'collaborations': get_collaborations(record),
        'divulgation': get_divulgation(record),
        'doi': get_doi(record),
        'domain': get_domain(record),
        'inspire_id': get_inspire_id(record),
        'journal_issue': get_journal_issue(record),
        'journal_title': get_journal_title(record),
        'journal_volume': get_journal_volume(record),
        'language': get_language(record),
        'page_artid': get_page_artid(record),
        'peer_reviewed': get_peer_reviewed(record),
        'publication_date': get_publication_date(record),
        'title': get_title(record),
    }
def arxiv_plot_extract(obj, eng):
    """Extract plots from an arXiv archive.

    :param obj: Workflow Object to process
    :param eng: Workflow Engine processing the object
    """
    arxiv_id = get_arxiv_id(obj.data)
    filename = secure_filename('{0}.tar.gz'.format(arxiv_id))
    # NOTE(review): a sibling task guards this same lookup with
    # `except KeyError` — confirm obj.files[...] can ever return a falsy
    # value here instead of raising when the tarball is absent.
    tarball = obj.files[filename]

    if tarball:
        with TemporaryDirectory(prefix='plot_extract') as scratch_space, \
                retrieve_uri(tarball.file.uri, outdir=scratch_space) as tarball_file:
            try:
                plots = process_tarball(
                    tarball_file,
                    output_directory=scratch_space,
                )
            except (InvalidTarball, NoTexFilesFound):
                # Not a usable source archive: nothing to extract.
                obj.log.info(
                    'Invalid tarball %s for arxiv_id %s',
                    tarball.file.uri,
                    arxiv_id,
                )
                return
            except DelegateError as err:
                # Plot extraction itself failed: log and skip this record.
                obj.log.error(
                    'Error extracting plots for %s. Report and skip.',
                    arxiv_id,
                )
                current_app.logger.exception(err)
                return

            # Remove figures (and their files) from a previous run so the
            # new extraction fully replaces them.
            if 'figures' in obj.data:
                for figure in obj.data['figures']:
                    if figure['key'] in obj.files:
                        del obj.files[figure['key']]
                del obj.data['figures']

            lb = LiteratureBuilder(source='arxiv', record=obj.data)
            for index, plot in enumerate(plots):
                plot_name = os.path.basename(plot.get('url'))
                key = plot_name
                # Disambiguate duplicate figure names with the plot index.
                if plot_name in obj.files.keys:
                    key = 'w{number}_{name}'.format(
                        number=index,
                        name=plot_name,
                    )
                # Store the extracted image in the workflow bucket.
                with open(plot.get('url')) as plot_file:
                    obj.files[key] = plot_file
                lb.add_figure(
                    key=key,
                    caption=''.join(plot.get('captions', [])),
                    label=plot.get('label'),
                    material='preprint',
                    url='/api/files/{bucket}/{key}'.format(
                        bucket=obj.files[key].bucket_id,
                        key=key,
                    )
                )
            obj.data = lb.record
            obj.log.info('Added {0} plots.'.format(len(plots)))