def already_harvested_on_legacy_record():
    """Build a HEP JSON record from the 'already on legacy' arXiv fixture.

    The packaged OAI arXiv XML fixture is transformed with the
    ``oaiarXiv2marcxml.xsl`` stylesheet, parsed by ``create_record`` and
    then passed through ``hep.do``.

    Returns:
        Whatever ``hep.do`` produces for the converted fixture.
    """
    fixture_path = os.path.join(
        '../fixtures', 'oai_arxiv_record_already_on_legacy.xml')
    oai_xml = pkg_resources.resource_string(__name__, fixture_path)

    # OAI arXiv XML -> MARCXML -> parsed record -> HEP JSON.
    marcxml = convert(oai_xml, "oaiarXiv2marcxml.xsl")
    return hep.do(create_record(marcxml))
def _author_list(obj, eng):
    """Pull author-list metadata out of the arXiv tarball attached to ``obj``.

    The tarball named after the record's arXiv id is looked up in
    ``obj.files`` and unpacked next to itself. The first extracted ``.xml``
    file matching ``REGEXP_AUTHLIST`` is converted with ``stylesheet`` (a
    name resolved from the surrounding scope, not defined in this
    function) and the resulting record is merged into ``obj.data``.

    Args:
        obj: workflow object exposing ``data``, ``files`` and ``log``.
        eng: workflow engine; unused here.

    Returns:
        None. Exits early when the tarball entry is falsy or cannot be
        unpacked.
    """
    arxiv_id = get_arxiv_id(obj.data)
    tarball = obj.files[secure_filename('{0}.tar.gz'.format(arxiv_id))]
    if not tarball:
        return

    tarball_uri = tarball.file.uri
    extract_dir = os.path.abspath('{0}_files'.format(tarball_uri))
    try:
        extracted = untar(tarball_uri, extract_dir)
    except InvalidTarball:
        obj.log.error(
            'Invalid tarball %s for arxiv_id %s',
            tarball_uri,
            arxiv_id,
        )
        return
    obj.log.info('Extracted tarball to: {0}'.format(extract_dir))

    xml_paths = [name for name in extracted if name.endswith('.xml')]
    obj.log.info('Found xmlfiles: {0}'.format(xml_paths))

    for xml_path in xml_paths:
        with open(xml_path, 'r') as handle:
            content = handle.read()

        if not REGEXP_AUTHLIST.findall(content):
            continue

        obj.log.info('Found a match for author extraction')
        try:
            converted = convert(content, stylesheet)
        except XMLSyntaxError:
            # Probably the %auto-ignore comment exists, so we skip the
            # first line. See: inspirehep/inspire-next/issues/2195
            remainder = content.split('\n', 1)[1]
            converted = convert(remainder, stylesheet)

        obj.data.update(hep.do(create_record(converted)))
        # Only the first matching file is used.
        break
def generate_record():
    """Build a HEP JSON record from the 'with plots' arXiv fixture.

    The packaged OAI arXiv XML fixture is transformed with the
    ``oaiarXiv2marcxml.xsl`` stylesheet, parsed by ``create_record`` and
    passed through ``hep.do``. When the result carries a
    ``preprint_date`` key, its value is replaced by today's date in ISO
    format.

    Returns:
        The (possibly date-adjusted) value produced by ``hep.do``.
    """
    fixture_path = os.path.join(
        '../fixtures', 'oai_arxiv_record_with_plots.xml')
    oai_xml = pkg_resources.resource_string(__name__, fixture_path)

    # OAI arXiv XML -> MARCXML -> parsed record -> HEP JSON.
    marcxml = convert(oai_xml, "oaiarXiv2marcxml.xsl")
    record = hep.do(create_record(marcxml))

    if 'preprint_date' not in record:
        return record

    record['preprint_date'] = datetime.date.today().isoformat()
    return record
def _author_list(obj, eng):
    """Extract author-list metadata from the arXiv tarball into ``obj.data``.

    The tarball named after the record's arXiv id is taken from
    ``obj.files``; if it is not attached yet it is fetched with
    ``download_file_to_record`` from the ``ARXIV_TARBALL_URL`` config
    template. The archive is unpacked next to itself, and the first
    extracted ``.xml`` file matching ``REGEXP_AUTHLIST`` is converted with
    ``stylesheet`` (a name resolved from the surrounding scope, not
    defined in this function) and merged into ``obj.data``.

    Args:
        obj: workflow object exposing ``data``, ``files`` and ``log``.
        eng: workflow engine; unused here.

    Returns:
        None. Exits early, after logging an error, when the tarball
        cannot be unpacked.
    """
    from inspirehep.modules.converter import convert

    arxiv_id = get_clean_arXiv_id(obj.data)
    filename = secure_filename("{0}.tar.gz".format(arxiv_id))
    if filename not in obj.files:
        tarball = download_file_to_record(
            record=obj,
            name=filename,
            url=current_app.config['ARXIV_TARBALL_URL'].format(
                arxiv_id=arxiv_id
            )
        )
    else:
        tarball = obj.files[filename]

    sub_dir = os.path.abspath("{0}_files".format(tarball.file.uri))
    try:
        file_list = untar(tarball.file.uri, sub_dir)
    except InvalidTarball:
        obj.log.error("Invalid tarball {0}".format(tarball.file.uri))
        return
    obj.log.info("Extracted tarball to: {0}".format(sub_dir))

    xml_files_list = [path for path in file_list if path.endswith(".xml")]
    obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

    for xml_file in xml_files_list:
        # The context manager closes the file even if read() raises,
        # which the previous open()/read()/close() sequence did not.
        with open(xml_file, "r") as xml_file_fd:
            xml_content = xml_file_fd.read()

        match = REGEXP_AUTHLIST.findall(xml_content)
        if match:
            obj.log.info("Found a match for author extraction")
            authors_xml = convert(xml_content, stylesheet)
            authors_rec = create_record(authors_xml)
            authorlist_record = hep.do(authors_rec)
            obj.data.update(authorlist_record)
            # Only the first matching file is used.
            break
def _author_list(obj, eng):
    """Extract author-list metadata from the arXiv tarball into ``obj.data``.

    The tarball named after the record's arXiv id is taken from
    ``obj.files``; if it is not attached yet it is fetched with
    ``download_file_to_workflow`` from the ``ARXIV_TARBALL_URL`` config
    template. The archive is unpacked next to itself, and the first
    extracted ``.xml`` file matching ``REGEXP_AUTHLIST`` is converted with
    ``stylesheet`` (a name resolved from the surrounding scope, not
    defined in this function) and merged into ``obj.data``.

    Args:
        obj: workflow object exposing ``data``, ``files`` and ``log``.
        eng: workflow engine; unused here.

    Returns:
        None. Exits early, after logging an error, when the tarball
        cannot be unpacked.
    """
    from inspirehep.modules.converter import convert

    arxiv_id = get_clean_arXiv_id(obj.data)
    filename = secure_filename("{0}.tar.gz".format(arxiv_id))
    if filename not in obj.files:
        tarball = download_file_to_workflow(
            workflow=obj,
            name=filename,
            url=current_app.config['ARXIV_TARBALL_URL'].format(
                arxiv_id=arxiv_id
            )
        )
    else:
        tarball = obj.files[filename]

    sub_dir = os.path.abspath("{0}_files".format(tarball.file.uri))
    try:
        file_list = untar(tarball.file.uri, sub_dir)
    except InvalidTarball:
        obj.log.error("Invalid tarball {0}".format(tarball.file.uri))
        return
    obj.log.info("Extracted tarball to: {0}".format(sub_dir))

    xml_files_list = [path for path in file_list if path.endswith(".xml")]
    obj.log.info("Found xmlfiles: {0}".format(xml_files_list))

    for xml_file in xml_files_list:
        # The context manager closes the file even if read() raises,
        # which the previous open()/read()/close() sequence did not.
        with open(xml_file, "r") as xml_file_fd:
            xml_content = xml_file_fd.read()

        match = REGEXP_AUTHLIST.findall(xml_content)
        if match:
            obj.log.info("Found a match for author extraction")
            authors_xml = convert(xml_content, stylesheet)
            authors_rec = create_record(authors_xml)
            authorlist_record = hep.do(authors_rec)
            obj.data.update(authorlist_record)
            # Only the first matching file is used.
            break
def test_xslt(oai_xml, oai_xml_result):
    """Check that the OAI arXiv XSLT conversion yields the expected output."""
    converted = convert(xml=oai_xml, xslt_filename="oaiarXiv2marcxml.xsl")

    # The output must be non-empty and equal to the expected result.
    assert converted
    assert converted == oai_xml_result