def test_cite(self):
    """Test BibTeX citation generation for each .abs file in the test set."""
    for fname_path in abs_to_test:
        cite = arxiv_bibtex(
            AbsMetaSession.parse_abs_file(filename=fname_path))
        self.assertIsNotNone(cite)

def test_bulk_parsing(self):
    """Parse all nonempty .abs files in the test set."""
    num_files_tested = 0
    from_regex = r'(?m)From:\s+[^<]+<[^@]+@[^>]+>'
    # sanity check that the regex matches a From: line with an email
    self.assertRegex('From: J Doe <*****@*****.**>', from_regex)

    for dir_name, subdir_list, file_list in os.walk(ABS_FILES):
        for fname in file_list:
            fname_path = os.path.join(dir_name, fname)
            # skip any empty files
            if os.stat(fname_path).st_size == 0:
                continue
            if not fname_path.endswith('.abs'):
                continue
            num_files_tested += 1
            dm = AbsMetaSession.parse_abs_file(filename=fname_path)
            self.assertIsInstance(dm, DocMetadata)
            self.assertIsNotNone(dm.license)
            self.assertIsNotNone(dm.license.effective_uri,
                                 'should have an effective license URI')
            self.assertRegex(dm.raw_safe, r'(?m)From:\s+',
                             'should have a From: line')
            self.assertNotRegex(dm.raw_safe, from_regex,
                                'From: line should not contain an email address')

    # our test set should be sufficiently large
    self.assertGreater(num_files_tested, 1_000, 'comprehensive dataset')

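# A minimal sketch (illustrative only, not collected by the test runner) of
# what from_regex above distinguishes: a From: line with an email address
# matches, while a scrubbed line does not. Both sample strings are made up.
def _example_from_line_scrubbing():
    import re
    pattern = r'(?m)From:\s+[^<]+<[^@]+@[^>]+>'
    raw = 'From: J Doe <jdoe@example.com>'   # hypothetical raw From: line
    scrubbed = 'From: J Doe '                # hypothetical raw_safe form
    assert re.search(pattern, raw)           # email present: matches
    assert not re.search(pattern, scrubbed)  # email scrubbed: no match
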
def test_psi_in_abs(self):
    """Test TeX markup in abstract text, ARXIVNG-1612."""
    f1 = ABS_FILES + '/ftp/arxiv/papers/1901/1901.05426.abs'
    m = AbsMetaSession.parse_abs_file(filename=f1)
    self.assertIsInstance(m, DocMetadata)
    self.assertNotIn(
        '$ψ$', m.abstract,
        'TeX psi in abstract should not get converted to UTF8')

def test_all_abs_as_web_pages(self):
    for dir_name, subdir_list, file_list in os.walk(ABS_FILES):
        for fname in file_list:
            fname_path = os.path.join(dir_name, fname)
            if os.stat(fname_path).st_size == 0 \
                    or not fname_path.endswith('.abs'):
                continue
            m = AbsMetaSession.parse_abs_file(filename=fname_path)
            rv = self.app.get(f'/abs/{m.arxiv_id}')
            self.assertEqual(rv.status_code, 200)

def test_subsumed_category(self):
    """Test .abs file from a subsumed archive."""
    f1 = ABS_FILES + '/ftp/adap-org/papers/9303/9303001.abs'
    m = AbsMetaSession.parse_abs_file(filename=f1)
    self.assertIsInstance(m, DocMetadata)
    self.assertEqual('adap-org/9303001', m.arxiv_id, 'arxiv_id')
    self.assertTrue(m.primary_category)
    self.assertTrue(m.primary_category.canonical,
                    'subsumed category adap-org should have a canonical')

def paperid_generator(path: str, excluded: List[str]) -> Iterator[str]:
    """Yield the arXiv paper ID of each nonempty .abs file below path,
    skipping any ID in excluded."""
    for dir_name, subdir_list, file_list in os.walk(path):
        for fname in file_list:
            fname_path = os.path.join(dir_name, fname)
            logging.debug(f'looking at {fname_path}')
            if os.stat(fname_path).st_size != 0 \
                    and fname_path.endswith('.abs'):
                aid = AbsMetaSession.parse_abs_file(
                    filename=fname_path).arxiv_id
                if aid not in excluded:
                    logging.debug(f'yielding id {aid}')
                    yield aid

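# A hedged usage sketch (not part of the test suite): pull a handful of IDs
# lazily without walking the whole tree. The sample size and the empty
# exclusion list are assumptions for illustration.
def _example_paperid_sample():
    from itertools import islice
    first_ids = list(islice(paperid_generator(ABS_FILES, excluded=[]), 5))
    print(first_ids)  # a few IDs, depending on the test data
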
def test_abs_without_license_field(self):
    f1 = ABS_FILES + '/ftp/arxiv/papers/0704/0704.0001.abs'
    m = AbsMetaSession.parse_abs_file(filename=f1)
    rv = self.app.get('/abs/0704.0001')
    self.assertEqual(rv.status_code, 200)
    self.assertIsNone(m.license.recorded_uri,
                      '0704.0001 should have no license in abs')
    self.assertEqual(m.license.effective_uri, ASSUMED_LICENSE_URI,
                     '0704.0001 should get assumed license')
    assert b'http://arxiv.org/licenses/assumed-1991-2003/' in rv.data, \
        'abs/0704.0001 should be displayed with assumed-1991-2003 license'

def test_split_long_author_list(self):
    f1 = path_of_for_test(
        'data/abs_files/ftp/arxiv/papers/1411/1411.4413.abs')
    meta: DocMetadata = AbsMetaSession.parse_abs_file(filename=f1)
    alst = split_long_author_list(
        queries_for_authors(str(meta.authors)), 20)
    self.assertIs(type(alst), tuple)
    self.assertIs(len(alst), 3)
    self.assertIs(type(alst[0]), list)
    self.assertIs(type(alst[1]), list)
    self.assertGreater(len(alst[1]), 0)
    self.assertIs(type(alst[2]), int)

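# A hedged sketch of the shape these assertions rely on: assuming (as the
# tests above and below suggest) that split_long_author_list returns a
# (front_list, back_list, back_count) tuple, a caller might summarize the
# result like this. Names and the size cutoff are illustrative only.
def _example_author_list_summary(meta):
    front, back, back_count = split_long_author_list(
        queries_for_authors(str(meta.authors)), 20)
    return f'{len(front)} entries shown, {back_count} additional authors'
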
def test_abs_without_inspire(self):
    f1 = ABS_FILES + '/ftp/math/papers/0202/0202001.abs'
    m = AbsMetaSession.parse_abs_file(filename=f1)
    assert_that(m, is_not(None))
    assert_that(include_inspire_link(m), equal_to(False),
                'math/0202001 should NOT get INSPIRE link')
    rv = self.app.get('/abs/math/0202001')
    assert_that(rv.status_code, equal_to(200))
    assert_that(rv.data.decode('utf-8'),
                is_not(contains_string('INSPIRE HEP')),
                'math/0202001 should NOT get INSPIRE link')

def test_split_with_collaboration(self):
    f1 = path_of_for_test(
        'data/abs_files/ftp/arxiv/papers/0808/0808.4142.abs')
    meta: DocMetadata = AbsMetaSession.parse_abs_file(filename=f1)
    split = split_authors(str(meta.authors))
    self.assertListEqual(
        split, ['D0 Collaboration', ':', 'V. Abazov', ',', 'et al'])
    alst = queries_for_authors(str(meta.authors))
    self.assertListEqual(alst, [('D0 Collaboration', 'D0 Collaboration'),
                                ': ',
                                ('V. Abazov', 'Abazov, V'),
                                ', ',
                                'et al'])

def test_abs_with_license_field(self):
    f1 = ABS_FILES + '/ftp/arxiv/papers/0704/0704.0600.abs'
    m = AbsMetaSession.parse_abs_file(filename=f1)
    self.assertIsNotNone(m.license)
    self.assertIsNotNone(m.license.recorded_uri)
    self.assertEqual(m.license.recorded_uri, m.license.effective_uri)
    self.assertNotEqual(m.license.recorded_uri,
                        'http://arxiv.org/licenses/assumed-1991-2003/')
    rv = self.app.get('/abs/0704.0600')
    self.assertEqual(rv.status_code, 200)
    self.assertRegex(rv.data.decode('utf-8'), m.license.effective_uri,
                     'should be displayed with its license')

def paperid_iterator(path: str, excluded: List[str]) -> List[str]:
    """Return a list of paper ID strings for all .abs files below path,
    skipping any ID in excluded."""
    ids = []
    for dir_name, subdir_list, file_list in os.walk(path):
        for fname in file_list:
            fname_path = os.path.join(dir_name, fname)
            if os.stat(fname_path).st_size == 0:
                continue
            if not fname_path.endswith('.abs'):
                continue
            aid = AbsMetaSession.parse_abs_file(
                filename=fname_path).arxiv_id
            if aid not in excluded:
                ids.append(aid)
    logging.debug(f'finished getting the ids, count: {len(ids)}')
    return ids

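# An illustrative usage sketch (not a test): the eager paperid_iterator
# complements the lazy paperid_generator above when the complete ID list is
# needed up front, e.g. for random sampling. The excluded ID is made up.
def _example_paperid_exclusion():
    import random
    ids = paperid_iterator(ABS_FILES, excluded=['0704.0001'])
    return random.choice(ids) if ids else None
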
def test_abs_with_inspire(self):
    f1 = ABS_FILES + '/ftp/arxiv/papers/1108/1108.5926.abs'
    m = AbsMetaSession.parse_abs_file(filename=f1)
    assert_that(m, is_not(None))
    assert_that(get_orig_publish_date(m.arxiv_identifier),
                equal_to(date(2011, 8, 1)))
    assert_that(m.primary_category, is_not(equal_to(None)))
    assert_that(include_inspire_link(m), is_not(equal_to(False)),
                '1108.5926v1 should get INSPIRE link')
    rv = self.app.get('/abs/1108.5926v1')
    assert_that(rv.status_code, equal_to(200))
    assert_that(rv.data.decode('utf-8'), contains_string('INSPIRE HEP'),
                '1108.5926 should get INSPIRE link')

def test_split_strange_author_list(self):
    """Test odd author list that shows '0 additional authors' ARXIVNG-2083."""
    f1 = path_of_for_test(
        'data/abs_files/ftp/arxiv/papers/1902/1902.05884.abs')
    meta: DocMetadata = AbsMetaSession.parse_abs_file(filename=f1)
    alst = split_long_author_list(
        queries_for_authors(str(meta.authors)), 100)
    self.assertIs(type(alst), tuple)
    self.assertIs(len(alst), 3)
    self.assertIs(type(alst[0]), list)
    self.assertIs(type(alst[1]), list)
    self.assertIs(type(alst[2]), int)
    self.assertEqual(
        len(list(filter(lambda x: isinstance(x, tuple), alst[0]))), 101)
    self.assertEqual(len(alst[1]), 0,
                     'back list on 1902.05884 should be empty')
    self.assertEqual(alst[2], 0,
                     'back list size on 1902.05884 should be 0')

def test_same_as_classic(self):
    """Compare NG google scholar meta tags to classic results."""
    bad_data = [
        '1501.00001v1', '1501.99999v1', '1501.00002v1',
        '1502.00001v1',  # probably fake abs
        '0704.0019v2',  # title tex escaping problem
        '0704.0559v1',  # bad double escape in classic
    ]
    # '0704.0006v1', '0704.0481v1', '0704.0156v2', '0704.0019v2', '0704.0597v1'

    with open(CLASSIC_RESULTS_FILE) as fp:
        classic_results = json.load(fp)

    def to_str(gs_tag):
        return str(gs_tag['name']) + ' ' + str(gs_tag['content'])

    def to_set(gs_tags):
        return set(map(to_str, gs_tags))

    num_files_tested = 0
    for dir_name, subdir_list, file_list in os.walk(ABS_FILES):
        for fname in file_list:
            fname_path = os.path.join(dir_name, fname)
            # skip any empty files
            if os.stat(fname_path).st_size == 0:
                continue
            if not fname_path.endswith('.abs'):
                continue
            mm = AbsMetaSession.parse_abs_file(filename=fname_path)
            if mm.arxiv_id_v in bad_data:
                continue
            num_files_tested += 1
            self.assertIsInstance(mm, DocMetadata)
            with self.app.test_request_context():
                gs_tags = meta_tag_metadata(mm)
                self.assertIsInstance(gs_tags, list)

            if mm.arxiv_id_v not in classic_results:
                # Could not find google scholar tags in classic results for
                # this arxiv_id. Not a problem: this abs was probably added
                # to the test data after the classic results were generated.
                # Only add the google scholar tags to the classic metadata
                # if you'd like a regression test for it.
                continue

            classic = set(
                map(html.unescape, to_set(classic_results[mm.arxiv_id_v])))
            ng = set(map(html.unescape, to_set(gs_tags)))
            if ng != classic:
                classic_without_doi = set(
                    filter(lambda v: not v.startswith('citation_doi'),
                           classic))
                ng_without_doi = set(
                    filter(lambda v: not v.startswith('citation_doi'), ng))
                self.assertSetEqual(
                    ng_without_doi, classic_without_doi,
                    'For {} NG tags (first set) are not the same as classic '
                    'tags (second set), test num {}; DOIs are ignored.\n'
                    'classic/expected: {}\n'
                    'ng/actual: {}\n'
                    'test authors: {}\n'
                    'test title: {}'.format(
                        mm.arxiv_id_v, num_files_tested,
                        pprint.pformat(classic), pprint.pformat(ng),
                        mm.authors.raw, mm.title))

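# For reference, a hedged sketch of the shape CLASSIC_RESULTS_FILE is assumed
# to have, inferred from to_str() above: a mapping from versioned arXiv ID to
# a list of {name, content} tag dicts. The values here are made up.
_EXAMPLE_CLASSIC_RESULTS = {
    '0704.0600v1': [
        {'name': 'citation_title', 'content': 'An example title'},
        {'name': 'citation_author', 'content': 'Doe, J'},
    ]
}
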
def test_individual_files(self):
    """Test individual .abs files."""
    f1 = ABS_FILES + '/orig/arxiv/papers/0906/0906.5132v3.abs'
    ams = AbsMetaSession.parse_abs_file(filename=f1)
    self.assertIsInstance(ams, DocMetadata)
    self.assertEqual(ams.arxiv_id, '0906.5132', 'arxiv_id')
    self.assertEqual(
        ams.submitter,
        Submitter(name='Vladimir P. Mineev', email='[email protected]'))
    self.assertListEqual(ams.version_history, [
        VersionEntry(version=1,
                     raw='Date: Sun, 28 Jun 2009 11:24:35 GMT (17kb)',
                     submitted_date=datetime(2009, 6, 28, 11, 24, 35,
                                             tzinfo=tzutc()),
                     size_kilobytes=17,
                     source_type=SourceType(code='')),
        VersionEntry(version=2,
                     raw='Date (revised v2): Tue, 21 Jul '
                         '2009 09:45:44 GMT (17kb)',
                     submitted_date=datetime(2009, 7, 21, 9, 45, 44,
                                             tzinfo=tzutc()),
                     size_kilobytes=17,
                     source_type=SourceType(code='')),
        VersionEntry(version=3,
                     raw='Date (revised v3): Wed, 29 Jul '
                         '2009 11:13:43 GMT (17kb)',
                     submitted_date=datetime(2009, 7, 29, 11, 13, 43,
                                             tzinfo=tzutc()),
                     size_kilobytes=17,
                     source_type=SourceType(code=''))
    ])
    self.assertEqual(ams.version, 3)
    self.assertEqual(
        ams.title,
        'Recent developments in unconventional superconductivity theory')
    self.assertEqual(str(ams.authors), 'V.P.Mineev')
    self.assertEqual(ams.categories, 'cond-mat.supr-con cond-mat.mtrl-sci')
    self.assertEqual(ams.comments, '15 pages')
    self.assertIsNotNone(ams.license)
    self.assertEqual(
        ams.license.effective_uri,
        'http://arxiv.org/licenses/nonexclusive-distrib/1.0/')
    self.assertMultiLineEqual(
        ams.abstract,
        '''
  The review of recent developments in the unconventional superconductivity
theory is given. In the fist part I consider the physical origin of the Kerr
rotation polarization of light reflected from the surface of superconducting
$Sr_2RuO_4$. Then the comparison of magneto-optical responses in
superconductors with orbital and spin spontaneous magnetization is presented.
The latter result is applied to the estimation of the magneto-optical
properties of neutral superfluids with spontaneous magnetization. The second
part is devoted to the natural optical activity or gyrotropy properties of
noncentrosymmetric metals in their normal and superconducting states. The
temperature behavior of the gyrotropy coefficient is compared with the
temperature behavior of paramagnetic susceptibility determining the
noticeable increase of the paramagnetic limiting field in noncentrosymmetric
superconductors. In the last chapter I describe the order parameter and the
symmetry of superconducting state in the itinerant ferromagnet with
orthorhombic symmetry. Finally the Josephson coupling between two adjacent
ferromagnet superconducting domains is discussed.
''')
    for value in [ams.acm_class, ams.doi, ams.journal_ref,
                  ams.report_num, ams.proxy]:
        self.assertIsNone(value)

def test_collaboration_at_front(self):
    f1 = path_of_for_test(
        'data/abs_files/ftp/arxiv/papers/0808/0808.4142.abs')
    meta = AbsMetaSession.parse_abs_file(filename=f1)
    paflst = parse_author_affil(meta.authors.raw)
    self.assertListEqual(paflst, [['D0 Collaboration', '', ''],
                                  ['Abazov', 'V.', '']])