def test_cite(self):
    """Test BibTeX citation generation for each .abs file in the test set."""
    for fname_path in abs_to_test:
        cite = arxiv_bibtex(
            AbsMetaSession.parse_abs_file(filename=fname_path))
        self.assertIsNotNone(cite)

def test_bulk_parsing(self):
    """Parse all nonempty .abs files in the test set."""
    num_files_tested = 0
    from_regex = r'(?m)From:\s+[^<]+<[^@]+@[^>]+>'
    # sanity check that the regex matches a From: line with an email
    self.assertRegex('From: J Doe <*****@*****.**>', from_regex)

    for dir_name, subdir_list, file_list in os.walk(ABS_FILES):
        for fname in file_list:
            fname_path = os.path.join(dir_name, fname)
            # skip any empty files
            if os.stat(fname_path).st_size == 0:
                continue
            if not fname_path.endswith('.abs'):
                continue
            num_files_tested += 1
            dm = AbsMetaSession.parse_abs_file(filename=fname_path)
            self.assertIsInstance(dm, DocMetadata)
            self.assertIsNotNone(dm.license)
            self.assertIsNotNone(dm.license.effective_uri,
                                 'should have an effective license URI')
            self.assertRegex(dm.raw_safe, r'(?m)From:\s+',
                             'should have a From: line')
            self.assertNotRegex(dm.raw_safe, from_regex,
                                'From: line should not contain an email address')

    # our test set should be sufficiently large
    self.assertGreater(num_files_tested, 1_000, 'comprehensive dataset')

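# A minimal sketch (illustrative only, not collected by the test runner) of
# what from_regex above distinguishes: a From: line with an email address
# matches, while a scrubbed line does not. Both sample strings are made up.
def _example_from_line_scrubbing():
    import re
    pattern = r'(?m)From:\s+[^<]+<[^@]+@[^>]+>'
    raw = 'From: J Doe <jdoe@example.com>'   # hypothetical raw From: line
    scrubbed = 'From: J Doe '                # hypothetical raw_safe form
    assert re.search(pattern, raw)           # email present: matches
    assert not re.search(pattern, scrubbed)  # email scrubbed: no match
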
def test_psi_in_abs(self):
    """Test TeX markup in abstract text, ARXIVNG-1612."""
    f1 = ABS_FILES + '/ftp/arxiv/papers/1901/1901.05426.abs'
    m = AbsMetaSession.parse_abs_file(filename=f1)
    self.assertIsInstance(m, DocMetadata)
    self.assertNotIn(
        '$ψ$', m.abstract,
        'TeX psi in abstract should not get converted to UTF8')

def test_all_abs_as_web_pages(self):
    for dir_name, subdir_list, file_list in os.walk(ABS_FILES):
        for fname in file_list:
            fname_path = os.path.join(dir_name, fname)
            if os.stat(fname_path).st_size == 0 \
                    or not fname_path.endswith('.abs'):
                continue
            m = AbsMetaSession.parse_abs_file(filename=fname_path)
            rv = self.app.get(f'/abs/{m.arxiv_id}')
            self.assertEqual(rv.status_code, 200)

def test_subsumed_category(self):
    """Test .abs file from a subsumed archive."""
    f1 = ABS_FILES + '/ftp/adap-org/papers/9303/9303001.abs'
    m = AbsMetaSession.parse_abs_file(filename=f1)
    self.assertIsInstance(m, DocMetadata)
    self.assertEqual('adap-org/9303001', m.arxiv_id, 'arxiv_id')
    self.assertTrue(m.primary_category)
    self.assertTrue(m.primary_category.canonical,
                    'subsumed category adap-org should have a canonical')

def paperid_generator(path: str, excluded: List[str]) -> Iterator[str]:
    """Yield the arXiv paper ID of each nonempty .abs file below path,
    skipping any ID in excluded."""
    for dir_name, subdir_list, file_list in os.walk(path):
        for fname in file_list:
            fname_path = os.path.join(dir_name, fname)
            logging.debug(f'looking at {fname_path}')
            if os.stat(fname_path).st_size != 0 \
                    and fname_path.endswith('.abs'):
                aid = AbsMetaSession.parse_abs_file(
                    filename=fname_path).arxiv_id
                if aid not in excluded:
                    logging.debug(f'yielding id {aid}')
                    yield aid

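# A hedged usage sketch (not part of the test suite): pull a handful of IDs
# lazily without walking the whole tree. The sample size and the empty
# exclusion list are assumptions for illustration.
def _example_paperid_sample():
    from itertools import islice
    first_ids = list(islice(paperid_generator(ABS_FILES, excluded=[]), 5))
    print(first_ids)  # a few IDs, depending on the test data
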
def test_abs_without_license_field(self):
    f1 = ABS_FILES + '/ftp/arxiv/papers/0704/0704.0001.abs'
    m = AbsMetaSession.parse_abs_file(filename=f1)
    rv = self.app.get('/abs/0704.0001')
    self.assertEqual(rv.status_code, 200)
    self.assertIsNone(m.license.recorded_uri,
                      '0704.0001 should have no license in abs')
    self.assertEqual(m.license.effective_uri, ASSUMED_LICENSE_URI,
                     '0704.0001 should get assumed license')
    assert b'http://arxiv.org/licenses/assumed-1991-2003/' in rv.data, \
        'abs/0704.0001 should be displayed with assumed-1991-2003 license'

def test_split_long_author_list(self):
    f1 = path_of_for_test(
        'data/abs_files/ftp/arxiv/papers/1411/1411.4413.abs')
    meta: DocMetadata = AbsMetaSession.parse_abs_file(filename=f1)
    alst = split_long_author_list(
        queries_for_authors(str(meta.authors)), 20)
    self.assertIs(type(alst), tuple)
    self.assertIs(len(alst), 3)
    self.assertIs(type(alst[0]), list)
    self.assertIs(type(alst[1]), list)
    self.assertGreater(len(alst[1]), 0)
    self.assertIs(type(alst[2]), int)

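# A hedged sketch of the shape these assertions rely on: assuming (as the
# tests above and below suggest) that split_long_author_list returns a
# (front_list, back_list, back_count) tuple, a caller might summarize the
# result like this. Names and the size cutoff are illustrative only.
def _example_author_list_summary(meta):
    front, back, back_count = split_long_author_list(
        queries_for_authors(str(meta.authors)), 20)
    return f'{len(front)} entries shown, {back_count} additional authors'
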
def test_abs_without_inspire(self):
    f1 = ABS_FILES + '/ftp/math/papers/0202/0202001.abs'
    m = AbsMetaSession.parse_abs_file(filename=f1)
    assert_that(m, is_not(None))
    assert_that(include_inspire_link(m), equal_to(False),
                'math/0202001 should NOT get INSPIRE link')
    rv = self.app.get('/abs/math/0202001')
    assert_that(rv.status_code, equal_to(200))
    assert_that(rv.data.decode('utf-8'),
                is_not(contains_string('INSPIRE HEP')),
                'math/0202001 should NOT get INSPIRE link')

def test_split_with_collaboration(self):
    f1 = path_of_for_test(
        'data/abs_files/ftp/arxiv/papers/0808/0808.4142.abs')
    meta: DocMetadata = AbsMetaSession.parse_abs_file(filename=f1)
    split = split_authors(str(meta.authors))
    self.assertListEqual(
        split, ['D0 Collaboration', ':', 'V. Abazov', ',', 'et al'])
    alst = queries_for_authors(str(meta.authors))
    self.assertListEqual(alst, [('D0 Collaboration', 'D0 Collaboration'),
                                ': ',
                                ('V. Abazov', 'Abazov, V'),
                                ', ',
                                'et al'])

def test_abs_with_license_field(self):
    f1 = ABS_FILES + '/ftp/arxiv/papers/0704/0704.0600.abs'
    m = AbsMetaSession.parse_abs_file(filename=f1)
    self.assertIsNotNone(m.license)
    self.assertIsNotNone(m.license.recorded_uri)
    self.assertEqual(m.license.recorded_uri, m.license.effective_uri)
    self.assertNotEqual(m.license.recorded_uri,
                        'http://arxiv.org/licenses/assumed-1991-2003/')
    rv = self.app.get('/abs/0704.0600')
    self.assertEqual(rv.status_code, 200)
    self.assertRegex(rv.data.decode('utf-8'), m.license.effective_uri,
                     'should be displayed with its license')

def paperid_iterator(path: str, excluded: List[str]) -> List[str]:
    """Return a list of paper ID strings for all .abs files below path,
    skipping any ID in excluded."""
    ids = []
    for dir_name, subdir_list, file_list in os.walk(path):
        for fname in file_list:
            fname_path = os.path.join(dir_name, fname)
            if os.stat(fname_path).st_size == 0:
                continue
            if not fname_path.endswith('.abs'):
                continue
            aid = AbsMetaSession.parse_abs_file(
                filename=fname_path).arxiv_id
            if aid not in excluded:
                ids.append(aid)
    logging.debug(f'finished getting the ids, count: {len(ids)}')
    return ids

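# An illustrative usage sketch (not a test): the eager paperid_iterator
# complements the lazy paperid_generator above when the complete ID list is
# needed up front, e.g. for random sampling. The excluded ID is made up.
def _example_paperid_exclusion():
    import random
    ids = paperid_iterator(ABS_FILES, excluded=['0704.0001'])
    return random.choice(ids) if ids else None
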
def test_abs_with_inspire(self):
    f1 = ABS_FILES + '/ftp/arxiv/papers/1108/1108.5926.abs'
    m = AbsMetaSession.parse_abs_file(filename=f1)
    assert_that(m, is_not(None))
    assert_that(get_orig_publish_date(m.arxiv_identifier),
                equal_to(date(2011, 8, 1)))
    assert_that(m.primary_category, is_not(equal_to(None)))
    assert_that(include_inspire_link(m), is_not(equal_to(False)),
                '1108.5926v1 should get INSPIRE link')
    rv = self.app.get('/abs/1108.5926v1')
    assert_that(rv.status_code, equal_to(200))
    assert_that(rv.data.decode('utf-8'), contains_string('INSPIRE HEP'),
                '1108.5926 should get INSPIRE link')

def test_split_strange_author_list(self):
    """Test odd author list that shows '0 additional authors' ARXIVNG-2083."""
    f1 = path_of_for_test(
        'data/abs_files/ftp/arxiv/papers/1902/1902.05884.abs')
    meta: DocMetadata = AbsMetaSession.parse_abs_file(filename=f1)
    alst = split_long_author_list(
        queries_for_authors(str(meta.authors)), 100)
    self.assertIs(type(alst), tuple)
    self.assertIs(len(alst), 3)
    self.assertIs(type(alst[0]), list)
    self.assertIs(type(alst[1]), list)
    self.assertIs(type(alst[2]), int)
    self.assertEqual(
        len(list(filter(lambda x: isinstance(x, tuple), alst[0]))), 101)
    self.assertEqual(len(alst[1]), 0,
                     'back list on 1902.05884 should be empty')
    self.assertEqual(alst[2], 0,
                     'back list size on 1902.05884 should be 0')

def test_same_as_classic(self):
    """Compare NG google scholar meta tags to classic results."""
    bad_data = [
        '1501.00001v1', '1501.99999v1', '1501.00002v1',
        '1502.00001v1',  # probably fake abs
        '0704.0019v2',  # title tex escaping problem
        '0704.0559v1',  # bad double escape in classic
    ]
    # '0704.0006v1', '0704.0481v1', '0704.0156v2', '0704.0019v2', '0704.0597v1'

    with open(CLASSIC_RESULTS_FILE) as fp:
        classic_results = json.load(fp)

    def to_str(gs_tag):
        return str(gs_tag['name']) + ' ' + str(gs_tag['content'])

    def to_set(gs_tags):
        return set(map(to_str, gs_tags))

    num_files_tested = 0
    for dir_name, subdir_list, file_list in os.walk(ABS_FILES):
        for fname in file_list:
            fname_path = os.path.join(dir_name, fname)
            # skip any empty files
            if os.stat(fname_path).st_size == 0:
                continue
            if not fname_path.endswith('.abs'):
                continue
            mm = AbsMetaSession.parse_abs_file(filename=fname_path)
            if mm.arxiv_id_v in bad_data:
                continue
            num_files_tested += 1
            self.assertIsInstance(mm, DocMetadata)
            with self.app.test_request_context():
                gs_tags = meta_tag_metadata(mm)
                self.assertIsInstance(gs_tags, list)

            if mm.arxiv_id_v not in classic_results:
                # Could not find google scholar tags in classic results for
                # this arxiv_id. Not a problem: this abs was probably added
                # to the test data after the classic results were generated.
                # Only add the google scholar tags to the classic metadata
                # if you'd like a regression test for it.
                continue

            classic = set(
                map(html.unescape, to_set(classic_results[mm.arxiv_id_v])))
            ng = set(map(html.unescape, to_set(gs_tags)))
            if ng != classic:
                classic_without_doi = set(
                    filter(lambda v: not v.startswith('citation_doi'),
                           classic))
                ng_without_doi = set(
                    filter(lambda v: not v.startswith('citation_doi'), ng))
                self.assertSetEqual(
                    ng_without_doi, classic_without_doi,
                    'For {} NG tags (first set) are not the same as classic '
                    'tags (second set), test num {}; DOIs are ignored.\n'
                    'classic/expected: {}\n'
                    'ng/actual: {}\n'
                    'test authors: {}\n'
                    'test title: {}'.format(
                        mm.arxiv_id_v, num_files_tested,
                        pprint.pformat(classic), pprint.pformat(ng),
                        mm.authors.raw, mm.title))

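# For reference, a hedged sketch of the shape CLASSIC_RESULTS_FILE is assumed
# to have, inferred from to_str() above: a mapping from versioned arXiv ID to
# a list of {name, content} tag dicts. The values here are made up.
_EXAMPLE_CLASSIC_RESULTS = {
    '0704.0600v1': [
        {'name': 'citation_title', 'content': 'An example title'},
        {'name': 'citation_author', 'content': 'Doe, J'},
    ]
}
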
def test_individual_files(self):
    """Test individual .abs files."""
    f1 = ABS_FILES + '/orig/arxiv/papers/0906/0906.5132v3.abs'
    ams = AbsMetaSession.parse_abs_file(filename=f1)
    self.assertIsInstance(ams, DocMetadata)
    self.assertEqual(ams.arxiv_id, '0906.5132', 'arxiv_id')
    self.assertEqual(
        ams.submitter,
        Submitter(name='Vladimir P. Mineev', email='[email protected]'))
    self.assertListEqual(ams.version_history, [
        VersionEntry(version=1,
                     raw='Date: Sun, 28 Jun 2009 11:24:35 GMT (17kb)',
                     submitted_date=datetime(2009, 6, 28, 11, 24, 35,
                                             tzinfo=tzutc()),
                     size_kilobytes=17,
                     source_type=SourceType(code='')),
        VersionEntry(version=2,
                     raw='Date (revised v2): Tue, 21 Jul '
                         '2009 09:45:44 GMT (17kb)',
                     submitted_date=datetime(2009, 7, 21, 9, 45, 44,
                                             tzinfo=tzutc()),
                     size_kilobytes=17,
                     source_type=SourceType(code='')),
        VersionEntry(version=3,
                     raw='Date (revised v3): Wed, 29 Jul '
                         '2009 11:13:43 GMT (17kb)',
                     submitted_date=datetime(2009, 7, 29, 11, 13, 43,
                                             tzinfo=tzutc()),
                     size_kilobytes=17,
                     source_type=SourceType(code=''))
    ])
    self.assertEqual(ams.version, 3)
    self.assertEqual(
        ams.title,
        'Recent developments in unconventional superconductivity theory')
    self.assertEqual(str(ams.authors), 'V.P.Mineev')
    self.assertEqual(ams.categories, 'cond-mat.supr-con cond-mat.mtrl-sci')
    self.assertEqual(ams.comments, '15 pages')
    self.assertIsNotNone(ams.license)
    self.assertEqual(
        ams.license.effective_uri,
        'http://arxiv.org/licenses/nonexclusive-distrib/1.0/')
    self.assertMultiLineEqual(
        ams.abstract,
        '''
  The review of recent developments in the unconventional superconductivity
theory is given. In the fist part I consider the physical origin of the Kerr
rotation polarization of light reflected from the surface of superconducting
$Sr_2RuO_4$. Then the comparison of magneto-optical responses in
superconductors with orbital and spin spontaneous magnetization is presented.
The latter result is applied to the estimation of the magneto-optical
properties of neutral superfluids with spontaneous magnetization. The second
part is devoted to the natural optical activity or gyrotropy properties of
noncentrosymmetric metals in their normal and superconducting states. The
temperature behavior of the gyrotropy coefficient is compared with the
temperature behavior of paramagnetic susceptibility determining the
noticeable increase of the paramagnetic limiting field in noncentrosymmetric
superconductors. In the last chapter I describe the order parameter and the
symmetry of superconducting state in the itinerant ferromagnet with
orthorhombic symmetry. Finally the Josephson coupling between two adjacent
ferromagnet superconducting domains is discussed.
''')
    for value in [ams.acm_class, ams.doi, ams.journal_ref,
                  ams.report_num, ams.proxy]:
        self.assertIsNone(value)

def test_collaboration_at_front(self):
    f1 = path_of_for_test(
        'data/abs_files/ftp/arxiv/papers/0808/0808.4142.abs')
    meta = AbsMetaSession.parse_abs_file(filename=f1)
    paflst = parse_author_affil(meta.authors.raw)
    self.assertListEqual(paflst, [['D0 Collaboration', '', ''],
                                  ['Abazov', 'V.', '']])