Example 1
    def parse_node(self, response, node):
        """Parse an OUP XML article node into a HEP record.

        Extracts DOIs, article type, arxiv eprints, page count, title and
        abstract, authors, journal metadata, copyright/license information,
        and local file paths carried in ``response.meta``.

        :param response: the scrapy response the node came from; its
            ``meta`` may hold ``xml_url``/``pdf_url``/``pdfa_url`` paths.
        :param node: XML selector positioned on an ``<article>`` element.
        :return: ``dict`` with the loaded HEP record fields.
        """
        node.remove_namespaces()
        record = HEPLoader(item=HEPRecord(), selector=node, response=response)

        dois = node.xpath("//article-id[@pub-id-type='doi']/text()").extract()
        record.add_value('dois', dois)

        raw_article_type = node.xpath('./@article-type').extract()
        # Materialize a list (not a lazy ``map``) so the value can be stored
        # in the loader and then inspected again below without exhausting a
        # single-use iterator on Python 3.
        article_type = [self.article_type_mapping.get(x, 'other')
                        for x in raw_article_type]
        record.add_value('journal_doctype', article_type)

        if 'other' in article_type:
            logger.warning(
                'There are unmapped article types for article %s with types %s.'
                % (dois, raw_article_type))

        # FIX: the original tested ``article_type in ['correction', 'addendum']``,
        # comparing the whole *list* to single strings -- always False, so
        # related_article_doi was never added. Check the mapped types instead.
        if set(article_type) & {'correction', 'addendum'}:
            logger.info('Adding related_article_doi.')
            record.add_xpath('related_article_doi',
                             "//related-article[@ext-link-type='doi']/@href")

        arxiv_eprints = self.get_arxiv_eprints(node)
        if not arxiv_eprints:
            logger.warning('No arxiv eprints found for article %s.' % dois)
        else:
            record.add_value('arxiv_eprints', arxiv_eprints)

        page_nr = node.xpath("//counts/page-count/@count")
        if page_nr:
            try:
                # A list comprehension raises ValueError eagerly, so the
                # except clause below actually fires (a lazy ``map`` on
                # Python 3 would defer the error past this ``try``).
                page_nr = [int(count) for count in page_nr.extract()]
                record.add_value('page_nr', page_nr)
            except ValueError as e:
                logger.error(
                    'Failed to parse last_page or first_page for article %s: %s'
                    % (dois, e))

        record.add_xpath('abstract', '//abstract[1]')
        record.add_xpath('title', '//article-title/text()')
        record.add_xpath('subtitle', '//subtitle/text()')

        authors = self._get_authors(node)
        if not authors:
            logger.error('No authors found for article %s.' % dois)
        record.add_value('authors', authors)
        record.add_xpath('collaborations', "//contrib/collab/text()")

        journal_title = '//abbrev-journal-title/text()|//journal-title/text()'
        record.add_xpath('journal_title', journal_title)
        record.add_xpath('journal_issue', '//issue/text()')
        record.add_xpath('journal_volume', '//volume/text()')
        record.add_xpath('journal_artid', '//elocation-id/text()')

        # FIX: ``date_published`` was previously added twice (once right after
        # the author fields and once here); compute and add it a single time.
        published_date = self._get_published_date(node)
        volume = self.get_volume_year(node)
        record.add_value('journal_year', int(volume))
        record.add_value('date_published', published_date)

        record.add_xpath('copyright_holder', '//copyright-holder/text()')
        record.add_xpath('copyright_year', '//copyright-year/text()')
        record.add_xpath('copyright_statement', '//copyright-statement/text()')

        # Renamed from ``license`` to avoid shadowing the builtin.
        license_info = get_license(license_url=node.xpath(
            '//license/license-p/ext-link/text()').extract_first())
        record.add_value('license', license_info)

        record.add_value('collections',
                         ['Progress of Theoretical and Experimental Physics'])

        # Attach local file paths (xml, pdf, pdf/a) when the upstream
        # request placed them in response.meta.
        local_files = []
        if 'xml_url' in response.meta:
            local_files.append({
                'filetype': 'xml',
                'path': response.meta['xml_url']
            })
        if 'pdf_url' in response.meta:
            local_files.append({
                'filetype': 'pdf',
                'path': response.meta['pdf_url']
            })
        if 'pdfa_url' in response.meta:
            local_files.append({
                'filetype': 'pdf/a',
                'path': response.meta['pdfa_url']
            })
        record.add_value('local_files', local_files)

        return dict(record.load_item())
Example 2
    def parse(self, response):
        """Parse an APS JSON response into HEP records.

        Yields one loaded item per article in the response body, then
        follows the ``rel="next"`` pagination link from the ``Link``
        header, if present.

        :param response: scrapy response whose body is APS JSON with a
            top-level ``data`` list of article objects.
        :yields: loaded HEP items, plus a ``Request`` for the next page.
        """
        aps_response = json.loads(response.body_as_unicode())

        for article in aps_response['data']:
            record = HEPLoader(item=HEPRecord(), response=response)

            dois = get_nested(article, 'identifiers', 'doi')
            record.add_value('dois', dois)

            journal_doctype = self.article_type_mapping.get(
                article.get('articleType'), 'other')
            if journal_doctype == 'other':
                logger.warning(
                    'Journal_doctype is %s for article %s. Do we need other mapping for this?'
                    % (journal_doctype, dois))

            record.add_value('journal_doctype', journal_doctype)
            page_nr = article.get('numPages')
            if page_nr is not None:
                record.add_value('page_nr', page_nr)

            # FIX: normalize to '' before stripping the prefix -- calling
            # ``.replace`` on a None result from get_nested would crash
            # before the ``if not arxiv`` guard below could run.
            arxiv = (get_nested(article, 'identifiers', 'arxiv')
                     or '').replace('arXiv:', '')
            if not arxiv:
                logger.warning('No arxiv eprints found for article %s.' % dois)
            else:
                record.add_value('arxiv_eprints', {'value': arxiv})

            record.add_value('abstract',
                             get_nested(article, 'abstract', 'value'))
            record.add_value('title', get_nested(article, 'title', 'value'))

            authors, collaborations = self._get_authors_and_collab(
                article, dois)
            record.add_value('authors', authors)
            record.add_value('collaborations', collaborations)

            record.add_value('journal_title',
                             get_nested(article, 'journal', 'name'))
            record.add_value('journal_issue',
                             get_nested(article, 'issue', 'number'))
            record.add_value('journal_volume',
                             get_nested(article, 'volume', 'number'))

            published_date = article['date']
            record.add_value('journal_year', int(published_date[:4]))
            record.add_value('date_published', published_date)
            record.add_value('field_categories', [{
                'term': term.get('label'),
                'scheme': 'APS',
                'source': '',
            } for term in get_nested(article, 'classificationSchemes',
                                     'subjectAreas')])
            copyright_holders = get_nested(article, 'rights',
                                           'copyrightHolders')
            if copyright_holders:
                record.add_value('copyright_holder',
                                 copyright_holders[0]['name'])

            # NOTE(review): if copyrightYear is absent this stores the
            # string 'None' -- confirm whether that is intended upstream.
            record.add_value(
                'copyright_year',
                str(get_nested(article, 'rights', 'copyrightYear')))
            record.add_value('copyright_statement',
                             get_nested(article, 'rights', 'rightsStatement'))

            # Renamed from ``license`` to avoid shadowing the builtin.
            license_info = get_license(license_url=get_nested(
                article, 'rights', 'licenses')[0]['url'])
            record.add_value('license', license_info)

            record.add_value('collections', ['HEP', 'Citeable', 'Published'])
            yield record.load_item()

        # Pagination support. Will yield until no more "next" pages are found.
        if 'Link' in response.headers:
            links = link_header.parse(response.headers['Link'])
            # Renamed from ``next`` to avoid shadowing the builtin.
            next_links = links.links_by_attr_pairs([('rel', 'next')])
            if next_links:
                next_url = next_links[0].href
                yield Request(next_url)