Example #1
0
class JatsParser(object):
    """Parser for the JATS format.

    It can be used directly by invoking the :func:`JatsParser.parse` method, or be
    subclassed to customize its behavior.

    Args:
        jats_record (Union[str, scrapy.selector.Selector]): the record in JATS format to parse.
        source (Optional[str]): if provided, sets the ``source`` everywhere in
            the record. Otherwise, the source is extracted from the JATS metadata.
    """
    def __init__(self, jats_record, source=None):
        self.root = self.get_root_node(jats_record)
        if not source:
            source = self.publisher
        self.builder = LiteratureBuilder(source)

    def parse(self):
        """Extract a JATS record into an Inspire HEP record.

        Returns:
            dict: the same record in the Inspire Literature schema.
        """
        self.builder.add_abstract(self.abstract)
        self.builder.add_title(self.title, subtitle=self.subtitle)
        self.builder.add_copyright(**self.copyright)
        self.builder.add_document_type(self.document_type)
        self.builder.add_license(**self.license)
        for author in self.authors:
            self.builder.add_author(author)
        self.builder.add_number_of_pages(self.number_of_pages)
        self.builder.add_publication_info(**self.publication_info)
        for collab in self.collaborations:
            self.builder.add_collaboration(collab)
        for doi in self.dois:
            self.builder.add_doi(**doi)
        for keyword in self.keywords:
            self.builder.add_keyword(**keyword)
        # NOTE(review): ``publication_date`` raises ValueError when the record
        # has no publication date nodes -- confirm all feeds provide one.
        self.builder.add_imprint_date(self.publication_date.dumps())
        for reference in self.references:
            self.builder.add_reference(reference)

        return self.builder.record

    @property
    def references(self):
        """Extract a JATS record into an Inspire HEP references record.

        Returns:
            List[dict]: an array of reference schema records, representing
                the references in the record
        """
        ref_nodes = self.root.xpath('./back/ref-list/ref')
        return list(
            itertools.chain.from_iterable(
                self.get_reference(node) for node in ref_nodes
            )
        )

    # Configuration for ``remove_tags`` when cleaning abstracts: keep sub- and
    # superscripts and whole ``<math>`` trees, strip pub-ids and ISSNs.
    remove_tags_config_abstract = {
        'allowed_tags': ['sup', 'sub'],
        'allowed_trees': ['math'],
        'strip': 'self::pub-id|self::issn'
    }

    @property
    def abstract(self):
        """First abstract of the record as cleaned-up text, or ``None``."""
        abstract_nodes = self.root.xpath('./front//abstract[1]')

        if not abstract_nodes:
            return

        abstract = remove_tags(abstract_nodes[0], **self.remove_tags_config_abstract).strip()
        return abstract

    @property
    def article_type(self):
        """Value of the root ``article-type`` attribute, or ``None``."""
        article_type = self.root.xpath('./@article-type').extract_first()

        return article_type

    @property
    def artid(self):
        """Electronic location identifier (article id), or ``None``."""
        artid = self.root.xpath('./front/article-meta//elocation-id//text()').extract_first()

        return artid

    @property
    def authors(self):
        """All authors of the record, parsed into the Inspire schema."""
        author_nodes = self.root.xpath('./front//contrib[@contrib-type="author"]')
        authors = [self.get_author(author) for author in author_nodes]

        return authors

    @property
    def collaborations(self):
        """Set of collaboration names found in the front matter."""
        collab_nodes = self.root.xpath(
            './front//collab |'
            './front//contrib[@contrib-type="collaboration"] |'
            './front//on-behalf-of'
        )
        collaborations = set(
            collab.xpath('string(.)').extract_first() for collab in collab_nodes
        )

        return collaborations

    @property
    def copyright(self):
        """Copyright information as keyword arguments for the builder."""
        copyright = {
            'holder': self.copyright_holder,
            'material': self.material,
            'statement': self.copyright_statement,
            'year': self.copyright_year,
        }

        return copyright

    @property
    def copyright_holder(self):
        """Copyright holder, or ``None``."""
        copyright_holder = self.root.xpath('./front//copyright-holder/text()').extract_first()

        return copyright_holder

    @property
    def copyright_statement(self):
        """Copyright statement, or ``None``."""
        copyright_statement = self.root.xpath('./front//copyright-statement/text()').extract_first()

        return copyright_statement

    @property
    def copyright_year(self):
        """Copyright year as an int, or ``None`` if absent or not a number."""
        copyright_year = self.root.xpath('./front//copyright-year/text()').extract_first()

        return maybe_int(copyright_year)

    @property
    def dois(self):
        """DOIs of the record and, for non-publications, of related articles.

        Returns:
            List[dict]: keyword arguments for ``LiteratureBuilder.add_doi``.
        """
        doi_values = self.root.xpath('./front/article-meta//article-id[@pub-id-type="doi"]/text()').extract()
        dois = [
            {'doi': value, 'material': self.material} for value in doi_values
        ]

        if self.material != 'publication':
            # e.g. for an erratum, also record the DOI of the original article
            doi_values = self.root.xpath(
                './front/article-meta//related-article[@ext-link-type="doi"]/@href'
            ).extract()
            related_dois = ({'doi': value} for value in doi_values)
            dois.extend(related_dois)

        return dois

    @property
    def document_type(self):
        """Inspire document type: ``'conference paper'`` or ``'article'``."""
        if self.is_conference_paper:
            document_type = 'conference paper'
        else:
            document_type = 'article'

        return document_type

    @property
    def is_conference_paper(self):
        """Decide whether the article is a conference paper."""
        conference_node = self.root.xpath('./front//conference').extract_first()

        return bool(conference_node)

    @property
    def journal_title(self):
        """Journal title (abbreviated form preferred), or ``None``."""
        journal_title = self.root.xpath(
            './front/journal-meta//abbrev-journal-title/text() |'
            './front/journal-meta//journal-title/text()'
        ).extract_first()

        return journal_title

    @property
    def journal_issue(self):
        """Journal issue, or ``None``."""
        journal_issue = self.root.xpath('./front/article-meta/issue/text()').extract_first()

        return journal_issue

    @property
    def journal_volume(self):
        """Journal volume, or ``None``."""
        journal_volume = self.root.xpath('./front/article-meta/volume/text()').extract_first()

        return journal_volume

    @property
    def keywords(self):
        """All keywords of the record.

        Returns:
            Iterator[dict]: keyword arguments for ``LiteratureBuilder.add_keyword``.
        """
        keyword_groups = self.root.xpath('./front//kwd-group')
        keywords = itertools.chain.from_iterable(self.get_keywords(group) for group in keyword_groups)

        return keywords

    @property
    def license(self):
        """License information as keyword arguments for the builder."""
        license = {
            'license': self.license_statement,
            'material': self.material,
            'url': self.license_url,
        }

        return license

    @property
    def license_statement(self):
        """Text of the license statement, stripped of surrounding whitespace."""
        license_statement = self.root.xpath('string(./front/article-meta//license)').extract_first().strip()

        return license_statement

    @property
    def license_url(self):
        """URL of the license, or ``None``."""
        url_nodes = (
            './front/article-meta//license_ref/text() |'
            './front/article-meta//license/@href |'
            './front/article-meta//license//ext-link/@href'
        )
        license_url = self.root.xpath(url_nodes).extract_first()

        return license_url

    @property
    def material(self):
        """Inspire ``material`` of the record, derived from the article type."""
        # ``article_type`` may be absent; treat it as an empty string so the
        # record falls through to the default 'publication' instead of raising
        # AttributeError on ``None.startswith``.
        article_type = self.article_type or ''
        if article_type.startswith('correc'):
            material = 'erratum'
        elif article_type in ('erratum', 'translation', 'addendum', 'reprint'):
            material = article_type
        else:
            material = 'publication'

        return material

    @property
    def number_of_pages(self):
        """Page count as an int, or ``None``."""
        number_of_pages = maybe_int(self.root.xpath('./front/article-meta//page-count/@count').extract_first())

        return number_of_pages

    @property
    def page_start(self):
        """First page, or ``None``."""
        page_start = self.root.xpath('./front/article-meta/fpage/text()').extract_first()

        return page_start

    @property
    def page_end(self):
        """Last page, or ``None``."""
        page_end = self.root.xpath('./front/article-meta/lpage/text()').extract_first()

        return page_end

    @property
    def publication_date(self):
        """Earliest publication date of the record.

        Returns:
            PartialDate: the earliest parsed date.

        Raises:
            ValueError: if the record contains no publication date nodes.
        """
        date_nodes = self.root.xpath(
            './front//pub-date[@pub-type="ppub"] |'
            './front//pub-date[@pub-type="epub"] |'
            './front//pub-date[starts-with(@date-type,"pub")] |'
            './front//date[starts-with(@date-type,"pub")]'
        )
        publication_date = min(
            self.get_date(date_node) for date_node in date_nodes
        )

        return publication_date

    @property
    def publication_info(self):
        """Publication info as keyword arguments for the builder."""
        publication_info = {
            'artid': self.artid,
            'journal_title': self.journal_title,
            'journal_issue': self.journal_issue,
            'journal_volume': self.journal_volume,
            'material': self.material,
            'page_start': self.page_start,
            'page_end': self.page_end,
            'year': self.year,
        }

        return publication_info

    @property
    def publisher(self):
        """Publisher name, or ``None``."""
        publisher = self.root.xpath('./front//publisher-name/text()').extract_first()

        return publisher

    @property
    def subtitle(self):
        """Subtitle of the article as plain text."""
        subtitle = self.root.xpath('string(./front//subtitle)').extract_first()

        return subtitle

    @property
    def title(self):
        """Title of the article as plain text."""
        title = self.root.xpath('string(./front//article-title)').extract_first()

        return title

    def get_affiliation(self, id_):
        """Get the affiliation with the specified id.

        Args:
            id_(str): the value of the ``id`` attribute of the affiliation.

        Returns:
            Optional[str]: the affiliation with that id or ``None`` if there is
                no match.
        """
        affiliation_node = self.root.xpath("//aff[@id=$id_]", id_=id_)
        if affiliation_node:
            affiliation = remove_tags(
                affiliation_node[0], strip="self::label | self::email"
            ).strip()
            return affiliation

    def get_emails_from_refs(self, id_):
        """Get the emails from the node with the specified id.

        Args:
            id_(str): the value of the ``id`` attribute of the node.

        Returns:
            List[str]: the emails from the node with that id or [] if none found.
        """
        email_nodes = self.root.xpath('//aff[@id=$id_]/email/text()', id_=id_)
        return email_nodes.extract()

    @property
    def year(self):
        """Year of the earliest print publication date.

        Returns:
            int: the year of the earliest matching date.

        Raises:
            ValueError: if the record contains no matching date nodes.
        """
        not_online = (
            'not(starts-with(@publication-format, "elec"))'
            # fixed: the predicate was missing a closing parenthesis
            ' and not(starts-with(@publication-format, "online"))'
        )
        # NOTE(review): ``$not_online`` is bound as a *string* XPath variable,
        # and a non-empty string is truthy in an XPath boolean context, so this
        # predicate may not actually exclude electronic dates -- verify whether
        # it should be interpolated into the expression instead.
        date_nodes = self.root.xpath(
            './front//pub-date[@pub-type="ppub"] |'
            './front//pub-date[starts-with(@date-type,"pub") and $not_online] |'
            './front//date[starts-with(@date-type,"pub") and $not_online]',
            not_online=not_online
        )

        year = min(
            self.get_date(date_node) for date_node in date_nodes
        ).year

        return year

    def get_author_affiliations(self, author_node):
        """Extract an author's affiliations."""
        raw_referred_ids = author_node.xpath('.//xref[@ref-type="aff"]/@rid').extract()
        # Sometimes the rid might have more than one ID (e.g. rid="id0 id1")
        referred_ids = set()
        for raw_referred_id in raw_referred_ids:
            referred_ids.update(set(raw_referred_id.split(' ')))

        # Look each affiliation up only once (the original resolved every id
        # twice), keeping only ids that resolve to an actual affiliation.
        affiliations = [
            affiliation
            for affiliation in (self.get_affiliation(rid) for rid in referred_ids)
            if affiliation
        ]

        return affiliations

    def get_author_emails(self, author_node):
        """Extract an author's email addresses."""
        emails = author_node.xpath('.//email/text()').extract()
        referred_ids = author_node.xpath('.//xref[@ref-type="aff"]/@rid').extract()
        for referred_id in referred_ids:
            emails.extend(self.get_emails_from_refs(referred_id))

        return emails

    @staticmethod
    def get_author_name(author_node):
        """Extract an author's name.

        Prefers a structured name (surname, given names, suffix); falls back
        to the verbatim ``<string-name>`` when no surname is present.
        """
        surname = author_node.xpath('.//surname/text()').extract_first()
        if not surname:
            # The author name is unstructured; return it verbatim. (Previously
            # this value was computed and then discarded by the join below.)
            return author_node.xpath('string(./string-name)').extract_first()
        given_names = author_node.xpath('.//given-names/text()').extract_first()
        suffix = author_node.xpath('.//suffix/text()').extract_first()
        author_name = ', '.join(el for el in (surname, given_names, suffix) if el)

        return author_name

    @staticmethod
    def get_date(date_node):
        """Extract a date from a date node.

        Tries, in order of preference: the ``iso-8601-date`` attribute, the
        year/month/day child elements, and a free-text ``<string-date>``.

        Returns:
            PartialDate: the parsed date.
        """
        iso_string = date_node.xpath('./@iso-8601-date').extract_first()
        iso_date = PartialDate.loads(iso_string) if iso_string else None

        year = date_node.xpath('string(./year)').extract_first()
        month = date_node.xpath('string(./month)').extract_first()
        day = date_node.xpath('string(./day)').extract_first()
        date_from_parts = PartialDate.from_parts(year, month, day) if year else None

        string_date = date_node.xpath('string(./string-date)').extract_first()
        try:
            parsed_date = PartialDate.parse(string_date)
        except ValueError:
            parsed_date = None

        date = get_first([iso_date, date_from_parts, parsed_date])
        return date

    @staticmethod
    def get_keywords(group_node):
        """Extract keywords from a keyword group.

        Returns:
            Iterator[dict]: keyword arguments for ``LiteratureBuilder.add_keyword``.
        """
        schema = None
        if 'pacs' in group_node.xpath('@kwd-group-type').extract_first(default='').lower():
            schema = 'PACS'

        keywords = (kwd.xpath('string(.)').extract_first() for kwd in group_node.xpath('.//kwd'))
        keyword_dicts = ({'keyword': keyword, 'schema': schema} for keyword in keywords)

        return keyword_dicts

    @staticmethod
    def get_root_node(jats_record):
        """Get a selector on the root ``article`` node of the record.

        This can be overridden in case some preprocessing needs to be done on
        the XML.

        Args:
            jats_record(Union[str, scrapy.selector.Selector]): the record in JATS format.

        Returns:
            scrapy.selector.Selector: a selector on the root ``<article>``
                node.
        """
        if isinstance(jats_record, six.string_types):
            root = get_node(jats_record)
        else:
            root = jats_record
        root.remove_namespaces()

        return root

    def get_author(self, author_node):
        """Extract one author.

        Args:
            author_node(scrapy.selector.Selector): a selector on a single
                author, e.g. a ``<contrib contrib-type="author">``.

        Returns:
            dict: the parsed author, conforming to the Inspire schema.
        """
        author_name = self.get_author_name(author_node)
        emails = self.get_author_emails(author_node)
        affiliations = self.get_author_affiliations(author_node)

        return self.builder.make_author(author_name, raw_affiliations=affiliations, emails=emails)

    @staticmethod
    def get_reference_authors(ref_node, role):
        """Extract authors of `role` from a reference node.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single reference.
            role(str): author role

        Returns:
            List[str]: list of names
        """
        return ref_node.xpath(
            './person-group[@person-group-type=$role]/string-name/text()',
            role=role
        ).extract()

    def get_reference(self, ref_node):
        """Extract one reference.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single
                reference, i.e. ``<ref>``.

        Returns:
            dict: the parsed reference, as generated by
                :class:`inspire_schemas.api.ReferenceBuilder`
        """
        for citation_node in ref_node.xpath('./mixed-citation'):
            builder = ReferenceBuilder()

            builder.add_raw_reference(
                ref_node.extract().strip(),
                source=self.builder.source,
                ref_format='JATS'
            )

            # Map each XPath onto the builder method that consumes its value.
            fields = [
                (
                    (
                        'self::node()[@publication-type="journal" '
                        'or @publication-type="eprint"]/source/text()'
                    ),
                    builder.set_journal_title,
                ),
                (
                    'self::node()[@publication-type="book"]/source/text()',
                    builder.add_parent_title,
                ),
                ('./publisher-name/text()', builder.set_publisher),
                ('./volume/text()', builder.set_journal_volume),
                ('./issue/text()', builder.set_journal_issue),
                ('./year/text()', builder.set_year),
                ('./pub-id[@pub-id-type="arxiv"]/text()', builder.add_uid),
                ('./pub-id[@pub-id-type="doi"]/text()', builder.add_uid),
                (
                    'pub-id[@pub-id-type="other"]'
                    '[contains(preceding-sibling::text(),"Report No")]/text()',
                    builder.add_report_number
                ),
                ('./article-title/text()', builder.add_title),
                ('../label/text()', lambda x: builder.set_label(x.strip('[].')))
            ]

            for xpath, field_handler in fields:
                # fixed: a duplicated, discarded ``citation_node.xpath(xpath)``
                # call was removed here -- it had no effect.
                value = citation_node.xpath(xpath).extract_first()
                if value:
                    field_handler(value)

            # Whatever is left after stripping the structured fields goes into
            # the reference's ``misc`` field.
            remainder = remove_tags(
                citation_node,
                strip='self::person-group'
                      '|self::pub-id'
                      '|self::article-title'
                      '|self::volume'
                      '|self::issue'
                      '|self::year'
                      '|self::label'
                      '|self::publisher-name'
                      '|self::source[../@publication-type!="proc"]'
                      '|self::object-id'
                      '|self::page-range'
                      '|self::issn'
            ).strip('"\';,. \t\n\r').replace('()', '')
            if remainder:
                builder.add_misc(remainder)

            for editor in self.get_reference_authors(citation_node, 'editor'):
                builder.add_author(editor, 'editor')

            for author in self.get_reference_authors(citation_node, 'author'):
                builder.add_author(author, 'author')

            page_range = citation_node.xpath('./page-range/text()').extract_first()
            if page_range:
                page_artid = split_page_artid(page_range)
                builder.set_page_artid(*page_artid)

            yield builder.obj

    def attach_fulltext_document(self, file_name, url):
        """Attach the record's fulltext as a hidden document on the builder."""
        self.builder.add_document(file_name, url, fulltext=True, hidden=True)
Example #2
0
File: tohep.py  Project: miguelgrc/hepcrawl
def hepcrawl_to_hep(crawler_record):
    """Convert a hepcrawl-formatted record into the HEP (Inspire Literature) format.

    Args:
        crawler_record(dict): dictionary representing the hepcrawl formatted
            record.


    Returns:
        dict: The hep formatted record.
    """

    def _filter_affiliation(affiliations):
        # Keep only entries that actually carry a non-empty 'value'.
        return [
            affiliation.get('value')
            for affiliation in affiliations
            if affiliation.get('value')
        ]

    builder = LiteratureBuilder(
        source=crawler_record['acquisition_source']['source']
    )

    for author in crawler_record.get('authors', []):
        builder.add_author(builder.make_author(
            full_name=author['full_name'],
            # 'affiliations' may be missing for some authors; default to [] so
            # the record still converts (everything else here is defensive).
            raw_affiliations=_filter_affiliation(author.get('affiliations', [])),
        ))

    for title in crawler_record.get('titles', []):
        builder.add_title(
            title=title.get('title'),
            subtitle=title.get('subtitle'),
            source=title.get('source')
        )

    for abstract in crawler_record.get('abstracts', []):
        builder.add_abstract(
            abstract=abstract.get('value'),
            source=abstract.get('source')
        )

    for arxiv_eprint in crawler_record.get('arxiv_eprints', []):
        builder.add_arxiv_eprint(
            arxiv_id=arxiv_eprint.get('value'),
            arxiv_categories=arxiv_eprint.get('categories')
        )

    for doi in crawler_record.get('dois', []):
        builder.add_doi(
            doi=doi.get('value'),
            material=doi.get('material'),
        )

    for private_note in crawler_record.get('private_notes', []):
        builder.add_private_note(
            private_notes=private_note
        )

    for public_note in crawler_record.get('public_notes', []):
        builder.add_public_note(
            public_note=public_note.get('value'),
            source=public_note.get('source')
        )

    # Renamed the loop variable from ``license`` to avoid shadowing the builtin.
    for license_info in crawler_record.get('license', []):
        builder.add_license(
            url=license_info.get('url'),
            license=license_info.get('license'),
            material=license_info.get('material'),
        )

    for collaboration in crawler_record.get('collaborations', []):
        builder.add_collaboration(
            collaboration=collaboration.get('value')
        )

    for imprint in crawler_record.get('imprints', []):
        builder.add_imprint_date(
            imprint_date=imprint.get('date')
        )

    for copyright_entry in crawler_record.get('copyright', []):
        builder.add_copyright(
            holder=copyright_entry.get('holder'),
            material=copyright_entry.get('material'),
            statement=copyright_entry.get('statement')
        )

    builder.add_preprint_date(
        preprint_date=crawler_record.get('preprint_date')
    )

    # NOTE(review): the ``{}`` default is misleading -- the bracket accesses
    # below require these keys, and 'acquisition_source' was already required
    # when the builder was created above.
    acquisition_source = crawler_record.get('acquisition_source', {})
    builder.add_acquisition_source(
        method=acquisition_source['method'],
        date=acquisition_source['datetime'],
        source=acquisition_source['source'],
        submission_number=acquisition_source['submission_number'],
    )

    try:
        builder.add_number_of_pages(
            number_of_pages=int(crawler_record.get('page_nr', [])[0])
        )
    except (TypeError, ValueError, IndexError):
        # 'page_nr' absent, empty, or not a parsable integer: skip silently.
        pass

    # Collections that map onto Inspire publication types.
    publication_types = [
        'introductory',
        'lectures',
        'review',
        'manual',
    ]

    # Collections that map directly onto Inspire document types.
    document_types = [
        'book',
        'note',
        'report',
        'proceedings',
        'thesis',
    ]

    added_doc_type = False

    for collection in crawler_record.get('collections', []):
        collection = collection['primary'].strip().lower()

        if collection == 'arxiv':
            continue  # ignored
        elif collection == 'citeable':
            builder.set_citeable(True)
        elif collection == 'core':
            builder.set_core(True)
        elif collection == 'noncore':
            builder.set_core(False)
        elif collection == 'published':
            builder.set_refereed(True)
        elif collection == 'withdrawn':
            builder.set_withdrawn(True)
        elif collection in publication_types:
            builder.add_publication_type(collection)
        elif collection == 'bookchapter':
            added_doc_type = True
            builder.add_document_type('book chapter')
        elif collection == 'conferencepaper':
            added_doc_type = True
            builder.add_document_type('conference paper')
        elif collection in document_types:
            added_doc_type = True
            builder.add_document_type(collection)

    # Every record must carry at least one document type.
    if not added_doc_type:
        builder.add_document_type('article')

    # Only the first publication_info entry is converted.
    _pub_info = crawler_record.get('publication_info', [{}])[0]
    builder.add_publication_info(
        year=_pub_info.get('year'),
        artid=_pub_info.get('artid'),
        page_end=_pub_info.get('page_end'),
        page_start=_pub_info.get('page_start'),
        journal_issue=_pub_info.get('journal_issue'),
        journal_title=_pub_info.get('journal_title'),
        journal_volume=_pub_info.get('journal_volume'),
        pubinfo_freetext=_pub_info.get('pubinfo_freetext'),
        material=_pub_info.get('pubinfo_material'),
    )

    for report_number in crawler_record.get('report_numbers', []):
        builder.add_report_number(
            report_number=report_number.get('value'),
            source=report_number.get('source')
        )

    for url in crawler_record.get('urls', []):
        builder.add_url(url=url.get('value'))

    for document in crawler_record.get('documents', []):
        builder.add_document(
            description=document.get('description'),
            fulltext=document.get('fulltext'),
            hidden=document.get('hidden'),
            key=document['key'],
            material=document.get('material'),
            original_url=document.get('original_url'),
            url=document['url'],
        )

    return builder.record
Example #3
0
def crawler2hep(crawler_record):
    """Convert a hepcrawl-formatted record to the HEP format and validate it.

    Legacy variant of the conversion: it hardcodes the ``'hepcrawl'`` source,
    supports special collections, and validates the record before returning.

    Args:
        crawler_record(dict): dictionary representing the hepcrawl formatted
            record.

    Returns:
        dict: the HEP formatted record.
    """
    def _filter_affiliation(affiliations):
        # Keep only entries that actually carry a non-empty 'value'.
        return [
            affiliation.get('value') for affiliation in affiliations
            if affiliation.get('value')
        ]

    builder = LiteratureBuilder('hepcrawl')

    for author in crawler_record.get('authors', []):
        builder.add_author(
            builder.make_author(
                author['full_name'],
                affiliations=_filter_affiliation(author['affiliations']),
            ))

    for title in crawler_record.get('titles', []):
        builder.add_title(title=title.get('title'), source=title.get('source'))

    for abstract in crawler_record.get('abstracts', []):
        builder.add_abstract(abstract=abstract.get('value'),
                             source=abstract.get('source'))

    for arxiv_eprint in crawler_record.get('arxiv_eprints', []):
        builder.add_arxiv_eprint(
            arxiv_id=arxiv_eprint.get('value'),
            arxiv_categories=arxiv_eprint.get('categories'))

    for doi in crawler_record.get('dois', []):
        builder.add_doi(doi=doi.get('value'))

    for public_note in crawler_record.get('public_notes', []):
        builder.add_public_note(public_note=public_note.get('value'),
                                source=public_note.get('source'))

    # Renamed the loop variable from ``license`` to avoid shadowing the builtin.
    for license_info in crawler_record.get('license', []):
        builder.add_license(url=license_info.get('url'),
                            license=license_info.get('license'))

    for collaboration in crawler_record.get('collaborations', []):
        builder.add_collaboration(collaboration=collaboration.get('value'))

    for imprint in crawler_record.get('imprints', []):
        builder.add_imprint_date(imprint_date=imprint.get('date'))

    for copyright_entry in crawler_record.get('copyright', []):
        builder.add_copyright(holder=copyright_entry.get('holder'),
                              material=copyright_entry.get('material'),
                              statement=copyright_entry.get('statement'))

    builder.add_preprint_date(
        preprint_date=crawler_record.get('preprint_date'))

    acquisition_source = crawler_record.get('acquisition_source', {})
    builder.add_acquisition_source(
        method='hepcrawl',
        date=acquisition_source.get('date'),
        source=acquisition_source.get('source'),
        submission_number=acquisition_source.get('submission_number'))

    try:
        builder.add_number_of_pages(
            number_of_pages=int(crawler_record.get('page_nr', [])[0]))
    except (TypeError, ValueError, IndexError):
        # 'page_nr' absent, empty, or not a parsable integer: skip silently.
        pass

    # Collections that map onto Inspire publication types.
    publication_types = [
        'introductory',
        'lectures',
        'review',
    ]

    # Collections that map onto Inspire special collections (upper-cased below).
    special_collections = [
        'cdf-internal-note',
        'cdf-note',
        'cds',
        'd0-internal-note',
        'd0-preliminary-note',
        'h1-internal-note',
        'h1-preliminary-note',
        'halhidden',
        'hephidden',
        'hermes-internal-note',
        'larsoft-internal-note',
        'larsoft-note',
        'zeus-internal-note',
        'zeus-preliminary-note',
    ]

    # Collections that map directly onto Inspire document types.
    document_types = [
        'book',
        'note',
        'report',
        'proceedings',
        'thesis',
    ]

    added_doc_type = False

    for collection in crawler_record.get('collections', []):
        collection = collection['primary'].strip().lower()

        if collection == 'arxiv':
            continue  # ignored
        elif collection == 'citeable':
            builder.set_citeable(True)
        elif collection == 'core':
            builder.set_core(True)
        elif collection == 'noncore':
            builder.set_core(False)
        elif collection == 'published':
            builder.set_refereed(True)
        elif collection == 'withdrawn':
            builder.set_withdrawn(True)
        elif collection in publication_types:
            builder.add_publication_type(collection)
        elif collection in special_collections:
            builder.add_special_collection(collection.upper())
        elif collection == 'bookchapter':
            added_doc_type = True
            builder.add_document_type('book chapter')
        elif collection == 'conferencepaper':
            added_doc_type = True
            builder.add_document_type('conference paper')
        elif collection in document_types:
            added_doc_type = True
            builder.add_document_type(collection)

    # Every record must carry at least one document type.
    if not added_doc_type:
        builder.add_document_type('article')

    # Only the first publication_info entry is converted.
    _pub_info = crawler_record.get('publication_info', [{}])[0]
    builder.add_publication_info(
        year=_pub_info.get('year'),
        artid=_pub_info.get('artid'),
        page_end=_pub_info.get('page_end'),
        page_start=_pub_info.get('page_start'),
        journal_issue=_pub_info.get('journal_issue'),
        journal_title=_pub_info.get('journal_title'),
        journal_volume=_pub_info.get('journal_volume'),
        pubinfo_freetext=_pub_info.get('pubinfo_freetext'),
    )

    for report_number in crawler_record.get('report_numbers', []):
        builder.add_report_number(report_number=report_number.get('value'),
                                  source=report_number.get('source'))

    # Raises if the assembled record does not conform to the HEP schema.
    builder.validate_record()

    return builder.record
Example #4
0
class ElsevierParser(object):
    """Parser for the Elsevier format.

    It can be used directly by invoking the :func:`ElsevierParser.parse` method, or be
    subclassed to customize its behavior.

    Args:
        elsevier_record (Union[str, scrapy.selector.Selector]): the record in Elsevier format to parse.
        source (Optional[str]): if provided, sets the ``source`` everywhere in
            the record. Otherwise, the source is extracted from the Elsevier metadata.
    """
    def __init__(self, elsevier_record, source=None):
        self.root = self.get_root_node(elsevier_record)
        if not source:
            source = self.publisher
        self.builder = LiteratureBuilder(source)

    def parse(self):
        """Extract a Elsevier record into an Inspire HEP record.

        Returns:
            dict: the same record in the Inspire Literature schema.
        """
        self.builder.add_abstract(self.abstract)
        self.builder.add_title(self.title, subtitle=self.subtitle)
        self.builder.add_copyright(**self.copyright)
        self.builder.add_document_type(self.document_type)
        self.builder.add_license(**self.license)
        for author in self.authors:
            self.builder.add_author(author)
        self.builder.add_publication_info(**self.publication_info)
        for collab in self.collaborations:
            self.builder.add_collaboration(collab)
        for doi in self.dois:
            self.builder.add_doi(**doi)
        for keyword in self.keywords:
            self.builder.add_keyword(keyword)
        self.builder.add_imprint_date(
            self.publication_date.dumps() if self.publication_date else None)
        for reference in self.references:
            self.builder.add_reference(reference)

        return self.builder.record

    @property
    def references(self):
        """Extract a Elsevier record into an Inspire HEP references record.

        Returns:
            List[dict]: an array of reference schema records, representing
                the references in the record
        """
        ref_nodes = self.root.xpath(".//bib-reference")
        return list(
            itertools.chain.from_iterable(
                self.get_reference_iter(node) for node in ref_nodes))

    # Configuration for ``remove_tags`` when cleaning the abstract: keep
    # sup/sub markup and whole <math> trees, strip pub-id/issn elements.
    remove_tags_config_abstract = {
        "allowed_tags": ["sup", "sub"],
        "allowed_trees": ["math"],
        "strip": "self::pub-id|self::issn",
    }

    @property
    def abstract(self):
        """The abstract of the record, or ``None`` when absent."""
        abstract_nodes = self.root.xpath(
            ".//head/abstract[not(@graphical)]/abstract-sec/simple-para")

        if not abstract_nodes:
            return

        abstract_paragraphs = [
            remove_tags(abstract_node,
                        **self.remove_tags_config_abstract).strip("/ \n")
            for abstract_node in abstract_nodes
        ]
        abstract = ' '.join(abstract_paragraphs)
        return abstract

    @property
    def article_type(self):
        """Return a article type mapped from abbreviation."""
        abbrv_doctype = self.root.xpath(".//@docsubtype").extract_first()
        article_type = DOCTYPE_MAPPING.get(abbrv_doctype)
        return article_type

    @property
    def artid(self):
        """Article id (``aid``) from the item info."""
        artid = self.root.xpath("string(./*/item-info/aid[1])").extract_first()
        return artid

    @property
    def authors(self):
        """All authors of the record, collected from every author group."""
        author_nodes = self.root.xpath("./*/head/author-group")
        all_authors = []
        for author_group in author_nodes:
            authors = [
                self.get_author(author, author_group)
                for author in author_group.xpath("./author")
            ]
            all_authors.extend(authors)
        return all_authors

    @property
    def collaborations(self):
        """Names of the collaborations taking part in the article."""
        collaborations = self.root.xpath(
            "./*/head/author-group//collaboration/text/text()").extract()
        return collaborations

    @property
    def copyright(self):
        """Copyright metadata, ready for ``LiteratureBuilder.add_copyright``."""
        copyright = {
            "holder": self.copyright_holder,
            "material": self.material,
            "statement": self.copyright_statement,
            "year": self.copyright_year,
        }

        return copyright

    @property
    def copyright_holder(self):
        """Copyright holder, falling back to a mapping keyed on the copyright type."""
        copyright_holder = self.root.xpath(
            "string(./*/item-info/copyright[@type][1])").extract_first()
        if not copyright_holder:
            copyright_type = self.root.xpath(
                "./*/item-info/copyright/@type").extract_first()
            copyright_holder = COPYRIGHT_MAPPING.get(copyright_type)

        return copyright_holder

    @property
    def copyright_statement(self):
        """Copyright statement, preferring the RDF description over the item info."""
        copyright_statement = self.root.xpath(
            "string(./RDF/Description/copyright[1])").extract_first()
        if not copyright_statement:
            copyright_statement = self.root.xpath(
                "string(./*/item-info/copyright[@type][1])").extract_first()

        return copyright_statement

    @property
    def copyright_year(self):
        """Copyright year as an int, or ``None`` when absent or not numeric."""
        copyright_year = self.root.xpath(
            "./*/item-info/copyright[@type]/@year").extract_first()

        return maybe_int(copyright_year)

    @property
    def dois(self):
        """The DOI of the record, together with the kind of material."""
        doi = self.root.xpath(
            "string(./RDF/Description/doi[1])").extract_first()
        return [{"doi": doi, "material": self.material}]

    @property
    def document_type(self):
        """Document type: article, book, book chapter or conference paper."""
        doctype = None
        if self.root.xpath(
                "./*[contains(name(),'article') or self::book-review]"):
            doctype = "article"
        elif self.root.xpath("./*[self::book or self::simple-book]"):
            doctype = "book"
        elif self.root.xpath("./book-chapter"):
            doctype = "book chapter"
        # A conference paper overrides any of the above.
        if self.is_conference_paper:
            doctype = "conference paper"
        if doctype:
            return doctype

    @property
    def is_conference_paper(self):
        """Decide whether the article is a conference paper."""
        if self.root.xpath("./conference-info"):
            return True
        journal_issue = self.root.xpath(
            "string(./RDF/Description/issueName[1])").extract_first()
        if journal_issue:
            # Dot escaped so only a literal "proc." matches, not "proc"
            # followed by any character.
            is_conference = re.findall(r"proceedings|proc\.",
                                       journal_issue.lower())
            return bool(is_conference)
        return False

    @property
    def journal_title(self):
        """Journal title, with section suffixes and commas stripped."""
        jid = self.root.xpath("string(./*/item-info/jid[1])").extract_first(
            default="")
        publication = self.root.xpath(
            "string(./RDF/Description/publicationName[1])").extract_first(
                default=jid)
        publication = re.sub(" [S|s]ection", "",
                             publication).replace(",", "").strip()
        return publication

    @property
    def journal_issue(self):
        """First issue number of the serial issue, if any."""
        journal_issue = self.root.xpath(
            "string(./serial-issue/issue-info/issue-first[1])").extract_first(
            )

        return journal_issue

    @property
    def journal_volume(self):
        """Journal volume from the RDF description."""
        journal_volume = self.root.xpath(
            "string(./RDF/Description/volume[1])").extract_first()

        return journal_volume

    @property
    def keywords(self):
        """Non-abbreviated keywords of the record."""
        keywords = self.root.xpath(
            "./*/head/keywords[not(@abr)]/keyword/text/text()").getall()

        return keywords

    @property
    def license(self):
        """License metadata, ready for ``LiteratureBuilder.add_license``."""
        license = {
            "license": self.license_statement,
            "material": self.material,
            "url": self.license_url,
        }

        return license

    @property
    def license_statement(self):
        """License line from the RDF description."""
        license_statement = self.root.xpath(
            "string(./RDF/Description/licenseLine[1])").extract_first()

        return license_statement

    @property
    def license_url(self):
        """User license URL from the open access information."""
        license_url = self.root.xpath(
            "string(./RDF/Description/openAccessInformation/userLicense[1])"
        ).extract_first()

        return license_url

    @property
    def material(self):
        """Kind of material of this record (e.g. ``'erratum'``, ``'publication'``)."""
        if self.article_type in (
                "erratum",
                "addendum",
                "retraction",
                "removal",
                "duplicate",
        ):
            material = self.article_type
        elif self.article_type in ("editorial", "publisher's note"):
            material = "editorial note"
        else:
            material = "publication"

        return material

    @property
    def page_start(self):
        """Starting page from the RDF description."""
        page_start = self.root.xpath(
            "string(./RDF/Description/startingPage[1])").extract_first()
        return page_start

    @property
    def page_end(self):
        """Ending page from the RDF description."""
        page_end = self.root.xpath(
            "string(./RDF/Description/endingPage[1])").extract_first()
        return page_end

    @property
    def publication_date(self):
        """Publication date as a ``PartialDate``, or ``None`` when absent."""
        publication_date = None
        publication_date_string = self.root.xpath(
            "string(./RDF/Description/coverDisplayDate[1])").extract_first()
        if publication_date_string:
            try:
                publication_date = PartialDate.parse(publication_date_string)
            except ValueError:
                # in case when date contains month range, eg. July-September 2020:
                # drop the leading month of the range and retry.
                # ([A-Za-z] is the intended letter class; the original
                # [A-aZ-z] also matched several punctuation characters.)
                publication_date = re.sub(r"[A-Za-z]*-(?=[A-Za-z])", "",
                                          publication_date_string)
                publication_date = PartialDate.parse(publication_date)
        return publication_date

    @property
    def publication_info(self):
        """Publication info, ready for ``LiteratureBuilder.add_publication_info``."""
        publication_info = {
            "artid": self.artid,
            "journal_title": self.journal_title,
            "journal_issue": self.journal_issue,
            "journal_volume": self.journal_volume,
            "material": self.material,
            "page_start": self.page_start,
            "page_end": self.page_end,
            "year": self.year,
        }

        return publication_info

    @property
    def publisher(self):
        """Publisher name, defaulting to ``'Elsevier B.V.'``."""
        publisher = self.root.xpath("string(./RDF/Description/publisher[1])"
                                    ).extract_first("Elsevier B.V.")

        return publisher

    @property
    def subtitle(self):
        """Subtitle of the record, if any."""
        subtitle = self.root.xpath(
            "string(./*/head/subtitle[1])").extract_first()

        return subtitle

    @property
    def title(self):
        """Title of the record, stripped of surrounding newlines."""
        title = self.root.xpath("string(./*/head/title[1])").extract_first()

        return title.strip("\n") if title else None

    @property
    def year(self):
        """Year of the publication date, or ``None`` when there is no date."""
        if self.publication_date:
            return self.publication_date.year

    def get_author_affiliations(self, author_node, author_group_node):
        """Extract an author's affiliations."""
        ref_ids = author_node.xpath(".//@refid[contains(., 'af')]").extract()
        group_affs = author_group_node.xpath(
            "string(./affiliation/textfn[1])").getall()
        if ref_ids:
            affiliations = self._find_affiliations_by_id(
                author_group_node, ref_ids)
        else:
            # Materialize so both branches return a list (the lazy ``filter``
            # object could previously only be consumed once).
            affiliations = list(filter(None, group_affs))
        return affiliations

    @staticmethod
    def _find_affiliations_by_id(author_group, ref_ids):
        """Return affiliations with given ids.

        Affiliations should be standardized later.
        """
        affiliations_by_id = []
        for aff_id in ref_ids:
            affiliation = author_group.xpath(
                "string(//affiliation[@id='{}']/textfn[1])".format(
                    aff_id)).extract_first()
            affiliations_by_id.append(affiliation)

        return affiliations_by_id

    def get_author_emails(self, author_node):
        """Extract an author's email addresses.

        NOTE(review): ``string(...)`` always yields exactly one string, so
        this returns ``['']`` when no email is present — confirm downstream
        code tolerates the empty string.
        """
        emails = author_node.xpath(
            'string(./e-address[@type="email"][1])').getall()

        return emails

    @staticmethod
    def get_author_name(author_node):
        """Extract an author's name."""
        surname = author_node.xpath("string(./surname[1])").extract_first()
        given_names = author_node.xpath(
            "string(./given-name[1])").extract_first()
        suffix = author_node.xpath("string(.//suffix[1])").extract_first()
        author_name = ", ".join(el for el in (surname, given_names, suffix)
                                if el)

        return author_name

    @staticmethod
    def get_root_node(elsevier_record):
        """Get a selector on the root ``article`` node of the record.

        This can be overridden in case some preprocessing needs to be done on
        the XML.

        Args:
            elsevier_record(Union[str, scrapy.selector.Selector]): the record in Elsevier format.

        Returns:
            scrapy.selector.Selector: a selector on the root ``<article>``
                node.
        """
        if isinstance(elsevier_record, six.string_types):
            root = get_node(elsevier_record)
        else:
            root = elsevier_record
        root.remove_namespaces()

        return root

    def get_author(self, author_node, author_group_node):
        """Extract one author.

        Args:
            author_node(scrapy.selector.Selector): a selector on a single
                author, e.g. a ``<contrib contrib-type="author">``.

        Returns:
            dict: the parsed author, conforming to the Inspire schema.
        """
        author_name = self.get_author_name(author_node)
        emails = self.get_author_emails(author_node)
        affiliations = self.get_author_affiliations(author_node,
                                                    author_group_node)

        return self.builder.make_author(author_name,
                                        raw_affiliations=affiliations,
                                        emails=emails)

    @staticmethod
    def get_reference_authors(ref_node):
        """Extract authors from a reference node.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single reference.

        Returns:
            List[str]: list of names
        """
        authors = ref_node.xpath("./contribution/authors/author")
        authors_names = []
        for author in authors:
            given_names = author.xpath(
                "string(./given-name[1])").extract_first(default="")
            last_names = author.xpath("string(./surname[1])").extract_first(
                default="")
            authors_names.append(" ".join([given_names, last_names]).strip())
        return authors_names

    @staticmethod
    def get_reference_editors(ref_node):
        """Extract authors of `role` from a reference node.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single reference.

        Returns:
            List[str]: list of names
        """
        editors = ref_node.xpath(".//editors/authors/author")
        editors_names = []
        for editor in editors:
            given_names = editor.xpath(
                "string(./given-name[1])").extract_first(default="")
            last_names = editor.xpath("string(./surname[1])").extract_first(
                default="")
            editors_names.append(" ".join([given_names, last_names]).strip())
        return editors_names

    @staticmethod
    def get_reference_artid(ref_node):
        """Article number of a reference, if any."""
        return ref_node.xpath("string(.//article-number[1])").extract_first()

    @staticmethod
    def get_reference_pages(ref_node):
        """First and last page of a reference, as a tuple."""
        first_page = ref_node.xpath(
            "string(.//pages/first-page[1])").extract_first()
        last_page = ref_node.xpath(
            "string(.//pages/last-page[1])").extract_first()
        return first_page, last_page

    def get_reference_iter(self, ref_node):
        """Extract one reference.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single
                reference, i.e. ``<ref>``.

        Yields:
            dict: the parsed reference, as generated by
                :class:`inspire_schemas.api.ReferenceBuilder`
        """
        # handle also unstructured refs
        for citation_node in ref_node.xpath("./reference|./other-ref"):
            builder = ReferenceBuilder()

            builder.add_raw_reference(
                ref_node.extract().strip(),
                source=self.builder.source,
                ref_format="Elsevier",
            )

            fields = [
                (
                    ("string(.//series/title/maintitle[1])"),
                    builder.set_journal_title,
                ),
                (
                    "string(.//title[parent::edited-book|parent::book]/maintitle[1])",
                    builder.add_parent_title,
                ),
                ("string(./publisher/name[1])", builder.set_publisher),
                ("string(.//volume-nr[1])", builder.set_journal_volume),
                ("string(.//issue-nr[1])", builder.set_journal_issue),
                ("string(.//date[1])", builder.set_year),
                ("string(.//inter-ref[1])", builder.add_url),
                ("string(.//doi[1])", builder.add_uid),
                (
                    'string(pub-id[@pub-id-type="other"]'
                    '[contains(preceding-sibling::text(),"Report No")][1])',
                    builder.add_report_number,
                ),
                ("string(./title/maintitle[1])", builder.add_title),
            ]
            for xpath, field_handler in fields:
                value = citation_node.xpath(xpath).extract_first()
                if value:
                    field_handler(value)

            label_value = ref_node.xpath("string(./label[1])").extract_first()
            # ``string(...)`` yields '' when the label is missing; do not set
            # an empty label.
            if label_value:
                builder.set_label(label_value.strip("[]"))

            pages = self.get_reference_pages(citation_node)
            artid = self.get_reference_artid(citation_node)
            if artid:
                builder.set_page_artid(artid=artid)
            if any(pages):
                builder.set_page_artid(*pages)

            # Whatever is left after stripping the structured fields goes
            # into ``misc``.
            remainder = (remove_tags(
                citation_node,
                strip="self::authors"
                "|self::article-number"
                "|self::volume-nr"
                "|self::issue-nr"
                "|self::inter-ref"
                "|self::maintitle"
                "|self::date"
                "|self::label"
                "|self::publisher"
                "|self::doi"
                "|self::pages").strip("\"';,. \t\n\r").replace("()", ""))
            if remainder:
                builder.add_misc(remainder)

            for editor in self.get_reference_editors(citation_node):
                builder.add_author(editor, "editor")

            for author in self.get_reference_authors(citation_node):
                builder.add_author(author, "author")

            yield builder.obj

    def attach_fulltext_document(self, file_name, url):
        """Attach the full-text document to the record (hidden)."""
        self.builder.add_document(file_name, url, fulltext=True, hidden=True)

    def get_identifier(self):
        """Return the DOI of the record, used as its identifier."""
        return self.dois[0]["doi"]

    def should_record_be_harvested(self):
        """Whether the record has enough metadata to be worth harvesting."""
        if self.article_type in DOCTYPES_TO_HARVEST and all([
                self.title,
                self.journal_title,
                self.journal_volume,
            (self.artid or self.page_start),
        ]):
            return True
        return False
Example #5
0
File: jats.py  Project: drjova/hepcrawl
class JatsParser(object):
    """Parser for the JATS format.

    It can be used directly by invoking the :func:`JatsParser.parse` method, or be
    subclassed to customize its behavior.

    Args:
        jats_record (Union[str, scrapy.selector.Selector]): the record in JATS format to parse.
        source (Optional[str]): if provided, sets the ``source`` everywhere in
            the record. Otherwise, the source is extracted from the JATS metadata.
    """
    def __init__(self, jats_record, source=None):
        # Build a selector over the record; when no explicit source is given,
        # fall back to the publisher extracted from the JATS metadata.
        self.root = self.get_root_node(jats_record)
        self.builder = LiteratureBuilder(source or self.publisher)

    def parse(self):
        """Extract a JATS record into an Inspire HEP record.

        Returns:
            dict: the same record in the Inspire Literature schema.
        """
        # The order of the builder calls mirrors the Literature schema fields.
        self.builder.add_abstract(self.abstract)
        self.builder.add_title(self.title, subtitle=self.subtitle)
        self.builder.add_copyright(**self.copyright)
        self.builder.add_document_type(self.document_type)
        self.builder.add_license(**self.license)
        for author in self.authors:
            self.builder.add_author(author)
        self.builder.add_number_of_pages(self.number_of_pages)
        self.builder.add_publication_info(**self.publication_info)
        for collab in self.collaborations:
            self.builder.add_collaboration(collab)
        for doi in self.dois:
            self.builder.add_doi(**doi)
        for keyword in self.keywords:
            self.builder.add_keyword(**keyword)
        # NOTE(review): unlike the Elsevier parser, no guard against a missing
        # publication date -- ``publication_date`` raises ``ValueError`` when
        # the record has no date nodes. Confirm all JATS records carry one.
        self.builder.add_imprint_date(self.publication_date.dumps())
        for reference in self.references:
            self.builder.add_reference(reference)

        return self.builder.record

    @property
    def references(self):
        """All references of the record, as Inspire reference objects.

        Returns:
            List[dict]: an array of reference schema records, representing
                the references in the record
        """
        parsed_references = []
        for ref_node in self.root.xpath('./back/ref-list/ref'):
            parsed_references.extend(self.get_reference(ref_node))
        return parsed_references

    # Configuration for ``remove_tags`` when cleaning the abstract: keep
    # sup/sub markup and whole <math> trees, strip pub-id/issn elements.
    remove_tags_config_abstract = {
        'allowed_tags': ['sup', 'sub'],
        'allowed_trees': ['math'],
        'strip': 'self::pub-id|self::issn'
    }

    @property
    def abstract(self):
        """The record's abstract, cleaned of extraneous markup, or ``None``."""
        nodes = self.root.xpath('./front//abstract[1]')
        if not nodes:
            return None
        return remove_tags(nodes[0], **self.remove_tags_config_abstract).strip()

    @property
    def article_type(self):
        article_type = self.root.xpath('./@article-type').extract_first()

        return article_type

    @property
    def artid(self):
        artid = self.root.xpath('./front/article-meta//elocation-id//text()').extract_first()

        return artid

    @property
    def authors(self):
        """All parsed authors of the record."""
        return [
            self.get_author(node)
            for node in self.root.xpath('./front//contrib[@contrib-type="author"]')
        ]

    @property
    def collaborations(self):
        """Names of the collaborations taking part, deduplicated."""
        nodes = self.root.xpath(
            './front//collab |'
            './front//contrib[@contrib-type="collaboration"] |'
            './front//on-behalf-of'
        )
        return {node.xpath('string(.)').extract_first() for node in nodes}

    @property
    def copyright(self):
        """Copyright metadata, ready for ``LiteratureBuilder.add_copyright``."""
        return {
            'holder': self.copyright_holder,
            'material': self.material,
            'statement': self.copyright_statement,
            'year': self.copyright_year,
        }

    @property
    def copyright_holder(self):
        copyright_holder = self.root.xpath('./front//copyright-holder/text()').extract_first()

        return copyright_holder

    @property
    def copyright_statement(self):
        """Copyright statement from the front matter, if any."""
        return self.root.xpath(
            './front//copyright-statement/text()').extract_first()

    @property
    def copyright_year(self):
        """Copyright year as an int, or ``None`` when absent or not numeric."""
        raw_year = self.root.xpath('./front//copyright-year/text()').extract_first()
        return maybe_int(raw_year)

    @property
    def dois(self):
        """DOIs of the record, plus related DOIs for errata and the like."""
        material = self.material
        values = self.root.xpath(
            './front/article-meta//article-id[@pub-id-type="doi"]/text()'
        ).extract()
        dois = [{'doi': value, 'material': material} for value in values]

        # Errata and similar materials also reference the DOI of the
        # original publication via a related-article link.
        if material != 'publication':
            related_values = self.root.xpath(
                './front/article-meta//related-article[@ext-link-type="doi"]/@href'
            ).extract()
            dois.extend({'doi': value} for value in related_values)

        return dois

    @property
    def document_type(self):
        if self.is_conference_paper:
            document_type = 'conference paper'
        else:
            document_type = 'article'

        return document_type

    @property
    def is_conference_paper(self):
        """Decide whether the article is a conference paper."""
        conference_node = self.root.xpath('./front//conference').extract_first()

        return bool(conference_node)

    @property
    def journal_title(self):
        """Journal title (abbreviated or full) from the journal metadata."""
        return self.root.xpath(
            './front/journal-meta//abbrev-journal-title/text() |'
            './front/journal-meta//journal-title/text()'
        ).extract_first()

    @property
    def journal_issue(self):
        journal_issue = self.root.xpath('./front/article-meta/issue/text()').extract_first()

        return journal_issue

    @property
    def journal_volume(self):
        journal_volume = self.root.xpath('./front/article-meta/volume/text()').extract_first()

        return journal_volume

    @property
    def keywords(self):
        """All keywords of the record, as dicts with ``keyword`` and ``schema``."""
        groups = self.root.xpath('./front//kwd-group')
        return itertools.chain.from_iterable(
            self.get_keywords(group) for group in groups
        )

    @property
    def license(self):
        """License metadata, ready for ``LiteratureBuilder.add_license``."""
        return {
            'license': self.license_statement,
            'material': self.material,
            'url': self.license_url,
        }

    @property
    def license_statement(self):
        """Text content of the license node, stripped of whitespace."""
        raw = self.root.xpath('string(./front/article-meta//license)').extract_first()
        return raw.strip()

    @property
    def license_url(self):
        """URL of the license, from any of the known locations in the metadata."""
        query = (
            './front/article-meta//license_ref/text() |'
            './front/article-meta//license/@href |'
            './front/article-meta//license//ext-link/@href'
        )
        return self.root.xpath(query).extract_first()

    @property
    def material(self):
        if self.article_type.startswith('correc'):
            material = 'erratum'
        elif self.article_type in ('erratum', 'translation', 'addendum', 'reprint'):
            material = self.article_type
        else:
            material = 'publication'

        return material

    @property
    def number_of_pages(self):
        """Page count as an int, or ``None`` when absent or not numeric."""
        raw_count = self.root.xpath(
            './front/article-meta//page-count/@count').extract_first()
        return maybe_int(raw_count)

    @property
    def page_start(self):
        page_start = self.root.xpath('./front/article-meta/fpage/text()').extract_first()

        return page_start

    @property
    def page_end(self):
        page_end = self.root.xpath('./front/article-meta/lpage/text()').extract_first()

        return page_end

    @property
    def publication_date(self):
        """Earliest publication date found in the record.

        Considers both print and electronic publication dates.

        NOTE(review): ``min`` over an empty sequence raises ``ValueError``
        when the record carries no date nodes at all -- confirm callers can
        rely on a date always being present.
        """
        date_nodes = self.root.xpath(
            './front//pub-date[@pub-type="ppub"] |'
            './front//pub-date[@pub-type="epub"] |'
            './front//pub-date[starts-with(@date-type,"pub")] |'
            './front//date[starts-with(@date-type,"pub")]'
        )
        publication_date = min(
            self.get_date(date_node) for date_node in date_nodes
        )

        return publication_date

    @property
    def publication_info(self):
        """Publication info, ready for ``LiteratureBuilder.add_publication_info``."""
        return {
            'artid': self.artid,
            'journal_title': self.journal_title,
            'journal_issue': self.journal_issue,
            'journal_volume': self.journal_volume,
            'material': self.material,
            'page_start': self.page_start,
            'page_end': self.page_end,
            'year': self.year,
        }

    @property
    def publisher(self):
        publisher = self.root.xpath('./front//publisher-name/text()').extract_first()

        return publisher

    @property
    def subtitle(self):
        """Subtitle of the record, as a plain string."""
        return self.root.xpath('string(./front//subtitle)').extract_first()

    @property
    def title(self):
        title = self.root.xpath('string(./front//article-title)').extract_first()

        return title

    def get_affiliation(self, id_):
        """Get the affiliation with the specified id.

        Args:
            id_(str): the value of the ``id`` attribute of the affiliation.

        Returns:
            str: the text of the first matching affiliation, with labels and
                emails stripped out.
        """
        node = self.root.xpath('//aff[@id=$id_]', id_=id_)[0]
        return remove_tags(node, strip='self::label | self::email').strip()

    def get_emails_from_refs(self, id_):
        """Get the emails from the node with the specified id.

        Args:
            id_(str): the value of the ``id`` attribute of the node.

        Returns:
            List[str]: the emails from the node with that id or [] if none found.
        """
        return self.root.xpath('//aff[@id=$id_]/email/text()', id_=id_).extract()

    @property
    def year(self):
        """Year of the earliest non-electronic publication date.

        NOTE(review): ``not_online`` is passed as an XPath *string variable*
        and used as ``$not_online`` in a boolean context, where any non-empty
        string is truthy -- so the publication-format filter likely never
        takes effect (the string also has an unbalanced parenthesis in its
        second clause). Confirm the intended behavior upstream before
        relying on the filter.
        """
        not_online = (
            'not(starts-with(@publication-format, "elec"))'
            ' and not(starts-with(@publication-format, "online")'
        )
        date_nodes = self.root.xpath(
            './front//pub-date[@pub-type="ppub"] |'
            './front//pub-date[starts-with(@date-type,"pub") and $not_online] |'
            './front//date[starts-with(@date-type,"pub") and $not_online]',
            not_online=not_online
        )

        year = min(
            self.get_date(date_node) for date_node in date_nodes
        ).year

        return year

    def get_author_affiliations(self, author_node):
        """Extract an author's affiliations.

        Args:
            author_node(scrapy.selector.Selector): a selector on a single author.

        Returns:
            List[str]: the non-empty affiliations referenced by the author.
        """
        raw_referred_ids = author_node.xpath('.//xref[@ref-type="aff"]/@rid').extract()
        # Sometimes the rid might have more than one ID (e.g. rid="id0 id1")
        referred_ids = set()
        for raw_referred_id in raw_referred_ids:
            referred_ids.update(raw_referred_id.split(' '))

        # Resolve each id exactly once (the previous version called
        # ``get_affiliation`` twice per id: once to filter, once to collect).
        affiliations = []
        for referred_id in referred_ids:
            affiliation = self.get_affiliation(referred_id)
            if affiliation:
                affiliations.append(affiliation)

        return affiliations

    def get_author_emails(self, author_node):
        """Extract an author's email addresses.

        Includes emails attached directly to the author as well as those
        reachable through the author's affiliation references.
        """
        emails = author_node.xpath('.//email/text()').extract()
        for referred_id in author_node.xpath('.//xref[@ref-type="aff"]/@rid').extract():
            emails.extend(self.get_emails_from_refs(referred_id))
        return emails

    @staticmethod
    def get_author_name(author_node):
        """Extract an author's name."""
        surname = author_node.xpath('.//surname/text()').extract_first()
        if not surname:
            # the author name is unstructured
            author_name = author_node.xpath('string(./string-name)').extract_first()
        given_names = author_node.xpath('.//given-names/text()').extract_first()
        suffix = author_node.xpath('.//suffix/text()').extract_first()
        author_name = ', '.join(el for el in (surname, given_names, suffix) if el)

        return author_name

    @staticmethod
    def get_date(date_node):
        """Extract a date from a date node.

        Tries, in order of preference: the machine-readable
        ``iso-8601-date`` attribute, the year/month/day child elements,
        and finally a free-text ``string-date``.

        Returns:
            PartialDate: the parsed date.
        """
        # Preferred: machine-readable ISO-8601 attribute.
        iso_string = date_node.xpath('./@iso-8601-date').extract_first()
        iso_date = PartialDate.loads(iso_string) if iso_string else None

        # Next: structured year/month/day elements (year is required).
        year = date_node.xpath('string(./year)').extract_first()
        month = date_node.xpath('string(./month)').extract_first()
        day = date_node.xpath('string(./day)').extract_first()
        date_from_parts = PartialDate.from_parts(year, month, day) if year else None

        # Last resort: free-text date, which may fail to parse.
        string_date = date_node.xpath('string(./string-date)').extract_first()
        try:
            parsed_date = PartialDate.parse(string_date)
        except ValueError:
            parsed_date = None

        # ``get_first`` picks the first non-None candidate in order.
        date = get_first([iso_date, date_from_parts, parsed_date])
        return date

    @staticmethod
    def get_keywords(group_node):
        """Extract keywords from a keyword group."""
        schema = None
        if 'pacs' in group_node.xpath('@kwd-group-type').extract_first(default='').lower():
            schema = 'PACS'

        keywords = (kwd.xpath('string(.)').extract_first() for kwd in group_node.xpath('.//kwd'))
        keyword_dicts = ({'keyword': keyword, 'schema': schema} for keyword in keywords)

        return keyword_dicts

    @staticmethod
    def get_root_node(jats_record):
        """Get a selector on the root ``article`` node of the record.

        This can be overridden in case some preprocessing needs to be done on
        the XML.

        Args:
            jats_record(Union[str, scrapy.selector.Selector]): the record in JATS format.

        Returns:
            scrapy.selector.Selector: a selector on the root ``<article>``
                node.
        """
        # A raw string gets wrapped into a selector; a selector is used as-is.
        root = (
            get_node(jats_record)
            if isinstance(jats_record, six.string_types)
            else jats_record
        )
        # Strip namespaces so the xpaths elsewhere need no prefixes.
        root.remove_namespaces()

        return root

    def get_author(self, author_node):
        """Extract one author.

        Args:
            author_node(scrapy.selector.Selector): a selector on a single
                author, e.g. a ``<contrib contrib-type="author">``.

        Returns:
            dict: the parsed author, conforming to the Inspire schema.
        """
        name = self.get_author_name(author_node)
        email_list = self.get_author_emails(author_node)
        raw_affs = self.get_author_affiliations(author_node)

        return self.builder.make_author(
            name,
            raw_affiliations=raw_affs,
            emails=email_list,
        )

    @staticmethod
    def get_reference_authors(ref_node, role):
        """Extract authors of `role` from a reference node.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single reference.
            role(str): author role

        Returns:
            List[str]: list of names
        """
        return ref_node.xpath(
            './person-group[@person-group-type=$role]/string-name/text()',
            role=role
        ).extract()


    def get_reference(self, ref_node):
        """Extract one reference.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single
                reference, i.e. ``<ref>``.

        Yields:
            dict: the parsed reference, as generated by
                :class:`inspire_schemas.api.ReferenceBuilder`
        """
        for citation_node in ref_node.xpath('./mixed-citation'):
            builder = ReferenceBuilder()

            # Keep the raw reference around regardless of how much of it we
            # manage to parse into structured fields.
            builder.add_raw_reference(
                ref_node.extract().strip(),
                source=self.builder.source,
                ref_format='JATS'
            )

            # Map an xpath (relative to the citation node) to the builder
            # method that consumes its first extracted value.
            fields = [
                (
                    (
                        'self::node()[@publication-type="journal" '
                        'or @publication-type="eprint"]/source/text()'
                    ),
                    builder.set_journal_title,
                ),
                (
                    'self::node()[@publication-type="book"]/source/text()',
                    builder.add_parent_title,
                ),
                ('./publisher-name/text()', builder.set_publisher),
                ('./volume/text()', builder.set_journal_volume),
                ('./issue/text()', builder.set_journal_issue),
                ('./year/text()', builder.set_year),
                ('./pub-id[@pub-id-type="arxiv"]/text()', builder.add_uid),
                ('./pub-id[@pub-id-type="doi"]/text()', builder.add_uid),
                (
                    'pub-id[@pub-id-type="other"]'
                    '[contains(preceding-sibling::text(),"Report No")]/text()',
                    builder.add_report_number
                ),
                ('./article-title/text()', builder.add_title),
                ('../label/text()', lambda x: builder.set_label(x.strip('[].')))
            ]

            for xpath, field_handler in fields:
                # NOTE: the previous version evaluated every xpath a second
                # time and discarded the result; that dead statement is gone.
                value = citation_node.xpath(xpath).extract_first()
                if value:
                    field_handler(value)

            # Whatever is left after stripping the structured elements goes
            # into ``misc``.
            remainder = remove_tags(
                citation_node,
                strip='self::person-group'
                      '|self::pub-id'
                      '|self::article-title'
                      '|self::volume'
                      '|self::issue'
                      '|self::year'
                      '|self::label'
                      '|self::publisher-name'
                      '|self::source[../@publication-type!="proc"]'
                      '|self::object-id'
                      '|self::page-range'
                      '|self::issn'
            ).strip('"\';,. \t\n\r').replace('()', '')
            if remainder:
                builder.add_misc(remainder)

            for editor in self.get_reference_authors(citation_node, 'editor'):
                builder.add_author(editor, 'editor')

            for author in self.get_reference_authors(citation_node, 'author'):
                builder.add_author(author, 'author')

            page_range = citation_node.xpath('./page-range/text()').extract_first()
            if page_range:
                page_artid = split_page_artid(page_range)
                builder.set_page_artid(*page_artid)

            yield builder.obj


    def attach_fulltext_document(self, file_name, url):
        """Attach ``url`` to the record as a hidden fulltext document."""
        self.builder.add_document(
            file_name,
            url,
            fulltext=True,
            hidden=True,
        )