def authorlist_without_affiliations(text):
    """Return a record containing the authors.

    The input is split on commas and on the word ``and``; no affiliation
    information is extracted.
    """
    builder = LiteratureBuilder()
    full_names = text.replace(' and ', ', ').split(', ')
    for full_name in full_names:
        builder.add_author(builder.make_author(full_name))
    return {'authors': builder.record['authors']}
def authorlist(text):
    """Return an author-structure parsed from ``text``.

    The result may also carry additional information produced by the
    parser besides the ``authors`` key.
    """
    builder = LiteratureBuilder()
    cleaned = replace_undesirable_characters(text)
    parsed = create_authors(cleaned)

    if "authors" in parsed:
        for name, affiliations in parsed["authors"]:
            author = builder.make_author(name, raw_affiliations=affiliations)
            builder.add_author(author)
        parsed["authors"] = builder.record["authors"]

    return parsed
def authorlist(text):
    """Return an author-structure parsed from ``text``.

    May also return additional information produced by the parser.
    """
    # Imports are kept local, as in the surrounding module's convention.
    from inspire_schemas.api import LiteratureBuilder
    from refextract.documents.pdf import replace_undesirable_characters
    from inspirehep.modules.tools.authorlist import create_authors

    builder = LiteratureBuilder()
    cleaned_text = replace_undesirable_characters(text)
    parsed = create_authors(cleaned_text)

    for fullname, raw_affiliations in parsed.get('authors', []):
        builder.add_author(
            builder.make_author(fullname, raw_affiliations=raw_affiliations))

    if 'authors' in parsed:
        parsed['authors'] = builder.record['authors']
    return parsed
def authorlist(text):
    """Return an author-structure parsed from ``text``,
    together with any additional information the parser produces.
    """
    builder = LiteratureBuilder()
    cleaned = replace_undesirable_characters(text)
    parsed = create_authors(cleaned)

    if "authors" not in parsed:
        return parsed

    for raw_author in parsed["authors"]:
        structured = builder.make_author(
            raw_author.get("fullname"),
            raw_affiliations=raw_author.get("affiliations", []),
            ids=raw_author.get("ids", []),
            emails=raw_author.get("emails", []),
        )
        builder.add_author(structured)

    parsed["authors"] = builder.record["authors"]
    return parsed
def authorlist(text):
    """Return an author-structure parsed from ``text``,
    plus optional additional information from the parser.
    """
    # Local imports, matching the module's convention.
    from inspire_schemas.api import LiteratureBuilder
    from refextract.documents.pdf import replace_undesirable_characters
    from inspirehep.modules.tools.authorlist import create_authors

    builder = LiteratureBuilder()
    result = create_authors(replace_undesirable_characters(text))

    if 'authors' not in result:
        return result

    for name, affiliations in result['authors']:
        author = builder.make_author(name, raw_affiliations=affiliations)
        builder.add_author(author)

    result['authors'] = builder.record['authors']
    return result
def formdata_to_model(obj, formdata):
    """Manipulate form data to match literature data model.

    Args:
        obj: workflow object; ``obj.extra_data`` may be written, and
            ``obj.id`` / ``obj.id_user`` are read for the acquisition source.
        formdata (dict): raw submission form data (not mutated; a deep copy
            is taken).

    Returns:
        dict: the record in the Inspire Literature schema.
    """
    def _is_arxiv_url(url):
        return 'arxiv.org' in url

    form_fields = copy.deepcopy(formdata)
    filter_empty_elements(form_fields,
                          ['authors', 'supervisors', 'report_numbers'])

    builder = LiteratureBuilder(source='submitter')

    for author in form_fields.get('authors', []):
        builder.add_author(
            builder.make_author(author['full_name'],
                                affiliations=force_list(author['affiliation'])
                                if author['affiliation'] else None,
                                roles=['author']))

    for supervisor in form_fields.get('supervisors', []):
        # FIX: the affiliation guard previously tested the stale `author`
        # variable left over from the loop above (a NameError when there are
        # no authors, and the wrong author's affiliation otherwise).
        builder.add_author(
            builder.make_author(
                supervisor['full_name'],
                affiliations=force_list(supervisor['affiliation'])
                if supervisor['affiliation'] else None,
                roles=['supervisor']))

    builder.add_title(title=form_fields.get('title'))

    # A conference name forces the document type, regardless of the form.
    document_type = 'conference paper' if form_fields.get('conf_name') \
        else form_fields.get('type_of_doc', [])
    if document_type == 'chapter':
        document_type = 'book chapter'
    builder.add_document_type(document_type=document_type)

    builder.add_abstract(
        abstract=form_fields.get('abstract'),
        source='arXiv' if form_fields.get('categories') else None)

    if form_fields.get('arxiv_id') and form_fields.get('categories'):
        builder.add_arxiv_eprint(
            arxiv_id=form_fields.get('arxiv_id'),
            arxiv_categories=form_fields.get('categories').split())

    builder.add_doi(doi=form_fields.get('doi'))

    builder.add_inspire_categories(
        subject_terms=form_fields.get('subject_term'), source='user')

    for key in ('extra_comments', 'nonpublic_note', 'hidden_notes',
                'conf_name'):
        builder.add_private_note(private_notes=form_fields.get(key))

    # Year is free text on the form; only keep it if it parses as an int.
    year = form_fields.get('year')
    try:
        year = int(year)
    except (TypeError, ValueError):
        year = None

    builder.add_preprint_date(
        preprint_date=form_fields.get('preprint_created'))

    if form_fields.get('type_of_doc') == 'thesis':
        builder.add_thesis(defense_date=form_fields.get('defense_date'),
                           degree_type=form_fields.get('degree_type'),
                           institution=form_fields.get('institution'),
                           date=form_fields.get('thesis_date'))

    if form_fields.get('type_of_doc') == 'chapter':
        if not form_fields.get('journal_title'):
            builder.add_book_series(title=form_fields.get('series_title'))

    if form_fields.get('type_of_doc') == 'book':
        if form_fields.get('journal_title'):
            # Book published inside a journal: the series volume becomes
            # the journal volume.
            form_fields['volume'] = form_fields.get('series_volume')
        else:
            builder.add_book_series(title=form_fields.get('series_title'),
                                    volume=form_fields.get('series_volume'))
        builder.add_book(publisher=form_fields.get('publisher_name'),
                         place=form_fields.get('publication_place'),
                         date=form_fields.get('publication_date'))

    builder.add_publication_info(
        year=year,
        cnum=form_fields.get('conference_id'),
        journal_issue=form_fields.get('issue'),
        journal_title=form_fields.get('journal_title'),
        journal_volume=form_fields.get('volume'),
        page_start=form_fields.get('start_page'),
        page_end=form_fields.get('end_page'),
        artid=form_fields.get('artid'),
        parent_record=form_fields.get('parent_book'))

    builder.add_accelerator_experiments_legacy_name(
        legacy_name=form_fields.get('experiment'))

    language = form_fields.get('other_language') \
        if form_fields.get('language') == 'oth' \
        else form_fields.get('language')
    builder.add_language(language=language)

    if form_fields.get('title_translation'):
        builder.add_title_translation(
            title=form_fields['title_translation'],
            language='en',
        )

    builder.add_title(title=form_fields.get('title_arXiv'), source='arXiv')
    builder.add_title(title=form_fields.get('title_crossref'),
                      source='crossref')
    builder.add_license(url=form_fields.get('license_url'))
    builder.add_public_note(public_note=form_fields.get('public_notes'))
    builder.add_public_note(
        public_note=form_fields.get('note'),
        source='arXiv' if form_fields.get('categories') else 'CrossRef')

    form_url = form_fields.get('url')
    form_additional_url = form_fields.get('additional_url')
    if form_url and not _is_arxiv_url(form_url):
        obj.extra_data['submission_pdf'] = form_url
        if not form_additional_url:
            builder.add_url(url=form_url)
    if form_additional_url and not _is_arxiv_url(form_additional_url):
        builder.add_url(url=form_additional_url)

    # FIX: plain loop instead of a list comprehension used only for its
    # side effects.
    for report_number in form_fields.get('report_numbers', []):
        builder.add_report_number(
            report_number=report_number.get('report_number'))

    builder.add_collaboration(collaboration=form_fields.get('collaboration'))
    builder.add_acquisition_source(
        datetime=datetime.datetime.utcnow().isoformat(),
        submission_number=obj.id,
        internal_uid=int(obj.id_user),
        email=form_fields.get('email'),
        orcid=form_fields.get('orcid'),
        method='submitter')

    return builder.record
class JatsParser(object):
    """Parser for the JATS format.

    It can be used directly by invoking the :func:`JatsParser.parse` method,
    or be subclassed to customize its behavior.

    Args:
        jats_record (Union[str, scrapy.selector.Selector]): the record in
            JATS format to parse.
        source (Optional[str]): if provided, sets the ``source`` everywhere
            in the record. Otherwise, the source is extracted from the JATS
            metadata.
    """
    def __init__(self, jats_record, source=None):
        self.root = self.get_root_node(jats_record)
        if not source:
            source = self.publisher
        self.builder = LiteratureBuilder(source)

    def parse(self):
        """Extract a JATS record into an Inspire HEP record.

        Returns:
            dict: the same record in the Inspire Literature schema.
        """
        self.builder.add_abstract(self.abstract)
        self.builder.add_title(self.title, subtitle=self.subtitle)
        self.builder.add_copyright(**self.copyright)
        self.builder.add_document_type(self.document_type)
        self.builder.add_license(**self.license)
        for author in self.authors:
            self.builder.add_author(author)
        self.builder.add_number_of_pages(self.number_of_pages)
        self.builder.add_publication_info(**self.publication_info)
        for collab in self.collaborations:
            self.builder.add_collaboration(collab)
        for doi in self.dois:
            self.builder.add_doi(**doi)
        for keyword in self.keywords:
            self.builder.add_keyword(**keyword)
        self.builder.add_imprint_date(self.publication_date.dumps())
        for reference in self.references:
            self.builder.add_reference(reference)

        return self.builder.record

    @property
    def references(self):
        """Extract a JATS record into an Inspire HEP references record.

        Returns:
            List[dict]: an array of reference schema records, representing
                the references in the record
        """
        ref_nodes = self.root.xpath('./back/ref-list/ref')
        return list(
            itertools.chain.from_iterable(
                self.get_reference(node) for node in ref_nodes
            )
        )

    # Configuration for stripping markup from abstracts: keep sub/sup and
    # MathML trees, drop pub-ids and ISSNs.
    remove_tags_config_abstract = {
        'allowed_tags': ['sup', 'sub'],
        'allowed_trees': ['math'],
        'strip': 'self::pub-id|self::issn'
    }

    @property
    def abstract(self):
        abstract_nodes = self.root.xpath('./front//abstract[1]')

        if not abstract_nodes:
            return

        abstract = remove_tags(
            abstract_nodes[0], **self.remove_tags_config_abstract).strip()
        return abstract

    @property
    def article_type(self):
        article_type = self.root.xpath('./@article-type').extract_first()

        return article_type

    @property
    def artid(self):
        artid = self.root.xpath(
            './front/article-meta//elocation-id//text()').extract_first()

        return artid

    @property
    def authors(self):
        author_nodes = self.root.xpath(
            './front//contrib[@contrib-type="author"]')
        authors = [self.get_author(author) for author in author_nodes]

        return authors

    @property
    def collaborations(self):
        collab_nodes = self.root.xpath(
            './front//collab |'
            './front//contrib[@contrib-type="collaboration"] |'
            './front//on-behalf-of'
        )
        collaborations = set(
            collab.xpath('string(.)').extract_first()
            for collab in collab_nodes
        )

        return collaborations

    @property
    def copyright(self):
        copyright = {
            'holder': self.copyright_holder,
            'material': self.material,
            'statement': self.copyright_statement,
            'year': self.copyright_year,
        }

        return copyright

    @property
    def copyright_holder(self):
        copyright_holder = self.root.xpath(
            './front//copyright-holder/text()').extract_first()

        return copyright_holder

    @property
    def copyright_statement(self):
        copyright_statement = self.root.xpath(
            './front//copyright-statement/text()').extract_first()

        return copyright_statement

    @property
    def copyright_year(self):
        copyright_year = self.root.xpath(
            './front//copyright-year/text()').extract_first()

        return maybe_int(copyright_year)

    @property
    def dois(self):
        doi_values = self.root.xpath(
            './front/article-meta//article-id[@pub-id-type="doi"]/text()'
        ).extract()
        dois = [
            {'doi': value, 'material': self.material} for value in doi_values
        ]

        if self.material != 'publication':
            # For errata etc., also pick up the DOI of the related article.
            doi_values = self.root.xpath(
                './front/article-meta//related-article[@ext-link-type="doi"]/@href'
            ).extract()
            related_dois = ({'doi': value} for value in doi_values)
            dois.extend(related_dois)

        return dois

    @property
    def document_type(self):
        if self.is_conference_paper:
            document_type = 'conference paper'
        else:
            document_type = 'article'

        return document_type

    @property
    def is_conference_paper(self):
        """Decide whether the article is a conference paper."""
        conference_node = self.root.xpath('./front//conference').extract_first()

        return bool(conference_node)

    @property
    def journal_title(self):
        journal_title = self.root.xpath(
            './front/journal-meta//abbrev-journal-title/text() |'
            './front/journal-meta//journal-title/text()'
        ).extract_first()

        return journal_title

    @property
    def journal_issue(self):
        journal_issue = self.root.xpath(
            './front/article-meta/issue/text()').extract_first()

        return journal_issue

    @property
    def journal_volume(self):
        journal_volume = self.root.xpath(
            './front/article-meta/volume/text()').extract_first()

        return journal_volume

    @property
    def keywords(self):
        keyword_groups = self.root.xpath('./front//kwd-group')
        keywords = itertools.chain.from_iterable(
            self.get_keywords(group) for group in keyword_groups)

        return keywords

    @property
    def license(self):
        license = {
            'license': self.license_statement,
            'material': self.material,
            'url': self.license_url,
        }

        return license

    @property
    def license_statement(self):
        license_statement = self.root.xpath(
            'string(./front/article-meta//license)').extract_first().strip()

        return license_statement

    @property
    def license_url(self):
        url_nodes = (
            './front/article-meta//license_ref/text() |'
            './front/article-meta//license/@href |'
            './front/article-meta//license//ext-link/@href'
        )
        license_url = self.root.xpath(url_nodes).extract_first()

        return license_url

    @property
    def material(self):
        # FIX: article_type may be None when the root has no @article-type
        # attribute; guard against an AttributeError on .startswith.
        article_type = self.article_type or ''
        if article_type.startswith('correc'):
            material = 'erratum'
        elif article_type in ('erratum', 'translation', 'addendum', 'reprint'):
            material = article_type
        else:
            material = 'publication'

        return material

    @property
    def number_of_pages(self):
        number_of_pages = maybe_int(
            self.root.xpath(
                './front/article-meta//page-count/@count').extract_first())

        return number_of_pages

    @property
    def page_start(self):
        page_start = self.root.xpath(
            './front/article-meta/fpage/text()').extract_first()

        return page_start

    @property
    def page_end(self):
        page_end = self.root.xpath(
            './front/article-meta/lpage/text()').extract_first()

        return page_end

    @property
    def publication_date(self):
        date_nodes = self.root.xpath(
            './front//pub-date[@pub-type="ppub"] |'
            './front//pub-date[@pub-type="epub"] |'
            './front//pub-date[starts-with(@date-type,"pub")] |'
            './front//date[starts-with(@date-type,"pub")]'
        )
        # Earliest of all the publication dates found.
        publication_date = min(
            self.get_date(date_node) for date_node in date_nodes
        )

        return publication_date

    @property
    def publication_info(self):
        publication_info = {
            'artid': self.artid,
            'journal_title': self.journal_title,
            'journal_issue': self.journal_issue,
            'journal_volume': self.journal_volume,
            'material': self.material,
            'page_start': self.page_start,
            'page_end': self.page_end,
            'year': self.year,
        }

        return publication_info

    @property
    def publisher(self):
        publisher = self.root.xpath(
            './front//publisher-name/text()').extract_first()

        return publisher

    @property
    def subtitle(self):
        subtitle = self.root.xpath('string(./front//subtitle)').extract_first()

        return subtitle

    @property
    def title(self):
        title = self.root.xpath('string(./front//article-title)').extract_first()

        return title

    def get_affiliation(self, id_):
        """Get the affiliation with the specified id.

        Args:
            id_(str): the value of the ``id`` attribute of the affiliation.

        Returns:
            Optional[str]: the affiliation with that id or ``None`` if there
            is no match.
        """
        affiliation_node = self.root.xpath("//aff[@id=$id_]", id_=id_)
        if affiliation_node:
            affiliation = remove_tags(
                affiliation_node[0],
                strip="self::label | self::email"
            ).strip()
            return affiliation

    def get_emails_from_refs(self, id_):
        """Get the emails from the node with the specified id.

        Args:
            id_(str): the value of the ``id`` attribute of the node.

        Returns:
            List[str]: the emails from the node with that id or [] if none found.
        """
        email_nodes = self.root.xpath('//aff[@id=$id_]/email/text()', id_=id_)
        return email_nodes.extract()

    @property
    def year(self):
        # FIX: the second not(...) was missing its closing parenthesis,
        # which made the predicate fragment unbalanced.
        not_online = (
            'not(starts-with(@publication-format, "elec"))'
            ' and not(starts-with(@publication-format, "online"))'
        )
        date_nodes = self.root.xpath(
            './front//pub-date[@pub-type="ppub"] |'
            './front//pub-date[starts-with(@date-type,"pub") and $not_online] |'
            './front//date[starts-with(@date-type,"pub") and $not_online]',
            not_online=not_online
        )

        year = min(
            self.get_date(date_node) for date_node in date_nodes
        ).year

        return year

    def get_author_affiliations(self, author_node):
        """Extract an author's affiliations."""
        raw_referred_ids = author_node.xpath(
            './/xref[@ref-type="aff"]/@rid').extract()
        # Sometimes the rid might have more than one ID (e.g. rid="id0 id1")
        referred_ids = set()
        for raw_referred_id in raw_referred_ids:
            referred_ids.update(set(raw_referred_id.split(' ')))

        # FIX: resolve each affiliation only once instead of calling
        # get_affiliation twice per id.
        affiliations = []
        for referred_id in referred_ids:
            affiliation = self.get_affiliation(referred_id)
            if affiliation:
                affiliations.append(affiliation)

        return affiliations

    def get_author_emails(self, author_node):
        """Extract an author's email addresses."""
        emails = author_node.xpath('.//email/text()').extract()
        referred_ids = author_node.xpath(
            './/xref[@ref-type="aff"]/@rid').extract()
        for referred_id in referred_ids:
            emails.extend(self.get_emails_from_refs(referred_id))

        return emails

    @staticmethod
    def get_author_name(author_node):
        """Extract an author's name."""
        surname = author_node.xpath('.//surname/text()').extract_first()
        if not surname:
            # the author name is unstructured
            # FIX: return it directly; previously this value was dead code,
            # unconditionally overwritten by the join below.
            return author_node.xpath('string(./string-name)').extract_first()

        given_names = author_node.xpath('.//given-names/text()').extract_first()
        suffix = author_node.xpath('.//suffix/text()').extract_first()

        author_name = ', '.join(
            el for el in (surname, given_names, suffix) if el)

        return author_name

    @staticmethod
    def get_date(date_node):
        """Extract a date from a date node.

        Returns:
            PartialDate: the parsed date.
        """
        iso_string = date_node.xpath('./@iso-8601-date').extract_first()
        iso_date = PartialDate.loads(iso_string) if iso_string else None

        year = date_node.xpath('string(./year)').extract_first()
        month = date_node.xpath('string(./month)').extract_first()
        day = date_node.xpath('string(./day)').extract_first()
        date_from_parts = PartialDate.from_parts(year, month, day) if year else None

        string_date = date_node.xpath('string(./string-date)').extract_first()
        try:
            parsed_date = PartialDate.parse(string_date)
        except ValueError:
            parsed_date = None

        # Prefer the ISO date, then structured parts, then free text.
        date = get_first([iso_date, date_from_parts, parsed_date])

        return date

    @staticmethod
    def get_keywords(group_node):
        """Extract keywords from a keyword group."""
        schema = None
        if 'pacs' in group_node.xpath(
                '@kwd-group-type').extract_first(default='').lower():
            schema = 'PACS'

        keywords = (kwd.xpath('string(.)').extract_first()
                    for kwd in group_node.xpath('.//kwd'))
        keyword_dicts = ({'keyword': keyword, 'schema': schema}
                         for keyword in keywords)

        return keyword_dicts

    @staticmethod
    def get_root_node(jats_record):
        """Get a selector on the root ``article`` node of the record.

        This can be overridden in case some preprocessing needs to be done on
        the XML.

        Args:
            jats_record(Union[str, scrapy.selector.Selector]): the record in
                JATS format.

        Returns:
            scrapy.selector.Selector: a selector on the root ``<article>``
            node.
        """
        if isinstance(jats_record, six.string_types):
            root = get_node(jats_record)
        else:
            root = jats_record
        root.remove_namespaces()

        return root

    def get_author(self, author_node):
        """Extract one author.

        Args:
            author_node(scrapy.selector.Selector): a selector on a single
                author, e.g. a ``<contrib contrib-type="author">``.

        Returns:
            dict: the parsed author, conforming to the Inspire schema.
        """
        author_name = self.get_author_name(author_node)
        emails = self.get_author_emails(author_node)
        affiliations = self.get_author_affiliations(author_node)

        return self.builder.make_author(
            author_name, raw_affiliations=affiliations, emails=emails)

    @staticmethod
    def get_reference_authors(ref_node, role):
        """Extract authors of `role` from a reference node.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single
                reference.
            role(str): author role

        Returns:
            List[str]: list of names
        """
        return ref_node.xpath(
            './person-group[@person-group-type=$role]/string-name/text()',
            role=role
        ).extract()

    def get_reference(self, ref_node):
        """Extract one reference.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single
                reference, i.e. ``<ref>``.

        Returns:
            dict: the parsed reference, as generated by
                :class:`inspire_schemas.api.ReferenceBuilder`
        """
        for citation_node in ref_node.xpath('./mixed-citation'):
            builder = ReferenceBuilder()

            builder.add_raw_reference(
                ref_node.extract().strip(),
                source=self.builder.source,
                ref_format='JATS'
            )

            fields = [
                (
                    (
                        'self::node()[@publication-type="journal" '
                        'or @publication-type="eprint"]/source/text()'
                    ),
                    builder.set_journal_title,
                ),
                (
                    'self::node()[@publication-type="book"]/source/text()',
                    builder.add_parent_title,
                ),
                ('./publisher-name/text()', builder.set_publisher),
                ('./volume/text()', builder.set_journal_volume),
                ('./issue/text()', builder.set_journal_issue),
                ('./year/text()', builder.set_year),
                ('./pub-id[@pub-id-type="arxiv"]/text()', builder.add_uid),
                ('./pub-id[@pub-id-type="doi"]/text()', builder.add_uid),
                (
                    'pub-id[@pub-id-type="other"]'
                    '[contains(preceding-sibling::text(),"Report No")]/text()',
                    builder.add_report_number
                ),
                ('./article-title/text()', builder.add_title),
                ('../label/text()', lambda x: builder.set_label(x.strip('[].')))
            ]

            for xpath, field_handler in fields:
                value = citation_node.xpath(xpath).extract_first()
                # FIX: removed a stray duplicate `citation_node.xpath(xpath)`
                # call whose result was discarded.
                if value:
                    field_handler(value)

            # Whatever is left after stripping the structured fields goes
            # into `misc`.
            remainder = remove_tags(
                citation_node,
                strip='self::person-group'
                      '|self::pub-id'
                      '|self::article-title'
                      '|self::volume'
                      '|self::issue'
                      '|self::year'
                      '|self::label'
                      '|self::publisher-name'
                      '|self::source[../@publication-type!="proc"]'
                      '|self::object-id'
                      '|self::page-range'
                      '|self::issn'
            ).strip('"\';,. \t\n\r').replace('()', '')
            if remainder:
                builder.add_misc(remainder)

            for editor in self.get_reference_authors(citation_node, 'editor'):
                builder.add_author(editor, 'editor')
            for author in self.get_reference_authors(citation_node, 'author'):
                builder.add_author(author, 'author')

            page_range = citation_node.xpath('./page-range/text()').extract_first()
            if page_range:
                page_artid = split_page_artid(page_range)
                builder.set_page_artid(*page_artid)

            yield builder.obj

    def attach_fulltext_document(self, file_name, url):
        self.builder.add_document(file_name, url, fulltext=True, hidden=True)
def formdata_to_model(obj, formdata):
    """Manipulate form data to match literature data model.

    Args:
        obj: workflow object; ``obj.extra_data`` may be written, and
            ``obj.id`` / ``obj.id_user`` are read for the acquisition source.
        formdata (dict): raw submission form data (not mutated; a deep copy
            is taken).

    Returns:
        dict: the record in the Inspire Literature schema (validated).
    """
    def _is_arxiv_url(url):
        return 'arxiv.org' in url

    form_fields = copy.deepcopy(formdata)
    filter_empty_elements(
        form_fields, ['authors', 'supervisors', 'report_numbers']
    )

    builder = LiteratureBuilder(source='submitter')

    for author in form_fields.get('authors', []):
        builder.add_author(builder.make_author(
            author['full_name'],
            affiliations=force_list(author['affiliation'])
            if author['affiliation'] else None,
            roles=['author']
        ))

    for supervisor in form_fields.get('supervisors', []):
        # FIX: the affiliation guard previously tested the stale `author`
        # variable left over from the loop above (a NameError when there are
        # no authors, and the wrong author's affiliation otherwise).
        builder.add_author(builder.make_author(
            supervisor['full_name'],
            affiliations=force_list(supervisor['affiliation'])
            if supervisor['affiliation'] else None,
            roles=['supervisor']
        ))

    builder.add_title(title=form_fields.get('title'))

    # A conference name forces the document type, regardless of the form.
    document_type = 'conference paper' if form_fields.get('conf_name') \
        else form_fields.get('type_of_doc', [])
    builder.add_document_type(
        document_type=document_type
    )

    builder.add_abstract(
        abstract=form_fields.get('abstract'),
        source='arXiv' if form_fields.get('categories') else None
    )

    if form_fields.get('arxiv_id') and form_fields.get('categories'):
        builder.add_arxiv_eprint(
            arxiv_id=form_fields.get('arxiv_id'),
            arxiv_categories=form_fields.get('categories').split()
        )

    builder.add_doi(doi=form_fields.get('doi'))

    builder.add_inspire_categories(
        subject_terms=form_fields.get('subject_term'),
        source='user'
    )

    for key in ('extra_comments', 'nonpublic_note',
                'hidden_notes', 'conf_name', 'references'):
        builder.add_private_note(
            private_notes=form_fields.get(key)
        )

    # Year is free text on the form; only keep it if it parses as an int.
    year = form_fields.get('year')
    try:
        year = int(year)
    except (TypeError, ValueError):
        year = None

    builder.add_publication_info(
        year=year,
        cnum=form_fields.get('conference_id'),
        journal_issue=form_fields.get('issue'),
        journal_title=form_fields.get('journal_title'),
        journal_volume=form_fields.get('volume'),
        page_start=form_fields.get('page_start'),
        page_end=form_fields.get('page_end'),
        artid=form_fields.get('artid')
    )

    builder.add_preprint_date(
        preprint_date=form_fields.get('preprint_created')
    )

    if form_fields.get('type_of_doc') == 'thesis':
        builder.add_thesis(
            defense_date=form_fields.get('defense_date'),
            degree_type=form_fields.get('degree_type'),
            institution=form_fields.get('institution'),
            date=form_fields.get('thesis_date')
        )

    builder.add_accelerator_experiments_legacy_name(
        legacy_name=form_fields.get('experiment')
    )

    language = form_fields.get('other_language') \
        if form_fields.get('language') == 'oth' \
        else form_fields.get('language')
    builder.add_language(language=language)

    builder.add_title_translation(title=form_fields.get('title_translation'))

    builder.add_title(
        title=form_fields.get('title_arXiv'),
        source='arXiv'
    )

    builder.add_title(
        title=form_fields.get('title_crossref'),
        source='crossref'
    )

    builder.add_license(url=form_fields.get('license_url'))

    builder.add_public_note(public_note=form_fields.get('public_notes'))

    builder.add_public_note(
        public_note=form_fields.get('note'),
        source='arXiv' if form_fields.get('categories') else 'CrossRef'
    )

    form_url = form_fields.get('url')
    form_additional_url = form_fields.get('additional_url')
    if form_url and not _is_arxiv_url(form_url):
        obj.extra_data['submission_pdf'] = form_url
        if not form_additional_url:
            builder.add_url(url=form_url)
    if form_additional_url and not _is_arxiv_url(form_additional_url):
        builder.add_url(url=form_additional_url)

    # FIX: plain loop instead of a list comprehension used only for its
    # side effects.
    for report_number in form_fields.get('report_numbers', []):
        builder.add_report_number(
            report_number=report_number.get('report_number')
        )

    builder.add_collaboration(collaboration=form_fields.get('collaboration'))

    builder.add_acquisition_source(
        datetime=datetime.datetime.utcnow().isoformat(),
        submission_number=obj.id,
        internal_uid=int(obj.id_user),
        email=form_fields.get('email'),
        orcid=form_fields.get('orcid'),
        method='submitter'
    )

    builder.validate_record()

    return builder.record
def hepcrawl_to_hep(crawler_record):
    """Convert a hepcrawl-formatted record into the HEP (Inspire) schema.

    Args:
        crawler_record(dict): dictionary representing the hepcrawl formatted
            record.

    Returns:
        dict: The hep formatted record.
    """

    def _filter_affiliation(affiliations):
        # Keep only the non-empty 'value' entries of the affiliation dicts.
        return [
            affilation.get('value')
            for affilation in affiliations
            if affilation.get('value')
        ]

    # NOTE(review): 'acquisition_source' and its 'source' key are assumed
    # to always be present in the crawler record (a KeyError otherwise).
    builder = LiteratureBuilder(
        source=crawler_record['acquisition_source']['source']
    )

    for author in crawler_record.get('authors', []):
        builder.add_author(builder.make_author(
            full_name=author['full_name'],
            raw_affiliations=_filter_affiliation(author['affiliations']),
        ))

    for title in crawler_record.get('titles', []):
        builder.add_title(
            title=title.get('title'),
            subtitle=title.get('subtitle'),
            source=title.get('source')
        )

    for abstract in crawler_record.get('abstracts', []):
        builder.add_abstract(
            abstract=abstract.get('value'),
            source=abstract.get('source')
        )

    for arxiv_eprint in crawler_record.get('arxiv_eprints', []):
        builder.add_arxiv_eprint(
            arxiv_id=arxiv_eprint.get('value'),
            arxiv_categories=arxiv_eprint.get('categories')
        )

    for doi in crawler_record.get('dois', []):
        builder.add_doi(
            doi=doi.get('value'),
            material=doi.get('material'),
        )

    for private_note in crawler_record.get('private_notes', []):
        builder.add_private_note(
            private_notes=private_note
        )

    for public_note in crawler_record.get('public_notes', []):
        builder.add_public_note(
            public_note=public_note.get('value'),
            source=public_note.get('source')
        )

    for license in crawler_record.get('license', []):
        builder.add_license(
            url=license.get('url'),
            license=license.get('license'),
            material=license.get('material'),
        )

    for collaboration in crawler_record.get('collaborations', []):
        builder.add_collaboration(
            collaboration=collaboration.get('value')
        )

    for imprint in crawler_record.get('imprints', []):
        builder.add_imprint_date(
            imprint_date=imprint.get('date')
        )

    for copyright in crawler_record.get('copyright', []):
        builder.add_copyright(
            holder=copyright.get('holder'),
            material=copyright.get('material'),
            statement=copyright.get('statement')
        )

    builder.add_preprint_date(
        preprint_date=crawler_record.get('preprint_date')
    )

    acquisition_source = crawler_record.get('acquisition_source', {})
    builder.add_acquisition_source(
        method=acquisition_source['method'],
        date=acquisition_source['datetime'],
        source=acquisition_source['source'],
        submission_number=acquisition_source['submission_number'],
    )

    # 'page_nr' is expected to be a list whose first element is the page
    # count; silently skip it when absent or not parseable as an int.
    try:
        builder.add_number_of_pages(
            number_of_pages=int(crawler_record.get('page_nr', [])[0])
        )
    except (TypeError, ValueError, IndexError):
        pass

    # Legacy collection names that map onto publication types.
    publication_types = [
        'introductory',
        'lectures',
        'review',
        'manual',
    ]

    # Legacy collection names that map directly onto document types.
    document_types = [
        'book',
        'note',
        'report',
        'proceedings',
        'thesis',
    ]

    added_doc_type = False

    # Translate legacy 'collections' entries into schema flags and types.
    for collection in crawler_record.get('collections', []):
        collection = collection['primary'].strip().lower()
        if collection == 'arxiv':
            continue  # ignored
        elif collection == 'citeable':
            builder.set_citeable(True)
        elif collection == 'core':
            builder.set_core(True)
        elif collection == 'noncore':
            builder.set_core(False)
        elif collection == 'published':
            builder.set_refereed(True)
        elif collection == 'withdrawn':
            builder.set_withdrawn(True)
        elif collection in publication_types:
            builder.add_publication_type(collection)
        elif collection == 'bookchapter':
            added_doc_type = True
            builder.add_document_type('book chapter')
        elif collection == 'conferencepaper':
            added_doc_type = True
            builder.add_document_type('conference paper')
        elif collection in document_types:
            added_doc_type = True
            builder.add_document_type(collection)

    # Default document type when the collections gave none.
    if not added_doc_type:
        builder.add_document_type('article')

    # Only the first publication_info entry is used.
    _pub_info = crawler_record.get('publication_info', [{}])[0]
    builder.add_publication_info(
        year=_pub_info.get('year'),
        artid=_pub_info.get('artid'),
        page_end=_pub_info.get('page_end'),
        page_start=_pub_info.get('page_start'),
        journal_issue=_pub_info.get('journal_issue'),
        journal_title=_pub_info.get('journal_title'),
        journal_volume=_pub_info.get('journal_volume'),
        pubinfo_freetext=_pub_info.get('pubinfo_freetext'),
        material=_pub_info.get('pubinfo_material'),
    )

    for report_number in crawler_record.get('report_numbers', []):
        builder.add_report_number(
            report_number=report_number.get('value'),
            source=report_number.get('source')
        )

    for url in crawler_record.get('urls', []):
        builder.add_url(url=url.get('value'))

    for document in crawler_record.get('documents', []):
        # 'key' and 'url' are mandatory for a document entry.
        builder.add_document(
            description=document.get('description'),
            fulltext=document.get('fulltext'),
            hidden=document.get('hidden'),
            key=document['key'],
            material=document.get('material'),
            original_url=document.get('original_url'),
            url=document['url'],
        )

    return builder.record
class CrossrefParser(object):
    """Parser for the JSON Crossref format.

    Args:
        crossref_record (dict): the record in JSON Crossref API format to
            parse.
        source (Optional[str]): if provided, sets the ``source`` everywhere in
            the record. Otherwise, the source is extracted from the Crossref
            metadata.
    """

    def __init__(self, crossref_record, source=None):
        # The actual metadata sits under the top-level "message" key of a
        # Crossref API response.
        self.record = crossref_record.get("message")
        if not source:
            source = self.material_source
        self.builder = LiteratureBuilder(source)

    def parse(self):
        """Extract a Crossref record into an Inspire HEP record.

        Returns:
            dict: the same record in the Inspire Literature schema.
        """
        self.builder.add_abstract(self.abstract)
        for doi in self.dois:
            self.builder.add_doi(**doi)
        for reference in self.references:
            self.builder.add_reference(reference)
        self.builder.add_imprint_date(self.imprints)
        for author in self.authors:
            self.builder.add_author(author)
        for license_instance in self.license:
            self.builder.add_license(**license_instance)
        self.builder.add_publication_info(**self.publication_info)
        self.builder.add_title(self.title, subtitle=self.subtitle)
        self.builder.add_document_type(self.document_type)

        return self.builder.record

    @property
    def document_type(self):
        # Map the Crossref "type" onto an Inspire document type.
        # NOTE(review): a type missing from DOC_TYPE_MAP raises KeyError
        # here — confirm that is the intended failure mode.
        doc_type = self.record.get("type")
        return DOC_TYPE_MAP[doc_type]

    @property
    def title(self):
        # First entry of the Crossref "title" array, or None when absent.
        title = get_value(self.record, "title[0]")
        return title

    @property
    def subtitle(self):
        # First entry of the Crossref "subtitle" array, or None when absent.
        subtitle = get_value(self.record, "subtitle[0]")
        return subtitle

    @property
    def dois(self):
        # Single-element list pairing the record DOI with its material kind.
        value = self.record.get("DOI")
        dois = [{'doi': value, 'material': self.material}]
        return dois

    @property
    def material_source(self):
        # Used as the default ``source`` when none is passed to ``__init__``.
        return self.record.get("source")

    @property
    def material(self):
        # Classify the record from its (sub)title prefix; anything not
        # recognized counts as a regular publication.
        title = self.title or ''
        subtitle = self.subtitle or ''
        if title.startswith("Erratum") or subtitle.startswith("Erratum"):
            material = 'erratum'
        elif title.startswith("Addendum") or subtitle.startswith("Addendum"):
            material = 'addendum'
        elif title.startswith("Publisher's Note") or subtitle.startswith(
                "Publisher's Note"):
            material = 'editorial note'
        else:
            material = 'publication'
        return material

    @property
    def publication_info(self):
        """Collect the keyword arguments for ``add_publication_info``."""
        publication_info = {
            'artid': self.artid,
            'journal_title': self.journal_title,
            'journal_issue': self.journal_issue,
            'journal_volume': self.journal_volume,
            'page_start': self.page_start,
            'page_end': self.page_end,
            'year': self.year,
            'material': self.material,
            'parent_isbn': self.parent_isbn,
        }
        return publication_info

    @property
    def parent_isbn(self):
        return get_value(self.record, "ISBN[0]")

    @property
    def journal_title(self):
        # For book chapters the container title is the book, not a journal.
        if self.document_type == 'book chapter':
            return None
        return get_value(self.record, "container-title[0]")

    @property
    def artid(self):
        return self.record.get("article-number")

    @property
    def journal_issue(self):
        return self.record.get("issue")

    @property
    def journal_volume(self):
        return self.record.get("volume")

    @property
    def year(self):
        # Crossref "issued" date-parts are [[year, month, day]]; take the year.
        date = get_value(self.record, "issued.date-parts[0][0]")
        return date

    @property
    def page_start(self):
        # Crossref encodes the page range as "start-end" in a single string.
        pages = self.record.get("page")
        if pages:
            return pages.split('-')[0]
        else:
            return None

    @property
    def page_end(self):
        # Only present when the "page" field really contains a range.
        pages = self.record.get("page")
        if pages and '-' in pages:
            return pages.split('-')[1]
        else:
            return None

    @staticmethod
    def get_author_name(author_key):
        """Extract an author's name as ``family, given`` (skipping blanks)."""
        author_name_list = [author_key.get("family"), author_key.get("given")]
        return ', '.join(filter(None, author_name_list))

    @staticmethod
    def get_author_affiliations(author_key):
        """Extract an author's affiliation names."""
        affiliations = force_list(author_key.get("affiliation"))
        auth_aff = [affiliation.get('name') for affiliation in affiliations]
        return auth_aff

    @staticmethod
    def get_author_orcid(author_key):
        """Extract an author's ORCID as an ``ids`` list for ``make_author``.

        NOTE(review): when the author has no ORCID this still returns
        ``[('ORCID', None)]`` — verify ``make_author`` drops None-valued ids.
        """
        orcid_value = author_key.get('ORCID')
        return [('ORCID', orcid_value)]

    def get_author(self, author_key):
        """Extract one author.

        Args:
            author_key(dict): a dictionary on a single author.

        Returns:
            dict: the parsed author, conforming to the Inspire schema.
        """
        author_name = self.get_author_name(author_key)
        affiliations = self.get_author_affiliations(author_key)
        orcid = self.get_author_orcid(author_key)
        return self.builder.make_author(author_name,
                                        raw_affiliations=affiliations,
                                        ids=orcid)

    @property
    def authors(self):
        authors_key = self.record.get("author")
        authors = [
            self.get_author(author) for author in force_list(authors_key)
        ]
        return authors

    @property
    def license(self):
        license_keys = self.record.get("license")
        licenses = [
            self.get_license(license) for license in force_list(license_keys)
        ]
        return licenses

    def get_license(self, license_key):
        """Extract one license.

        Args:
            license_key(dict): a dictionary on a single license.

        Returns:
            dict: the parsed license, conforming to the Inspire schema.
        """
        license = {
            'imposing': self.publisher,
            'material': self.material,
            'url': self.get_license_url(license_key),
        }
        return license

    @staticmethod
    def get_license_url(license_key):
        return license_key.get("URL")

    @property
    def publisher(self):
        return self.record.get("publisher")

    @property
    def abstract(self):
        return self.record.get("abstract")

    @property
    def imprints(self):
        """Imprint date taken from ``issued``.

        ``issued`` is the earliest of published-print and published-online,
        which is why the same field feeds both the imprints and the
        publication info.
        """
        date_parts = get_value(self.record, "issued.date-parts[0]")
        if not date_parts:
            return None
        date = PartialDate(*date_parts)
        return date.dumps()

    @property
    def references(self):
        """Extract the Crossref references into Inspire HEP reference records.

        Returns:
            List[dict]: an array of reference schema records, representing
                the references in the record (deduplicated).
        """
        ref_keys = self.record.get("reference")
        reference_list = list(
            itertools.chain.from_iterable(
                self.get_reference(key) for key in force_list(ref_keys)))
        return dedupe_list_of_dicts(reference_list)

    def get_reference(self, ref_key):
        """Extract one reference.

        Args:
            ref_key(dict): a dictionary on a single reference.

        Yields:
            dict: the parsed reference, as generated by
                :class:`inspire_schemas.api.ReferenceBuilder`
        """
        builder = ReferenceBuilder()

        journal_title = ref_key.get("journal-title")
        if journal_title:
            builder.set_journal_title(journal_title)

        journal_volume = ref_key.get("volume")
        if journal_volume:
            builder.set_journal_volume(journal_volume)

        journal_issue = ref_key.get("issue")
        if journal_issue:
            builder.set_journal_issue(journal_issue)

        first_page = ref_key.get("first-page")
        if first_page:
            builder.set_page_artid(page_start=first_page)

        year = ref_key.get("year")
        if year:
            builder.set_year(year)

        title = ref_key.get("article-title")
        if title:
            builder.add_title(title)

        # ISBNs and DOIs both go in as generic uids; the builder sorts out
        # the scheme.
        isbn = ref_key.get("ISBN")
        if isbn:
            builder.add_uid(isbn)

        doi = ref_key.get("DOI")
        if doi:
            builder.add_uid(doi)

        author = ref_key.get("author")
        if author:
            builder.add_author(author, 'author')

        raw_ref = ref_key.get("unstructured")
        if raw_ref:
            builder.add_raw_reference(raw_ref, self.material_source)

        yield builder.obj
class ElsevierParser(object):
    """Parser for the Elsevier format.

    It can be used directly by invoking the :func:`ElsevierParser.parse`
    method, or be subclassed to customize its behavior.

    Args:
        elsevier_record (Union[str, scrapy.selector.Selector]): the record
            in Elsevier format to parse.
        source (Optional[str]): if provided, sets the ``source`` everywhere
            in the record. Otherwise, the source is extracted from the
            Elsevier metadata.
    """

    def __init__(self, elsevier_record, source=None):
        self.root = self.get_root_node(elsevier_record)
        if not source:
            source = self.publisher
        self.builder = LiteratureBuilder(source)

    def parse(self):
        """Extract a Elsevier record into an Inspire HEP record.

        Returns:
            dict: the same record in the Inspire Literature schema.
        """
        self.builder.add_abstract(self.abstract)
        self.builder.add_title(self.title, subtitle=self.subtitle)
        self.builder.add_copyright(**self.copyright)
        self.builder.add_document_type(self.document_type)
        self.builder.add_license(**self.license)
        for author in self.authors:
            self.builder.add_author(author)
        self.builder.add_publication_info(**self.publication_info)
        for collab in self.collaborations:
            self.builder.add_collaboration(collab)
        for doi in self.dois:
            self.builder.add_doi(**doi)
        for keyword in self.keywords:
            self.builder.add_keyword(keyword)
        self.builder.add_imprint_date(
            self.publication_date.dumps() if self.publication_date else None)
        for reference in self.references:
            self.builder.add_reference(reference)

        return self.builder.record

    @property
    def references(self):
        """Extract a Elsevier record into an Inspire HEP references record.

        Returns:
            List[dict]: an array of reference schema records, representing
                the references in the record
        """
        ref_nodes = self.root.xpath(".//bib-reference")
        return list(
            itertools.chain.from_iterable(
                self.get_reference_iter(node) for node in ref_nodes))

    # Configuration for cleaning abstract markup: keep sub/superscripts and
    # whole <math> trees, drop pub-id/issn noise.
    remove_tags_config_abstract = {
        "allowed_tags": ["sup", "sub"],
        "allowed_trees": ["math"],
        "strip": "self::pub-id|self::issn",
    }

    @property
    def abstract(self):
        abstract_nodes = self.root.xpath(
            ".//head/abstract[not(@graphical)]/abstract-sec/simple-para")
        if not abstract_nodes:
            return
        abstract_paragraphs = [
            remove_tags(abstract_node,
                        **self.remove_tags_config_abstract).strip("/ \n")
            for abstract_node in abstract_nodes
        ]
        abstract = ' '.join(abstract_paragraphs)
        return abstract

    @property
    def article_type(self):
        """Return a article type mapped from abbreviation."""
        abbrv_doctype = self.root.xpath(".//@docsubtype").extract_first()
        article_type = DOCTYPE_MAPPING.get(abbrv_doctype)
        return article_type

    @property
    def artid(self):
        artid = self.root.xpath("string(./*/item-info/aid[1])").extract_first()
        return artid

    @property
    def authors(self):
        # Authors are grouped; affiliations may be shared within a group.
        author_nodes = self.root.xpath("./*/head/author-group")
        all_authors = []
        for author_group in author_nodes:
            authors = [
                self.get_author(author, author_group)
                for author in author_group.xpath("./author")
            ]
            all_authors.extend(authors)
        return all_authors

    @property
    def collaborations(self):
        collaborations = self.root.xpath(
            "./*/head/author-group//collaboration/text/text()").extract()
        return collaborations

    @property
    def copyright(self):
        """Keyword arguments for ``add_copyright``."""
        copyright = {
            "holder": self.copyright_holder,
            "material": self.material,
            "statement": self.copyright_statement,
            "year": self.copyright_year,
        }
        return copyright

    @property
    def copyright_holder(self):
        copyright_holder = self.root.xpath(
            "string(./*/item-info/copyright[@type][1])").extract_first()
        if not copyright_holder:
            # Fall back to a canonical holder derived from the copyright type.
            copyright_type = self.root.xpath(
                "./*/item-info/copyright/@type").extract_first()
            copyright_holder = COPYRIGHT_MAPPING.get(copyright_type)
        return copyright_holder

    @property
    def copyright_statement(self):
        copyright_statement = self.root.xpath(
            "string(./RDF/Description/copyright[1])").extract_first()
        if not copyright_statement:
            copyright_statement = self.root.xpath(
                "string(./*/item-info/copyright[@type][1])").extract_first()
        return copyright_statement

    @property
    def copyright_year(self):
        copyright_year = self.root.xpath(
            "./*/item-info/copyright[@type]/@year").extract_first()
        return maybe_int(copyright_year)

    @property
    def dois(self):
        doi = self.root.xpath(
            "string(./RDF/Description/doi[1])").extract_first()
        return [{"doi": doi, "material": self.material}]

    @property
    def document_type(self):
        # Conference-paper detection overrides the structural doctype.
        doctype = None
        if self.root.xpath(
                "./*[contains(name(),'article') or self::book-review]"):
            doctype = "article"
        elif self.root.xpath("./*[self::book or self::simple-book]"):
            doctype = "book"
        elif self.root.xpath("./book-chapter"):
            doctype = "book chapter"
        if self.is_conference_paper:
            doctype = "conference paper"
        if doctype:
            return doctype

    @property
    def is_conference_paper(self):
        """Decide whether the article is a conference paper."""
        if self.root.xpath("./conference-info"):
            return True
        journal_issue = self.root.xpath(
            "string(./RDF/Description/issueName[1])").extract_first()
        if journal_issue:
            is_conference = re.findall(r"proceedings|proc.",
                                       journal_issue.lower())
            return bool(is_conference)
        return False

    @property
    def journal_title(self):
        # Prefer the RDF publication name, falling back to the journal id;
        # strip section suffixes and commas for normalization.
        jid = self.root.xpath("string(./*/item-info/jid[1])").extract_first(
            default="")
        publication = self.root.xpath(
            "string(./RDF/Description/publicationName[1])").extract_first(
                default=jid)
        publication = re.sub(" [S|s]ection", "",
                             publication).replace(",", "").strip()
        return publication

    @property
    def journal_issue(self):
        journal_issue = self.root.xpath(
            "string(./serial-issue/issue-info/issue-first[1])").extract_first(
            )
        return journal_issue

    @property
    def journal_volume(self):
        journal_volume = self.root.xpath(
            "string(./RDF/Description/volume[1])").extract_first()
        return journal_volume

    @property
    def keywords(self):
        keywords = self.root.xpath(
            "./*/head/keywords[not(@abr)]/keyword/text/text()").getall()
        return keywords

    @property
    def license(self):
        """Keyword arguments for ``add_license``."""
        license = {
            "license": self.license_statement,
            "material": self.material,
            "url": self.license_url,
        }
        return license

    @property
    def license_statement(self):
        license_statement = self.root.xpath(
            "string(./RDF/Description/licenseLine[1])").extract_first()
        return license_statement

    @property
    def license_url(self):
        license_url = self.root.xpath(
            "string(./RDF/Description/openAccessInformation/userLicense[1])"
        ).extract_first()
        return license_url

    @property
    def material(self):
        if self.article_type in (
                "erratum",
                "addendum",
                "retraction",
                "removal",
                "duplicate",
        ):
            material = self.article_type
        elif self.article_type in ("editorial", "publisher's note"):
            material = "editorial note"
        else:
            material = "publication"
        return material

    @property
    def page_start(self):
        page_start = self.root.xpath(
            "string(./RDF/Description/startingPage[1])").extract_first()
        return page_start

    @property
    def page_end(self):
        page_end = self.root.xpath(
            "string(./RDF/Description/endingPage[1])").extract_first()
        return page_end

    @property
    def publication_date(self):
        publication_date = None
        publication_date_string = self.root.xpath(
            "string(./RDF/Description/coverDisplayDate[1])").extract_first()
        if publication_date_string:
            try:
                publication_date = PartialDate.parse(publication_date_string)
            except:
                # in case when date contains month range,
                # e.g. "July-September 2020": drop the first month
                publication_date = re.sub("[A-aZ-z]*-(?=[A-aZ-z])", "",
                                          publication_date_string)
                publication_date = PartialDate.parse(publication_date)
        return publication_date

    @property
    def publication_info(self):
        """Keyword arguments for ``add_publication_info``."""
        publication_info = {
            "artid": self.artid,
            "journal_title": self.journal_title,
            "journal_issue": self.journal_issue,
            "journal_volume": self.journal_volume,
            "material": self.material,
            "page_start": self.page_start,
            "page_end": self.page_end,
            "year": self.year,
        }
        return publication_info

    @property
    def publisher(self):
        publisher = self.root.xpath("string(./RDF/Description/publisher[1])"
                                    ).extract_first("Elsevier B.V.")
        return publisher

    @property
    def subtitle(self):
        subtitle = self.root.xpath(
            "string(./*/head/subtitle[1])").extract_first()
        return subtitle

    @property
    def title(self):
        title = self.root.xpath("string(./*/head/title[1])").extract_first()
        return title.strip("\n") if title else None

    @property
    def year(self):
        if self.publication_date:
            return self.publication_date.year

    def get_author_affiliations(self, author_node, author_group_node):
        """Extract an author's affiliations."""
        ref_ids = author_node.xpath(".//@refid[contains(., 'af')]").extract()
        group_affs = author_group_node.xpath(
            "string(./affiliation/textfn[1])").getall()
        if ref_ids:
            # Cross-reference ids point at per-group affiliation nodes.
            affiliations = self._find_affiliations_by_id(
                author_group_node, ref_ids)
        else:
            # No explicit refs: the group-level affiliations apply.
            affiliations = filter(None, group_affs)
        return affiliations

    @staticmethod
    def _find_affiliations_by_id(author_group, ref_ids):
        """Return affiliations with given ids.

        Affiliations should be standardized later.
        """
        affiliations_by_id = []
        for aff_id in ref_ids:
            affiliation = author_group.xpath(
                "string(//affiliation[@id='{}']/textfn[1])".format(
                    aff_id)).extract_first()
            affiliations_by_id.append(affiliation)
        return affiliations_by_id

    def get_author_emails(self, author_node):
        """Extract an author's email addresses."""
        emails = author_node.xpath(
            'string(./e-address[@type="email"][1])').getall()
        return emails

    @staticmethod
    def get_author_name(author_node):
        """Extract an author's name as ``surname, given names, suffix``."""
        surname = author_node.xpath("string(./surname[1])").extract_first()
        given_names = author_node.xpath(
            "string(./given-name[1])").extract_first()
        suffix = author_node.xpath("string(.//suffix[1])").extract_first()
        author_name = ", ".join(el for el in (surname, given_names, suffix)
                                if el)
        return author_name

    @staticmethod
    def get_root_node(elsevier_record):
        """Get a selector on the root ``article`` node of the record.

        This can be overridden in case some preprocessing needs to be done on
        the XML.

        Args:
            elsevier_record(Union[str, scrapy.selector.Selector]): the record
                in Elsevier format.

        Returns:
            scrapy.selector.Selector: a selector on the root ``<article>``
                node.
        """
        if isinstance(elsevier_record, six.string_types):
            root = get_node(elsevier_record)
        else:
            root = elsevier_record
        root.remove_namespaces()
        return root

    def get_author(self, author_node, author_group_node):
        """Extract one author.

        Args:
            author_node(scrapy.selector.Selector): a selector on a single
                author, e.g. a ``<contrib contrib-type="author">``.
            author_group_node(scrapy.selector.Selector): a selector on the
                enclosing author group (carries shared affiliations).

        Returns:
            dict: the parsed author, conforming to the Inspire schema.
        """
        author_name = self.get_author_name(author_node)
        emails = self.get_author_emails(author_node)
        affiliations = self.get_author_affiliations(author_node,
                                                    author_group_node)
        return self.builder.make_author(author_name,
                                        raw_affiliations=affiliations,
                                        emails=emails)

    @staticmethod
    def get_reference_authors(ref_node):
        """Extract authors from a reference node.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single
                reference.

        Returns:
            List[str]: list of names
        """
        authors = ref_node.xpath("./contribution/authors/author")
        authors_names = []
        for author in authors:
            given_names = author.xpath(
                "string(./given-name[1])").extract_first(default="")
            last_names = author.xpath("string(./surname[1])").extract_first(
                default="")
            authors_names.append(" ".join([given_names, last_names]).strip())
        return authors_names

    @staticmethod
    def get_reference_editors(ref_node):
        """Extract editors from a reference node.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single
                reference.

        Returns:
            List[str]: list of names
        """
        editors = ref_node.xpath(".//editors/authors/author")
        editors_names = []
        for editor in editors:
            given_names = editor.xpath(
                "string(./given-name[1])").extract_first(default="")
            last_names = editor.xpath("string(./surname[1])").extract_first(
                default="")
            editors_names.append(" ".join([given_names, last_names]).strip())
        return editors_names

    @staticmethod
    def get_reference_artid(ref_node):
        return ref_node.xpath("string(.//article-number[1])").extract_first()

    @staticmethod
    def get_reference_pages(ref_node):
        first_page = ref_node.xpath(
            "string(.//pages/first-page[1])").extract_first()
        last_page = ref_node.xpath(
            "string(.//pages/last-page[1])").extract_first()
        return first_page, last_page

    def get_reference_iter(self, ref_node):
        """Extract one reference.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single
                reference, i.e. ``<ref>``.

        Yields:
            dict: the parsed reference, as generated by
                :class:`inspire_schemas.api.ReferenceBuilder`
        """
        # handle also unstructured refs
        for citation_node in ref_node.xpath("./reference|./other-ref"):
            builder = ReferenceBuilder()

            builder.add_raw_reference(
                ref_node.extract().strip(),
                source=self.builder.source,
                ref_format="Elsevier",
            )

            fields = [
                (
                    "string(.//series/title/maintitle[1])",
                    builder.set_journal_title,
                ),
                (
                    "string(.//title[parent::edited-book|parent::book]/maintitle[1])",
                    builder.add_parent_title,
                ),
                ("string(./publisher/name[1])", builder.set_publisher),
                ("string(.//volume-nr[1])", builder.set_journal_volume),
                ("string(.//issue-nr[1])", builder.set_journal_issue),
                ("string(.//date[1])", builder.set_year),
                ("string(.//inter-ref[1])", builder.add_url),
                ("string(.//doi[1])", builder.add_uid),
                (
                    'string(pub-id[@pub-id-type="other"]'
                    '[contains(preceding-sibling::text(),"Report No")][1])',
                    builder.add_report_number,
                ),
                ("string(./title/maintitle[1])", builder.add_title),
            ]
            for xpath, field_handler in fields:
                # Fixed: the original evaluated ``citation_node.xpath(xpath)``
                # a second time and discarded the result — a dead statement.
                value = citation_node.xpath(xpath).extract_first()
                if value:
                    field_handler(value)

            label_value = ref_node.xpath("string(./label[1])").extract_first()
            builder.set_label(label_value.strip("[]"))

            pages = self.get_reference_pages(citation_node)
            artid = self.get_reference_artid(citation_node)
            if artid:
                builder.set_page_artid(artid=artid)
            if any(pages):
                builder.set_page_artid(*pages)

            # Whatever text remains after stripping the structured fields is
            # kept as miscellaneous reference info.
            remainder = (remove_tags(
                citation_node,
                strip="self::authors"
                "|self::article-number"
                "|self::volume-nr"
                "|self::issue-nr"
                "|self::inter-ref"
                "|self::maintitle"
                "|self::date"
                "|self::label"
                "|self::publisher"
                "|self::doi"
                "|self::pages").strip("\"';,. \t\n\r").replace("()", ""))
            if remainder:
                builder.add_misc(remainder)

            for editor in self.get_reference_editors(citation_node):
                builder.add_author(editor, "editor")
            for author in self.get_reference_authors(citation_node):
                builder.add_author(author, "author")

            yield builder.obj

    def attach_fulltext_document(self, file_name, url):
        self.builder.add_document(file_name, url, fulltext=True, hidden=True)

    def get_identifier(self):
        return self.dois[0]["doi"]

    def should_record_be_harvested(self):
        # Harvest only when the doctype is wanted and the core publication
        # metadata is complete.
        if self.article_type in DOCTYPES_TO_HARVEST and all([
                self.title,
                self.journal_title,
                self.journal_volume,
                (self.artid or self.page_start),
        ]):
            return True
        return False
def authorlist_with_affiliations(text):
    """Return a record containing the authors, including affiliations.

    ``text`` is expected to hold an author list with numeric affiliation
    markers followed by a numbered list of affiliations.
    """
    def _split_name_and_aff_ids(author):
        """Split one author string into the bare name and affiliation ids."""
        match = re.search(
            r'(.+?)(\d+[\,\d]*)', author, flags=re.UNICODE
        )
        name, id_blob = match.groups()
        return name, [token for token in id_blob.split(',') if token.isdigit()]

    def _resolve_affiliations(author, affiliations):
        """Map one author string to ``(fullname, affiliation names)``."""
        try:
            fullname, aff_ids = _split_name_and_aff_ids(author)
        except AttributeError:
            raise AttributeError(
                'Cannot split affiliation IDs from author name. This author '
                'might not have an affiliation at all', author
            )
        try:
            resolved = [affiliations[aff_id] for aff_id in aff_ids]
        except KeyError:
            raise KeyError(
                'There might be multiple affiliations per line or '
                'affiliation IDs might not be separated with commas or '
                'the affiliation is missing. '
                'Problematic author and affiliations',
                author, aff_ids, affiliations
            )
        return (fullname, resolved)

    # Best-effort cleanup of badly formatted input: affiliation ids in author
    # names should be comma-separated and each affiliation should occupy a
    # single line.
    parts = split_authors_affs_pattern.search(text)
    if not parts:
        raise AttributeError('Could not find affiliations')

    author_block = parts.group(1)
    # insert a comma between an affiliation id and the following author name
    author_block = re.sub(r'(\d+)[\n\s](\D)', r'\1, \2', author_block)
    author_block = author_block.replace('\n', '')
    # drop any comma between an author name and its own affiliation ids
    author_block = re.sub(r'(\D)\,[\n\s]*(\d)', r'\1\2', author_block)
    # insert a space between a comma and the next author name
    author_block = re.sub(r'(\d+)\,(\S\D)', r'\1, \2', author_block)
    # remove spaces between the affiliation ids of a single author
    author_block = re.sub(r'(\d+)\,\s(\d+)', r'\1,\2', author_block)
    author_strings = author_block.replace(' and ', ', ').split(', ')

    # Build the id -> affiliation-name lookup table.
    affiliation_lines = [
        line.replace('\n', ' ').strip()
        for line in split_affs_pattern.findall(parts.group(2))
    ]
    affiliations = {}
    for line in affiliation_lines:
        # The affiliation name itself may contain digits, so capture only
        # the leading id.
        matched = re.search(r'^(\d+)\.?\s?(.*)$', line)
        try:
            aff_id, aff_name = matched.groups()
        except (ValueError, AttributeError):
            raise ValueError('Cannot parse affiliation', line)
        affiliations[aff_id] = aff_name

    builder = LiteratureBuilder()
    for author_string in author_strings:
        fullname, author_affs = _resolve_affiliations(author_string,
                                                      affiliations)
        builder.add_author(
            builder.make_author(fullname, raw_affiliations=author_affs))

    return {'authors': builder.record['authors']}
class ArxivParser(object):
    """Parser for the arXiv format.

    It can be used directly by invoking the :func:`ArxivParser.parse` method,
    or be subclassed to customize its behavior.

    Args:
        arxiv_record (Union[str, scrapy.selector.Selector]): the record in
            arXiv format to parse.
        source (Optional[str]): if provided, sets the ``source`` everywhere
            in the record. Otherwise, the source is extracted from the arXiv
            metadata.
    """

    # Shared, pre-configured LaTeX-to-text converter used for titles,
    # abstracts and comments.
    _l2t = LatexNodes2Text(
        latex_context=get_arxiv_latex_context_db(),
        math_mode="verbatim",
        strict_latex_spaces="based-on-source",
        keep_comments=True,
        keep_braced_groups=True,
        keep_braced_groups_minlen=2,
    )

    def __init__(self, arxiv_record, source=None):
        self.root = self.get_root_node(arxiv_record)
        if not source:
            source = 'arXiv'
        self.builder = LiteratureBuilder(source)

    def parse(self):
        """Extract an arXiv record into an Inspire HEP record.

        Returns:
            dict: the same record in the Inspire Literature schema.
        """
        self.builder.add_abstract(abstract=self.abstract, source=self.source)
        self.builder.add_title(title=self.title, source=self.source)
        for license in self.licenses:
            self.builder.add_license(**license)
        for author in self.authors:
            self.builder.add_author(author)
        self.builder.add_number_of_pages(self.number_of_pages)
        self.builder.add_publication_info(**self.publication_info)
        for collab in self.collaborations:
            self.builder.add_collaboration(collab)
        for doi in self.dois:
            self.builder.add_doi(**doi)
        self.builder.add_preprint_date(self.preprint_date)
        if self.public_note:
            self.builder.add_public_note(self.public_note, self.source)
        for rep_number in self.report_numbers:
            self.builder.add_report_number(rep_number, self.source)
        self.builder.add_arxiv_eprint(self.arxiv_eprint,
                                      self.arxiv_categories)
        self.builder.add_private_note(self.private_note)
        self.builder.add_document_type(self.document_type)
        normalized_categories = [
            classify_field(arxiv_cat) for arxiv_cat in self.arxiv_categories
        ]
        self.builder.add_inspire_categories(dedupe_list(normalized_categories),
                                            'arxiv')

        return self.builder.record

    def _get_authors_and_collaborations(self, node):
        """Parse authors, affiliations and collaborations from the record node.

        Heuristics are used to detect collaborations. In case those are not
        reliable, a warning is returned for manual checking.

        Args:
            node (Selector): a selector on a record
        Returns:
            tuple: a tuple of (authors, collaborations, warning)
        """
        author_selectors = node.xpath('.//authors//author')

        # take 'for the' out of the general phrases and dont use it in
        # affiliations
        collab_phrases = [
            'consortium',
            ' collab ',
            'collaboration',
            ' team',
            'group',
            ' on behalf of ',
            ' representing ',
        ]
        inst_phrases = ['institute', 'university', 'department', 'center']

        authors = []
        collaborations = []
        warning_tags = []
        some_affiliation_contains_collaboration = False

        # Two generators over the same author list, the second advanced by
        # one element so each iteration also sees the *next* author (used
        # for the warning message below).
        # NOTE(review): next() here raises StopIteration for an empty author
        # list — confirm callers never hit that case.
        authors_and_affiliations = (
            self._get_author_names_and_affiliations(author)
            for author in author_selectors)
        next_author_and_affiliations = (
            self._get_author_names_and_affiliations(author)
            for author in author_selectors)
        next(next_author_and_affiliations)

        for (forenames, keyname, affiliations), (next_forenames, next_keyname, _) \
                in six.moves.zip_longest(
                    authors_and_affiliations, next_author_and_affiliations,
                    fillvalue=('end of author-list', '', None)):

            name_string = " %s %s " % (forenames, keyname)

            # collaborations in affiliation field? Cautious with 'for the' in
            # Inst names
            affiliations_with_collaborations = []
            affiliations_without_collaborations = []
            for aff in affiliations:
                affiliation_contains_collaboration = any(
                    phrase in aff.lower()
                    for phrase in collab_phrases) and not any(
                        phrase in aff.lower() for phrase in inst_phrases)
                if affiliation_contains_collaboration:
                    affiliations_with_collaborations.append(aff)
                    some_affiliation_contains_collaboration = True
                else:
                    affiliations_without_collaborations.append(aff)
            for aff in affiliations_with_collaborations:
                coll, author_name = coll_cleanforthe(aff)
                if coll and coll not in collaborations:
                    collaborations.append(coll)

            # Check if name is a collaboration, else append to authors
            collaboration_in_name = ' for the ' in name_string.lower() or any(
                phrase in name_string.lower() for phrase in collab_phrases)
            if collaboration_in_name:
                coll, author_name = coll_cleanforthe(name_string)
                if author_name:
                    surname, given_names = split_fullname(author_name)
                    authors.append({
                        'full_name': surname + ', ' + given_names,
                        'surname': surname,
                        'given_names': given_names,
                        'affiliations': [],
                    })
                if coll and coll not in collaborations:
                    collaborations.append(coll)
            elif name_string.strip() == ':':
                # DANGERZONE : this might not be correct - add a warning
                # for the cataloger
                warning_tags.append(' %s %s ' % (next_forenames, next_keyname))
                if not some_affiliation_contains_collaboration:
                    # everything up to now seems to be collaboration info
                    for author_info in authors:
                        name_string = " %s %s " % \
                            (author_info['given_names'], author_info['surname'])
                        coll, author_name = coll_cleanforthe(name_string)
                        if coll and coll not in collaborations:
                            collaborations.append(coll)
                    authors = []
            else:
                authors.append({
                    'full_name': keyname + ', ' + forenames,
                    'surname': keyname,
                    'given_names': forenames,
                    'affiliations': affiliations_without_collaborations
                })

        if warning_tags:
            warning = 'WARNING: Colon in authors before %s: Check author list for collaboration names!' % ', '.join(
                warning_tags)
        else:
            warning = ''
        return authors, collaborations, warning

    @staticmethod
    def _get_author_names_and_affiliations(author_node):
        # Returns (forenames, keyname, affiliations) for one <author> node.
        forenames = u' '.join(
            author_node.xpath('.//forenames//text()').extract())
        keyname = u' '.join(author_node.xpath('.//keyname//text()').extract())
        affiliations = author_node.xpath('.//affiliation//text()').extract()

        return forenames, keyname, affiliations

    @property
    def preprint_date(self):
        preprint_date = self.root.xpath('.//created/text()').extract_first()

        return preprint_date

    @property
    def abstract(self):
        abstract = self.root.xpath('.//abstract/text()').extract_first()
        # NOTE(review): fix_long_text calls re.sub on its argument, so a
        # missing <abstract> node (None) would raise TypeError here.
        long_text_fixed = self.fix_long_text(abstract)
        return self.latex_to_unicode(long_text_fixed)

    @property
    def authors(self):
        authors, _, _ = self.authors_and_collaborations
        parsed_authors = [
            self.builder.make_author(full_name=auth["full_name"],
                                     raw_affiliations=auth["affiliations"])
            for auth in authors
        ]

        return parsed_authors

    @property
    def collaborations(self):
        _, collaborations, _ = self.authors_and_collaborations

        return collaborations

    @property
    def dois(self):
        # A single <doi> node may carry several DOIs; split them apart.
        doi_values = self.root.xpath('.//doi/text()').extract()
        doi_values_splitted = chain.from_iterable(
            [re.split(RE_DOIS, doi) for doi in doi_values])
        dois = [{
            'doi': value,
            'material': 'publication'
        } for value in doi_values_splitted]

        return dois

    @property
    def licenses(self):
        licenses = self.root.xpath('.//license/text()').extract()
        return [{
            'url': license,
            'material': self.material
        } for license in licenses]

    @property
    def material(self):
        # arXiv records are always preprints.
        return 'preprint'

    @property
    def number_of_pages(self):
        comments = '; '.join(self.root.xpath('.//comments/text()').extract())

        found_pages = RE_PAGES.search(comments)
        if found_pages:
            pages = found_pages.group(1)
            return maybe_int(pages)

        return None

    @property
    def publication_info(self):
        publication_info = {
            'material': 'publication',
            'pubinfo_freetext': self.pubinfo_freetext,
        }

        return publication_info

    @property
    def pubinfo_freetext(self):
        return self.root.xpath('.//journal-ref/text()').extract_first()

    @property
    def title(self):
        long_text_fixed = self.fix_long_text(
            self.root.xpath('.//title/text()').extract_first())
        return self.latex_to_unicode(long_text_fixed)

    @staticmethod
    def fix_long_text(text):
        # Collapse runs of whitespace (incl. newlines) into single spaces.
        return re.sub(r'\s+', ' ', text).strip()

    @staticmethod
    def get_root_node(arxiv_record):
        """Get a selector on the root ``article`` node of the record.

        This can be overridden in case some preprocessing needs to be done on
        the XML.

        Args:
            arxiv_record(Union[str, scrapy.selector.Selector]): the record in
                arXiv format.

        Returns:
            scrapy.selector.Selector: a selector on the root ``<article>``
                node.
        """
        if isinstance(arxiv_record, six.string_types):
            root = get_node(arxiv_record)
        else:
            root = arxiv_record
        root.remove_namespaces()

        return root

    @property
    def public_note(self):
        comments = '; '.join(self.root.xpath('.//comments/text()').extract())

        return self.latex_to_unicode(comments)

    @property
    def private_note(self):
        # The warning produced by the collaboration-detection heuristics.
        _, _, warning = self.authors_and_collaborations

        return warning

    @property
    def report_numbers(self):
        report_numbers = self.root.xpath('.//report-no/text()').extract()
        rns = []
        for rn in report_numbers:
            rns.extend(rn.split(', '))

        return rns

    @property
    def arxiv_eprint(self):
        return self.root.xpath('.//id/text()').extract_first()

    @property
    def arxiv_categories(self):
        categories = self.root.xpath('.//categories/text()').extract_first(
            default='[]')
        categories = categories.split()
        categories_without_old = [
            normalize_arxiv_category(arxiv_cat) for arxiv_cat in categories
        ]

        return dedupe_list(categories_without_old)

    @property
    def document_type(self):
        # Infer thesis / conference paper from the free-text comments field.
        comments = '; '.join(self.root.xpath('.//comments/text()').extract())

        doctype = 'article'
        if RE_THESIS.search(comments):
            doctype = 'thesis'
        elif RE_CONFERENCE.search(comments):
            doctype = 'conference paper'

        return doctype

    @property
    def source(self):
        return 'arXiv'

    @property
    def authors_and_collaborations(self):
        # Lazily computed and cached on the instance: the underlying
        # parse is expensive and used by several properties.
        if not hasattr(self, '_authors_and_collaborations'):
            self._authors_and_collaborations = \
                self._get_authors_and_collaborations(self.root)
        return self._authors_and_collaborations

    @classmethod
    def latex_to_unicode(cls, latex_string):
        try:
            # NOTE(review): the first replace() argument may be a
            # non-breaking space (U+00A0) in the original bytes — confirm
            # against the repository before editing this line.
            return cls._l2t.latex_to_text(latex_string).replace(" ", " ")
        except Exception as e:
            # Best-effort: on any conversion failure, fall back to the raw
            # LaTeX string rather than dropping the field.
            return latex_string
def crawler2hep(crawler_record):
    """Convert a raw hepcrawl crawler record into an Inspire Literature record.

    Args:
        crawler_record (dict): the record as produced by a hepcrawl spider,
            with list-valued fields such as ``authors``, ``titles``, ``dois``.

    Returns:
        dict: the validated record in the Inspire Literature schema.

    Raises:
        Whatever ``LiteratureBuilder.validate_record`` raises when the built
        record does not conform to the schema.
    """
    def _filter_affiliation(affiliations):
        # Keep only the non-empty affiliation values.
        return [
            affiliation.get('value')
            for affiliation in affiliations
            if affiliation.get('value')
        ]

    builder = LiteratureBuilder('hepcrawl')

    for author in crawler_record.get('authors', []):
        builder.add_author(
            builder.make_author(
                author['full_name'],
                # .get with default: some spiders omit the affiliations key.
                affiliations=_filter_affiliation(author.get('affiliations', [])),
            ))

    for title in crawler_record.get('titles', []):
        builder.add_title(title=title.get('title'), source=title.get('source'))

    for abstract in crawler_record.get('abstracts', []):
        builder.add_abstract(abstract=abstract.get('value'),
                             source=abstract.get('source'))

    for arxiv_eprint in crawler_record.get('arxiv_eprints', []):
        builder.add_arxiv_eprint(
            arxiv_id=arxiv_eprint.get('value'),
            arxiv_categories=arxiv_eprint.get('categories'))

    for doi in crawler_record.get('dois', []):
        builder.add_doi(doi=doi.get('value'))

    for public_note in crawler_record.get('public_notes', []):
        builder.add_public_note(public_note=public_note.get('value'),
                                source=public_note.get('source'))

    # ``license_entry`` rather than ``license`` to avoid shadowing the builtin.
    for license_entry in crawler_record.get('license', []):
        builder.add_license(url=license_entry.get('url'),
                            license=license_entry.get('license'))

    for collaboration in crawler_record.get('collaborations', []):
        builder.add_collaboration(collaboration=collaboration.get('value'))

    for imprint in crawler_record.get('imprints', []):
        builder.add_imprint_date(imprint_date=imprint.get('date'))

    # ``copyright_entry`` rather than ``copyright`` to avoid shadowing the builtin.
    for copyright_entry in crawler_record.get('copyright', []):
        builder.add_copyright(holder=copyright_entry.get('holder'),
                              material=copyright_entry.get('material'),
                              statement=copyright_entry.get('statement'))

    builder.add_preprint_date(
        preprint_date=crawler_record.get('preprint_date'))

    acquisition_source = crawler_record.get('acquisition_source', {})
    builder.add_acquisition_source(
        method='hepcrawl',
        date=acquisition_source.get('date'),
        source=acquisition_source.get('source'),
        submission_number=acquisition_source.get('submission_number'))

    try:
        builder.add_number_of_pages(
            number_of_pages=int(crawler_record.get('page_nr', [])[0]))
    except (TypeError, ValueError, IndexError):
        # page_nr may be missing, empty, or non-numeric; pages are optional.
        pass

    publication_types = [
        'introductory',
        'lectures',
        'review',
    ]

    special_collections = [
        'cdf-internal-note',
        'cdf-note',
        'cds',
        'd0-internal-note',
        'd0-preliminary-note',
        'h1-internal-note',
        'h1-preliminary-note',
        'halhidden',
        'hephidden',
        'hermes-internal-note',
        'larsoft-internal-note',
        'larsoft-note',
        'zeus-internal-note',
        'zeus-preliminary-note',
    ]

    document_types = [
        'book',
        'note',
        'report',
        'proceedings',
        'thesis',
    ]

    added_doc_type = False

    # Map legacy collection names onto builder flags / document types.
    for collection in crawler_record.get('collections', []):
        collection = collection['primary'].strip().lower()
        if collection == 'arxiv':
            continue  # ignored
        elif collection == 'citeable':
            builder.set_citeable(True)
        elif collection == 'core':
            builder.set_core(True)
        elif collection == 'noncore':
            builder.set_core(False)
        elif collection == 'published':
            builder.set_refereed(True)
        elif collection == 'withdrawn':
            builder.set_withdrawn(True)
        elif collection in publication_types:
            builder.add_publication_type(collection)
        elif collection in special_collections:
            builder.add_special_collection(collection.upper())
        elif collection == 'bookchapter':
            added_doc_type = True
            builder.add_document_type('book chapter')
        elif collection == 'conferencepaper':
            added_doc_type = True
            builder.add_document_type('conference paper')
        elif collection in document_types:
            added_doc_type = True
            builder.add_document_type(collection)

    if not added_doc_type:
        builder.add_document_type('article')

    # ``or [{}]`` guards against an explicit empty list, which would have made
    # the ``[0]`` raise IndexError (``.get`` only defaults on a missing key).
    _pub_info = (crawler_record.get('publication_info') or [{}])[0]
    builder.add_publication_info(
        year=_pub_info.get('year'),
        artid=_pub_info.get('artid'),
        page_end=_pub_info.get('page_end'),
        page_start=_pub_info.get('page_start'),
        journal_issue=_pub_info.get('journal_issue'),
        journal_title=_pub_info.get('journal_title'),
        journal_volume=_pub_info.get('journal_volume'),
        pubinfo_freetext=_pub_info.get('pubinfo_freetext'),
    )

    for report_number in crawler_record.get('report_numbers', []):
        builder.add_report_number(report_number=report_number.get('value'),
                                  source=report_number.get('source'))

    builder.validate_record()

    return builder.record
class JatsParser(object):
    """Parser for the JATS format.

    It can be used directly by invoking the :func:`JatsParser.parse` method,
    or be subclassed to customize its behavior.

    Args:
        jats_record (Union[str, scrapy.selector.Selector]): the record in
            JATS format to parse.
        source (Optional[str]): if provided, sets the ``source`` everywhere
            in the record. Otherwise, the source is extracted from the JATS
            metadata.
    """
    def __init__(self, jats_record, source=None):
        self.root = self.get_root_node(jats_record)
        if not source:
            # Fall back to the publisher name found in the JATS metadata.
            source = self.publisher
        self.builder = LiteratureBuilder(source)

    def parse(self):
        """Extract a JATS record into an Inspire HEP record.

        Returns:
            dict: the same record in the Inspire Literature schema.
        """
        self.builder.add_abstract(self.abstract)
        self.builder.add_title(self.title, subtitle=self.subtitle)
        self.builder.add_copyright(**self.copyright)
        self.builder.add_document_type(self.document_type)
        self.builder.add_license(**self.license)
        for author in self.authors:
            self.builder.add_author(author)
        self.builder.add_number_of_pages(self.number_of_pages)
        self.builder.add_publication_info(**self.publication_info)
        for collab in self.collaborations:
            self.builder.add_collaboration(collab)
        for doi in self.dois:
            self.builder.add_doi(**doi)
        for keyword in self.keywords:
            self.builder.add_keyword(**keyword)
        self.builder.add_imprint_date(self.publication_date.dumps())
        for reference in self.references:
            self.builder.add_reference(reference)

        return self.builder.record

    @property
    def references(self):
        """Extract a JATS record into an Inspire HEP references record.

        Returns:
            List[dict]: an array of reference schema records, representing
                the references in the record
        """
        ref_nodes = self.root.xpath('./back/ref-list/ref')
        # get_reference is a generator yielding zero or more references per
        # <ref>, hence the chain.from_iterable flattening.
        return list(
            itertools.chain.from_iterable(
                self.get_reference(node) for node in ref_nodes
            )
        )

    # Configuration for remove_tags when cleaning abstracts: keep sup/sub
    # and whole <math> trees, drop pub-id/issn nodes entirely.
    remove_tags_config_abstract = {
        'allowed_tags': ['sup', 'sub'],
        'allowed_trees': ['math'],
        'strip': 'self::pub-id|self::issn'
    }

    @property
    def abstract(self):
        """Return the cleaned abstract text, or ``None`` if there is none."""
        abstract_nodes = self.root.xpath('./front//abstract[1]')

        if not abstract_nodes:
            return

        abstract = remove_tags(abstract_nodes[0], **self.remove_tags_config_abstract).strip()
        return abstract

    @property
    def article_type(self):
        # The JATS @article-type attribute of the root <article> element.
        article_type = self.root.xpath('./@article-type').extract_first()

        return article_type

    @property
    def artid(self):
        artid = self.root.xpath('./front/article-meta//elocation-id//text()').extract_first()

        return artid

    @property
    def authors(self):
        """Return the parsed authors, in Inspire schema form."""
        author_nodes = self.root.xpath('./front//contrib[@contrib-type="author"]')
        authors = [self.get_author(author) for author in author_nodes]

        return authors

    @property
    def collaborations(self):
        """Return the set of collaboration names found in the front matter."""
        collab_nodes = self.root.xpath(
            './front//collab |'
            './front//contrib[@contrib-type="collaboration"] |'
            './front//on-behalf-of'
        )
        collaborations = set(
            collab.xpath('string(.)').extract_first()
            for collab in collab_nodes
        )

        return collaborations

    @property
    def copyright(self):
        # Keys match the keyword arguments of LiteratureBuilder.add_copyright.
        copyright = {
            'holder': self.copyright_holder,
            'material': self.material,
            'statement': self.copyright_statement,
            'year': self.copyright_year,
        }

        return copyright

    @property
    def copyright_holder(self):
        copyright_holder = self.root.xpath('./front//copyright-holder/text()').extract_first()

        return copyright_holder

    @property
    def copyright_statement(self):
        copyright_statement = self.root.xpath('./front//copyright-statement/text()').extract_first()

        return copyright_statement

    @property
    def copyright_year(self):
        copyright_year = self.root.xpath('./front//copyright-year/text()').extract_first()

        return maybe_int(copyright_year)

    @property
    def dois(self):
        """Return the record DOIs plus, for non-publications, related DOIs."""
        doi_values = self.root.xpath('./front/article-meta//article-id[@pub-id-type="doi"]/text()').extract()
        dois = [
            {'doi': value, 'material': self.material} for value in doi_values
        ]

        if self.material != 'publication':
            # e.g. an erratum also carries the DOI of the article it corrects.
            doi_values = self.root.xpath(
                './front/article-meta//related-article[@ext-link-type="doi"]/@href'
            ).extract()
            related_dois = ({'doi': value} for value in doi_values)
            dois.extend(related_dois)

        return dois

    @property
    def document_type(self):
        if self.is_conference_paper:
            document_type = 'conference paper'
        else:
            document_type = 'article'

        return document_type

    @property
    def is_conference_paper(self):
        """Decide whether the article is a conference paper."""
        conference_node = self.root.xpath('./front//conference').extract_first()

        return bool(conference_node)

    @property
    def journal_title(self):
        # Prefer the abbreviated journal title when present.
        journal_title = self.root.xpath(
            './front/journal-meta//abbrev-journal-title/text() |'
            './front/journal-meta//journal-title/text()'
        ).extract_first()

        return journal_title

    @property
    def journal_issue(self):
        journal_issue = self.root.xpath('./front/article-meta/issue/text()').extract_first()

        return journal_issue

    @property
    def journal_volume(self):
        journal_volume = self.root.xpath('./front/article-meta/volume/text()').extract_first()

        return journal_volume

    @property
    def keywords(self):
        """Yield keyword dicts from every ``<kwd-group>`` in the front matter."""
        keyword_groups = self.root.xpath('./front//kwd-group')
        keywords = itertools.chain.from_iterable(self.get_keywords(group) for group in keyword_groups)

        return keywords

    @property
    def license(self):
        # Keys match the keyword arguments of LiteratureBuilder.add_license.
        license = {
            'license': self.license_statement,
            'material': self.material,
            'url': self.license_url,
        }

        return license

    @property
    def license_statement(self):
        license_statement = self.root.xpath('string(./front/article-meta//license)').extract_first().strip()

        return license_statement

    @property
    def license_url(self):
        # Try the several places publishers put the license URL.
        url_nodes = (
            './front/article-meta//license_ref/text() |'
            './front/article-meta//license/@href |'
            './front/article-meta//license//ext-link/@href'
        )
        license_url = self.root.xpath(url_nodes).extract_first()

        return license_url

    @property
    def material(self):
        """Map the JATS article type onto an Inspire ``material`` value."""
        if self.article_type.startswith('correc'):
            material = 'erratum'
        elif self.article_type in ('erratum', 'translation', 'addendum', 'reprint'):
            material = self.article_type
        else:
            material = 'publication'

        return material

    @property
    def number_of_pages(self):
        number_of_pages = maybe_int(self.root.xpath('./front/article-meta//page-count/@count').extract_first())

        return number_of_pages

    @property
    def page_start(self):
        page_start = self.root.xpath('./front/article-meta/fpage/text()').extract_first()

        return page_start

    @property
    def page_end(self):
        page_end = self.root.xpath('./front/article-meta/lpage/text()').extract_first()

        return page_end

    @property
    def publication_date(self):
        """Return the earliest publication date found in the record."""
        date_nodes = self.root.xpath(
            './front//pub-date[@pub-type="ppub"] |'
            './front//pub-date[@pub-type="epub"] |'
            './front//pub-date[starts-with(@date-type,"pub")] |'
            './front//date[starts-with(@date-type,"pub")]'
        )

        publication_date = min(
            self.get_date(date_node) for date_node in date_nodes
        )

        return publication_date

    @property
    def publication_info(self):
        # Keys match the keyword arguments of
        # LiteratureBuilder.add_publication_info.
        publication_info = {
            'artid': self.artid,
            'journal_title': self.journal_title,
            'journal_issue': self.journal_issue,
            'journal_volume': self.journal_volume,
            'material': self.material,
            'page_start': self.page_start,
            'page_end': self.page_end,
            'year': self.year,
        }

        return publication_info

    @property
    def publisher(self):
        publisher = self.root.xpath('./front//publisher-name/text()').extract_first()

        return publisher

    @property
    def subtitle(self):
        subtitle = self.root.xpath('string(./front//subtitle)').extract_first()

        return subtitle

    @property
    def title(self):
        title = self.root.xpath('string(./front//article-title)').extract_first()

        return title

    def get_affiliation(self, id_):
        """Get the affiliation with the specified id.

        Args:
            id_(str): the value of the ``id`` attribute of the affiliation.

        Returns:
            Optional[str]: the affiliation with that id or ``None`` if there
                is no match.
        """
        affiliation_node = self.root.xpath('//aff[@id=$id_]', id_=id_)[0]
        affiliation = remove_tags(
            affiliation_node,
            strip='self::label | self::email'
        ).strip()

        return affiliation

    def get_emails_from_refs(self, id_):
        """Get the emails from the node with the specified id.

        Args:
            id_(str): the value of the ``id`` attribute of the node.

        Returns:
            List[str]: the emails from the node with that id or [] if none
                found.
        """
        email_nodes = self.root.xpath('//aff[@id=$id_]/email/text()', id_=id_)
        return email_nodes.extract()

    @property
    def year(self):
        """Return the year of the earliest non-electronic publication date."""
        # NOTE(review): this string is missing a closing ')' after "online",
        # and it is bound below as an XPath *string variable*, so inside the
        # predicate ``$not_online`` evaluates as a non-empty string (always
        # true) rather than as the intended sub-expression — verify intent.
        not_online = (
            'not(starts-with(@publication-format, "elec"))'
            ' and not(starts-with(@publication-format, "online")'
        )
        date_nodes = self.root.xpath(
            './front//pub-date[@pub-type="ppub"] |'
            './front//pub-date[starts-with(@date-type,"pub") and $not_online] |'
            './front//date[starts-with(@date-type,"pub") and $not_online]',
            not_online=not_online
        )

        year = min(
            self.get_date(date_node) for date_node in date_nodes
        ).year

        return year

    def get_author_affiliations(self, author_node):
        """Extract an author's affiliations."""
        raw_referred_ids = author_node.xpath('.//xref[@ref-type="aff"]/@rid').extract()
        # Sometimes the rid might have more than one ID (e.g. rid="id0 id1")
        referred_ids = set()
        for raw_referred_id in raw_referred_ids:
            referred_ids.update(set(raw_referred_id.split(' ')))

        # NOTE(review): get_affiliation is called twice per id (once in the
        # filter, once for the value) — could be hoisted into one call.
        affiliations = [
            self.get_affiliation(rid) for rid in referred_ids
            if self.get_affiliation(rid)
        ]

        return affiliations

    def get_author_emails(self, author_node):
        """Extract an author's email addresses."""
        emails = author_node.xpath('.//email/text()').extract()
        # Also collect emails attached to the author's referenced <aff> nodes.
        referred_ids = author_node.xpath('.//xref[@ref-type="aff"]/@rid').extract()
        for referred_id in referred_ids:
            emails.extend(self.get_emails_from_refs(referred_id))

        return emails

    @staticmethod
    def get_author_name(author_node):
        """Extract an author's name."""
        surname = author_node.xpath('.//surname/text()').extract_first()
        if not surname:
            # the author name is unstructured
            author_name = author_node.xpath('string(./string-name)').extract_first()
        given_names = author_node.xpath('.//given-names/text()').extract_first()
        suffix = author_node.xpath('.//suffix/text()').extract_first()
        # NOTE(review): when surname is empty, the unstructured name read
        # above is immediately overwritten by this join — likely a bug;
        # verify whether the join should only run when surname is present.
        author_name = ', '.join(el for el in (surname, given_names, suffix) if el)

        return author_name

    @staticmethod
    def get_date(date_node):
        """Extract a date from a date node.

        Returns:
            PartialDate: the parsed date.
        """
        # Try, in order of reliability: the ISO attribute, the structured
        # year/month/day children, then a free-text <string-date>.
        iso_string = date_node.xpath('./@iso-8601-date').extract_first()
        iso_date = PartialDate.loads(iso_string) if iso_string else None
        year = date_node.xpath('string(./year)').extract_first()
        month = date_node.xpath('string(./month)').extract_first()
        day = date_node.xpath('string(./day)').extract_first()
        date_from_parts = PartialDate.from_parts(year, month, day) if year else None
        string_date = date_node.xpath('string(./string-date)').extract_first()
        try:
            parsed_date = PartialDate.parse(string_date)
        except ValueError:
            parsed_date = None
        date = get_first([iso_date, date_from_parts, parsed_date])

        return date

    @staticmethod
    def get_keywords(group_node):
        """Extract keywords from a keyword group."""
        schema = None
        if 'pacs' in group_node.xpath('@kwd-group-type').extract_first(default='').lower():
            schema = 'PACS'

        keywords = (kwd.xpath('string(.)').extract_first() for kwd in group_node.xpath('.//kwd'))
        keyword_dicts = ({'keyword': keyword, 'schema': schema} for keyword in keywords)

        return keyword_dicts

    @staticmethod
    def get_root_node(jats_record):
        """Get a selector on the root ``article`` node of the record.

        This can be overridden in case some preprocessing needs to be done on
        the XML.

        Args:
            jats_record(Union[str, scrapy.selector.Selector]): the record in
                JATS format.

        Returns:
            scrapy.selector.Selector: a selector on the root ``<article>``
                node.
        """
        if isinstance(jats_record, six.string_types):
            root = get_node(jats_record)
        else:
            root = jats_record
        # Strip XML namespaces so the xpaths in this class stay short.
        root.remove_namespaces()

        return root

    def get_author(self, author_node):
        """Extract one author.

        Args:
            author_node(scrapy.selector.Selector): a selector on a single
                author, e.g. a ``<contrib contrib-type="author">``.

        Returns:
            dict: the parsed author, conforming to the Inspire schema.
        """
        author_name = self.get_author_name(author_node)
        emails = self.get_author_emails(author_node)
        affiliations = self.get_author_affiliations(author_node)
        return self.builder.make_author(author_name, raw_affiliations=affiliations, emails=emails)

    @staticmethod
    def get_reference_authors(ref_node, role):
        """Extract authors of `role` from a reference node.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single
                reference.
            role(str): author role

        Returns:
            List[str]: list of names
        """
        return ref_node.xpath(
            './person-group[@person-group-type=$role]/string-name/text()',
            role=role
        ).extract()

    def get_reference(self, ref_node):
        """Extract one reference.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single
                reference, i.e. ``<ref>``.

        Returns:
            dict: the parsed reference, as generated by
                :class:`inspire_schemas.api.ReferenceBuilder`
        """
        # A single <ref> may hold several <mixed-citation> entries; yield one
        # reference per citation.
        for citation_node in ref_node.xpath('./mixed-citation'):
            builder = ReferenceBuilder()
            builder.add_raw_reference(
                ref_node.extract().strip(),
                source=self.builder.source,
                ref_format='JATS'
            )

            # (xpath relative to the citation, builder method to feed).
            fields = [
                (
                    (
                        'self::node()[@publication-type="journal" '
                        'or @publication-type="eprint"]/source/text()'
                    ),
                    builder.set_journal_title,
                ),
                (
                    'self::node()[@publication-type="book"]/source/text()',
                    builder.add_parent_title,
                ),
                ('./publisher-name/text()', builder.set_publisher),
                ('./volume/text()', builder.set_journal_volume),
                ('./issue/text()', builder.set_journal_issue),
                ('./year/text()', builder.set_year),
                ('./pub-id[@pub-id-type="arxiv"]/text()', builder.add_uid),
                ('./pub-id[@pub-id-type="doi"]/text()', builder.add_uid),
                (
                    'pub-id[@pub-id-type="other"]'
                    '[contains(preceding-sibling::text(),"Report No")]/text()',
                    builder.add_report_number
                ),
                ('./article-title/text()', builder.add_title),
                ('../label/text()', lambda x: builder.set_label(x.strip('[].')))
            ]

            for xpath, field_handler in fields:
                value = citation_node.xpath(xpath).extract_first()
                # NOTE(review): the line below re-runs the xpath and discards
                # the result — appears to be dead code; verify before removal.
                citation_node.xpath(xpath)
                if value:
                    field_handler(value)

            # Everything not consumed above goes into the misc field.
            remainder = remove_tags(
                citation_node,
                strip='self::person-group'
                      '|self::pub-id'
                      '|self::article-title'
                      '|self::volume'
                      '|self::issue'
                      '|self::year'
                      '|self::label'
                      '|self::publisher-name'
                      '|self::source[../@publication-type!="proc"]'
                      '|self::object-id'
                      '|self::page-range'
                      '|self::issn'
            ).strip('"\';,. \t\n\r').replace('()', '')
            if remainder:
                builder.add_misc(remainder)

            for editor in self.get_reference_authors(citation_node, 'editor'):
                builder.add_author(editor, 'editor')
            for author in self.get_reference_authors(citation_node, 'author'):
                builder.add_author(author, 'author')

            page_range = citation_node.xpath('./page-range/text()').extract_first()
            if page_range:
                page_artid = split_page_artid(page_range)
                builder.set_page_artid(*page_artid)

            yield builder.obj

    def attach_fulltext_document(self, file_name, url):
        # Attach the fulltext as a hidden document on the record being built.
        self.builder.add_document(file_name, url, fulltext=True, hidden=True)