def authorlist(text):
    """Parse free text into an author structure.

    The text is cleaned with ``replace_undesirable_characters`` and handed to
    ``create_authors``. When the parsed result has an ``authors`` entry, each
    ``(full name, affiliations)`` pair is turned into an author through a
    ``LiteratureBuilder`` and the entry is replaced by the builder's
    ``authors`` list.

    Args:
        text: the raw text containing the author list.

    Returns:
        the structure returned by ``create_authors``, with ``authors``
        rewritten as described above when that key is present.
    """
    from inspire_schemas.api import LiteratureBuilder
    from refextract.documents.pdf import replace_undesirable_characters
    from inspirehep.modules.tools.authorlist import create_authors

    literature = LiteratureBuilder()
    parsed = create_authors(replace_undesirable_characters(text))

    # Nothing to convert: hand back whatever the parser produced.
    if 'authors' not in parsed:
        return parsed

    for name, raw_affiliations in parsed['authors']:
        author = literature.make_author(name, raw_affiliations=raw_affiliations)
        literature.add_author(author)

    parsed['authors'] = literature.record['authors']
    return parsed
def formdata_to_model(obj, formdata):
    """Manipulate form data to match literature data model.

    Args:
        obj: object exposing ``id``, ``id_user`` and ``extra_data``. ``id``
            and ``id_user`` are written into the acquisition source, and
            ``extra_data['submission_pdf']`` is set when a non-arXiv URL was
            submitted.
        formdata (dict): the submitted form fields. A deep copy is used, so
            the caller's dictionary is left untouched.

    Returns:
        dict: the record assembled by the ``LiteratureBuilder``.
    """
    def _is_arxiv_url(url):
        return 'arxiv.org' in url

    form_fields = copy.deepcopy(formdata)
    filter_empty_elements(
        form_fields, ['authors', 'supervisors', 'report_numbers']
    )

    builder = LiteratureBuilder(source='submitter')

    for author in form_fields.get('authors', []):
        builder.add_author(builder.make_author(
            author['full_name'],
            affiliations=force_list(author['affiliation'])
            if author['affiliation'] else None,
            roles=['author'],
        ))

    for supervisor in form_fields.get('supervisors', []):
        # The condition must look at the supervisor's own affiliation; it used
        # to read the leftover ``author`` variable from the loop above.
        builder.add_author(builder.make_author(
            supervisor['full_name'],
            affiliations=force_list(supervisor['affiliation'])
            if supervisor['affiliation'] else None,
            roles=['supervisor'],
        ))

    builder.add_title(title=form_fields.get('title'))

    # A conference name takes precedence over the selected document type.
    document_type = 'conference paper' if form_fields.get('conf_name') \
        else form_fields.get('type_of_doc', [])
    if document_type == 'chapter':
        document_type = 'book chapter'

    builder.add_document_type(document_type=document_type)

    builder.add_abstract(
        abstract=form_fields.get('abstract'),
        source='arXiv' if form_fields.get('categories') else None,
    )

    if form_fields.get('arxiv_id') and form_fields.get('categories'):
        builder.add_arxiv_eprint(
            arxiv_id=form_fields.get('arxiv_id'),
            arxiv_categories=form_fields.get('categories').split(),
        )

    builder.add_doi(doi=form_fields.get('doi'))

    builder.add_inspire_categories(
        subject_terms=form_fields.get('subject_term'),
        source='user',
    )

    for key in ('extra_comments', 'nonpublic_note',
                'hidden_notes', 'conf_name', 'references'):
        builder.add_private_note(private_notes=form_fields.get(key))

    # The year is only kept when it can be read as an integer.
    year = form_fields.get('year')
    try:
        year = int(year)
    except (TypeError, ValueError):
        year = None

    builder.add_preprint_date(
        preprint_date=form_fields.get('preprint_created')
    )

    if form_fields.get('type_of_doc') == 'thesis':
        builder.add_thesis(
            defense_date=form_fields.get('defense_date'),
            degree_type=form_fields.get('degree_type'),
            institution=form_fields.get('institution'),
            date=form_fields.get('thesis_date'),
        )

    if form_fields.get('type_of_doc') == 'chapter':
        if not form_fields.get('journal_title'):
            builder.add_book_series(title=form_fields.get('series_title'))

    if form_fields.get('type_of_doc') == 'book':
        if form_fields.get('journal_title'):
            # With a journal title, the series volume is reused as the
            # journal volume of the publication info added below.
            form_fields['volume'] = form_fields.get('series_volume')
        else:
            builder.add_book_series(
                title=form_fields.get('series_title'),
                volume=form_fields.get('series_volume'),
            )
        builder.add_book(
            publisher=form_fields.get('publisher_name'),
            place=form_fields.get('publication_place'),
            date=form_fields.get('publication_date'),
        )

    builder.add_publication_info(
        year=year,
        cnum=form_fields.get('conference_id'),
        journal_issue=form_fields.get('issue'),
        journal_title=form_fields.get('journal_title'),
        journal_volume=form_fields.get('volume'),
        page_start=form_fields.get('start_page'),
        page_end=form_fields.get('end_page'),
        artid=form_fields.get('artid'),
        parent_record=form_fields.get('parent_book'),
    )

    builder.add_accelerator_experiments_legacy_name(
        legacy_name=form_fields.get('experiment')
    )

    language = form_fields.get('other_language') \
        if form_fields.get('language') == 'oth' \
        else form_fields.get('language')
    builder.add_language(language=language)

    if form_fields.get('title_translation'):
        builder.add_title_translation(
            title=form_fields['title_translation'],
            language='en',
        )

    builder.add_title(title=form_fields.get('title_arXiv'), source='arXiv')
    builder.add_title(
        title=form_fields.get('title_crossref'), source='crossref'
    )

    builder.add_license(url=form_fields.get('license_url'))

    builder.add_public_note(public_note=form_fields.get('public_notes'))
    builder.add_public_note(
        public_note=form_fields.get('note'),
        source='arXiv' if form_fields.get('categories') else 'CrossRef',
    )

    form_url = form_fields.get('url')
    form_additional_url = form_fields.get('additional_url')
    if form_url and not _is_arxiv_url(form_url):
        obj.extra_data['submission_pdf'] = form_url
        # The main URL is only added to the record when no additional URL
        # was given.
        if not form_additional_url:
            builder.add_url(url=form_url)

    if form_additional_url and not _is_arxiv_url(form_additional_url):
        builder.add_url(url=form_additional_url)

    for report_number in form_fields.get('report_numbers', []):
        builder.add_report_number(
            report_number=report_number.get('report_number')
        )

    builder.add_collaboration(collaboration=form_fields.get('collaboration'))

    builder.add_acquisition_source(
        datetime=datetime.datetime.utcnow().isoformat(),
        submission_number=obj.id,
        internal_uid=int(obj.id_user),
        email=form_fields.get('email'),
        orcid=form_fields.get('orcid'),
        method='submitter',
    )

    return builder.record
class JatsParser(object):
    """Parser for the JATS format.

    It can be used directly by invoking the :func:`JatsParser.parse` method, or be
    subclassed to customize its behavior.

    Args:
        jats_record (Union[str, scrapy.selector.Selector]): the record in JATS format to parse.
        source (Optional[str]): if provided, sets the ``source`` everywhere in
            the record. Otherwise, the source is extracted from the JATS metadata.
    """
    def __init__(self, jats_record, source=None):
        self.root = self.get_root_node(jats_record)
        if not source:
            # Fall back to the publisher name found in the record.
            source = self.publisher
        self.builder = LiteratureBuilder(source)

    def parse(self):
        """Extract a JATS record into an Inspire HEP record.

        Returns:
            dict: the same record in the Inspire Literature schema.
        """
        self.builder.add_abstract(self.abstract)
        self.builder.add_title(self.title, subtitle=self.subtitle)
        self.builder.add_copyright(**self.copyright)
        self.builder.add_document_type(self.document_type)
        self.builder.add_license(**self.license)
        for author in self.authors:
            self.builder.add_author(author)
        self.builder.add_number_of_pages(self.number_of_pages)
        self.builder.add_publication_info(**self.publication_info)
        for collab in self.collaborations:
            self.builder.add_collaboration(collab)
        for doi in self.dois:
            self.builder.add_doi(**doi)
        for keyword in self.keywords:
            self.builder.add_keyword(**keyword)
        self.builder.add_imprint_date(self.publication_date.dumps())
        for reference in self.references:
            self.builder.add_reference(reference)

        return self.builder.record

    @property
    def references(self):
        """Extract a JATS record into an Inspire HEP references record.

        Returns:
            List[dict]: an array of reference schema records, representing
                the references in the record
        """
        ref_nodes = self.root.xpath('./back/ref-list/ref')
        # ``get_reference`` yields one item per ``mixed-citation``, so the
        # per-``ref`` results are flattened into a single list.
        return list(
            itertools.chain.from_iterable(
                self.get_reference(node) for node in ref_nodes
            )
        )

    # Keyword arguments passed to ``remove_tags`` when cleaning the abstract.
    remove_tags_config_abstract = {
        'allowed_tags': ['sup', 'sub'],
        'allowed_trees': ['math'],
        'strip': 'self::pub-id|self::issn'
    }

    @property
    def abstract(self):
        """The cleaned first abstract, or ``None`` when there is none."""
        abstract_nodes = self.root.xpath('./front//abstract[1]')

        if not abstract_nodes:
            return

        abstract = remove_tags(abstract_nodes[0], **self.remove_tags_config_abstract).strip()
        return abstract

    @property
    def article_type(self):
        """The ``article-type`` attribute of the root node, if any."""
        article_type = self.root.xpath('./@article-type').extract_first()

        return article_type

    @property
    def artid(self):
        artid = self.root.xpath('./front/article-meta//elocation-id//text()').extract_first()

        return artid

    @property
    def authors(self):
        author_nodes = self.root.xpath('./front//contrib[@contrib-type="author"]')
        authors = [self.get_author(author) for author in author_nodes]

        return authors

    @property
    def collaborations(self):
        """The set of distinct collaboration strings found in the front matter."""
        collab_nodes = self.root.xpath(
            './front//collab |'
            './front//contrib[@contrib-type="collaboration"] |'
            './front//on-behalf-of'
        )
        collaborations = set(
            collab.xpath('string(.)').extract_first() for collab in collab_nodes
        )

        return collaborations

    @property
    def copyright(self):
        copyright = {
            'holder': self.copyright_holder,
            'material': self.material,
            'statement': self.copyright_statement,
            'year': self.copyright_year,
        }

        return copyright

    @property
    def copyright_holder(self):
        copyright_holder = self.root.xpath('./front//copyright-holder/text()').extract_first()

        return copyright_holder

    @property
    def copyright_statement(self):
        copyright_statement = self.root.xpath('./front//copyright-statement/text()').extract_first()

        return copyright_statement

    @property
    def copyright_year(self):
        copyright_year = self.root.xpath('./front//copyright-year/text()').extract_first()

        return maybe_int(copyright_year)

    @property
    def dois(self):
        """DOIs of the article, plus related-article DOIs for non-publications."""
        doi_values = self.root.xpath('./front/article-meta//article-id[@pub-id-type="doi"]/text()').extract()
        dois = [
            {'doi': value, 'material': self.material} for value in doi_values
        ]

        if self.material != 'publication':
            doi_values = self.root.xpath(
                './front/article-meta//related-article[@ext-link-type="doi"]/@href'
            ).extract()
            related_dois = ({'doi': value} for value in doi_values)
            dois.extend(related_dois)

        return dois

    @property
    def document_type(self):
        if self.is_conference_paper:
            document_type = 'conference paper'
        else:
            document_type = 'article'

        return document_type

    @property
    def is_conference_paper(self):
        """Decide whether the article is a conference paper."""
        conference_node = self.root.xpath('./front//conference').extract_first()

        return bool(conference_node)

    @property
    def journal_title(self):
        journal_title = self.root.xpath(
            './front/journal-meta//abbrev-journal-title/text() |'
            './front/journal-meta//journal-title/text()'
        ).extract_first()

        return journal_title

    @property
    def journal_issue(self):
        journal_issue = self.root.xpath('./front/article-meta/issue/text()').extract_first()

        return journal_issue

    @property
    def journal_volume(self):
        journal_volume = self.root.xpath('./front/article-meta/volume/text()').extract_first()

        return journal_volume

    @property
    def keywords(self):
        """Lazily iterate over the keyword dicts of every keyword group."""
        keyword_groups = self.root.xpath('./front//kwd-group')
        keywords = itertools.chain.from_iterable(self.get_keywords(group) for group in keyword_groups)

        return keywords

    @property
    def license(self):
        license = {
            'license': self.license_statement,
            'material': self.material,
            'url': self.license_url,
        }

        return license

    @property
    def license_statement(self):
        license_statement = self.root.xpath('string(./front/article-meta//license)').extract_first().strip()

        return license_statement

    @property
    def license_url(self):
        url_nodes = (
            './front/article-meta//license_ref/text() |'
            './front/article-meta//license/@href |'
            './front/article-meta//license//ext-link/@href'
        )
        license_url = self.root.xpath(url_nodes).extract_first()

        return license_url

    @property
    def material(self):
        """The material kind derived from the article type.

        A missing ``article-type`` attribute is treated like any other
        unrecognised value and yields ``'publication'``.
        """
        # ``article_type`` is ``None`` when the attribute is absent.
        article_type = self.article_type or ''
        if article_type.startswith('correc'):
            material = 'erratum'
        elif article_type in ('erratum', 'translation', 'addendum', 'reprint'):
            material = article_type
        else:
            material = 'publication'

        return material

    @property
    def number_of_pages(self):
        number_of_pages = maybe_int(self.root.xpath('./front/article-meta//page-count/@count').extract_first())

        return number_of_pages

    @property
    def page_start(self):
        page_start = self.root.xpath('./front/article-meta/fpage/text()').extract_first()

        return page_start

    @property
    def page_end(self):
        page_end = self.root.xpath('./front/article-meta/lpage/text()').extract_first()

        return page_end

    @property
    def publication_date(self):
        """The earliest of the publication dates found in the front matter."""
        date_nodes = self.root.xpath(
            './front//pub-date[@pub-type="ppub"] |'
            './front//pub-date[@pub-type="epub"] |'
            './front//pub-date[starts-with(@date-type,"pub")] |'
            './front//date[starts-with(@date-type,"pub")]'
        )
        publication_date = min(
            self.get_date(date_node) for date_node in date_nodes
        )

        return publication_date

    @property
    def publication_info(self):
        publication_info = {
            'artid': self.artid,
            'journal_title': self.journal_title,
            'journal_issue': self.journal_issue,
            'journal_volume': self.journal_volume,
            'material': self.material,
            'page_start': self.page_start,
            'page_end': self.page_end,
            'year': self.year,
        }

        return publication_info

    @property
    def publisher(self):
        publisher = self.root.xpath('./front//publisher-name/text()').extract_first()

        return publisher

    @property
    def subtitle(self):
        subtitle = self.root.xpath('string(./front//subtitle)').extract_first()

        return subtitle

    @property
    def title(self):
        title = self.root.xpath('string(./front//article-title)').extract_first()

        return title

    def get_affiliation(self, id_):
        """Get the affiliation with the specified id.

        Args:
            id_(str): the value of the ``id`` attribute of the affiliation.

        Returns:
            Optional[str]: the affiliation with that id or ``None`` if there is
                no match.
        """
        affiliation_node = self.root.xpath("//aff[@id=$id_]", id_=id_)
        if affiliation_node:
            affiliation = remove_tags(
                affiliation_node[0],
                strip="self::label | self::email"
            ).strip()
            return affiliation

    def get_emails_from_refs(self, id_):
        """Get the emails from the node with the specified id.

        Args:
            id_(str): the value of the ``id`` attribute of the node.

        Returns:
            List[str]: the emails from the node with that id or [] if none found.
        """
        email_nodes = self.root.xpath('//aff[@id=$id_]/email/text()', id_=id_)
        return email_nodes.extract()

    @property
    def year(self):
        """The year of the earliest matching publication date."""
        # NOTE(review): ``not_online`` is handed over as an XPath *variable*,
        # so it appears to be evaluated as a string value and not as an
        # expression (its parentheses are also unbalanced). It probably does
        # not filter anything -- confirm before relying on it. Left unchanged
        # because enforcing the filter could leave no dates for ``min``.
        not_online = (
            'not(starts-with(@publication-format, "elec"))'
            ' and not(starts-with(@publication-format, "online")'
        )
        date_nodes = self.root.xpath(
            './front//pub-date[@pub-type="ppub"] |'
            './front//pub-date[starts-with(@date-type,"pub") and $not_online] |'
            './front//date[starts-with(@date-type,"pub") and $not_online]',
            not_online=not_online
        )

        year = min(
            self.get_date(date_node) for date_node in date_nodes
        ).year

        return year

    @staticmethod
    def _get_referred_affiliation_ids(author_node):
        """Collect the affiliation ids referenced by an author node.

        Returns:
            List[str]: the distinct ids, in order of first appearance.
        """
        raw_referred_ids = author_node.xpath('.//xref[@ref-type="aff"]/@rid').extract()
        referred_ids = []
        for raw_referred_id in raw_referred_ids:
            # Sometimes the rid might have more than one ID (e.g. rid="id0 id1")
            for referred_id in raw_referred_id.split():
                if referred_id not in referred_ids:
                    referred_ids.append(referred_id)

        return referred_ids

    def get_author_affiliations(self, author_node):
        """Extract an author's affiliations.

        Returns:
            List[str]: the non-empty affiliations referenced by the author,
                following the order of the references.
        """
        affiliations = []
        for referred_id in self._get_referred_affiliation_ids(author_node):
            # Look each id up once; ids without a match are skipped.
            affiliation = self.get_affiliation(referred_id)
            if affiliation:
                affiliations.append(affiliation)

        return affiliations

    def get_author_emails(self, author_node):
        """Extract an author's email addresses.

        Emails found directly on the author come first, followed by those
        attached to the affiliations the author refers to.
        """
        emails = author_node.xpath('.//email/text()').extract()
        # A single ``rid`` may list several ids, so they are split the same
        # way as for the affiliations.
        for referred_id in self._get_referred_affiliation_ids(author_node):
            emails.extend(self.get_emails_from_refs(referred_id))

        return emails

    @staticmethod
    def get_author_name(author_node):
        """Extract an author's name.

        Returns:
            str: the content of ``string-name`` when the author has no
                ``surname`` and that content is not empty; otherwise the
                surname, given names and suffix joined by ``', '``.
        """
        surname = author_node.xpath('.//surname/text()').extract_first()
        if not surname:
            # the author name is unstructured
            unstructured_name = author_node.xpath('string(./string-name)').extract_first()
            if unstructured_name:
                return unstructured_name
        given_names = author_node.xpath('.//given-names/text()').extract_first()
        suffix = author_node.xpath('.//suffix/text()').extract_first()
        author_name = ', '.join(el for el in (surname, given_names, suffix) if el)

        return author_name

    @staticmethod
    def get_date(date_node):
        """Extract a date from a date node.

        The ISO attribute is preferred, then the year/month/day parts, then
        the free-form ``string-date``.

        Returns:
            PartialDate: the parsed date.
        """
        iso_string = date_node.xpath('./@iso-8601-date').extract_first()
        iso_date = PartialDate.loads(iso_string) if iso_string else None

        year = date_node.xpath('string(./year)').extract_first()
        month = date_node.xpath('string(./month)').extract_first()
        day = date_node.xpath('string(./day)').extract_first()
        date_from_parts = PartialDate.from_parts(year, month, day) if year else None

        string_date = date_node.xpath('string(./string-date)').extract_first()
        try:
            parsed_date = PartialDate.parse(string_date)
        except ValueError:
            parsed_date = None

        date = get_first([iso_date, date_from_parts, parsed_date])
        return date

    @staticmethod
    def get_keywords(group_node):
        """Extract keywords from a keyword group."""
        schema = None
        if 'pacs' in group_node.xpath('@kwd-group-type').extract_first(default='').lower():
            schema = 'PACS'

        keywords = (kwd.xpath('string(.)').extract_first() for kwd in group_node.xpath('.//kwd'))
        keyword_dicts = ({'keyword': keyword, 'schema': schema} for keyword in keywords)

        return keyword_dicts

    @staticmethod
    def get_root_node(jats_record):
        """Get a selector on the root ``article`` node of the record.

        This can be overridden in case some preprocessing needs to be done on
        the XML.

        Args:
            jats_record(Union[str, scrapy.selector.Selector]): the record in JATS format.

        Returns:
            scrapy.selector.Selector: a selector on the root ``<article>``
                node.
        """
        if isinstance(jats_record, six.string_types):
            root = get_node(jats_record)
        else:
            root = jats_record
        root.remove_namespaces()

        return root

    def get_author(self, author_node):
        """Extract one author.

        Args:
            author_node(scrapy.selector.Selector): a selector on a single
                author, e.g. a ``<contrib contrib-type="author">``.

        Returns:
            dict: the parsed author, conforming to the Inspire schema.
        """
        author_name = self.get_author_name(author_node)
        emails = self.get_author_emails(author_node)
        affiliations = self.get_author_affiliations(author_node)

        return self.builder.make_author(author_name, raw_affiliations=affiliations, emails=emails)

    @staticmethod
    def get_reference_authors(ref_node, role):
        """Extract authors of `role` from a reference node.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single reference.
            role(str): author role

        Returns:
            List[str]: list of names
        """
        return ref_node.xpath(
            './person-group[@person-group-type=$role]/string-name/text()',
            role=role
        ).extract()

    def get_reference(self, ref_node):
        """Extract the references contained in one ``<ref>`` node.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single
                reference, i.e. ``<ref>``.

        Yields:
            dict: one parsed reference per ``mixed-citation`` child, as
                generated by :class:`inspire_schemas.api.ReferenceBuilder`
        """
        for citation_node in ref_node.xpath('./mixed-citation'):
            builder = ReferenceBuilder()

            builder.add_raw_reference(
                ref_node.extract().strip(),
                source=self.builder.source,
                ref_format='JATS'
            )

            # Pairs of (XPath relative to the citation, handler that stores
            # the first extracted value on the reference builder).
            fields = [
                (
                    (
                        'self::node()[@publication-type="journal" '
                        'or @publication-type="eprint"]/source/text()'
                    ),
                    builder.set_journal_title,
                ),
                (
                    'self::node()[@publication-type="book"]/source/text()',
                    builder.add_parent_title,
                ),
                ('./publisher-name/text()', builder.set_publisher),
                ('./volume/text()', builder.set_journal_volume),
                ('./issue/text()', builder.set_journal_issue),
                ('./year/text()', builder.set_year),
                ('./pub-id[@pub-id-type="arxiv"]/text()', builder.add_uid),
                ('./pub-id[@pub-id-type="doi"]/text()', builder.add_uid),
                (
                    'pub-id[@pub-id-type="other"]'
                    '[contains(preceding-sibling::text(),"Report No")]/text()',
                    builder.add_report_number
                ),
                ('./article-title/text()', builder.add_title),
                ('../label/text()', lambda x: builder.set_label(x.strip('[].')))
            ]

            for xpath, field_handler in fields:
                value = citation_node.xpath(xpath).extract_first()
                if value:
                    field_handler(value)

            # Whatever text is left once the structured elements are stripped
            # is kept as miscellaneous information.
            remainder = remove_tags(
                citation_node,
                strip='self::person-group'
                      '|self::pub-id'
                      '|self::article-title'
                      '|self::volume'
                      '|self::issue'
                      '|self::year'
                      '|self::label'
                      '|self::publisher-name'
                      '|self::source[../@publication-type!="proc"]'
                      '|self::object-id'
                      '|self::page-range'
                      '|self::issn'
            ).strip('"\';,. \t\n\r').replace('()', '')
            if remainder:
                builder.add_misc(remainder)

            for editor in self.get_reference_authors(citation_node, 'editor'):
                builder.add_author(editor, 'editor')

            for author in self.get_reference_authors(citation_node, 'author'):
                builder.add_author(author, 'author')

            page_range = citation_node.xpath('./page-range/text()').extract_first()
            if page_range:
                page_artid = split_page_artid(page_range)
                builder.set_page_artid(*page_artid)

            yield builder.obj

    def attach_fulltext_document(self, file_name, url):
        """Add ``url`` to the record as a hidden fulltext document named ``file_name``."""
        self.builder.add_document(file_name, url, fulltext=True, hidden=True)
def formdata_to_model(obj, formdata):
    """Manipulate form data to match literature data model.

    Args:
        obj: object exposing ``id``, ``id_user`` and ``extra_data``. ``id``
            and ``id_user`` are written into the acquisition source, and
            ``extra_data['submission_pdf']`` is set when a non-arXiv URL was
            submitted.
        formdata (dict): the submitted form fields. A deep copy is used, so
            the caller's dictionary is left untouched.

    Returns:
        dict: the record assembled by the ``LiteratureBuilder``, returned
            after ``validate_record`` has been called on the builder.
    """
    def _is_arxiv_url(url):
        return 'arxiv.org' in url

    form_fields = copy.deepcopy(formdata)
    filter_empty_elements(
        form_fields,
        ['authors', 'supervisors', 'report_numbers']
    )

    builder = LiteratureBuilder(source='submitter')

    for author in form_fields.get('authors', []):
        builder.add_author(builder.make_author(
            author['full_name'],
            affiliations=force_list(author['affiliation'])
            if author['affiliation'] else None,
            roles=['author']
        ))

    for supervisor in form_fields.get('supervisors', []):
        # The condition must look at the supervisor's own affiliation; it used
        # to read the leftover ``author`` variable from the loop above.
        builder.add_author(builder.make_author(
            supervisor['full_name'],
            affiliations=force_list(supervisor['affiliation'])
            if supervisor['affiliation'] else None,
            roles=['supervisor']
        ))

    builder.add_title(title=form_fields.get('title'))

    # A conference name takes precedence over the selected document type.
    document_type = 'conference paper' if form_fields.get('conf_name') \
        else form_fields.get('type_of_doc', [])

    builder.add_document_type(
        document_type=document_type
    )

    builder.add_abstract(
        abstract=form_fields.get('abstract'),
        source='arXiv' if form_fields.get('categories') else None
    )

    if form_fields.get('arxiv_id') and form_fields.get('categories'):
        builder.add_arxiv_eprint(
            arxiv_id=form_fields.get('arxiv_id'),
            arxiv_categories=form_fields.get('categories').split()
        )

    builder.add_doi(doi=form_fields.get('doi'))

    builder.add_inspire_categories(
        subject_terms=form_fields.get('subject_term'),
        source='user'
    )

    for key in ('extra_comments', 'nonpublic_note',
                'hidden_notes', 'conf_name', 'references'):
        builder.add_private_note(
            private_notes=form_fields.get(key)
        )

    # The year is only kept when it can be read as an integer.
    year = form_fields.get('year')
    try:
        year = int(year)
    except (TypeError, ValueError):
        year = None

    builder.add_publication_info(
        year=year,
        cnum=form_fields.get('conference_id'),
        journal_issue=form_fields.get('issue'),
        journal_title=form_fields.get('journal_title'),
        journal_volume=form_fields.get('volume'),
        page_start=form_fields.get('page_start'),
        page_end=form_fields.get('page_end'),
        artid=form_fields.get('artid')
    )

    builder.add_preprint_date(
        preprint_date=form_fields.get('preprint_created')
    )

    if form_fields.get('type_of_doc') == 'thesis':
        builder.add_thesis(
            defense_date=form_fields.get('defense_date'),
            degree_type=form_fields.get('degree_type'),
            institution=form_fields.get('institution'),
            date=form_fields.get('thesis_date')
        )

    builder.add_accelerator_experiments_legacy_name(
        legacy_name=form_fields.get('experiment')
    )

    language = form_fields.get('other_language') \
        if form_fields.get('language') == 'oth' \
        else form_fields.get('language')
    builder.add_language(language=language)

    builder.add_title_translation(title=form_fields.get('title_translation'))

    builder.add_title(
        title=form_fields.get('title_arXiv'),
        source='arXiv'
    )

    builder.add_title(
        title=form_fields.get('title_crossref'),
        source='crossref'
    )

    builder.add_license(url=form_fields.get('license_url'))

    builder.add_public_note(public_note=form_fields.get('public_notes'))

    builder.add_public_note(
        public_note=form_fields.get('note'),
        source='arXiv' if form_fields.get('categories') else 'CrossRef'
    )

    form_url = form_fields.get('url')
    form_additional_url = form_fields.get('additional_url')
    if form_url and not _is_arxiv_url(form_url):
        obj.extra_data['submission_pdf'] = form_url
        # The main URL is only added to the record when no additional URL
        # was given.
        if not form_additional_url:
            builder.add_url(url=form_url)

    if form_additional_url and not _is_arxiv_url(form_additional_url):
        builder.add_url(url=form_additional_url)

    for report_number in form_fields.get('report_numbers', []):
        builder.add_report_number(
            report_number=report_number.get('report_number')
        )

    builder.add_collaboration(collaboration=form_fields.get('collaboration'))

    builder.add_acquisition_source(
        datetime=datetime.datetime.utcnow().isoformat(),
        submission_number=obj.id,
        internal_uid=int(obj.id_user),
        email=form_fields.get('email'),
        orcid=form_fields.get('orcid'),
        method='submitter'
    )
    builder.validate_record()

    return builder.record
def hepcrawl_to_hep(crawler_record):
    """Convert a hepcrawl-formatted record into a hep-formatted record.

    Args:
        crawler_record(dict): dictionary representing the hepcrawl formatted
            record. Its ``acquisition_source`` entry is required, with the
            ``source``, ``method``, ``datetime`` and ``submission_number``
            keys.

    Returns:
        dict: The hep formatted record.

    Raises:
        KeyError: if ``acquisition_source`` or one of its keys listed above
            is missing, or if a mandatory key of an author, collection or
            document is missing.
    """

    def _filter_affiliation(affiliations):
        # Keep only the affiliations that actually carry a value.
        return [
            affiliation.get('value')
            for affiliation in affiliations
            if affiliation.get('value')
        ]

    builder = LiteratureBuilder(
        source=crawler_record['acquisition_source']['source']
    )

    for author in crawler_record.get('authors', []):
        builder.add_author(builder.make_author(
            full_name=author['full_name'],
            raw_affiliations=_filter_affiliation(author['affiliations']),
        ))

    for title in crawler_record.get('titles', []):
        builder.add_title(
            title=title.get('title'),
            subtitle=title.get('subtitle'),
            source=title.get('source')
        )

    for abstract in crawler_record.get('abstracts', []):
        builder.add_abstract(
            abstract=abstract.get('value'),
            source=abstract.get('source')
        )

    for arxiv_eprint in crawler_record.get('arxiv_eprints', []):
        builder.add_arxiv_eprint(
            arxiv_id=arxiv_eprint.get('value'),
            arxiv_categories=arxiv_eprint.get('categories')
        )

    for doi in crawler_record.get('dois', []):
        builder.add_doi(
            doi=doi.get('value'),
            material=doi.get('material'),
        )

    for private_note in crawler_record.get('private_notes', []):
        builder.add_private_note(
            private_notes=private_note
        )

    for public_note in crawler_record.get('public_notes', []):
        builder.add_public_note(
            public_note=public_note.get('value'),
            source=public_note.get('source')
        )

    for license_entry in crawler_record.get('license', []):
        builder.add_license(
            url=license_entry.get('url'),
            license=license_entry.get('license'),
            material=license_entry.get('material'),
        )

    for collaboration in crawler_record.get('collaborations', []):
        builder.add_collaboration(
            collaboration=collaboration.get('value')
        )

    for imprint in crawler_record.get('imprints', []):
        builder.add_imprint_date(
            imprint_date=imprint.get('date')
        )

    for copyright_entry in crawler_record.get('copyright', []):
        builder.add_copyright(
            holder=copyright_entry.get('holder'),
            material=copyright_entry.get('material'),
            statement=copyright_entry.get('statement')
        )

    builder.add_preprint_date(
        preprint_date=crawler_record.get('preprint_date')
    )

    acquisition_source = crawler_record.get('acquisition_source', {})
    builder.add_acquisition_source(
        method=acquisition_source['method'],
        date=acquisition_source['datetime'],
        source=acquisition_source['source'],
        submission_number=acquisition_source['submission_number'],
    )

    # The page count is optional: anything that is not a non-empty list
    # starting with an integer-like value is silently ignored.
    try:
        builder.add_number_of_pages(
            number_of_pages=int(crawler_record.get('page_nr', [])[0])
        )
    except (TypeError, ValueError, IndexError):
        pass

    publication_types = [
        'introductory',
        'lectures',
        'review',
        'manual',
    ]

    document_types = [
        'book',
        'note',
        'report',
        'proceedings',
        'thesis',
    ]

    added_doc_type = False

    for collection in crawler_record.get('collections', []):
        collection = collection['primary'].strip().lower()

        if collection == 'arxiv':
            continue  # ignored
        elif collection == 'citeable':
            builder.set_citeable(True)
        elif collection == 'core':
            builder.set_core(True)
        elif collection == 'noncore':
            builder.set_core(False)
        elif collection == 'published':
            builder.set_refereed(True)
        elif collection == 'withdrawn':
            builder.set_withdrawn(True)
        elif collection in publication_types:
            builder.add_publication_type(collection)
        elif collection == 'bookchapter':
            added_doc_type = True
            builder.add_document_type('book chapter')
        elif collection == 'conferencepaper':
            added_doc_type = True
            builder.add_document_type('conference paper')
        elif collection in document_types:
            added_doc_type = True
            builder.add_document_type(collection)

    # Records without an explicit document type default to 'article'.
    if not added_doc_type:
        builder.add_document_type('article')

    # Only the first publication info is used; a missing or empty list is
    # treated as an empty entry instead of failing on ``[0]``.
    _pub_info = (crawler_record.get('publication_info') or [{}])[0]
    builder.add_publication_info(
        year=_pub_info.get('year'),
        artid=_pub_info.get('artid'),
        page_end=_pub_info.get('page_end'),
        page_start=_pub_info.get('page_start'),
        journal_issue=_pub_info.get('journal_issue'),
        journal_title=_pub_info.get('journal_title'),
        journal_volume=_pub_info.get('journal_volume'),
        pubinfo_freetext=_pub_info.get('pubinfo_freetext'),
        material=_pub_info.get('pubinfo_material'),
    )

    for report_number in crawler_record.get('report_numbers', []):
        builder.add_report_number(
            report_number=report_number.get('value'),
            source=report_number.get('source')
        )

    for url in crawler_record.get('urls', []):
        builder.add_url(url=url.get('value'))

    for document in crawler_record.get('documents', []):
        builder.add_document(
            description=document.get('description'),
            fulltext=document.get('fulltext'),
            hidden=document.get('hidden'),
            key=document['key'],
            material=document.get('material'),
            original_url=document.get('original_url'),
            url=document['url'],
        )

    return builder.record
class CrossrefParser(object):
    """Turn a Crossref API JSON response into an Inspire literature record.

    Args:
        crossref_record (dict): the Crossref API response; the metadata is
            read from its ``message`` key.
        source (Optional[str]): value used as ``source`` when building the
            record. When falsy, the ``source`` field of the Crossref
            metadata is used instead.
    """

    def __init__(self, crossref_record, source=None):
        self.record = crossref_record.get("message")
        # ``material_source`` is only consulted when no source was given.
        self.builder = LiteratureBuilder(source or self.material_source)

    def parse(self):
        """Feed every extracted field to the builder and return its record.

        Returns:
            dict: the record held by the underlying ``LiteratureBuilder``.
        """
        build = self.builder

        build.add_abstract(self.abstract)

        for doi_kwargs in self.dois:
            build.add_doi(**doi_kwargs)

        for parsed_reference in self.references:
            build.add_reference(parsed_reference)

        build.add_imprint_date(self.imprints)

        for parsed_author in self.authors:
            build.add_author(parsed_author)

        for license_kwargs in self.license:
            build.add_license(**license_kwargs)

        build.add_publication_info(**self.publication_info)
        build.add_title(self.title, subtitle=self.subtitle)
        build.add_document_type(self.document_type)

        return build.record

    # ------------------------------------------------------------------
    # Top-level record fields
    # ------------------------------------------------------------------

    @property
    def document_type(self):
        """Document type looked up in ``DOC_TYPE_MAP`` from the ``type`` key."""
        return DOC_TYPE_MAP[self.record.get("type")]

    @property
    def title(self):
        """First entry of the ``title`` list."""
        return get_value(self.record, "title[0]")

    @property
    def subtitle(self):
        """First entry of the ``subtitle`` list."""
        return get_value(self.record, "subtitle[0]")

    @property
    def dois(self):
        """Single-element list of keyword arguments for ``add_doi``."""
        return [{'doi': self.record.get("DOI"), 'material': self.material}]

    @property
    def material_source(self):
        """The ``source`` field of the Crossref metadata."""
        return self.record.get("source")

    @property
    def material(self):
        """Material kind guessed from how the title or subtitle starts."""
        headings = (self.title or '', self.subtitle or '')
        prefix_to_material = (
            ("Erratum", 'erratum'),
            ("Addendum", 'addendum'),
            ("Publisher's Note", 'editorial note'),
        )
        for prefix, material in prefix_to_material:
            if any(heading.startswith(prefix) for heading in headings):
                return material
        return 'publication'

    @property
    def publication_info(self):
        """Keyword arguments for ``add_publication_info``."""
        return {
            'artid': self.artid,
            'journal_title': self.journal_title,
            'journal_issue': self.journal_issue,
            'journal_volume': self.journal_volume,
            'page_start': self.page_start,
            'page_end': self.page_end,
            'year': self.year,
            'material': self.material,
            'parent_isbn': self.parent_isbn,
        }

    @property
    def parent_isbn(self):
        """First entry of the ``ISBN`` list."""
        return get_value(self.record, "ISBN[0]")

    @property
    def journal_title(self):
        """First ``container-title``, or ``None`` for book chapters."""
        if self.document_type == 'book chapter':
            return None
        return get_value(self.record, "container-title[0]")

    @property
    def artid(self):
        """The ``article-number`` field."""
        return self.record.get("article-number")

    @property
    def journal_issue(self):
        """The ``issue`` field."""
        return self.record.get("issue")

    @property
    def journal_volume(self):
        """The ``volume`` field."""
        return self.record.get("volume")

    @property
    def year(self):
        """First component of the first ``issued.date-parts`` entry."""
        return get_value(self.record, "issued.date-parts[0][0]")

    @property
    def page_start(self):
        """Part of ``page`` before the first ``-``, or ``None`` if absent."""
        page_range = self.record.get("page")
        return page_range.split('-')[0] if page_range else None

    @property
    def page_end(self):
        """Part of ``page`` after the first ``-``; ``None`` without a range."""
        page_range = self.record.get("page")
        has_range = bool(page_range) and '-' in page_range
        return page_range.split('-')[1] if has_range else None

    # ------------------------------------------------------------------
    # Authors
    # ------------------------------------------------------------------

    @staticmethod
    def get_author_name(author_key):
        """Join the ``family`` and ``given`` names as ``"family, given"``."""
        name_parts = (author_key.get("family"), author_key.get("given"))
        return ', '.join(part for part in name_parts if part)

    @staticmethod
    def get_author_affiliations(author_key):
        """Collect the ``name`` of every entry under ``affiliation``."""
        return [
            entry.get('name')
            for entry in force_list(author_key.get("affiliation"))
        ]

    @staticmethod
    def get_author_orcid(author_key):
        """Return the ``ORCID`` value as a one-element list of id pairs."""
        return [('ORCID', author_key.get('ORCID'))]

    def get_author(self, author_key):
        """Build one author from a Crossref author entry.

        Args:
            author_key(dict): a dictionary on a single author.

        Returns:
            the value returned by ``self.builder.make_author``.
        """
        return self.builder.make_author(
            self.get_author_name(author_key),
            raw_affiliations=self.get_author_affiliations(author_key),
            ids=self.get_author_orcid(author_key),
        )

    @property
    def authors(self):
        """Parsed authors, one per entry under ``author``."""
        return [
            self.get_author(entry)
            for entry in force_list(self.record.get("author"))
        ]

    # ------------------------------------------------------------------
    # Licenses, publisher, abstract, imprint
    # ------------------------------------------------------------------

    @property
    def license(self):
        """Parsed licenses, one per entry under ``license``."""
        return [
            self.get_license(entry)
            for entry in force_list(self.record.get("license"))
        ]

    def get_license(self, license_key):
        """Build the keyword arguments for one license.

        Args:
            license_key(dict): a dictionary on a single license.

        Returns:
            dict: ``imposing``, ``material`` and ``url`` for ``add_license``.
        """
        return {
            'imposing': self.publisher,
            'material': self.material,
            'url': self.get_license_url(license_key),
        }

    @staticmethod
    def get_license_url(license_key):
        """The ``URL`` field of a license entry."""
        return license_key.get("URL")

    @property
    def publisher(self):
        """The ``publisher`` field."""
        return self.record.get("publisher")

    @property
    def abstract(self):
        """The ``abstract`` field."""
        return self.record.get("abstract")

    @property
    def imprints(self):
        """Imprint date serialised from ``issued.date-parts[0]``.

        ``issued`` is the earliest of published-print and published-online,
        which is why it feeds both the imprint and the publication info.
        Returns ``None`` when the date parts are missing or empty.
        """
        date_parts = get_value(self.record, "issued.date-parts[0]")
        if not date_parts:
            return None
        return PartialDate(*date_parts).dumps()

    # ------------------------------------------------------------------
    # References
    # ------------------------------------------------------------------

    @property
    def references(self):
        """Parsed references of the record, with duplicates removed.

        Returns:
            List[dict]: one reference per entry under ``reference``, passed
            through ``dedupe_list_of_dicts``.
        """
        collected = []
        for entry in force_list(self.record.get("reference")):
            collected.extend(self.get_reference(entry))
        return dedupe_list_of_dicts(collected)

    def get_reference(self, ref_key):
        """Extract one reference.

        Args:
            ref_key(dict): a dictionary on a single reference.

        Yields:
            dict: the parsed reference, as generated by
            :class:`inspire_schemas.api.ReferenceBuilder`
        """
        builder = ReferenceBuilder()

        # Crossref key -> builder call, applied in this order and only for
        # keys that hold a truthy value.
        handlers = (
            ("journal-title", builder.set_journal_title),
            ("volume", builder.set_journal_volume),
            ("issue", builder.set_journal_issue),
            ("first-page",
             lambda page: builder.set_page_artid(page_start=page)),
            ("year", builder.set_year),
            ("article-title", builder.add_title),
            ("ISBN", builder.add_uid),
            ("DOI", builder.add_uid),
            ("author", lambda name: builder.add_author(name, 'author')),
            ("unstructured",
             lambda raw: builder.add_raw_reference(
                 raw, self.material_source)),
        )
        for crossref_key, handle in handlers:
            value = ref_key.get(crossref_key)
            if value:
                handle(value)

        yield builder.obj
class ElsevierParser(object):
    """Parser for the Elsevier format.

    It can be used directly by invoking the :func:`ElsevierParser.parse`
    method, or be subclassed to customize its behavior.

    Args:
        elsevier_record (Union[str, scrapy.selector.Selector]): the record in
            Elsevier format to parse.
        source (Optional[str]): if provided, sets the ``source`` everywhere in
            the record. Otherwise, the source is extracted from the Elsevier
            metadata.
    """

    # Options forwarded to ``remove_tags`` when cleaning abstract paragraphs.
    remove_tags_config_abstract = {
        "allowed_tags": ["sup", "sub"],
        "allowed_trees": ["math"],
        "strip": "self::pub-id|self::issn",
    }

    def __init__(self, elsevier_record, source=None):
        self.root = self.get_root_node(elsevier_record)
        if not source:
            source = self.publisher
        self.builder = LiteratureBuilder(source)

    def parse(self):
        """Extract an Elsevier record into an Inspire HEP record.

        Returns:
            dict: the same record in the Inspire Literature schema.
        """
        self.builder.add_abstract(self.abstract)
        self.builder.add_title(self.title, subtitle=self.subtitle)
        self.builder.add_copyright(**self.copyright)
        self.builder.add_document_type(self.document_type)
        self.builder.add_license(**self.license)
        for author in self.authors:
            self.builder.add_author(author)
        self.builder.add_publication_info(**self.publication_info)
        for collab in self.collaborations:
            self.builder.add_collaboration(collab)
        for doi in self.dois:
            self.builder.add_doi(**doi)
        for keyword in self.keywords:
            self.builder.add_keyword(keyword)
        # Parse the cover date once instead of once per access.
        publication_date = self.publication_date
        self.builder.add_imprint_date(
            publication_date.dumps() if publication_date else None)
        for reference in self.references:
            self.builder.add_reference(reference)

        return self.builder.record

    @property
    def references(self):
        """Extract the references of an Elsevier record.

        Returns:
            List[dict]: an array of reference schema records, representing
            the references in the record
        """
        ref_nodes = self.root.xpath(".//bib-reference")
        return list(
            itertools.chain.from_iterable(
                self.get_reference_iter(node) for node in ref_nodes))

    @property
    def abstract(self):
        """Abstract paragraphs joined by spaces, or ``None`` if there are none."""
        abstract_nodes = self.root.xpath(
            ".//head/abstract[not(@graphical)]/abstract-sec/simple-para")

        if not abstract_nodes:
            return None

        abstract_paragraphs = [
            remove_tags(abstract_node,
                        **self.remove_tags_config_abstract).strip("/ \n")
            for abstract_node in abstract_nodes
        ]
        return ' '.join(abstract_paragraphs)

    @property
    def article_type(self):
        """Return the article type mapped from the ``docsubtype`` abbreviation."""
        abbrv_doctype = self.root.xpath(".//@docsubtype").extract_first()
        return DOCTYPE_MAPPING.get(abbrv_doctype)

    @property
    def artid(self):
        """Article id read from ``item-info/aid``."""
        return self.root.xpath(
            "string(./*/item-info/aid[1])").extract_first()

    @property
    def authors(self):
        """Parsed authors of every ``author-group`` in the head."""
        all_authors = []
        for author_group in self.root.xpath("./*/head/author-group"):
            all_authors.extend(
                self.get_author(author, author_group)
                for author in author_group.xpath("./author"))
        return all_authors

    @property
    def collaborations(self):
        """Text of every ``collaboration`` inside the author groups."""
        return self.root.xpath(
            "./*/head/author-group//collaboration/text/text()").extract()

    @property
    def copyright(self):
        """Keyword arguments for ``add_copyright``."""
        return {
            "holder": self.copyright_holder,
            "material": self.material,
            "statement": self.copyright_statement,
            "year": self.copyright_year,
        }

    @property
    def copyright_holder(self):
        """Copyright holder text, else the holder mapped from ``@type``."""
        copyright_holder = self.root.xpath(
            "string(./*/item-info/copyright[@type][1])").extract_first()
        if not copyright_holder:
            copyright_type = self.root.xpath(
                "./*/item-info/copyright/@type").extract_first()
            copyright_holder = COPYRIGHT_MAPPING.get(copyright_type)
        return copyright_holder

    @property
    def copyright_statement(self):
        """Copyright statement from the RDF block, else from ``item-info``."""
        copyright_statement = self.root.xpath(
            "string(./RDF/Description/copyright[1])").extract_first()
        if not copyright_statement:
            copyright_statement = self.root.xpath(
                "string(./*/item-info/copyright[@type][1])").extract_first()
        return copyright_statement

    @property
    def copyright_year(self):
        """Copyright year passed through ``maybe_int``."""
        copyright_year = self.root.xpath(
            "./*/item-info/copyright[@type]/@year").extract_first()
        return maybe_int(copyright_year)

    @property
    def dois(self):
        """Single-element list of keyword arguments for ``add_doi``."""
        doi = self.root.xpath(
            "string(./RDF/Description/doi[1])").extract_first()
        return [{"doi": doi, "material": self.material}]

    @property
    def document_type(self):
        """Document type from the root element; ``None`` if unrecognised.

        A conference paper overrides whatever the root element suggests.
        """
        doctype = None
        if self.root.xpath(
                "./*[contains(name(),'article') or self::book-review]"):
            doctype = "article"
        elif self.root.xpath("./*[self::book or self::simple-book]"):
            doctype = "book"
        elif self.root.xpath("./book-chapter"):
            doctype = "book chapter"
        if self.is_conference_paper:
            doctype = "conference paper"
        return doctype or None

    @property
    def is_conference_paper(self):
        """Decide whether the article is a conference paper."""
        if self.root.xpath("./conference-info"):
            return True
        journal_issue = self.root.xpath(
            "string(./RDF/Description/issueName[1])").extract_first()
        if journal_issue:
            # ``proc`` must be a standalone token; the previous unescaped
            # ``proc.`` also matched inside words such as "process".
            return bool(
                re.search(r"proceedings|\bproc\b", journal_issue.lower()))
        return False

    @property
    def journal_title(self):
        """Publication name without " Section"/" section" and commas."""
        jid = self.root.xpath("string(./*/item-info/jid[1])").extract_first(
            default="")
        # NOTE(review): XPath ``string()`` presumably always yields one
        # (possibly empty) result, in which case this ``default`` never
        # applies -- confirm whether ``jid`` was meant as a fallback for an
        # empty publication name.
        publication = self.root.xpath(
            "string(./RDF/Description/publicationName[1])").extract_first(
                default=jid)
        publication = re.sub(r" [Ss]ection", "", publication)
        return publication.replace(",", "").strip()

    @property
    def journal_issue(self):
        """Issue read from ``serial-issue/issue-info/issue-first``."""
        return self.root.xpath(
            "string(./serial-issue/issue-info/issue-first[1])"
        ).extract_first()

    @property
    def journal_volume(self):
        """Volume read from the RDF description."""
        return self.root.xpath(
            "string(./RDF/Description/volume[1])").extract_first()

    @property
    def keywords(self):
        """Text of every keyword in keyword groups without ``@abr``."""
        return self.root.xpath(
            "./*/head/keywords[not(@abr)]/keyword/text/text()").getall()

    @property
    def license(self):
        """Keyword arguments for ``add_license``."""
        return {
            "license": self.license_statement,
            "material": self.material,
            "url": self.license_url,
        }

    @property
    def license_statement(self):
        """License line read from the RDF description."""
        return self.root.xpath(
            "string(./RDF/Description/licenseLine[1])").extract_first()

    @property
    def license_url(self):
        """User license read from the RDF open-access information."""
        return self.root.xpath(
            "string(./RDF/Description/openAccessInformation/userLicense[1])"
        ).extract_first()

    @property
    def material(self):
        """Material kind derived from :attr:`article_type`."""
        article_type = self.article_type
        if article_type in (
                "erratum",
                "addendum",
                "retraction",
                "removal",
                "duplicate",
        ):
            return article_type
        if article_type in ("editorial", "publisher's note"):
            return "editorial note"
        return "publication"

    @property
    def page_start(self):
        """Starting page read from the RDF description."""
        return self.root.xpath(
            "string(./RDF/Description/startingPage[1])").extract_first()

    @property
    def page_end(self):
        """Ending page read from the RDF description."""
        return self.root.xpath(
            "string(./RDF/Description/endingPage[1])").extract_first()

    @property
    def publication_date(self):
        """Cover display date parsed with ``PartialDate.parse``.

        Returns ``None`` when the record carries no cover date. If the raw
        string cannot be parsed, the leading part of a month range
        (e.g. ``July-`` in ``July-September 2020``) is dropped and parsing
        is retried; a failure of that second attempt propagates.
        """
        publication_date_string = self.root.xpath(
            "string(./RDF/Description/coverDisplayDate[1])").extract_first()
        if not publication_date_string:
            return None
        try:
            return PartialDate.parse(publication_date_string)
        except Exception:
            # Was a bare ``except:``, which also swallowed
            # KeyboardInterrupt/SystemExit. The character class was
            # ``[A-aZ-z]``, which additionally matched ``[\]^_`` and the
            # backtick.
            single_month = re.sub(
                r"[A-Za-z]*-(?=[A-Za-z])", "", publication_date_string)
            return PartialDate.parse(single_month)

    @property
    def publication_info(self):
        """Keyword arguments for ``add_publication_info``."""
        return {
            "artid": self.artid,
            "journal_title": self.journal_title,
            "journal_issue": self.journal_issue,
            "journal_volume": self.journal_volume,
            "material": self.material,
            "page_start": self.page_start,
            "page_end": self.page_end,
            "year": self.year,
        }

    @property
    def publisher(self):
        """Publisher read from the RDF description."""
        return self.root.xpath(
            "string(./RDF/Description/publisher[1])"
        ).extract_first("Elsevier B.V.")

    @property
    def subtitle(self):
        """Subtitle read from the head."""
        return self.root.xpath(
            "string(./*/head/subtitle[1])").extract_first()

    @property
    def title(self):
        """Title with surrounding newlines stripped, or ``None`` if empty."""
        title = self.root.xpath("string(./*/head/title[1])").extract_first()
        return title.strip("\n") if title else None

    @property
    def year(self):
        """Year of :attr:`publication_date`, or ``None`` without a date."""
        publication_date = self.publication_date
        if publication_date:
            return publication_date.year
        return None

    def get_author_affiliations(self, author_node, author_group_node):
        """Extract an author's affiliations.

        Affiliations referenced by the author through ``@refid`` are looked
        up by id; otherwise the non-empty affiliations of the author group
        are used.

        Returns:
            List[str]: the affiliation texts.
        """
        ref_ids = author_node.xpath(".//@refid[contains(., 'af')]").extract()
        if ref_ids:
            return self._find_affiliations_by_id(author_group_node, ref_ids)
        group_affs = author_group_node.xpath(
            "string(./affiliation/textfn[1])").getall()
        # Always hand back a list; ``filter`` is lazy on Python 3.
        return [aff for aff in group_affs if aff]

    @staticmethod
    def _find_affiliations_by_id(author_group, ref_ids):
        """Return affiliations with given ids.

        Affiliations should be standardized later.
        """
        return [
            author_group.xpath(
                "string(//affiliation[@id='{}']/textfn[1])".format(
                    aff_id)).extract_first()
            for aff_id in ref_ids
        ]

    def get_author_emails(self, author_node):
        """Extract an author's email addresses."""
        return author_node.xpath(
            'string(./e-address[@type="email"][1])').getall()

    @staticmethod
    def get_author_name(author_node):
        """Extract an author's name as ``"surname, given names, suffix"``."""
        surname = author_node.xpath("string(./surname[1])").extract_first()
        given_names = author_node.xpath(
            "string(./given-name[1])").extract_first()
        suffix = author_node.xpath("string(.//suffix[1])").extract_first()
        return ", ".join(
            el for el in (surname, given_names, suffix) if el)

    @staticmethod
    def get_root_node(elsevier_record):
        """Get a selector on the root node of the record.

        This can be overridden in case some preprocessing needs to be done on
        the XML.

        Args:
            elsevier_record(Union[str, scrapy.selector.Selector]): the record
                in Elsevier format.

        Returns:
            scrapy.selector.Selector: a selector on the root node, with
            namespaces removed.
        """
        if isinstance(elsevier_record, six.string_types):
            root = get_node(elsevier_record)
        else:
            root = elsevier_record
        root.remove_namespaces()

        return root

    def get_author(self, author_node, author_group_node):
        """Extract one author.

        Args:
            author_node(scrapy.selector.Selector): a selector on a single
                ``<author>`` node.
            author_group_node(scrapy.selector.Selector): a selector on the
                ``<author-group>`` the author belongs to.

        Returns:
            dict: the parsed author, conforming to the Inspire schema.
        """
        author_name = self.get_author_name(author_node)
        emails = self.get_author_emails(author_node)
        affiliations = self.get_author_affiliations(author_node,
                                                    author_group_node)

        return self.builder.make_author(author_name,
                                        raw_affiliations=affiliations,
                                        emails=emails)

    @staticmethod
    def get_reference_authors(ref_node):
        """Extract authors from a reference node.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single
                reference.

        Returns:
            List[str]: list of names
        """
        authors_names = []
        for author in ref_node.xpath("./contribution/authors/author"):
            given_names = author.xpath(
                "string(./given-name[1])").extract_first(default="")
            last_names = author.xpath(
                "string(./surname[1])").extract_first(default="")
            authors_names.append(" ".join([given_names, last_names]).strip())
        return authors_names

    @staticmethod
    def get_reference_editors(ref_node):
        """Extract editors from a reference node.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single
                reference.

        Returns:
            List[str]: list of names
        """
        editors_names = []
        for editor in ref_node.xpath(".//editors/authors/author"):
            given_names = editor.xpath(
                "string(./given-name[1])").extract_first(default="")
            last_names = editor.xpath(
                "string(./surname[1])").extract_first(default="")
            editors_names.append(" ".join([given_names, last_names]).strip())
        return editors_names

    @staticmethod
    def get_reference_artid(ref_node):
        """Article number of a reference."""
        return ref_node.xpath("string(.//article-number[1])").extract_first()

    @staticmethod
    def get_reference_pages(ref_node):
        """First and last page of a reference, as a 2-tuple."""
        first_page = ref_node.xpath(
            "string(.//pages/first-page[1])").extract_first()
        last_page = ref_node.xpath(
            "string(.//pages/last-page[1])").extract_first()
        return first_page, last_page

    def get_reference_iter(self, ref_node):
        """Extract the references contained in one ``<bib-reference>`` node.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single
                ``<bib-reference>`` node.

        Yields:
            dict: the parsed reference, as generated by
            :class:`inspire_schemas.api.ReferenceBuilder`
        """
        # handle also unstructured refs
        for citation_node in ref_node.xpath("./reference|./other-ref"):
            builder = ReferenceBuilder()

            builder.add_raw_reference(
                ref_node.extract().strip(),
                source=self.builder.source,
                ref_format="Elsevier",
            )

            # XPath -> builder call, applied in this order for non-empty
            # values only.
            fields = [
                (
                    "string(.//series/title/maintitle[1])",
                    builder.set_journal_title,
                ),
                (
                    "string(.//title[parent::edited-book|parent::book]/maintitle[1])",
                    builder.add_parent_title,
                ),
                ("string(./publisher/name[1])", builder.set_publisher),
                ("string(.//volume-nr[1])", builder.set_journal_volume),
                ("string(.//issue-nr[1])", builder.set_journal_issue),
                ("string(.//date[1])", builder.set_year),
                ("string(.//inter-ref[1])", builder.add_url),
                ("string(.//doi[1])", builder.add_uid),
                (
                    'string(pub-id[@pub-id-type="other"]'
                    '[contains(preceding-sibling::text(),"Report No")][1])',
                    builder.add_report_number,
                ),
                ("string(./title/maintitle[1])", builder.add_title),
            ]
            for xpath, field_handler in fields:
                value = citation_node.xpath(xpath).extract_first()
                if value:
                    field_handler(value)

            label_value = ref_node.xpath("string(./label[1])").extract_first()
            builder.set_label(label_value.strip("[]"))

            pages = self.get_reference_pages(citation_node)
            artid = self.get_reference_artid(citation_node)
            if artid:
                builder.set_page_artid(artid=artid)
            if any(pages):
                builder.set_page_artid(*pages)

            # Whatever text is left once the structured fields are stripped.
            remainder = (remove_tags(
                citation_node,
                strip="self::authors"
                "|self::article-number"
                "|self::volume-nr"
                "|self::issue-nr"
                "|self::inter-ref"
                "|self::maintitle"
                "|self::date"
                "|self::label"
                "|self::publisher"
                "|self::doi"
                "|self::pages").strip("\"';,. \t\n\r").replace("()", ""))
            if remainder:
                builder.add_misc(remainder)

            for editor in self.get_reference_editors(citation_node):
                builder.add_author(editor, "editor")

            for author in self.get_reference_authors(citation_node):
                builder.add_author(author, "author")

            yield builder.obj

    def attach_fulltext_document(self, file_name, url):
        """Add ``file_name``/``url`` as a hidden fulltext document."""
        self.builder.add_document(file_name, url, fulltext=True, hidden=True)

    def get_identifier(self):
        """Return the DOI of the record."""
        return self.dois[0]["doi"]

    def should_record_be_harvested(self):
        """Tell whether the record has a harvestable type and enough metadata.

        Requires an article type in ``DOCTYPES_TO_HARVEST`` together with a
        title, journal title, journal volume and either an article id or a
        starting page.
        """
        if self.article_type in DOCTYPES_TO_HARVEST and all([
                self.title,
                self.journal_title,
                self.journal_volume,
                (self.artid or self.page_start),
        ]):
            return True
        return False
def authorlist_with_affiliations(text):
    """Build an ``authors`` record from an author list with numbered affiliations.

    ``text`` is split by ``split_authors_affs_pattern`` into an author part
    and an affiliation part. Every author name is expected to be followed by
    comma-separated affiliation ids, each of which must appear in the
    affiliation part.

    Raises:
        AttributeError: if the affiliation part cannot be found, or an author
            carries no affiliation ids.
        KeyError: if an author refers to an unknown affiliation id.
        ValueError: if an affiliation line cannot be parsed.
    """

    def _split_name_and_ids(raw_author):
        """Separate the full name from the list of numeric affiliation ids."""
        matched = re.search(
            r'(.+?)(\d+[\,\d]*)', raw_author, flags=re.UNICODE
        )
        fullname, id_string = matched.groups()
        numeric_ids = [
            token for token in id_string.split(',') if token.isdigit()
        ]
        return fullname, numeric_ids

    def _resolve_affiliations(raw_author, known_affiliations):
        """Return ``(fullname, [affiliation, ...])`` for one raw author."""
        try:
            fullname, wanted_ids = _split_name_and_ids(raw_author)
        except AttributeError:
            raise AttributeError(
                'Cannot split affiliation IDs from author name. This author '
                'might not have an affiliation at all', raw_author
            )

        resolved = []
        try:
            for wanted_id in wanted_ids:
                resolved.append(known_affiliations[wanted_id])
        except KeyError:
            raise KeyError(
                'There might be multiple affiliations per line or '
                'affiliation IDs might not be separated with commas or '
                'the affiliation is missing. '
                'Problematic author and affiliations',
                raw_author, wanted_ids, known_affiliations
            )
        return (fullname, resolved)

    # Try to work with badly formatted input
    # There should be commas between different affiliation ids in author
    # names and there should be only one affiliation name per line.
    sections = split_authors_affs_pattern.search(text)
    if not sections:
        raise AttributeError('Could not find affiliations')

    author_text = sections.group(1)
    # ensure comma between affid and next author name:
    author_text = re.sub(r'(\d+)[\n\s](\D)', r'\1, \2', author_text)
    author_text = author_text.replace('\n', '')
    # The remaining clean-ups run in this exact order.
    cleanups = (
        # ensure no comma between author name and its affids
        (r'(\D)\,[\n\s]*(\d)', r'\1\2'),
        # ensure space between comma and next author name:
        (r'(\d+)\,(\S\D)', r'\1, \2'),
        # ensure no spaces between affids of an author
        (r'(\d+)\,\s(\d+)', r'\1,\2'),
    )
    for pattern, replacement in cleanups:
        author_text = re.sub(pattern, replacement, author_text)
    raw_authors = author_text.replace(' and ', ', ').split(', ')

    # Extract affiliations:
    affiliation_text = sections.group(2)
    affiliation_lines = [
        line.replace('\n', ' ').strip()
        for line in split_affs_pattern.findall(affiliation_text)
    ]
    affiliations_by_id = {}
    for line in affiliation_lines:
        try:
            # Note that affiliation may contain numbers
            aff_id, aff_name = re.search(r'^(\d+)\.?\s?(.*)$', line).groups()
            affiliations_by_id[aff_id] = aff_name
        except (ValueError, AttributeError):
            raise ValueError('Cannot parse affiliation', line)

    builder = LiteratureBuilder()
    for raw_author in raw_authors:
        fullname, author_affs = _resolve_affiliations(
            raw_author, affiliations_by_id)
        parsed_author = builder.make_author(
            fullname, raw_affiliations=author_affs)
        builder.add_author(parsed_author)

    return {'authors': builder.record['authors']}
class ArxivParser(object):
    """Parser for the arXiv format.

    It can be used directly by invoking the :func:`ArxivParser.parse` method,
    or be subclassed to customize its behavior.

    Args:
        arxiv_record (Union[str, scrapy.selector.Selector]): the record in
            arXiv format to parse.
        source (Optional[str]): if provided, sets the ``source`` everywhere in
            the record. Otherwise ``'arXiv'`` is used.
    """

    # Shared LaTeX-to-text converter used by :meth:`latex_to_unicode`.
    _l2t = LatexNodes2Text(
        latex_context=get_arxiv_latex_context_db(),
        math_mode="verbatim",
        strict_latex_spaces="based-on-source",
        keep_comments=True,
        keep_braced_groups=True,
        keep_braced_groups_minlen=2,
    )

    def __init__(self, arxiv_record, source=None):
        self.root = self.get_root_node(arxiv_record)
        if not source:
            source = 'arXiv'
        self.builder = LiteratureBuilder(source)

    def parse(self):
        """Extract an arXiv record into an Inspire HEP record.

        Returns:
            dict: the same record in the Inspire Literature schema.
        """
        self.builder.add_abstract(abstract=self.abstract, source=self.source)
        self.builder.add_title(title=self.title, source=self.source)
        for license_kwargs in self.licenses:
            self.builder.add_license(**license_kwargs)
        for author in self.authors:
            self.builder.add_author(author)
        self.builder.add_number_of_pages(self.number_of_pages)
        self.builder.add_publication_info(**self.publication_info)
        for collab in self.collaborations:
            self.builder.add_collaboration(collab)
        for doi in self.dois:
            self.builder.add_doi(**doi)
        self.builder.add_preprint_date(self.preprint_date)
        public_note = self.public_note
        if public_note:
            self.builder.add_public_note(public_note, self.source)
        for rep_number in self.report_numbers:
            self.builder.add_report_number(rep_number, self.source)
        self.builder.add_arxiv_eprint(self.arxiv_eprint, self.arxiv_categories)
        self.builder.add_private_note(self.private_note)
        self.builder.add_document_type(self.document_type)
        normalized_categories = [
            classify_field(arxiv_cat) for arxiv_cat in self.arxiv_categories
        ]
        self.builder.add_inspire_categories(
            dedupe_list(normalized_categories), 'arxiv')

        return self.builder.record

    def _get_authors_and_collaborations(self, node):
        """Parse authors, affiliations and collaborations from the record node.

        Heuristics are used to detect collaborations. In case those are not
        reliable, a warning is returned for manual checking.

        Args:
            node (Selector): a selector on a record

        Returns:
            tuple: a tuple of (authors, collaborations, warning)
        """
        author_selectors = node.xpath('.//authors//author')

        # take 'for the' out of the general phrases and dont use it in
        # affiliations
        collab_phrases = [
            'consortium',
            ' collab ',
            'collaboration',
            ' team',
            'group',
            ' on behalf of ',
            ' representing ',
        ]
        inst_phrases = ['institute', 'university', 'department', 'center']

        authors = []
        collaborations = []
        warning_tags = []
        some_affiliation_contains_collaboration = False

        authors_and_affiliations = (
            self._get_author_names_and_affiliations(author)
            for author in author_selectors)
        # Same sequence shifted by one, so that each author can be paired
        # with the one that follows it.
        next_author_and_affiliations = (
            self._get_author_names_and_affiliations(author)
            for author in author_selectors)
        # A default keeps a record without authors from raising
        # StopIteration here.
        next(next_author_and_affiliations, None)

        for (forenames, keyname, affiliations), \
                (next_forenames, next_keyname, _) in six.moves.zip_longest(
                    authors_and_affiliations,
                    next_author_and_affiliations,
                    fillvalue=('end of author-list', '', None)):

            name_string = " %s %s " % (forenames, keyname)

            # collaborations in affiliation field? Cautious with 'for the' in
            # Inst names
            affiliations_with_collaborations = []
            affiliations_without_collaborations = []
            for aff in affiliations:
                affiliation_contains_collaboration = any(
                    phrase in aff.lower()
                    for phrase in collab_phrases) and not any(
                        phrase in aff.lower() for phrase in inst_phrases)
                if affiliation_contains_collaboration:
                    affiliations_with_collaborations.append(aff)
                    some_affiliation_contains_collaboration = True
                else:
                    affiliations_without_collaborations.append(aff)
            for aff in affiliations_with_collaborations:
                coll, _ = coll_cleanforthe(aff)
                if coll and coll not in collaborations:
                    collaborations.append(coll)

            # Check if name is a collaboration, else append to authors
            collaboration_in_name = ' for the ' in name_string.lower() or any(
                phrase in name_string.lower() for phrase in collab_phrases)
            if collaboration_in_name:
                coll, author_name = coll_cleanforthe(name_string)
                if author_name:
                    surname, given_names = split_fullname(author_name)
                    authors.append({
                        'full_name': surname + ', ' + given_names,
                        'surname': surname,
                        'given_names': given_names,
                        'affiliations': [],
                    })
                if coll and coll not in collaborations:
                    collaborations.append(coll)
            elif name_string.strip() == ':':
                # DANGERZONE : this might not be correct - add a warning for
                # the cataloger
                warning_tags.append(' %s %s ' % (next_forenames, next_keyname))
                if not some_affiliation_contains_collaboration:
                    # everything up to now seems to be collaboration info
                    for author_info in authors:
                        collected_name = " %s %s " % \
                            (author_info['given_names'], author_info['surname'])
                        coll, _ = coll_cleanforthe(collected_name)
                        if coll and coll not in collaborations:
                            collaborations.append(coll)
                    authors = []
            else:
                authors.append({
                    'full_name': keyname + ', ' + forenames,
                    'surname': keyname,
                    'given_names': forenames,
                    'affiliations': affiliations_without_collaborations
                })

        if warning_tags:
            warning = 'WARNING: Colon in authors before %s: Check author list for collaboration names!' % ', '.join(
                warning_tags)
        else:
            warning = ''
        return authors, collaborations, warning

    @staticmethod
    def _get_author_names_and_affiliations(author_node):
        """Return ``(forenames, keyname, affiliations)`` of an author node."""
        forenames = u' '.join(
            author_node.xpath('.//forenames//text()').extract())
        keyname = u' '.join(author_node.xpath('.//keyname//text()').extract())
        affiliations = author_node.xpath('.//affiliation//text()').extract()

        return forenames, keyname, affiliations

    @property
    def preprint_date(self):
        """Text of the first ``created`` node."""
        return self.root.xpath('.//created/text()').extract_first()

    @property
    def abstract(self):
        """Abstract with whitespace collapsed and LaTeX converted to text."""
        abstract = self.root.xpath('.//abstract/text()').extract_first()
        long_text_fixed = self.fix_long_text(abstract)
        return self.latex_to_unicode(long_text_fixed)

    @property
    def authors(self):
        """Authors of the record, built with ``self.builder.make_author``."""
        authors, _, _ = self.authors_and_collaborations
        return [
            self.builder.make_author(full_name=auth["full_name"],
                                     raw_affiliations=auth["affiliations"])
            for auth in authors
        ]

    @property
    def collaborations(self):
        """Collaboration names detected in the author list."""
        _, collaborations, _ = self.authors_and_collaborations
        return collaborations

    @property
    def dois(self):
        """Keyword arguments for ``add_doi``, one per DOI found."""
        doi_values = self.root.xpath('.//doi/text()').extract()
        # A single ``doi`` node may hold several DOIs.
        doi_values_splitted = chain.from_iterable(
            [re.split(RE_DOIS, doi) for doi in doi_values])
        return [{
            'doi': value,
            'material': 'publication'
        } for value in doi_values_splitted]

    @property
    def licenses(self):
        """Keyword arguments for ``add_license``, one per ``license`` node."""
        license_urls = self.root.xpath('.//license/text()').extract()
        return [{
            'url': license_url,
            'material': self.material
        } for license_url in license_urls]

    @property
    def material(self):
        """Material of an arXiv record: always ``'preprint'``."""
        return 'preprint'

    @property
    def number_of_pages(self):
        """Page count matched by ``RE_PAGES`` in the comments, or ``None``."""
        comments = '; '.join(self.root.xpath('.//comments/text()').extract())
        found_pages = RE_PAGES.search(comments)
        if found_pages:
            return maybe_int(found_pages.group(1))
        return None

    @property
    def publication_info(self):
        """Keyword arguments for ``add_publication_info``."""
        return {
            'material': 'publication',
            'pubinfo_freetext': self.pubinfo_freetext,
        }

    @property
    def pubinfo_freetext(self):
        """Text of the first ``journal-ref`` node."""
        return self.root.xpath('.//journal-ref/text()').extract_first()

    @property
    def title(self):
        """Title with whitespace collapsed and LaTeX converted to text."""
        long_text_fixed = self.fix_long_text(
            self.root.xpath('.//title/text()').extract_first())
        return self.latex_to_unicode(long_text_fixed)

    @staticmethod
    def fix_long_text(text):
        """Collapse every whitespace run into one space and strip the ends.

        ``None`` (a missing node) is returned unchanged instead of raising
        ``TypeError``.
        """
        if text is None:
            return None
        return re.sub(r'\s+', ' ', text).strip()

    @staticmethod
    def get_root_node(arxiv_record):
        """Get a selector on the root node of the record.

        This can be overridden in case some preprocessing needs to be done on
        the XML.

        Args:
            arxiv_record(Union[str, scrapy.selector.Selector]): the record in
                arXiv format.

        Returns:
            scrapy.selector.Selector: a selector on the root node, with
            namespaces removed.
        """
        if isinstance(arxiv_record, six.string_types):
            root = get_node(arxiv_record)
        else:
            root = arxiv_record
        root.remove_namespaces()

        return root

    @property
    def public_note(self):
        """All comments joined by ``'; '`` and converted from LaTeX."""
        comments = '; '.join(self.root.xpath('.//comments/text()').extract())
        return self.latex_to_unicode(comments)

    @property
    def private_note(self):
        """Warning produced while parsing the author list ('' if none)."""
        _, _, warning = self.authors_and_collaborations
        return warning

    @property
    def report_numbers(self):
        """Report numbers, with ``', '``-separated values split apart."""
        rns = []
        for rn in self.root.xpath('.//report-no/text()').extract():
            rns.extend(rn.split(', '))
        return rns

    @property
    def arxiv_eprint(self):
        """Text of the first ``id`` node."""
        return self.root.xpath('.//id/text()').extract_first()

    @property
    def arxiv_categories(self):
        """Normalized arXiv categories without duplicates."""
        # NOTE(review): with no ``categories`` node the default ``'[]'``
        # splits into ``['[]']`` rather than an empty list -- confirm that
        # this is intended.
        categories = self.root.xpath('.//categories/text()').extract_first(
            default='[]')
        categories_without_old = [
            normalize_arxiv_category(arxiv_cat)
            for arxiv_cat in categories.split()
        ]
        return dedupe_list(categories_without_old)

    @property
    def document_type(self):
        """Document type guessed from the comments; ``'article'`` by default."""
        comments = '; '.join(self.root.xpath('.//comments/text()').extract())

        if RE_THESIS.search(comments):
            return 'thesis'
        if RE_CONFERENCE.search(comments):
            return 'conference paper'
        return 'article'

    @property
    def source(self):
        """Source attached to the fields taken from arXiv."""
        return 'arXiv'

    @property
    def authors_and_collaborations(self):
        """``(authors, collaborations, warning)``, computed on first access."""
        if not hasattr(self, '_authors_and_collaborations'):
            self._authors_and_collaborations = \
                self._get_authors_and_collaborations(self.root)
        return self._authors_and_collaborations

    @classmethod
    def latex_to_unicode(cls, latex_string):
        """Convert LaTeX markup to text; return the input if that fails."""
        try:
            # NOTE(review): as written this replaces a space with a space;
            # the first argument was presumably a double or non-breaking
            # space -- confirm against the original file.
            return cls._l2t.latex_to_text(latex_string).replace(" ", " ")
        except Exception:
            # Best effort: keep the raw string when the conversion fails.
            return latex_string
def crawler2hep(crawler_record):
    """Convert a crawler record into a literature record.

    Each supported field of ``crawler_record`` is passed to a
    ``LiteratureBuilder`` created with ``'hepcrawl'`` as its source. The
    builder's ``validate_record()`` is called before the record is returned,
    so whatever that call raises propagates to the caller.

    Args:
        crawler_record(dict): the record produced by the crawler. Only
            ``authors[*]['full_name']`` and ``collections[*]['primary']`` are
            accessed with mandatory keys; everything else is optional.

    Returns:
        dict: ``builder.record`` after all fields have been added.
    """

    def _filter_affiliation(affiliations):
        # Keep only the affiliations that carry a non-empty ``value``.
        return [
            affiliation.get('value')
            for affiliation in affiliations
            if affiliation.get('value')
        ]

    builder = LiteratureBuilder('hepcrawl')

    for author in crawler_record.get('authors', []):
        builder.add_author(
            builder.make_author(
                author['full_name'],
                # An author without (or with a null) ``affiliations`` key is
                # treated as having no affiliations instead of failing.
                affiliations=_filter_affiliation(
                    author.get('affiliations') or []
                ),
            )
        )

    for title in crawler_record.get('titles', []):
        builder.add_title(
            title=title.get('title'),
            source=title.get('source'),
        )

    for abstract in crawler_record.get('abstracts', []):
        builder.add_abstract(
            abstract=abstract.get('value'),
            source=abstract.get('source'),
        )

    for arxiv_eprint in crawler_record.get('arxiv_eprints', []):
        builder.add_arxiv_eprint(
            arxiv_id=arxiv_eprint.get('value'),
            arxiv_categories=arxiv_eprint.get('categories'),
        )

    for doi in crawler_record.get('dois', []):
        builder.add_doi(doi=doi.get('value'))

    for public_note in crawler_record.get('public_notes', []):
        builder.add_public_note(
            public_note=public_note.get('value'),
            source=public_note.get('source'),
        )

    for license_info in crawler_record.get('license', []):
        builder.add_license(
            url=license_info.get('url'),
            license=license_info.get('license'),
        )

    for collaboration in crawler_record.get('collaborations', []):
        builder.add_collaboration(collaboration=collaboration.get('value'))

    for imprint in crawler_record.get('imprints', []):
        builder.add_imprint_date(imprint_date=imprint.get('date'))

    for copyright_info in crawler_record.get('copyright', []):
        builder.add_copyright(
            holder=copyright_info.get('holder'),
            material=copyright_info.get('material'),
            statement=copyright_info.get('statement'),
        )

    builder.add_preprint_date(
        preprint_date=crawler_record.get('preprint_date'))

    acquisition_source = crawler_record.get('acquisition_source', {})
    builder.add_acquisition_source(
        method='hepcrawl',
        date=acquisition_source.get('date'),
        source=acquisition_source.get('source'),
        submission_number=acquisition_source.get('submission_number'),
    )

    # Best effort: a missing, empty or non-numeric ``page_nr`` is ignored.
    try:
        builder.add_number_of_pages(
            number_of_pages=int(crawler_record.get('page_nr', [])[0]))
    except (TypeError, ValueError, IndexError):
        pass

    publication_types = [
        'introductory',
        'lectures',
        'review',
    ]
    special_collections = [
        'cdf-internal-note',
        'cdf-note',
        'cds',
        'd0-internal-note',
        'd0-preliminary-note',
        'h1-internal-note',
        'h1-preliminary-note',
        'halhidden',
        'hephidden',
        'hermes-internal-note',
        'larsoft-internal-note',
        'larsoft-note',
        'zeus-internal-note',
        'zeus-preliminary-note',
    ]
    document_types = [
        'book',
        'note',
        'report',
        'proceedings',
        'thesis',
    ]

    added_doc_type = False
    for collection in crawler_record.get('collections', []):
        collection = collection['primary'].strip().lower()
        if collection == 'arxiv':
            continue  # ignored
        elif collection == 'citeable':
            builder.set_citeable(True)
        elif collection == 'core':
            builder.set_core(True)
        elif collection == 'noncore':
            builder.set_core(False)
        elif collection == 'published':
            builder.set_refereed(True)
        elif collection == 'withdrawn':
            builder.set_withdrawn(True)
        elif collection in publication_types:
            builder.add_publication_type(collection)
        elif collection in special_collections:
            builder.add_special_collection(collection.upper())
        elif collection == 'bookchapter':
            added_doc_type = True
            builder.add_document_type('book chapter')
        elif collection == 'conferencepaper':
            added_doc_type = True
            builder.add_document_type('conference paper')
        elif collection in document_types:
            added_doc_type = True
            builder.add_document_type(collection)

    # Fall back to 'article' when no collection set a document type.
    if not added_doc_type:
        builder.add_document_type('article')

    # ``dict.get`` only uses its default for a *missing* key, so an explicit
    # empty list (or ``None``) must be handled separately to avoid indexing
    # into nothing.
    publication_infos = crawler_record.get('publication_info') or [{}]
    _pub_info = publication_infos[0]
    builder.add_publication_info(
        year=_pub_info.get('year'),
        artid=_pub_info.get('artid'),
        page_end=_pub_info.get('page_end'),
        page_start=_pub_info.get('page_start'),
        journal_issue=_pub_info.get('journal_issue'),
        journal_title=_pub_info.get('journal_title'),
        journal_volume=_pub_info.get('journal_volume'),
        pubinfo_freetext=_pub_info.get('pubinfo_freetext'),
    )

    for report_number in crawler_record.get('report_numbers', []):
        builder.add_report_number(
            report_number=report_number.get('value'),
            source=report_number.get('source'),
        )

    builder.validate_record()

    return builder.record
class JatsParser(object):
    """Parser for the JATS format.

    It can be used directly by invoking the :func:`JatsParser.parse` method, or be
    subclassed to customize its behavior.

    Args:
        jats_record (Union[str, scrapy.selector.Selector]): the record in JATS format to parse.
        source (Optional[str]): if provided, sets the ``source`` everywhere in
            the record. Otherwise, the source is extracted from the JATS metadata.
    """
    def __init__(self, jats_record, source=None):
        self.root = self.get_root_node(jats_record)
        if not source:
            # Fall back to the ``publisher-name`` found in the record.
            source = self.publisher
        self.builder = LiteratureBuilder(source)

    def parse(self):
        """Extract a JATS record into an Inspire HEP record.

        Returns:
            dict: the same record in the Inspire Literature schema.
        """
        self.builder.add_abstract(self.abstract)
        self.builder.add_title(self.title, subtitle=self.subtitle)
        self.builder.add_copyright(**self.copyright)
        self.builder.add_document_type(self.document_type)
        self.builder.add_license(**self.license)
        for author in self.authors:
            self.builder.add_author(author)
        self.builder.add_number_of_pages(self.number_of_pages)
        self.builder.add_publication_info(**self.publication_info)
        for collab in self.collaborations:
            self.builder.add_collaboration(collab)
        for doi in self.dois:
            self.builder.add_doi(**doi)
        for keyword in self.keywords:
            self.builder.add_keyword(**keyword)
        self.builder.add_imprint_date(self.publication_date.dumps())
        for reference in self.references:
            self.builder.add_reference(reference)

        return self.builder.record

    @property
    def references(self):
        """Extract a JATS record into an Inspire HEP references record.

        Returns:
            List[dict]: an array of reference schema records, representing
                the references in the record
        """
        ref_nodes = self.root.xpath('./back/ref-list/ref')
        # ``get_reference`` yields one reference per ``mixed-citation``, so
        # the per-node generators are flattened into a single list.
        return list(
            itertools.chain.from_iterable(
                self.get_reference(node) for node in ref_nodes
            )
        )

    # Keyword arguments passed to ``remove_tags`` when cleaning the abstract.
    remove_tags_config_abstract = {
        'allowed_tags': ['sup', 'sub'],
        'allowed_trees': ['math'],
        'strip': 'self::pub-id|self::issn'
    }

    @property
    def abstract(self):
        """The first ``abstract`` under ``front`` with tags removed, or ``None``."""
        abstract_nodes = self.root.xpath('./front//abstract[1]')

        if not abstract_nodes:
            return None

        abstract = remove_tags(abstract_nodes[0], **self.remove_tags_config_abstract).strip()
        return abstract

    @property
    def article_type(self):
        """The ``article-type`` attribute of the root node, or ``None``."""
        article_type = self.root.xpath('./@article-type').extract_first()

        return article_type

    @property
    def artid(self):
        """The first text found under ``elocation-id`` in the article meta."""
        artid = self.root.xpath('./front/article-meta//elocation-id//text()').extract_first()

        return artid

    @property
    def authors(self):
        """List of authors built by :meth:`get_author` from author ``contrib`` nodes."""
        author_nodes = self.root.xpath('./front//contrib[@contrib-type="author"]')
        authors = [self.get_author(author) for author in author_nodes]

        return authors

    @property
    def collaborations(self):
        """Set of the string values of all collaboration-like nodes in ``front``."""
        collab_nodes = self.root.xpath(
            './front//collab |'
            './front//contrib[@contrib-type="collaboration"] |'
            './front//on-behalf-of'
        )
        collaborations = set(
            collab.xpath('string(.)').extract_first() for collab in collab_nodes
        )

        return collaborations

    @property
    def copyright(self):
        """Keyword arguments for ``builder.add_copyright``."""
        copyright = {
            'holder': self.copyright_holder,
            'material': self.material,
            'statement': self.copyright_statement,
            'year': self.copyright_year,
        }

        return copyright

    @property
    def copyright_holder(self):
        """Text of the first ``copyright-holder`` in ``front``, or ``None``."""
        copyright_holder = self.root.xpath('./front//copyright-holder/text()').extract_first()

        return copyright_holder

    @property
    def copyright_statement(self):
        """Text of the first ``copyright-statement`` in ``front``, or ``None``."""
        copyright_statement = self.root.xpath('./front//copyright-statement/text()').extract_first()

        return copyright_statement

    @property
    def copyright_year(self):
        """Text of the first ``copyright-year`` in ``front``, passed through ``maybe_int``."""
        copyright_year = self.root.xpath('./front//copyright-year/text()').extract_first()

        return maybe_int(copyright_year)

    @property
    def dois(self):
        """List of keyword-argument dicts for ``builder.add_doi``.

        The article's own DOIs carry the record's ``material``. When the
        record is not a plain publication, the DOIs of related articles are
        appended without a ``material``.
        """
        doi_values = self.root.xpath('./front/article-meta//article-id[@pub-id-type="doi"]/text()').extract()
        dois = [
            {'doi': value, 'material': self.material} for value in doi_values
        ]

        if self.material != 'publication':
            doi_values = self.root.xpath(
                './front/article-meta//related-article[@ext-link-type="doi"]/@href'
            ).extract()
            related_dois = ({'doi': value} for value in doi_values)
            dois.extend(related_dois)

        return dois

    @property
    def document_type(self):
        """``'conference paper'`` if a conference node is present, else ``'article'``."""
        if self.is_conference_paper:
            document_type = 'conference paper'
        else:
            document_type = 'article'

        return document_type

    @property
    def is_conference_paper(self):
        """Decide whether the article is a conference paper."""
        conference_node = self.root.xpath('./front//conference').extract_first()

        return bool(conference_node)

    @property
    def journal_title(self):
        """First abbreviated or full journal title found in the journal meta."""
        journal_title = self.root.xpath(
            './front/journal-meta//abbrev-journal-title/text() |'
            './front/journal-meta//journal-title/text()'
        ).extract_first()

        return journal_title

    @property
    def journal_issue(self):
        """Text of ``issue`` in the article meta, or ``None``."""
        journal_issue = self.root.xpath('./front/article-meta/issue/text()').extract_first()

        return journal_issue

    @property
    def journal_volume(self):
        """Text of ``volume`` in the article meta, or ``None``."""
        journal_volume = self.root.xpath('./front/article-meta/volume/text()').extract_first()

        return journal_volume

    @property
    def keywords(self):
        """Lazy iterable of keyword dicts from every ``kwd-group`` in ``front``."""
        keyword_groups = self.root.xpath('./front//kwd-group')
        keywords = itertools.chain.from_iterable(self.get_keywords(group) for group in keyword_groups)

        return keywords

    @property
    def license(self):
        """Keyword arguments for ``builder.add_license``."""
        license = {
            'license': self.license_statement,
            'material': self.material,
            'url': self.license_url,
        }

        return license

    @property
    def license_statement(self):
        """Stripped string value of the ``license`` node in the article meta."""
        license_statement = self.root.xpath('string(./front/article-meta//license)').extract_first().strip()

        return license_statement

    @property
    def license_url(self):
        """First license URL found among the supported locations, or ``None``."""
        url_nodes = (
            './front/article-meta//license_ref/text() |'
            './front/article-meta//license/@href |'
            './front/article-meta//license//ext-link/@href'
        )
        license_url = self.root.xpath(url_nodes).extract_first()

        return license_url

    @property
    def material(self):
        """Material of the record, derived from :attr:`article_type`.

        A record without an ``article-type`` attribute is treated as a
        ``'publication'``.
        """
        # ``article_type`` is ``None`` when the attribute is absent; read it
        # once and normalize so the string tests below cannot fail.
        article_type = self.article_type or ''

        if article_type.startswith('correc'):
            material = 'erratum'
        elif article_type in ('erratum', 'translation', 'addendum', 'reprint'):
            material = article_type
        else:
            material = 'publication'

        return material

    @property
    def number_of_pages(self):
        """The ``count`` attribute of ``page-count``, passed through ``maybe_int``."""
        number_of_pages = maybe_int(self.root.xpath('./front/article-meta//page-count/@count').extract_first())

        return number_of_pages

    @property
    def page_start(self):
        """Text of ``fpage`` in the article meta, or ``None``."""
        page_start = self.root.xpath('./front/article-meta/fpage/text()').extract_first()

        return page_start

    @property
    def page_end(self):
        """Text of ``lpage`` in the article meta, or ``None``."""
        page_end = self.root.xpath('./front/article-meta/lpage/text()').extract_first()

        return page_end

    @property
    def publication_date(self):
        """The smallest date among the publication date nodes in ``front``.

        Raises:
            ValueError: if the record has no matching date node (``min`` of
                an empty sequence).
        """
        date_nodes = self.root.xpath(
            './front//pub-date[@pub-type="ppub"] |'
            './front//pub-date[@pub-type="epub"] |'
            './front//pub-date[starts-with(@date-type,"pub")] |'
            './front//date[starts-with(@date-type,"pub")]'
        )
        publication_date = min(
            self.get_date(date_node) for date_node in date_nodes
        )

        return publication_date

    @property
    def publication_info(self):
        """Keyword arguments for ``builder.add_publication_info``."""
        publication_info = {
            'artid': self.artid,
            'journal_title': self.journal_title,
            'journal_issue': self.journal_issue,
            'journal_volume': self.journal_volume,
            'material': self.material,
            'page_start': self.page_start,
            'page_end': self.page_end,
            'year': self.year,
        }

        return publication_info

    @property
    def publisher(self):
        """Text of the first ``publisher-name`` in ``front``, or ``None``."""
        publisher = self.root.xpath('./front//publisher-name/text()').extract_first()

        return publisher

    @property
    def subtitle(self):
        """String value of the ``subtitle`` node in ``front``."""
        subtitle = self.root.xpath('string(./front//subtitle)').extract_first()

        return subtitle

    @property
    def title(self):
        """String value of the ``article-title`` node in ``front``."""
        title = self.root.xpath('string(./front//article-title)').extract_first()

        return title

    def get_affiliation(self, id_):
        """Get the affiliation with the specified id.

        Args:
            id_(str): the value of the ``id`` attribute of the affiliation.

        Returns:
            Optional[str]: the affiliation with that id or ``None`` if there is
                no match.
        """
        affiliation_nodes = self.root.xpath('//aff[@id=$id_]', id_=id_)
        if not affiliation_nodes:
            # Honor the documented contract instead of failing on ``[0]``.
            return None

        affiliation = remove_tags(
            affiliation_nodes[0],
            strip='self::label | self::email'
        ).strip()

        return affiliation

    def get_emails_from_refs(self, id_):
        """Get the emails from the node with the specified id.

        Args:
            id_(str): the value of the ``id`` attribute of the node.

        Returns:
            List[str]: the emails from the node with that id or [] if none found.
        """
        email_nodes = self.root.xpath('//aff[@id=$id_]/email/text()', id_=id_)
        return email_nodes.extract()

    @property
    def year(self):
        """Year of the smallest date among the selected publication date nodes.

        Raises:
            ValueError: if no date node is selected (``min`` of an empty
                sequence).
        """
        # NOTE(review): ``not_online`` is handed to ``xpath`` as the value of
        # the ``$not_online`` variable rather than spliced into the expression
        # text, so it is presumably evaluated as a plain (non-empty, hence
        # truthy) string and filters nothing; its parentheses are also
        # unbalanced. Left untouched to keep the extracted year stable --
        # confirm the intended filter before changing it.
        not_online = (
            'not(starts-with(@publication-format, "elec"))'
            ' and not(starts-with(@publication-format, "online")'
        )
        date_nodes = self.root.xpath(
            './front//pub-date[@pub-type="ppub"] |'
            './front//pub-date[starts-with(@date-type,"pub") and $not_online] |'
            './front//date[starts-with(@date-type,"pub") and $not_online]',
            not_online=not_online
        )

        year = min(
            self.get_date(date_node) for date_node in date_nodes
        ).year

        return year

    @staticmethod
    def _get_affiliation_ids(author_node):
        """Return the unique affiliation ids an author refers to, in document order."""
        raw_referred_ids = author_node.xpath('.//xref[@ref-type="aff"]/@rid').extract()
        # Sometimes the rid might have more than one ID (e.g. rid="id0 id1")
        referred_ids = []
        seen_ids = set()
        for raw_referred_id in raw_referred_ids:
            for referred_id in raw_referred_id.split(' '):
                if referred_id not in seen_ids:
                    seen_ids.add(referred_id)
                    referred_ids.append(referred_id)

        return referred_ids

    def get_author_affiliations(self, author_node):
        """Extract an author's affiliations.

        Affiliations are returned in the order in which they are first
        referred to; ids that do not resolve to a non-empty affiliation are
        dropped.
        """
        affiliations = []
        for rid in self._get_affiliation_ids(author_node):
            # Look each affiliation up once and keep only the non-empty ones.
            affiliation = self.get_affiliation(rid)
            if affiliation:
                affiliations.append(affiliation)

        return affiliations

    def get_author_emails(self, author_node):
        """Extract an author's email addresses.

        Emails found directly on the author node come first, followed by the
        emails of every affiliation the author refers to.
        """
        emails = author_node.xpath('.//email/text()').extract()
        # Use the same id splitting as for affiliations so that a multi-id
        # ``rid`` (e.g. rid="id0 id1") is resolved id by id.
        for referred_id in self._get_affiliation_ids(author_node):
            emails.extend(self.get_emails_from_refs(referred_id))

        return emails

    @staticmethod
    def get_author_name(author_node):
        """Extract an author's name.

        Returns the ``string-name`` when the author has no ``surname`` and an
        unstructured name is available; otherwise the non-empty parts among
        surname, given names and suffix joined by ``', '``.
        """
        surname = author_node.xpath('.//surname/text()').extract_first()
        if not surname:
            # the author name is unstructured
            unstructured_name = author_node.xpath('string(./string-name)').extract_first()
            if unstructured_name:
                return unstructured_name

        given_names = author_node.xpath('.//given-names/text()').extract_first()
        suffix = author_node.xpath('.//suffix/text()').extract_first()
        author_name = ', '.join(el for el in (surname, given_names, suffix) if el)

        return author_name

    @staticmethod
    def get_date(date_node):
        """Extract a date from a date node.

        Three candidates are computed -- from the ``iso-8601-date``
        attribute, from the ``year``/``month``/``day`` children and from the
        ``string-date`` child -- and combined with ``get_first``.

        Returns:
            PartialDate: the parsed date.
        """
        iso_string = date_node.xpath('./@iso-8601-date').extract_first()
        iso_date = PartialDate.loads(iso_string) if iso_string else None

        year = date_node.xpath('string(./year)').extract_first()
        month = date_node.xpath('string(./month)').extract_first()
        day = date_node.xpath('string(./day)').extract_first()
        date_from_parts = PartialDate.from_parts(year, month, day) if year else None

        string_date = date_node.xpath('string(./string-date)').extract_first()
        try:
            parsed_date = PartialDate.parse(string_date)
        except ValueError:
            parsed_date = None

        date = get_first([iso_date, date_from_parts, parsed_date])
        return date

    @staticmethod
    def get_keywords(group_node):
        """Extract keywords from a keyword group.

        Returns a lazy iterable of ``{'keyword': ..., 'schema': ...}`` dicts;
        ``schema`` is ``'PACS'`` when the group type contains ``pacs``
        (case-insensitively) and ``None`` otherwise.
        """
        schema = None
        if 'pacs' in group_node.xpath('@kwd-group-type').extract_first(default='').lower():
            schema = 'PACS'

        keywords = (kwd.xpath('string(.)').extract_first() for kwd in group_node.xpath('.//kwd'))
        keyword_dicts = ({'keyword': keyword, 'schema': schema} for keyword in keywords)

        return keyword_dicts

    @staticmethod
    def get_root_node(jats_record):
        """Get a selector on the root ``article`` node of the record.

        This can be overridden in case some preprocessing needs to be done on
        the XML.

        Args:
            jats_record(Union[str, scrapy.selector.Selector]): the record in JATS format.

        Returns:
            scrapy.selector.Selector: a selector on the root ``<article>``
                node.
        """
        if isinstance(jats_record, six.string_types):
            root = get_node(jats_record)
        else:
            root = jats_record
        root.remove_namespaces()

        return root

    def get_author(self, author_node):
        """Extract one author.

        Args:
            author_node(scrapy.selector.Selector): a selector on a single
                author, e.g. a ``<contrib contrib-type="author">``.

        Returns:
            dict: the parsed author, conforming to the Inspire schema.
        """
        author_name = self.get_author_name(author_node)
        emails = self.get_author_emails(author_node)
        affiliations = self.get_author_affiliations(author_node)

        return self.builder.make_author(author_name, raw_affiliations=affiliations, emails=emails)

    @staticmethod
    def get_reference_authors(ref_node, role):
        """Extract authors of `role` from a reference node.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single reference.
            role(str): author role

        Returns:
            List[str]: list of names
        """
        return ref_node.xpath(
            './person-group[@person-group-type=$role]/string-name/text()',
            role=role
        ).extract()

    def get_reference(self, ref_node):
        """Extract the references contained in one ``<ref>`` node.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single
                reference, i.e. ``<ref>``.

        Yields:
            dict: one parsed reference per ``mixed-citation`` child, as
                generated by :class:`inspire_schemas.api.ReferenceBuilder`
        """
        for citation_node in ref_node.xpath('./mixed-citation'):
            builder = ReferenceBuilder()

            builder.add_raw_reference(
                ref_node.extract().strip(),
                source=self.builder.source,
                ref_format='JATS'
            )

            # Pairs of (xpath relative to the citation, builder method that
            # receives the first extracted value).
            fields = [
                (
                    (
                        'self::node()[@publication-type="journal" '
                        'or @publication-type="eprint"]/source/text()'
                    ),
                    builder.set_journal_title,
                ),
                (
                    'self::node()[@publication-type="book"]/source/text()',
                    builder.add_parent_title,
                ),
                ('./publisher-name/text()', builder.set_publisher),
                ('./volume/text()', builder.set_journal_volume),
                ('./issue/text()', builder.set_journal_issue),
                ('./year/text()', builder.set_year),
                ('./pub-id[@pub-id-type="arxiv"]/text()', builder.add_uid),
                ('./pub-id[@pub-id-type="doi"]/text()', builder.add_uid),
                (
                    'pub-id[@pub-id-type="other"]'
                    '[contains(preceding-sibling::text(),"Report No")]/text()',
                    builder.add_report_number
                ),
                ('./article-title/text()', builder.add_title),
                ('../label/text()', lambda x: builder.set_label(x.strip('[].')))
            ]

            for xpath, field_handler in fields:
                value = citation_node.xpath(xpath).extract_first()
                if value:
                    field_handler(value)

            # Whatever text is left once the handled tags are stripped is
            # kept as miscellaneous information.
            remainder = remove_tags(
                citation_node,
                strip='self::person-group'
                      '|self::pub-id'
                      '|self::article-title'
                      '|self::volume'
                      '|self::issue'
                      '|self::year'
                      '|self::label'
                      '|self::publisher-name'
                      '|self::source[../@publication-type!="proc"]'
                      '|self::object-id'
                      '|self::page-range'
                      '|self::issn'
            ).strip('"\';,. \t\n\r').replace('()', '')
            if remainder:
                builder.add_misc(remainder)

            for editor in self.get_reference_authors(citation_node, 'editor'):
                builder.add_author(editor, 'editor')

            for author in self.get_reference_authors(citation_node, 'author'):
                builder.add_author(author, 'author')

            page_range = citation_node.xpath('./page-range/text()').extract_first()
            if page_range:
                page_artid = split_page_artid(page_range)
                builder.set_page_artid(*page_artid)

            yield builder.obj

    def attach_fulltext_document(self, file_name, url):
        """Add ``file_name``/``url`` to the record as a hidden fulltext document."""
        self.builder.add_document(file_name, url, fulltext=True, hidden=True)