class JatsParser(object):
    """Parser for the JATS format.

    It can be used directly by invoking the :func:`JatsParser.parse` method,
    or be subclassed to customize its behavior.

    Args:
        jats_record (Union[str, scrapy.selector.Selector]): the record in JATS
            format to parse.
        source (Optional[str]): if provided, sets the ``source`` everywhere in
            the record. Otherwise, the source is extracted from the JATS
            metadata.
    """
    def __init__(self, jats_record, source=None):
        self.root = self.get_root_node(jats_record)
        if not source:
            source = self.publisher
        self.builder = LiteratureBuilder(source)

    def parse(self):
        """Extract a JATS record into an Inspire HEP record.

        Returns:
            dict: the same record in the Inspire Literature schema.
        """
        self.builder.add_abstract(self.abstract)
        self.builder.add_title(self.title, subtitle=self.subtitle)
        self.builder.add_copyright(**self.copyright)
        self.builder.add_document_type(self.document_type)
        self.builder.add_license(**self.license)
        for author in self.authors:
            self.builder.add_author(author)
        self.builder.add_number_of_pages(self.number_of_pages)
        self.builder.add_publication_info(**self.publication_info)
        for collab in self.collaborations:
            self.builder.add_collaboration(collab)
        for doi in self.dois:
            self.builder.add_doi(**doi)
        for keyword in self.keywords:
            self.builder.add_keyword(**keyword)
        self.builder.add_imprint_date(self.publication_date.dumps())
        for reference in self.references:
            self.builder.add_reference(reference)

        return self.builder.record

    @property
    def references(self):
        """Extract a JATS record into an Inspire HEP references record.

        Returns:
            List[dict]: an array of reference schema records, representing
                the references in the record
        """
        ref_nodes = self.root.xpath('./back/ref-list/ref')
        return list(
            itertools.chain.from_iterable(
                self.get_reference(node) for node in ref_nodes
            )
        )

    # Configuration for stripping markup from abstracts: keep sub/superscripts
    # and whole MathML trees, drop pub-ids and ISSNs.
    remove_tags_config_abstract = {
        'allowed_tags': ['sup', 'sub'],
        'allowed_trees': ['math'],
        'strip': 'self::pub-id|self::issn'
    }

    @property
    def abstract(self):
        # Returns ``None`` when the record has no abstract.
        abstract_nodes = self.root.xpath('./front//abstract[1]')
        if not abstract_nodes:
            return
        abstract = remove_tags(
            abstract_nodes[0], **self.remove_tags_config_abstract
        ).strip()
        return abstract

    @property
    def article_type(self):
        # ``None`` when the root has no ``@article-type`` attribute.
        article_type = self.root.xpath('./@article-type').extract_first()
        return article_type

    @property
    def artid(self):
        artid = self.root.xpath(
            './front/article-meta//elocation-id//text()'
        ).extract_first()
        return artid

    @property
    def authors(self):
        author_nodes = self.root.xpath(
            './front//contrib[@contrib-type="author"]'
        )
        authors = [self.get_author(author) for author in author_nodes]
        return authors

    @property
    def collaborations(self):
        collab_nodes = self.root.xpath(
            './front//collab |'
            './front//contrib[@contrib-type="collaboration"] |'
            './front//on-behalf-of'
        )
        # A set, to deduplicate collaborations mentioned in several places.
        collaborations = set(
            collab.xpath('string(.)').extract_first()
            for collab in collab_nodes
        )
        return collaborations

    @property
    def copyright(self):
        copyright = {
            'holder': self.copyright_holder,
            'material': self.material,
            'statement': self.copyright_statement,
            'year': self.copyright_year,
        }
        return copyright

    @property
    def copyright_holder(self):
        copyright_holder = self.root.xpath(
            './front//copyright-holder/text()'
        ).extract_first()
        return copyright_holder

    @property
    def copyright_statement(self):
        copyright_statement = self.root.xpath(
            './front//copyright-statement/text()'
        ).extract_first()
        return copyright_statement

    @property
    def copyright_year(self):
        copyright_year = self.root.xpath(
            './front//copyright-year/text()'
        ).extract_first()
        return maybe_int(copyright_year)

    @property
    def dois(self):
        doi_values = self.root.xpath(
            './front/article-meta//article-id[@pub-id-type="doi"]/text()'
        ).extract()
        dois = [
            {'doi': value, 'material': self.material} for value in doi_values
        ]

        # For errata & co., also pick up the DOI of the related original
        # publication; its material is left unset.
        if self.material != 'publication':
            doi_values = self.root.xpath(
                './front/article-meta//related-article[@ext-link-type="doi"]/@href'
            ).extract()
            related_dois = ({'doi': value} for value in doi_values)
            dois.extend(related_dois)

        return dois

    @property
    def document_type(self):
        if self.is_conference_paper:
            document_type = 'conference paper'
        else:
            document_type = 'article'

        return document_type

    @property
    def is_conference_paper(self):
        """Decide whether the article is a conference paper."""
        conference_node = self.root.xpath('./front//conference').extract_first()
        return bool(conference_node)

    @property
    def journal_title(self):
        journal_title = self.root.xpath(
            './front/journal-meta//abbrev-journal-title/text() |'
            './front/journal-meta//journal-title/text()'
        ).extract_first()
        return journal_title

    @property
    def journal_issue(self):
        journal_issue = self.root.xpath(
            './front/article-meta/issue/text()'
        ).extract_first()
        return journal_issue

    @property
    def journal_volume(self):
        journal_volume = self.root.xpath(
            './front/article-meta/volume/text()'
        ).extract_first()
        return journal_volume

    @property
    def keywords(self):
        keyword_groups = self.root.xpath('./front//kwd-group')
        keywords = itertools.chain.from_iterable(
            self.get_keywords(group) for group in keyword_groups
        )
        return keywords

    @property
    def license(self):
        license = {
            'license': self.license_statement,
            'material': self.material,
            'url': self.license_url,
        }
        return license

    @property
    def license_statement(self):
        license_statement = self.root.xpath(
            'string(./front/article-meta//license)'
        ).extract_first().strip()
        return license_statement

    @property
    def license_url(self):
        url_nodes = (
            './front/article-meta//license_ref/text() |'
            './front/article-meta//license/@href |'
            './front/article-meta//license//ext-link/@href'
        )
        license_url = self.root.xpath(url_nodes).extract_first()
        return license_url

    @property
    def material(self):
        # ``article_type`` is ``None`` when ``@article-type`` is absent;
        # treat that as a regular publication instead of crashing on
        # ``None.startswith``.
        article_type = self.article_type or ''
        if article_type.startswith('correc'):
            material = 'erratum'
        elif article_type in ('erratum', 'translation', 'addendum', 'reprint'):
            material = article_type
        else:
            material = 'publication'

        return material

    @property
    def number_of_pages(self):
        number_of_pages = maybe_int(
            self.root.xpath('./front/article-meta//page-count/@count').extract_first()
        )
        return number_of_pages

    @property
    def page_start(self):
        page_start = self.root.xpath(
            './front/article-meta/fpage/text()'
        ).extract_first()
        return page_start

    @property
    def page_end(self):
        page_end = self.root.xpath(
            './front/article-meta/lpage/text()'
        ).extract_first()
        return page_end

    @property
    def publication_date(self):
        date_nodes = self.root.xpath(
            './front//pub-date[@pub-type="ppub"] |'
            './front//pub-date[@pub-type="epub"] |'
            './front//pub-date[starts-with(@date-type,"pub")] |'
            './front//date[starts-with(@date-type,"pub")]'
        )
        # Earliest of all the candidate publication dates.
        publication_date = min(
            self.get_date(date_node) for date_node in date_nodes
        )
        return publication_date

    @property
    def publication_info(self):
        publication_info = {
            'artid': self.artid,
            'journal_title': self.journal_title,
            'journal_issue': self.journal_issue,
            'journal_volume': self.journal_volume,
            'material': self.material,
            'page_start': self.page_start,
            'page_end': self.page_end,
            'year': self.year,
        }
        return publication_info

    @property
    def publisher(self):
        publisher = self.root.xpath(
            './front//publisher-name/text()'
        ).extract_first()
        return publisher

    @property
    def subtitle(self):
        subtitle = self.root.xpath('string(./front//subtitle)').extract_first()
        return subtitle

    @property
    def title(self):
        title = self.root.xpath('string(./front//article-title)').extract_first()
        return title

    def get_affiliation(self, id_):
        """Get the affiliation with the specified id.

        Args:
            id_(str): the value of the ``id`` attribute of the affiliation.

        Returns:
            Optional[str]: the affiliation with that id or ``None`` if there
                is no match.
        """
        affiliation_node = self.root.xpath("//aff[@id=$id_]", id_=id_)
        if affiliation_node:
            affiliation = remove_tags(
                affiliation_node[0],
                strip="self::label | self::email"
            ).strip()
            return affiliation

    def get_emails_from_refs(self, id_):
        """Get the emails from the node with the specified id.

        Args:
            id_(str): the value of the ``id`` attribute of the node.

        Returns:
            List[str]: the emails from the node with that id or [] if none
                found.
        """
        email_nodes = self.root.xpath('//aff[@id=$id_]/email/text()', id_=id_)
        return email_nodes.extract()

    @property
    def year(self):
        # Prefer print publication dates: exclude dates explicitly marked as
        # electronic/online.  (Fixed: the second ``not(...)`` was missing its
        # closing parenthesis, producing an invalid XPath fragment.)
        # NOTE(review): xpath ``$`` variables are substituted as string
        # values, not expression fragments, so ``$not_online`` may be
        # evaluated as an always-true string predicate — verify against the
        # selector library in use.
        not_online = (
            'not(starts-with(@publication-format, "elec"))'
            ' and not(starts-with(@publication-format, "online"))'
        )
        date_nodes = self.root.xpath(
            './front//pub-date[@pub-type="ppub"] |'
            './front//pub-date[starts-with(@date-type,"pub") and $not_online] |'
            './front//date[starts-with(@date-type,"pub") and $not_online]',
            not_online=not_online
        )

        year = min(
            self.get_date(date_node) for date_node in date_nodes
        ).year

        return year

    def get_author_affiliations(self, author_node):
        """Extract an author's affiliations."""
        raw_referred_ids = author_node.xpath(
            './/xref[@ref-type="aff"]/@rid'
        ).extract()
        # Sometimes the rid might have more than one ID (e.g. rid="id0 id1")
        referred_ids = set()
        for raw_referred_id in raw_referred_ids:
            referred_ids.update(set(raw_referred_id.split(' ')))

        # Resolve each id exactly once (previously the lookup ran twice per
        # id: once for the truthiness test and once for the value).
        affiliations = [
            affiliation
            for affiliation in (self.get_affiliation(rid) for rid in referred_ids)
            if affiliation
        ]

        return affiliations

    def get_author_emails(self, author_node):
        """Extract an author's email addresses."""
        emails = author_node.xpath('.//email/text()').extract()
        referred_ids = author_node.xpath(
            './/xref[@ref-type="aff"]/@rid'
        ).extract()
        for referred_id in referred_ids:
            emails.extend(self.get_emails_from_refs(referred_id))

        return emails

    @staticmethod
    def get_author_name(author_node):
        """Extract an author's name."""
        surname = author_node.xpath('.//surname/text()').extract_first()
        if not surname:
            # The author name is unstructured; return the fallback directly so
            # that it is not clobbered by the structured-name join below.
            return author_node.xpath('string(./string-name)').extract_first()
        given_names = author_node.xpath('.//given-names/text()').extract_first()
        suffix = author_node.xpath('.//suffix/text()').extract_first()
        author_name = ', '.join(
            el for el in (surname, given_names, suffix) if el
        )

        return author_name

    @staticmethod
    def get_date(date_node):
        """Extract a date from a date node.

        Returns:
            PartialDate: the parsed date.
        """
        # Try, in order of preference: the ISO attribute, the structured
        # year/month/day children, then a free-form <string-date>.
        iso_string = date_node.xpath('./@iso-8601-date').extract_first()
        iso_date = PartialDate.loads(iso_string) if iso_string else None
        year = date_node.xpath('string(./year)').extract_first()
        month = date_node.xpath('string(./month)').extract_first()
        day = date_node.xpath('string(./day)').extract_first()
        date_from_parts = PartialDate.from_parts(year, month, day) if year else None
        string_date = date_node.xpath('string(./string-date)').extract_first()
        try:
            parsed_date = PartialDate.parse(string_date)
        except ValueError:
            parsed_date = None
        date = get_first([iso_date, date_from_parts, parsed_date])
        return date

    @staticmethod
    def get_keywords(group_node):
        """Extract keywords from a keyword group."""
        schema = None
        if 'pacs' in group_node.xpath('@kwd-group-type').extract_first(default='').lower():
            schema = 'PACS'

        keywords = (
            kwd.xpath('string(.)').extract_first()
            for kwd in group_node.xpath('.//kwd')
        )
        keyword_dicts = (
            {'keyword': keyword, 'schema': schema} for keyword in keywords
        )

        return keyword_dicts

    @staticmethod
    def get_root_node(jats_record):
        """Get a selector on the root ``article`` node of the record.

        This can be overridden in case some preprocessing needs to be done on
        the XML.

        Args:
            jats_record(Union[str, scrapy.selector.Selector]): the record in
                JATS format.

        Returns:
            scrapy.selector.Selector: a selector on the root ``<article>``
                node.
        """
        if isinstance(jats_record, six.string_types):
            root = get_node(jats_record)
        else:
            root = jats_record
        root.remove_namespaces()

        return root

    def get_author(self, author_node):
        """Extract one author.

        Args:
            author_node(scrapy.selector.Selector): a selector on a single
                author, e.g. a ``<contrib contrib-type="author">``.

        Returns:
            dict: the parsed author, conforming to the Inspire schema.
        """
        author_name = self.get_author_name(author_node)
        emails = self.get_author_emails(author_node)
        affiliations = self.get_author_affiliations(author_node)
        return self.builder.make_author(
            author_name, raw_affiliations=affiliations, emails=emails
        )

    @staticmethod
    def get_reference_authors(ref_node, role):
        """Extract authors of `role` from a reference node.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single
                reference.
            role(str): author role

        Returns:
            List[str]: list of names
        """
        return ref_node.xpath(
            './person-group[@person-group-type=$role]/string-name/text()',
            role=role
        ).extract()

    def get_reference(self, ref_node):
        """Extract one reference.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single
                reference, i.e. ``<ref>``.

        Returns:
            dict: the parsed reference, as generated by
                :class:`inspire_schemas.api.ReferenceBuilder`
        """
        for citation_node in ref_node.xpath('./mixed-citation'):
            builder = ReferenceBuilder()
            builder.add_raw_reference(
                ref_node.extract().strip(),
                source=self.builder.source,
                ref_format='JATS'
            )

            # (xpath relative to the citation node, handler) pairs: each
            # matched value is routed to the corresponding builder setter.
            fields = [
                (
                    (
                        'self::node()[@publication-type="journal" '
                        'or @publication-type="eprint"]/source/text()'
                    ),
                    builder.set_journal_title,
                ),
                (
                    'self::node()[@publication-type="book"]/source/text()',
                    builder.add_parent_title,
                ),
                ('./publisher-name/text()', builder.set_publisher),
                ('./volume/text()', builder.set_journal_volume),
                ('./issue/text()', builder.set_journal_issue),
                ('./year/text()', builder.set_year),
                ('./pub-id[@pub-id-type="arxiv"]/text()', builder.add_uid),
                ('./pub-id[@pub-id-type="doi"]/text()', builder.add_uid),
                (
                    'pub-id[@pub-id-type="other"]'
                    '[contains(preceding-sibling::text(),"Report No")]/text()',
                    builder.add_report_number
                ),
                ('./article-title/text()', builder.add_title),
                ('../label/text()', lambda x: builder.set_label(x.strip('[].')))
            ]

            for xpath, field_handler in fields:
                # (Fixed: a duplicate ``citation_node.xpath(xpath)`` call with
                # a discarded result was removed here.)
                value = citation_node.xpath(xpath).extract_first()
                if value:
                    field_handler(value)

            # Whatever is left after stripping the handled elements goes into
            # the reference's misc field.
            remainder = remove_tags(
                citation_node,
                strip='self::person-group'
                      '|self::pub-id'
                      '|self::article-title'
                      '|self::volume'
                      '|self::issue'
                      '|self::year'
                      '|self::label'
                      '|self::publisher-name'
                      '|self::source[../@publication-type!="proc"]'
                      '|self::object-id'
                      '|self::page-range'
                      '|self::issn'
            ).strip('"\';,. \t\n\r').replace('()', '')
            if remainder:
                builder.add_misc(remainder)

            for editor in self.get_reference_authors(citation_node, 'editor'):
                builder.add_author(editor, 'editor')
            for author in self.get_reference_authors(citation_node, 'author'):
                builder.add_author(author, 'author')

            page_range = citation_node.xpath('./page-range/text()').extract_first()
            if page_range:
                page_artid = split_page_artid(page_range)
                builder.set_page_artid(*page_artid)

            yield builder.obj

    def attach_fulltext_document(self, file_name, url):
        # The fulltext is attached as a hidden document on the record.
        self.builder.add_document(file_name, url, fulltext=True, hidden=True)
def hepcrawl_to_hep(crawler_record):
    """Convert a hepcrawl-formatted record into a HEP literature record.

    Copies each hepcrawl field group (authors, titles, abstracts, DOIs,
    licenses, collections, ...) into the corresponding
    :class:`LiteratureBuilder` call and returns the assembled record.

    Args:
        crawler_record(dict): dictionary representing the hepcrawl formatted
            record.

    Returns:
        dict: The hep formatted record.
    """
    def _filter_affiliation(affiliations):
        # Keep only the non-empty 'value' entries of the affiliation dicts.
        return [
            affilation.get('value')
            for affilation in affiliations
            if affilation.get('value')
        ]

    # NOTE(review): 'acquisition_source' and its 'source' key are accessed
    # with plain indexing — a record lacking them raises KeyError here.
    builder = LiteratureBuilder(
        source=crawler_record['acquisition_source']['source']
    )

    for author in crawler_record.get('authors', []):
        builder.add_author(builder.make_author(
            full_name=author['full_name'],
            raw_affiliations=_filter_affiliation(author['affiliations']),
        ))

    for title in crawler_record.get('titles', []):
        builder.add_title(
            title=title.get('title'),
            subtitle=title.get('subtitle'),
            source=title.get('source')
        )

    for abstract in crawler_record.get('abstracts', []):
        builder.add_abstract(
            abstract=abstract.get('value'),
            source=abstract.get('source')
        )

    for arxiv_eprint in crawler_record.get('arxiv_eprints', []):
        builder.add_arxiv_eprint(
            arxiv_id=arxiv_eprint.get('value'),
            arxiv_categories=arxiv_eprint.get('categories')
        )

    for doi in crawler_record.get('dois', []):
        builder.add_doi(
            doi=doi.get('value'),
            material=doi.get('material'),
        )

    for private_note in crawler_record.get('private_notes', []):
        builder.add_private_note(
            private_notes=private_note
        )

    for public_note in crawler_record.get('public_notes', []):
        builder.add_public_note(
            public_note=public_note.get('value'),
            source=public_note.get('source')
        )

    for license in crawler_record.get('license', []):
        builder.add_license(
            url=license.get('url'),
            license=license.get('license'),
            material=license.get('material'),
        )

    for collaboration in crawler_record.get('collaborations', []):
        builder.add_collaboration(
            collaboration=collaboration.get('value')
        )

    for imprint in crawler_record.get('imprints', []):
        builder.add_imprint_date(
            imprint_date=imprint.get('date')
        )

    for copyright in crawler_record.get('copyright', []):
        builder.add_copyright(
            holder=copyright.get('holder'),
            material=copyright.get('material'),
            statement=copyright.get('statement')
        )

    builder.add_preprint_date(
        preprint_date=crawler_record.get('preprint_date')
    )

    # NOTE(review): despite the .get() default above, the keys below are
    # indexed directly, so an empty acquisition_source still raises KeyError.
    acquisition_source = crawler_record.get('acquisition_source', {})
    builder.add_acquisition_source(
        method=acquisition_source['method'],
        date=acquisition_source['datetime'],
        source=acquisition_source['source'],
        submission_number=acquisition_source['submission_number'],
    )

    # Silently skip the page count when 'page_nr' is absent, empty, or its
    # first element is not parseable as an int.
    try:
        builder.add_number_of_pages(
            number_of_pages=int(crawler_record.get('page_nr', [])[0])
        )
    except (TypeError, ValueError, IndexError):
        pass

    publication_types = [
        'introductory',
        'lectures',
        'review',
        'manual',
    ]

    document_types = [
        'book',
        'note',
        'report',
        'proceedings',
        'thesis',
    ]

    added_doc_type = False

    # Map legacy 'collections' markers onto record flags, publication types
    # and document types.
    for collection in crawler_record.get('collections', []):
        collection = collection['primary'].strip().lower()
        if collection == 'arxiv':
            continue  # ignored
        elif collection == 'citeable':
            builder.set_citeable(True)
        elif collection == 'core':
            builder.set_core(True)
        elif collection == 'noncore':
            builder.set_core(False)
        elif collection == 'published':
            builder.set_refereed(True)
        elif collection == 'withdrawn':
            builder.set_withdrawn(True)
        elif collection in publication_types:
            builder.add_publication_type(collection)
        elif collection == 'bookchapter':
            added_doc_type = True
            builder.add_document_type('book chapter')
        elif collection == 'conferencepaper':
            added_doc_type = True
            builder.add_document_type('conference paper')
        elif collection in document_types:
            added_doc_type = True
            builder.add_document_type(collection)

    # Default document type when no collection implied one.
    if not added_doc_type:
        builder.add_document_type('article')

    # Only the first publication_info entry is used.
    _pub_info = crawler_record.get('publication_info', [{}])[0]
    builder.add_publication_info(
        year=_pub_info.get('year'),
        artid=_pub_info.get('artid'),
        page_end=_pub_info.get('page_end'),
        page_start=_pub_info.get('page_start'),
        journal_issue=_pub_info.get('journal_issue'),
        journal_title=_pub_info.get('journal_title'),
        journal_volume=_pub_info.get('journal_volume'),
        pubinfo_freetext=_pub_info.get('pubinfo_freetext'),
        material=_pub_info.get('pubinfo_material'),
    )

    for report_number in crawler_record.get('report_numbers', []):
        builder.add_report_number(
            report_number=report_number.get('value'),
            source=report_number.get('source')
        )

    for url in crawler_record.get('urls', []):
        builder.add_url(url=url.get('value'))

    # 'key' and 'url' are mandatory for each document; the rest are optional.
    for document in crawler_record.get('documents', []):
        builder.add_document(
            description=document.get('description'),
            fulltext=document.get('fulltext'),
            hidden=document.get('hidden'),
            key=document['key'],
            material=document.get('material'),
            original_url=document.get('original_url'),
            url=document['url'],
        )

    return builder.record
def crawler2hep(crawler_record):
    """Convert a crawler record into a HEP literature record (legacy variant).

    Similar to :func:`hepcrawl_to_hep`, but hard-codes the ``hepcrawl``
    source and acquisition method, handles fewer fields, additionally maps
    some collections onto special collections, and validates the assembled
    record before returning it.

    Args:
        crawler_record(dict): dictionary representing the crawler record.

    Returns:
        dict: the corresponding HEP formatted record.
    """

    def _filter_affiliation(affiliations):
        # Keep only the non-empty 'value' entries of the affiliation dicts.
        return [
            affilation.get('value')
            for affilation in affiliations
            if affilation.get('value')
        ]

    builder = LiteratureBuilder('hepcrawl')

    for author in crawler_record.get('authors', []):
        builder.add_author(
            builder.make_author(
                author['full_name'],
                affiliations=_filter_affiliation(author['affiliations']),
            ))

    for title in crawler_record.get('titles', []):
        builder.add_title(title=title.get('title'), source=title.get('source'))

    for abstract in crawler_record.get('abstracts', []):
        builder.add_abstract(abstract=abstract.get('value'),
                             source=abstract.get('source'))

    for arxiv_eprint in crawler_record.get('arxiv_eprints', []):
        builder.add_arxiv_eprint(
            arxiv_id=arxiv_eprint.get('value'),
            arxiv_categories=arxiv_eprint.get('categories'))

    for doi in crawler_record.get('dois', []):
        builder.add_doi(doi=doi.get('value'))

    for public_note in crawler_record.get('public_notes', []):
        builder.add_public_note(public_note=public_note.get('value'),
                                source=public_note.get('source'))

    for license in crawler_record.get('license', []):
        builder.add_license(url=license.get('url'),
                            license=license.get('license'))

    for collaboration in crawler_record.get('collaborations', []):
        builder.add_collaboration(collaboration=collaboration.get('value'))

    for imprint in crawler_record.get('imprints', []):
        builder.add_imprint_date(imprint_date=imprint.get('date'))

    for copyright in crawler_record.get('copyright', []):
        builder.add_copyright(holder=copyright.get('holder'),
                              material=copyright.get('material'),
                              statement=copyright.get('statement'))

    builder.add_preprint_date(
        preprint_date=crawler_record.get('preprint_date'))

    # Unlike hepcrawl_to_hep, all acquisition keys are read with .get(), so
    # missing keys become None instead of raising.
    acquisition_source = crawler_record.get('acquisition_source', {})
    builder.add_acquisition_source(
        method='hepcrawl',
        date=acquisition_source.get('date'),
        source=acquisition_source.get('source'),
        submission_number=acquisition_source.get('submission_number'))

    # Silently skip the page count when 'page_nr' is absent, empty, or its
    # first element is not parseable as an int.
    try:
        builder.add_number_of_pages(
            number_of_pages=int(crawler_record.get('page_nr', [])[0]))
    except (TypeError, ValueError, IndexError):
        pass

    publication_types = [
        'introductory',
        'lectures',
        'review',
    ]

    # Legacy experiment-note collections mapped to special collections.
    special_collections = [
        'cdf-internal-note',
        'cdf-note',
        'cds',
        'd0-internal-note',
        'd0-preliminary-note',
        'h1-internal-note',
        'h1-preliminary-note',
        'halhidden',
        'hephidden',
        'hermes-internal-note',
        'larsoft-internal-note',
        'larsoft-note',
        'zeus-internal-note',
        'zeus-preliminary-note',
    ]

    document_types = [
        'book',
        'note',
        'report',
        'proceedings',
        'thesis',
    ]

    added_doc_type = False

    # Map legacy 'collections' markers onto record flags, publication types,
    # special collections and document types.
    for collection in crawler_record.get('collections', []):
        collection = collection['primary'].strip().lower()
        if collection == 'arxiv':
            continue  # ignored
        elif collection == 'citeable':
            builder.set_citeable(True)
        elif collection == 'core':
            builder.set_core(True)
        elif collection == 'noncore':
            builder.set_core(False)
        elif collection == 'published':
            builder.set_refereed(True)
        elif collection == 'withdrawn':
            builder.set_withdrawn(True)
        elif collection in publication_types:
            builder.add_publication_type(collection)
        elif collection in special_collections:
            builder.add_special_collection(collection.upper())
        elif collection == 'bookchapter':
            added_doc_type = True
            builder.add_document_type('book chapter')
        elif collection == 'conferencepaper':
            added_doc_type = True
            builder.add_document_type('conference paper')
        elif collection in document_types:
            added_doc_type = True
            builder.add_document_type(collection)

    # Default document type when no collection implied one.
    if not added_doc_type:
        builder.add_document_type('article')

    # Only the first publication_info entry is used.
    _pub_info = crawler_record.get('publication_info', [{}])[0]
    builder.add_publication_info(
        year=_pub_info.get('year'),
        artid=_pub_info.get('artid'),
        page_end=_pub_info.get('page_end'),
        page_start=_pub_info.get('page_start'),
        journal_issue=_pub_info.get('journal_issue'),
        journal_title=_pub_info.get('journal_title'),
        journal_volume=_pub_info.get('journal_volume'),
        pubinfo_freetext=_pub_info.get('pubinfo_freetext'),
    )

    for report_number in crawler_record.get('report_numbers', []):
        builder.add_report_number(report_number=report_number.get('value'),
                                  source=report_number.get('source'))

    # Raises if the assembled record does not conform to the HEP schema.
    builder.validate_record()

    return builder.record
class ArxivParser(object):
    """Parser for the arXiv format.

    It can be used directly by invoking the :func:`ArxivParser.parse` method,
    or be subclassed to customize its behavior.

    Args:
        arxiv_record (Union[str, scrapy.selector.Selector]): the record in
            arXiv format to parse.
        source (Optional[str]): if provided, sets the ``source`` everywhere in
            the record. Otherwise, the source is extracted from the arXiv
            metadata.
    """
    # Shared LaTeX-to-text converter used by :meth:`latex_to_unicode`.
    _l2t = LatexNodes2Text(
        latex_context=get_arxiv_latex_context_db(),
        math_mode="verbatim",
        strict_latex_spaces="based-on-source",
        keep_comments=True,
        keep_braced_groups=True,
        keep_braced_groups_minlen=2,
    )

    def __init__(self, arxiv_record, source=None):
        self.root = self.get_root_node(arxiv_record)
        if not source:
            source = 'arXiv'
        self.builder = LiteratureBuilder(source)

    def parse(self):
        """Extract an arXiv record into an Inspire HEP record.

        Returns:
            dict: the same record in the Inspire Literature schema.
        """
        self.builder.add_abstract(abstract=self.abstract, source=self.source)
        self.builder.add_title(title=self.title, source=self.source)
        for license in self.licenses:
            self.builder.add_license(**license)
        for author in self.authors:
            self.builder.add_author(author)
        self.builder.add_number_of_pages(self.number_of_pages)
        self.builder.add_publication_info(**self.publication_info)
        for collab in self.collaborations:
            self.builder.add_collaboration(collab)
        for doi in self.dois:
            self.builder.add_doi(**doi)
        self.builder.add_preprint_date(self.preprint_date)
        if self.public_note:
            self.builder.add_public_note(self.public_note, self.source)
        for rep_number in self.report_numbers:
            self.builder.add_report_number(rep_number, self.source)
        self.builder.add_arxiv_eprint(self.arxiv_eprint, self.arxiv_categories)
        self.builder.add_private_note(self.private_note)
        self.builder.add_document_type(self.document_type)
        # Normalize the arXiv categories into Inspire categories.
        normalized_categories = [
            classify_field(arxiv_cat)
            for arxiv_cat in self.arxiv_categories
        ]
        self.builder.add_inspire_categories(dedupe_list(normalized_categories), 'arxiv')

        return self.builder.record

    def _get_authors_and_collaborations(self, node):
        """Parse authors, affiliations and collaborations from the record node.

        Heuristics are used to detect collaborations. In case those are not
        reliable, a warning is returned for manual checking.

        Args:
            node (Selector): a selector on a record
        Returns:
            tuple: a tuple of (authors, collaborations, warning)
        """
        author_selectors = node.xpath('.//authors//author')

        # take 'for the' out of the general phrases and dont use it in
        # affiliations
        collab_phrases = [
            'consortium',
            ' collab ',
            'collaboration',
            ' team',
            'group',
            ' on behalf of ',
            ' representing ',
        ]
        inst_phrases = ['institute', 'university', 'department', 'center']

        authors = []
        collaborations = []
        warning_tags = []
        some_affiliation_contains_collaboration = False

        # Two parallel generators over the same author list; the second is
        # advanced by one element so that each iteration also sees the NEXT
        # author (lookahead, used for the colon-in-authors warning below).
        authors_and_affiliations = (
            self._get_author_names_and_affiliations(author)
            for author in author_selectors)
        next_author_and_affiliations = (
            self._get_author_names_and_affiliations(author)
            for author in author_selectors)
        next(next_author_and_affiliations)

        for (forenames, keyname, affiliations), (next_forenames, next_keyname, _) in six.moves.zip_longest(
                authors_and_affiliations, next_author_and_affiliations,
                fillvalue=('end of author-list', '', None)):

            name_string = " %s %s " % (forenames, keyname)

            # collaborations in affiliation field? Cautious with 'for the' in
            # Inst names
            affiliations_with_collaborations = []
            affiliations_without_collaborations = []
            for aff in affiliations:
                # An affiliation "contains a collaboration" when it matches a
                # collaboration phrase but no institution phrase.
                affiliation_contains_collaboration = any(
                    phrase in aff.lower() for phrase in collab_phrases
                ) and not any(
                    phrase in aff.lower() for phrase in inst_phrases
                )
                if affiliation_contains_collaboration:
                    affiliations_with_collaborations.append(aff)
                    some_affiliation_contains_collaboration = True
                else:
                    affiliations_without_collaborations.append(aff)
            for aff in affiliations_with_collaborations:
                coll, author_name = coll_cleanforthe(aff)
                if coll and coll not in collaborations:
                    collaborations.append(coll)

            # Check if name is a collaboration, else append to authors
            collaboration_in_name = ' for the ' in name_string.lower() or any(
                phrase in name_string.lower() for phrase in collab_phrases
            )
            if collaboration_in_name:
                coll, author_name = coll_cleanforthe(name_string)
                if author_name:
                    surname, given_names = split_fullname(author_name)
                    authors.append({
                        'full_name': surname + ', ' + given_names,
                        'surname': surname,
                        'given_names': given_names,
                        'affiliations': [],
                    })
                if coll and coll not in collaborations:
                    collaborations.append(coll)
            elif name_string.strip() == ':':
                # DANGERZONE : this might not be correct - add a warning for the cataloger
                warning_tags.append(' %s %s ' % (next_forenames, next_keyname))
                if not some_affiliation_contains_collaboration:
                    # everything up to now seems to be collaboration info
                    for author_info in authors:
                        name_string = " %s %s " % \
                            (author_info['given_names'], author_info['surname'])
                        coll, author_name = coll_cleanforthe(name_string)
                        if coll and coll not in collaborations:
                            collaborations.append(coll)
                    authors = []
            else:
                authors.append({
                    'full_name': keyname + ', ' + forenames,
                    'surname': keyname,
                    'given_names': forenames,
                    'affiliations': affiliations_without_collaborations
                })

        if warning_tags:
            warning = 'WARNING: Colon in authors before %s: Check author list for collaboration names!' % ', '.join(
                warning_tags)
        else:
            warning = ''

        return authors, collaborations, warning

    @staticmethod
    def _get_author_names_and_affiliations(author_node):
        # Returns a (forenames, keyname, affiliations) triple for one
        # ``<author>`` node; forenames/keyname are space-joined text nodes.
        forenames = u' '.join(
            author_node.xpath('.//forenames//text()').extract())
        keyname = u' '.join(author_node.xpath('.//keyname//text()').extract())
        affiliations = author_node.xpath('.//affiliation//text()').extract()

        return forenames, keyname, affiliations

    @property
    def preprint_date(self):
        preprint_date = self.root.xpath('.//created/text()').extract_first()
        return preprint_date

    @property
    def abstract(self):
        abstract = self.root.xpath('.//abstract/text()').extract_first()
        long_text_fixed = self.fix_long_text(abstract)
        return self.latex_to_unicode(long_text_fixed)

    @property
    def authors(self):
        authors, _, _ = self.authors_and_collaborations
        parsed_authors = [
            self.builder.make_author(
                full_name=auth["full_name"],
                raw_affiliations=auth["affiliations"],
            )
            for auth in authors
        ]
        return parsed_authors

    @property
    def collaborations(self):
        _, collaborations, _ = self.authors_and_collaborations
        return collaborations

    @property
    def dois(self):
        # A single <doi> element may hold several DOIs; split them apart.
        doi_values = self.root.xpath('.//doi/text()').extract()
        doi_values_splitted = chain.from_iterable(
            [re.split(RE_DOIS, doi) for doi in doi_values])
        dois = [{
            'doi': value,
            'material': 'publication'
        } for value in doi_values_splitted]

        return dois

    @property
    def licenses(self):
        licenses = self.root.xpath('.//license/text()').extract()
        return [{
            'url': license,
            'material': self.material
        } for license in licenses]

    @property
    def material(self):
        # arXiv records always describe preprints.
        return 'preprint'

    @property
    def number_of_pages(self):
        # The page count is parsed out of the free-text comments field.
        comments = '; '.join(self.root.xpath('.//comments/text()').extract())

        found_pages = RE_PAGES.search(comments)
        if found_pages:
            pages = found_pages.group(1)
            return maybe_int(pages)

        return None

    @property
    def publication_info(self):
        publication_info = {
            'material': 'publication',
            'pubinfo_freetext': self.pubinfo_freetext,
        }

        return publication_info

    @property
    def pubinfo_freetext(self):
        return self.root.xpath('.//journal-ref/text()').extract_first()

    @property
    def title(self):
        long_text_fixed = self.fix_long_text(
            self.root.xpath('.//title/text()').extract_first())
        return self.latex_to_unicode(long_text_fixed)

    @staticmethod
    def fix_long_text(text):
        # Collapse all runs of whitespace (including newlines) to one space.
        return re.sub(r'\s+', ' ', text).strip()

    @staticmethod
    def get_root_node(arxiv_record):
        """Get a selector on the root ``article`` node of the record.

        This can be overridden in case some preprocessing needs to be done on
        the XML.

        Args:
            arxiv_record(Union[str, scrapy.selector.Selector]): the record in
                arXiv format.

        Returns:
            scrapy.selector.Selector: a selector on the root ``<article>``
                node.
        """
        if isinstance(arxiv_record, six.string_types):
            root = get_node(arxiv_record)
        else:
            root = arxiv_record
        root.remove_namespaces()

        return root

    @property
    def public_note(self):
        comments = '; '.join(self.root.xpath('.//comments/text()').extract())
        return self.latex_to_unicode(comments)

    @property
    def private_note(self):
        # The warning produced by the collaboration-detection heuristics.
        _, _, warning = self.authors_and_collaborations
        return warning

    @property
    def report_numbers(self):
        # A single <report-no> element may hold a comma-separated list.
        report_numbers = self.root.xpath('.//report-no/text()').extract()
        rns = []
        for rn in report_numbers:
            rns.extend(rn.split(', '))

        return rns

    @property
    def arxiv_eprint(self):
        return self.root.xpath('.//id/text()').extract_first()

    @property
    def arxiv_categories(self):
        categories = self.root.xpath(
            './/categories/text()').extract_first(default='[]')
        categories = categories.split()
        # Map old-style category names to their current equivalents and
        # deduplicate while keeping order.
        categories_without_old = [
            normalize_arxiv_category(arxiv_cat)
            for arxiv_cat in categories
        ]

        return dedupe_list(categories_without_old)

    @property
    def document_type(self):
        # Infer the document type from the free-text comments field.
        comments = '; '.join(self.root.xpath('.//comments/text()').extract())

        doctype = 'article'
        if RE_THESIS.search(comments):
            doctype = 'thesis'
        elif RE_CONFERENCE.search(comments):
            doctype = 'conference paper'

        return doctype

    @property
    def source(self):
        return 'arXiv'

    @property
    def authors_and_collaborations(self):
        # Lazily computed and cached: the parse is expensive and used by
        # several properties (authors, collaborations, private_note).
        if not hasattr(self, '_authors_and_collaborations'):
            self._authors_and_collaborations = \
                self._get_authors_and_collaborations(self.root)
        return self._authors_and_collaborations

    @classmethod
    def latex_to_unicode(cls, latex_string):
        # Best-effort conversion: on any converter failure, return the input
        # unchanged rather than losing the record.
        # NOTE(review): both arguments of .replace() below look like identical
        # plain spaces — possibly a non-breaking space lost in transit; verify
        # against the original source.
        try:
            return cls._l2t.latex_to_text(latex_string).replace(" ", " ")
        except Exception as e:
            return latex_string
class JatsParser(object):
    """Parser for the JATS format.

    It can be used directly by invoking the :func:`JatsParser.parse` method, or
    be subclassed to customize its behavior.

    Args:
        jats_record (Union[str, scrapy.selector.Selector]): the record in JATS
            format to parse.
        source (Optional[str]): if provided, sets the ``source`` everywhere in
            the record. Otherwise, the source is extracted from the JATS
            metadata.
    """
    def __init__(self, jats_record, source=None):
        self.root = self.get_root_node(jats_record)
        if not source:
            source = self.publisher
        self.builder = LiteratureBuilder(source)

    def parse(self):
        """Extract a JATS record into an Inspire HEP record.

        Returns:
            dict: the same record in the Inspire Literature schema.
        """
        self.builder.add_abstract(self.abstract)
        self.builder.add_title(self.title, subtitle=self.subtitle)
        self.builder.add_copyright(**self.copyright)
        self.builder.add_document_type(self.document_type)
        self.builder.add_license(**self.license)
        for author in self.authors:
            self.builder.add_author(author)
        self.builder.add_number_of_pages(self.number_of_pages)
        self.builder.add_publication_info(**self.publication_info)
        for collab in self.collaborations:
            self.builder.add_collaboration(collab)
        for doi in self.dois:
            self.builder.add_doi(**doi)
        for keyword in self.keywords:
            self.builder.add_keyword(**keyword)
        self.builder.add_imprint_date(self.publication_date.dumps())
        for reference in self.references:
            self.builder.add_reference(reference)

        return self.builder.record

    @property
    def references(self):
        """Extract a JATS record into an Inspire HEP references record.

        Returns:
            List[dict]: an array of reference schema records, representing
                the references in the record
        """
        ref_nodes = self.root.xpath('./back/ref-list/ref')
        # ``get_reference`` is a generator (one <ref> may hold several
        # <mixed-citation> entries), hence the flattening.
        return list(
            itertools.chain.from_iterable(
                self.get_reference(node) for node in ref_nodes
            )
        )

    # Configuration for ``remove_tags`` when cleaning abstracts: keep only
    # sub/sup markup and MathML trees, drop ids/issns entirely.
    remove_tags_config_abstract = {
        'allowed_tags': ['sup', 'sub'],
        'allowed_trees': ['math'],
        'strip': 'self::pub-id|self::issn'
    }

    @property
    def abstract(self):
        """The cleaned abstract text, or ``None`` if the record has none."""
        abstract_nodes = self.root.xpath('./front//abstract[1]')
        if not abstract_nodes:
            return
        abstract = remove_tags(abstract_nodes[0], **self.remove_tags_config_abstract).strip()
        return abstract

    @property
    def article_type(self):
        """The ``article-type`` attribute of the root node, if present."""
        article_type = self.root.xpath('./@article-type').extract_first()
        return article_type

    @property
    def artid(self):
        """The electronic location id (article id) of the record."""
        artid = self.root.xpath('./front/article-meta//elocation-id//text()').extract_first()
        return artid

    @property
    def authors(self):
        """All parsed authors of the record."""
        author_nodes = self.root.xpath('./front//contrib[@contrib-type="author"]')
        authors = [self.get_author(author) for author in author_nodes]
        return authors

    @property
    def collaborations(self):
        """The set of collaboration names appearing in the front matter."""
        collab_nodes = self.root.xpath(
            './front//collab |'
            './front//contrib[@contrib-type="collaboration"] |'
            './front//on-behalf-of'
        )
        collaborations = set(
            collab.xpath('string(.)').extract_first()
            for collab in collab_nodes
        )
        return collaborations

    @property
    def copyright(self):
        """Keyword arguments for ``LiteratureBuilder.add_copyright``."""
        copyright = {
            'holder': self.copyright_holder,
            'material': self.material,
            'statement': self.copyright_statement,
            'year': self.copyright_year,
        }
        return copyright

    @property
    def copyright_holder(self):
        copyright_holder = self.root.xpath('./front//copyright-holder/text()').extract_first()
        return copyright_holder

    @property
    def copyright_statement(self):
        copyright_statement = self.root.xpath('./front//copyright-statement/text()').extract_first()
        return copyright_statement

    @property
    def copyright_year(self):
        copyright_year = self.root.xpath('./front//copyright-year/text()').extract_first()
        return maybe_int(copyright_year)

    @property
    def dois(self):
        """The DOIs of the record, including related-article DOIs when this
        record is not the publication itself (e.g. an erratum)."""
        doi_values = self.root.xpath('./front/article-meta//article-id[@pub-id-type="doi"]/text()').extract()
        dois = [
            {'doi': value, 'material': self.material} for value in doi_values
        ]

        if self.material != 'publication':
            doi_values = self.root.xpath(
                './front/article-meta//related-article[@ext-link-type="doi"]/@href'
            ).extract()
            related_dois = ({'doi': value} for value in doi_values)
            dois.extend(related_dois)

        return dois

    @property
    def document_type(self):
        if self.is_conference_paper:
            document_type = 'conference paper'
        else:
            document_type = 'article'

        return document_type

    @property
    def is_conference_paper(self):
        """Decide whether the article is a conference paper."""
        conference_node = self.root.xpath('./front//conference').extract_first()
        return bool(conference_node)

    @property
    def journal_title(self):
        journal_title = self.root.xpath(
            './front/journal-meta//abbrev-journal-title/text() |'
            './front/journal-meta//journal-title/text()'
        ).extract_first()
        return journal_title

    @property
    def journal_issue(self):
        journal_issue = self.root.xpath('./front/article-meta/issue/text()').extract_first()
        return journal_issue

    @property
    def journal_volume(self):
        journal_volume = self.root.xpath('./front/article-meta/volume/text()').extract_first()
        return journal_volume

    @property
    def keywords(self):
        """A generator of keyword dicts from all keyword groups."""
        keyword_groups = self.root.xpath('./front//kwd-group')
        keywords = itertools.chain.from_iterable(self.get_keywords(group) for group in keyword_groups)
        return keywords

    @property
    def license(self):
        """Keyword arguments for ``LiteratureBuilder.add_license``."""
        license = {
            'license': self.license_statement,
            'material': self.material,
            'url': self.license_url,
        }
        return license

    @property
    def license_statement(self):
        license_statement = self.root.xpath('string(./front/article-meta//license)').extract_first().strip()
        return license_statement

    @property
    def license_url(self):
        url_nodes = (
            './front/article-meta//license_ref/text() |'
            './front/article-meta//license/@href |'
            './front/article-meta//license//ext-link/@href'
        )
        license_url = self.root.xpath(url_nodes).extract_first()
        return license_url

    @property
    def material(self):
        """Classify the record as publication, erratum, or other material."""
        # Fixed: ``article_type`` may be absent (None); default to an empty
        # string so the prefix test below cannot raise AttributeError.
        article_type = self.article_type or ''
        if article_type.startswith('correc'):
            material = 'erratum'
        elif article_type in ('erratum', 'translation', 'addendum', 'reprint'):
            material = article_type
        else:
            material = 'publication'
        return material

    @property
    def number_of_pages(self):
        number_of_pages = maybe_int(self.root.xpath('./front/article-meta//page-count/@count').extract_first())
        return number_of_pages

    @property
    def page_start(self):
        page_start = self.root.xpath('./front/article-meta/fpage/text()').extract_first()
        return page_start

    @property
    def page_end(self):
        page_end = self.root.xpath('./front/article-meta/lpage/text()').extract_first()
        return page_end

    @property
    def publication_date(self):
        """The earliest publication date found in the front matter."""
        date_nodes = self.root.xpath(
            './front//pub-date[@pub-type="ppub"] |'
            './front//pub-date[@pub-type="epub"] |'
            './front//pub-date[starts-with(@date-type,"pub")] |'
            './front//date[starts-with(@date-type,"pub")]'
        )
        publication_date = min(
            self.get_date(date_node) for date_node in date_nodes
        )
        return publication_date

    @property
    def publication_info(self):
        """Keyword arguments for ``LiteratureBuilder.add_publication_info``."""
        publication_info = {
            'artid': self.artid,
            'journal_title': self.journal_title,
            'journal_issue': self.journal_issue,
            'journal_volume': self.journal_volume,
            'material': self.material,
            'page_start': self.page_start,
            'page_end': self.page_end,
            'year': self.year,
        }
        return publication_info

    @property
    def publisher(self):
        publisher = self.root.xpath('./front//publisher-name/text()').extract_first()
        return publisher

    @property
    def subtitle(self):
        subtitle = self.root.xpath('string(./front//subtitle)').extract_first()
        return subtitle

    @property
    def title(self):
        title = self.root.xpath('string(./front//article-title)').extract_first()
        return title

    def get_affiliation(self, id_):
        """Get the affiliation with the specified id.

        Args:
            id_(str): the value of the ``id`` attribute of the affiliation.

        Returns:
            Optional[str]: the affiliation with that id or ``None`` if there
                is no match.
        """
        # Fixed: unconditional ``[0]`` indexing raised IndexError when no
        # affiliation matched, contradicting the documented Optional return.
        affiliation_nodes = self.root.xpath('//aff[@id=$id_]', id_=id_)
        if not affiliation_nodes:
            return None
        affiliation = remove_tags(
            affiliation_nodes[0],
            strip='self::label | self::email'
        ).strip()
        return affiliation

    def get_emails_from_refs(self, id_):
        """Get the emails from the node with the specified id.

        Args:
            id_(str): the value of the ``id`` attribute of the node.

        Returns:
            List[str]: the emails from the node with that id or [] if none
                found.
        """
        email_nodes = self.root.xpath('//aff[@id=$id_]/email/text()', id_=id_)
        return email_nodes.extract()

    @property
    def year(self):
        """The year of the earliest non-electronic publication date."""
        not_online = (
            'not(starts-with(@publication-format, "elec"))'
            # Fixed: the closing parenthesis of this second ``not(...)`` was
            # missing, leaving the XPath fragment unbalanced.
            ' and not(starts-with(@publication-format, "online"))'
        )
        # Fixed: ``not_online`` was passed as an XPath *variable*
        # ($not_online).  XPath variables substitute values, not
        # subexpressions, and a non-empty string is always truthy -- so the
        # online-date filter was silently a no-op.  Inline the predicate text
        # instead so it is actually evaluated.
        date_nodes = self.root.xpath(
            (
                './front//pub-date[@pub-type="ppub"] |'
                './front//pub-date[starts-with(@date-type,"pub") and {0}] |'
                './front//date[starts-with(@date-type,"pub") and {0}]'
            ).format(not_online)
        )

        year = min(
            self.get_date(date_node) for date_node in date_nodes
        ).year
        return year

    def get_author_affiliations(self, author_node):
        """Extract an author's affiliations."""
        raw_referred_ids = author_node.xpath('.//xref[@ref-type="aff"]/@rid').extract()
        # Sometimes the rid might have more than one ID (e.g. rid="id0 id1")
        referred_ids = set()
        for raw_referred_id in raw_referred_ids:
            referred_ids.update(set(raw_referred_id.split(' ')))

        # Fixed: ``get_affiliation`` was called twice per id (once for the
        # filter, once for the value); resolve each id exactly once.
        affiliations = []
        for rid in referred_ids:
            affiliation = self.get_affiliation(rid)
            if affiliation:
                affiliations.append(affiliation)
        return affiliations

    def get_author_emails(self, author_node):
        """Extract an author's email addresses."""
        emails = author_node.xpath('.//email/text()').extract()
        referred_ids = author_node.xpath('.//xref[@ref-type="aff"]/@rid').extract()
        for referred_id in referred_ids:
            emails.extend(self.get_emails_from_refs(referred_id))
        return emails

    @staticmethod
    def get_author_name(author_node):
        """Extract an author's name."""
        surname = author_node.xpath('.//surname/text()').extract_first()
        if not surname:
            # the author name is unstructured; take it verbatim.
            # Fixed: this value used to be unconditionally overwritten by the
            # join below, discarding the unstructured name.
            return author_node.xpath('string(./string-name)').extract_first()
        given_names = author_node.xpath('.//given-names/text()').extract_first()
        suffix = author_node.xpath('.//suffix/text()').extract_first()
        author_name = ', '.join(el for el in (surname, given_names, suffix) if el)

        return author_name

    @staticmethod
    def get_date(date_node):
        """Extract a date from a date node.

        Returns:
            PartialDate: the parsed date.
        """
        # Try the machine-readable attribute first, then structured parts,
        # then the free-text date; ``get_first`` picks the first success.
        iso_string = date_node.xpath('./@iso-8601-date').extract_first()
        iso_date = PartialDate.loads(iso_string) if iso_string else None

        year = date_node.xpath('string(./year)').extract_first()
        month = date_node.xpath('string(./month)').extract_first()
        day = date_node.xpath('string(./day)').extract_first()
        date_from_parts = PartialDate.from_parts(year, month, day) if year else None

        string_date = date_node.xpath('string(./string-date)').extract_first()
        try:
            parsed_date = PartialDate.parse(string_date)
        except ValueError:
            parsed_date = None

        date = get_first([iso_date, date_from_parts, parsed_date])
        return date

    @staticmethod
    def get_keywords(group_node):
        """Extract keywords from a keyword group."""
        schema = None
        if 'pacs' in group_node.xpath('@kwd-group-type').extract_first(default='').lower():
            schema = 'PACS'

        keywords = (kwd.xpath('string(.)').extract_first() for kwd in group_node.xpath('.//kwd'))
        keyword_dicts = ({'keyword': keyword, 'schema': schema} for keyword in keywords)
        return keyword_dicts

    @staticmethod
    def get_root_node(jats_record):
        """Get a selector on the root ``article`` node of the record.

        This can be overridden in case some preprocessing needs to be done on
        the XML.

        Args:
            jats_record(Union[str, scrapy.selector.Selector]): the record in
                JATS format.

        Returns:
            scrapy.selector.Selector: a selector on the root ``<article>``
                node.
        """
        if isinstance(jats_record, six.string_types):
            root = get_node(jats_record)
        else:
            root = jats_record
        root.remove_namespaces()

        return root

    def get_author(self, author_node):
        """Extract one author.

        Args:
            author_node(scrapy.selector.Selector): a selector on a single
                author, e.g. a ``<contrib contrib-type="author">``.

        Returns:
            dict: the parsed author, conforming to the Inspire schema.
        """
        author_name = self.get_author_name(author_node)
        emails = self.get_author_emails(author_node)
        affiliations = self.get_author_affiliations(author_node)
        return self.builder.make_author(author_name, raw_affiliations=affiliations, emails=emails)

    @staticmethod
    def get_reference_authors(ref_node, role):
        """Extract authors of `role` from a reference node.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single
                reference.
            role(str): author role

        Returns:
            List[str]: list of names
        """
        return ref_node.xpath(
            './person-group[@person-group-type=$role]/string-name/text()',
            role=role
        ).extract()

    def get_reference(self, ref_node):
        """Extract one reference.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single
                reference, i.e. ``<ref>``.

        Yields:
            dict: the parsed reference, as generated by
                :class:`inspire_schemas.api.ReferenceBuilder`
        """
        for citation_node in ref_node.xpath('./mixed-citation'):
            builder = ReferenceBuilder()
            # NOTE(review): the raw reference is taken from the whole <ref>
            # (not the single <mixed-citation>), so multi-citation refs share
            # one raw string -- presumably intentional; confirm.
            builder.add_raw_reference(
                ref_node.extract().strip(),
                source=self.builder.source,
                ref_format='JATS'
            )

            # (xpath relative to the citation node, handler) pairs; each
            # matched value is fed to the corresponding builder method.
            fields = [
                (
                    (
                        'self::node()[@publication-type="journal" '
                        'or @publication-type="eprint"]/source/text()'
                    ),
                    builder.set_journal_title,
                ),
                (
                    'self::node()[@publication-type="book"]/source/text()',
                    builder.add_parent_title,
                ),
                ('./publisher-name/text()', builder.set_publisher),
                ('./volume/text()', builder.set_journal_volume),
                ('./issue/text()', builder.set_journal_issue),
                ('./year/text()', builder.set_year),
                ('./pub-id[@pub-id-type="arxiv"]/text()', builder.add_uid),
                ('./pub-id[@pub-id-type="doi"]/text()', builder.add_uid),
                (
                    'pub-id[@pub-id-type="other"]'
                    '[contains(preceding-sibling::text(),"Report No")]/text()',
                    builder.add_report_number
                ),
                ('./article-title/text()', builder.add_title),
                ('../label/text()', lambda x: builder.set_label(x.strip('[].')))
            ]

            for xpath, field_handler in fields:
                value = citation_node.xpath(xpath).extract_first()
                # Fixed: removed a stray duplicate ``citation_node.xpath(xpath)``
                # statement whose result was discarded.
                if value:
                    field_handler(value)

            # Everything not consumed above is kept as 'misc' text.
            remainder = remove_tags(
                citation_node,
                strip='self::person-group'
                      '|self::pub-id'
                      '|self::article-title'
                      '|self::volume'
                      '|self::issue'
                      '|self::year'
                      '|self::label'
                      '|self::publisher-name'
                      '|self::source[../@publication-type!="proc"]'
                      '|self::object-id'
                      '|self::page-range'
                      '|self::issn'
            ).strip('"\';,. \t\n\r').replace('()', '')
            if remainder:
                builder.add_misc(remainder)

            for editor in self.get_reference_authors(citation_node, 'editor'):
                builder.add_author(editor, 'editor')
            for author in self.get_reference_authors(citation_node, 'author'):
                builder.add_author(author, 'author')

            page_range = citation_node.xpath('./page-range/text()').extract_first()
            if page_range:
                page_artid = split_page_artid(page_range)
                builder.set_page_artid(*page_artid)

            yield builder.obj

    def attach_fulltext_document(self, file_name, url):
        """Attach a hidden fulltext document to the record being built."""
        self.builder.add_document(file_name, url, fulltext=True, hidden=True)