def map_refextract_to_schema(extracted_references, source=None):
    """Convert refextract output to the schema using the builder.

    Args:
        extracted_references: iterable of mappings; every known field is
            read from each one with ``.get(field)``.
        source: forwarded as the ``source`` keyword of
            ``add_raw_reference`` for every ``raw_ref`` value.

    Returns:
        list: the ``obj`` of one ``ReferenceBuilder`` per input reference,
        in input order.
    """

    def _convert(extracted):
        """Feed one extracted reference into a fresh builder."""
        builder = ReferenceBuilder()

        def _add_raw_reference(raw_ref):
            builder.add_raw_reference(raw_ref, source=source)

        # Handlers are applied field by field, in this order.
        handlers = (
            ('author', builder.add_refextract_authors_str),
            ('collaboration', builder.add_collaboration),
            ('doi', builder.add_uid),
            ('hdl', builder.add_uid),
            ('isbn', builder.add_uid),
            ('journal_reference', builder.set_pubnote),
            ('linemarker', builder.set_label),
            ('misc', builder.add_misc),
            ('publisher', builder.set_publisher),
            ('raw_ref', _add_raw_reference),
            ('reportnumber', builder.add_report_number),
            ('texkey', builder.set_texkey),
            ('title', builder.add_title),
            ('url', builder.add_url),
            ('year', builder.set_year),
        )
        for name, handler in handlers:
            for item in force_list(extracted.get(name)):
                # Falsy values are never handed to the builder.
                if not item:
                    continue
                handler(item)

        # Only touch the URL list when the builder actually produced one.
        if get_value(builder.obj, 'reference.urls'):
            reference = builder.obj['reference']
            reference['urls'] = dedupe_list_of_dicts(reference['urls'])

        return builder.obj

    return [_convert(extracted) for extracted in extracted_references]
def map_refextract_to_schema(extracted_references, source=None):
    """Convert refextract output to the schema using the builder.

    Args:
        extracted_references: iterable of mappings; every known field is
            read from each one with ``.get(field)``.
        source: forwarded as the ``source`` keyword of
            ``add_raw_reference`` for every ``raw_ref`` value.

    Returns:
        list: the ``obj`` of one ``ReferenceBuilder`` per input reference,
        in input order.
    """
    converted = []

    for extracted in extracted_references:
        builder = ReferenceBuilder()

        def _add_raw_reference(raw_ref, _builder=builder):
            _builder.add_raw_reference(raw_ref, source=source)

        # (refextract field, builder method) pairs, applied in this order.
        dispatch = (
            ('author', builder.add_refextract_authors_str),
            ('collaboration', builder.add_collaboration),
            ('doi', builder.add_uid),
            ('hdl', builder.add_uid),
            ('isbn', builder.add_uid),
            ('journal_reference', builder.set_pubnote),
            ('linemarker', builder.set_label),
            ('misc', builder.add_misc),
            ('publisher', builder.set_publisher),
            ('raw_ref', _add_raw_reference),
            ('reportnumber', builder.add_report_number),
            ('texkey', builder.set_texkey),
            ('title', builder.add_title),
            ('url', builder.add_url),
            ('year', builder.set_year),
        )

        for name, setter in dispatch:
            values = force_list(extracted.get(name))
            for item in values:
                # Falsy values are never handed to the builder.
                if not item:
                    continue
                setter(item)

        converted.append(builder.obj)

    return converted
def references(self, key, value):
    """Populate the ``references`` key.

    Args:
        key: not used by this function.
        value: mapping read with ``.get(code)``; every entry is passed
            through ``force_list`` before being handed to the builder.

    Returns:
        the ``obj`` of the ``ReferenceBuilder`` that was populated.
    """
    builder = ReferenceBuilder()

    def _link_record(raw_recid):
        # ``0`` carries a record id: turn it into a literature record ref.
        record = get_record_ref(maybe_int(raw_recid), 'literature')
        builder.set_record(record)

    # (subfield code, builder method) pairs, applied in this order.
    handlers = (
        ('0', _link_record),
        ('a', builder.add_uid),
        ('b', builder.add_uid),
        ('c', builder.add_collaboration),
        ('e', partial(builder.add_author, role='ed.')),
        ('h', builder.add_refextract_authors_str),
        ('i', builder.add_uid),
        ('k', builder.set_texkey),
        ('m', builder.add_misc),
        ('o', builder.set_label),
        ('p', builder.set_publisher),
        ('q', builder.add_parent_title),
        ('r', builder.add_report_number),
        ('s', builder.set_pubnote),
        ('t', builder.add_title),
        ('u', builder.add_url),
        ('x', builder.add_raw_reference),
        ('y', builder.set_year),
    )
    for code, handler in handlers:
        for item in force_list(value.get(code)):
            if not item:
                continue
            handler(item)

    # The curator flag is a case-insensitive ``CURATOR`` among the ``9``
    # values.
    nine_values = [flag.upper() for flag in force_list(value.get('9'))]
    if 'CURATOR' in nine_values:
        # Curation additionally requires ``z`` to be exactly '1'.
        if value.get('z') == '1':
            builder.curate()
        builder.obj['legacy_curated'] = True

    return builder.obj
def _get_reference(value):
    """Build a reference from a mapping of subfield codes.

    Args:
        value: mapping read with ``.get(code)``; every entry is passed
            through ``force_list`` before being handed to the builder.

    Returns:
        the ``obj`` of the ``ReferenceBuilder`` that was populated.
    """
    builder = ReferenceBuilder()

    def _link_record(raw_recid):
        # ``0`` carries a record id: turn it into a literature record ref.
        builder.set_record(
            get_record_ref(maybe_int(raw_recid), 'literature')
        )

    add_editor = partial(builder.add_author, role='ed.')

    # (subfield code, builder method) pairs, applied in this order.
    dispatch = (
        ('0', _link_record),
        ('a', builder.add_uid),
        ('b', builder.add_uid),
        ('c', builder.add_collaboration),
        ('e', add_editor),
        ('h', builder.add_refextract_authors_str),
        ('i', builder.add_uid),
        ('k', builder.set_texkey),
        ('m', builder.add_misc),
        ('o', builder.set_label),
        ('p', builder.set_publisher),
        ('q', builder.add_parent_title),
        ('r', builder.add_report_number),
        ('s', builder.set_pubnote),
        ('t', builder.add_title),
        ('u', builder.add_url),
        ('x', builder.add_raw_reference),
        ('y', builder.set_year),
    )

    for code, handler in dispatch:
        subfield_values = force_list(value.get(code))
        for item in subfield_values:
            # Falsy values are never handed to the builder.
            if item:
                handler(item)

    return builder.obj
def map_refextract_to_schema(extracted_references, source=None):
    """Convert refextract output to the schema using the builder.

    Args:
        extracted_references: iterable of mappings; every known field is
            read from each one with ``.get(field)``.
        source: forwarded as the ``source`` keyword of
            ``add_raw_reference`` for every ``raw_ref`` value.

    Returns:
        list: for each input reference, the builder's ``obj`` followed by
        whatever ``pop_additional_pubnotes()`` returns for that builder.
    """

    def _populate(extracted):
        """Return a builder loaded with the fields of one reference."""
        builder = ReferenceBuilder()

        def _add_raw_reference(raw_ref):
            builder.add_raw_reference(raw_ref, source=source)

        # (refextract field, builder method) pairs, applied in this order.
        handlers = (
            ("author", builder.add_refextract_authors_str),
            ("collaboration", builder.add_collaboration),
            ("doi", builder.add_uid),
            ("hdl", builder.add_uid),
            ("isbn", builder.add_uid),
            ("journal_reference", builder.set_pubnote),
            ("linemarker", builder.set_label),
            ("misc", builder.add_misc),
            ("publisher", builder.set_publisher),
            ("raw_ref", _add_raw_reference),
            ("reportnumber", builder.add_report_number),
            ("texkey", builder.set_texkey),
            ("title", builder.add_title),
            ("url", builder.add_url),
            ("year", builder.set_year),
        )
        for name, handler in handlers:
            for item in force_list(extracted.get(name)):
                if not item:
                    continue
                handler(item)

        # Only touch the URL list when the builder actually produced one.
        if get_value(builder.obj, "reference.urls"):
            reference = builder.obj["reference"]
            reference["urls"] = dedupe_list_of_dicts(reference["urls"])

        return builder

    collected = []
    for extracted in extracted_references:
        builder = _populate(extracted)
        collected.append(builder.obj)
        # Extra pubnotes are emitted right after the reference itself.
        collected.extend(builder.pop_additional_pubnotes())
    return collected
def get_reference(self, ref_node):
    """Extract the references held by one ``<ref>`` node.

    Args:
        ref_node(scrapy.selector.Selector): a selector on a single reference,
            i.e. ``<ref>``.

    Yields:
        dict: one parsed reference per ``./mixed-citation`` match, as
            generated by :class:`inspire_schemas.api.ReferenceBuilder`
    """
    for citation_node in ref_node.xpath('./mixed-citation'):
        builder = ReferenceBuilder()

        # The raw reference is the whole ``ref_node``, not just the citation.
        builder.add_raw_reference(
            ref_node.extract().strip(),
            source=self.builder.source,
            ref_format='JATS'
        )

        # (xpath relative to the citation, builder method) pairs.
        fields = [
            (
                (
                    'self::node()[@publication-type="journal" '
                    'or @publication-type="eprint"]/source/text()'
                ),
                builder.set_journal_title,
            ),
            (
                'self::node()[@publication-type="book"]/source/text()',
                builder.add_parent_title,
            ),
            ('./publisher-name/text()', builder.set_publisher),
            ('./volume/text()', builder.set_journal_volume),
            ('./issue/text()', builder.set_journal_issue),
            ('./year/text()', builder.set_year),
            ('./pub-id[@pub-id-type="arxiv"]/text()', builder.add_uid),
            ('./pub-id[@pub-id-type="doi"]/text()', builder.add_uid),
            (
                'pub-id[@pub-id-type="other"]'
                '[contains(preceding-sibling::text(),"Report No")]/text()',
                builder.add_report_number
            ),
            ('./article-title/text()', builder.add_title),
            ('../label/text()', lambda x: builder.set_label(x.strip('[].')))
        ]

        for xpath, field_handler in fields:
            # Evaluate each xpath once; only the first match is used.
            value = citation_node.xpath(xpath).extract_first()
            if value:
                field_handler(value)

        # The text returned by ``remove_tags`` for the listed elements is
        # trimmed of surrounding punctuation/whitespace and stored as misc.
        remainder = remove_tags(
            citation_node,
            strip='self::person-group'
                  '|self::pub-id'
                  '|self::article-title'
                  '|self::volume'
                  '|self::issue'
                  '|self::year'
                  '|self::label'
                  '|self::publisher-name'
                  '|self::source[../@publication-type!="proc"]'
                  '|self::object-id'
                  '|self::page-range'
                  '|self::issn'
        ).strip('"\';,. \t\n\r').replace('()', '')
        if remainder:
            builder.add_misc(remainder)

        for editor in self.get_reference_authors(citation_node, 'editor'):
            builder.add_author(editor, 'editor')

        for author in self.get_reference_authors(citation_node, 'author'):
            builder.add_author(author, 'author')

        page_range = citation_node.xpath('./page-range/text()').extract_first()
        if page_range:
            page_artid = split_page_artid(page_range)
            builder.set_page_artid(*page_artid)

        yield builder.obj
def get_reference(self, ref_key):
    """Extract one reference.

    Args:
        ref_key(dict): a dictionary on a single reference. The keys read
            are ``journal-title``, ``volume``, ``issue``, ``first-page``,
            ``year``, ``article-title``, ``ISBN``, ``DOI``, ``author`` and
            ``unstructured``; missing or falsy entries are skipped.

    Yields:
        dict: the parsed reference, as generated by
            :class:`inspire_schemas.api.ReferenceBuilder`
    """
    builder = ReferenceBuilder()

    def _set_first_page(page):
        builder.set_page_artid(page_start=page)

    def _add_author(name):
        builder.add_author(name, 'author')

    def _add_raw_reference(raw_ref):
        # ``self.material_source`` is only looked up when a raw ref exists.
        builder.add_raw_reference(raw_ref, self.material_source)

    # (key in ``ref_key``, builder call) pairs, applied in this order.
    handlers = (
        ("journal-title", builder.set_journal_title),
        ("volume", builder.set_journal_volume),
        ("issue", builder.set_journal_issue),
        ("first-page", _set_first_page),
        ("year", builder.set_year),
        ("article-title", builder.add_title),
        ("ISBN", builder.add_uid),
        ("DOI", builder.add_uid),
        ("author", _add_author),
        ("unstructured", _add_raw_reference),
    )

    for name, handler in handlers:
        found = ref_key.get(name)
        if not found:
            continue
        handler(found)

    yield builder.obj
def get_reference_iter(self, ref_node):
    """Extract the references held by one ``<ref>`` node.

    Args:
        ref_node(scrapy.selector.Selector): a selector on a single reference,
            i.e. ``<ref>``.

    Yields:
        dict: one parsed reference per ``./reference`` or ``./other-ref``
            match, as generated by
            :class:`inspire_schemas.api.ReferenceBuilder`
    """
    # handle also unstructured refs
    for citation_node in ref_node.xpath("./reference|./other-ref"):
        builder = ReferenceBuilder()

        # The raw reference is the whole ``ref_node``, not just the citation.
        builder.add_raw_reference(
            ref_node.extract().strip(),
            source=self.builder.source,
            ref_format="Elsevier",
        )

        # (xpath relative to the citation, builder method) pairs.
        fields = [
            (
                "string(.//series/title/maintitle[1])",
                builder.set_journal_title,
            ),
            (
                "string(.//title[parent::edited-book|parent::book]/maintitle[1])",
                builder.add_parent_title,
            ),
            ("string(./publisher/name[1])", builder.set_publisher),
            ("string(.//volume-nr[1])", builder.set_journal_volume),
            ("string(.//issue-nr[1])", builder.set_journal_issue),
            ("string(.//date[1])", builder.set_year),
            ("string(.//inter-ref[1])", builder.add_url),
            ("string(.//doi[1])", builder.add_uid),
            (
                'string(pub-id[@pub-id-type="other"]'
                '[contains(preceding-sibling::text(),"Report No")][1])',
                builder.add_report_number,
            ),
            ("string(./title/maintitle[1])", builder.add_title),
        ]

        for xpath, field_handler in fields:
            # Evaluate each xpath once; only the first match is used.
            value = citation_node.xpath(xpath).extract_first()
            if value:
                field_handler(value)

        # The label is read from ``ref_node`` itself, not from the citation.
        label_value = ref_node.xpath("string(./label[1])").extract_first()
        builder.set_label(label_value.strip("[]"))

        pages = self.get_reference_pages(citation_node)
        artid = self.get_reference_artid(citation_node)
        if artid:
            builder.set_page_artid(artid=artid)
        if any(pages):
            builder.set_page_artid(*pages)

        # The text returned by ``remove_tags`` for the listed elements is
        # trimmed of surrounding punctuation/whitespace and stored as misc.
        remainder = remove_tags(
            citation_node,
            strip="self::authors"
                  "|self::article-number"
                  "|self::volume-nr"
                  "|self::issue-nr"
                  "|self::inter-ref"
                  "|self::maintitle"
                  "|self::date"
                  "|self::label"
                  "|self::publisher"
                  "|self::doi"
                  "|self::pages"
        ).strip("\"';,. \t\n\r").replace("()", "")
        if remainder:
            builder.add_misc(remainder)

        for editor in self.get_reference_editors(citation_node):
            builder.add_author(editor, "editor")

        for author in self.get_reference_authors(citation_node):
            builder.add_author(author, "author")

        yield builder.obj
def __init__(self, content):
    """Set up the builder and the XML selector for ``content``.

    Args:
        content: passed as ``text`` to ``Selector`` with ``type="xml"``.
    """
    reference_builder = ReferenceBuilder()
    self.builder = reference_builder

    xml_root = Selector(text=content, type="xml")
    self.root = xml_root
class GrobidReferenceParser:
    """Parse single reference from `<biblStruct>` root."""

    def __init__(self, content):
        """Set up the builder and the XML selector for ``content``."""
        self.builder = ReferenceBuilder()
        self.root = Selector(text=content, type="xml")

    def parse(self):
        """Feed every extracted field to the builder.

        Returns:
            the builder's ``obj`` after ``strip_empty_values``.
        """
        builder = self.builder

        for report_number in self.report_numbers:
            builder.add_report_number(report_number)

        builder.add_uid(self.isbn)
        builder.add_uid(self.arxiv_eprint)
        for doi in self.dois:
            builder.add_uid(doi)

        builder.set_journal_issue(self.journal_issue)
        builder.set_journal_volume(self.journal_volume)
        builder.set_journal_title(self.journal_title)

        first_page = self.page_start
        last_page = self.page_end
        builder.set_page_artid(page_start=first_page, page_end=last_page)

        builder.set_year(self.year)

        return strip_empty_values(builder.obj)

    @property
    def arxiv_eprint(self):
        """First text of a ``monogr/idno`` whose type is ``arXiv``."""
        matches = self.root.xpath(
            "/biblStruct/monogr/idno[@type='arXiv']/text()")
        return matches.get()

    @property
    def dois(self):
        """All texts of ``monogr/idno`` elements whose type is ``DOI``."""
        matches = self.root.xpath(
            "/biblStruct/monogr/idno[@type='DOI']/text()")
        return matches.getall()

    @property
    def isbn(self):
        """First text of a ``monogr/idno`` whose type is ``isbn``."""
        matches = self.root.xpath(
            "/biblStruct/monogr/idno[@type='isbn']/text()")
        return matches.get()

    @property
    def report_numbers(self):
        """All texts of ``monogr/idno`` elements without a type."""
        matches = self.root.xpath(
            "/biblStruct/monogr/idno[not(@type)]/text()")
        return matches.getall()

    @property
    def journal_volume(self):
        """First text of an imprint ``biblScope`` with unit ``volume``."""
        matches = self.root.xpath(
            "/biblStruct/monogr/imprint/biblScope[@unit='volume']/text()")
        return matches.get()

    @property
    def journal_issue(self):
        """First text of an imprint ``biblScope`` with unit ``issue``."""
        matches = self.root.xpath(
            "/biblStruct/monogr/imprint/biblScope[@unit='issue']/text()")
        return matches.get()

    @property
    def journal_title(self):
        """First text of ``monogr/title``."""
        matches = self.root.xpath("/biblStruct/monogr/title/text()")
        return matches.get()

    @property
    def page_start(self):
        """First match among the page scope's ``from`` attribute and text."""
        matches = self.root.xpath(
            "(/biblStruct/monogr/imprint/biblScope[@unit='page']/@from | "
            "/biblStruct/monogr/imprint/biblScope[@unit='page']/text())"
        )
        return matches.get()

    @property
    def page_end(self):
        """First ``to`` attribute of an imprint page ``biblScope``."""
        matches = self.root.xpath(
            "/biblStruct/monogr/imprint/biblScope[@unit='page']/@to")
        return matches.get()

    @property
    def year(self):
        """First ``when`` attribute of an imprint ``published`` date."""
        matches = self.root.xpath(
            "/biblStruct/monogr/imprint/date[@type='published']/@when")
        return matches.get()