예제 #1
0
def map_refextract_to_schema(extracted_references, source=None):
    """Build schema references from refextract output.

    Every extracted reference is fed, field by field, into a fresh
    ``ReferenceBuilder``. Falsy values are skipped, and the URLs collected
    on the built object are de-duplicated before it is stored.

    Args:
        extracted_references: iterable of refextract results; each one is
            read with ``.get(<field name>)``.
        source: passed as ``source`` when the raw reference is registered.

    Returns:
        list: the built object of each extracted reference, in input order.
    """
    def _convert(extracted):
        builder = ReferenceBuilder()

        def _add_raw(raw_ref):
            return builder.add_raw_reference(raw_ref, source=source)

        # (refextract field, builder method) pairs; the order is the order
        # in which the builder gets called.
        handlers = (
            ('author', builder.add_refextract_authors_str),
            ('collaboration', builder.add_collaboration),
            ('doi', builder.add_uid),
            ('hdl', builder.add_uid),
            ('isbn', builder.add_uid),
            ('journal_reference', builder.set_pubnote),
            ('linemarker', builder.set_label),
            ('misc', builder.add_misc),
            ('publisher', builder.set_publisher),
            ('raw_ref', _add_raw),
            ('reportnumber', builder.add_report_number),
            ('texkey', builder.set_texkey),
            ('title', builder.add_title),
            ('url', builder.add_url),
            ('year', builder.set_year),
        )

        for key, handle in handlers:
            for item in force_list(extracted.get(key)):
                if not item:
                    continue
                handle(item)

        if get_value(builder.obj, 'reference.urls'):
            unique_urls = dedupe_list_of_dicts(builder.obj['reference']['urls'])
            builder.obj['reference']['urls'] = unique_urls

        return builder.obj

    return [_convert(extracted) for extracted in extracted_references]
예제 #2
0
def map_refextract_to_schema(extracted_references, source=None):
    """Translate refextract output into builder-generated references.

    Args:
        extracted_references: iterable of refextract results; each one is
            read with ``.get(<field name>)``.
        source: passed as ``source`` when the raw reference is registered.

    Returns:
        list: one built object per extracted reference, in input order.
    """
    converted = []

    for extracted in extracted_references:
        builder = ReferenceBuilder()

        def _add_raw(raw_ref):
            return builder.add_raw_reference(raw_ref, source=source)

        # Pairs are applied top to bottom, so this is also the call order.
        dispatch = (
            ('author', builder.add_refextract_authors_str),
            ('collaboration', builder.add_collaboration),
            ('doi', builder.add_uid),
            ('hdl', builder.add_uid),
            ('isbn', builder.add_uid),
            ('journal_reference', builder.set_pubnote),
            ('linemarker', builder.set_label),
            ('misc', builder.add_misc),
            ('publisher', builder.set_publisher),
            ('raw_ref', _add_raw),
            ('reportnumber', builder.add_report_number),
            ('texkey', builder.set_texkey),
            ('title', builder.add_title),
            ('url', builder.add_url),
            ('year', builder.set_year),
        )

        for key, handler in dispatch:
            # ``filter(None, ...)`` keeps only the truthy values.
            for item in filter(None, force_list(extracted.get(key))):
                handler(item)

        converted.append(builder.obj)

    return converted
예제 #3
0
def references(self, key, value):
    """Populate the ``references`` key.

    Each subfield of ``value`` listed in the handler table is passed to
    the matching ``ReferenceBuilder`` method (falsy values are skipped).
    The reference is curated when subfield ``z`` equals ``'1'`` and one of
    the ``9`` subfields is ``CURATOR`` (case-insensitive); the presence of
    that ``CURATOR`` flag alone sets ``legacy_curated``.

    Args:
        self: unused here.
        key: unused here.
        value: the field being converted; read with ``.get(<subfield>)``.

    Returns:
        the object built by the ``ReferenceBuilder``.
    """
    builder = ReferenceBuilder()

    def _link_record(raw_recid):
        record = get_record_ref(maybe_int(raw_recid), 'literature')
        builder.set_record(record)

    def _flagged_by_curator():
        nines = [nine.upper() for nine in force_list(value.get('9'))]
        return 'CURATOR' in nines

    # (subfield code, builder method) pairs, applied top to bottom.
    subfield_handlers = (
        ('0', _link_record),
        ('a', builder.add_uid),
        ('b', builder.add_uid),
        ('c', builder.add_collaboration),
        ('e', partial(builder.add_author, role='ed.')),
        ('h', builder.add_refextract_authors_str),
        ('i', builder.add_uid),
        ('k', builder.set_texkey),
        ('m', builder.add_misc),
        ('o', builder.set_label),
        ('p', builder.set_publisher),
        ('q', builder.add_parent_title),
        ('r', builder.add_report_number),
        ('s', builder.set_pubnote),
        ('t', builder.add_title),
        ('u', builder.add_url),
        ('x', builder.add_raw_reference),
        ('y', builder.set_year),
    )

    for code, handler in subfield_handlers:
        for item in force_list(value.get(code)):
            if not item:
                continue
            handler(item)

    if value.get('z') == '1' and _flagged_by_curator():
        builder.curate()

    if _flagged_by_curator():
        builder.obj['legacy_curated'] = True

    return builder.obj
예제 #4
0
    def _get_reference(value):
        """Build one reference from the subfields of ``value``.

        Args:
            value: the field being converted; read with
                ``.get(<subfield>)``.

        Returns:
            the object built by the ``ReferenceBuilder``.
        """
        builder = ReferenceBuilder()

        def _link_record(raw_recid):
            record = get_record_ref(maybe_int(raw_recid), 'literature')
            builder.set_record(record)

        # (subfield code, builder method) pairs, applied top to bottom.
        subfield_handlers = (
            ('0', _link_record),
            ('a', builder.add_uid),
            ('b', builder.add_uid),
            ('c', builder.add_collaboration),
            ('e', partial(builder.add_author, role='ed.')),
            ('h', builder.add_refextract_authors_str),
            ('i', builder.add_uid),
            ('k', builder.set_texkey),
            ('m', builder.add_misc),
            ('o', builder.set_label),
            ('p', builder.set_publisher),
            ('q', builder.add_parent_title),
            ('r', builder.add_report_number),
            ('s', builder.set_pubnote),
            ('t', builder.add_title),
            ('u', builder.add_url),
            ('x', builder.add_raw_reference),
            ('y', builder.set_year),
        )

        for code, handler in subfield_handlers:
            candidates = force_list(value.get(code))
            for candidate in candidates:
                if not candidate:
                    # Falsy subfield values are never handed to the builder.
                    continue
                handler(candidate)

        return builder.obj
예제 #5
0
def map_refextract_to_schema(extracted_references, source=None):
    """Turn refextract output into builder-generated references.

    For every extracted reference the built object is appended to the
    result, immediately followed by whatever the builder's
    ``pop_additional_pubnotes()`` returns.

    Args:
        extracted_references: iterable of refextract results; each one is
            read with ``.get(<field name>)``.
        source: passed as ``source`` when the raw reference is registered.

    Returns:
        list: the collected objects, in the order described above.
    """
    converted = []

    for extracted in extracted_references:
        builder = ReferenceBuilder()

        def _add_raw(raw_ref):
            return builder.add_raw_reference(raw_ref, source=source)

        # (refextract field, builder method) pairs, applied top to bottom.
        handlers = (
            ("author", builder.add_refextract_authors_str),
            ("collaboration", builder.add_collaboration),
            ("doi", builder.add_uid),
            ("hdl", builder.add_uid),
            ("isbn", builder.add_uid),
            ("journal_reference", builder.set_pubnote),
            ("linemarker", builder.set_label),
            ("misc", builder.add_misc),
            ("publisher", builder.set_publisher),
            ("raw_ref", _add_raw),
            ("reportnumber", builder.add_report_number),
            ("texkey", builder.set_texkey),
            ("title", builder.add_title),
            ("url", builder.add_url),
            ("year", builder.set_year),
        )

        for key, handle in handlers:
            for item in force_list(extracted.get(key)):
                if not item:
                    continue
                handle(item)

        # Collapse duplicated URL entries, when any URL was collected.
        if get_value(builder.obj, "reference.urls"):
            unique_urls = dedupe_list_of_dicts(builder.obj["reference"]["urls"])
            builder.obj["reference"]["urls"] = unique_urls

        converted.append(builder.obj)
        converted.extend(builder.pop_additional_pubnotes())

    return converted
예제 #6
0
    def get_reference(self, ref_node):
        """Extract the references held by one ``<ref>`` node.

        A reference is built and yielded for every ``./mixed-citation``
        node found under ``ref_node``.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single
                reference, i.e. ``<ref>``.

        Yields:
            dict: the parsed reference, as generated by
                :class:`inspire_schemas.api.ReferenceBuilder`
        """
        for citation_node in ref_node.xpath('./mixed-citation'):
            builder = ReferenceBuilder()

            # The raw reference is the whole ``<ref>`` node, not only the
            # current citation.
            builder.add_raw_reference(
                ref_node.extract().strip(),
                source=self.builder.source,
                ref_format='JATS'
            )

            # (XPath evaluated on the citation node, builder method that
            # receives its first match).
            fields = [
                (
                    (
                        'self::node()[@publication-type="journal" '
                        'or @publication-type="eprint"]/source/text()'
                    ),
                    builder.set_journal_title,
                ),
                (
                    'self::node()[@publication-type="book"]/source/text()',
                    builder.add_parent_title,
                ),
                ('./publisher-name/text()', builder.set_publisher),
                ('./volume/text()', builder.set_journal_volume),
                ('./issue/text()', builder.set_journal_issue),
                ('./year/text()', builder.set_year),
                ('./pub-id[@pub-id-type="arxiv"]/text()', builder.add_uid),
                ('./pub-id[@pub-id-type="doi"]/text()', builder.add_uid),
                (
                    'pub-id[@pub-id-type="other"]'
                    '[contains(preceding-sibling::text(),"Report No")]/text()',
                    builder.add_report_number
                ),
                ('./article-title/text()', builder.add_title),
                ('../label/text()', lambda x: builder.set_label(x.strip('[].')))
            ]

            for xpath, field_handler in fields:
                # Each XPath is evaluated exactly once; only the first match
                # is used and empty/missing matches are skipped.
                value = citation_node.xpath(xpath).extract_first()
                if value:
                    field_handler(value)

            # Text left over once the listed elements are stripped is kept
            # as ``misc``, after trimming punctuation/whitespace and dropping
            # empty ``()`` pairs.
            remainder = remove_tags(
                    citation_node,
                    strip='self::person-group'
                          '|self::pub-id'
                          '|self::article-title'
                          '|self::volume'
                          '|self::issue'
                          '|self::year'
                          '|self::label'
                          '|self::publisher-name'
                          '|self::source[../@publication-type!="proc"]'
                          '|self::object-id'
                          '|self::page-range'
                          '|self::issn'
                ).strip('"\';,. \t\n\r').replace('()', '')
            if remainder:
                builder.add_misc(remainder)

            for editor in self.get_reference_authors(citation_node, 'editor'):
                builder.add_author(editor, 'editor')

            for author in self.get_reference_authors(citation_node, 'author'):
                builder.add_author(author, 'author')

            page_range = citation_node.xpath('./page-range/text()').extract_first()
            if page_range:
                page_artid = split_page_artid(page_range)
                builder.set_page_artid(*page_artid)

            yield builder.obj
예제 #7
0
    def get_reference(self, ref_key):
        """Build a reference from one reference dictionary.

        Args:
            ref_key(dict): a dictionary on a single reference.

        Yields:
            dict: the parsed reference, as generated by
                :class:`inspire_schemas.api.ReferenceBuilder`
        """
        builder = ReferenceBuilder()

        # (key read from ``ref_key``, callable fed with its value); entries
        # are processed top to bottom and falsy values are skipped.
        handlers = (
            ("journal-title", builder.set_journal_title),
            ("volume", builder.set_journal_volume),
            ("issue", builder.set_journal_issue),
            (
                "first-page",
                lambda page: builder.set_page_artid(page_start=page),
            ),
            ("year", builder.set_year),
            ("article-title", builder.add_title),
            ("ISBN", builder.add_uid),
            ("DOI", builder.add_uid),
            ("author", lambda name: builder.add_author(name, 'author')),
            (
                "unstructured",
                lambda raw: builder.add_raw_reference(
                    raw, self.material_source),
            ),
        )

        for ref_field, handler in handlers:
            found = ref_key.get(ref_field)
            if found:
                handler(found)

        yield builder.obj
예제 #8
0
    def get_reference_iter(self, ref_node):
        """Extract the references held by one ``<ref>`` node.

        A reference is built and yielded for every ``./reference`` or
        ``./other-ref`` node found under ``ref_node``.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single
                reference, i.e. ``<ref>``.

        Yields:
            dict: the parsed reference, as generated by
                :class:`inspire_schemas.api.ReferenceBuilder`
        """
        # handle also unstructured refs
        for citation_node in ref_node.xpath("./reference|./other-ref"):
            builder = ReferenceBuilder()

            # The raw reference is the whole ``<ref>`` node, not only the
            # current citation.
            builder.add_raw_reference(
                ref_node.extract().strip(),
                source=self.builder.source,
                ref_format="Elsevier",
            )

            # (XPath evaluated on the citation node, builder method that
            # receives its first result).
            fields = [
                (
                    "string(.//series/title/maintitle[1])",
                    builder.set_journal_title,
                ),
                (
                    "string(.//title[parent::edited-book|parent::book]/maintitle[1])",
                    builder.add_parent_title,
                ),
                ("string(./publisher/name[1])", builder.set_publisher),
                ("string(.//volume-nr[1])", builder.set_journal_volume),
                ("string(.//issue-nr[1])", builder.set_journal_issue),
                ("string(.//date[1])", builder.set_year),
                ("string(.//inter-ref[1])", builder.add_url),
                ("string(.//doi[1])", builder.add_uid),
                (
                    'string(pub-id[@pub-id-type="other"]'
                    '[contains(preceding-sibling::text(),"Report No")][1])',
                    builder.add_report_number,
                ),
                ("string(./title/maintitle[1])", builder.add_title),
            ]
            for xpath, field_handler in fields:
                # Each XPath is evaluated exactly once; empty results are
                # skipped.
                value = citation_node.xpath(xpath).extract_first()
                if value:
                    field_handler(value)

            # The label is read from the ``<ref>`` node itself.
            # NOTE(review): the label is set unconditionally, so an empty
            # string is passed when no label text is found -- confirm that
            # this is intended.
            label_value = ref_node.xpath("string(./label[1])").extract_first()
            builder.set_label(label_value.strip("[]"))

            pages = self.get_reference_pages(citation_node)
            artid = self.get_reference_artid(citation_node)
            if artid:
                builder.set_page_artid(artid=artid)
            if any(pages):
                builder.set_page_artid(*pages)

            # Text left over once the listed elements are stripped is kept
            # as ``misc``, after trimming punctuation/whitespace and dropping
            # empty ``()`` pairs.
            remainder = (remove_tags(
                citation_node,
                strip="self::authors"
                "|self::article-number"
                "|self::volume-nr"
                "|self::issue-nr"
                "|self::inter-ref"
                "|self::maintitle"
                "|self::date"
                "|self::label"
                "|self::publisher"
                "|self::doi"
                "|self::pages").strip("\"';,. \t\n\r").replace("()", ""))
            if remainder:
                builder.add_misc(remainder)

            for editor in self.get_reference_editors(citation_node):
                builder.add_author(editor, "editor")

            for author in self.get_reference_authors(citation_node):
                builder.add_author(author, "author")

            yield builder.obj
예제 #9
0
파일: parsers.py 프로젝트: tsgit/inspirehep
 def __init__(self, content):
     """Create the builder and an XML selector over ``content``.

     Args:
         content: text handed to ``Selector`` (``type="xml"``).
     """
     reference_builder = ReferenceBuilder()
     xml_root = Selector(text=content, type="xml")
     self.builder = reference_builder
     self.root = xml_root
예제 #10
0
파일: parsers.py 프로젝트: tsgit/inspirehep
class GrobidReferenceParser:
    """Parse single reference from `<biblStruct>` root.

    Every property runs one XPath query against ``self.root`` and
    returns either its first match (``.get()``) or all of its matches
    (``.getall()``). ``parse`` feeds those values to ``self.builder``.
    """

    def __init__(self, content):
        """Create the builder and an XML selector over ``content``."""
        self.builder = ReferenceBuilder()
        self.root = Selector(text=content, type="xml")

    def _first(self, query):
        """Return what ``.get()`` gives for ``query`` run on the root."""
        return self.root.xpath(query).get()

    def _every(self, query):
        """Return what ``.getall()`` gives for ``query`` run on the root."""
        return self.root.xpath(query).getall()

    def parse(self):
        """Fill the builder from the parsed XML and return the result.

        Returns:
            the builder's object after ``strip_empty_values``.
        """
        builder = self.builder

        for number in self.report_numbers:
            builder.add_report_number(number)

        builder.add_uid(self.isbn)
        builder.add_uid(self.arxiv_eprint)

        for doi in self.dois:
            builder.add_uid(doi)

        builder.set_journal_issue(self.journal_issue)
        builder.set_journal_volume(self.journal_volume)
        builder.set_journal_title(self.journal_title)

        first_page = self.page_start
        last_page = self.page_end
        builder.set_page_artid(page_start=first_page, page_end=last_page)

        builder.set_year(self.year)
        return strip_empty_values(builder.obj)

    @property
    def arxiv_eprint(self):
        """First ``idno`` of type ``arXiv`` under ``monogr``."""
        return self._first("/biblStruct/monogr/idno[@type='arXiv']/text()")

    @property
    def dois(self):
        """All ``idno`` values of type ``DOI`` under ``monogr``."""
        return self._every("/biblStruct/monogr/idno[@type='DOI']/text()")

    @property
    def isbn(self):
        """First ``idno`` of type ``isbn`` under ``monogr``."""
        return self._first("/biblStruct/monogr/idno[@type='isbn']/text()")

    @property
    def report_numbers(self):
        """All ``idno`` values without a ``type`` attribute."""
        return self._every("/biblStruct/monogr/idno[not(@type)]/text()")

    @property
    def journal_volume(self):
        """First ``biblScope`` with ``unit='volume'`` in the imprint."""
        return self._first(
            "/biblStruct/monogr/imprint/biblScope[@unit='volume']/text()")

    @property
    def journal_issue(self):
        """First ``biblScope`` with ``unit='issue'`` in the imprint."""
        return self._first(
            "/biblStruct/monogr/imprint/biblScope[@unit='issue']/text()")

    @property
    def journal_title(self):
        """First ``title`` text under ``monogr``."""
        return self._first("/biblStruct/monogr/title/text()")

    @property
    def page_start(self):
        """First match among the page scope's ``from`` attribute or text."""
        return self._first(
            "(/biblStruct/monogr/imprint/biblScope[@unit='page']/@from | "
            "/biblStruct/monogr/imprint/biblScope[@unit='page']/text())"
        )

    @property
    def page_end(self):
        """The ``to`` attribute of the page scope."""
        return self._first(
            "/biblStruct/monogr/imprint/biblScope[@unit='page']/@to")

    @property
    def year(self):
        """The ``when`` attribute of the ``published`` date."""
        return self._first(
            "/biblStruct/monogr/imprint/date[@type='published']/@when")
예제 #11
0
파일: jats.py 프로젝트: drjova/hepcrawl
    def get_reference(self, ref_node):
        """Extract the references held by one ``<ref>`` node.

        A reference is built and yielded for every ``./mixed-citation``
        node found under ``ref_node``.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single
                reference, i.e. ``<ref>``.

        Yields:
            dict: the parsed reference, as generated by
                :class:`inspire_schemas.api.ReferenceBuilder`
        """
        for citation_node in ref_node.xpath('./mixed-citation'):
            builder = ReferenceBuilder()

            # The raw reference is the whole ``<ref>`` node, not only the
            # current citation.
            builder.add_raw_reference(
                ref_node.extract().strip(),
                source=self.builder.source,
                ref_format='JATS'
            )

            # (XPath evaluated on the citation node, builder method that
            # receives its first match).
            fields = [
                (
                    (
                        'self::node()[@publication-type="journal" '
                        'or @publication-type="eprint"]/source/text()'
                    ),
                    builder.set_journal_title,
                ),
                (
                    'self::node()[@publication-type="book"]/source/text()',
                    builder.add_parent_title,
                ),
                ('./publisher-name/text()', builder.set_publisher),
                ('./volume/text()', builder.set_journal_volume),
                ('./issue/text()', builder.set_journal_issue),
                ('./year/text()', builder.set_year),
                ('./pub-id[@pub-id-type="arxiv"]/text()', builder.add_uid),
                ('./pub-id[@pub-id-type="doi"]/text()', builder.add_uid),
                (
                    'pub-id[@pub-id-type="other"]'
                    '[contains(preceding-sibling::text(),"Report No")]/text()',
                    builder.add_report_number
                ),
                ('./article-title/text()', builder.add_title),
                ('../label/text()', lambda x: builder.set_label(x.strip('[].')))
            ]

            for xpath, field_handler in fields:
                # Each XPath is evaluated exactly once; only the first match
                # is used and empty/missing matches are skipped.
                value = citation_node.xpath(xpath).extract_first()
                if value:
                    field_handler(value)

            # Text left over once the listed elements are stripped is kept
            # as ``misc``, after trimming punctuation/whitespace and dropping
            # empty ``()`` pairs.
            remainder = remove_tags(
                    citation_node,
                    strip='self::person-group'
                          '|self::pub-id'
                          '|self::article-title'
                          '|self::volume'
                          '|self::issue'
                          '|self::year'
                          '|self::label'
                          '|self::publisher-name'
                          '|self::source[../@publication-type!="proc"]'
                          '|self::object-id'
                          '|self::page-range'
                          '|self::issn'
                ).strip('"\';,. \t\n\r').replace('()', '')
            if remainder:
                builder.add_misc(remainder)

            for editor in self.get_reference_authors(citation_node, 'editor'):
                builder.add_author(editor, 'editor')

            for author in self.get_reference_authors(citation_node, 'author'):
                builder.add_author(author, 'author')

            page_range = citation_node.xpath('./page-range/text()').extract_first()
            if page_range:
                page_artid = split_page_artid(page_range)
                builder.set_page_artid(*page_artid)

            yield builder.obj