示例#1
0
def test_remove_tags_unicode():
    """Non-ASCII text wrapped in a tag comes back unchanged, without the tag."""
    markup = u'<p>😋</p>'
    wanted = u'😋'

    assert remove_tags(markup) == wanted
示例#2
0
    def abstract(self):
        """Return the cleaned-up abstract text, or ``None`` if there is none.

        The first node matched by ``./front//abstract[1]`` is passed through
        ``remove_tags`` (configured by ``self.remove_tags_config_abstract``)
        and the result is stripped of surrounding whitespace.
        """
        matches = self.root.xpath('./front//abstract[1]')
        if matches:
            cleaned = remove_tags(matches[0], **self.remove_tags_config_abstract)
            return cleaned.strip()

        return None
示例#3
0
def test_remove_tags_strip_keeps_tails():
    """Stripping an element drops its content but keeps the text after it."""
    markup = '<foo>This goes</foo> but this remains.'

    cleaned = remove_tags(markup, strip='self::foo')

    assert cleaned == u' but this remains.'
示例#4
0
文件: jats.py 项目: drjova/hepcrawl
    def abstract(self):
        """Extract the abstract as plain text.

        Returns:
            Optional[str]: the first node matched by ``./front//abstract[1]``
                after ``remove_tags`` has been applied with
                ``self.remove_tags_config_abstract`` and the result has been
                stripped, or ``None`` when the XPath matches nothing.
        """
        found = self.root.xpath('./front//abstract[1]')
        if found:
            first_node = found[0]
            text = remove_tags(first_node, **self.remove_tags_config_abstract)
            return text.strip()

        return None
示例#5
0
def test_remove_tags_allowed_tags_strip():
    """``allowed_tags`` and ``strip`` can be combined in a single call."""
    markup = (
        '<p><b><i>Only</i></b> this text remains.'
        '<span class="hidden">Not this one.</span></p>'
    )

    cleaned = remove_tags(markup, allowed_tags=('b', ), strip='@class="hidden"')

    assert cleaned == u'<b>Only</b> this text remains.'
示例#6
0
    def abstract(self):
        """Return the abstract as a single string, or ``None`` if none is found.

        Every ``simple-para`` under an ``abstract`` without a ``graphical``
        attribute is cleaned with ``remove_tags`` (configured by
        ``self.remove_tags_config_abstract``) and trimmed of leading and
        trailing slashes, spaces and newlines. The paragraphs are then joined
        with single spaces.
        """
        paragraph_nodes = self.root.xpath(
            ".//head/abstract[not(@graphical)]/abstract-sec/simple-para")
        if not paragraph_nodes:
            return None

        cleaned_paragraphs = []
        for paragraph_node in paragraph_nodes:
            text = remove_tags(paragraph_node, **self.remove_tags_config_abstract)
            cleaned_paragraphs.append(text.strip("/ \n"))

        return ' '.join(cleaned_paragraphs)
示例#7
0
    def get_affiliation(self, id_):
        """Look up an affiliation by the ``id`` attribute of its ``aff`` node.

        Args:
            id_(str): the value of the ``id`` attribute of the affiliation.

        Returns:
            Optional[str]: the first matching affiliation with its ``label``
                and ``email`` elements stripped out, or ``None`` if there is
                no match.
        """
        matches = self.root.xpath("//aff[@id=$id_]", id_=id_)
        if not matches:
            return None

        cleaned = remove_tags(matches[0], strip="self::label | self::email")
        return cleaned.strip()
示例#8
0
文件: jats.py 项目: drjova/hepcrawl
    def get_affiliation(self, id_):
        """Get the affiliation with the specified id.

        Args:
            id_(str): the value of the ``id`` attribute of the affiliation.

        Returns:
            Optional[str]: the affiliation with that id (with its ``label``
                and ``email`` elements stripped out) or ``None`` if there is
                no match.
        """
        affiliation_nodes = self.root.xpath('//aff[@id=$id_]', id_=id_)
        if not affiliation_nodes:
            # Honour the documented contract: an unknown id yields ``None``
            # rather than an IndexError from indexing an empty result.
            return None

        affiliation = remove_tags(
            affiliation_nodes[0],
            strip='self::label | self::email'
        ).strip()

        return affiliation
示例#9
0
    def get_reference(self, ref_node):
        """Extract the references contained in one ``<ref>`` element.

        One reference is built for every ``mixed-citation`` child of
        ``ref_node``. This is a generator: nothing is parsed until it is
        iterated.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single
                reference, i.e. ``<ref>``.

        Yields:
            dict: the parsed reference, as generated by
                :class:`inspire_schemas.api.ReferenceBuilder`
        """
        for citation_node in ref_node.xpath('./mixed-citation'):
            builder = ReferenceBuilder()

            # The raw reference is the whole ``<ref>``, not just this citation.
            builder.add_raw_reference(
                ref_node.extract().strip(),
                source=self.builder.source,
                ref_format='JATS'
            )

            # (XPath relative to the citation, builder method fed its first hit)
            fields = [
                (
                    (
                        'self::node()[@publication-type="journal" '
                        'or @publication-type="eprint"]/source/text()'
                    ),
                    builder.set_journal_title,
                ),
                (
                    'self::node()[@publication-type="book"]/source/text()',
                    builder.add_parent_title,
                ),
                ('./publisher-name/text()', builder.set_publisher),
                ('./volume/text()', builder.set_journal_volume),
                ('./issue/text()', builder.set_journal_issue),
                ('./year/text()', builder.set_year),
                ('./pub-id[@pub-id-type="arxiv"]/text()', builder.add_uid),
                ('./pub-id[@pub-id-type="doi"]/text()', builder.add_uid),
                (
                    'pub-id[@pub-id-type="other"]'
                    '[contains(preceding-sibling::text(),"Report No")]/text()',
                    builder.add_report_number
                ),
                ('./article-title/text()', builder.add_title),
                ('../label/text()', lambda x: builder.set_label(x.strip('[].')))
            ]

            for xpath, field_handler in fields:
                # Evaluate each expression exactly once; empty or missing
                # values are not passed to the builder.
                value = citation_node.xpath(xpath).extract_first()
                if value:
                    field_handler(value)

            # Whatever text is left after stripping the listed elements is
            # kept as miscellaneous information.
            remainder = remove_tags(
                citation_node,
                strip='self::person-group'
                      '|self::pub-id'
                      '|self::article-title'
                      '|self::volume'
                      '|self::issue'
                      '|self::year'
                      '|self::label'
                      '|self::publisher-name'
                      '|self::source[../@publication-type!="proc"]'
                      '|self::object-id'
                      '|self::page-range'
                      '|self::issn'
            ).strip('"\';,. \t\n\r').replace('()', '')
            if remainder:
                builder.add_misc(remainder)

            for editor in self.get_reference_authors(citation_node, 'editor'):
                builder.add_author(editor, 'editor')

            for author in self.get_reference_authors(citation_node, 'author'):
                builder.add_author(author, 'author')

            page_range = citation_node.xpath('./page-range/text()').extract_first()
            if page_range:
                page_artid = split_page_artid(page_range)
                builder.set_page_artid(*page_artid)

            yield builder.obj
示例#10
0
    def get_reference_iter(self, ref_node):
        """Extract the references contained in one reference element.

        One reference is built for every ``reference`` or ``other-ref`` child
        of ``ref_node``. This is a generator: nothing is parsed until it is
        iterated.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single
                reference, i.e. ``<ref>``.

        Yields:
            dict: the parsed reference, as generated by
                :class:`inspire_schemas.api.ReferenceBuilder`
        """
        # handle also unstructured refs
        for citation_node in ref_node.xpath("./reference|./other-ref"):
            builder = ReferenceBuilder()

            # The raw reference is the whole ``ref_node``, not just this child.
            builder.add_raw_reference(
                ref_node.extract().strip(),
                source=self.builder.source,
                ref_format="Elsevier",
            )

            # (XPath relative to the citation, builder method fed its value)
            fields = [
                (
                    ("string(.//series/title/maintitle[1])"),
                    builder.set_journal_title,
                ),
                (
                    "string(.//title[parent::edited-book|parent::book]/maintitle[1])",
                    builder.add_parent_title,
                ),
                ("string(./publisher/name[1])", builder.set_publisher),
                ("string(.//volume-nr[1])", builder.set_journal_volume),
                ("string(.//issue-nr[1])", builder.set_journal_issue),
                ("string(.//date[1])", builder.set_year),
                ("string(.//inter-ref[1])", builder.add_url),
                ("string(.//doi[1])", builder.add_uid),
                (
                    'string(pub-id[@pub-id-type="other"]'
                    '[contains(preceding-sibling::text(),"Report No")][1])',
                    builder.add_report_number,
                ),
                ("string(./title/maintitle[1])", builder.add_title),
            ]
            for xpath, field_handler in fields:
                # Evaluate each expression exactly once; empty values are
                # not passed to the builder.
                value = citation_node.xpath(xpath).extract_first()
                if value:
                    field_handler(value)

            # The label is read from ``ref_node`` itself, not from the child.
            label_value = ref_node.xpath("string(./label[1])").extract_first()
            builder.set_label(label_value.strip("[]"))

            pages = self.get_reference_pages(citation_node)
            artid = self.get_reference_artid(citation_node)
            if artid:
                builder.set_page_artid(artid=artid)
            if any(pages):
                builder.set_page_artid(*pages)

            # Whatever text is left after stripping the listed elements is
            # kept as miscellaneous information.
            remainder = (remove_tags(
                citation_node,
                strip="self::authors"
                "|self::article-number"
                "|self::volume-nr"
                "|self::issue-nr"
                "|self::inter-ref"
                "|self::maintitle"
                "|self::date"
                "|self::label"
                "|self::publisher"
                "|self::doi"
                "|self::pages").strip("\"';,. \t\n\r").replace("()", ""))
            if remainder:
                builder.add_misc(remainder)

            for editor in self.get_reference_editors(citation_node):
                builder.add_author(editor, "editor")

            for author in self.get_reference_authors(citation_node):
                builder.add_author(author, "author")

            yield builder.obj
示例#11
0
文件: jats.py 项目: drjova/hepcrawl
    def get_reference(self, ref_node):
        """Extract the references contained in one ``<ref>`` element.

        A reference is produced for each ``mixed-citation`` child of
        ``ref_node``. Because this is a generator, parsing only happens while
        the result is being iterated.

        Args:
            ref_node(scrapy.selector.Selector): a selector on a single
                reference, i.e. ``<ref>``.

        Yields:
            dict: the parsed reference, as generated by
                :class:`inspire_schemas.api.ReferenceBuilder`
        """
        for citation_node in ref_node.xpath('./mixed-citation'):
            builder = ReferenceBuilder()

            # The raw reference records the full ``<ref>`` markup.
            builder.add_raw_reference(
                ref_node.extract().strip(),
                source=self.builder.source,
                ref_format='JATS'
            )

            # Pairs of (XPath relative to the citation, builder method that
            # receives the first extracted value).
            fields = [
                (
                    (
                        'self::node()[@publication-type="journal" '
                        'or @publication-type="eprint"]/source/text()'
                    ),
                    builder.set_journal_title,
                ),
                (
                    'self::node()[@publication-type="book"]/source/text()',
                    builder.add_parent_title,
                ),
                ('./publisher-name/text()', builder.set_publisher),
                ('./volume/text()', builder.set_journal_volume),
                ('./issue/text()', builder.set_journal_issue),
                ('./year/text()', builder.set_year),
                ('./pub-id[@pub-id-type="arxiv"]/text()', builder.add_uid),
                ('./pub-id[@pub-id-type="doi"]/text()', builder.add_uid),
                (
                    'pub-id[@pub-id-type="other"]'
                    '[contains(preceding-sibling::text(),"Report No")]/text()',
                    builder.add_report_number
                ),
                ('./article-title/text()', builder.add_title),
                ('../label/text()', lambda x: builder.set_label(x.strip('[].')))
            ]

            for xpath, field_handler in fields:
                # A single evaluation per expression is enough; values that
                # are empty or missing never reach the builder.
                value = citation_node.xpath(xpath).extract_first()
                if value:
                    field_handler(value)

            # Text remaining once the listed elements are stripped is stored
            # as miscellaneous information.
            remainder = remove_tags(
                citation_node,
                strip='self::person-group'
                      '|self::pub-id'
                      '|self::article-title'
                      '|self::volume'
                      '|self::issue'
                      '|self::year'
                      '|self::label'
                      '|self::publisher-name'
                      '|self::source[../@publication-type!="proc"]'
                      '|self::object-id'
                      '|self::page-range'
                      '|self::issn'
            ).strip('"\';,. \t\n\r').replace('()', '')
            if remainder:
                builder.add_misc(remainder)

            for editor in self.get_reference_authors(citation_node, 'editor'):
                builder.add_author(editor, 'editor')

            for author in self.get_reference_authors(citation_node, 'author'):
                builder.add_author(author, 'author')

            page_range = citation_node.xpath('./page-range/text()').extract_first()
            if page_range:
                page_artid = split_page_artid(page_range)
                builder.set_page_artid(*page_artid)

            yield builder.obj