Example #1
def run_elasticsearch_releases(args: argparse.Namespace) -> None:
    for line in args.json_input:
        line = line.strip()
        if not line:
            continue
        entity = entity_from_json(line,
                                  ReleaseEntity,
                                  api_client=args.api.api_client)
        if entity.state != "active":
            continue
        args.json_output.write(
            json.dumps(release_to_elasticsearch(entity)) + "\n")
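A minimal sketch of how this handler might be wired into an argparse-based CLI; the file arguments, default host URL, and the public_api() helper import are illustrative assumptions, not the project's actual entrypoint.

import argparse
import sys

from fatcat_tools import public_api  # assumed import path for the API helper


def main() -> None:
    parser = argparse.ArgumentParser()
    parser.add_argument("json_input",
                        type=argparse.FileType("r"),
                        help="release entities, one JSON object per line")
    parser.add_argument("json_output",
                        nargs="?",
                        type=argparse.FileType("w"),
                        default=sys.stdout,
                        help="where to write elasticsearch documents")
    args = parser.parse_args()
    # hypothetical API client setup; entity_from_json() above only needs
    # args.api.api_client to deserialize each line
    args.api = public_api("https://api.fatcat.wiki/v0")
    run_elasticsearch_releases(args)


if __name__ == "__main__":
    main()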
Example #2
    def want_live_ingest(self, release, ingest_request):
        """
        This function looks at ingest requests and decides whether they are
        worth enqueueing for ingest.

        In theory, crawling all DOIs to a landing page is valuable. This method
        is intended to be an operational point of control to reduce load on
        daily ingest crawling (via wayback SPN).
        """

        link_source = ingest_request.get('ingest_request')
        ingest_type = ingest_request.get('ingest_type')
        doi = ingest_request.get('ext_ids', {}).get('doi')

        is_document = release.release_type in (
            'article-journal',
            'paper-conference',
            'article',
            'report',
            'chapter',
            'manuscript',
            'review',
            'thesis',
            'letter',
            'editorial',
            'abstract',
            'entry',
            'patent',
            'post',
            'review-book',
        )
        is_not_pdf = release.release_type in (
            'dataset',
            'stub',
            'software',
            'figure',
            'graphic',
        )

        # accept list sets a default "crawl it" despite OA metadata for
        # known-OA DOI prefixes
        in_acceptlist = False
        if doi:
            for prefix in self.live_pdf_ingest_doi_prefix_acceptlist:
                if doi.startswith(prefix):
                    in_acceptlist = True

        if self.ingest_oa_only and link_source not in ('arxiv', 'pmc'):
            es = release_to_elasticsearch(release)
            # most datacite documents are in IRs and should be crawled
            is_datacite_doc = False
            if release.extra and ('datacite' in release.extra) and is_document:
                is_datacite_doc = True
            if not (es['is_oa'] or in_acceptlist or is_datacite_doc):
                return False

        # if ingest_type is pdf but release_type is almost certainly not a PDF,
        # skip it. This is mostly a datacite thing.
        if ingest_type == "pdf" and is_not_pdf:
            return False

        if ingest_type == "pdf" and doi:
            for prefix in self.ingest_pdf_doi_prefix_blocklist:
                if doi.startswith(prefix):
                    return False

        return True
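A hedged sketch of how a caller might apply this filter to a batch of ingest requests before enqueueing them; the helper name, the produce() callback, and the example request shape are illustrative assumptions.

def enqueue_wanted_ingests(worker, release, ingest_requests, produce):
    """Publish only the ingest requests the worker considers worth crawling.

    `worker` is assumed to expose want_live_ingest() as above; `produce` is a
    stand-in for whatever actually writes to the ingest request queue.
    """
    for req in ingest_requests:
        if worker.want_live_ingest(release, req):
            produce(req)


# illustrative request, showing a couple of the keys the filter reads
example_request = {
    "ingest_type": "pdf",
    "ext_ids": {"doi": "10.1234/example.doi"},
}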
Example #3
def test_rich_elasticsearch_convert():
    r = ReleaseEntity(
        title="something",
        release_year=1234,
        license_slug="CC-BY-NC",
        ext_ids=ReleaseExtIds(),
        refs=[
            ReleaseRef(),
            ReleaseRef(target_release_id="iznnn644szdwva7khyxqzc73bi"),
        ],
    )
    r.state = "active"
    r.container = ContainerEntity(
        name="dummy journal",
        extra={
            "ia": {
                "sim": {
                    "year_spans": [[1000, 1100]],
                },
            },
            "kbart": {
                "lockss": {
                    "year_spans": [[1200, 1300]],
                },
                "jstor": {
                    "year_spans": [[1000, 1300], [1950, 1960], [1980, 2005]],
                },
            },
            "sherpa_romeo": {
                "color": "blue"
            },
            "doaj": {
                "as_of": "2010-02-03"
            },
        },
    )
    r.files = [
        FileEntity(
            mimetype="application/pdf",
            urls=[
                FileUrl(rel="dweb", url="dat://a954329dlk/thingie"),
                FileUrl(
                    rel="webarchive",
                    url=
                    "https://web.archive.org/web/20001122030405/http://example.com",
                ),
                FileUrl(rel="web",
                        url="https://archive.org/details/blah/file.pdf"),
            ],
            extra={
                "shadows": {},
            },
        )
    ]
    es = release_to_elasticsearch(r)
    assert es["release_year"] == r.release_year
    assert es["file_count"] == 1
    assert es["fileset_count"] == 0
    assert es["webcapture_count"] == 0
    assert es["ref_count"] == 2
    assert es["ref_linked_count"] == 1

    assert es["preservation"] == "bright"
    assert es["is_oa"] is True
    assert es["is_longtail_oa"] is False
    assert es["is_preserved"] is True
    assert es["in_web"] is True
    assert es["in_dweb"] is True
    assert es["in_ia"] is True
    assert es["in_ia_sim"] is False
    assert es["in_kbart"] is True
    assert es["in_jstor"] is True
Example #4
def test_elasticsearch_release_from_json():
    r = entity_from_json(
        open("./tests/files/release_etodop5banbndg3faecnfm6ozi.json",
             "r").read(), ReleaseEntity)
    es = release_to_elasticsearch(r)

    assert es["subtitle"] == "Correpondence"
    assert es["ident"] == "etodop5banbndg3faecnfm6ozi"
    assert (es["container_name"] ==
            "BJOG: an International Journal of Obstetrics and Gynaecology")
    assert es["first_page"] == "1404"
    assert es["issue"] == "11"
    assert es["volume"] == "118"
    assert es["number"] is None

    assert es["preservation"] == "dark"
    assert es["is_oa"] is False
    assert es["is_longtail_oa"] is False
    assert es["is_preserved"] is True
    assert es["in_web"] is False
    assert es["in_dweb"] is False
    assert es["in_ia"] is False
    assert es["in_ia_sim"] is True
    assert es["in_kbart"] is True
    assert es["in_jstor"] is False

    # this release has a fileset, and no file
    r = entity_from_json(
        open("./tests/files/release_3mssw2qnlnblbk7oqyv2dafgey.json",
             "r").read(), ReleaseEntity)
    es = release_to_elasticsearch(r)

    assert es["title"] == "Jakobshavn Glacier Bed Elevation"
    assert es["ident"] == "3mssw2qnlnblbk7oqyv2dafgey"
    assert es["file_count"] == 0
    assert es["fileset_count"] == 1
    assert es["webcapture_count"] == 0

    assert es["preservation"] == "dark"
    assert es["is_oa"] is True
    assert es["is_longtail_oa"] is False
    assert es["is_preserved"] is True
    assert es["in_web"] is True
    assert es["in_dweb"] is True
    assert es["in_ia"] is False
    assert es["in_ia_sim"] is False
    assert es["in_kbart"] is False
    assert es["in_jstor"] is False

    # this release has a web capture, and no file (edited the JSON to remove file)
    r = entity_from_json(
        open("./tests/files/release_mjtqtuyhwfdr7j2c3l36uor7uy.json",
             "r").read(), ReleaseEntity)
    es = release_to_elasticsearch(r)

    assert es["title"] == "Rethinking Personal Digital Archiving, Part 1"
    assert es["ident"] == "mjtqtuyhwfdr7j2c3l36uor7uy"
    assert es["file_count"] == 0
    assert es["fileset_count"] == 0
    assert es["webcapture_count"] == 1

    assert es["preservation"] == "bright"
    assert es["is_oa"] is True
    assert es["is_longtail_oa"] is False
    assert es["is_preserved"] is True
    assert es["in_web"] is True
    assert es["in_dweb"] is False
    assert es["in_ia"] is True
    assert es["in_ia_sim"] is False
    assert es["in_kbart"] is False
    assert es["in_jstor"] is False
Example #5
def enrich_release_entity(entity: ReleaseEntity) -> ReleaseEntity:
    if entity.state in ("redirect", "deleted"):
        return entity
    if entity.state == "active":
        entity._es = release_to_elasticsearch(entity, force_bool=False)
    if entity.container and entity.container.state == "active":
        entity.container._es = container_to_elasticsearch(entity.container,
                                                          force_bool=False)
    if entity.files:
        # remove shadows-only files with no URLs
        entity.files = [
            f for f in entity.files
            if not (f.extra and f.extra.get("shadows") and not f.urls)
        ]
    if entity.filesets:
        for fs in entity.filesets:
            fs._total_size = sum([f.size for f in fs.manifest])
    if entity.webcaptures:
        for wc in entity.webcaptures:
            wc._wayback_suffix = wayback_suffix(wc)
    for ref in entity.refs:
        # this is a UI hack to get rid of XML crud in unstructured refs like:
        # LOCKSS (2014) Available: <ext-link
        # xmlns:xlink="http://www.w3.org/1999/xlink" ext-link-type="uri"
        # xlink:href="http://lockss.org/"
        # xlink:type="simple">http://lockss.org/</ext-link>. Accessed: 2014
        # November 1.
        if ref.extra and ref.extra.get("unstructured"):
            ref.extra["unstructured"] = strip_extlink_xml(
                ref.extra["unstructured"])
    # for backwards compatibility, copy extra['subtitle'] to subtitle
    if not entity.subtitle and entity.extra and entity.extra.get("subtitle"):
        if isinstance(entity.extra["subtitle"], str):
            entity.subtitle = entity.extra["subtitle"]
        elif isinstance(entity.extra["subtitle"], list):
            entity.subtitle = entity.extra["subtitle"][0] or None
    # author list to display; ensure it's sorted by index (any authors with
    # index=None go to the end of the list)
    authors = [
        c for c in entity.contribs if c.role in ("author", None) and (
            c.surname or c.raw_name or (c.creator and c.creator.surname))
    ]
    entity._authors = sorted(authors,
                             key=lambda c:
                             (c.index is None and 99999999) or c.index)
    # need authors, title for citeproc to work
    entity._can_citeproc = bool(entity._authors) and bool(entity.title)
    if entity.abstracts and entity.abstracts[0].mimetype:
        # hack to show plain text instead of latex abstracts
        if "latex" in entity.abstracts[0].mimetype:
            entity.abstracts.reverse()
        # hack to (partially) clean up common JATS abstract display case
        if entity.abstracts[0].mimetype == "application/xml+jats":
            for tag in ("p", "jats", "jats:p"):
                content = entity.abstracts[0].content
                content = content.replace("<{}>".format(tag), "")
                content = content.replace("</{}>".format(tag), "")
                # ugh, double encoding happens
                content = content.replace("&lt;/{}&gt;".format(tag), "")
                content = content.replace("&lt;{}&gt;".format(tag), "")
                entity.abstracts[0].content = content
    return entity
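A sketch of where this enrichment step might sit in a web view; the Flask app, the public_api() helper, the expand parameter, and the template name are assumptions for illustration, not the project's actual route.

from flask import Flask, render_template

from fatcat_tools import public_api  # assumed helper, as in Example #1

app = Flask(__name__)
api = public_api("https://api.fatcat.wiki/v0")


@app.route("/release/<ident>")
def release_view(ident: str):
    # fetch with related entities expanded so container, files, filesets, and
    # webcaptures are populated for enrich_release_entity()
    release = api.get_release(ident,
                              expand="container,files,filesets,webcaptures")
    release = enrich_release_entity(release)
    return render_template("release_view.html", release=release)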
Example #6
    def want_live_ingest(self, release, ingest_request):
        """
        This function looks at ingest requests and decides whether they are
        worth enqueueing for ingest.

        In theory, crawling all DOIs to a landing page is valuable. This method
        is intended to be an operational point of control to reduce load on
        daily ingest crawling (via wayback SPN).
        """

        link_source = ingest_request.get('ingest_request')
        ingest_type = ingest_request.get('ingest_type')
        doi = ingest_request.get('ext_ids', {}).get('doi')
        es = release_to_elasticsearch(release)

        is_document = release.release_type in (
            'article',
            'article-journal',
            'article-newspaper',
            'book',
            'chapter',
            'editorial',
            'interview',
            'legal_case',
            'legislation',
            'letter',
            'manuscript',
            'paper-conference',
            'patent',
            'peer_review',
            'post',
            'report',
            'retraction',
            'review',
            'review-book',
            'thesis',
        )
        is_not_pdf = release.release_type in (
            'component',
            'dataset',
            'figure',
            'graphic',
            'software',
            'stub',
        )

        # accept list sets a default "crawl it" despite OA metadata for
        # known-OA DOI prefixes
        in_acceptlist = False
        if doi:
            for prefix in self.live_pdf_ingest_doi_prefix_acceptlist:
                if doi.startswith(prefix):
                    in_acceptlist = True

        if self.ingest_oa_only and link_source not in ('arxiv', 'pmc'):

            # most datacite documents are in IRs and should be crawled
            is_datacite_doc = False
            if release.extra and ('datacite' in release.extra) and is_document:
                is_datacite_doc = True
            if not (es['is_oa'] or in_acceptlist or is_datacite_doc):
                return False

        # big publishers *generally* have accurate OA metadata, use
        # preservation networks, and block our crawlers. So unless OA, or
        # explicitly on accept list, or not preserved, skip crawling
        if (es.get('publisher_type') == 'big5' and es.get('is_preserved')
                and not (es['is_oa'] or in_acceptlist)):
            return False

        # if ingest_type is pdf but release_type is almost certainly not a PDF,
        # skip it. This is mostly a datacite thing.
        if ingest_type == "pdf" and is_not_pdf:
            return False

        if ingest_type == "pdf" and doi:
            for prefix in self.ingest_pdf_doi_prefix_blocklist:
                if doi.startswith(prefix):
                    return False

        # figshare
        if doi and (doi.startswith('10.6084/') or doi.startswith('10.25384/')):
            # don't crawl "most recent version" (aka "group") DOIs
            if not release.version:
                return False

        # zenodo
        if doi and doi.startswith('10.5281/'):
            # if this is a "grouping" DOI of multiple "version" DOIs, do not crawl (will crawl the versioned DOIs)
            if release.extra and release.extra.get('relations'):
                for rel in release.extra['relations']:
                    if (rel.get('relationType') == 'HasVersion' and rel.get(
                            'relatedIdentifier', '').startswith('10.5281/')):
                        return False

        return True
Example #7
    def want_live_ingest(self, release: ReleaseEntity,
                         ingest_request: Dict[str, Any]) -> bool:
        """
        This function looks at ingest requests and decides whether they are
        worth enqueueing for ingest.

        In theory, crawling all DOIs to a landing page is valuable. This method
        is intended to be an operational point of control to reduce load on
        daily ingest crawling (via wayback SPN).
        """

        link_source = ingest_request.get("ingest_request")
        ingest_type = ingest_request.get("ingest_type")
        doi = ingest_request.get("ext_ids", {}).get("doi")
        es = release_to_elasticsearch(release)

        is_document = release.release_type in (
            "article",
            "article-journal",
            "article-newspaper",
            "book",
            "chapter",
            "editorial",
            "interview",
            "legal_case",
            "legislation",
            "letter",
            "manuscript",
            "paper-conference",
            "patent",
            "peer_review",
            "post",
            "report",
            "retraction",
            "review",
            "review-book",
            "thesis",
        )
        is_not_pdf = release.release_type in (
            "component",
            "dataset",
            "figure",
            "graphic",
            "software",
            "stub",
        )

        # accept list sets a default "crawl it" despite OA metadata for
        # known-OA DOI prefixes
        in_acceptlist = False
        if doi:
            for prefix in self.live_pdf_ingest_doi_prefix_acceptlist:
                if doi.startswith(prefix):
                    in_acceptlist = True

        if self.ingest_oa_only and link_source not in ("arxiv", "pmc"):

            # most datacite documents are in IRs and should be crawled
            is_datacite_doc = False
            if release.extra and ("datacite" in release.extra) and is_document:
                is_datacite_doc = True
            if not (es["is_oa"] or in_acceptlist or is_datacite_doc):
                return False

        # big publishers *generally* have accurate OA metadata, use
        # preservation networks, and block our crawlers. So unless OA, or
        # explicitly on accept list, or not preserved, skip crawling
        if (es.get("publisher_type") == "big5" and es.get("is_preserved")
                and not (es["is_oa"] or in_acceptlist)):
            return False

        # if ingest_type is pdf but release_type is almost certainly not a PDF,
        # skip it. This is mostly a datacite thing.
        if ingest_type == "pdf" and is_not_pdf:
            return False

        if ingest_type == "pdf" and doi:
            for prefix in self.ingest_pdf_doi_prefix_blocklist:
                if doi.startswith(prefix):
                    return False

        # figshare
        if doi and (doi.startswith("10.6084/") or doi.startswith("10.25384/")):
            # don't crawl "most recent version" (aka "group") DOIs
            if not release.version:
                return False

        # zenodo
        if doi and doi.startswith("10.5281/"):
            # if this is a "grouping" DOI of multiple "version" DOIs, do not crawl (will crawl the versioned DOIs)
            if release.extra and release.extra.get("relations"):
                for rel in release.extra["relations"]:
                    if rel.get("relationType") == "HasVersion" and rel.get(
                            "relatedIdentifier", "").startswith("10.5281/"):
                        return False

        return True
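The prefix lists consulted above come from configuration on the worker object; a hedged sketch of what that configuration might look like. The attribute names mirror the code, but the concrete prefixes are placeholders rather than the project's real lists.

# illustrative settings only; a real deployment would load these from config
class IngestFilterSettings:
    ingest_oa_only = True
    # DOI prefixes to crawl even when OA metadata says the release is closed
    live_pdf_ingest_doi_prefix_acceptlist = [
        "10.1101/",  # placeholder example prefix
    ]
    # DOI prefixes never worth a PDF ingest attempt
    ingest_pdf_doi_prefix_blocklist = [
        "10.9999/",  # placeholder example prefix
    ]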