Пример #1
0
def test_rich_ingest_release():
    """Check the ingest request built for a release with rich container metadata.

    Covers the default request (DOI-based URL, "pdf" ingest type) and the
    switch to the "html" ingest type once a specific container ident is set.
    """
    # preservation / directory metadata attached to the container
    container_extra = {
        "ia": {
            "sim": {"year_spans": [[1000, 1100]]},
        },
        "kbart": {
            "lockss": {"year_spans": [[1200, 1300]]},
            "jstor": {"year_spans": [[1950, 1960], [1980, 2005]]},
        },
        "sherpa_romeo": {"color": "blue"},
        "doaj": {"as_of": "2010-02-03"},
    }
    references = [
        ReleaseRef(),
        ReleaseRef(target_release_id="iznnn644szdwva7khyxqzc73bi"),
    ]

    release = ReleaseEntity(
        title="something",
        ident="iznnn644szdwva7khyxqzc5555",
        release_year=1234,
        license_slug="CC-BY-NC",
        ext_ids=ReleaseExtIds(doi="10.123/456"),
        refs=references,
    )
    release.state = "active"
    release.container = ContainerEntity(
        name="dummy journal",
        extra=container_extra,
    )

    request = release_ingest_request(release)
    assert request is not None
    assert request["base_url"] == "https://doi.org/10.123/456"
    ext_ids = request["ext_ids"]
    assert ext_ids["doi"] == "10.123/456"
    assert ext_ids.get("pmcid") is None
    assert request["ingest_type"] == "pdf"

    # check ingest type ("d-lib"): with this container ident the request
    # is expected to ask for an HTML ingest instead of a PDF one
    release.container_id = "ugbiirfvufgcjkx33r3cmemcuu"
    request = release_ingest_request(release)
    assert request["ingest_type"] == "html"
Пример #2
0
def test_basic_ingest_release(crossref_importer):
    """Check that a parsed Crossref record yields an ingest request.

    Loads a single Crossref work from the test fixtures, converts it to a
    release with ``crossref_importer.parse_record``, marks it active and
    asserts that ``release_ingest_request`` returns a request for it.
    """
    # Read with an explicit encoding so the test does not depend on the
    # platform's locale default.
    with open('tests/files/crossref-works.single.json', 'r', encoding='utf-8') as f:
        # not a single line: presumably the fixture is one JSON document
        # spread over several lines, so the whole file is parsed at once
        raw = json.load(f)
    # the file handle is no longer needed once the JSON is in memory
    r = crossref_importer.parse_record(raw)
    r.state = 'active'
    req = release_ingest_request(r)
    assert req is not None
    assert req is not None
Пример #3
0
def test_rich_ingest_release():
    """Check the ingest request built for a release with rich container metadata.

    The release carries a DOI, so the request is expected to point at the
    doi.org resolver URL and to expose the DOI (and no PMCID) in ``ext_ids``.
    """
    # preservation / directory metadata attached to the container
    container_extra = {
        "ia": {
            "sim": {"year_spans": [[1000, 1100]]},
        },
        "kbart": {
            "lockss": {"year_spans": [[1200, 1300]]},
            "jstor": {"year_spans": [[1950, 1960], [1980, 2005]]},
        },
        "sherpa_romeo": {"color": "blue"},
        "doaj": {"as_of": "2010-02-03"},
    }
    references = [
        ReleaseRef(),
        ReleaseRef(target_release_id="iznnn644szdwva7khyxqzc73bi"),
    ]

    release = ReleaseEntity(
        title="something",
        ident="iznnn644szdwva7khyxqzc5555",
        release_year=1234,
        license_slug="CC-BY-NC",
        ext_ids=ReleaseExtIds(doi="10.123/456"),
        refs=references,
    )
    release.state = 'active'
    release.container = ContainerEntity(
        name="dummy journal",
        extra=container_extra,
    )

    request = release_ingest_request(release)
    assert request is not None
    assert request['base_url'] == 'https://doi.org/10.123/456'
    ext_ids = request['ext_ids']
    assert ext_ids['doi'] == '10.123/456'
    assert ext_ids.get('pmcid') is None
Пример #4
0
    def run(self):
        """Consume changelog entries from Kafka and republish touched entities.

        For every changelog message, the current version of each edited
        file, container and release is fetched through ``self.api`` and
        produced (JSON-encoded, keyed by ident) to the matching topic.
        Releases referenced by an edited file are re-published as well. For
        releases that were newly created in the changelog entry and have no
        files, an ingest request may additionally be produced, subject to
        ``self.want_live_ingest``.

        The offset of a message is only stored after the producer has been
        flushed, so a crash mid-message leads to re-processing rather than
        loss.

        This method loops forever. It raises ``KafkaException`` on consumer
        errors, commit errors, rebalance errors and producer delivery
        failures.
        """

        def fail_fast(err, msg):
            # producer delivery callback: any failed delivery is fatal
            if err is not None:
                print("Kafka producer delivery error: {}".format(err))
                print("Bailing out...")
                # TODO: should it be sys.exit(-1)?
                raise KafkaException(err)

        def on_commit(err, partitions):
            # consumer commit callback: fail on a global commit error...
            if err is not None:
                print("Kafka consumer commit error: {}".format(err))
                print("Bailing out...")
                # TODO: should it be sys.exit(-1)?
                raise KafkaException(err)
            for p in partitions:
                # ...and on partition-specific commit errors
                print(p)
                if p.error:
                    print("Kafka consumer commit error: {}".format(p.error))
                    print("Bailing out...")
                    # TODO: should it be sys.exit(-1)?
                    raise KafkaException(p.error)
            print("Kafka consumer commit successful")

        def on_rebalance(consumer, partitions):
            # used for both assignment and revocation
            for p in partitions:
                if p.error:
                    raise KafkaException(p.error)
            print("Kafka partitions rebalanced: {} / {}".format(
                consumer, partitions))

        def publish(topic, payload, key=None):
            # JSON-encode ``payload`` and queue it on ``topic``; delivery
            # failures surface through ``fail_fast``
            producer.produce(
                topic,
                json.dumps(payload).encode('utf-8'),
                key=None if key is None else key.encode('utf-8'),
                on_delivery=fail_fast,
            )

        def publish_entity(topic, entity, ident):
            # convert an API entity to plain JSON-compatible data and
            # publish it keyed by its ident
            entity_dict = self.api.api_client.sanitize_for_serialization(entity)
            publish(topic, entity_dict, key=ident)

        consumer_conf = self.kafka_config.copy()
        consumer_conf.update({
            'group.id': self.consumer_group,
            # register the commit-specific callback (not the producer one)
            # so that per-partition commit errors are fatal too
            'on_commit': on_commit,
            # messages don't have offset marked as stored until pushed to
            # elastic, but we do auto-commit stored offsets to broker
            'enable.auto.commit': True,
            'enable.auto.offset.store': False,
            # user code timeout; if no poll after this long, assume user code
            # hung and rebalance (default: 5min)
            'max.poll.interval.ms': 180000,
            'default.topic.config': {
                'auto.offset.reset': 'latest',
            },
        })
        consumer = Consumer(consumer_conf)

        producer_conf = self.kafka_config.copy()
        producer_conf.update({
            'delivery.report.only.error': True,
            'default.topic.config': {
                'request.required.acks': -1, # all brokers must confirm
            },
        })
        producer = Producer(producer_conf)

        consumer.subscribe([self.consume_topic],
            on_assign=on_rebalance,
            on_revoke=on_rebalance,
        )
        print("Kafka consuming {}".format(self.consume_topic))

        while True:
            msg = consumer.poll(self.poll_interval)
            if not msg:
                print("nothing new from kafka (poll_interval: {} sec)".format(self.poll_interval))
                continue
            if msg.error():
                raise KafkaException(msg.error())

            cle = json.loads(msg.value().decode('utf-8'))
            print("processing changelog index {}".format(cle['index']))

            edits = cle['editgroup']['edits']
            release_edits = edits['releases']
            release_ids = [edit['ident'] for edit in release_edits]
            # direct release edits which are not updates or redirects; a set
            # because it is only used for membership tests below
            new_release_ids = {
                edit['ident']
                for edit in release_edits
                if not edit.get('prev_revision') and not edit.get('redirect_ident')
            }
            file_ids = {edit['ident'] for edit in edits['files']}
            container_ids = {edit['ident'] for edit in edits['containers']}
            # collected for the work-publishing TODO at the end of the loop
            work_ids = {edit['ident'] for edit in edits['works']}

            # TODO: do these fetches in parallel using a thread pool?
            for ident in file_ids:
                file_entity = self.api.get_file(ident, expand=None)
                # update release when a file changes
                # TODO: fetch old revision as well, and only update
                # releases for which list changed
                release_ids.extend(file_entity.release_ids or [])
                publish_entity(self.file_topic, file_entity, ident)

            for ident in container_ids:
                container = self.api.get_container(ident)
                publish_entity(self.container_topic, container, ident)

            for ident in set(release_ids):
                release = self.api.get_release(ident, expand="files,filesets,webcaptures,container")
                if release.work_id:
                    work_ids.add(release.work_id)
                publish_entity(self.release_topic, release, ident)
                # filter to "new" active releases with no matched files
                if release.ident in new_release_ids:
                    ir = release_ingest_request(release, ingest_request_source='fatcat-changelog')
                    if ir and not release.files and self.want_live_ingest(release, ir):
                        # ingest requests are published without a key
                        publish(self.ingest_file_request_topic, ir)

            # make sure everything is delivered before marking the changelog
            # message as processed
            producer.flush()
            # TODO: publish updated 'work' entities to a topic
            consumer.store_offsets(message=msg)
Пример #5
0
def _run_search_dump(args, search):
    """Turn the releases matched by ``search`` into ingest requests.

    The search is narrowed using the options on ``args`` (limit, year
    bounds, open-access flag, release types). Each hit is re-fetched
    through ``args.api`` and converted with ``release_ingest_request``.
    Resulting requests are produced to Kafka when ``args.enqueue_kafka`` is
    set, printed as JSON lines to stdout otherwise, and only counted when
    ``args.dry_run`` is set. Counters are printed to stderr at the end.
    """
    if args.dry_run:
        print("=== THIS IS A DRY RUN ===")

    topic = "sandcrawler-{}.ingest-file-requests".format(args.env)
    producer = None
    if args.enqueue_kafka:
        print("Will send ingest requests to kafka topic: {}".format(topic), file=sys.stderr)
        producer = simple_kafka_producer(args.kafka_hosts)

    if args.limit is not None:
        search = search[:args.limit]

    # year bounds: require a release year, then bound the release date
    if args.before_year:
        with_year = search.filter("exists", field="release_year")
        search = with_year.filter("range", release_date={"lt": args.before_year})
    if args.after_year:
        with_year = search.filter("exists", field="release_year")
        search = with_year.filter("range", release_date={"gte": args.after_year})

    if not args.allow_non_oa:
        search = search.filter("term", is_oa=True)

    if args.release_types:
        wanted_types = args.release_types.split(',')
        search = search.filter("terms", release_type=wanted_types)
    else:
        # without an explicit selection, leave out stubs and components
        excluded = Q("terms", release_type=["stub", "component"])
        search = search.filter("bool", must_not=[excluded])

    stats = Counter({
        'ingest_request': 0,
        'elasticsearch_release': 0,
        'estimate': search.count(),
    })
    print("Expecting {} release objects in search queries".format(stats['estimate']), file=sys.stderr)

    # don't try to clean up scroll if we are connected to public server (behind
    # nginx proxy that disallows DELETE)
    public_endpoints = (
        'https://search.fatcat.wiki',
        'https://search.qa.fatcat.wiki',
    )
    if args.elasticsearch_endpoint in public_endpoints:
        search = search.params(clear_scroll=False)

    for hit in search.scan():
        if args.limit and stats['ingest_request'] >= args.limit:
            break
        stats['elasticsearch_release'] += 1
        request = release_ingest_request(
            args.api.get_release(hit.ident),
            ingest_request_source="fatcat-ingest",
        )
        if not request:
            continue
        if args.force_recrawl:
            request['force_recrawl'] = True
        stats['ingest_request'] += 1
        if args.dry_run:
            continue
        if producer is None:
            # no kafka: emit the request as a JSON line on stdout
            print(json.dumps(request))
            continue
        producer.produce(
            topic,
            json.dumps(request).encode('utf-8'),
            on_delivery=kafka_fail_fast,
        )
        stats['kafka'] += 1

    if producer is not None:
        producer.flush()
    print(stats, file=sys.stderr)
    if args.dry_run:
        print("=== THIS WAS A DRY RUN ===")
Пример #6
0
def _run_search_dump(args: argparse.Namespace, search: Search) -> None:
    """Turn the releases matched by ``search`` into ingest requests.

    The search is narrowed using the options on ``args`` (limit, year
    bounds, open-access flag, release types). Each hit is re-fetched
    through ``args.api`` and converted with ``release_ingest_request``
    using ``args.ingest_type``. Resulting requests are produced to Kafka
    when ``args.enqueue_kafka`` is set, printed as JSON lines to stdout
    otherwise, and only counted when ``args.dry_run`` is set. Counters are
    printed to stderr at the end.
    """
    if args.dry_run:
        print("=== THIS IS A DRY RUN ===")

    # an explicitly requested topic wins over the per-environment default
    topic = args.kafka_request_topic or (
        "sandcrawler-{}.ingest-file-requests-daily".format(args.env))
    producer = None
    if args.enqueue_kafka:
        print(
            "Will send ingest requests to kafka topic: {}".format(topic),
            file=sys.stderr,
        )
        producer = simple_kafka_producer(args.kafka_hosts)

    if args.limit is not None:
        search = search[:args.limit]

    # year bounds: require a release year, then bound the release date
    if args.before_year:
        with_year = search.filter("exists", field="release_year")
        search = with_year.filter(
            "range", release_date={"lt": args.before_year})
    if args.after_year:
        with_year = search.filter("exists", field="release_year")
        search = with_year.filter(
            "range", release_date={"gte": args.after_year})

    if not args.allow_non_oa:
        search = search.filter("term", is_oa=True)

    if args.release_types:
        wanted_types = args.release_types.split(",")
        search = search.filter("terms", release_type=wanted_types)
    else:
        # without an explicit selection, leave out stubs and components
        excluded = Q("terms", release_type=["stub", "component"])
        search = search.filter("bool", must_not=[excluded])

    search = search.params()
    stats = Counter({
        "ingest_request": 0,
        "elasticsearch_release": 0,
        "estimate": search.count(),
    })
    print(
        "Expecting {} release objects in search queries".format(
            stats["estimate"]),
        file=sys.stderr,
    )

    for hit in search.scan():
        if args.limit and stats["ingest_request"] >= args.limit:
            break
        stats["elasticsearch_release"] += 1
        request = release_ingest_request(
            args.api.get_release(hit.ident),
            ingest_request_source="fatcat-ingest",
            ingest_type=args.ingest_type,
        )
        if not request:
            continue
        if args.force_recrawl:
            request["force_recrawl"] = True
        stats["ingest_request"] += 1
        if args.dry_run:
            continue
        if producer is None:
            # no kafka: emit the request as a JSON line on stdout
            print(json.dumps(request))
            continue
        producer.produce(
            topic,
            json.dumps(request).encode("utf-8"),
            on_delivery=kafka_fail_fast,
        )
        stats["kafka"] += 1

    if producer is not None:
        producer.flush()
    print(stats, file=sys.stderr)
    if args.dry_run:
        print("=== THIS WAS A DRY RUN ===")
Пример #7
0
    def run(self) -> None:
        """Consume changelog entries from Kafka and republish touched entities.

        For every changelog message, the current version of each edited
        file, container and release is fetched through ``self.api`` and
        produced (JSON-encoded, keyed by ident) to the matching topic.
        Releases referenced by an edited file, fileset or webcapture are
        re-published as well. For releases that were newly created in the
        changelog entry and have no files, an ingest request may
        additionally be produced, subject to ``self.want_live_ingest``.
        Finally a small notification record is produced to
        ``self.work_ident_topic`` for every affected work ident.

        The offset of a message is only stored after the producer has been
        flushed, so a crash mid-message leads to re-processing rather than
        loss.

        This method loops forever. It raises ``KafkaException`` on consumer
        errors, commit errors, rebalance errors and producer delivery
        failures.
        """

        def fail_fast(err: Any, _msg: Any) -> None:
            # producer delivery callback: any failed delivery is fatal
            if err is not None:
                print("Kafka producer delivery error: {}".format(err))
                print("Bailing out...")
                # TODO: should it be sys.exit(-1)?
                raise KafkaException(err)

        def on_commit(err: Any, partitions: List[Any]) -> None:
            # consumer commit callback: fail on a global commit error...
            if err is not None:
                print("Kafka consumer commit error: {}".format(err))
                print("Bailing out...")
                # TODO: should it be sys.exit(-1)?
                raise KafkaException(err)
            for p in partitions:
                # ...and on partition-specific commit errors
                print(p)
                if p.error:
                    print("Kafka consumer commit error: {}".format(p.error))
                    print("Bailing out...")
                    # TODO: should it be sys.exit(-1)?
                    raise KafkaException(p.error)
            print("Kafka consumer commit successful")

        def on_rebalance(consumer: Consumer, partitions: List[Any]) -> None:
            # used for both assignment and revocation
            for p in partitions:
                if p.error:
                    raise KafkaException(p.error)
            print("Kafka partitions rebalanced: {} / {}".format(
                consumer, partitions))

        def publish(topic: str, payload: Any, key: Any = None) -> None:
            # JSON-encode ``payload`` and queue it on ``topic``; delivery
            # failures surface through ``fail_fast``
            producer.produce(
                topic,
                json.dumps(payload).encode("utf-8"),
                key=None if key is None else key.encode("utf-8"),
                on_delivery=fail_fast,
            )

        def publish_entity(topic: str, entity: Any, ident: str) -> None:
            # convert an API entity to plain JSON-compatible data and
            # publish it keyed by its ident
            entity_dict = self.api.api_client.sanitize_for_serialization(
                entity)
            publish(topic, entity_dict, key=ident)

        consumer_conf = self.kafka_config.copy()
        consumer_conf.update({
            "group.id": self.consumer_group,
            # register the commit-specific callback (not the producer one)
            # so that per-partition commit errors are fatal too
            "on_commit": on_commit,
            # messages don't have offset marked as stored until pushed to
            # elastic, but we do auto-commit stored offsets to broker
            "enable.auto.commit": True,
            "enable.auto.offset.store": False,
            # user code timeout; if no poll after this long, assume user code
            # hung and rebalance (default: 5min)
            "max.poll.interval.ms": 180000,
            "default.topic.config": {
                "auto.offset.reset": "latest",
            },
        })
        consumer = Consumer(consumer_conf)

        producer_conf = self.kafka_config.copy()
        producer_conf.update({
            "delivery.report.only.error": True,
            "default.topic.config": {
                "request.required.acks": -1,  # all brokers must confirm
            },
        })
        producer = Producer(producer_conf)

        consumer.subscribe(
            [self.consume_topic],
            on_assign=on_rebalance,
            on_revoke=on_rebalance,
        )
        print("Kafka consuming {}".format(self.consume_topic))

        while True:
            msg = consumer.poll(self.poll_interval)
            if not msg:
                print("nothing new from kafka (poll_interval: {} sec)".format(
                    self.poll_interval))
                continue
            if msg.error():
                raise KafkaException(msg.error())

            cle = json.loads(msg.value().decode("utf-8"))
            print("processing changelog index {}".format(cle["index"]))

            edits = cle["editgroup"]["edits"]
            release_edits = edits["releases"]
            release_ids = [edit["ident"] for edit in release_edits]
            # direct release edits which are not updates or redirects; a set
            # because it is only used for membership tests below
            new_release_ids = {
                edit["ident"]
                for edit in release_edits
                if not edit.get("prev_revision")
                and not edit.get("redirect_ident")
            }
            file_ids = {edit["ident"] for edit in edits["files"]}
            fileset_ids = {edit["ident"] for edit in edits["filesets"]}
            webcapture_ids = {edit["ident"] for edit in edits["webcaptures"]}
            container_ids = {edit["ident"] for edit in edits["containers"]}
            work_ids = {edit["ident"] for edit in edits["works"]}

            # TODO: do these fetches in parallel using a thread pool?
            for ident in file_ids:
                file_entity = self.api.get_file(ident, expand=None)
                # update release when a file changes
                # TODO: also fetch old version of file and update any *removed*
                # release idents (and same for filesets, webcapture updates)
                release_ids.extend(file_entity.release_ids or [])
                publish_entity(self.file_topic, file_entity, ident)

            # TODO: topic for fileset updates
            for ident in fileset_ids:
                fileset_entity = self.api.get_fileset(ident, expand=None)
                # update release when a fileset changes
                release_ids.extend(fileset_entity.release_ids or [])

            # TODO: topic for webcapture updates
            for ident in webcapture_ids:
                webcapture_entity = self.api.get_webcapture(ident, expand=None)
                # update release when a webcapture changes
                release_ids.extend(webcapture_entity.release_ids or [])

            for ident in container_ids:
                container = self.api.get_container(ident)
                publish_entity(self.container_topic, container, ident)

            for ident in set(release_ids):
                release = self.api.get_release(
                    ident,
                    expand="files,filesets,webcaptures,container,creators")
                if release.work_id:
                    work_ids.add(release.work_id)
                publish_entity(self.release_topic, release, ident)
                # for ingest requests, filter to "new" active releases with no matched files
                if release.ident in new_release_ids:
                    ir = release_ingest_request(
                        release, ingest_request_source="fatcat-changelog")
                    if ir and not release.files and self.want_live_ingest(
                            release, ir):
                        # ingest requests are published without a key
                        publish(self.ingest_file_request_topic, ir)

            # send work updates (just ident and changelog metadata) to scholar for re-indexing
            for ident in work_ids:
                assert ident
                key = f"work_{ident}"
                work_ident_dict = dict(
                    key=key,
                    type="fatcat_work",
                    work_ident=ident,
                    updated=cle["timestamp"],
                    fatcat_changelog_index=cle["index"],
                )
                publish(self.work_ident_topic, work_ident_dict, key=key)

            # make sure everything is delivered before marking the changelog
            # message as processed
            producer.flush()
            # TODO: publish updated 'work' entities to a topic
            consumer.store_offsets(message=msg)