def test_rich_ingest_release():
    r = ReleaseEntity(
        title="something",
        ident="iznnn644szdwva7khyxqzc5555",
        release_year=1234,
        license_slug="CC-BY-NC",
        ext_ids=ReleaseExtIds(doi="10.123/456"),
        refs=[
            ReleaseRef(),
            ReleaseRef(target_release_id="iznnn644szdwva7khyxqzc73bi"),
        ],
    )
    r.state = "active"
    r.container = ContainerEntity(
        name="dummy journal",
        extra={
            "ia": {
                "sim": {
                    "year_spans": [[1000, 1100]],
                },
            },
            "kbart": {
                "lockss": {
                    "year_spans": [[1200, 1300]],
                },
                "jstor": {
                    "year_spans": [[1950, 1960], [1980, 2005]],
                },
            },
            "sherpa_romeo": {"color": "blue"},
            "doaj": {"as_of": "2010-02-03"},
        },
    )
    ir = release_ingest_request(r)
    assert ir is not None
    assert ir["base_url"] == "https://doi.org/10.123/456"
    assert ir["ext_ids"]["doi"] == "10.123/456"
    assert ir["ext_ids"].get("pmcid") is None
    assert ir["ingest_type"] == "pdf"

    # check ingest type ("d-lib")
    r.container_id = "ugbiirfvufgcjkx33r3cmemcuu"
    ir = release_ingest_request(r)
    assert ir["ingest_type"] == "html"
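
# For reference, a minimal sketch of the dict shape release_ingest_request() is
# asserted to return above. This is an illustrative assumption, not the actual
# fatcat_tools transform: the helper name sketch_release_ingest_request() is
# hypothetical, and only the fields exercised by the assertions are shown.
def sketch_release_ingest_request(release):
    # hypothetical: only the active, DOI-backed case from the test is handled
    if release.state != "active" or not release.ext_ids.doi:
        return None
    return {
        "ingest_type": "pdf",  # assumed default; container-based overrides (e.g. "html") not shown
        "base_url": "https://doi.org/{}".format(release.ext_ids.doi),
        "ext_ids": {
            "doi": release.ext_ids.doi,
            "pmcid": release.ext_ids.pmcid,
        },
    }
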
def test_basic_ingest_release(crossref_importer):
    with open('tests/files/crossref-works.single.json', 'r') as f:
        # not a single line
        raw = json.loads(f.read())
        r = crossref_importer.parse_record(raw)
    r.state = 'active'
    req = release_ingest_request(r)
    assert req is not None
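
# Hypothetical sketch of the crossref_importer pytest fixture assumed by the test
# above; the real fixture is defined in the test suite's conftest.py, and the
# constructor arguments, file path, and the 'api' fixture used here are assumptions.
import pytest
from fatcat_tools.importers import CrossrefImporter

@pytest.fixture(scope="function")
def crossref_importer(api):
    # assumed: an ISSN-to-ISSN-L mapping snippet shipped with the test files
    with open("tests/files/ISSN-to-ISSN-L.snip.txt", "r") as issn_file:
        yield CrossrefImporter(api, issn_file, bezerk_mode=True)
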
def test_rich_ingest_release():
    r = ReleaseEntity(
        title="something",
        ident="iznnn644szdwva7khyxqzc5555",
        release_year=1234,
        license_slug="CC-BY-NC",
        ext_ids=ReleaseExtIds(doi="10.123/456"),
        refs=[
            ReleaseRef(),
            ReleaseRef(target_release_id="iznnn644szdwva7khyxqzc73bi"),
        ],
    )
    r.state = 'active'
    r.container = ContainerEntity(
        name="dummy journal",
        extra={
            "ia": {
                "sim": {
                    "year_spans": [[1000, 1100]],
                },
            },
            "kbart": {
                "lockss": {
                    "year_spans": [[1200, 1300]],
                },
                "jstor": {
                    "year_spans": [[1950, 1960], [1980, 2005]],
                },
            },
            "sherpa_romeo": {"color": "blue"},
            "doaj": {"as_of": "2010-02-03"},
        },
    )
    ir = release_ingest_request(r)
    assert ir is not None
    assert ir['base_url'] == 'https://doi.org/10.123/456'
    assert ir['ext_ids']['doi'] == '10.123/456'
    assert ir['ext_ids'].get('pmcid') is None

def run(self):

    def fail_fast(err, msg):
        if err is not None:
            print("Kafka producer delivery error: {}".format(err))
            print("Bailing out...")
            # TODO: should it be sys.exit(-1)?
            raise KafkaException(err)

    def on_commit(err, partitions):
        if err is not None:
            print("Kafka consumer commit error: {}".format(err))
            print("Bailing out...")
            # TODO: should it be sys.exit(-1)?
            raise KafkaException(err)
        for p in partitions:
            # check for partition-specific commit errors
            print(p)
            if p.error:
                print("Kafka consumer commit error: {}".format(p.error))
                print("Bailing out...")
                # TODO: should it be sys.exit(-1)?
                raise KafkaException(p.error)
        print("Kafka consumer commit successful")
        pass

    def on_rebalance(consumer, partitions):
        for p in partitions:
            if p.error:
                raise KafkaException(p.error)
        print("Kafka partitions rebalanced: {} / {}".format(
            consumer, partitions))

    consumer_conf = self.kafka_config.copy()
    consumer_conf.update({
        'group.id': self.consumer_group,
        'on_commit': fail_fast,
        # messages don't have offset marked as stored until pushed to
        # elastic, but we do auto-commit stored offsets to broker
        'enable.auto.commit': True,
        'enable.auto.offset.store': False,
        # user code timeout; if no poll after this long, assume user code
        # hung and rebalance (default: 5min)
        'max.poll.interval.ms': 180000,
        'default.topic.config': {
            'auto.offset.reset': 'latest',
        },
    })
    consumer = Consumer(consumer_conf)

    producer_conf = self.kafka_config.copy()
    producer_conf.update({
        'delivery.report.only.error': True,
        'default.topic.config': {
            'request.required.acks': -1, # all brokers must confirm
        },
    })
    producer = Producer(producer_conf)

    consumer.subscribe([self.consume_topic],
        on_assign=on_rebalance,
        on_revoke=on_rebalance,
    )
    print("Kafka consuming {}".format(self.consume_topic))

    while True:
        msg = consumer.poll(self.poll_interval)
        if not msg:
            print("nothing new from kafka (poll_interval: {} sec)".format(self.poll_interval))
            continue
        if msg.error():
            raise KafkaException(msg.error())

        cle = json.loads(msg.value().decode('utf-8'))
        #print(cle)
        print("processing changelog index {}".format(cle['index']))
        release_ids = []
        new_release_ids = []
        file_ids = []
        container_ids = []
        work_ids = []
        release_edits = cle['editgroup']['edits']['releases']
        for re in release_edits:
            release_ids.append(re['ident'])
            # filter to direct release edits which are not updates
            if not re.get('prev_revision') and not re.get('redirect_ident'):
                new_release_ids.append(re['ident'])
        file_edits = cle['editgroup']['edits']['files']
        for e in file_edits:
            file_ids.append(e['ident'])
        container_edits = cle['editgroup']['edits']['containers']
        for e in container_edits:
            container_ids.append(e['ident'])
        work_edits = cle['editgroup']['edits']['works']
        for e in work_edits:
            work_ids.append(e['ident'])

        # TODO: do these fetches in parallel using a thread pool?
        for ident in set(file_ids):
            file_entity = self.api.get_file(ident, expand=None)
            # update release when a file changes
            # TODO: fetch old revision as well, and only update
            # releases for which list changed
            release_ids.extend(file_entity.release_ids or [])
            file_dict = self.api.api_client.sanitize_for_serialization(file_entity)
            producer.produce(
                self.file_topic,
                json.dumps(file_dict).encode('utf-8'),
                key=ident.encode('utf-8'),
                on_delivery=fail_fast,
            )

        for ident in set(container_ids):
            container = self.api.get_container(ident)
            container_dict = self.api.api_client.sanitize_for_serialization(container)
            producer.produce(
                self.container_topic,
                json.dumps(container_dict).encode('utf-8'),
                key=ident.encode('utf-8'),
                on_delivery=fail_fast,
            )

        for ident in set(release_ids):
            release = self.api.get_release(ident, expand="files,filesets,webcaptures,container")
            work_ids.append(release.work_id)
            release_dict = self.api.api_client.sanitize_for_serialization(release)
            producer.produce(
                self.release_topic,
                json.dumps(release_dict).encode('utf-8'),
                key=ident.encode('utf-8'),
                on_delivery=fail_fast,
            )
            # filter to "new" active releases with no matched files
            if release.ident in new_release_ids:
                ir = release_ingest_request(release, ingest_request_source='fatcat-changelog')
                if ir and not release.files and self.want_live_ingest(release, ir):
                    producer.produce(
                        self.ingest_file_request_topic,
                        json.dumps(ir).encode('utf-8'),
                        #key=None,
                        on_delivery=fail_fast,
                    )

        producer.flush()
        # TODO: publish updated 'work' entities to a topic
        consumer.store_offsets(message=msg)
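
# Illustrative, abbreviated changelog message for the poll loop above; only the
# keys that run() actually reads are shown, and the values are placeholders.
example_changelog_message = {
    "index": 12345,
    "editgroup": {
        "edits": {
            "releases": [
                # no prev_revision/redirect_ident, so treated as a "new" release
                {"ident": "iznnn644szdwva7khyxqzc73bi"},
            ],
            "files": [],
            "containers": [],
            "works": [],
        },
    },
}
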
def _run_search_dump(args, search):

    if args.dry_run:
        print("=== THIS IS A DRY RUN ===")

    kafka_producer = None
    ingest_file_request_topic = "sandcrawler-{}.ingest-file-requests".format(args.env)
    if args.enqueue_kafka:
        print("Will send ingest requests to kafka topic: {}".format(ingest_file_request_topic), file=sys.stderr)
        kafka_producer = simple_kafka_producer(args.kafka_hosts)

    if args.limit is not None:
        search = search[:args.limit]

    if args.before_year:
        search = search \
            .filter("exists", field="release_year") \
            .filter("range", release_date=dict(lt=args.before_year))
    if args.after_year:
        search = search \
            .filter("exists", field="release_year") \
            .filter("range", release_date=dict(gte=args.after_year))

    if not args.allow_non_oa:
        search = search.filter("term", is_oa=True)

    if args.release_types:
        release_types = args.release_types.split(',')
        search = search \
            .filter("terms", release_type=release_types)
    else:
        search = search \
            .filter("bool", must_not=[
                Q("terms", release_type=["stub", "component"])
            ])

    counts = Counter({'ingest_request': 0, 'elasticsearch_release': 0, 'estimate': 0})
    counts['estimate'] = search.count()
    print("Expecting {} release objects in search queries".format(counts['estimate']), file=sys.stderr)

    # don't try to clean up scroll if we are connected to public server (behind
    # nginx proxy that disallows DELETE)
    if args.elasticsearch_endpoint in (
            'https://search.fatcat.wiki',
            'https://search.qa.fatcat.wiki'):
        search = search.params(clear_scroll=False)

    results = search.scan()
    for esr in results:
        if args.limit and counts['ingest_request'] >= args.limit:
            break
        counts['elasticsearch_release'] += 1
        release = args.api.get_release(esr.ident)
        ingest_request = release_ingest_request(
            release,
            ingest_request_source="fatcat-ingest",
        )
        if not ingest_request:
            continue
        if args.force_recrawl:
            ingest_request['force_recrawl'] = True
        counts['ingest_request'] += 1
        if args.dry_run:
            continue
        if kafka_producer is not None:
            kafka_producer.produce(
                ingest_file_request_topic,
                json.dumps(ingest_request).encode('utf-8'),
                #key=None,
                on_delivery=kafka_fail_fast,
            )
            counts['kafka'] += 1
        else:
            print(json.dumps(ingest_request))
    if kafka_producer is not None:
        kafka_producer.flush()
    print(counts, file=sys.stderr)
    if args.dry_run:
        print("=== THIS WAS A DRY RUN ===")
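
# Hedged sketch of the two Kafka helpers used above (simple_kafka_producer and
# kafka_fail_fast); the real implementations live in fatcat_tools, so the exact
# producer config values here are assumptions.
import sys

from confluent_kafka import KafkaException, Producer

def kafka_fail_fast(err, _msg):
    # raise on any delivery error rather than silently dropping ingest requests
    if err is not None:
        print("Kafka producer delivery error: {}".format(err), file=sys.stderr)
        raise KafkaException(err)

def simple_kafka_producer(kafka_hosts):
    # minimal producer: require acks from all brokers, report only failed deliveries
    return Producer({
        "bootstrap.servers": kafka_hosts,
        "delivery.report.only.error": True,
        "default.topic.config": {
            "request.required.acks": -1,
        },
    })
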
def _run_search_dump(args: argparse.Namespace, search: Search) -> None:

    if args.dry_run:
        print("=== THIS IS A DRY RUN ===")

    kafka_producer = None
    if args.kafka_request_topic:
        ingest_file_request_topic = args.kafka_request_topic
    else:
        ingest_file_request_topic = "sandcrawler-{}.ingest-file-requests-daily".format(
            args.env)
    if args.enqueue_kafka:
        print(
            "Will send ingest requests to kafka topic: {}".format(
                ingest_file_request_topic),
            file=sys.stderr,
        )
        kafka_producer = simple_kafka_producer(args.kafka_hosts)

    if args.limit is not None:
        search = search[:args.limit]

    if args.before_year:
        search = search.filter("exists", field="release_year").filter(
            "range", release_date=dict(lt=args.before_year))
    if args.after_year:
        search = search.filter("exists", field="release_year").filter(
            "range", release_date=dict(gte=args.after_year))

    if not args.allow_non_oa:
        search = search.filter("term", is_oa=True)

    if args.release_types:
        release_types = args.release_types.split(",")
        search = search.filter("terms", release_type=release_types)
    else:
        search = search.filter(
            "bool", must_not=[Q("terms", release_type=["stub", "component"])])

    counts = Counter({
        "ingest_request": 0,
        "elasticsearch_release": 0,
        "estimate": 0
    })
    search = search.params()
    counts["estimate"] = search.count()
    print(
        "Expecting {} release objects in search queries".format(
            counts["estimate"]),
        file=sys.stderr,
    )

    results = search.scan()
    for esr in results:
        if args.limit and counts["ingest_request"] >= args.limit:
            break
        counts["elasticsearch_release"] += 1
        release = args.api.get_release(esr.ident)
        ingest_request = release_ingest_request(
            release,
            ingest_request_source="fatcat-ingest",
            ingest_type=args.ingest_type,
        )
        if not ingest_request:
            continue
        if args.force_recrawl:
            ingest_request["force_recrawl"] = True
        counts["ingest_request"] += 1
        if args.dry_run:
            continue
        if kafka_producer is not None:
            kafka_producer.produce(
                ingest_file_request_topic,
                json.dumps(ingest_request).encode("utf-8"),
                # key=None,
                on_delivery=kafka_fail_fast,
            )
            counts["kafka"] += 1
        else:
            print(json.dumps(ingest_request))
    if kafka_producer is not None:
        kafka_producer.flush()
    print(counts, file=sys.stderr)
    if args.dry_run:
        print("=== THIS WAS A DRY RUN ===")

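
# Hypothetical dry-run invocation of _run_search_dump() above. The namespace
# attribute names mirror what the function reads, but the values, the index
# name, both endpoints, and the public_api() wiring are assumptions rather than
# the project's actual CLI setup.
import argparse

import elasticsearch
from elasticsearch_dsl import Search
from fatcat_tools import public_api

es_client = elasticsearch.Elasticsearch("https://search.fatcat.wiki")  # assumed endpoint
search = Search(using=es_client, index="fatcat_release")  # assumed index name

args = argparse.Namespace(
    dry_run=True,
    kafka_request_topic=None,
    env="prod",
    enqueue_kafka=False,
    kafka_hosts="localhost:9092",
    limit=10,
    before_year=None,
    after_year=None,
    allow_non_oa=False,
    release_types=None,
    ingest_type="pdf",
    force_recrawl=False,
    api=public_api("https://api.fatcat.wiki/v0"),  # assumed API endpoint
)
_run_search_dump(args, search)
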
def run(self) -> None:

    def fail_fast(err: Any, _msg: Any) -> None:
        if err is not None:
            print("Kafka producer delivery error: {}".format(err))
            print("Bailing out...")
            # TODO: should it be sys.exit(-1)?
            raise KafkaException(err)

    def on_commit(err: Any, partitions: List[Any]) -> None:
        if err is not None:
            print("Kafka consumer commit error: {}".format(err))
            print("Bailing out...")
            # TODO: should it be sys.exit(-1)?
            raise KafkaException(err)
        for p in partitions:
            # check for partition-specific commit errors
            print(p)
            if p.error:
                print("Kafka consumer commit error: {}".format(p.error))
                print("Bailing out...")
                # TODO: should it be sys.exit(-1)?
                raise KafkaException(p.error)
        print("Kafka consumer commit successful")
        pass

    def on_rebalance(consumer: Consumer, partitions: List[Any]) -> None:
        for p in partitions:
            if p.error:
                raise KafkaException(p.error)
        print("Kafka partitions rebalanced: {} / {}".format(
            consumer, partitions))

    consumer_conf = self.kafka_config.copy()
    consumer_conf.update({
        "group.id": self.consumer_group,
        "on_commit": fail_fast,
        # messages don't have offset marked as stored until pushed to
        # elastic, but we do auto-commit stored offsets to broker
        "enable.auto.commit": True,
        "enable.auto.offset.store": False,
        # user code timeout; if no poll after this long, assume user code
        # hung and rebalance (default: 5min)
        "max.poll.interval.ms": 180000,
        "default.topic.config": {
            "auto.offset.reset": "latest",
        },
    })
    consumer = Consumer(consumer_conf)

    producer_conf = self.kafka_config.copy()
    producer_conf.update({
        "delivery.report.only.error": True,
        "default.topic.config": {
            "request.required.acks": -1,  # all brokers must confirm
        },
    })
    producer = Producer(producer_conf)

    consumer.subscribe(
        [self.consume_topic],
        on_assign=on_rebalance,
        on_revoke=on_rebalance,
    )
    print("Kafka consuming {}".format(self.consume_topic))

    while True:
        msg = consumer.poll(self.poll_interval)
        if not msg:
            print("nothing new from kafka (poll_interval: {} sec)".format(
                self.poll_interval))
            continue
        if msg.error():
            raise KafkaException(msg.error())

        cle = json.loads(msg.value().decode("utf-8"))
        # print(cle)
        print("processing changelog index {}".format(cle["index"]))
        release_ids = []
        new_release_ids = []
        file_ids = []
        fileset_ids = []
        webcapture_ids = []
        container_ids = []
        work_ids = []
        release_edits = cle["editgroup"]["edits"]["releases"]
        for re in release_edits:
            release_ids.append(re["ident"])
            # filter to direct release edits which are not updates
            if not re.get("prev_revision") and not re.get(
                    "redirect_ident"):
                new_release_ids.append(re["ident"])
        file_edits = cle["editgroup"]["edits"]["files"]
        for e in file_edits:
            file_ids.append(e["ident"])
        fileset_edits = cle["editgroup"]["edits"]["filesets"]
        for e in fileset_edits:
            fileset_ids.append(e["ident"])
        webcapture_edits = cle["editgroup"]["edits"]["webcaptures"]
        for e in webcapture_edits:
            webcapture_ids.append(e["ident"])
        container_edits = cle["editgroup"]["edits"]["containers"]
        for e in container_edits:
            container_ids.append(e["ident"])
        work_edits = cle["editgroup"]["edits"]["works"]
        for e in work_edits:
            work_ids.append(e["ident"])

        # TODO: do these fetches in parallel using a thread pool?
        for ident in set(file_ids):
            file_entity = self.api.get_file(ident, expand=None)
            # update release when a file changes
            # TODO: also fetch old version of file and update any *removed*
            # release idents (and same for filesets, webcapture updates)
            release_ids.extend(file_entity.release_ids or [])
            file_dict = self.api.api_client.sanitize_for_serialization(
                file_entity)
            producer.produce(
                self.file_topic,
                json.dumps(file_dict).encode("utf-8"),
                key=ident.encode("utf-8"),
                on_delivery=fail_fast,
            )

        # TODO: topic for fileset updates
        for ident in set(fileset_ids):
            fileset_entity = self.api.get_fileset(ident, expand=None)
            # update release when a fileset changes
            release_ids.extend(fileset_entity.release_ids or [])

        # TODO: topic for webcapture updates
        for ident in set(webcapture_ids):
            webcapture_entity = self.api.get_webcapture(ident, expand=None)
            # update release when a webcapture changes
            release_ids.extend(webcapture_entity.release_ids or [])

        for ident in set(container_ids):
            container = self.api.get_container(ident)
            container_dict = self.api.api_client.sanitize_for_serialization(
                container)
            producer.produce(
                self.container_topic,
                json.dumps(container_dict).encode("utf-8"),
                key=ident.encode("utf-8"),
                on_delivery=fail_fast,
            )

        for ident in set(release_ids):
            release = self.api.get_release(
                ident, expand="files,filesets,webcaptures,container,creators")
            if release.work_id:
                work_ids.append(release.work_id)
            release_dict = self.api.api_client.sanitize_for_serialization(
                release)
            producer.produce(
                self.release_topic,
                json.dumps(release_dict).encode("utf-8"),
                key=ident.encode("utf-8"),
                on_delivery=fail_fast,
            )
            # for ingest requests, filter to "new" active releases with no matched files
            if release.ident in new_release_ids:
                ir = release_ingest_request(
                    release, ingest_request_source="fatcat-changelog")
                if ir and not release.files and self.want_live_ingest(
                        release, ir):
                    producer.produce(
                        self.ingest_file_request_topic,
                        json.dumps(ir).encode("utf-8"),
                        # key=None,
                        on_delivery=fail_fast,
                    )

        # send work updates (just ident and changelog metadata) to scholar for re-indexing
        for ident in set(work_ids):
            assert ident
            key = f"work_{ident}"
            work_ident_dict = dict(
                key=key,
                type="fatcat_work",
                work_ident=ident,
                updated=cle["timestamp"],
                fatcat_changelog_index=cle["index"],
            )
            producer.produce(
                self.work_ident_topic,
                json.dumps(work_ident_dict).encode("utf-8"),
                key=key.encode("utf-8"),
                on_delivery=fail_fast,
            )

        producer.flush()
        # TODO: publish updated 'work' entities to a topic
        consumer.store_offsets(message=msg)
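
# Hypothetical wiring for the changelog worker above. The class name, constructor
# signature, and topic names are assumptions inferred from the attributes run()
# reads (self.api, self.kafka_config, self.consumer_group, self.consume_topic,
# the various *_topic names, self.poll_interval).
from fatcat_tools import public_api
from fatcat_tools.workers import EntityUpdatesWorker

worker = EntityUpdatesWorker(
    api=public_api("https://api.fatcat.wiki/v0"),
    kafka_hosts="localhost:9092",
    consume_topic="fatcat-prod.changelog",
    release_topic="fatcat-prod.release-updates-v03",
    file_topic="fatcat-prod.file-updates",
    container_topic="fatcat-prod.container-updates",
    work_ident_topic="fatcat-prod.work-ident-updates",
    ingest_file_request_topic="sandcrawler-prod.ingest-file-requests-daily",
    poll_interval=10.0,
)
worker.run()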