def run_elasticsearch_changelogs(args): for line in args.json_input: line = line.strip() if not line: continue entity = entity_from_json(line, ChangelogEntry, api_client=args.api.api_client) args.json_output.write( json.dumps(changelog_to_elasticsearch(entity)) + '\n')
def run_elasticsearch_releases(args): for line in args.json_input: line = line.strip() if not line: continue entity = entity_from_json(line, ReleaseEntity, api_client=args.api.api_client) args.json_output.write( json.dumps(release_to_elasticsearch(entity)) + '\n')
def run_elasticsearch_files(args): for line in args.json_input: line = line.strip() if not line: continue entity = entity_from_json(line, FileEntity, api_client=args.api.api_client) if entity.state != 'active': continue args.json_output.write( json.dumps(file_to_elasticsearch(entity)) + '\n')
def run_citeproc_releases(args): for line in args.json_input: line = line.strip() if not line: continue entity = entity_from_json(line, ReleaseEntity, api_client=args.api.api_client) csl_json = release_to_csl(entity) csl_json['id'] = "release:" + (entity.ident or "unknown") out = citeproc_csl(csl_json, args.style, args.html) args.json_output.write(out + "\n")
def run_citeproc_releases(args): for line in args.json_input: line = line.strip() if not line: continue entity = entity_from_json(line, ReleaseEntity, api_client=args.api.api_client) csl_json = release_to_csl(entity) # XXX: csl_json['id'] = "release:" + (entity.ident or "unknown") if args.style == "csl-json": args.json_output.write(json.dumps(csl_json) + "\n") continue bib_src = CiteProcJSON([csl_json]) form = formatter.plain if args.html: form = formatter.html style_path = get_style_filepath(args.style) bib_style = CitationStylesStyle(style_path, validate=False) bib = CitationStylesBibliography(bib_style, bib_src, form) bib.register(Citation([CitationItem(csl_json['id'])])) # XXX: #args.json_output.write( # json.dumps(release_to_csl(entity)) + '\n') lines = bib.bibliography()[0] if args.style == "bibtex": for l in lines: if l.startswith(" @"): args.json_output.write("\n@") elif l.startswith(" "): #print("line: START|{}|END".format(l)) args.json_output.write("\n " + l) else: args.json_output.write(l) else: args.json_output.write(''.join(lines) + "\n") print()
def run(self) -> None: ac = ApiClient() api = public_api(self.api_host) # only used by container indexing query_stats code path es_client = elasticsearch.Elasticsearch(self.elasticsearch_backend) def fail_fast(err: Any, partitions: List[Any]) -> None: if err is not None: print("Kafka consumer commit error: {}".format(err), file=sys.stderr) print("Bailing out...", file=sys.stderr) # TODO: should it be sys.exit(-1)? raise KafkaException(err) for p in partitions: # check for partition-specific commit errors if p.error: print("Kafka consumer commit error: {}".format(p.error), file=sys.stderr) print("Bailing out...", file=sys.stderr) # TODO: should it be sys.exit(-1)? raise KafkaException(p.error) # print("Kafka consumer commit successful") pass def on_rebalance(consumer: Consumer, partitions: List[Any]) -> None: for p in partitions: if p.error: raise KafkaException(p.error) print( "Kafka partitions rebalanced: {} / {}".format(consumer, partitions), file=sys.stderr, ) consumer_conf = self.kafka_config.copy() consumer_conf.update( { "group.id": self.consumer_group, "on_commit": fail_fast, # messages don't have offset marked as stored until pushed to # elastic, but we do auto-commit stored offsets to broker "enable.auto.commit": True, "enable.auto.offset.store": False, # user code timeout; if no poll after this long, assume user code # hung and rebalance (default: 5min) "max.poll.interval.ms": 60000, "default.topic.config": { "auto.offset.reset": "latest", }, } ) consumer = Consumer(consumer_conf) consumer.subscribe( [self.consume_topic], on_assign=on_rebalance, on_revoke=on_rebalance, ) while True: batch = consumer.consume(num_messages=self.batch_size, timeout=self.poll_interval) if not batch: if not consumer.assignment(): print("... no Kafka consumer partitions assigned yet", file=sys.stderr) print( "... nothing new from kafka, try again (interval: {}".format( self.poll_interval ), file=sys.stderr, ) continue print("... got {} kafka messages".format(len(batch)), file=sys.stderr) # first check errors on entire batch... for msg in batch: if msg.error(): raise KafkaException(msg.error()) # ... then process bulk_actions = [] for msg in batch: json_str = msg.value().decode("utf-8") entity = entity_from_json(json_str, self.entity_type, api_client=ac) assert isinstance(entity, self.entity_type) if self.entity_type == ChangelogEntry: key = entity.index # might need to fetch from API if not ( entity.editgroup # pylint: disable=no-member # (TODO) and entity.editgroup.editor # pylint: disable=no-member # (TODO) ): entity = api.get_changelog_entry(entity.index) else: key = entity.ident # pylint: disable=no-member # (TODO) if self.entity_type != ChangelogEntry and entity.state == "wip": print( f"WARNING: skipping state=wip entity: {self.entity_type.__name__} {entity.ident}", file=sys.stderr, ) continue if self.entity_type == ContainerEntity and self.query_stats: stats = query_es_container_stats( entity.ident, es_client=es_client, es_index=self.elasticsearch_release_index, merge_shadows=True, ) doc_dict = container_to_elasticsearch(entity, stats=stats) else: doc_dict = self.transform_func(entity) # TODO: handle deletions from index bulk_actions.append( json.dumps( { "index": { "_id": key, }, } ) ) bulk_actions.append(json.dumps(doc_dict)) # if only WIP entities, then skip if not bulk_actions: for msg in batch: consumer.store_offsets(message=msg) continue print( "Upserting, eg, {} (of {} {} in elasticsearch)".format( key, len(batch), self.entity_type.__name__ ), file=sys.stderr, ) elasticsearch_endpoint = "{}/{}/_bulk".format( self.elasticsearch_backend, self.elasticsearch_index ) resp = requests.post( elasticsearch_endpoint, headers={"Content-Type": "application/x-ndjson"}, data="\n".join(bulk_actions) + "\n", ) resp.raise_for_status() if resp.json()["errors"]: desc = "Elasticsearch errors from post to {}:".format(elasticsearch_endpoint) print(desc, file=sys.stderr) print(resp.content, file=sys.stderr) raise Exception(desc) for msg in batch: # offsets are *committed* (to brokers) automatically, but need # to be marked as processed here consumer.store_offsets(message=msg)