def _next_record(self):
    if not self._f:
        raise StopIteration("No more records")
    while (self._f):
        if self._read:
            # Refill the buffer with the next block from the underlying file.
            data = self._f.read(self.block_size)
            self._read = False
            if data:
                if self._data_block:
                    self._data_block += data
                else:
                    self._data_block = data
            else:
                #end of file
                self._close()
                if self._data_block:
                    m = self.last_record_end_re.search(self._data_block)
                    if m:
                        self._index += 1
                        json_str = "{{\n{}\n}}".format(
                            self._data_block[:m.start()])
                        self._data_block = None
                        return simdjson.loads(json_str)
                    else:
                        raise Exception(
                            "The last record is incomplete in file({}).".
                            format(self._input_file))
                else:
                    raise StopIteration("No more records")
        if self._index is None:
            # Still looking for the opening of the first record.
            m = self.first_record_start_re.search(self._data_block)
            if m:
                self._data_block = self._data_block[m.end():]
                self._index = -1
            elif self._data_block.strip():
                raise Exception(
                    "The file({}) is an invalid json file".format(
                        self._input_file))
            else:
                self._data_block = None
                self._read = True
        else:
            # Look for the separator that ends the current record.
            m = self.record_sep_re.search(self._data_block)
            if m:
                self._index += 1
                json_str = "{{\n{}\n}}".format(
                    self._data_block[:m.start()])
                self._data_block = self._data_block[m.end():]
                return simdjson.loads(json_str)
            else:
                self._read = True
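# Usage sketch, not part of the source: the process_status_file snippets further
# down construct the enclosing LogRecordIterator with a file path and iterate it
# directly, so a minimal driver might look like this. Only the construct-and-iterate
# pattern is taken from those snippets; everything else here is illustrative.
def count_records(path):
    records = 0
    for record in LogRecordIterator(path):  # each item is a dict decoded by _next_record()
        records += 1
    return records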
def get_neighbors_attr(graph, n, pred=False):
    """Get the attributes of a node's neighbors in the graph.

    Parameters
    ----------
    graph:
        the graph to query.
    n: node
        the node whose neighbors are queried.
    pred: bool
        if True, report the predecessors' attributes
        (types_pb2.PRED_ATTR_BY_NODE); otherwise report the
        successors' attributes (types_pb2.SUCC_ATTR_BY_NODE).

    Returns
    -------
    attr: tuple
    """
    if graph.graph_type == graph_def_pb2.ARROW_PROPERTY:
        n = graph._convert_to_label_id_tuple(n)
    report_t = types_pb2.PRED_ATTR_BY_NODE if pred else types_pb2.SUCC_ATTR_BY_NODE
    op = dag_utils.report_graph(graph, report_t, node=simdjson.dumps(n).encode("utf-8"))
    archive = op.eval()
    return simdjson.loads(archive.get_bytes())
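# Hedged usage sketch, not from the source: `g` stands for an already-built
# graph handle compatible with the function above, and the node id is a
# placeholder whose exact format depends on the graph type.
succ_attr = get_neighbors_attr(g, n=0)             # successors' attributes
pred_attr = get_neighbors_attr(g, n=0, pred=True)  # predecessors' attributes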
def init_resumable_rest(cls, request, bucket):
    name = request.args.get("name", "")
    if len(request.data) > 0:
        if name != "":
            utils.error.invalid("name argument in non-empty payload", None)
        data = simdjson.loads(request.data)
        metadata = json_format.ParseDict(data, resources_pb2.Object())
    else:
        metadata = resources_pb2.Object()
        metadata.name = name
    if metadata.content_type == "":
        metadata.content_type = request.headers.get(
            "x-upload-content-type", "application/octet-stream")
    upload_id = hashlib.sha256(
        ("%s/o/%s" % (bucket.name, metadata.name)).encode("utf-8")).hexdigest()
    location = (
        request.host_url
        + "upload/storage/v1/b/%s/o?uploadType=resumable&upload_id=%s"
        % (bucket.name, upload_id))
    headers = {
        key.lower(): value
        for key, value in request.headers.items()
        if key.lower().startswith("x-")
    }
    request = utils.common.FakeRequest(args=request.args.to_dict(),
                                       headers=headers,
                                       data=b"")
    return cls.init_upload(request, metadata, bucket, location, upload_id)
def parse_multipart(request):
    content_type = request.headers.get("content-type")
    if content_type is None or not content_type.startswith("multipart/related"):
        utils.error.invalid("Content-type header in multipart upload", None)
    _, _, boundary = content_type.partition("boundary=")
    if boundary is None:
        utils.error.missing(
            "boundary in content-type header in multipart upload", None)

    def parse_part(part):
        result = part.split(b"\r\n")
        if result[0] != b"" and result[-1] != b"":
            utils.error.invalid("Multipart %s" % str(part), None)
        result = list(filter(None, result))
        headers = {}
        if len(result) < 2:
            result.append(b"")
        for header in result[:-1]:
            key, value = header.split(b": ")
            headers[key.decode("utf-8")] = value.decode("utf-8")
        return headers, result[-1]

    boundary = boundary.encode("utf-8")
    parts = request.data.split(b"--" + boundary)
    if parts[-1] != b"--\r\n":
        utils.error.missing("end marker (--%s--) in media body" % boundary, None)
    _, resource = parse_part(parts[1])
    metadata = simdjson.loads(resource)
    media_headers, media = parse_part(parts[2])
    return metadata, media_headers, media
def _test_loads():
    """Ensure basic usage of loads is the same."""
    # We don't use a binary file here because pre-py3.6 the built-in couldn't
    # handle bytes.
    with open('jsonexamples/canada.json', 'r') as fin:
        content = fin.read()
    assert json.loads(content) == simdjson.loads(content)
def get_idx_key(filename):
    idx_fn = (filename.split('.')[-1].strip() + '.idx')
    if _file_exists(idx_fn):
        idx_key = json.loads(get_read_fn(idx_fn))
        idx = {v: key for key, v in idx_key.items()}
        return idx
    else:
        return None
def update_acl(self, request, entity, context):
    role = ""
    if context is not None:
        role = request.bucket_access_control.role
    else:
        payload = simdjson.loads(request.data)
        role = payload["role"]
    return self.__upsert_acl(entity, role, True, context)
def patch_default_object_acl(self, request, entity, context):
    role = ""
    if context is not None:
        role = request.object_access_control.role
    else:
        payload = simdjson.loads(request.data)
        role = payload["role"]
    return self.__upsert_default_object_acl(entity, role, True, context)
def init(cls, request, context):
    time_created = datetime.datetime.now()
    metadata = None
    if context is not None:
        metadata = request.bucket
    else:
        metadata = json_format.ParseDict(
            cls.__preprocess_rest(simdjson.loads(request.data)),
            resources_pb2.Bucket(),
        )
    cls.__validate_bucket_name(metadata.name, context)
    default_projection = 1
    if len(metadata.acl) != 0 or len(metadata.default_object_acl) != 0:
        default_projection = 2
    is_uniform = metadata.iam_configuration.uniform_bucket_level_access.enabled
    metadata.iam_configuration.uniform_bucket_level_access.enabled = False
    if len(metadata.acl) == 0:
        predefined_acl = utils.acl.extract_predefined_acl(
            request, False, context)
        if predefined_acl == 0:
            predefined_acl = 3
        elif predefined_acl == "":
            predefined_acl = "projectPrivate"
        elif is_uniform:
            utils.error.invalid(
                "Predefined ACL with uniform bucket level access enabled",
                context)
        cls.__insert_predefined_acl(metadata, predefined_acl, context)
    if len(metadata.default_object_acl) == 0:
        predefined_default_object_acl = utils.acl.extract_predefined_default_object_acl(
            request, context)
        if predefined_default_object_acl == 0:
            predefined_default_object_acl = 5
        elif predefined_default_object_acl == "":
            predefined_default_object_acl = "projectPrivate"
        elif is_uniform:
            utils.error.invalid(
                "Predefined Default Object ACL with uniform bucket level access enabled",
                context,
            )
        cls.__insert_predefined_default_object_acl(
            metadata, predefined_default_object_acl, context)
    metadata.iam_configuration.uniform_bucket_level_access.enabled = is_uniform
    metadata.id = metadata.name
    metadata.project_number = int(utils.acl.PROJECT_NUMBER)
    metadata.metageneration = 0
    metadata.etag = hashlib.md5(metadata.name.encode("utf-8")).hexdigest()
    metadata.time_created.FromDatetime(time_created)
    metadata.updated.FromDatetime(time_created)
    metadata.owner.entity = utils.acl.get_project_entity("owners", context)
    metadata.owner.entity_id = hashlib.md5(
        metadata.owner.entity.encode("utf-8")).hexdigest()
    return (
        cls(metadata, {}, cls.__init_iam_policy(metadata, context)),
        utils.common.extract_projection(request, default_projection, context),
    )
def jg(cls, filename, handle_errors=True):
    with gfile(filename, 'r') as f:
        for l in f:
            try:
                yield json.loads(l)
            except Exception as e:
                if not handle_errors:
                    logger.log(f'Error parsing File: {str(e)}')
                    raise e
def insert_notification(self, request, context):
    notification = None
    if context is not None:
        notification = request.notification
    else:
        notification = json_format.ParseDict(simdjson.loads(request.data),
                                             resources_pb2.Notification())
    notification.id = "notification-%d" % random.getrandbits(16)
    self.notifications.append(notification)
    return notification
def dicts_from_lines(lines):
    """
    returns a generator producing dicts from json lines

    1 JSON object per line is supported:

        {"name": "n1"}
        {"name": "n2"}

    Or 1 JSON object:

        {
            "name": "n1"
        }

    Or a list of JSON objects:

        [
            {"name": "n1"},
            {"name": "n2"},
        ]

    Or a list of JSON objects in a single line:

        [{"name": "n1"}, {"name": "n2"}]
    """
    lines = iter(lines)
    for line in lines:
        line = line.strip()
        if not line:
            continue  # skip empty lines
        try:
            data = loads(line)
            if isinstance(data, list):
                yield from data
            else:
                yield data
        except ValueError:
            content = line + ''.join(lines)
            dicts = loads(content)
            if isinstance(dicts, list):
                yield from dicts
            else:
                yield dicts
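# Small usage example for the shapes described in the docstring above; it assumes
# `loads` raises ValueError on the incomplete first line of a pretty-printed list,
# which is what the except clause expects. The literals are placeholders.
jsonlines = ['{"name": "n1"}', '{"name": "n2"}']
single_line_list = ['[{"name": "n1"}, {"name": "n2"}]']
pretty_list = ['[', '  {"name": "n1"},', '  {"name": "n2"}', ']']

for source in (jsonlines, single_line_list, pretty_list):
    assert list(dicts_from_lines(source)) == [{"name": "n1"}, {"name": "n2"}]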
def insert_default_object_acl(self, request, context):
    entity, role = "", ""
    if context is not None:
        entity, role = (
            request.object_access_control.entity,
            request.object_access_control.role,
        )
    else:
        payload = simdjson.loads(request.data)
        entity, role = payload["entity"], payload["role"]
    return self.__upsert_default_object_acl(entity, role, False, context)
def set_iam_policy(self, request, context):
    policy = None
    if context is not None:
        policy = request.iam_request.policy
    else:
        data = simdjson.loads(request.data)
        data.pop("kind", None)
        policy = json_format.ParseDict(data, policy_pb2.Policy())
    self.iam_policy = policy
    self.iam_policy.etag = datetime.datetime.now().isoformat().encode("utf-8")
    return self.iam_policy
def update(self, request, context):
    metadata = None
    if context is not None:
        metadata = request.metadata
    else:
        metadata = json_format.ParseDict(
            self.__preprocess_rest(simdjson.loads(request.data)),
            resources_pb2.Object(),
        )
    self.__update_metadata(metadata, None)
    self.__insert_predefined_acl(
        metadata,
        self.bucket,
        utils.acl.extract_predefined_acl(request, False, context),
        context,
    )
def init(cls, request, context):
    time_created = datetime.datetime.now()
    metadata = None
    if context is not None:
        metadata = request.bucket
    else:
        metadata = json_format.ParseDict(
            cls.__preprocess_rest(simdjson.loads(request.data)),
            resources_pb2.Bucket(),
        )
    cls.__validate_bucket_name(metadata.name, context)
    default_projection = 1
    if len(metadata.acl) != 0 or len(metadata.default_object_acl) != 0:
        default_projection = 2
    if len(metadata.acl) == 0:
        predefined_acl = utils.acl.extract_predefined_acl(request, False, context)
        if predefined_acl == 0:
            predefined_acl = 3
        elif predefined_acl == "":
            predefined_acl = "projectPrivate"
        cls.__insert_predefined_acl(metadata, predefined_acl, context)
    if len(metadata.default_object_acl) == 0:
        predefined_default_object_acl = utils.acl.extract_predefined_default_object_acl(
            request, context
        )
        if predefined_default_object_acl == 0:
            predefined_default_object_acl = 5
        elif predefined_default_object_acl == "":
            predefined_default_object_acl = "projectPrivate"
        cls.__insert_predefined_default_object_acl(
            metadata, predefined_default_object_acl, context
        )
    metadata.id = metadata.name
    metadata.project_number = int(utils.acl.PROJECT_NUMBER)
    metadata.metageneration = 0
    metadata.etag = hashlib.md5(metadata.name.encode("utf-8")).hexdigest()
    metadata.time_created.FromDatetime(time_created)
    metadata.updated.FromDatetime(time_created)
    metadata.owner.entity = utils.acl.get_project_entity("owners", context)
    metadata.owner.entity_id = hashlib.md5(
        metadata.owner.entity.encode("utf-8")
    ).hexdigest()
    return (
        cls(metadata, [], None),
        utils.common.extract_projection(request, default_projection, context),
    )
def patch(self, request, context):
    update_mask = field_mask_pb2.FieldMask()
    metadata = None
    if context is not None:
        metadata = request.metadata
        update_mask = request.update_mask
    else:
        data = simdjson.loads(request.data)
        if "labels" in data:
            if data["labels"] is None:
                self.metadata.labels.clear()
            else:
                for key, value in data["labels"].items():
                    if value is None:
                        self.metadata.labels.pop(key, None)
                    else:
                        self.metadata.labels[key] = value
        data.pop("labels", None)
        data = Bucket.__preprocess_rest(data)
        metadata = json_format.ParseDict(data, resources_pb2.Bucket())
        paths = set()
        for key in utils.common.nested_key(data):
            key = utils.common.to_snake_case(key)
            head = key
            for i, c in enumerate(key):
                if c == "." or c == "[":
                    head = key[0:i]
                    break
            if head in Bucket.modifiable_fields:
                if "[" in key:
                    paths.add(head)
                else:
                    paths.add(key)
        update_mask = field_mask_pb2.FieldMask(paths=list(paths))
    self.__update_metadata(metadata, update_mask)
    self.__insert_predefined_acl(
        metadata, utils.acl.extract_predefined_acl(request, False, context),
        context)
    self.__insert_predefined_default_object_acl(
        metadata,
        utils.acl.extract_predefined_default_object_acl(request, context),
        context,
    )
def process_status_file(context,metadata,status_file):
    now = timezone.now()
    context["containerstatus"]["harvester"].message="{}:Begin to process container status file '{}'".format(now.strftime("%Y-%m-%d %H:%M:%S"), metadata["resource_id"])
    context["containerstatus"]["harvester"].last_heartbeat = now
    context["containerstatus"]["harvester"].save(update_fields=["message","last_heartbeat"])
    if settings.CONTAINERSTATUS_STREAMING_PARSE:
        status_records = LogRecordIterator(status_file)
    else:
        with open(status_file,"r") as f:
            status_records = simdjson.loads(f.read())
    records = 0
    for record in status_records:
        records += 1
        try:
            if any(not (record.get(key) or "").strip() for key in ("computer","containerid","image","name")):
                #data is incomplete,ignore
                continue
            created = to_datetime(record["created"])
            started = to_datetime(record["started"])
            finished = to_datetime(record.get("finished"))
            containerid = record["containerid"]
            ports = record["ports"] or None
            containerstate = record["containerstate"]
            if finished:
                containerstate = "terminated"
            envs = os.linesep.join(json.loads(record["environmentvar"])) if record["environmentvar"] else None
            exitcode = str(record["exitcode"]) if finished else None
            computer = record["computer"].strip()
            workload_name = record["name"].strip()
            image_without_tag = record.get("image","").strip()
            if not image_without_tag:
                continue
            else:
                imageid = "{}:{}".format(image_without_tag,record["imagetag"].strip())
            cluster = None
            clustername = None
            if computer in context["clusters"]:
                cluster = context["clusters"][computer]
            elif record.get("resourceid"):
                resourceid = record["resourceid"].strip().rsplit("/",1)[-1]
                if resourceid in context["clusters"]:
                    cluster = context["clusters"][resourceid]
                else:
                    clustername = resourceid
            else:
                clustername = computer
            if not cluster:
                try:
                    cluster = models.Cluster.objects.get(name=clustername)
                except ObjectDoesNotExist as ex:
                    if settings.ENABLE_ADDED_BY_CONTAINERLOG:
                        cluster = models.Cluster(name=clustername,added_by_log=True)
                        cluster.save()
                    else:
                        continue
                context["clusters"][clustername] = cluster
            workload = None
            container = None
            key = (cluster.id,containerid)
            if key in context["containerstatus"]["terminated_containers"]:
                continue
            elif key in context["containerstatus"]["containers"]:
                container = context["containerstatus"]["containers"][key]
            else:
                try:
                    container = models.Container.objects.get(cluster=cluster,containerid=containerid)
                    context["containerstatus"]["containers"][key] = container
                except ObjectDoesNotExist as ex:
                    pass
            if container:
                workload_key = (container.workload.cluster.id,container.workload.namespace.name,container.workload.name,container.workload.kind)
                if workload_key not in context["workloads"]:
                    workload_update_fields = []
                    workload = container.workload
                    context["workloads"][workload_key] = (workload,workload_update_fields)
                else:
                    workload,workload_update_fields = context["workloads"][workload_key]
            elif settings.ENABLE_ADDED_BY_CONTAINERLOG:
                kind = "service?" if ports else "jobs?"
                new_workload_name = "{}-{}".format(image_without_tag,workload_name)
                workload_key = (cluster.id,"unknown",new_workload_name,kind)
                workload = None
                if workload_key in context["workloads"]:
                    workload,workload_update_fields = context["workloads"][workload_key]
                else:
                    #try to find the workload through cluster and workload name
                    workload_qs = models.Workload.objects.filter(cluster=cluster,name=workload_name)
                    for obj in workload_qs:
                        if obj.containerimage and obj.containerimage.imageid.startswith(image_without_tag) and ((ports and obj.listenings.all().count()) or (not ports and obj.listenings.all().count() == 0)):
                            workload = obj
                            break
                    if not workload:
                        if settings.ENABLE_ADDED_BY_CONTAINERLOG:
                            #not found , create a workload for this log
                            namespace_key = (cluster.id,"unknown")
                            if namespace_key in context["namespaces"]:
                                namespace = context["namespaces"][namespace_key]
                            else:
                                try:
                                    namespace = models.Namespace.objects.get(cluster=cluster,name="unknown")
                                except ObjectDoesNotExist as ex:
                                    namespace = models.Namespace(cluster=cluster,name="unknown",added_by_log=True,created=created or timezone.now(),modified=created or timezone.now())
                                    namespace.save()
                                context["namespaces"][namespace_key] = namespace
                            workload = models.Workload.objects.filter(cluster=cluster,namespace=namespace,name=new_workload_name,kind=kind).first()
                            if not workload:
                                image = models.ContainerImage.parse_image(imageid)
                                workload = models.Workload(
                                    cluster=namespace.cluster,
                                    project=namespace.project,
                                    namespace=namespace,
                                    name=new_workload_name,
                                    image=imageid,
                                    containerimage=image,
                                    kind=kind,
                                    api_version="",
                                    added_by_log=True,
                                    modified=created or timezone.now(),
                                    created=created or timezone.now()
                                )
                                #if finished and finished.date() < timezone.now().date():
                                #    workload.deleted = finished
                                workload.save()
                        else:
                            continue
                    workload_key = (cluster.id,workload.namespace.name,workload.name,workload.kind)
                    workload_update_fields = []
                    context["workloads"][workload_key] = (workload,workload_update_fields)
                container = models.Container(
                    cluster=workload.cluster,
                    namespace=workload.namespace,
                    workload=workload,
                    poduid = "",
                    containerid = containerid
                )
                context["containerstatus"]["containers"][key] = container
            else:
                continue
            #container
            container_status = get_container_status(container,containerstate)
            update_fields = set_fields(container,[
                ("exitcode",exitcode or container.exitcode),
                ("image",imageid or container.image),
                ("ports",ports or container.ports),
                ("envs",envs or container.envs),
                ("container_created",created or container.container_created),
                ("container_started",started or container.container_started),
                ("container_terminated",finished or container.container_terminated),
                ("status",container_status),
                ("last_checked",to_datetime(record["max_timegenerated"]))
            ])
            if container.pk is None:
                container.save()
            elif update_fields:
                container.save(update_fields=update_fields)
            update_latest_containers(context,container,workload=workload,workload_update_fields=workload_update_fields)
            if container.status == "running" and container.workload.kind.lower() == "deployment" and (not container.pk or "status" in update_fields):
                context["containerstatus"]["new_deployed_workloads"].add(container.workload)
            if container_status.lower() in ("deleted","terminated"):
                del context["containerstatus"]["containers"][key]
                context["containerstatus"]["terminated_containers"].add(key)
        except Exception as ex:
            #delete already added records from this log file
            logger.error("Failed to parse container status record({}).{}".format(record,str(ex)))
            continue
    context["last_archive_time"] = metadata["archive_endtime"]
    logger.info("Harvest {1} records from file '{0}'".format(status_file,records))
def process_status_file(context,metadata,status_file):
    now = timezone.now()
    context["podstatus"]["harvester"].message="{}:Begin to process pod status file '{}'".format(now.strftime("%Y-%m-%d %H:%M:%S"),metadata["resource_id"])
    context["podstatus"]["harvester"].last_heartbeat = now
    context["podstatus"]["harvester"].save(update_fields=["message","last_heartbeat"])
    if settings.PODSTATUS_STREAMING_PARSE:
        status_records = LogRecordIterator(status_file)
    else:
        with open(status_file,"r") as f:
            status_records = simdjson.loads(f.read())
    records = 0
    for record in status_records:
        records += 1
        try:
            if any(not (record.get(key) or "").strip() for key in ("clusterid","computer","namespace","poduid","containerid","pod_created","container_name","controllerkind")):
                #data is incomplete,ignore
                continue
            if record["computer"].strip().lower().startswith("aks-nodepool"):
                cluster_name = record["clusterid"].strip().rsplit("/")[-1]
            else:
                cluster_name = record["computer"].strip()
                cluster_name = cluster_name.split(".",1)[0]
            if cluster_name in context["clusters"]:
                cluster = context["clusters"][cluster_name]
            else:
                #logger.debug("find cluster {}".format(cluster_name))
                try:
                    cluster = models.Cluster.objects.get(name=cluster_name)
                except ObjectDoesNotExist as ex:
                    if settings.ENABLE_ADDED_BY_CONTAINERLOG:
                        cluster = models.Cluster(name=cluster_name,added_by_log=True)
                        cluster.save()
                    else:
                        continue
                context["clusters"][cluster_name] = cluster
            namespace_name = record["namespace"].strip()
            key = (cluster.id,namespace_name)
            if key in context["namespaces"]:
                namespace = context["namespaces"][key]
            else:
                #logger.debug("find namespace {}".format(namespace_name))
                try:
                    namespace = models.Namespace.objects.get(cluster=cluster,name=namespace_name)
                except ObjectDoesNotExist as ex:
                    if settings.ENABLE_ADDED_BY_CONTAINERLOG:
                        namespace = models.Namespace(cluster=cluster,name=namespace_name,added_by_log=True,created=pod_created,modified=pod_created)
                        namespace.save()
                    else:
                        continue
                context["namespaces"][key] = namespace
            poduid = record["poduid"].strip()
            containerid = record["containerid"].strip()
            container_name = record["container_name"].split("/")
            if len(container_name) != 2:
                raise Exception("Can't parse the container_name '{}'".format(record["container_name"]))
            elif container_name[0].strip() != poduid:
                raise Exception("The first part of the container_name '{}' should be '{}'".format(record["container_name"],poduid))
            else:
                workload_name = container_name[1].strip()
            pod_created = to_datetime(record.get("pod_created"))
            pod_started = to_datetime(record.get("pod_started"))
            podip = record.get("podip")
            max_timegenerated = to_datetime(record["max_timegenerated"])
            workload_kind = to_workload_kind(record["controllerkind"])
            key = (cluster.id,namespace.name,workload_name,workload_kind)
            if key in context["workloads"]:
                workload,workload_update_fields = context["workloads"][key]
            else:
                #logger.debug("find workload.{}/{}({})".format(namespace.name,workload_name,workload_kind))
                try:
                    #logger.debug("find workload, cluster={}, project={}, namespace={},name={},kind={}".format(cluster,namespace.project,namespace,workload_name,workload_kind))
                    workload = models.Workload.objects.get(cluster=cluster,namespace=namespace,name=workload_name,kind=workload_kind)
                except ObjectDoesNotExist as ex:
                    if settings.ENABLE_ADDED_BY_CONTAINERLOG:
                        workload = models.Workload(cluster=cluster,project=namespace.project,namespace=namespace,name=workload_name,kind=workload_kind,image="",api_version="",modified=pod_created,created=pod_created,added_by_log=True)
                        #if pod_created.date() < timezone.now().date():
                        #    workload.deleted = max_timegenerated
                        workload.save()
                    else:
                        continue
                workload_update_fields = []
                context["workloads"][key] = (workload,workload_update_fields)
            try:
                container = models.Container.objects.get(cluster=cluster,containerid=containerid)
                previous_workload = container.workload
                previous_namespace = container.namespace
            except ObjectDoesNotExist as ex:
                container = models.Container(cluster=cluster,containerid=containerid)
                previous_workload = None
                previous_namespace = None
            update_fields = set_fields(container,[
                ("namespace",namespace),
                ("workload",workload),
                ("pod_created",pod_created),
                ("pod_started",pod_started),
                ("podip",podip),
                ("poduid",poduid),
                ("last_checked",to_datetime(record["max_timegenerated"]))
            ])
            """
            if workload and workload.deleted and workload.deleted < max_timegenerated:
                workload.deleted = max_timegenerated
                if "deleted" not in workload_update_fields:
                    workload_update_fields.append("deleted")
            """
            if previous_workload and previous_workload != workload and previous_workload.added_by_log and previous_workload.namespace.name == "unknown":
                context["podstatus"]["removable_workloads"].add(previous_workload)
                context["podstatus"]["orphan_namespaces"].add(previous_workload.namespace)
            if container.pk is None:
                container.save()
            elif update_fields:
                container.save(update_fields=update_fields)
        except Exception as ex:
            #delete already added records from this log file
            logger.error("Failed to parse pod status record({}).{}".format(record,str(ex)))
            continue
    logger.info("Harvest {1} records from file '{0}'".format(status_file,records))
def jsonloads(cls, string):
    return json.loads(string)
def jl(cls, line):
    return json.loads(line)
def process_status_file(context,metadata,status_file):
    now = timezone.now()
    context["logstatus"]["harvester"].message="{}:Begin to process container log file '{}'".format(now.strftime("%Y-%m-%d %H:%M:%S"), metadata["resource_id"])
    context["logstatus"]["harvester"].last_heartbeat = now
    context["logstatus"]["harvester"].save(update_fields=["message","last_heartbeat"])
    if settings.CONTAINERLOG_STREAMING_PARSE:
        status_records = LogRecordIterator(status_file)
    else:
        with open(status_file,"r") as f:
            status_records = simdjson.loads(f.read())
    records = 0
    for record in status_records:
        try:
            if any(not (record.get(key) or "").strip() for key in ("computer","containerid","logentry","logtime")):
                #data is incomplete,ignore
                continue
            logtime = to_datetime(record["logtime"])
            containerid = record["containerid"].strip()
            message = record["logentry"].strip()
            if not message:
                continue
            message = message.replace("\x00","").replace("\\n","\n")
            message = message.strip()
            """
            #try to get log time from message.
            datestr = message[0:19]
            for pattern in ["%Y-%m-%d %H:%M:%S"]:
                try:
                    logtime = timezone.make_aware(datetime.datetime.strptime(datestr,pattern))
                    break
                except:
                    continue
            """
            source = (record["logentrysource"] or "").strip() or None
            computer = record["computer"].strip()
            cluster = None
            clustername = None
            if computer in context["clusters"]:
                cluster = context["clusters"][computer]
            elif record.get("resourceid"):
                resourceid = record["resourceid"].strip().rsplit("/",1)[-1]
                if resourceid in context["clusters"]:
                    cluster = context["clusters"][resourceid]
                else:
                    clustername = resourceid
            else:
                clustername = computer
            if not cluster:
                try:
                    cluster = models.Cluster.objects.get(name=clustername)
                except ObjectDoesNotExist as ex:
                    if settings.ENABLE_ADDED_BY_CONTAINERLOG:
                        cluster = models.Cluster(name=clustername,added_by_log=True)
                        cluster.save()
                    else:
                        continue
                context["clusters"][clustername] = cluster
            """
            if cluster.name != 'az-k3s-oim01':
                continue
            """
            key = (cluster.id,containerid)
            if key in context["logstatus"]["containers"]:
                container,container_update_fields = context["logstatus"]["containers"][key]
            else:
                try:
                    container = models.Container.objects.get(cluster=cluster,containerid=containerid)
                except ObjectDoesNotExist as ex:
                    if settings.CONTAINERLOG_FAILED_IF_CONTAINER_NOT_FOUND:
                        raise Exception("The containerId({}) in log resource({}) Not Found".format(containerid,metadata))
                    else:
                        continue
                container_update_fields = []
                context["logstatus"]["containers"][key] = (container,container_update_fields)
            key = (cluster.id,containerid)
            if key in context["logstatus"]["containerlogs"]:
                containerlog = context["logstatus"]["containerlogs"][key]
                containerlog.archiveid = metadata["resource_id"]
            else:
                containerlog = models.ContainerLog(archiveid=metadata["resource_id"])
                context["logstatus"]["containerlogs"][key] = containerlog
            result = container.workload.containerimage.imagefamily.get_loglevel(message)
            if result:
                level,newmessage = result
            else:
                level = None
                newmessage = False
                for log_level_re,value in log_levels:
                    if log_level_re.search(message):
                        level,newmessage = value
                        break
            if level is None:
                if source.lower() in ('stderr',):
                    level = models.ContainerLog.ERROR
                else:
                    level = models.ContainerLog.INFO
            if not containerlog.logtime:
                containerlog.id = None
                containerlog.container = container
                containerlog.logtime = logtime
                containerlog.latest_logtime = logtime
                containerlog.source = source
                #containerlog.message = "{}:{}".format(logtime.strftime("%Y-%m-%d %H:%M:%S.%f"),message)
                containerlog.message = message
                containerlog.level = level
            elif newmessage or logtime >= (containerlog.latest_logtime + datetime.timedelta(seconds=1)) or containerlog.source != source:
                records += 1
                containerlog.save()
                _add_notify_log(context,containerlog)
                container = containerlog.container
                update_workload_latest_containers(context,containerlog)
                key = (container.cluster.id,container.containerid)
                if key in context["logstatus"]["containers"]:
                    container,container_update_fields = context["logstatus"]["containers"][key]
                else:
                    container_update_fields = []
                    context["logstatus"]["containers"][key] = (container,container_update_fields)
                container_update_fields = set_fields(container,[
                    ("log", True),
                    ("warning", True if containerlog.level == models.ContainerLog.WARNING else container.warning),
                    ("error", True if containerlog.level == models.ContainerLog.ERROR else container.error),
                ],container_update_fields)
                if newmessage and containerlog.logtime >= logtime:
                    #more than one logs at the same time, add one millesconds to the logtime because of unique index
                    logtime = containerlog.logtime + datetime.timedelta(milliseconds=1)
                containerlog.id = None
                containerlog.container = container
                containerlog.logtime = logtime
                containerlog.latest_logtime = logtime
                containerlog.source = source
                #containerlog.message = "{}:{}".format(logtime.strftime("%Y-%m-%d %H:%M:%S.%f"),message)
                containerlog.message = message
                containerlog.level = level
            else:
                if level > containerlog.level:
                    containerlog.level = level
                #containerlog.message = "{}\n{}:{}".format(containerlog.message,logtime.strftime("%Y-%m-%d %H:%M:%S.%f"),message)
                containerlog.message = "{}\n{}".format(containerlog.message,message)
                if logtime > containerlog.latest_logtime:
                    containerlog.latest_logtime = logtime
        except Exception as ex:
            #delete already added records from this log file
            logger.error("Failed to parse container log record({}).{}".format(record,str(ex)))
            continue
    #save the last message
    containerlogs = [o for o in context["logstatus"]["containerlogs"].values() if o.logtime and o.container]
    containerlogs.sort(key=lambda o:o.logtime)
    for containerlog in containerlogs:
        records += 1
        containerlog.save()
        _add_notify_log(context,containerlog)
        container = containerlog.container
        update_workload_latest_containers(context,containerlog)
        key = (container.cluster.id,container.containerid)
        if key in context["logstatus"]["containers"]:
            container,container_update_fields = context["logstatus"]["containers"][key]
        else:
            container_update_fields = []
            context["logstatus"]["containers"][key] = (container,container_update_fields)
        container_update_fields = set_fields(container,[
            ("log", True),
            ("warning", True if containerlog.level == models.ContainerLog.WARNING else container.warning),
            ("error", True if containerlog.level == models.ContainerLog.ERROR else container.error),
        ],container_update_fields)
        containerlog.id = None
        containerlog.logtime = None
        containerlog.level = None
        containerlog.message = None
        containerlog.source = None
        containerlog.container = None
        containerlog.latest_logtime = None
    #save terminated containers
    terminated_keys = []
    for key,value in context["logstatus"]["containers"].items():
        container,container_update_fields = value
        if container.container_terminated and (container.container_terminated + datetime.timedelta(minutes=30)) < metadata["archive_endtime"]:
            terminated_keys.append(key)
        if not container.pk:
            container.save()
        elif container_update_fields:
            container.save(update_fields=container_update_fields)
            container_update_fields.clear()
    #delete terminated containers from cache
    for key in terminated_keys:
        del context["logstatus"]["containers"][key]
        if key in context["logstatus"]["containerlogs"]:
            del context["logstatus"]["containerlogs"][key]
    logger.info("Harvest {1} records from file '{0}'".format(status_file,records))
          procedure='s.get_raw_data',
          detail=response,
          message='failed to get raw data')
    sys.exit(1)

status, response = s.get_iteration_set(name='test')
if status is False:
    s.log(level='ERROR',
          app='test',
          procedure='s.get_iteration_set',
          detail=response,
          message='failed to get iteration set')
    sys.exit(1)

status, response = s.get_unreviewed_index_records(module='scraper',
                                                  name='test',
                                                  datasource='test')
if status is False:
    s.log(level='ERROR',
          app='test',
          procedure='s.get_unreviewed_index_records',
          detail=response,
          message='failed to get unreviewed scrapeindex records')
    sys.exit(1)
else:
    si_record = simdjson.loads(
        response['get_unreviewed_index_records'])[0]['_id']

print('passed all tests')
sys.exit(0)
# orjson only outputs bytes, but often we need unicode:
print('---dumps---')
benchmark("orjson", lambda s: orjson.dumps(s).decode('utf-8'))
benchmark("Python", json.dumps)
benchmark("rapidjson", rapidjson.dumps)
benchmark("ujson", ujson.dumps)
benchmark("simplejson", simplejson.dumps)
benchmark("hyperjson", hyperjson.dumps)

print('---loads---')
benchmark_load("orjson", lambda x: orjson.loads(x.encode('utf-8')))
benchmark_load("Python", json.loads)
benchmark_load("rapidjson", rapidjson.loads)
benchmark_load("ujson", ujson.loads)
benchmark_load("simplejson", simplejson.loads)
benchmark_load("hyperjson", hyperjson.loads)
benchmark_load("pysimdjson-load", lambda x: simdjson.loads(x.encode('utf-8')))

# dumps
# orjson     1.227565050125122
# Python     5.861892938613892
# rapidjson  2.87353777885437
# ujson      1.669421911239624

# loads
# orjson     2.642509937286377
# Python     4.873814105987549
# rapidjson  3.068044900894165
# ujson      1.7971441745758057

# orjson==2.6.1
# python-rapidjson==0.9.1
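# The benchmark/benchmark_load helpers are not shown in the snippet above. A
# minimal sketch of what they might look like, assuming a timeit-style loop over
# a shared sample document; the helper names, iteration count, and sample file
# path are assumptions, not the original harness.
import json
import timeit

with open('jsonexamples/canada.json', 'r') as fin:  # assumed sample document
    SAMPLE_STR = fin.read()
SAMPLE_OBJ = json.loads(SAMPLE_STR)

def benchmark(name, dumps, number=10):
    """Time a dumps-style callable over the decoded sample and print the result."""
    elapsed = timeit.timeit(lambda: dumps(SAMPLE_OBJ), number=number)
    print(name, elapsed)

def benchmark_load(name, loads, number=10):
    """Time a loads-style callable over the raw JSON text and print the result."""
    elapsed = timeit.timeit(lambda: loads(SAMPLE_STR), number=number)
    print(name, elapsed)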
def process_log_file(context, metadata, log_file):
    if settings.NGINXLOG_STREAMING_PARSE:
        log_records = LogRecordIterator(log_file)
    else:
        with open(log_file, "r") as f:
            log_records = simdjson.loads(f.read())
    records = 0
    webserver_records = {}
    webserver = None
    key = None
    original_request_path = None
    for record in log_records:
        records += 1
        try:
            if "?" in record["request_path"]:
                request_path, path_parameters = record["request_path"].split("?", 1)
                if path_parameters:
                    path_parameters = path_parameters.replace("%00", "").replace("\x00", "")
                    path_parameters = [
                        (k, v[0] if len(v) == 1 else v)
                        for k, v in QueryDict(path_parameters).lists()
                    ]
                    path_parameters.sort(key=lambda o: o[0].lower())
                    all_path_parameters = [o[0] for o in path_parameters]
                    path_parameters = WebAppAccessLog.to_path_parameters(path_parameters)
            else:
                request_path = record["request_path"]
                path_parameters = None
                all_path_parameters = None
            try:
                http_status = int(record["http_status"])
            except:
                http_status = 0
            original_request_path = request_path
            if not request_path:
                request_path = "/"
                original_request_path = request_path
            elif len(request_path) > 512:
                request_path = request_path[0:512]
            parameters_changed, path_parameters = RequestParameterFilter.filter_parameters(
                record["webserver"],
                request_path,
                path_parameters,
                parameter_filters=context["parameter_filters"],
                parameter_filter_map=context["parameter_filter_map"])
            path_changed, request_path = RequestPathNormalizer.normalize_path(
                record["webserver"],
                request_path,
                path_normalizers=context["path_normalizers"],
                path_normalizer_map=context["path_normalizer_map"],
                path_filter=context["path_filter"])
            if request_path is None:
                continue
            if webserver:
                if record["webserver"] != webserver:
                    for log_record in webserver_records.values():
                        log_record.save()
                    webserver_records.clear()
                    webserver = record["webserver"]
            else:
                webserver = record["webserver"]
            key = (request_path, http_status, path_parameters)
            accesslog = webserver_records.get(key)
            if accesslog:
                accesslog.requests += int(record["requests"])
                accesslog.total_response_time += to_float(record["total_response_time"])
                if accesslog.max_response_time < to_float(record["max_response_time"]):
                    accesslog.max_response_time = to_float(record["max_response_time"])
                if accesslog.min_response_time > to_float(record["min_response_time"]):
                    accesslog.min_response_time = to_float(record["min_response_time"])
                accesslog.avg_response_time = accesslog.total_response_time / accesslog.requests
                if all_path_parameters:
                    if accesslog.all_path_parameters:
                        changed = False
                        for param in all_path_parameters:
                            if param not in accesslog.all_path_parameters:
                                accesslog.all_path_parameters.append(param)
                                changed = True
                        if changed:
                            accesslog.all_path_parameters.sort()
                    else:
                        accesslog.all_path_parameters = all_path_parameters
            else:
                accesslog = WebAppAccessLog(
                    log_starttime=metadata["archive_starttime"],
                    log_endtime=metadata["archive_endtime"],
                    webserver=record["webserver"],
                    request_path=request_path,
                    http_status=http_status,
                    path_parameters=path_parameters,
                    all_path_parameters=all_path_parameters,
                    requests=int(record["requests"]),
                    max_response_time=to_float(record["max_response_time"]),
                    min_response_time=to_float(record["min_response_time"]),
                    total_response_time=to_float(record["total_response_time"]))
                accesslog.avg_response_time = accesslog.total_response_time / accesslog.requests
                if accesslog.webserver not in context.get("webapps", {}):
                    if "webapps" not in context:
                        context["webapps"] = {}
                    context["webapps"][accesslog.webserver] = WebApp.objects.filter(
                        name=accesslog.webserver).first()
                accesslog.webapp = context["webapps"][accesslog.webserver]
                if accesslog.webapp and not accesslog.webapp.redirect_to and not accesslog.webapp.redirect_to_other:
                    if accesslog.webapp not in context.get("webapplocations", {}):
                        if "webapplocations" not in context:
                            context["webapplocations"] = {}
                        context["webapplocations"][accesslog.webapp] = list(
                            WebAppLocation.objects.filter(
                                app=accesslog.webapp).order_by("-score"))
                    accesslog.webapplocation = accesslog.webapp.get_matched_location(
                        original_request_path,
                        context["webapplocations"][accesslog.webapp])
                    if not accesslog.webapplocation and accesslog.http_status < 300 and accesslog.http_status >= 200:
                        logger.warning(
                            "Can't find the app location for request path({1}) in web application({0})"
                            .format(accesslog.webapp, accesslog.request_path))
                webserver_records[key] = accesslog
        except Exception as ex:
            #delete already added records from this log file
            WebAppAccessLog.objects.filter(
                log_starttime=metadata["archive_starttime"]).delete()
            logger.error(
                "Failed to parse the nginx access log record({}).{}".format(
                    record, traceback.format_exc()))
            raise Exception(
                "Failed to parse the nginx access log record({}).{}".format(
                    record, str(ex)))
    for log_record in webserver_records.values():
        log_record.save()
    logger.info("Harvest {1} records from log file '{0}'".format(log_file, records))
def json(self) -> dict:
    return simdjson.loads(self.content)
def readl_simdjson(filepath: str):
    parser = simdjson.Parser()
    with open(filepath) as fp:
        return [simdjson.loads(line) for line in fp]
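# Usage sketch, not from the source: the path is a placeholder for a
# newline-delimited JSON (JSON Lines) file; each line becomes one decoded record.
records = readl_simdjson("events.ndjson")
print(len(records), "records; first record keys:", list(records[0].keys()))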
def query(self, bbox, allow_missing=False):
    """
    For the specified bounding box (or equivalent representation),
    list all segment ids enclosed within it.

    If allow_missing is set, then don't raise an error if an index
    file is missing.

    Returns: set(labels)
    """
    bbox = Bbox.create(bbox, context=self.physical_bounds, autocrop=True)
    original_bbox = bbox.clone()
    bbox = bbox.expand_to_chunk_size(
        self.chunk_size.astype(self.physical_bounds.dtype),
        offset=self.physical_bounds.minpt)

    if bbox.subvoxel():
        return []

    labels = set()
    fast_path = bbox.contains_bbox(self.physical_bounds)

    if self.sql_db and fast_path:
        conn = connect(self.sql_db)
        cur = conn.cursor()
        cur.execute("select label from file_lookup")
        while True:
            rows = cur.fetchmany(size=2**20)
            if len(rows) == 0:
                break
            # Sqlite only stores signed integers, so we need to coerce negative
            # integers back into unsigned with a bitwise and.
            labels.update((int(row[0]) & 0xffffffffffffffff for row in rows))
        cur.close()
        conn.close()
        return labels

    index_files = self.index_file_paths_for_bbox(bbox)
    num_blocks = int(np.ceil(len(index_files) / 10000))
    for index_files_subset in tqdm(sip(index_files, 10000),
                                   total=num_blocks,
                                   desc="Block",
                                   disable=((not self.config.progress) or (num_blocks == 1))):
        results = self.fetch_index_files(index_files_subset)
        parser = simdjson.Parser()
        for filename, content in tqdm(results.items(),
                                      desc="Decoding Labels",
                                      disable=(not self.config.progress)):
            if content is None:
                if allow_missing:
                    continue
                else:
                    raise SpatialIndexGapError(filename + " was not found.")
            # The bbox test saps performance a lot
            # but we can skip it if we know 100% that
            # the labels are going to be inside. This
            # optimization is important for querying
            # entire datasets, which is contemplated
            # for shard generation.
            if fast_path:
                res = parser.parse(content).keys()
                labels.update((int(label) for label in res))  # fast path: 16% CPU
            else:
                res = simdjson.loads(content)
                for label, label_bbx in res.items():
                    label = int(label)
                    label_bbx = Bbox.from_list(label_bbx)
                    if Bbox.intersects(label_bbx, original_bbox):
                        labels.add(label)
    return labels
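# Hedged usage sketch, not from the source: `spatial_index` stands for whatever
# object exposes the query method above, and the bounding box literal is a
# placeholder for one of the "equivalent representations" the docstring mentions.
labels = spatial_index.query([0, 0, 0, 512, 512, 64], allow_missing=True)
print(len(labels), "labels intersect the region")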
import simdjson

with open('sample.json', 'rb') as f:
    document = simdjson.loads(f.read())

print(document)
print(type(document))
print(document["type"])
print(document["created_at"])
print(document["id"])
print(document["actor"])
for k, v in document["actor"].items():
    print(k, v)
print(document["repo"])
for k, v in document["repo"].items():
    print(k, v)
print(document["public"])
print(document["payload"])
for k, v in document["payload"].items():
    print(k, v)
def test_valid_smallblock():
    assert simdjson.loads(b'{"test": "value"}') == {'test': 'value'}