Example #1
def ingestion_callback(message):
    data, metadata = pulsarclient.callback_info(message)

    user_id = metadata['user_id']
    patient_id = metadata['patient_id']
    ingestion_time = metadata['ingestion_time']
    app = metadata['app_name']
    log.info(
        f'DEBUG: {type(data["Patient"][0]["id"])} {data["Patient"][0]["id"]}')
    log.info(f'Received patient data: {type(data)}')
    log.info(f'Received patient data: {data["Patient"]}')
    # bundle = {
    #     'resourceType': 'Bundle',
    #     'type': 'transaction',
    #     'entry': []
    # }
    # for entry in data:
    #     bundle['entry'].append(entry)

    # sanitize_null
    sanitized_data = defaultdict(list)
    for resourceType, resources in data.items():
        if resources:
            for resource in resources:
                # TODO: hack to remove None until we can make nulls pass Avro validation
                new_resource = remove_none(resource)
                sanitized_data[resourceType].append(new_resource)

    log.info(sanitized_data)

    # Keep the plain schema dict for fastavro; build the Pulsar AvroSchema
    # wrapper separately instead of clobbering the dict, since validate()
    # and schemaless_writer() below expect the fastavro schema.
    fastavro_schema = avroutil.get_bundle_schema()
    pulsar_schema = pulsarclient.AvroSchema(
        schema_definition=fastavro_schema, schema_name=fastavro_schema['name'])

    fastavro.validate(sanitized_data, fastavro_schema)
    buffer = io.BytesIO()
    fastavro.schemaless_writer(buffer, fastavro_schema, sanitized_data)
    buffer.seek(0)

    upload_file(
        buffer,
        f"{user_id}/{patient_id}/{ingestion_time}/ingested/bundle.avro", app,
        {})

    if PROFILING:
        profile.print_stats()
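
This callback leans on a remove_none helper that is not shown in the snippet. A minimal sketch of what such a helper might look like, purely an assumption, recursively dropping None values so they do not trip Avro validation:

def remove_none(obj):
    # Hypothetical helper: recursively strip None values from dicts and lists.
    if isinstance(obj, dict):
        return {k: remove_none(v) for k, v in obj.items() if v is not None}
    if isinstance(obj, list):
        return [remove_none(item) for item in obj if item is not None]
    return obj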
Example #2
    def validate(self, record):
        """Validate packet contents against this schema.

        Parameters
        ----------
        record : `dict`
            The data to be checked for schema compliance.

        Returns
        -------
        valid : `bool`
            Whether or not the data complies with the schema.
        """
        parsed_schema = fastavro.parse_schema(self.definition)
        return fastavro.validate(record, parsed_schema, raise_errors=False)
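
Passing raise_errors=False makes fastavro.validate return a boolean instead of raising ValidationError, which is what the docstring promises. A minimal usage sketch (the instance and record are hypothetical):

# `schema` is an instance of the class above.
ok = schema.validate({'alertId': 123})
print('valid' if ok else 'invalid')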
Example #3
    def test_poll(self):
        topic, alert, key = self.consumer.poll()
        self.assertIsNotNone(alert)
        self.assertTrue(fastavro.validate(alert, self.consumer._parsed_schema))
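
The test assumes the consumer caches a schema already parsed by fastavro. A sketch of how such a _parsed_schema might be prepared (the file name and attribute wiring are assumptions):

import json
import fastavro

with open('alert.avsc') as fp:  # hypothetical schema file
    parsed_schema = fastavro.parse_schema(json.load(fp))
# a consumer would typically stash this, e.g. self._parsed_schema = parsed_schema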
Example #4
def convert_to_avro(schema_path: str,
                    log_path: str,
                    output_path: Optional[str] = None,
                    delete_existing_avro_file: bool = True,
                    validate_percentage: float = 100.0,
                    avro_batch_size: int = 2000,
                    offset: int = 0,
                    max_lines: Optional[int] = None) -> dict:
    """Converts a log file to Avro format."""

    t0 = time.time()

    def _get_output_path(input_path: str):
        if input_path.endswith('.bz2') or input_path.endswith('.gz'):
            return f'{os.path.splitext(input_path)[0]}.avro'
        else:
            return f'{input_path}.avro'

    if not output_path:
        output_path = _get_output_path(log_path)

    print(f'{datetime.datetime.utcnow().isoformat()[:19]}Z '
          f'Converting log file {log_path!r} '
          f'to Avro file {output_path!r} '
          f'using schema {schema_path!r} '
          f'and {validate_percentage} % output validation '
          f'(PID {os.getpid()})...')

    with open(schema_path, 'rb') as schema_file:
        avro_schema = fastavro.parse_schema(json.loads(schema_file.read()))

    if delete_existing_avro_file and os.path.exists(output_path):
        os.remove(output_path)

    records, records_validated, batch_sizes = [], 0, []

    def _write_avro_output():
        # A single 'a+b' open covers both cases: the mode creates the file
        # when missing, and fastavro.writer appends when the file already
        # starts with an Avro header.
        with open(output_path, 'a+b') as avro_file:
            fastavro.writer(avro_file,
                            avro_schema,
                            records,
                            codec='deflate')
        batch_sizes.append(len(records))
        print(f'{datetime.datetime.utcnow().isoformat()[:19]}Z '
              f'Wrote {len(records)} records '
              f'in batch {str(len(batch_sizes)).zfill(4)} '
              f'to {output_path!r} (PID {os.getpid()}).')
        records.clear()

    log_file = open_log_file(log_path)
    lines_in, lines_ignored = 0, 0
    decode_errors, validation_errors, total_errors = 0, 0, 0
    typecasting = {
        'field_238_to_int': 0,
        'field_256_to_int': 0,
        'field_256_to_null': 0,
        'field_255_to_str': 0
    }

    for log_line in log_file:
        lines_in += 1
        if lines_in < offset:
            continue
        if max_lines and (lines_in - offset) >= max_lines:
            break

        try:
            server_date, versionstring, token, ip, raw_json_data = \
                log_line.decode().split('\t', maxsplit=4)
            if raw_json_data == '{"d":}\n':
                lines_ignored += 1
                continue

            json_data = json.loads(raw_json_data)['d']
            if len(json_data) != 7:
                raise ValueError(f'Data is not a 7-tuple ({json_data!r}).')

            (log_format, client_date_orig, project_id, version, uuid,
             event_name, fields) = json_data

            tz_offset = client_date_orig[-6:]
            if len(tz_offset) != 6 or not tz_offset.startswith(('-', '+')):
                raise ValueError(f'Malformed date {client_date_orig!r}.')

            avro_record = {
                'server_date': server_date,
                'datestamp': server_date.split(maxsplit=1)[0],
                'versionstring': versionstring,
                'token': token,
                'ip': ip,
                'log_format': log_format,
                'client_date_orig': client_date_orig,
                'client_date': client_date_orig.split(maxsplit=1)[0],
                'client_local_date': client_date_orig[:-6],
                'tz_offset': tz_offset,
                'project_id': project_id,
                'version': version,
                'uuid': uuid,
                'event_name': event_name
            }

            for field, value in fields.items():
                if value:
                    if field == '238':
                        if not isinstance(value, int):
                            typecasting['field_238_to_int'] += 1
                            value = int(value)
                    elif field == '256':
                        if not isinstance(value, int):
                            typecasting['field_256_to_int'] += 1
                            value = int(value)
                else:
                    if field == '256':
                        if not isinstance(value, int):
                            typecasting['field_256_to_null'] += 1
                            value = None
                if field == '255':
                    if not isinstance(value, str):
                        typecasting['field_255_to_str'] += 1
                        value = str(value)

                avro_record[f"c_{str(field).replace('.', '_')}"] = value

            # Always validate the first avro_batch_size records, then fall
            # back to percentage-based sampling.
            if validate_percentage and records_validated < avro_batch_size:
                records_validated += 1
                fastavro.validate(avro_record, avro_schema)
            elif random.random() * 100.0 <= validate_percentage:
                records_validated += 1
                fastavro.validate(avro_record, avro_schema)

            records.append(avro_record)

            if len(records) >= avro_batch_size:
                _write_avro_output()

        except ValueError as parse_error:
            decode_errors += 1
            total_errors += 1
            print(f'{datetime.datetime.utcnow().isoformat()[:19]}Z '
                  f'{parse_error.__class__.__name__}, line {lines_in} in '
                  f'{log_path!r}: {parse_error} / Content: {log_line!r}')

        except ValidationError as validation_err:
            validation_errors += 1
            total_errors += 1
            print(f'{datetime.datetime.utcnow().isoformat()[:19]}Z '
                  f'{validation_err.__class__.__name__}, line {lines_in} in '
                  f'{log_path!r}: {validation_err} / Content: {log_line!r}')

        finally:
            if (total_errors > 50
                    and total_errors / (lines_in - offset) > 0.001):
                raise RuntimeError(f'Excessive error rate '
                                   f'{total_errors / (lines_in - offset)}.')

    if records:
        _write_avro_output()

    duration_sec = int(time.time() - t0)

    print(f'{datetime.datetime.utcnow().isoformat()[:19]}Z '
          f'Converted {lines_in - offset} lines to Avro records '
          f'in {duration_sec} seconds, '
          f'dropping {lines_ignored} empty lines, '
          f'failing to decode {decode_errors} lines, '
          f'invalidating {validation_errors} records '
          f'(in {records_validated} validations), '
          f'with casting summary {typecasting!r} '
          f'(PID {os.getpid()}).')

    return {
        'output_path': os.path.abspath(output_path),
        'metrics': {
            'lines_in': lines_in,
            'lines_ignored': lines_ignored,
            'decode_errors': decode_errors,
            'validation_errors': validation_errors,
            'records_out': sum(batch_sizes),
            'records_validated': records_validated,
            'typecasting': typecasting,
            'duration_sec': duration_sec
        }
    }
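
A usage sketch for the converter above (all paths are hypothetical):

result = convert_to_avro(schema_path='events.avsc',
                         log_path='events.log.gz',
                         validate_percentage=10.0)
print(result['metrics']['records_out'], 'records written to',
      result['output_path'])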
Example #5
def encode(obj, schema):
    fastavro.validate(obj, schema)
    buffer = io.BytesIO()
    fastavro.schemaless_writer(buffer, schema, obj)
    return buffer.getvalue()
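
A matching decoder for round-tripping: schemaless payloads carry no embedded schema, so the reader must be given the same schema used for writing. A minimal sketch:

import io
import fastavro

def decode(blob, schema):
    # Inverse of encode(): read a single schemaless record back from bytes.
    return fastavro.schemaless_reader(io.BytesIO(blob), schema)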
Example #6
    def __init__(self, data, schema=None):
        """
        :param data: dict, list of dicts, JSON str, file, bytes
        :param schema: dict, JSON str, or None
        """
        self._last_error = None  # Last error captured
        self._object_data = None
        self._json_data = None
        self._avro_data = None
        self._origin = None
        self._schema = None
        self._schema_origin = None

        self._ok = False
        if schema is None:
            self._schema = None
        elif isinstance(schema, str):
            try:
                success, schema, origin = AvroTools.fetch_json(schema)
                if success:
                    schema = json.loads(schema)
                    self._schema_origin = origin
                else:
                    schema = None

            except Exception as e:
                self._last_error = str(e)
                schema = None

        if schema is not None:
            try:
                self._schema = parse_schema(schema)
                if self._schema_origin is None:
                    self._schema_origin = type(schema).__name__
            except Exception as e:
                self._last_error = str(e)
                schema = None

        if isinstance(data, bytes):
            b_avro = False
            try:
                bdata = io.BytesIO(data)
                if is_avro(bdata):
                    self._origin = 'binary_avro'
                    bdata.seek(0)
                    b_avro = True
                    avro_reader = reader(bdata)
                    self._schema = avro_reader.schema
                    obj_data = list(avro_reader)
                    if not obj_data:
                        self._object_data = None
                    elif len(obj_data) == 1:
                        self._object_data = obj_data[0]
                    else:
                        self._object_data = obj_data
                    self._ok = True
                else:
                    self._origin = 'binary_string'
                    data = data.decode('utf-8')

            except Exception as e:
                self._last_error = ('Avro binary' if b_avro else
                                    'String decoding') + f' error: {e}'

        if isinstance(data, str):
            success, json_data, origin = AvroTools.fetch_json(data)
            if not self._origin:
                self._origin = origin
            if not success:
                self._last_error = json_data
                return

            try:
                self._object_data = json.loads(json_data)
                self._json_data = json_data
                if self._schema is None:
                    self._ok = True
            except Exception as e:
                self._last_error = f'JSON parsing error: {e}'

        elif isinstance(data, (dict, list)):
            self._origin = type(data).__name__
            self._object_data = data
            if self._schema is None:
                self._ok = True

        if self._object_data is not None and not self._ok and self._schema is not None:
            try:
                validate(self._object_data, self._schema)
                self._ok = True
            except Exception as e:
                self._last_error = f'Schema error: {e}'
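
A usage sketch for this constructor (the class name AvroObject and the schema dict are assumptions; the snippet only shows __init__):

schema = {'type': 'record', 'name': 'User',
          'fields': [{'name': 'name', 'type': 'string'}]}
obj = AvroObject({'name': 'Ada'}, schema=schema)  # hypothetical class name
# obj._ok is True when the data validated against the parsed schema;
# obj._last_error holds the most recent failure message otherwise.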
Example #7
import json

import avro.schema
import fastavro

with open("./record-copy.avsc", "r") as fp:
    schema = json.load(fp)

with open("./payload.json", "r") as fp:
    payload = json.load(fp)

# Validate the first record with fastavro, then cross-check that the
# reference avro implementation also accepts the schema.
fastavro.validate(datum=payload[0], schema=schema)
avro.schema.SchemaFromJSONData(schema)
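
For checking every record at once, fastavro also ships a batch helper. A minimal sketch using the same payload and schema as above:

from fastavro.validation import validate_many

# Returns True on success; by default raises ValidationError describing
# every record that failed.
validate_many(payload, schema)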