Example #1
    def run(self, n):
        # JSON Serializer
        # serializer = ajs.AvroJsonSerializer(self.movies_schema)
        # json_data = serializer.to_json(self.movies_data)
        total_ser = 0
        total_deser = 0
        bytes_len = 0
        for _ in range(n):
            datum_writer = DatumWriter(self.movies_schema)
            bytes_writer = io.BytesIO()

            encoder = BinaryEncoder(bytes_writer)
            tic = timeit.default_timer()
            datum_writer.write(self.movies_data, encoder)
            elapsed = timeit.default_timer() - tic
            payload = bytes_writer.getvalue()
            total_ser = total_ser + elapsed
            bytes_len = len(payload)

            bytes_reader = io.BytesIO(payload)
            decoder = BinaryDecoder(bytes_reader)
            reader = DatumReader(self.movies_schema)
            tic2 = timeit.default_timer()
            movies = reader.read(decoder)
            elapsed2 = timeit.default_timer() - tic2
            total_deser = total_deser + elapsed2

        self.logger.log(logging.INFO, "serialized len: %s bytes", bytes_len)
        avg_ser = (total_ser * (10 ** 9)) / n
        avg_deser = (total_deser * (10 ** 9)) / n
        self.logger.log(logging.INFO, "Serialization time (avg ns/op): %s", avg_ser)
        self.logger.log(logging.INFO, "De-serialization time (avg ns/op): %s", avg_deser)
Example #2
def read_value(schema: TypedSchema, bio: io.BytesIO):
    if schema.schema_type is SchemaType.AVRO:
        reader = DatumReader(schema.schema)
        return reader.read(BinaryDecoder(bio))
    if schema.schema_type is SchemaType.JSONSCHEMA:
        value = load(bio)
        try:
            schema.schema.validate(value)
        except ValidationError as e:
            raise InvalidPayload from e
        return value
    raise ValueError("Unknown schema type")
Example #3
def deserialize_avro(binary_data, schema):
    """
    Function used to deserialize an avro binary data
    :param schema: avro schema of binary data
    :param binary_data: event data in binary encoded (bytes)
    :return: deserialized data and corresponding schema
    """
    bytes_reader = io.BytesIO(binary_data)
    decoder = BinaryDecoder(bytes_reader)
    reader = DatumReader(schema)
    data = reader.read(decoder)
    return data, schema
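
A possible call site for deserialize_avro, assuming the schema and payload files below (both hypothetical):

import avro.schema

schema = avro.schema.parse(open("event.avsc").read())  # hypothetical schema file
with open("event.bin", "rb") as f:                     # hypothetical payload file
    data, used_schema = deserialize_avro(f.read(), schema)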
Example #4
def test_sanity():
  """

  Ensures that our "base" and "good" schemas are actually forwards- and
  backwards-compatible

  """
  # fst schema / record
  fst_schema = schema.parse(open("%s/MyRecord.base.avsc" % BASE_DIR).read())
  fst_writer = DatumWriter(writers_schema=fst_schema)
  fst_record = {
      "fieldWithoutDefaultValue": 0,
      "properField": 0,
      "enumField": "A",
      "unionField": None,
      "arrayField": ["world"],
      "mapField": {"hello": "world"},
      "fixedField": "aaaaaaaaaaaaaaaa"
  }

  # sec schema / record
  sec_schema = schema.parse(open("%s/MyRecord.good.avsc" % BASE_DIR).read())
  sec_writer = DatumWriter(writers_schema=sec_schema)
  sec_record = {
      "fieldWithoutDefaultValue": 0,
      "properField2": 0,
      "enumField": "B",
      "unionField": None,
      "arrayField": ["world"],
      "fixedField": "bbbbbbbbbbbbbbbb"
  }

  # Encode record w/ fst
  fst_buf = StringIO.StringIO()
  fst_encoder = BinaryEncoder(fst_buf)
  fst_writer.write(fst_record, fst_encoder)
  fst_data = fst_buf.getvalue()

  # Encode record w/ sec
  sec_buf = StringIO.StringIO()
  sec_encoder = BinaryEncoder(sec_buf)
  sec_writer.write(sec_record, sec_encoder)
  sec_data = sec_buf.getvalue()

  # writers == fst, readers == sec
  sec_reader = DatumReader(writers_schema=fst_schema, readers_schema=sec_schema)
  sec_decoder = BinaryDecoder(StringIO.StringIO(fst_data))
  sec_from_fst = sec_reader.read(sec_decoder) # no exception -> good

  # writers == sec, readers == fst
  fst_reader = DatumReader(writers_schema=sec_schema, readers_schema=fst_schema)
  fst_decoder = BinaryDecoder(StringIO.StringIO(sec_data))
  fst_from_sec = fst_reader.read(fst_decoder) # no exception -> good
Example #5
def deserialize(flight_info_bytes):
    if flight_info_bytes is not None:
        bytes_reader = BytesIO(flight_info_bytes)
        decoder = BinaryDecoder(bytes_reader)
        schema_flight_info = Parse(
            open(dir_path + "/flight-info.schema.avsc", "rb").read())
        reader = DatumReader(schema_flight_info)
        flight_info = reader.read(decoder)

        return json.dumps([{"id": flight_info["id"]}])
    else:
        return None
Example #6
    def decode_avro(self, payload):
        magic, schema_id = struct.unpack('>bi', payload[:5])

        # Get Schema registry
        # Avro value format
        if magic == MAGIC_BYTES:
            schema = self.register_client.get_by_id(schema_id)
            reader = DatumReader(schema)
            decoder = BinaryDecoder(io.BytesIO(payload[5:]))
            decoded = reader.read(decoder)
            return decoded, schema.name
        # no magic bytes, something is wrong
        else:
            raise ValueError("Payload is missing the Schema Registry magic byte")
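
The 5-byte header decoded here ('>bi') is the Confluent Schema Registry wire format: one magic byte (0) followed by a big-endian 4-byte schema id, then the Avro-encoded body. A sketch of the producing side under that assumption (the function name and arguments are illustrative):

import io
import struct
from avro.io import DatumWriter, BinaryEncoder

def encode_avro(record, schema, schema_id, magic=0):
    buf = io.BytesIO()
    # Header: magic byte + big-endian 4-byte schema id ('>bi').
    buf.write(struct.pack('>bi', magic, schema_id))
    # Body: Avro binary encoding of the record.
    DatumWriter(schema).write(record, BinaryEncoder(buf))
    return buf.getvalue()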
Example #7
    def _decode_avro(self, avro_data):
        """
        Decode Avro Message.

        Args:
            avro_data (binary):     Avro data to decode.

        Returns:
            dict
        """
        bytes_reader = io.BytesIO(avro_data.value)
        decoder = BinaryDecoder(bytes_reader)
        reader = DatumReader(self.schema)

        return reader.read(decoder)
Example #8
 def __init__(self,
              callback,
              service_name,
              param_schema,
              result_schema,
              version=0):
     self.callback = callback
     self.service_name = service_name
     self.param_schema = SchemaFromJSONData(param_schema, Names())
     self.result_schema = SchemaFromJSONData(result_schema, Names())
     self.version = version
     self._param_writer = DatumWriter(self.param_schema)
     self._param_reader = DatumReader(self.param_schema)
     self._result_writer = DatumWriter(self.result_schema)
     self._result_reader = DatumReader(self.result_schema)

Example #9
 def unpack(self, payload):
     MAGIC_BYTES = 0
     magic, schema_id = struct.unpack('>bi', payload[:5])
     # Get Schema registry
     # Avro value format
     if magic == MAGIC_BYTES:
         schema = self.register_client.get_by_id(schema_id)
         reader = DatumReader(schema)
         decoder = BinaryDecoder(io.BytesIO(payload[5:]))
         return reader.read(decoder)
     # String key
     else:
         # Timestamp is inside my key
         return payload[:-8].decode()
Example #10
def parse_avro_msg(msg, avro_schema):
    """
    Parses an avro record using a specified avro schema

    Args:
        :msg: the avro message to parse
        :avro_schema: the avro schema

    Returns:
         The parsed/decoded message
    """
    reader = DatumReader(avro_schema)
    message_bytes = BytesIO(msg)
    decoder = BinaryDecoder(message_bytes)
    return reader.read(decoder)
Example #11
    def test_seekable(self):
        fn = self.write_avro_file(avro_user_record, 500, 1024)
        with open(fn, 'rb') as f:
            sreader = SeekableDataFileReader(f, DatumReader())
            res = [t for t in czip(cmap(
                lambda _: f.tell(), it.repeat(1)
            ), sreader)]
            sreader.align_after(res[-1][0])
            with self.assertRaises(StopIteration):
                r = next(sreader)
            sreader.align_after(0)
            r = next(sreader)
            self.assertEqual(r, res[0][1])

            def offset_iterator():
                s = -1
                for o, r in res:
                    sreader.align_after(o)
                    t = f.tell()
                    if t == s:
                        continue
                    s = t
                    x = next(sreader)
                    yield (t, x)

            i = 0
            for xo, x in offset_iterator():
                sreader.align_after(xo)
                for o, r in res[i:]:
                    if o >= xo:
                        self.assertEqual(x, r)
                        break
                    i += 1
Example #12
class Deserializer(object):
    def __init__(self, schema_str):
        schema = parse(schema_str)
        self.reader = DatumReader(schema)

    def deserialize(self, rec_bytes):
        return self.reader.read(BinaryDecoder(StringIO(rec_bytes)))

Example #13
    def _from_avro_generic(avro_container_uri: str):
        datum_counter = 0
        datum_to_return = None
        # DET TODO add other exception handling around the double with clause
        with open(avro_container_uri, "rb") as avro_fp:
            with DataFileReader(avro_fp, DatumReader()) as reader:
                #
                #  This static method can only deserialize one datum from the file - scan
                #  through and raise an error if more than one is found.
                #  Not sure if there is lazy access to the datum - if so, returning the
                #  datum to the caller for subsequent loading would be problematic.
                #
                for datum_counter, datum in enumerate(reader, start=1):
                    print('Reading datum #' + str(datum_counter))
                    print('The message datum = ' + str(datum))
                    if datum_counter == 1:
                        datum_to_return = datum

        if datum_counter > 1:
            raise EmeraldMessageDeserializationError(
                'Unable to deserialize from AVRO container "' +
                avro_container_uri +
                '" - this deserializer can only have one datum per file' +
                os.linesep + 'Total element count in this file = ' +
                str(datum_counter))

        if datum_to_return is None:
            raise EmeraldMessageDeserializationError(
                'Data could not be loaded from AVRO file "' +
                str(avro_container_uri) + '" using schema ' +
                AbstractContainer.get_avro_schema_record().avro_schema_name)

        print('Datum to return = ' + str(datum_to_return))
        print('Type of datum to return = ' + str(type(datum_to_return)))
        return datum_to_return
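
For completeness, a sketch of producing the single-datum container file this function expects, using the standard DataFileWriter (file names, schema, and record are illustrative):

import avro.schema
from avro.datafile import DataFileWriter
from avro.io import DatumWriter

schema = avro.schema.parse(open("record.avsc").read())  # hypothetical schema file
with open("message.avro", "wb") as fp:
    with DataFileWriter(fp, DatumWriter(), schema) as writer:
        writer.append({"field": "value"})  # exactly one datum per file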
Example #14
    def _unpack(self, payload):
        magic, schema_id = struct.unpack('>bi', payload[:5])

        # Get Schema registry
        # Avro value format
        if magic == self.MAGIC_BYTES:
            schema = self.register_client.get_by_id(schema_id)
            reader = DatumReader(schema)
            decoder = BinaryDecoder(io.BytesIO(payload[5:]))
            return reader.read(decoder)
        # String key
        else:
            # If KSQL payload, exclude timestamp which is inside the key.
            # payload[:-8].decode()
            return payload.decode()
Example #15
    def decode(self, encoded_event: Any) -> Dict[str, Union[BaseModel, BaseStoreRecord,
                                                            BaseHandler, BaseStoreRecordHandler]]:
        try:
            reader = DataFileReader(BytesIO(encoded_event), DatumReader())
            schema = json.loads(reader.meta.get('avro.schema').decode('utf-8'))
            schema_name = schema['namespace'] + '.' + schema['name']
            event_data = next(reader)
        except AvroTypeException as err:
            self.logger.exception(str(err))
            raise AvroDecodeError

        # Finds a matching event name
        for e_name, event in self._events.items():
            if e_name.match(schema_name):  # type: ignore
                event_class = event
                break
        else:
            raise MissingEventClass

        # Finds a matching handler name
        for e_name, handler in self._handlers.items():
            if e_name.match(schema_name):  # type: ignore
                handler_class = handler
                break
        else:
            raise MissingHandlerClass
        return {'event_class': event_class.from_data(event_data=event_data), 'handler_class': handler_class}
Example #16
def get_messages():
    # build a DatumReader for the Avro schema
    schema = avro.schema.Parse(open(SCHEMA_PATH, 'r').read())
    reader = DatumReader(schema)
    
    # create the Kafka consumer
    consumer = KafkaConsumer(
        TOPIC_NAME,
        group_id=GROUP_ID,
        bootstrap_servers=BOOTSTRAP_SERVERS,
        api_version=(0, 10),
        auto_offset_reset='earliest',
        enable_auto_commit=True,
        consumer_timeout_ms=15000
    )
    
    # read the latest messages
    messages = []
    for message in consumer:
        messages.append(avro_deserializer(message.value, reader))
    consumer.close()
    
    # sort messages by timestamp
    messages = sorted(messages, key=lambda k: k['timestamp'])
    
    return messages
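
The avro_deserializer helper called above is not shown; presumably it wraps the usual DatumReader/BinaryDecoder pattern. A minimal sketch under that assumption:

import io
from avro.io import BinaryDecoder

def avro_deserializer(raw_bytes, reader):
    # Decode a schemaless Avro payload with a pre-built DatumReader.
    return reader.read(BinaryDecoder(io.BytesIO(raw_bytes)))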
Example #17
    def test_avro_reader(self):

        N = 500
        fn = self.write_avro_file(avro_user_record, N, 1024)
        url = hdfs.path.abspath(fn, local=True)

        class FunkyCtx(object):
            def __init__(self, isplit):
                self.input_split = isplit

        def get_areader(offset, length):
            isplit = InputSplit(InputSplit.to_string(url, offset, length))
            ctx = FunkyCtx(isplit)
            return AvroReader(ctx)

        areader = get_areader(0, 14)
        file_length = areader.reader.file_length
        with self.assertRaises(StopIteration):
            next(areader)
        areader = get_areader(0, file_length)
        with SeekableDataFileReader(open(fn, 'rb'), DatumReader()) as sreader:
            for (o, a), s in czip(areader, sreader):
                self.assertEqual(a, s)
        mid_len = int(file_length / 2)
        lows = [x for x in get_areader(0, mid_len)]
        highs = [x for x in get_areader(mid_len, file_length)]
        self.assertEqual(N, len(lows) + len(highs))
Example #18
 def _read_avro_file(self) -> List[dict]:
     records = []
     for file in glob.glob(os.path.join(self.avro_dir_name, '**/*')):
         with open(file, 'rb') as f:
             for record in DataFileReader(f, DatumReader()):
                 records.append(record)
     return records
Example #19
def read_then_to_json(client, file_names, bucket, error_keys_table):
    temp_json_output = []

    for file in file_names:
        filename = "/tmp/temp.avro"
        try:
            client.download_file(Bucket=bucket, Key=file, Filename=filename)
        except Exception:
            # files which could not be downloaded
            print("File could not be downloaded: " + file)
            error_keys_table['aws']['files'].append(file)
            continue

        try:
            reader = DataFileReader(open(filename, "rb"), DatumReader())
        except Exception:
            # files that couldn't be opened
            print("File could not be opened: " + file)
            error_keys_table['open']['files'].append(file)
            continue

        for user in reader:
            if user not in temp_json_output:
                temp_json_output.append(user)
    return temp_json_output
Example #20
    def runEngine(self, engine):
        if engine.config.method == "emit":
            engine.emit = lambda x: x

        for record in DataFileReader(
                open("test/prettypfa/exoplanets.avro", "rb"), DatumReader()):
            engine.action(record)
Example #21
def print_all_events(path, limit=10):
    """example stepping through all the data files and parsing them

    1. iterate through all data files
    2. open files with avro
    3. parse event JSON
    4. pretty-print events
    """
    printed = 0
    for parent, dirs, files in os.walk(path):
        for fname in sorted(files):
            if printed >= limit:
                print("...")
                return
            printed += 1
            with open(os.path.join(parent, fname), 'rb') as f:
                # this is how you open an avro file
                reader = DataFileReader(f, DatumReader())
                # an avro file provides an iterable of events
                for reading in reader:
                    # the uuid we want to use is reading.SystemProperties.connectionDeviceId
                    print(
                        f"uuid={reading['SystemProperties']['connectionDeviceId']}"
                    )

                    # the actual payload from the app is the json body (as a bytestring)
                    try:
                        # parse it out so it looks nicer when we print:
                        reading['Body'] = json.loads(
                            reading['Body'].decode('utf8'))
                    except ValueError:
                        # leave not json as bytes. This shouldn't happen!
                        pass
                    pprint.pprint(reading)

Example #22
def handle_file(path):
    print("Reading file from: " + path)
    reader = DataFileReader(open(path, "rb"), DatumReader())
    for record in reader:
        cset = CollectionSet()
        cset.ParseFromString(record['Body'])
        handle_collection_set(cset)
Example #23
def lambda_handler(event, context):
    source_bucket = event['Records'][0]['s3']['bucket']['name']
    key = urllib.parse.unquote(event['Records'][0]['s3']['object']['key'])
    splitStr = source_bucket.split(".")
    account = splitStr[0]
    profile = splitStr[1]

    processed_bucket = source_bucket + "-processed"
    processed_key = key

    stream = get_object(source_bucket, key)
    success = copy_object(source_bucket, key, processed_bucket, processed_key)

    if success:
        s3.delete_object(Bucket=source_bucket, Key=key)

    if stream is not None:

        raw_bytes = stream.read()
        avro_bytes = io.BytesIO(raw_bytes)

        reader = DataFileReader(avro_bytes, DatumReader())
        for line in reader:
            send_to_tealium(line, account, profile)

    return ""
Example #24
    def test_avro_reader(self):
        class FunkyCtx(object):
            def __init__(self, isplit):
                self.input_split = isplit

        this_directory = os.path.abspath(os.path.dirname(__file__))
        url = '/'.join(['file://', this_directory, AVRO_DATA])

        def get_areader(offset, length):
            isplit = InputSplit(InputSplit.to_string(url, offset, length))
            ctx = FunkyCtx(isplit)
            return AvroReader(ctx)

        N = 500
        with open(AVRO_DATA, 'wb') as f:
            self.write_avro_file(f, AVRO_USER_SCHEMA, avro_user_record, N,
                                 1024)
        areader = get_areader(0, 14)
        file_length = areader.reader.file_length
        with self.assertRaises(StopIteration):
            next(areader)
        areader = get_areader(0, file_length)
        sreader = SeekableDataFileReader(open(AVRO_DATA, 'rb'), DatumReader())
        for (o, a), s in zip(areader, sreader):
            self.assertEqual(a, s)
        mid_len = int(file_length / 2)
        lows = [x for x in get_areader(0, mid_len)]
        highs = [x for x in get_areader(mid_len, file_length)]
        self.assertEqual(N, len(lows) + len(highs))
Example #25
    def generic_dataframe(self, df, avro_schema, assert_fns=None):
        """Generic test running function for arbitrary avro schemas.

        Writes a dataframe containing the records to avro.

        Reads back and compares with the original
        """
        print(avro_schema)

        cyavro.write_avro_file_from_dataframe(df,
                                              self.filename,
                                              json.dumps(avro_schema),
                                              codec='null')

        if assert_fns is None:
            assert_fns = {}

        df_read = cyavro.read_avro_file_as_dataframe(self.filename)

        from avro.datafile import DataFileReader
        from avro.io import DatumReader

        with open(self.filename, 'rb') as fo:
            reader = DataFileReader(fo, DatumReader())
            records = []
            for user in reader:
                records.append(user)
            df_reference = pd.DataFrame(records)
            reader.close()

        success = True

        for col in avro_schema["fields"]:
            colname = col['name']
            assert_fn = assert_fns.get(colname, np.testing.assert_array_equal)

            def print_fail_header(s):
                print('#' * len(s))
                print("FAIL: Column {}".format(col))
                print('#' * len(s))
                print(s)

            try:
                assert_fn(df_read[colname], df[colname])
            except AssertionError:
                print_fail_header(
                    "Failed for cyavro read comparison  {}\n".format(col))
                traceback.print_exc(file=sys.stdout)
                success = False

            try:
                assert_fn(df_reference[colname], df[colname])
            except AssertionError:
                print_fail_header(
                    "Failed for cyavro write comparison {}\n".format(col))
                traceback.print_exc(file=sys.stdout)
                success = False

        assert success
Example #26
class Deserializer(object):
    def __init__(self, schema_str):
        schema = avro.schema.parse(schema_str)
        self.reader = DatumReader(schema)

    def deserialize(self, rec_bytes):
        return self.reader.read(BinaryDecoder(StringIO(rec_bytes)))
Example #27
def main(args):
    log = logging.getLogger(__name__)
    log.setLevel(logging.INFO)

    sys_log = logging.handlers.SysLogHandler("/dev/log")
    sys_format = logging.Formatter(
        '%(name)s[%(process)d]: %(levelname)s %(message)s')
    sys_log.setFormatter(sys_format)

    log.addHandler(sys_log)

    reader = DataFileReader(open(args.avro_file, "rb"), DatumReader())

    schema = reader.datum_reader.writers_schema

    for i, row in enumerate(reader):
        log.debug("Consumer row:" + str(row))
        writer = DatumWriter(schema)
        bytes_writer = io.BytesIO()
        encoder = BinaryEncoder(bytes_writer)
        writer.write(row, encoder)
        raw_bytes = bytes_writer.getvalue()
        b64enc = base64.b64encode(raw_bytes).decode("ascii")
        msg = {"messages": [{"data": b64enc}]}

        json_str = json.dumps(msg)
        log.debug("json msg:" + json_str)
        publish(json_str, args.ams_endpoint, args.ams_project, args.ams_topic,
                args.ams_key, log)
Example #28
    def get_data_in_batches(
        self,
        bucket_name,
        prefix=None,
        data_after=None,
        data_until=None,
        batch_size=10000,
    ):
        rows = []
        for blob in self.generate_blob_list(bucket_name, prefix, data_after,
                                            data_until):
            # download file content as bytes, read via avro
            blob_meta = {
                "blob_name": blob.name,
                "blob_modified_at": blob.updated,
            }
            bytes_data = blob.download_as_string()
            bytes_object = BytesIO(bytes_data)
            bytes_object.mode = "rb+"  # need to "fake" the mode attribute because
            # avro checks the mode of the file given for some reason, fails otherwise
            reader = DataFileReader(bytes_object, DatumReader())
            for row in reader:
                # add blob-level metadata
                row.update(blob_meta)
                rows.append(row)
            if len(rows) >= batch_size:
                yield rows
                rows = []

        if rows:
            # return any data that was left after the last iteration
            yield rows

Example #29
def build_cars_in_time(files):
    cars = {}

    for idx, file in enumerate(files):
        print_progress(idx, len(files))

        try:
            with DataFileReader(open(file, "rb"), DatumReader()) as reader:
                for car in reader:
                    car_reg_number = select_reg_number(car)
                    if car_reg_number not in cars:
                        cars[car_reg_number] = []
                    cars[car_reg_number].append({
                        'timestamp': select_timestamp(car),
                        'distanceAccumulated': select_distance_accumulated(car),
                        'regNumber': car_reg_number
                    })
        except TypeError:
            print("Error reading file {0}".format(file))

    return cars
Example #30
    def respond(self, call_request):
        buffer_reader = io.BytesIO(call_request)
        buffer_decoder = BinaryDecoder(buffer_reader)
        buffer_writer = io.BytesIO()
        buffer_encoder = BinaryEncoder(buffer_writer)
        error = None
        response_metadata = {}
        try:
            remote_protocol = self.process_handshake(buffer_decoder,
                                                     buffer_encoder)
            if remote_protocol is None or self.local_protocol is None:
                return buffer_writer.getvalue()

            DatumReader(schema.parse(
                '{"type": "map", "values": "bytes"}')).read(buffer_decoder)
            remote_message_name = buffer_decoder.read_utf8()

            remote_message = remote_protocol.messages.get(remote_message_name)
            if remote_message is None:
                fail_msg = 'Unknown remote message: %s' % remote_message_name
                raise schema.AvroException(fail_msg)
            local_message = self.local_protocol.messages.get(
                remote_message_name)
            if local_message is None:
                fail_msg = 'Unknown local message: %s' % remote_message_name
                raise schema.AvroException(fail_msg)
            writers_schema = remote_message.request
            readers_schema = local_message.request
            request = self.read_request(writers_schema, readers_schema,
                                        buffer_decoder)

            response = None
            try:
                response = self.invoke(self.local_protocol, local_message,
                                       request)
            except AvroRemoteException as e:
                error = e
            except Exception as e:
                error = AvroRemoteException(str(e))

            DatumWriter(
                schema.parse('{"type": "map", "values": "bytes"}')).write(
                    response_metadata, buffer_encoder)
            buffer_encoder.write_boolean(error is not None)
            if error is None:
                writers_schema = local_message.response
                self.write_response(writers_schema, response, buffer_encoder)
            else:
                writers_schema = local_message.errors
                self.write_error(writers_schema, error, buffer_encoder)
        except schema.AvroException as e:
            error = AvroRemoteException(str(e))
            buffer_encoder = BinaryEncoder(io.BytesIO())
            DatumWriter(
                schema.parse('{"type": "map", "values": "bytes"}')).write(
                    response_metadata, buffer_encoder)
            buffer_encoder.write_boolean(True)
            self.write_error(schema.parse('["string"]'), error, buffer_encoder)
            return buffer_encoder.writer.getvalue()
        return buffer_writer.getvalue()
Example #31
  def read(self, format):
    time_start = time.time()

    if format == 'json':
      with open('./output/output.json') as file:
        json.loads(file.read())

    elif format == 'jsch':
      with open('./output/output.json') as file:
        validate(json.loads(file.read()), self._schema_json)

    elif format == 'avro':
      reader = DataFileReader(open('./output/output.avro', 'rb'), DatumReader())
      for user in reader:
        pass
      reader.close()

    elif format == 'protobuf':
      with open('./output/output.pb', 'rb') as file:
          addressbook_pb2.AddressBook().ParseFromString(file.read())

    elif format == 'gzjson':
      with gzip.open('./output/output.jsz', 'rb') as file: 
          json.loads(file.read())

    time_end = time.time()

    return time_end - time_start
Example #32
def get_flowrecords_from_flowdata_file(filename_path_input):
    """
    Create a Python generator to read the csv/txt/avro file returning the records to processing.
    *Important: when considering CSV/TXT files remember to use files without header/statistics as input files*
    :param filename_path_input: exported csv/txt/avro flow input file from the original nfpcap file via NFDUMP
    :return: generator to records from file
    """
    if filename_path_input.lower().endswith(('.csv', '.txt')):
        with open(filename_path_input) as csvfile:
            reader = csv.reader(csvfile)
            for line in reader:
                yield create_flow_record_from_csv(line)

    # >> default extension Apache AVRO <<
    else:
        # prepare to read binary
        flowsrecords_reader = DataFileReader(open(filename_path_input, "rb"), DatumReader())
        try:
            for flow in flowsrecords_reader:
                yield flow
        except zlib.error as ze:
            print(ze)
        except IOError as ioe:
            print(ioe)
Example #33
    def callback(message):
        # Get the message serialization type.
        encoding = message.attributes.get("googclient_schemaencoding")
        # Deserialize the message data accordingly.
        if encoding == "BINARY":
            bout = io.BytesIO(message.data)
            decoder = BinaryDecoder(bout)
            reader = DatumReader(avro_schema)
            message_data = reader.read(decoder)
            print(f"Received a binary-encoded message:\n{message_data}")
        elif encoding == "JSON":
            message_data = json.loads(message.data)
            print(f"Received a JSON-encoded message:\n{message_data}")
        else:
            print(f"Received a message with no encoding:\n{message}")

        message.ack()
Example #34
class Deserializer(object):

    def __init__(self, schema_str):
        if sys.version_info >= (3,):
            schema = avro.schema.Parse(schema_str)
        else:
            schema = avro.schema.parse(schema_str)
        self.reader = DatumReader(schema)

    def deserialize(self, rec_bytes):
        return self.reader.read(BinaryDecoder(string_io(rec_bytes)))
Example #35
class AvroCodec(object):
    def __init__(self, schema):
        self._raw_schema = schema
        self._avro_schema = avro.schema.parse(json.dumps(schema))
        self._reader = DatumReader(self._avro_schema)

    def dump(self, obj, fp):
        """
        Serializes obj as an avro-format byte stream to the provided
        fp file-like object stream.
        """
        if not validate(obj, self._raw_schema):
            raise AvroTypeException(self._avro_schema, obj)
        fastavro_write_data(fp, obj, self._raw_schema)

    def dumps(self, obj):
        """
        Serializes obj to an avro-format byte array and returns it.
        """
        out = BytesIO()
        try:
            self.dump(obj, out)
            return out.getvalue()
        finally:
            out.close()

    def load(self, fp):
        """
        Deserializes the byte stream contents of the given file-like
        object into an object and returns it.
        """
        return self._reader.read(BinaryDecoder(fp))

    def loads(self, data):
        """
        Deserializes the given byte array into an object and returns it.
        """
        st = BytesIO(data)
        try:
            return self.load(st)
        finally:
            st.close()
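
A usage sketch for AvroCodec, assuming a simple record schema (all names illustrative):

schema = {
    "type": "record",
    "name": "Point",
    "fields": [{"name": "x", "type": "int"}, {"name": "y", "type": "int"}],
}
codec = AvroCodec(schema)
payload = codec.dumps({"x": 1, "y": 2})
assert codec.loads(payload) == {"x": 1, "y": 2}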
Example #36
 def __init__(self, schema_str):
     schema = avro.schema.parse(schema_str)
     self.reader = DatumReader(schema)
Example #37
 def fromKey(self, key, avroType):
     buf = io.BytesIO(base64.b64decode(key))
     reader = DatumReader(avroType.schema)
     return reader.read(BinaryDecoder(buf))
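
A sketch of the inverse operation, encoding a datum back into a base64 key under the same assumptions about avroType.schema (this toKey counterpart is illustrative, not part of the original API):

 def toKey(self, datum, avroType):
     # Mirror fromKey: Avro-encode the datum, then base64-encode the bytes.
     buf = io.BytesIO()
     DatumWriter(avroType.schema).write(datum, BinaryEncoder(buf))
     return base64.b64encode(buf.getvalue()).decode('ascii')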
Example #38
 def __init__(self, schema_str):
     if sys.version_info >= (3,):
         schema = avro.schema.Parse(schema_str)
     else:
         schema = avro.schema.parse(schema_str)
     self.reader = DatumReader(schema)
Example #39
 def __init__(self, schema):
     self._raw_schema = schema
     self._avro_schema = avro.schema.parse(json.dumps(schema))
     self._reader = DatumReader(self._avro_schema)
Example #40
# encoding: utf-8

"""Python avro official implementation decoding benchmark."""

from io import BytesIO
from itertools import repeat
from time import time
from avro.datafile import DataFileReader, DataFileWriter
from avro.io import DatumReader, DatumWriter, BinaryEncoder, BinaryDecoder
import sys


LOOPS = 1

with open(sys.argv[1], 'rb') as reader:
  datum_reader = DatumReader()
  file_reader = DataFileReader(reader, datum_reader)
  SCHEMA = datum_reader.writers_schema
  BUFS = []
  datum_writer = DatumWriter(SCHEMA)
  for record in file_reader:
    buf = BytesIO()
    encoder = BinaryEncoder(buf)
    datum_writer.write(record, encoder)
    BUFS.append(buf)

datum_reader = DatumReader(SCHEMA)
start = time()
n = 0
for _ in repeat(None, LOOPS):
  for buf in BUFS:
    # Rewind each pre-encoded buffer and decode it with the writer schema.
    buf.seek(0)
    datum_reader.read(BinaryDecoder(buf))
    n += 1
print('decoded %d records in %.3fs' % (n, time() - start))