Example No. 1
def score(graphs, schema, url, port):
    """
    graphs is expected to be a list of dictionaries, where each entry in the
    list represents a graph with
    * key idx -> index value
    * key vertices -> list of ints representing the vertices of the graph
    * key edges -> list of lists of ints representing the edges of the graph
    """

    stream = BufferedWriter(BytesIO())
    writer = DataFileWriter(stream, avro.io.DatumWriter(), schema)
    # writer = DataFileWriter(open("imdb-graph.avro", "wb"), DatumWriter(), schema)
    for graph in graphs:
        writer.append({
            "edges": graph["edges"],
            "vertices": graph["vertices"],
            "index": graph["idx"],
            "label": graph.get("label")
        })
        writer.flush()
    raw_bytes = stream.raw.getvalue()
    writer.close()

    url = "{}:{}/predictUnstructured/?ret_mode=binary".format(
        url.strip("/"), port)

    payload = raw_bytes
    headers = {'Content-Type': 'application/octet-stream'}

    response = requests.request("POST", url, headers=headers, data=payload)

    return response
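A minimal sketch of how score() might be called. The schema below is an assumption: it simply mirrors the fields appended in the loop above (index, vertices, edges, label); the real scoring service may expect different names or types, and the host and port are placeholders.

import json
import avro.schema

# Assumed record schema matching the dict appended to the DataFileWriter above.
# Older avro releases expose avro.schema.parse; newer ones also provide avro.schema.Parse.
GRAPH_SCHEMA = avro.schema.parse(json.dumps({
    "type": "record",
    "name": "Graph",
    "fields": [
        {"name": "index", "type": "int"},
        {"name": "vertices", "type": {"type": "array", "items": "int"}},
        {"name": "edges", "type": {"type": "array",
                                   "items": {"type": "array", "items": "int"}}},
        {"name": "label", "type": ["null", "int"], "default": None},
    ],
}))

graphs = [{"idx": 0, "vertices": [0, 1, 2], "edges": [[0, 1], [1, 2]], "label": 1}]
response = score(graphs, GRAPH_SCHEMA, "http://localhost", 8080)  # placeholder host/port
print(response.status_code)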
Example No. 2
    def encode(self, obj: BaseRecord) -> bytes:
        """ Encode *BaseHandlerEvent / BaseHandlerCommand / BaseHandlerResult* to bytes format

        This function is used by kafka-python

        Args:
            obj (BaseModel): *BaseHandlerEvent / BaseHandlerCommand / BaseHandlerResult*

        Raises:
            MissingEventClass: can’t find BaseModel in own registered BaseModel list (self._schema)
            AvroEncodeError: fail to encode BaseModel to bytes

        Returns:
            bytes: BaseModel in bytes
        """
        try:
            schema = self._schemas[obj.event_name()]
        except KeyError as err:
            self.logger.exception('%s', err.__str__())
            raise MissingEventClass

        try:
            output = BytesIO()
            writer = DataFileWriter(output, DatumWriter(), schema)
            writer.append(obj.to_dict())
            writer.flush()
            encoded_event = output.getvalue()
            writer.close()
        except AvroTypeException as err:
            self.logger.exception('%s', err.__str__())
            raise AvroEncodeError
        return encoded_event
Example No. 3
    def ExportToBin(self, data, schema=None) -> tuple:
        '''
        Export the data object in binary (bytes) format using the given schema.
        '''
        if schema is not None:
            pschema = self._parseschema(schema)
            if pschema[0]:
                schema = self._data['schema']
            else:
                return pschema

        else:
            schema = self._data['schema']

        if not isinstance(schema, avro.schema.RecordSchema):
            schema = None
        try:
            with tempfile.SpooledTemporaryFile(suffix='.avro') as tmp:
                writer = DataFileWriter(tmp, DatumWriter(), schema)
                if not isinstance(data, list):
                    writer.append(data)
                else:
                    for d in data:
                        writer.append(d)
                writer.flush()
                tmp.seek(0)
                export_bin = tmp.read()
                writer.close()
                tmp.close()
                self._data['data'] = export_bin
            return (True, export_bin, self.getSchemaInfos())
        except Exception as e:
            return (False, str(e), self.getSchemaInfos())
Example No. 4
def produce_kafka_messages(topic, cluster, message, data_format):
    """Send basic messages to Kafka"""
    # Get Kafka producer
    producer = cluster.kafka.producer()

    basic_data_formats = ['XML', 'CSV', 'SYSLOG', 'NETFLOW', 'COLLECTD', 'BINARY', 'LOG', 'TEXT', 'JSON']

    # Write records into Kafka depending on the data_format.
    if data_format in basic_data_formats:
        producer.send(topic, message)

    elif data_format == 'WITH_KEY':
        producer.send(topic, message, key=get_random_string(string.ascii_letters, 10).encode())

    elif data_format == 'AVRO':
        writer = avro.io.DatumWriter(avro.schema.Parse(json.dumps(SCHEMA)))
        bytes_writer = io.BytesIO()
        encoder = avro.io.BinaryEncoder(bytes_writer)
        writer.write(message, encoder)
        raw_bytes = bytes_writer.getvalue()
        producer.send(topic, raw_bytes)

    elif data_format == 'AVRO_WITHOUT_SCHEMA':
        bytes_writer = io.BytesIO()
        datum_writer = avro.io.DatumWriter(avro.schema.Parse(json.dumps(SCHEMA)))
        data_file_writer = DataFileWriter(writer=bytes_writer, datum_writer=datum_writer,
                                          writer_schema=avro.schema.Parse(json.dumps(SCHEMA)))
        data_file_writer.append(message)
        data_file_writer.flush()
        raw_bytes = bytes_writer.getvalue()
        data_file_writer.close()
        producer.send(topic, raw_bytes)

    producer.flush()
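The two Avro branches above produce different byte layouts: 'AVRO' writes a bare binary-encoded datum with no schema attached, while 'AVRO_WITHOUT_SCHEMA' (despite the name) writes an Avro object container file whose header embeds the writer schema. A hedged sketch of the consumer side, assuming the same SCHEMA constant used above:

import io
import json
import avro.io
import avro.schema
from avro.datafile import DataFileReader

def decode_avro_message(raw_bytes, container=False):
    """Sketch only: decode bytes produced by either Avro branch above."""
    if container:
        # Object container file: the schema is read from the file header.
        reader = DataFileReader(io.BytesIO(raw_bytes), avro.io.DatumReader())
        records = list(reader)
        reader.close()
        return records
    # Bare binary encoding: the reader must be given the writer schema.
    schema = avro.schema.Parse(json.dumps(SCHEMA))
    decoder = avro.io.BinaryDecoder(io.BytesIO(raw_bytes))
    return avro.io.DatumReader(schema).read(decoder)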
Example No. 5
    def encode(self, raw_data):
        byte_stream = BytesIO()
        writer = DataFileWriter(byte_stream, DatumWriter(), self._schema)
        writer.append(raw_data)
        writer.flush()
        serialized_data = byte_stream.getvalue()
        writer.close()
        return serialized_data
Example No. 6
def prepare(producer, arr, root, level):
    for it in arr:
        buf = io.BytesIO()
        writer = DataFileWriter(buf, DatumWriter(), sch)
        item = Item(root, it, False)
        writer.append(item.get_dict())
        writer.flush()
        send(buf, level, producer)
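prepare() relies on module-level names that this snippet does not show: sch (a parsed Avro schema), Item (wraps one element), and send (ships the buffer contents to Kafka). A hypothetical sketch of what they could look like; the field names and topic are assumptions, not taken from the original.

import json
import avro.schema

# Assumed schema for the records produced by Item.get_dict().
sch = avro.schema.parse(json.dumps({
    "type": "record", "name": "Item",
    "fields": [
        {"name": "root", "type": "string"},
        {"name": "value", "type": "string"},
        {"name": "processed", "type": "boolean"},
    ],
}))

class Item:
    def __init__(self, root, value, processed):
        self.root = root
        self.value = value
        self.processed = processed

    def get_dict(self):
        return {"root": self.root, "value": self.value, "processed": self.processed}

def send(buf, level, producer):
    # kafka-python producers accept raw bytes; the topic name here is a placeholder.
    producer.send("items-{}".format(level), buf.getvalue())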
Example No. 7
def create_archive(basedir, destdir):
    all_files = []
    all_dirs = []

    # make a snapshot in case the output directory is the bundle source - so we don't recursively bundle the output
    for path, dirs, files in os.walk(basedir):
        for d in dirs:
            dir = os.path.join(path, d)
            all_dirs.append(dir)
        for f in files:
            file = os.path.join(path, f)
            all_files.append(file)

    schema = avro.schema.parse(
        open(os.path.join(os.path.dirname(os.path.realpath(__file__)), "avro-schemas.json")).read())
    fileprefix = time.strftime("%Y%m%d-%H%M%S")
    avrofile = fileprefix + "-part-0001.avro"
    iteration = 1

    fd = open(os.path.join(destdir, avrofile), 'wb')
    datum = avro.io.DatumWriter()
    writer = DataFileWriter(fd, datum, schema, codec='deflate')
    try:
        for d in all_dirs:
            val = makedir(os.path.basename(os.path.normpath(d)),
                          os.path.relpath(d, basedir))
            writer.append(val)

        for f in all_files:
            for sibling, numsiblings, chunk in get_file_chunks(f):
                if (fd.tell() + len(chunk)) > maxfilesize * 1.1:
                    fd, writer, iteration = rotate_avro_file(fd,
                                                             writer,
                                                             iteration,
                                                             fileprefix,
                                                             destdir,
                                                             datum,
                                                             schema)
                file = makefile(os.path.basename(os.path.normpath(f)),
                                os.path.relpath(f, basedir),
                                numsiblings,
                                sibling,
                                chunk)
                writer.append(file)
                writer.flush()
                del file

        for f in all_files:
            os.remove(f)

        for d in all_dirs:
            os.rmdir(d)

    finally:
        writer.close()
        fd.close()
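create_archive() depends on several helpers that are not shown (makedir, makefile, get_file_chunks, rotate_avro_file, maxfilesize). The way the loop unpacks get_file_chunks() suggests it yields (sibling_index, total_siblings, chunk_bytes); a hypothetical sketch under that assumption, with an assumed chunk size:

import os

CHUNK_SIZE = 16 * 1024 * 1024  # assumed 16 MiB chunks; the original value is not shown

def get_file_chunks(path):
    """Yield (sibling_index, total_siblings, chunk) for a file, split into fixed-size chunks."""
    size = os.path.getsize(path)
    numsiblings = max(1, -(-size // CHUNK_SIZE))  # ceiling division
    with open(path, 'rb') as fh:
        for sibling in range(1, numsiblings + 1):
            yield sibling, numsiblings, fh.read(CHUNK_SIZE)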
Example No. 8
    def writer(self, outputs, stdout, stderr=sys.stderr):
        """Overrides base method for hadoop.JobTask
        """
        schema = avro.schema.parse(json.dumps(self.avro_schema()))

        writer = DataFileWriter(stdout, DatumWriter(), schema)
        
        for output in outputs:
            writer.append(output[1])
        # No need to call close; the luigi job will do that.
        writer.flush()
Example No. 9
	def make_record_set(self, schema_path: str, items: list) -> bytes:
		if schema_path not in self.schemas:
			with open(schema_path, 'rb') as raw:
				self.schemas[schema_path] = avro.schema.Parse(raw.read())
		out = BytesIO()
		writer = DataFileWriter(out, DatumWriter(), self.schemas[schema_path])
		for item in items:
			writer.append(item)
		writer.flush()

		return out.getvalue()
Example No. 10
def serialize_records(msgs, schema) -> bytes:
    with io.BytesIO() as buf:
        writer = DataFileWriter(buf, DatumWriter(),
                                avro.schema.parse(json.dumps(schema)))
        for line_item in msgs:
            #print(f"SERRECORD {line_item}")
            writer.append(line_item)

        writer.flush()
        record = buf.getvalue()

        return record
Example No. 11
def objToBinTmp2():
    with tempfile.SpooledTemporaryFile(suffix='.avro') as tmp:
        writer = DataFileWriter(tmp, DatumWriter(), sc)
        for d in datum:
            writer.append(d)
        writer.flush()
        tmp.seek(0)
        ab = tmp.read()
        writer.close()
        tmp.close()

    return ab
Example No. 12
def create_archive(basedir, destdir):
    all_files = []
    all_dirs = []

    # make a snapshot in case the output directory is the bundle source - so we don't recursively bundle the output
    for path, dirs, files in os.walk(basedir):
        for d in dirs:
            dir = os.path.join(path, d)
            all_dirs.append(dir)
        for f in files:
            file = os.path.join(path, f)
            all_files.append(file)

    schema = avro.schema.parse(
        open(
            os.path.join(os.path.dirname(os.path.realpath(__file__)),
                         "avro-schemas.json")).read())
    fileprefix = time.strftime("%Y%m%d-%H%M%S")
    avrofile = fileprefix + "-part-0001.avro"
    iteration = 1

    fd = open(os.path.join(destdir, avrofile), 'wb')
    datum = avro.io.DatumWriter()
    writer = DataFileWriter(fd, datum, schema, codec='deflate')
    try:
        for d in all_dirs:
            val = makedir(os.path.basename(os.path.normpath(d)),
                          os.path.relpath(d, basedir))
            writer.append(val)

        for f in all_files:
            for sibling, numsiblings, chunk in get_file_chunks(f):
                if (fd.tell() + len(chunk)) > maxfilesize * 1.1:
                    fd, writer, iteration = rotate_avro_file(
                        fd, writer, iteration, fileprefix, destdir, datum,
                        schema)
                file = makefile(os.path.basename(os.path.normpath(f)),
                                os.path.relpath(f, basedir), numsiblings,
                                sibling, chunk)
                writer.append(file)
                writer.flush()
                del file

        for f in all_files:
            os.remove(f)

        for d in all_dirs:
            os.rmdir(d)

    finally:
        writer.close()
        fd.close()
Example No. 13
    def encode(self, event: BaseEvent) -> bytes:
        schema = self._schemas.get(event.name)

        if schema is None:
            raise NameError(
                f"No schema found to encode event with name {event.name}")

        output = BytesIO()
        writer = DataFileWriter(output, DatumWriter(), schema)
        writer.append(event.data)
        writer.flush()
        encoded_event = output.getvalue()
        writer.close()
        return encoded_event
Example No. 14
def avro_dumps(data, schema):
    """dump the given data into an avro file with the provided schema"""
    schema = avro.schema.Parse(schema)
    fp = BytesIO()
    writer = DataFileWriter(fp, DatumWriter(), schema)
    if isinstance(data, list):
        for item in data:
            writer.append(item)
    else:
        writer.append(data)
    writer.flush()
    contents = fp.getvalue()
    fp.close()
    return contents
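avro_dumps() has a natural counterpart that is not part of the original snippet; a minimal sketch of an avro_loads() that reads the container bytes back:

from io import BytesIO
from avro.datafile import DataFileReader
from avro.io import DatumReader

def avro_loads(contents):
    """Load records back out of avro container-file bytes produced by avro_dumps."""
    reader = DataFileReader(BytesIO(contents), DatumReader())
    records = list(reader)
    reader.close()
    return records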
Example No. 15
def produce_kafka_messages(topic, cluster, message, data_format):
    """Send basic messages to Kafka"""
    producer = cluster.kafka.producer()

    basic_data_formats = [
        'XML', 'CSV', 'SYSLOG', 'NETFLOW', 'COLLECTD', 'BINARY', 'LOG',
        'PROTOBUF', 'JSON', 'TEXT'
    ]

    # Write records into Kafka depending on the data_format.
    if data_format in basic_data_formats:
        producer.send(topic, message)

    elif data_format == 'WITH_KEY':
        producer.send(topic,
                      message,
                      key=get_random_string(string.ascii_letters, 10).encode())

    elif data_format == 'AVRO':
        writer = avro.io.DatumWriter(avro.schema.Parse(json.dumps(SCHEMA)))
        bytes_writer = io.BytesIO()
        encoder = avro.io.BinaryEncoder(bytes_writer)
        writer.write(message, encoder)
        raw_bytes = bytes_writer.getvalue()
        producer.send(topic, raw_bytes)

    elif data_format == 'AVRO_WITHOUT_SCHEMA':
        bytes_writer = io.BytesIO()
        datum_writer = avro.io.DatumWriter(
            avro.schema.Parse(json.dumps(SCHEMA)))
        data_file_writer = DataFileWriter(writer=bytes_writer,
                                          datum_writer=datum_writer,
                                          writer_schema=avro.schema.Parse(
                                              json.dumps(SCHEMA)))
        data_file_writer.append(message)
        data_file_writer.flush()
        raw_bytes = bytes_writer.getvalue()
        data_file_writer.close()
        producer.send(topic, raw_bytes)

    logger.info('Flushing producer')
    producer.flush()

    logger.info('Validating that the message can be seen in Kafka')
    consumer = cluster.kafka.consumer(consumer_timeout_ms=5000,
                                      auto_offset_reset='earliest')
    consumer.subscribe([topic])

    msgs_received = [msg for msg in consumer]
    assert 1 == len(msgs_received)
Example No. 16
    def encode(self, event: BaseModel) -> bytes:
        try:
            schema = self._schemas[event.event_name()]
        except KeyError as err:
            self.logger.exception(f'{err.__str__()}')
            raise MissingEventClass

        try:
            output = BytesIO()
            writer = DataFileWriter(output, DatumWriter(), schema)
            writer.append(event.__dict__)
            writer.flush()
            encoded_event = output.getvalue()
            writer.close()
        except AvroTypeException as err:
            self.logger.exception(f'{err.__str__()}')
            raise AvroEncodeError
        return encoded_event
Example No. 17
def score_unstructured(model, data, query, **kwargs):
    print("Incoming content type params: ", kwargs)
    print("Incoming data type: ", type(data))
    print("Incoming query params: ", query)

    # writer = avro.io.DatumWriter(schema)
    # bytes_writer = BytesIO()
    # encoder = avro.io.BinaryEncoder(bytes_writer)
    X = pd.read_csv(BytesIO(data))
    shap_values_dict = model.explain(X)
    predictions = model.predict(X).values
    # for p, s in zip(predictions, shap_values_dict):
    #    writer.write({"prediction": p[0],"shap_values": s}, encoder)
    stream = io.BufferedWriter(io.BytesIO())
    writer = DataFileWriter(stream, avro.io.DatumWriter(), model.schema)
    for p, s in zip(predictions, shap_values_dict):
        writer.append({"prediction": p[0], "shap_values": s})
        writer.flush()
    ret_bytes = stream.raw.getvalue()
    writer.close()
    return ret_bytes
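score_unstructured() assumes the model object exposes a parsed Avro schema as model.schema covering one prediction plus its SHAP values. A hypothetical sketch of such a schema; the exact types (double prediction, map of double SHAP values) are assumptions based on the record appended in the loop above:

import json
import avro.schema

# Hypothetical shape of model.schema; not taken from the original code.
PREDICTION_SCHEMA = avro.schema.parse(json.dumps({
    "type": "record",
    "name": "PredictionWithShap",
    "fields": [
        {"name": "prediction", "type": "double"},
        {"name": "shap_values", "type": {"type": "map", "values": "double"}},
    ],
}))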
Example No. 18
    def send(self, row):
        msg = row.get("payload")
        offset = row.get("modified")
        try:
            bytes_writer = io.BytesIO()
            writer = DataFileWriter(bytes_writer, DatumWriter(), self.schema, codec='deflate')
            writer.append(msg)
            writer.flush()
            raw_bytes = bytes_writer.getvalue()
            writer.close()
            future = self.producer.send(self.topic, key=str(msg.get("id")), value=raw_bytes)
            #block until it actually sends. We don't want offsets getting out of sync
            try:
                record_metadata = future.get(timeout=10)
            except Exception as ke:
                print ("Error submitting record")
                raise ke
            self.producer.flush()
            set_offset_value("entities", offset)

        except Exception as e:
            print ("Issue with Topic %s : %s" % (self.topic, e))
            raise e
Example No. 19
fileDict = dict()

HOST, PORT = "localhost", 9999
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)

try:
    sock.connect((HOST, PORT))
    file = open("/home/yd/Downloads/jvm-폰트설정", "r")

    record = {'id': "test", 'data': None}
    datum = DatumWriter()
    data = StringIO()

    writer = DataFileWriter(data, datum, FILE_SCHEMA)
    while True:
        byte = file.read(8024)
        record['data'] = byte

        if not byte:
            break
        print(record)
        writer.append(record)

    writer.flush()
    sock.sendall(data.getvalue())

finally:
    sock.close()
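The socket snippet above depends on a FILE_SCHEMA constant that is not shown. A hypothetical definition matching the {'id': ..., 'data': ...} records it appends; the field types are assumptions:

import json
import avro.schema

FILE_SCHEMA = avro.schema.parse(json.dumps({
    "type": "record",
    "name": "FileChunk",
    "fields": [
        {"name": "id", "type": "string"},
        {"name": "data", "type": ["null", "string"], "default": None},
    ],
}))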

Example No. 20
def main():
   known_schemas = avro.schema.Names()

   with open("point.avsc", "rb") as fp:
      point = avro.schema.make_avsc_object(json.loads(fp.read()), known_schemas)

   with open("review.avsc", "rb") as fp:
      place = avro.schema.make_avsc_object(json.loads(fp.read()), known_schemas)

   with open("place.avsc", "rb") as fp:
      place = avro.schema.make_avsc_object(json.loads(fp.read()), known_schemas)

   output = StringIO.StringIO()
   writer = DataFileWriter(output, DatumWriter(), point)
   writer.append({'x': 1.5, 'y': 2.75})
   writer.flush()
   serialized = output.getvalue()
   reader = DataFileReader(StringIO.StringIO(serialized), DatumReader())
   deserialized = tuple(reader)[0]
   assert deserialized['x'] == 1.5
   assert deserialized['y'] == 2.75
   reader.close()
   writer.close()

   try:
      output = StringIO.StringIO()
      writer = DataFileWriter(output, DatumWriter(), point)
      writer.append({'x': 1.5})
      assert False
   except AvroTypeException as e:
      pass

   try:
      output = StringIO.StringIO()
      writer = DataFileWriter(output, DatumWriter(), point)
      writer.append({'x': 1.5, 'y': "wtanaka.com"})
      assert False
   except AvroTypeException as e:
      pass

   output = StringIO.StringIO()
   writer = DataFileWriter(output, DatumWriter(), place)
   writer.append({
         'name': 'wtanaka.com',
         'location': {'x': 1.5, 'y': 2.75}
         })
   writer.flush()
   serialized = output.getvalue()
   reader = DataFileReader(StringIO.StringIO(serialized), DatumReader())
   deserialized = tuple(reader)[0]
   assert deserialized['location']['x'] == 1.5
   assert deserialized['location']['y'] == 2.75
   reader.close()
   writer.close()

   output = StringIO.StringIO()
   writer = DataFileWriter(output, DatumWriter(), place)
   writer.append({
         'name': 'wtanaka.com',
         'location': {'x': 1.5, 'y': 2.75},
         'review': {'rating': 4, 'text': '4 stars would come again'},
         })
   writer.flush()
   serialized = output.getvalue()
   reader = DataFileReader(StringIO.StringIO(serialized), DatumReader())
   deserialized = tuple(reader)[0]
   assert deserialized['location']['x'] == 1.5
   assert deserialized['location']['y'] == 2.75
   reader.close()
   writer.close()

   try:
      output = StringIO.StringIO()
      writer = DataFileWriter(output, DatumWriter(), place)
      writer.append({
            'name': 'wtanaka.com',
            'location': {'x': 1.5, 'y': 2.75},
            'review': {'x': 1.5, 'y': 2.75},
            })
      assert False
   except AvroTypeException as e:
      pass
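The test above relies on a shared avro.schema.Names registry so that place.avsc can refer to the Point and Review records by name. Hypothetical contents for the three .avsc files, consistent with the records appended in the test (the exact field names in the real files are assumptions):

# Assumed contents of point.avsc, review.avsc, and place.avsc.
POINT_AVSC = """{"type": "record", "name": "Point",
  "fields": [{"name": "x", "type": "double"}, {"name": "y", "type": "double"}]}"""

REVIEW_AVSC = """{"type": "record", "name": "Review",
  "fields": [{"name": "rating", "type": "int"}, {"name": "text", "type": "string"}]}"""

PLACE_AVSC = """{"type": "record", "name": "Place",
  "fields": [{"name": "name", "type": "string"},
             {"name": "location", "type": "Point"},
             {"name": "review", "type": ["null", "Review"], "default": null}]}"""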
Example No. 21
                open(f"../test_data/{today.isoformat()}.avro", "ab"),
                DatumWriter(), schema)
        last_obs_date = today

        # printing to std out
        temp = response.json()['observations'][0]['metric']['temp']
        humidity = response.json()['observations'][0]['humidity']
        pressure = response.json()['observations'][0]['metric']['pressure']
        print(response.json()['observations'][0]['obsTimeLocal'] +
              f" | temp = {temp:.2f}C | {humidity}% humidity | {pressure} hPa")
        # reader = DataFileReader(open(f"../test_data/{previous_obs_date}.avro", "rb"), DatumReader())
        # for reading in reader:
        #     print(reading)
        # reader.close()

        writer.append(response.json()['observations'][0])
        if flush_countdown <= 0:
            writer.flush()
            flush_countdown = (60 * 60) / naptime  # hourly
            new_blob = bucket.blob(f'{today.isoformat()}.avro')
            new_blob.upload_from_filename(f'../test_data/{new_blob.name}')
        else:
            flush_countdown -= 1

        time.sleep(naptime)

    print(
        f'response status code = {response_status} –> engine.py broke at {datetime.now()}'
    )
    print(f'\n {response.json()}')
Example No. 22
def load_records(opts):
    s3client = get_boto3_client(opts)
    #ls_cos(s3client)

    consumer = get_consumer(opts)

    offsets = {}
    avro_file_buffer = io.BytesIO()
    writer = DataFileWriter(avro_file_buffer, DatumWriter(), get_avro_schema())
    record_count = 0
    while not exiting:
        data_dict = consumer.poll(timeout_ms=60000, max_records=100)

        if data_dict is None:
            print('No data found')
        else:
            for key in data_dict.keys():
                for msg in data_dict[key]:
                    #print(msg)
                    record_count += 1

                    topic = msg.topic
                    partition = msg.partition
                    offset = msg.offset

                    topicPartition = TopicPartition(topic, partition)
                    offsetAndMetadata = OffsetAndMetadata(offset, b'')

                    offsets[topicPartition] = offsetAndMetadata

                    j_msg = json.loads(msg.value.decode('utf-8'))

                    # some records have encoded StockCode as int - fix this up so they are all strings
                    j_msg['StockCode'] = str(j_msg['StockCode'])
                    writer.append(j_msg)

        if record_count >= 5000:
            writer.flush()
            avro_file_buffer.seek(0)
            s3obj = s3client.Object(
                'temp-bucket', 'transactions/{}.avro'.format(int(time.time())))
            s3obj.put(Body=avro_file_buffer)
            writer.close()
            #ls_cos(s3client) # for debugging
            #print(offsets) # for debugging

            # If commit fails to run successfully, we could receive duplicate data in S3 because the offsets will
            # get reprocessed. For more information, see:
            #
            # - https://cwiki.apache.org/confluence/display/KAFKA/FAQ#FAQ-HowdoIgetexactly-oncemessagingfromKafka?
            #
            # TODO: provide some examples for making the processing idempotent.
            consumer.commit(offsets)

            # reset buffers
            offsets = {}
            avro_file_buffer = io.BytesIO()
            writer = DataFileWriter(avro_file_buffer, DatumWriter(),
                                    get_avro_schema())
            record_count = 0

    consumer.close(autocommit=False)
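load_records() calls a get_avro_schema() helper that is not shown. The snippet only implies that StockCode is a string (it is coerced before append); a hypothetical sketch with the remaining fields assumed:

import json
import avro.schema

def get_avro_schema():
    # Only StockCode is implied by the snippet above; the other fields are assumptions.
    return avro.schema.Parse(json.dumps({
        "type": "record",
        "name": "Transaction",
        "fields": [
            {"name": "InvoiceNo", "type": "string"},
            {"name": "StockCode", "type": "string"},
            {"name": "Quantity", "type": "int"},
        ],
    }))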