from io import BufferedWriter, BytesIO

import avro.io
import requests
from avro.datafile import DataFileWriter


def score(graphs, schema, url, port):
    """
    graphs is expected to be a list of dictionaries, where each entry in the
    list represents a graph with
      * key idx -> index value
      * key vertices -> list of ints representing vertices of the graph
      * key edges -> list of lists of ints representing edges of the graph
      * key label (optional) -> label of the graph
    """
    stream = BufferedWriter(BytesIO())
    writer = DataFileWriter(stream, avro.io.DatumWriter(), schema)
    for graph in graphs:
        writer.append({
            "edges": graph["edges"],
            "vertices": graph["vertices"],
            "index": graph["idx"],
            "label": graph.get("label")
        })
    writer.flush()
    raw_bytes = stream.raw.getvalue()
    writer.close()

    url = "{}:{}/predictUnstructured/?ret_mode=binary".format(url.strip("/"), port)
    headers = {'Content-Type': 'application/octet-stream'}
    response = requests.request("POST", url, headers=headers, data=raw_bytes)
    return response
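# --- Usage sketch (an addition, not from the original source) ---
# Builds an Avro schema matching the records score() appends and posts one toy
# graph. The host and port are placeholders; the call needs a live
# /predictUnstructured endpoint to return anything.
import json

GRAPH_SCHEMA = avro.schema.parse(json.dumps({  # avro.schema.Parse on older avro-python3
    "type": "record",
    "name": "Graph",
    "fields": [
        {"name": "index", "type": "int"},
        {"name": "vertices", "type": {"type": "array", "items": "int"}},
        {"name": "edges", "type": {"type": "array",
                                   "items": {"type": "array", "items": "int"}}},
        {"name": "label", "type": ["null", "string"], "default": None},
    ],
}))

toy_graphs = [{"idx": 0, "vertices": [0, 1, 2], "edges": [[0, 1], [1, 2]]}]
# response = score(toy_graphs, GRAPH_SCHEMA, "http://localhost", 8080)  # needs a live server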
def encode(self, obj: BaseRecord) -> bytes:
    """
    Encode a *BaseHandlerEvent / BaseHandlerCommand / BaseHandlerResult* to bytes.

    This function is used by kafka-python.

    Args:
        obj (BaseRecord): *BaseHandlerEvent / BaseHandlerCommand / BaseHandlerResult*

    Raises:
        MissingEventClass: the BaseModel is not in the registered BaseModel list (self._schemas)
        AvroEncodeError: failed to encode the BaseModel to bytes

    Returns:
        bytes: the BaseModel as bytes
    """
    try:
        schema = self._schemas[obj.event_name()]
    except KeyError as err:
        self.logger.exception('%s', err)
        raise MissingEventClass

    try:
        output = BytesIO()
        writer = DataFileWriter(output, DatumWriter(), schema)
        writer.append(obj.to_dict())
        writer.flush()
        encoded_event = output.getvalue()
        writer.close()
    except AvroTypeException as err:
        self.logger.exception('%s', err)
        raise AvroEncodeError
    return encoded_event
def ExportToBin(self, data, schema=None) -> tuple:
    '''
    Exports the data object in binary format (bytes) using the given schema.
    '''
    if schema is not None:
        pschema = self._parseschema(schema)
        if pschema[0]:
            schema = self._data['schema']
        else:
            return pschema
    else:
        schema = self._data['schema']

    if not isinstance(schema, avro.schema.RecordSchema):
        schema = None

    try:
        with tempfile.SpooledTemporaryFile(suffix='.avro') as tmp:
            writer = DataFileWriter(tmp, DatumWriter(), schema)
            # Append a single record directly; iterate when a list is given.
            if not isinstance(data, list):
                writer.append(data)
            else:
                for d in data:
                    writer.append(d)
            writer.flush()
            tmp.seek(0)
            export_bin = tmp.read()
            writer.close()
        self._data['data'] = export_bin
        return (True, export_bin, self.getSchemaInfos())
    except Exception as e:
        return (False, str(e), self.getSchemaInfos())
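# --- Standalone sketch (my illustration, not part of the class above) ---
# Shows why ExportToBin needs isinstance(data, list) rather than `data is list`:
# `data is list` compares against the list *type object* and is always False
# for instances, so single records and lists must be told apart with isinstance.
import json
import tempfile

import avro.schema
from avro.datafile import DataFileWriter
from avro.io import DatumWriter

_USER_SCHEMA = avro.schema.parse(json.dumps({
    "type": "record", "name": "User",
    "fields": [{"name": "name", "type": "string"}],
}))

def to_bin(data):
    with tempfile.SpooledTemporaryFile(suffix='.avro') as tmp:
        writer = DataFileWriter(tmp, DatumWriter(), _USER_SCHEMA)
        records = data if isinstance(data, list) else [data]
        for record in records:
            writer.append(record)
        writer.flush()
        tmp.seek(0)
        return tmp.read()

assert to_bin({"name": "ana"})                    # single record
assert to_bin([{"name": "ana"}, {"name": "bo"}])  # list of records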
def produce_kafka_messages(topic, cluster, message, data_format):
    """Send basic messages to Kafka"""
    # Get Kafka producer
    producer = cluster.kafka.producer()

    basic_data_formats = ['XML', 'CSV', 'SYSLOG', 'NETFLOW', 'COLLECTD', 'BINARY',
                          'LOG', 'TEXT', 'JSON']

    # Write records into Kafka depending on the data_format.
    if data_format in basic_data_formats:
        producer.send(topic, message)
    elif data_format == 'WITH_KEY':
        producer.send(topic, message,
                      key=get_random_string(string.ascii_letters, 10).encode())
    elif data_format == 'AVRO':
        # Raw Avro datum: binary-encode the message without the container header.
        writer = avro.io.DatumWriter(avro.schema.Parse(json.dumps(SCHEMA)))
        bytes_writer = io.BytesIO()
        encoder = avro.io.BinaryEncoder(bytes_writer)
        writer.write(message, encoder)
        raw_bytes = bytes_writer.getvalue()
        producer.send(topic, raw_bytes)
    elif data_format == 'AVRO_WITHOUT_SCHEMA':
        # Object-container file: the schema travels with the payload.
        bytes_writer = io.BytesIO()
        datum_writer = avro.io.DatumWriter(avro.schema.Parse(json.dumps(SCHEMA)))
        data_file_writer = DataFileWriter(writer=bytes_writer,
                                          datum_writer=datum_writer,
                                          writer_schema=avro.schema.Parse(json.dumps(SCHEMA)))
        data_file_writer.append(message)
        data_file_writer.flush()
        raw_bytes = bytes_writer.getvalue()
        data_file_writer.close()
        producer.send(topic, raw_bytes)

    producer.flush()
def encode(self, raw_data):
    byte_stream = BytesIO()
    writer = DataFileWriter(byte_stream, DatumWriter(), self._schema)
    writer.append(raw_data)
    writer.flush()
    serialized_data = byte_stream.getvalue()
    writer.close()
    return serialized_data
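# --- Round-trip sketch (an addition, not from the source) ---
# Bytes from encode() form a complete Avro object-container file, so
# DataFileReader can decode them without being handed the schema separately.
from io import BytesIO

from avro.datafile import DataFileReader
from avro.io import DatumReader

def decode(serialized_data):
    reader = DataFileReader(BytesIO(serialized_data), DatumReader())
    try:
        return list(reader)
    finally:
        reader.close()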
def prepare(producer, arr, root, level):
    for it in arr:
        buf = io.BytesIO()
        writer = DataFileWriter(buf, DatumWriter(), sch)
        item = Item(root, it, False)
        writer.append(item.get_dict())
        writer.flush()
        send(buf, level, producer)
def create_archive(basedir, destdir):
    all_files = []
    all_dirs = []
    # make a snapshot in case the output directory is the bundle source - so we
    # don't recursively bundle the output
    for path, dirs, files in os.walk(basedir):
        for d in dirs:
            all_dirs.append(os.path.join(path, d))
        for f in files:
            all_files.append(os.path.join(path, f))

    schema = avro.schema.parse(
        open(os.path.join(os.path.dirname(os.path.realpath(__file__)),
                          "avro-schemas.json")).read())
    fileprefix = time.strftime("%Y%m%d-%H%M%S")
    avrofile = fileprefix + "-part-0001.avro"
    iteration = 1
    fd = open(os.path.join(destdir, avrofile), 'wb')
    datum = avro.io.DatumWriter()
    writer = DataFileWriter(fd, datum, schema, codec='deflate')
    try:
        for d in all_dirs:
            val = makedir(os.path.basename(os.path.normpath(d)),
                          os.path.relpath(d, basedir))
            writer.append(val)
        for f in all_files:
            for sibling, numsiblings, chunk in get_file_chunks(f):
                if (fd.tell() + len(chunk)) > maxfilesize * 1.1:
                    fd, writer, iteration = rotate_avro_file(fd, writer, iteration,
                                                             fileprefix, destdir,
                                                             datum, schema)
                file = makefile(os.path.basename(os.path.normpath(f)),
                                os.path.relpath(f, basedir),
                                numsiblings, sibling, chunk)
                writer.append(file)
                writer.flush()
                del file
        for f in all_files:
            os.remove(f)
        # os.walk yields parents before children, so remove in reverse order to
        # only rmdir directories that are already empty.
        for d in reversed(all_dirs):
            os.rmdir(d)
    finally:
        writer.close()
        fd.close()
def writer(self, outputs, stdout, stderr=sys.stderr):
    """Overrides base method for hadoop.JobTask"""
    schema = avro.schema.parse(json.dumps(self.avro_schema()))
    writer = DataFileWriter(stdout, DatumWriter(), schema)
    for output in outputs:
        writer.append(output[1])
    # No need to call close(); the luigi job will do that.
    writer.flush()
def make_record_set(self, schema_path: str, items: list) -> bytes:
    if schema_path not in self.schemas:
        with open(schema_path, 'rb') as raw:
            self.schemas[schema_path] = avro.schema.Parse(raw.read())
    out = BytesIO()
    writer = DataFileWriter(out, DatumWriter(), self.schemas[schema_path])
    for item in items:
        writer.append(item)
    writer.flush()
    return out.getvalue()
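# --- Hypothetical usage sketch (file name and record are made up, and it
# assumes make_record_set is reachable as a plain function) --- the method only
# needs an object carrying a `schemas` dict cache, so a bare holder suffices.
import json

class _SchemaHolder:
    def __init__(self):
        self.schemas = {}

with open("user.avsc", "w") as fh:
    json.dump({"type": "record", "name": "User",
               "fields": [{"name": "name", "type": "string"}]}, fh)

payload = make_record_set(_SchemaHolder(), "user.avsc", [{"name": "ana"}])
assert payload[:3] == b"Obj"  # Avro container files start with the magic bytes b'Obj'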
import io
import json

import avro.schema
from avro.datafile import DataFileWriter
from avro.io import DatumWriter


def serialize_records(msgs, schema) -> bytes:
    with io.BytesIO() as buf:
        writer = DataFileWriter(buf, DatumWriter(),
                                avro.schema.parse(json.dumps(schema)))
        for line_item in msgs:
            writer.append(line_item)
        writer.flush()
        return buf.getvalue()
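# --- Usage sketch (schema and records are illustrative) --- serialize_records
# takes the schema as a plain dict, since it runs json.dumps() on it itself
# before parsing.
LINE_ITEM_SCHEMA = {
    "type": "record",
    "name": "LineItem",
    "fields": [{"name": "sku", "type": "string"},
               {"name": "qty", "type": "int"}],
}

blob = serialize_records([{"sku": "A-1", "qty": 2}], LINE_ITEM_SCHEMA)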
def objToBinTmp2():
    with tempfile.SpooledTemporaryFile(suffix='.avro') as tmp:
        writer = DataFileWriter(tmp, DatumWriter(), sc)
        for d in datum:
            writer.append(d)
        writer.flush()
        tmp.seek(0)
        ab = tmp.read()
        writer.close()
    return ab
def encode(self, event: BaseEvent) -> bytes:
    # Use .get() so a missing event name reaches the explicit error below
    # instead of raising a bare KeyError.
    schema = self._schemas.get(event.name)
    if schema is None:
        raise NameError(
            f"No schema found to encode event with name {event.name}")
    output = BytesIO()
    writer = DataFileWriter(output, DatumWriter(), schema)
    writer.append(event.data)
    writer.flush()
    encoded_event = output.getvalue()
    writer.close()
    return encoded_event
from io import BytesIO

import avro.schema
from avro.datafile import DataFileWriter
from avro.io import DatumWriter


def avro_dumps(data, schema):
    """Dump the given data into an Avro file with the provided schema."""
    schema = avro.schema.Parse(schema)
    fp = BytesIO()
    writer = DataFileWriter(fp, DatumWriter(), schema)
    if isinstance(data, list):
        for item in data:
            writer.append(item)
    else:
        writer.append(data)
    writer.flush()
    contents = fp.getvalue()
    fp.close()
    return contents
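# --- Usage sketch (schema and records are illustrative, not from the source) ---
# avro_dumps accepts either one record or a list; either way the result is a
# self-describing container file that DataFileReader can load with no schema.
import json

from avro.datafile import DataFileReader
from avro.io import DatumReader

POINT_SCHEMA = json.dumps({
    "type": "record", "name": "Point",
    "fields": [{"name": "x", "type": "double"},
               {"name": "y", "type": "double"}],
})

one = avro_dumps({"x": 1.0, "y": 2.0}, POINT_SCHEMA)
many = avro_dumps([{"x": 1.0, "y": 2.0}, {"x": 3.0, "y": 4.0}], POINT_SCHEMA)

reader = DataFileReader(BytesIO(many), DatumReader())
assert list(reader) == [{"x": 1.0, "y": 2.0}, {"x": 3.0, "y": 4.0}]
reader.close()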
def produce_kafka_messages(topic, cluster, message, data_format):
    """Send basic messages to Kafka"""
    producer = cluster.kafka.producer()

    basic_data_formats = ['XML', 'CSV', 'SYSLOG', 'NETFLOW', 'COLLECTD', 'BINARY',
                          'LOG', 'PROTOBUF', 'JSON', 'TEXT']

    # Write records into Kafka depending on the data_format.
    if data_format in basic_data_formats:
        producer.send(topic, message)
    elif data_format == 'WITH_KEY':
        producer.send(topic, message,
                      key=get_random_string(string.ascii_letters, 10).encode())
    elif data_format == 'AVRO':
        writer = avro.io.DatumWriter(avro.schema.Parse(json.dumps(SCHEMA)))
        bytes_writer = io.BytesIO()
        encoder = avro.io.BinaryEncoder(bytes_writer)
        writer.write(message, encoder)
        raw_bytes = bytes_writer.getvalue()
        producer.send(topic, raw_bytes)
    elif data_format == 'AVRO_WITHOUT_SCHEMA':
        bytes_writer = io.BytesIO()
        datum_writer = avro.io.DatumWriter(avro.schema.Parse(json.dumps(SCHEMA)))
        data_file_writer = DataFileWriter(writer=bytes_writer,
                                          datum_writer=datum_writer,
                                          writer_schema=avro.schema.Parse(json.dumps(SCHEMA)))
        data_file_writer.append(message)
        data_file_writer.flush()
        raw_bytes = bytes_writer.getvalue()
        data_file_writer.close()
        producer.send(topic, raw_bytes)

    logger.info('Flushing producer')
    producer.flush()

    logger.info('Validating that the message can be seen in Kafka')
    consumer = cluster.kafka.consumer(consumer_timeout_ms=5000,
                                      auto_offset_reset='earliest')
    consumer.subscribe([topic])
    msgs_received = [msg for msg in consumer]
    assert 1 == len(msgs_received)
def encode(self, event: BaseModel) -> bytes:
    try:
        schema = self._schemas[event.event_name()]
    except KeyError as err:
        self.logger.exception('%s', err)
        raise MissingEventClass

    try:
        output = BytesIO()
        writer = DataFileWriter(output, DatumWriter(), schema)
        writer.append(event.__dict__)
        writer.flush()
        encoded_event = output.getvalue()
        writer.close()
    except AvroTypeException as err:
        self.logger.exception('%s', err)
        raise AvroEncodeError
    return encoded_event
def score_unstructured(model, data, query, **kwargs):
    print("Incoming content type params: ", kwargs)
    print("Incoming data type: ", type(data))
    print("Incoming query params: ", query)

    X = pd.read_csv(BytesIO(data))
    shap_values_dict = model.explain(X)
    predictions = model.predict(X).values

    stream = io.BufferedWriter(io.BytesIO())
    writer = DataFileWriter(stream, avro.io.DatumWriter(), model.schema)
    for p, s in zip(predictions, shap_values_dict):
        writer.append({"prediction": p[0], "shap_values": s})
    writer.flush()
    ret_bytes = stream.raw.getvalue()
    writer.close()
    return ret_bytes
def send(self, row):
    msg = row.get("payload")
    offset = row.get("modified")
    try:
        bytes_writer = io.BytesIO()
        writer = DataFileWriter(bytes_writer, DatumWriter(), self.schema,
                                codec='deflate')
        writer.append(msg)
        writer.flush()
        raw_bytes = bytes_writer.getvalue()
        writer.close()
        future = self.producer.send(self.topic, key=str(msg.get("id")),
                                    value=raw_bytes)
        # Block until it actually sends; we don't want offsets getting out of sync.
        try:
            record_metadata = future.get(timeout=10)
        except Exception as ke:
            print("Error submitting record")
            raise ke
        self.producer.flush()
        set_offset_value("entities", offset)
    except Exception as e:
        print("Issue with Topic %s : %s" % (self.topic, e))
        raise e
HOST, PORT = "localhost", 9999

sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
try:
    sock.connect((HOST, PORT))
    infile = open("/home/yd/Downloads/jvm-폰트설정", "rb")
    record = {'id': "test", 'data': None}
    datum = DatumWriter()
    data = BytesIO()
    writer = DataFileWriter(data, datum, FILE_SCHEMA)
    sent = 0
    while True:
        chunk = infile.read(8024)
        record['data'] = chunk
        if not chunk:
            break
        print(record)
        writer.append(record)
        writer.flush()
        # Send only the bytes flushed since the last iteration; getvalue()
        # returns the whole buffer, which would otherwise be re-sent each loop.
        buf = data.getvalue()
        sock.sendall(buf[sent:])
        sent = len(buf)
finally:
    sock.close()
def main():
    known_schemas = avro.schema.Names()

    with open("point.avsc", "rb") as fp:
        point = avro.schema.make_avsc_object(json.loads(fp.read()), known_schemas)

    with open("review.avsc", "rb") as fp:
        review = avro.schema.make_avsc_object(json.loads(fp.read()), known_schemas)

    with open("place.avsc", "rb") as fp:
        place = avro.schema.make_avsc_object(json.loads(fp.read()), known_schemas)

    output = io.BytesIO()
    writer = DataFileWriter(output, DatumWriter(), point)
    writer.append({'x': 1.5, 'y': 2.75})
    writer.flush()
    serialized = output.getvalue()
    reader = DataFileReader(io.BytesIO(serialized), DatumReader())
    deserialized = tuple(reader)[0]
    assert deserialized['x'] == 1.5
    assert deserialized['y'] == 2.75
    reader.close()
    writer.close()

    try:
        output = io.BytesIO()
        writer = DataFileWriter(output, DatumWriter(), point)
        writer.append({'x': 1.5})
        assert False
    except AvroTypeException:
        pass

    try:
        output = io.BytesIO()
        writer = DataFileWriter(output, DatumWriter(), point)
        writer.append({'x': 1.5, 'y': "wtanaka.com"})
        assert False
    except AvroTypeException:
        pass

    output = io.BytesIO()
    writer = DataFileWriter(output, DatumWriter(), place)
    writer.append({
        'name': 'wtanaka.com',
        'location': {'x': 1.5, 'y': 2.75}
    })
    writer.flush()
    serialized = output.getvalue()
    reader = DataFileReader(io.BytesIO(serialized), DatumReader())
    deserialized = tuple(reader)[0]
    assert deserialized['location']['x'] == 1.5
    assert deserialized['location']['y'] == 2.75
    reader.close()
    writer.close()

    output = io.BytesIO()
    writer = DataFileWriter(output, DatumWriter(), place)
    writer.append({
        'name': 'wtanaka.com',
        'location': {'x': 1.5, 'y': 2.75},
        'review': {'rating': 4, 'text': '4 stars would come again'},
    })
    writer.flush()
    serialized = output.getvalue()
    reader = DataFileReader(io.BytesIO(serialized), DatumReader())
    deserialized = tuple(reader)[0]
    assert deserialized['location']['x'] == 1.5
    assert deserialized['location']['y'] == 2.75
    reader.close()
    writer.close()

    try:
        output = io.BytesIO()
        writer = DataFileWriter(output, DatumWriter(), place)
        writer.append({
            'name': 'wtanaka.com',
            'location': {'x': 1.5, 'y': 2.75},
            'review': {'x': 1.5, 'y': 2.75},
        })
        assert False
    except AvroTypeException:
        pass
open(f"../test_data/{today.isoformat()}.avro", "ab"), DatumWriter(), schema) last_obs_date = today # printing to std out temp = response.json()['observations'][0]['metric']['temp'] humidity = response.json()['observations'][0]['humidity'] pressure = response.json()['observations'][0]['metric']['pressure'] print(response.json()['observations'][0]['obsTimeLocal'] + f" | temp = {temp:.2f}C | {humidity}% humidity | {pressure} hPa") # reader = DataFileReader(open(f"../test_data/{previous_obs_date}.avro", "rb"), DatumReader()) # for reading in reader: # print(reading) # reader.close() writer.append(response.json()['observations'][0]) if flush_countdown <= 0: writer.flush() flush_countdown = (60 * 60) / naptime # hourly new_blob = bucket.blob(f'{today.isoformat()}.avro') new_blob.upload_from_filename(f'../test_data/{new_blob.name}') else: flush_countdown -= 1 time.sleep(naptime) print( f'response status code = {response_status} –> engine.py broke at {datetime.now()}' ) print(f'\n {response.json()}')
def load_records(opts):
    s3client = get_boto3_client(opts)
    consumer = get_consumer(opts)
    offsets = {}
    avro_file_buffer = io.BytesIO()
    writer = DataFileWriter(avro_file_buffer, DatumWriter(), get_avro_schema())
    record_count = 0

    while not exiting:
        data_dict = consumer.poll(timeout_ms=60000, max_records=100)
        if data_dict is None:
            print('No data found')
        else:
            for key in data_dict.keys():
                for msg in data_dict[key]:
                    record_count += 1
                    topic = msg.topic
                    partition = msg.partition
                    offset = msg.offset
                    topicPartition = TopicPartition(topic, partition)
                    offsetAndMetadata = OffsetAndMetadata(offset, b'')
                    offsets[topicPartition] = offsetAndMetadata
                    j_msg = json.loads(msg.value.decode('utf-8'))
                    # some records have StockCode encoded as int - fix this up
                    # so they are all strings
                    j_msg['StockCode'] = str(j_msg['StockCode'])
                    writer.append(j_msg)

        if record_count >= 5000:
            writer.flush()
            avro_file_buffer.seek(0)
            s3obj = s3client.Object(
                'temp-bucket',
                'transactions/{}.avro'.format(int(time.time())))
            s3obj.put(Body=avro_file_buffer)
            writer.close()

            # If commit fails to run successfully, we could receive duplicate data
            # in S3 because the offsets will get reprocessed. For more information,
            # see:
            #
            # - https://cwiki.apache.org/confluence/display/KAFKA/FAQ#FAQ-HowdoIgetexactly-oncemessagingfromKafka?
            #
            # TODO: provide some examples for making the processing idempotent.
            consumer.commit(offsets)

            # reset buffers
            offsets = {}
            avro_file_buffer = io.BytesIO()
            writer = DataFileWriter(avro_file_buffer, DatumWriter(), get_avro_schema())
            record_count = 0

    consumer.close(autocommit=False)