def run(self, n):
    # JSON Serializer
    # serializer = ajs.AvroJsonSerializer(self.movies_schema)
    # json_data = serializer.to_json(self.movies_data)
    total_ser = 0
    total_deser = 0
    bytes_len = 0
    for i in range(0, n):
        # serialize
        datum_writer = DatumWriter(self.movies_schema)
        bytes_writer = io.BytesIO()
        encoder = BinaryEncoder(bytes_writer)
        tic = timeit.default_timer()
        datum_writer.write(self.movies_data, encoder)
        elapsed = timeit.default_timer() - tic
        payload = bytes_writer.getvalue()
        total_ser = total_ser + elapsed
        bytes_len = len(payload)

        # deserialize
        bytes_reader = io.BytesIO(payload)
        decoder = BinaryDecoder(bytes_reader)
        reader = DatumReader(self.movies_schema)
        tic2 = timeit.default_timer()
        movies = reader.read(decoder)
        elapsed2 = timeit.default_timer() - tic2
        total_deser = total_deser + elapsed2

    self.logger.log(logging.INFO, "serialized len: %s bytes", bytes_len)
    # average per-iteration times, converted from seconds to nanoseconds
    avg_ser = (total_ser * (10 ** 9)) / n
    avg_deser = (total_deser * (10 ** 9)) / n
    self.logger.log(logging.INFO, "Serialization time: \n%s", avg_ser)
    self.logger.log(logging.INFO, "De-serialization time: \n%s", avg_deser)

def main(args):
    log = logging.getLogger(__name__)
    log.setLevel(logging.INFO)
    sys_log = logging.handlers.SysLogHandler("/dev/log")
    sys_format = logging.Formatter('%(name)s[%(process)d]: %(levelname)s %(message)s')
    sys_log.setFormatter(sys_format)
    log.addHandler(sys_log)

    # Avro container files must be opened in binary mode
    reader = DataFileReader(open(args.avro_file, "rb"), DatumReader())
    schema = reader.datum_reader.writers_schema

    for i, row in enumerate(reader):
        log.debug("Consumer row:" + str(row))
        writer = DatumWriter(schema)
        bytes_writer = io.BytesIO()
        encoder = BinaryEncoder(bytes_writer)
        writer.write(row, encoder)
        raw_bytes = bytes_writer.getvalue()
        b64enc = base64.b64encode(raw_bytes)
        msg = {"messages": [{"data": b64enc}]}
        json_str = json.dumps(msg)
        log.debug("json msg:" + json_str)
        publish(json_str, args.ams_endpoint, args.ams_project, args.ams_topic,
                args.ams_key, log)

def toKey(self, x, avroType):
    x = jsonEncoder(avroType, x, False)
    bytes = io.BytesIO()
    writer = DatumWriter(avroType.schema)
    writer.write(x, BinaryEncoder(bytes))
    bytes.flush()
    return base64.b64encode(bytes.getvalue())

def __call__(self, state, scope, pos, paramTypes, x):
    schema = avro.schema.parse(json.dumps(paramTypes[0]))
    x = untagUnion(x, paramTypes[0])
    bytes = io.BytesIO()
    writer = DatumWriter(schema)
    writer.write(x, BinaryEncoder(bytes))
    bytes.flush()
    return bytes.getvalue()

def compose_data(timestamp, src_vmtype, host_ip, account_id, dest_ip):
    writer = DatumWriter(get_schema())
    bytes_writer = io.BytesIO()
    encoder = avro.io.BinaryEncoder(bytes_writer)
    message = '{"eventName": "Neighbour_Unreachable", "accountId":"%s", "destIp":"%s"}' \
              % (account_id, dest_ip)
    raw_data = bytes(message)
    writer.write({"timestamp": timestamp, "src": src_vmtype, "host_ip": host_ip,
                  "rawdata": raw_data}, encoder)
    raw_bytes = bytes_writer.getvalue()
    return raw_bytes

def compose_data(timestamp, src_vmtype, host_ip, account_id, proc_name):
    writer = DatumWriter(get_schema())
    bytes_writer = io.BytesIO()
    encoder = avro.io.BinaryEncoder(bytes_writer)
    message = '{"eventName": "Process_Down", "accountId":"%s", "ProcName":"%s"}' \
              % (account_id, proc_name)
    raw_data = bytes(message)
    writer.write({"timestamp": timestamp, "src": src_vmtype, "host_ip": host_ip,
                  "rawdata": raw_data}, encoder)
    raw_bytes = bytes_writer.getvalue()
    return raw_bytes

class AvroSerializer(object):

    def __init__(self, schema):
        self.schema = schema
        self.datum_writer = DatumWriter(schema)

    def serialize(self, record):
        f = StringIO()
        encoder = BinaryEncoder(f)
        self.datum_writer.write(record, encoder)
        return f.getvalue()

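# A minimal usage sketch for the AvroSerializer above, assuming Python 2-style
# dependencies to match the snippet (StringIO, avro.schema.parse). The "User"
# schema and the sample record are hypothetical.
import avro.schema

USER_SCHEMA = avro.schema.parse(
    '{"type": "record", "name": "User", "fields": ['
    '{"name": "name", "type": "string"}, {"name": "age", "type": "int"}]}'
)
serializer = AvroSerializer(USER_SCHEMA)
payload = serializer.serialize({"name": "Ada", "age": 36})
# payload holds schemaless Avro bytes; a reader needs the same schema to decode them.
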
class Serializer(object):

    def __init__(self, schema_str):
        schema = avro.schema.parse(schema_str)
        self.writer = DatumWriter(schema)

    def serialize(self, record):
        f = StringIO()
        encoder = BinaryEncoder(f)
        self.writer.write(record, encoder)
        return f.getvalue()

class Serializer(object):

    def __init__(self, schema_str):
        if sys.version_info >= (3,):
            schema = avro.schema.Parse(schema_str)
        else:
            schema = avro.schema.parse(schema_str)
        self.writer = DatumWriter(schema)

    def serialize(self, record):
        f = string_io()
        encoder = BinaryEncoder(f)
        self.writer.write(record, encoder)
        return f.getvalue()

def __init__(self, context):
    super(AvroWriter, self).__init__(context)
    job_conf = context.job_conf
    part = int(job_conf['mapreduce.task.partition'])
    outdir = job_conf["mapreduce.task.output.dir"]
    outfn = "%s/part-r-%05d.avro" % (outdir, part)
    wh = hdfs.open(outfn, "w")
    self.writer = DataFileWriter(wh, DatumWriter(), self.schema)

def encode(self, raw_data):
    byte_stream = BytesIO()
    writer = DataFileWriter(byte_stream, DatumWriter(), self._schema)
    writer.append(raw_data)
    writer.flush()
    serialized_data = byte_stream.getvalue()
    writer.close()
    return serialized_data

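# A hedged counterpart to encode() above: because the returned bytes form a
# complete Avro object-container file, they can be read back without knowing
# the schema in advance. The decode() helper name is an assumption, not part
# of the original class.
from io import BytesIO
from avro.datafile import DataFileReader
from avro.io import DatumReader

def decode(serialized_data):
    # DataFileReader recovers the writer's schema from the container header.
    reader = DataFileReader(BytesIO(serialized_data), DatumReader())
    records = list(reader)
    reader.close()
    return records
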
def write_pipeline_template_to_avro(pipeline, rtasks_d, output_file):
    d = pipeline_template_to_dict(pipeline, rtasks_d)
    f = open(output_file, 'w')
    with DataFileWriter(f, DatumWriter(), PT_SCHEMA) as writer:
        writer.append(d)
    return d

def write_avro_file(self, rec_creator, n_samples, sync_interval):
    avdf.SYNC_INTERVAL = sync_interval
    self.assertEqual(avdf.SYNC_INTERVAL, sync_interval)
    fo = self._mkf('data.avro')
    with avdf.DataFileWriter(fo, DatumWriter(), self.schema) as writer:
        for i in xrange(n_samples):
            writer.append(rec_creator(i))
    return fo.name

def gen_avro(filename):
    schema = avro.schema.parse(SCHEMA)
    fo = open(filename, "wb")
    writer = DataFileWriter(fo, DatumWriter(), schema)
    for record in looney_records():
        writer.append(record)
    writer.close()
    fo.close()

def _write(self, data):
    "Internal write API"
    wmaid = self.wmaid(data)
    schema = self.schema
    fname = file_name(self.hdir, wmaid)
    with open_file(fname, 'w') as ostream:
        with DataFileWriter(ostream, DatumWriter(), schema) as writer:
            writer.append(data)

def __create_nested(out_path):
    os.makedirs(out_path)
    schema_path = os.path.join(os.path.dirname(__file__), 'data/nested.avsc')
    schema = avro.schema.parse(open(schema_path).read())
    with DataFileWriter(open(os.path.join(out_path, 'part-m-00004.avro'), 'w'),
                        DatumWriter(), schema) as writer:
        writer.append({'sup': 1, 'sub': {'level2': 2}})
        writer.append({'sup': 2, 'sub': {'level2': 1}})

def prepare(producer, arr, root, level):
    for it in arr:
        buf = io.BytesIO()
        writer = DataFileWriter(buf, DatumWriter(), sch)
        item = Item(root, it, False)
        writer.append(item.get_dict())
        writer.flush()
        send(buf, level, producer)

def produce(self, msg):
    if self.ser_type == kfkcfg.SERIALIZATIO_JSON:
        # s = json.dumps(msg)
        s = json.dumps(msg, default=json_util.default)
        future = self.kfkprod.produce(bytes(s, 'utf-8'))
        # msg = json.dumps(msg, default=json_util.default).encode('utf-8')
        # future = self.kfkprod.produce(bytes(msg))
    elif self.ser_type == kfkcfg.SERIALIZATIO_AVRO:
        writer = DatumWriter(self.avro_schema)
        bytes_writer = io.BytesIO()
        encoder = BinaryEncoder(bytes_writer)
        writer.write(msg, encoder)
        raw_bytes = bytes_writer.getvalue()
        future = self.kfkprod.produce(raw_bytes)

def run(self):
    # for normalizing alcohol
    minimum, maximum, average = 100, 0, 0

    with open('raw.csv', 'r') as fd:
        csv_reader = csv.reader(fd, delimiter=',')
        collection = {}
        for i, row in enumerate(csv_reader):
            desc = row[3].lower().replace('.', '').replace(',', '')
            alc = float(row[-1])
            if alc < minimum:
                minimum = alc
            if alc > maximum:
                maximum = alc
            average += alc

            # Remove gifts or items without description
            if 'engin' in desc:
                continue
            if 'gjafa' in desc or 'gjafa' in row[0]:
                continue
            if 'öskju' in desc or 'öskju' in row[0]:
                continue
            if 'flöskur m/glasi' in desc or 'kútur' in row[0]:
                continue

            features = self.parse(desc.split(), row[0])
            features['alcohol'] = alc
            collection[row[0]] = features
        average = average / (i + 1)

    with open('beers.avsc', 'r') as fd:
        schema = avro.schema.Parse(fd.read())

    with open('beers.avro', 'wb') as fd:
        writer = DataFileWriter(fd, DatumWriter(), schema)
        denominator_alc = maximum - minimum
        for k, v in collection.items():
            v['bitterness'] = self.BITTERNESS['class'][
                v['bitterness']] / self.BITTERNESS['maximum']
            v['color'] = self.COLOR['class'][
                v['color']] / self.COLOR['maximum']
            v['clarity'] = self.CLARITY['class'][
                v['clarity']] / self.CLARITY['maximum']
            v['sweetness'] = self.SWEETNESS['class'][
                v['sweetness']] / self.SWEETNESS['maximum']
            v['alcohol'] = (v['alcohol'] - minimum) / denominator_alc
            v['name'] = k
            writer.append(v)
        writer.close()

class AvroInference():
    """Class representing a sink of Avro inference data to Apache Kafka.

    Args:
        boostrap_servers (str): List of Kafka brokers
        topic (str): Kafka topic
        data_scheme_filename (str): Filename of the AVRO scheme for training data
        group_id (str): Group ID of the Kafka consumer. Defaults to sink
    """

    def __init__(self, boostrap_servers, topic, data_scheme_filename, group_id='sink'):
        self.boostrap_servers = boostrap_servers
        self.topic = topic
        self.data_scheme_filename = data_scheme_filename
        self.data_schema = open(self.data_scheme_filename, "r").read()
        self.avro_data_schema = avro.schema.Parse(self.data_schema)
        self.data_writer = DatumWriter(self.avro_data_schema)
        self.data_io = io.BytesIO()
        self.data_encoder = BinaryEncoder(self.data_io)
        self.__producer = KafkaProducer(bootstrap_servers=self.boostrap_servers)

    def send(self, data):
        self.data_writer.write(data, self.data_encoder)
        data_bytes = self.data_io.getvalue()
        self.__producer.send(self.topic, data_bytes)
        # Cleans data buffer for the next message
        self.data_io.seek(0)
        self.data_io.truncate(0)

    def close(self):
        self.__producer.flush()
        self.__producer.close()

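# A hedged usage sketch for AvroInference; the broker address, topic name,
# schema path and record fields below are placeholders, and the record must
# match whatever the referenced .avsc actually declares.
sink = AvroInference("localhost:9092", "inference", "inference.avsc")
sink.send({"prediction": 0.87, "label": "spam"})
sink.close()
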
def check_schema(self, data, schema_path):
    schema = avro.schema.Parse(open(schema_path, "rb").read().decode("utf-8"))
    writer = DataFileWriter(open('_test.avro', "wb"), DatumWriter(), schema)
    writer.append(data)
    writer.close()

def _load_datawriter(self):
    try:
        lschema = load_schema(self.schema)
        self.avrofile = open(self.outfile, 'w+b')
        self.datawrite = DataFileWriter(self.avrofile, DatumWriter(), lschema)
    except Exception:
        return False
    return True

def _write_items(base_name, schema_str, items):
    avro_schema = schema.Parse(schema_str)
    avro_file = base_name + '.avro'
    # the with-block closes (and flushes) the writer on exit
    with DataFileWriter(open(avro_file, "wb"), DatumWriter(), avro_schema) as writer:
        for i in items:
            writer.append(i)
    return avro_file

def _create_avro_file(schema, items, file_prefix):
    _, result_file_path = tempfile.mkstemp(prefix=file_prefix, suffix='.avro')
    parsed_schema = avro.schema.Parse(schema)
    with open(result_file_path, 'wb') as f:
        writer = DataFileWriter(f, DatumWriter(), parsed_schema)
        for s in items:
            writer.append(s)
        writer.close()
    return result_file_path

def _create_or_update_table(
    self,
    data,
    table_name,
    schema_name,
    schema_suffix,
    columns_definition,
    load_strategy,
    upload_call_count,
    database_name=None,
    primary_key=None,
):
    # This method doesn't actually create or update a table. It just creates
    # and populates a single .avro file which is used in the data upload.
    # The actual upload happens when the commit() method is called.
    if upload_call_count == 1:
        # Create avro writer and file in temporary folder
        self.avro_folder = TemporaryDirectory()
        self.avro_file_name = self.avro_folder.name + os.sep + table_name + ".avro"
        avro_schema = avro.schema.parse(
            json.dumps({
                "type": "record",
                "name": table_name,
                "namespace": table_name,
                "fields": [{
                    "name": name,
                    "type": [
                        "null",
                        map_bq_data_type_to_avro(field["data_type"]),
                    ],
                } for name, field in columns_definition.items()],
            }))
        # Create the avro_writer object to be used going forward
        self.avro_writer = DataFileWriter(
            open(self.avro_file_name, "wb"), DatumWriter(), avro_schema)
        # Save the relevant kwargs for later use in the commit() method
        self.table_creation_config = {
            "table_name": table_name,
            "schema_name": schema_name,
            "schema_suffix": schema_suffix,
            "columns_definition": columns_definition,
            "load_strategy": load_strategy,
            "database_name": database_name,
            "primary_key": primary_key,
        }
        self.log.info(
            "BigQuery Uploader writes data into Avro file for later one-off upload!"
        )

    while data:
        # Write records to .avro file
        self.avro_writer.append(data.pop(0))

def serialize_records(records, coin, avro_output=None):
    if avro_output is None:
        avro_output = str(coin) + ".avro"
    transformer = transform_data()
    schema = transformer.parse_schema()
    # avro_output = str(coin) + ".avro"
    with open(avro_output, 'wb') as out:
        writer = DataFileWriter(out, DatumWriter(), schema)
        for record in records:
            writer.append(record)
        # close the writer so buffered blocks are flushed to the file
        writer.close()

def test_sanity():
    """
    Ensures that our "base" and "good" schemas are actually forwards- and
    backwards-compatible
    """
    # fst schema / record
    fst_schema = schema.parse(open("%s/MyRecord.base.avsc" % BASE_DIR).read())
    fst_writer = DatumWriter(writers_schema=fst_schema)
    fst_record = {
        "fieldWithoutDefaultValue": 0,
        "properField": 0,
        "enumField": "A",
        "unionField": None,
        "arrayField": ["world"],
        "mapField": {"hello": "world"},
        "fixedField": "aaaaaaaaaaaaaaaa"
    }

    # sec schema / record
    sec_schema = schema.parse(open("%s/MyRecord.good.avsc" % BASE_DIR).read())
    sec_writer = DatumWriter(writers_schema=sec_schema)
    sec_record = {
        "fieldWithoutDefaultValue": 0,
        "properField2": 0,
        "enumField": "B",
        "unionField": None,
        "arrayField": ["world"],
        "fixedField": "bbbbbbbbbbbbbbbb"
    }

    # Encode record w/ fst
    fst_buf = StringIO.StringIO()
    fst_encoder = BinaryEncoder(fst_buf)
    fst_writer.write(fst_record, fst_encoder)
    fst_data = fst_buf.getvalue()

    # Encode record w/ sec
    sec_buf = StringIO.StringIO()
    sec_encoder = BinaryEncoder(sec_buf)
    sec_writer.write(sec_record, sec_encoder)
    sec_data = sec_buf.getvalue()

    # writers == fst, readers == sec
    sec_reader = DatumReader(writers_schema=fst_schema, readers_schema=sec_schema)
    sec_decoder = BinaryDecoder(StringIO.StringIO(fst_data))
    sec_from_fst = sec_reader.read(sec_decoder)  # no exception -> good

    # writers == sec, readers == fst
    fst_reader = DatumReader(writers_schema=sec_schema, readers_schema=fst_schema)
    fst_decoder = BinaryDecoder(StringIO.StringIO(sec_data))
    fst_from_sec = fst_reader.read(fst_decoder)  # no exception -> good

def put_frame(video_name, video_number, pic):
    writer = DatumWriter(SCHEMA)
    bytes_writer = io.BytesIO()
    encoder = avro.io.BinaryEncoder(bytes_writer)
    writer.write(
        {
            "rtsp": "rtsp",
            "createTime": time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time())),
            "videoName": video_name,
            "videoNumber": video_number,
            "picContents": pic
        }, encoder)
    raw_bytes = bytes_writer.getvalue()
    PRODUCER.send_messages(TOPIC, raw_bytes)

def _load_file(self, file_path, schema) -> DataFileWriter:
    f = open(file_path, 'ab+')
    self.cache[file_path] = dict()
    self.cache[file_path]['file_io'] = f
    writer = DataFileWriter(f, DatumWriter(), schema)
    self.cache[file_path]['datum_writer'] = writer
    self.cache.move_to_end(file_path)
    if len(self.cache) > self.capacity:
        self._remove_item()
    return writer

def serialize_records(records, outpath="funding.avro"):
    schema = parse_schema()
    # with open(outpath, 'wb') as out:
    out = StringIO()
    writer = DataFileWriter(out, DatumWriter(), schema)
    for record in records:
        record = dict((f, getattr(record, f)) for f in record._fields)
        record['fundedDate'] = record['fundedDate'].strftime('%Y-%m-%dT%H:%M:%S')
        writer.append(record)
    # flush so the buffered block is written into the StringIO before returning
    writer.flush()
    return out

def objToBin2():
    file = io.BytesIO()
    datum_writer = DatumWriter()
    fwriter = DataFileWriter(file, datum_writer, sc)
    for d in datum:
        fwriter.append(d)
    # flush before reading the buffer, then close the writer
    fwriter.flush()
    ab = file.getvalue()
    fwriter.close()
    return ab

def import_data(schema, src, dest, index, debug):
    global next_update
    global verbose

    index = int(index)
    verbose = int(debug)
    in_file = os.path.join(src, "MLHD_%03d.tar" % index)
    out_file = os.path.join(dest, "MLHD_%03d.avro" % index)
    count = 0
    next_update = time() + UPDATE_INTERVAL

    schema = avro.schema.Parse(open(schema, "rb").read().decode('ascii'))
    with DataFileWriter(open(out_file, "wb"), DatumWriter(), schema, codec='deflate') as writer:
        tar = tarfile.open(in_file)
        total = 0
        chunks = []
        size = 0
        for i, member in enumerate(tar.getnames()):
            count, data = handle_file(member, tar.extractfile(member).read())
            chunks.append(data)
            total += count
            size += len(data)
            if verbose:
                print("%03d: %d rows processed, %s total rows, %d bytes of output." %
                      (index, count, total, size))
                sys.stdout.flush()

            if size > MAX_SIZE:
                for chunk in chunks:
                    try:
                        for js in chunk:
                            writer.append(js)
                    except Exception as err:
                        print("%03d: err writing file: %s" % (index, err))
                        sys.exit(-1)
                chunks = []
                size = 0

        tar.close()
        if verbose:
            print("%03d: finish writing output file." % index)
            sys.stdout.flush()

        for chunk in chunks:
            try:
                for js in chunk:
                    writer.append(js)
            except Exception as err:
                print("%03d: err writing file: %s" % (index, err))
                sys.exit(-1)

def main():
    # Define schema of avro file.
    schema = avro.schema.Parse(open("logs_uuid.avsc", "rb").read())

    # Create a datum writer.
    rwriter = DatumWriter(schema)

    files = ['logs_0.txt', 'logs_1.txt', 'logs_2.txt', 'logs_3.txt']

    # Loop to process the files
    for f in files:
        # open file and store in a variable
        logfile = open(f, "r")
        text = logfile.readlines()
        logfile.close()

        # Set the avro file name (new)
        newfile = str(f).replace('.txt', 'uuid.avro')

        # Create a data file writer.
        dfwriter = DataFileWriter(open(newfile, "wb"), DatumWriter(), schema)

        # Loop to get information from each line
        for line in text:
            # Get the variables from line.
            sdt, surl, suser = line.strip().split('\t')

            # Defines a dictionary structure
            data = {}
            data['timestamp'] = sdt
            data['url'] = surl
            data['user'] = suser
            data['uuid'] = str(uuid.uuid1())

            # Write the data in the file.
            dfwriter.append(data)

        # Close the file after the loop.
        dfwriter.close()

def testWrite(filename, schema):
    fd = open(filename, 'wb')
    datum = DatumWriter()
    writer = DataFileWriter(fd, datum, schema)
    writer.append(makeObject("Person A", 23))
    writer.append(makeObject("Person B", 31))
    writer.append(makeObject("Person C", 28))
    writer.close()

def make_record_set(self, schema_path: str, items: list) -> bytes:
    if schema_path not in self.schemas:
        with open(schema_path, 'rb') as raw:
            self.schemas[schema_path] = avro.schema.Parse(raw.read())
    out = BytesIO()
    writer = DataFileWriter(out, DatumWriter(), self.schemas[schema_path])
    for item in items:
        writer.append(item)
    writer.flush()
    return out.getvalue()

def _write_records_to_avro(schema, _d_or_ds, output_file):
    # FIXME. There's only one record being written here,
    # why does this not support a single item
    if isinstance(_d_or_ds, dict):
        _d_or_ds = [_d_or_ds]
    with open(output_file, 'w') as outs:
        with DataFileWriter(outs, DatumWriter(), schema) as writer:
            for record in _d_or_ds:
                writer.append(record)
    log.debug("Write avro file to {p}".format(p=output_file))
    return _d_or_ds

def __init__(self, context):
    super(AvroWriter, self).__init__(context)
    self.logger = LOGGER.getChild('AvroWriter')
    job_conf = context.job_conf
    part = int(job_conf['mapreduce.task.partition'])
    outdir = job_conf["mapreduce.task.output.dir"]
    outfn = "%s/part-r-%05d.avro" % (outdir, part)
    wh = hdfs.open(outfn, "w")
    self.logger.debug('created hdfs file %s', outfn)
    self.writer = DataFileWriter(wh, DatumWriter(), self.schema)
    self.logger.debug('opened AvroWriter')

def __init__(self, boostrap_servers, topic, data_scheme_filename, group_id='sink'):
    self.boostrap_servers = boostrap_servers
    self.topic = topic
    self.data_scheme_filename = data_scheme_filename
    self.data_schema = open(self.data_scheme_filename, "r").read()
    self.avro_data_schema = avro.schema.Parse(self.data_schema)
    self.data_writer = DatumWriter(self.avro_data_schema)
    self.data_io = io.BytesIO()
    self.data_encoder = BinaryEncoder(self.data_io)
    self.__producer = KafkaProducer(bootstrap_servers=self.boostrap_servers)

def write(self, data):
    # Parsing data to select only keys in schema
    store_data = {}
    for key in self.keys:
        if key in data:
            store_data[key] = data[key]
        else:
            store_data[key] = None

    # Serialize data using AVRO
    writer = DatumWriter(self.schema)
    bytes_writer = io.BytesIO()
    encoder = avro.io.BinaryEncoder(bytes_writer)
    writer.write(store_data, encoder)
    raw_bytes = bytes_writer.getvalue()

    # Place into pipeline
    print(data)
    self.producer.send(self.topic, raw_bytes)

def __init__(self, schema_str):
    schema = avro.schema.parse(schema_str)
    self.writer = DatumWriter(schema)

def toKey(self, x, schema):
    bytes = io.BytesIO()
    writer = DatumWriter(schema)
    writer.write(x, BinaryEncoder(bytes))
    bytes.flush()
    return base64.b64encode(bytes.getvalue())

def serialize(data):
    writer = DatumWriter(schema)
    bytes_writer = io.BytesIO()
    encoder = avro.io.BinaryEncoder(bytes_writer)
    writer.write(data, encoder)
    return bytes_writer.getvalue()

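# A hedged counterpart to serialize() above, decoding the schemaless bytes with
# the same module-level schema object. The deserialize() name is an assumption.
import io
import avro.io

def deserialize(raw_bytes):
    decoder = avro.io.BinaryDecoder(io.BytesIO(raw_bytes))
    reader = avro.io.DatumReader(schema)  # same writer schema used by serialize()
    return reader.read(decoder)
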
def __init__(self, schema_str):
    if sys.version_info >= (3,):
        schema = avro.schema.Parse(schema_str)
    else:
        schema = avro.schema.parse(schema_str)
    self.writer = DatumWriter(schema)

"""Python avro official implementation encoding benchmark.""" from io import BytesIO from itertools import repeat from time import time from avro.datafile import DataFileReader, DataFileWriter from avro.io import DatumReader, DatumWriter, BinaryEncoder, BinaryDecoder import sys LOOPS = 1 with open(sys.argv[1]) as reader: datum_reader = DatumReader() file_reader = DataFileReader(reader, datum_reader) SCHEMA = datum_reader.writers_schema RECORDS = list(file_reader) buf = BytesIO() datum_writer = DatumWriter(SCHEMA) start = time() n = 0 for _ in repeat(None, LOOPS): for record in RECORDS: buf.seek(0) encoder = BinaryEncoder(buf) datum_writer.write(record, encoder) n += 1 print 1000. * (time() - start) / n
def __init__(self, schema):
    self.schema = schema
    self.datum_writer = DatumWriter(schema)

def createAvroMemoryRecord(data, schema):
    f = StringIO()
    encoder = BinaryEncoder(f)
    writer = DatumWriter(schema)
    writer.write(dict(data), encoder)
    return f.getvalue()

#
# NB: the AvroOutputReader specific portion begins here
#

def new_column(name, value):
    column = dict()
    column['name'] = '%s' % name
    column['value'] = '%s' % value
    column['timestamp'] = long(time.time() * 1e6)
    column['ttl'] = 0
    return column

# parse the current avro schema
proto = avro.protocol.parse(open('cassandra.avpr').read())
schema = proto.types_dict['StreamingMutation']

# open an avro encoder and writer for stdout
enc = BinaryEncoder(sys.stdout)
writer = DatumWriter(schema)

# output a series of objects matching 'StreamingMutation' in the Avro interface
smutation = dict()
try:
    for word, count in word2count.iteritems():
        smutation['key'] = word
        smutation['mutation'] = {'column_or_supercolumn': {'column': new_column('count', count)}}
        writer.write(smutation, enc)
finally:
    sys.stdout.flush()

from time import time

# To send messages synchronously
producer = KafkaProducer(bootstrap_servers="localhost:9092",
                         compression_type="gzip")

# Kafka topic
topic = "tnx"

# Path to user.avsc avro schema
schema_path = "/home/cloudera/workspace/kafka-clients-python/transactions.avsc"
schema = avro.schema.Parse(open(schema_path).read())
print("Schema", schema.to_json())

writer = DatumWriter(schema)
bytes_writer = io.BytesIO()
encoder = avro.io.BinaryEncoder(bytes_writer)

def get_record():
    return {"id": "123",
            "merchant_id": "m123",
            "customer_id": "c345",
            "amount": 100.1,
            "category": "pos",
            "timestamp": int(time())}

for i in range(10):
    record = get_record()
    writer.write(record, encoder)

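# As written, the loop above only accumulates encoded records in bytes_writer
# and never publishes them. A hedged completion, assuming one Kafka message per
# record: reset the buffer each iteration and send the raw Avro bytes.
for i in range(10):
    bytes_writer.seek(0)
    bytes_writer.truncate(0)
    writer.write(get_record(), encoder)
    producer.send(topic, bytes_writer.getvalue())
producer.flush()
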