def test_xduplicates(self):
    """Exercise duplicate-record generation with a uniform distribution."""
    model = Table()
    dup = model.info.aux.duplicate
    dup.probability = 1
    dup.distribution = "uniform"
    dup.maximum = 1

    schema = model.info.schema.info

    rec_id = schema.fields.add()
    rec_id.name = "record_id"
    rec_id.info.type = "String"
    rec_id.info.length = 10

    person = schema.fields.add()
    person.name = "Name"
    person.info.type = "String"
    person.info.length = 10
    person.info.aux.generator.name = "name"

    upc = schema.fields.add()
    upc.name = "UPC"
    upc.info.type = "Integer"
    upc.info.length = 13
    upc.info.aux.generator.name = "ean"
    ndigits = upc.info.aux.generator.parameters.add()
    ndigits.name = "ndigits"
    ndigits.value = 13
    ndigits.type = "int"

    synth = Synthesizer(model, "en_CA", idx=0, seed=4053)
    print(synth.generate())
def test_glm_proto(self):
    """Build a two-predictor GLM model and synthesize one record."""
    model = Table()
    schema = model.info.schema.info

    # Predictor columns feeding the "Prediction" field.
    predictors = []
    for col in ("Value1", "Value2"):
        fld = schema.fields.add()
        fld.name = col
        fld.info.type = "Float"
        fld.info.length = 10
        fld.info.aux.generator.name = "random_int"
        fld.info.aux.dependent = "Prediction"
        predictors.append(fld)

    pred = schema.fields.add()
    pred.name = "Prediction"
    pred.info.type = "Float"
    pred.info.length = 10
    pred.info.aux.generator.name = "glm"

    # Scalar GLM coefficients, in the same order as the original.
    for pname, pval, ptype in (
        ("beta1", 10, "int"),
        ("beta2", 0.1, "float"),
        ("beta3", 100, "int"),
        ("sigma", 1, "int"),
    ):
        parm = pred.info.aux.generator.parameters.add()
        parm.name = pname
        parm.value = pval
        parm.type = ptype

    # Field-typed parameters referencing the predictor columns.
    for fld in predictors:
        var = pred.info.aux.generator.parameters.add()
        var.name = fld.name
        var.type = "Field"
        var.variable.CopyFrom(fld)

    synth = Synthesizer(model, "en_CA")
    print(synth.generate())
def test_gen_from_proto(self):
    """Synthesize from a minimal single-field model built in protobuf."""
    model = Table()
    model.name = "EvolveModel"

    name_field = model.info.schema.info.fields.add()
    name_field.name = "Name"
    name_field.info.type = "String"
    name_field.info.length = 10
    name_field.info.aux.generator.name = "name"

    synth = Synthesizer(model, "en_CA", idx=0, seed=4053)
    print(synth.generate())
class RecordBatchGen:
    """
    Record-batch generator.

    Synthesizes batches of fake records from a protobuf ``Table`` model via
    a ``Synthesizer`` and serializes each batch in one of three formats
    selected by the ``file_type`` property: 1 = CSV, 2 = fixed-width (FWF),
    5 = Arrow RecordBatch.  Iteration (``__next__``) produces one encoded
    batch per call, up to ``nbatches``.
    """

    def __init__(self, name, **kwargs):
        """
        Configure logging and properties, deserialize the table model,
        and cache the generation options.

        Parameters
        ----------
        name : str
            Algorithm/generator name.
        **kwargs
            Options merged over ``RecordBatchGenOptions`` defaults
            (seed, nbatches, table_id, table_msg, num_rows, linesep,
            nsamples, file_type, codec, ...).
        """
        Logger.configure(self, **kwargs)
        self.__name = name

        # Merge caller options over the defaults and expose them as properties.
        self.properties = Properties()
        options = dict(RecordBatchGenOptions())
        options.update(kwargs)
        for key, value in options.items():
            self.properties.add_property(key, value)

        # Seeded RNG for reproducible generation when a seed is supplied.
        if hasattr(self.properties, "seed"):
            self.rnd = check_random_state(seed=self.properties.seed)
        else:
            self.rnd = check_random_state(seed=None)

        if hasattr(self.properties, "nbatches"):
            self._nbatches = self.properties.nbatches
            self._batch_iter = iter(range(self.properties.nbatches))
        else:
            self.__logger.warning("Number of batches not defined")

        self._batchidx = 0

        # Deserialize the protobuf table model describing the schema.
        self.table_id = self.properties.table_id
        self.table = Table()
        if hasattr(self.properties, "table_msg"):
            self.table.ParseFromString(self.properties.table_msg)
        else:
            self.__logger.warning("No table message to deserialize")

        self.num_rows = self.properties.num_rows
        self.linesep = self.properties.linesep
        self.nsamples = self.properties.nsamples
        self.file_type = self.properties.file_type
        self.codec = self.properties.codec

        # Set lazily in initialize().
        self.synthesizer = None
        self.num_cols = None
        self.write_batch = None
        self.header = None

        # FWF signed-overpunch maps: the last digit of a signed integer is
        # replaced by a letter encoding both the digit and the sign.
        self.pos_char = {
            "0": "{", "1": "A", "2": "B", "3": "C", "4": "D",
            "5": "E", "6": "F", "7": "G", "8": "H", "9": "I",
        }
        self.neg_char = {
            "0": "}", "1": "J", "2": "K", "3": "L", "4": "M",
            "5": "N", "6": "O", "7": "P", "8": "Q", "9": "R",
        }

        self.header_offset = 0
        self.footer = ""
        self.footer_size = 0

        self.__logger.info("Initialized %s", self.__class__.__name__)
        self.__logger.info(
            "%s properties: %s", self.__class__.__name__, self.properties
        )
        print("Initialize RecordBatchGen")

    @property
    def random_state(self):
        # NOTE(review): _builtin_generator is never assigned in this class —
        # confirm it is provided by a mixin/base; otherwise this raises
        # AttributeError.
        return self._builtin_generator.rnd

    @property
    def name(self):
        """Algorithm name."""
        return self.__name

    def reset(self):
        """Rewind the batch iterator so generation can restart from batch 0."""
        if hasattr(self, "_nbatches"):
            self._batch_iter = iter(range(self._nbatches))
        else:
            self.__logger.warning("Override reset in concrete class")

    def to_msg(self):
        """Serialize this generator's identity and properties to an Algo_pb."""
        message = Algo_pb()
        message.name = self.name
        message.klass = self.__class__.__name__
        message.module = self.__module__
        message.properties.CopyFrom(self.properties.to_msg())
        return message

    @staticmethod
    def from_msg(logger, msg):
        """
        Reconstruct a generator instance from an Algo_pb message.

        Imports ``msg.module``, resolves ``msg.klass`` and instantiates it
        with the deserialized properties.  Raises (after logging) on any
        import/resolution/construction failure.
        """
        logger.info("Loading Algo from msg %s", msg.name)
        try:
            module = importlib.import_module(msg.module)
        except ImportError:
            logger.error("Unable to load module %s", msg.module)
            raise
        except Exception as e:
            logger.error("Unknown error loading module: %s" % e)
            raise

        try:
            class_ = getattr(module, msg.klass)
        except AttributeError:
            logger.error("%s: missing attribute %s" % (msg.name, msg.klass))
            raise
        except Exception as e:
            logger.error("Reason: %s" % e)
            raise

        properties = Properties.from_msg(msg.properties)
        logger.debug(pformat(properties))

        # Update the logging level of algorithms if loglevel not set.
        # Ensures user-defined algos get the artemis level logging.
        if "loglevel" not in properties:
            properties["loglevel"] = logger.getEffectiveLevel()

        try:
            instance = class_(msg.name, **properties)
        except AttributeError:
            logger.error("%s: missing attribute %s" % (msg.name, "properties"))
            raise
        except Exception as e:
            # BUG FIX: original used "%s: Cannot initialize %s" % e, which
            # raises TypeError (two placeholders, one argument) inside the
            # handler and masks the real error.
            logger.error("%s: Cannot initialize %s" % (msg.name, e))
            raise

        return instance

    @property
    def num_batches(self):
        return self._nbatches

    @num_batches.setter
    def num_batches(self, n):
        self._nbatches = n

    def __iter__(self):
        return self

    def initialize(self):
        """
        Resolve the schema header, build the Synthesizer, and select the
        batch-writing method for the configured file type.
        """
        # Dispatch table controlling the writing method (1=csv, 2=fwf, 5=arrow).
        method_switch = {
            1: self.write_batch_csv,
            2: self.write_batch_fwf,
            5: self.write_batch_arrow,
        }
        self.__logger.info("RecordBatchGenerator")

        # Derive column count and header names from the schema fields.
        self.num_cols = len(self.table.info.schema.info.fields)
        self.header = [
            field.name for field in self.table.info.schema.info.fields
        ]

        # Seed the synthesizer only when a seed property was supplied.
        if hasattr(self.properties, "seed"):
            self.synthesizer = Synthesizer(
                self.table, idx=0, seed=self.properties.seed
            )
        else:
            self.synthesizer = Synthesizer(self.table, idx=0)

        try:
            self.write_batch = method_switch[self.file_type]
        except KeyError:
            self.__logger.info("RecordBatchGenerator: Filetype not 1, 2, or 5")

    def chunk(self):
        """
        Yield ``num_rows`` synthesized rows as tuples.

        Allow for concurrent generate during write.
        """
        for _ in range(self.num_rows):
            try:
                yield tuple(self.synthesizer.generate())
            except TypeError:
                self.__logger.error("Generator function must return list")
                raise
            except Exception as error:
                self.__logger.error("Unknown error in chunk")
                self.__logger.error(error)

    def fwf_encode_row(self, row):
        """
        Encode one row as a fixed-width record string.

        Signed integers get overpunch encoding (sign folded into the last
        digit); all other fields are treated as strings.  Each field is
        truncated/padded to its schema length: numeric fields are
        zero-padded on the left, others space-padded on the right.
        """
        record = ""
        # NOTE(review): initialize() reads table.info.schema.info.fields but
        # this uses table.info.schema.fields — confirm which path is correct
        # for this proto; left unchanged here.
        fields = list(self.table.info.schema.fields)
        for i, dpoint in enumerate(row):
            field_schema = fields[i]
            dpoint = str(dpoint)
            if field_schema["utype"] == "int":
                # BUG FIX: original compared the *string* dpoint to 0
                # (TypeError in Python 3); test the sign on the text instead.
                if dpoint.startswith("-"):
                    # Negative: drop the sign, overpunch-encode last digit.
                    dpoint = dpoint[1:]
                    # BUG FIX: original str.replace() substituted every
                    # occurrence of the last digit; only the final position
                    # must be encoded.
                    dpoint = dpoint[:-1] + self.neg_char[dpoint[-1]]
                else:
                    dpoint = dpoint[:-1] + self.pos_char[dpoint[-1]]
            # Ensure generated field is within schema length.
            dpoint = dpoint[: field_schema["length"]]
            # Pad up to required length.
            if field_schema["utype"] in ("int", "uint"):
                dpoint = dpoint.rjust(field_schema["length"], "0")
            else:
                dpoint = dpoint.ljust(field_schema["length"])
            record += dpoint
        return record

    def write_batch_fwf(self):
        """
        Generate a batch of records, convert rows to fixed-width fields,
        and encode to bytes in the configured codec.
        """
        fwf = io.StringIO()
        for row in self.chunk():
            if len(row) != len(self.header):
                raise ValueError(
                    "Row length does not match header length"
                )
            fwf.write(self.fwf_encode_row(row))
        return pa.py_buffer(fwf.getvalue().encode(self.codec))

    def write_batch_csv(self):
        """
        Generate a batch of records and encode them as CSV bytes
        (header line first, then one line per row).
        """
        csv = io.StringIO()
        if self.header:
            csv.write(",".join(self.header))
            csv.write(self.linesep)
        for row in self.chunk():
            csv.write(",".join(map(str, row)))
            csv.write(self.linesep)
        return pa.py_buffer(csv.getvalue().encode(self.codec))

    def write_batch_arrow(self):
        """
        Generate a batch of records, convert columns to pyarrow arrays,
        and assemble a RecordBatch.
        """
        data = list(self.chunk())
        columns = zip(*data)
        # NOTE(review): self.pa_schema is never assigned in this class —
        # confirm it is set externally before arrow output is requested.
        arrays = [
            pa.array(column, self.pa_schema[i].type)
            for i, column in enumerate(columns)
        ]
        return pa.RecordBatch.from_arrays(arrays, names=self.pa_schema.names)

    def write_csv(self):
        """
        Accumulate CSV batches up to ``maxfilesize`` MiB (or until
        ``checkcount`` signals completion) and return the bytes.
        """
        # NOTE(review): maxfilesize / checkcount are not defined in this
        # class — presumably provided via properties/mixin; confirm.
        csv = b""
        while (len(csv) // 1024 ** 2) < self.maxfilesize:
            csv += self.write_batch_csv()
            if self.checkcount():
                break
        return csv

    def write_fwf(self):
        """Accumulate fixed-width batches; same size/count limits as write_csv."""
        fwf = b""
        while (len(fwf) // 1024 ** 2) < self.maxfilesize:
            fwf += self.write_batch_fwf()
            if self.checkcount():
                break
        return fwf

    def write_recordbatchfile(self):
        """
        Write Arrow RecordBatches into an in-memory Arrow file and return
        its buffer; stops at ``maxfilesize`` MiB or when ``checkcount``
        signals completion.
        """
        sink = pa.BufferOutputStream()
        writer = pa.RecordBatchFileWriter(sink, self.pa_schema)
        batches_size = 0
        while (batches_size // 1024 ** 2) < self.maxfilesize:
            batch = self.write_batch_arrow()
            batches_size += pa.get_record_batch_size(batch)
            writer.write_batch(batch)
            if self.checkcount():
                break
        writer.close()
        return sink.getvalue()

    def __next__(self):
        """Produce the next encoded batch; StopIteration after nbatches."""
        next(self._batch_iter)
        self.__logger.info("%s: Generating datum ", self.__class__.__name__)
        data = self.write_batch()
        self._batchidx += 1
        return data
def test_xmodifer(self):
    """Exercise duplicate generation plus per-field record modification."""
    model = Table()
    schema = model.info.schema.info

    # (name, type, length, generator-or-None), in original field order.
    field_specs = [
        ("record_id", "String", 10, None),
        ("Name", "String", 10, "name"),
        ("SIN", "String", 10, "ssn"),
        ("StreetNumber", "String", 40, "building_number"),
        ("Street", "String", 40, "street_name"),
        ("City", "String", 40, "city"),
        ("Province", "String", 40, "province"),
        ("PostalCode", "String", 40, "postcode"),
        ("DOB", "DateTime", 40, "date"),
        ("PhoneNum", "String", 11, "phone_number"),
    ]
    for fname, ftype, flen, generator in field_specs:
        fld = schema.fields.add()
        fld.name = fname
        fld.info.type = ftype
        fld.info.length = flen
        if generator is not None:
            fld.info.aux.generator.name = generator

    dup = model.info.aux.duplicate
    dup.probability = 1
    dup.distribution = "uniform"
    dup.maximum = 5

    modifier = model.info.aux.record_modifier
    modifier.max_modifications_in_record = 1
    modifier.max_field_modifiers = 1
    modifier.max_record_modifiers = 1

    # Same modification probabilities for both targeted fields.
    for target, selection in (("Name", 0.1), ("Street", 0.9)):
        fmod = modifier.fields.add()
        fmod.selection = selection
        fmod.name = target
        probs = fmod.probabilities
        probs.insert = 0.1      # insert character in field
        probs.delete = 0.1      # delete character in field
        probs.substitute = 0.1  # substitute character in field
        probs.misspell = 0.0    # use mispelling dictionary
        probs.transpose = 0.1   # transpose adjacent characters
        probs.replace = 0.1     # replace with another value of same fake
        probs.swap = 0.1        # swap two words/values in field
        probs.split = 0.1       # split a field
        probs.merge = 0.1       # merge a field
        probs.nullify = 0.1     # convert to null
        probs.fill = 0.1        # fill empty field with expected type

    synth = Synthesizer(model, "en_CA", idx=0, seed=4053)
    protorows = [synth.generate() for _ in range(10)]
    print(protorows)