def get_records(self,table,number): self.db = DBConnection(self.datasource) print("running get records {}".format(number)) generator,header = self.db.stream_table(table) def format_record(record): return {str(h.name):str(v) for h,v in zip(header,record)} try: out = [] for i , x in enumerate(generator): out.append(format_record(x)) if i == number: reason = "break" break except Exception as e: reason = e finally: resp = input("stopped on {} write to file? (y/n) : ".format(reason)) write = resp == "y" print("\n chosen write {} ".format(write)) if write: path = JSON_RECORDS print("out len {}".format(len(out))) with open(path,"w+") as f: f.write(json.dumps(out)) print("wrote {} records to \n{}".format(len(out),path)) else: print("not writing")
class IngestionProducer(KafkaWriter): def __init__(self,bootstrap_servers,datasource,outfile = JSON_RECORDS): super().__init__(bootstrap_servers) self.datasource = datasource def get_records(self,table,number): self.db = DBConnection(self.datasource) print("running get records {}".format(number)) generator,header = self.db.stream_table(table) def format_record(record): return {str(h.name):str(v) for h,v in zip(header,record)} try: out = [] for i , x in enumerate(generator): out.append(format_record(x)) if i == number: reason = "break" break except Exception as e: reason = e finally: resp = input("stopped on {} write to file? (y/n) : ".format(reason)) write = resp == "y" print("\n chosen write {} ".format(write)) if write: path = JSON_RECORDS print("out len {}".format(len(out))) with open(path,"w+") as f: f.write(json.dumps(out)) print("wrote {} records to \n{}".format(len(out),path)) else: print("not writing") def get_records_json(self): path = JSON_RECORDS with open(path,"r") as f: data = json.loads(f.read()) return data def ingest_data(self,table,number = False): print("in ingest data method max {}".format(number)) records = self.get_records_json() print(" got {} records to stream".format(len(records))) topic = get_topic(self.datasource,table) print("streaming data from table {} to topic {}".format(table,topic)) input("press enter to start producing") print("producing...") for i,record in enumerate(records): self.produce(record, topic) if number: if i == number: break self.produce_debug("completed producing {}".format(table))
def __init__(self, database="test_database"): self.config = {\ "sites": (10,random_site),\ "parts":(50,random_part),\ "customers":(15,random_customer),\ "suppliers":(5,random_supplier),\ "sales_orders":(50,random_sales_order),\ "purchase_orders":(50,random_purchase_order)} self.commands = [ "sites", "parts", "customers", "sales_orders", "purchase_orders", "suppliers" ] print("mockdata got database name {}".format(database)) self.dbc = DBConnection(database) self.current_table = None self.insert_statements = [] global fake_factory global product_data product_data = read_product_data() fake_factory = Faker()
def run_statistics(datasource): print("running stats... ") dbc = DBConnection(datasource) dbc.get_base_table_descriptions() query = "select topic_name, min(timestamp) start_time,max(timestamp) end_time, max(quantity) amount from tracked_topics group by topic_name;" records = get_records(query=query, dbc=dbc) runid = get_records("select max(run_id) + 1 newid from run_stats;", dbc)[0]["newid"] insert_kwargs = {"table": "run_stats"} for record in records: record["elapsed_time"] = (record["end_time"] - record["start_time"]).total_seconds() insert_kwargs.update({"topic_name":record["topic_name"],\ "run_id":runid,\ "start_time":record["start_time"],\ "end_time":record["end_time"],\ "elapsed_time":record["elapsed_time"],\ "amount":record["amount"]}) insert_db(dbc, **insert_kwargs) dbc.execute_cursor("delete from tracked_topics where 1 = 1;", commit=True)
def consume_stats(bootstrap_servers, datasource): dbc = DBConnection(datasource) dbc.get_base_table_descriptions() consumer = KafkaConsumer("stats",\ group_id = "test",\ bootstrap_servers=bootstrap_servers,\ auto_offset_reset ="smallest",\ value_deserializer =lambda m: json.loads(m.decode('utf-8'))) insert_kwargs = {} insert_kwargs.update({"table": "tracked_topics"}) for i, message in enumerate(consumer): jsonmsg = json.loads(message.value) stat_topic = jsonmsg["topic"] stat_count = jsonmsg["count"] stat_time = jsonmsg["timestamp"] insert_kwargs.update({"topic_name":str(stat_topic),\ "quantity":int(stat_count),\ "timestamp":stat_time}) insert_db(dbc, **insert_kwargs) if i % 100 == 0: print(" inserted i {} tracked records".format(i)) if i == 1000000: break dbc.commit_connection()
class mockData(object): def __init__(self, database="test_database"): self.config = {\ "sites": (10,random_site),\ "parts":(50,random_part),\ "customers":(15,random_customer),\ "suppliers":(5,random_supplier),\ "sales_orders":(50,random_sales_order),\ "purchase_orders":(50,random_purchase_order)} self.commands = [ "sites", "parts", "customers", "sales_orders", "purchase_orders", "suppliers" ] print("mockdata got database name {}".format(database)) self.dbc = DBConnection(database) self.current_table = None self.insert_statements = [] global fake_factory global product_data product_data = read_product_data() fake_factory = Faker() def set_quantity(self, configitem, amount): current_config = self.config.get(configitem) new_config = (amount, current_config[1]) print("updating {} from {} to {} ".format(configitem, current_config, new_config)) self.config[configitem] = new_config def generate_insert_statement(self, **kwargs): base_stmt = "insert into {} ({}) values ({});" column_definitions = self.dbc.descriptions.get(self.current_table) cols = [] vals = [] for column, data in kwargs.items(): column_type = column_definitions.get(column, False) if not column_type: print( "column type for column {} not found for table {}".format( column, self.current_table)) else: cols.append(column) if "CHAR" in column_type.upper(): val = "'{}'".format(data) elif "INT" in column_type.upper(): # print ("making int {} for column {}".format(data,column)) val = "{}".format(str(int(data))) elif "TIME" in column_type.upper(): # print("got time col {} of type{} ".format(data,type(data))) val = "TIMESTAMP '{}'".format(data.isoformat()) else: val = data vals.append(val) columns = ", ".join(cols) values = ", ".join(vals) return base_stmt.format(self.current_table, columns, values) def part_customer_data(self): self.current_table = "part_customers" part_list = self.dbc.get_column_from_table("parts", "id") customer_list = self.dbc.get_column_from_table("customers", "id") print( "making part_customers data for {} customers and {} parts".format( len(customer_list), len(part_list))) for customer in customer_list: rd.shuffle(part_list) parts = rd.randint(0, len(part_list)) for i in range(parts): record = {} record["part_id"] = part_list[i] record["customer_id"] = customer record["delivery_lead_time"] = rd.randint(2, 5) insert_stmt = self.generate_insert_statement(**record) self.insert_statements.append(insert_stmt) self.run_insert_statements() def part_supplier_data(self): print("making part_supplier data") self.current_table = "part_suppliers" part_list = self.dbc.get_column_from_table("parts", "id") supplier_list = self.dbc.get_column_from_table("suppliers", "id") for customer in supplier_list: rd.shuffle(part_list) parts = rd.randint(0, len(part_list)) for i in range(parts): record = {} record["part_id"] = part_list[i] record["supplier_id"] = customer record["supply_lead_time"] = rd.randint(1, 30) insert_stmt = self.generate_insert_statement(**record) self.insert_statements.append(insert_stmt) self.run_insert_statements() def generate_starting_inventory(self): result = {} possible_status = ["available", "expired", "qa_hold"] result["site_id"] = get_random_value_from_column("sites", "id")[0] result["part_id"] = part_id result["status"] = rd.choice(possible_status) result["quantity"] = rd.randint(0, 20000) insert_stmt = self.generate_insert_statement(**result) self.insert_statements.append(insert_stmt) def data_generator(self): n, func = self.config.get(self.current_table) for i in range(n): row = func() yield self.generate_insert_statement(**row) def mock_data_from_config(self): self.insert_statements = [] for command in self.commands: self.current_table = command table_data_generator = self.data_generator() self.insert_statements += [stmt for stmt in table_data_generator] def run_insert_statements(self): if self.insert_statements: for stmt in self.insert_statements: if isinstance(stmt, dict): print(stmt) stmt = self.generate_insert_statement(**stmt) self.dbc.execute_cursor(stmt, commit=True) self.insert_statements = []
class mockData(object): """ class for managing the generation of mocked data using the fakefactory """ def __init__(self, database="test_database"): self.config = {\ "sites": (10,random_site),\ "parts":(50,random_part),\ "customers":(15,random_customer),\ "suppliers":(5,random_supplier),\ "sales_orders":(50,random_sales_order),\ "purchase_orders":(50,random_purchase_order)} self.commands = [ "sites", "parts", "customers", "sales_orders", "purchase_orders", "suppliers" ] print("mockdata got database name {}".format(database)) self.dbc = DBConnection(database) self.current_table = None self.insert_statements = [] global fake_factory global product_data product_data = read_product_data() fake_factory = Faker() def set_quantity(self, configitem, amount): """update config for mocking any particular tables data""" current_config = self.config.get(configitem) new_config = (amount, current_config[1]) print("updating {} from {} to {} ".format(configitem, current_config, new_config)) self.config[configitem] = new_config def generate_insert_statement(self, **kwargs): """ take any set of input kwargs and turn them into a formatted insert statement""" base_stmt = "insert into {} ({}) values ({});" column_definitions = self.dbc.descriptions.get(self.current_table) cols = [] vals = [] for column, data in kwargs.items(): column_type = column_definitions.get(column, False) if not column_type: print( "column type for column {} not found for table {}".format( column, self.current_table)) else: cols.append(column) if "CHAR" in column_type.upper(): val = "'{}'".format(data) elif "INT" in column_type.upper(): # print ("making int {} for column {}".format(data,column)) val = "{}".format(str(int(data))) elif "TIME" in column_type.upper(): # print("got time col {} of type{} ".format(data,type(data))) val = "TIMESTAMP '{}'".format(data.isoformat()) else: val = data vals.append(val) columns = ", ".join(cols) values = ", ".join(vals) return base_stmt.format(self.current_table, columns, values) def part_customer_data(self): """ special method for making customer data """ self.current_table = "part_customers" part_list = self.dbc.get_column_from_table("parts", "id") customer_list = self.dbc.get_column_from_table("customers", "id") print( "making part_customers data for {} customers and {} parts".format( len(customer_list), len(part_list))) for customer in customer_list: rd.shuffle(part_list) parts = rd.randint(0, len(part_list)) for i in range(parts): record = {} record["part_id"] = part_list[i] record["customer_id"] = customer record["delivery_lead_time"] = rd.randint(2, 5) insert_stmt = self.generate_insert_statement(**record) self.insert_statements.append(insert_stmt) self.run_insert_statements() def part_supplier_data(self): """ special method for making supplier data """ print("making part_supplier data") self.current_table = "part_suppliers" part_list = self.dbc.get_column_from_table("parts", "id") supplier_list = self.dbc.get_column_from_table("suppliers", "id") for customer in supplier_list: rd.shuffle(part_list) parts = rd.randint(0, len(part_list)) for i in range(parts): record = {} record["part_id"] = part_list[i] record["supplier_id"] = customer record["supply_lead_time"] = rd.randint(1, 30) insert_stmt = self.generate_insert_statement(**record) self.insert_statements.append(insert_stmt) self.run_insert_statements() def data_generator(self): """for turning a data fetch method into a generator as per config """ n, func = self.config.get(self.current_table) for i in range(n): row = func() yield self.generate_insert_statement(**row) def mock_data_from_config(self): """ run all configured commands """ self.insert_statements = [] for command in self.commands: self.current_table = command table_data_generator = self.data_generator() self.insert_statements += [stmt for stmt in table_data_generator] def run_insert_statements(self): """ run all staged insert statements against the database """ if self.insert_statements: for stmt in self.insert_statements: if isinstance(stmt, dict): print(stmt) stmt = self.generate_insert_statement(**stmt) self.dbc.execute_cursor(stmt, commit=True) self.insert_statements = []
def generate_reporting_tables(**kwargs): print("generate master data kwargs {}".format(kwargs)) db = kwargs["db"] print("db outta kwargs = {}".format(db)) dbc = DBConnection(db) create_tables(dbc)