def generate_data(data_count, result_per_user=15):
    """Generate mock user rows plus per-user result rows as JSON-lines files.

    ``data_count`` is the total number of result rows wanted; the number of
    users is derived as ``data_count // result_per_user`` and each user gets
    ``result_per_user`` result rows.

    Two files are written under ``./output/`` with a ``.gen`` extension and
    renamed to ``.load`` once both have been completely written and closed
    (presumably so a downstream loader only ever sees finished files --
    confirm against the consumer).

    Args:
        data_count: Total number of result rows to aim for.
        result_per_user: Result rows generated for every user.

    Raises:
        ZeroDivisionError: If ``result_per_user`` is 0.
    """
    user_count = data_count // result_per_user
    print('data_count:', data_count)
    print('user_count:', user_count)
    print('result_per_user:', result_per_user)

    # NOTE(review): in the source this setup section was masked out
    # (everything between the third print above and the definition of
    # user_file_gen_ext). The names below are all used further down, so they
    # were reconstructed from the generate_data(user_count, result_count)
    # variant in this file. The concrete date/number ranges are an
    # assumption -- confirm against version control.
    switch_query = Switch(between_table)
    from_ts = int(datetime(2015, 1, 1).timestamp())
    to_ts = int(datetime(2020, 12, 31).timestamp())
    from_mbb = 5301000000
    to_mbb = from_mbb + 200000
    from_imei = 990000862471854
    to_imei = from_imei + 200000
    start_time = time.time()
    len_packages = len(packagen.packages)

    file_id = f"{int(time.time())}-{random.randint(1000, 9999)}"
    print(f"File id: {file_id}")
    user_file_gen_ext = f'./output/{file_id}-user.gen'
    user_file_load_ext = f'./output/{file_id}-user.load'
    result_file_gen_ext = f'./output/{file_id}-result.gen'
    result_file_load_ext = f'./output/{file_id}-result.load'

    # `with` guarantees both handles are closed even if generation raises.
    with open(user_file_gen_ext, 'w') as user_file, \
            open(result_file_gen_ext, 'w') as result_file:
        for user in Mimic.generate(count=user_count):
            insert_dt = DateTime.between_ts(from_ts, to_ts)
            user.insert_date = str(insert_dt.date())
            user.insert_time = insert_dt.isoformat()
            user.user_detail_id = 1
            user.user_id = 1
            user.mbb = Mobile.phone(from_number=from_mbb, to_number=to_mbb)
            # The client id is computed from mbb alone, so the same mbb
            # always maps to the same UUID.
            user.device_client_id = str(uuid.UUID(
                int=(0xFFFFFFFFFFFFFFFFFFFFFF0 * user.mbb)
                % 0xFFFFFFFFFFFFFFFFFFFFFFF))
            user.device_client_ip = Mobile.ip()
            user.device_os = Mobile.device_os()
            user.is_cracked = Mobile.is_cracked()
            user.connection_type = Mobile.connection_type()
            user.imei = Mobile.imei(from_imei, to_imei)
            user.carrier = Mobile.carrier()
            user.operation = Mobile.operation()
            user.is_success = 1
            user.msisdn = '9' + user.device_client_id
            # Text-mode files translate '\n' themselves; writing os.linesep
            # here would yield '\r\r\n' on Windows.
            user_file.write(json.dumps(user) + '\n')

            for result in Mimic.generate(count=result_per_user):
                # Lower bound is the user's own insert time.
                record_date = DateTime.between_ts(insert_dt.timestamp(), to_ts)
                result.record_date = str(record_date.date())
                result.mbb = user.mbb
                result.device_uid = user.imei
                result.query_name = switch_query(random.randint(1, 100))
                result.source = random.choice(["AB", "AF"])
                package_list = [
                    packagen.packages[random.randint(0, len_packages - 1)]]
                # Occasionally attach a second package to the same result.
                if probably(15 / 100):
                    package_list.append(
                        packagen.packages[random.randint(0, len_packages - 1)])
                result.package_list = ','.join(package_list)
                result.device_client_id = user.device_client_id
                result.device_info = user.device_os
                result_file.write(json.dumps(result) + '\n')

    # Rename only after both files are closed and complete.
    os.rename(result_file_gen_ext, result_file_load_ext)
    os.rename(user_file_gen_ext, user_file_load_ext)

    took = time.time() - start_time
    # Very short runs would give a meaningless (or undefined) rate.
    if took <= 0.001:
        speed = 0
    else:
        speed = user_count / took
    print(
        f"Generated user data: {user_count}, result data: {user_count}x{result_per_user}={result_per_user * user_count}\n"
        f"Time: {took}. Speed: {round(speed)} users/sec.")
def generate_data(user_count=150, fraud_count=50, start_year=2017,
                  end_year=2021, user_start=0, user_data_per_user=67):
    """Generate seeded ``user_detail`` rows and ``fraud_result`` rows.

    Every base user gets an identity derived from a running integer seed
    (``5301000000 + user_start + n``): ids, mbb, a SHA-512 based device
    client id, an MD5 based 15-character imei and an msisdn. For each base
    user, ``user_data_per_user`` detail rows are written that share this
    identity but vary in timestamp and device attributes.

    Base users whose offset from the seed (``user_seed - seed``) is at most
    ``fraud_count`` additionally get a fraud row. With ``user_start=0`` these
    are the first ``fraud_count`` users.

    Output goes to ``./output/<file_id>-user.gen`` and ``-fraud.gen``; each
    file is renamed to ``.load`` after it is closed. An empty fraud file is
    removed instead of being renamed.

    Args:
        user_count: Number of base users to generate.
        fraud_count: Seed-offset threshold up to which fraud rows are written.
        start_year: Timestamps are drawn from Jan 1 of this year ...
        end_year: ... up to Jan 1 of this year.
        user_start: Offset added to the base seed.
        user_data_per_user: Detail rows written per base user.
    """
    print(f'Generating {user_count} data. Offset: {user_start}')
    switch_query = Switch(between_table)
    from_ts = int(datetime(start_year, 1, 1).timestamp())
    to_ts = int(datetime(end_year, 1, 1).timestamp())
    len_packages = len(packagen.packages)

    seed = 5301000000
    user_seed = seed + user_start

    file_id = f"{int(time.time())}-{random.randint(1000, 9999)}"
    print(f"File id: {file_id}")
    user_file_gen_ext = f'./output/{file_id}-user.gen'
    user_file_load_ext = f'./output/{file_id}-user.load'
    fraud_file_gen_ext = f'./output/{file_id}-fraud.gen'
    fraud_file_load_ext = f'./output/{file_id}-fraud.load'

    fraud_data_written = False
    # Reseed after file_id was drawn: file names still vary between runs,
    # while draws made through `random` from here on repeat.
    random.seed(0)

    # `with` guarantees both handles are closed even if generation raises.
    with open(user_file_gen_ext, 'w') as user_file, \
            open(fraud_file_gen_ext, 'w') as fraud_file:
        for base in Mimic.generate(count=user_count):
            user_seed += 1
            user_seed_bytes = str(user_seed).encode()

            base.type = 'user_detail'
            base.user_detail_id = user_seed
            base.user_id = 9999999999 - user_seed
            base.mbb = user_seed
            base.device_client_id = hashlib.sha512(user_seed_bytes).hexdigest()
            base.imei = hashlib.md5(user_seed_bytes).hexdigest().upper()[:15]
            base.msisdn = str(9055500000 + base.mbb * pow(10, 10))

            for user in Mimic.generate(count=user_data_per_user):
                insert_dt = DateTime.between_ts(from_ts, to_ts)
                user.insert_date = str(insert_dt.date())
                user.insert_time = insert_dt.isoformat()
                # Identity fields are copied from the base user ...
                user.type = base.type
                user.user_detail_id = base.user_detail_id
                user.user_id = base.user_id
                user.mbb = base.mbb
                user.device_client_id = base.device_client_id
                # ... while device attributes are drawn per row.
                user.device_client_ip = Mobile.ip()
                user.device_os = Mobile.device_os()
                user.is_cracked = Mobile.is_cracked()
                user.connection_type = Mobile.connection_type()
                user.imei = base.imei
                user.carrier = Mobile.carrier()
                user.operation = Mobile.operation()
                user.is_success = Mobile.is_success()
                user.msisdn = base.msisdn
                # Text-mode files translate '\n' themselves; writing
                # os.linesep here would yield '\r\r\n' on Windows.
                user_file.write(json.dumps(user) + '\n')

            # NOTE(review): the source's indentation was lost. The fraud row
            # only reads `base` and is governed by `fraud_count`, so it is
            # emitted once per base user here -- confirm it was not meant to
            # be once per detail row.
            if user_seed - seed <= fraud_count:
                fraud_data_written = True
                fraud = Prodict()
                record_date = DateTime.between_ts(from_ts, to_ts)
                fraud.type = 'fraud_result'
                fraud.record_date = str(record_date.date())
                fraud.mbb = base.mbb
                fraud.device_uid = base.imei.lower()
                fraud.query_name = switch_query(random.randint(1, 100))
                fraud.source = random.choice(["AB", "AF"])
                fraud.package_list = packagen.packages[
                    random.randint(0, len_packages - 1)]
                fraud.device_client_id = base.device_client_id
                fraud.device_info = Mobile.device_os()
                fraud_file.write(json.dumps(fraud) + '\n')

    print('Done.')

    # Rename only after the files are closed and complete.
    os.rename(user_file_gen_ext, user_file_load_ext)
    if fraud_data_written:
        os.rename(fraud_file_gen_ext, fraud_file_load_ext)
    else:
        # Do not leave an empty fraud file behind.
        os.remove(fraud_file_gen_ext)
def generate_data(user_count, result_count):
    """Generate mock user rows and ``result_count`` result rows per user.

    Two JSON-lines files are written under ``./output/`` with a ``.gen``
    extension and renamed to ``.load`` once both have been completely
    written and closed. A timing summary is printed at the end.

    Args:
        user_count: Number of user rows to generate.
        result_count: Number of result rows generated for every user.
    """
    print(user_count, result_count)
    switch_query = Switch(between_table)
    from_ts = int(datetime(2015, 1, 1).timestamp())
    to_ts = int(datetime(2020, 12, 31).timestamp())
    from_mbb = 5301000000
    to_mbb = from_mbb + 200000
    from_imei = 990000862471854
    to_imei = from_imei + 200000
    start_time = time.time()
    len_packages = len(packagen.packages)

    file_id = f"{int(time.time())}-{random.randint(1000, 9999)}"
    print(f"File id: {file_id}")
    user_file_gen_ext = f'./output/{file_id}-user.gen'
    user_file_load_ext = f'./output/{file_id}-user.load'
    result_file_gen_ext = f'./output/{file_id}-result.gen'
    result_file_load_ext = f'./output/{file_id}-result.load'

    # `with` guarantees both handles are closed even if generation raises.
    with open(user_file_gen_ext, 'w') as user_file, \
            open(result_file_gen_ext, 'w') as result_file:
        for user in Mimic.generate(count=user_count):
            insert_dt = DateTime.between_ts(from_ts, to_ts)
            user.insert_date = str(insert_dt.date())
            user.insert_time = insert_dt.isoformat()
            user.user_detail_id = 1
            user.user_id = 1
            user.mbb = Mobile.phone(from_number=from_mbb, to_number=to_mbb)
            # The client id is computed from mbb alone, so the same mbb
            # always maps to the same UUID.
            user.device_client_id = str(uuid.UUID(
                int=(0xFFFFFFFFFFFFFFFFFFFFFF0 * user.mbb)
                % 0xFFFFFFFFFFFFFFFFFFFFFFF))
            user.device_client_ip = Mobile.ip()
            user.device_os = Mobile.device_os()
            user.is_cracked = Mobile.is_cracked()
            user.connection_type = Mobile.connection_type()
            user.imei = Mobile.imei(from_imei, to_imei)
            user.carrier = Mobile.carrier()
            user.operation = Mobile.operation()
            user.is_success = 1
            user.msisdn = '9' + user.device_client_id
            # Text-mode files translate '\n' themselves; writing os.linesep
            # here would yield '\r\r\n' on Windows.
            user_file.write(json.dumps(user) + '\n')

            for result in Mimic.generate(count=result_count):
                # Lower bound is the user's own insert time.
                record_date = DateTime.between_ts(insert_dt.timestamp(), to_ts)
                result.record_date = record_date.isoformat()
                result.mbb = user.mbb
                result.device_uid = user.imei
                result.query_name = switch_query(random.randint(1, 100))
                result.source = "AB" if random.randint(1, 2) == 1 else "AF"
                package_list = [
                    packagen.packages[random.randint(0, len_packages - 1)]]
                # 2 of the 20 possible draws add a second package.
                if random.randint(1, 20) < 3:
                    package_list.append(
                        packagen.packages[random.randint(0, len_packages - 1)])
                result.package_list = ','.join(package_list)
                result.device_client_id = user.device_client_id
                result.device_info = user.device_os
                result_file.write(json.dumps(result) + '\n')

    # Rename only after both files are closed and complete.
    os.rename(result_file_gen_ext, result_file_load_ext)
    os.rename(user_file_gen_ext, user_file_load_ext)

    took = time.time() - start_time
    # A coarse clock can report zero elapsed time, and user_count may be 0;
    # both used to raise ZeroDivisionError in this summary.
    speed = user_count / took if took > 0 else 0.0
    one_million_time = 200_000 / speed if speed > 0 else float('inf')
    print(f"Generated user data:{user_count}, result data:{result_count}x{user_count}={result_count * user_count}\n"
          f"Time: {took}. Speed: {speed} users/sec. It takes {one_million_time / 3600} hours to generate 1M rows")