def generate_pdf_params():
    """Build a randomized ``(name, text)`` payload of generated content.

    One of four content kinds is picked at random -- e-mail addresses, phone
    numbers, URLs or sentences -- and between 100 and 499 generated items of
    that kind are joined with commas.  Every non-ASCII character of the
    joined text is then replaced by a single space.

    The ``DocumentGenerator`` is created on first use and cached in the
    module-level name ``gen`` so that later calls reuse the same instance.

    Returns:
        A pair ``((name, text), {})``.  ``name`` is a random UUID4 string
        with the content kind appended (``"email"``, ``"phone"``, ``"url"``
        or ``"words"``); ``text`` is the ASCII-only payload; the second
        element is always an empty dict.
    """
    # Without this declaration the assignment below made ``gen`` a local
    # name, so the ``if not gen`` check raised UnboundLocalError on each call.
    global gen
    # ``globals().get`` also copes with a module that never defined ``gen``.
    generator = globals().get("gen")
    if not generator:
        generator = gen = DocumentGenerator()

    # NOTE: randint is inclusive on both ends (0..100), so the final bucket
    # below covers one more value than the other three.
    randnum = random.randint(0, 100)
    name = str(uuid.uuid4())

    if randnum < 25:
        make_item, suffix = generator.email, "email"
    elif randnum < 50:
        make_item, suffix = generator.phone, "phone"
    elif randnum < 75:
        make_item, suffix = generator.url, "url"
    else:
        make_item, suffix = generator.sentence, "words"

    # Draw the item count first, then generate the items, matching the
    # original order of random draws.
    count = random.randrange(100, 500)
    text = ",".join([make_item() for _ in range(count)])

    # Blank out anything outside 7-bit ASCII, one space per character.
    ascii_text = "".join(ch if ord(ch) < 128 else " " for ch in text)
    return (name + suffix, ascii_text), {}
dt.date(2021, 1, 1) + dt.timedelta(days=rd.randrange(0, 30))
for _ in range(num_ads)
],
'LSH label': [label_num for _ in range(num_ads)]
})
# ^ End of a definition that begins above this excerpt; left untouched.


# --- Script: build the synthetic dataset and write it to CSV ---------------
# `dfs` collects one result per generated cluster; the results are
# concatenated with pd.concat at the end.
dfs = []
# Running total of the sizes of all clusters generated so far; passed as the
# first argument to gen_cluster (presumably an offset for the new cluster's
# ads -- confirm against gen_cluster).
num_ads = 0
# Counter incremented once per generated cluster, so each gen_cluster call
# receives a distinct value.
cluster_label = 0
# NOTE(review): get_city_ids is defined elsewhere; the meaning of the
# argument 2 is not visible from here.
city_ids = get_city_ids(2)
# Two "metaclusters": every cluster inside one metacluster is generated from
# the same metadata_dict.
for metacluster in range(2):
    # A fresh generator and a fresh metadata pool per metacluster.
    gen = DocumentGenerator()
    # Each pool holds a random number of entries drawn via rd.randrange(1, 20)
    # (1-19 if `rd` is the stdlib random module -- its import is not visible
    # here).
    metadata_dict = {
        'phones': [gen.phone() for _ in range(rd.randrange(1, 20))],
        'images': [rd.randrange(10000, 99999) for _ in range(rd.randrange(1, 20))],
        'social': [gen.word() for _ in range(rd.randrange(1, 20))],
        'emails': [gen.email() for _ in range(rd.randrange(1, 20))]
    }
    # The number of clusters in this metacluster is drawn from
    # rd.randrange(10, 15).
    for _ in range(rd.randrange(10, 15)):
        # The size of this cluster is drawn from rd.randrange(20, 100).
        cluster_size = rd.randrange(20, 100)
        # Progress output: metacluster index and the size just drawn.
        print(metacluster, cluster_size)
        dfs.append(
            gen_cluster(num_ads, cluster_size, cluster_label, city_ids, metadata_dict))
        # Advance the label and the running total for the next cluster.
        cluster_label += 1
        num_ads += cluster_size
# Concatenate every cluster and write a single CSV without the index column.
# The path is relative, so 'data/' is resolved against the current working
# directory.
pd.concat(dfs).to_csv('data/synthetic_data.csv', index=False)