예제 #1
0
파일: generator.py 프로젝트: neodigm/raisin
def generate_pdf_params():
    """Build a random ``(args, kwargs)`` pair describing one document.

    One of four content kinds (e-mail addresses, phone numbers, URLs or
    sentences) is picked at random, between 100 and 499 items of that kind
    are generated, and the items are joined with commas.
    NOTE(review): judging by the name, the result is presumably unpacked
    into a PDF-generation call -- confirm against the caller.

    Returns:
        A 2-tuple ``((name, text), {})``. ``name`` is a random UUID4 string
        with the content kind appended ("email", "phone", "url" or
        "words"). ``text`` is the comma-joined items with every non-ASCII
        character replaced by a single space. The second element is always
        an empty dict.
    """
    # `gen` is a lazily created, module-level generator. Without the
    # `global` declaration the assignment below makes `gen` a local name and
    # the truthiness check raises UnboundLocalError on every call. Going
    # through globals().get() also tolerates a module that never defined
    # `gen` at all.
    global gen
    if not globals().get("gen"):
        gen = DocumentGenerator()

    # randint is inclusive on both ends (0..100), so the last bucket covers
    # 26 of the 101 possible values; kept as-is to preserve the distribution.
    roll = random.randint(0, 100)
    name = str(uuid.uuid4())

    if roll < 25:
        make_item, kind = gen.email, "email"
    elif roll < 50:
        make_item, kind = gen.phone, "phone"
    elif roll < 75:
        make_item, kind = gen.url, "url"
    else:
        make_item, kind = gen.sentence, "words"

    item_count = random.randrange(100, 500)
    text = ",".join(make_item() for _ in range(item_count))

    # Replace anything outside 7-bit ASCII with a space.
    ascii_text = "".join(ch if ord(ch) < 128 else " " for ch in text)
    return (name + kind, ascii_text), {}
            dt.date(2021, 1, 1) + dt.timedelta(days=rd.randrange(0, 30))
            for _ in range(num_ads)
        ],
        'LSH label': [label_num for _ in range(num_ads)]
    })


# Running state shared by every generated cluster: the per-cluster frames,
# the total number of ads produced so far, and the next cluster label.
dfs, num_ads, cluster_label = [], 0, 0
city_ids = get_city_ids(2)

for metacluster in range(2):
    gen = DocumentGenerator()

    # One metadata pool per metacluster; every cluster created below is handed
    # this same dict. Keys are filled in the original order so the random
    # sources are consumed in the same sequence.
    metadata_dict = {}
    metadata_dict['phones'] = [
        gen.phone() for _ in range(rd.randrange(1, 20))
    ]
    metadata_dict['images'] = [
        rd.randrange(10000, 99999) for _ in range(rd.randrange(1, 20))
    ]
    metadata_dict['social'] = [
        gen.word() for _ in range(rd.randrange(1, 20))
    ]
    metadata_dict['emails'] = [
        gen.email() for _ in range(rd.randrange(1, 20))
    ]

    # Between 10 and 14 clusters per metacluster, each with 20 to 99 ads.
    for _ in range(rd.randrange(10, 15)):
        cluster_size = rd.randrange(20, 100)
        print(metacluster, cluster_size)
        dfs += [
            gen_cluster(num_ads, cluster_size, cluster_label, city_ids,
                        metadata_dict)
        ]
        # num_ads is passed as the running ad count before this cluster;
        # advance both counters only after the cluster has been generated.
        cluster_label, num_ads = cluster_label + 1, num_ads + cluster_size

# Stack all cluster frames and write them out without the index column.
synthetic_ads = pd.concat(dfs)
synthetic_ads.to_csv('data/synthetic_data.csv', index=False)