Example #1
0
def generate_email_sample(query, output):
    """Append the tokens of a synthetic "send email" command to *output*.

    For each email argument — recipients, subject, body, send time — an
    "ARG" marker token is appended, followed by the generated value(s).

    Args:
        query: unused in the visible portion of this function — TODO confirm
            it is consumed by code beyond this excerpt.
        output: list of str tokens; mutated in place.
    """
    gen = DocumentGenerator()
    email_args = ["--recipients", "--subject", "--body", "--when"]
    # NOTE(review): np.random.choice(1, n) samples only from {0}, so this
    # inclusion mask is always all zeros — probably meant choice(2, n).
    # 'include' is never read in the visible code below; confirm intent.
    include = np.random.choice(1, len(email_args))
    include[0] = 1  # recipients are always included
    recipients = []
    subject = ""
    body = ""
    when = ""

    # Recipients: 1-5 entries, each either a random email address or a
    # random full name, joined by "AND" separator tokens.
    output.append("ARG")
    num_recipients = random.randint(1, 5)
    for i in range(0, num_recipients):
        # TODO(alexander): maybe not only provide name of random people
        entry = ""
        if random.randint(0, 1) == 0: entry = gen.email()
        else: entry = names.get_full_name()
        recipients.append(entry)
        output.append(entry)
        if i < num_recipients - 1:
            output.append("AND")

    # Email subject: one generated sentence.
    output.append("ARG")
    subject = gen.sentence()
    output.append(subject)

    # Email body: one generated paragraph.
    output.append("ARG")
    body = gen.paragraph()
    output.append(body)

    # Send time: the current local timestamp.
    output.append("ARG")
    now = datetime.datetime.now()
    when = now.strftime("%Y-%m-%d %H:%M:%S")
    output.append(when)

    # NOTE(review): 'inputs' is assigned but never used here — the function
    # appears truncated by the dataset extraction; the remainder is missing.
    inputs = " ".join(["email"])
Example #2
0
def generate_pdf_params():
    """Generate (name, text) parameters for one synthetic PDF document.

    Randomly picks one of four content kinds — email addresses, phone
    numbers, URLs, or sentences — and builds 100-499 comma-separated
    entries of that kind.

    Returns:
        ((name, ascii_text), {}) where *name* is a UUID4 string suffixed
        with the content kind ("email"/"phone"/"url"/"words") and
        *ascii_text* is the generated text with every non-ASCII character
        replaced by a space.
    """
    # BUG FIX: 'gen' was read before its assignment inside this function,
    # which made it a local and raised UnboundLocalError on every call.
    # Declare it global so an existing module-level generator is reused and
    # one is only created when missing.
    global gen
    if "gen" not in globals() or not gen:
        gen = DocumentGenerator()
    randnum = random.randint(0, 100)
    name = str(uuid.uuid4())
    text = ""
    if randnum < 25:
        text += ",".join(
            [gen.email() for i in range(random.randrange(100, 500))])
        name += "email"
    elif randnum < 50:
        text += ",".join(
            [gen.phone() for i in range(random.randrange(100, 500))])
        name += "phone"
    elif randnum < 75:
        text += ",".join(
            [gen.url() for i in range(random.randrange(100, 500))])
        name += "url"
    else:
        text += ",".join(
            [gen.sentence() for i in range(random.randrange(100, 500))])
        name += "words"
    # Strip to plain ASCII: any character >= 0x80 becomes a space so
    # downstream PDF rendering only ever sees ASCII text.
    return (name, ''.join([i if ord(i) < 128 else ' ' for i in text])), {}
Example #3
0
        clean = [f.get() for f in futures]
        for c in clean:
            if c is not None:
                scraper_sessions[c.proxy] = c

        if len(scraper_sessions) > 0:
            print(len(scraper_sessions))




    c = 0
    while True:
        thread_pool = Pool(pool_size)
        futures = list()
        r = random.choice([gen.email(), gen.word()+gen.word(), gen.name()+gen.word()+gen.word()])
        #r = get_random_string(random.randrange(120, 500))
        request_data = {"doAuth": "1", "login": r.replace(" ", ""), "password": gen.word()+gen.word()+str(gen.small_int())}
        for i in range(pool_size):
            if proxy_index < len(proxies):
                proxy = proxies[proxy_index]
                proxy_index += 1
            else:
                proxy = proxies[0]
                proxy_index = 1
            if proxy in scraper_sessions:
                futures.append(thread_pool.apply_async(post_proxy, [url, proxy, request_data, proxies, (None, scraper_sessions[proxy])[cloudflare]]))


        clean = [f.get() for f in futures]
        for x in clean:
            dt.date(2021, 1, 1) + dt.timedelta(days=rd.randrange(0, 30))
            for _ in range(num_ads)
        ],
        'LSH label': [label_num for _ in range(num_ads)]
    })


# ---------------------------------------------------------------------------
# Build a synthetic clustered-ads dataset and write it to CSV.
#
# Two "metaclusters" are generated.  Each metacluster shares one randomly
# generated metadata_dict (phones, image ids, social handles, emails)
# across 10-14 clusters of 20-99 ads each; the shared metadata is what
# ties the clusters of a metacluster together.
# ---------------------------------------------------------------------------
dfs = []              # per-cluster DataFrames, concatenated at the end
num_ads = 0           # running total of generated ads, passed as an id offset
cluster_label = 0     # globally unique cluster-label counter
city_ids = get_city_ids(2)  # NOTE(review): presumably ids for 2 cities — confirm

for metacluster in range(2):
    gen = DocumentGenerator()
    # Shared contact metadata for every cluster in this metacluster.
    metadata_dict = {
        'phones': [gen.phone() for _ in range(rd.randrange(1, 20))],
        'images':
        [rd.randrange(10000, 99999) for _ in range(rd.randrange(1, 20))],
        'social': [gen.word() for _ in range(rd.randrange(1, 20))],
        'emails': [gen.email() for _ in range(rd.randrange(1, 20))]
    }
    # 10-14 clusters per metacluster, each with 20-99 ads.
    for _ in range(rd.randrange(10, 15)):
        cluster_size = rd.randrange(20, 100)
        print(metacluster, cluster_size)
        dfs.append(
            gen_cluster(num_ads, cluster_size, cluster_label, city_ids,
                        metadata_dict))
        cluster_label += 1
        num_ads += cluster_size

pd.concat(dfs).to_csv('data/synthetic_data.csv', index=False)