def generate_linear_dataset(dataset_size, dataset_path):
    a_list = [i for i in range(dataset_size)]
    pdf = np.array([i / 2 for i in range(dataset_size)])
    pdf = pdf / pdf.sum()
    b_list = np.random.choice(np.arange(len(pdf)), size=dataset_size, p=pdf)
    b_list = [int(b) for b in b_list]
    random.shuffle(a_list)
    random.shuffle(b_list)
    save_doc(dataset_path, a_list, b_list, length=int(dataset_size))
def generate_normal_dataset(dataset_size, dataset_path):
    a_list = [x for x in range(dataset_size)]
    rand_nums = dataset_size
    x = np.arange(-1 * rand_nums / 2, rand_nums / 2)
    xU, xL = x + 0.5, x - 0.5
    prob = ss.norm.cdf(xU, scale=12000) - ss.norm.cdf(xL, scale=12000)
    # normalize the probabilities so their sum is 1
    prob = prob / prob.sum()
    b_list = np.random.choice(x, size=dataset_size, p=prob)
    b_list = [int(b + rand_nums / 2) for b in b_list]
    random.shuffle(a_list)
    random.shuffle(b_list)
    save_doc(dataset_path, a_list, b_list, length=int(dataset_size))
def generate_zipfian_dataset(dataset_size, dataset_path):
    a_list = [i for i in range(dataset_size)]
    a = 2.
    x = np.arange(float(dataset_size) / 20, float(dataset_size))
    y = x**(-a) / sps.zetac(a)
    pdf = y / y.sum()
    zeros = [0 for i in range(int(float(dataset_size / 20)))]
    pdf.tolist().extend(zeros)

    b_list = np.random.choice(np.arange(len(pdf)), size=dataset_size, p=pdf)
    b_list = [int(b) for b in b_list]
    random.shuffle(a_list)
    random.shuffle(b_list)
    save_doc(dataset_path, a_list, b_list, length=int(dataset_size))
def generate_uniform_dataset(dataset_size, dataset_path):
    a_list = [x for x in range(int(dataset_size))]
    b_list = [x for x in range(int(dataset_size))]
    random.shuffle(a_list)
    random.shuffle(b_list)
    save_doc(dataset_path, a_list, b_list, length=int(dataset_size))